def calculateTprFprRate(self, evaluation, dataset, testSet) -> dict:
    """Compute the TPR -> FPR points of the ROC curve for the minority class.

    The classified test samples are sorted by descending probability of
    belonging to the minority class. Every time that probability changes, the
    running true-positive rate and false-positive rate are recorded.

    Args:
        evaluation: classifier output, forwarded to getClassificationItemList.
        dataset: provides the minority class index and the number of test-set
            rows per class.
        testSet: test samples, forwarded to getClassificationItemList.

    Returns:
        A dict keyed by TPR whose values are the matching FPR. Because the TPR
        is the key, a later point with the same TPR overwrites an earlier one.
        The entries 0.0 -> 0.0, 1.0 -> 1.0 and 1.0001 -> 1.0 are always present.
    """
    date = Date()
    Logger.Info("Starting TPR/FPR calculations : " + str(date))

    # Convert the results into a format that is more comfortable to work with.
    classificationItems = self.getClassificationItemList(testSet, evaluation)

    minorityClassIndex = dataset.getMinorityClassIndex()
    # Fetch the per-class row counts once instead of re-invoking the getter on
    # every loop iteration.
    rowsPerClass = dataset.getNumOfRowsPerClassInTestSet()

    # All non-minority class samples are counted together (multi-class cases).
    numOfNonMinorityClassItems = 0
    for cls, numOfRows in rowsPerClass.items():
        if cls != minorityClassIndex:
            numOfNonMinorityClassItems += numOfRows

    # Sort all samples by their probability of belonging to the minority class.
    classificationItems.sort(
        reverse=True,
        key=lambda item: item.getProbabilitiesOfClass(minorityClassIndex))

    tprFprValues = {0.0: 0.0}
    minoritySamplesCounter = 0
    majoritySamplesCounter = 0
    # Sentinel that differs from the first sample's value, so the first sample
    # always triggers an update (presumably probabilities never equal 2).
    currentProb = 2
    for ci in classificationItems:
        currentSampleProb = ci.getProbabilitiesOfClass(minorityClassIndex)
        # If the probability is different, time to update the TPR/FPR statistics.
        if currentSampleProb != currentProb:
            tpr = minoritySamplesCounter / rowsPerClass[minorityClassIndex]
            fpr = majoritySamplesCounter / numOfNonMinorityClassItems
            tprFprValues[tpr] = fpr
            currentProb = currentSampleProb

        if ci.getTrueClass() == minorityClassIndex:
            minoritySamplesCounter += 1
        else:
            majoritySamplesCounter += 1

    tprFprValues[1.0] = 1.0
    tprFprValues[1.0001] = 1.0
    date = Date()
    Logger.Info("Done : " + str(date))
    return tprFprValues
def createDatasetMetaFeaturesInstances(self, dataset: Dataset, includeValueBased: bool):
    """Generate the meta-feature instance files of *dataset*, unless they exist.

    The files live in a per-dataset directory under
    Properties.DatasetInstancesFilesLocation. If that directory already holds
    at least one file, the method logs and returns without doing any work.

    Args:
        dataset: dataset whose candidate attributes are generated.
        includeValueBased: when True the value-based meta-features are
            generated as well; when False only the dataset-based and
            operator-assignment-based ones are (used by the pre-ranker model).

    Raises:
        OSError: the directory could not be created for a reason other than
            it already existing.
    """
    targetDir = Properties.DatasetInstancesFilesLocation + dataset.name

    if os.path.isdir(targetDir):
        # Only the file names of the top-level directory matter here.
        existingFiles = next(os.walk(targetDir))[2]
        if existingFiles:
            Logger.Info('Candidate attributes for ' + dataset.name + ' were already calculated')
            return

    try:
        os.mkdir(targetDir)
    except OSError as ex:
        # An existing (empty) directory is fine; anything else is fatal.
        if ex.errno != errno.EEXIST:
            Logger.Warn(f'getDatasetMetaFeaturesInstances -> Error creating directory {targetDir}\nError: {ex}')
            raise

    if includeValueBased:
        # This call activates the (time consuming) background datasets
        # feature generation process.
        self.generateTrainingSetDatasetAttributes(dataset)
    else:
        # Lighter variant, for the pre-ranker model.
        self.generateTrainingSetDatasetAttributesWithoutValues(dataset)

    metadataTypes = [self.DATASET_BASED, self.OA_BASED]
    if includeValueBased:
        metadataTypes.append(self.VALUES_BASED)

    self.appendARFFFilesPerMetadataTypeForDataset(targetDir, metadataTypes)
def generateBackgroundARFFFileForDataset(self, dataset: Dataset, backgroundFilePath: str,
                                         candidateAttrDirectories: list, includeValueBased: bool):
    """Assemble the background ARFF file from the other datasets' instance files.

    Every candidate directory whose path does not contain the name of
    *dataset* contributes one merged ARFF file, which is appended to
    *backgroundFilePath*. An already existing merged file is reused; otherwise
    the directory's ARFF files are loaded and merged first. Only the first
    appended file is written with its header.

    Args:
        dataset: dataset the background model is built for; its own directory
            is excluded.
        backgroundFilePath: target file that receives the appended content.
        candidateAttrDirectories: directories holding per-dataset ARFF files.
        includeValueBased: when False, files whose name contains
            self.VALUES_BASED are skipped.
    """
    addHeader = True
    for candidateAttrDirectory in candidateAttrDirectories:
        # Never train the background model on the dataset under analysis.
        if dataset.name in candidateAttrDirectory:
            continue
        # presumably listFilesInDir returns None when the directory cannot be
        # listed — TODO confirm against FileUtils.
        if FileUtils.listFilesInDir(candidateAttrDirectory) is None:
            continue

        merged = self.getMergedFile(candidateAttrDirectory, includeValueBased)
        if merged is not None:
            MLAttributeManager.addArffFileContentToTargetFile(
                backgroundFilePath, merged[0].getAbsolutePath(), addHeader)
            addHeader = False
            continue

        # No merged file yet: load the individual ARFF files and merge them.
        instances = []
        for candidateFile in FileUtils.listFilesInDir(candidateAttrDirectory):
            # Python strings have no .contains(); use the `in` operator on the
            # textual path instead.
            fileName = os.fspath(candidateFile)
            if ('.arff' in fileName
                    and not (not includeValueBased and self.VALUES_BASED in fileName)
                    and 'merged' not in fileName):
                # NOTE(review): abspath resolves relative to the current working
                # directory — confirm listFilesInDir returns usable paths.
                absFilePath = os.path.abspath(candidateFile)
                instances.append(Loader().readArffAsDataframe(absFilePath))
            else:
                Logger.Info(f'Skipping file: {candidateFile}')

        mergedFile = self.mergeInstancesToFile(includeValueBased, candidateAttrDirectory, instances)
        if mergedFile is None:
            continue
        self.addArffFileContentToTargetFile(
            backgroundFilePath, FileUtils.getAbsPath(mergedFile), addHeader)
        addHeader = False
def generateColumn(dataset: Dataset, os: OperatorAssignment, finalAttribute: bool):
    """Generate the column described by the operator assignment *os*.

    The primary operator is fetched (with one retry after a short sleep),
    trained on the dataset and asked to generate the column. If the
    assignment carries a secondary operator, it is then applied to that
    column inside an empty replica of the dataset.

    Failures of either generation step are logged and tolerated (best
    effort): the column from before the failing step (possibly None) is
    returned.

    Args:
        dataset: dataset the column is generated from.
        os: the operator assignment. NOTE: the parameter name shadows the
            standard `os` module inside this function.
        finalAttribute: only relevant for the on-disk column cache, which is
            currently disabled (see writeToFile).

    Returns:
        The generated column, or None when generation failed.

    Raises:
        Exception: "Failure to generate column" for any other failure; the
            original error is chained as the cause.
    """
    # The on-disk cache of generated columns is disabled.
    writeToFile = False
    try:
        try:
            operator = OperatorsAssignmentsManager.getOperator(os.getOperator())
        except Exception:
            Logger.Info("Sleeping, try again")
            time.sleep(0.1)
            operator = OperatorsAssignmentsManager.getOperator(os.getOperator())
        operator.processTrainingSet(dataset, os.getSources(), os.getTargets())

        ci = None
        try:
            ci = operator.generate(dataset, os.getSources(), os.getTargets())
        except Exception as ex:
            # Best effort: keep going with ci = None, but do not hide the failure.
            Logger.Error("generateColumn -> primary operator failed for " + os.getName() + ": " + str(ex), ex)

        if ci is not None and os.getSecondaryOperator() is not None:
            replica = dataset.emptyReplica()
            replica.addColumn(ci)
            uOperator = os.getSecondaryOperator()
            tempList = [ci]
            try:
                uOperator.processTrainingSet(replica, tempList, None)
                ci = uOperator.generate(replica, tempList, None, True)
            except Exception as ex:
                # Best effort: fall back to the column of the primary operator.
                Logger.Error("generateColumn -> secondary operator failed for " + os.getName() + ": " + str(ex), ex)

        if finalAttribute and writeToFile:
            # Write the column to file, so we don't have to calculate it again.
            OperatorsAssignmentsManager.writeColumnInfoToFile(dataset.name, os.getName(), ci)
        return ci
    except Exception as ex:
        # NOTE(review): re-running the operator inside the handler looks like a
        # debugging leftover; if it raises again, the log line and the generic
        # exception below are skipped. Kept to preserve behavior — confirm.
        operator = OperatorsAssignmentsManager.getOperator(os.getOperator())
        operator.processTrainingSet(dataset, os.getSources(), os.getTargets())
        Logger.Error("Error while generating column: " + str(ex), ex)
        raise Exception("Failure to generate column") from ex
def initializeBackgroundModel(self, dataset: Dataset):
    """Load (or build) the background model and dataset features for *dataset*.

    Stores the background classification model (value-based meta-features
    included) in self.classifier and the dataset-based features, computed for
    Properties.classifier, in self.datasetAttributes.
    """
    Logger.Info('Initializing background model for dataset' + dataset.name)
    attributeManager = MLAttributeManager()
    self.classifier = attributeManager.getBackgroundClassificationModel(
        dataset, True)
    featureExtractor = DatasetBasedAttributes()
    self.datasetAttributes = featureExtractor.getDatasetBasedFeatures(
        dataset, Properties.classifier)
def getInstancesFromARFF(self, backgroundFilePath: str):
    """Read '<backgroundFilePath>.arff' and return what the Loader produces.

    Args:
        backgroundFilePath: path of the ARFF file without its '.arff' suffix.

    Returns:
        The result of Loader.readArffAsDataframe for that file.
    """
    arffPath = backgroundFilePath + '.arff'
    instances = Loader().readArffAsDataframe(arffPath)
    Logger.Info('reading from file ' + arffPath)
    return instances
def initializeBackgroundModel(self, dataset: Dataset):
    """Load (or build) the pre-ranking background model for *dataset*.

    Returns:
        A (classifier, datasetAttributes) tuple: the background classification
        model built without value-based meta-features, and the dataset-based
        features computed for Properties.classifier.
    """
    Logger.Info("Initializing background model for pre-ranking process")
    backgroundModel = MLAttributeManager().getBackgroundClassificationModel(
        dataset, False)
    datasetFeatures = DatasetBasedAttributes().getDatasetBasedFeatures(
        dataset, Properties.classifier)
    return backgroundModel, datasetFeatures
def createClassAttribute(self, originalAuc: float, datasetReplica: Dataset, evaluationResults1):
    """Build the binary class attribute of a candidate feature.

    The attribute value is 1 when the AUC computed from *evaluationResults1*
    exceeds *originalAuc* by more than 0.01, and 0 otherwise.

    Args:
        originalAuc: AUC obtained without the candidate attribute.
        datasetReplica: dataset whose dataframe is passed to CalculateAUC.
        evaluationResults1: evaluation results passed to CalculateAUC.

    Returns:
        A discrete AttributeInfo named "classAttribute".
    """
    deltaAuc = self.CalculateAUC(evaluationResults1, datasetReplica.df) - originalAuc
    improved = deltaAuc > 0.01
    classAttribute = AttributeInfo("classAttribute", Operator.outputType.Discrete,
                                   1 if improved else 0, 2)
    if improved:
        Logger.Info("found positive match with delta " + str(deltaAuc))
    return classAttribute
def generateAttributeAndCalculateFilterEvaluatorScore(
        dataset: Dataset, filterEvaluator: FilterEvaluator,
        subFoldTrainingDatasets: List[Dataset],
        currentScores: List[ClassificationResults],
        operatorAssignments: List[OperatorAssignment]):
    """Generate every candidate attribute and store its filter score on it.

    For each operator assignment the column is generated on a replica of
    *dataset*; when that succeeds and a filter evaluator is supplied, the
    attribute is scored on replicas of the training sub-folds. The score (or
    None when nothing could be computed) is stored on the assignment through
    setFilterEvaluatorScore.

    The work is dispatched through Parallel.ParallelForEach when
    Properties.numOfThreads is greater than 1, and runs sequentially otherwise.
    """
    Logger.Info(
        "generateAttributeAndCalculateFilterEvaluatorScore -> num of attributes to evaluate: "
        + str(len(operatorAssignments)))
    threadCount = Properties.numOfThreads

    def evaluateScore(oa):
        # Returns the filter score of `oa`, or None when it cannot be computed.
        try:
            datasetCopy = dataset.replicateDataset()
            generatedColumn = OperatorsAssignmentsManager.generateColumn(
                datasetCopy, oa, True)
            # Without a column or an evaluator there is nothing to score.
            if generatedColumn is None or filterEvaluator is None:
                return None

            evaluatorCopy = filterEvaluator.getCopy()
            subFoldCopies = [
                subFold.replicateDataset() for subFold in subFoldTrainingDatasets
            ]
            return OperatorsAssignmentsManager.EvaluateAttributeUsingTrainingSubFolds(
                subFoldCopies, evaluatorCopy, oa, currentScores)
        except Exception as ex:
            Logger.Error(
                "generateAttributeAndCalculateFilterEvaluatorScore -> error when generating and evaluating attribute: "
                + oa.getName(), ex)
            return None

    if threadCount > 1:
        scores = Parallel.ParallelForEach(
            evaluateScore, [[oa] for oa in operatorAssignments])
        # The i-th score is stored on the i-th assignment.
        for index in range(len(operatorAssignments)):
            operatorAssignments[index].setFilterEvaluatorScore(scores[index])
    else:
        for assignment in operatorAssignments:
            assignment.setFilterEvaluatorScore(evaluateScore(assignment))
def produceClassificationResults(self, datasets: list) -> list:
    """Train and evaluate Properties.classifier on every dataset (fold).

    Args:
        datasets: datasets providing a training and a test set through
            generateSet(True) / generateSet(False).

    Returns:
        One classification-results object per dataset, in input order.
    """
    resultsPerFold = []
    for fold in datasets:
        Logger.Info("Starting to run classifier " + str(Date()))
        trainingData = fold.generateSet(True)
        testData = fold.generateSet(False)
        evaluation = self.runClassifier(Properties.classifier, trainingData, testData)

        Logger.Info("Starting to process classification results " + str(Date()))
        foldResults = self.getClassificationResults(evaluation, fold, testData)

        Logger.Info("Done " + str(Date()))
        resultsPerFold.append(foldResults)
    return resultsPerFold
def generateMetaFeaturesInstances(self, includeValueBased: bool):
    """Create the meta-feature instances of every background dataset.

    A background dataset is skipped when its instance folder
    ('<file name>_<random seed>' under Properties.DatasetInstancesFilesLocation)
    already exists. Otherwise the ARFF file is loaded with a 0.66 training
    share and handed to createDatasetMetaFeaturesInstances.

    Args:
        includeValueBased: forwarded to createDatasetMetaFeaturesInstances.
    """
    for backgroundDatasetPath in self.getOriginalBackgroundDatasets():
        instancesFolder = (Properties.DatasetInstancesFilesLocation
                           + FileUtils.getFilenameFromPath(backgroundDatasetPath)
                           + '_' + str(Properties.randomSeed))
        # The folder is the marker that this dataset was handled before.
        if os.path.isdir(instancesFolder):
            continue

        arffLoader = Loader()
        Logger.Info("Getting candidate attributes for " + backgroundDatasetPath)
        backgroundDataset = arffLoader.readArff(
            backgroundDatasetPath, int(Properties.randomSeed), None, None, 0.66)
        self.createDatasetMetaFeaturesInstances(backgroundDataset, includeValueBased)
def getFilter(self, filterName: str, dataset: Dataset) -> FilterEvaluator:
    """Create the filter evaluator registered under *filterName*.

    Only the requested evaluator is constructed. Errors raised while
    constructing it propagate unchanged instead of being reported as an
    unknown evaluator name.

    Args:
        filterName: "InformationGainFilterEvaluator" or "MLFilterEvaluator".
        dataset: dataset handed to the MLFilterEvaluator constructor.

    Returns:
        A new evaluator instance.

    Raises:
        Exception: "Unidentified evaluator" when *filterName* is unknown.
    """
    Logger.Info("Getting filter evaluator - " + filterName)
    # Factories instead of instances, so nothing is built until it is chosen.
    evaluatorFactories = {
        "InformationGainFilterEvaluator": lambda: InformationGainFilterEvaluator(),
        "MLFilterEvaluator": lambda: MLFilterEvaluator(dataset),
    }
    factory = evaluatorFactories.get(filterName)
    if factory is None:
        raise Exception("Unidentified evaluator")
    return factory()
def buildClassifierModel(self, backgroundFilePath: str, data):
    """Train the background classifier, persist it and return it.

    A RandomForestClassifier is fitted on every column of *data* except
    'class', with 'class' as the target. The source file
    '<backgroundFilePath>.arff' is then deleted and the fitted classifier is
    written through writeClassifierTobackgroundFile.

    Args:
        backgroundFilePath: path of the background file without '.arff'.
        data: pandas DataFrame containing a 'class' column.

    Returns:
        The fitted classifier.
    """
    classifier = RandomForestClassifier()
    # drop() removes row labels by default; the target must be dropped as a column.
    features = data.drop(columns=['class'])
    classifier.fit(features, data['class'])

    arffFile = backgroundFilePath + '.arff'
    FileUtils.deleteFile(arffFile)

    Logger.Info('Saving classifier model ' + backgroundFilePath)
    self.writeClassifierTobackgroundFile(backgroundFilePath, classifier)
    return classifier
def run_list_of_commands(self, commands, dryrun):
    """Run *commands* in parallel, or only print them when *dryrun* is set.

    Args:
        commands: command objects exposing `workpath` and `getCommand()`.
        dryrun: when truthy, the commands are printed and not executed.

    Raises:
        Exception: a command finished with a non-zero exit code; the message
            contains the exit code, the command and its output.
    """
    if dryrun:
        Logger.Info("\nDryrun! Not executed commands: ", color="red")
        for cmd in commands:
            if cmd.workpath is None:
                print(cmd.getCommand() + "\n")
            else:
                print("cd %s ; %s\n" % (cmd.workpath, cmd.getCommand()))
        return

    # Respect the per-node process limit SLURM exports, if there is one.
    tasksPerNode = os.getenv("SLURM_NTASKS_PER_NODE")
    if tasksPerNode is not None:
        tasksPerNode = int(tasksPerNode)
        Logger.Info("Number of processes limited by SLURM to %d" % tasksPerNode)

    exitcodes = ShellScript.run_scripts_parallel(commands, nproc=tasksPerNode)

    # Each entry is indexed as (exit code, output, command).
    for c in exitcodes:
        if c[0] != 0:
            # Python 3 form of the former `raise Exception, "..."` statement.
            raise Exception(
                "exitcode: %d during parallel execution:\ncommand:\n%s\n\noutput:\n%s!"
                % (c[0], c[2], c[1]))
def getBackgroundClassificationModel(self, dataset: Dataset, includeValueBased: bool):
    """Return the background classification model of *dataset*.

    An existing model file is loaded and returned. Otherwise the model is
    generated, saved and returned (WARNING - takes time): the meta-feature
    instances of all background datasets are created, merged into one ARFF
    file, and a classifier is trained on that file.

    Args:
        dataset: dataset the background model is requested for.
        includeValueBased: whether value-based meta-features take part.
    """
    modelPath = self.getBackgroundFilePath(dataset, includeValueBased)

    if os.path.isfile(modelPath):
        Logger.Info("Background model already exists. Extracting from " + modelPath)
        return self.getClassificationModel(dataset, modelPath)

    Logger.Info("Background model doesn't exist for dataset " + dataset.name + ". Creating it...")

    # Get all the datasets that need to participate in the creation of the
    # background model.
    self.generateMetaFeaturesInstances(includeValueBased)
    attributeDirectories = self.getDirectoriesInFolder(
        Properties.DatasetInstancesFilesLocation)
    self.generateBackgroundARFFFileForDataset(
        dataset, modelPath, attributeDirectories, includeValueBased)

    # Load the contents of the ARFF file and train the classifier on them.
    trainingData = self.getInstancesFromARFF(modelPath)
    return self.buildClassifierModel(modelPath, trainingData)
def readArff(self, filePath: str, randomSeed: int, distinctValIndices: list,
             classAttIndex: str, trainingSetPercentageOfDataset: float) -> Dataset:
    """Load an ARFF file into a Dataset object.

    Args:
        filePath: path of the ARFF file.
        randomSeed: seed forwarded to GenerateFolds and stored in the Dataset.
        distinctValIndices: must be None; fold generation based on distinct
            value columns is not implemented.
        classAttIndex: name of the target column; None or '' selects the last
            column of the file.
        trainingSetPercentageOfDataset: forwarded to GenerateFolds.

    Returns:
        The Dataset, or None when anything went wrong (the error is logged).
    """
    try:
        data = arff.loadarff(filePath)
        df = pd.DataFrame(data[0])
        Logger.Info(f'num of attributes: {len(df.keys())}')
        Logger.Info(f'num of instances: {len(df.values)}')

        if classAttIndex is None or classAttIndex == '':
            targetClassName = df.keys()[-1]
        else:
            targetClassName = classAttIndex

        # Only the target column is decoded from bytes to str.
        df[targetClassName] = df[targetClassName].str.decode("utf-8")

        if distinctValIndices is not None:
            # Previously `folds` was simply left unbound here, which surfaced as
            # an opaque UnboundLocalError. Fail with an explicit reason; it is
            # logged by the handler below and None is returned, as before.
            raise NotImplementedError(
                'fold generation with distinct value columns is not supported')

        folds = self.GenerateFolds(df[targetClassName], randomSeed,
                                   trainingSetPercentageOfDataset)

        # Finally, we can create the Dataset object.
        return Dataset(df, folds, targetClassName, data[1].name, randomSeed,
                       Properties.maxNumberOfDiscreteValuesForInclusionInSet)
    except Exception as ex:
        Logger.Error(f'Exception in readArff. message: {ex}')
        return None
def generateTrainingSetDatasetAttributesWithoutValues(self, dataset):
    """Generate the value-free meta-features of all candidate attributes.

    For every classifier in Properties.classifiersForMLAttributesGeneration,
    all unary and non-unary operator assignments of *dataset* are enumerated.
    For each assignment the dataset-based and operator-assignment-based
    meta-features are collected, extended by the classifier identifier and
    the class attribute. Results are flushed to partial ARFF files every
    MLAttributeManager.ITERATION entries and once more after the last
    assignment of each classifier.

    A failure while processing one assignment is logged and the assignment
    is skipped.

    Args:
        dataset: dataset whose candidate attributes are analyzed.
    """
    Logger.Info("Generating dataset attributes for dataset: " + dataset.name)
    startDate = Date()

    # Structure: classifier -> operator assignment -> meta-feature type
    #            -> {feature index: AttributeInfo}
    candidateAttributesList = {}
    classifiers = Properties.classifiersForMLAttributesGeneration.split(',')

    # Obtaining the attributes for the dataset itself is straightforward.
    dba = DatasetBasedAttributes()
    for classifier in classifiers:
        candidateAttributesList[classifier] = {}
        originalAuc = self.getOriginalAuc(dataset, classifier)

        # Generate the dataset attributes.
        datasetAttributes = dba.getDatasetBasedFeatures(dataset, classifier)

        # 1) Replicate the dataset, create the discretized features and add
        #    them to the replica. The unary operators are evaluated like all
        #    other operator assignments.
        unaryOperators = OperatorsAssignmentsManager.getUnaryOperatorsList()
        unaryOperatorAssignments = OperatorsAssignmentsManager.getOperatorAssignments(
            dataset, None, unaryOperators, int(Properties.maxNumOfAttsInOperatorSource))
        replicatedDataset = self.generateDatasetReplicaWithDiscretizedAttributes(
            dataset, unaryOperatorAssignments)

        # 2) Obtain all other (non-unary) operator assignments. IMPORTANT: this
        #    is applied on the REPLICATED dataset so we can take advantage of
        #    the discretized features.
        nonUnaryOperators = OperatorsAssignmentsManager.getNonUnaryOperatorsList()
        nonUnaryOperatorAssignments = OperatorsAssignmentsManager.getOperatorAssignments(
            replicatedDataset, None, nonUnaryOperators,
            int(Properties.maxNumOfAttsInOperatorSource))

        # 3) Process unary and non-unary assignments together. Python lists
        #    have extend(), not the Java addAll().
        nonUnaryOperatorAssignments.extend(unaryOperatorAssignments)
        totalAssignments = len(nonUnaryOperatorAssignments)

        # Index of the next partial ARFF file.
        position = 0
        # TODO: keep it parallel, temporarily changed to single thread
        for oa in nonUnaryOperatorAssignments:
            try:
                # NOTE(review): the candidate column is never added to this
                # replica before the classifier runs — confirm this is intended
                # for the "without values" variant.
                datasetReplica = dataset.replicateDataset()

                # Meta-features that are "parent dependent" and do not require
                # the values of the new attribute.
                oaba = OperatorAssignmentBasedAttributes()
                oaMetaFeatures = oaba.getOperatorAssignmentBasedMetaFeatures(dataset, oa)

                evaluationInfo = self.runClassifier(
                    classifier, datasetReplica.generateSet(True),
                    datasetReplica.generateSet(False))
                evaluationResults1 = evaluationInfo.getEvaluationStats()

                # Add the identifier of the classifier that was used, then the
                # class attribute, each at the next free index.
                oaMetaFeatures[len(oaMetaFeatures)] = AttributeInfo(
                    "Classifier", Operator.outputType.Discrete,
                    self.getClassifierIndex(classifier), 3)
                oaMetaFeatures[len(oaMetaFeatures)] = self.createClassAttribute(
                    originalAuc, datasetReplica, evaluationResults1)

                # Register the entry only once it is complete. The per-assignment
                # dict was never created before, so every write raised KeyError.
                candidateAttributesList[classifier][oa] = {
                    MLAttributeManager.DATASET_BASED: datasetAttributes,
                    MLAttributeManager.OA_BASED: oaMetaFeatures,
                }

                pending = len(candidateAttributesList[classifier])
                if pending % 1000 == 0:
                    processed = position * MLAttributeManager.ITERATION + pending
                    Logger.Info(str(Date()) + ": Finished processing " + str(processed)
                                + '/' + str(totalAssignments)
                                + ' elements for background model')

                if pending % MLAttributeManager.ITERATION == 0:
                    self.savePartArffCandidateAttributes(
                        candidateAttributesList, classifier, dataset, position)
                    position += 1
                    candidateAttributesList[classifier].clear()
            except Exception as ex:
                Logger.Error("Error in ML features generation : " + oa.getName()
                             + " : " + str(ex))

        # Flush whatever is left for this classifier.
        self.savePartArffCandidateAttributes(
            candidateAttributesList, classifier, dataset, position)

    finishDate = Date()
    elapsed = finishDate - startDate
    Logger.Info("Getting candidate attributes for dataset " + dataset.name
                + " took " + str(elapsed.seconds) + " seconds")
def run(self, originalDataset: Dataset, runInfo: str):
    """Run the complete feature-generation experiment on *originalDataset*.

    Steps: create the evaluators, evaluate the initial attributes as a
    reference point, compute the baseline score and classification
    probabilities on the training sub-folds, apply the unary operators and
    add their columns to a working replica, then start the iterative search.

    Args:
        originalDataset: dataset to enrich; the attributes chosen during the
            search are added to it.
        runInfo: free-text description written with the results.

    Raises:
        Exception: Properties.wrapperApproach names an unknown wrapper.
    """
    Logger.Info('Initializing evaluators')
    filterEvaluator = MLFilterEvaluator(originalDataset)

    preRankerEvaluator = None
    # NOTE(review): bool() of a non-empty string such as 'false' is True —
    # confirm Properties.usePreRanker is a real boolean.
    if bool(Properties.usePreRanker):
        preRankerEvaluator = FilterPreRankerEvaluator(originalDataset)

    if Properties.wrapperApproach == 'AucWrapperEvaluator':
        wrapperEvaluator = AucWrapperEvaluator()
    else:
        Logger.Error('Missing wrapper approach')
        raise Exception('Missing wrapper approach')

    # Stored on the instance: it is read back from self right below and again
    # when the final statistics are written.
    self.experimentStartDate = Date()
    Logger.Info("Experiment Start Date/Time: " + str(self.experimentStartDate)
                + " for dataset " + originalDataset.name)

    # The first step is to evaluate the initial attributes, so we get a
    # reference point to how well we did.
    wrapperEvaluator.EvaluationAndWriteResultsToFile(
        originalDataset, "", 0, runInfo, True, 0, -1, -1)

    # Now we create the replica of the original dataset, to which we can add columns.
    dataset = originalDataset.replicateDataset()

    # Get the training set sub-folds, used to evaluate the candidate attributes.
    originalDatasetTrainingFolds = originalDataset.GenerateTrainingSetSubFolds()
    subFoldTrainingDatasets = dataset.GenerateTrainingSetSubFolds()

    date = Date()
    # Apply the wrapper on the training sub-folds to get the baseline score.
    # This is the score a candidate attribute needs to "beat".
    currentScore = wrapperEvaluator.produceAverageScore(
        subFoldTrainingDatasets, None, None, None, None)
    Logger.Info(f"Initial score: {str(currentScore)} : {date}")

    # The probabilities assigned to each instance using the ORIGINAL dataset
    # (training folds only).
    Logger.Info(f"Producing initial classification results: {date}")
    currentClassificationProbs = wrapperEvaluator.produceClassificationResults(
        originalDatasetTrainingFolds)
    date = Date()
    Logger.Info(f" .....done {date}")

    # Apply the unary operators (discretizers, normalizers) on all the original
    # features. The attributes generated here differ from those of later stages
    # because they are included in the dataset that is used to generate
    # attributes in the iterative search phase.
    Logger.Info(f"Starting to apply unary operators: {date}")
    oam = OperatorsAssignmentsManager()
    candidateAttributes = oam.applyUnaryOperators(
        dataset, None, filterEvaluator, subFoldTrainingDatasets,
        currentClassificationProbs)
    date = Date()
    Logger.Info(" .....done " + str(date))

    # Add the new attributes to the dataset. They are added even though they
    # may not be part of the final dataset, because they are essential to the
    # full generation of additional features.
    Logger.Info("Starting to generate and add columns to dataset: " + str(date))
    oam.GenerateAndAddColumnToDataset(dataset, candidateAttributes)
    date = Date()
    Logger.Info(" .....done " + str(date))

    # The initial dataset has been populated with the discretized/normalized
    # features. Time to begin the search.
    iterationsCounter = 1
    columnsAddedInthePreviousIteration = None
    self.performIterativeSearch(
        originalDataset, runInfo, preRankerEvaluator, filterEvaluator,
        wrapperEvaluator, dataset, originalDatasetTrainingFolds,
        subFoldTrainingDatasets, currentClassificationProbs, oam,
        candidateAttributes, iterationsCounter,
        columnsAddedInthePreviousIteration)
def performIterativeSearch(
        self, originalDataset: Dataset, runInfo: str,
        preRankerEvaluator: FilterPreRankerEvaluator,
        filterEvaluator: FilterEvaluator,
        wrapperEvaluator: AucWrapperEvaluator, dataset: Dataset,
        originalDatasetTrainingFolds: List[Dataset],
        subFoldTrainingDatasets: List[Dataset],
        currentClassificationProbs: List[ClassificationResults],
        oam: OperatorsAssignmentsManager,
        candidateAttributes: List[OperatorAssignment],
        iterationsCounter: int, columnsAddedInthePreviousIteration):
    """Iteratively add the best-scoring candidate attribute to the datasets.

    Each iteration, up to self.maxIteration, recalculates the filter scores of
    the existing candidates, generates new candidates with the non-unary
    operators, ranks and filters them, evaluates the survivors with the
    wrapper, and adds the candidate with the highest wrapper score to both
    *dataset* (used for further generation) and *originalDataset* (the final
    dataset). The search stops early when no candidate could be evaluated.
    Afterwards the background model is cleaned up and the final statistics
    are written.

    Args:
        originalDataset: final dataset; chosen attributes are added to it.
        runInfo: free-text description written with the results.
        preRankerEvaluator: optional pre-ranker used during generation.
        filterEvaluator: filter evaluator scoring the candidates.
        wrapperEvaluator: wrapper evaluator scoring the candidates.
        dataset: working replica used to generate further attributes.
        originalDatasetTrainingFolds: training sub-folds of originalDataset.
        subFoldTrainingDatasets: training sub-folds of the working replica.
        currentClassificationProbs: baseline classification results;
            recomputed after every added attribute.
        oam: manager used to apply the non-unary operators.
        candidateAttributes: candidates produced so far.
        iterationsCounter: number of the first iteration to run.
        columnsAddedInthePreviousIteration: columns added by the previous
            iteration, or None for the first one.
    """
    totalNumberOfWrapperEvaluations = 0
    rankerFilter = self.getRankerFilter(Properties.rankerApproach)

    # TODO: make sure not exceeding property "maxNumOfWrapperEvaluationsPerIteration"
    def evaluateOperationAssignment(
            oa: OperatorAssignment
    ) -> Tuple[float, Optional[OperatorAssignment]]:
        # Returns (wrapper score, oa) when the candidate passes the filter
        # threshold, and (0.0, None) when it is skipped or evaluation fails.
        try:
            if oa.getFilterEvaluatorScore() > 0.001:
                score = OperatorsAssignmentsManager.applyOperatorAndPerformWrapperEvaluation(
                    originalDatasetTrainingFolds, oa, wrapperEvaluator,
                    localCurrentClassificationProbs, None)
                oa.setWrapperEvaluatorScore(score)
                return (score, oa)
        except Exception as ex:
            Logger.Error(f"Exception occurred {ex}", ex)
        return (0.0, None)

    while iterationsCounter <= self.maxIteration:
        filterEvaluator.recalculateDatasetBasedFeatures(originalDataset)
        date = Date()
        Logger.Info(
            f"performIterativeSearch -> Starting search iteration {int(iterationsCounter)} : {str(date)}"
        )

        # Recalculate the filter evaluator score of the existing attributes.
        OperatorsAssignmentsManager.recalculateFilterEvaluatorScores(
            dataset, candidateAttributes, subFoldTrainingDatasets,
            filterEvaluator, currentClassificationProbs)

        # Now we generate all the candidate features.
        date = Date()
        Logger.Info(
            f"performIterativeSearch -> Starting feature generation: {str(date)}"
        )
        # Python lists have extend(), not the Java addAll().
        candidateAttributes.extend(
            oam.applyNonUnaryOperators(dataset,
                                       columnsAddedInthePreviousIteration,
                                       preRankerEvaluator, filterEvaluator,
                                       subFoldTrainingDatasets,
                                       currentClassificationProbs))
        date = Date()
        Logger.Info(
            f"performIterativeSearch -> Finished feature generation: {str(date)}"
        )

        # Sort the candidates by their initial (filter) score and test them
        # using the wrapper evaluator.
        candidateAttributes = rankerFilter.rankAndFilter(
            candidateAttributes, columnsAddedInthePreviousIteration,
            subFoldTrainingDatasets, currentClassificationProbs)

        Logger.Info(
            f"performIterativeSearch -> Starting wrapper evaluation : {str(date)}"
        )
        # Read by evaluateOperationAssignment through its closure.
        localCurrentClassificationProbs = currentClassificationProbs
        evaluatedCandidateAttrs = Parallel.ParallelForEach(
            evaluateOperationAssignment,
            [[oa] for oa in candidateAttributes])
        totalNumberOfWrapperEvaluations += len(evaluatedCandidateAttrs)
        Logger.Info(
            f"performIterativeSearch -> Finished wrapper evaluation : {str(date)}"
        )

        # Pick the candidate with the best observed wrapper score. Candidates
        # that were skipped or failed carry None and never win, even when all
        # real scores are negative. Selection is by position so the object
        # taken is the one held in candidateAttributes.
        # assumes ParallelForEach returns results in input order — TODO confirm
        evaluatedAttsCounter = 0
        bestIndex = None
        for index, (score, evaluatedOa) in enumerate(evaluatedCandidateAttrs):
            if evaluatedOa is None:
                continue
            evaluatedAttsCounter += 1
            if bestIndex is None or score > evaluatedCandidateAttrs[bestIndex][0]:
                bestIndex = index

        if bestIndex is None:
            Logger.Info(
                "performIterativeSearch -> No attribute was chosen, stopping the search")
            break

        # Remove the chosen attribute from the list of "candidates".
        chosenOperatorAssignment = candidateAttributes.pop(bestIndex)
        chosenOperatorAssignment.setWrapperEvaluatorScore(
            evaluatedCandidateAttrs[bestIndex][0])

        # The final step - add the new attribute to the datasets. Start with
        # the dataset used in the following search iterations...
        columnsAddedInthePreviousIteration = OperatorsAssignmentsManager.addAddtibuteToDataset(
            dataset, chosenOperatorAssignment, True,
            currentClassificationProbs)
        # ...and continue with the final dataset.
        OperatorsAssignmentsManager.addAddtibuteToDataset(
            originalDataset, chosenOperatorAssignment, False,
            currentClassificationProbs)

        # Finally, recalculate the baseline score used for the attribute
        # selection (using the updated final dataset).
        currentClassificationProbs = wrapperEvaluator.produceClassificationResults(
            originalDatasetTrainingFolds)

        wrapperEvaluator.EvaluationAndWriteResultsToFile(
            originalDataset, chosenOperatorAssignment.getName(),
            iterationsCounter, runInfo, False, evaluatedAttsCounter,
            chosenOperatorAssignment.getFilterEvaluatorScore(),
            chosenOperatorAssignment.getWrapperEvaluatorScore())
        iterationsCounter += 1

    # Some cleanup, if required.
    filterEvaluator.deleteBackgroundClassificationModel(originalDataset)

    # After the search process is over, write the total amount of time spent
    # and the number of wrapper evaluations that were conducted.
    self.writeFinalStatisticsToResultsFile(
        dataset.name, runInfo, self.experimentStartDate,
        totalNumberOfWrapperEvaluations)
class DataReader(): def __init__(self): self.Logger = Logger('./logs/', 'log') def find(self, name, path): for root, directory, files in os.walk(path): if name in files: return os.path.join(root, name), root return None, None def find_directory(self, name, path): for root, _, files in os.walk(path): directory = root.split('/')[-1] if name == directory: return root def read_kdef(self): images = [] labels = {} kdef_labels_path, _ = self.find('kdef_labels.csv', BaseDirectory) label_file = open(kdef_labels_path, 'r') lines = label_file.readlines()[1:] #find kdef image directory kdef_directory = self.find_directory('KDEF', BaseDirectory) self.Logger.Info("Reading KDEF images from: " + str(kdef_directory)) for line in lines: #split the csv imageName, _, _, _, _, trustworthiness, dominance, attractiveness = line.rstrip( '\n').split(',') imageName = imageName.upper() imagePath, _ = self.find(imageName, kdef_directory) if (imagePath == None): self.Logger.Error( "The image path was None while reading kdef data: " + str(imageName)) continue images.append(imagePath) labels[imagePath] = float(trustworthiness), float( dominance), float(attractiveness) return ({ 'train': images[:int(len(images) * 0.70)], 'validation': images[int(len(images) * .71):int(len(images) * .90)], 'test': images[int(len(images) * .91):int(len(images))] }, labels) def read_celeb_a(self): images = [] #load labels label_file_path, _ = self.find('list_attr_celeba.txt', BaseDirectory) label_file = open(label_file_path, 'r') labels_lines = label_file.readlines()[2:] labels = [] labels_dict = {} for line in labels_lines: #find image path imagePath = BaseDirectory + '/img_align_celeba/' + line.split( ' ', 1)[0] images.append(imagePath) #extract features line = line.strip().split(' ')[1:] #labels.append([int(x) for x in line[1:]]) for label in line: if (label == '-1' or label == '-1\n'): labels.append(0) elif (label == '1' or label == '1\n'): labels.append(1) labels_dict[imagePath] = labels labels = [] return ({ 
'train': images[:int(len(images) * .70)], 'validation': images[int(len(images) * .71):int(len(images) * .90)], 'test': images[int(len(images) * 0.91):int(len(images))] }, labels_dict) def Read_Splits(self, split_count): if (split_count >= len(SPLIT_LIST)): print("Split count outside of index range") return VALNAMES = [] TRAINNAMES = [] VALLABELS = [] TRAINLABELS = [] IMGNAMES = [] LABELS = [] labels_dict = {} for i in range(5): if SPLIT_LIST[i] != SPLIT_LIST[split_count]: f = open(SPLIT_LIST[i], 'r') for line in f: line = line.strip().split(',') temp = line[0].replace('.jpg', '.JPG') IMGNAMES.append(temp) LABELS.append(line[5]) LABELS.append(line[6]) LABELS.append(line[7]) f.close LABELS = np.asarray(LABELS) LABELS = np.reshape(LABELS, (-1, 3)) for i in range(len(LABELS)): TRAINNAMES.append(IMGNAMES[i]) TRAINLABELS.append(LABELS[i]) labels_dict[IMGNAMES[i]] = LABELS[i] # Read in validation data IMGNAMES = [] LABELS = [] f = open(SPLIT_LIST[split_count], 'r') for line in f: line = line.strip().split(',') temp = line[0].replace('.jpg', '.JPG') IMGNAMES.append(temp) LABELS.append(line[5]) LABELS.append(line[6]) LABELS.append(line[7]) f.close LABELS = np.asarray(LABELS) LABELS = np.reshape(LABELS, (-1, 3)) for i in range(len(LABELS)): VALNAMES.append(IMGNAMES[i]) VALLABELS.append(LABELS[i]) labels_dict[IMGNAMES[i]] = LABELS[i] return ({'train': TRAINNAMES, 'test': VALNAMES}, labels_dict) def Fix_Paths(self): f = open('./split_labels/kdef_split_five.csv', 'r') fout = open('./split_labels/kdef_split_five_fullPath.csv', 'w') for readline in f: line = readline.strip().split(',') imageName = line[0].replace('.jpg', '.JPG') fullPath, _ = self.find(imageName, './KDEF') newstring = readline.replace(line[0], fullPath) fout.write(newstring) f.close() fout.close() def weights_exist(self, file_path): return os.path.exists(file_path)