Example #1
    def calculateTprFprRate(self, evaluation, dataset, testSet) -> dict:
        date = Date()
        Logger.Info("Starting TPR/FPR calculations : " + str(date))

        # we convert the results into a format that's more comfortable to work with
        classificationItems = self.getClassificationItemList(
            testSet, evaluation)

        # now we need to know what is the minority class and the number of samples for each class
        minorityClassIndex = dataset.getMinorityClassIndex()
        rowsPerClass = dataset.getNumOfRowsPerClassInTestSet()
        numOfNonMinorityClassItems = 0  # all non-minority class samples are counted together (multi-class cases)
        for cls in rowsPerClass.keys():
            if cls != minorityClassIndex:
                numOfNonMinorityClassItems += rowsPerClass[cls]

        # sort all samples by their probability of belonging to the minority class
        classificationItems.sort(
            reverse=True,
            key=lambda x: x.getProbabilitiesOfClass(minorityClassIndex))

        tprFprValues = {0.0: 0.0}
        minoritySamplesCounter = 0
        majoritySamplesCounter = 0
        currentProb = 2  # sentinel above any valid probability
        for ci in classificationItems:
            currentSampleProb = ci.getProbabilitiesOfClass(minorityClassIndex)
            # if the probability is different, time to update the TPR/FPR statistics
            if currentSampleProb != currentProb:
                tpr = minoritySamplesCounter / rowsPerClass[minorityClassIndex]
                fpr = majoritySamplesCounter / numOfNonMinorityClassItems
                tprFprValues[tpr] = fpr
                currentProb = currentSampleProb

            if ci.getTrueClass() == minorityClassIndex:
                minoritySamplesCounter += 1
            else:
                majoritySamplesCounter += 1

        # close the curve at (1,1); the 1.0001 key appears to be a guard point for boundary lookups
        tprFprValues[1.0] = 1.0
        tprFprValues[1.0001] = 1.0
        date = Date()
        Logger.Info("Done : " + str(date))
        return tprFprValues
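
The returned mapping is keyed by TPR with the matching FPR as the value, so the ROC curve can be recovered by sorting along the FPR axis. A minimal sketch (assuming only the dict produced above) that approximates the AUC with the trapezoid rule:

    def auc_from_tpr_fpr(tprFprValues: dict) -> float:
        # re-orient to (fpr, tpr) points and sort along the FPR axis
        points = sorted((fpr, tpr) for tpr, fpr in tprFprValues.items())
        auc = 0.0
        for (x0, y0), (x1, y1) in zip(points, points[1:]):
            auc += (x1 - x0) * (y0 + y1) / 2.0  # trapezoid rule
        return auc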
Example #2
    def createDatasetMetaFeaturesInstances(self, dataset: Dataset, includeValueBased: bool):
        directoryForDataset = Properties.DatasetInstancesFilesLocation + dataset.name

        if os.path.isdir(directoryForDataset):
            _, _, filenames = next(os.walk(directoryForDataset))
            if filenames:
                Logger.Info('Candidate attributes for ' + dataset.name + ' were already calculated')
                return

        try:
            os.mkdir(directoryForDataset)
        except OSError as ex:
            if ex.errno != errno.EEXIST:
                Logger.Warn(f'getDatasetMetaFeaturesInstances -> Error creating directory {directoryForDataset}\nError: {ex}')
                raise

        if includeValueBased:
            # This is the line that activates the (time consuming) background datasets feature generation process
            self.generateTrainingSetDatasetAttributes(dataset)
            metadataTypes = [self.DATASET_BASED, self.OA_BASED, self.VALUES_BASED]
        else:
            # for pre-ranker model
            self.generateTrainingSetDatasetAttributesWithoutValues(dataset)
            metadataTypes = [self.DATASET_BASED, self.OA_BASED]

        self.appendARFFFilesPerMetadataTypeForDataset(directoryForDataset, metadataTypes)
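
A side note on the directory guard: the try/mkdir/EEXIST pattern above predates os.makedirs with exist_ok (Python 3.2+); the same idempotent creation can be written in one call, as in this sketch:

    import os

    # raises only on real failures; an already-existing directory is fine
    os.makedirs(directoryForDataset, exist_ok=True)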
Example #3
    def generateBackgroundARFFFileForDataset(self, dataset:Dataset, backgroundFilePath: str, candidateAttrDirectories: list, includeValueBased: bool):
        addHeader = True
        for candidateAttrDirectory in candidateAttrDirectories:

            # a non-None listing means the directory exists; skip the current dataset's own directory
            if dataset.name not in candidateAttrDirectory and FileUtils.listFilesInDir(candidateAttrDirectory) is not None:

                merged = self.getMergedFile(candidateAttrDirectory, includeValueBased)
                if merged is not None:
                    MLAttributeManager.addArffFileContentToTargetFile(backgroundFilePath, FileUtils.getAbsPath(merged[0]), addHeader)
                    addHeader = False

                else:
                    instances = []
                    for file in FileUtils.listFilesInDir(candidateAttrDirectory):
                        # take every non-merged ARFF; drop value-based files when they are excluded
                        if '.arff' in file and 'merged' not in file and (includeValueBased or self.VALUES_BASED not in file):
                            absFilePath = os.path.abspath(file)
                            instance = Loader().readArffAsDataframe(absFilePath)
                            instances.append(instance)
                        else:
                            Logger.Info(f'Skipping file: {file}')

                    mergedFile = self.mergeInstancesToFile(includeValueBased, candidateAttrDirectory, instances)
                    if mergedFile is None:
                        continue
                    self.addArffFileContentToTargetFile(backgroundFilePath, FileUtils.getAbsPath(mergedFile), addHeader)
                    addHeader = False
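
addArffFileContentToTargetFile is not shown in this listing. A plausible sketch, under the assumption that it appends one ARFF file's data rows to the target and copies the header section only when addHeader is set (exactly how the flag above is used):

    def addArffFileContentToTargetFile(targetPath: str, sourcePath: str, addHeader: bool):
        # assumption: ARFF layout is header lines up to and including @data, then one row per line
        with open(sourcePath) as src, open(targetPath, 'a') as dst:
            in_data = False
            for line in src:
                if in_data or addHeader:
                    dst.write(line)
                if line.strip().lower() == '@data':
                    in_data = True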
Example #4
    def generateColumn(dataset: Dataset, os: OperatorAssignment,
                       finalAttribute: bool):
        # note: the 'os' parameter shadows the os module within this function
        writeToFile = False
        try:
            ci = None
            # No writing to files
            # if finalAttribute and writeToFile:
            #     ci = OperatorsAssignmentsManager.readColumnInfoFromFile(dataset.name, os.getName())
            if ci is None:
                operator = None
                try:
                    operator = OperatorsAssignmentsManager.getOperator(
                        os.getOperator())

                except Exception:
                    # transient failure - wait briefly and retry once
                    Logger.Info("Sleeping, try again")
                    time.sleep(0.1)
                    operator = OperatorsAssignmentsManager.getOperator(
                        os.getOperator())

                operator.processTrainingSet(dataset, os.getSources(),
                                            os.getTargets())

                try:
                    ci = operator.generate(dataset, os.getSources(),
                                           os.getTargets())

                except Exception:
                    # generation failed; ci stays None and is returned as such
                    pass

                if (ci is not None) and (os is not None) and (
                        os.getSecondaryOperator() is not None):
                    replica = dataset.emptyReplica()
                    replica.addColumn(ci)
                    uOperator = os.getSecondaryOperator()
                    tempList = []
                    tempList.append(ci)
                    try:
                        uOperator.processTrainingSet(replica, tempList, None)
                        ci2 = uOperator.generate(replica, tempList, None, True)
                        ci = ci2

                    except Exception:
                        # keep the untransformed column if the secondary operator fails
                        pass

                if finalAttribute and writeToFile:
                    # write the column to file, so we don't have to calculate it again
                    OperatorsAssignmentsManager.writeColumnInfoToFile(
                        dataset.name, os.getName(), ci)

            return ci

        except Exception as ex:
            # reproduce the failing operator run (helpful when debugging), then report and re-raise
            operator = OperatorsAssignmentsManager.getOperator(
                os.getOperator())
            operator.processTrainingSet(dataset, os.getSources(),
                                        os.getTargets())
            Logger.Error("Error while generating column: " + str(ex), ex)
            raise Exception("Failure to generate column") from ex
Example #5
    def initializeBackgroundModel(self, dataset: Dataset):
        Logger.Info('Initializing background model for dataset ' + dataset.name)
        mlam = MLAttributeManager()
        self.classifier = mlam.getBackgroundClassificationModel(dataset, True)

        dba = DatasetBasedAttributes()
        self.datasetAttributes = dba.getDatasetBasedFeatures(
            dataset, Properties.classifier)
Example #6
    def getInstancesFromARFF(self, backgroundFilePath: str):
        data = Loader().readArffAsDataframe(backgroundFilePath + '.arff')
        Logger.Info('reading from file ' + backgroundFilePath + '.arff')
        return data
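Example #7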
    def initializeBackgroundModel(self, dataset: Dataset):
        Logger.Info("Initializing background model for pre-ranking process")
        mlam = MLAttributeManager()
        classifier = mlam.getBackgroundClassificationModel(dataset, False)

        dba = DatasetBasedAttributes()
        datasetAttributes = dba.getDatasetBasedFeatures(
            dataset, Properties.classifier)
        return classifier, datasetAttributes
Example #8
    def createClassAttribute(self, originalAuc: float, datasetReplica: Dataset, evaluationResults1):
        auc = self.CalculateAUC(evaluationResults1, datasetReplica.df)
        deltaAuc = auc - originalAuc
        if deltaAuc > 0.01:
            classAttribute = AttributeInfo("classAttribute", Operator.outputType.Discrete, 1, 2)
            Logger.Info("found positive match with delta " + str(deltaAuc))
        else:
            classAttribute = AttributeInfo("classAttribute", Operator.outputType.Discrete, 0, 2)
        return classAttribute
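
The method encodes the meta-learning label: a candidate attribute counts as a positive example only when it lifts the AUC by more than 0.01. The same rule, restated as a standalone sketch (the name labelCandidate is illustrative):

    def labelCandidate(originalAuc: float, newAuc: float, threshold: float = 0.01) -> int:
        # meta-label: 1 when the candidate attribute improved AUC by more than the threshold
        return 1 if (newAuc - originalAuc) > threshold else 0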
Example #9
    def generateAttributeAndCalculateFilterEvaluatorScore(
            dataset: Dataset, filterEvaluator: FilterEvaluator,
            subFoldTrainingDatasets: List[Dataset],
            currentScores: List[ClassificationResults],
            operatorAssignments: List[OperatorAssignment]):
        Logger.Info(
            "generateAttributeAndCalculateFilterEvaluatorScore -> num of attributes to evaluate: "
            + str(len(operatorAssignments)))
        numOfThread = Properties.numOfThreads

        def evaluateScore(oa):
            try:
                replicatedDataset = dataset.replicateDataset()
                # stays None when the column could not be generated or no filter evaluator was given
                filterEvaluatorScore = None

                ci = OperatorsAssignmentsManager.generateColumn(
                    replicatedDataset, oa, True)
                # if the filter evaluator is not None, we'll conduct the initial evaluation of the new attribute
                if (ci is not None) and (filterEvaluator is not None):
                    cloneEvaluator = filterEvaluator.getCopy()
                    replicatedSubFoldsList = []
                    for subFoldDataset in subFoldTrainingDatasets:
                        replicatedSubFoldsList.append(
                            subFoldDataset.replicateDataset())

                    filterEvaluatorScore = OperatorsAssignmentsManager.EvaluateAttributeUsingTrainingSubFolds(
                        replicatedSubFoldsList, cloneEvaluator, oa,
                        currentScores)
                return filterEvaluatorScore

            except Exception as ex:
                Logger.Error(
                    "generateAttributeAndCalculateFilterEvaluatorScore -> error when generating and evaluating attribute: "
                    + oa.getName(), ex)
                return None

        if numOfThread > 1:
            filterEvaluatorScores = Parallel.ParallelForEach(
                evaluateScore, [[oa] for oa in operatorAssignments])
            for i, oa in enumerate(operatorAssignments):
                oa.setFilterEvaluatorScore(filterEvaluatorScores[i])

        else:
            for oa in operatorAssignments:
                oa.setFilterEvaluatorScore(evaluateScore(oa))
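
Parallel.ParallelForEach is a project helper that is not shown here; judging by the call site it maps a function over per-item argument lists and returns the results in input order. A stand-in sketch with the standard library, under that assumption:

    from concurrent.futures import ThreadPoolExecutor

    def parallel_for_each(func, argsList, max_workers=None):
        # order-preserving map of func over unpacked argument lists
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            return list(pool.map(lambda args: func(*args), argsList))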
Example #10
    def produceClassificationResults(self, datasets: list) -> list:
        classificationResultsPerFold = []
        for dataset in datasets:
            date = Date()
            Logger.Info("Starting to run classifier " + str(date))
            trainSet = dataset.generateSet(True)
            testSet = dataset.generateSet(False)
            evaluationResults = self.runClassifier(Properties.classifier,
                                                   trainSet, testSet)
            date = Date()
            Logger.Info("Starting to process classification results " +
                        str(date))
            classificationResults = self.getClassificationResults(
                evaluationResults, dataset, testSet)
            date = Date()
            Logger.Info("Done " + str(date))
            classificationResultsPerFold.append(classificationResults)

        return classificationResultsPerFold
Example #11
    def generateMetaFeaturesInstances(self, includeValueBased: bool):
        datasetFilesForBackgroundArray = self.getOriginalBackgroundDatasets()
        for datasetForBackgroundModel in datasetFilesForBackgroundArray:
            possibleFolderName = Properties.DatasetInstancesFilesLocation + \
                                 FileUtils.getFilenameFromPath(datasetForBackgroundModel) + '_' + str(Properties.randomSeed)

            if not os.path.isdir(possibleFolderName):
                loader = Loader()
                Logger.Info("Getting candidate attributes for " + datasetForBackgroundModel)
                backgroundDataset = loader.readArff(datasetForBackgroundModel, int(Properties.randomSeed), None, None, 0.66)
                self.createDatasetMetaFeaturesInstances(backgroundDataset, includeValueBased)
Example #12
    def getFilter(self, filterName: str, dataset: Dataset) -> FilterEvaluator:
        Logger.Info("Getting filter evaluator - " + filterName)
        # dict-based switch over the evaluator name
        try:
            return {
                "InformationGainFilterEvaluator":
                InformationGainFilterEvaluator(),
                "MLFilterEvaluator": MLFilterEvaluator(dataset)
            }[filterName]
        except KeyError:
            raise Exception("Unidentified evaluator")
Example #13
    def buildClassifierModel(self, backgroundFilePath: str, data):
        # the chosen classifier
        classifier = RandomForestClassifier()
        # train on every column except the class label
        classifier.fit(data.drop(columns=['class']), data['class'])
        # the ARFF source is no longer needed once the model is fitted
        file = backgroundFilePath + '.arff'
        FileUtils.deleteFile(file)

        Logger.Info('Saving classifier model ' + backgroundFilePath)
        self.writeClassifierTobackgroundFile(backgroundFilePath, classifier)
        return classifier
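
writeClassifierTobackgroundFile is defined elsewhere; a plausible sketch using pickle (an assumption - the project may serialize the model differently):

    import pickle

    def writeClassifierTobackgroundFile(self, backgroundFilePath: str, classifier):
        # persist the fitted model so later runs can skip training
        with open(backgroundFilePath, 'wb') as f:
            pickle.dump(classifier, f)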
Example #14
    def run_list_of_commands(self, commands, dryrun):
        # run the commands
        if dryrun:
            Logger.Info("\nDryrun! Commands that were not executed: ", color="red")
            for cmd in commands:
                if cmd.workpath is None:
                    print(cmd.getCommand() + "\n")
                else:
                    print("cd %s ; %s\n" % (cmd.workpath, cmd.getCommand()))
        else:
            SLURM_NTASKS_PER_NODE = os.getenv("SLURM_NTASKS_PER_NODE")
            if SLURM_NTASKS_PER_NODE is not None:
                SLURM_NTASKS_PER_NODE = int(SLURM_NTASKS_PER_NODE)
                Logger.Info("Number of processes limited by SLURM to %d" %
                            SLURM_NTASKS_PER_NODE)
            exitcodes = ShellScript.run_scripts_parallel(
                commands, nproc=SLURM_NTASKS_PER_NODE)
            # check for any errors
            for c in exitcodes:
                if c[0] != 0:
                    raise Exception(
                        "exitcode: %d during parallel execution:\ncommand:\n%s\n\noutput:\n%s!"
                        % (c[0], c[2], c[1]))
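
The error loop above expects (exitcode, output, command) tuples from ShellScript.run_scripts_parallel, which is not shown. A plausible stand-in under that assumption (subprocess plus a thread pool; nproc=None falls back to the executor's default):

    import subprocess
    from concurrent.futures import ThreadPoolExecutor

    def run_scripts_parallel(commands, nproc=None):
        # run each command in its own shell; collect (exitcode, output, command)
        def run_one(cmd):
            line = cmd.getCommand() if cmd.workpath is None \
                else "cd %s ; %s" % (cmd.workpath, cmd.getCommand())
            proc = subprocess.run(line, shell=True, capture_output=True, text=True)
            return (proc.returncode, proc.stdout + proc.stderr, line)

        with ThreadPoolExecutor(max_workers=nproc) as pool:
            return list(pool.map(run_one, commands))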
Example #15
    def getBackgroundClassificationModel(self, dataset: Dataset, includeValueBased: bool):
        backgroundFilePath = self.getBackgroundFilePath(dataset, includeValueBased)

        # If the classification model already exists, load and return it
        if os.path.isfile(backgroundFilePath):
            Logger.Info("Background model already exists. Extracting from " + backgroundFilePath)
            return self.getClassificationModel(dataset, backgroundFilePath)

        # Otherwise, generate, save and return it (WARNING - takes time)
        else:
            Logger.Info("Background model doesn't exist for dataset " + dataset.name + ". Creating it...")

            # We begin by getting a list of all the datasets that need to participate in the creation of the background model
            self.generateMetaFeaturesInstances(includeValueBased)

            candidateAttrDirectories = self.getDirectoriesInFolder(Properties.DatasetInstancesFilesLocation)
            self.generateBackgroundARFFFileForDataset(dataset, backgroundFilePath, candidateAttrDirectories, includeValueBased)

            # now we load the contents of the ARFF file into an Instances object and train the classifier
            data = self.getInstancesFromARFF(backgroundFilePath)
            return self.buildClassifierModel(backgroundFilePath, data)
Example #16
    def readArff(self, filePath: str, randomSeed: int,
                 distinctValIndices: list, classAttIndex: str,
                 trainingSetPercentageOfDataset: float) -> Dataset:

        try:
            data = arff.loadarff(filePath)
            df = pd.DataFrame(data[0])

            Logger.Info(f'num of attributes: {len(df.keys())}')
            Logger.Info(f'num of instances: {len(df.values)}')

            if (classAttIndex is None) or (classAttIndex == ''):
                targetClassName = df.keys()[-1]
            else:
                targetClassName = classAttIndex
            df[targetClassName] = df[targetClassName].str.decode("utf-8")

            folds = None
            if distinctValIndices is None:
                folds = self.GenerateFolds(df[targetClassName], randomSeed,
                                           trainingSetPercentageOfDataset)
            else:
                pass  # TODO: missing func? folds remains None on this path

            distinctValColumnInfos = []
            if distinctValIndices is not None:
                for distinctColumnIndex in distinctValIndices:
                    distinctValColumnInfos.append(df[distinctColumnIndex])

            # Finally, we can create the Dataset object
            return Dataset(
                df, folds, targetClassName, data[1].name, randomSeed,
                Properties.maxNumberOfDiscreteValuesForInclusionInSet)

        except Exception as ex:
            Logger.Error(f'Exception in readArff. message: {ex}')
            return None
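
GenerateFolds is referenced but not shown. A minimal sketch with sklearn (both the dependency and the returned shape are assumptions; the real method may produce Fold objects):

    from sklearn.model_selection import train_test_split

    def GenerateFolds(self, targetColumn, randomSeed: int, trainingSetPercentage: float):
        # stratified train/test split over row indices, keyed by the class column
        trainIdx, testIdx = train_test_split(
            list(range(len(targetColumn))),
            train_size=trainingSetPercentage,
            random_state=randomSeed,
            stratify=targetColumn)
        return [trainIdx, testIdx]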
Example #17
    def generateTrainingSetDatasetAttributesWithoutValues(self, dataset):
        Logger.Info("Generating dataset attributes for dataset: " + dataset.name)

        startDate = Date()
        # The structure: classifier -> candidate feature (operator assignment, to be exact) -> meta-feature type -> a map of feature indices and values
        # { classifier:
        #     { OperatorAssignment:
        #           { meta-feature type: {index: value} } } }
        candidateAttributesList = {}

        classifiers = Properties.classifiersForMLAttributesGeneration.split(',')

        # obtaining the attributes for the dataset itself is straightforward
        dba = DatasetBasedAttributes()
        for classifier in classifiers:
            candidateAttributesList[classifier] = {}
            originalAuc = self.getOriginalAuc(dataset, classifier)

            # Generate the dataset attributes
            datasetAttributes = dba.getDatasetBasedFeatures(dataset, classifier)

            # now we need to generate the candidate attributes and evaluate them. This requires a few preliminary steps:
            # 1) Replicate the dataset and create the discretized features and add them to the dataset
            unaryOperators = OperatorsAssignmentsManager.getUnaryOperatorsList()

            # The unary operators need to be evaluated like all other operator assignments (i.e. attribute generation)
            unaryOperatorAssignments = OperatorsAssignmentsManager.getOperatorAssignments(dataset, None, unaryOperators, int(Properties.maxNumOfAttsInOperatorSource))
            replicatedDataset = self.generateDatasetReplicaWithDiscretizedAttributes(dataset, unaryOperatorAssignments)

            # 2) Obtain all other operator assignments (non-unary). IMPORTANT: this is applied on the REPLICATED dataset so we can take advantage of the discretized features
            nonUnaryOperators = OperatorsAssignmentsManager.getNonUnaryOperatorsList()
            nonUnaryOperatorAssignments = OperatorsAssignmentsManager.getOperatorAssignments(replicatedDataset, None, nonUnaryOperators, int(Properties.maxNumOfAttsInOperatorSource))

            # 3) Generate the candidate attributes and their meta-features
            nonUnaryOperatorAssignments.extend(unaryOperatorAssignments)

            position = [0]  # chunk index for the partial ARFF dumps

            # TODO: keep it parallel; temporarily changed to single thread
            for oa in nonUnaryOperatorAssignments:
                try:
                    datasetReplica = dataset.replicateDataset()

                    # Here we generate all the meta-features that are "parent dependent" and do not require us to generate the values of the new attribute
                    oaba = OperatorAssignmentBasedAttributes()

                    candidateAttributeValuesFreeMetaFeatures = oaba.getOperatorAssignmentBasedMetaFeatures(dataset, oa)


                    evaluationInfo = self.runClassifier(classifier, datasetReplica.generateSet(True), datasetReplica.generateSet(False))
                    evaluationResults1 = evaluationInfo.getEvaluationStats()

                    # register this candidate's meta-features by type (the inner map must be created first)
                    candidateAttributesList[classifier][oa] = {}
                    candidateAttributesList[classifier][oa][MLAttributeManager.DATASET_BASED] = datasetAttributes
                    # Add the identifier of the classifier that was used
                    classifierAttribute = AttributeInfo("Classifier", Operator.outputType.Discrete, self.getClassifierIndex(classifier), 3)
                    candidateAttributeValuesFreeMetaFeatures[len(candidateAttributeValuesFreeMetaFeatures)] = classifierAttribute
                    candidateAttributesList[classifier][oa][MLAttributeManager.OA_BASED] = candidateAttributeValuesFreeMetaFeatures

                    # value-based meta-features are intentionally skipped in this "without values" variant
                    oaBased = candidateAttributesList[classifier][oa][MLAttributeManager.OA_BASED]
                    oaBased[len(oaBased)] = self.createClassAttribute(originalAuc, datasetReplica, evaluationResults1)


                    if (len(candidateAttributesList[classifier]) % 1000) == 0:
                        date = Date()
                        Logger.Info(f"{date}: Finished processing {position[0] * MLAttributeManager.ITERATION + len(candidateAttributesList[classifier])}/{len(nonUnaryOperatorAssignments)} elements for background model")

                    # dump a chunk to disk every ITERATION candidates, then start a fresh map
                    if (len(candidateAttributesList[classifier]) % MLAttributeManager.ITERATION) == 0:
                        self.savePartArffCandidateAttributes(candidateAttributesList, classifier, dataset, position[0])
                        position[0] += 1
                        candidateAttributesList[classifier].clear()
                except Exception as ex:
                    Logger.Error("Error in ML features generation : " + oa.getName() + "  :  " + str(ex))


            self.savePartArffCandidateAttributes(candidateAttributesList, classifier, dataset, position[0])

        finishDate = Date()
        elapsed = finishDate - startDate
        Logger.Info("Getting candidate attributes for dataset " + dataset.name + " took " + str(elapsed.seconds) + " seconds")
    def run(self, originalDataset: Dataset, runInfo: str):
        Logger.Info('Initializing evaluators')
        filterEvaluator = MLFilterEvaluator(originalDataset)

        preRankerEvaluator = None
        if bool(Properties.usePreRanker):
            preRankerEvaluator = FilterPreRankerEvaluator(originalDataset)

        if Properties.wrapperApproach == 'AucWrapperEvaluator':
            wrapperEvaluator = AucWrapperEvaluator()
        else:
            Logger.Error('Missing wrapper approach')
            raise Exception('Missing wrapper approach')

        self.experimentStartDate = Date()
        Logger.Info("Experiment Start Date/Time: " +
                    str(self.experimentStartDate) + " for dataset " +
                    originalDataset.name)

        # The first step is to evaluate the initial attributes, so we get a reference point to how well we did
        wrapperEvaluator.EvaluationAndWriteResultsToFile(
            originalDataset, "", 0, runInfo, True, 0, -1, -1)

        # now we create the replica of the original dataset, to which we can add columns
        dataset = originalDataset.replicateDataset()

        # Get the training set sub-folds, used to evaluate the various candidate attributes
        originalDatasetTrainingFolds = originalDataset.GenerateTrainingSetSubFolds()
        subFoldTrainingDatasets = dataset.GenerateTrainingSetSubFolds()

        date = Date()

        # We now apply the wrapper on the training subfolds in order to get the baseline score. This is the score a candidate attribute needs to "beat"
        currentScore = wrapperEvaluator.produceAverageScore(
            subFoldTrainingDatasets, None, None, None, None)
        Logger.Info(f"Initial score: {str(currentScore)} : {date}")

        # The probabilities assigned to each instance using the ORIGINAL dataset (training folds only)
        Logger.Info(f"Producing initial classification results: {date}")
        currentClassificationProbs = wrapperEvaluator.produceClassificationResults(
            originalDatasetTrainingFolds)
        date = Date()
        Logger.Info(f"  .....done {date}")

        # Apply the unary operators (discretizers, normalizers) on all the original features. The attributes generated
        # here are different than the ones generated at later stages because they are included in the dataset that is
        # used to generate attributes in the iterative search phase
        Logger.Info(f"Starting to apply unary operators: {date}")
        oam = OperatorsAssignmentsManager()
        candidateAttributes = oam.applyUnaryOperators(
            dataset, None, filterEvaluator, subFoldTrainingDatasets,
            currentClassificationProbs)
        date = Date()
        Logger.Info("  .....done " + str(date))

        # Now we add the new attributes to the dataset (they are added even though they may not be included in the
        # final dataset, because they are essential to the full generation of additional features)
        Logger.Info("Starting to generate and add columns to dataset: " +
                    str(date))
        oam.GenerateAndAddColumnToDataset(dataset, candidateAttributes)
        date = Date()
        Logger.Info("  .....done " + str(date))

        # The initial dataset has been populated with the discretized/normalized features. Time to begin the search
        iterationsCounter = 1
        columnsAddedInthePreviousIteration = None

        self.performIterativeSearch(
            originalDataset, runInfo, preRankerEvaluator, filterEvaluator,
            wrapperEvaluator, dataset, originalDatasetTrainingFolds,
            subFoldTrainingDatasets, currentClassificationProbs, oam,
            candidateAttributes, iterationsCounter,
            columnsAddedInthePreviousIteration)
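Example #19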
    def performIterativeSearch(
            self, originalDataset: Dataset, runInfo: str,
            preRankerEvaluator: FilterPreRankerEvaluator,
            filterEvaluator: FilterEvaluator,
            wrapperEvaluator: AucWrapperEvaluator, dataset: Dataset,
            originalDatasetTrainingFolds: List[Dataset],
            subFoldTrainingDatasets: List[Dataset],
            currentClassificationProbs: List[ClassificationResults],
            oam: OperatorsAssignmentsManager,
            candidateAttributes: List[OperatorAssignment],
            iterationsCounter: int, columnsAddedInthePreviousIteration):
        totalNumberOfWrapperEvaluations = 0
        rankerFilter = self.getRankerFilter(Properties.rankerApproach)

        #TODO: make sure not exceeding property "maxNumOfWrapperEvaluationsPerIteration"
        def evaluateOperationAssignment(
            oa: OperatorAssignment
        ) -> Tuple[float, Optional[OperatorAssignment]]:
            try:
                if oa.getFilterEvaluatorScore() != float(
                        '-inf') and oa.getFilterEvaluatorScore() > 0.001:
                    score = OperatorsAssignmentsManager.applyOperatorAndPerformWrapperEvaluation(
                        originalDatasetTrainingFolds, oa, wrapperEvaluator,
                        localCurrentClassificationProbs, None)
                    oa.setWrapperEvaluatorScore(score)
                    return (score, oa)

            except Exception as ex:
                Logger.Error(f"Exception occurred {ex}", ex)
            return (0.0, None)

        while iterationsCounter <= self.maxIteration:
            filterEvaluator.recalculateDatasetBasedFeatures(originalDataset)
            date = Date()
            Logger.Info(
                f"performIterativeSearch -> Starting search iteration {iterationsCounter}: {date}"
            )

            # recalculate the filter evaluator score of the existing attributes
            OperatorsAssignmentsManager.recalculateFilterEvaluatorScores(
                dataset, candidateAttributes, subFoldTrainingDatasets,
                filterEvaluator, currentClassificationProbs)

            # now we generate all the candidate features
            date = Date()
            Logger.Info(
                f"performIterativeSearch ->            Starting feature generation:  {str(date)}"
            )
            candidateAttributes.extend(
                oam.applyNonUnaryOperators(dataset,
                                           columnsAddedInthePreviousIteration,
                                           preRankerEvaluator, filterEvaluator,
                                           subFoldTrainingDatasets,
                                           currentClassificationProbs))
            date = Date()
            Logger.Info(
                f"performIterativeSearch ->            Finished feature generation: {str(date)}"
            )

            # Sort the candidates by their initial (filter) score and test them using the wrapper evaluator
            candidateAttributes = rankerFilter.rankAndFilter(
                candidateAttributes, columnsAddedInthePreviousIteration,
                subFoldTrainingDatasets, currentClassificationProbs)

            Logger.Info(
                f"performIterativeSearch ->            Starting wrapper evaluation : {str(date)}"
            )
            localCurrentClassificationProbs = currentClassificationProbs
            evaluatedCandidateAttrs = Parallel.ParallelForEach(
                evaluateOperationAssignment,
                [[oa] for oa in candidateAttributes])
            from operator import itemgetter
            # choose the assignment with the best observed wrapper score; the original
            # early-exit stopping criteria (isStoppingCriteriaMet) has not been ported yet
            tempTopRank = max(evaluatedCandidateAttrs, key=itemgetter(0))
            chosenOperatorAssignment = tempTopRank[1]
            evaluatedAttsCounter = len(evaluatedCandidateAttrs)

            totalNumberOfWrapperEvaluations += len(evaluatedCandidateAttrs)
            date = Date()
            Logger.Info(
                f"performIterativeSearch ->            Finished wrapper evaluation : {str(date)}"
            )

            # remove the chosen attribute from the list of "candidates"
            candidateAttributes.remove(chosenOperatorAssignment)

            # The final step - add the new attribute to the datasets
            # start with the dataset used in the following search iterations
            columnsAddedInthePreviousIteration = OperatorsAssignmentsManager.addAddtibuteToDataset(
                dataset, chosenOperatorAssignment, True,
                currentClassificationProbs)

            # continue with the final dataset
            OperatorsAssignmentsManager.addAddtibuteToDataset(
                originalDataset, chosenOperatorAssignment, False,
                currentClassificationProbs)

            # finally, we need to recalculate the baseline score used for the attribute selection (using the updated final dataset)
            currentClassificationProbs = wrapperEvaluator.produceClassificationResults(
                originalDatasetTrainingFolds)

            expDescription = ''
            expDescription += f"Evaluation results for iteration {str(iterationsCounter)}\n"
            expDescription += f"Added attribute: {chosenOperatorAssignment.getName()}\n"
            wrapperEvaluator.EvaluationAndWriteResultsToFile(
                originalDataset, chosenOperatorAssignment.getName(),
                iterationsCounter, runInfo, False, evaluatedAttsCounter,
                chosenOperatorAssignment.getFilterEvaluatorScore(),
                chosenOperatorAssignment.getWrapperEvaluatorScore())
            iterationsCounter += 1

        # some cleanup, if required
        filterEvaluator.deleteBackgroundClassificationModel(originalDataset)

        # After the search process is over, write the total amount of time spent and the number of wrapper evaluations that were conducted
        self.writeFinalStatisticsToResultsFile(
            dataset.name, runInfo, self.experimentStartDate,
            totalNumberOfWrapperEvaluations)
Example #20
class DataReader():
    def __init__(self):
        self.Logger = Logger('./logs/', 'log')

    def find(self, name, path):
        for root, _, files in os.walk(path):
            if name in files:
                return os.path.join(root, name), root
        return None, None

    def find_directory(self, name, path):
        for root, _, _ in os.walk(path):
            if os.path.basename(root) == name:
                return root
        return None

    def read_kdef(self):

        images = []
        labels = {}

        kdef_labels_path, _ = self.find('kdef_labels.csv', BaseDirectory)
        with open(kdef_labels_path, 'r') as label_file:
            lines = label_file.readlines()[1:]  # skip the CSV header row

        #find kdef image directory
        kdef_directory = self.find_directory('KDEF', BaseDirectory)

        self.Logger.Info("Reading KDEF images from: " + str(kdef_directory))

        for line in lines:

            #split the csv
            imageName, _, _, _, _, trustworthiness, dominance, attractiveness = line.rstrip(
                '\n').split(',')
            imageName = imageName.upper()
            imagePath, _ = self.find(imageName, kdef_directory)

            if imagePath is None:
                self.Logger.Error(
                    "The image path was None while reading kdef data: " +
                    str(imageName))
                continue

            images.append(imagePath)

            labels[imagePath] = float(trustworthiness), float(
                dominance), float(attractiveness)

        return ({
            'train': images[:int(len(images) * 0.70)],
            'validation': images[int(len(images) * 0.71):int(len(images) * 0.90)],
            'test': images[int(len(images) * 0.91):]
        }, labels)

    def read_celeb_a(self):

        images = []

        #load labels
        label_file_path, _ = self.find('list_attr_celeba.txt', BaseDirectory)
        with open(label_file_path, 'r') as label_file:
            labels_lines = label_file.readlines()[2:]  # skip count and header rows

        labels = []
        labels_dict = {}

        for line in labels_lines:

            #find image path
            imagePath = BaseDirectory + '/img_align_celeba/' + line.split(
                ' ', 1)[0]
            images.append(imagePath)

            #extract features
            line = line.strip().split(' ')[1:]

            for label in line:
                if label in ('-1', '-1\n'):
                    labels.append(0)
                elif label in ('1', '1\n'):
                    labels.append(1)

            labels_dict[imagePath] = labels
            labels = []

        return ({
            'train': images[:int(len(images) * 0.70)],
            'validation': images[int(len(images) * 0.71):int(len(images) * 0.90)],
            'test': images[int(len(images) * 0.91):]
        }, labels_dict)

    def Read_Splits(self, split_count):

        if (split_count >= len(SPLIT_LIST)):
            print("Split count outside of index range")
            return

        VALNAMES = []
        TRAINNAMES = []
        VALLABELS = []
        TRAINLABELS = []
        IMGNAMES = []
        LABELS = []

        labels_dict = {}

        for i in range(5):
            if SPLIT_LIST[i] != SPLIT_LIST[split_count]:
                with open(SPLIT_LIST[i], 'r') as f:
                    for line in f:
                        line = line.strip().split(',')
                        temp = line[0].replace('.jpg', '.JPG')
                        IMGNAMES.append(temp)
                        LABELS.append(line[5])
                        LABELS.append(line[6])
                        LABELS.append(line[7])

        LABELS = np.asarray(LABELS)
        LABELS = np.reshape(LABELS, (-1, 3))

        for i in range(len(LABELS)):
            TRAINNAMES.append(IMGNAMES[i])
            TRAINLABELS.append(LABELS[i])

            labels_dict[IMGNAMES[i]] = LABELS[i]

        # Read in validation data
        IMGNAMES = []
        LABELS = []

        with open(SPLIT_LIST[split_count], 'r') as f:
            for line in f:
                line = line.strip().split(',')
                temp = line[0].replace('.jpg', '.JPG')
                IMGNAMES.append(temp)
                LABELS.append(line[5])
                LABELS.append(line[6])
                LABELS.append(line[7])

        LABELS = np.asarray(LABELS)
        LABELS = np.reshape(LABELS, (-1, 3))

        for i in range(len(LABELS)):
            VALNAMES.append(IMGNAMES[i])
            VALLABELS.append(LABELS[i])

            labels_dict[IMGNAMES[i]] = LABELS[i]

        return ({'train': TRAINNAMES, 'test': VALNAMES}, labels_dict)

    def Fix_Paths(self):

        f = open('./split_labels/kdef_split_five.csv', 'r')
        fout = open('./split_labels/kdef_split_five_fullPath.csv', 'w')

        for readline in f:
            line = readline.strip().split(',')
            imageName = line[0].replace('.jpg', '.JPG')
            fullPath, _ = self.find(imageName, './KDEF')

            newstring = readline.replace(line[0], fullPath)
            fout.write(newstring)

        f.close()
        fout.close()

    def weights_exist(self, file_path):
        return os.path.exists(file_path)
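
A short usage sketch for the reader above, assuming the KDEF images and kdef_labels.csv are laid out under BaseDirectory as the class expects:

    reader = DataReader()
    splits, labels = reader.read_kdef()
    print(len(splits['train']), 'train,',
          len(splits['validation']), 'validation,',
          len(splits['test']), 'test images')
    # labels maps an image path to (trustworthiness, dominance, attractiveness)
    first = splits['train'][0]
    print(first, '->', labels[first])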