Example #1
class Runner(object):
    """
  Class to run the baseline NLP experiments with the specified data, models,
  text processing, and evaluation metrics.
  """
    def __init__(self,
                 dataPath,
                 resultsDir,
                 experimentName,
                 modelName,
                 loadPath=None,
                 numClasses=3,
                 plots=0,
                 orderedSplit=False,
                 trainSizes=None,
                 verbosity=0):
        """
    @param dataPath         (str)     Path to raw data file for the experiment.
    @param resultsDir       (str)     Directory for the results metrics.
    @param experimentName   (str)     Experiment name, used for saving results.
    @param loadPath         (str)     Path to serialized model for loading.
    @param modelName        (str)     Name of nupic.fluent Model subclass.
    @param numClasses       (int)     Number of classes (labels) per sample.
    @param plots            (int)     Specifies plotting of evaluation metrics.
    @param orderedSplit     (bool)    Indicates method for splitting train/test
                                      samples; False is random, True is ordered.
    @param trainSizes       (list)    Number of samples to use in training, per
                                      trial.
    @param verbosity        (int)     Greater value prints out more progress.
    """
        self.dataPath = dataPath
        self.resultsDir = resultsDir
        self.experimentName = experimentName
        self.loadPath = loadPath
        self.modelName = modelName
        self.numClasses = numClasses
        self.plots = plots
        self.orderedSplit = orderedSplit
        self.trainSizes = trainSizes if trainSizes else []
        self.verbosity = verbosity

        self.modelDir = os.path.join(self.resultsDir, self.experimentName,
                                     self.modelName)
        if not os.path.exists(self.modelDir):
            os.makedirs(self.modelDir)

        if self.plots:
            from fluent.utils.plotting import PlotNLP
            self.plotter = PlotNLP()

        self.dataDict = None
        self.labels = None
        self.labelRefs = None
        self.partitions = []
        self.samples = {}
        self.patterns = None
        self.results = []
        self.model = None

    def initModel(self, modelName):
        """Load or instantiate the classification model."""
        if self.loadPath:
            self.model = self.loadModel()
        else:
            self.model = self._createModel(modelName)

    def _createModel(self, modelName):
        """Return an instantiated model."""
        modelCls = _MODEL_MAPPING.get(modelName, None)

        if modelCls is None:
            raise ValueError(
                "Could not instantiate model \'{}\'.".format(modelName))

        # TODO: remove these if blocks and just use the else; either specify the Cio
        # FP type elsewhere, or split Word and Doc into separate classes.

        if modelName == "CioWordFingerprint":
            return modelCls(verbosity=self.verbosity,
                            numLabels=self.numClasses,
                            modelDir=self.modelDir,
                            fingerprintType=EncoderTypes.word)

        elif modelName == "CioDocumentFingerprint":
            return modelCls(verbosity=self.verbosity,
                            numLabels=self.numClasses,
                            modelDir=self.modelDir,
                            fingerprintType=EncoderTypes.document)

        else:
            return modelCls(verbosity=self.verbosity,
                            numLabels=self.numClasses,
                            modelDir=self.modelDir)

    def loadModel(self):
        """Load the serialized model."""
        try:
            with open(self.loadPath, "rb") as f:
                model = pkl.load(f)
            print "Model loaded from \'{}\'.".format(self.loadPath)
            return model
        except IOError as e:
            print "Could not load model from \'{}\'.".format(self.loadPath)
            raise e

    def resetModel(self, _):
        self.model.resetModel()

    def saveModel(self):
        self.model.saveModel()

    def _mapLabelRefs(self):
        """Replace the label strings in self.dataDict with corresponding ints."""
        self.labelRefs = [
            label for label in set(
                itertools.chain.from_iterable(
                    [x[1] for x in self.dataDict.values()]))
        ]

        for uniqueID, data in self.dataDict.iteritems():
            self.dataDict[uniqueID] = (data[0],
                                       numpy.array([
                                           self.labelRefs.index(label)
                                           for label in data[1]
                                       ]))

    def setupData(self, preprocess=False):
        """
    Get the data from CSV and preprocess if specified. The call to readCSV()
    assumes a specific CSV format, detailed in its docstring.

    @param preprocess   (bool)    Whether or not to preprocess the data when
                                  reading in samples.
    """
        self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses)

        if (not isinstance(self.trainSizes, list) or not all(
            [0 <= size <= len(self.dataDict) for size in self.trainSizes])):
            raise ValueError("Invalid size(s) for training set.")

        self._mapLabelRefs()

        self.samples = self.model.prepData(self.dataDict, preprocess)

        self.encodeSamples()

        if self.verbosity > 1:
            for i, s in self.samples.iteritems():
                print i, s

    def encodeSamples(self):
        self.patterns = self.model.encodeSamples(self.samples)

    def runExperiment(self):
        """Train and test the model for each trial specified by self.trainSizes."""
        if not self.partitions:
            # An experiment (e.g. k-folds) may do this elsewhere
            self.partitionIndices()

        for i, _ in enumerate(self.trainSizes):
            self.resetModel(i)

            if self.verbosity > 0:
                print "\tTraining for run {0} of {1}.".format(
                    i + 1, len(self.trainSizes))
            self._training(i)

            if self.verbosity > 0:
                print "\tTesting for this run."
            self._testing(i)

    def partitionIndices(self):
        """
    Builds self.partitions: one (train, test) tuple of indices per trial.

    TODO: use StandardSplit in data_split.py
    """
        length = len(self.samples)
        if self.orderedSplit:
            for split in self.trainSizes:
                trainIndices = range(split)
                testIndices = range(split, length)
                self.partitions.append((trainIndices, testIndices))
        else:
            # Randomly sampled, not repeated
            for split in self.trainSizes:
                trainIndices = random.sample(xrange(length), split)
                testIndices = [
                    i for i in xrange(length) if i not in trainIndices
                ]
                self.partitions.append((trainIndices, testIndices))

    def _training(self, trial):
        """
    Train the model one-by-one on each pattern specified in this trial's
    partition of indices. Models' training methods require the sample and label
    to be in a list.
    """
        if self.verbosity > 0:
            print("\tRunner selects to train on sample(s) {}".format(
                self.partitions[trial][0]))

        for i in self.partitions[trial][0]:
            self.model.trainModel(i)

    def _testing(self, trial):
        if self.verbosity > 0:
            print("\tRunner selects to test on sample(s) {}".format(
                self.partitions[trial][1]))

        results = ([], [])
        for i in self.partitions[trial][1]:
            predicted = self.model.testModel(i)
            results[0].append(predicted)
            results[1].append(self.patterns[i]["labels"])

        self.results.append(results)

    def writeOutClassifications(self):
        """Write the samples, actual, and predicted classes to a CSV."""
        headers = ("Tokenized sample", "Actual", "Predicted")
        for trial, _ in enumerate(self.trainSizes):
            resultsDict = defaultdict(list)
            for i, sampleNum in enumerate(self.partitions[trial][1]):
                # Loop through the indices in the test set of this trial.
                sample = self.samples.values()[sampleNum][0]
                pred = sorted(
                    [self.labelRefs[j] for j in self.results[trial][0][i]])
                actual = sorted(
                    [self.labelRefs[j] for j in self.results[trial][1][i]])
                resultsDict[sampleNum] = (sampleNum, sample, actual, pred)

            resultsPath = os.path.join(self.model.modelDir,
                                       "results_trial" + str(trial) + ".csv")
            writeFromDict(resultsDict, headers, resultsPath)

    def calculateResults(self):
        """
    Calculate evaluation metrics from the result classifications.

    TODO: pass intended CM results to plotter.plotConfusionMatrix()
    """
        resultCalcs = [
            self.model.evaluateResults(self.results[i], self.labelRefs,
                                       self.partitions[i][1])
            for i in xrange(len(self.partitions))
        ]

        self.printFinalReport(self.trainSizes, [r[0] for r in resultCalcs])

        if self.plots:
            trialAccuracies = self._calculateTrialAccuracies()
            classificationAccuracies = self._calculateClassificationAccuracies(
                trialAccuracies)

            self.plotter.plotCategoryAccuracies(trialAccuracies,
                                                self.trainSizes)
            self.plotter.plotCumulativeAccuracies(classificationAccuracies,
                                                  self.trainSizes)

            if self.plots > 1:
                # Plot extra evaluation figures -- confusion matrix.
                self.plotter.plotConfusionMatrix(
                    self.setupConfusionMatrices(resultCalcs))

        return resultCalcs

    def _calculateTrialAccuracies(self):
        """
    @return trialAccuracies     (defaultdict)   Items are defaultdicts, one for
        each size of the training set. Each inner defaultdict's keys are
        categories, with numpy array values that contain one accuracy value for
        each trial.
    """
        # To handle multiple trials of the same size:
        # trialSize -> (category -> list of accuracies)
        trialAccuracies = defaultdict(
            lambda: defaultdict(lambda: numpy.ndarray(0)))
        for result, size in itertools.izip(self.results, self.trainSizes):
            accuracies = self.model.calculateClassificationResults(result)
            for label, acc in accuracies:
                category = self.labelRefs[label]
                accList = trialAccuracies[size][category]
                trialAccuracies[size][category] = numpy.append(accList, acc)

        return trialAccuracies

    def _calculateClassificationAccuracies(self, trialAccuracies):
        """
    @param trialAccuracies            (defaultdict)   Please see the description
        in self._calculateTrialAccuracies().

    @return classificationAccuracies  (defaultdict)   Keys are classification
        categories, with multiple numpy arrays as values -- one for each size of
        training sets, with one accuracy value for each run of that training set
        size.
    """
        # Need the accuracies to be ordered for the plot
        trials = sorted(set(self.trainSizes))
        # category -> list of list of accuracies
        classificationAccuracies = defaultdict(list)
        for trial in trials:
            accuracies = trialAccuracies[trial]
            for label, acc in accuracies.iteritems():
                classificationAccuracies[label].append(acc)

        return classificationAccuracies

    def validateExperiment(self, expectationFilePath):
        """Returns accuracy of predicted labels against expected labels."""
        dataDict = readCSV(expectationFilePath, numLabels=self.numClasses)

        accuracies = numpy.zeros((len(self.results)))
        for i, trial in enumerate(self.results):
            for j, predictionList in enumerate(trial[0]):
                predictions = [self.labelRefs[p] for p in predictionList]
                if predictions == []:
                    predictions = ["(none)"]
                expected = dataDict.items()[j + self.trainSizes[i]][1]

                accuracies[i] += (
                    float(len(set(predictions) & set(expected[1]))) /
                    len(expected[1]))

            accuracies[i] = accuracies[i] / len(trial[0])

        return accuracies

    @staticmethod
    def printFinalReport(trainSizes, accuracies):
        """Prints result accuracies."""
        template = "{0:<20}|{1:<10}"
        print "Evaluation results for this experiment:"
        print template.format("Size of training set", "Accuracy")
        for size, acc in itertools.izip(trainSizes, accuracies):
            print template.format(size, acc)

    def evaluateCumulativeResults(self, intermResults):
        """
    Cumulative statistics for the outputs of evaluateTrialResults().

    @param intermResults      (list)          List of returned results from
                                              evaluateTrialResults().
    @return                   (dict)          Returns a dictionary with entries
                                              for max, mean, and min accuracies,
                                              and the summed confusion matrix.
    """
        accuracy = []
        cm = numpy.zeros((intermResults[0][1].shape))

        # Find mean, max, and min values for the metrics.
        for result in intermResults:
            accuracy.append(result[0])
            cm = numpy.add(cm, result[1])

        results = {
            "max_accuracy": max(accuracy),
            "mean_accuracy": sum(accuracy) / float(len(accuracy)),
            "min_accuracy": min(accuracy),
            "total_cm": cm
        }

        if self.verbosity > 0:
            self._printCumulativeReport(results)

        return results

    @staticmethod
    def _printCumulativeReport(results):
        """
    Prints cumulative results as returned by evaluateCumulativeResults().
    """
        print "---------- RESULTS ----------"
        print "max, mean, min accuracies = "
        print "{0:.3f}, {1:.3f}, {2:.3f}".format(results["max_accuracy"],
                                                 results["mean_accuracy"],
                                                 results["min_accuracy"])
        print "total confusion matrix =\n", results["total_cm"]
Example #2
class Runner(object):
    """
  Class to run the baseline NLP experiments with the specified data, models,
  text processing, and evaluation metrics.
  """

    def __init__(
        self,
        dataPath,
        resultsDir,
        experimentName,
        modelName,
        loadPath=None,
        numClasses=3,
        plots=0,
        orderedSplit=False,
        trainSizes=None,
        verbosity=0,
    ):
        """
    @param dataPath         (str)     Path to raw data file for the experiment.
    @param resultsDir       (str)     Directory for the results metrics.
    @param experimentName   (str)     Experiment name, used for saving results.
    @param loadPath         (str)     Path to serialized model for loading.
    @param modelName        (str)     Name of nupic.fluent Model subclass.
    @param numClasses       (int)     Number of classes (labels) per sample.
    @param plots            (int)     Specifies plotting of evaluation metrics.
    @param orderedSplit     (bool)    Indicates method for splitting train/test
                                      samples; False is random, True is ordered.
    @param trainSizes       (list)    Number of samples to use in training, per
                                      trial.
    @param verbosity        (int)     Greater value prints out more progress.
    """
        self.dataPath = dataPath
        self.resultsDir = resultsDir
        self.experimentName = experimentName
        self.loadPath = loadPath
        self.modelName = modelName
        self.numClasses = numClasses
        self.plots = plots
        self.orderedSplit = orderedSplit
        self.trainSizes = trainSizes if trainSizes else []
        self.verbosity = verbosity

        self.modelDir = os.path.join(self.resultsDir, self.experimentName, self.modelName)
        if not os.path.exists(self.modelDir):
            os.makedirs(self.modelDir)

        if self.plots:
            from fluent.utils.plotting import PlotNLP

            self.plotter = PlotNLP()

        self.dataDict = None
        self.labels = None
        self.labelRefs = None
        self.partitions = []
        self.samples = {}
        self.patterns = None
        self.results = []
        self.model = None

    def initModel(self, modelName):
        """Load or instantiate the classification model."""
        if self.loadPath:
            self.model = self.loadModel()
        else:
            self.model = self._createModel(modelName)

    def _createModel(self, modelName):
        """Return an instantiated model."""
        modelCls = _MODEL_MAPPING.get(modelName, None)

        if modelCls is None:
            raise ValueError("Could not instantiate model '{}'.".format(modelName))

        # TODO: remove these if blocks and just use the else; either specify the Cio
        # FP type elsewhere, or split Word and Doc into separate classes.

        if modelName == "CioWordFingerprint":
            return modelCls(
                verbosity=self.verbosity,
                numLabels=self.numClasses,
                modelDir=self.modelDir,
                fingerprintType=EncoderTypes.word,
            )

        elif modelName == "CioDocumentFingerprint":
            return modelCls(
                verbosity=self.verbosity,
                numLabels=self.numClasses,
                modelDir=self.modelDir,
                fingerprintType=EncoderTypes.document,
            )

        else:
            return modelCls(verbosity=self.verbosity, numLabels=self.numClasses, modelDir=self.modelDir)

    def loadModel(self):
        """Load the serialized model."""
        try:
            with open(self.loadPath, "rb") as f:
                model = pkl.load(f)
            print "Model loaded from '{}'.".format(self.loadPath)
            return model
        except IOError as e:
            print "Could not load model from '{}'.".format(self.loadPath)
            raise e

    def resetModel(self, _):
        self.model.resetModel()

    def saveModel(self):
        self.model.saveModel()

    def _mapLabelRefs(self):
        """Replace the label strings in self.dataDict with corresponding ints."""
        self.labelRefs = [label for label in set(itertools.chain.from_iterable([x[1] for x in self.dataDict.values()]))]

        for uniqueID, data in self.dataDict.iteritems():
            self.dataDict[uniqueID] = (data[0], numpy.array([self.labelRefs.index(label) for label in data[1]]))

    def setupData(self, preprocess=False):
        """
    Get the data from CSV and preprocess if specified. The call to readCSV()
    assumes a specific CSV format, detailed in its docstring.

    @param preprocess   (bool)    Whether or not to preprocess the data when
                                  reading in samples.
    """
        self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses)

        if not isinstance(self.trainSizes, list) or not all(
            [0 <= size <= len(self.dataDict) for size in self.trainSizes]
        ):
            raise ValueError("Invalid size(s) for training set.")

        self._mapLabelRefs()

        self.samples = self.model.prepData(self.dataDict, preprocess)

        self.encodeSamples()

        if self.verbosity > 1:
            for i, s in self.samples.iteritems():
                print i, s

    def encodeSamples(self):
        self.patterns = self.model.encodeSamples(self.samples)

    def runExperiment(self):
        """Train and test the model for each trial specified by self.trainSizes."""
        if not self.partitions:
            # An experiment (e.g. k-folds) may do this elsewhere
            self.partitionIndices()

        for i, _ in enumerate(self.trainSizes):
            self.resetModel(i)

            if self.verbosity > 0:
                print "\tTraining for run {0} of {1}.".format(i + 1, len(self.trainSizes))
            self._training(i)

            if self.verbosity > 0:
                print "\tTesting for this run."
            self._testing(i)

    def partitionIndices(self):
        """
    Builds self.partitions: one (train, test) tuple of indices per trial.

    TODO: use StandardSplit in data_split.py
    """
        length = len(self.samples)
        if self.orderedSplit:
            for split in self.trainSizes:
                trainIndices = range(split)
                testIndices = range(split, length)
                self.partitions.append((trainIndices, testIndices))
        else:
            # Randomly sampled, not repeated
            for split in self.trainSizes:
                trainIndices = random.sample(xrange(length), split)
                testIndices = [i for i in xrange(length) if i not in trainIndices]
                self.partitions.append((trainIndices, testIndices))

    def _training(self, trial):
        """
    Train the model one-by-one on each pattern specified in this trial's
    partition of indices. Models' training methods require the sample and label
    to be in a list.
    """
        if self.verbosity > 0:
            print ("\tRunner selects to train on sample(s) {}".format(self.partitions[trial][0]))

        for i in self.partitions[trial][0]:
            self.model.trainModel(i)

    def _testing(self, trial):
        if self.verbosity > 0:
            print ("\tRunner selects to test on sample(s) {}".format(self.partitions[trial][1]))

        results = ([], [])
        for i in self.partitions[trial][1]:
            predicted = self.model.testModel(i)
            results[0].append(predicted)
            results[1].append(self.patterns[i]["labels"])

        self.results.append(results)

    def writeOutClassifications(self):
        """Write the samples, actual, and predicted classes to a CSV."""
        headers = ("Tokenized sample", "Actual", "Predicted")
        for trial, _ in enumerate(self.trainSizes):
            resultsDict = defaultdict(list)
            for i, sampleNum in enumerate(self.partitions[trial][1]):
                # Loop through the indices in the test set of this trial.
                sample = self.samples.values()[sampleNum][0]
                pred = sorted([self.labelRefs[j] for j in self.results[trial][0][i]])
                actual = sorted([self.labelRefs[j] for j in self.results[trial][1][i]])
                resultsDict[sampleNum] = (sampleNum, sample, actual, pred)

            resultsPath = os.path.join(self.model.modelDir, "results_trial" + str(trial) + ".csv")
            writeFromDict(resultsDict, headers, resultsPath)

    def calculateResults(self):
        """
    Calculate evaluation metrics from the result classifications.

    TODO: pass intended CM results to plotter.plotConfusionMatrix()
    """
        resultCalcs = [
            self.model.evaluateResults(self.results[i], self.labelRefs, self.partitions[i][1])
            for i in xrange(len(self.partitions))
        ]

        self.printFinalReport(self.trainSizes, [r[0] for r in resultCalcs])

        if self.plots:
            trialAccuracies = self._calculateTrialAccuracies()
            classificationAccuracies = self._calculateClassificationAccuracies(trialAccuracies)

            self.plotter.plotCategoryAccuracies(trialAccuracies, self.trainSizes)
            self.plotter.plotCumulativeAccuracies(classificationAccuracies, self.trainSizes)

            if self.plots > 1:
                # Plot extra evaluation figures -- confusion matrix.
                self.plotter.plotConfusionMatrix(self.setupConfusionMatrices(resultCalcs))

        return resultCalcs

    def _calculateTrialAccuracies(self):
        """
    @return trialAccuracies     (defaultdict)   Items are defaultdicts, one for
        each size of the training set. Each inner defaultdict's keys are
        categories, with numpy array values that contain one accuracy value for
        each trial.
    """
        # To handle multiple trials of the same size:
        # trialSize -> (category -> list of accuracies)
        trialAccuracies = defaultdict(lambda: defaultdict(lambda: numpy.ndarray(0)))
        for result, size in itertools.izip(self.results, self.trainSizes):
            accuracies = self.model.calculateClassificationResults(result)
            for label, acc in accuracies:
                category = self.labelRefs[label]
                accList = trialAccuracies[size][category]
                trialAccuracies[size][category] = numpy.append(accList, acc)

        return trialAccuracies

    def _calculateClassificationAccuracies(self, trialAccuracies):
        """
    @param trialAccuracies            (defaultdict)   Please see the description
        in self._calculateTrialAccuracies().

    @return classificationAccuracies  (defaultdict)   Keys are classification
        categories, with multiple numpy arrays as values -- one for each size of
        training sets, with one accuracy value for each run of that training set
        size.
    """
        # Need the accuracies to be ordered for the plot
        trials = sorted(set(self.trainSizes))
        # category -> list of list of accuracies
        classificationAccuracies = defaultdict(list)
        for trial in trials:
            accuracies = trialAccuracies[trial]
            for label, acc in accuracies.iteritems():
                classificationAccuracies[label].append(acc)

        return classificationAccuracies

    def validateExperiment(self, expectationFilePath):
        """Returns accuracy of predicted labels against expected labels."""
        dataDict = readCSV(expectationFilePath, numLabels=self.numClasses)

        accuracies = numpy.zeros((len(self.results)))
        for i, trial in enumerate(self.results):
            for j, predictionList in enumerate(trial[0]):
                predictions = [self.labelRefs[p] for p in predictionList]
                if predictions == []:
                    predictions = ["(none)"]
                expected = dataDict.items()[j + self.trainSizes[i]][1]

                accuracies[i] += float(len(set(predictions) & set(expected[1]))) / len(expected[1])

            accuracies[i] = accuracies[i] / len(trial[0])

        return accuracies

    @staticmethod
    def printFinalReport(trainSizes, accuracies):
        """Prints result accuracies."""
        template = "{0:<20}|{1:<10}"
        print "Evaluation results for this experiment:"
        print template.format("Size of training set", "Accuracy")
        for size, acc in itertools.izip(trainSizes, accuracies):
            print template.format(size, acc)

    def evaluateCumulativeResults(self, intermResults):
        """
    Cumulative statistics for the outputs of evaluateTrialResults().

    @param intermResults      (list)          List of returned results from
                                              evaluateTrialResults().
    @return                   (dict)          Returns a dictionary with entries
                                              for max, mean, and min accuracies,
                                              and the summed confusion matrix.
    """
        accuracy = []
        cm = numpy.zeros((intermResults[0][1].shape))

        # Find mean, max, and min values for the metrics.
        for result in intermResults:
            accuracy.append(result[0])
            cm = numpy.add(cm, result[1])

        results = {
            "max_accuracy": max(accuracy),
            "mean_accuracy": sum(accuracy) / float(len(accuracy)),
            "min_accuracy": min(accuracy),
            "total_cm": cm,
        }

        if self.verbosity > 0:
            self._printCumulativeReport(results)

        return results

    @staticmethod
    def _printCumulativeReport(results):
        """
    Prints cumulative results as returned by evaluateCumulativeResults().
    """
        print "---------- RESULTS ----------"
        print "max, mean, min accuracies = "
        print "{0:.3f}, {1:.3f}, {2:.3f}".format(
            results["max_accuracy"], results["mean_accuracy"], results["min_accuracy"]
        )
        print "total confusion matrix =\n", results["total_cm"]
Example #3
class Runner(object):
  """
  Class to run the baseline NLP experiments with the specified data, models,
  text processing, and evaluation metrics.
  """

  def __init__(self,
               dataPath,
               resultsDir,
               experimentName,
               load,
               modelName,
               modelModuleName,
               numClasses,
               plots,
               orderedSplit,
               trainSize,
               verbosity):
    """
    @param dataPath         (str)     Path to raw data file for the experiment.
    @param resultsDir       (str)     Directory for the results metrics.
    @param experimentName   (str)     Experiment name, used for saving results.
    @param load             (bool)    True if a serialized model is to be
                                      loaded.
    @param modelName        (str)     Name of nupic.fluent Model subclass.
    @param modelModuleName  (str)     Model module -- location of the subclass.
    @param numClasses       (int)     Number of classes (labels) per sample.
    @param plots            (int)     Specifies plotting of evaluation metrics.
    @param orderedSplit     (bool)    Indicates method for splitting train/test
                                      samples; False is random, True is ordered.
    @param trainSize        (list)    Number of samples to use in training.
    @param verbosity        (int)     Greater value prints out more progress.

    """
    self.dataPath = dataPath
    self.resultsDir = resultsDir
    self.experimentName = experimentName
    self.load = load
    self.modelName = modelName
    self.modelModuleName = modelModuleName
    self.numClasses = numClasses
    self.plots = plots
    self.orderedSplit = orderedSplit
    self.trainSize = trainSize
    self.verbosity = verbosity

    self.modelDir = os.path.join(
        self.resultsDir, self.experimentName, self.modelName)
    if not os.path.exists(self.modelDir):
      os.makedirs(self.modelDir)

    if self.plots:
      self.plotter = PlotNLP()

    self.dataDict = None
    self.labels = None
    self.labelRefs = None
    self.partitions = []
    self.samples = None
    self.patterns = None
    self.results = []
    self.model = None


  def _calculateTrialAccuracies(self):
    """
    @return trialAccuracies     (defaultdict)   Items are defaultdicts, one for
        each size of the training set. Each inner defaultdict's keys are
        categories, with numpy array values that contain one accuracy value for
        each trial.
    """
    # To handle multiple trials of the same size:
    # trialSize -> (category -> list of accuracies)
    trialAccuracies = defaultdict(lambda: defaultdict(lambda: numpy.ndarray(0)))
    for i, size in enumerate(self.trainSize):
      accuracies = self.model.calculateClassificationResults(self.results[i])
      for label, acc in accuracies:
        category = self.labelRefs[label]
        accList = trialAccuracies[size][category]
        trialAccuracies[size][category] = numpy.append(accList, acc)

    return trialAccuracies


  def _calculateClassificationAccuracies(self, trialAccuracies):
    """
    @param trialAccuracies            (defaultdict)   Please see the description
        in self._calculateTrialAccuracies().

    @return classificationAccuracies  (defaultdict)   Keys are classification
        categories, with multiple numpy arrays as values -- one for each size of
        training sets, with one accuracy value for each run of that training set
        size.
    """
    # Need the accuracies to be ordered for the plot
    trials = sorted(set(self.trainSize))
    # category -> list of list of accuracies
    classificationAccuracies = defaultdict(list)
    for trial in trials:
      accuracies = trialAccuracies[trial]
      for label, acc in accuracies.iteritems():
        classificationAccuracies[label].append(acc)

    return classificationAccuracies


  def _mapLabelRefs(self):
    """Replace the label strings in self.dataDict with corresponding ints."""
    self.labelRefs = [label for label in set(
        itertools.chain.from_iterable([x[1] for x in self.dataDict.values()]))]

    for uniqueID, data in self.dataDict.iteritems():
      self.dataDict[uniqueID] = (data[0], numpy.array(
          [self.labelRefs.index(label) for label in data[1]]))


  def _preprocess(self, preprocess):
    """Tokenize the samples, with or without preprocessing."""
    texter = TextPreprocess()
    if preprocess:
      self.samples = [(texter.tokenize(data[0],
                                       ignoreCommon=100,
                                       removeStrings=["[identifier deleted]"],
                                       correctSpell=True),
                       data[1]) for _, data in self.dataDict.iteritems()]
    else:
      self.samples = [(texter.tokenize(data[0]), data[1])
                      for _, data in self.dataDict.iteritems()]


  def setupData(self, preprocess=False, sampleIdx=2):
    """
    Get the data from CSV and preprocess if specified.
    One index in labelIdx implies the model will train on a single
    classification per sample.
    @param preprocess   (bool)    Whether or not to preprocess the data when
                                  generating the files
    @param sampleIdx    (int)     Column number of the text samples in the csv
    """
    self.dataDict = readCSV(self.dataPath, sampleIdx, self.numClasses)

    if (not isinstance(self.trainSize, list) or not
        all([0 <= size <= len(self.dataDict) for size in self.trainSize])):
      raise ValueError("Invalid size(s) for training set.")

    self._mapLabelRefs()

    self._preprocess(preprocess)

    if self.verbosity > 1:
      for i, s in enumerate(self.samples):
        print i, s


  def initModel(self):
    """Load or instantiate the classification model."""
    if self.load:
      self.loadModel()
    else:
      try:
        module = __import__(self.modelModuleName, {}, {}, self.modelName)
        modelClass = getattr(module, self.modelName)
        self.model = modelClass(
            verbosity=self.verbosity, modelDir=self.modelDir)
      except ImportError:
        raise RuntimeError("Could not import model class \'{0}\'.".
                           format(self.modelName))


  def loadModel(self):
    """Load the serialized model."""
    try:
      with open(os.path.join(self.modelDir, "model.pkl"), "rb") as f:
        model = pkl.load(f)
      print "Model loaded from \'{}\'.".format(self.modelDir)
      return model
    except IOError as e:
      print "Could not load model from \'{}\'.".format(self.modelDir)
      raise e


  def resetModel(self, trial):
    self.model.resetModel()


  def encodeSamples(self):
    self.patterns = self.model.encodeSamples(self.samples)


  def runExperiment(self):
    """Train and test the model for each trial specified by self.trainSize."""
    for i, size in enumerate(self.trainSize):
      self.partitions.append(self.partitionIndices(size, i))

      self.resetModel(i)
      if self.verbosity > 0:
        print "\tTraining for run {0} of {1}.".format(
            i + 1, len(self.trainSize))
      self.training(i)
      if self.verbosity > 0:
        print "\tTesting for this run."
      self.testing(i)


  def training(self, trial):
    """
    Train the model one-by-one on each pattern specified in this trial's
    partition of indices. Models' training methods require the sample and label
    to be in a list.
    """
    if self.verbosity > 0:
      print ("\tRunner selects to train on sample(s) {}".format(
          self.partitions[trial][0]))

    for i in self.partitions[trial][0]:
      self.model.trainModel(i)


  def testing(self, trial):
    if self.verbosity > 0:
      print ("\tRunner selects to test on sample(s) {}".format(
          self.partitions[trial][1]))

    results = ([], [])
    for i in self.partitions[trial][1]:
      predicted = self.model.testModel(i)
      results[0].append(predicted)
      results[1].append(self.patterns[i]["labels"])

    self.results.append(results)


  def writeOutClassifications(self):
    """Write the samples, actual, and predicted classes to a CSV."""
    headers = ("Tokenized sample", "Actual", "Predicted")
    for trial, _ in enumerate(self.trainSize):
      resultsDict = defaultdict(list)
      for i, sampleNum in enumerate(self.partitions[trial][1]):
        # Loop through the indices in the test set of this trial.
        sample = self.samples[sampleNum][0]
        pred = sorted([self.labelRefs[j] for j in self.results[trial][0][i]])
        actual = sorted([self.labelRefs[j] for j in self.results[trial][1][i]])
        resultsDict[sampleNum] = (sampleNum, sample, actual, pred)

      resultsPath = os.path.join(self.model.modelDir,
                                 "results_trial" + str(trial) + ".csv")
      writeFromDict(resultsDict, headers, resultsPath)


  def calculateResults(self):
    """
    Calculate evaluation metrics from the result classifications.

    TODO: pass intended CM results to plotter.plotConfusionMatrix()
    """
    resultCalcs = [self.model.evaluateResults(self.results[i],
                                              self.labelRefs,
                                              self.partitions[i][1])
                   for i in xrange(len(self.trainSize))]

    self.model.printFinalReport(self.trainSize, [r[0] for r in resultCalcs])

    if self.plots:
      trialAccuracies = self._calculateTrialAccuracies()
      classificationAccuracies = self._calculateClassificationAccuracies(
          trialAccuracies)

      self.plotter.plotCategoryAccuracies(trialAccuracies, self.trainSize)
      self.plotter.plotCumulativeAccuracies(
          classificationAccuracies, self.trainSize)

      if self.plots > 1:
        # Plot extra evaluation figures -- confusion matrix.
        self.plotter.plotConfusionMatrix(
            self.setupConfusionMatrices(resultCalcs))


  def partitionIndices(self, split, trial):
    """
    Returns train and test indices.

    TODO: use StandardSplit in data_split.py
    """
    length = len(self.samples)
    if self.orderedSplit:
      trainIdx = range(split)
      testIdx = range(split, length)
    else:
      # Randomly sampled, not repeated
      trainIdx = random.sample(xrange(length), split)
      testIdx = [i for i in xrange(length) if i not in trainIdx]

    return (trainIdx, testIdx)


  def validateExperiment(self, expectationFilePath):
    """Returns accuracy of predicted labels against expected labels."""
    dataDict = readCSV(expectationFilePath, 2, self.numClasses)

    accuracies = numpy.zeros((len(self.results)))
    for i, trial in enumerate(self.results):
      for j, predictionList in enumerate(trial[0]):
        predictions = [self.labelRefs[p] for p in predictionList]
        if predictions == []:
          predictions = ["(none)"]
        expected = dataDict.items()[j+self.trainSize[i]][1]

        accuracies[i] += (float(len(set(predictions) & set(expected[1])))
                          / len(expected[1]))

      accuracies[i] = accuracies[i] / len(trial[0])

    return accuracies
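
A comparable sketch for this older signature, where every argument is required, the model class is imported by name, and encoding is a separate explicit step. The module path and class name below ("fluent.models.classify_endpoint", "ClassificationModelEndpoint") are hypothetical placeholders for whatever nupic.fluent model is actually installed, and "data.csv" is again illustrative.

runner = Runner(dataPath="data.csv",
                resultsDir="results",
                experimentName="baseline",
                load=False,
                modelName="ClassificationModelEndpoint",            # placeholder class
                modelModuleName="fluent.models.classify_endpoint",  # placeholder module
                numClasses=3,
                plots=0,
                orderedSplit=False,
                trainSize=[5, 10, 20],
                verbosity=1)

runner.initModel()                 # __import__ the module and instantiate the class
runner.setupData(preprocess=True)  # tokenize with spell correction and common-word filtering
runner.encodeSamples()             # must be called before runExperiment() in this variant
runner.runExperiment()
runner.calculateResults()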
Example #4
class Runner(object):
    """
  Class to run the baseline NLP experiments with the specified data, models,
  text processing, and evaluation metrics.
  """
    def __init__(self, dataPath, resultsDir, experimentName, load, modelName,
                 modelModuleName, numClasses, plots, orderedSplit, trainSize,
                 verbosity):
        """
    @param dataPath         (str)     Path to raw data file for the experiment.
    @param resultsDir       (str)     Directory for the results metrics.
    @param experimentName   (str)     Experiment name, used for saving results.
    @param load             (bool)    True if a serialized model is to be
                                      loaded.
    @param modelName        (str)     Name of nupic.fluent Model subclass.
    @param modelModuleName  (str)     Model module -- location of the subclass.
    @param numClasses       (int)     Number of classes (labels) per sample.
    @param plots            (int)     Specifies plotting of evaluation metrics.
    @param orderedSplit     (bool)    Indicates method for splitting train/test
                                      samples; False is random, True is ordered.
    @param trainSize        (list)    Number of samples to use in training.
    @param verbosity        (int)     Greater value prints out more progress.

    """
        self.dataPath = dataPath
        self.resultsDir = resultsDir
        self.experimentName = experimentName
        self.load = load
        self.modelName = modelName
        self.modelModuleName = modelModuleName
        self.numClasses = numClasses
        self.plots = plots
        self.orderedSplit = orderedSplit
        self.trainSize = trainSize
        self.verbosity = verbosity

        self.modelPath = os.path.join(self.resultsDir, self.experimentName,
                                      self.modelName)
        if not os.path.exists(self.modelPath):
            os.makedirs(self.modelPath)

        if self.plots:
            self.plotter = PlotNLP()

        self.dataDict = None
        self.labels = None
        self.labelRefs = None
        self.partitions = []
        self.samples = None
        self.patterns = None
        self.results = []
        self.model = None

    def _calculateTrialAccuracies(self):
        """
    @return trialAccuracies     (defaultdict)   Items are defaultdicts, one for
        each size of the training set. Each inner defaultdict's keys are classification
        categories, with numpy array values that contain one accuracy value for
        each trial.
    """
        # To handle multiple trials of the same size:
        # trialSize -> (category -> list of accuracies)
        trialAccuracies = defaultdict(
            lambda: defaultdict(lambda: numpy.ndarray(0)))
        for i, size in enumerate(self.trainSize):
            accuracies = self.model.calculateClassificationResults(
                self.results[i])
            for label, acc in accuracies:
                category = self.labelRefs[label]
                acc_list = trialAccuracies[size][category]
                trialAccuracies[size][category] = numpy.append(acc_list, acc)

        return trialAccuracies

    def _calculateClassificationAccuracies(self, trialAccuracies):
        """
    @param trialAccuracies            (defaultdict)   Please see the description
        in self._calculateTrialAccuracies().

    @return classificationAccuracies  (defaultdict)   Keys are classification
        categories, with multiple numpy arrays as values -- one for each size of
        training sets, with one accuracy value for each run of that training set
        size.
    """
        # Need the accuracies to be ordered for the plot
        trials = sorted(set(self.trainSize))
        # category -> list of list of accuracies
        classificationAccuracies = defaultdict(list)
        for trial in trials:
            accuracies = trialAccuracies[trial]
            for label, acc in accuracies.iteritems():
                classificationAccuracies[label].append(acc)

        return classificationAccuracies

    def _mapLabelRefs(self):
        """Replace the label strings in self.dataDict with corresponding ints."""
        self.labelRefs = list(
            set(
                itertools.chain.from_iterable(
                    map(lambda x: x[1], self.dataDict.values()))))

        for id, data in self.dataDict.iteritems():
            self.dataDict[id] = (data[0],
                                 numpy.array([
                                     self.labelRefs.index(label)
                                     for label in data[1]
                                 ]))

    def _preprocess(self, preprocess):
        """Tokenize the samples, with or without preprocessing."""
        texter = TextPreprocess()
        if preprocess:
            self.samples = [
                (texter.tokenize(data[0],
                                 ignoreCommon=100,
                                 removeStrings=["[identifier deleted]"],
                                 correctSpell=True), data[1])
                for id, data in self.dataDict.iteritems()
            ]
        else:
            self.samples = [(texter.tokenize(data[0]), data[1])
                            for id, data in self.dataDict.iteritems()]

    def setupData(self, preprocess=False, sampleIdx=2):
        """
    Get the data from CSV and preprocess if specified.
    One index in labelIdx implies the model will train on a single
    classification per sample.
    @param preprocess   (bool)    Whether or not to preprocess the data when
                                  generating the files
    @param sampleIdx    (int)     Column number of the text samples in the csv
    """
        self.dataDict = readCSV(self.dataPath, sampleIdx, self.numClasses)

        if (not isinstance(self.trainSize, list) or not all(
            [0 <= size <= len(self.dataDict) for size in self.trainSize])):
            raise ValueError("Invalid size(s) for training set.")

        self._mapLabelRefs()

        self._preprocess(preprocess)

        if self.verbosity > 1:
            for i, s in enumerate(self.samples):
                print i, s

    def initModel(self):
        """Load or instantiate the classification model."""
        if self.load:
            with open(os.path.join(self.modelPath, "model.pkl"), "rb") as f:
                self.model = pkl.load(f)
            print "Model loaded from \'{0}\'.".format(self.modelPath)
        else:
            try:
                module = __import__(self.modelModuleName, {}, {},
                                    self.modelName)
                modelClass = getattr(module, self.modelName)
                self.model = modelClass(verbosity=self.verbosity)
            except ImportError:
                raise RuntimeError(
                    "Could not import model class \'{0}\'.".format(
                        self.modelName))

    def resetModel(self, trial):
        """Resets or initializes the model"""
        if self.model is None:
            self.initModel()
        else:
            self.model.resetModel()

    def encodeSamples(self):
        """
    Encode the text samples into bitmap patterns, and log to txt file. The
    encoded patterns are stored in a dict along with their corresponding class
    labels.
    """
        self.patterns = [{
            "pattern": self.model.encodePattern(s[0]),
            "labels": s[1]
        } for s in self.samples]
        self.model.writeOutEncodings(self.patterns, self.modelPath)

    def runExperiment(self):
        """Train and test the model for each trial specified by self.trainSize."""
        for i, size in enumerate(self.trainSize):
            self.partitions.append(self.partitionIndices(size, i))

            self.resetModel(i)
            if self.verbosity > 0:
                print "\tTraining for run {0} of {1}.".format(
                    i + 1, len(self.trainSize))
            self.training(i)
            if self.verbosity > 0:
                print "\tTesting for this run."
            self.testing(i)

    def training(self, trial):
        """
    Train the model one-by-one on each pattern specified in this trial's
    partition of indices. Models' training methods require the sample and label
    to be in a list.
    """
        if self.verbosity > 0:
            print("\tRunner selects to train on sample(s) {}".format(
                self.partitions[trial][0]))

        for i in self.partitions[trial][0]:
            self.model.trainModel([self.patterns[i]["pattern"]],
                                  [self.patterns[i]["labels"]])

    def testing(self, trial):
        if self.verbosity > 0:
            print("\tRunner selects to test on sample(s) {}".format(
                self.partitions[trial][1]))

        results = ([], [])
        for i in self.partitions[trial][1]:
            predicted = self.model.testModel(self.patterns[i]["pattern"])
            results[0].append(predicted)
            results[1].append(self.patterns[i]["labels"])

        self.results.append(results)

    def writeOutClassifications(self):
        """Write the samples, actual, and predicted classes to a CSV."""
        headers = ("Tokenized sample", "Actual", "Predicted")
        for trial, size in enumerate(self.trainSize):
            resultsDict = defaultdict(list)
            for i, sampleNum in enumerate(self.partitions[trial][1]):
                # Loop through the indices in the test set of this trial.
                sample = self.samples[sampleNum][0]
                pred = sorted(
                    [self.labelRefs[j] for j in self.results[trial][0][i]])
                actual = sorted(
                    [self.labelRefs[j] for j in self.results[trial][1][i]])
                resultsDict[sampleNum] = (sampleNum, sample, actual, pred)

            resultsPath = os.path.join(self.modelPath,
                                       "results_trial" + str(trial) + ".csv")
            writeFromDict(resultsDict, headers, resultsPath)

    def calculateResults(self):
        """
    Calculate evaluation metrics from the result classifications.

    TODO: pass intended CM results to plotter.plotConfusionMatrix()
    """
        resultCalcs = [
            self.model.evaluateResults(self.results[i], self.labelRefs,
                                       self.partitions[i][1])
            for i in xrange(len(self.trainSize))
        ]

        self.model.printFinalReport(self.trainSize,
                                    [r[0] for r in resultCalcs])

        if self.plots:
            trialAccuracies = self._calculateTrialAccuracies()
            classificationAccuracies = self._calculateClassificationAccuracies(
                trialAccuracies)

            self.plotter.plotCategoryAccuracies(trialAccuracies,
                                                self.trainSize)
            self.plotter.plotCumulativeAccuracies(classificationAccuracies,
                                                  self.trainSize)

            if self.plots > 1:
                # Plot extra evaluation figures -- confusion matrix.
                self.plotter.plotConfusionMatrix(
                    self.setupConfusionMatrices(resultCalcs))

    def save(self):
        """Save the serialized model."""
        print "Saving model to \'{0}\' directory.".format(self.modelPath)
        with open(os.path.join(self.modelPath, "model.pkl"), "wb") as f:
            pkl.dump(self.model, f)

    def partitionIndices(self, split, trial):
        """
    Returns train and test indices.

    TODO: use StandardSplit in data_split.py
    """
        length = len(self.samples)
        if self.orderedSplit:
            trainIdx = range(split)
            testIdx = range(split, length)
        else:
            # Randomly sampled, not repeated
            trainIdx = random.sample(xrange(length), split)
            testIdx = [i for i in xrange(length) if i not in trainIdx]

        return (trainIdx, testIdx)

    def validateExperiment(self, expectationFilePath):
        """Returns accuracy of predicted labels against expected labels."""
        dataDict = readCSV(expectationFilePath, 2, self.numClasses)

        accuracies = numpy.zeros((len(self.results)))
        for i, trial in enumerate(self.results):
            for j, predictionList in enumerate(trial[0]):
                predictions = [self.labelRefs[p] for p in predictionList]
                if predictions == []:
                    predictions = ["(none)"]
                expected = dataDict.items()[j + self.trainSize[i]][1]

                accuracies[i] += (
                    float(len(set(predictions) & set(expected[1]))) /
                    len(expected[1]))

            accuracies[i] = accuracies[i] / len(trial[0])

        return accuracies
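
This variant adds explicit serialization: save() pickles the model into modelPath, and a later run with load=True restores it in initModel(). A short round-trip sketch, reusing the hypothetical module/class placeholders from the previous example.

# First run: train from scratch, then pickle the model.
runner = Runner(dataPath="data.csv", resultsDir="results",
                experimentName="baseline", load=False,
                modelName="ClassificationModelEndpoint",            # placeholder class
                modelModuleName="fluent.models.classify_endpoint",  # placeholder module
                numClasses=3, plots=0, orderedSplit=False,
                trainSize=[5, 10, 20], verbosity=1)
runner.setupData()
runner.initModel()        # imports and instantiates the model class
runner.encodeSamples()    # encode patterns and write them out under modelPath
runner.runExperiment()
runner.save()             # writes <modelPath>/model.pkl

# Later run: unpickle the saved model instead of importing the class.
reloaded = Runner(dataPath="data.csv", resultsDir="results",
                  experimentName="baseline", load=True,
                  modelName="ClassificationModelEndpoint",
                  modelModuleName="fluent.models.classify_endpoint",
                  numClasses=3, plots=0, orderedSplit=False,
                  trainSize=[5, 10, 20], verbosity=1)
reloaded.initModel()      # load=True, so modelPath/model.pkl is unpickled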