class Runner(object):
    """
    Class to run the baseline NLP experiments with the specified data, models,
    text processing, and evaluation metrics.

    The methods depend on each other's state: setupData() needs self.model
    (set by initModel()), runExperiment() needs the samples/patterns prepared
    by setupData(), and the evaluation methods need self.results filled in by
    runExperiment().
    """

    def __init__(self, dataPath, resultsDir, experimentName, modelName,
                 loadPath=None, numClasses=3, plots=0, orderedSplit=False,
                 trainSizes=None, verbosity=0):
        """
        @param dataPath         (str)     Path to raw data file for the
                                          experiment.
        @param resultsDir       (str)     Root directory for the results; the
                                          model directory is created under it.
        @param experimentName   (str)     Experiment name, used for saving
                                          results.
        @param modelName        (str)     Name of nupic.fluent Model subclass.
        @param loadPath         (str)     Path to serialized model for loading.
        @param numClasses       (int)     Number of classes (labels) per sample.
        @param plots            (int)     Specifies plotting of evaluation
                                          metrics; 0 disables plotting, values
                                          above 1 also plot confusion matrices.
        @param orderedSplit     (bool)    Indicates method for splitting
                                          train/test samples; False is random,
                                          True is ordered.
        @param trainSizes       (list)    Number of samples to use in training,
                                          per trial. Falsy values become [].
        @param verbosity        (int)     Greater value prints out more
                                          progress.
        """
        self.dataPath = dataPath
        self.resultsDir = resultsDir
        self.experimentName = experimentName
        self.loadPath = loadPath
        self.modelName = modelName
        self.numClasses = numClasses
        self.plots = plots
        self.orderedSplit = orderedSplit
        self.trainSizes = trainSizes if trainSizes else []
        self.verbosity = verbosity

        # Results for this experiment/model pair live in their own directory.
        self.modelDir = os.path.join(
            self.resultsDir, self.experimentName, self.modelName)
        if not os.path.exists(self.modelDir):
            os.makedirs(self.modelDir)

        if self.plots:
            # Imported here so the plotting module is only loaded on request.
            from fluent.utils.plotting import PlotNLP
            self.plotter = PlotNLP()

        self.dataDict = None
        self.labels = None
        self.labelRefs = None
        self.partitions = []
        self.samples = {}
        self.patterns = None
        self.results = []
        self.model = None

    def initModel(self, modelName):
        """Load (if self.loadPath is set) or instantiate the model."""
        if self.loadPath:
            self.model = self.loadModel()
        else:
            self.model = self._createModel(modelName)

    def _createModel(self, modelName):
        """
        Return an instantiated model.

        @param modelName    (str)   Key into _MODEL_MAPPING.
        @raise ValueError if modelName has no entry in _MODEL_MAPPING.
        """
        modelCls = _MODEL_MAPPING.get(modelName, None)
        if modelCls is None:
            raise ValueError(
                "Could not instantiate model '{}'.".format(modelName))

        modelArgs = {
            "verbosity": self.verbosity,
            "numLabels": self.numClasses,
            "modelDir": self.modelDir,
        }
        # TODO: remove these branches; either specify the Cio FP type
        # elsewhere, or split Word and Doc into separate classes.
        if modelName == "CioWordFingerprint":
            modelArgs["fingerprintType"] = EncoderTypes.word
        elif modelName == "CioDocumentFingerprint":
            modelArgs["fingerprintType"] = EncoderTypes.document
        return modelCls(**modelArgs)

    def loadModel(self):
        """
        Load the serialized model from self.loadPath and return it.

        @raise IOError if the file cannot be opened or read; a message is
               printed before the error is re-raised.
        """
        try:
            with open(self.loadPath, "rb") as f:
                # NOTE(review): pkl is presumably pickle/cPickle; unpickling
                # runs arbitrary code, so only load trusted model files.
                model = pkl.load(f)
            print("Model loaded from '{}'.".format(self.loadPath))
            return model
        except IOError:
            print("Could not load model from '{}'.".format(self.loadPath))
            # Bare raise keeps the original traceback.
            raise

    def resetModel(self, _):
        """Reset the model; the (trial) argument is ignored here."""
        self.model.resetModel()

    def saveModel(self):
        """Delegate serialization to the model."""
        self.model.saveModel()

    def _mapLabelRefs(self):
        """
        Replace the label strings in self.dataDict with corresponding ints.

        self.labelRefs becomes the list of unique labels; each sample's labels
        are replaced by a numpy array of indices into that list.
        """
        self.labelRefs = list(set(itertools.chain.from_iterable(
            data[1] for data in self.dataDict.values())))
        # One dict lookup per label instead of a linear list.index() scan.
        labelToIndex = dict(
            (label, idx) for idx, label in enumerate(self.labelRefs))

        # Iterate over a snapshot since the values are replaced in the loop.
        for uniqueID, data in list(self.dataDict.items()):
            self.dataDict[uniqueID] = (
                data[0],
                numpy.array([labelToIndex[label] for label in data[1]]))

    def setupData(self, preprocess=False):
        """
        Get the data from CSV and preprocess if specified. The call to
        readCSV() assumes a specific CSV format, detailed in its docstring.

        Requires self.model to be set (see initModel()).

        @param preprocess   (bool)    Whether or not to preprocess the data
                                      when reading in samples.
        @raise ValueError if self.trainSizes is not a list, or any size is
               outside [0, number of samples].
        """
        self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses)

        numSamples = len(self.dataDict)
        if (not isinstance(self.trainSizes, list) or
                not all(0 <= size <= numSamples for size in self.trainSizes)):
            raise ValueError("Invalid size(s) for training set.")

        self._mapLabelRefs()

        self.samples = self.model.prepData(self.dataDict, preprocess)

        self.encodeSamples()

        if self.verbosity > 1:
            for i, s in self.samples.items():
                print("%s %s" % (i, s))

    def encodeSamples(self):
        """Have the model encode self.samples; stores self.patterns."""
        self.patterns = self.model.encodeSamples(self.samples)

    def runExperiment(self):
        """
        Train and test the model for each trial specified by self.trainSizes.
        """
        if not self.partitions:
            # An experiment (e.g. k-folds) may do this elsewhere
            self.partitionIndices()

        numTrials = len(self.trainSizes)
        for i in range(numTrials):
            self.resetModel(i)

            if self.verbosity > 0:
                print("\tTraining for run {0} of {1}.".format(
                    i + 1, numTrials))
            self._training(i)

            if self.verbosity > 0:
                print("\tTesting for this run.")
            self._testing(i)

    def partitionIndices(self):
        """
        Append one (trainIndices, testIndices) two-tuple to self.partitions
        for each training size in self.trainSizes.

        Ordered split: the first `split` sample indices train, the rest test.
        Random split: `split` indices are sampled without replacement.

        TODO: use StandardSplit in data_split.py
        """
        length = len(self.samples)
        if self.orderedSplit:
            for split in self.trainSizes:
                trainIndices = list(range(split))
                testIndices = list(range(split, length))
                self.partitions.append((trainIndices, testIndices))
        else:
            # Randomly sampled, not repeated
            for split in self.trainSizes:
                trainIndices = random.sample(range(length), split)
                # Set membership keeps the complement computation linear.
                trainSet = set(trainIndices)
                testIndices = [i for i in range(length) if i not in trainSet]
                self.partitions.append((trainIndices, testIndices))

    def _training(self, trial):
        """
        Train the model one-by-one on each sample index in this trial's
        training partition.

        @param trial    (int)   Index into self.partitions.
        """
        trainIndices = self.partitions[trial][0]
        if self.verbosity > 0:
            print("\tRunner selects to train on sample(s) {}".format(
                trainIndices))

        for i in trainIndices:
            self.model.trainModel(i)

    def _testing(self, trial):
        """
        Test the model on each sample index in this trial's test partition and
        append a (predictions, actual labels) pair of lists to self.results.

        @param trial    (int)   Index into self.partitions.
        """
        testIndices = self.partitions[trial][1]
        if self.verbosity > 0:
            print("\tRunner selects to test on sample(s) {}".format(
                testIndices))

        results = ([], [])
        for i in testIndices:
            predicted = self.model.testModel(i)
            results[0].append(predicted)
            results[1].append(self.patterns[i]["labels"])

        self.results.append(results)

    def writeOutClassifications(self):
        """
        Write the samples, actual, and predicted classes to a CSV, one file
        per trial ("results_trial<N>.csv" in the model's modelDir).
        """
        # NOTE(review): three headers but four values per row (the sample
        # number comes first) -- confirm how writeFromDict() lines these up.
        headers = ("Tokenized sample", "Actual", "Predicted")
        # Built once; indexing the values per sample would rebuild the list.
        sampleList = list(self.samples.values())
        for trial, _ in enumerate(self.trainSizes):
            resultsDict = defaultdict(list)
            for i, sampleNum in enumerate(self.partitions[trial][1]):
                # Loop through the indices in the test set of this trial.
                sample = sampleList[sampleNum][0]
                pred = sorted(
                    self.labelRefs[j] for j in self.results[trial][0][i])
                actual = sorted(
                    self.labelRefs[j] for j in self.results[trial][1][i])
                resultsDict[sampleNum] = (sampleNum, sample, actual, pred)

            resultsPath = os.path.join(
                self.model.modelDir, "results_trial" + str(trial) + ".csv")
            writeFromDict(resultsDict, headers, resultsPath)

    def calculateResults(self):
        """
        Calculate evaluation metrics from the result classifications, print
        the final report, and plot if self.plots is set.

        @return (list)    One model.evaluateResults() value per partition.

        TODO: pass intended CM results to plotter.plotConfusionMatrix()
        """
        resultCalcs = [
            self.model.evaluateResults(
                self.results[i], self.labelRefs, self.partitions[i][1])
            for i in range(len(self.partitions))]

        # The first element of each evaluation is reported as the accuracy.
        self.printFinalReport(self.trainSizes, [r[0] for r in resultCalcs])

        if self.plots:
            trialAccuracies = self._calculateTrialAccuracies()
            classificationAccuracies = (
                self._calculateClassificationAccuracies(trialAccuracies))

            self.plotter.plotCategoryAccuracies(
                trialAccuracies, self.trainSizes)
            self.plotter.plotCumulativeAccuracies(
                classificationAccuracies, self.trainSizes)

            if self.plots > 1:
                # Plot extra evaluation figures -- confusion matrix.
                # NOTE(review): setupConfusionMatrices() is not defined in
                # this class; presumably a subclass provides it -- confirm.
                self.plotter.plotConfusionMatrix(
                    self.setupConfusionMatrices(resultCalcs))

        return resultCalcs

    def _calculateTrialAccuracies(self):
        """
        @return trialAccuracies     (defaultdict)   Items are defaultdicts, one
            for each size of the training set. Inner defaultdicts keys are
            categories, with numpy array values that contain one accuracy
            value for each trial.
        """
        # To handle multiple trials of the same size:
        # trialSize -> (category -> list of accuracies)
        trialAccuracies = defaultdict(
            lambda: defaultdict(lambda: numpy.array([])))
        for result, size in zip(self.results, self.trainSizes):
            accuracies = self.model.calculateClassificationResults(result)
            for label, acc in accuracies:
                category = self.labelRefs[label]
                accList = trialAccuracies[size][category]
                trialAccuracies[size][category] = numpy.append(accList, acc)

        return trialAccuracies

    def _calculateClassificationAccuracies(self, trialAccuracies):
        """
        @param trialAccuracies            (defaultdict)   Please see the
            description in self._calculateTrialAccuracies().

        @return classificationAccuracies  (defaultdict)   Keys are
            classification categories, with multiple numpy arrays as values --
            one for each size of training sets, with one accuracy value for
            each run of that training set size.
        """
        # Need the accuracies to be ordered for the plot
        trials = sorted(set(self.trainSizes))
        # category -> list of list of accuracies
        classificationAccuracies = defaultdict(list)
        for trial in trials:
            accuracies = trialAccuracies[trial]
            for label, acc in accuracies.items():
                classificationAccuracies[label].append(acc)

        return classificationAccuracies

    def validateExperiment(self, expectationFilePath):
        """
        Returns accuracy of predicted labels against expected labels.

        @param expectationFilePath  (str)   CSV of expected labels, read with
                                            readCSV().
        @return (numpy.ndarray)     One accuracy per trial: the mean fraction
                                    of expected labels that were predicted.
        """
        dataDict = readCSV(expectationFilePath, numLabels=self.numClasses)
        # Built once; indexing items() per prediction would rebuild the list.
        expectedItems = list(dataDict.items())

        accuracies = numpy.zeros((len(self.results)))
        for i, trial in enumerate(self.results):
            for j, predictionList in enumerate(trial[0]):
                predictions = [self.labelRefs[p] for p in predictionList]
                if not predictions:
                    predictions = ["(none)"]
                # NOTE(review): offsetting by the training size assumes the
                # test samples directly follow the training samples (i.e. an
                # ordered split) -- confirm for random splits.
                expected = expectedItems[j + self.trainSizes[i]][1]

                accuracies[i] += (
                    float(len(set(predictions) & set(expected[1]))) /
                    len(expected[1]))

            accuracies[i] = accuracies[i] / len(trial[0])

        return accuracies

    @staticmethod
    def printFinalReport(trainSizes, accuracies):
        """Prints result accuracies, one row per training size."""
        template = "{0:<20}|{1:<10}"
        print("Evaluation results for this experiment:")
        print(template.format("Size of training set", "Accuracy"))
        for size, acc in zip(trainSizes, accuracies):
            print(template.format(size, acc))

    def evaluateCumulativeResults(self, intermResults):
        """
        Cumulative statistics over several trial results.

        @param intermResults    (list)    Non-empty list of per-trial results;
            each item is indexed as [0] accuracy and [1] confusion matrix
            (numpy array). An empty list raises IndexError.

        @return                 (dict)    Entries for max, mean, and min
            accuracies, and the element-wise sum of the confusion matrices
            ("total_cm").
        """
        accuracy = []
        cm = numpy.zeros((intermResults[0][1].shape))

        # Find mean, max, and min values for the metrics.
        for result in intermResults:
            accuracy.append(result[0])
            cm = numpy.add(cm, result[1])

        results = {
            "max_accuracy": max(accuracy),
            "mean_accuracy": sum(accuracy) / float(len(accuracy)),
            "min_accuracy": min(accuracy),
            "total_cm": cm,
        }

        if self.verbosity > 0:
            self._printCumulativeReport(results)

        return results

    @staticmethod
    def _printCumulativeReport(results):
        """
        Prints results as returned by evaluateCumulativeResults() after
        several trials.
        """
        print("---------- RESULTS ----------")
        print("max, mean, min accuracies = ")
        print("{0:.3f}, {1:.3f}, {2:.3f}".format(
            results["max_accuracy"],
            results["mean_accuracy"],
            results["min_accuracy"]))
        print("total confusion matrix =\n%s" % (results["total_cm"],))
class Runner(object):
    """
    Class to run the baseline NLP experiments with the specified data, models,
    text processing, and evaluation metrics.

    The methods depend on each other's state: setupData() needs self.model
    (set by initModel()), runExperiment() needs the samples/patterns prepared
    by setupData(), and the evaluation methods need self.results filled in by
    runExperiment().
    """

    def __init__(
        self,
        dataPath,
        resultsDir,
        experimentName,
        modelName,
        loadPath=None,
        numClasses=3,
        plots=0,
        orderedSplit=False,
        trainSizes=None,
        verbosity=0,
    ):
        """
        @param dataPath         (str)     Path to raw data file for the
                                          experiment.
        @param resultsDir       (str)     Root directory for the results; the
                                          model directory is created under it.
        @param experimentName   (str)     Experiment name, used for saving
                                          results.
        @param modelName        (str)     Name of nupic.fluent Model subclass.
        @param loadPath         (str)     Path to serialized model for loading.
        @param numClasses       (int)     Number of classes (labels) per sample.
        @param plots            (int)     Specifies plotting of evaluation
                                          metrics; 0 disables plotting, values
                                          above 1 also plot confusion matrices.
        @param orderedSplit     (bool)    Indicates method for splitting
                                          train/test samples; False is random,
                                          True is ordered.
        @param trainSizes       (list)    Number of samples to use in training,
                                          per trial. Falsy values become [].
        @param verbosity        (int)     Greater value prints out more
                                          progress.
        """
        self.dataPath = dataPath
        self.resultsDir = resultsDir
        self.experimentName = experimentName
        self.loadPath = loadPath
        self.modelName = modelName
        self.numClasses = numClasses
        self.plots = plots
        self.orderedSplit = orderedSplit
        self.trainSizes = trainSizes if trainSizes else []
        self.verbosity = verbosity

        # Results for this experiment/model pair live in their own directory.
        self.modelDir = os.path.join(
            self.resultsDir, self.experimentName, self.modelName)
        if not os.path.exists(self.modelDir):
            os.makedirs(self.modelDir)

        if self.plots:
            # Imported here so the plotting module is only loaded on request.
            from fluent.utils.plotting import PlotNLP
            self.plotter = PlotNLP()

        self.dataDict = None
        self.labels = None
        self.labelRefs = None
        self.partitions = []
        self.samples = {}
        self.patterns = None
        self.results = []
        self.model = None

    def initModel(self, modelName):
        """Load (if self.loadPath is set) or instantiate the model."""
        if self.loadPath:
            self.model = self.loadModel()
        else:
            self.model = self._createModel(modelName)

    def _createModel(self, modelName):
        """
        Return an instantiated model.

        @param modelName    (str)   Key into _MODEL_MAPPING.
        @raise ValueError if modelName has no entry in _MODEL_MAPPING.
        """
        modelCls = _MODEL_MAPPING.get(modelName, None)
        if modelCls is None:
            raise ValueError(
                "Could not instantiate model '{}'.".format(modelName))

        modelArgs = {
            "verbosity": self.verbosity,
            "numLabels": self.numClasses,
            "modelDir": self.modelDir,
        }
        # TODO: remove these branches; either specify the Cio FP type
        # elsewhere, or split Word and Doc into separate classes.
        if modelName == "CioWordFingerprint":
            modelArgs["fingerprintType"] = EncoderTypes.word
        elif modelName == "CioDocumentFingerprint":
            modelArgs["fingerprintType"] = EncoderTypes.document
        return modelCls(**modelArgs)

    def loadModel(self):
        """
        Load the serialized model from self.loadPath and return it.

        @raise IOError if the file cannot be opened or read; a message is
               printed before the error is re-raised.
        """
        try:
            with open(self.loadPath, "rb") as f:
                # NOTE(review): pkl is presumably pickle/cPickle; unpickling
                # runs arbitrary code, so only load trusted model files.
                model = pkl.load(f)
            print("Model loaded from '{}'.".format(self.loadPath))
            return model
        except IOError:
            print("Could not load model from '{}'.".format(self.loadPath))
            # Bare raise keeps the original traceback.
            raise

    def resetModel(self, _):
        """Reset the model; the (trial) argument is ignored here."""
        self.model.resetModel()

    def saveModel(self):
        """Delegate serialization to the model."""
        self.model.saveModel()

    def _mapLabelRefs(self):
        """
        Replace the label strings in self.dataDict with corresponding ints.

        self.labelRefs becomes the list of unique labels; each sample's labels
        are replaced by a numpy array of indices into that list.
        """
        self.labelRefs = list(set(itertools.chain.from_iterable(
            data[1] for data in self.dataDict.values())))
        # One dict lookup per label instead of a linear list.index() scan.
        labelToIndex = dict(
            (label, idx) for idx, label in enumerate(self.labelRefs))

        # Iterate over a snapshot since the values are replaced in the loop.
        for uniqueID, data in list(self.dataDict.items()):
            self.dataDict[uniqueID] = (
                data[0],
                numpy.array([labelToIndex[label] for label in data[1]]))

    def setupData(self, preprocess=False):
        """
        Get the data from CSV and preprocess if specified. The call to
        readCSV() assumes a specific CSV format, detailed in its docstring.

        Requires self.model to be set (see initModel()).

        @param preprocess   (bool)    Whether or not to preprocess the data
                                      when reading in samples.
        @raise ValueError if self.trainSizes is not a list, or any size is
               outside [0, number of samples].
        """
        self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses)

        numSamples = len(self.dataDict)
        if (not isinstance(self.trainSizes, list) or
                not all(0 <= size <= numSamples for size in self.trainSizes)):
            raise ValueError("Invalid size(s) for training set.")

        self._mapLabelRefs()

        self.samples = self.model.prepData(self.dataDict, preprocess)

        self.encodeSamples()

        if self.verbosity > 1:
            for i, s in self.samples.items():
                print("%s %s" % (i, s))

    def encodeSamples(self):
        """Have the model encode self.samples; stores self.patterns."""
        self.patterns = self.model.encodeSamples(self.samples)

    def runExperiment(self):
        """
        Train and test the model for each trial specified by self.trainSizes.
        """
        if not self.partitions:
            # An experiment (e.g. k-folds) may do this elsewhere
            self.partitionIndices()

        numTrials = len(self.trainSizes)
        for i in range(numTrials):
            self.resetModel(i)

            if self.verbosity > 0:
                print("\tTraining for run {0} of {1}.".format(
                    i + 1, numTrials))
            self._training(i)

            if self.verbosity > 0:
                print("\tTesting for this run.")
            self._testing(i)

    def partitionIndices(self):
        """
        Append one (trainIndices, testIndices) two-tuple to self.partitions
        for each training size in self.trainSizes.

        Ordered split: the first `split` sample indices train, the rest test.
        Random split: `split` indices are sampled without replacement.

        TODO: use StandardSplit in data_split.py
        """
        length = len(self.samples)
        if self.orderedSplit:
            for split in self.trainSizes:
                trainIndices = list(range(split))
                testIndices = list(range(split, length))
                self.partitions.append((trainIndices, testIndices))
        else:
            # Randomly sampled, not repeated
            for split in self.trainSizes:
                trainIndices = random.sample(range(length), split)
                # Set membership keeps the complement computation linear.
                trainSet = set(trainIndices)
                testIndices = [i for i in range(length) if i not in trainSet]
                self.partitions.append((trainIndices, testIndices))

    def _training(self, trial):
        """
        Train the model one-by-one on each sample index in this trial's
        training partition.

        @param trial    (int)   Index into self.partitions.
        """
        trainIndices = self.partitions[trial][0]
        if self.verbosity > 0:
            print("\tRunner selects to train on sample(s) {}".format(
                trainIndices))

        for i in trainIndices:
            self.model.trainModel(i)

    def _testing(self, trial):
        """
        Test the model on each sample index in this trial's test partition and
        append a (predictions, actual labels) pair of lists to self.results.

        @param trial    (int)   Index into self.partitions.
        """
        testIndices = self.partitions[trial][1]
        if self.verbosity > 0:
            print("\tRunner selects to test on sample(s) {}".format(
                testIndices))

        results = ([], [])
        for i in testIndices:
            predicted = self.model.testModel(i)
            results[0].append(predicted)
            results[1].append(self.patterns[i]["labels"])

        self.results.append(results)

    def writeOutClassifications(self):
        """
        Write the samples, actual, and predicted classes to a CSV, one file
        per trial ("results_trial<N>.csv" in the model's modelDir).
        """
        # NOTE(review): three headers but four values per row (the sample
        # number comes first) -- confirm how writeFromDict() lines these up.
        headers = ("Tokenized sample", "Actual", "Predicted")
        # Built once; indexing the values per sample would rebuild the list.
        sampleList = list(self.samples.values())
        for trial, _ in enumerate(self.trainSizes):
            resultsDict = defaultdict(list)
            for i, sampleNum in enumerate(self.partitions[trial][1]):
                # Loop through the indices in the test set of this trial.
                sample = sampleList[sampleNum][0]
                pred = sorted(
                    self.labelRefs[j] for j in self.results[trial][0][i])
                actual = sorted(
                    self.labelRefs[j] for j in self.results[trial][1][i])
                resultsDict[sampleNum] = (sampleNum, sample, actual, pred)

            resultsPath = os.path.join(
                self.model.modelDir, "results_trial" + str(trial) + ".csv")
            writeFromDict(resultsDict, headers, resultsPath)

    def calculateResults(self):
        """
        Calculate evaluation metrics from the result classifications, print
        the final report, and plot if self.plots is set.

        @return (list)    One model.evaluateResults() value per partition.

        TODO: pass intended CM results to plotter.plotConfusionMatrix()
        """
        resultCalcs = [
            self.model.evaluateResults(
                self.results[i], self.labelRefs, self.partitions[i][1])
            for i in range(len(self.partitions))]

        # The first element of each evaluation is reported as the accuracy.
        self.printFinalReport(self.trainSizes, [r[0] for r in resultCalcs])

        if self.plots:
            trialAccuracies = self._calculateTrialAccuracies()
            classificationAccuracies = (
                self._calculateClassificationAccuracies(trialAccuracies))

            self.plotter.plotCategoryAccuracies(
                trialAccuracies, self.trainSizes)
            self.plotter.plotCumulativeAccuracies(
                classificationAccuracies, self.trainSizes)

            if self.plots > 1:
                # Plot extra evaluation figures -- confusion matrix.
                # NOTE(review): setupConfusionMatrices() is not defined in
                # this class; presumably a subclass provides it -- confirm.
                self.plotter.plotConfusionMatrix(
                    self.setupConfusionMatrices(resultCalcs))

        return resultCalcs

    def _calculateTrialAccuracies(self):
        """
        @return trialAccuracies     (defaultdict)   Items are defaultdicts, one
            for each size of the training set. Inner defaultdicts keys are
            categories, with numpy array values that contain one accuracy
            value for each trial.
        """
        # To handle multiple trials of the same size:
        # trialSize -> (category -> list of accuracies)
        trialAccuracies = defaultdict(
            lambda: defaultdict(lambda: numpy.array([])))
        for result, size in zip(self.results, self.trainSizes):
            accuracies = self.model.calculateClassificationResults(result)
            for label, acc in accuracies:
                category = self.labelRefs[label]
                accList = trialAccuracies[size][category]
                trialAccuracies[size][category] = numpy.append(accList, acc)

        return trialAccuracies

    def _calculateClassificationAccuracies(self, trialAccuracies):
        """
        @param trialAccuracies            (defaultdict)   Please see the
            description in self._calculateTrialAccuracies().

        @return classificationAccuracies  (defaultdict)   Keys are
            classification categories, with multiple numpy arrays as values --
            one for each size of training sets, with one accuracy value for
            each run of that training set size.
        """
        # Need the accuracies to be ordered for the plot
        trials = sorted(set(self.trainSizes))
        # category -> list of list of accuracies
        classificationAccuracies = defaultdict(list)
        for trial in trials:
            accuracies = trialAccuracies[trial]
            for label, acc in accuracies.items():
                classificationAccuracies[label].append(acc)

        return classificationAccuracies

    def validateExperiment(self, expectationFilePath):
        """
        Returns accuracy of predicted labels against expected labels.

        @param expectationFilePath  (str)   CSV of expected labels, read with
                                            readCSV().
        @return (numpy.ndarray)     One accuracy per trial: the mean fraction
                                    of expected labels that were predicted.
        """
        dataDict = readCSV(expectationFilePath, numLabels=self.numClasses)
        # Built once; indexing items() per prediction would rebuild the list.
        expectedItems = list(dataDict.items())

        accuracies = numpy.zeros((len(self.results)))
        for i, trial in enumerate(self.results):
            for j, predictionList in enumerate(trial[0]):
                predictions = [self.labelRefs[p] for p in predictionList]
                if not predictions:
                    predictions = ["(none)"]
                # NOTE(review): offsetting by the training size assumes the
                # test samples directly follow the training samples (i.e. an
                # ordered split) -- confirm for random splits.
                expected = expectedItems[j + self.trainSizes[i]][1]

                accuracies[i] += (
                    float(len(set(predictions) & set(expected[1]))) /
                    len(expected[1]))

            accuracies[i] = accuracies[i] / len(trial[0])

        return accuracies

    @staticmethod
    def printFinalReport(trainSizes, accuracies):
        """Prints result accuracies, one row per training size."""
        template = "{0:<20}|{1:<10}"
        print("Evaluation results for this experiment:")
        print(template.format("Size of training set", "Accuracy"))
        for size, acc in zip(trainSizes, accuracies):
            print(template.format(size, acc))

    def evaluateCumulativeResults(self, intermResults):
        """
        Cumulative statistics over several trial results.

        @param intermResults    (list)    Non-empty list of per-trial results;
            each item is indexed as [0] accuracy and [1] confusion matrix
            (numpy array). An empty list raises IndexError.

        @return                 (dict)    Entries for max, mean, and min
            accuracies, and the element-wise sum of the confusion matrices
            ("total_cm").
        """
        accuracy = []
        cm = numpy.zeros((intermResults[0][1].shape))

        # Find mean, max, and min values for the metrics.
        for result in intermResults:
            accuracy.append(result[0])
            cm = numpy.add(cm, result[1])

        results = {
            "max_accuracy": max(accuracy),
            "mean_accuracy": sum(accuracy) / float(len(accuracy)),
            "min_accuracy": min(accuracy),
            "total_cm": cm,
        }

        if self.verbosity > 0:
            self._printCumulativeReport(results)

        return results

    @staticmethod
    def _printCumulativeReport(results):
        """
        Prints results as returned by evaluateCumulativeResults() after
        several trials.
        """
        print("---------- RESULTS ----------")
        print("max, mean, min accuracies = ")
        print("{0:.3f}, {1:.3f}, {2:.3f}".format(
            results["max_accuracy"],
            results["mean_accuracy"],
            results["min_accuracy"]))
        print("total confusion matrix =\n%s" % (results["total_cm"],))
class Runner(object):
    """
    Class to run the baseline NLP experiments with the specified data, models,
    text processing, and evaluation metrics.

    The methods depend on each other's state: encodeSamples() needs self.model
    (set by initModel()) and self.samples (set by setupData());
    runExperiment() needs self.patterns; the evaluation methods need
    self.results filled in by runExperiment().
    """

    def __init__(self, dataPath, resultsDir, experimentName, load, modelName,
                 modelModuleName, numClasses, plots, orderedSplit, trainSize,
                 verbosity):
        """
        @param dataPath         (str)     Path to raw data file for the
                                          experiment.
        @param resultsDir       (str)     Root directory for the results; the
                                          model directory is created under it.
        @param experimentName   (str)     Experiment name, used for saving
                                          results.
        @param load             (bool)    True if a serialized model is to be
                                          loaded.
        @param modelName        (str)     Name of nupic.fluent Model subclass.
        @param modelModuleName  (str)     Model module -- location of the
                                          subclass.
        @param numClasses       (int)     Number of classes (labels) per sample.
        @param plots            (int)     Specifies plotting of evaluation
                                          metrics; 0 disables plotting, values
                                          above 1 also plot confusion matrices.
        @param orderedSplit     (bool)    Indicates method for splitting
                                          train/test samples; False is random,
                                          True is ordered.
        @param trainSize        (list)    Number of samples to use in training,
                                          one entry per trial (setupData()
                                          rejects anything but a list).
        @param verbosity        (int)     Greater value prints out more
                                          progress.
        """
        self.dataPath = dataPath
        self.resultsDir = resultsDir
        self.experimentName = experimentName
        self.load = load
        self.modelName = modelName
        self.modelModuleName = modelModuleName
        self.numClasses = numClasses
        self.plots = plots
        self.orderedSplit = orderedSplit
        self.trainSize = trainSize
        self.verbosity = verbosity

        # Results for this experiment/model pair live in their own directory.
        self.modelDir = os.path.join(
            self.resultsDir, self.experimentName, self.modelName)
        if not os.path.exists(self.modelDir):
            os.makedirs(self.modelDir)

        if self.plots:
            self.plotter = PlotNLP()

        self.dataDict = None
        self.labels = None
        self.labelRefs = None
        self.partitions = []
        self.samples = None
        self.patterns = None
        self.results = []
        self.model = None

    def _calculateTrialAccuracies(self):
        """
        @return trialAccuracies     (defaultdict)   Items are defaultdicts, one
            for each size of the training set. Inner defaultdicts keys are
            categories, with numpy array values that contain one accuracy
            value for each trial.
        """
        # To handle multiple trials of the same size:
        # trialSize -> (category -> list of accuracies)
        trialAccuracies = defaultdict(
            lambda: defaultdict(lambda: numpy.array([])))
        for i, size in enumerate(self.trainSize):
            accuracies = self.model.calculateClassificationResults(
                self.results[i])
            for label, acc in accuracies:
                category = self.labelRefs[label]
                accList = trialAccuracies[size][category]
                trialAccuracies[size][category] = numpy.append(accList, acc)

        return trialAccuracies

    def _calculateClassificationAccuracies(self, trialAccuracies):
        """
        @param trialAccuracies            (defaultdict)   Please see the
            description in self._calculateTrialAccuracies().

        @return classificationAccuracies  (defaultdict)   Keys are
            classification categories, with multiple numpy arrays as values --
            one for each size of training sets, with one accuracy value for
            each run of that training set size.
        """
        # Need the accuracies to be ordered for the plot
        trials = sorted(set(self.trainSize))
        # category -> list of list of accuracies
        classificationAccuracies = defaultdict(list)
        for trial in trials:
            accuracies = trialAccuracies[trial]
            for label, acc in accuracies.items():
                classificationAccuracies[label].append(acc)

        return classificationAccuracies

    def _mapLabelRefs(self):
        """
        Replace the label strings in self.dataDict with corresponding ints.

        self.labelRefs becomes the list of unique labels; each sample's labels
        are replaced by a numpy array of indices into that list.
        """
        self.labelRefs = list(set(itertools.chain.from_iterable(
            data[1] for data in self.dataDict.values())))
        # One dict lookup per label instead of a linear list.index() scan.
        labelToIndex = dict(
            (label, idx) for idx, label in enumerate(self.labelRefs))

        # Iterate over a snapshot since the values are replaced in the loop.
        for uniqueID, data in list(self.dataDict.items()):
            self.dataDict[uniqueID] = (
                data[0],
                numpy.array([labelToIndex[label] for label in data[1]]))

    def _preprocess(self, preprocess):
        """
        Tokenize the samples, with or without preprocessing.

        Sets self.samples to a list of (tokens, labels) tuples in the
        iteration order of self.dataDict.

        @param preprocess   (bool)    If True, tokenize with the extra
                                      cleaning options below.
        """
        texter = TextPreprocess()
        if preprocess:
            self.samples = [
                (texter.tokenize(data[0],
                                 ignoreCommon=100,
                                 removeStrings=["[identifier deleted]"],
                                 correctSpell=True),
                 data[1])
                for data in self.dataDict.values()]
        else:
            self.samples = [(texter.tokenize(data[0]), data[1])
                            for data in self.dataDict.values()]

    def setupData(self, preprocess=False, sampleIdx=2):
        """
        Get the data from CSV and preprocess if specified.

        @param preprocess   (bool)    Whether or not to preprocess the data
                                      when generating the files
        @param sampleIdx    (int)     Column number of the text samples in the
                                      csv
        @raise ValueError if self.trainSize is not a list, or any size is
               outside [0, number of samples].
        """
        self.dataDict = readCSV(self.dataPath, sampleIdx, self.numClasses)

        numSamples = len(self.dataDict)
        if (not isinstance(self.trainSize, list) or
                not all(0 <= size <= numSamples for size in self.trainSize)):
            raise ValueError("Invalid size(s) for training set.")

        self._mapLabelRefs()

        self._preprocess(preprocess)

        if self.verbosity > 1:
            for i, s in enumerate(self.samples):
                print("%s %s" % (i, s))

    def initModel(self):
        """
        Load or instantiate the classification model into self.model.

        @raise RuntimeError if importing the model class raises ImportError.
        """
        if self.load:
            # loadModel() only returns the model; it must be stored here or
            # self.model would stay None.
            self.model = self.loadModel()
        else:
            try:
                # A non-empty fromlist makes __import__ return the named
                # (sub)module itself rather than the top-level package.
                module = __import__(
                    self.modelModuleName, {}, {}, [self.modelName])
                modelClass = getattr(module, self.modelName)
                self.model = modelClass(
                    verbosity=self.verbosity, modelDir=self.modelDir)
            except ImportError:
                raise RuntimeError(
                    "Could not import model class '{0}'.".format(
                        self.modelName))

    def loadModel(self):
        """
        Load the serialized model from "model.pkl" in self.modelDir and return
        it.

        @raise IOError if the file cannot be opened or read; a message is
               printed before the error is re-raised.
        """
        try:
            with open(os.path.join(self.modelDir, "model.pkl"), "rb") as f:
                # NOTE(review): pkl is presumably pickle/cPickle; unpickling
                # runs arbitrary code, so only load trusted model files.
                model = pkl.load(f)
            print("Model loaded from '{}'.".format(self.modelDir))
            return model
        except IOError:
            print("Could not load model from '{}'.".format(self.modelDir))
            # Bare raise keeps the original traceback.
            raise

    def resetModel(self, trial):
        """Reset the model; the trial argument is ignored here."""
        self.model.resetModel()

    def encodeSamples(self):
        """Have the model encode self.samples; stores self.patterns."""
        self.patterns = self.model.encodeSamples(self.samples)

    def runExperiment(self):
        """
        Train and test the model for each trial specified by self.trainSize.
        """
        numTrials = len(self.trainSize)
        for i, size in enumerate(self.trainSize):
            self.partitions.append(self.partitionIndices(size, i))

            self.resetModel(i)

            if self.verbosity > 0:
                print("\tTraining for run {0} of {1}.".format(
                    i + 1, numTrials))
            self.training(i)

            if self.verbosity > 0:
                print("\tTesting for this run.")
            self.testing(i)

    def training(self, trial):
        """
        Train the model one-by-one on each sample index in this trial's
        training partition.

        @param trial    (int)   Index into self.partitions.
        """
        trainIndices = self.partitions[trial][0]
        if self.verbosity > 0:
            print("\tRunner selects to train on sample(s) {}".format(
                trainIndices))

        for i in trainIndices:
            self.model.trainModel(i)

    def testing(self, trial):
        """
        Test the model on each sample index in this trial's test partition and
        append a (predictions, actual labels) pair of lists to self.results.

        @param trial    (int)   Index into self.partitions.
        """
        testIndices = self.partitions[trial][1]
        if self.verbosity > 0:
            print("\tRunner selects to test on sample(s) {}".format(
                testIndices))

        results = ([], [])
        for i in testIndices:
            predicted = self.model.testModel(i)
            results[0].append(predicted)
            results[1].append(self.patterns[i]["labels"])

        self.results.append(results)

    def writeOutClassifications(self):
        """
        Write the samples, actual, and predicted classes to a CSV, one file
        per trial ("results_trial<N>.csv" in the model's modelDir).
        """
        # NOTE(review): three headers but four values per row (the sample
        # number comes first) -- confirm how writeFromDict() lines these up.
        headers = ("Tokenized sample", "Actual", "Predicted")
        for trial, _ in enumerate(self.trainSize):
            resultsDict = defaultdict(list)
            for i, sampleNum in enumerate(self.partitions[trial][1]):
                # Loop through the indices in the test set of this trial.
                sample = self.samples[sampleNum][0]
                pred = sorted(
                    self.labelRefs[j] for j in self.results[trial][0][i])
                actual = sorted(
                    self.labelRefs[j] for j in self.results[trial][1][i])
                resultsDict[sampleNum] = (sampleNum, sample, actual, pred)

            resultsPath = os.path.join(
                self.model.modelDir, "results_trial" + str(trial) + ".csv")
            writeFromDict(resultsDict, headers, resultsPath)

    def calculateResults(self):
        """
        Calculate evaluation metrics from the result classifications, have the
        model print the final report, and plot if self.plots is set.

        TODO: pass intended CM results to plotter.plotConfusionMatrix()
        """
        resultCalcs = [
            self.model.evaluateResults(
                self.results[i], self.labelRefs, self.partitions[i][1])
            for i in range(len(self.trainSize))]

        # The first element of each evaluation is reported as the accuracy.
        self.model.printFinalReport(
            self.trainSize, [r[0] for r in resultCalcs])

        if self.plots:
            trialAccuracies = self._calculateTrialAccuracies()
            classificationAccuracies = (
                self._calculateClassificationAccuracies(trialAccuracies))

            self.plotter.plotCategoryAccuracies(
                trialAccuracies, self.trainSize)
            self.plotter.plotCumulativeAccuracies(
                classificationAccuracies, self.trainSize)

            if self.plots > 1:
                # Plot extra evaluation figures -- confusion matrix.
                # NOTE(review): setupConfusionMatrices() is not defined in
                # this class; presumably a subclass provides it -- confirm.
                self.plotter.plotConfusionMatrix(
                    self.setupConfusionMatrices(resultCalcs))

    def partitionIndices(self, split, trial):
        """
        Returns train and test indices as a (trainIdx, testIdx) tuple of lists.

        Ordered split: the first `split` sample indices train, the rest test.
        Random split: `split` indices are sampled without replacement.

        @param split    (int)   Number of training samples.
        @param trial    (int)   Trial number; unused here.

        TODO: use StandardSplit in data_split.py
        """
        length = len(self.samples)
        if self.orderedSplit:
            trainIdx = list(range(split))
            testIdx = list(range(split, length))
        else:
            # Randomly sampled, not repeated
            trainIdx = random.sample(range(length), split)
            # Set membership keeps the complement computation linear.
            trainSet = set(trainIdx)
            testIdx = [i for i in range(length) if i not in trainSet]

        return (trainIdx, testIdx)

    def validateExperiment(self, expectationFilePath):
        """
        Returns accuracy of predicted labels against expected labels.

        @param expectationFilePath  (str)   CSV of expected labels, read with
                                            readCSV().
        @return (numpy.ndarray)     One accuracy per trial: the mean fraction
                                    of expected labels that were predicted.
        """
        dataDict = readCSV(expectationFilePath, 2, self.numClasses)
        # Built once; indexing items() per prediction would rebuild the list.
        expectedItems = list(dataDict.items())

        accuracies = numpy.zeros((len(self.results)))
        for i, trial in enumerate(self.results):
            for j, predictionList in enumerate(trial[0]):
                predictions = [self.labelRefs[p] for p in predictionList]
                if not predictions:
                    predictions = ["(none)"]
                # NOTE(review): offsetting by the training size assumes the
                # test samples directly follow the training samples (i.e. an
                # ordered split) -- confirm for random splits.
                expected = expectedItems[j + self.trainSize[i]][1]

                accuracies[i] += (
                    float(len(set(predictions) & set(expected[1]))) /
                    len(expected[1]))

            accuracies[i] = accuracies[i] / len(trial[0])

        return accuracies
class Runner(object):
    """
    Class to run the baseline NLP experiments with the specified data, models,
    text processing, and evaluation metrics.
    """

    def __init__(self, dataPath, resultsDir, experimentName, load, modelName,
                 modelModuleName, numClasses, plots, orderedSplit, trainSize,
                 verbosity):
        """
        Store the experiment configuration and make sure the output directory
        <resultsDir>/<experimentName>/<modelName> exists.

        @param dataPath         (str)   Path to raw data file for the
                                        experiment.
        @param resultsDir       (str)   Directory where the results metrics
                                        are written.
        @param experimentName   (str)   Experiment name, used for saving
                                        results.
        @param load             (bool)  True if a serialized model is to be
                                        loaded.
        @param modelName        (str)   Name of nupic.fluent Model subclass.
        @param modelModuleName  (str)   Model module -- location of the
                                        subclass.
        @param numClasses       (int)   Number of classes (labels) per sample.
        @param plots            (int)   Specifies plotting of evaluation
                                        metrics.
        @param orderedSplit     (bool)  Indicates method for splitting
                                        train/test samples; False is random,
                                        True is ordered.
        @param trainSize        (list)  Number of samples to use in training,
                                        one entry per trial.
        @param verbosity        (int)   Greater value prints out more
                                        progress.
        """
        self.dataPath = dataPath
        self.resultsDir = resultsDir
        self.experimentName = experimentName
        self.load = load
        self.modelName = modelName
        self.modelModuleName = modelModuleName
        self.numClasses = numClasses
        self.plots = plots
        self.orderedSplit = orderedSplit
        self.trainSize = trainSize
        self.verbosity = verbosity

        self.modelPath = os.path.join(
            self.resultsDir, self.experimentName, self.modelName)
        if not os.path.exists(self.modelPath):
            os.makedirs(self.modelPath)

        # The plotter is only created when plotting was requested.
        if self.plots:
            self.plotter = PlotNLP()

        self.dataDict = None
        self.labels = None
        self.labelRefs = None
        self.partitions = []
        self.samples = None
        self.patterns = None
        self.results = []
        self.model = None

    def _calculateTrialAccuracies(self):
        """
        @return trialAccuracies (defaultdict)   Items are defaultdicts, one
            for each size of the training set. Inner defaultdicts keys are
            classification categories, with numpy array values that contain
            one accuracy value for each trial.
        """
        # To handle multiple trials of the same size:
        # trialSize -> (category -> list of accuracies)
        trialAccuracies = defaultdict(
            lambda: defaultdict(lambda: numpy.empty(0)))
        for i, size in enumerate(self.trainSize):
            accuracies = self.model.calculateClassificationResults(
                self.results[i])
            for label, acc in accuracies:
                category = self.labelRefs[label]
                soFar = trialAccuracies[size][category]
                trialAccuracies[size][category] = numpy.append(soFar, acc)
        return trialAccuracies

    def _calculateClassificationAccuracies(self, trialAccuracies):
        """
        @param trialAccuracies  (defaultdict)   Please see the description in
            self._calculateTrialAccuracies().

        @return classificationAccuracies (defaultdict)  Keys are
            classification categories, with multiple numpy arrays as values --
            one for each size of training sets, with one accuracy value for
            each run of that training set size.
        """
        # Need the accuracies to be ordered for the plot
        trials = sorted(set(self.trainSize))
        # category -> list of list of accuracies
        classificationAccuracies = defaultdict(list)
        for trial in trials:
            for label, acc in trialAccuracies[trial].items():
                classificationAccuracies[label].append(acc)
        return classificationAccuracies

    def _mapLabelRefs(self):
        """
        Replace the label strings in self.dataDict with corresponding ints.

        self.labelRefs becomes the list of unique labels; each entry of
        self.dataDict is rewritten as (text, numpy array of indices into
        self.labelRefs).
        """
        self.labelRefs = list(set(itertools.chain.from_iterable(
            data[1] for data in self.dataDict.values())))
        # labelRefs holds unique labels, so a dict gives the same index that
        # list.index() would, without rescanning the list for every label.
        labelToIndex = {
            label: index for index, label in enumerate(self.labelRefs)}
        # Iterate over a snapshot because the values are reassigned below.
        for uniqueId, data in list(self.dataDict.items()):
            self.dataDict[uniqueId] = (
                data[0],
                numpy.array([labelToIndex[label] for label in data[1]]))

    def _preprocess(self, preprocess):
        """
        Tokenize the samples, with or without preprocessing.

        @param preprocess   (bool)  When True the tokenizer is also asked to
            ignore common words, remove "[identifier deleted]" strings, and
            correct spelling.

        Fills self.samples with one (tokens, labels) tuple per entry of
        self.dataDict.
        """
        texter = TextPreprocess()
        if preprocess:
            self.samples = [
                (texter.tokenize(data[0],
                                 ignoreCommon=100,
                                 removeStrings=["[identifier deleted]"],
                                 correctSpell=True),
                 data[1])
                for data in self.dataDict.values()]
        else:
            self.samples = [(texter.tokenize(data[0]), data[1])
                            for data in self.dataDict.values()]

    def setupData(self, preprocess=False, sampleIdx=2):
        """
        Get the data from CSV and preprocess if specified.

        @param preprocess   (bool)  Whether or not to preprocess the data
                                    when tokenizing the samples.
        @param sampleIdx    (int)   Column number of the text samples in the
                                    csv.

        Raises ValueError when self.trainSize is not a list, or when any of
        its entries is outside [0, number of samples read].
        """
        self.dataDict = readCSV(self.dataPath, sampleIdx, self.numClasses)

        if (not isinstance(self.trainSize, list) or not all(
                [0 <= size <= len(self.dataDict)
                 for size in self.trainSize])):
            raise ValueError("Invalid size(s) for training set.")

        self._mapLabelRefs()
        self._preprocess(preprocess)

        if self.verbosity > 1:
            for i, s in enumerate(self.samples):
                # Single-argument form prints the same text as `print i, s`.
                print("{} {}".format(i, s))

    def initModel(self):
        """
        Load or instantiate the classification model.

        With self.load set, the model is unpickled from "model.pkl" in
        self.modelPath. Otherwise the class self.modelName is looked up in
        the module self.modelModuleName and instantiated.

        Raises RuntimeError when the module cannot be imported or does not
        contain the class.
        """
        if self.load:
            # NOTE: unpickling can execute arbitrary code; only load model
            # files that come from a trusted source.
            with open(os.path.join(self.modelPath, "model.pkl"), "rb") as f:
                self.model = pkl.load(f)
            print("Model loaded from \'{0}\'.".format(self.modelPath))
        else:
            # Only the lookup is guarded, so errors raised by the model's own
            # constructor are not mislabelled as an import problem.
            try:
                module = __import__(
                    self.modelModuleName, {}, {}, self.modelName)
                modelClass = getattr(module, self.modelName)
            except (ImportError, AttributeError):
                raise RuntimeError(
                    "Could not import model class \'{0}\'.".format(
                        self.modelName))
            self.model = modelClass(verbosity=self.verbosity)

    def resetModel(self, trial):
        """
        Resets or initializes the model.

        @param trial    (int)   Trial number; not used by this
                                implementation.
        """
        if self.model is None:
            self.initModel()
        else:
            self.model.resetModel()

    def encodeSamples(self):
        """
        Encode the text samples into bitmap patterns, and log to txt file.
        The encoded patterns are stored in a dict along with their
        corresponding class labels.
        """
        self.patterns = [
            {"pattern": self.model.encodePattern(s[0]), "labels": s[1]}
            for s in self.samples]
        self.model.writeOutEncodings(self.patterns, self.modelPath)

    def runExperiment(self):
        """
        Train and test the model for each trial specified by self.trainSize.
        """
        for i, size in enumerate(self.trainSize):
            self.partitions.append(self.partitionIndices(size, i))

            self.resetModel(i)
            if self.verbosity > 0:
                print("\tTraining for run {0} of {1}.".format(
                    i + 1, len(self.trainSize)))
            self.training(i)
            if self.verbosity > 0:
                print("\tTesting for this run.")
            self.testing(i)

    def training(self, trial):
        """
        Train the model one-by-one on each pattern specified in this trial's
        partition of indices. Models' training methods require the sample and
        label to be in a list.

        @param trial    (int)   Index into self.partitions.
        """
        if self.verbosity > 0:
            print("\tRunner selects to train on sample(s) {}".format(
                self.partitions[trial][0]))

        for i in self.partitions[trial][0]:
            self.model.trainModel([self.patterns[i]["pattern"]],
                                  [self.patterns[i]["labels"]])

    def testing(self, trial):
        """
        Test the model on each pattern in this trial's test partition and
        append one (predictions, actual labels) pair of lists to
        self.results.

        @param trial    (int)   Index into self.partitions.
        """
        if self.verbosity > 0:
            print("\tRunner selects to test on sample(s) {}".format(
                self.partitions[trial][1]))

        results = ([], [])
        for i in self.partitions[trial][1]:
            predicted = self.model.testModel(self.patterns[i]["pattern"])
            results[0].append(predicted)
            results[1].append(self.patterns[i]["labels"])

        self.results.append(results)

    def writeOutClassifications(self):
        """
        Write the samples, actual, and predicted classes to a CSV, one file
        per trial under self.modelPath.

        NOTE(review): headers names three columns but each row tuple holds
        four values (sampleNum, sample, actual, pred); confirm this is what
        writeFromDict() expects.
        """
        headers = ("Tokenized sample", "Actual", "Predicted")
        for trial, _ in enumerate(self.trainSize):
            resultsDict = defaultdict(list)
            for i, sampleNum in enumerate(self.partitions[trial][1]):
                # Loop through the indices in the test set of this trial.
                sample = self.samples[sampleNum][0]
                pred = sorted(
                    [self.labelRefs[j] for j in self.results[trial][0][i]])
                actual = sorted(
                    [self.labelRefs[j] for j in self.results[trial][1][i]])
                resultsDict[sampleNum] = (sampleNum, sample, actual, pred)

            resultsPath = os.path.join(
                self.modelPath, "results_trial" + str(trial) + ".csv")
            writeFromDict(resultsDict, headers, resultsPath)

    def calculateResults(self):
        """
        Calculate evaluation metrics from the result classifications.

        Every trial is evaluated with self.model.evaluateResults() and a
        final report is printed. When self.plots is set the category and
        cumulative accuracies are plotted; when self.plots > 1 the confusion
        matrices are plotted as well.

        NOTE(review): setupConfusionMatrices() is not defined in this class;
        confirm a subclass provides it before running with plots > 1.

        TODO: pass intended CM results to plotter.plotConfusionMatrix()
        """
        resultCalcs = [
            self.model.evaluateResults(
                self.results[i], self.labelRefs, self.partitions[i][1])
            for i, _ in enumerate(self.trainSize)]

        self.model.printFinalReport(
            self.trainSize, [r[0] for r in resultCalcs])

        if self.plots:
            trialAccuracies = self._calculateTrialAccuracies()
            classificationAccuracies = (
                self._calculateClassificationAccuracies(trialAccuracies))

            self.plotter.plotCategoryAccuracies(
                trialAccuracies, self.trainSize)
            self.plotter.plotCumulativeAccuracies(
                classificationAccuracies, self.trainSize)

            if self.plots > 1:
                # Plot extra evaluation figures -- confusion matrix.
                self.plotter.plotConfusionMatrix(
                    self.setupConfusionMatrices(resultCalcs))

    def save(self):
        """Save the serialized model to "model.pkl" in self.modelPath."""
        print("Saving model to \'{0}\' directory.".format(self.modelPath))
        with open(os.path.join(self.modelPath, "model.pkl"), "wb") as f:
            pkl.dump(self.model, f)

    def partitionIndices(self, split, trial):
        """
        Returns train and test indices.

        @param split    (int)   Number of samples to put in the training set.
        @param trial    (int)   Trial number; not used by this
                                implementation.

        @return (tuple) (trainIdx, testIdx). With self.orderedSplit set, the
            first `split` indices train and the remaining ones test;
            otherwise `split` indices are sampled at random without
            repetition and every other index, in ascending order, is used
            for testing.

        TODO: use StandardSplit in data_split.py
        """
        length = len(self.samples)
        if self.orderedSplit:
            trainIdx = range(split)
            testIdx = range(split, length)
        else:
            # Randomly sampled, not repeated
            trainIdx = random.sample(range(length), split)
            # Set membership keeps building the test list linear in `length`.
            trainSet = set(trainIdx)
            testIdx = [i for i in range(length) if i not in trainSet]

        return (trainIdx, testIdx)

    def validateExperiment(self, expectationFilePath):
        """
        Returns accuracy of predicted labels against expected labels.

        @param expectationFilePath  (str)   Path to the CSV of expected
                                            labels, read with readCSV().

        @return (numpy.ndarray) One value per trial in self.results: the
            mean, over that trial's predictions, of the fraction of expected
            labels that were predicted. An empty prediction list is scored as
            the single label "(none)".
        """
        dataDict = readCSV(expectationFilePath, 2, self.numClasses)
        # Materialize the rows once instead of once per prediction.
        expectedItems = list(dataDict.items())

        accuracies = numpy.zeros((len(self.results)))
        for i, trial in enumerate(self.results):
            for j, predictionList in enumerate(trial[0]):
                predictions = [self.labelRefs[p] for p in predictionList]
                if predictions == []:
                    predictions = ["(none)"]
                # Expected rows for the test samples are offset by the number
                # of training samples in this trial.
                expected = expectedItems[j + self.trainSize[i]][1]

                accuracies[i] += (
                    float(len(set(predictions) & set(expected[1])))
                    / len(expected[1]))

            accuracies[i] = accuracies[i] / len(trial[0])

        return accuracies