def setupData(args):
  """
  Performs data preprocessing and setup given the user-specified args.

  @param args    (Namespace)   User-provided arguments via the cmd line.
  @return        (tuple)       Two-tuple where the first entry is a list of
                               (tokens, labels) pairs for the samples, and the
                               second is the list of all possible labels.
  """
  dataDict = readCSV(args.dataPath, 2, args.numLabels)

  # Collect each possible label string into a list, where the indices will be
  # their references throughout the experiment.
  labelReference = list(set(
      itertools.chain.from_iterable(dataDict.values())))

  # Replace each sample's label strings with their indices in labelReference.
  for sample, labels in dataDict.iteritems():
    dataDict[sample] = numpy.array(
        [labelReference.index(label) for label in labels], dtype="int8")

  texter = TextPreprocess()
  if args.textPreprocess:
    samples = [(texter.tokenize(sample,
                                ignoreCommon=100,
                                removeStrings=["[identifier deleted]"],
                                correctSpell=True),
                labels) for sample, labels in dataDict.iteritems()]
  else:
    samples = [(texter.tokenize(sample), labels)
               for sample, labels in dataDict.iteritems()]

  return samples, labelReference
def generateDataFile(inputData, outputDataDir, type):
  """
  Generates a samples data file in which each sample's words are scrambled or
  reversed, depending on the specified type.

  @param inputData      (str)   Path to the original samples CSV file.
  @param outputDataDir  (str)   Directory in which to write the new data file.
  @param type           (str)   Either "scrambled" or "reversed".
  """
  if not os.path.exists(outputDataDir):
    os.makedirs(outputDataDir)

  fileName = string.join(inputData.split(".")[:-1], ".") + "_" + type + ".csv"
  dataDict = readCSV(inputData, 2, 3)

  headers = ["QID", "QuestionText", "Response",
             "Classification1", "Classification2", "Classification3"]
  data = []
  for sample in dataDict.items():
    response = sample[1][0]
    tokens = response.split(" ")
    tokens = cleanTokens(tokens)
    response = None
    if type == "scrambled":
      random.shuffle(tokens)
      response = " ".join(tokens)
    elif type == "reversed":
      response = " ".join(tokens[::-1])
    dataToWrite = [sample[0], "", response]
    dataToWrite.extend(sample[1][1])
    data.append(dataToWrite)

  writeCSV(data, headers, os.path.join(outputDataDir, fileName))
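# Usage sketch (not part of the original module): assuming the readCSV and
# writeCSV helpers used above and an input file at the illustrative path
# "data/sample_reviews/sample_reviews_data_training.csv", both derived data
# files could be generated like this.
if __name__ == "__main__":
  inputData = "data/sample_reviews/sample_reviews_data_training.csv"
  for variant in ("scrambled", "reversed"):
    generateDataFile(inputData, "data/derived", variant)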
def split(self, filePath, numLabels, textPreprocess=False, abbrCSV="",
          contrCSV="", ignoreCommon=100,
          removeStrings="[identifier deleted]", correctSpell=True):
  """
  Split all the comments in a file into tokens. Preprocess if necessary.

  @param filePath        (str)    Path to csv file
  @param numLabels       (int)    Number of columns of category labels.
  @param textPreprocess  (bool)   True will preprocess text while tokenizing.

  Please see TextPreprocess tokenize() for the other parameters; they're only
  used when textPreprocess is True.
  """
  dataDict = readCSV(filePath, numLabels=numLabels)
  if dataDict is None:
    raise Exception("Could not read CSV.")

  preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
  expandAbbr = (abbrCSV != "")
  expandContr = (contrCSV != "")

  for i, uniqueID in enumerate(dataDict.keys()):
    comment, categories = dataDict[uniqueID]

    # Convert the categories to a string of their IDs
    categories = string.join(
        [str(self.categoryToId[c]) for c in categories])

    if textPreprocess:
      tokens = preprocessor.tokenize(comment, ignoreCommon, removeStrings,
                                     correctSpell, expandAbbr, expandContr)
    else:
      tokens = preprocessor.tokenize(comment)

    # Write the sequence of data records for this sample.
    record = {"_categories": categories, "_sequenceID": i}
    data = []
    reset = 1
    for t in tokens:
      tokenRecord = record.copy()
      tokenRecord["_token"] = t
      tokenRecord["_reset"] = reset
      tokenRecord["ID"] = uniqueID
      reset = 0
      data.append(tokenRecord)

    self.records.append(data)
def computeExpectedAccuracy(predictedLabels, dataPath):
  """
  Compute the accuracy of the model's predictions against what we expect it to
  predict; considers only single classification.
  """
  _, expectedLabels = readCSV(dataPath, 2, [3])
  if len(expectedLabels) != len(predictedLabels):
    raise ValueError("Lists of labels must have the same length.")

  accuracy = len([i for i in xrange(len(expectedLabels))
                  if expectedLabels[i] == predictedLabels[i]]) / float(
                      len(expectedLabels))

  print "Accuracy against expected classifications = ", accuracy
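# Worked example with hypothetical labels (not from the source data): three of
# the four positions match, so computeExpectedAccuracy would print 0.75.
#
#   expectedLabels  = ["a", "b", "c", "a"]
#   predictedLabels = ["a", "b", "a", "a"]
#   accuracy = 3 / float(4)  # 0.75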
def split(self, filePath, numLabels, textPreprocess=False, abbrCSV="",
          contrCSV="", ignoreCommon=100,
          removeStrings="[identifier deleted]", correctSpell=True):
  """
  Split all the comments in a file into tokens. Preprocess if necessary.

  @param filePath        (str)    Path to csv file
  @param numLabels       (int)    Number of columns of category labels.
  @param textPreprocess  (bool)   True will preprocess text while tokenizing.

  Please see TextPreprocess tokenize() for the other parameters; they're only
  used when textPreprocess is True.
  """
  dataDict = readCSV(filePath, numLabels=numLabels)
  if dataDict is None:
    raise Exception("Could not read CSV.")

  preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
  expandAbbr = (abbrCSV != "")
  expandContr = (contrCSV != "")

  for i, uniqueID in enumerate(dataDict.keys()):
    comment, categories = dataDict[uniqueID]

    # Convert the categories to a string of their IDs
    categories = string.join(
        [str(self.categoryToId[c]) for c in categories])

    if textPreprocess:
      tokens = preprocessor.tokenize(comment, ignoreCommon, removeStrings,
                                     correctSpell, expandAbbr, expandContr)
    else:
      tokens = preprocessor.tokenize(comment)

    # Write the sequence of data records for this sample.
    record = {"_categories": categories, "_sequenceID": i}
    data = []
    reset = 1
    for t in tokens:
      tokenRecord = record.copy()
      tokenRecord["_token"] = t
      tokenRecord["_reset"] = reset
      tokenRecord["ID"] = uniqueID
      reset = 0
      data.append(tokenRecord)

    self.records.append(data)
def validateExperiment(self, expectationFilePath):
  """Returns accuracy of predicted labels against expected labels."""
  dataDict = readCSV(expectationFilePath, numLabels=self.numClasses)

  accuracies = numpy.zeros((len(self.results)))
  for i, trial in enumerate(self.results):
    for j, predictionList in enumerate(trial[0]):
      predictions = [self.labelRefs[p] for p in predictionList]
      if predictions == []:
        predictions = ["(none)"]
      expected = dataDict.items()[j + self.trainSizes[i]][1]

      accuracies[i] += (float(len(set(predictions) & set(expected[1])))
                        / len(expected[1]))

    accuracies[i] = accuracies[i] / len(trial[0])

  return accuracies
def getExpectedClassifications(runner, expectationFilePath):
  """
  Return a list of the labels predicted by runner and a list of expected
  labels from the expected classifications file path.
  """
  dataDict = readCSV(expectationFilePath, 2, 3)

  expectedClasses = []
  resultClasses = []
  for i, trial in enumerate(runner.results):
    for j, predictionList in enumerate(trial[0]):
      predictions = [runner.labelRefs[p] for p in predictionList]
      if predictions == []:
        predictions = ["(none)"]
      resultClasses.append(predictions)
      expectedClasses.append(dataDict.items()[j + runner.trainSize[i]][1][1])

  return expectedClasses, resultClasses
def validateExperiment(self, expectationFilePath):
  """Returns accuracy of predicted labels against expected labels."""
  dataDict = readCSV(expectationFilePath, numLabels=self.numClasses)

  accuracies = numpy.zeros((len(self.results)))
  for i, trial in enumerate(self.results):
    for j, predictionList in enumerate(trial[0]):
      predictions = [self.labelRefs[p] for p in predictionList]
      if predictions == []:
        predictions = ["(none)"]
      expected = dataDict.items()[j + self.trainSizes[i]][1]

      accuracies[i] += (
          float(len(set(predictions) & set(expected[1]))) / len(expected[1]))

    accuracies[i] = accuracies[i] / len(trial[0])

  return accuracies
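# Illustration (hypothetical values, not from the source data) of the
# per-sample score accumulated in validateExperiment: the fraction of expected
# labels that the model predicted, computed via set intersection.
#
#   predictions = ["price", "service"]
#   expected    = ["price", "quality"]
#   score = float(len(set(predictions) & set(expected))) / len(expected)  # 0.5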
def getExpectedClassifications(runner, expectationFilePath):
  """
  Return a list of the labels predicted by runner and a list of expected
  labels from the expected classifications file path.
  """
  dataDict = readCSV(expectationFilePath, numLabels=3)

  expectedClasses = []
  resultClasses = []
  for i, trial in enumerate(runner.results):
    for j, predictionList in enumerate(trial[0]):
      predictions = [runner.labelRefs[p] for p in predictionList]
      if predictions == []:
        predictions = ["(none)"]
      resultClasses.append(predictions)
      expectedClasses.append(
          dataDict.items()[j + runner.trainSizes[i]][1][1])

  return expectedClasses, resultClasses
def setupData(args):
  """
  Performs data preprocessing and setup given the user-specified args.

  @param args    (Namespace)   User-provided arguments via the cmd line.
  @return        (tuple)       Two-tuple where the first entry is a list of
                               (tokens, labels) pairs for the samples, and the
                               second is the list of all possible labels.
  """
  dataDict = readCSV(args.dataPath, 2, args.numLabels)

  # Collect each possible label string into a list, where the indices will be
  # their references throughout the experiment.
  labelReference = list(set(
      itertools.chain.from_iterable(
          map(lambda x: x[1], dataDict.values()))))

  # Replace each sample's label strings with their indices in labelReference.
  for idx, data in dataDict.iteritems():
    comment, labels = data
    dataDict[idx] = (comment, numpy.array(
        [labelReference.index(label) for label in labels], dtype="int8"))

  texter = TextPreprocess(abbrCSV=args.abbrCSV, contrCSV=args.contrCSV)
  expandAbbr = (args.abbrCSV != "")
  expandContr = (args.contrCSV != "")

  if args.textPreprocess:
    samples = [(texter.tokenize(data[0],
                                ignoreCommon=100,
                                removeStrings=["[identifier deleted]"],
                                correctSpell=True,
                                expandAbbr=expandAbbr,
                                expandContr=expandContr),
                data[1]) for _, data in dataDict.iteritems()]
  else:
    samples = [(texter.tokenize(data[0]), data[1])
               for _, data in dataDict.iteritems()]

  return samples, labelReference
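# A minimal sketch (assumed, not part of the original module) of the argparse
# Namespace that setupData(args) above expects; the attribute names mirror the
# ones the function reads, and the default path is only illustrative.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--dataPath",
                    default="data/sample_reviews/sample_reviews_data_training.csv")
parser.add_argument("--numLabels", type=int, default=3)
parser.add_argument("--textPreprocess", action="store_true")
parser.add_argument("--abbrCSV", default="")
parser.add_argument("--contrCSV", default="")
args = parser.parse_args()

samples, labelReference = setupData(args)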
def setupData(self, preprocess=False, sampleIdx=2):
  """
  Get the data from CSV and preprocess if specified. One index in labelIdx
  implies the model will train on a single classification per sample.

  @param preprocess   (bool)   Whether or not to preprocess the data when
                               generating the files
  @param sampleIdx    (int)    Column number of the text samples in the csv
  """
  self.dataDict = readCSV(self.dataPath, sampleIdx, self.numClasses)

  if (not isinstance(self.trainSize, list) or not
      all([0 <= size <= len(self.dataDict) for size in self.trainSize])):
    raise ValueError("Invalid size(s) for training set.")

  self._mapLabelRefs()

  self._preprocess(preprocess)

  if self.verbosity > 1:
    for i, s in enumerate(self.samples):
      print i, s
def setupData(self, preprocess=False, sampleIdx=2):
  """
  Get the data from a directory and preprocess if specified. One index in
  labelIdx implies the model will train on a single classification per sample.
  """
  self.dataDict = readDir(self.dataPath, sampleIdx, self.numClasses, True)

  if self.test:
    self.testDict = readCSV(self.test, sampleIdx, self.numClasses)

  minCategorySize = min(map(len, self.dataDict.values()))
  # trainSize must be a list, and every size must fit within the smallest
  # category.
  if (not isinstance(self.trainSize, list) or not
      all([0 <= size <= minCategorySize for size in self.trainSize])):
    raise ValueError("Invalid size(s) for training set.")

  self._mapLabelRefs()

  self._preprocess(preprocess)

  if self.verbosity > 1:
    for i, s in enumerate(self.samples):
      print i, s
def setupData(self, preprocess=False, sampleIdx=2):
  """
  Get the data from CSV and preprocess if specified. One index in labelIdx
  implies the model will train on a single classification per sample.

  @param preprocess   (bool)   Whether or not to preprocess the data when
                               generating the files
  @param sampleIdx    (int)    Column number of the text samples in the csv
  """
  self.dataDict = readCSV(self.dataPath, sampleIdx, self.numClasses)

  if (not isinstance(self.trainSize, list) or not
      all([0 <= size <= len(self.dataDict) for size in self.trainSize])):
    raise ValueError("Invalid size(s) for training set.")

  self._mapLabelRefs()

  self._preprocess(preprocess)

  if self.verbosity > 1:
    for i, s in enumerate(self.samples):
      print i, s
def setupData(self, preprocess=False):
  """
  Get the data from CSV and preprocess if specified. The call to readCSV()
  assumes a specific CSV format, detailed in its docstring.

  @param preprocess   (bool)   Whether or not to preprocess the data when
                               reading in samples.
  """
  self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses)

  if (not isinstance(self.trainSizes, list) or not
      all([0 <= size <= len(self.dataDict) for size in self.trainSizes])):
    raise ValueError("Invalid size(s) for training set.")

  self._mapLabelRefs()

  self.samples = self.model.prepData(self.dataDict, preprocess)

  self.encodeSamples()

  if self.verbosity > 1:
    for i, s in self.samples.iteritems():
      print i, s
def setupData(self, preprocess=False):
  """
  Get the data from CSV and preprocess if specified. The call to readCSV()
  assumes a specific CSV format, detailed in its docstring.

  @param preprocess   (bool)   Whether or not to preprocess the data when
                               reading in samples.
  """
  self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses)

  if not isinstance(self.trainSizes, list) or not all(
      [0 <= size <= len(self.dataDict) for size in self.trainSizes]):
    raise ValueError("Invalid size(s) for training set.")

  self._mapLabelRefs()

  self.samples = self.model.prepData(self.dataDict, preprocess)

  self.encodeSamples()

  if self.verbosity > 1:
    for i, s in self.samples.iteritems():
      print i, s
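# For reference, an assumed illustration (values invented) of the dataDict
# shape that the readCSV(..., numLabels=...) calls above appear to return,
# based on how split() and prepData() consume it: unique sample IDs mapping to
# (comment, category-labels) pairs.
#
#   dataDict = {
#       "Q1_1": ("great prices and friendly staff", ["price", "service"]),
#       "Q1_2": ("the checkout line was too long", ["service"]),
#   }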
def run(args):
  """
  The experiment is configured to run on question response data.

  The runner sets up the data path such that the experiment runs on a single
  data file located in the nupic.fluent/data directory. The data path MUST BE
  SPECIFIED at the cmd line, e.g. from the fluent dir:

    python experiments/random_baseline_runner.py
      data/sample_reviews/sample_reviews_data_training.csv

  To run k-folds cross validation, arguments must be: kFolds > 1,
  train = False, test = False. To run either training or testing, kFolds = 1.
  """
  start = time.time()

  # Setup directories.
  root = os.path.dirname(__file__)
  dataPath = os.path.abspath(os.path.join(root, '../..', args.dataFile))
  modelPath = os.path.abspath(
      os.path.join(root, args.resultsDir, args.expName, args.modelName))
  if not os.path.exists(modelPath):
    os.makedirs(modelPath)

  # Verify input params.
  if not os.path.isfile(dataPath):
    raise ValueError("Invalid data path.")
  if (not isinstance(args.kFolds, int)) or (args.kFolds < 1):
    raise ValueError("Invalid value for number of cross-validation folds.")
  if args.train and args.test:
    raise ValueError("Run training and testing independently.")
  if (args.train or args.test) and args.kFolds > 1:
    raise ValueError("Experiment runs either k-folds CV or training/testing, "
                     "not both.")

  # Load or init model.
  if args.load:
    with open(os.path.join(modelPath, "model.pkl"), "rb") as f:
      model = pkl.load(f)
    print "Model loaded from \'{0}\'.".format(modelPath)
  else:
    model = ClassificationModelRandomSDR(verbosity=args.verbosity)

  # Get and prep data.
  texter = TextPreprocess()
  samples, labels = readCSV(dataPath, 2, [3])  # Y data, [3] -> range(3,6)
  labelReference = list(set(labels))
  labels = numpy.array([labelReference.index(l) for l in labels], dtype=int)
  split = len(samples) / args.kFolds
  samples = [texter.tokenize(sample,
                             ignoreCommon=100,
                             removeStrings=["[identifier deleted]"],
                             correctSpell=True)
             for sample in samples]
  if args.verbosity > 1:
    for i, s in enumerate(samples):
      print i, s, labelReference[labels[i]]
  patterns = [[model.encodePattern(t) for t in tokens] for tokens in samples]

  # Either we train on all the data, test on all the data, or run k-fold CV.
  if args.train:
    training(model, [(p, labels[i]) for i, p in enumerate(patterns)])

  elif args.test:
    trialResults = testing(model,
                           [(p, labels[i]) for i, p in enumerate(patterns)])

  elif args.kFolds > 1:
    intermResults = []
    predictions = []
    for k in range(args.kFolds):
      # Train the model on a subset, and hold the evaluation subset.
      model.resetModel()
      evalIndices = range(k * split, (k + 1) * split)
      trainIndices = [i for i in range(len(samples)) if not i in evalIndices]

      print "Training for CV fold {0}.".format(k)
      training(model, [(patterns[i], labels[i]) for i in trainIndices])

      print "Evaluating for trial {0}.".format(k)
      trialResults = testing(model,
                             [(patterns[i], labels[i]) for i in evalIndices])

      if args.expectationDataPath:
        # Keep the predicted labels (top prediction only) for later.
        p = [l if l else [None] for l in trialResults[0]]
        predictions.append(
            [labelReference[idx[0]] if idx[0] != None else '(none)'
             for idx in p])

      print "Calculating intermediate results for this fold."
      result = model.evaluateTrialResults(
          trialResults, labelReference, evalIndices)
      intermResults.append(result)
      result[1].to_csv(os.path.join(
          modelPath, "evaluation_fold_" + str(k) + ".csv"))

    print "Calculating cumulative results for {0} trials.".format(args.kFolds)
    results = model.evaluateFinalResults(intermResults)
    results["total_cm"].to_csv(os.path.join(modelPath,
                                            "evaluation_totals.csv"))

    if args.expectationDataPath:
      computeExpectedAccuracy(
          list(itertools.chain.from_iterable(predictions)),
          os.path.abspath(os.path.join(root, '../..',
                                       args.expectationDataPath)))

  print "Calculating random classifier results for comparison."
  print model.classifyRandomly(labels)

  print "Saving model to \'{0}\' directory.".format(modelPath)
  with open(os.path.join(modelPath, "model.pkl"), "wb") as f:
    pkl.dump(model, f)

  print "Experiment complete in {0:.2f} seconds.".format(time.time() - start)
def run(args):
  """
  The experiment is configured to run on question response data.

  To run k-folds cross validation, arguments must be: kFolds > 1,
  train = False, test = False. To run either training or testing, kFolds = 1.
  """
  start = time.time()

  # Setup directories.
  root = os.path.dirname(__file__)
  dataPath = os.path.abspath(os.path.join(root, '../..', args.dataFile))
  modelPath = os.path.abspath(
      os.path.join(root, args.resultsDir, args.expName, args.modelName))
  if not os.path.exists(modelPath):
    os.makedirs(modelPath)

  # Verify input params.
  if not os.path.isfile(dataPath):
    raise ValueError("Invalid data path.")
  if (not isinstance(args.kFolds, int)) or (args.kFolds < 1):
    raise ValueError("Invalid value for number of cross-validation folds.")
  if args.train and args.test:
    raise ValueError("Run training and testing independently.")
  if (args.train or args.test) and args.kFolds > 1:
    raise ValueError("Experiment runs either k-folds CV or training/testing, "
                     "not both.")

  # Load or init model.
  if args.load:
    with open(os.path.join(modelPath, "model.pkl"), "rb") as f:
      model = pkl.load(f)
    print "Model loaded from \'{0}\'.".format(modelPath)
  else:
    try:
      module = __import__(args.modelModuleName, {}, {}, args.modelName)
      modelClass = getattr(module, args.modelName)
      model = modelClass(verbosity=args.verbosity)
    except ImportError:
      raise RuntimeError("Could not find model class \'%s\' to import."
                         % args.modelName)

  print "Reading in data and preprocessing."
  texter = TextPreprocess()
  samples, labels = readCSV(dataPath, 2, [3])  # Y data, [3] -> range(3,6)
  labelReference = list(set(labels))
  labels = numpy.array([labelReference.index(l) for l in labels],
                       dtype="int8")
  samples = [texter.tokenize(sample,
                             ignoreCommon=100,
                             removeStrings=["[identifier deleted]"],
                             correctSpell=True)
             for sample in samples]
  if args.verbosity > 1:
    for i, s in enumerate(samples):
      print i, s, labelReference[labels[i]]
  patterns = [model.encodePattern(s) for s in samples]

  # Either we train on all the data, test on all the data, or run k-fold CV.
  if args.train:
    training(model, [(p, labels[i]) for i, p in enumerate(patterns)])

  elif args.test:
    results = testing(model, [(p, labels[i]) for i, p in enumerate(patterns)])
    calculateTrialResults(model, results, labelReference,
                          xrange(len(samples)),
                          os.path.join(modelPath, "test_results.csv"))

  elif args.kFolds > 1:
    # Run k-folds cross validation -- train the model on a subset, and
    # evaluate on the remaining subset.
    partitions = KFolds(args.kFolds).split(xrange(len(samples)))
    intermResults = []
    predictions = []
    for k in xrange(args.kFolds):
      print "Training and testing for CV fold {0}.".format(k)
      trialResults = runExperiment(model, patterns, labels, partitions[k])

      if args.expectationDataPath:
        # Keep the predicted labels (top prediction only) for later.
        p = [l if l else [None] for l in trialResults[0]]
        predictions.append(
            [labelReference[idx[0]] if idx[0] != None else '(none)'
             for idx in p])

      print "Calculating intermediate results for this fold. Writing to CSV."
      intermResults.append(calculateTrialResults(
          model, trialResults, labelReference, partitions[k][1],
          os.path.join(modelPath, "evaluation_fold_" + str(k) + ".csv")))

    print "Calculating cumulative results for {0} trials.".format(args.kFolds)
    results = model.evaluateFinalResults(intermResults)
    results["total_cm"].to_csv(os.path.join(modelPath,
                                            "evaluation_totals.csv"))

    if args.expectationDataPath:
      computeExpectedAccuracy(
          list(itertools.chain.from_iterable(predictions)),
          os.path.abspath(os.path.join(root, '../..',
                                       args.expectationDataPath)))

  print "Calculating random classifier results for comparison."
  print model.classifyRandomly(labels)

  print "Saving model to \'{0}\' directory.".format(modelPath)
  with open(os.path.join(modelPath, "model.pkl"), "wb") as f:
    pkl.dump(model, f)

  print "Experiment complete in {0:.2f} seconds.".format(time.time() - start)