def _loadData(self):
  """
  Load data, returning a dict of text data objects. Keys are line numbers
  at which the text appears in the CSV file.
  """
  # numLabels=0 trains the models in an unsupervised fashion.
  return readCSV(self.dataPath, numLabels=0)
def generateDataFile(inputData, outputDataDir, type):
  """
  Generates a samples data file in which each sample's words are either
  scrambled or reversed, depending on the type argument.

  @param inputData      (str) Path to the original samples CSV file.
  @param outputDataDir  (str) Directory to write the generated file to.
  @param type           (str) Either "scrambled" or "reversed".
  """
  if not os.path.exists(outputDataDir):
    os.makedirs(outputDataDir)

  # Build the output file name from the input name, minus its extension.
  fileName = string.join(inputData.split(".")[:-1], ".") + "_" + type + ".csv"
  dataDict = readCSV(inputData, numLabels=3)

  headers = ["QID", "QuestionText", "Response",
             "Classification1", "Classification2", "Classification3"]
  data = []
  for sample in dataDict.items():
    response = sample[1][0]
    tokens = cleanTokens(response.split(" "))
    response = None
    if type == "scrambled":
      random.shuffle(tokens)
      response = " ".join(tokens)
    elif type == "reversed":
      response = " ".join(tokens[::-1])
    dataToWrite = [sample[0], "", response]
    dataToWrite.extend(sample[1][1])
    data.append(dataToWrite)

  writeCSV(data, headers, os.path.join(outputDataDir, fileName))
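# A standalone sketch of the two "type" transformations above, independent
# of the CSV utilities (cleanTokens is skipped here; this only illustrates
# the shuffle and reversal steps):
import random

tokens = "the quick brown fox".split(" ")

# "reversed": word order flipped end to end.
reversedResponse = " ".join(tokens[::-1])
assert reversedResponse == "fox brown quick the"

# "scrambled": word order randomized in place.
scrambled = list(tokens)
random.shuffle(scrambled)
scrambledResponse = " ".join(scrambled)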
def split(self, filePath=None, numLabels=3, textPreprocess=False,
          dataDict=None, abbrCSV="", contrCSV="", ignoreCommon=100,
          removeStrings="[identifier deleted]", correctSpell=True):
  """
  Split all the comments in a file into tokens, with or without
  preprocessing. If both filePath and dataDict are given, filePath wins.

  @param filePath        (str)  Path to CSV file.
  @param dataDict        (dict) Data as returned by readCSV().
  @param numLabels       (int)  Number of columns of category labels.
  @param textPreprocess  (bool) True will preprocess text while tokenizing.

  @return dataDict       (dict) Data as read in from filePath.

  Please see TextPreprocess.tokenize() for the other parameters; they are
  only used when textPreprocess is True.
  """
  if filePath:
    dataDict = readCSV(filePath, numLabels=numLabels)
  if dataDict is None:
    raise Exception("No data given, or could not read CSV.")

  preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
  expandAbbr = (abbrCSV != "")
  expandContr = (contrCSV != "")

  for recordNum, record in dataDict.iteritems():
    comment, categories, uniqueID = record
    # Convert the categories to a space-separated string of their IDs.
    categories = string.join([str(self.categoryToId[c]) for c in categories])

    if textPreprocess:
      tokens, _ = preprocessor.tokenizeAndFilter(
          comment, ignoreCommon, removeStrings, correctSpell, expandAbbr,
          expandContr)
    else:
      tokens = preprocessor.tokenize(comment)

    data = self._formatSequence(tokens, categories, recordNum, uniqueID)
    self.records.append(data)
    self.sequenceCount += 1

  return dataDict
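# Python 2 note: string.join(words) joins with a single space by default,
# so the category encoding above is equivalent to " ".join(...). A quick
# sketch with a hypothetical categoryToId mapping:
import string

categoryToId = {"billing": 0, "shipping": 1}  # hypothetical example mapping
categories = ["shipping", "billing"]
encoded = string.join([str(categoryToId[c]) for c in categories])
assert encoded == "1 0"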
def run(args):
  if args.loadPath:
    model = loadModel(args.loadPath)

  elif args.modelName == "HTMNetwork":
    networkConfig = loadJSON(_NETWORK_JSON)

    print "Creating the network model..."
    model = _createModel(modelName=args.modelName,
                         savePath=args.savePath,
                         networkConfig=networkConfig,
                         inputFilePath=args.dataPath,
                         prepData=True,
                         numLabels=0,
                         stripCats=True,
                         retinaScaling=1.0)

    numRecords = sum(
        model.networkDataGen.getNumberOfTokens(model.networkDataPath))
    print "Training the model..."
    model.trainModel(iterations=numRecords)  # TODO: switch to trainNetwork

  else:
    model = _createModel(modelName=args.modelName, savePath=args.savePath)

    dataDict = readCSV(args.dataPath, numLabels=0)

    print "Preparing and encoding the data..."
    samples = model.prepData(dataDict, args.preprocess)
    patterns = model.encodeSamples(samples)

    print "Training the model..."
    for i in xrange(len(samples)):
      model.trainModel(i)

  if args.savePath:
    model.saveModel()

  # Query the model interactively until the user quits.
  printTemplate = "{0:<10}|{1:<30}"
  while True:
    print "Now we query the model for samples (quit with 'q')..."
    query = raw_input("Enter a query: ")
    if query == "q":
      break
    sortedDistances = model.queryModel(query, args.preprocess)
    print printTemplate.format("Sample ID", "Distance from query")
    for sID, dist in sortedDistances:
      print printTemplate.format(sID, dist)
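# The query loop prints fixed-width, left-aligned columns. A minimal
# illustration of the format string:
printTemplate = "{0:<10}|{1:<30}"
print printTemplate.format("Sample ID", "Distance from query")
print printTemplate.format(42, 0.137)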
def readData(args):
  """
  Read the data file, print out some statistics, and return a training
  set, a test set, a labelId-to-text map, and a docId-to-categories map.

  Return format:
    trainingData = [
      ["fox eats carrots", [0], docId],
      ["fox eats peppers", [0], docId],
      ["carrots are healthy", [1], docId],
      ["peppers is healthy", [1], docId],
    ]
  """
  # Read data
  dataDict = readCSV(args.dataPath, 1)
  labelRefs, dataDict = mapLabelRefs(dataDict)
  categoriesInOrderOfInterest = [8, 9, 10, 5, 6, 11, 13][0:args.numLabels]

  # Select data based on categories of interest. Shift category indices
  # down so we go from 0 to numLabels-1.
  trainingData = []
  counts = numpy.zeros(len(labelRefs))
  for document in dataDict.itervalues():
    docId = document[2]
    oldCategoryIndex = document[1][0]
    if oldCategoryIndex in categoriesInOrderOfInterest:
      newIndex = categoriesInOrderOfInterest.index(oldCategoryIndex)
      trainingData.append([document[0], [newIndex], docId])
      counts[newIndex] += 1

  # For each document, figure out which categories it belongs to,
  # including the shifted category index.
  documentCategoryMap = {}
  for doc in dataDict.iteritems():
    docId = doc[1][2]
    oldCategoryIndex = doc[1][1][0]
    if oldCategoryIndex in categoriesInOrderOfInterest:
      newIndex = categoriesInOrderOfInterest.index(oldCategoryIndex)
      v = documentCategoryMap.get(docId, [])
      v.append(newIndex)
      documentCategoryMap[docId] = v

  labelRefs = [labelRefs[i] for i in categoriesInOrderOfInterest]

  print "Total number of unique documents", len(documentCategoryMap)
  print "Category counts: ", counts
  print "Categories in training/test data:", labelRefs

  # The test set returned here is the training set itself.
  return trainingData, trainingData, labelRefs, documentCategoryMap
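# A standalone sketch of the index shifting above: categories of interest
# keep their relative order but are remapped to contiguous indices
# 0..numLabels-1.
categoriesInOrderOfInterest = [8, 9, 10, 5, 6, 11, 13][0:3]  # -> [8, 9, 10]
oldCategoryIndex = 10
newIndex = categoriesInOrderOfInterest.index(oldCategoryIndex)
assert newIndex == 2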
def __init__(self, dataPath="data.csv"):
  """
  Initializes the Imbu model with the given sample data.

  :param str dataPath: Path to the sample data file. Must be a CSV file
    with 'ID' and 'Sample' columns.
  """
  g_log.info("Initialize imbu model")
  csvdata = readCSV(dataPath, numLabels=0)

  self.samples = OrderedDict()
  for dataID, text in csvdata.iteritems():
    self.samples[dataID] = text

  self.models = {modelName: createModel(modelName, dataPath, csvdata)
                 for modelName, modelFactory in _MODEL_MAPPING.iteritems()}
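# OrderedDict preserves insertion order, so self.samples keeps the rows in
# whatever order readCSV yields them. Minimal illustration (Python 2):
from collections import OrderedDict

samples = OrderedDict([(0, "first row"), (1, "second row")])
assert samples.keys() == [0, 1]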
def validateExperiment(self, expectationFilePath):
  """Returns accuracy of predicted labels against expected labels."""
  dataDict = readCSV(expectationFilePath, numLabels=self.numClasses)

  accuracies = numpy.zeros(len(self.results))
  for i, trial in enumerate(self.results):
    for j, predictionList in enumerate(trial[0]):
      predictions = [self.labelRefs[p] for p in predictionList]
      if predictions == []:
        predictions = ["(none)"]
      expected = dataDict.items()[j + self.trainSizes[i]][1]
      accuracies[i] += (float(len(set(predictions) & set(expected[1]))) /
                        len(expected[1]))
    accuracies[i] = accuracies[i] / len(trial[0])

  return accuracies
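# Worked sketch of the per-sample score above: the fraction of expected
# labels that appear among the predictions, later averaged over the trial.
predictions = ["bug", "feature"]
expected = ["bug", "question"]
score = float(len(set(predictions) & set(expected))) / len(expected)
assert score == 0.5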
def getExpectedClassifications(runner, expectationFilePath):
  """
  Return a list of the labels predicted by runner and a list of expected
  labels from the expected classifications file path.
  """
  dataDict = readCSV(expectationFilePath, numLabels=3)

  expectedClasses = []
  resultClasses = []
  for trial, trialResults in enumerate(runner.results):
    for i, predictionList in enumerate(trialResults[0]):
      predictions = [runner.labelRefs[p] for p in predictionList]
      if predictions == []:
        predictions = ["(none)"]
      resultClasses.append(predictions)
      expectedClasses.append(
          dataDict.items()[i + runner.trainSizes[trial]][1][1])

  return expectedClasses, resultClasses
def setupData(self, preprocess=False):
  """
  Get the data from CSV and preprocess if specified. The call to readCSV()
  assumes a specific CSV format, detailed in its docstring.

  @param preprocess (bool) Whether or not to preprocess the data when
      reading in samples.
  """
  self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses)

  if self.experimentType == "incremental":
    # Stop now if the data won't work for the specified experiment.
    if (not isinstance(self.trainSizes, list) or not
        all([0 <= size <= len(self.dataDict) for size in self.trainSizes])):
      raise ValueError("Invalid size(s) for training set(s).")

  self.labelRefs, self.dataDict = mapLabelRefs(self.dataDict)

  self.samples = self.model.prepData(self.dataDict, preprocess)

  if self.verbosity > 1:
    for i, s in self.samples.iteritems():
      print i, s
try:
  # resource_filename() returns a path string, not a file object, so it
  # must be wrapped in open() to use as a context manager.
  with open(pkg_resources.resource_filename(__name__, jsonPath)) as f:
    return json.load(f)
except IOError as e:
  print "Could not find JSON at '{}'.".format(jsonPath)
  raise e


# Indicates global ready status of all models. g_ready will transition to
# True when all models have been created, trained, and are ready to handle
# requests.
g_ready = False
g_models = {}

# Get data and order by unique ID.
g_csvdata = readCSV(
    os.getenv("IMBU_DATA",
              pkg_resources.resource_filename(__name__, "data.csv")),
    numLabels=0)
g_samples = OrderedDict(
    (sample[2], sample[0]) for sample in g_csvdata.values())


def createModel(modelName, modelFactory):
  """Return an instantiated model."""
  global g_models
# Get data and order by unique ID.
g_csvdata = readCSV(_DATA_PATH, numLabels=0)
g_samples = OrderedDict(
    (int(sample[2]), sample[0]) for sample in g_csvdata.values())


def createModel(modelName, modelFactory):
  """Return an instantiated model."""
  global g_models
  modelDir = os.path.join(_MODEL_CACHE_DIR_PREFIX, modelName)
  try:
    print "Attempting to load from", modelDir
    model = ClassificationModel.loadModel(modelDir)
    modelProxy = SynchronousBackgroundModelProxy(model)