def train(self): """ Train the network regions on the entire dataset. There should be one datafile for each training rep in self.dataFiles, where every data sample (i.e. sequence) appears only once in each file. """ # TODO: ignore patterns < minSparsity (= 0.9 * unionSparsity) if self.trainingReps != len(self.dataFiles): raise RuntimeError( "Mismatch between the number of specified training " "reps and the number of data files (should be 1:1).") for dataFile in self.dataFiles: if self.verbosity > 0: print "Running all the data through the network for training..." self.model.swapRecordStream(dataFile) numTokens = NetworkDataGenerator().getNumberOfTokens(dataFile) n = sum(numTokens) self.model.trainNetwork(n) # Populate the classifier space by running through the current data file; # learning (in other regions) is turned off by the model. if self.verbosity > 1: print "Populating the classifier with all of the sequences." self.classifiedSeqIds = self.model.classifyNetwork(n)
def testSplitPreprocess(self):
  """Split with preprocessing enabled should spell-correct and drop common
  tokens, yielding the expected records."""
  ndg = NetworkDataGenerator()
  inputPath = os.path.join(self.dirName, "test_data/multi_sample.csv")

  sequenceOne = [{"_token": "gohbkchoo",
                  "_category": "0 1",
                  "_sequenceId": 0,
                  "ID": "1",
                  "_reset": 1}]
  sequenceTwo = [{"_token": "o",
                  "_category": "2",
                  "_sequenceId": 1,
                  "ID": "2",
                  "_reset": 1},
                 {"_token": "ca",
                  "_category": "2",
                  "_sequenceId": 1,
                  "ID": "2",
                  "_reset": 0}]
  expected = [sequenceOne, sequenceTwo]

  ndg.split(inputPath, 3, True, ignoreCommon=100, correctSpell=True)
  self.assertRecordsEqual(ndg.records, expected)
def generateNetworkDataFiles(self, splits, seed, preprocess, **kwargs): # TODO: use model.prepData()? ndg = NetworkDataGenerator() self.dataDict = ndg.split(filePath=self.dataPath, numLabels=self.numClasses, textPreprocess=preprocess, **kwargs) filename, ext = os.path.splitext(self.dataPath) self.classificationFile = "{}_categories.json".format(filename) # Generate one data file for each experiment iteration. if self.experimentType == "k-folds" and not self.orderedSplit: # only randomize the data order once for k-folds cross validation ndg.randomizeData(seed) for i in xrange(splits): if self.experimentType != "k-folds" and not self.orderedSplit: ndg.randomizeData(seed) seed += 1 # ext='.csv' dataFile = "{}_network_{}{}".format(filename, i, ext) ndg.saveData(dataFile, self.classificationFile) self.dataFiles.append(dataFile) if self.verbosity > 0: print "{} file(s) generated at {}".format(len(self.dataFiles), self.dataFiles) print "Classification JSON is at: {}".format( self.classificationFile)
def testSaveDataIncorrectType(self):
  """saveData must raise TypeError when the categories file is not JSON."""
  ndg = NetworkDataGenerator()
  inputPath = os.path.join(self.dirName, "test_data/multi_sample.csv")
  splitOutputPath = os.path.join(self.dirName,
                                 "test_data/multi_sample_split.csv")
  # Deliberately a .csv extension (not .json) to trigger the type error.
  categoriesPath = os.path.join(self.dirName,
                                "test_data/multi_sample_categories.csv")
  ndg.split(inputPath, 3, False)

  with self.assertRaises(TypeError):
    ndg.saveData(splitOutputPath, categoriesPath)
def _deSerializeExtraData(self, extraDataDir):
  """
  Protected method that is called during deserialization (after __setstate__)
  with an external directory path. We override it here to load the Network
  API instance and re-create its helpers.

  @param extraDataDir (string) Model's extra data directory path
  """
  networkPath = os.path.join(extraDataDir, "network.nta")
  self.network = Network(networkPath)
  self._initializeRegionHelpers()
  self.networkDataGen = NetworkDataGenerator()
def testFileRecordStreamReadData(self):
  """Saved network data must be readable by FileRecordStream."""
  ndg = NetworkDataGenerator()
  inputPath = os.path.join(self.dirName, "test_data/multi_sample.csv")
  ndg.split(inputPath, 3, False)

  splitOutputPath = os.path.join(self.dirName,
                                 "test_data/multi_sample_split.csv")
  categoriesPath = os.path.join(self.dirName,
                                "test_data/multi_sample_categories.json")
  ndg.saveData(splitOutputPath, categoriesPath)

  # If no error is raised, then the data is in the correct format
  frs = FileRecordStream(splitOutputPath)
def testSaveData(self):
  """saveData must write the expected records (with type/special header
  rows) and the expected categories JSON."""
  ndg = NetworkDataGenerator()
  inputPath = os.path.join(self.dirName, "test_data/multi_sample.csv")
  ndg.split(inputPath, 3, False)

  dataOutputFile = os.path.join(self.dirName,
                                "test_data/multi_sample_split.csv")
  categoriesOutputFile = os.path.join(
      self.dirName, "test_data/multi_sample_categories.json")
  self.assertTrue(ndg.saveData(dataOutputFile, categoriesOutputFile))

  dataTable = pandas.read_csv(dataOutputFile).fillna("")

  # The first two csv rows are the field types and the special markers.
  types = {"_category": "list",
           "_token": "string",
           "_sequenceId": "int",
           "_reset": "int",
           "ID": "string"}
  specials = {"_category": "C",
              "_token": "",
              "_sequenceId": "S",
              "_reset": "R",
              "ID": ""}

  expectedRecords = [types, specials] + [
      record for data in self.expected for record in data]

  for idx, values in dataTable.iterrows():
    record = values.to_dict()
    if idx > 1:
      # csv values are strings, so cast the ints
      record["_sequenceId"] = int(record["_sequenceId"])
      record["_reset"] = int(record["_reset"])
    self.assertDictEqual(record, expectedRecords[idx])

  expectedCategories = {"kitchen": 0, "environment": 1, "not helpful": 2}
  with open(categoriesOutputFile) as f:
    self.assertDictEqual(json.load(f), expectedCategories)
def __init__(self,
             networkConfig,
             inputFilePath,
             retinaScaling=1.0,
             retina="en_associative",
             apiKey=None,
             verbosity=1,
             numLabels=3,
             modelDir="ClassificationModelHTM",
             prepData=True,
             stripCats=False):
  """
  @param networkConfig  (dict)  Network configuration dict with region
                                parameters.
  @param inputFilePath  (str)   Path to data file.
  @param retinaScaling  (float) Scales the dimensions of the SDRs.
  @param retina         (str)   Name of Cio retina.
  @param apiKey         (str)   Key for Cio API.
  @param prepData       (bool)  Prepare the input data into network API
                                format.
  @param stripCats      (bool)  Remove the categories and replace them with
                                the sequence_Id.
  See ClassificationModel for remaining parameters.

  Note classifierMetric is not specified here as it is in other models. This
  is done in the network config file.
  """
  super(ClassificationModelHTM, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

  self.networkConfig = networkConfig
  self.retinaScaling = retinaScaling
  self.retina = retina
  self.apiKey = apiKey
  self.inputFilePath = inputFilePath
  self.networkDataGen = NetworkDataGenerator()

  # Either convert the raw input into network API format now, or assume the
  # caller already provides network-formatted data.
  self.networkDataPath = (
      self.prepData(self.inputFilePath, stripCats=stripCats)
      if prepData else self.inputFilePath)

  self.network = self.initModel()
  self._initializeRegionHelpers()
def partitionIndices(self, seed=42, numInference=10):
  """
  Sets self.partitions for the buckets' querying and ranking sets. The
  corresponding numbers of tokens for each sequence are stored in
  self.numTokens.

  The order of sequences is already specified by the network data files; if
  generated by the experiment, these are in order or randomized as specified
  by the orderedSplit arg.
  """
  super(BucketHTMRunner, self).partitionIndices(
      seed=seed, numInference=numInference)

  # Get the number of tokens in each bucket file so the network knows how
  # many iterations to run. The order of buckets in self.bucketFiles is not
  # necessarily the same
  ndg = NetworkDataGenerator()
  self.numTokens.extend(
      ndg.getNumberOfTokens(dataFile) for dataFile in self.bucketFiles)
def testRandomize(self):
  """randomizeData must write the sequences out in a non-sequential order.

  Fixes a vacuous assertion: the sequence IDs read back from the csv are
  strings, so comparing them to range() (a list of ints) could never be
  equal and the final assertNotEqual always passed regardless of order.
  The IDs are now cast to int so the check is meaningful.
  """
  ndg = NetworkDataGenerator()
  filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
  ndg.split(filename, 3, False)

  random.seed(1)
  ndg.randomizeData()

  dataOutputFile = os.path.join(self.dirName,
                                "test_data/multi_sample_split.csv")
  categoriesOutputFile = os.path.join(
      self.dirName, "test_data/multi_sample_categories.json")
  # Also assert the save succeeded (was previously assigned but unchecked),
  # consistent with testSaveData.
  self.assertTrue(ndg.saveData(dataOutputFile, categoriesOutputFile))

  # Collect the order of unique sequence IDs as they appear in the file.
  randomizedIDs = []
  dataTable = pandas.read_csv(dataOutputFile)
  for _, values in dataTable.iterrows():
    record = values.to_dict()
    idx = record["_sequenceId"]
    # Skip the two non-numeric header rows (types and specials); cast the
    # csv string to int so the comparison below can actually detect order.
    if idx.isdigit():
      seqId = int(idx)
      if not randomizedIDs or randomizedIDs[-1] != seqId:
        randomizedIDs.append(seqId)

  self.assertNotEqual(randomizedIDs, range(len(randomizedIDs)))
def setupNetData(
    self, generateData=True, seed=42, preprocess=False, **kwargs):
  """
  Generate the network data files for this experiment.

  Resulting network data files created:
    - One for each bucket
    - One for each training rep, where samples are not repeated in a given
      file. Each samples is given its own category (_category = _sequenceId).

  The classification json is saved when generating the final training file.

  @param generateData (bool) Must be True; raises NotImplementedError
                             otherwise.
  @param seed         (int)  Seed for randomizing sample order.
  @param preprocess   (bool) Whether to text-preprocess the samples.
  """
  if generateData:
    ndg = NetworkDataGenerator()
    # Split the full dataset once to build self.dataDict (used below for
    # de-duplicating the training samples).
    self.dataDict = ndg.split(
        filePath=self.dataPath, numLabels=1, textPreprocess=preprocess,
        **kwargs)

    filename, ext = os.path.splitext(self.dataPath)
    self.classificationFile = "{}_categories.json".format(filename)

    # Generate test data files: one network data file for each bucket.
    # NOTE: the same ndg instance is reused; reset() clears the previous
    # split's state before each bucket.
    bucketFilePaths = bucketCSVs(self.dataPath)
    for bucketFile in bucketFilePaths:
      ndg.reset()
      ndg.split(
          filePath=bucketFile, numLabels=1, textPreprocess=preprocess,
          **kwargs)
      bucketFileName, ext = os.path.splitext(bucketFile)
      if not self.orderedSplit:
        # the sequences will be written to the file in random order
        ndg.randomizeData(seed)
      dataFile = "{}_network{}".format(bucketFileName, ext)
      ndg.saveData(dataFile, self.classificationFile)
      # the classification file here gets (correctly) overwritten later
      self.bucketFiles.append(dataFile)

    # Generate training data file(s).
    # Build an OrderedDict of unique samples, keyed by a fresh sequential
    # seqID, so that samples appearing in multiple buckets are written only
    # once per training file. dataEntry[2] is assumed to be the sample's
    # unique ID -- TODO confirm against NetworkDataGenerator.split().
    self.trainingDicts = []
    uniqueDataDict = OrderedDict()
    included = []
    seqID = 0
    for dataEntry in self.dataDict.values():
      uniqueID = dataEntry[2]
      if uniqueID not in included:
        # skip over the samples that are repeated in multiple buckets
        uniqueDataDict[seqID] = dataEntry
        included.append(uniqueID)
        seqID += 1
    self.trainingDicts.append(uniqueDataDict)

    ndg.reset()
    ndg.split(
        dataDict=uniqueDataDict, numLabels=1, textPreprocess=preprocess,
        **kwargs)
    for rep in xrange(self.trainingReps):
      # use a different file for each training rep
      if not self.orderedSplit:
        ndg.randomizeData(seed)
      ndg.stripCategories()  # replace the categories w/ seqId
      dataFile = "{}_network_training_{}{}".format(filename, rep, ext)
      ndg.saveData(dataFile, self.classificationFile)
      self.dataFiles.append(dataFile)
    # TODO: maybe add a method (and arg) for removing all these data files
  else:
    # TODO (only if needed)
    raise NotImplementedError("Must generate data.")

  # labels references match the classification json
  self.mapLabelRefs()
def testSplitNoPreprocess(self):
  """Split without preprocessing should leave the records untouched."""
  ndg = NetworkDataGenerator()
  inputPath = os.path.join(self.dirName, "test_data/multi_sample.csv")

  ndg.split(inputPath, 3, False)
  self.assertRecordsEqual(ndg.records, self.expected)