def generateNetworkDataFiles(self, splits, seed, preprocess, **kwargs):
  """
  Split the raw data at self.dataPath into network-format CSV files, one per
  experiment iteration.

  @param splits     (int)  Number of network data files to generate.
  @param seed       (int)  Seed for randomizing the data order; incremented
                           after each per-split shuffle so every split gets a
                           different order.
  @param preprocess (bool) Whether to text-preprocess the samples when
                           splitting.
  Remaining kwargs are forwarded to NetworkDataGenerator.split().

  Side effects: sets self.dataDict and self.classificationFile, and appends
  each generated file path to self.dataFiles.
  """
  # TODO: use model.prepData()?
  ndg = NetworkDataGenerator()
  self.dataDict = ndg.split(
    filePath=self.dataPath, numLabels=self.numClasses,
    textPreprocess=preprocess, **kwargs)

  filename, ext = os.path.splitext(self.dataPath)
  self.classificationFile = "{}_categories.json".format(filename)

  # Generate one data file for each experiment iteration.
  if self.experimentType == "k-folds" and not self.orderedSplit:
    # only randomize the data order once for k-folds cross validation
    ndg.randomizeData(seed)
  for i in xrange(splits):
    if self.experimentType != "k-folds" and not self.orderedSplit:
      # NOTE(review): reconstructed from collapsed source — seed is assumed
      # to advance only when a shuffle actually happens; confirm upstream.
      ndg.randomizeData(seed)
      seed += 1
    # ext='.csv'
    dataFile = "{}_network_{}{}".format(filename, i, ext)
    ndg.saveData(dataFile, self.classificationFile)
    self.dataFiles.append(dataFile)

  if self.verbosity > 0:
    print "{} file(s) generated at {}".format(len(self.dataFiles), self.dataFiles)
    print "Classification JSON is at: {}".format(self.classificationFile)
def generateNetworkDataFiles(self, splits, seed, preprocess, **kwargs):
  """
  Create one network-format data file for every experiment iteration by
  splitting the raw data found at self.dataPath.

  @param splits     (int)  How many data files to write.
  @param seed       (int)  Shuffle seed; bumped after each per-iteration
                           shuffle so iterations differ.
  @param preprocess (bool) Text-preprocess the samples during the split.
  Extra kwargs pass straight through to NetworkDataGenerator.split().

  Sets self.dataDict and self.classificationFile, and records every output
  path in self.dataFiles.
  """
  # TODO: use model.prepData()?
  generator = NetworkDataGenerator()
  self.dataDict = generator.split(
    filePath=self.dataPath,
    numLabels=self.numClasses,
    textPreprocess=preprocess,
    **kwargs)

  basePath, ext = os.path.splitext(self.dataPath)
  self.classificationFile = "{}_categories.json".format(basePath)

  kFolds = self.experimentType == "k-folds"
  # k-folds cross validation shuffles the data order exactly once, up front;
  # every other experiment type reshuffles per iteration below.
  if kFolds and not self.orderedSplit:
    generator.randomizeData(seed)

  for iteration in xrange(splits):
    if not kFolds and not self.orderedSplit:
      generator.randomizeData(seed)
      seed += 1
    # ext='.csv'
    outPath = "{}_network_{}{}".format(basePath, iteration, ext)
    generator.saveData(outPath, self.classificationFile)
    self.dataFiles.append(outPath)

  if self.verbosity > 0:
    print("{} file(s) generated at {}".format(
      len(self.dataFiles), self.dataFiles))
    print("Classification JSON is at: {}".format(self.classificationFile))
def testSaveDataIncorrectType(self):
  """saveData() must raise TypeError when the categories file is not .json."""
  ndg = NetworkDataGenerator()
  inputPath = os.path.join(self.dirName, "test_data/multi_sample.csv")
  splitPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
  # Deliberately a .csv extension — categories are required to be JSON.
  categoriesPath = os.path.join(
    self.dirName, "test_data/multi_sample_categories.csv")
  ndg.split(inputPath, 3, False)

  with self.assertRaises(TypeError):
    ndg.saveData(splitPath, categoriesPath)
def testFileRecordStreamReadData(self):
  """The saved network data file should be readable by FileRecordStream."""
  generator = NetworkDataGenerator()
  samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
  generator.split(samplePath, 3, False)

  splitPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
  categoriesPath = os.path.join(
    self.dirName, "test_data/multi_sample_categories.json")
  generator.saveData(splitPath, categoriesPath)

  # If no error is raised, then the data is in the correct format
  frs = FileRecordStream(splitPath)
def testSaveDataIncorrectType(self):
  """A non-JSON categories path should make saveData() raise TypeError."""
  generator = NetworkDataGenerator()
  srcPath = os.path.join(self.dirName, "test_data/multi_sample.csv")
  generator.split(srcPath, 3, False)

  dataOut = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
  # The .csv extension here is the point of the test: categories must be JSON.
  categoriesOut = os.path.join(
    self.dirName, "test_data/multi_sample_categories.csv")
  with self.assertRaises(TypeError):
    generator.saveData(dataOut, categoriesOut)
def testFileRecordStreamReadData(self):
  """Saved network data should load through FileRecordStream without error."""
  ndg = NetworkDataGenerator()
  sourceCsv = os.path.join(self.dirName, "test_data/multi_sample.csv")
  ndg.split(sourceCsv, 3, False)

  outputCsv = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
  outputJson = os.path.join(
    self.dirName, "test_data/multi_sample_categories.json")
  ndg.saveData(outputCsv, outputJson)

  # If no error is raised, then the data is in the correct format
  frs = FileRecordStream(outputCsv)
def testSaveData(self):
  """saveData() writes the split records plus the categories JSON."""
  ndg = NetworkDataGenerator()
  sourceFile = os.path.join(self.dirName, "test_data/multi_sample.csv")
  ndg.split(sourceFile, 3, False)

  outputFile = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
  categoriesFile = os.path.join(
    self.dirName, "test_data/multi_sample_categories.json")
  success = ndg.saveData(outputFile, categoriesFile)
  self.assertTrue(success)

  # The first two CSV rows are the field types and the special characters;
  # the data records follow, flattened across all samples.
  headerRows = [
    {"_category": "list", "_token": "string", "_sequenceId": "int",
     "_reset": "int", "ID": "string"},
    {"_category": "C", "_token": "", "_sequenceId": "S",
     "_reset": "R", "ID": ""},
  ]
  expectedRecords = headerRows + [
    record for data in self.expected for record in data]

  dataTable = pandas.read_csv(outputFile).fillna("")
  for idx, row in dataTable.iterrows():
    record = row.to_dict()
    if idx > 1:
      # csv values are strings, so cast the ints
      record["_sequenceId"] = int(record["_sequenceId"])
      record["_reset"] = int(record["_reset"])
    self.assertDictEqual(record, expectedRecords[idx])

  with open(categoriesFile) as f:
    categories = json.load(f)
  self.assertDictEqual(
    categories, {"kitchen": 0, "environment": 1, "not helpful": 2})
def testSaveData(self):
  """Verify the rows and the categories JSON that saveData() produces."""
  generator = NetworkDataGenerator()
  rawPath = os.path.join(self.dirName, "test_data/multi_sample.csv")
  generator.split(rawPath, 3, False)

  splitPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
  jsonPath = os.path.join(
    self.dirName, "test_data/multi_sample_categories.json")
  self.assertTrue(generator.saveData(splitPath, jsonPath))

  fieldTypes = {
    "_category": "list",
    "_token": "string",
    "_sequenceId": "int",
    "_reset": "int",
    "ID": "string",
  }
  fieldSpecials = {
    "_category": "C",
    "_token": "",
    "_sequenceId": "S",
    "_reset": "R",
    "ID": "",
  }
  # Row order: types row, specials row, then every expected data record.
  expectedRows = [fieldTypes, fieldSpecials]
  for data in self.expected:
    expectedRows.extend(data)

  table = pandas.read_csv(splitPath).fillna("")
  for rowIdx, rowValues in table.iterrows():
    actual = rowValues.to_dict()
    if rowIdx > 1:
      # csv values are strings, so cast the ints
      actual["_sequenceId"] = int(actual["_sequenceId"])
      actual["_reset"] = int(actual["_reset"])
    self.assertDictEqual(actual, expectedRows[rowIdx])

  with open(jsonPath) as fileHandle:
    savedCategories = json.load(fileHandle)
  self.assertDictEqual(
    savedCategories, {"kitchen": 0, "environment": 1, "not helpful": 2})
def testRandomize(self):
  """After randomizeData() the sequence IDs must not come out in order."""
  ndg = NetworkDataGenerator()
  samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
  ndg.split(samplePath, 3, False)

  random.seed(1)
  ndg.randomizeData()

  splitPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
  categoriesPath = os.path.join(
    self.dirName, "test_data/multi_sample_categories.json")
  success = ndg.saveData(splitPath, categoriesPath)

  # Collect sequence IDs in write order, skipping the two header rows
  # (whose ID fields are not digits) and consecutive duplicates.
  randomizedIDs = []
  for _, rowValues in pandas.read_csv(splitPath).iterrows():
    seqId = rowValues.to_dict()["_sequenceId"]
    if seqId.isdigit() and (not randomizedIDs or randomizedIDs[-1] != seqId):
      randomizedIDs.append(seqId)

  self.assertNotEqual(randomizedIDs, range(len(randomizedIDs)))
def testRandomize(self):
  """Shuffled data should not yield sequence IDs in ascending order."""
  generator = NetworkDataGenerator()
  inputCsv = os.path.join(self.dirName, "test_data/multi_sample.csv")
  generator.split(inputCsv, 3, False)

  random.seed(1)
  generator.randomizeData()

  outputCsv = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
  outputJson = os.path.join(
    self.dirName, "test_data/multi_sample_categories.json")
  success = generator.saveData(outputCsv, outputJson)

  # Walk the written rows and record each distinct run of sequence IDs;
  # non-digit IDs belong to the two header rows and are ignored.
  observedOrder = []
  table = pandas.read_csv(outputCsv)
  for _, row in table.iterrows():
    currentId = row.to_dict()["_sequenceId"]
    if not currentId.isdigit():
      continue
    if not observedOrder or observedOrder[-1] != currentId:
      observedOrder.append(currentId)

  self.assertNotEqual(observedOrder, range(len(observedOrder)))
def setupNetData(
    self, generateData=True, seed=42, preprocess=False, **kwargs):
  """
  Resulting network data files created:
    - One for each bucket
    - One for each training rep, where samples are not repeated in a given
    file. Each samples is given its own category (_category = _sequenceId).

  The classification json is saved when generating the final training file.

  @param generateData (bool) Must be True; False raises NotImplementedError.
  @param seed         (int)  Seed used for every shuffle below.
  @param preprocess   (bool) Text-preprocess the samples when splitting.
  Remaining kwargs are forwarded to NetworkDataGenerator.split().

  Side effects: sets self.dataDict, self.classificationFile, and
  self.trainingDicts; appends to self.bucketFiles and self.dataFiles; calls
  self.mapLabelRefs().
  """
  if generateData:
    ndg = NetworkDataGenerator()
    self.dataDict = ndg.split(
      filePath=self.dataPath, numLabels=1, textPreprocess=preprocess,
      **kwargs)

    filename, ext = os.path.splitext(self.dataPath)
    self.classificationFile = "{}_categories.json".format(filename)

    # Generate test data files: one network data file for each bucket.
    bucketFilePaths = bucketCSVs(self.dataPath)
    for bucketFile in bucketFilePaths:
      ndg.reset()
      ndg.split(
        filePath=bucketFile, numLabels=1, textPreprocess=preprocess,
        **kwargs)
      bucketFileName, ext = os.path.splitext(bucketFile)
      if not self.orderedSplit:
        # the sequences will be written to the file in random order
        ndg.randomizeData(seed)
      dataFile = "{}_network{}".format(bucketFileName, ext)
      ndg.saveData(dataFile, self.classificationFile)
      # the classification file here gets (correctly) overwritten later
      self.bucketFiles.append(dataFile)

    # Generate training data file(s).
    # Deduplicate samples that appear in multiple buckets, keyed by the
    # unique ID at dataEntry[2], re-indexed by a fresh sequence ID.
    self.trainingDicts = []
    uniqueDataDict = OrderedDict()
    included = []
    seqID = 0
    for dataEntry in self.dataDict.values():
      uniqueID = dataEntry[2]
      if uniqueID not in included:
        # skip over the samples that are repeated in multiple buckets
        uniqueDataDict[seqID] = dataEntry
        included.append(uniqueID)
        seqID += 1
    self.trainingDicts.append(uniqueDataDict)

    ndg.reset()
    ndg.split(
      dataDict=uniqueDataDict, numLabels=1, textPreprocess=preprocess,
      **kwargs)
    for rep in xrange(self.trainingReps):
      # use a different file for each training rep
      if not self.orderedSplit:
        # NOTE(review): the same seed is passed for every rep — presumably
        # each rep is meant to get a different order; confirm whether
        # randomizeData() advances state between calls.
        ndg.randomizeData(seed)
      ndg.stripCategories()  # replace the categories w/ seqId
      dataFile = "{}_network_training_{}{}".format(filename, rep, ext)
      ndg.saveData(dataFile, self.classificationFile)
      self.dataFiles.append(dataFile)
    # TODO: maybe add a method (and arg) for removing all these data files
  else:
    # TODO (only if needed)
    raise NotImplementedError("Must generate data.")

  # labels references match the classification json
  self.mapLabelRefs()