def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
    """
    Prepare one network-API data file per trial, then set up the labels.

    When self.generateData is set, the file at self.dataPath is split with a
    NetworkDataGenerator and saved once for every entry of self.trainSize
    (reshuffled before each save unless self.orderedSplit is set); each saved
    path is appended to self.dataFiles. Otherwise self.dataPath itself is
    listed once per trial.

    Look at runner.py (setupData) and network_data_generator.py (split) for
    the parameters.

    @param preprocess  Forwarded to NetworkDataGenerator.split().
    @param sampleIdx   Forwarded to NetworkDataGenerator.split().
    @param kwargs      Extra keyword arguments forwarded to split().
    """
    if not self.generateData:
        # Does an orderedSplit: the input file is reused for every trial.
        self.dataFiles = [self.dataPath] * len(self.trainSize)
    else:
        generator = NetworkDataGenerator()
        generator.split(
            self.dataPath, sampleIdx, self.numClasses, preprocess, **kwargs)

        basePath, extension = os.path.splitext(self.dataPath)
        self.classificationFile = "{}-classifications.json".format(basePath)

        # One output file per trial; the classification file is shared.
        for trial, _ in enumerate(self.trainSize):
            if not self.orderedSplit:
                generator.randomizeData()
            trialFile = "{}-{}{}".format(basePath, trial, extension)
            generator.saveData(trialFile, self.classificationFile)
            self.dataFiles.append(trialFile)

        if self.verbosity > 0:
            print("{} file(s) generated at {}".format(
                len(self.dataFiles), self.dataFiles))
            print("Classification json is at: {}".format(
                self.classificationFile))

    # Labels are looked up per trial from its training size and index.
    labels = []
    for trial, size in enumerate(self.trainSize):
        labels.append(self._getClassifications(size, trial))
    self.actualLabels = labels
    self._mapLabelRefs()
def testRandomize(self):
    """
    Check that randomizeData() changes the order of the saved sequences.

    The multi-class sample reviews file is split, shuffled with a fixed seed
    and saved; the saved CSV is then read back to recover the order in which
    the sequence IDs appear.
    """
    ndg = NetworkDataGenerator()
    filename = (
        self.dirName +
        "/../../../data/sample_reviews_multi/sample_reviews_data_training.csv"
    )
    ndg.split(filename, 2, 3, False)

    # Fixed seed so the shuffle is reproducible from run to run.
    random.seed(1)
    ndg.randomizeData()

    dataOutputFile = (
        self.dirName +
        "/../../../data/network_data_generator/multi_sample_split.csv")
    categoriesOutputFile = (
        self.dirName +
        "/../../../data/network_data_generator/multi_sample_categories.json"
    )
    ndg.saveData(dataOutputFile, categoriesOutputFile)

    # Collect each sequence ID once per consecutive run of rows; non-numeric
    # cells in the column are skipped by the isdigit() check.
    randomizedIDs = []
    dataTable = pandas.read_csv(dataOutputFile)
    for idx in dataTable["_sequenceID"]:
        if idx.isdigit() and (not randomizedIDs or randomizedIDs[-1] != idx):
            randomizedIDs.append(idx)

    # The IDs are strings (the loop above relies on str.isdigit), so the
    # ordered reference must be strings too. Comparing against integers can
    # never be equal, which would let this assertion pass even when nothing
    # was shuffled.
    orderedIDs = [str(i) for i in range(len(randomizedIDs))]
    self.assertNotEqual(randomizedIDs, orderedIDs)
def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
    """
    Populate self.dataFiles with one network data file path per trial and
    build the label structures.

    With self.generateData set, a NetworkDataGenerator splits the file at
    self.dataPath and writes one file per entry of self.trainSize, shuffling
    before each write unless self.orderedSplit is set. Without it, the input
    path is repeated once per trial.

    Look at runner.py (setupData) and network_data_generator.py (split) for
    the parameters.

    @param preprocess  Passed through to NetworkDataGenerator.split().
    @param sampleIdx   Passed through to NetworkDataGenerator.split().
    @param kwargs      Additional keyword arguments passed to split().
    """
    if self.generateData:
        dataGenerator = NetworkDataGenerator()
        dataGenerator.split(self.dataPath,
                            sampleIdx,
                            self.numClasses,
                            preprocess,
                            **kwargs)

        pathRoot, pathExt = os.path.splitext(self.dataPath)
        self.classificationFile = "{}-classifications.json".format(pathRoot)

        shuffle = not self.orderedSplit
        for trialIdx, _ in enumerate(self.trainSize):
            if shuffle:
                dataGenerator.randomizeData()
            outputPath = "{}-{}{}".format(pathRoot, trialIdx, pathExt)
            dataGenerator.saveData(outputPath, self.classificationFile)
            self.dataFiles.append(outputPath)

        if self.verbosity > 0:
            generatedMsg = "{} file(s) generated at {}".format(
                len(self.dataFiles), self.dataFiles)
            print(generatedMsg)
            classificationMsg = "Classification json is at: {}".format(
                self.classificationFile)
            print(classificationMsg)
    else:
        # Does an orderedSplit: every trial points at the same input file.
        trialCount = len(self.trainSize)
        self.dataFiles = [self.dataPath] * trialCount

    # One label lookup per trial, keyed by training size and trial index.
    self.actualLabels = [
        self._getClassifications(trainSize, trialIdx)
        for trialIdx, trainSize in enumerate(self.trainSize)
    ]
    self._mapLabelRefs()
def testRandomize(self):
    """
    Check that randomizeData() changes the order of the saved sequences.

    The multi-sample test file is split, shuffled with a fixed seed and
    saved; the saved CSV is then read back to recover the order in which the
    sequence IDs appear.
    """
    ndg = NetworkDataGenerator()
    filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
    ndg.split(filename, 3, False)

    # Fixed seed so the shuffle is reproducible from run to run.
    random.seed(1)
    ndg.randomizeData()

    dataOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    categoriesOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    ndg.saveData(dataOutputFile, categoriesOutputFile)

    # Collect each sequence ID once per consecutive run of rows; non-numeric
    # cells in the column are skipped by the isdigit() check.
    randomizedIDs = []
    dataTable = pandas.read_csv(dataOutputFile)
    for idx in dataTable["_sequenceID"]:
        if idx.isdigit() and (not randomizedIDs or randomizedIDs[-1] != idx):
            randomizedIDs.append(idx)

    # The IDs are strings (the loop above relies on str.isdigit), so the
    # ordered reference must be strings too. Comparing against integers can
    # never be equal, which would let this assertion pass even when nothing
    # was shuffled.
    orderedIDs = [str(i) for i in range(len(randomizedIDs))]
    self.assertNotEqual(randomizedIDs, orderedIDs)
def setupNetData(self, preprocess=False, generateData=False, **kwargs):
    """
    Prepare one network-API data file per trial and, when there are classes,
    set up the label structures.

    With generateData set, the file at self.dataPath is split with a
    NetworkDataGenerator and saved once for every entry of self.trainSizes
    (reshuffled before each save unless self.orderedSplit is set); each saved
    path is appended to self.dataFiles. Otherwise self.dataPath itself is
    listed once per trial.

    Look at runner.py (setupData) and network_data_generator.py (split) for
    the parameters.

    @param preprocess    Forwarded to NetworkDataGenerator.split().
    @param generateData  Whether to write new network data files.
    @param kwargs        Extra keyword arguments forwarded to split().
    """
    if not generateData:
        # Use the input file for each trial; maintains the order of samples.
        self.dataFiles = [self.dataPath] * len(self.trainSizes)
    else:
        # TODO: use model.prepData()?
        generator = NetworkDataGenerator()
        generator.split(self.dataPath, self.numClasses, preprocess, **kwargs)

        basePath, extension = os.path.splitext(self.dataPath)
        self.classificationFile = "{}_categories.json".format(basePath)

        # One output file per trial; the categories file is shared.
        for trial, _ in enumerate(self.trainSizes):
            if not self.orderedSplit:
                generator.randomizeData()
            trialFile = "{}_network_{}{}".format(basePath, trial, extension)
            generator.saveData(trialFile, self.classificationFile)
            self.dataFiles.append(trialFile)

        if self.verbosity > 0:
            print("{} file(s) generated at {}".format(
                len(self.dataFiles), self.dataFiles))
            print("Classification JSON is at: {}".format(
                self.classificationFile))

    if self.numClasses <= 0:
        # No classes, so there are no label objects to build.
        return

    # Setup labels data objects
    labels = []
    for trial, size in enumerate(self.trainSizes):
        labels.append(self._getClassifications(size, trial))
    self.actualLabels = labels
    self._mapLabelRefs()