def testSplitPreprocess(self):
    """Check the records produced by split() with preprocessing enabled.

    Calls ndg.split(filename, 3, True, ignoreCommon=100, correctSpell=True) on
    test_data/multi_sample.csv and compares ndg.records with the literal
    expectation below via self.assertRecordsEqual.
    """
    ndg = NetworkDataGenerator()
    filename = os.path.join(self.dirName, "test_data/multi_sample.csv")

    # Outer list: one entry per sequence (_sequenceID); inner list: one record
    # per token.  The closing brackets between and after the sequences were
    # missing, which made the literal a syntax error.
    expected = [
        [{
            "_token": "gohbkchoo",
            "_categories": "0 1",
            "_sequenceID": 0,
            "ID": "1",
            "_reset": 1
        }],
        [{
            "_token": "o",
            "_categories": "2",
            "_sequenceID": 1,
            "ID": "2",
            "_reset": 1
        }, {
            "_token": "ca",
            "_categories": "2",
            "_sequenceID": 1,
            "ID": "2",
            "_reset": 0
        }],
    ]

    ndg.split(filename, 3, True, ignoreCommon=100, correctSpell=True)
    self.assertRecordsEqual(ndg.records, expected)
    # Splits the sample file with the final argument False and checks that
    # ndg.records equals self.expected.
    # NOTE(review): the `filename = (self.dirName +` expression is cut off in
    # this copy (path suffix and closing parenthesis are missing), so the block
    # does not parse as-is -- restore the operand from the original file.
    def testSplitNoPreprocess(self):
        ndg = NetworkDataGenerator()
        filename = (self.dirName +

        ndg.split(filename, 2, 3, False)
        self.assertRecordsEqual(ndg.records, self.expected)
    # Saves split data and categories to disk, reads the data file back with
    # pandas and compares every row with the expected records; then loads the
    # categories JSON and compares it with the expected mapping.
    # NOTE(review): this copy is truncated -- the three `self.dirName +`
    # expressions have lost their right-hand operands, and the `types`,
    # `specials`, `expected_records` and `expected_categories` literals have
    # lost their closing brackets.  The block does not parse as-is.
    def testSaveData(self):
        ndg = NetworkDataGenerator()
        filename = (self.dirName +
        ndg.split(filename, 2, 3, False)
        dataOutputFile = (
            self.dirName +
        categoriesOutputFile = (
            self.dirName +
        success = ndg.saveData(dataOutputFile, categoriesOutputFile)

        # Missing cells become "" so they can be compared/removed below.
        dataTable = pandas.read_csv(dataOutputFile).fillna("")

        # Expected first row of the saved file: one type name per column.
        types = {
            "_category0": "int",
            "_category1": "int",
            "_category2": "int",
            "token": "string",
            "_sequenceID": "int",
            "_reset": "int"
        # Expected second row of the saved file: one flag per column.
        specials = {
            "_category0": "C",
            "_category1": "C",
            "_category2": "C",
            "token": "",
            "_sequenceID": "S",
            "_reset": "R"

        # Flatten self.expected (list of lists of records) and put the two
        # header rows in front.
        expected_records = [
            record for data in self.expected for record in data
        expected_records.insert(0, specials)
        expected_records.insert(0, types)

        for idx, values in dataTable.iterrows():
            record = values.to_dict()
            # Drop category columns that were empty in the file before
            # comparing with the expected record.
            if record["_category1"] == "":
                del record["_category1"]

            if record["_category2"] == "":
                del record["_category2"]

            self.assertDictEqual(record, expected_records[idx])

        with open(categoriesOutputFile) as f:
            categories = json.load(f)

        expected_categories = {
            "kitchen": 0,
            "environment": 1,
            "not helpful": 2
        self.assertDictEqual(categories, expected_categories)
  def testSaveDataIncorrectType(self):
    """
    saveData() is expected to raise TypeError when handed a categories output
    path ending in .csv (the other save tests pass a .json path).
    """
    generator = NetworkDataGenerator()
    baseDir = self.dirName
    generator.split(os.path.join(baseDir, "test_data/multi_sample.csv"),
                    3, False)

    splitPath = os.path.join(baseDir, "test_data/multi_sample_split.csv")
    categoriesPath = os.path.join(
        baseDir, "test_data/multi_sample_categories.csv")

    # Callable form of assertRaises: invokes saveData(splitPath, categoriesPath)
    self.assertRaises(TypeError, generator.saveData, splitPath, categoriesPath)
  def testFileRecordStreamReadData(self):
    """
    Split and save the sample data, then open the saved data file with
    FileRecordStream; the test passes as long as nothing raises.
    """
    generator = NetworkDataGenerator()
    generator.split(
        os.path.join(self.dirName, "test_data/multi_sample.csv"), 3, False)

    splitPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
    categoriesPath = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    generator.saveData(splitPath, categoriesPath)

    # Constructing the stream is the check: no exception means the saved file
    # is in a format FileRecordStream accepts.
    stream = FileRecordStream(splitPath)
Пример #6
  def prepData(self, dataPath, ordered=False, **kwargs):
    """
    Generate the data in network API format.

    @param dataPath          (str)  Path to input data file; format as expected
                                    by NetworkDataGenerator.
    @param ordered           (bool) Passed through to
                                    NetworkDataGenerator.setupData().
    @param kwargs                   Extra keyword arguments passed through to
                                    NetworkDataGenerator.setupData().
    @return networkDataPath  (str)  Path to data formatted for network API.
    @return ndg              (NetworkDataGenerator)
    """
    ndg = NetworkDataGenerator()
    networkDataPath = ndg.setupData(dataPath, self.numLabels, ordered, **kwargs)

    return networkDataPath, ndg
    def testSaveData(self):
        """Check the files written by saveData() against expected contents.

        Splits test_data/multi_sample.csv, saves the data and categories
        files, then (1) reads the data file back with pandas and compares each
        row with the two header rows followed by the flattened self.expected
        records, and (2) loads the categories JSON and compares it with the
        expected label-to-id mapping.
        """
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        ndg.saveData(dataOutputFile, categoriesOutputFile)

        # Missing cells become "" so they compare equal to the "" entries in
        # the expected header rows.
        dataTable = pandas.read_csv(dataOutputFile).fillna("")

        # Expected first row of the saved file: one type name per column.
        types = {
            "_categories": "list",
            "_token": "string",
            "_sequenceID": "int",
            "_reset": "int",
            "ID": "string"
        }
        # Expected second row of the saved file: one flag per column.
        specials = {
            "_categories": "C",
            "_token": "",
            "_sequenceID": "S",
            "_reset": "R",
            "ID": ""
        }

        # Header rows first, then every record of every sequence in order.
        expected_records = [types, specials]
        expected_records.extend(
            record for data in self.expected for record in data)

        for idx, values in dataTable.iterrows():
            record = values.to_dict()
            if idx > 1:
                # csv values are strings, so cast the ints
                record["_sequenceID"] = int(record["_sequenceID"])
                record["_reset"] = int(record["_reset"])
            self.assertDictEqual(record, expected_records[idx])

        with open(categoriesOutputFile) as f:
            categories = json.load(f)

        expected_categories = {
            "kitchen": 0,
            "environment": 1,
            "not helpful": 2
        }
        self.assertDictEqual(categories, expected_categories)
Пример #8
    def prepData(self, dataPath, ordered=False, **kwargs):
        """
        Generate the data in network API format.

        @param dataPath          (str)  Path to input data file; format as
                                        expected by NetworkDataGenerator.
        @param ordered           (bool) Passed through to
                                        NetworkDataGenerator.setupData().
        @param kwargs                   Extra keyword arguments passed through
                                        to NetworkDataGenerator.setupData().
        @return networkDataPath  (str)  Path to data formatted for network API.
        @return ndg              (NetworkDataGenerator)
        """
        ndg = NetworkDataGenerator()
        networkDataPath = ndg.setupData(dataPath, self.numLabels, ordered,
                                        **kwargs)

        return networkDataPath, ndg
    # Splits the sample file with preprocessing options (ignoreCommon=100,
    # correctSpell=True) and compares ndg.records with the literal `expected`.
    # NOTE(review): this copy is truncated -- the `filename` expression has lost
    # its right-hand operand, the boundary between the first and second group
    # of records (after the "odor" record) is missing, and the literal is never
    # closed.  The block does not parse as-is.
    def testSplitPreprocess(self):
        ndg = NetworkDataGenerator()
        filename = (self.dirName +

        # Records with _sequenceID "0"; "_reset" is "1" only on the first one.
        expected = [[{
            "_category0": "0",
            "_category1": "1",
            "_sequenceID": "0",
            "token": "get",
            "_reset": "1"
        }, {
            "_category0": "0",
            "_category1": "1",
            "_sequenceID": "0",
            "token": "rid",
            "_reset": "0"
        }, {
            "_category0": "0",
            "_category1": "1",
            "_sequenceID": "0",
            "token": "trouble",
            "_reset": "0"
        }, {
            "_category0": "0",
            "_category1": "1",
            "_sequenceID": "0",
            "token": "kitchen",
            "_reset": "0"
        }, {
            "_category0": "0",
            "_category1": "1",
            "_sequenceID": "0",
            "token": "odor",
            "_reset": "0"
                        "_category0": "2",
                        "_sequenceID": "1",
                        "token": "don",
                        "_reset": "1"
                    }, {
                        "_category0": "2",
                        "_sequenceID": "1",
                        "token": "care",
                        "_reset": "0"

        ndg.split(filename, 2, 3, True, ignoreCommon=100, correctSpell=True)
        self.assertRecordsEqual(ndg.records, expected)
Пример #10
  # Runs self.model.testModel() once per token of every test sample of `trial`,
  # then rewrites self.partitions[trial] as (train indices, test indices) and
  # reloads self.samples from the trial's data file.
  # NOTE(review): this copy is truncated -- the docstring has lost its quotes
  # and part of its text, the `print (` call is never closed, and `indices`,
  # `predictions` and `results` are never filled in the visible lines.  The
  # block does not parse as-is.
  def _testing(self, trial):
    Test the network on the test set for a particular trial and store the
    @param trial      (int)       trial count
    if self.verbosity > 0:
      # Start counting after all training tokens.
      i = sum(self.partitions[trial][0])
      indices = []
      for numTokens in self.partitions[trial][1]:
        i += numTokens
      print ("\tRunner selects to test on sequences starting at indices "

    results = ([], [])
    # self.partitions[trial][1] holds the token count of each test sample.
    for i, numTokens in enumerate(self.partitions[trial][1]):
      predictions = []
      for _ in xrange(numTokens):
        predicted = self.model.testModel()
      winningPredictions = self._selectWinners(predictions)

      # TODO: switch to standard (expected, actual) format

    # Prepare data for writeOutClassifications
    # Train samples get indices 0..len(train)-1; test samples follow directly.
    trainIdx = range(len(self.partitions[trial][0]))
    testIdx = range(len(self.partitions[trial][0]),
      len(self.partitions[trial][0]) + len(self.partitions[trial][1]))
    self.partitions[trial] = (trainIdx, testIdx)
    self.samples = NetworkDataGenerator.getSamples(self.dataFiles[trial])

Пример #11
    # Runs self.model.testModel() once per token of every test sample of
    # `trial`, then rewrites self.partitions[trial] as (train indices, test
    # indices) and reloads self.samples from the trial's data file.
    # NOTE(review): this copy is truncated -- the docstring has lost its quotes
    # and part of its text, and `indices`, `predictions` and `results` are
    # never filled in the visible lines.  The block does not parse as-is.
    def testing(self, trial):
    Test the network on the test set for a particular trial and store the
    @param trial      (int)       trial count
        if self.verbosity > 0:
            # Start counting after all training tokens.
            i = sum(self.partitions[trial][0])
            indices = []
            for numTokens in self.partitions[trial][1]:
                i += numTokens
            print "\tRunner selects to test on sample(s) {}".format(indices)

        results = ([], [])
        # self.partitions[trial][1] holds the token count of each test sample.
        for i, numTokens in enumerate(self.partitions[trial][1]):
            predictions = []
            for _ in xrange(numTokens):
                predicted = self.model.testModel()
            winningPredictions = self._selectWinners(predictions)

        # Prepare data for writeOutClassifications
        trainIdx = range(len(self.partitions[trial][0]))
        # NOTE(review): as written this range starts at 0 and therefore overlaps
        # trainIdx; a start argument of len(self.partitions[trial][0]) appears
        # to have been lost from this copy -- confirm against the original.
        testIdx = range(
            len(self.partitions[trial][0]) + len(self.partitions[trial][1]))
        self.partitions[trial] = (trainIdx, testIdx)
        self.samples = NetworkDataGenerator.getSamples(self.dataFiles[trial])

Пример #12
    def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
        """
        Generate the data in network API format.

        @param dataPath          (str)  Path to input data file; format as
                                        expected by NetworkDataGenerator.
        @param ordered           (bool) Keep order of data, or randomize.
        @param stripCats         (bool) Remove the categories and replace them
                                        with the sequence_Id.
        @param kwargs                   Extra keyword arguments passed through
                                        to NetworkDataGenerator.setupData().
        @return networkDataPath  (str)  Path to data formatted for network API.
        @return ndg              (NetworkDataGenerator)
        """
        ndg = NetworkDataGenerator()
        networkDataPath = ndg.setupData(dataPath, self.numLabels, ordered,
                                        stripCats, **kwargs)

        return networkDataPath, ndg
Пример #13
 def partitionIndices(self, split, trial):
   """
   Returns the number of tokens for each sample in the training and test set
   when doing an ordered split.

   @param split      (int)       Number of leading samples that form the
                                 training set.
   @param trial      (int)       Index into self.dataFiles.
   @return           (tuple)     (token counts of the first `split` samples,
                                 token counts of the remaining samples).
   """
   dataFile = self.dataFiles[trial]
   numTokens = NetworkDataGenerator.getNumberOfTokens(dataFile)
   return (numTokens[:split], numTokens[split:])
Пример #14
  def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
    """
    Generate the data in network API format.

    @param dataPath          (str)  Path to input data file; format as expected
                                    by NetworkDataGenerator.
    @param ordered           (bool) Keep order of data, or randomize.
    @param stripCats         (bool) Remove the categories and replace them with
                                    the sequence_Id.
    @param kwargs                   Extra keyword arguments passed through to
                                    NetworkDataGenerator.setupData().
    @return networkDataPath  (str)  Path to data formatted for network API.
    @return ndg              (NetworkDataGenerator)
    """
    ndg = NetworkDataGenerator()
    networkDataPath = ndg.setupData(
      dataPath, self.numLabels, ordered, stripCats, **kwargs)

    return networkDataPath, ndg
Пример #15
 def partitionIndices(self, split, trial):
     """
     Returns the number of tokens for each sample in the training and test set
     when doing an ordered split.

     @param split      (int)       Number of leading samples that form the
                                   training set.
     @param trial      (int)       Index into self.dataFiles.
     @return           (tuple)     (token counts of the first `split` samples,
                                   token counts of the remaining samples).
     """
     dataFile = self.dataFiles[trial]
     numTokens = NetworkDataGenerator.getNumberOfTokens(dataFile)
     return (numTokens[:split], numTokens[split:])
    # Saves split data, reads it back with pandas and asserts that the order of
    # distinct sequence IDs in the file differs from 0..n-1.
    # NOTE(review): this copy is truncated -- the three `self.dirName +`
    # expressions have lost their right-hand operands, no randomizing call is
    # visible between split() and saveData(), and the `if` below has lost its
    # body (so randomizedIDs is never filled in the visible lines).  The block
    # does not parse as-is.
    def testRandomize(self):
        ndg = NetworkDataGenerator()
        filename = (
            self.dirName +
        ndg.split(filename, 2, 3, False)


        dataOutputFile = (
            self.dirName +
        categoriesOutputFile = (
            self.dirName +
        success = ndg.saveData(dataOutputFile, categoriesOutputFile)

        randomizedIDs = []
        dataTable = pandas.read_csv(dataOutputFile)
        for _, values in dataTable.iterrows():
            record = values.to_dict()
            idx = record["_sequenceID"]
            # Only numeric IDs that differ from the last one recorded qualify.
            if idx.isdigit() and (not randomizedIDs
                                  or randomizedIDs[-1] != idx):

        self.assertNotEqual(randomizedIDs, range(len(randomizedIDs)))
Пример #17
  # When self.generateData is set, splits self.dataPath with a
  # NetworkDataGenerator and saves one data file per entry of self.trainSize
  # plus a shared classifications JSON; finally builds self.actualLabels with
  # one _getClassifications() result per trial.
  # NOTE(review): this copy is truncated -- the docstring has lost its quotes
  # and its last sentence, the `ndg.split(` and first `print` calls are never
  # closed, the `if not self.orderedSplit:` body is missing, and the branch
  # introduced by "# Does an orderedSplit" has lost its header (presumably an
  # `else:` -- confirm).  The block does not parse as-is.
  def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
    Generate the data in network API format if necessary. self.dataFiles is
    populated with the paths of network data files, one for each trial

    Look at (setupData) and (split) for the
    if self.generateData:
      ndg = NetworkDataGenerator()
      ndg.split(self.dataPath, sampleIdx, self.numClasses, preprocess,

      # Output names are derived from the input path: <name>-<trial><ext> for
      # data and <name>-classifications.json for the categories.
      filename, ext = os.path.splitext(self.dataPath)
      self.classificationFile = "{}-classifications.json".format(filename)
      for i in xrange(len(self.trainSize)):
        if not self.orderedSplit:
        dataFile = "{}-{}{}".format(filename, i, ext)
        ndg.saveData(dataFile, self.classificationFile)

      if self.verbosity > 0:
        print "{} file(s) generated at {}".format(len(self.dataFiles),
        print "Classification json is at: {}".format(self.classificationFile)
      # Does an orderedSplit
      self.dataFiles = [self.dataPath] * len(self.trainSize)

    self.actualLabels = [self._getClassifications(size, i)
      for i, size in enumerate(self.trainSize)]

Пример #18
 def partitionIndices(self):
   """
   Sets self.partitions for the number of tokens for each sample in the
   training and test sets (when doing an ordered split).

   For every (trial, split) pair of self.trainSizes, appends a tuple
   (token counts of the first `split` samples, token counts of the rest)
   taken from that trial's data file.
   """
   for trial, split in enumerate(self.trainSizes):
     dataFile = self.dataFiles[trial]
     numTokens = NetworkDataGenerator.getNumberOfTokens(dataFile)
     self.partitions.append((numTokens[:split], numTokens[split:]))
  def testSaveData(self):
    """
    Split and save the sample data, then verify both output files: every row
    of the saved data file must equal the matching expected record (two header
    rows followed by the flattened self.expected), and the saved categories
    JSON must equal the expected label-to-id mapping.
    """
    generator = NetworkDataGenerator()
    generator.split(
        os.path.join(self.dirName, "test_data/multi_sample.csv"), 3, False)
    splitPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
    categoriesPath = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    generator.saveData(splitPath, categoriesPath)

    # Missing cells are read as NaN; map them to "" to match the header rows.
    table = pandas.read_csv(splitPath).fillna("")

    # The first two rows of the saved file: type names, then special flags.
    columns = ("_categories", "_token", "_sequenceID", "_reset", "ID")
    typeRow = dict(zip(columns, ("list", "string", "int", "int", "string")))
    specialRow = dict(zip(columns, ("C", "", "S", "R", "")))

    expectedRows = [typeRow, specialRow]
    for sequence in self.expected:
      expectedRows.extend(sequence)

    for rowIdx, row in table.iterrows():
      actual = row.to_dict()
      if rowIdx >= 2:
        # csv values are strings, so cast the ints
        for intField in ("_sequenceID", "_reset"):
          actual[intField] = int(actual[intField])
      self.assertDictEqual(actual, expectedRows[rowIdx])

    with open(categoriesPath) as handle:
      savedCategories = json.load(handle)

    self.assertDictEqual(
        savedCategories,
        {"kitchen": 0, "environment": 1, "not helpful": 2})
Пример #20
 def _getClassifications(self, split, trial):
   """
   Gets the classifications for testing samples for a particular trial.

   @param split      (int)       Size of training set
   @param trial      (int)       trial count
   @return           (list)      List of lists of integer classification ids,
                                 one inner list per sample from index `split`
                                 onward.
   """
   dataFile = self.dataFiles[trial]
   classifications = NetworkDataGenerator.getClassifications(dataFile)
   # split() with no argument tolerates repeated or surrounding whitespace and
   # yields [] for a blank entry instead of failing on int("").
   return [[int(c) for c in classes.split()]
            for classes in classifications][split:]
Пример #21
 def _getClassifications(self, split, trial):
     """
     Gets the classifications for testing samples for a particular trial.

     @param split      (int)       Size of training set
     @param trial      (int)       trial count
     @return           (list)      List of lists of integer classification ids,
                                   one inner list per sample from index `split`
                                   onward.
     """
     dataFile = self.dataFiles[trial]
     classifications = NetworkDataGenerator.getClassifications(dataFile)
     # split() with no argument tolerates repeated or surrounding whitespace
     # and yields [] for a blank entry instead of failing on int("").
     return [[int(c) for c in classes.split()]
             for classes in classifications][split:]
  def testSplitPreprocess(self):
    """
    Split the sample file with preprocessing enabled (ignoreCommon=100,
    correctSpell=True) and compare the resulting records with the literal
    per-sequence expectations below.
    """
    generator = NetworkDataGenerator()
    samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")

    # Sequence 0: a single token record.
    firstSequence = [
      {"_token": "gohbkchoo", "_categories": "0 1", "_sequenceID": 0,
       "ID": "1", "_reset": 1},
    ]
    # Sequence 1: two token records; only the first carries _reset == 1.
    secondSequence = [
      {"_token": "o", "_categories": "2", "_sequenceID": 1,
       "ID": "2", "_reset": 1},
      {"_token": "ca", "_categories": "2", "_sequenceID": 1,
       "ID": "2", "_reset": 0},
    ]

    generator.split(samplePath, 3, True, ignoreCommon=100, correctSpell=True)
    self.assertRecordsEqual(generator.records, [firstSequence, secondSequence])
    def testSaveDataIncorrectType(self):
        """Expect saveData() to raise TypeError for a .csv categories path.

        The other save tests pass a categories path ending in .json; here it
        ends in .csv and the call must fail with TypeError.
        """
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.csv")
        ndg.split(filename, 3, False)

        with self.assertRaises(TypeError):
            ndg.saveData(dataOutputFile, categoriesOutputFile)
    def testFileRecordStreamReadData(self):
        """Check that the saved data file can be opened by FileRecordStream.

        Splits and saves the sample data, then constructs a FileRecordStream
        on the saved data file; the test passes as long as nothing raises.
        """
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        ndg.saveData(dataOutputFile, categoriesOutputFile)

        # If no error is raised, then the data is in the correct format
        FileRecordStream(dataOutputFile)
    # Splits the sample file and expects saveData() to raise TypeError for the
    # given output paths.
    # NOTE(review): this copy is truncated -- all three `self.dirName +`
    # expressions have lost their right-hand operands and closing parentheses,
    # so the paths (and what makes the type "incorrect") are not visible here
    # and the block does not parse as-is.
    def testSaveDataIncorrectType(self):
        ndg = NetworkDataGenerator()
        filename = (self.dirName +
        dataOutputFile = (
            self.dirName +
        categoriesOutputFile = (
            self.dirName +
        ndg.split(filename, 2, 3, False)

        with self.assertRaises(TypeError):
            ndg.saveData(dataOutputFile, categoriesOutputFile)
Пример #26
    # When self.generateData is set, splits self.dataPath with a
    # NetworkDataGenerator and saves one data file per entry of self.trainSize
    # plus a shared classifications JSON; finally builds self.actualLabels with
    # one _getClassifications() result per trial.
    # NOTE(review): this copy is truncated -- the docstring has lost its quotes
    # and its last sentence, the `ndg.split(` call, two `.format(` calls and
    # the final list comprehension are never closed, the
    # `if not self.orderedSplit:` body is missing, and the branch introduced by
    # "# Does an orderedSplit" has lost its header (presumably an `else:` --
    # confirm).  The block does not parse as-is.
    def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
    Generate the data in network API format if necessary. self.dataFiles is
    populated with the paths of network data files, one for each trial

    Look at (setupData) and (split) for the
        if self.generateData:
            ndg = NetworkDataGenerator()
            ndg.split(self.dataPath, sampleIdx, self.numClasses, preprocess,

            # Output names are derived from the input path.
            filename, ext = os.path.splitext(self.dataPath)
            self.classificationFile = "{}-classifications.json".format(
            for i in xrange(len(self.trainSize)):
                if not self.orderedSplit:
                dataFile = "{}-{}{}".format(filename, i, ext)
                ndg.saveData(dataFile, self.classificationFile)

            if self.verbosity > 0:
                print "{} file(s) generated at {}".format(
                    len(self.dataFiles), self.dataFiles)
                print "Classification json is at: {}".format(
            # Does an orderedSplit
            self.dataFiles = [self.dataPath] * len(self.trainSize)

        self.actualLabels = [
            self._getClassifications(size, i)
            for i, size in enumerate(self.trainSize)

Пример #27
  # When `generateData` is true, splits self.dataPath with a
  # NetworkDataGenerator and saves one "<name>_network_<trial><ext>" data file
  # per entry of self.trainSizes plus a shared "<name>_categories.json"; when
  # self.numClasses > 0 it then builds self.actualLabels with one
  # _getClassifications() result per trial.
  # NOTE(review): this copy is truncated -- the docstring has lost its quotes
  # and its last sentence, the first `print` call is never closed, the
  # `if not self.orderedSplit:` body is missing, and the branch introduced by
  # "# Use the input file for each trial" has lost its header (presumably an
  # `else:` -- confirm).  The block does not parse as-is.
  def setupNetData(self, preprocess=False, generateData=False, **kwargs):
    Generate the data in network API format if necessary. self.dataFiles is
    populated with the paths of network data files, one for each trial

    Look at (setupData) and (split) for the
    if generateData:
      # TODO: use model.prepData()?
      ndg = NetworkDataGenerator()
      ndg.split(self.dataPath, self.numClasses, preprocess, **kwargs)

      filename, ext = os.path.splitext(self.dataPath)
      self.classificationFile = "{}_categories.json".format(filename)

      for i in xrange(len(self.trainSizes)):
        if not self.orderedSplit:
        dataFile = "{}_network_{}{}".format(filename, i, ext)
        ndg.saveData(dataFile, self.classificationFile)

      if self.verbosity > 0:
        print "{} file(s) generated at {}".format(len(self.dataFiles),
        print "Classification JSON is at: {}".format(self.classificationFile)
      # Use the input file for each trial; maintains the order of samples.
      self.dataFiles = [self.dataPath] * len(self.trainSizes)

    if self.numClasses > 0:
      # Setup labels data objects
      self.actualLabels = [self._getClassifications(size, i)
        for i, size in enumerate(self.trainSizes)]
  # Saves split data, reads it back with pandas and asserts that the order of
  # distinct sequence IDs in the file differs from 0..n-1.
  # NOTE(review): this copy is truncated -- no randomizing call is visible
  # between split() and saveData(), and the `if` below has lost its body (so
  # randomizedIDs is never filled in the visible lines).  The block does not
  # parse as-is.
  def testRandomize(self):
    ndg = NetworkDataGenerator()
    filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
    ndg.split(filename, 3, False)


    dataOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    categoriesOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    success = ndg.saveData(dataOutputFile, categoriesOutputFile)

    randomizedIDs = []
    dataTable = pandas.read_csv(dataOutputFile)
    for _, values in dataTable.iterrows():
      record = values.to_dict()
      idx = record["_sequenceID"]
      # Only numeric IDs that differ from the last one recorded qualify; the
      # isdigit() check skips the non-numeric header cells ("int", "S").
      if idx.isdigit() and (not randomizedIDs or randomizedIDs[-1] != idx):

    self.assertNotEqual(randomizedIDs, range(len(randomizedIDs)))
  def testSplitNoPreprocess(self):
    """
    Split the sample file with the third argument False and check that the
    generator's records equal self.expected.
    """
    generator = NetworkDataGenerator()
    generator.split(
        os.path.join(self.dirName, "test_data/multi_sample.csv"), 3, False)
    self.assertRecordsEqual(generator.records, self.expected)
    def testSplitNoPreprocess(self):
        """Split the sample file with the third argument False and check that
        the resulting records equal self.expected.
        """
        splitter = NetworkDataGenerator()
        samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
        splitter.split(samplePath, 3, False)

        actualRecords = splitter.records
        self.assertRecordsEqual(actualRecords, self.expected)