def test_WeightedMean(self):
  # Cleanup old files
  #for f in glob.glob('*.*'):
  #  if 'auto_specials' in f:
  #    os.remove(f)

  fields = [('dummy1', 'int', ''),
            ('dummy2', 'int', ''),
            ('timestamp', 'datetime', 'T')]

  records = (
    [10, 1, datetime.datetime(2000, 3, 1)],
    [5, 2, datetime.datetime(2000, 3, 2)],
    [1, 100, datetime.datetime(2000, 3, 3)],
    [2, 4, datetime.datetime(2000, 3, 4)],
    [4, 1, datetime.datetime(2000, 3, 5)],
    [4, 0, datetime.datetime(2000, 3, 6)],
    [5, 0, datetime.datetime(2000, 3, 7)],
    [6, 0, datetime.datetime(2000, 3, 8)],
  )

  if not os.path.isdir('data'):
    os.makedirs('data')

  with FileRecordStream('data/weighted_mean.csv', write=True,
                        fields=fields) as o:
    for r in records:
      o.appendRecord(r)

  # Aggregate just the dummy field, all the specials should be added
  ai = dict(fields=[('dummy1', 'wmean:dummy2', None),
                    ('dummy2', 'mean', None)],
            days=2)

  handle = tempfile.NamedTemporaryFile(prefix='weighted_mean',
                                       suffix='.csv',
                                       dir='.')
  tempFile = handle.name
  handle.close()

  outputFile = generateDataset(ai, 'weighted_mean.csv', tempFile)

  result = []
  with FileRecordStream(outputFile) as f:
    print f.getFields()
    for r in f:
      result.append(r)

  self.assertEqual(result[0][0], 6.0)
  self.assertEqual(result[0][1], 1.0)
  self.assertEqual(result[1][0], 1.0)
  self.assertEqual(result[1][1], 52.0)
  self.assertEqual(result[2][0], 4.0)
  self.assertEqual(result[2][1], 0.0)
  self.assertEqual(result[3][0], None)
  self.assertEqual(result[3][1], 0.0)
  return
def test_WeightedMean(self):
  # Cleanup old files
  #for f in glob.glob('*.*'):
  #  if 'auto_specials' in f:
  #    os.remove(f)

  fields = [('dummy1', 'int', ''),
            ('dummy2', 'int', ''),
            ('timestamp', 'datetime', 'T')]

  records = (
    [10, 1, datetime.datetime(2000, 3, 1)],
    [5, 2, datetime.datetime(2000, 3, 2)],
    [1, 100, datetime.datetime(2000, 3, 3)],
    [2, 4, datetime.datetime(2000, 3, 4)],
    [4, 1, datetime.datetime(2000, 3, 5)],
    [4, 0, datetime.datetime(2000, 3, 6)],
    [5, 0, datetime.datetime(2000, 3, 7)],
    [6, 0, datetime.datetime(2000, 3, 8)],
  )

  with FileRecordStream(resource_filename('nupic.datafiles', 'weighted_mean.csv'),
                        write=True, fields=fields) as o:
    for r in records:
      o.appendRecord(r)

  # Aggregate just the dummy field, all the specials should be added
  ai = dict(fields=[('dummy1', 'wmean:dummy2', None),
                    ('dummy2', 'mean', None)],
            days=2)

  handle = tempfile.NamedTemporaryFile(prefix='weighted_mean',
                                       suffix='.csv',
                                       dir='.')
  tempFile = handle.name
  handle.close()

  outputFile = generateDataset(ai, 'weighted_mean.csv', tempFile)

  result = []
  with FileRecordStream(outputFile) as f:
    print f.getFields()
    for r in f:
      result.append(r)

  self.assertEqual(result[0][0], 6.0)
  self.assertEqual(result[0][1], 1.0)
  self.assertEqual(result[1][0], 1.0)
  self.assertEqual(result[1][1], 52.0)
  self.assertEqual(result[2][0], 4.0)
  self.assertEqual(result[2][1], 0.0)
  self.assertEqual(result[3][0], None)
  self.assertEqual(result[3][1], 0.0)
  return
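# The expected values asserted above can be reproduced by hand. The sketch
# below is a minimal standalone check, not part of the test suite; it assumes
# the aggregator groups records into 2-day buckets and, because the inputs are
# Python 2 ints, the weighted mean and mean truncate toward zero before being
# cast to float, and that an all-zero weight bucket yields None.
def _weighted_mean_sketch():
  buckets = [
    [(10, 1), (5, 2)],    # 2000-03-01 .. 2000-03-02
    [(1, 100), (2, 4)],   # 2000-03-03 .. 2000-03-04
    [(4, 1), (4, 0)],     # 2000-03-05 .. 2000-03-06
    [(5, 0), (6, 0)],     # 2000-03-07 .. 2000-03-08
  ]
  for bucket in buckets:
    weightSum = sum(w for _, w in bucket)
    # 'wmean:dummy2' -> dummy1 weighted by dummy2; None when all weights are 0
    wmean = (sum(v * w for v, w in bucket) / weightSum) if weightSum else None
    # 'mean' on dummy2 (Python 2 integer division)
    mean = weightSum / len(bucket)
    print wmean, mean
# Prints 6 1, 1 52, 4 0, None 0 -- matching the assertions once the
# aggregator emits the numeric fields as floats.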
def test_GenerateDataset(self):
  dataset = 'extra/gym/gym.csv'

  print "Using input dataset: ", dataset

  gymFields = None
  with FileRecordStream(findDataset(dataset)) as f:
    gymFields = f.getFieldNames()

  aggregationOptions = dict(timeField=gymFields.index('timestamp'),
                            fields=[('attendeeCount', sum),
                                    ('consumption', sum),
                                    ('timestamp', lambda x: x[0])],
                            hours=5)

  handle = tempfile.NamedTemporaryFile(
    prefix='agg_gym_hours_5',
    suffix='.csv',
    dir=os.path.dirname(findDataset(dataset)))
  outputFile = handle.name
  handle.close()

  print "Expected outputFile path: ", outputFile

  print "Files in the destination folder before the test:"
  print os.listdir(os.path.abspath(os.path.dirname(findDataset(dataset))))

  if os.path.isfile(outputFile):
    print "Removing existing outputFile: ", outputFile
    os.remove(outputFile)

  self.assertFalse(os.path.exists(outputFile),
                   msg="Shouldn't exist, but does: " + str(outputFile))

  result = generateDataset(aggregationOptions, dataset, outputFile)
  print "generateDataset() returned: ", result

  f1 = os.path.abspath(os.path.normpath(result))
  print "normalized generateDataset() result path: ", f1

  f2 = os.path.normpath(outputFile)
  print "normalized outputFile path: ", f2

  self.assertEqual(f1, f2)

  print "Checking for presence of outputFile: ", outputFile
  self.assertTrue(
    os.path.isfile(outputFile),
    msg="Missing outputFile: %r; normalized generateDataset() result: %r" % (
      outputFile, f1))

  print "Files in the destination folder after the test:"
  print os.listdir(os.path.abspath(os.path.dirname(findDataset(dataset))))

  print result
  print '-' * 30

  return
def test_AutoSpecialFields(self):
  # Cleanup old files
  #for f in glob.glob('*.*'):
  #  if 'auto_specials' in f:
  #    os.remove(f)

  fields = [('dummy', 'string', ''),
            ('timestamp', 'datetime', 'T'),
            ('reset', 'int', 'R'),
            ('sid', 'int', 'S')]

  records = (
    ['dummy-1', datetime.datetime(2000, 3, 1), 1, 1],
    ['dummy-2', datetime.datetime(2000, 3, 2), 0, 1],
    ['dummy-3', datetime.datetime(2000, 3, 3), 0, 1],
    ['dummy-4', datetime.datetime(2000, 3, 4), 1, 2],
    ['dummy-5', datetime.datetime(2000, 3, 5), 0, 2],
  )

  if not os.path.isdir('data'):
    os.makedirs('data')

  with FileRecordStream('data/auto_specials.csv', write=True,
                        fields=fields) as o:
    for r in records:
      o.appendRecord(r)

  # Aggregate just the dummy field, all the specials should be added
  ai = dict(fields=[('dummy', lambda x: x[0])], weeks=3)

  handle = tempfile.NamedTemporaryFile(prefix='auto_specials',
                                       suffix='.csv',
                                       dir='.')
  tempFile = handle.name
  handle.close()

  outputFile = generateDataset(ai, 'auto_specials.csv', tempFile)

  result = []
  with FileRecordStream(outputFile) as f:
    print f.getFields()
    for r in f:
      result.append(r)

  self.assertEqual(result[0][2], 1)  # reset
  self.assertEqual(result[0][3], 1)  # seq id
  self.assertEqual(result[0][0], 'dummy-1')

  self.assertEqual(result[1][2], 1)  # reset
  self.assertEqual(result[1][3], 2)  # seq id
  self.assertEqual(result[1][0], 'dummy-4')

  return
def getDatasetsImpl(baseDatasets, generate, config):
  """ Implementation for description.py getDatasets() entry point function.
  Given a dict of base datasets, returns a dict of possibly transformed
  dataset paths to use; if config['aggregationInfo'] is disabled, then an
  identical dataset dict is returned. Optionally, generates new datasets by
  applying transformations specified in config['aggregationInfo'].

  baseDatasets: a dictionary of base dataset paths, where each key/value pair
                corresponds to a base (raw) dataset. The keys are as generated
                by our getBaseDatasets();
                NOTE: the paths are absolute (fixed up by the framework)
                NOTE: the paths in the baseDatasets dict will have been
                adjusted by the prediction framework to point to actual
                dataset locations as found on disk, and are not likely to be
                the same as the (local) paths initially returned by
                getBaseDatasets

  generate:     if True and config['aggregationInfo'] is enabled, then new
                datasets will be generated per config['aggregationInfo'];
                otherwise, new datasets will not be generated

  config:       configuration dictionary from description.py

  Returns:      dictionary of dataset paths to use with the same keys as in
                baseDatasets; the values may differ from baseDatasets as
                follows: if config['aggregationInfo'] is enabled, then new
                dataset paths will be generated per config['aggregationInfo'].
  """

  # Aggregation info
  aggInfo = config['aggregationInfo'] if config['aggregationInfo'] else dict()

  datasets = dict()
  targetPaths = []
  for name in baseDatasets:
    if generate:
      # NOTE: Avoid processing the same dataset more than once, such as when
      # the same dataset is used for training and inference in some tests
      tempPath = getFilename(aggInfo, baseDatasets[name])
      if tempPath not in targetPaths:
        path = generateDataset(aggInfo, baseDatasets[name])
        assert path == tempPath
      else:
        path = tempPath
      targetPaths.append(path)
    else:
      path = getFilename(aggInfo, baseDatasets[name])

    datasets[name] = path

  return datasets
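# Hedged usage sketch (the getDatasets() wrapper and the example paths below
# are assumptions, not taken from this file): a description.py entry point
# would typically just forward its arguments plus the experiment's config
# dict to getDatasetsImpl(). With aggregation disabled, the docstring above
# says the input paths come back unchanged.
def _getDatasets_sketch():
  config = {'aggregationInfo': None}                 # aggregation disabled
  baseDatasets = {'gym': '/abs/path/to/gym.csv'}     # placeholder path
  # getDatasets(baseDatasets, generate=False) would reduce to this call:
  return getDatasetsImpl(baseDatasets, generate=False, config=config)
  # -> {'gym': '/abs/path/to/gym.csv'} per the docstring's "identical" case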
def test_AutoSpecialFields(self):
  # Cleanup old files
  # for f in glob.glob('*.*'):
  #   if 'auto_specials' in f:
  #     os.remove(f)

  fields = [("dummy", "string", ""),
            ("timestamp", "datetime", "T"),
            ("reset", "int", "R"),
            ("sid", "int", "S")]

  records = (
    ["dummy-1", datetime.datetime(2000, 3, 1), 1, 1],
    ["dummy-2", datetime.datetime(2000, 3, 2), 0, 1],
    ["dummy-3", datetime.datetime(2000, 3, 3), 0, 1],
    ["dummy-4", datetime.datetime(2000, 3, 4), 1, 2],
    ["dummy-5", datetime.datetime(2000, 3, 5), 0, 2],
  )

  with FileRecordStream(resource_filename("nupic.datafiles", "auto_specials.csv"),
                        write=True, fields=fields) as o:
    for r in records:
      o.appendRecord(r)

  # Aggregate just the dummy field, all the specials should be added
  ai = dict(fields=[("dummy", lambda x: x[0])], weeks=3)

  handle = tempfile.NamedTemporaryFile(prefix="auto_specials",
                                       suffix=".csv",
                                       dir=".")
  tempFile = handle.name
  handle.close()

  outputFile = generateDataset(ai, "auto_specials.csv", tempFile)

  result = []
  with FileRecordStream(outputFile) as f:
    print f.getFields()
    for r in f:
      result.append(r)

  self.assertEqual(result[0][2], 1)  # reset
  self.assertEqual(result[0][3], 1)  # seq id
  self.assertEqual(result[0][0], "dummy-1")

  self.assertEqual(result[1][2], 1)  # reset
  self.assertEqual(result[1][3], 2)  # seq id
  self.assertEqual(result[1][0], "dummy-4")

  return
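# Why only 'dummy-1' and 'dummy-4' survive aggregation in the tests above:
# with a 3-week bucket every record lands in the same time window, but
# aggregation does not cross sequence boundaries, so the sid change at
# 'dummy-4' opens a new output record whose reset flag is set. The grouping
# sketch below is a standalone illustration; the tuple layout and reset
# handling are assumptions for illustration, not the aggregator's own code.
def _auto_specials_sketch():
  records = [('dummy-1', 1, 1), ('dummy-2', 0, 1), ('dummy-3', 0, 1),
             ('dummy-4', 1, 2), ('dummy-5', 0, 2)]
  output = []
  for dummy, reset, sid in records:
    if not output or output[-1][-1] != sid:
      # first record of a new sequence: keep its value, force reset=1
      output.append((dummy, 1, sid))
  print output  # [('dummy-1', 1, 1), ('dummy-4', 1, 2)]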
def test_GapsInIrregularData(self):
  # Cleanup previous files if they exist
  import glob
  for f in glob.glob('gap.*'):
    print 'Removing', f
    os.remove(f)

  #class TestParser(BaseParser):
  #  def __init__(self):
  #    def parseTimestamp(s):
  #      d, t = s.split()
  #      year, month, day = [int(x) for x in d.split('-')]
  #      hour, minute, second = [int(x) for x in t.split(':')]
  #      return datetime.datetime(year, month, day, hour, minute, second)
  #
  #    BaseParser.__init__(self,
  #                        [('dateTime', parseTimestamp),
  #                         ('sequenceId', int),
  #                         ('cardtype', int),
  #                         ('fraud', bool),
  #                         ('amount', float)],
  #                        delimiter=',')
  #
  #  def parse(self, line):
  #    values = BaseParser.parse(self, line)
  #    return values

  #dateTime,cardnum,cardtype,fraud,amount
  data = """\
2009-04-03 19:05:06,129.3
2009-04-04 15:19:12,46.6
2009-04-07 02:54:04,30.32
2009-04-07 06:27:12,84.52
2009-04-07 06:42:21,21.1
2009-04-09 01:01:14,29.24
2009-04-09 06:47:42,99.76
2009-04-11 18:06:11,29.66
2009-04-11 18:12:53,148.32
2009-04-11 19:15:08,61.03
2009-04-15 19:25:40,53.14
2009-05-04 21:07:02,816.75
2009-05-04 21:08:27,686.07
2009-05-06 20:40:04,489.08
2009-05-06 20:40:42,586.9
2009-05-06 20:41:15,554.3
2009-05-06 20:41:51,652.11"""

  fields = [('timestamp', 'datetime', 'T'),
            ('amount', 'float', '')]

  with FileRecordStream(resource_filename('nupic.datafiles', 'gap.csv'),
                        write=True, fields=fields) as f:
    lines = data.split('\n')
    for line in lines:
      t, a = line.split(',')

      components = t.split()
      yyyy, mm, dd = [int(x) for x in components[0].split('-')]
      h, m, s = [int(x) for x in components[1].split(':')]

      t = datetime.datetime(yyyy, mm, dd, h, m, s)
      a = float(a)

      f.appendRecord([t, a])

  aggregationOptions = dict(timeField='timestamp',
                            fields=[('timestamp', lambda x: x[0]),
                                    ('amount', sum)],
                            hours=24)

  handle = tempfile.NamedTemporaryFile(prefix='agg_gap_hours_24',
                                       suffix='.csv',
                                       dir='nupic/datafiles')
  outputFile = handle.name
  handle.close()

  if os.path.isfile(outputFile):
    os.remove(outputFile)
  self.assertFalse(os.path.exists(outputFile),
                   msg="shouldn't exist, but does: " + str(outputFile))

  result = generateDataset(aggregationOptions, 'gap.csv', outputFile)
  self.assertEqual(
    os.path.normpath(os.path.abspath(outputFile)), os.path.normpath(result),
    msg="result = '%s'; outputFile = '%s'" % (result, outputFile))
  self.assertTrue(os.path.isfile(outputFile),
                  msg="outputFile missing or is not file: %r" % outputFile)
  print outputFile
  print '-' * 30

  s = ''
  for r in FileRecordStream(outputFile):
    s += ', '.join([str(x) for x in r]) + '\n'

  expected = """\
2009-04-03 19:05:06, 175.9
2009-04-06 19:05:06, 135.94
2009-04-08 19:05:06, 129.0
2009-04-10 19:05:06, 177.98
2009-04-11 19:05:06, 61.03
2009-04-15 19:05:06, 53.14
2009-05-04 19:05:06, 1502.82
2009-05-06 19:05:06, 2282.39
"""

  self.assertEqual(s, expected)

  return
def test_GapsInIrregularData(self):
  # Cleanup previous files if they exist
  import glob
  for f in glob.glob('gap.*'):
    print 'Removing', f
    os.remove(f)

  #class TestParser(BaseParser):
  #  def __init__(self):
  #    def parseTimestamp(s):
  #      d, t = s.split()
  #      year, month, day = [int(x) for x in d.split('-')]
  #      hour, minute, second = [int(x) for x in t.split(':')]
  #      return datetime.datetime(year, month, day, hour, minute, second)
  #
  #    BaseParser.__init__(self,
  #                        [('dateTime', parseTimestamp),
  #                         ('sequenceId', int),
  #                         ('cardtype', int),
  #                         ('fraud', bool),
  #                         ('amount', float)],
  #                        delimiter=',')
  #
  #  def parse(self, line):
  #    values = BaseParser.parse(self, line)
  #    return values

  #dateTime,cardnum,cardtype,fraud,amount
  data = """\
2009-04-03 19:05:06,129.3
2009-04-04 15:19:12,46.6
2009-04-07 02:54:04,30.32
2009-04-07 06:27:12,84.52
2009-04-07 06:42:21,21.1
2009-04-09 01:01:14,29.24
2009-04-09 06:47:42,99.76
2009-04-11 18:06:11,29.66
2009-04-11 18:12:53,148.32
2009-04-11 19:15:08,61.03
2009-04-15 19:25:40,53.14
2009-05-04 21:07:02,816.75
2009-05-04 21:08:27,686.07
2009-05-06 20:40:04,489.08
2009-05-06 20:40:42,586.9
2009-05-06 20:41:15,554.3
2009-05-06 20:41:51,652.11"""

  fields = [('timestamp', 'datetime', 'T'),
            ('amount', 'float', '')]

  with FileRecordStream(resource_filename('nupic.datafiles', 'gap.csv'),
                        write=True, fields=fields) as f:
    lines = data.split('\n')
    for line in lines:
      t, a = line.split(',')

      components = t.split()
      yyyy, mm, dd = [int(x) for x in components[0].split('-')]
      h, m, s = [int(x) for x in components[1].split(':')]

      t = datetime.datetime(yyyy, mm, dd, h, m, s)
      a = float(a)

      f.appendRecord([t, a])

  aggregationOptions = dict(timeField='timestamp',
                            fields=[('timestamp', lambda x: x[0]),
                                    ('amount', sum)],
                            hours=24)

  handle = tempfile.NamedTemporaryFile(prefix='agg_gap_hours_24',
                                       suffix='.csv',
                                       dir='.')
  outputFile = handle.name
  handle.close()

  if os.path.isfile(outputFile):
    os.remove(outputFile)
  self.assertFalse(os.path.exists(outputFile),
                   msg="shouldn't exist, but does: " + str(outputFile))

  result = generateDataset(aggregationOptions, 'gap.csv', outputFile)
  self.assertEqual(
    os.path.normpath(os.path.abspath(outputFile)), os.path.normpath(result),
    msg="result = '%s'; outputFile = '%s'" % (result, outputFile))
  self.assertTrue(os.path.isfile(outputFile),
                  msg="outputFile missing or is not file: %r" % outputFile)
  print outputFile
  print '-' * 30

  s = ''
  for r in FileRecordStream(outputFile):
    s += ', '.join([str(x) for x in r]) + '\n'

  expected = """\
2009-04-03 19:05:06, 175.9
2009-04-06 19:05:06, 135.94
2009-04-08 19:05:06, 129.0
2009-04-10 19:05:06, 177.98
2009-04-11 19:05:06, 61.03
2009-04-15 19:05:06, 53.14
2009-05-04 19:05:06, 1502.82
2009-05-06 19:05:06, 2282.39
"""

  self.assertEqual(s, expected)

  return
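# The expected aggregation in the gap tests above can be checked by hand: the
# 24-hour buckets appear to be anchored at the first record's timestamp
# (19:05:06), empty buckets produce no output row, and 'amount' is summed per
# bucket. The standalone sketch below is an illustration under those assumed
# rules (inferred from the expected data), not the aggregator's own code;
# call it with the (timestamp, amount) pairs parsed from `data` above.
def _gap_buckets_sketch(records):
  import datetime
  bucketStart = records[0][0]
  bucketSum, out = 0.0, []
  for t, amount in records:
    # advance past any elapsed buckets, emitting only the non-empty ones
    while t >= bucketStart + datetime.timedelta(hours=24):
      if bucketSum:
        out.append((bucketStart, round(bucketSum, 2)))
        bucketSum = 0.0
      bucketStart += datetime.timedelta(hours=24)
    bucketSum += amount
  out.append((bucketStart, round(bucketSum, 2)))
  return out
# e.g. the first bucket covers 2009-04-03 19:05:06 .. 2009-04-04 19:05:06 and
# sums 129.3 + 46.6 = 175.9, matching the first expected line.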