def testCopyOneRow(self):
  expectedOutput = ("Timestamp,Value\n"
                    "datetime,int\n"
                    "T,\n"
                    "2011-09-04 02:00:00.000000,1\n"
                    "2011-09-04 02:05:00.000000,2\n"
                    "2011-09-04 02:10:00.000000,2\n"
                    "2011-09-04 02:15:00.000000,3\n"
                    "2011-09-04 02:20:00.000000,4\n"
                    "2011-09-04 02:25:00.000000,5\n"
                    "2011-09-04 02:30:00.000000,6\n")
  mockInput = MagicMock(return_value=StringIO(self.sampleInput))
  output = StringIO()
  mockOutput = MagicMock(return_value=output)
  with patch("__builtin__.open", mockInput):
    inputFile = FileRecordStream("input_path")
    with patch("__builtin__.open", mockOutput):
      outputFile = FileRecordStream("output_path",
                                    fields=inputFile.getFields(),
                                    write=True)
      anomalyzer.copy(inputFile, outputFile, 1, 1, 1)
  result = output.getvalue()
  # Normalize line endings so the comparison is platform-independent
  result = result.replace("\r\n", "\n")
  result = result.replace("\r", "\n")
  self.assertSequenceEqual(expectedOutput, result)
def testSample(self):
  mockInput = MagicMock(return_value=StringIO(self.sampleInput))
  output = StringIO()
  mockOutput = MagicMock(return_value=output)
  with patch("__builtin__.open", mockInput):
    inputFile = FileRecordStream("input_path")
    with patch("__builtin__.open", mockOutput):
      outputFile = FileRecordStream("output_path",
                                    fields=inputFile.getFields(),
                                    write=True)
      anomalyzer.sample(inputFile, outputFile, 1)
  result = StringIO(output.getvalue())
  # Skip the three header rows (field names, types, special flags)
  result.next()
  result.next()
  result.next()
  reader = csv.reader(result)
  _, value = reader.next()
  self.assertIn(int(value), (1, 2, 3, 4, 5, 6))
  self.assertRaises(StopIteration, result.next)
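# The two tests above share a pattern: FileRecordStream only opens files by
# path, so an in-memory CSV is supplied by patching the built-in open(). A
# minimal sketch of that pattern as a reusable helper is below; the helper
# name makeStreamPair is illustrative, not part of the anomalyzer test API,
# and it relies on the same imports as the tests (StringIO, MagicMock, patch,
# FileRecordStream).
def makeStreamPair(csvText, fields=None):
  """Return (inputStream, outputBuffer, outputStream) backed by StringIO."""
  mockInput = MagicMock(return_value=StringIO(csvText))
  outputBuffer = StringIO()
  mockOutput = MagicMock(return_value=outputBuffer)
  # FileRecordStream reads/writes its header in the constructor, so the
  # patch only needs to be active while each stream is built.
  with patch("__builtin__.open", mockInput):
    inputStream = FileRecordStream("input_path")
  with patch("__builtin__.open", mockOutput):
    outputStream = FileRecordStream("output_path",
                                    fields=fields or inputStream.getFields(),
                                    write=True)
  return inputStream, outputBuffer, outputStream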
def _testSamePredictions(self, experiment, predSteps, checkpointAt,
                         predictionsFilename, additionalFields=None,
                         newSerialization=False):
  """ Test that we get the same predictions out from the following two
  scenarios:

  a_plus_b: Run the network for 'a' iterations followed by 'b' iterations
  a, followed by b: Run the network for 'a' iterations, save it, load it
                    back in, then run for 'b' iterations.

  Parameters:
  -----------------------------------------------------------------------
  experiment:   base directory of the experiment. This directory should
                  contain the following:
                    base.py
                    a_plus_b/description.py
                    a/description.py
                    b/description.py
                  The sub-directory description files should import base.py
                  and only change the first and last record used from the
                  data file.
  predSteps:    Number of steps ahead predictions are for
  checkpointAt: Number of iterations that 'a' runs for.
                  IMPORTANT: This must match the number of records that
                  a/description.py runs for - it is NOT dynamically stuffed
                  into the a/description.py.
  predictionsFilename: The name of the predictions file that the OPF
                  generates for this experiment (for example
                  'DefaultTask.NontemporalMultiStep.predictionLog.csv')
  additionalFields: Optional list of additional column names to compare
                  between the two runs.
  newSerialization: Whether to use new capnproto serialization.
  """

  # Get the 3 sub-experiment directories
  aPlusBExpDir = os.path.join(_EXPERIMENT_BASE, experiment, "a_plus_b")
  aExpDir = os.path.join(_EXPERIMENT_BASE, experiment, "a")
  bExpDir = os.path.join(_EXPERIMENT_BASE, experiment, "b")

  # Run a+b
  args = self._createExperimentArgs(aPlusBExpDir,
                                    newSerialization=newSerialization)
  _aPlusBExp = runExperiment(args)

  # Run a, then copy the saved checkpoint into the b directory
  args = self._createExperimentArgs(aExpDir,
                                    newSerialization=newSerialization)
  _aExp = runExperiment(args)
  if os.path.exists(os.path.join(bExpDir, 'savedmodels')):
    shutil.rmtree(os.path.join(bExpDir, 'savedmodels'))
  shutil.copytree(src=os.path.join(aExpDir, 'savedmodels'),
                  dst=os.path.join(bExpDir, 'savedmodels'))

  args = self._createExperimentArgs(bExpDir,
                                    newSerialization=newSerialization,
                                    additionalArgs=['--load=DefaultTask'])
  _bExp = runExperiment(args)

  # Now, compare the predictions at the end of a+b to those in b.
  aPlusBPred = FileRecordStream(os.path.join(aPlusBExpDir, 'inference',
                                             predictionsFilename))
  bPred = FileRecordStream(os.path.join(bExpDir, 'inference',
                                        predictionsFilename))

  colNames = [x[0] for x in aPlusBPred.getFields()]
  actValueColIdx = colNames.index('multiStepPredictions.actual')
  predValueColIdx = colNames.index('multiStepPredictions.%d' % (predSteps))

  # Skip past the 'a' records in aPlusB
  for i in range(checkpointAt):
    aPlusBPred.next()

  # Now, read through the records that don't have predictions yet
  for i in range(predSteps):
    aPlusBPred.next()
    bPred.next()

  # Now, compare predictions in the two files
  rowIdx = checkpointAt + predSteps + 4 - 1
  epsilon = 0.0001
  while True:
    rowIdx += 1
    try:
      rowAPB = aPlusBPred.next()
      rowB = bPred.next()

      # Compare actuals
      self.assertEqual(rowAPB[actValueColIdx], rowB[actValueColIdx],
          "Mismatch in actual values: row %d of a+b has %s and row %d of "
          "b has %s" % (rowIdx, rowAPB[actValueColIdx],
                        rowIdx - checkpointAt, rowB[actValueColIdx]))

      # Compare predictions, within nearest epsilon
      predAPB = eval(rowAPB[predValueColIdx])
      predB = eval(rowB[predValueColIdx])

      # Sort with highest probabilities first
      predAPB = [(a, b) for b, a in predAPB.items()]
      predB = [(a, b) for b, a in predB.items()]
      predAPB.sort(reverse=True)
      predB.sort(reverse=True)

      if additionalFields is not None:
        for additionalField in additionalFields:
          fieldIdx = colNames.index(additionalField)
          self.assertEqual(rowAPB[fieldIdx], rowB[fieldIdx],
              "Mismatch in field '%s' values: row %d of a+b has value: (%s)\n"
              " and row %d of b has value: %s" % \
              (additionalField, rowIdx, rowAPB[fieldIdx],
               rowIdx - checkpointAt, rowB[fieldIdx]))

      self.assertEqual(len(predAPB), len(predB),
          "Mismatch in predicted values: row %d of a+b has %d predictions: "
          "\n  (%s) and row %d of b has %d predictions:\n  (%s)" % \
          (rowIdx, len(predAPB), predAPB, rowIdx - checkpointAt,
           len(predB), predB))

      for i in range(len(predAPB)):
        (aProb, aValue) = predAPB[i]
        (bProb, bValue) = predB[i]

        self.assertLess(abs(aValue - bValue), epsilon,
            "Mismatch in predicted values: row %d of a+b predicts value %s "
            "and row %d of b predicts %s" % (rowIdx, aValue,
                                             rowIdx - checkpointAt, bValue))

        self.assertLess(abs(aProb - bProb), epsilon,
            "Mismatch in probabilities: row %d of a+b predicts %s with "
            "probability %s and row %d of b predicts %s with probability %s" \
            % (rowIdx, aValue, aProb, rowIdx - checkpointAt, bValue, bProb))

    except StopIteration:
      break

  # Clean up model checkpoint directories
  shutil.rmtree(getCheckpointParentDir(aExpDir))
  shutil.rmtree(getCheckpointParentDir(bExpDir))
  shutil.rmtree(getCheckpointParentDir(aPlusBExpDir))

  print "Predictions match!"
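# The per-row comparison above can be read in isolation: each prediction cell
# holds a dict of {value: probability}, and two rows agree when, after sorting
# by descending probability, the paired values and probabilities each differ
# by less than epsilon. A standalone sketch of that check (the helper name
# predictionsMatch is illustrative, not part of the OPF API):
def predictionsMatch(predDictA, predDictB, epsilon=0.0001):
  """Return True if two {value: probability} dicts agree within epsilon."""
  # Sort with highest probabilities first, as the test does
  pairsA = sorted(((p, v) for v, p in predDictA.items()), reverse=True)
  pairsB = sorted(((p, v) for v, p in predDictB.items()), reverse=True)
  if len(pairsA) != len(pairsB):
    return False
  for (aProb, aValue), (bProb, bValue) in zip(pairsA, pairsB):
    if abs(aValue - bValue) >= epsilon or abs(aProb - bProb) >= epsilon:
      return False
  return True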
def testExperimentResults(self):
  """Run specific experiments and verify that they are producing the correct
  results.

  opfDir is the examples/opf directory in the install path and is used to
  find run_opf_experiment.py.

  testDir is the directory that contains the experiments we will be running.
  When running in the auto-build setup, this will be a temporary directory
  that has had this script, as well as the specific experiments we will be
  running, copied into it by the qa/autotest/prediction_results.py script.
  When running stand-alone from the command line, this will point to the
  examples/prediction directory in the install tree (same as predictionDir).
  """

  nupic_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           "..", "..", "..", "..")
  opfDir = os.path.join(nupic_dir, "examples", "opf")

  testDir = opfDir
  if not os.path.exists(os.path.join(testDir, "experiments/classification")):
    testDir = opfDir

  # Generate any dynamically generated datasets now
  command = ['python', os.path.join(testDir, 'experiments', 'classification',
                                    'makeDatasets.py')]
  retval = call(command)
  self.assertEqual(retval, 0)

  # Generate any dynamically generated datasets now
  command = ['python', os.path.join(testDir, 'experiments', 'multistep',
                                    'make_datasets.py')]
  retval = call(command)
  self.assertEqual(retval, 0)

  # Generate any dynamically generated datasets now
  command = ['python', os.path.join(testDir, 'experiments',
                                    'spatial_classification',
                                    'make_datasets.py')]
  retval = call(command)
  self.assertEqual(retval, 0)

  # Run from the test directory so that we can find our experiments
  os.chdir(testDir)

  runExperiment = os.path.join(nupic_dir, "scripts",
                               "run_opf_experiment.py")

  # A list of experiments to run. Valid attributes:
  #   experimentDir - Required, path to the experiment directory containing
  #                   description.py
  #   args          - optional. List of arguments for run_opf_experiment
  #   results       - A dictionary of expected results. The keys are tuples
  #                   containing (predictionLogFileName, columnName). The
  #                   value is a (min, max) expected value from the last row
  #                   in the prediction log.
  multistepTests = [
    # For this one, in theory the error for 1 step should be < 0.20
    {'experimentDir': 'experiments/multistep/simple_0',
     'results': {
        ('DefaultTask.TemporalMultiStep.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=200:field=field1"):
        (0.0, 0.20),
      }
    },

    # For this one, in theory the error for 1 step should be < 0.50, but we
    # get slightly higher because our sample size is smaller than ideal
    {'experimentDir': 'experiments/multistep/simple_0_f2',
     'results': {
        ('DefaultTask.TemporalMultiStep.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='aae':steps=1:window=200:field=field2"):
        (0.0, 0.66),
      }
    },

    # For this one, in theory the error for 1 step should be < 0.20
    {'experimentDir': 'experiments/multistep/simple_1',
     'results': {
        ('DefaultTask.TemporalMultiStep.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=200:field=field1"):
        (0.0, 0.20),
      }
    },

    # For this test, we haven't figured out the theoretical error, this
    # error is determined empirically from actual results
    {'experimentDir': 'experiments/multistep/simple_1_f2',
     'results': {
        ('DefaultTask.TemporalMultiStep.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='aae':steps=1:window=200:field=field2"):
        (0.0, 3.76),
      }
    },

    # For this one, in theory the error for 1 step should be < 0.20, but we
    # get slightly higher because our sample size is smaller than ideal
    {'experimentDir': 'experiments/multistep/simple_2',
     'results': {
        ('DefaultTask.TemporalMultiStep.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=200:field=field1"):
        (0.0, 0.31),
      }
    },

    # For this one, in theory the error for 1 step should be < 0.10 and for
    # 3 step < 0.30, but our actual results are better.
    {'experimentDir': 'experiments/multistep/simple_3',
     'results': {
        ('DefaultTask.TemporalMultiStep.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=200:field=field1"):
        (0.0, 0.06),
        ('DefaultTask.TemporalMultiStep.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=3:window=200:field=field1"):
        (0.0, 0.20),
      }
    },

    # For this test, we haven't figured out the theoretical error, this
    # error is determined empirically from actual results
    {'experimentDir': 'experiments/multistep/simple_3_f2',
     'results': {
        ('DefaultTask.TemporalMultiStep.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='aae':steps=1:window=200:field=field2"):
        (0.0, 0.6),
        ('DefaultTask.TemporalMultiStep.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='aae':steps=3:window=200:field=field2"):
        (0.0, 1.8),
      }
    },

    # Test missing record support.
    # Should have 0 error by the end of the dataset
    {'experimentDir': 'experiments/missing_record/simple_0',
     'results': {
        ('DefaultTask.NontemporalMultiStep.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=25:field=field1"):
        (1.0, 1.0),
      }
    },
  ]  # end of multistepTests

  classificationTests = [
    # ----------------------------------------------------------------------
    # Classification Experiments
    {'experimentDir': 'experiments/classification/category_hub_TP_0',
     'results': {
        ('OnlineLearning.TemporalClassification.predictionLog.csv',
         'classification:avg_err:window=200'): (0.0, 0.020),
      }
    },

    {'experimentDir': 'experiments/classification/category_TM_0',
     'results': {
        ('OnlineLearning.TemporalClassification.predictionLog.csv',
         'classification:avg_err:window=200'): (0.0, 0.045),
        ('OnlineLearning.TemporalClassification.predictionLog.csv',
         'classConfidences:neg_auc:computeEvery=10:window=200'): (-1.0, -0.98),
      }
    },

    {'experimentDir': 'experiments/classification/category_TM_1',
     'results': {
        ('OnlineLearning.TemporalClassification.predictionLog.csv',
         'classification:avg_err:window=200'): (0.0, 0.005),
      }
    },

    {'experimentDir': 'experiments/classification/scalar_TP_0',
     'results': {
        ('OnlineLearning.TemporalClassification.predictionLog.csv',
         'classification:avg_err:window=200'): (0.0, 0.155),
        ('OnlineLearning.TemporalClassification.predictionLog.csv',
         'classConfidences:neg_auc:computeEvery=10:window=200'): (-1.0, -0.900),
      }
    },

    {'experimentDir': 'experiments/classification/scalar_TP_1',
     'results': {
        ('OnlineLearning.TemporalClassification.predictionLog.csv',
         'classification:avg_err:window=200'): (0.0, 0.03),
      }
    },
  ]  # End of classification tests

  spatialClassificationTests = [
    {'experimentDir': 'experiments/spatial_classification/category_0',
     'results': {
        ('DefaultTask.NontemporalClassification.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=0:window=100:field=classification"):
        (0.0, 0.05),
      }
    },

    {'experimentDir': 'experiments/spatial_classification/category_1',
     'results': {
        ('DefaultTask.NontemporalClassification.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=0:window=100:field=classification"):
        (0.0, 0.0),
      }
    },

    {'experimentDir': 'experiments/spatial_classification/scalar_0',
     'results': {
        ('DefaultTask.NontemporalClassification.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='aae':steps=0:window=100:field=classification"):
        (0.0, 0.025),
      }
    },

    {'experimentDir': 'experiments/spatial_classification/scalar_1',
     'results': {
        ('DefaultTask.NontemporalClassification.predictionLog.csv',
         "multiStepBestPredictions:multiStep:errorMetric='aae':steps=0:window=100:field=classification"):
        (-1e-10, 0.01),
      }
    },
  ]

  anomalyTests = [
    # ----------------------------------------------------------------------
    # Anomaly Experiments
    {'experimentDir': 'experiments/anomaly/temporal/simple',
     'results': {
        ('DefaultTask.TemporalAnomaly.predictionLog.csv',
         'anomalyScore:passThruPrediction:window=1000:field=f'): (0.02, 0.04),
      }
    },
  ]  # End of anomaly tests

  tests = []
  tests += multistepTests
  tests += classificationTests
  tests += spatialClassificationTests
  tests += anomalyTests

  # Uncomment this to run only specific experiments
  #tests = tests[7:8]

  # This contains a list of tuples: (expDir, key, results)
  summaryOfResults = []
  startTime = time.time()

  testIdx = -1
  for test in tests:
    testIdx += 1
    expDirectory = test['experimentDir']

    # -------------------------------------------------------------------
    # Remove files/directories generated by previous tests
    toDelete = []

    # Remove inference results
    path = os.path.join(expDirectory, "inference")
    toDelete.append(path)
    path = os.path.join(expDirectory, "savedmodels")
    toDelete.append(path)

    for path in toDelete:
      if not os.path.exists(path):
        continue
      print "Removing %s ..." % path
      if os.path.isfile(path):
        os.remove(path)
      else:
        shutil.rmtree(path)

    # ------------------------------------------------------------------------
    # Run the test.
    args = test.get('args', [])
    print "Running experiment %s ..." % (expDirectory)
    command = ['python', runExperiment, expDirectory] + args
    retVal = call(command)

    # If retVal is non-zero and this was not a negative test or if retVal is
    # zero and this is a negative test something went wrong.
    if retVal:
      print "Details of failed test: %s" % test
      print ("TestIdx %d, OPF experiment '%s' failed with return code %i."
             % (testIdx, expDirectory, retVal))
    self.assertFalse(retVal)

    # -----------------------------------------------------------------------
    # Check the results
    for (key, expValues) in test['results'].items():
      (logFilename, colName) = key

      # Open the prediction log file
      logFile = FileRecordStream(os.path.join(expDirectory, 'inference',
                                              logFilename))
      colNames = [x[0] for x in logFile.getFields()]
      if colName not in colNames:
        print "TestIdx %d: %s not one of the columns in " \
              "prediction log file. Available column names are: %s" % \
              (testIdx, colName, colNames)
      self.assertTrue(colName in colNames)
      colIndex = colNames.index(colName)

      # Read until we get to the last line
      while True:
        try:
          row = logFile.next()
        except StopIteration:
          break
      result = row[colIndex]

      # Save summary of results
      summaryOfResults.append((expDirectory, colName, result))

      print "Actual result for %s, %s:" % (expDirectory, colName), result
      print "Expected range:", expValues
      failed = (expValues[0] is not None and result < expValues[0]) \
            or (expValues[1] is not None and result > expValues[1])
      if failed:
        print ("TestIdx %d: Experiment %s failed. \nThe actual result"
               " for %s (%s) was outside the allowed range of %s"
               % (testIdx, expDirectory, colName, result, expValues))
      else:
        print "  Within expected range."
      self.assertFalse(failed)

  # =======================================================================
  # Print summary of results:
  print
  print "Summary of results in all experiments run:"
  print "========================================="
  prevExpDir = None
  for (expDir, key, results) in summaryOfResults:
    if expDir != prevExpDir:
      print
      print expDir
      prevExpDir = expDir
    print "  %s: %s" % (key, results)

  print "\nElapsed time: %.1f seconds" % (time.time() - startTime)
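# The pass/fail rule above is just a bounds check in which None means
# "unbounded on that side". A sketch of it as a standalone predicate
# (inRange is an illustrative name, not an OPF utility):
def inRange(result, expValues):
  """Return True if result falls inside the (min, max) tuple expValues."""
  lo, hi = expValues
  if lo is not None and result < lo:
    return False
  if hi is not None and result > hi:
    return False
  return True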
def generateStats(filename, maxSamples=None):
  """ Collect statistics for each of the fields in the user input data file and
  return a stats dict object.

  Parameters:
  ------------------------------------------------------------------------------
  filename:    The path and name of the data file.
  maxSamples:  Upper bound on the number of rows to be processed
  retval:      A dictionary of dictionaries. The top level keys are the
               field names and the corresponding values are the statistics
               collected for the individual field.
               Example:
               {
                 'consumption': {'min': 0, 'max': 90, 'mean': 50, ...},
                 'gym': {'numDistinctCategories': 10, ...},
                 ...
               }
  """
  # Mapping from field type to stats collector object
  statsCollectorMapping = {'float':    FloatStatsCollector,
                           'int':      IntStatsCollector,
                           'string':   StringStatsCollector,
                           'datetime': DateTimeStatsCollector,
                           'bool':     BoolStatsCollector,
                          }

  filename = resource_filename("nupic.datafiles", filename)
  print "*" * 40
  print "Collecting statistics for file: '%s'" % (filename,)
  dataFile = FileRecordStream(filename)

  # Initialize collector objects
  # statsCollectors list holds statsCollector objects for each field
  statsCollectors = []
  for fieldName, fieldType, fieldSpecial in dataFile.getFields():
    # Find the corresponding stats collector for each field based on field
    # type and initialize an instance
    statsCollector = \
        statsCollectorMapping[fieldType](fieldName, fieldType, fieldSpecial)
    statsCollectors.append(statsCollector)

  # Now collect the stats
  if maxSamples is None:
    maxSamples = 500000
  for i in xrange(maxSamples):
    record = dataFile.getNextRecord()
    if record is None:
      break
    for fieldIdx, value in enumerate(record):
      statsCollectors[fieldIdx].addValue(value)

  # stats dict holds the statistics for each field
  stats = {}
  for statsCollector in statsCollectors:
    statsCollector.getStats(stats)

  # We don't want to include the reset field in permutations
  # TODO: handle reset field in a clean way
  if dataFile.getResetFieldIdx() is not None:
    # NOTE: the original code indexed getFields() with "dataFile.reset",
    # which FileRecordStream does not define; use the reset field index
    # from the same call as the check above.
    resetFieldName, _, _ = dataFile.getFields()[dataFile.getResetFieldIdx()]
    stats.pop(resetFieldName)

  if VERBOSITY > 0:
    pprint.pprint(stats)

  return stats
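# A minimal usage sketch for generateStats, assuming a CSV such as
# "extra/hotgym/hotgym.csv" is available under nupic.datafiles (the path and
# field name in the comment are illustrative):
if __name__ == "__main__":
  stats = generateStats("extra/hotgym/hotgym.csv", maxSamples=1000)
  # e.g. stats['consumption']['min'], stats['consumption']['max'], ...
  pprint.pprint(stats)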
def generateDataset(aggregationInfo, inputFilename, outputFilename=None):
  """Generate a dataset of aggregated values

  Parameters:
  ----------------------------------------------------------------------------
  aggregationInfo: a dictionary that contains the following entries
    - fields: a list of pairs. Each pair is a field name and an aggregation
      function (e.g. sum). The function will be used to aggregate multiple
      values during the aggregation period.

    - aggregation period: 0 or more of unit=value fields; allowed units are:
        [years months] | [weeks days hours minutes seconds
        milliseconds microseconds]
      NOTE: years and months are mutually-exclusive with the other units.
      See getEndTime() and _aggregate() for more details.
      Example1: years=1, months=6,
      Example2: hours=1, minutes=30,
      If none of the period fields are specified or if all that are specified
      have values of 0, then aggregation will be suppressed, and the given
      inputFile parameter value will be returned.

  inputFilename: filename of the input dataset within examples/prediction/data

  outputFilename: name for the output file. If not given, a name will be
      generated based on the input filename and the aggregation params

  retval: Name of the generated output file. This will be the same as the
      input file name if no aggregation needed to be performed

  If the input file contained a time field, sequence id field or reset field
  that were not specified in aggregationInfo fields, those fields will be
  added automatically with the following rules:

  1. The order will be R, S, T, rest of the fields
  2. The aggregation function for all will be to pick the first:
     lambda x: x[0]

  Returns: the path of the aggregated data file if aggregation was performed
    (in the same directory as the given input file); if aggregation did not
    need to be performed, then the given inputFile argument value is returned.
  """

  # Create the input stream
  inputFullPath = resource_filename("nupic.datafiles", inputFilename)
  inputObj = FileRecordStream(inputFullPath)

  # Instantiate the aggregator
  aggregator = Aggregator(aggregationInfo=aggregationInfo,
                          inputFields=inputObj.getFields())

  # Is it a null aggregation? If so, just return the input file unmodified
  if aggregator.isNullAggregation():
    return inputFullPath

  # ------------------------------------------------------------------------
  # If we were not given an output filename, create one based on the
  # aggregation settings
  if outputFilename is None:
    outputFilename = 'agg_%s' % \
        os.path.splitext(os.path.basename(inputFullPath))[0]
    timePeriods = 'years months weeks days '\
                  'hours minutes seconds milliseconds microseconds'
    for k in timePeriods.split():
      if aggregationInfo.get(k, 0) > 0:
        outputFilename += '_%s_%d' % (k, aggregationInfo[k])

    outputFilename += '.csv'
    outputFilename = os.path.join(os.path.dirname(inputFullPath),
                                  outputFilename)

  # ------------------------------------------------------------------------
  # If some other process already started creating this file, simply
  # wait for it to finish and return without doing anything
  lockFilePath = outputFilename + '.please_wait'
  if os.path.isfile(outputFilename) or \
     os.path.isfile(lockFilePath):
    while os.path.isfile(lockFilePath):
      print 'Waiting for %s to be fully written by another process' % \
            lockFilePath
      time.sleep(1)
    return outputFilename

  # Create the lock file
  lockFD = open(lockFilePath, 'w')

  # -------------------------------------------------------------------------
  # Create the output stream
  outputObj = FileRecordStream(streamID=outputFilename, write=True,
                               fields=inputObj.getFields())

  # -------------------------------------------------------------------------
  # Write all aggregated records to the output
  while True:
    inRecord = inputObj.getNextRecord()

    (aggRecord, aggBookmark) = aggregator.next(inRecord, None)

    if aggRecord is None and inRecord is None:
      break

    if aggRecord is not None:
      outputObj.appendRecord(aggRecord)

  # Close the output and release the lock so that waiting processes (and
  # future calls) do not block on a stale .please_wait file. NOTE: the
  # original snippet never removed the lock file, which would stall every
  # subsequent call for the same output.
  outputObj.close()
  lockFD.close()
  os.remove(lockFilePath)

  return outputFilename
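# A minimal usage sketch for generateDataset, assuming an input file under
# examples/prediction/data (both the path and the field names below are
# illustrative; 'sum' and 'first' are used as aggregation function names as
# described in the docstring):
aggregationInfo = {
  'fields': [('consumption', 'sum'),   # sum consumption over each period
             ('timestamp', 'first')],  # keep the first timestamp per period
  'hours': 1,                          # aggregate into 1-hour buckets
}
aggregatedPath = generateDataset(aggregationInfo, 'extra/hotgym/hotgym.csv')
print 'Aggregated data written to:', aggregatedPath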
def testMissingValues(self):
  print "Beginning Missing Data test..."
  filename = _getTempFileName()

  # Some values missing of each type
  # read dataset from disk, retrieve values
  # string should return empty string, numeric types sentinelValue
  print 'Creating tempfile:', filename

  # write dataset to disk with float, int, and string fields
  fields = [FieldMetaInfo('timestamp', FieldMetaType.datetime,
                          FieldMetaSpecial.timestamp),
            FieldMetaInfo('name', FieldMetaType.string,
                          FieldMetaSpecial.none),
            FieldMetaInfo('integer', FieldMetaType.integer,
                          FieldMetaSpecial.none),
            FieldMetaInfo('real', FieldMetaType.float,
                          FieldMetaSpecial.none)]

  s = FileRecordStream(streamID=filename, write=True, fields=fields)

  # Records
  records = (
    [datetime(day=1, month=3, year=2010), 'rec_1', 5, 6.5],
    [datetime(day=2, month=3, year=2010), '', 8, 7.5],
    [datetime(day=3, month=3, year=2010), 'rec_3', '', 8.5],
    [datetime(day=4, month=3, year=2010), 'rec_4', 12, ''],
    [datetime(day=5, month=3, year=2010), 'rec_5', -87657496599, 6.5],
    [datetime(day=6, month=3, year=2010), 'rec_6', 12, -87657496599],
    [datetime(day=6, month=3, year=2010), str(-87657496599), 12, 6.5])

  for r in records:
    s.appendRecord(list(r))

  s.close()

  # Read the standard file
  s = FileRecordStream(streamID=filename, write=False)

  fieldsRead = s.getFields()
  self.assertEqual(fields, fieldsRead)

  recordsRead = []
  while True:
    r = s.getNextRecord()
    if r is None:
      break
    print 'Reading record ...'
    print r
    recordsRead.append(r)

  # Sort the records by date, so we know for sure which is which.
  # NOTE: the original code called sorted() and discarded the result;
  # sort in place so the ordering actually takes effect.
  recordsRead.sort(key=lambda rec: rec[0])

  # empty string
  self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[1][1])

  # missing int
  self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[2][2])

  # missing float
  self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[3][3])

  # sentinel value in input handled correctly for int field
  self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[4][2])

  # sentinel value in input handled correctly for float field
  self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[5][3])

  # sentinel value in input handled correctly for string field
  # this should leave the string as-is, since a missing string
  # is encoded not with a sentinel value but with an empty string
  self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[6][1])
def testBasic(self):
  """Runs basic FileRecordStream tests."""
  filename = _getTempFileName()

  # Write a standard file
  fields = [('name', 'string', ''),
            ('timestamp', 'datetime', 'T'),
            ('integer', 'int', ''),
            ('real', 'float', ''),
            ('reset', 'int', 'R'),
            ('sid', 'string', 'S'),
            ('categoryField', 'int', 'C'),]
  fieldNames = ['name', 'timestamp', 'integer', 'real', 'reset', 'sid',
                'categoryField']

  print 'Creating temp file:', filename

  s = FileRecordStream(streamID=filename, write=True, fields=fields)

  self.assertTrue(s.getDataRowCount() == 0)

  # Records
  records = (
    ['rec_1', datetime(day=1, month=3, year=2010), 5, 6.5, 1, 'seq-1', 10],
    ['rec_2', datetime(day=2, month=3, year=2010), 8, 7.5, 0, 'seq-1', 11],
    ['rec_3', datetime(day=3, month=3, year=2010), 12, 8.5, 0, 'seq-1', 12])

  self.assertTrue(s.getFields() == fields)
  self.assertTrue(s.getNextRecordIdx() == 0)

  print 'Writing records ...'
  for r in records:
    print list(r)
    s.appendRecord(list(r))

  self.assertTrue(s.getDataRowCount() == 3)

  recordsBatch = (
    ['rec_4', datetime(day=4, month=3, year=2010), 2, 9.5, 1, 'seq-1', 13],
    ['rec_5', datetime(day=5, month=3, year=2010), 6, 10.5, 0, 'seq-1', 14],
    ['rec_6', datetime(day=6, month=3, year=2010), 11, 11.5, 0, 'seq-1', 15])

  print 'Adding batch of records...'
  for rec in recordsBatch:
    print rec
  s.appendRecords(recordsBatch)
  self.assertTrue(s.getDataRowCount() == 6)

  s.close()

  # Read the standard file
  s = FileRecordStream(filename)
  self.assertTrue(s.getDataRowCount() == 6)
  self.assertTrue(s.getFieldNames() == fieldNames)

  # Note! this is the number of records read so far
  self.assertTrue(s.getNextRecordIdx() == 0)

  readStats = s.getStats()
  print 'Got stats:', readStats
  expectedStats = {
    'max': [None, None, 12, 11.5, 1, None, 15],
    'min': [None, None, 2, 6.5, 0, None, 10]
  }
  self.assertTrue(readStats == expectedStats)

  readRecords = []
  print 'Reading records ...'
  while True:
    r = s.getNextRecord()
    print r
    if r is None:
      break
    readRecords.append(r)

  allRecords = records + recordsBatch
  for r1, r2 in zip(allRecords, readRecords):
    print 'Expected:', r1
    print 'Read    :', r2
    self.assertTrue(r1 == r2)

  s.close()
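# getStats(), as exercised above, returns per-column minima and maxima with
# None for non-numeric columns. A sketch of how such stats could be computed
# over raw records (computeMinMax is an illustrative helper, not the actual
# FileRecordStream implementation):
def computeMinMax(records, numericColumns):
  """Return {'min': [...], 'max': [...]} with None for non-numeric columns."""
  nCols = len(records[0])
  mins = [None] * nCols
  maxs = [None] * nCols
  for row in records:
    for col in numericColumns:
      value = row[col]
      if mins[col] is None or value < mins[col]:
        mins[col] = value
      if maxs[col] is None or value > maxs[col]:
        maxs[col] = value
  return {'min': mins, 'max': maxs}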
def testMissingValues(self):
  print "Beginning Missing Data test..."
  filename = _getTempFileName()

  # Some values missing of each type
  # read dataset from disk, retrieve values
  # string should return empty string, numeric types sentinelValue
  print 'Creating tempfile:', filename

  # write dataset to disk with float, int, and string fields
  fields = [('timestamp', 'datetime', 'T'),
            ('name', 'string', ''),
            ('integer', 'int', ''),
            ('real', 'float', '')]

  s = FileRecordStream(streamID=filename, write=True, fields=fields)

  # Records
  records = (
    [datetime(day=1, month=3, year=2010), 'rec_1', 5, 6.5],
    [datetime(day=2, month=3, year=2010), '', 8, 7.5],
    [datetime(day=3, month=3, year=2010), 'rec_3', '', 8.5],
    [datetime(day=4, month=3, year=2010), 'rec_4', 12, ''],
    [datetime(day=5, month=3, year=2010), 'rec_5', -87657496599, 6.5],
    [datetime(day=6, month=3, year=2010), 'rec_6', 12, -87657496599],
    [datetime(day=6, month=3, year=2010), str(-87657496599), 12, 6.5])

  for r in records:
    s.appendRecord(list(r))

  s.close()

  # Read the standard file
  s = FileRecordStream(streamID=filename, write=False)

  fieldsRead = s.getFields()
  self.assertTrue(fields == fieldsRead)

  recordsRead = []
  while True:
    r = s.getNextRecord()
    if r is None:
      break
    print 'Reading record ...'
    print r
    recordsRead.append(r)

  # Sort the records by date, so we know for sure which is which.
  # NOTE: the original code called sorted() and discarded the result;
  # sort in place so the ordering actually takes effect.
  recordsRead.sort(key=lambda rec: rec[0])

  # empty string
  self.assertTrue(recordsRead[1][1] == SENTINEL_VALUE_FOR_MISSING_DATA)

  # missing int
  self.assertTrue(recordsRead[2][2] == SENTINEL_VALUE_FOR_MISSING_DATA)

  # missing float
  self.assertTrue(recordsRead[3][3] == SENTINEL_VALUE_FOR_MISSING_DATA)

  # sentinel value in input handled correctly for int field
  self.assertTrue(recordsRead[4][2] != SENTINEL_VALUE_FOR_MISSING_DATA)

  # sentinel value in input handled correctly for float field
  self.assertTrue(recordsRead[5][3] != SENTINEL_VALUE_FOR_MISSING_DATA)

  # sentinel value in input handled correctly for string field
  # this should leave the string as-is, since a missing string
  # is encoded not with a sentinel value but with an empty string
  self.assertTrue(recordsRead[6][1] != SENTINEL_VALUE_FOR_MISSING_DATA)
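# The missing-value convention exercised above: on read, an empty string in a
# numeric column comes back as SENTINEL_VALUE_FOR_MISSING_DATA, while a
# missing string stays an empty string. A sketch of a reader-side check
# (isMissing is an illustrative helper, not part of FileRecordStream):
def isMissing(value, fieldType):
  """Return True if value encodes 'missing' for the given field type."""
  if fieldType in ('int', 'float'):
    return value == SENTINEL_VALUE_FOR_MISSING_DATA
  if fieldType == 'string':
    return value == ''
  return False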