def testClassifyKeywordsAsExpected(self):
    """
    Tests ClassificationModelKeywords.

    The model is trained on the first five samples of the dataset and tested
    on the remainder; its classifications should match those in the expected
    classes data file.
    """
    modelName = "Keywords"
    expectedPath = os.path.join(
        DATA_DIR, "responses_expected_classes_keywords.csv")

    runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                    resultsDir="",
                    experimentName="keywords_test",
                    loadPath=None,
                    modelName=modelName,
                    numClasses=3,
                    plots=0,
                    orderedSplit=True,
                    trainSizes=[5],
                    verbosity=0)
    runner.initModel(modelName)
    self.runExperiment(runner)

    expectedClasses, resultClasses = self.getExpectedClassifications(
        runner, expectedPath)

    # Ties amongst winning labels are handled randomly, which affects the
    # third classification in these test samples, so only the first two
    # labels are compared for them.
    tiedSamples = (7, 9, 12)

    for index, pair in enumerate(zip(expectedClasses, resultClasses)):
        expected, result = pair
        if index in tiedSamples:
            expected, result = expected[:2], result[:2]
        self.assertEqual(
            sorted(expected), sorted(result),
            "Keywords model predicted classes other than what we expect.")
def testClassifyEndpointAsExpected(self):
    """
    Tests ClassificationModelEndpoint.

    Training on the first five samples of the dataset, and testing on the
    rest, the model's classifications should match those in the expected
    classes data file.
    """
    modelName = "CioEndpoint"
    runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                    resultsDir="",
                    experimentName="endpoint_test",
                    loadPath=None,
                    modelName=modelName,
                    numClasses=3,
                    plots=0,
                    orderedSplit=True,
                    trainSizes=[5],
                    verbosity=0)
    runner.initModel(modelName)
    self.runExperiment(runner)

    expectedClasses, resultClasses = self.getExpectedClassifications(
        runner,
        os.path.join(DATA_DIR, "responses_expected_classes_endpoint.csv"))

    # Label order within a sample is not significant, so compare sorted
    # copies. A plain loop is used because the assertions are run purely for
    # their side effect; no result list is needed.
    for e, r in zip(expectedClasses, resultClasses):
        self.assertEqual(
            sorted(e), sorted(r),
            "Endpoint model predicted classes other than what we expect.")
def testClassifyWordFingerprintsAsExpected(self):
    """
    Tests ClassificationModelFingerprint (for encoder type 'word').

    Training on the first five samples of the dataset, and testing on the
    rest, the model's classifications should match those in the expected
    classes data file.
    """
    modelName = "CioWordFingerprint"
    runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                    resultsDir="",
                    experimentName="fingerprints_test",
                    loadPath=None,
                    modelName=modelName,
                    numClasses=3,
                    plots=0,
                    orderedSplit=True,
                    trainSizes=[5],
                    verbosity=0)
    runner.initModel(modelName)
    runner.model.encoder.fingerprintType = EncoderTypes.word
    self.runExperiment(runner)

    expectedClasses, resultClasses = self.getExpectedClassifications(
        runner,
        os.path.join(DATA_DIR,
                     "responses_expected_classes_fingerprint_word.csv"))

    # Diagnostic pass: report every mismatching sample (index, expected,
    # result) before asserting, since the assertions below stop at the first
    # failure. Single-argument print keeps the output identical to the
    # space-separated print statement.
    for i, (e, r) in enumerate(zip(expectedClasses, resultClasses)):
        if sorted(e) != sorted(r):
            print("%s %s %s" % (i, e, r))

    # Label order within a sample is not significant, so compare sorted
    # copies. A plain loop is used because the assertions are run purely for
    # their side effect.
    for e, r in zip(expectedClasses, resultClasses):
        self.assertEqual(
            sorted(e), sorted(r),
            "Fingerprint model predicted classes other than what we expect.")
def testClassifyDocumentFingerprintsAsExpected(self):
    """
    Tests ClassificationModelFingerprint (for encoder type 'document').

    Training on the first five samples of the dataset, and testing on the
    rest, the model's classifications should match those in the expected
    classes data file.
    """
    # NOTE(review): this test passes `load`, `modelModuleName` and `trainSize`
    # and calls `initModel()` without arguments, whereas the other tests in
    # this file pass `loadPath` / `trainSizes` and call `initModel(modelName)`.
    # Confirm which Runner signature is current before relying on this test.
    runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                    resultsDir="",
                    experimentName="fingerprints_test",
                    load=False,
                    modelName="ClassificationModelFingerprint",
                    modelModuleName="fluent.models.classify_fingerprint",
                    numClasses=3,
                    plots=0,
                    orderedSplit=True,
                    trainSize=[5],
                    verbosity=0)
    runner.initModel()
    runner.model.encoder.fingerprintType = EncoderTypes.document
    self.runExperiment(runner)

    expectedClasses, resultClasses = self.getExpectedClassifications(
        runner,
        os.path.join(DATA_DIR,
                     "responses_expected_classes_fingerprint_document.csv"))

    # Label order within a sample is not significant, so compare sorted
    # copies. A plain loop is used because the assertions are run purely for
    # their side effect.
    for e, r in zip(expectedClasses, resultClasses):
        self.assertEqual(
            sorted(e), sorted(r),
            "Fingerprint model predicted classes other than what we expect.")
def run(args):
    """
    Run one classification experiment as described by the parsed arguments.

    A MultiRunner is used when args.dataPath is a directory, an HTMRunner when
    the model is "ClassificationModelHTM", and a plain Runner otherwise. The
    runner then sets up and encodes the data, runs the experiment, writes out
    and scores the classifications, and saves. When args.validation is set,
    the outcome of runner.validateExperiment() is printed as well.
    """
    start = time.time()

    root = os.path.dirname(os.path.realpath(__file__))
    resultsDir = os.path.join(root, args.resultsDir)
    isHTM = args.modelName == "ClassificationModelHTM"

    # Keyword arguments shared by every runner type.
    sharedKwargs = dict(dataPath=args.dataPath,
                        resultsDir=resultsDir,
                        experimentName=args.experimentName,
                        load=args.load,
                        modelName=args.modelName,
                        modelModuleName=args.modelModuleName,
                        numClasses=args.numClasses,
                        plots=args.plots,
                        orderedSplit=args.orderedSplit,
                        trainSize=args.trainSize,
                        verbosity=args.verbosity)

    if os.path.isdir(args.dataPath):
        runner = MultiRunner(test=args.test, **sharedKwargs)
    elif isHTM:
        runner = HTMRunner(generateData=args.generateData,
                           votingMethod=args.votingMethod,
                           classificationFile=args.classificationFile,
                           classifierType=args.classifierType,
                           **sharedKwargs)
    else:
        runner = Runner(**sharedKwargs)

    if not isHTM:
        # The data isn't ready yet to initialize an htm model
        runner.initModel()

    print("Reading in data and preprocessing.")
    dataTime = time.time()
    runner.setupData(args.textPreprocess)
    dataElapsed = time.time() - dataTime
    print("Data setup complete; elapsed time is {0:.2f} seconds.\nNow encoding "
          "the data".format(dataElapsed))

    encodeTime = time.time()
    runner.encodeSamples()
    encodeElapsed = time.time() - encodeTime
    print("Encoding complete; elapsed time is {0:.2f} seconds.\nNow running the "
          "experiment.".format(encodeElapsed))

    runner.runExperiment()
    runner.writeOutClassifications()
    runner.calculateResults()

    print("Saving...")
    runner.save()

    print("Experiment complete in {0:.2f} seconds.".format(time.time() - start))

    if args.validation:
        print("Validating experiment against expected classifications...")
        print(runner.validateExperiment(args.validation))
def run(args):
    """
    Run a k-fold cross-validation classification experiment.

    An HTMRunner is built when args.modelName is "HTMNetwork"; otherwise a
    Runner is built and its model is initialized immediately. The samples are
    split into args.kFolds partitions (randomized unless args.orderedSplit is
    set), encoded, and run through the experiment; results are then
    calculated and evaluated cumulatively, and the model is saved. When
    args.validation is set, the outcome of runner.validateExperiment() is
    printed as well.

    @param args   Parsed command-line arguments; the attributes read are those
                  referenced in the body below.

    @raises ValueError if args.kFolds is not an int greater than or equal to 1.
    """
    start = time.time()

    if (not isinstance(args.kFolds, int)) or (args.kFolds < 1):
        raise ValueError("Invalid value for number of cross-validation folds.")

    root = os.path.dirname(os.path.realpath(__file__))
    resultsDir = os.path.join(root, args.resultsDir)

    # Keyword arguments shared by both runner types. trainSizes starts empty
    # and is filled in from the k-fold partitions once the data is loaded.
    sharedKwargs = dict(dataPath=args.dataPath,
                        resultsDir=resultsDir,
                        experimentName=args.experimentName,
                        loadPath=args.loadPath,
                        modelName=args.modelName,
                        numClasses=args.numClasses,
                        plots=args.plots,
                        orderedSplit=args.orderedSplit,
                        trainSizes=[],
                        verbosity=args.verbosity)

    if args.modelName == "HTMNetwork":
        # HTM network data isn't ready yet to initialize the model
        runner = HTMRunner(networkConfigPath=args.networkConfigPath,
                           generateData=args.generateData,
                           votingMethod=args.votingMethod,
                           classificationFile=args.classificationFile,
                           classifierType=args.classifierType,
                           **sharedKwargs)
    else:
        runner = Runner(**sharedKwargs)
        runner.initModel(args.modelName)

    print("Reading in data and preprocessing.")
    dataTime = time.time()
    runner.setupData(args.textPreprocess)

    # TODO: move kfolds splitting to Runner
    # An ordered split means the folds must not be shuffled.
    randomize = not args.orderedSplit
    runner.partitions = KFolds(args.kFolds).split(
        range(len(runner.samples)), randomize=randomize)
    # The first element of each partition is measured to get that fold's
    # training size.
    runner.trainSizes = [len(x[0]) for x in runner.partitions]
    print("Data setup complete; elapsed time is {0:.2f} seconds.\nNow encoding "
          "the data".format(time.time() - dataTime))

    encodeTime = time.time()
    runner.encodeSamples()
    print("Encoding complete; elapsed time is {0:.2f} seconds.\nNow running the "
          "experiment.".format(time.time() - encodeTime))

    runner.runExperiment()
    print("Experiment complete in {0:.2f} seconds.".format(time.time() - start))

    resultCalcs = runner.calculateResults()
    # Called for its side effects only; the return value is not used here.
    runner.evaluateCumulativeResults(resultCalcs)

    print("Saving...")
    runner.saveModel()

    if args.validation:
        print("Validating experiment against expected classifications...")
        print(runner.validateExperiment(args.validation))