def testClassifyEndpointAsExpected(self):
  """
  Tests ClassificationModelEndpoint.

  Training on the first five samples of the dataset, and testing on the rest,
  the model's classifications should match those in the expected classes
  data file.
  """
  runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                  resultsDir="",
                  experimentName="endpoint_test",
                  load=False,
                  modelName="ClassificationModelEndpoint",
                  modelModuleName="fluent.models.classify_endpoint",
                  numClasses=3,
                  plots=0,
                  orderedSplit=True,
                  trainSize=[5],
                  verbosity=0)
  runner.initModel()
  self.runExperiment(runner)

  expectedClasses, resultClasses = self.getExpectedClassifications(
      runner, os.path.join(DATA_DIR, "responses_expected_classes_endpoint.csv"))

  # A plain loop, not a list comprehension, for the side-effecting asserts.
  for e, r in zip(expectedClasses, resultClasses):
    self.assertEqual(
        sorted(e), sorted(r),
        "Endpoint model predicted classes other than what we expect.")
def testClassifyKeywordsAsExpected(self):
  """
  Tests ClassificationModelKeywords.

  Trains on the first five samples of the dataset and tests on the rest; the
  resulting classifications must match the expected classes data file.
  """
  runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                  resultsDir="",
                  experimentName="keywords_test",
                  load=False,
                  modelName="ClassificationModelKeywords",
                  modelModuleName="fluent.models.classify_keywords",
                  numClasses=3,
                  plots=0,
                  orderedSplit=True,
                  trainSize=[5],
                  verbosity=0)
  runner.initModel()
  self.runExperiment(runner)

  expectedClasses, resultClasses = self.getExpectedClassifications(
      runner, os.path.join(DATA_DIR, "responses_expected_classes_keywords.csv"))

  tieSamples = (7, 9, 12)
  for idx, (expected, actual) in enumerate(zip(expectedClasses, resultClasses)):
    if idx in tieSamples:
      # Ties amongst winning labels are handled randomly, which affects the
      # third classification in these test samples.
      expected = expected[:2]
      actual = actual[:2]
    self.assertEqual(
        sorted(expected), sorted(actual),
        "Keywords model predicted classes other than what we expect.")
def testClassifyKeywordsAsExpected(self):
  """
  Tests ClassificationModelKeywords.

  Trains on the first five samples of the dataset and tests on the rest; the
  resulting classifications must match the expected classes data file.
  """
  modelName = "Keywords"
  runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                  resultsDir="",
                  experimentName="keywords_test",
                  loadPath=None,
                  modelName=modelName,
                  numClasses=3,
                  plots=0,
                  orderedSplit=True,
                  trainSizes=[5],
                  verbosity=0)
  runner.initModel(modelName)
  self.runExperiment(runner)

  expectedClasses, resultClasses = self.getExpectedClassifications(
      runner, os.path.join(DATA_DIR, "responses_expected_classes_keywords.csv"))

  tieSamples = (7, 9, 12)
  for idx, (expected, actual) in enumerate(zip(expectedClasses, resultClasses)):
    if idx in tieSamples:
      # Ties amongst winning labels are handled randomly, which affects the
      # third classification in these test samples.
      expected = expected[:2]
      actual = actual[:2]
    self.assertEqual(
        sorted(expected), sorted(actual),
        "Keywords model predicted classes other than what we expect.")
def testClassifyWordFingerprintsAsExpected(self):
  """
  Tests ClassificationModelFingerprint (for encoder type 'word').

  Training on the first five samples of the dataset, and testing on the rest,
  the model's classifications should match those in the expected classes
  data file.
  """
  modelName = "CioWordFingerprint"
  runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                  resultsDir="",
                  experimentName="fingerprints_test",
                  loadPath=None,
                  modelName=modelName,
                  numClasses=3,
                  plots=0,
                  orderedSplit=True,
                  trainSizes=[5],
                  verbosity=0)
  runner.initModel(modelName)
  runner.model.encoder.fingerprintType = EncoderTypes.word
  self.runExperiment(runner)

  expectedClasses, resultClasses = self.getExpectedClassifications(
      runner,
      os.path.join(DATA_DIR, "responses_expected_classes_fingerprint_word.csv"))

  # One loop replaces the original's leftover debug-print pass and the
  # side-effecting list comprehension; assertEqual reports any mismatch.
  for e, r in zip(expectedClasses, resultClasses):
    self.assertEqual(
        sorted(e), sorted(r),
        "Fingerprint model predicted classes other than what we expect.")
def testClassifyEndpointAsExpected(self):
  """
  Tests ClassificationModelEndpoint.

  Training on the first five samples of the dataset, and testing on the rest,
  the model's classifications should match those in the expected classes
  data file.
  """
  modelName = "CioEndpoint"
  runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                  resultsDir="",
                  experimentName="endpoint_test",
                  loadPath=None,
                  modelName=modelName,
                  numClasses=3,
                  plots=0,
                  orderedSplit=True,
                  trainSizes=[5],
                  verbosity=0)
  runner.initModel(modelName)
  self.runExperiment(runner)

  expectedClasses, resultClasses = self.getExpectedClassifications(
      runner, os.path.join(DATA_DIR, "responses_expected_classes_endpoint.csv"))

  # A plain loop, not a list comprehension, for the side-effecting asserts.
  for e, r in zip(expectedClasses, resultClasses):
    self.assertEqual(
        sorted(e), sorted(r),
        "Endpoint model predicted classes other than what we expect.")
def testClassifyDocumentFingerprintsAsExpected(self):
  """
  Tests ClassificationModelFingerprint (for encoder type 'document').

  Training on the first five samples of the dataset, and testing on the rest,
  the model's classifications should match those in the expected classes
  data file.
  """
  runner = Runner(dataPath=os.path.join(DATA_DIR, "responses.csv"),
                  resultsDir="",
                  experimentName="fingerprints_test",
                  load=False,
                  modelName="ClassificationModelFingerprint",
                  modelModuleName="fluent.models.classify_fingerprint",
                  numClasses=3,
                  plots=0,
                  orderedSplit=True,
                  trainSize=[5],
                  verbosity=0)
  runner.initModel()
  runner.model.encoder.fingerprintType = EncoderTypes.document
  self.runExperiment(runner)

  expectedClasses, resultClasses = self.getExpectedClassifications(
      runner,
      os.path.join(
          DATA_DIR, "responses_expected_classes_fingerprint_document.csv"))

  # A plain loop, not a list comprehension, for the side-effecting asserts.
  for e, r in zip(expectedClasses, resultClasses):
    self.assertEqual(
        sorted(e), sorted(r),
        "Fingerprint model predicted classes other than what we expect.")
class FluentWrapper(object):
  """Wraps a nupic.fluent classification model."""

  def __init__(self, dataPath):
    """
    Initializes a nupic.fluent model with the given sample data.

    :param str dataPath: Path to sample data file. Must be a CSV file having
        'ID and 'Sample' columns
    """
    g_log.info("Initialize nupic.fluent")

    # Build the runner that owns the fingerprint classification model.
    self._fluent = FluentRunner(
        dataPath=dataPath,
        resultsDir="",
        experimentName="imbu_fingerprints",
        load=False,
        modelName="ClassificationModelFingerprint",
        modelModuleName="fluent.models.classify_fingerprint",
        numClasses=1,  # must be >0 to go through training
        plots=0,
        orderedSplit=False,
        trainSizes=[],
        verbosity=0)

    # Train the model on every sample in the data file.
    self._fluent.initModel()
    self._fluent.setupData()
    self._fluent.trainSize = len(self._fluent.samples)
    self._fluent.encodeSamples()
    self._fluent.resetModel(0)
    for sampleIndex in range(self._fluent.trainSize):
      self._fluent.model.trainModel(sampleIndex)

  def query(self, text):
    """
    Queries the fluent model and returns an ordered list of matching documents.

    :param str text: The text to match.

    :returns: a sequence of matching samples.

    ::
    [
        {"id": "1", "text": "sampleText", "score": "0.75"},
        ...
    ]
    """
    # Empty/falsy query text yields no matches.
    if not text:
      return []

    g_log.info("Query model for : %s", text)
    sampleIDs, sampleDists = self._fluent.model.queryModel(text, False)
    return [{"id": sampleID,
             "text": self._fluent.dataDict[sampleID][0],
             "score": dist.item()}
            for sampleID, dist in zip(sampleIDs, sampleDists)]
def __init__(self, dataPath):
  """
  Initializes a nupic.fluent model with the given sample data.

  :param str dataPath: Path to sample data file. Must be a CSV file having
      'ID and 'Sample' columns
  """
  g_log.info("Initialize nupic.fluent")

  # Build the runner that owns the fingerprint classification model.
  self._fluent = FluentRunner(
      dataPath=dataPath,
      resultsDir="",
      experimentName="imbu_fingerprints",
      load=False,
      modelName="ClassificationModelFingerprint",
      modelModuleName="fluent.models.classify_fingerprint",
      numClasses=1,  # must be >0 to go through training
      plots=0,
      orderedSplit=False,
      trainSizes=[],
      verbosity=0)

  # Train the model on every sample in the data file.
  self._fluent.initModel()
  self._fluent.setupData()
  self._fluent.trainSize = len(self._fluent.samples)
  self._fluent.encodeSamples()
  self._fluent.resetModel(0)
  for sampleIndex in range(self._fluent.trainSize):
    self._fluent.model.trainModel(sampleIndex)
def __init__(self, dataPath):
  """
  Initializes a nupic.fluent model with the given sample data.

  :param str dataPath: Path to sample data file. Must be a CSV file having
      'ID and 'Sample' columns
  """
  g_log.info("Initialize nupic.fluent")

  # Build the runner that owns the fingerprint classification model.
  self._fluent = FluentRunner(dataPath=dataPath,
                              resultsDir="",
                              experimentName="imbu_fingerprints",
                              load=False,
                              modelName="ClassificationModelFingerprint",
                              modelModuleName=
                                  "fluent.models.classify_fingerprint",
                              numClasses=1,  # must be >0 to go through training
                              plots=0,
                              orderedSplit=False,
                              trainSizes=[],
                              verbosity=0)

  # Train the model on every sample in the data file.
  self._fluent.initModel()
  self._fluent.setupData()
  self._fluent.trainSize = len(self._fluent.samples)
  self._fluent.encodeSamples()
  self._fluent.resetModel(0)
  for sampleIndex in range(self._fluent.trainSize):
    self._fluent.model.trainModel(sampleIndex)
class FluentWrapper(object):
  """Wraps a nupic.fluent classification model."""

  def __init__(self, dataPath):
    """
    Initializes a nupic.fluent model with the given sample data.

    :param str dataPath: Path to sample data file. Must be a CSV file having
        'ID and 'Sample' columns
    """
    g_log.info("Initialize nupic.fluent")

    # Build the runner that owns the fingerprint classification model.
    self._fluent = FluentRunner(
        dataPath=dataPath,
        resultsDir="",
        experimentName="imbu_fingerprints",
        load=False,
        modelName="ClassificationModelFingerprint",
        modelModuleName="fluent.models.classify_fingerprint",
        numClasses=1,  # must be >0 to go through training
        plots=0,
        orderedSplit=False,
        trainSizes=[],
        verbosity=0)

    # Train the model on every sample in the data file.
    self._fluent.initModel()
    self._fluent.setupData()
    self._fluent.trainSize = len(self._fluent.samples)
    self._fluent.encodeSamples()
    self._fluent.resetModel(0)
    for sampleIndex in range(self._fluent.trainSize):
      self._fluent.model.trainModel(sampleIndex)

  def query(self, text):
    """
    Queries the fluent model and returns an ordered list of matching documents.

    :param str text: The text to match.

    :returns: a sequence of matching samples.

    ::
    [
        {"id": "1", "text": "sampleText", "score": "0.75"},
        ...
    ]
    """
    # Empty/falsy query text yields no matches.
    if not text:
      return []

    g_log.info("Query model for : %s", text)
    sampleIDs, sampleDists = self._fluent.model.queryModel(text, False)
    return [{"id": sampleID,
             "text": self._fluent.dataDict[sampleID][0],
             "score": dist.item()}
            for sampleID, dist in zip(sampleIDs, sampleDists)]
def run(args): start = time.time() root = os.path.dirname(os.path.realpath(__file__)) resultsDir = os.path.join(root, args.resultsDir) runner = Runner(dataPath=args.dataPath, resultsDir=resultsDir, experimentName=args.experimentName, load=args.load, modelName=args.modelName, modelModuleName=args.modelModuleName, numClasses=args.numClasses, plots=args.plots, orderedSplit=args.orderedSplit, trainSize=args.trainSize, verbosity=args.verbosity) runner.initModel() print "Reading in data and preprocessing." dataTime = time.time() runner.setupData() print ("Data setup complete; elapsed time is {0:.2f} seconds.\nNow encoding " "the data".format(time.time() - dataTime)) encodeTime = time.time() runner.encodeSamples() print ("Encoding complete; elapsed time is {0:.2f} seconds.\nNow running the " "experiment.".format(time.time() - encodeTime)) runner.runExperiment() runner.calculateResults() runner.save() print "Experiment complete in {0:.2f} seconds.".format(time.time() - start) if args.validation: print "Validating experiment against expected classifications..." print runner.validateExperiment(args.validation)
def run(args): start = time.time() root = os.path.dirname(os.path.realpath(__file__)) resultsDir = os.path.join(root, args.resultsDir) if os.path.isdir(args.dataPath): runner = MultiRunner(dataPath=args.dataPath, resultsDir=resultsDir, experimentName=args.experimentName, load=args.load, modelName=args.modelName, modelModuleName=args.modelModuleName, numClasses=args.numClasses, plots=args.plots, orderedSplit=args.orderedSplit, trainSize=args.trainSize, verbosity=args.verbosity, test=args.test) elif args.modelName == "ClassificationModelHTM": runner = HTMRunner(dataPath=args.dataPath, resultsDir=resultsDir, experimentName=args.experimentName, load=args.load, modelName=args.modelName, modelModuleName=args.modelModuleName, numClasses=args.numClasses, plots=args.plots, orderedSplit=args.orderedSplit, trainSize=args.trainSize, verbosity=args.verbosity, generateData=args.generateData, votingMethod=args.votingMethod, classificationFile=args.classificationFile, classifierType=args.classifierType) else: runner = Runner(dataPath=args.dataPath, resultsDir=resultsDir, experimentName=args.experimentName, load=args.load, modelName=args.modelName, modelModuleName=args.modelModuleName, numClasses=args.numClasses, plots=args.plots, orderedSplit=args.orderedSplit, trainSize=args.trainSize, verbosity=args.verbosity) if args.modelName != "ClassificationModelHTM": # The data isn't ready yet to initialize an htm model runner.initModel() print "Reading in data and preprocessing." dataTime = time.time() runner.setupData(args.textPreprocess) print( "Data setup complete; elapsed time is {0:.2f} seconds.\nNow encoding " "the data".format(time.time() - dataTime)) encodeTime = time.time() runner.encodeSamples() print( "Encoding complete; elapsed time is {0:.2f} seconds.\nNow running the " "experiment.".format(time.time() - encodeTime)) runner.runExperiment() runner.writeOutClassifications() runner.calculateResults() print "Saving..." 
runner.save() print "Experiment complete in {0:.2f} seconds.".format(time.time() - start) if args.validation: print "Validating experiment against expected classifications..." print runner.validateExperiment(args.validation)
def run(args): start = time.time() if (not isinstance(args.kFolds, int)) or (args.kFolds < 1): raise ValueError("Invalid value for number of cross-validation folds.") root = os.path.dirname(os.path.realpath(__file__)) resultsDir = os.path.join(root, args.resultsDir) if args.modelName == "HTMNetwork": runner = HTMRunner(dataPath=args.dataPath, networkConfigPath=args.networkConfigPath, resultsDir=resultsDir, experimentName=args.experimentName, loadPath=args.loadPath, modelName=args.modelName, numClasses=args.numClasses, plots=args.plots, orderedSplit=args.orderedSplit, trainSizes=[], verbosity=args.verbosity, generateData=args.generateData, votingMethod=args.votingMethod, classificationFile=args.classificationFile, classifierType=args.classifierType) else: runner = Runner(dataPath=args.dataPath, resultsDir=resultsDir, experimentName=args.experimentName, loadPath=args.loadPath, modelName=args.modelName, numClasses=args.numClasses, plots=args.plots, orderedSplit=args.orderedSplit, trainSizes=[], verbosity=args.verbosity) # HTM network data isn't ready yet to initialize the model runner.initModel(args.modelName) print "Reading in data and preprocessing." dataTime = time.time() runner.setupData(args.textPreprocess) # TODO: move kfolds splitting to Runner random = False if args.orderedSplit else True runner.partitions = KFolds(args.kFolds).split(range(len(runner.samples)), randomize=random) runner.trainSizes = [len(x[0]) for x in runner.partitions] print( "Data setup complete; elapsed time is {0:.2f} seconds.\nNow encoding " "the data".format(time.time() - dataTime)) encodeTime = time.time() runner.encodeSamples() print( "Encoding complete; elapsed time is {0:.2f} seconds.\nNow running the " "experiment.".format(time.time() - encodeTime)) runner.runExperiment() print "Experiment complete in {0:.2f} seconds.".format(time.time() - start) resultCalcs = runner.calculateResults() _ = runner.evaluateCumulativeResults(resultCalcs) print "Saving..." 
runner.saveModel() if args.validation: print "Validating experiment against expected classifications..." print runner.validateExperiment(args.validation)