def testWindowEncodings(self):
    """Check CioEncoder.getWindowEncoding() against the stored window data."""
    encoder = CioEncoder(fingerprintType=EncoderTypes.word)

    text = (
        " I grok people. I am people, so now I can say it in people talk. "
        "I've found out why people laugh. They laugh because it hurts so much, "
        "because it's the only thing that'll make it stop hurting."
    )

    tokens = TextPreprocess().tokenize(text)
    windows = encoder.getWindowEncoding(tokens, minSparsity=0.19)

    # Only sufficiently dense windows should be encoded, so there must be
    # fewer window encodings than tokens.
    self.assertTrue(
        len(windows) < len(tokens),
        "Returned incorrect number of window encodings.")

    # Compare the final (largest) window against the stored test data.
    expected = getTestData("cio_encoding_window.json")
    self.assertEqual(
        expected["text"],
        windows[-1]["text"],
        "Window encoding represents the wrong text.")

    withinMax = windows[-1]["sparsity"] <= encoder.unionSparsity
    self.assertTrue(
        withinMax, "Sparsity for large window is larger than the max.")

    actualBitmap = windows[-1]["bitmap"].tolist()
    self.assertSequenceEqual(
        expected["bitmap"],
        actualBitmap,
        "Window encoding's bitmap is not as expected.")
def testWordFingerprint(self):
    """Test the Cortical.io term (word-level) encoding."""
    encoder = CioEncoder(fingerprintType=EncoderTypes.word)
    actual = encoder.encode(self.text)

    self.assertFingerprintFields(actual)

    # The encoded positions must match the stored word-level test data.
    expected = getTestData("cio_encoding_word.json")
    self.assertEqual(
        expected["fingerprint"]["positions"],
        actual["fingerprint"]["positions"],
        "Cio bitmap is not as expected.")
def __init__(self, verbosity=1, numLabels=1):
    """
    Set up the CioEncoder and CorticalClient used by this model.

    Requires a valid API key; the key held by the encoder is reused for the
    client.

    @param verbosity  (int)  Forwarded to the base class constructor.
    @param numLabels  (int)  Stored as self.numLabels.
    """
    super(ClassificationModelContext, self).__init__(verbosity)

    # The encoder cache is a "CioCache" directory next to this module.
    moduleDir = os.path.dirname(os.path.realpath(__file__))
    cacheDir = os.path.join(moduleDir, "CioCache")
    self.encoder = CioEncoder(cacheDir=cacheDir)
    self.client = CorticalClient(self.encoder.apiKey)

    self.numLabels = numLabels
    self.categoryBitmaps = {}

    # NOTE(review): targetSparsity is treated as a percentage here. Under
    # Python 2 the division truncates if targetSparsity is an int -- confirm
    # the encoder stores it as a float.
    self.n = self.encoder.n
    sparsityFraction = self.encoder.targetSparsity / 100
    self.w = int(sparsityFraction * self.n)
def testDocumentFingerprint(self):
    """Test the Cortical.io text (document-level) encoding."""
    encoder = CioEncoder(fingerprintType=EncoderTypes.document)
    actual = encoder.encode(self.text)

    self.assertFingerprintFields(actual)

    # The encoded positions must match the stored document-level test data.
    expected = getTestData("cio_encoding_document.json")
    self.assertEqual(
        expected["fingerprint"]["positions"],
        actual["fingerprint"]["positions"],
        "Cio bitmap is not as expected.")
def __init__(self,
             fingerprintType=EncoderTypes.word,
             unionSparsity=0.20,
             retinaScaling=1.0,
             retina="en_associative",
             apiKey=None,
             k=1,
             classifierMetric="rawOverlap",
             cacheRoot=None,
             **kwargs):
    """
    Create the kNN classifier and the Cortical.io encoder for this model.

    A valid API key is needed for the Cortical.io encoder (see the CioEncoder
    constructor for details).

    @param fingerprintType   Must be EncoderTypes.document or
                             EncoderTypes.word.
    @param unionSparsity     Forwarded to CioEncoder.
    @param retinaScaling     Forwarded to CioEncoder.
    @param retina            Forwarded to CioEncoder.
    @param apiKey            Forwarded to CioEncoder.
    @param k                 Forwarded to KNNClassifier as k.
    @param classifierMetric  Forwarded to KNNClassifier as distanceMethod.
    @param cacheRoot         Forwarded to CioEncoder as cacheDir.
    @param kwargs            Forwarded to the base class constructor.

    @raise ValueError  If fingerprintType is not one of the two supported
                       encoder types.
    """
    super(ClassificationModelFingerprint, self).__init__(**kwargs)

    self.classifier = KNNClassifier(k=k,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=self.verbosity - 1)

    # The previous test, `fingerprintType is (not A or not B)`, compared the
    # argument against a boolean and so could never reject anything.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
        raise ValueError("Invalid type of fingerprint encoding; see the "
                         "EncoderTypes class for eligble types.")

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey,
                              cacheDir=cacheRoot)

    self.currentDocument = None
def __init__(self,
             verbosity=1,
             numLabels=3,
             modelDir="ClassificationModelFingerprint",
             fingerprintType=EncoderTypes.word,
             unionSparsity=0.20,
             retinaScaling=1.0,
             retina="en_associative",
             apiKey=None,
             classifierMetric="rawOverlap",
             cacheRoot=None):
    """
    Create the kNN classifier and the Cortical.io encoder for this model.

    A valid API key is needed for the encoder (see CioEncoder init for
    details).

    @param verbosity         Forwarded to the base class; the classifier gets
                             verbosity - 1.
    @param numLabels         Forwarded to the base class and used as the
                             classifier's k.
    @param modelDir          Forwarded to the base class.
    @param fingerprintType   Must be EncoderTypes.document or
                             EncoderTypes.word.
    @param unionSparsity     Forwarded to CioEncoder.
    @param retinaScaling     Forwarded to CioEncoder.
    @param retina            Forwarded to CioEncoder.
    @param apiKey            Forwarded to CioEncoder.
    @param classifierMetric  Forwarded to KNNClassifier as distanceMethod.
    @param cacheRoot         Directory that holds the "CioCache" folder;
                             defaults to this module's directory.

    @raise ValueError  If fingerprintType is not one of the two supported
                       encoder types.
    """
    super(ClassificationModelFingerprint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=verbosity - 1)

    # The previous test, `fingerprintType is (not A or not B)`, compared the
    # argument against a boolean and so could never reject anything.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
        raise ValueError("Invaid type of fingerprint encoding; see the "
                         "EncoderTypes class for eligble types.")

    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))
    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=os.path.join(cacheRoot, "CioCache"),
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)
def __init__(self,
             verbosity=1,
             numLabels=3,
             modelDir="ClassificationModelWindow",
             unionSparsity=0.20,
             retinaScaling=1.0,
             retina="en_associative",
             apiKey=None,
             classifierMetric="rawOverlap",
             cacheRoot=None):
    """
    Create the kNN classifier and the word-level Cortical.io encoder.

    A valid API key is needed (see CioEncoder init for details). When
    cacheRoot is not given, this module's directory is used as the parent of
    the "CioCache" folder.
    """
    super(ClassificationModelWindows, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    # Window patterns whose sparsity falls below this value are skipped.
    self.minSparsity = 0.9 * unionSparsity

    classifierParams = dict(k=numLabels,
                            distanceMethod=classifierMetric,
                            exact=False,
                            verbosity=verbosity - 1)
    self.classifier = KNNClassifier(**classifierParams)

    if not cacheRoot:
        cacheRoot = os.path.dirname(os.path.realpath(__file__))
    cacheDir = os.path.join(cacheRoot, "CioCache")

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=cacheDir,
                              fingerprintType=EncoderTypes.word,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)
def testRetinaScaling(self):
    """Test the CioEncoder for retina dimension scaling."""
    encoder = CioEncoder(retinaScaling=0.25,
                         fingerprintType=EncoderTypes.document)
    scaled = encoder.encode(self.text)

    # The scaled encoding must match the stored scaled-retina test data.
    expectedScaled = getTestData("cio_encoding_scaled_retina.json")
    self.assertEqual(
        expectedScaled["fingerprint"]["positions"],
        scaled["fingerprint"]["positions"],
        "Cio bitmap is not as expected.")

    # The scaled fingerprint may not have more on bits than the full one.
    expectedFull = getTestData("cio_encoding_document.json")
    fullCount = len(expectedFull["fingerprint"]["positions"])
    scaledCount = len(scaled["fingerprint"]["positions"])
    self.assertTrue(
        scaledCount <= fullCount,
        "Retina scaling did not decrease the fingerprint size.")
def testRetinaScaling(self):
    """Test the CioEncoder for retina dimension scaling."""
    encoder = CioEncoder(retinaScaling=0.25,
                         fingerprintType=EncoderTypes.document)
    scaled = encoder.encode(self.text)

    # The scaled encoding must match the stored scaled-retina test data.
    expectedScaled = getTestData("cio_encoding_scaled_retina.json")
    self.assertEqual(
        expectedScaled["fingerprint"]["positions"],
        scaled["fingerprint"]["positions"],
        "Cio bitmap is not as expected.")

    # The scaled fingerprint may not have more on bits than the full one.
    expectedFull = getTestData("cio_encoding_document.json")
    fullCount = len(expectedFull["fingerprint"]["positions"])
    scaledCount = len(scaled["fingerprint"]["positions"])
    self.assertTrue(
        scaledCount <= fullCount,
        "Retina scaling did not decrease the fingerprint size.")
def initModel(self):
    """
    Build a CioEncoder from this model's settings and configure the network.

    @return  The result of configureNetwork() called with None,
             self.networkConfig and the new encoder.
    """
    encoderParams = dict(retinaScaling=self.retinaScaling,
                         retina=self.retina,
                         apiKey=self.apiKey,
                         maxSparsity=self.maxSparsity,
                         verbosity=self.verbosity - 1)

    # This encoder specifies the LanguageSensor output width.
    encoder = CioEncoder(**encoderParams)
    return configureNetwork(None, self.networkConfig, encoder)
def __init__(self,
             verbosity=1,
             numLabels=3,
             modelDir="ClassificationModelEndpoint",
             unionSparsity=0.20):
    """
    Initialize the model with a CioEncoder; requires a valid API key.

    The encoder cache is a "CioCache" directory next to this module. A
    LanguageEncoder is also created as self.compareEncoder.
    """
    super(ClassificationModelEndpoint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    moduleDir = os.path.dirname(os.path.realpath(__file__))
    cacheDir = os.path.join(moduleDir, "CioCache")
    self.encoder = CioEncoder(cacheDir=cacheDir, unionSparsity=unionSparsity)
    self.compareEncoder = LanguageEncoder()

    # NOTE(review): targetSparsity is treated as a percentage here. Under
    # Python 2 the division truncates if targetSparsity is an int -- confirm
    # the encoder stores it as a float.
    self.n = self.encoder.n
    sparsityFraction = self.encoder.targetSparsity / 100
    self.w = int(sparsityFraction * self.n)

    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)
def __init__(self,
             verbosity=1,
             numLabels=3,
             modelDir="ClassificationModelFingerprint",
             fingerprintType=EncoderTypes.word,
             unionSparsity=0.20,
             retinaScaling=1.0,
             retina="en_associative",
             apiKey=None,
             classifierMetric="rawOverlap",
             cacheRoot=None):
    """
    Create the kNN classifier and the Cortical.io encoder for this model.

    A valid API key is needed for the encoder (see CioEncoder init for
    details).

    @param verbosity         Forwarded to the base class; the classifier gets
                             verbosity - 1.
    @param numLabels         Forwarded to the base class and used as the
                             classifier's k.
    @param modelDir          Forwarded to the base class.
    @param fingerprintType   Must be EncoderTypes.document or
                             EncoderTypes.word.
    @param unionSparsity     Forwarded to CioEncoder.
    @param retinaScaling     Forwarded to CioEncoder.
    @param retina            Forwarded to CioEncoder.
    @param apiKey            Forwarded to CioEncoder.
    @param classifierMetric  Forwarded to KNNClassifier as distanceMethod.
    @param cacheRoot         Directory that holds the "CioCache" folder;
                             defaults to this module's directory.

    @raise ValueError  If fingerprintType is not one of the two supported
                       encoder types.
    """
    super(ClassificationModelFingerprint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=verbosity - 1)

    # The previous test, `fingerprintType is (not A or not B)`, compared the
    # argument against a boolean and so could never reject anything.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
        raise ValueError("Invaid type of fingerprint encoding; see the "
                         "EncoderTypes class for eligble types.")

    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))
    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=os.path.join(cacheRoot, "CioCache"),
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)
def _initModel(self, k):
    """
    Create the document-level encoder and configure the network.

    Writes k and self.numLabels into the classifier region parameters of the
    module-level modelConfig, stores that config as self.networkConfig, and
    stores the configured network as self.network.

    @param k  Value written to the classifier region's "k" parameter.
    """
    encoderParams = dict(retinaScaling=self.retinaScaling,
                         retina=self.retina,
                         fingerprintType=EncoderTypes.document,
                         apiKey=self.apiKey,
                         verbosity=self.verbosity - 1)
    encoder = CioEncoder(**encoderParams)

    # modelConfig is shared module state; it is updated in place.
    regionParams = modelConfig["classifierRegionConfig"]["regionParams"]
    regionParams["k"] = k
    regionParams["maxCategoryCount"] = self.numLabels

    self.networkConfig = modelConfig
    self.network = configureNetwork(None, self.networkConfig, encoder)
def _initModel(self, k):
    """
    Create the document-level encoder and configure the network.

    Writes k and self.numLabels into the classifier region parameters of the
    module-level modelConfig, stores that config as self.networkConfig, and
    stores the configured network as self.network.

    @param k  Value written to the classifier region's "k" parameter.
    """
    # The encoder cache is a "CioCache" directory next to this module.
    moduleDir = os.path.dirname(os.path.realpath(__file__))
    cacheDir = os.path.join(moduleDir, "CioCache")
    encoder = CioEncoder(retinaScaling=self.retinaScaling,
                         cacheDir=cacheDir,
                         retina=self.retina,
                         fingerprintType=EncoderTypes.document,
                         apiKey=self.apiKey)

    # modelConfig is shared module state; it is updated in place.
    regionParams = modelConfig["classifierRegionConfig"]["regionParams"]
    regionParams["k"] = k
    regionParams["maxCategoryCount"] = self.numLabels

    self.networkConfig = modelConfig
    self.network = configureNetwork(None, self.networkConfig, encoder)
def initModel(self):
    """
    Initialize the network; self.networkDataPath must already be set.

    A FileRecordStream is opened on self.networkDataPath unless it is None,
    in which case no record stream is passed to configureNetwork().

    @return  The result of configureNetwork().
    """
    dataPath = self.networkDataPath
    recordStream = (None if dataPath is None
                    else FileRecordStream(streamID=dataPath))

    # The encoder cache is a "CioCache" directory next to this module.
    moduleDir = os.path.dirname(os.path.realpath(__file__))
    cacheDir = os.path.join(moduleDir, "CioCache")

    # This encoder specifies the LanguageSensor output width.
    encoder = CioEncoder(retinaScaling=self.retinaScaling,
                         cacheDir=cacheDir,
                         retina=self.retina,
                         apiKey=self.apiKey)

    return configureNetwork(recordStream, self.networkConfig, encoder)
def testRetinaScaling(self):
    """Test the CioEncoder for retina dimension scaling."""
    docType = EncoderTypes.document
    fullEncoder = CioEncoder(retinaScaling=1.0, fingerprintType=docType)
    halfEncoder = CioEncoder(retinaScaling=0.5, fingerprintType=docType)
    oddEncoder = CioEncoder(retinaScaling=0.71, fingerprintType=docType)

    # Encoder dimensions follow the scaling factor (truncated to int).
    self.assertAlmostEqual(int(0.5 * fullEncoder.width), halfEncoder.width)
    self.assertAlmostEqual(int(0.5 * fullEncoder.height), halfEncoder.height)
    self.assertAlmostEqual(int(0.71 * fullEncoder.height), oddEncoder.height)

    fullResponse = fullEncoder.encode(self.text)
    halfResponse = halfEncoder.encode(self.text)
    oddResponse = oddEncoder.encode(self.text)

    fullBits = fullResponse["fingerprint"]["positions"]
    halfBits = halfResponse["fingerprint"]["positions"]
    oddBits = oddResponse["fingerprint"]["positions"]

    # Each bit position should be scaled down by retinaScaling*retinaScaling
    self.assertLessEqual(halfBits.sum(), 0.5 * 0.5 * fullBits.sum())
    self.assertLessEqual(oddBits.sum(), 0.71 * 0.71 * fullBits.sum())

    # The number of on bits in scaled retina should normally be slightly less
    # than the original, but can be equal in some cases
    self.assertLessEqual(len(halfBits), len(fullBits))
    self.assertLessEqual(len(halfBits), len(oddBits))

    # Check that encodeIntoArray works even with weird scaling
    output = numpy.zeros(oddEncoder.width * oddEncoder.height)
    oddEncoder.encodeIntoArray(self.text, output)
    self.assertEqual(len(oddBits), output.sum())
def __init__(self,
             verbosity=1,
             numLabels=3,
             modelDir="ClassificationModelEndpoint",
             unionSparsity=0.20):
    """
    Initialize the model with a CioEncoder; requires a valid API key.

    The encoder cache is a "CioCache" directory next to this module. A
    LanguageEncoder is also created as self.compareEncoder.
    """
    super(ClassificationModelEndpoint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    moduleDir = os.path.dirname(os.path.realpath(__file__))
    cacheDir = os.path.join(moduleDir, "CioCache")
    self.encoder = CioEncoder(cacheDir=cacheDir, unionSparsity=unionSparsity)
    self.compareEncoder = LanguageEncoder()

    # NOTE(review): targetSparsity is treated as a percentage here. Under
    # Python 2 the division truncates if targetSparsity is an int -- confirm
    # the encoder stores it as a float.
    self.n = self.encoder.n
    sparsityFraction = self.encoder.targetSparsity / 100
    self.w = int(sparsityFraction * self.n)

    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)
def testMaxSparsity(self):
    """Test that CioEncoder's maxSparsity works."""
    # This text seems to generate bitmaps with about 8% sparsity
    text = ("Smoking harms nearly every organ in your body. Over 7000 chemicals"
            " have been identified in tobacco smoke. After reading all this"
            " James and Sue decided to abruptly quit cigarette smoking to"
            " improve their health but it clearly was not an easy decision.")

    # Encoders with maxSparsity of 100%, 10%, 5%, and 1%, in that order.
    limits = (1.0, 0.1, 0.05, 0.01)
    encoders = [
        CioEncoder(maxSparsity=limit, fingerprintType=EncoderTypes.document)
        for limit in limits
    ]
    bitmapSize = encoders[0].width * encoders[0].height

    responses = [encoder.encode(text) for encoder in encoders]
    lengths = [len(r["fingerprint"]["positions"]) for r in responses]

    # Encodings must have no more than desired sparsity
    for response, limit in zip(responses, limits):
        self.assertLessEqual(response["sparsity"], limit)

    self.assertLessEqual(lengths[0], bitmapSize)
    for length, limit in zip(lengths[1:], limits[1:]):
        self.assertLessEqual(length, limit * bitmapSize)

    # Encodings can't be zero
    for length in lengths:
        self.assertGreater(length, 0)

    # Encodings must have complete overlap with the next higher encoding
    bitSets = [set(r["fingerprint"]["positions"]) for r in responses]
    for idx in (1, 2, 3):
        overlap = bitSets[idx - 1] & bitSets[idx]
        self.assertEqual(len(overlap), lengths[idx])

    # Test that if you encode a second time, you get the same bitmap
    repeats = [encoder.encode(text) for encoder in encoders]
    for first, second in zip(responses, repeats):
        firstDigest = hashlib.sha224(str(first)).hexdigest()
        secondDigest = hashlib.sha224(str(second)).hexdigest()
        self.assertEqual(firstDigest, secondDigest)
class ClassificationModelWindows(ClassificationModel):
    """
    Class to run classification tasks with a sliding window of Cortical.io
    word fingerprint encodings.
    """

    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelWindow",
                 unionSparsity=0.20,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 classifierMetric="rawOverlap"):
        """
        Create the kNN classifier and the word-level Cortical.io encoder.

        A valid API key is needed (see CioEncoder init for details). The
        encoder cache is a "CioCache" directory next to this module.
        """
        super(ClassificationModelWindows, self).__init__(
            verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

        # window patterns below minSparsity will be skipped over
        self.minSparsity = 0.9 * unionSparsity

        self.classifier = KNNClassifier(k=numLabels,
                                        distanceMethod=classifierMetric,
                                        exact=False,
                                        verbosity=verbosity - 1)

        root = os.path.dirname(os.path.realpath(__file__))
        self.encoder = CioEncoder(retinaScaling=retinaScaling,
                                  cacheDir=os.path.join(root, "CioCache"),
                                  fingerprintType=EncoderTypes.word,
                                  unionSparsity=unionSparsity,
                                  retina=retina,
                                  apiKey=apiKey)

    def encodeSample(self, sample):
        """
        Encode an SDR of the input string by querying the Cortical.io API for
        each word. The resulting bitmaps are unionized in a sliding window.

        @param sample     (list)        Tokenized sample, where each item is
                                        a str.
        @return           (list)        Pattern dicts for the windows, each
                                        with the sample text, sparsity, and
                                        bitmap.
        """
        return self.encoder.getWindowEncoding(sample, self.minSparsity)

    def writeOutEncodings(self):
        """
        Write the encoding dictionaries to a txt file; overrides the
        superclass implementation.

        @raise ValueError  If self.modelDir is not an existing directory.
        """
        if not os.path.isdir(self.modelDir):
            raise ValueError("Invalid path to write file.")

        # Cast numpy arrays to list objects for serialization. Work on a deep
        # copy so the in-memory patterns keep their numpy arrays.
        jsonPatterns = copy.deepcopy(self.patterns)
        for jp in jsonPatterns:
            for tokenPattern in jp["pattern"]:
                tokenPattern["bitmap"] = tokenPattern.get(
                    "bitmap", numpy.array([])).tolist()
            jp["labels"] = jp.get("labels", numpy.array([])).tolist()

        with open(os.path.join(self.modelDir, "encoding_log.txt"), "w") as f:
            f.write(json.dumps(jsonPatterns, indent=1))

    def trainModel(self, i):
        # TODO: add batch training, where i is a list
        """
        Train the classifier on the sample and labels for record i. The list
        sampleReference is populated to correlate classifier prototypes to
        sample IDs. This model is unique in that a single sample contains
        multiple encoded patterns, of which, any that are too sparse are
        skipped over.

        @return       (int)     Number of patterns trained on; 0 when the
                                record has no window patterns.
        """
        record = self.patterns[i]
        patternWindows = record["pattern"]
        if len(patternWindows) == 0:
            # No patterns b/c no windows were large enough for encoding.
            # Return 0 (not None) so the documented int contract holds.
            return 0

        count = 0
        for window in patternWindows:
            for label in record["labels"]:
                self.classifier.learn(
                    window["bitmap"], label, isSparse=self.encoder.n)
                # One reference entry per learned prototype; queryModel()
                # relies on this one-to-one correspondence.
                self.sampleReference.append(record["ID"])
                count += 1

        return count

    def testModel(self, i, seed=42):
        """
        Test the model on record i. Returns the classifications most frequent
        amongst the classifications of the sample's individual tokens.
        We ignore the terms that are unclassified, picking the most frequent
        classifications among those that are detected; in getWinningLabels().

        @return       (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
        """
        totalInferenceResult = None
        for pattern in self.patterns[i]["pattern"]:
            if not pattern:
                continue

            _, inferenceResult, _, _ = self.classifier.infer(
                self.sparsifyPattern(pattern["bitmap"], self.encoder.n))

            # Accumulate the per-window inference results.
            if totalInferenceResult is None:
                totalInferenceResult = inferenceResult
            else:
                totalInferenceResult += inferenceResult

        return self.getWinningLabels(totalInferenceResult, seed)

    def queryModel(self, query, preprocess=False):
        """
        Preprocesses the query, encodes it into a pattern, then queries the
        classifier to infer distances to trained-on samples.

        @return       (list)          Two-tuples of sample ID and distance,
                                      sorted closest to farthest from the
                                      query.
        @raise IndexError  If the number of prototype distances differs from
                           the number of entries in sampleReference.
        """
        if preprocess:
            sample = TextPreprocess().tokenize(
                query,
                ignoreCommon=100,
                removeStrings=["[identifier deleted]"],
                correctSpell=True)
        else:
            sample = TextPreprocess().tokenize(query)

        # Get window patterns for the query, but if the query is too small
        # such that the window encodings are too sparse, we default to a pure
        # union.
        encodedQuery = self.encodeSample(sample)
        if len(encodedQuery) == 0:
            sample = " ".join(sample)
            fpInfo = self.encoder.getUnionEncoding(sample)
            encodedQuery = [{
                "text": fpInfo["text"],
                "sparsity": fpInfo["sparsity"],
                "bitmap": numpy.array(fpInfo["fingerprint"]["positions"])
            }]

        allDistances = self.infer(encodedQuery)

        if len(allDistances) != len(self.sampleReference):
            raise IndexError(
                "Number of protoype distances must match number of "
                "samples trained on.")

        # Keep the smallest distance seen for each sample ID in one pass,
        # instead of rescanning the whole reference list per ID.
        sampleDistances = {}
        for uniqueID, distance in zip(self.sampleReference, allDistances):
            if (uniqueID not in sampleDistances
                    or distance < sampleDistances[uniqueID]):
                sampleDistances[uniqueID] = distance

        return sorted(sampleDistances.items(), key=operator.itemgetter(1))

    def infer(self, patterns):
        """
        Get the classifier output for the input patterns; assumes classifier
        has an infer() method (as specified in NuPIC kNN implementation). For
        this model we sum the distances across the patterns and normalize
        before returning.

        NOTE: there is no check here that the pattern sparsities are > the
        minimum.

        @param patterns   (list)        Pattern dicts, each with a "bitmap".
        @return           (numpy.array) Each entry is the distance from the
                                        input pattern to that prototype
                                        (pattern in the classifier). All
                                        zeros when patterns is empty.
        """
        distances = numpy.zeros((self.classifier._numPatterns))
        numPatterns = len(patterns)
        if numPatterns == 0:
            # Nothing to average; previously this path failed on the unbound
            # loop variable.
            return distances

        for p in patterns:
            (_, _, dist, _) = self.classifier.infer(
                self.sparsifyPattern(p["bitmap"], self.encoder.n))
            distances = distances + dist

        return distances / float(numPatterns)
class ClassificationModelWindows(ClassificationModel):
    """
    Class to run classification tasks with a sliding window of Cortical.io
    word fingerprint encodings.
    """

    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelWindow",
                 unionSparsity=0.20,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 classifierMetric="rawOverlap",
                 cacheRoot=None):
        """
        Create the kNN classifier and the word-level Cortical.io encoder.

        A valid API key is needed (see CioEncoder init for details). When
        cacheRoot is not given, this module's directory is used as the parent
        of the "CioCache" folder.
        """
        super(ClassificationModelWindows, self).__init__(
            verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

        # window patterns below minSparsity will be skipped over
        self.minSparsity = 0.9 * unionSparsity

        self.classifier = KNNClassifier(k=numLabels,
                                        distanceMethod=classifierMetric,
                                        exact=False,
                                        verbosity=verbosity - 1)

        cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))
        self.encoder = CioEncoder(retinaScaling=retinaScaling,
                                  cacheDir=os.path.join(cacheRoot, "CioCache"),
                                  fingerprintType=EncoderTypes.word,
                                  unionSparsity=unionSparsity,
                                  retina=retina,
                                  apiKey=apiKey)

    def encodeSample(self, sample):
        """
        Encode an SDR of the input string by querying the Cortical.io API for
        each word. The resulting bitmaps are unionized in a sliding window.

        @param sample     (list)        Tokenized sample, where each item is
                                        a str.
        @return           (list)        Pattern dicts for the windows, each
                                        with the sample text, sparsity, and
                                        bitmap.
        """
        return self.encoder.getWindowEncoding(sample, self.minSparsity)

    def writeOutEncodings(self):
        """
        Write the encoding dictionaries to a txt file; overrides the
        superclass implementation.

        @raise ValueError  If self.modelDir is not an existing directory.
        """
        if not os.path.isdir(self.modelDir):
            raise ValueError("Invalid path to write file.")

        # Cast numpy arrays to list objects for serialization. Work on a deep
        # copy so the in-memory patterns keep their numpy arrays.
        jsonPatterns = copy.deepcopy(self.patterns)
        for jp in jsonPatterns:
            for tokenPattern in jp["pattern"]:
                tokenPattern["bitmap"] = tokenPattern.get(
                    "bitmap", numpy.array([])).tolist()
            jp["labels"] = jp.get("labels", numpy.array([])).tolist()

        with open(os.path.join(self.modelDir, "encoding_log.txt"), "w") as f:
            f.write(json.dumps(jsonPatterns, indent=1))

    def trainModel(self, i):
        # TODO: add batch training, where i is a list
        """
        Train the classifier on the sample and labels for record i. The list
        sampleReference is populated to correlate classifier prototypes to
        sample IDs. This model is unique in that a single sample contains
        multiple encoded patterns, of which, any that are too sparse are
        skipped over.

        @return       (int)     Number of patterns trained on; 0 when the
                                record has no window patterns.
        """
        record = self.patterns[i]
        patternWindows = record["pattern"]
        if len(patternWindows) == 0:
            # No patterns b/c no windows were large enough for encoding.
            # Return 0 (not None) so the documented int contract holds.
            return 0

        count = 0
        for window in patternWindows:
            for label in record["labels"]:
                self.classifier.learn(
                    window["bitmap"], label, isSparse=self.encoder.n)
                # One reference entry per learned prototype; queryModel()
                # relies on this one-to-one correspondence.
                self.sampleReference.append(record["ID"])
                count += 1

        return count

    def testModel(self, i, seed=42):
        """
        Test the model on record i. Returns the classifications most frequent
        amongst the classifications of the sample's individual tokens.
        We ignore the terms that are unclassified, picking the most frequent
        classifications among those that are detected; in getWinningLabels().

        @return       (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
        """
        totalInferenceResult = None
        for pattern in self.patterns[i]["pattern"]:
            if not pattern:
                continue

            _, inferenceResult, _, _ = self.classifier.infer(
                self.sparsifyPattern(pattern["bitmap"], self.encoder.n))

            # Accumulate the per-window inference results.
            if totalInferenceResult is None:
                totalInferenceResult = inferenceResult
            else:
                totalInferenceResult += inferenceResult

        return self.getWinningLabels(totalInferenceResult, seed)

    def queryModel(self, query, preprocess=False):
        """
        Preprocesses the query, encodes it into a pattern, then queries the
        classifier to infer distances to trained-on samples.

        @return       (list)          Two-tuples of sample ID and distance,
                                      sorted closest to farthest from the
                                      query.
        @raise IndexError  If the number of prototype distances differs from
                           the number of entries in sampleReference.
        """
        if preprocess:
            sample = TextPreprocess().tokenize(
                query,
                ignoreCommon=100,
                removeStrings=["[identifier deleted]"],
                correctSpell=True)
        else:
            sample = TextPreprocess().tokenize(query)

        # Get window patterns for the query, but if the query is too small
        # such that the window encodings are too sparse, we default to a pure
        # union.
        encodedQuery = self.encodeSample(sample)
        if len(encodedQuery) == 0:
            sample = " ".join(sample)
            fpInfo = self.encoder.getUnionEncoding(sample)
            encodedQuery = [{
                "text": fpInfo["text"],
                "sparsity": fpInfo["sparsity"],
                "bitmap": numpy.array(fpInfo["fingerprint"]["positions"])
            }]

        allDistances = self.infer(encodedQuery)

        if len(allDistances) != len(self.sampleReference):
            raise IndexError(
                "Number of protoype distances must match number of "
                "samples trained on.")

        # Keep the smallest distance seen for each sample ID in one pass,
        # instead of rescanning the whole reference list per ID.
        sampleDistances = {}
        for uniqueID, distance in zip(self.sampleReference, allDistances):
            if (uniqueID not in sampleDistances
                    or distance < sampleDistances[uniqueID]):
                sampleDistances[uniqueID] = distance

        return sorted(sampleDistances.items(), key=operator.itemgetter(1))

    def infer(self, patterns):
        """
        Get the classifier output for the input patterns; assumes classifier
        has an infer() method (as specified in NuPIC kNN implementation). For
        this model we sum the distances across the patterns and normalize
        before returning.

        NOTE: there is no check here that the pattern sparsities are > the
        minimum.

        @param patterns   (list)        Pattern dicts, each with a "bitmap".
        @return           (numpy.array) Each entry is the distance from the
                                        input pattern to that prototype
                                        (pattern in the classifier). All
                                        zeros when patterns is empty.
        """
        distances = numpy.zeros((self.classifier._numPatterns))
        numPatterns = len(patterns)
        if numPatterns == 0:
            # Nothing to average; previously this path failed on the unbound
            # loop variable.
            return distances

        for p in patterns:
            (_, _, dist, _) = self.classifier.infer(
                self.sparsifyPattern(p["bitmap"], self.encoder.n))
            distances = distances + dist

        return distances / float(numPatterns)
def testMaxSparsity(self):
    """Test that CioEncoder's maxSparsity works."""
    # This text seems to generate bitmaps with about 8% sparsity
    text = ("Smoking harms nearly every organ in your body. Over 7000 chemicals"
            " have been identified in tobacco smoke. After reading all this"
            " James and Sue decided to abruptly quit cigarette smoking to"
            " improve their health but it clearly was not an easy decision.")

    # Encoders with maxSparsity of 100%, 10%, 5%, and 1%, in that order.
    limits = (1.0, 0.1, 0.05, 0.01)
    encoders = [
        CioEncoder(maxSparsity=limit, fingerprintType=EncoderTypes.document)
        for limit in limits
    ]
    bitmapSize = encoders[0].width * encoders[0].height

    responses = [encoder.encode(text) for encoder in encoders]
    lengths = [len(r["fingerprint"]["positions"]) for r in responses]

    # Encodings must have no more than desired sparsity
    for response, limit in zip(responses, limits):
        self.assertLessEqual(response["sparsity"], limit)

    self.assertLessEqual(lengths[0], bitmapSize)
    for length, limit in zip(lengths[1:], limits[1:]):
        self.assertLessEqual(length, limit * bitmapSize)

    # Encodings can't be zero
    for length in lengths:
        self.assertGreater(length, 0)

    # Encodings must have complete overlap with the next higher encoding
    bitSets = [set(r["fingerprint"]["positions"]) for r in responses]
    for idx in (1, 2, 3):
        overlap = bitSets[idx - 1] & bitSets[idx]
        self.assertEqual(len(overlap), lengths[idx])

    # Test that if you encode a second time, you get the same bitmap
    repeats = [encoder.encode(text) for encoder in encoders]
    for first, second in zip(responses, repeats):
        firstDigest = hashlib.sha224(str(first)).hexdigest()
        secondDigest = hashlib.sha224(str(second)).hexdigest()
        self.assertEqual(firstDigest, secondDigest)
class ClassificationModelContext(ClassificationModel):
    """
    Class to run the survey response classification task with Cortical.io
    text context, then AND the context

    From the experiment runner, the methods expect to be fed one sample at a
    time.
    """

    def __init__(self, verbosity=1, numLabels=1):
        """
        Initialize the CorticalClient and CioEncoder. Requires a valid API
        key.

        @param verbosity  (int)  Forwarded to the base class constructor.
        @param numLabels  (int)  Number of winning categories testModel()
                                 returns.
        """
        super(ClassificationModelContext, self).__init__(verbosity)

        root = os.path.dirname(os.path.realpath(__file__))
        self.encoder = CioEncoder(cacheDir=os.path.join(root, "CioCache"))
        self.client = CorticalClient(self.encoder.apiKey)

        self.n = self.encoder.n
        # NOTE(review): targetSparsity is treated as a percentage; under
        # Python 2 this division truncates if it is an int -- confirm it is
        # stored as a float.
        self.w = int((self.encoder.targetSparsity / 100) * self.n)

        self.categoryBitmaps = {}
        self.numLabels = numLabels

    def encodePattern(self, pattern):
        """
        Encode an SDR of the input string by querying the Cortical.io API.

        @param pattern     (list)         Tokenized sample, where each item
                                          is a string
        @return            (dictionary)   Dictionary, containing text,
                                          sparsity, and bitmap
        Example return dict:
        {
          "text": "Example text",
          "sparsity": 0.0,
          "bitmap": numpy.zeros(0)
        }
        """
        text = " ".join(pattern)
        return {"text": text, "sparsity": 0.0, "bitmap": self._encodeText(text)}

    def _encodeText(self, text):
        """
        Return the integer bitmap for text.

        Uses the encoder's fingerprint positions when the encoder returns a
        result; otherwise falls back to encodeRandomly() with the model's n
        and w.
        """
        fpInfo = self.encoder.encode(text)
        if fpInfo:
            # Report sparsity only when there is a result to read it from;
            # doing it before this check made the fallback below unreachable
            # at verbosity > 1.
            if self.verbosity > 1:
                print("Fingerprint sparsity = {0}%.".format(fpInfo["sparsity"]))
            bitmap = numpy.array(fpInfo["fingerprint"]["positions"])
        else:
            bitmap = self.encodeRandomly(text, self.n, self.w)

        return bitmap.astype(int)

    def resetModel(self):
        """Reset the model"""
        self.categoryBitmaps.clear()

    def trainModel(self, samples, labels):
        """
        Train the classifier on the input sample and label. Use Cortical.io's
        keyword extraction to get the most relevant terms then get the
        intersection of those bitmaps

        @param samples     (dictionary)      Dictionary, containing text,
                                             sparsity, and bitmap
        @param labels      (int)             Reference index for the
                                             classification of this sample.
        """
        for sample, sample_labels in zip(samples, labels):
            bitmaps = [sample["bitmap"].tolist()]
            context = self.client.getContextFromText(
                bitmaps, maxResults=5, getFingerprint=True)

            if len(context) == 0:
                continue

            # Union of the fingerprints of all returned contexts.
            union = numpy.zeros(0)
            for c in context:
                bitmap = c["fingerprint"]["positions"]
                union = numpy.union1d(bitmap, union).astype(int)

            for label in sample_labels:
                # Haven't seen the label before
                if label not in self.categoryBitmaps:
                    self.categoryBitmaps[label] = union

                intersection = numpy.intersect1d(
                    union, self.categoryBitmaps[label])
                if intersection.size == 0:
                    # Don't want to lose all the old information
                    union = numpy.union1d(
                        union, self.categoryBitmaps[label]).astype(int)
                    # Need to sample to stay sparse
                    count = len(union)
                    sampleIndices = random.sample(
                        range(count), min(count, self.w))
                    intersection = numpy.sort(union[sampleIndices])

                self.categoryBitmaps[label] = intersection

    def testModel(self, sample):
        """
        Test the intersection bitmap on the input sample. Returns a
        dictionary containing various distance metrics between the sample and
        the classes.

        @param sample     (dictionary)      Dictionary, containing text,
                                            sparsity, and bitmap
        @return           (list)            Winning categories, ranked by the
                                            "overlappingAll" metric.
        Example of the per-category distances returned by the client:
            {
              0: {
                "cosineSimilarity": 0.6666666666666666,
                "euclideanDistance": 0.3333333333333333,
                "jaccardDistance": 0.5,
                "overlappingAll": 6,
                "overlappingLeftRight": 0.6666666666666666,
                "overlappingRightLeft": 0.6666666666666666,
                "sizeLeft": 9,
                "sizeRight": 9,
                "weightedScoring": 0.4436476984102028
              }
            }
        """
        sampleBitmap = sample["bitmap"].tolist()

        distances = {}
        for cat, catBitmap in self.categoryBitmaps.items():
            distances[cat] = self.client.compare(
                sampleBitmap, catBitmap.tolist())

        return self.winningLabels(
            distances, numberCats=self.numLabels, metric="overlappingAll")

    @staticmethod
    def winningLabels(distances, numberCats, metric):
        """
        Return indices of winning categories, based off of the input metric.
        Overrides the base class implementation.

        @param distances   (dict)   Maps category to a dict of metric values.
        @param numberCats  (int)    Maximum number of categories to return.
        @param metric      (str)    Key of the metric to rank by.
        @return            (list)   Categories, best first.
        """
        # keys() and values() of an unmodified dict iterate in matching
        # order, so index k in one corresponds to index k in the other.
        categories = list(distances.keys())
        metricValues = numpy.array([v[metric] for v in distances.values()])
        sortedIdx = numpy.argsort(metricValues)

        # euclideanDistance and jaccardDistance are ascending
        descendingOrder = set(["overlappingAll", "overlappingLeftRight",
                               "overlappingRightLeft", "cosineSimilarity",
                               "weightedScoring"])
        if metric in descendingOrder:
            sortedIdx = sortedIdx[::-1]

        return [categories[catIdx] for catIdx in sortedIdx[:numberCats]]
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelFingerprint",
               fingerprintType=EncoderTypes.word,
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               classifierMetric="rawOverlap",
               cacheRoot=None):
    """
    Init the kNN classifier and the Cortical.io encoder; need a valid API key
    (see CioEncoder init for details).

    @param verbosity        (int)   Passed to the base class; the kNN
                                    classifier is created with verbosity-1.
    @param numLabels        (int)   Passed to the base class and used as k of
                                    the kNN classifier.
    @param modelDir         (str)   Passed to the base class.
    @param fingerprintType  (EncoderTypes)  EncoderTypes.document or
                                    EncoderTypes.word.
    @param unionSparsity    (float) Passed to the CioEncoder.
    @param retinaScaling    (float) Passed to the CioEncoder.
    @param retina           (str)   Passed to the CioEncoder.
    @param apiKey           (str)   Passed to the CioEncoder.
    @param classifierMetric (str)   Distance method of the kNN classifier.
    @param cacheRoot        (str)   Directory in which the "CioCache" folder
                                    lives; defaults to this file's directory.

    @raise ValueError if fingerprintType is not an eligible encoder type.
    """
    # The previous check compared fingerprintType by identity against a
    # boolean expression and therefore could never raise; test membership in
    # the set of supported types instead, and do so before any setup work.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
      raise ValueError("Invalid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligible types.")

    super(ClassificationModelFingerprint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=verbosity-1)

    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=os.path.join(cacheRoot, "CioCache"),
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API. If the
    encoder returns nothing (any falsy value), we create a random SDR with the
    encoder's dimensions n and w.

    @param sample     (list)        Tokenized sample, where each item is a str.
    @return fp        (dict)        The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)

    if fpInfo:
      # The response carries its text under "text", otherwise under "term".
      return {
        "text": fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
        "sparsity": fpInfo["sparsity"],
        "bitmap": numpy.array(fpInfo["fingerprint"]["positions"]),
      }

    # No fingerprint available: fall back to a random encoding of the text.
    n = self.encoder.n
    w = self.encoder.w
    return {
      "text": sample,
      "sparsity": float(w) / n,
      "bitmap": self.encodeRandomly(sample, n, w),
    }


  def trainModel(self, i):
    # TODO: add batch training, where i is a list
    """
    Train the classifier on the sample and labels for record i. The list
    sampleReference is populated to correlate classifier prototypes to sample
    IDs.

    @param i          (int)         Index into self.patterns.
    @return           (int)         Number of (bitmap, label) pairs learned;
                                    0 when the bitmap is empty or the record
                                    has no labels.
    """
    record = self.patterns[i]
    bitmap = record["pattern"]["bitmap"]
    if not bitmap.any():
      # Nothing to learn from an empty (all-zero) bitmap.
      return 0

    labels = record["labels"]
    for label in labels:
      self.classifier.learn(bitmap, label, isSparse=self.encoder.n)
      self.sampleReference.append(record["ID"])

    # Count what was actually learned, so a record without labels reports 0.
    return len(labels)


  def testModel(self, i, seed=42):
    """
    Test the model on record i. The random seed is used in getWinningLabels().

    @param i          (int)         Index into self.patterns.
    @param seed       (int)         Forwarded to getWinningLabels().
    @return           (numpy array) numLabels most-frequent classifications
                                    for the data samples; int or empty.
    """
    pattern = self.sparsifyPattern(self.patterns[i]["pattern"]["bitmap"],
                                   self.encoder.n)
    (_, inferenceResult, _, _) = self.classifier.infer(pattern)
    return self.getWinningLabels(inferenceResult, seed)
class ClassificationModelFingerprint(ClassificationModel): """ Class to run the survey response classification task with Coritcal.io fingerprint encodings. From the experiment runner, the methods expect to be fed one sample at a time. """ def __init__(self, fingerprintType=EncoderTypes.word, unionSparsity=0.20, retinaScaling=1.0, retina="en_associative", apiKey=None, k=1, classifierMetric="rawOverlap", cacheRoot=None, **kwargs): super(ClassificationModelFingerprint, self).__init__(**kwargs) self.classifier = KNNClassifier(k=k, distanceMethod=classifierMetric, exact=False, verbosity=self.verbosity-1) # Need a valid API key for the Cortical.io encoder (see CioEncoder # constructor for details). if fingerprintType is (not EncoderTypes.document or not EncoderTypes.word): raise ValueError("Invalid type of fingerprint encoding; see the " "EncoderTypes class for eligble types.") self.encoder = CioEncoder(retinaScaling=retinaScaling, fingerprintType=fingerprintType, unionSparsity=unionSparsity, retina=retina, apiKey=apiKey, cacheDir=cacheRoot) self.currentDocument = None def trainToken(self, token, labels, sampleId, reset=0): """ Train the model with the given text token, associated labels, and sampleId. See base class for params and return type descriptions. """ if self.currentDocument is None: # start of a new document self.currentDocument = [token] else: # accumulate text for this document self.currentDocument.append(token) if reset == 1: # all text accumulated, proceed w/ training on this document document = " ".join(self.currentDocument) bitmap = self.encoder.encode(document)["fingerprint"]["positions"] if self.verbosity >= 2: print "CioFP model training with: '{}'".format(document) print "\tBitmap:", bitmap for label in labels: self.classifier.learn( bitmap, label, isSparse=self.encoder.n, partitionId=sampleId) self.currentDocument = None def inferToken(self, token, reset=0, returnDetailedResults=False, sortResults=True): """ Classify the token (i.e. 
run inference on the model with this document) and return classification results and (optionally) a list of sampleIds and distances. Repeated sampleIds are NOT removed from the results. See base class for params and return type descriptions. """ if self.currentDocument is None: # start of a new document self.currentDocument = [token] else: # accumulate text for this document self.currentDocument.append(token) if reset == 0: return numpy.zeros(self.numLabels), [], numpy.zeros(0) # With reset=1, all text accumulated, proceed w/ classifying this document document = " ".join(self.currentDocument) bitmap = self.encoder.encode(document)["fingerprint"]["positions"] densePattern =self.encoder.densifyPattern(bitmap) (_, inferenceResult, dist, _) = self.classifier.infer(densePattern) if self.verbosity >= 2: print "CioFP model inference with: '{}'".format(document) print "\tBitmap:", bitmap print "\tInference result=", inferenceResult print "\tDistances=", dist self.currentDocument = None # Figure out format of returned results if not returnDetailedResults: # Return non-detailed results. return inferenceResult, None, None if not sortResults: idList = [self.classifier.getPartitionId(i) for i in xrange(len(dist))] return inferenceResult, idList, dist # Return sorted results sortedIndices = dist.argsort() idList = [self.classifier.getPartitionId(i) for i in sortedIndices] sortedDistances = dist[sortedIndices] return inferenceResult, idList, sortedDistances def getEncoder(self): """ Returns the encoder instance for the model. """ return self.encoder def getClassifier(self): """ Returns the classifier instance for the model. """ return self.classifier
class ClassificationModelEndpoint(ClassificationModel): """ Class to run the survey response classification task with Cortical.io text endpoint encodings and classification system. From the experiment runner, the methods expect to be fed one sample at a time. """ def __init__(self, verbosity=1, numLabels=3, modelDir="ClassificationModelEndpoint", unionSparsity=0.20, cacheRoot=None): """ Initializes the encoder as CioEncoder; requires a valid API key. """ super(ClassificationModelEndpoint, self).__init__( verbosity=verbosity, numLabels=numLabels, modelDir=modelDir) cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__)) self.encoder = CioEncoder(cacheDir=os.path.join(cacheRoot, "CioCache"), unionSparsity=unionSparsity) self.compareEncoder = LanguageEncoder() self.n = self.encoder.n self.w = int((self.encoder.targetSparsity/100) * self.n) self.categoryBitmaps = {} self.negatives = defaultdict(list) self.positives = defaultdict(list) def encodeSample(self, sample): """ Encode an SDR of the input string by querying the Cortical.io API. @param sample (list) Tokenized sample, where each item is a string @return fp (dict) The sample text, sparsity, and bitmap. Example return dict: { "text": "Example text", "sparsity": 0.03, "bitmap": numpy.array([]) } """ sample = " ".join(sample) fpInfo = self.encoder.encode(sample) if fpInfo: fp = {"text":fpInfo["text"] if "text" in fpInfo else fpInfo["term"], "sparsity":fpInfo["sparsity"], "bitmap":numpy.array(fpInfo["fingerprint"]["positions"])} else: fp = {"text":sample, "sparsity":float(self.w)/self.n, "bitmap":self.encodeRandomly(sample, self.n, self.w)} return fp def resetModel(self): """Reset the model""" self.positives.clear() self.negatives.clear() self.categoryBitmaps.clear() def trainModel(self, i, negatives=None): # TODO: add batch training, where i is a list; note we should only add # negatives when training on one sample so we know which labels to use. 
""" Train the classifier on the sample and labels for record i. Use Cortical.io's createClassification() to make a bitmap that represents the class. The list sampleReference is populated to correlate classifier prototypes to sample IDs. @param negative (list) Each item is the dictionary containing text, sparsity and bitmap for the negative samples. """ record = self.patterns[i] labelsToUpdateBitmaps = set() for label in record["labels"]: if record["pattern"]["text"] and record["pattern"]["bitmap"].any(): self.positives[label].append(record["pattern"]["text"]) if negatives: for neg in negatives: if neg["text"]: self.negatives[label].append(neg["text"]) labelsToUpdateBitmaps.add(label) for label in labelsToUpdateBitmaps: self.categoryBitmaps[label] = self.encoder.createCategory( str(label), self.positives[label], self.negatives[label])["positions"] self.sampleReference.append(i) def testModel(self, i, _, metric="overlappingAll"): """ Test on record i. The Cortical.io classifier returns a dictionary containing various distance metrics between the sample and the classes. @param metric (str) Distance metric use by classifier. @return (numpy array) numLabels most-frequent classifications for the data samples; int or empty. """ sampleBitmap = self.patterns[i]["pattern"]["bitmap"].tolist() distances = defaultdict(list) for cat, catBitmap in self.categoryBitmaps.iteritems(): distances[cat] = self.compareEncoder.compare(sampleBitmap, catBitmap) return self.getWinningLabels(distances, metric=metric) def getWinningLabels(self, distances, metric): """ Return indices of winning categories, based off of the input metric. Overrides the base class implementation. 
""" metricValues = numpy.array([v[metric] for v in distances.values()]) sortedIdx = numpy.argsort(metricValues) # euclideanDistance and jaccardDistance are ascending descendingOrder = ("overlappingAll", "overlappingLeftRight", "overlappingRightLeft", "cosineSimilarity", "weightedScoring") if metric in descendingOrder: sortedIdx = sortedIdx[::-1] return numpy.array( [distances.keys()[catIdx] for catIdx in sortedIdx[:self.numLabels]]) def getCategoryDistances(self, sort=True, save=None, labelRefs=None): """ Return a dict where keys are categories and values are dicts of distances. @param sort (bool) Sort the inner dicts with compareCategories() @param save (str) Dump catDistances to a JSON in this dir. @return (defaultdict) E.g. w/ categories 0 and 1: catDistances = { 0: { 0: {"cosineSimilarity": 1.0, ...}, 1: {"cosineSimilarity": 0.33, ...} }, 1: { 0: {"cosineSimilarity": 0.33, ...}, 1: {"cosineSimilarity": 1.0, ...} } Note the inner-dicts of catDistances are OrderedDict objects. """ catDistances = defaultdict(list) for cat, catBitmap in self.categoryBitmaps.iteritems(): catDistances[cat] = OrderedDict() for compareCat, compareBitmap in self.categoryBitmaps.iteritems(): # List is in order of self.categoryBitmaps.keys() catDistances[cat][compareCat] = self.compareEncoder.compare( catBitmap, compareBitmap) if sort: # Order each inner dict of catDistances such that the ranking is most to # least similar. catDistances = self.compareCategories(catDistances) if save is not None: self.writeOutCategories( save, comparisons=catDistances, labelRefs=labelRefs) return catDistances @staticmethod def compareCategories(catDistances, metric="overlappingAll"): """ Calculate category distances. Returns a defaultdict of category keys, where values are OrderedDicts sorted such that the most similar categories (according to the input metric) are listed first. 
""" descendingOrder = ("overlappingAll", "overlappingLeftRight", "overlappingRightLeft", "cosineSimilarity", "weightedScoring") categoryComparisons = defaultdict(list) for k, v in catDistances.iteritems(): # Create a dict for this category metricDict = {compareCat: distances[metric] for compareCat, distances in v.iteritems()} # Sort the dict by the metric reverse = True if metric in descendingOrder else False categoryComparisons[k] = OrderedDict( sorted(metricDict.items(), key=lambda k: k[1], reverse=reverse)) return categoryComparisons @staticmethod def query(): print "The Classification Endpoint model doesn't support this method." @staticmethod def infer(): print "The Classification Endpoint model doesn't support this method."
class ClassificationModelFingerprint(ClassificationModel): """ Class to run the survey response classification task with Coritcal.io fingerprint encodings. From the experiment runner, the methods expect to be fed one sample at a time. """ def __init__(self, fingerprintType=EncoderTypes.word, unionSparsity=0.20, retinaScaling=1.0, retina="en_associative", apiKey=None, k=1, classifierMetric="rawOverlap", cacheRoot=None, **kwargs): super(ClassificationModelFingerprint, self).__init__(**kwargs) self.classifier = KNNClassifier(k=k, distanceMethod=classifierMetric, exact=False, verbosity=self.verbosity - 1) # Need a valid API key for the Cortical.io encoder (see CioEncoder # constructor for details). if fingerprintType is (not EncoderTypes.document or not EncoderTypes.word): raise ValueError("Invalid type of fingerprint encoding; see the " "EncoderTypes class for eligble types.") self.encoder = CioEncoder(retinaScaling=retinaScaling, fingerprintType=fingerprintType, unionSparsity=unionSparsity, retina=retina, apiKey=apiKey, cacheDir=cacheRoot) self.currentDocument = None def trainToken(self, token, labels, sampleId, reset=0): """ Train the model with the given text token, associated labels, and sampleId. See base class for params and return type descriptions. """ if self.currentDocument is None: # start of a new document self.currentDocument = [token] else: # accumulate text for this document self.currentDocument.append(token) if reset == 1: # all text accumulated, proceed w/ training on this document document = " ".join(self.currentDocument) bitmap = self.encoder.encode(document)["fingerprint"]["positions"] if self.verbosity >= 2: print "CioFP model training with: '{}'".format(document) print "\tBitmap:", bitmap for label in labels: self.classifier.learn(bitmap, label, isSparse=self.encoder.n, partitionId=sampleId) self.currentDocument = None def inferToken(self, token, reset=0, returnDetailedResults=False, sortResults=True): """ Classify the token (i.e. 
run inference on the model with this document) and return classification results and (optionally) a list of sampleIds and distances. Repeated sampleIds are NOT removed from the results. See base class for params and return type descriptions. """ if self.currentDocument is None: # start of a new document self.currentDocument = [token] else: # accumulate text for this document self.currentDocument.append(token) if reset == 0: return numpy.zeros(self.numLabels), [], numpy.zeros(0) # With reset=1, all text accumulated, proceed w/ classifying this document document = " ".join(self.currentDocument) bitmap = self.encoder.encode(document)["fingerprint"]["positions"] densePattern = self.encoder.densifyPattern(bitmap) (_, inferenceResult, dist, _) = self.classifier.infer(densePattern) if self.verbosity >= 2: print "CioFP model inference with: '{}'".format(document) print "\tBitmap:", bitmap print "\tInference result=", inferenceResult print "\tDistances=", dist self.currentDocument = None # Figure out format of returned results if not returnDetailedResults: # Return non-detailed results. return inferenceResult, None, None if not sortResults: idList = [ self.classifier.getPartitionId(i) for i in xrange(len(dist)) ] return inferenceResult, idList, dist # Return sorted results sortedIndices = dist.argsort() idList = [self.classifier.getPartitionId(i) for i in sortedIndices] sortedDistances = dist[sortedIndices] return inferenceResult, idList, sortedDistances def getEncoder(self): """ Returns the encoder instance for the model. """ return self.encoder def getClassifier(self): """ Returns the classifier instance for the model. """ return self.classifier
def testRetinaScaling(self):
  """Check CioEncoder dimensions and fingerprints under retina scaling."""
  docType = EncoderTypes.document
  cio = CioEncoder(retinaScaling=1.0, fingerprintType=docType)
  cioScaled = CioEncoder(retinaScaling=0.5, fingerprintType=docType)
  cioScaled2 = CioEncoder(retinaScaling=0.71, fingerprintType=docType)

  # The scaled encoders report the truncated, scaled-down retina dimensions.
  self.assertAlmostEqual(int(0.5 * cio.width), cioScaled.width)
  self.assertAlmostEqual(int(0.5 * cio.height), cioScaled.height)
  self.assertAlmostEqual(int(0.71 * cio.height), cioScaled2.height)

  response = cio.encode(self.text)
  responseScaled = cioScaled.encode(self.text)
  responseScaled2 = cioScaled2.encode(self.text)

  positions = response["fingerprint"]["positions"]
  positionsScaled = responseScaled["fingerprint"]["positions"]
  positionsScaled2 = responseScaled2["fingerprint"]["positions"]

  # Each bit position should be scaled down by retinaScaling*retinaScaling
  for scale, scaled in ((0.5, positionsScaled), (0.71, positionsScaled2)):
    self.assertLessEqual(scaled.sum(), scale * scale * positions.sum())

  # The number of on bits in scaled retina should normally be slightly less
  # than the original, but can be equal in some cases
  onBitsScaled = len(positionsScaled)
  self.assertLessEqual(onBitsScaled, len(positions))
  self.assertLessEqual(onBitsScaled, len(positionsScaled2))

  # Check that encodeIntoArray works even with weird scaling
  dense = numpy.zeros(cioScaled2.width * cioScaled2.height)
  cioScaled2.encodeIntoArray(self.text, dense)
  self.assertEqual(len(positionsScaled2), dense.sum())
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelFingerprint",
               fingerprintType=EncoderTypes.word,
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               classifierMetric="rawOverlap",
               cacheRoot=None):
    """
    Init the kNN classifier and the Cortical.io encoder; need a valid API key
    (see CioEncoder init for details).

    @param verbosity        (int)   Passed to the base class; the kNN
                                    classifier is created with verbosity-1.
    @param numLabels        (int)   Passed to the base class and used as k of
                                    the kNN classifier.
    @param modelDir         (str)   Passed to the base class.
    @param fingerprintType  (EncoderTypes)  EncoderTypes.document or
                                    EncoderTypes.word.
    @param unionSparsity    (float) Passed to the CioEncoder.
    @param retinaScaling    (float) Passed to the CioEncoder.
    @param retina           (str)   Passed to the CioEncoder.
    @param apiKey           (str)   Passed to the CioEncoder.
    @param classifierMetric (str)   Distance method of the kNN classifier.
    @param cacheRoot        (str)   Directory in which the "CioCache" folder
                                    lives; defaults to this file's directory.

    @raise ValueError if fingerprintType is not an eligible encoder type.
    """
    # The previous check compared fingerprintType by identity against a
    # boolean expression and therefore could never raise; test membership in
    # the set of supported types instead, and do so before any setup work.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
      raise ValueError("Invalid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligible types.")

    super(ClassificationModelFingerprint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=verbosity - 1)

    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=os.path.join(cacheRoot, "CioCache"),
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API. If the
    encoder returns nothing (any falsy value), we create a random SDR with the
    encoder's dimensions n and w.

    @param sample     (list)        Tokenized sample, where each item is a str.
    @return fp        (dict)        The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)

    if fpInfo:
      # The response carries its text under "text", otherwise under "term".
      return {
        "text": fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
        "sparsity": fpInfo["sparsity"],
        "bitmap": numpy.array(fpInfo["fingerprint"]["positions"]),
      }

    # No fingerprint available: fall back to a random encoding of the text.
    n = self.encoder.n
    w = self.encoder.w
    return {
      "text": sample,
      "sparsity": float(w) / n,
      "bitmap": self.encodeRandomly(sample, n, w),
    }


  def trainModel(self, i):
    # TODO: add batch training, where i is a list
    """
    Train the classifier on the sample and labels for record i. The list
    sampleReference is populated to correlate classifier prototypes to sample
    IDs.

    @param i          (int)         Index into self.patterns.
    @return           (int)         Number of (bitmap, label) pairs learned;
                                    0 when the bitmap is empty or the record
                                    has no labels.
    """
    record = self.patterns[i]
    bitmap = record["pattern"]["bitmap"]
    if not bitmap.any():
      # Nothing to learn from an empty (all-zero) bitmap.
      return 0

    labels = record["labels"]
    for label in labels:
      self.classifier.learn(bitmap, label, isSparse=self.encoder.n)
      self.sampleReference.append(record["ID"])

    # Count what was actually learned, so a record without labels reports 0.
    return len(labels)


  def testModel(self, i, seed=42):
    """
    Test the model on record i. The random seed is used in getWinningLabels().

    @param i          (int)         Index into self.patterns.
    @param seed       (int)         Forwarded to getWinningLabels().
    @return           (numpy array) numLabels most-frequent classifications
                                    for the data samples; int or empty.
    """
    pattern = self.sparsifyPattern(self.patterns[i]["pattern"]["bitmap"],
                                   self.encoder.n)
    (_, inferenceResult, _, _) = self.classifier.infer(pattern)
    return self.getWinningLabels(inferenceResult, seed)