def testWindowEncodings(self):
  """Verify the CioEncoder's sliding-window encodings on a short text."""
  encoder = CioEncoder(fingerprintType=EncoderTypes.word)

  text = """ I grok people. I am people, so now I can say it in people talk. I've found out why people laugh. They laugh because it hurts so much, because it's the only thing that'll make it stop hurting."""
  tokens = TextPreprocess().tokenize(text)

  windows = encoder.getWindowEncoding(tokens, minSparsity=0.19)

  # Only sufficiently dense windows should get encoded, so there must be
  # fewer window encodings than tokens.
  self.assertTrue(len(tokens) > len(windows),
                  "Returned incorrect number of window encodings.")

  # Compare the final window against the stored expected encoding.
  expected = getTestData("cio_encoding_window.json")
  lastWindow = windows[-1]
  self.assertEqual(expected["text"], lastWindow["text"],
                   "Window encoding represents the wrong text.")
  self.assertTrue(lastWindow["sparsity"] <= encoder.unionSparsity,
                  "Sparsity for large window is larger than the max.")
  self.assertSequenceEqual(expected["bitmap"],
                           lastWindow["bitmap"].tolist(),
                           "Window encoding's bitmap is not as expected.")
def __init__(self, verbosity=1, numLabels=3,
             modelDir="ClassificationModelWindow", unionSparsity=0.20,
             retinaScaling=1.0, retina="en_associative", apiKey=None,
             classifierMetric="rawOverlap"):
  """Set up the kNN classifier and the Cortical.io word-level encoder."""
  super(ClassificationModelWindows, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

  # Window patterns below minSparsity will be skipped over.
  self.minSparsity = 0.9 * unionSparsity

  self.classifier = KNNClassifier(k=numLabels,
                                  distanceMethod=classifierMetric,
                                  exact=False,
                                  verbosity=verbosity - 1)

  # Need a valid API key (see CioEncoder init for details).
  cacheDir = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), "CioCache")
  self.encoder = CioEncoder(retinaScaling=retinaScaling,
                            cacheDir=cacheDir,
                            fingerprintType=EncoderTypes.word,
                            unionSparsity=unionSparsity,
                            retina=retina,
                            apiKey=apiKey)
def __init__(self,
             fingerprintType=EncoderTypes.word,
             unionSparsity=0.20,
             retinaScaling=1.0,
             retina="en_associative",
             apiKey=None,
             k=1,
             classifierMetric="rawOverlap",
             cacheRoot=None,
             **kwargs):
  """
  Initialize the kNN classifier and the Cortical.io encoder.

  @param fingerprintType (EncoderTypes) Encoding scheme; must be
      EncoderTypes.document or EncoderTypes.word.
  @param unionSparsity   (float)  Sparsity passed through to the encoder.
  @param retinaScaling   (float)  Scale factor for the retina dimensions.
  @param retina          (str)    Cortical.io retina name.
  @param apiKey          (str)    Cortical.io API key; see CioEncoder.
  @param k               (int)    Number of nearest neighbors for kNN.
  @param classifierMetric (str)   Distance method for the kNN classifier.
  @param cacheRoot       (str)    Directory for the encoder cache.
  @raises ValueError if fingerprintType is not a supported encoding type.
  """
  super(ClassificationModelFingerprint, self).__init__(**kwargs)

  self.classifier = KNNClassifier(k=k,
                                  distanceMethod=classifierMetric,
                                  exact=False,
                                  verbosity=self.verbosity - 1)

  # Need a valid API key for the Cortical.io encoder (see CioEncoder
  # constructor for details).
  # BUG FIX: the original test `fingerprintType is (not EncoderTypes.document
  # or not EncoderTypes.word)` reduced to `fingerprintType is False`, so the
  # ValueError was never raised. Use a proper membership test instead.
  if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
    raise ValueError("Invalid type of fingerprint encoding; see the "
                     "EncoderTypes class for eligible types.")

  self.encoder = CioEncoder(retinaScaling=retinaScaling,
                            fingerprintType=fingerprintType,
                            unionSparsity=unionSparsity,
                            retina=retina,
                            apiKey=apiKey,
                            cacheDir=cacheRoot)

  self.currentDocument = None
def initModel(self):
  """
  Initialize the network. No record stream is used here; the network is
  configured directly from self.networkConfig and a new CioEncoder.
  """
  encoder = CioEncoder(retinaScaling=self.retinaScaling,
                       retina=self.retina,
                       apiKey=self.apiKey,
                       maxSparsity=self.maxSparsity,
                       verbosity=self.verbosity - 1)

  # This encoder specifies the LanguageSensor output width.
  return configureNetwork(None, self.networkConfig, encoder)
def testWordFingerprint(self):
  """Test the Cortical.io term (word-level) encoding."""
  encoder = CioEncoder(fingerprintType=EncoderTypes.word)
  response = encoder.encode(self.text)

  self.assertFingerprintFields(response)

  # The bitmap positions must match the stored expected encoding.
  expected = getTestData("cio_encoding_word.json")
  self.assertEqual(expected["fingerprint"]["positions"],
                   response["fingerprint"]["positions"],
                   "Cio bitmap is not as expected.")
def testRetinaScaling(self):
  """Test the CioEncoder for retina dimension scaling."""
  full = CioEncoder(retinaScaling=1.0,
                    fingerprintType=EncoderTypes.document)
  half = CioEncoder(retinaScaling=0.5,
                    fingerprintType=EncoderTypes.document)
  odd = CioEncoder(retinaScaling=0.71,
                   fingerprintType=EncoderTypes.document)

  # Retina dimensions should scale down by the retinaScaling factor.
  self.assertAlmostEqual(int(0.5 * full.width), half.width)
  self.assertAlmostEqual(int(0.5 * full.height), half.height)
  self.assertAlmostEqual(int(0.71 * full.height), odd.height)

  fullResponse = full.encode(self.text)
  halfResponse = half.encode(self.text)
  oddResponse = odd.encode(self.text)

  # Each bit position should be scaled down by retinaScaling*retinaScaling.
  self.assertLessEqual(
      halfResponse["fingerprint"]["positions"].sum(),
      0.5 * 0.5 * fullResponse["fingerprint"]["positions"].sum())
  self.assertLessEqual(
      oddResponse["fingerprint"]["positions"].sum(),
      0.71 * 0.71 * fullResponse["fingerprint"]["positions"].sum())

  # The number of on bits in scaled retina should normally be slightly less
  # than the original, but can be equal in some cases.
  self.assertLessEqual(len(halfResponse["fingerprint"]["positions"]),
                       len(fullResponse["fingerprint"]["positions"]))
  self.assertLessEqual(len(halfResponse["fingerprint"]["positions"]),
                       len(oddResponse["fingerprint"]["positions"]))

  # Check that encodeIntoArray works even with weird scaling.
  outputArray = numpy.zeros(odd.width * odd.height)
  odd.encodeIntoArray(self.text, outputArray)
  self.assertEqual(len(oddResponse["fingerprint"]["positions"]),
                   outputArray.sum())
def __init__(self, verbosity=1, numLabels=1):
  """
  Initialize the CorticalClient and CioEncoder. Requires a valid API key.
  """
  super(ClassificationModelContext, self).__init__(verbosity)

  cacheDir = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), "CioCache")
  self.encoder = CioEncoder(cacheDir=cacheDir)
  self.client = CorticalClient(self.encoder.apiKey)

  # Total bits n, with w active bits derived from the target sparsity
  # (targetSparsity is a percentage).
  self.n = self.encoder.n
  self.w = int((self.encoder.targetSparsity / 100) * self.n)

  self.categoryBitmaps = {}
  self.numLabels = numLabels
def _initModel(self, k):
  """
  Initialize the network, setting the classifier region's k and max
  category count from this model's configuration.
  """
  encoder = CioEncoder(retinaScaling=self.retinaScaling,
                       retina=self.retina,
                       fingerprintType=EncoderTypes.document,
                       apiKey=self.apiKey,
                       verbosity=self.verbosity - 1)

  regionParams = modelConfig["classifierRegionConfig"]["regionParams"]
  regionParams["k"] = k
  regionParams["maxCategoryCount"] = self.numLabels

  self.networkConfig = modelConfig
  self.network = configureNetwork(None, self.networkConfig, encoder)
def _initModel(self, k):
  """
  Initialize the network with a document-level CioEncoder, setting the
  classifier region's k and max category count.
  """
  cacheDir = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), "CioCache")
  encoder = CioEncoder(retinaScaling=self.retinaScaling,
                       cacheDir=cacheDir,
                       retina=self.retina,
                       fingerprintType=EncoderTypes.document,
                       apiKey=self.apiKey)

  regionParams = modelConfig["classifierRegionConfig"]["regionParams"]
  regionParams["k"] = k
  regionParams["maxCategoryCount"] = self.numLabels

  self.networkConfig = modelConfig
  self.network = configureNetwork(None, self.networkConfig, encoder)
def initModel(self):
  """
  Initialize the network; self.networkDataPath must already be set.
  """
  # Feed the network from a file when a data path is given; otherwise the
  # network is configured without a record stream.
  if self.networkDataPath is not None:
    recordStream = FileRecordStream(streamID=self.networkDataPath)
  else:
    recordStream = None

  root = os.path.dirname(os.path.realpath(__file__))
  encoder = CioEncoder(retinaScaling=self.retinaScaling,
                       cacheDir=os.path.join(root, "CioCache"),
                       retina=self.retina,
                       apiKey=self.apiKey)

  # This encoder specifies the LanguageSensor output width.
  return configureNetwork(recordStream, self.networkConfig, encoder)
def testRetinaScaling(self):
  """Test the CioEncoder for retina dimension scaling."""
  encoder = CioEncoder(retinaScaling=0.25,
                       fingerprintType=EncoderTypes.document)
  response = encoder.encode(self.text)

  # The scaled bitmap must match the stored expected encoding.
  expected = getTestData("cio_encoding_scaled_retina.json")
  self.assertEqual(expected["fingerprint"]["positions"],
                   response["fingerprint"]["positions"],
                   "Cio bitmap is not as expected.")

  # The scaled fingerprint should be no larger than the full-retina one.
  fullRetina = getTestData("cio_encoding_document.json")
  fullLength = len(fullRetina["fingerprint"]["positions"])
  scaledLength = len(response["fingerprint"]["positions"])
  self.assertTrue(scaledLength <= fullLength,
                  "Retina scaling did not decrease the fingerprint size.")
def __init__(self, verbosity=1, numLabels=3,
             modelDir="ClassificationModelEndpoint", unionSparsity=0.20):
  """
  Initializes the encoder as CioEncoder; requires a valid API key.
  """
  super(ClassificationModelEndpoint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

  cacheDir = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), "CioCache")
  self.encoder = CioEncoder(cacheDir=cacheDir, unionSparsity=unionSparsity)
  self.compareEncoder = LanguageEncoder()

  # Total bits n, with w active bits derived from the target sparsity
  # (targetSparsity is a percentage).
  self.n = self.encoder.n
  self.w = int((self.encoder.targetSparsity / 100) * self.n)

  self.categoryBitmaps = {}
  self.negatives = defaultdict(list)
  self.positives = defaultdict(list)
def __init__(self, verbosity=1, numLabels=3,
             modelDir="ClassificationModelFingerprint",
             fingerprintType=EncoderTypes.word, unionSparsity=0.20,
             retinaScaling=1.0, retina="en_associative", apiKey=None,
             classifierMetric="rawOverlap", cacheRoot=None):
  """
  Initialize the kNN classifier and the Cortical.io encoder.

  @param verbosity       (int)    Logging/debug verbosity level.
  @param numLabels       (int)    Number of labels; also used as kNN's k.
  @param modelDir        (str)    Directory for model state.
  @param fingerprintType (EncoderTypes) Must be EncoderTypes.document or
      EncoderTypes.word.
  @param unionSparsity   (float)  Sparsity passed through to the encoder.
  @param retinaScaling   (float)  Scale factor for the retina dimensions.
  @param retina          (str)    Cortical.io retina name.
  @param apiKey          (str)    Cortical.io API key; see CioEncoder.
  @param classifierMetric (str)   Distance method for the kNN classifier.
  @param cacheRoot       (str)    Root dir for the encoder cache; defaults
      to this file's directory.
  @raises ValueError if fingerprintType is not a supported encoding type.
  """
  super(ClassificationModelFingerprint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

  # Init kNN classifier and Cortical.io encoder; need valid API key (see
  # CioEncoder init for details).
  self.classifier = KNNClassifier(k=numLabels,
                                  distanceMethod=classifierMetric,
                                  exact=False,
                                  verbosity=verbosity - 1)

  # BUG FIX: the original test `fingerprintType is (not EncoderTypes.document
  # or not EncoderTypes.word)` reduced to `fingerprintType is False`, so the
  # ValueError was never raised. Use a proper membership test instead.
  if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
    raise ValueError("Invalid type of fingerprint encoding; see the "
                     "EncoderTypes class for eligible types.")

  cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))
  self.encoder = CioEncoder(retinaScaling=retinaScaling,
                            cacheDir=os.path.join(cacheRoot, "CioCache"),
                            fingerprintType=fingerprintType,
                            unionSparsity=unionSparsity,
                            retina=retina,
                            apiKey=apiKey)
def testMaxSparsity(self):
  """Test that CioEncoder's maxSparsity works."""
  # This text seems to generate bitmaps with about 8% sparsity.
  text = (
      "Smoking harms nearly every organ in your body. Over 7000 chemicals"
      " have been identified in tobacco smoke. After reading all this"
      " James and Sue decided to abruptly quit cigarette smoking to"
      " improve their health but it clearly was not an easy decision.")

  # Encoders with maxSparsity of 100%, 10%, 5%, and 1%.
  maxSparsities = [1.0, 0.1, 0.05, 0.01]
  encoders = [
      CioEncoder(maxSparsity=s, fingerprintType=EncoderTypes.document)
      for s in maxSparsities]
  bitmapSize = encoders[0].width * encoders[0].height

  responses = [encoder.encode(text) for encoder in encoders]
  lengths = [len(r["fingerprint"]["positions"]) for r in responses]

  for maxSparsity, response, length in zip(maxSparsities, responses,
                                           lengths):
    # Encodings must have no more than the desired sparsity.
    self.assertLessEqual(response["sparsity"], maxSparsity)
    self.assertLessEqual(length, maxSparsity * bitmapSize)
    # Encodings can't be zero.
    self.assertGreater(length, 0)

  # Encodings must have complete overlap with the next higher (less
  # sparse) encoding.
  positionSets = [set(r["fingerprint"]["positions"]) for r in responses]
  for larger, smaller, smallerLength in zip(positionSets,
                                            positionSets[1:],
                                            lengths[1:]):
    self.assertEqual(len(larger & smaller), smallerLength)

  # Test that if you encode a second time, you get the same bitmap.
  for encoder, firstResponse in zip(encoders, responses):
    secondResponse = encoder.encode(text)
    self.assertEqual(
        hashlib.sha224(str(firstResponse)).hexdigest(),
        hashlib.sha224(str(secondResponse)).hexdigest())