示例#1
0
    def testWindowEncodings(self):
        """Verify the sliding window encodings produced by the CioEncoder."""
        encoder = CioEncoder(fingerprintType=EncoderTypes.word)

        text = """
      I grok people. I am people, so now I can say it in people talk. I've found
      out why people laugh. They laugh because it hurts so much, because it's
      the only thing that'll make it stop hurting."""

        tokens = TextPreprocess().tokenize(text)

        windows = encoder.getWindowEncoding(tokens, minSparsity=0.19)

        # Windows below the sparsity threshold are skipped, so there must be
        # fewer encodings than tokens.
        self.assertTrue(
            len(tokens) > len(windows),
            "Returned incorrect number of window encodings.")

        # Compare the final window against the stored reference encoding.
        expected = getTestData("cio_encoding_window.json")
        lastWindow = windows[-1]
        self.assertEqual(expected["text"], lastWindow["text"],
                         "Window encoding represents the wrong text.")
        self.assertTrue(lastWindow["sparsity"] <= encoder.unionSparsity,
                        "Sparsity for large window is larger than the max.")
        self.assertSequenceEqual(
            expected["bitmap"], lastWindow["bitmap"].tolist(),
            "Window encoding's bitmap is not as expected.")
示例#2
0
    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelWindow",
                 unionSparsity=0.20,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 classifierMetric="rawOverlap"):
        """Set up the kNN classifier and the Cortical.io window encoder."""
        super(ClassificationModelWindows, self).__init__(
            verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

        # Window patterns whose sparsity falls below this threshold are
        # skipped over.
        self.minSparsity = 0.9 * unionSparsity

        self.classifier = KNNClassifier(
            k=numLabels,
            distanceMethod=classifierMetric,
            exact=False,
            verbosity=verbosity - 1)

        # A valid Cortical.io API key is required (see CioEncoder init).
        cacheDir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "CioCache")
        self.encoder = CioEncoder(
            retinaScaling=retinaScaling,
            cacheDir=cacheDir,
            fingerprintType=EncoderTypes.word,
            unionSparsity=unionSparsity,
            retina=retina,
            apiKey=apiKey)
    def __init__(self,
                 fingerprintType=EncoderTypes.word,
                 unionSparsity=0.20,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 k=1,
                 classifierMetric="rawOverlap",
                 cacheRoot=None,
                 **kwargs):
        """
    Init the kNN classifier and the Cortical.io encoder; requires a valid
    API key (see the CioEncoder constructor for details).

    @param fingerprintType (EncoderTypes) Either EncoderTypes.document or
                           EncoderTypes.word.
    @raises ValueError     If fingerprintType is not a supported encoding.
    """
        super(ClassificationModelFingerprint, self).__init__(**kwargs)

        self.classifier = KNNClassifier(k=k,
                                        distanceMethod=classifierMetric,
                                        exact=False,
                                        verbosity=self.verbosity - 1)

        # Bug fix: the original check, `fingerprintType is (not
        # EncoderTypes.document or not EncoderTypes.word)`, reduced to
        # `fingerprintType is False` and therefore never raised for an
        # invalid encoding type. Test membership instead.
        if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
            raise ValueError("Invalid type of fingerprint encoding; see the "
                             "EncoderTypes class for eligible types.")

        self.encoder = CioEncoder(retinaScaling=retinaScaling,
                                  fingerprintType=fingerprintType,
                                  unionSparsity=unionSparsity,
                                  retina=retina,
                                  apiKey=apiKey,
                                  cacheDir=cacheRoot)

        # Tracks the document currently being classified (None when idle).
        self.currentDocument = None
    def initModel(self):
        """
    Initialize and return the network; self.networkConfig must already be set.
    """
        # The encoder determines the LanguageSensor output width.
        encoder = CioEncoder(retinaScaling=self.retinaScaling,
                             retina=self.retina,
                             apiKey=self.apiKey,
                             maxSparsity=self.maxSparsity,
                             verbosity=self.verbosity - 1)
        return configureNetwork(None, self.networkConfig, encoder)
示例#5
0
    def testWordFingerprint(self):
        """Test the Cortical.io term (word-level) encoding."""
        encoder = CioEncoder(fingerprintType=EncoderTypes.word)
        response = encoder.encode(self.text)

        # The response must carry all the standard fingerprint fields.
        self.assertFingerprintFields(response)

        # Compare against the stored reference encoding.
        expected = getTestData("cio_encoding_word.json")
        self.assertEqual(expected["fingerprint"]["positions"],
                         response["fingerprint"]["positions"],
                         "Cio bitmap is not as expected.")
    def testRetinaScaling(self):
        """Test the CioEncoder for retina dimension scaling."""
        fullEncoder = CioEncoder(retinaScaling=1.0,
                                 fingerprintType=EncoderTypes.document)
        halfEncoder = CioEncoder(retinaScaling=0.5,
                                 fingerprintType=EncoderTypes.document)
        oddEncoder = CioEncoder(retinaScaling=0.71,
                                fingerprintType=EncoderTypes.document)

        # Retina dimensions should shrink by the scaling factor.
        self.assertAlmostEqual(int(0.5 * fullEncoder.width),
                               halfEncoder.width)
        self.assertAlmostEqual(int(0.5 * fullEncoder.height),
                               halfEncoder.height)
        self.assertAlmostEqual(int(0.71 * fullEncoder.height),
                               oddEncoder.height)

        fullResponse = fullEncoder.encode(self.text)
        halfResponse = halfEncoder.encode(self.text)
        oddResponse = oddEncoder.encode(self.text)

        # Each bit position should be scaled down by retinaScaling*retinaScaling
        self.assertLessEqual(
            halfResponse["fingerprint"]["positions"].sum(),
            0.5 * 0.5 * fullResponse["fingerprint"]["positions"].sum())

        self.assertLessEqual(
            oddResponse["fingerprint"]["positions"].sum(),
            0.71 * 0.71 * fullResponse["fingerprint"]["positions"].sum())

        # The number of on bits in a scaled retina should normally be slightly
        # less than in the original, but can be equal in some cases.
        self.assertLessEqual(len(halfResponse["fingerprint"]["positions"]),
                             len(fullResponse["fingerprint"]["positions"]))
        self.assertLessEqual(len(halfResponse["fingerprint"]["positions"]),
                             len(oddResponse["fingerprint"]["positions"]))

        # Check that encodeIntoArray works even with weird scaling.
        output = numpy.zeros(oddEncoder.width * oddEncoder.height)
        oddEncoder.encodeIntoArray(self.text, output)
        self.assertEqual(len(oddResponse["fingerprint"]["positions"]),
                         output.sum())
示例#7
0
  def __init__(self, verbosity=1, numLabels=1):
    """
    Initialize the CorticalClient and CioEncoder. Requires a valid API key.
    """
    super(ClassificationModelContext, self).__init__(verbosity)

    cacheDir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "CioCache")
    self.encoder = CioEncoder(cacheDir=cacheDir)
    self.client = CorticalClient(self.encoder.apiKey)

    # Encoding width, and the on-bit count implied by the target sparsity
    # (targetSparsity is a percentage).
    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity / 100) * self.n)

    self.categoryBitmaps = {}
    self.numLabels = numLabels
示例#8
0
  def _initModel(self, k):
    """
    Configure and build the classification network.
    """
    encoder = CioEncoder(retinaScaling=self.retinaScaling,
                         retina=self.retina,
                         fingerprintType=EncoderTypes.document,
                         apiKey=self.apiKey,
                         verbosity=self.verbosity-1)

    # Patch the classifier region parameters before wiring up the network.
    regionParams = modelConfig["classifierRegionConfig"]["regionParams"]
    regionParams["k"] = k
    regionParams["maxCategoryCount"] = self.numLabels
    self.networkConfig = modelConfig
    self.network = configureNetwork(None, self.networkConfig, encoder)
示例#9
0
  def _initModel(self, k):
    """
    Configure and build the classification network.
    """
    cacheDir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "CioCache")
    encoder = CioEncoder(retinaScaling=self.retinaScaling,
                         cacheDir=cacheDir,
                         retina=self.retina,
                         fingerprintType=EncoderTypes.document,
                         apiKey=self.apiKey)

    # Patch the classifier region parameters before wiring up the network.
    regionParams = modelConfig["classifierRegionConfig"]["regionParams"]
    regionParams["k"] = k
    regionParams["maxCategoryCount"] = self.numLabels
    self.networkConfig = modelConfig
    self.network = configureNetwork(None, self.networkConfig, encoder)
示例#10
0
    def initModel(self):
        """
    Initialize the network; self.networkDataPath may be None for no stream.
    """
        # Open a record stream over the data file when a path is set.
        recordStream = None
        if self.networkDataPath is not None:
            recordStream = FileRecordStream(streamID=self.networkDataPath)

        cacheDir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "CioCache")
        # This encoder specifies the LanguageSensor output width.
        encoder = CioEncoder(retinaScaling=self.retinaScaling,
                             cacheDir=cacheDir,
                             retina=self.retina,
                             apiKey=self.apiKey)

        return configureNetwork(recordStream, self.networkConfig, encoder)
示例#11
0
    def testRetinaScaling(self):
        """Test the CioEncoder for retina dimension scaling."""
        scaledEncoder = CioEncoder(retinaScaling=0.25,
                                   fingerprintType=EncoderTypes.document)
        response = scaledEncoder.encode(self.text)

        # Compare against the stored reference encoding for a scaled retina.
        expected = getTestData("cio_encoding_scaled_retina.json")
        self.assertEqual(expected["fingerprint"]["positions"],
                         response["fingerprint"]["positions"],
                         "Cio bitmap is not as expected.")

        # The scaled fingerprint should be no larger than the full-retina one.
        fullExpected = getTestData("cio_encoding_document.json")
        scaledLength = len(response["fingerprint"]["positions"])
        fullLength = len(fullExpected["fingerprint"]["positions"])

        self.assertTrue(
            scaledLength <= fullLength,
            "Retina scaling did not decrease the fingerprint size.")
示例#12
0
    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelEndpoint",
                 unionSparsity=0.20):
        """
    Initializes the encoder as CioEncoder; requires a valid API key.
    """
        super(ClassificationModelEndpoint, self).__init__(
            verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

        cacheDir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "CioCache")
        self.encoder = CioEncoder(cacheDir=cacheDir,
                                  unionSparsity=unionSparsity)
        self.compareEncoder = LanguageEncoder()

        # Encoding width, and the on-bit count implied by the target sparsity
        # (targetSparsity is a percentage).
        self.n = self.encoder.n
        self.w = int((self.encoder.targetSparsity / 100) * self.n)

        self.categoryBitmaps = {}
        self.negatives = defaultdict(list)
        self.positives = defaultdict(list)
示例#13
0
    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelFingerprint",
                 fingerprintType=EncoderTypes.word,
                 unionSparsity=0.20,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 classifierMetric="rawOverlap",
                 cacheRoot=None):
        """
    Init kNN classifier and Cortical.io encoder; need valid API key (see
    CioEncoder init for details).

    @param fingerprintType (EncoderTypes) Either EncoderTypes.document or
                           EncoderTypes.word.
    @raises ValueError     If fingerprintType is not a supported encoding.
    """
        super(ClassificationModelFingerprint,
              self).__init__(verbosity=verbosity,
                             numLabels=numLabels,
                             modelDir=modelDir)

        self.classifier = KNNClassifier(k=numLabels,
                                        distanceMethod=classifierMetric,
                                        exact=False,
                                        verbosity=verbosity - 1)

        # Bug fix: the original check, `fingerprintType is (not
        # EncoderTypes.document or not EncoderTypes.word)`, reduced to
        # `fingerprintType is False` and therefore never raised for an
        # invalid encoding type. Test membership instead. Also fixed the
        # "Invaid"/"eligble" typos in the error message.
        if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
            raise ValueError("Invalid type of fingerprint encoding; see the "
                             "EncoderTypes class for eligible types.")

        cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

        self.encoder = CioEncoder(retinaScaling=retinaScaling,
                                  cacheDir=os.path.join(cacheRoot, "CioCache"),
                                  fingerprintType=fingerprintType,
                                  unionSparsity=unionSparsity,
                                  retina=retina,
                                  apiKey=apiKey)
    def testMaxSparsity(self):
        """Test that CioEncoder's maxSparsity works."""

        # This text seems to generate bitmaps with about 8% sparsity
        text = (
            "Smoking harms nearly every organ in your body. Over 7000 chemicals"
            " have been identified in tobacco smoke. After reading all this"
            " James and Sue decided to abruptly quit cigarette smoking to"
            " improve their health but it clearly was not an easy decision.")

        # Encoders with maxSparsity of 100%, 10%, 5%, and 1%, ordered from
        # least to most restrictive.
        sparsities = [1.0, 0.1, 0.05, 0.01]
        encoders = [CioEncoder(maxSparsity=s,
                               fingerprintType=EncoderTypes.document)
                    for s in sparsities]

        bitmapSize = encoders[0].width * encoders[0].height
        responses = [encoder.encode(text) for encoder in encoders]
        lengths = [len(r["fingerprint"]["positions"]) for r in responses]

        # Encodings must have no more than the desired sparsity, both as the
        # reported sparsity value and as an on-bit count.
        for response, maxSparsity in zip(responses, sparsities):
            self.assertLessEqual(response["sparsity"], maxSparsity)
        for length, maxSparsity in zip(lengths, sparsities):
            self.assertLessEqual(length, maxSparsity * bitmapSize)

        # Encodings can't be zero
        for length in lengths:
            self.assertGreater(length, 0)

        # Each encoding must have complete overlap with the next less-sparse
        # encoding.
        bitSets = [set(r["fingerprint"]["positions"]) for r in responses]
        for denser, sparser, sparserLength in zip(bitSets, bitSets[1:],
                                                  lengths[1:]):
            self.assertEqual(len(denser & sparser), sparserLength)

        # Test that if you encode a second time, you get the same bitmap
        for encoder, response in zip(encoders, responses):
            repeat = encoder.encode(text)
            self.assertEqual(
                hashlib.sha224(str(response)).hexdigest(),
                hashlib.sha224(str(repeat)).hexdigest())