def testWindowEncodings(self):
    """Test the CioEncoder for the sliding window encodings."""
    cio = CioEncoder(fingerprintType=EncoderTypes.word)

    text = """
      I grok people. I am people, so now I can say it in people talk. I've found
      out why people laugh. They laugh because it hurts so much, because it's
      the only thing that'll make it stop hurting."""

    tokens = TextPreprocess().tokenize(text)

    encodingDicts = cio.getWindowEncoding(tokens, minSparsity=0.19)
    
    # Test that only dense windows get encoded
    self.assertGreater(len(tokens), len(encodingDicts),
      "Returned incorrect number of window encodings.")

    # Test window
    windowEncoding = getTestData("cio_encoding_window.json")
    self.assertEqual(windowEncoding["text"], encodingDicts[-1]["text"],
      "Window encoding represents the wrong text.")
    self.assertLessEqual(encodingDicts[-1]["sparsity"], cio.unionSparsity,
      "Sparsity for large window is larger than the max.")
    self.assertSequenceEqual(
      windowEncoding["bitmap"], encodingDicts[-1]["bitmap"].tolist(),
      "Window encoding's bitmap is not as expected.")
Example #3
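The class below is shown without its imports. A plausible header with the
standard-library and NumPy imports it needs follows; the project-specific
module paths are assumptions, so they are left commented out:

import copy
import json
import operator
import os

import numpy

# Project-specific imports; the module paths below are assumptions (the
# identifiers come from Numenta's NuPIC / Cortical.io tooling), not verified
# against the original repository:
# from nupic.algorithms.KNNClassifier import KNNClassifier
# from htmresearch.encoders import EncoderTypes
# from htmresearch.encoders.cio_encoder import CioEncoder
# from htmresearch.support.text_preprocess import TextPreprocess
# from htmresearch.frameworks.nlp.classification_model import ClassificationModel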
class ClassificationModelWindows(ClassificationModel):
  """
  Class to run classification tasks with a sliding windwo of Coritcal.io word
  fingerprint encodings.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelWindow",
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               classifierMetric="rawOverlap",
               cacheRoot=None):

    super(ClassificationModelWindows, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    # window patterns below minSparsity will be skipped over
    self.minSparsity = 0.9 * unionSparsity

    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=verbosity-1)

    # need valid API key (see CioEncoder init for details)
    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))
    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=os.path.join(cacheRoot, "CioCache"),
                              fingerprintType=EncoderTypes.word,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)


  def encodeSample(self, sample):
    """
    Encode the input sample by querying the Cortical.io API for each word's
    SDR; the resulting bitmaps are unionized over a sliding window.

    @param sample     (list)        Tokenized sample, where each item is a str.
    @return           (list)        Pattern dicts for the windows, each with the
                                    sample text, sparsity, and bitmap.
    """
    return self.encoder.getWindowEncoding(sample, self.minSparsity)
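
  # For intuition, a minimal sketch of the sliding-window union idea, assuming
  # bitmaps are sequences of active-bit indices and sparsity is the active
  # fraction of the encoder's n total bits. slidingWindowUnion is hypothetical,
  # not the CioEncoder implementation:
  #
  #   def slidingWindowUnion(tokenBitmaps, n, minSparsity):
  #     windows = []
  #     for i in range(len(tokenBitmaps)):
  #       union = set()
  #       # Grow the window backwards from token i until it is dense enough.
  #       for bitmap in reversed(tokenBitmaps[:i + 1]):
  #         union.update(bitmap)
  #         if len(union) / float(n) >= minSparsity:
  #           windows.append({"sparsity": len(union) / float(n),
  #                           "bitmap": numpy.array(sorted(union))})
  #           break
  #     return windows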


  def writeOutEncodings(self):
    """
    Write the encoding dictionaries to a txt file; overrides the superclass
    implementation.
    """
    if not os.path.isdir(self.modelDir):
      raise ValueError("Invalid path to write file.")

    # Cast numpy arrays to list objects for serialization.
    jsonPatterns = copy.deepcopy(self.patterns)
    for jp in jsonPatterns:
      for tokenPattern in jp["pattern"]:
        tokenPattern["bitmap"] = tokenPattern.get(
          "bitmap", numpy.array([])).tolist()
      jp["labels"] = jp.get("labels", numpy.array([])).tolist()

    with open(os.path.join(self.modelDir, "encoding_log.txt"), "w") as f:
      f.write(json.dumps(jsonPatterns, indent=1))
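
  # The resulting encoding_log.txt holds JSON entries shaped like this
  # hypothetical example (field names from the code above; values are
  # illustrative):
  #
  #   [
  #    {"ID": 0,
  #     "labels": [1],
  #     "pattern": [
  #       {"text": "...", "sparsity": 0.19, "bitmap": [3, 17, 41]}
  #     ]}
  #   ]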


  def trainModel(self, i):
    # TODO: add batch training, where i is a list
    """
    Train the classifier on the sample and labels for record i. The list
    sampleReference is populated to correlate classifier prototypes to sample
    IDs. This model is unique in that a single sample contains multiple encoded
    patterns, of which, any that are too sparse are skipped over.

    @return       (int)     Number of patterns trained on.
    """
    patternWindows = self.patterns[i]["pattern"]
    if len(patternWindows) == 0:
      # No patterns because no windows were large enough for encoding.
      return 0
    count = 0
    for window in patternWindows:
      for label in self.patterns[i]["labels"]:
        self.classifier.learn(
          window["bitmap"], label, isSparse=self.encoder.n)
        self.sampleReference.append(self.patterns[i]["ID"])
        count += 1

    return count


  def testModel(self, i, seed=42):
    """
    Test the model on record i. Returns the classifications that are most
    frequent amongst the sample's individual token classifications.
    Unclassified terms are ignored; getWinningLabels() picks the most
    frequent classifications among those detected.

    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
    totalInferenceResult = None
    for pattern in self.patterns[i]["pattern"]:
      if not pattern:
        continue

      _, inferenceResult, _, _ = self.classifier.infer(
        self.sparsifyPattern(pattern["bitmap"], self.encoder.n))

      if totalInferenceResult is None:
        totalInferenceResult = inferenceResult
      else:
        totalInferenceResult += inferenceResult

    return self.getWinningLabels(totalInferenceResult, seed)


  def queryModel(self, query, preprocess=False):
    """
    Preprocesses the query, encodes it into a pattern, then queries the
    classifier to infer distances to trained-on samples.
    @return       (list)          Two-tuples of sample ID and distance, sorted
                                  closest to farthest from the query.
    """
    if preprocess:
      sample = TextPreprocess().tokenize(query,
                                         ignoreCommon=100,
                                         removeStrings=["[identifier deleted]"],
                                         correctSpell=True)
    else:
      sample = TextPreprocess().tokenize(query)

    # Get window patterns for the query; if the query is too short for any
    # window encoding to reach the minimum sparsity, fall back to a pure union.
    encodedQuery = self.encodeSample(sample)
    if len(encodedQuery) == 0:
      sample = " ".join(sample)
      fpInfo = self.encoder.getUnionEncoding(sample)
      encodedQuery = [{
        "text":fpInfo["text"],
        "sparsity":fpInfo["sparsity"],
        "bitmap":numpy.array(fpInfo["fingerprint"]["positions"])
      }]
    allDistances = self.infer(encodedQuery)

    if len(allDistances) != len(self.sampleReference):
      raise IndexError("Number of protoype distances must match number of "
                       "samples trained on.")

    sampleDistances = {}
    for uniqueID in self.sampleReference:
      sampleDistances[uniqueID] = min(
        [allDistances[i] for i, x in enumerate(self.sampleReference)
         if x == uniqueID])

    return sorted(sampleDistances.items(), key=operator.itemgetter(1))


  def infer(self, patterns):
    """
    Get the classifier output for a single input pattern; assumes classifier
    has an infer() method (as specified in NuPIC kNN implementation). For this
    model we sum the distances across the patterns and normalize
    before returning.

    NOTE: there is no check here that the pattern sparsities are > the minimum.

    @return       (numpy.array)       Each entry is the distance from the
        input pattern to that prototype (pattern in the classifier). All
        distances are between 0.0 and 1.0
    """
    distances = numpy.zeros(self.classifier._numPatterns)
    if not patterns:
      return distances

    for p in patterns:
      (_, _, dist, _) = self.classifier.infer(
        self.sparsifyPattern(p["bitmap"], self.encoder.n))

      distances = distances + dist

    return distances / float(len(patterns))
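
A hypothetical end-to-end use of the class, assuming the ClassificationModel
superclass initializes self.patterns and self.sampleReference as lists (the
training data, labels, and API key below are illustrative):

if __name__ == "__main__":
  model = ClassificationModelWindows(apiKey="YOUR_CIO_API_KEY", numLabels=1)

  # Illustrative training data: (text, label) pairs.
  trainData = [("I grok people, so now I can say it in people talk", 0),
               ("They laugh because it hurts so much", 1)]

  for uid, (text, label) in enumerate(trainData):
    tokens = TextPreprocess().tokenize(text)
    model.patterns.append({"ID": uid,
                           "pattern": model.encodeSample(tokens),
                           "labels": numpy.array([label])})
    model.trainModel(uid)

  # Rank the trained-on samples by distance to a new query.
  print(model.queryModel("why do people laugh"))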