def testWindowEncodings(self):
    """Test the CioEncoder for the sliding window encodings."""
    cio = CioEncoder(fingerprintType=EncoderTypes.word)

    text = """
      I grok people. I am people, so now I can say it in people talk. I've found
      out why people laugh. They laugh because it hurts so much, because it's
      the only thing that'll make it stop hurting."""

    tokens = TextPreprocess().tokenize(text)

    encodingDicts = cio.getWindowEncoding(tokens, minSparsity=0.19)
    
    # Test that only dense windows get encoded
    self.assertGreater(len(tokens), len(encodingDicts),
      "Returned incorrect number of window encodings.")

    # Test window
    windowEncoding = getTestData("cio_encoding_window.json")
    self.assertEqual(windowEncoding["text"], encodingDicts[-1]["text"],
      "Window encoding represents the wrong text.")
    self.assertLessEqual(encodingDicts[-1]["sparsity"], cio.unionSparsity,
      "Sparsity for large window is larger than the max.")
    self.assertSequenceEqual(
      windowEncoding["bitmap"], encodingDicts[-1]["bitmap"].tolist(),
      "Window encoding's bitmap is not as expected.")
Example #3
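The class below is shown without its imports. A plausible header with the
standard-library and NumPy imports it needs follows; the project-specific
module paths are assumptions, so they are left commented out:

import copy
import json
import operator
import os

import numpy

# Project-specific imports; the module paths below are assumptions (the
# identifiers come from Numenta's NuPIC / Cortical.io tooling), not verified
# against the original repository:
# from nupic.algorithms.KNNClassifier import KNNClassifier
# from htmresearch.encoders import EncoderTypes
# from htmresearch.encoders.cio_encoder import CioEncoder
# from htmresearch.support.text_preprocess import TextPreprocess
# from htmresearch.frameworks.nlp.classification_model import ClassificationModel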
class ClassificationModelWindows(ClassificationModel):
  """
  Class to run classification tasks with a sliding windwo of Coritcal.io word
  fingerprint encodings.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelWindow",
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               classifierMetric="rawOverlap",
               cacheRoot=None):

    super(ClassificationModelWindows, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    # window patterns below minSparsity will be skipped over
    self.minSparsity = 0.9 * unionSparsity

    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=verbosity-1)

    # need valid API key (see CioEncoder init for details)
    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))
    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=os.path.join(cacheRoot, "CioCache"),
                              fingerprintType=EncoderTypes.word,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)


  def encodeSample(self, sample):
    """
    Encode the input sample by querying the Cortical.io API for each word's
    SDR; the resulting bitmaps are unionized over a sliding window.

    @param sample     (list)        Tokenized sample, where each item is a str.
    @return           (list)        Pattern dicts for the windows, each with the
                                    sample text, sparsity, and bitmap.
    """
    return self.encoder.getWindowEncoding(sample, self.minSparsity)
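
  # For intuition, a minimal sketch of the sliding-window union idea, assuming
  # bitmaps are sequences of active-bit indices and sparsity is the active
  # fraction of the encoder's n total bits. slidingWindowUnion is hypothetical,
  # not the CioEncoder implementation:
  #
  #   def slidingWindowUnion(tokenBitmaps, n, minSparsity):
  #     windows = []
  #     for i in range(len(tokenBitmaps)):
  #       union = set()
  #       # Grow the window backwards from token i until it is dense enough.
  #       for bitmap in reversed(tokenBitmaps[:i + 1]):
  #         union.update(bitmap)
  #         if len(union) / float(n) >= minSparsity:
  #           windows.append({"sparsity": len(union) / float(n),
  #                           "bitmap": numpy.array(sorted(union))})
  #           break
  #     return windows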


  def writeOutEncodings(self):
    """
    Write the encoding dictionaries to a txt file; overrides the superclass
    implementation.
    """
    if not os.path.isdir(self.modelDir):
      raise ValueError("Invalid path to write file.")

    # Cast numpy arrays to list objects for serialization.
    jsonPatterns = copy.deepcopy(self.patterns)
    for jp in jsonPatterns:
      for tokenPattern in jp["pattern"]:
        tokenPattern["bitmap"] = tokenPattern.get(
          "bitmap", numpy.array([])).tolist()
      jp["labels"] = jp.get("labels", numpy.array([])).tolist()

    with open(os.path.join(self.modelDir, "encoding_log.txt"), "w") as f:
      f.write(json.dumps(jsonPatterns, indent=1))
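
  # The resulting encoding_log.txt holds JSON entries shaped like this
  # hypothetical example (field names from the code above; values are
  # illustrative):
  #
  #   [
  #    {"ID": 0,
  #     "labels": [1],
  #     "pattern": [
  #       {"text": "...", "sparsity": 0.19, "bitmap": [3, 17, 41]}
  #     ]}
  #   ]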


  def trainModel(self, i):
    # TODO: add batch training, where i is a list
    """
    Train the classifier on the sample and labels for record i. The list
    sampleReference is populated to correlate classifier prototypes to sample
    IDs. This model is unique in that a single sample contains multiple encoded
    patterns, of which, any that are too sparse are skipped over.

    @return       (int)     Number of patterns trained on.
    """
    patternWindows = self.patterns[i]["pattern"]
    if len(patternWindows) == 0:
      # No patterns because no windows were large enough for encoding.
      return 0
    count = 0
    for window in patternWindows:
      for label in self.patterns[i]["labels"]:
        self.classifier.learn(
          window["bitmap"], label, isSparse=self.encoder.n)
        self.sampleReference.append(self.patterns[i]["ID"])
        count += 1

    return count


  def testModel(self, i, seed=42):
    """
    Test the model on record i. Returns the classifications that are most
    frequent amongst the sample's individual token classifications.
    Unclassified terms are ignored; getWinningLabels() picks the most
    frequent classifications among those detected.

    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
    totalInferenceResult = None
    for pattern in self.patterns[i]["pattern"]:
      if not pattern:
        continue

      _, inferenceResult, _, _ = self.classifier.infer(
        self.sparsifyPattern(pattern["bitmap"], self.encoder.n))

      if totalInferenceResult is None:
        totalInferenceResult = inferenceResult
      else:
        totalInferenceResult += inferenceResult

    return self.getWinningLabels(totalInferenceResult, seed)


  def queryModel(self, query, preprocess=False):
    """
    Preprocesses the query, encodes it into a pattern, then queries the
    classifier to infer distances to trained-on samples.
    @return       (list)          Two-tuples of sample ID and distance, sorted
                                  closest to farthest from the query.
    """
    if preprocess:
      sample = TextPreprocess().tokenize(query,
                                         ignoreCommon=100,
                                         removeStrings=["[identifier deleted]"],
                                         correctSpell=True)
    else:
      sample = TextPreprocess().tokenize(query)

    # Get window patterns for the query; if the query is too short for any
    # window encoding to reach the minimum sparsity, fall back to a pure union.
    encodedQuery = self.encodeSample(sample)
    if len(encodedQuery) == 0:
      sample = " ".join(sample)
      fpInfo = self.encoder.getUnionEncoding(sample)
      encodedQuery = [{
        "text":fpInfo["text"],
        "sparsity":fpInfo["sparsity"],
        "bitmap":numpy.array(fpInfo["fingerprint"]["positions"])
      }]
    allDistances = self.infer(encodedQuery)

    if len(allDistances) != len(self.sampleReference):
      raise IndexError("Number of protoype distances must match number of "
                       "samples trained on.")

    sampleDistances = {}
    for uniqueID in self.sampleReference:
      sampleDistances[uniqueID] = min(
        [allDistances[i] for i, x in enumerate(self.sampleReference)
         if x == uniqueID])

    return sorted(sampleDistances.items(), key=operator.itemgetter(1))


  def infer(self, patterns):
    """
    Get the classifier output for a single input pattern; assumes classifier
    has an infer() method (as specified in NuPIC kNN implementation). For this
    model we sum the distances across the patterns and normalize
    before returning.

    NOTE: there is no check here that the pattern sparsities are > the minimum.

    @return       (numpy.array)       Each entry is the distance from the
        input pattern to that prototype (pattern in the classifier). All
        distances are between 0.0 and 1.0
    """
    distances = numpy.zeros(self.classifier._numPatterns)
    if not patterns:
      return distances

    for p in patterns:
      (_, _, dist, _) = self.classifier.infer(
        self.sparsifyPattern(p["bitmap"], self.encoder.n))

      distances = distances + dist

    return distances / float(len(patterns))
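
A hypothetical end-to-end use of the class, assuming the ClassificationModel
superclass initializes self.patterns and self.sampleReference as lists (the
training data, labels, and API key below are illustrative):

if __name__ == "__main__":
  model = ClassificationModelWindows(apiKey="YOUR_CIO_API_KEY", numLabels=1)

  # Illustrative training data: (text, label) pairs.
  trainData = [("I grok people, so now I can say it in people talk", 0),
               ("They laugh because it hurts so much", 1)]

  for uid, (text, label) in enumerate(trainData):
    tokens = TextPreprocess().tokenize(text)
    model.patterns.append({"ID": uid,
                           "pattern": model.encodeSample(tokens),
                           "labels": numpy.array([label])})
    model.trainModel(uid)

  # Rank the trained-on samples by distance to a new query.
  print(model.queryModel("why do people laugh"))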