예제 #1
0
class CioEncoder(LanguageEncoder):
  """
  A language encoder using the Cortical.io API.

  The encoder queries the Cortical.io REST API via the cortipy module, which
  returns data in the form of "fingerprints". These representations are
  converted to binary SDR arrays with this Cio encoder.
  """

  def __init__(self, w=128, h=128, cacheDir="./cache", verbosity=0):
    """
    @param w          (int)   Width of the encoding; stored as self.w.
    @param h          (int)   Height of the encoding; stored as self.h.
    @param cacheDir   (str)   Forwarded to CorticalClient as its cacheDir.
    @param verbosity  (int)   Diagnostic messages are printed when > 0.

    @raise OSError when the CORTICAL_API_KEY environment variable is unset.
    """
    if 'CORTICAL_API_KEY' not in os.environ:
      print ("Missing CORTICAL_API_KEY environment variable. If you have a "
        "key, set it with $ export CORTICAL_API_KEY=api_key\n"
        "You can retrieve a key by registering for the REST API at "
        "http://www.cortical.io/resources_apikey.html")
      # OSError is an Exception subclass, so handlers written against the
      # previous bare Exception keep working.
      raise OSError("Missing API key.")

    self.apiKey         = os.environ['CORTICAL_API_KEY']
    self.client         = CorticalClient(self.apiKey, cacheDir=cacheDir)
    self.targetSparsity = 5.0
    self.w              = w
    self.h              = h
    self.n              = w*h
    self.verbosity      = verbosity
    # getDescription() returns this attribute, so it has to exist.
    self.description    = ("Cio Encoder", 0)


  def encode(self, text):
    """
    Encodes the input text w/ a cortipy client. The client returns a
    dictionary of "fingerprint" info, including the SDR bitmap.

    If the client raises for the full text, the result of _subEncoding() is
    returned instead (which may be an empty dict).

    @param  text    (str)             A non-tokenized sample of text.
    @return         (dict)            Result from the cortipy client. The bitmap
                                      encoding is at
                                      encoding["fingerprint"]["positions"].
    """
    try:
      encoding = self.client.getTextBitmap(text)
    except Exception:
      # Deliberately broad best-effort fallback: any client failure falls
      # through to the per-token substitute encoding.
      if self.verbosity > 0:
        print("\tThe client returned no encoding for the text, so we'll use "
          "the encoding of the token that is least frequent in the corpus.")
      encoding = self._subEncoding(text)

    return encoding


  def decode(self, encoding, numTerms=None):
    """
    Converts an SDR back into the most likely word or words.

    The SDR is converted to a bitmap and sent to the cortipy client. When
    numTerms is given it is forwarded to the client; otherwise the client's
    own default applies.

    @param  encoding        (list)            SDR.
    @param  numTerms        (int)             The max number of terms to return.
    @return                 (list)            List of (term, score) tuples
                                              built from the client response.
    """
    # Convert SDR to bitmap, send to cortipy client. The client lives on the
    # instance; a bare `client` name is not defined in this scope.
    bitmap = super(CioEncoder, self).bitmapFromSDR(encoding)
    if numTerms is None:
      terms = self.client.bitmapToTerms(bitmap)
    else:
      terms = self.client.bitmapToTerms(bitmap, numTerms=numTerms)
    # Convert cortipy response to list of tuples (term, weight)
    return [(term["term"], term["score"]) for term in terms]


  def _subEncoding(self, text):
    """
    Substitute encoding: tokenize the text with the client, encode every
    token, and keep the encoding with the smallest "df" value.

    @param text             (str)             A non-tokenized sample of text.
    @return encoding        (dict)            Fingerprint from cortipy client.
                                              An empty dictionary if the text
                                              could not be encoded.
    """
    # Each item returned by tokenize() is split on commas and the pieces are
    # flattened into one list of tokens.
    tokens = list(itertools.chain.from_iterable(
      [t.split(',') for t in self.client.tokenize(text)]))
    try:
      encoding = min([self.client.getBitmap(t) for t in tokens],
        key=lambda x: x["df"])
    except Exception:
      # Covers client failures as well as min() on an empty token list.
      encoding = {}

    return encoding


  ## TODO: redo fields? delete?
  def _createFromBitmap(self, bitmap, width, height):
    """
    Store the bitmap and dimensions on the encoder and compute the sparsity
    (as a percentage) from them.

    @return self
    """
    self.bitmap = bitmap
    self.w = width
    self.h = height
    self.sparsity = (100.0 * len(bitmap)) / (width*height)
    return self


  def compare(self, encoding1, encoding2):
    """
    Compare encodings, returning the distances between the SDRs.
    Example return dict:
      {
        "cosineSimilarity": 0.6666666666666666,
        "euclideanDistance": 0.3333333333333333,
        "jaccardDistance": 0.5,
        "overlappingAll": 6,
        "overlappingLeftRight": 0.6666666666666666,
        "overlappingRightLeft": 0.6666666666666666,
        "sizeLeft": 9,
        "sizeRight": 9,
        "weightedScoring": 0.4436476984102028
      }
    """
    # Format input SDRs as Cio fingerprints
    fp1 = {"fingerprint": {"positions":self.bitmapFromSDR(encoding1)}}
    fp2 = {"fingerprint": {"positions":self.bitmapFromSDR(encoding2)}}

    return self.client.compare(fp1, fp2)


  def getWidth(self):
    """Return the width (w) the encoder was configured with."""
    return self.w


  def getHeight(self):
    """Return the height (h) the encoder was configured with."""
    return self.h


  def getDescription(self):
    """Return the description tuple set in the constructor."""
    return self.description
예제 #2
0
class CioEncoder(LanguageEncoder):
    """
    A language encoder using the Cortical.io API.

    The encoder queries the Cortical.io REST API via the cortipy module, which
    returns data in the form of "fingerprints". These representations are
    converted to binary SDR arrays with this Cio encoder.
    """
    def __init__(self,
                 w=128,
                 h=128,
                 retina=DEFAULT_RETINA,
                 cacheDir="./cache",
                 verbosity=0,
                 fingerprintType=EncoderTypes.document):
        """
        @param w               (int)      Width dimension of the SDR topology.
        @param h               (int)      Height dimension of the SDR topology.
        @param retina          (str)      Forwarded to CorticalClient.
        @param cacheDir        (str)      Where to cache results of API queries.
        @param verbosity       (int)      Amount of info printed out, 0, 1, or 2.
        @param fingerprintType (Enum)     Specify word- or document-level
                                          encoding.

        @raise OSError when the CORTICAL_API_KEY environment variable is unset.
        """
        if "CORTICAL_API_KEY" not in os.environ:
            print(
                "Missing CORTICAL_API_KEY environment variable. If you have a "
                "key, set it with $ export CORTICAL_API_KEY=api_key\n"
                "You can retrieve a key by registering for the REST API at "
                "http://www.cortical.io/resources_apikey.html")
            raise OSError("Missing API key.")

        self.apiKey = os.environ["CORTICAL_API_KEY"]
        self.client = CorticalClient(self.apiKey,
                                     retina=retina,
                                     cacheDir=cacheDir)
        self.targetSparsity = 5.0
        self.w = w
        self.h = h
        self.n = w * h
        self.verbosity = verbosity
        self.fingerprintType = fingerprintType
        self.description = ("Cio Encoder", 0)

    def encode(self, text):
        """
        Encodes the input text w/ a cortipy client. The client returns a
        dictionary of "fingerprint" info, including the SDR bitmap.

        NOTE: returning this fingerprint dict differs from the base class spec.

        @param  text    (str)         A non-tokenized sample of text.
        @return         (dict)        Result from the cortipy client. The
                                      bitmap encoding is at
                                      encoding["fingerprint"]["positions"].
                                      None for empty text, or when no
                                      substitute encoding could be produced.
        """
        if not text:
            return None
        try:
            if self.fingerprintType is EncoderTypes.document:
                encoding = self.client.getTextBitmap(text)
            elif self.fingerprintType is EncoderTypes.word:
                encoding = self.getUnionEncoding(text)
            else:
                # Any other fingerprint type previously left `encoding`
                # unbound and crashed at the return; fall back to the
                # term-level endpoint instead.
                encoding = self.client.getBitmap(text)
        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print(
                    "\tThe client returned no encoding for the text \'{0}\', so "
                    "we'll use the encoding of the token that is least frequent in "
                    "the corpus.".format(text))
            encoding = self._subEncoding(text)

        return encoding

    def getUnionEncoding(self, text):
        """
        Encode each token of the input text, take the union, and then sparsify.

        @param  text    (str)         A non-tokenized sample of text.
        @return         (dict)        The bitmap encoding is at
                                      encoding["fingerprint"]["positions"].
        """
        tokens = TextPreprocess().tokenize(text)

        # Count the ON bits represented in the encoded tokens.
        counts = Counter()
        for t in tokens:
            bitmap = self.client.getBitmap(t)["fingerprint"]["positions"]
            counts.update(bitmap)

        positions = self.sparseUnion(counts)

        # Populate encoding; sparsity is expressed as a percentage of self.n.
        encoding = {
            "text": text,
            "sparsity": len(positions) * 100 / float(self.n),
            "df": 0.0,
            "height": self.h,
            "width": self.w,
            "score": 0.0,
            "fingerprint": {
                "positions": sorted(positions)
            },
            "pos_types": []
        }

        return encoding

    def encodeIntoArray(self, inputText, output):
        """
        See method description in language_encoder.py. It is expected the
        inputText is a single word/token (str).

        NOTE: nupic Encoder class method encodes output in place as sparse
        array, but this method returns a bitmap dict and leaves `output`
        untouched.

        @raise TypeError when inputText is not a str.
        """
        if not isinstance(inputText, str):
            raise TypeError(
                "Expected a string input but got input of type {}.".format(
                    type(inputText)))

        # Encode with term endpoint of Cio API
        try:
            encoding = self.client.getBitmap(inputText)
        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print(
                    "\tThe client returned no encoding for the text \'{0}\', so "
                    "we'll use the encoding of the token that is least frequent in "
                    "the corpus.".format(inputText))
            encoding = self._subEncoding(inputText)

        return encoding

    def decode(self, encoding, numTerms=10):
        """
        Converts an SDR back into the most likely word or words.

        The bitmap and numTerms are forwarded to the cortipy client; its
        response is flattened into (term, score) tuples.

        @param  encoding        (list)        Bitmap encoding.
        @param  numTerms        (int)         The max number of terms to return.
        @return                 (list)        List of (term, score) tuples.
        """
        terms = self.client.bitmapToTerms(encoding, numTerms=numTerms)
        # Convert cortipy response to list of tuples (term, weight)
        return [(term["term"], term["score"]) for term in terms]

    def _subEncoding(self, text, method="keyword"):
        """
        Produce a substitute encoding for text the client could not encode.

        @param text       (str)       A non-tokenized sample of text.
        @param method     (str)       "df" keeps the token encoding with the
                                      smallest "df" value; "keyword" uses
                                      getUnionEncoding().
        @return encoding  (dict)      Fingerprint from cortipy client, or None
                                      if the text could not be encoded.

        @raise ValueError for an unknown method.
        """
        try:
            if method == "df":
                # Tokenize only on the path that uses the tokens; this avoids
                # a client call when method is "keyword".
                tokens = list(
                    itertools.chain.from_iterable(
                        [t.split(',') for t in self.client.tokenize(text)]))
                encoding = min([self.client.getBitmap(t) for t in tokens],
                               key=lambda x: x["df"])
            elif method == "keyword":
                encoding = self.getUnionEncoding(text)
            else:
                raise ValueError("method must be either \'df\' or \'keyword\'")
        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print(
                    "\tThe client returned no substitute encoding for the text "
                    "\'{0}\', so we encode with None.".format(text))
            encoding = None

        return encoding

    def compare(self, bitmap1, bitmap2):
        """
        Compare encodings, returning the distances between the SDRs. Input
        bitmaps must be list objects (need to be serializable).

        Example return dict:
          {
            "cosineSimilarity": 0.6666666666666666,
            "euclideanDistance": 0.3333333333333333,
            "jaccardDistance": 0.5,
            "overlappingAll": 6,
            "overlappingLeftRight": 0.6666666666666666,
            "overlappingRightLeft": 0.6666666666666666,
            "sizeLeft": 9,
            "sizeRight": 9,
            "weightedScoring": 0.4436476984102028
          }

        @raise TypeError when either bitmap is not a list.
        """
        # Check each argument on its own: `a and b` evaluates to just one of
        # the two operands, so a combined isinstance() validates only one.
        if not (isinstance(bitmap1, list) and isinstance(bitmap2, list)):
            raise TypeError("Comparison bitmaps must be lists.")

        return self.client.compare(bitmap1, bitmap2)

    def createCategory(self, label, positives, negatives=None):
        """
        Create a classification category (bitmap) via the Cio classify
        endpoint.

        @param label      (str)     Name of category.
        @param positives  (list)    Bitmap(s) of samples to define.
        @param negatives  (list)    Not required to make category.

        @return           (dict)    Key-values for "positions" (list bitmap
                                    encoding of the category) and
                                    "categoryName" (str).

        @raise TypeError when positives or negatives is not a list.
        """
        if negatives is None:
            negatives = []
        # Validate both arguments individually; `positives and negatives`
        # collapses to a single operand (the empty default list, typically).
        if not (isinstance(positives, list) and isinstance(negatives, list)):
            raise TypeError("Input bitmaps must be lists.")

        return self.client.createClassification(label, positives, negatives)

    def getWidth(self):
        """Return the total number of bits in the encoding (w * h)."""
        return self.n

    def getDescription(self):
        """Return the description tuple set in the constructor."""
        return self.description
예제 #3
0
class CioEncoder(LanguageEncoder):
    """
    A language encoder using the Cortical.io API.

    The encoder queries the Cortical.io REST API via the cortipy module, which
    returns data in the form of "fingerprints". These representations are
    converted to binary SDR arrays with this Cio encoder.
    """
    def __init__(self, w=128, h=128):
        """
        @param w    (int)   Width of the encoding; stored as self.w.
        @param h    (int)   Height of the encoding; stored as self.h.

        @raise OSError when the CORTICAL_API_KEY environment variable is unset.
        """
        if 'CORTICAL_API_KEY' not in os.environ:
            print(
                "Missing CORTICAL_API_KEY environment variable. If you have a "
                "key, set it with $ export CORTICAL_API_KEY=api_key\n"
                "You can retrieve a key by registering for the REST API at "
                "http://www.cortical.io/resources_apikey.html")
            # OSError is an Exception subclass, so handlers written against
            # the previous bare Exception keep working.
            raise OSError("Missing API key.")

        self.apiKey = os.environ['CORTICAL_API_KEY']
        # The os module has no join(); the cache path is a plain literal.
        self.client = CorticalClient(self.apiKey, cacheDir="./cache")
        self.targetSparsity = 1.0
        self.w = w  ## Alternatively get dimensions from cortipy client object?
        self.h = h
        self.n = w * h
        # getDescription() returns this attribute, so it has to exist.
        self.description = ("Cio Encoder", 0)

    def encode(self, text):
        """
        Encodes the input text w/ a cortipy client. The client returns a
        dictionary of "fingerprint" info, including the SDR bitmap.

        When the client reports a sparsity of 0, a pseudo-random bitmap with
        the target sparsity is generated (seeded from the input so the same
        input yields the same bitmap) and used in place of the empty one.

        @param  text    (str, list)   If the input is type str, the encoder
                                      assumes it has not yet been tokenized. A
                                      list input will skip the tokenization
                                      step.
        @return                       Whatever self.client.getSDR() returns
                                      for the chosen bitmap positions.
        """
        # random.seed() needs a hashable, stable value; the tokenized text
        # may be a list, so derive the seed from the caller's input up front.
        seed = text if isinstance(text, str) else repr(text)

        if isinstance(text, str):
            text = self.client.tokenize(text)

        try:
            encoding = self.client.getBitmap(text)
        except ValueError:
            encoding = self.client.getTextBitmap(text)

        positions = encoding["fingerprint"]["positions"]

        # The client result is a dict, so sparsity is read by key.
        if encoding["sparsity"] == 0:  ##TODO: test again when/if this happens
            # No fingerprint so fill w/ random bitmap, seeded for each specific term.
            print(
                "\tThe client returned a bitmap with sparsity=0 for the string "
                "\'%s\', so we'll generate a pseudo-random SDR with the target "
                "sparsity=%0.1f." % (text, self.targetSparsity))
            num = self.w * self.h
            state = random.getstate()
            try:
                random.seed(seed)
                bitmap = random.sample(range(num),
                                       int(self.targetSparsity * num / 100))
            finally:
                # Always hand the global RNG back in the state we found it.
                random.setstate(state)
            self._createFromBitmap(bitmap, self.w, self.h)
            # Use the generated bitmap for the returned SDR; the client's
            # dict is left unmodified.
            positions = bitmap

        return self.client.getSDR(positions)

    def decode(self, encoding, numTerms=None):
        """
        Converts an SDR back into the most likely word or words.

        The SDR is converted to a bitmap and sent to the cortipy client. When
        numTerms is given it is forwarded to the client; otherwise the
        client's own default applies.

        @param  encoding        (list)         SDR.
        @param  numTerms        (int)          The max number of terms to
                                               return.
        @return                 (list)         List of (term, score) tuples
                                               built from the client response.
        """
        # Convert SDR to bitmap, send to cortipy client. The client lives on
        # the instance; a bare `client` name is not defined in this scope.
        bitmap = super(CioEncoder, self).bitmapFromSDR(encoding)
        if numTerms is None:
            terms = self.client.bitmapToTerms(bitmap)
        else:
            terms = self.client.bitmapToTerms(bitmap, numTerms=numTerms)
        # Convert cortipy response to list of tuples (term, weight)
        return [(term["term"], term["score"]) for term in terms]

    ## TODO: redo fields? delete?
    def _createFromBitmap(self, bitmap, width, height):
        """
        Store the bitmap and dimensions on the encoder and compute the
        sparsity (as a percentage) from them.

        @return self
        """
        self.bitmap = bitmap
        self.w = width
        self.h = height
        self.sparsity = (100.0 * len(bitmap)) / (width * height)
        return self

    def compare(self, encoding1, encoding2):
        """
        Compare encodings, returning the distances between the SDRs.
        Example return dict:
          {
            "cosineSimilarity": 0.6666666666666666,
            "euclideanDistance": 0.3333333333333333,
            "jaccardDistance": 0.5,
            "overlappingAll": 6,
            "overlappingLeftRight": 0.6666666666666666,
            "overlappingRightLeft": 0.6666666666666666,
            "sizeLeft": 9,
            "sizeRight": 9,
            "weightedScoring": 0.4436476984102028
          }
        """
        # Format input SDRs as Cio fingerprints
        fp1 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding1)}}
        fp2 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding2)}}

        return self.client.compare(fp1, fp2)

    def getWidth(self):
        """Return the width (w) the encoder was configured with."""
        return self.w

    def getHeight(self):
        """Return the height (h) the encoder was configured with."""
        return self.h

    def getDescription(self):
        """Return the description tuple set in the constructor."""
        return self.description
예제 #4
0
class CioEncoder(LanguageEncoder):
    """
    A language encoder using the Cortical.io API.

    The encoder queries the Cortical.io REST API via the cortipy module, which
    returns data in the form of "fingerprints". These representations are
    converted to binary SDR arrays with this Cio encoder.
    """
    def __init__(self,
                 retina=DEFAULT_RETINA,
                 retinaScaling=1.0,
                 cacheDir=None,
                 verbosity=0,
                 fingerprintType=EncoderTypes.document,
                 unionSparsity=0.20,
                 apiKey=None,
                 maxSparsity=0.50):
        """
        @param retina          (str)    Cortical.io retina, either
                                        "en_synonymous" or "en_associative".
        @param retinaScaling   (float)  Scale each dimension of the SDR bitmap
                                        by this factor.
        @param cacheDir        (str)    Where to cache results of API queries.
                                        Defaults to a "CioCache" directory
                                        next to this file.
        @param verbosity       (int)    Amount of info printed out, 0, 1, or 2.
        @param fingerprintType (Enum)   Specify word- or document-level
                                        encoding.
        @param unionSparsity   (float)  Any union'ing done in this encoder will
                                        stop once this sparsity is reached.
        @param apiKey          (str)    Cortical.io API key. When not given,
                                        the CORTICAL_API_KEY environment
                                        variable is used.
        @param maxSparsity     (float)  The maximum sparsity of the returned
                                        bitmap. If the percentage of bits in
                                        the encoding is > maxSparsity, it will
                                        be randomly subsampled.

        @raise OSError when no API key is given and CORTICAL_API_KEY is unset.

        TODO: replace enum with a simple string
        """
        # Resolve the key once. A falsy apiKey (None or "") falls back to the
        # environment, and a missing variable then yields the documented
        # OSError rather than a KeyError further down.
        if not apiKey:
            apiKey = os.environ.get("CORTICAL_API_KEY")
        if apiKey is None:
            print(
                "Missing CORTICAL_API_KEY environment variable. If you have a "
                "key, set it with $ export CORTICAL_API_KEY=api_key\n"
                "You can retrieve a key by registering for the REST API at "
                "http://www.cortical.io/resources_apikey.html")
            raise OSError("Missing API key.")

        super(CioEncoder, self).__init__(unionSparsity=unionSparsity)

        if cacheDir is None:
            root = os.path.dirname(os.path.realpath(__file__))
            cacheDir = os.path.join(root, "CioCache")

        self.apiKey = apiKey
        self.client = CorticalClient(self.apiKey,
                                     retina=retina,
                                     cacheDir=cacheDir)

        self._setDimensions(retina, retinaScaling)

        self.fingerprintType = fingerprintType
        self.description = ("Cio Encoder", 0)

        self.verbosity = verbosity
        self.maxSparsity = maxSparsity

    def _setDimensions(self, retina, scalingFactor):
        """
        Set width, height, n and the effective retinaScaling from the retina's
        entry in RETINA_SIZES.

        @raise ValueError when scalingFactor is not in (0, 1].
        """
        if scalingFactor <= 0 or scalingFactor > 1:
            raise ValueError(
                "Retina can only be scaled by values between 0 and 1.")

        # Only the retina's "width" is read; it is used for both dimensions.
        retinaDim = RETINA_SIZES[retina]["width"]

        self.width = int(retinaDim * scalingFactor)
        self.height = int(retinaDim * scalingFactor)
        # Recompute the scaling from the truncated integer width.
        self.retinaScaling = float(self.width) / retinaDim
        self.n = self.width * self.height

    def encode(self, text):
        """
        Encodes the input text w/ a cortipy client. The client returns a
        dictionary of "fingerprint" info, including the SDR bitmap as a
        numpy.array.

        NOTE: returning this fingerprint dict differs from the base class spec.

        @param  text    (str)         A non-tokenized sample of text.
        @return         (dict)        Result from the cortipy client. The
                                      bitmap encoding is at
                                      encoding["fingerprint"]["positions"].
                                      None when neither the client nor the
                                      substitute encoding produced a result.

        @raise TypeError when text is not a string.
        """
        if not isinstance(text, str) and not isinstance(text, unicode):
            raise TypeError(
                "Expected a string input but got input of type {}.".format(
                    type(text)))

        try:
            if self.fingerprintType == EncoderTypes.document:
                encoding = self.client.getTextBitmap(text)

            elif self.fingerprintType == EncoderTypes.word:
                encoding = self.getUnionEncoding(text)

            else:
                encoding = self.client.getBitmap(text)

        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print(
                    "\tThe client returned no encoding for the text \'{0}\', so "
                    "we'll use the encoding of the token that is least frequent in "
                    "the corpus.".format(text))
            encoding = self._subEncoding(text)

        # _subEncoding() returns None when it also fails; finishEncoding()
        # indexes into the dict and cannot accept that.
        if encoding is None:
            return None

        return self.finishEncoding(encoding)

    def getUnionEncoding(self, text):
        """
        Encode each token of the input text, take the union, and then sparsify.

        @param  text    (str)         A non-tokenized sample of text.
        @return         (dict)        The bitmap encoding is at
                                      encoding["fingerprint"]["positions"].
        """
        tokens = TextPreprocess().tokenize(text)

        # Count the ON bits represented in the encoded tokens.
        counts = Counter()
        for t in tokens:
            bitmap = self._getWordBitmap(t)
            counts.update(bitmap)

        positions = self.sparseUnion(counts)

        # Populate encoding; sparsity is a fraction of self.n.
        encoding = {
            "text": text,
            "sparsity": len(positions) / float(self.n),
            "df": 0.0,
            "height": self.height,
            "width": self.width,
            "score": 0.0,
            "fingerprint": {
                "positions": sorted(positions)
            },
            "pos_types": []
        }

        return encoding

    def getWindowEncoding(self, tokens, minSparsity=0.0):
        """
        The encodings simulate a "sliding window", where the encoding
        representation of a given token is a union of its bitmap with the
        immediately previous tokens' bitmaps, up to the maximum sparsity. The
        returned list only includes those windows with sparsities larger than
        the minimum.

        @param tokens           (list)  Tokenized string.
        @param minSparsity      (float) Only window encodings denser than this
                                        value will be included.
        @return windowBitmaps   (list)  Dict for each token, with entries for
                                        the window's token strings ("text"),
                                        sparsity float, and bitmap numpy array.
        """
        if self.fingerprintType != EncoderTypes.word:
            print("Although the encoder type is not set for words, the window "
                  "encodings use word-level fingerprints.")

        bitmaps = [numpy.array(self._getWordBitmap(t)) for t in tokens]

        windowBitmaps = []
        for tokenIndex, windowBitmap in enumerate(bitmaps):
            # Each index in the tokens list is the end of a possible window.
            # windowStart is the earliest token actually merged into it; it is
            # tracked explicitly because the inner loop variable is unbound
            # for the first token and, after a break, points at a token that
            # was rejected rather than included.
            windowStart = tokenIndex
            for i in reversed(xrange(tokenIndex)):
                # From the current token, increase the window by successively
                # adding the previous tokens.
                windowSparsity = len(windowBitmap) / float(self.n)
                nextSparsity = len(bitmaps[i]) / float(self.n)
                if windowSparsity + nextSparsity > self.unionSparsity:
                    # stopping criterion reached -- window is full
                    break
                # add bitmap to the current window bitmap
                windowBitmap = numpy.union1d(windowBitmap, bitmaps[i])
                windowStart = i

            sparsity = len(windowBitmap) / float(self.n)
            if sparsity > minSparsity:
                # only include windows of sufficient density
                windowBitmaps.append({
                    "text": tokens[windowStart:tokenIndex + 1],
                    "sparsity": sparsity,
                    "bitmap": numpy.array(windowBitmap)
                })

        return windowBitmaps

    def finishEncoding(self, encoding):
        """
        Scale the fingerprint of the encoding dict (if specified) and fill the
        width, height, and sparsity fields. The dict is modified in place.

        @param encoding     (dict)    Dict as returned by the Cio client.
        @return encoding    (dict)    Same format as the input dict, with the
                                      dimensions and sparsity fields populated
                                      and the positions as a numpy array.
        """
        if self.retinaScaling != 1:
            encoding["fingerprint"]["positions"] = self.scaleEncoding(
                encoding["fingerprint"]["positions"], self.retinaScaling**2)
            encoding["width"] = self.width
            encoding["height"] = self.height

        encoding["fingerprint"]["positions"] = numpy.array(
            encoding["fingerprint"]["positions"])

        encoding["sparsity"] = len(
            encoding["fingerprint"]["positions"]) / float(
                (encoding["width"] * encoding["height"]))

        # Reduce sparsity if needed
        if encoding["sparsity"] > self.maxSparsity:
            self.reduceSparsity(encoding, self.maxSparsity)

        return encoding

    def _getWordBitmap(self, term):
        """
        Return a bitmap for the word. If the Cortical.io API can't encode,
        cortipy will use a random encoding for the word.
        """
        return self.client.getBitmap(term)["fingerprint"]["positions"]

    def encodeIntoArray(self, inputText, output):
        """
        Encodes inputText and puts the encoded value into the numpy output
        array, which is a 1-D array of length returned by getWidth(). The
        array is left all zeros when no encoding could be produced.
        """
        encoding = self.encode(inputText)
        output[:] = 0
        if encoding is None:
            return
        positions = encoding["fingerprint"]["positions"]
        if positions.size > 0:
            output[positions] = 1

    def decode(self, encoding, numTerms=10):
        """
        Converts an SDR back into the most likely word or words.

        The bitmap and numTerms are forwarded to the cortipy client; its
        response is flattened into (term, score) tuples.

        @param  encoding        (list)        Bitmap encoding.
        @param  numTerms        (int)         The max number of terms to return.
        @return                 (list)        List of (term, score) tuples.
        """
        terms = self.client.bitmapToTerms(encoding, numTerms=numTerms)
        # Convert cortipy response to list of tuples (term, weight)
        return [(term["term"], term["score"]) for term in terms]

    def _subEncoding(self, text, method="keyword"):
        """
        Produce a substitute encoding for text the client could not encode.

        @param text       (str)       A non-tokenized sample of text.
        @param method     (str)       "df" keeps the token encoding with the
                                      smallest "df" value; "keyword" uses
                                      getUnionEncoding().
        @return encoding  (dict)      Fingerprint from cortipy client, or None
                                      if the text could not be encoded.

        @raise ValueError for an unknown method.
        """
        try:
            if method == "df":
                tokens = list(
                    itertools.chain.from_iterable(
                        [t.split(",") for t in self.client.tokenize(text)]))
                encoding = min([self.client.getBitmap(t) for t in tokens],
                               key=lambda x: x["df"])
            elif method == "keyword":
                encoding = self.getUnionEncoding(text)
            else:
                raise ValueError("method must be either 'df' or 'keyword'")
        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print(
                    "\tThe client returned no substitute encoding for the text "
                    "'{}', so we encode with None.".format(text))
            encoding = None

        return encoding

    def compare(self, bitmap1, bitmap2):
        """
        Compare encodings, returning the distances between the SDRs. Input
        bitmaps must be list objects (need to be serializable).

        Example return dict:
          {
            "cosineSimilarity": 0.6666666666666666,
            "euclideanDistance": 0.3333333333333333,
            "jaccardDistance": 0.5,
            "overlappingAll": 6,
            "overlappingLeftRight": 0.6666666666666666,
            "overlappingRightLeft": 0.6666666666666666,
            "sizeLeft": 9,
            "sizeRight": 9,
            "weightedScoring": 0.4436476984102028
          }

        @raise TypeError when either bitmap is not a list.
        """
        # Check each argument on its own: `a and b` evaluates to just one of
        # the two operands, so a combined isinstance() validates only one.
        if not (isinstance(bitmap1, list) and isinstance(bitmap2, list)):
            raise TypeError("Comparison bitmaps must be lists.")

        return self.client.compare(bitmap1, bitmap2)

    def createCategory(self, label, positives, negatives=None):
        """
        Create a classification category (bitmap) via the Cio classify
        endpoint.

        @param label      (str)     Name of category.
        @param positives  (list)    Bitmap(s) of samples to define.
        @param negatives  (list)    Not required to make category.

        @return           (dict)    Key-values for "positions" (list bitmap
                                    encoding of the category) and
                                    "categoryName" (str).

        @raise TypeError when positives or negatives is not a list.
        """
        if negatives is None:
            negatives = []
        # Validate both arguments individually; `positives and negatives`
        # collapses to a single operand (the empty default list, typically).
        if not (isinstance(positives, list) and isinstance(negatives, list)):
            raise TypeError("Input bitmaps must be lists.")

        return self.client.createClassification(label, positives, negatives)

    def getWidth(self):
        """Return the total number of bits in the encoding (width * height)."""
        return self.n

    def getDimensions(self):
        """Return the (width, height) tuple of the scaled retina."""
        return (self.width, self.height)

    def getDescription(self):
        """Return the description tuple set in the constructor."""
        return self.description

    def densifyPattern(self, bitmap):
        """Return a numpy array of 0s and 1s to represent the given bitmap."""
        densePattern = numpy.zeros(self.n)
        for i in bitmap:
            densePattern[i] = 1.0
        return densePattern

    def reduceSparsity(self, encoding, maxSparsity):
        """
        Reduce the sparsity of the encoding down to maxSparsity by randomly
        subsampling its positions. The encoding dict is modified in place.

        @param encoding     (dict)    Encoding whose positions are a numpy
                                      array (as set by finishEncoding()).
        @param maxSparsity  (float)   Target fraction of ON bits.
        """
        # Slice bounds must be integers; the product with a float sparsity
        # is a float.
        desiredBits = int(maxSparsity * encoding["width"] * encoding["height"])
        bitmap = encoding["fingerprint"]["positions"]

        # Choose a random subsampling of the bits but seed the random number
        # generator so we get consistent bitmaps.
        # NOTE: this reseeds numpy's global random state.
        numpy.random.seed(bitmap.sum())
        encoding["fingerprint"]["positions"] = (
            numpy.random.permutation(bitmap)[0:desiredBits])

        encoding["sparsity"] = len(
            encoding["fingerprint"]["positions"]) / float(
                (encoding["width"] * encoding["height"]))
예제 #5
0
class CioEncoder(LanguageEncoder):
  """
  A language encoder using the Cortical.io API.

  The encoder queries the Cortical.io REST API via the cortipy module, which
  returns data in the form of "fingerprints". These representations are
  converted to binary SDR arrays with this Cio encoder.
  """

  def __init__(self, w=128, h=128, cacheDir="./cache", verbosity=0):
    """
    @param w          (int)   Fingerprint width, reported by getWidth().
    @param h          (int)   Fingerprint height, reported by getHeight().
    @param cacheDir   (str)   Forwarded to CorticalClient as its cacheDir.
    @param verbosity  (int)   Values above 0 enable diagnostic printing.
    @raise OSError            If the CORTICAL_API_KEY environment variable is
                              not set.
    """
    if 'CORTICAL_API_KEY' not in os.environ:
      print ("Missing CORTICAL_API_KEY environment variable. If you have a "
        "key, set it with $ export CORTICAL_API_KEY=api_key\n"
        "You can retrieve a key by registering for the REST API at "
        "http://www.cortical.io/resources_apikey.html")
      raise OSError("Missing API key.")

    self.apiKey         = os.environ['CORTICAL_API_KEY']
    self.client         = CorticalClient(self.apiKey, cacheDir=cacheDir)
    # Percentage of the w*h bits kept by the "keyword" substitute encoding.
    self.targetSparsity = 5.0
    self.w              = w
    self.h              = h
    self.n              = w*h
    self.verbosity      = verbosity
    # getDescription() reads this attribute and nothing else in this class
    # assigns it.
    self.description    = ("Cio Encoder", 0)


  def encode(self, text):
    """
    Encodes the input text w/ a cortipy client. The client returns a
    dictionary of "fingerprint" info, including the SDR bitmap.

    @param  text    (str)             A non-tokenized sample of text.
    @return         (dict)            Result from the cortipy client. The bitmap
                                      encoding is at
                                      encoding["fingerprint"]["positions"].
                                      None is returned for empty text, or when
                                      neither the client nor the substitute
                                      encoding produced a result.
    """
    if not text:
      return None
    try:
      encoding = self.client.getTextBitmap(text)
    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print ("\tThe client returned no encoding for the text \'{0}\', so "
               "we'll use the encoding of the token that is least frequent in "
               "the corpus.".format(text))
      encoding = self._subEncoding(text)

    return encoding


  def decode(self, encoding, numTerms=10):
    """
    Converts an SDR back into the most likely word or words.

    The Cortical.io API will attempt to return up to numTerms terms. The return
    value is a sequence of (term, weight) tuples, where higher weights imply
    the corresponding term better matches the encoding.

    @param  encoding        (list)            Bitmap encoding.
    @param  numTerms        (int)             The max number of terms to return.
    @return                 (list)            List of (term, score) tuples.
    """
    # The client lives on the instance; there is no module-level `client`.
    terms = self.client.bitmapToTerms(encoding, numTerms=numTerms)
    # Convert cortipy response to list of tuples (term, weight)
    return [(term["term"], term["score"]) for term in terms]


  def _subEncoding(self, text, method="df"):
    """
    Build a substitute encoding from the individual tokens of the text.

    @param text             (str)             A non-tokenized sample of text.
    @param method           (str)             "df" picks the token fingerprint
                                              with the smallest "df" value;
                                              "keyword" builds a union of the
                                              token bitmaps capped at
                                              targetSparsity percent of the bits.
    @return encoding        (dict)            Fingerprint dict, or None if the
                                              text could not be encoded.
    @raise ValueError                         If method is not "df" or
                                              "keyword".
    """
    tokens = list(itertools.chain.from_iterable(
      [t.split(',') for t in self.client.tokenize(text)]))
    try:
      if method == "df":
        candidates = [self.client.getBitmap(t) for t in tokens]
        # min() raises ValueError on an empty sequence, so text that yields no
        # tokens is reported as "no encoding" instead.
        encoding = (min(candidates, key=lambda x: x["df"])
                    if candidates else None)
      elif method == "keyword":
        # Take a union of the bitmaps
        counts = Counter()
        for t in tokens:
          bitmap = self.client.getBitmap(t)["fingerprint"]["positions"]
          counts.update(bitmap)

        # Sample to remain sparse: keep the most frequently set positions.
        max_sparsity = int((self.targetSparsity / 100) * self.n)
        w = min(len(counts), max_sparsity)
        positions = [c[0] for c in counts.most_common(w)]

        # Populate encoding
        encoding = {
            "text": text,
            "sparsity": w * 100 / float(self.n),
            "df": 0.0,
            "height": self.h,
            "width": self.w,
            "score": 0.0,
            "fingerprint": {
              "positions":sorted(positions)
              },
            "pos_types": []
            }
      else:
        raise ValueError("method must be either \'df\' or \'keyword\'")
    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print ("\tThe client returned no substitute encoding for the text "
               "\'{0}\', so we encode with None.".format(text))
      encoding = None

    return encoding


  def compare(self, encoding1, encoding2):
    """
    Compare encodings, returning the distances between the SDRs.
    Example return dict:
      {
        "cosineSimilarity": 0.6666666666666666,
        "euclideanDistance": 0.3333333333333333,
        "jaccardDistance": 0.5,
        "overlappingAll": 6,
        "overlappingLeftRight": 0.6666666666666666,
        "overlappingRightLeft": 0.6666666666666666,
        "sizeLeft": 9,
        "sizeRight": 9,
        "weightedScoring": 0.4436476984102028
      }
    """
    # Format input SDRs as Cio fingerprints
    fp1 = {"fingerprint": {"positions":self.bitmapFromSDR(encoding1)}}
    fp2 = {"fingerprint": {"positions":self.bitmapFromSDR(encoding2)}}

    return self.client.compare(fp1, fp2)


  def getWidth(self):
    """Return the fingerprint width given to the constructor."""
    return self.w


  def getHeight(self):
    """Return the fingerprint height given to the constructor."""
    return self.h


  def getDescription(self):
    """Return the encoder's description attribute."""
    return self.description
예제 #6
0
class CioEncoder(LanguageEncoder):
  """
  A language encoder using the Cortical.io API.

  The encoder queries the Cortical.io REST API via the cortipy module, which
  returns data in the form of "fingerprints". These representations are
  converted to binary SDR arrays with this Cio encoder.
  """

  def __init__(self, retina=DEFAULT_RETINA, retinaScaling=1.0, cacheDir=None,
               verbosity=0, fingerprintType=EncoderTypes.document,
               unionSparsity=0.20, apiKey=None,
               maxSparsity=0.50):
    """
    @param retina          (str)      Cortical.io retina, either "en_synonymous"
                                      or "en_associative".
    @param retinaScaling   (float)    Scale each dimension of the SDR bitmap
                                      by this factor.
    @param cacheDir        (str)      Where to cache results of API queries.
                                      Defaults to a "CioCache" directory next
                                      to this file.
    @param verbosity       (int)      Amount of info printed out, 0, 1, or 2.
    @param fingerprintType (Enum)     Specify word- or document-level encoding.
    @param unionSparsity   (float)    Any union'ing done in this encoder will
                                      stop once this sparsity is reached.
    @param apiKey          (str)      Cortical.io API key. When not given, the
                                      CORTICAL_API_KEY environment variable is
                                      used.
    @param maxSparsity     (float)    The maximum sparsity of the returned
                                      bitmap. If the percentage of bits in the
                                      encoding is > maxSparsity, it will be
                                      randomly subsampled.
    @raise OSError                    If no API key is given and the
                                      CORTICAL_API_KEY environment variable is
                                      not set.

    TODO: replace enum with a simple string
    """
    # An empty apiKey falls back to the environment below, so it has to be
    # treated as missing here too (otherwise the lookup raises KeyError).
    if not apiKey and "CORTICAL_API_KEY" not in os.environ:
      print ("Missing CORTICAL_API_KEY environment variable. If you have a "
        "key, set it with $ export CORTICAL_API_KEY=api_key\n"
        "You can retrieve a key by registering for the REST API at "
        "http://www.cortical.io/resources_apikey.html")
      raise OSError("Missing API key.")

    super(CioEncoder, self).__init__(unionSparsity=unionSparsity)

    if cacheDir is None:
      root = os.path.dirname(os.path.realpath(__file__))
      cacheDir = os.path.join(root, "CioCache")

    self.apiKey = apiKey if apiKey else os.environ["CORTICAL_API_KEY"]
    self.client = CorticalClient(self.apiKey, retina=retina, cacheDir=cacheDir)

    self._setDimensions(retina, retinaScaling)

    self.fingerprintType = fingerprintType
    self.description = ("Cio Encoder", 0)

    self.verbosity = verbosity
    self.maxSparsity = maxSparsity


  def _setDimensions(self, retina, scalingFactor):
    """
    Set width, height, n and the effective retinaScaling from the retina size.

    @param retina         (str)     Key into RETINA_SIZES.
    @param scalingFactor  (float)   Must satisfy 0 < scalingFactor <= 1.
    @raise ValueError               If scalingFactor is outside that range.
    """
    if scalingFactor <= 0 or scalingFactor > 1:
      raise ValueError("Retina can only be scaled by values between 0 and 1.")

    # The retina's "width" entry is used for both dimensions.
    retinaDim = RETINA_SIZES[retina]["width"]

    self.width = int(retinaDim * scalingFactor)
    self.height = int(retinaDim * scalingFactor)
    # Recompute the factor after truncation so it matches the integer width.
    self.retinaScaling = float(self.width)/retinaDim
    self.n = self.width * self.height


  def encode(self, text):
    """
    Encodes the input text w/ a cortipy client. The client returns a
    dictionary of "fingerprint" info, including the SDR bitmap as a numpy.array.

    NOTE: returning this fingerprint dict differs from the base class spec.

    @param  text    (str)             A non-tokenized sample of text.
    @return         (dict)            Result from the cortipy client. The bitmap
                                      encoding is at
                                      encoding["fingerprint"]["positions"].
                                      None if neither the client nor the
                                      substitute encoding produced a result.
    @raise TypeError                  If text is not a string.
    """
    # `unicode` only exists on Python 2; fall back to str alone elsewhere.
    try:
      stringTypes = (str, unicode)
    except NameError:
      stringTypes = (str,)
    if not isinstance(text, stringTypes):
      raise TypeError("Expected a string input but got input of type {}."
                      .format(type(text)))

    try:
      if self.fingerprintType == EncoderTypes.document:
        encoding = self.client.getTextBitmap(text)

      elif self.fingerprintType == EncoderTypes.word:
        encoding = self.getUnionEncoding(text)

      else:
        encoding = self.client.getBitmap(text)

    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print ("\tThe client returned no encoding for the text \'{0}\', so "
               "we'll use the encoding of the token that is least frequent in "
               "the corpus.".format(text))
      encoding = self._subEncoding(text)

    # _subEncoding() reports failure with None, which finishEncoding() cannot
    # index into.
    if encoding is None:
      return None

    return self.finishEncoding(encoding)


  def getUnionEncoding(self, text):
    """
    Encode each token of the input text, take the union, and then sparsify.

    @param  text    (str)             A non-tokenized sample of text.
    @return         (dict)            The bitmap encoding is at
                                      encoding["fingerprint"]["positions"].
    """
    tokens = TextPreprocess().tokenize(text)

    # Count the ON bits represented in the encoded tokens.
    counts = Counter()
    for t in tokens:
      bitmap = self._getWordBitmap(t)
      counts.update(bitmap)

    positions = self.sparseUnion(counts)

    # Populate encoding
    encoding = {
        "text": text,
        "sparsity": len(positions) / float(self.n),
        "df": 0.0,
        "height": self.height,
        "width": self.width,
        "score": 0.0,
        "fingerprint": {
            "positions":sorted(positions)
            },
        "pos_types": []
        }

    return encoding


  def getWindowEncoding(self, tokens, minSparsity=0.0):
    """
    The encodings simulate a "sliding window", where the encoding representation
    of a given token is a union of its bitmap with the immediately previous
    tokens' bitmaps, up to the maximum sparsity. The returned list only includes
    those windows with sparsities larger than the minimum.

    @param tokens           (list)  Tokenized string.
    @param minSparsity      (float) Only window encodings denser than this value
                                    will be included.
    @return windowBitmaps   (list)  Dict for each token, with entries for the
                                    window's tokens ("text"), sparsity float,
                                    and bitmap numpy array.
    """
    if self.fingerprintType != EncoderTypes.word:
      print ("Although the encoder type is not set for words, the window "
        "encodings use word-level fingerprints.")

    bitmaps = [numpy.array(self._getWordBitmap(t)) for t in tokens]

    windowBitmaps = []
    for tokenIndex, windowBitmap in enumerate(bitmaps):
      # Each index in the tokens list is the end of a possible window.
      # windowStart tracks the earliest token actually merged into the window.
      # Reading the loop variable after the loop is not safe: it is unbound for
      # the first token and, after a break, names the token that did not fit.
      windowStart = tokenIndex
      for i in reversed(range(tokenIndex)):
        # From the current token, increase the window by successively adding the
        # previous tokens.
        windowSparsity = len(windowBitmap) / float(self.n)
        nextSparsity = len(bitmaps[i]) / float(self.n)
        if windowSparsity + nextSparsity > self.unionSparsity:
          # stopping criterion reached -- window is full
          break
        # add bitmap to the current window bitmap
        windowBitmap = numpy.union1d(windowBitmap, bitmaps[i])
        windowStart = i

      sparsity = len(windowBitmap) / float(self.n)
      if sparsity > minSparsity:
        # only include windows of sufficient density
        windowBitmaps.append(
          {"text": tokens[windowStart:tokenIndex+1],
           "sparsity": sparsity,
           "bitmap": numpy.array(windowBitmap)})

    return windowBitmaps


  def finishEncoding(self, encoding):
    """
    Scale the fingerprint of the encoding dict (if specified) and fill the
    width, height, and sparsity fields.

    @param encoding       (dict)      Dict as returned by the Cio client.
    @return encoding      (dict)      Same format as the input dict, with the
                                      dimensions and sparsity fields populated.
    """
    if self.retinaScaling != 1:
      encoding["fingerprint"]["positions"] = self.scaleEncoding(
        encoding["fingerprint"]["positions"], self.retinaScaling**2)
      encoding["width"] = self.width
      encoding["height"] = self.height

    encoding["fingerprint"]["positions"] = numpy.array(
      encoding["fingerprint"]["positions"])

    encoding["sparsity"] = len(encoding["fingerprint"]["positions"]) / float(
      (encoding["width"] * encoding["height"]))

    # Reduce sparsity if needed
    if encoding["sparsity"] > self.maxSparsity:
      self.reduceSparsity(encoding, self.maxSparsity)

    return encoding


  def _getWordBitmap(self, term):
    """
    Return a bitmap for the word. If the Cortical.io API can't encode, cortipy
    will use a random encoding for the word.
    """
    return self.client.getBitmap(term)["fingerprint"]["positions"]


  def encodeIntoArray(self, inputText, output):
    """
    Encodes inputText and puts the encoded value into the numpy output array,
    which is a 1-D array of length returned by getWidth(). The output is left
    all zeros when encode() produces no encoding.
    """
    encoding = self.encode(inputText)
    output[:] = 0
    if encoding is None:
      return
    if encoding["fingerprint"]["positions"].size > 0:
      output[encoding["fingerprint"]["positions"]] = 1


  def decode(self, encoding, numTerms=10):
    """
    Converts an SDR back into the most likely word or words.

    The Cortical.io API will attempt to return up to numTerms terms. The return
    value is a sequence of (term, weight) tuples, where higher weights imply
    the corresponding term better matches the encoding.

    @param  encoding        (list)            Bitmap encoding.
    @param  numTerms        (int)             The max number of terms to return.
    @return                 (list)            List of (term, score) tuples.
    """
    terms = self.client.bitmapToTerms(encoding, numTerms=numTerms)
    # Convert cortipy response to list of tuples (term, weight)
    return [(term["term"], term["score"]) for term in terms]


  def _subEncoding(self, text, method="keyword"):
    """
    Build a substitute encoding from the individual tokens of the text.

    @param text             (str)             A non-tokenized sample of text.
    @param method           (str)             "df" picks the token fingerprint
                                              with the smallest "df" value;
                                              "keyword" uses getUnionEncoding().
    @return encoding        (dict)            Fingerprint dict, or None if the
                                              text could not be encoded.
    @raise ValueError                         If method is not "df" or
                                              "keyword".
    """
    try:
      if method == "df":
        tokens = list(itertools.chain.from_iterable(
          [t.split(",") for t in self.client.tokenize(text)]))
        candidates = [self.client.getBitmap(t) for t in tokens]
        # min() raises ValueError on an empty sequence, so text that yields no
        # tokens is reported as "no encoding" instead.
        encoding = (min(candidates, key=lambda x: x["df"])
                    if candidates else None)
      elif method == "keyword":
        encoding = self.getUnionEncoding(text)
      else:
        raise ValueError("method must be either 'df' or 'keyword'")
    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print ("\tThe client returned no substitute encoding for the text "
               "'{}', so we encode with None.".format(text))
      encoding = None

    return encoding


  def compare(self, bitmap1, bitmap2):
    """
    Compare encodings, returning the distances between the SDRs. Input bitmaps
    must be list objects (need to be serializable).

    Example return dict:
      {
        "cosineSimilarity": 0.6666666666666666,
        "euclideanDistance": 0.3333333333333333,
        "jaccardDistance": 0.5,
        "overlappingAll": 6,
        "overlappingLeftRight": 0.6666666666666666,
        "overlappingRightLeft": 0.6666666666666666,
        "sizeLeft": 9,
        "sizeRight": 9,
        "weightedScoring": 0.4436476984102028
      }

    @raise TypeError  If either bitmap is not a list.
    """
    # `bitmap1 and bitmap2` evaluates to just one of the two operands, so each
    # argument is checked on its own.
    if not (isinstance(bitmap1, list) and isinstance(bitmap2, list)):
      raise TypeError("Comparison bitmaps must be lists.")

    return self.client.compare(bitmap1, bitmap2)


  def createCategory(self, label, positives, negatives=None):
    """
    Create a classification category (bitmap) via the Cio classify endpoint.

    @param label      (str)     Name of category.
    @param positives  (list)    Bitmap(s) of samples to define the category.
    @param negatives  (list)    Not required to make category; an empty list is
                                sent when omitted.

    @return           (dict)    Key-values for "positions" (list bitmap encoding
                                of the category) and "categoryName" (str).
    @raise TypeError            If positives or negatives is not a list.
    """
    if negatives is None:
      negatives = []
    # `positives and negatives` evaluates to just one of the two operands, so
    # each argument is checked on its own.
    if not (isinstance(positives, list) and isinstance(negatives, list)):
      raise TypeError("Input bitmaps must be lists.")

    return self.client.createClassification(label, positives, negatives)


  def getWidth(self):
    """Return the total number of bits, width * height."""
    return self.n


  def getDimensions(self):
    """Return the (width, height) of the scaled retina."""
    return (self.width, self.height)


  def getDescription(self):
    """Return the encoder's description attribute."""
    return self.description


  def densifyPattern(self, bitmap):
    """Return a numpy array of 0s and 1s to represent the given bitmap."""
    densePattern = numpy.zeros(self.n)
    for i in bitmap:
      densePattern[i] = 1.0
    return densePattern


  def reduceSparsity(self, encoding, maxSparsity):
    """
    Subsample the encoding's bitmap, in place, down to maxSparsity.

    The encoding dict is modified directly: its
    encoding["fingerprint"]["positions"] entry is replaced by a random subset
    of at most int(maxSparsity * width * height) positions, and
    encoding["sparsity"] is recomputed from the new bitmap length.

    @param encoding     (dict)    Must provide "width", "height" and
                                  ["fingerprint"]["positions"].
    @param maxSparsity  (float)   Fraction of the width * height bits that may
                                  remain ON.
    """
    totalBits = encoding["width"] * encoding["height"]
    # The product is a float, and slice bounds must be integers.
    desiredBits = int(maxSparsity * totalBits)
    bitmap = numpy.asarray(encoding["fingerprint"]["positions"])

    # Choose a random subsampling of the bits, seeded from the bitmap itself so
    # the same bitmap always yields the same subsample. A private RandomState
    # gives the same permutation as seeding the global generator would, without
    # resetting the process-wide random state.
    seed = int(bitmap.sum()) % (2 ** 32)
    rng = numpy.random.RandomState(seed)
    encoding["fingerprint"]["positions"] = (
      rng.permutation(bitmap)[0:desiredBits])

    encoding["sparsity"] = len(encoding["fingerprint"]["positions"]) / float(
      totalBits)
예제 #7
0
class CioEncoder(LanguageEncoder):
  """
  A language encoder using the Cortical.io API.

  The encoder queries the Cortical.io REST API via the cortipy module, which
  returns data in the form of "fingerprints". These representations are
  converted to binary SDR arrays with this Cio encoder.
  """

  def __init__(self, w=128, h=128):
    """
    @param w  (int)   Fingerprint width, reported by getWidth().
    @param h  (int)   Fingerprint height, reported by getHeight().
    @raise Exception  If the CORTICAL_API_KEY environment variable is not set.
    """
    if 'CORTICAL_API_KEY' not in os.environ:
      print ("Missing CORTICAL_API_KEY environment variable. If you have a "
        "key, set it with $ export CORTICAL_API_KEY=api_key\n"
        "You can retrieve a key by registering for the REST API at "
        "http://www.cortical.io/resources_apikey.html")
      raise Exception("Missing API key.")

    self.apiKey         = os.environ['CORTICAL_API_KEY']
    # The os module has no join(); pass the cache path directly.
    self.client         = CorticalClient(self.apiKey, cacheDir="./cache")
    # Percentage of the w*h bits used for the pseudo-random fallback bitmap.
    self.targetSparsity = 1.0
    self.w              = w  ## Alternatively get dimensions from cortipy client object?
    self.h              = h
    self.n = w*h
    # getDescription() reads this attribute and nothing else in this class
    # assigns it.
    self.description    = ("Cio Encoder", 0)


  def encode(self, text):
    """
    Encodes the input text w/ a cortipy client. The client returns a
    dictionary of "fingerprint" info, including the SDR bitmap.

    @param  text    (str, list)       Text to encode. A list (or tuple) of
                                      tokens is joined with single spaces
                                      before it is sent to the client.
    @return         (list)            SDR, as produced by the client's getSDR().
    """
    # NOTE(review): the client calls below are given one string. An earlier
    # revision tokenized str input but never used the result, so that call has
    # been dropped -- confirm no caller relied on it.
    if isinstance(text, (list, tuple)):
      string = " ".join(text)
    else:
      string = text

    try:
      encoding = self.client.getBitmap(string)
    except ValueError:
      encoding = self.client.getTextBitmap(string)

    positions = encoding["fingerprint"]["positions"]

    if encoding["sparsity"] == 0:  ##TODO: test again when/if this happens
      # No fingerprint so fill w/ random bitmap, seeded for each specific term.
      print ("\tThe client returned a bitmap with sparsity=0 for the string "
            "\'%s\', so we'll generate a pseudo-random SDR with the target "
            "sparsity=%0.1f." % (string, self.targetSparsity))
      num = self.w * self.h
      # Seed from the text so the same text always maps to the same bitmap,
      # then restore the caller's random state even if sampling fails.
      state = random.getstate()
      try:
        random.seed(string)
        positions = random.sample(range(num),
                                  int(self.targetSparsity * num / 100))
      finally:
        random.setstate(state)
      self._createFromBitmap(positions, self.w, self.h)

    # Use the generated bitmap when there is one; previously it was discarded
    # and the empty client fingerprint was returned.
    return self.client.getSDR(positions)


  def decode(self, encoding, numTerms=None):
    """
    Converts an SDR back into the most likely word or words.

    If numTerms is specified, the Cortical.io API will attempt to return that
    many; otherwise the client's own default applies. The return value will be
    a sequence of (term, weight) tuples, where higher weights imply the
    corresponding term better matches the encoding.

    @param  encoding        (list)             SDR.
    @param  numTerms        (int)              The max number of terms to
                                               return.
    @return similar         (list)             List of (term, score) tuples.
    """
    # Convert SDR to bitmap, send to cortipy client.
    bitmap = super(CioEncoder, self).bitmapFromSDR(encoding)
    # The client lives on the instance; there is no module-level `client`.
    # numTerms is only forwarded when the caller supplied one.
    if numTerms is None:
      terms = self.client.bitmapToTerms(bitmap)
    else:
      terms = self.client.bitmapToTerms(bitmap, numTerms=numTerms)
    # Convert cortipy response to list of tuples (term, weight)
    return [(term["term"], term["score"]) for term in terms]


  ## TODO: redo fields? delete (see line 81 TODO)?
  def _createFromBitmap(self, bitmap, width, height):
    """
    Store the bitmap and its dimensions on the encoder and return the encoder.

    Sets self.bitmap, self.w, self.h and self.sparsity, where sparsity is the
    percentage of the width*height bits present in the bitmap.
    """
    self.bitmap = bitmap
    self.w = width
    self.h = height
    self.sparsity = (100.0 * len(bitmap)) / (width*height)
    return self


  def compare(self, encoding1, encoding2):
    """
    Compare encodings, returning the distances between the SDRs.
    Example return dict:
      {
        "cosineSimilarity": 0.6666666666666666,
        "euclideanDistance": 0.3333333333333333,
        "jaccardDistance": 0.5,
        "overlappingAll": 6,
        "overlappingLeftRight": 0.6666666666666666,
        "overlappingRightLeft": 0.6666666666666666,
        "sizeLeft": 9,
        "sizeRight": 9,
        "weightedScoring": 0.4436476984102028
      }
    """
    # Format input SDRs as Cio fingerprints
    fp1 = {"fingerprint": {"positions":self.bitmapFromSDR(encoding1)}}
    fp2 = {"fingerprint": {"positions":self.bitmapFromSDR(encoding2)}}

    return self.client.compare(fp1, fp2)


  def getWidth(self):
    """Return the fingerprint width."""
    return self.w


  def getHeight(self):
    """Return the fingerprint height."""
    return self.h


  def getDescription(self):
    """Return the encoder's description attribute."""
    return self.description