class CioEncoder(LanguageEncoder):
    """
    A language encoder using the Cortical.io API.

    The encoder queries the Cortical.io REST API via the cortipy module, which
    returns data in the form of "fingerprints". These representations are
    converted to binary SDR arrays with this Cio encoder.
    """

    def __init__(self, w=128, h=128, cacheDir="./cache", verbosity=0):
        """
        @param w          (int)   Width dimension of the SDR topology.
        @param h          (int)   Height dimension of the SDR topology.
        @param cacheDir   (str)   Cache directory handed to the cortipy
                                  client.
        @param verbosity  (int)   Diagnostic messages are printed when > 0.

        Raises an Exception when the CORTICAL_API_KEY environment variable is
        not set.
        """
        if 'CORTICAL_API_KEY' not in os.environ:
            print("Missing CORTICAL_API_KEY environment variable. If you have a "
                  "key, set it with $ export CORTICAL_API_KEY=api_key\n"
                  "You can retrieve a key by registering for the REST API at "
                  "http://www.cortical.io/resources_apikey.html")
            raise Exception("Missing API key.")

        self.apiKey = os.environ['CORTICAL_API_KEY']
        self.client = CorticalClient(self.apiKey, cacheDir=cacheDir)
        self.targetSparsity = 5.0
        self.w = w
        self.h = h
        self.n = w * h
        self.verbosity = verbosity
        # Read by getDescription(); this class never assigned it before, so
        # the accessor failed with AttributeError.
        self.description = ("Cio Encoder", 0)

    def encode(self, text):
        """
        Encodes the input text w/ a cortipy client. The client returns a
        dictionary of "fingerprint" info, including the SDR bitmap.

        @param  text    (str)   A non-tokenized sample of text.
        @return         (dict)  Result from the cortipy client. The bitmap
                                encoding is at
                                encoding["fingerprint"]["positions"].
        """
        try:
            encoding = self.client.getTextBitmap(text)
        except Exception:
            # Deliberate best effort: any client failure falls back to a
            # token-level encoding instead of propagating.
            if self.verbosity > 0:
                print("\tThe client returned no encoding for the text, so we'll use "
                      "the encoding of the token that is least frequent in the corpus.")
            encoding = self._subEncoding(text)

        return encoding

    def decode(self, encoding, numTerms=None):
        """
        Converts an SDR back into the most likely word or words.

        If numTerms is specified it is forwarded to the Cortical.io client;
        otherwise the client's own default is used.

        @param  encoding  (list)  SDR.
        @param  numTerms  (int)   The max number of terms to return.
        @return           (list)  (term, weight) tuples, where higher weights
                                  imply the corresponding term better matches
                                  the encoding.
        """
        # Convert SDR to bitmap, send to cortipy client.
        bitmap = super(CioEncoder, self).bitmapFromSDR(encoding)
        # The client hangs off the instance; the bare name `client` used here
        # previously raised NameError.
        if numTerms is None:
            terms = self.client.bitmapToTerms(bitmap)
        else:
            terms = self.client.bitmapToTerms(bitmap, numTerms=numTerms)
        # Convert cortipy response to list of tuples (term, weight)
        return [(term["term"], term["score"]) for term in terms]

    def _subEncoding(self, text):
        """
        Substitute encoding: the fingerprint of the token with the smallest
        "df" value among the tokens of the text.

        @param  text      (str)   A non-tokenized sample of text.
        @return encoding  (dict)  Fingerprint from cortipy client. An empty
                                  dictionary if the text could not be encoded.
        """
        tokens = list(itertools.chain.from_iterable(
            [t.split(',') for t in self.client.tokenize(text)]))
        try:
            encoding = min([self.client.getBitmap(t) for t in tokens],
                           key=lambda x: x["df"])
        except Exception:
            # Covers client failures as well as min() on an empty token list.
            encoding = {}

        return encoding

    ## TODO: redo fields? delete?
    def _createFromBitmap(self, bitmap, width, height):
        """
        Store the bitmap and its dimensions on the instance and derive the
        sparsity as a percentage of ON bits.

        @return self
        """
        self.bitmap = bitmap
        self.w = width
        self.h = height
        self.sparsity = (100.0 * len(bitmap)) / (width * height)
        return self

    def compare(self, encoding1, encoding2):
        """
        Compare encodings, returning the distances between the SDRs.

        Example return dict:
          {
            "cosineSimilarity": 0.6666666666666666,
            "euclideanDistance": 0.3333333333333333,
            "jaccardDistance": 0.5,
            "overlappingAll": 6,
            "overlappingLeftRight": 0.6666666666666666,
            "overlappingRightLeft": 0.6666666666666666,
            "sizeLeft": 9,
            "sizeRight": 9,
            "weightedScoring": 0.4436476984102028
          }
        """
        # Format input SDRs as Cio fingerprints
        fp1 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding1)}}
        fp2 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding2)}}

        return self.client.compare(fp1, fp2)

    def getWidth(self):
        return self.w

    def getHeight(self):
        return self.h

    def getDescription(self):
        return self.description
class CioEncoder(LanguageEncoder):
    """
    A language encoder using the Cortical.io API.

    The encoder queries the Cortical.io REST API via the cortipy module, which
    returns data in the form of "fingerprints". These representations are
    converted to binary SDR arrays with this Cio encoder.
    """

    def __init__(self, w=128, h=128, retina=DEFAULT_RETINA, cacheDir="./cache",
                 verbosity=0, fingerprintType=EncoderTypes.document):
        """
        @param w                  (int)   Width dimension of the SDR topology.
        @param h                  (int)   Height dimension of the SDR topology.
        @param retina             (str)   Retina name handed to the cortipy
                                          client.
        @param cacheDir           (str)   Where to cache results of API
                                          queries.
        @param verbosity          (int)   Amount of info printed out, 0, 1,
                                          or 2.
        @param fingerprintType    (Enum)  Specify word- or document-level
                                          encoding.

        Raises OSError when the CORTICAL_API_KEY environment variable is not
        set.
        """
        if "CORTICAL_API_KEY" not in os.environ:
            print("Missing CORTICAL_API_KEY environment variable. If you have a "
                  "key, set it with $ export CORTICAL_API_KEY=api_key\n"
                  "You can retrieve a key by registering for the REST API at "
                  "http://www.cortical.io/resources_apikey.html")
            raise OSError("Missing API key.")

        self.apiKey = os.environ["CORTICAL_API_KEY"]
        self.client = CorticalClient(self.apiKey, retina=retina,
                                     cacheDir=cacheDir)
        self.targetSparsity = 5.0
        self.w = w
        self.h = h
        self.n = w * h
        self.verbosity = verbosity
        self.fingerprintType = fingerprintType
        self.description = ("Cio Encoder", 0)

    def encode(self, text):
        """
        Encodes the input text w/ a cortipy client. The client returns a
        dictionary of "fingerprint" info, including the SDR bitmap.

        NOTE: returning this fingerprint dict differs from the base class
        spec.

        @param  text    (str)   A non-tokenized sample of text.
        @return         (dict)  Result from the cortipy client. The bitmap
                                encoding is at
                                encoding["fingerprint"]["positions"]. None is
                                returned for empty input.
        """
        if not text:
            return None
        try:
            if self.fingerprintType is EncoderTypes.document:
                encoding = self.client.getTextBitmap(text)
            elif self.fingerprintType is EncoderTypes.word:
                encoding = self.getUnionEncoding(text)
            else:
                # Without this branch `encoding` stayed unbound and the method
                # died with an unhelpful UnboundLocalError at the return.
                raise ValueError(
                    "Unsupported fingerprintType: {0!r}".format(
                        self.fingerprintType))
        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print("\tThe client returned no encoding for the text '{0}', so "
                      "we'll use the encoding of the token that is least frequent in "
                      "the corpus.".format(text))
            encoding = self._subEncoding(text)

        return encoding

    def getUnionEncoding(self, text):
        """
        Encode each token of the input text, take the union, and then
        sparsify.

        @param  text    (str)   A non-tokenized sample of text.
        @return         (dict)  The bitmap encoding is at
                                encoding["fingerprint"]["positions"].
        """
        tokens = TextPreprocess().tokenize(text)

        # Count the ON bits represented in the encoded tokens.
        counts = Counter()
        for t in tokens:
            bitmap = self.client.getBitmap(t)["fingerprint"]["positions"]
            counts.update(bitmap)

        positions = self.sparseUnion(counts)

        # Populate encoding; sparsity is expressed as a percentage here.
        encoding = {
            "text": text,
            "sparsity": len(positions) * 100 / float(self.n),
            "df": 0.0,
            "height": self.h,
            "width": self.w,
            "score": 0.0,
            "fingerprint": {
                "positions": sorted(positions)
            },
            "pos_types": []
        }

        return encoding

    def encodeIntoArray(self, inputText, output):
        """
        See method description in language_encoder.py. It is expected the
        inputText is a single word/token (str).

        NOTE: nupic Encoder class method encodes output in place as sparse
        array, but this method returns a bitmap and leaves `output`
        untouched.

        @param  inputText  (str)   A single word/token.
        @param  output             Unused.
        @return            (dict)  Fingerprint dict from the cortipy client.
        """
        if not isinstance(inputText, str):
            raise TypeError(
                "Expected a string input but got input of type {}.".format(
                    type(inputText)))

        # Encode with term endpoint of Cio API
        try:
            encoding = self.client.getBitmap(inputText)
        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print("\tThe client returned no encoding for the text '{0}', so "
                      "we'll use the encoding of the token that is least frequent in "
                      "the corpus.".format(inputText))
            encoding = self._subEncoding(inputText)

        return encoding

    def decode(self, encoding, numTerms=10):
        """
        Converts an SDR back into the most likely word or words.

        @param  encoding  (list)  Bitmap encoding.
        @param  numTerms  (int)   The max number of terms to return.
        @return           (list)  (term, weight) tuples, where higher weights
                                  imply the corresponding term better matches
                                  the encoding.
        """
        terms = self.client.bitmapToTerms(encoding, numTerms=numTerms)
        # Convert cortipy response to list of tuples (term, weight)
        return [(term["term"], term["score"]) for term in terms]

    def _subEncoding(self, text, method="keyword"):
        """
        Substitute encoding used when the client cannot encode the text.

        @param  text      (str)   A non-tokenized sample of text.
        @param  method    (str)   "df" picks the token fingerprint with the
                                  smallest "df" value; "keyword" uses the
                                  union encoding of the text.
        @return encoding  (dict)  Fingerprint from cortipy client, or None if
                                  the text could not be encoded.
        """
        tokens = list(itertools.chain.from_iterable(
            [t.split(',') for t in self.client.tokenize(text)]))
        try:
            if method == "df":
                encoding = min([self.client.getBitmap(t) for t in tokens],
                               key=lambda x: x["df"])
            elif method == "keyword":
                encoding = self.getUnionEncoding(text)
            else:
                raise ValueError("method must be either 'df' or 'keyword'")
        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print("\tThe client returned no substitute encoding for the text "
                      "'{0}', so we encode with None.".format(text))
            encoding = None

        return encoding

    def compare(self, bitmap1, bitmap2):
        """
        Compare encodings, returning the distances between the SDRs. Input
        bitmaps must be list objects (need to be serializable).

        Example return dict:
          {
            "cosineSimilarity": 0.6666666666666666,
            "euclideanDistance": 0.3333333333333333,
            "jaccardDistance": 0.5,
            "overlappingAll": 6,
            "overlappingLeftRight": 0.6666666666666666,
            "overlappingRightLeft": 0.6666666666666666,
            "sizeLeft": 9,
            "sizeRight": 9,
            "weightedScoring": 0.4436476984102028
          }
        """
        # Check each argument on its own: `bitmap1 and bitmap2` evaluates to
        # just one of the two, so the other was never validated.
        if not (isinstance(bitmap1, list) and isinstance(bitmap2, list)):
            raise TypeError("Comparison bitmaps must be lists.")

        return self.client.compare(bitmap1, bitmap2)

    def createCategory(self, label, positives, negatives=None):
        """
        Create a classification category (bitmap) via the Cio classify
        endpoint.

        @param label      (str)   Name of category.
        @param positives  (list)  Bitmap(s) of samples to define.
        @param negatives  (list)  Not required to make category.

        @return           (dict)  Key-values for "positions" (list bitmap
                                  encoding of the category) and
                                  "categoryName" (str).
        """
        if negatives is None:
            negatives = []
        # Validate both inputs separately; the combined `positives and
        # negatives` test let a non-list `positives` through whenever
        # `negatives` was a list.
        if not (isinstance(positives, list) and isinstance(negatives, list)):
            raise TypeError("Input bitmaps must be lists.")

        return self.client.createClassification(label, positives, negatives)

    def getWidth(self):
        # Total number of bits (w * h), not the topology width.
        return self.n

    def getDescription(self):
        return self.description
class CioEncoder(LanguageEncoder):
    """
    A language encoder using the Cortical.io API.

    The encoder queries the Cortical.io REST API via the cortipy module, which
    returns data in the form of "fingerprints". These representations are
    converted to binary SDR arrays with this Cio encoder.
    """

    def __init__(self, w=128, h=128):
        """
        @param w  (int)  Width dimension of the SDR topology.
        @param h  (int)  Height dimension of the SDR topology.

        Raises an Exception when the CORTICAL_API_KEY environment variable is
        not set.
        """
        if 'CORTICAL_API_KEY' not in os.environ:
            print("Missing CORTICAL_API_KEY environment variable. If you have a "
                  "key, set it with $ export CORTICAL_API_KEY=api_key\n"
                  "You can retrieve a key by registering for the REST API at "
                  "http://www.cortical.io/resources_apikey.html")
            raise Exception("Missing API key.")

        self.apiKey = os.environ['CORTICAL_API_KEY']
        # `os.join` does not exist (the path helper is os.path.join), so the
        # constructor always failed; the directory is passed directly.
        self.client = CorticalClient(self.apiKey, cacheDir="./cache")
        self.targetSparsity = 1.0
        self.w = w  ## Alternatively get dimensions from cortipy client object?
        self.h = h
        self.n = w * h
        # Read by getDescription(); this class never assigned it before.
        self.description = ("Cio Encoder", 0)

    def encode(self, text):
        """
        Encodes the input text w/ a cortipy client. The client returns a
        dictionary of "fingerprint" info, including the SDR bitmap.

        @param  text    (str, list)   If the input is type str, the encoder
                                      assumes it has not yet been tokenized. A
                                      list input will skip the tokenization
                                      step.
        @return         (list)        SDR, as produced by the client's
                                      getSDR().
        """
        if isinstance(text, str):
            text = self.client.tokenize(text)
        try:
            encoding = self.client.getBitmap(text)
        except ValueError:
            encoding = self.client.getTextBitmap(text)

        positions = encoding["fingerprint"]["positions"]

        # The encoding is a dict, so sparsity is read by key; attribute
        # access (`encoding.sparsity`) raised AttributeError.
        if encoding["sparsity"] == 0:
            ##TODO: test again when/if this happens
            # No fingerprint so fill w/ random bitmap, seeded for each
            # specific term.
            print("\tThe client returned a bitmap with sparsity=0 for the string "
                  "'%s', so we'll generate a pseudo-random SDR with the target "
                  "sparsity=%0.1f." % (text, self.targetSparsity))
            state = random.getstate()
            try:
                # Seed from the string form: after tokenization `text` may be
                # a list, which random.seed() rejects.
                random.seed(str(text))
                num = self.w * self.h
                positions = random.sample(
                    range(num), int(self.targetSparsity * num / 100))
                self._createFromBitmap(positions, self.w, self.h)
            finally:
                # Always hand the global RNG back in its prior state.
                random.setstate(state)

        # Build the SDR from the substitute bitmap when one was generated;
        # it used to be computed and then discarded.
        return self.client.getSDR(positions)

    def decode(self, encoding, numTerms=None):
        """
        Converts an SDR back into the most likely word or words.

        If numTerms is specified it is forwarded to the Cortical.io client;
        otherwise the client's own default is used.

        @param  encoding  (list)  SDR.
        @param  numTerms  (int)   The max number of terms to return.
        @return           (list)  (term, weight) tuples, where higher weights
                                  imply the corresponding term better matches
                                  the encoding.
        """
        # Convert SDR to bitmap, send to cortipy client.
        bitmap = super(CioEncoder, self).bitmapFromSDR(encoding)
        # The client hangs off the instance; the bare name `client` used here
        # previously raised NameError.
        if numTerms is None:
            terms = self.client.bitmapToTerms(bitmap)
        else:
            terms = self.client.bitmapToTerms(bitmap, numTerms=numTerms)
        # Convert cortipy response to list of tuples (term, weight)
        return [(term["term"], term["score"]) for term in terms]

    ## TODO: redo fields? delete?
    def _createFromBitmap(self, bitmap, width, height):
        """
        Store the bitmap and its dimensions on the instance and derive the
        sparsity as a percentage of ON bits.

        @return self
        """
        self.bitmap = bitmap
        self.w = width
        self.h = height
        self.sparsity = (100.0 * len(bitmap)) / (width * height)
        return self

    def compare(self, encoding1, encoding2):
        """
        Compare encodings, returning the distances between the SDRs.

        Example return dict:
          {
            "cosineSimilarity": 0.6666666666666666,
            "euclideanDistance": 0.3333333333333333,
            "jaccardDistance": 0.5,
            "overlappingAll": 6,
            "overlappingLeftRight": 0.6666666666666666,
            "overlappingRightLeft": 0.6666666666666666,
            "sizeLeft": 9,
            "sizeRight": 9,
            "weightedScoring": 0.4436476984102028
          }
        """
        # Format input SDRs as Cio fingerprints
        fp1 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding1)}}
        fp2 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding2)}}

        return self.client.compare(fp1, fp2)

    def getWidth(self):
        return self.w

    def getHeight(self):
        return self.h

    def getDescription(self):
        return self.description
class CioEncoder(LanguageEncoder):
    """
    A language encoder using the Cortical.io API.

    The encoder queries the Cortical.io REST API via the cortipy module, which
    returns data in the form of "fingerprints". These representations are
    converted to binary SDR arrays with this Cio encoder.
    """

    def __init__(self, retina=DEFAULT_RETINA, retinaScaling=1.0, cacheDir=None,
                 verbosity=0, fingerprintType=EncoderTypes.document,
                 unionSparsity=0.20, apiKey=None, maxSparsity=0.50):
        """
        @param retina          (str)    Cortical.io retina, either
                                        "en_synonymous" or "en_associative".
        @param retinaScaling   (float)  Scale each dimension of the SDR bitmap
                                        by this factor.
        @param cacheDir        (str)    Where to cache results of API queries.
                                        Defaults to a "CioCache" directory
                                        next to this file.
        @param verbosity       (int)    Amount of info printed out, 0, 1,
                                        or 2.
        @param fingerprintType (Enum)   Specify word- or document-level
                                        encoding.
        @param unionSparsity   (float)  Any union'ing done in this encoder
                                        will stop once this sparsity is
                                        reached.
        @param apiKey          (str)    Cortical.io API key; when omitted the
                                        CORTICAL_API_KEY environment variable
                                        is used.
        @param maxSparsity     (float)  The maximum sparsity of the returned
                                        bitmap. If the percentage of bits in
                                        the encoding is > maxSparsity, it will
                                        be randomly subsampled.

        Raises OSError when no API key is supplied or found in the
        environment.

        TODO: replace enum with a simple string
        """
        if apiKey is None and "CORTICAL_API_KEY" not in os.environ:
            print("Missing CORTICAL_API_KEY environment variable. If you have a "
                  "key, set it with $ export CORTICAL_API_KEY=api_key\n"
                  "You can retrieve a key by registering for the REST API at "
                  "http://www.cortical.io/resources_apikey.html")
            raise OSError("Missing API key.")

        super(CioEncoder, self).__init__(unionSparsity=unionSparsity)

        if cacheDir is None:
            root = os.path.dirname(os.path.realpath(__file__))
            cacheDir = os.path.join(root, "CioCache")

        self.apiKey = apiKey if apiKey else os.environ["CORTICAL_API_KEY"]
        self.client = CorticalClient(self.apiKey, retina=retina,
                                     cacheDir=cacheDir)

        self._setDimensions(retina, retinaScaling)

        self.fingerprintType = fingerprintType
        self.description = ("Cio Encoder", 0)

        self.verbosity = verbosity
        self.maxSparsity = maxSparsity

    def _setDimensions(self, retina, scalingFactor):
        """
        Derive width, height and bit count from the retina size and the
        scaling factor.

        Raises ValueError unless 0 < scalingFactor <= 1.
        """
        if scalingFactor <= 0 or scalingFactor > 1:
            raise ValueError(
                "Retina can only be scaled by values between 0 and 1.")

        retinaDim = RETINA_SIZES[retina]["width"]

        self.width = int(retinaDim * scalingFactor)
        self.height = int(retinaDim * scalingFactor)
        # Effective scaling after truncating the dimensions to whole bits.
        self.retinaScaling = float(self.width) / retinaDim
        self.n = self.width * self.height

    def encode(self, text):
        """
        Encodes the input text w/ a cortipy client. The client returns a
        dictionary of "fingerprint" info, including the SDR bitmap as a
        numpy.array.

        NOTE: returning this fingerprint dict differs from the base class
        spec.

        @param  text    (str)   A non-tokenized sample of text.
        @return         (dict)  Result from the cortipy client. The bitmap
                                encoding is at
                                encoding["fingerprint"]["positions"].
        """
        if not isinstance(text, (str, unicode)):
            raise TypeError(
                "Expected a string input but got input of type {}.".format(
                    type(text)))

        try:
            if self.fingerprintType == EncoderTypes.document:
                encoding = self.client.getTextBitmap(text)
            elif self.fingerprintType == EncoderTypes.word:
                encoding = self.getUnionEncoding(text)
            else:
                encoding = self.client.getBitmap(text)
        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print("\tThe client returned no encoding for the text '{0}', so "
                      "we'll use the encoding of the token that is least frequent in "
                      "the corpus.".format(text))
            encoding = self._subEncoding(text)

        return self.finishEncoding(encoding)

    def getUnionEncoding(self, text):
        """
        Encode each token of the input text, take the union, and then
        sparsify.

        @param  text    (str)   A non-tokenized sample of text.
        @return         (dict)  The bitmap encoding is at
                                encoding["fingerprint"]["positions"].
        """
        tokens = TextPreprocess().tokenize(text)

        # Count the ON bits represented in the encoded tokens.
        counts = Counter()
        for t in tokens:
            bitmap = self._getWordBitmap(t)
            counts.update(bitmap)

        positions = self.sparseUnion(counts)

        # Populate encoding; sparsity is a fraction of the total bits.
        encoding = {
            "text": text,
            "sparsity": len(positions) / float(self.n),
            "df": 0.0,
            "height": self.height,
            "width": self.width,
            "score": 0.0,
            "fingerprint": {
                "positions": sorted(positions)
            },
            "pos_types": []
        }

        return encoding

    def getWindowEncoding(self, tokens, minSparsity=0.0):
        """
        The encodings simulate a "sliding window", where the encoding
        representation of a given token is a union of its bitmap with the
        immediately previous tokens' bitmaps, up to the maximum sparsity. The
        returned list only includes those windows with sparsities larger than
        the minimum.

        @param tokens           (list)      Tokenized string.
        @param minSparsity      (float)     Only window encodings denser than
                                            this value will be included.
        @return windowBitmaps   (list)      Dict for each token, with entries
                                            for the token string, sparsity
                                            float, and bitmap numpy array.
        """
        if self.fingerprintType != EncoderTypes.word:
            print("Although the encoder type is not set for words, the window "
                  "encodings use word-level fingerprints.")

        bitmaps = [numpy.array(self._getWordBitmap(t)) for t in tokens]

        windowBitmaps = []
        for tokenIndex, windowBitmap in enumerate(bitmaps):
            # Each index in the tokens list is the end of a possible window.
            # windowStart is the earliest token actually merged into it. The
            # loop variable cannot be reused for this after the loop: it is
            # unbound for the first token and points one token too early when
            # the loop stops on a full window.
            windowStart = tokenIndex
            for i in reversed(xrange(tokenIndex)):
                # From the current token, increase the window by successively
                # adding the previous tokens.
                windowSparsity = len(windowBitmap) / float(self.n)
                nextSparsity = len(bitmaps[i]) / float(self.n)
                if windowSparsity + nextSparsity > self.unionSparsity:
                    # stopping criterion reached -- window is full
                    break
                # add bitmap to the current window bitmap
                windowBitmap = numpy.union1d(windowBitmap, bitmaps[i])
                windowStart = i

            sparsity = len(windowBitmap) / float(self.n)
            if sparsity > minSparsity:
                # only include windows of sufficient density
                windowBitmaps.append({
                    "text": tokens[windowStart:tokenIndex + 1],
                    "sparsity": sparsity,
                    "bitmap": numpy.array(windowBitmap)
                })

        return windowBitmaps

    def finishEncoding(self, encoding):
        """
        Scale the fingerprint of the encoding dict (if specified) and fill the
        width, height, and sparsity fields.

        @param encoding       (dict)      Dict as returned by the Cio client.
        @return encoding      (dict)      Same format as the input dict, with
                                          the dimensions and sparsity fields
                                          populated.
        """
        if self.retinaScaling != 1:
            encoding["fingerprint"]["positions"] = self.scaleEncoding(
                encoding["fingerprint"]["positions"], self.retinaScaling ** 2)
            encoding["width"] = self.width
            encoding["height"] = self.height

        encoding["fingerprint"]["positions"] = numpy.array(
            encoding["fingerprint"]["positions"])

        encoding["sparsity"] = len(encoding["fingerprint"]["positions"]) / float(
            (encoding["width"] * encoding["height"]))

        # Reduce sparsity if needed
        if encoding["sparsity"] > self.maxSparsity:
            self.reduceSparsity(encoding, self.maxSparsity)

        return encoding

    def _getWordBitmap(self, term):
        """
        Return a bitmap for the word. If the Cortical.io API can't encode,
        cortipy will use a random encoding for the word.
        """
        return self.client.getBitmap(term)["fingerprint"]["positions"]

    def encodeIntoArray(self, inputText, output):
        """
        Encodes inputText and puts the encoded value into the numpy output
        array, which is a 1-D array of length returned by getWidth().
        """
        encoding = self.encode(inputText)
        output[:] = 0
        if encoding["fingerprint"]["positions"].size > 0:
            output[encoding["fingerprint"]["positions"]] = 1

    def decode(self, encoding, numTerms=10):
        """
        Converts an SDR back into the most likely word or words.

        @param  encoding  (list)  Bitmap encoding.
        @param  numTerms  (int)   The max number of terms to return.
        @return           (list)  (term, weight) tuples, where higher weights
                                  imply the corresponding term better matches
                                  the encoding.
        """
        terms = self.client.bitmapToTerms(encoding, numTerms=numTerms)
        # Convert cortipy response to list of tuples (term, weight)
        return [(term["term"], term["score"]) for term in terms]

    def _subEncoding(self, text, method="keyword"):
        """
        Substitute encoding used when the client cannot encode the text.

        @param  text      (str)   A non-tokenized sample of text.
        @param  method    (str)   "df" picks the token fingerprint with the
                                  smallest "df" value; "keyword" uses the
                                  union encoding of the text.
        @return encoding  (dict)  Fingerprint from cortipy client, or None if
                                  the text could not be encoded.
        """
        try:
            if method == "df":
                tokens = list(itertools.chain.from_iterable(
                    [t.split(",") for t in self.client.tokenize(text)]))
                encoding = min([self.client.getBitmap(t) for t in tokens],
                               key=lambda x: x["df"])
            elif method == "keyword":
                encoding = self.getUnionEncoding(text)
            else:
                raise ValueError("method must be either 'df' or 'keyword'")
        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print("\tThe client returned no substitute encoding for the text "
                      "'{}', so we encode with None.".format(text))
            encoding = None

        return encoding

    def compare(self, bitmap1, bitmap2):
        """
        Compare encodings, returning the distances between the SDRs. Input
        bitmaps must be list objects (need to be serializable).

        Example return dict:
          {
            "cosineSimilarity": 0.6666666666666666,
            "euclideanDistance": 0.3333333333333333,
            "jaccardDistance": 0.5,
            "overlappingAll": 6,
            "overlappingLeftRight": 0.6666666666666666,
            "overlappingRightLeft": 0.6666666666666666,
            "sizeLeft": 9,
            "sizeRight": 9,
            "weightedScoring": 0.4436476984102028
          }
        """
        # Check each argument on its own: `bitmap1 and bitmap2` evaluates to
        # just one of the two, so the other was never validated.
        if not (isinstance(bitmap1, list) and isinstance(bitmap2, list)):
            raise TypeError("Comparison bitmaps must be lists.")

        return self.client.compare(bitmap1, bitmap2)

    def createCategory(self, label, positives, negatives=None):
        """
        Create a classification category (bitmap) via the Cio classify
        endpoint.

        @param label      (str)   Name of category.
        @param positives  (list)  Bitmap(s) of samples to define.
        @param negatives  (list)  Not required to make category.

        @return           (dict)  Key-values for "positions" (list bitmap
                                  encoding of the category) and
                                  "categoryName" (str).
        """
        if negatives is None:
            negatives = []
        # Validate both inputs separately; the combined `positives and
        # negatives` test let a non-list `positives` through whenever
        # `negatives` was a list.
        if not (isinstance(positives, list) and isinstance(negatives, list)):
            raise TypeError("Input bitmaps must be lists.")

        return self.client.createClassification(label, positives, negatives)

    def getWidth(self):
        # Total number of bits (width * height), not the topology width.
        return self.n

    def getDimensions(self):
        return (self.width, self.height)

    def getDescription(self):
        return self.description

    def densifyPattern(self, bitmap):
        """Return a numpy array of 0s and 1s to represent the given bitmap."""
        sparsePattern = numpy.zeros(self.n)
        for i in bitmap:
            sparsePattern[i] = 1.0
        return sparsePattern

    def reduceSparsity(self, encoding, maxSparsity):
        """
        Reduce the sparsity of the encoding down to maxSparsity, in place.

        @param encoding     (dict)   Encoding dict whose
                                     ["fingerprint"]["positions"] is a numpy
                                     array; its positions and "sparsity"
                                     entries are overwritten.
        @param maxSparsity  (float)  Target fraction of ON bits.
        """
        # Slice bounds must be integers; the raw product is a float.
        desiredBits = int(maxSparsity * encoding["width"] * encoding["height"])

        bitmap = encoding["fingerprint"]["positions"]

        # Choose a random subsampling of the bits but seed the random number
        # generator so we get consistent bitmaps
        numpy.random.seed(bitmap.sum())
        encoding["fingerprint"]["positions"] = (
            numpy.random.permutation(bitmap)[0:desiredBits])

        encoding["sparsity"] = len(encoding["fingerprint"]["positions"]) / float(
            (encoding["width"] * encoding["height"]))
class CioEncoder(LanguageEncoder):
    """
    A language encoder using the Cortical.io API.

    The encoder queries the Cortical.io REST API via the cortipy module, which
    returns data in the form of "fingerprints". These representations are
    converted to binary SDR arrays with this Cio encoder.
    """

    def __init__(self, w=128, h=128, cacheDir="./cache", verbosity=0):
        """
        @param w          (int)   Width dimension of the SDR topology.
        @param h          (int)   Height dimension of the SDR topology.
        @param cacheDir   (str)   Cache directory handed to the cortipy
                                  client.
        @param verbosity  (int)   Diagnostic messages are printed when > 0.

        Raises OSError when the CORTICAL_API_KEY environment variable is not
        set.
        """
        if 'CORTICAL_API_KEY' not in os.environ:
            print("Missing CORTICAL_API_KEY environment variable. If you have a "
                  "key, set it with $ export CORTICAL_API_KEY=api_key\n"
                  "You can retrieve a key by registering for the REST API at "
                  "http://www.cortical.io/resources_apikey.html")
            raise OSError("Missing API key.")

        self.apiKey = os.environ['CORTICAL_API_KEY']
        self.client = CorticalClient(self.apiKey, cacheDir=cacheDir)
        self.targetSparsity = 5.0
        self.w = w
        self.h = h
        self.n = w * h
        self.verbosity = verbosity
        # Read by getDescription(); this class never assigned it before, so
        # the accessor failed with AttributeError.
        self.description = ("Cio Encoder", 0)

    def encode(self, text):
        """
        Encodes the input text w/ a cortipy client. The client returns a
        dictionary of "fingerprint" info, including the SDR bitmap.

        @param  text    (str)   A non-tokenized sample of text.
        @return         (dict)  Result from the cortipy client. The bitmap
                                encoding is at
                                encoding["fingerprint"]["positions"]. None is
                                returned for empty input.
        """
        if not text:
            return None
        try:
            encoding = self.client.getTextBitmap(text)
        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print("\tThe client returned no encoding for the text '{0}', so "
                      "we'll use the encoding of the token that is least frequent in "
                      "the corpus.".format(text))
            encoding = self._subEncoding(text)

        return encoding

    def decode(self, encoding, numTerms=10):
        """
        Converts an SDR back into the most likely word or words.

        @param  encoding  (list)  Bitmap encoding.
        @param  numTerms  (int)   The max number of terms to return.
        @return           (list)  (term, weight) tuples, where higher weights
                                  imply the corresponding term better matches
                                  the encoding.
        """
        # The client hangs off the instance; the bare name `client` used here
        # previously raised NameError.
        terms = self.client.bitmapToTerms(encoding, numTerms=numTerms)
        # Convert cortipy response to list of tuples (term, weight)
        return [(term["term"], term["score"]) for term in terms]

    def _subEncoding(self, text, method="df"):
        """
        Substitute encoding used when the client cannot encode the text.

        @param  text      (str)   A non-tokenized sample of text.
        @param  method    (str)   "df" picks the token fingerprint with the
                                  smallest "df" value; "keyword" unions the
                                  token bitmaps and keeps the most common
                                  bits up to the target sparsity.
        @return encoding  (dict)  Fingerprint from cortipy client, or None if
                                  the text could not be encoded.
        """
        tokens = list(itertools.chain.from_iterable(
            [t.split(',') for t in self.client.tokenize(text)]))
        try:
            if method == "df":
                encoding = min([self.client.getBitmap(t) for t in tokens],
                               key=lambda x: x["df"])
            elif method == "keyword":
                # Take a union of the bitmaps
                counts = Counter()
                for t in tokens:
                    bitmap = self.client.getBitmap(t)["fingerprint"]["positions"]
                    counts.update(bitmap)
                # Sample to remain sparse; targetSparsity is a percentage.
                max_sparsity = int((self.targetSparsity / 100) * self.n)
                w = min(len(counts), max_sparsity)
                positions = [c[0] for c in counts.most_common(w)]
                # Populate encoding
                encoding = {
                    "text": text,
                    "sparsity": w * 100 / float(self.n),
                    "df": 0.0,
                    "height": self.h,
                    "width": self.w,
                    "score": 0.0,
                    "fingerprint": {
                        "positions": sorted(positions)
                    },
                    "pos_types": []
                }
            else:
                raise ValueError("method must be either 'df' or 'keyword'")
        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print("\tThe client returned no substitute encoding for the text "
                      "'{0}', so we encode with None.".format(text))
            encoding = None

        return encoding

    def compare(self, encoding1, encoding2):
        """
        Compare encodings, returning the distances between the SDRs.

        Example return dict:
          {
            "cosineSimilarity": 0.6666666666666666,
            "euclideanDistance": 0.3333333333333333,
            "jaccardDistance": 0.5,
            "overlappingAll": 6,
            "overlappingLeftRight": 0.6666666666666666,
            "overlappingRightLeft": 0.6666666666666666,
            "sizeLeft": 9,
            "sizeRight": 9,
            "weightedScoring": 0.4436476984102028
          }
        """
        # Format input SDRs as Cio fingerprints
        fp1 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding1)}}
        fp2 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding2)}}

        return self.client.compare(fp1, fp2)

    def getWidth(self):
        return self.w

    def getHeight(self):
        return self.h

    def getDescription(self):
        return self.description
class CioEncoder(LanguageEncoder):
    """
    A language encoder using the Cortical.io API.

    The encoder queries the Cortical.io REST API via the cortipy module, which
    returns data in the form of "fingerprints". These representations are
    converted to binary SDR arrays with this Cio encoder.
    """

    def __init__(self, retina=DEFAULT_RETINA, retinaScaling=1.0, cacheDir=None,
                 verbosity=0, fingerprintType=EncoderTypes.document,
                 unionSparsity=0.20, apiKey=None, maxSparsity=0.50):
        """
        @param retina          (str)   Cortical.io retina, either
                                       "en_synonymous" or "en_associative".
        @param retinaScaling   (float) Scale each dimension of the SDR bitmap
                                       by this factor.
        @param cacheDir        (str)   Where to cache results of API queries.
                                       Defaults to a "CioCache" directory next
                                       to this file.
        @param verbosity       (int)   Amount of info printed out, 0, 1, or 2.
        @param fingerprintType (Enum)  Specify word- or document-level
                                       encoding.
        @param unionSparsity   (float) Any union'ing done in this encoder will
                                       stop once this sparsity is reached.
        @param apiKey          (str)   Cortical.io API key. When not given,
                                       the CORTICAL_API_KEY environment
                                       variable is used.
        @param maxSparsity     (float) The maximum sparsity of the returned
                                       bitmap. If the percentage of bits in
                                       the encoding is > maxSparsity, it will
                                       be randomly subsampled.
        @raise OSError                 If no API key is available.

        TODO: replace enum with a simple string
        """
        # An empty-string key is as unusable as None; test truthiness so that
        # case reports the missing key instead of failing with a KeyError on
        # the environment lookup below.
        if not apiKey and "CORTICAL_API_KEY" not in os.environ:
            print ("Missing CORTICAL_API_KEY environment variable. If you have a "
                   "key, set it with $ export CORTICAL_API_KEY=api_key\n"
                   "You can retrieve a key by registering for the REST API at "
                   "http://www.cortical.io/resources_apikey.html")
            raise OSError("Missing API key.")

        super(CioEncoder, self).__init__(unionSparsity=unionSparsity)

        if cacheDir is None:
            root = os.path.dirname(os.path.realpath(__file__))
            cacheDir = os.path.join(root, "CioCache")

        self.apiKey = apiKey if apiKey else os.environ["CORTICAL_API_KEY"]
        self.client = CorticalClient(
            self.apiKey, retina=retina, cacheDir=cacheDir)

        self._setDimensions(retina, retinaScaling)

        self.fingerprintType = fingerprintType
        self.description = ("Cio Encoder", 0)

        self.verbosity = verbosity
        self.maxSparsity = maxSparsity

    def _setDimensions(self, retina, scalingFactor):
        """
        Set self.width, self.height, self.retinaScaling and self.n.

        Both dimensions are derived from the retina's "width" entry in
        RETINA_SIZES. retinaScaling is recomputed from the truncated integer
        width, so it may differ slightly from scalingFactor.

        @param retina        (str)   Key into RETINA_SIZES.
        @param scalingFactor (float) Scale factor in the interval (0, 1].
        @raise ValueError            If scalingFactor is outside (0, 1].
        """
        if scalingFactor <= 0 or scalingFactor > 1:
            raise ValueError(
                "Retina can only be scaled by values between 0 and 1.")

        retinaDim = RETINA_SIZES[retina]["width"]
        self.width = int(retinaDim * scalingFactor)
        self.height = int(retinaDim * scalingFactor)
        self.retinaScaling = float(self.width) / retinaDim
        self.n = self.width * self.height

    def encode(self, text):
        """
        Encodes the input text w/ a cortipy client. The client returns a
        dictionary of "fingerprint" info, including the SDR bitmap as a
        numpy.array.

        NOTE: returning this fingerprint dict differs from the base class spec.

        @param text    (str)  A non-tokenized sample of text.
        @return        (dict) Result from the cortipy client. The bitmap
                              encoding is at
                              encoding["fingerprint"]["positions"].
        @raise TypeError      If text is not a str or unicode object.
        """
        if not isinstance(text, (str, unicode)):
            raise TypeError("Expected a string input but got input of type {}."
                            .format(type(text)))

        try:
            if self.fingerprintType == EncoderTypes.document:
                encoding = self.client.getTextBitmap(text)
            elif self.fingerprintType == EncoderTypes.word:
                encoding = self.getUnionEncoding(text)
            else:
                encoding = self.client.getBitmap(text)
        except UnsuccessfulEncodingError:
            if self.verbosity > 0:
                print ("\tThe client returned no encoding for the text \'{0}\', so "
                       "we'll use the encoding of the token that is least frequent in "
                       "the corpus.".format(text))
            # NOTE(review): _subEncoding may return None, which
            # finishEncoding cannot index -- confirm how callers expect that
            # case to surface.
            encoding = self._subEncoding(text)

        return self.finishEncoding(encoding)

    def getUnionEncoding(self, text):
        """
        Encode each token of the input text, take the union, and then
        sparsify.

        @param text    (str)  A non-tokenized sample of text.
        @return        (dict) The bitmap encoding is at
                              encoding["fingerprint"]["positions"].
        """
        tokens = TextPreprocess().tokenize(text)

        # Count the ON bits represented in the encoded tokens.
        counts = Counter()
        for t in tokens:
            counts.update(self._getWordBitmap(t))

        # sparseUnion is not defined in this class (presumably inherited);
        # it reduces the bit counts to the positions that are kept.
        positions = self.sparseUnion(counts)

        # Populate encoding
        return {
            "text": text,
            "sparsity": len(positions) / float(self.n),
            "df": 0.0,
            "height": self.height,
            "width": self.width,
            "score": 0.0,
            "fingerprint": {
                "positions": sorted(positions)
            },
            "pos_types": []
        }

    def getWindowEncoding(self, tokens, minSparsity=0.0):
        """
        The encodings simulate a "sliding window", where the encoding
        representation of a given token is a union of its bitmap with the
        immediately previous tokens' bitmaps, up to the maximum sparsity. The
        returned list only includes those windows with sparsities larger than
        the minimum.

        @param tokens        (list)  Tokenized string.
        @param minSparsity   (float) Only window encodings denser than this
                                     value will be included.
        @return windowBitmaps (list) Dict for each token, with entries for
                                     the token strings covered by the window
                                     ("text"), the sparsity float, and the
                                     bitmap numpy array.
        """
        if self.fingerprintType != EncoderTypes.word:
            print ("Although the encoder type is not set for words, the window "
                   "encodings use word-level fingerprints.")

        bitmaps = [numpy.array(self._getWordBitmap(t)) for t in tokens]
        totalBits = float(self.n)

        windowBitmaps = []
        for tokenIndex, windowBitmap in enumerate(bitmaps):
            # Each index in the tokens list is the end of a possible window.
            # Track the first token actually merged into the window: the loop
            # variable is unbound for the first token and, after a break,
            # points at a token that was NOT added.
            windowStart = tokenIndex
            for i in reversed(range(tokenIndex)):
                # From the current token, increase the window by successively
                # adding the previous tokens.
                windowSparsity = len(windowBitmap) / totalBits
                nextSparsity = len(bitmaps[i]) / totalBits
                if windowSparsity + nextSparsity > self.unionSparsity:
                    # stopping criterion reached -- window is full
                    break
                # add bitmap to the current window bitmap
                windowBitmap = numpy.union1d(windowBitmap, bitmaps[i])
                windowStart = i

            sparsity = len(windowBitmap) / totalBits
            if sparsity > minSparsity:
                # only include windows of sufficient density
                windowBitmaps.append(
                    {"text": tokens[windowStart:tokenIndex + 1],
                     "sparsity": sparsity,
                     "bitmap": numpy.array(windowBitmap)})

        return windowBitmaps

    def finishEncoding(self, encoding):
        """
        Scale the fingerprint of the encoding dict (if specified) and fill the
        width, height, and sparsity fields.

        @param encoding    (dict) Dict as returned by the Cio client.
        @return encoding   (dict) Same format as the input dict, with the
                                  dimensions and sparsity fields populated.
                                  The positions are a numpy array.
        """
        if self.retinaScaling != 1:
            # scaleEncoding is not defined in this class (presumably
            # inherited); it receives the squared per-dimension factor.
            encoding["fingerprint"]["positions"] = self.scaleEncoding(
                encoding["fingerprint"]["positions"], self.retinaScaling ** 2)

        # Always record the encoder's own dimensions so the sparsity below
        # never depends on the client having filled these fields.
        encoding["width"] = self.width
        encoding["height"] = self.height

        encoding["fingerprint"]["positions"] = numpy.array(
            encoding["fingerprint"]["positions"])

        encoding["sparsity"] = (
            len(encoding["fingerprint"]["positions"]) /
            float(encoding["width"] * encoding["height"]))

        # Reduce sparsity if needed
        if encoding["sparsity"] > self.maxSparsity:
            self.reduceSparsity(encoding, self.maxSparsity)

        return encoding

    def _getWordBitmap(self, term):
        """
        Return a bitmap for the word. If the Cortical.io API can't encode,
        cortipy will use a random encoding for the word.
        """
        return self.client.getBitmap(term)["fingerprint"]["positions"]

    def encodeIntoArray(self, inputText, output):
        """
        Encodes inputText and puts the encoded value into the numpy output
        array, which is a 1-D array of length returned by getWidth().
        """
        encoding = self.encode(inputText)
        positions = encoding["fingerprint"]["positions"]
        output[:] = 0
        if positions.size > 0:
            output[positions] = 1

    def decode(self, encoding, numTerms=10):
        """
        Converts an SDR back into the most likely word or words.

        By default, the most likely term will be returned. If numTerms is
        specified, then the Cortical.io API will attempt to return that many;
        otherwise the standard is 10. The return value will be a sequence of
        (term, weight) tuples, where higher weights imply the corresponding
        term better matches the encoding.

        @param encoding  (list) Bitmap encoding.
        @param numTerms  (int)  The max number of terms to return.
        @return          (list) List of (term, score) tuples.
        """
        terms = self.client.bitmapToTerms(encoding, numTerms=numTerms)
        # Convert cortipy response to list of tuples (term, weight)
        return [(term["term"], term["score"]) for term in terms]

    def _subEncoding(self, text, method="keyword"):
        """
        Build a substitute encoding for text the client could not encode.

        @param text      (str)  A non-tokenized sample of text.
        @param method    (str)  "df" selects, among the fingerprints of the
                                individual tokens, the one with the smallest
                                "df" value; "keyword" uses getUnionEncoding.
        @return encoding (dict) Fingerprint from cortipy client, or None if
                                the text could not be encoded.
        @raise ValueError       If method is neither "df" nor "keyword".
        """
        encoding = None
        try:
            if method == "df":
                tokens = list(itertools.chain.from_iterable(
                    t.split(",") for t in self.client.tokenize(text)))
                # min() raises ValueError on an empty sequence, which the
                # handler below would not catch; text without tokens simply
                # has no substitute encoding.
                if tokens:
                    encoding = min(
                        (self.client.getBitmap(t) for t in tokens),
                        key=lambda fingerprint: fingerprint["df"])
            elif method == "keyword":
                encoding = self.getUnionEncoding(text)
            else:
                raise ValueError("method must be either 'df' or 'keyword'")
        except UnsuccessfulEncodingError:
            encoding = None

        if encoding is None and self.verbosity > 0:
            print ("\tThe client returned no substitute encoding for the text "
                   "'{}', so we encode with None.".format(text))
        return encoding

    def compare(self, bitmap1, bitmap2):
        """
        Compare encodings, returning the distances between the SDRs. Input
        bitmaps must be list objects (need to be serializable).

        Example return dict:
          {
            "cosineSimilarity": 0.6666666666666666,
            "euclideanDistance": 0.3333333333333333,
            "jaccardDistance": 0.5,
            "overlappingAll": 6,
            "overlappingLeftRight": 0.6666666666666666,
            "overlappingRightLeft": 0.6666666666666666,
            "sizeLeft": 9,
            "sizeRight": 9,
            "weightedScoring": 0.4436476984102028
          }

        @raise TypeError  If either bitmap is not a list.
        """
        # `a and b` evaluates to just one of its operands, so each argument
        # has to be type-checked on its own.
        if not (isinstance(bitmap1, list) and isinstance(bitmap2, list)):
            raise TypeError("Comparison bitmaps must be lists.")

        return self.client.compare(bitmap1, bitmap2)

    def createCategory(self, label, positives, negatives=None):
        """
        Create a classification category (bitmap) via the Cio classify
        endpoint.

        @param label      (str)  Name of category.
        @param positives  (list) Bitmap(s) of samples to define.
        @param negatives  (list) Not required to make category.

        @return           (dict) Key-values for "positions" (list bitmap
                                 encoding of the category) and
                                 "categoryName" (str).
        @raise TypeError         If positives or negatives is not a list.
        """
        if negatives is None:
            negatives = []

        # Check each argument separately; `positives and negatives` would
        # only ever test one of the two.
        if not (isinstance(positives, list) and isinstance(negatives, list)):
            raise TypeError("Input bitmaps must be lists.")

        return self.client.createClassification(label, positives, negatives)

    def getWidth(self):
        """Return the total number of bits, self.n (width * height)."""
        return self.n

    def getDimensions(self):
        """Return the (width, height) tuple of the encoder."""
        return (self.width, self.height)

    def getDescription(self):
        """Return the description tuple set in the constructor."""
        return self.description

    def densifyPattern(self, bitmap):
        """Return a numpy array of 0s and 1s to represent the given bitmap."""
        densePattern = numpy.zeros(self.n)
        for i in bitmap:
            densePattern[i] = 1.0
        return densePattern

    def reduceSparsity(self, encoding, maxSparsity):
        """
        Reduce the sparsity of the encoding down to maxSparsity.

        The encoding dict is modified in place: its positions are replaced by
        a random subsample and its "sparsity" field is recomputed.

        @param encoding    (dict)  Fingerprint dict with "width", "height" and
                                   fingerprint positions.
        @param maxSparsity (float) Fraction of width * height bits to keep at
                                   most.
        """
        totalBits = encoding["width"] * encoding["height"]
        # Slice bounds must be integers.
        desiredBits = int(maxSparsity * totalBits)

        bitmap = numpy.asarray(encoding["fingerprint"]["positions"])

        # Choose a random subsampling of the bits, seeded from the bitmap so
        # we get consistent bitmaps. A private generator yields the same
        # permutation as seeding the global one without clobbering global
        # random state.
        rng = numpy.random.RandomState(int(bitmap.sum()))
        encoding["fingerprint"]["positions"] = (
            rng.permutation(bitmap)[0:desiredBits])

        encoding["sparsity"] = (
            len(encoding["fingerprint"]["positions"]) / float(totalBits))
class CioEncoder(LanguageEncoder):
    """
    A language encoder using the Cortical.io API.

    The encoder queries the Cortical.io REST API via the cortipy module, which
    returns data in the form of "fingerprints". These representations are
    converted to binary SDR arrays with this Cio encoder.
    """

    def __init__(self, w=128, h=128):
        """
        @param w (int) Width of the SDR.
        @param h (int) Height of the SDR.
        @raise Exception If the CORTICAL_API_KEY environment variable is
                         not set.
        """
        if 'CORTICAL_API_KEY' not in os.environ:
            print ("Missing CORTICAL_API_KEY environment variable. If you have a "
                   "key, set it with $ export CORTICAL_API_KEY=api_key\n"
                   "You can retrieve a key by registering for the REST API at "
                   "http://www.cortical.io/resources_apikey.html")
            raise Exception("Missing API key.")

        self.apiKey = os.environ['CORTICAL_API_KEY']
        # The os module has no join(); a single path component needs no
        # joining, so pass the cache directory directly.
        self.client = CorticalClient(self.apiKey, cacheDir="./cache")
        self.targetSparsity = 1.0
        self.w = w  ## Alternatively get dimensions from cortipy client object?
        self.h = h
        self.n = w*h

    def encode(self, text):
        """
        Encodes the input text w/ a cortipy client. The client returns a
        dictionary of "fingerprint" info, including the SDR bitmap.

        The text is first sent to the client's getBitmap; if that raises
        ValueError, getTextBitmap is used instead. When the client reports a
        sparsity of 0, a pseudo-random bitmap seeded by the text is
        substituted.

        @param text (str, list) Text to encode. A list or tuple of tokens
                                is joined with single spaces before being
                                sent to the client.
        @return     (list)      SDR, as returned by the client's getSDR.
        """
        if isinstance(text, (list, tuple)):
            rawText = " ".join(text)
        else:
            rawText = text

        try:
            encoding = self.client.getBitmap(rawText)
        except ValueError:
            encoding = self.client.getTextBitmap(rawText)

        positions = encoding["fingerprint"]["positions"]

        if encoding["sparsity"] == 0:
            ##TODO: test again when/if this happens
            # No fingerprint so fill w/ random bitmap, seeded for each
            # specific term.
            print ("\tThe client returned a bitmap with sparsity=0 for the string "
                   "\'%s\', so we'll generate a pseudo-random SDR with the target "
                   "sparsity=%0.1f." % (rawText, self.targetSparsity))
            # A private generator keeps the module-level random state
            # untouched, so nothing has to be saved and restored.
            rng = random.Random(rawText)
            num = self.w * self.h
            positions = rng.sample(
                range(num), int(self.targetSparsity * num / 100))
            self._createFromBitmap(positions, self.w, self.h)

        # Return the substitute bitmap when one was generated; otherwise the
        # client's own positions.
        return self.client.getSDR(positions)

    def decode(self, encoding, numTerms=None):
        """
        Converts an SDR back into the most likely word or words.

        By default, the most likely term will be returned. If numTerms is
        specified, then the Cortical.io API will attempt to return that many;
        otherwise the standard is 10. The return value will be a sequence of
        (term, weight) tuples, where higher weights imply the corresponding
        term better matches the encoding.

        @param encoding  (list) SDR.
        @param numTerms  (int)  The max number of terms to return.
        @return similar  (list) List of (term, score) tuples.
        """
        # Convert SDR to bitmap, send to cortipy client.
        bitmap = self.bitmapFromSDR(encoding)
        if numTerms is None:
            terms = self.client.bitmapToTerms(bitmap)
        else:
            terms = self.client.bitmapToTerms(bitmap, numTerms=numTerms)
        # Convert cortipy response to list of tuples (term, weight)
        return [(term["term"], term["score"]) for term in terms]

    ## TODO: redo fields? delete (see line 81 TODO)?
    def _createFromBitmap(self, bitmap, width, height):
        """
        Store the bitmap and its dimensions on the encoder and compute the
        sparsity as a percentage of width * height.

        @return self
        """
        self.bitmap = bitmap
        self.w = width
        self.h = height
        self.sparsity = (100.0 * len(bitmap)) / (width*height)
        return self

    def compare(self, encoding1, encoding2):
        """
        Compare encodings, returning the distances between the SDRs.

        Example return dict:
          {
            "cosineSimilarity": 0.6666666666666666,
            "euclideanDistance": 0.3333333333333333,
            "jaccardDistance": 0.5,
            "overlappingAll": 6,
            "overlappingLeftRight": 0.6666666666666666,
            "overlappingRightLeft": 0.6666666666666666,
            "sizeLeft": 9,
            "sizeRight": 9,
            "weightedScoring": 0.4436476984102028
          }
        """
        # Format input SDRs as Cio fingerprints
        fp1 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding1)}}
        fp2 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding2)}}
        return self.client.compare(fp1, fp2)

    def getWidth(self):
        """Return the SDR width, self.w."""
        return self.w

    def getHeight(self):
        """Return the SDR height, self.h."""
        return self.h

    def getDescription(self):
        """Return self.description."""
        # NOTE(review): this class never assigns self.description --
        # presumably the base class provides it; confirm.
        return self.description