def __init__(self, w=128, h=128, retina=DEFAULT_RETINA, cacheDir="./cache",
             verbosity=0, fingerprintType=EncoderTypes.document):
  """
  Set up the cortipy client and the SDR topology for this encoder.

  @param w                (int)  Width dimension of the SDR topology.
  @param h                (int)  Height dimension of the SDR topology.
  @param cacheDir         (str)  Where to cache results of API queries.
  @param verbosity        (int)  Amount of info printed out, 0, 1, or 2.
  @param fingerprintType  (Enum) Specify word- or document-level encoding.
  """
  if "CORTICAL_API_KEY" not in os.environ:
    # Fail fast with instructions rather than erroring deep in the client.
    helpMsg = (
      "Missing CORTICAL_API_KEY environment variable. If you have a "
      "key, set it with $ export CORTICAL_API_KEY=api_key\n"
      "You can retrieve a key by registering for the REST API at "
      "http://www.cortical.io/resources_apikey.html")
    print(helpMsg)
    raise OSError("Missing API key.")

  # Topology bookkeeping: n is the flattened bit count of the w-by-h grid.
  self.w = w
  self.h = h
  self.n = w * h
  self.targetSparsity = 5.0
  self.verbosity = verbosity
  self.fingerprintType = fingerprintType
  self.description = ("Cio Encoder", 0)

  self.apiKey = os.environ["CORTICAL_API_KEY"]
  self.client = CorticalClient(self.apiKey, retina=retina,
                               cacheDir=cacheDir)
def cacheDir(self, value):
  """Setter: point the encoder's disk cache at a new directory."""
  # A falsy value means "no explicit directory"; leave everything untouched.
  if not value:
    return
  self._cacheDir = value
  # The Cio client caches API results on disk, so it must be rebuilt to
  # read from and write to the new directory.
  self.client = CorticalClient(self.apiKey, retina=self.retina,
                               cacheDir=value)
def __init__(self, retina=DEFAULT_RETINA, retinaScaling=1.0, cacheDir=None,
             verbosity=0, fingerprintType=EncoderTypes.document,
             unionSparsity=0.20, apiKey=None, maxSparsity=0.50):
  """
  Set up the Cio client, SDR dimensions, and sparsity limits.

  @param retina          (str)   Cortical.io retina, either "en_synonymous"
                                 or "en_associative".
  @param retinaScaling   (float) Scale each dimension of the SDR bitmap by
                                 this factor.
  @param cacheDir        (str)   Where to cache results of API queries.
  @param verbosity       (int)   Amount of info printed out, 0, 1, or 2.
  @param fingerprintType (Enum)  Specify word- or document-level encoding.
  @param unionSparsity   (float) Any union'ing done in this encoder will
                                 stop once this sparsity is reached.
  @param maxSparsity     (float) The maximum sparsity of the returned
                                 bitmap. If the percentage of bits in the
                                 encoding is > maxSparsity, it will be
                                 randomly subsampled.
  @raises OSError when no API key is supplied or found in the environment.

  TODO: replace enum with a simple string
  """
  # An explicit apiKey argument takes precedence over the environment.
  if apiKey is None and "CORTICAL_API_KEY" not in os.environ:
    print(
      "Missing CORTICAL_API_KEY environment variable. If you have a "
      "key, set it with $ export CORTICAL_API_KEY=api_key\n"
      "You can retrieve a key by registering for the REST API at "
      "http://www.cortical.io/resources_apikey.html")
    raise OSError("Missing API key.")

  super(CioEncoder, self).__init__(unionSparsity=unionSparsity)

  if cacheDir is None:
    # Use the default cache directory, located alongside this module.
    root = os.path.dirname(os.path.realpath(__file__))
    cacheDir = os.path.join(root, "CioCache")

  self.apiKey = apiKey if apiKey else os.environ["CORTICAL_API_KEY"]
  self.retina = retina
  self.client = CorticalClient(self.apiKey, retina=retina,
                               cacheDir=cacheDir)
  # NOTE(review): other versions of this class call
  # _setDimensions(retina, retinaScaling); presumably this variant's
  # _setDimensions reads self.retina (set above) -- confirm its signature.
  self._setDimensions(retinaScaling)
  self.fingerprintType = fingerprintType
  self.description = ("Cio Encoder", 0)
  self.verbosity = verbosity
  self.maxSparsity = maxSparsity
def __init__(self, verbosity=1, numLabels=1):
  """
  Set up the Cio encoder and cortipy client for this classification model.

  A valid Cortical.io API key is required (checked by CioEncoder).
  """
  super(ClassificationModelContext, self).__init__(verbosity)

  encoder = CioEncoder(cacheDir="./experiments/cache")
  self.encoder = encoder
  self.client = CorticalClient(encoder.apiKey)

  # Total bit count of the SDR, and the ON-bit count implied by the
  # encoder's target sparsity percentage.
  self.n = encoder.n
  self.w = int((encoder.targetSparsity / 100) * self.n)

  self.numLabels = numLabels
  self.categoryBitmaps = {}
def __init__(self, retina=DEFAULT_RETINA, retinaScaling=1.0, cacheDir=None,
             verbosity=0, fingerprintType=EncoderTypes.document,
             unionSparsity=0.20, apiKey=None, maxSparsity=0.50):
  """
  Set up the Cio client, SDR dimensions, and sparsity limits.

  @param retina          (str)   Cortical.io retina, either "en_synonymous"
                                 or "en_associative".
  @param retinaScaling   (float) Scale each dimension of the SDR bitmap by
                                 this factor.
  @param cacheDir        (str)   Where to cache results of API queries.
  @param verbosity       (int)   Amount of info printed out, 0, 1, or 2.
  @param fingerprintType (Enum)  Specify word- or document-level encoding.
  @param unionSparsity   (float) Any union'ing done in this encoder will
                                 stop once this sparsity is reached.
  @param maxSparsity     (float) The maximum sparsity of the returned
                                 bitmap. If the percentage of bits in the
                                 encoding is > maxSparsity, it will be
                                 randomly subsampled.
  @raises OSError when no API key is supplied or found in the environment.

  TODO: replace enum with a simple string
  """
  if apiKey is None and "CORTICAL_API_KEY" not in os.environ:
    print ("Missing CORTICAL_API_KEY environment variable. If you have a "
      "key, set it with $ export CORTICAL_API_KEY=api_key\n"
      "You can retrieve a key by registering for the REST API at "
      "http://www.cortical.io/resources_apikey.html")
    raise OSError("Missing API key.")

  super(CioEncoder, self).__init__(unionSparsity=unionSparsity)

  if cacheDir is None:
    # Default to a cache directory located alongside this module.
    root = os.path.dirname(os.path.realpath(__file__))
    cacheDir = os.path.join(root, "CioCache")
  self.cacheDir = cacheDir

  self.apiKey = apiKey if apiKey else os.environ["CORTICAL_API_KEY"]
  # Fix: the CorticalClient was previously constructed twice with identical
  # arguments and the first instance discarded; build it exactly once.
  self.client = CorticalClient(self.apiKey, retina=retina,
                               cacheDir=self.cacheDir)

  self._setDimensions(retina, retinaScaling)
  self.fingerprintType = fingerprintType
  self.description = ("Cio Encoder", 0)
  self.verbosity = verbosity
  self.maxSparsity = maxSparsity
def __init__(self, w=128, h=128):
  """
  Set up the cortipy client and the w-by-h SDR topology.

  @param w (int) Width dimension of the SDR topology.
  @param h (int) Height dimension of the SDR topology.
  @raises Exception when CORTICAL_API_KEY is not set in the environment.
  """
  if 'CORTICAL_API_KEY' not in os.environ:
    print(
      "Missing CORTICAL_API_KEY environment variable. If you have a "
      "key, set it with $ export CORTICAL_API_KEY=api_key\n"
      "You can retrieve a key by registering for the REST API at "
      "http://www.cortical.io/resources_apikey.html")
    raise Exception("Missing API key.")
  self.apiKey = os.environ['CORTICAL_API_KEY']
  # Fix: os.join() does not exist (AttributeError at runtime); the intended
  # cache path is simply "./cache".
  self.client = CorticalClient(self.apiKey, cacheDir="./cache")
  self.targetSparsity = 1.0
  self.w = w  ## Alternatively get dimensions from cortipy client object?
  self.h = h
  self.n = w * h
def __init__(self, verbosity=1, numLabels=1):
  """
  Set up the Cio encoder (cached next to this module) and cortipy client.

  A valid Cortical.io API key is required (checked by CioEncoder).
  """
  super(ClassificationModelContext, self).__init__(verbosity)

  # Keep the API cache next to this source file so it travels with the repo.
  here = os.path.dirname(os.path.realpath(__file__))
  encoder = CioEncoder(cacheDir=os.path.join(here, "CioCache"))
  self.encoder = encoder
  self.client = CorticalClient(encoder.apiKey)

  # Total bit count of the SDR, and the ON-bit count implied by the
  # encoder's target sparsity percentage.
  self.n = encoder.n
  self.w = int((encoder.targetSparsity / 100) * self.n)

  self.numLabels = numLabels
  self.categoryBitmaps = {}
def __init__(self, retina=DEFAULT_RETINA, retinaScaling=1.0, cacheDir=None,
             verbosity=0, fingerprintType=EncoderTypes.document,
             unionSparsity=0.20, apiKey=None):
  """
  Create an encoder backed by the Cortical.io REST API.

  @param retina          (str)   Cortical.io retina, either "en_synonymous"
                                 or "en_associative".
  @param retinaScaling   (float) Scales the dimensions of the SDR topology,
                                 where the width and height are both 128.
  @param cacheDir        (str)   Where to cache results of API queries.
  @param verbosity       (int)   Amount of info printed out, 0, 1, or 2.
  @param fingerprintType (Enum)  Specify word- or document-level encoding.
  @raises OSError when no API key is supplied or found in the environment.
  """
  keyAvailable = apiKey is not None or "CORTICAL_API_KEY" in os.environ
  if not keyAvailable:
    print ("Missing CORTICAL_API_KEY environment variable. If you have a "
      "key, set it with $ export CORTICAL_API_KEY=api_key\n"
      "You can retrieve a key by registering for the REST API at "
      "http://www.cortical.io/resources_apikey.html")
    raise OSError("Missing API key.")

  super(CioEncoder, self).__init__(unionSparsity=unionSparsity)

  if cacheDir is None:
    # Default cache directory lives alongside this module.
    cacheDir = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), "CioCache")

  self.apiKey = apiKey if apiKey else os.environ["CORTICAL_API_KEY"]
  self.client = CorticalClient(self.apiKey, retina=retina,
                               cacheDir=cacheDir)
  self._setDimensions(retina, retinaScaling)

  self.verbosity = verbosity
  self.fingerprintType = fingerprintType
  self.description = ("Cio Encoder", 0)
def main(terms):
  """
  Count bit frequencies for the given terms (or a random baseline) and
  render the plot selected on the command line.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument("--retinaId", default="en_synonymous", type=str)
  parser.add_argument("--corticalApiKey",
                      default=os.environ.get("CORTICAL_API_KEY"), type=str)
  parser.add_argument(
    "--plot",
    default="histogram",
    choices=["histogram", "heatmap", "frequencyHistogram", "None", "none"])
  parser.add_argument("--cacheDir", default=None, type=str)
  opts = parser.parse_args()

  if opts.retinaId == "random":
    counts = countRandomBitFrequencies()
  else:
    client = CorticalClient(opts.corticalApiKey,
                            retina=opts.retinaId,
                            verbosity=0,
                            cacheDir=opts.cacheDir,
                            fillSDR=None)
    counts = countBitFrequenciesForTerms(client, terms)

  # Dispatch on the requested plot; "None"/"none" intentionally plot nothing.
  plotters = {
    "histogram": plotHistogram,
    "heatmap": plotlyHeatmap,
    "frequencyHistogram": plotlyFrequencyHistogram,
  }
  plotter = plotters.get(opts.plot)
  if plotter is not None:
    plotter(counts)
def __setstate__(self, state):
  """
  Called when CioEncoder is unpickled per pickle protocol. This includes a
  mechanism to gracefully re-init the CorticalClient instance with a
  calculated cacheDir, in the event that the previously pickled
  CorticalClient instance includes a cacheDir that does not exist, which is
  likely the case when a model is trained on one machine for reuse
  elsewhere.
  """
  # Pickles created before the _cacheDir attribute existed carry a client
  # whose on-disk cache path may not exist here; rebuild that client against
  # a cache directory valid on this machine.
  if "_cacheDir" not in state:
    # NOTE(review): self.cacheDir is read *before* state is installed below,
    # so this presumably relies on a cacheDir property whose getter computes
    # a default on a bare instance -- confirm the getter does not itself
    # require _cacheDir or other unpickled attributes.
    state["client"] = CorticalClient(state["apiKey"],
                                     retina=state["client"].retina,
                                     cacheDir=self.cacheDir)
  self.__dict__ = state
def __init__(self, retina=DEFAULT_RETINA, retinaScaling=1.0, cacheDir=None,
             verbosity=0, fingerprintType=EncoderTypes.document,
             unionSparsity=0.20, apiKey=None):
  """
  Create an encoder backed by the Cortical.io REST API.

  @param retina          (str)   Cortical.io retina, either "en_synonymous"
                                 or "en_associative".
  @param retinaScaling   (float) Scales the dimensions of the SDR topology,
                                 where the width and height are both 128.
  @param cacheDir        (str)   Where to cache results of API queries.
  @param verbosity       (int)   Amount of info printed out, 0, 1, or 2.
  @param fingerprintType (Enum)  Specify word- or document-level encoding.
  @raises OSError when no API key is supplied or found in the environment.

  TODO: replace enum with a simple string
  """
  if apiKey is None and "CORTICAL_API_KEY" not in os.environ:
    print ("Missing CORTICAL_API_KEY environment variable. If you have a "
      "key, set it with $ export CORTICAL_API_KEY=api_key\n"
      "You can retrieve a key by registering for the REST API at "
      "http://www.cortical.io/resources_apikey.html")
    raise OSError("Missing API key.")

  super(CioEncoder, self).__init__(unionSparsity=unionSparsity)

  if cacheDir is None:
    # Default cache directory lives alongside this module.
    moduleDir = os.path.dirname(os.path.realpath(__file__))
    cacheDir = os.path.join(moduleDir, "CioCache")

  key = apiKey if apiKey else os.environ["CORTICAL_API_KEY"]
  self.apiKey = key
  self.client = CorticalClient(key, retina=retina, cacheDir=cacheDir)
  self._setDimensions(retina, retinaScaling)

  self.verbosity = verbosity
  self.fingerprintType = fingerprintType
  self.description = ("Cio Encoder", 0)
def __init__(self, w=128, h=128, cacheDir="./cache", verbosity=0):
  """
  Build a Cio encoder with a w-by-h SDR topology.

  @raises Exception when CORTICAL_API_KEY is not set in the environment.
  """
  if 'CORTICAL_API_KEY' not in os.environ:
    print ("Missing CORTICAL_API_KEY environment variable. If you have a "
      "key, set it with $ export CORTICAL_API_KEY=api_key\n"
      "You can retrieve a key by registering for the REST API at "
      "http://www.cortical.io/resources_apikey.html")
    raise Exception("Missing API key.")

  # Topology bookkeeping: n is the flattened bit count of the w-by-h grid.
  self.w = w
  self.h = h
  self.n = w * h
  self.targetSparsity = 5.0
  self.verbosity = verbosity

  self.apiKey = os.environ['CORTICAL_API_KEY']
  self.client = CorticalClient(self.apiKey, cacheDir=cacheDir)
def __init__(self, w=128, h=128):
  """
  Set up the cortipy client and the w-by-h SDR topology.

  @param w (int) Width dimension of the SDR topology.
  @param h (int) Height dimension of the SDR topology.
  @raises Exception when CORTICAL_API_KEY is not set in the environment.
  """
  if 'CORTICAL_API_KEY' not in os.environ:
    print ("Missing CORTICAL_API_KEY environment variable. If you have a "
      "key, set it with $ export CORTICAL_API_KEY=api_key\n"
      "You can retrieve a key by registering for the REST API at "
      "http://www.cortical.io/resources_apikey.html")
    raise Exception("Missing API key.")
  self.apiKey = os.environ['CORTICAL_API_KEY']
  # Fix: os.join() does not exist (AttributeError at runtime); the intended
  # cache path is simply "./cache".
  self.client = CorticalClient(self.apiKey, cacheDir="./cache")
  self.targetSparsity = 1.0
  self.w = w  ## Alternatively get dimensions from cortipy client object?
  self.h = h
  self.n = w*h
parser = argparse.ArgumentParser() parser.add_argument("--retinaId", default="en_synonymous", type=str) parser.add_argument("--corticalApiKey", default=os.environ.get("CORTICAL_API_KEY"), type=str) parser.add_argument("--cacheDir", default="../../htmresearch/encoders/CioCache", type=str) opts = parser.parse_args() client = CorticalClient(opts.corticalApiKey, retina=opts.retinaId, verbosity=0, cacheDir=opts.cacheDir, fillSDR=None) # Read in words from dictionary with open("enable1.txt", "r") as f: lines = f.readlines() print "Processing",len(lines),"lines..." words = [] random.seed(42) # Subsample small percentage of words for line in lines: p = random.uniform(0,1) if p <= 1.05:
class CioEncoder(LanguageEncoder):
  """
  A language encoder using the Cortical.io API.

  The encoder queries the Cortical.io REST API via the cortipy module, which
  returns data in the form of "fingerprints". These representations are
  converted to binary SDR arrays with this Cio encoder.
  """

  def __init__(self, w=128, h=128, retina=DEFAULT_RETINA, cacheDir="./cache",
               verbosity=0, fingerprintType=EncoderTypes.document):
    """
    @param w (int) Width dimension of the SDR topology.
    @param h (int) Height dimension of the SDR topology.
    @param cacheDir (str) Where to cache results of API queries.
    @param verbosity (int) Amount of info printed out, 0, 1, or 2.
    @param fingerprintType (Enum) Specify word- or document-level encoding.
    @raises OSError when CORTICAL_API_KEY is not set in the environment.
    """
    if "CORTICAL_API_KEY" not in os.environ:
      print(
        "Missing CORTICAL_API_KEY environment variable. If you have a "
        "key, set it with $ export CORTICAL_API_KEY=api_key\n"
        "You can retrieve a key by registering for the REST API at "
        "http://www.cortical.io/resources_apikey.html")
      raise OSError("Missing API key.")

    self.apiKey = os.environ["CORTICAL_API_KEY"]
    self.client = CorticalClient(self.apiKey, retina=retina,
                                 cacheDir=cacheDir)
    self.targetSparsity = 5.0
    self.w = w
    self.h = h
    self.n = w * h
    self.verbosity = verbosity
    self.fingerprintType = fingerprintType
    self.description = ("Cio Encoder", 0)

  def encode(self, text):
    """
    Encodes the input text w/ a cortipy client. The client returns a
    dictionary of "fingerprint" info, including the SDR bitmap.

    NOTE: returning this fingerprint dict differs from the base class spec.

    @param text (str) A non-tokenized sample of text.
    @return     (dict) Result from the cortipy client. The bitmap encoding
                is at encoding["fingerprint"]["positions"].
    """
    if not text:
      return None
    try:
      # NOTE(review): if fingerprintType is neither document nor word,
      # `encoding` is never bound and the return raises NameError -- the
      # enum presumably has only these two members; confirm.
      if self.fingerprintType is EncoderTypes.document:
        encoding = self.client.getTextBitmap(text)
      elif self.fingerprintType is EncoderTypes.word:
        encoding = self.getUnionEncoding(text)
    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print(
          "\tThe client returned no encoding for the text \'{0}\', so "
          "we'll use the encoding of the token that is least frequent in "
          "the corpus.".format(text))
      encoding = self._subEncoding(text)
    return encoding

  def getUnionEncoding(self, text):
    """
    Encode each token of the input text, take the union, and then sparsify.

    @param text (str) A non-tokenized sample of text.
    @return     (dict) The bitmap encoding is at
                encoding["fingerprint"]["positions"].
    """
    tokens = TextPreprocess().tokenize(text)

    # Count the ON bits represented in the encoded tokens.
    counts = Counter()
    for t in tokens:
      bitmap = self.client.getBitmap(t)["fingerprint"]["positions"]
      counts.update(bitmap)

    positions = self.sparseUnion(counts)

    # Populate encoding
    encoding = {
      "text": text,
      "sparsity": len(positions) * 100 / float(self.n),
      "df": 0.0,
      "height": self.h,
      "width": self.w,
      "score": 0.0,
      "fingerprint": {
        "positions": sorted(positions)
      },
      "pos_types": []
    }

    return encoding

  def encodeIntoArray(self, inputText, output):
    """
    See method description in language_encoder.py. It is expected the
    inputText is a single word/token (str).

    NOTE: nupic Encoder class method encodes output in place as sparse
    array (commented out below), but this method returns a bitmap.
    """
    if not isinstance(inputText, str):
      raise TypeError(
        "Expected a string input but got input of type {}.".format(
          type(inputText)))

    # Encode with term endpoint of Cio API
    try:
      encoding = self.client.getBitmap(inputText)
    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print(
          "\tThe client returned no encoding for the text \'{0}\', so "
          "we'll use the encoding of the token that is least frequent in "
          "the corpus.".format(inputText))
      encoding = self._subEncoding(inputText)

    # output = sparsify(encoding["fingerprint"]["positions"])
    return encoding

  def decode(self, encoding, numTerms=10):
    """
    Converts an SDR back into the most likely word or words.

    By default, the most likely term will be returned. If numTerms is
    specified, then the Cortical.io API will attempt to return that many;
    otherwise the standard is 10. The return value will be a sequence of
    (term, weight) tuples, where higher weights imply the corresponding term
    better matches the encoding.

    @param encoding (list) Bitmap encoding.
    @param numTerms (int)  The max number of terms to return.
    @return         (list) List of (term, weight) tuples.
    """
    terms = self.client.bitmapToTerms(encoding, numTerms=numTerms)
    # Convert cortipy response to list of tuples (term, weight)
    return [((term["term"], term["score"])) for term in terms]

  def _subEncoding(self, text, method="keyword"):
    """
    @param text     (str)  A non-tokenized sample of text.
    @return encoding (dict) Fingerprint from cortipy client, or None if the
                     text could not be encoded.
    @raises ValueError for an unrecognized method.
    """
    tokens = list(
      itertools.chain.from_iterable(
        [t.split(',') for t in self.client.tokenize(text)]))
    try:
      if method == "df":
        # Pick the token with the lowest document frequency.
        encoding = min([self.client.getBitmap(t) for t in tokens],
                       key=lambda x: x["df"])
      elif method == "keyword":
        encoding = self.getUnionEncoding(text)
      else:
        raise ValueError("method must be either \'df\' or \'keyword\'")
    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print(
          "\tThe client returned no substitute encoding for the text "
          "\'{0}\', so we encode with None.".format(text))
      encoding = None

    return encoding

  def compare(self, bitmap1, bitmap2):
    """
    Compare encodings, returning the distances between the SDRs. Input
    bitmaps must be list objects (need to be serializable).

    Example return dict:
      {
        "cosineSimilarity": 0.6666666666666666,
        "euclideanDistance": 0.3333333333333333,
        "jaccardDistance": 0.5,
        "overlappingAll": 6,
        "overlappingLeftRight": 0.6666666666666666,
        "overlappingRightLeft": 0.6666666666666666,
        "sizeLeft": 9,
        "sizeRight": 9,
        "weightedScoring": 0.4436476984102028
      }
    """
    # Fix: isinstance(bitmap1 and bitmap2, list) only type-checked one of
    # the operands (`and` yields a single object); check both explicitly.
    if not (isinstance(bitmap1, list) and isinstance(bitmap2, list)):
      raise TypeError("Comparison bitmaps must be lists.")

    return self.client.compare(bitmap1, bitmap2)

  def createCategory(self, label, positives, negatives=None):
    """
    Create a classification category (bitmap) via the Cio claassify
    endpoint.

    @param label     (str)  Name of category.
    @param positives (list) Bitmap(s) of samples to define.
    @param negatives (list) Not required to make category.
    @return          (dict) Key-values for "positions" (list bitmap encoding
                     of the category and "categoryName" (str).
    """
    if negatives is None:
      negatives = []
    # Fix: isinstance(positives and negatives, list) only type-checked one
    # of the operands; check both explicitly.
    if not (isinstance(positives, list) and isinstance(negatives, list)):
      raise TypeError("Input bitmaps must be lists.")

    return self.client.createClassification(label, positives, negatives)

  def getWidth(self):
    """Total bit count of the encoding (w * h)."""
    return self.n

  def getDescription(self):
    return self.description
class CioEncoder(LanguageEncoder):
  """
  A language encoder using the Cortical.io API.

  The encoder queries the Cortical.io REST API via the cortipy module, which
  returns data in the form of "fingerprints". These representations are
  converted to binary SDR arrays with this Cio encoder.
  """

  def __init__(self, w=128, h=128, cacheDir="./cache", verbosity=0):
    """
    Set up the cortipy client and the w-by-h SDR topology.

    @raises Exception when CORTICAL_API_KEY is not set in the environment.
    """
    if 'CORTICAL_API_KEY' not in os.environ:
      print ("Missing CORTICAL_API_KEY environment variable. If you have a "
        "key, set it with $ export CORTICAL_API_KEY=api_key\n"
        "You can retrieve a key by registering for the REST API at "
        "http://www.cortical.io/resources_apikey.html")
      raise Exception("Missing API key.")
    self.apiKey = os.environ['CORTICAL_API_KEY']
    self.client = CorticalClient(self.apiKey, cacheDir=cacheDir)
    self.targetSparsity = 5.0
    self.w = w
    self.h = h
    self.n = w*h
    self.verbosity = verbosity

  def encode(self, text):
    """
    Encodes the input text w/ a cortipy client. The client returns a
    dictionary of "fingerprint" info, including the SDR bitmap.

    @param text (str) A non-tokenized sample of text.
    @return     (dict) Result from the cortipy client. The bitmap encoding
                is at encoding["fingerprint"]["positions"].
    """
    try:
      encoding = self.client.getTextBitmap(text)
    except Exception:
      # Best-effort fallback: substitute the least-frequent token's bitmap.
      if self.verbosity > 0:
        print("\tThe client returned no encoding for the text, so we'll use "
              "the encoding of the token that is least frequent in the corpus.")
      encoding = self._subEncoding(text)
    return encoding

  def decode(self, encoding, numTerms=None):
    """
    Converts an SDR back into the most likely word or words.

    By default, the most likely term will be returned. If numTerms is
    specified, then the Cortical.io API will attempt to return that many;
    otherwise the standard is 10. The return value will be a sequence of
    (term, weight) tuples, where higher weights imply the corresponding term
    better matches the encoding.

    @param encoding (list) SDR.
    @param numTerms (int)  The max number of terms to return.
    @return similar (list) List of (term, weight) tuples.
    """
    # Convert SDR to bitmap, send to cortipy client.
    # Fix: `client` was an undefined global name (NameError at runtime);
    # use the instance's client.
    # NOTE(review): numTerms is accepted but not forwarded to the client;
    # presumably the client default applies -- confirm before forwarding.
    terms = self.client.bitmapToTerms(
      super(CioEncoder, self).bitmapFromSDR(encoding))
    # Convert cortipy response to list of tuples (term, weight)
    return [((term["term"], term["score"])) for term in terms]

  def _subEncoding(self, text):
    """
    @param text      (str)  A non-tokenized sample of text.
    @return encoding (dict) Fingerprint from cortipy client. An empty
                     dictionary if the text could not be encoded.
    """
    tokens = list(itertools.chain.from_iterable(
      [t.split(',') for t in self.client.tokenize(text)]))
    try:
      # Pick the token with the lowest document frequency.
      encoding = min([self.client.getBitmap(t) for t in tokens],
                     key=lambda x: x["df"])
    except Exception:
      encoding = {}
    return encoding

  ## TODO: redo fields? delete (see line 81 TODO)?
  def _createFromBitmap(self, bitmap, width, height):
    # Mutates this encoder's topology fields to match the given bitmap.
    self.bitmap = bitmap
    self.w = width
    self.h = height
    self.sparsity = (100.0 * len(bitmap)) / (width*height)
    return self

  def compare(self, encoding1, encoding2):
    """
    Compare encodings, returning the distances between the SDRs.

    Example return dict:
      {
        "cosineSimilarity": 0.6666666666666666,
        "euclideanDistance": 0.3333333333333333,
        "jaccardDistance": 0.5,
        "overlappingAll": 6,
        "overlappingLeftRight": 0.6666666666666666,
        "overlappingRightLeft": 0.6666666666666666,
        "sizeLeft": 9,
        "sizeRight": 9,
        "weightedScoring": 0.4436476984102028
      }
    """
    # Format input SDRs as Cio fingerprints
    fp1 = {"fingerprint": {"positions":self.bitmapFromSDR(encoding1)}}
    fp2 = {"fingerprint": {"positions":self.bitmapFromSDR(encoding2)}}

    return self.client.compare(fp1, fp2)

  def getWidth(self):
    return self.w

  def getHeight(self):
    return self.h

  def getDescription(self):
    # NOTE(review): self.description is never assigned in this class, so
    # this raises AttributeError unless a subclass or caller sets it.
    return self.description
class CioEncoder(LanguageEncoder):
  """
  A language encoder using the Cortical.io API.

  The encoder queries the Cortical.io REST API via the cortipy module, which
  returns data in the form of "fingerprints". These representations are
  converted to binary SDR arrays with this Cio encoder.
  """

  def __init__(self, w=128, h=128):
    """
    Set up the cortipy client and the w-by-h SDR topology.

    @raises Exception when CORTICAL_API_KEY is not set in the environment.
    """
    if 'CORTICAL_API_KEY' not in os.environ:
      print ("Missing CORTICAL_API_KEY environment variable. If you have a "
        "key, set it with $ export CORTICAL_API_KEY=api_key\n"
        "You can retrieve a key by registering for the REST API at "
        "http://www.cortical.io/resources_apikey.html")
      raise Exception("Missing API key.")
    self.apiKey = os.environ['CORTICAL_API_KEY']
    # Fix: os.join() does not exist (AttributeError at runtime); the
    # intended cache path is simply "./cache".
    self.client = CorticalClient(self.apiKey, cacheDir="./cache")
    self.targetSparsity = 1.0
    self.w = w  ## Alternatively get dimensions from cortipy client object?
    self.h = h
    self.n = w*h

  def encode(self, text):
    """
    Encodes the input text w/ a cortipy client. The client returns a
    dictionary of "fingerprint" info, including the SDR bitmap.

    @param text (str, list) If the input is type str, the encoder assumes
                it has not yet been tokenized. A list input will skip the
                tokenization step.
    @return     (list) SDR.
    """
    if isinstance(text, str):
      text = self.client.tokenize(text)
    try:
      # Fix: the original referenced an undefined name `string` throughout
      # this method (NameError at runtime); the argument is `text`.
      encoding = self.client.getBitmap(text)
    except ValueError:
      encoding = self.client.getTextBitmap(text)
    if encoding.sparsity == 0:
      ##TODO: test again when/if this happens
      # No fingerprint so fill w/ random bitmap, seeded for each specific term.
      print ("\tThe client returned a bitmap with sparsity=0 for the string "
             "\'%s\', so we'll generate a pseudo-random SDR with the target "
             "sparsity=%0.1f."
             % (text, self.targetSparsity))
      # Seed from the input so the fallback is deterministic per term, then
      # restore the global random state.
      state = random.getstate()
      random.seed(text)
      num = self.w * self.h
      bitmap = random.sample(range(num),
                             int(self.targetSparsity * num / 100))
      self._createFromBitmap(bitmap, self.w, self.h)
      random.setstate(state)
    return self.client.getSDR(encoding["fingerprint"]["positions"])

  def decode(self, encoding, numTerms=None):
    """
    Converts an SDR back into the most likely word or words.

    By default, the most likely term will be returned. If numTerms is
    specified, then the Cortical.io API will attempt to return that many;
    otherwise the standard is 10. The return value will be a sequence of
    (term, weight) tuples, where higher weights imply the corresponding term
    better matches the encoding.

    @param encoding (list) SDR.
    @param numTerms (int)  The max number of terms to return.
    @return similar (list) List of (term, weight) tuples.
    """
    # Convert SDR to bitmap, send to cortipy client.
    # Fix: `client` was an undefined global name (NameError at runtime);
    # use the instance's client.
    terms = self.client.bitmapToTerms(
      super(CioEncoder, self).bitmapFromSDR(encoding))
    # Convert cortipy response to list of tuples (term, weight)
    return [((term["term"], term["score"])) for term in terms]

  ## TODO: redo fields? delete (see line 81 TODO)?
  def _createFromBitmap(self, bitmap, width, height):
    # Mutates this encoder's topology fields to match the given bitmap.
    self.bitmap = bitmap
    self.w = width
    self.h = height
    self.sparsity = (100.0 * len(bitmap)) / (width*height)
    return self

  def compare(self, encoding1, encoding2):
    """
    Compare encodings, returning the distances between the SDRs.

    Example return dict:
      {
        "cosineSimilarity": 0.6666666666666666,
        "euclideanDistance": 0.3333333333333333,
        "jaccardDistance": 0.5,
        "overlappingAll": 6,
        "overlappingLeftRight": 0.6666666666666666,
        "overlappingRightLeft": 0.6666666666666666,
        "sizeLeft": 9,
        "sizeRight": 9,
        "weightedScoring": 0.4436476984102028
      }
    """
    # Format input SDRs as Cio fingerprints
    fp1 = {"fingerprint": {"positions":self.bitmapFromSDR(encoding1)}}
    fp2 = {"fingerprint": {"positions":self.bitmapFromSDR(encoding2)}}

    return self.client.compare(fp1, fp2)

  def getWidth(self):
    return self.w

  def getHeight(self):
    return self.h

  def getDescription(self):
    # NOTE(review): self.description is never assigned in this class, so
    # this raises AttributeError unless a subclass or caller sets it.
    return self.description
class CioEncoder(LanguageEncoder):
  """
  A language encoder using the Cortical.io API.

  The encoder queries the Cortical.io REST API via the cortipy module, which
  returns data in the form of "fingerprints". These representations are
  converted to binary SDR arrays with this Cio encoder.
  """

  def __init__(self, w=128, h=128):
    """
    Set up the cortipy client and the w-by-h SDR topology.

    @raises Exception when CORTICAL_API_KEY is not set in the environment.
    """
    if 'CORTICAL_API_KEY' not in os.environ:
      print(
        "Missing CORTICAL_API_KEY environment variable. If you have a "
        "key, set it with $ export CORTICAL_API_KEY=api_key\n"
        "You can retrieve a key by registering for the REST API at "
        "http://www.cortical.io/resources_apikey.html")
      raise Exception("Missing API key.")
    self.apiKey = os.environ['CORTICAL_API_KEY']
    # Fix: os.join() does not exist (AttributeError at runtime); the
    # intended cache path is simply "./cache".
    self.client = CorticalClient(self.apiKey, cacheDir="./cache")
    self.targetSparsity = 1.0
    self.w = w  ## Alternatively get dimensions from cortipy client object?
    self.h = h
    self.n = w * h

  def encode(self, text):
    """
    Encodes the input text w/ a cortipy client. The client returns a
    dictionary of "fingerprint" info, including the SDR bitmap.

    @param text (str, list) If the input is type str, the encoder assumes
                it has not yet been tokenized. A list input will skip the
                tokenization step.
    @return     (list) SDR.
    """
    if isinstance(text, str):
      text = self.client.tokenize(text)
    try:
      encoding = self.client.getBitmap(text)
    except ValueError:
      encoding = self.client.getTextBitmap(text)
    if encoding.sparsity == 0:
      ##TODO: test again when/if this happens
      # No fingerprint so fill w/ random bitmap, seeded for each specific term.
      print(
        "\tThe client returned a bitmap with sparsity=0 for the string "
        "\'%s\', so we'll generate a pseudo-random SDR with the target "
        "sparsity=%0.1f."
        % (text, self.targetSparsity))
      # Seed from the input so the fallback is deterministic per term, then
      # restore the global random state.
      state = random.getstate()
      random.seed(text)
      num = self.w * self.h
      bitmap = random.sample(range(num),
                             int(self.targetSparsity * num / 100))
      self._createFromBitmap(bitmap, self.w, self.h)
      random.setstate(state)
    return self.client.getSDR(encoding["fingerprint"]["positions"])

  def decode(self, encoding, numTerms=None):
    """
    Converts an SDR back into the most likely word or words.

    By default, the most likely term will be returned. If numTerms is
    specified, then the Cortical.io API will attempt to return that many;
    otherwise the standard is 10. The return value will be a sequence of
    (term, weight) tuples, where higher weights imply the corresponding term
    better matches the encoding.

    @param encoding (list) SDR.
    @param numTerms (int)  The max number of terms to return.
    @return similar (list) List of (term, weight) tuples.
    """
    # Convert SDR to bitmap, send to cortipy client.
    # Fix: `client` was an undefined global name (NameError at runtime);
    # use the instance's client.
    terms = self.client.bitmapToTerms(
      super(CioEncoder, self).bitmapFromSDR(encoding))
    # Convert cortipy response to list of tuples (term, weight)
    return [((term["term"], term["score"])) for term in terms]

  ## TODO: redo fields? delete (see line 81 TODO)?
  def _createFromBitmap(self, bitmap, width, height):
    # Mutates this encoder's topology fields to match the given bitmap.
    self.bitmap = bitmap
    self.w = width
    self.h = height
    self.sparsity = (100.0 * len(bitmap)) / (width * height)
    return self

  def compare(self, encoding1, encoding2):
    """
    Compare encodings, returning the distances between the SDRs.

    Example return dict:
      {
        "cosineSimilarity": 0.6666666666666666,
        "euclideanDistance": 0.3333333333333333,
        "jaccardDistance": 0.5,
        "overlappingAll": 6,
        "overlappingLeftRight": 0.6666666666666666,
        "overlappingRightLeft": 0.6666666666666666,
        "sizeLeft": 9,
        "sizeRight": 9,
        "weightedScoring": 0.4436476984102028
      }
    """
    # Format input SDRs as Cio fingerprints
    fp1 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding1)}}
    fp2 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding2)}}

    return self.client.compare(fp1, fp2)

  def getWidth(self):
    return self.w

  def getHeight(self):
    return self.h

  def getDescription(self):
    # NOTE(review): self.description is never assigned in this class, so
    # this raises AttributeError unless a subclass or caller sets it.
    return self.description
class ClassificationModelContext(ClassificationModel): """ Class to run the survey response classification task with Cortical.io text context, then AND the context From the experiment runner, the methods expect to be fed one sample at a time. """ def __init__(self, verbosity=1, numLabels=1): """ Initialize the CorticalClient and CioEncoder. Requires a valid API key """ super(ClassificationModelContext, self).__init__(verbosity) self.encoder = CioEncoder(cacheDir="./experiments/cache") self.client = CorticalClient(self.encoder.apiKey) self.n = self.encoder.n self.w = int((self.encoder.targetSparsity / 100) * self.n) self.categoryBitmaps = {} self.numLabels = numLabels def encodePattern(self, pattern): """ Encode an SDR of the input string by querying the Cortical.io API. @param pattern (list) Tokenized sample, where each item is a string @return (dictionary) Dictionary, containing text, sparsity, and bitmap Example return dict: { "text": "Example text", "sparsity": 0.0, "bitmap": numpy.zeros(0) } """ text = " ".join(pattern) return { "text": text, "sparsity": 0.0, "bitmap": self._encodeText(text) } def _encodeText(self, text): fpInfo = self.encoder.encode(text) if self.verbosity > 1: print "Fingerprint sparsity = {0}%.".format(fpInfo["sparsity"]) if fpInfo: bitmap = numpy.array(fpInfo["fingerprint"]["positions"]) else: bitmap = self.encodeRandomly(text) return bitmap.astype(int) def resetModel(self): """Reset the model""" self.categoryBitmaps.clear() def trainModel(self, samples, labels): """ Train the classifier on the input sample and label. Use Cortical.io's keyword extraction to get the most relevant terms then get the intersection of those bitmaps @param samples (dictionary) Dictionary, containing text, sparsity, and bitmap @param labels (int) Reference index for the classification of this sample. 
""" for sample, sample_labels in zip(samples, labels): bitmaps = [sample["bitmap"].tolist()] context = self.client.getContextFromText(bitmaps, maxResults=5, getFingerprint=True) if len(context) != 0: union = numpy.zeros(0) for c in context: bitmap = c["fingerprint"]["positions"] union = numpy.union1d(bitmap, union).astype(int) for label in sample_labels: # Haven't seen the label before if label not in self.categoryBitmaps: self.categoryBitmaps[label] = union intersection = numpy.intersect1d( union, self.categoryBitmaps[label]) if intersection.size == 0: # Don't want to lose all the old information union = numpy.union1d( union, self.categoryBitmaps[label]).astype(int) # Need to sample to stay sparse count = len(union) sampleIndices = random.sample(xrange(count), min(count, self.w)) intersection = numpy.sort(union[sampleIndices]) self.categoryBitmaps[label] = intersection def testModel(self, sample): """ Test the intersection bitmap on the input sample. Returns a dictionary containing various distance metrics between the sample and the classes. @param sample (dictionary) Dictionary, containing text, sparsity, and bitmap @return (dictionary) The distances between the sample and the classes Example return dict: { 0: { "cosineSimilarity": 0.6666666666666666, "euclideanDistance": 0.3333333333333333, "jaccardDistance": 0.5, "overlappingAll": 6, "overlappingLeftRight": 0.6666666666666666, "overlappingRightLeft": 0.6666666666666666, "sizeLeft": 9, "sizeRight": 9, "weightedScoring": 0.4436476984102028 } } """ sampleBitmap = sample["bitmap"].tolist() distances = {} for cat, catBitmap in self.categoryBitmaps.iteritems(): distances[cat] = self.client.compare(sampleBitmap, catBitmap.tolist()) return self.winningLabels(distances, numberCats=self.numLabels, metric="overlappingAll") @staticmethod def winningLabels(distances, numberCats, metric): """ Return indices of winning categories, based off of the input metric. Overrides the base class implementation. 
""" metricValues = numpy.array([v[metric] for v in distances.values()]) sortedIdx = numpy.argsort(metricValues) # euclideanDistance and jaccardDistance are ascending descendingOrder = set([ "overlappingAll", "overlappingLeftRight", "overlappingRightLeft", "cosineSimilarity", "weightedScoring" ]) if metric in descendingOrder: sortedIdx = sortedIdx[::-1] return [distances.keys()[catIdx] for catIdx in sortedIdx[:numberCats]]
class CioEncoder(LanguageEncoder):
  """
  A language encoder using the Cortical.io API.

  The encoder queries the Cortical.io REST API via the cortipy module, which
  returns data in the form of "fingerprints". These representations are
  converted to binary SDR arrays with this Cio encoder.
  """

  def __init__(self, retina=DEFAULT_RETINA, retinaScaling=1.0, cacheDir=None,
               verbosity=0, fingerprintType=EncoderTypes.document,
               unionSparsity=0.20, apiKey=None, maxSparsity=0.50):
    """
    @param retina          (str)   Cortical.io retina, either "en_synonymous"
                                   or "en_associative".
    @param retinaScaling   (float) Scale each dimension of the SDR bitmap by
                                   this factor.
    @param cacheDir        (str)   Where to cache results of API queries.
    @param verbosity       (int)   Amount of info printed out, 0, 1, or 2.
    @param fingerprintType (Enum)  Specify word- or document-level encoding.
    @param unionSparsity   (float) Any union'ing done in this encoder will
                                   stop once this sparsity is reached.
    @param maxSparsity     (float) The maximum sparsity of the returned
                                   bitmap. If the percentage of bits in the
                                   encoding is > maxSparsity, it will be
                                   randomly subsampled.
    TODO: replace enum with a simple string
    """
    if apiKey is None and "CORTICAL_API_KEY" not in os.environ:
      print ("Missing CORTICAL_API_KEY environment variable. If you have a "
             "key, set it with $ export CORTICAL_API_KEY=api_key\n"
             "You can retrieve a key by registering for the REST API at "
             "http://www.cortical.io/resources_apikey.html")
      raise OSError("Missing API key.")

    super(CioEncoder, self).__init__(unionSparsity=unionSparsity)

    if cacheDir is None:
      # Default to a cache directory that lives next to this module.
      root = os.path.dirname(os.path.realpath(__file__))
      cacheDir = os.path.join(root, "CioCache")

    self.apiKey = apiKey if apiKey else os.environ["CORTICAL_API_KEY"]

    # Keep the retina name so the Cio client can be re-created later with the
    # same retina (e.g. when the cache directory is changed).
    self.retina = retina
    self.client = CorticalClient(self.apiKey, retina=retina,
                                 cacheDir=cacheDir)

    self._setDimensions(retina, retinaScaling)

    self.fingerprintType = fingerprintType
    self.description = ("Cio Encoder", 0)
    self.verbosity = verbosity
    self.maxSparsity = maxSparsity


  def _setDimensions(self, retina, scalingFactor):
    """
    Set the encoder's SDR dimensions from the retina size, scaled down by
    scalingFactor; scalingFactor must be in (0, 1].
    """
    if scalingFactor <= 0 or scalingFactor > 1:
      raise ValueError("Retina can only be scaled by values between 0 and 1.")

    retinaDim = RETINA_SIZES[retina]["width"]

    self.width = int(retinaDim * scalingFactor)
    self.height = int(retinaDim * scalingFactor)
    # Store the effective scaling, which accounts for the int() truncation
    # above.
    self.retinaScaling = float(self.width)/retinaDim
    self.n = self.width * self.height


  def encode(self, text):
    """
    Encodes the input text w/ a cortipy client. The client returns a
    dictionary of "fingerprint" info, including the SDR bitmap as a
    numpy.array.

    NOTE: returning this fingerprint dict differs from the base class spec.

    @param text    (str)      A non-tokenized sample of text.
    @return        (dict)     Result from the cortipy client. The bitmap
                              encoding is at
                              encoding["fingerprint"]["positions"].
    @raises TypeError if text is not a string.
    """
    if not isinstance(text, str) and not isinstance(text, unicode):
      raise TypeError("Expected a string input but got input of type {}."
                      .format(type(text)))
    try:
      if self.fingerprintType == EncoderTypes.document:
        encoding = self.client.getTextBitmap(text)
      elif self.fingerprintType == EncoderTypes.word:
        encoding = self.getUnionEncoding(text)
      else:
        encoding = self.client.getBitmap(text)
    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print ("\tThe client returned no encoding for the text \'{0}\', so "
               "we'll use the encoding of the token that is least frequent in "
               "the corpus.".format(text))
      encoding = self._subEncoding(text)

    return self.finishEncoding(encoding)


  def getUnionEncoding(self, text):
    """
    Encode each token of the input text, take the union, and then sparsify.

    @param text    (str)      A non-tokenized sample of text.
    @return        (dict)     The bitmap encoding is at
                              encoding["fingerprint"]["positions"].
    """
    tokens = TextPreprocess().tokenize(text)

    # Count the ON bits represented in the encoded tokens.
    counts = Counter()
    for t in tokens:
      bitmap = self._getWordBitmap(t)
      counts.update(bitmap)

    positions = self.sparseUnion(counts)

    # Populate encoding
    encoding = {
        "text": text,
        "sparsity": len(positions) / float(self.n),
        "df": 0.0,
        "height": self.height,
        "width": self.width,
        "score": 0.0,
        "fingerprint": {
            "positions": sorted(positions)
        },
        "pos_types": []
    }

    return encoding


  def getWindowEncoding(self, tokens, minSparsity=0.0):
    """
    The encodings simulate a "sliding window", where the encoding
    representation of a given token is a union of its bitmap with the
    immediately previous tokens' bitmaps, up to the maximum sparsity. The
    returned list only includes those windows with sparsities larger than the
    minimum.

    @param tokens       (list)  Tokenized string.
    @param minSparsity  (float) Only window encodings denser than this value
                                will be included.
    @return windowBitmaps (list) Dict for each token, with entries for the
                                 token string, sparsity float, and bitmap
                                 numpy array.
    """
    if self.fingerprintType != EncoderTypes.word:
      print ("Although the encoder type is not set for words, the window "
             "encodings use word-level fingerprints.")

    bitmaps = [numpy.array(self._getWordBitmap(t)) for t in tokens]

    windowBitmaps = []
    for tokenIndex, windowBitmap in enumerate(bitmaps):
      # Each index in the tokens list is the end of a possible window. For
      # the first token the inner loop below does not run, so the window is
      # the token by itself; initializing i here avoids an unbound-variable
      # error in that case.
      i = tokenIndex
      for i in reversed(xrange(tokenIndex)):
        # From the current token, increase the window by successively adding
        # the previous tokens.
        windowSparsity = len(windowBitmap) / float(self.n)
        nextSparsity = len(bitmaps[i]) / float(self.n)
        if windowSparsity + nextSparsity > self.unionSparsity:
          # stopping criterion reached -- window is full
          break
        else:
          # add bitmap to the current window bitmap
          windowBitmap = numpy.union1d(windowBitmap, bitmaps[i])

      sparsity = len(windowBitmap) / float(self.n)
      if sparsity > minSparsity:
        # only include windows of sufficient density
        # NOTE(review): when the inner loop breaks, bitmaps[i] was not
        # union'd in, yet tokens[i] is still included in the text below --
        # looks like an off-by-one; kept as-is to preserve behavior. Confirm
        # against callers before changing.
        windowBitmaps.append(
            {"text": tokens[i:tokenIndex+1],
             "sparsity": sparsity,
             "bitmap": numpy.array(windowBitmap)})

    return windowBitmaps


  def finishEncoding(self, encoding):
    """
    Scale the fingerprint of the encoding dict (if specified) and fill the
    width, height, and sparsity fields.

    @param encoding     (dict)    Dict as returned by the Cio client.
    @return encoding    (dict)    Same format as the input dict, with the
                                  dimensions and sparsity fields populated.
    """
    if self.retinaScaling != 1:
      # Scaling the area means scaling each position by the factor squared.
      encoding["fingerprint"]["positions"] = self.scaleEncoding(
          encoding["fingerprint"]["positions"], self.retinaScaling**2)
      encoding["width"] = self.width
      encoding["height"] = self.height

    encoding["fingerprint"]["positions"] = numpy.array(
        encoding["fingerprint"]["positions"])
    encoding["sparsity"] = len(encoding["fingerprint"]["positions"]) / float(
        (encoding["width"] * encoding["height"]))

    # Reduce sparsity if needed
    if encoding["sparsity"] > self.maxSparsity:
      self.reduceSparsity(encoding, self.maxSparsity)

    return encoding


  def _getWordBitmap(self, term):
    """
    Return a bitmap for the word. If the Cortical.io API can't encode,
    cortipy will use a random encoding for the word.
    """
    return self.client.getBitmap(term)["fingerprint"]["positions"]


  def encodeIntoArray(self, inputText, output):
    """
    Encodes inputText and puts the encoded value into the numpy output array,
    which is a 1-D array of length returned by getWidth().
    """
    # NOTE(review): encode() may return None when _subEncoding() fails; that
    # would raise here. Preserved as-is -- confirm desired behavior.
    encoding = self.encode(inputText)
    output[:] = 0
    if encoding["fingerprint"]["positions"].size > 0:
      output[encoding["fingerprint"]["positions"]] = 1


  def decode(self, encoding, numTerms=10):
    """
    Converts an SDR back into the most likely word or words.

    By default, the most likely term will be returned. If numTerms is
    specified, then the Cortical.io API will attempt to return that many;
    otherwise the standard is 10. The return value will be a sequence of
    (term, weight) tuples, where higher weights imply the corresponding term
    better matches the encoding.

    @param  encoding        (list)            Bitmap encoding.
    @param  numTerms        (int)             The max number of terms to
                                              return.
    @return                 (list)            List of dictionaries, where keys
                                              are terms and likelihood scores.
    """
    terms = self.client.bitmapToTerms(encoding, numTerms=numTerms)
    # Convert cortipy response to list of tuples (term, weight)
    return [(term["term"], term["score"]) for term in terms]


  def _subEncoding(self, text, method="keyword"):
    """
    Fallback encoding when the client can't encode the full text.

    @param text             (str)             A non-tokenized sample of text.
    @return encoding        (dict)            Fingerprint from cortipy client.
                                              None if the text could not be
                                              encoded.
    """
    try:
      if method == "df":
        # Use the encoding of the token with the lowest document frequency.
        tokens = list(itertools.chain.from_iterable(
            [t.split(",") for t in self.client.tokenize(text)]))
        encoding = min(
            [self.client.getBitmap(t) for t in tokens],
            key=lambda x: x["df"])
      elif method == "keyword":
        encoding = self.getUnionEncoding(text)
      else:
        raise ValueError("method must be either 'df' or 'keyword'")
    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print ("\tThe client returned no substitute encoding for the text "
               "'{}', so we encode with None.".format(text))
      encoding = None

    return encoding


  def compare(self, bitmap1, bitmap2):
    """
    Compare encodings, returning the distances between the SDRs. Input bitmaps
    must be list objects (need to be serializable).

    Example return dict:
      {
        "cosineSimilarity": 0.6666666666666666,
        "euclideanDistance": 0.3333333333333333,
        "jaccardDistance": 0.5,
        "overlappingAll": 6,
        "overlappingLeftRight": 0.6666666666666666,
        "overlappingRightLeft": 0.6666666666666666,
        "sizeLeft": 9,
        "sizeRight": 9,
        "weightedScoring": 0.4436476984102028
      }
    """
    # Check each bitmap explicitly; `isinstance(a and b, list)` would only
    # type-check one of the two operands.
    if not (isinstance(bitmap1, list) and isinstance(bitmap2, list)):
      raise TypeError("Comparison bitmaps must be lists.")

    return self.client.compare(bitmap1, bitmap2)


  def createCategory(self, label, positives, negatives=None):
    """
    Create a classification category (bitmap) via the Cio classify endpoint.

    @param label      (str)     Name of category.
    @param positives  (list)    Bitmap(s) of samples to define.
    @param negatives  (list)    Not required to make category.
    @return           (dict)    Key-values for "positions" (list bitmap
                                encoding of the category) and "categoryName"
                                (str).
    """
    if negatives is None:
      negatives = []
    # Check each bitmap list explicitly (see compare()).
    if not (isinstance(positives, list) and isinstance(negatives, list)):
      raise TypeError("Input bitmaps must be lists.")

    return self.client.createClassification(label, positives, negatives)


  def getWidth(self):
    """Total number of bits in an encoding."""
    return self.n


  def getDimensions(self):
    """(width, height) of the SDR topology."""
    return (self.width, self.height)


  def getDescription(self):
    return self.description


  def densifyPattern(self, bitmap):
    """Return a numpy array of 0s and 1s to represent the given bitmap."""
    sparsePattern = numpy.zeros(self.n)
    for i in bitmap:
      sparsePattern[i] = 1.0
    return sparsePattern


  def reduceSparsity(self, encoding, maxSparsity):
    """Reduce the sparsity of the encoding down to maxSparsity (in place)."""
    # Must be an int: a float here would be an invalid slice bound below.
    desiredBits = int(maxSparsity * encoding["width"] * encoding["height"])
    bitmap = encoding["fingerprint"]["positions"]

    # Choose a random subsampling of the bits but seed the random number
    # generator so we get consistent bitmaps
    numpy.random.seed(bitmap.sum())
    encoding["fingerprint"]["positions"] = (
        numpy.random.permutation(bitmap)[0:desiredBits])

    encoding["sparsity"] = len(encoding["fingerprint"]["positions"]) / float(
        (encoding["width"] * encoding["height"]))
class CioEncoder(LanguageEncoder):
  """
  A language encoder using the Cortical.io API.

  The encoder queries the Cortical.io REST API via the cortipy module, which
  returns data in the form of "fingerprints". These representations are
  converted to binary SDR arrays with this Cio encoder.
  """

  def __init__(self, w=128, h=128, cacheDir="./cache", verbosity=0):
    """
    @param w         (int) Width dimension of the SDR topology.
    @param h         (int) Height dimension of the SDR topology.
    @param cacheDir  (str) Where to cache results of API queries.
    @param verbosity (int) Amount of info printed out, 0, 1, or 2.
    @raises OSError if no API key is found in the environment.
    """
    if "CORTICAL_API_KEY" not in os.environ:
      print ("Missing CORTICAL_API_KEY environment variable. If you have a "
             "key, set it with $ export CORTICAL_API_KEY=api_key\n"
             "You can retrieve a key by registering for the REST API at "
             "http://www.cortical.io/resources_apikey.html")
      raise OSError("Missing API key.")

    self.apiKey = os.environ["CORTICAL_API_KEY"]
    self.client = CorticalClient(self.apiKey, cacheDir=cacheDir)
    # Target percentage of ON bits in sub-encodings.
    self.targetSparsity = 5.0
    self.w = w
    self.h = h
    self.n = w * h
    self.verbosity = verbosity
    # getDescription() returns this; without it the method would raise
    # AttributeError. Mirrors the other encoder variants in this codebase.
    self.description = ("Cio Encoder", 0)


  def encode(self, text):
    """
    Encodes the input text w/ a cortipy client. The client returns a
    dictionary of "fingerprint" info, including the SDR bitmap.

    @param text    (str)    A non-tokenized sample of text.
    @return        (dict)   Result from the cortipy client. The bitmap
                            encoding is at
                            encoding["fingerprint"]["positions"]. None for
                            empty input.
    """
    if not text:
      return None
    try:
      encoding = self.client.getTextBitmap(text)
    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print ("\tThe client returned no encoding for the text \'{0}\', so "
               "we'll use the encoding of the token that is least frequent in "
               "the corpus.".format(text))
      encoding = self._subEncoding(text)
    return encoding


  def decode(self, encoding, numTerms=10):
    """
    Converts an SDR back into the most likely word or words.

    By default, the most likely term will be returned. If numTerms is
    specified, then the Cortical.io API will attempt to return that many;
    otherwise the standard is 10. The return value will be a sequence of
    (term, weight) tuples, where higher weights imply the corresponding term
    better matches the encoding.

    @param  encoding   (list)   Bitmap encoding.
    @param  numTerms   (int)    The max number of terms to return.
    @return            (list)   List of dictionaries, where keys are terms
                                and likelihood scores.
    """
    # Must go through the instance's client; a bare `client` here is a
    # NameError.
    terms = self.client.bitmapToTerms(encoding, numTerms=numTerms)
    # Convert cortipy response to list of tuples (term, weight)
    return [(term["term"], term["score"]) for term in terms]


  def _subEncoding(self, text, method="df"):
    """
    Fallback encoding when the client can't encode the full text.

    @param text      (str)    A non-tokenized sample of text.
    @return encoding (dict)   Fingerprint from cortipy client. None if the
                              text could not be encoded.
    """
    try:
      # Tokenize inside the try so an encoding failure from the client's
      # tokenize() is handled the same way as the other API failures.
      tokens = list(itertools.chain.from_iterable(
          [t.split(",") for t in self.client.tokenize(text)]))
      if method == "df":
        # Use the encoding of the token with the lowest document frequency.
        encoding = min([self.client.getBitmap(t) for t in tokens],
                       key=lambda x: x["df"])
      elif method == "keyword":
        # Take a union of the bitmaps
        counts = Counter()
        for t in tokens:
          bitmap = self.client.getBitmap(t)["fingerprint"]["positions"]
          counts.update(bitmap)
        # Sample to remain sparse
        max_sparsity = int((self.targetSparsity / 100) * self.n)
        w = min(len(counts), max_sparsity)
        positions = [c[0] for c in counts.most_common(w)]
        # Populate encoding
        encoding = {
            "text": text,
            "sparsity": w * 100 / float(self.n),
            "df": 0.0,
            "height": self.h,
            "width": self.w,
            "score": 0.0,
            "fingerprint": {
                "positions": sorted(positions)
            },
            "pos_types": []
        }
      else:
        raise ValueError("method must be either \'df\' or \'keyword\'")
    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print ("\tThe client returned no substitute encoding for the text "
               "\'{0}\', so we encode with None.".format(text))
      encoding = None
    return encoding


  def compare(self, encoding1, encoding2):
    """
    Compare encodings, returning the distances between the SDRs.

    Example return dict:
      {
        "cosineSimilarity": 0.6666666666666666,
        "euclideanDistance": 0.3333333333333333,
        "jaccardDistance": 0.5,
        "overlappingAll": 6,
        "overlappingLeftRight": 0.6666666666666666,
        "overlappingRightLeft": 0.6666666666666666,
        "sizeLeft": 9,
        "sizeRight": 9,
        "weightedScoring": 0.4436476984102028
      }
    """
    # Format input SDRs as Cio fingerprints
    fp1 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding1)}}
    fp2 = {"fingerprint": {"positions": self.bitmapFromSDR(encoding2)}}
    return self.client.compare(fp1, fp2)


  def getWidth(self):
    """Width of the SDR topology."""
    return self.w


  def getHeight(self):
    """Height of the SDR topology."""
    return self.h


  def getDescription(self):
    return self.description
def testExceptionIfAPIKeyNotPresent(self, mockOS):
  """CorticalClient() should raise when no API key is available."""
  with self.assertRaises(Exception) as e:
    CorticalClient()
  # e.exception is the exception *object*; assertIn against it raises
  # TypeError. Compare against its string message instead.
  self.assertIn("Missing API key.", str(e.exception))
class ClassificationModelContext(ClassificationModel): """ Class to run the survey response classification task with Cortical.io text context, then AND the context From the experiment runner, the methods expect to be fed one sample at a time. """ def __init__(self, verbosity=1, numLabels=1): """ Initialize the CorticalClient and CioEncoder. Requires a valid API key """ super(ClassificationModelContext, self).__init__(verbosity) self.encoder = CioEncoder(cacheDir="./experiments/cache") self.client = CorticalClient(self.encoder.apiKey) self.n = self.encoder.n self.w = int((self.encoder.targetSparsity / 100) * self.n) self.categoryBitmaps = {} self.numLabels = numLabels def encodePattern(self, pattern): """ Encode an SDR of the input string by querying the Cortical.io API. @param pattern (list) Tokenized sample, where each item is a string @return (dictionary) Dictionary, containing text, sparsity, and bitmap Example return dict: { "text": "Example text", "sparsity": 0.0, "bitmap": numpy.zeros(0) } """ text = " ".join(pattern) return {"text": text, "sparsity": 0.0, "bitmap": self._encodeText(text)} def _encodeText(self, text): fpInfo = self.encoder.encode(text) if self.verbosity > 1: print "Fingerprint sparsity = {0}%.".format(fpInfo["sparsity"]) if fpInfo: bitmap = numpy.array(fpInfo["fingerprint"]["positions"]) else: bitmap = self.encodeRandomly(text) return bitmap.astype(int) def resetModel(self): """Reset the model""" self.categoryBitmaps.clear() def trainModel(self, samples, labels): """ Train the classifier on the input sample and label. Use Cortical.io's keyword extraction to get the most relevant terms then get the intersection of those bitmaps @param samples (dictionary) Dictionary, containing text, sparsity, and bitmap @param labels (int) Reference index for the classification of this sample. 
""" for sample, sample_labels in zip(samples, labels): bitmaps = [sample["bitmap"].tolist()] context = self.client.getContextFromText(bitmaps, maxResults=5, getFingerprint=True) if len(context) != 0: union = numpy.zeros(0) for c in context: bitmap = c["fingerprint"]["positions"] union = numpy.union1d(bitmap, union).astype(int) for label in sample_labels: # Haven't seen the label before if label not in self.categoryBitmaps: self.categoryBitmaps[label] = union intersection = numpy.intersect1d(union, self.categoryBitmaps[label]) if intersection.size == 0: # Don't want to lose all the old information union = numpy.union1d(union, self.categoryBitmaps[label]).astype(int) # Need to sample to stay sparse count = len(union) sampleIndices = random.sample(xrange(count), min(count, self.w)) intersection = numpy.sort(union[sampleIndices]) self.categoryBitmaps[label] = intersection def testModel(self, sample): """ Test the intersection bitmap on the input sample. Returns a dictionary containing various distance metrics between the sample and the classes. @param sample (dictionary) Dictionary, containing text, sparsity, and bitmap @return (dictionary) The distances between the sample and the classes Example return dict: { 0: { "cosineSimilarity": 0.6666666666666666, "euclideanDistance": 0.3333333333333333, "jaccardDistance": 0.5, "overlappingAll": 6, "overlappingLeftRight": 0.6666666666666666, "overlappingRightLeft": 0.6666666666666666, "sizeLeft": 9, "sizeRight": 9, "weightedScoring": 0.4436476984102028 } } """ sampleBitmap = sample["bitmap"].tolist() distances = {} for cat, catBitmap in self.categoryBitmaps.iteritems(): distances[cat] = self.client.compare(sampleBitmap, catBitmap.tolist()) return self.winningLabels(distances, numberCats=self.numLabels, metric="overlappingAll") @staticmethod def winningLabels(distances, numberCats, metric): """ Return indices of winning categories, based off of the input metric. Overrides the base class implementation. 
""" metricValues = numpy.array([v[metric] for v in distances.values()]) sortedIdx = numpy.argsort(metricValues) # euclideanDistance and jaccardDistance are ascending descendingOrder = set(["overlappingAll", "overlappingLeftRight", "overlappingRightLeft", "cosineSimilarity", "weightedScoring"]) if metric in descendingOrder: sortedIdx = sortedIdx[::-1] return [distances.keys()[catIdx] for catIdx in sortedIdx[:numberCats]]
class CioEncoder(LanguageEncoder):
  """
  A language encoder using the Cortical.io API.

  The encoder queries the Cortical.io REST API via the cortipy module, which
  returns data in the form of "fingerprints". These representations are
  converted to binary SDR arrays with this Cio encoder.
  """

  def __init__(self, retina=DEFAULT_RETINA, retinaScaling=1.0, cacheDir=None,
               verbosity=0, fingerprintType=EncoderTypes.document,
               unionSparsity=0.20, apiKey=None, maxSparsity=0.50):
    """
    @param retina          (str)   Cortical.io retina, either "en_synonymous"
                                   or "en_associative".
    @param retinaScaling   (float) Scale each dimension of the SDR bitmap by
                                   this factor.
    @param cacheDir        (str)   Where to cache results of API queries.
    @param verbosity       (int)   Amount of info printed out, 0, 1, or 2.
    @param fingerprintType (Enum)  Specify word- or document-level encoding.
    @param unionSparsity   (float) Any union'ing done in this encoder will
                                   stop once this sparsity is reached.
    @param maxSparsity     (float) The maximum sparsity of the returned
                                   bitmap. If the percentage of bits in the
                                   encoding is > maxSparsity, it will be
                                   randomly subsampled.
    TODO: replace enum with a simple string
    """
    if apiKey is None and "CORTICAL_API_KEY" not in os.environ:
      print ("Missing CORTICAL_API_KEY environment variable. If you have a "
             "key, set it with $ export CORTICAL_API_KEY=api_key\n"
             "You can retrieve a key by registering for the REST API at "
             "http://www.cortical.io/resources_apikey.html")
      raise OSError("Missing API key.")

    super(CioEncoder, self).__init__(unionSparsity=unionSparsity)

    if cacheDir is None:
      # Default to a cache directory that lives next to this module.
      root = os.path.dirname(os.path.realpath(__file__))
      cacheDir = os.path.join(root, "CioCache")

    self.apiKey = apiKey if apiKey else os.environ["CORTICAL_API_KEY"]

    # Keep the retina name so the Cio client can be re-created later with the
    # same retina (e.g. when the cache directory is changed).
    self.retina = retina
    self.client = CorticalClient(self.apiKey, retina=retina,
                                 cacheDir=cacheDir)

    self._setDimensions(retina, retinaScaling)

    self.fingerprintType = fingerprintType
    self.description = ("Cio Encoder", 0)
    self.verbosity = verbosity
    self.maxSparsity = maxSparsity


  def _setDimensions(self, retina, scalingFactor):
    """
    Set the encoder's SDR dimensions from the retina size, scaled down by
    scalingFactor; scalingFactor must be in (0, 1].
    """
    if scalingFactor <= 0 or scalingFactor > 1:
      raise ValueError("Retina can only be scaled by values between 0 and 1.")

    retinaDim = RETINA_SIZES[retina]["width"]

    self.width = int(retinaDim * scalingFactor)
    self.height = int(retinaDim * scalingFactor)
    # Store the effective scaling, which accounts for the int() truncation
    # above.
    self.retinaScaling = float(self.width)/retinaDim
    self.n = self.width * self.height


  def encode(self, text):
    """
    Encodes the input text w/ a cortipy client. The client returns a
    dictionary of "fingerprint" info, including the SDR bitmap as a
    numpy.array.

    NOTE: returning this fingerprint dict differs from the base class spec.

    @param text    (str)      A non-tokenized sample of text.
    @return        (dict)     Result from the cortipy client. The bitmap
                              encoding is at
                              encoding["fingerprint"]["positions"].
    @raises TypeError if text is not a string.
    """
    if not isinstance(text, str) and not isinstance(text, unicode):
      raise TypeError("Expected a string input but got input of type {}."
                      .format(type(text)))
    try:
      if self.fingerprintType == EncoderTypes.document:
        encoding = self.client.getTextBitmap(text)
      elif self.fingerprintType == EncoderTypes.word:
        encoding = self.getUnionEncoding(text)
      else:
        encoding = self.client.getBitmap(text)
    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print ("\tThe client returned no encoding for the text \'{0}\', so "
               "we'll use the encoding of the token that is least frequent in "
               "the corpus.".format(text))
      encoding = self._subEncoding(text)

    return self.finishEncoding(encoding)


  def getUnionEncoding(self, text):
    """
    Encode each token of the input text, take the union, and then sparsify.

    @param text    (str)      A non-tokenized sample of text.
    @return        (dict)     The bitmap encoding is at
                              encoding["fingerprint"]["positions"].
    """
    tokens = TextPreprocess().tokenize(text)

    # Count the ON bits represented in the encoded tokens.
    counts = Counter()
    for t in tokens:
      bitmap = self._getWordBitmap(t)
      counts.update(bitmap)

    positions = self.sparseUnion(counts)

    # Populate encoding
    encoding = {
        "text": text,
        "sparsity": len(positions) / float(self.n),
        "df": 0.0,
        "height": self.height,
        "width": self.width,
        "score": 0.0,
        "fingerprint": {
            "positions": sorted(positions)
        },
        "pos_types": []
    }

    return encoding


  def getWindowEncoding(self, tokens, minSparsity=0.0):
    """
    The encodings simulate a "sliding window", where the encoding
    representation of a given token is a union of its bitmap with the
    immediately previous tokens' bitmaps, up to the maximum sparsity. The
    returned list only includes those windows with sparsities larger than the
    minimum.

    @param tokens       (list)  Tokenized string.
    @param minSparsity  (float) Only window encodings denser than this value
                                will be included.
    @return windowBitmaps (list) Dict for each token, with entries for the
                                 token string, sparsity float, and bitmap
                                 numpy array.
    """
    if self.fingerprintType != EncoderTypes.word:
      print ("Although the encoder type is not set for words, the window "
             "encodings use word-level fingerprints.")

    bitmaps = [numpy.array(self._getWordBitmap(t)) for t in tokens]

    windowBitmaps = []
    for tokenIndex, windowBitmap in enumerate(bitmaps):
      # Each index in the tokens list is the end of a possible window. For
      # the first token the inner loop below does not run, so the window is
      # the token by itself; initializing i here avoids an unbound-variable
      # error in that case.
      i = tokenIndex
      for i in reversed(xrange(tokenIndex)):
        # From the current token, increase the window by successively adding
        # the previous tokens.
        windowSparsity = len(windowBitmap) / float(self.n)
        nextSparsity = len(bitmaps[i]) / float(self.n)
        if windowSparsity + nextSparsity > self.unionSparsity:
          # stopping criterion reached -- window is full
          break
        else:
          # add bitmap to the current window bitmap
          windowBitmap = numpy.union1d(windowBitmap, bitmaps[i])

      sparsity = len(windowBitmap) / float(self.n)
      if sparsity > minSparsity:
        # only include windows of sufficient density
        # NOTE(review): when the inner loop breaks, bitmaps[i] was not
        # union'd in, yet tokens[i] is still included in the text below --
        # looks like an off-by-one; kept as-is to preserve behavior. Confirm
        # against callers before changing.
        windowBitmaps.append(
            {"text": tokens[i:tokenIndex+1],
             "sparsity": sparsity,
             "bitmap": numpy.array(windowBitmap)})

    return windowBitmaps


  def finishEncoding(self, encoding):
    """
    Scale the fingerprint of the encoding dict (if specified) and fill the
    width, height, and sparsity fields.

    @param encoding     (dict)    Dict as returned by the Cio client.
    @return encoding    (dict)    Same format as the input dict, with the
                                  dimensions and sparsity fields populated.
    """
    if self.retinaScaling != 1:
      # Scaling the area means scaling each position by the factor squared.
      encoding["fingerprint"]["positions"] = self.scaleEncoding(
          encoding["fingerprint"]["positions"], self.retinaScaling**2)
      encoding["width"] = self.width
      encoding["height"] = self.height

    encoding["fingerprint"]["positions"] = numpy.array(
        encoding["fingerprint"]["positions"])
    encoding["sparsity"] = len(encoding["fingerprint"]["positions"]) / float(
        (encoding["width"] * encoding["height"]))

    # Reduce sparsity if needed
    if encoding["sparsity"] > self.maxSparsity:
      self.reduceSparsity(encoding, self.maxSparsity)

    return encoding


  def _getWordBitmap(self, term):
    """
    Return a bitmap for the word. If the Cortical.io API can't encode,
    cortipy will use a random encoding for the word.
    """
    return self.client.getBitmap(term)["fingerprint"]["positions"]


  def encodeIntoArray(self, inputText, output):
    """
    Encodes inputText and puts the encoded value into the numpy output array,
    which is a 1-D array of length returned by getWidth().
    """
    # NOTE(review): encode() may return None when _subEncoding() fails; that
    # would raise here. Preserved as-is -- confirm desired behavior.
    encoding = self.encode(inputText)
    output[:] = 0
    if encoding["fingerprint"]["positions"].size > 0:
      output[encoding["fingerprint"]["positions"]] = 1


  def decode(self, encoding, numTerms=10):
    """
    Converts an SDR back into the most likely word or words.

    By default, the most likely term will be returned. If numTerms is
    specified, then the Cortical.io API will attempt to return that many;
    otherwise the standard is 10. The return value will be a sequence of
    (term, weight) tuples, where higher weights imply the corresponding term
    better matches the encoding.

    @param  encoding        (list)            Bitmap encoding.
    @param  numTerms        (int)             The max number of terms to
                                              return.
    @return                 (list)            List of dictionaries, where keys
                                              are terms and likelihood scores.
    """
    terms = self.client.bitmapToTerms(encoding, numTerms=numTerms)
    # Convert cortipy response to list of tuples (term, weight)
    return [(term["term"], term["score"]) for term in terms]


  def _subEncoding(self, text, method="keyword"):
    """
    Fallback encoding when the client can't encode the full text.

    @param text             (str)             A non-tokenized sample of text.
    @return encoding        (dict)            Fingerprint from cortipy client.
                                              None if the text could not be
                                              encoded.
    """
    try:
      if method == "df":
        # Use the encoding of the token with the lowest document frequency.
        tokens = list(itertools.chain.from_iterable(
            [t.split(",") for t in self.client.tokenize(text)]))
        encoding = min(
            [self.client.getBitmap(t) for t in tokens],
            key=lambda x: x["df"])
      elif method == "keyword":
        encoding = self.getUnionEncoding(text)
      else:
        raise ValueError("method must be either 'df' or 'keyword'")
    except UnsuccessfulEncodingError:
      if self.verbosity > 0:
        print ("\tThe client returned no substitute encoding for the text "
               "'{}', so we encode with None.".format(text))
      encoding = None

    return encoding


  def compare(self, bitmap1, bitmap2):
    """
    Compare encodings, returning the distances between the SDRs. Input bitmaps
    must be list objects (need to be serializable).

    Example return dict:
      {
        "cosineSimilarity": 0.6666666666666666,
        "euclideanDistance": 0.3333333333333333,
        "jaccardDistance": 0.5,
        "overlappingAll": 6,
        "overlappingLeftRight": 0.6666666666666666,
        "overlappingRightLeft": 0.6666666666666666,
        "sizeLeft": 9,
        "sizeRight": 9,
        "weightedScoring": 0.4436476984102028
      }
    """
    # Check each bitmap explicitly; `isinstance(a and b, list)` would only
    # type-check one of the two operands.
    if not (isinstance(bitmap1, list) and isinstance(bitmap2, list)):
      raise TypeError("Comparison bitmaps must be lists.")

    return self.client.compare(bitmap1, bitmap2)


  def createCategory(self, label, positives, negatives=None):
    """
    Create a classification category (bitmap) via the Cio classify endpoint.

    @param label      (str)     Name of category.
    @param positives  (list)    Bitmap(s) of samples to define.
    @param negatives  (list)    Not required to make category.
    @return           (dict)    Key-values for "positions" (list bitmap
                                encoding of the category) and "categoryName"
                                (str).
    """
    if negatives is None:
      negatives = []
    # Check each bitmap list explicitly (see compare()).
    if not (isinstance(positives, list) and isinstance(negatives, list)):
      raise TypeError("Input bitmaps must be lists.")

    return self.client.createClassification(label, positives, negatives)


  def getWidth(self):
    """Total number of bits in an encoding."""
    return self.n


  def getDimensions(self):
    """(width, height) of the SDR topology."""
    return (self.width, self.height)


  def getDescription(self):
    return self.description


  def densifyPattern(self, bitmap):
    """Return a numpy array of 0s and 1s to represent the given bitmap."""
    sparsePattern = numpy.zeros(self.n)
    for i in bitmap:
      sparsePattern[i] = 1.0
    return sparsePattern


  def reduceSparsity(self, encoding, maxSparsity):
    """Reduce the sparsity of the encoding down to maxSparsity (in place)."""
    # Must be an int: a float here would be an invalid slice bound below.
    desiredBits = int(maxSparsity * encoding["width"] * encoding["height"])
    bitmap = encoding["fingerprint"]["positions"]

    # Choose a random subsampling of the bits but seed the random number
    # generator so we get consistent bitmaps
    numpy.random.seed(bitmap.sum())
    encoding["fingerprint"]["positions"] = (
        numpy.random.permutation(bitmap)[0:desiredBits])

    encoding["sparsity"] = len(encoding["fingerprint"]["positions"]) / float(
        (encoding["width"] * encoding["height"]))
def testAPIKeyPresent(self):
  """CorticalClient should initialize cleanly when the API key env var is set."""
  # The key is read from CORTICAL_API_KEY everywhere else in this codebase
  # (see the missing-key test and the encoder constructors); patching
  # "CIO_API_KEY" would leave the key lookup unexercised.
  with patch.dict("os.environ", {"CORTICAL_API_KEY": "apikey123"}):
    CorticalClient()