Example No. 1
    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelFingerprint",
                 fingerprintType=EncoderTypes.word,
                 unionSparsity=20.0):

        super(ClassificationModelFingerprint,
              self).__init__(verbosity=verbosity,
                             numLabels=numLabels,
                             modelDir=modelDir)

        # Init kNN classifier and Cortical.io encoder; need valid API key (see
        # CioEncoder init for details).
        self.classifier = KNNClassifier(k=numLabels,
                                        distanceMethod='rawOverlap',
                                        exact=False,
                                        verbosity=verbosity - 1)

        if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
            raise ValueError("Invalid type of fingerprint encoding; see the "
                             "EncoderTypes class for eligible types.")
        self.encoder = CioEncoder(cacheDir="./fluent/experiments/cioCache",
                                  fingerprintType=fingerprintType,
                                  unionSparsity=unionSparsity)
        self.n = self.encoder.n
        self.w = int((self.encoder.targetSparsity / 100) * self.n)
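The SDR width w above is derived from the encoder's target sparsity. A minimal worked sketch of that arithmetic, using illustrative values (the real n and targetSparsity come from the CioEncoder instance):

n = 16384             # total fingerprint bits -- illustrative, not the CioEncoder default
targetSparsity = 1.0  # percent of bits active -- illustrative
w = int((targetSparsity / 100) * n)
print w               # 163 active bits, i.e. ~1% of n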
Example No. 2
  def testWordFingerprint(self):
    """Test the Cortical.io term (word-level) encoding."""

    cio = CioEncoder(fingerprintType=EncoderTypes.word)
    response = cio.encode(self.text)
    
    self.assertFingerprintFields(response)
    
    encodingDict = getTestData("cio_encoding_word.json")
    self.assertEqual(encodingDict["fingerprint"]["positions"],
        response["fingerprint"]["positions"], "Cio bitmap is not as expected.")
Example No. 3
  def testWordFingerprint(self):
    """Test the Cortical.io term (word-lelevl) encoding."""

    cio = CioEncoder(fingerprintType=EncoderTypes.word)
    response = cio.encode(self.text)
    
    self.assertFingerprintFields(response)
    
    encodingDict = getTestData("cio_encoding_word.json")
    self.assertEqual(encodingDict["fingerprint"]["positions"],
        response["fingerprint"]["positions"], "Cio bitmap is not as expected.")
Example No. 4
    def __init__(self, verbosity=1, numLabels=1):
        """
    Initialize the CorticalClient and CioEncoder. Requires a valid API key
    """
        super(ClassificationModelContext, self).__init__(verbosity)

        self.encoder = CioEncoder(cacheDir="./experiments/cache")
        self.client = CorticalClient(self.encoder.apiKey)

        self.n = self.encoder.n
        self.w = int((self.encoder.targetSparsity / 100) * self.n)

        self.categoryBitmaps = {}
        self.numLabels = numLabels
Example No. 5
    def __init__(self, verbosity=1, numLabels=3):
        """
    Initialize the encoder as CioEncoder; requires a valid API key.
    """
        super(ClassificationModelEndpoint, self).__init__(verbosity, numLabels)

        self.encoder = CioEncoder(cacheDir="./experiments/cache")
        self.compareEncoder = LanguageEncoder()

        self.n = self.encoder.n
        self.w = int((self.encoder.targetSparsity / 100) * self.n)

        self.categoryBitmaps = {}
        self.negatives = defaultdict(list)
        self.positives = defaultdict(list)
Example No. 6
  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelFingerprint",
               fingerprintType=EncoderTypes.word,
               unionSparsity=20.0):

    super(ClassificationModelFingerprint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    # Init kNN classifier and Cortical.io encoder; need valid API key (see
    # CioEncoder init for details).
    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod='rawOverlap',
                                    exact=False,
                                    verbosity=verbosity-1)

    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
      raise ValueError("Invalid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligible types.")
    self.encoder = CioEncoder(cacheDir="./fluent/experiments/cioCache",
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity)
    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity/100)*self.n)
Example No. 7
  def __init__(self, inputFilePath, verbosity=1, numLabels=3, spTrainingSize=0,
               tmTrainingSize=0, clsTrainingSize=0, classifierType="KNN"):
    """
    @param inputFilePath      (str)       Path to data formatted for network
                                          API
    @param spTrainingSize     (int)       Number of samples the network has to
                                          be trained on before training the
                                          spatial pooler
    @param tmTrainingSize     (int)       Number of samples the network has to
                                          be trained on before training the
                                          temporal memory
    @param clsTrainingSize    (int)       Number of samples the network has to
                                          be trained on before training the
                                          classifier
    @param classifierType     (str)       Either "KNN" or "CLA"
    See ClassificationModel for remaining parameters
    """
    self.spTrainingSize = spTrainingSize
    self.tmTrainingSize = tmTrainingSize
    self.clsTrainingSize = clsTrainingSize

    super(ClassificationModelHTM, self).__init__(verbosity=verbosity,
      numLabels=numLabels)

    # Initialize Network
    self.classifierType = classifierType
    self.recordStream = FileRecordStream(streamID=inputFilePath)
    self.encoder = CioEncoder(cacheDir="./experiments/cache")
    self._initModel()
Example No. 8
    def initModel(self):
        """
    Initialize the network; self.networdDataPath must already be set.
    """
        recordStream = FileRecordStream(streamID=self.networkDataPath)
        encoder = CioEncoder(cacheDir="./experiments/cache")

        return configureNetwork(recordStream, self.networkConfig, encoder)
Example No. 9
  def __init__(self, verbosity=1):
    super(ClassificationModelFingerprint, self).__init__(verbosity)

    # Init kNN classifier and Cortical.io encoder; need valid API key (see
    # CioEncoder init for details).
    self.classifier = KNNClassifier(k=1, exact=False, verbosity=verbosity-1)
    self.encoder = CioEncoder(cacheDir="./experiments/cache")
    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity/100)*self.n)
Example No. 10
  def __init__(self, verbosity=1, numLabels=1):
    """
    Initialize the CorticalClient and CioEncoder. Requires a valid API key
    """
    super(ClassificationModelContext, self).__init__(verbosity)

    self.encoder = CioEncoder(cacheDir="./experiments/cache")
    self.client = CorticalClient(self.encoder.apiKey)

    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity / 100) * self.n)

    self.categoryBitmaps = {}
    self.numLabels = numLabels
Example No. 11
  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelEndpoint"):
    """
    Initializes the encoder as CioEncoder; requires a valid API key.
    """
    super(ClassificationModelEndpoint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.encoder = CioEncoder(cacheDir="./experiments/cache")
    self.compareEncoder = LanguageEncoder()

    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity/100) * self.n)

    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)
Example No. 12
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelFingerprint",
               fingerprintType=EncoderTypes.word,
               unionSparsity=20.0):

    super(ClassificationModelFingerprint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    # Init kNN classifier and Cortical.io encoder; need valid API key (see
    # CioEncoder init for details).
    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod='rawOverlap',
                                    exact=False,
                                    verbosity=verbosity-1)

    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
      raise ValueError("Invalid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligible types.")
    self.encoder = CioEncoder(cacheDir="./fluent/experiments/cioCache",
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity)
    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity/100)*self.n)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API. If the
    client returns None, we create a random SDR with the model's dimensions n
    and w.

    @param sample     (list)        Tokenized sample, where each item is a str.
    @return fp        (dict)        The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)
    if fpInfo:
      fp = {"text":fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
            "sparsity":fpInfo["sparsity"],
            "bitmap":numpy.array(fpInfo["fingerprint"]["positions"])}
    else:
      fp = {"text":sample,
            "sparsity":float(self.w)/self.n,
            "bitmap":self.encodeRandomly(sample)}

    return fp


  def trainModel(self, i):
    # TODO: add batch training, where i is a list
    """
    Train the classifier on the sample and labels for record i. The list
    sampleReference is populated to correlate classifier prototypes to sample
    IDs.
    """
    bitmap = self.patterns[i]["pattern"]["bitmap"]
    if bitmap.any():
      for label in self.patterns[i]["labels"]:
        self.classifier.learn(bitmap, label, isSparse=self.n)
        self.sampleReference.append(self.patterns[i]["ID"])


  def testModel(self, i, numLabels=3):
    """
    Test the model on record i.

    @param numLabels  (int)           Number of classification predictions.
    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
    (_, inferenceResult, _, _) = self.classifier.infer(
      self.sparsifyPattern(self.patterns[i]["pattern"]["bitmap"], self.n))
    return self.getWinningLabels(inferenceResult, numLabels)
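When the Cortical.io client returns nothing, encodeSample falls back to a random SDR with the model's dimensions n and w. The encodeRandomly helper itself is not part of this listing; a minimal numpy sketch of such a fallback, under the assumption that it simply picks w distinct positions out of n:

import numpy

def randomSDR(n, w, seed=42):
  """Pick w distinct active-bit positions out of n (assumed behavior of encodeRandomly)."""
  rng = numpy.random.RandomState(seed)
  return numpy.sort(rng.permutation(n)[:w])

bitmap = randomSDR(n=16384, w=163)
print len(bitmap), float(len(bitmap)) / 16384  # 163 positions, ~0.01 sparsity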
Example No. 13
class ClassificationModelFingerprint(ClassificationModel):
    """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """
    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 fingerprintType=EncoderTypes.document):

        super(ClassificationModelFingerprint,
              self).__init__(verbosity, numLabels)

        # Init kNN classifier and Cortical.io encoder; need valid API key (see
        # CioEncoder init for details).
        self.classifier = KNNClassifier(k=numLabels,
                                        distanceMethod='rawOverlap',
                                        exact=False,
                                        verbosity=verbosity - 1)

        if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
            raise ValueError("Invalid type of fingerprint encoding; see the "
                             "EncoderTypes class for eligible types.")
        self.encoder = CioEncoder(cacheDir="./fluent/experiments/cioCache",
                                  fingerprintType=fingerprintType)
        self.n = self.encoder.n
        self.w = int((self.encoder.targetSparsity / 100) * self.n)

    def encodePattern(self, sample):
        """
    Encode an SDR of the input string by querying the Cortical.io API. If the
    client returns None, we create a random SDR with the model's dimensions n
    and w.

    @param sample     (list)            Tokenized sample, where each item is a
                                        string token.
    @return fp        (dict)            The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
        sample = " ".join(sample)
        fpInfo = self.encoder.encode(sample)
        if fpInfo:
            fp = {
                "text": fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
                "sparsity": fpInfo["sparsity"],
                "bitmap": numpy.array(fpInfo["fingerprint"]["positions"])
            }
        else:
            fp = {
                "text": sample,
                "sparsity": float(self.w) / self.n,
                "bitmap": self.encodeRandomly(sample)
            }

        return fp

    def resetModel(self):
        """Reset the model by clearing the classifier."""
        self.classifier.clear()

    def trainModel(self, samples, labels):
        """
    Train the classifier on the input sample and labels.

    @param samples    (list)          List of dictionaries containing the
                                      sample text, sparsity, and bitmap.
    @param labels     (list)          List of numpy arrays containing the
                                      reference indices for the classifications
                                      of each sample.
    """
        for sample, sample_labels in zip(samples, labels):
            if sample["bitmap"].any():
                for label in sample_labels:
                    self.classifier.learn(sample["bitmap"],
                                          label,
                                          isSparse=self.n)

    def testModel(self, sample, numLabels=3):
        """
    Test the kNN classifier on the input sample. Returns the classification most
    frequent amongst the classifications of the sample's individual tokens.
    We ignore the terms that are unclassified, picking the most frequent
    classification among those that are detected.

    @param sample         (dict)          The sample text, sparsity, and bitmap.
    @param numLabels      (int)           Number of predicted classifications.
    @return               (numpy array)   The numLabels most-frequent
                                          classifications for the data samples;
                                          values are int or empty.
    """
        (_, inferenceResult, _,
         _) = self.classifier.infer(self._densifyPattern(sample["bitmap"]))
        return self.getWinningLabels(inferenceResult, numLabels)
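The _densifyPattern and sparsifyPattern helpers used above are not shown in this listing (they presumably live in the ClassificationModel base class). A self-contained sketch of the assumed conversion between a bitmap of active indices and a dense binary vector:

import numpy

def densify(bitmap, n):
  """Turn an array of active bit positions into a dense 0/1 vector of length n."""
  dense = numpy.zeros(n, dtype="uint32")
  dense[numpy.asarray(bitmap, dtype=int)] = 1
  return dense

def sparsify(dense):
  """Recover the sorted active bit positions from a dense 0/1 vector."""
  return numpy.flatnonzero(dense)

assert list(sparsify(densify([2, 7, 11], n=16))) == [2, 7, 11]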
Example No. 14
class ClassificationModelContext(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  text context encodings; training ANDs (intersects) the contexts to build
  the category bitmaps.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self, verbosity=1, numLabels=1):
    """
    Initialize the CorticalClient and CioEncoder. Requires a valid API key
    """
    super(ClassificationModelContext, self).__init__(verbosity)

    self.encoder = CioEncoder(cacheDir="./experiments/cache")
    self.client = CorticalClient(self.encoder.apiKey)

    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity / 100) * self.n)

    self.categoryBitmaps = {}
    self.numLabels = numLabels


  def encodePattern(self, pattern):
    """
    Encode an SDR of the input string by querying the Cortical.io API.

    @param pattern     (list)           Tokenized sample, where each item is a
                                        string
    @return            (dictionary)     Dictionary, containing text, sparsity,
                                        and bitmap
    Example return dict:
    {
      "text": "Example text",
      "sparsity": 0.0,
      "bitmap": numpy.zeros(0)
    }
    """
    text = " ".join(pattern)
    return {"text": text, "sparsity": 0.0, "bitmap": self._encodeText(text)}


  def _encodeText(self, text):
    fpInfo = self.encoder.encode(text)
    if self.verbosity > 1 and fpInfo:  # encode() may return None
      print "Fingerprint sparsity = {0}%.".format(fpInfo["sparsity"])

    if fpInfo:
      bitmap = numpy.array(fpInfo["fingerprint"]["positions"])
    else:
      bitmap = self.encodeRandomly(text)

    return bitmap.astype(int)


  def resetModel(self):
    """Reset the model"""
    self.categoryBitmaps.clear()


  def trainModel(self, samples, labels):
    """
    Train the classifier on the input sample and label. Use Cortical.io's
    keyword extraction to get the most relevant terms then get the intersection
    of those bitmaps

    @param samples     (dictionary)      Dictionary, containing text, sparsity,
                                         and bitmap
    @param labels      (int)             Reference index for the classification
                                         of this sample.
    """
    for sample, sample_labels in zip(samples, labels):
      bitmaps = [sample["bitmap"].tolist()]
      context = self.client.getContextFromText(bitmaps, maxResults=5,
                                               getFingerprint=True)

      if len(context) != 0:
        union = numpy.zeros(0)
        for c in context:
          bitmap = c["fingerprint"]["positions"]
          union = numpy.union1d(bitmap, union).astype(int)

        for label in sample_labels:
          # Haven't seen the label before
          if label not in self.categoryBitmaps:
            self.categoryBitmaps[label] = union

          intersection = numpy.intersect1d(union, self.categoryBitmaps[label])
          if intersection.size == 0:
            # Don't want to lose all the old information
            union = numpy.union1d(union, self.categoryBitmaps[label]).astype(int)
            # Need to sample to stay sparse
            count = len(union)
            sampleIndices = random.sample(xrange(count), min(count, self.w))
            intersection = numpy.sort(union[sampleIndices])

          self.categoryBitmaps[label] = intersection


  def testModel(self, sample):
    """
    Test the intersection bitmap on the input sample. Returns a dictionary
    containing various distance metrics between the sample and the classes.

    @param sample     (dictionary)      Dictionary, containing text, sparsity,
                                        and bitmap
    @return           (dictionary)      The distances between the sample and
                                        the classes
    Example return dict:
      {
        0: {
          "cosineSimilarity": 0.6666666666666666,
          "euclideanDistance": 0.3333333333333333,
          "jaccardDistance": 0.5,
          "overlappingAll": 6,
          "overlappingLeftRight": 0.6666666666666666,
          "overlappingRightLeft": 0.6666666666666666,
          "sizeLeft": 9,
          "sizeRight": 9,
          "weightedScoring": 0.4436476984102028
        }
      }
    """

    sampleBitmap = sample["bitmap"].tolist()

    distances = {}
    for cat, catBitmap in self.categoryBitmaps.iteritems():
      distances[cat] = self.client.compare(sampleBitmap, catBitmap.tolist())

    return self.winningLabels(distances, numberCats=self.numLabels,
      metric="overlappingAll") 


  @staticmethod
  def winningLabels(distances, numberCats, metric):
    """
    Return indices of winning categories, based off of the input metric.
    Overrides the base class implementation.
    """
    metricValues = numpy.array([v[metric] for v in distances.values()])
    sortedIdx = numpy.argsort(metricValues)

    # euclideanDistance and jaccardDistance are ascending
    descendingOrder = set(["overlappingAll", "overlappingLeftRight",
      "overlappingRightLeft", "cosineSimilarity", "weightedScoring"])
    if metric in descendingOrder:
      sortedIdx = sortedIdx[::-1]

    return [distances.keys()[catIdx] for catIdx in sortedIdx[:numberCats]]
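winningLabels ranks the categories by a single comparison metric and reverses the sort for metrics where a larger value means more similar. A self-contained sketch of that ranking with made-up distance values:

import numpy

# Hypothetical comparison results, keyed by category index.
distances = {0: {"overlappingAll": 6}, 1: {"overlappingAll": 2}, 2: {"overlappingAll": 9}}

metricValues = numpy.array([v["overlappingAll"] for v in distances.values()])
sortedIdx = numpy.argsort(metricValues)[::-1]       # overlappingAll: larger = more similar
print [distances.keys()[i] for i in sortedIdx[:2]]  # [2, 0]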
Example No. 15
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self, verbosity=1):
    super(ClassificationModelFingerprint, self).__init__(verbosity)

    # Init kNN classifier and Cortical.io encoder; need valid API key (see
    # CioEncoder init for details).
    self.classifier = KNNClassifier(k=1, exact=False, verbosity=verbosity-1)
    self.encoder = CioEncoder(cacheDir="./experiments/cache")
    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity/100)*self.n)


  def encodePattern(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API.

    @param sample     (list)            Tokenized sample, where each item is a
                                        string token.
    @return           (list)            Numpy arrays, each with a bitmap of the
                                        encoding.
    """
    fpInfo = self.encoder.encode(string.join(sample))
    if self.verbosity > 1 and fpInfo:  # encode() may return None
      print "Fingerprint sparsity = {0}%.".format(fpInfo["sparsity"])
    if fpInfo:
      return numpy.array(fpInfo["fingerprint"]["positions"], dtype="uint32")
    else:
      return numpy.empty(0)


  def resetModel(self):
    """Reset the model by clearing the classifier."""
    self.classifier.clear()


  def trainModel(self, sample, label):
    """
    Train the classifier on the input sample and label.

    @param sample     (numpy.array)     Bitmap encoding of the sample.
    @param label      (int)             Reference index for the classification
                                        of this sample.
    """
    if sample.any():
      _ = self.classifier.learn(sample, label, isSparse=self.n)


  def testModel(self, sample):
    """
    Test the kNN classifier on the input sample. Returns the classification most
    frequent amongst the classifications of the sample's individual tokens.
    We ignore the terms that are unclassified, picking the most frequent
    classification among those that are detected.

    @param sample     (numpy.array)     Bitmap encoding of the sample.
    @return           (list)            The n most-frequent classifications
                                        for the data samples; for more, see the
                                        KNNClassifier.infer() documentation.
                                        Values are int or None.
    Note: to return multiple winner classifications, modify the return statement
    accordingly.
    """
    tokenLabels = []
    (tokenLabel, _, _, _) = self.classifier.infer(self._densifyPattern(sample))
    ## TODO: get list of closest classifications, not just the winner
    return [tokenLabel]
Example No. 16
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self, verbosity=1, numLabels=3):
    super(ClassificationModelFingerprint, self).__init__(verbosity, numLabels)

    # Init kNN classifier and Cortical.io encoder; need valid API key (see
    # CioEncoder init for details).
    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod='rawOverlap',
                                    exact=False,
                                    verbosity=verbosity-1)

    self.encoder = CioEncoder(cacheDir="./experiments/cache")
    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity/100)*self.n)


  def encodePattern(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API. If the
    client returns None, we create a random SDR with the model's dimensions n
    and w.

    @param sample     (list)            Tokenized sample, where each item is a
                                        string token.
    @return fp        (dict)            The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)
    if fpInfo:
      fp = {"text":fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
            "sparsity":fpInfo["sparsity"],
            "bitmap":numpy.array(fpInfo["fingerprint"]["positions"])
            }
    else:
      fp = {"text":sample,
            "sparsity":float(self.w)/self.n,
            "bitmap":self.encodeRandomly(sample)
            }

    return fp


  def resetModel(self):
    """Reset the model by clearing the classifier."""
    self.classifier.clear()


  def trainModel(self, sample, labels):
    """
    Train the classifier on the input sample and labels.

    @param sample     (dict)          The sample text, sparsity, and bitmap.
    @param labels     (numpy array)   Reference indices for the classifications
                                      of this sample.
    """
    if sample["bitmap"].any():
      for label in labels:
        self.classifier.learn(sample["bitmap"], label, isSparse=self.n)


  def testModel(self, sample, numLabels=3):
    """
    Test the kNN classifier on the input sample. Returns the classification most
    frequent amongst the classifications of the sample's individual tokens.
    We ignore the terms that are unclassified, picking the most frequent
    classification among those that are detected.

    @param sample         (dict)          The sample text, sparsity, and bitmap.
    @param numLabels      (int)           Number of predicted classifications.
    @return               (numpy array)   The numLabels most-frequent
                                          classifications for the data samples;
                                          values are int or empty.
    """
    (_, inferenceResult, _, _) = self.classifier.infer(
      self._densifyPattern(sample["bitmap"]))
    return self.getWinningLabels(inferenceResult, numLabels)
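getWinningLabels above is not defined in this listing (it presumably comes from the ClassificationModel base class); given the kNN inference result it returns the numLabels best categories. A plausible, hedged sketch of that selection using numpy's argsort, not the actual base-class implementation:

import numpy

inferenceResult = numpy.array([0.1, 0.7, 0.0, 0.2])  # hypothetical per-category scores
numLabels = 3
winners = numpy.argsort(inferenceResult)[::-1][:numLabels]
print winners  # [1 3 0]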
Example No. 17
class ClassificationModelEndpoint(ClassificationModel):
    """
  Class to run the survey response classification task with Cortical.io
  text endpoint encodings and classification system.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """
    def __init__(self, verbosity=1, numLabels=3):
        """
    Initialize the encoder as CioEncoder; requires a valid API key.
    """
        super(ClassificationModelEndpoint, self).__init__(verbosity, numLabels)

        self.encoder = CioEncoder(cacheDir="./experiments/cache")
        self.compareEncoder = LanguageEncoder()

        self.n = self.encoder.n
        self.w = int((self.encoder.targetSparsity / 100) * self.n)

        self.categoryBitmaps = {}
        self.negatives = defaultdict(list)
        self.positives = defaultdict(list)

    def encodePattern(self, sample):
        """
    Encode an SDR of the input string by querying the Cortical.io API.

    @param sample         (list)          Tokenized sample, where each item is
                                          a string
    @return fp            (dict)          The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
        sample = " ".join(sample)
        fpInfo = self.encoder.encode(sample)
        if fpInfo:
            fp = {
                "text": fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
                "sparsity": fpInfo["sparsity"],
                "bitmap": numpy.array(fpInfo["fingerprint"]["positions"])
            }
        else:
            fp = {
                "text": sample,
                "sparsity": float(self.w) / self.n,
                "bitmap": self.encodeRandomly(sample)
            }

        return fp

    def resetModel(self):
        """Reset the model"""
        self.positives.clear()
        self.negatives.clear()
        self.categoryBitmaps.clear()

    def trainModel(self, samples, labels, negatives=None):
        """
    Train the classifier on the input sample and label. Use Cortical.io's
    createClassification to make a bitmap that represents the class

    @param samples    (list)            List of dictionaries containing the
                                        sample text, sparsity, and bitmap.
    @param labels     (list)            List of numpy arrays containing the
                                        reference indices for the
                                        classifications of each sample.
    @param negatives  (list)            Each item is the dictionary containing
                                        text, sparsity and bitmap for the
                                        negative samples.
    """
        labelsToUpdateBitmaps = set()
        for sample, sampleLabels in zip(samples, labels):
            for label in sampleLabels:
                fpInfo = self.encoder.encode(sample["text"])
                if sample["text"] and fpInfo:
                    self.positives[label].append(sample["text"])

                    # Only add negatives when training on one sample so we know which
                    # labels to use
                    if negatives and len(samples) == 1:
                        for neg in negatives:
                            if neg["text"]:
                                self.negatives[label].append(neg["text"])
                    labelsToUpdateBitmaps.add(label)

        for label in labelsToUpdateBitmaps:
            self.categoryBitmaps[label] = self.encoder.createCategory(
                str(label), self.positives[label],
                self.negatives[label])["positions"]

    def testModel(self, sample, numLabels=3, metric="overlappingAll"):
        """
    Test the Cortical.io classifier on the input sample. Returns a dictionary
    containing various distance metrics between the sample and the classes.

    @param sample         (dict)      The sample text, sparsity, and bitmap.
    @return               (list)      Winning classifications based on the
                                      specified metric. The number of items
                                      returned will be <= numLabels.
    """
        sampleBitmap = sample["bitmap"].tolist()

        distances = defaultdict(list)
        for cat, catBitmap in self.categoryBitmaps.iteritems():
            distances[cat] = self.compareEncoder.compare(
                sampleBitmap, catBitmap)

        return self.getWinningLabels(distances,
                                     numLabels=numLabels,
                                     metric=metric)

    @staticmethod
    def compareCategories(catDistances, metric="overlappingAll"):
        """
    Calculate category distances. Returns a defaultdict of category keys, where
    values are OrderedDicts sorted such that the most similar categories
    (according to the input metric) are listed first.
    """
        descendingOrder = ("overlappingAll", "overlappingLeftRight",
                           "overlappingRightLeft", "cosineSimilarity",
                           "weightedScoring")

        categoryComparisons = defaultdict(list)
        for k, v in catDistances.iteritems():
            # Create a dict for this category
            metricDict = {
                compareCat: distances[metric]
                for compareCat, distances in v.iteritems()
            }
            # Sort the dict by the metric
            reverse = True if metric in descendingOrder else False
            categoryComparisons[k] = OrderedDict(
                sorted(metricDict.items(), key=lambda k: k[1],
                       reverse=reverse))

        return categoryComparisons

    def getCategoryDistances(self, sort=True, save=None, labelRefs=None):
        """
    Return a dict where keys are categories and values are dicts of distances.

    @param sort      (bool)        Sort the inner dicts with compareCategories()
    @param save      (str)         Dump catDistances to a JSON in this dir.
    @return          (defaultdict)

    E.g. w/ categories 0 and 1:
      catDistances = {
          0: {
              0: {"cosineSimilarity": 1.0, ...},
              1: {"cosineSimilarity": 0.33, ...}
              },
          1: {
              0: {"cosineSimilarity": 0.33, ...},
              1: {"cosineSimilarity": 1.0, ...}
              }
    Note the inner-dicts of catDistances are OrderedDict objects.
    """
        catDistances = defaultdict(list)
        for cat, catBitmap in self.categoryBitmaps.iteritems():
            catDistances[cat] = OrderedDict()
            for compareCat, compareBitmap in self.categoryBitmaps.iteritems():
                # List is in order of self.categoryBitmaps.keys()
                catDistances[cat][compareCat] = self.compareEncoder.compare(
                    catBitmap, compareBitmap)

        if sort:
            # Order each inner dict of catDistances such that the ranking is most to
            # least similar.
            catDistances = self.compareCategories(catDistances)

        if save is not None:
            self.writeOutCategories(save,
                                    comparisons=catDistances,
                                    labelRefs=labelRefs)

        return catDistances

    @staticmethod
    def getWinningLabels(distances, numLabels, metric):
        """
    Return indices of winning categories, based off of the input metric.
    Overrides the base class implementation.
    """
        metricValues = numpy.array([v[metric] for v in distances.values()])
        sortedIdx = numpy.argsort(metricValues)

        # euclideanDistance and jaccardDistance are ascending
        descendingOrder = ("overlappingAll", "overlappingLeftRight",
                           "overlappingRightLeft", "cosineSimilarity",
                           "weightedScoring")
        if metric in descendingOrder:
            sortedIdx = sortedIdx[::-1]

        return numpy.array(
            [distances.keys()[catIdx] for catIdx in sortedIdx[:numLabels]])
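getCategoryDistances builds a category-by-category matrix of comparison dicts, and compareCategories re-sorts each row so the most similar categories come first. A small self-contained sketch of that ranking step, with made-up cosineSimilarity values:

from collections import OrderedDict, defaultdict

# Hypothetical pairwise comparisons for two categories, 0 and 1.
catDistances = {
    0: {0: {"cosineSimilarity": 1.0}, 1: {"cosineSimilarity": 0.33}},
    1: {0: {"cosineSimilarity": 0.33}, 1: {"cosineSimilarity": 1.0}},
}

ranked = defaultdict(list)
for cat, comparisons in catDistances.iteritems():
  metricDict = {other: d["cosineSimilarity"] for other, d in comparisons.iteritems()}
  # cosineSimilarity is a similarity metric, so rank descending (most similar first).
  ranked[cat] = OrderedDict(sorted(metricDict.items(), key=lambda item: item[1],
                                   reverse=True))

print ranked[0]  # OrderedDict([(0, 1.0), (1, 0.33)])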
Example No. 18
class ClassificationModelFingerprint(ClassificationModel):
    """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """
    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelFingerprint",
                 fingerprintType=EncoderTypes.word,
                 unionSparsity=20.0):

        super(ClassificationModelFingerprint,
              self).__init__(verbosity=verbosity,
                             numLabels=numLabels,
                             modelDir=modelDir)

        # Init kNN classifier and Cortical.io encoder; need valid API key (see
        # CioEncoder init for details).
        self.classifier = KNNClassifier(k=numLabels,
                                        distanceMethod='rawOverlap',
                                        exact=False,
                                        verbosity=verbosity - 1)

        if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
            raise ValueError("Invalid type of fingerprint encoding; see the "
                             "EncoderTypes class for eligible types.")
        self.encoder = CioEncoder(cacheDir="./fluent/experiments/cioCache",
                                  fingerprintType=fingerprintType,
                                  unionSparsity=unionSparsity)
        self.n = self.encoder.n
        self.w = int((self.encoder.targetSparsity / 100) * self.n)

    def encodeSample(self, sample):
        """
    Encode an SDR of the input string by querying the Cortical.io API. If the
    client returns None, we create a random SDR with the model's dimensions n
    and w.

    @param sample     (list)        Tokenized sample, where each item is a str.
    @return fp        (dict)        The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
        sample = " ".join(sample)
        fpInfo = self.encoder.encode(sample)
        if fpInfo:
            fp = {
                "text": fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
                "sparsity": fpInfo["sparsity"],
                "bitmap": numpy.array(fpInfo["fingerprint"]["positions"])
            }
        else:
            fp = {
                "text": sample,
                "sparsity": float(self.w) / self.n,
                "bitmap": self.encodeRandomly(sample)
            }

        return fp

    def trainModel(self, i):
        # TODO: add batch training, where i is a list
        """
    Train the classifier on the sample and labels for record i. The list
    sampleReference is populated to correlate classifier prototypes to sample
    IDs.
    """
        bitmap = self.patterns[i]["pattern"]["bitmap"]
        if bitmap.any():
            for label in self.patterns[i]["labels"]:
                self.classifier.learn(bitmap, label, isSparse=self.n)
                self.sampleReference.append(self.patterns[i]["ID"])

    def testModel(self, i, numLabels=3):
        """
    Test the model on record i.

    @param numLabels  (int)           Number of classification predictions.
    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
        (_, inferenceResult, _, _) = self.classifier.infer(
            self.sparsifyPattern(self.patterns[i]["pattern"]["bitmap"],
                                 self.n))
        return self.getWinningLabels(inferenceResult, numLabels)
Example No. 19
class ClassificationModelContext(ClassificationModel):
    """
  Class to run the survey response classification task with Cortical.io
  text context encodings; training ANDs (intersects) the contexts to build
  the category bitmaps.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """
    def __init__(self, verbosity=1, numLabels=1):
        """
    Initialize the CorticalClient and CioEncoder. Requires a valid API key
    """
        super(ClassificationModelContext, self).__init__(verbosity)

        self.encoder = CioEncoder(cacheDir="./experiments/cache")
        self.client = CorticalClient(self.encoder.apiKey)

        self.n = self.encoder.n
        self.w = int((self.encoder.targetSparsity / 100) * self.n)

        self.categoryBitmaps = {}
        self.numLabels = numLabels

    def encodePattern(self, pattern):
        """
    Encode an SDR of the input string by querying the Cortical.io API.

    @param pattern     (list)           Tokenized sample, where each item is a
                                        string
    @return            (dictionary)     Dictionary, containing text, sparsity,
                                        and bitmap
    Example return dict:
    {
      "text": "Example text",
      "sparsity": 0.0,
      "bitmap": numpy.zeros(0)
    }
    """
        text = " ".join(pattern)
        return {
            "text": text,
            "sparsity": 0.0,
            "bitmap": self._encodeText(text)
        }

    def _encodeText(self, text):
        fpInfo = self.encoder.encode(text)
        if self.verbosity > 1 and fpInfo:  # encode() may return None
            print "Fingerprint sparsity = {0}%.".format(fpInfo["sparsity"])

        if fpInfo:
            bitmap = numpy.array(fpInfo["fingerprint"]["positions"])
        else:
            bitmap = self.encodeRandomly(text)

        return bitmap.astype(int)

    def resetModel(self):
        """Reset the model"""
        self.categoryBitmaps.clear()

    def trainModel(self, samples, labels):
        """
    Train the classifier on the input sample and label. Use Cortical.io's
    keyword extraction to get the most relevant terms then get the intersection
    of those bitmaps

    @param samples     (dictionary)      Dictionary, containing text, sparsity,
                                         and bitmap
    @param labels      (int)             Reference index for the classification
                                         of this sample.
    """
        for sample, sample_labels in zip(samples, labels):
            bitmaps = [sample["bitmap"].tolist()]
            context = self.client.getContextFromText(bitmaps,
                                                     maxResults=5,
                                                     getFingerprint=True)

            if len(context) != 0:
                union = numpy.zeros(0)
                for c in context:
                    bitmap = c["fingerprint"]["positions"]
                    union = numpy.union1d(bitmap, union).astype(int)

                for label in sample_labels:
                    # Haven't seen the label before
                    if label not in self.categoryBitmaps:
                        self.categoryBitmaps[label] = union

                    intersection = numpy.intersect1d(
                        union, self.categoryBitmaps[label])
                    if intersection.size == 0:
                        # Don't want to lose all the old information
                        union = numpy.union1d(
                            union, self.categoryBitmaps[label]).astype(int)
                        # Need to sample to stay sparse
                        count = len(union)
                        sampleIndices = random.sample(xrange(count),
                                                      min(count, self.w))
                        intersection = numpy.sort(union[sampleIndices])

                    self.categoryBitmaps[label] = intersection

    def testModel(self, sample):
        """
    Test the intersection bitmap on the input sample. Returns a dictionary
    containing various distance metrics between the sample and the classes.

    @param sample     (dictionary)      Dictionary, containing text, sparsity,
                                        and bitmap
    @return           (dictionary)      The distances between the sample and
                                        the classes
    Example return dict:
      {
        0: {
          "cosineSimilarity": 0.6666666666666666,
          "euclideanDistance": 0.3333333333333333,
          "jaccardDistance": 0.5,
          "overlappingAll": 6,
          "overlappingLeftRight": 0.6666666666666666,
          "overlappingRightLeft": 0.6666666666666666,
          "sizeLeft": 9,
          "sizeRight": 9,
          "weightedScoring": 0.4436476984102028
        }
      }
    """

        sampleBitmap = sample["bitmap"].tolist()

        distances = {}
        for cat, catBitmap in self.categoryBitmaps.iteritems():
            distances[cat] = self.client.compare(sampleBitmap,
                                                 catBitmap.tolist())

        return self.winningLabels(distances,
                                  numberCats=self.numLabels,
                                  metric="overlappingAll")

    @staticmethod
    def winningLabels(distances, numberCats, metric):
        """
    Return indices of winning categories, based off of the input metric.
    Overrides the base class implementation.
    """
        metricValues = numpy.array([v[metric] for v in distances.values()])
        sortedIdx = numpy.argsort(metricValues)

        # euclideanDistance and jaccardDistance are ascending
        descendingOrder = set([
            "overlappingAll", "overlappingLeftRight", "overlappingRightLeft",
            "cosineSimilarity", "weightedScoring"
        ])
        if metric in descendingOrder:
            sortedIdx = sortedIdx[::-1]

        return [distances.keys()[catIdx] for catIdx in sortedIdx[:numberCats]]
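The context model's trainModel unions the fingerprints of a sample's most relevant context terms, then intersects that union with the label's existing category bitmap; if the intersection comes up empty it falls back to a random subsample of the combined union so the bitmap stays sparse without discarding the old information. A numpy sketch of that update step with small made-up bitmaps:

import random
import numpy

w = 4                                        # target number of active bits (illustrative)
categoryBitmap = numpy.array([1, 2, 3, 4])   # existing bitmap for this label (made up)
union = numpy.array([10, 11, 12, 13])        # union of the new sample's context fingerprints

intersection = numpy.intersect1d(union, categoryBitmap)
if intersection.size == 0:
  # Keep the old information by unioning, then subsample back down to w bits.
  combined = numpy.union1d(union, categoryBitmap).astype(int)
  sampleIndices = random.sample(xrange(len(combined)), min(len(combined), w))
  intersection = numpy.sort(combined[sampleIndices])

categoryBitmap = intersection
print categoryBitmap  # 4 sorted positions drawn from the combined union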
Example No. 20
class ClassificationModelEndpoint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  text endpoint encodings and classification system.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelEndpoint",
               unionSparsity=20.0):
    """
    Initializes the encoder as CioEncoder; requires a valid API key.
    """
    super(ClassificationModelEndpoint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.encoder = CioEncoder(cacheDir="./experiments/cache",
                              unionSparsity=unionSparsity)
    self.compareEncoder = LanguageEncoder()

    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity/100) * self.n)

    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API.

    @param sample         (list)          Tokenized sample, where each item is
                                          a string
    @return fp            (dict)          The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)
    if fpInfo:
      fp = {"text":fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
            "sparsity":fpInfo["sparsity"],
            "bitmap":numpy.array(fpInfo["fingerprint"]["positions"])}
    else:
      fp = {"text":sample,
            "sparsity":float(self.w)/self.n,
            "bitmap":self.encodeRandomly(sample)}

    return fp


  def resetModel(self):
    """Reset the model"""
    self.positives.clear()
    self.negatives.clear()
    self.categoryBitmaps.clear()


  def trainModel(self, i, negatives=None):
    # TODO: add batch training, where i is a list; note we should only add
    # negatives when training on one sample so we know which labels to use.
    """
    Train the classifier on the sample and labels for record i. Use
    Cortical.io's createClassification() to make a bitmap that represents the
    class. The list sampleReference is populated to correlate classifier
    prototypes to sample IDs.

    @param negatives  (list)            Each item is the dictionary containing
                                        text, sparsity and bitmap for the
                                        negative samples.
    """
    record = self.patterns[i]
    labelsToUpdateBitmaps = set()
    for label in record["labels"]:
      if record["pattern"]["text"] and record["pattern"]["bitmap"].any():
        self.positives[label].append(record["pattern"]["text"])
        if negatives:
          for neg in negatives:
            if neg["text"]:
              self.negatives[label].append(neg["text"])
        labelsToUpdateBitmaps.add(label)

    for label in labelsToUpdateBitmaps:
      self.categoryBitmaps[label] = self.encoder.createCategory(
        str(label), self.positives[label], self.negatives[label])["positions"]
      self.sampleReference.append(i)


  def testModel(self, i, numLabels=3, metric="overlappingAll"):
    """
    Test on record i. The Cortical.io classifier returns a dictionary
    containing various distance metrics between the sample and the classes.

    @param numLabels  (int)           Number of classification predictions.
    @param metric     (str)           Distance metric used by the classifier.
    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
    sampleBitmap = self.patterns[i]["pattern"]["bitmap"].tolist()

    distances = defaultdict(list)
    for cat, catBitmap in self.categoryBitmaps.iteritems():
      distances[cat] = self.compareEncoder.compare(sampleBitmap, catBitmap)

    return self.getWinningLabels(distances, numLabels=numLabels, metric=metric)


  @staticmethod
  def compareCategories(catDistances, metric="overlappingAll"):
    """
    Calculate category distances. Returns a defaultdict of category keys, where
    values are OrderedDicts sorted such that the most similar categories
    (according to the input metric) are listed first.
    """
    descendingOrder = ("overlappingAll", "overlappingLeftRight",
                       "overlappingRightLeft", "cosineSimilarity",
                       "weightedScoring")

    categoryComparisons = defaultdict(list)
    for k, v in catDistances.iteritems():
      # Create a dict for this category
      metricDict = {compareCat: distances[metric]
                    for compareCat, distances in v.iteritems()}
      # Sort the dict by the metric
      reverse = True if metric in descendingOrder else False
      categoryComparisons[k] = OrderedDict(
        sorted(metricDict.items(), key=lambda k: k[1], reverse=reverse))

    return categoryComparisons


  def getCategoryDistances(self, sort=True, save=None, labelRefs=None):
    """
    Return a dict where keys are categories and values are dicts of distances.

    @param sort      (bool)        Sort the inner dicts with compareCategories()
    @param save      (str)         Dump catDistances to a JSON in this dir.
    @return          (defaultdict)

    E.g. w/ categories 0 and 1:
      catDistances = {
          0: {
              0: {"cosineSimilarity": 1.0, ...},
              1: {"cosineSimilarity": 0.33, ...}
              },
          1: {
              0: {"cosineSimilarity": 0.33, ...},
              1: {"cosineSimilarity": 1.0, ...}
              }
    Note the inner-dicts of catDistances are OrderedDict objects.
    """
    catDistances = defaultdict(list)
    for cat, catBitmap in self.categoryBitmaps.iteritems():
      catDistances[cat] = OrderedDict()
      for compareCat, compareBitmap in self.categoryBitmaps.iteritems():
        # List is in order of self.categoryBitmaps.keys()
        catDistances[cat][compareCat] = self.compareEncoder.compare(
          catBitmap, compareBitmap)

    if sort:
      # Order each inner dict of catDistances such that the ranking is most to
      # least similar.
      catDistances = self.compareCategories(catDistances)

    if save is not None:
      self.writeOutCategories(
        save, comparisons=catDistances, labelRefs=labelRefs)

    return catDistances


  @staticmethod
  def getWinningLabels(distances, numLabels, metric):
    """
    Return indices of winning categories, based off of the input metric.
    Overrides the base class implementation.
    """
    metricValues = numpy.array([v[metric] for v in distances.values()])
    sortedIdx = numpy.argsort(metricValues)

    # euclideanDistance and jaccardDistance are ascending
    descendingOrder = ("overlappingAll", "overlappingLeftRight",
                       "overlappingRightLeft", "cosineSimilarity",
                       "weightedScoring")
    if metric in descendingOrder:
      sortedIdx = sortedIdx[::-1]

    return numpy.array(
      [distances.keys()[catIdx] for catIdx in sortedIdx[:numLabels]])


  @staticmethod
  def query():
    print "The Classification Endpoint model doesn't support this method."


  @staticmethod
  def infer():
    print "The Classification Endpoint model doesn't support this method."