Example #1
    def testMappingsWithImbuDocumentModel(self):
        # Create the CioDocumentFingerprint model
        modelName = "CioDocumentFingerprint"
        kwargs = {
            "numLabels": 1,
            "classifierMetric": "pctOverlapOfInput",
            "filterText": True,
            "verbosity": 0,
            "fingerprintType": EncoderTypes.document,
            "cacheRoot": None,
        }
        model = createModel("CioDocumentFingerprint", **kwargs)

        # Train the model for use in Imbu
        for seqId, text in enumerate(self.testDocuments):
            model.trainDocument(text, [0], seqId)

        # Query the model, expecting two matches from one sample
        query = "The key to artificial intelligence has always been the " "representation."
        _, sortedIds, sortedDistances = model.inferDocument(query, returnDetailedResults=True, sortResults=True)

        self.assertEqual(
            len(self.testDocuments), len(sortedIds), "Document-level models should have one prototype ID per document."
        )

        results = self._formatResults(modelName, sortedDistances, sortedIds)

        for r in results:
            self.assertEqual(0, r["wordId"], "wordId is insignificant in document-level models, and should be 0.")
Example #2
def instantiateModel(args):
    """
    Set some specific arguments and return an instance of the model we will use.
    """
    args.networkConfig = getNetworkConfig(args.networkConfigPath)
    args.k = kValues.get(args.modelName, 1)
    return createModel(**vars(args))
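For context, instantiateModel reads attributes off an argparse-style namespace and relies on a module-level kValues dict (the values below appear in a variant later in this listing). A hypothetical driver, assuming getNetworkConfig and the function above are importable; the flag names simply mirror the attributes the function reads:

import argparse

kValues = {"keywords": 21, "docfp": 3}  # normally a module-level constant

parser = argparse.ArgumentParser()
parser.add_argument("--modelName", default="docfp")
parser.add_argument("--networkConfigPath", default="network_config.json")
args = parser.parse_args()

model = instantiateModel(args)  # k falls back to 1 for unknown model names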
Example #3
    def testMappingsWithImbuDocumentModel(self):
        # Create the CioDocumentFingerprint model
        modelName = "CioDocumentFingerprint"
        kwargs = {
            "numLabels": 1,
            "classifierMetric": "pctOverlapOfInput",
            "filterText": True,
            "verbosity": 0,
            "fingerprintType": EncoderTypes.document,
            "cacheRoot": None
        }
        model = createModel("CioDocumentFingerprint", **kwargs)

        # Train the model for use in Imbu
        for seqId, text in enumerate(self.testDocuments):
            model.trainDocument(text, [0], seqId)

        # Query the model, expecting two matches from one sample
        query = ("The key to artificial intelligence has always been the "
                 "representation.")
        _, sortedIds, sortedDistances = model.inferDocument(
            query, returnDetailedResults=True, sortResults=True)

        self.assertEqual(
            len(self.testDocuments), len(sortedIds),
            "Document-level models should have one prototype ID per document.")

        results = self._formatResults(modelName, sortedDistances, sortedIds)

        for r in results:
            self.assertEqual(
                0, r["wordId"],
                "wordId is insignificant in document-level models, and should be 0."
            )
Example #4
def instantiateModel(args):
  """
  Set some specific arguments and return an instance of the model we will use.
  """
  args.networkConfig = getNetworkConfig(args.networkConfigPath)
  args.k = kValues.get(args.modelName, 1)
  return createModel(**vars(args))
Example #5
  def _executeModelLifecycle(self, modelName, modelDir):
    """ Create a model, train it, save it, reload it, return it."""
    model = createModel(modelName, **self.modelParams)
    model = trainModel(model, self.dataSet)
    model.save(modelDir)
    del model
    return ClassificationModel.load(modelDir)
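A sketch of how this helper might be exercised, assuming it lives in a unittest.TestCase with self.dataSet populated as elsewhere in this suite; the temporary directory and query string are illustrative:

import tempfile

def testLifecycleRoundTrip(self):
    modelDir = tempfile.mkdtemp()
    model = self._executeModelLifecycle("Keywords", modelDir)
    # The instance returned by ClassificationModel.load() should answer
    # queries just like the instance that was saved and deleted.
    _, sortedIds, _ = model.inferDocument(
        "sequence", returnDetailedResults=True, sortResults=True)
    self.assertTrue(len(sortedIds) > 0)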
Example #6
    def testMappingsWithImbuWordModel(self):
        # Create a Keywords model
        modelName = "Keywords"
        kwargs = {"numLabels": 1, "k": 42, "classifierMetric": "pctOverlapOfInput", "filterText": True, "verbosity": 0}
        model = createModel(modelName, **kwargs)

        # Train the model for use in Imbu
        for seqId, text in enumerate(self.testDocuments):
            tokenList, mapping = model.tokenize(text)
            lastTokenIndex = len(tokenList) - 1
            for i, (token, tokenIndex) in enumerate(zip(tokenList, mapping)):
                wordId = seqId * self.tokenIndexingFactor + tokenIndex
                model.trainToken(token, [0], wordId, reset=int(i == lastTokenIndex))

        # Query the model, expecting two matches from one sample
        query = "The key to artificial intelligence has always been the " "representation."
        _, sortedIds, sortedDistances = model.inferDocument(query, returnDetailedResults=True, sortResults=True)

        # Test for expected word-token mapping (in prototype IDs)
        self.assertItemsEqual(
            self.filteredProtoIds,
            sortedIds,
            "List of IDs returned from inference does not match the expected list of " "prototype IDs.",
        )

        # Test for exact matching results
        self.assertSequenceEqual(
            [0.0, 0.0, 1.0], sortedDistances[:3].tolist(), "Expected two exact-matching prototypes."
        )

        # Test for multiple matches per sample
        results = self._formatResults(modelName, sortedDistances, sortedIds)
        self.assertEqual(results[0]["sampleId"], results[1]["sampleId"])
        self.assertEqual(results[0]["text"], results[1]["text"])
        self.assertNotEqual(results[0]["wordId"], results[1]["wordId"])

        # Test the match maps back to the query
        matchingWord = results[0]["text"].split(" ")[results[0]["wordId"]]
        self.assertIn(matchingWord, query, "Matching word is indexed incorrectly.")

        # Query the model again, expecting five matches from two samples
        query = "sequence"
        _, sortedIds, sortedDistances = model.inferDocument(query, returnDetailedResults=True, sortResults=True)

        # Test for exact matching results
        self.assertSequenceEqual(
            [0.0, 0.0, 0.0, 0.0, 0.0, 1.0], sortedDistances[:6].tolist(), "Expected five exact-matching prototypes."
        )

        # Test the exact matches map back to the query term
        results = self._formatResults(modelName, sortedDistances, sortedIds)
        for r in results[:5]:
            self.assertIn(r["sampleId"], (2, 3))
            matchingWord = r["text"].split(" ")[r["wordId"]]
            self.assertIn(query, matchingWord, "Matching word is indexed incorrectly.")
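The prototype IDs in this test pack a (document, token) pair into a single integer via wordId = seqId * tokenIndexingFactor + tokenIndex. The inverse mapping is a divmod, provided the factor exceeds every document's token count; decodeWordId is a hypothetical helper:

def decodeWordId(wordId, tokenIndexingFactor):
    # divmod undoes seqId * factor + tokenIndex as long as every
    # tokenIndex is smaller than tokenIndexingFactor.
    seqId, tokenIndex = divmod(wordId, tokenIndexingFactor)
    return seqId, tokenIndex

# e.g. with a factor of 1000, token 7 of document 3 encodes to 3007
assert decodeWordId(3007, 1000) == (3, 7)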
Example #7
def instantiateModel(args):
    """
    Return an instance of the model we will use.
    """
    # Some values of K we know work well for this problem for specific model types
    kValues = {"keywords": 21, "docfp": 3}

    # Create model after setting specific arguments required for this experiment
    args.networkConfig = getNetworkConfig(args.networkConfigPath)
    args.numLabels = 2
    args.k = kValues.get(args.modelName, 1)

    return createModel(**vars(args))
Example #9
    def _modelFactory(self, modelName, savePath, **kwargs):
        """ Imbu model factory.  Returns a concrete instance of a classification
    model given a model type name and kwargs.

    @param modelName (str)    Must be one of 'CioWordFingerprint',
        'CioDocumentFingerprint', 'HTMNetwork', 'Keywords'.
    """
        kwargs.update(modelDir=savePath, **self._defaultModelFactoryKwargs())

        modelName = self._mapModelName(modelName)

        if getattr(ClassificationModelTypes,
                   modelName) in self.requiresCIOKwargs:
            # Model type requires Cortical.io credentials
            kwargs.update(retina=self.retina, apiKey=self.apiKey)
            # Specify encoder params
            kwargs.update(cacheRoot=self.cacheRoot, retinaScaling=1.0)

        if modelName == "CioWordFingerprint":
            kwargs.update(fingerprintType=EncoderTypes.word)

        elif modelName == "CioDocumentFingerprint":
            kwargs.update(fingerprintType=EncoderTypes.document)

        elif modelName == "HTMNetwork":
            try:
                kwargs.update(networkConfig=_loadNetworkConfig(
                    kwargs["networkConfigName"]))
            except Exception as e:
                print "Could not add params to HTMNetwork model config."
                raise e

        elif modelName == "Keywords":
            # k should be > the number of data samples because the Keywords model
            # looks for exact matching tokens, so we want to consider all data
            # samples in the search of k nearest neighbors.
            kwargs.update(k=10 * len(self.dataDict.keys()))

        else:
            raise ValueError(
                "{} is not an acceptable Imbu model.".format(modelName))

        model = createModel(modelName, **kwargs)

        model.verbosity = 0

        return model
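A hypothetical call site for the factory above; "imbu" stands for whatever object exposes it, and savePath is an illustrative location. Only the Cio* model types pick up the retina/apiKey credentials the factory injects:

model = imbu._modelFactory("CioDocumentFingerprint",
                           savePath="/tmp/imbu_models/cio_doc")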
Example #10
  def _modelFactory(self, modelName, savePath, **kwargs):
    """ Imbu model factory.  Returns a concrete instance of a classification
    model given a model type name and kwargs.

    @param modelName (str)    Must be one of 'CioWordFingerprint',
        'CioDocumentFingerprint', 'HTMNetwork', 'Keywords'.
    """
    kwargs.update(modelDir=savePath, **self._defaultModelFactoryKwargs())

    modelName = self._mapModelName(modelName)

    if getattr(ClassificationModelTypes, modelName) in self.requiresCIOKwargs:
      # Model type requires Cortical.io credentials
      kwargs.update(retina=self.retina, apiKey=self.apiKey)
      # Specify encoder params
      kwargs.update(cacheRoot=self.cacheRoot, retinaScaling=1.0)

    if modelName == "CioWordFingerprint":
      kwargs.update(fingerprintType=EncoderTypes.word)

    elif modelName == "CioDocumentFingerprint":
      kwargs.update(fingerprintType=EncoderTypes.document)

    elif modelName == "HTMNetwork":
      try:
        kwargs.update(
          networkConfig=_loadNetworkConfig(kwargs["networkConfigName"]))
      except Exception as e:
        print "Could not add params to HTMNetwork model config."
        raise e

    elif modelName == "Keywords":
      # k should be > the number of data samples because the Keywords model
      # looks for exact matching tokens, so we want to consider all data
      # samples in the search of k nearest neighbors.
      kwargs.update(k=10 * len(self.dataDict.keys()))

    else:
      raise ValueError("{} is not an acceptable Imbu model.".format(modelName))

    model = createModel(modelName, **kwargs)

    model.verbosity = 0

    return model
Example #11
def runExperiment(args):
  if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)
  
  (trainingDataDup, labelRefs, documentCategoryMap,
   documentTextMap) = readDataAndReshuffle(args)
  
  # remove duplicates from training data
  includedDocIds = set()
  trainingData = []
  for record in trainingDataDup:
    if record[2] not in includedDocIds:
      includedDocIds.add(record[2])
      trainingData.append(record)
  
  args.networkConfig = getNetworkConfig(args.networkConfigPath)
  model = createModel(numLabels=1, **vars(args))
  model = trainModel(args, model, trainingData, labelRefs)
  
  numDocs = model.getClassifier()._numPatterns
  
  print "Model trained with %d documents" % (numDocs,)
  
  knn = model.getClassifier()
  hc = HierarchicalClustering(knn)
  
  hc.cluster("complete")
  protos, clusterSizes = hc.getClusterPrototypes(args.numClusters,
                                                 numDocs)

  # Run test to ensure consistency with KNN
  if args.knnTest:
    knnTest(protos, knn)
    return


  # Summary statistics
  # bucketCounts[i, j] is the number of occurrences of bucket j in cluster i
  bucketCounts = numpy.zeros((args.numClusters, len(labelRefs)))  

  for clusterId in xrange(len(clusterSizes)):
    print
    print "Cluster %d with %d documents" % (clusterId, clusterSizes[clusterId])
    print "==============="

    prototypeNum = 0
    for index in protos[clusterId]:
      if index != -1:
        docId = trainingData[index][2]
        prototypeNum += 1
        display = prototypeNum <= args.numPrototypes

        if display:
          print "(%d) %s" % (docId, trainingData[index][0])
          print "Buckets:"

        # The docId keys in documentCategoryMap are strings rather than ints
        if docId in documentCategoryMap:
          for bucketId in documentCategoryMap[docId]:
            bucketCounts[clusterId, bucketId] += 1
            if display:
              print "    ", labelRefs[bucketId]
        elif display:
          print "    <None>"
        if display:
          print "\n\n"

  createBucketClusterPlot(args, bucketCounts)
  create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)
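The duplicate-removal loop near the top keeps only the first record seen per document ID (record[2]). The same idiom as a standalone helper, with the key made explicit; dedupByKey is a hypothetical name:

def dedupByKey(records, key):
    # Preserve input order, keeping the first record for each key value.
    seen = set()
    unique = []
    for record in records:
        k = key(record)
        if k not in seen:
            seen.add(k)
            unique.append(record)
    return unique

trainingData = dedupByKey(trainingDataDup, key=lambda r: r[2])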
Example #12
    def testMappingsWithImbuWordModel(self):
        # Create a Keywords model
        modelName = "Keywords"
        kwargs = {
            "numLabels": 1,
            "k": 42,
            "classifierMetric": "pctOverlapOfInput",
            "filterText": True,
            "verbosity": 0
        }
        model = createModel(modelName, **kwargs)

        # Train the model for use in Imbu
        for seqId, text in enumerate(self.testDocuments):
            tokenList, mapping = model.tokenize(text)
            lastTokenIndex = len(tokenList) - 1
            for i, (token, tokenIndex) in enumerate(zip(tokenList, mapping)):
                wordId = seqId * self.tokenIndexingFactor + tokenIndex
                model.trainToken(token, [0],
                                 wordId,
                                 reset=int(i == lastTokenIndex))

        # Query the model, expecting two matches from one sample
        query = ("The key to artificial intelligence has always been the "
                 "representation.")
        _, sortedIds, sortedDistances = model.inferDocument(
            query, returnDetailedResults=True, sortResults=True)

        # Test for expected word-token mapping (in prototype IDs)
        self.assertItemsEqual(
            self.filteredProtoIds, sortedIds,
            "List of IDs returned from inference does not match the expected list of "
            "prototype IDs.")

        # Test for exact matching results
        self.assertSequenceEqual([0.0, 0.0, 1.0], sortedDistances[:3].tolist(),
                                 "Expected two exact-matching prototypes.")

        # Test for multiple matches per sample
        results = self._formatResults(modelName, sortedDistances, sortedIds)
        self.assertEqual(results[0]["sampleId"], results[1]["sampleId"])
        self.assertEqual(results[0]["text"], results[1]["text"])
        self.assertNotEqual(results[0]["wordId"], results[1]["wordId"])

        # Test the match maps back to the query
        matchingWord = results[0]["text"].split(" ")[results[0]["wordId"]]
        self.assertIn(matchingWord, query,
                      "Matching word is indexed incorrectly.")

        # Query the model again, expecting five matches from two samples
        query = ("sequence")
        _, sortedIds, sortedDistances = model.inferDocument(
            query, returnDetailedResults=True, sortResults=True)

        # Test for exact matching results
        self.assertSequenceEqual([0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
                                 sortedDistances[:6].tolist(),
                                 "Expected five exact-matching prototypes.")

        # Test the exact matches map back to the query term
        results = self._formatResults(modelName, sortedDistances, sortedIds)
        for r in results[:5]:
            self.assertIn(r["sampleId"], (2, 3))
            matchingWord = r["text"].split(" ")[r["wordId"]]
            self.assertIn(query, matchingWord,
                          "Matching word is indexed incorrectly.")
Example #13
def runExperiment(args):
    if not os.path.exists(SAVE_PATH):
        os.makedirs(SAVE_PATH)

    (trainingDataDup, labelRefs, documentCategoryMap,
     documentTextMap) = readDataAndReshuffle(args)

    # remove duplicates from training data
    includedDocIds = set()
    trainingData = []
    for record in trainingDataDup:
        if record[2] not in includedDocIds:
            includedDocIds.add(record[2])
            trainingData.append(record)

    args.networkConfig = getNetworkConfig(args.networkConfigPath)
    model = createModel(numLabels=1, **vars(args))
    model = trainModel(args, model, trainingData, labelRefs)

    numDocs = model.getClassifier()._numPatterns

    print "Model trained with %d documents" % (numDocs, )

    knn = model.getClassifier()
    hc = HierarchicalClustering(knn)

    hc.cluster("complete")
    protos, clusterSizes = hc.getClusterPrototypes(args.numClusters, numDocs)

    # Run test to ensure consistency with KNN
    if args.knnTest:
        knnTest(protos, knn)
        return

    # Summary statistics
    # bucketCounts[i, j] is the number of occurrences of bucket j in cluster i
    bucketCounts = numpy.zeros((args.numClusters, len(labelRefs)))

    for clusterId in xrange(len(clusterSizes)):
        print
        print "Cluster %d with %d documents" % (clusterId,
                                                clusterSizes[clusterId])
        print "==============="

        prototypeNum = 0
        for index in protos[clusterId]:
            if index != -1:
                docId = trainingData[index][2]
                prototypeNum += 1
                display = prototypeNum <= args.numPrototypes

                if display:
                    print "(%d) %s" % (docId, trainingData[index][0])
                    print "Buckets:"

                # The docId keys in documentCategoryMap are strings rather than ints
                if docId in documentCategoryMap:
                    for bucketId in documentCategoryMap[docId]:
                        bucketCounts[clusterId, bucketId] += 1
                        if display:
                            print "    ", labelRefs[bucketId]
                elif display:
                    print "    <None>"
                if display:
                    print "\n\n"

    createBucketClusterPlot(args, bucketCounts)
    create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)