Python NetworkDataGenerator.generateSequence 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: htmresearch.support.network_text_data_generator

클래스/타입: NetworkDataGenerator

메소드/함수: generateSequence

hotexamples.com에서의 예제들: 3

Python NetworkDataGenerator.generateSequence - 3개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 htmresearch.support.network_text_data_generator.NetworkDataGenerator.generateSequence에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

NetworkDataGenerator(12)

split(8)

saveData(6)

getNumberOfTokens(3)

randomizeData(3)

generateSequence(1)

getClassifications(1)

getSamples(1)

reset(1)

setupData(1)

stripCategories(1)

예제 #1

파일 보기

파일: classify_htm.py 프로젝트: SaganBolliger/nupic.research

class ClassificationModelHTM(ClassificationModel):
  """Class to run the classification experiments with HTM network models."""

  def __init__(self,
               networkConfig,
               inputFilePath,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelHTM",
               prepData=True,
               stripCats=False):
    """
    @param networkConfig      (dict)    Network configuration dict with region
                                        parameters.
    @param inputFilePath      (str)     Path to data file.
    @param retinaScaling      (float)   Scales the dimensions of the SDRs.
    @param retina             (str)     Name of Cio retina.
    @param apiKey             (str)     Key for Cio API.
    @param prepData           (bool)    Prepare the input data into network API
                                        format.
    @param stripCats          (bool)    Remove the categories and replace them
                                        with the sequence_Id.
    See ClassificationModel for remaining parameters.
    """
    super(ClassificationModelHTM, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.networkConfig = networkConfig
    self.retinaScaling = retinaScaling
    self.retina = retina
    self.apiKey = apiKey

    self.networkDataGen = NetworkDataGenerator()
    if prepData:
      self.networkDataPath = self.prepData(inputFilePath, stripCats=stripCats)
    else:
      self.networkDataPath = inputFilePath

    self.network = self.initModel()
    self.learningRegions = self._getLearningRegions()

    # Always a sensor and classifier region.
    self.sensorRegion = self.network.regions[
      self.networkConfig["sensorRegionConfig"].get("regionName")]
    self.classifierRegion = self.network.regions[
      self.networkConfig["classifierRegionConfig"].get("regionName")]


  def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
    """
    Generate the data in network API format.

    @param dataPath          (str)  Path to input data file; format as expected
                                    by NetworkDataGenerator.
    @param ordered           (bool) Keep order of data, or randomize.
    @param stripCats         (bool) Remove the categories and replace them with
                                    the sequence_Id.
    @return networkDataPath  (str)  Path to data formtted for network API.
    """
    networkDataPath = self.networkDataGen.setupData(
      dataPath, self.numLabels, ordered, stripCats, **kwargs)

    return networkDataPath


  def initModel(self):
    """
    Initialize the network; self.networdDataPath must already be set.
    """
    recordStream = FileRecordStream(streamID=self.networkDataPath)
    root = os.path.dirname(os.path.realpath(__file__))
    encoder = CioEncoder(retinaScaling=self.retinaScaling,
                         cacheDir=os.path.join(root, "CioCache"),
                         retina=self.retina,
                         apiKey=self.apiKey)

    # This encoder specifies the LanguageSensor output width.
    return configureNetwork(recordStream, self.networkConfig, encoder)


  def _getLearningRegions(self):
    """Return tuple of the network's region objects that learn."""
    learningRegions = []
    for region in self.network.regions.values():
      spec = region.getSpec()
      if spec.parameters.contains('learningMode'):
        learningRegions.append(region)

    return learningRegions


  # TODO: is this still needed?
  def encodeSample(self, sample):
    """
    Put each token in its own dictionary with its bitmap
    @param sample     (list)            Tokenized sample, where each item is a
                                        string token.
    @return           (list)            The sample text, sparsity, and bitmap
                                        for each token. Since the network will
                                        do the actual encoding, the bitmap and
                                        sparsity will be None
    Example return list:
      [{
        "text": "Example text",
        "sparsity": 0.0,
        "bitmap": None
      }]
    """
    return [{"text": t,
             "sparsity": None,
             "bitmap": None} for t in sample]


  def resetModel(self):
    """
    Reset the model by creating a new network since the network API does not
    support resets.
    """
    # TODO: test this works as expected
    self.network = self.initModel()


  def saveModel(self, trial=None):
    try:
      if not os.path.exists(self.modelDir):
        os.makedirs(self.modelDir)
      if trial:
        netPath = os.path.join(self.modelDir, "network_{}.nta".format(trial))
      else:
        netPath = os.path.join(self.modelDir, "network.nta")
      self.network.save(netPath)
      # with open(netPath, "wb") as f:
      #   pkl.dump(self, f)
      if self.verbosity > 0:
        print "Model saved to '{}'.".format(netPath)
    except IOError as e:
      print "Could not save model to '{}'.".format(netPath)
      raise e


  def trainModel(self, iterations=1):
    """
    Run the network with all regions learning.
    Note self.sampleReference doesn't get populated b/c in a network model
    there's a 1-to-1 mapping of training samples.
    """
    for region in self.learningRegions:
      region.setParameter("learningMode", True)

    self.network.run(iterations)


  def trainNetwork(self, iterations):
    """Run the network with all regions learning but the classifier."""
    for region in self.learningRegions:
      if region.name == "classifier":
        region.setParameter("learningMode", False)
      else:
        region.setParameter("learningMode", True)

    self.network.run(iterations)


  def classifyNetwork(self, iterations):
    """
    For running after the network has been trained by trainNetwork(), this
    populates the KNN prototype space with the final network representations.
    """
    for region in self.learningRegions:
      region.setParameter("learningMode", False)

    sensor = self.sensorRegion.getSelf()
    sensor.rewind()

    self.classifierRegion.setParameter("learningMode", True)
    self.classifierRegion.setParameter("inferenceMode", True)

    sequenceIds = []
    for _ in xrange(iterations):
      self.network.run(1)
      sequenceIds.append(sensor.getOutputValues("sequenceIdOut")[0])

    return sequenceIds


  def inferNetwork(self, iterations, fileRecord=None, learn=False):
    """
    Run the network to infer distances to the classified samples.

    @param fileRecord (str)     If you want to change the file record stream.
    @param learn      (bool)    The classifier will learn the inferred sequnce.
    """
    if fileRecord:
      self.swapRecordStream(fileRecord)

    self.classifierRegion.setParameter("learningMode", learn)
    self.classifierRegion.setParameter("inferenceMode", True)

    sampleDistances = None
    for i in xrange(iterations):
      self.network.run(1)
      inferenceValues = self.classifierRegion.getOutputData("categoriesOut")
      # Sum together the inferred distances for each word of the sequence.
      if sampleDistances is None:
        sampleDistances = inferenceValues
      else:
        sampleDistances += inferenceValues

    return sampleDistances


  def swapRecordStream(self, dataPath):
    """Change the data source for the network's sensor region."""
    recordStream = FileRecordStream(streamID=dataPath)
    sensor = self.sensorRegion.getSelf()
    sensor.dataSource = recordStream  # TODO: implement this in network API


  def testModel(self, seed=42):
    """
    Test the classifier region on the input sample. Call this method for each
    word of a sequence. The random seed is used in getWinningLabels().

    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
    for region in self.learningRegions:
      region.setParameter("learningMode", False)
    self.classifierRegion.setParameter("inferenceMode", True)

    self.network.run(1)

    inference = self._getClassifierInference(seed)
    activityBitmap = self.classifierRegion.getInputData("bottomUpIn")

    return inference, activityBitmap


  def _getClassifierInference(self, seed):
    """Return output categories from the classifier region."""
    relevantCats = self.classifierRegion.getParameter("categoryCount")

    if self.classifierRegion.type == "py.KNNClassifierRegion":
      # max number of inferences = k
      inferenceValues = self.classifierRegion.getOutputData(
        "categoriesOut")[:relevantCats]
      return self.getWinningLabels(inferenceValues, seed)

    elif self.classifierRegion.type == "py.CLAClassifierRegion":
      # TODO: test this
      return self.classifierRegion.getOutputData("categoriesOut")[:relevantCats]


  def queryModel(self, query, preprocess=False):
    """
    Run the query through the network, getting the classifer region's inferences
    for all words of the query sequence.
    @return       (list)          Two-tuples of sequence ID and distance, sorted
                                  closest to farthest from the query.
    """
    for region in self.learningRegions:
      region.setParameter("learningMode", False)
    self.classifierRegion.setParameter("inferenceMode", True)

    # Put query text in LanguageSensor data format.
    queryDicts = self.networkDataGen.generateSequence(query, preprocess)

    sensor = self.sensorRegion.getSelf()
    sampleDistances = None
    for qD in queryDicts:
      # Sum together the inferred distances for each word of the query sequence.
      sensor.queue.appendleft(qD)
      self.network.run(1)
      inferenceValues = self.classifierRegion.getOutputData("categoriesOut")
      if sampleDistances is None:
        sampleDistances = inferenceValues
      else:
        sampleDistances += inferenceValues

    catCount = self.classifierRegion.getParameter("categoryCount")
    # The use of numpy.lexsort() here is to first sort by randomValues, then
    # sort by random values; this breaks ties in a random manner.
    randomValues = numpy.random.random(catCount)
    sortedSamples = numpy.lexsort((randomValues, sampleDistances[:catCount]))
    qTuple = [(a, b) for a, b in zip(sortedSamples, sampleDistances[:catCount])]

    return sorted(qTuple, key=operator.itemgetter(1))

예제 #2

파일 보기

파일: classify_htm.py 프로젝트: aoman89757/nupic.research

class ClassificationModelHTM(ClassificationModel):
    """Classify text using generic network-API based models."""

    def __init__(
        self,
        networkConfig,
        inputFilePath,
        retinaScaling=1.0,
        retina="en_associative",
        apiKey=None,
        verbosity=1,
        numLabels=3,
        modelDir="ClassificationModelHTM",
        prepData=True,
        stripCats=False,
        cacheRoot=None,
    ):
        """
    @param networkConfig      (dict)    Network configuration dict with region
                                        parameters.
    @param inputFilePath      (str)     Path to data file.
    @param retinaScaling      (float)   Scales the dimensions of the SDRs.
    @param retina             (str)     Name of Cio retina.
    @param apiKey             (str)     Key for Cio API.
    @param prepData           (bool)    Prepare the input data into network API
                                        format.
    @param stripCats          (bool)    Remove the categories and replace them
                                        with the sequence_Id.
    @param cacheRoot          (str)     Root cache directory for CioEncoder
    See ClassificationModel for remaining parameters.

    Note classifierMetric is not specified here as it is in other models. This
    is done in the network config file.
    """
        super(ClassificationModelHTM, self).__init__(verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

        self.networkConfig = networkConfig
        self.retinaScaling = retinaScaling
        self.retina = retina
        self.apiKey = apiKey
        self.inputFilePath = inputFilePath

        self.networkDataGen = NetworkDataGenerator()
        if prepData:
            self.networkDataPath = self.prepData(self.inputFilePath, stripCats=stripCats)
        else:
            self.networkDataPath = self.inputFilePath

        self.cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

        self.network = self.initModel()
        self._initializeRegionHelpers()

    def getClassifier(self):
        """
    Returns the classifier for the model.
    """
        return self.classifierRegion.getSelf().getAlgorithmInstance()

    def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
        """
    Generate the data in network API format.

    @param dataPath          (str)  Path to input data file; format as expected
                                    by NetworkDataGenerator.
    @param ordered           (bool) Keep order of data, or randomize.
    @param stripCats         (bool) Remove the categories and replace them with
                                    the sequence_Id.
    @return networkDataPath  (str)  Path to data formtted for network API.
    """
        networkDataPath = self.networkDataGen.setupData(dataPath, self.numLabels, ordered, stripCats, **kwargs)

        return networkDataPath

    def initModel(self):
        """
    Initialize the network; self.networdDataPath must already be set.
    """
        if self.networkDataPath is not None:
            recordStream = FileRecordStream(streamID=self.networkDataPath)
        else:
            recordStream = None

        encoder = CioEncoder(
            retinaScaling=self.retinaScaling,
            cacheDir=os.path.join(self.cacheRoot, "CioCache"),
            retina=self.retina,
            apiKey=self.apiKey,
            verbosity=self.verbosity - 1,
        )

        # This encoder specifies the LanguageSensor output width.
        return configureNetwork(recordStream, self.networkConfig, encoder)

    def _initializeRegionHelpers(self):
        """
    Set helper member variables once network has been initialized. This will
    also be called from _deSerializeExtraData()
    """
        learningRegions = []
        for region in self.network.regions.values():
            spec = region.getSpec()
            if spec.parameters.contains("learningMode"):
                learningRegions.append(region)

        # Always a sensor and classifier region.
        self.sensorRegion = self.network.regions[self.networkConfig["sensorRegionConfig"].get("regionName")]
        self.classifierRegion = self.network.regions[self.networkConfig["classifierRegionConfig"].get("regionName")]

        # There is sometimes a TP region
        self.tpRegion = None
        if self.networkConfig.has_key("tpRegionConfig"):
            self.tpRegion = self.network.regions[self.networkConfig["tpRegionConfig"].get("regionName")]

        self.learningRegions = learningRegions

        self.network.enableProfiling()

    # TODO: is this still needed?
    def encodeSample(self, sample):
        """
    Put each token in its own dictionary with its bitmap
    @param sample     (list)            Tokenized sample, where each item is a
                                        string token.
    @return           (list)            The sample text, sparsity, and bitmap
                                        for each token. Since the network will
                                        do the actual encoding, the bitmap and
                                        sparsity will be None
    Example return list:
      [{
        "text": "Example text",
        "sparsity": 0.0,
        "bitmap": None
      }]
    """
        return [{"text": t, "sparsity": None, "bitmap": None} for t in sample]

    def resetModel(self):
        """
    Reset the model by creating a new network since the network API does not
    support resets.
    """
        # TODO: test this works as expected
        self.network = self.initModel()

    def saveModel(self, trial=None):
        try:
            if not os.path.exists(self.modelDir):
                os.makedirs(self.modelDir)
            if trial:
                netPath = os.path.join(self.modelDir, "network_{}.nta".format(trial))
            else:
                netPath = os.path.join(self.modelDir, "network.nta")
            self.network.save(netPath)
            if self.verbosity > 0:
                print "Model saved to '{}'.".format(netPath)
        except IOError as e:
            print "Could not save model to '{}'.".format(netPath)
            raise e

    def trainModel(self, iterations=1):
        """
    Run the network with all regions learning.
    Note self.sampleReference doesn't get populated b/c in a network model
    there's a 1-to-1 mapping of training samples.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", True)

        self.network.run(iterations)

    def trainNetwork(self, iterations):
        """Run the network with all regions learning but the classifier."""
        for region in self.learningRegions:
            if region.name == "classifier":
                region.setParameter("learningMode", False)
            else:
                region.setParameter("learningMode", True)

        self.network.run(iterations)

    def classifyNetwork(self, iterations):
        """
    For running after the network has been trained by trainNetwork(), this
    populates the KNN prototype space with the final network representations.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)

        sensor = self.sensorRegion.getSelf()
        sensor.rewind()

        self.classifierRegion.setParameter("learningMode", True)
        self.classifierRegion.setParameter("inferenceMode", True)

        sequenceIds = []
        for _ in xrange(iterations):
            self.network.run(1)
            sequenceIds.append(sensor.getOutputValues("sequenceIdOut")[0])

        return sequenceIds

    def inferNetwork(self, iterations, fileRecord=None, learn=False):
        """
    Run the network to infer distances to the classified samples.

    @param fileRecord (str)     If you want to change the file record stream.
    @param learn      (bool)    The classifier will learn the inferred sequnce.
    """
        if fileRecord:
            self.swapRecordStream(fileRecord)

        self.classifierRegion.setParameter("learningMode", learn)
        self.classifierRegion.setParameter("inferenceMode", True)

        sampleDistances = None
        for i in xrange(iterations):
            self.network.run(1)
            inferenceValues = self.classifierRegion.getOutputData("categoriesOut")
            # Sum together the inferred distances for each word of the sequence.
            if sampleDistances is None:
                sampleDistances = inferenceValues
            else:
                sampleDistances += inferenceValues

        return sampleDistances

    def swapRecordStream(self, dataPath):
        """Change the data source for the network's sensor region."""
        recordStream = FileRecordStream(streamID=dataPath)
        sensor = self.sensorRegion.getSelf()
        sensor.dataSource = recordStream  # TODO: implement this in network API

    def testModel(self, seed=42):
        """
    Test the classifier region on the input sample. Call this method for each
    word of a sequence. The random seed is used in getWinningLabels().

    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
        self.classifierRegion.setParameter("inferenceMode", True)

        self.network.run(1)

        inference = self._getClassifierInference(seed)
        activityBitmap = self.classifierRegion.getInputData("bottomUpIn")

        return inference, activityBitmap

    def _getClassifierInference(self, seed):
        """Return output categories from the classifier region."""
        relevantCats = self.classifierRegion.getParameter("categoryCount")

        if self.classifierRegion.type == "py.KNNClassifierRegion":
            # max number of inferences = k
            inferenceValues = self.classifierRegion.getOutputData("categoriesOut")[:relevantCats]
            return self.getWinningLabels(inferenceValues, seed)

        elif self.classifierRegion.type == "py.CLAClassifierRegion":
            # TODO: test this
            return self.classifierRegion.getOutputData("categoriesOut")[:relevantCats]

    def queryModel(self, query, preprocess=False):
        """
    Run the query through the network, getting the classifer region's inferences
    for all words of the query sequence.
    @return       (list)          Two-tuples of sequence ID and distance, sorted
                                  closest to farthest from the query.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
        self.classifierRegion.setParameter("inferenceMode", True)

        # Put query text in LanguageSensor data format.
        queryDicts = self.networkDataGen.generateSequence(query, preprocess)

        sensor = self.sensorRegion.getSelf()
        sampleDistances = None
        for qD in queryDicts:
            # Sum together the inferred distances for each word of the query sequence.
            sensor.queue.appendleft(qD)
            self.network.run(1)
            inferenceValues = self.classifierRegion.getOutputData("categoriesOut")
            if sampleDistances is None:
                sampleDistances = inferenceValues
            else:
                sampleDistances += inferenceValues

        catCount = self.classifierRegion.getParameter("categoryCount")
        # The use of numpy.lexsort() here is to first sort by randomValues, then
        # sort by random values; this breaks ties in a random manner.
        randomValues = numpy.random.random(catCount)
        sortedSamples = numpy.lexsort((randomValues, sampleDistances[:catCount]))
        qTuple = [(a, b) for a, b in zip(sortedSamples, sampleDistances[:catCount])]

        return sorted(qTuple, key=operator.itemgetter(1))

    def tokenize(self, text, preprocess=False):
        """
    Given a bunch of text (could be several sentences) return a single list
    containing individual tokens. Text is tokenized using the CIO tokenize
    method.

    @param text         (str)     A bunch of text.
    @param preprocess   (bool)    Whether or not to preprocess the text data.
    """
        encoder = self.sensorRegion.getSelf().encoder
        sentenceList = encoder.client.tokenize(text)
        tokenList = []
        for sentence in sentenceList:
            tokenList.extend(sentence.split(","))
        return tokenList

    def reset(self):
        """
    Issue a reset signal to the model. The assumption is that a sequence has
    just ended and a new sequence is about to begin.  The default behavior is
    to do nothing - not all subclasses may re-implement this.
    """
        # TODO: Introduce a consistent reset method name.
        for r in self.learningRegions:
            if r.type == "py.TemporalPoolerRegion":
                r.executeCommand(["reset"])
            elif r.type == "py.TPRegion":
                r.executeCommand(["resetSequenceStates"])

    def trainText(self, token, labels, sequenceId=None, reset=0):
        """
    Train the model with the given text token, associated labels, and
    sequence ID.

    @param token      (str)  The text token to train on
    @param labels     (list) A list of one or more integer labels associated
                             with this token. If the list is empty, the
                             classifier will not be trained.
    @param sequenceId (int)  An integer ID associated with this token and its
                             sequence (document).
    @param reset      (int)  Should be 0 or 1. If 1, assumes we are at the
                             beginning of a new sequence.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", True)
        sensor = self.sensorRegion.getSelf()
        sensor.addDataToQueue(token, labels, sequenceId, 0)
        self.network.run(1)

        # Print the outputs of each region
        if self.verbosity >= 2:
            self.printRegionOutputs()

        if reset == 1:
            self.reset()

    def classifyText(self, token, reset=0):
        """
    Classify the token and return a list of the best classifications.

    @param token    (str)  The text token to train on
    @param reset    (int)  Should be 0 or 1. If 1, assumes we are at the
                           end of a sequence. A reset signal will be issued
                           after the model has been trained on this token.

    @return  (numpy array) An array of size numLabels. Position i contains
                           the likelihood that this sample belongs to the
                           i'th category. An array containing all zeros
                           implies no decision could be made.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
            region.setParameter("inferenceMode", True)
        sensor = self.sensorRegion.getSelf()
        sensor.addDataToQueue(token, [None], -1, 0)
        self.network.run(1)

        # Print the outputs of each region
        if self.verbosity >= 2:
            self.printRegionOutputs()

        if reset == 1:
            self.reset()

        return self.classifierRegion.getOutputData("categoriesOut")[0 : self.numLabels]

    def printRegionOutputs(self):
        """
    Print the outputs of regions to console for debugging, depending on
    verbosity level.
    """

        print "================== HTM Debugging output:"
        print "Sensor output:",
        print self.sensorRegion.getOutputData("dataOut").nonzero()
        print "Sensor categoryOut:",
        print self.sensorRegion.getOutputData("categoryOut")

        if self.verbosity >= 3:
            if self.tpRegion is not None:
                print "TP region input:",
                print self.tpRegion.getInputData("activeCells").nonzero()
                print "TP region output:",
                print self.tpRegion.getOutputData("mostActiveCells").nonzero()

            print "Classifier bottomUpIn: ",
            print self.classifierRegion.getInputData("bottomUpIn").nonzero()
            print "Classifier categoryIn: ",
            print self.classifierRegion.getInputData("categoryIn")[0 : self.numLabels]

        print "Classifier categoriesOut: ",
        print self.classifierRegion.getOutputData("categoriesOut")[0 : self.numLabels]
        print "Classifier categoryProbabilitiesOut",
        print self.classifierRegion.getOutputData("categoryProbabilitiesOut")[0 : self.numLabels]

    def dumpProfile(self):
        """
    Print region profiling information in a nice format.
    """
        print "Profiling information for {}".format(type(self).__name__)
        totalTime = 0.000001
        for region in self.network.regions.values():
            timer = region.computeTimer
            totalTime += timer.getElapsed()

        profileInfo = []
        for region in self.network.regions.values():
            timer = region.computeTimer
            profileInfo.append(
                [region.name, timer.getStartCount(), timer.getElapsed(), 100.0 * timer.getElapsed() / totalTime]
            )

        profileInfo.append(["Total time", "", totalTime, "100.0"])
        print tabulate(profileInfo, headers=["Region", "Count", "Elapsed", "Percent of total"], tablefmt="grid")

    def __getstate__(self):
        """
    Return serializable state.  This function will return a version of the
    __dict__ with data that shouldn't be pickled stripped out. For example,
    Network API instances are stripped out because they have their own
    serialization mechanism.

    See also: _serializeExtraData()
    """
        state = self.__dict__.copy()
        # Remove member variables that we can't pickle
        state.pop("network")
        state.pop("sensorRegion")
        state.pop("classifierRegion")
        state.pop("tpRegion")
        state.pop("learningRegions")
        state.pop("networkDataGen")

        return state

    def _serializeExtraData(self, extraDataDir):
        """
    Protected method that is called during serialization with an external
    directory path. We override it here to save the Network API instance.

    @param extraDataDir (string) Model's extra data directory path
    """
        self.network.save(os.path.join(extraDataDir, "network.nta"))

    def _deSerializeExtraData(self, extraDataDir):
        """
    Protected method that is called during deserialization (after __setstate__)
    with an external directory path. We override it here to load the Network API
    instance.

    @param extraDataDir (string) Model's extra data directory path
    """
        self.network = Network(os.path.join(extraDataDir, "network.nta"))
        self._initializeRegionHelpers()
        self.networkDataGen = NetworkDataGenerator()

예제 #3

파일 보기

파일: classify_htm.py 프로젝트: oxtopus/htmresearch

class ClassificationModelHTM(ClassificationModel):
    """Classify text using generic network-API based models."""
    def __init__(self,
                 networkConfig,
                 inputFilePath,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelHTM",
                 prepData=True,
                 stripCats=False):
        """
        Set up the data generator, the input data path, and the network.

        @param networkConfig  (dict)   Network configuration dict with region
                                       parameters.
        @param inputFilePath  (str)    Path to data file.
        @param retinaScaling  (float)  Scales the dimensions of the SDRs.
        @param retina         (str)    Name of Cio retina.
        @param apiKey         (str)    Key for Cio API.
        @param prepData       (bool)   Prepare the input data into network API
                                       format; when False the input file is
                                       used as-is.
        @param stripCats      (bool)   Remove the categories and replace them
                                       with the sequence_Id.
        See ClassificationModel for remaining parameters.

        Note classifierMetric is not specified here as it is in other models.
        This is done in the network config file.
        """
        super(ClassificationModelHTM, self).__init__(
            verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

        self.inputFilePath = inputFilePath
        self.networkConfig = networkConfig
        self.retinaScaling = retinaScaling
        self.retina = retina
        self.apiKey = apiKey

        # The generator must exist before self.prepData() is called below.
        self.networkDataGen = NetworkDataGenerator()
        if not prepData:
            self.networkDataPath = self.inputFilePath
        else:
            self.networkDataPath = self.prepData(self.inputFilePath,
                                                 stripCats=stripCats)

        # Build the network first; the region helpers look regions up in it.
        self.network = self.initModel()
        self._initializeRegionHelpers()

    def getClassifier(self):
        """
        Return the algorithm instance held by the model's classifier region.
        """
        pyRegion = self.classifierRegion.getSelf()
        return pyRegion.getAlgorithmInstance()

    def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
        """
        Generate the data in network API format by delegating to
        self.networkDataGen.setupData().

        @param dataPath          (str)  Path to input data file; format as
                                        expected by NetworkDataGenerator.
        @param ordered           (bool) Keep order of data, or randomize.
        @param stripCats         (bool) Remove the categories and replace them
                                        with the sequence_Id.
        @param kwargs                   Forwarded unchanged to setupData().
        @return networkDataPath  (str)  Path to data formatted for network API.
        """
        return self.networkDataGen.setupData(
            dataPath, self.numLabels, ordered, stripCats, **kwargs)

    def initModel(self):
        """
        Initialize the network; self.networkDataPath must already be set.

        @return The network returned by configureNetwork() for the record
                stream, the network config, and a CioEncoder.
        """
        recordStream = None
        if self.networkDataPath is not None:
            recordStream = FileRecordStream(streamID=self.networkDataPath)

        # The encoder cache lives in "CioCache" next to this source file.
        moduleDir = os.path.dirname(os.path.realpath(__file__))
        cacheDir = os.path.join(moduleDir, "CioCache")
        encoder = CioEncoder(retinaScaling=self.retinaScaling,
                             cacheDir=cacheDir,
                             retina=self.retina,
                             apiKey=self.apiKey)

        # This encoder specifies the LanguageSensor output width.
        return configureNetwork(recordStream, self.networkConfig, encoder)

    def _initializeRegionHelpers(self):
        """
        Cache references to the network's regions as member variables.

        Must run once the network has been initialized; it is also called from
        _deSerializeExtraData(). Sets:
          sensorRegion      region named by networkConfig["sensorRegionConfig"]
          classifierRegion  region named by
                            networkConfig["classifierRegionConfig"]
          tpRegion          region named by networkConfig["tpRegionConfig"],
                            or None when that config entry is absent
          learningRegions   every region whose spec has a 'learningMode'
                            parameter
        """
        regions = self.network.regions
        learningRegions = [
            region for region in regions.values()
            if region.getSpec().parameters.contains('learningMode')]

        # Always a sensor and classifier region.
        self.sensorRegion = regions[
            self.networkConfig["sensorRegionConfig"].get("regionName")]
        self.classifierRegion = regions[
            self.networkConfig["classifierRegionConfig"].get("regionName")]

        # There is sometimes a TP region. The `in` operator replaces the
        # deprecated dict.has_key().
        self.tpRegion = None
        if "tpRegionConfig" in self.networkConfig:
            self.tpRegion = regions[
                self.networkConfig["tpRegionConfig"].get("regionName")]

        self.learningRegions = learningRegions

    # TODO: is this still needed?
    def encodeSample(self, sample):
        """
        Put each token in its own dictionary alongside placeholder encoding
        fields.

        @param sample  (list)  Tokenized sample, where each item is a string
                               token.
        @return        (list)  One dict per token, in order. Since the network
                               will do the actual encoding, the bitmap and
                               sparsity are always None.
        Example return list:
          [{
            "text": "Example text",
            "sparsity": None,
            "bitmap": None
          }]
        """
        encoded = []
        for token in sample:
            encoded.append({"text": token, "sparsity": None, "bitmap": None})
        return encoded

    def resetModel(self):
        """
        Reset the model by creating a new network since the network API does
        not support resets.

        The cached region references (sensorRegion, classifierRegion, tpRegion,
        learningRegions) are refreshed as well; otherwise they would keep
        pointing at regions of the discarded network.
        """
        # TODO: test this works as expected
        self.network = self.initModel()
        # Same two-step sequence that __init__ uses to build the model.
        self._initializeRegionHelpers()

    def saveModel(self, trial=None):
        try:
            if not os.path.exists(self.modelDir):
                os.makedirs(self.modelDir)
            if trial:
                netPath = os.path.join(self.modelDir,
                                       "network_{}.nta".format(trial))
            else:
                netPath = os.path.join(self.modelDir, "network.nta")
            self.network.save(netPath)
            if self.verbosity > 0:
                print "Model saved to '{}'.".format(netPath)
        except IOError as e:
            print "Could not save model to '{}'.".format(netPath)
            raise e

    def trainModel(self, iterations=1):
        """
        Switch learning on in every learning region, then run the network.

        Note self.sampleReference doesn't get populated b/c in a network model
        there's a 1-to-1 mapping of training samples.

        @param iterations (int) Number of iterations to run the network for.
        """
        for learner in self.learningRegions:
            learner.setParameter("learningMode", True)

        self.network.run(iterations)

    def trainNetwork(self, iterations):
        """
        Run the network with all regions learning but the classifier.

        The classifier is identified through self.classifierRegion, which
        _initializeRegionHelpers() resolves from the network config, rather
        than through a hard-coded region name.

        @param iterations (int) Number of iterations to run the network for.
        """
        classifierName = self.classifierRegion.name
        for region in self.learningRegions:
            # Learning is on for every region except the classifier.
            region.setParameter("learningMode", region.name != classifierName)

        self.network.run(iterations)

    def classifyNetwork(self, iterations):
        """
        For running after the network has been trained by trainNetwork(); this
        populates the KNN prototype space with the final network
        representations.

        Learning is switched off in all learning regions, the sensor is
        rewound, and the classifier is then set to both learn and infer while
        the network is stepped one record at a time.

        @param iterations (int)  Number of single-step runs to perform.
        @return           (list) The first "sequenceIdOut" value of the sensor
                                 after each step.
        """
        for learner in self.learningRegions:
            learner.setParameter("learningMode", False)

        sensor = self.sensorRegion.getSelf()
        sensor.rewind()

        classifier = self.classifierRegion
        classifier.setParameter("learningMode", True)
        classifier.setParameter("inferenceMode", True)

        sequenceIds = []
        for _ in xrange(iterations):
            self.network.run(1)
            currentId = sensor.getOutputValues("sequenceIdOut")[0]
            sequenceIds.append(currentId)

        return sequenceIds

    def inferNetwork(self, iterations, fileRecord=None, learn=False):
        """
        Run the network to infer distances to the classified samples.

        @param iterations (int)   Number of single-step runs to perform.
        @param fileRecord (str)   If you want to change the file record stream.
        @param learn      (bool)  The classifier will learn the inferred
                                  sequence.
        @return The classifier's "categoriesOut" data summed over all steps,
                or None when iterations is zero.
        """
        if fileRecord:
            self.swapRecordStream(fileRecord)

        classifier = self.classifierRegion
        classifier.setParameter("learningMode", learn)
        classifier.setParameter("inferenceMode", True)

        summedDistances = None
        for _ in xrange(iterations):
            self.network.run(1)
            stepDistances = classifier.getOutputData("categoriesOut")
            # Sum together the inferred distances for each word of the
            # sequence.
            # NOTE(review): the first step's array is kept and accumulated into
            # in place -- confirm getOutputData() does not hand back a shared
            # buffer.
            if summedDistances is not None:
                summedDistances += stepDistances
            else:
                summedDistances = stepDistances

        return summedDistances

    def swapRecordStream(self, dataPath):
        """
        Change the data source for the network's sensor region.

        @param dataPath (str) Path handed to FileRecordStream as its streamID.
        """
        newStream = FileRecordStream(streamID=dataPath)
        # TODO: implement this in network API
        self.sensorRegion.getSelf().dataSource = newStream

    def testModel(self, seed=42):
        """
        Test the classifier region on the input sample. Call this method for
        each word of a sequence.

        Learning is switched off in all learning regions, classifier inference
        is switched on, and the network is run for a single step.

        @param seed  Random seed passed on to _getClassifierInference(), where
                     it is used in getWinningLabels().
        @return      (tuple) The classifier inference for this step and the
                             classifier region's "bottomUpIn" input data.
        """
        for learner in self.learningRegions:
            learner.setParameter("learningMode", False)
        classifier = self.classifierRegion
        classifier.setParameter("inferenceMode", True)

        self.network.run(1)

        inference = self._getClassifierInference(seed)
        activityBitmap = classifier.getInputData("bottomUpIn")
        return inference, activityBitmap

    def _getClassifierInference(self, seed):
        """
        Return output categories from the classifier region.

        @param seed  Random seed forwarded to getWinningLabels() for the KNN
                     classifier.
        @return      For "py.KNNClassifierRegion": the winning labels computed
                     from the first categoryCount entries of "categoriesOut".
                     For "py.CLAClassifierRegion": those entries themselves.
                     None for any other region type.
        """
        categoryCount = self.classifierRegion.getParameter("categoryCount")
        regionType = self.classifierRegion.type

        if regionType == "py.KNNClassifierRegion":
            # max number of inferences = k
            knnOutput = self.classifierRegion.getOutputData("categoriesOut")
            return self.getWinningLabels(knnOutput[:categoryCount], seed)

        if regionType == "py.CLAClassifierRegion":
            # TODO: test this
            claOutput = self.classifierRegion.getOutputData("categoriesOut")
            return claOutput[:categoryCount]

        return None

    def queryModel(self, query, preprocess=False):
        """
        Run the query through the network, getting the classifier region's
        inferences for all words of the query sequence.

        @param query       Query text, handed to
                           NetworkDataGenerator.generateSequence().
        @param preprocess  (bool) Forwarded to generateSequence().
        @return  (list)    Two-tuples of sequence ID and distance, sorted
                           closest to farthest from the query; ties are
                           ordered randomly. Empty when the query yields no
                           tokens.
        """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
        self.classifierRegion.setParameter("inferenceMode", True)

        # Put query text in LanguageSensor data format.
        queryDicts = self.networkDataGen.generateSequence(query, preprocess)

        sensor = self.sensorRegion.getSelf()
        sampleDistances = None
        for qD in queryDicts:
            # Sum together the inferred distances for each word of the query
            # sequence.
            # NOTE(review): the first step's array is accumulated into in
            # place -- confirm getOutputData() does not return a shared buffer.
            sensor.queue.appendleft(qD)
            self.network.run(1)
            inferenceValues = self.classifierRegion.getOutputData(
                "categoriesOut")
            if sampleDistances is None:
                sampleDistances = inferenceValues
            else:
                sampleDistances += inferenceValues

        if sampleDistances is None:
            # Nothing was run through the network, so there is nothing to rank.
            return []

        catCount = self.classifierRegion.getParameter("categoryCount")
        distances = sampleDistances[:catCount]
        # numpy.lexsort() treats its LAST key as the primary one: sort by
        # distance first, then by the random values, which breaks ties in a
        # random manner.
        randomValues = numpy.random.random(catCount)
        sortedSamples = numpy.lexsort((randomValues, distances))

        # sortedSamples holds sample indices in ascending-distance order, so
        # each index must be paired with the distance stored at that index.
        return [(sampleId, distances[sampleId]) for sampleId in sortedSamples]

    def reset(self):
        """
        Issue a reset signal to the model. The assumption is that a sequence
        has just ended and a new sequence is about to begin.

        Only learning regions of type 'py.TemporalPoolerRegion' and
        'py.TPRegion' receive a command; all other regions are left alone.
        """
        # TODO: Introduce a consistent reset method name.
        resetCommands = {
            'py.TemporalPoolerRegion': 'reset',
            'py.TPRegion': 'resetSequenceStates',
        }
        for region in self.learningRegions:
            command = resetCommands.get(region.type)
            if command is not None:
                region.executeCommand([command])

    def trainText(self, token, labels, sequenceId=None, reset=0):
        """
        Train the model with the given text token, associated labels, and
        sequence ID.

        @param token      (str)  The text token to train on
        @param labels     (list) A list of one or more integer labels
                                 associated with this token. If the list is
                                 empty, the classifier will not be trained.
        @param sequenceId (int)  An integer ID associated with this token and
                                 its sequence (document).
        @param reset      (int)  Should be 0 or 1. If 1, self.reset() is called
                                 after the token has been run through the
                                 network.
        """
        for learner in self.learningRegions:
            learner.setParameter("learningMode", True)

        # The last argument queued with the token is always 0; the `reset`
        # parameter is only acted on after the run, below.
        self.sensorRegion.getSelf().addDataToQueue(token, labels, sequenceId, 0)
        self.network.run(1)

        # Print the outputs of each region
        if self.verbosity >= 2:
            self.printRegionOutputs()

        if reset == 1:
            self.reset()

    def classifyText(self, token, reset=0):
        """
        Classify the token and return a list of the best classifications.

        @param token    (str)  The text token to classify
        @param reset    (int)  Should be 0 or 1. If 1, assumes we are at the
                               end of a sequence. A reset signal will be issued
                               after the network has been run on this token.

        @return  (numpy array) An array of size numLabels. Position i contains
                               the likelihood that this sample belongs to the
                               i'th category. An array containing all zeros
                               implies no decision could be made.
        """
        for learner in self.learningRegions:
            learner.setParameter("learningMode", False)
            learner.setParameter("inferenceMode", True)

        # Queue the token with a [None] label list and sequence ID -1.
        self.sensorRegion.getSelf().addDataToQueue(token, [None], -1, 0)
        self.network.run(1)

        # Print the outputs of each region
        if self.verbosity >= 2:
            self.printRegionOutputs()

        if reset == 1:
            self.reset()

        categoriesOut = self.classifierRegion.getOutputData("categoriesOut")
        return categoriesOut[0:self.numLabels]

    def printRegionOutputs(self):
        """
        Print the outputs of regions to console for debugging.

        The sensor outputs and the classifier outputs are always printed. The
        TP region data (when a TP region exists) and the classifier inputs are
        added when self.verbosity is 3 or higher. Classifier arrays are
        truncated to the first self.numLabels entries.
        """

        # A print statement ending in a comma suppresses the newline, so each
        # label and the value printed right after it share one line.
        print "================== HTM Debugging output:"
        print "Sensor output:",
        print self.sensorRegion.getOutputData("dataOut").nonzero()
        print "Sensor categoryOut:",
        print self.sensorRegion.getOutputData("categoryOut")

        if self.verbosity >= 3:
            # tpRegion is None when the network config has no TP region.
            if self.tpRegion is not None:
                print "TP region input:",
                print self.tpRegion.getInputData("activeCells").nonzero()
                print "TP region output:",
                print self.tpRegion.getOutputData("mostActiveCells").nonzero()

            print "Classifier bottomUpIn: ",
            print self.classifierRegion.getInputData("bottomUpIn").nonzero()
            print "Classifier categoryIn: ",
            print self.classifierRegion.getInputData(
                "categoryIn")[0:self.numLabels]

        print "Classifier categoriesOut: ",
        print self.classifierRegion.getOutputData(
            "categoriesOut")[0:self.numLabels]
        print "Classifier categoryProbabilitiesOut",
        print self.classifierRegion.getOutputData(
            "categoryProbabilitiesOut")[0:self.numLabels]

    def __getstate__(self):
        """
    Return serializable state.  This function will return a version of the
    __dict__ with data that shouldn't be pickled stripped out. For example,
    Network API instances are stripped out because they have their own
    serialization mechanism.

    See also: _serializeExtraData()
    """
        state = self.__dict__.copy()
        # Remove member variables that we can't pickle
        state.pop("network")
        state.pop("sensorRegion")
        state.pop("classifierRegion")
        state.pop("tpRegion")
        state.pop("learningRegions")
        state.pop("networkDataGen")

        return state

    def _serializeExtraData(self, extraDataDir):
        """
        Serialization hook called with an external directory path. Overridden
        here to write the Network API instance to "network.nta" inside that
        directory.

        @param extraDataDir (string) Model's extra data directory path
        """
        networkPath = os.path.join(extraDataDir, "network.nta")
        self.network.save(networkPath)

    def _deSerializeExtraData(self, extraDataDir):
        """
        Deserialization hook (called after __setstate__) that receives an
        external directory path. Overridden here to reload the Network API
        instance from "network.nta" and to rebuild the members that
        __getstate__ leaves out of the pickled state.

        @param extraDataDir (string) Model's extra data directory path
        """
        networkPath = os.path.join(extraDataDir, "network.nta")
        self.network = Network(networkPath)
        # Region references are derived from the freshly loaded network.
        self._initializeRegionHelpers()
        self.networkDataGen = NetworkDataGenerator()