def __init__(self, corpus, numBest=None, dtype=numpy.float32, numFeatures=None):
    """
    Build a dense in-memory matrix out of `corpus`, one row per document.

    If `numBest` is left unspecified, similarity queries return a full list (one
    float for every document in the corpus, including the query document).

    If `numBest` is set, queries return `numBest` most similar documents, as a
    sorted list:

    >>> sms = MatrixSimilarity(corpus, numBest = 3)
    >>> sms[vec12]
    [(12, 1.0), (30, 0.95), (5, 0.45)]

    Parameters:
      corpus      -- iterable of documents that also supports `len()`; every
                     document is passed to `matutils.sparse2full`.
      numBest     -- stored as `self.numBest`; see above.
      dtype       -- numpy dtype of the matrix (default `numpy.float32`).
      numFeatures -- number of matrix columns. When None, it is computed as
                     `1 + utils.getMaxId(corpus)`, which requires an extra
                     call over the corpus.

    The resulting matrix is stored in `self.corpus` as a `numpy.matrix` of
    shape `(len(corpus), numFeatures)`.
    """
    # Ask for the length only once; it is needed for the matrix shape and for
    # every progress message below.
    numDocs = len(corpus)

    if numFeatures is None:
        logging.info("scanning corpus of %i documents to determine the number of features",
                     numDocs)
        numFeatures = 1 + utils.getMaxId(corpus)

    logging.info("creating matrix for %i documents and %i features", numDocs, numFeatures)
    self.numFeatures = numFeatures
    self.numBest = numBest
    self.normalize = True  # NOTE(review): presumably read by the query code -- not visible here

    # Column-major ('F') storage, as in the original implementation. The matrix
    # is allocated uninitialised: each row is overwritten in the loop below.
    index = numpy.empty(shape=(numDocs, numFeatures), dtype=dtype, order='F')

    # iterate over corpus, populating the numpy matrix
    for docNo, vector in enumerate(corpus):
        if docNo % 1000 == 0:
            logging.info("PROGRESS: at document #%i/%i", docNo, numDocs)
        # presumably: densify the document to `numFeatures` entries, then scale
        # it to unit length -- confirm against `matutils`
        index[docNo] = matutils.unitVec(matutils.sparse2full(vector, numFeatures))

    self.corpus = numpy.asmatrix(index)
示例#2
0
    def __init__(self,
                 corpus,
                 numBest=None,
                 dtype=numpy.float32,
                 numFeatures=None):
        """
        Build a dense in-memory matrix out of `corpus`, one row per document.

        If `numBest` is left unspecified, similarity queries return a full list (one
        float for every document in the corpus, including the query document).

        If `numBest` is set, queries return `numBest` most similar documents, as a
        sorted list:

        >>> sms = MatrixSimilarity(corpus, numBest = 3)
        >>> sms[vec12]
        [(12, 1.0), (30, 0.95), (5, 0.45)]

        Parameters:
          corpus      -- iterable of documents that also supports `len()`; every
                         document is passed to `matutils.sparse2full`. May be
                         None, in which case a matrix with zero rows is created
                         and `numFeatures` must be given explicitly.
          numBest     -- stored as `self.numBest`; see above.
          dtype       -- numpy dtype of the matrix (default `numpy.float32`).
          numFeatures -- number of matrix columns. When None, it is computed as
                         `1 + utils.getMaxId(corpus)`.

        Raises:
          ValueError -- if both `corpus` and `numFeatures` are None, because the
                        number of columns cannot be determined.

        The resulting matrix is stored in `self.corpus` as a `numpy.matrix` of
        shape `(number of documents, numFeatures)`.
        """
        if corpus is None:
            # The original guarded only the fill loop against a missing corpus
            # but called len(corpus) before reaching it, so None always crashed.
            # Treat None as "no documents" consistently.
            if numFeatures is None:
                raise ValueError(
                    "numFeatures must be specified when corpus is None")
            numDocs = 0
        else:
            numDocs = len(corpus)
            if numFeatures is None:
                logging.info(
                    "scanning corpus of %i documents to determine the number of features",
                    numDocs)
                numFeatures = 1 + utils.getMaxId(corpus)

        logging.info("creating matrix for %i documents and %i features",
                     numDocs, numFeatures)
        self.numFeatures = numFeatures
        self.numBest = numBest
        self.normalize = True  # NOTE(review): presumably read by the query code -- not visible here

        # Allocated uninitialised: each row is overwritten in the loop below.
        index = numpy.empty(shape=(numDocs, numFeatures), dtype=dtype)

        if corpus is not None:
            # iterate over corpus, populating the numpy matrix
            for docNo, vector in enumerate(corpus):
                if docNo % 1000 == 0:
                    logging.info("PROGRESS: at document #%i/%i", docNo, numDocs)
                # presumably: densify the document to `numFeatures` entries,
                # then scale it to unit length -- confirm against `matutils`
                index[docNo] = matutils.unitVec(
                    matutils.sparse2full(vector, numFeatures))

        self.corpus = numpy.asmatrix(index)