def __init__(self, corpus, numBest = None, dtype = numpy.float32, numFeatures = None):
    """
    Build a dense matrix holding one unit-length row vector per corpus document.

    If `numBest` is left unspecified, similarity queries return a full list (one
    float for every document in the corpus, including the query document).

    If `numBest` is set, queries return `numBest` most similar documents, as a
    sorted list:

    >>> sms = MatrixSimilarity(corpus, numBest = 3)
    >>> sms[vec12]
    [(12, 1.0), (30, 0.95), (5, 0.45)]

    Parameters:

    * `corpus` -- documents to index; must support both `len()` and iteration.
      Each document is expanded with `matutils.sparse2full` and then passed
      through `matutils.unitVec` before being stored.
    * `numBest` -- stored as `self.numBest`; see above.
    * `dtype` -- element type of the allocated matrix.
    * `numFeatures` -- number of matrix columns. When None, it is set to
      `1 + utils.getMaxId(corpus)`.

    On return, `self.corpus` is a `numpy.matrix` of shape
    `(len(corpus), numFeatures)` and `self.normalize` is True.
    """
    # The corpus length is needed for logging and allocation; compute it once.
    numDocs = len(corpus)

    if numFeatures is None:
        logging.info("scanning corpus of %i documents to determine the number of features", numDocs)
        numFeatures = 1 + utils.getMaxId(corpus)

    # Arguments are handed to logging unformatted so that the message is only
    # built when the record is actually emitted.
    logging.info("creating matrix for %i documents and %i features", numDocs, numFeatures)
    self.numFeatures = numFeatures
    self.numBest = numBest
    # Storage is explicitly column-major (Fortran order).
    self.corpus = numpy.empty(shape = (numDocs, numFeatures), dtype = dtype, order = 'F')
    self.normalize = True

    # iterate over corpus, populating the numpy matrix
    for docNo, vector in enumerate(corpus):
        if docNo % 1000 == 0:
            logging.info("PROGRESS: at document #%i/%i", docNo, numDocs)
        self.corpus[docNo] = matutils.unitVec(matutils.sparse2full(vector, numFeatures))

    self.corpus = numpy.asmatrix(self.corpus)
def __init__(self, corpus, numBest=None, dtype=numpy.float32, numFeatures=None):
    """
    Build a dense matrix holding one unit-length row vector per corpus document.

    If `numBest` is left unspecified, similarity queries return a full list (one
    float for every document in the corpus, including the query document).

    If `numBest` is set, queries return `numBest` most similar documents, as a
    sorted list:

    >>> sms = MatrixSimilarity(corpus, numBest = 3)
    >>> sms[vec12]
    [(12, 1.0), (30, 0.95), (5, 0.45)]

    Parameters:

    * `corpus` -- documents to index; must support both `len()` and iteration.
      Each document is expanded with `matutils.sparse2full` and then passed
      through `matutils.unitVec` before being stored. `None` is accepted and
      handled exactly like an empty corpus (a matrix with zero rows).
    * `numBest` -- stored as `self.numBest`; see above.
    * `dtype` -- element type of the allocated matrix.
    * `numFeatures` -- number of matrix columns. When None, it is set to
      `1 + utils.getMaxId(...)` of the documents.

    On return, `self.corpus` is a `numpy.matrix` of shape
    `(number of documents, numFeatures)` and `self.normalize` is True.
    """
    # Resolve a missing corpus before anything calls len() on it; checking for
    # None only after the length has been taken could never have any effect.
    documents = [] if corpus is None else corpus
    numDocs = len(documents)

    if numFeatures is None:
        logging.info("scanning corpus of %i documents to determine the number of features", numDocs)
        numFeatures = 1 + utils.getMaxId(documents)

    # Arguments are handed to logging unformatted so that the message is only
    # built when the record is actually emitted.
    logging.info("creating matrix for %i documents and %i features", numDocs, numFeatures)
    self.numFeatures = numFeatures
    self.numBest = numBest
    self.corpus = numpy.empty(shape=(numDocs, numFeatures), dtype=dtype)
    self.normalize = True

    # iterate over corpus, populating the numpy matrix
    for docNo, vector in enumerate(documents):
        if docNo % 1000 == 0:
            logging.info("PROGRESS: at document #%i/%i", docNo, numDocs)
        self.corpus[docNo] = matutils.unitVec(matutils.sparse2full(vector, numFeatures))

    self.corpus = numpy.asmatrix(self.corpus)