Example #1
    def test_simple_lists_of_tuples(self):
        # test simple lists of tuples

        # one document, one word
        potentialCorpus = [[(0, 4.)]]
        result = utils.is_corpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        # one document, several words
        potentialCorpus = [[(0, 4.), (1, 2.)]]
        result = utils.is_corpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        potentialCorpus = [[(0, 4.), (1, 2.), (2, 5.), (3, 8.)]]
        result = utils.is_corpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        # several documents, one word
        potentialCorpus = [[(0, 4.)], [(1, 2.)]]
        result = utils.is_corpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)

        potentialCorpus = [[(0, 4.)], [(1, 2.)], [(2, 5.)], [(3, 8.)]]
        result = utils.is_corpus(potentialCorpus)
        expected = (True, potentialCorpus)
        self.assertEqual(expected, result)
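For reference, a minimal sketch of what `utils.is_corpus` reports, assuming only that gensim is installed: it returns a `(flag, obj)` pair whose flag is False for a single bag-of-words document and True for an iterable of such documents.

from gensim import utils

single_doc = [(0, 4.0), (1, 2.0)]       # one document: list of (id, weight) tuples
tiny_corpus = [[(0, 4.0)], [(1, 2.0)]]  # two documents: a corpus

print(utils.is_corpus(single_doc))   # (False, [(0, 4.0), (1, 2.0)])
print(utils.is_corpus(tiny_corpus))  # (True, [[(0, 4.0)], [(1, 2.0)]])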
Example #2
File: esamodel.py Project: eric011/nyan
    def __getitem__(self, bow, eps=1e-12):
        """
        Return esa representation of the input vector and/or corpus.
        
        bow should already be weights, e.g. with TF-IDF
        """
        # if the input vector is in fact a corpus, return a transformed corpus 
        # as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # use corpus as interpreter matrix
        # simply multiply feature vector of input with corpus matrix
        # to get the weight of the concept
        vector = numpy.dot(matutils.sparse2full(bow, self.num_features), self.corpus)

        # normalize
        vector = matutils.unitvec(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(concept_id, weight)
                  for concept_id, weight
                  in enumerate(vector)
                  if abs(weight) > eps]
        return vector
Example #3
    def __getitem__(self, bow):
        """Get log entropy representation of the input vector and/or corpus.

        Parameters
        ----------
        bow : list of (int, int)
            Document in BoW format.

        Returns
        -------
        list of (int, float)
            Log-entropy vector for passed `bow`.

        """
        # if the input vector is in fact a corpus, return a transformed corpus
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge)
        vector = [
            (term_id, math.log(tf + 1) * self.entr.get(term_id))
            for term_id, tf in bow
            if term_id in self.entr
        ]
        if self.normalize:
            vector = matutils.unitvec(vector)
        return vector
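A hedged usage sketch of the transformation pattern above, assuming a recent gensim where `LogEntropyModel` and the `common_texts` fixture exist; the same `model[...]` call handles a single document and a whole corpus.

from gensim.corpora import Dictionary
from gensim.models import LogEntropyModel
from gensim.test.utils import common_texts

dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

model = LogEntropyModel(corpus)
print(model[corpus[0]])     # single document -> [(term_id, log-entropy weight), ...]
print(list(model[corpus]))  # corpus -> one transformed vector per document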
Example #4
    def top_topics(self, corpus, num_words=20):
        """
        Calculate the Umass topic coherence for each topic. Algorithm from
        **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence in Topic Models, EMNLP 2011.**
        """
        is_corpus, corpus = utils.is_corpus(corpus)
        if not is_corpus:
            logger.warning("LdaModel.top_topics() called with an empty corpus")
            return

        topics = []
        str_topics = []
        for topic in self.state.get_lambda():
            topic = topic / topic.sum()  # normalize to probability distribution
            bestn = matutils.argsort(topic, topn=num_words, reverse=True)
            topics.append(bestn)
            beststr = [(topic[id], self.id2word[id]) for id in bestn]
            str_topics.append(beststr)

        # top_ids are limited to every topic's top words and should not exceed the
        # vocabulary size.
        top_ids = set(chain.from_iterable(topics))

        # create a document occurrence set for each top word
        doc_word_list = {}
        for id in top_ids:
            id_list = set()
            for n, document in enumerate(corpus):
                if id in frozenset(x[0] for x in document):
                    id_list.add(n)

            doc_word_list[id] = id_list

        coherence_scores = []
        for t, top_words in enumerate(topics):
            # Calculate each coherence score C(t, top_words)
            coherence = 0.0
            # Sum of top words m=2..M
            for m in top_words[1:]:
                # m_docs is v_m^(t)
                m_docs = doc_word_list[m]

                # Sum of top words l=1..m-1
                # i.e., all words ranked higher than the current word m
                for l in top_words[:m - 1]:
                    # l_docs is v_l^(t)
                    l_docs = doc_word_list[l]

                    # make sure this word appears in some documents.
                    if len(l_docs) > 0:
                        # co_doc_frequency is D(v_m^(t), v_l^(t))
                        co_doc_frequency = len(m_docs.intersection(l_docs))

                        # add to the coherence sum for these two words m, l
                        coherence += numpy.log((co_doc_frequency + 1.0) / len(l_docs))

            coherence_scores.append((str_topics[t], coherence))

        top_topics = sorted(coherence_scores, key=lambda t: t[1], reverse=True)
        return top_topics
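The inner loop above accumulates the UMass pairwise score `log((D(m, l) + 1) / D(l))`. A tiny standalone sketch with made-up document sets illustrates one term of that sum:

import numpy

docs_with_l = {0, 1, 2, 3}  # documents containing the higher-ranked word l
docs_with_m = {1, 3}        # documents containing word m
co_doc_frequency = len(docs_with_m & docs_with_l)              # D(v_m, v_l) = 2
term = numpy.log((co_doc_frequency + 1.0) / len(docs_with_l))  # log(3/4) ~ -0.288
print(term)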
Example #5
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document or collection of documents.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarity matrix.

        """
        if not self.corpus:
            return numpy.array([])

        is_corpus, query = utils.is_corpus(query)
        if not is_corpus and isinstance(query, numpy.ndarray):
            query = [self.corpus[i] for i in query]  # convert document indexes to actual documents
        result = self.similarity_matrix.inner_product(query, self.corpus, normalized=True)

        if scipy.sparse.issparse(result):
            return numpy.asarray(result.todense())
        if numpy.isscalar(result):
            return numpy.array(result)
        return numpy.asarray(result)[0]
Example #6
    def __getitem__(self, bow, eps=1e-12):
        """
        Return esa representation of the input vector and/or corpus.

        bow should already be weights, e.g. with TF-IDF
        """
        # if the input vector is in fact a corpus, return a transformed corpus
        # as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # use similarity index to calculate similarity with each vector of corpus
        vector = self.similarity_index[bow]

        # cosine similarity is in [-1, 1]; shift and scale to make it [0, 1]
        vector += 1
        vector /= 2

        # normalize
        vector = matutils.unitvec(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(concept_id, weight)
                  for concept_id, weight
                  in enumerate(vector)
                  if abs(weight) > eps]
        return vector
Example #7
    def _getbow(self, doc):
        # if doc is an iterable apply to all
        is_corpus, doc = utils.is_corpus(doc)
        if is_corpus:
            return SimpleCorpus(self._apply(doc))

        return self.dict.doc2bow(doc, allow_update=True)
Example #8
    def __getitem__(self, bow, iterations=100):
        """Get vector for document(s).

        Parameters
        ----------
        bow : {list of (int, int), iterable of list of (int, int)}
            Document (or corpus) in BoW format.
        iterations : int, optional
            Number of iterations that will be used for inferring.

        Returns
        -------
        list of (int, float)
            LDA vector for document as sequence of (topic_id, topic_probability) **OR**
        list of list of (int, float)
            LDA vectors for corpus in same format.

        """
        is_corpus, corpus = utils.is_corpus(bow)
        if not is_corpus:
            # query is a single document => make a corpus out of it
            bow = [bow]

        self.convert_input(bow, infer=True)
        cmd = \
            self.mallet_path + ' infer-topics --input %s --inferencer %s ' \
                               '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
        cmd = cmd % (
            self.fcorpusmallet() + '.infer', self.finferencer(),
            self.fdoctopics() + '.infer', iterations, self.topic_threshold
        )
        logger.info("inferring topics with MALLET LDA '%s'", cmd)
        check_output(args=cmd, shell=True)
        result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
        return result if is_corpus else result[0]
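A hedged usage sketch of the wrapper above; the MALLET path is a placeholder assumption, and `LdaMallet` is imported from its gensim 3.x location (`gensim.models.wrappers`).

from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet  # gensim 3.x location; moved out of core in 4.x
from gensim.test.utils import common_texts

dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

mallet_path = '/path/to/mallet'  # placeholder: requires a local MALLET installation
lda = LdaMallet(mallet_path, corpus=corpus, num_topics=2, id2word=dictionary)

print(lda[corpus[0]])     # single document -> [(topic_id, probability), ...]
print(list(lda[corpus]))  # corpus -> one topic vector per document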
Example #9
    def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
        """
        Return topic distribution for the given document `bow`, as a list of
        (topic_id, topic_probability) 2-tuples.

        Ignore topics with very low probability (below `minimum_probability`).

        If `per_word_topics` is True, it also returns a list of topics, sorted in descending order of the most likely topics for each word,
        as well as a list of word_ids and each word's corresponding topics' phi_values, multiplied by the feature length (i.e., word count).

        """
        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)  # never allow zero values in sparse output

        if minimum_phi_value is None:
            minimum_phi_value = self.minimum_probability
        minimum_phi_value = max(minimum_phi_value, 1e-8)  # never allow zero values in sparse output

        # if the input vector is a corpus, return a transformed corpus
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            kwargs = dict(
                per_word_topics=per_word_topics,
                minimum_probability=minimum_probability,
                minimum_phi_value=minimum_phi_value
            )
            return self._apply(corpus, **kwargs)

        gamma, phis = self.inference([bow], collect_sstats=per_word_topics)
        topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution

        document_topics = [
            (topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
            if topicvalue >= minimum_probability
        ]

        if not per_word_topics:
            return document_topics
        else:
            word_topic = []  # contains word and corresponding topic
            word_phi = []  # contains word and phi values
            for word_type, weight in bow:
                phi_values = []  # contains (phi_value, topic) pairing to later be sorted
                phi_topic = []  # contains topic and corresponding phi value to be returned 'raw' to user
                for topic_id in range(0, self.num_topics):
                    if phis[topic_id][word_type] >= minimum_phi_value:
                        # appends phi values for each topic for that word
                        # these phi values are scaled by feature length
                        phi_values.append((phis[topic_id][word_type], topic_id))
                        phi_topic.append((topic_id, phis[topic_id][word_type]))

                # list with ({word_id => [(topic_0, phi_value), (topic_1, phi_value) ...]).
                word_phi.append((word_type, phi_topic))
                # sorts the topics based on most likely topic
                # returns a list like ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]).
                sorted_phi_values = sorted(phi_values, reverse=True)
                topics_sorted = [x[1] for x in sorted_phi_values]
                word_topic.append((word_type, topics_sorted))
            return (document_topics, word_topic, word_phi)  # returns a 3-tuple
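A hedged usage sketch of `get_document_topics`, assuming a standard `LdaModel` trained on the `common_texts` fixture; with `per_word_topics=True` the call returns the 3-tuple described above.

from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import common_texts

dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]
lda = LdaModel(corpus, id2word=dictionary, num_topics=2, random_state=0)

bow = corpus[0]
print(lda.get_document_topics(bow))  # [(topic_id, probability), ...]
doc_topics, word_topics, word_phis = lda.get_document_topics(bow, per_word_topics=True)
print(word_topics)  # [(word_id, [most_likely_topic_id, ...]), ...]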
Example #10
    def __getitem__(self, bow, eps=0.01):
        """Convert document or corpus in BoW format to LDA vectors in BoW format

        Parameters
        ----------
        bow : {list of (int, int), iterable of list of (int, int)}
            Document or corpus in BoW format.
        eps : float
            Threshold value (all topics with probability < `eps` will be ignored).

        Returns
        -------
        list of (int, float)
            LDA vector for document **OR**
        list of list of (int, float)
            LDA vectors for corpus.

        """
        is_corpus, dummy_corpus = utils.is_corpus(bow)
        if not is_corpus:
            bow = [bow]

        predictions = self._predict(bow)[0]

        topics = []
        for row in predictions:
            row_topics = []
            for topic_id, val in enumerate(row):
                if val > eps:
                    row_topics.append((topic_id, val))
            topics.append(row_topics)

        return topics if is_corpus else topics[0]
Example #11
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms)

        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T # (x^T * u).T = u^-1 * x
        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist # s^-1 * u^-1 * x

        # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist.flat)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
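A hedged usage sketch of folding documents into the LSI space, assuming a standard `LsiModel`; the `scaled` and `chunksize` extras of `__getitem__` are rarely passed directly.

from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.test.utils import common_texts

dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]
lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)

print(lsi[corpus[0]])     # single document -> [(topic_id, weight), ...]
print(list(lsi[corpus]))  # corpus -> transformed corpus, processed in chunks internally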
Example #12
File: docsim.py Project: Dieterbe/gensim
    def get_similarities(self, query):
        """
        Return similarity of sparse vector `query` to all documents in the corpus,
        as a numpy array.

        If `query` is a collection of documents, return a 2D array of similarities
        of each document in `query` to all documents in the corpus (=batch query,
        faster than processing each document in turn).

        **Do not use this function directly; use the self[query] syntax instead.**

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.T # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    query.shape = (1, len(query))
                query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
            else:
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc() # N x T * T x C = N x C
        if result.shape[1] == 1:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
        else:
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result
Example #13
    def __getitem__(self, bow, chunksize=10000):
        #ln.debug("getitem: %s" % chunksize)
        is_corpus, bow = utils.is_corpus(bow)
        if not is_corpus:
            bow = [bow]

        ln.info("Computing hidden representation for %s documents..." % len(bow))

        if not chunksize:  # todo I think could be removed altogether
            chunksize = 1

        def transformed_corpus():
            for chunk_no, doc_chunk in enumerate(utils.grouper(bow, chunksize)):
                ln.debug("Converting chunk %s to csc format.." % chunk_no)
                chunk = matutils.corpus2csc(doc_chunk, self.input_dimensionality)
                ln.debug("Computing hidden representation for chunk.. ")
                hidden = self._get_hidden_representations(chunk)
                ln.info("Finished computing representation for chunk %s, yielding results. Total docs processed: %s" %
                        (chunk_no, chunk_no * chunksize + len(doc_chunk)))
                for column in hidden.T:
                    yield matutils.dense2vec(column.T)
                ln.debug("Done yielding chunk %s" % chunk_no)

            ln.info("Finished computing representations for all chunks.")

        if not is_corpus:
            res = list(transformed_corpus()).pop()
            return res
        else:
            return transformed_corpus()
Example #14
    def serialize(filename_prefix, layer, current_representation, num_terms=None, chunksize=10000):
        is_corpus, current_representation = utils.is_corpus(current_representation)
        if is_corpus:
            for chunk_no, chunk in enumerate(utils.grouper(current_representation, chunksize)):
                ln.debug("preparing chunk for conversion (%s documents)..." % len(chunk))
                assert num_terms is not None, "Need num_terms to properly handle sparse corpus format"
                chunk_as_csc = matutils.corpus2csc(chunk, num_terms=num_terms)

                ln.debug("Chunk converted to csc, running through layer..")
                chunk_trans = layer.__getitem__(chunk_as_csc)

                ln.debug("Serializing hidden representation..")
                fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
                np.save(fname, chunk_trans)
                ln.debug("Finished serializing chunk. Processed %s documents so far." %
                         (chunk_no * chunksize + len(chunk)))
        else:
            ln.info("Beginning serialization of non-gensim corpus format intermediate representation.")
            ln.debug("Type of current_representation is %s" % type(current_representation))
            for chunk_no, chunk in enumerate(current_representation):
                ln.debug("converting chunk (%s documents)..." % chunksize)
                chunk_trans = layer.__getitem__(chunk)
                ln.debug("Serializing hidden representation..")
                fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
                np.save(fname, chunk_trans)
                ln.debug("finished serializing chunk.")

        ln.info("Finished serializing all chunks.")
Example #15
File: docsim.py Project: leahic/gensim
    def get_similarities(self, query):
        """
        Return similarity of sparse vector `query` to all documents in the corpus,
        as a numpy array.

        If `query` is a collection of documents, return a 2D array of similarities
        of each document in `query` to all documents in the corpus (=batch query,
        faster than processing each document in turn).

        **Do not use this function directly; use the self[query] syntax instead.**

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = numpy.asarray(
                [matutils.sparse2full(vec, self.num_features) for vec in query],
                dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.toarray()  # convert sparse to dense
            elif isinstance(query, numpy.ndarray):
                pass
            else:
                # default case: query is a single vector in sparse gensim format
                query = matutils.sparse2full(query, self.num_features)
            query = numpy.asarray(query, dtype=self.index.dtype)

        # do a little transposition dance to stop numpy from making a copy of
        # self.index internally in numpy.dot (very slow).
        result = numpy.dot(self.index, query.T).T  # return #queries x #index
        return result  # XXX: removed casting the result from array to list; does anyone care?
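A hedged usage sketch of a dense similarity index, assuming `MatrixSimilarity` over TF-IDF vectors; as the docstrings warn, queries go through `index[...]` rather than `get_similarities` directly.

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity
from gensim.test.utils import common_texts

dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]
tfidf = TfidfModel(corpus)

index = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
sims = index[tfidf[corpus[0]]]                     # one document -> 1D array of similarities
batch = index[[tfidf[doc] for doc in corpus[:3]]]  # batch query -> 2D array (3 x len(corpus))
print(sims.shape, batch.shape)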
Example #16
    def __getitem__(self, bow, eps=1e-12):
        """
        Return tf-idf representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)
        vector = [
            (termid, self.wlocal(tf) * self.idfs.get(termid)) for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0
        ]

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.normalize is True:
            vector = matutils.unitvec(vector)
        elif self.normalize:
            vector = self.normalize(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
        return vector
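A hedged usage sketch of the TF-IDF transformation, assuming a recent gensim and the `common_texts` fixture:

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.test.utils import common_texts

dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

tfidf = TfidfModel(corpus)  # fit IDF weights on the corpus
print(tfidf[corpus[0]])     # single document -> [(term_id, tf-idf weight), ...]
print(list(tfidf[corpus]))  # corpus -> one tf-idf vector per document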
Example #17
File: docsim.py Project: Dieterbe/gensim
    def __getitem__(self, query):
        """Get similarities of document `query` to all documents in the corpus.

        **or**

        If `query` is a corpus (iterable of documents), return a matrix of similarities
        of all query documents vs. all corpus document. This batch query is more
        efficient than computing the similarities one document after another.
        """
        self.close_shard() # no-op if no documents added to index since last query

        results = []
        for shard in self.shards:
            shard.num_best = self.num_best
            shard.normalize = self.normalize
            results.append(shard[query])

        if self.num_best is None:
            return numpy.hstack(results)

        # only top-n most similars requested; merge the partial results from all shards
        is_corpus, results = utils.is_corpus(results)
        if is_corpus:
            # query = single document?
            result = sorted(sum(results, []), key=lambda item: -item[1])[ : self.num_best]
        else:
            result = []
            for parts in itertools.izip(*results):
                merged = sorted(sum(parts, []), key=lambda item: -item[1])[ : self.num_best]
                result.append(merged)
        return result
Example #18
    def __getitem__(self, bow, eps=0.01):
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(corpus)

        gamma, _ = self.inference([bow])
        topic_dist = gamma[0] / sum(gamma[0]) # normalize to proper distribution
        return [topicvalue for topicid, topicvalue in enumerate(topic_dist)]
Example #19
    def __getitem__(self, bow, eps=0.01):
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(corpus)

        gamma = self.inference([bow])[0]
        topic_dist = gamma / sum(gamma) if sum(gamma) != 0 else []
        return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) if topicvalue >= eps]
Example #20
    def __getitem__(self, doc):
        # if doc is an iterable apply to all
        is_corpus, doc = utils.is_corpus(doc)
        if is_corpus:
            return self._apply(doc)

        # return transformed doc according to function
        return self.funct(doc, *self.fargs, **self.fkwargs)
Example #21
File: ldamodel.py Project: polcar/gensim
    def top_topics(self, corpus, num_words=20):
        """
        Calculate the Umass topic coherence for each topic. Algorithm from
        **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence in Topic Models, EMNLP 2011.**
        """
        is_corpus, corpus = utils.is_corpus(corpus)
        if not is_corpus:
            logger.warning("LdaModel.top_topics() called with an empty corpus")
            return

        topics = []
        str_topics = []
        for topic in self.state.get_lambda():
            topic = topic / topic.sum()  # normalize to probability distribution
            bestn = matutils.argsort(topic, topn=num_words, reverse=True)
            topics.append(bestn)
            beststr = [(topic[id], self.id2word[id]) for id in bestn]
            str_topics.append(beststr)

        # top_ids are limited to every topic's top words and should not exceed the
        # vocabulary size.
        top_ids = set(chain.from_iterable(topics))

        # create a document occurrence set for each top word
        doc_word_list = {}
        for id in top_ids:
            id_list = set()
            for n, document in enumerate(corpus):
                if id in frozenset(x[0] for x in document):
                    id_list.add(n)

            doc_word_list[id] = id_list

        coherence_scores = []
        for t, top_words in enumerate(topics):
            # Calculate each coherence score C(t, top_words)
            coherence = 0.0
            # Sum of top words m=2..M
            for m in top_words[1:]:
                # m_docs is v_m^(t)
                m_docs = doc_word_list[m]

                # Sum of top words l=1..m-1
                # i.e., all words ranked higher than the current word m
                for l in top_words[:m - 1]:
                    # l_docs is v_l^(t)
                    l_docs = doc_word_list[l]

                    # co_doc_frequency is D(v_m^(t), v_l^(t))
                    co_doc_frequency = len(m_docs.intersection(l_docs))

                    # add to the coherence sum for these two words m, l
                    coherence += numpy.log((co_doc_frequency + 1.0) / len(l_docs))

            coherence_scores.append((str_topics[t], coherence))

        top_topics = sorted(coherence_scores, key=lambda t: t[1], reverse=True)
        return top_topics
Example #22
    def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
        """
        Return topic distribution for the given document `bow`, as a list of
        (topic_id, topic_probability) 2-tuples.

        Ignore topics with very low probability (below `minimum_probability`).

        If `per_word_topics` is True, it also returns a list of topics, sorted in descending order of the most likely topics for each word,
        as well as a list of word_ids and each word's corresponding topics' phi_values, multiplied by the feature length (i.e., word count).

        """
        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)  # never allow zero values in sparse output

        if minimum_phi_value is None:
            minimum_phi_value = self.minimum_probability
        minimum_phi_value = max(minimum_phi_value, 1e-8)  # never allow zero values in sparse output

        # if the input vector is a corpus, return a transformed corpus
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            kwargs = dict(
                per_word_topics=per_word_topics,
                minimum_probability=minimum_probability,
                minimum_phi_value=minimum_phi_value
            )
            return self._apply(corpus, **kwargs)

        gamma, phis = self.inference([bow], collect_sstats=True)
        topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution

        document_topics = [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
                    if topicvalue >= minimum_probability]

        if not per_word_topics:
            return document_topics
        else:
            word_topic = [] # contains word and corresponding topic
            word_phi = [] # contains word and phi values
            for word_type, weight in bow:
                phi_values = [] # contains (phi_value, topic) pairing to later be sorted
                phi_topic = [] # contains topic and corresponding phi value to be returned 'raw' to user
                for topic_id in range(0, self.num_topics):
                    if phis[topic_id][word_type] >= minimum_phi_value:
                        # appends phi values for each topic for that word
                        # these phi values are scaled by feature length
                        phi_values.append((phis[topic_id][word_type], topic_id))
                        phi_topic.append((topic_id, phis[topic_id][word_type]))

                # list with ({word_id => [(topic_0, phi_value), (topic_1, phi_value) ...]).
                word_phi.append((word_type, phi_topic))
                # sorts the topics based on most likely topic
                # returns a list like ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]).
                sorted_phi_values = sorted(phi_values, reverse=True)
                topics_sorted = [x[1] for x in sorted_phi_values]
                word_topic.append((word_type, topics_sorted))
            return (document_topics, word_topic, word_phi)  # returns a 3-tuple
Example #23
    def __getitem__(self, bow, eps=1e-12):
        """Get the tf-idf representation of an input vector and/or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of iterable of (int, int)}
            Input document in the `sparse Gensim bag-of-words format
            <https://radimrehurek.com/gensim/intro.html#core-concepts>`_,
            or a streamed corpus of such documents.
        eps : float
            Threshold value; all positions with a tf-idf value below `eps` will be removed.

        Returns
        -------
        vector : list of (int, float)
            TfIdf vector, if `bow` is a single document
        :class:`~gensim.interfaces.TransformedCorpus`
            TfIdf corpus, if `bow` is a corpus.

        """
        self.eps = eps
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)

        termid_array, tf_array = [], []
        for termid, tf in bow:
            termid_array.append(termid)
            tf_array.append(tf)

        tf_array = self.wlocal(np.array(tf_array))

        vector = [
            (termid, tf * self.idfs.get(termid))
            for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > self.eps
        ]

        if self.normalize is True:
            self.normalize = matutils.unitvec
        elif self.normalize is False:
            self.normalize = utils.identity

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.pivot is None:
            norm_vector = self.normalize(vector)
            norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps]
        else:
            _, old_norm = self.normalize(vector, return_norm=True)
            pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm
            norm_vector = [
                (termid, weight / float(pivoted_norm))
                for termid, weight in vector
                if abs(weight / float(pivoted_norm)) > self.eps
            ]
        return norm_vector
Example #24
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.
        This is done by folding input document into the latent topic space.
        If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]

        # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
        vec = matutils.corpus2csc(bow,
                                  num_terms=self.num_terms,
                                  dtype=self.projection.u.dtype)
        topic_dist = (
            vec.T *
            self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

        # # convert input to dense, then do dense * dense multiplication
        # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse), but consumes more memory
        # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow))
        # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec)

        # # use np's advanced indexing to simulate sparse * dense
        # # ± same speed again
        # u = self.projection.u[:, :self.num_topics]
        # topic_dist = np.empty((u.shape[1], len(bow)), dtype=u.dtype)
        # for vecno, vec in enumerate(bow):
        #     indices, data = zip(*vec) if vec else ([], [])
        #     topic_dist[:, vecno] = np.dot(u.take(indices, axis=0).T, np.array(data, dtype=u.dtype))

        if not is_corpus:
            # convert back from matrix into a 1d vec
            topic_dist = topic_dist.reshape(-1)

        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]
                          ) * topic_dist  # s^-1 * u^-1 * x

        # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
Example #25
    def __getitem__(self, bow, eps=1e-12):
        """Get the tf-idf representation of an input vector and/or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of iterable of (int, int)}
            Input document in the `sparse Gensim bag-of-words format
            <https://radimrehurek.com/gensim/intro.html#core-concepts>`_,
            or a streamed corpus of such documents.
        eps : float
            Threshold value; all positions with a tf-idf value below `eps` will be removed.

        Returns
        -------
        vector : list of (int, float)
            TfIdf vector, if `bow` is a single document
        :class:`~gensim.interfaces.TransformedCorpus`
            TfIdf corpus, if `bow` is a corpus.

        """
        self.eps = eps
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)

        termid_array, tf_array = [], []
        for termid, tf in bow:
            termid_array.append(termid)
            tf_array.append(tf)

        tf_array = self.wlocal(np.array(tf_array))

        vector = [
            (termid, tf * self.idfs.get(termid))
            for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > self.eps
        ]

        if self.normalize is True:
            self.normalize = matutils.unitvec
        elif self.normalize is False:
            self.normalize = utils.identity

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.pivot is None:
            norm_vector = self.normalize(vector)
            norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps]
        else:
            _, old_norm = self.normalize(vector, return_norm=True)
            pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm
            norm_vector = [
                (termid, weight / float(pivoted_norm))
                for termid, weight in vector
                if abs(weight / float(pivoted_norm)) > self.eps
            ]
        return norm_vector
Example #26
    def test_getitem_dense2gensim(self):

        corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100,
                               dim=self.dim, sparse_serialization=False,
                               gensim=True)

        item = corpus[3]
        self.assertTrue(isinstance(item, list))
        self.assertTrue(isinstance(item[0], tuple))

        dslice = corpus[2:6]
        self.assertTrue(next(dslice) == corpus[2])
        dslice = list(dslice)
        self.assertTrue(isinstance(dslice, list))
        self.assertTrue(isinstance(dslice[0], list))
        self.assertTrue(isinstance(dslice[0][0], tuple))

        iscorp, _ = is_corpus(dslice)
        self.assertTrue(iscorp, "Is the object returned by slice notation "
                                "a gensim corpus?")

        ilist = corpus[[2, 3, 4, 5]]
        self.assertTrue(next(ilist) == corpus[2])
        ilist = list(ilist)
        self.assertTrue(isinstance(ilist, list))
        self.assertTrue(isinstance(ilist[0], list))
        self.assertTrue(isinstance(ilist[0][0], tuple))

        # From generators to lists

        self.assertEqual(len(ilist), len(dslice))
        for i in xrange(len(ilist)):
            self.assertEqual(len(ilist[i]), len(dslice[i]),
                             "Row %d: dims %d/%d" % (i, len(ilist[i]),
                                                     len(dslice[i])))
            for j in xrange(len(ilist[i])):
                self.assertEqual(ilist[i][j], dslice[i][j],
                                 "ilist[%d][%d] = %s ,dslice[%d][%d] = %s" % (
                                     i, j, str(ilist[i][j]), i, j,
                                     str(dslice[i][j])))


        iscorp, _ = is_corpus(ilist)
        self.assertTrue(iscorp, "Is the object returned by list notation "
                                "a gensim corpus?")
Example #27
    def test_getitem_dense2gensim(self):

        corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100,
                               dim=self.dim, sparse_serialization=False,
                               gensim=True)

        item = corpus[3]
        self.assertTrue(isinstance(item, list))
        self.assertTrue(isinstance(item[0], tuple))

        dslice = corpus[2:6]
        self.assertTrue(next(dslice) == corpus[2])
        dslice = list(dslice)
        self.assertTrue(isinstance(dslice, list))
        self.assertTrue(isinstance(dslice[0], list))
        self.assertTrue(isinstance(dslice[0][0], tuple))

        iscorp, _ = is_corpus(dslice)
        self.assertTrue(iscorp, "Is the object returned by slice notation "
                                "a gensim corpus?")

        ilist = corpus[[2, 3, 4, 5]]
        self.assertTrue(next(ilist) == corpus[2])
        ilist = list(ilist)
        self.assertTrue(isinstance(ilist, list))
        self.assertTrue(isinstance(ilist[0], list))
        self.assertTrue(isinstance(ilist[0][0], tuple))

        # From generators to lists

        self.assertEqual(len(ilist), len(dslice))
        for i in xrange(len(ilist)):
            self.assertEqual(len(ilist[i]), len(dslice[i]),
                             "Row %d: dims %d/%d" % (i, len(ilist[i]),
                                                     len(dslice[i])))
            for j in xrange(len(ilist[i])):
                self.assertEqual(ilist[i][j], dslice[i][j],
                                 "ilist[%d][%d] = %s ,dslice[%d][%d] = %s" % (
                                     i, j, str(ilist[i][j]), i, j,
                                     str(dslice[i][j])))


        iscorp, _ = is_corpus(ilist)
        self.assertTrue(iscorp, "Is the object returned by list notation "
                                "a gensim corpus?")
Example #28
    def __getitem__(self, doc):
        # if doc is an iterable apply to all
        is_corpus, doc = utils.is_corpus(doc)
        if is_corpus:
            return self._apply(doc)

        self.counter.update(doc)

        return doc
Example #29
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.

        If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).

        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]

        # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

        # # convert input to dense, then do dense * dense multiplication
        # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse),
        # but consumes more memory
        # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow))
        # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec)

        # # use np's advanced indexing to simulate sparse * dense
        # # ± same speed again
        # u = self.projection.u[:, :self.num_topics]
        # topic_dist = np.empty((u.shape[1], len(bow)), dtype=u.dtype)
        # for vecno, vec in enumerate(bow):
        #     indices, data = zip(*vec) if vec else ([], [])
        #     topic_dist[:, vecno] = np.dot(u.take(indices, axis=0).T, np.array(data, dtype=u.dtype))

        if not is_corpus:
            # convert back from matrix into a 1d vec
            topic_dist = topic_dist.reshape(-1)

        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

        # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
Example #30
    def __getitem__(self, doc):
        # if doc is an iterable apply to all
        is_corpus, doc = utils.is_corpus(doc)
        if is_corpus:
            return self._apply(doc)

        # applying the transformation: return doc as a bag-of-bitokens list
        allow_update = len(self.bidict) == 0
        return self.bidict.doc2bob(doc, allow_update)
示例#31
0
    def __getitem__(self, bow, eps=0.01):
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(corpus)

        gamma, _ = self.inference([bow])
        topic_dist = gamma[0] / sum(
            gamma[0])  # normalize to proper distribution
        return [topicvalue for topicid, topicvalue in enumerate(topic_dist)]
Example #32
    def __getitem__(self, item):

        iscorpus, _ = is_corpus(item)

        if iscorpus or isinstance(item, DatasetABC):
            return self._apply(item)
        else:
            raise ValueError('Cannot apply flatten_composite to individual '
                             'documents.')
Example #33
    def __getitem__(self, bow, eps=0.01):
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(corpus)

        gamma = self.inference([bow])[0]
        topic_dist = gamma / sum(gamma) if sum(gamma) != 0 else []
        return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
                if topicvalue >= eps]
Example #34
    def __getitem__(self, vec, eps=1e-12):
        is_corpus, vec = utils.is_corpus(vec)
        if is_corpus:
            return self._apply(vec)

        if self.L1:
            score = sum(v for _, v in vec) / len(vec) if vec else 0
        else:
            score = sum(v * v for _, v in vec) / len(vec) if vec else 0
        return score
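A standalone numeric sketch of the scoring rule above, using a made-up sparse vector: the score is the mean weight in L1 mode, or the mean squared weight otherwise.

vec = [(0, 0.5), (3, 1.5), (7, 2.0)]  # sparse (id, weight) vector

l1_score = sum(v for _, v in vec) / len(vec)      # (0.5 + 1.5 + 2.0) / 3 = 1.33...
l2_score = sum(v * v for _, v in vec) / len(vec)  # (0.25 + 2.25 + 4.0) / 3 = 2.16...
print(l1_score, l2_score)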
Example #35
    def __getitem__(self, items):
        is_corpus, items = utils.is_corpus(items)

        if not is_corpus:
            v = self._get_vector_representation(items)
            return sparse2full(v, self.size)
        else:
            return list(
                map(lambda v: sparse2full(v, self.size),
                    self._get_vector_representation(items)))
Example #36
    def __getitem__(self, bow):
        """
        Return representation with the ids transformed.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new)
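A minimal standalone sketch of the id-remapping pattern above, with a hypothetical `old2new` mapping: unknown ids are dropped and the survivors are re-sorted by their new ids.

old2new = {10: 0, 42: 1, 7: 2}          # hypothetical old-id -> new-id mapping
bow = [(42, 3.0), (99, 1.0), (7, 2.0)]  # (old_id, weight) pairs; id 99 is unknown

remapped = sorted((old2new[oldid], weight) for oldid, weight in bow if oldid in old2new)
print(remapped)  # [(1, 3.0), (2, 2.0)]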
Example #37
  def __getitem__(self, vec, eps=1e-12):
    is_corpus, vec = utils.is_corpus(vec)
    if is_corpus:
      return self._apply(vec)

    if self.L1:
      score = sum( v  for _, v in vec) / len(vec) if vec else 0
    else:
      score = sum(v*v for _, v in vec) / len(vec) if vec else 0
    return score
Example #38
    def __getitem__(self, bow):
        """
        Return representation with the ids transformed.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new)
Example #39
    def __getitem__(self, item):

        iscorpus, _ = is_corpus(item)

        if iscorpus:
            return self._apply(item)
        else:
            #raise ValueError('Cannot apply serializer to individual documents.')
            # Will this work?
            return self.serialized_data[item]
Example #40
File: ldamodel.py Project: zyenge/gensim
    def top_topics(self, corpus, num_topics=5, num_words=20):
        """
        Calculate the Umass topic coherence for each topic and return
        the top num_topics. Algorithm from
        **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence in Topic Models, EMNLP 2011.**
        """
        if num_topics < 0 or num_topics >= self.num_topics:
            if self.num_topics >= 5:
                num_topics = 5
            else:
                num_topics = self.num_topics
            logger.warning("num_topics out of range - setting to default of 5")
        is_corpus, corpus = utils.is_corpus(corpus)
        if not is_corpus:
            logger.warning("LdaModel.top_topics() called with an empty corpus")
            return

        coherence_scores = []
        topics = []
        str_topics = []
        for topic in self.state.get_lambda():
            topic = topic / topic.sum()  # normalize to probability dist
            bestn = np.argsort(topic)[::-1][:num_words]
            topics.append(bestn)
            beststr = [(topic[id], self.id2word[id]) for id in bestn]
            str_topics.append(beststr)
        top_id = chain.from_iterable(topics)
        top_id = list(set(top_id))

        doc_word_list = {}
        for id in top_id:
            id_list = []
            for document in range(len(corpus)):
                if len(list(ifilter(lambda x: x[0] == id,
                                    corpus[document]))) > 0:
                    id_list.append(document)
            if len(id_list) > 0:
                doc_word_list[id] = id_list

        for topic in xrange(len(topics)):
            topic_coherence_sum = 0.0
            for word_m in topics[topic][1:]:
                doc_frequency_m = len(doc_word_list[word_m])
                m_set = set(doc_word_list[word_m])
                for word_l in topics[topic][:-1]:
                    l_set = set(doc_word_list[word_l])
                    co_doc_frequency = len(m_set.intersection(l_set))
                    topic_coherence_sum += numpy.log(
                        (co_doc_frequency + 1.0) / doc_frequency_m)
            coherence_scores.append((str_topics[topic], topic_coherence_sum))

        top_topics = sorted(coherence_scores,
                            key=lambda tup: tup[1],
                            reverse=True)[0:num_topics - 1]
        return top_topics
Example #41
    def __getitem__(self, bow, iterations=100):
        is_corpus, corpus = utils.is_corpus(bow)
        if not is_corpus:
            bow = [bow]

        self.convert_input(bow, infer=True)
        cmd = self.mallet_path + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s"
        cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(), self.fdoctopics() + '.infer', iterations)
        logger.info("inferring with Mallet LDA with %s" % cmd)
        call(cmd, shell=True)
        return list(read_doctopics(self.fdoctopics() + '.infer'))
Example #42
    def __getitem__(self, bow):
        """Get random-projection representation of the input vector or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of list of (int, int)}
            Input document or corpus.

        Returns
        -------
        list of (int, float)
            if `bow` is document OR
        :class:`~gensim.interfaces.TransformedCorpus`
            if `bow` is corpus.

        Examples
        --------

        .. sourcecode:: pycon

            >>> from gensim.models import RpModel
            >>> from gensim.corpora import Dictionary
            >>> from gensim.test.utils import common_texts
            >>>
            >>> dictionary = Dictionary(common_texts)  # fit dictionary
            >>> corpus = [dictionary.doc2bow(text) for text in common_texts]  # convert texts to BoW format
            >>>
            >>> model = RpModel(corpus, id2word=dictionary)  # fit model
            >>>
            >>> # apply model to document, result is vector in BoW format, i.e. [(1, 0.3), ... ]
            >>> result = model[corpus[0]]

        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        if getattr(self, 'freshly_loaded', False):
            # This is a hack to work around a bug in np, where a FORTRAN-order array
            # unpickled from disk segfaults on using it.
            self.freshly_loaded = False
            self.projection = self.projection.copy(
                'F')  # simply making a fresh copy fixes the broken array

        vec = matutils.sparse2full(bow, self.num_terms).reshape(
            self.num_terms, 1) / np.sqrt(self.num_topics)
        vec = np.asfortranarray(vec, dtype=np.float32)
        topic_dist = np.dot(self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
        return [
            (topicid, float(topicvalue))
            for topicid, topicvalue in enumerate(topic_dist.flat)
            if np.isfinite(topicvalue) and not np.allclose(topicvalue, 0.0)
        ]
Example #43
    def __getitem__(self, bow, iterations=100):
        is_corpus, corpus = utils.is_corpus(bow)
        if not is_corpus:
            bow = [bow]

        self.convert_input(bow, infer=True)
        cmd = self.mallet_path + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s"
        cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(),
                     self.fdoctopics() + '.infer', iterations)
        logger.info("inferring with Mallet LDA with %s" % cmd)
        call(cmd, shell=True)
        return list(read_doctopics(self.fdoctopics() + '.infer'))
Example #44
    def __getitem__(self, items):
        """
        Return random vector(s).
        :param items:
        :return:
        """
        is_corpus, items = utils.is_corpus(items)

        if not is_corpus:
            return np.random.random(self.size)
        else:
            return list(map(lambda v: np.random.random(self.size), items))
Example #45
    def __getitem__(self, query):
        """Get similarities of document `query` to all documents in the corpus.

        **or**

        If `query` is a corpus (iterable of documents), return a matrix of similarities
        of all query documents vs. all corpus document. This batch query is more
        efficient than computing the similarities one document after another.
        """
        self.close_shard()  # no-op if no documents added to index since last query

        # reset num_best and normalize parameters, in case they were changed dynamically
        for shard in self.shards:
            shard.num_best = self.num_best
            shard.normalize = self.norm

        # there are 4 distinct code paths, depending on whether input `query` is
        # a corpus (or numpy/scipy matrix) or a single document, and whether the
        # similarity result should be a full array or only num_best most similar
        # documents.
        pool, shard_results = self.query_shards(query)
        if self.num_best is None:
            # user asked for all documents => just stack the sub-results into a single matrix
            # (works for both corpus / single doc query)
            result = numpy.hstack(shard_results)
        else:
            # the following uses a lot of lazy evaluation and (optionally) parallel
            # processing, to improve query latency and minimize memory footprint.
            offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])
            convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]
            is_corpus, query = utils.is_corpus(query)
            is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
            if not is_corpus:
                # user asked for num_best most similar and query is a single doc
                results = (convert(result, shard_no) for shard_no, result in enumerate(shard_results))
                result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])
            else:
                # the trickiest combination: returning num_best results when query was a corpus
                results = []
                for shard_no, result in enumerate(shard_results):
                    shard_result = [convert(doc, shard_no) for doc in result]
                    results.append(shard_result)
                result = []
                for parts in zip(*results):
                    merged = heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: item[1])
                    result.append(merged)
        if pool:
            # gc doesn't seem to collect the Pools, eventually leading to
            # "IOError 24: too many open files". so let's terminate it manually.
            pool.terminate()

        return result
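A small usage sketch of the sharded `Similarity` index that the method above comes from, reusing the toy `corpus` and `dictionary`; the on-disk prefix is illustrative.

# Sketch: Similarity persists its shards under the given prefix (illustrative path).
from gensim.similarities import Similarity

index = Similarity('/tmp/sim_shards', corpus, num_features=len(dictionary), num_best=2)
print(index[corpus[0]])     # single query -> up to num_best (doc_id, similarity) pairs
print(list(index[corpus]))  # batch query -> one such list per query document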
Example #46
    def __getitem__(self, query):
        """Get similarities of document `query` to all documents in the corpus.

        **or**

        If `query` is a corpus (iterable of documents), return a matrix of similarities
        of all query documents vs. all corpus documents. This batch query is more
        efficient than computing the similarities one document after another.
        """
        self.close_shard()  # no-op if no documents added to index since last query

        # reset num_best and normalize parameters, in case they were changed dynamically
        for shard in self.shards:
            shard.num_best = self.num_best
            shard.normalize = self.normalize

        # there are 4 distinct code paths, depending on whether input `query` is
        # a corpus (or numpy/scipy matrix) or a single document, and whether the
        # similarity result is a full array or only num_best most similar documents.

        if self.num_best is None:
            # user asked for all documents => just stack the sub-results into a single matrix
            # (works for both corpus / single doc query)
            return numpy.hstack([shard[query] for shard in self.shards])

        offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])
        convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim)
                                         for doc_index, sim in doc]
        is_corpus, query = utils.is_corpus(query)
        is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
        if not is_corpus:
            # user asked for num_best most similar and query is a single doc
            results = (convert(shard[query], shard_no)
                       for shard_no, shard in enumerate(self.shards))
            return heapq.nlargest(self.num_best,
                                  itertools.chain(*results),
                                  key=lambda item: item[1])

        # the trickiest combination: returning num_best results when query was a corpus
        shard_results = []
        for shard_no, shard in enumerate(self.shards):
            shard_result = [convert(doc, shard_no) for doc in shard[query]]
            shard_results.append(shard_result)
        result = []
        for parts in zip(*shard_results):
            merged = heapq.nlargest(self.num_best,
                                    itertools.chain(*parts),
                                    key=lambda item: item[1])
            result.append(merged)
        return result
Example #47
    def __getitem__(self, bow, iterations=100):
        is_corpus, corpus = utils.is_corpus(bow)
        if not is_corpus:
            # query is a single document => make a corpus out of it
            bow = [bow]

        self.convert_input(bow, infer=True)
        cmd = self.mallet_path + ' infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
        cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(), self.fdoctopics() + '.infer', iterations, self.topic_threshold)
        logger.info("inferring topics with MALLET LDA '%s'", cmd)
        check_output(args=cmd, shell=True)
        result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
        return result if is_corpus else result[0]
Example #48
    def __getitem__(self, query):
        """Get access to similarities of document/corpus `query` to all documents in the corpus.

        Using :meth:`~gensim.interfaces.SimilarityABC.get_similarities`


        Notes
        -----
        Passing a corpus as `query` (instead of a single document) can be more efficient, because it is processed in batches.

        Parameters
        ----------
        query : {list of (int, int), iterable of list of (int, int)}
            Document or corpus in BoW format.

        Returns
        -------
        {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
            Similarities of the given document or corpus against this index; the return type depends on `query`.

        """
        is_corpus, query = utils.is_corpus(query)
        if self.normalize:
            # self.normalize only works if the input is a plain gensim vector/corpus (as
            # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
            # as well, but in that case assume tricks are happening and don't normalize
            # anything (self.normalize has no effect).
            if not matutils.ismatrix(query):
                if is_corpus:
                    query = [matutils.unitvec(v) for v in query]
                else:
                    query = matutils.unitvec(query)
        result = self.get_similarities(query)

        if self.num_best is None:
            return result

        # if maintain_sparsity is True, result is scipy sparse. Sort, clip the
        # topn and return as a scipy sparse matrix.
        if getattr(self, 'maintain_sparsity', False):
            return matutils.scipy2scipy_clipped(result, self.num_best)

        # if the input query was a corpus (=more documents), compute the top-n
        # most similar for each document in turn
        if matutils.ismatrix(result):
            return [
                matutils.full2sparse_clipped(v, self.num_best) for v in result
            ]
        else:
            # otherwise, return top-n of the single input document
            return matutils.full2sparse_clipped(result, self.num_best)
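The same `__getitem__` is reachable through the simpler in-memory `MatrixSimilarity` index; a hedged sketch with the toy data from the first sketch above.

# Sketch: dense in-memory index; num_best is set dynamically, as the method above allows.
from gensim.similarities import MatrixSimilarity

index = MatrixSimilarity(corpus, num_features=len(dictionary))
print(index[corpus[0]])   # num_best is None -> full array of similarities
index.num_best = 2
print(index[corpus[0]])   # now -> the 2 most similar (doc_id, similarity) pairs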
Example #49
    def __getitem__(self, query):
        """Get similarities of the given document or corpus against this index.

        Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally.

        Notes
        -----
        Passing an entire corpus as `query` can be more efficient than passing its documents one after another,
        because it will issue queries in batches internally.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents.

        Returns
        -------
        {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
            Similarities of the given document or corpus against this index; the return type depends on `query`.

        """
        is_corpus, query = utils.is_corpus(query)
        if self.normalize:
            # self.normalize only works if the input is a plain gensim vector/corpus (as
            # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
            # as well, but in that case assume tricks are happening and don't normalize
            # anything (self.normalize has no effect).
            if not matutils.ismatrix(query):
                if is_corpus:
                    query = [matutils.unitvec(v) for v in query]
                else:
                    query = matutils.unitvec(query)
        result = self.get_similarities(query)

        if self.num_best is None:
            return result

        # if maintain_sparsity is True, result is scipy sparse. Sort, clip the
        # topn and return as a scipy sparse matrix.
        if getattr(self, 'maintain_sparsity', False):
            return matutils.scipy2scipy_clipped(result, self.num_best)

        # if the input query was a corpus (=more documents), compute the top-n
        # most similar for each document in turn
        if matutils.ismatrix(result):
            return [
                matutils.full2sparse_clipped(v, self.num_best) for v in result
            ]
        else:
            # otherwise, return top-n of the single input document
            return matutils.full2sparse_clipped(result, self.num_best)
Example #50
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

        Return
        ------
        :class:`numpy.ndarray`
            Similarity matrix (if maintain_sparsity=False) **OR**
        :class:`scipy.sparse.csc`
            otherwise

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query,
                                        self.index.shape[1],
                                        dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.T  # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    query.shape = (1, len(query))
                query = scipy.sparse.csr_matrix(query,
                                                dtype=self.index.dtype).T
            else:
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query],
                                            self.index.shape[1],
                                            dtype=self.index.dtype)

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc()  # N x T * T x C = N x C
        if result.shape[1] == 1 and not is_corpus:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
        elif self.maintain_sparsity:
            # avoid converting to dense array if maintaining sparsity
            result = result.T
        else:
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result
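A hedged sketch of querying `SparseMatrixSimilarity` both with a plain BoW vector and with a scipy sparse matrix, the two input kinds handled by `get_similarities` above; it reuses the toy `corpus` and `dictionary`.

# Sketch: shows the BoW path and the scipy.sparse path of the method above.
from gensim import matutils
from gensim.similarities import SparseMatrixSimilarity

index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
print(index[corpus[0]])  # BoW query -> 1d numpy array of cosine similarities

query_csr = matutils.corpus2csc(corpus, num_terms=len(dictionary)).T  # documents as rows
print(index[query_csr])  # matrix query -> 2d array, shape (#queries, #indexed docs)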
Example #51
File: rpmodel.py Project: ummae/gensim
    def __getitem__(self, bow):
        """
        Return RP representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
        vec = numpy.asfortranarray(vec, dtype=numpy.float32)
        topic_dist = scipy.linalg.blas.sgemv(1.0, self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
        return [(topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat)
                if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)]
Example #52
    def test_invalid_formats(self):
        # test invalid formats
        # these are not corpora, because they do not consist of 2-tuples of
        # the form (int, float)
        potentials = list()
        potentials.append(["human"])
        potentials.append("human")
        potentials.append(["human", "star"])
        potentials.append([1, 2, 3, 4, 5, 5])
        potentials.append([[(0, 'string')]])
        for noCorpus in potentials:
            result = utils.is_corpus(noCorpus)
            expected = (False, noCorpus)
            self.assertEqual(expected, result)
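Since every `__getitem__` in this section dispatches on the same check, a short illustration of what `utils.is_corpus` reports for a single document, a corpus, and an invalid input like the ones tested above:

# Illustration only; expected outputs are summarized in the trailing comments.
from gensim import utils

print(utils.is_corpus([(0, 1.0), (2, 0.5)]))      # (False, ...)  single BoW document
print(utils.is_corpus([[(0, 1.0)], [(2, 0.5)]]))  # (True, ...)   corpus of BoW documents
print(utils.is_corpus("human"))                   # (False, 'human')  not a corpus at all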
    def __getitem__(self, bow):
        """
        Return log entropy representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge)
        vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
                  for term_id, tf in bow if term_id in self.entr]
        if self.normalize:
            vector = matutils.unitvec(vector)
        return vector
Example #54
	def __getitem__(self, bow, iterations=100):
		is_corpus, corpus = utils.is_corpus(bow)
		if not is_corpus:
			# query is a single document => make a corpus out of it
			bow = [bow]

		self.convert_input(bow, infer=True)
		cmd = self.mallet_path + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %f"
		cmd = cmd % (self.fcorpusmallet(True) + '.infer', self.finferencer(True), self.fdoctopics(True) + '.infer', iterations, 1/(self.num_topics))
		logger.info("inferring topics with MALLET LDA '%s'" % cmd)
		retval = call(cmd, shell=True)
		if retval != 0:
			raise RuntimeError("MALLET failed with error %s on return" % retval)
		result = list(gensim.models.wrappers.ldamallet.read_doctopics(self.fdoctopics(True) + '.infer'))
		return result if is_corpus else result[0]
Example #55
    def __getitem__(self, bow, eps=1e-12):
        """Get tf-idf representation of the input vector and/or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of iterable of (int, int)}
            Input document or corpus in BoW format.
        eps : float
            Threshold value; all positions with a tf-idf value less than `eps` will be removed.

        Returns
        -------
        vector : list of (int, float)
            TfIdf vector, if `bow` is document **OR**
        :class:`~gensim.interfaces.TransformedCorpus`
            TfIdf corpus, if `bow` is corpus.

        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)

        termid_array, tf_array = [], []
        for termid, tf in bow:
            termid_array.append(termid)
            tf_array.append(tf)

        tf_array = self.wlocal(np.array(tf_array))

        vector = [(termid, tf * self.idfs.get(termid))
                  for termid, tf in zip(termid_array, tf_array)
                  if abs(self.idfs.get(termid, 0.0)) > eps]

        if self.normalize is True:
            self.normalize = matutils.unitvec
        elif self.normalize is False:
            self.normalize = utils.identity

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        vector = self.normalize(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(termid, weight) for termid, weight in vector
                  if abs(weight) > eps]
        return vector
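A brief usage sketch of `TfidfModel`, whose `__getitem__` is shown above; `corpus` is the toy BoW corpus from the first sketch.

# Sketch: fit IDF weights on the toy corpus, then transform a document and the corpus.
from gensim.models import TfidfModel

tfidf = TfidfModel(corpus)
print(tfidf[corpus[0]])        # single document -> [(term_id, tf-idf weight), ...]
corpus_tfidf = tfidf[corpus]   # corpus -> lazily transformed corpus (the _apply branch)
print(list(corpus_tfidf))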
Example #56
    def __getitem__(self, bow, eps=0.01):
        """
        Return topic distribution for the given document `bow`, as a list of
        (topic_id, topic_probability) 2-tuples.

        Ignore topics with very low probability (below `eps`).
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(corpus)

        gamma, _ = self.inference([bow])
        topic_dist = gamma[0] / sum(gamma[0]) # normalize to proper distribution
        return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
                if topicvalue >= eps] # ignore document's topics that have prob < eps
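A hedged sketch of calling the LDA `__getitem__` above on the toy data; the tiny model is only meant to show the return shapes, not meaningful topics.

# Sketch: topics below the probability threshold are dropped, as in the method above.
from gensim.models import LdaModel

lda = LdaModel(corpus, id2word=dictionary, num_topics=2, passes=5)
print(lda[corpus[0]])     # -> [(topic_id, probability), ...]
print(list(lda[corpus]))  # corpus in -> transformed corpus out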
Example #57
    def __getitem__(self, bow):
        """
        Return tf-idf representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)
        vector = [(termid, tf * self.idfs.get(termid, 0.0))
                  for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]
        if self.normalize:
            vector = matutils.unitvec(vector)
        return vector
Example #58
    def __getitem__(self, bow, eps=0.01):
        is_corpus, dummy_corpus = utils.is_corpus(bow)
        if not is_corpus:
            bow = [bow]

        predictions = self._predict(bow)[0]

        topics = []
        for row in predictions:
            row_topics = []
            for topic_id, val in enumerate(row):
                if val > eps:
                    row_topics.append((topic_id, val))
            topics.append(row_topics)

        return topics if is_corpus else topics[0]
Example #59
    def get_document_topics(self,
                            bow,
                            minimum_probability=None,
                            normalize=None):
        """Get the topic distribution for the given document.

        Parameters
        ----------
        bow : list of (int, float)
            The document in BOW format.
        minimum_probability : float
            If `normalize` is True, topics with smaller probabilities are filtered out.
            If `normalize` is False, topics with smaller factors are filtered out.
            If set to None, a value of 1e-8 is used to prevent 0s.
        normalize : bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.

        Returns
        -------
        list of (int, float)
            Topic distribution for the whole document. Each element in the list is a pair of a topic's id, and
            the probability that was assigned to it.

        """
        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)

        # if the input vector is a corpus, return a transformed corpus
        is_corpus, corpus = utils.is_corpus(bow)

        if is_corpus:
            kwargs = dict(minimum_probability=minimum_probability)
            return self._apply(corpus, **kwargs)

        v = matutils.corpus2csc([bow], self.num_tokens)
        h = self._solveproj(v, self._W, v_max=np.inf)

        if normalize is None:
            normalize = self.normalize
        if normalize:
            the_sum = h.sum()
            if the_sum:
                h /= the_sum

        return [(idx, proba) for idx, proba in enumerate(h[:, 0])
                if not minimum_probability or proba > minimum_probability]
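A small usage sketch of `get_document_topics` on gensim's online NMF, again with the toy corpus; parameter values are illustrative.

# Sketch: toy NMF; minimum_probability filters weak topics exactly as documented above.
from gensim.models.nmf import Nmf

nmf = Nmf(corpus, id2word=dictionary, num_topics=2, passes=5)
print(nmf.get_document_topics(corpus[0], minimum_probability=0.05))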
Example #60
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document or collection of documents.

        Return
        ------
        :class:`numpy.ndarray`
            Similarity matrix.

        """

        is_corpus, query = utils.is_corpus(query)
        if not is_corpus:
            if isinstance(query, numpy.ndarray):
                # Convert document indexes to actual documents.
                query = [self.corpus[i] for i in query]
            else:
                query = [query]

        result = []
        for query_document in query:
            # Compute similarity for each query.
            qresult = [
                matutils.softcossim(query_document, corpus_document,
                                    self.similarity_matrix)
                for corpus_document in self.corpus
            ]
            qresult = numpy.array(qresult)

            # Append single query result to list of all results.
            result.append(qresult)

        if is_corpus:
            result = numpy.array(result)
        else:
            result = result[0]

        return result
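Finally, a hedged sketch of building a `SoftCosineSimilarity` index for the `get_similarities` above. It uses the gensim 3.x embedding-based `similarity_matrix` helper (matching the `matutils.softcossim` call in the snippet), so the exact API depends on the installed version; `texts`, `dictionary` and `corpus` come from the first sketch.

# Sketch (gensim 3.x style): term similarities derived from tiny word2vec embeddings.
from gensim.models import Word2Vec
from gensim.similarities import SoftCosineSimilarity

w2v = Word2Vec(texts, size=10, min_count=1)                # toy embeddings (3.x: `size`)
similarity_matrix = w2v.wv.similarity_matrix(dictionary)   # sparse term-term similarities
index = SoftCosineSimilarity(corpus, similarity_matrix, num_best=2)
print(index[corpus[0]])  # -> up to 2 most similar (doc_id, soft cosine) pairs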