Exemplo n.º 1
0
    def create_dictionary(self):
        """
        Utility method to generate gensim-style Dictionary directly from
        the corpus and vocabulary data.
        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs))

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary
Exemplo n.º 2
0
    def create_dictionary(self):
        """
        Utility method to generate gensim-style Dictionary directly from
        the corpus and vocabulary data.
        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i' %
                            (docno, self.num_docs))

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary
Exemplo n.º 3
0
    def _load_gensim_dictionary(self):

        setattr(self, '_index2id', {})

        gensim_dict = GensimDictionary()
        gensim_dict.num_docs = self.num_docs
        gensim_dict.num_pos = self.num_pos
        gensim_dict.num_nnz = self.num_nnz

        for dic_token in self.dic_tokens.all():
            self._index2id[dic_token.index] = dic_token.id
            gensim_dict.token2id[dic_token.text] = dic_token.index
            gensim_dict.dfs[dic_token.index] = dic_token.document_frequency

        logger.info("Dictionary contains %d tokens" % len(gensim_dict.token2id))

        return gensim_dict
Exemplo n.º 4
0
    def create_dictionary(self):
        """Generate :class:`gensim.corpora.dictionary.Dictionary` directly from the corpus and vocabulary data.

        Return
        ------
        :class:`gensim.corpora.dictionary.Dictionary`
            Dictionary, based on corpus.

        Examples
        --------

        .. sourcecode:: pycon

            >>> from gensim.corpora.ucicorpus import UciCorpus
            >>> from gensim.test.utils import datapath
            >>> ucc = UciCorpus(datapath('testcorpus.uci'))
            >>> dictionary = ucc.create_dictionary()

        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = utils.revdict(self.id2word)

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i', docno,
                            self.num_docs)

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary
Exemplo n.º 5
0
    def create_dictionary(self):
        """Generate :class:`gensim.corpora.dictionary.Dictionary` directly from the corpus and vocabulary data.

        Return
        ------
        :class:`gensim.corpora.dictionary.Dictionary`
            Dictionary, based on corpus.

        Examples
        --------

        .. sourcecode:: pycon

            >>> from gensim.corpora.ucicorpus import UciCorpus
            >>> from gensim.test.utils import datapath
            >>> ucc = UciCorpus(datapath('testcorpus.uci'))
            >>> dictionary = ucc.create_dictionary()

        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = utils.revdict(self.id2word)

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i', docno, self.num_docs)

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary