Python Dictionary.token2id 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: gensim.corpora

클래스/타입: Dictionary

메소드/함수: token2id

hotexamples.com에서의 예제들: 6

Python Dictionary.token2id - 예제 6개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 gensim.corpora.Dictionary.token2id의 실제 사용 예제 중 평점이 가장 높은 것들입니다. 예제에 평점을 매겨 예제 품질을 개선하는 데 도움을 주실 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

Dictionary(30)

add_documents(30)

load(30)

load_from_text(30)

filter_extremes(30)

doc2bow(30)

save(30)

compactify(30)

doc2idx(28)

save_as_text(28)

items(27)

filter_tokens(26)

keys(16)

from_corpus(15)

filter_n_most_frequent(13)

merge_with(10)

get(10)

values(9)

iteritems(7)

id2token(7)

from_documents(6)

patch_with_special_tokens(6)

token2id(4)

num_docs(2)

num_nnz(2)

dfs(2)

itervalues(1)

loadFromText(1)

filterExtremes(1)

most_common(1)

num_pos(1)

saveAsText(1)

add_word(1)

iterkeys(1)

예제 #1

파일 보기

파일: ucicorpus.py 프로젝트: 234205367/gensim

    def create_dictionary(self):
        """
        Utility method to generate gensim-style Dictionary directly from
        the corpus and vocabulary data.

        The returned Dictionary takes ``id2token`` from ``self.id2word``,
        ``token2id`` as its inverse, and ``num_docs`` / ``num_nnz`` from the
        corpus. ``dfs`` and ``num_pos`` are accumulated by iterating over
        every document of the corpus once.
        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = {token: token_id for token_id, token in iteritems(self.id2word)}

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                # Pass the values as logging arguments so the message is only
                # formatted when an INFO handler actually emits it.
                logger.info('PROGRESS: processing document %i of %i', docno, self.num_docs)

            # Each document entry unpacks as a (word, count) pair.
            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary

예제 #2

파일 보기

    def create_dictionary(self):
        """
        Utility method to generate gensim-style Dictionary directly from
        the corpus and vocabulary data.

        ``id2token`` is taken from ``self.id2word`` and ``token2id`` is built
        as its inverse; ``num_docs`` and ``num_nnz`` are copied from the
        corpus. Document frequencies (``dfs``) and the running total
        ``num_pos`` are filled in by a single pass over the corpus.
        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = {
            token: token_id for token_id, token in iteritems(self.id2word)
        }

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                # Lazy logging arguments: the string is interpolated by the
                # logging module only if the record is actually emitted.
                logger.info('PROGRESS: processing document %i of %i',
                            docno, self.num_docs)

            # Each document entry unpacks as a (word, count) pair.
            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary

예제 #3

파일 보기

파일: ucicorpus.py 프로젝트: zjyeon/ODSA-PythonAdvModels

    def create_dictionary(self):
        """Build a :class:`gensim.corpora.dictionary.Dictionary` from this corpus and its vocabulary.

        Returns
        -------
        :class:`gensim.corpora.dictionary.Dictionary`
            Dictionary whose mappings come from ``self.id2word`` and whose
            statistics are gathered by iterating over the corpus.

        Examples
        --------

        .. sourcecode:: pycon

            >>> from gensim.corpora.ucicorpus import UciCorpus
            >>> from gensim.test.utils import datapath
            >>> ucc = UciCorpus(datapath('testcorpus.uci'))
            >>> dictionary = ucc.create_dictionary()

        """
        result = Dictionary()

        # A defaultdict keeps later lookups from raising KeyError: a UCI
        # vocabulary may list terms that never occur in the document data.
        doc_freqs = result.dfs = defaultdict(int)

        result.id2token = self.id2word
        result.token2id = utils.revdict(self.id2word)

        result.num_docs = self.num_docs
        result.num_nnz = self.num_nnz

        for index, document in enumerate(self):
            if not index % 10000:
                logger.info('PROGRESS: processing document %i of %i', index, self.num_docs)

            # Every entry of a document unpacks as a (term, frequency) pair.
            for term, frequency in document:
                doc_freqs[term] += 1
                result.num_pos += frequency

        return result

예제 #4

파일 보기

파일: ucicorpus.py 프로젝트: RaRe-Technologies/gensim

    def create_dictionary(self):
        """Create a :class:`gensim.corpora.dictionary.Dictionary` straight from the corpus and its vocabulary.

        Returns
        -------
        :class:`gensim.corpora.dictionary.Dictionary`
            Dictionary derived from the corpus: mappings come from
            ``self.id2word``, counts from one pass over the documents.

        Examples
        --------

        .. sourcecode:: pycon

            >>> from gensim.corpora.ucicorpus import UciCorpus
            >>> from gensim.test.utils import datapath
            >>> ucc = UciCorpus(datapath('testcorpus.uci'))
            >>> dictionary = ucc.create_dictionary()

        """
        built = Dictionary()

        # Document frequencies live in a defaultdict so that vocabulary terms
        # which never appear in the document data cannot trigger a KeyError.
        built.dfs = defaultdict(int)

        built.id2token = self.id2word
        built.token2id = utils.revdict(self.id2word)

        built.num_docs = self.num_docs
        built.num_nnz = self.num_nnz

        for doc_index, entries in enumerate(self):
            report_progress = doc_index % 10000 == 0
            if report_progress:
                logger.info('PROGRESS: processing document %i of %i', doc_index, self.num_docs)

            # Entries unpack as (word_id, occurrences) pairs.
            for word_id, occurrences in entries:
                built.dfs[word_id] += 1
                built.num_pos += occurrences

        return built

예제 #5

파일 보기

    # Load previously saved vocabularies when the word dictionary file exists;
    # otherwise start fresh dictionaries seeded with the special tokens.
    vocab_dir = config_dic.get("vocab_dir")
    word_dic_path = os.path.join(vocab_dir, f"{args.config}.word.dic")
    if os.path.exists(word_dic_path):
        word_dic = Dictionary.load(word_dic_path)
        #char_dic = Dictionary.load(os.path.join(config_dic.get("vocab_dir"), f"{args.config}.char.dic"))
        sw_dicts = {}
        for sp_key, sp in sps.items():
            sw_dicts[sp_key] = Dictionary.load(
                os.path.join(vocab_dir, f"{args.config}.{sp_key}.dic"))
    else:
        special_token_dict = {PADDING: 0, UNKNOWN: 1, START: 2, END: 3}
        # Give every Dictionary its own copy of the special-token mapping.
        # Assigning the same dict object to all of them would make them share
        # one token2id, so tokens added to one vocabulary would appear in the
        # others. NOTE(review): gensim updates token2id in place when
        # documents are added - confirm against the vocabulary-building code.
        word_dic = Dictionary()
        word_dic.token2id = dict(special_token_dict)
        #char_dic = Dictionary()
        #char_dic.token2id = special_token_dict
        sw_dicts = {}
        for sp_key, sp in sps.items():
            _dic = Dictionary()
            _dic.token2id = dict(special_token_dict)
            sw_dicts[sp_key] = _dic
    # Label vocabulary: built from the training labels, with PADDING at id 0.
    label_dic = Dictionary(train_label_documents)
    label_dic.patch_with_special_tokens({PADDING: 0})
    # Rebuild the reverse mapping explicitly from the patched token2id.
    label_dic.id2token = {
        _id: label
        for label, _id in label_dic.token2id.items()
    }

    # add vocabulary

예제 #6

파일 보기

## Vectorize the corpus
cv = CountVectorizer(stop_words="english",
                     min_df=5,
                     max_df=0.4,
                     max_features=5000,
                     ngram_range=(1, 1))
dtm = cv.fit_transform(corpus)

# scikit-learn 1.0 deprecated CountVectorizer.get_feature_names() in favour of
# get_feature_names_out(), and 1.2 removed it. Prefer the new accessor and
# fall back to the old one so the script runs on both old and new releases.
if hasattr(cv, "get_feature_names_out"):
    features = np.array(cv.get_feature_names_out())
else:
    features = np.array(cv.get_feature_names())

# Two-way mapping between a feature's column index and its token.
id2token = dict(enumerate(features))
token2id = {token: token_id for token_id, token in enumerate(features)}

## Create a gensim dictionary
dictionary = Dictionary()
dictionary.id2token = id2token
dictionary.token2id = token2id

## Train LDA models with different count of topics
topic_counts = [20, 30, 40, 50, 70, 100, 120, 150]


def get_topn_words(lda_model, features, topn=20):
    """Return, for every topic of *lda_model*, its highest-weighted words.

    Each row of ``lda_model.components_`` is treated as one topic's weights.
    The row is sorted by descending weight, the first ``topn`` indices are
    kept, and those indices are looked up in ``features``.

    Parameters
    ----------
    lda_model
        Object exposing a ``components_`` attribute that iterates over
        per-topic weight arrays.
    features
        Array of words, indexable by an array of integer positions.
    topn
        Number of words to keep per topic.

    Returns
    -------
    list
        One list of words per topic, ordered from highest to lowest weight.
    """
    return [
        list(features[weights.argsort()[::-1][:topn]])
        for weights in lda_model.components_
    ]


def get_coherence_lda_models(topic_counts, dtm, features, corpus, dictionary):