Python Dictionary.num_docs 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: gensim.corpora.dictionary

클래스/타입: Dictionary

메소드/함수: num_docs

hotexamples.com에서의 예제들: 2

Python Dictionary.num_docs - 2개의 예제를 찾았습니다. 오픈소스 프로젝트에서 추출한, Python의 gensim.corpora.dictionary.Dictionary.num_docs를 실제로 사용하는 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Dictionary(30)

items(30)

save(30)

doc2bow(30)

filter_extremes(30)

load(30)

add_documents(30)

get(23)

load_from_text(19)

from_corpus(16)

doc2idx(12)

compactify(9)

save_as_text(8)

keys(6)

token2id(4)

itervalues(4)

id2token(4)

filter_tokens(3)

values(3)

merge_with(2)

num_docs(2)

num_nnz(2)

num_pos(2)

dfs(2)

iteritems(1)

from_documents(1)

filter_n_most_frequent(1)

filterExtremes(1)

patch_with_special_tokens(1)

corpus_id2orig_id(1)

corpus(1)

예제 #1

파일 보기

 def init_dictionary(self, save=True):
     """Build a gensim dictionary by streaming the gzipped corpus file.

     The corpus file name is taken from ``self.params`` (key
     ``'dictionary__corpus_file'``, then ``'corpus_file'``), falling back
     to ``'sentences.txt.gz'``; it is opened relative to ``self.path``.
     Each non-empty line is split on single spaces into tokens, and an
     empty line marks the end of a document.

     Token ids are assigned in descending order of corpus frequency.
     The resulting dictionary is stored on ``self.dictionary``.

     Args:
         save: when true, the dictionary is also saved to
             ``self.path + 'dictionary.pkl'``.
     """
     import gzip
     from collections import Counter

     corpus_file = (self.params.get('dictionary__corpus_file')
                    or self.params.get('corpus_file')
                    or 'sentences.txt.gz')
     doc_count = 0
     num_pos = 0
     num_nnz = 0
     cfs = Counter()  # token -> total occurrences in the corpus
     dfs = Counter()  # token -> number of documents containing it
     unique = set()   # distinct tokens of the document being read

     # The gzip handle is managed by ``with`` so it is closed even when
     # iteration fails; closing the tqdm wrapper alone does not close it.
     with gzip.open(self.path + corpus_file, 'rt', encoding='utf8') as fh:
         progress = tqdm(fh, 'dictionary', self.sentences_cnt)
         try:
             for line in progress:
                 line = line.strip()
                 if not line:  # end of document
                     dfs.update(unique)
                     num_nnz += len(unique)
                     doc_count += 1
                     unique = set()
                     continue
                 tokens = line.split(' ')
                 cfs.update(tokens)
                 num_pos += len(tokens)
                 unique.update(tokens)
         finally:
             progress.close()

     # A final document without a trailing blank line would otherwise be
     # counted in cfs/num_pos but missing from dfs/num_nnz/num_docs.
     if unique:
         dfs.update(unique)
         num_nnz += len(unique)
         doc_count += 1

     token2id = {t: i for i, (t, _) in enumerate(cfs.most_common())}
     dictionary = GensimDictionary()
     dictionary.num_pos = num_pos
     dictionary.num_nnz = num_nnz
     dictionary.num_docs = doc_count
     dictionary.token2id = token2id
     # Fill the dictionary's existing cfs/dfs mappings, keyed by token id.
     for t, i in token2id.items():
         dictionary.cfs[i] = cfs[t]
         dictionary.dfs[i] = dfs[t]
     # NOTE: no special tokens (e.g. '<PAD>') are patched in here.
     if save:
         dictionary.save(self.path + 'dictionary.pkl')
     self.dictionary = dictionary

예제 #2

파일 보기

파일: pytopia2gensim.py 프로젝트: dkorenci/doc-topic-coherence

def pytopia2gensimDict(dict_):
    '''
    Creates gensim dictionary from a pytopia dictionary.
    This is necessary since building of gensim models requires gensim dictionary
     but pytopia model builders must be able to receive generic pytopia Dictionary as parameter.

    :param dict_: pytopia dictionary, or a reference that resolve() turns
        into one; iterating it yields tokens and dict_[tok] gives the
        token's index.
    :return: gensim dictionary with the same token <-> index mapping, in
        which every token is recorded as occurring once in a single document.
    '''
    # sort dictionary tokens by index
    dict_ = resolve(dict_)
    toki = [(tok, dict_[tok]) for tok in dict_]
    toki.sort(key=lambda ti: ti[1])
    # directly set gensim dict data structures,
    # this works for gensim 0.12.4
    gdict = GensimDict()
    gdict.token2id = {tok: i for tok, i in toki}
    gdict.id2token = {i: tok for tok, i in toki}
    # gensim keys document frequencies by token id, not by token string;
    # keying by token would hide every entry from id-based lookups.
    gdict.dfs = {i: 1 for _, i in toki}
    gdict.num_docs = 1  # number of documents processed
    gdict.num_pos = len(toki)  # total number of corpus positions
    gdict.num_nnz = len(toki)  # total number of non-zeroes in the BOW matrix
    return gdict