from collections import Counter
from operator import itemgetter

from gensim.corpora.dictionary import Dictionary
from spacy import attrs
from spacy.strings import StringStore


def docs_to_gensim(spacy_docs, spacy_vocab, lemmatize=True,
                   filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Convert multiple ``spacy.Doc`` s into a gensim dictionary and bag-of-words corpus.

    Args:
        spacy_docs (list(``spacy.Doc``))
        spacy_vocab (``spacy.Vocab``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list(list((int, int))): list of bag-of-words documents, where each doc
            is a list of (integer word ID, word count) 2-tuples
    """
    gdict = Dictionary()
    gcorpus = []
    stringstore = StringStore()
    doc_freqs = Counter()

    for spacy_doc in spacy_docs:
        # count tokens by lemma or by surface form
        if lemmatize:
            bow = ((spacy_vocab[tok_id], count)
                   for tok_id, count in spacy_doc.count_by(attrs.LEMMA).items())
        else:
            bow = ((spacy_vocab[tok_id], count)
                   for tok_id, count in spacy_doc.count_by(attrs.ORTH).items())
        if filter_stops:
            bow = ((lex, count) for lex, count in bow if not lex.is_stop)
        if filter_punct:
            bow = ((lex, count) for lex, count in bow if not lex.is_punct)
        if filter_nums:
            bow = ((lex, count) for lex, count in bow if not lex.like_num)

        # map surviving lexemes to shared string-store ids, sorted by id
        bow = sorted(((stringstore[lex.orth_], count) for lex, count in bow),
                     key=itemgetter(0))
        doc_freqs.update(tok_id for tok_id, _ in bow)
        gdict.num_docs += 1
        gdict.num_pos += sum(count for _, count in bow)
        gdict.num_nnz += len(bow)
        gcorpus.append(bow)

    gdict.token2id = {s: i for i, s in enumerate(stringstore)}
    gdict.dfs = dict(doc_freqs)
    return (gdict, gcorpus)
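# A minimal usage sketch for docs_to_gensim. Assumptions: a spaCy 1.x English
# model is installed (the StringStore / count_by API above predates spaCy 2),
# and `texts` is any iterable of raw strings; both are illustrative only.
import spacy

nlp = spacy.load('en')
texts = ["Topic models need clean tokens.",
         "Gensim expects lists of (id, count) pairs."]
docs = [nlp(text) for text in texts]

gdict, gcorpus = docs_to_gensim(docs, nlp.vocab, lemmatize=True)
print(gdict.num_docs, gdict.num_nnz)  # corpus-level stats kept on the Dictionary
print(gcorpus[0])                     # e.g. [(1, 1), (2, 1), ...]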
def init_dictionary(self, save=True):
    import gzip
    from collections import Counter

    corpus_file = (self.params.get('dictionary__corpus_file')
                   or self.params.get('corpus_file')
                   or 'sentences.txt.gz')
    doc_id = 0
    num_pos = 0
    num_nnz = 0
    cfs = Counter()  # collection frequencies: total occurrences per token
    dfs = Counter()  # document frequencies: number of docs containing each token
    f = gzip.open(self.path + corpus_file, 'rt', encoding='utf8')
    f = tqdm(f, 'dictionary', self.sentences_cnt)
    unique = set()
    for line in f:
        line = line.strip()
        if not line:
            # blank line marks the end of a document
            dfs.update(unique)
            num_nnz += len(unique)
            doc_id += 1
            unique = set()
            continue
        tokens = line.split(' ')
        cfs.update(tokens)
        num_pos += len(tokens)
        unique.update(tokens)
    f.close()
    if unique:
        # flush the last document if the file does not end with a blank line
        dfs.update(unique)
        num_nnz += len(unique)
        doc_id += 1

    # assign ids by descending collection frequency
    token2id = {t: i for i, (t, cnt) in enumerate(cfs.most_common())}

    dictionary = GensimDictionary()
    dictionary.num_pos = num_pos
    dictionary.num_nnz = num_nnz
    dictionary.num_docs = doc_id
    dictionary.token2id = token2id
    for t, i in token2id.items():
        dictionary.cfs[i] = cfs[t]
        dictionary.dfs[i] = dfs[t]
    # dictionary.patch_with_special_tokens({'<PAD>': 0})
    if save:
        dictionary.save(self.path + 'dictionary.pkl')
    self.dictionary = dictionary
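# init_dictionary expects a gzipped corpus with one whitespace-tokenized
# sentence per line and a blank line between documents. An illustrative
# sketch of producing such a file (path and contents are hypothetical):
import gzip

docs = [["the cat sat", "the cat slept"],
        ["dogs bark loudly"]]
with gzip.open('sentences.txt.gz', 'wt', encoding='utf8') as f:
    for doc in docs:
        for sentence in doc:
            f.write(sentence + '\n')
        f.write('\n')  # blank line terminates the document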
def pytopia2gensimDict(dict_):
    '''
    Creates a gensim dictionary from a pytopia dictionary.
    This is necessary since building gensim models requires a gensim
    dictionary, but pytopia model builders must be able to receive a
    generic pytopia Dictionary as a parameter.
    '''
    # sort dictionary tokens by index
    dict_ = resolve(dict_)
    toki = [(tok, dict_[tok]) for tok in dict_]
    toki.sort(key=lambda ti: ti[1])
    # directly set gensim dict data structures; this works for gensim 0.12.4
    gdict = GensimDict()
    gdict.token2id = {tok: i for tok, i in toki}
    gdict.id2token = {i: tok for tok, i in toki}
    gdict.dfs = {i: 1 for _, i in toki}  # dfs is keyed by token id, not by token string
    gdict.num_docs = 1          # number of documents processed
    gdict.num_pos = len(toki)   # total number of corpus positions
    gdict.num_nnz = len(toki)   # total number of non-zeroes in the BOW matrix
    return gdict
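# Hypothetical illustration of the structures pytopia2gensimDict fills in,
# built here directly from a toy token -> index mapping (no pytopia needed);
# doc2bow then works as on any regular gensim Dictionary.
from gensim.corpora.dictionary import Dictionary as GensimDict

toki = sorted({'cat': 0, 'dog': 1, 'fish': 2}.items(), key=lambda ti: ti[1])
gdict = GensimDict()
gdict.token2id = {tok: i for tok, i in toki}
gdict.id2token = {i: tok for tok, i in toki}
gdict.dfs = {i: 1 for _, i in toki}
gdict.num_docs, gdict.num_pos, gdict.num_nnz = 1, len(toki), len(toki)

print(gdict.doc2bow(['cat', 'dog', 'cat']))  # [(0, 2), (1, 1)]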
* **iterations**: the number of times the update is repeated for each document
* **random_state**: set a fixed seed so that results are reproducible
"""

model = LdaModel(corpus=corpus, id2word=id2word, num_topics=100,
                 iterations=500, passes=10)

# Build a dictionary for computing Coherence
from gensim.corpora.dictionary import Dictionary

dic = Dictionary()
dic.token2id = {t: i for i, t in enumerate(vectorizer.get_feature_names())}

coherence_model_lda = CoherenceModel(model=model, texts=khaiii_xr,
                                     dictionary=dic, coherence='c_v')
coherence = coherence_model_lda.get_coherence()

detoken_xr[:10]

"""### Finding the optimal number of words

* Metric: Coherence
"""

def compete_number_of_words(detoken_data, token_data, min_num, max_num, step,
                            random_state=None):
    '''
    Function for finding the optimal number_of_words
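# The definition of compete_number_of_words is truncated above. A hedged
# sketch of the sweep it describes: vary the vectorizer vocabulary size and
# score each setting by c_v coherence. Toy data stands in for the notebook's
# detoken_xr / khaiii_xr; get_feature_names matches the older scikit-learn
# API used elsewhere in this notebook.
from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora.dictionary import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel, CoherenceModel

token_data = [["cat", "sat", "mat"], ["dog", "bark", "loud"], ["cat", "dog", "play"]]
detoken_data = [" ".join(toks) for toks in token_data]

for num_words in range(2, 9, 3):  # min_num, max_num, step
    vectorizer = CountVectorizer(max_features=num_words)
    X = vectorizer.fit_transform(detoken_data)
    corpus = Sparse2Corpus(X, documents_columns=False)  # rows are documents
    dic = Dictionary()
    dic.token2id = {t: i for i, t in enumerate(vectorizer.get_feature_names())}
    model = LdaModel(corpus=corpus, id2word=dic, num_topics=2, random_state=42)
    cm = CoherenceModel(model=model, texts=token_data, dictionary=dic,
                        coherence='c_v')
    print(num_words, cm.get_coherence())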