Example #1
from collections import Counter
from operator import itemgetter

from gensim.corpora.dictionary import Dictionary
from spacy import attrs
from spacy.strings import StringStore


def docs_to_gensim(spacy_docs,
                   spacy_vocab,
                   lemmatize=True,
                   filter_stops=True,
                   filter_punct=True,
                   filter_nums=False):
    """
    Convert multiple ``spacy.Doc`` s into a gensim dictionary and bag-of-words corpus.

    Args:
        spacy_docs (list(``spacy.Doc``)): documents to convert
        spacy_vocab (``spacy.Vocab``): vocab shared by the documents
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list(list((int, int))): list of bag-of-words documents, where each doc is
            a list of (integer word ID, word count) 2-tuples
    """
    gdict = Dictionary()
    gcorpus = []
    stringstore = StringStore()
    doc_freqs = Counter()

    for spacy_doc in spacy_docs:
        # count tokens by lemma or by surface form via integer attribute IDs
        attr_id = attrs.LEMMA if lemmatize else attrs.ORTH
        bow = ((spacy_vocab[tok_id], count)
               for tok_id, count in spacy_doc.count_by(attr_id).items())

        if filter_stops:
            bow = ((lex, count) for lex, count in bow if not lex.is_stop)
        if filter_punct:
            bow = ((lex, count) for lex, count in bow if not lex.is_punct)
        if filter_nums:
            bow = ((lex, count) for lex, count in bow if not lex.like_num)

        # intern token strings in a local StringStore and sort by string id
        bow = sorted(((stringstore[lex.orth_], count) for lex, count in bow),
                     key=itemgetter(0))

        # accumulate document frequencies and gensim Dictionary statistics
        doc_freqs.update(tok_id for tok_id, _ in bow)
        gdict.num_docs += 1
        gdict.num_pos += sum(count for _, count in bow)
        gdict.num_nnz += len(bow)

        gcorpus.append(bow)

    gdict.token2id = {s: i for i, s in enumerate(stringstore)}
    gdict.dfs = dict(doc_freqs)

    return (gdict, gcorpus)
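
A minimal usage sketch (hypothetical texts; the 'en' shortcut and integer
string IDs assume the spaCy 1.x-era API this helper was written against):

import spacy

nlp = spacy.load('en')
docs = [nlp(u"The quick brown fox jumps over the lazy dog."),
        nlp(u"Foxes and dogs are mammals.")]

gdict, gcorpus = docs_to_gensim(docs, nlp.vocab,
                                lemmatize=True, filter_stops=True)
print(gdict.token2id)   # token string -> integer id
print(gcorpus[0])       # [(word id, count), ...] for the first document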
Example #3
from collections import Counter
import gzip

from gensim.corpora import Dictionary as GensimDictionary
from tqdm import tqdm


def init_dictionary(self, save=True):
    """Build a gensim dictionary straight from a gzipped corpus file.

    A method of a corpus-builder class: relies on ``self.params``,
    ``self.path`` and ``self.sentences_cnt``. The corpus file holds one
    space-tokenized sentence per line, with blank lines between documents.
    """
    corpus_file = (self.params.get('dictionary__corpus_file')
                   or self.params.get('corpus_file')
                   or 'sentences.txt.gz')
    doc_id = 0
    num_pos = 0      # total number of corpus positions (token occurrences)
    num_nnz = 0      # total number of non-zeroes in the BOW matrix
    cfs = Counter()  # collection frequencies, keyed by token string
    dfs = Counter()  # document frequencies, keyed by token string
    f = gzip.open(self.path + corpus_file, 'rt', encoding='utf8')
    f = tqdm(f, 'dictionary', self.sentences_cnt)
    unique = set()
    for line in f:
        line = line.strip()
        if not line:  # blank line marks the end of a document
            dfs.update(unique)
            num_nnz += len(unique)
            doc_id += 1
            unique = set()
            continue
        tokens = line.split(' ')
        cfs.update(tokens)
        num_pos += len(tokens)
        unique.update(tokens)
    f.close()
    if unique:  # flush the last document if the file lacks a trailing blank line
        dfs.update(unique)
        num_nnz += len(unique)
        doc_id += 1

    # assign ids by descending collection frequency
    token2id = {t: i for i, (t, _) in enumerate(cfs.most_common())}
    dictionary = GensimDictionary()
    dictionary.num_pos = num_pos
    dictionary.num_nnz = num_nnz
    dictionary.num_docs = doc_id
    dictionary.token2id = token2id
    # re-key the frequency counters by integer id, as gensim expects
    for t, i in token2id.items():
        dictionary.cfs[i] = cfs[t]
        dictionary.dfs[i] = dfs[t]
    # dictionary.patch_with_special_tokens({'<PAD>': 0})  # optional
    if save:
        dictionary.save(self.path + 'dictionary.pkl')
    self.dictionary = dictionary
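
A sketch of the corpus file layout this method expects (contents are
hypothetical; the file name is the default used above):

import gzip

lines = [
    "the quick brown fox",         # document 1, sentence 1
    "jumps over the lazy dog",     # document 1, sentence 2
    "",                            # blank line: end of document 1
    "foxes and dogs are mammals",  # document 2
    "",                            # blank line: end of document 2
]
with gzip.open('sentences.txt.gz', 'wt', encoding='utf8') as f:
    f.write('\n'.join(lines) + '\n')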
Example #4

from gensim.corpora import Dictionary as GensimDict


def pytopia2gensimDict(dict_):
    '''
    Creates a gensim dictionary from a pytopia dictionary.
    This is necessary since building gensim models requires a gensim
    dictionary, but pytopia model builders must be able to receive a
    generic pytopia Dictionary as a parameter.
    '''
    # resolve() is pytopia's helper that turns a dictionary id or reference
    # into an actual Dictionary object
    dict_ = resolve(dict_)
    # sort dictionary tokens by index
    toki = [(tok, dict_[tok]) for tok in dict_]
    toki.sort(key=lambda ti: ti[1])
    # directly set gensim dict data structures,
    # this works for gensim 0.12.4
    gdict = GensimDict()
    gdict.token2id = {tok: i for tok, i in toki}
    gdict.id2token = {i: tok for tok, i in toki}
    gdict.dfs = {tok: 1 for tok, _ in toki}  # dummy values; gensim proper keys dfs by integer id
    gdict.num_docs = 1  # number of documents processed
    gdict.num_pos = len(toki)  # total number of corpus positions
    gdict.num_nnz = len(toki)  # total number of non-zeroes in the BOW matrix
    return gdict
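
A toy usage sketch, assuming resolve() passes an already-materialized
dictionary through unchanged (the mapping below is a hypothetical stand-in:
pytopia dictionaries iterate over tokens and support token -> index lookup):

toy_dict = {'apple': 0, 'banana': 1, 'cherry': 2}
gdict = pytopia2gensimDict(toy_dict)
print(gdict.token2id)  # {'apple': 0, 'banana': 1, 'cherry': 2}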
Example #5
* **iterations**: how many times the update loop is repeated for each document
* **random_state**: set a fixed seed so that results are reproducible
"""

from gensim.models import LdaModel

model = LdaModel(corpus=corpus,
                 id2word=id2word,
                 num_topics=100,
                 iterations=500,
                 passes=10)
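
# The bullets above mention random_state, but the call leaves it unset.
# A minimal sketch of the same call with a fixed (arbitrary) seed for
# reproducible topics; model_seeded is a hypothetical name:
model_seeded = LdaModel(corpus=corpus, id2word=id2word, num_topics=100,
                        iterations=500, passes=10, random_state=42)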

# Build a dictionary for computing coherence

from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel

dic = Dictionary()
# on scikit-learn >= 1.0, use vectorizer.get_feature_names_out() instead
dic.token2id = {t: i for i, t in enumerate(vectorizer.get_feature_names())}

coherence_model_lda = CoherenceModel(model=model, texts=khaiii_xr, dictionary=dic, coherence='c_v')

coherence = coherence_model_lda.get_coherence()

detoken_xr[:10]

"""### 최적의 number of words 찾기
* 지표 : Coherence
"""

def compete_number_of_words(detoken_data, token_data, min_num, max_num, step, random_state=None):
  '''
  A function for finding number_of_words