Python Dictionary.dfs 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: gensim.corpora.dictionary

클래스/타입: Dictionary

메소드/함수: dfs

hotexamples.com에서의 예제들: 3

Python Dictionary.dfs - 3개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python gensim.corpora.dictionary.Dictionary.dfs의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 개선하는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Dictionary(30)

items(30)

save(30)

doc2bow(30)

filter_extremes(30)

load(30)

add_documents(30)

get(23)

load_from_text(19)

from_corpus(16)

doc2idx(12)

compactify(9)

save_as_text(8)

keys(6)

token2id(4)

itervalues(4)

id2token(4)

filter_tokens(3)

values(3)

merge_with(2)

num_docs(2)

num_nnz(2)

num_pos(2)

dfs(2)

iteritems(1)

from_documents(1)

filter_n_most_frequent(1)

filterExtremes(1)

patch_with_special_tokens(1)

corpus_id2orig_id(1)

corpus(1)

예제 #1

파일 보기

파일: export.py 프로젝트: mbatchkarov/textacy

def docs_to_gensim(spacy_docs,
                   spacy_vocab,
                   lemmatize=True,
                   filter_stops=True,
                   filter_punct=True,
                   filter_nums=False):
    """
    Build a gensim dictionary and a bag-of-words corpus from ``spacy.Doc`` s.

    Each flag is compared with ``is True``, so only the literal ``True``
    switches the corresponding behavior on.

    Args:
        spacy_docs (list(``spacy.Doc``)): documents to convert
        spacy_vocab (``spacy.Vocab``): vocabulary used to look up the lexeme
            for each counted token ID
        lemmatize (bool): if True, count tokens by lemma; otherwise, count
            them by their original form as it appears in the doc
        filter_stops (bool): if True, drop lexemes flagged as stop words
        filter_punct (bool): if True, drop lexemes flagged as punctuation
        filter_nums (bool): if True, drop lexemes that look like numbers

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list(list((int, int))): one bag-of-words per input doc, each a list of
            (integer word ID, word count) 2-tuples sorted by word ID
    """
    gdict = Dictionary()
    gcorpus = []
    stringstore = StringStore()
    doc_freqs = Counter()

    for spacy_doc in spacy_docs:
        count_attr = attrs.LEMMA if lemmatize is True else attrs.ORTH
        token_counts = spacy_doc.count_by(count_attr)

        doc_bow = []
        for tok_id, count in token_counts.items():
            lex = spacy_vocab[tok_id]
            # Guard clauses are checked in the order stops, punctuation,
            # numbers; the first match discards the lexeme.
            if filter_stops is True and lex.is_stop:
                continue
            if filter_punct is True and lex.is_punct:
                continue
            if filter_nums is True and lex.like_num:
                continue
            # Only lexemes that survive filtering are looked up in the
            # string store.
            doc_bow.append((stringstore[lex.orth_], count))
        doc_bow.sort(key=itemgetter(0))

        # Every word ID present in this doc adds one to its document frequency.
        for word_id, _ in doc_bow:
            doc_freqs[word_id] += 1

        # Corpus-level statistics are written straight onto the dictionary.
        gdict.num_docs += 1
        gdict.num_pos += sum(count for _, count in doc_bow)
        gdict.num_nnz += len(doc_bow)

        gcorpus.append(doc_bow)

    # A string's position in the store's iteration order is used as its ID.
    token2id = {}
    for position, string in enumerate(stringstore):
        token2id[string] = position
    gdict.token2id = token2id
    gdict.dfs = dict(doc_freqs)

    return gdict, gcorpus

예제 #2

파일 보기

파일: export.py 프로젝트: GregBowyer/textacy

def docs_to_gensim(spacy_docs, spacy_vocab, lemmatize=True,
                   filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Turn a sequence of ``spacy.Doc`` s into a gensim dictionary plus a
    bag-of-words corpus.

    The boolean options are tested with ``is True``; any other value, even a
    truthy one, leaves the option switched off.

    Args:
        spacy_docs (list(``spacy.Doc``)): documents to convert
        spacy_vocab (``spacy.Vocab``): vocabulary indexed by the token IDs
            that ``count_by`` returns, yielding lexemes
        lemmatize (bool): if True, words are counted by lemma; otherwise by
            the original form of the string as it appears in the doc
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list(list((int, int))): list of bag-of-words documents, where each doc
            is a list of (integer word ID, word count) 2-tuples ordered by
            word ID
    """
    # The options never change between docs, so the rejection tests are
    # assembled once, in the order stops -> punctuation -> numbers.
    rejecters = []
    if filter_stops is True:
        rejecters.append(lambda lexeme: lexeme.is_stop)
    if filter_punct is True:
        rejecters.append(lambda lexeme: lexeme.is_punct)
    if filter_nums is True:
        rejecters.append(lambda lexeme: lexeme.like_num)

    gdict = Dictionary()
    stringstore = StringStore()
    doc_freqs = Counter()
    gcorpus = []

    for spacy_doc in spacy_docs:
        if lemmatize is True:
            raw_counts = spacy_doc.count_by(attrs.LEMMA)
        else:
            raw_counts = spacy_doc.count_by(attrs.ORTH)

        lexeme_counts = ((spacy_vocab[tok_id], n)
                         for tok_id, n in raw_counts.items())
        # ``any`` stops at the first rejecter that matches, so a lexeme is
        # dropped as soon as one enabled filter flags it.
        bow = [(stringstore[lexeme.orth_], n)
               for lexeme, n in lexeme_counts
               if not any(reject(lexeme) for reject in rejecters)]
        bow.sort(key=itemgetter(0))

        # Update document frequencies and the dictionary's corpus statistics.
        doc_freqs.update(word_id for word_id, _ in bow)
        gdict.num_docs += 1
        gdict.num_pos += sum(n for _, n in bow)
        gdict.num_nnz += len(bow)

        gcorpus.append(bow)

    # Strings are numbered by their position in the store's iteration order.
    gdict.token2id = {string: idx for idx, string in enumerate(stringstore)}
    gdict.dfs = dict(doc_freqs)

    return gdict, gcorpus

예제 #3

파일 보기

파일: pytopia2gensim.py 프로젝트: dkorenci/doc-topic-coherence

def pytopia2gensimDict(dict_):
    """
    Create a gensim dictionary from a pytopia dictionary.

    This is necessary since building gensim models requires a gensim
    dictionary, but pytopia model builders must be able to receive a generic
    pytopia Dictionary as parameter.

    Args:
        dict_: pytopia dictionary, or whatever ``resolve`` accepts in its
            place. The resolved object must be iterable over tokens and
            indexable by token to obtain that token's index
            (presumably an integer -- confirm against the pytopia API).

    Returns:
        GensimDict: dictionary whose ``token2id`` / ``id2token`` mirror the
        pytopia token indices. The corpus statistics are placeholders: every
        token gets a document frequency of 1 in a single-document corpus.
    """
    dict_ = resolve(dict_)
    # (token, index) pairs, ordered by index
    toki = sorted(((tok, dict_[tok]) for tok in dict_), key=lambda ti: ti[1])
    # directly set gensim dict data structures,
    # this works for gensim 0.12.4
    gdict = GensimDict()
    gdict.token2id = {tok: i for tok, i in toki}
    gdict.id2token = {i: tok for tok, i in toki}
    # gensim keys ``dfs`` by token id, not by token string; keying it by the
    # token would make id-based lookups (e.g. in filter_extremes or tf-idf)
    # miss every entry.
    gdict.dfs = {i: 1 for _, i in toki}
    gdict.num_docs = 1  # number of documents processed
    gdict.num_pos = len(toki)  # total number of corpus positions
    gdict.num_nnz = len(toki)  # total number of non-zeroes in the BOW matrix
    return gdict