Example #1
File: models.py  Project: Kamaros/fnc-1
import os

from gensim.corpora import HashDictionary


def hashdictionary_corpus(dataframe, id_range=32000):
    """Returns a HashDictionary mapping words to ids.

    Precomputed HashDictionaries are read from file if previously cached, or generated then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    id_range : int
        The maximum number of ids available.

    Returns
    -------
    dictionary : Gensim HashDictionary
        HashDictionary mapping words to ids.
    """
    filename = 'caches/models/dictionary_{}.model'.format(id_range)

    if not os.path.isfile(filename):
        # text_corpus is a project-level helper that yields the tokenized
        # documents of the DataFrame.
        corpus = text_corpus(dataframe)
        dictionary = HashDictionary(corpus, id_range=id_range)
        dictionary.save(filename)
    else:
        dictionary = HashDictionary.load(filename)

    return dictionary
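
A hedged usage sketch: the project reads FNC-1 data into a DataFrame whose documents the project-level text_corpus helper tokenizes; the file name below is illustrative, not taken from the source.

import pandas as pd

# Hypothetical input file; the real project loads the FNC-1 dataset.
df = pd.read_csv('train_bodies.csv')
dictionary = hashdictionary_corpus(df, id_range=32000)

# token2id is populated because HashDictionary defaults to debug=True.
print(len(dictionary.token2id), 'distinct tokens hashed')
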
Example #2
import glob
from pathlib import Path

from gensim import utils
from gensim.corpora import Dictionary, HashDictionary, MmCorpus

# Value used by gensim's make_wikicorpus script; defined here because the
# snippet's original imports were not included.
DEFAULT_DICT_SIZE = 100000


def preprocess(text):
    # The function head was truncated in the original snippet; this body is a
    # minimal reconstruction that tokenizes and drops empty tokens.
    tokens = utils.tokenize(text, lower=True, errors='ignore')
    tokens = list(filter(None, tokens))
    return tokens


class Corpus(object):
    """Stream the .txt files in the working directory one document at a time."""

    def __iter__(self):
        for file in glob.glob("*.txt"):
            print(file)
            paper = Path(file).read_text(encoding='utf8')
            yield paper


corpus_memory_friendly = Corpus()
# Note: list() pulls every paper into memory at once, forfeiting the streaming
# behaviour; iterate over the Corpus directly to stay memory-friendly.
papers = list(corpus_memory_friendly)

texts = [list(preprocess(t)) for t in papers]

# define the dictionary:
dictionary = Dictionary(texts)
dictionary.save('reasoning_corpura.dict')

corpus = [dictionary.doc2bow(text) for text in texts]
MmCorpus.serialize('reasoning_bow.mm', corpus)


hash_dictionary = HashDictionary(texts)
hash_dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
hash_dictionary.save_as_text('reasoning_wordids.txt.bz2')
hash_dictionary.save('reasoning_corpura_hash.dict')
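
For a later session, the artifacts saved above can be loaded back with gensim's standard load APIs; a minimal sketch using the file names from this example:

from gensim.corpora import Dictionary, HashDictionary, MmCorpus

# Reload the plain dictionary and stream the serialized bag-of-words corpus.
dictionary = Dictionary.load('reasoning_corpura.dict')
bow = MmCorpus('reasoning_bow.mm')  # documents are read lazily from disk

# Reload the hashed dictionary.
hash_dictionary = HashDictionary.load('reasoning_corpura_hash.dict')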

Example #3
"""Build Dictionary/HashDictionary models and a bag-of-words corpus from the
Kaggle NIPS papers dataset (papers.csv).

Download it from:

    https://www.kaggle.com/benhamner/nips-papers/downloads/papers.csv/2

"""
import pandas as pd

from gensim import utils
from gensim.corpora import Dictionary, HashDictionary, MmCorpus

# Value used by gensim's make_wikicorpus script; the snippet's own imports
# were not included in the original.
DEFAULT_DICT_SIZE = 100000

papers = pd.read_csv('papers.csv')
corpus = list(papers['paper_text'])

print("corpus size: ", len(corpus))

# ToDo: check performance with lemmatization: gensim.utils.lemmatize
# (requires the Pattern library on gensim 3.x; removed in gensim 4.0)

tokenized_corpus = [[
    utils.to_unicode(token)
    for token in utils.tokenize(corpus_item, lower=True, errors='ignore')
] for corpus_item in corpus]

hash_dictionary = HashDictionary(tokenized_corpus)

# doc2bow expects a list of tokens, not a raw string, so iterate over the
# tokenized corpus here.
bow_corpus = [hash_dictionary.doc2bow(text) for text in tokenized_corpus]
MmCorpus.serialize('nips_bow.mm', bow_corpus, progress_cnt=10000)

hash_dictionary.filter_extremes(no_below=20,
                                no_above=0.1,
                                keep_n=DEFAULT_DICT_SIZE)
hash_dictionary.save_as_text('nips_wordids.txt.bz2')
hash_dictionary.save('nips_corpura_hash.dict')

dictionary = Dictionary(tokenized_corpus)
dictionary.save('nips_corpura.dict')
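
One reason to keep the HashDictionary alongside the plain Dictionary: its token ids come from a hash function over a fixed id_range, so unseen documents can be converted to bag-of-words without growing the vocabulary. A minimal sketch (the sample sentence is made up):

# Hash an unseen document straight into bag-of-words form; no update to
# the dictionary is required because ids are computed, not assigned.
new_doc = list(utils.tokenize("sparse coding for nips papers", lower=True))
print(hash_dictionary.doc2bow(new_doc))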