class WikiCorpus(interfaces.CorpusABC):
    """
    Treat a Wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.

    The documents are extracted on-the-fly, so that the whole (massive) dump
    can stay compressed on disk.

    >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
    >>> wiki.saveAsText('wiki_en_vocab200k') # another 8h, creates a file in MatrixMarket format plus file with id->word

    """
    def __init__(self, fname, noBelow=20, keep_words=200000, dictionary=None):
        """
        Initialize the corpus. This scans the corpus once, to determine its
        vocabulary (only the first `keep_words` most frequent words that appear
        in at least `noBelow` documents are kept).
        """
        self.fname = fname
        if dictionary is None:
            self.dictionary = Dictionary(self.getArticles())
            self.dictionary.filterExtremes(noBelow=noBelow, noAbove=0.1, keepN=keep_words)
        else:
            self.dictionary = dictionary


    def __len__(self):
        return self.numDocs


    def __iter__(self):
        """
        The function that defines a corpus -- iterating over the corpus yields
        vectors, one for each document.
        """
        for docNo, text in enumerate(self.getArticles()):
            yield self.dictionary.doc2bow(text, allowUpdate=False)


    def saveDictionary(self, fname):
        """
        Store id->word mapping to a file, in format
        `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`.
        """
        logger.info("saving dictionary mapping to %s" % fname)
        fout = open(fname, 'w')
        for token, tokenId in sorted(self.dictionary.token2id.iteritems()):
            fout.write("%i\t%s\t%i\n" % (tokenId, token, self.dictionary.docFreq[tokenId]))
        fout.close()


    @staticmethod
    def loadDictionary(fname):
        """
        Load previously stored mapping between words and their ids.

        The result can be used as the `id2word` parameter for input to transformations.
        """
        result = {}
        for lineNo, line in enumerate(open(fname)):
            cols = line[:-1].split('\t')
            if len(cols) == 2:
                wordId, word = cols
            elif len(cols) == 3:
                wordId, word, docFreq = cols
            else:
                continue
            result[int(wordId)] = word # docFreq not used
        return result


    def saveAsText(self, fname):
        """
        Store the corpus to disk, in a human-readable text format.

        This actually saves two files:

        1. Document-term co-occurrence frequency counts (bag-of-words), as
           a Matrix Market file `fname_bow.mm`.
        2. Token to integer mapping, as a text file `fname_wordids.txt`.
        """
        self.saveDictionary(fname + '_wordids.txt')
        matutils.MmWriter.writeCorpus(fname + '_bow.mm', self, progressCnt=10000)


    def getArticles(self):
        """
        Iterate over the dump, returning text version of each article.

        Only articles of sufficient length are returned (short articles,
        redirects etc. are ignored).
        """
        articles, intext = 0, False
        for lineno, line in enumerate(bz2.BZ2File(self.fname)):
            if line.startswith('      <text'):
                intext = True
                line = line[line.find('>') + 1:]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find('</text>') # can be on the same line as <text>
            if pos >= 0:
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filterWiki(''.join(lines))
                if len(text) > ARTICLE_MIN_CHARS: # article redirects are pruned here
                    articles += 1
                    yield tokenize(text) # split text into tokens
        self.numDocs = articles # cache corpus length
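

# Usage sketch (illustrative only): end-to-end conversion of a Wikipedia dump into a
# bag-of-words corpus plus id->word mapping, following the example from the class
# docstring. The dump filename and the 'wiki_en_vocab200k' output prefix are just the
# docstring's example names, not fixed values; both passes take several hours.
if __name__ == '__main__':
    wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # first pass: build the vocabulary
    wiki.saveAsText('wiki_en_vocab200k') # second pass: writes wiki_en_vocab200k_bow.mm + wiki_en_vocab200k_wordids.txt

    # the stored mapping can later be reloaded and passed to transformations as `id2word`
    id2word = WikiCorpus.loadDictionary('wiki_en_vocab200k_wordids.txt')
    logger.info("loaded %i word ids" % len(id2word))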