def get_texts(self):
    """
    Iterate over the dump, yielding the text of each article as a list of tokens.

    Only articles with more than `ARTICLE_MIN_WORDS` tokens are yielded
    (short articles, redirects etc. are ignored).

    Note that this iterates over the **texts**; if you want vectors, just use
    the standard corpus interface instead of this function::

    >>> for vec in wiki_corpus:
    >>>     print(vec)

    After the dump has been consumed completely, the number of yielded
    articles is cached in `self.length`.
    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0
    texts = ((text, self.lemmatize) for _, text in _extract_pages(bz2.BZ2File(self.fname)))
    pool = multiprocessing.Pool(self.processes)
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens in pool.imap(process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) > ARTICLE_MIN_WORDS:  # article redirects are pruned here
                    articles += 1
                    positions += len(tokens)
                    yield tokens
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    logger.info(
        "finished iterating over Wikipedia corpus of %i documents with %i positions"
        " (total %i articles, %i positions before pruning articles shorter than %i words)",
        articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
    self.length = articles  # cache corpus length
def get_texts(self):
    """
    Iterate over the dump, yielding one `(text, title, pageid)` triple per page.

    Every page returned by `extract_pages` is passed to `process_article` as a
    `(text, title, pageid)` tuple and the triple it returns is yielded as is.
    No length or namespace pruning is done in this method.

    Note that this iterates over the **texts**; if you want vectors, just use
    the standard corpus interface instead of this function::

    >>> for vec in wiki_corpus:
    >>>     print(vec)

    After the dump has been consumed completely, the number of yielded
    articles is cached in `self.length`.
    """
    articles = 0
    texts = ((text, title, pageid) for title, text, pageid in extract_pages(
        bz2.BZ2File(self.fname), self.filter_namespaces))
    pool = multiprocessing.Pool(self.processes)
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for text, title, pageid in pool.imap(process_article, group):
                articles += 1
                yield text, title, pageid
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    self.length = articles  # cache corpus length
def get_claims(self):
    """
    Run `get_article_claims` over every page of the dump and collect the results.

    Each page is sent to a worker pool as the tuple
    `(text, self.lemmatize, title, pageid, self.set_citation, self.quote_identifiers)`.
    Once all pages have been processed, `get_query(self.dictionary,
    self.articlecount)` is called on every entry of each result's `claims`
    attribute.

    NOTE(review): an earlier docstring said the claims (those marked with the
    "citation needed" template) are written to a pseudo-XML file called
    "output"; nothing is written to disk by this method.

    Returns:
        list: one `get_article_claims` result per page.
    """
    texts = ((text, self.lemmatize, title, pageid, self.set_citation, self.quote_identifiers)
             for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
    pool = multiprocessing.Pool(self.processes)
    claim_list = []
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            claim_list.extend(pool.imap(get_article_claims, group))
    finally:
        # release the worker processes even if a worker raises
        pool.terminate()

    for article_claims in claim_list:
        for claim in article_claims.claims:
            claim.get_query(self.dictionary, self.articlecount)
    return claim_list
def get_texts(self):
    """
    Iterate over the pages from `self.pages_gen()`, yielding tokenised articles.

    Articles with fewer than `ARTICLE_MIN_WORDS` tokens, and articles whose
    title starts with one of the `IGNORED_NAMESPACES` followed by a colon,
    are skipped.

    Yields `(tokens, (pageid, title))` when `self.metadata` is truthy,
    otherwise just `tokens`. After the input has been consumed completely,
    the number of yielded articles is cached in `self.length`.
    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0
    texts = ((text, self.lemmatize, title, pageid) for pageid, title, text in self.pages_gen())
    pool = multiprocessing.Pool(self.processes)
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < ARTICLE_MIN_WORDS or any(
                        title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    logger.info(
        "finished iterating over Wikipedia corpus of %i documents with %i positions"
        " (total %i articles, %i positions before pruning articles shorter than %i words)",
        articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
    self.length = articles  # cache corpus length
def __iter__(self):
    """
    Yield `self.dictionary.doc2bow(...)` for every path from `self.file_path_iter()`.

    Paths are tokenised by `tokenized_from_file` in a pool of `pool_size`
    worker processes, 1000 paths per chunk.
    """
    pool = multiprocessing.Pool(pool_size)
    try:
        for file_chunk in utils.chunkize(self.file_path_iter(), chunksize=1000, maxsize=20):
            for doc_tokenized in pool.imap(tokenized_from_file, file_chunk):
                yield self.dictionary.doc2bow(doc_tokenized)
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()
def load_corpus(self):
    """
    Feed every file from `self.file_path_iter()` into `self.dictionary`.

    Paths are tokenised by `tokenized_from_file` in a pool of `pool_size`
    worker processes, 1000 paths per chunk. Each chunk's tokenised documents
    are added to the dictionary with `prune_at=200000`, and the chunk's paths
    are appended to `self.document_file_names`.
    """
    pool = multiprocessing.Pool(pool_size)
    try:
        for file_chunk in utils.chunkize(self.file_path_iter(), chunksize=1000, maxsize=20):
            results = pool.imap(tokenized_from_file, file_chunk)
            self.dictionary.add_documents(results, prune_at=200000)
            self.document_file_names += list(file_chunk)
    finally:
        # release the worker processes even if tokenisation or the
        # dictionary update raises
        pool.terminate()
def vanilla_chunk(unigrams, n):
    """
    Split every document of `unigrams` into chunks and return them in one flat list.

    Each document is handed to `chunkize` with a chunk size of
    `len(doc) // n`.

    Args:
        unigrams: iterable of sized documents.
        n: divisor applied to each document's length to get its chunk size.

    Returns:
        list: the chunks of all documents, in input order.
    """
    chunks = []
    for doc in unigrams:
        # floor division keeps the chunk size an integer on Python 3 as well
        # (plain `/` yields a float there)
        chunk_len = len(doc) // n
        chunks.extend(chunkize(doc, chunk_len))
    return chunks
def map_wikidocs2lda(num_topics):
    """
    Map every Wikipedia document to its LDA representation and write it to disk.

    The model is loaded from `LDA_MODEL_FILENAME(num_topics)`; documents from
    `wiki_document_generator()` are converted by `wiki2LDA` in a worker pool,
    100 documents per batch. Output goes to `topical_documents.gz` inside
    `LDA_MODEL_DIR(num_topics)`, one line per document::

        wiki_id \\t index:value \\t index:value ... \\n

    Args:
        num_topics: passed to `LDA_MODEL_FILENAME` and `LDA_MODEL_DIR` to
            locate the model and the output directory.
    """
    logger = logging.getLogger('Wiki2LDA')
    pool = Pool(cpu_count())
    try:
        logger.info('Loading LDA model...')
        lda = gensim.models.ldamodel.LdaState.load(LDA_MODEL_FILENAME(num_topics))

        # Mapping between doc-id -> topics
        logger.info('Mapping Wikipedia documents to LDA model...')
        doc_topics = os.path.join(LDA_MODEL_DIR(num_topics), 'topical_documents.gz')
        with smart_open(doc_topics, 'wb') as f:
            n = 0
            for wiki_docs in utils.chunkize(wiki_document_generator(), 100):
                lda_wiki_docs = pool.map(wiki2LDA, [(lda, wiki_doc) for wiki_doc in wiki_docs])
                # Writes to file as 'wiki_id \t index:value \n' format.
                for wiki_id, lda_text in lda_wiki_docs:
                    embedding = ['{0}:{1:.10f}'.format(index, value) for index, value in lda_text]
                    line = str(wiki_id) + '\t' + '\t'.join(embedding) + '\n'
                    # the file is opened in binary mode, so encode explicitly
                    f.write(line.encode('utf-8'))
                n += len(wiki_docs)
                logger.info('{0} documents mapped to LDA embedding.'.format(n))
    finally:
        # the pool was never shut down before; always release the workers
        pool.terminate()
def get_texts_sent_split(self):
    """
    Iterate over the dump, yielding each article's text split into parts.

    `process_article` is expected to return `(categories, tokens, title,
    pageid)` where `tokens` is a mapping; presumably it maps a part/section
    key to a list of tokenised sentences (TODO confirm against
    `process_article`). One item is yielded per mapping entry.

    Articles are skipped when `categories` is None, when the total number of
    tokens over all entries is below `ARTICLE_MIN_WORDS`, or when the title
    starts with one of the `IGNORED_NAMESPACES` followed by a colon.

    Yields `(value, (pageid, key))` per mapping entry when `self.metadata` is
    truthy, otherwise just `value`.
    """
    texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(
        bz2.BZ2File(self.fname), self.filter_namespaces))
    pool = multiprocessing.Pool(self.processes)
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for categories, tokens, title, pageid in pool.imap(process_article, group):
                if categories is None:
                    continue
                # count the innermost elements over both nesting levels
                token_count = sum(
                    1 for part in tokens.values() for sentence in part for _ in sentence)
                # article redirects and short stubs are pruned here
                if token_count < ARTICLE_MIN_WORDS or any(
                        title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                    continue
                if self.metadata:
                    for key, part in tokens.items():
                        yield (part, (pageid, key))
                else:
                    for part in tokens.values():
                        yield part
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()
def get_texts(self):
    """
    Iterate over the dump, yielding the text of each article as a list of tokens.

    Articles with fewer than `ARTICLE_MIN_WORDS` tokens, and articles whose
    title starts with one of the `IGNORED_NAMESPACES` followed by a colon,
    are skipped.

    Yields `(tokens, (pageid, title))` when `self.metadata` is truthy,
    otherwise just `tokens`. After the dump has been consumed completely,
    the number of yielded articles is cached in `self.length`.
    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0
    texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(
        bz2.BZ2File(self.fname), self.filter_namespaces))
    pool = multiprocessing.Pool(self.processes)
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < ARTICLE_MIN_WORDS or any(
                        title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    logger.info(
        "finished iterating over Wikipedia corpus of %i documents with %i positions"
        " (total %i articles, %i positions before pruning articles shorter than %i words)",
        articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
    self.length = articles  # cache corpus length
def get_texts(self):
    """
    Iterate over the reviews in `self.fname`, yielding each one as a list of tokens.

    Reviews are read lazily from `_extract_reviews` and tokenised by
    `process_review` in a worker pool. No review is pruned. After the input
    has been consumed completely, the number of reviews is cached in
    `self.length`.
    """
    reviews = 0
    positions = 0
    # keep this lazy: building a list here would hold the whole corpus in RAM
    # and defeat the chunked processing below
    texts = _extract_reviews(self.fname)
    pool = multiprocessing.Pool(self.processes)
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens in pool.imap(process_review, group):
                reviews += 1
                positions += len(tokens)
                yield tokens
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    # the previous message also reported pruning figures (with a hard-coded
    # 10000-word limit) although nothing is pruned here
    logger.info(
        "finished iterating over the generated Yelp corpus of %i documents with %i positions",
        reviews, positions)
    self.length = reviews  # cache corpus length
def get_texts(self):
    """
    Iterate over the dump, yielding the text of each article as a list of tokens.

    Articles with fewer than `ARTICLE_MIN_WORDS` tokens, and articles whose
    title starts with one of the `IGNORED_NAMESPACES` followed by a colon,
    are skipped. Iteration stops after `self.max_batch` chunks of
    `10 * self.processes` pages when `self.max_batch` is truthy.

    Yields `(title, tokens)` when `self.metadata` is truthy, otherwise just
    `tokens`. When iteration ends, the number of yielded articles is cached
    in `self.length`.

    Note that this iterates over the **texts**; if you want vectors, just use
    the standard corpus interface instead of this function::

    >>> for vec in wiki_corpus:
    >>>     print(vec)
    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0
    texts = ((text, title, pageid) for title, text, pageid in extract_pages(
        bz2.BZ2File(self.fname), self.filter_namespaces))
    batch_idx = 0
    pool = multiprocessing.Pool(self.processes)
    try:
        # Process the corpus in smaller chunks of docs, because
        # multiprocessing.Pool would otherwise load the entire input into
        # RAM at once
        for group in chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                to_ignored = any(
                    title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES)
                if len(tokens) < ARTICLE_MIN_WORDS or to_ignored:
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield title, tokens
                else:
                    yield tokens
            batch_idx += 1
            if self.max_batch and batch_idx == self.max_batch:
                break
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    logger.info(
        "Finished iterating over Wikipedia corpus of %i documents with "
        "%i positions (total %i articles, %i positions before pruning "
        "articles shorter than %i words)", articles, positions,
        articles_all, positions_all, ARTICLE_MIN_WORDS)
    self.length = articles
def get_texts(self):
    """
    Iterate over the dump, yielding the text of each article as a list of tokens.

    Only articles of sufficient length are yielded (short articles, redirects
    etc. are ignored). The minimum length is controlled by
    `article_min_tokens` on the class instance. Articles whose title starts
    with one of the `IGNORED_NAMESPACES` followed by a colon are skipped too.

    Yields `(tokens, (pageid, title))` when `self.metadata` is truthy,
    otherwise just `tokens`. A `KeyboardInterrupt` ends the iteration with a
    warning instead of propagating; `self.length` is only cached when the
    dump was consumed completely.

    Note that this iterates over the **texts**; if you want vectors, just use
    the standard corpus interface instead of this function::

    >>> for vec in wiki_corpus:
    >>>     print(vec)
    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0

    tokenization_params = (self.tokenizer_func, self.token_min_len, self.token_max_len, self.lower)
    texts = (
        (text, self.lemmatize, title, pageid, tokenization_params)
        for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)
    )
    pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(_process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < self.article_min_tokens or \
                        any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
    except KeyboardInterrupt:
        # report the threshold actually used for pruning (article_min_tokens)
        logger.warning(
            "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, self.article_min_tokens
        )
    else:
        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, self.article_min_tokens
        )
        self.length = articles  # cache corpus length
    finally:
        pool.terminate()
def parse_wiki_dump(infile, min_words, process_function, processes=multiprocessing.cpu_count() - 2):
    """
    Yield articles from a bz2 Wikipedia dump `infile` as (title, tokens) 2-tuples.

    Only articles of sufficient length are returned (short articles &
    redirects etc are ignored). Uses multiple processes to speed up the
    parsing in parallel.

    Args:
        infile (str) : path to bz2 Wikipedia dump
        min_words (int) : skip article if it has less than this many words
        process_function (function) : preprocessing function; called on each
            item produced by `wikicorpus._extract_pages` and must return a
            (title, tokens) pair
        processes (int) : number of cores to be used; values below 1 are
            raised to 1
    """
    # the default (cpu_count() - 2) is zero or negative on machines with one
    # or two cores, which multiprocessing.Pool rejects
    processes = max(1, processes)

    logger.info("Start processing Wikipedia dump `{}`".format(infile))
    articles, articles_all = 0, 0

    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    # str.startswith accepts a tuple, so build the prefixes once
    ignored_prefixes = tuple(ignore + ':' for ignore in ignore_namespaces)

    pool = multiprocessing.Pool(processes)
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise try to load the entire dump into RAM
        texts = wikicorpus._extract_pages(bz2.BZ2File(infile))  # generator
        for group in chunkize(texts, chunksize=10 * processes):
            for title, tokens in pool.imap(process_function, group):
                if articles_all % 10000 == 0:
                    logger.info(
                        "PROGRESS: at article #{} accepted {} articles".format(
                            articles_all, articles))
                articles_all += 1

                # article redirects and short stubs are pruned here
                if title.startswith(ignored_prefixes) or len(tokens) < min_words:
                    continue

                # all good: use this article
                articles += 1
                yield title, tokens
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    logger.info(
        "finished iterating over Wikipedia corpus of {} documents with total {} articles"
        .format(articles, articles_all))
def get_texts(self):
    """
    Iterate over the dump, yielding the text of each article as a list of tokens.

    Only articles of sufficient length are yielded: articles with fewer than
    `ARTICLE_MIN_WORDS` tokens are skipped, as are articles whose title
    starts with one of the non-article namespaces listed below.

    Yields `(tokens, (pageid, title))` when `self.metadata` is truthy,
    otherwise just `tokens`. After the dump has been consumed completely,
    the number of yielded articles is cached in `self.length`.

    Note that this iterates over the **texts**; if you want vectors, just use
    the standard corpus interface instead of this function::

    >>> for vec in wiki_corpus:
    >>>     print(vec)
    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0
    texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(
        bz2.BZ2File(self.fname), self.filter_namespaces))

    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    # str.startswith accepts a tuple, so build the prefixes once
    ignored_prefixes = tuple(ignore + ':' for ignore in ignore_namespaces)

    pool = multiprocessing.Pool(self.processes)
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < ARTICLE_MIN_WORDS or title.startswith(ignored_prefixes):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    logger.info(
        "finished iterating over Wikipedia corpus of %i documents with %i positions"
        " (total %i articles, %i positions before pruning articles shorter than %i words)",
        articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
    self.length = articles  # cache corpus length
def __iter__(self):
    """
    Yield the result of `process_file` for every file under `self.directory`.

    Each worker call receives a `(file, self.out_dir)` pair. Files are handed
    to the pool in chunks of `self.job_size * self.n_workers`. After the
    input has been consumed completely, the number of results is cached in
    `self.length`.
    """
    files = iter_files(self.directory)
    posts = 0
    pool = multiprocessing.Pool(self.n_workers)
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(files, chunksize=self.job_size * self.n_workers, maxsize=1):
            for result in pool.imap(process_file, zip(group, itertools.repeat(self.out_dir))):
                posts += 1
                yield result
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    logger.info("finished iterating over corpus of %i documents", posts)
    self.length = posts  # cache corpus length
def get_texts(self):
    """
    Iterate over the dump, yielding the text of each article as a list of tokens.

    Only articles of sufficient length are yielded: articles with fewer than
    `ARTICLE_MIN_WORDS` tokens are skipped, as are articles whose title
    starts with one of the non-article namespaces listed below.

    Yields `(tokens, (pageid, title))` when `self.metadata` is truthy,
    otherwise just `tokens`. After the dump has been consumed completely,
    the number of yielded articles is cached in `self.length`.

    Note that this iterates over the **texts**; if you want vectors, just use
    the standard corpus interface instead of this function::

    >>> for vec in wiki_corpus:
    >>>     print(vec)
    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0
    texts = (
        (text, self.lemmatize, title, pageid)
        for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)
    )

    ignore_namespaces = "Wikipedia Category File Portal Template MediaWiki User Help Book Draft".split()
    # str.startswith accepts a tuple, so build the prefixes once
    ignored_prefixes = tuple(ignore + ":" for ignore in ignore_namespaces)

    pool = multiprocessing.Pool(self.processes)
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < ARTICLE_MIN_WORDS or title.startswith(ignored_prefixes):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    logger.info(
        "finished iterating over Wikipedia corpus of %i documents with %i positions"
        " (total %i articles, %i positions before pruning articles shorter than %i words)",
        articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
    )
    self.length = articles  # cache corpus length
def populate_database(self):
    """
    Process every wiki page and insert the kept articles into the database.

    Pages come from `self.extract_wiki_pages(self.dbconfig.dataset_dir)` and
    are passed one at a time to `self.process_article` as `(text, title, id)`
    tuples. Every `(text, token_ids, title, pageid)` item it produces is
    inserted into `self.db[self.dbconfig.collection_name]`, unless it has
    fewer than `ARTICLE_MIN_WORDS` token ids or its title starts with one of
    the `IGNORED_NAMESPACES` followed by a colon.

    A `KeyboardInterrupt` stops the run with a summary instead of
    propagating. On normal completion the number of inserted articles is
    stored in `self.length`.
    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0

    texts = ((doc["text"], doc["title"], doc["id"])
             for doc in self.extract_wiki_pages(self.dbconfig.dataset_dir))

    try:
        # read the pages in chunks of 1000; each page is processed serially
        for group in utils.chunkize(texts, chunksize=1000, maxsize=1):
            for page in group:
                for text, tokens_ids, title, pageid in self.process_article(page):
                    articles_all += 1
                    positions_all += len(tokens_ids)
                    # article redirects and short stubs are pruned here
                    if len(tokens_ids) < ARTICLE_MIN_WORDS or any(
                            title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                        continue
                    articles += 1
                    positions += len(tokens_ids)
                    document = {
                        "article": articles,
                        "title": title,
                        "text": text,
                        "token_ids": tokens_ids,
                        "pageid": pageid,
                    }
                    self.db[self.dbconfig.collection_name].insert_one(document)
    except KeyboardInterrupt:
        # print() does not interpolate logging-style arguments, so format
        # the message explicitly
        print(
            "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)"
            % (articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS))
    else:
        print(
            "finished iterating over Wikipedia corpus of %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)"
            % (articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS))
        # previously assigned to a throwaway local, so nothing was cached
        self.length = articles  # cache corpus length
def get_texts(self):
    """
    Iterate over the dump, yielding the text of each article as a list of tokens.

    Pages are read with `extract_pages_without_namespaces`. Articles with
    fewer than `self.article_min_tokens` tokens, and articles whose title
    starts with one of the `IGNORED_NAMESPACES` followed by a colon, are
    skipped.

    Yields `(tokens, (pageid, title))` when `self.metadata` is truthy,
    otherwise just `tokens`. A `KeyboardInterrupt` ends the iteration with a
    warning instead of propagating; `self.length` is only cached when the
    dump was consumed completely.
    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0

    tokenization_params = (self.tokenizer_func, self.token_min_len, self.token_max_len, self.lower)
    texts = (
        (text, self.lemmatize, title, pageid, tokenization_params)
        for title, text, pageid
        in extract_pages_without_namespaces(bz2.BZ2File(self.fname), self.filter_namespaces)
    )
    pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

    try:
        # hand the pages to the pool in chunks of 10 per worker process
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(_process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) < self.article_min_tokens or \
                        any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
    except KeyboardInterrupt:
        # report the threshold actually used for pruning (article_min_tokens)
        logger.warning(
            "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, self.article_min_tokens)
    else:
        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, self.article_min_tokens)
        self.length = articles  # cache corpus length
    finally:
        pool.terminate()
def get_texts(self):
    """
    Iterate over the messages from `get_messages(self.conn)`, yielding tokenised posts.

    Each message is tokenised by `process_post` in a worker pool. Posts with
    fewer than `ARTICLE_MIN_WORDS` tokens are skipped.

    Yields `(tokens, (repr(pageid), title))` when `self.metadata` is truthy,
    otherwise just `tokens`.
    """
    texts = ((content, self.lemmatize, subject, pageid)
             for subject, content, pageid in get_messages(self.conn))
    pool = multiprocessing.Pool(self.processes)
    posts, token_count = 0, 0
    try:
        # hand the messages to the pool in chunks of 10 per worker process
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(process_post, group):
                if len(tokens) < ARTICLE_MIN_WORDS:
                    continue
                if self.metadata:
                    yield (tokens, (repr(pageid), title))
                else:
                    yield tokens
                posts += 1
                token_count += len(tokens)
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    log.info("Processed %d posts with %d tokens", posts, token_count)
def get_texts(self):
    """
    Iterate over the Wikipedia dump and then the HN articles, yielding token lists.

    Wikipedia pages from `self.wiki_file` are processed in a worker pool and
    yielded when they have more than `WIKI_ARTICLE_MIN_WORDS` tokens. Then
    every `*.txt` file in `self.hn_folder` is read, lemmatised (when
    `self.lemmatize` is truthy) or tokenised, and yielded when it has more
    than `HN_ARTICLE_MIN_WORDS` tokens.

    After both sources have been consumed completely, the total number of
    yielded documents is cached in `self.length`.
    """
    wiki_articles, hn_articles, articles_all = 0, 0, 0
    positions, positions_all = 0, 0

    # ************ Wikipedia ************
    texts = ((text, self.lemmatize)
             for _, text in wikicorpus._extract_pages(bz2.BZ2File(self.wiki_file)))
    pool = multiprocessing.Pool(self.processes)
    try:
        # chunk the input, otherwise imap puts all the corpus into memory
        for group in utils.chunkize(texts, chunksize=10 * pool._processes, maxsize=1):
            for tokens in pool.imap(wikicorpus.process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) > WIKI_ARTICLE_MIN_WORDS:
                    wiki_articles += 1
                    positions += len(tokens)
                    yield tokens
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    print (">>> finished iterating over Wikipedia corpus of %i documents with %i positions (total %i articles, %i positions before pruning articles shorter than %i words)" % (wiki_articles, positions, articles_all, positions_all, WIKI_ARTICLE_MIN_WORDS))

    # ************ HN articles ************
    positions_after_wiki = positions
    # snapshot the file list before yielding anything
    fnamelist = list(glob.iglob(self.hn_folder + '/*.txt'))
    for fname in fnamelist:  # TODO parallelize as Wiki
        # close each file as soon as it has been read
        with open(fname) as hn_file:
            hn_text = hn_file.read()
        if self.lemmatize:
            result = utils.lemmatize(hn_text)  # text into lemmas here
        else:
            result = tokenize(hn_text)  # text into tokens here
        articles_all += 1
        positions_all += len(result)
        if len(result) > HN_ARTICLE_MIN_WORDS:
            hn_articles += 1
            positions += len(result)
            yield result

    print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions - positions_after_wiki))
    # ************ /HN articles ************

    self.length = wiki_articles + hn_articles  # cache corpus length
def process_wiki_dump(source, target, processes=None):
    """
    Write the sentences of every page in `source` to `target`, one per line.

    Pages are extracted from the UTF-8 text file `source` (namespace '0'
    only) and turned into sentences by `process_page` in a worker pool.

    Args:
        source: path of the dump file to read.
        target: path of the text file to write.
        processes: number of worker processes; defaults to the CPU count
            minus one, but at least 1.
    """
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    print(processes)

    with open(source, 'r', encoding='utf-8') as dump_file:
        with open(target, 'w', encoding='utf-8') as out_file:
            pages = extract_pages(dump_file, filter_namespaces={'0'})
            with multiprocessing.Pool(processes) as workers:
                # hand the pages to the pool in chunks of 10 per worker process
                batches = utils.chunkize(pages, chunksize=10 * processes, maxsize=1)
                for batch in batches:
                    for sentences, _title, _pageid in workers.imap(process_page, batch):
                        out_file.writelines(sentence + '\n' for sentence in sentences)
def get_texts(self):
    """
    Iterate over the dump, yielding the text of each article as a list of tokens.

    Only articles with more than `ARTICLE_MIN_WORDS` tokens are yielded
    (short articles, redirects etc. are ignored).

    Note that this iterates over the **texts**; if you want vectors, just use
    the standard corpus interface instead of this function::

    >>> for vec in wiki_corpus:
    >>>     print(vec)

    After the dump has been consumed completely, the number of yielded
    articles is cached in `self.length`.
    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0
    texts = ((text, self.lemmatize) for _, text in _extract_pages(bz2.BZ2File(self.fname)))
    pool = multiprocessing.Pool(self.processes)
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens in pool.imap(process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) > ARTICLE_MIN_WORDS:
                    articles += 1
                    positions += len(tokens)
                    yield tokens
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    logger.info(
        "finished iterating over Wikipedia corpus of %i documents with %i positions"
        " (total %i articles, %i positions before pruning articles shorter than %i words)",
        articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
    self.length = articles  # cache corpus length
def get_texts(self):
    """
    Iterate over the reviews in `self.fname`, yielding each one as a list of tokens.

    Reviews are read lazily from `_extract_reviews` and tokenised by
    `process_review` in a worker pool. No review is pruned. After the input
    has been consumed completely, the number of reviews is cached in
    `self.length`.
    """
    reviews = 0
    positions = 0
    # keep this lazy: building a list here would hold the whole corpus in RAM
    # and defeat the chunked processing below
    texts = _extract_reviews(self.fname)
    pool = multiprocessing.Pool(self.processes)
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # would otherwise load the entire input into RAM at once
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens in pool.imap(process_review, group):
                reviews += 1
                positions += len(tokens)
                yield tokens
    finally:
        # release the worker processes even if the consumer stops iterating
        # early (generator close) or a worker raises
        pool.terminate()

    # the previous message also reported pruning figures (with a hard-coded
    # 10000-word limit) although nothing is pruned here
    logger.info(
        "finished iterating over the generated Yelp corpus of %i documents with %i positions",
        reviews, positions)
    self.length = reviews  # cache corpus length
def addDocuments(self, corpus, chunks=None, decay=None):
    """
    Update singular value decomposition to take into account a new corpus
    of documents.

    Training proceeds in chunks of `chunks` documents at a time. The size of
    `chunks` is a tradeoff between increased speed (bigger `chunks`) vs. lower
    memory footprint (smaller `chunks`). If the distributed mode is on, each
    chunk is sent to a different worker/computer.

    Setting `decay` < 1.0 causes re-orientation towards new data trends in the
    input document stream, by giving less emphasis to old observations. This
    allows LSA to gradually "forget" old observations (documents) and give more
    preference to new ones.

    Args:
        corpus: either a scipy sparse matrix (merged in as a single job), or
            a streamed corpus that is processed chunk by chunk.
        chunks: number of documents per chunk; defaults to `self.chunks`.
        decay: passed to `self.projection.merge`; defaults to `self.decay`.

    The result is merged into (or, in distributed mode, replaces)
    `self.projection`; nothing is returned.
    """
    logger.info("updating SVD with new documents")

    # get computation parameters; if not specified, use the ones from constructor
    if chunks is None:
        chunks = self.chunks
    if decay is None:
        decay = self.decay

    if not scipy.sparse.issparse(corpus):
        if not self.onepass:
            # we are allowed multiple passes over the input => use a faster, randomized two-pass algo
            update = Projection(self.numTerms, self.numTopics, None)
            update.u, update.s = stochasticSvd(
                corpus, self.numTopics, num_terms=self.numTerms, chunks=chunks,
                extra_dims=self.extra_samples, power_iters=self.power_iters)
            self.projection.merge(update, decay=decay)
        else:
            # the one-pass algo
            doc_no = 0  # running count of documents seen so far, used for progress logging
            # the corpus will be processed in chunks of `chunks` of documents.
            # keep preparing new chunks in a separate thread, so that we don't
            # waste time waiting for chunks to be read from disk. instead, fill
            # a (relatively short) chunk queue asynchronously in utils.chunkize,
            # and pop already-ready chunks from it as needed.
            # NOTE(review): the third argument is presumably the chunk queue
            # size -- confirm against utils.chunkize.
            for chunk_no, chunk in enumerate(
                    utils.chunkize(corpus, chunks, self.numworkers)):
                # construct the job as a sparse matrix, to minimize memory overhead
                # definitely avoid materializing it as a dense matrix!
                job = matutils.corpus2csc(chunk, num_terms=self.numTerms)
                del chunk  # drop the reference early; the sparse job replaces it
                doc_no += job.shape[1]  # one column per document
                if self.dispatcher:
                    # distributed version: add this job to the job queue, so workers can work on it
                    logger.debug("creating job #%i" % chunk_no)
                    self.dispatcher.putjob(
                        job
                    )  # put job into queue; this will eventually block, because the queue has a small finite size
                    del job
                    logger.info("dispatched documents up to #%s" % doc_no)
                else:
                    # serial version, there is only one "worker" (myself) => process the job directly
                    update = Projection(self.numTerms, self.numTopics, job)
                    del job
                    self.projection.merge(update, decay=decay)
                    del update
                    logger.info("processed documents up to #%s" % doc_no)
                    self.printTopics(
                        5
                    )  # TODO see if printDebug works and remove one of these..

            # wait for all workers to finish (distributed version only)
            if self.dispatcher:
                logger.info(
                    "reached the end of input; now waiting for all remaining jobs to finish"
                )
                # the dispatcher's state replaces the local projection
                self.projection = self.dispatcher.getstate()
            # logging.info("top topics after adding %i documents" % doc_no)
            # self.printDebug(10)
    else:
        # sparse-matrix input: the whole matrix is merged as one job
        assert not self.dispatcher, "must be in serial mode to receive jobs"
        assert self.onepass, "distributed two-pass algo not supported yet"
        update = Projection(self.numTerms, self.numTopics, corpus.tocsc())
        self.projection.merge(update, decay=decay)
        logger.info("processed sparse job of %i documents" % (corpus.shape[1]))
def update(self, corpus, chunks=None, decay=None, passes=None, update_every=None):
    """
    Train the model with new documents, by EM-iterating over `corpus` for
    `passes` passes. In distributed mode, the E step is distributed over a
    cluster of machines.

    This update also supports updating an already trained model (`self`)
    with new documents from `corpus`; the two models are then merged in
    proportion to the number of old vs. new documents. This feature is still
    experimental for non-stationary input streams.

    For stationary input (no topic drift in new documents), on the other hand,
    this equals the online update of Hoffman et al. and is guaranteed to
    converge for any `decay` in (0.5, 1.0>.

    Any of `chunks`, `decay`, `passes` and `update_every` left at None falls
    back to the value stored on the model. A falsy `passes` means
    `self.MAXITER` passes. An `update_every` of zero (or less) selects batch
    training, i.e. a single M step per pass.

    `corpus` is iterated once per pass (plus once up front to count its
    documents when it has no usable `len()`), so it has to be re-iterable
    rather than a one-shot generator.
    """
    if chunks is None:
        chunks = self.chunks
    if decay is None:
        decay = self.decay
    if passes is None:
        passes = self.passes
    if update_every is None:
        update_every = self.update_every

    if not passes:
        # the number of whole-corpus iterations was not specified explicitly:
        # cap it at self.MAXITER iterations
        passes = self.MAXITER

    def rho():
        # rho is the "speed" of updating; it reads self.num_updates afresh on
        # every call, so it must stay a callable. TODO try other fncs
        return pow(1.0 + self.num_updates, -decay)

    try:
        lencorpus = len(corpus)
    except Exception:
        # a bare `except:` here would also swallow KeyboardInterrupt/SystemExit
        logger.warning("input corpus stream has no len(); counting documents")
        lencorpus = sum(1 for _ in corpus)
    if lencorpus == 0:
        logger.warning("LdaModel.update() called with an empty corpus")
        return
    self.state.numdocs += lencorpus

    if update_every > 0:
        updatetype = "online"
        updateafter = min(lencorpus, update_every * self.numworkers * chunks)
    else:
        updatetype = "batch"
        updateafter = lencorpus

    updates_per_pass = max(1, lencorpus / updateafter)
    logger.info("running %s LDA training, %s topics, %i passes over "
                "the supplied corpus of %i documents, updating model once "
                "every %i documents" %
                (updatetype, self.numTopics, passes, lencorpus, updateafter))
    if updates_per_pass * passes < 10:
        logger.warning("too few updates, training might not converge; consider "
                       "increasing the number of passes to improve accuracy")

    for iteration in xrange(passes):
        if self.dispatcher:
            logger.info('initializing workers')
            self.dispatcher.reset(self.state)
        else:
            other = LdaState(self.state.sstats)
        dirty = False  # True while E-step results are waiting for an M step

        # the corpus will be processed in chunks of `chunks` of documents.
        # keep preparing new chunks in a separate thread, so that we don't
        # waste time waiting for chunks to be read from disk. instead, fill
        # a (relatively short) chunk queue asynchronously in utils.chunkize,
        # and pop already-ready chunks from it as needed.
        for chunk_no, chunk in enumerate(utils.chunkize(corpus, chunks, self.numworkers)):
            if self.dispatcher:
                # add the chunk to dispatcher's job queue, so workers can munch on it
                logger.info('PROGRESS: iteration %i, dispatching documents up to #%i/%i' %
                            (iteration, chunk_no * chunks + len(chunk), lencorpus))
                # this will eventually block until some jobs finish, because
                # the queue has a small finite length
                self.dispatcher.putjob(chunk)
            else:
                logger.info('PROGRESS: iteration %i, at document #%i/%i' %
                            (iteration, chunk_no * chunks + len(chunk), lencorpus))
                self.doEstep(chunk, other)
            dirty = True

            # in online mode, run an M step once every
            # `update_every * numworkers` chunks
            if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    other = self.dispatcher.getstate()
                self.doMstep(rho(), other)
                del other  # free up some mem

                # start collecting statistics for the next M step from scratch
                if self.dispatcher:
                    logger.info('initializing workers')
                    self.dispatcher.reset(self.state)
                else:
                    other = LdaState(self.state.sstats)
                dirty = False
        # endfor corpus iteration

        if dirty:
            # finish any remaining updates
            if self.dispatcher:
                # distributed mode: wait for all workers to finish
                logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                other = self.dispatcher.getstate()
            self.doMstep(rho(), other)
            dirty = False
def get_texts(self):
    """Iterate over the dump, yielding a list of tokens for each article that passed
    the length and namespace filtering.

    Uses multiprocessing internally to parallelize the work and process the dump more quickly.

    Notes
    -----
    This iterates over the **texts**. If you want vectors, just use the standard corpus interface
    instead of this method.

    The corpus length is cached in `self.length` only when the iteration runs to completion.
    A `KeyboardInterrupt` is logged and ends the iteration early instead of propagating.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import datapath
        >>> from gensim.corpora import WikiCorpus
        >>>
        >>> path_to_wiki_dump = datapath("enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2")
        >>>
        >>> for vec in WikiCorpus(path_to_wiki_dump):
        ...     pass

    Yields
    ------
    list of str
        If `metadata` is False, yield only list of token extracted from the article.
    (list of str, (int, str))
        List of tokens (extracted from the article), page id and article title otherwise.

    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0

    tokenization_params = (self.tokenizer_func, self.token_min_len, self.token_max_len, self.lower)
    texts = (
        (text, self.lemmatize, title, pageid, tokenization_params)
        for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)
    )
    pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

    # build the "Namespace:" prefixes once instead of once per article;
    # str.startswith accepts a tuple of alternatives
    ignored_prefixes = tuple(ignore + ':' for ignore in IGNORED_NAMESPACES)

    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(_process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < self.article_min_tokens or title.startswith(ignored_prefixes):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
    except KeyboardInterrupt:
        # report the threshold that was actually applied (self.article_min_tokens)
        logger.warning(
            "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, self.article_min_tokens)
    else:
        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, self.article_min_tokens)
        self.length = articles  # cache corpus length
    finally:
        # always release the worker processes, even when iteration is abandoned
        pool.terminate()
def update(self, corpus, chunks=None, decay=None, passes=None, update_every=None):
    """
    Train the model with new documents, by EM-iterating over `corpus` for
    `passes` passes. In distributed mode, the E step is distributed over a
    cluster of machines.

    This update also supports updating an already trained model (`self`)
    with new documents from `corpus`; the two models are then merged in
    proportion to the number of old vs. new documents. This feature is still
    experimental for non-stationary input streams.

    For stationary input (no topic drift in new documents), on the other hand,
    this equals the online update of Hoffman et al. and is guaranteed to
    converge for any `decay` in (0.5, 1.0>.

    Any of `chunks`, `decay`, `passes` and `update_every` left at None falls
    back to the value stored on the model. A falsy `passes` means
    `self.MAXITER` passes. An `update_every` of zero (or less) selects batch
    training, i.e. a single M step per pass.

    `corpus` is iterated once per pass (plus once up front to count its
    documents when it has no usable `len()`), so it has to be re-iterable
    rather than a one-shot generator.
    """
    if chunks is None:
        chunks = self.chunks
    if decay is None:
        decay = self.decay
    if passes is None:
        passes = self.passes
    if update_every is None:
        update_every = self.update_every

    if not passes:
        # the number of whole-corpus iterations was not specified explicitly:
        # cap it at self.MAXITER iterations
        passes = self.MAXITER

    def rho():
        # rho is the "speed" of updating; it reads self.num_updates afresh on
        # every call, so it must stay a callable. TODO try other fncs
        return pow(1.0 + self.num_updates, -decay)

    try:
        lencorpus = len(corpus)
    except Exception:
        # a bare `except:` here would also swallow KeyboardInterrupt/SystemExit
        logger.warning("input corpus stream has no len(); counting documents")
        lencorpus = sum(1 for _ in corpus)
    if lencorpus == 0:
        logger.warning("LdaModel.update() called with an empty corpus")
        return
    self.state.numdocs += lencorpus

    if update_every > 0:
        updatetype = "online"
        updateafter = min(lencorpus, update_every * self.numworkers * chunks)
    else:
        updatetype = "batch"
        updateafter = lencorpus

    updates_per_pass = max(1, lencorpus / updateafter)
    logger.info("running %s LDA training, %s topics, %i passes over "
                "the supplied corpus of %i documents, updating model once "
                "every %i documents" %
                (updatetype, self.numTopics, passes, lencorpus, updateafter))
    if updates_per_pass * passes < 10:
        logger.warning("too few updates, training might not converge; consider "
                       "increasing the number of passes to improve accuracy")

    for iteration in xrange(passes):
        if self.dispatcher:
            logger.info('initializing %s workers' % self.numworkers)
            self.dispatcher.reset(self.state)
        else:
            other = LdaState(self.state.sstats)
        dirty = False  # True while E-step results are waiting for an M step

        # the corpus will be processed in chunks of `chunks` of documents.
        # keep preparing new chunks in a separate thread, so that we don't
        # waste time waiting for chunks to be read from disk. instead, fill
        # a (relatively short) chunk queue asynchronously in utils.chunkize,
        # and pop already-ready chunks from it as needed.
        for chunk_no, chunk in enumerate(utils.chunkize(corpus, chunks, self.numworkers)):
            if self.dispatcher:
                # add the chunk to dispatcher's job queue, so workers can munch on it
                logger.info('PROGRESS: iteration %i, dispatching documents up to #%i/%i' %
                            (iteration, chunk_no * chunks + len(chunk), lencorpus))
                # this will eventually block until some jobs finish, because
                # the queue has a small finite length
                self.dispatcher.putjob(chunk)
            else:
                logger.info('PROGRESS: iteration %i, at document #%i/%i' %
                            (iteration, chunk_no * chunks + len(chunk), lencorpus))
                self.doEstep(chunk, other)
            dirty = True

            # in online mode, run an M step once every
            # `update_every * numworkers` chunks
            if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    other = self.dispatcher.getstate()
                self.doMstep(rho(), other)
                del other  # free up some mem

                # start collecting statistics for the next M step from scratch
                if self.dispatcher:
                    logger.info('initializing workers')
                    self.dispatcher.reset(self.state)
                else:
                    other = LdaState(self.state.sstats)
                dirty = False
        # endfor corpus iteration

        if dirty:
            # finish any remaining updates
            if self.dispatcher:
                # distributed mode: wait for all workers to finish
                logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                other = self.dispatcher.getstate()
            self.doMstep(rho(), other)
            dirty = False
def preprocess_data(
    train_infile,
    test_infile,
    output_dir,
    train_prefix,
    test_prefix,
    min_doc_count=0,
    max_doc_freq=1.0,
    ngram_range=(1, 1),
    vocab_size=None,
    stopwords=None,
    keep_num=False,
    keep_alphanum=False,
    strip_html=False,
    lower=True,
    min_word_length=3,
    max_doc_length=5000,
    label_fields=None,
    workers=4,
    proc_multiplier=500,
):
    """Tokenize the train (and optional test) documents, select a vocabulary and write the outputs.

    Parameters
    ----------
    train_infile, test_infile
        Inputs handed to `fh.LazyJsonlistReader`; `test_infile` may be None for no test set.
    output_dir
        Directory receiving `<train_prefix>.vocab.json`, `sage_labeled.mat`,
        `sage_unlabeled.mat` and whatever `process_subset` writes.
    train_prefix, test_prefix
        Prefixes passed to `process_subset` for the respective split.
    min_doc_count
        Minimum number of documents a word must occur in to enter the vocabulary.
    max_doc_freq
        Maximum fraction of all (train + test) documents a word may occur in.
    vocab_size
        If given, keep only that many words, taken in order of decreasing document count.
    stopwords
        None for no stopwords, otherwise a name `X` selecting `stopwords/X_stopwords.txt`.
    ngram_range, keep_num, keep_alphanum, strip_html, lower, min_word_length
        Tokenization options forwarded unchanged to `_process_item`.
    max_doc_length
        Documents are truncated to this many tokens.
    label_fields
        None, a single field name, or several field names separated by commas.
    workers
        Number of processes in the tokenization pool.
    proc_multiplier
        Documents are sent to the pool in chunks of `proc_multiplier * workers`.

    Raises
    ------
    ValueError
        If tokenization produced no words at all, so that no vocabulary can be built.
    """
    if stopwords == "mallet":
        print("Using Mallet stopwords")
        stopword_list = fh.read_text(os.path.join("stopwords", "mallet_stopwords.txt"))
    elif stopwords == "snowball":
        print("Using snowball stopwords")
        stopword_list = fh.read_text(os.path.join("stopwords", "snowball_stopwords.txt"))
    elif stopwords is not None:
        print("Using custom stopwords")
        stopword_list = fh.read_text(os.path.join("stopwords", stopwords + "_stopwords.txt"))
    else:
        stopword_list = []
    stopword_set = {s.strip() for s in stopword_list}

    print("Reading data files")
    train_items = fh.LazyJsonlistReader(train_infile)
    n_train = len(train_items)
    print("Found {:d} training documents".format(n_train))

    if test_infile is not None:
        test_items = fh.LazyJsonlistReader(test_infile)
        n_test = len(test_items)
        print("Found {:d} test documents".format(n_test))
    else:
        test_items = []
        n_test = 0

    n_items = n_train + n_test

    # always bound: process_subset() receives it even when there are no label fields
    label_lists = {}
    if label_fields:
        label_fields = label_fields.split(",") if "," in label_fields else [label_fields]
    elif label_fields is None:
        label_fields = []

    # make vocabulary
    train_ids, train_parsed, train_labels = [], [], []
    test_ids, test_parsed, test_labels = [], [], []

    print("Parsing documents")
    word_counts = Counter()
    doc_counts = Counter()

    vocab = None

    # process in blocks
    pool = multiprocessing.Pool(workers)
    chunksize = proc_multiplier * workers
    kwargs = {
        "strip_html": strip_html,
        "lower": lower,
        "keep_numbers": keep_num,
        "keep_alphanum": keep_alphanum,
        "min_length": min_word_length,
        "stopwords": stopword_set,
        "ngram_range": ngram_range,
        "vocab": vocab,
        "label_fields": label_fields,
    }
    process_item = partial(_process_item, **kwargs)

    def parse_split(items, split_name, ids_out, parsed_out, labels_out):
        # Tokenize one split in the pool, appending to the *_out lists and updating the counters.
        n_chunks = len(items) // chunksize
        for i, group in enumerate(chunkize(iter(items), chunksize=chunksize)):
            print(f"On {split_name} chunk {i} of {n_chunks}", end="\r")
            for ids, tokens, labels in pool.imap(process_item, group):
                # store the parsed documents
                if ids is not None:
                    ids_out.append(ids)
                if labels is not None:
                    labels_out.append(labels)
                tokens = tokens[:max_doc_length]

                # keep track of the number of documents with each word
                word_counts.update(tokens)
                doc_counts.update(set(tokens))
                parsed_out.append(" ".join(tokens))  # more efficient storage

    # these two passes do the majority of the preprocessing. unfortunately, without
    # a major refactor, they cannot be turned into generators and the results of
    # tokenization must be appended to a list, which implies a large memory footprint
    try:
        parse_split(train_items, "training", train_ids, train_parsed, train_labels)
        print("Train set processing complete")
        parse_split(test_items, "testing", test_ids, test_parsed, test_labels)
        print("Test set processing complete")
    finally:
        # release the worker processes even when tokenization fails
        pool.terminate()

    print("Size of full vocabulary=%d" % len(word_counts))

    # store possible label values
    if label_fields:
        labels_df = pd.DataFrame.from_records(train_labels + test_labels)
        for label_name in label_fields:
            label_list = sorted(labels_df[label_name].unique().tolist())
            n_labels = len(label_list)
            print("Found label %s with %d classes" % (label_name, n_labels))
            label_lists[label_name] = label_list

    print("Selecting the vocabulary")
    if not doc_counts:
        # zip(*[]) below would otherwise fail with an opaque unpacking error
        raise ValueError("no words left after tokenization; cannot select a vocabulary")
    words, doc_count_values = zip(*doc_counts.most_common())
    doc_freqs = np.array(doc_count_values) / float(n_items)
    vocab = [
        word
        for word, count, doc_freq in zip(words, doc_count_values, doc_freqs)
        if count >= min_doc_count and doc_freq <= max_doc_freq
    ]
    too_frequent = [word for word, doc_freq in zip(words, doc_freqs) if doc_freq > max_doc_freq]
    if max_doc_freq < 1.0:
        print(
            "Excluding words with frequency > {:0.2f}:".format(max_doc_freq),
            too_frequent,
        )

    print("Vocab size after filtering = %d" % len(vocab))
    if vocab_size is not None:
        # vocab is still ordered by decreasing document count here
        vocab = vocab[:int(vocab_size)]

    vocab_size = len(vocab)
    print("Final vocab size = %d" % vocab_size)

    print("Most common words remaining:", " ".join(vocab[:10]))
    vocab.sort()

    fh.write_to_json(vocab, os.path.join(output_dir, train_prefix + ".vocab.json"))

    # `np.int` was only an alias of the builtin `int` and has been removed from NumPy
    count_dtype = np.uint16 if max_doc_length < np.iinfo(np.uint16).max else int

    train_X_sage, tr_aspect, tr_no_aspect, tr_widx, vocab_for_sage = process_subset(
        train_items,
        train_ids,
        train_parsed,
        train_labels,
        label_fields,
        label_lists,
        vocab,
        output_dir,
        train_prefix,
        count_dtype=count_dtype,
    )
    if n_test > 0:
        test_X_sage, te_aspect, te_no_aspect, _, _ = process_subset(
            test_items,
            test_ids,
            test_parsed,
            test_labels,
            label_fields,
            label_lists,
            vocab,
            output_dir,
            test_prefix,
            count_dtype=count_dtype,
        )

    train_sum = np.array(train_X_sage.sum(axis=0))
    print("%d words missing from training data" % np.sum(train_sum == 0))

    if n_test > 0:
        test_sum = np.array(test_X_sage.sum(axis=0))
        print("%d words missing from test data" % np.sum(test_sum == 0))

    sage_output = {
        "tr_data": train_X_sage,
        "tr_aspect": tr_aspect,
        "widx": tr_widx,
        "vocab": vocab_for_sage,
    }
    if n_test > 0:
        sage_output["te_data"] = test_X_sage
        sage_output["te_aspect"] = te_aspect
    savemat(os.path.join(output_dir, "sage_labeled.mat"), sage_output)

    # second file: same data, with the "no aspect" variants swapped in
    sage_output["tr_aspect"] = tr_no_aspect
    if n_test > 0:
        sage_output["te_aspect"] = te_no_aspect
    savemat(os.path.join(output_dir, "sage_unlabeled.mat"), sage_output)

    print("Done!")
def get_texts(self):
    """
    Iterate over the corpus data, yielding sentences of the text version of
    each article (each sentence represented as a list of tokens).

    With `self.metadata` set, each item is `(sentence, (pageid, title))`
    instead of the bare sentence. When `self.sentences_out` is not None, every
    sentence is also written to it, space-joined, one per line.

    Only after a complete iteration is the number of retained sentences
    cached in `self.length` and `self.sentences_out` closed and reset to None.

    See the `WikiCorpus` class for more.
    """
    # Unfortunately due to the OOP-unfriendly implementation of
    # `WikiCorpus` we have to copy-and-paste some code. This code is
    # based on `WikiCorpus#get_texts`.

    n_articles, n_articles_all = 0, 0
    n_sentences, n_sentences_all = 0, 0

    pages = _extract_pages(self.open_corpus_file(), self.filter_namespaces)
    texts = ((text, self.lemmatize, title, pageid)
             for title, text, pageid in pages)
    pool = multiprocessing.Pool(self.processes)
    try:
        # process the corpus in smaller chunks of docs, because
        # multiprocessing.Pool is dumb and would load the entire input
        # into RAM at once...
        chunks = utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1)
        for group in chunks:
            for sentences, title, pageid in pool.imap(process_article, group):
                n_articles_all += 1
                n_sentences_all += len(sentences)

                num_tokens = sum(len(sentence) for sentence in sentences)
                # article redirects and short stubs are pruned here
                if num_tokens <= ARTICLE_MIN_WORDS:
                    continue

                n_articles += 1
                n_sentences += len(sentences)
                for sentence in sentences:
                    if self.metadata:
                        yield (sentence, (pageid, title))
                    else:
                        yield sentence

                    if self.sentences_out is not None:
                        self.sentences_out.write(' '.join(sentence))
                        self.sentences_out.write('\n')
    finally:
        # release the worker processes even when the consumer abandons the
        # generator early or an error is raised mid-iteration
        pool.terminate()

    LOGGER.info("finished iterating over Wikipedia corpus of %i "
                "articles with %i sentences (%i articles / %i "
                "sentences retained)",
                n_articles_all, n_sentences_all, n_articles, n_sentences)

    # cache corpus length
    self.length = n_sentences

    # Close sentences file if we were writing one
    if self.sentences_out is not None:
        self.sentences_out.close()
        self.sentences_out = None
def get_texts(self):
    """Iterate over the dump, yielding a list of tokens for each article that passed
    the length and namespace filtering.

    Uses multiprocessing internally to parallelize the work and process the dump more quickly.

    Notes
    -----
    This iterates over the **texts**. If you want vectors, just use the standard corpus interface
    instead of this method.

    The corpus length is cached in `self.length` only when the iteration runs to completion.
    A `KeyboardInterrupt` is logged and ends the iteration early instead of propagating.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import datapath
        >>> from gensim.corpora import WikiCorpus
        >>>
        >>> path_to_wiki_dump = datapath("enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2")
        >>>
        >>> for vec in WikiCorpus(path_to_wiki_dump):
        ...     pass

    Yields
    ------
    list of str
        If `metadata` is False, yield only list of token extracted from the article.
    (list of str, (int, str))
        List of tokens (extracted from the article), page id and article title otherwise.

    Raises
    ------
    PicklingError
        Re-raised with a more helpful message when a `PicklingError` occurs during iteration.

    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0

    tokenization_params = (self.tokenizer_func, self.token_min_len, self.token_max_len, self.lower)
    texts = (
        (text, self.lemmatize, title, pageid, tokenization_params)
        for title, text, pageid
        in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles)
    )
    pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

    # build the "Namespace:" prefixes once instead of once per article;
    # str.startswith accepts a tuple of alternatives
    ignored_prefixes = tuple(ignore + ':' for ignore in IGNORED_NAMESPACES)

    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(_process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < self.article_min_tokens or title.startswith(ignored_prefixes):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
    except KeyboardInterrupt:
        # report the threshold that was actually applied (self.article_min_tokens)
        logger.warning(
            "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, self.article_min_tokens
        )
    except PicklingError as exc:
        raise_from(PicklingError('Can not send filtering function {} to multiprocessing, '
                                 'make sure the function can be pickled.'.format(self.filter_articles)), exc)
    else:
        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, self.article_min_tokens
        )
        self.length = articles  # cache corpus length
    finally:
        # always release the worker processes, even when iteration is abandoned
        pool.terminate()
# NOTE(review): this first statement is presumably the tail of a file-reading
# loop that starts above this excerpt -- confirm its indentation against it.
contents.append(regex.sub('', text))
os.chdir(wd)

# import stopword list
filename = 'stopwords_eng.txt'
with io.open(filename, 'r', encoding='utf8') as f:
    text = f.read()
stoplist = set(text.split())

# tokenize and case fold
contents_tok = [[w for w in doc.lower().split() if w not in stoplist] for doc in contents]

# chunk documents in n chunks
n = 100
from gensim.utils import chunkize
contents_chunk = []
for doc in contents_tok:
    # floor division keeps the chunk size an int; `/` gives a float on Python 3
    # NOTE(review): clen is 0 for documents with fewer than n tokens -- confirm
    # what chunkize does with a zero chunk size
    clen = len(doc) // n
    for c in chunkize(doc, clen):
        contents_chunk.append(c)

# extract raw frequencies
from gensim import corpora, models
from collections import defaultdict
import numpy as np

# compute word freq
frequency = defaultdict(int)
for chunk in contents_chunk:
    for token in chunk:
        frequency[token] += 1
freq = list(frequency.values())

# prune bottom (mn) and top (mx): keep tokens seen more than mn times and at
# most as often as the 98th percentile of all token frequencies
mn = 1
mx = np.percentile(freq, 98)
contents_chunk = [[token for token in chunk if mn < frequency[token] <= mx]
                  for chunk in contents_chunk]
def addDocuments(self, corpus, chunks=None, decay=None):
    """
    Bring the singular value decomposition up to date with a new corpus of
    documents.

    The corpus is worked through `chunks` documents at a time; the value of
    `chunks` balances speed (bigger is faster) against memory footprint
    (smaller is leaner). With the distributed mode on, each chunk goes to a
    different worker/computer.

    Use a `decay` below 1.0 to give less emphasis to old observations: LSA
    then gradually "forgets" old documents and prefers new ones, re-orienting
    itself towards new data trends in the input document stream.

    A scipy sparse matrix passed as `corpus` is treated as one ready-made job
    and is only accepted in the serial, one-pass mode.
    """
    logger.info("updating SVD with new documents")

    # computation parameters default to the ones given to the constructor
    chunks = self.chunks if chunks is None else chunks
    decay = self.decay if decay is None else decay

    if scipy.sparse.issparse(corpus):
        # a single, pre-built sparse job
        assert not self.dispatcher, "must be in serial mode to receive jobs"
        assert self.onepass, "distributed two-pass algo not supported yet"
        increment = Projection(self.numTerms, self.numTopics, corpus.tocsc())
        self.projection.merge(increment, decay=decay)
        logger.info("processed sparse job of %i documents" % (corpus.shape[1]))
        return

    if not self.onepass:
        # several passes over the input are fine => use the faster, randomized two-pass algo
        increment = Projection(self.numTerms, self.numTopics, None)
        increment.u, increment.s = stochasticSvd(
            corpus, self.numTopics,
            num_terms=self.numTerms, chunks=chunks,
            extra_dims=self.extra_samples, power_iters=self.power_iters)
        self.projection.merge(increment, decay=decay)
        return

    # the one-pass algo
    total_docs = 0
    # chunks of `chunks` documents are prepared asynchronously by
    # utils.chunkize, which fills a (relatively short) queue so that this
    # loop does not sit idle waiting for chunks to be read from disk.
    for chunk_no, chunk in enumerate(utils.chunkize(corpus, chunks, self.numworkers)):
        # keep the job sparse to minimize memory overhead;
        # definitely avoid materializing it as a dense matrix!
        job = matutils.corpus2csc(chunk, num_terms=self.numTerms)
        del chunk
        total_docs += job.shape[1]

        if not self.dispatcher:
            # serial version: there is only one "worker" (this process) => run the job directly
            increment = Projection(self.numTerms, self.numTopics, job)
            del job
            self.projection.merge(increment, decay=decay)
            del increment
            logger.info("processed documents up to #%s" % total_docs)
            self.printTopics(5)  # TODO see if printDebug works and remove one of these..
        else:
            # distributed version: put the job on the queue for the workers
            logger.debug("creating job #%i" % chunk_no)
            # this will eventually block, because the queue has a small finite size
            self.dispatcher.putjob(job)
            del job
            logger.info("dispatched documents up to #%s" % total_docs)

    # wait for all workers to finish (distributed version only)
    if self.dispatcher:
        logger.info("reached the end of input; now waiting for all remaining jobs to finish")
        self.projection = self.dispatcher.getstate()