def iter_over_dump_file(dump_file, min_length_of_article=50, ignore_namespaces=None):
    """Yield ``(title, tokens)`` for each sufficiently long article in a wiki dump.

    Articles with fewer than *min_length_of_article* tokens are skipped, as are
    meta pages whose title starts with one of the ignored namespaces.

    :param dump_file: the dump file
    :param min_length_of_article: the min number of words in the next article. Default = 50
    :param ignore_namespaces: list of namespaces which should be ignored; defaults to
        'Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki',
        'User', 'Help', 'Book', 'Draft'
    :return: generator of ``(title, tokens)`` pairs
    """
    if ignore_namespaces is None:
        ignore_namespaces = [
            'Wikipedia', 'Category', 'File', 'Portal', 'Template',
            'MediaWiki', 'User', 'Help', 'Book', 'Draft',
        ]
    # str.startswith accepts a tuple, so build the prefixes once up front.
    meta_prefixes = tuple(ns + ':' for ns in ignore_namespaces)
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        tokens = tokenize(filter_wiki(text))
        # Skip short articles and namespace-prefixed meta pages.
        if len(tokens) < min_length_of_article or title.startswith(meta_prefixes):
            continue
        yield title, tokens
def iter_wiki(dump_file):  # making a wiki token stream
    """Yield the token list of each article from the Wikipedia dump.

    NOTE: unlike the docstring of sibling variants, this generator yields
    only the tokens, not ``(title, tokens)`` pairs — the original docstring
    claimed a 2-tuple, which was wrong.  Articles shorter than 50 tokens
    and meta pages (namespace-prefixed titles) are skipped.

    :param dump_file: path to the Wikipedia dump file
    :return: generator of token lists, one per accepted article
    """
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield tokens
def iter_wiki(self):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
    skip_prefixes = tuple(
        ns + ':'
        for ns in ('Wikipedia', 'Category', 'File', 'Portal', 'Template',
                   'MediaWiki', 'User', 'Help', 'Book', 'Draft')
    )
    for title, text, pageid in _extract_pages(smart_open(self.dump_file)):
        cleaned = filter_wiki(text)
        # Tokenize and drop stopwords in one pass.
        tokens = [tok for tok in simple_preprocess(cleaned) if tok not in STOPWORDS]
        # Ignore short articles and various meta-articles.
        if len(tokens) < 50 or title.startswith(skip_prefixes):
            continue
        yield title, tokens
def iter_wiki(dump_file):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
    meta_prefixes = tuple(
        ns + ':'
        for ns in 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    )
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        tokens = tokenize(filter_wiki(text))
        # NOTE(review): unlike the sibling variants there is no minimum-length
        # filter here — only meta-articles (namespace-prefixed titles) are
        # skipped.  The original comment claiming short articles are ignored
        # was inaccurate.
        if title.startswith(meta_prefixes):
            continue
        yield title, tokens
def iter_wiki(dump_file):
    """Yield `(title, tokens)` for each non-meta article of at least 50 tokens."""
    namespaces = ('Wikipedia', 'Category', 'File', 'Portal', 'Template',
                  'MediaWiki', 'User', 'Help', 'Book', 'Draft')
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        tokens = tokenize(filter_wiki(text))
        # Skip short articles and pages living in a meta namespace.
        too_short = len(tokens) < 50
        is_meta = any(title.startswith(ns + ':') for ns in namespaces)
        if too_short or is_meta:
            continue
        yield title, tokens
def iter_wiki(dump_file, n=-1):
    """Yield `(title, tokens)` 2-tuples from a Wikipedia dump.

    :param dump_file: path to the dump file
    :param n: read at most this many pages from the dump (counting pages
        that are later filtered out); ``-1`` means read all pages.
    :return: generator of ``(title, tokens)`` pairs
    """
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    counter = 0
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        # BUGFIX: the original incremented `counter` before this check, so it
        # broke out just before processing the n-th page (n=1 yielded nothing).
        # Checking first, then incrementing, processes exactly n pages.
        if counter == n:
            break
        counter += 1
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield title, tokens
def parse_wiki_dump(infile, min_words, process_function,
                    processes=max(1, multiprocessing.cpu_count() - 2)):
    """
    Yield articles from a bz2 Wikipedia dump `infile` as (title, tokens) 2-tuples.

    Only articles of sufficient length are returned (short articles & redirects
    etc are ignored). Uses multiple processes to speed up the parsing in parallel.

    Args:
        infile (str) : path to bz2 Wikipedia dump
        min_words (int) : skip article if it has less than this many words
        process_function (function) : preprocessing function
        processes (int) : number of cores to be used (clamped to at least 1;
            the original `cpu_count() - 2` default crashed Pool() on 1-2 core
            machines with a non-positive worker count)
    """
    logger.info("Start processing Wikipedia dump `{}`".format(infile))
    articles, articles_all = 0, 0
    pool = multiprocessing.Pool(processes)
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would try to load the entire dump into RAM...
        texts = wikicorpus._extract_pages(bz2.BZ2File(infile))  # generator
        for group in chunkize(texts, chunksize=10 * processes):
            for title, tokens in pool.imap(process_function, group):
                if articles_all % 10000 == 0:
                    logger.info(
                        "PROGRESS: at article #{} accepted {} articles".format(
                            articles_all, articles))
                articles_all += 1
                # article redirects and short stubs are pruned here
                if any(
                        title.startswith(ignore + ':')
                        for ignore in ignore_namespaces) or len(tokens) < min_words:
                    continue
                # all good: use this article
                articles += 1
                yield title, tokens
    finally:
        # BUGFIX: terminate the pool even when the consumer abandons the
        # generator early (GeneratorExit) or an exception escapes — the
        # original leaked worker processes in those cases.
        pool.terminate()
    logger.info(
        "finished iterating over Wikipedia corpus of {} documents with total {} articles"
        .format(articles, articles_all))
def iter_wiki(self, dump_file):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple.

    Stops after just over 160,000 accepted articles (the counter is checked
    with ``>`` before each page, so the limit is 160,001).
    """
    # BUGFIX: the docstring originally followed the logger call, making it a
    # no-op string literal instead of an actual docstring; moved to the top.
    logger.info("preprocessing dump {0}".format(dump_file))
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    index = 0
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        if index > 160_000:
            break
        text = filter_wiki(text)
        tokens = self.preprocess_doc(text)
        if index > 160_000 or len(tokens) < 50 or any(
                title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        index += 1
        yield title, tokens
def get_texts(self):
    """
    Iterate over the Wikipedia dump and the HN articles returning text
    """
    # Accepted-article and token-position counters, shared across both corpora.
    wiki_articles, hn_articles, articles_all = 0, 0, 0
    positions, positions_all = 0, 0
    # ************ Wikipedia ************
    # Pair each raw page text with the lemmatize flag, as expected by
    # wikicorpus.process_article; page titles are discarded here.
    texts = ((text, self.lemmatize) for _, text in wikicorpus._extract_pages(bz2.BZ2File(self.wiki_file)))
    pool = multiprocessing.Pool(self.processes)
    # Feed the pool in bounded chunks (maxsize=1 backpressure),
    # otherwise imap puts all the corpus into memory.
    for group in utils.chunkize(texts, chunksize=10 * pool._processes, maxsize=1):
        for tokens in pool.imap(wikicorpus.process_article, group):
            articles_all += 1
            positions_all += len(tokens)
            # Prune short articles; only sufficiently long ones are yielded.
            if len(tokens) > WIKI_ARTICLE_MIN_WORDS:
                wiki_articles += 1
                positions += len(tokens)
                yield tokens
    # NOTE(review): terminate() is only reached if the generator is fully
    # exhausted; abandoning it early leaks the pool.
    pool.terminate()
    print (">>> finished iterating over Wikipedia corpus of %i documents with %i positions (total %i articles, %i positions before pruning articles shorter than %i words)" % (wiki_articles, positions, articles_all, positions_all, WIKI_ARTICLE_MIN_WORDS))
    # ************ HN articles ************
    # Remember the wiki-only position count so the HN summary below can
    # report its own positions separately.
    positions_after_wiki = positions
    fnamelist = []
    for g in glob.iglob(self.hn_folder + '/*.txt'):
        fnamelist.append(g)
    for fileno, fname in enumerate(fnamelist):
        # TODO parallelize as Wiki
        hn_text = open(fname).read()
        if self.lemmatize:
            result = utils.lemmatize(hn_text)  # text into lemmas here
        else:
            result = tokenize(hn_text)  # text into tokens here
        articles_all += 1
        positions_all += len(result)
        # Same pruning idea as above, with the HN-specific length threshold.
        if len(result) > HN_ARTICLE_MIN_WORDS:
            hn_articles += 1
            positions += len(result)
            yield result
    print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions - positions_after_wiki))
    # ************ /HN articles ************
    self.length = wiki_articles + hn_articles  # cache corpus length
def get_texts(self):
    """
    Iterate over the dump, returning text version of each article as a list of tokens.

    Only articles of sufficient length are returned (short articles & redirects
    etc are ignored).

    Note that this iterates over the **texts**; if you want vectors, just use
    the standard corpus interface instead of this function::

    >>> for vec in wiki_corpus:
    >>>     print(vec)
    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0
    # Bundle the per-page arguments expected by process_article.
    texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in _extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
    #pool = multiprocessing.Pool(self.processes)
    # process the corpus in smaller chunks of docs, because multiprocessing.Pool
    # is dumb and would load the entire input into RAM at once...
    for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
        #for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10):
        # NOTE(review): `imap` here is presumably itertools.imap (Python 2) —
        # a sequential stand-in for the commented-out pool.imap; confirm the
        # file's imports.
        for tokens, title, pageid in imap(process_article, group):  # chunksize=10):
            articles_all += 1
            positions_all += len(tokens)
            # Check if the article is long enough and has tokens in our set of interestx
            if len(tokens) > ARTICLE_MIN_WORDS and set(tokens) & self.terms:
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
    #pool.terminate()
    logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
                " (total %i articles, %i positions before pruning articles shorter than %i words)" %
                (articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS))
    self.length = articles  # cache corpus length
def iter_wiki(dump_file):
    """Yield `(title, text)` for every page in the dump, with wiki markup stripped."""
    for title, raw_markup, pageid in _extract_pages(smart_open(dump_file)):
        yield title, filter_wiki(raw_markup)
def get_texts(self): """ Iterate over the corpus data, yielding sentences of the text version of each article (each sentence represented as a list of tokens). See the `WikiCorpus` class for more. """ # Unfortunately due to the OOP-unfriendly implementation of # `WikiCorpus` we have to copy-and-paste some code. This code is # based on `WikiCorpus#get_texts`. n_articles, n_articles_all = 0, 0 n_sentences, n_sentences_all = 0, 0 pages = _extract_pages(self.open_corpus_file(), self.filter_namespaces) texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in pages) pool = multiprocessing.Pool(self.processes) # process the corpus in smaller chunks of docs, because # multiprocessing.Pool is dumb and would load the entire input # into RAM at once... chunks = utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1) for group in chunks: for sentences, title, pageid in pool.imap(process_article, group): n_articles_all += 1 n_sentences_all += len(sentences) num_tokens = sum(len(sentence) for sentence in sentences) # article redirects and short stubs are pruned here if num_tokens > ARTICLE_MIN_WORDS: n_articles += 1 n_sentences += len(sentences) for sentence in sentences: if self.metadata: yield (sentence, (pageid, title)) else: yield sentence if self.sentences_out is not None: self.sentences_out.write(' '.join(sentence)) self.sentences_out.write('\n') pool.terminate() LOGGER.info("finished iterating over Wikipedia corpus of %i " "articles with %i sentences (%i articles / %i " "sentences retained)" % (n_articles_all, n_sentences_all, n_articles, n_sentences)) # cache corpus length self.length = n_sentences # Close sentences file if we were writing one if self.sentences_out is not None: self.sentences_out.close() self.sentences_out = None
model_fn = os.path.splitext( os.path.basename(fn))[0] + "-%d.w2v-gensim" % n else: model_fn = options.model_file if os.path.exists(model_fn): logging.error("File already exists, %s" % model_fn) exit(1) logging.info("Generating word vectors size %d from %s" % (n, fn)) sent_gen = None if options.file_type == 'bz-wiki': sent_gen = (process_article((text, None)) for title, text in _extract_pages(bz2.BZ2File(fn))) elif options.file_type == 'mahoney': sent_gen = Text8Corpus(fn) else: raise ValueError model = gensim.models.Word2Vec(sent_gen, workers=n_jobs, window=w, size=n, min_count=c) if options.accuracy: print accuracy(model, options.accuracy, DEFAULT_ACCURACY_CUTOFF) logging.info("Writing model to %s" % model_fn)
# Script section (Python 2 — note the `print` statement below): trains a
# gensim Word2Vec model from either a bz2 wiki dump or a Mahoney text8 file,
# then saves it. Relies on earlier-assigned names: options, fn, n, w, n_jobs.
c = options.min_count
# Derive the output filename from the input basename unless one was given.
if not options.model_file:
    model_fn = os.path.splitext(os.path.basename(fn))[0] + "-%d.w2v-gensim" % n
else:
    model_fn = options.model_file
# Refuse to clobber an existing model file.
if os.path.exists(model_fn):
    logging.error("File already exists, %s" % model_fn)
    exit(1)
logging.info("Generating word vectors size %d from %s" % (n, fn))
sent_gen = None
if options.file_type == 'bz-wiki':
    # NOTE(review): assumes _extract_pages yields (title, text) 2-tuples —
    # newer gensim versions yield (title, text, pageid); confirm the import.
    sent_gen = (process_article((text, None))
                for title, text in _extract_pages(bz2.BZ2File(fn)))
elif options.file_type == 'mahoney':
    sent_gen = Text8Corpus(fn)
else:
    raise ValueError
# `size` and the iterator-as-first-arg API match older gensim releases.
model = gensim.models.Word2Vec(sent_gen, workers=n_jobs, window=w, size=n,
                               min_count=c)
if options.accuracy:
    print accuracy(model, options.accuracy, DEFAULT_ACCURACY_CUTOFF)
logging.info("Writing model to %s" % model_fn)
model.save(model_fn)