class WikiCorpus(interfaces.CorpusABC):
    """
    Treat a Wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.

    The documents are extracted on-the-fly, so that the whole (massive) dump
    can stay compressed on disk.

    >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
    >>> wiki.saveAsText('wiki_en_vocab200k') # another 8h, creates a file in MatrixMarket format plus file with id->word

    """
    def __init__(self, fname, noBelow=20, keep_words=200000, dictionary=None):
        """
        Initialize the corpus. This scans the corpus once, to determine its
        vocabulary (only the first `keep_words` most frequent words that appear
        in at least `noBelow` documents are kept).
        """
        self.fname = fname
        if dictionary is None:
            self.dictionary = Dictionary(self.getArticles())
            self.dictionary.filterExtremes(noBelow=noBelow, noAbove=0.1, keepN=keep_words)
        else:
            self.dictionary = dictionary


    def __len__(self):
        return self.numDocs


    def __iter__(self):
        """
        The function that defines a corpus -- iterating over the corpus yields
        vectors, one for each document.
        """
        for docNo, text in enumerate(self.getArticles()):
            yield self.dictionary.doc2bow(text, allowUpdate=False)


    def saveDictionary(self, fname):
        """
        Store id->word mapping to a file, in format
        `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`.
        """
        logger.info("saving dictionary mapping to %s" % fname)
        fout = open(fname, 'w')
        for token, tokenId in sorted(self.dictionary.token2id.iteritems()):
            fout.write("%i\t%s\t%i\n" % (tokenId, token, self.dictionary.docFreq[tokenId]))
        fout.close()


    @staticmethod
    def loadDictionary(fname):
        """
        Load previously stored mapping between words and their ids.

        The result can be used as the `id2word` parameter for input to transformations.
        """
        result = {}
        for lineNo, line in enumerate(open(fname)):
            cols = line[:-1].split('\t')
            if len(cols) == 2:
                wordId, word = cols
            elif len(cols) == 3:
                wordId, word, docFreq = cols
            else:
                continue
            result[int(wordId)] = word # docFreq not used
        return result


    def saveAsText(self, fname):
        """
        Store the corpus to disk, in a human-readable text format.

        This actually saves two files:

        1. Document-term co-occurrence frequency counts (bag-of-words), as
           a Matrix Market file `fname_bow.mm`.
        2. Token to integer mapping, as a text file `fname_wordids.txt`.
        """
        self.saveDictionary(fname + '_wordids.txt')
        matutils.MmWriter.writeCorpus(fname + '_bow.mm', self, progressCnt=10000)


    def getArticles(self):
        """
        Iterate over the dump, returning text version of each article.

        Only articles of sufficient length are returned (short articles,
        redirects etc. are ignored).
        """
        articles, intext = 0, False
        for lineno, line in enumerate(bz2.BZ2File(self.fname)):
            if line.startswith('      <text'):
                intext = True
                line = line[line.find('>') + 1:]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find('</text>') # can be on the same line as <text>
            if pos >= 0:
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filterWiki(''.join(lines))
                if len(text) > ARTICLE_MIN_CHARS: # article redirects are pruned here
                    articles += 1
                    yield tokenize(text) # split text into tokens
        self.numDocs = articles # cache corpus length
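

# Usage sketch (illustrative only): end-to-end conversion of a Wikipedia dump into a
# bag-of-words corpus plus id->word mapping, following the example from the class
# docstring. The dump filename and the 'wiki_en_vocab200k' output prefix are just the
# docstring's example names, not fixed values; both passes take several hours.
if __name__ == '__main__':
    wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # first pass: build the vocabulary
    wiki.saveAsText('wiki_en_vocab200k') # second pass: writes wiki_en_vocab200k_bow.mm + wiki_en_vocab200k_wordids.txt

    # the stored mapping can later be reloaded and passed to transformations as `id2word`
    id2word = WikiCorpus.loadDictionary('wiki_en_vocab200k_wordids.txt')
    logger.info("loaded %i word ids" % len(id2word))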