def test_min_token_len_set(self):
    """
    set the parameter token_min_len to 1 and check that 'a' as a token exists
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1, token_min_len=1, lemmatize=False)
    l = wc.get_texts()
    self.assertTrue(u'a' in next(l))
def preprocess_wiki(input_file, output_file):
    # Download the Wiki dump if the input file does not exist
    if not os.path.exists(input_file):
        url = 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
        logging.info('Download Wiki dump from {}'.format(url))
        wget.download(url)
    wiki = WikiCorpus(input_file, lemmatize=False, dictionary=[])

    # Convert traditional Chinese to simplified Chinese using OpenCC
    cc = OpenCC('t2s')
    # Segment the sentences into words using Jieba paddle mode
    jieba.enable_paddle()

    # Process Wiki text
    logging.info('Start processing Wiki text')
    output = open(output_file, 'w')
    i = 0
    for article in tqdm(wiki.get_texts()):
        raw = ' '.join(article)
        processed = []
        # Remove non-Chinese words
        for token in list(jieba.cut(cc.convert(raw))):
            matched = re.findall(r'[\u4e00-\u9fff]+', token)
            if matched:
                processed.append(matched[0])
        output.write(' '.join(processed) + '\n')
        i += 1
        if i % 10000 == 0:
            logging.info('Finished processing {} articles'.format(i))
    output.close()
    logging.info('Done')
def test_min_token_len_not_set(self):
    """
    don't set the parameter token_min_len and check that 'a' does not exist
    as a token (default token_min_len=2)
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False)
    self.assertTrue(u'a' not in next(wc.get_texts()))
def test_max_token_len_set(self):
    """
    set the parameter token_max_len to 16 and check that 'collectivization'
    exists as a token
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1, token_max_len=16, lemmatize=False)
    l = wc.get_texts()
    self.assertTrue(u'collectivization' in next(l))
def test_max_token_len_not_set(self):
    """
    don't set the parameter token_max_len and check that 'collectivization'
    does not exist as a token (default token_max_len=15)
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False)
    l = wc.get_texts()
    self.assertTrue(u'collectivization' not in next(l))
def test_get_texts_returns_generator_of_lists(self):
    if sys.version_info < (2, 7, 0):
        return
    wc = WikiCorpus(datapath(FILENAME))
    l = wc.get_texts()
    self.assertEqual(type(l), types.GeneratorType)
    first = next(l)
    self.assertEqual(type(first), list)
    self.assertTrue(isinstance(first[0], bytes) or isinstance(first[0], str))
def test_lower_case_set_false(self):
    """
    set the parameter lower to False and check that the upper-case token
    'Anarchism' exists
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1, lower=False, lemmatize=False)
    row = wc.get_texts()
    list_tokens = next(row)
    self.assertTrue(u'Anarchism' in list_tokens)
    self.assertTrue(u'anarchism' in list_tokens)
def test_lower_case_set_true(self):
    """
    set the parameter lower to True and check that the upper-case token
    'Anarchism' does not exist
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1, lower=True, lemmatize=False)
    row = wc.get_texts()
    list_tokens = next(row)
    self.assertTrue(u'Anarchism' not in list_tokens)
    self.assertTrue(u'anarchism' in list_tokens)
def test_unicode_element(self):
    """
    First unicode article in this sample is
    1) папа
    """
    wc = WikiCorpus(datapath(FILENAME_U), processes=1)
    l = wc.get_texts()
    self.assertTrue(u'папа' in next(l))
def test_lower_case_set_false(self):
    """
    set the parameter lower to False and check that the upper-case token
    'Anarchism' exists
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1, lower=False, lemmatize=False)
    l = wc.get_texts()
    list_tokens = next(l)
    self.assertTrue(u'Anarchism' in list_tokens)
    self.assertTrue(u'anarchism' in list_tokens)
def test_min_token_len_set(self):
    """
    set the parameter token_min_len to 1 and check that 'a' as a token exists
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1, token_min_len=1, lemmatize=False)
    self.assertTrue(u'a' in next(wc.get_texts()))
def test_max_token_len_set(self):
    """
    set the parameter token_max_len to 16 and check that 'collectivization'
    exists as a token
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1, token_max_len=16, lemmatize=False)
    self.assertTrue(u'collectivization' in next(wc.get_texts()))
def test_first_element(self):
    """
    First two articles in this sample are
    1) anarchism
    2) autism
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1)
    l = wc.get_texts()
    self.assertTrue(u'anarchism' in next(l))
    self.assertTrue(u'autism' in next(l))
def test_lower_case_set_true(self):
    """
    set the parameter lower to True and check that the upper-case token
    'Anarchism' does not exist
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1, lower=True, lemmatize=False)
    l = wc.get_texts()
    list_tokens = next(l)
    self.assertTrue(u'Anarchism' not in list_tokens)
    self.assertTrue(u'anarchism' in list_tokens)
def test_custom_tokenizer(self):
    """
    define a custom tokenizer function and use it
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False, tokenizer_func=custom_tokeiner,
                    token_max_len=16, token_min_len=1, lower=False)
    row = wc.get_texts()
    list_tokens = next(row)
    self.assertTrue(u'Anarchism' in list_tokens)
    self.assertTrue(u'collectivization' in list_tokens)
    self.assertTrue(u'a' in list_tokens)
    self.assertTrue(u'i.e.' in list_tokens)
def test_custom_tokenizer(self):
    """
    define a custom tokenizer function and use it
    """
    wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False, tokenizer_func=custom_tokeiner,
                    token_max_len=16, token_min_len=1, lower=False)
    l = wc.get_texts()
    list_tokens = next(l)
    self.assertTrue(u'Anarchism' in list_tokens)
    self.assertTrue(u'collectivization' in list_tokens)
    self.assertTrue(u'a' in list_tokens)
    self.assertTrue(u'i.e.' in list_tokens)
def __init__(self, corpus, wiki_dict, wordfile, vocab_size=200000, window_size=5):
    self.w2id_dict = util.load_worddict(wordfile, vocab_size)
    self.window_size = window_size

    print('Starting loading Wiki Corpus...', end='')
    wiki_d = Dictionary.load(wiki_dict)
    self.wiki_corpus = WikiCorpus(corpus, dictionary=wiki_d)
    print('[done]')
def test_first_element(self):
    """
    First two articles in this sample are
    1) anarchism
    2) autism
    """
    if sys.version_info < (2, 7, 0):
        return
    wc = WikiCorpus(datapath(FILENAME))
    l = wc.get_texts()
    self.assertTrue(b"anarchism" in next(l))
    self.assertTrue(b"autism" in next(l))
def train_and_save_model(articles_path, model_path):
    corpus = WikiCorpus(articles_path, lemmatize=False, dictionary={})
    sentences = list(corpus.get_texts())
    params = {
        'size': 200,
        'window': 10,
        'min_count': 10,
        'workers': max(1, multiprocessing.cpu_count() - 1),
        'sample': 1E-3,
    }
    model = Word2Vec(sentences, **params)
    model.save(model_path)
    return model
def makeWikiTextEmbedding(self):
    # on wiki text sentences
    wiki = WikiCorpus('data/swwiki-latest-pages-articles.xml.bz2', lemmatize=False, dictionary={})
    sentences = list(wiki.get_texts())
    print("wikitext: ", len(sentences), " sentences")
    self.debugPrintRandomSentences(sentences, 10)

    model = gs.models.Word2Vec(sentences, size=100, window=5, min_count=5, workers=8,
                               sg=1, hs=1, iter=15)
def test_get_interesting_proportion():
    dictionary = HashDictionary(id_range=100000)
    dataset = Dataset(settings.dataset)
    terms = dataset.all_lemmas()
    wiki = RestrictedWikiCorpus(settings.corpus, terms=terms, dictionary=dictionary)
    num_pages = sum(1 for _ in wiki.get_texts())
    unrestricted = WikiCorpus(settings.corpus, dictionary=dictionary)
    num_unrestricted_pages = sum(1 for _ in unrestricted.get_texts())
    print "Unrestricted: %d, restricted: %d" % (num_unrestricted_pages, num_pages)
def main():
    args = setup_args()
    logging.info(args)

    fw = open(args.text, 'w')
    corpus = WikiCorpus(args.dump, dictionary={'a'}, tokenizer_func=tokenize_spacy)
    for index, sentences in enumerate(corpus.get_texts()):
        for sentence in sentences:
            fw.write('{}\n'.format(sentence))
        if index % 10000 == 0:
            logging.info('Done Article: {}'.format(index))
    return
class WikiSentences:
    def __init__(self, wiki_dump_path):
        self.wiki = WikiCorpus(wiki_dump_path)

    def __iter__(self):
        for sentence in self.wiki.get_texts():
            yield sentence
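# Hedged usage sketch (not part of the class above): WikiSentences is a
# restartable iterable of token lists, which is what gensim's Word2Vec expects,
# so it can be streamed into training directly. The dump path, model filename,
# and hyperparameters below are illustrative assumptions; the parameter names
# follow gensim 4.x (older releases use size/iter instead of vector_size/epochs).
from gensim.models import Word2Vec

sentences = WikiSentences('enwiki-latest-pages-articles.xml.bz2')  # hypothetical path
model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)
model.save('wiki_word2vec.model')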
def save_corpus(wiki_path, corpus_path):
    # make a corpus
    corpus = WikiCorpus(wiki_path)
    # save it
    print("output path: {}".format(corpus_path))
    cPickle.dump(corpus, open(corpus_path, "wb"))
class WikiCorpus:
    # Note: this wrapper shares its name with gensim's WikiCorpus, which it
    # instantiates below; in the original module the two must be bound to
    # different names for the call to resolve to gensim's class.
    def __init__(self, wiki_dump_path, lang):
        logging.info('Parsing wiki corpus')
        self.wiki = WikiCorpus(wiki_dump_path)
        self.lang = lang

    def __iter__(self):
        for sentence in self.wiki.get_texts():
            yield list(sentence)
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    print("Loading Wiki Corpus")
    # wiki = WikiCorpus("enwiki-latest-pages-articles14.xml-p7697599p7744799.bz2")
    wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2')
    print(type(wiki))
    documents = TaggedWikiDocument(wiki)
    print("Documents Parsed")

    cores = multiprocessing.cpu_count()
    models = [
        # PV-DBOW
        Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=19, iter=10, workers=cores),
        # PV-DM w/average
        Doc2Vec(dm=1, dm_mean=1, size=200, window=8, min_count=19, iter=10, workers=cores),
    ]
    models[0].build_vocab(documents)
    print(str(models[0]))
    models[1].reset_from(models[0])
    print(str(models[1]))

    start = time.time()
    c = -1
    for model in models:
        print("Model building")
        c += 1
        model.train(documents, total_examples=model.corpus_count, epochs=model.iter)
        model.save(str(time.time()) + '_tweet_doc2vec{}.model'.format(str(c)))
        print(time.time() - start)
    print(time.time() - start)
def _iterate_over_pages(fname):
    """
    Iterate over the pages in a Wikipedia articles database dump (*articles.xml.bz2),
    yielding one (page id, title, page content) 3-tuple at a time.
    """
    dictionary = Dictionary()
    wiki = WikiCorpus(fname, lemmatize=False, dictionary=dictionary, filter_namespaces={'0'})
    for title, content, page_id in extract_pages(bz2.BZ2File(wiki.fname), wiki.filter_namespaces):
        yield (page_id, title, content)
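# Hedged usage sketch for the helper above: stream (page_id, title, content)
# tuples straight out of a dump without tokenizing or building a vocabulary.
# The dump filename is an illustrative assumption.
for page_id, title, content in _iterate_over_pages('enwiki-latest-pages-articles.xml.bz2'):
    print(page_id, title, len(content))
    break  # only inspect the first page in this sketch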
class WikiSentences:
    # reference: https://github.com/LasseRegin/gensim-word2vec-model/blob/master/train.py
    def __init__(self, wiki_dump_path, lang):
        logging.info('Parsing wiki corpus')
        self.wiki = WikiCorpus(wiki_dump_path)
        self.lang = lang

    def __iter__(self):
        for sentence in self.wiki.get_texts():
            if self.lang == 'zh':
                yield list(jieba.cut(''.join(sentence), cut_all=False))
            else:
                yield list(sentence)
def build_d2v_model():
    result_file = open('res.txt', 'w+')
    result_file.write('start \n')
    result_file.flush()

    # include wikipedia dataset
    wiki = WikiCorpus(get_d2v_base())
    result_file.write('tag docs ready \n')
    result_file.flush()

    documents = TaggedWikiDocument(wiki)
    result_file.write('docs = wiki')
    result_file.flush()

    cores = multiprocessing.cpu_count()
    models = [
        # PV-DBOW
        Doc2Vec(dm=0, window=10, dbow_words=1, vector_size=300, min_count=20, epochs=20, workers=cores),
        # PV-DM w/average
        Doc2Vec(dm=1, window=10, dm_mean=1, vector_size=300, min_count=20, epochs=20, workers=cores),
    ]
    models[0].build_vocab(documents)
    models[1].reset_from(models[0])
    result_file.write('vocabulary built')
    result_file.flush()

    for model in models:
        model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

    models[0].save('doc2vec_model_0')
    models[1].save('doc2vec_model_1')
def Word2VecTraining(Size=200, window=7, min_count=5, Language='spanish'):
    # Using WikiCorpus in Spanish version
    wiki = WikiCorpus(
        'D:/Gita/GITA_Master/Databases/WikiCorpus/eswiki-latest-pages-articles.xml.bz2',
        lemmatize=False, dictionary={})
    corpus = list(wiki.get_texts())

    # Defining parameters
    params = {
        'size': Size,
        'window': window,
        'min_count': min_count,
        'workers': max(1, multiprocessing.cpu_count() - 1),
        'sample': 1E-3,
    }

    # Model training with WikiCorpus
    word2vec = Word2Vec(corpus, **params)
    ################ TODO: Save model ####################
    return word2vec
def LDADictionary(Language='spanish'):
    path_to_wiki_dump = datapath(
        'D:/Gita/GITA_Master/Databases/WikiCorpus/eswiki-latest-pages-articles.xml.bz2')
    corpus_path = get_tmpfile("wiki-corpus.mm")

    wiki = WikiCorpus(path_to_wiki_dump)  # create word->word_id mapping, ~8h on full wiki
    MmCorpus.serialize(corpus_path, wiki)  # another 8h, creates a file in MatrixMarket format and mapping
    dictionary = wiki.dictionary
    ################ TODO: Save Dictionary ####################
    return dictionary
def create_wiki_dict(wiki_path, run_type):
    from gensim.corpora.wikicorpus import WikiCorpus

    fn = '/data/logs/create_wiki_dict.log'
    logging.basicConfig(filename=fn, level=logging.DEBUG, format=FORMAT)
    module_logger = logging.getLogger('wiki_module_logger')
    module_logger.setLevel(logging.DEBUG)
    # set file handler
    fh = logging.FileHandler(fn)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    module_logger.addHandler(fh)

    module_logger.info("START")
    # This will take many hours! Output is Wikipedia as a bag-of-words (BOW) sparse matrix.
    wiki_corpus = WikiCorpus(wiki_path)
    module_logger.info("Wiki corpus ready")
    wiki_corpus.dictionary.save("/data/logs/wiki_dump_dict.dict")
    module_logger.info("Dictionary Created")
def main():
    wiki = WikiCorpus("enwiki-latest-pages-articles.xml.bz2")
    documents = TaggedWikiDocument(wiki)

    domain_vocab_file = "poems poets poem poet symfony symfonies ghazal ghazals song lyrics"
    vocab_list = domain_vocab_file.split()

    dim = 200
    win = 8
    neg = 5

    kwargs = {
        "sent": documents,
        "vocab": vocab_list,
        "dim": dim,
        "win": win,
        "min_cnt": 19,
        "neg": neg,
        "iter": 20,
        "tag_doc": documents,
    }

    Dis2Vec(**kwargs).run_Dis2Vec()
#!/usr/bin/env python
# coding: utf-8

from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint
import multiprocessing
from gensim.test.utils import get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec

print('Start')
wiki = WikiCorpus("./data/enwiki-latest-pages-articles.xml.bz2.2")
print('Wiki loaded')


class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True

    def __iter__(self):
        for content, (page_id, title) in self.wiki.get_texts():
            print(TaggedDocument(content, [title]))
            yield TaggedDocument(content, [title])


class EpochLoggerDM(CallbackAny2Vec):
    '''Callback to log information about training'''

    def __init__(self, path_prefix):
        self.path_prefix = path_prefix
        self.epoch = 0

    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import Word2Vec

wiki = WikiCorpus(
    '/Users/pavel/PycharmProjects/NeuralNetworkWithTenser/src/wordrecognition/res/ruwiki-20190120-pages-articles-multistream1.xml-p4p204179.bz2',
    dictionary=False)

print('bigram')
bigram = Phrases(wiki.get_texts())
print('bigram_transformer')
bigram_transformer = Phraser(bigram)


def text_generator_bigram():
    for text in wiki.get_texts():
        yield bigram_transformer[[word for word in text]]


trigram = Phrases(text_generator_bigram())
print('trigram')
trigram_transformer = Phraser(trigram)
print('trigram_transformer')


def text_generator_trigram():
    for text in wiki.get_texts():
        yield trigram_transformer[bigram_transformer[[word for word in text]]]


print('model create')
model = Word2Vec(size=100, window=7, min_count=10, workers=10, iter=1, min_alpha=0.025)
print('build_vocab')
model.build_vocab(text_generator_trigram())
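# Hedged continuation (not part of the snippet above): after build_vocab the
# model still has to be trained and saved. Word2Vec iterates over the corpus
# once per epoch, so a restartable iterable is safer here than the raw
# generator used above; the wrapper class and output filename are illustrative
# assumptions.
class TrigramTexts:
    def __iter__(self):
        return text_generator_trigram()


model.train(TrigramTexts(), total_examples=model.corpus_count, epochs=model.epochs)
model.save('ruwiki_trigram_word2vec.model')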
import sys
import json
from os import path

from gensim.corpora.wikicorpus import WikiCorpus

base_dir = path.join(path.dirname(path.realpath(__file__)), path.pardir)
wiki_filename = 'simplewiki-20171103-pages-articles-multistream.xml.bz2'
wiki_path = path.join(base_dir, 'corpora', wiki_filename)
outname = path.join(base_dir, 'corpora', 'simplewikiselect')

index = []  # Save information about articles as they've been processed.

wiki = WikiCorpus(wiki_path, dictionary=True)  # passing dictionary=True avoids building the vocabulary
wiki.metadata = True  # Want article titles
print("Loading Wikipedia archive (this may take a few minutes)... ", end="")
articles = list(wiki.get_texts())
print("Done.")

num_articles = len(articles)
print("Total Number of Articles:", num_articles)

MAX_WC = 20_000_000
ARTICLE_MIN_WC = 200
ARTICLE_MAX_WC = 10000

ac = 0
wc = 0
selected = []

with open(outname + ".txt", "w") as f:
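    # Hypothetical reconstruction of the truncated selection loop, inferred from
    # the counters defined above (MAX_WC, ARTICLE_MIN_WC, ARTICLE_MAX_WC, ac, wc,
    # selected, index); the original logic may differ in its details.
    for tokens, (page_id, title) in articles:
        art_wc = len(tokens)
        if not (ARTICLE_MIN_WC <= art_wc <= ARTICLE_MAX_WC):
            continue  # skip articles outside the per-article word-count bounds
        f.write(' '.join(tokens) + '\n')
        index.append({'id': page_id, 'title': title, 'wc': art_wc})
        selected.append(title)
        ac += 1
        wc += art_wc
        if wc >= MAX_WC:
            break  # stop once the total word budget is reached

# Also hypothetical: persist the article index, which would explain the json
# import at the top of the snippet.
with open(outname + ".json", "w") as f_idx:
    json.dump(index, f_idx)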
from gensim.corpora.wikicorpus import WikiCorpus

wiki = WikiCorpus('', processes=None, lemmatize=False, dictionary=None)
texts = wiki.get_texts()

with open('wikitext.txt', 'w') as wikitext:
    for text in texts:
        wikitext.write(' '.join(text) + "\n")