Example #1
 def test_min_token_len_set(self):
     """
     set the parameter token_min_len to 1 and check that 'a' as a token exists
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, token_min_len=1, lemmatize=False)
     l = wc.get_texts()
     self.assertTrue(u'a' in next(l))
Example #2
def preprocess_wiki(input_file, output_file):
    # Import input file
    if not os.path.exists(input_file):
        url = 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
        logging.info('Download Wiki dump from {}'.format(url))
        wget.download(url)
    wiki = WikiCorpus(input_file, lemmatize=False, dictionary=[])

    # Convert traditional Chinese to simplified Chinese using OpenCC
    cc = OpenCC('t2s')
    # Segment the sentences into words using Jieba paddle mode
    jieba.enable_paddle()

    # Process Wiki text
    logging.info('Start processing Wiki text')
    output = open(output_file, 'w')
    i = 0
    for article in tqdm(wiki.get_texts()):
        raw = ' '.join(article)
        processed = []
        # Remove non-Chinese words
        for token in list(jieba.cut(cc.convert(raw))):
            matched = re.findall(r'[\u4e00-\u9fff]+', token)
            if matched:
                processed.append(matched[0])
        output.write(' '.join(processed) + '\n')
        i += 1
        if (i % 10000 == 0):
            logging.info('Finished processing {} articles'.format(i))
    output.close()
    logging.info('Done')
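
The function above writes one segmented article per line. A minimal follow-up sketch, not part of the original example: stream that file into Word2Vec via gensim's LineSentence reader. The output filename and the gensim >= 4.0 parameter name vector_size are assumptions.

# Sketch only: train Word2Vec on the file written by preprocess_wiki above.
# 'zhwiki_preprocessed.txt' is a placeholder; use whatever output_file you passed in.
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('zhwiki_preprocessed.txt')  # one article per line, space-separated tokens
model = Word2Vec(
    sentences,
    vector_size=200,  # 'size=200' on gensim < 4.0
    window=5,
    min_count=5,
    workers=max(1, multiprocessing.cpu_count() - 1),
)
model.save('zhwiki_word2vec.model')
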
Example #3
 def test_min_token_len_not_set(self):
     """
     don't set the parameter token_min_len and check that 'a' as a token doesn't exist
     default token_min_len=2
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False)
     self.assertTrue(u'a' not in next(wc.get_texts()))
Example #5
 def test_max_token_len_set(self):
     """
     set the parameter token_max_len to 16 and check that 'collectivization' as a token exists
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, token_max_len=16, lemmatize=False)
     l = wc.get_texts()
     self.assertTrue(u'collectivization' in next(l))
Example #6
 def test_max_token_len_not_set(self):
     """
     don't set the parameter token_max_len and check that 'collectivization' as a token doesn't exist
     default token_max_len=15
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False)
     l = wc.get_texts()
     self.assertTrue(u'collectivization' not in next(l))
Example #8
 def test_get_texts_returns_generator_of_lists(self):
     if sys.version_info < (2, 7, 0):
         return
     wc = WikiCorpus(datapath(FILENAME))
     l = wc.get_texts()
     self.assertEqual(type(l), types.GeneratorType)
     first = next(l)
     self.assertEqual(type(first), list)
     self.assertTrue(isinstance(first[0], bytes) or isinstance(first[0], str))
Example #9
 def test_lower_case_set_false(self):
     """
     set the parameter lower to False and check that upper case 'Anarchism' token exists
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lower=False, lemmatize=False)
     row = wc.get_texts()
     list_tokens = next(row)
     self.assertTrue(u'Anarchism' in list_tokens)
     self.assertTrue(u'anarchism' in list_tokens)
Example #10
 def test_lower_case_set_true(self):
     """
     set the parameter lower to True and check that upper case 'Anarchism' token doesn't exist
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lower=True, lemmatize=False)
     row = wc.get_texts()
     list_tokens = next(row)
     self.assertTrue(u'Anarchism' not in list_tokens)
     self.assertTrue(u'anarchism' in list_tokens)
Example #11
    def test_unicode_element(self):
        """
        First unicode article in this sample is
        1) папа
        """
        wc = WikiCorpus(datapath(FILENAME_U), processes=1)

        l = wc.get_texts()
        self.assertTrue(u'папа' in next(l))
Example #12
 def test_lower_case_set_false(self):
     """
     set the parameter lower to False and check that upper case 'Anarchism' token exists
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lower=False, lemmatize=False)
     l = wc.get_texts()
     list_tokens = next(l)
     self.assertTrue(u'Anarchism' in list_tokens)
     self.assertTrue(u'anarchism' in list_tokens)
Example #13
 def test_min_token_len_set(self):
     """
     set the parameter token_min_len to 1 and check that 'a' as a token exists
     """
     wc = WikiCorpus(datapath(FILENAME),
                     processes=1,
                     token_min_len=1,
                     lemmatize=False)
     self.assertTrue(u'a' in next(wc.get_texts()))
Example #14
 def test_max_token_len_set(self):
     """
     set the parameter token_max_len to 16 and check that 'collectivization' as a token exists
     """
     wc = WikiCorpus(datapath(FILENAME),
                     processes=1,
                     token_max_len=16,
                     lemmatize=False)
     self.assertTrue(u'collectivization' in next(wc.get_texts()))
Example #15
 def test_get_texts_returns_generator_of_lists(self):
     if sys.version_info < (2, 7, 0):
         return
     wc = WikiCorpus(datapath(FILENAME))
     l = wc.get_texts()
     self.assertEqual(type(l), types.GeneratorType)
     first = next(l)
     self.assertEqual(type(first), list)
     self.assertTrue(
         isinstance(first[0], bytes) or isinstance(first[0], str))
Example #16
    def test_first_element(self):
        """
        First two articles in this sample are
        1) anarchism
        2) autism
        """
        wc = WikiCorpus(datapath(FILENAME), processes=1)

        l = wc.get_texts()
        self.assertTrue(u'anarchism' in next(l))
        self.assertTrue(u'autism' in next(l))
Example #17
 def test_lower_case_set_true(self):
     """
     set the parameter lower to True and check that upper case 'Anarchism' token doesn't exist
     """
     wc = WikiCorpus(datapath(FILENAME),
                     processes=1,
                     lower=True,
                     lemmatize=False)
     l = wc.get_texts()
     list_tokens = next(l)
     self.assertTrue(u'Anarchism' not in list_tokens)
     self.assertTrue(u'anarchism' in list_tokens)
Example #18
 def test_custom_tokenizer(self):
     """
     define a custom tokenizer function and use it
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False, tokenizer_func=custom_tokeiner,
                     token_max_len=16, token_min_len=1, lower=False)
     row = wc.get_texts()
     list_tokens = next(row)
     self.assertTrue(u'Anarchism' in list_tokens)
     self.assertTrue(u'collectivization' in list_tokens)
     self.assertTrue(u'a' in list_tokens)
     self.assertTrue(u'i.e.' in list_tokens)
Example #19
 def test_custom_tokenizer(self):
     """
     define a custom tokenizer function and use it
     """
     wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False, tokenizer_func=custom_tokeiner,
                     token_max_len=16, token_min_len=1, lower=False)
     l = wc.get_texts()
     list_tokens = next(l)
     self.assertTrue(u'Anarchism' in list_tokens)
     self.assertTrue(u'collectivization' in list_tokens)
     self.assertTrue(u'a' in list_tokens)
     self.assertTrue(u'i.e.' in list_tokens)
Example #20
    def __init__(self,
                 corpus,
                 wiki_dict,
                 wordfile,
                 vocab_size=200000,
                 window_size=5):
        self.w2id_dict = util.load_worddict(wordfile, vocab_size)
        self.window_size = window_size

        print('Starting loading Wiki Corpus...', end='')
        wiki_d = Dictionary.load(wiki_dict)
        self.wiki_corpus = WikiCorpus(corpus, dictionary=wiki_d)
        print('[done]')
Example #21
    def test_first_element(self):
        """
        First two articles in this sample are
        1) anarchism
        2) autism
        """
        if sys.version_info < (2, 7, 0):
            return
        wc = WikiCorpus(datapath(FILENAME))

        l = wc.get_texts()
        self.assertTrue(b"anarchism" in next(l))
        self.assertTrue(b"autism" in next(l))
Example #23
def train_and_save_model(articles_path, model_path):
    corpus = WikiCorpus(articles_path, lemmatize=False, dictionary={})
    sentences = list(corpus.get_texts())
    params = {
        'size': 200,
        'window': 10,
        'min_count': 10,
        'workers': max(1,
                       multiprocessing.cpu_count() - 1),
        'sample': 1E-3,
    }
    model = Word2Vec(sentences, **params)
    model.save(model_path)

    return model
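
A brief, assumed usage of the helper above; both paths and the probe word are placeholders, and the trained vectors live under model.wv.

# Hypothetical call of train_and_save_model; paths are placeholders.
model = train_and_save_model('enwiki-latest-pages-articles.xml.bz2', 'wiki_w2v.model')
# Query the trained vectors (the probe word is an assumption).
print(model.wv.most_similar('anarchism', topn=5))
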
Example #24
 def makeWikiTextEmbedding(self):
     # on wiki text sentences
     wiki = WikiCorpus('data/swwiki-latest-pages-articles.xml.bz2',
                       lemmatize=False,
                       dictionary={})
     sentences = list(wiki.get_texts())
     print("wikitext: ", len(sentences), " sentences")
     self.debugPrintRandomSentences(sentences, 10)
     model = gs.models.Word2Vec(sentences,
                                size=100,
                                window=5,
                                min_count=5,
                                workers=8,
                                sg=1,
                                hs=1,
                                iter=15)
Example #25
def test_get_interesting_proportion():
    dictionary = HashDictionary(id_range=100000)
    dataset = Dataset(settings.dataset)
    terms = dataset.all_lemmas()
    
    wiki = RestrictedWikiCorpus(settings.corpus, terms=terms,
                                dictionary=dictionary)
    
    num_pages = sum(1 for _ in wiki.get_texts())
    
    unrestricted = WikiCorpus(settings.corpus,
                              dictionary=dictionary)

    num_unrestricted_pages = sum(1 for _ in unrestricted.get_texts())
    print "Unrestricted: %d, restricted: %d" % (
        num_unrestricted_pages, num_pages)
Example #26
def main():
    args = setup_args()
    logging.info(args)

    fw = open(args.text, 'w')
    corpus = WikiCorpus(args.dump,
                        dictionary={'a'},
                        tokenizer_func=tokenize_spacy)
    for index, sentences in enumerate(corpus.get_texts()):
        for sentence in sentences:
            fw.write('{}\n'.format(sentence))

        if index % 10000 == 0:
            logging.info('Done Article: {}'.format(index))

    return
Example #27
class WikiSentences:
    def __init__(self, wiki_dump_path):
        self.wiki = WikiCorpus(wiki_dump_path)

    def __iter__(self):
        for sentence in self.wiki.get_texts():
            yield sentence
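
Because __iter__ calls get_texts() afresh on every pass, the WikiSentences object above can be handed directly to Word2Vec, which scans the corpus more than once. A hedged usage sketch; the dump path and hyperparameters are illustrative and gensim >= 4.0 is assumed.

# Illustrative usage of WikiSentences (path and parameters are assumptions).
from gensim.models import Word2Vec

sentences = WikiSentences('enwiki-latest-pages-articles.xml.bz2')
model = Word2Vec(sentences, vector_size=200, window=5, min_count=5, workers=4)
model.save('wiki_word2vec.model')
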
Example #28
def save_corpus(wiki_path, corpus_path):

    # make a corpus
    corpus = WikiCorpus(wiki_path)

    # save it
    print("output path: {}".format(corpus_path))
    cPickle.dump(corpus, open(corpus_path, "wb"))
Example #29
class WikiSentences:
    def __init__(self, wiki_dump_path, lang):
        logging.info('Parsing wiki corpus')
        self.wiki = WikiCorpus(wiki_dump_path)
        self.lang = lang

    def __iter__(self):
        for sentence in self.wiki.get_texts():
            yield list(sentence)
Example #30
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    print("Loading Wiki Corpus")
    #wiki = WikiCorpus("enwiki-latest-pages-articles14.xml-p7697599p7744799.bz2")
    wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2')
    print(type(wiki))
    documents = TaggedWikiDocument(wiki)

    print("Documents Parsed")
    cores = multiprocessing.cpu_count()

    models = [
        # PV-DBOW
        Doc2Vec(dm=0,
                dbow_words=1,
                size=200,
                window=8,
                min_count=19,
                iter=10,
                workers=cores),
        # PV-DM w/average
        Doc2Vec(dm=1,
                dm_mean=1,
                size=200,
                window=8,
                min_count=19,
                iter=10,
                workers=cores),
    ]
    models[0].build_vocab(documents)
    print(str(models[0]))
    models[1].reset_from(models[0])
    print(str(models[1]))

    start = time.time()
    c = -1
    for model in models:
        print("Model  building")
        c += 1
        model.train(documents,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
        model.save(str(time.time()) + '_tweet_doc2vec{}.model'.format(str(c)))
        print(time.time() - start)

    print(time.time() - start)
Example #31
def _iterate_over_pages(fname):
    """
    Iterate over the pages in a Wikipedia articles database dump (*articles.xml.bz2),
    yielding one (page id, title, page content) 3-tuple at a time.
    """
    dictionary = Dictionary()
    wiki = WikiCorpus(fname,
                      lemmatize=False,
                      dictionary=dictionary,
                      filter_namespaces={'0'})
    for title, content, page_id in extract_pages(bz2.BZ2File(wiki.fname),
                                                 wiki.filter_namespaces):
        yield (page_id, title, content)
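
A short, hypothetical way to exercise the generator above: print the id, title, and raw text length of the first few pages. The dump filename is a placeholder.

# Sketch only: inspect the first few pages yielded by _iterate_over_pages.
dump_path = 'enwiki-latest-pages-articles.xml.bz2'  # placeholder path
for i, (page_id, title, content) in enumerate(_iterate_over_pages(dump_path)):
    print(page_id, title, len(content))
    if i >= 4:
        break
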
Example #32
class WikiSentences:
    # reference: https://github.com/LasseRegin/gensim-word2vec-model/blob/master/train.py
    def __init__(self, wiki_dump_path, lang):
        logging.info('Parsing wiki corpus')
        self.wiki = WikiCorpus(wiki_dump_path)
        self.lang = lang

    def __iter__(self):
        for sentence in self.wiki.get_texts():
            if self.lang == 'zh':
                yield list(jieba.cut(''.join(sentence), cut_all=False))
            else:
                yield list(sentence)
Example #33
def build_d2v_model():
    result_file = open('res.txt', 'w+')
    result_file.write('start \n')
    result_file.flush()

    # include wikipedia dataset
    wiki = WikiCorpus(get_d2v_base())

    result_file.write('tag docs ready \n')

    result_file.flush()

    documents = TaggedWikiDocument(wiki)

    result_file.write('docs = wiki')
    result_file.flush()

    cores = multiprocessing.cpu_count()

    models = [
        # PV-DBOW
        Doc2Vec(dm=0,
                window=10,
                dbow_words=1,
                vector_size=300,
                min_count=20,
                epochs=20,
                workers=cores),
        # PV-DM w/average
        Doc2Vec(dm=1,
                window=10,
                dm_mean=1,
                vector_size=300,
                min_count=20,
                epochs=20,
                workers=cores),
    ]

    models[0].build_vocab(documents)
    models[1].reset_from(models[0])

    result_file.write('vocabulary built')
    result_file.flush()

    for model in models:
        model.train(documents,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)

    models[0].save('doc2vec_model_0')
    models[1].save('doc2vec_model_1')
Example #34
def Word2VecTraining(Size=200, window=7, min_count=5, Language='spanish'):

    #Using WikiCorpus in Spanish Version
    wiki = WikiCorpus(
        'D:/Gita/GITA_Master/Databases/WikiCorpus/eswiki-latest-pages-articles.xml.bz2',
        lemmatize=False,
        dictionary={})
    corpus = list(wiki.get_texts())
    # Defining parameters
    params = {
        'size': Size,
        'window': window,
        'min_count': min_count,
        'workers': max(1,
                       multiprocessing.cpu_count() - 1),
        'sample': 1E-3,
    }
    #Model Training with WikiCorpus
    word2vec = Word2Vec(corpus, **params)

    ################TODO: Save model####################

    return word2vec
Example #35
def LDADictionary(Language='spanish'):
    path_to_wiki_dump = datapath(
        'D:/Gita/GITA_Master/Databases/WikiCorpus/eswiki-latest-pages-articles.xml.bz2'
    )
    corpus_path = get_tmpfile("wiki-corpus.mm")

    wiki = WikiCorpus(
        path_to_wiki_dump)  # create word->word_id mapping, ~8h on full wiki
    MmCorpus.serialize(
        corpus_path,
        wiki)  # another 8h, creates a file in MatrixMarket format and mapping
    dictionary = wiki.dictionary
    ################TODO: Save Dictionary####################
    return dictionary
Example #36
def create_wiki_dict(wiki_path, run_type):
    from gensim.corpora.wikicorpus import WikiCorpus

    fn = '/data/logs/create_wiki_dict.log'
    logging.basicConfig(filename=fn, level=logging.DEBUG, format=FORMAT)
    module_logger = logging.getLogger('wiki_module_logger')
    module_logger.setLevel(logging.DEBUG)
    # set file handler
    fh = logging.FileHandler(fn)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    module_logger.addHandler(fh)

    module_logger.info("START")
    wiki_corpus = WikiCorpus(wiki_path)  # This will take many hours! Output is Wikipedia in bag-of-words (BOW) sparse matrix.
    module_logger.info("Wiki corpus ready")
    wiki_corpus.dictionary.save("/data/logs/wiki_dump_dict.dict")
    module_logger.info("Dictionary Created")
Example #37
def main():

    wiki = WikiCorpus("enwiki-latest-pages-articles.xml.bz2")
    documents = TaggedWikiDocument(wiki)

    domain_vocab_file = "poems poets poem poet symfony symfonies ghazal ghazals song lyrics"
    vocab_list = domain_vocab_file.split()

    dim = 200
    win = 8
    neg = 5

    kwargs = {
        "sent": documents,
        "vocab": vocab_list,
        "dim": dim,
        "win": win,
        "min_cnt": 19,
        "neg": neg,
        "iter": 20,
        "tag_doc": documents
    }
    Dis2Vec(**kwargs).run_Dis2Vec()
Example #38
#!/usr/bin/env python
# coding: utf-8

from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint
import multiprocessing
from gensim.test.utils import get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec
print('Start')
wiki = WikiCorpus("./data/enwiki-latest-pages-articles.xml.bz2.2")
print('Wiki loaded')
class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True
    def __iter__(self):
        for content, (page_id, title) in self.wiki.get_texts():
            print (TaggedDocument(content, [title]))
            yield TaggedDocument(content, [title])

class EpochLoggerDM(CallbackAny2Vec):
    '''Callback to log information about training'''

    def __init__(self, path_prefix):
        self.path_prefix = path_prefix
        self.epoch = 0

    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))
Example #39
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import Word2Vec

wiki = WikiCorpus('/Users/pavel/PycharmProjects/NeuralNetworkWithTenser/src/wordrecognition/res/ruwiki-20190120-pages-articles-multistream1.xml-p4p204179.bz2', dictionary=False)
print('bigram')
bigram = Phrases(wiki.get_texts())
print('bigram_transformer')
bigram_transformer = Phraser(bigram)


def text_generator_bigram():
    for text in wiki.get_texts():
        yield bigram_transformer[[word for word in text]]


trigram = Phrases(text_generator_bigram())
print('trigram')
trigram_transformer = Phraser(trigram)
print('trigram_transformer')


def text_generator_trigram():
    for text in wiki.get_texts():
        yield trigram_transformer[bigram_transformer[[word for word in text]]]


print('model create')
model = Word2Vec(size=100, window=7, min_count=10, workers=10, iter=1, min_alpha=0.025)
print('build_vocab')
model.build_vocab(text_generator_trigram())
Example #40
import sys
import json
from os import path
from gensim.corpora.wikicorpus import WikiCorpus

base_dir = path.join(path.dirname(path.realpath(__file__)), path.pardir)
wiki_filename = 'simplewiki-20171103-pages-articles-multistream.xml.bz2'
wiki_path = path.join(base_dir, 'corpora', wiki_filename)
outname = path.join(base_dir, 'corpora', 'simplewikiselect')

index = []  # Save information about articles as they've been processed.

wiki = WikiCorpus(wiki_path, dictionary=True)  # dict=True avoids making vocab
wiki.metadata = True  # Want article titles
print("Loading Wikipedia archive (this may take a few minutes)... ", end="")
articles = list(wiki.get_texts())
print("Done.")

num_articles = len(articles)

print("Total Number of Articles:", num_articles)

MAX_WC = 20_000_000
ARTICLE_MIN_WC = 200
ARTICLE_MAX_WC = 10000

ac = 0
wc = 0
selected = []

with open(outname + ".txt", "w") as f:
Example #41
from gensim.corpora.wikicorpus import WikiCorpus


wiki = WikiCorpus('', processes=None, lemmatize=False, dictionary=None)
texts = wiki.get_texts()
with open('wikitext.txt', 'w') as wikitext:
    for text in texts:
        wikitext.write(' '.join(text) + "\n")