def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #2
def extract_wiki(thresh, env_path, vec_file):
    program = os.path.basename(env_path[0])
    logger = logging.getLogger(program)
 
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
 
    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0
    print('--- load ck12 word2vec')
    model = gensim.models.Word2Vec.load_word2vec_format(vec_file, binary=False)
    print('--- filtering keywords based on sim to ck12 keyword science')
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        topic = [w for w in text[:20] if w not in stopwords.words('english')]
        sim = np.mean([model[w].dot(model['science']) if w in model else 0 for w in topic])
        #sim = model['science'].dot(topic_vec)
        if sim > thresh:
            output.write(space.join(text) + "\n")
            i = i + 1
            if (i % 100 == 0):
                logger.info("Saved " + str(i) + " articles")    
    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
def process_enwiki(input_file, output_file):
    space = ' '
    i = 0
    output = open(output_file, 'w')
    wiki = WikiCorpus(input_file, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(space.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            logger.info('Saved ' + str(i) + ' articles')
    output.close()
def parse(filename):
	OUTPATH = '../gen_data/wikicorpus'
	fout = open(OUTPATH, 'w')
	wiki = WikiCorpus(filename, lemmatize=False, dictionary={}, processes=5)
	count = 0
	for text in wiki.get_texts():
		fout.write(" ".join(text) + "\n")
		count = count + 1
		if (count % 10000 == 0):
			logging.info("Save "+str(count) + " articles")
	fout.close()
	logging.info("Finished saved "+str(count) + "articles")
Example #5
def parse_wiki(filename):
    fout = open('../../paper/data/wiki/wiki_corpus', 'w')
    wiki = WikiCorpus(filename, lemmatize=False, dictionary={}, processes=5)
    count = 0
    for text in wiki.get_texts():
        fout.write('%s\n' % ' '.join(text))
        if count % 10000 == 0:
            logging.info(count)
        count += 1

    fout.close()
    logging.info('Finish %d' % count)
Example #6
def process_wiki(infile, outfile):
	from gensim.corpora import WikiCorpus
	wiki = WikiCorpus(infile, lemmatize=False, dictionary={})
	i = 0
	with open(outfile, 'w') as fw:
		for text in wiki.get_texts():
			text = ' '.join(text)
			cut_text = cut(text)
			fw.write(re.sub(r' {1,}', ' ', ' '.join(cut_text)) + '\n')
			i += 1
			if i % 1000 == 0:
				logger.info('Saved ' + str(i) + ' texts')
	logger.info('Finished ' + str(i) + ' texts')
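The cut() helper, the re module, and the logger used in process_wiki() above are not defined in the snippet. A minimal sketch of compatible definitions, assuming jieba is the intended Chinese segmenter (an assumption, not confirmed by the source):

# Hypothetical helpers for process_wiki(); the original project may use a different segmenter.
import logging
import re

import jieba

logger = logging.getLogger(__name__)

def cut(text):
    # jieba.cut returns a generator of segmented words; the caller re-joins them with spaces.
    return jieba.cut(text)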
Example #8
def preprocessing(logger, data_path, output_filename):
    i = 0
    output = open(output_filename, 'w', encoding="utf-8")

    wiki = WikiCorpus(data_path, lemmatize=False, dictionary={}, lower=False)
    for text in wiki.get_texts():
        output.write(' '.join(text) + '\n')
        i = i + 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
Example #9
def extract_corpus(infile, outfile):
    print(' '.join([
        'Extracting Wikipedia corpus file ' + infile + '.',
        'This may take a couple minutes...',
    ]))
    with open(outfile, 'w') as output:
        wiki = WikiCorpus(infile)
        # "text" is actually each individual article
        for i, text in enumerate(wiki.get_texts()):
            output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
            if i > 0 and i % 10000 == 0:
                print('Processed ' + str(i) + ' articles so far.')
    print('Processing complete! Yippee!')
Example #10
def wiki_to_txt(file_name, output_name):
    logging.info("開始 wiki_to_txt")
    wiki_corpus = WikiCorpus(file_name, dictionary={})
    texts_num = 0
    converter = opencc.OpenCC('s2t.json')
    with open(output_name, 'w', encoding='utf-8') as output:
        for texts in wiki_corpus.get_texts():
            r = converter.convert(' '.join(texts))
            output.write(r + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章" % texts_num)

    logging.info("結束 wiki_to_txt")
Example #11
def build_corpus(infile,outfile):
	""" Converts a Wikipedia xml dump to a text corpus"""
	output = open(outfile,'w')
	wiki = WikiCorpus(infile)

	i = 0
	for text in wiki.get_texts():
		output.write(bytes(' '.join(text),'utf-8').decode('utf-8')+'\n')
		i += 1
		if (i%10 == 0):
			print('Processed '+ str(i) + ' articles')

	output.close()
	print('Processing complete!')
    def make_corpus(self):
        """Convert Wikipedia xml dump file to text corpus"""

        output = open(self.wiki_file, 'w', encoding="utf-8")
        wiki = WikiCorpus(self.dump_file)
        i = 0
        for text in wiki.get_texts():
            output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
            i = i + 1
            if (i % 10000 == 0):
                print('Processed ' + str(i) + ' articles')

        output.close()
        print('Processing complete!')
Example #13
def make_corpus(in_f, out_f):
    """Convert Wikipedia xml dump file to text corpus"""

    output = open(out_f, 'w')
    wiki = WikiCorpus(in_f, tokenizer_func=tokenize)

    i = 0
    for text in wiki.get_texts():
        output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
        i = i + 1
        if i % 10000 == 0:
            print('Processed ' + str(i) + ' articles')
    output.close()
    print('Processing complete!')
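The tokenize function passed as tokenizer_func above is not shown. gensim's WikiCorpus expects a tokenizer with the signature (content, token_min_len, token_max_len, lower) returning a list of tokens; a minimal sketch, where the actual splitting rule is an assumption:

# Hypothetical tokenizer matching the tokenizer_func interface used above.
import re

def tokenize(content, token_min_len, token_max_len, lower):
    if lower:
        content = content.lower()
    # Keep word-like tokens whose length falls within the allowed range.
    return [token for token in re.findall(r'\w+', content, re.UNICODE)
            if token_min_len <= len(token) <= token_max_len]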
Example #14
def make_corpus(input_file):
    """Convert Wikipedia xml dump file to text corpus"""

    wiki = WikiCorpus(input_file)
    wiki.metadata = True
    output_folder = '../corpus'
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    for article in wiki.get_texts():
        text = article[0]
        page_id, title = article[1]
        filename = f'{output_folder}/{page_id}-{slugify(title)}.txt'
        with open(filename, 'a') as file:
            file.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
            print(f'{page_id} {title}')
Example #15
def make_corpus(in_f, out_f):
    """Convert Wikipedia xml dump file to text corpus"""

    output = open(out_f, 'w')
    wiki = WikiCorpus(in_f)

    i = 0
    for text in wiki.get_texts():
        output.write(' '.join(text) + '\n')
        i = i + 1
        if (i % 1 == 0):
            print('Processed ' + str(i) + ' articles')
    output.close()
    print('Processing complete!')
Example #16
def wiki_to_text(wiki_data_path):
    logging.info("开始将维基语料转换为普通文本格式:")
    if os.path.exists("wiki_texts.txt"):
        return

    wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})
    texts_num = 0

    with open("wiki_texts.txt",'w',encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("已处理 %d 篇文章" % texts_num)
Example #17
def create_corpus(input_file_name, output_file_name):
  output = open(output_file_name, 'w', encoding='utf-8')
  
  wiki = WikiCorpus(input_file_name, lemmatize=False, dictionary={}, lower=False)
  
  i = 0
  for text in wiki.get_texts():
      output.write(' '.join(text) + '\n')
      i = i + 1
      if i % 10000 == 0:
          logger.info(f"Saved {i} articles")
  
  output.close()
  logger.info(f"Finished Saved {i} articles")
Example #18
    def wikiToTxt(self):
        # This function takes about 25 minutes
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        wiki_corpus = WikiCorpus(
            './build/zhwiki-latest-pages-articles.xml.bz2', dictionary={})

        texts_num = 0
        with open('./build/wiki_texts.txt', 'w') as output:
            for text in wiki_corpus.get_texts():  # get_texts() returns one article at a time, as a list with one sentence per item
                output.write(' '.join(text) + '\n')
                texts_num += 1
                if texts_num % 10000 == 0:
                    logging.info("壓縮檔轉為文字檔(以空格分開句子),已處理 %d 篇文章" % texts_num)
Example #19
def make_corpus(in_f, out_f):

    """Convert Wikipedia xml dump file to text corpus"""

    output = open(out_f, "w")
    wiki = WikiCorpus(in_f)

    i = 0
    for text in wiki.get_texts():
        output.write(bytes(" ".join(text), "utf-8").decode("utf-8") + "\n")
        i = i + 1
        if i % 10000 == 0:
            print("Processed " + str(i) + " articles")
    output.close()
    print("Processing complete!")
Example #20
    def wikiToTxt(self):
        if os.path.exists(self.wiki_texts):
            return

        # This function takes about 25 minutes
        from gensim.corpora import WikiCorpus
        wiki_corpus = WikiCorpus(self.wiki_dump, dictionary={})

        texts_num = 0
        with open(self.wiki_texts, 'w', encoding='utf-8') as output:
            for text in wiki_corpus.get_texts():
                output.write(' '.join(text) + '\n')
                texts_num += 1
                if texts_num % 10000 == 0:
                    logging.info("already processed %d articles" % texts_num)
def process_wiki(in_f, out_f):
    """Convert Wikipedia xml dump file to text corpus"""

    output = open(out_f, 'w', encoding='utf-8')
    wiki = WikiCorpus(in_f)

    i = 0
    print('start')
    for text in wiki.get_texts():
        output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
        i = i + 1
        if (i % 10 == 0):
            print('Processed ' + str(i) + ' articles')
    output.close()
    print('Processing complete!')
Example #22
def wikicorpus2text(source, target):
    '''
    Convert the Wikipedia corpus into a plain text file.
    :param source: path to the raw Wikipedia dump (bz2 compressed); download from: https://dumps.wikimedia.org/zhwiki/
    :param target: path of the output file
    :return:
    '''
    wiki = WikiCorpus(source, lemmatize=False, dictionary=[])
    with open(target, 'w') as t:
        i = 1
        for text in tqdm(wiki.get_texts()):
            t.write(' '.join(text) + "\n")
            if i % 10000 == 0:
                print(f'{i} is done')
            i += 1
Example #23
def main():
	# Load wikipedia data
	print("... Load wikipedia data")
	wiki = WikiCorpus(WIKI_FILE_PATH, lemmatize=False,tokenizer_func = tokenize_and_stem)

	# Save the wikipedia data before word2vec training, in case of any errors in the training phase
	print("... Save tokenized data")
	with open(TOKENIZED_WIKI_FILE_PATH,"w",encoding="utf-8") as output_file:
		for text in wiki.get_texts():
			output_file.write(" ".join(text)+"\n")

	# Train word2vec model and save it to disk
	print("... Train word2vec model")
	model = Word2Vec(LineSentence(TOKENIZED_WIKI_FILE_PATH), size=400, window=5, min_count=5, workers=multiprocessing.cpu_count())
	model.save(WORD2VEC_MODEL_FILE_PATH)
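Once training finishes, the saved model can be reloaded and queried. A short usage sketch (the path constant is the one assumed by the snippet; the query word is arbitrary):

# Usage sketch: reload the trained model and look up similar words.
from gensim.models import Word2Vec

model = Word2Vec.load(WORD2VEC_MODEL_FILE_PATH)
print(model.wv.most_similar('computer', topn=5))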
Example #24
def make_corpus(wiki_in_file, wiki_out_file):
    """Convert Wikipedia xml dump file to text corpus"""

    path_to_wiki_dump = datapath(wiki_in_file)

    with open(wiki_out_file, 'w') as output:
        wiki = WikiCorpus(path_to_wiki_dump)  # create word->word_id mapping
        i = 0
        for text in wiki.get_texts():
            output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
            i += 1
            if i % 10000 == 0:
                print('Processed ' + str(i) + ' articles')
        output.close()
        print('Processing complete!')
Example #25
def handle_wiki_data():
    print os.getcwd()
    wiki_source_path = sys.path[0] + '/zhwiki-latest-pages-articles.xml.bz2'
    print 'wiki baike 数据路径', wiki_source_path
    # path where the parsed output will be saved
    wiki_source_path_out = sys.path[0] + '/wiki-zh-1.3g.txt'
    print 'wiki baike 数据解析结果路径', wiki_source_path_out
    wiki = WikiCorpus(wiki_source_path, lemmatize=False, dictionary={})
    file_out = open(wiki_source_path_out, 'w')
    for text in wiki.get_texts():
        str_line = ' '.join(text) + "\n"
        # convert to Simplified Chinese
        simple_line = tradition2simple(str_line.encode('utf-8'))
        file_out.write(simple_line)
    file_out.close()
Example #26
def get_wiki_text():
    outp = "../../data/wiki/wiki.zh.txt"
    inp = "../../data/wiki/zhwiki-20190720-pages-articles-multistream.xml.bz2"

    space = " "

    output = open(outp, 'w', encoding='utf-8')

    # WikiCorpus, gensim's Wikipedia processing class
    wiki = WikiCorpus(inp, lemmatize=False, dictionary=[])

    # get_texts() converts each Wikipedia article into one line of text, stripping punctuation and similar markup
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
    output.close()
Example #27
def make_corpus(in_f, out_f, num_articles):
    """Convert Wikipedia xml dump file to text corpus"""
    output = open(out_f, 'w+')
    wiki = WikiCorpus(in_f, tokenizer_func=tokenize)

    i = 0
    for text in wiki.get_texts():
        output.write((bytes(' ', 'utf-8').join(text)).decode('utf-8') + '\n')
        i += 1
        if (i % 100 == 0):
            print('Processed ' + str(i) + ' articles')
        if (i >= num_articles):
            break
    output.close()
    print('Processing complete!')
Example #28
def enwiki(srcPath, tarPath):
    index = 0
    space = " "    
    
    output = open(tarPath, 'w')
    wiki = WikiCorpus(srcPath, lemmatize=False, dictionary={})
    
    for text in wiki.get_texts():
        output.write(' '.join(text) + '\n')
        index += 1
        if (index % 10000 == 0):
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "\tSaved " + str(index) + " articles.")
            
    output.close()
    print("Finished saved " + str(index) + " articles.")
Example #29
def generate_wiki():

    i = 0
    # Use the WikiCorpus API to read text contents from the raw dump file
    wiki = WikiCorpus(path_to_yue_wiki, lemmatize=False)
    file = codecs.open('./text/yue_wiki2.txt', 'w', 'utf-8')
    # Write texts into the new file article by article
    for text in wiki.get_texts():
        str_lines = " ".join(text) + "\n"
        file.write(str_lines)
        i += 1
        if (i % 100 == 0):
            print("Save " + str(i) + " articles")
    file.close()
    print("Finished saved " + str(i) + " articles")
Example #30
def save_to_batches(input, doc_set=set(), batch_path='.', batch_size=1000, lang='@body'):
    if not doc_set: # is empty
        return
    wiki = WikiCorpus(input, lemmatize=False, dictionary='empty dictionary')
    wiki.metadata = True  # request to extract page_id and title
    
    num_docs_found = 0
    batch_dict = {}
    NNZ = 0
    batch = artm.messages_pb2.Batch()
    for (text, page_id_and_title) in wiki.get_texts():
        page_id = page_id_and_title[0]
        title = page_id_and_title[1]

        if page_id in doc_set:
            num_docs_found += 1
            print num_docs_found, page_id, title

            # get tokens tf in the text
            text_tf = Counter(text)
            for token in text:
                # update batch dictionary
                if token not in batch_dict:
                    batch.token.append(unicode(token, 'utf-8'))
                    batch_dict[token] = len(batch.token) - 1

            # add item to batch
            item = batch.item.add()
            item.id = int(page_id)
            item.title = title
            field = item.field.add()
            field.name = lang
            for token in text_tf:
                field.token_id.append(batch_dict[token])
                field.token_count.append(text_tf[token])
                NNZ += text_tf[token]
       
            if len(batch.item) == batch_size:
                artm.library.Library().SaveBatch(batch, batch_path)
                print 'Batch done, |W| = ' + str(len(batch.token)) + ", NNZ = " + str(NNZ)

                batch = artm.messages_pb2.Batch()
                batch_dict = {}
                NNZ = 0

    if len(batch.item) > 0:
        artm.library.Library().SaveBatch(batch, batch_path)
        print 'Last batch done, |W| = ' + str(len(batch.token)) + ", NNZ = " + str(NNZ)
def process_wiki_to_text(input_filename, output_text_filename, output_sentences_filename):

    if os.path.isfile(output_text_filename) and os.path.isfile(output_sentences_filename):
        logging.info('Skipping process_wiki_to_text(). Files already exist: {} {}'.format(output_text_filename,
                                                                                          output_sentences_filename))
        return

    start = time.time()
    intermediary_time = None
    sentences_count = 0

    with open(output_text_filename, 'w') as out:
        with open(output_sentences_filename, 'w') as out_sentences:

            # Open the Wiki Dump with gensim
            wiki = WikiCorpus(input_filename, lemmatize=False, dictionary={}, processes=cpu_count())
            wiki.metadata = True
            texts = wiki.get_texts()

            for i, article in enumerate(texts):
                # article[1] refers to the name of the article.
                text_list = article[0]  
                sentences = text_list
                sentences_count += len(sentences)

                # Write sentences per line
                for sentence in sentences:
                    out_sentences.write((sentence + '\n'))

                # Write each page in one line
                text = ' '.join(sentences) + '\n'
                out.write(text)

                # This is just for the logging
                if i % (100 - 1) == 0 and i != 0:
                    if intermediary_time is None:
                        intermediary_time = time.time()
                        elapsed = intermediary_time - start
                    else:
                        new_time = time.time()
                        elapsed = new_time - intermediary_time
                        intermediary_time = new_time
                    sentences_per_sec = int(len(sentences) / elapsed)
                    logging.info('Saved {0} articles containing {1} sentences ({2} sentences/sec).'.format(i + 1,
                                                                                                           sentences_count,
                                                                                                           sentences_per_sec))
        logging.info(
            'Finished process_wiki_to_text(). It took {0:.2f} s to execute.'.format(round(time.time() - start, 2)))
Example #32
def main():

    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        exit()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0

    with open("wiki_texts_en.txt",'w',encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(' '.join(text) + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章" % texts_num)
Example #33
def my_function():
    zhwiki_name = './zhwiki-latest-pages-articles.xml.bz2'
    wiki = WikiCorpus(zhwiki_name,  dictionary={})
    documents = TaggedWikiDocument(wiki)
 
    model = g.Doc2Vec(documents, dm=0, dbow_words=1, vector_size=docvec_size, window=8, min_count=19, workers=8)
    model.save('./wiki.doc2vec.model')
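TaggedWikiDocument is not defined in this snippet. A common implementation (an assumption here, modeled on the usual Doc2Vec-on-Wikipedia pattern) wraps each article in a TaggedDocument tagged with its title:

# Hypothetical TaggedWikiDocument; the original project may tag documents differently.
from gensim.models.doc2vec import TaggedDocument

class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True  # make get_texts() also yield (page_id, title)

    def __iter__(self):
        for content, (page_id, title) in self.wiki.get_texts():
            yield TaggedDocument(words=content, tags=[title])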
Example #34
class LineSentences(object):
    def __init__(self, dirname, wikipath=None, lower=True):
        self.dirname = dirname
        self.wiki = None
        if wikipath:
            self.wiki = WikiCorpus(wikipath,
                                   lemmatize=False,
                                   dictionary={},
                                   lower=lower)
            self.wiki.metadata = False

    def __iter__(self):
        if self.wiki:
            for content in self.wiki.get_texts():
                yield content
        for fname in os.listdir(self.dirname):
            _, ext = splitext(fname)
            if ".txt" in ext:
                for line in open(os.path.join(self.dirname, fname)):
                    line = line.rstrip('\n')
                    words = word_tokenize(line)
                    if words:
                        # print(words)
                        yield words
    def data_process(self):
        """
        extract txt content from xml file
        """

        space = " "
        i = 0
        output = open(self.txt_path, 'w', encoding='utf-8')
        wiki = WikiCorpus(self.origin_path, lemmatize=False, dictionary={})
        for text in wiki.get_texts():
            output.write(space.join(text) + "\n")
            i = i + 1
            if i % 10000 == 0:
                print('Saved ' + str(i) + ' articles')
        output.close()
        print('Finished Saved ' + str(i) + ' articles')
Example #36
def my_function():
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    documents = TaggedWikiDocument(wiki)

    model = g.Doc2Vec(documents, dm=0, dbow_words=1, size=docvec_size, window=8, min_count=19, iter=5, workers=8)
    model.save('data/zhiwiki_news.doc2vec')
Example #37
def main():

    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        exit()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0

    with io.open("wiki_texts.txt",'w',encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章" % texts_num)
Example #38
def dataprocess(_config):
    i = 0
    output = None
    if six.PY3:
        output = open(os.path.join(_config.data_path, _config.zhwiki_raw), 'w')
    else:
        output = codecs.open(os.path.join(_config.data_path, _config.zhwiki_raw), 'w')
    wiki = WikiCorpus(os.path.join(_config.data_path, _config.zhwiki_bz2), lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        if six.PY3:
            output.write(b' '.join(text).decode('utf-8', 'ignore') + '\n')
        else:
            output.write(' '.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            print('Saved ' + str(i) + ' articles')
    output.close()
    print('Finished Saved ' + str(i) + ' articles')
def process_wiki(inp, outp):
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    i = 0

    output = open(outp, 'w', encoding='utf-8')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(b' '.join(text).decode('utf-8') + '\n')
        i = i + 1
        if i % 10000 == 0:
            logger.info('Saved ' + str(i) + ' articles')

    output.close()
    logger.info('Finished ' + str(i) + ' articles')
Example #40
def my_function():
    space = ' '
    i = 0
    l = []
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    f = open('./data/reduce_zhiwiki.txt', 'w')
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        for temp_sentence in text:
            temp_sentence = Converter('zh-hans').convert(temp_sentence)
            seg_list = list(jieba.cut(temp_sentence))
            for temp_term in seg_list:
                l.append(temp_term)
        f.write(space.join(l) + '\n')
        l = []
        i = i + 1

        if (i %200 == 0):
            print('Saved ' + str(i) + ' articles')
    f.close()
Example #41
def process_wiki(inp,outp):
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    space = " "
    i = 0
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
def make_wiki_corpus(inp, outp, logger):
    '''
    Preprocess a Wikipedia dump.
    :param inp: path to the dump file, e.g. enwiki-20150304-pages-articles.xml.bz2
    :param outp: output text file with the preprocessed corpus, e.g. wiki.en.text
    :param logger: logger used to report preprocessing progress
    '''
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})

    i = 0
    space = " "
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i += 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
Example #43
def zhwiki2chars(in_file, out_file):
    reg = re.compile(r'^[a-zA-Z]+$')

    def _isalpha(string):
        return reg.match(string) is not None

    i = 0
    out = open(out_file, 'w')
    wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})
    for article in wiki.get_texts():
        tokens = []
        for token in article:
            token = token.decode("utf-8").strip()
            if _isalpha(token):
                continue
            tokens.append(" ".join(token))  # divided by character
        out.write(" ".join(tokens) + "\n")
        i += 1
        if i % 10000 == 0:
            print("process %d articles" % i)
    out.close()
Example #44
def main():
    gensim.corpora.wikicorpus.tokenize = replacement_tokenize

    infn, outfn = sys.argv[1:3]
    wiki = WikiCorpus(infn, lemmatize=False, dictionary={})
    with open(outfn, 'w') as outfile:
        for i, article in enumerate(wiki.get_texts()):
            article = [entry.decode("utf-8") for entry in article]
            text = " ".join(article)
            mostly_sentences = nltk.sent_tokenize(text)

            sentences = []
            for sent in mostly_sentences:
                for line in sent.splitlines():
                    sentences.append(line.strip())

            for sentence in sentences:
                sentence = cleanup(sentence)
                if sentence:
                    print(sentence, file=outfile)
            if (i % 10000 == 0):
                print("Saved ", i, "articles")
Example #45
def convert(input_path, output_path):
    logger.info("Converting Wiki Corpus...")
    corpus_path = check_path(input_path)
    wiki_text_output_path = output_path

    start_time = time.time()

    space = " "
    i = 0

    wiki = WikiCorpus(corpus_path, lemmatize=False, dictionary={})

    output = open(wiki_text_output_path, 'w')

    # Convert WikiCorpus into Text output (1 article per line)
    for text in wiki.get_texts():
        output.write(space.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles. Time needed: " + str(time.time() - start_time))
Example #46
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
 
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
 
    # check and process input arguments
    if len(sys.argv) < 3:
        print "Usage: extractwiki.py infile_name outfile_name"
        sys.exit(1)
        
    infilename, outfilename = sys.argv[1:3]
 
    if os.path.isfile(outfilename):
        logger.error("Output file %s exists. Change the file name and try again." %outfilename)
        sys.exit(1)
        
    i = 0
    output = open(outfilename, 'w')
    wiki = WikiCorpus(infilename, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write( " ".join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")
 
    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
    
Example #47
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        mywiki = myWikiCorpus(inp, lemmatize=lemmatize)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        MmCorpus.serialize(outp + '_bowm.mm', mywiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
Example #48
    program = os.path.basename(sys.argv[0])  # get the script name
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)

    inp, outp = sys.argv[1:3]
    space = " "
    i = 0

    output = open(outp, 'w',encoding='utf-8')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary=[])  # WikiCorpus, gensim's Wikipedia processing class
    for text in wiki.get_texts():  # get_texts() converts each article into one line of text, with punctuation and similar markup removed
        output.write(space.join(text) + "\n")
        i = i+1
        if (i % 10000 == 0):
            logger.info("Saved "+str(i)+" articles.")

    output.close()
    logger.info("Finished Saved "+str(i)+" articles.")
    



Example #49
    if len(sys.argv) < 3:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt')
        wiki.save(outp + '_corpus.pkl')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
Example #50
if __name__ == '__main__':

    # set up logging
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running: %s" % ' '.join(sys.argv))

    # check and process input arguments
    args = parse_args(sys.argv[1:])

    if not 'input' in args:
        logger.error("No input given!")
        sys.exit(1)

    # get args
    inp, outp, limit = args['input'], args['output'], args['limit']

    # prepare corpus
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    texts = slice(wiki.get_texts(), limit)

    # save this for efficiency
    space = " "
    output = open(outp, 'w')
    iterate_with_logging(logger, 10000, texts,
                 lambda text: output.write(space.join(text) + "\n"))

    output.close()
Example #51
        gamma, _ = self.inference([bow])
        theta = numpy.exp(dirichlet_expectation(gamma[0]))
        topicDist = theta / theta.sum() # normalize to proper distribution
        return [(topicId, topicValue) for topicId, topicValue in enumerate(topicDist)
                if topicValue >= eps] # ignore document's topics that have prob < eps
#endclass LdaModel



if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logger.setLevel(level = logging.DEBUG)
    logger.info("running %s" % ' '.join(sys.argv))

    import os.path
    program = os.path.basename(sys.argv[0])
    from gensim.corpora import WikiCorpus, MmCorpus, LowCorpus
    numpy.random.seed(100000001)

    vocab = WikiCorpus.loadDictionary('/Users/kofola/gensim/results/wiki10_en_wordids.txt')
    corpus = MmCorpus('/Users/kofola/gensim/results/wiki10_en_bow.mm')
    K = 50

    olda = LdaModel(numTopics=K, id2word=vocab, alpha=1./K, eta=1./K, decay=0.5)
    olda.update(corpus)
    olda.save('olda2.pkl')

    logging.info("finished running %s" % program)

Example #52
File: gen.py  Project: chiraggiri/NLQA
import logging
import os.path
import sys

from gensim.corpora import WikiCorpus
from gensim.models import TfidfModel, Word2Vec

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])

    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    if len(sys.argv) < 3:
        print globals()['__doc__'] % locals()
        sys.exit(1)

    inp, outp = sys.argv[1:3]

    wiki = WikiCorpus(inp, dictionary={})
    model = Word2Vec(size=300, window=5, min_count=5, workers=8)
    sentences = wiki.get_texts()
    model.build_vocab(sentences)
    sentences = wiki.get_texts()
    model.train(sentences)
    model.save(outp)
    model.init_sims(replace=True)
    model.save('trimmed-model')
Example #53
#! /usr/bin/env python
# -*- coding: utf-8 -*-



from gensim.corpora import WikiCorpus
from gensim.models.word2vec import Word2Vec
import logging, os
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
corpus = WikiCorpus('../fawiki-latest-pages-articles.xml.bz2',dictionary=False)

max_sentence = -1

def generate_lines():
    for index, text in enumerate(corpus.get_texts()):
        if index < max_sentence or max_sentence==-1:
            yield text
        else:
            break

# Check whether a trained model already exists
model = Word2Vec() 		
if ((os.path.exists('../model_farsi')) and (os.path.isfile('../model_farsi'))):
	model = Word2Vec.load('../model_farsi')
	result_1 = model.most_similar('روز')
	result_2 = model.most_similar(positive=['زن', 'پادشاه'], negative=['مرد'], topn=10)
	
	print "result is:"
	for (re,v) in result_1:
		print re + ' '+ str(v)
	print "======================="
Example #54
    if len(sys.argv) < 3:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2', use_bzip2=True)
        wiki.save(outp + '_corpus.pkl.bz2', use_bzip2=True)
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2', use_bzip2=True)
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
Example #55
	# SETTINGS

	# model parameters and output
	lsa_dim = 100
	w2v_dim = 50
	f_bow = "{0}.bow".format(prefix)
	f_tfidf = "{0}_voc{1}.tfidf".format(prefix, voc_size)
	f_lsa = "{0}_voc{1}_dim{2}.lsa".format(prefix, voc_size, lsa_dim)
	f_dict = "{0}_voc{1}.dict".format(prefix, voc_size)
	f_w2v = "{0}_voc{1}_dim{2}_win5.bin".format(prefix, voc_size, w2v_dim)

	# CORPUS PREPROCESSING

	if wiki: # models will be trained on the Dutch Wikipedia corpus
		if os.path.exists(f_bow):
			corpus = WikiCorpus.load(f_bow)
		else:
			# download wikipedia training corpus (2015/10/14 18:45, 132MB)
			if not os.path.exists(f_corpus):
				wiki_lang, wiki_size, wiki_url = wikis[lang]
				if raw_input("About to download {0} Wikipedia corpus ({1}). Do you want to proceed? (y/n) ".format(wiki_lang, wiki_size)).startswith("y"):
					util.download_file(wiki_url, f_corpus, progress=True)
				else:
					sys.exit()
			corpus = WikiCorpus(f_corpus)
#			corpus.save(f_bow)
	else: # models will be trained on your own corpus
		if os.path.exists(f_bow):
			corpus = TextCorpus.load(f_bow)
		else:
			corpus = TextCorpus(f_corpus)
logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO
)

parser = argparse.ArgumentParser()
parser.add_argument("-a", "--articles", help="path to enwiki-latest-pages-articles.xml.bz2")
parser.add_argument("-m", "--model", help="path to model dir")
parser.add_argument("-d", "--demo", help="path to question-words.txt analogies")
parser.add_argument("-l", "--lines", help="path to wiki-lines.txt")
args = parser.parse_args()

# Load or create wiki-lines.txt
if not (os.path.isfile(args.lines)):
    wiki_corpus = WikiCorpus(args.articles, lemmatize=False)
    wiki_lines = wiki_corpus.get_texts()

    # Write wiki_lines out for future use
    lines_file = open(args.lines, 'w')
    for text in wiki_lines:
        lines_file.write(" ".join(text) + "\n")
    lines_file.close()
else:
    wiki_lines = open(args.lines)

model = Word2Vec(
        sentences=LineSentence(wiki_lines),
        size=400,
        hs=1,
        window=5,
Example #57
#!/usr/bin/python

from gensim.corpora import WikiCorpus
from gensim.models.word2vec import Word2Vec

corpus = WikiCorpus('dewiki-latest-pages-articles.xml.bz2', dictionary=False, lemmatize=False)

model = Word2Vec(size=300, window=7, min_count=7, workers=4, negative=10, hs=0)
model.build_vocab(corpus.get_texts())
model.train(corpus.get_texts())
model.init_sims(replace=True)
model.save('dewiki.w2v')
    parser.add_argument('--online', default=False, action='store_true')
    parser.add_argument('--debug', default=False, action='store_true')
    parser.add_argument('--keep-words', default=DEFAULT_DICT_SIZE, type=int, help='number of words to keep')
    args = parser.parse_args()

    logger = logging.getLogger('gensim.scripts.read_stream_items')

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %r" % args.__dict__)

    if args.online:
        dictionary = HashDictionary(id_range=args.keep_words, debug=args.debug)
        dictionary.allow_update = True # start collecting document frequencies
        ## cannot use --max-articles or --expect-streamitems
        wiki = WikiCorpus(args.input, lemmatize=args.lemmatize, dictionary=dictionary)
        MmCorpus.serialize(args.output + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(args.output + '_wordids.txt.bz2')
        wiki.save(args.output + '_corpus.pkl.bz2')
        dictionary.allow_update = False

    else:  ## not online
        # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        wiki = WikiCorpus(
            args.input, lemmatize=args.lemmatize, 
            max_articles=args.max_articles,
            expect_streamitems=args.expect_streamitems,                          
            file_name_pattern=args.file_name_pattern,
        )