def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def extract_wiki(thresh, env_path, vec_file):
    program = os.path.basename(env_path[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0
    print('--- load ck12 word2vec')
    model = gensim.models.Word2Vec.load_word2vec_format(vec_file, binary=False)
    print('--- filtering keywords based on sim to ck12 keyword science')
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        # use the first 20 non-stopword tokens as a rough topic signature
        topic = [w for w in text[:20] if w not in stopwords.words('english')]
        # mean dot-product similarity between the topic words and 'science'
        sim = np.mean([model[w].dot(model['science']) if w in model else 0 for w in topic])
        # sim = model['science'].dot(topic_vec)
        if sim > thresh:
            output.write(space.join(text) + "\n")
            i = i + 1
            if i % 100 == 0:
                logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished; saved " + str(i) + " articles")
def process_enwiki(input_file, output_file):
    space = ' '
    i = 0
    output = open(output_file, 'w')
    wiki = WikiCorpus(input_file, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(space.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            logger.info('Saved ' + str(i) + ' articles')
    output.close()
def parse_wiki(filename):
    # file() is Python 2 only; use open() instead
    fout = open('../../paper/data/wiki/wiki_corpus', 'w')
    wiki = WikiCorpus(filename, lemmatize=False, dictionary={}, processes=5)
    count = 0
    for text in wiki.get_texts():
        fout.write('%s\n' % ' '.join(text))
        if count % 10000 == 0:
            logging.info(count)
        count += 1
    fout.close()
    logging.info('Finish %d' % count)
def parse(filename):
    OUTPATH = '../gen_data/wikicorpus'
    fout = open(OUTPATH, 'w')
    wiki = WikiCorpus(filename, lemmatize=False, dictionary={}, processes=5)
    count = 0
    for text in wiki.get_texts():
        fout.write(" ".join(text) + "\n")
        count = count + 1
        if count % 10000 == 0:
            logging.info("Saved " + str(count) + " articles")
    fout.close()
    logging.info("Finished saving " + str(count) + " articles")
def process_wiki(infile, outfile):
    from gensim.corpora import WikiCorpus
    wiki = WikiCorpus(infile, lemmatize=False, dictionary={})
    i = 0
    with open(outfile, 'w') as fw:
        for text in wiki.get_texts():
            text = ' '.join(text)
            cut_text = cut(text)
            fw.write(re.sub(r' {1,}', ' ', ' '.join(cut_text)) + '\n')
            i += 1
            if i % 1000 == 0:
                logger.info('Saved ' + str(i) + ' texts')
    logger.info('Finished ' + str(i) + ' texts')
def enwiki(srcPath, tarPath):
    index = 0
    space = " "
    output = open(tarPath, 'w')
    wiki = WikiCorpus(srcPath, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(' '.join(text) + '\n')
        index += 1
        if index % 10000 == 0:
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                  + "\tSaved " + str(index) + " articles.")
    output.close()
    print("Finished saving " + str(index) + " articles.")
def save_to_batches(input, doc_set=set(), batch_path='.', batch_size=1000, lang='@body'):
    # Note: this snippet targets Python 2 (print statements, unicode()) and the
    # legacy low-level BigARTM protobuf API (artm.messages_pb2, artm.library).
    if not doc_set:  # nothing requested, nothing to do
        return
    wiki = WikiCorpus(input, lemmatize=False, dictionary='empty dictionary')
    wiki.metadata = True  # request page_id and title alongside each text
    num_docs_found = 0
    batch_dict = {}
    NNZ = 0
    batch = artm.messages_pb2.Batch()
    for (text, page_id_and_title) in wiki.get_texts():
        page_id = page_id_and_title[0]
        title = page_id_and_title[1]
        if page_id in doc_set:
            num_docs_found += 1
            print num_docs_found, page_id, title
            # token term frequencies for this text
            text_tf = Counter(text)
            for token in text:
                # update batch dictionary
                if token not in batch_dict:
                    batch.token.append(unicode(token, 'utf-8'))
                    batch_dict[token] = len(batch.token) - 1
            # add item to batch
            item = batch.item.add()
            item.id = int(page_id)
            item.title = title
            field = item.field.add()
            field.name = lang
            for token in text_tf:
                field.token_id.append(batch_dict[token])
                field.token_count.append(text_tf[token])
                NNZ += text_tf[token]
            if len(batch.item) == batch_size:
                artm.library.Library().SaveBatch(batch, batch_path)
                print 'Batch done, |W| = ' + str(len(batch.token)) + ", NNZ = " + str(NNZ)
                batch = artm.messages_pb2.Batch()
                batch_dict = {}
                NNZ = 0
    if len(batch.item) > 0:
        artm.library.Library().SaveBatch(batch, batch_path)
        print 'Last batch done, |W| = ' + str(len(batch.token)) + ", NNZ = " + str(NNZ)
def main():
    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        exit()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0
    with io.open("wiki_texts.txt", 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
def dataprocess(_config):
    i = 0
    output = None
    if six.PY3:
        output = open(os.path.join(_config.data_path, _config.zhwiki_raw), 'w')
    else:
        output = codecs.open(os.path.join(_config.data_path, _config.zhwiki_raw), 'w')
    wiki = WikiCorpus(os.path.join(_config.data_path, _config.zhwiki_bz2),
                      lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        if six.PY3:
            output.write(b' '.join(text).decode('utf-8', 'ignore') + '\n')
        else:
            output.write(' '.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            print('Saved ' + str(i) + ' articles')
    output.close()
    print('Finished; saved ' + str(i) + ' articles')
def process_wiki(inp, outp):
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    i = 0
    output = open(outp, 'w', encoding='utf-8')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(b' '.join(text).decode('utf-8') + '\n')
        i = i + 1
        if i % 10000 == 0:
            logger.info('Saved ' + str(i) + ' articles')
    output.close()
    logger.info('Finished ' + str(i) + ' articles')
def my_function():
    space = ' '
    i = 0
    l = []
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    f = open('./data/reduce_zhiwiki.txt', 'w')
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        for temp_sentence in text:
            temp_sentence = Converter('zh-hans').convert(temp_sentence)
            seg_list = list(jieba.cut(temp_sentence))
            for temp_term in seg_list:
                l.append(temp_term)
        f.write(space.join(l) + '\n')
        l = []
        i = i + 1
        if i % 200 == 0:
            print('Saved ' + str(i) + ' articles')
    f.close()
def process_wiki(inp, outp):
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    space = " "
    i = 0
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i = i + 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished saving " + str(i) + " articles")
def zhwiki2chars(in_file, out_file):
    reg = re.compile(r'^[a-zA-Z]+$')

    def _isalpha(string):
        return reg.match(string) is not None

    i = 0
    out = open(out_file, 'w')
    wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})
    for article in wiki.get_texts():
        tokens = []
        for token in article:
            token = token.decode("utf-8").strip()
            if _isalpha(token):
                continue
            tokens.append(" ".join(token))  # split each token into single characters
        out.write(" ".join(tokens) + "\n")
        i += 1
        if i % 10000 == 0:
            print("processed %d articles" % i)
    out.close()
def make_wiki_corpus(inp, outp, logger):
    '''
    Preprocess a Wikipedia dump.
    :param inp: path to the dump file, e.g. enwiki-20150304-pages-articles.xml.bz2
    :param outp: output text file with the preprocessed corpus,
                 e.g. wiki.en.text
    :param logger: logger used to report preprocessing progress
    '''
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    i = 0
    space = " "
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i += 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished saving " + str(i) + " articles")
def main():
    # swap in a custom tokenizer before the corpus is parsed
    gensim.corpora.wikicorpus.tokenize = replacement_tokenize
    infn, outfn = sys.argv[1:3]
    wiki = WikiCorpus(infn, lemmatize=False, dictionary={})
    with open(outfn, 'w') as outfile:
        for i, article in enumerate(wiki.get_texts()):
            article = [entry.decode("utf-8") for entry in article]
            text = " ".join(article)
            mostly_sentences = nltk.sent_tokenize(text)
            sentences = []
            for sent in mostly_sentences:
                for line in sent.splitlines():
                    sentences.append(line.strip())
            for sentence in sentences:
                sentence = cleanup(sentence)
                if sentence:
                    print(sentence, file=outfile)
            if i % 10000 == 0:
                print("Saved ", i, "articles")
def preprocess():
    space = ' '
    i = 0
    l = []
    zhwiki_name = './data/***.xml.bz2'
    f = open('./data/***.txt', 'w')
    # training corpus from the xml dump
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        for temp_sentence in text:
            # convert traditional to simplified Chinese
            temp_sentence = Converter('zh-hans').convert(temp_sentence)
            seg_list = list(jieba.cut(temp_sentence))
            for term in seg_list:
                l.append(term)
        f.write(space.join(l) + '\n')
        i = i + 1
        l = []
        if i % 200 == 0:
            print("Saved " + str(i) + " articles")
    f.close()
def convert(input_path, output_path):
    logger.info("Converting Wiki Corpus...")
    corpus_path = check_path(input_path)
    wiki_text_output_path = output_path
    start_time = time.time()
    space = " "
    i = 0
    wiki = WikiCorpus(corpus_path, lemmatize=False, dictionary={})
    output = open(wiki_text_output_path, 'w')
    # Convert WikiCorpus into text output (1 article per line)
    for text in wiki.get_texts():
        output.write(space.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished saving " + str(i) + " articles. Time needed: "
                + str(time.time() - start_time))
def preprocess():
    """
    Use gensim's WikiCorpus to extract the Chinese Wikipedia corpus and convert
    traditional Chinese to simplified Chinese, then segment the converted text
    with jieba and write it to a txt file: one article's segmentation result
    per line, with tokens separated by spaces.
    :return:
    """
    count = 0
    zhwiki_path = './data/zhwiki-latest-pages-articles.xml.bz2'
    f = open('./data/reduced_zhwiki.txt', 'w', encoding='utf8')
    wiki = WikiCorpus(zhwiki_path, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        word_list = []
        for sentence in text:
            sentence = Converter('zh-hans').convert(sentence)  # traditional to simplified
            seg_list = jieba.cut(sentence)
            for seg in seg_list:
                word_list.append(seg)
        f.write(' '.join(word_list) + '\n')
        count += 1
        if count % 200 == 0:
            print("Saved " + str(count) + ' articles')
    f.close()
def main(args):
    """
    args: argparse.Namespace object

    Returns: None
    """
    logger = logging.getLogger(__name__)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    # load wiki corpus from a .xml.bz2 file
    wiki = WikiCorpus(args.infile, lemmatize=False, processes=multiprocessing.cpu_count())

    # parse documents from the corpus and write to the output file
    cnt = 0
    with open(args.outfile, 'w', encoding='utf-8') as fout:
        for i, text in enumerate(wiki.get_texts()):
            fout.write(' '.join(text) + '\n')
            if (i + 1) % 10000 == 0:
                logger.info('Processed %d documents' % (i + 1))
            cnt = i + 1
    logger.info('Finished processing %d documents' % cnt)
def process_wiki():
    import logging
    import os.path
    import sys
    from gensim.corpora import WikiCorpus

    print(__name__)
    print("running %s" % ' '.join(sys.argv))
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 3:
        print((globals()['__doc__'] % locals()))
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = b' '
    i = 0
    output = open(outp, 'w', encoding='utf-8')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        line = space.join(text)
        output.write(line.decode('utf-8') + "\n")
        i += 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished saving " + str(i) + " articles")
def set_wiki_to_txt(self, wiki_data_path=None):
    for s in sys.argv:
        print(s)
    if wiki_data_path is None:
        # fall back to the command-line parameter
        if len(sys.argv) != 2:
            print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
            exit()
        else:
            wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    else:
        wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})
    # convert wiki.xml to wiki.txt
    with open(r'.\word2vec_data\wiki_text.txt', 'w', encoding='utf-8') as output:
        text_count = 0
        for text in wiki_corpus.get_texts():
            # gensim returns each article as a list of token strings
            output.write(' '.join(text) + '\n')
            text_count += 1
            if text_count % 10000 == 0:
                logging.info("Processed %d articles so far" % text_count)
        print("Conversion finished!")
def my_function():
    space = ' '
    i = 0
    l = []
    zhwiki_name = './study_ml/data/text_vector/zhwiki-latest-pages-articles.xml.bz2'
    f = open('./study_ml/data/text_vector/reduce_zhiwiki.txt', 'w')
    # convert the xml wiki dump into text format
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        for temp_sentence in text:
            # convert traditional characters to simplified
            temp_sentence = langconv.Converter('zh-hans').convert(temp_sentence)
            # word segmentation
            seg_list = list(jieba.cut(temp_sentence))
            for temp_term in seg_list:
                l.append(temp_term)
        f.write(space.join(l) + '\n')
        l = []
        i = i + 1
        if i % 200 == 0:
            print('Saved ' + str(i) + ' articles')
    f.close()
if __name__ == '__main__':
    # set up logging
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running: %s" % ' '.join(sys.argv))

    # check and process input arguments
    args = parse_args(sys.argv[1:])
    if 'input' not in args:
        logger.error("No input given!")
        sys.exit(1)

    # get args
    inp, outp, limit = args['input'], args['output'], args['limit']

    # prepare corpus; islice stops the stream after `limit` articles
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    texts = itertools.islice(wiki.get_texts(), limit)

    # save this for efficiency
    space = " "
    output = open(outp, 'w')
    iterate_with_logging(logger, 10000, texts,
                         lambda text: output.write(space.join(text) + "\n"))
    output.close()
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Specify data paths
data_path = '/data/khgkim/compling/dump'
token_path = '/data/khgkim/compling/word2vec_tokens.txt'
analogy_path = '/data/khgkim/compling/questions-words.txt'
os.chdir(data_path)

if not os.path.isfile(token_path):
    # Extract and tokenize Wikipedia articles
    wiki_corpus = WikiCorpus('wiki_dump.xml.bz2')
    wiki_lines = wiki_corpus.get_texts()
    # Write wiki_lines out for future use
    lines_output = open(token_path, 'w')
    for text in wiki_lines:
        lines_output.write((" ".join(text) + "\n").encode('utf-8'))
    lines_output.close()
else:
    print 'Output message: word2vec_tokens.txt already exists!'
    exit()

model = Word2Vec(sentences=LineSentence(wiki_lines), size=400, negative=5, hs=0, sample=1e-5,
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec

wiki_corpus = WikiCorpus("dewiki-latest-pages-articles.xml.bz2", dictionary={None: None})

# each call to get_texts() returns a fresh generator, so the corpus can be
# streamed once for build_vocab and again for train
normal_window_model = Word2Vec(window=5)
normal_window_model.build_vocab(wiki_corpus.get_texts())
normal_window_model.train(wiki_corpus.get_texts(),
                          total_examples=normal_window_model.corpus_count,
                          epochs=normal_window_model.epochs)
normal_window_model.save("normal_window_model")

small_window_model = Word2Vec(window=2)
small_window_model.build_vocab(wiki_corpus.get_texts())
small_window_model.train(wiki_corpus.get_texts(),
                         total_examples=small_window_model.corpus_count,
                         epochs=small_window_model.epochs)
small_window_model.save("small_window_model")
# -*- coding: utf-8 -*-
"""
Example and exercise source code from the book
"Curso de Programación Python"
(C) Ediciones Anaya Multimedia 2019

Authors: Arturo Montejo Ráez and Salud María Jiménez Zafra
"""
from gensim.corpora import WikiCorpus
from gensim.models.word2vec import Word2Vec
from gensim.utils import deaccent

# Read the downloaded Wikipedia dump
corpus = WikiCorpus('eswiki-latest-pages-articles.xml.bz2', dictionary=False)

# Remove accents (get_texts yields token lists, so deaccent each token)
texts = [[deaccent(w) for w in t] for t in corpus.get_texts()]

# Define the algorithm to use and its hyperparameters
model = Word2Vec(size=400, window=5, min_count=5)

# Build the vocabulary
model.build_vocab(texts)

# Train the model
model.train(texts, total_examples=model.corpus_count, epochs=model.epochs)

# Save it to disk for later use
model.save('eswikipedia_w2v_model')
# _*_ coding: utf-8 _*_
from gensim.corpora import WikiCorpus
import jieba
from langconv import *
import codecs
from tqdm import tqdm, trange
import time, datetime

start = datetime.datetime.now()
zhwiki = '/NLP/data/zhwiki-latest-pages-articles.xml.bz2'
strs = []
i = 0
f = codecs.open('./zhiwiki.txt', 'a', 'utf-8')
wiki = WikiCorpus(zhwiki, lemmatize=False, dictionary={})
for text in tqdm(wiki.get_texts()):
    for sen in text:
        sen = Converter('zh-hans').convert(sen)
        sen_list = list(jieba.cut(sen))
        for s in sen_list:
            strs.append(str(s))
    tmp = ' '.join(strs)
    f.write(tmp + '\n')
    strs = []
    i = i + 1
    if i % 200 == 0:
        print('Saved ' + str(i) + ' articles')
f.close()

end = datetime.datetime.now()
print((end - start).seconds)
reload(sys)
sys.setdefaultencoding('utf-8')

program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

# check and process input arguments
if len(sys.argv) != 3:
    print("Usage: python wiki_process.py zhwiki.xxx.xml.bz2 wiki.en.text")
    sys.exit(1)
inp, outp = sys.argv[1:3]
space = " "
i = 0

output = open(outp, 'w')
wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
for text in wiki.get_texts():
    if six.PY3:
        output.write(b' '.join(text).decode('utf-8') + '\n')
    else:
        output.write(space.join(text) + "\n")
    i = i + 1
    if i % 10000 == 0:
        logger.info("Saved " + str(i) + " articles")
output.close()
logger.info("Finished saving " + str(i) + " articles")
program = os.path.basename(sys.argv[0])  # get the script name
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
inp, outp = sys.argv[1:3]
space = " "
i = 0

output = open(outp, 'w', encoding='utf-8')
# WikiCorpus is gensim's Wikipedia processing class
wiki = WikiCorpus(inp, lemmatize=False, dictionary=[])
# get_texts() converts each Wikipedia article into one line of text,
# stripping punctuation and other markup
for text in wiki.get_texts():
    output.write(space.join(text) + "\n")
    i = i + 1
    if i % 10000 == 0:
        logger.info("Saved " + str(i) + " articles.")
output.close()
logger.info("Finished saving " + str(i) + " articles.")
from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals())
        print(locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():  # fetch one article at a time
        output.write(space.join(text) + "\n")
        i = i + 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished saving " + str(i) + " articles")
# Filter out words that occur too rarely or too frequently
data.dictionary.filter_extremes(no_below=10, no_above=0.15)
words_count = len(data.dictionary)
print(words_count)

word2id = dict()
for elem in data.dictionary:
    word2id.update({data.dictionary[elem]: elem})

id2word = dict()
for elem in data.dictionary:
    id2word.update({elem: data.dictionary[elem]})


# In[4]:

sentences = list(data.get_texts())


# ### 2. LSA (Latent semantic analysis)

# This solution uses the full document as the context of a word. So, we have some vocabulary $W$ and a set of documents $D$. A matrix $X$ of shape $|W| \times |D|$ stores, at position $(w, d)$, the importance of word $w$ for document $d$. If word $w$ does not occur in document $d$, then $X$ has 0 at that position (obviously, the matrix is sparse).
#
# For any matrix you can find the [SVD decomposition](https://en.wikipedia.org/wiki/Singular_value_decomposition)
# $$X = U \Sigma V^{T} \text{, where }$$
# * $U$ – orthogonal matrix $|W| \times |W|$ of left singular vectors
# * $\Sigma$ – diagonal matrix $|W| \times |D|$ of singular values
# * $V$ – orthogonal matrix $|D| \times |D|$ of right singular vectors
#
# Let's suppose that row $w$ of the matrix $U\Sigma$ is a vector that represents word $w$, and that row $d$ of $V$ corresponds to document $d$. In some sense we have already found embeddings of words and documents at the same time. But the size of the vectors is determined by the number of documents $|D|$.
#
# Nevertheless, you can use truncated SVD instead:
# $$ X \approx X_k = U_k \Sigma_k V^{T}_k \text{, where }$$
# * $U_k$ – $k$ left singular vectors
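# Below is a minimal, self-contained sketch of the truncated-SVD embedding
# described above, using scipy's sparse `svds`. The toy documents, the raw
# count weighting (rather than tf-idf), and the choice k=2 are illustrative
# assumptions, not part of this notebook's pipeline.

import numpy as np
from scipy.sparse import lil_matrix
from scipy.sparse.linalg import svds

toy_docs = [["cat", "sat", "mat"], ["dog", "sat", "log"], ["cat", "dog"]]
toy_vocab = sorted({w for doc in toy_docs for w in doc})
toy_word2id = {w: i for i, w in enumerate(toy_vocab)}

# word-document count matrix X of shape |W| x |D|
X = lil_matrix((len(toy_vocab), len(toy_docs)))
for d, doc in enumerate(toy_docs):
    for w in doc:
        X[toy_word2id[w], d] += 1

k = 2  # number of singular triplets to keep, k << min(|W|, |D|)
U, S, Vt = svds(X.tocsc(), k=k)

word_vectors = U * S       # rows of U * Sigma represent words
doc_vectors = Vt.T         # rows of V represent documents
print(word_vectors.shape)  # (|W|, k)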
# encoding: utf-8
from gensim.corpora import WikiCorpus
import codecs
import os

path_for_save_resault = '/home/ubuntu/Documents/hw_background_gene/'

wiki_jpn = WikiCorpus(
    '/home/ubuntu/Documents/hw_background_gene/jawiki-latest-pages-articles.xml.bz2'
)

with codecs.open(os.path.join(path_for_save_resault, "wiki_jpn.txt"), "w", 'utf-8') as output:
    for i in wiki_jpn.get_texts():
        output.write('\n'.join(i).decode('utf-8'))
def generate_wiki_corpus(self):
    wiki_corpus = WikiCorpus(self.wikidump_filename, dictionary={})
    with open(self.output_text_filename, 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(' '.join(text) + '\n')
corpus = WikiCorpus('../fawiki-latest-pages-articles.xml.bz2', dictionary=False)
max_sentence = -1


def generate_lines():
    for index, text in enumerate(corpus.get_texts()):
        if index < max_sentence or max_sentence == -1:
            yield text
        else:
            break


# Load the model if it already exists, otherwise train a new one
model = Word2Vec()
if os.path.exists('../model_farsi') and os.path.isfile('../model_farsi'):
    model = Word2Vec.load('../model_farsi')
    result_1 = model.most_similar('روز')
    result_2 = model.most_similar(positive=['زن', 'پادشاه'], negative=['مرد'], topn=10)
    print("result is:")
    for (re, v) in result_1:
        print(re + ' ' + str(v))
    print("=======================")
    for (re, v) in result_2:
        print(re + ' ' + str(v))
else:
    model.build_vocab(corpus.get_texts())
    model.train(generate_lines(), chunksize=500)
    model.save('../model_farsi')
""" # pip intall gensim from gensim.corpora import WikiCorpus import time start_time = time.time() # Creates an Empty file to dump data. target = open('Wiki_Data.txt', 'w') wiki_data = WikiCorpus('enwiki-latest-pages-articles15.xml-p7744803p9244803.bz2') i = 0 for text in wiki_data.get_texts(): target.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n') i = i + 1 if (i % 10000 == 0): print('Extracted ' + str(i) + ' articles') target.close() print(" Data Extraction Completed in %d seconds!" %(time.time() - start_time)) """ Extracted 10000 articles Extracted 20000 articles Extracted 30000 articles Extracted 40000 articles Extracted 50000 articles Extracted 60000 articles
#!/usr/bin/python
from gensim.corpora import WikiCorpus
from gensim.models.word2vec import Word2Vec

corpus = WikiCorpus('dewiki-latest-pages-articles.xml.bz2', dictionary=False, lemmatize=False)

model = Word2Vec(size=300, window=7, min_count=7, workers=4, negative=10, hs=0)
model.build_vocab(corpus.get_texts())
model.train(corpus.get_texts())
model.init_sims(replace=True)
model.save('dewiki.w2v')
Config = ConfigParser.ConfigParser()

# check and process input arguments
if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
inp, outp = sys.argv[1:3]
if not os.path.isdir(os.path.dirname(outp)):
    raise SystemExit("Error: The output directory does not exist. Create "
                     "the directory and try again.")

# create the dictionary containing document frequencies for each token
wiki = WikiCorpus(inp, lemmatize=False, dictionary=Dictionary())
wiki.dictionary = Dictionary(wiki.get_texts(), prune_at=None)
wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')

# create the configuration file with default values
config_file = open(outp + '_wikifinder.cfg', 'w')
Config.add_section('general')
Config.set('general', 'articlecount', wiki.length)
Config.set('general', 'wordids_path', outp + '_wordids.txt.bz2')
Config.set('general', 'bing_api_key', 'none')
Config.add_section('citation-needed')
Config.set('citation-needed', 'Citation needed', 'true')
Config.set('citation-needed', 'Cn', 'true')
Config.set('citation-needed', 'Fact', 'true')
Config.set('citation-needed', 'Cb', 'true')
Config.set('citation-needed', 'Ctn', 'true')
Config.set('citation-needed', 'Ref?', 'true')
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

wiki = WikiCorpus(inp, lemmatize=True)
wiki.metadata = True  # ensure the doc id is captured

# only keep the most frequent words
wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)

# Save the document-id-to-title mapping as a dictionary -- this will take a
# long time. It may also be unnecessary if metadata works correctly.
docmap = {}
for index, doc in enumerate(wiki.get_texts()):
    docmap[index] = doc[1][1]  # (text, (page_id, title)) -> title
with bz2.BZ2File('doc_index.pickle.bz2', 'w') as f:
    pickle.dump(docmap, f)

# save dictionary and bag-of-words (term-document frequency matrix)
MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000, metadata=True)
wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')

# load back the id->word mapping directly from file;
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
wiki.save(outp + '_corpus.pkl.bz2')
def make_wiki(wiki_dump_path, wiki_text_path):
    wiki = WikiCorpus(wiki_dump_path)
    with open(wiki_text_path, 'w', encoding='utf-8') as fout:
        for text in tqdm(wiki.get_texts()):
            fout.write(' '.join(text) + '\n')
import os.path
import sys

from gensim.corpora import WikiCorpus
from gensim.models import TfidfModel, Word2Vec

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    wiki = WikiCorpus(inp, dictionary={})
    model = Word2Vec(size=300, window=5, min_count=5, workers=8)
    # get_texts() returns a fresh generator on each call, so the corpus
    # can be streamed once for build_vocab and once more for train
    sentences = wiki.get_texts()
    model.build_vocab(sentences)
    sentences = wiki.get_texts()
    model.train(sentences)
    model.save(outp)
    model.init_sims(replace=True)
    model.save('trimmed-model')
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO
)

parser = argparse.ArgumentParser()
parser.add_argument("-a", "--articles", help="path to enwiki-latest-pages-articles.xml.bz2")
parser.add_argument("-m", "--model", help="path to model dir")
parser.add_argument("-d", "--demo", help="path to question-words.txt analogies")
parser.add_argument("-l", "--lines", help="path to wiki-lines.txt")
args = parser.parse_args()

# Create wiki-lines.txt if it does not exist yet
if not os.path.isfile(args.lines):
    wiki_corpus = WikiCorpus(args.articles, lemmatize=False)
    # Write the extracted articles out for future use
    lines_file = open(args.lines, 'w')
    for text in wiki_corpus.get_texts():
        lines_file.write(" ".join(text) + "\n")
    lines_file.close()

# LineSentence streams the one-article-per-line file back in
model = Word2Vec(
    sentences=LineSentence(args.lines),
    size=400,
    hs=1,
    window=5,
    min_count=5,
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import urllib
import urllib.request

# download the Wikipedia dump
urllib.request.urlretrieve(
    "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2",
    "enwiki-latest-pages-articles.xml.bz2")

# train
with open('wiki.en.text', 'w') as fout:
    wiki = WikiCorpus("enwiki-latest-pages-articles.xml.bz2", lemmatize=False, dictionary={})
    for i, text in enumerate(wiki.get_texts()):
        fout.write(' '.join(text) + '\n')
        if i == 99999:
            break  # stop after 100,000 articles

model = Word2Vec(LineSentence('wiki.en.text'), size=200, window=5, min_count=3, workers=8)

# trim unneeded model memory = use (much) less RAM
model.init_sims(replace=True)
model.save('wiki.en.word2vec.model')

# test the model
model.most_similar('queen', topn=3)
model.most_similar(positive=['woman', 'king'],
if __name__ == "__main__": # if the program is being run directly and is not being imported... program = os.path.basename(sys.argv[0]) logger = logging.getLogger(program) logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') logging.root.setLevel(level=logging.INFO) logger.info("running %s" % ' '.join(sys.argv)) if len(sys.argv) != 3: print("Use python3 ExtractArticles.py enwiki.xxx.xml.bz2 wikien.txt") sys.exit(1) # exits from python inp, outp = sys.argv[1:3] space = " " i = 0 output = open(outp, "w", encoding="utf-8") wiki = WikiCorpus(inp, lemmatize=False, dictionary={}) count = 0 for text in wiki.get_texts(): # wiki.get_texts() is generator object output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n') i = i + 1 if (i % 10000 == 0): logger.info("Saved", str(i), "articles") output.close() logger.info("Finished saving", str(i), "articles")
import logging

from gensim.corpora import WikiCorpus

logging.info("Love Live!")

wiki_corpus = WikiCorpus('zhwiki-20190520-pages-articles-multistream.xml.bz2', dictionary={})
texts_num = 0

with open("wiki_texts.txt", 'w', encoding='utf-8') as output:
    for text in wiki_corpus.get_texts():
        output.write(''.join(text) + '\n')
        texts_num += 1
        if texts_num % 10000 == 0:
            logging.info("Processed %d articles" % texts_num)
def set_wiki_to_txt(self):
    wiki_corpus = WikiCorpus(self.wiki_data_path, dictionary={})
    with open(self.save_text_path, 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(' '.join(text) + '\n')
    print("Conversion finished!")
from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])  # get the script name
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0
    output = open(outp, 'w')
    # WikiCorpus is gensim's Wikipedia processing class
    wiki = WikiCorpus(inp, lemmatize=False, dictionary=[])
    # get_texts() converts each Wikipedia article into one line of text,
    # stripping punctuation and other markup
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i = i + 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles.")
    output.close()
    logger.info("Finished saving " + str(i) + " articles.")
from gensim.corpora import WikiCorpus
import time

# Extract edu_dict
edu_dict = set()
with open('datas/dict/edu_dict.txt', 'r') as f:
    edu_dict.update([line.strip('\n') for line in f])

# Extract articles + convert to traditional chinese
# !! Warning !!
# The code below will replace the original result and runs for roughly 20 minutes
wiki_corpus = WikiCorpus('datas/raw/zhwiki-20170801-pages-articles.xml.bz2', dictionary=edu_dict)

with open('datas/wiki-texts.txt', 'w', encoding='utf-8') as output:
    start_time = time.time()
    for i, text in enumerate(wiki_corpus.get_texts()):
        output.write(' '.join(text) + '\n')
        if i % 1000 == 0:
            print('Finished %3dk lines / elapsed time %10.2f'
                  % (i / 1000, time.time() - start_time), end='\r')
This script converts the xml-format Wikipedia training corpus downloaded from the web into txt format.
Wikipedia training corpus
Link: https://pan.baidu.com/s/1eLkybiYOE_aVxsN0pALATg  Password: hmtn
"""
from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    print('Main program started...')

    input_file_name = 'zhwiki-latest-pages-articles.xml.bz2'
    output_file_name = 'wiki.cn.txt'

    print('Reading wiki data...')
    input_file = WikiCorpus(input_file_name, lemmatize=False, dictionary={})
    print('Finished reading wiki data!')

    output_file = open(output_file_name, 'w', encoding="utf-8")

    print('Processing started...')
    count = 0
    for text in input_file.get_texts():
        output_file.write(' '.join(text) + '\n')
        count = count + 1
        if count % 10000 == 0:
            print('Processed %d records so far' % count)
    print('Processing finished!')

    output_file.close()
    print('Main program finished!')
# wiki.get_texts() will only return articles which pass a couple of
# filters that weed out stubs, redirects, etc. If you included all of
# those, Wikipedia is more like ~17M articles.
#
# For each article, it's going to add the words in the article to the
# dictionary.
#
# If you look inside add_documents, you'll see that it calls doc2bow --
# this generates a bag-of-words vector, but we're not keeping it. The
# dictionary isn't finalized until all of the articles have been
# scanned, so we don't know the right mapping of words to ids yet.
#
# You can use the prune_at parameter to prevent the dictionary from
# growing too large during this process, but I think it's interesting
# to see the total count of unique tokens before pruning.
dictionary.add_documents(wiki.get_texts(), prune_at=None)

print(' Building dictionary took %s' % formatTime(time.time() - t0))
print(' %d unique tokens before pruning.' % len(dictionary))
sys.stdout.flush()

keep_words = 100000

# The initial dictionary is huge (~8.75M words in my Wikipedia dump),
# so let's filter it down. We want to keep the words that are neither
# very rare nor overly common. To do this, we will keep only words that
# exist within at least 20 articles, but not more than 10% of all
# documents. Finally, we'll also put a hard limit on the dictionary
# size and just keep the 'keep_words' most frequent words.
wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)
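# A tiny self-contained sketch (toy documents and hypothetical variable names,
# not part of this script) of the doc2bow behaviour described above:
# add_documents grows the word-to-id mapping, and doc2bow maps a document to
# (token_id, count) pairs against that mapping.
from gensim.corpora import Dictionary

toy_docs = [['cat', 'sat', 'mat'], ['cat', 'dog']]
toy_dictionary = Dictionary()
toy_dictionary.add_documents(toy_docs, prune_at=None)
print(len(toy_dictionary))                            # 4 unique tokens before pruning
print(toy_dictionary.doc2bow(['cat', 'cat', 'dog']))  # e.g. [(id_cat, 2), (id_dog, 1)]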
def xml2txt(f_name, out_name):
    output = open(out_name, 'w', encoding='utf-8')
    wiki = WikiCorpus(f_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        str_line = ' '.join(text)
        output.write(str_line + '\n')
    output.close()
parser = ArgumentParser(description="Get a number of articles containing analogy words from a wikipedia dump.",
                        formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument("csvPath", help="The path to the csv containing the word analogies which should be contained")
parser.add_argument("wikiPath", help="The path of the wikipedia dump")
parser.add_argument("outputPath", help="The output path")
parser.add_argument("--n-articles", help="The number of articles", type=int, default=1000)
args = parser.parse_args()

if args.wikiPath.endswith(".txt"):
    inp = open(args.wikiPath, "r")
    wiki_file = False
else:
    wiki = WikiCorpus(args.wikiPath, lemmatize=False, dictionary={})
    inp = wiki.get_texts()
    wiki_file = True

try:
    with open(args.csvPath) as csvfile:
        word_analogies = [row for row in csv.reader(csvfile, delimiter=",")]
    remaining_n_articles = args.n_articles
    with open(args.outputPath, "w") as out:
        for text in inp:
            for word_analogy in word_analogies:
                if word_analogy[0] in text and word_analogy[1] in text:
                    if wiki_file:
                        out.write(" ".join(text) + "\n")
                    else:
                        out.write(text)
                    remaining_n_articles -= 1
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 3:
        print("Usage: extractwiki.py infile_name outfile_name")
        sys.exit(1)
    infilename, outfilename = sys.argv[1:3]
    if os.path.isfile(outfilename):
        logger.error("Output file %s exists. Change the file name and try again." % outfilename)
        sys.exit(1)
    i = 0
    output = open(outfilename, 'w')
    wiki = WikiCorpus(infilename, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(" ".join(text) + "\n")
        i = i + 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished saving " + str(i) + " articles")
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

# check and process input arguments
# if len(sys.argv) < 3:
#     print(globals()['__doc__'] % locals())
#     sys.exit(1)
# python process_wiki.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.text
# inp, outp = sys.argv[1:3]

# set the input and output filenames
inp, outp = '/home/hs/Data/wikipedia/zhwiki-latest-pages-articles.xml.bz2', '/home/hs/Data/wikipedia/wiki.zh.text'
space = " "
i = 0

output = open(outp, 'w')
wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
texts = wiki.get_texts()
for text in texts:
    # older gensim yields byte tokens, so decode before joining
    output.write(space.join([t.decode('utf-8') for t in text]) + "\n")
    i = i + 1
    if i % 10000 == 0:
        logger.info("Saved " + str(i) + " articles")
output.close()
logger.info("Finished saving " + str(i) + " articles")