def collect_wiki_corpus(language, lang, num_items):
    """
    Download <num_items> random wikipedia articles in language <lang>
    and write their tokenized paragraph text to "<language>.plain".

    Every <p> element found in an article's rendered HTML becomes one
    line of the UTF-8 output file, with tokens joined by single spaces.

    language  -- used only to build the output file name
    lang      -- passed to query_random_titles and query_text_rendered
                 (presumably a wikipedia language code; confirm with callers)
    num_items -- number of random titles to request

    Returns None. Exceptions raised by the queries, the HTML parser or the
    tokenizer propagate to the caller; the output file is closed first.
    """
    filename = "%s.plain" % (language)

    # The with-block guarantees the file is flushed and closed even when a
    # query, the parser or the tokenizer raises part-way through.
    with codecs.open(filename, "w", "utf-8") as out:
        # Build the tokenizer once instead of once per paragraph.
        tokenizer = PunktWordTokenizer()

        for title in query_random_titles(lang, num_items):
            article_dict = query_text_rendered(title, language=lang)
            # Soup it
            soup = BeautifulSoup(article_dict['html'])
            # NOTE(review): output is taken to be one line per paragraph;
            # confirm against the file's original indentation.
            for p in soup.findAll('p'):
                # Keep only the text nodes, dropping markup inside the <p>
                p_text = ''.join(p.findAll(text=True))
                # Tokenize but keep . at the end of words
                p_tokenized = ' '.join(tokenizer.tokenize(p_text))
                out.write(p_tokenized)
                out.write("\n")
import wikipydia from wikipydia import query_random_titles import datetime import sys import os date = datetime.date(int((sys.argv[2])[:4]), int((sys.argv[2])[5:7]), int((sys.argv[2])[8:10])) loops = 1 if len(sys.argv) > 4: loops = int(sys.argv[4]) random = query_random_titles(sys.argv[1], int(sys.argv[3])) """ For purpose of debugging the difference function for sets (python generate_negative.py en 2011-05-11 50 15) random.append('The_Pirate_Bay') """ wikitopics_path = os.environ["WIKITOPICS"] articles_path = wikitopics_path + "/data/articles/" + sys.argv[1] + "/" + (sys.argv[2])[:4] + "/" top_news = [] for i in range(0, loops): previousdays = datetime.timedelta(days=i) new_date = date - previousdays articles = articles_path + new_date.strftime("%Y-%m-%d") if os.path.exists(articles):