Пример #1
0
def collect_wiki_corpus(language, lang, num_items):
    """
    Download random Wikipedia articles and dump their paragraph text.

    Titles come from ``query_random_titles(lang, num_items)`` and each
    article is fetched with ``query_text_rendered(title, language=lang)``.
    The text of every ``<p>`` element in the returned ``'html'`` entry is
    tokenized and written, one paragraph per line, to ``<language>.plain``
    in the current working directory (UTF-8, overwriting any existing file).

    Parameters:
        language: name used only to build the output filename.
        lang: language code handed to the two wiki query helpers.
        num_items: number of random titles to request.

    Returns:
        None. The result is the file written to disk.
    """
    filename = "%s.plain" % (language)

    # The context manager guarantees the file is closed even when a query,
    # the HTML parse or the tokenizer raises part-way through the run.
    with codecs.open(filename, "w", "utf-8") as out:
        # One tokenizer serves every paragraph; there is no need to build
        # a new instance per <p> element.
        tokenizer = PunktWordTokenizer()

        for title in query_random_titles(lang, num_items):
            article_dict = query_text_rendered(title, language=lang)

            # Soup it
            soup = BeautifulSoup(article_dict['html'])
            for p in soup.findAll('p'):
                # Concatenate all text nodes under this paragraph,
                # dropping the markup around them.
                p_text = ''.join(p.findAll(text=True))

                # Tokenize but keep . at the end of words
                p_tokenized = ' '.join(tokenizer.tokenize(p_text))

                out.write(p_tokenized)
                out.write("\n")
Пример #2
0
import wikipydia
from wikipydia import query_random_titles

import datetime
import sys
import os

# Command line, as in the debugging note below:
#   <lang> <YYYY-MM-DD> <num_random_titles> [<num_days>]
# The date argument is cut up by fixed character positions, so the
# separator characters at offsets 4 and 7 are never inspected.
date = datetime.date(
    int(sys.argv[2][0:4]),
    int(sys.argv[2][5:7]),
    int(sys.argv[2][8:10]),
)

# Optional fourth argument; falls back to 1 when it is not supplied.
loops = int(sys.argv[4]) if len(sys.argv) > 4 else 1

random = query_random_titles(sys.argv[1], int(sys.argv[3]))

"""
For purpose of debugging the difference function for sets
(python generate_negative.py en 2011-05-11 50 15)
random.append('The_Pirate_Bay')
"""

wikitopics_path = os.environ["WIKITOPICS"]
# The empty last element keeps the trailing "/" on the directory path:
#   $WIKITOPICS/data/articles/<lang>/<year>/
articles_path = "/".join(
    [wikitopics_path, "data", "articles", sys.argv[1], sys.argv[2][:4], ""]
)

top_news = []
for i in range(0, loops):
    previousdays = datetime.timedelta(days=i)
    new_date = date - previousdays
    articles = articles_path + new_date.strftime("%Y-%m-%d")
    if os.path.exists(articles):