def get_page_info(url, length, index):
    """Fetch editor and language-link data for one page and cache it as JSON.

    The page title and language are derived from ``url`` with ``url2title``
    and ``url2lang``. Unless the language is ``"www"``, the page is fetched
    through ``Page.fetch_from_api_title`` and the result is written to
    ``dataset/<page_id>.info.json``. The file is written only if it does
    not already exist.

    Args:
        url: Page URL; passed to ``url2title`` and ``url2lang``.
        length: Total number of pages being processed (progress output only).
        index: Position of this page in the run (progress output only).

    Returns:
        None. The result is the JSON file written to disk.
    """
    title = url2title(url)
    lang = url2lang(url)

    # Single-argument parenthesised print emits the same text on Python 2 and 3.
    print("[%s] %s (%s/%s)" % (lang, title, index, length))

    wp = Page()

    # NOTE(review): "www" URLs are skipped entirely -- no fetch and no cache
    # file is produced for them; confirm that this is the intended scope.
    if lang == "www":
        return

    wp.fetch_from_api_title(title, lang=lang)

    path = "dataset/%s.info.json" % (wp.page_id)
    if os.path.isfile(path):
        # Already cached by a previous run.
        return

    # Collect everything *before* creating the file: if one of these calls
    # fails, no empty file is left behind that the isfile() check above
    # would mistake for a finished cache entry on the next run.
    data = {
        "edits": wp.get_all_editors(),
        "langs": wp.get_langlinks(),
    }

    with open(path, "w") as out:
        json.dump(data, out)
# Extend the seed list: for every seed URL in ``source_in``, write the seed
# itself followed by the URLs of its language links whose language code is
# listed in ``extra_languages``.
source_in = "data/in/wicrimea-seeds.txt"
source_ext = "data/out/wicrimea-seeds.extended.txt"

# Both files are managed by ``with`` so the output is flushed and closed even
# when one of the API calls below raises part-way through the seed list.
with open(source_ext, "w") as out:
    with open(source_in, "r") as seeds_file:
        for line in seeds_file:
            seed = line.strip()
            if not seed:
                # Blank lines carry no title to look up; skip them.
                continue

            # ``p`` stays a module-level name: write_revision() below reads it.
            p = Page()
            p.fetch_from_api_title(url2title(seed))

            print("")
            print(u"→ %s (%s)" % (p.title, seed))
            out.write(seed + "\n")

            langs = p.get_langlinks()
            for link in langs:
                if link["lang"] in extra_languages:
                    # "*" holds the title passed to the per-language fetch.
                    p_lang = Page()
                    p_lang.fetch_from_api_title(link["*"], lang=link["lang"])
                    print(u" → [%s] %s (%s)" % (link["lang"], p_lang.title, p_lang.url))
                    out.write(p_lang.url + "\n")


def write_revision(rev_id, file):
    rev_with_content = p.get_revisions(extra_params={ "rvstartid": rev_id, "rvlimit" : 1})