def get_en_page(ur_name):
    """
    Looks up the English title for an Urdu Wikipedia article via its
    language links. Returns the empty string if the article does not
    exist or has no English counterpart.
    """
    if wikipydia.query_exists(touni(ur_name), language='ur'):
        links = wikipydia.query_language_links(touni(ur_name), language='ur')
        if 'en' in links:
            return links['en']
    return ""
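# Minimal usage sketch for get_en_page, assuming the wikipydia wrapper and
# touni() are importable as above. The title (Urdu for "Pakistan") is just
# an illustrative example, written with unicode escapes so no source-file
# coding declaration is needed.
if __name__ == '__main__':
    title = u'\u067e\u0627\u06a9\u0633\u062a\u0627\u0646'  # u'پاکستان'
    en_title = get_en_page(title)
    print (en_title if en_title else u'no English counterpart').encode('utf-8')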
# Languages represented as Wikipedia prefixes, e.g. xx -> xx.wikipedia.org
langs = wikilanguages.load(settings["languages_file"])
print langs

lang = "en"
# Maps language code -> {English title: translated title}.
# Named lang_dict to avoid shadowing the builtin dict.
lang_dict = {}
for word in words:
    links = wikipydia.query_language_links(title=word, language=lang, limit=1000)
    for link in links:
        if link in lang_dict:
            lang_dict[link][word] = links[link]
        else:
            lang_dict[link] = {word: links[link]}

print "# of languages: ", len(langs)
for l in langs:
    if l in lang_dict:
        text = l + " : "
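# Hedged sketch of reading the mapping built above: for one language code,
# report how many of the input titles have a translation there. The helper
# name and the 'es' key are illustrative, not part of the original script.
def print_coverage(lang_dict, code):
    if code in lang_dict:
        print code, ":", len(lang_dict[code]), "titles"
        for en_title, foreign_title in lang_dict[code].items():
            print (u"  %s -> %s" % (en_title, foreign_title)).encode('utf-8')

# print_coverage(lang_dict, 'es')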
toRemove = []
for (i, l) in enumerate(links):
    # (the condition guarding this first append was truncated upstream)
    toRemove.append(i)
    if "edit" in l:
        toRemove.append(i)
    if "Special:" in l:
        toRemove.append(i)
    if "a,b" in l:
        toRemove.append(i)
    if l == "originallypublishedin1973":
        toRemove.append(i)
    if opts.language == "es":
        if "Archivo" in l:
            toRemove.append(i)
        if "Wikipedia" in l:
            toRemove.append(i)
        if l == "editar":
            toRemove.append(i)
# Delete flagged entries from highest index to lowest, so that earlier
# deletions do not shift the positions of indices still to be removed.
for i in sorted(toRemove, reverse=True):
    if i < len(links):
        del links[i]
if opts.language == "en":
    for (i, link) in enumerate(links):
        allLanguages = wikipydia.query_language_links(link)
        if "es" in allLanguages:
            links[i] = allLanguages["es"]
        links[i] = links[i].replace(" ", "_")
    print u" ".join([urllib.quote(l.encode("utf-8")) for l in links])
else:
    print u" ".join([urllib.quote(l.encode("utf-8")) for l in links])
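# Self-contained illustration (toy data) of why the deletion loop above
# iterates indices in descending order: deleting from a list shifts every
# later element left, so ascending deletion would remove the wrong items.
items = ['a', 'b', 'c', 'd']
for i in sorted([1, 3], reverse=True):
    del items[i]
assert items == ['a', 'c']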
def write_csv_file(csv_filename, sentence_filename, articles_filename, articles,
                   lang, num_sentences_per_hit, img_output_dir, img_url_dir,
                   fontName='Times New Roman', target_lang='en'):
    """
    Generates a comma separated value file and associated image files so
    that a Mechanical Turk translation HIT can be created.
    """
    init_files(sentence_filename, articles_filename)
    article_ids = []
    target_titles = []
    write_lines_to_file(articles_filename, articles)
    for article in articles:
        print article,
        article_id = wikipydia.query_page_id(article, language=lang)
        print article_id
        try:
            sentences = get_sentences_for_article(article, article_id, lang, sentence_filename)
            article_ids.append(article_id)
            ll = wikipydia.query_language_links(article, lang)
            if target_lang in ll:
                target_titles.append(ll[target_lang])
            else:
                target_titles.append('')
        except:
            target_titles.append('')
    write_lines_to_file(articles_filename + '.ids', article_ids)
    write_lines_to_file(articles_filename + '.target_titles', target_titles)
    #
    # translate all sentences
    translations = get_translations(sentence_filename, lang)
    #
    # generate all images
    img_urls = generate_images(sentence_filename, img_output_dir, img_url_dir, fontName)
    #
    # filter sentences that are mainly ascii
    mask = filter_sentences(sentence_filename, lang)
    #
    csv_output_file = open(csv_filename, 'w')
    header = 'lang_pair'
    for i in range(1, num_sentences_per_hit + 1):
        header += ',seg_id%s' % str(i)
        header += ',tag%s' % str(i)
        header += ',seg%s' % str(i)
        header += ',img_url%s' % str(i)
        header += ',machine_translation%s' % str(i)
    #
    # load the sentences
    sentences = read_lines_from_file(sentence_filename)
    seg_ids = read_lines_from_file(sentence_filename + '.seg_ids')
    tags = read_lines_from_file(sentence_filename + '.tags')
    mask = read_lines_from_file(sentence_filename + '.mask')
    #
    line = header
    counter = 0
    for i, sentence in enumerate(sentences):
        if mask[i] == '1':
            # a completed HIT row is flushed when the next one starts
            if counter % num_sentences_per_hit == 0:
                csv_output_file.write(line.encode('UTF-8'))
                csv_output_file.write('\n'.encode('UTF-8'))
                line = lang + "-" + target_lang
            counter += 1
            seg_id = seg_ids[i]
            tag = tags[i]
            sentence = format_for_csv(sentence)
            img_url = img_urls[i]
            translation = format_for_csv(translations[i])
            line += ',%s,%s,%s,%s,%s' % (seg_id, tag, sentence, img_url, translation)
    # if the last HIT is only partially filled, pad the remaining fields with
    # a do-not-translate message (append to the row rather than overwrite it,
    # and count only the slots still empty in the current HIT)
    if not counter % num_sentences_per_hit == 0:
        dnt_url = ",,,," + img_url_dir + "/do-not-translate.png,"
        line += dnt_url * (num_sentences_per_hit - counter % num_sentences_per_hit)
    csv_output_file.write(line.encode('UTF-8'))
    csv_output_file.write('\n'.encode('UTF-8'))
    csv_output_file.close()
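# Hedged usage sketch for write_csv_file. All paths, URLs, and the article
# list below are hypothetical placeholders; helpers such as init_files,
# get_translations, generate_images, and filter_sentences are assumed to be
# defined elsewhere in this module.
if __name__ == '__main__':
    write_csv_file(csv_filename='hits.csv',
                   sentence_filename='sentences.txt',
                   articles_filename='articles.txt',
                   articles=[u'Lahore', u'Karachi'],
                   lang='ur',
                   num_sentences_per_hit=10,
                   img_output_dir='/var/www/imgs',
                   img_url_dir='http://example.com/imgs')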