def load_freq_pages(page_view_counts_filename, language, limit=settings["top_articles"]):
    """
    Reads a file of page view counts and retrieves the top-k most
    frequently viewed pages for the given language.
    """
    # OPT: the counts file is re-parsed separately for each language
    logging.info("loading list of articles for language %s" % (language))
    logging.debug("load freq pages for %s, limit=%s" % (language, limit))
    freq_pages = []
    input_file = open(page_view_counts_filename, 'r')
    for line in input_file:
        line = line.rstrip('\n')
        # each line has the form: lang rank title count
        (lang, rank, title, count) = line.split(' ')
        if lang == language and len(freq_pages) < limit:
            title = unquote(title)
            try:
                # query the page id to verify that the article actually exists
                page_id = wikipydia.query_page_id(title, language=lang)
                freq_pages.append(title)
            except KeyError:
                print 'no page for ', title, language
            except IOError:
                print 'cannot reach ', title, language
            except TypeError:
                print 'unicode object error for ', title, language
            except UnicodeDecodeError:
                print 'unicode error for ', title, language
    input_file.close()
    logging.info("# of articles loaded: %s" % (len(freq_pages)))
    return freq_pages
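# A minimal usage sketch for load_freq_pages (the counts filename and the
# language code are illustrative; it assumes a space-separated
# "lang rank title count" file and that settings["top_articles"] is defined):
#
#   top_pages = load_freq_pages('pagecounts-20091001.txt', 'es', limit=100)
#   for title in top_pages:
#       print title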
def write_csv_file(csv_filename, sentence_filename, articles_filename, articles, lang,
                   num_sentences_per_hit, img_output_dir, img_url_dir,
                   fontName='Times New Roman', target_lang='en'):
    """
    Generates a comma separated value file and associated image files so
    that a Mechanical Turk translation HIT can be created.
    """
    init_files(sentence_filename, articles_filename)
    article_ids = []
    target_titles = []
    write_lines_to_file(articles_filename, articles)
    for article in articles:
        print article,
        article_id = wikipydia.query_page_id(article, language=lang)
        print article_id
        try:
            sentences = get_sentences_for_article(article, article_id, lang, sentence_filename)
            article_ids.append(article_id)
            # record the linked title in the target language, if one exists
            ll = wikipydia.query_language_links(article, lang)
            if target_lang in ll:
                target_titles.append(ll[target_lang])
            else:
                target_titles.append('')
        except:
            # on any failure, fall back to an empty target title
            target_titles.append('')
    write_lines_to_file(articles_filename + '.ids', article_ids)
    write_lines_to_file(articles_filename + '.target_titles', target_titles)
    # translate all sentences
    translations = get_translations(sentence_filename, lang)
    # generate all images
    img_urls = generate_images(sentence_filename, img_output_dir, img_url_dir, fontName)
    # filter out sentences that are mainly ascii
    mask = filter_sentences(sentence_filename, lang)
    csv_output_file = open(csv_filename, 'w')
    # each row holds num_sentences_per_hit sentences, five columns apiece
    header = 'lang_pair'
    for i in range(1, num_sentences_per_hit + 1):
        header += ',seg_id%s' % str(i)
        header += ',tag%s' % str(i)
        header += ',seg%s' % str(i)
        header += ',img_url%s' % str(i)
        header += ',machine_translation%s' % str(i)
    # load the sentences and their per-sentence metadata
    sentences = read_lines_from_file(sentence_filename)
    seg_ids = read_lines_from_file(sentence_filename + '.seg_ids')
    tags = read_lines_from_file(sentence_filename + '.tags')
    mask = read_lines_from_file(sentence_filename + '.mask')
    # the previous row (initially the header) is flushed each time a new
    # row is started
    line = header
    counter = 0
    for i, sentence in enumerate(sentences):
        if mask[i] == '1':
            if counter % num_sentences_per_hit == 0:
                csv_output_file.write(line.encode('UTF-8'))
                csv_output_file.write('\n'.encode('UTF-8'))
                line = lang + "-" + target_lang
            counter += 1
            seg_id = seg_ids[i]
            tag = tags[i]
            sentence = format_for_csv(sentence)
            img_url = img_urls[i]
            translation = format_for_csv(translations[i])
            line += ',%s,%s,%s,%s,%s' % (seg_id, tag, sentence, img_url, translation)
    # if the number of kept sentences is not a multiple of
    # num_sentences_per_hit, pad the remaining fields of the last row
    # with a do-not-translate image
    if not counter % num_sentences_per_hit == 0:
        dnt_url = ",,,," + img_url_dir + "/do-not-translate.png,"
        line += dnt_url * (num_sentences_per_hit - counter % num_sentences_per_hit)
    csv_output_file.write(line.encode('UTF-8'))
    csv_output_file.write('\n'.encode('UTF-8'))
    csv_output_file.close()
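# A minimal end-to-end sketch tying the two functions together (filenames,
# directories, and the image URL are illustrative; it assumes the helpers
# used above -- init_files, get_sentences_for_article, get_translations,
# generate_images, filter_sentences, format_for_csv -- are defined in this
# module):
#
#   articles = load_freq_pages('pagecounts-20091001.txt', 'ur', limit=50)
#   write_csv_file('translate_ur.csv', 'sentences.ur', 'articles.ur',
#                  articles, 'ur', 10, '/var/www/turk/img',
#                  'http://example.org/turk/img', target_lang='en')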