Example #1
import logging
from urllib.parse import unquote

import wikipydia


def load_freq_pages(page_view_counts_filename, language, limit=settings["top_articles"]):
    """
    Reads a file with page view counts and retrieves the titles of the
    top-k most frequently viewed pages for the given language.
    """
    # NOTE: `settings` is assumed to be a module-level configuration dict.
    # OPT: the counts file is re-parsed separately for each language.
    logging.info("loading list of articles for language %s" % language)
    logging.debug("load freq pages for %s, limit=%s" % (language, limit))
    freq_pages = []
    input_file = open(page_view_counts_filename, 'r', encoding='utf-8')
    for line in input_file:
        if len(freq_pages) >= limit:
            break  # the top-k list is full; skip the rest of the file
        line = line.rstrip('\n')
        # each line is expected to hold: lang rank title count
        (lang, rank, title, count) = line.split(' ')
        if lang == language:
            title = unquote(title)
            try:
                # used purely as an existence check: a missing page
                # raises KeyError and the title is skipped
                wikipydia.query_page_id(title, language=lang)
                freq_pages.append(title)
            except KeyError:
                logging.debug('no page for %s %s' % (title, language))
            except IOError:
                logging.debug('cannot reach %s %s' % (title, language))
            except TypeError:
                logging.debug('unicode object error for %s %s' % (title, language))
            except UnicodeDecodeError:
                logging.debug('unicode error for %s %s' % (title, language))
    input_file.close()
    logging.info("# of articles loaded: %s" % len(freq_pages))
    return freq_pages
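
A minimal usage sketch for the function above, assuming a whitespace-delimited counts file in the lang rank title count format parsed by the loop; the file name and language code are hypothetical:

top_articles = load_freq_pages('pagecounts.txt', 'es', limit=10)
for title in top_articles:
    print(title)
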
def write_csv_file(csv_filename,
                   sentence_filename,
                   articles_filename,
                   articles,
                   lang,
                   num_sentences_per_hit,
                   img_output_dir,
                   img_url_dir,
                   fontName='Times New Roman',
                   target_lang='en'):
    """
    Generates a comma-separated value file and associated image files
    so that a Mechanical Turk translation HIT can be created.
    """
    init_files(sentence_filename, articles_filename)
    article_ids = []
    target_titles = []
    write_lines_to_file(articles_filename, articles)
    for article in articles:
        article_id = wikipydia.query_page_id(article, language=lang)
        logging.debug("%s %s" % (article, article_id))
        try:
            # extract the article's sentences; they are re-read from
            # sentence_filename further down
            get_sentences_for_article(article, article_id, lang,
                                      sentence_filename)
            article_ids.append(article_id)
            ll = wikipydia.query_language_links(article, lang)
            if target_lang in ll:
                target_titles.append(ll[target_lang])
            else:
                target_titles.append('')
        except Exception:
            # on any failure, record an empty target title for the article
            target_titles.append('')
    write_lines_to_file(articles_filename + '.ids', article_ids)
    write_lines_to_file(articles_filename + '.target_titles', target_titles)
    #
    # translate all sentences
    translations = get_translations(sentence_filename, lang)
    #
    # generate all images
    img_urls = generate_images(sentence_filename, img_output_dir, img_url_dir,
                               fontName)
    #
    # filter out sentences that are mainly ASCII (writes the .mask file
    # that is read back below)
    filter_sentences(sentence_filename, lang)
    #
    csv_output_file = open(csv_filename, 'w', encoding='utf-8')
    header = 'lang_pair'
    for i in range(1, num_sentences_per_hit + 1):
        header += ',seg_id%s' % i
        header += ',tag%s' % i
        header += ',seg%s' % i
        header += ',img_url%s' % i
        header += ',machine_translation%s' % i
    #
    # load the sentences and their per-sentence metadata
    sentences = read_lines_from_file(sentence_filename)
    seg_ids = read_lines_from_file(sentence_filename + '.seg_ids')
    tags = read_lines_from_file(sentence_filename + '.tags')
    mask = read_lines_from_file(sentence_filename + '.mask')
    #
    line = header
    counter = 0
    for i, sentence in enumerate(sentences):
        if mask[i] == '1':
            # flush the previous row (or the header) at each HIT boundary
            if counter % num_sentences_per_hit == 0:
                csv_output_file.write(line)
                csv_output_file.write('\n')
                line = lang + "-" + target_lang
            counter += 1
            seg_id = seg_ids[i]
            tag = tags[i]
            sentence = format_for_csv(sentence)
            img_url = img_urls[i]
            translation = format_for_csv(translations[i])
            line += ',%s,%s,%s,%s,%s' % (seg_id, tag, sentence, img_url,
                                         translation)
    # if the final HIT is short, pad the remaining fields with a
    # do-not-translate image so the row keeps a fixed number of columns
    if counter % num_sentences_per_hit != 0:
        dnt_url = ",,,," + img_url_dir + "/do-not-translate.png,"
        line += dnt_url * (num_sentences_per_hit -
                           counter % num_sentences_per_hit)
    csv_output_file.write(line)
    csv_output_file.write('\n')
    csv_output_file.close()
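
A hypothetical end-to-end invocation sketch tying the two functions together; every file name, directory, URL, and parameter value below is made up for illustration:

articles = load_freq_pages('pagecounts.txt', 'es', limit=100)
write_csv_file('translation_hits.csv',
               'sentences.es.txt',
               'articles.es.txt',
               articles,
               'es',
               num_sentences_per_hit=10,
               img_output_dir='/var/www/hit_images',
               img_url_dir='http://example.com/hit_images',
               target_lang='en')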