Example #1
def get_en_page(ur_name):
    """Return the English title linked from an Urdu Wikipedia page, or '' if there is none."""
    if wikipydia.query_exists(touni(ur_name), language='ur'):
        links = wikipydia.query_language_links(touni(ur_name), language='ur')
        if 'en' in links:
            return links['en']
    return ""
Example #2
langs = []  # list of languages represented as wikipedia prefixes, e.g. xx - xx.wikipedia.org
langs = wikilanguages.load(settings["languages_file"])

print langs
lang = "en"

dict = {}  # maps language prefix -> {source title: title in that language}

for word in words:
    #print "------"
    #print word
    #print "-"
    links = wikipydia.query_language_links(title=word,
                                           language=lang,
                                           limit=1000)
    #print len(links)
    for link in links:
        #print ">>", link

        if link in dict:
            dict[link][word] = (links[link])
        else:
            dict[link] = {word: links[link]}
        #print link, " - ",links[link]

print "# of languages: ", len(langs)

for l in langs:
    if l in dict:
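For reference, a small hedged sketch of what `wikipydia.query_language_links` returns for a single title: the keys are Wikipedia language prefixes and the values are the article titles in those languages (the titles shown are only illustrative):

# Illustrative only -- actual contents depend on the live Wikipedia data.
links = wikipydia.query_language_links(title='Water', language='en', limit=1000)
# e.g. {'es': u'Agua', 'fr': u'Eau', 'de': u'Wasser', ...}
for prefix in sorted(links):
    print prefix, '->', links[prefix].encode('utf-8')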
Example #3
                toRemove.append(i)
            if "edit" in l:
                toRemove.append(i)
            if "Special:" in l:
                toRemove.append(i)
            if "a,b" in l:
                toRemove.append(i)
            if l == "originallypublishedin1973":
                toRemove.append(i)
        if opts.language == "es":
            if "Archivo" in l:
                toRemove.append(i)
            if "Wikipedia" in l:
                toRemove.append(i)
            if l == "editar":
                toRemove.append(i)

    for i in sorted(toRemove, reverse=True):
        if i < len(links):
            del links[i]

    if opts.language == "en":
        for (i, link) in enumerate(links):
            allLanguages = wikipydia.query_language_links(link)
            if "es" in allLanguages:
                links[i] = allLanguages["es"]
                links[i] = links[i].replace(" ", "_")
        print u" ".join([urllib.quote(l.encode("utf-8")) for l in links])
    else:
        print u" ".join([urllib.quote(l.encode("utf-8")) for l in links])
def write_csv_file(csv_filename,
                   sentence_filename,
                   articles_filename,
                   articles,
                   lang,
                   num_sentences_per_hit,
                   img_output_dir,
                   img_url_dir,
                   fontName='Times New Roman',
                   target_lang='en'):
    """
   Generates a comma seperated value file and associated image files 
   so that a Mechanical Turk translation HIT can be created.
   """
    init_files(sentence_filename, articles_filename)
    article_ids = []
    target_titles = []
    write_lines_to_file(articles_filename, articles)
    for article in articles:
        print article,
        article_id = wikipydia.query_page_id(article, language=lang)
        print article_id
        try:
            sentences = get_sentences_for_article(article, article_id, lang,
                                                  sentence_filename)
            article_ids.append(article_id)
            ll = wikipydia.query_language_links(article, lang)
            if target_lang in ll:
                target_titles.append(ll[target_lang])
            else:
                target_titles.append('')
        except:
            target_titles.append('')
    write_lines_to_file(articles_filename + '.ids', article_ids)
    write_lines_to_file(articles_filename + '.target_titles', target_titles)
    #
    # translate all sentences
    translations = get_translations(sentence_filename, lang)
    #
    # generate all images
    img_urls = generate_images(sentence_filename, img_output_dir, img_url_dir,
                               fontName)
    #
    # filter sentences that are mainly ascii
    mask = filter_sentences(sentence_filename, lang)
    #
    csv_output_file = open(csv_filename, 'w')
    header = 'lang_pair'
    for i in range(1, num_sentences_per_hit + 1):
        header += ',seg_id%s' % str(i)
        header += ',tag%s' % str(i)
        header += ',seg%s' % str(i)
        header += ',img_url%s' % str(i)
        header += ',machine_translation%s' % str(i)
    #
    # load the sentences
    sentences = read_lines_from_file(sentence_filename)
    seg_ids = read_lines_from_file(sentence_filename + '.seg_ids')
    tags = read_lines_from_file(sentence_filename + '.tags')
    mask = read_lines_from_file(sentence_filename + '.mask')
    #
    line = header
    counter = 0
    for i, sentence in enumerate(sentences):
        if (mask[i] == '1'):
            if counter % num_sentences_per_hit == 0:
                csv_output_file.write(line.encode('UTF-8'))
                csv_output_file.write('\n'.encode('UTF-8'))
                line = lang + "-" + target_lang
            counter += 1
            seg_id = seg_ids[i]
            tag = tags[i]
            sentence = format_for_csv(sentence)
            img_url = img_urls[i]
            translation = format_for_csv(translations[i])
            line += ',%s,%s,%s,%s,%s' % (seg_id, tag, sentence, img_url,
                                         translation)
    # if the last HIT is only partially filled, pad its remaining fields with a
    # do-not-translate message instead of discarding the partial line
    remainder = counter % num_sentences_per_hit
    if remainder != 0:
        dnt_url = ",,,," + img_url_dir + "/do-not-translate.png,"
        line += dnt_url * (num_sentences_per_hit - remainder)
    csv_output_file.write(line.encode('UTF-8'))
    csv_output_file.write('\n'.encode('UTF-8'))
    csv_output_file.close()
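A hedged sketch of how `write_csv_file` might be called; every file name, directory, URL, and article title below is a hypothetical placeholder rather than a value from the original project:

# Hypothetical invocation -- all paths, URLs, and article titles are placeholders.
articles = [u'Lahore', u'Karachi']   # source-language article titles (illustrative)
write_csv_file(csv_filename='translation_hits.csv',
               sentence_filename='sentences.txt',
               articles_filename='articles.txt',
               articles=articles,
               lang='ur',
               num_sentences_per_hit=10,
               img_output_dir='/tmp/hit_images',
               img_url_dir='http://example.com/hit_images',
               fontName='Times New Roman',
               target_lang='en')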
Example #6
#langs=langs.split(',')

langs=[] #list of languages represented as wikipedia prefixes e.g. xx - xx.wikipedia.org
langs=wikilanguages.load(settings["languages_file"])


print langs
lang="en"

dict={}

for word in words:
	#print "------"
	#print word
	#print "-"
	links= wikipydia.query_language_links(title=word, language=lang, limit=1000)
	#print len(links)
	for link in links:
		#print ">>", link
		
		if link in dict:
			dict[link][word]=(links[link])
		else:
			dict[link]={word:links[link]}
		#print link, " - ",links[link]
		
print "# of languages: ",len(langs)

for l in langs:
	if l in dict:
		text=l+" : "