def select_translation(sentence, idx, word, translations):
    """Pick one translation for ``word`` at position ``idx`` of ``sentence``.

    Tokens (``word`` and every item of ``sentence``) are indexed as
    ``token[0]`` = surface text and ``token[1]`` = part-of-speech tag.
    Candidates in ``translations`` are indexed the same way:
    ``candidate[0]`` = translated text, ``candidate[1]`` = its tag.

    Rules are tried in order:

    1. A pronoun (tag ``'r'``) listed in ``subject_pronoun`` that is the
       first token, or follows a token tagged ``'x'`` or ``'c'``, is
       returned in its subject form with tag ``'pron'``.
    2. A token tagged ``'m'`` that matches ``DIGITS_PATTERN`` becomes a
       date string (via ``translate_date``) when the next token is the
       day marker, otherwise the digits themselves; both with tag ``'n'``.
    3. The day marker itself (tag ``'m'``) directly after a token that
       matches ``DIGITS_PATTERN`` yields ``('', '')``, because rule 2
       already produced the date for the preceding token.
    4. Otherwise, among the candidates whose tag satisfies ``match_pos``
       against the word's tag, the one with the highest ``NGram().get``
       value for its text is returned; if no candidate matches, the
       first entry of ``translations`` is returned.

    Raises:
        IndexError: if rule 4 finds no matching candidate and
            ``translations`` is empty.
    """
    text, pos = word[0], word[1]

    # Make sure the subject pronoun is in subject form.
    # Heuristic: if it is the first word, or the previous word is
    # punctuation or a conjunction, it is considered a subject.
    if pos == 'r' and text in subject_pronoun:
        if idx == 0 or sentence[idx - 1][1] in ('x', 'c'):
            return (subject_pronoun[text], 'pron')

    # Handle the special case: <digits>/m 日/m
    if pos == 'm':
        if DIGITS_PATTERN.match(text):
            if idx + 1 < len(sentence) and sentence[idx + 1][0] == u'日':
                # Digits followed by the day marker: emit a proper date.
                return (translate_date(int(text)), 'n')
            # Stand-alone digits are passed through unchanged.
            return (text, 'n')
        if text == u'日':
            # Symmetric case: the preceding digits already produced the
            # date, so the marker contributes nothing.  The original code
            # indexed with an undefined name ``i`` here; ``idx`` is the
            # position of the current word.
            if idx > 0 and DIGITS_PATTERN.match(sentence[idx - 1][0]):
                return ('', '')

    # Keep only the translations whose tag is compatible with the word's.
    # A list (rather than ``filter``) is used so the emptiness check below
    # also works on Python 3, where ``filter`` returns a lazy iterator.
    same_pos_translations = [
        candidate for candidate in translations
        if match_pos(pos, candidate[1])
    ]
    ng = NGram()
    if same_pos_translations:
        # Prefer the candidate with the highest unigram score.
        return max(same_pos_translations,
                   key=lambda candidate: ng.get(candidate[0]))

    # No candidate with a matching tag: fall back to the first one.
    return translations[0]