def main():
    """Look up each command-line word in the JMdict file and report matches.

    Every word is converted with ``romaji2hiragana`` and searched with
    ``lookup``.  Each direct match is passed to ``format_entry``.  When there
    is no direct match, the candidates returned by ``guess_stem`` are looked
    up in turn and every candidate that has a match is offered as a
    suggestion; if no candidate matches either, a final "no results" message
    is printed.
    """
    parser = argparse.ArgumentParser(
        description='Look up japanese words via EDict.')
    parser.add_argument('words', nargs='*',
                        help='Words to attempt stemming on')
    args = parser.parse_args()

    # The dictionary file is resolved relative to this module's directory.
    edict_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              'JMdict_e.gz')
    dictionary = etree.parse(edict_path)

    for word in args.words:
        # Best practice: decode early, encode late.  Only byte strings need
        # decoding; arguments that are already text are used as they are
        # (calling .decode on them would fail).
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        hiragana = romaji2hiragana(word)
        results = lookup(hiragana, dictionary)

        if results:
            for result in results:
                format_entry(result)
            continue

        print(u'No initial results found for query {q}'.format(q=hiragana))

        # guess_stem always includes the query itself, so its return value
        # being non-empty says nothing; track whether any candidate actually
        # produced a dictionary hit.
        suggestion_found = False
        for alternative in guess_stem(hiragana) or ():
            if lookup(alternative, dictionary):
                print(u'Perhaps you meant {r}?'.format(r=alternative))
                suggestion_found = True

        if not suggestion_found:
            print(u'No results or possible results found for query {q}'.
                  format(q=hiragana))
def main():
    """Look up each command-line word in the JMdict file and report matches.

    Every word is converted with ``romaji2hiragana`` and searched with
    ``lookup``.  Each direct match is passed to ``format_entry``.  When there
    is no direct match, the candidates returned by ``guess_stem`` are looked
    up in turn and every candidate that has a match is offered as a
    suggestion; if no candidate matches either, a final "no results" message
    is printed.
    """
    parser = argparse.ArgumentParser(
        description='Look up japanese words via EDict.')
    parser.add_argument('words', nargs='*',
                        help='Words to attempt stemming on')
    args = parser.parse_args()

    # The dictionary file is resolved relative to this module's directory.
    edict_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              'JMdict_e.gz')
    dictionary = etree.parse(edict_path)

    for word in args.words:
        # Best practice: decode early, encode late.  Only byte strings need
        # decoding; arguments that are already text are used as they are
        # (calling .decode on them would fail).
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        hiragana = romaji2hiragana(word)
        results = lookup(hiragana, dictionary)

        if results:
            for result in results:
                format_entry(result)
            continue

        print(u'No initial results found for query {q}'.format(q=hiragana))

        # guess_stem always includes the query itself, so its return value
        # being non-empty says nothing; track whether any candidate actually
        # produced a dictionary hit.
        suggestion_found = False
        for alternative in guess_stem(hiragana) or ():
            if lookup(alternative, dictionary):
                print(u'Perhaps you meant {r}?'.format(r=alternative))
                suggestion_found = True

        if not suggestion_found:
            print(u'No results or possible results found for query {q}'.format(q=hiragana))
def guess_stem(word):
    """Given a single input word, collect candidate Japanese word stems.

    The word is stripped of surrounding whitespace, lower-cased and converted
    with ``romaji2hiragana``.  Every transform in ``AdjectivalTransforms`` and
    ``VerbalTransforms`` is then applied to the hiragana form, and each truthy
    result is kept as a candidate.

    Args:
        word: the word to analyse; must be a unicode (text) string.

    Returns:
        A tuple whose first element is the hiragana form of the input,
        followed by the candidate stems in the order they were produced,
        without duplicates.  Callers are expected to run dictionary lookups
        on all of them.

    Raises:
        NonUnicodeInputException: if ``word`` is not a unicode string.
    """
    # Ensure input is a unicode string.  The ``unicode`` builtin only exists
    # on Python 2; fall back to ``str`` where it is missing.
    try:
        text_type = unicode
    except NameError:
        text_type = str
    if not isinstance(word, text_type):
        raise NonUnicodeInputException(
            'Input argument {word} is not unicode.'.format(word=word))

    # 1. Normalise: drop surrounding whitespace and fold case.
    word = word.strip().lower()

    # 2. Convert the normalised word to hiragana.
    hiragana = romaji2hiragana(word)
    results = [hiragana]

    def _add(candidate):
        # Keep only real, not-yet-seen candidates so callers do not repeat
        # the same dictionary lookup (and the same suggestion).
        if candidate and candidate not in results:
            results.append(candidate)

    # 3. Test against adjectival endings.
    for tx in AdjectivalTransforms:
        _add(tx.Negative(hiragana))
        _add(tx.Past(hiragana))

    # 4. Test against verbal endings as well (always, regardless of whether
    #    an adjectival transform matched).
    for tx in VerbalTransforms:
        _add(tx.Polite(hiragana))
        _add(tx.Negative(hiragana))
        _add(tx.Te(hiragana))
        _add(tx.Perfect(hiragana))

    # 5. Return input word and candidate stems as a tuple, to do dictionary
    #    lookups on all.  The best hit will be the longest(?) exact match of
    #    a suggested stem.
    return tuple(results)
def guess_stem(word):
    """Given a single input word, collect candidate Japanese word stems.

    The word is stripped of surrounding whitespace, lower-cased and converted
    with ``romaji2hiragana``.  Every transform in ``AdjectivalTransforms`` and
    ``VerbalTransforms`` is then applied to the hiragana form, and each truthy
    result is kept as a candidate.

    Args:
        word: the word to analyse; must be a unicode (text) string.

    Returns:
        A tuple whose first element is the hiragana form of the input,
        followed by the candidate stems in the order they were produced,
        without duplicates.  Callers are expected to run dictionary lookups
        on all of them.

    Raises:
        NonUnicodeInputException: if ``word`` is not a unicode string.
    """
    # Ensure input is a unicode string.  The ``unicode`` builtin only exists
    # on Python 2; fall back to ``str`` where it is missing.
    try:
        text_type = unicode
    except NameError:
        text_type = str
    if not isinstance(word, text_type):
        raise NonUnicodeInputException('Input argument {word} is not unicode.'.format(word=word))

    # 1. Normalise: drop surrounding whitespace and fold case.
    word = word.strip().lower()

    # 2. Convert the normalised word to hiragana.
    hiragana = romaji2hiragana(word)
    results = [hiragana]

    def _add(candidate):
        # Keep only real, not-yet-seen candidates so callers do not repeat
        # the same dictionary lookup (and the same suggestion).
        if candidate and candidate not in results:
            results.append(candidate)

    # 3. Test against adjectival endings.
    for tx in AdjectivalTransforms:
        _add(tx.Negative(hiragana))
        _add(tx.Past(hiragana))

    # 4. Test against verbal endings as well (always, regardless of whether
    #    an adjectival transform matched).
    for tx in VerbalTransforms:
        _add(tx.Polite(hiragana))
        _add(tx.Negative(hiragana))
        _add(tx.Te(hiragana))
        _add(tx.Perfect(hiragana))

    # 5. Return input word and candidate stems as a tuple, to do dictionary
    #    lookups on all.  The best hit will be the longest(?) exact match of
    #    a suggested stem.
    return tuple(results)