示例#1
0
def main():
    """Command-line entry point: look up each given word in the EDict dictionary.

    Every positional argument is decoded from UTF-8, converted with
    ``romaji2hiragana`` and looked up in ``JMdict_e.gz`` (expected to sit in
    the same directory as this file). Direct matches are handed to
    ``format_entry``. When there is no direct match, the candidates returned
    by ``guess_stem`` are looked up in turn and each one that has results is
    suggested; if none of them has results, a final "no results" message is
    printed.
    """
    parser = argparse.ArgumentParser(
        description='Look up japanese words via EDict.')
    parser.add_argument('words',
                        nargs='*',
                        help='Words to attempt stemming on')
    args = parser.parse_args()

    edict_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              'JMdict_e.gz')
    dictionary = etree.parse(edict_path)

    for word in args.words:
        # Best practice: decode early, encode late.
        word = word.decode('utf-8')
        hiragana = romaji2hiragana(word)
        results = lookup(hiragana, dictionary)
        if results:
            for result in results:
                format_entry(result)
            continue

        print(u'No initial results found for query {q}'.format(q=hiragana))

        # Track whether any candidate actually matched. Testing only the
        # truthiness of the candidate collection is not enough: a non-empty
        # collection whose entries all fail the lookup would leave the user
        # without a final message.
        suggestion_found = False
        for alternative in guess_stem(hiragana) or ():
            if lookup(alternative, dictionary):
                print(u'Perhaps you meant {r}?'.format(r=alternative))
                suggestion_found = True

        if not suggestion_found:
            print(u'No results or possible results found for query {q}'.
                  format(q=hiragana))
示例#2
0
文件: edict.py 项目: johnoneil/jpn
def main():
  """Command-line entry point: look up each given word in the EDict dictionary.

  Every positional argument is decoded from UTF-8, converted with
  ``romaji2hiragana`` and looked up in ``JMdict_e.gz`` (expected to sit in
  the same directory as this file). Direct matches are handed to
  ``format_entry``. When there is no direct match, the candidates returned
  by ``guess_stem`` are looked up in turn and each one that has results is
  suggested; if none of them has results, a final "no results" message is
  printed.
  """
  parser = argparse.ArgumentParser(description='Look up japanese words via EDict.')
  parser.add_argument('words', nargs='*', help='Words to attempt stemming on')
  args = parser.parse_args()

  edict_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),'JMdict_e.gz')
  dictionary = etree.parse(edict_path)

  for word in args.words:
    # Best practice: decode early, encode late.
    word = word.decode('utf-8')
    hiragana = romaji2hiragana(word)
    results = lookup(hiragana, dictionary)
    if results:
      for result in results:
        format_entry(result)
      continue

    print(u'No initial results found for query {q}'.format(q=hiragana))

    # Track whether any candidate actually matched. Testing only the
    # truthiness of the candidate collection is not enough: a non-empty
    # collection whose entries all fail the lookup would leave the user
    # without a final message.
    suggestion_found = False
    for alternative in guess_stem(hiragana) or ():
      if lookup(alternative, dictionary):
        print(u'Perhaps you meant {r}?'.format(r=alternative))
        suggestion_found = True

    if not suggestion_found:
      print(u'No results or possible results found for query {q}'.format(q=hiragana))
示例#3
0
文件: deinflect.py 项目: huzerD/jpn
def guess_stem(word):
    """Return candidate dictionary stems for a single Japanese word.

    The word is stripped of surrounding whitespace, lower-cased and converted
    with ``romaji2hiragana``. Every entry of ``AdjectivalTransforms`` and
    ``VerbalTransforms`` is then applied to the converted word, and each
    truthy result is collected as a candidate.

    Args:
        word: The word to analyse; must be a ``unicode`` string.

    Returns:
        A tuple whose first element is the hiragana form of the input,
        followed by the candidates in the order they were produced. The
        caller is expected to run dictionary lookups on all of them.

    Raises:
        NonUnicodeInputException: If ``word`` is not a ``unicode`` instance.
    """
    # Ensure input is a unicode string.
    if not isinstance(word, unicode):
        raise NonUnicodeInputException(
            'Input argument {word} is not unicode.'.format(word=word))

    # 1: Drop surrounding whitespace and normalise case.
    word = word.strip().lower()

    # 2: Convert the cleaned word to hiragana; it is always the first candidate.
    hiragana = romaji2hiragana(word)
    results = [hiragana]

    # 3: Test against adjectival endings. Each form is tried exactly once per
    # transform so that no candidate is reported twice by the same rule.
    for tx in AdjectivalTransforms:
        for deinflect in (tx.Negative, tx.Past):
            candidate = deinflect(hiragana)
            if candidate:
                results.append(candidate)

    # 4: Test against verbal endings as well (this pass always runs,
    # regardless of whether the adjectival pass produced candidates).
    for tx in VerbalTransforms:
        for deinflect in (tx.Polite, tx.Negative, tx.Te, tx.Perfect):
            candidate = deinflect(hiragana)
            if candidate:
                results.append(candidate)

    # 5: Return input word and candidate stems as a tuple, to do dictionary
    # lookups on all. The best hit will be the longest(?) exact match of a
    # suggested stem.
    return tuple(results)
示例#4
0
def guess_stem(word):
  """Return candidate dictionary stems for a single Japanese word.

  The word is stripped of surrounding whitespace, lower-cased and converted
  with ``romaji2hiragana``. Every entry of ``AdjectivalTransforms`` and
  ``VerbalTransforms`` is then applied to the converted word, and each
  truthy result is collected as a candidate.

  Args:
    word: The word to analyse; must be a ``unicode`` string.

  Returns:
    A tuple whose first element is the hiragana form of the input,
    followed by the candidates in the order they were produced. The
    caller is expected to run dictionary lookups on all of them.

  Raises:
    NonUnicodeInputException: If ``word`` is not a ``unicode`` instance.
  """
  # Ensure input is a unicode string.
  if not isinstance(word, unicode):
    raise NonUnicodeInputException('Input argument {word} is not unicode.'.format(word=word))

  # 1: Drop surrounding whitespace and normalise case.
  word = word.strip().lower()

  # 2: Convert the cleaned word to hiragana; it is always the first candidate.
  hiragana = romaji2hiragana(word)
  results = [hiragana]

  # 3: Test against adjectival endings. Each form is tried exactly once per
  # transform so that no candidate is reported twice by the same rule.
  for tx in AdjectivalTransforms:
    for deinflect in (tx.Negative, tx.Past):
      candidate = deinflect(hiragana)
      if candidate:
        results.append(candidate)

  # 4: Test against verbal endings as well (this pass always runs,
  # regardless of whether the adjectival pass produced candidates).
  for tx in VerbalTransforms:
    for deinflect in (tx.Polite, tx.Negative, tx.Te, tx.Perfect):
      candidate = deinflect(hiragana)
      if candidate:
        results.append(candidate)

  # 5: Return input word and candidate stems as a tuple, to do dictionary
  # lookups on all. The best hit will be the longest(?) exact match of a
  # suggested stem.
  return tuple(results)