def deinflect(self, corpus, explain=False):
    '''Attempt to find the plain form of an inflected word or phrase.

    Every part of speech listed in Nihongo.STEMS is looked up as an
    attribute of Nihongo; that attribute is indexed by form name and
    yields the romaji endings ("variants") for that form.  Each ending
    that `corpus` ends with becomes a candidate.  Candidates are then
    tried one at a time: the matched ending is replaced by the part of
    speech's stem and the result is looked up with self.db.find().  The
    first candidate whose first dictionary hit reports is_pos(pos) wins.

    corpus  -- the inflected text to analyse (assumed to be kana -- TODO
               confirm against callers).
    explain -- when true, return a (word, form) tuple instead of just
               the word; form is None if nothing could be de-inflected.

    Returns the de-inflected word, or `corpus` unchanged when no
    candidate is confirmed by the dictionary.
    '''
    candidates = []
    for pos in Nihongo.STEMS:
        # Check before fetching: a missing attribute must be reported and
        # skipped, not allowed to raise AttributeError.
        if not hasattr(Nihongo, pos):
            print('no Nihongo attrib for pos %s' % pos)
            continue
        forms = getattr(Nihongo, pos)
        stem = romaji.kana(Nihongo.STEMS[pos])
        for form in forms:
            for variant in forms[form]:
                kana = romaji.kana(variant)
                if corpus.endswith(kana):
                    candidates.append(
                        (pos, variant, len(kana), kana, stem, form))

    # Stable sort, longest ending first.  Because candidates are popped
    # from the END of the list, the shortest ending is tried first.
    # NOTE(review): this keeps the original trial order; confirm that
    # shortest-first (rather than longest-first) is what is wanted.
    candidates.sort(key=lambda cand: cand[2], reverse=True)
    while candidates:
        pos, _variant, length, _kana, stem, form = candidates.pop()
        # Slice by explicit end index: corpus[:-0] would be '' and would
        # discard the whole corpus for a zero-length ending.
        word = corpus[:len(corpus) - length] + stem
        found = self.db.find(word)
        if found and found[0].is_pos(pos):
            if explain:
                return (word, form)
            return word

    # No candidate at all, or none confirmed by the dictionary.
    if explain:
        return (corpus, None)
    return corpus
def katakanaize(hiragana):
    """
    Return the katakana rendering of a hiragana string.

    The conversion takes a detour: the hiragana is first turned into
    romaji, the romaji is upper-cased, and the upper-cased romaji is
    then converted back to kana.
    """
    latin = romaji.roma(hiragana)
    upper_latin = latin.upper()
    return romaji.kana(upper_latin)
def find(self, word, roma=True, kana=True, kanji=True):
    '''Look up dictionary entries equal to `word` (kana or kanji).

    word  -- the text to search for; a false value (None, '') yields None.
    roma  -- when true and is_other() accepts the first character, the
             word is converted with romaji.kana() and the kana search is
             forced on (presumably this detects romaji input -- confirm
             against is_other).
    kana  -- search the reading index (self._reb0).
    kanji -- search the kanji index (self._keb0).

    Both indexes are keyed by the first character of the word.  Kanji
    matches come first in the returned list, followed by kana matches.
    '''
    if not word:
        return None

    if roma and is_other(word[0]):
        word = romaji.kana(word)
        kana = True

    matches = []
    for enabled, index_name in ((kanji, '_keb0'), (kana, '_reb0')):
        if not enabled:
            continue
        initial = word[0]
        index = getattr(self, index_name)
        if initial not in index:
            continue
        for entry in index[initial]:
            if entry == word:
                matches.append(entry)

    # search by english words in meanings is not yet supported
    return matches
try: import psyco psyco.full() except: print '(no psyco acceleration available)' pass import os sys.stdout = codecs.lookup('utf-8')[-1](sys.stdout) # load the dictionary db = JmDict('JMdict_e') # self-test: look up a common word if False: term = romaji.kana('konnichiha') entries = db.find(term) print term, '=>' for e in entries: print u' %s' % e for k in e.keb: print u' kanji: %s' % k for r in e.reb: print u' kana: %s' % r if e.is_uk(): print 'usually kana' print # prepare for dictionary work corpus = [] source = 'corpus.txt' xl = Nihongo(db) # self-test: de-inflect some simple words via dictionary