def __loadDictionary(self):
    """Convert the dictionary file "edict2u" into the SQL script "edict.sql".

    Resets ``self.dictionaryJ2E`` to an empty dict and publishes the same
    object as ``EdictDictionary.dictionaryJ2E``; this method itself does not
    add any entries to it.

    The generated script contains, in order:

    * ``CREATE TABLE`` / ``CREATE INDEX`` statements for ``edict_lemmas``
      and ``edict_articles``;
    * ``BEGIN TRANSACTION``;
    * for every input line, one ``edict_articles`` row keyed by a fresh
      ``uuid.uuid1()`` value, one ``edict_lemmas`` row per non-empty token
      found before the first "/" (``isInflected = 'N'``), and one
      ``edict_lemmas`` row per non-empty form returned by
      ``self.calculateConjugations`` for that token (``isInflected = 'Y'``);
    * ``END TRANSACTION``.

    Both files are opened relative to the current working directory.

    Raises:
        OSError: if "edict.sql" cannot be opened for writing or "edict2u"
            cannot be opened for reading.
    """

    def sql_quote(value):
        # Escape for an SQL single-quoted literal by doubling embedded quotes.
        return value.replace("'", "''")

    self.dictionaryJ2E = {}
    EdictDictionary.dictionaryJ2E = self.dictionaryJ2E  # make a static copy

    # Square brackets, spaces and parenthesised annotations are all turned
    # into ";" so the text before the first "/" can be split into tokens.
    # Compiled once because it is applied to every input line.
    separators = re.compile(r"\[|\]| |\(.*?\)")

    schema = (
        "CREATE TABLE IF NOT EXISTS edict_lemmas (lemma, uninflectedLemma, articleId, isInflected);\n",
        "CREATE TABLE IF NOT EXISTS edict_articles (id, content);\n",
        "CREATE INDEX IF NOT EXISTS idx_edict_lemmas_lemma on edict_lemmas (lemma);\n",
        "CREATE INDEX IF NOT EXISTS idx_edict_lemmas_uninfl_lemma on edict_lemmas (uninflectedLemma);\n",
        "CREATE INDEX IF NOT EXISTS idx_edict_articles_id on edict_articles (id);\n",
    )
    article_insert = (
        "insert into edict_articles (id, content) values ('{id}', '{content}');\n"
    )
    # NOTE(review): lemma statements end with a space rather than a newline;
    # SQL treats both as whitespace between statements.
    lemma_insert = (
        "insert into edict_lemmas (lemma, uninflectedLemma, articleId, isInflected) "
        "values ('{lemma}', '{uninflectedLemma}', '{articleId}', '{isInflected}'); "
    )

    # The context managers guarantee both files are closed (and the script
    # fully flushed) even if processing a line raises.
    with open("edict.sql", "w", encoding="utf8") as output:
        output.writelines(schema)
        output.write("BEGIN TRANSACTION;\n")

        with open("edict2u", "r", encoding="utf8") as f:
            # Iterate lazily instead of loading the whole file into memory.
            for line in f:
                # remove entry id (eg. "EntL1000920X")
                entry_id_index = line.rfind("/Ent")
                if entry_id_index != -1:
                    line = line[:entry_id_index]

                article_id = str(uuid.uuid1())
                output.write(
                    article_insert.format(id=article_id, content=sql_quote(line))
                )

                # NOTE(review): a line without "/" gives boundary == -1, so
                # the slices below drop/keep only its last character --
                # confirm such lines cannot occur in the input.
                boundary = line.find("/")
                tokens = separators.sub(";", line[:boundary].lower())
                tokens = romkan.katakana_to_hiragana(tokens)
                remainder = line[boundary:]

                for k in tokens.split(";"):
                    if k == "":
                        continue
                    quoted_k = sql_quote(k)
                    output.write(
                        lemma_insert.format(
                            lemma=quoted_k,
                            uninflectedLemma=quoted_k,
                            articleId=article_id,
                            isInflected="N",
                        )
                    )
                    for c in self.calculateConjugations([k], remainder):
                        if c == "":
                            continue
                        output.write(
                            lemma_insert.format(
                                lemma=sql_quote(c),
                                uninflectedLemma=quoted_k,
                                articleId=article_id,
                                isInflected="Y",
                            )
                        )

        output.write("END TRANSACTION;\n")
def normalizeInput(self, text):
    """Return *text* normalised for lookup.

    Steps, in order: strip every space character, pass the result through
    ``romkan.to_hiragana``, lower-case it, then pass it through
    ``romkan.katakana_to_hiragana``.
    """
    without_spaces = text.replace(" ", "")
    converted = romkan.to_hiragana(without_spaces)
    # Lower-casing happens after the first conversion, before the katakana pass.
    lowered = converted.lower()
    return romkan.katakana_to_hiragana(lowered)