예제 #1
0
 def __loadDictionary(self):
     """Export the EDICT dictionary file as a SQL script.

     Reads ``edict2u`` (UTF-8, current working directory) line by line and
     writes ``edict.sql``: table/index DDL followed by, inside a single
     transaction,

     * one ``edict_articles`` row per input line, keyed by a fresh UUID, and
     * one ``edict_lemmas`` row per headword/reading found before the first
       ``/`` (``isInflected = 'N'``), plus one row per non-empty form
       returned by ``self.calculateConjugations`` (``isInflected = 'Y'``).

     Also resets ``self.dictionaryJ2E`` to an empty dict and exposes that
     same object as ``EdictDictionary.dictionaryJ2E``.

     Raises:
         OSError: if ``edict2u`` cannot be read or ``edict.sql`` cannot be
             written.
     """
     self.dictionaryJ2E = dict()
     # Class-level alias to the very same dict object (shared, not copied).
     EdictDictionary.dictionaryJ2E = self.dictionaryJ2E

     # Everything that separates headword variants: brackets, spaces and any
     # parenthesised annotation. Compiled once instead of once per line.
     separators = re.compile(r"\[|\]| |\(.*?\)")

     # The whitespace inside both templates is reproduced verbatim so the
     # generated script stays identical to what earlier runs produced.
     base_form_sql = """insert into edict_lemmas 
                                 (lemma, 
                                 uninflectedLemma, 
                                 articleId, 
                                 isInflected) 
                             values ('{lemma}', 
                                     '{uninflectedLemma}', 
                                     '{articleId}', 
                                     'N');
                         """
     inflected_form_sql = """insert into edict_lemmas 
                                 (lemma, 
                                 uninflectedLemma, 
                                 articleId, 
                                 isInflected) 
                             values ('{lemma}', 
                                     '{uninflectedLemma}', 
                                     '{articleId}', 
                                     'Y');
                             """

     # Context managers guarantee both files are flushed and closed, even if
     # a malformed line makes the export fail half-way.
     with open("edict.sql", "w", encoding="utf8") as output:
         output.write("CREATE TABLE IF NOT EXISTS edict_lemmas (lemma, uninflectedLemma, articleId, isInflected);\n")
         output.write("CREATE TABLE IF NOT EXISTS edict_articles (id, content);\n")
         output.write("CREATE INDEX IF NOT EXISTS idx_edict_lemmas_lemma on edict_lemmas (lemma);\n")
         output.write("CREATE INDEX IF NOT EXISTS idx_edict_lemmas_uninfl_lemma on edict_lemmas (uninflectedLemma);\n")
         output.write("CREATE INDEX IF NOT EXISTS idx_edict_articles_id on edict_articles (id);\n")
         output.write("BEGIN TRANSACTION;\n")

         with open("edict2u", "r", encoding="utf8") as f:
             # Stream the file; the dictionary is large and every line is
             # processed independently.
             for line in f:
                 # Remove the trailing entry id (eg. "EntL1000920X").
                 entryIdIndex = line.rfind("/Ent")
                 if entryIdIndex != -1:
                     line = line[:entryIdIndex]

                 articleid = str(uuid.uuid1())
                 # Single quotes are doubled so the text is a valid SQL literal.
                 output.write("insert into edict_articles (id, content) values ('{id}', '{content}');\n".format(id=articleid, content=line.replace("'", "''")))

                 # Headwords/readings come before the first "/", the
                 # definitions (passed on to the conjugator) after it.
                 boundary = line.find("/")

                 kanjis = line[0:boundary].lower()
                 kanjis = separators.sub(";", kanjis)
                 kanjis = romkan.katakana_to_hiragana(kanjis)

                 for k in kanjis.split(";"):
                     if not k:
                         continue
                     escaped_k = k.replace("'", "''")
                     output.write(base_form_sql.format(lemma=escaped_k, uninflectedLemma=escaped_k, articleId=articleid))

                     conjugations = self.calculateConjugations([k], line[boundary:])

                     for c in conjugations:
                         if not c:
                             continue
                         output.write(inflected_form_sql.format(lemma=c.replace("'", "''"), uninflectedLemma=escaped_k, articleId=articleid))

         output.write("END TRANSACTION;\n")
예제 #2
0
 def normalizeInput(self, text):
     """Return *text* normalized for dictionary lookup.

     Steps, in order: drop every ASCII space, pass the result through
     ``romkan.to_hiragana``, lower-case it, then pass it through
     ``romkan.katakana_to_hiragana``.
     """
     without_spaces = text.replace(" ", "")
     converted = romkan.to_hiragana(without_spaces)
     # Lower-casing happens only after the romaji conversion step.
     lowered = converted.lower()
     return romkan.katakana_to_hiragana(lowered)