def generate_all_readings(cursor, writing, reading):
    """Generate all plausible kana readings for `writing`, pruned against `reading`.

    cursor -- sqlite cursor over a `kanji` table with onyomi/kunyomi columns.
    writing -- the written form (kanji/kana mix) to expand.
    reading -- the known full reading, used by prune_impossible() to discard
               candidates that cannot be a prefix of it.

    Returns a set of candidate reading strings; empty set when no candidate
    survives pruning.  Relies on module-level helpers/tables: specialkanji,
    extrareadings, kata2hira, voice_first_char, maru_first_char,
    append_to_all_strings, prune_impossible.
    """
    result = set([""])
    previous = None
    for c in writing:
        # Candidate readings for this single character.
        chars = set()
        if c == u"々":
            if not previous:
                # Kanji repetition sign used, but the previous character was not a kanji
                return set([""])
            kanji = previous
        elif c in specialkanji:
            kanji = specialkanji[c]
        else:
            cursor.execute('select onyomi,kunyomi from kanji where kanji = ?', (c, ))
            kanji = cursor.fetchone()
        previous = kanji
        if kanji:
            (onyomis, kunyomis) = kanji
            if onyomis:
                for onyomi in onyomis.split(" "):
                    onyomi = kata2hira(onyomi).replace("-", "")
                    chars.add(onyomi)
                    # Also admit rendaku (voiced) and handakuten variants.
                    voiced = voice_first_char(onyomi)
                    if voiced:
                        chars.add(voiced)
                    maruchar = maru_first_char(onyomi)
                    if maruchar:
                        chars.add(maruchar)
                    # つ/く endings commonly gemminate before the next kanji
                    # (e.g. がく + こう -> がっこう).
                    if len(onyomi) > 1 and (onyomi.endswith(u"つ") or onyomi.endswith(u"く")):
                        chars.add(onyomi[0:-1] + u"っ")
            if kunyomis:
                for kunyomi in kunyomis.split(" "):
                    kunyomi = kunyomi.replace("-", "")
                    chars.add(kunyomi)
                    voiced = voice_first_char(kunyomi)
                    if voiced:
                        chars.add(voiced)
                    maruchar = maru_first_char(kunyomi)
                    if maruchar:
                        chars.add(maruchar)
        else:
            # Not a kanji: the character reads as itself (katakana folded
            # to hiragana).
            chars.add(kata2hira(c))
        if c in extrareadings:
            for extrareading in extrareadings[c].split(" "):
                chars.add(extrareading)
        result = append_to_all_strings(result, chars)
        # Drop candidates that can no longer match `reading`; bail out early
        # once nothing survives.
        result = prune_impossible(result, reading)
        if not result:
            return set()
    return result
def suggestion_to_furigana(suggestion, writing): parts = suggestion.split(".")[0:-1] furigana = list(zip(parts, list(writing))) furigana = [(f if kata2hira(k) != f else "", k) for (f, k) in furigana] cfurigana = reduce(compact_furigana, furigana, []) if furigana != cfurigana: furigana = cfurigana return furigana
def get_possible_entries(cursor, origform, reading): seqs = dict() if reading != origform: cursor.execute( 'select definings.seq, definings.freq from definings, readings on definings.seq = readings.seq where defining = ? and reading = ?', (origform, reading)) seqs = dict([(row[0], row[1]) for row in cursor.fetchall()]) if not seqs: cursor.execute('select seq, freq from readings where reading = ?', (kata2hira(origform), )) seqs = dict([(row[0], row[1]) for row in cursor.fetchall()]) return seqs
# NOTE(review): this is a fragment of a larger loop over JMDict <entry>
# elements -- kebs, seq, readings, writings, senses, ke_inf, ke_pri and the
# db cursor `c` are bound outside the visible chunk; the final c.execute()
# call is truncated here (its argument tuple continues beyond this chunk).
# Insert one row per written form (keb) of the entry.
for keb in kebs:
    c.execute(
        "insert into definings (seq, defining, truekanji, notes, freq) values (?, ?, ?, ?, ?)",
        (seq, keb.text, 1, "\t".join(ke_inf), "\t".join(ke_pri)))
for reading in readings:
    rebs = list(reading.getiterator("reb"))
    if len(rebs) != 1:
        # Unexpected: a <r_ele> should carry exactly one <reb>; log the seq.
        print seq, len(rebs)
    re_pri = [p.text for p in list(reading.getiterator("re_pri"))]
    re_restr = [p.text for p in list(reading.getiterator("re_restr"))]
    reb = rebs[0]
    if not writings:
        # Kana-only entry: the reading is restricted to itself.
        re_restr = [reb.text]
    c.execute(
        "insert into readings (seq, reading, freq, restr) values (?, ?, ?, ?)",
        (seq, kata2hira(reb.text), "\t".join(re_pri), "\t".join(re_restr)))
    if not writings:
        # Kana-only entry doubles as its own "defining" (truekanji = 0).
        c.execute(
            "insert into definings (seq, defining, truekanji, freq) values (?, ?, ?, ?)",
            (seq, reb.text, 0, "\t".join(re_pri)))
for sense in senses:
    pos = [p.text for p in list(sense.getiterator("pos"))]
    misc = [p.text for p in list(sense.getiterator("misc"))]
    # Keep only English glosses (xml:lang absent defaults to "eng").
    glosses = [
        gloss.text for gloss in list(sense.getiterator("gloss"))
        if gloss.get("{http://www.w3.org/XML/1998/namespace}lang", default="eng") == "eng"
    ]
    if glosses:
        c.execute(
            "insert into senses (seq, pos, misc, gloss, lang) values (?, ?, ?, ?, ?)",
def gettext(arg, resourcedir):
    """Turn a list of morphological-analyser tokens into annotated chunks.

    arg -- list of token dicts, each with at least "surface" and a
           comma-separated "feature" string of 7 or 9 fields (MeCab/IPAdic
           style output -- TODO confirm).  Consumed destructively.
    resourcedir -- directory containing lib/JMDict.sqlite.

    Returns a list of tuples:
        (readings, writings, glosses, pos,
         (lemma readings, lemma writings), conjtype, conjfunc)
    Raises Exception on a token with an unexpected feature-field count.
    Relies on module-level helpers: kata2hira, get_reading_lemma,
    special_meanings, processelem, deduce_furigana_known_reading.
    """
    conn = sqlite3.connect(resourcedir + "/lib/JMDict.sqlite")
    cursor = conn.cursor()
    chunks = []
    # Unpack the comma-separated feature string into separate dict members.
    for elem in arg:
        features = elem["feature"].split(",")
        if len(features) == 9:
            (pos, subclass1, subclass2, subclass3, conjform, conjtype,
             origform, reading, pronunciation) = features
            elem["pos"] = (pos, subclass1, subclass2, subclass3, conjform)
            elem["conjform"] = conjform
            elem["conjtype"] = conjtype
            elem["origform"] = origform
            elem["reading"] = kata2hira(reading)
            elem["pronunciation"] = pronunciation
        elif len(features) == 7:
            # Short variant without reading/pronunciation fields.
            (pos, subclass1, subclass2, subclass3, conjform, conjtype,
             origform) = features
            elem["pos"] = (pos, subclass1, subclass2, subclass3, conjform)
            elem["conjform"] = conjform
            elem["conjtype"] = conjtype
            elem["origform"] = origform
        else:
            # BUG FIX: str + dict raised TypeError and masked the intended
            # message; repr() the offending token instead.
            raise Exception("cannot process: " + repr(elem))
    # Attach the dictionary-form (lemma) reading where a reading is known.
    for elem in arg:
        if "reading" in elem:
            elem["reading_lemma"] = get_reading_lemma(
                elem["surface"], elem["origform"], elem["reading"])
    try:
        # Consume tokens from the front; processelem() may pop further tokens
        # from `arg` to merge multi-token words.  The loop terminates when
        # pop(0) raises IndexError on the emptied list.
        while True:
            elem = arg.pop(0)
            (pos, subclass1, subclass2, subclass3, conjform) = elem["pos"]
            # default values
            glosses, conjfunc, surface, reading, reading_lemma, conjtype, origform = (
                [], [], elem["surface"], elem.get("reading"),
                elem.get("reading_lemma"), elem["conjtype"], elem["origform"])
            if pos == u"BOS/EOS":
                continue
            elif pos == u"助詞":
                pass
            elif pos not in [u"名詞", u"動詞", u"形容詞", u"副詞", u"接頭詞", u"接続詞"]:
                pass
            elif (origform, elem["pos"]) in special_meanings:
                glosses = special_meanings[(origform, elem["pos"])]
            else:
                glosses, conjfunc, surface, reading, reading_lemma, conjtype, origform = processelem(
                    elem, arg, cursor)
            if reading:
                if isinstance(reading, tuple) and reading[0] == "preformatted":
                    # processelem already formatted the furigana; the lemma
                    # columns reuse the surface values here.
                    combinedreadings = reading[1]
                    combinedwritings = reading[2]
                    combinedreadingslemma = reading[1]
                    combinedwritingslemma = reading[2]
                else:
                    deducedfurigana = deduce_furigana_known_reading(
                        cursor, reading, surface)
                    combinedreadings = u"\t".join(
                        [e[0] for e in deducedfurigana])
                    combinedwritings = u"\t".join(
                        [e[1] for e in deducedfurigana])
                    deducedfuriganalemma = deduce_furigana_known_reading(
                        cursor, reading_lemma, origform)
                    combinedreadingslemma = u"\t".join(
                        [e[0] for e in deducedfuriganalemma])
                    combinedwritingslemma = u"\t".join(
                        [e[1] for e in deducedfuriganalemma])
            else:
                # No reading known (e.g. punctuation or symbols).
                combinedreadings = u""
                combinedwritings = surface
                combinedreadingslemma = u""
                combinedwritingslemma = origform
            if glosses:
                # Duplicate the first tab-separated gloss at the front --
                # presumably a short summary entry for the UI; TODO confirm.
                firstgloss = glosses[0].split("\t")[0]
                glosses.insert(0, firstgloss)
            chunks.append((combinedreadings, combinedwritings, glosses, pos,
                           (combinedreadingslemma, combinedwritingslemma),
                           conjtype, conjfunc))
    except IndexError:
        pass
    # BUG FIX: the connection was previously leaked; close it before returning.
    conn.close()
    return chunks