예제 #1
0
def generate_all_readings(cursor, writing, reading):
    """Build the candidate readings of ``writing`` compatible with ``reading``.

    For every character of ``writing`` a set of per-character readings is
    collected, combined with the candidates built so far through
    ``append_to_all_strings`` and then filtered through ``prune_impossible``.

    Args:
        cursor: DB-API cursor used to query the ``kanji`` table
            (columns ``onyomi`` and ``kunyomi``).
        writing: iterable of characters making up the written form.
        reading: the known reading, handed to ``prune_impossible``.

    Returns:
        The set of surviving candidates. ``set([""])`` is returned when
        ``writing`` is empty or when the repetition sign has no usable
        predecessor; an empty set is returned as soon as pruning leaves no
        candidate.
    """
    result = set([""])
    previous = None
    for c in writing:
        if c == u"々":
            if not previous:
                # Kanji repetition sign used, but the previous character was
                # not a kanji
                return set([""])
            kanji = previous
        elif c in specialkanji:
            kanji = specialkanji[c]
        else:
            cursor.execute('select onyomi,kunyomi from kanji where kanji = ?',
                           (c, ))
            kanji = cursor.fetchone()
        # Remembered so that a following repetition sign reuses this entry.
        previous = kanji

        chars = set()
        if kanji:
            onyomis, kunyomis = kanji
            if onyomis:
                for onyomi in onyomis.split(" "):
                    onyomi = kata2hira(onyomi).replace("-", "")
                    chars.update(_reading_variants(onyomi))
                    # On-readings ending in つ/く also get a variant whose
                    # final kana is replaced by the small っ.
                    if len(onyomi) > 1 and onyomi.endswith((u"つ", u"く")):
                        chars.add(onyomi[:-1] + u"っ")
            if kunyomis:
                for kunyomi in kunyomis.split(" "):
                    chars.update(_reading_variants(kunyomi.replace("-", "")))
        else:
            # No kanji record: the character stands for itself.
            chars.add(kata2hira(c))
        if c in extrareadings:
            chars.update(extrareadings[c].split(" "))

        result = prune_impossible(append_to_all_strings(result, chars),
                                  reading)
        if not result:
            return set()

    return result


def _reading_variants(base):
    """Return ``base`` plus its voiced / maru first-character forms, if any."""
    variants = [base]
    for transform in (voice_first_char, maru_first_char):
        altered = transform(base)
        if altered:
            variants.append(altered)
    return variants
예제 #2
0
def suggestion_to_furigana(suggestion, writing):
    """Turn a dot-separated reading suggestion into (furigana, character) pairs.

    ``suggestion`` is split on ``"."`` and its last piece is discarded; the
    remaining pieces are paired positionally with the characters of
    ``writing``. A piece equal to ``kata2hira`` of its character is replaced
    by ``""``. The pairs are then folded with ``compact_furigana``.

    Args:
        suggestion: dot-separated reading string.
        writing: the written form, iterated character by character.

    Returns:
        The result of folding the pairs with ``compact_furigana`` starting
        from an empty list.
    """
    # ``reduce`` is a builtin only on Python 2; this import works on both.
    from functools import reduce

    parts = suggestion.split(".")[:-1]
    pairs = [(f if kata2hira(k) != f else "", k)
             for f, k in zip(parts, writing)]
    return reduce(compact_furigana, pairs, [])
예제 #3
0
def get_possible_entries(cursor, origform, reading):
    """Map entry sequence numbers to their ``freq`` value for a word.

    When ``reading`` differs from ``origform`` the ``definings`` and
    ``readings`` tables are joined on ``seq`` and matched on both values.
    If that lookup is skipped or finds nothing, ``readings`` alone is
    searched for ``kata2hira(origform)``.

    Args:
        cursor: DB-API cursor on the dictionary database.
        origform: the written form to look up.
        reading: the reading to match.

    Returns:
        dict of ``seq`` -> ``freq``; empty when neither lookup matches.
    """

    def rows_as_mapping():
        # First column is the key, second the value; later rows win.
        return {row[0]: row[1] for row in cursor.fetchall()}

    if reading != origform:
        cursor.execute(
            'select definings.seq, definings.freq from definings, readings on definings.seq = readings.seq where defining = ? and reading = ?',
            (origform, reading))
        exact = rows_as_mapping()
        if exact:
            return exact

    # Fallback: treat the written form itself as a reading.
    cursor.execute('select seq, freq from readings where reading = ?',
                   (kata2hira(origform), ))
    return rows_as_mapping()
예제 #4
0
     for keb in kebs:
         c.execute(
             "insert into definings (seq, defining, truekanji, notes, freq) values (?, ?, ?, ?, ?)",
             (seq, keb.text, 1, "\t".join(ke_inf), "\t".join(ke_pri)))
 for reading in readings:
     rebs = list(reading.getiterator("reb"))
     if len(rebs) != 1:
         print seq, len(rebs)
     re_pri = [p.text for p in list(reading.getiterator("re_pri"))]
     re_restr = [p.text for p in list(reading.getiterator("re_restr"))]
     reb = rebs[0]
     if not writings:
         re_restr = [reb.text]
     c.execute(
         "insert into readings (seq, reading, freq, restr) values (?, ?, ?, ?)",
         (seq, kata2hira(reb.text), "\t".join(re_pri), "\t".join(re_restr)))
     if not writings:
         c.execute(
             "insert into definings (seq, defining, truekanji, freq) values (?, ?, ?, ?)",
             (seq, reb.text, 0, "\t".join(re_pri)))
 for sense in senses:
     pos = [p.text for p in list(sense.getiterator("pos"))]
     misc = [p.text for p in list(sense.getiterator("misc"))]
     glosses = [
         gloss.text for gloss in list(sense.getiterator("gloss"))
         if gloss.get("{http://www.w3.org/XML/1998/namespace}lang",
                      default="eng") == "eng"
     ]
     if glosses:
         c.execute(
             "insert into senses (seq, pos, misc, gloss, lang) values (?, ?, ?, ?, ?)",
예제 #5
0
def gettext(arg, resourcedir):
    """Convert analysed tokens into display chunks with readings and glosses.

    Each element of ``arg`` is a dict with at least ``"surface"`` and
    ``"feature"`` keys, where ``"feature"`` is a comma-separated string of
    either 9 fields (pos, three subclasses, conjform, conjtype, origform,
    reading, pronunciation) or 7 fields (the same without reading and
    pronunciation). The elements are annotated in place and ``arg`` is
    consumed: elements are popped from its front, and ``arg`` is also handed
    to ``processelem``.

    Args:
        arg: mutable list of token dicts (emptied by this call).
        resourcedir: directory containing ``lib/JMDict.sqlite``.

    Returns:
        A list of tuples ``(readings, writings, glosses, pos,
        (lemma_readings, lemma_writings), conjtype, conjfunc)``; tokens whose
        pos is ``BOS/EOS`` are omitted.

    Raises:
        Exception: if a token's feature string has neither 7 nor 9 fields.
    """
    conn = sqlite3.connect(resourcedir + "/lib/JMDict.sqlite")
    try:
        return _gettext_chunks(arg, conn.cursor())
    finally:
        # Always release the database handle, including on errors.
        conn.close()


def _gettext_chunks(arg, cursor):
    """Annotate the tokens in ``arg`` and build the chunk list for gettext."""
    chunks = []

    # add features as separate members
    for elem in arg:
        features = elem["feature"].split(",")
        if len(features) not in (7, 9):
            # repr() is required: elem is a dict, and concatenating it
            # directly to a string raises TypeError instead of this message.
            raise Exception("cannot process: " + repr(elem))
        (pos, subclass1, subclass2, subclass3, conjform, conjtype,
         origform) = features[:7]
        elem["pos"] = (pos, subclass1, subclass2, subclass3, conjform)
        elem["conjform"] = conjform
        elem["conjtype"] = conjtype
        elem["origform"] = origform
        if len(features) == 9:
            elem["reading"] = kata2hira(features[7])
            elem["pronunciation"] = features[8]

    # add reading lemma
    for elem in arg:
        if "reading" in elem:
            elem["reading_lemma"] = get_reading_lemma(elem["surface"],
                                                      elem["origform"],
                                                      elem["reading"])

    # NOTE(review): the loop ends when pop(0) raises IndexError on the empty
    # list, but an IndexError raised anywhere else in the body (including
    # inside processelem) also ends it silently. Kept as is because
    # processelem receives ``arg`` and may rely on this; confirm before
    # narrowing.
    try:
        while True:
            elem = arg.pop(0)
            pos = elem["pos"][0]

            # default values
            glosses = []
            conjfunc = []
            surface = elem["surface"]
            reading = elem.get("reading")
            reading_lemma = elem.get("reading_lemma")
            conjtype = elem["conjtype"]
            origform = elem["origform"]

            if pos == u"BOS/EOS":
                continue
            elif pos == u"助詞":
                pass
            elif pos not in [u"名詞", u"動詞", u"形容詞", u"副詞", u"接頭詞", u"接続詞"]:
                pass
            elif (origform, elem["pos"]) in special_meanings:
                glosses = special_meanings[(origform, elem["pos"])]
            else:
                (glosses, conjfunc, surface, reading, reading_lemma, conjtype,
                 origform) = processelem(elem, arg, cursor)

            if reading:
                if isinstance(reading, tuple) and reading[0] == "preformatted":
                    # Preformatted readings serve for both surface and lemma.
                    combinedreadings = reading[1]
                    combinedwritings = reading[2]
                    combinedreadingslemma = reading[1]
                    combinedwritingslemma = reading[2]
                else:
                    deducedfurigana = deduce_furigana_known_reading(
                        cursor, reading, surface)
                    combinedreadings = u"\t".join(
                        [e[0] for e in deducedfurigana])
                    combinedwritings = u"\t".join(
                        [e[1] for e in deducedfurigana])
                    deducedfuriganalemma = deduce_furigana_known_reading(
                        cursor, reading_lemma, origform)
                    combinedreadingslemma = u"\t".join(
                        [e[0] for e in deducedfuriganalemma])
                    combinedwritingslemma = u"\t".join(
                        [e[1] for e in deducedfuriganalemma])
            else:
                combinedreadings = u""
                combinedwritings = surface
                combinedreadingslemma = u""
                combinedwritingslemma = origform

            if glosses:
                # Build a new list rather than insert(): glosses may be the
                # very list stored in special_meanings, and mutating it would
                # prepend another headline gloss on every call.
                glosses = [glosses[0].split("\t")[0]] + list(glosses)

            chunks.append((combinedreadings, combinedwritings, glosses, pos,
                           (combinedreadingslemma,
                            combinedwritingslemma), conjtype, conjfunc))
    except IndexError:
        pass

    return chunks