Exemplo n.º 1
0
def add_words_dict(word, pinyin, freq):
    """Register a new ``(pinyin, freq)`` pronunciation for ``word``.

    ``words_dict`` maps each word to a ``(status, pinyins)`` pair, where
    ``pinyins`` is a list of ``(pinyin, freq)`` tuples.  A word seen for the
    first time is stored with the ``Untouched`` status.  For a word that is
    already present, the entry must still be ``Untouched`` and must not
    already contain ``pinyin``; both conditions are enforced with asserts.
    """
    # assume all tones are already removed
    assert pinyin == strip_tone(pinyin)

    # First occurrence of the word: create its entry and stop.
    if word not in words_dict:
        words_dict[word] = (Untouched, [(pinyin, freq)])
        return

    status, pinyins = words_dict[word]
    assert Untouched == status

    # The same pinyin must never be registered twice for one word.
    assert all(known != pinyin for known, _ in pinyins)

    pinyins.append((pinyin, freq))
Exemplo n.º 2
0
def add_words_dict(word, pinyin, freq):
    """Register a new ``(pinyin, freq)`` pronunciation for ``word``.

    ``words_dict`` maps each word to a ``(status, pinyins)`` pair, where
    ``pinyins`` is a list of ``(pinyin, freq)`` tuples.  A word seen for the
    first time is stored with the ``Untouched`` status.  For a word that is
    already present, the entry must still be ``Untouched`` and must not
    already contain ``pinyin``; both conditions are enforced with asserts.
    """
    # assume all tones are already removed
    assert pinyin == strip_tone(pinyin)

    # First occurrence of the word: create its entry and stop.
    if word not in words_dict:
        words_dict[word] = (Untouched, [(pinyin, freq)])
        return

    status, pinyins = words_dict[word]
    assert Untouched == status

    # The same pinyin must never be registered twice for one word.
    assert all(known != pinyin for known, _ in pinyins)

    pinyins.append((pinyin, freq))
Exemplo n.º 3
0
def handle_pinyin(outfile, word, num, pinyin):
    """Write one ``word<TAB>pinyin<TAB>freq`` line to ``outfile``.

    ``pinyin`` is either a bare pinyin string or ``"<pinyin>:<number>%"``.

    * Without a ``":"`` the frequency is ``total_frequency / num``.
    * With a ``":"`` the text after it must end in ``"%"``; the number in
      front of the ``"%"`` is parsed as a float and multiplied by
      ``total_frequency``, and only the part before the ``":"`` is written
      as the pinyin.

    The frequency is truncated to an int and raised to at least ``minimum``
    before being written.

    Raises:
        AssertionError: if ``strip_tone`` would change ``pinyin`` or the
            part after ``":"`` does not end with ``"%"``.
        ValueError: if the number in front of ``"%"`` is not a valid float.
        ZeroDivisionError: if ``num`` is zero and ``pinyin`` has no ``":"``.
    """
    # no tones in opengram dictionary
    assert strip_tone(pinyin) == pinyin

    if ":" not in pinyin:
        freq = total_frequency / num
    else:
        pinyin, percent = pinyin.split(":", 1)
        assert percent.endswith("%")
        # NOTE(review): the number is used as-is, not divided by 100 --
        # confirm this matches the scale of total_frequency.
        freq = total_frequency * float(percent.rstrip("%"))

    freq = max(int(freq), minimum)
    oneline = "\t".join((word, pinyin, str(freq)))
    # Use "\n" rather than os.linesep: a text-mode file already translates
    # "\n" to the platform line ending, so writing os.linesep would produce
    # "\r\r\n" on Windows.
    outfile.writelines([oneline, "\n"])
Exemplo n.º 4
0
def add_words_dict(word, pinyin, freq):
    """Accumulate ``freq`` for ``word`` under its tone-stripped ``pinyin``.

    ``words_dict`` maps each word to a list of ``(pinyin, freq)`` tuples.
    The pinyin is passed through ``strip_tone`` first.  If the word already
    has an entry with the same stripped pinyin, the frequencies are summed
    into that entry and a ``Collapse`` message is printed; otherwise a new
    ``(pinyin, freq)`` tuple is appended.
    """
    pinyin = strip_tone(pinyin)

    # First occurrence of the word: start its pronunciation list and stop.
    if word not in words_dict:
        words_dict[word] = [(pinyin, freq)]
        return

    entries = words_dict[word]
    merged = False
    for index, (known_pinyin, known_freq) in enumerate(entries):
        if known_pinyin != pinyin:
            continue
        # print out the collapsed word and pinyin pair
        print('Collapse: {0} and {1}'.format(word, pinyin))
        freq += known_freq
        entries[index] = (pinyin, freq)
        merged = True

    # No existing entry matched, so this is a new pronunciation.
    if not merged:
        entries.append((pinyin, freq))
Exemplo n.º 5
0
def handle_pinyin(outfile, word, num, pinyin):
    """Write one ``word<TAB>pinyin<TAB>freq`` line to ``outfile``.

    ``pinyin`` is either a bare pinyin string or ``"<pinyin>:<number>%"``.

    * Without a ``":"`` the frequency is ``total_frequency / num``.
    * With a ``":"`` the text after it must end in ``"%"``; the number in
      front of the ``"%"`` is parsed as a float and multiplied by
      ``total_frequency``, and only the part before the ``":"`` is written
      as the pinyin.

    The frequency is truncated to an int and raised to at least ``minimum``
    before being written.

    Raises:
        AssertionError: if ``strip_tone`` would change ``pinyin`` or the
            part after ``":"`` does not end with ``"%"``.
        ValueError: if the number in front of ``"%"`` is not a valid float.
        ZeroDivisionError: if ``num`` is zero and ``pinyin`` has no ``":"``.
    """
    # no tones in opengram dictionary
    assert strip_tone(pinyin) == pinyin

    if ":" not in pinyin:
        freq = total_frequency / num
    else:
        pinyin, percent = pinyin.split(":", 1)
        assert percent.endswith("%")
        # NOTE(review): the number is used as-is, not divided by 100 --
        # confirm this matches the scale of total_frequency.
        freq = total_frequency * float(percent.rstrip("%"))

    freq = max(int(freq), minimum)
    oneline = "\t".join((word, pinyin, str(freq)))
    # Use "\n" rather than os.linesep: a text-mode file already translates
    # "\n" to the platform line ending, so writing os.linesep would produce
    # "\r\r\n" on Windows.
    outfile.writelines([oneline, "\n"])
Exemplo n.º 6
0
def add_words_dict(word, pinyin, freq):
    """Accumulate ``freq`` for ``word`` under its tone-stripped ``pinyin``.

    ``words_dict`` maps each word to a list of ``(pinyin, freq)`` tuples.
    The pinyin is passed through ``strip_tone`` first.  If the word already
    has an entry with the same stripped pinyin, the frequencies are summed
    into that entry and a ``Collapse`` message is printed; otherwise a new
    ``(pinyin, freq)`` tuple is appended.
    """
    pinyin = strip_tone(pinyin)

    # First occurrence of the word: start its pronunciation list and stop.
    if word not in words_dict:
        words_dict[word] = [(pinyin, freq)]
        return

    entries = words_dict[word]
    merged = False
    for index, (known_pinyin, known_freq) in enumerate(entries):
        if known_pinyin != pinyin:
            continue
        # print out the collapsed word and pinyin pair
        print('Collapse: {0} and {1}'.format(word, pinyin))
        freq += known_freq
        entries[index] = (pinyin, freq)
        merged = True

    # No existing entry matched, so this is a new pronunciation.
    if not merged:
        entries.append((pinyin, freq))