Exemplo n.º 1
0
    def group_kana(self, string):
        """Split ``string`` into a list of parts built by ``self._new_part``.

        Characters are scanned left to right:

        * a character found in ``KanjiWord.SEP`` stops the scan; everything
          after it (alternate readings) is ignored for now,
        * consecutive kana are accumulated into the ``'base'`` of one shared
          part created with ``is_kanji=False``,
        * every other character gets a part of its own, created with
          ``is_kanji=True`` and with ``'base'`` set to that character.

        Returns the parts in the order they were encountered.
        """
        # NOTE(review): assumes _new_part returns a dict-like object whose
        # 'base' entry starts as an empty string -- confirm against _new_part.
        groups = []
        kana_run = None

        for char in string:
            if char in KanjiWord.SEP:
                # Alternate readings past the separator are not handled yet
                break

            if not kana.is_kana(char):
                # A pending kana run ends as soon as a non-kana char shows up
                if kana_run is not None:
                    groups.append(kana_run)
                    kana_run = None

                kanji_part = self._new_part(is_kanji=True)
                kanji_part['base'] = char
                groups.append(kanji_part)
                continue

            # Kana: open a run if none is active, then extend it
            if kana_run is None:
                kana_run = self._new_part(is_kanji=False)
                kana_run['is_kanji'] = False
            kana_run['base'] += char

        # Flush a kana run that reaches the end of the input
        if kana_run is not None:
            groups.append(kana_run)

        return groups
Exemplo n.º 2
0
def load_anki_data(kanji_list):
    """Return the set of kanji the deck knows about, reporting mismatches.

    ``kanji_list`` is an iterable of the kanji present in the dictionary.

    Two sets are collected from the deck: the kanji that have their own
    non-suspended ``Kanji`` card, and the non-kana characters used by
    non-suspended ``Counter`` / ``KanjiWord`` entries.  ``message`` is
    called for each of the following that is non-empty:

    * characters used in words but lacking a kanji card,
    * kanji cards that no word uses,
    * characters found in neither ``kanji_list`` nor the extras loaded
      from ``settings.EXTRA_DICT_KANJI``.

    The return value is the union of the two collected sets.
    """
    dictionary = set(kanji_list)

    # Kanji that own an active card
    with_cards = set(
        entry.kanji for entry in Kanji.all() if not entry.suspended
    )

    # Non-kana characters appearing in active counters and kanji words
    in_words = set()
    for item in Counter.all() + KanjiWord.all():
        if not item.suspended:
            in_words.update(
                char for char in item.kanji if not kana.is_kana(char)
            )

    extra = load_extra(settings.EXTRA_DICT_KANJI)
    known = with_cards | in_words

    # Used by a word, but there is no card for the kanji itself
    without_card = in_words - with_cards
    if without_card:
        message("Missing Kanji Found", ' '.join(without_card))

    # Has a card, but no word gives an example of it
    without_example = with_cards - in_words
    if without_example:
        message("Kanji with no Examples", ' '.join(without_example))

    # Present in the deck, absent from the dictionary and the extras
    not_in_dict = known - (dictionary | extra)
    if not_in_dict:
        message("Unknown Kanji, not in Dict:", ' '.join(not_in_dict))

    return known
Exemplo n.º 3
0
def load_anki_data(kanji_list):
    """Return the set of kanji the deck knows about, reporting mismatches.

    ``kanji_list`` is an iterable of the kanji present in the dictionary.

    Two sets are collected from the deck: the kanji that have their own
    non-suspended ``Kanji`` card, and the non-kana characters used by
    non-suspended ``Counter`` / ``KanjiWord`` entries.  ``message`` is
    called for each of the following that is non-empty:

    * characters used in words but lacking a kanji card,
    * kanji cards that no word uses,
    * characters found in neither ``kanji_list`` nor the extras loaded
      from ``settings.EXTRA_DICT_KANJI``.

    The return value is the union of the two collected sets.
    """
    dictionary = set(kanji_list)

    # Kanji that own an active card
    with_cards = set(
        entry.kanji for entry in Kanji.all() if not entry.suspended
    )

    # Non-kana characters appearing in active counters and kanji words
    in_words = set()
    for item in Counter.all() + KanjiWord.all():
        if not item.suspended:
            in_words.update(
                char for char in item.kanji if not kana.is_kana(char)
            )

    extra = load_extra(settings.EXTRA_DICT_KANJI)
    known = with_cards | in_words

    # Used by a word, but there is no card for the kanji itself
    without_card = in_words - with_cards
    if without_card:
        message("Missing Kanji Found", ' '.join(without_card))

    # Has a card, but no word gives an example of it
    without_example = with_cards - in_words
    if without_example:
        message("Kanji with no Examples", ' '.join(without_example))

    # Present in the deck, absent from the dictionary and the extras
    not_in_dict = known - (dictionary | extra)
    if not_in_dict:
        message("Unknown Kanji, not in Dict:", ' '.join(not_in_dict))

    return known
Exemplo n.º 4
0
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from models.kanji import Kanji
from models.kanji_word import KanjiWord
from utf8_helper import force_UTF8
import kana

import settings


if __name__ == '__main__':
    # Unsuspend every KanjiWord that is written only with kana and with
    # kanji that already have an active (non-suspended) Kanji card.
    force_UTF8()

    # First we need to read our whitelist: the kanji of all active cards
    whitelist = set(
        kanji.kanji for kanji in Kanji.all() if not kanji.suspended
    )

    # Now we filter out any KanjiWords that use other kanji.
    # Kana are never in the whitelist, so they must be skipped rather than
    # tested against it; only non-kana characters can disqualify a word.
    # (The previous check was inverted: it rejected words containing kana
    # and let words with unknown kanji through.)
    for kanji_word in KanjiWord.all():
        fine = all(
            kana.is_kana(char) or char in whitelist
            for char in kanji_word.kanji
        )

        if fine:
            kanji_word.mark_suspended(False)

Exemplo n.º 5
0
    data[key]['words'].append((word, readings))


if __name__ == '__main__':
    force_UTF8()

    missing = {}

    # Now we need to find if all the readings are found
    for word in KanjiWord.all():
        for reading in word.kanji_readings:
            try:
                kanji = Kanji.find(reading['base'])
            except KeyError:
                if kana.is_kana(reading['base']) and reading['base'] != u'ヶ':
                    raise AnkiModel.Error(u"Kana mismatch: %s word(%s) reading(%s)" % (
                        reading['base'], word.kanji, word.reading
                    ))
                else:
                    # Make sure not to do the rest of the work
                    # otherwise you'll use the previous kanji
                    continue
                    # raise AnkiModel.Error(u"Kanji not found, but in use: %s word(%s)" % (
                    #     reading['base'], word.kanji
                    # ))

            # Now that we have the kanji, check if this reading is used
            if kanji.kanji == '々':
                pass
            elif kana.all_to_hiragana(reading['reading']) in kanji.readings:
Exemplo n.º 6
0

if __name__ == '__main__':
    force_UTF8()

    args = parse()

    # Find all the kanji that are in the deck
    all_kanji = set()
    for word in KanjiWord.all():
        for kanji in word.kanji:
            all_kanji.add(kanji)
    for kanji in Kanji.all():
        all_kanji.add(kanji)

    # Count which kanji the input data has
    data = Counter(unicode(sys.stdin.read()))
    for char, count in data.most_common():
        # we don't want kana
        if kana.is_kana(char):
            del data[char]
        # Nor do we want kanji we know
        if char in all_kanji:
            del data[char]
        # Nor any non-kanji chars
        if not kana.is_kanji(char):
            del data[char]

    for char, count in data.most_common(args.count):
        print char, count
Exemplo n.º 7
0
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from models.kanji import Kanji
from models.kanji_word import KanjiWord
from utf8_helper import force_UTF8
import kana

import settings

if __name__ == '__main__':
    # Unsuspend every KanjiWord that is written only with kana and with
    # kanji that already have an active (non-suspended) Kanji card.
    force_UTF8()

    # First we need to read our whitelist: the kanji of all active cards
    whitelist = set(
        kanji.kanji for kanji in Kanji.all() if not kanji.suspended
    )

    # Now we filter out any KanjiWords that use other kanji.
    # Kana are never in the whitelist, so they must be skipped rather than
    # tested against it; only non-kana characters can disqualify a word.
    # (The previous check was inverted: it rejected words containing kana
    # and let words with unknown kanji through.)
    for kanji_word in KanjiWord.all():
        fine = all(
            kana.is_kana(char) or char in whitelist
            for char in kanji_word.kanji
        )

        if fine:
            kanji_word.mark_suspended(False)
Exemplo n.º 8
0
    data[key]['words'].append((word, readings))


if __name__ == '__main__':
    force_UTF8()

    missing = {}

    # Now we need to find if all the readings are found
    for word in KanjiWord.all():
        for reading in word.kanji_readings:
            try:
                kanji = Kanji.find(reading['base'])
            except KeyError:
                if kana.is_kana(reading['base']) and reading['base'] != u'ヶ':
                    raise AnkiModel.Error(
                        u"Kana mismatch: %s word(%s) reading(%s)" %
                        (reading['base'], word.kanji, word.reading))
                else:
                    # Make sure not to do the rest of the work
                    # otherwise you'll use the previous kanji
                    continue
                    # raise AnkiModel.Error(u"Kanji not found, but in use: %s word(%s)" % (
                    #     reading['base'], word.kanji
                    # ))

            # Now that we have the kanji, check if this reading is used
            if kanji.kanji == '々':
                pass
            elif kana.all_to_hiragana(reading['reading']) in kanji.readings:
Exemplo n.º 9
0
if __name__ == '__main__':
    force_UTF8()

    args = parse()

    # Find all the kanji that are in the deck
    all_kanji = set()
    for word in KanjiWord.all():
        for kanji in word.kanji:
            all_kanji.add(kanji)
    for kanji in Kanji.all():
        all_kanji.add(kanji)

    # Count which kanji the input data has
    data = Counter(unicode(sys.stdin.read()))
    for char, count in data.most_common():
        # we don't want kana
        if kana.is_kana(char):
            del data[char]
        # Nor do we want kanji we know
        if char in all_kanji:
            del data[char]
        # Nor any non-kanji chars
        if not kana.is_kanji(char):
            del data[char]

    for char, count in data.most_common(args.count):
        print char, count