Пример #1
0
class JdsInfoHandler:
    """Compute derived per-character information and push it to the database.

    ``self.chars`` is a mapping from a character uid (its Unicode code point,
    as produced by ``ord``) to a character object exposing ``value``,
    ``jlpt``, ``jouyou``, ``count()`` and ``set_count()``.
    """

    # Cumulative-frequency thresholds consulted while assigning ``jdpt``;
    # they are indexed by the "current level" counter in ``_assign_jdpt``.
    _JDPT_LIMITS = (1, 0.99, 0.98, 0.95, 0.9, 0.75, 0.5)

    def __init__(self, argv):
        """Parse ``argv`` and load every known character with its count."""
        self.args = parse_args(argv)
        self.db = JdsDatabase()
        self.chars = self.db.get_all_chars_with_count()
        # Filled by update_kanji_pos(): uid -> count for the merged drama.
        self.total_count = None

    def reset(self):
        """Reset the info stored in the database and prepare it for ``self.chars``."""
        self.db.reset_info()
        self.db.prepare_info(self.chars)

    def update_jlpt_joyo(self):
        """Read JLPT and jouyou levels from their CSV files and push them.

        Both files are read from the current working directory and contain
        ``<character>;<level>`` rows. Characters missing from ``self.chars``
        are created and pushed to the database before their level is set.
        """
        self._load_levels('jlpt_kanji.csv', 'jlpt')
        self._load_levels('jouyou_kanji.csv', 'jouyou')
        self.db.push_kanji_jlpt_joyo(self.chars)

    def _load_levels(self, csv_path, attr_name):
        """Set ``attr_name`` on each character listed in ``csv_path``."""
        with open(csv_path, mode='r', encoding='utf-8') as csv_file:
            for row in csv.reader(csv_file, delimiter=';'):
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing
                    # newline); there is nothing to record for those.
                    continue
                uid = ord(row[0])
                level = int(row[1])
                if uid not in self.chars:
                    new_char = JdsChar(chr(uid))
                    self.chars[uid] = new_char
                    self.db.push_char(new_char)
                setattr(self.chars[uid], attr_name, level)

    def compute_pos_dict(self, jlpt_or_jouyou):
        """Group characters by level, each group sorted by descending count.

        Args:
            jlpt_or_jouyou: ``'jlpt'`` to group by ``char.jlpt``; any other
                value groups by ``char.jouyou``.

        Returns:
            A dict mapping each level to the list of characters at that
            level, ordered by ``char.count()`` from highest to lowest.
        """
        # Compare by value: an identity test (`is`) against a string literal
        # only succeeds when the argument happens to be the same interned
        # object, so an equal string built at runtime was treated as jouyou.
        attr_name = 'jlpt' if jlpt_or_jouyou == 'jlpt' else 'jouyou'

        chars_by_level = {}
        for char in self.chars.values():
            chars_by_level.setdefault(getattr(char, attr_name), []).append(char)

        for group in chars_by_level.values():
            group.sort(key=methodcaller('count'), reverse=True)
        return chars_by_level

    def update_kanji_pos(self):
        """Compute positions and frequencies for every kanji and push them.

        Steps, in order: load the merged-drama counts, rank characters by
        JLPT level and by jouyou level, derive frequency-based ``jdpt`` data,
        push the positions, then compute and push per-drama and per-episode
        frequencies.
        """
        # get all char count
        self.total_count = self.db.get_count_for_drama(
            JdsDatabase.get_merged_drama())

        # JLPT positions run from the highest level key down to 1; jouyou
        # positions from 1 upwards. Both ranges skip key 0 and assume the
        # keys are exactly 0..len-1 -- TODO confirm against JdsChar defaults.
        jlpt_dict = self.compute_pos_dict('jlpt')
        self._assign_positions(
            jlpt_dict, range(len(jlpt_dict) - 1, 0, -1), 'jlpt_pos')

        jouyou_dict = self.compute_pos_dict('jouyou')
        self._assign_positions(
            jouyou_dict, range(1, len(jouyou_dict)), 'jouyou_pos')

        self._assign_jdpt()
        self.db.push_kanji_pos(self.chars)

        self._push_drama_and_episode_freq()

    @staticmethod
    def _assign_positions(chars_by_level, levels, attr_name):
        """Number characters 1, 2, 3... walking ``levels`` in the given order."""
        position = 1
        for level in levels:
            for char in chars_by_level[level]:
                setattr(char, attr_name, position)
                position += 1

    def _assign_jdpt(self):
        """Set jdpt_pos, freq, cumul_freq, count and jdpt on every kanji."""
        # Kanji uids ordered by descending merged-drama count.
        kanji_uids = [
            uid
            for uid in sorted(self.total_count,
                              key=self.total_count.get,
                              reverse=True)
            if is_kanji(self.chars[uid].value)
        ]

        # find sum of all kanji count
        sum_all_count = sum(self.chars[uid].count() for uid in kanji_uids)

        limits = self._JDPT_LIMITS
        cur_level = len(limits) - 1
        cumul_freq = 0
        for position, uid in enumerate(kanji_uids, start=1):
            char = self.chars[uid]
            count = self.total_count[uid]
            freq = count / sum_all_count
            cumul_freq += freq

            char.jdpt_pos = position
            char.freq = freq
            char.cumul_freq = cumul_freq
            char.set_count(count)

            # Step down at most one level per character once the cumulative
            # frequency exceeds the current limit; jdpt is only written while
            # the cumulative frequency is strictly below the (new) limit.
            # NOTE(review): a character that crosses two limits at once, or
            # lands exactly on a limit, keeps its previous jdpt -- confirm
            # this is intended.
            if cumul_freq > limits[cur_level]:
                cur_level -= 1
            if cumul_freq < limits[cur_level]:
                char.jdpt = cur_level - 1

    def _push_drama_and_episode_freq(self):
        """Compute per-kanji drama and episode frequencies and push them."""
        char_drama_count = {}
        char_episode_count = {}
        for result in self.db.get_kanji_count_raw():
            kanji_uid = result['kanji_uid']
            if kanji_uid not in char_drama_count:
                char_drama_count[kanji_uid] = 0
                char_episode_count[kanji_uid] = 0
            drama_uid = result['drama_uid']
            episode_count = result['episode_count']
            # Rows for drama uid 0 are not counted -- presumably the merged
            # pseudo-drama (see the "-1" below); verify against the schema.
            # Compared with ==, not `is`, which tests object identity.
            if drama_uid == 0:
                continue
            char_drama_count[kanji_uid] += 1
            char_episode_count[kanji_uid] += episode_count

        num_of_drama = len(
            self.db.get_all_dramas()) - 1  # -1 due to merge drama
        num_of_episodes = len(self.db.get_all_episodes_raw())

        char_drama_freq = {
            kanji_uid: count / num_of_drama
            for kanji_uid, count in char_drama_count.items()
        }
        char_episode_freq = {
            kanji_uid: count / num_of_episodes
            for kanji_uid, count in char_episode_count.items()
        }

        self.db.push_drama_and_episode_count(char_drama_freq,
                                             char_episode_freq)

    def update_kanji_flags(self):
        """Flag each character as kanji (1), kana (2) or other (3) and push.

        A character is flagged 2 when its value starts with a character in
        the ranges ぁ-ん or ァ-ン.
        """
        kana_pattern = re.compile("[ぁ-んァ-ン]")
        for char in self.chars.values():
            if is_kanji(char.value):
                char.flag = 1
            elif kana_pattern.match(char.value):
                char.flag = 2
            else:
                char.flag = 3
        self.db.push_kanji_info_flags(self.chars)
Пример #2
0
from python.DccUtils import parse_args
from python.JdsDatabase import JdsDatabase

if __name__ == "__main__":
    print("{} started".format(__file__))
    start_time = time.perf_counter()

    pr = None
    if settings.enable_profiler:
        pr = cProfile.Profile()
        pr.enable()

    args = parse_args(sys.argv[1:])
    db = JdsDatabase()
    kanji_info_results = db.get_kanji_info_raw()
    kanji_count_results = db.get_kanji_count_raw()

    with open(settings.csv_path_kanji, mode='w', encoding='utf-8',
              newline='') as csv_file:
        fieldnames = [
            'kanji', 'count', 'freq', 'cumul_freq', 'drama_freq',
            'episode_freq', 'jdpt', 'jdpt_pos', 'jouyou', 'jouyou_pos'
        ]
        writer = csv.DictWriter(csv_file,
                                fieldnames=fieldnames,
                                delimiter='\t')
        rows = {}
        writer.writeheader()
        for result in kanji_info_results:
            if result['flag'] is not 1:
                continue