class JdsDramaHandler:
    """
    Find all the dramas in the given folder (i.e. its top-level subfolders),
    assign each one a uid and push them to the database.
    """

    def __init__(self, argv):
        """Parse the command-line arguments and open the database wrapper."""
        self.args = parse_args(argv)
        self.db = JdsDatabase()

    def reset(self):
        """Reset the stored dramas and return whatever the database reports."""
        outcome = self.db.reset_dramas()
        return outcome

    def read_dramas(self):
        """Create one drama per subfolder of ``args["path"]`` and push them.

        The merged drama supplied by the database occupies the first slot of
        the pushed list, so the folder-based dramas are numbered from 1.
        """
        folders = DccUtils.get_subfolders(self.args["path"])
        collected = [self.db.get_merged_drama()]
        collected.extend(
            JdsDrama(uid, os.path.basename(folder))
            for uid, folder in enumerate(folders, start=1))
        self.db.push_dramas(collected)

    def read_episodes(self):
        """Number every file found in the drama subfolders and push the map.

        The pushed dict maps a running index (starting at 0) to the file's
        base name.
        """
        folders = DccUtils.get_subfolders(self.args["path"])
        names = [
            os.path.basename(filepath)
            for folder in folders
            for filepath in DccUtils.get_files(folder)
        ]
        self.db.push_episodes(dict(enumerate(names)))
def __init__(self, argv):
    """Store the parsed command-line arguments and a database wrapper.

    :param argv: raw argument list handed to ``parse_args``
    """
    parsed_args = parse_args(argv)
    self.args = parsed_args
    database = JdsDatabase()
    self.db = database
class JdsCharHandler:
    """Count the characters used by each drama and push the counts to the DB."""

    # Number of dramas submitted to the thread pool before their results are
    # collected (the original loop stopped submitting once it held 11 futures).
    _BATCH_SIZE = 11

    def __init__(self, argv):
        """Parse the command-line arguments and open the database wrapper."""
        self.args = parse_args(argv)
        self.db = JdsDatabase()

    def reset(self):
        """Reset the stored characters and return the database's answer."""
        return self.db.reset_chars()

    def read_chars_worker(self, drama):
        """
        threaded worker that counts all characters for a given drama, by
        getting all lines from the DB and counting the chars.
        requires drama,lines to be in the DB beforehand
        :param drama: drama object; its ``value`` and ``uid`` are read here
        :return: dict mapping each character to a JdsChar carrying its
                 occurrence count and the number of distinct episodes it
                 appears in
        """
        chars = {}  # key = char, value = count
        episodes = {}  # key = char, value = set of episode uids
        print("start read_chars_worker for {}".format(drama.value))
        jds_lines = self.db.get_lines_for_drama(drama)
        cur_start_time = time.perf_counter()
        for jds_line in jds_lines:
            try:
                for char in jds_line.value:
                    chars[char] = chars.get(char, 0) + 1
                    # set.add is idempotent, no membership test needed
                    episodes.setdefault(char, set()).add(jds_line.episode_uid)
            except Exception as e:
                # best effort: a broken line is reported and skipped
                exception(e)

        jds_chars = {}
        for char, count in chars.items():
            new_char = JdsChar.from_drama(char, drama.uid)
            new_char.set_count(count)
            new_char.episode_count = len(episodes[char])
            jds_chars[char] = new_char

        # ``chars`` is keyed by plain strings, so the newline entry has to be
        # removed with a string key (the former JdsChar key relied on JdsChar
        # hashing like its character). This only affects the count reported
        # below; the returned mapping is left untouched.
        if "\n" in chars:
            del chars["\n"]
            print("Deleted \\n")
        run_time = time.perf_counter() - cur_start_time
        print("stop read_chars_worker for {} with {} chars in {:2.2f}".format(
            drama.value, len(chars), run_time))
        return jds_chars

    def read_chars(self):
        """Count characters for every drama not yet flagged ``kanji_ok``.

        Dramas are processed in batches on a 4-thread pool; each worker result
        is pushed with ``push_chars_count`` and ``push_chars`` is called once
        at the end. An ``Error`` raised while handling a batch is reported and
        the remaining batches are still processed.
        """
        pending = []
        for drama in self.db.get_all_dramas():
            # value comparison: identity against an int literal is unreliable
            if drama.kanji_ok == 1:
                print("kanji_ok TRUE -> {} skipped".format(drama.uid))
                continue
            pending.append(drama)

        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            # slice into batches instead of removing items from the list that
            # is being iterated (which skipped every other element per pass)
            for start in range(0, len(pending), self._BATCH_SIZE):
                batch = pending[start:start + self._BATCH_SIZE]
                try:
                    futures = [
                        executor.submit(self.read_chars_worker, drama)
                        for drama in batch
                    ]
                    for future in concurrent.futures.as_completed(futures):
                        self.db.push_chars_count(future.result())
                except Error as e:
                    exception(e)
        self.db.push_chars()

    def create_tables(self):
        """Ask the database to create the character tables."""
        self.db.create_char_tables()
def update_kanji_pos(self):
    """Compute and store the ranking and frequency data of every character.

    Steps, in order:
      1. load the per-character counts of the merged drama,
      2. number the characters by JLPT level and by jouyou level,
      3. rank the kanji by count, derive frequency / cumulative frequency
         and a JDPT level, then push the characters,
      4. compute per-kanji drama and episode frequencies and push them.
    """
    # get all char count
    self.total_count = self.db.get_count_for_drama(
        JdsDatabase.get_merged_drama())

    # set jlpt position: levels are walked from the highest key down to 1
    # NOTE(review): indexing by range assumes the level keys are contiguous
    # integers starting at 0 -- confirm against the CSV data
    jlpt_dict = self.compute_pos_dict('jlpt')
    position = 1
    for level in range(len(jlpt_dict) - 1, 0, -1):
        for char in jlpt_dict[level]:
            char.jlpt_pos = position
            position += 1

    # set jouyou position: levels are walked upwards starting at 1
    jouyou_dict = self.compute_pos_dict('jouyou')
    position = 1
    for level in range(1, len(jouyou_dict)):
        for char in jouyou_dict[level]:
            char.jouyou_pos = position
            position += 1

    # kanji uids ordered by descending count (sorted and filtered once)
    ranked_kanji = [
        char_uid
        for char_uid in sorted(
            self.total_count, key=self.total_count.get, reverse=True)
        if is_kanji(self.chars[char_uid].value)
    ]

    # find sum of all kanji count
    sum_all_count = sum(
        self.chars[char_uid].count() for char_uid in ranked_kanji)

    # set JDPT level
    position = 1
    cur_level = 6
    jdpt_limits = [1, 0.99, 0.98, 0.95, 0.9, 0.75, 0.5]
    cumul_freq = 0
    for char_uid in ranked_kanji:
        char = self.chars[char_uid]
        count = self.total_count[char_uid]
        freq = count / sum_all_count
        cumul_freq += freq
        char.jdpt_pos = position
        char.freq = freq
        char.cumul_freq = cumul_freq
        char.set_count(count)
        position += 1
        # move to the next threshold once the current one is exceeded
        if cumul_freq > jdpt_limits[cur_level]:
            cur_level -= 1
        if cumul_freq < jdpt_limits[cur_level]:
            char.jdpt = cur_level - 1
    self.db.push_kanji_pos(self.chars)

    # set episode and drama frequency
    char_drama_count = {}
    char_episode_count = {}
    for result in self.db.get_kanji_count_raw():
        kanji_uid = result['kanji_uid']
        char_drama_count.setdefault(kanji_uid, 0)
        char_episode_count.setdefault(kanji_uid, 0)
        drama_uid = result['drama_uid']
        episode_count = result['episode_count']
        # rows with drama uid 0 are not counted (presumably the merged
        # drama -- confirm); compare by value, not identity
        if drama_uid == 0:
            continue
        char_drama_count[kanji_uid] += 1
        char_episode_count[kanji_uid] += episode_count

    num_of_drama = len(
        self.db.get_all_dramas()) - 1  # -1 due to merge drama
    num_of_episodes = len(self.db.get_all_episodes_raw())

    # a zero denominator (e.g. only uid-0 rows present) yields frequency 0.0
    # instead of raising ZeroDivisionError
    char_drama_freq = {
        kanji_uid: (count / num_of_drama if num_of_drama else 0.0)
        for kanji_uid, count in char_drama_count.items()
    }
    char_episode_freq = {
        kanji_uid: (count / num_of_episodes if num_of_episodes else 0.0)
        for kanji_uid, count in char_episode_count.items()
    }
    self.db.push_drama_and_episode_count(char_drama_freq, char_episode_freq)
def __init__(self, argv):
    """Prepare the handler state.

    Stores the parsed arguments and a database wrapper, loads every
    character with its count from the database, and leaves ``total_count``
    unset until it is computed later.

    :param argv: raw argument list handed to ``parse_args``
    """
    parsed_args = parse_args(argv)
    self.args = parsed_args
    database = JdsDatabase()
    self.db = database
    self.chars = database.get_all_chars_with_count()
    self.total_count = None
class JdsInfoHandler:
    """Compute per-character information (levels, positions, frequencies,
    flags) from the characters stored in the database and push it back."""

    def __init__(self, argv):
        """Parse the arguments, open the database and load all characters."""
        self.args = parse_args(argv)
        self.db = JdsDatabase()
        self.chars = self.db.get_all_chars_with_count()
        self.total_count = None

    def reset(self):
        """Reset the info tables and re-prepare them from the loaded chars."""
        self.db.reset_info()
        self.db.prepare_info(self.chars)

    def update_jlpt_joyo(self):
        """Read the JLPT and jouyou level files and push the levels.

        ``jlpt_kanji.csv`` and ``jouyou_kanji.csv`` are read from the current
        working directory; each row is ``<kanji>;<level>``.
        """
        self._load_level_csv('jlpt_kanji.csv', 'jlpt')
        self._load_level_csv('jouyou_kanji.csv', 'jouyou')
        self.db.push_kanji_jlpt_joyo(self.chars)

    def _load_level_csv(self, path, attribute):
        """Set ``attribute`` on every character listed in the CSV at ``path``.

        Characters not yet known are created and pushed to the database
        before the level is assigned.
        """
        with open(path, mode='r', encoding='utf-8') as csv_file:
            for row in csv.reader(csv_file, delimiter=';'):
                if not row:
                    # csv.reader yields [] for blank lines; nothing to read
                    continue
                uid = ord(row[0])
                level = int(row[1])
                if uid not in self.chars:
                    new_char = JdsChar(chr(uid))
                    self.chars[uid] = new_char
                    self.db.push_char(new_char)
                setattr(self.chars[uid], attribute, level)

    def compute_pos_dict(self, jlpt_or_jouyou):
        """Group the characters by level, each group sorted by count.

        :param jlpt_or_jouyou: ``'jlpt'`` groups by ``char.jlpt``; any other
                               value groups by ``char.jouyou``
        :return: dict mapping level to the list of its characters, ordered
                 by descending ``count()``
        """
        # compare by value: identity only worked for interned literals
        attribute = 'jlpt' if jlpt_or_jouyou == 'jlpt' else 'jouyou'
        my_dict = {}
        for char in self.chars.values():
            my_dict.setdefault(getattr(char, attribute), []).append(char)
        for members in my_dict.values():
            members.sort(key=methodcaller('count'), reverse=True)
        return my_dict

    def update_kanji_pos(self):
        """Compute and push positions, JDPT levels and usage frequencies."""
        # get all char count
        self.total_count = self.db.get_count_for_drama(
            JdsDatabase.get_merged_drama())
        self._assign_level_positions()
        self._assign_jdpt()
        self.db.push_kanji_pos(self.chars)
        self._push_drama_and_episode_freq()

    def _assign_level_positions(self):
        """Number the characters by JLPT level and by jouyou level."""
        # NOTE(review): indexing by range assumes the level keys are
        # contiguous integers starting at 0 -- confirm against the CSV data
        # set jlpt position: from the highest level key down to 1
        jlpt_dict = self.compute_pos_dict('jlpt')
        position = 1
        for level in range(len(jlpt_dict) - 1, 0, -1):
            for char in jlpt_dict[level]:
                char.jlpt_pos = position
                position += 1

        # set jouyou position: from level 1 upwards
        jouyou_dict = self.compute_pos_dict('jouyou')
        position = 1
        for level in range(1, len(jouyou_dict)):
            for char in jouyou_dict[level]:
                char.jouyou_pos = position
                position += 1

    def _assign_jdpt(self):
        """Rank the kanji by count and set frequency data and JDPT level."""
        # kanji uids ordered by descending count (sorted and filtered once)
        ranked_kanji = [
            char_uid
            for char_uid in sorted(
                self.total_count, key=self.total_count.get, reverse=True)
            if is_kanji(self.chars[char_uid].value)
        ]

        # find sum of all kanji count
        sum_all_count = sum(
            self.chars[char_uid].count() for char_uid in ranked_kanji)

        # set JDPT level
        position = 1
        cur_level = 6
        jdpt_limits = [1, 0.99, 0.98, 0.95, 0.9, 0.75, 0.5]
        cumul_freq = 0
        for char_uid in ranked_kanji:
            char = self.chars[char_uid]
            count = self.total_count[char_uid]
            freq = count / sum_all_count
            cumul_freq += freq
            char.jdpt_pos = position
            char.freq = freq
            char.cumul_freq = cumul_freq
            char.set_count(count)
            position += 1
            # move to the next threshold once the current one is exceeded
            if cumul_freq > jdpt_limits[cur_level]:
                cur_level -= 1
            if cumul_freq < jdpt_limits[cur_level]:
                char.jdpt = cur_level - 1

    def _push_drama_and_episode_freq(self):
        """Compute per-kanji drama and episode frequencies and push them."""
        char_drama_count = {}
        char_episode_count = {}
        for result in self.db.get_kanji_count_raw():
            kanji_uid = result['kanji_uid']
            char_drama_count.setdefault(kanji_uid, 0)
            char_episode_count.setdefault(kanji_uid, 0)
            drama_uid = result['drama_uid']
            episode_count = result['episode_count']
            # rows with drama uid 0 are not counted (presumably the merged
            # drama -- confirm); compare by value, not identity
            if drama_uid == 0:
                continue
            char_drama_count[kanji_uid] += 1
            char_episode_count[kanji_uid] += episode_count

        num_of_drama = len(
            self.db.get_all_dramas()) - 1  # -1 due to merge drama
        num_of_episodes = len(self.db.get_all_episodes_raw())

        # a zero denominator (e.g. only uid-0 rows present) yields frequency
        # 0.0 instead of raising ZeroDivisionError
        char_drama_freq = {
            kanji_uid: (count / num_of_drama if num_of_drama else 0.0)
            for kanji_uid, count in char_drama_count.items()
        }
        char_episode_freq = {
            kanji_uid: (count / num_of_episodes if num_of_episodes else 0.0)
            for kanji_uid, count in char_episode_count.items()
        }
        self.db.push_drama_and_episode_count(char_drama_freq,
                                             char_episode_freq)

    def update_kanji_flags(self):
        """Flag every character: 1 = kanji, 2 = kana, 3 = anything else."""
        for char in self.chars.values():
            if is_kanji(char.value):
                char.flag = 1
            elif re.match("[ぁ-んァ-ン]", char.value):
                char.flag = 2
            else:
                char.flag = 3
        self.db.push_kanji_info_flags(self.chars)
class JdsLineHandler:
    """
    Read all lines in the provided folder, assign them a unique uid and push
    the result in the database.
    The drama must have been loaded to the database before
    (via JdsDramaHandler)
    """

    # Number of subfolders submitted to the thread pool before their results
    # are collected (the original loop stopped once it held 16 futures).
    _BATCH_SIZE = 16

    def __init__(self, argv):
        """Parse the arguments, open the database, start with no episodes."""
        self.args = parse_args(argv)
        self.db = JdsDatabase()
        self.episode_to_uid = {}  # key = episode file name, value = uid

    def reset(self):
        """Reset the stored lines and return the database's answer."""
        return self.db.reset_lines()

    def line_ref_worker(self, subfolder):
        """Threaded worker that reads every line of every file of a drama.

        :param subfolder: path of the drama folder; its base name is used to
                          look the drama up in the database
        :return: list of JdsLine objects, all with uid 0 (the real uids are
                 assigned by ``read_lines``)
        """
        lines = []
        drama = self.db.get_drama(os.path.basename(subfolder))
        print("read_lines for drama {}".format(drama.uid))
        for filepath in DccUtils.get_files(subfolder):
            filename = os.path.basename(filepath)
            with open(filepath, encoding='utf-8') as file:
                try:
                    for line in file.readlines():
                        try:
                            lines.append(
                                JdsLine(
                                    uid=0,
                                    drama_uid=drama.uid,
                                    value=line,
                                    episode_uid=self.episode_to_uid[filename]))
                        except Exception as e:
                            # best effort: report the line and keep going
                            exception(e)
                except Exception as e:
                    # e.g. the file could not be read or decoded
                    exception(e)
        return lines

    def read_lines(self):
        """Read the lines of every drama subfolder and push them.

        Subfolders are processed in batches on a 4-thread pool. Line uids are
        assigned sequentially, in the order the workers complete. An
        ``Error`` is reported and stops the processing of further batches.
        """
        line_id = 0
        subfolders = list(DccUtils.get_subfolders(self.args["path"]))
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            try:
                # slice into batches instead of removing items from the list
                # being iterated (which skipped every other element per pass)
                for start in range(0, len(subfolders), self._BATCH_SIZE):
                    batch = subfolders[start:start + self._BATCH_SIZE]
                    futures = [
                        executor.submit(self.line_ref_worker, subfolder)
                        for subfolder in batch
                    ]
                    for future in concurrent.futures.as_completed(futures):
                        lines = future.result()
                        for line in lines:
                            line.uid = line_id
                            line_id += 1
                        self.db.push_lines(lines)
            except Error as e:
                exception(e)

    def setup(self):
        """Load the episode name -> episode uid mapping from the database."""
        for result in self.db.get_episodes_raw():
            self.episode_to_uid[result['name']] = result['episode_uid']
class JdsLineRefHandler:
    """Build, for every drama, the references from characters to the lines
    that contain them and push them to the database."""

    # Number of dramas submitted to the thread pool before their results are
    # collected (the original loop stopped once it held 16 futures).
    _BATCH_SIZE = 16

    def __init__(self, argv):
        """Parse the command-line arguments and open the database wrapper."""
        self.args = parse_args(argv)
        self.db = JdsDatabase()

    def reset(self):
        """Reset the stored line references and return the database's answer."""
        return self.db.reset_line_refs()

    def line_ref_worker(self, drama):
        """
        threaded worker that builds references of characters with lines.
        requires drama,lines to be in the DB beforehand
        :param drama: drama object; its ``value`` and ``uid`` are read here
        :return: dict mapping each character to a JdsChar holding at most the
                 first 10 line uids in which the character occurs
        """
        lines = {}  # key = char, value = [] of line_uid
        jds_lines = self.db.get_lines_for_drama(drama)
        print("start line_ref_worker for {}".format(drama.value))
        cur_start_time = time.perf_counter()
        for jds_line in jds_lines:
            for char in jds_line.value:
                try:
                    lines.setdefault(char, []).append(jds_line.uid)
                except Exception as e:
                    # best effort: report and continue with the next char
                    exception(e)

        jds_chars = {}
        for char, line_uids in lines.items():
            new_char = JdsChar.from_drama(char, drama.uid)
            new_char.add_line_refs(line_uids[:10])
            jds_chars[char] = new_char

        # ``lines`` is keyed by plain strings, so the newline entry has to be
        # removed with a string key (the former JdsChar key relied on JdsChar
        # hashing like its character). This only affects the count reported
        # below; the returned mapping is left untouched.
        if "\n" in lines:
            del lines["\n"]
            print("Deleted \\n")
        run_time = time.perf_counter() - cur_start_time
        print("stop line_ref_worker for {} with {} chars in {}".format(
            drama.value, len(lines), run_time))
        return jds_chars

    def do_line_ref(self):
        """Build line references for every drama not yet flagged as done.

        Dramas whose ``kanji_line_ref_ok`` equals 1 are skipped. The others
        are processed in batches on a 4-thread pool and each worker result is
        pushed with ``push_chars_to_line``. An ``Error`` raised while handling
        a batch is reported and the remaining batches are still processed.
        """
        pending = []
        for drama in self.db.get_all_dramas():
            # value comparison: identity against an int literal is unreliable
            if drama.kanji_line_ref_ok == 1:
                print(
                    "kanji_line_ref_ok TRUE -> {} skipped".format(
                        drama.uid))
                continue
            pending.append(drama)

        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            # slice into batches instead of removing items from the list that
            # is being iterated (which skipped every other element per pass)
            for start in range(0, len(pending), self._BATCH_SIZE):
                batch = pending[start:start + self._BATCH_SIZE]
                try:
                    futures = [
                        executor.submit(self.line_ref_worker, drama)
                        for drama in batch
                    ]
                    for future in concurrent.futures.as_completed(futures):
                        self.db.push_chars_to_line(future.result())
                except Error as e:
                    exception(e)
from python import settings from python.DccUtils import parse_args from python.JdsDatabase import JdsDatabase if __name__ == "__main__": print("{} started".format(__file__)) start_time = time.perf_counter() pr = None if settings.enable_profiler: pr = cProfile.Profile() pr.enable() args = parse_args(sys.argv[1:]) db = JdsDatabase() kanji_info_results = db.get_kanji_info_raw() kanji_count_results = db.get_kanji_count_raw() with open(settings.csv_path_kanji, mode='w', encoding='utf-8', newline='') as csv_file: fieldnames = [ 'kanji', 'count', 'freq', 'cumul_freq', 'drama_freq', 'episode_freq', 'jdpt', 'jdpt_pos', 'jouyou', 'jouyou_pos' ] writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter='\t') rows = {} writer.writeheader() for result in kanji_info_results: