def AddAuxTrans(self, word_dict, tran_prob_dbm, aux_trans):
  if not tran_prob_dbm:
    return
  for word, trans in aux_trans.items():
    norm_word = tkrzw_dict.NormalizeWord(word)
    trans = set(trans)
    tsv = tran_prob_dbm.GetStr(norm_word)
    if not tsv:
      continue
    tran_probs = {}
    fields = tsv.split("\t")
    for i in range(0, len(fields), 3):
      src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])
      if src != word:
        continue
      tran_probs[trg] = prob
    for tran, tran_prob in tran_probs.items():
      if tran_prob < 0.1:
        continue
      if tran not in trans:
        continue
      if tkrzw_dict.NormalizeWord(tran) == norm_word:
        continue
      tran_stem, tran_prefix, tran_suffix = self.tokenizer.StripJaParticles(tran)
      if tran_prefix:
        new_tran = tran_stem + tran_suffix
        new_prob = tran_probs.get(new_tran) or 0
        if (tran_prefix == "を" or
            regex.search(r"^[\p{Han}\p{Katakana}]", tran_stem) or
            (new_prob >= 0.01 and new_prob >= tran_prob)):
          tran = new_tran
          tran_prob = max(tran_prob, new_prob)
      score = tran_prob ** 0.5
      word_dict[tran].append((word, score, tran_prob, []))
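
# The translation-probability DBM parsed above (and by several functions
# below) stores, under each normalized source word, a TSV value of repeated
# (source, target, probability) triples. The sample record here is an
# illustrative assumption, not real data:
#   "make" -> "make\t作る\t0.21\tmake\t製造する\t0.05"
# A minimal standalone sketch of that parsing convention:
def ParseTranProbTsv(tsv, word):
  """Extracts {target: probability} for one source word from a TSV record."""
  tran_probs = {}
  fields = tsv.split("\t")
  for i in range(0, len(fields), 3):
    src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])
    # Records under a normalized key can mix surface forms; keep only the
    # triples whose source matches the requested word exactly.
    if src == word:
      tran_probs[trg] = prob
  return tran_probs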
def SearchExactReverse(self, text, capacity):
  ja_words = []
  ja_uniq_words = set()
  for ja_word in text.split(","):
    ja_word = tkrzw_dict.NormalizeWord(ja_word)
    if not ja_word:
      continue
    if ja_word in ja_uniq_words:
      continue
    ja_uniq_words.add(ja_word)
    ja_words.append(ja_word)
  en_words = []
  en_uniq_words = set()
  for ja_word in ja_words:
    for en_word in self.SearchTranIndex(ja_word):
      if en_word in en_uniq_words:
        continue
      en_uniq_words.add(en_word)
      en_words.append(en_word)
  result = []
  uniq_words = set()
  for en_word in en_words:
    if capacity < 1:
      break
    norm_en_word = tkrzw_dict.NormalizeWord(en_word)
    entries = self.SearchBody(norm_en_word)
    if not entries:
      continue
    for entry in entries:
      if capacity < 1:
        break
      word = entry["word"]
      if word != en_word:
        continue
      if word in uniq_words:
        continue
      uniq_words.add(word)
      match = False
      translations = entry.get("translation")
      if translations:
        for tran in translations:
          tran = tkrzw_dict.NormalizeWord(tran)
          for ja_word in ja_words:
            if tran.find(ja_word) >= 0:
              match = True
              break
          if match:
            break
      if not match:
        # "phrase" may be absent; iterating None would raise TypeError.
        phrases = entry.get("phrase") or []
        for phrase in phrases:
          for tran in phrase["x"]:
            tran = tkrzw_dict.NormalizeWord(tran)
            for ja_word in ja_words:
              if tran.find(ja_word) >= 0:
                match = True
                break
      if match:
        result.append(entry)
        capacity -= 1
  return result
def SearchExact(self, text):
  text = tkrzw_dict.NormalizeWord(text)
  result = []
  entry = self.SearchBody(text)
  if entry:
    result.append((text, entry))
  return result
def SearchTranIndex(self, text):
  text = tkrzw_dict.NormalizeWord(text)
  tsv = self.tran_index_dbm.GetStr(text)
  result = []
  if tsv:
    result.extend(tsv.split("\t"))
  return result
def ReadTranIndex(self, synset_index):
  tran_index = {}
  if not self.tran_prob_path:
    return tran_index
  logger.info("Reading tran index: input_path={}".format(self.tran_prob_path))
  tran_prob_dbm = tkrzw.DBM()
  tran_prob_dbm.Open(self.tran_prob_path, False, dbm="HashDBM").OrDie()
  num_words = 0
  for word in synset_index:
    key = tkrzw_dict.NormalizeWord(word)
    tsv = tran_prob_dbm.GetStr(key)
    if tsv:
      tran_probs = {}
      fields = tsv.split("\t")
      for i in range(0, len(fields), 3):
        src, trg, prob = fields[i], fields[i + 1], fields[i + 2]
        if src != word:
          continue
        prob = float(prob)
        if prob > 0.04:
          tran_probs[trg] = prob
      if tran_probs:
        tran_index[word] = tran_probs
    num_words += 1
    if num_words % 10000 == 0:
      logger.info("Reading trans: words={}".format(num_words))
  tran_prob_dbm.Close().OrDie()
  logger.info("Reading tran index done: records={}".format(len(tran_index)))
  return tran_index
def CheckExact(self, text):
  for word in text.split(","):
    word = tkrzw_dict.NormalizeWord(word)
    if not word:
      continue
    if word in self.body_dbm:
      return True
  return False
def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}".format(
      str(self.input_path), self.output_path))
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  os.makedirs(self.output_path, exist_ok=True)
  meta_dir_path = os.path.join(self.output_path, "META-INF")
  os.makedirs(meta_dir_path, exist_ok=True)
  data_dir_path = os.path.join(self.output_path, "OEBPS")
  os.makedirs(data_dir_path, exist_ok=True)
  words = self.ListUpWords(input_dbm)
  keys = sorted(set([tkrzw_dict.NormalizeWord(x) for x in words]))
  key_prefixes = set()
  for key in keys:
    key_prefixes.add(GetKeyPrefix(key))
  key_prefixes = sorted(key_prefixes)
  self.MakeMimeType()
  self.MakeContainer(meta_dir_path)
  self.MakePackage(data_dir_path, key_prefixes)
  self.MakeSearchKeyMap(data_dir_path, input_dbm, keys, words)
  self.MakeStyle(data_dir_path)
  self.MakeNavigation(data_dir_path, key_prefixes)
  self.MakeOverview(data_dir_path)
  self.MakeMain(data_dir_path, input_dbm, keys, words)
  input_dbm.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def GetTranProb(self, tran_prob_dbm, src_text, trg_text):
  src_text = tkrzw_dict.NormalizeWord(src_text)
  tsv = tran_prob_dbm.GetStr(src_text)
  max_prob = 0.0
  if tsv:
    trg_text = tkrzw_dict.NormalizeWord(trg_text)
    fields = tsv.split("\t")
    for i in range(0, len(fields), 3):
      src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])
      norm_trg = tkrzw_dict.NormalizeWord(trg)
      if norm_trg == trg_text:
        max_prob = max(max_prob, prob)
      elif len(norm_trg) >= 2 and trg_text.startswith(norm_trg):
        max_prob = max(max_prob, prob * 0.01)
      elif len(trg_text) >= 2 and norm_trg.startswith(trg_text):
        max_prob = max(max_prob, prob * 0.01)
  return max_prob
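
# Hedged usage sketch for GetTranProb above; the DB path and the word pair
# are illustrative assumptions only:
#
#   tran_prob_dbm = tkrzw.DBM()
#   tran_prob_dbm.Open("tran-prob.tkh", False, dbm="HashDBM").OrDie()
#   prob = self.GetTranProb(tran_prob_dbm, "happy", "幸福")
#   tran_prob_dbm.Close().OrDie()
#
# An exact target match returns the stored probability; a prefix match of
# two or more characters returns it demoted by a factor of 0.01.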
def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}".format(
      self.input_path, self.output_path))
  mem_index = tkrzw.DBM()
  mem_index.Open("", True, dbm="BabyDBM").OrDie()
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  it = input_dbm.MakeIterator()
  it.First()
  num_entries = 0
  num_translations = 0
  while True:
    record = it.GetStr()
    if not record:
      break
    key, serialized = record
    entry = json.loads(serialized)
    for item in entry["item"]:
      translations = item.get("translation")
      if translations:
        for tran in translations:
          norm_tran = tkrzw_dict.NormalizeWord(tran)
          mem_index.Append(norm_tran, key, "\t").OrDie()
        num_translations += len(translations)
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Reading: entries={}, translations={}".format(
          num_entries, num_translations))
    it.Next()
  input_dbm.Close().OrDie()
  logger.info("Reading done: entries={}, translations={}".format(
      num_entries, num_translations))
  output_dbm = tkrzw.DBM()
  num_buckets = mem_index.Count() * 2
  output_dbm.Open(self.output_path, True, dbm="HashDBM",
                  truncate=True, align_pow=0, num_buckets=num_buckets).OrDie()
  it = mem_index.MakeIterator()
  it.First()
  num_records = 0
  while True:
    record = it.GetStr()
    if not record:
      break
    key, value = record
    value = "\t".join(set(value.split("\t")))
    output_dbm.Set(key, value).OrDie()
    num_records += 1
    if num_records % 10000 == 0:
      logger.info("Writing: records={}".format(num_records))
    it.Next()
  output_dbm.Close().OrDie()
  logger.info("Writing done: records={}".format(num_records))
  mem_index.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def SearchPatternMatch(self, mode, text, capacity):
  text = tkrzw_dict.NormalizeWord(text)
  keys = self.keys_file.Search(mode, text, capacity)
  result = []
  for key in keys:
    if len(result) >= capacity:
      break
    for entry in self.SearchExact(key, capacity - len(result)):
      result.append(entry)
  return result
def GetFeatures(self, entry):
  SCORE_DECAY = 0.95
  word = tkrzw_dict.NormalizeWord(entry["word"])
  features = {word: 1.0}
  pos_score = 1.0
  pos_score_max = 0.0
  pos_features = collections.defaultdict(float)
  for item in entry["item"]:
    pos = "__" + item["pos"]
    new_score = (pos_features.get(pos) or 0.0) + pos_score
    pos_features[pos] = new_score
    pos_score_max = max(pos_score_max, new_score)
    pos_score *= SCORE_DECAY
  for pos, pos_feature_score in pos_features.items():
    features[pos] = pos_feature_score / pos_score_max
  score = 1.0
  rel_words = entry.get("related")
  if rel_words:
    for rel_word in rel_words[:20]:
      rel_word = tkrzw_dict.NormalizeWord(rel_word)
      if rel_word not in features:
        score *= SCORE_DECAY
        features[rel_word] = score
  score = max(score, 0.4)
  trans = entry.get("translation")
  if trans:
    for tran in trans[:20]:
      tran = tkrzw_dict.NormalizeWord(tran)
      tran = regex.sub(
          r"([\p{Han}\p{Katakana}ー]{2,})(する|すること|される|されること|をする|な|に|さ)$",
          r"\1", tran)
      if tran not in features:
        score *= SCORE_DECAY
        features[tran] = score
  score = max(score, 0.2)
  coocs = entry.get("cooccurrence")
  if coocs:
    for cooc in coocs[:20]:
      cooc = tkrzw_dict.NormalizeWord(cooc)
      if cooc not in features:
        score *= SCORE_DECAY
        features[cooc] = score
  return features
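
# Worked example of the decay above (explanatory only, no added behavior):
# with SCORE_DECAY = 0.95, the n-th novel feature is weighted 0.95 ** n, so
# the 20th related word still contributes about 0.95 ** 20 ≈ 0.36. The
# max(score, 0.4) and max(score, 0.2) resets then keep the translation and
# co-occurrence sections from starting below those floors.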
def GetTranProb(self, tran_prob_dbm, word, tran):
  max_prob = 0.0
  key = tkrzw_dict.NormalizeWord(word)
  tsv = tran_prob_dbm.GetStr(key)
  norm_tran = tran.lower()
  if tsv:
    fields = tsv.split("\t")
    for i in range(0, len(fields), 3):
      src, trg, prob = fields[i], fields[i + 1], fields[i + 2]
      if src == word and trg.lower() == norm_tran:
        prob = float(prob)
        max_prob = max(max_prob, prob)
  return max_prob
def SearchPatternMatchReverse(self, mode, text, capacity):
  text = tkrzw_dict.NormalizeWord(text)
  keys = self.tran_keys_file.Search(mode, text, capacity, True)
  result = []
  uniq_words = set()
  for key in keys:
    if len(result) >= capacity:
      break
    for entry in self.SearchExactReverse(key, capacity - len(result) + 10):
      if len(result) >= capacity:
        break
      word = entry["word"]
      if word in uniq_words:
        continue
      uniq_words.add(word)
      result.append(entry)
  return result
def Tokenize(self, language, sentence, lowering, stemming):
  sentence = self.NormalizeSentence(sentence)
  if language == "en":
    if stemming:
      words = self.TokenizeEnStemming(sentence)
    else:
      words = self.TokenizeEnSimple(sentence)
  elif language == "ja":
    words = self.TokenizeJaMecab(sentence, stemming)
  else:
    raise ValueError("unsupported language: " + language)
  if lowering:
    words = [tkrzw_dict.NormalizeWord(x) for x in words]
  return words
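
# Hedged usage sketch for Tokenize above; the sentences and the token
# outputs are illustrative assumptions only:
#
#   tokenizer.Tokenize("en", "The dogs are running.", True, True)
#   # -> lowercased, stemmed English tokens, e.g. ["the", "dog", "be", "run"]
#   tokenizer.Tokenize("ja", "犬が走る", True, False)
#   # -> normalized MeCab tokens of the Japanese sentence
#   tokenizer.Tokenize("fr", "chien", True, False)
#   # -> raises ValueError: unsupported language: fr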
def SearchRelatedWithSeeds(self, seeds, capacity):
  seed_features = collections.defaultdict(float)
  base_weight = 1.0
  uniq_words = set()
  for seed in seeds:
    norm_word = tkrzw_dict.NormalizeWord(seed["word"])
    weight = base_weight
    if norm_word in uniq_words:
      weight *= 0.1
    uniq_words.add(norm_word)
    for word, score in self.GetFeatures(seed).items():
      seed_features[word] += score * weight
    base_weight *= 0.8
  result = self.ExpandEntries(seeds, seed_features, max(int(capacity * 1.2), 100))
  return result[:capacity]
def SearchExact(self, text, capacity):
  result = []
  uniq_words = set()
  for word in text.split(","):
    if len(result) >= capacity:
      break
    word = tkrzw_dict.NormalizeWord(word)
    if not word:
      continue
    entries = self.SearchBody(word)
    if not entries:
      continue
    for entry in entries:
      if len(result) >= capacity:
        break
      word = entry["word"]
      if word in uniq_words:
        continue
      uniq_words.add(word)
      result.append(entry)
  return result
def Predict(self, text):
  words = set(self.tokenizer.Tokenize(self.language, text, True, False))
  if len(words) > 1:
    words.add(tkrzw_dict.NormalizeWord(text))
  cooc_words = {}
  for word in words:
    for cooc_word, cooc_score in self.GetCoocWords(word):
      cooc_words[cooc_word] = (cooc_words.get(cooc_word) or 0) + cooc_score
  sorted_cooc_words = sorted(cooc_words.items(),
                             key=operator.itemgetter(1), reverse=True)
  rel_words = {}
  num_traces = 0
  for cooc_word, cooc_score in sorted_cooc_words:
    if num_traces >= self.TRACE_COOC_WORDS:
      break
    if cooc_word in words:
      continue
    for rel_word, rel_score in self.GetCoocWords(cooc_word):
      if rel_word in words:
        continue
      rel_words[rel_word] = max(rel_words.get(rel_word) or 0,
                                cooc_score * rel_score)
    num_traces += 1
  sorted_rel_words = sorted(rel_words.items(),
                            key=operator.itemgetter(1), reverse=True)
  check_words = set(words)
  num_cooc_checked = 0
  for cooc_word, _ in sorted_cooc_words:
    if num_cooc_checked >= self.CHECK_COOC_WORDS:
      break
    if cooc_word in check_words:
      continue
    check_words.add(cooc_word)
    num_cooc_checked += 1
  num_rel_checked = 0
  for rel_word, _ in sorted_rel_words:
    if num_rel_checked >= self.CHECK_REL_WORDS:
      break
    if rel_word in check_words:
      continue
    check_words.add(rel_word)
    num_rel_checked += 1
  scored_rel_words = []
  for rel_word in check_words:
    rel_cooc_words = self.GetCoocWords(rel_word)
    score = self.GetSimilarity(sorted_cooc_words, rel_cooc_words)
    scored_rel_words.append((rel_word, score))
  scored_rel_words = sorted(scored_rel_words,
                            key=operator.itemgetter(1), reverse=True)
  return scored_rel_words, sorted_cooc_words
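
# Flow note for Predict above: the query is tokenized, its direct
# co-occurrences are accumulated, a second hop through the top
# TRACE_COOC_WORDS co-occurring words yields related-word candidates, and
# then up to CHECK_COOC_WORDS + CHECK_REL_WORDS candidates are re-ranked by
# GetSimilarity against the query's own co-occurrence vector.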
def SearchReverse(self, text):
  text = tkrzw_dict.NormalizeWord(text)
  result = []
  src_words = self.SearchTranIndex(text)
  if src_words:
    for src_word in src_words:
      entry = self.SearchBody(src_word)
      if entry:
        items = []
        for item in entry["item"]:
          hit = False
          translations = item.get("translation")
          if translations:
            for tran in translations:
              if tran.lower() == text:
                hit = True
                break
          if hit:
            items.append(item)
        if items:
          entry["item"] = items
          result.append((src_word, entry))
  if len(result) > 1:
    for record in result:
      entry = record[1]
      score = float(entry.get("score") or 0.0)
      for item in entry["item"]:
        tran_scores = item.get("translation_score")
        if tran_scores:
          value = tran_scores.get(text)
          if value:
            value = float(value)
            if value > score:
              score = value
      entry["search_score"] = score
    result = sorted(result, key=lambda rec: rec[1]["search_score"], reverse=True)
  return result
def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}".format(
      str(self.input_path), self.output_path))
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  os.makedirs(self.output_path, exist_ok=True)
  words = self.ListUpWords(input_dbm)
  keys = sorted(set([tkrzw_dict.NormalizeWord(word) for word, prob in words.items()]))
  key_prefixes = set()
  for key in keys:
    key_prefixes.add(GetKeyPrefix(key))
  key_prefixes = sorted(key_prefixes, key=lambda x: 1000 if x == "_" else ord(x))
  self.MakeMain(input_dbm, keys, words)
  self.MakeNavigation(key_prefixes)
  self.MakeOverview()
  self.MakeStyle()
  self.MakePackage(key_prefixes)
  input_dbm.Close().OrDie()
  for label, count in self.label_counters.items():
    logger.info("Adopted label: {}: {}".format(label, count))
  logger.info("Stats: num_words={}, num_trans={}, num_items={}, num_aux_items={}".format(
      self.num_words, self.num_trans, self.num_items, self.num_aux_items))
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}".format(
      self.input_path, self.output_path))
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  it = input_dbm.MakeIterator()
  it.First()
  num_entries = 0
  index = collections.defaultdict(list)
  infl_names = ("noun_plural", "verb_singular", "verb_present_participle",
                "verb_past", "verb_past_participle",
                "adjective_comparative", "adjective_superative",
                "adverb_comparative", "adverb_superative")
  while True:
    record = it.GetStr()
    if not record:
      break
    key, serialized = record
    entry = json.loads(serialized)
    for word_entry in entry:
      word = word_entry["word"]
      prob = max(float(word_entry.get("probability") or "0"), 0.0000001)
      score = prob * math.log2(len(word_entry["item"]))
      if "translation" in word_entry:
        score *= 2
      inflections = set()
      for infl_name in infl_names:
        inflection = word_entry.get(infl_name)
        if inflection:
          for infl_value in regex.split(r"[,|]", inflection):
            infl_value = tkrzw_dict.NormalizeWord(infl_value.strip())
            if not regex.search(r"\p{Latin}", infl_value):
              continue
            inflections.add(infl_value)
      for inflection in inflections:
        index[inflection].append((word, score))
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Reading: entries={}".format(num_entries))
    it.Next()
  input_dbm.Close().OrDie()
  logger.info("Reading done: entries={}".format(num_entries))
  output_dbm = tkrzw.DBM()
  num_buckets = len(index) * 2
  output_dbm.Open(self.output_path, True, dbm="HashDBM",
                  truncate=True, align_pow=0, num_buckets=num_buckets).OrDie()
  num_entries = 0
  for inflection, scores in index.items():
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    words = [x[0] for x in scores]
    output_dbm.Set(inflection, "\t".join(words)).OrDie()
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Writing: entries={}".format(num_entries))
  output_dbm.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
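
# Record sketch for the inflection index written above (values are
# illustrative assumptions): the key is a normalized inflected form, and the
# value is a TSV of base words in descending score order, e.g.
#   "ran"    -> "run"
#   "leaves" -> "leave\tleaf"   (several bases can share one surface form)
# so looking up a surface form yields candidate lemmas, best first.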
def MakeMainEntry(self, out_file, entry, conj_verbs, conj_adjs, rev_prob_dbm):
  def P(*args, end="\n"):
    esc_args = []
    for arg in args[1:]:
      if isinstance(arg, str):
        arg = esc(arg)
      esc_args.append(arg)
    print(args[0].format(*esc_args), end=end, file=out_file)
  self.num_words += 1
  yomi, word, trans = entry
  variants = {}
  variants[yomi] = True
  pos = self.tokenizer.GetJaLastPos(word)
  word_prob = 0
  if rev_prob_dbm:
    word_prob = self.GetPhraseProb(rev_prob_dbm, "ja", word)
  if word.endswith(pos[3]):
    prefix = word[:-len(pos[3])]
    for focus_pos, conj_map in [("動詞", conj_verbs), ("形容詞", conj_adjs)]:
      if pos[1] != focus_pos:
        continue
      conjs = conj_map.get(word)
      if prefix and not conjs and word_prob >= 0.00001:
        part_conjs = conj_map.get(pos[3])
        if part_conjs:
          conjs = [prefix + x for x in part_conjs]
      if conjs:
        for conj in sorted(conjs):
          variants[conj] = True
  stem, prefix, suffix = self.tokenizer.StripJaParticles(word)
  if stem != word:
    if prefix == "を" or regex.search(r"[\p{Han}\p{Katakana}]", stem):
      prefix = ""
    new_word = prefix + stem
    variants[new_word] = True
  for suffix in ("する", "した", "される", "された"):
    if word.endswith(suffix):
      stem = word[:-len(suffix)]
      if self.tokenizer.IsJaWordSahenNoun(stem):
        variants[stem] = True
  for suffix in ("な", "に", "と"):
    if word.endswith(suffix):
      stem = word[:-len(suffix)]
      if self.tokenizer.IsJaWordAdjvNoun(stem):
        variants[stem] = True
  if word in variants:
    del variants[word]
  trans = sorted(trans, key=lambda x: x[1], reverse=True)
  P('<idx:entry>')
  P('<div>')
  P('<span class="word">')
  P('<idx:orth>{}', word)
  if variants:
    P('<idx:infl>')
    for variant in variants:
      P('<idx:iform value="{}"/>', variant)
    P('</idx:infl>')
  P('</idx:orth>')
  P('</span>')
  if yomi != word:
    P(' <span class="pron">({})</span>', yomi)
  P('</div>')
  uniq_trans = set()
  uniq_synsets = set()
  num_lines = 0
  for tran, score, tran_prob, synsets in trans:
    norm_tran = tkrzw_dict.NormalizeWord(tran)
    if norm_tran in uniq_trans:
      continue
    uniq_trans.add(norm_tran)
    self.num_items += 1
    hit_syn = False
    for syn_id, syn_gross, syn_words in synsets:
      if syn_id in uniq_synsets:
        continue
      uniq_synsets.add(syn_id)
      hit_syn = True
      P('<div>{}', ", ".join([tran] + syn_words), end="")
      P(' <span class="gross">- {}</span>', syn_gross, end="")
      P('</div>')
      num_lines += 1
      for synonym in syn_words:
        norm_syn = tkrzw_dict.NormalizeWord(synonym)
        uniq_trans.add(norm_syn)
    if not hit_syn and num_lines < 8:
      P('<div>{}</div>', tran)
      num_lines += 1
  P('</idx:entry>')
  P('<br/>')
def ReadEntry(self, word_dict, entry, tran_prob_dbm, aux_trans):
  word = entry["word"]
  norm_word = tkrzw_dict.NormalizeWord(word)
  word_prob = float(entry.get("probability") or 0)
  trans = entry.get("translation")
  if not trans:
    return
  word_aux_trans = aux_trans.get(word)
  if word_aux_trans:
    word_aux_trans = set(word_aux_trans)
    trans.extend(word_aux_trans)
  dict_trans = set()
  for item in entry["item"]:
    label = item["label"]
    text = item["text"]
    if label in self.supplement_labels:
      for tran in text.split(","):
        tran = tran.strip()
        if tran:
          trans.append(tran)
          dict_trans.add(tran)
  tran_probs = {}
  if tran_prob_dbm:
    tsv = tran_prob_dbm.GetStr(norm_word)
    if tsv:
      fields = tsv.split("\t")
      for i in range(0, len(fields), 3):
        src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])
        if src != word:
          continue
        tran_probs[trg] = prob
  word_prob_score = max(0.1, word_prob ** 0.5)
  rank_score = 0.5
  uniq_trans = set()
  norm_trans = []
  for tran in trans:
    tran = regex.sub("[・]", "", tran)
    if tran and tran not in uniq_trans:
      norm_trans.append(tran)
      uniq_trans.add(tran)
  for i, tran in enumerate(norm_trans):
    if tkrzw_dict.NormalizeWord(tran) == norm_word:
      continue
    tran_prob = tran_probs.get(tran) or 0
    tran_stem, tran_prefix, tran_suffix = self.tokenizer.StripJaParticles(tran)
    if tran_prefix:
      new_tran = tran_stem + tran_suffix
      new_prob = tran_probs.get(new_tran) or 0
      if (tran_prefix == "を" or
          regex.search(r"^[\p{Han}\p{Katakana}]", tran_stem) or
          (new_prob >= 0.01 and new_prob >= tran_prob)):
        tran = new_tran
        tran_prob = max(tran_prob, new_prob)
    if i == 0:
      pass
    elif i <= 1 and tran_prob >= 0.01:
      pass
    elif i <= 2 and tran_prob >= 0.02:
      pass
    elif i <= 3 and tran_prob >= 0.04:
      pass
    elif tran_prob >= 0.1:
      pass
    elif tran in dict_trans:
      pass
    else:
      continue
    tran_prob_score = tran_prob ** 0.75
    dict_score = 0.1 if tran in dict_trans else 0.0
    if word_aux_trans and tran in word_aux_trans:
      dict_score += 0.1
    synsets = []
    for item in entry["item"]:
      if item["label"] != "wn":
        continue
      texts = item["text"].split(" [-] ")
      synset_id = ""
      gross = texts[0]
      synonyms = []
      tran_match = False
      for text in texts[1:]:
        match = regex.search(r"^\[(\w+)\]: (.*)", text)
        if not match:
          continue
        name = match.group(1).strip()
        text = match.group(2).strip()
        if name == "synset":
          synset_id = text
        elif name == "synonym":
          for synonym in text.split(","):
            synonym = synonym.strip()
            if synonym:
              synonyms.append(synonym)
        elif name == "translation":
          for syn_tran in text.split(","):
            syn_tran = syn_tran.strip()
            if syn_tran == tran:
              tran_match = True
      if synset_id and tran_match:
        synsets.append((synset_id, gross, synonyms))
    if synsets:
      dict_score += 0.1
    score = word_prob_score + rank_score + tran_prob_score + dict_score
    word_dict[tran].append((word, score, tran_prob, synsets))
    rank_score *= 0.8
  phrases = entry.get("phrase")
  if phrases:
    for phrase in phrases:
      phrase_word = phrase.get("w")
      if not phrase_word or phrase.get("p") or phrase.get("i"):
        continue
      score = word_prob_score + rank_score
      for phrase_tran in phrase.get("x"):
        phrase_tran = regex.sub(r"\(.*?\)", "", phrase_tran).strip()
        if phrase_tran:
          word_dict[phrase_tran].append((phrase_word, score, 0.05, []))
def ProcessWord(word, trans, tokenizer, phrase_prob_dbm, rev_prob_dbm,
                tran_prob_dbm, aux_trans, yomis, min_phrase_prob, min_tran_prob):
  phrase_prob = 0.0
  if phrase_prob_dbm:
    tokens = tokenizer.Tokenize("en", word, False, True)[:3]
    norm_phrase = " ".join(tokens)
    phrase_prob = float(phrase_prob_dbm.GetStr(norm_phrase) or 0.0)
  uniq_trans = set()
  check_trans = []
  norm_word = word.lower()
  for pos, tran in trans:
    norm_tran = tran.lower()
    if norm_tran == norm_word:
      continue
    if norm_tran in uniq_trans:
      continue
    uniq_trans.add(norm_tran)
    check_trans.append((pos, tran))
    tran = tokenizer.NormalizeJaWordForPos(pos, tran)
    norm_tran = tran.lower()
    if norm_tran in uniq_trans:
      continue
    uniq_trans.add(norm_tran)
    check_trans.append((pos, tran))
  scored_trans = []
  aux_targets = aux_trans.get(word)
  for pos, tran in check_trans:
    tran_prob = 0.0
    if tran_prob_dbm:
      key = tkrzw_dict.NormalizeWord(word)
      tsv = tran_prob_dbm.GetStr(key)
      if tsv:
        fields = tsv.split("\t")
        for i in range(0, len(fields), 3):
          src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])
          if src != word:
            continue
          if trg != tran:
            continue
          tran_prob = prob
    aux_hit = False
    if aux_targets:
      count = aux_targets.count(tran)
      if count > 0:
        aux_hit = True
        tran_prob += count * 0.05
    has_yomi = tran in yomis
    if has_yomi:
      tran_prob += 0.01
    is_known_word = False
    tran_tokens = tokenizer.Tokenize("ja", tran, False, True)
    if len(tran_tokens) == 2 and tran_tokens[-1] in ("する", "な"):
      tran_tokens = tran_tokens[:1]
    if len(tran_tokens) == 1:
      tran_last_pos = tokenizer.GetJaLastPos(tran)
      if tran_last_pos[1] in ("動詞", "形容詞", "副詞"):
        is_known_word = True
      elif tokenizer.IsJaWordSahenNoun(tran) or tokenizer.IsJaWordAdjvNoun(tran):
        is_known_word = True
    rev_prob = 0.0
    if rev_prob_dbm:
      tokens = tokenizer.Tokenize("ja", tran, False, True)[:3]
      norm_phrase = " ".join(tokens)
      rev_prob = float(rev_prob_dbm.GetStr(norm_phrase) or 0.0)
      if rev_prob > 0:
        tran_prob += min(rev_prob ** 0.5, 0.2)
      else:
        tran_prob *= 0.8
    if tran_prob > 0.04 and (rev_prob > 0.0 or has_yomi):
      pass
    elif tran_prob > 0.01 and is_known_word:
      pass
    else:
      if not aux_hit and phrase_prob < min_phrase_prob:
        continue
      if not aux_hit and tran_prob < min_tran_prob:
        continue
    if regex.fullmatch(r"[\p{Katakana}ー]+", tran):
      tran_prob *= 0.7
    elif regex.fullmatch(r"[\p{Hiragana}\p{Katakana}ー]+", tran):
      tran_prob *= 0.8
    scored_trans.append((pos, tran, tran_prob))
  if not scored_trans:
    return None, None
  scored_trans = sorted(scored_trans, key=lambda x: x[2], reverse=True)
  return (scored_trans, phrase_prob)
def Run(phrase_prob_path, rev_prob_path, tran_prob_path, tran_aux_paths,
        yomi_paths, min_phrase_prob, min_tran_prob):
  logger.info("Start the process")
  phrase_prob_dbm = None
  if phrase_prob_path:
    logger.info("Opening the phrase prob DB: " + phrase_prob_path)
    phrase_prob_dbm = tkrzw.DBM()
    phrase_prob_dbm.Open(phrase_prob_path, False, dbm="HashDBM").OrDie()
  rev_prob_dbm = None
  if rev_prob_path:
    logger.info("Opening the reverse prob DB: " + rev_prob_path)
    rev_prob_dbm = tkrzw.DBM()
    rev_prob_dbm.Open(rev_prob_path, False, dbm="HashDBM").OrDie()
  tran_prob_dbm = None
  if tran_prob_path:
    logger.info("Opening the tran prob DB: " + tran_prob_path)
    tran_prob_dbm = tkrzw.DBM()
    tran_prob_dbm.Open(tran_prob_path, False, dbm="HashDBM").OrDie()
  aux_trans = collections.defaultdict(list)
  for tran_aux_path in tran_aux_paths.split(","):
    tran_aux_path = tran_aux_path.strip()
    if tran_aux_path:
      logger.info("Reading the tran aux file: " + tran_aux_path)
      with open(tran_aux_path) as input_file:
        uniq_keys = set()
        for line in input_file:
          fields = line.strip().split("\t")
          if len(fields) < 2:
            continue
          word = fields[0]
          for tran in fields[1:]:
            uniq_key = word + ":" + tran
            if uniq_key in uniq_keys:
              continue
            aux_trans[word].append(tran)
            uniq_keys.add(uniq_key)
  yomis = set()
  for yomi_path in yomi_paths.split(","):
    yomi_path = yomi_path.strip()
    if yomi_path:
      logger.info("Reading the yomi file: " + yomi_path)
      with open(yomi_path) as input_file:
        for line in input_file:
          fields = line.strip().split("\t")
          if len(fields) < 1:
            continue
          yomis.add(fields[0])
  logger.info("Processing the glosses.")
  tokenizer = tkrzw_tokenizer.Tokenizer()
  word_dict = collections.defaultdict(list)
  alt_source = None
  alt_targets = None
  num_lines = 0
  for line in sys.stdin:
    num_lines += 1
    if num_lines % 10000 == 0:
      logger.info("Processing the glosses: {} lines: {} items".format(
          num_lines, len(word_dict)))
    fields = line.strip().split("\t")
    if len(fields) != 3:
      continue
    word, pos, text = fields
    if pos == "alternative":
      alt_source = word
      alt_targets = set()
      for alt in regex.split(r"[,;]", text):
        if regex.fullmatch(r"[\p{Han}\p{Hiragana}\p{Katakana}ー]+", alt):
          alt_targets.add(alt)
      continue
    text = regex.sub(r"\.$", "", text).strip()
    for tran in regex.split(r"[,;]", text):
      tran = tran.strip()
      if pos == "verb":
        tran = regex.sub(r"^to ", "", tran)
      if pos == "noun":
        tran = regex.sub(r"^(a|an|the) ", "", tran)
      tran = regex.sub("^[-~] ", "", tran)
      tran = regex.sub(" [-~]$", "", tran)
      if not regex.fullmatch(r"[-_\p{Latin}0-9'. ]+", tran):
        continue
      tokens = tran.split(" ")
      if len(tokens) < 1 or len(tokens) > 4:
        continue
      word_dict[tran].append((pos, word))
      if alt_source == word:
        for alt in alt_targets:
          word_dict[tran].append((pos, alt))
  norm_word_dict = collections.defaultdict(list)
  for word, trans in word_dict.items():
    scored_trans, phrase_prob = ProcessWord(
        word, trans, tokenizer, phrase_prob_dbm, rev_prob_dbm, tran_prob_dbm,
        aux_trans, yomis, min_phrase_prob, min_tran_prob)
    if scored_trans:
      key = tkrzw_dict.NormalizeWord(word)
      norm_word_dict[key].append((word, scored_trans, phrase_prob))
  for key, entries in norm_word_dict.items():
    sum_phrase_prob = 0.0
    for word, scored_trans, phrase_prob in entries:
      sum_phrase_prob += phrase_prob
    for word, scored_trans, phrase_prob in entries:
      if sum_phrase_prob > 0:
        if key == word:
          if phrase_prob / sum_phrase_prob < 0.6:
            continue
        else:
          if phrase_prob / sum_phrase_prob < 0.8:
            continue
      PrintEntry(word, scored_trans)
  if tran_prob_dbm:
    tran_prob_dbm.Close().OrDie()
  if rev_prob_dbm:
    rev_prob_dbm.Close().OrDie()
  if phrase_prob_dbm:
    phrase_prob_dbm.Close().OrDie()
  logger.info("Process done")
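
# Input sketch for Run above (values are illustrative assumptions): stdin is
# expected to carry three-column TSV lines of headword, POS, and gloss text.
# An "alternative" row registers other written forms of its headword, which
# are then applied to that headword's later gloss rows:
#   走る\talternative\tはしる
#   走る\tverb\tto run; to dash.
# Verb glosses lose a leading "to ", noun glosses a leading article, and
# only Latin-script glosses of one to four tokens are indexed.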
def GetResultKeys(self, entries):
  keys = set()
  for entry in entries:
    keys.add(tkrzw_dict.NormalizeWord(entry["word"]))
  return keys
def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}".format(
      self.input_path, self.output_path))
  mem_index = tkrzw.DBM()
  mem_index.Open("", True, dbm="BabyDBM").OrDie()
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  it = input_dbm.MakeIterator()
  it.First()
  num_entries = 0
  num_translations = 0
  tran_dict = set()
  while True:
    record = it.GetStr()
    if not record:
      break
    key, serialized = record
    entry = json.loads(serialized)
    for word_entry in entry:
      word = word_entry["word"]
      prob = max(float(word_entry.get("probability") or "0"), 0.0000001)
      aoa = min(float(word_entry.get("aoa") or "20"), 20.0)
      score = prob * ((30 - aoa) / 10)
      word_trans = word_entry.get("translation") or []
      phrase_trans = []
      phrases = word_entry.get("phrase")
      if phrases:
        for phrase in phrases:
          if phrase.get("p") or phrase.get("i"):
            continue
          for phrase_tran in phrase.get("x"):
            phrase_tran = regex.sub(r"\(.*?\)", "", phrase_tran).strip()
            if phrase_tran:
              phrase_trans.append(phrase_tran)
      weight_word_trans = []
      for trans, weight in [(word_trans, 1.0), (phrase_trans, 0.5)]:
        for word_tran in trans:
          weight_word_trans.append((word_tran, weight))
          match = regex.search(
              r"([\p{Han}\p{Katakana}ー]{2,})(する|すること|される|されること|をする)$",
              word_tran)
          if match:
            short_word_tran = word_tran[:-len(match.group(2))]
            if short_word_tran:
              weight_word_trans.append((short_word_tran, weight * 0.8))
          short_word_tran = self.tokenizer.CutJaWordNounParticle(word_tran)
          if short_word_tran != word_tran:
            weight_word_trans.append((short_word_tran, weight * 0.8))
          match = regex.search(
              r"([\p{Han}\p{Katakana}ー]{2,})(的|的な|的に)$", word_tran)
          if match:
            short_word_tran = word_tran[:-len(match.group(2))]
            if short_word_tran:
              weight_word_trans.append((short_word_tran, weight * 0.8))
          match = regex.search(
              r"([\p{Han}]{2,})(が|の|を|に|へ|と|より|から|で|や|な|なる|たる)$",
              word_tran)
          if match:
            short_word_tran = word_tran[:-len(match.group(2))]
            if short_word_tran:
              weight_word_trans.append((short_word_tran, weight * 0.8))
      uniq_trans = set()
      for tran, weight in weight_word_trans:
        norm_tran = tkrzw_dict.NormalizeWord(tran)
        if norm_tran in uniq_trans:
          continue
        uniq_trans.add(norm_tran)
        pair = "{}\t{:.8f}".format(word, score * weight)
        score *= 0.98
        mem_index.Append(norm_tran, pair, "\t").OrDie()
      for item in word_entry["item"]:
        if item["label"] in self.supplement_labels:
          for tran in item["text"].split(","):
            tran = tran.strip()
            if tran:
              tran_dict_key = word + "\t" + tran
              tran_dict.add(tran_dict_key)
      num_translations += len(uniq_trans)
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Reading: entries={}, translations={}".format(
          num_entries, num_translations))
    it.Next()
  input_dbm.Close().OrDie()
  logger.info("Reading done: entries={}, translations={}".format(
      num_entries, num_translations))
  output_dbm = tkrzw.DBM()
  num_buckets = mem_index.Count() * 2
  output_dbm.Open(self.output_path, True, dbm="HashDBM",
                  truncate=True, align_pow=0, num_buckets=num_buckets).OrDie()
  tran_prob_dbm = None
  if self.tran_prob_path:
    tran_prob_dbm = tkrzw.DBM()
    tran_prob_dbm.Open(self.tran_prob_path, False, dbm="HashDBM").OrDie()
  it = mem_index.MakeIterator()
  it.First()
  num_records = 0
  while True:
    record = it.GetStr()
    if not record:
      break
    key, value = record
    scored_trans = []
    uniq_words = set()
    fields = value.split("\t")
    for i in range(0, len(fields), 2):
      word = fields[i]
      score = float(fields[i + 1])
      if word in uniq_words:
        continue
      uniq_words.add(word)
      if tran_prob_dbm:
        prob = self.GetTranProb(tran_prob_dbm, word, key)
        tran_dict_key = word + "\t" + key
        prob = max(prob, 0.000001)
        if tran_dict_key in tran_dict:
          prob += 0.1
        score = (score * prob) ** 0.5
      scored_trans.append((word, score))
    scored_trans = sorted(scored_trans, key=lambda x: x[1], reverse=True)
    value = "\t".join([x[0] for x in scored_trans])
    output_dbm.Set(key, value).OrDie()
    num_records += 1
    if num_records % 10000 == 0:
      logger.info("Writing: records={}".format(num_records))
    it.Next()
  if tran_prob_dbm:
    tran_prob_dbm.Close().OrDie()
  output_dbm.Close().OrDie()
  logger.info("Writing done: records={}".format(num_records))
  mem_index.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def MakeMainEntry(self, out_file, entry, input_dbm, keys, inflections):
  def P(*args, end="\n"):
    esc_args = []
    for arg in args[1:]:
      if isinstance(arg, str):
        arg = esc(arg)
      esc_args.append(arg)
    print(args[0].format(*esc_args), end=end, file=out_file)
  word = entry["word"]
  prob = float(entry.get("probability") or "0")
  pronunciation = entry.get("pronunciation")
  translations = entry.get("translation")
  is_major_word = prob >= 0.00001 and not regex.search("[A-Z]", word)
  poses = set()
  sub_poses = set()
  for item in entry["item"][:10]:
    if item["label"] in self.supplement_labels:
      sub_poses.add(item["pos"])
    else:
      poses.add(item["pos"])
  if not poses:
    poses = sub_poses
  infl_groups = collections.defaultdict(list)
  if not regex.search(r"[A-Z].*[A-Z]", word):
    for attr_list in INFLECTIONS:
      for name, label in attr_list:
        pos, suffix = name.split("_", 1)
        if pos not in poses:
          continue
        if name == "verb_singular":
          suffix = "present 3ps"
        else:
          suffix = suffix.replace("_", " ")
        value = entry.get(name)
        if value:
          infl_groups[pos].append((suffix, value, label))
  main_labels = set()
  label_items = collections.defaultdict(list)
  for item in entry["item"]:
    label = item["label"]
    if label in self.preferable_labels:
      main_labels.add(label)
    label_items[label].append(item)
  best_label = None
  is_stop = word in ARTICLES or word in PARTICLES
  if len(main_labels) >= 2:
    min_cost = None
    for label in main_labels:
      is_best = label in self.best_labels
      is_vetted = not is_stop and label in self.vetted_labels
      num_items = 0
      length_cost = 0
      for item in label_items[label]:
        text = item["text"]
        if not is_best and not is_vetted and not CheckSafeText(text):
          length_cost += 10.0
        if text.startswith("[translation]:"):
          continue
        text = regex.sub(r" \[-+\] .*", "", text).strip()
        if not text:
          continue
        num_items += 1
        text = regex.sub(r"[^-_\p{Latin}\d']+", " ", text).strip()
        num_words = text.count(" ") + 1
        length_cost += abs(math.log(9) - math.log(num_words))
      if not num_items:
        continue
      item_cost = abs(math.log(5) - math.log(num_items))
      length_cost = length_cost / num_items
      if is_best:
        quality_cost = 0.8
      elif is_vetted:
        quality_cost = 1.0
      else:
        quality_cost = 1.25
      cost = (item_cost + 0.5) * (length_cost + 1.0) * quality_cost
      if not min_cost or cost < min_cost:
        best_label = label
        min_cost = cost
  elif len(main_labels) >= 1:
    best_label = list(main_labels)[0]
  else:
    best_label = entry["item"][0]["label"]
  self.label_counters[best_label] += 1
  items = []
  sub_items = []
  tran_items = []
  for item in entry["item"]:
    label = item["label"]
    text = item["text"]
    if text.startswith("[translation]:"):
      tran_items.append(item)
    elif label == best_label:
      items.append(item)
    elif label in main_labels and is_major_word and not regex.search(r"\w{20,}", text):
      sub_items.append(item)
  if not items:
    items = sub_items
  if not items:
    items = tran_items
  if not items:
    return
  items = self.MergeShownItems(items, sub_items)
  self.num_words += 1
  P('<idx:entry>')
  P('<div>')
  P('<span class="word">')
  P('<idx:orth>{}', word)
  for pos, values in infl_groups.items():
    kind_infls = []
    for kind, value, label in values:
      for infl in value.split(","):
        infl = infl.strip()
        if not infl:
          continue
        if inflections.get(infl) != word:
          continue
        kind_infls.append((kind, infl))
    if not kind_infls:
      continue
    P('<idx:infl inflgrp="{}">', pos)
    for kind, infl in kind_infls:
      P('<idx:iform name="{}" value="{}"/>', kind, infl)
    P('</idx:infl>')
  alternatives = entry.get("alternative")
  if alternatives:
    alt_words = []
    for alternative in alternatives:
      alt_norm = tkrzw_dict.NormalizeWord(alternative)
      if not alt_norm or alt_norm in keys or alt_norm in inflections:
        continue
      alt_words.append(alternative)
    if alt_words:
      P('<idx:infl inflgrp="common">')
      for alt_word in alt_words:
        P('<idx:iform name="alternative" value="{}"/>', alt_word)
      P('</idx:infl>')
  P('</idx:orth>')
  P('</span>')
  if pronunciation:
    P(' <span class="pron">/{}/</span>', pronunciation)
  P('</div>')
  if translations:
    self.num_trans += 1
    P('<div>{}</div>', ", ".join(translations[:6]))
  for item in items:
    self.MakeMainEntryItem(P, item)
  phrases = entry.get("phrase")
  if phrases:
    for phrase in phrases:
      self.MakeMainEntryPhraseItem(P, phrase)
  parents = entry.get("parent")
  if parents:
    for parent in parents:
      self.MakeMainEntryParentItem(P, parent, input_dbm)
  for pos, values in infl_groups.items():
    P('<div>')
    for kind, value, label in values:
      P('<span class="attr">[{}]</span> {}', label, value)
    P('</div>')
  P('</idx:entry>')
  P('<br/>')