def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}".format(self.input_path))
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  it = input_dbm.MakeIterator()
  it.First()
  num_entries = 0
  while True:
    record = it.GetStr()
    if not record: break
    key, serialized = record
    entry = json.loads(serialized)
    for word_entry in entry:
      word = word_entry["word"]
      infls = []
      for infl_name in inflection_names:
        infl_value = word_entry.get(infl_name)
        if infl_value:
          for infl in infl_value.split(","):
            infl = infl.strip()
            if infl and infl != word and infl not in infls:
              infls.append(infl)
      parents = word_entry.get("parent") or []
      children = word_entry.get("child") or []
      synonym_scores = collections.defaultdict(float)
      synonym_weight = 1.0
      for item in word_entry["item"]:
        text = item["text"]
        for part in text.split("[-]"):
          part = part.strip()
          match = regex.search(r"\[synonym\]: (.*)", part)
          if match:
            for synonym in match.group(1).split(","):
              synonym = synonym.strip()
              if synonym and synonym != word:
                synonym_scores[synonym] += synonym_weight
        synonym_weight *= 0.98
      synonym_scores = sorted(synonym_scores.items(), key=lambda x: x[1], reverse=True)
      synonyms = [x[0] for x in synonym_scores]
      if not infls and not parents and not children and not synonyms:
        continue
      print("{}\t{}\t{}\t{}\t{}".format(
        word, ",".join(infls), ",".join(parents), ",".join(children), ",".join(synonyms)))
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Reading: entries={}".format(num_entries))
    it.Next()
  input_dbm.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def DivideWordCount(self, word_count_path, word_prob_path):
  start_time = time.time()
  logger.info("Writing the word probability database: src={}, dest={}".format(
    word_count_path, word_prob_path))
  word_count_dbm = tkrzw.DBM()
  word_count_dbm.Open(word_count_path, False, dbm="SkipDBM").OrDie()
  word_prob_dbm = tkrzw.DBM()
  num_buckets = word_count_dbm.Count() * 2
  word_prob_dbm.Open(word_prob_path, True, dbm="HashDBM",
                     truncate=True, num_buckets=num_buckets).OrDie()
  it = word_count_dbm.MakeIterator()
  it.First()
  record = it.GetStr()
  if not record or len(record[0]) != 0:
    raise RuntimeError("invalid first record")
  num_sentences = int(record[1])
  it.Next()
  num_records = 0
  while True:
    record = it.GetStr()
    if not record: break
    word = record[0]
    count = int(record[1])
    prob = count / num_sentences
    value = "{:.8f}".format(prob)
    value = regex.sub(r"^0\.", ".", value)
    word_prob_dbm.Set(word, value).OrDie()
    num_records += 1
    if num_records % 1000 == 0:
      logger.info("Dividing word counts: {} records".format(num_records))
    it.Next()
  word_prob_dbm.Close().OrDie()
  word_count_dbm.Close().OrDie()
  logger.info("Writing the word probability database done: elapsed_time={:.2f}s".format(
    time.time() - start_time))
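# A minimal read-back sketch (not part of this module). DivideWordCount stores one record
# per word in a HashDBM, with the probability serialized as a decimal string whose leading
# "0" is stripped (e.g. ".00012345"). Assuming a hypothetical output file name, a lookup
# could look like this:
#
#   dbm = tkrzw.DBM()
#   dbm.Open("union-word-prob.tkh", False, dbm="HashDBM").OrDie()  # path is an assumption
#   value = dbm.GetStr("apple")
#   prob = float(value) if value else 0.0
#   dbm.Close().OrDie()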
def Run(self):
  start_time = time.time()
  logger.info("Process started: data_prefix={}".format(self.data_prefix))
  phrase_count_path = "{}-count.tks".format(self.data_prefix)
  phrase_count_dbm = tkrzw.DBM()
  phrase_count_dbm.Open(phrase_count_path, False).OrDie()
  it = phrase_count_dbm.MakeIterator()
  it.First()
  record = it.GetStr()
  if not record or len(record[0]) != 0:
    raise RuntimeError("invalid first record")
  num_domains = int(record[1])
  it.Next()
  logger.info("Processing phrase counts")
  num_target_records = 0
  num_pair_records = 0
  last_source = ""
  last_source_count = 0
  targets = []
  source_counts = {}
  target_counts = {}
  while True:
    record = it.GetStr()
    if not record: break
    source, target = record[0].split("\t")
    count = int(record[1])
    if source:
      if source != last_source:
        if last_source_count and targets:
          self.ProcessRecord(last_source, last_source_count, targets, target_counts)
        targets = []
        last_source = source
      num_pair_records += 1
      if num_pair_records % 10000 == 0:
        logger.info("Processing phrase pair counts: {} records".format(num_pair_records))
      if target:
        targets.append((target, count))
      else:
        last_source_count = count
        source_counts[source] = count
    else:
      target_counts[target] = count
      num_target_records += 1
      if num_target_records % 100000 == 0:
        logger.info("Reading target counts: {} records".format(num_target_records))
    it.Next()
  if last_source_count and targets:
    self.ProcessRecord(last_source, last_source_count, targets, target_counts)
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def Dump(self):
  logger.info("Batch {} aggregation done: elapsed_time={:.2f}s, RSS={:.2f}MB".format(
    self.num_batches + 1, time.time() - self.start_time,
    tkrzw.Utility.GetMemoryUsage() / 1024.0 / 1024))
  logger.info("Batch {} dumping: sentences={}, words={}, unique_words={}".format(
    self.num_batches + 1, self.num_sentences, self.num_words,
    self.mem_phrase_count.Count()))
  start_time = time.time()
  fill_ratio = min(self.num_words / BATCH_MAX_WORDS, 1.0)
  dbm_phrase_count_path = "{}-phrase-count-{:08d}.tks".format(
    self.data_prefix, self.num_batches)
  dbm_phrase_count = tkrzw.DBM()
  dbm_phrase_count.Open(
    dbm_phrase_count_path, True, dbm="SkipDBM", truncate=True,
    insert_in_order=True, offset_width=4, step_unit=4, max_level=12).OrDie()
  logger.info("Batch {} word count dumping: dest={}".format(
    self.num_batches + 1, dbm_phrase_count_path))
  dbm_phrase_count.Set("", self.num_sentences).OrDie()
  it = self.mem_phrase_count.MakeIterator()
  it.First()
  min_phrase_count = max(math.ceil(MIN_PHRASE_COUNT_IN_BATCH * fill_ratio), 2)
  while True:
    record = it.Get()
    if not record: break
    phrase = record[0]
    count = struct.unpack(">q", record[1])[0]
    if count >= min_phrase_count:
      dbm_phrase_count.Set(phrase, count).OrDie()
    it.Remove()
  dbm_phrase_count.Close().OrDie()
  logger.info("Dumping done: elapsed_time={:.2f}s".format(time.time() - start_time))
  self.num_batches += 1
  merge_db_unit = 1
  while self.num_batches % (merge_db_unit * MERGE_DB_UNIT) == 0:
    merge_db_unit *= MERGE_DB_UNIT
    self.ReduceDatabases(merge_db_unit)
  self.num_words_since_cutoff = 0
def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}".format(self.input_path))
  word_dbm = tkrzw.DBM()
  word_dbm.Open(self.input_path, False).OrDie()
  it = word_dbm.MakeIterator()
  it.First()
  num_records = 0
  while True:
    record = it.GetStr()
    if not record: break
    self.PrintRecord(json.loads(record[1]))
    num_records += 1
    if num_records % 10000 == 0:
      logger.info("Processing: records={}".format(num_records))
    it.Next()
  word_dbm.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def ReadSynsetIndex(self):
  logger.info("Reading synset index: input_path={}".format(self.input_path))
  synset_index = collections.defaultdict(set)
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  num_words = 0
  it = input_dbm.MakeIterator()
  it.First()
  while True:
    record = it.GetStr()
    if not record: break
    key, serialized = record
    entry = json.loads(serialized)
    for item in entry["item"]:
      word = item["word"]
      synset = item["synset"]
      synset_index[word].add(synset)
      num_words += 1
      if num_words % 10000 == 0:
        logger.info("Reading synsets: words={}".format(num_words))
    it.Next()
  logger.info("Reading synset index done: records={}".format(len(synset_index)))
  return synset_index
def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}".format(
    str(self.input_path), self.output_path))
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  os.makedirs(self.output_path, exist_ok=True)
  words = self.ListUpWords(input_dbm)
  keys = sorted(set([tkrzw_dict.NormalizeWord(word) for word, prob in words.items()]))
  key_prefixes = set()
  for key in keys:
    key_prefixes.add(GetKeyPrefix(key))
  key_prefixes = sorted(list(key_prefixes), key=lambda x: 1000 if x == "_" else ord(x))
  self.MakeMain(input_dbm, keys, words)
  self.MakeNavigation(key_prefixes)
  self.MakeOverview()
  self.MakeStyle()
  self.MakePackage(key_prefixes)
  input_dbm.Close().OrDie()
  for label, count in self.label_counters.items():
    logger.info("Adopted label: {}: {}".format(label, count))
  logger.info("Stats: num_words={}, num_trans={}, num_items={}, num_aux_items={}".format(
    self.num_words, self.num_trans, self.num_items, self.num_aux_items))
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}".format(
    self.input_path, self.output_path))
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  it = input_dbm.MakeIterator()
  logger.info("Getting AOA records")
  num_entries = 0
  records = []
  it.First()
  while True:
    record = it.GetStr()
    if not record: break
    key, serialized = record
    entry = json.loads(serialized)
    for word_entry in entry:
      word = word_entry["word"]
      trans = word_entry.get("translation")
      if not trans: continue
      trans = trans[:8]
      labels = set()
      poses = {}
      for item in word_entry["item"]:
        labels.add(item["label"])
        poses[item["pos"]] = True
      poses = poses.keys()
      aoa = (word_entry.get("aoa") or word_entry.get("aoa_concept") or
             word_entry.get("aoa_base"))
      if aoa:
        aoa = float(aoa)
      else:
        if len(labels) < 2: continue
        prob = word_entry.get("probability")
        if not prob: continue
        prob = float(prob)
        if word.count(" "):
          token_probs = []
          for token in word.split(" "):
            token_serialized = input_dbm.GetStr(token.lower())
            token_prob = 0.0
            if token_serialized:
              for token_entry in json.loads(token_serialized):
                token_word = token_entry["word"]
                if token_word != token: continue
                token_prob = float(token_entry.get("probability") or 0.0)
            token_probs.append(token_prob)
          min_token_prob = min(token_probs)
          if min_token_prob > prob:
            prob = (prob * min_token_prob) ** 0.5
        aoa = math.log(prob + 0.00000001) * -1 + 3.5
      record = (word, aoa, poses, trans)
      records.append(record)
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Getting AOA records: entries={}".format(num_entries))
    it.Next()
  logger.info("Reading done: entries={}".format(num_entries))
  input_dbm.Close().OrDie()
  records = sorted(records, key=lambda x: x[1])
  output_dbm = tkrzw.DBM()
  output_dbm.Open(self.output_path, True, dbm="SkipDBM",
                  truncate=True, insert_in_order=True).OrDie()
  num_entries = 0
  for word, aoa, poses, trans in records:
    key = "{:05d}".format(num_entries)
    fields = [word]
    fields.append("{:.2f}".format(aoa))
    fields.append(",".join(poses))
    fields.append(",".join(trans))
    output_dbm.Set(key, "\t".join(fields)).OrDie()
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Writing: entries={}".format(num_entries))
    if num_entries >= 100000: break
  logger.info("Writing done: entries={}".format(num_entries))
  output_dbm.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def AppendTranslations(self, wnjpn_trans, aux_trans, subaux_trans, synset_index):
  start_time = time.time()
  logger.info("Appending translations: input_path={}, output_path={}".format(
    self.input_path, self.output_path))
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  phrase_prob_dbm = None
  if self.phrase_prob_path:
    phrase_prob_dbm = tkrzw.DBM()
    phrase_prob_dbm.Open(self.phrase_prob_path, False, dbm="HashDBM").OrDie()
  rev_prob_dbm = None
  if self.rev_prob_path:
    rev_prob_dbm = tkrzw.DBM()
    rev_prob_dbm.Open(self.rev_prob_path, False, dbm="HashDBM").OrDie()
  tokenizer = tkrzw_tokenizer.Tokenizer()
  tran_prob_dbm = None
  if self.tran_prob_path:
    tran_prob_dbm = tkrzw.DBM()
    tran_prob_dbm.Open(self.tran_prob_path, False, dbm="HashDBM").OrDie()
  output_dbm = tkrzw.DBM()
  num_buckets = input_dbm.Count() * 2
  output_dbm.Open(self.output_path, True, dbm="HashDBM", truncate=True,
                  align_pow=0, num_buckets=num_buckets).OrDie()
  num_words = 0
  num_orig_trans = 0
  num_match_trans = 0
  num_voted_trans = 0
  num_items = 0
  num_items_bare = 0
  num_items_rescued = 0
  it = input_dbm.MakeIterator()
  it.First()
  while True:
    record = it.GetStr()
    if not record: break
    key, serialized = record
    entry = json.loads(serialized)
    items = entry["item"]
    spell_ratios = {}
    for item in items:
      word = item["word"]
      phrase_prob = float(item.get("prob") or 0.0)
      spell_ratios[word] = phrase_prob + 0.00000001
    sum_prob = 0.0
    for word, prob in spell_ratios.items():
      sum_prob += prob
    for word, prob in list(spell_ratios.items()):
      spell_ratios[word] = prob / sum_prob
    for item in items:
      word = item["word"]
      pos = item["pos"]
      synset = item["synset"]
      links = item.get("link") or {}
      phrase_prob = float(item.get("prob") or 0.0)
      spell_ratio = spell_ratios[word]
      synonyms = item.get("synonym") or []
      hypernyms = item.get("hypernym") or []
      hyponyms = item.get("hyponym") or []
      similars = item.get("similar") or []
      derivatives = item.get("derivative") or []
      synonym_ids = links.get("synonym") or []
      hypernym_ids = links.get("hypernym") or []
      hyponym_ids = links.get("hyponym") or []
      similar_ids = links.get("similar") or []
      derivative_ids = links.get("derivative") or []
      item_tran_pairs = wnjpn_trans.get(synset) or []
      item_aux_trans = aux_trans.get(word) or []
      item_aux_trans.extend(subaux_trans.get(word) or [])
      self.NormalizeTranslationList(tokenizer, pos, item_aux_trans)
      item_trans = []
      hand_trans = set()
      for tran, src in item_tran_pairs:
        if src == "mono":
          hit = False
          for item_aux_tran in item_aux_trans:
            dist = tkrzw.Utility.EditDistanceLev(tran, item_aux_tran)
            dist_ratio = dist / max(len(tran), len(item_aux_tran))
            if dist_ratio < 0.3:
              hit = True
          if not hit: continue
        item_trans.append(tran)
        if src == "hand":
          hand_trans.add(tran)
      self.NormalizeTranslationList(tokenizer, pos, item_trans)
      num_items += 1
      bare = not item_trans
      if bare:
        num_items_bare += 1
      num_orig_trans += len(item_trans)
      syno_tran_counts = collections.defaultdict(int)
      hyper_tran_counts = collections.defaultdict(int)
      hypo_tran_counts = collections.defaultdict(int)
      similar_tran_counts = collections.defaultdict(int)
      derivative_tran_counts = collections.defaultdict(int)
      aux_trans_set = set(item_aux_trans)
      checked_words = set()
      checked_ids = set([synset])
      voted_rel_words = set()
      voted_rel_records = set()
      for rel_words, rel_ids, tran_counts in (
          (synonyms, synonym_ids, syno_tran_counts),
          (hypernyms, hypernym_ids, hyper_tran_counts),
          (hyponyms, hyponym_ids, hypo_tran_counts),
          (similars, similar_ids, similar_tran_counts),
          (derivatives, derivative_ids, derivative_tran_counts)):
        for rel_word in rel_words:
          is_similar = self.AreSimilarWords(rel_word, word)
          rel_phrase_prob = 0.0
          if phrase_prob_dbm:
            rel_phrase_prob = self.GetPhraseProb(phrase_prob_dbm, tokenizer, "en", rel_word)
          mean_prob = (phrase_prob * rel_phrase_prob) ** 0.5
          rel_aux_trans = []
          if rel_word not in checked_words:
            checked_words.add(rel_word)
            tmp_aux_trans = aux_trans.get(rel_word)
            if tmp_aux_trans:
              rel_aux_trans.extend(tmp_aux_trans)
          for rel_id in synset_index[rel_word]:
            if rel_id not in rel_ids: continue
            if rel_id not in checked_ids:
              checked_ids.add(rel_id)
              tmp_aux_trans = wnjpn_trans.get(rel_id)
              if tmp_aux_trans:
                tmp_aux_trans = [x[0] for x in tmp_aux_trans]
                rel_aux_trans.extend(tmp_aux_trans)
          if rel_aux_trans:
            self.NormalizeTranslationList(tokenizer, pos, rel_aux_trans)
            if not is_similar and mean_prob < 0.0005:
              for item_aux_tran in item_aux_trans:
                if regex.fullmatch(r"[\p{Hiragana}]{,3}", item_aux_tran): continue
                if item_aux_tran in rel_aux_trans:
                  valid_pos = self.IsValidPosTran(tokenizer, pos, item_aux_tran)
                  if valid_pos and item_aux_tran not in item_trans:
                    item_trans.append(item_aux_tran)
                    num_match_trans += 1
            if mean_prob < 0.005:
              voted_top = rel_word
              for voted_rel_word in voted_rel_words:
                if self.AreSimilarWords(rel_word, voted_rel_word):
                  voted_top = voted_rel_word
                  break
              voted_rel_words.add(rel_word)
              for rel_aux_tran in set(rel_aux_trans):
                voted_record = (voted_top, rel_aux_tran)
                if voted_record in voted_rel_records: continue
                voted_rel_records.add(voted_record)
                tran_counts[rel_aux_tran] += 1
      if bare:
        for deri_tran, count in derivative_tran_counts.items():
          syno_tran_counts[deri_tran] = syno_tran_counts[deri_tran] + count
        derivative_tran_counts.clear()
      for syno_tran, count in syno_tran_counts.items():
        if regex.fullmatch(r"[\p{Hiragana}]{,3}", syno_tran): continue
        if syno_tran in hyper_tran_counts: count += 1
        if syno_tran in hypo_tran_counts: count += 1
        if syno_tran in similar_tran_counts: count += 1
        if syno_tran in derivative_tran_counts: count += 1
        if bare and syno_tran in aux_trans_set: count += 1
        if count >= 3 and syno_tran not in item_trans:
          valid_pos = self.IsValidPosTran(tokenizer, pos, syno_tran)
          if valid_pos and syno_tran not in item_trans:
            item_trans.append(syno_tran)
            num_voted_trans += 1
      item_score = 0.0
      if item_trans:
        if bare:
          num_items_rescued += 1
        tran_scores = None  # keep the name defined even when no probability DB is given
        if rev_prob_dbm or tran_prob_dbm:
          item_trans, item_score, tran_scores = self.SortWordsByScore(
            word, item_trans, hand_trans, rev_prob_dbm, tokenizer, tran_prob_dbm)
        item["translation"] = item_trans[:MAX_TRANSLATIONS_PER_WORD]
        if tran_scores:
          tran_score_map = {}
          for tran, tran_score in tran_scores[:MAX_TRANSLATIONS_PER_WORD]:
            tran_score_map[tran] = "{:.6f}".format(tran_score).replace("0.", ".")
          item["translation_score"] = tran_score_map
      item_score += spell_ratio * 0.5
      item["score"] = "{:.8f}".format(item_score).replace("0.", ".")
      if "link" in item:
        del item["link"]
    if rev_prob_dbm:
      entry["item"] = sorted(
        items, key=lambda item: float(item.get("score") or 0.0), reverse=True)
    serialized = json.dumps(entry, separators=(",", ":"), ensure_ascii=False)
    output_dbm.Set(key, serialized).OrDie()
    num_words += 1
    if num_words % 10000 == 0:
      logger.info("Saving words: words={}".format(num_words))
    it.Next()
  output_dbm.Close().OrDie()
  if tran_prob_dbm:
    tran_prob_dbm.Close().OrDie()
  if rev_prob_dbm:
    rev_prob_dbm.Close().OrDie()
  if phrase_prob_dbm:
    phrase_prob_dbm.Close().OrDie()
  input_dbm.Close().OrDie()
  logger.info("Appending translations done: words={}, elapsed_time={:.2f}s".format(
    num_words, time.time() - start_time))
  logger.info("Stats: orig={}, match={}, voted={}, items={}, bare={}, rescued={}".format(
    num_orig_trans, num_match_trans, num_voted_trans, num_items, num_items_bare,
    num_items_rescued))
def main():
  args = sys.argv[1:]
  if len(args) < 1:
    raise ValueError("invalid arguments")
  input_path = args[0]
  is_synset = False
  for arg in args[1:]:
    if arg == "--synset":
      is_synset = True
    else:
      raise ValueError("invalid arguments")
  tokenizer = tkrzw_tokenizer.Tokenizer()
  dbm = tkrzw.DBM()
  dbm.Open(input_path, False).OrDie()
  it = dbm.MakeIterator()
  it.First().OrDie()
  while True:
    record = it.GetStr()
    if not record: break
    key, data = record
    entries = json.loads(data)
    for entry in entries:
      word = entry["word"]
      if is_synset:
        for item in entry["item"]:
          pos = item["pos"]
          text = item["text"]
          syn_match = regex.search(r"\[synset\]: ([-0-9a-z]+)", text)
          tran_match = regex.search(r"\[translation\]: ([^\[]+)", text)
          if syn_match and tran_match:
            syn = syn_match.group(1)
            tran = tran_match.group(1)
            tran = regex.sub(r"\([^)]+\)", "", tran)
            norm_trans = []
            uniq_trans = set()
            for syn_tran in tran.split(","):
              norm_tran = tokenizer.NormalizeJaWordForPos(pos, syn_tran.strip())
              if norm_tran and norm_tran not in uniq_trans:
                norm_trans.append(norm_tran)
                uniq_trans.add(norm_tran)
            if norm_trans:
              print("{}:{}\t{}".format(word, syn, "\t".join(norm_trans)))
      else:
        poses = set()
        tran_poses = {}
        for item in entry["item"]:
          pos = item["pos"]
          text = item["text"]
          poses.add(pos)
          tran_match = regex.search(r"\[translation\]: ([^\[]+)", text)
          if tran_match:
            tran = tran_match.group(1)
            tran = regex.sub(r"\([^)]+\)", "", tran)
            for syn_tran in tran.split(","):
              syn_tran = syn_tran.strip()
              if syn_tran and syn_tran not in tran_poses:
                tran_poses[syn_tran] = pos
        only_pos = list(poses)[0] if len(poses) == 1 else None
        translations = entry.get("translation")
        if translations:
          norm_trans = []
          uniq_trans = set()
          for tran in translations:
            pos = only_pos
            if not pos:
              pos = tran_poses.get(tran)
            norm_tran = tokenizer.NormalizeJaWordForPos(pos, tran) if pos else tran
            if norm_tran and norm_tran not in uniq_trans:
              norm_trans.append(norm_tran)
              uniq_trans.add(norm_tran)
          if norm_trans:
            print("{}\t{}".format(word, "\t".join(norm_trans)))
    it.Next()
  dbm.Close().OrDie()
def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}".format(
    self.input_path, self.output_path))
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  it = input_dbm.MakeIterator()
  it.First()
  num_entries = 0
  index = collections.defaultdict(list)
  infl_names = ("noun_plural", "verb_singular", "verb_present_participle",
                "verb_past", "verb_past_participle",
                "adjective_comparative", "adjective_superative",
                "adverb_comparative", "adverb_superative")
  while True:
    record = it.GetStr()
    if not record: break
    key, serialized = record
    entry = json.loads(serialized)
    for word_entry in entry:
      word = word_entry["word"]
      prob = max(float(word_entry.get("probability") or "0"), 0.0000001)
      score = prob * math.log2(len(word_entry["item"]))
      if "translation" in word_entry:
        score *= 2
      inflections = set()
      for infl_name in infl_names:
        inflection = word_entry.get(infl_name)
        if inflection:
          for infl_value in regex.split(r"[,|]", inflection):
            infl_value = tkrzw_dict.NormalizeWord(infl_value.strip())
            if not regex.search(r"\p{Latin}", infl_value): continue
            inflections.add(infl_value)
      for inflection in inflections:
        index[inflection].append((word, score))
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Reading: entries={}".format(num_entries))
    it.Next()
  input_dbm.Close().OrDie()
  logger.info("Reading done: entries={}".format(num_entries))
  output_dbm = tkrzw.DBM()
  num_buckets = len(index) * 2
  output_dbm.Open(self.output_path, True, dbm="HashDBM", truncate=True,
                  align_pow=0, num_buckets=num_buckets).OrDie()
  num_entries = 0
  for inflection, scores in index.items():
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    words = [x[0] for x in scores]
    output_dbm.Set(inflection, "\t".join(words)).OrDie()
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Writing: entries={}".format(num_entries))
  output_dbm.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
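# A minimal read-back sketch (not part of this module). The index written above maps each
# normalized inflected form to a tab-separated list of candidate base words, best first.
# Assuming a hypothetical output file name:
#
#   dbm = tkrzw.DBM()
#   dbm.Open("union-infl-index.tkh", False, dbm="HashDBM").OrDie()  # path is an assumption
#   value = dbm.GetStr("ran")
#   words = value.split("\t") if value else []
#   dbm.Close().OrDie()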
def Dump(self):
  logger.info("Batch {} aggregation done: elapsed_time={:.2f}s, RSS={:.2f}MB".format(
    self.num_batches + 1, time.time() - self.start_time,
    tkrzw.Utility.GetMemoryUsage() / 1024.0 / 1024))
  logger.info(
    ("Batch {} dumping: documents={}, sentences={}, words={}," +
     " unique_words={}, unique_cooc={}").format(
      self.num_batches + 1, self.num_documents, self.num_sentences, self.num_words,
      self.mem_word_count.Count(), self.mem_cooc_count.Count()))
  start_time = time.time()
  fill_ratio = min(self.num_words / BATCH_MAX_WORDS, 1.0)
  dbm_cooc_count_path = "{}-cooc-count-{:08d}.tks".format(
    self.data_prefix, self.num_batches)
  dbm_cooc_count = tkrzw.DBM()
  dbm_cooc_count.Open(
    dbm_cooc_count_path, True, dbm="SkipDBM", truncate=True,
    insert_in_order=True, offset_width=5, step_unit=16, max_level=8).OrDie()
  logger.info("Batch {} cooc count dumping: dest={}".format(
    self.num_batches + 1, dbm_cooc_count_path))
  dbm_cooc_count.Set("", self.num_sentences).OrDie()
  it = self.mem_cooc_count.MakeIterator()
  it.First()
  min_word_count = math.ceil(MIN_WORD_COUNT_IN_BATCH * fill_ratio)
  if MIN_WORD_COUNT_IN_BATCH >= 2:
    min_word_count = max(min_word_count, 2)
  min_count = math.ceil(tkrzw_dict.COOC_BASE_SCORE * MIN_COOC_COUNT_IN_BATCH * fill_ratio)
  cur_word = None
  cur_word_count = 0
  cur_word_weight = 1.0
  cooc_words = []
  while True:
    record = it.Get()
    if not record: break
    word_pair = record[0].decode()
    count = struct.unpack(">q", record[1])[0]
    word, cooc_word = word_pair.split(" ")
    if cur_word != word:
      if cur_word and cooc_words:
        self.DumpCoocWords(cur_word, cooc_words, dbm_cooc_count)
      cur_word = word
      cur_word_count = struct.unpack(">q", self.mem_word_count.Get(cur_word))[0]
      cur_word_weight = 1.0
      if tkrzw_dict.IsNumericWord(cur_word):
        cur_word_weight = tkrzw_dict.NUMERIC_WORD_WEIGHT
      elif tkrzw_dict.IsStopWord(self.language, cur_word):
        cur_word_weight = tkrzw_dict.STOP_WORD_WEIGHT
      cooc_words = []
    if cur_word_count * cur_word_weight >= min_word_count:
      cooc_count = struct.unpack(">q", self.mem_word_count.Get(cooc_word))[0]
      cooc_weight = 1.0
      if tkrzw_dict.IsNumericWord(cooc_word):
        cooc_weight = tkrzw_dict.NUMERIC_WORD_WEIGHT
      elif tkrzw_dict.IsStopWord(self.language, cooc_word):
        cooc_weight = tkrzw_dict.STOP_WORD_WEIGHT
      cooc_prob = cooc_count / self.num_sentences
      cooc_idf = min(math.log(cooc_prob) * -1, tkrzw_dict.MAX_IDF_WEIGHT)
      score = count * (cooc_idf ** tkrzw_dict.IDF_POWER)
      score *= cur_word_weight * cooc_weight
      if (cooc_count * cooc_weight >= min_word_count and
          count * cur_word_weight * cooc_weight >= min_count):
        cooc_words.append((cooc_word, count, score))
    it.Remove()
  if cur_word and cooc_words:
    self.DumpCoocWords(cur_word, cooc_words, dbm_cooc_count)
  dbm_cooc_count.Close().OrDie()
  dbm_word_count_path = "{}-word-count-{:08d}.tks".format(
    self.data_prefix, self.num_batches)
  dbm_word_count = tkrzw.DBM()
  dbm_word_count.Open(
    dbm_word_count_path, True, dbm="SkipDBM", truncate=True,
    insert_in_order=True, offset_width=4, step_unit=4, max_level=12).OrDie()
  logger.info("Batch {} word count dumping: dest={}".format(
    self.num_batches + 1, dbm_word_count_path))
  dbm_word_count.Set("", self.num_sentences).OrDie()
  it = self.mem_word_count.MakeIterator()
  it.First()
  while True:
    record = it.Get()
    if not record: break
    word = record[0]
    count = struct.unpack(">q", record[1])[0]
    if count >= min_word_count:
      dbm_word_count.Set(word, count).OrDie()
    it.Remove()
  dbm_word_count.Close().OrDie()
  logger.info("Dumping done: elapsed_time={:.2f}s".format(time.time() - start_time))
  self.num_batches += 1
  merge_db_unit = 1
  while self.num_batches % (merge_db_unit * MERGE_DB_UNIT) == 0:
    merge_db_unit *= MERGE_DB_UNIT
    self.ReduceDatabases(merge_db_unit)
  self.num_words_since_cutoff = 0
def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}".format(
    self.input_path, self.output_path))
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  tran_prob_dbm = None
  if self.tran_prob_path:
    tran_prob_dbm = tkrzw.DBM()
    tran_prob_dbm.Open(self.tran_prob_path, False, dbm="HashDBM").OrDie()
  it = input_dbm.MakeIterator()
  logger.info("Getting AOA records")
  it.First()
  num_entries = 0
  aoa_records = {}
  real_aoa_probs = collections.defaultdict(list)
  while True:
    record = it.GetStr()
    if not record: break
    key, serialized = record
    entry = json.loads(serialized)
    for word_entry in entry:
      word = word_entry["word"]
      aoa = (word_entry.get("aoa") or word_entry.get("aoa_concept") or
             word_entry.get("aoa_base"))
      if aoa:
        aoa_records[word] = float(aoa)
      real_aoa = word_entry.get("aoa")
      prob = word_entry.get("probability")
      if real_aoa and prob:
        real_aoa_probs[int(float(real_aoa))].append(float(prob))
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Getting AOA records: entries={}".format(num_entries))
    it.Next()
  aoa_prob_map = {}
  min_aoa_prob = 0.0001
  for aoa_age, probs in sorted(list(real_aoa_probs.items())):
    if aoa_age < 4 or aoa_age > 20: continue
    prob_mean = sum(probs) / len(probs)
    min_aoa_prob = min(prob_mean, min_aoa_prob)
    aoa_prob_map[aoa_age] = min(min_aoa_prob, 0.01)
  it.First()
  num_entries = 0
  scores = []
  while True:
    record = it.GetStr()
    if not record: break
    key, serialized = record
    entry = json.loads(serialized)
    max_score = 0
    for word_entry in entry:
      word = word_entry["word"]
      prob = float(word_entry.get("probability") or "0")
      aoa_prob = 0
      real_aoa = word_entry.get("aoa")
      if real_aoa:
        aoa_prob = float(aoa_prob_map.get(int(float(real_aoa))) or 0)
      prob += aoa_prob
      prob_score = max(prob ** 0.5, 0.00001)
      aoa = (word_entry.get("aoa") or word_entry.get("aoa_concept") or
             word_entry.get("aoa_base"))
      if aoa:
        aoa = float(aoa)
      else:
        aoa = sys.maxsize
      tokens = word.split(" ")
      if len(tokens) > 1:
        max_aoa = 0
        for token in tokens:
          token_aoa = aoa_records.get(token)
          if token_aoa:
            max_aoa = max(max_aoa, float(token_aoa))
          else:
            max_aoa = sys.maxsize
        if max_aoa < sys.maxsize:
          aoa = max_aoa + len(tokens) - 1
      aoa_score = (25 - min(aoa, 20.0)) / 10.0
      tran_score = 1.0
      if "translation" in word_entry:
        tran_score += 1.0
      if tran_prob_dbm:
        tsv = tran_prob_dbm.GetStr(key)
        if tsv:
          fields = tsv.split("\t")
          max_tran_prob = 0.0
          for i in range(0, len(fields), 3):
            tran_src, tran_trg, tran_prob = (
              fields[i], fields[i + 1], float(fields[i + 2]))
            if tran_src != word: continue
            if not regex.search(r"[\p{Han}]", tran_trg):
              tran_prob *= 0.5  # discount translations without kanji
            max_tran_prob = max(max_tran_prob, tran_prob)
          tran_score += max_tran_prob
      item_score = math.log2(len(word_entry["item"]) + 1)
      labels = set()
      for item in word_entry["item"]:
        labels.add(item["label"])
      label_score = math.log2(len(labels) + 1)
      children = word_entry.get("child")
      child_score = math.log2((len(children) if children else 0) + 4)
      score = (prob_score * aoa_score * tran_score * item_score *
               label_score * child_score)
      if regex.fullmatch(r"\d+", word):
        score *= 0.1
      elif regex.match(r"\d", word):
        score *= 0.3
      elif regex.search(r"^[^\p{Latin}]", word) or regex.search(r"[^\p{Latin}]$", word):
        score *= 0.5
      elif regex.search(r".[\p{Lu}]", word):
        score *= 0.5
      max_score = max(max_score, score)
    scores.append((key, max_score))
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Reading: entries={}".format(num_entries))
    it.Next()
  if tran_prob_dbm:
    tran_prob_dbm.Close().OrDie()
  input_dbm.Close().OrDie()
  logger.info("Reading done: entries={}".format(num_entries))
  scores = sorted(scores, key=lambda x: x[1], reverse=True)
  with open(self.output_path, "w") as out_file:
    num_entries = 0
    for key, score in scores:
      print(key, file=out_file)
      num_entries += 1
      if num_entries % 10000 == 0:
        logger.info("Writing: entries={}".format(num_entries))
  logger.info("Writing done: entries={}".format(num_entries))
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def __init__(self, data_prefix, language):
  self.language = language
  self.tokenizer = tkrzw_tokenizer.Tokenizer()
  word_score_path = tkrzw_dict.GetCoocScorePath(data_prefix)
  self.word_score_dbm = tkrzw.DBM()
  self.word_score_dbm.Open(word_score_path, False, dbm="HashDBM").OrDie()
def DivideCoocCount(self, cooc_count_path, word_prob_path, cooc_prob_path):
  start_time = time.time()
  logger.info("Writing the cooccurrence probability database: src={}, dest={}".format(
    cooc_count_path, cooc_prob_path))
  cooc_count_dbm = tkrzw.DBM()
  cooc_count_dbm.Open(cooc_count_path, False, dbm="SkipDBM").OrDie()
  word_prob_dbm = tkrzw.DBM()
  word_prob_dbm.Open(word_prob_path, False, dbm="HashDBM").OrDie()
  cooc_prob_dbm = tkrzw.DBM()
  num_buckets = word_prob_dbm.Count() * 2
  cooc_prob_dbm.Open(cooc_prob_path, True, dbm="HashDBM", truncate=True,
                     offset_width=4, num_buckets=num_buckets).OrDie()
  word_prob_cache = tkrzw.DBM()
  word_prob_cache.Open("", True, dbm="CacheDBM", cap_rec_num=PROB_CACHE_CAPACITY)
  def GetWordProb(key):
    value = word_prob_cache.Get(key)
    if value:
      return float(value)
    value = word_prob_dbm.GetStr(key)
    if value:
      word_prob_cache.Set(key, value)
      return float(value)
    return None
  it = cooc_count_dbm.MakeIterator()
  it.First()
  record = it.GetStr()
  if not record or len(record[0]) != 0:
    raise RuntimeError("invalid first record")
  num_sentences = int(record[1])
  it.Next()
  num_records = 0
  cur_word = None
  cur_word_prob = 0
  cooc_words = []
  while True:
    record = it.GetStr()
    if not record: break
    word_pair = record[0]
    count = int(record[1]) / tkrzw_dict.COOC_BASE_SCORE
    word, cooc_word = word_pair.split(" ")
    if cur_word != word:
      if cooc_words:
        self.SaveCoocWords(cur_word, cooc_words, cooc_prob_dbm)
        num_records += 1
        if num_records % 1000 == 0:
          logger.info("Dividing cooccurrence counts: {} records".format(num_records))
      cur_word = word
      cur_word_prob = GetWordProb(cur_word)
      cooc_words = []
    if cur_word_prob:
      cooc_prob = GetWordProb(cooc_word)
      if cooc_prob:
        cooc_idf = min(math.log(cooc_prob) * -1, tkrzw_dict.MAX_IDF_WEIGHT)
        cur_word_count = max(round(cur_word_prob * num_sentences), 1)
        prob = count / cur_word_count
        score = prob * (cooc_idf ** tkrzw_dict.IDF_POWER)
        if tkrzw_dict.IsNumericWord(cooc_word):
          score *= tkrzw_dict.NUMERIC_WORD_WEIGHT
        elif tkrzw_dict.IsStopWord(self.language, cooc_word):
          score *= tkrzw_dict.STOP_WORD_WEIGHT
        cooc_words.append((cooc_word, prob, score))
    it.Next()
  if cur_word and cooc_words:
    self.SaveCoocWords(cur_word, cooc_words, cooc_prob_dbm)
  cooc_prob_dbm.Close().OrDie()
  word_prob_dbm.Close().OrDie()
  cooc_count_dbm.Close().OrDie()
  logger.info("Writing the cooccurrence probability database done: elapsed_time={:.2f}s".format(
    time.time() - start_time))
def Run(phrase_prob_path, rev_prob_path, tran_prob_path, tran_aux_paths, yomi_paths,
        min_phrase_prob, min_tran_prob):
  logger.info("Start the process")
  phrase_prob_dbm = None
  if phrase_prob_path:
    logger.info("Opening the phrase prob DB: " + phrase_prob_path)
    phrase_prob_dbm = tkrzw.DBM()
    phrase_prob_dbm.Open(phrase_prob_path, False, dbm="HashDBM").OrDie()
  rev_prob_dbm = None
  if rev_prob_path:
    logger.info("Opening the reverse prob DB: " + rev_prob_path)
    rev_prob_dbm = tkrzw.DBM()
    rev_prob_dbm.Open(rev_prob_path, False, dbm="HashDBM").OrDie()
  tran_prob_dbm = None
  if tran_prob_path:
    logger.info("Opening the tran prob DB: " + tran_prob_path)
    tran_prob_dbm = tkrzw.DBM()
    tran_prob_dbm.Open(tran_prob_path, False, dbm="HashDBM").OrDie()
  aux_trans = collections.defaultdict(list)
  for tran_aux_path in tran_aux_paths.split(","):
    tran_aux_path = tran_aux_path.strip()
    if tran_aux_path:
      logger.info("Reading the tran aux file: " + tran_aux_path)
      with open(tran_aux_path) as input_file:
        uniq_keys = set()
        for line in input_file:
          fields = line.strip().split("\t")
          if len(fields) < 2: continue
          word = fields[0]
          for tran in fields[1:]:
            uniq_key = word + ":" + tran
            if uniq_key in uniq_keys: continue
            aux_trans[word].append(tran)
            uniq_keys.add(uniq_key)
  yomis = set()
  for yomi_path in yomi_paths.split(","):
    yomi_path = yomi_path.strip()
    if yomi_path:
      logger.info("Reading the yomi file: " + yomi_path)
      with open(yomi_path) as input_file:
        for line in input_file:
          fields = line.strip().split("\t")
          if len(fields) < 1: continue
          yomis.add(fields[0])
  logger.info("Processing the gloss.")
  tokenizer = tkrzw_tokenizer.Tokenizer()
  word_dict = collections.defaultdict(list)
  alt_source = None
  alt_targets = None
  num_lines = 0
  for line in sys.stdin:
    num_lines += 1
    if num_lines % 10000 == 0:
      logger.info("Processing the gloss: {} lines: {} items".format(
        num_lines, len(word_dict)))
    fields = line.strip().split("\t")
    if len(fields) != 3: continue
    word, pos, text = fields
    if pos == "alternative":
      alt_source = word
      alt_targets = set()
      for alt in regex.split(r"[,;]", text):
        if regex.fullmatch(r"[\p{Han}\p{Hiragana}\p{Katakana}ー]+", alt):
          alt_targets.add(alt)
      continue
    text = regex.sub(r"\.$", "", text).strip()
    for tran in regex.split(r"[,;]", text):
      tran = tran.strip()
      if pos == "verb":
        tran = regex.sub(r"^to ", "", tran)
      if pos == "noun":
        tran = regex.sub(r"^(a|an|the) ", "", tran)
      tran = regex.sub("^[-~] ", "", tran)
      tran = regex.sub(" [-~]$", "", tran)
      if not regex.fullmatch(r"[-_\p{Latin}0-9'. ]+", tran): continue
      tokens = tran.split(" ")
      if len(tokens) < 1 or len(tokens) > 4: continue
      word_dict[tran].append((pos, word))
      if alt_source == word:
        for alt in alt_targets:
          word_dict[tran].append((pos, alt))
  norm_word_dict = collections.defaultdict(list)
  for word, trans in word_dict.items():
    scored_trans, phrase_prob = ProcessWord(
      word, trans, tokenizer, phrase_prob_dbm, rev_prob_dbm, tran_prob_dbm,
      aux_trans, yomis, min_phrase_prob, min_tran_prob)
    if scored_trans:
      key = tkrzw_dict.NormalizeWord(word)
      norm_word_dict[key].append((word, scored_trans, phrase_prob))
  for key, entries in norm_word_dict.items():
    sum_phrase_prob = 0.0
    for word, scored_trans, phrase_prob in entries:
      sum_phrase_prob += phrase_prob
    for word, scored_trans, phrase_prob in entries:
      if sum_phrase_prob > 0:
        if key == word:
          if phrase_prob / sum_phrase_prob < 0.6: continue
        else:
          if phrase_prob / sum_phrase_prob < 0.8: continue
      PrintEntry(word, scored_trans)
  if tran_prob_dbm:
    tran_prob_dbm.Close().OrDie()
  if phrase_prob_dbm:
    phrase_prob_dbm.Close().OrDie()
  logger.info("Process done")
def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}".format(
    self.input_path, self.output_path))
  mem_index = tkrzw.DBM()
  mem_index.Open("", True, dbm="BabyDBM").OrDie()
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  it = input_dbm.MakeIterator()
  it.First()
  num_entries = 0
  num_translations = 0
  tran_dict = set()
  while True:
    record = it.GetStr()
    if not record: break
    key, serialized = record
    entry = json.loads(serialized)
    for word_entry in entry:
      word = word_entry["word"]
      prob = max(float(word_entry.get("probability") or "0"), 0.0000001)
      aoa = min(float(word_entry.get("aoa") or "20"), 20.0)
      score = prob * ((30 - aoa) / 10)
      word_trans = word_entry.get("translation") or []
      phrase_trans = []
      phrases = word_entry.get("phrase")
      if phrases:
        for phrase in phrases:
          if phrase.get("p") or phrase.get("i"): continue
          for phrase_tran in phrase.get("x"):
            phrase_tran = regex.sub(r"\(.*?\)", "", phrase_tran).strip()
            if phrase_tran:
              phrase_trans.append(phrase_tran)
      weight_word_trans = []
      for trans, weight in [(word_trans, 1.0), (phrase_trans, 0.5)]:
        for word_tran in trans:
          weight_word_trans.append((word_tran, weight))
          match = regex.search(
            r"([\p{Han}\p{Katakana}ー]{2,})(する|すること|される|されること|をする)$", word_tran)
          if match:
            short_word_tran = word_tran[:-len(match.group(2))]
            if short_word_tran:
              weight_word_trans.append((short_word_tran, weight * 0.8))
          short_word_tran = self.tokenizer.CutJaWordNounParticle(word_tran)
          if short_word_tran != word_tran:
            weight_word_trans.append((short_word_tran, weight * 0.8))
          match = regex.search(r"([\p{Han}\p{Katakana}ー]{2,})(的|的な|的に)$", word_tran)
          if match:
            short_word_tran = word_tran[:-len(match.group(2))]
            if short_word_tran:
              weight_word_trans.append((short_word_tran, weight * 0.8))
          match = regex.search(
            r"([\p{Han}]{2,})(が|の|を|に|へ|と|より|から|で|や|な|なる|たる)$", word_tran)
          if match:
            short_word_tran = word_tran[:-len(match.group(2))]
            if short_word_tran:
              weight_word_trans.append((short_word_tran, weight * 0.8))
      uniq_trans = set()
      for tran, weight in weight_word_trans:
        norm_tran = tkrzw_dict.NormalizeWord(tran)
        if norm_tran in uniq_trans: continue
        uniq_trans.add(norm_tran)
        pair = "{}\t{:.8f}".format(word, score * weight)
        score *= 0.98
        mem_index.Append(norm_tran, pair, "\t").OrDie()
      for item in word_entry["item"]:
        if item["label"] in self.supplement_labels:
          for tran in item["text"].split(","):
            tran = tran.strip()
            if tran:
              tran_dict_key = word + "\t" + tran
              tran_dict.add(tran_dict_key)
      num_translations += len(uniq_trans)
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Reading: entries={}, translations={}".format(
        num_entries, num_translations))
    it.Next()
  input_dbm.Close().OrDie()
  logger.info("Reading done: entries={}, translations={}".format(
    num_entries, num_translations))
  output_dbm = tkrzw.DBM()
  num_buckets = mem_index.Count() * 2
  output_dbm.Open(self.output_path, True, dbm="HashDBM", truncate=True,
                  align_pow=0, num_buckets=num_buckets).OrDie()
  tran_prob_dbm = None
  if self.tran_prob_path:
    tran_prob_dbm = tkrzw.DBM()
    tran_prob_dbm.Open(self.tran_prob_path, False, dbm="HashDBM").OrDie()
  it = mem_index.MakeIterator()
  it.First()
  num_records = 0
  while True:
    record = it.GetStr()
    if not record: break
    key, value = record
    scored_trans = []
    uniq_words = set()
    fields = value.split("\t")
    for i in range(0, len(fields), 2):
      word = fields[i]
      score = float(fields[i + 1])
      if word in uniq_words: continue
      uniq_words.add(word)
      if tran_prob_dbm:
        prob = self.GetTranProb(tran_prob_dbm, word, key)
        tran_dict_key = word + "\t" + key
        prob = max(prob, 0.000001)
        if tran_dict_key in tran_dict:
          prob += 0.1
        score = (score * prob) ** 0.5
      scored_trans.append((word, score))
    scored_trans = sorted(scored_trans, key=lambda x: x[1], reverse=True)
    value = "\t".join([x[0] for x in scored_trans])
    output_dbm.Set(key, value).OrDie()
    num_records += 1
    if num_records % 10000 == 0:
      logger.info("Writing: records={}".format(num_records))
    it.Next()
  if tran_prob_dbm:
    tran_prob_dbm.Close().OrDie()
  output_dbm.Close().OrDie()
  logger.info("Writing done: records={}".format(num_records))
  mem_index.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def Run(self):
  start_time = time.time()
  logger.info("Process started: input_path={}, output_path={}".format(
    self.input_path, self.output_path))
  input_dbm = tkrzw.DBM()
  input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
  it = input_dbm.MakeIterator()
  logger.info("Getting AOA records")
  it.First()
  num_entries = 0
  aoa_records = {}
  while True:
    record = it.GetStr()
    if not record: break
    key, serialized = record
    entry = json.loads(serialized)
    max_score = 0
    for word_entry in entry:
      word = word_entry["word"]
      aoa = (word_entry.get("aoa") or word_entry.get("aoa_concept") or
             word_entry.get("aoa_base"))
      if aoa:
        aoa_records[word] = float(aoa)
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Getting AOA records: entries={}".format(num_entries))
    it.Next()
  it.First()
  num_entries = 0
  scores = []
  while True:
    record = it.GetStr()
    if not record: break
    key, serialized = record
    entry = json.loads(serialized)
    max_score = 0
    for word_entry in entry:
      word = word_entry["word"]
      prob = float(word_entry.get("probability") or "0")
      prob_score = max(prob ** 0.5, 0.00001)
      aoa = (word_entry.get("aoa") or word_entry.get("aoa_concept") or
             word_entry.get("aoa_base"))
      if aoa:
        aoa = float(aoa)
      else:
        aoa = sys.maxsize
      tokens = word.split(" ")
      if len(tokens) > 1:
        max_aoa = 0
        for token in tokens:
          token_aoa = aoa_records.get(token)
          if token_aoa:
            max_aoa = max(max_aoa, float(token_aoa))
          else:
            max_aoa = sys.maxsize
        if max_aoa < sys.maxsize:
          aoa = max_aoa + len(tokens) - 1
      aoa_score = (25 - min(aoa, 20.0)) / 10.0
      tran_score = 1.0 if "translation" in word_entry else 0.5
      item_score = math.log2(len(word_entry["item"]) + 1)
      labels = set()
      for item in word_entry["item"]:
        labels.add(item["label"])
      label_score = len(labels) + 1
      children = word_entry.get("child")
      child_score = math.log2((len(children) if children else 0) + 4)
      score = prob_score * aoa_score * tran_score * item_score * child_score
      if regex.fullmatch(r"\d+", word):
        score *= 0.1
      elif regex.match(r"\d", word):
        score *= 0.3
      elif regex.search(r"^[^\p{Latin}]", word) or regex.search(r"[^\p{Latin}]$", word):
        score *= 0.5
      elif regex.search(r".[\p{Lu}]", word):
        score *= 0.5
      max_score = max(max_score, score)
    scores.append((key, max_score))
    num_entries += 1
    if num_entries % 10000 == 0:
      logger.info("Reading: entries={}".format(num_entries))
    it.Next()
  input_dbm.Close().OrDie()
  logger.info("Reading done: entries={}".format(num_entries))
  scores = sorted(scores, key=lambda x: x[1], reverse=True)
  with open(self.output_path, "w") as out_file:
    num_entries = 0
    for key, score in scores:
      print(key, file=out_file)
      num_entries += 1
      if num_entries % 10000 == 0:
        logger.info("Writing: entries={}".format(num_entries))
  logger.info("Writing done: entries={}".format(num_entries))
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def __init__(self):
  self.__counter = 0
  self.__db = tkrzw.DBM()
  self.__fname = None
def Run(rev_prob_path, min_count, enough_ef, enough_fe, omit_latin, min_score,
        min_score_large, min_score_stop, max_targets, tran_aux_paths):
  start_time = time.time()
  logger.info("Process started")
  aux_trans = collections.defaultdict(list)
  for tran_aux_path in tran_aux_paths:
    if not tran_aux_path: continue
    logger.info("Reading: " + tran_aux_path)
    with open(tran_aux_path) as input_file:
      for line in input_file:
        fields = line.strip().split("\t")
        if len(fields) < 2: continue
        source = fields[0]
        targets = set()
        for target in fields[1:]:
          target = unicodedata.normalize('NFKC', target)
          target = regex.sub(r"[\p{Ps}\p{Pe}\p{C}]", "", target)
          target = regex.sub(r"\s+", " ", target).strip()
          if target:
            aux_trans[source].append(target)
  rev_prob_dbm = None
  if rev_prob_path:
    logger.info("Reading: " + rev_prob_path)
    rev_prob_dbm = tkrzw.DBM()
    rev_prob_dbm.Open(rev_prob_path, False, dbm="HashDBM").OrDie()
  logger.info("Processing records")
  records = {}
  for line in sys.stdin:
    fields = line.strip().split("\t")
    if len(fields) < 3: continue
    source = fields[0]
    count = int(fields[1])
    targets = []
    for field in fields[2:]:
      columns = field.split("|")
      if len(columns) != 3: continue
      targets.append((columns[0], float(columns[1]), float(columns[2])))
    records[source] = (count, targets)
  for source, (count, targets) in records.items():
    if count < min_count: continue
    if len(source) <= 1: continue
    large = bool(regex.search(r"^\p{Lu}", source))
    if large:
      cap_source = source.lower()
    else:
      cap_source = source[0].upper() + source[1:]
    cap_count, cap_targets = 0, []
    if cap_source != source:
      cap_record = records.get(cap_source)
      if cap_record:
        cap_count, cap_targets = cap_record
    if large:
      cap_count *= 5.0
    if count < cap_count: continue
    scored_targets = []
    for target, ef_prob, fe_prob in targets:
      for cap_target, cap_ef_prob, cap_fe_prob in cap_targets:
        if cap_target == target:
          fe_prob += cap_fe_prob
      ef_prob = min(1.0, ef_prob)
      fe_prob = min(1.0, fe_prob)
      score = ((ef_prob ** EF_WEIGHT) * (fe_prob ** FE_WEIGHT)) ** (1 / (EF_WEIGHT + FE_WEIGHT))
      #score = 2 * ef_prob * fe_prob / (ef_prob + fe_prob)
      scored_targets.append((target, score, ef_prob, fe_prob))
    scored_targets = sorted(scored_targets, key=lambda x: x[1], reverse=True)
    source_aux_trans = aux_trans.get(source) or []
    good_targets = []
    for target, score, ef_prob, fe_prob in scored_targets:
      if target in source_aux_trans:
        score *= 1.1
      else:
        is_prefix = False
        is_single_noun = False
        for cmp_target, cmp_score, _, _ in scored_targets:
          if target != cmp_target and cmp_target.startswith(target) and cmp_score >= min_score:
            if ((cmp_target == target + "の" or cmp_target == target + "する") and
                regex.fullmatch(r"\p{Han}+", target)):
              is_single_noun = True
            else:
              is_prefix = True
        is_stop = bool(regex.fullmatch(r"[\p{Hiragana}]+", target))
        if omit_latin and regex.search(r"[\p{Latin}]{2,}", target): continue
        if len(target) <= 1 and is_prefix and not is_single_noun: continue
        if large:
          if score < min_score_large: continue
        elif is_stop:
          if score < min_score_stop: continue
        else:
          if score < min_score:
            if (regex.search(r"[\p{Latin}]{4,}", source) and
                not regex.search(r"\d", source) and
                (regex.search(r"[\p{Han}]{2,}", target) or
                 regex.search(r"[\p{Han}][\p{Hiragana}]", target)) and
                (ef_prob >= enough_ef or fe_prob >= enough_fe)):
              pass
            else:
              continue
      norm_source = source.lower()
      norm_target = target.lower()
      if norm_source.find(norm_target) >= 0 or norm_target.find(norm_source) >= 0: continue
      if norm_target in ("する", "ます", "より", "から"): continue
      if norm_target.startswith("っ") or norm_target.startswith("を"): continue
      if norm_target.endswith("っ") or norm_target.endswith("を"): continue
      if regex.fullmatch(r"[\p{Hiragana}ー\p{Latin}]", target): continue
      if regex.search(r"^[\p{Hiragana}]+[\p{Han}\p{Katakana}\p{Latin}]", target):
        continue
      elif regex.search(r"[\p{Han}\p{Katakana}ー\p{Latin}][は|が|を|と]", target):
        continue
      if len(target) <= 1:
        score *= 0.5
      elif len(target) <= 2:
        score *= 0.9
      if regex.fullmatch(r"[\p{Hiragana}ー]+", target):
        score *= 0.8
      elif regex.search(r"\d", target):
        score *= 0.8
      target = regex.sub(r"([\p{Han}\p{Katakana}ー\p{Latin}])だ", r"\1な", target)
      good_targets.append((target, score, ef_prob, fe_prob))
    if not good_targets: continue
    good_targets = sorted(good_targets, key=lambda x: x[1], reverse=True)
    outputs = []
    for target, score, ef_prob, fe_prob in good_targets[:max_targets]:
      if rev_prob_dbm:
        prob = GetPhraseProb(rev_prob_dbm, "ja", target)
        if prob < MIN_PROB: continue
      #outputs.append("{}:{:.3f}:{:.3f}:{:.3f}".format(target, score, ef_prob, fe_prob))
      outputs.append(target)
    if outputs:
      print("{}\t{}".format(source, "\t".join(outputs)))
  if rev_prob_dbm:
    rev_prob_dbm.Close().OrDie()
  logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
def Dump(self):
  logger.info("Batch {} aggregation done: elapsed_time={:.2f}s, RSS={:.2f}MB".format(
    self.num_batches + 1, time.time() - self.start_time,
    tkrzw.Utility.GetMemoryUsage() / 1024.0 / 1024))
  logger.info(
    "Batch {} dumping: sentences={}, records={}, dup={}, unique_phrases={}".format(
      self.num_batches + 1, self.num_sentences, self.num_records,
      self.num_duplications, self.mem_phrase_count.Count()))
  start_time = time.time()
  fill_ratio = min(self.num_records / BATCH_MAX_RECORDS, 1.0)
  dbm_phrase_count_path = "{}-count-{:08d}.tks".format(self.data_prefix, self.num_batches)
  dbm_phrase_count = tkrzw.DBM()
  dbm_phrase_count.Open(
    dbm_phrase_count_path, True, dbm="SkipDBM", truncate=True,
    insert_in_order=True, offset_width=4, step_unit=4, max_level=12).OrDie()
  logger.info("Batch {} word count dumping: dest={}".format(
    self.num_batches + 1, dbm_phrase_count_path))
  dbm_phrase_count.Set("", self.num_domains).OrDie()
  it = self.mem_phrase_count.MakeIterator()
  it.First()
  min_phrase_count = max(math.ceil(MIN_PHRASE_COUNT_IN_BATCH * fill_ratio), 2)
  re_symbol = regex.compile(r"[\p{S}\p{P}]")
  re_double_particle = regex.compile(r"^[\p{Hiragana}ー]+ [\p{Hiragana}ー]+")
  re_hiragana_only = regex.compile(r"[ \p{Hiragana}ー]+")
  particles = set([
    "を", "に", "が", "へ", "や", "の", "と", "から", "で", "より", "な", "は",
    "です", "ます", "この", "その", "あの", "こと", "する", "される", "た", "て",
    "と", "ある", "いる", "これ", "それ", "あれ", "れる", "という", "として",
    "だ", "など"])
  prefixes = [x + " " for x in particles]
  def Output(src_phrase, trg_phrases):
    scored_targets = []
    for trg_phrase, count in trg_phrases:
      score = count
      if trg_phrase:
        if re_symbol.search(trg_phrase): continue
        if re_double_particle.search(trg_phrase):
          score *= 0.5
        elif trg_phrase in particles:
          score *= 0.5
        else:
          hit = False
          for prefix in prefixes:
            if trg_phrase.startswith(prefix):
              hit = True
              break
          if hit:
            score *= 0.8
        if re_hiragana_only.fullmatch(trg_phrase):
          score *= 0.5
        if len(trg_phrase) <= 1:
          score *= 0.5
        elif len(trg_phrase) <= 2:
          score *= 0.8
      else:
        score += 1
      scored_targets.append((trg_phrase, count, score))
    scored_targets = sorted(scored_targets, key=lambda x: x[2], reverse=True)
    scored_targets = scored_targets[:MAX_TARGETS_IN_BATCH]
    outputs = []
    for trg_phrase, count, score in scored_targets:
      key = src_phrase + "\t" + trg_phrase
      outputs.append((key, count))
    outputs = sorted(outputs)
    for key, value in outputs:
      dbm_phrase_count.Set(key, value).OrDie()
  last_src_phrase = ""
  trg_phrases = []
  while True:
    record = it.Get()
    if not record: break
    src_phrase, trg_phrase = record[0].decode().split("\t")
    count = struct.unpack(">q", record[1])[0]
    if src_phrase:
      if src_phrase != last_src_phrase:
        if trg_phrases:
          Output(last_src_phrase, trg_phrases)
          trg_phrases = []
      if count >= min_phrase_count:
        trg_phrases.append((trg_phrase, count))
      last_src_phrase = src_phrase
    else:
      if count >= min_phrase_count:
        dbm_phrase_count.Set("\t" + trg_phrase, count).OrDie()
    it.Remove()
  if trg_phrases:
    Output(last_src_phrase, trg_phrases)
  dbm_phrase_count.Close().OrDie()
  logger.info("Dumping done: elapsed_time={:.2f}s".format(time.time() - start_time))
  self.num_batches += 1
  merge_db_unit = 1
  while self.num_batches % (merge_db_unit * MERGE_DB_UNIT) == 0:
    merge_db_unit *= MERGE_DB_UNIT
    self.ReduceDatabases(merge_db_unit)
def main():
  args = sys.argv[1:]
  if len(args) < 2:
    raise ValueError("invalid arguments")
  data_prefix = args[0]
  phrase_path = args[1]
  searcher = tkrzw_union_searcher.UnionSearcher(data_prefix)
  phrase_dbm = tkrzw.DBM()
  phrase_dbm.Open(phrase_path, False, dbm="HashDBM").OrDie()
  parent_index = collections.defaultdict(list)
  page_index = 1
  while True:
    result = searcher.SearchByGrade(100, page_index, True)
    if not result: break
    for entry in result:
      word = entry["word"]
      prob = max(float(entry.get("probability") or 0.0), 0.000001)
      item_labels = []
      for item in entry["item"]:
        label = item["label"]
        if not label in item_labels:
          item_labels.append(label)
      if "wn" not in item_labels: continue
      features = GetFeatures(searcher, entry)
      rel_words = {}
      normals = []
      alternatives = entry.get("alternative") or []
      suffix_pairs = [("se", "ze"), ("ence", "ense"), ("isation", "ization"),
                      ("our", "or"), ("og", "ogue"), ("re", "er"), ("l", "ll")]
      for gb_suffix, us_suffix in suffix_pairs:
        if word.endswith(gb_suffix):
          us_word = word[:-len(gb_suffix)] + us_suffix
          if us_word in normals: continue
          if us_word in alternatives and searcher.CheckExact(us_word):
            normals.append(us_word)
      for alt in alternatives:
        if alt in normals: continue
        if word.count(" ") == alt.count(" "): continue
        dist = tkrzw.Utility.EditDistanceLev(word, alt)
        similar = False
        if dist == 1 and word[:3] != alt[:3]:
          similar = True
        elif dist == 2 and word[:5] == alt[:5] and word[-2:] == alt[-2:]:
          similar = True
        if similar and searcher.CheckExact(alt):
          word_prob = float(phrase_dbm.GetStr(word) or "0")
          alt_prob = float(phrase_dbm.GetStr(alt) or "0")
          if alt_prob > word_prob * 2:
            normals.append(alt)
      parents = []
      for parent in entry.get("parent") or []:
        parent_entries = searcher.SearchBody(parent)
        if not parent_entries: continue
        parent_prob = 0
        for parent_entry in parent_entries:
          if parent_entry["word"] != parent: continue
          parent_prob = float(parent_entry["probability"] or "0")
        parents.append(parent)
      for parent in parent_index.get(word) or []:
        if parent not in parents:
          parents.append(parent)
      if parents:
        weight = 1 / (min(len(parents), 5) + 1)
        for parent in parents:
          rel_words[parent] = max(rel_words.get(parent) or 0, weight)
          weight *= 0.9
      children = entry.get("child") or []
      if len(word) >= 5:
        for phrase in entry.get("phrase") or []:
          phrase_word = phrase["w"]
          if not phrase_word.startswith(word): continue
          if phrase_word.endswith("ing") or phrase_word.endswith("ed"):
            children.append(phrase_word)
      if children:
        weight = 1 / (min(len(parents), 5) + 2)
        for child in children:
          rel_words[child] = max(rel_words.get(child) or 0, weight)
          parent_index[child].append(word)
          weight *= 0.9
      related = entry.get("related") or []
      if related:
        weight = 1 / (min(len(parents), 5) + 2)
        for rel_word in related:
          rel_words[rel_word] = max(rel_words.get(rel_word) or 0, weight)
          weight *= 0.9
      synonyms = {}
      hypernyms = {}
      hyponyms = {}
      antonyms = {}
      similars = {}
      item_weight = 1.0
      for item in entry["item"]:
        if item["label"] != "wn": continue
        hit = False
        text = item["text"]
        for part in text.split("[-]"):
          match = regex.search(r"\[([a-z]+)\]: (.*)", part.strip())
          if match:
            if match.group(1) == "synonym":
              res_words = synonyms
            elif match.group(1) == "hypernym":
              res_words = hypernyms
            elif match.group(1) == "hyponym":
              res_words = hyponyms
            elif match.group(1) == "antonym":
              res_words = antonyms
            elif match.group(1) == "similar":
              res_words = similars
            else:
              continue
            order_weight = 1.0
            for rel_word in match.group(2).split(","):
              rel_word = rel_word.strip()
              if rel_word:
                weight = item_weight * order_weight
                res_words[rel_word] = max(res_words.get(rel_word) or 0, weight)
                order_weight *= 0.95
                hit = True
        if hit:
          item_weight *= 0.95
      voted_words = set()
      for cand_words, penalty, propagate in [
          (synonyms, 2, True), (hypernyms, 2, True), (hyponyms, 3, False),
          (antonyms, 3, False), (similars, 3, False)]:
        if not cand_words: continue
        type_weight = 1 / (math.log(len(cand_words)) + penalty)
        for cand_word, cand_weight in cand_words.items():
          weight = cand_weight * type_weight
          if cand_word in voted_words: continue
          voted_words.add(cand_word)
          features[cand_word] = (features.get(cand_word) or 0) + weight * 0.5
          if propagate:
            rel_words[cand_word] = max(rel_words.get(cand_word) or 0, weight)
      for rel_word, weight in rel_words.items():
        AddFeatures(searcher, rel_word, weight, features)
      features.pop(word, None)
      features.pop("wikipedia", None)
      merged_features = {}
      for label, score in features.items():
        if regex.search(r"[\p{Han}\p{Katakana}\p{Hiragana}]", label):
          label = NormalizeTran(label)
          label = regex.sub(
            r"[\p{Hiragana}]*(\p{Han})[\p{Hiragana}]*(\p{Han}).*", r"\1\2", label)
          label = regex.sub(r"([\p{Katakana}ー]{2,})\p{Hiragana}.*", r"\1", label)
          label = regex.sub(r"\p{Hiragana}+([\p{Katakana}ー]{2,})", r"\1", label)
        merged_features[label] = max(merged_features.get(label) or 0, score)
      features = [x for x in merged_features.items() if not x[0].startswith("__")]
      gb_words = set()
      rel_words = [x[0] for x in features]
      rel_words.append(word)
      for rel_word in rel_words:
        for gb_suffix, us_suffix in suffix_pairs:
          if rel_word.endswith(us_suffix):
            gb_word = rel_word[:-len(us_suffix)] + gb_suffix
            gb_words.add(gb_word)
      if not features: continue
      max_score = max(features, key=lambda x: x[1])[1]
      mod_features = []
      for label, score in features:
        if len(mod_features) >= 128: break
        if label in gb_words: continue
        score /= max_score
        mod_features.append((label, score))
      mod_features = sorted(mod_features, key=lambda x: x[1], reverse=True)
      fields = [word]
      fields.append(",".join(normals))
      fields.append(",".join(parents))
      fields.append(",".join(children))
      fields.append("{:.6f}".format(prob))
      for label, score in mod_features[:100]:
        fields.append(label)
        fields.append("{:.3f}".format(score))
      print("\t".join(fields))
    page_index += 1
  phrase_dbm.Close().OrDie()