def AddAuxTrans(self, word_dict, tran_prob_dbm, aux_trans):
     if not tran_prob_dbm: return
     for word, trans in aux_trans.items():
         norm_word = tkrzw_dict.NormalizeWord(word)
         trans = set(trans)
         tsv = tran_prob_dbm.GetStr(norm_word)
         if not tsv: continue
         tran_probs = {}
         fields = tsv.split("\t")
         for i in range(0, len(fields), 3):
             src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])
             if src != word: continue
             tran_probs[trg] = prob
         for tran, tran_prob in tran_probs.items():
             if tran_prob < 0.1: continue
             if tran not in trans: continue
             if tkrzw_dict.NormalizeWord(tran) == norm_word: continue
             tran_stem, tran_prefix, tran_suffix = self.tokenizer.StripJaParticles(
                 tran)
             if tran_prefix:
                 new_tran = tran_stem + tran_suffix
                 new_prob = tran_probs.get(new_tran) or 0
                 if (tran_prefix == "を" or regex.search(
                         r"^[\p{Han}\p{Katakana}]", tran_stem)
                         or (new_prob >= 0.01 and new_prob >= tran_prob)):
                     tran = new_tran
                     tran_prob = max(tran_prob, new_prob)
             score = tran_prob**0.5
             word_dict[tran].append((word, score, tran_prob, []))
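The triple-stepped loop above recurs throughout these examples: each value in tran_prob_dbm appears to hold a flat TSV of (source, target, probability) triples for one normalized key. A minimal parsing sketch under that assumption (the helper name is hypothetical):

def ParseTranProbRecord(tsv, source_word):
    # Collect {target: probability} for the triples whose source equals source_word.
    tran_probs = {}
    fields = tsv.split("\t")
    for i in range(0, len(fields), 3):
        src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])
        if src != source_word: continue
        tran_probs[trg] = prob
    return tran_probs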
Example #2
 def SearchExactReverse(self, text, capacity):
     ja_words = []
     ja_uniq_words = set()
     for ja_word in text.split(","):
         ja_word = tkrzw_dict.NormalizeWord(ja_word)
         if not ja_word: continue
         if ja_word in ja_uniq_words: continue
         ja_uniq_words.add(ja_word)
         ja_words.append(ja_word)
     en_words = []
     en_uniq_words = set()
     for ja_word in ja_words:
         for en_word in self.SearchTranIndex(ja_word):
             if en_word in en_uniq_words: continue
             en_uniq_words.add(en_word)
             en_words.append(en_word)
     result = []
     uniq_words = set()
     for en_word in en_words:
         if capacity < 1: break
         norm_en_word = tkrzw_dict.NormalizeWord(en_word)
         entries = self.SearchBody(norm_en_word)
         if entries:
             for entry in entries:
                 if capacity < 1: break
                 word = entry["word"]
                 if word != en_word: continue
                 if word in uniq_words: continue
                 uniq_words.add(word)
                 match = False
                 translations = entry.get("translation")
                 if translations:
                     for tran in translations:
                         tran = tkrzw_dict.NormalizeWord(tran)
                         for ja_word in ja_words:
                             if tran.find(ja_word) >= 0:
                                 match = True
                                 break
                         if match: break
                 if not match:
                      phrases = entry.get("phrase")
                      # the "phrase" field may be absent, so guard before iterating
                      if phrases:
                          for phrase in phrases:
                              for tran in phrase["x"]:
                                  tran = tkrzw_dict.NormalizeWord(tran)
                                  for ja_word in ja_words:
                                      if tran.find(ja_word) >= 0:
                                          match = True
                                          break
                                  if match: break
                              if match: break
                 if match:
                     result.append(entry)
                     capacity -= 1
     return result
 def SearchExact(self, text):
     text = tkrzw_dict.NormalizeWord(text)
     result = []
     entry = self.SearchBody(text)
     if entry:
         result.append((text, entry))
     return result
Example #4
 def SearchTranIndex(self, text):
     text = tkrzw_dict.NormalizeWord(text)
     tsv = self.tran_index_dbm.GetStr(text)
     result = []
     if tsv:
         result.extend(tsv.split("\t"))
     return result
Example #5
 def ReadTranIndex(self, synset_index):
   tran_index = {}
   if not self.tran_prob_path:
     return tran_index
   logger.info("Reading tran index: input_path={}".format(self.tran_prob_path))
   tran_prob_dbm = tkrzw.DBM()
   tran_prob_dbm.Open(self.tran_prob_path, False, dbm="HashDBM").OrDie()
   num_words = 0
   for word in synset_index:
     key = tkrzw_dict.NormalizeWord(word)
     tsv = tran_prob_dbm.GetStr(key)
     if tsv:
       tran_probs = {}
       fields = tsv.split("\t")
       for i in range(0, len(fields), 3):
         src, trg, prob = fields[i], fields[i + 1], fields[i + 2]
         if src != word: continue
         prob = float(prob)
         if prob > 0.04:
           tran_probs[trg] = prob
       if tran_probs:
         tran_index[word] = tran_probs
     num_words += 1
     if num_words % 10000 == 0:
       logger.info("Reading trans: words={}".format(num_words))
   tran_prob_dbm.Close().OrDie()
   logger.info("Reading tran index done: records={}".format(len(tran_index)))
   return tran_index
Example #6
 def CheckExact(self, text):
     for word in text.split(","):
         word = tkrzw_dict.NormalizeWord(word)
         if not word: continue
         if word in self.body_dbm:
             return True
     return False
 def Run(self):
     start_time = time.time()
     logger.info("Process started: input_path={}, output_path={}".format(
         str(self.input_path), self.output_path))
     input_dbm = tkrzw.DBM()
     input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
     os.makedirs(self.output_path, exist_ok=True)
     meta_dir_path = os.path.join(self.output_path, "META-INF")
     os.makedirs(meta_dir_path, exist_ok=True)
     data_dir_path = os.path.join(self.output_path, "OEBPS")
     os.makedirs(data_dir_path, exist_ok=True)
     words = self.ListUpWords(input_dbm)
     keys = sorted(set([tkrzw_dict.NormalizeWord(x) for x in words]))
     key_prefixes = set()
     for key in keys:
         key_prefixes.add(GetKeyPrefix(key))
     key_prefixes = sorted(list(key_prefixes))
     self.MakeMimeType()
     self.MakeContainer(meta_dir_path)
     self.MakePackage(data_dir_path, key_prefixes)
     self.MakeSearchKeyMap(data_dir_path, input_dbm, keys, words)
     self.MakeStyle(data_dir_path)
     self.MakeNavigation(data_dir_path, key_prefixes)
     self.MakeOverview(data_dir_path)
     self.MakeMain(data_dir_path, input_dbm, keys, words)
     input_dbm.Close().OrDie()
     logger.info("Process done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
Example #8
 def GetTranProb(self, tran_prob_dbm, src_text, trg_text):
     src_text = tkrzw_dict.NormalizeWord(src_text)
     tsv = tran_prob_dbm.GetStr(src_text)
     max_prob = 0.0
     if tsv:
         trg_text = tkrzw_dict.NormalizeWord(trg_text)
         fields = tsv.split("\t")
         for i in range(0, len(fields), 3):
             src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])
             norm_trg = tkrzw_dict.NormalizeWord(trg)
             if norm_trg == trg_text:
                 max_prob = max(max_prob, prob)
             elif len(norm_trg) >= 2 and trg_text.startswith(norm_trg):
                 max_prob = max(max_prob, prob * 0.01)
             elif len(trg_text) >= 2 and norm_trg.startswith(trg_text):
                 max_prob = max(max_prob, prob * 0.01)
     return max_prob
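A simplified standalone illustration of the discount rule above (normalization omitted, values made up): an exact target match keeps its full probability, while a mere prefix relationship of length two or more is discounted by a factor of 100.

def IllustrateDiscount(trg, prob, query):
    # Mirrors the branch structure of GetTranProb for a single triple.
    if trg == query:
        return prob
    if len(trg) >= 2 and query.startswith(trg):
        return prob * 0.01
    if len(query) >= 2 and trg.startswith(query):
        return prob * 0.01
    return 0.0

assert IllustrateDiscount("言葉", 0.5, "言葉") == 0.5            # exact match
assert IllustrateDiscount("言葉", 0.3, "言葉遣い") == 0.3 * 0.01  # prefix match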
Example #9
 def Run(self):
     start_time = time.time()
     logger.info("Process started: input_path={}, output_path={}".format(
         self.input_path, self.output_path))
     mem_index = tkrzw.DBM()
     mem_index.Open("", True, dbm="BabyDBM").OrDie()
     input_dbm = tkrzw.DBM()
     input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
     it = input_dbm.MakeIterator()
     it.First()
     num_entries = 0
     num_translations = 0
     while True:
         record = it.GetStr()
         if not record: break
         key, serialized = record
         entry = json.loads(serialized)
         for item in entry["item"]:
             translations = item.get("translation")
             if translations:
                 for tran in translations:
                     norm_tran = tkrzw_dict.NormalizeWord(tran)
                     mem_index.Append(norm_tran, key, "\t").OrDie()
                 num_translations += len(translations)
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info("Reading: entries={}, translationss={}".format(
                 num_entries, num_translations))
         it.Next()
     input_dbm.Close().OrDie()
     logger.info("Reading done: entries={}, translationss={}".format(
         num_entries, num_translations))
     output_dbm = tkrzw.DBM()
     num_buckets = mem_index.Count() * 2
     output_dbm.Open(self.output_path,
                     True,
                     dbm="HashDBM",
                     truncate=True,
                     align_pow=0,
                     num_buckets=num_buckets).OrDie()
     it = mem_index.MakeIterator()
     it.First()
     num_records = 0
     while True:
         record = it.GetStr()
         if not record: break
         key, value = record
         value = "\t".join(list(set(value.split("\t"))))
         output_dbm.Set(key, value).OrDie()
         num_records += 1
         if num_records % 10000 == 0:
             logger.info("Writing: records={}".format(num_records))
         it.Next()
     output_dbm.Close().OrDie()
     logger.info("Writing done: records={}".format(num_records))
     mem_index.Close().OrDie()
     logger.info("Process done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
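Hedged sketch of consuming the index written above: each record maps a normalized translation to tab-separated source keys, mirroring what SearchTranIndex does elsewhere in this collection. The file name and query are illustrative.

import tkrzw

dbm = tkrzw.DBM()
dbm.Open("tran-index.tkh", False, dbm="HashDBM").OrDie()
tsv = dbm.GetStr("じしょ")  # hypothetical normalized translation
src_keys = tsv.split("\t") if tsv else []
dbm.Close().OrDie()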
Example #10
 def SearchPatternMatch(self, mode, text, capacity):
     text = tkrzw_dict.NormalizeWord(text)
     keys = self.keys_file.Search(mode, text, capacity)
     result = []
     for key in keys:
         if len(result) >= capacity: break
         for entry in self.SearchExact(key, capacity - len(result)):
             result.append(entry)
     return result
Example #11
 def GetFeatures(self, entry):
     SCORE_DECAY = 0.95
     word = tkrzw_dict.NormalizeWord(entry["word"])
     features = {word: 1.0}
     pos_score = 1.0
     pos_score_max = 0.0
     pos_features = collections.defaultdict(float)
     for item in entry["item"]:
         pos = "__" + item["pos"]
         new_score = (pos_features.get(pos) or 0.0) + pos_score
         pos_features[pos] = new_score
         pos_score_max = max(pos_score_max, new_score)
         pos_score *= SCORE_DECAY
     for pos, pos_feature_score in pos_features.items():
         features[pos] = pos_feature_score / pos_score_max
     score = 1.0
     rel_words = entry.get("related")
     if rel_words:
         for rel_word in rel_words[:20]:
             rel_word = tkrzw_dict.NormalizeWord(rel_word)
             if rel_word not in features:
                 score *= SCORE_DECAY
                 features[rel_word] = score
     score = max(score, 0.4)
     trans = entry.get("translation")
     if trans:
         for tran in trans[:20]:
             tran = tkrzw_dict.NormalizeWord(tran)
             tran = regex.sub(
                 r"([\p{Han}\p{Katakana}ー]{2,})(する|すること|される|されること|をする|な|に|さ)$",
                 r"\1", tran)
             if tran not in features:
                 score *= SCORE_DECAY
                 features[tran] = score
     score = max(score, 0.2)
     coocs = entry.get("cooccurrence")
     if coocs:
         for cooc in coocs[:20]:
             cooc = tkrzw_dict.NormalizeWord(cooc)
             if cooc not in features:
                 score *= SCORE_DECAY
                 features[cooc] = score
     return features
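GetFeatures returns a sparse word-to-weight map. A minimal sketch of comparing two such maps with cosine similarity, assuming that is roughly how downstream scoring (for example ExpandEntries) uses them; the project's actual scoring may differ.

import math

def CosineSimilarity(features_a, features_b):
    # Dot product over shared keys, normalized by both vector lengths.
    dot = sum(score * features_b.get(word, 0.0) for word, score in features_a.items())
    norm_a = math.sqrt(sum(s * s for s in features_a.values()))
    norm_b = math.sqrt(sum(s * s for s in features_b.values()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)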
Example #12
 def GetTranProb(self, tran_prob_dbm, word, tran):
   max_prob = 0.0
   key = tkrzw_dict.NormalizeWord(word)
   tsv = tran_prob_dbm.GetStr(key)
   norm_tran = tran.lower()
   if tsv:
     fields = tsv.split("\t")
     for i in range(0, len(fields), 3):
       src, trg, prob = fields[i], fields[i + 1], fields[i + 2]
       if src == word and trg.lower() == norm_tran:
         prob = float(prob)
         max_prob = max(max_prob, prob)
   return max_prob
Example #13
 def SearchPatternMatchReverse(self, mode, text, capacity):
   text = tkrzw_dict.NormalizeWord(text)
   keys = self.tran_keys_file.Search(mode, text, capacity, True)
   result = []
   uniq_words = set()
   for key in keys:
     if len(result) >= capacity: break
     for entry in self.SearchExactReverse(key, capacity - len(result) + 10):
       if len(result) >= capacity: break
       word = entry["word"]
       if word in uniq_words: continue
       uniq_words.add(word)
       result.append(entry)
   return result
 def Tokenize(self, language, sentence, lowering, stemming):
     sentence = self.NormalizeSentence(sentence)
     if language == "en":
         if stemming:
             words = self.TokenizeEnStemming(sentence)
         else:
             words = self.TokenizeEnSimple(sentence)
     elif language == "ja":
         words = self.TokenizeJaMecab(sentence, stemming)
     else:
         raise ValueError("unsupported language: " + language)
     if lowering:
         words = [tkrzw_dict.NormalizeWord(x) for x in words]
     return words
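Usage sketch pieced together from calls elsewhere in this collection (the Tokenizer construction and the flag order: lowering, then stemming); the sample sentences are made up.

import tkrzw_tokenizer

tokenizer = tkrzw_tokenizer.Tokenizer()
en_words = tokenizer.Tokenize("en", "Dictionaries are useful", True, True)
ja_words = tokenizer.Tokenize("ja", "辞書は役に立つ", True, False)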
Example #15
 def SearchRelatedWithSeeds(self, seeds, capacity):
   seed_features = collections.defaultdict(float)
   base_weight = 1.0
   uniq_words = set()
   for seed in seeds:
     norm_word = tkrzw_dict.NormalizeWord(seed["word"])
     weight = base_weight
     if norm_word in uniq_words:
       weight *= 0.1
     uniq_words.add(norm_word)
     for word, score in self.GetFeatures(seed).items():
       seed_features[word] += score * weight
     base_weight *= 0.8
   result = self.ExpandEntries(seeds, seed_features, max(int(capacity * 1.2), 100))
   return result[:capacity]
Example #16
 def SearchExact(self, text, capacity):
     result = []
     uniq_words = set()
     for word in text.split(","):
         if len(result) >= capacity: break
         word = tkrzw_dict.NormalizeWord(word)
         if not word: continue
         entries = self.SearchBody(word)
         if not entries: continue
         for entry in entries:
             if len(result) >= capacity: break
             word = entry["word"]
             if word in uniq_words: continue
             uniq_words.add(word)
             result.append(entry)
     return result
 def Predict(self, text):
     words = set(self.tokenizer.Tokenize(self.language, text, True, False))
     if len(words) > 1:
         words.add(tkrzw_dict.NormalizeWord(text))
     cooc_words = {}
     for word in words:
         for cooc_word, cooc_score in self.GetCoocWords(word):
             cooc_words[cooc_word] = (cooc_words.get(cooc_word)
                                      or 0) + cooc_score
     sorted_cooc_words = sorted(cooc_words.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
     rel_words = {}
     num_traces = 0
     for cooc_word, cooc_score in sorted_cooc_words:
         if num_traces >= self.TRACE_COOC_WORDS: break
         if cooc_word in words: continue
         for rel_word, rel_score in self.GetCoocWords(cooc_word):
             if rel_word in words: continue
             rel_words[rel_word] = max(
                 rel_words.get(rel_word) or 0, cooc_score * rel_score)
         num_traces += 1
     sorted_rel_words = sorted(rel_words.items(),
                               key=operator.itemgetter(1),
                               reverse=True)
     check_words = set(words)
     num_cooc_checked = 0
     for cooc_word, _ in sorted_cooc_words:
         if num_cooc_checked >= self.CHECK_COOC_WORDS: break
         if cooc_word in check_words: continue
         check_words.add(cooc_word)
         num_cooc_checked += 1
     num_rel_checked = 0
     for rel_word, _ in sorted_rel_words:
         if num_rel_checked >= self.CHECK_REL_WORDS: break
         if rel_word in check_words: continue
         check_words.add(rel_word)
         num_rel_checked += 1
     scored_rel_words = []
     for rel_word in check_words:
         rel_cooc_words = self.GetCoocWords(rel_word)
         score = self.GetSimilarity(sorted_cooc_words, rel_cooc_words)
         scored_rel_words.append((rel_word, score))
     scored_rel_words = sorted(scored_rel_words,
                               key=operator.itemgetter(1),
                               reverse=True)
     return scored_rel_words, sorted_cooc_words
 def SearchReverse(self, text):
     text = tkrzw_dict.NormalizeWord(text)
     result = []
     src_words = self.SearchTranIndex(text)
     if src_words:
         for src_word in src_words:
             entry = self.SearchBody(src_word)
             if entry:
                 items = []
                 for item in entry["item"]:
                     hit = False
                     translations = item.get("translation")
                     if translations:
                         for tran in translations:
                             if tran.lower() == text:
                                 hit = True
                                 break
                     if hit:
                         items.append(item)
                 if items:
                     entry["item"] = items
                     result.append((src_word, entry))
     if len(result) > 1:
         for record in result:
             entry = record[1]
             score = float(entry.get("score") or 0.0)
             for item in entry["item"]:
                 tran_scores = item.get("translation_score")
                 if tran_scores:
                     value = tran_scores.get(text)
                     if value:
                         value = float(value)
                         if value > score:
                             score = value
             entry["search_score"] = score
         result = sorted(result,
                         key=lambda rec: rec[1]["search_score"],
                         reverse=True)
     return result
Example #19
 def Run(self):
   start_time = time.time()
   logger.info("Process started: input_path={}, output_path={}".format(
     str(self.input_path), self.output_path))
   input_dbm = tkrzw.DBM()
   input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
   os.makedirs(self.output_path, exist_ok=True)
   words = self.ListUpWords(input_dbm)
   keys = sorted(set([tkrzw_dict.NormalizeWord(word) for word, prob in words.items()]))
   key_prefixes = set()
   for key in keys:
     key_prefixes.add(GetKeyPrefix(key))
   key_prefixes = sorted(list(key_prefixes), key=lambda x: 1000 if x == "_" else ord(x))
   self.MakeMain(input_dbm, keys, words)
   self.MakeNavigation(key_prefixes)
   self.MakeOverview()
   self.MakeStyle()
   self.MakePackage(key_prefixes)
   input_dbm.Close().OrDie()
   for label, count in self.label_counters.items():
     logger.info("Adopted label: {}: {}".format(label, count))
   logger.info("Stats: num_words={}, num_trans={}, num_items={}, num_aux_items={}".format(
     self.num_words, self.num_trans, self.num_items, self.num_aux_items))
   logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))
Example #20
 def Run(self):
     start_time = time.time()
     logger.info("Process started: input_path={}, output_path={}".format(
         self.input_path, self.output_path))
     input_dbm = tkrzw.DBM()
     input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
     it = input_dbm.MakeIterator()
     it.First()
     num_entries = 0
     index = collections.defaultdict(list)
     infl_names = ("noun_plural", "verb_singular",
                   "verb_present_participle", "verb_past",
                   "verb_past_participle", "adjective_comparative",
                   "adjective_superative", "adverb_comparative",
                   "adverb_superative")
     while True:
         record = it.GetStr()
         if not record: break
         key, serialized = record
         entry = json.loads(serialized)
         for word_entry in entry:
             word = word_entry["word"]
             prob = max(float(word_entry.get("probability") or "0"),
                        0.0000001)
             score = prob * math.log2(len(word_entry["item"]))
             if "translation" in word_entry:
                 score *= 2
             inflections = set()
             for infl_name in infl_names:
                 inflection = word_entry.get(infl_name)
                 if inflection:
                     for infl_value in regex.split(r"[,|]", inflection):
                         infl_value = tkrzw_dict.NormalizeWord(
                             infl_value.strip())
                         if not regex.search(r"\p{Latin}", infl_value):
                             continue
                         inflections.add(infl_value)
             for inflection in inflections:
                 index[inflection].append((word, score))
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info("Reading: entries={}".format(num_entries))
         it.Next()
     input_dbm.Close().OrDie()
     logger.info("Reading done: entries={}".format(num_entries))
     output_dbm = tkrzw.DBM()
     num_buckets = len(index) * 2
     output_dbm.Open(self.output_path,
                     True,
                     dbm="HashDBM",
                     truncate=True,
                     align_pow=0,
                     num_buckets=num_buckets).OrDie()
     num_entries = 0
     for inflection, scores in index.items():
         scores = sorted(scores, key=lambda x: x[1], reverse=True)
         words = [x[0] for x in scores]
         output_dbm.Set(inflection, "\t".join(words)).OrDie()
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info("Writing: entries={}".format(num_entries))
     output_dbm.Close().OrDie()
     logger.info("Process done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
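Hedged sketch of scanning the inflection index written above: each record maps a normalized inflected form to tab-separated base words, best score first. The path is illustrative.

import tkrzw

dbm = tkrzw.DBM()
dbm.Open("inflection-index.tkh", False, dbm="HashDBM").OrDie()
it = dbm.MakeIterator()
it.First()
while True:
    record = it.GetStr()
    if not record: break
    inflection, tsv = record
    print(inflection, tsv.split("\t")[0])  # the first word has the highest score
    it.Next()
dbm.Close().OrDie()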
    def MakeMainEntry(self, out_file, entry, conj_verbs, conj_adjs,
                      rev_prob_dbm):
        def P(*args, end="\n"):
            esc_args = []
            for arg in args[1:]:
                if isinstance(arg, str):
                    arg = esc(arg)
                esc_args.append(arg)
            print(args[0].format(*esc_args), end=end, file=out_file)

        self.num_words += 1
        yomi, word, trans = entry
        variants = {}
        variants[yomi] = True
        pos = self.tokenizer.GetJaLastPos(word)
        word_prob = 0
        if rev_prob_dbm:
            word_prob = self.GetPhraseProb(rev_prob_dbm, "ja", word)
        if word.endswith(pos[3]):
            prefix = word[:-len(pos[3])]
            for focus_pos, conj_map in [("動詞", conj_verbs),
                                        ("形容詞", conj_adjs)]:
                if pos[1] != focus_pos: continue
                conjs = conj_map.get(word)
                if prefix and not conjs and word_prob >= 0.00001:
                    part_conjs = conj_map.get(pos[3])
                    if part_conjs:
                        conjs = [prefix + x for x in part_conjs]
                if conjs:
                    for conj in sorted(conjs):
                        variants[conj] = True
        stem, prefix, suffix = self.tokenizer.StripJaParticles(word)
        if stem != word:
            if prefix == "を" or regex.search(r"[\p{Han}\p{Katakana}]", stem):
                prefix = ""
            new_word = prefix + stem
            variants[new_word] = True
        for suffix in ("する", "した", "される", "された"):
            if word.endswith(suffix):
                stem = word[:-len(suffix)]
                if self.tokenizer.IsJaWordSahenNoun(stem):
                    variants[stem] = True
        for suffix in ("な", "に", "と"):
            if word.endswith(suffix):
                stem = word[:-len(suffix)]
                if self.tokenizer.IsJaWordAdjvNoun(stem):
                    variants[stem] = True
        if word in variants:
            del variants[word]
        trans = sorted(trans, key=lambda x: x[1], reverse=True)
        P('<idx:entry>')
        P('<div>')
        P('<span class="word">')
        P('<idx:orth>{}', word)
        if variants:
            P('<idx:infl>')
            for variant, _ in variants.items():
                P('<idx:iform value="{}"/>', variant)
            P('</idx:infl>')
        P('</idx:orth>')
        P('</span>')
        if yomi != word:
            P('&#x2003;<span class="pron">({})</span>', yomi)
        P('</div>')
        uniq_trans = set()
        uniq_synsets = set()
        num_lines = 0
        for tran, score, tran_prob, synsets in trans:
            norm_tran = tkrzw_dict.NormalizeWord(tran)
            if norm_tran in uniq_trans: continue
            uniq_trans.add(norm_tran)
            self.num_items += 1
            hit_syn = False
            for syn_id, syn_gross, syn_words in synsets:
                if syn_id in uniq_synsets: continue
                uniq_synsets.add(syn_id)
                hit_syn = True
                P('<div>{}', ", ".join([tran] + syn_words), end="")
                P(' <span class="gross">- {}</span>', syn_gross, end="")
                P('</div>')
                num_lines += 1
                for synonym in syn_words:
                    norm_syn = tkrzw_dict.NormalizeWord(synonym)
                    uniq_trans.add(norm_syn)
            if not hit_syn and num_lines < 8:
                P('<div>{}</div>', tran)
                num_lines += 1
        P('</idx:entry>')
        P('<br/>')
 def ReadEntry(self, word_dict, entry, tran_prob_dbm, aux_trans):
     word = entry["word"]
     norm_word = tkrzw_dict.NormalizeWord(word)
     word_prob = float(entry.get("probability") or 0)
     trans = entry.get("translation")
     if not trans: return
     word_aux_trans = aux_trans.get(word)
     if word_aux_trans:
         word_aux_trans = set(word_aux_trans)
         trans.extend(word_aux_trans)
     dict_trans = set()
     for item in entry["item"]:
         label = item["label"]
         text = item["text"]
         if label in self.supplement_labels:
             for tran in text.split(","):
                 tran = tran.strip()
                 if tran:
                     trans.append(tran)
                     dict_trans.add(tran)
     tran_probs = {}
     if tran_prob_dbm:
         tsv = tran_prob_dbm.GetStr(norm_word)
         if tsv:
             fields = tsv.split("\t")
             for i in range(0, len(fields), 3):
                  src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])
                 if src != word: continue
                 tran_probs[trg] = prob
     word_prob_score = max(0.1, (word_prob**0.5))
     rank_score = 0.5
     uniq_trans = set()
     norm_trans = []
     for tran in trans:
         tran = regex.sub("[・]", "", tran)
         if tran and tran not in uniq_trans:
             norm_trans.append(tran)
             uniq_trans.add(tran)
     for i, tran in enumerate(norm_trans):
         if tkrzw_dict.NormalizeWord(tran) == norm_word: continue
         tran_prob = tran_probs.get(tran) or 0
         tran_stem, tran_prefix, tran_suffix = self.tokenizer.StripJaParticles(
             tran)
         if tran_prefix:
             new_tran = tran_stem + tran_suffix
             new_prob = tran_probs.get(new_tran) or 0
             if (tran_prefix == "を"
                     or regex.search(r"^[\p{Han}\p{Katakana}]", tran_stem)
                     or (new_prob >= 0.01 and new_prob >= tran_prob)):
                 tran = new_tran
                 tran_prob = max(tran_prob, new_prob)
         if i == 0:
             pass
         elif i <= 1 and tran_prob >= 0.01:
             pass
         elif i <= 2 and tran_prob >= 0.02:
             pass
         elif i <= 3 and tran_prob >= 0.04:
             pass
         elif tran_prob >= 0.1:
             pass
         elif tran in dict_trans:
             pass
         else:
             continue
         tran_prob_score = tran_prob**0.75
         dict_score = 0.1 if tran in dict_trans else 0.0
         if word_aux_trans and tran in word_aux_trans: dict_score += 0.1
         synsets = []
         for item in entry["item"]:
             if item["label"] != "wn": continue
             texts = item["text"].split(" [-] ")
             synset_id = ""
             gross = texts[0]
             synonyms = []
             tran_match = False
             for text in texts[1:]:
                 match = regex.search(r"^\[(\w+)\]: (.*)", text)
                 if not match: continue
                 name = match.group(1).strip()
                 text = match.group(2).strip()
                 if name == "synset":
                     synset_id = text
                 elif name == "synonym":
                     for synonym in text.split(","):
                         synonym = synonym.strip()
                         if synonym:
                             synonyms.append(synonym)
                 elif name == "translation":
                     for syn_tran in text.split(","):
                         syn_tran = syn_tran.strip()
                         if syn_tran == tran:
                             tran_match = True
             if synset_id and tran_match:
                 synsets.append((synset_id, gross, synonyms))
         if synsets:
             dict_score += 0.1
         score = word_prob_score + rank_score + tran_prob_score + dict_score
         word_dict[tran].append((word, score, tran_prob, synsets))
         rank_score *= 0.8
     phrases = entry.get("phrase")
     if phrases:
         for phrase in phrases:
             phrase_word = phrase.get("w")
             if not phrase_word or phrase.get("p") or phrase.get("i"):
                 continue
             score = word_prob_score + rank_score
             for phrase_tran in phrase.get("x"):
                 phrase_tran = regex.sub(r"\(.*?\)", "",
                                         phrase_tran).strip()
                 if phrase_tran:
                     word_dict[phrase_tran].append(
                         (phrase_word, score, 0.05, []))
def ProcessWord(word, trans, tokenizer, phrase_prob_dbm, rev_prob_dbm,
                tran_prob_dbm, aux_trans, yomis, min_phrase_prob,
                min_tran_prob):
    phrase_prob = 0.0
    if phrase_prob_dbm:
        tokens = tokenizer.Tokenize("en", word, False, True)[:3]
        norm_phrase = " ".join(tokens)
        phrase_prob = float(phrase_prob_dbm.GetStr(norm_phrase) or 0.0)
    uniq_trans = set()
    check_trans = []
    norm_word = word.lower()
    for pos, tran in trans:
        norm_tran = tran.lower()
        if norm_tran == norm_word: continue
        if norm_tran in uniq_trans: continue
        uniq_trans.add(norm_tran)
        check_trans.append((pos, tran))
        tran = tokenizer.NormalizeJaWordForPos(pos, tran)
        norm_tran = tran.lower()
        if norm_tran in uniq_trans: continue
        uniq_trans.add(norm_tran)
        check_trans.append((pos, tran))
    scored_trans = []
    aux_targets = aux_trans.get(word)
    for pos, tran in check_trans:
        tran_prob = 0.0
        if tran_prob_dbm:
            key = tkrzw_dict.NormalizeWord(word)
            tsv = tran_prob_dbm.GetStr(key)
            if tsv:
                fields = tsv.split("\t")
                extra_records = []
                for i in range(0, len(fields), 3):
                    src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])
                    if src != word: continue
                    if trg != tran: continue
                    tran_prob = float(prob)
        aux_hit = False
        if aux_targets:
            count = aux_targets.count(tran)
            if count > 0:
                aux_hit = True
                tran_prob += count * 0.05
        has_yomi = tran in yomis
        if has_yomi:
            tran_prob += 0.01
        is_known_word = False
        tran_tokens = tokenizer.Tokenize("ja", tran, False, True)
        if len(tran_tokens) == 2 and tran_tokens[-1] in ("する", "な"):
            tran_tokens = tran_tokens[:1]
        if len(tran_tokens) == 1:
            tran_last_pos = tokenizer.GetJaLastPos(tran)
            if tran_last_pos[1] in ("動詞", "形容詞", "副詞"):
                is_known_word = True
            elif tokenizer.IsJaWordSahenNoun(
                    tran) or tokenizer.IsJaWordAdjvNoun(tran):
                is_known_word = True
        rev_prob = 0.0
        if rev_prob_dbm:
            tokens = tokenizer.Tokenize("ja", tran, False, True)[:3]
            norm_phrase = " ".join(tokens)
            rev_prob = float(rev_prob_dbm.GetStr(norm_phrase) or 0.0)
            if rev_prob > 0:
                tran_prob += min(rev_prob**0.5, 0.2)
            else:
                tran_prob *= 0.8
        if tran_prob > 0.04 and (rev_prob > 0.0 or has_yomi):
            pass
        elif tran_prob > 0.01 and is_known_word:
            pass
        else:
            if not aux_hit and phrase_prob < min_phrase_prob: continue
            if not aux_hit and tran_prob < min_tran_prob: continue
        if regex.fullmatch(r"[\p{Katakana}ー]+", tran):
            tran_prob *= 0.7
        elif regex.fullmatch(r"[\p{Hiragana}\p{Katakana}ー]+", tran):
            tran_prob *= 0.8
        scored_trans.append((pos, tran, tran_prob))
    if not scored_trans: return None, None
    scored_trans = sorted(scored_trans, key=lambda x: x[2], reverse=True)
    return (scored_trans, phrase_prob)
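ProcessWord returns a probability-sorted list of (pos, translation, probability) plus the phrase probability. The PrintEntry used below is not shown in this excerpt; a hypothetical consumer might look like this.

def PrintEntrySketch(word, scored_trans):
    # Hypothetical output format: word followed by pos:translation:probability fields.
    fields = [word]
    for pos, tran, tran_prob in scored_trans:
        fields.append("{}:{}:{:.4f}".format(pos, tran, tran_prob))
    print("\t".join(fields))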
def Run(phrase_prob_path, rev_prob_path, tran_prob_path, tran_aux_paths,
        yomi_paths, min_phrase_prob, min_tran_prob):
    logger.info("Start the process")
    phrase_prob_dbm = None
    if phrase_prob_path:
        logger.info("Opening the phrase prob DB: " + phrase_prob_path)
        phrase_prob_dbm = tkrzw.DBM()
        phrase_prob_dbm.Open(phrase_prob_path, False, dbm="HashDBM").OrDie()
    rev_prob_dbm = None
    if rev_prob_path:
        logger.info("Opening the reverse prob DB: " + rev_prob_path)
        rev_prob_dbm = tkrzw.DBM()
        rev_prob_dbm.Open(rev_prob_path, False, dbm="HashDBM").OrDie()
    tran_prob_dbm = None
    if tran_prob_path:
        logger.info("Opening the tran prob DB: " + tran_prob_path)
        tran_prob_dbm = tkrzw.DBM()
        tran_prob_dbm.Open(tran_prob_path, False, dbm="HashDBM").OrDie()
    aux_trans = collections.defaultdict(list)
    for tran_aux_path in tran_aux_paths.split(","):
        tran_aux_path = tran_aux_path.strip()
        if tran_aux_path:
            logger.info("Reading the tran aux file: " + tran_aux_path)
            with open(tran_aux_path) as input_file:
                uniq_keys = set()
                for line in input_file:
                    fields = line.strip().split("\t")
                    if len(fields) < 2: continue
                    word = fields[0]
                    for tran in fields[1:]:
                        uniq_key = word + ":" + tran
                        if uniq_key in uniq_keys: continue
                        aux_trans[word].append(tran)
                        uniq_keys.add(uniq_key)
    yomis = set()
    for yomi_path in yomi_paths.split(","):
        yomi_path = yomi_path.strip()
        if yomi_path:
            logger.info("Reading the yomi file: " + yomi_path)
            with open(yomi_path) as input_file:
                for line in input_file:
                    fields = line.strip().split("\t")
                    if len(fields) < 1: continue
                    yomis.add(fields[0])
    logger.info("Processing the gross.")
    tokenizer = tkrzw_tokenizer.Tokenizer()
    word_dict = collections.defaultdict(list)
    alt_source = None
    alt_targets = None
    num_lines = 0
    for line in sys.stdin:
        num_lines += 1
        if num_lines % 10000 == 0:
            logger.info("Processing the gross: {} lines: {} items".format(
                num_lines, len(word_dict)))
        fields = line.strip().split("\t")
        if len(fields) != 3: continue
        word, pos, text = fields
        if pos == "alternative":
            alt_source = word
            alt_targets = set()
            for alt in regex.split(r"[,;]", text):
                if regex.fullmatch(r"[\p{Han}\p{Hiragana}\p{Katakana}ー]+",
                                   alt):
                    alt_targets.add(alt)
            continue
        text = regex.sub(r"\.$", "", text).strip()
        for tran in regex.split(r"[,;]", text):
            tran = tran.strip()
            if pos == "verb":
                tran = regex.sub(r"^to ", "", tran)
            if pos == "noun":
                tran = regex.sub(r"^(a|an|the) ", "", tran)
            tran = regex.sub("^[-~] ", "", tran)
            tran = regex.sub(" [-~]$", "", tran)
            if not regex.fullmatch(r"[-_\p{Latin}0-9'. ]+", tran): continue
            tokens = tran.split(" ")
            if len(tokens) < 1 or len(tokens) > 4: continue
            word_dict[tran].append((pos, word))
            if alt_source == word:
                for alt in alt_targets:
                    word_dict[tran].append((pos, alt))
    norm_word_dict = collections.defaultdict(list)
    for word, trans in word_dict.items():
        scored_trans, phrase_prob = ProcessWord(word, trans, tokenizer,
                                                phrase_prob_dbm, rev_prob_dbm,
                                                tran_prob_dbm, aux_trans,
                                                yomis, min_phrase_prob,
                                                min_tran_prob)
        if scored_trans:
            key = tkrzw_dict.NormalizeWord(word)
            norm_word_dict[key].append((word, scored_trans, phrase_prob))
    for key, entries in norm_word_dict.items():
        sum_phrase_prob = 0.0
        for word, scored_trans, phrase_prob in entries:
            sum_phrase_prob += phrase_prob
        for word, scored_trans, phrase_prob in entries:
            if sum_phrase_prob > 0:
                if key == word:
                    if phrase_prob / sum_phrase_prob < 0.6: continue
                else:
                    if phrase_prob / sum_phrase_prob < 0.8: continue
            PrintEntry(word, scored_trans)
    if tran_prob_dbm:
        tran_prob_dbm.Close().OrDie()
    if phrase_prob_dbm:
        phrase_prob_dbm.Close().OrDie()
    logger.info("Process done")
Example #25
 def GetResultKeys(self, entries):
     keys = set()
     for entry in entries:
         keys.add(tkrzw_dict.NormalizeWord(entry["word"]))
     return keys
Example #26
 def Run(self):
     start_time = time.time()
     logger.info("Process started: input_path={}, output_path={}".format(
         self.input_path, self.output_path))
     mem_index = tkrzw.DBM()
     mem_index.Open("", True, dbm="BabyDBM").OrDie()
     input_dbm = tkrzw.DBM()
     input_dbm.Open(self.input_path, False, dbm="HashDBM").OrDie()
     it = input_dbm.MakeIterator()
     it.First()
     num_entries = 0
     num_translations = 0
     tran_dict = set()
     while True:
         record = it.GetStr()
         if not record: break
         key, serialized = record
         entry = json.loads(serialized)
         for word_entry in entry:
             word = word_entry["word"]
             prob = max(float(word_entry.get("probability") or "0"),
                        0.0000001)
             aoa = min(float(word_entry.get("aoa") or "20"), 20.0)
             score = prob * ((30 - aoa) / 10)
             word_trans = word_entry.get("translation") or []
             phrase_trans = []
             phrases = word_entry.get("phrase")
             if phrases:
                 for phrase in phrases:
                     if phrase.get("p") or phrase.get("i"): continue
                     for phrase_tran in phrase.get("x"):
                         phrase_tran = regex.sub(r"\(.*?\)", "",
                                                 phrase_tran).strip()
                         if phrase_tran:
                             phrase_trans.append(phrase_tran)
             weight_word_trans = []
             for trans, weight in [(word_trans, 1.0), (phrase_trans, 0.5)]:
                 for word_tran in trans:
                     weight_word_trans.append((word_tran, weight))
                     match = regex.search(
                         r"([\p{Han}\p{Katakana}ー]{2,})(する|すること|される|されること|をする)$",
                         word_tran)
                     if match:
                         short_word_tran = word_tran[:-len(match.group(2))]
                         if short_word_tran:
                             weight_word_trans.append(
                                 (short_word_tran, weight * 0.8))
                     short_word_tran = self.tokenizer.CutJaWordNounParticle(
                         word_tran)
                     if short_word_tran != word_tran:
                         weight_word_trans.append(
                             (short_word_tran, weight * 0.8))
                     match = regex.search(
                         r"([\p{Han}\p{Katakana}ー]{2,})(的|的な|的に)$",
                         word_tran)
                     if match:
                         short_word_tran = word_tran[:-len(match.group(2))]
                         if short_word_tran:
                             weight_word_trans.append(
                                 (short_word_tran, weight * 0.8))
                     match = regex.search(
                         r"([\p{Han}]{2,})(が|の|を|に|へ|と|より|から|で|や|な|なる|たる)$",
                         word_tran)
                     if match:
                         short_word_tran = word_tran[:-len(match.group(2))]
                         if short_word_tran:
                             weight_word_trans.append(
                                 (short_word_tran, weight * 0.8))
             uniq_trans = set()
             for tran, weight in weight_word_trans:
                 norm_tran = tkrzw_dict.NormalizeWord(tran)
                 if norm_tran in uniq_trans: continue
                 uniq_trans.add(norm_tran)
                 pair = "{}\t{:.8f}".format(word, score * weight)
                 score *= 0.98
                 mem_index.Append(norm_tran, pair, "\t").OrDie()
             for item in word_entry["item"]:
                 if item["label"] in self.supplement_labels:
                     for tran in item["text"].split(","):
                         tran = tran.strip()
                         if tran:
                             tran_dict_key = word + "\t" + tran
                             tran_dict.add(tran_dict_key)
             num_translations += len(uniq_trans)
         num_entries += 1
         if num_entries % 10000 == 0:
             logger.info("Reading: entries={}, translations={}".format(
                 num_entries, num_translations))
         it.Next()
     input_dbm.Close().OrDie()
     logger.info("Reading done: entries={}, translations={}".format(
         num_entries, num_translations))
     output_dbm = tkrzw.DBM()
     num_buckets = mem_index.Count() * 2
     output_dbm.Open(self.output_path,
                     True,
                     dbm="HashDBM",
                     truncate=True,
                     align_pow=0,
                     num_buckets=num_buckets).OrDie()
     tran_prob_dbm = None
     if self.tran_prob_path:
         tran_prob_dbm = tkrzw.DBM()
         tran_prob_dbm.Open(self.tran_prob_path, False,
                            dbm="HashDBM").OrDie()
     it = mem_index.MakeIterator()
     it.First()
     num_records = 0
     while True:
         record = it.GetStr()
         if not record: break
         key, value = record
         scored_trans = []
         uniq_words = set()
         fields = value.split("\t")
         for i in range(0, len(fields), 2):
             word = fields[i]
             score = float(fields[i + 1])
             if word in uniq_words: continue
             uniq_words.add(word)
             if tran_prob_dbm:
                 prob = self.GetTranProb(tran_prob_dbm, word, key)
                 tran_dict_key = word + "\t" + key
                 prob = max(prob, 0.000001)
                 if tran_dict_key in tran_dict:
                     prob += 0.1
                 score = (score * prob)**0.5
             scored_trans.append((word, score))
         scored_trans = sorted(scored_trans,
                               key=lambda x: x[1],
                               reverse=True)
         value = "\t".join([x[0] for x in scored_trans])
         output_dbm.Set(key, value).OrDie()
         num_records += 1
         if num_records % 10000 == 0:
             logger.info("Writing: records={}".format(num_records))
         it.Next()
     if tran_prob_dbm:
         tran_prob_dbm.Close().OrDie()
     output_dbm.Close().OrDie()
     logger.info("Writing done: records={}".format(num_records))
     mem_index.Close().OrDie()
     logger.info("Process done: elapsed_time={:.2f}s".format(time.time() -
                                                             start_time))
Example #27
 def MakeMainEntry(self, out_file, entry, input_dbm, keys, inflections):
   def P(*args, end="\n"):
     esc_args = []
     for arg in args[1:]:
       if isinstance(arg, str):
         arg = esc(arg)
       esc_args.append(arg)
     print(args[0].format(*esc_args), end=end, file=out_file)
   word = entry["word"]
   prob = float(entry.get("probability") or "0")
   pronunciation = entry.get("pronunciation")
   translations = entry.get("translation")
   is_major_word = prob >= 0.00001 and not regex.search("[A-Z]", word)
   poses = set()
   sub_poses = set()
   for item in entry["item"][:10]:
     if item["label"] in self.supplement_labels:
       sub_poses.add(item["pos"])
     else:
       poses.add(item["pos"])
   if not poses:
     poses = sub_poses
   infl_groups = collections.defaultdict(list)
   if not regex.search(r"[A-Z].*[A-Z]", word):
     for attr_list in INFLECTIONS:
       for name, label in attr_list:
         pos, suffix = name.split("_", 1)
         if pos not in poses: continue
         if name == "verb_singular":
           suffix = "present 3ps"
         else:
           suffix = suffix.replace("_", " ")
         value = entry.get(name)
         if value:
           infl_groups[pos].append((suffix, value, label))
   main_labels = set()
   label_items = collections.defaultdict(list)
   for item in entry["item"]:
     label = item["label"]
     if label in self.preferable_labels:
       main_labels.add(label)
     label_items[label].append(item)
   best_label = None
   is_stop = word in ARTICLES or word in PARTICLES
   if len(main_labels) >= 2:
     min_cost = None
     for label in main_labels:
       is_best = label in self.best_labels
       is_vetted = not is_stop and label in self.vetted_labels
       num_items = 0
       length_cost = 0
       for item in label_items[label]:
         text = item["text"]
         if not is_best and not is_vetted and not CheckSafeText(text):
           length_cost += 10.0
         if text.startswith("[translation]:"): continue
         text = regex.sub(r" \[-+\] .*", "", text).strip()
         if not text: continue
         num_items += 1
         text = regex.sub(r"[^-_\p{Latin}\d']+", " ", text).strip()
         num_words = text.count(" ") + 1
         length_cost += abs(math.log(9) - math.log(num_words))
       if not num_items: continue
       item_cost = abs(math.log(5) - math.log(num_items))
       length_cost = length_cost / num_items
       if is_best:
         quality_cost = 0.8
       elif is_vetted:
         quality_cost = 1.0
       else:
         quality_cost = 1.25
       cost = (item_cost + 0.5) * (length_cost + 1.0) * quality_cost
       if not min_cost or cost < min_cost:
         best_label = label
         min_cost = cost
   elif len(main_labels) >= 1:
     best_label = list(main_labels)[0]
   else:
     best_label = entry["item"][0]["label"]
   self.label_counters[best_label] += 1
   items = []
   sub_items = []
   tran_items = []
   for item in entry["item"]:
     label = item["label"]
     text = item["text"]
     if text.startswith("[translation]:"):
       tran_items.append(item)
     elif label == best_label:
       items.append(item)
     elif label in main_labels and is_major_word and not regex.search(r"\w{20,}", text):
       sub_items.append(item)
   if not items:
     items = sub_items
   if not items:
     items = tran_items
   if not items: return
   items = self.MergeShownItems(items, sub_items)
   self.num_words += 1
   P('<idx:entry>')
   P('<div>')
   P('<span class="word">')
   P('<idx:orth>{}', word)
   for pos, values in infl_groups.items():
     kind_infls = []
     for kind, value, label in values:
       for infl in value.split(","):
         infl = infl.strip()
         if not infl: continue
         if inflections.get(infl) != word: continue
         kind_infls.append((kind, infl))
     if not kind_infls: continue
     P('<idx:infl inflgrp="{}">', pos)
     for kind, infl in kind_infls:
       P('<idx:iform name="{}" value="{}"/>', kind, infl)
     P('</idx:infl>')
   alternatives = entry.get("alternative")
   if alternatives:
     alt_words = []
     for alternative in alternatives:
       alt_norm = tkrzw_dict.NormalizeWord(alternative)
       if not alt_norm or alt_norm in keys or alt_norm in inflections:
         continue
       alt_words.append(alternative)
     if alt_words:
       P('<idx:infl inflgrp="common">')
       for alt_word in alt_words:
         P('<idx:iform name="alternative" value="{}"/>', alt_word)
       P('</idx:infl>')
   P('</idx:orth>')
   P('</span>')
   if pronunciation:
     P('&#x2003;<span class="pron">/{}/</span>', pronunciation)
   P('</div>')
   if translations:
     self.num_trans += 1
     P('<div>{}</div>', ", ".join(translations[:6]))
   for item in items:
     self.MakeMainEntryItem(P, item)
   phrases = entry.get("phrase")
   if phrases:
     for phrase in phrases:
       self.MakeMainEntryPhraseItem(P, phrase)
   parents = entry.get("parent")
   if parents:
     for parent in parents:
       self.MakeMainEntryParentItem(P, parent, input_dbm)
   for pos, values in infl_groups.items():
     P('<div>')
     for kind, value, label in values:
       P('<span class="attr">[{}]</span> {}', label, value)
     P('</div>')
   P('</idx:entry>')
   P('<br/>')