def process_each_prune(tup):
    """Pool worker for the vocabulary-pruning step.

    Defined at module level on purpose: a bound method handed to a Pool
    would drag a pickled copy of the whole class into every worker
    process.  Passing a whole chunk of sentences per call
    (sent_list, piece, trie) is faster than one call per sentence.

    Args:
        tup: (items, piece, trie) where ``items`` holds
            (sentence, score) pairs, with None placeholders skipped.

    Returns:
        tuple: (vsum, freq, inverted)
            vsum: total score over the chunk.
            freq (defaultdict): piece -> score summed per occurrence on
                the Viterbi best path.
            inverted (defaultdict): accumulated identically to ``freq``
                here.  NOTE(review): in the reference implementation the
                inverted index counts sentences containing a piece, not
                occurrences — confirm this duplication is intended.
    """
    items, piece, trie = tup
    vsum = 0
    freq = defaultdict(int)
    inverted = defaultdict(int)
    lattice = Lattice()
    for entry in items:
        if entry is None:
            continue
        sent, score = entry
        vsum += score
        lattice.set_sentence(sent)
        lattice.populate_nodes(piece, trie)
        for token in lattice.Viterbi(ret_piece=True):
            freq[token] += score
            inverted[token] += score
    return (vsum, freq, inverted)
def encode_one_sent(self, sent):
    # TODO: may be deleted once encode_pool works reliably.
    """Segment one sentence with the current sentencepiece vocabulary.

    Args:
        sent (str): sentence to segment.

    Returns:
        str: space-joined tokenization of ``sent``.
    """
    lattice = Lattice()
    lattice.set_sentence(sent)
    lattice.populate_nodes(self.SentencePiece.get_pieces(), self.Trie)
    tokenized = " ".join(lattice.Viterbi(ret_piece=True))
    # Sanity check: concatenating the tokens must reproduce the input.
    assert "".join(tokenized.split(" ")) == sent
    return tokenized
def process_each(tup):
    """Pool worker: Viterbi-segment each sentence in a chunk.

    Args:
        tup: (sentence_list, piece, trie), bundled so a single pickled
            argument crosses the process boundary.

    Returns:
        list: one token list (Viterbi best path) per non-None sentence,
        in input order.
    """
    sentences, piece, trie = tup
    lattice = Lattice()
    results = []
    for sent in sentences:
        if sent is None:
            continue
        lattice.set_sentence(sent)
        lattice.populate_nodes(piece, trie)
        results.append(lattice.Viterbi(ret_piece=True))
    return results
def process_each_encode(tup):
    """Pool worker: encode a chunk of sentences into space-joined tokens.

    Args:
        tup: (sentence_list, piece, trie), bundled so a single pickled
            argument crosses the process boundary.

    Returns:
        list: one space-joined tokenized string per non-None sentence,
        in input order.
    """
    sentences, piece, trie = tup
    encoded = []
    lattice = Lattice()
    for sent in sentences:
        if sent is None:
            continue
        lattice.set_sentence(sent)
        lattice.populate_nodes(piece, trie)
        joined = " ".join(lattice.Viterbi(ret_piece=True))
        encoded.append(joined)
        # Sanity check: the tokens must concatenate back to the input.
        assert "".join(joined.split(" ")) == sent
    return encoded
def process_each_estep(tup):
    """Pool worker for the EM E-step over a chunk of sentences.

    Args:
        tup: (items, pieces, trie) where ``items`` holds
            (sentence, freq) pairs, with None placeholders skipped.

    Returns:
        tuple: (expected, objective, num_tokens)
            expected (defaultdict): piece -> expected count accumulated
                across the chunk via ``populate_marginal``.
            objective (float): negated sum of the per-sentence marginal
                log-likelihoods Z.
            num_tokens (int): total token count over the Viterbi paths.
    """
    expected = defaultdict(int)
    objective = 0
    num_tokens = 0
    (items, pieces, trie) = tup
    L = Lattice()
    for item in items:
        if item is None:
            continue
        # Fix: the sentence was unpacked into `key`, which the inner
        # loop below then shadowed with piece keys; renamed to `sent`
        # for clarity (no behavior change — the outer name was unused
        # after the shadow point).
        (sent, freq) = item
        L.set_sentence(sent)
        L.populate_nodes(pieces, trie)
        Z, ret_expected = L.populate_marginal(freq)
        for piece_key, val in ret_expected.items():
            expected[piece_key] += val
        num_tokens += len(L.Viterbi())
        objective -= Z
    return (expected, objective, num_tokens)