from collections import defaultdict


def process_each_prune(tup):
    """Worker function called by the pool.

    If this were defined inside the class, the whole class would be
    copied and sent to each process; defining it at module level avoids
    that. Batching also helps: passing (sent_list, piece, trie) is
    faster than one call per sentence with (sent, piece, trie).
    """
    (items, piece, trie) = tup
    vsum = 0
    freq = defaultdict(int)
    inverted = defaultdict(int)
    L = Lattice()
    for item in items:
        if item is None:
            continue
        (s, score) = item
        vsum += score
        L.set_sentence(s)
        L.populate_nodes(piece, trie)
        for word in L.Viterbi(ret_piece=True):
            freq[word] += score
            inverted[word] += score
    return (vsum, freq, inverted)
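

# The sketch below is illustrative, not part of the trainer: it shows one
# way a batched worker like process_each_prune might be driven and its
# partial results merged. `run_prune_pool` and all parameter names are
# hypothetical.
def run_prune_pool(sentence_freqs, piece, trie, n_proc=4):
    from multiprocessing import Pool
    # Shard the (sentence, score) pairs so each process gets one batch.
    chunks = [sentence_freqs[i::n_proc] for i in range(n_proc)]
    with Pool(n_proc) as pool:
        results = pool.map(process_each_prune,
                           [(chunk, piece, trie) for chunk in chunks])
    # Merge the per-process partial sums.
    vsum, freq, inverted = 0, defaultdict(int), defaultdict(int)
    for v, f, inv in results:
        vsum += v
        for k, c in f.items():
            freq[k] += c
        for k, c in inv.items():
            inverted[k] += c
    return vsum, freq, inverted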


def prune_step_1_always_keep_alternative(self):
    """
    Returns:
        always_keep(dict): piece -> whether the piece must be kept
        alternatives(dict): piece -> its second-best segmentation
    """
    current_piece = self.SentencePiece.get_pieces()
    # Track the results in dicts keyed by the piece string.
    always_keep = dict()
    alternatives = defaultdict(list)
    # First, segment each current sentencepiece to know how it would be
    # resegmented if it were removed from the vocabulary.
    for key, score in current_piece.items():
        L = Lattice()
        L.set_sentence(key)
        L.populate_nodes(current_piece, self.Trie)
        nbests = L.NBest(2, ret_piece=True)
        if len(nbests) == 1:
            # Only one way to segment this piece: it must be kept.
            always_keep[key] = True
        elif len(nbests[0]) >= 2:
            # The best path already splits it into other pieces, so it
            # is redundant and safe to drop.
            always_keep[key] = False
        elif len(nbests[0]) == 1:
            # The piece itself is the best path; record the runner-up
            # as its alternative segmentation.
            always_keep[key] = True
            alternatives[key] = nbests[1]
    return always_keep, alternatives
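

# For context, a condensed sketch (hypothetical, simplified from the
# SentencePiece unigram pruning criterion) of how always_keep and
# alternatives are consumed together with the (vsum, freq, inverted)
# totals produced by process_each_prune. The real algorithm also adjusts
# the frequency sums when a piece is removed; that is omitted here.
def prune_candidates_sketch(pieces, always_keep, alternatives,
                            vsum, freq, inverted):
    import math
    sum_freq = sum(freq.values())
    candidates = []
    for key in pieces:
        if freq[key] == 0 or not always_keep[key]:
            continue  # unused or redundant: droppable outright
        if not alternatives[key]:
            continue  # no alternative segmentation: keep unconditionally
        # Fraction of the corpus whose Viterbi path uses this piece.
        f = inverted[key] / vsum
        # Log-likelihood of the piece itself versus its alternative
        # segmentation emitted piece by piece (counts floored at 1 to
        # avoid log(0) for unseen alternatives).
        logprob = math.log(freq[key]) - math.log(sum_freq)
        logprob_alt = sum(math.log(max(freq[p], 1)) - math.log(sum_freq)
                          for p in alternatives[key])
        candidates.append((key, f * (logprob - logprob_alt)))
    # Pieces whose removal would cost the most likelihood rank first.
    return sorted(candidates, key=lambda x: -x[1])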


def encode_one_sent(self, sent):
    # TODO: can be deleted if encode_pool works well.
    """
    Arguments:
        sent(str): sentence to segment with the sentencepiece vocabulary
    Returns:
        tokenize_sent(str): space-delimited tokenized sentence
    """
    L = Lattice()
    L.set_sentence(sent)
    L.populate_nodes(self.SentencePiece.get_pieces(), self.Trie)
    tokenize_sent = " ".join(L.Viterbi(ret_piece=True))
    # Tokenization must be lossless: joining the pieces restores the input.
    assert "".join(tokenize_sent.split(" ")) == sent
    return tokenize_sent
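

# Hypothetical usage, assuming `trainer` is an instance of the enclosing
# class with its vocabulary (SentencePiece) and Trie already built:
#
#     out = trainer.encode_one_sent("これはぺんです")
#     print(out)                                # e.g. "これは ぺん です"
#     assert "".join(out.split(" ")) == "これはぺんです"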


def process_each(tup):
    """Pool worker: Viterbi-segment a batch of sentences.

    tup: tuple(sentence_list, piece, trie)
    Returns a list of piece lists, one per sentence.
    """
    (items, piece, trie) = tup
    L = Lattice()
    ret = []
    for item in items:
        if item is None:
            continue
        L.set_sentence(item)
        L.populate_nodes(piece, trie)
        ret.append(L.Viterbi(ret_piece=True))
    return ret


def process_each_encode(tup):
    """
    Arguments:
        tup: tuple(sentence_list, piece, trie)
    Returns:
        ret: list of tokenized (space-delimited) sentences
    """
    (items, piece, trie) = tup
    ret = []
    L = Lattice()
    for sent in items:
        if sent is None:
            continue
        L.set_sentence(sent)
        L.populate_nodes(piece, trie)
        tokenize_sent = " ".join(L.Viterbi(ret_piece=True))
        ret.append(tokenize_sent)
        assert "".join(tokenize_sent.split(" ")) == sent
    return ret


def process_each_estep(tup):
    """Pool worker for the EM E-step.

    tup: tuple(sentence_freq_list, pieces, trie)
    Returns (expected, objective, num_tokens) accumulated over the batch.
    """
    expected = defaultdict(int)
    objective = 0
    num_tokens = 0
    (items, pieces, trie) = tup
    L = Lattice()
    for item in items:
        if item is None:
            continue
        (key, freq) = item
        L.set_sentence(key)
        L.populate_nodes(pieces, trie)
        # Marginal piece expectations, weighted by the sentence frequency.
        Z, ret_expected = L.populate_marginal(freq)
        for piece, val in ret_expected.items():  # avoid shadowing `key`
            expected[piece] += val
        num_tokens += len(L.Viterbi())
        objective -= Z
    return (expected, objective, num_tokens)
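

# Hypothetical driver for one EM iteration, showing how the E-step worker
# above might be sharded and its results merged. `run_em_step` and all
# names are illustrative; note that the real SentencePiece M-step applies
# digamma-based (Bayesian) smoothing rather than the plain normalization
# shown here.
def run_em_step(sentence_freqs, pieces, trie, n_proc=4):
    import math
    from multiprocessing import Pool
    chunks = [sentence_freqs[i::n_proc] for i in range(n_proc)]
    with Pool(n_proc) as pool:
        results = pool.map(process_each_estep,
                           [(chunk, pieces, trie) for chunk in chunks])
    # Merge the per-process expected counts, objective, and token counts.
    expected = defaultdict(int)
    objective, num_tokens = 0, 0
    for exp, obj, n in results:
        for k, v in exp.items():
            expected[k] += v
        objective += obj
        num_tokens += n
    # M-step: renormalize expected counts into new piece log-probabilities.
    total = sum(expected.values())
    new_pieces = {k: math.log(v) - math.log(total)
                  for k, v in expected.items()}
    return new_pieces, objective / max(num_tokens, 1)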