Example #1
from collections import defaultdict
def process_each_prune(tup):
    """
    poolで呼ばれる関数。
    classのなかにかくと、classごとcopyされてeach processに渡される。
    それを防ぐために、外に書く

    1文ずつの処理。(sent,piece,trie)よりも、まとめた方が早い(sent_list, piece,trie)
    """
    (items, piece, trie) = tup

    vsum = 0
    freq = defaultdict(int)
    inverted = defaultdict(int)

    L = Lattice()
    for item in items:
        if item is None:
            continue

        (s, score) = item
        vsum += score
        L.set_sentence(s)
        L.populate_nodes(piece, trie)

        for word in L.Viterbi(ret_piece=True):
            freq[word] += score
            inverted[word] += score
    return (vsum, freq, inverted)
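Each worker returns only partial results, so the caller has to merge them. A minimal dispatch-and-merge sketch, assuming a hypothetical `batches` list of (sentence_batch, piece, trie) tuples prepared by the caller:

from collections import defaultdict
from multiprocessing import Pool

def merge_prune_results(results):
    """Merge the (vsum, freq, inverted) partials returned by each worker."""
    vsum, freq, inverted = 0, defaultdict(int), defaultdict(int)
    for v, f, inv in results:
        vsum += v
        for w, s in f.items():
            freq[w] += s
        for w, s in inv.items():
            inverted[w] += s
    return vsum, freq, inverted

# Hypothetical usage, with `batches` built by the caller:
# with Pool(processes=4) as pool:
#     vsum, freq, inverted = merge_prune_results(
#         pool.map(process_each_prune, batches))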
Example #2
    def prune_step_1_always_keep_alternative(self):
        """
        Return
            always_keep(dict)
            alternatives(dict)
        """
        current_piece = self.SentencePiece.get_pieces()
        # managed in dicts keyed by piece
        always_keep = dict()
        alternatives = defaultdict(list)

        # First, segment each current sentencepiece to know how it would be re-segmented if it were removed from the vocabulary.
        for key, score in current_piece.items():
            L = Lattice()
            L.set_sentence(key)
            L.populate_nodes(current_piece, self.Trie)
            nbests = L.NBest(2, ret_piece=True)

            if len(nbests) == 1:  # the piece itself is the only segmentation
                always_keep[key] = True

            elif len(nbests[0]) >= 2:  # the best path avoids the piece entirely
                always_keep[key] = False

            elif len(nbests[0]) == 1:  # the best path is the piece; record the runner-up
                always_keep[key] = True
                alternatives[key] = nbests[1]

        #print("alt=>",alternatives)
        return always_keep, alternatives
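A toy walk-through of the three branches above, for a hypothetical piece "ab" whose surface can also be covered by the single-character pieces "a" and "b" (the lattice over a piece's own surface always contains the piece itself as one path):

# Hypothetical n-best outcomes for the piece "ab":
nbests = [["ab"], ["a", "b"]]   # best path is the piece itself (len 1)
# -> always_keep["ab"] = True, alternatives["ab"] = ["a", "b"]:
#    kept for now, but if pruned its occurrences fall back to "a" + "b".

nbests = [["a", "b"], ["ab"]]   # best path already has >= 2 tokens
# -> always_keep["ab"] = False: redundant even with the full vocabulary.

nbests = [["ab"]]               # the only way to segment the surface
# -> always_keep["ab"] = True, with no alternative recorded.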
Example #3
    def encode_one_sent(self, sent):
        # TODO: can be deleted once encode_pool works reliably
        """
        Arguments:
            sent (str): sentence to segment with the sentencepiece vocabulary
        Returns:
            tokenize_sent (str): space-separated tokenized sentence
        """
        L = Lattice()
        L.set_sentence(sent)
        L.populate_nodes(self.SentencePiece.get_pieces(), self.Trie)
        tokenize_sent = " ".join(L.Viterbi(ret_piece=True))
        assert "".join(tokenize_sent.split(" ")) == sent
        return tokenize_sent
Example #4
def process_each(tup):
    """Viterbi-segment each sentence in the batch; worker for the pool."""
    (items, piece, trie) = tup

    L = Lattice()

    ret = []
    for item in items:
        if item is None:
            continue

        L.set_sentence(item)
        L.populate_nodes(piece, trie)
        ret.append(L.Viterbi(ret_piece=True))

    return ret
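A sketch of how such a batched worker might be driven, assuming a hypothetical `chunks` helper and that `sents`, `piece`, and `trie` come from the surrounding trainer:

from multiprocessing import Pool

def chunks(seq, n):
    """Split seq into at most n roughly equal batches (hypothetical helper)."""
    k = (len(seq) + n - 1) // n
    return [seq[i:i + k] for i in range(0, len(seq), k)]

# Hypothetical usage:
# with Pool(processes=4) as pool:
#     args = [(batch, piece, trie) for batch in chunks(sents, 4)]
#     segmented = [s for batch in pool.map(process_each, args) for s in batch]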
Example #5
def process_each_encode(tup):
    """
    tup: tuple(sentence_list, piece, trie)
    
    return: tokenized_sentence_list
    """
    (items, piece, trie) = tup

    ret = []
    L = Lattice()
    for sent in items:
        if sent is None:
            continue
        L.set_sentence(sent)
        L.populate_nodes(piece, trie)
        tokenize_sent = " ".join(L.Viterbi(ret_piece=True))
        ret.append(tokenize_sent)
        assert "".join(tokenize_sent.split(" ")) == sent
    return ret
Example #6
from collections import defaultdict
def process_each_estep(tup):
    """E-step worker: accumulate expected piece counts, objective, and token count over a batch."""
    expected = defaultdict(int)
    objective = 0
    num_tokens = 0

    (items, pieces, trie) = tup
    L = Lattice()
    for item in items:
        if item is None:
            continue
        (key, freq) = item
        L.set_sentence(key)
        L.populate_nodes(pieces, trie)
        Z, ret_expected = L.populate_marginal(freq)

        for piece, val in ret_expected.items():
            expected[piece] += val

        N = len(L.Viterbi())
        num_tokens += N
        objective -= Z
    return (expected, objective, num_tokens)
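The per-worker partials still have to be reduced before the M-step. A minimal reduction sketch, assuming `results` is the list obtained by mapping process_each_estep over the batches:

from collections import defaultdict

def reduce_estep(results):
    """Sum the (expected, objective, num_tokens) partials from all workers."""
    expected = defaultdict(int)
    objective = 0.0
    num_tokens = 0
    for exp, obj, n in results:
        for piece, val in exp.items():
            expected[piece] += val
        objective += obj
        num_tokens += n
    return expected, objective, num_tokens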