Пример #1
0
def process_each_prune(tup):
    """
    Pool worker: accumulate Viterbi piece scores over a batch of sentences.

    Kept at module level on purpose: per the original author's note, writing
    it inside the class would cause the whole class to be copied and handed
    to each worker process.

    Works on a batch of sentences; the original author notes that passing
    (sent_list, piece, trie) is faster than passing (sent, piece, trie) once
    per sentence.

    Arguments:
        tup(tuple): (items, piece, trie). items is an iterable of
            (sentence, score) pairs, where None entries are skipped.
            piece and trie are forwarded unchanged to
            Lattice.populate_nodes.
    Returns:
        tuple: (vsum, freq, inverted). vsum is the sum of all scores seen.
            freq and inverted are defaultdict(int) mappings from each piece
            on a Viterbi path to the accumulated score of the sentences it
            appeared in (both receive the same increments).
    """
    sentences, pieces, trie = tup

    total_score = 0
    freq = defaultdict(int)
    inverted = defaultdict(int)

    # A single lattice is reused for every sentence in the batch.
    lattice = Lattice()
    for entry in sentences:
        if entry is not None:
            sentence, weight = entry
            total_score += weight
            lattice.set_sentence(sentence)
            lattice.populate_nodes(pieces, trie)

            for token in lattice.Viterbi(ret_piece=True):
                freq[token] += weight
                inverted[token] += weight
    return (total_score, freq, inverted)
Пример #2
0
 def encode_one_sent(self, sent):
     """
     Tokenize a single sentence with the sentence piece vocabulary.

     Arguments:
         sent(str): sentence to split using the sentence piece vocabulary
     Returns:
         tokenize_sent(str): the Viterbi pieces joined by single spaces
     """
     # TODO: presumably this can be removed once encode_pool works well.
     lattice = Lattice()
     lattice.set_sentence(sent)
     lattice.populate_nodes(self.SentencePiece.get_pieces(), self.Trie)
     best_pieces = lattice.Viterbi(ret_piece=True)
     tokenize_sent = " ".join(best_pieces)
     # Sanity check: stripping the separators must give back the input.
     assert tokenize_sent.replace(" ", "") == sent
     return tokenize_sent
def process_each(tup):
    """
    Compute the Viterbi segmentation of every sentence in a batch.

    Arguments:
        tup(tuple): (items, piece, trie). items is an iterable of
            sentences, where None entries are skipped. piece and trie are
            forwarded unchanged to Lattice.populate_nodes.
    Returns:
        list: one Lattice.Viterbi(ret_piece=True) result per non-None
            sentence, in input order.
    """
    sentences, pieces, trie = tup

    # A single lattice is reused for every sentence in the batch.
    lattice = Lattice()

    segmentations = []
    for sentence in sentences:
        if sentence is not None:
            lattice.set_sentence(sentence)
            lattice.populate_nodes(pieces, trie)
            segmentations.append(lattice.Viterbi(ret_piece=True))

    return segmentations
Пример #4
0
def process_each_encode(tup):
    """
    Tokenize a batch of sentences into space-separated piece strings.

    Arguments:
        tup(tuple): (sentence_list, piece, trie). None entries in
            sentence_list are skipped. piece and trie are forwarded
            unchanged to Lattice.populate_nodes.
    Returns:
        list: tokenized sentences, one per non-None input, in input order.
    """
    sentences, pieces, trie = tup

    tokenized = []
    # A single lattice is reused for every sentence in the batch.
    lattice = Lattice()
    for sentence in sentences:
        if sentence is not None:
            lattice.set_sentence(sentence)
            lattice.populate_nodes(pieces, trie)
            best_pieces = lattice.Viterbi(ret_piece=True)
            joined = " ".join(best_pieces)
            tokenized.append(joined)
            # Sanity check: stripping the separators must give back the input.
            assert joined.replace(" ", "") == sentence
    return tokenized
Пример #5
0
def process_each_estep(tup):
    """
    Accumulate marginal statistics over a batch of weighted sentences.

    Arguments:
        tup(tuple): (items, pieces, trie). items is an iterable of
            (sentence, freq) pairs, where None entries are skipped.
            pieces and trie are forwarded unchanged to
            Lattice.populate_nodes.
    Returns:
        tuple: (expected, objective, num_tokens). expected is a
            defaultdict(int) summing, per key, the values of the mapping
            returned by Lattice.populate_marginal. objective is the negated
            sum of the first value returned by populate_marginal.
            num_tokens is the summed length of Lattice.Viterbi() over all
            sentences.
    """
    sentences, pieces, trie = tup

    expected = defaultdict(int)
    objective = 0
    num_tokens = 0

    # A single lattice is reused for every sentence in the batch.
    lattice = Lattice()
    for entry in sentences:
        if entry is None:
            continue
        sentence, count = entry
        lattice.set_sentence(sentence)
        lattice.populate_nodes(pieces, trie)
        z_value, marginals = lattice.populate_marginal(count)

        for piece_key, amount in marginals.items():
            expected[piece_key] += amount

        num_tokens += len(lattice.Viterbi())
        objective -= z_value
    return (expected, objective, num_tokens)