Example #1
# NOTE: Example and unk_string are project-local helpers this snippet relies on;
# the import below assumes they live in the repo's utils module (adjust as needed).
from utils import Example, unk_string
def batcher(params, batch):
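    # SentEval-style batcher: preprocess each sentence (optional tokenization and
    # lowercasing, then SentencePiece segmentation), wrap it in an Example, and
    # embed the whole batch with the trained model.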
    new_batch = []
    for p in batch:
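        # Optionally run the external word tokenizer (e.g. a Moses-style tokenizer)
        # before any further preprocessing.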
        if params.tokenize:
            tok = params.entok.tokenize(p, escape=False)
            p = " ".join(tok)
        if params.lower_case:
            p = p.lower()
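        # Segment with the SentencePiece model and wrap the piece sequence in an
        # Example so its embeddings can be looked up in the model vocabulary.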
        p = params.sp.EncodeAsPieces(p)
        p = " ".join(p)
        p = Example(p, params.lower_case)
        p.populate_embeddings(params.model.vocab, params.model.zero_unk, params.model.ngrams)
        new_batch.append(p)
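    # Convert the batch to tensors (token indices and lengths), encode it, and
    # return the sentence vectors as a numpy array.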
    x, l = params.model.torchify_batch(new_batch)
    vecs = params.model.encode(x, l)
    return vecs.detach().cpu().numpy()
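
# --- Usage sketch (not part of the original snippet) --------------------------
# A minimal, hedged example of plugging a batcher like this into SentEval
# (facebookresearch/SentEval), assuming the custom fields the batcher reads
# (tokenize, entok, lower_case, sp, model) are stashed on the params dict.
# trained_model, sp_model, and moses_tokenizer are placeholders, not values
# from the original code.
#
# import senteval
#
# params_senteval = {'task_path': 'SentEval/data', 'usepytorch': True, 'kfold': 10}
# params_senteval['model'] = trained_model        # the trained sentence encoder
# params_senteval['sp'] = sp_model                # loaded SentencePiece processor
# params_senteval['entok'] = moses_tokenizer      # e.g. sacremoses MosesTokenizer
# params_senteval['tokenize'] = True
# params_senteval['lower_case'] = True
#
# def prepare(params, samples):
#     return  # nothing to precompute for this encoder
#
# se = senteval.engine.SE(params_senteval, batcher, prepare)
# results = se.eval(['STSBenchmark'])
# -------------------------------------------------------------------------------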
def get_sequences(p1, p2, model, params, fr0=0, fr1=0):
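    # Wrap the two input sentences in Examples and populate their embeddings from
    # the appropriate vocabulary (shared, source, or foreign), falling back to the
    # unknown token when a sentence yields no known entries.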
    wp1 = Example(p1)
    wp2 = Example(p2)

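    # With separate vocabularies (share_vocab is False), the fr0/fr1 flags select
    # which sentences are looked up in the foreign vocabulary: both for (1, 1) or
    # only the second for (0, 1). Every other combination uses the source
    # vocabulary for both sentences.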
    if fr0 == 1 and fr1 == 1 and not model.share_vocab:
        wp1.populate_embeddings(model.vocab_fr, model.zero_unk, params.ngrams)
        wp2.populate_embeddings(model.vocab_fr, model.zero_unk, params.ngrams)
        if len(wp1.embeddings) == 0:
            wp1.embeddings.append(model.vocab_fr[unk_string])
        if len(wp2.embeddings) == 0:
            wp2.embeddings.append(model.vocab_fr[unk_string])
    elif fr0 == 0 and fr1 == 1 and not model.share_vocab:
        wp1.populate_embeddings(model.vocab, model.zero_unk, params.ngrams)
        wp2.populate_embeddings(model.vocab_fr, model.zero_unk, params.ngrams)
        if len(wp1.embeddings) == 0:
            wp1.embeddings.append(model.vocab[unk_string])
        if len(wp2.embeddings) == 0:
            wp2.embeddings.append(model.vocab_fr[unk_string])
    else:
        wp1.populate_embeddings(model.vocab, model.zero_unk, params.ngrams)
        wp2.populate_embeddings(model.vocab, model.zero_unk, params.ngrams)
        if len(wp1.embeddings) == 0:
            wp1.embeddings.append(model.vocab[unk_string])
        if len(wp2.embeddings) == 0:
            wp2.embeddings.append(model.vocab[unk_string])

    return wp1, wp2
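A hedged sketch of how the returned pair might be encoded and scored, reusing the torchify_batch and encode calls seen in batcher above; the cosine-similarity scoring, the example sentences, and the pre-loaded model/params objects are assumptions, not part of the original code.

import numpy as np

# model and params are assumed to be the same pre-loaded objects used by batcher.
wp1, wp2 = get_sequences("a man is playing a guitar",
                         "a man plays the guitar", model, params)

# Encode both Examples through the model's own batching path, then score the
# pair with cosine similarity.
x, lengths = model.torchify_batch([wp1, wp2])
vecs = model.encode(x, lengths).detach().cpu().numpy()
score = float(np.dot(vecs[0], vecs[1]) /
              (np.linalg.norm(vecs[0]) * np.linalg.norm(vecs[1])))
print(score)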