Example No. 1
def model_msg(msg):
    # Wrap the message with start/end-of-sequence markers before encoding.
    tokens = ["<sos>"]
    tokens.extend(text_utils.tokenizer(msg))
    tokens.append("<eos>")

    # Map tokens to vocabulary ids.
    tokens_ids = []
    for token in tokens:
        tokens_ids.append(TEXT.vocab.stoi[token])

    # Add a batch dimension: shape (1, seq_len).
    tensor_data = t.LongTensor(tokens_ids)
    source_text = t.unsqueeze(tensor_data, dim=0)

    length_data = t.LongTensor([len(tokens_ids)])
    source_length = length_data

    # source_text = source_text.to(device)
    # source_length = source_length.to(device)

    # Greedy decoding of the reply.
    result, _ = greedy_model(source_text, source_length)

    # Keep only the first 2-4 decoded tokens (randint's upper bound is exclusive).
    random_key = np.random.randint(low=2, high=5, size=(1))[0]
    result = result[:random_key]

    # Drop the special markers and join the remaining tokens.
    return_msg = []
    for ele in result:
        if TEXT.vocab.itos[ele] != '<sos>' and TEXT.vocab.itos[ele] != '<eos>':
            return_msg.append(TEXT.vocab.itos[ele])

    return " ".join(return_msg)
Example No. 2
def return_msg(msg):
    tokens = text_utils.tokenizer(msg)

    # Sum the embedding vectors of all tokens in the query.
    msg_cos = 0
    for token in tokens:
        msg_cos += word_vector.item().weight[TEXT.vocab.stoi[token]]

    # Score every stored source sentence by its dot product with the query
    # vector and answer with the target of the best-matching source.
    scores = np.dot(msg_cos, text_cos.T)
    index = np.argmax(scores)
    return target_text[index]
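
The scoring step is easiest to see on toy data; a self-contained sketch (the vectors and replies below are made up for illustration):

import numpy as np

# Pretend bag-of-embeddings vectors for three stored source sentences.
text_cos = np.array([[1.0, 0.0],
                     [0.0, 1.0],
                     [0.7, 0.7]])
target_text = ["hello there", "see you later", "how can I help"]

# Query vector built the same way: sum of its token embeddings.
msg_cos = np.array([0.6, 0.8])

scores = np.dot(msg_cos, text_cos.T)   # one score per stored sentence
print(target_text[np.argmax(scores)])  # -> "how can I help"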
Example No. 3
    def __init__(self,
                 path,
                 text_field,
                 len_field,
                 test=False,
                 aug=False,
                 **kwargs):

        # Column layout of each example; "id" is dropped (mapped to None).
        fields = [
            ("id", None),
            ("source_text", text_field),
            ("source_length", len_field),
            ("target_text", text_field),
            ("target_len", len_field),
        ]
        examples = []

        with codecs.open(path) as fin:
            for index, line in enumerate(fin):

                # If a line has more than two tab-separated columns, everything
                # but the last column is treated as the source utterance.
                if len(line[:-1].split("\t")) > 2:
                    source = " ".join(line[:-1].split("\t")[:-1])
                    target = line[:-1].split("\t")[-1]
                else:
                    source, target = line[:-1].split("\t")

                # +2 accounts for the <sos> and <eos> markers added by the field.
                source_len = len(tokenizer(source)) + 2
                target_len = len(tokenizer(target)) + 2

                # Skip pairs that would not fit into the fixed sequence length.
                if target_len > text_field.fix_length or source_len > text_field.fix_length:
                    continue

                examples.append(
                    data.Example.fromlist(
                        [None, source, source_len, target, target_len],
                        fields))

        super(MyDataset, self).__init__(examples, fields, **kwargs)
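
A rough usage sketch with the legacy torchtext API; the field settings, file path and tokenizer below are assumptions for illustration, not taken from the original project:

from torchtext import data            # legacy torchtext API (pre-0.9)
from data import text_utils

# Hypothetical field configuration; fix_length must be set because
# MyDataset filters pairs against text_field.fix_length.
TEXT = data.Field(tokenize=text_utils.tokenizer, batch_first=True,
                  init_token="<sos>", eos_token="<eos>", fix_length=30)
LENGTH = data.Field(sequential=False, use_vocab=False)

# Hypothetical tab-separated "source\ttarget" training file.
train_set = MyDataset("train_data/chat_pairs.txt", TEXT, LENGTH)
TEXT.build_vocab(train_set)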
Example No. 4
import codecs
import dill
import numpy as np

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from data import text_utils

if __name__ == '__main__':
    # Reload the torchtext field that holds the vocabulary.
    with open("seq2seq/TEXT.Field", "rb") as f:
        TEXT = dill.load(f)

    source_path = "train_data/bakeup_chat_source.txt"
    word_vector_path = "train_data/craw1.npz"

    # Pre-compute a bag-of-embeddings vector for every source sentence.
    text_cos = []
    word_vector = np.load(word_vector_path, allow_pickle=True)["embeddings"]

    with codecs.open(source_path, "r") as sfin:
        for index, line in enumerate(sfin):
            tokens = text_utils.tokenizer(line)

            token_cos = 0
            for token in tokens:
                token_cos += word_vector.item().weight[TEXT.vocab.stoi[
                    token.lower()]]
            text_cos.append(token_cos)

    # The summed embeddings are torch tensors; convert them to numpy arrays
    # before saving the whole matrix.
    for index, ele in enumerate(text_cos):
        text_cos[index] = ele.numpy()

    np.save("seq2seq/text_cos.npy", text_cos)
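
A quick sanity check of the saved matrix, assuming the script above has been run (the embedding dimensionality is not stated in the original, so the printed shape is only illustrative):

import numpy as np

text_cos = np.load("seq2seq/text_cos.npy")
# Expect one row per source sentence and one column per embedding dimension.
print(text_cos.shape, text_cos.dtype)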
Example No. 5
import codecs
import dill
import numpy as np

from data import text_utils

# Vocabulary field saved during training.
with open("seq2seq/TEXT.Field", "rb") as f:
    TEXT = dill.load(f)

# Pre-computed bag-of-embeddings matrix of all source sentences.
text_cos = np.load("seq2seq/text_cos.npy")

word_vector_path = "train_data/craw1.npz.npz"
word_vector = np.load(word_vector_path, allow_pickle=True)["embeddings"]

# Candidate replies, tokenized and re-joined so indices line up with text_cos.
target_file = "train_data/bakeup_chat_target.txt"
target_text = []
with codecs.open(target_file, "r") as fin:
    for line in iter(fin):
        target_text.append(" ".join(text_utils.tokenizer(line)))


def return_msg(msg):
    tokens = text_utils.tokenizer(msg)

    # Sum the embeddings of the query tokens.
    msg_cos = 0
    for token in tokens:
        msg_cos += word_vector.item().weight[TEXT.vocab.stoi[token]]

    # Pick the target whose source sentence scores highest against the query.
    scores = np.dot(msg_cos, text_cos.T)
    index = np.argmax(scores)
    return target_text[index]
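
A hypothetical call, assuming the vocabulary, embedding and text files above exist on disk:

if __name__ == '__main__':
    # Retrieval-based reply: the stored target whose source best matches the query.
    print(return_msg("are you a robot"))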
Example No. 6
    model = model_class(encoder, decoder, args)

    # Restore the trained weights.
    checkpoint = t.load(args.load_dir)
    model.load_state_dict(checkpoint['model'])

    # CUDA is deliberately disabled here; inference runs on the CPU.
    device = t.device('cuda' if False else 'cpu')

    # Wrap the trained encoder/decoder in the greedy-decoding model.
    greedy_model = SN_MODELS["greedy"](model.encoder, model.decoder, device,
                                       args)
    greedy_model.eval()

    # Prepare one test message exactly as model_msg does in Example No. 1.
    msg = "Are you robot"
    tokens = ["<sos>"]
    tokens.extend(text_utils.tokenizer(msg))
    tokens.append("<eos>")

    tokens_ids = []
    for token in tokens:
        tokens_ids.append(TEXT.vocab.stoi[token])

    tensor_data = t.LongTensor(tokens_ids)
    source_text = t.unsqueeze(tensor_data, dim=0)

    length_data = t.LongTensor([len(tokens_ids)])
    source_length = length_data

    #source_text = source_text.to(device)
    #source_length = source_length.to(device)
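
The snippet ends before the actual decoding step; a minimal continuation, mirroring Example No. 1 and assuming greedy_model returns a sequence of token ids as it does there:

    # Decode greedily and map the ids back to words, dropping the special markers.
    result, _ = greedy_model(source_text, source_length)
    reply = [TEXT.vocab.itos[idx] for idx in result
             if TEXT.vocab.itos[idx] not in ("<sos>", "<eos>")]
    print(" ".join(reply))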