Example #1
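Greedy seq2seq decoding over a batch of entries: the tokenized questions are padded and tensorized, unknown-word embeddings are optionally patched in from a Wombat vector store, and the decoded string plus its probability are written back onto each entry.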
    def greedy_predict(self, entries, wombat_object=None, maxlen=2000):
        nl = []
        wd_tokens = []
        for entry in entries:
            wd_tokens.append(entry["question_arg"])
            nl.append(self.source2idx(entry["question_arg"]))
        self.seq2seq.eval()
        with torch.no_grad():
            nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
            # wombat_tensor = [batch, nl_len, emb_dim]
            wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,), dtype=torch.float32, device=self.device)
            wombat_idx = (nl_tensor == self.unk_id).nonzero()
            if wombat_object is not None:
                for i, j in wombat_idx.tolist():
                    # (i, j) indexes [batch][position]; wd_tokens holds one token list per entry
                    wombat_emb = wombat_object.get(wd_tokens[i][j])
                    if wombat_emb is not None:
                        wombat_tensor[i, j] = torch.from_numpy(wombat_emb)

            pred_outputs, acc_prob = self.seq2seq.greedy_predict(nl_tensor, nl_len_tensor,
                                                                 maxlen=maxlen, wombat_tensor=wombat_tensor)
            if self.args.tokenize_type != "bpe":
                predict_words = self.tokenizer.decode_batch(pred_outputs.tolist(), self.tokenizer.i2tw, 2)
                predict_words = [words if EOT not in words else words[: words.index(EOT) + 1] for words in
                                 predict_words]
            else:
                predict_words = self.tokenizer.decode_batch(pred_outputs.tolist())
                # guard find(): when EOT is absent, find() returns -1 and would drop the last char
                predict_words = [(words[: words.find(EOT)] if EOT in words else words).split()
                                 for words in predict_words]
            # predict_prob = acc_prob.prod(dim=-1).tolist()
            predict_prob = acc_prob.squeeze(-1).tolist()  # squeeze(-1) keeps the batch dim when batch_size == 1
        for i, entry in enumerate(entries):
            entry['model_result'] = " ".join(predict_words[i])
            entry['pred_prob'] = predict_prob[i]
        return entries
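The Wombat patch above is the least obvious step. A standalone, runnable sketch of the same pattern (every name below is illustrative, not part of this codebase):

    import torch

    unk_id = 1
    emb_dim = 4
    nl_tensor = torch.tensor([[5, 1, 7], [1, 9, 0]])           # [batch, seq] token ids
    wd_tokens = [["the", "gizmo", "hums"], ["doohickey", "is", "<pad>"]]
    store = {"gizmo": torch.randn(emb_dim).numpy()}            # stands in for the Wombat store
    wombat_tensor = torch.zeros(nl_tensor.shape + (emb_dim,))  # [batch, seq, emb_dim]
    for i, j in (nl_tensor == unk_id).nonzero().tolist():      # positions holding the UNK id
        emb = store.get(wd_tokens[i][j])
        if emb is not None:
            wombat_tensor[i, j] = torch.from_numpy(emb)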
Example #2
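Batch inference for a classifier: inputs are tokenized (word-level or BPE), padded, and scored by `self.classifier`; the predicted labels and their probabilities are attached to each entry.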
    def predict_batch(self, entries, wombat_object=None):
        nl = []
        wd_tokens = []
        for entry in entries:
            input_tokens = entry["input_tokens"]
            ids = self.source2idx(input_tokens)
            nl.append(ids)
            if self.args.tokenize_type != "bpe":
                entry['input_list'] = self.tokenizer.process_nl(input_tokens)
            else:
                entry['input_list'] = self.tokenizer.encode(
                    input_tokens, add_special_tokens=False).tokens
            wd_tokens.append(entry['input_list'])
        self.classifier.eval()
        with torch.no_grad():
            nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl,
                                                       pad_tok=self.pad_id,
                                                       nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(nl_pad_ids,
                                               dtype=torch.long,
                                               device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(nl_lens,
                                                   dtype=torch.long,
                                                   device=self.device)
            # wombat_tensor = [batch, nl_len, emb_dim]
            wombat_tensor = torch.zeros(nl_tensor.shape +
                                        (self.args.swd_dim, ),
                                        dtype=torch.float32,
                                        device=self.device)
            wombat_idx = (nl_tensor == self.unk_id).nonzero()
            if wombat_object is not None:
                for t, (i, j) in enumerate(wombat_idx.tolist()):
                    word_to_lookup = wd_tokens[i][j]
                    print('Looking up Wombat for:', word_to_lookup)
                    wombat_emb = wombat_object.get(word_to_lookup)
                    if wombat_emb is not None:
                        print('Found Wombat embedding for:', word_to_lookup)
                        wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
            de_score = self.classifier(nl_tensor,
                                       nl_len_tensor,
                                       wombat_tensor=wombat_tensor)
            label_mask = nl_tensor > 0
            output_prob, output_idx = self.classifier.inference(de_score)
            # output_idx = de_score.max(-1)[1]
            predict_words = Tokenizer.decode_batch(
                output_idx.squeeze(-1).tolist(), self.tokenizer.i2tw, 1)
            # predict_prob = acc_prob.prod(dim=-1).tolist()
            predict_prob = output_prob.squeeze(-1).tolist()

        for i, entry in enumerate(entries):
            # entry["pred_pair"] = list(zip(entry["input_review"], predict_words[i]))
            entry['pred_sequence'] = predict_words[i]
            entry['prob_sequence'] = predict_prob[i]
        return entries
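`seqPAD.pad_sequences` is a project utility; a minimal sketch of what it appears to do here (right-pad each sequence to the batch maximum and return the true lengths), assuming nothing beyond this snippet:

    def pad_sequences(seqs, pad_tok=0):
        # right-pad to the longest sequence; keep the original lengths for masking
        lens = [len(s) for s in seqs]
        max_len = max(lens)
        return [s + [pad_tok] * (max_len - len(s)) for s in seqs], lens

    padded, lens = pad_sequences([[4, 8], [3, 5, 9, 2]])
    # padded == [[4, 8, 0, 0], [3, 5, 9, 2]]; lens == [2, 4]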
Example #3
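Constructor of an inference wrapper: the arguments saved at training time are reloaded from `training_args.bin`, the runtime options are overridden, and a `TransLanguageModel` is rebuilt from them.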
    def __init__(self, args):
        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
        self.args = args
        margs = torch.load(os.path.join(args.model_name_or_path, "training_args.bin"))
        margs.no_cuda = args.no_cuda
        margs.model_name_or_path = args.model_name_or_path
        margs.overwrite_output_dir = True
        self.lm = TransLanguageModel(margs)
        self.lm.model_init()
        # lm.load_model(args.model_name_or_path)
        Data2tensor.set_randseed(args.seed)
Example #4
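The same reload-and-override pattern for a sequence tagger, initializing a `TransLabelerModel` from the checkpoint directory.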
    def __init__(self, args):
        args.device = torch.device("cuda" if torch.cuda.is_available()
                                   and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
        margs = torch.load(
            os.path.join(args.model_name_or_path, "training_args.bin"))
        margs.no_cuda = args.no_cuda
        margs.label_file = args.label_file
        margs.model_name_or_path = args.model_name_or_path
        self.tagger = TransLabelerModel(margs)
        self.tagger.model_init(args.model_name_or_path)
        Data2tensor.set_randseed(args.seed)
Example #5
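The seq2seq variant of the wrapper; it additionally caches the start-of-target and end-of-target token ids from the target vocabulary.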
    def __init__(self, args):
        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
        self.args = args
        margs = torch.load(os.path.join(args.model_name_or_path, "training_args.bin"))
        margs.no_cuda = args.no_cuda
        margs.model_name_or_path = args.model_name_or_path
        margs.overwrite_output_dir = True
        self.lm = TransSeq2SeqModel(margs)
        self.lm.model_init(args.model_name_or_path)
        # lm.load_model(args.model_name_or_path)
        Data2tensor.set_randseed(args.seed)
        self.bos_token_id = self.lm.tokenizer.tw2i[SOT]
        # self.pad_token_id = self.lm.pad_id
        self.eos_token_id = self.lm.tokenizer.tw2i[EOT]
Example #6
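Beam-search counterpart to Example #1: `beam_predict` returns the `topk` best hypotheses from a beam of width `bw`, storing the best one on each entry along with the full list of (hypothesis, probability) pairs.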
    def beam_predict(self, entries, bw=2, topk=2, wombat_object=None, maxlen=2000):
        nl = []
        wd_tokens = []
        for entry in entries:
            wd_tokens.append(entry["question_arg"])
            nl.append(self.source2idx(entry["question_arg"]))
        self.seq2seq.eval()
        with torch.no_grad():
            nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)

            # wombat_tensor = [batch, nl_len, emb_dim]
            wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,), dtype=torch.float32, device=self.device)
            wombat_idx = (nl_tensor == self.unk_id).nonzero()
            if wombat_object is not None:
                for i, j in wombat_idx.tolist():
                    # (i, j) indexes [batch][position]; wd_tokens holds one token list per entry
                    wombat_emb = wombat_object.get(wd_tokens[i][j])
                    if wombat_emb is not None:
                        wombat_tensor[i, j] = torch.from_numpy(wombat_emb)

            pred_outputs, predict_prob = self.seq2seq.beam_predict(nl_tensor, nl_len_tensor,
                                                                   minlen=1, maxlen=maxlen,
                                                                   bw=bw, n_best=topk, wombat_tensor=wombat_tensor)
            if self.args.tokenize_type != "bpe":
                # level-3 decoding yields [batch][n_best][tokens]: truncate at EOT inside each hypothesis
                predict_words = self.tokenizer.decode_batch(pred_outputs, self.tokenizer.i2tw, 3)
                predict_words = [[words if EOT not in words else words[: words.index(EOT) + 1]
                                  for words in topk_outputs] for topk_outputs in predict_words]
                predict_words = [[" ".join(words) for words in topk_outputs] for topk_outputs in predict_words]
            else:
                predict_words = [self.tokenizer.decode_batch(topk_outputs) for topk_outputs in pred_outputs]
                predict_words = [[words[: words.find(EOT)] if EOT in words else words
                                  for words in topk_outputs] for topk_outputs in predict_words]
        for i, entry in enumerate(entries):
            entry['model_result'] = predict_words[i][0]
            entry['pred_prob'] = predict_prob[i][0]
            entry['decoded_batch'] = list(zip(predict_words[i], predict_prob[i]))
        return entries
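Both decode branches truncate at the end-of-target marker, once on token lists and once on decoded strings; a runnable sketch of the two cases (the `<eot>` literal is illustrative):

    EOT = "<eot>"

    # token-list case: keep everything up to and including the EOT token
    tokens = ["dear", "guest", "<eot>", "junk"]
    kept = tokens if EOT not in tokens else tokens[: tokens.index(EOT) + 1]
    # kept == ["dear", "guest", "<eot>"]

    # string case: guard find(), since a missing EOT would return -1 and chop the last char
    text = "dear guest <eot> junk"
    kept = text[: text.find(EOT)] if EOT in text else text
    # kept == "dear guest "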
Example #7
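A thin static wrapper that maps padded id sequences back to text via `Tokenizer.idx2text`, followed by a `__main__` block (cut off by the excerpt) that builds label vocabularies for a review dataset.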
    @staticmethod
    def decode_batch(pad_ids, i2t, level=2):
        return Tokenizer.idx2text(pad_ids=pad_ids, i2t=i2t, level=level)


if __name__ == '__main__':
    import torch
    from mlmodels.utils.idx2tensor import Data2tensor, seqPAD
    from mlmodels.utils.dataset import IterDataset, collate_fn, tokens2ids
    from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler, TensorDataset
    from mlmodels.utils.BPEtonkenizer import BPE
    from mlmodels.utils.special_tokens import BPAD, PAD, NULL, sys_tokens  # sys_tokens is used below; assumed to live here
    from mlmodels.utils.txtIO import TXT

    Data2tensor.set_randseed(12345)
    device = torch.device("cpu")
    dtype = torch.long
    use_cuda = False
    filename = "../../data/reviews/processed_csv/train_res4.csv"
    label_file = "../../data/reviews/processed_csv/labels.txt"
    labels_list = TXT.read(label_file, firstline=False)
    lb2id_dict = Tokenizer.list2dict(sys_tokens + labels_list)
    id2lb_dict = Tokenizer.reversed_dict(lb2id_dict)
    lb2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target,
                               vocab_words=lb2id_dict,
                               unk_words=False,
                               sos=False,
                               eos=False)
    tokenize_type = "bpe"
    if tokenize_type != "bpe":
Example #8
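Setup boilerplate from a Transformers-style NER training script: logging configuration, seeding, CoNLL-2003 label loading, and the start of model/config instantiation (the excerpt ends mid-call).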
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    Data2tensor.set_randseed(args.seed, args.n_gpu)

    # Prepare CONLL-2003 task
    labels = TextDataset.get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
Example #9
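Sequence-labeling inference with optional CRF decoding: like Example #2, but predictions are trimmed to each sequence's true length, aspect/polarity entities are extracted with `NER_metrics.absa_extractor`, and with `return_probability=True` the method instead returns a per-label probability dict for each input.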
    def predict_batch(self,
                      entries,
                      wombat_object=None,
                      return_probability=False):
        nl = []
        wd_tokens = []
        for entry in entries:
            input_tokens = entry["input_tokens"]
            ids = self.source2idx(input_tokens)
            nl.append(ids)
            if self.args.tokenize_type != "bpe":
                entry['input_list'] = self.tokenizer.process_nl(input_tokens)
            else:
                entry['input_list'] = self.tokenizer.encode(
                    input_tokens, add_special_tokens=False).tokens
            wd_tokens.append(entry['input_list'])
        self.labeler.eval()
        with torch.no_grad():
            nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl,
                                                       pad_tok=self.pad_id,
                                                       nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(nl_pad_ids,
                                               dtype=torch.long,
                                               device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(nl_lens,
                                                   dtype=torch.long,
                                                   device=self.device)
            # wombat_tensor = [batch, nl_len, emb_dim]
            wombat_tensor = torch.zeros(nl_tensor.shape +
                                        (self.args.swd_dim, ),
                                        dtype=torch.float32,
                                        device=self.device)
            wombat_idx = (nl_tensor == self.unk_id).nonzero()
            if wombat_object is not None:
                for t, (i, j) in enumerate(wombat_idx.tolist()):
                    word_to_lookup = wd_tokens[i][j]
                    print('Looking up Wombat for:', word_to_lookup)
                    wombat_emb = wombat_object.get(word_to_lookup)
                    if wombat_emb is not None:
                        print('Found Wombat embedding for:', word_to_lookup)
                        wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
            de_score = self.labeler(nl_tensor,
                                    nl_len_tensor,
                                    wombat_tensor=wombat_tensor)
            label_mask = nl_tensor > 0
            if not return_probability:
                output_prob, output_idx = self.labeler.inference(
                    de_score, label_mask)
                if self.args.use_crf:
                    predict_words = Tokenizer.decode_batch(
                        output_idx, self.tokenizer.i2tw, 2)
                    # predict_words = [words[:i] for words, i in zip(predict_words, label_mask.sum(dim=1).tolist())]
                    predict_prob = list(output_prob)
                else:
                    # output_idx = de_score.max(-1)[1]
                    predict_words = Tokenizer.decode_batch(
                        output_idx.squeeze(-1).tolist(), self.tokenizer.i2tw,
                        2)
                    predict_words = [
                        words[:i]
                        for words, i in zip(predict_words,
                                            label_mask.sum(dim=1).tolist())
                    ]
                    # predict_prob = acc_prob.prod(dim=-1).tolist()
                    predict_prob = [
                        probs[:i] for probs, i in zip(
                            output_prob.squeeze(-1).tolist(),
                            label_mask.sum(dim=1).tolist())
                    ]

                for i, entry in enumerate(entries):
                    # entry["pred_pair"] = list(zip(entry["input_review"], predict_words[i]))
                    entry['pred_sequence'] = predict_words[i]
                    entry['prob_sequence'] = predict_prob[i]
                    entities_list = NER_metrics.absa_extractor(
                        entry["input_list"], predict_words[i],
                        None if self.args.use_crf else predict_prob[i])
                    entry["entities"] = []
                    if len(entities_list) > 0:
                        for entity, senti, _, prob in entities_list:
                            # entry["entities"].append((entity, senti, prob))
                            entry["entities"].append({
                                "aspect": entity,
                                "polarity": senti,
                                "probability": prob
                            })

                return entries
            else:
                label_prob = torch.softmax(de_score.squeeze(), dim=-1)
                return [{
                    self.tokenizer.i2tw[ind]: prob
                    for ind, prob in enumerate(prob_i)
                } for prob_i in label_prob.tolist()]
Example #10
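The tail of a span-extraction helper (the excerpt starts mid-loop): token runs are collapsed into [joined tokens, majority label, joined `cur` buffer, mean probability] records, followed by a truncated `__main__` vocabulary setup.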
            por += [labels[i][2:]]
            tok += [tokens[-1]]
            p += [prob[-1] if prob is not None else 0]
            span.extend([[
                " ".join(tok),
                Counter(por).most_common(1)[0][0], " ".join(cur),
                sum(p) / len(p)
            ]])
        return span


if __name__ == '__main__':
    import torch
    from mlmodels.utils.idx2tensor import Data2tensor, seqPAD

    Data2tensor.set_randseed(12345)
    device = torch.device("cpu")
    dtype = torch.long
    use_cuda = False
    filename = "/media/data/review_response/Dev.json"

    s_paras = [-1, 1]
    t_paras = [-1, 1]

    vocab = Vocab(s_paras, t_paras)
    vocab.build([filename])

    nl2ids = vocab.lst2idx(vocab_words=vocab.sw2i, unk_words=True, eos=True)

    tg2ids = vocab.lst2idx(vocab_words=vocab.tw2i,
                           unk_words=False,
Example #11
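The model-side greedy decoder behind Example #1: inputs are sorted by length for the encoder, restored to their original order, and the arg-max token of each step is fed back into the decoder while log-probabilities accumulate, until every sequence has emitted EOT or `maxlen` is exceeded.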
    def greedy_predict(self, nl_tensor, nl_len_tensor, maxlen=500, wombat_tensor=None):
        device = nl_len_tensor.device
        # sort lengths of input tensors in the descending mode
        nl_tensor, nl_len_tensor, nl_ord_tensor, nl_recover_ord_tensor = self.sort_tensors(nl_tensor, nl_len_tensor)

        en_inp = self.sembedding(nl_tensor)
        if wombat_tensor is not None:
            wombat_tensor = self.reorder_tensor(wombat_tensor, nl_ord_tensor, dim=0)
            en_inp += wombat_tensor
        en_out, en_hidden = self.encoder(en_inp, nl_len_tensor)
        if self.enc_cnn == "cnn" and self.ed_mode != "lstm":
            en_hidden = en_hidden[0]
        # en_hn = tensor(batch_size, num_directions * rnn_dim)
        en_hn = self.encoder.get_last_hiddens(en_hidden)
        # recover the original order of inputs
        en_out = self.reorder_tensor(en_out, nl_recover_ord_tensor, dim=0)
        de_hidden = self.reorder_tensor(en_hidden, nl_recover_ord_tensor, dim=1)
        en_hn = self.reorder_tensor(en_hn, nl_recover_ord_tensor, dim=0)
        nl_len_tensor = self.reorder_tensor(nl_len_tensor, nl_recover_ord_tensor, dim=0)
        en_mask = None
        if nl_len_tensor.size(0) > 1:
            en_mask = torch.arange(en_out.size(1), dtype=torch.long, device=device)[None, :] < nl_len_tensor[:, None]

        batch_size = nl_tensor.shape[0]
        output = Data2tensor.idx2tensor([[SOT_id]] * batch_size, dtype=torch.long, device=device)
        pred_outputs = []
        acc_prob = Data2tensor.idx2tensor([[0.0]] * batch_size, dtype=torch.float32, device=device)
        EOT_tensor = Data2tensor.idx2tensor([[False]] * batch_size, dtype=torch.bool, device=device)
        count = 0
        while True:
            count += 1
            pred_outputs.append(output)
            de_out, de_hidden = self.decoder(output, None, de_hidden)
            enc_context, enc_align = None, None
            if self.enc_att:
                # enc_context: [batch, seq_length2, hidden_dim]
                enc_context, enc_align = self.enc_attention(en_out, de_out, en_mask)
                # rnn_out = torch.cat((rnn_out, enc_context), dim=-1)
            if enc_context is not None:
                de_out = torch.cat((de_out, enc_context), dim=-1)

            # de_score = [batch, 1, num_labels]
            de_score = self.scoring(de_out)
            log_probs = torch.nn.functional.log_softmax(de_score, dim=-1)
            top1_scores, top1_ids = torch.topk(log_probs, 1, dim=-1)

            # pred_prob, pred_label = self.inference(de_score)
            raw_output = top1_ids.squeeze(-1)
            acc_prob += top1_scores.squeeze(-1)
            EOT_tensor = EOT_tensor | (raw_output == EOT_id)
            if EOT_tensor.all() or count > maxlen:
                # extend EOT to outputs
                pred_outputs.append(raw_output)
                break

            output = raw_output.detach().clone()

        pred_outputs = torch.cat(pred_outputs, dim=-1)
        # acc_prob = torch.cat(acc_prob, dim=-1)
        return pred_outputs, acc_prob.exp()
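The `en_mask` line builds the padding mask purely by broadcasting; in isolation:

    import torch

    lens = torch.tensor([3, 1, 2])   # true sequence lengths
    max_len = 4
    mask = torch.arange(max_len)[None, :] < lens[:, None]
    # tensor([[ True,  True,  True, False],
    #         [ True, False, False, False],
    #         [ True,  True, False, False]])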