Пример #1
0
    def greedy_predict(self, entries, wombat_object=None, maxlen=2000):
        nl = []
        wd_tokens = []
        for entry in entries:
            wd_tokens.append(entry["question_arg"])
            nl.append(self.source2idx(entry["question_arg"]))
        self.seq2seq.eval()
        with torch.no_grad():
            nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
            # wombat_tensor = [batch, nl_len, emb_dim]
            wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,), dtype=torch.float32, device=self.device)
            wombat_idx = (nl_tensor == self.unk_id).nonzero()
            if wombat_object is not None:
                for t, (i, j) in enumerate(wombat_idx.tolist()):
                    wombat_emb = wombat_object.get(wd_tokens[t][i][j])
                    if wombat_emb is not None:
                        wombat_tensor[i, j] = torch.from_numpy(wombat_emb)

            pred_outputs, acc_prob = self.seq2seq.greedy_predict(nl_tensor, nl_len_tensor,
                                                                 maxlen=maxlen, wombat_tensor=wombat_tensor)
            if self.args.tokenize_type != "bpe":
                predict_words = self.tokenizer.decode_batch(pred_outputs.tolist(), self.tokenizer.i2tw, 2)
                predict_words = [words if EOT not in words else words[: words.index(EOT) + 1] for words in
                                 predict_words]
            else:
                predict_words = self.tokenizer.decode_batch(pred_outputs.tolist())
                predict_words = [words[0: words.find(EOT)].split() for words in predict_words]
            # predict_prob = acc_prob.prod(dim=-1).tolist()
            predict_prob = acc_prob.squeeze().tolist()
        for i, entry in enumerate(entries):
            entry['model_result'] = " ".join(predict_words[i])
            entry['pred_prob'] = predict_prob[i]
        return entries
Пример #2
0
    def predict_batch(self, entries, wombat_object=None):
        nl = []
        wd_tokens = []
        for entry in entries:
            input_tokens = entry["input_tokens"]
            ids = self.source2idx(input_tokens)
            nl.append(ids)
            if self.args.tokenize_type != "bpe":
                entry['input_list'] = self.tokenizer.process_nl(input_tokens)
            else:
                entry['input_list'] = self.tokenizer.encode(
                    input_tokens, add_special_tokens=False).tokens
            wd_tokens.append(entry['input_list'])
        self.classifier.eval()
        with torch.no_grad():
            nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl,
                                                       pad_tok=self.pad_id,
                                                       nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(nl_pad_ids,
                                               dtype=torch.long,
                                               device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(nl_lens,
                                                   dtype=torch.long,
                                                   device=self.device)
            # wombat_tensor = [batch, nl_len, emb_dim]
            wombat_tensor = torch.zeros(nl_tensor.shape +
                                        (self.args.swd_dim, ),
                                        dtype=torch.float32,
                                        device=self.device)
            wombat_idx = (nl_tensor == self.unk_id).nonzero()
            if wombat_object is not None:
                for t, (i, j) in enumerate(wombat_idx.tolist()):
                    word_to_lookup = wd_tokens[i][j]
                    print('Looking up Wombat for:', word_to_lookup)
                    wombat_emb = wombat_object.get(word_to_lookup)
                    if wombat_emb is not None:
                        print('Found Wombat embedding for:', word_to_lookup)
                        wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
            de_score = self.classifier(nl_tensor,
                                       nl_len_tensor,
                                       wombat_tensor=wombat_tensor)
            label_mask = nl_tensor > 0
            output_prob, output_idx = self.classifier.inference(de_score)
            # output_idx = de_score.max(-1)[1]
            predict_words = Tokenizer.decode_batch(
                output_idx.squeeze(-1).tolist(), self.tokenizer.i2tw, 1)
            # predict_prob = acc_prob.prod(dim=-1).tolist()
            predict_prob = output_prob.squeeze(-1).tolist()

        for i, entry in enumerate(entries):
            # entry["pred_pair"] = list(zip(entry["input_review"], predict_words[i]))
            entry['pred_sequence'] = predict_words[i]
            entry['prob_sequence'] = predict_prob[i]
        return entries
Пример #3
0
    def beam_predict(self, entries, bw=2, topk=2, wombat_object=None, maxlen=2000):
        nl = []
        wd_tokens = []
        for entry in entries:
            wd_tokens.append(entry["question_arg"])
            nl.append(self.source2idx(entry["question_arg"]))
        self.seq2seq.eval()
        with torch.no_grad():
            nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)

            # wombat_tensor = [batch, nl_len, emb_dim]
            wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,), dtype=torch.float32, device=self.device)
            wombat_idx = (nl_tensor == self.unk_id).nonzero()
            if wombat_object is not None:
                for t, (i, j) in enumerate(wombat_idx.tolist()):
                    wombat_emb = wombat_object.get(wd_tokens[t][i][j])
                    if wombat_emb is not None:
                        wombat_tensor[i, j] = torch.from_numpy(wombat_emb)

            pred_outputs, predict_prob = self.seq2seq.beam_predict(nl_tensor, nl_len_tensor,
                                                                   minlen=1, maxlen=maxlen,
                                                                   bw=bw, n_best=topk, wombat_tensor=wombat_tensor)
            if self.args.tokenize_type != "bpe":
                predict_words = self.tokenizer.decode_batch(pred_outputs, self.tokenizer.i2tw, 3)
                predict_words = [words if EOT not in words else words[: words.index(EOT) + 1] for words in
                                 predict_words]
                predict_words = [[" ".join(words) for words in topk_outputs] for topk_outputs in predict_words]
            else:
                predict_words = [self.tokenizer.decode_batch(topk_outputs) for topk_outputs in pred_outputs]
                predict_words = [[words[0: words.find(EOT)] for words in topk_outputs] for topk_outputs in predict_words]
        for i, entry in enumerate(entries):
            entry['model_result'] = predict_words[i][0]
            entry['pred_prob'] = predict_prob[i][0]
            entry['decoded_batch'] = list(zip(predict_words[i], predict_prob[i]))
        return entries
Пример #4
0
    def predict_batch(self,
                      entries,
                      wombat_object=None,
                      return_probability=False):
        nl = []
        wd_tokens = []
        for entry in entries:
            input_tokens = entry["input_tokens"]
            ids = self.source2idx(input_tokens)
            nl.append(ids)
            if self.args.tokenize_type != "bpe":
                entry['input_list'] = self.tokenizer.process_nl(input_tokens)
            else:
                entry['input_list'] = self.tokenizer.encode(
                    input_tokens, add_special_tokens=False).tokens
            wd_tokens.append(entry['input_list'])
        self.labeler.eval()
        with torch.no_grad():
            nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl,
                                                       pad_tok=self.pad_id,
                                                       nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(nl_pad_ids,
                                               dtype=torch.long,
                                               device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(nl_lens,
                                                   dtype=torch.long,
                                                   device=self.device)
            # wombat_tensor = [batch, nl_len, emb_dim]
            wombat_tensor = torch.zeros(nl_tensor.shape +
                                        (self.args.swd_dim, ),
                                        dtype=torch.float32,
                                        device=self.device)
            wombat_idx = (nl_tensor == self.unk_id).nonzero()
            if wombat_object is not None:
                for t, (i, j) in enumerate(wombat_idx.tolist()):
                    word_to_lookup = wd_tokens[i][j]
                    print('Looking up Wombat for:', word_to_lookup)
                    wombat_emb = wombat_object.get(word_to_lookup)
                    if wombat_emb is not None:
                        print('Found Wombat embedding for:', word_to_lookup)
                        wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
            de_score = self.labeler(nl_tensor,
                                    nl_len_tensor,
                                    wombat_tensor=wombat_tensor)
            label_mask = nl_tensor > 0
            if return_probability is False:
                output_prob, output_idx = self.labeler.inference(
                    de_score, label_mask)
                if self.args.use_crf:
                    predict_words = Tokenizer.decode_batch(
                        output_idx, self.tokenizer.i2tw, 2)
                    # predict_words = [words[:i] for words, i in zip(predict_words, label_mask.sum(dim=1).tolist())]
                    predict_prob = list(output_prob)
                else:
                    # output_idx = de_score.max(-1)[1]
                    predict_words = Tokenizer.decode_batch(
                        output_idx.squeeze(-1).tolist(), self.tokenizer.i2tw,
                        2)
                    predict_words = [
                        words[:i]
                        for words, i in zip(predict_words,
                                            label_mask.sum(dim=1).tolist())
                    ]
                    # predict_prob = acc_prob.prod(dim=-1).tolist()
                    predict_prob = [
                        words[:i] for words, i in zip(
                            output_prob.squeeze(-1).tolist(),
                            label_mask.sum(dim=1).tolist())
                    ]

                for i, entry in enumerate(entries):
                    # entry["pred_pair"] = list(zip(entry["input_review"], predict_words[i]))
                    entry['pred_sequence'] = predict_words[i]
                    entry['prob_sequence'] = predict_prob[i]
                    entities_list = NER_metrics.absa_extractor(
                        entry["input_list"], predict_words[i],
                        None if self.args.use_crf else predict_prob[i])
                    entry["entities"] = []
                    if len(entities_list) > 0:
                        for entity, senti, _, prob in entities_list:
                            # entry["entities"].append((entity, senti, prob))
                            entry["entities"].append({
                                "aspect": entity,
                                "polarity": senti,
                                "probability": prob
                            })

                return entries
            else:
                label_prob = torch.softmax(de_score.squeeze(), dim=-1)
                return [{
                    self.tokenizer.i2tw[ind]: prob
                    for ind, prob in enumerate(prob_i)
                } for prob_i in label_prob.tolist()]
Пример #5
0
                           unk_words=False,
                           sos=True,
                           eos=True)

    train_data = JSON(filename, source2idx=nl2ids, target2idx=tg2ids)
    # train_data = Csvfile(filename)

    data_idx = []
    batch = 8
    for d in Vocab.minibatches(train_data, batch):
        data_idx.append(d)
        nl, target = list(zip(*d))

        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl,
                                                   pad_tok=vocab.sw2i[PAD],
                                                   nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids,
                                           dtype=torch.long,
                                           device=device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens,
                                               dtype=torch.long,
                                               device=device)

        lb_pad_ids, lb_lens = seqPAD.pad_sequences(target,
                                                   pad_tok=vocab.tw2i[PAD],
                                                   nlevels=1)
        lb_tensor = Data2tensor.idx2tensor(lb_pad_ids,
                                           dtype=torch.long,
                                           device=device)

        break
Пример #6
0
    def greedy_predict(self, nl_tensor, nl_len_tensor, maxlen=500, wombat_tensor=None):
        device = nl_len_tensor.device
        # sort lengths of input tensors in the descending mode
        nl_tensor, nl_len_tensor, nl_ord_tensor, nl_recover_ord_tensor = self.sort_tensors(nl_tensor, nl_len_tensor)

        en_inp = self.sembedding(nl_tensor)
        if wombat_tensor is not None:
            wombat_tensor = self.reorder_tensor(wombat_tensor, nl_ord_tensor, dim=0)
            en_inp += wombat_tensor
        en_out, en_hidden = self.encoder(en_inp, nl_len_tensor)
        if self.enc_cnn == "cnn" and self.ed_mode != "lstm":
            en_hidden = en_hidden[0]
        # en_hn = tensor(batch_size, num_directions * rnn_dim)
        en_hn = self.encoder.get_last_hiddens(en_hidden)
        # recover the original order of inputs
        en_out = self.reorder_tensor(en_out, nl_recover_ord_tensor, dim=0)
        de_hidden = self.reorder_tensor(en_hidden, nl_recover_ord_tensor, dim=1)
        en_hn = self.reorder_tensor(en_hn, nl_recover_ord_tensor, dim=0)
        nl_len_tensor = self.reorder_tensor(nl_len_tensor, nl_recover_ord_tensor, dim=0)
        en_mask = None
        if nl_len_tensor.size(0) > 1:
            en_mask = torch.arange(en_out.size(1), dtype=torch.long, device=device)[None, :] < nl_len_tensor[:, None]

        batch_size = nl_tensor.shape[0]
        output = Data2tensor.idx2tensor([[SOT_id]] * batch_size, dtype=torch.long, device=device)
        pred_outputs = []
        acc_prob = Data2tensor.idx2tensor([[0.0]] * batch_size, dtype=torch.float32, device=device)
        EOT_tensor = Data2tensor.idx2tensor([[False]] * batch_size, dtype=torch.bool, device=device)
        count = 0
        while True:
            count += 1
            pred_outputs.append(output)
            de_out, de_hidden = self.decoder(output, None, de_hidden)
            enc_context, enc_align = None, None
            if self.enc_att:
                # enc_context: [batch, seq_length2, hidden_dim]
                enc_context, enc_align = self.enc_attention(en_out, de_out, en_mask)
                # rnn_out = torch.cat((rnn_out, enc_context), dim=-1)
            if enc_context is not None:
                de_out = torch.cat((de_out, enc_context), dim=-1)

            # de_score = [batch, 1, num_labels]
            de_score = self.scoring(de_out)
            log_probs = torch.nn.functional.log_softmax(de_score, dim=-1)
            top1_scores, top1_ids = torch.topk(log_probs,  1, dim=-1)

            # pred_prob, pred_label = self.inference(de_score)
            raw_output = top1_ids.squeeze(-1)
            acc_prob += top1_scores.squeeze(-1)
            EOT_tensor = EOT_tensor | (raw_output == EOT_id)
            # TODO: change to tensor.all()
            if EOT_tensor.all() or count > maxlen:
                # extend EOT to outputs
                pred_outputs.append(raw_output)
                break

            output = raw_output.detach().clone()

        pred_outputs = torch.cat(pred_outputs, dim=-1)
        # acc_prob = torch.cat(acc_prob, dim=-1)
        return pred_outputs, acc_prob.exp()