예제 #1
0
    def greedy_predict(self, entries, wombat_object=None, maxlen=2000):
        """Greedy-decode every entry and write the prediction back onto it.

        Args:
            entries: list of dicts, each providing ``"question_arg"``.
            wombat_object: optional object whose ``get(token)`` returns a numpy
                vector, or ``None`` when it has nothing for that token. It is
                queried for every input position encoded as ``self.unk_id``.
            maxlen: forwarded to ``self.seq2seq.greedy_predict``.

        Returns:
            The same ``entries`` list; every entry gains ``'model_result'``
            (space-joined predicted tokens) and ``'pred_prob'``.
        """
        if not entries:
            # Nothing to pad or decode.
            return entries

        # NOTE(review): assumes entry["question_arg"] can be indexed by token
        # position, aligned with the ids produced by source2idx -- confirm.
        wd_tokens = [entry["question_arg"] for entry in entries]
        nl = [self.source2idx(question) for question in wd_tokens]

        self.seq2seq.eval()
        with torch.no_grad():
            nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
            # wombat_tensor = [batch, nl_len, emb_dim]; rows stay zero unless an embedding is found
            wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,),
                                        dtype=torch.float32, device=self.device)
            if wombat_object is not None:
                # Every row of nonzero() is the (batch row, position) of an unknown id.
                for i, j in (nl_tensor == self.unk_id).nonzero().tolist():
                    # Look the token up by (row, position); the running match
                    # counter is not a valid index into wd_tokens.
                    wombat_emb = wombat_object.get(wd_tokens[i][j])
                    if wombat_emb is not None:
                        wombat_tensor[i, j] = torch.from_numpy(wombat_emb)

            pred_outputs, acc_prob = self.seq2seq.greedy_predict(nl_tensor, nl_len_tensor,
                                                                 maxlen=maxlen, wombat_tensor=wombat_tensor)
            if self.args.tokenize_type != "bpe":
                predict_words = self.tokenizer.decode_batch(pred_outputs.tolist(), self.tokenizer.i2tw, 2)
                # Keep everything up to and including the first EOT.
                predict_words = [words if EOT not in words else words[: words.index(EOT) + 1]
                                 for words in predict_words]
            else:
                predict_words = self.tokenizer.decode_batch(pred_outputs.tolist())
                # partition() returns the whole string when EOT is absent, whereas
                # slicing with find() == -1 would silently drop the last character.
                predict_words = [words.partition(EOT)[0].split() for words in predict_words]
            # Squeeze row by row so the batch dimension survives a batch of one.
            predict_prob = [row.squeeze().tolist() for row in acc_prob]

        for i, entry in enumerate(entries):
            entry['model_result'] = " ".join(predict_words[i])
            entry['pred_prob'] = predict_prob[i]
        return entries
예제 #2
0
    def predict_batch(self, entries, wombat_object=None):
        """Run ``self.classifier`` over a batch of entries and annotate each one.

        Args:
            entries: list of dicts, each providing ``"input_tokens"``.
            wombat_object: optional object whose ``get(token)`` returns a numpy
                vector or ``None``; it is queried for every input position
                encoded as ``self.unk_id``.

        Returns:
            The same ``entries`` list. Every entry gains ``'input_list'`` (the
            tokenized input), ``'pred_sequence'`` and ``'prob_sequence'``.
        """
        id_seqs = []
        token_seqs = []
        for entry in entries:
            raw_input = entry["input_tokens"]
            id_seqs.append(self.source2idx(raw_input))
            if self.args.tokenize_type == "bpe":
                tokens = self.tokenizer.encode(raw_input,
                                               add_special_tokens=False).tokens
            else:
                tokens = self.tokenizer.process_nl(raw_input)
            entry['input_list'] = tokens
            token_seqs.append(tokens)

        self.classifier.eval()
        with torch.no_grad():
            padded_ids, seq_lens = seqPAD.pad_sequences(
                id_seqs, pad_tok=self.pad_id, nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(
                padded_ids, dtype=torch.long, device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(
                seq_lens, dtype=torch.long, device=self.device)

            # wombat_tensor = [batch, nl_len, emb_dim]; zero unless an embedding is found
            emb_shape = nl_tensor.shape + (self.args.swd_dim, )
            wombat_tensor = torch.zeros(
                emb_shape, dtype=torch.float32, device=self.device)
            unk_positions = (nl_tensor == self.unk_id).nonzero()
            if wombat_object is not None:
                # Each row is the (batch row, position) of an unknown id.
                for row, col in unk_positions.tolist():
                    word_to_lookup = token_seqs[row][col]
                    print('Looking up Wombat for:', word_to_lookup)
                    wombat_emb = wombat_object.get(word_to_lookup)
                    if wombat_emb is None:
                        continue
                    print('Found Wombat embedding for:', word_to_lookup)
                    wombat_tensor[row, col] = torch.from_numpy(wombat_emb)

            de_score = self.classifier(
                nl_tensor, nl_len_tensor, wombat_tensor=wombat_tensor)
            # NOTE(review): label_mask is computed but not used in this method.
            label_mask = nl_tensor > 0
            output_prob, output_idx = self.classifier.inference(de_score)
            label_ids = output_idx.squeeze(-1).tolist()
            predict_words = Tokenizer.decode_batch(
                label_ids, self.tokenizer.i2tw, 1)
            predict_prob = output_prob.squeeze(-1).tolist()

        for pos, entry in enumerate(entries):
            entry['pred_sequence'] = predict_words[pos]
            entry['prob_sequence'] = predict_prob[pos]
        return entries
예제 #3
0
    def beam_predict(self, entries, bw=2, topk=2, wombat_object=None, maxlen=2000):
        """Beam-search decode every entry and write the hypotheses back onto it.

        Args:
            entries: list of dicts, each providing ``"question_arg"``.
            bw: forwarded to ``self.seq2seq.beam_predict`` as ``bw``.
            topk: forwarded to ``self.seq2seq.beam_predict`` as ``n_best``.
            wombat_object: optional object whose ``get(token)`` returns a numpy
                vector, or ``None`` when it has nothing for that token. It is
                queried for every input position encoded as ``self.unk_id``.
            maxlen: forwarded to ``self.seq2seq.beam_predict``.

        Returns:
            The same ``entries`` list; every entry gains ``'model_result'`` and
            ``'pred_prob'`` (first hypothesis and its score) plus
            ``'decoded_batch'`` (all hypotheses zipped with their scores).
        """
        if not entries:
            # Nothing to pad or decode.
            return entries

        # NOTE(review): assumes entry["question_arg"] can be indexed by token
        # position, aligned with the ids produced by source2idx -- confirm.
        wd_tokens = [entry["question_arg"] for entry in entries]
        nl = [self.source2idx(question) for question in wd_tokens]

        self.seq2seq.eval()
        with torch.no_grad():
            nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)

            # wombat_tensor = [batch, nl_len, emb_dim]; rows stay zero unless an embedding is found
            wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,),
                                        dtype=torch.float32, device=self.device)
            if wombat_object is not None:
                # Every row of nonzero() is the (batch row, position) of an unknown id.
                for i, j in (nl_tensor == self.unk_id).nonzero().tolist():
                    # Look the token up by (row, position); the running match
                    # counter is not a valid index into wd_tokens.
                    wombat_emb = wombat_object.get(wd_tokens[i][j])
                    if wombat_emb is not None:
                        wombat_tensor[i, j] = torch.from_numpy(wombat_emb)

            pred_outputs, predict_prob = self.seq2seq.beam_predict(nl_tensor, nl_len_tensor,
                                                                   minlen=1, maxlen=maxlen,
                                                                   bw=bw, n_best=topk, wombat_tensor=wombat_tensor)
            if self.args.tokenize_type != "bpe":
                decoded = self.tokenizer.decode_batch(pred_outputs, self.tokenizer.i2tw, 3)
                # Cut each hypothesis (a word list) after its first EOT. The cut has to
                # run at this nesting level: tested against a list of hypotheses, the
                # EOT membership check never matches and nothing gets truncated.
                predict_words = [
                    [" ".join(words if EOT not in words else words[: words.index(EOT) + 1])
                     for words in topk_outputs]
                    for topk_outputs in decoded
                ]
            else:
                decoded = [self.tokenizer.decode_batch(topk_outputs) for topk_outputs in pred_outputs]
                # partition() returns the whole string when EOT is absent, whereas
                # slicing with find() == -1 would silently drop the last character.
                predict_words = [[words.partition(EOT)[0] for words in topk_outputs]
                                 for topk_outputs in decoded]

        for i, entry in enumerate(entries):
            entry['model_result'] = predict_words[i][0]
            entry['pred_prob'] = predict_prob[i][0]
            entry['decoded_batch'] = list(zip(predict_words[i], predict_prob[i]))
        return entries
예제 #4
0
    def predict_batch(self,
                      entries,
                      wombat_object=None,
                      return_probability=False):
        """Run ``self.labeler`` over a batch of entries.

        Args:
            entries: list of dicts, each providing ``"input_tokens"``; every
                entry gains ``'input_list'`` (the tokenized input).
            wombat_object: optional object whose ``get(token)`` returns a numpy
                vector or ``None``; it is queried for every input position
                encoded as ``self.unk_id``.
            return_probability: when this is exactly ``False`` (identity check)
                the entries are annotated and returned; any other value selects
                the probability output described below.

        Returns:
            With ``return_probability is False``: the same ``entries`` list,
            each entry extended with ``'pred_sequence'``, ``'prob_sequence'``
            and ``'entities'`` (dicts with ``"aspect"``, ``"polarity"`` and
            ``"probability"``).
            Otherwise: a list of dicts built from the softmax (last dimension)
            of the squeezed labeler scores, keyed by ``self.tokenizer.i2tw``.
        """
        id_seqs = []
        token_seqs = []
        for entry in entries:
            raw_input = entry["input_tokens"]
            id_seqs.append(self.source2idx(raw_input))
            if self.args.tokenize_type == "bpe":
                tokens = self.tokenizer.encode(raw_input,
                                               add_special_tokens=False).tokens
            else:
                tokens = self.tokenizer.process_nl(raw_input)
            entry['input_list'] = tokens
            token_seqs.append(tokens)

        self.labeler.eval()
        with torch.no_grad():
            padded_ids, seq_lens = seqPAD.pad_sequences(
                id_seqs, pad_tok=self.pad_id, nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(
                padded_ids, dtype=torch.long, device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(
                seq_lens, dtype=torch.long, device=self.device)

            # wombat_tensor = [batch, nl_len, emb_dim]; zero unless an embedding is found
            emb_shape = nl_tensor.shape + (self.args.swd_dim, )
            wombat_tensor = torch.zeros(
                emb_shape, dtype=torch.float32, device=self.device)
            unk_positions = (nl_tensor == self.unk_id).nonzero()
            if wombat_object is not None:
                # Each row is the (batch row, position) of an unknown id.
                for row, col in unk_positions.tolist():
                    word_to_lookup = token_seqs[row][col]
                    print('Looking up Wombat for:', word_to_lookup)
                    wombat_emb = wombat_object.get(word_to_lookup)
                    if wombat_emb is None:
                        continue
                    print('Found Wombat embedding for:', word_to_lookup)
                    wombat_tensor[row, col] = torch.from_numpy(wombat_emb)

            de_score = self.labeler(
                nl_tensor, nl_len_tensor, wombat_tensor=wombat_tensor)
            label_mask = nl_tensor > 0

            if return_probability is not False:
                # Probability mode: one dict per row of the squeezed scores.
                label_prob = torch.softmax(de_score.squeeze(), dim=-1)
                distributions = []
                for prob_i in label_prob.tolist():
                    distributions.append({
                        self.tokenizer.i2tw[ind]: prob
                        for ind, prob in enumerate(prob_i)
                    })
                return distributions

            output_prob, output_idx = self.labeler.inference(
                de_score, label_mask)
            if self.args.use_crf:
                predict_words = Tokenizer.decode_batch(
                    output_idx, self.tokenizer.i2tw, 2)
                predict_prob = list(output_prob)
            else:
                # Number of positions with id > 0 per row; used to trim each output row.
                kept_lens = label_mask.sum(dim=1).tolist()
                decoded = Tokenizer.decode_batch(
                    output_idx.squeeze(-1).tolist(), self.tokenizer.i2tw, 2)
                predict_words = [
                    labels[:n] for labels, n in zip(decoded, kept_lens)
                ]
                predict_prob = [
                    probs[:n] for probs, n in zip(
                        output_prob.squeeze(-1).tolist(), kept_lens)
                ]

            for pos, entry in enumerate(entries):
                entry['pred_sequence'] = predict_words[pos]
                entry['prob_sequence'] = predict_prob[pos]
                # No per-token probabilities are handed over when the CRF is in use.
                token_probs = None if self.args.use_crf else predict_prob[pos]
                entities_list = NER_metrics.absa_extractor(
                    entry["input_list"], predict_words[pos], token_probs)
                entry["entities"] = [{
                    "aspect": entity,
                    "polarity": senti,
                    "probability": prob
                } for entity, senti, _, prob in entities_list]

            return entries
예제 #5
0
    tg2ids = vocab.lst2idx(vocab_words=vocab.tw2i,
                           unk_words=False,
                           sos=True,
                           eos=True)

    train_data = JSON(filename, source2idx=nl2ids, target2idx=tg2ids)
    # train_data = Csvfile(filename)

    data_idx = []
    batch = 8
    for d in Vocab.minibatches(train_data, batch):
        data_idx.append(d)
        nl, target = list(zip(*d))

        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl,
                                                   pad_tok=vocab.sw2i[PAD],
                                                   nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids,
                                           dtype=torch.long,
                                           device=device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens,
                                               dtype=torch.long,
                                               device=device)

        lb_pad_ids, lb_lens = seqPAD.pad_sequences(target,
                                                   pad_tok=vocab.tw2i[PAD],
                                                   nlevels=1)
        lb_tensor = Data2tensor.idx2tensor(lb_pad_ids,
                                           dtype=torch.long,
                                           device=device)