def greedy_predict(self, entries, wombat_object=None, maxlen=2000):
    nl = []
    wd_tokens = []
    for entry in entries:
        wd_tokens.append(entry["question_arg"])
        nl.append(self.source2idx(entry["question_arg"]))

    self.seq2seq.eval()
    with torch.no_grad():
        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
        # wombat_tensor = [batch, nl_len, emb_dim]
        wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,),
                                    dtype=torch.float32, device=self.device)
        wombat_idx = (nl_tensor == self.unk_id).nonzero()
        if wombat_object is not None:
            # Look up the surface token at each UNK (batch, position) pair.
            for i, j in wombat_idx.tolist():
                wombat_emb = wombat_object.get(wd_tokens[i][j])
                if wombat_emb is not None:
                    wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
        pred_outputs, acc_prob = self.seq2seq.greedy_predict(nl_tensor, nl_len_tensor, maxlen=maxlen,
                                                             wombat_tensor=wombat_tensor)
        if self.args.tokenize_type != "bpe":
            predict_words = self.tokenizer.decode_batch(pred_outputs.tolist(), self.tokenizer.i2tw, 2)
            # Truncate each decoded sequence right after the first end-of-target token.
            predict_words = [words if EOT not in words else words[: words.index(EOT) + 1]
                             for words in predict_words]
        else:
            predict_words = self.tokenizer.decode_batch(pred_outputs.tolist())
            predict_words = [words[0: words.find(EOT)].split() for words in predict_words]
        # predict_prob = acc_prob.prod(dim=-1).tolist()
        # squeeze(-1) keeps the batch dimension even when batch size is 1
        predict_prob = acc_prob.squeeze(-1).tolist()
    for i, entry in enumerate(entries):
        entry['model_result'] = " ".join(predict_words[i])
        entry['pred_prob'] = predict_prob[i]
    return entries
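# A minimal, self-contained sketch of the UNK-backfill trick used above:
# positions whose id equals unk_id get an out-of-vocabulary vector written into
# an otherwise-zero [batch, seq_len, emb_dim] tensor, which the encoder later
# adds to its own embeddings. All names and values here are illustrative.
import torch

unk_id, emb_dim = 1, 4
nl_tensor = torch.tensor([[5, 1, 7],
                          [1, 8, 0]])                        # two padded sequences
oov_vectors = {(0, 1): torch.ones(emb_dim),
               (1, 0): torch.full((emb_dim,), 2.0)}          # stand-in for wombat_object
wombat_tensor = torch.zeros(nl_tensor.shape + (emb_dim,))
for i, j in (nl_tensor == unk_id).nonzero().tolist():
    emb = oov_vectors.get((i, j))                            # real code looks up the word string
    if emb is not None:
        wombat_tensor[i, j] = emb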
def predict_batch(self, entries, wombat_object=None):
    nl = []
    wd_tokens = []
    for entry in entries:
        input_tokens = entry["input_tokens"]
        ids = self.source2idx(input_tokens)
        nl.append(ids)
        if self.args.tokenize_type != "bpe":
            entry['input_list'] = self.tokenizer.process_nl(input_tokens)
        else:
            entry['input_list'] = self.tokenizer.encode(input_tokens, add_special_tokens=False).tokens
        wd_tokens.append(entry['input_list'])

    self.classifier.eval()
    with torch.no_grad():
        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
        # wombat_tensor = [batch, nl_len, emb_dim]
        wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,),
                                    dtype=torch.float32, device=self.device)
        wombat_idx = (nl_tensor == self.unk_id).nonzero()
        if wombat_object is not None:
            for i, j in wombat_idx.tolist():
                word_to_lookup = wd_tokens[i][j]
                print('Looking up Wombat for:', word_to_lookup)
                wombat_emb = wombat_object.get(word_to_lookup)
                if wombat_emb is not None:
                    print('Found Wombat embedding for:', word_to_lookup)
                    wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
        de_score = self.classifier(nl_tensor, nl_len_tensor, wombat_tensor=wombat_tensor)
        label_mask = nl_tensor > 0
        output_prob, output_idx = self.classifier.inference(de_score)
        # output_idx = de_score.max(-1)[1]
        predict_words = Tokenizer.decode_batch(output_idx.squeeze(-1).tolist(), self.tokenizer.i2tw, 1)
        # predict_prob = acc_prob.prod(dim=-1).tolist()
        predict_prob = output_prob.squeeze(-1).tolist()
    for i, entry in enumerate(entries):
        # entry["pred_pair"] = list(zip(entry["input_review"], predict_words[i]))
        entry['pred_sequence'] = predict_words[i]
        entry['prob_sequence'] = predict_prob[i]
    return entries
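# Sketch of the padding-mask convention used here and in the labeler variant
# below (toy values): with pad id 0, `nl_tensor > 0` marks the real tokens,
# and its row sums recover the true sequence lengths.
import torch

nl_tensor = torch.tensor([[4, 9, 2, 0, 0],
                          [7, 3, 5, 6, 1]])
label_mask = nl_tensor > 0
print(label_mask.sum(dim=1).tolist())   # -> [3, 5]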
def __init__(self, args):
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    self.args = args
    # Reload the arguments the model was trained with, then override the runtime-specific fields.
    margs = torch.load(os.path.join(args.model_name_or_path, "training_args.bin"))
    margs.no_cuda = args.no_cuda
    margs.model_name_or_path = args.model_name_or_path
    margs.overwrite_output_dir = True
    self.lm = TransLanguageModel(margs)
    self.lm.model_init()
    # lm.load_model(args.model_name_or_path)
    Data2tensor.set_randseed(args.seed)
def __init__(self, args):
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    self.args = args  # keep the runtime args around, as the sibling wrappers do
    margs = torch.load(os.path.join(args.model_name_or_path, "training_args.bin"))
    margs.no_cuda = args.no_cuda
    margs.label_file = args.label_file
    margs.model_name_or_path = args.model_name_or_path
    self.tagger = TransLabelerModel(margs)
    self.tagger.model_init(args.model_name_or_path)
    Data2tensor.set_randseed(args.seed)
def __init__(self, args):
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    self.args = args
    margs = torch.load(os.path.join(args.model_name_or_path, "training_args.bin"))
    margs.no_cuda = args.no_cuda
    margs.model_name_or_path = args.model_name_or_path
    margs.overwrite_output_dir = True
    self.lm = TransSeq2SeqModel(margs)
    self.lm.model_init(args.model_name_or_path)
    # lm.load_model(args.model_name_or_path)
    Data2tensor.set_randseed(args.seed)
    self.bos_token_id = self.lm.tokenizer.tw2i[SOT]
    # self.pad_token_id = self.lm.pad_id
    self.eos_token_id = self.lm.tokenizer.tw2i[EOT]
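# "training_args.bin", loaded by the constructors above, is plain torch
# serialization of the argparse Namespace the model was trained with. A
# self-contained sketch (fields are illustrative; recent torch versions may
# need weights_only=False when loading pickled Namespaces):
import torch
from argparse import Namespace

margs = Namespace(swd_dim=100, tokenize_type="bpe")   # illustrative fields
torch.save(margs, "training_args.bin")
margs = torch.load("training_args.bin")
margs.no_cuda = True                                  # then override runtime fields, as above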
def beam_predict(self, entries, bw=2, topk=2, wombat_object=None, maxlen=2000):
    nl = []
    wd_tokens = []
    for entry in entries:
        wd_tokens.append(entry["question_arg"])
        nl.append(self.source2idx(entry["question_arg"]))

    self.seq2seq.eval()
    with torch.no_grad():
        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
        # wombat_tensor = [batch, nl_len, emb_dim]
        wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,),
                                    dtype=torch.float32, device=self.device)
        wombat_idx = (nl_tensor == self.unk_id).nonzero()
        if wombat_object is not None:
            for i, j in wombat_idx.tolist():
                wombat_emb = wombat_object.get(wd_tokens[i][j])
                if wombat_emb is not None:
                    wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
        pred_outputs, predict_prob = self.seq2seq.beam_predict(nl_tensor, nl_len_tensor, minlen=1,
                                                               maxlen=maxlen, bw=bw, n_best=topk,
                                                               wombat_tensor=wombat_tensor)
        if self.args.tokenize_type != "bpe":
            # level-3 decoding: batch -> n_best -> tokens
            predict_words = self.tokenizer.decode_batch(pred_outputs, self.tokenizer.i2tw, 3)
            # Truncate each hypothesis right after the first end-of-target token.
            predict_words = [[words if EOT not in words else words[: words.index(EOT) + 1]
                              for words in topk_outputs] for topk_outputs in predict_words]
            predict_words = [[" ".join(words) for words in topk_outputs]
                             for topk_outputs in predict_words]
        else:
            predict_words = [self.tokenizer.decode_batch(topk_outputs) for topk_outputs in pred_outputs]
            predict_words = [[words[0: words.find(EOT)] for words in topk_outputs]
                             for topk_outputs in predict_words]
    for i, entry in enumerate(entries):
        entry['model_result'] = predict_words[i][0]
        entry['pred_prob'] = predict_prob[i][0]
        entry['decoded_batch'] = list(zip(predict_words[i], predict_prob[i]))
    return entries
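# Shape sketch for the beam outputs attached above (toy values): with topk=2,
# 'decoded_batch' zips each entry's n_best strings with their probabilities,
# while 'model_result'/'pred_prob' keep only the best hypothesis.
predict_words = [["response a", "response b"]]
predict_prob = [[0.61, 0.22]]
decoded_batch = list(zip(predict_words[0], predict_prob[0]))
# -> [("response a", 0.61), ("response b", 0.22)]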
@staticmethod
def decode_batch(pad_ids, i2t, level=2):
    return Tokenizer.idx2text(pad_ids=pad_ids, i2t=i2t, level=level)


if __name__ == '__main__':
    import torch
    from mlmodels.utils.idx2tensor import Data2tensor, seqPAD
    from mlmodels.utils.dataset import IterDataset, collate_fn, tokens2ids
    from torch.utils.data import (DataLoader, Dataset, IterableDataset, RandomSampler,
                                  SequentialSampler, TensorDataset)
    from mlmodels.utils.BPEtonkenizer import BPE
    from mlmodels.utils.special_tokens import BPAD, PAD, NULL
    from mlmodels.utils.txtIO import TXT

    Data2tensor.set_randseed(12345)
    device = torch.device("cpu")
    dtype = torch.long
    use_cuda = False
    filename = "../../data/reviews/processed_csv/train_res4.csv"
    label_file = "../../data/reviews/processed_csv/labels.txt"
    labels_list = TXT.read(label_file, firstline=False)
    lb2id_dict = Tokenizer.list2dict(sys_tokens + labels_list)
    id2lb_dict = Tokenizer.reversed_dict(lb2id_dict)
    lb2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=lb2id_dict,
                               unk_words=False, sos=False, eos=False)
    tokenize_type = "bpe"
    if tokenize_type != "bpe":
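# Tokenizer.list2dict and Tokenizer.reversed_dict are assumed to behave like
# the plain-dict construction below (system tokens first, so the special ids
# stay stable regardless of the label file):
sys_tokens = ["<pad>", "<unk>", "<s>", "</s>"]   # illustrative specials
labels_list = ["B-ASP", "I-ASP", "O"]            # illustrative labels
lb2id_dict = {tok: i for i, tok in enumerate(sys_tokens + labels_list)}
id2lb_dict = {i: tok for tok, i in lb2id_dict.items()}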
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    args.local_rank,
    device,
    args.n_gpu,
    bool(args.local_rank != -1),
    args.fp16,
)

# Set seed
Data2tensor.set_randseed(args.seed, args.n_gpu)

# Prepare CONLL-2003 task
labels = TextDataset.get_labels(args.labels)
num_labels = len(labels)
# Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
pad_token_label_id = CrossEntropyLoss().ignore_index

# Load pretrained model and tokenizer
if args.local_rank not in [-1, 0]:
    # Make sure only the first process in distributed training will download model & vocab
    torch.distributed.barrier()

args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(
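# For reference, PyTorch's cross-entropy ignore index is -100, so label
# positions set to pad_token_label_id contribute nothing to the loss. A quick
# self-contained check (toy logits/labels):
import torch
from torch.nn import CrossEntropyLoss

assert CrossEntropyLoss().ignore_index == -100
logits = torch.randn(3, 5)
labels = torch.tensor([2, -100, 4])        # middle position is padding
loss = CrossEntropyLoss()(logits, labels)  # averaged over the 2 real labels only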
def predict_batch(self, entries, wombat_object=None, return_probability=False):
    nl = []
    wd_tokens = []
    for entry in entries:
        input_tokens = entry["input_tokens"]
        ids = self.source2idx(input_tokens)
        nl.append(ids)
        if self.args.tokenize_type != "bpe":
            entry['input_list'] = self.tokenizer.process_nl(input_tokens)
        else:
            entry['input_list'] = self.tokenizer.encode(input_tokens, add_special_tokens=False).tokens
        wd_tokens.append(entry['input_list'])

    self.labeler.eval()
    with torch.no_grad():
        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
        # wombat_tensor = [batch, nl_len, emb_dim]
        wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,),
                                    dtype=torch.float32, device=self.device)
        wombat_idx = (nl_tensor == self.unk_id).nonzero()
        if wombat_object is not None:
            for i, j in wombat_idx.tolist():
                word_to_lookup = wd_tokens[i][j]
                print('Looking up Wombat for:', word_to_lookup)
                wombat_emb = wombat_object.get(word_to_lookup)
                if wombat_emb is not None:
                    print('Found Wombat embedding for:', word_to_lookup)
                    wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
        de_score = self.labeler(nl_tensor, nl_len_tensor, wombat_tensor=wombat_tensor)
        label_mask = nl_tensor > 0
        if return_probability is False:
            output_prob, output_idx = self.labeler.inference(de_score, label_mask)
            if self.args.use_crf:
                predict_words = Tokenizer.decode_batch(output_idx, self.tokenizer.i2tw, 2)
                # predict_words = [words[:i] for words, i in zip(predict_words, label_mask.sum(dim=1).tolist())]
                predict_prob = list(output_prob)
            else:
                # output_idx = de_score.max(-1)[1]
                predict_words = Tokenizer.decode_batch(output_idx.squeeze(-1).tolist(),
                                                       self.tokenizer.i2tw, 2)
                # Trim padded positions using the true sequence lengths.
                predict_words = [words[:i] for words, i in
                                 zip(predict_words, label_mask.sum(dim=1).tolist())]
                # predict_prob = acc_prob.prod(dim=-1).tolist()
                predict_prob = [words[:i] for words, i in
                                zip(output_prob.squeeze(-1).tolist(), label_mask.sum(dim=1).tolist())]
            for i, entry in enumerate(entries):
                # entry["pred_pair"] = list(zip(entry["input_review"], predict_words[i]))
                entry['pred_sequence'] = predict_words[i]
                entry['prob_sequence'] = predict_prob[i]
                entities_list = NER_metrics.absa_extractor(
                    entry["input_list"], predict_words[i],
                    None if self.args.use_crf else predict_prob[i])
                entry["entities"] = []
                if len(entities_list) > 0:
                    for entity, senti, _, prob in entities_list:
                        # entry["entities"].append((entity, senti, prob))
                        entry["entities"].append({"aspect": entity,
                                                  "polarity": senti,
                                                  "probability": prob})
            return entries
        else:
            label_prob = torch.softmax(de_score.squeeze(), dim=-1)
            return [{self.tokenizer.i2tw[ind]: prob for ind, prob in enumerate(prob_i)}
                    for prob_i in label_prob.tolist()]
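# Sketch of the return_probability branch above with toy values: scores are
# softmaxed per sequence and zipped with the target vocabulary (i2tw here is
# an illustrative stand-in for self.tokenizer.i2tw).
import torch

i2tw = {0: "negative", 1: "neutral", 2: "positive"}
de_score = torch.tensor([[1.0, 0.2, 2.5],
                         [0.3, 0.3, 0.3]])   # [batch, num_labels]
label_prob = torch.softmax(de_score, dim=-1)
dists = [{i2tw[ind]: prob for ind, prob in enumerate(prob_i)}
         for prob_i in label_prob.tolist()]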
            por += [labels[i][2:]]
            tok += [tokens[-1]]
            p += [prob[-1] if prob is not None else 0]
            span.extend([[" ".join(tok),
                          Counter(por).most_common(1)[0][0],
                          " ".join(cur),
                          sum(p) / len(p)]])
    return span


if __name__ == '__main__':
    import torch
    from mlmodels.utils.idx2tensor import Data2tensor, seqPAD

    Data2tensor.set_randseed(12345)
    device = torch.device("cpu")
    dtype = torch.long
    use_cuda = False
    filename = "/media/data/review_response/Dev.json"
    s_paras = [-1, 1]
    t_paras = [-1, 1]
    vocab = Vocab(s_paras, t_paras)
    vocab.build([filename])
    nl2ids = vocab.lst2idx(vocab_words=vocab.sw2i, unk_words=True, eos=True)
    tg2ids = vocab.lst2idx(vocab_words=vocab.tw2i, unk_words=False,
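# The span aggregation above majority-votes the BIO label suffixes with
# collections.Counter; a self-contained sketch with toy suffixes:
from collections import Counter

por = ["ASP", "ASP", "OP"]                    # suffixes collected inside one span
majority = Counter(por).most_common(1)[0][0]  # -> "ASP"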
def greedy_predict(self, nl_tensor, nl_len_tensor, maxlen=500, wombat_tensor=None):
    device = nl_len_tensor.device
    # sort lengths of input tensors in descending order
    nl_tensor, nl_len_tensor, nl_ord_tensor, nl_recover_ord_tensor = self.sort_tensors(nl_tensor,
                                                                                       nl_len_tensor)
    en_inp = self.sembedding(nl_tensor)
    if wombat_tensor is not None:
        wombat_tensor = self.reorder_tensor(wombat_tensor, nl_ord_tensor, dim=0)
        en_inp += wombat_tensor
    en_out, en_hidden = self.encoder(en_inp, nl_len_tensor)
    if self.enc_cnn == "cnn" and self.ed_mode != "lstm":
        en_hidden = en_hidden[0]
    # en_hn = tensor(batch_size, num_directions * rnn_dim)
    en_hn = self.encoder.get_last_hiddens(en_hidden)
    # recover the original order of inputs
    en_out = self.reorder_tensor(en_out, nl_recover_ord_tensor, dim=0)
    de_hidden = self.reorder_tensor(en_hidden, nl_recover_ord_tensor, dim=1)
    en_hn = self.reorder_tensor(en_hn, nl_recover_ord_tensor, dim=0)
    nl_len_tensor = self.reorder_tensor(nl_len_tensor, nl_recover_ord_tensor, dim=0)
    en_mask = None
    if nl_len_tensor.size(0) > 1:
        en_mask = torch.arange(en_out.size(1), dtype=torch.long,
                               device=device)[None, :] < nl_len_tensor[:, None]

    batch_size = nl_tensor.shape[0]
    output = Data2tensor.idx2tensor([[SOT_id]] * batch_size, dtype=torch.long, device=device)
    pred_outputs = []
    acc_prob = Data2tensor.idx2tensor([[0.0]] * batch_size, dtype=torch.float32, device=device)
    EOT_tensor = Data2tensor.idx2tensor([[False]] * batch_size, dtype=torch.bool, device=device)
    count = 0
    while True:
        count += 1
        pred_outputs.append(output)
        de_out, de_hidden = self.decoder(output, None, de_hidden)
        enc_context, enc_align = None, None
        if self.enc_att:
            # enc_context: [batch, seq_length2, hidden_dim]
            enc_context, enc_align = self.enc_attention(en_out, de_out, en_mask)
            # rnn_out = torch.cat((rnn_out, enc_context), dim=-1)
        if enc_context is not None:
            de_out = torch.cat((de_out, enc_context), dim=-1)
        # de_score = [batch, 1, num_labels]
        de_score = self.scoring(de_out)
        log_probs = torch.nn.functional.log_softmax(de_score, dim=-1)
        top1_scores, top1_ids = torch.topk(log_probs, 1, dim=-1)
        # pred_prob, pred_label = self.inference(de_score)
        raw_output = top1_ids.squeeze(-1)
        # accumulate log-probabilities; exponentiated at the end to give the sequence probability
        acc_prob += top1_scores.squeeze(-1)
        EOT_tensor = EOT_tensor | (raw_output == EOT_id)
        if EOT_tensor.all() or count > maxlen:
            # append the final EOT step to the outputs before stopping
            pred_outputs.append(raw_output)
            break
        output = raw_output.detach().clone()
    pred_outputs = torch.cat(pred_outputs, dim=-1)
    # acc_prob = torch.cat(acc_prob, dim=-1)
    return pred_outputs, acc_prob.exp()
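# The sort_tensors / reorder_tensor pattern above is the standard trick for
# length-sorted RNN packing: sort by descending length, run the encoder, then
# invert the permutation. A self-contained check of the inversion step:
import torch

lengths = torch.tensor([3, 5, 2])
sorted_len, order = lengths.sort(descending=True)
_, recover = order.sort()                        # inverse permutation
assert torch.equal(sorted_len[recover], lengths)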