def predict_batch(self, entries, wombat_object=None):
    nl = []
    wd_tokens = []
    for entry in entries:
        # Convert raw input tokens to ids and keep the tokenized surface forms
        input_tokens = entry["input_tokens"]
        ids = self.source2idx(input_tokens)
        nl.append(ids)
        if self.args.tokenize_type != "bpe":
            entry['input_list'] = self.tokenizer.process_nl(input_tokens)
        else:
            entry['input_list'] = self.tokenizer.encode(input_tokens,
                                                        add_special_tokens=False).tokens
        wd_tokens.append(entry['input_list'])

    self.classifier.eval()
    with torch.no_grad():
        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)

        # wombat_tensor = [batch, nl_len, emb_dim]
        wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,),
                                    dtype=torch.float32, device=self.device)
        wombat_idx = (nl_tensor == self.unk_id).nonzero()
        if wombat_object is not None:
            # Fill UNK positions with externally looked-up (Wombat) embeddings
            for t, (i, j) in enumerate(wombat_idx.tolist()):
                word_to_lookup = wd_tokens[i][j]
                print('Looking up Wombat for:', word_to_lookup)
                wombat_emb = wombat_object.get(word_to_lookup)
                if wombat_emb is not None:
                    print('Found Wombat embedding for:', word_to_lookup)
                    wombat_tensor[i, j] = torch.from_numpy(wombat_emb)

        de_score = self.classifier(nl_tensor, nl_len_tensor, wombat_tensor=wombat_tensor)
        label_mask = nl_tensor > 0
        output_prob, output_idx = self.classifier.inference(de_score)
        # output_idx = de_score.max(-1)[1]
        predict_words = Tokenizer.decode_batch(output_idx.squeeze(-1).tolist(),
                                               self.tokenizer.i2tw, 1)
        # predict_prob = acc_prob.prod(dim=-1).tolist()
        predict_prob = output_prob.squeeze(-1).tolist()

    for i, entry in enumerate(entries):
        # entry["pred_pair"] = list(zip(entry["input_review"], predict_words[i]))
        entry['pred_sequence'] = predict_words[i]
        entry['prob_sequence'] = predict_prob[i]
    return entries
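# Standalone toy sketch of the UNK-embedding injection performed above: every position
# where the id tensor equals unk_id receives an externally looked-up vector. All values
# below (ids, dimensions) are made up for illustration; they are not the project's real ids.
import torch

unk_id, emb_dim = 1, 4
nl = torch.tensor([[5, 1, 7, 0],            # 1 = UNK, 0 = PAD
                   [1, 9, 0, 0]])
wombat = torch.zeros(nl.shape + (emb_dim,))
for i, j in (nl == unk_id).nonzero().tolist():
    wombat[i, j] = torch.randn(emb_dim)     # stand-in for wombat_object.get(word)
print(wombat[nl == unk_id].shape)           # torch.Size([2, 4])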
def inference(self, rv_text):
    prompt_text = LM.prepare_entry(rv_text)
    encoded_prompt = self.input2tensor(prompt_text)
    # T5 generates from scratch; decoder-only models also re-emit the prompt tokens
    length = self.args.length if self.lm.args.model_type == "t5" \
        else self.args.length + len(encoded_prompt[0])
    output_sequences, probs = self.lm.model.generate(
        input_ids=encoded_prompt,
        max_length=length,
        temperature=self.args.temperature,
        top_k=self.args.k,
        top_p=self.args.p,
        repetition_penalty=self.args.repetition_penalty,
        num_beams=self.args.num_beams,
        do_sample=self.args.do_sample,
        num_return_sequences=self.args.num_return_sequences,
        bos_token_id=self.bos_token_id,
        # pad_token_id=self.pad_token_id,
        eos_token_id=self.eos_token_id,
    )

    # Remove the batch dimension when returning multiple sequences
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        # print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = Tokenizer.decode_batch(generated_sequence, self.lm.tokenizer.i2tw, level=1)
        text = " ".join(text)
        # text = self.lm.tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True,
        #                                 skip_special_tokens=True)

        # Remove all text after the stop (EOS) token
        # gen_text = text[: text.find(self.args.stop_token) if self.args.stop_token else None]
        eos_pos = text.find(self.lm.tokenizer.eos_token)
        gen_text = text[: eos_pos + len(self.lm.tokenizer.eos_token) if eos_pos != -1 else None]
        if self.lm.args.model_type != "t5":
            # Strip the re-emitted prompt from the front of the generated text
            prompt_str = self.lm.tokenizer.decode(encoded_prompt[0],
                                                  clean_up_tokenization_spaces=True,
                                                  skip_special_tokens=True)
            gen_text = gen_text[len(prompt_str):]

        # Keep the prompt, the generated continuation, and its probability together
        total_sequence = (prompt_text, gen_text, probs[generated_sequence_idx])
        generated_sequences.append(total_sequence)
        # print("".join(total_sequence))
    return generated_sequences, probs
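# Standalone toy sketch of the EOS-trimming slice used above; the eos token string and
# the generated text are invented for illustration.
eos_token = "<eot>"
text = "great phone , loved the camera <eot> pad pad pad"
pos = text.find(eos_token)
gen_text = text[: pos + len(eos_token) if pos != -1 else None]
print(gen_text)   # 'great phone , loved the camera <eot>'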
def evaluate_batch(self, eva_data, num_eva):
    start = time.time()
    self.labeler.eval()
    nl_tokens = []
    reference = []
    candidate = []
    predict_probs = []
    dev_loss = []
    total_tokens = 0
    eva_iterdataset = IterDataset(eva_data,
                                  source2idx=self.source2idx,
                                  target2idx=self.target2idx,
                                  num_lines=num_eva,
                                  bpe=True if self.args.tokenize_type == "bpe" else False)
    eva_dataloader = DataLoader(eva_iterdataset,
                                pin_memory=True,
                                batch_size=self.args.batch_size,
                                collate_fn=self.collate_fn)
    with torch.no_grad():
        for i, d in enumerate(eva_dataloader):
            # nl, target = list(zip(*d))
            d = tuple(t.to(self.device) for t in d)
            nl_tensor, lb_tensor = d
            nl_len_tensor = (nl_tensor != self.pad_id).sum(dim=1)
            de_score = self.labeler(nl_tensor, nl_len_tensor)
            label_mask = nl_tensor != self.pad_id
            # TODO: can move NLL into seq2seq for multigpu
            total_loss = self.labeler.NLL_loss(de_score, lb_tensor, label_mask)
            dev_loss.append(total_loss.item())
            total_tokens += label_mask.sum()
            output_prob, output_idx = self.labeler.inference(de_score, label_mask)

            label_words = Tokenizer.decode_batch(lb_tensor.tolist(), self.tokenizer.i2tw, 2)
            label_words = [words[:i] for words, i in
                           zip(label_words, label_mask.sum(dim=1).tolist())]
            # reference = [[w1, ..., EOT], ..., [w1, ..., EOT]]
            reference.extend(label_words)

            if self.args.use_crf:
                predict_words = Tokenizer.decode_batch(output_idx, self.tokenizer.i2tw, 2)
                # predict_words = [words[:i] for words, i in
                #                  zip(predict_words, label_mask.sum(dim=1).tolist())]
                predict_probs += output_prob
            else:
                # output_idx = de_score.max(-1)[1]
                predict_words = Tokenizer.decode_batch(output_idx.squeeze(-1).tolist(),
                                                       self.tokenizer.i2tw, 2)
                predict_words = [words[:i] for words, i in
                                 zip(predict_words, label_mask.sum(dim=1).tolist())]
                # predict_prob = acc_prob.prod(dim=-1).tolist()
                predict_probs += [words[:i] for words, i in
                                  zip(output_prob.squeeze(-1).tolist(),
                                      label_mask.sum(dim=1).tolist())]
            # if sum([len(k) for k in predict_words]) != 0:
            # candidate = [[w1, ..., EOT], ..., [w1, ..., EOT]]
            candidate.extend(predict_words)

            if self.args.tokenize_type != "bpe":
                nl_token = self.tokenizer.decode_batch(nl_tensor.tolist(),
                                                       self.tokenizer.i2sw, 2)
                nl_token = [words[:i] if EOT not in words else words[:words.index(EOT)]
                            for words, i in zip(nl_token,
                                                (nl_tensor > 0).sum(dim=1).tolist())]
            else:
                nl_token = self.tokenizer.decode_batch(nl_tensor.tolist())
                # nl_token = [enc_words.tokens for enc_words in self.tokenizer.encode_batch(nl_token)]
                nl_token = [words[0:words.find(EOT)].split() for words in nl_token]
            nl_tokens.extend(nl_token)

            del nl_tensor, nl_len_tensor, lb_tensor, de_score, label_mask
            # gc.collect()
            # torch.cuda.empty_cache()

    if len(candidate) != 0 and len(reference) != 0:
        assert len(candidate) == len(reference)
        # Randomly sample one pair
        rand_idx = random.randint(0, len(reference) - 1)
        print("\nRANDOMLY sampling: ")
        print("\t- An Input Sequence: ", " ".join(nl_tokens[rand_idx]))
        print("\t- A LABEL query: ", " ".join(reference[rand_idx]))
        print("\t- A PREDICTED query: ", " ".join(candidate[rand_idx]))
        print("\t- A PREDICTED prob: ", predict_probs[rand_idx], "\n\n")
        metrics = Labeler_model.class_metrics(reference, candidate)
    else:
        metrics = [0., 0., 0., 0., 0.]

    end = time.time() - start
    speed = total_tokens / end
    return sum(dev_loss) / len(dev_loss), metrics, speed
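# Standalone toy sketch of the mask-based truncation applied to decoded sequences above;
# the tag names and the toy tensor are made up.
import torch

lb = torch.tensor([[4, 5, 6, 0],            # 0 = PAD
                   [7, 0, 0, 0]])
label_mask = lb != 0
decoded = [["B-ASP", "I-ASP", "O", "<pad>"], ["O", "<pad>", "<pad>", "<pad>"]]
trimmed = [words[:n] for words, n in zip(decoded, label_mask.sum(dim=1).tolist())]
print(trimmed)    # [['B-ASP', 'I-ASP', 'O'], ['O']]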
def predict_batch(self, entries, wombat_object=None, return_probability=False):
    nl = []
    wd_tokens = []
    for entry in entries:
        input_tokens = entry["input_tokens"]
        ids = self.source2idx(input_tokens)
        nl.append(ids)
        if self.args.tokenize_type != "bpe":
            entry['input_list'] = self.tokenizer.process_nl(input_tokens)
        else:
            entry['input_list'] = self.tokenizer.encode(input_tokens,
                                                        add_special_tokens=False).tokens
        wd_tokens.append(entry['input_list'])

    self.labeler.eval()
    with torch.no_grad():
        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)

        # wombat_tensor = [batch, nl_len, emb_dim]
        wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,),
                                    dtype=torch.float32, device=self.device)
        wombat_idx = (nl_tensor == self.unk_id).nonzero()
        if wombat_object is not None:
            # Fill UNK positions with externally looked-up (Wombat) embeddings
            for t, (i, j) in enumerate(wombat_idx.tolist()):
                word_to_lookup = wd_tokens[i][j]
                print('Looking up Wombat for:', word_to_lookup)
                wombat_emb = wombat_object.get(word_to_lookup)
                if wombat_emb is not None:
                    print('Found Wombat embedding for:', word_to_lookup)
                    wombat_tensor[i, j] = torch.from_numpy(wombat_emb)

        de_score = self.labeler(nl_tensor, nl_len_tensor, wombat_tensor=wombat_tensor)
        label_mask = nl_tensor > 0

        if return_probability is False:
            output_prob, output_idx = self.labeler.inference(de_score, label_mask)
            if self.args.use_crf:
                predict_words = Tokenizer.decode_batch(output_idx, self.tokenizer.i2tw, 2)
                # predict_words = [words[:i] for words, i in
                #                  zip(predict_words, label_mask.sum(dim=1).tolist())]
                predict_prob = list(output_prob)
            else:
                # output_idx = de_score.max(-1)[1]
                predict_words = Tokenizer.decode_batch(output_idx.squeeze(-1).tolist(),
                                                       self.tokenizer.i2tw, 2)
                predict_words = [words[:i] for words, i in
                                 zip(predict_words, label_mask.sum(dim=1).tolist())]
                # predict_prob = acc_prob.prod(dim=-1).tolist()
                predict_prob = [words[:i] for words, i in
                                zip(output_prob.squeeze(-1).tolist(),
                                    label_mask.sum(dim=1).tolist())]

            for i, entry in enumerate(entries):
                # entry["pred_pair"] = list(zip(entry["input_review"], predict_words[i]))
                entry['pred_sequence'] = predict_words[i]
                entry['prob_sequence'] = predict_prob[i]
                # Extract (aspect, polarity, probability) triples from the tag sequence
                entities_list = NER_metrics.absa_extractor(
                    entry["input_list"], predict_words[i],
                    None if self.args.use_crf else predict_prob[i])
                entry["entities"] = []
                if len(entities_list) > 0:
                    for entity, senti, _, prob in entities_list:
                        # entry["entities"].append((entity, senti, prob))
                        entry["entities"].append({"aspect": entity,
                                                  "polarity": senti,
                                                  "probability": prob})
            return entries
        else:
            # Return the full label distribution per token instead of hard predictions
            label_prob = torch.softmax(de_score.squeeze(), dim=-1)
            return [{self.tokenizer.i2tw[ind]: prob for ind, prob in enumerate(prob_i)}
                    for prob_i in label_prob.tolist()]
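# Standalone toy sketch of the return_probability branch above: softmax over label scores,
# then index -> tag via an i2tw-style dict. The dict and the scores are invented.
import torch

i2tw = {0: "O", 1: "B-ASP", 2: "I-ASP"}
de_score = torch.tensor([[2.0, 0.5, 0.1],
                         [0.2, 1.5, 1.0]])
label_prob = torch.softmax(de_score, dim=-1)
dists = [{i2tw[ind]: prob for ind, prob in enumerate(prob_i)} for prob_i in label_prob.tolist()]
print(dists[0])   # ~{'O': 0.73, 'B-ASP': 0.16, 'I-ASP': 0.11}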
if use_selfatt:
    # use the maximum length 5 times larger than input length
    nlemb_HPs = [sw_size, 50, None, 0.5, True, True, 1000]
    # nn_mode, ninp, nhid, nlayers, nhead, dropout, activation, norm, his_mask
    enc_HPs = ["self_attention", 50, 200, 6, 10, 0.5, "relu", None, False]
else:
    nlemb_HPs = [sw_size, 50, None, 0.5, True, True]
    enc_HPs = ["lstm", 50, 200, 2, False, 0.5]

classifier = Classifier(nlemb_HPs, enc_HPs, drop_rate=0.5, num_labels=len(lb2id_dict))
de_score = classifier(nl_tensor, nl_len_tensor)
output_idx = de_score.max(-1)[1]
de_loss = classifier.NLL_loss(de_score, lb_tensor)

reference = []
candidate = []
label_words = Tokenizer.decode_batch(lb_tensor.squeeze().tolist(), id2lb_dict, 1)
predict_words = Tokenizer.decode_batch(output_idx.tolist(), id2lb_dict, 1)

if tokenize_type != "bpe":
    nl_token = tokenizer.decode_batch(nl_tensor.tolist(), tokenizer.i2sw, 2)
    nl_token = [words[:i] if EOT not in words else words[:words.index(EOT)]
                for words, i in zip(nl_token, (nl_tensor > 0).sum(dim=1).tolist())]
else:
    nl_token = tokenizer.decode_batch(nl_tensor.tolist())
    # nl_token = [enc_words.tokens for enc_words in self.args.vocab.encode_batch(nl_token)]
    nl_token = [words[0:words.find(EOT)].split() for words in nl_token]

# reference = [[w1, ..., EOT], ..., [w1, ..., EOT]]
reference.extend(label_words)
candidate.extend(predict_words)

# test inference
label_prob, label_pred = classifier.inference(de_score)
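# Standalone toy sketch of the score -> argmax -> NLL pattern used above, with
# F.nll_loss standing in for the model's own NLL_loss; shapes and labels are made up.
import torch
import torch.nn.functional as F

de_score = torch.log_softmax(torch.randn(3, 4), dim=-1)   # [batch, num_labels]
output_idx = de_score.max(-1)[1]                          # predicted label per example
lb = torch.tensor([0, 2, 1])
de_loss = F.nll_loss(de_score, lb)
print(output_idx.tolist(), float(de_loss))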
# fn_dim += hidden_dim
# hidden2tag = nn.Linear(fn_dim, len(vocab.tw2i))
seq2seq = EncDec(nlemb_HPs, enc_HPs, dec_HPs, drop_rate=fn_dropout,
                 num_labels=tw_size, enc_att=enc_att)

nl_len_tensor = (nl_tensor > pad_id).sum(dim=1)
# Apply teacher forcing for this batch with probability teacher_forcing_ratio
random_force = True if random.random() < teacher_forcing_ratio else False
# print("\nMODEL INPUTs: ", nl_tensor.shape, "\n")
de_score = seq2seq(nl_tensor, nl_len_tensor, lb_tensor, random_force)

olb_tensor = lb_tensor[:, 1:]           # shift targets: drop the BOS token
label_mask = olb_tensor > 0             # ignore PAD positions in the loss
total_loss = seq2seq.NLL_loss(de_score[label_mask], olb_tensor[label_mask]).mean()
output_idx = de_score.max(-1)[1]

if tokenize_type != "bpe":
    label_words = vocab.decode_batch(olb_tensor.tolist(), vocab.i2tw, 2)
    label_words = [words[:i] if EOT not in words else words[:words.index(EOT)]
                   for words, i in zip(label_words, label_mask.sum(dim=1).tolist())]
    predict_words = vocab.decode_batch(output_idx.tolist(), vocab.i2tw, 2)
    predict_words = [words[:i] if EOT not in words else words[:words.index(EOT)]
                     for words, i in zip(predict_words, label_mask.sum(dim=1).tolist())]
    nl_token = vocab.decode_batch(nl_tensor.tolist(), vocab.i2sw, 2)
    nl_token = [words[:i] if EOT not in words else words[:words.index(EOT)]
                for words, i in zip(nl_token, (nl_tensor > 0).sum(dim=1).tolist())]
else:
    label_words = vocab.decode_batch(olb_tensor.tolist())
    # label_words = [enc_words.tokens for enc_words in self.args.vocab.encode_batch(label_words)]
    label_words = [words[0:words.find(EOT)].split() for words in label_words]
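# Standalone toy sketch of the shifted-target masking used for the seq2seq loss above;
# the token ids are invented (2 = BOS, 3 = EOS, 0 = PAD).
import torch

lb = torch.tensor([[2, 5, 6, 3, 0]])   # [BOS, w1, w2, EOS, PAD]
olb = lb[:, 1:]                        # drop BOS: the decoder predicts the next token
label_mask = olb > 0                   # keep only real target positions
print(olb[label_mask].tolist())        # [5, 6, 3]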