def save_label(train_files, label_file, task=2, firstline=True):
    datasets = Tokenizer.load_file(train_files, firstline=firstline, task=task)
    label_set = set()
    for dataset in datasets:
        for nl, label in dataset:
            label_set.update(set(label.split()))
    TXT.write(label_set, label_file)
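# Usage sketch for save_label (hedged: the paths below are illustrative, not
# from this repo; any task-2-formatted files that Tokenizer.load_file accepts
# would work the same way):
#   save_label(["/data/Train.json", "/data/Dev.json"], "/data/labels.txt",
#              task=2, firstline=False)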
def predict_batch(self, entries, wombat_object=None):
    nl = []
    wd_tokens = []
    for entry in entries:
        input_tokens = entry["input_tokens"]
        ids = self.source2idx(input_tokens)
        nl.append(ids)
        if self.args.tokenize_type != "bpe":
            entry['input_list'] = self.tokenizer.process_nl(input_tokens)
        else:
            entry['input_list'] = self.tokenizer.encode(input_tokens, add_special_tokens=False).tokens
        wd_tokens.append(entry['input_list'])

    self.classifier.eval()
    with torch.no_grad():
        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
        # wombat_tensor = [batch, nl_len, emb_dim]
        wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,),
                                    dtype=torch.float32, device=self.device)
        wombat_idx = (nl_tensor == self.unk_id).nonzero()
        if wombat_object is not None:
            for t, (i, j) in enumerate(wombat_idx.tolist()):
                word_to_lookup = wd_tokens[i][j]
                print('Looking up Wombat for:', word_to_lookup)
                wombat_emb = wombat_object.get(word_to_lookup)
                if wombat_emb is not None:
                    print('Found Wombat embedding for:', word_to_lookup)
                    wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
        de_score = self.classifier(nl_tensor, nl_len_tensor, wombat_tensor=wombat_tensor)
        label_mask = nl_tensor > 0
        output_prob, output_idx = self.classifier.inference(de_score)
        predict_words = Tokenizer.decode_batch(output_idx.squeeze(-1).tolist(),
                                               self.tokenizer.i2tw, 1)
        predict_prob = output_prob.squeeze(-1).tolist()
    for i, entry in enumerate(entries):
        entry['pred_sequence'] = predict_words[i]
        entry['prob_sequence'] = predict_prob[i]
    return entries
def extract_label(inp_file, out_file, tokenizer=Tokenizer.process_target):
    data = read_data(inp_file)
    twcnt, twl = Counter(), 0
    for line in data:
        nl, target = line
        nl = nl.lower()
        target = target.lower()
        # Tokenize the target side into tokens
        target = tokenizer(target)
        twcnt, twl = Tokenizer.update_sent(target, twcnt, twl)
    labels = list(twcnt.keys())
    TXT.write(labels, out_file)
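# Usage sketch for extract_label (hedged: illustrative paths). It differs from
# save_label above by counting target tokens with a Counter, so the label file
# preserves first-seen order rather than set order:
#   extract_label("/data/Train.csv", "/data/labels.txt",
#                 tokenizer=Tokenizer.process_target)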
def inference(self, rv_text):
    prompt_text = LM.prepare_entry(rv_text)
    encoded_prompt = self.input2tensor(prompt_text)
    length = self.args.length if self.lm.args.model_type == "t5" \
        else self.args.length + len(encoded_prompt[0])
    output_sequences, probs = self.lm.model.generate(
        input_ids=encoded_prompt,
        max_length=length,
        temperature=self.args.temperature,
        top_k=self.args.k,
        top_p=self.args.p,
        repetition_penalty=self.args.repetition_penalty,
        num_beams=self.args.num_beams,
        do_sample=self.args.do_sample,
        num_return_sequences=self.args.num_return_sequences,
        bos_token_id=self.bos_token_id,
        eos_token_id=self.eos_token_id,
    )
    # Remove the batch dimension when returning multiple sequences
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        generated_sequence = generated_sequence.tolist()
        # Decode text
        text = Tokenizer.decode_batch(generated_sequence, self.lm.tokenizer.i2tw, level=1)
        text = " ".join(text)
        # Remove all text after the EOS token (keeping the token itself)
        eos_pos = text.find(self.lm.tokenizer.eos_token)
        gen_text = text[: eos_pos + len(self.lm.tokenizer.eos_token) if eos_pos != -1 else None]
        # For decoder-only models, strip the prompt that was prepended for pre-processing
        if self.lm.args.model_type != "t5":
            gen_text = gen_text[len(self.lm.tokenizer.decode(encoded_prompt[0],
                                                             clean_up_tokenization_spaces=True,
                                                             skip_special_tokens=True)):]
        total_sequence = (prompt_text, gen_text, probs[generated_sequence_idx])
        generated_sequences.append(total_sequence)
    return generated_sequences, probs
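# Hedged usage sketch (assumes an already-constructed instance of this class;
# the review text is made up). Each item in the returned list is a
# (prompt_text, generated_text, probability) triple.
def _example_inference(generator):
    sequences, probs = generator.inference("The room was clean but noisy.")
    for prompt, response, prob in sequences:
        print(prob, response)
    return sequences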
def build_dataloader(file: str, task: int, source2idx: Callable, target2idx: Callable,
                     batch_size: int, firstline: bool, collate: Callable) -> Tuple[DataLoader, int]:
    """
    @param file: path to the dataset file
    @param task: choose the respective task
    @param source2idx: source tokenizer
    @param target2idx: target tokenizer
    @param batch_size: batch size
    @param firstline: whether the first line is a header
    @param collate: collate function for sequence conversion
    @return: the DataLoader and the number of lines in the file
    """
    iterdata, num_lines = Tokenizer.prepare_iter(file, firstline=firstline, task=task)
    dataset = IterDataset(iterdata, source2idx=source2idx, target2idx=target2idx,
                          num_lines=num_lines)
    dataloader = DataLoader(dataset, pin_memory=True, batch_size=batch_size,
                            collate_fn=collate, num_workers=8)
    return dataloader, num_lines
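# Hedged wiring sketch (not part of the original code; the path and batch size
# are illustrative): shows how the index converters built elsewhere in this
# file plug into build_dataloader.
def _example_build_dataloader(tokenizer, pad_id, train_file="/data/Train.json"):
    src2idx = Tokenizer.lst2idx(tokenizer=tokenizer.process_nl,
                                vocab_words=tokenizer.sw2i, unk_words=True)
    tgt2idx = Tokenizer.lst2idx(tokenizer=tokenizer.process_target,
                                vocab_words=tokenizer.tw2i, unk_words=True)
    collate = Tokenizer.collate_fn(pad_id, True)
    return build_dataloader(train_file, task=2, source2idx=src2idx,
                            target2idx=tgt2idx, batch_size=32,
                            firstline=False, collate=collate)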
def build_data(args):
    if not args.tl:
        if not os.path.exists(args.model_dir):
            os.mkdir(args.model_dir)
        if args.timestamped_subdir:
            sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
        else:
            sub_folder = ''
        if not os.path.exists(os.path.join(args.model_dir, sub_folder)):
            os.mkdir(os.path.join(args.model_dir, sub_folder))
        args.model_dir = os.path.join(args.model_dir, sub_folder)
        args.log_file = os.path.join(args.model_dir, args.log_file)
        if args.tokenize_type != "bpe":
            s_paras = [args.wl_th, args.wcutoff]
            t_paras = [args.wl_th, args.wcutoff]
            print("INFO: - Build vocabulary...")
            tokenizer = Tokenizer(s_paras, t_paras)
            files = [args.train_file]
            if args.train_file != args.dev_file:
                files.append(args.dev_file)
            # Load datasets to build the vocabulary
            data = Tokenizer.load_file(files, task=2)
            tokenizer.build(datasets=data)
            sw2i = tokenizer.sw2i
            tw2i = tokenizer.tw2i
            print("INFO: - Save vocabulary...")
            Tokenizer.save(tokenizer, os.path.join(args.model_dir, "tokenizer.vocab"))
        else:
            print("INFO: - Load vocabulary...")
            tokenizer = BPE.load(args.vocab_file)
            tokenizer.add_tokens(sys_tokens)
            sw2i = tokenizer.get_vocab()
            tw2i = tokenizer.get_vocab()

        # Source-language embeddings
        args.swd_pretrained = None
        args.twd_pretrained = None
        if len(args.swd_embfile) != 0:
            scale = np.sqrt(3.0 / args.swd_dim)
            emb_reader = Embeddings(args.swd_embfile)
            args.swd_pretrained = emb_reader.get_W(args.swd_dim, sw2i, scale)
            if args.twd_embfile == args.swd_embfile:
                scale = np.sqrt(3.0 / args.twd_dim)
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)
        # Target-language embeddings
        if len(args.twd_embfile) != 0:
            scale = np.sqrt(3.0 / args.twd_dim)
            if args.twd_pretrained is None:
                emb_reader = Embeddings(args.twd_embfile)
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)
        # Directly integrate transfer learning if there are no new words to update
        SaveloadHP.save(args, os.path.join(args.model_dir, args.model_args))
        return args
    else:
        print("INFO: - Use the transfer learning technique")
        assert os.path.exists(args.tlargs), "\t- There is no pre-trained argument file"
        # Load the pre-trained argument file from a previous training folder
        margs = SaveloadHP.load(args.tlargs)
        # TODO: update the new vocab and all other new arguments used for the new training:
        #   0. Read vocab; 1. Update schema; 2. Update vocab
        # 3. Reuse the model file directory of the previous training
        args.model_dir = margs.model_dir
        args.seq2seq_file = margs.seq2seq_file
        # 4. Keep the remaining current arguments
        # TODO: add a constraint at loading time: if any model fails to load, just skip it
        args.swd_pretrained = margs.swd_pretrained
        args.twd_pretrained = margs.twd_pretrained
        return args
def __init__(self, args=None):
    print("INFO: - Load the pre-built tokenizer...")
    if args.tokenize_type != "bpe":
        tokenizer = Tokenizer.load(os.path.join(args.model_dir, "tokenizer.vocab"))
    else:
        tokenizer = BPE.load(args.vocab_file)
        tokenizer.add_tokens(sys_tokens)
        tokenizer.tw2i = tokenizer.get_vocab()
        tokenizer.i2tw = Tokenizer.reversed_dict(tokenizer.tw2i)
    self.args = args
    self.tokenizer = tokenizer
    self.device = torch.device("cuda:0" if self.args.use_cuda else "cpu")
    # Include SOT/EOT if set_words is set; otherwise ignore them
    self.num_labels = self.tokenizer.get_vocab_size()
    if self.num_labels > 2:
        self.lossF = nn.CrossEntropyLoss().to(self.device)
    else:
        self.lossF = nn.BCEWithLogitsLoss().to(self.device)

    if self.args.tokenize_type != "bpe":
        # Hyper-parameters at the source language
        self.source2idx = Tokenizer.lst2idx(tokenizer=self.tokenizer.process_nl,
                                            vocab_words=self.tokenizer.sw2i, unk_words=True,
                                            sos=self.args.ssos, eos=self.args.seos)
        # Hyper-parameters at the target language
        self.target2idx = Tokenizer.lst2idx(tokenizer=self.tokenizer.process_target,
                                            vocab_words=self.tokenizer.tw2i, unk_words=True,
                                            sos=self.args.tsos, eos=self.args.teos)
        self.pad_id = self.tokenizer.sw2i.get(PAD, 0)
        self.unk_id = self.tokenizer.sw2i.get(UNK, UNK_id)
        sw_size = len(self.tokenizer.sw2i)
        self.collate_fn = Tokenizer.collate_fn(self.pad_id, True)
    else:
        self.source2idx = BPE.tokens2ids(self.tokenizer, sos=self.args.ssos, eos=self.args.seos)
        self.target2idx = BPE.tokens2ids(self.tokenizer, sos=self.args.tsos, eos=self.args.teos)
        self.pad_id = self.tokenizer.token_to_id(BPAD) if self.tokenizer.token_to_id(BPAD) is not None \
            else self.tokenizer.token_to_id(PAD)
        self.unk_id = self.tokenizer.token_to_id(BUNK) if self.tokenizer.token_to_id(BUNK) is not None \
            else self.tokenizer.token_to_id(UNK)
        sw_size = self.tokenizer.get_vocab_size()
        self.collate_fn = BPE.collate_fn(self.pad_id, True)

    # Word-level hyper-parameters at the source language
    # [size, dim, pre_embs, drop_rate, zero_padding, requires_grad] = HPs
    nlemb_HPs = [sw_size, self.args.swd_dim, self.args.swd_pretrained,
                 self.args.wd_dropout, self.args.wd_padding, self.args.snl_reqgrad]  # NL inputs
    # Encoder
    # [nn_mode, nn_inp_dim, nn_out_dim, nn_layers, nn_bidirect, nn_dropout] = HPs
    if self.args.enc_cnn:
        enc_HPs = ["cnn", self.args.swd_dim, self.args.ed_outdim,
                   self.args.ed_layers, self.args.ed_bidirect, self.args.kernel_size]
    else:
        enc_HPs = [self.args.ed_mode, self.args.swd_dim, self.args.ed_outdim,
                   self.args.ed_layers, self.args.ed_bidirect, self.args.ed_dropout]
    # Decoder
    # [size, dim, pre_embs, drop_rate, zero_padding, requires_grad] = HPs
    temb_HPs = [self.num_labels, self.args.twd_dim, self.args.twd_pretrained,
                self.args.wd_dropout, self.args.wd_padding, self.args.twd_reqgrad]
    # Word-level hyper-parameters at the target language
    dec_HPs = [self.args.ed_mode, self.args.twd_dim, self.args.ed_outdim,
               self.args.ed_layers, self.args.ed_bidirect, self.args.ed_dropout]
    dec_HPs = [temb_HPs, dec_HPs]

    print("INFO: - Build model...")
    self.seq2seq = Seq2seq(nlemb_HPs, enc_HPs, dec_HPs, drop_rate=self.args.final_dropout,
                           num_labels=self.num_labels, enc_att=self.args.enc_att)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # dim = 0: a [30, xxx] batch is split into [10, ...], [10, ...], [10, ...] on 3 GPUs
        self.seq2seq = nn.DataParallel(self.seq2seq)
    self.seq2seq.to(self.device)

    self.seq2seq_optimizer = None
    if self.args.optimizer.lower() == "adamax":
        self.init_optimizers(optim.Adamax)
    elif self.args.optimizer.lower() == "adam":
        self.init_optimizers(optim.Adam)
    elif self.args.optimizer.lower() == "radam":
        self.init_optimizers(RAdam)
    elif self.args.optimizer.lower() == "adadelta":
        self.init_optimizers(optim.Adadelta)
    elif self.args.optimizer.lower() == "adagrad":
        self.init_optimizers(optim.Adagrad)
    else:
        self.init_optimizers(optim.SGD)
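# A minimal alternative sketch for the optimizer dispatch above (hedged: not
# part of this codebase): a lookup table keeps the name-to-class mapping in
# one place and falls back to SGD exactly like the if/elif chain.
#   _OPTIMIZERS = {"adamax": optim.Adamax, "adam": optim.Adam, "radam": RAdam,
#                  "adadelta": optim.Adadelta, "adagrad": optim.Adagrad}
#   self.init_optimizers(_OPTIMIZERS.get(self.args.optimizer.lower(), optim.SGD))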
def train(self):
    # The training result is returned to inform the calling code of the outcome.
    # Values: matching threshold reached (success): 0; otherwise: 1
    # training_result = 1
    train_data, train_numlines = Tokenizer.prepare_iter(self.args.train_file,
                                                        firstline=self.args.firstline, task=2)
    dev_data, dev_numlines = Tokenizer.prepare_iter(self.args.dev_file,
                                                    firstline=self.args.firstline, task=2)
    test_data, test_numlines = Tokenizer.prepare_iter(self.args.test_file,
                                                      firstline=self.args.firstline, task=2)
    saved_epoch = 0
    nepoch_no_imprv = 0
    epoch_start = time.time()
    max_epochs = self.args.max_epochs
    best_dev = -np.inf if self.args.metric == "bleu" else np.inf

    if self.args.tl:
        # 1. Load the pre-trained model from the previous model_dir
        print("INFO: - Load transfer learning models")
        self.load_transferlearning(epoch=-1)
        # 2. Update model_dir to the new one
        if self.args.timestamped_subdir:
            self.args.model_dir = os.path.abspath(os.path.join(self.args.model_dir, ".."))
            sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
        else:
            sub_folder = ''
        if not os.path.exists(os.path.join(self.args.model_dir, sub_folder)):
            os.mkdir(os.path.join(self.args.model_dir, sub_folder))
        self.args.model_dir = os.path.join(self.args.model_dir, sub_folder)
        # 3. Update the logfile dir
        self.args.log_file = os.path.join(self.args.model_dir, self.args.log_file)
        with open(self.args.log_file, "w") as f:
            f.write("START TRAINING\n")
        # 4. Save the updated arguments and log file to the new folder
        print("INFO: - Save new argument file")
        SaveloadHP.save(self.args, os.path.join(self.args.model_dir, self.args.model_args))
        dev_loss, dev_bleu, dev_string_match, dev_speed = self.evaluate_batch(
            dev_data, dev_numlines, self.args.pred_dev_file)
        best_dev = dev_bleu[0] if self.args.metric == "bleu" else dev_loss
        print("INFO: - Transfer learning performance")
        print("         - Current Dev loss: %.4f; Current Dev bleu: %.4f; "
              "Current Dev string match: %.4f; Dev speed: %.2f(tokens/s)" %
              (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
        self.appendfile("\t- Transfer learning performance")
        self.appendfile("\t\t- Current Dev loss: %.4f; Current Dev bleu: %.4f; "
                        "Current Dev string match: %.4f; Dev speed: %.2f(tokens/s)\n" %
                        (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
        # Assume the transferred model is the best one and save it in the main dir
        self.save_parameters(epoch=-1)
    else:
        with open(self.args.log_file, "w") as f:
            f.write("START TRAINING\n")

    print('Dev metric:', self.args.metric)
    for epoch in range(1, max_epochs + 1):
        print("Epoch: %s/%s" % (epoch, max_epochs))
        stime = time.time()
        train_loss = self.train_batch(train_data, train_numlines)
        print("BONUS: Training time of %.4f" % (time.time() - stime))
        # Evaluate on the development data
        dev_loss, dev_bleu, dev_string_match, dev_speed = self.evaluate_batch(
            dev_data, dev_numlines, self.args.pred_dev_file)
        dev_metric = dev_bleu[0] if self.args.metric == "bleu" else dev_loss
        cond = dev_metric > best_dev if self.args.metric == "bleu" else dev_loss < best_dev
        if cond:
            nepoch_no_imprv = 0
            saved_epoch = epoch
            best_dev = dev_metric
            print("UPDATES: - New improvement")
            print("         - Train loss: %.4f" % train_loss)
            print("         - Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; "
                  "Dev speed: %.2f(tokens/s)" %
                  (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
            self.appendfile("\t- New improvement at epoch %d:\n" % saved_epoch)
            self.appendfile("\t\t- Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; "
                            "Dev speed: %.2f(tokens/s)\n" %
                            (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
            print("INFO: - Save best models")
            self.save_parameters(epoch=-1)
        else:
            print("UPDATES: - No improvement")
            print("         - Train loss: %.4f" % train_loss)
            print("         - Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; "
                  "Dev speed: %.2f(tokens/s)" %
                  (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
            nepoch_no_imprv += 1
            # Decay the learning rate if there is no improvement
            if self.args.decay_rate > 0:
                self.lr_decay(epoch)
            if nepoch_no_imprv >= self.args.patience:
                # Load the current best models
                print("INFO: - Load best models")
                self.load_parameters(epoch=-1)
                test_loss, test_bleu, test_string_match, test_speed = self.evaluate_batch(
                    test_data, test_numlines, self.args.pred_test_file)
                print("SUMMARY: - Early stopping after %d epochs without improvements" % nepoch_no_imprv)
                print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
                print("         - Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; "
                      "Test speed: %.2f(tokens/s)" %
                      (test_loss, test_bleu[0], test_string_match, test_speed))
                self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
                self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
                self.appendfile("\t\t- Test loss: %.4f; Test bleu: %.4f; Test speed: %.2f(tokens/s)\n" %
                                (test_loss, test_bleu[0], test_speed))
                return test_bleu[0]

        epoch_finish, epoch_remain = Timer.timeEst2(epoch_start, epoch / max_epochs)
        print("INFO: - Trained time for %d epochs: %s" % (epoch, epoch_finish))
        print("\t- Remaining time for %d epochs (est.): %s\n" % (max_epochs - epoch, epoch_remain))

    print("INFO: - Load best models")
    self.load_parameters(epoch=-1)
    test_loss, test_bleu, test_string_match, test_speed = self.evaluate_batch(
        test_data, test_numlines, self.args.pred_test_file)
    print("SUMMARY: - Completed %d epochs" % max_epochs)
    print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
    print("         - Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; "
          "Test speed: %.2f(tokens/s)" %
          (test_loss, test_bleu[0], test_string_match, test_speed))
    self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
    self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
    self.appendfile("\t\t- Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; "
                    "Test speed: %.2f(tokens/s)\n" %
                    (test_loss, test_bleu[0], test_string_match, test_speed))
    return test_bleu[0]
def train(self):
    train_data, train_numlines = Tokenizer.prepare_iter(self.args.train_file,
                                                        firstline=self.args.firstline, task=2)
    dev_data, dev_numlines = Tokenizer.prepare_iter(self.args.dev_file,
                                                    firstline=self.args.firstline, task=2)
    test_data, test_numlines = Tokenizer.prepare_iter(self.args.test_file,
                                                      firstline=self.args.firstline, task=2)
    saved_epoch = 0
    nepoch_no_imprv = 0
    epoch_start = time.time()
    max_epochs = self.args.max_epochs
    best_dev = np.inf if self.args.metric == "loss" else -np.inf
    with open(self.args.log_file, "w") as f:
        f.write("START TRAINING\n")

    if self.args.tl:
        # 1. Load the pre-trained model from the previous model_dir
        print("INFO: - Load transfer learning models")
        self.load_transferlearning(epoch=-1)
        # 2. Update model_dir to the new one
        if self.args.timestamped_subdir:
            self.args.model_dir = os.path.abspath(os.path.join(self.args.model_dir, ".."))
            sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
        else:
            sub_folder = ''
        if not os.path.exists(os.path.join(self.args.model_dir, sub_folder)):
            os.mkdir(os.path.join(self.args.model_dir, sub_folder))
        self.args.model_dir = os.path.join(self.args.model_dir, sub_folder)
        # 3. Update the logfile dir
        self.args.log_file = os.path.join(self.args.model_dir, self.args.log_file)
        # 4. Save the updated arguments and log file to the new folder
        print("INFO: - Save new argument file")
        SaveloadHP.save(self.args, os.path.join(self.args.model_dir, self.args.model_args))
        dev_loss, dev_metrics, dev_speed = self.evaluate_batch(dev_data, dev_numlines)
        best_dev = dev_loss if self.args.metric == "loss" else dev_metrics[2]
        print("INFO: - Transfer learning performance")
        print("         - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
              "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
              (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2], dev_speed))
        print("         - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f" %
              (dev_metrics[3], dev_metrics[4]))
        self.appendfile("\t- Transfer learning performance")
        self.appendfile("\t\t- Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                        "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                        (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2], dev_speed))
        self.appendfile("\t\t- Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f" %
                        (dev_metrics[3], dev_metrics[4]))
        # Assume the transferred model is the best one and save it in the main dir
        self.save_parameters(epoch=-1)

    for epoch in range(1, max_epochs + 1):
        print("Epoch: %s/%s" % (epoch, max_epochs))
        stime = time.time()
        train_loss = self.train_batch(train_data, train_numlines)
        print("BONUS: Training time of %.4f" % (time.time() - stime))
        # Evaluate on the development data
        dev_loss, dev_metrics, dev_speed = self.evaluate_batch(dev_data, dev_numlines)
        dev_metric = dev_loss if self.args.metric == "loss" else dev_metrics[2]
        cond = dev_loss < best_dev if self.args.metric == "loss" else dev_metric > best_dev
        if cond:
            nepoch_no_imprv = 0
            saved_epoch = epoch
            best_dev = dev_metric
            print("UPDATES: - New improvement")
            print("         - Train loss: %.4f" % train_loss)
            print("         - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                  "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                  (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2], dev_speed))
            print("         - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f" %
                  (dev_metrics[3], dev_metrics[4]))
            self.appendfile("\t- New improvement at epoch %d:\n" % saved_epoch)
            self.appendfile("\t\t- Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                            "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)\n" %
                            (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2], dev_speed))
            self.appendfile("\t\t- Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f" %
                            (dev_metrics[3], dev_metrics[4]))
            print("INFO: - Save best models")
            self.save_parameters(epoch=-1)
        else:
            print("UPDATES: - No improvement")
            print("         - Train loss: %.4f" % train_loss)
            print("         - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                  "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                  (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2], dev_speed))
            print("         - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f" %
                  (dev_metrics[3], dev_metrics[4]))
            nepoch_no_imprv += 1
            # Decay the learning rate if there is no improvement
            if self.args.decay_rate > 0:
                self.lr_decay(epoch)
            if nepoch_no_imprv >= self.args.patience:
                # Load the current best models
                print("INFO: - Load best models")
                self.load_parameters(epoch=-1)
                test_loss, test_metrics, test_speed = self.evaluate_batch(test_data, test_numlines)
                print("SUMMARY: - Early stopping after %d epochs without improvements" % nepoch_no_imprv)
                print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
                print("         - Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                      "Test F1: %.4f; Test speed: %.2f(tokens/s)" %
                      (test_loss, test_metrics[0], test_metrics[1], test_metrics[2], test_speed))
                print("         - Current Test sep_acc: %.4f; Current Test full_acc: %.4f" %
                      (test_metrics[3], test_metrics[4]))
                self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
                self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
                self.appendfile("\t\t- Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                                "Test F1: %.4f; Test speed: %.2f(tokens/s)\n" %
                                (test_loss, test_metrics[0], test_metrics[1], test_metrics[2], test_speed))
                self.appendfile("\t\t- Current Test sep_acc: %.4f; Current Test full_acc: %.4f" %
                                (test_metrics[3], test_metrics[4]))
                return test_metrics

        epoch_finish, epoch_remain = Timer.timeEst2(epoch_start, epoch / max_epochs)
        print("INFO: - Trained time for %d epochs: %s" % (epoch, epoch_finish))
        print("\t- Remaining time for %d epochs (est.): %s\n" % (max_epochs - epoch, epoch_remain))

    print("INFO: - Load best models")
    self.load_parameters(epoch=-1)
    test_loss, test_metrics, test_speed = self.evaluate_batch(test_data, test_numlines)
    print("SUMMARY: - Completed %d epochs" % max_epochs)
    print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
    print("         - Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
          "Test F1: %.4f; Test speed: %.2f(tokens/s)" %
          (test_loss, test_metrics[0], test_metrics[1], test_metrics[2], test_speed))
    print("         - Current Test sep_acc: %.4f; Current Test full_acc: %.4f" %
          (test_metrics[3], test_metrics[4]))
    self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
    self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
    self.appendfile("\t\t- Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                    "Test F1: %.4f; Test speed: %.2f(tokens/s)\n" %
                    (test_loss, test_metrics[0], test_metrics[1], test_metrics[2], test_speed))
    self.appendfile("\t\t- Current Test sep_acc: %.4f; Current Test full_acc: %.4f" %
                    (test_metrics[3], test_metrics[4]))
    return test_metrics
def __init__(self, args=None):
    print("INFO: - Load the pre-built tokenizer...")
    if args.tokenize_type != "bpe":
        tokenizer = Tokenizer.load(os.path.join(args.model_dir, "tokenizer.vocab"))
    else:
        tokenizer = BPE.load(args.vocab_file)
        tokenizer.add_tokens(sys_tokens)
    labels_list = TXT.read(args.label_file, firstline=False)
    tokenizer.tw2i = Tokenizer.list2dict(sys_tokens + labels_list)
    tokenizer.i2tw = Tokenizer.reversed_dict(tokenizer.tw2i)
    self.args = args
    self.tokenizer = tokenizer
    self.device = torch.device("cuda:0" if self.args.use_cuda else "cpu")
    self.num_labels = len(self.tokenizer.tw2i)
    # Hyper-parameters at the target language
    self.target2idx = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target,
                                        vocab_words=self.tokenizer.tw2i, unk_words=True,
                                        sos=self.args.ssos, eos=self.args.seos)
    if self.args.tokenize_type != "bpe":
        # Hyper-parameters at the source language
        self.source2idx = Tokenizer.lst2idx(tokenizer=Tokenizer.process_nl,
                                            vocab_words=self.tokenizer.sw2i, unk_words=True,
                                            sos=self.args.ssos, eos=self.args.seos)
        self.pad_id = self.tokenizer.sw2i.get(PAD, PAD_id)
        self.unk_id = self.tokenizer.sw2i.get(UNK, UNK_id)
        sw_size = len(self.tokenizer.sw2i)
        self.collate_fn = Tokenizer.collate_fn(self.pad_id, True)
    else:
        self.source2idx = BPE.tokens2ids(self.tokenizer, sos=self.args.ssos, eos=self.args.seos)
        self.pad_id = self.tokenizer.token_to_id(BPAD) if self.tokenizer.token_to_id(BPAD) is not None \
            else self.tokenizer.token_to_id(PAD)
        self.unk_id = self.tokenizer.token_to_id(BUNK) if self.tokenizer.token_to_id(BUNK) is not None \
            else self.tokenizer.token_to_id(UNK)
        sw_size = self.tokenizer.get_vocab_size()
        self.collate_fn = BPE.collate_fn(self.pad_id, True)

    # Word-level hyper-parameters at the source language
    # [size, dim, pre_embs, drop_rate, zero_padding, requires_grad] = HPs
    nlemb_HPs = [sw_size, self.args.swd_dim, self.args.swd_pretrained,
                 self.args.wd_dropout, self.args.wd_padding, self.args.snl_reqgrad]
    # Encoder
    # [nn_mode, nn_inp_dim, nn_out_dim, nn_layers, nn_bidirect, nn_dropout] = HPs
    if self.args.enc_cnn:
        enc_HPs = ["cnn", self.args.swd_dim, self.args.ed_outdim,
                   self.args.ed_layers, self.args.ed_bidirect, self.args.kernel_size]
    else:
        if self.args.ed_mode == "self_attention":
            # Use a maximum length 5 times larger than the input length
            nlemb_HPs += [self.tokenizer.swl * 5]
            # [nn_mode, ninp, nhid, nlayers, nhead, dropout, activation, norm, his_mask] = HPs
            enc_HPs = [self.args.ed_mode, self.args.swd_dim, self.args.ed_outdim,
                       self.args.ed_layers, self.args.ed_heads, self.args.ed_dropout,
                       self.args.ed_activation, None, self.args.ed_hismask]
        else:
            enc_HPs = [self.args.ed_mode, self.args.swd_dim, self.args.ed_outdim,
                       self.args.ed_layers, self.args.ed_bidirect, self.args.ed_dropout]
    crf_HPs = [self.args.use_crf, self.num_labels, self.args.se_transitions]

    print("INFO: - Build model...")
    self.labeler = Labeler(nlemb_HPs, enc_HPs, crf_HPs,
                           drop_rate=self.args.final_dropout, num_labels=self.num_labels)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        self.labeler = nn.DataParallel(self.labeler)
    self.labeler.to(self.device)

    self.labeler_optimizer = None
    if self.args.optimizer.lower() == "adamax":
        self.init_optimizers(optim.Adamax)
    elif self.args.optimizer.lower() == "adam":
        self.init_optimizers(optim.Adam)
    elif self.args.optimizer.lower() == "radam":
        self.init_optimizers(RAdam)
    elif self.args.optimizer.lower() == "adadelta":
        self.init_optimizers(optim.Adadelta)
    elif self.args.optimizer.lower() == "adagrad":
        self.init_optimizers(optim.Adagrad)
    else:
        self.init_optimizers(optim.SGD)
def evaluate_batch(self, eva_data, num_eva):
    start = time.time()
    self.labeler.eval()
    nl_tokens = []
    reference = []
    candidate = []
    predict_probs = []
    dev_loss = []
    total_tokens = 0
    eva_iterdataset = IterDataset(eva_data, source2idx=self.source2idx,
                                  target2idx=self.target2idx, num_lines=num_eva,
                                  bpe=True if self.args.tokenize_type == "bpe" else False)
    eva_dataloader = DataLoader(eva_iterdataset, pin_memory=True,
                                batch_size=self.args.batch_size, collate_fn=self.collate_fn)
    with torch.no_grad():
        for i, d in enumerate(eva_dataloader):
            d = tuple(t.to(self.device) for t in d)
            nl_tensor, lb_tensor = d
            nl_len_tensor = (nl_tensor != self.pad_id).sum(dim=1)
            de_score = self.labeler(nl_tensor, nl_len_tensor)
            label_mask = nl_tensor != self.pad_id
            # TODO: can move NLL into seq2seq for multigpu
            total_loss = self.labeler.NLL_loss(de_score, lb_tensor, label_mask)
            dev_loss.append(total_loss.item())
            total_tokens += label_mask.sum()
            output_prob, output_idx = self.labeler.inference(de_score, label_mask)
            label_words = Tokenizer.decode_batch(lb_tensor.tolist(), self.tokenizer.i2tw, 2)
            label_words = [words[:i] for words, i in
                           zip(label_words, label_mask.sum(dim=1).tolist())]
            # reference = [[w1, ..., EOT], ..., [w1, ..., EOT]]
            reference.extend(label_words)
            if self.args.use_crf:
                predict_words = Tokenizer.decode_batch(output_idx, self.tokenizer.i2tw, 2)
                predict_probs += output_prob
            else:
                predict_words = Tokenizer.decode_batch(output_idx.squeeze(-1).tolist(),
                                                       self.tokenizer.i2tw, 2)
                predict_words = [words[:i] for words, i in
                                 zip(predict_words, label_mask.sum(dim=1).tolist())]
                predict_probs += [words[:i] for words, i in
                                  zip(output_prob.squeeze(-1).tolist(),
                                      label_mask.sum(dim=1).tolist())]
            # candidate = [[w1, ..., EOT], ..., [w1, ..., EOT]]
            candidate.extend(predict_words)
            if self.args.tokenize_type != "bpe":
                nl_token = self.tokenizer.decode_batch(nl_tensor.tolist(), self.tokenizer.i2sw, 2)
                nl_token = [words[:i] if EOT not in words else words[:words.index(EOT)]
                            for words, i in zip(nl_token, (nl_tensor > 0).sum(dim=1).tolist())]
            else:
                nl_token = self.tokenizer.decode_batch(nl_tensor.tolist())
                nl_token = [words[0:words.find(EOT)].split() for words in nl_token]
            nl_tokens.extend(nl_token)
            del nl_tensor, nl_len_tensor, lb_tensor, de_score, label_mask

    if len(candidate) != 0 and len(reference) != 0:
        assert len(candidate) == len(reference)
        # Randomly sample one pair
        rand_idx = random.randint(0, len(reference) - 1)
        print("\nRANDOMLY sampling: ")
        print("\t- An Input Sequence: ", " ".join(nl_tokens[rand_idx]))
        print("\t- A LABEL query: ", " ".join(reference[rand_idx]))
        print("\t- A PREDICTED query: ", " ".join(candidate[rand_idx]))
        print("\t- A PREDICTED prob: ", predict_probs[rand_idx], "\n\n")
        metrics = Labeler_model.class_metrics(reference, candidate)
    else:
        metrics = [0., 0., 0., 0., 0.]
    end = time.time() - start
    speed = total_tokens / end
    return sum(dev_loss) / len(dev_loss), metrics, speed
def predict_batch(self, entries, wombat_object=None, return_probability=False):
    nl = []
    wd_tokens = []
    for entry in entries:
        input_tokens = entry["input_tokens"]
        ids = self.source2idx(input_tokens)
        nl.append(ids)
        if self.args.tokenize_type != "bpe":
            entry['input_list'] = self.tokenizer.process_nl(input_tokens)
        else:
            entry['input_list'] = self.tokenizer.encode(input_tokens, add_special_tokens=False).tokens
        wd_tokens.append(entry['input_list'])

    self.labeler.eval()
    with torch.no_grad():
        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
        # wombat_tensor = [batch, nl_len, emb_dim]
        wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,),
                                    dtype=torch.float32, device=self.device)
        wombat_idx = (nl_tensor == self.unk_id).nonzero()
        if wombat_object is not None:
            for t, (i, j) in enumerate(wombat_idx.tolist()):
                word_to_lookup = wd_tokens[i][j]
                print('Looking up Wombat for:', word_to_lookup)
                wombat_emb = wombat_object.get(word_to_lookup)
                if wombat_emb is not None:
                    print('Found Wombat embedding for:', word_to_lookup)
                    wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
        de_score = self.labeler(nl_tensor, nl_len_tensor, wombat_tensor=wombat_tensor)
        label_mask = nl_tensor > 0
        if return_probability is False:
            output_prob, output_idx = self.labeler.inference(de_score, label_mask)
            if self.args.use_crf:
                predict_words = Tokenizer.decode_batch(output_idx, self.tokenizer.i2tw, 2)
                predict_prob = list(output_prob)
            else:
                predict_words = Tokenizer.decode_batch(output_idx.squeeze(-1).tolist(),
                                                       self.tokenizer.i2tw, 2)
                predict_words = [words[:i] for words, i in
                                 zip(predict_words, label_mask.sum(dim=1).tolist())]
                predict_prob = [words[:i] for words, i in
                                zip(output_prob.squeeze(-1).tolist(),
                                    label_mask.sum(dim=1).tolist())]
            for i, entry in enumerate(entries):
                entry['pred_sequence'] = predict_words[i]
                entry['prob_sequence'] = predict_prob[i]
                entities_list = NER_metrics.absa_extractor(
                    entry["input_list"], predict_words[i],
                    None if self.args.use_crf else predict_prob[i])
                entry["entities"] = []
                if len(entities_list) > 0:
                    for entity, senti, _, prob in entities_list:
                        entry["entities"].append({"aspect": entity,
                                                  "polarity": senti,
                                                  "probability": prob})
            return entries
        else:
            label_prob = torch.softmax(de_score.squeeze(), dim=-1)
            return [{self.tokenizer.i2tw[ind]: prob for ind, prob in enumerate(prob_i)}
                    for prob_i in label_prob.tolist()]
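# Hedged usage sketch (assumes a constructed instance of this class; the
# sentence is made up). Each entry comes back with the predicted tag sequence,
# its probabilities, and the extracted aspect/polarity entities.
def _example_predict_batch(model):
    entries = [{"input_tokens": "the battery life is great"}]
    entries = model.predict_batch(entries)
    return entries[0]["pred_sequence"], entries[0]["entities"]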
except:
    tokenizer.add_tokens([MASK])
tokenizer.add_tokens([SENSP, SENGE, NL, NULL])
special_tokens_count = tokenizer.num_special_tokens_to_add()
max_seq_length = tokenizer.max_len
cls_token_id = tokenizer.cls_token_id
sep_token_id = tokenizer.sep_token_id
cls_token_at_end = False
sep_token_extra = False
pad_token_label_id = -100

label_file = "/media/data/review_response/labels.txt"
labels_list = TXT.read(label_file, firstline=False)
tokenizer.tw2i = Tokenizer.list2dict(sys_tokens + labels_list)
tokenizer.i2tw = Tokenizer.reversed_dict(tokenizer.tw2i)
lb2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=tokenizer.tw2i,
                           unk_words=False, sos=False, eos=False)
pad_id = 0 if tokenizer._pad_token is None else tokenizer.pad_token_id
num_labels = len(tokenizer.tw2i)
build_inputs_with_special_tokens = add_special_tokens(
    max_seq_length=max_seq_length,
    special_tokens_count=special_tokens_count,
    cls_token_id=cls_token_id,
    sep_token_id=sep_token_id,
    cls_token_at_end=cls_token_at_end,
    # NOTE: the source snippet is truncated here; the closing argument below is
    # an assumption based on the otherwise-unused local defined above.
    sep_token_extra=sep_token_extra,
)
import torch
from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler, TensorDataset
from mlmodels.utils.BPEtonkenizer import BPE
from mlmodels.utils.special_tokens import BPAD, PAD, NULL, EOT
from mlmodels.utils.txtIO import TXT
from mlmodels.utils.trad_tokenizer import Tokenizer, sys_tokens
from mlmodels.utils.jsonIO import JSON
from mlmodels.utils.csvIO import CSV

Data2tensor.set_randseed(12345)
device = torch.device("cpu")
dtype = torch.long
use_cuda = False
filename = "/media/data/classification/datasets/yelp_review_full_csv/train.csv"
label_file = "/media/data/classification/datasets/yelp_review_full_csv/labels.txt"

labels_list = TXT.read(label_file, firstline=False)
lb2id_dict = Tokenizer.list2dict(sys_tokens + labels_list)
id2lb_dict = Tokenizer.reversed_dict(lb2id_dict)
lb2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=lb2id_dict,
                           unk_words=False, sos=False, eos=False)

tokenize_type = "bpe"
if tokenize_type != "bpe":
    # Load datasets to build the vocabulary
    data = Tokenizer.load_file([filename], task=1)
    s_paras = [-1, 1]
    t_paras = [-1, 1]
    tokenizer = Tokenizer(s_paras, t_paras)
    tokenizer.build(data)
    nl2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_nl, vocab_words=tokenizer.sw2i,
                               unk_words=True, sos=False, eos=False)
    tokenizer.tw2i = lb2id_dict
    tokenizer.i2tw = id2lb_dict
import torch
from torch.utils.data import DataLoader
from mlmodels.utils.dataset import IterDataset
from mlmodels.utils.BPEtonkenizer import BPE
from mlmodels.utils.special_tokens import BPAD, PAD, SOT, EOT, NULL
from mlmodels.utils.trad_tokenizer import Tokenizer
from mlmodels.utils.jsonIO import JSON

Data2tensor.set_randseed(12345)
device = torch.device("cpu")
dtype = torch.long
use_cuda = False
filename = "/media/data/review_response/Dev.json"

tokenize_type = "bpe"
if tokenize_type != "bpe":
    # Load datasets to build the vocabulary
    data = Tokenizer.load_file([filename], task=2)
    s_paras = [-1, 1]
    t_paras = [-1, 1]
    vocab = Tokenizer(s_paras, t_paras)
    vocab.build(data)
    nl2ids = Tokenizer.lst2idx(tokenizer=vocab.process_nl, vocab_words=vocab.sw2i,
                               unk_words=True, eos=True)
    tg2ids = Tokenizer.lst2idx(tokenizer=vocab.process_target, vocab_words=vocab.tw2i,
                               unk_words=False, sos=True, eos=True)
    pad_id = vocab.sw2i.get(PAD, 0)
    sw_size = len(vocab.sw2i)
    tw_size = len(vocab.tw2i)
else:
    vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
    vocab = BPE.load(vocab_file)
    vocab.add_tokens([SOT, EOT, NULL])
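# Quick sanity-check sketch for the BPE branch above (hedged: the sentence is
# made up; encode(...).tokens/.ids follows the same tokenizers-style API
# already used elsewhere in this file):
#   encoded = vocab.encode("The staff was friendly.", add_special_tokens=False)
#   print(encoded.tokens, encoded.ids)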