"""Supervised mapping of BERT representations from a source language into the
target language's space."""

import json
import os
from collections import OrderedDict

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Project-internal dependencies; the exact import paths are repository-specific.
# The names used below are: initialize_exp, build_model, SupervisedBertTrainer,
# load, convert, load_from_bert, load_from_single_bert.


class SupervisedBert(object):

    def __init__(self, args):
        self.args = args

        # check parameters
        if not self.args.pred:
            # assert 0 < self.args.lr_shrink <= 1
            assert self.args.model_path is not None

        self.dataset = None
        # build model / trainer / evaluator
        if not (self.args.pred or self.args.eval):
            self.logger = initialize_exp(self.args)
        self.bert_model, self.bert_model1, self.mapping = build_model(
            self.args, True)

        if self.args.local_rank == -1 or self.args.no_cuda:
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() and not self.args.no_cuda
                else "cpu")
        else:
            self.device = torch.device("cuda", self.args.local_rank)
        self.transformer_types = [
            'self_attention', 'attention', 'linear_self_attention',
            'nonlinear_self_attention'
        ]

    def train(self):
        """Train the mapping on aligned source/target tokens."""
        if self.args.load_pred_bert:
            assert self.args.bert_file0 is not None
            assert self.args.bert_file1 is not None
            assert self.args.vocab_file is not None
            assert self.args.vocab_file1 is not None
            self.dataset, unique_id_to_feature, self.features = load_from_bert(
                self.args.vocab_file,
                self.args.bert_file0,
                self.args.bert_file1,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                n_max_sent=self.args.n_max_sent,
                vocab_file1=self.args.vocab_file1,
                align_file=self.args.align_file,
                align_punc=self.args.align_punc,
                policy=self.args.align_policy)
        else:
            self.dataset, unique_id_to_feature, self.features = load(
                self.args.vocab_file,
                self.args.input_file,
                batch_size=self.args.batch_size,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                local_rank=self.args.local_rank,
                vocab_file1=self.args.vocab_file1,
                align_file=self.args.align_file)
        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            bert_model1=self.bert_model1,
            trans_types=self.transformer_types)

        sampler = RandomSampler(self.dataset)
        train_loader = DataLoader(self.dataset,
                                  sampler=sampler,
                                  batch_size=self.args.batch_size)
        n_without_improvement = 0
        min_loss = 1e6
        path4loss = self.args.model_path + '/model4loss'
        if not os.path.exists(path4loss):
            os.makedirs(path4loss)
        if self.args.save_all:
            model_log = open(path4loss + '/model.log', 'w')

        # training loop
        for n_epoch in range(self.args.n_epochs):
            # self.logger.info('Starting epoch %i...' % n_epoch)
            if (n_epoch + 1) % self.args.decay_step == 0:
                self.trainer.decay_map_lr()
            n_inst = 0
            n_batch = 0
            to_log = {"avg_cosine_similarity": 0, "loss": 0}
            if self.args.load_pred_bert:
                # precomputed BERT features: only moving tensors to the device
                # is grad-free; the mapping step itself needs gradients
                for (input_embs_a, input_mask_a, input_embs_b, input_mask_b,
                     align_ids_a, align_ids_b, align_mask,
                     example_indices) in train_loader:
                    n_batch += 1
                    with torch.no_grad():
                        input_embs_a = input_embs_a.to(self.device)
                        input_mask_a = input_mask_a.to(self.device)
                        input_embs_b = input_embs_b.to(self.device)
                        align_ids_a = align_ids_a.to(self.device)
                        align_ids_b = align_ids_b.to(self.device)
                        align_mask = align_mask.to(self.device)
                    src_bert = self.trainer.get_indexed_mapped_bert_from_bert(
                        input_embs_a,
                        input_mask_a,
                        align_ids_a,
                        align_mask,
                        bert_layer=self.args.bert_layer)
                    tgt_bert = self.trainer.get_indexed_bert_from_bert(
                        input_embs_b,
                        align_ids_b,
                        align_mask,
                        bert_layer=self.args.bert_layer)
                    avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                        src_bert, tgt_bert)
                    n_inst += src_bert.size()[0]
                    cos_sim = avg_cos_sim.cpu().detach().numpy()
                    loss_ = loss.cpu().detach().numpy()
                    to_log["avg_cosine_similarity"] += cos_sim
                    to_log["loss"] += loss_
            else:
                # raw token ids: run the BERT encoders inside the trainer
                for (input_ids_a, input_mask_a, input_ids_b, input_mask_b,
                     align_ids_a, align_ids_b, align_mask,
                     example_indices) in train_loader:
                    n_batch += 1
                    input_ids_a = input_ids_a.to(self.device)
                    input_mask_a = input_mask_a.to(self.device)
                    input_ids_b = input_ids_b.to(self.device)
                    input_mask_b = input_mask_b.to(self.device)
                    align_ids_a = align_ids_a.to(self.device)
                    align_ids_b = align_ids_b.to(self.device)
                    align_mask = align_mask.to(self.device)
                    src_bert = self.trainer.get_indexed_mapped_bert(
                        input_ids_a,
                        input_mask_a,
                        align_ids_a,
                        align_mask,
                        bert_layer=self.args.bert_layer,
                        model_id=0)
                    tgt_bert = self.trainer.get_indexed_bert(
                        input_ids_b,
                        input_mask_b,
                        align_ids_b,
                        align_mask,
                        bert_layer=self.args.bert_layer,
                        model_id=1)
                    avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                        src_bert, tgt_bert)
                    n_inst += src_bert.size()[0]
                    cos_sim = avg_cos_sim.cpu().detach().numpy()
                    loss_ = loss.cpu().detach().numpy()
                    to_log["avg_cosine_similarity"] += cos_sim
                    to_log["loss"] += loss_
            to_log["avg_cosine_similarity"] /= n_batch
            to_log["loss"] /= n_batch
            self.logger.info(
                "Epoch:{}, avg cos sim:{:.6f}, avg loss:{:.6f}, instances:{}".format(
                    n_epoch, to_log["avg_cosine_similarity"], to_log["loss"],
                    n_inst))
            # An epoch counts as "no improvement" only if neither metric improved:
            # cosine similarity did not exceed the best seen and loss did not
            # drop below the minimum seen.
            if (to_log["avg_cosine_similarity"] <= self.trainer.best_valid_metric
                    and to_log["loss"] >= min_loss):
                n_without_improvement += 1
            else:
                n_without_improvement = 0
            if to_log["loss"] < min_loss:
                self.logger.info("Minimum loss: {:.6f}".format(to_log["loss"]))
                if self.args.save_all:
                    save_path = path4loss + '/epoch-' + str(n_epoch)
                    model_log.write(
                        "Epoch:{}, avg cos sim:{:.6f}, avg loss:{:.6f}\n".format(
                            n_epoch, to_log["avg_cosine_similarity"],
                            to_log["loss"]))
                else:
                    save_path = path4loss
                self.trainer.save_model(save_path + '/best_mapping.pkl')
                min_loss = to_log["loss"]
            if self.args.save_sim:
                self.trainer.save_best(to_log, "avg_cosine_similarity")
            else:
                if to_log["avg_cosine_similarity"] > self.trainer.best_valid_metric:
                    self.trainer.best_valid_metric = to_log["avg_cosine_similarity"]
            self.logger.info(
                "Max avg cos sim:{:.6f}, Min avg loss:{:.6f}".format(
                    self.trainer.best_valid_metric, min_loss))
            # self.logger.info('End of epoch %i.\n\n' % n_epoch)
            if n_without_improvement >= self.args.quit_after_n_epochs_without_improvement:
                self.logger.info(
                    'After {} epochs without improvement, quitting!'.format(
                        n_without_improvement))
                break
        if self.args.save_all:
            model_log.close()
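    @staticmethod
    def _batch_metrics(src, tgt):
        """Illustrative only (not called by the loop above): the two quantities
        logged each epoch, assuming `supervised_mapping_step` reports the
        batch-mean cosine similarity of aligned vectors and, under
        loss='l2_dist', their batch-mean L2 distance. The trainer's own
        implementation may differ in detail."""
        import torch.nn.functional as F  # local import keeps the sketch self-contained
        avg_cos_sim = F.cosine_similarity(src, tgt, dim=-1).mean()
        avg_l2_dist = (src - tgt).norm(dim=-1).mean()
        return avg_cos_sim, avg_l2_dist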
    def svd(self):
        """Learn the mapping in closed form via SVD (orthogonal Procrustes)."""
        if self.args.load_pred_bert:
            assert self.args.bert_file0 is not None
            assert self.args.bert_file1 is not None
            assert self.args.vocab_file is not None
            assert self.args.vocab_file1 is not None
            self.dataset, unique_id_to_feature, self.features = load_from_bert(
                self.args.vocab_file,
                self.args.bert_file0,
                self.args.bert_file1,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                n_max_sent=self.args.n_max_sent,
                vocab_file1=self.args.vocab_file1,
                align_file=self.args.align_file,
                align_punc=self.args.align_punc,
                policy=self.args.align_policy)
        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            bert_model1=self.bert_model1,
            trans_types=self.transformer_types)

        sampler = SequentialSampler(self.dataset)
        # one batch holding the whole dataset: Procrustes is solved once,
        # over all aligned pairs
        data_loader = DataLoader(self.dataset,
                                 sampler=sampler,
                                 batch_size=len(self.dataset))
        self.trainer.args.loss = 'l2_dist'
        for (input_embs_a, input_mask_a, input_embs_b, input_mask_b,
             align_ids_a, align_ids_b, align_mask,
             example_indices) in data_loader:
            self.logger.info("Applying SVD")
            with torch.no_grad():
                input_embs_a = input_embs_a.to(self.device)
                input_mask_a = input_mask_a.to(self.device)
                input_embs_b = input_embs_b.to(self.device)
                align_ids_a = align_ids_a.to(self.device)
                align_ids_b = align_ids_b.to(self.device)
                align_mask = align_mask.to(self.device)
            src_bert = self.trainer.get_indexed_bert_from_bert(
                input_embs_a,
                align_ids_a,
                align_mask,
                bert_layer=self.args.bert_layer)
            tgt_bert = self.trainer.get_indexed_bert_from_bert(
                input_embs_b,
                align_ids_b,
                align_mask,
                bert_layer=self.args.bert_layer)
            avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                src_bert, tgt_bert, eval_only=True)
            avg_cos_sim_0 = avg_cos_sim.cpu().detach().numpy()
            loss_0 = loss.cpu().detach().numpy()
            self.logger.info(
                "Before mapping: avg cos sim:{:.6f}, avg l2 distance:{:.6f}".format(
                    avg_cos_sim_0, loss_0))
            self.trainer.procrustes(src_bert, tgt_bert)
            self.trainer.save_model(self.args.model_path + '/best_mapping.pkl')
            mapped_src_bert = self.trainer.get_indexed_mapped_bert_from_bert(
                input_embs_a,
                input_mask_a,
                align_ids_a,
                align_mask,
                bert_layer=self.args.bert_layer)
            avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                mapped_src_bert, tgt_bert, eval_only=True)
            avg_cos_sim_1 = avg_cos_sim.cpu().detach().numpy()
            loss_1 = loss.cpu().detach().numpy()
            self.logger.info(
                "After mapping: avg cos sim:{:.6f}, avg l2 distance:{:.6f}".format(
                    avg_cos_sim_1, loss_1))
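    @staticmethod
    def _procrustes_sketch(src, tgt):
        """Illustrative only: the closed-form step that
        `trainer.procrustes(src_bert, tgt_bert)` presumably performs. For
        row-aligned matrices X (n, d) and Y (n, d), the orthogonal W minimizing
        ||X @ W.T - Y||_F is W = U @ Vt with U, S, Vt = SVD(Y.T @ X); the
        trainer's implementation may differ in details such as normalization."""
        m = tgt.transpose(0, 1).mm(src)  # (d, d) cross-covariance matrix
        u, s, vt = torch.linalg.svd(m)   # full SVD
        return u.mm(vt)                  # W maps a source row x to W @ x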
    def eval(self):
        """Evaluate the trained mapping on aligned source/target tokens."""
        if self.args.load_pred_bert:
            assert self.args.bert_file0 is not None
            assert self.args.bert_file1 is not None
            self.dataset, unique_id_to_feature, self.features = load_from_bert(
                self.args.vocab_file,
                self.args.bert_file0,
                self.args.bert_file1,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                n_max_sent=self.args.n_max_sent,
                vocab_file1=self.args.vocab_file1,
                align_file=self.args.align_file)
        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            bert_model1=self.bert_model1,
            trans_types=self.transformer_types)
        self.trainer.load_best()
        self.trainer.mapping.eval()

        sampler = SequentialSampler(self.dataset)
        eval_loader = DataLoader(self.dataset,
                                 sampler=sampler,
                                 batch_size=self.args.batch_size)
        n_inst = 0
        n_batch = 0
        to_log = {"avg_cos_sim": 0, "loss": 0}
        # report the L2 distance as the evaluation loss
        self.trainer.args.loss = 'l2_dist'
        for (input_embs_a, input_mask_a, input_embs_b, input_mask_b,
             align_ids_a, align_ids_b, align_mask,
             example_indices) in eval_loader:
            n_batch += 1
            with torch.no_grad():
                input_embs_a = input_embs_a.to(self.device)
                input_mask_a = input_mask_a.to(self.device)
                input_embs_b = input_embs_b.to(self.device)
                align_ids_a = align_ids_a.to(self.device)
                align_ids_b = align_ids_b.to(self.device)
                align_mask = align_mask.to(self.device)
            src_bert = self.trainer.get_indexed_mapped_bert_from_bert(
                input_embs_a,
                input_mask_a,
                align_ids_a,
                align_mask,
                bert_layer=self.args.bert_layer)
            tgt_bert = self.trainer.get_indexed_bert_from_bert(
                input_embs_b,
                align_ids_b,
                align_mask,
                bert_layer=self.args.bert_layer)
            avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                src_bert, tgt_bert, eval_only=True)
            n_inst += src_bert.size()[0]
            cos_sim = avg_cos_sim.cpu().detach().numpy()
            loss_ = loss.cpu().detach().numpy()
            to_log["avg_cos_sim"] += cos_sim
            to_log["loss"] += loss_
        to_log["avg_cos_sim"] /= n_batch
        to_log["loss"] /= n_batch
        print("avg cos sim:{:.6f}, avg l2 distance:{:.6f}, instances:{}".format(
            to_log["avg_cos_sim"], to_log["loss"], n_inst))

    def _write_features(self, writer, target_layer, example_indices, features):
        """Write one JSON line per example: for every token, the mapped vector
        of layer `args.bert_layer`, rounded to 6 decimals."""
        layer_output = target_layer.detach().cpu().numpy()
        for b, example_index in enumerate(example_indices):
            feature = features[example_index.item()]
            unique_id = int(feature.unique_id)
            output_json = OrderedDict()
            output_json["linex_index"] = unique_id
            all_out_features = []
            for i, token in enumerate(feature.tokens):
                layers = OrderedDict()
                layers["index"] = self.args.bert_layer
                layers["values"] = [
                    round(x.item(), 6) for x in layer_output[b][i]
                ]
                out_features = OrderedDict()
                out_features["token"] = token
                out_features["layers"] = [layers]
                all_out_features.append(out_features)
            output_json["features"] = all_out_features
            writer.write(json.dumps(output_json) + "\n")

    def list2bert(self, sents):
        """Map BERT features of source-language sentences into the target
        space and write them to `args.output_file`."""
        assert self.args.output_file is not None
        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            trans_types=self.transformer_types)
        self.trainer.load_best()
        if self.args.load_pred_bert:
            assert self.args.bert_file0 is not None
            pred_dataset, unique_id_to_feature, features = load_from_single_bert(
                self.args.bert_file0,
                sents,
                max_seq_length=self.args.max_seq_length)
        else:
            pred_dataset, unique_id_to_feature, features = convert(
                self.args.vocab_file,
                sents,
                batch_size=self.args.batch_size,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                local_rank=self.args.local_rank)
        self.bert_model.eval()

        pred_sampler = SequentialSampler(pred_dataset)
        pred_dataloader = DataLoader(pred_dataset,
                                     sampler=pred_sampler,
                                     batch_size=self.args.batch_size)
        self.trainer.mapping.eval()
        with open(self.args.output_file, "w", encoding='utf-8') as writer:
            if self.args.load_pred_bert:
                for input_embs, input_mask, example_indices in pred_dataloader:
                    input_embs = input_embs.to(self.device)
                    input_mask = input_mask.to(self.device)
                    src_encoder_layer = input_embs
                    if self.args.map_type in self.transformer_types:
                        # attention-style mappings also take the mask
                        target_layer = self.trainer.mapping(
                            src_encoder_layer, input_mask)
                    elif self.args.map_type == 'fine_tune':
                        # a fine-tuned encoder already outputs target-space vectors
                        target_layer = src_encoder_layer
                    else:
                        target_layer = self.trainer.mapping(src_encoder_layer)
                    self._write_features(writer, target_layer, example_indices,
                                         features)
            else:
                for input_ids, input_mask, example_indices in pred_dataloader:
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    if self.args.map_input:
                        all_encoder_layers, _ = self.bert_model(
                            input_ids,
                            token_type_ids=None,
                            attention_mask=input_mask,
                            input_mapping=self.trainer.mapping)
                        target_layer = all_encoder_layers[self.args.bert_layer]
                    else:
                        all_encoder_layers, _ = self.bert_model(
                            input_ids,
                            token_type_ids=None,
                            attention_mask=input_mask)
                        src_encoder_layer = all_encoder_layers[self.args.bert_layer]
                        if self.args.map_type in self.transformer_types:
                            target_layer = self.trainer.mapping(
                                src_encoder_layer, input_mask)
                        elif self.args.map_type == 'fine_tune':
                            target_layer = src_encoder_layer
                        else:
                            target_layer = self.trainer.mapping(src_encoder_layer)
                    self._write_features(writer, target_layer, example_indices,
                                         features)
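    # Each line written by list2bert() (and by transform() below) is one JSON
    # object per input example; "index" holds args.bert_layer and the value 8
    # below is only an example:
    #
    # {"linex_index": 0,
    #  "features": [
    #    {"token": "[CLS]",
    #     "layers": [{"index": 8, "values": [0.123456, -0.07, ...]}]},
    #    ...]}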
    def transform(self):
        """Map precomputed source-language BERT features into the target
        space and write them to `args.output_file`."""
        assert self.args.output_file is not None
        assert self.args.bert_file0 is not None
        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            trans_types=self.transformer_types)
        self.trainer.load_best()
        pred_dataset, unique_id_to_feature, features = load_from_single_bert(
            self.args.bert_file0, max_seq_length=self.args.max_seq_length)

        pred_sampler = SequentialSampler(pred_dataset)
        pred_dataloader = DataLoader(pred_dataset,
                                     sampler=pred_sampler,
                                     batch_size=self.args.batch_size)
        self.trainer.mapping.eval()
        with open(self.args.output_file, "w", encoding='utf-8') as writer:
            for input_embs, input_mask, example_indices in pred_dataloader:
                input_embs = input_embs.to(self.device)
                input_mask = input_mask.to(self.device)
                src_encoder_layer = input_embs
                if self.args.map_type in self.transformer_types:
                    target_layer = self.trainer.mapping(
                        src_encoder_layer, input_mask)
                elif self.args.map_type == 'fine_tune':
                    target_layer = src_encoder_layer
                else:
                    target_layer = self.trainer.mapping(src_encoder_layer)
                self._write_features(writer, target_layer, example_indices,
                                     features)
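
# Minimal usage sketch (illustrative; the repository's real entry point
# presumably builds `args` with argparse, and the attributes listed are only
# the subset read by this class):
#
#     args = ...  # namespace with pred, eval, model_path, bert_file0/1,
#                 # vocab_file/vocab_file1, align_file, bert_layer, map_type,
#                 # batch_size, output_file, ...
#     supervised = SupervisedBert(args)
#     supervised.train()      # gradient-based mapping, or
#     supervised.svd()        # one-shot Procrustes mapping
#     supervised.eval()       # report avg cos sim / L2 on aligned pairs
#     supervised.transform()  # write mapped features as JSON lines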