Example #1
    def transform(self):
        """
        Map BERT representations of the source language into the target
        space and write them to the output file in JSON-lines format.
        """
        assert self.args.output_file is not None

        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            trans_types=self.transformer_types)
        self.trainer.load_best()

        assert self.args.bert_file0 is not None
        pred_dataset, unique_id_to_feature, features = load_from_single_bert(
            self.args.bert_file0, max_seq_length=self.args.max_seq_length)
        pred_sampler = SequentialSampler(pred_dataset)
        pred_dataloader = DataLoader(pred_dataset,
                                     sampler=pred_sampler,
                                     batch_size=self.args.batch_size)

        self.trainer.mapping.eval()
        with open(self.args.output_file, "w", encoding='utf-8') as writer:
            for input_embs, input_mask, example_indices in pred_dataloader:
                input_embs = input_embs.to(self.device)
                input_mask = input_mask.to(self.device)

                src_encoder_layer = input_embs
                if self.args.map_type in self.transformer_types:
                    target_layer = self.trainer.mapping(
                        src_encoder_layer, input_mask)
                elif self.args.map_type == 'fine_tune':
                    target_layer = src_encoder_layer
                else:
                    target_layer = self.trainer.mapping(src_encoder_layer)

                for b, example_index in enumerate(example_indices):
                    feature = features[example_index.item()]
                    unique_id = int(feature.unique_id)
                    # feature = unique_id_to_feature[unique_id]
                    output_json = OrderedDict()
                    output_json["linex_index"] = unique_id
                    all_out_features = []
                    layer_output = target_layer[b].detach().cpu().numpy()
                    for (i, token) in enumerate(feature.tokens):
                        all_layers = []
                        layers = OrderedDict()
                        layers["index"] = self.args.bert_layer
                        layers["values"] = [
                            round(x.item(), 6) for x in layer_output[i]
                        ]
                        all_layers.append(layers)
                        out_features = OrderedDict()
                        out_features["token"] = token
                        out_features["layers"] = all_layers
                        all_out_features.append(out_features)
                    output_json["features"] = all_out_features
                    writer.write(json.dumps(output_json) + "\n")
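
A note on the output: transform() writes one JSON record per line, with a "linex_index" and per-token "features" whose "layers" carry the mapped vectors (the same layout used by BERT-style feature-extraction scripts). As a minimal sketch, not part of the original code and with a placeholder path, the file can be read back like this:

import json

def read_mapped_bert(path="mapped_bert.jsonl"):  # placeholder path
    records = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            # record["features"]: list of {"token", "layers"}; each layer
            # holds "index" (the BERT layer id) and "values" (the mapped
            # embedding, rounded to 6 decimals).
            records.append(record)
    return records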
Example #2
    def eval(self):
        """
        """
        if self.args.load_pred_bert:
            assert self.args.bert_file0 is not None
            assert self.args.bert_file1 is not None
            self.dataset, unique_id_to_feature, self.features = load_from_bert(
                self.args.vocab_file,
                self.args.bert_file0,
                self.args.bert_file1,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                n_max_sent=self.args.n_max_sent,
                vocab_file1=self.args.vocab_file1,
                align_file=self.args.align_file)

        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            bert_model1=self.bert_model1,
            trans_types=self.transformer_types)
        self.trainer.load_best()
        self.trainer.mapping.eval()

        sampler = SequentialSampler(self.dataset)
        train_loader = DataLoader(self.dataset,
                                  sampler=sampler,
                                  batch_size=self.args.batch_size)

        n_inst = 0
        n_batch = 0
        to_log = {"avg_cos_sim": 0, "loss": 0}
        self.trainer.args.loss = 'l2_dist'

        for input_embs_a, input_mask_a, input_embs_b, input_mask_b, align_ids_a, align_ids_b, align_mask, example_indices in train_loader:
            n_batch += 1
            with torch.no_grad():
                input_embs_a = input_embs_a.to(self.device)
                input_mask_a = input_mask_a.to(self.device)
                input_embs_b = input_embs_b.to(self.device)
            align_ids_a = align_ids_a.to(self.device)
            align_ids_b = align_ids_b.to(self.device)
            align_mask = align_mask.to(self.device)
            #print (align_ids_a, align_ids_b, align_mask)
            src_bert = self.trainer.get_indexed_mapped_bert_from_bert(
                input_embs_a,
                input_mask_a,
                align_ids_a,
                align_mask,
                bert_layer=self.args.bert_layer)
            tgt_bert = self.trainer.get_indexed_bert_from_bert(
                input_embs_b,
                align_ids_b,
                align_mask,
                bert_layer=self.args.bert_layer)
            avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                src_bert, tgt_bert, eval_only=True)
            n_inst += src_bert.size()[0]
            cos_sim = avg_cos_sim.cpu().detach().numpy()
            loss_ = loss.cpu().detach().numpy()
            to_log["avg_cos_sim"] += cos_sim
            to_log["loss"] += loss_

        to_log["avg_cos_sim"] /= n_batch
        to_log["loss"] /= n_batch
        print(
            "avg cos sim:{:.6f}, avg l2 distance:{:.6f}, instances:{}".format(
                to_log["avg_cos_sim"], to_log["loss"], n_inst))
Example #3
    def train(self):
        """
        """

        if self.args.load_pred_bert:
            assert self.args.bert_file0 is not None
            assert self.args.bert_file1 is not None
            assert self.args.vocab_file is not None
            assert self.args.vocab_file1 is not None
            self.dataset, unique_id_to_feature, self.features = load_from_bert(
                self.args.vocab_file,
                self.args.bert_file0,
                self.args.bert_file1,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                n_max_sent=self.args.n_max_sent,
                vocab_file1=self.args.vocab_file1,
                align_file=self.args.align_file,
                align_punc=self.args.align_punc,
                policy=self.args.align_policy)
        else:
            self.dataset, unique_id_to_feature, self.features = load(
                self.args.vocab_file,
                self.args.input_file,
                batch_size=self.args.batch_size,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                local_rank=self.args.local_rank,
                vocab_file1=self.args.vocab_file1,
                align_file=self.args.align_file)
        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            bert_model1=self.bert_model1,
            trans_types=self.transformer_types)

        sampler = RandomSampler(self.dataset)
        train_loader = DataLoader(self.dataset,
                                  sampler=sampler,
                                  batch_size=self.args.batch_size)

        n_without_improvement = 0
        min_loss = 1e6
        path4loss = self.args.model_path + '/model4loss'
        if not os.path.exists(path4loss):
            os.makedirs(path4loss)
        if self.args.save_all:
            model_log = open(path4loss + '/model.log', 'w')

        # training loop
        for n_epoch in range(self.args.n_epochs):
            #self.logger.info('Starting epoch %i...' % n_epoch)
            if (n_epoch + 1) % self.args.decay_step == 0:
                self.trainer.decay_map_lr()
            n_inst = 0
            n_batch = 0
            to_log = {"avg_cosine_similarity": 0, "loss": 0}

            if self.args.load_pred_bert:
                for input_embs_a, input_mask_a, input_embs_b, input_mask_b, align_ids_a, align_ids_b, align_mask, example_indices in train_loader:
                    n_batch += 1
                    with torch.no_grad():
                        input_embs_a = input_embs_a.to(self.device)
                        input_mask_a = input_mask_a.to(self.device)
                        input_embs_b = input_embs_b.to(self.device)
                    align_ids_a = align_ids_a.to(self.device)
                    align_ids_b = align_ids_b.to(self.device)
                    align_mask = align_mask.to(self.device)
                    #print (align_ids_a, align_ids_b, align_mask)
                    src_bert = self.trainer.get_indexed_mapped_bert_from_bert(
                        input_embs_a,
                        input_mask_a,
                        align_ids_a,
                        align_mask,
                        bert_layer=self.args.bert_layer)
                    tgt_bert = self.trainer.get_indexed_bert_from_bert(
                        input_embs_b,
                        align_ids_b,
                        align_mask,
                        bert_layer=self.args.bert_layer)

                    avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                        src_bert, tgt_bert)
                    n_inst += src_bert.size()[0]
                    cos_sim = avg_cos_sim.cpu().detach().numpy()
                    loss_ = loss.cpu().detach().numpy()

                    to_log["avg_cosine_similarity"] += cos_sim
                    to_log["loss"] += loss_
            else:
                for input_ids_a, input_mask_a, input_ids_b, input_mask_b, align_ids_a, align_ids_b, align_mask, example_indices in train_loader:
                    n_batch += 1
                    input_ids_a = input_ids_a.to(self.device)
                    input_mask_a = input_mask_a.to(self.device)
                    input_ids_b = input_ids_b.to(self.device)
                    input_mask_b = input_mask_b.to(self.device)
                    align_ids_a = align_ids_a.to(self.device)
                    align_ids_b = align_ids_b.to(self.device)
                    align_mask = align_mask.to(self.device)
                    #print (align_ids_a, align_ids_b, align_mask)
                    src_bert = self.trainer.get_indexed_mapped_bert(
                        input_ids_a,
                        input_mask_a,
                        align_ids_a,
                        align_mask,
                        bert_layer=self.args.bert_layer,
                        model_id=0)
                    tgt_bert = self.trainer.get_indexed_bert(
                        input_ids_b,
                        input_mask_b,
                        align_ids_b,
                        align_mask,
                        bert_layer=self.args.bert_layer,
                        model_id=1)

                    avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                        src_bert, tgt_bert)
                    n_inst += src_bert.size()[0]
                    cos_sim = avg_cos_sim.cpu().detach().numpy()
                    loss_ = loss.cpu().detach().numpy()

                    to_log["avg_cosine_similarity"] += cos_sim
                    to_log["loss"] += loss_
            to_log["avg_cosine_similarity"] /= n_batch
            to_log["loss"] /= n_batch
            self.logger.info(
                "Epoch:{}, avg cos sim:{:.6f}, avg loss:{:.6f}, instances:{}".
                format(n_epoch, to_log["avg_cosine_similarity"],
                       to_log["loss"], n_inst))

            if to_log[
                    "avg_cosine_similarity"] <= self.trainer.best_valid_metric and to_log[
                        "loss"] >= min_loss:
                n_without_improvement += 1
            else:
                n_without_improvement = 0
            if to_log["loss"] < min_loss:
                self.logger.info(" Minimum loss : {:.6f}".format(
                    to_log["loss"]))
                if self.args.save_all:
                    save_path = path4loss + '/epoch-' + str(n_epoch)
                    model_log.write(
                        "Epoch:{}, avg cos sim:{:.6f}, avg loss:{:.6f}\n".
                        format(n_epoch, to_log["avg_cosine_similarity"],
                               to_log["loss"]))
                else:
                    save_path = path4loss
                self.trainer.save_model(save_path + '/best_mapping.pkl')
                min_loss = to_log["loss"]
            if self.args.save_sim:
                self.trainer.save_best(to_log, "avg_cosine_similarity")
            else:
                if to_log[
                        "avg_cosine_similarity"] > self.trainer.best_valid_metric:
                    self.trainer.best_valid_metric = to_log[
                        "avg_cosine_similarity"]
            self.logger.info(
                "Max avg cos sim:{:.6f}, Min avg loss:{:.6f}".format(
                    self.trainer.best_valid_metric, min_loss))
            #self.logger.info('End of epoch %i.\n\n' % n_epoch)
            if n_without_improvement >= self.args.quit_after_n_epochs_without_improvement:
                self.logger.info(
                    'After {} epochs without improvement, quitting!'.format(
                        n_without_improvement))
                break
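
The epoch loop combines two criteria: the best-loss checkpoint is saved whenever the epoch loss drops below min_loss, and training stops after quit_after_n_epochs_without_improvement epochs in which neither the average cosine similarity nor the loss improved. A minimal sketch of that bookkeeping in isolation (the function and names are illustrative, not an API from the repository):

def epochs_until_stop(cos_history, loss_history, patience):
    # Mirrors the counter logic in train(): the stale counter only grows
    # when neither metric improves over its best value so far, and resets
    # otherwise.
    best_cos, min_loss, stale = float("-inf"), float("inf"), 0
    for epoch, (cos_sim, loss) in enumerate(zip(cos_history, loss_history)):
        if cos_sim <= best_cos and loss >= min_loss:
            stale += 1
        else:
            stale = 0
        best_cos = max(best_cos, cos_sim)
        min_loss = min(min_loss, loss)
        if stale >= patience:
            return epoch + 1  # training would break here
    return len(cos_history)  # ran all epochs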
Example #4
    def svd(self):
        """
        """
        if self.args.load_pred_bert:
            assert self.args.bert_file0 is not None
            assert self.args.bert_file1 is not None
            assert self.args.vocab_file is not None
            assert self.args.vocab_file1 is not None
            self.dataset, unique_id_to_feature, self.features = load_from_bert(
                self.args.vocab_file,
                self.args.bert_file0,
                self.args.bert_file1,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                n_max_sent=self.args.n_max_sent,
                vocab_file1=self.args.vocab_file1,
                align_file=self.args.align_file,
                align_punc=self.args.align_punc,
                policy=self.args.align_policy)

        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            bert_model1=self.bert_model1,
            trans_types=self.transformer_types)

        sampler = SequentialSampler(self.dataset)
        train_loader = DataLoader(self.dataset,
                                  sampler=sampler,
                                  batch_size=len(self.dataset))

        self.trainer.args.loss = 'l2_dist'
        for input_embs_a, input_mask_a, input_embs_b, input_mask_b, align_ids_a, align_ids_b, align_mask, example_indices in train_loader:
            self.logger.info("Applying SVD")
            with torch.no_grad():
                input_embs_a = input_embs_a.to(self.device)
                input_mask_a = input_mask_a.to(self.device)
                input_embs_b = input_embs_b.to(self.device)
            align_ids_a = align_ids_a.to(self.device)
            align_ids_b = align_ids_b.to(self.device)
            align_mask = align_mask.to(self.device)
            #print (align_ids_a, align_ids_b, align_mask)
            src_bert = self.trainer.get_indexed_bert_from_bert(
                input_embs_a,
                align_ids_a,
                align_mask,
                bert_layer=self.args.bert_layer)
            tgt_bert = self.trainer.get_indexed_bert_from_bert(
                input_embs_b,
                align_ids_b,
                align_mask,
                bert_layer=self.args.bert_layer)
            avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                src_bert, tgt_bert, eval_only=True)
            avg_cos_sim_0 = avg_cos_sim.cpu().detach().numpy()
            loss_0 = loss.cpu().detach().numpy()
            self.logger.info(
                "Before mapping: avg cos sim:{:.6f}, avg l2 distance:{:.6f}".
                format(avg_cos_sim_0, loss_0))

            self.trainer.procrustes(src_bert, tgt_bert)
            self.trainer.save_model(self.args.model_path + '/best_mapping.pkl')

            mapped_src_bert = self.trainer.get_indexed_mapped_bert_from_bert(
                input_embs_a,
                input_mask_a,
                align_ids_a,
                align_mask,
                bert_layer=self.args.bert_layer)
            avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                mapped_src_bert, tgt_bert, eval_only=True)
            avg_cos_sim_1 = avg_cos_sim.cpu().detach().numpy()
            loss_1 = loss.cpu().detach().numpy()
            self.logger.info(
                "After mapping: avg cos sim:{:.6f}, avg l2 distance:{:.6f}".
                format(avg_cos_sim_1, loss_1))
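
svd() delegates the closed-form fit to trainer.procrustes(src_bert, tgt_bert), whose implementation is not shown in this listing. Assuming it solves the standard orthogonal Procrustes problem, as the method name and the "Applying SVD" log suggest, a minimal sketch is:

import torch

def procrustes_mapping(src_bert, tgt_bert):
    # src_bert, tgt_bert: (n_aligned_tokens, hidden_size), rows are vectors.
    # The orthogonal W minimizing ||src @ W.T - tgt||_F is W = U @ Vh,
    # where U, S, Vh = SVD(tgt.T @ src).
    m = tgt_bert.transpose(0, 1).mm(src_bert)
    u, s, vh = torch.linalg.svd(m, full_matrices=False)
    return u.mm(vh)  # apply as: mapped_src = src_bert.mm(w.t())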
Example #5
class SupervisedBert(object):
    def __init__(self, args):

        self.args = args
        # check parameters
        if not self.args.pred:
            #assert 0 < self.args.lr_shrink <= 1
            assert self.args.model_path is not None
        self.dataset = None
        # build model / trainer / evaluator
        if not (self.args.pred or self.args.eval):
            self.logger = initialize_exp(self.args)

        self.bert_model, self.bert_model1, self.mapping = build_model(
            self.args, True)

        if self.args.local_rank == -1 or self.args.no_cuda:
            self.device = torch.device("cuda" if torch.cuda.is_available()
                                       and not self.args.no_cuda else "cpu")
        else:
            self.device = torch.device("cuda", self.args.local_rank)
        self.transformer_types = [
            'self_attention', 'attention', 'linear_self_attention',
            'nonlinear_self_attention'
        ]

    def train(self):
        """
        """

        if self.args.load_pred_bert:
            assert self.args.bert_file0 is not None
            assert self.args.bert_file1 is not None
            assert self.args.vocab_file is not None
            assert self.args.vocab_file1 is not None
            self.dataset, unique_id_to_feature, self.features = load_from_bert(
                self.args.vocab_file,
                self.args.bert_file0,
                self.args.bert_file1,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                n_max_sent=self.args.n_max_sent,
                vocab_file1=self.args.vocab_file1,
                align_file=self.args.align_file,
                align_punc=self.args.align_punc,
                policy=self.args.align_policy)
        else:
            self.dataset, unique_id_to_feature, self.features = load(
                self.args.vocab_file,
                self.args.input_file,
                batch_size=self.args.batch_size,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                local_rank=self.args.local_rank,
                vocab_file1=self.args.vocab_file1,
                align_file=self.args.align_file)
        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            bert_model1=self.bert_model1,
            trans_types=self.transformer_types)

        sampler = RandomSampler(self.dataset)
        train_loader = DataLoader(self.dataset,
                                  sampler=sampler,
                                  batch_size=self.args.batch_size)

        n_without_improvement = 0
        min_loss = 1e6
        path4loss = self.args.model_path + '/model4loss'
        if not os.path.exists(path4loss):
            os.makedirs(path4loss)
        if self.args.save_all:
            model_log = open(path4loss + '/model.log', 'w')

        # training loop
        for n_epoch in range(self.args.n_epochs):
            #self.logger.info('Starting epoch %i...' % n_epoch)
            if (n_epoch + 1) % self.args.decay_step == 0:
                self.trainer.decay_map_lr()
            n_inst = 0
            n_batch = 0
            to_log = {"avg_cosine_similarity": 0, "loss": 0}

            if self.args.load_pred_bert:
                for input_embs_a, input_mask_a, input_embs_b, input_mask_b, align_ids_a, align_ids_b, align_mask, example_indices in train_loader:
                    n_batch += 1
                    with torch.no_grad():
                        input_embs_a = input_embs_a.to(self.device)
                        input_mask_a = input_mask_a.to(self.device)
                        input_embs_b = input_embs_b.to(self.device)
                    align_ids_a = align_ids_a.to(self.device)
                    align_ids_b = align_ids_b.to(self.device)
                    align_mask = align_mask.to(self.device)
                    #print (align_ids_a, align_ids_b, align_mask)
                    src_bert = self.trainer.get_indexed_mapped_bert_from_bert(
                        input_embs_a,
                        input_mask_a,
                        align_ids_a,
                        align_mask,
                        bert_layer=self.args.bert_layer)
                    tgt_bert = self.trainer.get_indexed_bert_from_bert(
                        input_embs_b,
                        align_ids_b,
                        align_mask,
                        bert_layer=self.args.bert_layer)

                    avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                        src_bert, tgt_bert)
                    n_inst += src_bert.size()[0]
                    cos_sim = avg_cos_sim.cpu().detach().numpy()
                    loss_ = loss.cpu().detach().numpy()

                    to_log["avg_cosine_similarity"] += cos_sim
                    to_log["loss"] += loss_
            else:
                for input_ids_a, input_mask_a, input_ids_b, input_mask_b, align_ids_a, align_ids_b, align_mask, example_indices in train_loader:
                    n_batch += 1
                    input_ids_a = input_ids_a.to(self.device)
                    input_mask_a = input_mask_a.to(self.device)
                    input_ids_b = input_ids_b.to(self.device)
                    input_mask_b = input_mask_b.to(self.device)
                    align_ids_a = align_ids_a.to(self.device)
                    align_ids_b = align_ids_b.to(self.device)
                    align_mask = align_mask.to(self.device)
                    #print (align_ids_a, align_ids_b, align_mask)
                    src_bert = self.trainer.get_indexed_mapped_bert(
                        input_ids_a,
                        input_mask_a,
                        align_ids_a,
                        align_mask,
                        bert_layer=self.args.bert_layer,
                        model_id=0)
                    tgt_bert = self.trainer.get_indexed_bert(
                        input_ids_b,
                        input_mask_b,
                        align_ids_b,
                        align_mask,
                        bert_layer=self.args.bert_layer,
                        model_id=1)

                    avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                        src_bert, tgt_bert)
                    n_inst += src_bert.size()[0]
                    cos_sim = avg_cos_sim.cpu().detach().numpy()
                    loss_ = loss.cpu().detach().numpy()

                    to_log["avg_cosine_similarity"] += cos_sim
                    to_log["loss"] += loss_
            to_log["avg_cosine_similarity"] /= n_batch
            to_log["loss"] /= n_batch
            self.logger.info(
                "Epoch:{}, avg cos sim:{:.6f}, avg loss:{:.6f}, instances:{}".
                format(n_epoch, to_log["avg_cosine_similarity"],
                       to_log["loss"], n_inst))

            if to_log[
                    "avg_cosine_similarity"] <= self.trainer.best_valid_metric and to_log[
                        "loss"] >= min_loss:
                n_without_improvement += 1
            else:
                n_without_improvement = 0
            if to_log["loss"] < min_loss:
                self.logger.info(" Minimum loss : {:.6f}".format(
                    to_log["loss"]))
                if self.args.save_all:
                    save_path = path4loss + '/epoch-' + str(n_epoch)
                    model_log.write(
                        "Epoch:{}, avg cos sim:{:.6f}, avg loss:{:.6f}\n".
                        format(n_epoch, to_log["avg_cosine_similarity"],
                               to_log["loss"]))
                else:
                    save_path = path4loss
                self.trainer.save_model(save_path + '/best_mapping.pkl')
                min_loss = to_log["loss"]
            if self.args.save_sim:
                self.trainer.save_best(to_log, "avg_cosine_similarity")
            else:
                if to_log[
                        "avg_cosine_similarity"] > self.trainer.best_valid_metric:
                    self.trainer.best_valid_metric = to_log[
                        "avg_cosine_similarity"]
            self.logger.info(
                "Max avg cos sim:{:.6f}, Min avg loss:{:.6f}".format(
                    self.trainer.best_valid_metric, min_loss))
            #self.logger.info('End of epoch %i.\n\n' % n_epoch)
            if n_without_improvement >= self.args.quit_after_n_epochs_without_improvement:
                self.logger.info(
                    'After {} epochs without improvement, quitting!'.format(
                        n_without_improvement))
                break

    def svd(self):
        """
        """
        if self.args.load_pred_bert:
            assert self.args.bert_file0 is not None
            assert self.args.bert_file1 is not None
            assert self.args.vocab_file is not None
            assert self.args.vocab_file1 is not None
            self.dataset, unique_id_to_feature, self.features = load_from_bert(
                self.args.vocab_file,
                self.args.bert_file0,
                self.args.bert_file1,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                n_max_sent=self.args.n_max_sent,
                vocab_file1=self.args.vocab_file1,
                align_file=self.args.align_file,
                align_punc=self.args.align_punc,
                policy=self.args.align_policy)

        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            bert_model1=self.bert_model1,
            trans_types=self.transformer_types)

        sampler = SequentialSampler(self.dataset)
        train_loader = DataLoader(self.dataset,
                                  sampler=sampler,
                                  batch_size=len(self.dataset))

        self.trainer.args.loss = 'l2_dist'
        for input_embs_a, input_mask_a, input_embs_b, input_mask_b, align_ids_a, align_ids_b, align_mask, example_indices in train_loader:
            self.logger.info("Applying SVD")
            with torch.no_grad():
                input_embs_a = input_embs_a.to(self.device)
                input_mask_a = input_mask_a.to(self.device)
                input_embs_b = input_embs_b.to(self.device)
            align_ids_a = align_ids_a.to(self.device)
            align_ids_b = align_ids_b.to(self.device)
            align_mask = align_mask.to(self.device)
            #print (align_ids_a, align_ids_b, align_mask)
            src_bert = self.trainer.get_indexed_bert_from_bert(
                input_embs_a,
                align_ids_a,
                align_mask,
                bert_layer=self.args.bert_layer)
            tgt_bert = self.trainer.get_indexed_bert_from_bert(
                input_embs_b,
                align_ids_b,
                align_mask,
                bert_layer=self.args.bert_layer)
            avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                src_bert, tgt_bert, eval_only=True)
            avg_cos_sim_0 = avg_cos_sim.cpu().detach().numpy()
            loss_0 = loss.cpu().detach().numpy()
            self.logger.info(
                "Before mapping: avg cos sim:{:.6f}, avg l2 distance:{:.6f}".
                format(avg_cos_sim_0, loss_0))

            self.trainer.procrustes(src_bert, tgt_bert)
            self.trainer.save_model(self.args.model_path + '/best_mapping.pkl')

            mapped_src_bert = self.trainer.get_indexed_mapped_bert_from_bert(
                input_embs_a,
                input_mask_a,
                align_ids_a,
                align_mask,
                bert_layer=self.args.bert_layer)
            avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                mapped_src_bert, tgt_bert, eval_only=True)
            avg_cos_sim_1 = avg_cos_sim.cpu().detach().numpy()
            loss_1 = loss.cpu().detach().numpy()
            self.logger.info(
                "After mapping: avg cos sim:{:.6f}, avg l2 distance:{:.6f}".
                format(avg_cos_sim_1, loss_1))

    def eval(self):
        """
        """
        if self.args.load_pred_bert:
            assert self.args.bert_file0 is not None
            assert self.args.bert_file1 is not None
            self.dataset, unique_id_to_feature, self.features = load_from_bert(
                self.args.vocab_file,
                self.args.bert_file0,
                self.args.bert_file1,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                n_max_sent=self.args.n_max_sent,
                vocab_file1=self.args.vocab_file1,
                align_file=self.args.align_file)

        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            bert_model1=self.bert_model1,
            trans_types=self.transformer_types)
        self.trainer.load_best()
        self.trainer.mapping.eval()

        sampler = SequentialSampler(self.dataset)
        train_loader = DataLoader(self.dataset,
                                  sampler=sampler,
                                  batch_size=self.args.batch_size)

        n_inst = 0
        n_batch = 0
        to_log = {"avg_cos_sim": 0, "loss": 0}
        self.trainer.args.loss = 'l2_dist'

        for input_embs_a, input_mask_a, input_embs_b, input_mask_b, align_ids_a, align_ids_b, align_mask, example_indices in train_loader:
            n_batch += 1
            with torch.no_grad():
                input_embs_a = input_embs_a.to(self.device)
                input_mask_a = input_mask_a.to(self.device)
                input_embs_b = input_embs_b.to(self.device)
            align_ids_a = align_ids_a.to(self.device)
            align_ids_b = align_ids_b.to(self.device)
            align_mask = align_mask.to(self.device)
            #print (align_ids_a, align_ids_b, align_mask)
            src_bert = self.trainer.get_indexed_mapped_bert_from_bert(
                input_embs_a,
                input_mask_a,
                align_ids_a,
                align_mask,
                bert_layer=self.args.bert_layer)
            tgt_bert = self.trainer.get_indexed_bert_from_bert(
                input_embs_b,
                align_ids_b,
                align_mask,
                bert_layer=self.args.bert_layer)
            avg_cos_sim, loss = self.trainer.supervised_mapping_step(
                src_bert, tgt_bert, eval_only=True)
            n_inst += src_bert.size()[0]
            cos_sim = avg_cos_sim.cpu().detach().numpy()
            loss_ = loss.cpu().detach().numpy()
            to_log["avg_cos_sim"] += cos_sim
            to_log["loss"] += loss_

        to_log["avg_cos_sim"] /= n_batch
        to_log["loss"] /= n_batch
        print(
            "avg cos sim:{:.6f}, avg l2 distance:{:.6f}, instances:{}".format(
                to_log["avg_cos_sim"], to_log["loss"], n_inst))

    def list2bert(self, sents):
        """
        Map BERT representations of the given sentences from the source
        language into the target space and write them to the output file.
        """
        assert self.args.output_file is not None

        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            trans_types=self.transformer_types)
        self.trainer.load_best()

        if self.args.load_pred_bert:
            assert self.args.bert_file0 is not None
            pred_dataset, unique_id_to_feature, features = load_from_single_bert(
                self.args.bert_file0,
                sents,
                max_seq_length=self.args.max_seq_length)
        else:
            pred_dataset, unique_id_to_feature, features = convert(
                self.args.vocab_file,
                sents,
                batch_size=self.args.batch_size,
                do_lower_case=self.args.do_lower_case,
                max_seq_length=self.args.max_seq_length,
                local_rank=self.args.local_rank)
            self.bert_model.eval()
        pred_sampler = SequentialSampler(pred_dataset)
        pred_dataloader = DataLoader(pred_dataset,
                                     sampler=pred_sampler,
                                     batch_size=self.args.batch_size)

        self.trainer.mapping.eval()
        with open(self.args.output_file, "w", encoding='utf-8') as writer:
            if self.args.load_pred_bert:
                for input_embs, input_mask, example_indices in pred_dataloader:
                    input_embs = input_embs.to(self.device)
                    input_mask = input_mask.to(self.device)

                    src_encoder_layer = input_embs
                    if self.args.map_type in self.transformer_types:
                        target_layer = self.trainer.mapping(
                            src_encoder_layer, input_mask)
                    elif self.args.map_type == 'fine_tune':
                        target_layer = src_encoder_layer
                    else:
                        target_layer = self.trainer.mapping(src_encoder_layer)

                    for b, example_index in enumerate(example_indices):
                        feature = features[example_index.item()]
                        unique_id = int(feature.unique_id)
                        # feature = unique_id_to_feature[unique_id]
                        output_json = OrderedDict()
                        output_json["linex_index"] = unique_id
                        all_out_features = []
                        layer_output = target_layer[b].detach().cpu().numpy()
                        for (i, token) in enumerate(feature.tokens):
                            all_layers = []
                            layers = OrderedDict()
                            layers["index"] = self.args.bert_layer
                            layers["values"] = [
                                round(x.item(), 6) for x in layer_output[i]
                            ]
                            all_layers.append(layers)
                            out_features = OrderedDict()
                            out_features["token"] = token
                            out_features["layers"] = all_layers
                            all_out_features.append(out_features)
                        output_json["features"] = all_out_features
                        writer.write(json.dumps(output_json) + "\n")
            else:
                for input_ids, input_mask, example_indices in pred_dataloader:
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)

                    if self.args.map_input:
                        all_encoder_layers, _ = self.bert_model(
                            input_ids,
                            token_type_ids=None,
                            attention_mask=input_mask,
                            input_mapping=self.trainer.mapping)
                        target_layer = all_encoder_layers[self.args.bert_layer]
                    else:
                        all_encoder_layers, _ = self.bert_model(
                            input_ids,
                            token_type_ids=None,
                            attention_mask=input_mask)
                        src_encoder_layer = all_encoder_layers[
                            self.args.bert_layer]
                        if self.args.map_type in self.transformer_types:
                            target_layer = self.trainer.mapping(
                                src_encoder_layer, input_mask)
                        elif self.args.map_type == 'fine_tune':
                            target_layer = src_encoder_layer
                        else:
                            target_layer = self.trainer.mapping(
                                src_encoder_layer)

                    for b, example_index in enumerate(example_indices):
                        feature = features[example_index.item()]
                        unique_id = int(feature.unique_id)
                        # feature = unique_id_to_feature[unique_id]
                        output_json = OrderedDict()
                        output_json["linex_index"] = unique_id
                        all_out_features = []
                        layer_output = target_layer[b].detach().cpu().numpy()
                        for (i, token) in enumerate(feature.tokens):
                            all_layers = []
                            layers = OrderedDict()
                            layers["index"] = self.args.bert_layer
                            layers["values"] = [
                                round(x.item(), 6) for x in layer_output[i]
                            ]
                            all_layers.append(layers)
                            out_features = OrderedDict()
                            out_features["token"] = token
                            out_features["layers"] = all_layers
                            all_out_features.append(out_features)
                        output_json["features"] = all_out_features
                        writer.write(json.dumps(output_json) + "\n")

    def transform(self):
        """
        Map BERT representations of the source language into the target
        space and write them to the output file in JSON-lines format.
        """
        assert self.args.output_file is not None

        self.trainer = SupervisedBertTrainer(
            self.bert_model,
            self.mapping,
            self.args,
            trans_types=self.transformer_types)
        self.trainer.load_best()

        assert self.args.bert_file0 is not None
        pred_dataset, unique_id_to_feature, features = load_from_single_bert(
            self.args.bert_file0, max_seq_length=self.args.max_seq_length)
        pred_sampler = SequentialSampler(pred_dataset)
        pred_dataloader = DataLoader(pred_dataset,
                                     sampler=pred_sampler,
                                     batch_size=self.args.batch_size)

        self.trainer.mapping.eval()
        with open(self.args.output_file, "w", encoding='utf-8') as writer:
            for input_embs, input_mask, example_indices in pred_dataloader:
                input_embs = input_embs.to(self.device)
                input_mask = input_mask.to(self.device)

                src_encoder_layer = input_embs
                if self.args.map_type in self.transformer_types:
                    target_layer = self.trainer.mapping(
                        src_encoder_layer, input_mask)
                elif self.args.map_type == 'fine_tune':
                    target_layer = src_encoder_layer
                else:
                    target_layer = self.trainer.mapping(src_encoder_layer)

                for b, example_index in enumerate(example_indices):
                    feature = features[example_index.item()]
                    unique_id = int(feature.unique_id)
                    # feature = unique_id_to_feature[unique_id]
                    output_json = OrderedDict()
                    output_json["linex_index"] = unique_id
                    all_out_features = []
                    layer_output = target_layer[b].detach().cpu().numpy()
                    for (i, token) in enumerate(feature.tokens):
                        all_layers = []
                        layers = OrderedDict()
                        layers["index"] = self.args.bert_layer
                        layers["values"] = [
                            round(x.item(), 6) for x in layer_output[i]
                        ]
                        all_layers.append(layers)
                        out_features = OrderedDict()
                        out_features["token"] = token
                        out_features["layers"] = all_layers
                        all_out_features.append(out_features)
                    output_json["features"] = all_out_features
                    writer.write(json.dumps(output_json) + "\n")
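
Examples #1 through #4 are all methods of the SupervisedBert wrapper shown in full in Example #5. A hypothetical driver is sketched below; the real argument parser is not part of this listing, so the fields only mirror the attributes the class reads (args.model_path, args.load_pred_bert, args.bert_file0, and so on) and every value is a placeholder.

from types import SimpleNamespace

# Placeholder configuration; assumes SupervisedBert and its dependencies
# (build_model, load_from_bert, SupervisedBertTrainer, ...) are importable
# from the repository.
args = SimpleNamespace(
    pred=False, eval=False, no_cuda=False, local_rank=-1,
    model_path="models/src-tgt", load_pred_bert=True,
    vocab_file="vocab.src.txt", vocab_file1="vocab.tgt.txt",
    bert_file0="bert.src.jsonl", bert_file1="bert.tgt.jsonl",
    align_file="train.align", align_punc=False, align_policy=None,
    input_file=None, output_file=None, do_lower_case=True,
    batch_size=32, max_seq_length=128, n_max_sent=None, bert_layer=-1,
    map_type="linear", n_epochs=20, decay_step=5, save_all=False,
    save_sim=False, quit_after_n_epochs_without_improvement=5,
    map_input=False, loss="l2_dist",
)

mapper = SupervisedBert(args)
mapper.train()  # learn the mapping on aligned token pairs
mapper.eval()   # report avg cos sim / L2 distance with the best mapping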