Example #1
0
    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        rand_scores, nmi_scores, ami_scores = [], [], []
        model_device = model.device
        if not self.use_model_device:
            model.cpu()
        for i in trange(len(self.passages), desc="Evaluating on val", smoothing=0.05):
            passages_to_cluster = [self.passages[i][p] for p in range(len(self.passages[i])) if len(self.passages[i][p]) > 0]
            true_label = self.labels[i][:len(passages_to_cluster)]
            doc_features = model.tokenize(passages_to_cluster)
            if self.use_model_device:
                batch_to_device(doc_features, model_device)
            doc_embeddings = model(doc_features)['sentence_embedding']
            embeddings_dist_mat = self.euclid_dist(doc_embeddings)
            cl = AgglomerativeClustering(n_clusters=torch.unique(true_label).numel(), affinity='precomputed', linkage='average')
            cluster_label = cl.fit_predict(embeddings_dist_mat.detach().cpu().numpy())
            rand_scores.append(adjusted_rand_score(true_label.numpy(), cluster_label))
            nmi_scores.append(normalized_mutual_info_score(true_label.numpy(), cluster_label))
            ami_scores.append(adjusted_mutual_info_score(true_label.numpy(), cluster_label))
        mean_rand = np.mean(np.array(rand_scores))
        mean_nmi = np.mean(np.array(nmi_scores))
        mean_ami = np.mean(np.array(ami_scores))
        print("\nRAND: %.5f, NMI: %.5f, AMI: %.5f\n" % (mean_rand, mean_nmi, mean_ami), flush=True)
        if not self.use_model_device:
            model.to(model_device)
        return mean_rand
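
The evaluator above builds a Euclidean distance matrix over passage embeddings, clusters it with average-linkage agglomerative clustering, and scores the result against gold labels. A minimal, self-contained sketch of that clustering-and-scoring step on random data (assuming scikit-learn >= 1.2, where the parameter is `metric=` rather than the older `affinity=`):

import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import (adjusted_rand_score, adjusted_mutual_info_score,
                             normalized_mutual_info_score)

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(20, 8))        # stand-in for passage embeddings
true_label = np.repeat(np.arange(4), 5)      # four gold clusters of five passages

dist_mat = squareform(pdist(embeddings))     # pairwise Euclidean distances
cl = AgglomerativeClustering(n_clusters=len(np.unique(true_label)),
                             metric='precomputed', linkage='average')
cluster_label = cl.fit_predict(dist_mat)

print("RAND: %.5f, NMI: %.5f, AMI: %.5f" % (
    adjusted_rand_score(true_label, cluster_label),
    normalized_mutual_info_score(true_label, cluster_label),
    adjusted_mutual_info_score(true_label, cluster_label)))
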
def evaluate_language_pair(model, pair_name="cmn-eng", batch_size=32):
    lang_1, lang_2 = pair_name.split("-")
    reader_1 = TatoebaReader(TATOEBA_PATH / f"tatoeba.{pair_name}.{lang_1}")
    ds_1 = SentencesDataset(reader_1.get_examples(), model=model)
    loader_1 = DataLoader(ds_1,
                          shuffle=False,
                          batch_size=batch_size,
                          collate_fn=model.smart_batching_collate)

    reader_2 = TatoebaReader(TATOEBA_PATH / f"tatoeba.{pair_name}.{lang_2}")
    ds_2 = SentencesDataset(reader_2.get_examples(), model=model)
    loader_2 = DataLoader(ds_2,
                          shuffle=False,
                          batch_size=batch_size,
                          collate_fn=model.smart_batching_collate)

    model.eval()
    emb_1, emb_2 = [], []
    with torch.no_grad():
        for batch in loader_1:
            emb_1.append(
                model(batch_to_device(batch,
                                      "cuda")[0][0])['sentence_embedding'])
        for batch in loader_2:
            emb_2.append(
                model(batch_to_device(batch,
                                      "cuda")[0][0])['sentence_embedding'])
    emb_1 = torch.cat(emb_1).cpu().numpy()
    emb_2 = torch.cat(emb_2).cpu().numpy()

    idx_1 = faiss.IndexFlatL2(emb_1.shape[1])
    faiss.normalize_L2(emb_1)
    idx_1.add(emb_1)
    idx_2 = faiss.IndexFlatL2(emb_2.shape[1])
    faiss.normalize_L2(emb_2)
    idx_2.add(emb_2)

    results = []
    _, match = idx_2.search(x=emb_1, k=1)
    results.append((lang_1, lang_2, np.sum(match[:,
                                                 0] == np.arange(len(emb_1))),
                    len(emb_1)))
    _, match = idx_1.search(x=emb_2, k=1)
    results.append((lang_2, lang_1, np.sum(match[:,
                                                 0] == np.arange(len(emb_2))),
                    len(emb_2)))
    return results
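
evaluate_language_pair embeds both sides of a Tatoeba pair and counts how often a sentence's nearest neighbour in the other language sits at the same row index. A hedged sketch of just that FAISS step on synthetic vectors (assuming faiss-cpu is installed; inputs must be float32):

import faiss
import numpy as np

rng = np.random.default_rng(0)
emb_1 = rng.normal(size=(100, 64)).astype('float32')
emb_2 = (emb_1 + 0.05 * rng.normal(size=(100, 64))).astype('float32')  # noisy "translations"

faiss.normalize_L2(emb_1)
faiss.normalize_L2(emb_2)
index = faiss.IndexFlatL2(emb_2.shape[1])
index.add(emb_2)

_, match = index.search(emb_1, 1)            # nearest neighbour of every row of emb_1
print("P@1:", np.mean(match[:, 0] == np.arange(len(emb_1))))
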
	def _eval_loss(self, evaluation_loss, eval_dataloader, tb_writer, global_step):
		"""evalution on cos similarity w.r.t STS."""

		eval_dataloader.collate_fn = self.smart_batching_collate
		loss_value = 0
		with torch.no_grad():
			for idx, batch_cur in enumerate(eval_dataloader):
				features, labels = batch_to_device(batch_cur, self.device)
				loss_value += evaluation_loss(features, labels)

		loss_value_per_batch = float(loss_value / (idx + 1))  # idx is zero-based, so idx + 1 batches were seen
		tb_writer.add_scalar("eval/loss", loss_value_per_batch, global_step)
Example #4
0
def do_test(pt_file, model_name, n):
    text = []
    i = 0
    with open(pt_file, 'r', encoding='utf8') as f:
        for l in f:
            text.append(l.split('\t')[1])
            i += 1
            if i >= n:
                break
    psg_word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    psg_pooling_model = models.Pooling(
        psg_word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    psg_dense_model = models.Dense(
        in_features=psg_pooling_model.get_sentence_embedding_dimension(),
        out_features=256,
        activation_function=nn.Tanh())
    psg_model = CustomSentenceTransformer(
        modules=[psg_word_embedding_model, psg_pooling_model, psg_dense_model])
    if torch.cuda.is_available():
        psg_model.to(torch.device('cuda'))
    psg_features = []
    print('Tokenizing')
    for p in text:
        psg_tkn = psg_model.tokenize(p)
        if torch.cuda.is_available():
            batch_to_device(psg_tkn, torch.device('cuda'))
        psg_features.append(psg_tkn)
    psg_embs = []
    print('Embedding')
    for pfet in psg_features:
        psg_emb = psg_model(pfet)['sentence_embedding']
        psg_emb = psg_emb.to(torch.device('cpu'))  # Tensor.to is not in-place; rebind to keep the CPU copy
        psg_embs.append(psg_emb)
    print(psg_embs[:10])
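
do_test above assembles a Transformer encoder, mean pooling, and a 256-dim Dense projection, then embeds passages one at a time. As a point of comparison, a hedged sketch of the same stack run through the stock sentence-transformers API (the model name is an assumption; CustomSentenceTransformer above is project-specific):

from torch import nn
from sentence_transformers import SentenceTransformer, models

word_emb = models.Transformer('bert-base-uncased')   # assumed checkpoint name
pooling = models.Pooling(word_emb.get_word_embedding_dimension(),
                         pooling_mode_mean_tokens=True)
dense = models.Dense(in_features=pooling.get_sentence_embedding_dimension(),
                     out_features=256, activation_function=nn.Tanh())
model = SentenceTransformer(modules=[word_emb, pooling, dense])

# encode() batches, tokenizes, and moves tensors to the model device internally.
embeddings = model.encode(["a first passage", "a second passage"], batch_size=2)
print(embeddings.shape)                               # (2, 256)
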
Example #5
0
    def query_batch_collate_fn(self, batch):
        num_texts = len(batch[0].texts)
        queries = []
        texts = [[] for _ in range(num_texts)]
        labels = []

        for example in batch:
            queries.append(example.q_context)
            for idx, text in enumerate(example.texts):
                texts[idx].append(text)
            labels.append(example.label)

        labels = torch.tensor(labels).to(self.device)

        q_tokenized = self.query_model.tokenize(queries)
        batch_to_device(q_tokenized, self.device)

        psg_features = []
        for idx in range(num_texts):
            p_tokenized = self.psg_model.tokenize(texts[idx])
            batch_to_device(p_tokenized, self.device)
            psg_features.append(p_tokenized)

        return q_tokenized, psg_features, labels
Example #6
0
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        model.eval()
        total = 0
        correct = 0

        if epoch != -1:
            if steps == -1:
                out_txt = " after epoch {}:".format(epoch)
            else:
                out_txt = " in epoch {} after {} steps:".format(epoch, steps)
        else:
            out_txt = ":"

        logging.info("Evaluation on the " + self.name + " dataset" + out_txt)
        self.dataloader.collate_fn = model.smart_batching_collate
        for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")):
            features, label_ids = batch_to_device(batch, self.device)
            with torch.no_grad():
                _, prediction = self.softmax_model(features, labels=None)

            total += prediction.size(0)
            correct += torch.argmax(prediction,
                                    dim=1).eq(label_ids).sum().item()
        accuracy = correct / total

        logging.info("Accuracy: {:.4f} ({}/{})\n".format(
            accuracy, correct, total))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([epoch, steps, accuracy])
            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([epoch, steps, accuracy])

        return accuracy
    def __call__(self, model, output_path, epoch=-1, steps=-1):
        model.eval()
        self.dataloader.collate_fn = model.smart_batching_collate

        embeddings = []
        labels = []
        for step, batch in enumerate(self.dataloader):
            features, batch_labels = batch_to_device(batch, self.device)
            with torch.no_grad():
                emb1 = model(
                    features[0])['sentence_embedding'].to("cpu").numpy()

            labels.extend(batch_labels.to("cpu").numpy())
            embeddings.extend(emb1)

        embeddings = np.asarray(embeddings)
        labels = np.asarray(labels)

        mse = ((embeddings - labels)**2).mean()

        logging.info("MSE evaluation on " + self.name + " dataset")
        mse *= 100

        logging.info("embeddings shape:\t" + str(embeddings.shape))
        logging.info("MSE (*100):\t{:4f}".format(mse))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            output_file_exists = os.path.isfile(csv_path)
            with open(csv_path,
                      mode="a" if output_file_exists else 'w',
                      encoding="utf-8") as f:
                writer = csv.writer(f)
                if not output_file_exists:
                    writer.writerow(self.csv_headers)

                writer.writerow([epoch, steps, mse])

        return -mse  #Return negative score as SentenceTransformers maximizes the performance
    def fit(self,
            train_objectives: Iterable[Tuple[DataLoader, nn.Module]],
            evaluator: SentenceEvaluator = None,
            epochs: int = 1,
            steps_per_epoch=None,
            scheduler: str = 'WarmupLinear',
            warmup_steps: int = 10000,
            optimizer_class: Type[Optimizer] = transformers.AdamW,
            optimizer_params: Dict[str, object] = {
                'lr': 2e-5,
                'eps': 1e-6,
                'correct_bias': False
            },
            weight_decay: float = 0.01,
            evaluation_steps: int = 0,
            output_path: str = None,
            save_best_model: bool = True,
            max_grad_norm: float = 1,
            use_amp: bool = False,
            callback: Callable[[float, int, int], None] = None,
            output_path_ignore_not_empty: bool = False):
        """
        Train the model with the given training objective
        Each training objective is sampled in turn for one batch.
        We sample only as many batches from each objective as there are in the smallest one
        to make sure of equal training with each dataset.
        :param train_objectives: Tuples of (DataLoader, LossFunction). Pass more than one for multi-task learning
        :param evaluator: An evaluator (sentence_transformers.evaluation) evaluates the model performance during training on held-out dev data. It is used to determine the best model that is saved to disc.
        :param epochs: Number of epochs for training
        :param steps_per_epoch: Number of training steps per epoch. If set to None (default), one epoch equals the size of the smallest DataLoader from train_objectives.
        :param scheduler: Learning rate scheduler. Available schedulers: constantlr, warmupconstant, warmuplinear, warmupcosine, warmupcosinewithhardrestarts
        :param warmup_steps: Behavior depends on the scheduler. For WarmupLinear (default), the learning rate is increased from 0 up to the maximal learning rate over this many training steps. Afterwards, the learning rate is decreased linearly back to zero.
        :param optimizer_class: Optimizer
        :param optimizer_params: Optimizer parameters
        :param weight_decay: Weight decay for model parameters
        :param evaluation_steps: If > 0, evaluate the model using evaluator after each number of training steps
        :param output_path: Storage path for the model and evaluation files
        :param save_best_model: If true, the best model (according to evaluator) is stored at output_path
        :param max_grad_norm: Maximum gradient norm used for gradient clipping.
        :param use_amp: Use Automatic Mixed Precision (AMP). Only for Pytorch >= 1.6.0
        :param callback: Callback function that is invoked after each evaluation.
                It must accept the following three parameters in this order:
                `score`, `epoch`, `steps`
        :param output_path_ignore_not_empty: deprecated, no longer used
        """

        if use_amp:
            from torch.cuda.amp import autocast
            scaler = torch.cuda.amp.GradScaler()

        self.to(self._target_device)

        if output_path is not None:
            os.makedirs(output_path, exist_ok=True)

        dataloaders = [dataloader for dataloader, _ in train_objectives]

        # Use smart batching
        for dataloader in dataloaders:
            dataloader.collate_fn = self.smart_batching_collate

        loss_models = [loss for _, loss in train_objectives]
        for loss_model in loss_models:
            loss_model.to(self._target_device)

        self.best_score = -9999999

        if steps_per_epoch is None or steps_per_epoch == 0:
            steps_per_epoch = min(
                [len(dataloader) for dataloader in dataloaders])

        num_train_steps = int(steps_per_epoch * epochs)

        # Prepare optimizers
        optimizers = []
        schedulers = []
        for loss_model in loss_models:
            param_optimizer = list(loss_model.named_parameters())

            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                weight_decay
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

            optimizer = optimizer_class(optimizer_grouped_parameters,
                                        **optimizer_params)
            scheduler_obj = self._get_scheduler(optimizer,
                                                scheduler=scheduler,
                                                warmup_steps=warmup_steps,
                                                t_total=num_train_steps)

            optimizers.append(optimizer)
            schedulers.append(scheduler_obj)

        global_step = 0
        data_iterators = [iter(dataloader) for dataloader in dataloaders]

        num_train_objectives = len(train_objectives)

        skip_scheduler = False
        for epoch in trange(epochs, desc="Epoch"):
            training_steps = 0

            for loss_model in loss_models:
                loss_model.zero_grad()
                loss_model.train()

            for _ in trange(steps_per_epoch, desc="Iteration", smoothing=0.05):
                for train_idx in range(num_train_objectives):
                    loss_model = loss_models[train_idx]
                    optimizer = optimizers[train_idx]
                    scheduler = schedulers[train_idx]
                    data_iterator = data_iterators[train_idx]

                    try:
                        data = next(data_iterator)
                    except StopIteration:
                        #logging.info("Restart data_iterator")
                        data_iterator = iter(dataloaders[train_idx])
                        data_iterators[train_idx] = data_iterator
                        data = next(data_iterator)

                    features, labels = batch_to_device(data,
                                                       self._target_device)

                    if use_amp:
                        with autocast():
                            loss_value = loss_model(features, labels)

                        scale_before_step = scaler.get_scale()
                        scaler.scale(loss_value).backward()
                        scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                                       max_grad_norm)
                        scaler.step(optimizer)
                        scaler.update()

                        skip_scheduler = scaler.get_scale() != scale_before_step
                    else:
                        loss_value = loss_model(features, labels)
                        loss_value.backward()
                        torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                                       max_grad_norm)
                        optimizer.step()

                    optimizer.zero_grad()

                    if not skip_scheduler:
                        scheduler.step()

                training_steps += 1
                global_step += 1

                if evaluation_steps > 0 and training_steps % evaluation_steps == 0:
                    self._eval_during_training(evaluator, output_path,
                                               save_best_model, epoch,
                                               training_steps, callback)
                    for loss_model in loss_models:
                        loss_model.zero_grad()
                        loss_model.train()

            self._eval_during_training(evaluator, output_path, save_best_model,
                                       epoch, -1, callback)
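
The docstring above describes round-robin sampling: each training objective contributes one batch per iteration, and an epoch is capped at the size of the smallest DataLoader. A hedged usage sketch of such a fit() call with two objectives, written against the stock sentence-transformers API that this method mirrors (checkpoint name, toy data, and output path are assumptions):

from torch.utils.data import DataLoader
from sentence_transformers import InputExample, SentenceTransformer, losses

model = SentenceTransformer('distilbert-base-uncased')   # assumed checkpoint

nli_examples = [InputExample(texts=['A man eats.', 'Someone is eating.'], label=0)]
sts_examples = [InputExample(texts=['A man eats.', 'A man is eating.'], label=0.9)]
nli_loader = DataLoader(nli_examples, shuffle=True, batch_size=1)
sts_loader = DataLoader(sts_examples, shuffle=True, batch_size=1)

nli_loss = losses.SoftmaxLoss(
    model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=3)
sts_loss = losses.CosineSimilarityLoss(model)

# One batch is drawn from each objective per training step.
model.fit(train_objectives=[(nli_loader, nli_loss), (sts_loader, sts_loss)],
          epochs=1, warmup_steps=10, output_path='output/multi-task')  # assumed path
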
def train(args, train_dataset, model, train_loss, dev_dataset=None):
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  shuffle=False)

    dev_dataloader = DataLoader(dev_dataset,
                                shuffle=False,
                                batch_size=args.train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    train_objectives = [(train_dataloader, train_loss)]
    epochs = args.epochs
    # evaluation_steps = 1000
    output_path = args.output_dir
    optimizer_class = transformers.AdamW
    optimizer_params = {
        'lr': args.learning_rate,
        'eps': 1e-6,
        'correct_bias': False
    }
    max_grad_norm = 1
    # local_rank = -1
    save_epoch = True

    dataloaders = [dataloader for dataloader, _ in train_objectives]
    # Use smart batching
    for dataloader in dataloaders:
        dataloader.collate_fn = model.smart_batching_collate

    loss_models = [loss for _, loss in train_objectives]
    logging.info('number of models is {} '.format(len(loss_models)))

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for loss_model in loss_models:
        loss_model.to(args.device)

    model.best_score = -9999

    min_batch_size = min([len(dataloader) for dataloader in dataloaders])
    num_train_steps = int(min_batch_size * epochs)
    warmup_steps = math.ceil(
        len(train_dataset) * args.epochs / args.train_batch_size *
        0.1)  # 10% of train data for warm-up
    # Prepare optimizers
    optimizers = []
    schedulers = []
    for loss_model in loss_models:
        param_optimizer = list(loss_model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        t_total = num_train_steps
        if args.local_rank != -1:
            t_total = t_total // args.world_size

        optimizer = optimizer_class(optimizer_grouped_parameters,
                                    **optimizer_params)
        scheduler = model._get_scheduler(optimizer,
                                         scheduler='WarmupLinear',
                                         warmup_steps=warmup_steps,
                                         t_total=t_total)

        optimizers.append(optimizer)
        schedulers.append(scheduler)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        for idx in range(len(loss_models)):
            model2, optimizer2 = amp.initialize(loss_models[idx],
                                                optimizers[idx],
                                                opt_level=args.fp16_opt_level)
            loss_models[idx] = model2
            optimizers[idx] = optimizer2

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        for idx, loss_model in enumerate(loss_models):
            loss_models[idx] = torch.nn.parallel.DistributedDataParallel(
                loss_model,
                device_ids=[args.local_rank],
                output_device=args.local_rank,
                find_unused_parameters=True)
        logger.info('Setting DistributedDataParallel rank: {}'.format(args.local_rank))
    elif args.n_gpu > 1:
        for idx, loss_model in enumerate(loss_models):
            loss_models[idx] = torch.nn.parallel.DataParallel(loss_model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    data_iterators = [iter(dataloader) for dataloader in dataloaders]
    num_train_objectives = len(train_objectives)
    # set_seeds(1,args)
    tr_loss = 0.0
    for epoch in trange(epochs,
                        desc="Epoch",
                        disable=args.local_rank not in [-1, 0]):
        training_steps = 0

        for loss_model in loss_models:
            loss_model.zero_grad()
            loss_model.train()

        for step in trange(num_train_objectives * min_batch_size,
                           desc="Iteration",
                           disable=args.local_rank not in [-1, 0]):
            idx = step % num_train_objectives

            loss_model = loss_models[idx]
            optimizer = optimizers[idx]
            scheduler = schedulers[idx]
            data_iterator = data_iterators[idx]

            try:
                data = next(data_iterator)
            except StopIteration:
                logging.info("Restart data_iterator")
                data_iterator = iter(dataloaders[idx])
                data_iterators[idx] = data_iterator
                data = next(data_iterator)

            features, labels = batch_to_device(data, args.device)
            loss_value = loss_model(features, labels)
            # logger.info("loss size: {} ".format(str(len(loss_value))))
            # logger.info("loss: ", loss_value)

            if args.n_gpu > 1:
                loss_value = loss_value.mean()

            if args.fp16:
                with amp.scale_loss(loss_value, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               max_grad_norm)
            else:
                loss_value.backward()
                torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                               max_grad_norm)

            training_steps += 1
            tr_loss += loss_value.item()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

        if args.evaluation_steps > 0 and training_steps % args.evaluation_steps == 0 and evaluator is not None:
            score = evaluator(model)
            for loss_model in loss_models:
                loss_model.zero_grad()
                loss_model.train()

            logging.info("Dev cosine-Similarity MSE:", score[0], " at step ",
                         global_step)
            logging.info("Dev cosine-Similarity MAE:", score[1], " at step ",
                         global_step)
            if args.local_rank in [-1, 0]:
                model.save(output_path + "_step_" + str(global_step))

        if args.local_rank in [-1, 0] and save_epoch:
            model.save(output_path + "_ep_" + str(epoch))

    return tr_loss / global_step
	def fit(self,
			args,
			train_objectives: Iterable[Tuple[DataLoader, nn.Module]],
			eval_objectives: Iterable[Tuple[DataLoader, nn.Module]],
			train_evaluator: SentenceEvaluator,
			evaluator: SentenceEvaluator,
			train_phase: str = 'STS',
			epochs: int = 1,
			steps_per_epoch = None,
			scheduler_name: str = 'WarmupLinear',
			warmup_steps: int = 10000,
			optimizer_class: Type[Optimizer] = transformers.AdamW,
			optimizer_params : Dict[str, object ]= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False},
			weight_decay: float = 0.01,
			evaluation_steps: int = 0,
			save_best_model: bool = True,
			max_grad_norm: float = 1,
			fp16: bool = False,
			fp16_opt_level: str = 'O1',
			local_rank: int = -1
			):
		
		if train_phase not in ['STS', 'NLI']:
			raise ValueError(f"Invalid train_phase given: {train_phase}")

		self.lr = optimizer_params['lr']
		self.desc_string = f'lr-{self.lr}_epochs-{epochs}_warmup_steps-{warmup_steps}'
		logger.info(f"model description is {self.desc_string}.")
		args.desc_string = self.desc_string

		if args.output_dir is not None:
			# empty folder is not necessary.
			os.makedirs(args.output_dir, exist_ok=True)
			path_prefix = args.output_dir.split('/')[0]
			tb_writer = SummaryWriter(log_dir=os.path.join(path_prefix, self.desc_string))
			tb_writer.add_text('experiment args', self.desc_string, 0)
			
		dataloaders = [dataloader for dataloader, _ in train_objectives]

		# Use smart batching
		for dataloader in dataloaders:
			dataloader.collate_fn = self.smart_batching_collate

		## GX: this design is for the composite loss.
		loss_models = [loss for _, loss in train_objectives]
		eval_dataloader, evaluation_loss = eval_objectives[0]   # the current version
		device = self.device

		for loss_model in loss_models:
			loss_model.to(device)

		self.best_score = -9999999

		if steps_per_epoch is None or steps_per_epoch == 0:
			steps_per_epoch = min([len(dataloader) for dataloader in dataloaders])

		num_train_steps = int(steps_per_epoch * epochs)

		# Prepare optimizer and schedule (linear warmup and decay)
		param_optimizer = list(loss_model.named_parameters())

		no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
		optimizer_grouped_parameters = [
			{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
			{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
		]
		t_total = num_train_steps
		if local_rank != -1:
			t_total = t_total // torch.distributed.get_world_size()

		optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params)
		scheduler = self._get_scheduler(optimizer, scheduler=scheduler_name, warmup_steps=warmup_steps, t_total=t_total)

		# Config
		global_step = 0
		epochs_trained = 0
		steps_trained_in_current_epoch = 0
		data_iterators = [iter(dataloader) for dataloader in dataloaders]

		# Check if continuing training from a checkpoint
		if args.should_continue:
			if self.loading_model_dir and os.path.exists(self.loading_model_dir):
				optimizer.load_state_dict(torch.load(os.path.join(self.loading_model_dir, "optimizer.pt")))
				scheduler.load_state_dict(torch.load(os.path.join(self.loading_model_dir, "scheduler.pt")))
				global_step, epochs_trained, steps_trained_in_current_epoch = self.load(self.loading_model_dir, steps_per_epoch)
			else:
				logger.info("  Starting fine-tuning.")

		# Train !  
		for epoch in trange(epochs_trained, epochs, desc="Epoch"):
			training_steps = 0  # training steps per epoch.
			loss_model.zero_grad()
			loss_model.train()
			data_iterator = data_iterators[0] 
			for _ in trange(steps_per_epoch, desc="Iteration", smoothing=0.05):

				try:
					data = next(data_iterator)
				except StopIteration:
					data_iterator = iter(dataloaders[0])
					data_iterators[0] = data_iterator
					data = next(data_iterator)

				if steps_trained_in_current_epoch > 0:
					steps_trained_in_current_epoch -= 1
					training_steps += 1
					continue

				features, labels = batch_to_device(data, self.device)
				loss_value = loss_model(features, labels)
				tb_writer.add_scalar("progress/lr", scheduler.get_lr()[0], global_step)
				tb_writer.add_scalar("progress/steps_per_epoch", steps_per_epoch, global_step)
				tb_writer.add_scalar("progress/num_train_steps", num_train_steps, global_step)                                
				tb_writer.add_scalar("train/loss_value", loss_value, global_step)                

				if fp16:
					with amp.scale_loss(loss_value, optimizer) as scaled_loss:
						scaled_loss.backward()
					torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
				else:
					loss_value.backward()
					torch.nn.utils.clip_grad_norm_(loss_model.parameters(), max_grad_norm)

				optimizer.step()
				scheduler.step()
				optimizer.zero_grad()

				training_steps += 1
				global_step += 1
				args.global_step = global_step

				if evaluation_steps > 0 and training_steps % evaluation_steps == 0:
					criterion = train_evaluator(self, output_path=args.output_dir, epoch=epoch, steps=global_step) 
					if train_phase == 'NLI':
						tb_writer.add_scalar("train/accuracy", criterion, global_step)
					elif train_phase == 'STS':
						tb_writer.add_scalar("train/score", criterion, global_step)
					current_file_name = 'checkpoint-'+str(global_step)
					rotate_checkpoints(args, checkpoint_prefix="checkpoint")
					self.save(current_file_name, args, optimizer, scheduler)
					self._eval_loss(evaluation_loss, eval_dataloader, tb_writer, global_step)
					if self._eval_during_training_custom(evaluator, args.output_dir, tb_writer, epoch, training_steps, global_step):
						self._save_the_best_checkpoint(args, global_step, optimizer, scheduler)
					loss_model.zero_grad()
					loss_model.train()

			self._eval_loss(evaluation_loss, eval_dataloader, tb_writer, global_step)
			if self._eval_during_training_custom(evaluator, args.output_dir, tb_writer, epoch, -1, global_step):
				self._save_the_best_checkpoint(args, global_step, optimizer, scheduler)

		## write results into pkl files.
		write_result(args, self.best_score)
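
The fit() above can resume from a checkpoint by reloading optimizer and scheduler state saved next to the model (see the should_continue branch). A minimal sketch of that save/restore pattern, using the same file names as above (the directory layout is otherwise an assumption):

import os
import torch

def save_training_state(output_dir, optimizer, scheduler):
    # Stored alongside the model so a later run can pick up where this one stopped.
    os.makedirs(output_dir, exist_ok=True)
    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

def load_training_state(checkpoint_dir, optimizer, scheduler):
    optimizer.load_state_dict(torch.load(os.path.join(checkpoint_dir, "optimizer.pt")))
    scheduler.load_state_dict(torch.load(os.path.join(checkpoint_dir, "scheduler.pt")))
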
Example #11
0
def train(args, train_dataset, model, train_loss):
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    #train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = SampleGenerator(train_dataset,
                                       sample_count=args.train_batch_size)

    train_objectives = [(train_dataloader, train_loss)]
    epochs = args.epochs
    output_path = args.output_dir
    optimizer_class = transformers.AdamW
    optimizer_params = {
        'lr': args.learning_rate,
        'eps': 1e-6,
        'correct_bias': False
    }
    max_grad_norm = 1
    # local_rank = -1
    save_epoch = True

    dataloaders = [dataloader for dataloader, _ in train_objectives]
    # Use smart batching
    for dataloader in dataloaders:
        dataloader.collate_fn = model.smart_batching_collate

    loss_models = [loss for _, loss in train_objectives]
    logging.info('number of models is {} '.format(len(loss_models)))

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for loss_model in loss_models:
        loss_model.to(args.device)

    model.best_score = -9999

    min_batch_size = min([len(dataloader) for dataloader in dataloaders])
    num_train_steps = int(min_batch_size * epochs)
    warmup_steps = math.ceil(
        len(train_dataset) * args.epochs / args.train_batch_size *
        0.1)  # 10% of train data for warm-up
    # Prepare optimizers
    optimizers = []
    schedulers = []
    for loss_model in loss_models:
        param_optimizer = list(loss_model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        t_total = num_train_steps
        if args.local_rank != -1:
            t_total = t_total // args.world_size

        optimizer = optimizer_class(optimizer_grouped_parameters,
                                    **optimizer_params)
        scheduler = model._get_scheduler(optimizer,
                                         scheduler='WarmupLinear',
                                         warmup_steps=warmup_steps,
                                         t_total=t_total)

        optimizers.append(optimizer)
        schedulers.append(scheduler)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", epochs)
    logger.info("  Instantaneous batch size per TPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    data_iterators = [iter(dataloader) for dataloader in dataloaders]
    num_train_objectives = len(train_objectives)
    # set_seeds(1,args)
    tr_loss = 0.0
    for epoch in trange(epochs,
                        desc="Epoch",
                        disable=args.local_rank not in [-1, 0]):
        training_steps = 0

        for loss_model in loss_models:
            loss_model.zero_grad()
            loss_model.train()

        for step in trange(num_train_objectives * min_batch_size,
                           desc="Iteration",
                           disable=args.local_rank not in [-1, 0]):
            idx = step % num_train_objectives

            loss_model = loss_models[idx]
            optimizer = optimizers[idx]
            scheduler = schedulers[idx]
            data_iterator = data_iterators[idx]

            try:
                data = next(data_iterator)
            except StopIteration:
                logging.info("Restart data_iterator")
                data_iterator = iter(dataloaders[idx])
                data_iterators[idx] = data_iterator
                data = next(data_iterator)

            features, labels = batch_to_device(data, args.device)
            loss_value = loss_model(features, labels)

            loss_value.backward()
            torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                           max_grad_norm)

            training_steps += 1
            tr_loss += loss_value.item()

            #optimizer.step()
            xm.optimizer_step(optimizer, barrier=True)
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

        if args.local_rank in [-1, 0] and save_epoch:
            model.save(output_path + "_" + str(epoch))

    return tr_loss / global_step
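
The training loop above calls xm.optimizer_step(optimizer, barrier=True) instead of optimizer.step(), which only works on a TPU runtime. A hedged sketch of the torch_xla pieces it assumes (the imports and device setup are not shown in the snippet):

import torch_xla.core.xla_model as xm

device = xm.xla_device()        # a TPU core; model and loss modules must be moved here
# ... build the model, optimizer, and dataloaders on `device`, then per training step:
# loss.backward()
# xm.optimizer_step(optimizer, barrier=True)   # syncs the XLA graph and applies the update
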
Example #12
0
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        model.eval()
        total = 0
        correct = 0

        if epoch != -1:
            if steps == -1:
                out_txt = " after epoch {}:".format(epoch)
            else:
                out_txt = " in epoch {} after {} steps:".format(epoch, steps)
        else:
            out_txt = ":"

        logging.info("Evaluation on the " + self.name + " dataset" + out_txt)
        self.dataloader.collate_fn = model.smart_batching_collate

        pre_results = torch.tensor([], dtype=torch.int64).to(self.device)

        prf = torch.tensor([], dtype=torch.int64).to(self.device)
        labels = torch.tensor([], dtype=torch.int64).to(self.device)
        for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")):
            features, label_ids = batch_to_device(batch, self.device)
            with torch.no_grad():
                _, prediction = self.softmax_model(features, labels=None)

            total += prediction.size(0)
            pre = torch.argmax(prediction, dim=1)
            correct += pre.eq(label_ids).sum().item()
            #prf = torch.cat((prf, pre))
            #labels = torch.cat((labels, label_ids))

            if self.label_text:
                # Keep the predictions so the label_text export below can use them.
                pre_results = torch.cat((pre_results, pre), 0)

        #prf = prf.view(-1)
        #labels = labels.view(-1)

        #p = metrics.precision_score(labels.cpu(), prf.cpu(), average=None)
        #r = metrics.recall_score(labels.cpu(), prf.cpu(), average=None)
        #f = metrics.f1_score(labels.cpu(), prf.cpu(), average=None)

        #target_names = ['class 0', 'class 1', 'class 2', 'class 3', 'class 4', 'class 5']
        #prf_result = metrics.classification_report(labels.cpu(), prf.cpu(), target_names=target_names)

        #print(p,r,f)
        #print(prf_result)

        accuracy = correct / total

        logging.info("Accuracy: {:.4f} ({}/{})\n".format(
            accuracy, correct, total))
        print("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([epoch, steps, accuracy])
            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([epoch, steps, accuracy])

        if self.label_text:
            pre_results = pre_results.cpu().numpy().tolist()
            for i in range(len(self.label_text)):
                self.label_text[i].insert(1, pre_results[i])

            if output_path is not None:
                csv_path = os.path.join(output_path, self.csv_label_text)
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.label_text_headers)
                    for element in self.label_text:
                        writer.writerow(element)

        return accuracy
    def fit(self,
            train_objectives: Iterable[Tuple[DataLoader, nn.Module]],
            evaluator: SentenceEvaluator,
            epochs: int = 1,
            eval_dataloader=None,
            steps_per_epoch=None,
            scheduler: str = 'WarmupLinear',
            warmup_steps: int = 10000,
            optimizer_class: Type[Optimizer] = transformers.AdamW,
            optimizer_params: Dict[str, object] = {
                'lr': 2e-5,
                'eps': 1e-6,
                'correct_bias': False
            },
            weight_decay: float = 0.01,
            evaluation_steps: int = 0,
            output_path: str = None,
            save_best_model: bool = True,
            max_grad_norm: float = 1,
            fp16: bool = False,
            fp16_opt_level: str = 'O1',
            local_rank: int = -1):

        self.lr = optimizer_params['lr']
        self.desc_string = f'lr-{self.lr}_epochs-{epochs}_warmup_steps-{warmup_steps}'
        print(f"model description is {self.desc_string}.")

        if output_path is not None:
            # empty folder is not necessary.
            os.makedirs(output_path, exist_ok=True)
            path_prefix = output_path.split('/')[0]
            tb_writer = SummaryWriter(
                log_dir=os.path.join(path_prefix, self.desc_string))
            tb_writer.add_text('experiment args', self.desc_string, 0)

        dataloaders = [dataloader for dataloader, _ in train_objectives]

        # Use smart batching
        for dataloader in dataloaders:
            dataloader.collate_fn = self.smart_batching_collate

        ## GX: this design is for the composite loss.
        loss_models = [loss for _, loss in train_objectives]
        device = self.device

        for loss_model in loss_models:
            loss_model.to(device)

        self.best_score = -9999999

        if steps_per_epoch is None or steps_per_epoch == 0:
            steps_per_epoch = min(
                [len(dataloader) for dataloader in dataloaders])

        num_train_steps = int(steps_per_epoch * epochs)

        # Prepare optimizers w.r.t each model.
        optimizers = []
        schedulers = []
        for loss_model in loss_models:
            param_optimizer = list(loss_model.named_parameters())

            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                weight_decay
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]
            t_total = num_train_steps
            if local_rank != -1:
                t_total = t_total // torch.distributed.get_world_size()

            optimizer = optimizer_class(optimizer_grouped_parameters,
                                        **optimizer_params)
            scheduler_obj = self._get_scheduler(optimizer,
                                                scheduler=scheduler,
                                                warmup_steps=warmup_steps,
                                                t_total=t_total)

            optimizers.append(optimizer)
            schedulers.append(scheduler_obj)

        # Decides the data-type here.
        if fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )

            for train_idx in range(len(loss_models)):
                model, optimizer = amp.initialize(loss_models[train_idx],
                                                  optimizers[train_idx],
                                                  opt_level=fp16_opt_level)
                loss_models[train_idx] = model
                optimizers[train_idx] = optimizer

        global_step = 0
        ## GX: only use iter, instead of for loop on the dataloader.
        data_iterators = [iter(dataloader) for dataloader in dataloaders]
        num_train_objectives = len(train_objectives)

        for epoch in trange(epochs, desc="Epoch"):
            training_steps = 0

            for loss_model in loss_models:
                loss_model.zero_grad()
                loss_model.train()

            for _ in trange(steps_per_epoch, desc="Iteration", smoothing=0.05):
                for train_idx in range(num_train_objectives):
                    loss_model = loss_models[train_idx]
                    optimizer = optimizers[train_idx]
                    scheduler = schedulers[train_idx]
                    data_iterator = data_iterators[train_idx]

                    try:
                        data = next(data_iterator)
                    except StopIteration:
                        data_iterator = iter(dataloaders[train_idx])
                        data_iterators[train_idx] = data_iterator
                        data = next(data_iterator)

                    features, labels = batch_to_device(data, self.device)
                    loss_value = loss_model(features, labels)
                    tb_writer.add_scalar("progress/lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("progress/steps_per_epoch",
                                         steps_per_epoch, global_step)
                    tb_writer.add_scalar("progress/num_train_steps",
                                         num_train_steps, global_step)
                    tb_writer.add_scalar("train/loss_value", loss_value,
                                         global_step)

                    if fp16:
                        with amp.scale_loss(loss_value,
                                            optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                    else:
                        loss_value.backward()
                        torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                                       max_grad_norm)

                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

                training_steps += 1
                global_step += 1

                if evaluation_steps > 0 and training_steps % evaluation_steps == 0:
                    self._eval_during_training_custom(evaluator, output_path,
                                                      tb_writer,
                                                      save_best_model, epoch,
                                                      training_steps,
                                                      global_step)
                    for loss_model in loss_models:
                        loss_model.zero_grad()
                        loss_model.train()

            self._eval_during_training_custom(evaluator, output_path,
                                              tb_writer, save_best_model,
                                              epoch, -1, global_step)
    def __call__(self,
                 model,
                 output_path: str = None,
                 epoch: int = -1,
                 steps: int = -1) -> float:
        model.eval()
        embeddings1 = []
        embeddings2 = []
        labels = []

        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        logging.info("Evaluation the model on " + self.name + " dataset" +
                     out_txt)
        self.dataloader.collate_fn = model.smart_batching_collate
        for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")):
            features, label_ids = batch_to_device(batch, self.device)
            with torch.no_grad():
                emb1, emb2 = [
                    model(sent_features)['sentence_embedding'].to(
                        "cpu").numpy() for sent_features in features
                ]

            labels.extend(label_ids.to("cpu").numpy())
            embeddings1.extend(emb1)
            embeddings2.extend(emb2)
        cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
        manhattan_distances = paired_manhattan_distances(
            embeddings1, embeddings2)
        euclidean_distances = paired_euclidean_distances(
            embeddings1, embeddings2)

        # Ensure labels are just 0 or 1
        for label in labels:
            assert (label == 0 or label == 1)

        labels = np.asarray(labels)
        cosine_acc, cosine_threshold = self.find_best_acc_and_threshold(
            cosine_scores, labels, True)
        manhattan_acc, manhattan_threshold = self.find_best_acc_and_threshold(
            manhattan_distances, labels, False)
        euclidean_acc, euclidean_threshold = self.find_best_acc_and_threshold(
            euclidean_distances, labels, False)

        logging.info(
            "Accuracy with Cosine-Similarity:\t{:.2f}\t(Threshold: {:.4f})".
            format(cosine_acc * 100, cosine_threshold))
        logging.info(
            "Accuracy with Manhattan-Distance:\t{:.2f}\t(Threshold: {:.4f})".
            format(manhattan_acc * 100, manhattan_threshold))
        logging.info(
            "Accuracy with Euclidean-Distance:\t{:.2f}\t(Threshold: {:.4f})\n".
            format(euclidean_acc * 100, euclidean_threshold))

        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                with open(csv_path, mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([
                        epoch, steps, cosine_acc, euclidean_acc, manhattan_acc
                    ])
            else:
                with open(csv_path, mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([
                        epoch, steps, cosine_acc, euclidean_acc, manhattan_acc
                    ])

        if self.main_similarity == SimilarityFunction.COSINE:
            return cosine_acc
        elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
            return euclidean_acc
        elif self.main_similarity == SimilarityFunction.MANHATTAN:
            return manhattan_acc
        else:
            raise ValueError("Unknown main_similarity value")