from sentence_transformers import losses


def get_loss(loss_type, model):
    """Return a sentence-transformers loss object for the given loss name."""
    loss_classes = {
        'BatchAllTripletLoss': losses.BatchAllTripletLoss,
        'BatchHardSoftMarginTripletLoss': losses.BatchHardSoftMarginTripletLoss,
        'BatchHardTripletLoss': losses.BatchHardTripletLoss,
        'BatchSemiHardTripletLoss': losses.BatchSemiHardTripletLoss,
        'ContrastiveLoss': losses.ContrastiveLoss,
        'CosineSimilarityLoss': losses.CosineSimilarityLoss,
        'MegaBatchMarginLoss': losses.MegaBatchMarginLoss,
        'MultipleNegativesRankingLoss': losses.MultipleNegativesRankingLoss,
        'OnlineContrastiveLoss': losses.OnlineContrastiveLoss,
    }
    if loss_type not in loss_classes:
        raise ValueError('Invalid loss type: {}'.format(loss_type))
    return loss_classes[loss_type](model=model)
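# Usage sketch (illustrative, not from the original source): resolve a loss by
# name, e.g. from a config file. The checkpoint name below is an assumption;
# any SentenceTransformer checkpoint works.
from sentence_transformers import SentenceTransformer

example_model = SentenceTransformer('all-MiniLM-L6-v2')
example_loss = get_loss('MultipleNegativesRankingLoss', example_model)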
# Create data loader and loss for MultipleNegativesRankingLoss
train_dataset_MultipleNegativesRankingLoss = SentencesDataset(
    train_samples_MultipleNegativesRankingLoss, model=model)
train_dataloader_MultipleNegativesRankingLoss = DataLoader(
    train_dataset_MultipleNegativesRankingLoss,
    shuffle=True,
    batch_size=train_batch_size)
train_loss_MultipleNegativesRankingLoss = losses.MultipleNegativesRankingLoss(model)

# Create data loader and loss for OnlineContrastiveLoss
train_dataset_ConstrativeLoss = SentencesDataset(
    train_samples_ConstrativeLoss, model=model)
train_dataloader_ConstrativeLoss = DataLoader(
    train_dataset_ConstrativeLoss, shuffle=True, batch_size=train_batch_size)
train_loss_ConstrativeLoss = losses.OnlineContrastiveLoss(
    model=model, distance_metric=distance_metric, margin=margin)

################### Development Evaluators ##################
# We add three evaluators that evaluate the model on Duplicate Questions pair
# classification, Duplicate Questions mining, and Duplicate Questions
# information retrieval.
evaluators = []

###### Classification ######
# Given (question1, question2), is this a duplicate or not?
# The evaluator computes the embeddings for both questions and then a cosine
# similarity. If the similarity is above a threshold, we have a duplicate.
dev_sentences1 = []
dev_sentences2 = []
dev_labels = []
with open(os.path.join(dataset_path, "classification/dev_pairs.tsv"),
          encoding='utf8') as fIn:
gold = qrel[topic["number"]].items()
query = topic["title"].strip()
for item in gold:
    try:
        doc = db.lookup_docno(item[0])
        examples.append(InputExample(texts=[query, doc], label=item[1]))
    except Exception:
        # Skip qrel entries whose document cannot be found in the database.
        continue
print("finished", len(examples))

#%%
from torch.utils.data import DataLoader

train_dataset = SentencesDataset(examples, ranker)
train_dl = DataLoader(train_dataset, shuffle=True, batch_size=16)
train_loss = losses.OnlineContrastiveLoss(model=ranker)

# Train with the (dataloader, loss) objective and keep the best checkpoint.
ranker.fit(train_objectives=[(train_dl, train_loss)],
           epochs=20,
           output_path="ranker/contrastive_loss/",
           save_best_model=True)

with open("ranker/contrastive_loss/ranker_contrastive_loss_20_epochs.pkl", "wb") as fOut:
    pickle.dump(ranker, fOut)

from tqdm.notebook import tqdm

run = {}
for topic in tqdm(topics):
def train(self, train_df, eval_df):
    """
    :param train_df: dataframe with columns 'text_a', 'text_b', 'labels'
    :param eval_df: dataframe with columns 'text_a', 'text_b', 'labels'
    :return: the classification threshold selected on the evaluation data
    """
    # Format training data
    if "text_a" in train_df.columns and "text_b" in train_df.columns and "labels" in train_df.columns:
        if self.args.do_lower_case:
            train_df.loc[:, 'text_a'] = train_df['text_a'].str.lower()
            train_df.loc[:, 'text_b'] = train_df['text_b'].str.lower()
        train_examples = [
            InputExample(str(i), texts=[text_a, text_b], label=label)
            for i, (text_a, text_b, label) in enumerate(
                zip(
                    train_df["text_a"].astype(str),
                    train_df["text_b"].astype(str),
                    train_df["labels"].astype(int),
                ))
        ]
    else:
        raise KeyError('Training data processing - Required columns not found!')

    # Format evaluation data
    if "text_a" in eval_df.columns and "text_b" in eval_df.columns and "labels" in eval_df.columns:
        if self.args.do_lower_case:
            eval_df.loc[:, 'text_a'] = eval_df['text_a'].str.lower()
            eval_df.loc[:, 'text_b'] = eval_df['text_b'].str.lower()
        evaluator = evaluation.BinaryClassificationEvaluator(
            list(eval_df["text_a"]),
            list(eval_df["text_b"]),
            list(eval_df["labels"].astype(int)),
            batch_size=self.args.eval_batch_size)
    else:
        raise KeyError('Evaluation data processing - Required columns not found!')

    # Define the train dataloader and the train loss
    train_dataloader = DataLoader(train_examples,
                                  shuffle=True,
                                  batch_size=self.args.train_batch_size)
    if self.args.loss_func == 'MultipleNegativesRankingLoss':
        train_loss = losses.MultipleNegativesRankingLoss(self.model)
    else:
        distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
        train_loss = losses.OnlineContrastiveLoss(
            model=self.model,
            distance_metric=distance_metric,
            margin=self.args.margin)

    # Tune the model
    self.model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=self.args.num_train_epochs,
        warmup_steps=self.args.warmup_steps,
        optimizer_params={'lr': self.args.learning_rate},
        weight_decay=self.args.weight_decay,
        evaluator=evaluator,
        evaluation_steps=self.args.evaluate_during_training_steps,
        max_grad_norm=self.args.max_grad_norm,
        output_path=self.args.best_model_dir,
        show_progress_bar=self.args.show_progress_bar)

    # Pick the threshold that maximizes the score on the evaluation data
    evaluation_file = os.path.join(self.args.best_model_dir, evaluator.csv_file)
    eval_results_df = pd.read_csv(evaluation_file)
    eval_results_df.sort_values(self.score_type,
                                inplace=True,
                                ascending=False,
                                ignore_index=True)
    self.threshold = eval_results_df.loc[0, self.threshold_type]
    print(f'Set model threshold to {self.threshold}, '
          f'achieving a {self.score_type} of {eval_results_df.loc[0, self.score_type]}')
    return self.threshold
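# Usage sketch (illustrative; `SBERTMatcher` and its constructor are
# hypothetical stand-ins for the class that owns train() above).
import pandas as pd

example_train_df = pd.DataFrame({
    'text_a': ['How do I reset my password?', 'Where is my order?'],
    'text_b': ['Steps to reset a password', 'The capital of France is Paris'],
    'labels': [1, 0],
})
example_eval_df = example_train_df.copy()
# matcher = SBERTMatcher(args)  # hypothetical constructor
# threshold = matcher.train(example_train_df, example_eval_df)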
def main():
    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--task", default=None, type=str, required=True,
                        help="the task to run the BERT ranker for")
    parser.add_argument("--data_folder", default=None, type=str, required=True,
                        help="the folder containing the data")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="the folder to output predictions")
    parser.add_argument("--negative_sampler", default="random", type=str, required=False,
                        help="negative sampling procedure to use ['random', 'bm25', 'sentence_transformer']")
    parser.add_argument("--anserini_folder", default="", type=str, required=True,
                        help="path containing the Anserini bin <anserini_folder>/target/appassembler/bin/IndexCollection")
    parser.add_argument("--sentence_bert_ns_model", default="all-MiniLM-L6-v2", type=str, required=False,
                        help="model to use for sentenceBERT negative sampling")
    parser.add_argument('--denoise_negatives', dest='denoise_negatives', action='store_true')
    parser.add_argument('--no-denoise_negatives', dest='denoise_negatives', action='store_false')
    parser.set_defaults(denoise_negatives=False)
    parser.add_argument("--num_ns_for_denoising", default=100, type=int, required=False,
                        help="Only used with --denoise_negatives: total number of samples to retrieve; the bottom 10 are kept.")
    parser.add_argument("--generative_sampling_model", default="all-MiniLM-L6-v2", type=str, required=False,
                        help="model to use for generating negative samples on the fly")
    parser.add_argument('--remove_cand_subsets', dest='remove_cand_subsets', action='store_true')
    parser.add_argument('--dont_remove_cand_subsets', dest='remove_cand_subsets', action='store_false')
    parser.set_defaults(remove_cand_subsets=True)

    # Which part of the context we use to sample negatives.
    parser.add_argument('--last_utterance_only', dest='last_utterance_only', action='store_true')
    parser.add_argument('--all_utterances', dest='last_utterance_only', action='store_false')
    parser.set_defaults(last_utterance_only=False)

    # External corpus to augment negative sampling
    parser.add_argument('--external_corpus', dest='use_external_corpus', action='store_true')
    parser.add_argument('--dont_use_external_corpus', dest='use_external_corpus', action='store_false')
    parser.set_defaults(use_external_corpus=False)

    # Training procedure
    parser.add_argument("--num_epochs", default=3, type=int, required=False,
                        help="Number of epochs for training.")
    parser.add_argument("--train_batch_size", default=8, type=int, required=False,
                        help="Training batch size.")

    # Model hyperparameters
    parser.add_argument("--transformer_model", default="bert-base-cased", type=str, required=False,
                        help="BERT model to use (default = bert-base-cased).")
    parser.add_argument("--loss", default='MultipleNegativesRankingLoss', type=str, required=False,
                        help="Loss function to use ['MultipleNegativesRankingLoss', 'TripletLoss', "
                             "'MarginMSELoss', 'ContrastiveLoss', 'OnlineContrastiveLoss']")

    # Wandb project name
    parser.add_argument("--wandb_project", default='train_sentence_transformer', type=str, required=False,
                        help="name of the wandb project")
    parser.add_argument("--seed", default=42, type=int, required=False,
                        help="Random seed.")
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    max_seq_length = 300
    if args.transformer_model in ('all-mpnet-base-v2', 'msmarco-bert-base-dot-v5'):
        model = SentenceTransformer(args.transformer_model)
        model.max_seq_length = max_seq_length
    else:
        word_embedding_model = models.Transformer(args.transformer_model, max_seq_length=max_seq_length)
        tokens = ['[UTTERANCE_SEP]', '[TURN_SEP]', '[AUG]']
        word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
        word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=False,
                                       pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    eval_only = False
    if eval_only:
        logging.info("Skipping training (eval_only=True)")
    else:
        logging.info("Creating train CRR dataset for {} using {}.".format(args.task, args.negative_sampler))
        crr_reader = CRRBenchmarkDataReader('{}/{}'.format(args.data_folder, args.task))

        train_data = crr_reader.get_examples("train.tsv",
                                             args.negative_sampler,
                                             args.anserini_folder,
                                             args.sentence_bert_ns_model,
                                             args.loss,
                                             args.output_dir,
                                             True, False,
                                             args.denoise_negatives,
                                             args.num_ns_for_denoising,
                                             args.generative_sampling_model,
                                             args.remove_cand_subsets,
                                             args.last_utterance_only,
                                             args.use_external_corpus)
        train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.train_batch_size)

        if args.loss == 'MultipleNegativesRankingLoss':
            train_loss = losses.MultipleNegativesRankingLoss(model=model, similarity_fct=util.dot_score)
        elif args.loss == 'MarginMSELoss':
            train_loss = losses.MarginMSELoss(model=model)
        elif args.loss == 'TripletLoss':
            train_loss = losses.TripletLoss(model=model)
        elif args.loss == 'ContrastiveLoss':
            train_loss = losses.ContrastiveLoss(model=model)
        elif args.loss == 'OnlineContrastiveLoss':
            train_loss = losses.OnlineContrastiveLoss(model=model)

    ns_description = args.negative_sampler
    if args.negative_sampler == 'sentence_transformer':
        ns_description += "_{}".format(args.sentence_bert_ns_model)
    if args.negative_sampler == 'generative':
        ns_description += "_{}".format(args.generative_sampling_model)

    wandb.init(project=args.wandb_project)
    wandb.config.update(args)

    if not eval_only:
        # This is the eval data used during training, not the actual evaluation.
        logging.info("Getting eval data")
        examples_dev = crr_reader.get_examples('valid.tsv',
                                               args.negative_sampler,
                                               args.anserini_folder,
                                               args.sentence_bert_ns_model,
                                               args.loss,
                                               args.output_dir,
                                               eval_data=True)
        # The dev reader yields 11 examples per query; keep 500 queries and
        # regroup them into {query, positive, negatives} records.
        examples_dev = examples_dev[0:(11 * 500)]
        eval_samples = []
        docs = []
        for i, example in enumerate(examples_dev):
            if (i + 1) % 11 == 0:
                eval_samples.append({
                    'query': example.texts[0],
                    'positive': [example.texts[1]],
                    'negative': docs
                })
                docs = []
            else:
                docs.append(example.texts[2])
        evaluator = RerankingEvaluator(eval_samples, write_csv=True, similarity_fct=util.dot_score)

        # 10% of the training steps for warm-up
        warmup_steps = math.ceil(len(train_data) * args.num_epochs / args.train_batch_size * 0.1)
        logging.info("Warmup-steps: {}".format(warmup_steps))

        logging.info("Fitting sentenceBERT for {}".format(args.task))
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluator,
                  epochs=args.num_epochs,
                  evaluation_steps=100,
                  steps_per_epoch=10000,
                  warmup_steps=warmup_steps,
                  output_path=args.output_dir + "{}_{}_ns_{}_loss_{}".format(
                      args.transformer_model, args.task, ns_description, args.loss))

    logging.info("Evaluating for full retrieval of responses to dialogue.")
    train = pd.read_csv(args.data_folder + args.task + "/train.tsv", sep="\t")
    test = pd.read_csv(args.data_folder + args.task + "/test.tsv", sep="\t")

    ns_test_sentenceBERT = negative_sampling.SentenceBERTNegativeSampler(
        list(train["response"].values) + list(test["response"].values),
        10,
        args.data_folder + args.task + "/test_sentenceBERTembeds",
        -1,
        args.output_dir + "{}_{}_ns_{}_loss_{}".format(
            args.transformer_model, args.task, ns_description, args.loss),
        use_cache_for_embeddings=False)

    ns_info = [
        (ns_test_sentenceBERT,
         ["cand_sentenceBERT_{}".format(i) for i in range(10)]
         + ["sentenceBERT_retrieved_relevant", "sentenceBERT_rank"],
         'sentenceBERT')
    ]
    examples = []
    examples_cols = ["context", "relevant_response"] + \
        reduce(lambda x, y: x + y, [t[1] for t in ns_info])

    logging.info("Retrieving candidates using different negative sampling strategies for {}.".format(args.task))
    recall_df = []
    for row in tqdm(test.itertuples(index=False), total=len(test)):
        context = row[0]
        relevant_response = row[1]
        instance = [context, relevant_response]
        for sampler, _, ns_name in ns_info:
            ns_candidates, scores, had_relevant, rank_relevant, _ = sampler.sample(context, [relevant_response])
            for cand in ns_candidates:
                instance.append(cand)
            instance.append(had_relevant)
            instance.append(rank_relevant)
            r10 = 1 if had_relevant else 0
            r1 = 1 if rank_relevant == 0 else 0
            recall_df.append([r10, r1])
        examples.append(instance)

    recall_df = pd.DataFrame(recall_df, columns=["R@10", "R@1"])
    examples_df = pd.DataFrame(examples, columns=examples_cols)

    retrieved_cols = [c for c in examples_df.columns if 'retrieved_relevant' in c]
    logging.info("R@10: {}".format(examples_df[retrieved_cols].sum() / examples_df.shape[0]))
    wandb.log({'R@10': (examples_df[retrieved_cols].sum() / examples_df.shape[0]).values[0]})

    rank_col = [c for c in examples_df.columns if 'rank' in c][0]
    logging.info("R@1: {}".format(examples_df[examples_df[rank_col] == 0].shape[0] / examples_df.shape[0]))
    wandb.log({'R@1': examples_df[examples_df[rank_col] == 0].shape[0] / examples_df.shape[0]})
    recall_df.to_csv(args.output_dir + "/recall_df_{}_{}_ns_{}_loss_{}.csv".format(
        args.transformer_model.replace("/", "-"),
        args.task,
        ns_description.replace("/", "-"),
        args.loss),
        index=False, sep="\t")
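# Example invocation (illustrative; the script name, task, and paths below are
# assumptions, not from the original source):
#
#   python train_ranker.py \
#       --task mantis \
#       --data_folder data/ \
#       --output_dir output/ \
#       --anserini_folder anserini/ \
#       --transformer_model bert-base-cased \
#       --loss MultipleNegativesRankingLoss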