def train(self, train_df, eval_df):
    """
    :param train_df: dataframe with columns 'text_a', 'text_b', 'labels'
    :param eval_df: dataframe with columns 'text_a', 'text_b', 'labels'
    :return:
    """
    # Format training data
    if "text_a" in train_df.columns and "text_b" in train_df.columns and "labels" in train_df.columns:
        if self.args.do_lower_case:
            train_df.loc[:, 'text_a'] = train_df['text_a'].str.lower()
            train_df.loc[:, 'text_b'] = train_df['text_b'].str.lower()
        train_examples = [
            InputExample(guid=str(i), texts=[text_a, text_b], label=label)
            for i, (text_a, text_b, label) in enumerate(
                zip(
                    train_df["text_a"].astype(str),
                    train_df["text_b"].astype(str),
                    train_df["labels"].astype(float),
                ))
        ]
    else:
        raise KeyError('Training data processing - Required columns not found!')

    # Format evaluation data (all three columns must be present in eval_df)
    if "text_a" in eval_df.columns and "text_b" in eval_df.columns and "labels" in eval_df.columns:
        if self.args.do_lower_case:
            eval_df.loc[:, 'text_a'] = eval_df['text_a'].str.lower()
            eval_df.loc[:, 'text_b'] = eval_df['text_b'].str.lower()
        evaluator = evaluation.EmbeddingSimilarityEvaluator(
            list(eval_df["text_a"]),
            list(eval_df["text_b"]),
            list(eval_df["labels"]),
            batch_size=self.args.eval_batch_size)
    else:
        raise KeyError('Evaluation data processing - Required columns not found!')

    # Define the train dataloader and the train loss
    train_dataloader = DataLoader(train_examples,
                                  shuffle=True,
                                  batch_size=self.args.train_batch_size)
    train_loss = losses.CosineSimilarityLoss(self.model)

    # Tune the model
    self.model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=self.args.num_train_epochs,
        warmup_steps=self.args.warmup_steps,
        optimizer_params={'lr': self.args.learning_rate},
        weight_decay=self.args.weight_decay,
        evaluator=evaluator,
        evaluation_steps=self.args.evaluate_during_training_steps,
        max_grad_norm=self.args.max_grad_norm,
        output_path=self.args.best_model_dir,
        show_progress_bar=self.args.show_progress_bar)
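# A minimal usage sketch for the train() method above; the wrapper class name
# SentencePairModel and its constructor are hypothetical stand-ins for
# whatever class this method belongs to. The DataFrame layout follows the
# docstring: 'text_a', 'text_b', and a float 'labels' column in [0, 1].
import pandas as pd

train_df = pd.DataFrame({
    "text_a": ["a cat sits on the mat", "the sky is blue"],
    "text_b": ["a cat is sitting on a mat", "grass is green"],
    "labels": [0.9, 0.1],
})
eval_df = train_df.copy()

model = SentencePairModel()  # hypothetical wrapper exposing train()
model.train(train_df, eval_df)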
sts_data[filename] = {'sentences1': [], 'sentences2': [], 'scores': []}
fIn = zip.open(filepath)
for line in io.TextIOWrapper(fIn, 'utf8'):
    sent1, sent2, score = line.strip().split("\t")
    score = float(score)
    sts_data[filename]['sentences1'].append(sent1)
    sts_data[filename]['sentences2'].append(sent2)
    sts_data[filename]['scores'].append(score)

# Build one similarity evaluator per STS file
for filename, data in sts_data.items():
    test_evaluator = evaluation.EmbeddingSimilarityEvaluator(
        data['sentences1'],
        data['sentences2'],
        data['scores'],
        batch_size=inference_batch_size,
        name=filename,
        show_progress_bar=False)
    evaluators.append(test_evaluator)

# Train the student model; the sequential evaluator averages the scores
# across all STS evaluators
student_model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluation.SequentialEvaluator(
                      evaluators,
                      main_score_function=lambda scores: np.mean(scores)),
                  epochs=num_epochs,
                  warmup_steps=num_warmup_steps,
                  evaluation_steps=num_evaluation_steps,
                  output_path=output_path,
                  save_best_model=True)
# Load the CDS Corpus train/test splits; relatedness scores are on a 0-5
# scale, so divide by 5 to map them into the [0, 1] range used by the
# cosine-similarity objective
corpus = pd.read_csv('CDSCorpus/CDS_train.csv', sep='\t',
                     error_bad_lines=False, encoding='utf-8')  # ,nrows=1000
corpus['relatedness_score'] = corpus['relatedness_score'].div(5)
corpus_test = pd.read_csv('CDSCorpus/CDS_test.csv', sep='\t',
                          error_bad_lines=False, encoding='utf-8')
corpus_test['relatedness_score'] = corpus_test['relatedness_score'].div(5)
# label2int = {"CONTRADICTION": 0, "ENTAILMENT": 1, "NEUTRAL": 2}

s1, s2, sc, s3 = [], [], [], []
for index, row in corpus_test.iterrows():
    s1.append(row['sentence_A'])
    s2.append(row['sentence_B'])
    sc.append(row['relatedness_score'])
    # sc.append(label2int[row['entailment_judgment']])

evaluator = evaluation.EmbeddingSimilarityEvaluator(s1, s2, sc)

# roberta-large requires more GPU memory
word_embedding_model = models.Transformer('roberta-base', max_seq_length=512)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_examples = []
test_examples = []
for index, row in corpus.iterrows():
    train_examples.append(InputExample(texts=[row['sentence_A'], row['sentence_B']],
                                       label=row['relatedness_score']))
    s3.append(row['sentence_A'])
    s3.append(row['sentence_B'])

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
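# A plausible continuation of the snippet above (not the author's original
# code, which is truncated here): with scores mapped to [0, 1],
# CosineSimilarityLoss is the standard sentence-transformers choice for
# regression-style fine-tuning. Epochs, warmup steps, and the output path
# below are illustrative assumptions.
train_loss = losses.CosineSimilarityLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=4,
          warmup_steps=100,
          output_path='cds-roberta-base')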
print(f"Dataset length: {len(dataset)}") return dataset #%% train_cluster_dataset = generate_dataset(grouped) test_cluster_dataset = generate_dataset(grouped_test) # train_dataset, test_dataset = train_test_split(dataset, train_size=0.8) train_dataset = train_cluster_dataset test_dataset = test_cluster_dataset train_dataloader = DataLoader(train_cluster_dataset, shuffle=True, batch_size=16) evaluator = evaluation.EmbeddingSimilarityEvaluator( [i.texts[0] for i in test_dataset], [i.texts[1] for i in test_dataset], [i.label for i in test_dataset]) # %% train_loss = losses.CosineSimilarityLoss(model) #%% output_dir = args.output try: os.mkdir(output_dir) except: pass try: os.mkdir(output_dir + '/best') except:
train_data.load_data(train_file_path)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)

###### Load dev sets ######
# Evaluate on the Hindi claim-pair dev data using Spearman rank correlation
logging.info("Read dev dataset")
evaluators = []
claim_pair_reader = ClaimPairDataReader()
dev_data = SentencesDataset(examples=claim_pair_reader.get_examples(split='train', language='hi'),
                            model=model)
# dev_file_path = 'test_southeast_asian_parallel_corpus.txt'
# dev_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
# dev_data.load_data(dev_file_path)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
# Legacy (pre-1.0) sentence-transformers API: the evaluator is constructed
# from a DataLoader rather than from sentence lists and gold scores
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='SE Asian Test Data')
evaluators.append(evaluator_sts)

# Train the model; the sequential evaluator reports the last evaluator's score.
# Note that 'correct_bias' is accepted by the transformers AdamW optimizer
# (the default in older sentence-transformers), not by torch.optim.AdamW.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(
              evaluators, main_score_function=lambda scores: scores[-1]),
          epochs=2,
          evaluation_steps=1000,
          warmup_steps=10000,
          scheduler='warmupconstant',
          output_path=output_path,
          save_best_model=True,
          optimizer_params={'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False})
            t1 = load_dict['images'][i]['sentences'][j]['raw'].lower()
            t2 = load_dict['images'][i]['sentences'][k]['raw'].lower()
            train_examples.append(InputExample(texts=[t1, t2], label=1))
# for k in range(i + 1, len(load_dict['images'])):
#     t1 = load_dict['images'][i]['sentences'][0]['raw'].lower()
#     t2 = load_dict['images'][k]['sentences'][0]['raw'].lower()
#     train_examples.append(InputExample(texts=[t1, t2], label=0))

train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=64)
train_loss = losses.ContrastiveLoss(model=model)

# Tune the model
sentences1 = [
    'This list contains the first column',
    'With your sentences',
    'You want your model to evaluate on'
]
sentences2 = [
    'Sentences contains the other column',
    'The evaluator matches sentences1[i] with sentences2[i]',
    'Compute the cosine similarity and compares it to scores[i]'
]
scores = [0.3, 0.6, 0.2]
evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=10,
          warmup_steps=100,
          output_path="./flickerbertmodel/",
          evaluator=evaluator,
          evaluation_steps=5)
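# Note on the snippet above: ContrastiveLoss expects both positive (label=1)
# and negative (label=0) pairs; with the negative-pair block left commented
# out, every example is positive and the embeddings can collapse. A minimal
# sketch of random negative sampling across different images, to run before
# SentencesDataset is built (hypothetical, not the author's code):
import random

num_images = len(load_dict['images'])
for i in range(num_images):
    k = random.randrange(num_images)
    if k == i:
        continue  # skip self-pairs; captions of the same image are positives
    t1 = load_dict['images'][i]['sentences'][0]['raw'].lower()
    t2 = load_dict['images'][k]['sentences'][0]['raw'].lower()
    train_examples.append(InputExample(texts=[t1, t2], label=0))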
for train_file in train_files:
    train_data.load_data(train_file)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)

###### Load dev sets ######
# Test on the STS2017.en-de dataset using Spearman rank correlation
logging.info("Read STS2017.en-de dataset")
evaluators = []
sts_reader = readers.STSDataReader('../datasets/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2)
dev_data = SentencesDataset(examples=sts_reader.get_examples('STS2017.en-de.txt.gz'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='STS2017.en-de')
evaluators.append(evaluator_sts)

# Use the XNLI.en-de dataset with MSE evaluation
logging.info("Read XNLI.en-de dataset")
xnli_reader = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
xnli_reader.load_data('../datasets/xnli-en-de.txt.gz')
xnli_dataloader = DataLoader(xnli_reader, shuffle=False, batch_size=train_batch_size)
xnli_mse = evaluation.MSEEvaluator(xnli_dataloader, name='xnli-en-de')
evaluators.append(xnli_mse)

# Train the model
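# The snippet above builds its evaluator from a DataLoader, which is the
# legacy (pre-1.0) sentence-transformers API. In current releases the
# evaluator takes parallel sentence lists and gold scores directly. A minimal
# sketch of the modern equivalent, assuming the same tab-separated gzip file
# layout as STSDataReader (s1, s2, score per line):
import gzip

sentences1, sentences2, scores = [], [], []
with gzip.open('../datasets/STS2017.en-de.txt.gz', 'rt', encoding='utf8') as fIn:
    for line in fIn:
        s1, s2, score = line.strip().split('\t')
        sentences1.append(s1)
        sentences2.append(s2)
        scores.append(float(score) / 5.0)  # STS gold scores are 0-5

evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(
    sentences1, sentences2, scores, name='STS2017.en-de')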
# The below all applies to the en-de example - how does one evaluate the
# model outside this single example?

###### Load dev sets ######
# Evaluate on the Hindi STS data using Spearman rank correlation
logging.info("Read data/hindi_sbert_sts_train.csv dataset")
evaluators = []
sts_reader = readers.STSDataReader('./data/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2)
dev_data = SentencesDataset(
    examples=sts_reader.get_examples('hindi_sbert_sts_train.csv'),
    model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(
    dev_dataloader, name='Hindi_Headlines_en_hi_sbert')
evaluators.append(evaluator_sts)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(
              evaluators, main_score_function=lambda scores: scores[-1]),
          epochs=20,
          evaluation_steps=1000,
          warmup_steps=10000,
          scheduler='warmupconstant',
          output_path=output_path,
          save_best_model=True,
          optimizer_params={
              'lr': 2e-5,
              'eps': 1e-6,
              'correct_bias': False
          })
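# Regarding the question at the top of the snippet above: a trained (or
# reloaded) model can be scored against any evaluator outside of fit().
# A minimal sketch; it assumes the model was saved to output_path and reuses
# the evaluator_sts defined above:
model = SentenceTransformer(output_path)  # reload the saved model
score = model.evaluate(evaluator_sts)     # returns the evaluator's main metric
print(score)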