def __getitem__(self, index) -> InputExample:
    """Return one (English, Thai) training pair.

    With probability ~0.4 (or always when ``self.true_only`` is set) the pair
    is a true translation labelled 0.8; otherwise a random non-matching Thai
    word is drawn and the pair is labelled 0.2.

    NOTE(review): the sampling-scheme table from the original docstring is
    kept below, but it does not match the implemented 0.4/0.6 split and the
    0.8/0.2 labels — confirm which is intended.

    0.5: match
        0.4: word - sentence
            0.: eng - eng: 100000+
            0.: thai - thai: 37706
            0.: thai - eng: 93045
            0.: eng - thai: 6310
        0.05: sentence - sentence: 6310
            - match word from thai to eng then pick random sentences
        0.05: word - word: 80508
    0.5: not match
        0.: eng-eng
        0.: thai-thai
        0.: both
    """
    tha, eng = self.words[self.indices[index]]
    # np.random.rand() > 0.6 is True with probability 0.4 -> emit a true pair
    if np.random.rand() > 0.6 or self.true_only:
        out = InputExample(texts=[eng, tha], label=0.8)
    else:
        # rejection-sample a different Thai word to build a negative pair
        while True:
            # idx is a 1-element tensor; assumes self.indices supports
            # tensor indexing (e.g. a numpy array) — TODO confirm
            idx = torch.randint(0, len(self), (1, ))
            other_tha, _ = self.words[self.indices[idx]]
            if other_tha != tha:
                break
        out = InputExample(texts=[eng, other_tha], label=0.2)
    return out
def test_multiclass(self):
    """Smoke-test MulticlassEvaluator with a 3-class softmax head on a tiny model.

    Builds a minimal SentenceTransformer (bert-tiny + mean pooling), a
    SoftmaxLoss head with 3 labels, and runs the evaluator over three
    one-pair-per-class samples.
    """
    transformer = models.Transformer('prajjwal1/bert-tiny')
    model = SentenceTransformer(modules=[
        transformer,
        models.Pooling(transformer.get_word_embedding_dimension())
    ])
    softmax_loss = losses.SoftmaxLoss(
        model, transformer.get_word_embedding_dimension(), num_labels=3)
    samples = [
        InputExample(texts=[
            "Hello Word, a first test sentence",
            "Hello Word, a other test sentence"
        ], label=0),
        InputExample(texts=[
            "Hello Word, a second test sentence",
            "Hello Word, a other test sentence"
        ], label=1),
        InputExample(texts=[
            "Hello Word, a third test sentence",
            "Hello Word, a other test sentence"
        ], label=2)
    ]
    dataloader = DataLoader(samples, batch_size=1)
    evaluator = MulticlassEvaluator(dataloader, softmax_model=softmax_loss)
    # the test currently only checks that evaluation runs without raising;
    # the dead `i = 0` trailing statement from the original was removed
    result = evaluator(model)
    # TODO(review): assert on `result` once the expected metric is defined
def stratifiedkfoldtest(data):
    """Run 10-fold stratified cross-validation of a CrossEncoder on `data`.

    Expects columns 'query_p', 'citation_p', 'label'. Relies on module-level
    helpers `sts_sim` (encoder similarity scores) and `f1_macro` (F1 curve
    over thresholds), plus sklearn `metrics`. Prints per-fold F1/accuracy and
    the averages across folds.
    """
    # deterministic shuffle so folds are reproducible
    data = data.sample(frac=1,random_state=1).reset_index(drop=True)
    skf = StratifiedKFold(n_splits=10)
    splits=[(x,y) for x,y in skf.split(data, data['label'])]
    f1list=[]
    acclist=[]
    import torch
    torch.cuda.empty_cache()
    # report GPU memory state before training starts
    t = torch.cuda.get_device_properties(0).total_memory
    r = torch.cuda.memory_reserved(0)
    a = torch.cuda.memory_allocated(0)
    f = r-a  # free inside reserved
    print(f"Total:{t/1e+9}, Reserved:{r}, Allocated:{a}, Free:{f}")
    # single-point hyperparameter "grid": batch size, learning rate, epochs
    for b in [24]:
        for l in [2e-5]:
            for e in [4]:
                for train_index, test_index in splits:
                    # resetting the model for every fold
                    model=CrossEncoder('cross-encoder/stsb-roberta-base',num_labels=1)
                    # train split
                    train=data.loc[train_index]
                    # test split
                    test=data.loc[test_index]
                    # data loaders
                    train_=SentencesDataset([InputExample(texts=[d['query_p'],d['citation_p']],label=int(d['label'])) for i,d in train.iterrows()],model)
                    test_=SentencesDataset([InputExample(texts=[d['query_p'],d['citation_p']],label=int(d['label'])) for i,d in test.iterrows()],model)
                    train_=DataLoader(train_,batch_size=b)
                    test_=DataLoader(test_)
                    # training
                    model.fit(train_,epochs=e,optimizer_params={'lr':l})
                    # predictions using encoder similarity
                    y=test['label']
                    dlist=list(test.apply(lambda d:(d['query_p'],d['citation_p']), axis=1))
                    yh=sts_sim(dlist,model)
                    # f1 — pick the threshold maximizing macro F1
                    f1scores,thresholds=f1_macro(y,yh)
                    # NOTE(review): `np.nan in f1scores` relies on identity/
                    # equality and is unreliable for NaN detection;
                    # np.isnan(np.asarray(f1scores)).any() is the robust check.
                    print(np.nan in f1scores)
                    f1=max(f1scores)
                    f1list.append(f1)
                    print(f1)
                    # accuracy at the best-F1 threshold
                    mthres=thresholds[np.nanargmax(f1scores)]
                    yh1=np.zeros(len(yh))
                    yh1[yh>=mthres]=1
                    f12=metrics.f1_score(y,yh1,average='macro')
                    # sanity cross-check; drops into the debugger on mismatch
                    if f12!=f1:
                        import pdb
                        pdb.set_trace()
                    acc=metrics.accuracy_score(y, yh1)
                    print(acc)
                    acclist.append(acc)
    print(b,l,e)
    print("Average Macro F1 across folds:",np.mean(f1list))
    print("Average Acc across folds:",np.mean(acclist))
def kfoldtest(data):
    """Run 100-fold cross-validation of a CrossEncoder, pooling predictions.

    Unlike `stratifiedkfoldtest`, `y`/`yh` are accumulated across folds via
    np.append, so the F1/accuracy below are computed on the pooled
    predictions of all folds. Relies on module-level `sts_sim`, `f1_macro`
    and sklearn `metrics`.
    """
    # deterministic shuffle so folds are reproducible
    data = data.sample(frac=1,random_state=1).reset_index(drop=True)
    skf = KFold(n_splits=100)
    splits=[(x,y) for x,y in skf.split(data)]
    f1list=[]
    acclist=[]
    import torch
    print(torch.cuda.is_available())
    # single-point hyperparameter "grid": batch size, learning rate, epochs
    for b in [20]:
        for l in [2e-5]:
            for e in [4]:
                yh=np.array([])
                y=np.array([])
                i=0
                for train_index, test_index in splits:
                    i+=1
                    print(f"Fold {i}")
                    # resetting the model for every fold
                    model=CrossEncoder('cross-encoder/stsb-roberta-base',num_labels=1)
                    # train split
                    train=data.loc[train_index]
                    # test split
                    test=data.loc[test_index]
                    # data loaders
                    train_=SentencesDataset([InputExample(texts=[d['query_p'],d['citation_p']],label=int(d['label'])) for i,d in train.iterrows()],model)
                    test_=SentencesDataset([InputExample(texts=[d['query_p'],d['citation_p']],label=int(d['label'])) for i,d in test.iterrows()],model)
                    train_=DataLoader(train_,batch_size=b)
                    test_=DataLoader(test_)
                    # training
                    model.fit(train_,epochs=e,optimizer_params={'lr':l})
                    # predictions using cos_similarity, pooled across folds
                    y=np.append(y,test['label'])
                    dlist=list(test.apply(lambda d:(d['query_p'],d['citation_p']), axis=1))
                    yh=np.append(yh,sts_sim(dlist,model))
                # NOTE(review): source formatting was lost; this metric block
                # is placed after the fold loop (pooled predictions), which
                # matches the np.append accumulation above — confirm intent.
                # f1 — pick the threshold maximizing macro F1
                f1scores,thresholds=f1_macro(y,yh)
                # NOTE(review): `np.nan in f1scores` is unreliable for NaN
                # detection; prefer np.isnan(np.asarray(f1scores)).any().
                print(np.nan in f1scores)
                f1=max(f1scores)
                f1list.append(f1)
                print(f1)
                # accuracy at the best-F1 threshold
                mthres=thresholds[np.nanargmax(f1scores)]
                yh1=np.zeros(len(yh))
                yh1[yh>=mthres]=1
                f12=metrics.f1_score(y,yh1,average='macro')
                # sanity cross-check; drops into the debugger on mismatch
                if f12!=f1:
                    import pdb
                    pdb.set_trace()
                acc=metrics.accuracy_score(y, yh1)
                print(acc)
                acclist.append(acc)
    print(b,l,e)
    print("BERT Fine-Tuned: Average F1 across folds:",np.mean(f1list))
    print("BERT Fine-Tuned: Average Acc across folds:",np.mean(acclist))
def part_gen_constructor(sampler, part_df):
    """Yield labelled question/answer InputExamples for one dataframe part.

    For each question group in `part_df`, every known answer becomes a
    positive example (label 1) and sampler-drawn negatives become label-0
    examples.

    :param sampler: negative sampler exposing .sample(question, pos_answers)
    :param part_df: dataframe with 'question' and 'answer' columns
    """
    for question, df in part_df.groupby("question"):
        pos_answer_list = df["answer"].tolist()
        # BUG FIX: the original ignored the `sampler` parameter and used a
        # module-level `sbert_sampler` instead.
        negs = sampler.sample(question, pos_answer_list)
        for pos_answer in pos_answer_list:
            yield InputExample(texts=[question, pos_answer], label=1)
        # negs[0] holds the list of negative answer texts — assumed shape of
        # the sampler's return value; TODO confirm against the sampler API
        for neg_answer in negs[0]:
            yield InputExample(texts=[question, neg_answer], label=0)
def load_train_sbert(path_train_data, num_samples):
    """Load sentence-pair training examples from a CSV file.

    :param path_train_data: path to a CSV with sentence1/sentence2/label columns
    :param num_samples: keep only the first N rows when > 0; <= 0 keeps all
    :return: list of InputExample with integer labels
    """
    df = pd.read_csv(path_train_data)
    if num_samples > 0:
        df = df.head(num_samples).copy()

    examples = []
    for s1, s2, label in zip(df.sentence1.values, df.sentence2.values,
                             df.label.values):
        examples.append(InputExample(texts=[s1, s2], label=int(label)))
    return examples
def train(self, train_df, eval_df):
    """
    Fine-tune the sentence-transformer with a cosine-similarity objective.

    :param train_df: dataframe with columns 'text_a', 'text_b', 'labels'
    :param eval_df: dataframe with columns 'text_a', 'text_b', 'labels'
    :return:
    """
    required = ("text_a", "text_b", "labels")

    # format training data
    if all(col in train_df.columns for col in required):
        if self.args.do_lower_case:
            train_df.loc[:, 'text_a'] = train_df['text_a'].str.lower()
            train_df.loc[:, 'text_b'] = train_df['text_b'].str.lower()

        train_examples = [
            InputExample(str(i), [text_a, text_b], label)
            for i, (text_a, text_b, label) in enumerate(
                zip(
                    train_df["text_a"].astype(str),
                    train_df["text_b"].astype(str),
                    train_df["labels"].astype(float),
                ))
        ]
    else:
        raise KeyError(
            'Training data processing - Required columns not found!')

    # format evaluation data
    # BUG FIX: the original checked train_df's columns here, so a malformed
    # eval_df could slip through (or a valid eval_df could be rejected).
    if all(col in eval_df.columns for col in required):
        if self.args.do_lower_case:
            eval_df.loc[:, 'text_a'] = eval_df['text_a'].str.lower()
            eval_df.loc[:, 'text_b'] = eval_df['text_b'].str.lower()

        evaluator = evaluation.EmbeddingSimilarityEvaluator(
            list(eval_df["text_a"]),
            list(eval_df["text_b"]),
            list(eval_df["labels"]),
            batch_size=self.args.eval_batch_size)
    else:
        raise KeyError(
            'Evaluation data processing - Required columns not found!')

    # Define train dataset, the dataloader and the train loss
    train_dataloader = DataLoader(train_examples,
                                  shuffle=True,
                                  batch_size=self.args.train_batch_size)
    train_loss = losses.CosineSimilarityLoss(self.model)

    # Tune the model
    self.model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=self.args.num_train_epochs,
        warmup_steps=self.args.warmup_steps,
        optimizer_params={'lr': self.args.learning_rate},
        weight_decay=self.args.weight_decay,
        evaluator=evaluator,
        evaluation_steps=self.args.evaluate_during_training_steps,
        max_grad_norm=self.args.max_grad_norm,
        output_path=self.args.best_model_dir,
        show_progress_bar=self.args.show_progress_bar)
def get_binary_experimental_setup():
    """Assemble binary-classification training examples and an evaluator.

    Combines the 'items' and 'domains' example sets, converts the training
    triples into InputExamples, and builds a BinaryClassificationEvaluator
    from the validation triples.
    :return: (train_examples, evaluator)
    """
    # Items
    train_items, valid_items = extract_examples("items")
    # Domains
    train_domains, valid_domains = extract_examples("domains")

    # Regroup items and domains together
    train_examples = train_items + train_domains
    valid_examples = valid_items + valid_domains
    print(
        f"{len(train_examples)} training examples to {len(valid_examples)} valid examples"
    )

    # Convert the raw (sent1, sent2, label) triples into InputExamples
    converted_examples = []
    for sent1, sent2, label in train_examples:
        converted_examples.append(InputExample(texts=[sent1, sent2], label=label))

    # Transpose valid triples into parallel argument lists for the evaluator
    evaluator = evaluation.BinaryClassificationEvaluator(*zip(*valid_examples),
                                                         batch_size=128)
    return converted_examples, evaluator
def pretrained_model_score(self, model_name, expected_score):
    """Evaluate a pretrained model on the STS-benchmark test split.

    Downloads the benchmark if missing, scores the model with an
    EmbeddingSimilarityEvaluator, and asserts the score meets (or is within
    0.1 of) `expected_score`.
    """
    model = SentenceTransformer(model_name)
    sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

    if not os.path.exists(sts_dataset_path):
        util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                      sts_dataset_path)

    # bucket the benchmark rows by split
    samples_by_split = {'train': [], 'dev': [], 'test': []}
    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
        for row in csv.DictReader(fIn, delimiter='\t',
                                  quoting=csv.QUOTE_NONE):
            # Normalize score to range 0 ... 1
            example = InputExample(
                texts=[row['sentence1'], row['sentence2']],
                label=float(row['score']) / 5.0)
            split = row['split'] if row['split'] in ('dev', 'test') else 'train'
            samples_by_split[split].append(example)

    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        samples_by_split['test'], name='sts-test')
    score = model.evaluate(evaluator) * 100
    print(model_name,
          "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
    assert score > expected_score or abs(score - expected_score) < 0.1
def test_LabelAccuracyEvaluator(self):
    """Tests that the LabelAccuracyEvaluator can be loaded correctly"""
    model = SentenceTransformer('paraphrase-distilroberta-base-v1')
    nli_dataset_path = 'datasets/AllNLI.tsv.gz'
    if not os.path.exists(nli_dataset_path):
        util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz',
                      nli_dataset_path)

    label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}

    # take the first 100 'train' rows as a small accuracy-check set
    dev_samples = []
    with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
        for row in csv.DictReader(fIn, delimiter='\t',
                                  quoting=csv.QUOTE_NONE):
            if len(dev_samples) >= 100:
                break
            if row['split'] != 'train':
                continue
            dev_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=label2int[row['label']]))

    train_loss = losses.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=len(label2int))

    dev_dataloader = DataLoader(dev_samples, shuffle=False, batch_size=16)
    evaluator = evaluation.LabelAccuracyEvaluator(dev_dataloader,
                                                  softmax_model=train_loss)
    acc = evaluator(model)
    assert acc > 0.2
def generate_dataset(_grouped):
    """Build a binary sentence-similarity dataset from clustered sentences.

    :param _grouped: dict mapping cluster_id -> list of sentences
    :return: list of InputExample; within-cluster pairs get label 1.0,
             cross-cluster pairs get label 0.0
    """

    def all_same(_sentences):
        # True when every sentence in the list is identical
        for i in _sentences:
            for j in _sentences:
                if i != j:
                    return False
        return True

    def duplicate(data, k=1):
        # append k copies of each cluster's sentence list in place
        for cluster_id, _sentences in data.items():
            tmp = copy.deepcopy(_sentences)
            for i in range(k):
                _sentences += tmp

    dataset = []
    same = copy.deepcopy(_grouped)
    diff = copy.deepcopy(_grouped)
    other = copy.deepcopy(_grouped)
    duplicate(same, k=1)  # duplicate for balanced dataset

    # positive pairs: draw distinct sentence pairs within each cluster
    for cluster_id, sentences in same.items():
        while len(sentences) > 1:
            if all_same(sentences):
                break
            # random.choices samples with replacement: both picks may be equal
            picked = random.choices(sentences, k=2)
            if picked[0] != picked[1]:
                dataset.append(
                    InputExample(texts=[picked[0], picked[1]], label=1.0))
                sentences.remove(picked[0])
                sentences.remove(picked[1])

    # negative pairs: pair each sentence with one from a different cluster
    for cluster_id, sentences in diff.items():
        other_cluster = [
            value for key, value in other.items() if key != cluster_id
        ]
        other_cluster = [
            item for sublist in other_cluster for item in sublist
        ]  # flatten lists
        for sentence in sentences:
            choice = random.choice(other_cluster)
            # BUG FIX: the original guarded this block with
            # `if choice != choices[1]`, comparing against a stale variable
            # from the positive loop (NameError when that loop never picked a
            # pair, and it randomly dropped valid negatives otherwise).
            other_cluster.remove(choice)
            dataset.append(InputExample(texts=[sentence, choice], label=0.0))

    print(f"Dataset length: {len(dataset)}")
    return dataset
def create_posts_ranking(fl, data_dir, model, validate=None, is_test=False):
    """Build a ranking-supervision dataset from Q/A posts in JSON file `fl`.

    Each kept answer is paired with its question, labelled rank/count so the
    top answer gets 1.0 decreasing toward 0. Relies on module-level globals:
    max_size, posts_rank_str, num_epochs, batch_size, evaluation_steps.

    Returns the raw examples when is_test=True, otherwise
    (dataloader, loss, evaluator, warmup_steps).
    """
    train_posts_ranking = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        for obj in data:
            answers = obj['answers']
            filtered_answers = []
            votes = 1000000
            # keep only answers whose vote count strictly decreases relative
            # to the running minimum — presumably answers arrive pre-sorted
            # by rank; TODO confirm against the data format
            for answer in answers:
                my_votes = answer['a_votes']
                if my_votes < votes:
                    votes = my_votes
                    filtered_answers.append(answer)
            if len(filtered_answers) > 1:
                rank = len(filtered_answers)
                for answer in filtered_answers:
                    # label = rank / count: best answer 1.0, decreasing
                    dist = rank / len(filtered_answers)
                    disbn.append(answer['a_rank'])
                    rank = rank - 1
                    train_posts_ranking.append(
                        InputExample(texts=[obj['q_text'], answer['a_text']],
                                     label=dist))
    random.shuffle(train_posts_ranking)
    print("data size " + str(len(train_posts_ranking)))
    if is_test:
        return train_posts_ranking
    if max_size:
        train_posts_ranking = train_posts_ranking[:max_size]
    evaluator = None
    # build a held-out evaluator only when this task is the validation target
    if posts_rank_str == validate:
        train_posts_ranking, dev_posts_ranking = train_test_split(
            train_posts_ranking, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_posts_ranking, name='posts ranking')
    warmup_steps = math.ceil(
        len(train_posts_ranking) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up
    train_data_posts_ranking = SentencesDataset(train_posts_ranking,
                                                model=model)
    train_dataloader_posts_ranking = DataLoader(train_data_posts_ranking,
                                                shuffle=True,
                                                batch_size=batch_size)
    train_loss_posts_ranking = losses.CosineSimilarityLoss(model=model)
    print('R: Number of training examples: ', len(train_posts_ranking))
    global evaluation_steps
    # NOTE(review): dividing by 0.1 multiplies the length by 10; if the
    # intent was "evaluate every 10% of the data", this should be `* 0.1`
    # — confirm.
    evaluation_steps = math.ceil(len(train_posts_ranking) / 0.1)
    return train_dataloader_posts_ranking, train_loss_posts_ranking, evaluator, warmup_steps
def __iter__(self):
    """Stream (query, positive, negative) triplet examples from the file.

    Each line of self.triplets_file holds whitespace-separated ids:
    query id, positive passage id, negative passage id; texts are looked up
    in self.queries / self.corpus.
    """
    with open(self.triplets_file, 'r') as triplet_file:
        for raw_line in triplet_file:
            query_id, positive_id, negative_id = raw_line.strip().split()
            yield InputExample(texts=[
                self.queries[query_id],
                self.corpus[positive_id],
                self.corpus[negative_id],
            ])
def read_dataset(dataset_path, split):
    """Read duplicate-question-style samples for one split of a TSV dataset.

    Each line holds: split, query, doc1, doc2, label (tab-separated); only
    lines whose split matches `split` are kept.

    :param dataset_path: path to the TSV file
    :param split: split name to filter on (e.g. 'train', 'dev')
    :return: list of InputExample with texts=[query, doc1, doc2], int label
    """
    samples = []
    with open(dataset_path, 'r') as fIn:
        for line in tqdm(fIn):
            line_split, query, doc1, doc2, label = line.strip().split('\t')
            if line_split == split:
                samples.append(
                    InputExample(texts=[query, doc1, doc2],
                                 label=int(label)))
    # BUG FIX: the original built `samples` but never returned it,
    # so callers always received None.
    return samples
def to_input_example(language_list):
    """Convert STS-style records into InputExamples.

    :param language_list: iterable of dicts with keys 'sentence1',
        'sentence2' and 'similarity_score' (0-5 scale)
    :return: list of InputExample with labels rescaled to 0-1
    """
    return [
        InputExample(
            texts=[record["sentence1"], record["sentence2"]],
            # rescale the 0-5 similarity score to the 0-1 range
            label=(record["similarity_score"] / 5),
        )
        for record in language_list
    ]
def sentence_bert_data_prepare(intent_list, text_intent_map):
    """Build pairwise intent-matching training examples.

    Every text is paired with every other text; pairs sharing an intent get
    label 1.0, others 0.0. Each unordered pair is emitted at most once.

    :param intent_list: list of intent names
    :param text_intent_map: dict intent -> list of utterance texts
    :return: list of InputExample
    """
    print("开始整理数据")
    # set gives O(1) dedup lookups; the original used a list, which made the
    # inner check O(n) per pair (quadratic overall on the number of pairs)
    seen_pairs = set()
    examples_train = []
    for intent_a in intent_list:
        for text_a in text_intent_map[intent_a]:
            for intent_b in intent_list:
                for text_b in text_intent_map[intent_b]:
                    label = 1 if intent_a == intent_b else 0
                    key_ab = text_a + text_b
                    key_ba = text_b + text_a
                    if key_ab in seen_pairs or key_ba in seen_pairs:
                        continue
                    # record both orders: the original only recorded the
                    # reversed key, so an exact repeat of the same pair
                    # (duplicate utterances in the map) slipped through
                    seen_pairs.add(key_ab)
                    seen_pairs.add(key_ba)
                    examples_train.append(
                        InputExample(guid='guid',
                                     texts=[text_a, text_b],
                                     label=float(label)))
    return examples_train
def evaluate_sbert(model, batch_size=16):
    """Evaluate `model` on the STS-benchmark test split.

    Embeds both sentences of each test pair via model.forward and prints
    Pearson/Spearman correlations of the gold scores against cosine,
    (negated) Manhattan/Euclidean distances and dot products.

    NOTE(review): `batch_size` is accepted but never used — presumably
    model.forward batches internally; confirm.
    """
    sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
    test_samples = []
    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if row['split'] == 'test':
                score = float(
                    row['score']) / 5.0  # Normalize score to range 0 ... 1
                test_samples.append(
                    InputExample(texts=[row['sentence1'], row['sentence2']],
                                 label=score))
    sentences1 = []
    sentences2 = []
    scores = []
    examples = test_samples
    for example in examples:
        # model.forward appears to expect (text, tag) tuples; the 'none' tag
        # is a placeholder — TODO confirm against the model's input contract
        sentences1.append((example.texts[0], 'none'))
        sentences2.append((example.texts[1], 'none'))
        scores.append(example.label)
    _, embeddings1 = model.forward(sentences1, checkpoint=False)
    _, embeddings2 = model.forward(sentences2, checkpoint=False)
    labels = scores
    # similarity in [0, 2]-style form: 1 - cosine distance
    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
    # distances are negated so that "larger = more similar" for correlation
    manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
    dot_products = [
        np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)
    ]
    eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
    eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
    eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
    eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
    eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
    eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
    eval_pearson_dot, _ = pearsonr(labels, dot_products)
    eval_spearman_dot, _ = spearmanr(labels, dot_products)
    print("Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_cosine, eval_spearman_cosine))
    print("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_manhattan, eval_spearman_manhattan))
    print("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_euclidean, eval_spearman_euclidean))
    print("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
        eval_pearson_dot, eval_spearman_dot))
def get_softmax_experimental_setup():
    """Prepare training examples for the softmax-classification setup.

    :return: (train_examples, num_labels) where the example list is the
        converted classification set repeated 100 times
    """
    train_examples, num_labels = extract_classif_examples()

    # Postprocess train examples to correct format
    formatted = []
    for sent, label in train_examples:
        # NOTE(review): the same sentence fills both text slots — presumably
        # intentional for this single-sentence classification setup; confirm.
        formatted.append(InputExample(texts=[sent, sent], label=label))

    # replicate the (shared) example objects 100x, as in the original setup
    return formatted * 100, num_labels
def get_file_data(filename):
    """Load labelled sentence pairs from a TSV file with a header row.

    Expected columns per line: index 1 = label, index 2 = first sentence,
    index 3 = second sentence.

    :param filename: path to the TSV file (utf8)
    :return: list of InputExample with float labels
    """
    dataset = []
    # `with` guarantees the file is closed even if parsing raises;
    # the original left the handle open on exceptions
    with open(filename, 'r', encoding='utf8') as data_file:
        for line in data_file.readlines()[1:]:  # skip the header row
            info = line.split('\t')
            dataset.append(
                InputExample(texts=[info[2].strip(), info[3].strip()],
                             label=float(info[1].strip())))
    return dataset
def read(self, data, return_pt=False):
    """Extract sentence pairs and labels from a dataframe.

    :param data: dataframe with 'sent1', 'sent2' and 'label' columns
    :param return_pt: when True, return a list of InputExample instead of
        the three parallel lists
    :return: list of InputExample, or (sentence1, sentence2, labels)
    """
    sent1_list = data['sent1'].tolist()
    sent2_list = data['sent2'].tolist()
    label_list = data['label'].tolist()

    if not return_pt:
        return sent1_list, sent2_list, label_list

    # wrap each row as an InputExample for dataloader consumption
    return [
        InputExample(texts=[first, second], label=target)
        for first, second, target in zip(sent1_list, sent2_list, label_list)
    ]
def create_hirerachy_examples(fl, data_dir, model, validate=None, is_test=False):
    """Build class-hierarchy similarity examples from JSON file `fl`.

    (Function name typo 'hirerachy' is kept: it is the public interface.)
    Labels invert the hierarchy distance so close classes approach 1.0.
    Relies on module-level globals: max_size, hierarchy_str, num_epochs,
    batch_size, evaluation_steps. Returns the raw examples when
    is_test=True, otherwise (dataloader, loss, evaluator, warmup_steps).
    """
    train_hierarchy_samples = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
    max_distance = 0
    for obj in data:
        if obj['distance'] > max_distance:
            max_distance = obj['distance']
    for obj in data:
        # flip the meaning of similarity, since the more distant the two
        # classes, the closer to 0 it should be
        # NOTE(review): the denominator (max_distance - 1) assumes the
        # minimum distance is 1 — confirm against the data.
        dist = (max_distance - obj['distance']) / (max_distance - 1)
        train_hierarchy_samples.append(
            InputExample(texts=[obj['class1'], obj['class2']], label=dist))
        disbn.append(obj['distance'])
    # NOTE(review): samples are shuffled but `disbn` is not, so the stratify
    # labels used below no longer align element-wise with the samples —
    # likely a bug; confirm and shuffle both in tandem if so.
    random.shuffle(train_hierarchy_samples)
    train_hierarchy_samples = train_hierarchy_samples[:100000]
    disbn = disbn[:100000]
    if max_size:
        train_hierarchy_samples = train_hierarchy_samples[:max_size]
        disbn = disbn[:max_size]
    if is_test:
        return train_hierarchy_samples
    evaluator = None
    # build a held-out evaluator only when this task is the validation target
    if hierarchy_str == validate:
        train_hierarchy_samples, dev_hierarchy_samples = train_test_split(
            train_hierarchy_samples, stratify=disbn, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_hierarchy_samples, name='hierarchy')
    warmup_steps = math.ceil(
        len(train_hierarchy_samples) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up
    train_data_hierarchy = SentencesDataset(train_hierarchy_samples,
                                            model=model)
    train_dataloader_hierarchy = DataLoader(train_data_hierarchy,
                                            shuffle=True,
                                            batch_size=batch_size)
    train_loss_hierarchy = losses.CosineSimilarityLoss(model=model)
    print('H: Number of training examples: ', len(train_hierarchy_samples))
    global evaluation_steps
    # NOTE(review): `/ 0.1` multiplies the length by 10; if "every 10% of
    # the data" was intended, this should be `* 0.1` — confirm.
    evaluation_steps = math.ceil(len(train_hierarchy_samples) / 0.1)
    return train_dataloader_hierarchy, train_loss_hierarchy, evaluator, warmup_steps
def get_triplet_data(config):
    """Load triplet training data and dev pair data from configured paths.

    :param config: dict with keys 'eval_dir', 'dev_file', 'train_dir',
        'train_triplet_file'
    :return: (train_dataset, dev_data) where train_dataset is a list of
        anchor/positive/negative InputExamples
    """
    dev_data = get_file_data(
        os.path.join(config['eval_dir'], config['dev_file']))

    train_dataset = []
    # `with` closes the file even on parse errors (the original closed it
    # only on the success path); iterating the handle avoids materializing
    # all lines via readlines()
    with open(os.path.join(config['train_dir'],
                           config['train_triplet_file'])) as train_file:
        for line in train_file:
            info = line.strip().split('\t')
            anchor, positive, negative = info[0], info[1], info[2]
            train_dataset.append(
                InputExample(texts=[anchor, positive, negative]))
    return train_dataset, dev_data
def finetune_sbert(model, df, rep_sents, finetune_cfg): """Finetune the Sentence-BERT.""" # setup train_size = finetune_cfg.get("train_size", 200000) sample_per_pair = finetune_cfg.get("sample_per_pair", 5) train_batch_size = finetune_cfg.get("train_batch_size", 32) epochs = finetune_cfg.get("epochs", 1) train = [] n_sampled = 0 cnts = [0, 0] # [neg, pos] max_label_size = train_size // 2 genres = df.genres.apply(set) with tqdm(total=train_size, position=0) as pbar: # sample sentence pairs while n_sampled < train_size: id1, id2 = np.random.randint(0, len(df), 2) label = int(bool(set.intersection(genres[id1], genres[id2]))) if cnts[label] > max_label_size: continue sent_pairs = np.stack(np.meshgrid(rep_sents[id1], rep_sents[id2])).T.reshape( -1, 2) if len(sent_pairs) <= sample_per_pair: samples = sent_pairs else: samples_idx = np.random.choice(sent_pairs.shape[0], sample_per_pair, replace=False) samples = sent_pairs[samples_idx] inexp = lambda pair: InputExample(texts=list(pair), label=label) samples = list(map(inexp, samples)) train.extend(samples) n_sampled += len(samples) cnts[label] += len(samples) pbar.update(len(samples)) # run finetune train_ds = SentencesDataset(train, model) train_obj = ( DataLoader(train_ds, shuffle=True, batch_size=train_batch_size), losses.ContrastiveLoss(model=model), ) model.fit(train_objectives=[train_obj], epochs=epochs, warmup_steps=100) os.makedirs("model/clustering/sbert", exist_ok=True) model.save("model/clustering/sbert")
def load_data(training_data_file, training_split=0.9, batch_size=16):
    """Load query/passage pairs from a JSON-lines file and split them.

    Each row must contain 'query', 'positives' and 'negatives'. Positives
    get label 1.0, negatives 0.0; each example is routed to the train or
    validation set by an independent coin flip with P(train)=training_split.

    :return: (train_dataloader, validation_examples)
    """
    train_examples = []
    validation_examples = []

    def _assign(example):
        # independent Bernoulli split per example
        if random.random() < training_split:
            train_examples.append(example)
        else:
            validation_examples.append(example)

    for row in jsonlines.open(training_data_file):
        query = row["query"]
        for positive in row["positives"]:
            _assign(InputExample(texts=[query, positive], label=1.0))
        for negative in row["negatives"]:
            _assign(InputExample(texts=[query, negative], label=0.0))

    train_dataloader = DataLoader(train_examples,
                                  shuffle=True,
                                  batch_size=batch_size)
    return train_dataloader, validation_examples
def __getitem__(self, item):
    """Return a (query, positive, negative) triplet for training.

    Positive and negative passage ids are consumed round-robin: each call
    takes the front id and re-queues it at the back, so successive epochs
    cycle through all candidates.
    """
    query = self.queries[self.queries_ids[item]]
    query_text = query['query']

    # rotate the positives queue: front id is used and re-queued at the end
    positive_id = query['pos'].pop(0)
    positive_text = self.corpus[positive_id]
    query['pos'].append(positive_id)

    # identical rotation for the negatives queue
    negative_id = query['neg'].pop(0)
    negative_text = self.corpus[negative_id]
    query['neg'].append(negative_id)

    return InputExample(texts=[query_text, positive_text, negative_text])
def __getitem__(self, index):
    """Return a (code, positive description, negative description) triplet.

    The positive description is the one aligned with the code snippet; the
    negative is a uniformly random description from a *different* index.
    Assumes the dataset holds at least two items.
    """
    # Load the code and the aligned (positive) description
    code_snippet = self.code_snippets[index]
    positive_desc = self.descriptions[index]

    # O(1) uniform draw of an index != index. The original materialized a
    # full candidate list and removed the current index on every call,
    # which is O(n) per item for the same distribution.
    negative_index = random.randrange(self.__len__() - 1)
    if negative_index >= index:
        negative_index += 1
    negative_desc = self.descriptions[negative_index]

    return InputExample(texts=[code_snippet, positive_desc, negative_desc])
def run(
    self,
    training_data,
    evaluator,
    output_path,
    from_scratch=False,
    loss=SentenceTransformerLoss.cosine_similarity_loss,
    model_name_or_path="roberta-large-nli-stsb-mean-tokens",
    cuda=True,
    **kwargs,
):
    """Fine-tune a SentenceTransformer on sentence-pair training data.

    :param training_data: dict of records with 'sentence1', 'sentence2', 'label'
    :param evaluator: sentence-transformers evaluator run during training
    :param output_path: directory for checkpoints
    :param from_scratch: build a fresh transformer+pooling model instead of
        loading a pretrained SentenceTransformer
    :param loss: SentenceTransformerLoss member whose value builds the loss
    :param model_name_or_path: base model name or path
    :param cuda: move the model to GPU when True
    :param kwargs: max_seq_length, shuffle, batch_size, num_epochs,
        evaluation_steps
    """
    logger.info(
        f"Running Sentence Transformer Task: {model_name_or_path}, Output path: {output_path}"
    )
    if from_scratch:
        logger.info("Training from scratch")
        # BUG FIX: the original constructed the Transformer module but never
        # assigned a model, so `model` was undefined on this path (NameError).
        word_embedding_model = models.Transformer(
            model_name_or_path,
            max_seq_length=kwargs.get("max_seq_length", 128))
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        model = SentenceTransformer(model_name_or_path)
    if cuda:
        logger.info("Running model on GPU")
        model.cuda()

    train_examples = [
        InputExample(texts=[data["sentence1"], data["sentence2"]],
                     label=data["label"])
        for data in training_data.values()
    ]
    train_dataset = SentencesDataset(train_examples, model)
    batch_size = kwargs.get("batch_size", 4)
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=kwargs.get("shuffle", True),
        batch_size=batch_size,
    )
    # CONSISTENCY FIX: warm-up previously read kwargs["train_batch_size"]
    # while the dataloader read kwargs["batch_size"]; both now use the same
    # effective batch size. 10% of train data for warm-up.
    warmup_steps = math.ceil(
        len(train_examples) * kwargs.get("num_epochs", 3) / batch_size * 0.1)
    train_loss = loss.value(model)
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=kwargs.get("num_epochs", 3),
        evaluation_steps=kwargs.get("evaluation_steps", 500),
        warmup_steps=warmup_steps,
        output_path=output_path,
        evaluator=evaluator,
    )
def construct_train_samples(json_obj, neg_random=False):
    """Build shuffled question/answer training samples.

    Positive (q, a) pairs from 'pos_tuple_set' get label 1; pairs from
    'neg_tuple_set' get label 0. When neg_random is True, each negative
    answer is replaced by a uniformly random answer from the dictionary.

    NOTE(review): reconstructed from whitespace-mangled source; `label = 0`
    is assumed to apply to all negatives regardless of neg_random — confirm.
    """
    questions = json_obj["index_question_dict"]
    answers = json_obj["index_answer_dict"]
    train_samples = []

    # positives: label 1
    for pair in json_obj["pos_tuple_set"]:
        q = questions[pair[0]]
        a = answers[pair[1]]
        train_samples.append(InputExample(texts=[q, a], label=1))

    if neg_random:
        neg_len = len(answers)

    # negatives: label 0
    for pair in json_obj["neg_tuple_set"]:
        q = questions[pair[0]]
        if neg_random:
            a = answers[np.random.randint(0, neg_len)]
        else:
            a = answers[pair[1]]
        train_samples.append(InputExample(texts=[q, a], label=0))

    # shuffle via a random permutation of indices
    shuffled_order = np.random.permutation(np.arange(len(train_samples)))
    return [train_samples[idx] for idx in shuffled_order]
def create_linked_posts(fl, data_dir, model, validate=None, is_test=False):
    """Build binary 'linked posts' similarity data from JSON file `fl`.

    Pairs with class 'relevant' get label 1, all others 0. Relies on
    module-level globals: max_size, linked_posts_str, num_epochs,
    batch_size, evaluation_steps. Returns the raw examples when
    is_test=True, otherwise (dataloader, loss, evaluator, warmup_steps).
    """
    train_linked_posts = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
    for obj in data:
        # binary label: 1 for 'relevant' pairs, 0 otherwise
        if obj['class'] == 'relevant':
            label = 1
        else:
            label = 0
        disbn.append(label)
        train_linked_posts.append(
            InputExample(texts=[obj['text_1'], obj['text_2']], label=label))
    # NOTE(review): samples are shuffled (and possibly truncated below) but
    # `disbn` is not, so the stratify labels passed to train_test_split no
    # longer align with the samples — and a max_size truncation makes the
    # lengths differ, which raises in sklearn. Likely a bug; confirm.
    random.shuffle(train_linked_posts)
    if is_test:
        return train_linked_posts
    if max_size:
        train_linked_posts = train_linked_posts[:max_size]
    evaluator = None
    # build a held-out evaluator only when this task is the validation target
    if linked_posts_str == validate:
        train_linked_posts, dev_linked_posts = train_test_split(
            train_linked_posts, stratify=disbn, test_size=0.1)
        evaluator = BinaryClassificationEvaluator.from_input_examples(
            dev_linked_posts, name='linked-posts')
    warmup_steps = math.ceil(
        len(train_linked_posts) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up
    train_data_linked_posts = SentencesDataset(train_linked_posts,
                                               model=model)
    train_dataloader_linked_posts = DataLoader(train_data_linked_posts,
                                               shuffle=True,
                                               batch_size=batch_size)
    train_loss_linked_posts = losses.ContrastiveLoss(model=model)
    print('L: Number of training examples: ', len(train_linked_posts))
    global evaluation_steps
    # NOTE(review): `/ 0.1` multiplies the length by 10; if "every 10% of
    # the data" was intended, this should be `* 0.1` — confirm.
    evaluation_steps = math.ceil(len(train_linked_posts) / 0.1)
    return train_dataloader_linked_posts, train_loss_linked_posts, evaluator, warmup_steps
def create_train_usage(fl, data_dir, model, validate=None, is_test=False):
    """Build 'usage distance' similarity examples from JSON file `fl`.

    Distances are min-max normalized and inverted so the closest class pair
    gets label 1.0 and the farthest 0.0. Relies on module-level globals:
    max_size, usage_str, num_epochs, batch_size, evaluation_steps. Returns
    the raw examples when is_test=True, otherwise
    (dataloader, loss, evaluator, warmup_steps).
    """
    train_usage = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
    # find the distance range for min-max normalization
    min_d = 10000000
    max_d = 0
    for obj in data:
        dist = obj['distance']
        if dist < min_d:
            min_d = dist
        if dist > max_d:
            max_d = dist
    for obj in data:
        # invert + normalize: closest classes -> 1.0, farthest -> 0.0
        # NOTE(review): raises ZeroDivisionError when all distances are
        # equal (max_d == min_d) — confirm input guarantees.
        dist = (max_d - obj['distance']) / (max_d - min_d)
        train_usage.append(
            InputExample(texts=[obj['class1'], obj['class2']], label=dist))
    random.shuffle(train_usage)
    if is_test:
        return train_usage
    if max_size:
        train_usage = train_usage[:max_size]
    evaluator = None
    # build a held-out evaluator only when this task is the validation target
    if usage_str == validate:
        train_usage, dev_usage = train_test_split(train_usage, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_usage, name='usage')
    warmup_steps = math.ceil(len(train_usage) * num_epochs / batch_size *
                             0.1)  # 10% of train data for warm-up
    train_data_usage = SentencesDataset(train_usage, model=model)
    train_dataloader_usage = DataLoader(train_data_usage,
                                        shuffle=True,
                                        batch_size=batch_size)
    train_loss_usage = losses.CosineSimilarityLoss(model=model)
    print('U: Number of training examples: ', len(train_usage))
    global evaluation_steps
    # NOTE(review): `/ 0.1` multiplies the length by 10; if "every 10% of
    # the data" was intended, this should be `* 0.1` — confirm.
    evaluation_steps = math.ceil(len(train_usage) / 0.1)
    return train_dataloader_usage, train_loss_usage, evaluator, warmup_steps