def do_test(pt_file, model_name, n):
    # Read the first n passages (2nd tab-separated column) from the paratext file
    text = []
    i = 0
    with open(pt_file, 'r', encoding='utf8') as f:
        for l in f:
            text.append(l.split('\t')[1])
            i += 1
            if i >= n:
                break
    psg_word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    psg_pooling_model = models.Pooling(psg_word_embedding_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=False,
                                       pooling_mode_max_tokens=False)
    psg_dense_model = models.Dense(in_features=psg_pooling_model.get_sentence_embedding_dimension(),
                                   out_features=256,
                                   activation_function=nn.Tanh())
    psg_model = CustomSentenceTransformer(modules=[psg_word_embedding_model, psg_pooling_model, psg_dense_model])
    if torch.cuda.is_available():
        psg_model.to(torch.device('cuda'))
    psg_features = []
    print('Tokenizing')
    for p in text:
        psg_tkn = psg_model.tokenize(p)
        if torch.cuda.is_available():
            batch_to_device(psg_tkn, torch.device('cuda'))
        psg_features.append(psg_tkn)
    psg_embs = []
    print('Embedding')
    for pfet in psg_features:
        psg_emb = psg_model(pfet)['sentence_embedding']
        psg_emb = psg_emb.to(torch.device('cpu'))  # Tensor.to() is not in-place; keep the returned CPU copy
        psg_embs.append(psg_emb)
    print(psg_embs[:10])
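
# Illustrative sketch (not part of the original code): one way do_test might be invoked.
# The paratext path and sample count are placeholders, assuming a tab-separated file whose
# second column holds the passage text, as in the TREC CAR paratext files used elsewhere here.
def _example_do_test():
    do_test('by1test_paratext/by1test_paratext.tsv', 'distilbert-base-uncased', n=100)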
def run_fixed_lambda_bbcluster(train_cluster_data, val_cluster_data, test_cluster_data, output_path, train_batch_size,
                               eval_steps, num_epochs, warmup_frac, lambda_val, reg, beta, loss_name, use_model_device,
                               model_name='distilbert-base-uncased', out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))

    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())
    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    GPUtil.showUtilization()
    if loss_name == 'bbspec':
        loss_model = BBSpectralClusterLossModel(model=model, device=device,
                                                lambda_val=config_dict.get('lambda_val', lambda_val),
                                                reg_const=config_dict.get('reg', reg), beta=beta)
    else:
        loss_model = BBClusterLossModel(model=model, device=device,
                                        lambda_val=config_dict.get('lambda_val', lambda_val),
                                        reg_const=config_dict.get('reg', reg))
    # reg_loss_model = ClusterDistLossModel(model=model)
    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    GPUtil.showUtilization()
    # train_dataloader2 = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)
    GPUtil.showUtilization()
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data
    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)
    GPUtil.showUtilization()

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
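
# Illustrative sketch (not part of the original code): how run_fixed_lambda_bbcluster might be
# called. The hyperparameter values and output path below are placeholders, and the cluster
# datasets are assumed to be prepared elsewhere in this repo.
def _example_run_fixed_lambda_bbcluster(train_cluster_data, val_cluster_data, test_cluster_data):
    run_fixed_lambda_bbcluster(train_cluster_data, val_cluster_data, test_cluster_data,
                               output_path='output/bbc_fixed_lambda', train_batch_size=1,
                               eval_steps=100, num_epochs=1, warmup_frac=0.1,
                               lambda_val=200.0, reg=2.5, beta=10.0, loss_name='bb',
                               use_model_device=True)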
def run_binary_model(train_pairs, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps,
                     num_epochs, warmup_frac, use_model_device, model_name='distilbert-base-uncased',
                     out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_pairs')
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))

    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())
    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
    train_dataloader = DataLoader(train_pairs, shuffle=True, batch_size=train_batch_size)
    train_loss = BinaryLoss(model=model)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data
    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
    return dist_mat


parser = argparse.ArgumentParser(description='Eval treccar experiments')
parser.add_argument('-ip', '--input_dir', default='~/trec_dataset')
parser.add_argument('-lv', '--level', default='top')
parser.add_argument('-pg', '--page_title')
parser.add_argument('-mp', '--model_path')
parser.add_argument('-out', '--outdict')
args = parser.parse_args()
input_dir = args.input_dir
level = args.level
page = args.page_title
model_path = args.model_path
outpath = args.outdict
model = CustomSentenceTransformer(model_path)
test_art_qrels = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/test.pages.cbor-article.qrels'
test_top_qrels = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/test.pages.cbor-toplevel.qrels'
test_hier_qrels = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/test.pages.cbor-hierarchical.qrels'
test_paratext = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/by1test_paratext/by1test_paratext.tsv'
test_top_cluster_data, test_hier_cluster_data = prepare_cluster_data2(test_art_qrels, test_top_qrels, test_hier_qrels,
                                                                      test_paratext, False, -1, 0)
if level == 'top':
    test_cluster_data = test_top_cluster_data
else:
    test_cluster_data = test_hier_cluster_data
emb_dict = {}
for sample in test_cluster_data:
    print(sample.qid)
mean_rand_tf = np.mean(np.array(rand_scores_tf))
mean_nmi_tf = np.mean(np.array(nmi_scores_tf))
mean_ami_tf = np.mean(np.array(ami_scores_tf))
mean_urand_tf = np.mean(np.array(urand_scores_tf))
print('TFIDF')
print("\nRAND: %.5f, NMI: %.5f, AMI: %.5f, URAND: %.5f\n" %
      (mean_rand_tf, mean_nmi_tf, mean_ami_tf, mean_urand_tf), flush=True)
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                               out_features=256,
                               activation_function=nn.Tanh())
raw_model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
anchor_rand, anchor_nmi, anchor_ami, anchor_urand = [], [], [], []
for i in range(len(model_paths)):
    mp = model_paths[i]
    m = CustomSentenceTransformer(mp)
    print('Model: ' + mp.split('/')[-1])
    if i == 0:
        print('This is the anchor model for paired ttest')
        anchor_rand, anchor_nmi, anchor_ami, anchor_urand = get_eval_scores(m, test_cluster_data)
    else:
        mean_rand, mean_nmi, mean_ami, mean_urand = get_eval_scores(m, test_cluster_data, anchor_rand, anchor_nmi,
                                                                    anchor_ami, anchor_urand)
mean_rand, mean_nmi, mean_ami, mean_urand = get_eval_scores(raw_model, test_cluster_data, anchor_rand, anchor_nmi,
                                                            anchor_ami, anchor_urand)
# Paired t-tests of the anchor model's per-query scores against the TFIDF baseline scores
rand_ttest_tf, nmi_ttest_tf, ami_ttest_tf, urand_ttest_tf = (ttest_rel(anchor_rand, rand_scores_tf),
                                                             ttest_rel(anchor_nmi, nmi_scores_tf),
                                                             ttest_rel(anchor_ami, ami_scores_tf),
                                                             ttest_rel(anchor_urand, urand_scores_tf))
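
# Illustrative sketch (not part of the original code): scipy.stats.ttest_rel returns a result
# with `statistic` and `pvalue` fields, so the paired-test outcomes computed above could be
# reported like this (variable names follow the script above).
def _report_ttest_vs_tfidf():
    for name, res in [('RAND', rand_ttest_tf), ('NMI', nmi_ttest_tf),
                      ('AMI', ami_ttest_tf), ('URAND', urand_ttest_tf)]:
        print('%s vs TFIDF: t=%.4f, p=%.4f' % (name, res.statistic, res.pvalue))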
class QuerySpecificClusterModel(nn.Module):

    def __init__(self, path: str = None, query_transformer: CustomSentenceTransformer = None,
                 psg_transformer: CustomSentenceTransformer = None, device: torch.device = None):
        super(QuerySpecificClusterModel, self).__init__()
        if path is not None:
            self.query_model = CustomSentenceTransformer(path + '/query_model')
            self.psg_model = CustomSentenceTransformer(path + '/psg_model')
        else:
            self.query_model = query_transformer
            self.psg_model = psg_transformer
        self.optim = OptimCluster
        self.device = device

    def save(self, path):
        self.query_model.save(path + '/query_model')
        self.psg_model.save(path + '/psg_model')

    def _get_scheduler(self, optimizer, scheduler: str, warmup_steps: int, t_total: int):
        """
        Taken from SentenceTransformers
        Returns the correct learning rate scheduler
        """
        scheduler = scheduler.lower()
        if scheduler == 'constantlr':
            return transformers.get_constant_schedule(optimizer)
        elif scheduler == 'warmupconstant':
            return transformers.get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
        elif scheduler == 'warmuplinear':
            return transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                                                num_training_steps=t_total)
        elif scheduler == 'warmupcosine':
            return transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                                                num_training_steps=t_total)
        elif scheduler == 'warmupcosinewithhardrestarts':
            return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer,
                                                                                   num_warmup_steps=warmup_steps,
                                                                                   num_training_steps=t_total)
        else:
            raise ValueError("Unknown scheduler {}".format(scheduler))

    def query_batch_collate_fn(self, batch):
        num_texts = len(batch[0].texts)
        queries = []
        texts = [[] for _ in range(num_texts)]
        labels = []
        for example in batch:
            queries.append(example.q_context)
            for idx, text in enumerate(example.texts):
                texts[idx].append(text)
            labels.append(example.label)
        labels = torch.tensor(labels).to(self.device)
        q_tokenized = self.query_model.tokenize(queries)
        batch_to_device(q_tokenized, self.device)
        psg_features = []
        for idx in range(num_texts):
            p_tokenized = self.psg_model.tokenize(texts[idx])
            batch_to_device(p_tokenized, self.device)
            psg_features.append(p_tokenized)
        return q_tokenized, psg_features, labels

    def forward(self, query_feature: Dict[str, Tensor], passage_features: Iterable[Dict[str, Tensor]],
                labels: Tensor):
        n = labels.shape[1]
        query_embedding = self.query_model(query_feature)['sentence_embedding']
        # The query embedding acts as a scaling vector, so each of its elements should lie in [0, 1]
        # (the query model's final Dense layer uses a sigmoid activation)
        psg_embeddings = torch.stack([self.psg_model(passages)['sentence_embedding']
                                      for passages in passage_features], dim=1)
        scaled_psg_embeddings = torch.tile(query_embedding.unsqueeze(1), (1, n, 1)) * psg_embeddings
        return scaled_psg_embeddings
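
# Illustrative sketch (not part of the original code): the scaling performed in
# QuerySpecificClusterModel.forward. The query embedding (sigmoid-activated, values in [0, 1])
# element-wise scales every passage embedding for that query. Shapes assume a batch of B queries
# with n passages each and an embedding width of d; the random tensors stand in for model outputs.
def _example_query_scaling():
    B, n, d = 2, 5, 256
    query_embedding = torch.sigmoid(torch.randn(B, d))   # stand-in for query_model output, shape (B, d)
    psg_embeddings = torch.tanh(torch.randn(B, n, d))    # stand-in for psg_model outputs, shape (B, n, d)
    # Broadcast the query vector across the n passages and scale element-wise
    scaled = torch.tile(query_embedding.unsqueeze(1), (1, n, 1)) * psg_embeddings
    assert scaled.shape == (B, n, d)
    return scaled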
def train(train_cluster_data, val_cluster_data, test_cluster_data, output_path, eval_steps, num_epochs, warmup_frac,
          lambda_val, reg, use_model_device, max_train_size=-1, train_psg_model=False,
          model_name='distilbert-base-uncased', out_features=256, steps_per_epoch=None, weight_decay=0.01,
          optimizer_class=transformers.AdamW, scheduler='WarmupLinear', optimizer_params={'lr': 2e-5},
          show_progress_bar=True, max_grad_norm=1, save_best_model=True):
    tensorboard_writer = SummaryWriter('./tensorboard_logs')
    task = Task.init(project_name='Query Specific BB Clustering', task_name='query_bbc_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))

    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    query_word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    query_pooling_model = models.Pooling(query_word_embedding_model.get_word_embedding_dimension(),
                                         pooling_mode_mean_tokens=True,
                                         pooling_mode_cls_token=False,
                                         pooling_mode_max_tokens=False)
    # Sigmoid keeps every element of the query embedding in [0, 1] so it can act as a scaling vector
    query_dense_model = models.Dense(in_features=query_pooling_model.get_sentence_embedding_dimension(),
                                     out_features=out_features,
                                     activation_function=nn.Sigmoid())
    psg_word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    psg_pooling_model = models.Pooling(psg_word_embedding_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=False,
                                       pooling_mode_max_tokens=False)
    psg_dense_model = models.Dense(in_features=psg_pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())
    query_model = CustomSentenceTransformer(modules=[query_word_embedding_model, query_pooling_model,
                                                     query_dense_model])
    psg_model = CustomSentenceTransformer(modules=[psg_word_embedding_model, psg_pooling_model, psg_dense_model])
    model = QuerySpecificClusterModel(query_transformer=query_model, psg_transformer=psg_model, device=device)

    # Each batch holds one query together with all of its passages, so the dataloader batch size is fixed at 1
    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=1)
    evaluator = QueryClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = QueryClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data
    print("Untrained performance")
    model.to(device)
    evaluator(model)

    train_dataloader.collate_fn = model.query_batch_collate_fn

    # Train the model
    best_score = -9999999
    if steps_per_epoch is None or steps_per_epoch == 0:
        steps_per_epoch = len(train_dataloader)
    num_train_steps = int(steps_per_epoch * num_epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    data_iter = iter(train_dataloader)
    optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params)
    scheduler_obj = model._get_scheduler(optimizer, scheduler=scheduler, warmup_steps=warmup_steps,
                                         t_total=num_train_steps)
    config = {'epochs': num_epochs, 'steps_per_epoch': steps_per_epoch}
    global_step = 0
    loss_model = BBClusterLossModel(model, device, lambda_val, reg)
    for epoch in trange(config.get('epochs'), desc="Epoch", disable=not show_progress_bar):
        training_steps = 0
        running_loss_0 = 0.0
        model.zero_grad()
        model.train()
        if not train_psg_model:
            # Keep the passage model in eval mode (this disables dropout etc.; it does not freeze its weights)
            for m in model.psg_model.modules():
                m.training = False
        for _ in trange(config.get('steps_per_epoch'), desc="Iteration", smoothing=0.05,
                        disable=not show_progress_bar):
            try:
                data = next(data_iter)
            except StopIteration:
                data_iter = iter(train_dataloader)
                data = next(data_iter)
            query_feature, psg_features, labels = data
            if max_train_size > 0 and labels.shape[1] > max_train_size:
                print('skipping instance with ' + str(labels.shape[1]) + ' passages')
                continue
            loss_val = loss_model(query_feature, psg_features, labels)
            running_loss_0 += loss_val.item()
            loss_val.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            scheduler_obj.step()
            training_steps += 1
            global_step += 1
            if eval_steps > 0 and training_steps % eval_steps == 0:
                tensorboard_writer.add_scalar('training_loss', running_loss_0 / eval_steps, global_step)
                # logger.report_scalar('Loss', 'training_loss', iteration=global_step, value=running_loss_0/evaluation_steps)
                running_loss_0 = 0.0
                # self._eval_during_training(evaluator, output_path, save_best_model, epoch, training_steps, callback)
                if evaluator is not None:
                    score = evaluator(model, output_path=output_path, epoch=epoch, steps=training_steps)
                    tensorboard_writer.add_scalar('val_ARI', score, global_step)
                    # logger.report_scalar('Training progress', 'val_ARI', iteration=global_step, value=score)
                    if score > best_score:
                        best_score = score
                        if save_best_model:
                            print('Saving model at: ' + output_path)
                            model.save(output_path)
                model.zero_grad()
                model.train()
                if not train_psg_model:
                    for m in model.psg_model.modules():
                        m.training = False
        if evaluator is not None:
            score = evaluator(model, output_path=output_path, epoch=epoch, steps=training_steps)
            tensorboard_writer.add_scalar('val_ARI', score, global_step)
            # logger.report_scalar('Training progress', 'val_ARI', iteration=global_step, value=score)
            if score > best_score:
                best_score = score
                if save_best_model:
                    model.save(output_path)
    if test_evaluator is not None:
        best_model = QuerySpecificClusterModel(output_path)
        if torch.cuda.is_available():
            model.to(torch.device('cpu'))
            best_model.to(device)
            test_ari = test_evaluator(best_model)
            best_model.to(torch.device('cpu'))
            model.to(device)
        else:
            test_ari = test_evaluator(best_model)
        tensorboard_writer.add_scalar('test_ARI', test_ari, global_step)
        # logger.report_scalar('Training progress', 'test_ARI', iteration=global_step, value=test_ari)
    if evaluator is None and output_path is not None:
        # No evaluator, but output path: save final model version
        model.save(output_path)
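
# Illustrative sketch (not part of the original code): a minimal call to train(). The data
# variables, output path and hyperparameter values are placeholders; the defaults are kept
# for the optimizer, scheduler and gradient clipping.
def _example_train(train_cluster_data, val_cluster_data, test_cluster_data):
    train(train_cluster_data, val_cluster_data, test_cluster_data,
          output_path='output/query_bbc_fixed_lambda', eval_steps=100, num_epochs=1,
          warmup_frac=0.1, lambda_val=200.0, reg=2.5, use_model_device=True,
          max_train_size=35, train_psg_model=False)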
def _run_fixed_lambda_bbcluster(train_batch_size, num_epochs, lambda_val, reg, use_model_device, eval_steps, out_path,
                                warmup_frac=0.1, model_name='distilbert-base-uncased', out_features=256):
    exp_task = Task.create(project_name='Optuna Hyperparam optim', task_name='trial')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = exp_task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))
    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())
    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
    loss_model = BBClusterLossModel(model=model, device=device,
                                    lambda_val=config_dict.get('lambda_val', lambda_val),
                                    reg_const=config_dict.get('reg', reg))
    # train_cluster_data and val_cluster_data are expected to be available in the enclosing scope
    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data
    model.to(device)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              epochs=num_epochs,
              warmup_steps=warmup_steps,
              evaluator=evaluator,
              evaluation_steps=eval_steps,
              output_path=out_path)
    best_model = CustomSentenceTransformer(out_path)
    return evaluator(best_model)
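
# Illustrative sketch (not part of the original code): how _run_fixed_lambda_bbcluster might be
# wrapped in an Optuna objective, since it returns the validation score of the best saved model.
# The search ranges and trial count are placeholders, and train_cluster_data / val_cluster_data
# are assumed to be available in the enclosing scope, as in _run_fixed_lambda_bbcluster itself.
def _example_optuna_search(out_path):
    import optuna

    def objective(trial):
        lambda_val = trial.suggest_float('lambda_val', 10.0, 500.0)
        reg = trial.suggest_float('reg', 0.1, 10.0)
        return _run_fixed_lambda_bbcluster(train_batch_size=1, num_epochs=1, lambda_val=lambda_val,
                                           reg=reg, use_model_device=True, eval_steps=100,
                                           out_path=out_path)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=10)
    return study.best_params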