def run_fixed_lambda_bbcluster(train_cluster_data, val_cluster_data, test_cluster_data, output_path, train_batch_size,
                               eval_steps, num_epochs, warmup_frac, lambda_val, reg, beta, loss_name, use_model_device,
                               model_name='distilbert-base-uncased', out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))

    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    GPUtil.showUtilization()

    if loss_name == 'bbspec':
        loss_model = BBSpectralClusterLossModel(model=model, device=device,
                                                lambda_val=config_dict.get('lambda_val', lambda_val),
                                                reg_const=config_dict.get('reg', reg), beta=beta)
    else:
        loss_model = BBClusterLossModel(model=model, device=device,
                                        lambda_val=config_dict.get('lambda_val', lambda_val),
                                        reg_const=config_dict.get('reg', reg))
    # reg_loss_model = ClusterDistLossModel(model=model)

    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    GPUtil.showUtilization()
    # train_dataloader2 = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)
    GPUtil.showUtilization()

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)
    GPUtil.showUtilization()

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
def run():
    train_file = config.TRAINING_FILE
    train_batch = config.TRAIN_BATCH_SIZE
    valid_batch = config.VALID_BATCH_SIZE
    model_path = config.BERT_PATH
    max_length = config.MAX_LEN

    dfs = pd.read_csv(train_file, sep="\t", names=['idx', 'sent1', 'sent2', 'label'])
    dfs['label'] = pd.to_numeric(dfs["label"], downcast='float')

    df_train, df_valid = model_selection.train_test_split(
        dfs,
        test_size=0.1,
        random_state=42,
        stratify=dfs.label.values,
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    dataset_reader = dataset.Dataset()
    train_dataset = dataset_reader.read(df_train, return_pt=True)
    valid_sentence1, valid_sentence2, valid_labels = dataset_reader.read(df_valid)

    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch)
    # evaluator = evaluation.EmbeddingSimilarityEvaluator(valid_sentence1, valid_sentence2, valid_labels)
    evaluator = evaluation.BinaryClassificationEvaluator(
        valid_sentence1,
        valid_sentence2,
        valid_labels,
        batch_size=valid_batch,
        show_progress_bar=False)

    word_embedding_model = models.Transformer(model_path, max_seq_length=max_length)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=max_length,
        activation_function=nn.Tanh())
    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])

    train_loss = losses.CosineSimilarityLoss(model)

    engine.train(train_dataloader, model, train_loss, evaluator)
def define_bert_encoder():
    word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=200)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=200,
        activation_function=nn.Tanh())
    bert_model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])
    return bert_model
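# A minimal usage sketch for the encoder above (illustrative, not part of the original code; the
# example sentences are assumptions). encode() returns one vector per input sentence, and the Dense
# head projects each pooled output down to out_features=200 dimensions.
if __name__ == '__main__':
    encoder = define_bert_encoder()
    example_embeddings = encoder.encode(["A first example sentence.", "A second example sentence."])
    print(example_embeddings.shape)  # expected: (2, 200)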
def __init__(self):
    word_embedding_model = models.Transformer(
        'sentence-transformers/bert-large-nli-max-tokens', max_seq_length=256)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=256,
        activation_function=nn.Tanh())
    self.model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])

    path = 'multinli_1.0/'
    self.MNLI_train_path = path + 'multinli_1.0_train.txt'
    self.MNLI_matched_test_path = path + 'multinli_1.0_dev_matched.txt'
    self.MNLI_mismatched_test_path = path + 'multinli_1.0_dev_mismatched.txt'
def run_binary_model(train_pairs, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps,
                     num_epochs, warmup_frac, use_model_device, model_name='distilbert-base-uncased',
                     out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_pairs')
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))

    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])

    train_dataloader = DataLoader(train_pairs, shuffle=True, batch_size=train_batch_size)
    train_loss = BinaryLoss(model=model)

    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
def do_test(pt_file, model_name, n):
    text = []
    i = 0
    with open(pt_file, 'r', encoding='utf8') as f:
        for l in f:
            text.append(l.split('\t')[1])
            i += 1
            if i >= n:
                break

    psg_word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    psg_pooling_model = models.Pooling(
        psg_word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    psg_dense_model = models.Dense(
        in_features=psg_pooling_model.get_sentence_embedding_dimension(),
        out_features=256,
        activation_function=nn.Tanh())
    psg_model = CustomSentenceTransformer(
        modules=[psg_word_embedding_model, psg_pooling_model, psg_dense_model])
    if torch.cuda.is_available():
        psg_model.to(torch.device('cuda'))

    psg_features = []
    print('Tokenizing')
    for p in text:
        psg_tkn = psg_model.tokenize(p)
        if torch.cuda.is_available():
            batch_to_device(psg_tkn, torch.device('cuda'))
        psg_features.append(psg_tkn)

    psg_embs = []
    print('Embedding')
    for pfet in psg_features:
        psg_emb = psg_model(pfet)['sentence_embedding']
        # Tensor.to() is not in-place; keep the returned CPU copy
        psg_emb = psg_emb.to(torch.device('cpu'))
        psg_embs.append(psg_emb)
    print(psg_embs[:10])
# Initialize the WordWeights model. This model must be between the WordEmbeddings and the Pooling model
word_weights = models.WordWeights(vocab=vocab, word_weights=word_weights, unknown_word_weight=unknown_word_weight)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

# Add two trainable feed-forward networks (DAN)
sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension()
dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension)
dan2 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension)

model = SentenceTransformer(
    modules=[word_embedding_model, word_weights, pooling_model, dan1, dan2])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
nli_sentences = list(nli_sentences)
random.shuffle(nli_sentences)

# To determine the PCA matrix, we need some example sentence embeddings.
# Here, we compute the embeddings for 20k random sentences from the AllNLI dataset
pca_train_sentences = nli_sentences[0:20000]
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

# Compute PCA on the train embeddings matrix
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

# We add a dense layer to the model, so that it directly produces embeddings with the new size
dense = models.Dense(in_features=model.get_sentence_embedding_dimension(),
                     out_features=new_dimension,
                     bias=False,
                     activation_function=torch.nn.Identity())
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module('dense', dense)

# Evaluate the model with the reduced embedding size
logger.info("Model with {} dimensions:".format(new_dimension))
stsb_evaluator(model)

# If you like, you can store the model on disc by uncommenting the following line
# model.save('models/bert-base-nli-stsb-mean-tokens-128dim')

# You can then load the adapted model that produces 128-dimensional embeddings like this:
# model = SentenceTransformer('models/bert-base-nli-stsb-mean-tokens-128dim')
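# A hedged sanity check (illustrative, not in the original script): after add_module('dense', dense),
# encode() should emit new_dimension-sized vectors, because the appended Dense layer multiplies each
# pooled embedding by the PCA component matrix.
reduced = model.encode(["A quick check sentence."], convert_to_numpy=True)
assert reduced.shape[-1] == new_dimension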
#%% print("Setting model...") modules = [] word_embedding_model = models.Transformer(args.model, max_seq_length=128) modules.append(word_embedding_model) pooling_model = models.Pooling( word_embedding_model.get_word_embedding_dimension(), pooling_mode_cls_token=True, pooling_mode_mean_tokens=False) modules.append(pooling_model) if args.dense: if args.activation == 'tanh': dense_model = models.Dense( in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh()) elif args.activation == 'sigmod': dense_model = models.Dense( in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Sigmoid()) elif args.activation == 'relu': dense_model = models.Dense( in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.ReLU()) assert dense_model, f"unknown activation function {args.activation}" modules.append(dense_model) model = SentenceTransformer(modules=modules)
args = parser.parse_args()

base_model = args.base_model
sentence_embedding_dim = args.sentence_embedding_dim
model_save_path = args.model_save_path
batch_size = args.batch_size
epochs = args.epochs
dataset = args.dataset
task_type = args.task_type
masked = args.masked

word_embedding_model = models.Transformer(base_model, max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=sentence_embedding_dim,
    activation_function=nn.Tanh())
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, dense_model])

train_examples = ld.load_dataset(dataset_name=dataset, dataset_type='train')
train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
train_loss = losses.ContrastiveLoss(model=model)
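# Sketch of the training call that would typically follow this setup (an assumption; the fit call is
# not shown in the original excerpt). The 10% warm-up fraction is illustrative.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=epochs,
          warmup_steps=int(0.1 * len(train_dataloader) * epochs),
          output_path=model_save_path)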
def main():
    parser = argparse.ArgumentParser(description='Start training with SBERT')
    parser.add_argument('--model_path', type=str, help='Path to trained model folder ./models/[MODEL_NAME]')
    parser.add_argument('--dataset', type=str, default='few_rel', help='Name of the dataset')
    parser.add_argument('--mask_method', type=str, default='bracket', help='Type of masking')
    parser.add_argument('--num_epochs', type=int, default=15, help='Number of epochs')
    parser.add_argument('--num_samples', type=int, default=-1,
                        help='Number of samples for a test run; the default -1 means all data')
    parser.add_argument('--max_seq_length', type=int, default=256, help='Max token length for BERT')
    args = parser.parse_args()

    model_path = args.model_path
    dataset = args.dataset
    mask_method = args.mask_method
    num_samples = args.num_samples
    max_seq_length = args.max_seq_length
    num_epochs = args.num_epochs
    evaluation_steps = 1000  # frequency of evaluation results
    warmup_steps = 1000  # warm-up steps
    sentence_out_embedding_dimension = 256

    if model_path.endswith('/'):
        model_path = model_path[:-1]
    model_name = model_path.split('/')[-1]

    path_train_data = f'./data/train_samples/{dataset}_train_{mask_method}_train.csv'
    path_eval_data = f'./data/train_samples/{dataset}_val_{mask_method}_test.csv'
    if num_samples > 0:
        model_save_path = f'./trained_models/{model_name}_sbert_bi_{dataset}_test/'
    else:
        model_save_path = f'./trained_models/{model_name}_sbert_bi_{dataset}/'

    ### Define the model
    word_embedding_model = models.Transformer(model_path, max_seq_length=max_seq_length)

    ### Add special tokens - this helps us add tokens like Doc or query or Entity1 / Entity2,
    # but in our case we already added that to the model prior
    # tokens = ["[DOC]", "[QRY]"]
    # word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
    # word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                               out_features=sentence_out_embedding_dimension,
                               activation_function=nn.Tanh())

    # Model pipeline
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

    # Prep DataLoader
    train_examples = load_train_sbert(path_train_data, num_samples)
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

    # Prep Evaluator
    sentences1, sentences2, scores = load_eval_sbert(path_eval_data, num_samples)
    # evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
    evaluator = evaluation.BinaryClassificationEvaluator(sentences1, sentences2, scores)

    # train_loss = losses.CosineSimilarityLoss(model)
    train_loss = losses.SoftmaxLoss(model,
                                    sentence_embedding_dimension=sentence_out_embedding_dimension,
                                    num_labels=2)

    # Tune the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=evaluation_steps,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
def train(train_cluster_data, val_cluster_data, test_cluster_data, output_path, eval_steps, num_epochs, warmup_frac,
          lambda_val, reg, use_model_device, max_train_size=-1, train_psg_model=False,
          model_name='distilbert-base-uncased', out_features=256, steps_per_epoch=None, weight_decay=0.01,
          optimizer_class=transformers.AdamW, scheduler='WarmupLinear', optimizer_params={'lr': 2e-5},
          show_progress_bar=True, max_grad_norm=1, save_best_model=True):
    tensorboard_writer = SummaryWriter('./tensorboard_logs')
    task = Task.init(project_name='Query Specific BB Clustering', task_name='query_bbc_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))

    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    query_word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    query_pooling_model = models.Pooling(query_word_embedding_model.get_word_embedding_dimension(),
                                         pooling_mode_mean_tokens=True,
                                         pooling_mode_cls_token=False,
                                         pooling_mode_max_tokens=False)
    query_dense_model = models.Dense(in_features=query_pooling_model.get_sentence_embedding_dimension(),
                                     out_features=out_features,
                                     activation_function=nn.Sigmoid())

    psg_word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    psg_pooling_model = models.Pooling(psg_word_embedding_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=False,
                                       pooling_mode_max_tokens=False)
    psg_dense_model = models.Dense(in_features=psg_pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())

    query_model = CustomSentenceTransformer(modules=[query_word_embedding_model, query_pooling_model,
                                                     query_dense_model])
    psg_model = SentenceTransformer(modules=[psg_word_embedding_model, psg_pooling_model, psg_dense_model])
    model = QuerySpecificClusterModel(query_transformer=query_model, psg_transformer=psg_model, device=device)

    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=1)
    evaluator = QueryClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = QueryClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    print("Untrained performance")
    model.to(device)
    evaluator(model)

    train_dataloader.collate_fn = model.query_batch_collate_fn

    # Train the model
    best_score = -9999999
    if steps_per_epoch is None or steps_per_epoch == 0:
        steps_per_epoch = len(train_dataloader)
    num_train_steps = int(steps_per_epoch * num_epochs)
    param_optimizer = list(model.named_parameters())

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    data_iter = iter(train_dataloader)
    optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params)
    scheduler_obj = model._get_scheduler(optimizer, scheduler=scheduler,
                                         warmup_steps=warmup_steps, t_total=num_train_steps)
    config = {'epochs': num_epochs, 'steps_per_epoch': steps_per_epoch}
    global_step = 0
    loss_model = BBClusterLossModel(model, device, lambda_val, reg)
    for epoch in trange(config.get('epochs'), desc="Epoch", disable=not show_progress_bar):
        training_steps = 0
        running_loss_0 = 0.0
        model.zero_grad()
        model.train()
        if not train_psg_model:
            for m in model.psg_model.modules():
                m.training = False
        for _ in trange(config.get('steps_per_epoch'), desc="Iteration", smoothing=0.05,
                        disable=not show_progress_bar):
            try:
                data = next(data_iter)
            except StopIteration:
                data_iter = iter(train_dataloader)
                data = next(data_iter)
            query_feature, psg_features, labels = data
            if max_train_size > 0 and labels.shape[1] > max_train_size:
                print('skipping instance with ' + str(labels.shape[1]) + ' passages')
                continue
            loss_val = loss_model(query_feature, psg_features, labels)
            running_loss_0 += loss_val.item()
            loss_val.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            scheduler_obj.step()
            training_steps += 1
            global_step += 1
            if eval_steps > 0 and training_steps % eval_steps == 0:
                tensorboard_writer.add_scalar('training_loss', running_loss_0 / eval_steps, global_step)
                # logger.report_scalar('Loss', 'training_loss', iteration=global_step,
                #                      value=running_loss_0 / evaluation_steps)
                running_loss_0 = 0.0
                # self._eval_during_training(evaluator, output_path, save_best_model, epoch, training_steps, callback)
                if evaluator is not None:
                    score = evaluator(model, output_path=output_path, epoch=epoch, steps=training_steps)
                    tensorboard_writer.add_scalar('val_ARI', score, global_step)
                    # logger.report_scalar('Training progress', 'val_ARI', iteration=global_step, value=score)
                    if score > best_score:
                        best_score = score
                        if save_best_model:
                            print('Saving model at: ' + output_path)
                            model.save(output_path)
                model.zero_grad()
                model.train()
                if not train_psg_model:
                    for m in model.psg_model.modules():
                        m.training = False
        if evaluator is not None:
            score = evaluator(model, output_path=output_path, epoch=epoch, steps=training_steps)
            tensorboard_writer.add_scalar('val_ARI', score, global_step)
            # logger.report_scalar('Training progress', 'val_ARI', iteration=global_step, value=score)
            if score > best_score:
                best_score = score
                if save_best_model:
                    model.save(output_path)
        if test_evaluator is not None:
            best_model = QuerySpecificClusterModel(output_path)
            if torch.cuda.is_available():
                model.to(torch.device('cpu'))
                best_model.to(device)
                test_ari = test_evaluator(best_model)
                best_model.to(torch.device('cpu'))
                model.to(device)
            else:
                test_ari = test_evaluator(best_model)
            tensorboard_writer.add_scalar('test_ARI', test_ari, global_step)
            # logger.report_scalar('Training progress', 'test_ARI', iteration=global_step, value=test_ari)
    if evaluator is None and output_path is not None:
        # No evaluator, but an output path was given: save the final model version
        model.save(output_path)
                             datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

# Check if the dataset exists. If not, download and extract it
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=32)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense = models.Dense(pooling_model.get_sentence_embedding_dimension(),
                     pooling_model.get_sentence_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# We use 1 million sentences from Wikipedia to train our model
wikipedia_dataset_path = 'datasets/wiki1m_for_simcse.txt'
if not os.path.exists(wikipedia_dataset_path):
    util.http_get(
        'https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt',
        wikipedia_dataset_path)

# train_samples is a list of InputExample objects where we pass the same sentence twice to texts, i.e. texts=[sent, sent]
train_samples = []
with open(wikipedia_dataset_path, 'r', encoding='utf8') as fIn:
    for line in fIn:
        train_samples.append(InputExample(texts=[line.strip(), line.strip()]))
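# Hedged sketch of how these identical-pair samples are typically consumed for SimCSE-style training
# (an assumption about the rest of this script, not a quote from it): MultipleNegativesRankingLoss
# treats the duplicated texts as positives and all other in-batch sentences as negatives.
from torch.utils.data import DataLoader
from sentence_transformers import losses

simcse_dataloader = DataLoader(train_samples, shuffle=True, batch_size=64)
simcse_loss = losses.MultipleNegativesRankingLoss(model)
# model.fit(train_objectives=[(simcse_dataloader, simcse_loss)], epochs=1, show_progress_bar=True)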
def main(opt):
    ArgumentParser.validate_preprocess_args(opt)
    torch.manual_seed(opt.seed)
    if not (opt.overwrite):
        check_existing_pt_files(opt)

    init_logger(opt.log_file)
    shutil.copy2(opt.config, os.path.dirname(opt.log_file))
    logger.info(opt)

    logger.info("Extracting features...")

    # Prepares the document embedding to initialize memory vectors.
    word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=256,
        activation_function=nn.Tanh())
    embedder = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])
    # embedder = SentenceTransformer('bert-base-nli-mean-tokens')

    kpcorpus = []
    files_path = [
        # 'data/keyphrase/json/kp20k/kp20k_train.json',
        # 'data/keyphrase/json/kp20k/kp20k_valid.json',
        # 'data/keyphrase/json/kp20k/kp20k_test.json',
        # 'data/keyphrase/json/inspec/inspec_valid.json',
        # 'data/keyphrase/json/inspec/inspec_test.json',
        # 'data/keyphrase/json/krapivin/krapivin_valid.json',
        # 'data/keyphrase/json/krapivin/krapivin_test.json',
        # 'data/keyphrase/json/nus/split/nus_valid.json',
        # 'data/keyphrase/json/nus/split/nus_test.json',
        # 'data/keyphrase/json/semeval/semeval_valid.json',
        # 'data/keyphrase/json/semeval/semeval_test.json',
        # 'data/keyphrase/json/duc/split/duc_valid.json',
        # 'data/keyphrase/json/duc/split/duc_test.json',
        'data/keyphrase/json/twitter_conv/twitter_conv_valid.json',
        'data/keyphrase/json/twitter_conv/twitter_conv_train.json',
        'data/keyphrase/json/twitter_conv/twitter_conv_test.json'
    ]
    for file_path in files_path:
        file = open(file_path, 'r')
        for line in file.readlines():
            dic = json.loads(line)
            # print(dic)
            kpcorpus.append(dic['title'] + ' ' + dic['abstract'])
    # print(kpcorpus)

    num_of_example = len(kpcorpus)
    print("number of examples in corpus: ", num_of_example)

    time_a = time.time()
    corpus_embeddings = embedder.encode(kpcorpus[:num_of_example])
    print("elapsed time: ", time.time() - time_a)

    alldocs_emb = torch.Tensor(corpus_embeddings)
    torch.save(alldocs_emb, './data/alldocs_emb')

    src_nfeats = 0
    tgt_nfeats = 0
    for src, tgt in zip(opt.train_src, opt.train_tgt):
        src_nfeats += count_features(src) if opt.data_type == 'text' else 0
        tgt_nfeats += count_features(tgt)  # tgt always text so far
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(
        opt.data_type, src_nfeats, tgt_nfeats,
        dynamic_dict=opt.dynamic_dict,
        src_truncate=opt.src_seq_length_trunc,
        tgt_truncate=opt.tgt_seq_length_trunc)

    src_reader = inputters.str2reader[opt.data_type].from_opt(opt)
    tgt_reader = inputters.str2reader[opt.data_type].from_opt(opt)

    logger.info("Building & saving training data...")
    build_save_dataset('train', fields, src_reader, tgt_reader, opt)

    if opt.valid_src and opt.valid_tgt:
        logger.info("Building & saving validation data...")
        build_save_dataset('valid', fields, src_reader, tgt_reader, opt)
def _run_fixed_lambda_bbcluster(train_batch_size, num_epochs, lambda_val, reg, use_model_device, eval_steps,
                                out_path, warmup_frac=0.1, model_name='distilbert-base-uncased', out_features=256):
    exp_task = Task.create(project_name='Optuna Hyperparam optim', task_name='trial')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = exp_task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))

    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    doc_dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=out_features,
        activation_function=nn.Tanh())
    model = CustomSentenceTransformer(
        modules=[word_embedding_model, pooling_model, doc_dense_model])
    loss_model = BBClusterLossModel(model=model, device=device,
                                    lambda_val=config_dict.get('lambda_val', lambda_val),
                                    reg_const=config_dict.get('reg', reg))

    # train_cluster_data and val_cluster_data are expected to be available in the enclosing scope
    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    model.to(device)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              epochs=num_epochs,
              warmup_steps=warmup_steps,
              evaluator=evaluator,
              evaluation_steps=eval_steps,
              output_path=out_path)

    best_model = CustomSentenceTransformer(out_path)
    return evaluator(best_model)
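# Hypothetical Optuna objective wiring for the trial runner above (a sketch, not part of the original
# code): the parameter names, search ranges, and fixed arguments below are illustrative assumptions.
import optuna

def objective(trial):
    lambda_val = trial.suggest_float('lambda_val', 0.01, 1.0)
    reg = trial.suggest_float('reg', 0.0, 1.0)
    return _run_fixed_lambda_bbcluster(train_batch_size=2, num_epochs=1, lambda_val=lambda_val, reg=reg,
                                       use_model_device=True, eval_steps=50, out_path='./optuna_trial')

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=20)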