def train(self, train_examples, dev_examples, dir_path=None):
    train_examples = SentencesDataset(train_examples, self.model)
    dev_examples = SentencesDataset(dev_examples, self.model)
    train_dataloader = DataLoader(train_examples,
                                  shuffle=True,
                                  batch_size=self.args.train_batch_size)
    dev_dataloader = DataLoader(dev_examples,
                                shuffle=False,
                                batch_size=self.args.eval_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=self.model)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
    warmup_steps = math.ceil(
        len(train_examples) * self.args.num_train_epochs /
        self.args.train_batch_size * self.args.warmup_proportion)
    self.model.zero_grad()
    self.model.train()
    self.model.fit(train_objectives=[(train_dataloader, train_loss)],
                   evaluator=evaluator,
                   epochs=self.args.num_train_epochs,
                   evaluation_steps=10000,
                   warmup_steps=warmup_steps,
                   output_path=None,
                   optimizer_params={
                       'lr': self.args.learning_rate,
                       'eps': 1e-6,
                       'correct_bias': False
                   })

def train(self, train_df, eval_df):
    """
    :param train_df: dataframe with columns 'text_a', 'text_b', 'labels'
    :param eval_df: dataframe with columns 'text_a', 'text_b', 'labels'
    :return:
    """
    # format training data
    if "text_a" in train_df.columns and "text_b" in train_df.columns and "labels" in train_df.columns:
        if self.args.do_lower_case:
            train_df.loc[:, 'text_a'] = train_df['text_a'].str.lower()
            train_df.loc[:, 'text_b'] = train_df['text_b'].str.lower()
        train_examples = [
            InputExample(str(i), [text_a, text_b], label)
            for i, (text_a, text_b, label) in enumerate(
                zip(
                    train_df["text_a"].astype(str),
                    train_df["text_b"].astype(str),
                    train_df["labels"].astype(float),
                ))
        ]
    else:
        raise KeyError(
            'Training data processing - Required columns not found!')

    # format evaluation data (check the eval dataframe, not the train dataframe)
    if "text_a" in eval_df.columns and "text_b" in eval_df.columns and "labels" in eval_df.columns:
        if self.args.do_lower_case:
            eval_df.loc[:, 'text_a'] = eval_df['text_a'].str.lower()
            eval_df.loc[:, 'text_b'] = eval_df['text_b'].str.lower()
        evaluator = evaluation.EmbeddingSimilarityEvaluator(
            list(eval_df["text_a"]),
            list(eval_df["text_b"]),
            list(eval_df["labels"]),
            batch_size=self.args.eval_batch_size)
    else:
        raise KeyError(
            'Evaluation data processing - Required columns not found!')

    # Define the train dataloader and the train loss
    train_dataloader = DataLoader(train_examples,
                                  shuffle=True,
                                  batch_size=self.args.train_batch_size)
    train_loss = losses.CosineSimilarityLoss(self.model)

    # Tune the model
    self.model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=self.args.num_train_epochs,
        warmup_steps=self.args.warmup_steps,
        optimizer_params={'lr': self.args.learning_rate},
        weight_decay=self.args.weight_decay,
        evaluator=evaluator,
        evaluation_steps=self.args.evaluate_during_training_steps,
        max_grad_norm=self.args.max_grad_norm,
        output_path=self.args.best_model_dir,
        show_progress_bar=self.args.show_progress_bar)

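# Hypothetical usage sketch (not part of the original code): the dataframes passed to
# train() above are assumed to follow the column layout named in its docstring --
# 'text_a', 'text_b', and a float 'labels' column, typically in [0, 1].
import pandas as pd

train_df = pd.DataFrame({
    "text_a": ["A man is eating food.", "A plane is taking off."],
    "text_b": ["A man eats something.", "A dog runs in the park."],
    "labels": [0.9, 0.1],
})
eval_df = train_df.copy()  # in practice, use a held-out split
# wrapper.train(train_df, eval_df)  # 'wrapper' is an instance of the enclosing class (name assumed)
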
def get_loss(loss_type, model):
    if loss_type == 'BatchAllTripletLoss':
        return losses.BatchAllTripletLoss(model=model)
    if loss_type == 'BatchHardSoftMarginTripletLoss':
        return losses.BatchHardSoftMarginTripletLoss(model=model)
    if loss_type == 'BatchHardTripletLoss':
        return losses.BatchHardTripletLoss(model=model)
    if loss_type == 'BatchSemiHardTripletLoss':
        return losses.BatchSemiHardTripletLoss(model=model)
    if loss_type == 'ContrastiveLoss':
        return losses.ContrastiveLoss(model=model)
    if loss_type == 'CosineSimilarityLoss':
        return losses.CosineSimilarityLoss(model=model)
    if loss_type == 'MegaBatchMarginLoss':
        return losses.MegaBatchMarginLoss(model=model)
    if loss_type == 'MultipleNegativesRankingLoss':
        return losses.MultipleNegativesRankingLoss(model=model)
    if loss_type == 'OnlineContrastiveLoss':
        return losses.OnlineContrastiveLoss(model=model)
    raise ValueError('Invalid loss type')

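# Usage sketch (assumption, not from the original source): get_loss() is a plain
# string-to-loss dispatcher, so a caller only needs a loaded SentenceTransformer
# and one of the configured loss names. The checkpoint name is an example.
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer('all-MiniLM-L6-v2')  # assumed example checkpoint
train_loss = get_loss('CosineSimilarityLoss', model)
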
def create_posts_ranking(fl, data_dir, model, validate=None, is_test=False):
    train_posts_ranking = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
    for obj in data:
        answers = obj['answers']
        filtered_answers = []
        votes = 1000000
        for answer in answers:
            my_votes = answer['a_votes']
            if my_votes < votes:
                votes = my_votes
                filtered_answers.append(answer)
        if len(filtered_answers) > 1:
            rank = len(filtered_answers)
            for answer in filtered_answers:
                dist = rank / len(filtered_answers)
                disbn.append(answer['a_rank'])
                rank = rank - 1
                train_posts_ranking.append(
                    InputExample(texts=[obj['q_text'], answer['a_text']],
                                 label=dist))
    random.shuffle(train_posts_ranking)
    print("data size " + str(len(train_posts_ranking)))
    if is_test:
        return train_posts_ranking
    if max_size:
        train_posts_ranking = train_posts_ranking[:max_size]
    evaluator = None
    if posts_rank_str == validate:
        train_posts_ranking, dev_posts_ranking = train_test_split(
            train_posts_ranking, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_posts_ranking, name='posts ranking')
    warmup_steps = math.ceil(
        len(train_posts_ranking) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
    train_data_posts_ranking = SentencesDataset(train_posts_ranking, model=model)
    train_dataloader_posts_ranking = DataLoader(train_data_posts_ranking,
                                                shuffle=True,
                                                batch_size=batch_size)
    train_loss_posts_ranking = losses.CosineSimilarityLoss(model=model)
    print('R: Number of training examples: ', len(train_posts_ranking))
    global evaluation_steps
    evaluation_steps = math.ceil(len(train_posts_ranking) / 0.1)
    return train_dataloader_posts_ranking, train_loss_posts_ranking, evaluator, warmup_steps

def dev_config(sts_reader, model, batch_size):
    '''dev dataloader and model'''
    logger.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                                model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
    dev_loss = losses.CosineSimilarityLoss(model=model)
    dev_evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
    return dev_loss, dev_dataloader, dev_evaluator

def train_config(sts_reader, model, batch_size):
    '''train dataloader and model.'''
    logger.info("Read STSbenchmark train dataset")
    train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)
    train_evaluator = EmbeddingSimilarityEvaluator(train_dataloader)
    return train_data, train_loss, train_dataloader, train_evaluator

def main():
    parser = set_parser()
    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.info("arguments are parsed")
    args.world_size = args.gpus * args.nodes
    patent_reader = PatentDataReader(args.data_dir, normalize_scores=True)

    # Use BERT for mapping tokens to embeddings
    logger.warning("Loading Bert Model")
    word_embedding_model = models.BERT('bert-base-uncased', max_seq_length=510)
    logger.warning("Model is loaded")

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    if args.use_tpu:
        logger.warning("TPU training")
        device = xm.xla_device()
        args.n_gpu = 1
    elif args.local_rank == -1:
        logger.warning("Non dist training")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        logger.warning("Dist training local rank %s", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", timeout=datetime.timedelta(hours=10))
        args.n_gpu = 1
    args.device = device

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_loss = losses.CosineSimilarityLoss(model=model)
    model.to(args.device)
    train_loss.to(args.device)

    # Training
    if args.do_train:
        logger.warning("Read Patent Training dataset")
        train_data = load_and_cache_examples(args, patent_reader, model)
        logger.warning("Training dataset is loaded")
        # train_data = SentencesDataset(patent_reader.get_examples('train.tsv', max_examples=17714), model)
        tr_loss = train(args, train_data, model, train_loss)
        logger.info(" average loss = %s", tr_loss)

def fit_model(trial, train_fold, val_fold, fold_index):
    print("######################")
    print("start of fold_index:", fold_index)
    print("len(train_fold)", len(train_fold))
    print("len(val_fold)", len(val_fold))

    batch_size = trial.suggest_int("train_batch_size", 4, 50)
    num_epochs = trial.suggest_int("num_epochs", 1, 4)
    lr = trial.suggest_uniform("lr", 2e-6, 2e-4)
    eps = trial.suggest_uniform("eps", 1e-7, 1e-5)
    weight_decay = trial.suggest_uniform("weight_decay", 0.001, 0.1)
    warmup_steps_mul = trial.suggest_uniform("warmup_steps_mul", 0.1, 0.5)

    model = SentenceTransformer(model_name)

    # create train dataloader
    # train_sentece_dataset = SentencesDataset(train_fold, model=model)  # this is deprecated
    train_dataloader = DataLoader(train_fold, shuffle=True, batch_size=batch_size)

    # define loss
    train_loss = losses.CosineSimilarityLoss(model=model)
    warmup_steps = math.ceil(
        len(train_fold) * num_epochs / batch_size * warmup_steps_mul)

    # Train the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=None,
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        optimizer_params={
            "lr": lr,
            "eps": eps,
            "correct_bias": False
        },
        weight_decay=weight_decay,
    )

    # evaluate the model
    val_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        val_fold, name="val_set", main_similarity=SimilarityFunction.COSINE)
    result = val_evaluator(model)
    print("######################################################")
    print("test result:", result)
    print("######################################################")
    if math.isnan(result):
        result = 0.0
    return result

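# Illustrative wiring (assumption, not from the original source): fit_model() above is an
# Optuna objective for a single cross-validation fold, so it can be driven by a study that
# maximizes the evaluator score it returns. train_folds / val_folds are assumed to be
# pre-built lists of InputExample folds.
import optuna

def objective(trial):
    scores = [
        fit_model(trial, train_folds[i], val_folds[i], i)
        for i in range(len(train_folds))
    ]
    return sum(scores) / len(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print(study.best_params)
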
def create_hirerachy_examples(fl, data_dir, model, validate=None, is_test=False):
    train_hierarchy_samples = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
    max_distance = 0
    for obj in data:
        if obj['distance'] > max_distance:
            max_distance = obj['distance']
    for obj in data:
        # flip the meaning of similarity, since the more distant the two classes,
        # the closer to 0 it should be
        dist = (max_distance - obj['distance']) / (max_distance - 1)
        train_hierarchy_samples.append(
            InputExample(texts=[obj['class1'], obj['class2']], label=dist))
        disbn.append(obj['distance'])
    random.shuffle(train_hierarchy_samples)
    train_hierarchy_samples = train_hierarchy_samples[:100000]
    disbn = disbn[:100000]
    if max_size:
        train_hierarchy_samples = train_hierarchy_samples[:max_size]
        disbn = disbn[:max_size]
    if is_test:
        return train_hierarchy_samples
    evaluator = None
    if hierarchy_str == validate:
        train_hierarchy_samples, dev_hierarchy_samples = train_test_split(
            train_hierarchy_samples, stratify=disbn, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_hierarchy_samples, name='hierarchy')
    warmup_steps = math.ceil(
        len(train_hierarchy_samples) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
    train_data_hierarchy = SentencesDataset(train_hierarchy_samples, model=model)
    train_dataloader_hierarchy = DataLoader(train_data_hierarchy,
                                            shuffle=True,
                                            batch_size=batch_size)
    train_loss_hierarchy = losses.CosineSimilarityLoss(model=model)
    print('H: Number of training examples: ', len(train_hierarchy_samples))
    global evaluation_steps
    evaluation_steps = math.ceil(len(train_hierarchy_samples) / 0.1)
    return train_dataloader_hierarchy, train_loss_hierarchy, evaluator, warmup_steps

def construct_model(base_model, encoder_style):
    # word_embedding_model = models.Transformer(base_model, max_seq_length=256)
    # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    if encoder_style == BIENCODER:
        model = SentenceTransformer(base_model)
        train_loss = losses.CosineSimilarityLoss(model)
    elif encoder_style == CROSSENCODER:
        model = CrossEncoder(base_model, num_labels=1, max_length=512)
        train_loss = None
    else:
        raise ValueError(f'Unknown encoder style: {encoder_style}')
    return model, train_loss

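# Usage sketch (assumption, not from the original source): BIENCODER / CROSSENCODER are
# assumed to be module-level string constants, and construct_model() returns (model, loss),
# where the loss is only needed on the bi-encoder path; the CrossEncoder path trains with
# its own internal loss, so train_loss is None there.
BIENCODER = 'biencoder'      # assumed constant value
CROSSENCODER = 'crossencoder'  # assumed constant value

model, train_loss = construct_model('distilbert-base-uncased', BIENCODER)
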
def run():
    train_file = config.TRAINING_FILE
    train_batch = config.TRAIN_BATCH_SIZE
    valid_batch = config.VALID_BATCH_SIZE
    model_path = config.BERT_PATH
    max_length = config.MAX_LEN

    dfs = pd.read_csv(train_file,
                      sep="\t",
                      names=['idx', 'sent1', 'sent2', 'label'])
    dfs['label'] = pd.to_numeric(dfs["label"], downcast='float')
    df_train, df_valid = model_selection.train_test_split(
        dfs,
        test_size=0.1,
        random_state=42,
        stratify=dfs.label.values,
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    dataset_reader = dataset.Dataset()
    train_dataset = dataset_reader.read(df_train, return_pt=True)
    valid_sentence1, valid_sentence2, valid_labels = dataset_reader.read(df_valid)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch)
    # evaluator = evaluation.EmbeddingSimilarityEvaluator(valid_sentence1, valid_sentence2, valid_labels)
    evaluator = evaluation.BinaryClassificationEvaluator(
        valid_sentence1,
        valid_sentence2,
        valid_labels,
        batch_size=valid_batch,
        show_progress_bar=False)

    word_embedding_model = models.Transformer(model_path,
                                              max_seq_length=max_length)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=max_length,
        activation_function=nn.Tanh())
    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])
    train_loss = losses.CosineSimilarityLoss(model)
    engine.train(train_dataloader, model, train_loss, evaluator)

def __init__(self):
    self.batch_size = 16
    self.reader = DataReader('')
    self.model_save_path1 = DATAPATH + 'model_dump' + datetime.now().strftime(
        "%Y-%m-%d_%H-%M-%S")
    self.model1 = SentenceTransformer('bert-base-nli-mean-tokens')
    self.model_save_path2 = DATAPATH + 'model_2_dump' + datetime.now().strftime(
        "%Y-%m-%d_%H-%M-%S")
    self.model2 = SentenceTransformer('bert-base-nli-mean-tokens')
    self.train_loss1 = losses.CosineSimilarityLoss(model=self.model1)
    self.train_loss2 = losses.CosineSimilarityLoss(model=self.model2)
    self.df1 = None
    self.train_data1 = None
    self.train_dataloader1 = None
    self.df2 = None
    self.train_data2 = None
    self.train_dataloader2 = None

def nlptrain(premodel, ver, tr_data, te_data):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    # Read the dataset
    model_name = 'roberta-large-nli-stsb-mean-tokens'
    train_batch_size = 16
    num_epochs = 4
    model_save_path = ver
    sts_reader = STSDataReader('kt_datasets/kt_benchmark', normalize_scores=True)

    # Load a pre-trained sentence transformer model
    model = SentenceTransformer(premodel)

    # Convert the dataset to a DataLoader ready for training
    logging.info("")
    train_data = SentencesDataset(sts_reader.get_examples(tr_data), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("")
    dev_data = SentencesDataset(examples=sts_reader.get_examples(te_data),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # Configure the training
    warmup_steps = math.ceil(
        len(train_data) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)

    messages = ['model saved in ' + ver + ' directory']
    return messages

def fit_model(params, languages, train_data):
    print("######################")
    print("start of languages:", languages)

    batch_size = params["train_batch_size"]
    num_epochs = params["num_epochs"]
    lr = params["lr"]
    eps = params["eps"]
    weight_decay = params["weight_decay"]
    warmup_steps_mul = params["warmup_steps_mul"]

    model = SentenceTransformer(model_name)

    # create train dataloader
    # train_sentece_dataset = SentencesDataset(train_fold, model=model)  # this is deprecated
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

    # define loss
    train_loss = losses.CosineSimilarityLoss(model=model)
    warmup_steps = math.ceil(
        len(train_data) * num_epochs / batch_size * warmup_steps_mul)
    output_path = os.path.join(
        base_output_path,
        f"cross-{languages[0]}-{languages[1]}-roberta-sentence-transformer",
    )

    # Train the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=None,
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        optimizer_params={
            "lr": lr,
            "eps": eps,
            "correct_bias": False
        },
        weight_decay=weight_decay,
    )

    # save model
    model.save(output_path)
    return output_path

def create_train_usage(fl, data_dir, model, validate=None, is_test=False):
    train_usage = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
    min_d = 10000000
    max_d = 0
    for obj in data:
        dist = obj['distance']
        if dist < min_d:
            min_d = dist
        if dist > max_d:
            max_d = dist
    for obj in data:
        dist = (max_d - obj['distance']) / (max_d - min_d)
        train_usage.append(
            InputExample(texts=[obj['class1'], obj['class2']], label=dist))
    random.shuffle(train_usage)
    if is_test:
        return train_usage
    if max_size:
        train_usage = train_usage[:max_size]
    evaluator = None
    if usage_str == validate:
        train_usage, dev_usage = train_test_split(train_usage, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_usage, name='usage')
    warmup_steps = math.ceil(
        len(train_usage) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
    train_data_usage = SentencesDataset(train_usage, model=model)
    train_dataloader_usage = DataLoader(train_data_usage,
                                        shuffle=True,
                                        batch_size=batch_size)
    train_loss_usage = losses.CosineSimilarityLoss(model=model)
    print('U: Number of training examples: ', len(train_usage))
    global evaluation_steps
    evaluation_steps = math.ceil(len(train_usage) / 0.1)
    return train_dataloader_usage, train_loss_usage, evaluator, warmup_steps

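# Illustrative wiring (assumption, not from the original source): the create_* helpers
# above each return (dataloader, loss, evaluator, warmup_steps), and sentence-transformers
# can train several objectives jointly by passing multiple (dataloader, loss) pairs to
# model.fit(). File names below are placeholders; model, data_dir, num_epochs and
# evaluation_steps are the same module-level values the helpers already assume.
hierarchy_dl, hierarchy_loss, hierarchy_eval, hierarchy_warmup = create_hirerachy_examples(
    'hierarchy.json', data_dir, model, validate=hierarchy_str)
usage_dl, usage_loss, usage_eval, usage_warmup = create_train_usage(
    'usage.json', data_dir, model)

model.fit(train_objectives=[(hierarchy_dl, hierarchy_loss),
                            (usage_dl, usage_loss)],
          evaluator=hierarchy_eval,
          epochs=num_epochs,
          warmup_steps=max(hierarchy_warmup, usage_warmup),
          evaluation_steps=evaluation_steps)
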
def test_train_stsb(self):
    word_embedding_model = models.Transformer('distilbert-base-uncased')
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_dataset = SentencesDataset(self.stsb_train_samples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
    train_loss = losses.CosineSimilarityLoss(model=model)
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=None,
              epochs=1,
              evaluation_steps=1000,
              warmup_steps=int(len(train_dataloader) * 0.1),
              use_amp=True)
    self.evaluate_stsb_test(model, 80.0)

def sentenceTransformers(self, tokens, preprocess_obj, batch_size, num_epoch):
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    word_embedding_model = model._first_module()
    train_dataloader = DataLoader(self.train_examples,
                                  shuffle=True,
                                  batch_size=batch_size)
    train_loss = losses.CosineSimilarityLoss(model)
    print(tokens)
    word_embedding_model.tokenizer.add_tokens(list(tokens), special_tokens=True)
    word_embedding_model.auto_model.resize_token_embeddings(
        len(word_embedding_model.tokenizer))
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=num_epoch,
              warmup_steps=100,
              output_path=os.path.join(os.getcwd(), "bureau/models/" + "ST"))

#################################################################################################

logging.info(
    "Step 3: Train bi-encoder: {} with STSbenchmark (gold + silver dataset)".format(model_name))

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark gold and silver train dataset")
silver_samples = list(
    InputExample(texts=[data[0], data[1]], label=score)
    for data, score in zip(silver_data, silver_scores))

train_dataset = SentencesDataset(gold_samples + silver_samples, bi_encoder)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=bi_encoder)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')

# Configure the training.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder model
bi_encoder.fit(train_objectives=[(train_dataloader, train_loss)],
               evaluator=evaluator,
               epochs=num_epochs,
               evaluation_steps=1000,
               warmup_steps=warmup_steps)

def train_similarity_sentenceBERT(robot_id, version):
    """Train the intent-recognition (sentence similarity) model."""
    max_seq_length = 24
    batch_size = 128
    labels = ["0", "1"]
    # After repeated tests with Dr. Lan, the bert-tiny version trains abnormally and never
    # learns, even after trying many parameter combinations (epochs, learning rate, batch size, etc.)
    # pretrain_name = "bert-tiny"
    # The HIT (Harbin Institute of Technology) version does learn
    # pretrain_name = "roberta_wwm_ext_3"
    # In testing, the pretrained similarity model below (sentence-BERT structure) speeds up
    # convergence; with so little test data, accuracy is always 100%, so that number is meaningless.
    pretrain_name = "distiluse-base-multilingual-cased-v2"
    train_dir = "train_files"
    # location of the initial pretrained weights
    pretrain_path = f"pretrained_models/{pretrain_name}"
    path = f"config_models/robot_{robot_id}_version_{version}.model"
    print("model_path")
    print(path)
    if os.path.exists(pretrain_path):
        _ = f"start train sentence_bert model, robot_id: {robot_id}, version:{version} "
        print(_), logging.info(_)
        c: Config = pickle.load(open(path, "rb"))
        temp_dir = f"{train_dir}/robot_{robot_id}_version_{version}_sentbert"
        if not os.path.exists(temp_dir):
            os.mkdir(temp_dir)
        examples_train, examples_dev = prepare_csv_data(c, temp_dir)
        # pretrain_path='/data4/azun/project_dialout/pretrained_models/distiluse-base-multilingual-cased-v2'
        print(pretrain_path)
        print("training set")
        print(len(examples_train))
        print("test set")
        print(len(examples_dev))
        if (len(examples_train) > 50000):
            examples_train = examples_train[:50000]
        if (len(examples_dev) > 5000):
            examples_dev = examples_dev[:4000]

        #######################
        model = SentenceTransformer(pretrain_path)
        train_dataset = SentencesDataset(examples_train, model)
        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
        train_loss = losses.CosineSimilarityLoss(model)
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  epochs=1,
                  warmup_steps=100)
        model.save(
            f"config_models/robot_{robot_id}_version_{version}_similarity_sentbert")
        print("Model saved successfully, path:")
        print(
            f"config_models/robot_{robot_id}_version_{version}_similarity_sentbert")
        #######################

        result = {"train": 0.921, "dev": 0.932}
        # command = f"cp {pretrain_path}/bert_config.json config_models/robot_{robot_id}_version_{version}_similarity"
        # os.system(command)
        # command = f"cp {pretrain_path}/vocab.txt config_models/robot_{robot_id}_version_{version}_similarity"
        # os.system(command)
        # upload the scores and update the database
        conn = pool.connection()  # whenever a database connection is needed, just get one via connection()
        cur = conn.cursor()
        try:
            similarity_result = json.dumps(result, ensure_ascii=False)
            sql_ = f"UPDATE {TABLE_NAME} SET SIMILARITY_RESULT='{similarity_result}',UPDATED_AT=NOW() " \
                   f"WHERE robot_id='{robot_id}' and version_id='{version}' and DELETE_FLAG=0 and CLUSTER='{CLUSTER}';"
            print(sql_)
            index = cur.execute(sql_)
            conn.commit()
        except Exception as e:
            print(repr(e))
            pass
        finally:
            cur.close()
            conn.close()

        # This updates INTENT_RESULT as well; it only exists for compatibility with the old
        # backend and should be removed later.
        conn = pool.connection()  # whenever a database connection is needed, just get one via connection()
        cur = conn.cursor()
        try:
            similarity_result = json.dumps(result, ensure_ascii=False)
            sql_ = f"UPDATE {TABLE_NAME} SET INTENT_RESULT='{similarity_result}',UPDATED_AT=NOW() " \
                   f"WHERE robot_id='{robot_id}' and version_id='{version}' and DELETE_FLAG=0 and CLUSTER='{CLUSTER}';"
            print(sql_)
            index = cur.execute(sql_)
            conn.commit()
        except Exception as e:
            print(repr(e))
            pass
        finally:
            cur.close()
            conn.close()
        print(result)
    else:
        _ = f"can not find, robot_id: {robot_id}, version:{version} "
        print(_), logging.info(_)

def main():
    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--task", default=None, type=str, required=True,
                        help="the task to run bert ranker for")
    parser.add_argument("--data_folder", default=None, type=str, required=True,
                        help="the folder containing data")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="the folder to output predictions")

    # Training procedure
    parser.add_argument("--num_epochs", default=5, type=int, required=False,
                        help="Number of epochs for training.")
    parser.add_argument("--train_batch_size", default=8, type=int, required=False,
                        help="Training batch size.")

    # Model hyperparameters
    parser.add_argument("--transformer_model", default="bert-base-cased", type=str, required=False,
                        help="Bert model to use (default = bert-base-cased).")

    args = parser.parse_args()

    word_embedding_model = models.Transformer(args.transformer_model)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    logging.info("Creating train CRR dataset.")
    crr_reader = CRRBenchmarkDataReader('{}/{}'.format(args.data_folder, args.task))
    train_data = SentencesDataset(crr_reader.get_examples("train.tsv"), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=args.train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Creating dev CRR dataset.")
    dev_data = SentencesDataset(crr_reader.get_examples('valid.tsv'), model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=args.train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    warmup_steps = math.ceil(
        len(train_data) * args.num_epochs / args.train_batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    logging.info("Fitting sentenceBERT")
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=args.num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=args.output_dir +
              "{}_{}".format(args.transformer_model, args.task))

def main():
    parser = set_parser()
    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.info("arguments are parsed")
    args.world_size = args.gpus * args.nodes
    patent_reader = PatentDataReader(args.data_dir, normalize_scores=True)

    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT('bert-base-cased', max_seq_length=510)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1:
        logger.warning("Non dist training")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        logger.warning("Dist training local rank %s", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", timeout=datetime.timedelta(hours=10))
        args.n_gpu = 1
    args.device = device

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_loss = losses.CosineSimilarityLoss(model=model)
    model.to(args.device)
    train_loss.to(args.device)

    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        logger.warning("Read Patent Training dataset")
        train_data = load_and_cache_examples(args, patent_reader, model)
        if args.eval_during_train:
            logging.info("Read STSbenchmark dev dataset")
            dev_data = load_and_cache_examples(args,
                                               patent_reader,
                                               model,
                                               evaluate=True)
        else:
            dev_data = None
        # train_data = SentencesDataset(patent_reader.get_examples('train.tsv', max_examples=17714), model)
        tr_loss = train(args, train_data, model, train_loss, dev_dataset=dev_data)
        logger.info(" average loss = %s", tr_loss)

# print(word_embedding_model.fc1(x).size())
import math
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# model_save_path = 'output/training_stsbenchmark_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# model_save_path = './models/clinical_bert/finetuned/output2'

train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=word_embedding_model)

# Development set: Measure correlation between cosine score and gold labels
logging.info("Read SNLIbenchmark dev dataset")
# evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
word_embedding_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    # evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps)

def main(args):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    # You can specify any huggingface/transformers pre-trained model here,
    # for example, bert-base-uncased, roberta-base, xlm-roberta-base
    # model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    model_name = args.model_name
    if model_name is None:
        model_name = 'bert-base-chinese'

    # Read the dataset
    batch_size = args.batch_size
    model_output_dir = args.model_output_dir
    # model_save_path = os.path.join(model_output_dir, "bert-base", datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    model_save_path = model_output_dir

    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    # word_embedding_model = models.Transformer(model_name)
    if args.init_model is None:
        word_embedding_model = models.Transformer(model_name)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    else:
        model = SentenceTransformer(args.init_model)

    if args.do_train == 1:
        # Convert the dataset to a DataLoader ready for training
        data_reader = SimTextDataReader()
        logging.info("train_data:%s" % (args.train_data))
        logging.info("cache_data:%s" % (args.cached_data))
        train_data_files = args.train_data.split('#')
        cached_data_file = args.cached_data
        logging.info("Read train dataset")
        if not os.path.isfile(cached_data_file):
            train_examples = []
            for train_file in train_data_files:
                if os.path.isfile(train_file):
                    logging.info("load train file:%s" % (train_file))
                    now_examples = data_reader.get_examples(train_file)
                    train_examples.extend(now_examples)
            train_data = SentencesDataset(train_examples, model=model)
            torch.save(train_data, args.cached_data)
        else:
            train_data = torch.load(cached_data_file)
            logging.info("Load cached dataset %s" % (cached_data_file))
        logging.info("Build train dataset")
        train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
        # train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)
        train_loss = losses.CosineSimilarityLoss(model=model)

        logging.info("Read dev dataset")
        dev_data_files = args.dev_data.split('#')
        dev_examples = []
        for dev_file in dev_data_files:
            if os.path.isfile(dev_file):
                logging.info("load dev file:%s" % (dev_file))
                now_examples = data_reader.get_examples(dev_file)
                dev_examples.extend(now_examples)
        dev_data = SentencesDataset(examples=dev_examples, model=model)
        logging.info("Build dev dataset")
        dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
        evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

        # Configure the training
        num_epochs = args.num_epochs
        warmup_steps = math.ceil(
            len(train_dataloader) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
        logging.info("Warmup-steps: {}".format(warmup_steps))
        logging.info("Start training")

        # Train the model
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluator,
                  epochs=num_epochs,
                  evaluation_steps=1000,
                  warmup_steps=warmup_steps,
                  output_path=model_save_path)

    if args.do_predict == 1:
        logging.info("Read predict dataset")
        pred_data_file = args.pred_data
        output_file = os.path.join(args.model_output_dir, "pred_res")
        text_pairs = load_pred_data(pred_data_file)
        with open(output_file, "w", encoding="utf-8") as fp:
            for tpair in text_pairs:
                embedding_pair = model.encode(tpair)
                cos_sim = cosine_similarity(embedding_pair[0], embedding_pair[1])
                fp.write("%s\t%s\t%f\n" % (tpair[0], tpair[1], cos_sim))

def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: ",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--max_seq_length",
        default=510,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    # --no_cuda is referenced below, so it must be defined on the parser
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA even when it is available.")
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
    )

    # TODO: Prepare Dataloader
    patent_reader = PatentDataReader(args.data_dir, normalize_scores=True)
    model = SentenceTransformer(args.model_name_or_path)
    test_data = SentencesDataset(examples=patent_reader.get_examples(
        "dev.tsv", max_examples=40), model=model)
    test_dataloader = DataLoader(test_data,
                                 shuffle=False,
                                 batch_size=args.per_gpu_train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
    model.evaluate(evaluator)

    # Convert the dataset to a DataLoader ready for training
    print("Read STSbenchmark train dataset")
    train_data = SentencesDataset(
        patent_reader.get_examples('train.tsv', max_examples=17714), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=args.per_gpu_train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_data = SentencesDataset(
            patent_reader.get_examples('train.tsv', max_examples=17714), model)
        global_step, tr_loss = train(args, train_data, model, train_loss)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

# Add two trainable feed-forward networks (DAN)
sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension()
dan1 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)
dan2 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)

model = SentenceTransformer(
    modules=[word_embedding_model, word_weights, pooling_model, dan1, dan2])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
num_epochs = 10
warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps)

def BertEM(path_train, path_valid, path_test, path_error, epochs_num,
           warmup_steps_num, evaluation_steps_num):
    # instantiate the progress bar
    bar = progressbar

    # define the model
    # model = SentenceTransformer('bert-large-nli-stsb-mean-tokens', device='cuda:1')
    model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens', device='cuda:6')
    # model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens', device='cuda:2')

    data_type = {"text_a": str, "text_b": str}
    train_data = pd.read_csv(path_train, encoding='utf-8', dtype=data_type)
    valid_data = pd.read_csv(path_valid, encoding='utf-8', dtype=data_type)
    test_data = pd.read_csv(path_test, encoding='utf-8', dtype=data_type)

    # training set
    train_examples = []
    for i in bar.progressbar(range(len(train_data))):
        time.sleep(0.0001)
        text_a = train_data.iloc[i]['text_a']
        text_b = train_data.iloc[i]['text_b']
        text_a = str(text_a)
        text_b = str(text_b)
        label_data = train_data.iloc[i]['label']
        label_data = float(label_data)
        train_examples.append(
            InputExample(texts=[text_a, text_b], label=label_data))
    print(InputExample)

    # validation set
    sentence_a = []
    sentence_b = []
    label_valid = []
    for i in bar.progressbar(range(len(valid_data))):
        time.sleep(0.0001)
        sentence1 = valid_data.iloc[i]['text_a']
        sentence2 = valid_data.iloc[i]['text_b']
        label_valid_t = valid_data.iloc[i]['label']
        label_valid_t = float(label_valid_t)
        sentence_a.append(sentence1)
        sentence_b.append(sentence2)
        label_valid.append(label_valid_t)

    # define the evaluator
    # evaluator = evaluation.EmbeddingSimilarityEvaluator(sentence_a, sentence_b, label_valid)
    evaluator = evaluation.BinaryClassificationEvaluator(
        sentence_a, sentence_b, label_valid)

    # define the dataset and the loss function
    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
    train_loss = losses.CosineSimilarityLoss(model)

    # measure training time (time.clock() was removed in Python 3.8)
    start_time = time.perf_counter()

    # train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=epochs_num,
              warmup_steps=warmup_steps_num,
              evaluator=evaluator,
              evaluation_steps=evaluation_steps_num,
              use_amp=True)
    end_time = time.perf_counter()

    # ========================================= evaluation ===================================================
    # re-read the test set and cast all attributes to str
    test_data = pd.read_csv(path_test, encoding='utf-8')
    test_data['text_a'] = test_data['text_a'].map(lambda x: str(x))
    test_data['text_b'] = test_data['text_b'].map(lambda x: str(x))

    # build a dict of prediction lists, one per threshold
    list_num = 40
    prefix = 'pred_list_'
    test_map = {prefix + str(i): [] for i in range(list_num)}
    label_list = []
    score = 0.20
    error_csv = pd.DataFrame(columns=('id', 'text_a', 'text_b', 'cos_scores'))

    # run over the test set
    for i in bar.progressbar(range(len(test_data))):
        time.sleep(0.0001)
        text_a_embedding = model.encode(test_data.iloc[i]['text_a'], convert_to_tensor=True)
        text_b_embedding = model.encode(test_data.iloc[i]['text_b'], convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(text_a_embedding, text_b_embedding)[0]
        cos_scores = cos_scores.cpu()

        # label list
        label = test_data.iloc[i]['label']
        label_list.append(int(label))

        # record misclassified examples
        if cos_scores >= 0.80:
            pred_test = 1
        else:
            pred_test = 0
        if pred_test != label:
            error_text_a = test_data.iloc[i]['text_a']
            error_text_b = test_data.iloc[i]['text_b']
            error_cos_scores = cos_scores
            error_csv = error_csv.append(pd.DataFrame({
                'id': [i],
                'text_a': [error_text_a],
                'text_b': [error_text_b],
                'cos_scores': [error_cos_scores]
            }), ignore_index=True)

        # build the prediction lists
        compute_pred(score, cos_scores, prefix, test_map)

    error_csv.to_csv(path_error, index=0)

    max_f1 = 0
    target_threshold = 0.01
    target_accuracy = 0.01
    target_recall = 0.01
    threshold = 0.20

    # report scores for every threshold
    for i in range(len(test_map.keys())):
        # compute the scores for this threshold
        accuracy, recall, f1 = compute_score(test_map[prefix + str(i)], label_list)
        if f1 >= max_f1:
            max_f1 = f1
            target_threshold = threshold
            target_accuracy = accuracy
            target_recall = recall
        print('The score > {} result is accuracy: {}, | recall:{}, | f1: {}'.format(
            round(threshold, 2), accuracy, recall, f1))
        threshold += 0.02

    # print the final results
    print('================dataset_name==================', path_a)
    print('================threshold:{}, target_accuracy:{}, target_recall:{}, max_f1:{}'.format(
        target_threshold, target_accuracy, target_recall, max_f1))
    print('================train_time:{}'.format(str(end_time - start_time)))

def main(args):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    # Read the dataset
    train_batch_size = 64
    num_epochs = 1000

    if args.pretrained:
        model = SentenceTransformer(args.pretrained)
        model_save_path = os.path.join(
            args.save_path,
            args.pretrained.split("/")[-1] + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    else:
        # You can specify any huggingface/transformers pre-trained model here,
        # for example, bert-base-uncased, roberta-base, xlm-roberta-base
        model_name = 'cl-tohoku/bert-base-japanese-char-whole-word-masking'
        model_save_path = os.path.join(
            args.save_path,
            model_name.replace("/", "-") + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

        # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
        word_embedding_model = models.Transformer(model_name)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read custom train dataset")
    train_samples = []
    val_samples = []
    inp_list = []
    dataset_path = args.data_path
    with gzip.open(dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            score = float(row['score']) / 10  # Normalize score to range 0 ... 1
            inp_list.append(
                InputExample(texts=[row['sentence1'], row['sentence2']], label=score))

    from sklearn.model_selection import train_test_split
    train_samples, val_samples = train_test_split(inp_list, test_size=0.2)
    # import ipdb; ipdb.set_trace()

    train_dataset = SentencesDataset(train_samples, model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Read custom dev dataset")
    # evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='sts-dev')
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples)

    # Configure the training
    warmup_steps = math.ceil(
        len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # import ipdb; ipdb.set_trace()
    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)

def train(model_name_or_path: str,
          hf_dataset: str,
          aspect: str,
          fold: Union[int, str],
          output_path: str,
          train_epochs: int = 3,
          train_batch_size: int = 25,
          eval_batch_size: int = 32,
          evaluation_steps: int = 5000,
          train_on_test: bool = False,
          loss: str = 'multiple_negatives_ranking',
          override: bool = False):
    """
    # $MODEL_NAME $HF_DATASET $ASPECT $FOLD $OUTPUT_DIR --train_epochs=3 --train_batch_size=$TRAIN_BATCH_SIZE --eval_batch_size=$EVAL_BATCH_SIZE

    Run with:
    $ export CUDA_VISIBLE_DEVICES=1
    $ ./sentence_transformer_cli.py train scibert-scivocab-uncased paperswithcode_task_docs 1 ./output/st_scibert/1 --train_epochs=3 --train_batch_size=25 --eval_batch_size=32

    :param loss: Training loss function (choices: multiple_negatives_ranking, cosine)
    :param train_on_test: If True, joint training on train and test set (validation disabled)
    :param aspect:
    :param evaluation_steps:
    :param train_epochs:
    :param model_name_or_path:
    :param hf_dataset:
    :param fold:
    :param output_path:
    :param train_batch_size:
    :param eval_batch_size:
    :param override:
    :return:
    """
    top_ks = [5, 10, 25, 50]
    # cuda_device = -1
    # hf_dataset = 'paperswithcode_task_docs'
    # model_name_or_path = 'scibert-scivocab-uncased'
    # fold = 1
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'
    # train_batch_size = 25
    # eval_batch_size = 32
    # override = False
    # output_path = './output/pwc_task_st/1/sci-bert'
    # output_path = os.path.join(output_path, str(fold), model_name_or_path)  # output/1/sci-bert

    if os.path.exists(output_path) and not override:
        logger.error(f'Stop. Output path exists already: {output_path}')
        sys.exit(1)

    # if cuda_device >= 0:
    #     os.environ["CUDA_VISIBLE_DEVICES"] = str(cuda_device)
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    word_embedding_model = Transformer(model_name_or_path,
                                       max_seq_length=max_token_length)
    pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    # tokenizer = BertTokenizer.from_pretrained(model_name_or_path)

    # dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')
    train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                            name='relations',
                            cache_dir=nlp_cache_dir,
                            split=get_train_split(aspect, fold))
    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    # filter for positive labels only
    train_ds = train_ds.filter(lambda row: row['label'] == 'y')
    logger.info(f'After filtering: {len(train_ds):,}')

    # joint training on train and test?
    if train_on_test:
        # # import pyarrow
        # from datasets.arrow_dataset import Dataset
        #
        # full_ds_table = pyarrow.concat_tables([train_ds.data, test_ds.data])
        # full_ds = Dataset(arrow_table=full_ds_table)
        raise NotImplementedError('TODO Evaluator')
    else:
        # standard training on the train split only
        train_sds = DocumentPairSentencesDataset(docs_ds,
                                                 train_ds,
                                                 model,
                                                 max_length=max_token_length,
                                                 forced_length=0)
        train_sds.tokenize_all_docs()
        evaluator = NearestNeighborsEvaluator(model,
                                              docs_ds,
                                              test_ds,
                                              top_ks=top_ks,
                                              batch_size=eval_batch_size,
                                              show_progress_bar=True)

    if loss == 'cosine':
        train_loss = losses.CosineSimilarityLoss(model)
    elif loss == 'multiple_negatives_ranking':
        # A nice advantage of MultipleNegativesRankingLoss is that it only requires positive pairs
        # https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/quora_duplicate_questions
        train_loss = losses.MultipleNegativesRankingLoss(model)
    else:
        raise ValueError(f'Unsupported loss function: {loss}')

    train_dl = DataLoader(train_sds, shuffle=True, batch_size=train_batch_size)

    # Training
    model.fit(train_objectives=[(train_dl, train_loss)],
              epochs=train_epochs,  # try 1-4
              warmup_steps=100,
              evaluator=evaluator,
              evaluation_steps=evaluation_steps,  # increase to 5000 (full dataset => 20k steps)
              output_path=output_path,
              output_path_ignore_not_empty=True)

    logger.info('Training done')