def run(fold, model_name):
    writer = SummaryWriter(log_dir=f'{SAVE_PATH}/', filename_suffix=f'{model_name}-fold{fold}')

    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    print(df_train.shape)
    print(df_valid.shape)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f'training on {device}')

    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(params.optimizer_params(model), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    es = utils.EarlyStopping(patience=5, mode="max")
    for epoch in range(config.EPOCHS):
        # The original passed `writer` positionally after a keyword argument, which is a
        # SyntaxError; pass it as a keyword instead.
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler, writer=writer)
        jaccard = engine.eval_fn(valid_data_loader, model, device, writer)
        print(f"Epoch={epoch}, Jaccard Score = {jaccard}")
        es(jaccard, model, model_path=f"{SAVE_PATH}/{model_name}-f{fold}.pt")
        if es.early_stop:
            print("Early stopping")
            break
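# The training loops in this file rely on utils.EarlyStopping, which is not shown in this
# section. Below is a minimal sketch of what such a helper could look like, inferred only
# from how it is called (es(jaccard, model, model_path=...), es.early_stop, patience,
# mode="max"); the repo's actual implementation may differ.
import numpy as np
import torch


class EarlyStopping:
    def __init__(self, patience=5, mode="max", delta=0.0):
        self.patience = patience
        self.mode = mode
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, metric, model, model_path):
        score = metric if self.mode == "max" else -metric
        if self.best_score is None or score > self.best_score + self.delta:
            # Metric improved: checkpoint the model and reset the patience counter.
            self.best_score = score
            torch.save(model.state_dict(), model_path)
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True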
def run():
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42
    )

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2
    )

    device = torch.device('cuda')
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001
        },
        {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    results = []  # was referenced below but never defined
    for epoch in range(3):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        results.append(jaccard)
        print(f"Jaccard Score = {jaccard}")

    # The original saved to f"MODEL_PATH", a literal string with no placeholder.
    torch.save(model.state_dict(), config.MODEL_PATH)
def run():
    seed_everything(config.SEED)

    df_train = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )

    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001
        },
        {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    # This variant trains on the full data with no validation split, so EarlyStopping is
    # instantiated but never consulted; the loop simply stops at MAX_EPOCHS.
    es = utils.EarlyStopping(patience=2, mode="max")
    for epoch in range(EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        if epoch + 1 == MAX_EPOCHS:
            torch.save(model.state_dict(), 'model_full.bin')
            break
def predict(df_test):
    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True

    model = TweetModel(conf=model_config)
    model.load_state_dict(torch.load("model.bin"))
    model.to(device)

    test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )
    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )
    return eval_fn(data_loader, model, device)
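# A possible way to call predict() on the competition test file, mirroring the
# preprocessing used in the inference script later in this section (selected_text is
# filled with the full tweet as a placeholder so TweetDataset can be reused).
# config.TEST_FILE is assumed to exist; this is a usage sketch, not part of the original script.
if __name__ == "__main__":
    df_test = pd.read_csv(config.TEST_FILE)
    df_test.loc[:, "selected_text"] = df_test.text.values  # dummy targets for the shared dataset class
    predictions = predict(df_test)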
def train(fold, epochs, training_file, tokenizer, max_len, train_batch_size,
          valid_batch_size, roberta_path, lr, patience, num_warmup_steps):
    dfx = pd.read_csv(training_file)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    # Training set
    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )
    # Validation set
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    train_sampler, valid_sampler = None, None
    if args.shuffle:
        train_sampler = RandomSampler(train_dataset)
        valid_sampler = SequentialSampler(valid_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        num_workers=4,
        sampler=train_sampler
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=valid_batch_size,
        num_workers=2,
        sampler=valid_sampler
    )

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    model_config.output_hidden_states = True
    model = TweetModel(roberta_path=roberta_path, conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / train_batch_size * epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.003},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps
    )

    if args.fp16:
        # try:
        #     from apex import amp
        # except ImportError:
        #     raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.parallel:
        model = torch.nn.DataParallel(model)

    es = utils.EarlyStopping(patience=patience, mode="max")
    print("Training is Starting for fold", fold)
    for epoch in range(epochs):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print("Jaccard Score = ", jaccard)
        experiment.log_metric("jaccard", jaccard)
        es(jaccard, model, model_path=f"{save_path}/model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break

    del model, optimizer, scheduler, df_train, df_valid, train_dataset, valid_dataset, train_data_loader, valid_data_loader
    import gc
    gc.collect()
    torch.cuda.empty_cache()
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2
    )

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001
        },
        {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        # print(f"Jaccard Score = {jaccard}")
        es(jaccard, model, model_path=f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
def main():
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train, df_valid = train_test_split(dfx, test_size=0.2, random_state=42)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2
    )

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    for _ in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")

    # Saves the entire nn.Module (architecture + weights), not just the state_dict.
    torch.save(model, "model.pth")
def predict(df_test):
    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True

    model1 = TweetModel(conf=model_config)
    model1.to(device)
    model1.load_state_dict(torch.load("model_0.bin"))
    model1.eval()

    model2 = TweetModel(conf=model_config)
    model2.to(device)
    model2.load_state_dict(torch.load("model_1.bin"))
    model2.eval()

    model3 = TweetModel(conf=model_config)
    model3.to(device)
    model3.load_state_dict(torch.load("model_2.bin"))
    model3.eval()

    model4 = TweetModel(conf=model_config)
    model4.to(device)
    model4.load_state_dict(torch.load("model_3.bin"))
    model4.eval()

    model5 = TweetModel(conf=model_config)
    model5.to(device)
    model5.load_state_dict(torch.load("model_4.bin"))
    model5.eval()

    final_output = []
    test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )
    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    jaccards = utils.AverageMeter()
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"].numpy()

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets_start = targets_start.to(device, dtype=torch.long)
            targets_end = targets_end.to(device, dtype=torch.long)

            outputs_start1, outputs_end1 = model1(ids=ids, mask=mask, token_type_ids=token_type_ids)
            outputs_start2, outputs_end2 = model2(ids=ids, mask=mask, token_type_ids=token_type_ids)
            outputs_start3, outputs_end3 = model3(ids=ids, mask=mask, token_type_ids=token_type_ids)
            outputs_start4, outputs_end4 = model4(ids=ids, mask=mask, token_type_ids=token_type_ids)
            outputs_start5, outputs_end5 = model5(ids=ids, mask=mask, token_type_ids=token_type_ids)

            # Average the start/end logits over the five fold models
            outputs_start = (
                outputs_start1 + outputs_start2 + outputs_start3 + outputs_start4 + outputs_start5
            ) / 5
            outputs_end = (
                outputs_end1 + outputs_end2 + outputs_end3 + outputs_end4 + outputs_end5
            ) / 5

            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

            jaccard_scores = []
            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                jaccard_score, output_sentence = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_tweet,
                    sentiment_val=tweet_sentiment,
                    idx_start=np.argmax(outputs_start[px, :]),
                    idx_end=np.argmax(outputs_end[px, :]),
                    offsets=offsets[px]
                )
                jaccard_scores.append(jaccard_score)
                final_output.append(output_sentence)

            jaccards.update(np.mean(jaccard_scores), ids.size(0))

    return final_output, jaccards.avg
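# calculate_jaccard_score and utils.AverageMeter are used above but not defined in this
# section. For reference, the competition's word-level Jaccard metric and a running-average
# helper typically look like the sketch below; the repo's own implementations may differ in detail.
def jaccard(str1, str2):
    # Word-level Jaccard similarity between the predicted span and the target span.
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))


class AverageMeter:
    # Tracks a weighted running average, e.g. the mean Jaccard over batches.
    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count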
import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import tqdm

# Assumed local modules (names follow the rest of this repo; adjust paths as needed)
import config
from dataset import TweetDataset
from model import TweetModel

df_test = pd.read_csv(config.TEST_FILE)
df_test.loc[:, "selected_text"] = df_test.text.values

# data = [["Its coming out the socket I feel like my phones hole is not a virgin. That`s how loose it is... :`(", "loose it is...", "negative"]]
# Create the pandas DataFrame
# df_test = pd.DataFrame(data, columns=["text", "selected_text", "sentiment"])
# df_test.loc[:, "selected_text"] = df_test.text.values

device = torch.device("cuda")
model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
model_config.output_hidden_states = True

# Load each of the five trained models and move to GPU
model1 = TweetModel(conf=model_config)
model1.to(device)
model1.load_state_dict(torch.load("../models/nmodel_0.bin"))  # strict=False
# print(model1.eval())

model2 = TweetModel(conf=model_config)
model2.to(device)
model2.load_state_dict(torch.load("../models/nmodel_1.bin"))  # strict=False
# print(model2.eval())

model3 = TweetModel(conf=model_config)
model3.to(device)
model3.load_state_dict(torch.load("../models/nmodel_2.bin"))
# print(model3.eval())

model4 = TweetModel(conf=model_config)
model4.to(device)
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    # Set train / validation split for this fold
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda")
    # Was BertConfig in the original; RobertaConfig matches config.ROBERTA_PATH and the other scripts.
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Define two sets of parameters: those with weight decay, and those without
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)

    '''
    Create a scheduler to set the learning rate at each training step:
    "Create a schedule with a learning rate that decreases linearly after linearly
    increasing during a warmup period." (https://pytorch.org/docs/stable/optim.html)
    Since num_warmup_steps = 0, the learning rate starts at 3e-5 and then decreases
    linearly at each training step.
    '''
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")
    logger.info("{} - {}".format("Training is Starting for fold", fold))
    # model = nn.DataParallel(model)

    for epoch in range(3):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        logger.info("EPOCHS {} - Jaccard Score - {}".format(epoch, jaccard))
        es(jaccard, model, model_path=f"../models/nmodel_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
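# Illustration of the scheduler comment above: with num_warmup_steps=0,
# get_linear_schedule_with_warmup multiplies the base LR by a factor that decays linearly
# from 1 to 0 over num_training_steps. A small stand-alone sketch (the step counts below
# are illustrative, not taken from the training runs):
def linear_lr(step, base_lr=3e-5, num_training_steps=1000, num_warmup_steps=0):
    if step < num_warmup_steps:
        return base_lr * step / max(1, num_warmup_steps)
    return base_lr * max(0.0, (num_training_steps - step) / max(1, num_training_steps - num_warmup_steps))

# e.g. linear_lr(0) == 3e-5, linear_lr(500) == 1.5e-5, linear_lr(1000) == 0.0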
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
    )

    device = torch.device("cuda")
    conf = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model = TweetModel(conf)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001
        },
        {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    model = nn.DataParallel(model)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
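# Note: because this run() wraps the model in nn.DataParallel before saving, the checkpoint
# keys are prefixed with "module.". A sketch of how a plain (unwrapped) TweetModel could
# load such a checkpoint later; the helper name is illustrative, not part of the repo.
def load_dataparallel_checkpoint(model, path):
    state = torch.load(path, map_location="cpu")
    # Strip the "module." prefix added by nn.DataParallel, if present.
    state = {k[len("module."):] if k.startswith("module.") else k: v for k, v in state.items()}
    model.load_state_dict(state)
    return model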
def main(args, mode):
    config = Config(
        train_dir='/mfs/renxiangyuan/tweets/data/train_folds.csv',  # original data
        # train_dir='/mfs/renxiangyuan/tweets/data/train_folds_extra.csv',  # with extra sentiment-classification data added
        model_save_dir=f'/mfs/renxiangyuan/tweets/output/{args.model_type}-5-fold-ak',
        # model_save_dir=f'/mfs/renxiangyuan/tweets/output/shuffle/{args.model_type}-5-fold-ak',
        model_type=args.model_type,
        batch_size=args.bs,
        seed=args.seed,
        lr=args.lr * 1e-5,
        max_seq_length=args.max_seq_length,
        num_hidden_layers=args.num_hidden_layers,
        cat_n_layers=args.cat_n_layers,
        froze_n_layers=args.froze_n_layers,
        # conv_head=True,
        # eps=args.eps,
        shuffle_seed=args.shuffle_seed,
        init_seed=args.init_seed,
        epochs=args.epochs,  # defaults to 3
        warmup_samples=args.warmup_samples,
        # frozen_warmup=False,
        warmup_scheduler=args.scheduler,
        mask_pad_loss=args.mask_pad_loss,
        smooth=args.smooth,
        # fp16=False,
        io_loss_ratio=args.io_loss_ratio,
        io_loss_type=args.io_loss_type,
        # multi_sent_loss_ratio=0,
        # clean_data=True,  # the saved models were trained with clean_data=False
    )
    config.print_info()
    set_seed(config.seed)

    # Training
    if "train" in mode:
        os.makedirs(config.MODEL_SAVE_DIR, exist_ok=True)
        jaccard_scores = []
        for i in args.train_folds:
            scores_i = train(fold=i, config=config)
            jaccard_scores.append(scores_i)
            # if i == 0 and max(scores_i) < 0.705:
            #     print("Fold 0 Too Weak, Early Stop")
            #     break
        for i, res_i in enumerate(jaccard_scores):
            print(i, res_i)
        print("mean", np.mean([max(scores) for scores in jaccard_scores]))
        for i in range(1, config.EPOCHS):
            print(f"\tEpoch{i+1}: ", np.mean([scores[i] for scores in jaccard_scores]))
        config.print_info()

    # Inference / test
    if "test" in mode:
        model_paths = [
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_11shufflesd/model_0_epoch_2.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_3shufflesd/model_1_epoch_3.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_18shufflesd/model_2_epoch_3.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_13shufflesd/model_3_epoch_2.pth",
            "/mfs/renxiangyuan/tweets/output/shuffle/roberta-squad-5-fold-ak/4e-05lr_32bs_42sd_128len_12layer_1cat_-1froze_19shufflesd/model_4_epoch_3.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_0_epoch_2.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_1_epoch_2.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_2_epoch_3.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_3_epoch_3.pth",
            # "/mfs/renxiangyuan/tweets/output/roberta-base-5-fold-ak/5e-05lr_32bs_42sd_13layer/model_4_epoch_3.pth",
        ]
        ensemble_infer(model_paths, config)
        # ensemble_infer(model_paths=None, config=config)

    # Evaluation
    if "evaluate" in mode:
        device = torch.device("cuda")
        model = TweetModel(conf=config.model_config, config=config)
        model.to(device)
        res = [[] for _ in range(5)]
        for fold in range(5):
            dfx = pd.read_csv(config.TRAINING_FILE)
            df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
            valid_dataset = TweetDataset(
                tweet=df_valid.text.values,
                sentiment=df_valid.sentiment.values,
                selected_text=df_valid.selected_text.values,
                config=config,
            )
            valid_data_loader = DataLoader(
                valid_dataset,
                batch_size=config.VALID_BATCH_SIZE,
                num_workers=8
            )
            for ep in range(1, config.EPOCHS):
                state_dict_dir = os.path.join(
                    config.MODEL_SAVE_DIR, f"model_{fold}_epoch_{ep+1}.pth")
                print(state_dict_dir)
                model.load_state_dict(torch.load(state_dict_dir))
                model.eval()
                jaccards = eval_fn(valid_data_loader, model, device, config)
                print(jaccards)
                res[fold].append(jaccards)

        for i, res_i in enumerate(res):
            print(i, res_i)
        print("mean", np.mean([max(scores) for scores in res]))
        for i in range(2):
            print(f"\tEpoch{i + 1}: ", np.mean([scores[i] for scores in res]))
def train(fold, epochs, training_file, tokenizer, max_len, train_batch_size,
          valid_batch_size, roberta_path, lr, patience, num_warmup_steps):
    dfx = pd.read_csv(training_file)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_sampler = None
    val_sampler = None

    # Training set
    # 3) use a DistributedSampler when running with multiple processes
    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )
    # Validation set
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )
    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        shuffle=(train_sampler is None),
        num_workers=4,
        sampler=train_sampler
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=valid_batch_size,
        shuffle=False,
        num_workers=2,
        sampler=val_sampler
    )

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    model_config.output_hidden_states = True
    model = TweetModel(roberta_path=roberta_path, conf=model_config)
    model.to(device)

    # Hoisted out of the multi-GPU branch so the LR scaling below does not raise a
    # NameError on single-GPU runs (in the original it was defined only inside the if).
    num_device = max(1, torch.cuda.device_count())
    if num_device > 1:
        print("Let's use", num_device, "GPUs!")
        # 5) wrap the model for distributed training
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True
        )

    num_train_steps = int(len(df_train) / train_batch_size * epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    # Scale the learning rate linearly with the number of devices
    optimizer = AdamW(optimizer_parameters, lr=lr * num_device)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps
    )

    es = utils.EarlyStopping(patience=patience, mode="max")
    print("Training is Starting for fold", fold)
    for epoch in range(epochs):
        if distributed:
            # Reshuffle the sampler each epoch so every process sees a new ordering
            train_sampler.set_epoch(epoch)
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        # if distributed:
        #     jaccard_reduce = reduce_tensor(jaccard)
        #     print("jaccard_reduce:", jaccard_reduce)
        if not distributed or (distributed and torch.distributed.get_rank() == 0):
            # Only rank 0 logs, checkpoints, and decides on early stopping
            print("Jaccard Score = ", jaccard)
            es(jaccard, model, model_path=f"./bin/model_{fold}.bin")
            if es.early_stop:
                print("Early stopping")
                break

    del model, optimizer, scheduler, df_train, df_valid, train_dataset, valid_dataset, train_data_loader, valid_data_loader
    import gc
    gc.collect()
    torch.cuda.empty_cache()
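# The distributed variant above assumes that `distributed` and `local_rank` are defined
# elsewhere in the script and that the process group is already initialized. A minimal
# bootstrap sketch under that assumption; only the names local_rank/distributed are
# required by the function, the rest is illustrative.
import argparse

import torch
import torch.distributed as dist

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=-1)  # filled in by torch.distributed.launch
args = parser.parse_args()

local_rank = args.local_rank
distributed = local_rank != -1
if distributed:
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl", init_method="env://")

# Typical launch command for 4 GPUs:
#   python -m torch.distributed.launch --nproc_per_node=4 train.py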