def __init__(self, batch_size,
             csv_file="/scratch/si699w20_cbudak_class_root/si699w20_cbudak_class/shared_data/JI_team/data/dataset/OneMonthData/OneMonthFilter846.csv",
             root_dir="/scratch/si699w20_cbudak_class_root/si699w20_cbudak_class/shared_data/JI_team/data/dataset/OneMonthData/Image/10033",
             data_transform=transforms.Compose([
                 # transforms.ToPILImage(mode="RGB"),
                 transforms.RandomResizedCrop(224),
                 transforms.RandomHorizontalFlip(),
                 transforms.ToTensor(),
                 transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
             ]),
             split_ratio=(0.7, 0.1, 0.2), file_size=None):
    assert sum(split_ratio) == 1
    torch.manual_seed(0)
    self.label_generator = LabelGenerator(csv_file, file_size=file_size)
    dataset = TweetDataset(csv_file=csv_file, root_dir=root_dir,
                           tag2label=self.label_generator.tag2label,
                           text_vocab=self.label_generator.text_vocab,
                           transform=data_transform, file_size=file_size)

    train_size = int(len(dataset) * split_ratio[0])
    val_size = int(len(dataset) * split_ratio[1])
    test_size = len(dataset) - train_size - val_size
    train_set, val_set, test_set = torch.utils.data.random_split(
        dataset, [train_size, val_size, test_size])

    self.datasets = {"train": train_set, "val": val_set, "test": test_set}
    self.dataset_sizes = {x: len(self.datasets[x]) for x in ["train", "val", "test"]}
    self.dataloaders = {x: torch.utils.data.DataLoader(self.datasets[x],
                                                       batch_size=batch_size,
                                                       shuffle=True,
                                                       num_workers=4,
                                                       drop_last=True,
                                                       collate_fn=dataset.collate_fn)
                        for x in ["train", "val", "test"]}
def get_train_val_loaders(df, train_idx, val_idx, batch_size=BATCH_SIZE):
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]
    train_loader = torch.utils.data.DataLoader(TweetDataset(train_df),
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=NUM_WORKERS,
                                               drop_last=True)
    val_loader = torch.utils.data.DataLoader(TweetDataset(val_df),
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=NUM_WORKERS)
    dataloaders_dict = {"train": train_loader, "val": val_loader}
    return dataloaders_dict
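# Hedged usage sketch (not from the original source): get_train_val_loaders expects
# positional index arrays, which would typically come from a scikit-learn splitter
# such as KFold; the names below are illustrative only.
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    dataloaders_dict = get_train_val_loaders(df, train_idx, val_idx)
    # train/evaluate one fold with dataloaders_dict["train"] and dataloaders_dict["val"]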
def export_RNN_regressor(checkpoint_path):
    """
    :param checkpoint_path: relative path to a PyTorch .pth checkpoint
    :return: None, dumps a prediction text file in the model's training folder
    """
    checkpoint = torch.load(checkpoint_path)
    model = RNN(checkpoint['net_config'])
    model.load_state_dict(checkpoint['model'])
    model = model.eval().cuda()

    test_dataset = TweetDataset(dataset_type='test')
    test_loader = DataLoader(test_dataset, batch_size=TRAIN_CONFIG['batch_size'],
                             num_workers=TRAIN_CONFIG['workers'],
                             collate_fn=collate_function, shuffle=False, pin_memory=True)

    with open(DATASET_CONFIG['test_csv_relative_path'], newline='') as csvfile:
        test_data = list(csv.reader(csvfile))[1:]
    ids = [datum[0] for datum in test_data]

    n = len(test_loader)
    with open("checkpoints/{}/predictions.txt".format(
            checkpoint['train_config']['experiment_name']), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["TweetID", "NoRetweets"])
        current_idx = 0
        for batch_index, batch in enumerate(test_loader):
            printProgressBar(batch_index, n)
            batch_size = batch['numeric'].shape[0]
            numeric = batch['numeric'].cuda()
            text = batch['embedding'].cuda()

            # invert the log-target transform when the model predicts log-space counts
            prediction = torch.exp(model(text, numeric)) - 1 if EXPORT_CONFIG['log'] \
                else model(text, numeric)
            if EXPORT_CONFIG['threshold']:
                prediction[prediction > EXPORT_CONFIG['threshold']] = EXPORT_CONFIG['threshold']

            for idx_in_batch in range(batch_size):
                writer.writerow([str(ids[current_idx + idx_in_batch]),
                                 str(int(prediction[idx_in_batch].item()))])
            current_idx += batch_size

    print("Export done! :)")
def create_loader(tweet: str, sentiment: str):
    df = pd.DataFrame({"text": tweet, "sentiment": sentiment}, index=[1])
    test_dataset = TweetDataset(
        tweet=df.text.values,
        sentiment=df.sentiment.values,
        selected_text=df.text.values,
    )
    test_data_loader = DataLoader(test_dataset, batch_size=1)
    return test_data_loader
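# Hedged usage sketch (not from the original source): create_loader wraps a single
# tweet so it can be scored through the same batch pipeline; the example strings
# are illustrative only.
single_loader = create_loader("this view is absolutely stunning", "positive")
for batch in single_loader:
    # feed `batch` to the trained TweetModel to predict the supporting span
    pass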
def run():
    seed_everything(config.SEED)
    df_train = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    es = utils.EarlyStopping(patience=2, mode="max")
    for epoch in range(EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        if epoch + 1 == MAX_EPOCHS:
            torch.save(model.state_dict(), 'model_full.bin')
            break
def __create_data(self):
    im_dataset = {
        i: TweetDataset(input_data=self.data_dict[i])
        for i in self.set_names
    }
    im_loader = {
        i: DataLoader(im_dataset[i],
                      batch_size=self.batch_size,
                      shuffle=self.shuffle,
                      num_workers=self.num_works,
                      drop_last=True)
        for i in self.set_names
    }
    return im_dataset, im_loader
def predict(df_test):
    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.load_state_dict(torch.load("model.bin"))
    model.to(device)

    test_dataset = TweetDataset(tweet=df_test.text.values,
                                sentiment=df_test.sentiment.values,
                                selected_text=df_test.selected_text.values)
    data_loader = torch.utils.data.DataLoader(
        test_dataset, shuffle=False, batch_size=config.VALID_BATCH_SIZE, num_workers=1)
    return eval_fn(data_loader, model, device)
def run():
    torch.manual_seed(seed)
    device = xm.xla_device()
    model = MX.to(device)

    # DataLoaders
    train_dataset = TweetDataset(args=args, df=train_df, mode="train",
                                 fold=args.fold_index, tokenizer=tokenizer)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               drop_last=False,
                                               num_workers=2)

    valid_dataset = TweetDataset(args=args, df=train_df, mode="valid",
                                 fold=args.fold_index, tokenizer=tokenizer)
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=args.batch_size,
                              sampler=valid_sampler,
                              num_workers=1,
                              drop_last=False)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]

    num_train_steps = int(num_train_dpoints / args.batch_size / xm.xrt_world_size() * args.epochs)
    optimizer = AdamW(optimizer_parameters, lr=args.learning_rate * xm.xrt_world_size())
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    xm.master_print("Training is Starting ...... ")
    best_jac = 0
    # early_stopping = utils.EarlyStopping(patience=2, mode="max", verbose=True)
    for epoch in range(args.epochs):
        para_loader = pl.ParallelLoader(train_loader, [device])
        train_loss = train(args, para_loader.per_device_loader(device), model,
                           device, optimizer, scheduler, epoch, f)

        para_loader = pl.ParallelLoader(valid_loader, [device])
        valid_jac = valid(args, para_loader.per_device_loader(device), model,
                          device, tokenizer, epoch, f)

        jac = xm.mesh_reduce("jac_reduce", valid_jac, reduce_fn)
        xm.master_print(f"**** Epoch {epoch+1} **==>** Jaccard = {jac}")
        log_ = f"**** Epoch {epoch+1} **==>** Jaccard = {jac}"
        f.write(log_ + "\n\n")

        if jac > best_jac:
            xm.master_print("**** Model Improved !!!! Saving Model")
            xm.save(model.state_dict(),
                    os.path.join(args.save_path, f"fold_{args.fold_index}"))
            best_jac = jac
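# Hedged entry-point sketch (not from the original source): a torch_xla run() like
# the one above is normally launched with one process per TPU core; the wrapper
# name _mp_fn and nprocs=8 are assumptions, not part of the original code.
import torch_xla.distributed.xla_multiprocessing as xmp

def _mp_fn(rank, flags):
    run()

if __name__ == "__main__":
    xmp.spawn(_mp_fn, args=({},), nprocs=8, start_method="fork")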
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = TweetDataset(tweet=df_valid.text.values,
                                 sentiment=df_valid.sentiment.values,
                                 selected_text=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")

    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        # print(f"Jaccard Score = {jaccard}")
        es(jaccard, model, model_path=f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
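# Hedged preprocessing sketch (not from the original source): run(fold) assumes the
# training CSV already carries a `kfold` column; one common way to create it is a
# stratified split on sentiment, as below.
import pandas as pd
from sklearn.model_selection import StratifiedKFold

df = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)
df["kfold"] = -1
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold_idx, (_, valid_idx) in enumerate(skf.split(X=df, y=df.sentiment.values)):
    df.loc[valid_idx, "kfold"] = fold_idx
df.to_csv(config.TRAINING_FILE, index=False)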
def main():
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train, df_valid = train_test_split(dfx, test_size=0.2, random_state=42)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    for _ in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")

    torch.save(model, "model.pth")
def train(model, infer_train, infer_val, load_checkpoint=None):
    """
    Train the RNN model using the parameters defined in the config file \n
    :param model: a pytorch NN
    :param infer_train: the inference function used for training (see above)
    :param infer_val: the inference function used for validating (see above)
    :param load_checkpoint: if None, does nothing, otherwise starts training from the given path to a .pth checkpoint
    :return:
    """
    global checkpoint_name
    print('Initialising {}'.format(cfg['experiment_name']))

    checkpoint_folder = 'checkpoints/{}/'.format(cfg['experiment_name'])
    if not os.path.exists(checkpoint_folder):
        os.makedirs(checkpoint_folder)

    tb_folder = 'tb/{}/'.format(cfg['experiment_name'])
    if not os.path.exists(tb_folder):
        os.makedirs(tb_folder)

    writer = SummaryWriter(logdir=tb_folder, flush_secs=30)
    optimiser = Adam(model.parameters(), lr=cfg['learning_rate'],
                     weight_decay=cfg['weight_decay'])

    train_dataset = TweetDataset(dataset_type='train')
    train_loader = DataLoader(train_dataset, batch_size=cfg['batch_size'],
                              num_workers=cfg['workers'], collate_fn=collate_function,
                              shuffle=True, pin_memory=True)

    val_dataset = TweetDataset(dataset_type='val')
    val_loader = DataLoader(val_dataset, batch_size=cfg['batch_size'],
                            num_workers=cfg['workers'], collate_fn=collate_function,
                            shuffle=False, pin_memory=True)

    if load_checkpoint:
        checkpoint = torch.load(load_checkpoint)
        assert model.config == checkpoint['net_config'], \
            "The provided checkpoint has a different configuration, loading is impossible"
        start_epoch = checkpoint['epoch'] + 1
        epochs = cfg['epochs'] + start_epoch
        step = checkpoint['step']
        model.load_state_dict(checkpoint['model'])
        optimiser.load_state_dict(checkpoint['optimiser'])
        print("Loaded the checkpoint at {}".format(load_checkpoint))
    else:
        start_epoch, step = 0, 0
        epochs = cfg['epochs']

    init_loss = 0.
    avg_loss = AverageMeter()
    best_mae = 1e10

    print('Sanity val')
    val(model, val_loader, writer, 0, infer_val)
    model.train()

    print('Starting training')
    for epoch in range(start_epoch, epochs):
        loader_length = len(train_loader)
        epoch_start = time.time()

        for batch_idx, batch in enumerate(train_loader):
            optimiser.zero_grad()
            loss = infer_train(model, batch)
            loss.backward()

            if epoch == 0 and batch_idx == 0:
                init_loss = loss

            # logging
            elapsed = time.time() - epoch_start
            progress = batch_idx / loader_length
            est = datetime.timedelta(seconds=int(elapsed / progress)) if progress > 0.001 else '-'
            avg_loss.update(loss)
            suffix = '\tloss {:.4f}/{:.4f}\tETA [{}/{}]'.format(
                avg_loss.avg, init_loss, datetime.timedelta(seconds=int(elapsed)), est)
            printProgressBar(batch_idx, loader_length, suffix=suffix,
                             prefix='Epoch [{}/{}]\tStep [{}/{}]'.format(
                                 epoch, epochs - 1, batch_idx, loader_length))
            writer.add_scalar('Steps/train_loss', loss, step)

            # saving the model
            if step % cfg['checkpoint_every'] == 0:
                checkpoint_name = '{}/epoch_{}.pth'.format(checkpoint_folder, epoch)
                torch.save({'model': model.state_dict(), 'epoch': epoch,
                            'batch_idx': batch_idx, 'step': step,
                            'optimiser': optimiser.state_dict(), 'train_config': cfg,
                            'net_config': model.config, 'dataset_config': DATASET_CONFIG},
                           checkpoint_name)

            step += 1
            optimiser.step()

            # validating
            if step % cfg['val_every'] == 0:
                mae = val(model, val_loader, writer, step, infer_val)
                if mae < best_mae:
                    best_mae = mae
                    print('Best model with V{:.2f}'.format(best_mae))
                    torch.save({'model': model.state_dict(), 'epoch': epoch,
                                'batch_idx': batch_idx, 'step': step,
                                'optimiser': optimiser.state_dict(), 'train_config': cfg,
                                'net_config': model.config, 'dataset_config': DATASET_CONFIG},
                               '{}/best.pth'.format(checkpoint_folder))
                model.train()

        # end of epoch
        print('')
        writer.add_scalar('Epochs/train_loss', avg_loss.avg, epoch)
        avg_loss.reset()
        checkpoint_name = '{}/epoch_{}.pth'.format(checkpoint_folder, epoch)
        torch.save({'model': model.state_dict(), 'epoch': epoch,
                    'batch_idx': loader_length, 'step': step,
                    'optimiser': optimiser.state_dict(), 'train_config': cfg,
                    'net_config': model.config, 'dataset_config': DATASET_CONFIG},
                   checkpoint_name)

    # finished training
    writer.close()
    print('Training finished :)')
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)
    valid_dataset = TweetDataset(tweet=df_valid.text.values,
                                 sentiment=df_valid.sentiment.values,
                                 selected_text=df_valid.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    conf = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model = TweetModel(conf)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
model4 = TweetModel(conf=model_config)
model4.to(device)
model4.load_state_dict(torch.load("../models/nmodel_3.bin"))
# print(model4.eval())

model5 = TweetModel(conf=model_config)
model5.to(device)
model5.load_state_dict(torch.load("../models/nmodel_4.bin"))
# print(model5.eval())

final_output = []

# Instantiate TweetDataset with the test data
test_dataset = TweetDataset(tweet=df_test.text.values,
                            sentiment=df_test.sentiment.values,
                            selected_text=df_test.selected_text.values)

# Instantiate DataLoader with `test_dataset`
data_loader = torch.utils.data.DataLoader(test_dataset,
                                          shuffle=False,
                                          batch_size=config.VALID_BATCH_SIZE,
                                          num_workers=1)

# Turn off gradient calculations
with torch.no_grad():
    tk0 = tqdm(data_loader, total=len(data_loader))
    # Predict the span containing the sentiment for each batch
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        mask = d["mask"]
def prepare_datasets():
    """
    Prepares the training, validation and test (Kaggle) datasets used by the XGBoost model \n
    This function looks into the XGBOOST_CONFIG dictionary in config.py for the following information: \n
    * which embedding NN to use (looks for the .pth checkpoint in XGBOOST_CONFIG['embedder'])
    * how to extract the embedding: XGBOOST_CONFIG['embedding_use_hidden', 'embedding_use_output', 'embedding_size']
    * how many numeric variables to add as input
    * where to dump the prepared .npy files: XGBOOST_CONFIG['train_file', 'val_file', 'test_file']
    """
    checkpoint = torch.load(XGBOOST_CONFIG['embedder'])
    embed = RNN(config=checkpoint['net_config']).eval()
    embed.load_state_dict(checkpoint['model'])
    embed = embed.cuda()

    annotated_dataset = TweetDataset(dataset_type='all')
    test_dataset = TweetDataset(dataset_type='test')

    def get_data(dataset, message):
        N = len(dataset)
        data = np.zeros((N, XGBOOST_CONFIG['numeric_data_size'] +
                         XGBOOST_CONFIG['embedding_size'] + 1))  # 1 for answer
        loader = DataLoader(dataset, batch_size=TRAIN_CONFIG['batch_size'],
                            num_workers=TRAIN_CONFIG['workers'],
                            collate_fn=collate_function, shuffle=False)
        current_idx = 0
        n = len(loader)
        print('')
        for batch_index, batch in enumerate(loader):
            printProgressBar(batch_index, n, prefix=message)
            batch_size = batch['numeric'].shape[0]
            numeric = batch['numeric'].cuda()
            text = batch['embedding'].cuda()

            if XGBOOST_CONFIG['embedding_use_hidden']:
                embedding = embed(
                    text, numeric[:, :checkpoint['net_config']['numeric_data_size']])[1]
            elif XGBOOST_CONFIG['embedding_use_output']:
                embedding = torch.exp(embed(
                    text, numeric[:, :checkpoint['net_config']['numeric_data_size']])[0]) - 1
            else:  # expecting a built-in embedding layer -> taking the mean of the embeddings
                embedding = embed.emb(text).mean(axis=1)

            data[current_idx:current_idx + batch_size, XGBOOST_CONFIG['numeric_data_size']:-1] = \
                embedding.detach().cpu().numpy()
            data[current_idx:current_idx + batch_size, :XGBOOST_CONFIG['numeric_data_size']] = \
                numeric.detach().cpu().numpy()
            data[current_idx:current_idx + batch_size, -1] = batch['target'].numpy()
            current_idx += batch_size

        return data

    annotated_data = get_data(annotated_dataset, "Preparing train.csv ...")
    split = int(len(annotated_dataset) * DATASET_CONFIG['train_percent'])
    np.save(XGBOOST_CONFIG['train_file'], annotated_data[1:split])
    np.save(XGBOOST_CONFIG['val_file'], annotated_data[split:])

    test_data = get_data(test_dataset, "Preparing evaluation.csv ...")
    with open(DATASET_CONFIG['test_csv_relative_path'], newline='') as csvfile:
        ids = [line[0] for line in list(csv.reader(csvfile))[1:]]
    ids = np.array(ids).reshape(np.shape(ids)[0], 1)
    prepared_test_data = np.concatenate((test_data, ids), axis=1)
    np.save(XGBOOST_CONFIG['test_file'], prepared_test_data)
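# Hedged follow-up sketch (not from the original source): once prepare_datasets()
# has written the .npy files, an XGBoost regressor can be fit on them; the last
# column of each array holds the target, and the hyperparameters are placeholders.
import numpy as np
import xgboost as xgb

train_arr = np.load(XGBOOST_CONFIG['train_file'])
val_arr = np.load(XGBOOST_CONFIG['val_file'])
X_train, y_train = train_arr[:, :-1], train_arr[:, -1]
X_val, y_val = val_arr[:, :-1], val_arr[:, -1]

booster = xgb.XGBRegressor(n_estimators=500, max_depth=8, learning_rate=0.05)
booster.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)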
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    # Set train / validation set split
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Define two sets of parameters: those with weight decay, and those without
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)

    '''
    Create a scheduler to set the learning rate at each training step
    "Create a schedule with a learning rate that decreases linearly after linearly
    increasing during a warmup period." (https://pytorch.org/docs/stable/optim.html)
    Since num_warmup_steps = 0, the learning rate starts at 3e-5, and then linearly
    decreases at each training step
    '''
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")
    logger.info("{} - {}".format("Training is Starting for fold", fold))

    # model = nn.DataParallel(model)
    for epoch in range(3):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        logger.info("EPOCHS {} - Jaccard Score - {}".format(epoch, jaccard))
        es(jaccard, model, model_path=f"../models/nmodel_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
def predict(df_test): device = torch.device("cuda") model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH) model_config.output_hidden_states = True model1 = TweetModel(conf=model_config) model1.to(device) model1.load_state_dict(torch.load("model_0.bin")) model1.eval() model2 = TweetModel(conf=model_config) model2.to(device) model2.load_state_dict(torch.load("model_1.bin")) model2.eval() model3 = TweetModel(conf=model_config) model3.to(device) model3.load_state_dict(torch.load("model_2.bin")) model3.eval() model4 = TweetModel(conf=model_config) model4.to(device) model4.load_state_dict(torch.load("model_3.bin")) model4.eval() model5 = TweetModel(conf=model_config) model5.to(device) model5.load_state_dict(torch.load("model_4.bin")) model5.eval() final_output = [] test_dataset = TweetDataset( tweet=df_test.text.values, sentiment=df_test.sentiment.values, selected_text=df_test.selected_text.values ) data_loader = torch.utils.data.DataLoader( test_dataset, shuffle=False, batch_size=config.VALID_BATCH_SIZE, num_workers=1 ) jaccards = utils.AverageMeter() with torch.no_grad(): tk0 = tqdm(data_loader, total=len(data_loader)) for bi, d in enumerate(tk0): ids = d["ids"] token_type_ids = d["token_type_ids"] mask = d["mask"] sentiment = d["sentiment"] orig_selected = d["orig_selected"] orig_tweet = d["orig_tweet"] targets_start = d["targets_start"] targets_end = d["targets_end"] offsets = d["offsets"].numpy() ids = ids.to(device, dtype=torch.long) token_type_ids = token_type_ids.to(device, dtype=torch.long) mask = mask.to(device, dtype=torch.long) targets_start = targets_start.to(device, dtype=torch.long) targets_end = targets_end.to(device, dtype=torch.long) outputs_start1, outputs_end1 = model1( ids=ids, mask=mask, token_type_ids=token_type_ids ) outputs_start2, outputs_end2 = model2( ids=ids, mask=mask, token_type_ids=token_type_ids ) outputs_start3, outputs_end3 = model3( ids=ids, mask=mask, token_type_ids=token_type_ids ) outputs_start4, outputs_end4 = model4( ids=ids, mask=mask, token_type_ids=token_type_ids ) outputs_start5, outputs_end5 = model5( ids=ids, mask=mask, token_type_ids=token_type_ids ) outputs_start = ( outputs_start1 + outputs_start2 + outputs_start3 + outputs_start4 + outputs_start5 ) / 5 outputs_end = ( outputs_end1 + outputs_end2 + outputs_end3 + outputs_end4 + outputs_end5 ) / 5 outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy() outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy() jaccard_scores = [] for px, tweet in enumerate(orig_tweet): selected_tweet = orig_selected[px] tweet_sentiment = sentiment[px] jaccard_score, output_sentence = calculate_jaccard_score( original_tweet=tweet, target_string=selected_tweet, sentiment_val=tweet_sentiment, idx_start=np.argmax(outputs_start[px, :]), idx_end=np.argmax(outputs_end[px, :]), offsets=offsets[px] ) jaccard_scores.append(jaccard_score) final_output.append(output_sentence) jaccards.update(np.mean(jaccard_scores), ids.size(0)) return final_output, jaccards.avg
def train(fold, epochs, training_file, tokenizer, max_len, train_batch_size,
          valid_batch_size, roberta_path, lr, patience, num_warmup_steps):
    dfx = pd.read_csv(training_file)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    # Training set
    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    # Validation set
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    train_sampler, valid_sampler = None, None
    if args.shuffle:
        train_sampler = RandomSampler(train_dataset)
        valid_sampler = SequentialSampler(valid_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        num_workers=4,
        sampler=train_sampler
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=valid_batch_size,
        num_workers=2,
        sampler=valid_sampler
    )

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    model_config.output_hidden_states = True
    model = TweetModel(roberta_path=roberta_path, conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / train_batch_size * epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.003},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps
    )

    if args.fp16:
        # try:
        #     from apex import amp
        # except ImportError:
        #     raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.parallel:
        model = torch.nn.DataParallel(model)

    es = utils.EarlyStopping(patience=patience, mode="max")
    print("Training is Starting for fold", fold)

    for epoch in range(epochs):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print("Jaccard Score = ", jaccard)
        experiment.log_metric("jaccard", jaccard)
        es(jaccard, model, model_path=f"{save_path}/model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break

    del model, optimizer, scheduler, df_train, df_valid, \
        train_dataset, valid_dataset, train_data_loader, valid_data_loader
    import gc
    gc.collect()
    torch.cuda.empty_cache()
def train(fold, epochs, training_file, tokenizer, max_len, train_batch_size,
          valid_batch_size, roberta_path, lr, patience, num_warmup_steps):
    dfx = pd.read_csv(training_file)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_sampler = None
    val_sampler = None

    # Training set
    # 3) use a DistributedSampler
    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    # Validation set
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        shuffle=(train_sampler is None),
        num_workers=4,
        sampler=train_sampler
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=valid_batch_size,
        shuffle=False,
        num_workers=2,
        sampler=val_sampler
    )

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    model_config.output_hidden_states = True
    model = TweetModel(roberta_path=roberta_path, conf=model_config)
    model.to(device)

    # default to one device so the learning-rate scaling below is always defined
    num_device = max(torch.cuda.device_count(), 1)
    if num_device > 1:
        print("Let's use", num_device, "GPUs!")
        # 5) wrap the model for distributed training
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[local_rank],
                                                          output_device=local_rank,
                                                          find_unused_parameters=True)

    num_train_steps = int(len(df_train) / train_batch_size * epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    # scale the learning rate with the number of devices
    optimizer = AdamW(optimizer_parameters, lr=lr * num_device)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps
    )

    es = utils.EarlyStopping(patience=patience, mode="max")
    print("Training is Starting for fold", fold)

    for epoch in range(epochs):
        if distributed:
            train_sampler.set_epoch(epoch)
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        # if distributed:
        #     jaccard_reduce = reduce_tensor(jaccard)
        #     print("jaccard_reduce:", jaccard_reduce)
        if not distributed or (distributed and torch.distributed.get_rank() == 0):
            print("Jaccard Score = ", jaccard)
            es(jaccard, model, model_path=f"./bin/model_{fold}.bin")
            if es.early_stop:
                print("Early stopping")
                break

    del model, optimizer, scheduler, df_train, df_valid, \
        train_dataset, valid_dataset, train_data_loader, valid_data_loader
    import gc
    gc.collect()
    torch.cuda.empty_cache()
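# Hedged setup sketch (not from the original source): the `distributed` and
# `local_rank` globals used above are assumed to be initialised roughly like this,
# with the script launched via `torchrun --nproc_per_node=<n_gpus> train.py`.
import os
import torch

local_rank = int(os.environ.get("LOCAL_RANK", 0))
distributed = int(os.environ.get("WORLD_SIZE", 1)) > 1
if distributed:
    torch.cuda.set_device(local_rank)
    torch.distributed.init_process_group(backend="nccl", init_method="env://")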