def convert_to_features(df_data, save_path, is_train=False):
    """
    Build (or load cached) ClozeDataset features. Training features are
    pickled in batches of 50,000 items under save_path; eval features are
    pickled as a single file at save_path.
    """
    if is_train:
        if os.path.exists(save_path):
            # Features were already cached: load every pickled batch
            dataset = []
            for root, dirs, files in os.walk(save_path):
                for file in files:
                    dataset.extend(load_pkl_data(os.path.join(root, file)))
        else:
            os.makedirs(save_path)
            dataset = ClozeDataset(
                tokenizer=config.TOKENIZER,
                data_id=df_data.data_id.values,
                tag=df_data.tag.values,
                text=df_data.text.values,
                candidate=df_data.candidate.values,
                groundTruth=df_data.groundTruth.values,
                max_len=config.MAX_LEN
            )
            datas = []
            data = []
            batch_id = 1
            tk = tqdm(dataset, total=len(dataset))
            for bi, item in enumerate(tk):
                data.append(item)
                # Flush a pickle every 50,000 items, and once more at the end
                if len(data) == 50000 or bi == len(dataset) - 1:
                    path = save_path + f"/train_features_{batch_id}.pkl"
                    save_pkl_data(data, path)
                    batch_id += 1
                    datas.extend(data)
                    data = []
            dataset = datas
    else:
        if os.path.exists(save_path):
            dataset = load_pkl_data(save_path)
        else:
            dataset = ClozeDataset(
                tokenizer=config.TOKENIZER,
                data_id=df_data.data_id.values,
                tag=df_data.tag.values,
                text=df_data.text.values,
                candidate=df_data.candidate.values,
                groundTruth=df_data.groundTruth.values,
                max_len=config.MAX_LEN
            )
            tk = tqdm(dataset, total=len(dataset))
            dataset = [item for item in tk]
            save_pkl_data(dataset, save_path)
    return dataset
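# Minimal sketch of the pickle helpers used above, assuming they are thin
# wrappers around the standard pickle module taking a full file path; the
# project's actual save_pkl_data / load_pkl_data may differ.
import pickle

def save_pkl_data(data, path):
    # Serialize `data` to `path` with pickle
    with open(path, "wb") as f:
        pickle.dump(data, f)

def load_pkl_data(path):
    # Deserialize and return the object pickled at `path`
    with open(path, "rb") as f:
        return pickle.load(f)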
def run():
    """
    Run inference on the train and dev sets and write predictions to csv
    """
    # Read train csv and dev csv
    df_train = pd.read_csv(config.TRAIN_FILE)
    df_valid = pd.read_csv(config.DEV_FILE)

    # Instantiate SiameseDataset with training data
    train_dataset = SiameseDataset(query=df_train.sentence1.values,
                                   question=df_train.sentence2.values,
                                   label=df_train.label.values)
    if os.path.exists(config.train_features):
        train_dataset = load_pkl_data(config.train_features)
    else:
        train_dataset = [item for item in train_dataset]
        save_pkl_data(train_dataset, config.train_features)

    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, shuffle=False, batch_size=config.TRAIN_BATCH_SIZE)

    # Instantiate SiameseDataset with validation data
    valid_dataset = SiameseDataset(
        query=df_valid.sentence1.values,
        question=df_valid.sentence2.values,
        label=df_valid.label.values,
    )
    if os.path.exists(config.valid_features):
        valid_dataset = load_pkl_data(config.valid_features)
    else:
        valid_dataset = [item for item in valid_dataset]
        save_pkl_data(valid_dataset, config.valid_features)

    # Instantiate DataLoader with `valid_dataset`
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, shuffle=False)

    # Set device as `cuda` (GPU)
    device = torch.device("cuda")

    # Load pretrained BERT (bert-base-uncased)
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    # Output hidden states
    # This is important to set since we want to concatenate the hidden states
    # from the last 2 BERT layers
    model_config.output_hidden_states = True
    # Instantiate our model with `model_config`
    model = SiameseWmdModel(conf=model_config,
                            pretrained_model_path=config.BERT_PATH)
    # Load the weights saved by train(); without this, predict() would run
    # on a model that has not been fine-tuned
    model.load_state_dict(
        torch.load(config.MODEL_SAVE_PATH, map_location=device))
    # Move the model to the GPU
    model.to(device)

    pred_labels, wmd, acc, f1, auc = predict(train_data_loader, model, device)
    logger.info(f"train set : acc = {acc}, f1 score = {f1}, auc = {auc}")
    df_train["pred_label"] = pred_labels
    df_train["wmd"] = wmd
    df_train.to_csv("../output/train_predict.csv")

    # Sweep candidate thresholds on the dev set and keep the best by F1
    thresholds = [0.25, 0.23]
    best_f1 = 0
    best_th = 0
    for threshold in thresholds:
        pred_labels, wmd, acc, f1, auc = predict(valid_data_loader, model,
                                                 device, threshold)
        logger.info(
            f"dev set : threshold={threshold} acc = {acc}, f1 score = {f1}, auc = {auc}"
        )
        if f1 > best_f1:
            best_f1 = f1
            best_th = threshold
    print(f"best threshold: {best_th} with best f1 {best_f1}")
    df_valid["pred_label"] = pred_labels
    df_valid["wmd"] = wmd
    df_valid.to_csv("../output/dev_predict.csv")
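# Hedged sketch of the predict() interface assumed by run(): it evaluates
# the model over a DataLoader, thresholds the WMD-style distance into binary
# labels, and returns (pred_labels, wmd, acc, f1, auc). The batch keys
# ("ids", "mask", "token_type_ids", "label"), the model call signature, and
# the default threshold are assumptions, not this project's confirmed API.
import numpy as np
import torch
from sklearn import metrics

def predict(data_loader, model, device, threshold=0.25):
    model.eval()
    wmd_scores, labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            # Assumes the model returns one WMD-style distance per pair
            wmd = model(
                ids=batch["ids"].to(device),
                mask=batch["mask"].to(device),
                token_type_ids=batch["token_type_ids"].to(device),
            )
            wmd_scores.extend(wmd.cpu().numpy().tolist())
            labels.extend(batch["label"].numpy().tolist())
    wmd_scores = np.array(wmd_scores)
    labels = np.array(labels)
    # Smaller distance means "more similar", so predict 1 below the threshold
    pred_labels = (wmd_scores < threshold).astype(int)
    acc = metrics.accuracy_score(labels, pred_labels)
    f1 = metrics.f1_score(labels, pred_labels)
    # Use the negated distance as the similarity score for AUC
    auc = metrics.roc_auc_score(labels, -wmd_scores)
    return pred_labels, wmd_scores, acc, f1, auc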
def train():
    """
    Train the model and tune the decision threshold on the dev set
    """
    # Read train csv and dev csv
    df_train = pd.read_csv(config.TRAIN_FILE)
    df_valid = pd.read_csv(config.DEV_FILE)

    # Instantiate SiameseDataset with training data
    train_dataset = SiameseDataset(query=df_train.sentence1.values,
                                   question=df_train.sentence2.values,
                                   label=df_train.label.values)
    if os.path.exists(config.train_features):
        train_dataset = load_pkl_data(config.train_features)
    else:
        train_dataset = [item for item in train_dataset]
        save_pkl_data(train_dataset, config.train_features)

    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, shuffle=True, batch_size=config.TRAIN_BATCH_SIZE)

    # Instantiate SiameseDataset with validation data
    valid_dataset = SiameseDataset(query=df_valid.sentence1.values,
                                   question=df_valid.sentence2.values,
                                   label=df_valid.label.values)
    if os.path.exists(config.valid_features):
        valid_dataset = load_pkl_data(config.valid_features)
    else:
        valid_dataset = [item for item in valid_dataset]
        save_pkl_data(valid_dataset, config.valid_features)

    # Instantiate DataLoader with `valid_dataset`
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE)

    # Set device as `cuda` (GPU)
    device = torch.device("cuda:2")

    # Load pretrained BERT (bert-base-uncased)
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    # Output hidden states
    # This is important to set since we want to concatenate the hidden states
    # from the last 2 BERT layers
    model_config.output_hidden_states = True
    # Instantiate our model with `model_config`
    model = SiameseWmdModel(conf=model_config,
                            pretrained_model_path=config.BERT_PATH)
    # Move the model to the GPU
    model.to(device)

    # Calculate the number of training steps
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    # Get the list of named parameters
    param_optimizer = list(model.named_parameters())
    # Specify parameters where weight decay shouldn't be applied
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Define two sets of parameters: those with weight decay, and those without
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001
        },
        {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
    ]

    # Instantiate AdamW optimizer with our two sets of parameters,
    # and a learning rate of 3e-5
    optimizer = AdamW(optimizer_parameters, lr=3e-5)

    # Create a scheduler to set the learning rate at each training step
    # "Create a schedule with a learning rate that decreases linearly after
    # linearly increasing during a warmup period."
    # (https://pytorch.org/docs/stable/optim.html)
    # Since num_warmup_steps = 0, the learning rate starts at 3e-5 and then
    # decreases linearly at each training step
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # Sweep candidate thresholds; each gets its own early-stopping state
    thresholds = [0.1, 0.15, 0.20]
    best_f1 = 0
    best_th = 0
    for threshold in thresholds:
        # Apply early stopping with patience of 2: stop training new epochs
        # when 2 rounds have passed without any improvement. Re-instantiated
        # per threshold so one threshold's state doesn't leak into the next.
        # Note: early stopping may end training before all config.EPOCHS run.
        es = utils.EarlyStopping(patience=2, mode="max")
        for epoch in range(config.EPOCHS):
            train_fn(train_data_loader, model, optimizer, device,
                     scheduler=scheduler, threshold=threshold)
            acc, f1, auc = eval_fn(valid_data_loader, model, device)
            logger.info(
                f"threshold={threshold} acc = {acc}, f1 score = {f1}, auc = {auc}")
            # Track the best F1 for every threshold, not only when early
            # stopping fires, so thresholds that never stop early still count
            if f1 > best_f1:
                best_f1 = f1
                best_th = threshold
            es(f1, model, model_path=config.MODEL_SAVE_PATH)
            if es.early_stop:
                print("Early stopping ********")
                break
    logger.info(f"best threshold: {best_th}, best f1: {best_f1}")
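# Hedged sketch of the EarlyStopping helper used in train(), assuming the
# common interface seen above: called as es(score, model, model_path=...),
# it checkpoints the model on improvement and sets .early_stop after
# `patience` stale rounds. The project's utils.EarlyStopping may differ.
import torch

class EarlyStopping:
    def __init__(self, patience=2, mode="max", delta=0.0):
        self.patience = patience
        self.mode = mode
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score, model, model_path):
        # For "min" mode (e.g. a loss), compare on the negated value so that
        # "larger is better" holds uniformly below
        metric = score if self.mode == "max" else -score
        if self.best_score is None or metric > self.best_score + self.delta:
            # Improvement: save a checkpoint and reset the stale counter
            self.best_score = metric
            torch.save(model.state_dict(), model_path)
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True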
def save_metrics(self, d):
    for k, v in d.items():
        utils.save_pkl_data(v, f'{k}.p', data_dir=self.model_dir)
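# Hedged sketch of the utils.save_pkl_data variant used by save_metrics and
# main below. Unlike the path-based helper sketched earlier, this one takes a
# filename plus a data_dir keyword; the join-and-create behavior is an
# assumption, and the project's actual utils helper may differ.
import os
import pickle

def save_pkl_data(data, filename, data_dir="."):
    # Create the target directory if needed, then pickle under it
    os.makedirs(data_dir, exist_ok=True)
    with open(os.path.join(data_dir, filename), "wb") as f:
        pickle.dump(data, f)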
def main(premise_hidden_size, hypo_hidden_size, linear_hidden_size,
         interaction_type, device, kind, num_layers=1, bidirectional=True,
         kernel_size=3, lr=1e-4, test=False, model_dir='models'):
    valid_types = ('cat', 'element_wise_mult')
    if interaction_type not in valid_types:
        raise ValueError('interaction_type can only be: ', valid_types)

    # data
    batch_size = 32
    save_freq = 500
    max_epochs = 40
    train_loader, val_loader = data.get_loaders(batch_size, test=test)

    # model
    embed_size = 300
    ind2vec = data.get_table_lookup()
    if kind == 'rnn':
        model = models.SNLI_Model(ind2vec, embed_size, premise_hidden_size,
                                  hypo_hidden_size, linear_hidden_size,
                                  interaction_type, device, kind='rnn',
                                  num_layers=num_layers,
                                  bidirectional=bidirectional)
    else:
        model = models.SNLI_Model(ind2vec, embed_size, premise_hidden_size,
                                  hypo_hidden_size, linear_hidden_size,
                                  interaction_type, device, kind='cnn',
                                  kernel_size=kernel_size)
    model = model.to(device)

    # Optimize only the trainable parameters (frozen ones are skipped)
    optimizer = torch.optim.Adam(
        [param for param in model.parameters() if param.requires_grad], lr=lr)
    loss_fn = torch.nn.CrossEntropyLoss()

    model_name = f'{kind}_model_{premise_hidden_size}_{interaction_type}'
    model_dir = os.path.join(model_dir, model_name)
    train_helper = train_helpers.TrainHelper(device, model, loss_fn,
                                             optimizer,
                                             models.batch_params_key,
                                             model_dir, test)
    train_loss, val_loss, train_acc, val_acc = train_helper.train_loop(
        train_loader, val_loader, max_epochs=max_epochs, save_freq=save_freq)

    # Plot curves only on CPU runs (`device` is expected to be a string)
    if 'cpu' in device:
        os.makedirs('figures', exist_ok=True)
        path = f'figures/{model_name}'
        utils.plot_curves(train_loss, val_loss, train_acc, val_acc, path)

    utils.save_pkl_data(train_loss, 'train_loss.p', data_dir=model_dir)
    utils.save_pkl_data(val_loss, 'val_loss.p', data_dir=model_dir)
    utils.save_pkl_data(train_acc, 'train_acc.p', data_dir=model_dir)
    utils.save_pkl_data(val_acc, 'val_acc.p', data_dir=model_dir)
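# Hypothetical invocation of main(); all argument values below are
# illustrative only, not settings taken from this project. Note that main()
# tests `'cpu' in device`, so `device` is passed as a string.
import torch

if __name__ == "__main__":
    main(
        premise_hidden_size=256,
        hypo_hidden_size=256,
        linear_hidden_size=128,
        interaction_type="cat",
        device="cuda" if torch.cuda.is_available() else "cpu",
        kind="rnn",
        num_layers=1,
        bidirectional=True,
        lr=1e-4,
        test=False,
    )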