def load_tweets_dataset(lang, text_transform, batch_size, val_batch_size):
    """
    Import the PAN17/PAN18 tweet data sets and wrap them in data loaders.
    :param lang: language code of the tweets to load
    :param text_transform: transform applied to each tweet's text
    :param batch_size: batch size for the training loaders
    :param val_batch_size: batch size for the validation loaders
    :return: the four loaders (2017 train/validation, 2018 train/validation)
    """
    # Tweet data set 2017 training
    tweet_dataset_train_17 = dataset.TweetDataset(
        root='./data/', download=True, lang=lang,
        text_transform=text_transform, year=2017, train=True)
    pan17loader_training = torch.utils.data.DataLoader(
        tweet_dataset_train_17, batch_size=batch_size, shuffle=True)

    # Tweet data set 2017 validation
    tweet_dataset_val_17 = dataset.TweetDataset(
        root='./data/', download=True, lang=lang,
        text_transform=text_transform, year=2017, train=False)
    pan17loader_validation = torch.utils.data.DataLoader(
        tweet_dataset_val_17, batch_size=val_batch_size, shuffle=True)

    # Tweet data set 2018 training
    tweet_dataset_train_18 = dataset.TweetDataset(
        root='./data/', download=True, lang=lang,
        text_transform=text_transform, year=2018, train=True)
    pan18loader_training = torch.utils.data.DataLoader(
        tweet_dataset_train_18, batch_size=batch_size, shuffle=True)

    # Tweet data set 2018 validation
    tweet_dataset_val_18 = dataset.TweetDataset(
        root='./data/', download=True, lang=lang,
        text_transform=text_transform, year=2018, train=False)
    pan18loader_validation = torch.utils.data.DataLoader(
        tweet_dataset_val_18, batch_size=val_batch_size, shuffle=True)

    return (pan17loader_training, pan17loader_validation,
            pan18loader_training, pan18loader_validation)
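# A minimal usage sketch for load_tweets_dataset. The language code, the
# identity text transform, and the batch sizes below are placeholder values,
# not settings from the original project.
if __name__ == '__main__':
    train17, val17, train18, val18 = load_tweets_dataset(
        lang='en',
        text_transform=lambda text: text,
        batch_size=64,
        val_batch_size=32,
    )
    print(f'PAN17: {len(train17)} training batches, {len(val17)} validation batches')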
def run(fold, model_name):
    writer = SummaryWriter(log_dir=f'{SAVE_PATH}/',
                           filename_suffix=f'{model_name}-fold{fold}')
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    print(df_train.shape)
    print(df_valid.shape)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f'training on {device}')

    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(params.optimizer_params(model), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    es = utils.EarlyStopping(patience=5, mode="max")

    for epoch in range(config.EPOCHS):
        # The writer must be passed by keyword: a positional argument cannot
        # follow the scheduler keyword argument.
        engine.train_fn(train_data_loader, model, optimizer, device,
                        scheduler=scheduler, writer=writer)
        jaccard = engine.eval_fn(valid_data_loader, model, device, writer)
        print(f"Epoch={epoch}, Jaccard={jaccard}")
        es(jaccard, model, model_path=f"{SAVE_PATH}/{model_name}-f{fold}.pt")
        if es.early_stop:
            print("Early stopping")
            break
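# A minimal sketch of the utils.EarlyStopping helper assumed by run() above:
# it checkpoints the model whenever the monitored score improves and raises
# early_stop after `patience` epochs without improvement. The class name and
# the delta threshold are assumptions; the original implementation may differ.
import torch

class EarlyStoppingSketch:
    def __init__(self, patience=5, mode="max", delta=0.0):
        self.patience = patience
        self.mode = mode
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score, model, model_path):
        if self.mode == "min":
            score = -score  # flip so that larger is always better below
        if self.best_score is None or score > self.best_score + self.delta:
            self.best_score = score
            self.counter = 0
            torch.save(model.state_dict(), model_path)
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True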
def run(df):
    y = df["target"].values
    X = df.drop(["target", "kfold"], axis=1).values

    train_dataset = dataset.TweetDataset(
        tweets=X,
        targets=y
    )
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=2
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using {} device".format(device))

    model = neural_net.NeuralNetwork().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.BCELoss()

    print("Training Model...")
    for epoch in range(config.EPOCHS):
        print(f"Epoch {epoch+1}\n--------------------")
        engine.train(
            train_dataloader,
            model,
            optimizer,
            loss_fn,
            device
        )
    torch.save(model.state_dict(), f"{config.MODEL_PATH}/{config.MODEL_NAME}.pth")
def eval_fn(model, data):
    """Evaluate the model on the eval dataset and print the mean Jaccard score."""
    # Note: don't name this parameter `dataset`; it would shadow the imported
    # `dataset` module used just below.
    tf_data = tf.data.Dataset.from_generator(
        dataset.TweetDataset(data, config.TOKENIZER, config.MAX_LEN).gen,
        output_types=dataset.gen_str).batch(config.VALID_BATCH_SIZE)

    def get_text(text, pred):
        pred_texts = []
        orig_texts = []
        text = text.numpy()
        pred = tf.argmax(pred, axis=1).numpy()
        for t, p in zip(text, pred):
            orig_texts.append(t.decode("utf-8"))
            offsets = config.TOKENIZER.encode(orig_texts[-1]).offsets
            i, j = p[0], p[1]
            pred_texts.append(orig_texts[-1][offsets[i][0]:offsets[j][1]])
        return orig_texts, pred_texts

    scores = []
    for batch, _ in tqdm(tf_data):
        # batch["ext"] holds the gold selected text but is unused here.
        preds = model.predict(batch)
        targets, pred_texts = get_text(batch["orig"], preds)
        # utils.jaccard is assumed to score one (prediction, target) pair.
        scores.extend(utils.jaccard(p, t) for p, t in zip(pred_texts, targets))
    print("Total jaccard score : ", sum(scores) / len(scores))
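# A minimal sketch of the utils.jaccard metric assumed above: the word-level
# Jaccard similarity between two strings, as used in the Kaggle Tweet
# Sentiment Extraction competition. Treating it as a per-pair function is an
# assumption about the original utils module.
def jaccard_sketch(str1, str2):
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))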
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True
    model = models.TweetModel(conf=model_config)
    model.to(device)
    model.load_state_dict(torch.load(
        f'{config.TRAINED_MODEL_PATH}/model_{fold}.bin'))
    model.eval()

    valid_dataset = dataset.TweetDataset(
        tweets=df_valid.text.values,
        sentiments=df_valid.sentiment.values,
        selected_texts=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4,
        shuffle=False)

    jaccard = eval_fn(valid_data_loader, model, device)
    return jaccard
def get_dataloader(
    df_train,
    df_valid,
    max_seq_len=128,
    model_type='roberta',
    dataloader_shuffle=False,
):
    tokenizer = m.init_tokenizer(model_type)  # [!]

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values,
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        model_type=model_type,
    )
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,  # `args` is expected at module level
        shuffle=dataloader_shuffle,
        num_workers=4,
        drop_last=True
    )

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values,
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        model_type=model_type,
    )
    valid_data_loader = DataLoader(
        valid_dataset,
        batch_size=args.batch_size,
        num_workers=2,
        drop_last=True
    )

    return train_data_loader, valid_data_loader
def train_fn(model, data):
    """Train the model on the given data and return it."""
    # checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
    #                              save_best_only=True, mode='max')
    # es = EarlyStopping(monitor='val_acc', baseline=0.85, patience=10,
    #                    verbose=1, mode='max')
    # callbacks_list = [checkpoint]
    tf_data = tf.data.Dataset.from_generator(
        dataset.TweetDataset(data, config.TOKENIZER, config.MAX_LEN).gen,
        output_types=dataset.gen_str).batch(config.TRAIN_BATCH_SIZE)
    # Fit on the batched tf.data pipeline built above.
    model.fit(tf_data, epochs=config.EPOCHS)
    return model
import pandas as pd
from model import BertBaseUncased
import torch
import numpy as np
import string
from tqdm import tqdm

import dataset
import config

TEST_FILE = 'Data/test.csv'
df_test = pd.read_csv(TEST_FILE)
sample = pd.read_csv('Data/sample_submission.csv')

test_dataset = dataset.TweetDataset(
    tweet=df_test.text.values,
    sentiment=df_test.sentiment.values,
    # The data loader requires a selected_text column, so reuse the full
    # text as a placeholder at test time.
    selected_text=df_test.text.values
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=config.VALID_BATCH_SIZE,
    shuffle=False)

device = torch.device('cuda')
print('Running on ', device)
model = BertBaseUncased().to(device)
model.load_state_dict(torch.load('model.bin'))


def test_fn(data_loader, model, device):
    model.eval()
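# The snippet above ends right after model.eval(). A minimal sketch of how
# test_fn could proceed; the batch keys ('ids', 'mask', 'token_type_ids')
# mirror the other snippets in this file and are assumptions here, as is the
# model returning start/end logits.
def test_fn_sketch(data_loader, model, device):
    model.eval()
    all_start, all_end = [], []
    with torch.no_grad():
        for d in tqdm(data_loader, total=len(data_loader)):
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            o_start, o_end = model(ids=ids, mask=mask,
                                   token_type_ids=token_type_ids)
            all_start.append(o_start.cpu())
            all_end.append(o_end.cpu())
    # Concatenate per-batch logits into one tensor per output head.
    return torch.cat(all_start), torch.cat(all_end)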
def run_cv(df, fold):
    train_df = df[df["kfold"] != fold].reset_index(drop=True)
    valid_df = df[df["kfold"] == fold].reset_index(drop=True)

    # y_train = pd.get_dummies(train_df["target"], dtype="int64").values
    y_train = train_df["target"].values
    X_train = train_df.drop(["target", "kfold"], axis=1).values

    # y_valid = pd.get_dummies(valid_df["target"], dtype="int64").values
    y_valid = valid_df["target"].values
    X_valid = valid_df.drop(["target", "kfold"], axis=1).values

    train_dataset = dataset.TweetDataset(
        tweets=X_train,
        targets=y_train
    )
    valid_dataset = dataset.TweetDataset(
        tweets=X_valid,
        targets=y_valid
    )

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=2
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=1
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using {} device".format(device))

    model = neural_net.NeuralNetwork().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.BCELoss()

    print("Training Model...")
    # early_stopping_counter = 0
    for epoch in range(config.EPOCHS):
        print(f"Epoch {epoch+1}\n--------------------")
        engine.train(
            train_dataloader,
            model,
            optimizer,
            loss_fn,
            device
        )
        outputs, targets = engine.evaluate(
            valid_dataloader,
            model,
            loss_fn,
            device
        )
        outputs = np.array(outputs).reshape(-1,)
        outputs = [1 if pred > 0.5 else 0 for pred in outputs]
        valid_score = metrics.f1_score(targets, outputs)
        print(f" F1 Score: {valid_score}\n")
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweets=df_train.text.values,
        sentiments=df_train.sentiment.values,
        selected_texts=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        shuffle=True)

    valid_dataset = dataset.TweetDataset(
        tweets=df_valid.text.values,
        sentiments=df_valid.sentiment.values,
        selected_texts=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4,
        shuffle=False)

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True
    model = models.TweetModel(conf=model_config)
    model = model.to(device)

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': config.WEIGHT_DECAY
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    base_opt = transformers.AdamW(optimizer_parameters,
                                  lr=config.LEARNING_RATE)
    optimizer = torchcontrib.optim.SWA(
        base_opt,
        swa_start=int(num_train_steps * config.SWA_RATIO),
        swa_freq=config.SWA_FREQ,
        swa_lr=None)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=int(num_train_steps * config.WARMUP_RATIO),
        num_training_steps=num_train_steps)

    print(f'Training is starting for fold={fold}')
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device,
                        scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)

    # Swap in the averaged SWA weights once training is done, then save.
    if config.USE_SWA:
        optimizer.swap_swa_sgd()
    torch.save(model.state_dict(),
               f'{config.MODEL_SAVE_PATH}/model_{fold}.bin')

    return jaccard
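# The two-group weight-decay setup above recurs in most run() functions in
# this file. A hypothetical helper (not part of the original modules) that
# factors it out; with it, optimizer_parameters above reduces to
# build_optimizer_parameters(model, config.WEIGHT_DECAY).
def build_optimizer_parameters(model, weight_decay):
    """Split parameters into decayed and non-decayed groups for AdamW."""
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    named = list(model.named_parameters())
    return [
        {"params": [p for n, p in named
                    if not any(nd in n for nd in no_decay)],
         "weight_decay": weight_decay},
        {"params": [p for n, p in named
                    if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]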
def run():
    print('Loading Files...')
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)
    # dfx = dfx.sample(100)
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    print('Files loaded')

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        selected_text=df_train.selected_text.values
    )
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=False
    )

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        selected_text=df_valid.selected_text.values
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        shuffle=False
    )

    device = torch.device('cuda')
    print('Running on ', device)
    model = BertBaseUncased().to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_params = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.003},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.00}
    ]

    num_training_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_params, lr=2e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_dataloader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_dataloader, model, device)
        print(f'Epochs {epoch+1}...', f'Jaccard {jaccard}')
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
        print('Memory Used: ', torch.cuda.memory_allocated() / 1e9, 'GB')
        torch.cuda.empty_cache()
def run():
    dfx = pd.read_csv(config.TRAINING_FILE,
                      nrows=100).dropna().reset_index(drop=True)
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4)

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1)

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
def run():
    df_test = pd.read_csv(config.TEST_FILE)
    df_test.loc[:, 'selected_text'] = df_test.text.values

    device = torch.device('cuda')
    model_config = transformers.RobertaConfig.from_pretrained(
        config.MODEL_CONFIG)
    model_config.output_hidden_states = True

    fold_models = []
    for i in range(config.N_FOLDS):
        model = models.TweetModel(conf=model_config)
        model.to(device)
        model.load_state_dict(
            torch.load(f'{config.TRAINED_MODEL_PATH}/model_{i}.bin'))
        model.eval()
        fold_models.append(model)

    test_dataset = dataset.TweetDataset(
        tweets=df_test.text.values,
        sentiments=df_test.sentiment.values,
        selected_texts=df_test.selected_text.values)
    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4)

    char_pred_test_start = []
    char_pred_test_end = []
    with torch.no_grad():
        tk0 = tqdm.tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d['ids'].to(device, dtype=torch.long)
            token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
            mask = d['mask'].to(device, dtype=torch.long)
            orig_tweet = d['orig_tweet']
            offsets = d['offsets']

            outputs_start_folds = []
            outputs_end_folds = []
            for i in range(config.N_FOLDS):
                outputs_start, outputs_end = fold_models[i](
                    ids=ids, mask=mask, token_type_ids=token_type_ids)
                outputs_start_folds.append(outputs_start)
                outputs_end_folds.append(outputs_end)

            # Average the fold logits, then convert to probabilities.
            outputs_start = sum(outputs_start_folds) / config.N_FOLDS
            outputs_end = sum(outputs_end_folds) / config.N_FOLDS
            outputs_start = torch.softmax(outputs_start,
                                          dim=-1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end,
                                        dim=-1).cpu().detach().numpy()

            for px, tweet in enumerate(orig_tweet):
                char_pred_test_start.append(
                    utils.token_level_to_char_level(tweet, offsets[px],
                                                    outputs_start[px]))
                char_pred_test_end.append(
                    utils.token_level_to_char_level(tweet, offsets[px],
                                                    outputs_end[px]))

    with open('roberta-char_pred_test_start.pkl', 'wb') as handle:
        pickle.dump(char_pred_test_start, handle)
    with open('roberta-char_pred_test_end.pkl', 'wb') as handle:
        pickle.dump(char_pred_test_end, handle)
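# A minimal sketch of the utils.token_level_to_char_level helper assumed
# above: it spreads each token's probability over the characters that token
# covers, so per-fold predictions can be ensembled at the character level.
# The exact original implementation may differ.
import numpy as np

def token_level_to_char_level_sketch(text, offsets, preds):
    probas_char = np.zeros(len(text))
    for i, offset in enumerate(offsets):
        if offset[0] or offset[1]:  # skip special tokens with (0, 0) offsets
            probas_char[offset[0]:offset[1]] = preds[i]
    return probas_char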
def run():
    dfx = pd.read_csv(config.TRAINING_FILE,
                      nrows=30).dropna().reset_index(drop=True)
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=1
    )

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    device = torch.device('cpu')
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        mean_jac = engine.eval_fn(valid_data_loader, model, device)
        print(f"jaccard_score = {mean_jac}")
        if mean_jac > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = mean_jac
for i in range(cfg.K_FOLD):
    _model = m.SentimentExtractor(model_type=cfg.MODEL_TYPE,
                                  dropout_rate=cfg.DROPOUT_RATE,
                                  last_n_layers=cfg.LAST_N_LAYERS,
                                  device=device)
    _model.to(device)
    _model.load_state_dict(torch.load(f'{model_path}/model_{i}.pt'))
    _model.eval()
    models.append(_model)

m.init_tokenizer()

test_dataset = dataset.TweetDataset(
    tweet=test_data.text.values,
    sentiment=test_data.sentiment.values,
    selected_text=test_data.selected_text.values,
    tokenizer=m.tokenizer,
    max_seq_len=max_seq_len,
    model_type=cfg.MODEL_TYPE,
)

id_list = []
answer = []
sentiments = ['positive', 'negative', 'neutral']
scores = []
selected = []

# [START]
with torch.no_grad():
    for idx, d in enumerate(tqdm(test_dataset, desc="test", ncols=80)):
        uniq_id = test_data.textID.iloc[idx]
def run():
    dfx = pd.read_csv(config.TRAINING_FILE,
                      nrows=config.NROWS).dropna().reset_index(drop=True)
    # dfx.sentiment = dfx.sentiment.apply(
    #     lambda x: 1 if x == 'positive' else 0
    # )
    print('Data Loaded')

    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.5,
        random_state=42,
        stratify=dfx.sentiment.values)
    print('Data split into train data and validation data')
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)
    print('Train data preprocessed and made into Tweet Dataset Object')
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=4)
    print('Train dataloader created')

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)
    print('Valid data preprocessed and made into Tweet Dataset Object')
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1)
    print('Valid dataloader created')

    device = config.DEVICE
    conf = transformers.RobertaConfig.from_pretrained(
        f'{config.PATH}roberta-base-config.json')
    conf.output_hidden_states = False
    model = Roberta(conf)
    model.to(device)
    print('Model Object created')

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.001
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = utils.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps)

    best_jaccard = 0
    print('Starting Training....')
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        print(f'Jaccard Score : {jaccard}')
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)

    # Stratified split keeps the sentiment classes balanced across the
    # train and validation sets.
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        target=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4)

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        target=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1)

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    # Specify which parameters to train; bias and LayerNorm parameters are
    # excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    # The learning rate is worth experimenting with.
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # Any scheduler works here; linear warmup is a common default.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps)

    # Wrap the model for multi-GPU training; skip this on a single GPU.
    model = nn.DataParallel(model)

    # The evaluation metric is the Jaccard score.
    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
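# A minimal sketch of the engine.train_fn these run() functions assume: one
# epoch of start/end-index training for the span-extraction model. The batch
# key names and the cross-entropy loss are assumptions based on the
# surrounding snippets; the real engine module may differ.
import torch

def train_fn_sketch(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    loss_fct = torch.nn.CrossEntropyLoss()
    for d in data_loader:
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        targets_start = d["targets_start"].to(device, dtype=torch.long)
        targets_end = d["targets_end"].to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs_start, outputs_end = model(
            ids=ids, mask=mask, token_type_ids=token_type_ids)
        # Sum the losses of the start-index and end-index heads.
        loss = (loss_fct(outputs_start, targets_start)
                + loss_fct(outputs_end, targets_end))
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()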
def run():
    dfx = pd.read_csv(config.TRAINING_FILE,
                      nrows=100).dropna().reset_index(drop=True)
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx["sentiment"].values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train["text"].values,
        sentiment=df_train["sentiment"].values,
        selected_text=df_train["selected_text"].values,
    )
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        # num_workers=4,
    )

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid["text"].values,
        sentiment=df_valid["sentiment"].values,
        selected_text=df_valid["selected_text"].values,
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALIDATION_BATCH_SIZE,
        # num_workers=1,
    )

    device = torch.device("cpu")
    model = BERTBasedUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    # Total optimizer steps: batches per epoch times the number of epochs.
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_dataloader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_dataloader, model, device)
        print(f"Jaccard score : {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
def get_test_loader(df):
    loader = torch.utils.data.DataLoader(dataset.TweetDataset(df))
    return loader
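# A minimal usage sketch for get_test_loader; the file name below is a
# placeholder. Note that the loader above falls back to the DataLoader
# defaults of batch_size=1 and no shuffling.
if __name__ == "__main__":
    df_test = pd.read_csv("test.csv")
    test_loader = get_test_loader(df_test)
    for batch in test_loader:
        pass  # run inference on each single-sample batch here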