def run():
    """Train BERTBaseUncased on the CrowdFlower search-relevance data.

    Loads ``../input/train.csv``, holds out 10% as validation, trains for
    ``configs.EPOCHS`` epochs, and prints per-epoch losses plus the
    quadratic weighted kappa on the validation split.
    """
    df = pd.read_csv('../input/train.csv')
    trn_df, val_df = train_test_split(df, random_state=42, test_size=0.1)
    trn_df = trn_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)

    trn_targets = trn_df['median_relevance'].values
    val_targets = val_df['median_relevance'].values

    def _make_dataset(frame, targets):
        # Wrap one split's columns in the project Dataset type.
        return CrowdFlowerDataset(
            query=frame['query'].values,
            prod_title=frame['product_title'].values,
            prod_description=frame['product_description'].values,
            targets=targets,
        )

    trn_loader = torch.utils.data.DataLoader(
        _make_dataset(trn_df, trn_targets),
        batch_size=configs.TRAIN_BATCH_SIZE,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        _make_dataset(val_df, val_targets),
        batch_size=configs.VALID_BATCH_SIZE,
        shuffle=False,
    )

    # Total optimizer steps over the whole run, used by the LR scheduler.
    num_train_steps = int(len(trn_loader.dataset) / configs.TRAIN_BATCH_SIZE * configs.EPOCHS)

    model = BERTBaseUncased().to(configs.DEVICE)
    optimizer = configs.OPTIMIZER(model.parameters(), lr=configs.LR)
    scheduler = configs.SCHEDULER(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )

    for _epoch in range(configs.EPOCHS):
        started = time.time()
        epoch_train_loss = train_loop_fn(trn_loader, model, optimizer, scheduler)
        outputs, targets, epoch_valid_loss = eval_loop_fn(val_loader, model)
        epoch_time_elapsed = (time.time() - started) / 60.0

        print(f'time take to run a epoch - {epoch_time_elapsed}')
        print(
            f'Epoch - Training loss - {epoch_train_loss} Valid loss - {epoch_valid_loss}'
        )
        qw_kappa = quadratic_weighted_kappa(targets.flatten(), outputs.flatten())
        print(f'Quadratic Weighted Kappa: {qw_kappa}')
def run():
    """Train BERTBaseUncased on the Google QUEST Q&A data.

    Loads ``../input/train.csv``, holds out 10% as validation, trains for
    ``config.EPOCHS`` epochs, and prints per-epoch losses plus the mean
    column-wise Spearman correlation on the validation split.
    """
    data_df = pd.read_csv('../input/train.csv')
    train_df, valid_df = train_test_split(data_df, random_state=42, test_size=0.1)
    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    # Target columns come from the sample submission: everything except the id.
    sample_sub_df = pd.read_csv("../input/sample_submission.csv")
    target_cols = list(sample_sub_df.drop("qa_id", axis=1).columns)
    train_y = train_df[target_cols].values
    valid_y = valid_df[target_cols].values

    train_dataset = BertDataset(
        qtitle=train_df.question_title.values,
        qbody=train_df.question_body.values,
        answer=train_df.answer.values,
        targets=train_y,
    )
    valid_dataset = BertDataset(
        qtitle=valid_df.question_title.values,
        qbody=valid_df.question_body.values,
        answer=valid_df.answer.values,
        targets=valid_y,
    )

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True,
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        shuffle=False,
    )

    # Total optimizer steps over the whole run, used by the LR scheduler.
    num_train_steps = int(len(train_dataset) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    device = config.DEVICE
    model = BERTBaseUncased().to(device)
    optimizer = config.OPTIMIZER(model.parameters(), lr=config.LR)
    scheduler = config.SCHEDULER(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )

    for epoch in range(config.EPOCHS):
        epoch_start = time.time()
        epoch_train_loss = train_loop_fn(train_dataloader, model, optimizer, scheduler)
        outputs, targets, epoch_valid_loss = eval_loop_fn(valid_dataloader, model)
        epoch_end = time.time()
        epoch_time_elapsed = (epoch_end - epoch_start) / 60.0
        print(f'time take to run a epoch - {epoch_time_elapsed}')
        print(f'Epoch - Training loss - {epoch_train_loss} Valid loss - {epoch_valid_loss}')

        # Mean column-wise Spearman correlation over all target columns.
        # BUGFIX: the original did `coef, _ = np.nan_to_num(stats.spearmanr(p1, p2))`,
        # which applies nan_to_num to the whole result object and relies on NumPy
        # coercing it to an array (fragile with SciPy's newer SignificanceResult,
        # and it needlessly "repairs" the discarded p-value). Unpack first, then
        # sanitize only the correlation scalar; a NaN coefficient (e.g. a constant
        # column) is treated as 0.
        spear = []
        for jj in range(targets.shape[1]):
            coef, _ = stats.spearmanr(targets[:, jj], outputs[:, jj])
            spear.append(np.nan_to_num(coef))
        spear = np.mean(spear)
        print(f"Spearman coeff : {spear}")