def cp_cnn_moa_train_prediction(self):
    """Train the Cell Painting 1D-CNN with K-fold CV and save its predictions.

    Loads the level-4 train/test profiles and the target labels from
    ``self.data_dir`` (file names depend on ``self.shuffle`` and
    ``self.subsample``), trains one ``CNN_Model`` per fold, checkpoints the
    weights with the lowest validation loss for each fold, and averages the
    per-fold test predictions. Out-of-fold (train) and test predictions are
    passed to ``model_eval_results`` and written to ``self.model_pred_dir``
    via ``save_to_csv``.

    Instance attributes read: ``shuffle``, ``subsample``, ``data_dir``,
    ``model_pred_dir``, ``output_file_indicator``, ``BATCH_SIZE``,
    ``LEARNING_RATE`` and ``EPOCHS``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    print("Is GPU Available?")
    if torch.cuda.is_available():
        print("Yes, GPU is Available!!")
    else:
        print("No, GPU is NOT Available!!", "\n")

    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    no_of_components = 25
    NFOLDS = 5
    WEIGHT_DECAY = 1e-5
    EARLY_STOPPING_STEPS = 10
    EARLY_STOP = False
    hidden_size = 4096

    # Directory / file names (adjusted by check_if_shuffle_data according to
    # self.shuffle).
    model_file_name = "cp_1dcnn"
    model_dir_name = "cp_cnn_model"
    trn_pred_name = 'cp_train_preds_1dcnn'
    tst_pred_name = 'cp_test_preds_1dcnn'
    model_file_name, model_dir_name, trn_pred_name, tst_pred_name = \
        check_if_shuffle_data(self.shuffle, model_file_name, model_dir_name,
                              trn_pred_name, tst_pred_name)
    model_dir = os.path.join(self.data_dir, model_dir_name)
    os.makedirs(model_dir, exist_ok=True)

    # Input file names: only the train file depends on `shuffle`; the
    # "_subsample" tag applies to train, test and target files alike.
    train_stem = ("train_shuffle_lvl4_data" if self.shuffle
                  else "train_lvl4_data")
    subsample_tag = "_subsample" if self.subsample else ""
    input_train_file = os.path.join(
        self.data_dir, f"{train_stem}{subsample_tag}.csv.gz")
    input_test_file = os.path.join(
        self.data_dir, f"test_lvl4_data{subsample_tag}.csv.gz")
    input_target_file = os.path.join(
        self.data_dir, f"target_labels{subsample_tag}.csv")

    df_train = pd.read_csv(input_train_file, compression='gzip',
                           low_memory=False)
    df_test = pd.read_csv(input_test_file, compression='gzip',
                          low_memory=False)
    df_targets = pd.read_csv(input_target_file)

    metadata_cols = [
        'Metadata_broad_sample', 'Metadata_pert_id', 'Metadata_Plate',
        'Metadata_Well', 'Metadata_broad_id', 'Metadata_moa', 'broad_id',
        'pert_iname', 'moa', 'replicate_name', 'Metadata_dose_recode'
    ]
    # The first column of the target file is skipped; the rest are targets.
    target_cols = df_targets.columns[1:]
    df_train_x, df_train_y, df_test_x, df_test_y = split_data(
        df_train, df_test, metadata_cols, target_cols)
    features = df_train_x.columns.tolist()
    # NOTE(review): presumably `preprocess` appends `no_of_components` extra
    # columns to the raw features, hence the sum -- confirm against helper.
    num_features = len(features) + no_of_components
    num_targets = len(target_cols)

    df_train = drug_stratification(df_train, NFOLDS, target_cols,
                                   col_name='replicate_name',
                                   cpd_freq_num=24)
    pos_weight = initialize_weights(df_train, target_cols, DEVICE)

    def model_train_pred(fold, Model=CNN_Model):
        """Train one fold; return its out-of-fold and test predictions."""
        model_path = os.path.join(model_dir,
                                  f"{model_file_name}_FOLD{fold}.pth")
        (x_fold_train, y_fold_train, x_fold_val, y_fold_val,
         df_test_x_copy, val_idx) = preprocess(
             fold, df_train, df_train_x, df_train_y, df_test_x,
             no_of_components)

        train_dataset = TrainDataset(x_fold_train.values,
                                     y_fold_train.values)
        valid_dataset = TrainDataset(x_fold_val.values, y_fold_val.values)
        trainloader = torch.utils.data.DataLoader(
            train_dataset, batch_size=self.BATCH_SIZE, shuffle=True)
        validloader = torch.utils.data.DataLoader(
            valid_dataset, batch_size=self.BATCH_SIZE, shuffle=False)

        model = Model(num_features=num_features, num_targets=num_targets,
                      hidden_size=hidden_size)
        model.to(DEVICE)

        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=self.LEARNING_RATE,
                                     weight_decay=WEIGHT_DECAY, eps=1e-9)
        scheduler = optim.lr_scheduler.OneCycleLR(
            optimizer=optimizer, pct_start=0.1, div_factor=1e3, max_lr=1e-2,
            epochs=self.EPOCHS, steps_per_epoch=len(trainloader))

        # Training uses the smoothed, class-weighted loss; validation uses
        # plain BCE-with-logits.
        loss_train = SmoothBCEwLogits(smoothing=0.001, pos_weight=pos_weight)
        loss_val = nn.BCEWithLogitsLoss()

        early_stopping_steps = EARLY_STOPPING_STEPS
        early_step = 0
        oof = np.zeros(df_train_y.shape)
        best_loss = np.inf
        best_loss_epoch = -1

        for epoch in range(self.EPOCHS):
            train_loss = train_fn(model, optimizer, scheduler, loss_train,
                                  trainloader, DEVICE)
            valid_loss, valid_preds = valid_fn(model, loss_val, validloader,
                                               DEVICE)
            if valid_loss < best_loss:
                # New best epoch: keep its validation predictions and weights.
                best_loss = valid_loss
                best_loss_epoch = epoch
                oof[val_idx] = valid_preds
                torch.save(model.state_dict(), model_path)
            elif EARLY_STOP:
                # NOTE(review): early_step is never reset on improvement, so
                # it counts all non-improving epochs, not consecutive ones.
                early_step += 1
                if early_step >= early_stopping_steps:
                    break

            # Adjacent f-strings instead of a backslash continuation inside
            # the literal, which leaked source indentation into the output.
            print(f"FOLD: {fold}, EPOCH: {epoch}, "
                  f"train_loss: {train_loss:.6f}, "
                  f"valid_loss: {valid_loss:.6f}, "
                  f"best_loss: {best_loss:.6f}, "
                  f"best_loss_epoch: {best_loss_epoch}")

        # --------------------- PREDICTION ---------------------
        testdataset = TestDataset(df_test_x_copy.values)
        testloader = torch.utils.data.DataLoader(
            testdataset, batch_size=self.BATCH_SIZE, shuffle=False)

        # Reload the best checkpoint of this fold into a fresh model.
        model = Model(num_features=num_features, num_targets=num_targets,
                      hidden_size=hidden_size)
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.to(DEVICE)

        predictions = inference_fn(model, testloader, DEVICE)
        return oof, predictions

    def run_k_fold(NFOLDS):
        """Run every fold; sum the oof arrays and average test predictions."""
        oof = np.zeros(df_train_y.shape)
        predictions = np.zeros(df_test_y.shape)
        for fold in range(NFOLDS):
            oof_, pred_ = model_train_pred(fold)
            predictions += pred_ / NFOLDS
            # Each fold only fills its own validation rows, so summing the
            # per-fold arrays assembles the full out-of-fold matrix.
            oof += oof_
        return oof, predictions

    oofs_, predictions_ = run_k_fold(NFOLDS)
    df_oofs = pd.DataFrame(oofs_, columns=df_train_y.columns)
    df_preds = pd.DataFrame(predictions_, columns=df_test_y.columns)

    model_eval_results(df_train_y, oofs_, df_test, df_test_y, df_preds,
                       target_cols)
    save_to_csv(df_preds, self.model_pred_dir,
                f"{tst_pred_name}{self.output_file_indicator}.csv")
    save_to_csv(df_oofs, self.model_pred_dir,
                f"{trn_pred_name}{self.output_file_indicator}.csv.gz",
                compress="gzip")
    print(
        "\n All is set, Train and Test predictions have been read as csv files into the model predictions directory!!"
    )
def L1000_nn_moa_train_prediction(self):
    """Train the L1000 simple NN over several seeds and folds; save predictions.

    Loads the level-4 L1000 train/test profiles and the target labels from
    ``self.data_dir`` (the train file depends on ``self.shuffle``), transforms
    the features with ``umap_factor_features``, then for every seed in
    ``range(NSEEDS)`` trains one ``SimpleNN_Model`` per fold. For each
    seed/fold the weights with the lowest validation loss are checkpointed
    and reloaded for test inference. Predictions are averaged over folds and
    seeds, passed to ``model_eval_results`` and written to
    ``self.model_pred_dir`` via ``save_to_csv``.

    Instance attributes read: ``shuffle``, ``data_dir``, ``model_pred_dir``,
    ``BATCH_SIZE``, ``LEARNING_RATE`` and ``EPOCHS``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    print("Is GPU Available?")
    if torch.cuda.is_available():
        print("Yes, GPU is Available!!")
    else:
        print("No, GPU is NOT Available!!", "\n")

    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    no_of_compts = 50
    no_of_dims = 25
    IS_TRAIN = True
    NSEEDS = 5
    SEED = range(NSEEDS)
    NFOLDS = 5
    WEIGHT_DECAY = 1e-5
    EARLY_STOPPING_STEPS = 10
    EARLY_STOP = False
    hidden_size = 1024

    # Directory / file names (adjusted by check_if_shuffle_data according to
    # self.shuffle).
    model_file_name = "L1000_simplenn"
    model_dir_name = "L1000_simplenn_model"
    trn_pred_name = 'L1000_train_preds_simplenn'
    tst_pred_name = 'L1000_test_preds_simplenn'
    model_file_name, model_dir_name, trn_pred_name, tst_pred_name = \
        check_if_shuffle_data(self.shuffle, model_file_name, model_dir_name,
                              trn_pred_name, tst_pred_name)
    model_dir = os.path.join(self.data_dir, model_dir_name)
    os.makedirs(model_dir, exist_ok=True)

    train_file = ('train_shuffle_lvl4_data.csv.gz' if self.shuffle
                  else 'train_lvl4_data.csv.gz')
    df_train = pd.read_csv(os.path.join(self.data_dir, train_file),
                           compression='gzip', low_memory=False)
    df_test = pd.read_csv(os.path.join(self.data_dir,
                                       'test_lvl4_data.csv.gz'),
                          compression='gzip', low_memory=False)
    df_targets = pd.read_csv(
        os.path.join(self.data_dir, 'target_labels.csv'))

    metadata_cols = [
        'Metadata_broad_sample', 'pert_id', 'pert_idose', 'replicate_id',
        'pert_iname', 'moa', 'sig_id', 'det_plate', 'dose', 'det_well'
    ]
    # The first column of the target file is skipped; the rest are targets.
    target_cols = df_targets.columns[1:]
    df_train_x, df_train_y, df_test_x, df_test_y = split_data(
        df_train, df_test, metadata_cols, target_cols)
    df_train_x, df_test_x = umap_factor_features(df_train_x, df_test_x,
                                                 no_of_compts, no_of_dims)

    features = df_train_x.columns.tolist()
    num_features = len(features)
    num_targets = len(target_cols)

    df_train = drug_stratification(df_train, NFOLDS, target_cols,
                                   col_name='replicate_id', cpd_freq_num=24)
    # NOTE(review): pos_weight is computed but not passed to the training
    # loss below -- confirm whether that is intentional.
    pos_weight = initialize_weights(df_train, target_cols, DEVICE)

    def model_train_pred(fold, seed):
        """Train one (seed, fold); return its oof and test predictions."""
        seed_everything(seed)

        model_path = os.path.join(
            model_dir, model_file_name + f"_SEED{seed}_FOLD{fold}.pth")
        # Rows whose 'fold' column equals `fold` form the validation split.
        trn_idx = df_train[df_train['fold'] != fold].index
        val_idx = df_train[df_train['fold'] == fold].index

        x_fold_train = df_train_x.loc[trn_idx].reset_index(drop=True).copy()
        y_fold_train = df_train_y.loc[trn_idx].reset_index(drop=True).copy()
        x_fold_val = df_train_x.loc[val_idx].reset_index(drop=True).copy()
        y_fold_val = df_train_y.loc[val_idx].reset_index(drop=True).copy()
        df_test_x_copy = df_test_x.copy()
        x_fold_train, x_fold_val, df_test_x_copy = normalize(
            x_fold_train, x_fold_val, df_test_x_copy)

        train_dataset = TrainDataset(x_fold_train.values,
                                     y_fold_train.values)
        valid_dataset = TrainDataset(x_fold_val.values, y_fold_val.values)
        trainloader = torch.utils.data.DataLoader(
            train_dataset, batch_size=self.BATCH_SIZE, shuffle=True)
        validloader = torch.utils.data.DataLoader(
            valid_dataset, batch_size=self.BATCH_SIZE, shuffle=False)

        model = SimpleNN_Model(num_features=num_features,
                               num_targets=num_targets,
                               hidden_size=hidden_size)
        model.to(DEVICE)

        optimizer = torch.optim.Adam(model.parameters(),
                                     weight_decay=WEIGHT_DECAY,
                                     lr=self.LEARNING_RATE)
        scheduler = optim.lr_scheduler.OneCycleLR(
            optimizer=optimizer, pct_start=0.2, div_factor=1e3, max_lr=1e-2,
            epochs=self.EPOCHS, steps_per_epoch=len(trainloader))
        loss_train = SmoothBCEwLogits(smoothing=0.001)
        loss_val = nn.BCEWithLogitsLoss()

        early_stopping_steps = EARLY_STOPPING_STEPS
        early_step = 0
        oof = np.zeros(df_train_y.shape)
        best_loss = np.inf
        best_loss_epoch = -1

        if IS_TRAIN:
            for epoch in range(self.EPOCHS):
                train_loss = train_fn(model, optimizer, scheduler,
                                      loss_train, trainloader, DEVICE)
                valid_loss, valid_preds = valid_fn(model, loss_val,
                                                   validloader, DEVICE)
                if valid_loss < best_loss:
                    # New best epoch: keep its predictions and weights.
                    best_loss = valid_loss
                    best_loss_epoch = epoch
                    # val_idx holds index labels of df_train and is used
                    # positionally here -- assumes a default 0..n-1 index.
                    oof[val_idx] = valid_preds
                    torch.save(model.state_dict(), model_path)
                elif EARLY_STOP:
                    # NOTE(review): early_step is never reset on
                    # improvement, so it counts all non-improving epochs.
                    early_step += 1
                    if early_step >= early_stopping_steps:
                        break

                if epoch % 10 == 0 or epoch == self.EPOCHS - 1:
                    # Adjacent f-strings instead of backslash continuations
                    # inside the literal, which leaked source indentation
                    # into the printed message.
                    print(f"seed: {seed}, FOLD: {fold}, EPOCH: {epoch}, "
                          f"train_loss: {train_loss:.6f}, "
                          f"valid_loss: {valid_loss:.6f}, "
                          f"best_loss: {best_loss:.6f}, "
                          f"best_loss_epoch: {best_loss_epoch}")

        # --------------------- PREDICTION ---------------------
        testdataset = TestDataset(df_test_x_copy.values)
        testloader = torch.utils.data.DataLoader(
            testdataset, batch_size=self.BATCH_SIZE, shuffle=False)

        # Reload the best checkpoint of this seed/fold into a fresh model.
        model = SimpleNN_Model(num_features=num_features,
                               num_targets=num_targets,
                               hidden_size=hidden_size)
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.to(DEVICE)

        if not IS_TRAIN:
            # Inference-only mode: rebuild the out-of-fold predictions from
            # the saved checkpoint. The original referenced an undefined
            # `loss_fn` here, which raised NameError; the validation loss is
            # the one used for this call everywhere else.
            valid_loss, valid_preds = valid_fn(model, loss_val, validloader,
                                               DEVICE)
            oof[val_idx] = valid_preds

        predictions = inference_fn(model, testloader, DEVICE)
        return oof, predictions

    def run_k_fold(folds, seed):
        """Run every fold for one seed; sum oof, average test predictions."""
        oof = np.zeros(df_train_y.shape)
        predictions = np.zeros(df_test_y.shape)
        for fold in range(folds):
            oof_, pred_ = model_train_pred(fold, seed)
            predictions += pred_ / folds
            # Each fold only fills its own validation rows, so summing the
            # per-fold arrays assembles the full out-of-fold matrix.
            oof += oof_
        return oof, predictions

    oofs = np.zeros(df_train_y.shape)
    predictions = np.zeros(df_test_y.shape)

    time_start = time.time()
    for seed in SEED:
        oofs_, predictions_ = run_k_fold(NFOLDS, seed)
        oofs += oofs_ / len(SEED)
        predictions += predictions_ / len(SEED)
        # Cumulative wall-clock time since the first seed started.
        print(f"elapsed time: {time.time() - time_start}")

    df_oofs = pd.DataFrame(oofs, columns=df_train_y.columns)
    df_preds = pd.DataFrame(predictions, columns=df_test_y.columns)

    model_eval_results(df_train_y, oofs, df_test, df_test_y, df_preds,
                       target_cols)
    save_to_csv(df_preds, self.model_pred_dir, f"{tst_pred_name}.csv")
    save_to_csv(df_oofs, self.model_pred_dir, f"{trn_pred_name}.csv.gz",
                compress="gzip")
    print(
        "\n All is set, Train and Test predictions have been read as csv files into the model predictions directory!!"
    )
def L1000_tabnet_moa_train_pred(self):
    """Fit a TabNet regressor per fold on L1000 data and save predictions.

    Reads the level-4 L1000 train/test profiles and the target labels from
    ``self.data_dir`` (the train file depends on ``self.shuffle``), adds
    features via ``add_stat_feats``, and fits one ``TabNetRegressor`` per
    fold. The sigmoid of each model's output gives the out-of-fold and test
    predictions; test predictions are averaged over folds. Results are
    passed to ``model_eval_results`` and written to ``self.model_pred_dir``
    via ``save_to_csv``.

    Instance attributes read: ``shuffle``, ``data_dir``, ``model_pred_dir``,
    ``BATCH_SIZE``, ``LEARNING_RATE`` and ``EPOCHS``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    print("Is GPU Available?")
    gpu_ready = torch.cuda.is_available()
    if not gpu_ready:
        print("No, GPU is NOT Available!!", "\n")
    else:
        print("Yes, GPU is Available!!")

    device = 'cuda' if gpu_ready else 'cpu'
    n_components = 25
    n_folds = 5

    # Directory / file names; check_if_shuffle_data may rewrite them
    # depending on self.shuffle.
    (model_file_name, model_dir_name, trn_pred_name,
     tst_pred_name) = check_if_shuffle_data(
         self.shuffle, "L1000_tabnet", "L1000_tabnet_model",
         'L1000_train_preds_tabnet', 'L1000_test_preds_tabnet')
    model_dir = os.path.join(self.data_dir, model_dir_name)
    os.makedirs(model_dir, exist_ok=True)

    train_csv = ('train_shuffle_lvl4_data.csv.gz' if self.shuffle
                 else 'train_lvl4_data.csv.gz')
    df_train = pd.read_csv(os.path.join(self.data_dir, train_csv),
                           compression='gzip', low_memory=False)
    df_test = pd.read_csv(
        os.path.join(self.data_dir, 'test_lvl4_data.csv.gz'),
        compression='gzip', low_memory=False)
    df_targets = pd.read_csv(
        os.path.join(self.data_dir, 'target_labels.csv'))

    metadata_cols = [
        'Metadata_broad_sample', 'pert_id', 'pert_idose', 'replicate_id',
        'pert_iname', 'moa', 'sig_id', 'det_plate', 'dose', 'det_well'
    ]
    # Every column of the target file except the first is a target.
    target_cols = df_targets.columns[1:]
    df_train_x, df_train_y, df_test_x, df_test_y = split_data(
        df_train, df_test, metadata_cols, target_cols)
    df_train_x = add_stat_feats(df_train_x)
    df_test_x = add_stat_feats(df_test_x)

    df_train = drug_stratification(df_train, n_folds, target_cols,
                                   col_name='replicate_id', cpd_freq_num=24)
    pos_weight = initialize_weights(df_train, target_cols, device)
    # NOTE(review): wgt_bce is not referenced again in this method. If `dp`
    # is copy.deepcopy, functions are returned as the same object, so the
    # assignment below would change the defaults of
    # F.binary_cross_entropy_with_logits itself -- confirm intent.
    wgt_bce = dp(F.binary_cross_entropy_with_logits)
    wgt_bce.__defaults__ = (None, None, None, 'mean', pos_weight)

    def _sigmoid(raw):
        """Map raw model outputs to (0, 1)."""
        return 1 / (1 + np.exp(-raw))

    def model_train_pred(fold):
        """Fit TabNet on one fold; return its oof and test predictions."""
        tabnet_params = {
            "n_d": 64,
            "n_a": 128,
            "n_steps": 1,
            "gamma": 1.3,
            "lambda_sparse": 0,
            "n_independent": 2,
            "n_shared": 1,
            "optimizer_fn": optim.Adam,
            "optimizer_params": {"lr": self.LEARNING_RATE,
                                 "weight_decay": 1e-5},
            "mask_type": "entmax",
            "scheduler_params": {"mode": "min", "patience": 10,
                                 "min_lr": 1e-5, "factor": 0.9},
            "scheduler_fn": ReduceLROnPlateau,
            "verbose": 10,
        }
        checkpoint = os.path.join(model_dir,
                                  f"{model_file_name}_FOLD{fold}.pth")

        (x_trn, y_trn, x_val, y_val, x_test, val_idx) = preprocess(
            fold, df_train, df_train_x, df_train_y, df_test_x, n_components)
        x_trn, x_val, x_test = variance_threshold(x_trn, x_val, x_test)

        # --- Fit ---
        model = TabNetRegressor(**tabnet_params)
        smooth_loss = SmoothBCEwLogits(smoothing=0.001,
                                       pos_weight=pos_weight)
        model.fit(X_train=x_trn.values,
                  y_train=y_trn.values,
                  eval_set=[(x_val.values, y_val.values)],
                  eval_name=["val"],
                  eval_metric=["logits_ll"],
                  max_epochs=self.EPOCHS,
                  patience=40,
                  batch_size=self.BATCH_SIZE,
                  virtual_batch_size=32,
                  num_workers=1,
                  drop_last=False,
                  loss_fn=smooth_loss)

        # --- Prediction ---
        fold_oof = np.zeros(df_train_y.shape)
        # Only this fold's validation rows are filled; the rest stay zero.
        fold_oof[val_idx] = _sigmoid(model.predict(x_val.values))
        fold_test = _sigmoid(model.predict(x_test.values))

        model.save_model(checkpoint)
        return fold_oof, fold_test

    def run_k_fold(NFOLDS):
        """Run all folds; sum the oof arrays and average test predictions."""
        oof_total = np.zeros(df_train_y.shape)
        test_mean = np.zeros(df_test_y.shape)
        for k in range(NFOLDS):
            fold_oof, fold_test = model_train_pred(k)
            test_mean += fold_test / NFOLDS
            oof_total += fold_oof
        return oof_total, test_mean

    oofs_, predictions_ = run_k_fold(n_folds)
    df_oofs = pd.DataFrame(oofs_, columns=df_train_y.columns)
    df_preds = pd.DataFrame(predictions_, columns=df_test_y.columns)

    model_eval_results(df_train_y, oofs_, df_test, df_test_y, df_preds,
                       target_cols)
    save_to_csv(df_preds, self.model_pred_dir, tst_pred_name + ".csv")
    save_to_csv(df_oofs, self.model_pred_dir, trn_pred_name + ".csv.gz",
                compress="gzip")
    print(
        "\n All is set, Train and Test predictions have been read as csv files into the model predictions directory!!"
    )