import xgboost as xgb

# open_model, prepare and prepare_df are project helpers assumed to be
# importable from the surrounding package


def predict(model_path, datapath, train_data_path=None, proba=False, robust=True):
    """Return model predictions for a selected dataframe

    :param model_path: path to the model (ie. "results/ISV_gain.json.gz")
    :param datapath: path to the dataframe to be predicted
    :param train_data_path: path to training dataframe - only necessary if
        predicting data other than train/val/test
    :param proba: return probabilities
    :param robust: use robust scaling. Otherwise MinMax is used
    :returns: (yhat, y): predicted and real values
    """
    cnv_type = ['loss', 'gain'][('gain' in model_path) * 1]
    logtransform = (model_path.split('_')[-1].split('.')[0] == 'log')

    model = open_model(model_path)

    if 'train' in datapath or 'validation' in datapath:
        X_train, Y_train, X_val, Y_val = prepare_df(cnv_type, logtransform,
                                                    robustscaler=robust)
        if 'train' in datapath:
            X, y = X_train, Y_train
        else:
            X, y = X_val, Y_val
    else:
        X, y = prepare(cnv_type, logtransform=logtransform, robustscaler=robust,
                       data_path=datapath, train_data_path=train_data_path)

    if proba:
        if isinstance(model, xgb.core.Booster):
            X_dmat = xgb.DMatrix(X)
            yhat = model.predict(X_dmat)
        else:
            yhat = model.predict_proba(X)[:, 1]
    else:
        if isinstance(model, xgb.core.Booster):
            X_dmat = xgb.DMatrix(X)
            yhat = model.predict(X_dmat)
            yhat = (yhat > 0.5) * 1
        else:
            yhat = model.predict(X)

    return yhat, y
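# %% Usage sketch (the paths below are illustrative, following the docstring
# example; assumes a trained gain model and a matching validation table exist
# at these locations):
yhat, y = predict('results/ISV_gain.json.gz', 'data/validation_gain.tsv.gz',
                  proba=True)
print('validation accuracy:', ((yhat > 0.5) * 1 == y).mean())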
# %%
import pandas as pd
import numpy as np

from scripts.ml.prepare_df import prepare_df
from scripts.constants import LOSS_ATTRIBUTES, GAIN_ATTRIBUTES, HUMAN_READABLE

# %%
final = []

for i, cnv_type in enumerate(['loss', 'gain']):
    attributes = [LOSS_ATTRIBUTES, GAIN_ATTRIBUTES][(cnv_type == 'gain') * 1]

    # translate to human readable
    attributes = [HUMAN_READABLE[i] for i in attributes]

    train_X, train_Y, val_X, val_Y = prepare_df(cnv_type, raw=True)

    X = pd.DataFrame(np.concatenate([train_X, val_X]), columns=attributes)
    X["y"] = ['Pathogenic' if i == 1 else "Benign"
              for i in np.concatenate([train_Y, val_Y])]

    benign = X.query("y == 'Benign'").drop("y", axis=1)
    pathogenic = X.query("y == 'Pathogenic'").drop("y", axis=1)

    bmean = np.round(benign.mean(axis=0).values.reshape(-1, 1), 2)
    bstd = np.round(benign.std(axis=0).values.reshape(-1, 1), 2)
    bmax = np.round(benign.max(axis=0).values.reshape(-1, 1), 2)

    pmean = np.round(pathogenic.mean(axis=0).values.reshape(-1, 1), 2)
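    # Sketch of a plausible continuation (the excerpt is cut off above; the
    # pstd/pmax names and the stacked "summary" layout below are illustrative,
    # not the original code, but they show what the unused `final` list and
    # the reshaped (-1, 1) columns above are set up for):
    pstd = np.round(pathogenic.std(axis=0).values.reshape(-1, 1), 2)
    pmax = np.round(pathogenic.max(axis=0).values.reshape(-1, 1), 2)

    summary = pd.DataFrame(np.hstack([bmean, bstd, bmax, pmean, pstd, pmax]),
                           index=attributes,
                           columns=['benign_mean', 'benign_std', 'benign_max',
                                    'pathogenic_mean', 'pathogenic_std',
                                    'pathogenic_max'])
    final.append(summary)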
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scripts.ml.prepare_df import prepare_df
from scripts.constants import LOSS_ATTRIBUTES, GAIN_ATTRIBUTES, HUMAN_READABLE

# cnv_type is assumed to be set earlier in the script (e.g. from a snakemake
# wildcard)
attributes = [LOSS_ATTRIBUTES, GAIN_ATTRIBUTES][(cnv_type == 'gain') * 1]

# translate to human readable
attributes = [HUMAN_READABLE[i] for i in attributes]

# open saved shap values
# with open(f'data/shap_data/shap_validation_{cnv_type}.pkl', 'rb') as f:
with open(snakemake.input.shap_data, 'rb') as f:
    shap_values = pickle.load(f)

shap_values.feature_names = attributes

# y = pd.read_csv(f'data/validation_{cnv_type}.tsv.gz', sep='\t', compression='gzip')
y = pd.read_csv(snakemake.input.data, sep='\t', compression='gzip')
y = y.clinsig.values

_, _, orig, _ = prepare_df(cnv_type, logtransform=True)  # load original dataframe
orig = pd.DataFrame(orig, columns=attributes)
orig["y"] = y

sv = pd.DataFrame(shap_values.values, columns=attributes)
sv = sv.iloc[:, np.argsort(np.mean(np.abs(sv), axis=0))[::-1]]
sv['y'] = ['Pathogenic' if i == 1 else "Benign" for i in y]

# swarmplot with discrete hue
fig, ax = plt.subplots(1, 1, figsize=(12, 14))

temp = sv.melt(id_vars="y")
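# Sketch of the plot the melt above prepares (assumption: a seaborn
# strip/swarm plot of SHAP values per feature, coloured by clinical
# significance; the exact call and styling are illustrative):
import seaborn as sns

sns.stripplot(x='value', y='variable', hue='y', data=temp, ax=ax, size=2,
              palette={"Pathogenic": "red", "Benign": "green"}, dodge=True)
ax.set_xlabel('SHAP value')
ax.set_ylabel('')
fig.tight_layout()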
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from matplotlib import rcParams

from scripts.ml.prepare_df import prepare_df
from scipy.stats import pointbiserialr
from scripts.constants import DPI, LOSS_ATTRIBUTES, GAIN_ATTRIBUTES, HUMAN_READABLE

rcParams.update({'font.size': 18})

# %%
fig, ax = plt.subplots(1, 2, figsize=(25, 14))

for i, cnv_type in enumerate(['loss', 'gain']):
    attributes = [LOSS_ATTRIBUTES, GAIN_ATTRIBUTES][(cnv_type == 'gain') * 1]

    # translate to human readable
    attributes = [HUMAN_READABLE[i] for i in attributes]

    train_X, train_Y, val_X, val_Y = prepare_df(cnv_type, logtransform=True)

    X = pd.DataFrame(np.concatenate([train_X, val_X]), columns=attributes)
    X["y"] = ['Pathogenic' if i == 1 else "Benign"
              for i in np.concatenate([train_Y, val_Y])]

    sns.violinplot(x='value', y='variable', hue="y", data=X.melt(id_vars="y"),
                   ax=ax[i], palette={"Pathogenic": "red", "Benign": "green"},
                   split=True)

    ax[i].set_xlabel('log(value)')
    ax[i].set_ylabel('')
    ax[i].set_title('copy number ' + cnv_type)
    ax[i].legend(title='Clinical Significance')

fig.tight_layout()
plt.savefig(snakemake.output.distributions, dpi=DPI)
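# %% pointbiserialr is imported above but unused in this excerpt; a sketch of
# the per-attribute correlation report it suggests (illustrative; operates on
# the X and attributes left over from the last loop iteration):
labels = (X["y"] == 'Pathogenic').astype(int).values
for attr in attributes:
    r, p = pointbiserialr(labels, X[attr].values)
    print(f'{attr}: point-biserial r = {r:.2f} (p = {p:.3g})')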
import sys
import pathlib

filepath_list = str(pathlib.Path(__file__).parent.absolute()).split('/')
ind = filepath_list.index('scripts')

sys.path.insert(1, '/'.join(filepath_list[:ind]))

# %%
from scripts.ml.prepare_df import prepare_df
import xgboost as xgb
import numpy as np
from sklearn.metrics import confusion_matrix

# %%
########
# GAIN #
########
train_X, train_Y, val_X, val_Y = prepare_df('gain')

train_dmat = xgb.DMatrix(train_X, train_Y)
val_dmat = xgb.DMatrix(val_X, val_Y)

# %% XGBOOST GAIN
# ORIGINAL BEST PARAMS
# p = {'max_depth': 8,
#      'eta': 0.01,
#      'gamma': 1,
#      'subsample': 1,
#      'lambda': 0.1,
#      'colsample_bytree': 0.8,
#      'scale_pos_weight': np.sqrt(sum(train_Y == 0) / sum(train_Y == 1)),
#      'seed': 1618,
#      'nthread': 4,
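# Minimal training sketch using a params dict in the spirit of the commented
# (truncated) block above; the objective/eval_metric keys and the round
# counts are illustrative additions, not the original configuration:
p = {'max_depth': 8,
     'eta': 0.01,
     'gamma': 1,
     'subsample': 1,
     'lambda': 0.1,
     'colsample_bytree': 0.8,
     'scale_pos_weight': np.sqrt(sum(train_Y == 0) / sum(train_Y == 1)),
     'seed': 1618,
     'nthread': 4,
     'objective': 'binary:logistic',  # assumed; not in the excerpt
     'eval_metric': 'logloss'}        # assumed; not in the excerpt

bst = xgb.train(p, train_dmat, num_boost_round=1000,
                evals=[(val_dmat, 'validation')], early_stopping_rounds=50)

val_pred = (bst.predict(val_dmat) > 0.5) * 1
print(confusion_matrix(val_Y, val_pred))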