Пример #1
0
def predict(model_path,
            datapath,
            train_data_path=None,
            proba=False,
            robust=True):
    """Return model predictions for a selected dataframe.

    :param model_path: path to the model (ie."results/ISV_gain.json.gz")
    :param datapath: path to the dataframe to be predicted
    :param train_data_path: path to training dataframe - only necessary if predicting data other than train/val/test
    :param proba: return probabilities
    :param robust: use robust scaling. Otherwise MinMax is used
    :returns: (yhat, y): predicted and real values
    """
    # CNV type and the log-transform flag are both encoded in the model filename,
    # e.g. "results/ISV_gain_log.json.gz"
    cnv_type = 'gain' if 'gain' in model_path else 'loss'
    logtransform = (model_path.split('_')[-1].split('.')[0] == 'log')

    model = open_model(model_path)

    # Train/validation splits come from the canonical prepare_df pipeline;
    # any other dataframe is prepared against the supplied training data.
    if 'train' in datapath or 'validation' in datapath:
        X_train, Y_train, X_val, Y_val = prepare_df(cnv_type,
                                                    logtransform,
                                                    robustscaler=robust)
        if 'train' in datapath:
            X, y = X_train, Y_train
        else:
            X, y = X_val, Y_val
    else:
        X, y = prepare(cnv_type,
                       logtransform=logtransform,
                       robustscaler=robust,
                       data_path=datapath,
                       train_data_path=train_data_path)

    # A raw xgboost Booster needs a DMatrix and always emits probabilities;
    # sklearn-style estimators expose predict / predict_proba directly.
    if isinstance(model, xgb.core.Booster):
        yhat = model.predict(xgb.DMatrix(X))
        if not proba:
            # threshold probabilities into hard 0/1 labels
            yhat = (yhat > 0.5) * 1
    elif proba:
        yhat = model.predict_proba(X)[:, 1]
    else:
        yhat = model.predict(X)

    return yhat, y
Пример #2
0
# %%
import pandas as pd
import numpy as np

from scripts.ml.prepare_df import prepare_df
from scripts.constants import LOSS_ATTRIBUTES, GAIN_ATTRIBUTES, HUMAN_READABLE

# %%
# Build per-class (Benign / Pathogenic) summary statistics of the raw
# feature values for each CNV type. `final` accumulates per-type results;
# the remainder of the loop body lies beyond this excerpt.
final = []
for i, cnv_type in enumerate(['loss', 'gain']):
    # pick the attribute list for this CNV type (index 1 when 'gain')
    attributes = [LOSS_ATTRIBUTES, GAIN_ATTRIBUTES][(cnv_type == 'gain') * 1]
    # translate to human readable
    # (NOTE: the comprehension's `i` shadows the loop index only inside
    # the comprehension scope — the outer `i` is unaffected)
    attributes = [HUMAN_READABLE[i] for i in attributes]

    # raw (unscaled) train/validation splits for this CNV type
    train_X, train_Y, val_X, val_Y = prepare_df(cnv_type, raw=True)

    # combine train + validation rows and attach a readable class label
    X = pd.DataFrame(np.concatenate([train_X, val_X]), columns=attributes)
    X["y"] = [
        'Pathogenic' if i == 1 else "Benign"
        for i in np.concatenate([train_Y, val_Y])
    ]

    # split by class; drop the label column before computing statistics
    benign = X.query("y == 'Benign'").drop("y", axis=1)
    pathogenic = X.query("y == 'Pathogenic'").drop("y", axis=1)

    # per-feature mean / std / max for the benign class, as column vectors
    bmean = np.round(benign.mean(axis=0).values.reshape(-1, 1), 2)
    bstd = np.round(benign.std(axis=0).values.reshape(-1, 1), 2)
    bmax = np.round(benign.max(axis=0).values.reshape(-1, 1), 2)

    # pathogenic-class statistics (continues past this excerpt)
    pmean = np.round(pathogenic.mean(axis=0).values.reshape(-1, 1), 2)
Пример #3
0
# translate to human readable
attributes = [HUMAN_READABLE[i] for i in attributes0]

# open saved shap values
# with open(f'data/shap_data/shap_validation_{cnv_type}.pkl', 'rb') as f:
with open(snakemake.input.shap_data, 'rb') as f:
    shap_values = pickle.load(f)

# relabel SHAP feature names with the human-readable attribute names
shap_values.feature_names = attributes

# true labels for the same validation set the SHAP values were computed on
# y = pd.read_csv(f'data/validation_{cnv_type}.tsv.gz', sep='\t', compression='gzip')
y = pd.read_csv(snakemake.input.data, sep='\t', compression='gzip')
y = y.clinsig.values

# third return value is the (log-transformed) validation feature matrix
_, _, orig, _ = prepare_df(cnv_type, logtransform=True)

# load original dataframe
orig = pd.DataFrame(orig, columns=attributes)
orig["y"] = y

# SHAP values as a dataframe, one column per feature
sv = pd.DataFrame(shap_values.values, columns=attributes)

# order columns by mean |SHAP| descending (most important feature first)
sv = sv.iloc[:, np.argsort(np.mean(np.abs(sv), axis=0))[::-1]]
sv['y'] = ['Pathogenic' if i == 1 else "Benign" for i in y]

# swarmplot with discrete hue
fig, ax = plt.subplots(1, 1, figsize=(12, 14))

# long-format frame for plotting (continues past this excerpt)
temp = sv.iloc[:, :].melt(id_vars="y")
Пример #4
0
from matplotlib import rcParams
from scripts.ml.prepare_df import prepare_df
from scipy.stats import pointbiserialr
from scripts.constants import DPI

rcParams.update({'font.size': 18})

# %%
# Side-by-side violin plots of the log-transformed feature distributions,
# split by clinical significance, one panel per CNV type.
fig, ax = plt.subplots(1, 2, figsize=(25, 14))

for panel, cnv_type in enumerate(['loss', 'gain']):
    # pick the attribute set for this CNV type and map to readable labels
    raw_attributes = GAIN_ATTRIBUTES if cnv_type == 'gain' else LOSS_ATTRIBUTES
    attributes = [HUMAN_READABLE[attr] for attr in raw_attributes]

    train_X, train_Y, val_X, val_Y = prepare_df(cnv_type, logtransform=True)

    # combine train + validation rows into one frame labelled by class
    X = pd.DataFrame(np.concatenate([train_X, val_X]), columns=attributes)
    labels = np.concatenate([train_Y, val_Y])
    X["y"] = ['Pathogenic' if label == 1 else "Benign" for label in labels]

    sns.violinplot(x='value', y='variable', hue="y", data=X.melt(id_vars="y"),
                   ax=ax[panel],
                   palette={"Pathogenic": "red", "Benign": "green"}, split=True)

    axis = ax[panel]
    axis.set_xlabel('log(value)')
    axis.set_ylabel('')
    axis.set_title('copy number ' + cnv_type)
    axis.legend(title='Clinical Significance')

fig.tight_layout()

plt.savefig(snakemake.output.distributions, dpi=DPI)
Пример #5
0
# Make the repository root importable: walk up from this file's directory
# to the parent of the 'scripts' folder and prepend it to sys.path.
filepath_list = str(pathlib.Path(__file__).parent.absolute()).split('/')
ind = filepath_list.index('scripts')

sys.path.insert(1, '/'.join(filepath_list[:ind]))
# %%
from scripts.ml.prepare_df import prepare_df
import xgboost as xgb
import numpy as np
from sklearn.metrics import confusion_matrix

# %%
########
# GAIN #
########

# train/validation splits for copy-number gains (default preprocessing)
train_X, train_Y, val_X, val_Y = prepare_df('gain')

# wrap in DMatrix for the native xgboost training API
train_dmat = xgb.DMatrix(train_X, train_Y)
val_dmat = xgb.DMatrix(val_X, val_Y)

# %% XGBOOST GAIN
# ORIGINAL BEST PARAMS
# p = {'max_depth': 8,
#      'eta': 0.01,
#      'gamma': 1,
#      'subsample': 1,
#      'lambda': 0.1,
#      'colsample_bytree': 0.8,
#      'scale_pos_weight': np.sqrt(sum(train_Y == 0) / sum(train_Y == 1)),
#      'seed': 1618,
#      'nthread': 4,