# Example #1
# 0
def distance_summary(metrics=('jaccard', 'cosine')):
    """Summarize pairwise distances for all the evaluated cell-types.

    Args:
      metrics: iterable of metric names; for each, a condensed distance
        matrix is expected at ``processed/tlearn/dist/<metric>.txt``.
        (Default is a tuple — a mutable list default would be shared
        across calls.)

    Returns:
      pd.DataFrame with one row per (metric, evaluated task) containing
      the task index, ENCODE_ID, metric name and distance statistics
      (mean, mean over the nearest 10%, min, max).
    """
    from scipy.spatial.distance import squareform
    from scipy.stats import rankdata
    from collections import OrderedDict
    ddir = get_data_dir()

    # Load each metric's condensed distance vector and expand it to a
    # full square matrix
    mlist = []
    for metric in metrics:
        d = np.loadtxt(os.path.join(ddir, "processed/tlearn/dist/{metric}.txt".format(metric=metric)))
        d = squareform(d)
        mlist.append((metric, d))

    # Row indices of the tasks that were actually evaluated
    tnames = pd.Series(get_all_task_names())
    is_evaluated = tnames.isin(get_evaluated_task_names())
    il = np.where(is_evaluated)[0]

    out = []
    for mname, m in mlist:
        for i in il:
            d = m[i]
            # Remove the self-distance entry
            d = d[np.arange(len(d)) != i]
            out.append(OrderedDict([
                ("i", i),
                ("ENCODE_ID", tnames[i]),
                ("metric", mname),
                ("dist_avg", d.mean()),
                # Mean over the closest 10% of the other tasks
                ("dist_avg_top_10%", d[rankdata(d) < (len(d) * 0.1)].mean()),
                ("dist_nearest", d.min()),
                ("dist_furthest", d.max()),
            ]))
    return pd.DataFrame(out)
# Example #2
# 0
def get_clusters():
    """Return the cluster assignment of each task.

    Returns a DataFrame with columns ``ENCODE_ID``, ``cluster`` and
    ``cluster_size`` (number of members of the row's cluster).
    """
    data_dir = get_data_dir()
    cluster_file = os.path.join(data_dir, "raw/tlearn/clustering/data_clusters.csv")
    assignments = np.loadtxt(cluster_file).astype(int)
    out = pd.DataFrame({
        "ENCODE_ID": pd.Series(get_all_task_names()),
        "cluster": assignments,
    })
    # Broadcast each cluster's member count back onto its rows
    out['cluster_size'] = out.groupby("cluster").transform(len)
    return out
# Example #3
# 0
def get_metadata():
    """Get metadata on all the tasks.

    Joins the task list from the raw-intervals config with the DNase
    metadata table via the peak-file folder name.

    Returns:
      pd.DataFrame with one row per task, in the same order as
      ``conf_json['task_names']``.
    """
    ddir = get_data_dir()
    conf_json = json.load(open(os.path.join(ddir, "raw/tlearn/raw_intervals_config_file_complete.json")))

    df = pd.DataFrame({"task_name": conf_json['task_names'],
                       "file_path": pd.Series(conf_json['task_names']).map(conf_json['Human']['feature_beds'])})
    # Strip the fixed peak-file suffix as a literal string. regex=False is
    # required: the suffix contains unescaped '.' metacharacters, and the
    # default value of `regex` changed across pandas versions.
    df['folder_name'] = df.file_path.map(os.path.basename).str.replace(
        '_rep1-pr.IDR0.1.filt.narrowPeak.gz', '', regex=False)
    dfm = pd.read_csv(os.path.join(ddir, "raw/tlearn/dnase_metadata_2016-12-05.tsv"), sep="\t")
    df = df.merge(dfm, on="folder_name")
    # The merge must neither drop nor reorder tasks
    assert list(df.task_name) == conf_json['task_names']
    return df
# Example #4
# 0
def get_exp():
    """Get dataframes with information about the executed experiments.

    Reads ``raw/tlearn/experiments.tsv`` and joins each experiment row
    with its parsed training/testing logs.

    Returns:
      (df_valid, df_test): log metrics merged with the experiment table
      for the validation and test runs respectively.
    """
    dfe = pd.read_csv(os.path.join(get_data_dir(), "raw/tlearn/experiments.tsv"), sep="\t")
    dfe.columns = ['Model', 'ENCODE_ID', 'Cell_Type', 'Log_Path', 'Eval_set',
                   'val_auPRC_epoch1', 'val_auPRC_epoch5', 'val_auPRC_epoch10',
                   'auPRC_tsv', 'auROC_tsv', 'n_epoch', 'Training_Time']
    dfe = dfe[dfe.Model.notnull()]

    def _parse_logs(subdir):
        """Parse the given log sub-directory for every experiment and merge
        the result back onto the experiment table."""
        # Iterate over the column values directly. The original used
        # `dfe.Log_Path[i] for i in range(len(dfe))`, which is label-based
        # indexing and raises KeyError once the notnull() filter above
        # leaves gaps in the index.
        dfl = pd.concat([parse_log("{}/{}".format(log_path, subdir))
                         for log_path in dfe.Log_Path]).reset_index()
        dfl['Log_Path'] = dfl['path'].map(os.path.dirname)
        return dfl.merge(dfe, on="Log_Path")

    df_valid = _parse_logs("tfdragonn-train")
    df_test = _parse_logs("tfdragonn-test")
    return df_valid, df_test
# Example #5
# 0
import kipoi
import os
import sys
import pandas as pd
from m_kipoi.exp.tfbinding.config import SINGLE_TASK_MODELS, get_dl_kwargs, TF2CT
from kipoi.readers import HDF5Reader
from concise.eval_metrics import auprc, auc, accuracy
from collections import OrderedDict
from pybedtools import BedTool
import numpy as np
from tqdm import tqdm
from m_kipoi.metrics import classification_metrics, MetricsTupleList, BootstrapMetric
from m_kipoi.config import get_data_dir

# Shared data locations, resolved once at import time
ddir = get_data_dir()  # root of the data directory
root_dir = os.path.join(ddir, '../')  # one level above the data dir
eval_dir = os.path.join(ddir, 'processed/tfbinding/eval/preds')  # per-TF/model prediction HDF5 files


def get_eval_predictions(tf, model, filter_dnase=False):
    """Get the predictions"""
    with HDF5Reader(os.path.join(eval_dir, tf, model + ".h5")) as r:
        y_pred = r.f['/preds'][:]

    labels_bed_file = os.path.join(root_dir,
                                   get_dl_kwargs(tf)['intervals_file'])
    df_unfiltered = pd.read_csv(labels_bed_file, sep="\t", header=None)
    df_unfiltered.columns = ['chr', 'start', 'end', 'y_true']
    if filter_dnase:
        # Filter the DNase peaks based on the overlaps
        dnase_peaks = '{ddir}/raw/tfbinding/eval/tf-DREAM/DNASE.{ctype}.relaxed.narrowPeak.gz'.format(
# Example #6
# 0
def get_clinvar_ext_Xy(clinvar='20180429',
                       keep_variants="^Pathogenic$|^Benign$"):
    """Load the clinvar data extended with external annotations.

    Merges the Kipoi modeling table with the VCF clinical significance,
    VEP conservation/splice scores, SPIDEX and dbscSNV.

    Args:
      clinvar: clinvar version (publication date)
      keep_variants: regex of variants to keep

    Returns:
      (X_clinvar, y_clinvar): feature DataFrame (without the
      ClinicalSignificance and "diff" columns) and a boolean label array
      where True == Pathogenic.
    """
    def variant_id(chr, pos, ref, alt):
        # Canonical "<chr>:<pos>:<ref>:['<alt>']" key shared by all tables
        return chr.astype(str) + ":" + pos.astype(
            str) + ":" + ref + ":['" + alt + "']"

    ddir = get_data_dir()
    df = pd.read_csv(
        f"{ddir}/processed/splicing/clinvar/annotated_vcf/{clinvar}.filtered/modeling_df.tsv",
        sep='\t')
    # Keep only Kipoi annotations
    df = df.iloc[:, ~df.columns.str.startswith("other_")]

    # Append clinical significance parsed from the filtered VCF
    from kipoi_veff.parsers import KipoiVCFParser
    vcf_file = f"{ddir}/processed/splicing/clinvar/{clinvar}.filtered.vcf.gz"
    dfc = pd.DataFrame(list(KipoiVCFParser(vcf_file)))
    dfc['variant_id_old'] = dfc['variant_id']
    dfc['variant_id'] = variant_id(dfc.variant_chr, dfc.variant_pos,
                                   dfc.variant_ref, dfc.variant_alt)
    dfc['ClinicalSignificance'] = dfc['other_CLNSIG']
    df = pd.merge(df,
                  dfc[['variant_id', 'ClinicalSignificance']],
                  on='variant_id',
                  validate="many_to_one").drop_duplicates()

    # add the differences
    df["pathogenic"] = df.ClinicalSignificance == "Pathogenic"

    splicing_models = [
        "MaxEntScan/3prime", "MaxEntScan/5prime", "HAL", "labranchor"
    ]
    for m in splicing_models:
        # BUG FIX: the original computed `<m>_ref - <m>_ref`, which is
        # always 0 (or NaN); the score difference must be alt vs ref.
        # NOTE(review): assumes modeling_df provides `<m>_alt` columns
        # alongside `<m>_ref` — confirm against the annotation pipeline.
        df[m + "_diff"] = df[m + "_alt"] - df[m + "_ref"]
        df[m + "_isna"] = df[m + "_ref"].isnull().astype(float)

    # Drop variants for which every splicing model is missing
    only_NA_rows = df[[m + "_diff"
                       for m in splicing_models]].isnull().all(axis=1)

    df = df[~only_NA_rows]
    df = df[~df.ClinicalSignificance.isnull()]
    df = df[df.ClinicalSignificance.str.match(keep_variants)]

    # Append conservation scores and dbscSNV from VEP
    df_vep = pd.read_csv(
        f"{ddir}/processed/splicing/clinvar/annotated_vcf/{clinvar}.filtered/VEP.txt.gz",
        sep='\t',
        na_values='-')
    # Split the "chr:start-end" Location field into separate columns
    df_vep = df_vep.join(
        df_vep.Location.str.split(":|-", expand=True).rename(columns={
            0: "chr",
            1: "start",
            2: "end"
        }))

    df_vep['start'] = df_vep.start.astype(float)
    df_vep['end'] = df_vep.end.astype(float)
    df_vep['variant_id'] = variant_id(df_vep['chr'], df_vep.start.astype(int),
                                      df_vep.GIVEN_REF, df_vep.Allele)
    cons_features = [
        "CADD_raw", "CADD_phred", "phyloP46way_placental",
        "phyloP46way_primate"
    ]
    splice_features = ['rf_score', 'MaxEntScan_diff']

    # Flag (rather than drop) stop_gained variants
    exclude = df_vep[df_vep.Consequence.str.startswith(
        "stop_gained")]['#Uploaded_variation'].unique()
    df_vep["early_stop"] = df_vep['#Uploaded_variation'].isin(exclude)

    df = pd.merge(df,
                  df_vep[["variant_id", "early_stop"] + cons_features +
                         splice_features].drop_duplicates(["variant_id"]),
                  on=["variant_id"],
                  how='left',
                  validate="many_to_one").drop_duplicates()

    # Append spidex; keep a missing-ness indicator before imputing with 0
    df_spidex = pd.read_csv(
        f"{ddir}/raw/splicing/spidex/hg19_spidex.clinvar_{clinvar}.txt",
        sep='\t')
    df_spidex = df_spidex.drop_duplicates()
    df_spidex['variant_id'] = variant_id(df_spidex["#Chr"].astype(str),
                                         df_spidex.Start, df_spidex.Ref,
                                         df_spidex.Alt)
    df = pd.merge(df,
                  df_spidex[['variant_id', 'dpsi_max_tissue', 'dpsi_zscore']],
                  on="variant_id",
                  how='left')
    df['dpsi_max_tissue_isna'] = df['dpsi_max_tissue'].isnull()
    df['dpsi_zscore_isna'] = df['dpsi_zscore'].isnull()
    df.loc[df.dpsi_max_tissue.isnull(), "dpsi_max_tissue"] = 0
    df.loc[df.dpsi_zscore.isnull(), "dpsi_zscore"] = 0

    # Append dbscSNV (per-chromosome files loaded via dask)
    dbsc = dd.read_csv(f"{ddir}/raw/splicing/dbscSNV/dbscSNV.chr*",
                       sep='\t',
                       dtype={
                           'chr': 'object'
                       },
                       na_values=".").compute()
    dbsc['variant_id'] = variant_id(dbsc.chr, dbsc.pos, dbsc.ref, dbsc.alt)
    dbsc = dbsc.rename(columns={
        'rf_score': 'dbscSNV_rf_score',
        'ada_score': 'dbscSNV_ada_score'
    })
    df = pd.merge(df, dbsc, on='variant_id', how='left')
    df['dbscSNV_rf_score_isna'] = df.dbscSNV_rf_score.isnull()
    df['dbscSNV_ada_score_isna'] = df.dbscSNV_ada_score.isnull()
    df.loc[df.dbscSNV_rf_score.isnull(), 'dbscSNV_rf_score'] = 0
    df.loc[df.dbscSNV_ada_score.isnull(), 'dbscSNV_ada_score'] = 0

    # Assemble X, y; the raw "diff" columns are training features'
    # intermediates and are excluded from X
    y_clinvar = np.array(df.ClinicalSignificance == "Pathogenic")
    X_clinvar = df.loc[:, df.columns != 'ClinicalSignificance']
    X_clinvar = X_clinvar.iloc[:, ~X_clinvar.columns.str.contains("diff")]

    return X_clinvar, y_clinvar
# Example #7
# 0
def get_dbscsnv_Xy():
    """Load the dbscSNV data with additional columns from kipoi models.

    Merges the modeling table with SPIDEX and the dbscSNV per-chromosome
    score files.

    Returns:
      (X_dbscsnv, y_dbscsnv): feature DataFrame (without the Group
      column) and a boolean label array where True == "Positive".
    """
    ddir = get_data_dir()
    df = pd.read_csv(os.path.join(
        ddir, "processed/splicing/dbscSNV/modeling_df.tsv"),
                     sep='\t')
    df['Chr'] = df.Chr.astype(str)

    # append spidex; normalize its column names to the modeling table's
    df_spidex = pd.read_csv(
        f"{ddir}/raw/splicing/spidex/hg19_spidex.dbscSNV.txt", sep='\t')
    df_spidex['Chr'] = df_spidex["#Chr"].astype(str)
    del df_spidex['#Chr']
    df_spidex = df_spidex.rename(columns={"Start": "Position", "End": "end"})
    df_spidex = df_spidex.drop_duplicates()
    df = pd.merge(df,
                  df_spidex,
                  on=["Chr", "Position", "Ref", "Alt"],
                  how='left',
                  validate='one_to_one')
    # Keep missing-ness indicators before imputing with 0
    df['dpsi_max_tissue_isna'] = df['dpsi_max_tissue'].isnull()
    df['dpsi_zscore_isna'] = df['dpsi_zscore'].isnull()
    df.loc[df.dpsi_max_tissue.isnull(), "dpsi_max_tissue"] = 0
    df.loc[df.dpsi_zscore.isnull(), "dpsi_zscore"] = 0

    # Append dbscSNV (per-chromosome files loaded via dask)
    dbsc = dd.read_csv(f"{ddir}/raw/splicing/dbscSNV/dbscSNV.chr*",
                       sep='\t',
                       dtype={
                           'chr': 'object'
                       },
                       na_values=".").compute()
    # Non-inplace rename; the original passed the deprecated `copy=False`
    # keyword together with `inplace=True`
    dbsc = dbsc.rename(columns={
        "chr": "Chr",
        "pos": "Position",
        "ref": "Ref",
        "alt": "Alt"
    })
    df = pd.merge(df,
                  dbsc,
                  on=["Chr", "Position", "Ref", "Alt"],
                  how='left',
                  validate="one_to_one")
    df = df.rename(columns={
        'rf_score': 'dbscSNV_rf_score',
        'ada_score': 'dbscSNV_ada_score'
    })
    df['dbscSNV_rf_score_isna'] = df.dbscSNV_rf_score.isnull()
    df['dbscSNV_ada_score_isna'] = df.dbscSNV_ada_score.isnull()
    df.loc[df.dbscSNV_rf_score.isnull(), 'dbscSNV_rf_score'] = 0
    df.loc[df.dbscSNV_ada_score.isnull(), 'dbscSNV_ada_score'] = 0

    # TODO - append the conservation scores

    # create X, y. Use label-based .loc with the boolean column mask,
    # consistent with get_clinvar_ext_Xy (the original used .iloc here).
    y_dbscsnv = np.array(df.Group == "Positive")
    X_dbscsnv = df.loc[:, df.columns != "Group"]
    return X_dbscsnv, y_dbscsnv
# Example #8
# 0
import pandas as pd
import numpy as np
import os
import dask.dataframe as dd
from m_kipoi.config import get_data_dir
from openpyxl import Workbook, load_workbook
# Root data directory, resolved once at import time
DATA = get_data_dir()


def get_clinvar_ext_Xy(clinvar='20180429',
                       keep_variants="^Pathogenic$|^Benign$"):
    """Load the clinvar data

    Args:
      clinvar: clinvar version (publication date)
      keep_variants: regex of variants to keep
    """
    def variant_id(chr, pos, ref, alt):
        return chr.astype(str) + ":" + pos.astype(
            str) + ":" + ref + ":['" + alt + "']"

    ddir = get_data_dir()
    df = pd.read_csv(
        f"{ddir}/processed/splicing/clinvar/annotated_vcf/{clinvar}.filtered/modeling_df.tsv",
        sep='\t')
    # Keep only Kipoi annotations
    df = df.iloc[:, ~df.columns.str.startswith("other_")]

    # Append clinical significance
    from kipoi_veff.parsers import KipoiVCFParser
    vcf_file = f"{ddir}/processed/splicing/clinvar/{clinvar}.filtered.vcf.gz"
# Example #9
# 0
def get_multitask_names():
    """Return the task names listed in the multi-task interval-spec file."""
    spec_path = os.path.join(
        get_data_dir(),
        "raw/tlearn/intervalspecfile_holdout_10_nochr1chr8chr9chr21.json")
    with open(spec_path) as fh:
        spec = json.load(fh)
    return spec['task_names']
# Example #10
# 0
def get_all_task_names():
    """Return the task names from the complete raw-intervals config."""
    config_path = os.path.join(
        get_data_dir(), "raw/tlearn/raw_intervals_config_file_complete.json")
    with open(config_path) as fh:
        config = json.load(fh)
    return config['task_names']