Example #1
def melodi_gwas():
    # gwasInfo={'1':'leptin','2':'adiponectin'}
    # gwasInfo={'1':'pcsk9','2':'adiponectin'}
    # gwasInfo = {
    #    "1":"Operative procedures - main OPCS: E03.5 Incision of septum of nose",
    #    "2":"Type of cancer: ICD10: C83.7 Burkitt's tumour",
    #    "3":"Type of cancer: ICD10: C92.0 Acute myeloid leukaemia",
    # }
    logger.info("getting gwas info")
    gwasInfo = get_gwas_data()

    # create a small test set (uncomment the last line to use it)
    gwasInfoTest = {k: gwasInfo[k] for k in list(gwasInfo)[:10]}
    logger.info(len(gwasInfoTest))
    # gwasInfo = gwasInfoTest

    # enrich each chunk in parallel
    gwasChunks = chunks(gwasInfo, 10)
    pool = mp.Pool(processes=10)
    results = pool.starmap(enrich, [(gwasData, gwasInfo) for gwasData in gwasChunks])
    pool.close()

    # concatenate the per-chunk outputs (dropping each header row) into one gzipped file
    filename = f"gwas-melodi-enrich-{today}.tsv.gz"
    com = f"for i in melodi/*; do tail -n +2 $i; done | gzip > {filename}"
    subprocess.call(com, shell=True)
    copy_source_data(data_name=data_name, filename=filename)
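
The chunks helper isn't shown in this example. A minimal sketch, assuming it splits a dict into fixed-size sub-dicts (the shape the starmap call above expects):

from itertools import islice

def chunks(data, size):
    # yield successive sub-dicts with at most `size` items each
    it = iter(data)
    for _ in range(0, len(data), size):
        yield {k: data[k] for k in islice(it, size)}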
Example #2
def pathways():
    # pathways
    # complete list
    url = "https://reactome.org/download/current/ReactomePathways.txt"
    logger.info(url)
    # the Reactome dump has no header row, so supply column names explicitly
    df1 = pd.read_csv(url, sep="\t", header=None, names=["reactome_id", "name", "species"])
    df1 = df1[df1["species"] == "Homo sapiens"]
    logger.info(df1.head())
    filename = f"/tmp/ReactomePathways_human_{today}.csv"
    df1.to_csv(filename, index=False)
    copy_source_data(data_name=data_name, filename=filename)

    # hierarchy
    url = "https://reactome.org/download/current/ReactomePathwaysRelation.txt"
    logger.info(url)
    # again, no header row in the source file
    df2 = pd.read_csv(url, sep="\t", header=None, names=["parent", "child"])
    logger.info(df2.head())
    logger.info(df2.shape)
    df2 = df2[df2["parent"].isin(df1["reactome_id"])]
    logger.info(df2.shape)
    filename = f"/tmp/ReactomePathwaysRelation_human_{today}.csv"
    df2.to_csv(
        filename,
        index=False,
    )
    copy_source_data(data_name=data_name, filename=filename)
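
As a side note (not part of the original function), the parent/child table lends itself to a simple adjacency map for walking the pathway hierarchy:

from collections import defaultdict

# parent -> [children] adjacency map built from the relation table
children = defaultdict(list)
for parent, child in zip(df2["parent"], df2["child"]):
    children[parent].append(child)

# root pathways are those that never appear as a child
roots = set(df2["parent"]) - set(df2["child"])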
Example #3
def process_variants(variant_file):
    df = pd.read_csv(variant_file, low_memory=False)
    # keep the rsid column only and drop duplicate ids
    df = df["rsid"].drop_duplicates()
    logger.info(df.head())
    # in this example, only run 100 variants as can be quite slow
    filename = f"{vep_data_dir}/variants-{today}.txt"
    df.head(n=100).to_csv(filename, index=False, header=False)
    copy_source_data(data_name=data_name, filename=filename)
Example #4
def get_ebi_gwas_data():
    # retrieve the EBI GWAS Catalog data
    ebi_gwas_api_url = "https://www.ebi.ac.uk/gwas/api/search/downloads/studies_alternative"
    print("Getting GWAS data from EBI GWAS Catalog", ebi_gwas_api_url)
    ebi_gwas = requests.get(ebi_gwas_api_url)
    # save the full dataset
    with open(ebi_gwas_data_file, 'wb') as tsvfile:
        tsvfile.write(ebi_gwas.content)
    copy_source_data(data_name=data_name, filename=ebi_gwas_data_file)
Example #5
def biomart_to_file(atts, filename, type):
    logger.info("attributes: {} filename: {}", atts, filename)

    # latest build
    # server = BiomartServer( "http://www.ensembl.org/biomart" )
    # build 37
    server = biomart.BiomartServer("http://grch37.ensembl.org/biomart")

    hge = server.datasets["hsapiens_gene_ensembl"]
    # print(hge.show_attributes())

    s = hge.search({"attributes": atts}, header=1)
    o = gzip.open(filename, "w")
    # autosomes 1-22 plus X and Y; this drops patch/scaffold contigs
    chroms = [str(i) for i in range(1, 23)] + ["X", "Y"]
    c = 0
    for l in s.iter_lines():
        # skip the header line returned by biomart
        if c > 0:
            fields = l.decode("utf-8").split("\t")
            if fields[0] in chroms:
                # lines are bytes in python 3, so write with a binary newline
                if type == "protein":
                    chrom, gene, protein = fields
                    # only keep rows with a non-empty protein id
                    if len(protein) > 1:
                        o.write(l + b"\n")
                else:
                    o.write(l + b"\n")
        c += 1
    # close before copying so the gzip stream is flushed
    o.close()
    # copy to data directory
    copy_source_data(data_name, filename)
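
A hypothetical call might look like the following; the attribute names are standard Ensembl BioMart attributes, but the exact set used by the original pipeline is not shown:

biomart_to_file(
    atts=["chromosome_name", "ensembl_gene_id", "uniprotswissprot"],
    filename=f"/tmp/gene-protein-{today}.tsv.gz",
    type="protein",
)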
Example #6
def get_top_hits():
    df = pd.read_csv(gwas_data_file, low_memory=False)
    gwas_ids = list(df.id)
    logger.info(gwas_ids[0:10])
    gwas_api_url = "http://gwasapi.mrcieu.ac.uk/tophits"
    payload = {"id": gwas_ids, "preclumped": 1}
    response = requests.post(gwas_api_url, json=payload)
    res = response.json()
    th_df = pd.json_normalize(res)
    th_df.to_csv(gwas_tophits, index=False)
    copy_source_data(data_name=data_name, filename=gwas_tophits)
Example #7
def create_clean_protein(protein_data):
    filename = f"/tmp/protein-only-{today}.txt"
    o = open(filename, "w")
    seen = set()
    with gzip.open(protein_data) as f:
        for line in f:
            chr, gene, uni = line.decode("utf-8").split("\t")
            # uni is the last field on the line, so it keeps its trailing newline
            if uni not in seen:
                o.write(uni)
                seen.add(uni)
    o.close()
    copy_source_data(data_name, filename)
Example #8
def select_ebi_gwas_efo_mapping():
    # keep only required columns: GWAS ID and EFO
    df = pd.read_csv(ebi_gwas_data_file, sep='\t')
    df["GWAS_ID"] = "ebi-a-" + df["STUDY ACCESSION"]
    df = df[["GWAS_ID", "MAPPED_TRAIT_URI"]].drop_duplicates()
    df.columns = ["gwas.id", "efo.id"]
    print(df.head())
    print(df.shape)
    # subset the full dataset to GWAS that are present in OpenGWAS
    df = subset_to_available_gwas(df)
    print(df.shape)
    df.to_csv(ebi_gwas_efo_mapping, sep="\t", index=False)
    copy_source_data(data_name=data_name, filename=ebi_gwas_efo_mapping)
Example #9
def get_gwas_data():
    # create the data
    gwas_api_url = "http://gwasapi.mrcieu.ac.uk/gwasinfo"
    logger.info("Getting gwas data from {}", gwas_api_url)
    gwas_res = requests.get(gwas_api_url).json()
    df = pd.DataFrame(gwas_res)
    # transpose so each GWAS is a row; fill missing values
    df = df.T.fillna("")
    logger.info(df.head())
    logger.info(df["year"].describe())
    df.to_csv(gwas_data_file, index=False)
    copy_source_data(data_name=data_name, filename=gwas_data_file)
Example #10
def main(oFile) -> None:
    gene_id_list = get_ensembl_id()

    with Pool(N_PROCS) as pool:
        nested_list = [
            gene_id_list[i:(i + N_PER_CHUNK)]
            for i in range(0, len(gene_id_list), N_PER_CHUNK)
        ]
        map_res = pool.map(get_ot_data, nested_list)
        ot_df = pd.concat(map_res, ignore_index=True)

    OPENTARGETS_DIR.mkdir(parents=True, exist_ok=True)
    ot_df.to_csv(oFile, index=False)
    copy_source_data(data_name=data_name, filename=oFile)
Example #11
def get_variants_from_graph():
    # connect to the graph (EpiGraphDB)
    driver = neo4j_connect()
    # query a sample of variant ids
    query = """
            match (v:Variant)
            return distinct(v._id) as id limit 100
            """
    logger.info(query)
    with driver.session() as session:
        query_data = session.run(query).data()
    df = pd.json_normalize(query_data)
    df.to_csv(variant_data, index=False)
    copy_source_data(data_name=data_name, filename=variant_data)
    return df
Example #12
def make_tidy_clinvar_output(df):
    # make tidy dates
    df['LastUpdated'] = pd.to_datetime(
        df['LastUpdated']).dt.strftime('%Y-%m-%d')

    # subset and rename columns
    df = df[[
        "Gene name", "Gene stable ID", "GeneType", "DiseaseName", "ConceptID",
        "SourceName", "SourceID", "DiseaseMIM", 'LastUpdated'
    ]]
    df.columns = [
        "gene_name", "ensembl_id", "clinvar_gene_type", "disease_name",
        "umls_id", "source_name", "source_id", "disease_MIM", "last_updated"
    ]
    df.to_csv(clinvar_gene_condition_mapping, sep="\t", index=False)
    copy_source_data(data_name=data_name,
                     filename=clinvar_gene_condition_mapping)
Example #13
def run_vep(variant_dir, variant_file):
    # note: variant_dir is unused; the container mounts the global vep_data_dir
    com = f"""
        docker run -t -i -v {vep_data_dir}:/opt/vep/.vep
        ensemblorg/ensembl-vep ./vep --port 3337 --cache --fork 20 --assembly GRCh37
        -i /opt/vep/.vep/{variant_file}
        -o /opt/vep/.vep/vep-{today}.txt
        --per_gene
        --no_intergenic
    """
    com = com.replace("\n", " ")
    logger.info(com)
    subprocess.call(com, shell=True)
    # copy results
    #com = f"cp /data/vep_data/vep-{today}.txt {env_configs['data_dir']}/vep/"
    #subprocess.call(com, shell=True)
    copy_source_data(data_name=data_name, filename=f"{vep_data_dir}/vep-{today}.txt")
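
For instance, to annotate the variant list written by process_variants above (a hypothetical call; the file name is assumed from that example):

run_vep(vep_data_dir, f"variants-{today}.txt")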
Example #14
def protein_to_pathway():
    # protein to pathway
    url = "https://reactome.org/download/current/UniProt2Reactome_All_Levels.txt"
    logger.info(url)
    # the Reactome dump has no header row, so supply column names explicitly
    df = pd.read_csv(
        url,
        sep="\t",
        header=None,
        names=[
            "source_id",
            "reactome_id",
            "url",
            "event",
            "evidence_code",
            "species",
        ],
    )
    df = df[df["species"] == "Homo sapiens"]
    logger.info(df.head())
    filename = f"/tmp/UniProt2Reactome_All_Levels_human_{today}.csv"
    df.to_csv(
        filename,
        index=False,
    )
    copy_source_data(data_name=data_name, filename=filename)
Example #15
def get_phewas():
    df = pd.read_csv(variant_data)
    variant_ids = list(df.id)
    split_val = 20
    pval = 1e-5
    all_res = []
    for i in range(0, len(variant_ids), split_val):
        print(i)
        variants = variant_ids[i:i + split_val]
        gwas_api_url = "http://gwasapi.mrcieu.ac.uk/phewas"
        payload = {"variant": variants, "pval": pval}
        #logger.info(payload)
        response = requests.post(gwas_api_url, json=payload)
        res = response.json()
        logger.info(len(res))
        if len(res) == 1:
            # a single-element response is the API's error payload
            logger.error("phewas request failed: {}", res)
            exit()
        all_res.extend(res)
    df = pd.json_normalize(all_res)
    logger.info(df)
    df.to_csv(phewas_data_file, index=False)
    copy_source_data(data_name=data_name, filename=phewas_data_file)
Example #16
def map_genes_to_diseases():
    df = pd.read_csv(clinvar_gene_condition_mapping, sep='\t')

    # firstly get genes that directly map to mondo_id in clinvar data
    df_mondo = make_gene_to_mondo_map(df)

    # map umls_id in clinvar to mondo_id from the graph
    df_umls = make_umls_to_mondo_map(df)

    # join clinvar table with query output to map umls_id to mondo_id
    df_joined = df.merge(df_umls, on='umls_id', how='inner')
    df_joined = df_joined[[
        'ensembl_id', 'mondo_id', 'clinvar_gene_type', 'last_updated'
    ]]

    # concat direct mondo mappings with mappings via umls_id; drop any dups
    # (DataFrame.append was removed in pandas 2.0, so use pd.concat)
    df_total = pd.concat([df_joined, df_mondo]).drop_duplicates()

    df_total.to_csv(clinvar_gene_condition_mapping_mondo,
                    sep="\t",
                    index=False)
    copy_source_data(data_name=data_name,
                     filename=clinvar_gene_condition_mapping_mondo)
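
Neither helper is shown here. As a loose illustration only, make_gene_to_mondo_map might keep the rows whose ClinVar source is already MONDO; the column names are taken from make_tidy_clinvar_output above, everything else is an assumption:

def make_gene_to_mondo_map(df):
    # hypothetical sketch: rows whose ClinVar source is already MONDO
    d = df[df["source_name"] == "MONDO"].copy()
    d = d.rename(columns={"source_id": "mondo_id"})
    return d[["ensembl_id", "mondo_id", "clinvar_gene_type", "last_updated"]]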
Example #17
def create_distances(gwas_df):
    logger.info("Creating distances...")
    # https://stackoverflow.com/questions/48838346/how-to-speed-up-computation-of-cosine-similarity-between-set-of-vectors

    vectors = []
    ids = []

    for idx, row in gwas_df.iterrows():
        vectors.append(row["embedding"])
        ids.append(idx)

    timestr = time.strftime("%Y%m%d")
    score_cutoff = 0
    filename = f'/tmp/ieu-gwas-cosine-{timestr}-{score_cutoff}.tsv.gz'
    o = gzip.open(filename, "wt")

    logger.info(len(vectors))
    data = np.array(vectors)
    pws = distance.pdist(data, metric="cosine")
    logger.info(len(pws))
    logger.info(len(ids))

    logger.info("Writing to file...")
    mCount = 0
    for i in range(0, len(ids)):
        # pdist's condensed vector is ordered by pairs (i, j) with j > i
        for j in range(i + 1, len(ids)):
            score = 1 - pws[mCount]
            if score > score_cutoff:
                o.write(f"{ids[i]}\t{ids[j]}\t{score}\n")
            mCount += 1
    o.close()
    logger.info(mCount)
    copy_source_data(data_name, filename)
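
As an optional sanity check (not in the original), scipy's squareform expands the condensed vector back into a square matrix, which makes the pair indexing easy to verify:

from scipy.spatial.distance import squareform

# square[i, j] is the cosine distance between vectors i and j
square = squareform(pws)
assert abs(square[0, 1] - pws[0]) < 1e-12  # pair (0, 1) is the first condensed entry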
Example #18
def download_data():
    link = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/gene_condition_source_id'
    wget.download(link, clinvar_data_file)
    copy_source_data(data_name=data_name, filename=clinvar_data_file)
Example #19
def download_data():
    link = 'https://api.cpicpgx.org/data/cpicPairs.csv'
    wget.download(link, cpic_data_file)
    copy_source_data(data_name=data_name, filename=cpic_data_file)
Example #20
import os

from workflow.scripts.utils.general import copy_source_data

data_name = "string"

# uniprot mapping
uniprot_file = '/tmp/human.uniprot_2_string.2018.tsv.gz'
url = 'https://string-db.org/mapping_files/uniprot/human.uniprot_2_string.2018.tsv.gz'
os.system(f"wget -O {uniprot_file} {url}")
copy_source_data(data_name=data_name, filename=uniprot_file)

# string data
pp_file = '/tmp/9606.protein.links.v11.0.txt.gz'
url = 'https://stringdb-static.org/download/protein.links.v11.0/9606.protein.links.v11.0.txt.gz'
os.system(f"wget -O {pp_file} {url}")
copy_source_data(data_name=data_name, filename=pp_file)