Example #1
def process():
    logger.info("Processing data...")

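    # load the MR results and normalise the GWAS ids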
    df = pd.read_csv(os.path.join(dataDir, FILE), sep=",")
    cleaned = clean_ids(df["Gwas.id"].values)
    df["Gwas.id"] = cleaned
    logger.info("\n{}", df.head())

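    # rename columns to the source/target schema expected by create_import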
    df.rename(
        columns={
            "Gene.ensembl_id": "source",
            "Gwas.id": "target",
            "XQTL_MULTI_SNP_MR.b": "beta",
            "XQTL_MULTI_SNP_MR.se": "se",
            "XQTL_MULTI_SNP_MR.p": "p",
            "XQTL_MULTI_SNP_MR.qtl_type": "qtl_type",
            "XQTL_MULTI_SNP_MR.mr_method": "mr_method",
        },
        inplace=True,
    )

    logger.info(df.shape)
    logger.info("\n{}", df.head())
    df.drop_duplicates(inplace=True)
    logger.info(df.shape)
    logger.info("\n{}", df.head())
    create_import(df=df, meta_id=meta_id)
Example #2
def process_data():
    logger.info("Processing mr data...")
    col_names = [
        "source",
        "target",
        "method",
        "nsnp",
        "b",
        "se",
        "ci_low",
        "ci_upp",
        "pval",
        "selection",
        "moescore",
    ]
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, header=None)

    df.columns = col_names
    logger.info(df.shape)
    df.dropna(subset=['pval', 'se'], inplace=True)
    df.drop_duplicates(inplace=True)
    logger.info(df.shape)
    logger.info(df.head())
    create_import(df=df, meta_id=meta_id)
Example #3
def gene():
    FILE = get_source(meta_id, 1)
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    # add column names
    col_names = [
        "chr",
        "type",
        "name",
        "description",
        "biomart_source",
        "ensembl_id",
        "start",
        "end",
    ]
    df.columns = col_names
    df.drop_duplicates(inplace=True)
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (g:Gene) ASSERT g.ensembl_id IS UNIQUE",
        "CREATE INDEX ON :Gene(name)",
        "CREATE INDEX ON :Gene(chr)",
    ]
    create_constraints(constraintCommands, meta_id)
Example #4
def process():
    # select the file
    FILE = get_source(meta_id, 1)
    logger.info("Reading {}", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE))
    # logger.info(df.columns)
    logger.info(df.shape)

    # drop some columns
    df.drop(["access", "priority", "coverage", ""],
            axis=1,
            inplace=True,
            errors="ignore")
    logger.info(df.shape)

    # create the csv and import data
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (g:Gwas) ASSERT g.id IS UNIQUE",
        "CREATE index on :Gwas(trait)",
        "CREATE index on :Gwas(filename)",
    ]
    create_constraints(constraintCommands, meta_id)
Example #5
def process():
    logger.info("loading semrep data...{}", FILE)
    sem_df = pd.read_csv(os.path.join(dataDir, FILE),
                         sep=",",
                         compression="gzip")
    logger.info(sem_df)
    # create new ids
    logger.info('Dealing with IDs')
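    # make_id builds composite subject/object ids; a row can yield a comma-separated list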
    sub_id = sem_df.apply(lambda row: make_id(row, 'sub'), axis=1)
    obj_id = sem_df.apply(lambda row: make_id(row, 'obj'), axis=1)
    sem_df['sub_id_all'] = sub_id
    sem_df['obj_id_all'] = obj_id

    # need to split subject and object ids by ,
    logger.info(sem_df.shape)
    sem_df = (
        sem_df.assign(sub_id_all=sem_df.sub_id_all.str.split(","))
        .explode("sub_id_all")
        .reset_index(drop=True)
    )
    logger.info(sem_df.shape)
    sem_df = (
        sem_df.assign(obj_id_all=sem_df.obj_id_all.str.split(","))
        .explode("obj_id_all")
        .reset_index(drop=True)
    )
    logger.info(sem_df.shape)

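    # keep only the triple columns and count identical (subject, predicate, object) triples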
    keep_cols = ["sub_id_all", "pred", "obj_id_all"]
    sem_df = sem_df[keep_cols]
    sem_df = pd.DataFrame({
        "count": sem_df.groupby(keep_cols).size()
    }).reset_index()
    logger.info(sem_df.shape)
    sem_df.drop_duplicates(inplace=True)
    logger.info(sem_df.shape)
    sem_df.columns = ["source", "predicate", "target", "count"]
    logger.info("\n {}", sem_df)
    create_import(df=sem_df, meta_id=meta_id)
Example #6
def process():
    # load predicate data
    logger.info("loading data...")
    df = pd.read_csv(
        os.path.join(dataDir, FILE), sep=",", compression="gzip"
    )
    logger.info(df.shape)

    # need to split subject and object ids by |
    df = (
        df.assign(subject_id=df.subject_id.str.split("|"))
        .explode("subject_id")
        .reset_index(drop=True)
    )
    logger.info(df.shape)
    df = (
        df.assign(object_id=df.object_id.str.split("|"))
        .explode("object_id")
        .reset_index(drop=True)
    )
    logger.info(df.shape)
    logger.info("\n {}", df)

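    # build a composite triple id and keep the PMID-to-triple links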
    df["id"] = df["subject_id"] + ":" + df["predicate"] + ":" + df["object_id"]
    df = df[["PMID", "id"]]
    logger.info(df.shape)
    df.drop_duplicates(inplace=True)
    logger.info(df.shape)
    df.columns = ["target", "source"]
    logger.info("\n {}", df)
    create_import(df=df, meta_id=meta_id)
Example #7
def process():
    # load predicate data
    logger.info("loading predication data...")
    pred_df = pd.read_csv(os.path.join(dataDir, PREDICATION_FILE),
                          sep=",",
                          compression="gzip")
    pred_df["PMID"] = pred_df["PMID"].astype(str)

    logger.info("loading citation data...")
    df = pd.read_csv(os.path.join(dataDir, PUB_FILE),
                     sep="\t",
                     compression="gzip")
    df.columns = ["id", "issn", "dp", "edat", "year"]
    df["id"] = df["id"].str.replace("'", "")
    logger.info(df.shape)

    # merge with predication data
    df_merge = df.merge(pred_df["PMID"], left_on="id", right_on="PMID")
    logger.info(df_merge.shape)
    # drop PMID column
    df_merge.drop("PMID", inplace=True, axis=1)
    # make unique
    df_merge.drop_duplicates(inplace=True)
    logger.info(df_merge.shape)

    logger.info("\n {}", df_merge.head())

    create_import(df=df_merge, meta_id=args.name)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:Literature) ASSERT s.id IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
Example #8
def process_data():
    logger.info("Processing vep data {}", vep_data)
    col_names = [
        "source",
        "location",
        "allele",
        "target",
        "feature",
        "feature_type",
        "consequence",
        "cdna_position",
        "cds_position",
        "protein_position",
        "amino_acids",
        "codons",
        "existing_variation",
        "extra",
    ]

    # read the VEP output, skipping "#" header lines
    df = pd.read_csv(os.path.join(dataDir, vep_data), sep="\t", comment="#")
    df.drop_duplicates(inplace=True)
    df.columns = col_names
    print(df.head())
    create_import(df=df, meta_id=meta_id)
Example #9
def process():
    df = pd.read_csv(os.path.join(dataDir, FILE), sep="\t").drop_duplicates()
    print(df.head())
    print(df.shape)
    df.columns = ["source", "target", "gene_relationship_type", "last_updated"]
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id)
Example #10
def gwas():
    df = pd.read_csv(os.path.join(dataDir, FILE), low_memory=False)
    df = df[["id", "rsid", "beta", "p"]].drop_duplicates()

    # edit column names to match schema
    df.rename(columns={"id": "source", "rsid": "target", "p": "pval"}, inplace=True)

    create_import(df=df, meta_id=meta_id)
Example #11
def process():
    df = pd.read_csv(os.path.join(dataDir, FILE), sep="\t")
    print(df.head())
    print(df.shape)
    df.columns = ["source", "target"]
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id)
Example #12
def process():
    merge = merge_data(BIO_DATA, BIO_SEM)
    logger.info(merge.shape)
    create_import(df=merge, meta_id=args.name)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:Literature) ASSERT s.id IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
Example #13
def process():
    df = pd.read_csv(os.path.join(dataDir, FILE))
    logger.info(df.head())
    keep_cols = ["source_id", "reactome_id"]
    df = df[keep_cols]
    df.rename(columns={"source_id": "source", "reactome_id": "target"}, inplace=True)
    df.drop_duplicates(inplace=True)
    logger.info(df.head())

    create_import(df=df, meta_id=meta_id)
Example #14
def process():
    # load predicate data
    logger.info("loading data...")
    df = pd.read_csv(os.path.join(dataDir, FILE), sep="\t", compression="gzip")
    logger.info(df.shape)
    col_names = ["source", "target", "score"]
    df.columns = col_names
    df.drop_duplicates(inplace=True)
    logger.info(df.shape)
    logger.info("\n {}", df)

    create_import(df=df, meta_id=meta_id)
Example #15
def protein():
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    df.columns = ["uniprot_id"]
    df["name"] = df["uniprot_id"]
    create_import(df=df, meta_id=meta_id)

    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE",
        "CREATE index on :Protein(name);",
    ]
    create_constraints(constraintCommands, meta_id)
Example #16
def process():
    logger.info("Loading efo...")
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data)
    df.drop_duplicates(inplace=True)
    df.rename(
        columns={'efo.id': "target", 'parent_efo.id': "source"},
        inplace=True,
    )
    logger.info("\n{}", df.head())
    create_import(df=df, meta_id=meta_id)
Example #17
def process():
    logger.info("Loading efo...{}", FILE)
    df = pd.read_csv(
        os.path.join(dataDir, FILE),
        sep="\t",
        names=["target", "efo_value", "score", "gwas_value", "source"],
    )
    df = df.drop(["efo_value", "gwas_value"], axis=1)
    logger.info("\n{}", df.head())
    logger.info(df.shape)
    df.drop_duplicates(inplace=True)
    logger.info(df.shape)
    create_import(df=df, meta_id=meta_id)
Example #18
def process():
    FILE = get_source(meta_id, 1)
    df = pd.read_csv(os.path.join(dataDir, FILE), low_memory=False)
    df = df[["rsid"]].drop_duplicates()
    # change column name to match schema
    df.rename(columns={"rsid": "name"}, inplace=True)

    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (v:Variant) ASSERT v.name IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
Example #19
def process():
    data = os.path.join(dataDir, FILE)
    # double quotes were not handled properly by the default C parser, so use the python engine
    df = pd.read_csv(data, sep=",", engine="python")
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    keep_cols = ["molecule_name", "target_id", "action_type", "phase"]
    df = df[keep_cols]
    col_names = ["source", "target", "action_type", "phase"]
    df.columns = col_names
    df["source"] = df["source"].str.upper()
    df.drop_duplicates(inplace=True)
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id)
Example #20
def process():
    # load predicate data
    logger.info("loading data... {}", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE),
                     sep="\t",
                     compression="gzip",
                     header=None)
    logger.info(df.shape)
    col_names = [
        "query",
        "triple",
        "subject_name",
        "subject_type",
        "subject_id",
        "predicate",
        "object_name",
        "object_type",
        "object_id",
        "localCount",
        "localTotal",
        "globalCount",
        "globalTotal",
        "odds",
        "pval",
        "pmids",
        "gwas-id",
    ]
    df.columns = col_names
    logger.info(df.shape)

    keep_cols = ["pmids", "gwas-id"]
    df = df[keep_cols]

    # need to split pmids
    df = (
        df.assign(pmids=df.pmids.str.split(" "))
        .explode("pmids")
        .reset_index(drop=True)
    )
    logger.info(df.shape)
    logger.info("\n {}", df)

    df.drop_duplicates(inplace=True)
    df.columns = ["target", "source"]
    logger.info(df.shape)
    logger.info("\n {}", df)

    create_import(df=df, meta_id=meta_id)
Example #21
def process():
    df = pd.read_csv(os.path.join(dataDir, FILE))
    logger.info(df.head())
    keep_cols = ["source_id"]
    df = df[keep_cols]
    df.rename(columns={"source_id": "uniprot_id"}, inplace=True)
    df["name"] = df["uniprot_id"]
    df.drop_duplicates(inplace=True)
    logger.info(df.head())

    create_import(df=df, meta_id=meta_id)
    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE",
        "CREATE index on :Protein(name);",
    ]
    create_constraints(constraintCommands, meta_id)
Example #22
def variant():
    print("Reading...", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE))

    print("Writing...")
    # create csv
    df.rename(columns={"rsid": "name"}, inplace=True)
    df = df[["name"]].drop_duplicates()
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (v:Variant) ASSERT v.name IS UNIQUE;",
        "CREATE index on :Variant(chr);",
        "CREATE index on :Variant(pos);",
    ]
    create_constraints(constraintCommands, meta_id)
Example #23
def process():
    logger.info("Processing data...")

    FILE = "XQTL_SINGLE_SNP_MR_SNP_GENE.csv"

    df = pd.read_csv(os.path.join(dataDir, FILE), sep=",")
    df.rename(
        columns={"Snp.name": "source", "Gene.ensembl_id": "target"},
        inplace=True,
    )
    logger.info(df.shape)
    logger.info("\n{}", df.head())
    df.drop_duplicates(inplace=True)
    logger.info(df.shape)
    logger.info("\n{}", df.head())
    create_import(df=df, meta_id=meta_id)
Example #24
def process():
    data = os.path.join(dataDir, FILE)
    # double quotes were not handled properly by the default C parser, so use the python engine
    df = pd.read_csv(data, sep=",", engine="python")
    logger.info(df.shape)
    logger.info("\n {}", df.head())

    # get disease data
    disease_df = get_disease_data()
    disease_df['mondo_efo_id'] = (
        'http://www.ebi.ac.uk/efo/EFO_' + disease_df['mondo_efo_id'].astype(str)
    )
    logger.info(disease_df)

    keep_cols = [
        "molecule_name",
        "efo_id",
    ]
    df = df[keep_cols]

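    # match drug efo ids both directly and via the mondo-to-efo mapping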
    mondo_match = pd.merge(
        df, disease_df, left_on='efo_id', right_on='disease_id'
    )[['molecule_name', 'disease_id']]
    # logger.info(mondo_match)

    efo_match = pd.merge(
        df, disease_df, left_on='efo_id', right_on='mondo_efo_id'
    )[['molecule_name', 'disease_id']]
    # logger.info(efo_match)

    cat_df = pd.concat([mondo_match, efo_match])
    logger.info(cat_df.shape)

    cat_df.drop_duplicates(inplace=True)
    logger.info(cat_df.shape)

    col_names = ["source", "target"]
    cat_df.columns = col_names
    cat_df["source"] = cat_df["source"].str.upper()

    create_import(df=cat_df, meta_id=meta_id)
Example #25
def process_data():
    print("Processing instruments...")
    csv_data = []
    col_names = [
        "target",
        "source",
        "beta",
        "se",
        "pval",
        "eaf",
        "samplesize",
        "ncase",
        "ncontrol",
    ]
    with gzip.open(os.path.join(dataDir, FILE), "rt") as f:
        next(f)  # skip the header row
        filereader = csv.reader(f, delimiter=",")
        for line in filereader:
            variant, gwas, beta, se, pval, eaf, samplesize, ncase, ncontrol = line
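            # skip rows where pval/se/eaf cannot be parsed as floats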
            try:
                float(pval)
                float(se)
                float(eaf)
            except ValueError:
                continue
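            # normalise GWAS ids: prefix non-UKB ids with IEU-a- and use "-" as separator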
            if not gwas.startswith("UKB"):
                gwas = "IEU-a-" + gwas
            gwas = gwas.replace(":", "-")
            t = [
                variant,
                gwas.lower(),
                beta,
                se,
                pval,
                eaf,
                str(int(float(samplesize))),
                ncase,
                ncontrol,
            ]
            csv_data.append(t)

    # create csv file
    df = pd.DataFrame(csv_data)
    df.columns = col_names
    print(df.head())
    create_import(df=df, meta_id=meta_id)
Example #26
def process():
    # load predicate data
    logger.info("loading data...")
    df = pd.read_csv(os.path.join(dataDir, PREDICATION_FILE),
                     sep=",",
                     compression="gzip")
    logger.info(df.shape)

    # need to split subject and object ids by |
    df = (df.assign(subject_id=df.subject_id.str.split("|")).explode(
        "subject_id").reset_index(drop=True))
    logger.info(df.shape)
    df = (df.assign(object_id=df.object_id.str.split("|")).explode(
        "object_id").reset_index(drop=True))
    logger.info(df.shape)
    logger.info("\n {}", df)

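    # build a composite triple id and a human-readable name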
    df["id"] = df["subject_id"] + ":" + df["predicate"] + ":" + df["object_id"]
    df["name"] = df["subject_name"] + " " + df["predicate"] + " " + df[
        "object_name"]

    # keep_cols = ["predicate","subject_name","object_name","subject_type","object_type","subject_id","object_id","id"]
    keep_cols = ["predicate", "subject_id", "object_id", "id", "name"]

    #df = pd.DataFrame({"count": df.groupby(keep_cols).size()}).reset_index()
    df = df[keep_cols]
    df.drop_duplicates(subset=['id'], inplace=True)
    logger.info(df.shape)
    logger.info("\n {}", df.head())

    # drop NAs / rows with empty strings
    df.replace('', np.nan, inplace=True)
    df.dropna(inplace=True)

    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:LiteratureTriple) ASSERT s.id IS UNIQUE",
        "CREATE INDEX ON :LiteratureTriple(name);",
        "CREATE INDEX ON :LiteratureTriple(subject_id);",
        "CREATE INDEX ON :LiteratureTriple(object_id);",
        "CREATE INDEX ON :LiteratureTriple(predicate);",
    ]
    create_constraints(constraintCommands, meta_id)
Example #27
def process():
    df = pd.read_csv(os.path.join(dataDir, FILE))
    logger.info(df.head())
    keep_cols = ["reactome_id", "name"]
    df = df[keep_cols]
    df["url"] = "https://reactome.org/PathwayBrowser/#/" + df["reactome_id"]
    df.rename(columns={"reactome_id": "id"}, inplace=True)
    df.drop_duplicates(inplace=True)
    logger.info(df["url"].head())

    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Pathway) ASSERT p.id IS UNIQUE",
        "CREATE index on :Pathway(name);",
    ]
    create_constraints(constraintCommands, meta_id)
Example #28
def process():
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep=",")
    logger.info("\n {}", df.head())
    keep_cols = ["molecule_name", "molecule_type", "chembl_uri"]
    df = df[keep_cols]
    col_names = ["label", "molecule_type", "id"]
    df.columns = col_names
    df.drop_duplicates(inplace=True)
    # set label to uppercase
    df["label"] = df["label"].str.upper()
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = ["CREATE index on :Drug(label);"]
    create_constraints(constraintCommands, meta_id)
Example #29
def process():
    logger.info("loading semrep data...{}",FILE)
    sem_df = pd.read_csv(os.path.join(dataDir, FILE), sep=",", compression="gzip")
    logger.info(sem_df)
    #create new ids 
    logger.info('Dealing with IDs')
    sub_id = sem_df.apply(lambda row : make_id(row,'sub'), axis = 1)
    obj_id = sem_df.apply(lambda row : make_id(row,'obj'), axis = 1)
    sem_df['sub_id_all']=sub_id
    sem_df['obj_id_all']=obj_id

    # need to split subject and object ids by ,
    logger.info(sem_df.shape)
    sem_df = (
        sem_df.assign(sub_id_all=sem_df.sub_id_all.str.split(","))
        .explode("sub_id_all")
        .reset_index(drop=True)
    )
    logger.info(sem_df.shape)
    sem_df = (
        sem_df.assign(obj_id_all=sem_df.obj_id_all.str.split(","))
        .explode("obj_id_all")
        .reset_index(drop=True)
    )
    logger.info(sem_df.shape)

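    # composite triple id: subject:predicate:object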
    sem_id = sem_df['sub_id_all'] + ':' + sem_df['pred'] + ':' + sem_df['obj_id_all']
    logger.debug(sem_id)
    sem_df['id'] = sem_id
    logger.info("\n{}", sem_df)
    logger.info(sem_df.shape)

    sem_df.rename(columns={'sub_id_all': 'subject_id'}, inplace=True)

    keep_cols = ["subject_id", "id"]
    sem_df = sem_df[keep_cols]
    logger.info(sem_df.shape)
    sem_df.drop_duplicates(inplace=True)
    sem_df.columns = ["target", "source"]
    logger.info(sem_df.shape)
    logger.info("\n {}", sem_df.head())

    create_import(df=sem_df, meta_id=meta_id)
Example #30
def process():
    data = os.path.join(dataDir, FILE)
    # some bad rows caused by extra commas, so skip them
    # (on_bad_lines replaces the deprecated error_bad_lines in pandas >= 1.3)
    df = pd.read_csv(data, sep=",", skiprows=1, on_bad_lines="skip")
    keep_cols = ["Drug"]
    df = df[keep_cols]
    df.drop_duplicates(inplace=True)
    df.columns = ["label"]
    # set label to uppercase
    df["label"] = df["label"].str.upper()
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:Drug) ASSERT s.label IS UNIQUE;"
    ]
    create_constraints(constraintCommands, meta_id)
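
Note: every example depends on shared helpers and module globals (create_import, create_constraints, dataDir, FILE, meta_id, logger) defined elsewhere in the pipeline. For orientation only, below is a minimal sketch of what the two helpers might look like, assuming a CSV-staging Neo4j bulk-import flow and the official neo4j Python driver; importDir, the bolt URI, and the credentials are hypothetical placeholders, not the project's actual implementation.

import os

import pandas as pd
from neo4j import GraphDatabase

# hypothetical configuration; the real pipeline defines these elsewhere
importDir = "/var/lib/neo4j/import"
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))


def create_import(df, meta_id):
    # sketch: write the prepared frame to a CSV named after the import step,
    # ready to be bulk-loaded into the graph
    df.to_csv(os.path.join(importDir, f"{meta_id}.csv"), index=False)


def create_constraints(constraintCommands, meta_id):
    # sketch: run each Cypher constraint/index statement against the database
    with driver.session() as session:
        for cmd in constraintCommands:
            session.run(cmd)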