def process(): logger.info("Processing data...") df = pd.read_csv(os.path.join(dataDir, FILE), sep=",") cleaned = clean_ids(df["Gwas.id"].values) df["Gwas.id"] = cleaned logger.info("\n{}", df.head()) df.rename( columns={ "Gene.ensembl_id": "source", "Gwas.id": "target", "XQTL_MULTI_SNP_MR.b": "beta", "XQTL_MULTI_SNP_MR.se": "se", "XQTL_MULTI_SNP_MR.p": "p", "XQTL_MULTI_SNP_MR.qtl_type": "qtl_type", "XQTL_MULTI_SNP_MR.mr_method": "mr_method", }, inplace=True, ) logger.info(df.shape) logger.info("\n{}", df.head()) df.drop_duplicates(inplace=True) logger.info(df.shape) logger.info("\n{}", df.head()) create_import(df=df, meta_id=meta_id)
def process_data(): logger.info("Processing mr data...") col_names = [ "source", "target", "method", "nsnp", "b", "se", "ci_low", "ci_upp", "pval", "selection", "moescore", ] data = os.path.join(dataDir, FILE) df = pd.read_csv(data,header=None) df.columns = col_names logger.info(df.shape) df.dropna(subset=['pval','se']) df.drop_duplicates(inplace=True) logger.info(df.shape) logger.info(df.head()) create_import(df=df, meta_id=meta_id)
def gene():
    FILE = get_source(meta_id, 1)
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    # add column names
    col_names = [
        "chr",
        "type",
        "name",
        "description",
        "biomart_source",
        "ensembl_id",
        "start",
        "end",
    ]
    df.columns = col_names
    df.drop_duplicates(inplace=True)
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (g:Gene) ASSERT g.ensembl_id IS UNIQUE",
        "CREATE INDEX ON :Gene(name)",
        "CREATE INDEX ON :Gene(chr)",
    ]
    create_constraints(constraintCommands, meta_id)
def process():
    # select the file
    FILE = get_source(meta_id, 1)
    logger.info("Reading {}", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE))
    # logger.info(df.columns)
    logger.info(df.shape)
    # drop some columns
    df.drop(
        ["access", "priority", "coverage", ""],
        axis=1,
        inplace=True,
        errors="ignore",
    )
    logger.info(df.shape)
    # create the csv and import data
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (g:Gwas) ASSERT g.id IS UNIQUE",
        "CREATE index on :Gwas(trait)",
        "CREATE index on :Gwas(filename)",
    ]
    create_constraints(constraintCommands, meta_id)
def process(): logger.info("loading semrep data...{}", FILE) sem_df = pd.read_csv(os.path.join(dataDir, FILE), sep=",", compression="gzip") logger.info(sem_df) #create new ids logger.info('Dealing with IDs') sub_id = sem_df.apply(lambda row: make_id(row, 'sub'), axis=1) obj_id = sem_df.apply(lambda row: make_id(row, 'obj'), axis=1) sem_df['sub_id_all'] = sub_id sem_df['obj_id_all'] = obj_id # need to split subject and object ids by , logger.info(sem_df.shape) sem_df = (sem_df.assign(sub_id_all=sem_df.sub_id_all.str.split( ",")).explode("sub_id_all").reset_index(drop=True)) logger.info(sem_df.shape) sem_df = (sem_df.assign(obj_id_all=sem_df.obj_id_all.str.split( ",")).explode("obj_id_all").reset_index(drop=True)) logger.info(sem_df.shape) keep_cols = ["sub_id_all", "pred", "obj_id_all"] sem_df = sem_df[keep_cols] sem_df = pd.DataFrame({ "count": sem_df.groupby(keep_cols).size() }).reset_index() logger.info(sem_df.shape) sem_df.drop_duplicates(inplace=True) logger.info(sem_df.shape) sem_df.columns = ["source", "predicate", "target", "count"] logger.info("\n {}", sem_df) create_import(df=sem_df, meta_id=meta_id)
def process(): # load predicate data logger.info("loading data...") df = pd.read_csv( os.path.join(dataDir, FILE), sep=",", compression="gzip" ) logger.info(df.shape) # group # keep_cols = ["predicate","subject_name","object_name","subject_type","object_type","subject_id","object_id","id"] keep_cols = ["PMID", "predicate", "subject_id", "object_id", "id"] # need to split subject and object ids by | df = ( df.assign(subject_id=df.subject_id.str.split("|")) .explode("subject_id") .reset_index(drop=True) ) logger.info(df.shape) df = ( df.assign(object_id=df.object_id.str.split("|")) .explode("object_id") .reset_index(drop=True) ) logger.info(df.shape) logger.info("\n {}", df) df["id"] = df["subject_id"] + ":" + df["predicate"] + ":" + df["object_id"] df = df[["PMID", "id"]] logger.info(df.shape) df.drop_duplicates(inplace=True) logger.info(df.shape) df.columns = ["target", "source"] logger.info("\n {}", df) create_import(df=df, meta_id=meta_id)
def process(): # load predicate data logger.info("loading predication data...") pred_df = pd.read_csv(os.path.join(dataDir, PREDICATION_FILE), sep=",", compression="gzip") pred_df["PMID"] = pred_df["PMID"].astype(str) logger.info("loading citation data...") df = pd.read_csv(os.path.join(dataDir, PUB_FILE), sep="\t", compression="gzip") df.columns = ["id", "issn", "dp", "edat", "year"] df["id"] = df["id"].str.replace("'", "") logger.info(df.shape) # merge with predication data df_merge = df.merge(pred_df["PMID"], left_on="id", right_on="PMID") logger.info(df_merge.shape) # drop PMID column df_merge.drop("PMID", inplace=True, axis=1) # make unique df_merge.drop_duplicates(inplace=True) logger.info(df_merge.shape) logger.info(df_merge.shape) logger.info("\n {}", df_merge.head()) create_import(df=df_merge, meta_id=args.name) # create constraints constraintCommands = [ "CREATE CONSTRAINT ON (s:Literature) ASSERT s.id IS UNIQUE;", ] create_constraints(constraintCommands, meta_id)
def process_data(): logger.info("Processing vep data {}", vep_data) col_names = [ "source", "location", "allele", "target", "feature", "feature_type", "consequence", "cdna_position", "cds_position", "protein_position", "amino_acids", "codons", "existing_variation", "extra", ] # create csv file df = pd.read_csv(os.path.join(dataDir, vep_data), sep="\t", comment="#") df.drop_duplicates(inplace=True) df.columns = col_names print(df.head()) create_import(df=df, meta_id=meta_id)
def process(): df = pd.read_csv(os.path.join(dataDir, FILE), sep="\t").drop_duplicates() print(df.head()) print(df.shape) df.columns = ["source", "target", "gene_relationship_type", "last_updated"] logger.info(df.shape) logger.info("\n {}", df.head()) create_import(df=df, meta_id=meta_id)
def gwas():
    df = pd.read_csv(os.path.join(dataDir, FILE), low_memory=False)
    df = df[["id", "rsid", "beta", "p"]].drop_duplicates()
    # edit column names to match schema
    df.rename(columns={"id": "source", "rsid": "target", "p": "pval"}, inplace=True)
    create_import(df=df, meta_id=meta_id)
def process(): df = pd.read_csv(os.path.join(dataDir, FILE), sep="\t") print(df.head()) print(df.shape) df.columns = ["source", "target"] logger.info(df.shape) logger.info("\n {}", df.head()) create_import(df=df, meta_id=meta_id)
def process():
    merge = merge_data(BIO_DATA, BIO_SEM)
    logger.info(merge.shape)
    create_import(df=merge, meta_id=args.name)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:Literature) ASSERT s.id IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
def process(): df = pd.read_csv(os.path.join(dataDir, FILE)) logger.info(df.head()) keep_cols = ["source_id", "reactome_id"] df = df[keep_cols] df.rename(columns={"source_id": "source", "reactome_id": "target"}, inplace=True) df.drop_duplicates(inplace=True) logger.info(df.head()) create_import(df=df, meta_id=meta_id)
def process(): # load predicate data logger.info("loading data...") df = pd.read_csv(os.path.join(dataDir, FILE), sep="\t", compression="gzip") logger.info(df.shape) col_names = ["source", "target", "score"] df.columns = col_names df.drop_duplicates(inplace=True) logger.info(df.shape) logger.info("\n {}", df) create_import(df=df, meta_id=meta_id)
def protein():
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    df.columns = ["uniprot_id"]
    df["name"] = df["uniprot_id"]
    create_import(df=df, meta_id=meta_id)

    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE",
        "CREATE index on :Protein(name);",
    ]
    create_constraints(constraintCommands, meta_id)
def process(): logger.info("Loading efo...") data = os.path.join(dataDir, FILE) df = pd.read_csv(data) df.drop_duplicates(inplace=True) df.rename(columns={ 'efo.id': "target", 'parent_efo.id': "source" }, inplace=True) logger.info("\n{}", df.head()) create_import(df=df, meta_id=meta_id)
def process(): logger.info("Loading efo...{}", FILE) df = pd.read_csv( os.path.join(dataDir, FILE), sep="\t", names=["target", "efo_value", "score", "gwas_value", "source"], ) df = df.drop(["efo_value", "gwas_value"], axis=1) logger.info("\n{}", df.head()) logger.info(df.shape) df.drop_duplicates(inplace=True) logger.info(df.shape) create_import(df=df, meta_id=meta_id)
def process():
    FILE = get_source(meta_id, 1)
    df = pd.read_csv(os.path.join(dataDir, FILE), low_memory=False)
    df = df[["rsid"]].drop_duplicates()
    # change column name to match schema
    df.rename(columns={"rsid": "name"}, inplace=True)
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (v:Variant) ASSERT v.name IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
def process(): data = os.path.join(dataDir, FILE) # not sure why double quotes weren't being handled properly, added engine param df = pd.read_csv(data, sep=",", engine="python") logger.info(df.shape) logger.info("\n {}", df.head()) keep_cols = ["molecule_name", "target_id", "action_type", "phase"] df = df[keep_cols] col_names = ["source", "target", "action_type", "phase"] df.columns = col_names df["source"] = df["source"].str.upper() df.drop_duplicates(inplace=True) logger.info(df.shape) logger.info("\n {}", df.head()) create_import(df=df, meta_id=meta_id)
def process(): # load predicate data logger.info("loading data... {}", FILE) df = pd.read_csv(os.path.join(dataDir, FILE), sep="\t", compression="gzip", header=None) logger.info(df.shape) col_names = [ "query", "triple", "subject_name", "subject_type", "subject_id", "predicate", "object_name", "object_type", "object_id", "localCount", "localTotal", "globalCount", "globalTotal", "odds", "pval", "pmids", "gwas-id", ] df.columns = col_names logger.info(df.shape) keep_cols = ["pmids", "gwas-id"] df = df[keep_cols] # group # keep_cols = ["predicate","subject_name","object_name","subject_type","object_type","subject_id","object_id","id"] # need to split pmids df = (df.assign( pmids=df.pmids.str.split(" ")).explode("pmids").reset_index(drop=True)) logger.info(df.shape) logger.info("\n {}", df) df.drop_duplicates(inplace=True) df.columns = ["target", "source"] logger.info(df.shape) logger.info("\n {}", df) create_import(df=df, meta_id=meta_id)
def process(): df = pd.read_csv(os.path.join(dataDir, FILE)) logger.info(df.head()) keep_cols = ["source_id"] df = df[keep_cols] df.rename(columns={"source_id": "uniprot_id"}, inplace=True) df["name"] = df["uniprot_id"] df.drop_duplicates(inplace=True) logger.info(df.head()) create_import(df=df, meta_id=meta_id) constraintCommands = [ "CREATE CONSTRAINT ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE", "CREATE index on :Protein(name);", ] create_constraints(constraintCommands, meta_id)
def variant():
    print("Reading...", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE))
    print("Writing...")
    # create csv
    df.rename(columns={"rsid": "name"}, inplace=True)
    df = df[["name"]].drop_duplicates()
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (v:Variant) ASSERT v.name IS UNIQUE;",
        "CREATE index on :Variant(chr);",
        "CREATE index on :Variant(pos);",
    ]
    create_constraints(constraintCommands, meta_id)
def process(): logger.info("Processing data...") FILE = "XQTL_SINGLE_SNP_MR_SNP_GENE.csv" df = pd.read_csv(os.path.join(dataDir, FILE), sep=",") df.rename(columns={ "Snp.name": "source", "Gene.ensembl_id": "target", }, inplace=True) logger.info(df.shape) logger.info("\n{}", df.head()) df.drop_duplicates(inplace=True) logger.info(df.shape) logger.info("\n{}", df.head()) create_import(df=df, meta_id=meta_id)
def process(): data = os.path.join(dataDir, FILE) # not sure why double quotes weren't being handled properly, added engine param df = pd.read_csv(data, sep=",", engine="python") logger.info(df.shape) logger.info("\n {}", df.head()) #get disease data disease_df = get_disease_data() disease_df['mondo_efo_id'] = 'http://www.ebi.ac.uk/efo/EFO_' + disease_df[ 'mondo_efo_id'].astype(str) logger.info(disease_df) keep_cols = [ "molecule_name", "efo_id", ] df = df[keep_cols] mondo_match = pd.merge(df, disease_df, left_on='efo_id', right_on='disease_id')[[ 'molecule_name', 'disease_id' ]] #logger.info(mondo_match) efo_match = pd.merge(df, disease_df, left_on='efo_id', right_on='mondo_efo_id')[[ 'molecule_name', 'disease_id' ]] #logger.info(efo_match) cat_df = pd.concat([mondo_match, efo_match]) logger.info(cat_df.shape) cat_df.drop_duplicates(inplace=True) logger.info(cat_df.shape) col_names = ["source", "target"] cat_df.columns = col_names cat_df["source"] = cat_df["source"].str.upper() create_import(df=cat_df, meta_id=meta_id)
def process_data(): print("Processing instruments...") csv_data = [] col_names = [ "target", "source", "beta", "se", "pval", "eaf", "samplesize", "ncase", "ncontrol", ] with gzip.open(os.path.join(dataDir, FILE), "rt") as f: next(f) filereader = csv.reader(f, delimiter=",") for line in filereader: variant, gwas, beta, se, pval, eaf, samplesize, ncase, ncontrol = line try: float(pval) float(se) float(eaf) except ValueError: continue if not gwas.startswith("UKB"): gwas = "IEU-a-" + gwas gwas = gwas.replace(":", "-") t = [ variant, gwas.lower(), beta, se, pval, eaf, str(int(float(samplesize))), ncase, ncontrol, ] csv_data.append(t) # create csv file df = pd.DataFrame(csv_data) df.columns = col_names print(df.head()) create_import(df=df, meta_id=meta_id)
def process(): # load predicate data logger.info("loading data...") df = pd.read_csv(os.path.join(dataDir, PREDICATION_FILE), sep=",", compression="gzip") logger.info(df.shape) # need to split subject and object ids by | df = (df.assign(subject_id=df.subject_id.str.split("|")).explode( "subject_id").reset_index(drop=True)) logger.info(df.shape) df = (df.assign(object_id=df.object_id.str.split("|")).explode( "object_id").reset_index(drop=True)) logger.info(df.shape) logger.info("\n {}", df) df["id"] = df["subject_id"] + ":" + df["predicate"] + ":" + df["object_id"] df["name"] = df["subject_name"] + " " + df["predicate"] + " " + df[ "object_name"] # keep_cols = ["predicate","subject_name","object_name","subject_type","object_type","subject_id","object_id","id"] keep_cols = ["predicate", "subject_id", "object_id", "id", "name"] #df = pd.DataFrame({"count": df.groupby(keep_cols).size()}).reset_index() df = df[keep_cols] df.drop_duplicates(subset=['id'], inplace=True) logger.info(df.shape) logger.info("\n {}", df.head()) #drop nas/rows with empty string df.replace('', np.nan, inplace=True) df.dropna(inplace=True) create_import(df=df, meta_id=meta_id) # create constraints constraintCommands = [ "CREATE CONSTRAINT ON (s:LiteratureTriple) ASSERT s.id IS UNIQUE", "CREATE INDEX ON :LiteratureTriple(name);", "CREATE INDEX ON :LiteratureTriple(subject_id);", "CREATE INDEX ON :LiteratureTriple(object_id);", "CREATE INDEX ON :LiteratureTriple(predicate);", ] create_constraints(constraintCommands, meta_id)
def process(): df = pd.read_csv(os.path.join(dataDir, FILE)) logger.info(df.head()) keep_cols = ["reactome_id", "name"] df = df[keep_cols] df["url"] = "https://reactome.org/PathwayBrowser/#/" + df["reactome_id"] df.rename(columns={"reactome_id": "id"}, inplace=True) df.drop_duplicates(inplace=True) logger.info(df["url"].head()) create_import(df=df, meta_id=meta_id) # create constraints constraintCommands = [ "CREATE CONSTRAINT ON (p:Pathway) ASSERT p.id IS UNIQUE", "CREATE index on :Pathway(name);", ] create_constraints(constraintCommands, meta_id)
def process(): data = os.path.join(dataDir, FILE) df = pd.read_csv(data, sep=",") logger.info("\n {}", df.head()) keep_cols = ["molecule_name", "molecule_type", "chembl_uri"] df = df[keep_cols] col_names = ["label", "molecule_type", "id"] df.columns = col_names df.drop_duplicates(inplace=True) # set label to uppercase df["label"] = df["label"].str.upper() logger.info(df.shape) logger.info("\n {}", df.head()) create_import(df=df, meta_id=meta_id) # create constraints constraintCommands = ["CREATE index on :Drug(label);"] create_constraints(constraintCommands, meta_id)
def process(): logger.info("loading semrep data...{}",FILE) sem_df = pd.read_csv(os.path.join(dataDir, FILE), sep=",", compression="gzip") logger.info(sem_df) #create new ids logger.info('Dealing with IDs') sub_id = sem_df.apply(lambda row : make_id(row,'sub'), axis = 1) obj_id = sem_df.apply(lambda row : make_id(row,'obj'), axis = 1) sem_df['sub_id_all']=sub_id sem_df['obj_id_all']=obj_id # need to split subject and object ids by , logger.info(sem_df.shape) sem_df = ( sem_df.assign(sub_id_all=sem_df.sub_id_all.str.split(",")) .explode("sub_id_all") .reset_index(drop=True) ) logger.info(sem_df.shape) sem_df = ( sem_df.assign(obj_id_all=sem_df.obj_id_all.str.split(",")) .explode("obj_id_all") .reset_index(drop=True) ) logger.info(sem_df.shape) sem_id = sem_df['sub_id_all']+':'+sem_df['pred']+':'+sem_df['obj_id_all'] logger.debug(sem_id) sem_df['id']=sem_id logger.info("\n{}", sem_df) logger.info(sem_df.shape) sem_df.rename(columns={'sub_id_all':'subject_id'},inplace=True) keep_cols = ["subject_id", "id"] sem_df = sem_df[keep_cols] logger.info(sem_df.shape) sem_df.drop_duplicates(inplace=True) sem_df.columns = ["target", "source"] logger.info(sem_df.shape) logger.info("\n {}", sem_df.head()) create_import(df=sem_df, meta_id=meta_id)
def process():
    data = os.path.join(dataDir, FILE)
    # some bad rows caused by extra commas so need to skip
    df = pd.read_csv(data, sep=",", skiprows=1, error_bad_lines=False)
    keep_cols = ["Drug"]
    df = df[keep_cols]
    df.drop_duplicates(inplace=True)
    df.columns = ["label"]
    # set label to uppercase
    df["label"] = df["label"].str.upper()
    logger.info(df.shape)
    logger.info("\n {}", df.head())
    create_import(df=df, meta_id=meta_id)

    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (s:Drug) ASSERT s.label IS UNIQUE;"
    ]
    create_constraints(constraintCommands, meta_id)