def gene():
    FILE = get_source(meta_id, 1)
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    # add column names
    col_names = [
        "chr",
        "type",
        "name",
        "description",
        "biomart_source",
        "ensembl_id",
        "start",
        "end",
    ]
    df.columns = col_names
    df.drop_duplicates(inplace=True)
    create_import(df=df, meta_id=meta_id)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (g:Gene) ASSERT g.ensembl_id IS UNIQUE",
        "CREATE INDEX ON :Gene(name)",
        "CREATE INDEX ON :Gene(chr)",
    ]
    create_constraints(constraintCommands, meta_id)
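# Aside: the constraint commands above use the legacy (Neo4j 3.x) Cypher
# syntax that these scripts target. A minimal sketch of the equivalent
# statements, assuming a Neo4j 4.4+ server (an assumption, not what the
# script itself runs against), would be:
modern_constraint_commands = [
    "CREATE CONSTRAINT gene_ensembl_id IF NOT EXISTS "
    "FOR (g:Gene) REQUIRE g.ensembl_id IS UNIQUE",
    "CREATE INDEX gene_name IF NOT EXISTS FOR (g:Gene) ON (g.name)",
    "CREATE INDEX gene_chr IF NOT EXISTS FOR (g:Gene) ON (g.chr)",
]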
def process():
    # select the file
    FILE = get_source(meta_id, 1)
    logger.info("Reading {}", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE))
    # logger.info(df.columns)
    logger.info(df.shape)
    # drop some columns; "" catches an unnamed column, and
    # errors="ignore" skips any that are absent from this file
    df.drop(
        ["access", "priority", "coverage", ""],
        axis=1,
        inplace=True,
        errors="ignore",
    )
    logger.info(df.shape)
    # create the csv and import data
    create_import(df=df, meta_id=meta_id)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (g:Gwas) ASSERT g.id IS UNIQUE",
        "CREATE INDEX ON :Gwas(trait)",
        "CREATE INDEX ON :Gwas(filename)",
    ]
    create_constraints(constraintCommands, meta_id)
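# A minimal sketch of how errors="ignore" behaves in the drop above:
# columns named in the list but absent from the frame are skipped
# silently rather than raising a KeyError. The toy frame is
# illustrative only.
import pandas as pd

toy = pd.DataFrame({"access": [1], "trait": ["bmi"]})
toy = toy.drop(["access", "coverage"], axis=1, errors="ignore")
print(toy.columns.tolist())  # ['trait'] -- "coverage" was never present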
def protein():
    FILE = get_source(meta_id, 1)
    data = os.path.join(dataDir, FILE)
    df = pd.read_csv(data, sep="\t")
    df.columns = ["uniprot_id"]
    df["name"] = df["uniprot_id"]
    create_import(df=df, meta_id=meta_id)
    constraintCommands = [
        "CREATE CONSTRAINT ON (p:Protein) ASSERT p.uniprot_id IS UNIQUE",
    ]
    create_constraints(constraintCommands, meta_id)
def process():
    FILE = get_source(meta_id, 1)
    df = pd.read_csv(os.path.join(dataDir, FILE), low_memory=False)
    df = df[["rsid"]].drop_duplicates()
    # change column name to match schema
    df.rename(columns={"rsid": "name"}, inplace=True)
    create_import(df=df, meta_id=meta_id)
    # create constraints
    constraintCommands = [
        "CREATE CONSTRAINT ON (v:Variant) ASSERT v.name IS UNIQUE;",
    ]
    create_constraints(constraintCommands, meta_id)
import os

import pandas as pd

from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
# (setup, get_source, and logger come from the repo's shared utility
# modules; their imports are omitted from these excerpts)
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

FILE = get_source(meta_id, 1)


def process():
    # select the file
    logger.info("Reading {}", FILE)
    df = pd.read_csv(os.path.join(dataDir, FILE))
    # logger.info(df.columns)
    logger.info(df.shape)
    # drop some columns
    df.drop(
        [
            "access",
            "priority",
            "coverage",
            "doi",
            "group_name",
            "imputation_panel",
            "ontology",
            "study_design",
            "covariates",
            "",
        ],
        axis=1,
        inplace=True,
        errors="ignore",
    )
    logger.info(df.shape)
    # create the csv and import data
    create_import(df=df, meta_id=meta_id)
import os

import pandas as pd

from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

BIO_DATA = get_source(meta_id, 1)
BIO_SEM = get_source(meta_id, 2)


def merge_data(lit_data, sem_data):
    # load predicate data
    logger.info("loading data...")
    data_df = pd.read_csv(
        os.path.join(dataDir, lit_data), sep=",", compression="gzip"
    )
    logger.info("\n{}", data_df)
    logger.info("loading semrep data...")
    sem_df = pd.read_csv(
        os.path.join(dataDir, sem_data), sep=",", compression="gzip"
    )
import os

import pandas as pd

from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

FILE1 = get_source(meta_id, 1)
FILE2 = get_source(meta_id, 2)


def process():
    df1 = pd.read_csv(os.path.join(dataDir, FILE1), sep=" ")
    # filter by score
    df1 = df1[df1["combined_score"] >= 700]
    logger.info(df1.shape)
    logger.info("\n {}", df1.head())
    df2 = pd.read_csv(os.path.join(dataDir, FILE2), sep="\t")
    df2.columns = ["species", "uniprot", "protein", "x", "y"]
    # keep only the accession, i.e. the text before the first "|"
    df2["uniprot"] = df2["uniprot"].str.split("|", expand=True)[0]
    logger.info(df2.shape)
    logger.info("\n {}", df2.head())
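# A minimal sketch of the str.split(..., expand=True)[0] pattern above:
# splitting on "|" with expand=True returns a frame of parts, and column
# 0 holds the first token. The toy values are illustrative only.
import pandas as pd

s = pd.Series(["P31946|1433B_HUMAN", "P62258|1433E_HUMAN"])
print(s.str.split("|", expand=True)[0].tolist())  # ['P31946', 'P62258']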
import os

import pandas as pd

from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

PREDICATION_FILE = get_source(meta_id, 1)


def process():
    # load predicate data
    logger.info("loading data...")
    df = pd.read_csv(
        os.path.join(dataDir, PREDICATION_FILE), sep=",", compression="gzip"
    )
    logger.info(df.shape)
    # need to split subject and object ids by |
    df = (
        df.assign(subject_id=df.subject_id.str.split("|"))
        .explode("subject_id")
        .reset_index(drop=True)
    )
    logger.info(df.shape)
    df = (
        df.assign(object_id=df.object_id.str.split("|"))
        .explode("object_id")
        .reset_index(drop=True)
    )
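# A minimal sketch of the split-then-explode pattern used above: a
# pipe-delimited ID field becomes one row per ID, with the remaining
# columns duplicated. The toy frame is illustrative only.
import pandas as pd

toy = pd.DataFrame(
    {"subject_id": ["C001|C002", "C003"], "predicate": ["TREATS", "CAUSES"]}
)
toy = (
    toy.assign(subject_id=toy.subject_id.str.split("|"))
    .explode("subject_id")
    .reset_index(drop=True)
)
print(toy)
#   subject_id predicate
# 0       C001    TREATS
# 1       C002    TREATS
# 2       C003    CAUSES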
import os

import pandas as pd

from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

vep_data = get_source(meta_id, 1)


def process_data():
    logger.info("Processing vep data {}", vep_data)
    col_names = [
        "source",
        "location",
        "allele",
        "target",
        "feature",
        "feature_type",
        "consequence",
        "cdna_position",
        "cds_position",
        "protein_position",
        # the excerpt was truncated here; the names below follow VEP's
        # default tab output (Amino_acids, Codons, Existing_variation,
        # Extra) and are reconstructed on that assumption
        "amino_acids",
        "codons",
        "existing_variation",
        "extra",
    ]
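# A minimal sketch of how the VEP file might then be read with the
# column names above (col_names as defined in process_data). The
# comment="#" argument is an assumption about the file layout: it skips
# VEP's "##" metadata header lines.
df = pd.read_csv(
    os.path.join(dataDir, vep_data),
    sep="\t",
    comment="#",
    names=col_names,
)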
import os

import pandas as pd

from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

MED_DATA = get_source(meta_id, 1)
MED_SEM = get_source(meta_id, 2)


def merge_data(lit_data, sem_data):
    # load predicate data
    logger.info("loading data...")
    data_df = pd.read_csv(
        os.path.join(dataDir, lit_data), sep=",", compression="gzip"
    )
    logger.info("\n{}", data_df)
    logger.info("loading semrep data...")
    sem_df = pd.read_csv(
        os.path.join(dataDir, sem_data), sep=",", compression="gzip"
    )
import os

import pandas as pd

from workflow.scripts.utils.writers import (
    create_constraints,
    create_import,
)

# setup
args, dataDir = setup()
meta_id = args.name

# args = the argparse arguments (name and data)
# dataDir = the path to the working directory for this node/rel

#######################################################################

SEM = get_source(meta_id, 1)


def make_id(row, sub_type):
    # prefer the concept id; fall back to the gene id when it is missing
    id_val = row[sub_type + "_id"]
    if pd.isna(row[sub_type + "_id"]):
        id_val = row[sub_type + "_gene_id"]
    return id_val


def process():
    logger.info("loading semrep data...{}", SEM)
    sem_df = pd.read_csv(os.path.join(dataDir, SEM), sep=",", compression="gzip")
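# A minimal sketch of make_id on a toy frame, assuming make_id as defined
# above: apply() runs it row-wise, and the same fallback can be written
# vectorised with fillna(). The toy data is illustrative only.
import pandas as pd

toy = pd.DataFrame(
    {"subject_id": ["C001", None], "subject_gene_id": ["G1", "G2"]}
)
toy["subject_final"] = toy.apply(make_id, axis=1, sub_type="subject")
# equivalent vectorised form:
# toy["subject_final"] = toy["subject_id"].fillna(toy["subject_gene_id"])
print(toy["subject_final"].tolist())  # ['C001', 'G2']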