# Shared imports for the scripts excerpted below. Assumption: rdflib_util
# (imported as ru) is a local helper module, sketched after the first
# function; each script is assumed to pull in the subset of these it needs.
import argparse
import logging
import re
import sys

import rdflib

import rdflib_util as ru
import rdflib_list_2nd_level_datasets
import rdflib_list_dataset_variables
import rdflib_list_study_group_members
import rdflib_tabular_dump


# Wrapper main() for the API-call version of the study group member listing.
def main():
    # input
    parser = argparse.ArgumentParser(
        description='List subjects in a given DATS Dataset and StudyGroup.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON file.')
    parser.add_argument(
        '--dataset_id',
        required=False,
        help='DATS identifier of the Dataset linked to the StudyGroup of interest.')
    parser.add_argument('--study_group_name',
                        required=False,
                        help='Name of the StudyGroup of interest.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # run query
    members = list_study_group_members(g, args.dataset_id, args.study_group_name)
    print_results(members, args.dataset_id, args.study_group_name)
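
# Every script in this section delegates JSON-LD parsing to
# ru.read_json_ld_graph. Below is a minimal sketch of that helper, assuming
# the local rdflib_util module does little more than wrap rdflib's JSON-LD
# parser and define the shared RDF term constants used by the API-call
# variants. The module's real contents are not shown in this section, so the
# namespace URI and constant list here are assumptions; the OBO IDs are taken
# from the glossary comments in the queries below.

# --- rdflib_util.py (sketch) ---
import json
import logging

import rdflib

OBO = rdflib.Namespace("http://purl.obolibrary.org/obo/")

RDF_TYPE_TERM = rdflib.RDF.type
DATS_DATASET_TERM = OBO["IAO_0000100"]  # "data set"
CENTRAL_ID_TERM = OBO["IAO_0000577"]    # "centrally registered identifier symbol"
HAS_PART_TERM = OBO["BFO_0000051"]      # "has part"
HAS_MEMBER_TERM = OBO["RO_0002351"]     # "has member"
PRODUCED_BY_TERM = OBO["RO_0003001"]    # "produced by"
NAME_TERM = OBO["IAO_0000590"]          # "a textual entity that denotes a particular in reality"


def read_json_ld_graph(file):
    """Parse a DATS JSON-LD file into an rdflib Graph."""
    logging.info("reading DATS JSON-LD metadata from " + file)
    with open(file, "r") as f:
        json_data = json.load(f)
    g = rdflib.Graph()
    # the JSON-LD parser is built in to rdflib >= 6.0; older releases need
    # the separate rdflib-jsonld plugin
    g.parse(data=json.dumps(json_data), format="json-ld")
    logging.info("parsed " + str(len(g)) + " triples")
    return g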

# Driver main() that runs all of the API-call queries in turn, plus the
# tabular dump.
def main():
    # input
    parser = argparse.ArgumentParser(
        description='Run test queries on TOPMed instance.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON-LD file.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # list 2nd-level datasets
    datasets = rdflib_list_2nd_level_datasets.list_2nd_level_datasets(g)
    rdflib_list_2nd_level_datasets.print_results(datasets)

    # list dataset variables
    for dataset_id in DATASETS:
        variables = rdflib_list_dataset_variables.list_dataset_variables(g, dataset_id)
        rdflib_list_dataset_variables.print_results(variables, dataset_id)

    # list study group members
    for dataset_id in DATASETS:
        for study_group in STUDY_GROUPS:
            members = rdflib_list_study_group_members.list_study_group_members(g, dataset_id, study_group)
            rdflib_list_study_group_members.print_results(members, dataset_id, study_group)

    # create tabular data dump
    rdflib_tabular_dump.print_tabular_dump(g)
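
# The driver above iterates over DATASETS and STUDY_GROUPS, which are
# presumably module-level constants. Judging from the near-identical driver
# later in this section, which inlines the same loops with literal values,
# they would look something like this (an assumption; the definitions are not
# shown in the original):
DATASETS = ['phs001024.v3.p1', 'phs000951.v2.p2', 'phs000179.v5.p2']
STUDY_GROUPS = ['all subjects']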

def main():
    # input
    parser = argparse.ArgumentParser(
        description='List subjects in a given DATS Dataset and StudyGroup.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON file.')
    parser.add_argument(
        '--dataset_id',
        required=False,
        help='DATS identifier of the Dataset linked to the StudyGroup of interest.')
    parser.add_argument('--study_group_name',
                        required=False,
                        help='Name of the StudyGroup of interest.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # obo:IAO_0000100 - "data set"
    # obo:IAO_0000577 - "centrally registered identifier symbol"
    # obo:RO_0003001 - "produced by"
    # obo:OBI_0000066 - "investigation"
    # obo:BFO_0000051 - "has part"
    # obo:STATO_0000193 - "study group population"
    # obo:RO_0002351 - "has member"
    # obo:IAO_0000590 - "a textual entity that denotes a particular in reality"
    # obo:BFO_0000040 - "material entity"

    bindings = {}
    if args.dataset_id is not None:
        bindings['dbgap_study_acc'] = rdflib.term.Literal(args.dataset_id)
    if args.study_group_name is not None:
        bindings['study_group_name'] = rdflib.term.Literal(args.study_group_name, lang="en")

    qres = g.query(
        """
        SELECT ?dbgap_study_acc ?study_group_name ?subject_name
        WHERE {
            ?dataset a obo:IAO_0000100.
            ?dataset obo:IAO_0000577 ?dataset_id.
            ?dataset_id sdo:identifier ?dbgap_study_acc.
            ?dataset obo:RO_0003001 ?study.
            ?study a obo:OBI_0000066.
            ?study obo:BFO_0000051 ?study_group.
            ?study_group a obo:STATO_0000193.
            ?study_group obo:IAO_0000590 ?study_group_name.
            ?study_group obo:RO_0002351 ?subject.
            ?subject a obo:BFO_0000040.
            ?subject obo:IAO_0000590 ?subject_name.
        }
        ORDER BY ?dbgap_study_acc ?study_group_name ?subject_name
        """, initBindings=bindings)

    print()
    print("StudyGroup members:")
    print()
    print("dbGaP Study\tStudy Group\tSubject ID")
    for row in qres:
        print("%s\t%s\t%s" % row)
    print()
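
# Note that the query above, like all of the SPARQL in this section, uses the
# obo: and sdo: prefixes without declaring them, so they must already be
# bound on the graph. A minimal sketch of one way read_json_ld_graph could
# arrange that, assuming the standard OBO namespace URI and schema.org for
# sdo: (both assumptions; the actual URIs come from the DATS JSON-LD
# context):
import rdflib

OBO = rdflib.Namespace("http://purl.obolibrary.org/obo/")
SDO = rdflib.Namespace("http://schema.org/")

g = rdflib.Graph()
g.bind("obo", OBO)
g.bind("sdo", SDO)
# prefixes bound on the graph are visible to g.query(); alternatively they
# can be supplied per-query via initNs={'obo': OBO, 'sdo': SDO}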

def main():
    # input
    parser = argparse.ArgumentParser(
        description='List subjects and samples in TOPMed or GTEx crosscut model instance using an RDFLib SPARQL query.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON file.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # ------------------------------------------
    # RNA/DNA extract / sample / subject triples
    # ------------------------------------------

    # obo:IAO_0000100 - "data set"
    # obo:BFO_0000040 - "material entity"
    # obo:IAO_0000136 - "is about"
    # obo:IAO_0000590 - "a textual entity that denotes a particular in reality"
    # obo:BFO_0000023 - "role"
    # obo:RO_0001000 - "derives from"
    # obo:IAO_0000300 - "textual entity"

    qres = g.query("""
        SELECT DISTINCT ?rna_or_dna_extract ?sample_name ?sample_descr ?subject_name ?subject_descr
        WHERE {
            ?dataset a obo:IAO_0000100.
            ?mat1 a obo:BFO_0000040.
            ?dataset obo:IAO_0000136 ?mat1.
            ?mat1 obo:IAO_0000590 ?rna_or_dna_extract.
            ?mat1 obo:BFO_0000023 ?role.
            ?role sdo:value ?rolename.
            ?mat1 obo:RO_0001000 ?sample.
            ?sample obo:IAO_0000590 ?sample_name.
            ?sample obo:IAO_0000300 ?sample_descr.
            ?sample obo:RO_0001000 ?subject.
            ?subject obo:IAO_0000590 ?subject_name.
            ?subject obo:IAO_0000300 ?subject_descr.
            FILTER ((str(?rolename) = "DNA extract") || (str(?rolename) = "RNA extract")).
        }
        ORDER BY ?subject_name ?sample_name
        """)

    print("Samples and subjects:\n")
    print("RNA/DNA extract\tSample ID\tSample\tSubject ID\tSubject")
    for row in qres:
        print("%s\t%s\t%s\t%s\t%s" % row)
    print("\n")

def main():
    # input
    parser = argparse.ArgumentParser(
        description='Create tabular dump of DATS metadata using rdflib API calls.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON file.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    print_tabular_dump(g)

def main():
    # input
    parser = argparse.ArgumentParser(
        description='List variables available in the DATS Dataset that corresponds to a given dbGaP study.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON file.')
    parser.add_argument(
        '--dataset_id',
        required=False,
        help='DATS identifier of the Dataset whose variables should be retrieved.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # obo:IAO_0000100 - "data set"
    # obo:IAO_0000577 - "centrally registered identifier symbol"
    # obo:BFO_0000051 - "has part"
    # obo:STATO_0000258 - "variable"
    # obo:IAO_0000300 - "textual entity"
    # obo:IAO_0000590 - "a textual entity that denotes a particular in reality"

    bindings = None
    if args.dataset_id is not None:
        bindings = {'dbgap_study_acc': rdflib.term.Literal(args.dataset_id)}

    qres = g.query(
        """
        SELECT ?dbgap_study_acc ?dbgap_var_acc ?pname ?descr
        WHERE {
            ?dataset a obo:IAO_0000100.
            ?dataset obo:IAO_0000577 ?dataset_id.
            ?dataset_id sdo:identifier ?dbgap_study_acc.
            ?dataset obo:BFO_0000051 ?dim1.
            ?dim1 a obo:STATO_0000258.
            ?dim1 obo:IAO_0000300 ?descr.
            ?dim1 obo:IAO_0000577 ?dim1_id.
            ?dim1_id sdo:identifier ?dbgap_var_acc.
            ?dim1 obo:IAO_0000590 ?propname.
            ?propname sdo:value ?pname.
        }
        ORDER BY ?dbgap_study_acc ?dbgap_var_acc
        """, initBindings=bindings)

    print()
    print("Dataset variables:")
    print()
    print("dbGaP Study\tdbGaP variable\tName\tDescription")
    for row in qres:
        print("%s\t%s\t%s\t%s" % row)
    print()

def main():
    # input
    parser = argparse.ArgumentParser(
        description='List subject characteristics using an RDFLib SPARQL query.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON file.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # ------------------------------------------
    # List subject characteristics
    # ------------------------------------------

    # obo:BFO_0000040 - "material entity"
    # obo:IAO_0000590 - "a textual entity that denotes a particular in reality"
    # obo:BFO_0000023 - "role"
    # obo:RO_0000086 - "has quality"
    # obo:IAO_0000027 - "data item"
    # obo:IAO_0000577 - "centrally registered identifier symbol"

    qres = g.query("""
        SELECT DISTINCT ?subject_name ?dbgap_var_acc ?pname ?propvalue
        WHERE {
            ?subj1 a obo:BFO_0000040.
            ?subj1 obo:IAO_0000590 ?subject_name.
            ?subj1 obo:BFO_0000023 ?role.
            ?role sdo:value ?rolename.
            ?subj1 obo:RO_0000086 ?chars.
            ?chars obo:IAO_0000027 ?propvalue.
            ?chars obo:IAO_0000577 ?chars_id.
            ?chars_id sdo:identifier ?dbgap_var_acc.
            ?chars obo:IAO_0000590 ?propname.
            ?propname sdo:value ?pname.
            FILTER (str(?rolename) = "donor").
        }
        ORDER BY ?subject_name str(?pname)
        """)

    print("Subject characteristics:\n")
    print("Subject ID\tdbGaP variable\tCharacteristic\tValue")
    for row in qres:
        print("%s\t%s\t%s\t%s" % row)
    print("\n")

def main():
    # input
    parser = argparse.ArgumentParser(
        description='List variables available in the DATS Dataset that corresponds to a given dbGaP study.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON file.')
    parser.add_argument(
        '--dataset_id',
        required=False,
        help='DATS identifier of the Dataset whose variables should be retrieved.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # run query
    variables = list_dataset_variables(g, args.dataset_id)
    print_results(variables, args.dataset_id)

def main():
    # input
    parser = argparse.ArgumentParser(
        description='List 2nd level Datasets in TOPMed or GTEx crosscut model instance using an RDFLib SPARQL query.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON file.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # obo:IAO_0000100 - "data set"
    # obo:BFO_0000051 - "has part"
    # obo:OBI_0001622 - "a textual entity that denotes an investigation"
    # obo:IAO_0000577 - "centrally registered identifier symbol"

    qres = g.query("""
        SELECT ?ident ?title
        WHERE {
            ?top_dataset a obo:IAO_0000100.
            ?top_dataset obo:OBI_0001622 ?top_title.
            FILTER ((str(?top_title) = "Genotype-Tissue Expression Project (GTEx)") || (str(?top_title) = "Trans-Omics for Precision Medicine (TOPMed)")).
            ?top_dataset obo:BFO_0000051 ?dataset.
            ?dataset a obo:IAO_0000100.
            ?dataset obo:OBI_0001622 ?title.
            ?dataset obo:IAO_0000577 ?identifier.
            ?identifier sdo:identifier ?ident.
        }
        """)

    print()
    print("2nd-level DATS Datasets:")
    print()
    print("Dataset\tTitle")
    for row in qres:
        print("%s\t%s" % row)
    print("\n")

def main():
    # input
    parser = argparse.ArgumentParser(
        description='List sample characteristics using an RDFLib SPARQL query.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON file.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # ------------------------------------------
    # List sample characteristics
    # ------------------------------------------

    qres = g.query("""
        SELECT DISTINCT ?sample_name ?dbgap_var_acc ?pname ?propvalue
        WHERE {
            ?samp1 a obo:BFO_0000040.
            ?samp1 obo:IAO_0000590 ?sample_name.
            ?samp1 obo:BFO_0000023 ?role.
            ?role sdo:value ?rolename.
            ?samp1 obo:RO_0000086 ?chars.
            ?chars obo:IAO_0000027 ?propvalue.
            ?chars obo:IAO_0000577 ?chars_id.
            ?chars_id sdo:identifier ?dbgap_var_acc.
            ?chars obo:IAO_0000590 ?propname.
            ?propname sdo:value ?pname.
            FILTER (str(?rolename) = "specimen").
        }
        ORDER BY ?sample_name str(?pname)
        """)

    print("Sample characteristics:\n")
    print("Sample ID\tdbGaP variable\tCharacteristic\tValue")
    for row in qres:
        print("%s\t%s\t%s\t%s" % row)
    print("\n")

def main():
    # input
    parser = argparse.ArgumentParser(
        description='List 2nd level Datasets in TOPMed or GTEx crosscut model instance using RDFLib API calls.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON-LD file.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # run query
    datasets = list_2nd_level_datasets(g)
    print_results(datasets)

# Variant of the driver main() with the dataset and study group lists
# inlined (and without the tabular dump).
def main():
    # input
    parser = argparse.ArgumentParser(
        description='Run test queries on TOPMed instance.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON-LD file.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # list 2nd-level datasets
    datasets = rdflib_list_2nd_level_datasets.list_2nd_level_datasets(g)
    rdflib_list_2nd_level_datasets.print_results(datasets)

    # list dataset variables
    for dataset_id in ['phs001024.v3.p1', 'phs000951.v2.p2', 'phs000179.v5.p2']:
        variables = rdflib_list_dataset_variables.list_dataset_variables(g, dataset_id)
        rdflib_list_dataset_variables.print_results(variables, dataset_id)

    # list study group members
    for dataset_id in ['phs001024.v3.p1', 'phs000951.v2.p2', 'phs000179.v5.p2']:
        for study_group in ['all subjects']:
            members = rdflib_list_study_group_members.list_study_group_members(g, dataset_id, study_group)
            rdflib_list_study_group_members.print_results(members, dataset_id, study_group)

# API-call version of the tabular dump.
def main():
    # input
    parser = argparse.ArgumentParser(
        description='Create tabular dump of DATS metadata using rdflib API calls.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON file.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # find ALL Datasets, retain those with a DATS identifier
    all_datasets = [
        s for (s, p, o) in g.triples((None, ru.RDF_TYPE_TERM, ru.DATS_DATASET_TERM))
    ]
    dataset_ids = {}
    datasets = []
    for d in all_datasets:
        for (s, p, o) in g.triples((d, ru.CENTRAL_ID_TERM, None)):
            for (s2, p2, o2) in g.triples((o, ru.SDO_IDENT_TERM, None)):
                dataset_ids[d] = o2
        if d in dataset_ids:
            datasets.append(d)

    # retrieve top-level Dataset
    # filter Datasets, get those with a title matching one of these two strings:
    titles = [
        'Genotype-Tissue Expression Project (GTEx)',
        'Trans-Omics for Precision Medicine (TOPMed)'
    ]
    title_terms = [rdflib.term.Literal(t, datatype=ru.DESCR_TERM) for t in titles]
    tl_datasets = []
    project_name = None
    for d in all_datasets:
        for tt in title_terms:
            for (s, p, o) in g.triples((d, ru.TITLE_TERM, tt)):
                tl_datasets.append(d)
                project_name = tt
    if len(tl_datasets) != 1:
        logging.fatal("found " + str(len(tl_datasets)) + " top-level DATS Datasets")
        sys.exit(1)

    # link each Dataset to Study (should be 1-1)
    ds_to_study = {}
    for d in datasets:
        for (s, p, o) in g.triples((d, ru.PRODUCED_BY_TERM, None)):
            for (s2, p2, o2) in g.triples((o, ru.RDF_TYPE_TERM, ru.DATS_STUDY_TERM)):
                ds_to_study[d] = o

    # filter Datasets not linked to a study
    datasets = [d for d in datasets if d in ds_to_study]

    # link each Study to StudyGroup (1-many) and get StudyGroup name
    study_to_groups = {}
    study_group_to_name = {}
    for study in ds_to_study.values():
        groups = []
        for (s, p, o) in g.triples((study, ru.HAS_PART_TERM, None)):
            for (s2, p2, o2) in g.triples((o, ru.RDF_TYPE_TERM, ru.DATS_STUDY_GROUP_TERM)):
                # get name
                n_names = 0
                for (s3, p3, o3) in g.triples((o, ru.NAME_TERM, None)):
                    study_group_to_name[o] = o3
                    n_names += 1
                if n_names == 1:
                    groups.append(o)
        study_to_groups[study] = groups

    # find subjects in each study group and retrieve their names
    study_group_to_subjects = {}
    subject_to_name = {}
    for sg in study_group_to_name.keys():
        subjects = []
        for (s, p, o) in g.triples((sg, ru.HAS_MEMBER_TERM, None)):
            for (s2, p2, o2) in g.triples((o, ru.RDF_TYPE_TERM, ru.DATS_MATERIAL_TERM)):
                for (s3, p3, o3) in g.triples((o, ru.NAME_TERM, None)):
                    subject_to_name[o] = o3
                subjects.append(o)
        study_group_to_subjects[sg] = subjects

    # retrieve subject characteristics
    subject_to_chars = {}
    # all known characteristic names
    all_char_names = {}

    # SPARQL equivalent of the traversal below (X = handled):
    # ?subj1 obo:RO_0000086 ?chars. X
    # ?chars obo:IAO_0000027 ?propvalue. X
    # ?chars obo:IAO_0000577 ?chars_id.
    # ?chars_id sdo:identifier ?dbgap_var_acc.
    # ?chars obo:IAO_0000590 ?propname. X
    # ?propname sdo:value ?pname. X
    # FILTER (str(?rolename) = "donor").
    for subj in subject_to_name.keys():
        chars = {}
        for (s, p, o) in g.triples((subj, ru.HAS_QUALITY_TERM, None)):
            char_names = []
            char_values = []
            dbgap_ids = []
            # characteristic value(s)
            for (s2, p2, o2) in g.triples((o, ru.DATA_ITEM_TERM, None)):
                char_values.append(o2)
            # characteristic name(s)
            for (s2, p2, o2) in g.triples((o, ru.NAME_TERM, None)):
                for (s3, p3, o3) in g.triples((o2, ru.SDO_VALUE_TERM, None)):
                    char_names.append(o3)
            # dbGaP ids
            for (s2, p2, o2) in g.triples((o, ru.CENTRAL_ID_TERM, None)):
                for (s3, p3, o3) in g.triples((o2, ru.SDO_IDENT_TERM, None)):
                    dbgap_ids.append(o3)
            if len(char_values) == 0:
                char_values.append(None)
            # skip any characteristic that does not have exactly one name,
            # value, and dbGaP id
            if len(char_values) != 1 or len(char_names) != 1 or len(dbgap_ids) != 1:
                continue
            cval = str(char_values[0])
            cname = str(char_names[0])
            chars[cname] = {"value": cval, "name": cname}
            all_char_names[cname] = True
        subject_to_chars[subj] = chars

    sorted_char_names = sorted(all_char_names.keys())
    subject_to_files = {}

    # retrieve data files
    for d in all_datasets:
        distribs = []
        # file paths and sizes
        s3_URI = None
        gs_URI = None
        file_size = None

        # MD5 checksum
        # TODO - this is currently stored as a Dimension of Dataset, but will be moved
        md5_checksum = "TBD"
        for (s2, p2, o2) in g.triples((d, ru.HAS_PART_TERM, None)):
            for (s3, p3, o3) in g.triples((o2, ru.RDF_TYPE_TERM, ru.DATS_DIMENSION_TERM)):
                name = None
                value = None
                for (s4, p4, o4) in g.triples((o2, ru.NAME_TERM, None)):
                    for (s5, p5, o5) in g.triples((o4, ru.SDO_VALUE_TERM, None)):
                        name = str(o5)
                for (s4, p4, o4) in g.triples((o2, ru.DATA_ITEM_TERM, None)):
                    value = str(o4)
                if name == "MD5":
                    md5_checksum = value

        # link Dataset to DatasetDistributions
        for (s, p, o) in g.triples((d, ru.SDO_DISTRIBUTIONS_TERM, None)):
            # file size
            for (s2, p2, o2) in g.triples((o, ru.SDO_SIZE_TERM, None)):
                fsize = str(o2)
                if file_size is None:
                    file_size = fsize
                elif file_size != fsize:
                    logging.fatal("file size mismatch")
                    sys.exit(1)
            for (s2, p2, o2) in g.triples((o, ru.RDF_TYPE_TERM, ru.SDO_DATA_DOWNLOAD_TERM)):
                m = re.match(r'^(gs|s3):\/\/.*', str(o))
                if m is not None:
                    distribs.append({'URI': str(o), 'size': file_size})
                    if m.group(1) == "gs":
                        gs_URI = str(o)
                    else:
                        s3_URI = str(o)

        # link Dataset to DataAcquisition (should be 1-1)
        data_acqs = []
        for (s, p, o) in g.triples((d, ru.PRODUCED_BY_TERM, None)):
            # TODO - replace SDO_ACTION_TERM with DATS_DATA_ACQUISITION_TERM when
            # https://github.com/datatagsuite/context/issues/4 resolved
            for (s2, p2, o2) in g.triples((o, ru.RDF_TYPE_TERM, ru.DATS_DATA_ACQUISITION_TERM)):
                data_acqs.append(o)
        if len(data_acqs) != 1:
            continue
        data_acq = data_acqs[0]

        # link DataAcquisition to RNA/DNA extract Material via input
        for (s, p, o) in g.triples((data_acq, ru.HAS_INPUT_TERM, None)):
            rna_dna_extract = o
            # link to sample Material via derivesFrom
            for (s2, p2, o2) in g.triples((rna_dna_extract, ru.DERIVES_FROM_TERM, None)):
                # get sample body site / anatomy term
                anatomical_parts = []
                for (s3, p3, o3) in g.triples((o2, ru.DERIVES_FROM_TERM, None)):
                    for (s4, p4, o4) in g.triples((o3, ru.RDF_TYPE_TERM, ru.SDO_ANATOMICAL_STRUCTURE_TERM)):
                        term_name = None
                        # anatomy term name
                        for (s5, p5, o5) in g.triples((o3, ru.NAME_TERM, None)):
                            term_name = str(o5)
                        # anatomy term id
                        term_ids = {}
                        for (s5, p5, o5) in g.triples((o3, ru.CENTRAL_ID_TERM, None)):
                            # need to filter duplicates due to duplicate AnatomicalPart definitions
                            for (s6, p6, o6) in g.triples((o5, ru.CENTRAL_ID_TERM, None)):
                                term_ids[str(o6)] = True
                        tids = [t for t in term_ids.keys()]
                        if len(tids) != 1:
                            logging.fatal("found " + str(len(tids)) + " term ids for AnatomicalPart " + term_name)
                            sys.exit(1)
                        anatomical_parts.append({'name': term_name, 'id': tids[0]})

                # link to subject Material via derivesFrom
                for (s3, p3, o3) in g.triples((o2, ru.DERIVES_FROM_TERM, None)):
                    # track link back from subject to data file(s)
                    if o3 not in subject_to_files:
                        subject_to_files[o3] = []

                    n_anat_parts = len(anatomical_parts)
                    if n_anat_parts != 1:
                        logging.fatal("found " + str(n_anat_parts) + " AnatomicalParts for subject " + str(o3))
                        sys.exit(1)

                    datatype = None
                    # HACK - this is specific to GTEx
                    if re.search(r'GTEx', project_name):
                        if re.search(r'\/wgs\/', s3_URI):
                            datatype = 'WGS'
                        elif re.search(r'\/rnaseq\/', s3_URI):
                            datatype = 'RNA-Seq'
                        else:
                            logging.fatal("couldn't parse seq type from URI " + s3_URI)
                            sys.exit(1)
                    else:
                        logging.fatal("couldn't determine seq datatype from URI " + s3_URI)
                        sys.exit(1)

                    file_info = {
                        'anatomical_part_name': anatomical_parts[0]['name'],
                        'anatomical_part_id': anatomical_parts[0]['id'],
                        'S3_URI': s3_URI,
                        'GS_URI': gs_URI,
                        'datatype': datatype,
                        'distribs': distribs,
                        'file_size': file_size,
                        'md5_checksum': md5_checksum
                    }
                    # TODO - extract sample characteristics
                    subject_to_files[o3].append(file_info)

    # generate tabular output
    col_headings = ["Project", "dbGaP_Study", "Study_Group", "Subject_ID"]
    col_headings.extend(sorted_char_names)
    col_headings.extend(["Anatomical_Part", "Anatomical_Part_ID"])
    col_headings.extend(["Datatype"])
    col_headings.extend(["File_Size", "MD5_Checksum"])
    col_headings.extend(["AWS_URI", "GCP_URI"])
    print("\t".join(col_headings))

    # sort datasets
    datasets.sort(key=lambda x: dataset_ids[x])

    for d in datasets:
        dataset_id = dataset_ids[d]
        study = ds_to_study[d]
        groups = study_to_groups[study]
        # sort study groups
        groups.sort(key=lambda x: study_group_to_name[x])
        for grp in groups:
            group_name = study_group_to_name[grp]
            subjects = study_group_to_subjects[grp]
            # sort subjects
            subjects.sort(key=lambda x: subject_to_name[x])
            for s in subjects:
                subject_name = subject_to_name[s]
                col_vals = [project_name, dataset_id, group_name, subject_name]

                # subject characteristics
                subject_chars = subject_to_chars[s]
                for k in sorted_char_names:
                    if k in subject_chars:
                        col_vals.append(subject_chars[k]['value'])
                    else:
                        col_vals.append("")

                # ensure that subjects with no data files still get printed
                if s not in subject_to_files:
                    n_extra_cols = len(col_headings) - len(col_vals)
                    for c in range(0, n_extra_cols):
                        col_vals.append("")
                    print("\t".join(col_vals))
                    continue

                # data files linked to the subject
                data_files = subject_to_files[s]
                data_files.sort(key=lambda df: (df['anatomical_part_name'], df['datatype'], df['S3_URI']))
                for df in data_files:
                    col_vals_copy = col_vals[:]
                    # add data file-specific columns
                    # body site
                    col_vals_copy.append(df['anatomical_part_name'])
                    col_vals_copy.append(df['anatomical_part_id'])
                    # data/file type
                    col_vals_copy.append(df['datatype'])
                    # TODO - add sample characteristics
                    # add file size and MD5 checksum
                    col_vals_copy.append(df['file_size'])
                    col_vals_copy.append(df['md5_checksum'])
                    # TODO - add .crai index files?
                    # URIs
                    col_vals_copy.append(df['S3_URI'])
                    col_vals_copy.append(df['GS_URI'])
                    print("\t".join(col_vals_copy))

# API-call version of the 2nd-level Dataset listing. The SPARQL query being
# translated is repeated in the comments, with an arrow marking the clause(s)
# handled by each step.
def main():
    # input
    parser = argparse.ArgumentParser(
        description='List 2nd level Datasets in TOPMed or GTEx crosscut model instance using RDFLib API calls.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON-LD file.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # SELECT ?ident ?title
    # WHERE {
    # ---->  ?top_dataset a obo:IAO_0000100.
    #        ?top_dataset obo:OBI_0001622 ?top_title.
    #        FILTER ((str(?top_title) = "Genotype-Tissue Expression Project (GTEx)") || (str(?top_title) = "Trans-Omics for Precision Medicine (TOPMed)")).
    #        ?top_dataset obo:BFO_0000051 ?dataset.
    #        ?dataset a obo:IAO_0000100.
    #        ?dataset obo:OBI_0001622 ?title.
    #        ?dataset obo:IAO_0000577 ?identifier.
    #        ?identifier sdo:identifier ?ident.
    # }

    # find ALL Datasets
    all_datasets = [
        s for (s, p, o) in g.triples((None, ru.RDF_TYPE_TERM, ru.DATS_DATASET_TERM))
    ]

    # SELECT ?ident ?title
    # WHERE {
    #        ?top_dataset a obo:IAO_0000100.
    # ---->  ?top_dataset obo:OBI_0001622 ?top_title.
    # ---->  FILTER ((str(?top_title) = "Genotype-Tissue Expression Project (GTEx)") || (str(?top_title) = "Trans-Omics for Precision Medicine (TOPMed)")).
    #        ?top_dataset obo:BFO_0000051 ?dataset.
    #        ?dataset a obo:IAO_0000100.
    #        ?dataset obo:OBI_0001622 ?title.
    #        ?dataset obo:IAO_0000577 ?identifier.
    #        ?identifier sdo:identifier ?ident.
    # }

    # filter Datasets, get those with a title matching one of these two strings:
    titles = [
        'Genotype-Tissue Expression Project (GTEx)',
        'Trans-Omics for Precision Medicine (TOPMed)'
    ]
    title_terms = [rdflib.term.Literal(t, datatype=ru.DESCR_TERM) for t in titles]
    datasets = []
    for d in all_datasets:
        for tt in title_terms:
            for (s, p, o) in g.triples((d, ru.TITLE_TERM, tt)):
                datasets.append(d)

    # SELECT ?ident ?title
    # WHERE {
    #        ?top_dataset a obo:IAO_0000100.
    #        ?top_dataset obo:OBI_0001622 ?top_title.
    #        FILTER ((str(?top_title) = "Genotype-Tissue Expression Project (GTEx)") || (str(?top_title) = "Trans-Omics for Precision Medicine (TOPMed)")).
    # ---->  ?top_dataset obo:BFO_0000051 ?dataset.
    #        ?dataset a obo:IAO_0000100.
    #        ?dataset obo:OBI_0001622 ?title.
    #        ?dataset obo:IAO_0000577 ?identifier.
    #        ?identifier sdo:identifier ?ident.
    # }

    # find all entities linked by "has part" to the top-level GTEx and TOPMed Datasets
    l2_entities = []
    for d in datasets:
        for (s, p, o) in g.triples((d, ru.HAS_PART_TERM, None)):
            l2_entities.append(o)

    # SELECT ?ident ?title
    # WHERE {
    #        ?top_dataset a obo:IAO_0000100.
    #        ?top_dataset obo:OBI_0001622 ?top_title.
    #        FILTER ((str(?top_title) = "Genotype-Tissue Expression Project (GTEx)") || (str(?top_title) = "Trans-Omics for Precision Medicine (TOPMed)")).
    #        ?top_dataset obo:BFO_0000051 ?dataset.
    # ---->  ?dataset a obo:IAO_0000100.
    #        ?dataset obo:OBI_0001622 ?title.
    #        ?dataset obo:IAO_0000577 ?identifier.
    #        ?identifier sdo:identifier ?ident.
    # }

    # filter l2_entities, keeping only those that are Datasets
    l2_datasets = []
    for e in l2_entities:
        l = [t for t in g.triples((e, ru.RDF_TYPE_TERM, ru.DATS_DATASET_TERM))]
        if len(l) > 0:
            l2_datasets.append(e)

    # SELECT ?ident ?title
    # WHERE {
    #        ?top_dataset a obo:IAO_0000100.
    #        ?top_dataset obo:OBI_0001622 ?top_title.
    #        FILTER ((str(?top_title) = "Genotype-Tissue Expression Project (GTEx)") || (str(?top_title) = "Trans-Omics for Precision Medicine (TOPMed)")).
    #        ?top_dataset obo:BFO_0000051 ?dataset.
    #        ?dataset a obo:IAO_0000100.
    # ---->  ?dataset obo:OBI_0001622 ?title.
    # ---->  ?dataset obo:IAO_0000577 ?identifier.
    #        ?identifier sdo:identifier ?ident.
    # }

    # Retrieve title and id for each 2nd-level Dataset. Using parallel arrays,
    # which assumes that each Dataset will have exactly one of each.
    titles = []
    ids = []
    for d in l2_datasets:
        for (s, p, o) in g.triples((d, ru.TITLE_TERM, None)):
            titles.append(o)
        for (s, p, o) in g.triples((d, ru.CENTRAL_ID_TERM, None)):
            ids.append(o)

    # SELECT ?ident ?title
    # WHERE {
    #        ?top_dataset a obo:IAO_0000100.
    #        ?top_dataset obo:OBI_0001622 ?top_title.
    #        FILTER ((str(?top_title) = "Genotype-Tissue Expression Project (GTEx)") || (str(?top_title) = "Trans-Omics for Precision Medicine (TOPMed)")).
    #        ?top_dataset obo:BFO_0000051 ?dataset.
    #        ?dataset a obo:IAO_0000100.
    #        ?dataset obo:OBI_0001622 ?title.
    #        ?dataset obo:IAO_0000577 ?identifier.
    # ---->  ?identifier sdo:identifier ?ident.
    # }

    # One more step needed to get from DATS Identifier to the actual id.
    idents = []
    for i in ids:
        for (s, p, o) in g.triples((i, ru.SDO_IDENT_TERM, None)):
            idents.append(o)

    print()
    print("2nd-level DATS Datasets:")
    print()
    print("Dataset\tTitle")
    nt = len(titles)
    for i in range(0, nt):
        print("%s\t%s" % (idents[i], titles[i]))

# API-call version of the dataset variable listing, again with the SPARQL
# query repeated in the comments and an arrow marking the clause(s) handled
# by each step.
def main():
    # input
    parser = argparse.ArgumentParser(
        description='List variables available in the DATS Dataset that corresponds to a given dbGaP study.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON file.')
    parser.add_argument(
        '--dataset_id',
        required=False,
        help='DATS identifier of the Dataset whose variables should be retrieved.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # obo:IAO_0000100 - "data set"
    # obo:IAO_0000577 - "centrally registered identifier symbol"
    # obo:BFO_0000051 - "has part"
    # obo:STATO_0000258 - "variable"
    # obo:IAO_0000300 - "textual entity"
    # obo:IAO_0000590 - "a textual entity that denotes a particular in reality"

    # SELECT DISTINCT ?dbgap_study_acc ?dbgap_var_acc ?pname ?descr
    # WHERE {
    # ---->  ?dataset a obo:IAO_0000100.
    #        ?dataset obo:IAO_0000577 ?dataset_id.
    #        ?dataset_id sdo:identifier ?dbgap_study_acc.
    #        ?dataset obo:BFO_0000051 ?dim1.
    #        ?dim1 a obo:STATO_0000258.
    #        ?dim1 obo:IAO_0000300 ?descr.
    #        ?dim1 obo:IAO_0000577 ?dim1_id.
    #        ?dim1_id sdo:identifier ?dbgap_var_acc.
    #        ?dim1 obo:IAO_0000590 ?propname.
    #        ?propname sdo:value ?pname.
    # }
    # ORDER BY ?dbgap_study_acc ?dbgap_var_acc

    # find ALL Datasets
    all_datasets = [
        s for (s, p, o) in g.triples((None, ru.RDF_TYPE_TERM, ru.DATS_DATASET_TERM))
    ]

    # SELECT DISTINCT ?dbgap_study_acc ?dbgap_var_acc ?pname ?descr
    # WHERE {
    #        ?dataset a obo:IAO_0000100.
    # ---->  ?dataset obo:IAO_0000577 ?dataset_id.
    # ---->  ?dataset_id sdo:identifier ?dbgap_study_acc.
    #        ?dataset obo:BFO_0000051 ?dim1.
    #        ?dim1 a obo:STATO_0000258.
    #        ?dim1 obo:IAO_0000300 ?descr.
    #        ?dim1 obo:IAO_0000577 ?dim1_id.
    #        ?dim1_id sdo:identifier ?dbgap_var_acc.
    #        ?dim1 obo:IAO_0000590 ?propname.
    #        ?propname sdo:value ?pname.
    # }
    # ORDER BY ?dbgap_study_acc ?dbgap_var_acc

    # get DATS identifier for each one - DATS schema specifies the mapping should be 1-1
    dataset_ids = {}
    for d in all_datasets:
        for (s, p, o) in g.triples((d, ru.CENTRAL_ID_TERM, None)):
            for (s2, p2, o2) in g.triples((o, ru.SDO_IDENT_TERM, None)):
                dataset_ids[d] = o2

    # filter datasets by id if one was specified
    datasets = [
        d for d in all_datasets
        if (args.dataset_id is None) or
        (d in dataset_ids and rdflib.term.Literal(args.dataset_id) == dataset_ids[d])
    ]

    # SELECT DISTINCT ?dbgap_study_acc ?dbgap_var_acc ?pname ?descr
    # WHERE {
    #        ?dataset a obo:IAO_0000100.
    #        ?dataset obo:IAO_0000577 ?dataset_id.
    #        ?dataset_id sdo:identifier ?dbgap_study_acc.
    # ---->  ?dataset obo:BFO_0000051 ?dim1.
    # ---->  ?dim1 a obo:STATO_0000258.
    #        ?dim1 obo:IAO_0000300 ?descr.
    #        ?dim1 obo:IAO_0000577 ?dim1_id.
    #        ?dim1_id sdo:identifier ?dbgap_var_acc.
    #        ?dim1 obo:IAO_0000590 ?propname.
    #        ?propname sdo:value ?pname.
    # }
    # ORDER BY ?dbgap_study_acc ?dbgap_var_acc

    # get all dimensions of each Dataset
    dataset_dims = {}
    for d in datasets:
        dims = []
        for (s, p, o) in g.triples((d, ru.HAS_PART_TERM, None)):
            for (s2, p2, o2) in g.triples((o, ru.RDF_TYPE_TERM, ru.DATS_DIMENSION_TERM)):
                dims.append(o)
        dataset_dims[d] = dims

    # SELECT DISTINCT ?dbgap_study_acc ?dbgap_var_acc ?pname ?descr
    # WHERE {
    #        ?dataset a obo:IAO_0000100.
    #        ?dataset obo:IAO_0000577 ?dataset_id.
    #        ?dataset_id sdo:identifier ?dbgap_study_acc.
    #        ?dataset obo:BFO_0000051 ?dim1.
    #        ?dim1 a obo:STATO_0000258.
    # ---->  ?dim1 obo:IAO_0000300 ?descr.
    # ---->  ?dim1 obo:IAO_0000577 ?dim1_id.
    # ---->  ?dim1_id sdo:identifier ?dbgap_var_acc.
    # ---->  ?dim1 obo:IAO_0000590 ?propname.
    # ---->  ?propname sdo:value ?pname.
    # }
    # ORDER BY ?dbgap_study_acc ?dbgap_var_acc

    # get Dimension description
    dim_descrs = {}
    for d in datasets:
        for dim in dataset_dims[d]:
            for (s, p, o) in g.triples((dim, ru.DESCR_TERM, None)):
                dim_descrs[dim] = o

    # get Dimension identifier
    dim_ids = {}
    for d in datasets:
        for dim in dataset_dims[d]:
            for (s, p, o) in g.triples((dim, ru.CENTRAL_ID_TERM, None)):
                for (s2, p2, o2) in g.triples((o, ru.SDO_IDENT_TERM, None)):
                    dim_ids[dim] = o2

    # get Dimension name
    dim_names = {}
    for d in datasets:
        for dim in dataset_dims[d]:
            for (s, p, o) in g.triples((dim, ru.NAME_TERM, None)):
                for (s2, p2, o2) in g.triples((o, ru.SDO_VALUE_TERM, None)):
                    dim_names[dim] = o2

    # SELECT DISTINCT ?dbgap_study_acc ?dbgap_var_acc ?pname ?descr
    # WHERE {
    #        ?dataset a obo:IAO_0000100.
    #        ?dataset obo:IAO_0000577 ?dataset_id.
    #        ?dataset_id sdo:identifier ?dbgap_study_acc.
    #        ?dataset obo:BFO_0000051 ?dim1.
    #        ?dim1 a obo:STATO_0000258.
    #        ?dim1 obo:IAO_0000300 ?descr.
    #        ?dim1 obo:IAO_0000577 ?dim1_id.
    #        ?dim1_id sdo:identifier ?dbgap_var_acc.
    #        ?dim1 obo:IAO_0000590 ?propname.
    #        ?propname sdo:value ?pname.
    # }
    # ----> ORDER BY ?dbgap_study_acc ?dbgap_var_acc

    datasets_with_ids = [{
        "d": d,
        "i": dataset_ids[d]
    } for d in datasets if d in dataset_ids]
    datasets_with_ids.sort(key=lambda x: x["i"])

    print()
    print("Dataset variables:")
    print()
    print("dbGaP Study\tdbGaP variable\tName\tDescription")
    for ds in datasets_with_ids:
        dims = dataset_dims[ds['d']]
        # filter out those with no id
        # (may still fail if the description or name are missing)
        dims_with_atts = [{
            "d": d,
            "descr": dim_descrs[d],
            "id": dim_ids[d],
            "name": dim_names[d]
        } for d in dims if d in dim_ids]
        dims_with_atts.sort(key=lambda x: x["id"])
        for d in dims_with_atts:
            print("%s\t%s\t%s\t%s" % (ds["i"], d["id"], d["name"], d["descr"]))
    print()

# API-call version of the study group member listing, again with the SPARQL
# query repeated in the comments and an arrow marking the clause(s) handled
# by each step.
def main():
    # input
    parser = argparse.ArgumentParser(
        description='List subjects in a given DATS Dataset and StudyGroup.')
    parser.add_argument('--dats_file',
                        help='Path to TOPMed or GTEx DATS JSON file.')
    parser.add_argument(
        '--dataset_id',
        required=False,
        help='DATS identifier of the Dataset linked to the StudyGroup of interest.')
    parser.add_argument('--study_group_name',
                        required=False,
                        help='Name of the StudyGroup of interest.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)

    # parse JSON-LD
    g = ru.read_json_ld_graph(args.dats_file)

    # obo:IAO_0000100 - "data set"
    # obo:IAO_0000577 - "centrally registered identifier symbol"
    # obo:RO_0003001 - "produced by"
    # obo:OBI_0000066 - "investigation"
    # obo:BFO_0000051 - "has part"
    # obo:STATO_0000193 - "study group population"
    # obo:RO_0002351 - "has member"
    # obo:IAO_0000590 - "a textual entity that denotes a particular in reality"
    # obo:BFO_0000040 - "material entity"

    # SELECT ?dbgap_study_acc ?study_group_name ?subject_name
    # WHERE {
    # ---->  ?dataset a obo:IAO_0000100.
    # ---->  ?dataset obo:IAO_0000577 ?dataset_id.
    # ---->  ?dataset_id sdo:identifier ?dbgap_study_acc.
    #        ?dataset obo:RO_0003001 ?study.
    #        ?study a obo:OBI_0000066.
    #        ?study obo:BFO_0000051 ?study_group.
    #        ?study_group a obo:STATO_0000193.
    #        ?study_group obo:IAO_0000590 ?study_group_name.
    #        ?study_group obo:RO_0002351 ?subject.
    #        ?subject a obo:BFO_0000040.
    #        ?subject obo:IAO_0000590 ?subject_name.
    # }
    # ORDER BY ?dbgap_study_acc ?study_group_name ?subject_name

    # find ALL Datasets, retain those with a DATS identifier
    all_datasets = [
        s for (s, p, o) in g.triples((None, ru.RDF_TYPE_TERM, ru.DATS_DATASET_TERM))
    ]
    dataset_ids = {}
    datasets = []
    for d in all_datasets:
        for (s, p, o) in g.triples((d, ru.CENTRAL_ID_TERM, None)):
            for (s2, p2, o2) in g.triples((o, ru.SDO_IDENT_TERM, None)):
                dataset_ids[d] = o2
        if d in dataset_ids:
            datasets.append(d)

    # filter datasets by id if one was specified
    datasets = [
        d for d in datasets
        if (args.dataset_id is None) or (rdflib.term.Literal(args.dataset_id) == dataset_ids[d])
    ]

    # SELECT ?dbgap_study_acc ?study_group_name ?subject_name
    # WHERE {
    #        ?dataset a obo:IAO_0000100.
    #        ?dataset obo:IAO_0000577 ?dataset_id.
    #        ?dataset_id sdo:identifier ?dbgap_study_acc.
    # ---->  ?dataset obo:RO_0003001 ?study.
    # ---->  ?study a obo:OBI_0000066.
    #        ?study obo:BFO_0000051 ?study_group.
    #        ?study_group a obo:STATO_0000193.
    #        ?study_group obo:IAO_0000590 ?study_group_name.
    #        ?study_group obo:RO_0002351 ?subject.
    #        ?subject a obo:BFO_0000040.
    #        ?subject obo:IAO_0000590 ?subject_name.
    # }
    # ORDER BY ?dbgap_study_acc ?study_group_name ?subject_name

    # link each Dataset to Study (should be 1-1)
    ds_to_study = {}
    for d in datasets:
        for (s, p, o) in g.triples((d, ru.PRODUCED_BY_TERM, None)):
            for (s2, p2, o2) in g.triples((o, ru.RDF_TYPE_TERM, ru.DATS_STUDY_TERM)):
                ds_to_study[d] = o

    # filter Datasets not linked to a study
    datasets = [d for d in datasets if d in ds_to_study]

    # SELECT ?dbgap_study_acc ?study_group_name ?subject_name
    # WHERE {
    #        ?dataset a obo:IAO_0000100.
    #        ?dataset obo:IAO_0000577 ?dataset_id.
    #        ?dataset_id sdo:identifier ?dbgap_study_acc.
    #        ?dataset obo:RO_0003001 ?study.
    #        ?study a obo:OBI_0000066.
    # ---->  ?study obo:BFO_0000051 ?study_group.
    # ---->  ?study_group a obo:STATO_0000193.
    # ---->  ?study_group obo:IAO_0000590 ?study_group_name.
    #        ?study_group obo:RO_0002351 ?subject.
    #        ?subject a obo:BFO_0000040.
    #        ?subject obo:IAO_0000590 ?subject_name.
    # }
    # ORDER BY ?dbgap_study_acc ?study_group_name ?subject_name

    # link each Study to StudyGroup (1-many) and get StudyGroup name
    study_to_groups = {}
    study_group_to_name = {}
    for study in ds_to_study.values():
        groups = []
        for (s, p, o) in g.triples((study, ru.HAS_PART_TERM, None)):
            for (s2, p2, o2) in g.triples((o, ru.RDF_TYPE_TERM, ru.DATS_STUDY_GROUP_TERM)):
                # get name
                n_names = 0
                for (s3, p3, o3) in g.triples((o, ru.NAME_TERM, None)):
                    study_group_to_name[o] = o3
                    n_names += 1
                if n_names == 1:
                    groups.append(o)
        # filter study groups by name if one was specified
        groups = [
            grp for grp in groups
            if (args.study_group_name is None) or
            (rdflib.term.Literal(args.study_group_name) == study_group_to_name[grp])
        ]
        study_to_groups[study] = groups

    # SELECT ?dbgap_study_acc ?study_group_name ?subject_name
    # WHERE {
    #        ?dataset a obo:IAO_0000100.
    #        ?dataset obo:IAO_0000577 ?dataset_id.
    #        ?dataset_id sdo:identifier ?dbgap_study_acc.
    #        ?dataset obo:RO_0003001 ?study.
    #        ?study a obo:OBI_0000066.
    #        ?study obo:BFO_0000051 ?study_group.
    #        ?study_group a obo:STATO_0000193.
    #        ?study_group obo:IAO_0000590 ?study_group_name.
    # ---->  ?study_group obo:RO_0002351 ?subject.
    # ---->  ?subject a obo:BFO_0000040.
    # ---->  ?subject obo:IAO_0000590 ?subject_name.
    # }
    # ORDER BY ?dbgap_study_acc ?study_group_name ?subject_name

    # find subjects in each study group and retrieve their names
    study_group_to_subjects = {}
    subject_to_name = {}
    for sg in study_group_to_name.keys():
        subjects = []
        for (s, p, o) in g.triples((sg, ru.HAS_MEMBER_TERM, None)):
            for (s2, p2, o2) in g.triples((o, ru.RDF_TYPE_TERM, ru.DATS_MATERIAL_TERM)):
                for (s3, p3, o3) in g.triples((o, ru.NAME_TERM, None)):
                    subject_to_name[o] = o3
                subjects.append(o)
        study_group_to_subjects[sg] = subjects

    # SELECT ?dbgap_study_acc ?study_group_name ?subject_name
    # WHERE {
    #        ?dataset a obo:IAO_0000100.
    #        ?dataset obo:IAO_0000577 ?dataset_id.
    #        ?dataset_id sdo:identifier ?dbgap_study_acc.
    #        ?dataset obo:RO_0003001 ?study.
    #        ?study a obo:OBI_0000066.
    #        ?study obo:BFO_0000051 ?study_group.
    #        ?study_group a obo:STATO_0000193.
    #        ?study_group obo:IAO_0000590 ?study_group_name.
    #        ?study_group obo:RO_0002351 ?subject.
    #        ?subject a obo:BFO_0000040.
    #        ?subject obo:IAO_0000590 ?subject_name.
    # }
    # ----> ORDER BY ?dbgap_study_acc ?study_group_name ?subject_name

    print()
    print("StudyGroup members:")
    print()
    print("dbGaP Study\tStudy Group\tSubject ID")

    # sort datasets
    datasets.sort(key=lambda x: dataset_ids[x])
    for d in datasets:
        dataset_id = dataset_ids[d]
        study = ds_to_study[d]
        groups = study_to_groups[study]
        # sort study groups
        groups.sort(key=lambda x: study_group_to_name[x])
        for grp in groups:
            group_name = study_group_to_name[grp]
            subjects = study_group_to_subjects[grp]
            # sort subjects
            subjects.sort(key=lambda x: subject_to_name[x])
            for s in subjects:
                subject_name = subject_to_name[s]
                print("%s\t%s\t%s" % (dataset_id, group_name, subject_name))
    print()
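
# Each of these scripts presumably ends with the standard entry-point guard
# so that main() runs when the file is executed from the command line (an
# assumption; the guards are not shown in the excerpts above):
if __name__ == '__main__':
    main()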