def make_var_dimension(name, var_value):
    """Create a DATS Dimension that records one dbGaP variable value.

    Args:
        name: variable name; wrapped in an Annotation and used as the
            Dimension's "name".
        var_value: mapping with a "value" entry and, optionally, a "var" entry
            whose "dim" is the Dimension already defined for the variable.

    Returns:
        The new Dimension. When ``var_value`` has a "var" entry, the new
        Dimension's "identifier" is set to an id reference to the identifier
        of that existing Dimension.
    """
    raw_value = var_value["value"]
    label = DatsObj("Annotation", [("value", name)])
    dimension = DatsObj("Dimension", [("name", label), ("values", [raw_value])])

    # no previously defined Dimension to link to
    if "var" not in var_value:
        return dimension

    # point at the identifier of the corresponding Dataset Dimension
    existing_dimension = var_value["var"]["dim"]
    id_ref = existing_dimension.get("identifier").getIdRef()
    dimension.setProperty("identifier", id_ref)
    return dimension
def add_study_vars(study, study_md):
    """Append a DATS Dimension for every dbGaP variable of a study and index them.

    For each of the variable types 'Subject', 'Subject_Phenotypes', 'Sample'
    and 'Sample_Attributes' present in ``study_md``, every variable listed
    under ``['data_dict']['data']['vars']`` becomes a Dimension that is
    appended to the study's "dimensions" property.

    Args:
        study: DATS object whose "dimensions" property receives the Dimensions.
        study_md: mapping from variable type to that type's parsed metadata.

    Returns:
        dict with two lookups whose values are ``{"dim": ..., "var": ...}``:
        "id_to_var" (keyed by dbGaP variable id) and "type_name_cg_to_var"
        (keyed by variable type, then by variable name plus an optional
        ".cN" suffix taken from the variable id).

    Exits the process (status 1) when the same dbGaP variable id is seen twice.
    """
    # dbGaP variable id -> {"dim", "var"}
    by_id = {}
    # variable type -> (name + consent group suffix) -> {"dim", "var"}
    by_type = {}
    # captures a trailing ".c<digits>" suffix on a variable id
    suffix_re = re.compile(r'^(.*)(\.(c\d+))$')

    for var_type in ('Subject', 'Subject_Phenotypes', 'Sample', 'Sample_Attributes'):
        if var_type not in study_md:
            continue

        variables = study_md[var_type]['data_dict']['data']['vars']
        by_name = {}
        by_type[var_type] = by_name

        for var in variables:
            var_name = var['name']
            var_id = var['id']

            identifier = DatsObj("Identifier", [
                ("identifier", var_id),
                ("identifierSource", "dbGaP")])
            label = DatsObj("Annotation", [("value", var_name)])
            # TODO: include stats
            dimension = DatsObj("Dimension", [
                ("identifier", identifier),
                ("name", label),
                ("description", var['description'])])
            study.getProperty("dimensions").append(dimension)

            # a dbGaP id may only be defined once across all variable types
            if var_id in by_id:
                logging.fatal("duplicate definition found for dbGaP variable " + var_name + " with accession=" + var_id)
                sys.exit(1)

            entry = {"dim": dimension, "var": var}
            by_id[var_id] = entry

            # index by name and consent group
            found = suffix_re.match(var_id)
            suffix = "" if found is None else "." + found.group(3)
            key = "".join([var_name, suffix])
            # a clash within one file is reported but not fatal: the later
            # definition replaces the earlier one
            if key in by_name:
                logging.fatal("duplicate definition found for dbGaP variable " + key + " in " + var_type + " file")
            by_name[key] = entry

    return {"id_to_var": by_id, "type_name_cg_to_var": by_type}
def get_subject_dats_material(cache, p_subject, gh_subject, var_lookup):
    """Build (or fetch from the cache) the DATS Material for one GTEx subject.

    Args:
        cache: object cache; passed to the ``util`` helpers and asked for the
            final Material via ``get_obj_or_ref``.
        p_subject: subject metadata; the 'SUBJID', 'SEX', 'AGE' and 'DTHHRDY'
            entries are read through their 'mapped_value'.
        gh_subject: optional entry from the GTEx id dump; when not None its
            'Destination URL' raw value replaces the subject id as identifier.
        var_lookup: maps a variable name to a dict whose 'dim' is the
            Dimension defined for that variable.

    Returns:
        Whatever ``cache.get_obj_or_ref`` yields for key "Material:<subject id>".
    """
    subj_id = p_subject['SUBJID']['mapped_value']

    def dimension_id_ref(var_name):
        # id reference to the Identifier of the Dimension defined for var_name
        return var_lookup[var_name]['dim'].get("identifier").getIdRef()

    def subject_dimension(label, description, var_name):
        # one characteristic of the subject, linked to its variable definition
        return DatsObj("Dimension", [
            ("name", util.get_value_annotation(label, cache)),
            ("description", description),
            ("identifier", dimension_id_ref(var_name)),
            ("values", [p_subject[var_name]['mapped_value']])])

    subject_characteristics = [
        subject_dimension("Gender", "Gender of the subject", "SEX"),
        subject_dimension("Age range", "Age range of the subject", "AGE"),
        subject_dimension("Hardy scale", "Hardy scale death classification for the subject", "DTHHRDY"),
    ]

    # prefer the URI from the GTEx id dump over the bare subject id
    if gh_subject is None:
        identifier = subj_id
    else:
        identifier = gh_subject['Destination URL']['raw_value']

    # human experimental subject/patient
    subject_material = DatsObj("Material", [
        ("name", subj_id),
        ("identifier", DatsObj("Identifier", [("identifier", identifier)])),
        ("description", "GTEx subject " + subj_id),
        ("characteristics", subject_characteristics),
        ("taxonomy", [util.get_taxon_human(cache)]),
        ("roles", util.get_donor_roles(cache))])

    # register in the cache (or obtain what the cache hands back for this key)
    subj_key = ":".join(["Material", subj_id])
    return cache.get_obj_or_ref(subj_key, lambda: subject_material)
def make_anat_part(anat_id, anatomy_name):
    """Create a DATS AnatomicalPart identified by an EFO or UBERON term.

    Args:
        anat_id: term id; ids matching ``EFO_<digits>`` at the start are
            treated as EFO terms, anything else as an UBERON id.
        anatomy_name: name given to the AnatomicalPart.

    Returns:
        AnatomicalPart with "name", "identifier" and "alternateIdentifiers"
        (a single URL for the term).
    """
    if re.match(r'^EFO_\d+', anat_id):
        # EFO ids are used verbatim
        source = "EFO"
        term_id = anat_id
        term_url = "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + str(anat_id)
    else:
        # everything else is assumed to be an UBERON id
        source = "UBERON"
        term_id = "UBERON:" + str(anat_id)
        term_url = "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)

    primary_id = OrderedDict([("identifier", term_id), ("identifierSource", source)])
    alternate_ids = [OrderedDict([("identifier", term_url), ("identifierSource", source)])]

    return DatsObj("AnatomicalPart", [
        ("name", anatomy_name),
        ("identifier", primary_id),
        ("alternateIdentifiers", alternate_ids)])
def get_dats_id_aux(id_type, source, id, rel_type):
    """Create a DATS identifier object whose value is a source-specific URL.

    Args:
        id_type: DATS type name of the object to create.
        source: key into ``ID_URL_PREFIXES``; also stored as "identifierSource".
        id: identifier within ``source``; appended to the source's URL prefix.
        rel_type: optional relation type; stored as "relationType" unless None.

    Returns:
        DatsObj of type ``id_type``.
    """
    atts = [
        ("identifier", ID_URL_PREFIXES[source] + id),
        ("identifierSource", source),
    ]
    if rel_type is not None:
        atts.append(("relationType", rel_type))
    return DatsObj(id_type, atts)
def get_ref_genomes(acc_d):
    """Create DATS Datasets for the AGR reference genomes selected by ``acc_d``.

    Keys of ``AGR_ref_genomes`` of the form ``<prefix>_<a>.<b>.<c>`` are
    considered; only those that are contained in ``acc_d`` are kept.

    Args:
        acc_d: container tested with ``in`` against each reference genome id.

    Returns:
        list of Dataset objects, one per retained reference genome.
    """
    genome_id_re = re.compile(r'^(\w+_)(\d+\.\d+\.\d+)$')

    # collect ids that look like "<prefix>_<version>"
    candidates = []
    for key in AGR_ref_genomes:
        parsed = genome_id_re.match(key)
        if parsed is None:
            continue
        candidates.append({
            'id': parsed.group(1) + parsed.group(2),
            'versions': parsed.group(2),
        })

    # filter ref_genomes by acc_d
    selected = [genome for genome in candidates if genome['id'] in acc_d]
    logging.info("found " + str(len(selected)) + " Reference genomes in AGR Directory")

    # convert ref_genomes to DATS Datasets
    datasets = []
    for genome in selected:
        genome_id = genome['id']
        version_match = re.match(r'^\w+_(\d+\.\d+\.\d+)$', genome_id)
        if version_match is None:
            logging.fatal("unable to parse reference genome version from id " + genome_id)
            sys.exit(1)

        types = [OrderedDict([("information", DNA_SEQUENCING_TYPE)])]
        creators = [NIH_NHLBI, NIH_NHGRI]

        datasets.append(DatsObj("Dataset", [
            ("identifier", DatsObj("Identifier", [("identifier", genome_id)])),
            ("version", version_match.group(1)),
            ("title", AGR_ref_genomes[genome_id]),
            ("storedIn", AGR_BUCKET),
            ("types", types),
            ("creators", creators),
        ]))

    return datasets
def get_dataset_json():
    """Build the parent DATS Dataset for the GTEx program.

    The studies returned by ``get_dbgap_studies("phs000424")`` become the
    Dataset's "hasPart".

    Returns:
        The parent GTEx Dataset.
    """
    # individual datasets corresponding to studies within GTEx, from dbGaP
    studies = get_dbgap_studies("phs000424")

    program_id = DatsObj("Identifier", [
        ("identifier", "Genotype-Tissue Expression Project (GTEx)")])
    landing_page = DatsObj("Access", [("landingPage", DBGAP_GTEX_QUERY_URL)])
    distribution = DatsObj("DatasetDistribution", [("access", landing_page)])

    # TODO - add 'licenses', 'availability', 'dimensions', 'primaryPublications'?
    return DatsObj("Dataset", [
        ("identifier", program_id),
        ("title", "Genotype-Tissue Expression Project (GTEx)"),
        ("description", GTEX_DESCRIPTION),
        ("storedIn", DB_GAP),
        ("types", GTEX_V7_TYPES),
        ("creators", [NIH_NHGRI]),
        ("distributions", [distribution]),
        ("hasPart", studies),
    ])
def get_dataset_json():
    """Build the parent DATS Dataset for the TOPMed program.

    The studies returned by ``get_dbgap_studies("topmed")`` become the
    Dataset's "hasPart".

    Returns:
        The parent TOPMed Dataset.
    """
    # individual datasets corresponding to studies within TOPMed, from dbGaP
    studies = get_dbgap_studies("topmed")

    program_id = DatsObj("Identifier", [("identifier", "TOPMed")])
    landing_page = DatsObj("Access", [("landingPage", DBGAP_TOPMED_QUERY_URL)])
    distribution = DatsObj("DatasetDistribution", [("access", landing_page)])

    # TODO - add 'licenses', 'availability', 'dimensions', 'primaryPublications'?
    return DatsObj("Dataset", [
        ("identifier", program_id),
        ("title", "Trans-Omics for Precision Medicine (TOPMed)"),
        ("description", TOPMED_DESCRIPTION),
        ("storedIn", DB_GAP),
        ("types", TOPMED_TYPES),
        ("creators", [NIH_NHLBI]),
        ("distributions", [distribution]),
        ("hasPart", studies),
    ])
def get_dataset_json(acc_d):
    """Build the parent DATS Dataset for the AGR reference genomes.

    Args:
        acc_d: passed to ``get_ref_genomes`` to select the reference genomes
            that become the Dataset's "hasPart".

    Returns:
        The parent AGR Dataset.
    """
    # one sub-Dataset per selected reference genome build
    genome_datasets = get_ref_genomes(acc_d)

    agr_id = DatsObj("Identifier", [("identifier", "AGR")])
    landing_page = DatsObj("Access", [("landingPage", AGR_DOWNLOAD_URL)])
    distribution = DatsObj("DatasetDistribution", [("access", landing_page)])

    # TODO - add 'licenses', 'availability', 'dimensions', 'primaryPublications'?
    return DatsObj("Dataset", [
        ("identifier", agr_id),
        ("title", "Alliance of Genome Resources (AGR)"),
        ("description", AGR_DESCRIPTION),
        ("storedIn", AGR_BUCKET),
        ("types", AGR_TYPES),
        ("creators", [AGR]),
        ("distributions", [distribution]),
        ("hasPart", genome_datasets),
    ])
def make_anat_part(anat_id, anatomy_name):
    """Create a DATS AnatomicalPart, optionally identified by an UBERON id.

    Args:
        anat_id: UBERON id, or None when no id is available.
        anatomy_name: name given to the AnatomicalPart.

    Returns:
        AnatomicalPart with "name" set; "identifier" and
        "alternateIdentifiers" are set as well when ``anat_id`` is not None.
    """
    part = DatsObj("AnatomicalPart", [("name", anatomy_name)])

    # without an id only the name can be recorded
    if anat_id is None:
        return part

    uberon_id = OrderedDict([
        ("identifier", "UBERON:" + str(anat_id)),
        ("identifierSource", "UBERON")])
    uberon_url = OrderedDict([
        ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
        ("identifierSource", "UBERON")])

    part.set("identifier", uberon_id)
    part.set("alternateIdentifiers", [uberon_url])
    return part
def get_dataset_json():
    """Build the GTEx Dataset hierarchy for the v7 RNA-Seq data products.

    Calls ``set_dataset_guids()`` first, then creates one Dataset per entry
    of ``RNASEQ_DATASETS``, groups them under a parent RNA-Seq Dataset and
    wraps that in the top-level GTEx Dataset.

    Returns:
        The top-level GTEx Dataset.
    """
    set_dataset_guids()

    def landing_page_distributions(url):
        # single-element "distributions" list pointing at a landing page
        access = DatsObj("Access", [("landingPage", url)])
        return [DatsObj("DatasetDistribution", [("access", access)])]

    # one DATS Dataset for each RNA-Seq data product/file
    file_datasets = []
    for product in RNASEQ_DATASETS:
        descr = product["descr"]
        file = product["file"]
        analysis = product["analysis"]

        # only the analysis name is recorded; its description is not used
        data_analysis = DatsObj("DataAnalysis", [
            ("name", analysis["name"]),
            ("measures", analysis["measures"]),
            ("uses", analysis["uses"]),
        ])

        file_datasets.append(DatsObj(
            "Dataset",
            id=product["doi"],
            atts=[
                ("identifier", DatsObj("Identifier", [
                    ("identifier", "GTEx_Analysis_2016-01-15_v7_RNA-SEQ_" + file)])),
                ("version", "v7"),
                ("dates", [GTEX_V7_RELEASE_DATE]),
                ("title", "GTEx v7 RNA-Seq Analysis, " + descr),
                ("storedIn", DB_GAP),
                ("types", [GTEX_V7_RNASEQ_TYPE]),
                ("creators", [GTEX_CONSORTIUM]),
                ("producedBy", data_analysis),
                # TODO - where does the actual filename belong?
                ("distributions", landing_page_distributions(GTEX_DATASETS_URL)),
            ]))

    # parent RNA-Seq dataset
    rnaseq_dataset = DatsObj("Dataset", [
        ("identifier", DatsObj("Identifier", [
            ("identifier", "GTEx_Analysis_2016-01-15_v7_RNA-SEQ")])),
        ("version", "v7"),
        ("dates", [GTEX_V7_RELEASE_DATE]),
        ("title", "GTEx v7 RNA-Seq Analysis"),
        ("storedIn", DB_GAP),
        ("types", [GTEX_V7_RNASEQ_TYPE]),
        ("creators", [GTEX_CONSORTIUM]),
        ("distributions", landing_page_distributions(GTEX_DATASETS_URL)),
        ("hasPart", file_datasets),
    ])

    # parent GTEx dataset
    # TODO - add 'licenses', 'availability', 'dimensions', 'primaryPublications'?
    return DatsObj("Dataset", [
        ("identifier", DatsObj("Identifier", [("identifier", GTEX_DB_GAP_ID)])),
        ("version", "v7"),
        ("dates", [GTEX_V7_RELEASE_DATE]),
        ("title", "Genotype-Tissue Expression Project (GTEx)"),
        ("storedIn", DB_GAP),
        # TODO add types for parent GTEx project
        ("types", GTEX_V7_TYPES),
        ("creators", [GTEX_CONSORTIUM]),
        ("distributions", landing_page_distributions(GTEX_DB_GAP_URL)),
        ("hasPart", [rnaseq_dataset]),
    ])
# "Illumina" ILLUMINA_TYPE = OrderedDict([("value", "Illumina"), ("valueIRI", "http://purl.obolibrary.org/obo/OBI_0000759")]) # "DNA sequencing" DNA_SEQUENCING_TYPE = OrderedDict([ ("value", "DNA sequencing"), ("valueIRI", "http://purl.obolibrary.org/obo/OBI_0000626") ]) # "whole genome sequencing assay" WGS_ASSAY_TYPE = OrderedDict([("value", "whole genome sequencing assay"), ("valueIRI", "http://purl.obolibrary.org/obo/OBI_0002117")]) # documentation for earlier version at https://data.broadinstitute.org/cancer/cga/tools/rnaseqc/RNA-SeQC_Help_v1.1.2.pdf RNA_SEQ_QC = DatsObj("Software", [("name", "RNASeQC"), ("version", "v1.1.8")]) # gene read counts Dimension GENE_READ_COUNTS_DIM = DatsObj("Dimension", [("name", GENE_READ_COUNT_NAME), ("description", "gene read counts"), ("types", [COUNT_TYPE])]) TRANSCRIPT_READ_COUNTS_DIM = DatsObj( "Dimension", [("name", TRANSCRIPT_READ_COUNT_NAME), ("description", "transcript read counts"), ("types", [COUNT_TYPE])]) EXON_READ_COUNTS_DIM = DatsObj("Dimension", [("name", EXON_READ_COUNT_NAME), ("description", "exon read counts"),
def get_sample_dats_material(cache, dats_subject, p_sample, gh_sample, var_lookup):
    """Build the DATS Material for the DNA or RNA extract of one GTEx sample.

    The result is a chain of Materials: the extract derives from the
    biological (tissue) sample, which derives from the subject and from the
    AnatomicalPart the sample was taken from.

    Args:
        cache: object cache; passed to the ``util`` helpers and used via
            ``get_obj_or_ref`` for the AnatomicalPart and the subject.
        dats_subject: DATS Material of the subject the sample belongs to.
        p_sample: sample metadata; entries are read through 'mapped_value'.
        gh_sample: optional entry from the GitHub GTEx id dump; when not None
            its 'Destination URL' raw value is used as the sample identifier
            instead of the bare sample id.
        var_lookup: maps a variable name to a dict whose 'dim' is the
            Dimension defined for that variable.

    Returns:
        The extract Material, or None when the sequence type can be derived
        neither from SMNABTCHT nor from SMAFRZE.

    Exits the process (status 1) when the sample has no anatomy id, when
    SMAFRZE has an unknown value, or when the sequence type derived from
    SMNABTCHT contradicts the one expected from SMAFRZE.
    """
    samp_id = p_sample['SAMPID']['mapped_value']
    subj_id = p_sample['SUBJID']['mapped_value']

    # retrieve id reference for the Identifier of the DATS Dimension for the
    # "all subjects" consent group version of the variable
    def get_var_id(name):
        return var_lookup[name]['dim'].get("identifier").getIdRef()

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = p_sample['SMUBRID']['mapped_value']
    if anat_id is None:
        print("No Uberon/anatomy ID specified for sample " + samp_id)
        sys.exit(1)

    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = p_sample['SMTSD']['mapped_value']

    def make_anat_part(anat_id, anatomy_name):
        if re.match(r'^EFO_\d+', anat_id):
            # EFO id
            source = "EFO"
            term_id = anat_id
            term_url = "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + str(anat_id)
        else:
            # Uberon id
            source = "UBERON"
            term_id = "UBERON:" + str(anat_id)
            term_url = "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)
        return DatsObj("AnatomicalPart", [
            ("name", anatomy_name),
            ("identifier", OrderedDict([("identifier", term_id), ("identifierSource", source)])),
            ("alternateIdentifiers", [
                OrderedDict([("identifier", term_url), ("identifierSource", source)])])])

    # use cached value for AnatomicalPart if possible
    anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
    anatomical_part = cache.get_obj_or_ref(
        anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))

    # use URI from GitHub GTEx id dump if available.
    # Fix: the URI used to be stored in an unused variable (identifier_id), so
    # the sample identifier always fell back to the bare sample id.
    identifier = samp_id
    if gh_sample is not None:
        identifier = gh_sample['Destination URL']['raw_value']

    subj_key = ":".join(["Material", dats_subject.get("name")])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)

    # add sample characteristics from p_sample metadata
    # TODO - currently including only a small subset of the available values
    # for demonstration purposes
    included_var_re = re.compile(r'^(SMATSSCR|SMRIN|SMMAPRT|SMGNSDTC)$')
    sample_chars = []
    for key in p_sample:
        if not included_var_re.match(key):
            continue
        sample_chars.append(DatsObj("Dimension", [
            ("name", util.get_value_annotation(key, cache)),
            ("identifier", get_var_id(key)),
            ("values", [p_sample[key]['mapped_value']])]))

    # biological/tissue sample
    biological_sample_material = DatsObj("Material", [
        ("name", samp_id),
        ("identifier", {"identifier": identifier}),
        ("description", anatomy_name + " specimen collected from subject " + subj_id),
        ("characteristics", sample_chars),
        ("taxonomy", [util.get_taxon_human(cache)]),
        ("roles", [util.get_annotation("specimen", cache)]),
        ("derivesFrom", [dats_subj, anatomical_part])])

    # expected sequence type depending on the analysis freeze classification;
    # None means the classification implies no particular type
    expected_by_freeze = {
        "RNASEQ": "RNA",
        "WGS": "DNA",
        "WES": "DNA",
        # Illumina OMNI SNP Array
        "OMNI": "DNA",
        "EXCLUDE": None,
    }
    smafrze = p_sample['SMAFRZE']['mapped_value']
    if smafrze not in expected_by_freeze:
        logging.fatal("unknown SMAFRZE " + str(smafrze))
        sys.exit(1)
    expected_stype = expected_by_freeze[smafrze]

    # sample type - DNA or RNA - from the nucleic acid isolation batch type;
    # the first matching pattern wins
    smnabtcht = p_sample['SMNABTCHT']['mapped_value']
    batch_type_patterns = (
        (r'^DNA ([iI]solation|[eE]xtraction).*', 'DNA'),
        (r'^RNA ([iI]solation|[eE]xtraction).*', 'RNA'),
        (r'DNA or RNA Extraction from Paxgene-derived Lysate Plate Based', 'RNA'),
        (r'Transfer To Matrix \(Manual\)', 'DNA'),
    )
    stype = None
    for pattern, pattern_stype in batch_type_patterns:
        if re.match(pattern, smnabtcht):
            stype = pattern_stype
            break

    if stype is None:
        if expected_stype is None:
            print("couldn't determine sequence type for smafrze=" + smafrze + " smnabtcht=" + smnabtcht)
            return None
        # fall back on the type implied by the analysis freeze
        stype = expected_stype
    elif expected_stype is not None and stype != expected_stype:
        logging.fatal("seq type " + stype + " doesn't match expected stype " + expected_stype)
        sys.exit(1)

    # DNA or RNA extract
    dna_or_rna_material = DatsObj("Material", [
        ("name", stype + " from " + samp_id),
        ("description", "total " + stype + " extracted from " + anatomy_name + " specimen collected from subject " + subj_id),
        ("taxonomy", [util.get_taxon_human(cache)]),
        ("roles", [util.get_annotation(stype + " extract", cache)]),
        ("derivesFrom", [biological_sample_material])])

    return dna_or_rna_material
def get_dataset_json(gff3_path, human_homologs_path):
    """Build the DATS Dataset for the MGD mouse reference genome features.

    Every feature with source 'MGI' whose type passes the filter below (and
    whose bioType is not 'DNA segment') becomes a MolecularEntity in the
    Dataset's "isAbout" list, carrying its location, database
    cross-references and human homolog identifiers.

    Args:
        gff3_path: path handed to ``read_mgd_gff3``.
        human_homologs_path: path handed to
            ``ccmm.mgd.human_homologs.read_mgd_mouse_human_seq_file``.

    Returns:
        The parent MGD Dataset.
    """
    # read human homologs from MGI human/mouse sequence file
    mgi2hgene_h = ccmm.mgd.human_homologs.read_mgd_mouse_human_seq_file(
        human_homologs_path)
    # read mouse features/genes from GFF3
    data = read_mgd_gff3(gff3_path)
    # molecular entities that represent genes and other top-level features of interest
    entities = []
    # number of mouse genes/features with no human homolog
    n_homolog = 0
    n_no_homolog = 0
    n_genes = 0
    # relationType is a string/uri
    h_region = SO_TERMS['homologous_region'].get('valueIRI')

    # TODO - add taxonomy, either in 'isAbout' or in each individual gene (or both)
    for f in data['features']:
        if f['source'] == 'MGI':
            # NOTE(review): the alternation is not parenthesised, so the pattern
            # reads as `^gene` OR `pseudogene` OR `sequence_feature$`; combined
            # with re.match it accepts any type that starts with "gene" or
            # "pseudogene". Confirm whether `^(gene|pseudogene|sequence_feature)$`
            # was intended before tightening it.
            if re.match(r'^gene|pseudogene|sequence_feature$', f['type']) and (f['bioType'] != 'DNA segment'):
                id = f['ID']
                # collapse the doubled "MGI:MGI:" prefix
                id = re.sub(r'MGI:MGI:', 'MGI:', id)
                roles = None
                # is this a gene or a gene segment
                # (re.match anchors at the start, so only values beginning with
                # "gene" count here)
                is_gene = re.match(r'gene', f['type']) or re.match(
                    r'gene', f['bioType'])
                if is_gene:
                    n_genes += 1
                # specify/map GFF3 feature type to role
                if f['type'] == 'sequence_feature':
                    bioType = re.sub(r' ', '_', f['bioType'])
                    roles = [SO_TERMS[bioType]]
                else:
                    roles = [SO_TERMS[f['type']]]
                # array of dimension or material
                characteristics = [
                    # chromosome
                    DatsObj("Dimension", [("name", {
                        'value': 'chromosome'
                    }), ("types", [SO_TERMS['chromosome']]), ("values", [f['seqid']])]),
                    # start coordinate
                    # end coordinate
                ]
                # strand
                # direction_attribute only allows 'forward' or 'reverse' so if the strand is unknown the characteristic is omitted
                if f['strand'] in STRAND_CHADO2SO:
                    # map chado strand to name of corresponding SO term
                    SO_strand = STRAND_CHADO2SO[f['strand']]
                    DATS_strand = DatsObj(
                        "Dimension", [
                            ("name", {
                                'value': 'direction_attribute'
                            }),
                            ("types", [SO_TERMS['direction_attribute']]),
                            # in dimension_schema.json 'values' is an array with no other constraints, hence
                            # our use of a string rather than an explicit reference to the SO ID
                            ("values", [SO_strand])
                        ])
                    characteristics.append(DATS_strand)
                # dbxrefs
                alt_ids = []
                if 'Dbxref' in f:
                    dbxref = f['Dbxref']
                    # comma-separated list of "<source>:<id>" entries
                    dbxrefs = dbxref.rsplit(',')
                    for dbx in dbxrefs:
                        (src, delim, src_id) = dbx.partition(':')
                        alt_ids.append(get_dats_alternate_id(src, src_id))
                # unharmonized data/anything that doesn't map anywhere else
                extra_props = [
                    DatsObj("CategoryValuesPair", [("category", "reference sequence"), ("values", [f['seqid']])]),
                    DatsObj("CategoryValuesPair", [("category", "start coordinate"), ("values", [f['start']])]),
                    DatsObj("CategoryValuesPair", [("category", "end coordinate"), ("values", [f['end']])]),
                    DatsObj("CategoryValuesPair", [("category", "strand"), ("values", [f['strand']])])
                ]
                # human homologs
                related_ids = []
                has_homolog = False
                if id in mgi2hgene_h:
                    hgene = mgi2hgene_h[id]
                    homologene_id = hgene['id']
                    human_genes = []
                    # add HomoloGene reference
                    related_ids.append(
                        get_dats_related_id("NCBI_HomoloGene", homologene_id, h_region))
                    if 'human' in hgene:
                        human_genes = hgene['human']
                        has_homolog = True
                    for human_gene in human_genes:
                        entrez_gene_id = human_gene['EntrezGene ID']
                        related_ids.append(
                            get_dats_related_id("NCBI_Gene", entrez_gene_id, h_region))
                # homolog statistics are only kept for features counted in n_genes
                if is_gene:
                    if has_homolog:
                        n_homolog += 1
                    else:
                        n_no_homolog += 1
                me = DatsObj('MolecularEntity', [
                    ("name", f['Name']),
                    ("identifier", get_dats_id("MGI", id)),
                    ("alternateIdentifiers", alt_ids),
                    ("relatedIdentifiers", related_ids),
                    ("characteristics", characteristics),
                    ("roles", roles),
                    ("extraProperties", extra_props),
                ])
                entities.append(me)
            else:
                logging.debug("skipped feature of type " + f['type'] + " at line " + str(f['lnum']) + ": mgiName=" + f['mgiName'] + ", bioType=" + f['bioType'])

    # parent MGD reference genome dataset
    parent_mgd_dataset = DatsObj("Dataset", [
        ("identifier", DatsObj("Identifier", [("identifier", "GRCm38-C57BL/6J"), ("identifierSource", "MGI")])),
        ("title", "GRCm38-C57BL/6J reference genome, genes, and human orthologs"),
        ("description", "GRCm38-C57BL/6J reference genome, genes, and human orthologs based on MGI/MGD Unified Mouse Gene Catalog."
         ),
        ("storedIn", MGD),
        ("types", MGD_TYPES),
        ("creators", [MGI, DatsObj("Person", [("email", data['metadata']['Contact'])])]),
        ("distributions", [
            DatsObj("DatasetDistribution", [
                ("access", DatsObj("Access", [("landingPage", MGD_SEQ_DOWNLOAD_URL)]))
            ])
        ]),
        ("version", data['metadata']['Last updated']),
        ("isAbout", entities)
    ])
    # TODO - add 'licenses', 'availability', 'dimensions', 'primaryPublications'?
    # TODO - add sub-Datasets for the individual MGI files that contributed to the DATS encoding?
    # metadata['URL'] gives the FTP URI of the source data file
    logging.debug("human homolog found for " + str(n_homolog) + "/" + str(n_genes) + " mouse (pseudo)genes")
    logging.debug("no human homolog found for " + str(n_no_homolog) + "/" + str(n_genes) + " mouse (pseudo)genes")
    return parent_mgd_dataset
def get_single_sample_json(sample, dats_obj_cache):
    """Build the DATS Material for the RNA extracted from one GTEx sample.

    The result is a chain of Materials: the RNA extract derives from the
    biological (tissue) sample, which derives from the subject and from the
    AnatomicalPart the sample was taken from. The AnatomicalPart and the
    subject Material are stored in and reused from ``dats_obj_cache``.

    Args:
        sample: sample metadata; entries are read through 'mapped_value', and
            ``sample['subject']`` holds the subject's metadata.
        dats_obj_cache: dict-like cache keyed by "<type>:<name>"; also passed
            to the ``util`` helpers.

    Returns:
        The RNA extract Material.

    Exits the process (status 1) when the sample has no anatomy id.
    """
    samp_id = sample['SAMPID']['mapped_value']
    subj_id = sample['SUBJID']['mapped_value']
    subject = sample['subject']

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = sample['SMUBRID']['mapped_value']
    if anat_id is None:
        print("No Uberon/anatomy ID specified for sample " + samp_id)
        sys.exit(1)

    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = sample['SMTSD']['mapped_value']

    if re.match(r'^EFO_\d+', anat_id):
        # EFO id
        source = "EFO"
        term_id = anat_id
        term_url = "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + str(anat_id)
    else:
        # Uberon id
        source = "UBERON"
        term_id = "UBERON:" + str(anat_id)
        term_url = "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)
    anatomy_identifier = OrderedDict([("identifier", term_id), ("identifierSource", source)])
    anatomy_alt_ids = [OrderedDict([("identifier", term_url), ("identifierSource", source)])]

    # anatomical part: one shared object per anatomy name
    anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
    if anat_part_key not in dats_obj_cache:
        dats_obj_cache[anat_part_key] = DatsObj("AnatomicalPart", [
            ("name", anatomy_name),
            ("identifier", anatomy_identifier),
            ("alternateIdentifiers", anatomy_alt_ids)])
    anatomical_part = dats_obj_cache[anat_part_key]

    def gtex_dimension(label, description, var_name):
        # characteristic of the subject, identified by its GTEx variable name
        return DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", label)])),
            ("description", description),
            ("identifier", DatsObj("Identifier", [
                ("identifier", var_name), ("identifierSource", "GTEx")])),
            ("values", [subject[var_name]['mapped_value']])])

    subject_characteristics = [
        gtex_dimension("Gender", "Gender of the subject", "SEX"),
        gtex_dimension("Age range", "Age range of the subject", "AGE"),
        gtex_dimension("Hardy scale", "Hardy scale death classification for the subject", "DTHHRDY"),
    ]

    # human experimental subject/patient: one shared object per subject id
    subj_key = ":".join(["Material", subj_id])
    if subj_key not in dats_obj_cache:
        dats_obj_cache[subj_key] = DatsObj("Material", [
            ("name", subj_id),
            ("identifier", {"identifier": subj_id}),
            ("description", "GTEx subject " + subj_id),
            ("characteristics", subject_characteristics),
            ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
            ("roles", util.get_donor_roles(dats_obj_cache))])
    subject_material = dats_obj_cache[subj_key]

    specimen_role = util.get_annotation("specimen", dats_obj_cache)
    rna_extract_role = util.get_annotation("RNA extract", dats_obj_cache)

    # biological/tissue sample
    tissue_sample = DatsObj("Material", [
        ("name", samp_id),
        ("identifier", {"identifier": samp_id}),
        ("description", anatomy_name + " specimen collected from subject " + subj_id),
        ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
        ("roles", [specimen_role]),
        ("derivesFrom", [subject_material, anatomical_part])])

    # RNA extracted from tissue sample
    return DatsObj("Material", [
        ("name", "RNA from " + samp_id),
        ("description", "total RNA extracted from " + anatomy_name + " specimen collected from subject " + subj_id),
        ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
        ("roles", [rna_extract_role]),
        ("derivesFrom", [tissue_sample])])
def update_single_subject(cache, study, study_md, subj, subj_var_values, use_all_dbgap_vars):
    """Update a subject Material in place with values from dbGaP metadata.

    Sets the subject's "alternateIdentifiers" (its dbGaP subject id),
    "bearerOfDisease" (hypertension, when its status is known) and
    "characteristics". Of the existing characteristics only those whose name
    is the plain string 'member of study group' are kept; the harmonized
    characteristics (gender, age, visit year, blood pressure) and, optionally,
    one raw Dimension per dbGaP variable are appended after them.

    Args:
        cache: not used by this function.
        study: not used by this function.
        study_md: not used by this function.
        subj: DATS Material of the subject; modified via ``set``.
        subj_var_values: maps variable name to a dict with a "value" entry
            (and optionally a "var" entry); must contain 'dbGaP_Subject_ID'.
        use_all_dbgap_vars: when true, add a Dimension for every variable in
            ``subj_var_values``.

    Returns:
        None.
    """
    def raw_value(var_name):
        return subj_var_values[var_name]['value']

    def characteristic(label, description, value):
        # harmonized/standardized characteristic with a single value
        return DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", label)])),
            ("description", description),
            ("values", [value])])

    # extract subject attributes; variable names are matched case-insensitively
    gender = age = visit_year = sys_bp = dias_bp = None
    hypertension = "unknown"
    for var_name in subj_var_values:
        upper = var_name.upper()
        if upper in ("GENDER", "SEX"):
            gender = raw_value(var_name).lower()
        elif upper in ("VISIT_AGE", "AGE", "AGE_ENROLL"):
            # need to confirm that these all mean the same thing
            age = raw_value(var_name)
        elif upper == "VISIT_YEAR":
            visit_year = raw_value(var_name)
        elif upper == "SYSBP":
            sys_bp = raw_value(var_name)
        elif upper == "DIASBP":
            dias_bp = raw_value(var_name)
        elif upper in ("HYPERTENSION", "MHHTN"):
            answer = raw_value(var_name)
            if answer.lower() == "yes" or answer == '1':
                hypertension = "yes"
            elif re.match(r'\S', answer):
                # any other value that starts with a non-blank character
                hypertension = "no"

    subject_characteristics = []
    subject_bearerOfDisease = []

    if gender is not None:
        # TODO - only the codes "1" and "2" are translated; any other value
        # is recorded as None
        sex = {"2": "female", "1": "male"}.get(gender)
        subject_characteristics.append(
            characteristic("Gender", "Gender of the subject", sex))
    if age is not None:
        subject_characteristics.append(
            characteristic("Age", "Age of the subject", age))
    if visit_year is not None:
        subject_characteristics.append(characteristic(
            "Visit year", "Year of visit, to use for longitudinal analysis", visit_year))
    if sys_bp is not None:
        subject_characteristics.append(characteristic(
            "Systolic blood pressure", "Systolic blood pressure of subject, measured in mmHg", sys_bp))
    if dias_bp is not None:
        subject_characteristics.append(characteristic(
            "Diastolic blood pressure", "Diastolic blood pressure of subject, measured in mmHg", dias_bp))

    if hypertension != "unknown":
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + str(disease_id)),
            ("identifierSource", "Disease Ontology")])
        disease_alt_ids = [OrderedDict([
            ("identifier", "http://purl.obolibrary.org/obo/DOID_" + str(disease_id)),
            ("identifierSource", "Disease Ontology")])]
        subject_bearerOfDisease.append(DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus", DatsObj("Annotation", [("value", hypertension)]))]))

    def raw_dimension(var_name, var_value):
        # DATS Dimension for one dbGaP variable value, linked to the existing
        # Dimension of the variable when one is attached under "var"
        value = var_value["value"]
        dimension = DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", var_name)])),
            ("values", [value])])
        if "var" in var_value:
            dimension.setProperty(
                "identifier", var_value["var"]["dim"].get("identifier").getIdRef())
        return dimension

    # add "raw" characteristics / DATS Dimensions for _all_ dbGaP subject metadata
    if use_all_dbgap_vars:
        subject_characteristics.extend(
            raw_dimension(var_name, subj_var_values[var_name])
            for var_name in sorted(subj_var_values))

    # update subject
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    subj.set("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")])
    subj.set("bearerOfDisease", subject_bearerOfDisease)

    def is_study_group(char):
        # TODO - 'member of study group' is string, not Annotation
        char_name = char.get("name")
        return not isinstance(char_name, DatsObj) and char_name == 'member of study group'

    # drop values from the public data release that are overwritten here
    new_chars = [char for char in subj.get("characteristics") if is_study_group(char)]
    new_chars.extend(subject_characteristics)
    subj.set("characteristics", new_chars)
#!/usr/bin/env python3 from ccmm.dats.datsobj import DatsObj import ccmm.dats.util as util import ccmm.topmed.dna_extracts as dna_extracts from collections import OrderedDict import logging import re import sys NIH_NHLBI = DatsObj("Organization", [( "name", "The National Institute of Health's National Heart, Lung and Blood Institute" ), ("abbreviation", "NHLBI")]) # Produce a DATS Material for a single sample. def get_sample_dats_material(cache, dats_subject, study, study_md, samp_var_values): # Almost all samples in TOPMed WGS phase are blood samples, named "Blood", "Peripheral Blood"... # Few samples are saliva samples probably due to sample collection issues name = None if 'BODY_SITE' in samp_var_values: name = 'BODY_SITE' elif 'Body_Site' in samp_var_values: name = 'Body_Site' elif 'Body Site' in samp_var_values: name = 'Body Site'
#HiSeq 2000 #HiSeq 2000 #GeneChip Human Gene 1.0 ST Array #HiSeq 2000 #HiSeq X Ten DBGAP_QUERY_URL_PREFIX = 'https://www.ncbi.nlm.nih.gov/gap/?term=' DBGAP_GTEX_QUERY_URL = DBGAP_QUERY_URL_PREFIX + 'phs000424' HISEQ_TYPES = { "HiSeq 2000": util.get_annotation("Illumina HiSeq 2000"), "HiSeq X Ten": util.get_annotation("Illumina HiSeq X Ten"), } # TODO - duplicated from rnaseq_datasets.py DB_GAP = DatsObj("DataRepository", [("name", "dbGaP")]) NIH_NHGRI = DatsObj("Organization", [("name", "National Human Genome Research Institute"), ("abbreviation", "NHGRI")]) # TODO - use DatsObjCache cache = None GTEX_V7_RNASEQ_TYPE = DatsObj( "DataType", [("information", util.get_annotation("transcription profiling", cache)), ("method", util.get_annotation("RNA-seq assay", cache)), ("platform", util.get_annotation("Illumina", cache))]) GTEX_V7_WGS_TYPE = DatsObj( "DataType",
def get_files_dats_datasets(cache, dats_samples_d, p_samples, gh_samples,
                            protected_cram_files, no_circular_links):
    """Create one DATS Dataset per CRAM file listed in a protected-data manifest.

    Every manifest entry yields a Dataset with two DatasetDistributions (the
    Google Storage copy first, then the AWS S3 copy), an MD5 Dimension and a
    DataAcquisition whose input is the sequenced sample Material.

    Args:
        cache: object cache; shared objects (DataType, Organization,
            DataStandard, Material) are obtained through ``get_obj_or_ref``.
        dats_samples_d: maps sample id to the DATS sample Material.
        p_samples: not read by this function.
        gh_samples: not read by this function.
        protected_cram_files: maps sample id to a manifest row; cells are read
            as ``row[column]['raw_value']``.
        no_circular_links: when true, the DataAcquisition is not linked back
            to its Dataset through "output".

    Returns:
        List of Dataset objects, in manifest iteration order.

    The process exits with status 1 when the sequence type cannot be derived
    from the AWS URI, when the file name cannot be parsed from the CRAM URI,
    or when a manifest sample id is missing from ``dats_samples_d``.
    """
    # DataType descriptions are built once and then handed out via the cache
    rnaseq_type = DatsObj("DataType", [
        ("information", util.get_annotation("transcription profiling", cache)),
        ("method", util.get_annotation("RNA-seq assay", cache)),
        ("platform", util.get_annotation("Illumina", cache)),
    ])
    wgs_type = DatsObj("DataType", [
        ("information", util.get_annotation("DNA sequencing", cache)),
        ("method", util.get_annotation("whole genome sequencing assay", cache)),
        ("platform", util.get_annotation("Illumina", cache)),
    ])

    def cached_datatype(label, datatype):
        return cache.get_obj_or_ref(".".join(["DataType", label]), lambda: datatype)

    creators = [
        cache.get_obj_or_ref(
            ":".join(["Organization", "Broad Institute"]),
            lambda: DatsObj("Organization", [("name", "Broad Institute")]))
    ]

    def make_data_standard(fmt):
        return DatsObj("DataStandard", [
            ("name", fmt),
            ("type", util.get_value_annotation("format", cache)),
            ("description", fmt + " file format"),
        ])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    crai_ds_key = ":".join(["DataStandard", "CRAI"])

    def cram_standard():
        return cache.get_obj_or_ref(cram_ds_key, lambda: make_data_standard("CRAM"))

    # The results of these two lookups are not used here; the calls are kept
    # so that the cache sees the same sequence of requests as before.
    cram_standard()
    cache.get_obj_or_ref(crai_ds_key, lambda: make_data_standard("CRAI"))

    def make_distribution(file_uri, index_uri, n_bytes):
        # TODO - review the following encoding decisions:
        #  - storing .crai URI as relatedIdentifier of the DatasetDistribution for the .cram file
        access = DatsObj("Access", [("landingPage", file_uri)])
        return DatsObj("DatasetDistribution", [
            ("access", access),
            ("identifier", DatsObj("Identifier", [("identifier", file_uri)])),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier", [("identifier", index_uri),
                                              ("relationType", "cram_index")])
            ]),
            ("size", n_bytes),
            # TODO - add unit for bytes, include IRI?
            # ("unit", util.get_value_annotation("bytes", cache))
            ("conformsTo", [cram_standard()]),
        ])

    datasets = []

    for sample_id in protected_cram_files:
        entry = protected_cram_files[sample_id]
        aws_uri = entry['cram_file_aws']['raw_value']

        # determine file type from the AWS path
        if re.search(r'wgs\/', aws_uri):
            ds_type = cached_datatype("WGS", wgs_type)
            gcp_suffix = '_gcp'
        elif re.search(r'rnaseq\/', aws_uri):
            ds_type = cached_datatype("RNA-seq", rnaseq_type)
            gcp_suffix = ''
        else:
            logging.fatal(
                "unable to determine material/sequence type from cram_file_aws="
                + aws_uri)
            sys.exit(1)

        # RNA-Seq keys = sample_id cram_file cram_file_md5 cram_file_size cram_index cram_file_aws cram_index_aws
        # WGS keys = same as above + firecloud_id
        cram_file = entry['cram_file' + gcp_suffix]['raw_value']
        cram_md5 = entry['cram_file_md5']['raw_value']
        n_bytes = int(entry['cram_file_size']['raw_value'])

        # Google Cloud Platform / Google Storage copy
        gs_distro = make_distribution(
            cram_file, entry['cram_index' + gcp_suffix]['raw_value'], n_bytes)
        # AWS / S3 copy
        s3_distro = make_distribution(
            aws_uri, entry['cram_index_aws']['raw_value'], n_bytes)

        name_match = re.match(r'^.*\/([^\/]+)$', cram_file)
        if name_match is None:
            logging.fatal("unable to parse filename from CRAM file URI " + cram_file)
            sys.exit(1)
        filename = name_match.group(1)

        # MD5 checksum of the .cram file is stored as a Dimension of the Dataset
        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj("Dimension", [
            ("name", util.get_value_annotation("MD5", cache)),
            ("values", [cram_md5]),
        ])

        dataset = DatsObj("Dataset", [
            ("distributions", [gs_distro, s3_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [ds_type]),
            ("creators", creators),
        ])

        # firecloud_id (WGS only) is a relatedIdentifier of the Dataset,
        # not of the DatasetDistribution
        if 'firecloud_id' in entry:
            dataset.set("relatedIdentifiers", [
                DatsObj("RelatedIdentifier", [
                    ("identifier", entry['firecloud_id']['raw_value']),
                    ("identifierSource", "FireCloud"),
                ])
            ])

        # input RNA/DNA extract that was sequenced
        if sample_id not in dats_samples_d:
            logging.fatal("no sample exists for " + sample_id + " found in file "
                          + entry['cram_file_aws']['raw_value'])
            sys.exit(1)
        sample = dats_samples_d[sample_id]
        sample_ref = cache.get_obj_or_ref(
            ":".join(["Material", sample.get("name")]), lambda: sample)

        acquisition = DatsObj("DataAcquisition", [
            ("name", filename),
            ("input", [sample_ref]),
            # ("uses", [])  # software used
        ])

        if not no_circular_links:
            # circular link back to enclosing Dataset as the output
            acquisition.set("output", [dataset.getIdRef()])

        dataset.set("producedBy", acquisition)
        datasets.append(dataset)

    return datasets
def make_consent_group(args, group_name, group_index, subject_l, dats_subject_d):
    """Create the DATS StudyGroup for one dbGaP consent group.

    Args:
        args: parsed command-line options; only ``no_circular_links`` is read.
        group_name: full consent group name. Only the two GTEx consent group
            names handled below are accepted.
        group_index: consent group number; not stored anywhere yet (see TODO).
        subject_l: rows for the subjects in the group, each with a 'SUBJID'.
        dats_subject_d: maps subject id to its DATS subject Material. A
            placeholder Material is added in place for every subject in
            ``subject_l`` that is not already present.

    Returns:
        The StudyGroup. Subjects that already existed are listed in "members"
        by id reference; newly created placeholders are listed in full.

    The process exits with status 1 when ``group_name`` is not recognized.
    """
    # DATS subject that corresponds to each named subject
    dats_subjects_l = []
    # parallel array in which existing subjects are represented by idref
    dats_subjects_idrefs_l = []

    for s in subject_l:
        subj_id = s['SUBJID']
        if subj_id not in dats_subject_d:
            logging.warning(
                "GTEx subject " + subj_id +
                " not found in public metadata, creating new subject Material")
            # create a placeholder Material, register it in dats_subject_d and
            # list it in full, since no other copy of it exists to refer to
            subject = DatsObj("Material", [("name", subj_id),
                                           ("characteristics", []),
                                           ("description", "GTEx subject " + subj_id)])
            dats_subject_d[subj_id] = subject
            dats_subjects_l.append(subject)
            dats_subjects_idrefs_l.append(subject)
        else:
            ds = dats_subject_d[subj_id]
            dats_subjects_l.append(ds)
            dats_subjects_idrefs_l.append(ds.getIdRef())

    # create StudyGroup and associated ConsentInfo
    # TODO - determine if/where to store group_index (0 or 1)
    # only 2 consent groups in GTEx study:
    #   0 - Subjects did not participate in the study, did not complete a consent document and
    #       are included only for the pedigree structure and/or genotype controls, such as HapMap subjects
    #   1 - General Research Use (GRU)
    gru_name = "General Research Use (GRU)"
    non_participant_name = (
        "Subjects did not participate in the study, did not complete a consent document and "
        "are included only for the pedigree structure and/or genotype controls, such as HapMap subjects")

    if group_name == gru_name:
        # Data Use Ontology for consent info - http://www.obofoundry.org/ontology/duo.html
        # http://purl.obolibrary.org/obo/DUO_0000005 - "general research use and clinical care"
        # "This primary category consent code indicates that use is allowed for health/medical/biomedical
        # purposes and other biological research, including the study of population origins or ancestry."
        consent_info = DatsObj("ConsentInfo", [
            ("name", group_name),
            ("abbreviation", "GRU"),
            ("description", group_name),
            ("relatedIdentifiers", [
                DatsObj("RelatedIdentifier",
                        [("identifier", "http://purl.obolibrary.org/obo/DUO_0000005")])
            ]),
        ])
    elif group_name == non_participant_name:
        consent_info = DatsObj("ConsentInfo", [("name", group_name),
                                               ("description", group_name)])
    else:
        logging.fatal("unrecognized consent group " + group_name)
        sys.exit(1)

    group = DatsObj("StudyGroup", [("name", group_name),
                                   ("members", dats_subjects_idrefs_l),
                                   ("size", len(dats_subjects_idrefs_l)),
                                   ("consentInformation", [consent_info])])

    # create link back from each subject to the parent StudyGroup
    if args.no_circular_links:
        logging.warning(
            "not creating Subject level circular links because of --no_circular_links option"
        )
    else:
        for s in dats_subjects_l:
            s.get("characteristics").append(
                DatsObj("Dimension", [("name", "member of study group"),
                                      ("values", [group.getIdRef()])]))

    return group
def get_subject_dats_material(cache, study, study_md, subj_var_values):
    """Build (or fetch from the cache) the DATS Material for one subject.

    Args:
        cache: object cache; the finished Material is registered under the
            key "Material:<SUBJECT_ID>".
        study: DATS object whose "title" is used in the description.
        study_md: not read by this function.
        subj_var_values: maps dbGaP variable name to a dict with a "value"
            entry and, optionally, a "var" entry that carries the matching
            Dataset Dimension under "dim". Must contain 'SUBJECT_ID' and
            'dbGaP_Subject_ID'.

    Returns:
        Whatever ``cache.get_obj_or_ref`` returns for the subject Material.
    """
    # extract subject attributes
    gender = age = visit_year = sys_bp = dias_bp = None
    disease = {'hypertension': "unknown"}

    for var_name in subj_var_values:
        key = var_name.upper()
        entry = subj_var_values[var_name]
        if key in ("GENDER", "SEX"):
            gender = entry['value'].lower()
        elif key in ("VISIT_AGE", "AGE", "AGE_ENROLL"):
            # need to confirm that these all mean the same thing
            age = entry['value']
        elif key == "VISIT_YEAR":
            visit_year = entry['value']
        elif key == "SYSBP":
            sys_bp = entry['value']
        elif key == "DIASBP":
            dias_bp = entry['value']
        elif key in ("HYPERTENSION", "HIGHBLOODPRES"):
            answer = entry['value']
            if answer.lower() == "yes" or answer == '1':
                disease['hypertension'] = "yes"
            elif re.match(r'\S', answer):
                # any other value that does not start with whitespace counts as "no"
                disease['hypertension'] = "no"

    def harmonized_dimension(label, description, value):
        return DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", label)])),
            ("description", description),
            ("values", [value]),
        ])

    # harmonized/standardized characteristics, in a fixed order
    harmonized = (
        ("Gender", "Gender of the subject", gender),
        ("Age", "Age of the subject", age),
        ("Visit year", "Year of visit, to use for longitudinal analysis", visit_year),
        ("Systolic blood pressure",
         "Systolic blood pressure of subject, measured in mmHg", sys_bp),
        ("Diastolic blood pressure",
         "Diastolic blood pressure of subject, measured in mmHg", dias_bp),
    )
    subject_characteristics = [
        harmonized_dimension(label, description, value)
        for label, description, value in harmonized
        if value is not None
    ]

    subject_bearerOfDisease = []
    if disease['hypertension'] != "unknown":
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + str(disease_id)),
            ("identifierSource", "Disease Ontology"),
        ])
        disease_alt_ids = [OrderedDict([
            ("identifier", "http://purl.obolibrary.org/obo/DOID_" + str(disease_id)),
            ("identifierSource", "Disease Ontology"),
        ])]
        status = DatsObj("Annotation", [("value", disease['hypertension']),
                                        ("valueIRI", "")])
        subject_bearerOfDisease.append(DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus", status),
        ]))

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        raw_value = var_value["value"]
        dimension = DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", name)])),
            ("values", [raw_value]),
        ])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            study_dim = var_value["var"]["dim"]
            dimension.setProperty("identifier",
                                  study_dim.get("identifier").getIdRef())
        return dimension

    # "raw" characteristics from dbGaP metadata, one Dimension per variable
    raw_dimensions = [
        make_var_dimension(vname, subj_var_values[vname])
        for vname in sorted(subj_var_values)
    ]
    subject_characteristics.extend(raw_dimensions)

    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj("Material", [
        ("name", subj_id),
        ("identifier", {"identifier": subj_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
        ("description", study_title + " subject " + subj_id),
        ("characteristics", subject_characteristics),
        ("bearerOfDisease", subject_bearerOfDisease),
        ("taxonomy", [human_t]),
        ("roles", util.get_donor_roles(cache)),
    ])

    # add to the cache
    return cache.get_obj_or_ref(":".join(["Material", subj_id]),
                                lambda: subject_material)
def main():
    """Command-line entry point: write DATS JSON for the GTEx dbGaP study.

    Reads the GTEx Portal subject/sample files, the data-stewards id dumps,
    manifests and DOI files, and the public dbGaP XML metadata; cross-checks
    the ids; builds subject and sample Materials plus one Dataset per
    WGS/RNA-Seq CRAM file; optionally adds restricted-access metadata; and
    writes the top-level Dataset to ``--output_file`` as JSON.

    The process exits with status 1 when a dbGaP study id cannot be parsed,
    when two study Datasets map to the same study id, or when the public
    metadata does not contain exactly one study.
    """
    # input
    parser = argparse.ArgumentParser(
        description='Create DATS JSON for dbGaP GTEx public metadata.')
    parser.add_argument(
        '--output_file',
        required=True,
        help='Output file path for the DATS JSON file containing the top-level DATS Dataset.')
    parser.add_argument(
        '--dbgap_public_xml_path',
        required=True,
        help='Path to directory that contains public dbGaP metadata files e.g., *.data_dict.xml and *.var_report.xml')
    parser.add_argument(
        '--dbgap_protected_metadata_path',
        required=False,
        help='Path to directory that contains access-controlled dbGaP tab-delimited metadata files.')
    parser.add_argument(
        '--max_output_samples',
        required=False,
        type=int,
        help='Impose a limit on the number of sample Materials in the output DATS. For testing purposes only.')
    parser.add_argument('--subject_phenotypes_path',
                        default=V7_SUBJECT_PHENOTYPES_FILE,
                        required=False,
                        help='Path to ' + V7_SUBJECT_PHENOTYPES_FILE)
    parser.add_argument('--sample_attributes_path',
                        default=V7_SAMPLE_ATTRIBUTES_FILE,
                        required=False,
                        help='Path to ' + V7_SAMPLE_ATTRIBUTES_FILE)
    parser.add_argument(
        '--data_stewards_repo_path',
        default='data-stewards',
        required=False,
        help='Path to local copy of https://github.com/dcppc/data-stewards')
    parser.add_argument(
        '--no_circular_links',
        action='store_true',
        help='Whether to disallow circular links/paths within the JSON-LD output.')
    parser.add_argument(
        '--use_all_dbgap_subject_vars',
        action='store_true',
        help='Whether to store all available dbGaP variable values as characteristics of the DATS subject Materials.')
    # parser.add_argument('--use_all_dbgap_sample_vars', action='store_true', help ='Whether to store all available dbGaP variable values as characteristics of the DATS sample Materials.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)
    # logging.basicConfig(level=logging.DEBUG)

    # read portal metadata for subjects and samples
    p_subjects = portal_files.read_subject_phenotypes_file(
        args.subject_phenotypes_path)
    p_samples = portal_files.read_sample_attributes_file(
        args.sample_attributes_path)
    portal_files.link_samples_to_subjects(p_samples, p_subjects)

    # read id dump and manifest files from GitHub data-stewards repo
    id_dumps_dir = args.data_stewards_repo_path + "/gtex/v7/id_dumps/"
    protected_dir = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/"

    # id dumps
    subject_id_file = id_dumps_dir + "gtex_v7_subject_ids.txt"
    gh_subjects = github_files.read_subject_id_file(subject_id_file)
    sample_id_file = id_dumps_dir + "gtex_v7_sample_ids.txt"
    gh_samples = github_files.read_sample_id_file(sample_id_file)
    tissue_id_file = id_dumps_dir + "gtex_v7_tissue_ids.txt"
    # the tissue ids are read but not used further in this function
    gh_tissues = github_files.read_tissue_id_file(tissue_id_file)

    # manifest files
    protected_rnaseq_manifest = protected_dir + RNASEQ_MANIFEST_FILE
    protected_rnaseq_files = github_files.read_protected_rnaseq_manifest(
        protected_rnaseq_manifest)
    protected_wgs_manifest = protected_dir + WGS_MANIFEST_FILE
    protected_wgs_files = github_files.read_protected_wgs_manifest(
        protected_wgs_manifest)

    # DOIs
    rnaseq_dois_file = protected_dir + RNASEQ_DOIS_FILE
    rnaseq_dois = github_files.read_dois_manifest(rnaseq_dois_file)
    wgs_dois_file = protected_dir + WGS_DOIS_FILE
    wgs_dois = github_files.read_dois_manifest(wgs_dois_file)

    # compare GitHub manifest files with GitHub id dumps
    cross_check_ids(gh_subjects, gh_samples, protected_rnaseq_files,
                    protected_rnaseq_manifest, "RNA-Seq", "GitHub id dumps")
    cross_check_ids(gh_subjects, gh_samples, protected_wgs_files,
                    protected_wgs_manifest, "WGS", "GitHub id dumps")

    # compare GitHub manifest files with GTEx Portal metadata files
    cross_check_ids(p_subjects, p_samples, protected_rnaseq_files,
                    protected_rnaseq_manifest, "RNA-Seq", "GTEx Portal metadata")
    cross_check_ids(p_subjects, p_samples, protected_wgs_files,
                    protected_wgs_manifest, "WGS", "GTEx Portal metadata")

    # create top-level dataset
    gtex_dataset = ccmm.gtex.wgs_datasets.get_dataset_json()

    # index dbGaP study Datasets by id
    dbgap_study_datasets_by_id = {}
    for tds in gtex_dataset.get("hasPart"):
        dbgap_study_id = tds.get("identifier").get("identifier")
        m = re.match(r'^(phs\d+\.v\d+)\.p\d+$', dbgap_study_id)
        if m is None:
            logging.fatal("unable to parse study_id " + dbgap_study_id)
            sys.exit(1)
        # The index is keyed by the id without its ".pN" suffix, so the
        # duplicate check has to use that same key to be able to fire.
        study_key = m.group(1)
        if study_key in dbgap_study_datasets_by_id:
            logging.fatal("encountered duplicate study_id " + dbgap_study_id)
            sys.exit(1)
        dbgap_study_datasets_by_id[study_key] = tds

    # read public dbGaP metadata
    pub_xp = args.dbgap_public_xml_path
    restricted_mp = args.dbgap_protected_metadata_path

    # read public metadata
    dbgap_study_pub_md = ccmm.gtex.public_metadata.read_study_metadata(pub_xp)

    # there should be only one study; verify that before indexing the list
    study_ids = list(dbgap_study_pub_md)
    n_study_ids = len(study_ids)
    if n_study_ids != 1:
        logging.fatal("read " + str(n_study_ids) + " dbGaP studies from " + pub_xp)
        sys.exit(1)
    study_id = study_ids[0]

    dbgap_study_dataset = dbgap_study_datasets_by_id[study_id]
    dbgap_study_md = dbgap_study_pub_md[study_id]

    sv = ccmm.gtex.public_metadata.add_study_vars(dbgap_study_dataset,
                                                  dbgap_study_md)
    dbgap_study_md['id_to_var'] = sv['id_to_var']
    dbgap_study_md['type_name_cg_to_var'] = sv['type_name_cg_to_var']

    # set 2nd level types to be the same as the top-level types: WGS and RNA-Seq
    dbgap_study_dataset.set("types", gtex_dataset.get("types"))

    # cache used to minimize duplication of JSON objects in JSON-LD output
    cache = DatsObjCache()

    # --------------------------
    # subjects
    # --------------------------

    # create subjects based on GTEx Portal subject phenotype file and GitHub data-stewards id dump
    dats_subjects_d = ccmm.gtex.subjects.get_subjects_dats_materials(
        cache, p_subjects, gh_subjects,
        dbgap_study_md['type_name_cg_to_var']['Subject_Phenotypes'])
    # sorted list of subjects
    dats_subjects_l = sorted(dats_subjects_d.values(),
                             key=lambda s: s.get("name"))

    # TODO - add consent groups, of which GTEx has 2: 0=didn't participate, 1=General Research Use (GRU)

    # create StudyGroup that lists all the subjects
    logging.info("creating 'all subjects' StudyGroup containing " +
                 str(len(dats_subjects_l)) + " subject(s) from public metadata")
    all_subjects = DatsObj("StudyGroup", [
        ("name", "all subjects"),
        # subjects appear in full here, but id references will be used elsewhere in the instance:
        ("members", dats_subjects_l),
        ("size", len(dats_subjects_l)),
    ])

    # create link back from each subject to the parent StudyGroup
    if args.no_circular_links:
        logging.warning(
            "not creating Subject level circular links because of --no_circular_links option"
        )
    else:
        for s in dats_subjects_l:
            cl = s.get("characteristics")
            cl.append(
                DatsObj("Dimension", [("name", "member of study group"),
                                      ("values", [all_subjects.getIdRef()])]))

    dats_study = DatsObj("Study", [("name", "GTEx"),
                                   ("studyGroups", [all_subjects])])

    # link Study to Dataset
    dbgap_study_dataset.set("producedBy", dats_study)

    # --------------------------
    # sample Materials
    # --------------------------

    # create samples based on GTEx Portal sample attributes file and GitHub data-stewards id dump
    dats_samples_d = ccmm.gtex.samples.get_samples_dats_materials(
        cache, dats_subjects_d, p_samples, gh_samples,
        dbgap_study_md['type_name_cg_to_var']['Sample_Attributes'])
    # sorted list of samples
    dats_samples_l = sorted(dats_samples_d.values(),
                            key=lambda s: s.get("name"))
    if args.max_output_samples is not None:
        dats_samples_l = dats_samples_l[0:int(args.max_output_samples)]
        logging.warning("limiting output to " + str(len(dats_samples_l)) +
                        " sample(s) due to value of --max_output_samples")

    dbgap_study_dataset.set("isAbout", dats_samples_l)

    # --------------------------
    # file Datasets
    # --------------------------

    file_datasets_l = []

    # WGS CRAM
    wgs_dats_file_datasets_l = ccmm.gtex.samples.get_files_dats_datasets(
        cache, dats_samples_d, p_samples, gh_samples, protected_wgs_files,
        wgs_dois, args.no_circular_links)
    logging.info("adding Datasets for " + str(len(wgs_dats_file_datasets_l)) +
                 " WGS CRAM files")
    file_datasets_l.extend(wgs_dats_file_datasets_l)

    # RNA-Seq CRAM
    rnaseq_dats_file_datasets_l = ccmm.gtex.samples.get_files_dats_datasets(
        cache, dats_samples_d, p_samples, gh_samples, protected_rnaseq_files,
        rnaseq_dois, args.no_circular_links)
    logging.info("adding Datasets for " +
                 str(len(rnaseq_dats_file_datasets_l)) + " RNA-Seq CRAM files")
    file_datasets_l.extend(rnaseq_dats_file_datasets_l)

    dbgap_study_dataset.set("hasPart", file_datasets_l)

    # augment public (meta)data with restricted-access (meta)data
    if restricted_mp is not None:
        # create study groups and update subjects/samples with restricted phenotype data
        add_restricted_data(cache, args, dbgap_study_md, dats_subjects_l,
                            dats_samples_d, dats_study, study_id)

    # write Dataset to DATS JSON file
    with open(args.output_file, mode="w") as jf:
        jf.write(json.dumps(gtex_dataset, indent=2, cls=DATSEncoder))
def get_single_dna_extract_json(cache, study, study_md, subj_var_values, samp_var_values):
    """Build the DNA-extract Material for one TOPMed sample.

    The returned Material derives from a biological sample Material, which in
    turn derives from a subject Material (embedded in full) and, when a body
    site is given, an AnatomicalPart.

    Args:
        cache: object cache, passed to ``util.get_taxon_human`` and
            ``util.get_donor_roles``.
        study: DATS object whose "title" is used in the subject description.
        study_md: study metadata; ``study_md['dbgap_vars']`` is consulted for
            variable values that carry a "var" entry.
        subj_var_values: maps dbGaP subject variable name to a dict with a
            "value" entry. Must contain 'SUBJECT_ID' and 'dbGaP_Subject_ID'.
        samp_var_values: same structure for the sample. Must contain
            'SAMPLE_ID' and 'dbGaP_Sample_ID'; the body site is looked up
            under 'BODY_SITE', 'Body_Site' or 'Body Site'.

    Returns:
        The DNA extract Material.

    The process exits with status 1 when the body site is neither a blood
    sample nor "saliva".
    """
    # Almost all samples in TOPMed WGS phase are blood samples, named "Blood", "Peripheral Blood"...
    # Few samples are saliva samples probably due to sample collection issues
    body_site_key = None
    for candidate in ('BODY_SITE', 'Body_Site', 'Body Site'):
        if candidate in samp_var_values:
            body_site_key = candidate
            break

    anat_id = None
    anatomy_name = None
    if body_site_key is not None:
        body_site = samp_var_values[body_site_key]['value']
        if "blood" in body_site.lower():
            anatomy_name = "blood"
            anat_id = "0000178"
        elif body_site.lower() == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            # Report the value found under the key spelling that is actually
            # present; indexing 'BODY_SITE' unconditionally raised KeyError
            # for the other two spellings.
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + body_site)
            sys.exit(1)

    if anat_id is not None:
        anatomy_identifier = OrderedDict([("identifier", "UBERON:" + str(anat_id)),
                                          ("identifierSource", "UBERON")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                ("identifierSource", "UBERON"),
            ])
        ]

    # extract subject attributes
    gender = None
    age = None
    visit_year = None
    sys_bp = None
    dias_bp = None
    disease = {'hypertension': "unknown"}

    for var_name in subj_var_values:
        name_upper = var_name.upper()
        if name_upper in ("GENDER", "SEX"):
            gender = subj_var_values[var_name]['value'].lower()
        elif name_upper in ("VISIT_AGE", "AGE", "AGE_ENROLL"):
            # need to confirm that these all mean the same thing
            age = subj_var_values[var_name]['value']
        elif name_upper == "VISIT_YEAR":
            visit_year = subj_var_values[var_name]['value']
        elif name_upper == "SYSBP":
            sys_bp = subj_var_values[var_name]['value']
        elif name_upper == "DIASBP":
            dias_bp = subj_var_values[var_name]['value']
        elif name_upper in ("HYPERTENSION", "HIGHBLOODPRES"):
            answer = subj_var_values[var_name]['value']
            if answer.lower() == "yes" or answer == '1':
                disease['hypertension'] = "yes"
            elif re.match(r'\S', answer):
                # any other value that does not start with whitespace counts as "no"
                disease['hypertension'] = "no"

    # anatomical part
    anatomical_part = None
    if anatomy_name is not None:
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])

    subject_characteristics = []
    subject_bearerOfDisease = []

    # harmonized/standardized characteristics
    if gender is not None:
        subject_characteristics.append(DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", "Gender")])),
            ("description", "Gender of the subject"),
            ("values", [gender]),
        ]))

    if age is not None:
        subject_characteristics.append(DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", "Age")])),
            ("description", "Age of the subject"),
            ("values", [age]),
        ]))

    if visit_year is not None:
        subject_characteristics.append(DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", "Visit year")])),
            ("description", "Year of visit, to use for longitudinal analysis"),
            ("values", [visit_year]),
        ]))

    if sys_bp is not None:
        subject_characteristics.append(DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", "Systolic blood pressure")])),
            ("description", "Systolic blood pressure of subject, measured in mmHg"),
            ("values", [sys_bp]),
        ]))

    if dias_bp is not None:
        subject_characteristics.append(DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", "Diastolic blood pressure")])),
            ("description", "Diastolic blood pressure of subject, measured in mmHg"),
            ("values", [dias_bp]),
        ]))

    if disease['hypertension'] != "unknown":
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + str(disease_id)),
            ("identifierSource", "Disease Ontology"),
        ])
        disease_alt_ids = [
            OrderedDict([
                ("identifier", "http://purl.obolibrary.org/obo/DOID_" + str(disease_id)),
                ("identifierSource", "Disease Ontology"),
            ])
        ]
        subject_hypertension = DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus",
             DatsObj("Annotation", [("value", disease['hypertension']),
                                    ("valueIRI", "")])),
        ])
        subject_bearerOfDisease.append(subject_hypertension)

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]
        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            var_id = var_value["var"]["id"]
            dbgap_var_dim = study_md['dbgap_vars'][var_id]
            dim.setProperty("identifier",
                            dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # create DATS Dimensions for dbGaP subject metadata
    subject_dimensions = [
        make_var_dimension(vname, subj_var_values[vname])
        for vname in sorted(subj_var_values)
    ]

    # create DATS Dimensions for dbGaP sample metadata
    sample_dimensions = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    # "raw" characteristics from dbGaP metadata
    subject_characteristics.extend(subject_dimensions)
    sample_characteristics = sample_dimensions

    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']

    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj("Material", [
        ("name", subj_id),
        ("identifier", {"identifier": subj_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
        ("description", study_title + " subject " + subj_id),
        ("characteristics", subject_characteristics),
        ("bearerOfDisease", subject_bearerOfDisease),
        ("taxonomy", [human_t]),
        ("roles", util.get_donor_roles(cache)),
    ])

    # TODO - use DatsObjCache
    specimen_annot = util.get_annotation("specimen")
    dna_extract_annot = util.get_annotation("DNA extract")

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [subject_material]
    sample_descr = "specimen collected from subject " + subj_id
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr

    biological_sample_material = DatsObj("Material", [
        ("name", sample_name),
        ("identifier", {"identifier": samp_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
        ("description", sample_descr),
        ("characteristics", sample_characteristics),
        ("taxonomy", [human_t]),
        ("roles", [specimen_annot]),
        ("derivesFrom", sample_derives_from),
    ])

    # DNA extracted from tissue sample
    dna_descr = "DNA extracted from specimen collected from subject " + subj_id
    if anatomical_part is not None:
        dna_descr = ("DNA extracted from " + anatomy_name +
                     " specimen collected from subject " + subj_id)

    dna_material = DatsObj("Material", [
        ("name", "DNA from " + sample_name),
        ("description", dna_descr),
        ("taxonomy", [human_t]),
        ("roles", [dna_extract_annot]),
        ("derivesFrom", [biological_sample_material]),
    ])
    return dna_material
def get_sample_dats_material(cache, dats_subject, study, study_md, samp_var_values):
    """Build the DNA-extract Material for one sample of an existing subject.

    The returned Material derives from a biological sample Material, which in
    turn derives from the subject (obtained through the cache) and, when a
    body site is given, an AnatomicalPart (also obtained through the cache).

    Args:
        cache: object cache used for the AnatomicalPart, the subject
            Material and the annotations/taxon returned by ``util``.
        dats_subject: DATS subject Material the sample was collected from.
        study: not read by this function.
        study_md: not read by this function.
        samp_var_values: maps dbGaP sample variable name to a dict with a
            "value" entry and, optionally, a "var" entry carrying the
            matching Dataset Dimension under "dim". Must contain 'SAMPLE_ID'
            and 'dbGaP_Sample_ID'; the body site is looked up under
            'BODY_SITE', 'Body_Site' or 'Body Site'.

    Returns:
        The DNA extract Material.

    The process exits with status 1 when the body site is neither a blood
    sample nor "saliva".
    """
    # Almost all samples in TOPMed WGS phase are blood samples, named "Blood", "Peripheral Blood"...
    # Few samples are saliva samples probably due to sample collection issues
    body_site_key = None
    for candidate in ('BODY_SITE', 'Body_Site', 'Body Site'):
        if candidate in samp_var_values:
            body_site_key = candidate
            break

    anat_id = None
    anatomy_name = None
    if body_site_key is not None:
        body_site = samp_var_values[body_site_key]['value']
        if "blood" in body_site.lower():
            anatomy_name = "blood"
            anat_id = "0000178"
        elif body_site.lower() == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            # Report the value found under the key spelling that is actually
            # present; indexing 'BODY_SITE' unconditionally raised KeyError
            # for the other two spellings.
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + body_site)
            sys.exit(1)

    def make_anat_part(anat_id, anatomy_name):
        # anatomical part
        anatomical_part = DatsObj("AnatomicalPart", [("name", anatomy_name)])
        if anat_id is not None:
            anatomy_identifier = OrderedDict([
                ("identifier", "UBERON:" + str(anat_id)),
                ("identifierSource", "UBERON"),
            ])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier",
                     "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON"),
                ])
            ]
            anatomical_part.set("identifier", anatomy_identifier)
            anatomical_part.set("alternateIdentifiers", anatomy_alt_ids)
        return anatomical_part

    if anatomy_name is not None:
        # use cached value for AnatomicalPart if possible
        anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
        anatomical_part = cache.get_obj_or_ref(
            anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))
    else:
        anatomical_part = None

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]
        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])
        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dbgap_var_dim = var_value["var"]["dim"]
            dim.setProperty("identifier",
                            dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # create DATS Dimensions for dbGaP sample metadata
    sample_characteristics = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']

    specimen_annot = util.get_annotation("specimen", cache)

    # corresponding DATS subject Material
    subj_key = ":".join(["Material", dats_subject.get("name")])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)
    dats_subj_name = dats_subject.get("name")

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [dats_subj]
    sample_descr = "specimen collected from subject " + dats_subj_name
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr

    biological_sample_material = DatsObj("Material", [
        ("name", sample_name),
        ("identifier", {"identifier": samp_id}),
        ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
        ("description", sample_descr),
        ("characteristics", sample_characteristics),
        ("taxonomy", [util.get_taxon_human(cache)]),
        ("roles", [specimen_annot]),
        ("derivesFrom", sample_derives_from),
    ])

    # RNA or DNA extracted from tissue sample
    stype = "DNA"
    # TODO - check if RNA, not DNA

    dna_or_rna_descr = (stype + " extracted from specimen collected from subject "
                        + dats_subj_name)
    if anatomical_part is not None:
        dna_or_rna_descr = (stype + " extracted from " + anatomy_name +
                            " specimen collected from subject " + dats_subj_name)

    dna_or_rna_material = DatsObj("Material", [
        ("name", stype + " from " + sample_name),
        ("description", dna_or_rna_descr),
        ("taxonomy", [util.get_taxon_human(cache)]),
        ("roles", [util.get_annotation(stype + " extract", cache)]),
        ("derivesFrom", [biological_sample_material]),
    ])
    return dna_or_rna_material
import gzip
import logging
import re
import sys

# Free-text description of the Alliance of Genome Resources; the trailing
# backslashes join the three source lines into one paragraph.
AGR_DESCRIPTION = """The Alliance (AGR) develops and maintains sustainable genome information resources \
that facilitates the use of diverse model organisms in understanding the genetic and genomic basis of \
human biology, health and disease.
"""

# List of AGR Genome builds
# Maps a build identifier to a human-readable genome name.
AGR_ref_genomes = {
    "MGI_1.0.4": "Mouse Genome (Mouse Genome Informatics)",
    "RGD_1.0.4": "Rat Genome (Rat Genome Database)"
}

# DATS Organization for the Alliance itself.
AGR = DatsObj("Organization", [("abbreviation", "AGR"),
                               ("name", "Alliance of Genome Resources")])

# DATS DataRepository for the Alliance S3 bucket, published by AGR.
AGR_BUCKET = DatsObj("DataRepository", [("name", "AGR"),
                                        ("description", "The Alliance AWS S3 BUCKET"),
                                        ("publishers", [AGR])])

## Ontology for Biomedical Investigations
# "DNA sequencing"
DNA_SEQUENCING_TYPE = OrderedDict([
    ("value", "DNA sequencing"),
    ("valueIRI", "http://purl.obolibrary.org/obo/OBI_0000626")
])

# "whole genome sequencing assay"
WGS_ASSAY_TYPE = OrderedDict([("value", "whole genome sequencing assay"),
                              ("valueIRI", "http://purl.obolibrary.org/obo/OBI_0002117")])
def get_files_dats_datasets(cache, dats_samples_d, sample_manifest, file_guids, no_circular_links):
    """Create DATS Datasets for the CRAM and VCF files of each sample.

    Args:
        cache: object cache exposing ``get_obj_or_ref(key, factory)``; it is
            also handed to the ``util`` annotation helpers.
        dats_samples_d: maps sample id to the DATS object used as the
            DataAcquisition input (its ``getIdRef()`` is called).
        sample_manifest: maps sample id to a manifest row whose ``gs_cram``,
            ``gs_crai``, ``s3_cram``, ``gs_vcf``, ``gs_csi`` and ``s3_vcf``
            entries each hold a ``mapped_value`` URI.
        file_guids: maps a file name to a row whose ``Sodium_GUID``,
            ``File size`` and ``md5sum`` entries each hold a ``raw_value``.
        no_circular_links: when true, the DataAcquisition objects do not get
            an ``output`` reference back to their enclosing Dataset.

    Returns:
        A list of "Dataset" DatsObj: for every sample found in the manifest
        one CRAM dataset and, unless its ``gs_vcf`` value is None, one VCF
        dataset.

    The process exits with status 1 when a file name cannot be parsed from a
    URI.
    """
    file_datasets_l = []

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay", cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    snp_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("SNP", cache)),
         ("method", util.get_annotation("SNP analysis", cache))])

    cnv_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("CNV", cache)),
         ("method", util.get_annotation("CNV analysis", cache))])

    def get_snp_datatype():
        dkey = ".".join(["DataType", "SNP"])
        return cache.get_obj_or_ref(dkey, lambda: snp_datatype)

    def get_cnv_datatype():
        dkey = ".".join(["DataType", "CNV"])
        return cache.get_obj_or_ref(dkey, lambda: cnv_datatype)

    nhlbi_key = ":".join(["Organization", "NHLBI"])
    nhlbi = cache.get_obj_or_ref(nhlbi_key, lambda: NIH_NHLBI)
    creators = [nhlbi]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_dstan = cache.get_obj_or_ref(cram_ds_key, lambda: make_data_standard("CRAM"))
    vcf_ds_key = ":".join(["DataStandard", "VCF"])
    vcf_dstan = cache.get_obj_or_ref(vcf_ds_key, lambda: make_data_standard("VCF"))

    # The helpers below do not depend on the current sample, so they are
    # defined once instead of on every loop iteration.

    def get_filename(gs_uri):
        # last path component of the URI
        m = re.match(r'^.*\/([^\/]+)$', gs_uri)
        if m is None:
            logging.fatal("unable to parse filename from " + gs_uri)
            sys.exit(1)
        return m.group(1)

    # handle file size values with "e" in them
    def filesize_to_int(size):
        # This helper used to fall off the end without returning, so every
        # distribution was written with size None.
        if "e" in size.lower():
            return int(float(size))
        return int(size)

    def make_distribution(access_url, file_doi, index_doi, relation_type, size, dstan):
        # one DatasetDistribution for a single cloud copy of a file
        return DatsObj(
            "DatasetDistribution",
            [
                ("access", DatsObj("Access", [("accessURL", access_url)])),
                ("identifier", DatsObj("Identifier", [("identifier", file_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier", [("identifier", index_doi),
                                                  ("relationType", relation_type)])
                ]),
                ("size", size),
                # TODO - add file size units
                ("conformsTo", [dstan])
            ])

    def make_file_dataset(filename, distributions, md5, types, dats_sample):
        # Dataset for one file plus the DataAcquisition that produced it
        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension",
            [("name", util.get_value_annotation("MD5", cache)),
             ("values", [md5])])

        dataset = DatsObj("Dataset", [
            ("distributions", distributions),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", types),
            ("creators", creators),
        ])

        acquisition = DatsObj(
            "DataAcquisition",
            [("name", filename),
             ("input", [dats_sample.getIdRef()])
             # ("uses", [])  # software used
             ])
        dataset.set("producedBy", acquisition)

        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            acquisition.set("output", [dataset.getIdRef()])
        return dataset

    n_samples = len(dats_samples_d)
    n_samples_found = 0

    for sample_id, dats_sample in dats_samples_d.items():
        # look up corresponding file paths in manifest file
        if sample_id not in sample_manifest:
            logging.debug("sample not found in manifest - " + sample_id)
            continue

        n_samples_found += 1
        ms = sample_manifest[sample_id]

        # NOTE(review): fetched per sample rather than once; presumably
        # get_obj_or_ref hands out the full object on first use and a
        # reference afterwards - confirm against the cache implementation.
        wgs_type = get_wgs_datatype()
        snp_type = get_snp_datatype()
        cnv_type = get_cnv_datatype()

        # ------------------------------------------------
        # WGS sequence - CRAM and CRAI files
        # ------------------------------------------------
        gs_cram = ms['gs_cram']['mapped_value']
        gs_crai = ms['gs_crai']['mapped_value']

        # GUID lookup
        cram_file = get_filename(gs_cram)
        crai_file = get_filename(gs_crai)
        cram_doi = file_guids[cram_file]['Sodium_GUID']['raw_value']
        cram_size = filesize_to_int(file_guids[cram_file]['File size']['raw_value'])
        cram_md5 = file_guids[cram_file]['md5sum']['raw_value']
        crai_doi = file_guids[crai_file]['Sodium_GUID']['raw_value']

        # Google Cloud Platform / Google Storage copy
        gs_cram_distro = make_distribution(
            gs_cram, cram_doi, crai_doi, "cram_index", cram_size, cram_dstan)
        # AWS / S3 copy
        s3_cram = ms['s3_cram']['mapped_value']
        s3_cram_distro = make_distribution(
            s3_cram, cram_doi, crai_doi, "cram_index", cram_size, cram_dstan)

        file_datasets_l.append(make_file_dataset(
            cram_file, [gs_cram_distro, s3_cram_distro], cram_md5,
            [wgs_type], dats_sample))

        # ------------------------------------------------
        # Variant calls - VCF and CSI files
        # ------------------------------------------------
        gs_vcf = ms['gs_vcf']['mapped_value']
        gs_csi = ms['gs_csi']['mapped_value']

        if gs_vcf is None:
            logging.warning("no VCF file found for " + sample_id)
            continue

        # GUID lookup
        vcf_file = get_filename(gs_vcf)
        csi_file = get_filename(gs_csi)
        vcf_doi = file_guids[vcf_file]['Sodium_GUID']['raw_value']
        vcf_size = filesize_to_int(file_guids[vcf_file]['File size']['raw_value'])
        vcf_md5 = file_guids[vcf_file]['md5sum']['raw_value']
        csi_doi = file_guids[csi_file]['Sodium_GUID']['raw_value']

        # Google Cloud Platform / Google Storage copy
        gs_vcf_distro = make_distribution(
            gs_vcf, vcf_doi, csi_doi, "vcf_index", vcf_size, vcf_dstan)
        # AWS / S3 copy
        s3_vcf = ms['s3_vcf']['mapped_value']
        s3_vcf_distro = make_distribution(
            s3_vcf, vcf_doi, csi_doi, "vcf_index", vcf_size, vcf_dstan)

        file_datasets_l.append(make_file_dataset(
            vcf_file, [gs_vcf_distro, s3_vcf_distro], vcf_md5,
            [snp_type, cnv_type], dats_sample))

    logging.info("found " + str(n_samples_found) + " / " + str(n_samples)
                 + " sample(s) in TOPMed file manifest")
    return file_datasets_l
def get_single_dna_extract_json(study, subj_var_values, samp_var_values):
    """Build the DATS Material for the DNA extract of a single blood sample.

    Args:
        study: object whose ``get("title")`` supplies the study title used in
            the subject description.
        subj_var_values: maps dbGaP subject variable name to its value; must
            contain ``SUBJECT_ID`` and ``dbGaP_Subject_ID``. ``GENDER`` and
            ``VISIT_AGE`` are used when present.
        samp_var_values: maps dbGaP sample variable name to its value; must
            contain ``BODY_SITE``, ``SAMPLE_ID`` and ``dbGaP_Sample_ID``.

    Returns:
        The "Material" DatsObj for the DNA, derived from the biological
        sample, which in turn derives from the subject and the anatomical
        part.

    The process exits with status 1 when ``BODY_SITE`` is not exactly
    ``'Blood'``.
    """
    # all samples in TOPMed WGS phase are blood samples
    body_site = samp_var_values['BODY_SITE']
    if body_site != 'Blood':
        logging.fatal(
            "encountered BODY_SITE other than 'Blood' in TOPMed sample metadata - "
            + body_site)
        sys.exit(1)

    tissue = "blood"
    uberon_id = "0000178"
    tissue_identifier = OrderedDict([
        ("identifier", "UBERON:" + str(uberon_id)),
        ("identifierSource", "UBERON"),
    ])
    tissue_alt_ids = [
        OrderedDict([
            ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + str(uberon_id)),
            ("identifierSource", "UBERON"),
        ])
    ]

    # extract subject attributes
    gender = subj_var_values["GENDER"].lower() if "GENDER" in subj_var_values else None
    age = subj_var_values["VISIT_AGE"] if "VISIT_AGE" in subj_var_values else None
    # TODO - determine what other subject attributes can be mapped directly to core DATS objects

    def to_category_values_pairs(var_values):
        # one CategoryValuesPair per variable, in sorted name order
        pairs = []
        for category in sorted(var_values):
            pairs.append(
                DatsObj("CategoryValuesPair",
                        [("category", category),
                         ("values", [var_values[category]])]))
        return pairs

    # place original dbGaP subject metadata into extraProperties
    # TODO - consider alternative of doing this only for un-harmonized metadata
    subj_extra_props = to_category_values_pairs(subj_var_values)

    # extract sample attributes
    # TODO - determine which DATS objects (e.g., biological sample, DNA prep, sequence data)
    #        the SEQUENCING_CENTER property should attach to
    # TODO - determine what other subject attributes can be mapped directly to core DATS objects
    # e.g., IS_TUMOR -> bearerOfDisease ("the pathology affecting the material...")

    # place original dbGaP sample metadata into extraProperties
    samp_extra_props = to_category_values_pairs(samp_var_values)

    # anatomical part
    anatomical_part = DatsObj("AnatomicalPart",
                              [("name", tissue),
                               ("identifier", tissue_identifier),
                               ("alternateIdentifiers", tissue_alt_ids)])

    sex_dimension = DatsObj("Dimension",
                            [("name", {"value": "Gender"}),
                             ("description", "Gender of the subject"),
                             ("values", [gender])])
    age_dimension = DatsObj("Dimension",
                            [("name", {"value": "Age"}),
                             ("description", "Age of the subject"),
                             ("values", [age])])

    human_taxon = util.get_taxon_human()
    subject_id = subj_var_values['SUBJECT_ID']
    dbgap_subject_id = subj_var_values['dbGaP_Subject_ID']
    sample_id = samp_var_values['SAMPLE_ID']
    dbgap_sample_id = samp_var_values['dbGaP_Sample_ID']
    title = study.get("title")

    # human experimental subject/patient
    subject_alt_ids = [util.get_alt_id(dbgap_subject_id, "dbGaP")]
    subject_descr = title + " subject " + subject_id
    donor_roles = util.get_donor_roles()
    subject_material = DatsObj(
        "Material",
        [("name", subject_id),
         ("identifier", {"identifier": subject_id}),
         ("alternateIdentifiers", subject_alt_ids),
         ("description", subject_descr),
         ("characteristics", [sex_dimension, age_dimension]),
         ("taxonomy", human_taxon),
         ("roles", donor_roles),
         ("extraProperties", subj_extra_props)])

    # biological/tissue sample
    specimen_descr = tissue + " specimen collected from subject " + subject_id
    sample_alt_ids = [util.get_alt_id(dbgap_sample_id, "dbGaP")]
    specimen_role = OrderedDict([("value", "specimen"), ("valueIRI", "")])
    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_id),
         ("identifier", {"identifier": sample_id}),
         ("alternateIdentifiers", sample_alt_ids),
         ("description", specimen_descr),
         ("taxonomy", human_taxon),
         ("roles", [specimen_role]),
         ("derivesFrom", [subject_material, anatomical_part]),
         ("extraProperties", samp_extra_props)])

    # DNA extracted from tissue sample
    extract_role = OrderedDict([("value", "DNA extract"), ("valueIRI", "")])
    return DatsObj(
        "Material",
        [("name", "DNA from " + sample_id),
         ("description", "DNA extracted from " + specimen_descr),
         ("taxonomy", human_taxon),
         ("roles", [extract_role]),
         ("derivesFrom", [biological_sample_material])])
def make_data_standard(format):
    """Return a "DataStandard" DatsObj describing the file format *format*.

    NOTE(review): ``cache`` is not a parameter; it is resolved from the
    surrounding scope at call time - confirm that such a name exists wherever
    this function is used.
    """
    properties = [("name", format)]
    properties.append(("type", util.get_value_annotation("format", cache)))
    properties.append(("description", format + " file format"))
    return DatsObj("DataStandard", properties)
def get_dbgap_studies(qterm):
    """Parse the embedded dbGaP study listing into DATS Datasets.

    The listing is read from the module-level ``GTEX_STUDIES_STR``. Each study
    starts with a ``phs...`` id line, followed by description, version,
    details (``VDAS<participants><study type>Links``) and platform lines.

    Args:
        qterm: not used by this function; kept so existing callers still work.

    Returns:
        A list with one "Dataset" DatsObj per study found in the listing.

    The process exits with status 1 when a line cannot be parsed, when field
    content appears before the first study id, or when a study id does not
    look like ``phs<digits>.v<digits>.p<digits>``.
    """
    # All patterns are raw strings: "\S" and "\d" inside a plain literal are
    # invalid escape sequences and trigger a warning on current Python.
    blank_re = re.compile(r'^\s*$')
    study_id_re = re.compile(r'^(phs\S+)$')
    descr_re = re.compile(r'^Genotype-Tissue Expression(.*)$')
    version_re = re.compile(r'^(Version.*)$')
    details_re = re.compile(r'^VDAS(\d+)(\D.*)Links$')
    platform_re = re.compile(r'^(HiSeq.*)$')
    versions_split_re = re.compile(r'^(\S+.*)(Versions?.*)$')
    accession_re = re.compile(r'^phs\d+\.(v\d+)\.p\d+$')

    # Add newline before each occurrence of "Versions" if not already present
    lines = []
    for line in GTEX_STUDIES_STR.split('\n'):
        m = versions_split_re.match(line)
        if m is None:
            lines.append(line)
        else:
            lines.extend(m.groups())

    studies = []
    study = None

    for lnum, line in enumerate(lines, start=1):
        # blank line
        if blank_re.match(line):
            continue

        # study id
        m = study_id_re.match(line)
        if m is not None:
            study = {'id': m.group(1)}
            studies.append(study)
            continue

        # Any other content needs a study to attach to; without this guard a
        # field line ahead of the first id failed with a TypeError on None.
        if study is None:
            logging.fatal("unexpected content at line " + str(lnum)
                          + " of dbGaP studies: " + line)
            sys.exit(1)

        # study description
        m = descr_re.match(line)
        if m is not None:
            study['descr'] = m.group(1)
            continue

        # embargo release(s)
        m = version_re.match(line)
        if m is not None:
            study.setdefault('versions', []).append(m.group(1))
            continue

        # details/participants/type of study
        m = details_re.match(line)
        if m is not None:
            study['n_participants'] = int(m.group(1))
            study['study_type'] = m.group(2)
            continue

        # platform
        m = platform_re.match(line)
        if m is not None:
            study['platform'] = m.group(1)
            continue

        # parse error
        logging.fatal("unexpected content at line " + str(lnum)
                      + " of dbGaP studies: " + line)
        sys.exit(1)

    n_studies = len(studies)
    logging.info("found " + str(n_studies) + " GTEx study in dbGaP")

    # convert studies to DATS Datasets
    datasets = []
    for s in studies:
        m = accession_re.match(s['id'])
        if m is None:
            logging.fatal(
                "unable to parse dataset/study version from study id " + s['id'])
            sys.exit(1)
        version = m.group(1)

        dimensions = [
            DatsObj("Dimension", [
                ("name", {"value": "Actual Subject Count"}),
                ("description",
                 "The actual number of subjects entered into a clinical trial."),
                ("types", [util.get_annotation("Actual Subject Number")]),
                ("values", [s['n_participants']])
            ])
        ]

        types = [
            OrderedDict([
                ("information", util.get_annotation("DNA sequencing")),
                ("method", util.get_annotation("whole genome sequencing assay")),
                ("platform", HISEQ_TYPES[s['platform']])
            ])
        ]

        # TODO - Specify creators and release date(s) of this particular dataset.
        # This may require parsing some of the metadata files and/or documents.
        # TODO - required field - placeholder creator; need to revisit and assign
        # specific study-level creator.
        # NOTE(review): this comment used to name NIH NHLBI while the code
        # uses NIH_NHGRI - confirm which organization is intended.
        creators = [NIH_NHGRI]

        # TODO - find better location for study_type?
        extra_props = [
            DatsObj("CategoryValuesPair", [("category", "study_type"),
                                           ("values", [s['study_type']])])
        ]

        # Dataset
        # The title is a fixed string; the parsed s['descr'] is not used.
        # TODO - dates, producedBy and distributions are not populated yet.
        dataset = DatsObj(
            "Dataset",
            [
                ("identifier", DatsObj("Identifier", [("identifier", s['id'])])),
                ("version", version),
                ("title",
                 "Genotype-Tissue Expression Project (GTEx) WGS and RNA-Seq data"),
                ("storedIn", DB_GAP),
                ("types", types),
                ("creators", creators),
                ("dimensions", dimensions),
                ("extraProperties", extra_props)
            ])

        datasets.append(dataset)

    return datasets
#!/usr/bin/env python3 from ccmm.dats.datsobj import DatsObj import ccmm.mgd.human_homologs from collections import OrderedDict import csv import gzip import logging import re import sys EXPECTED_GENOME_BUILD = "GRCm38-C57BL/6J" N_GFF_FIELDS = 9 MGI = DatsObj("Organization", [("abbreviation", "MGI"), ("name", "Mouse Genome Informatics")]) MGD = DatsObj("DataRepository", [("name", "MGD"), ("description", "Mouse Genome Database"), ("publishers", [MGI])]) ## Ontology for Biomedical Investigations # "DNA sequencing" DNA_SEQUENCING_TYPE = OrderedDict([ ("value", "DNA sequencing"), ("valueIRI", "http://purl.obolibrary.org/obo/OBI_0000626") ]) MGD_TYPES = [ DatsObj("DataType", [("information", DNA_SEQUENCING_TYPE)]) # TODO - add other types e.g., genes, human homologs, other feature types