Example #1
    def make_var_dimension(name, var_value):
        value = var_value["value"]

        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])

        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dbgap_var_dim = var_value["var"]["dim"]
            dim.setProperty("identifier",
                            dbgap_var_dim.get("identifier").getIdRef())
        return dim
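
Note: DatsObj comes from the project-specific ccmm.dats.datsobj module, so these snippets do not run on their own. Below is a minimal, hypothetical stand-in for the interface the examples rely on (get, set/setProperty, getProperty, getIdRef), assuming getIdRef returns a JSON-LD-style id reference. It is a sketch for experimentation, not the real implementation; the dbGaP accession in the usage lines is made up.

from collections import OrderedDict

class DatsObj:
    """Hypothetical stand-in for ccmm.dats.datsobj.DatsObj (interface assumed)."""
    _counter = 0

    def __init__(self, obj_type, atts=None, id=None):
        DatsObj._counter += 1
        self.type = obj_type
        self.id = id if id is not None else "#obj-" + str(DatsObj._counter)
        self.props = OrderedDict(atts or [])

    def get(self, name):
        return self.props.get(name)

    def set(self, name, value):
        self.props[name] = value

    # the snippets call both set() and setProperty(); assume they are synonyms
    setProperty = set

    def getProperty(self, name):
        return self.props[name]

    def getIdRef(self):
        # assumed to return a JSON-LD-style reference to this object
        return {"@id": self.id}

# mirror the body of make_var_dimension above with a hypothetical dbGaP variable
dataset_dim = DatsObj("Dimension",
                      [("identifier",
                        DatsObj("Identifier",
                                [("identifier", "phv00000001.v1")]))])
d = DatsObj("Dimension",
            [("name", DatsObj("Annotation", [("value", "SYSBP")])),
             ("values", ["120"])])
d.setProperty("identifier", dataset_dim.get("identifier").getIdRef())
print(d.get("identifier"))  # -> {'@id': '#obj-...'}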
Example #2
def add_study_vars(study, study_md):
    
    # maps dbGaP variable id to DATS dimension and variable report
    id_to_var = {}
    # maps variable type (e.g., Subject, Sample_Attributes), name and consent group to DATS dimension and variable report
    type_name_cg_to_var = {}

    for var_type in ('Subject', 'Subject_Phenotypes', 'Sample', 'Sample_Attributes'):
        if var_type in study_md:
            var_data = study_md[var_type]['data_dict']['data']
            vars = var_data['vars']
            vdict = {}
            type_name_cg_to_var[var_type] = vdict

            for var in vars:
                var_name = var['name']
                id = DatsObj("Identifier", [
                    ("identifier",  var['id']),
                    ("identifierSource", "dbGaP")])
        
                dim = DatsObj("Dimension", [
                    ("identifier", id),
                    ("name", DatsObj("Annotation", [("value", var_name)])),
                    ("description", var['description'])
                    # TODO: include stats
                ])  

                study.getProperty("dimensions").append(dim)
            
                # track dbGaP variable Dimension and variable report by dbGaP id
                if var['id'] in id_to_var:
                    logging.fatal("duplicate definition found for dbGaP variable " + var_name + " with accession=" + var['id'])
                    sys.exit(1)

                t = {"dim": dim, "var": var}
                id_to_var[var['id']] = t
                
                # track by name and consent group
                m = re.match(r'^(.*)(\.(c\d+))$', var['id'])
        
                if m is None:
                    suffix = ""
                else:
                    suffix = "." + m.group(3)

                key = "".join([var_name, suffix])
                if key in vdict:
                    logging.fatal("duplicate definition found for dbGaP variable " + key + " in " + var_type + " file")
                vdict[key] = t

    return { "id_to_var": id_to_var, "type_name_cg_to_var": type_name_cg_to_var }
Example #3
def get_subject_dats_material(cache, p_subject, gh_subject, var_lookup):
    subj_id = p_subject['SUBJID']['mapped_value']

    # retrieve id reference for the Identifier of the DATS Dimension for the "all subjects" consent group version of the variable
    def get_var_id(name):
        return var_lookup[name]['dim'].get("identifier").getIdRef()

    # human experimental subject/patient
    subject_sex = DatsObj(
        "Dimension", [("name", util.get_value_annotation("Gender", cache)),
                      ("description", "Gender of the subject"),
                      ("identifier", get_var_id("SEX")),
                      ("values", [p_subject['SEX']['mapped_value']])])

    subject_age = DatsObj(
        "Dimension", [("name", util.get_value_annotation("Age range", cache)),
                      ("description", "Age range of the subject"),
                      ("identifier", get_var_id("AGE")),
                      ("values", [p_subject['AGE']['mapped_value']])])

    subject_hardy_scale = DatsObj(
        "Dimension",
        [("name", util.get_value_annotation("Hardy scale", cache)),
         ("description", "Hardy scale death classification for the subject"),
         ("identifier", get_var_id("DTHHRDY")),
         ("values", [p_subject['DTHHRDY']['mapped_value']])])

    subject_characteristics = [subject_sex, subject_age, subject_hardy_scale]

    # use URI from GTEx id dump if present
    identifier = subj_id
    if gh_subject is not None:
        identifier = gh_subject['Destination URL']['raw_value']

    # human experimental subject/patient
    subject_material = DatsObj(
        "Material",
        [("name", subj_id),
         ("identifier", DatsObj("Identifier", [("identifier", identifier)])),
         ("description", "GTEx subject " + subj_id),
         ("characteristics", subject_characteristics),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", util.get_donor_roles(cache))])

    # add to the cache
    subj_key = ":".join(["Material", subj_id])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: subject_material)

    return dats_subj
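
cache.get_obj_or_ref(key, factory) is used throughout these examples to avoid serializing the same DATS object twice. Its contract appears to be: on the first lookup, build and return the full object; on later lookups, return only an id reference. A minimal sketch of that contract, assuming cached objects expose getIdRef(); the real DatsObjCache may differ:

class DatsObjCache:
    """Minimal sketch of the assumed get_obj_or_ref contract."""

    def __init__(self):
        self._objs = {}

    def get_obj_or_ref(self, key, make_obj):
        if key not in self._objs:
            # first request: build, cache, and return the full object
            self._objs[key] = make_obj()
            return self._objs[key]
        # subsequent requests: return only an id reference
        return self._objs[key].getIdRef()

# usage, with the DatsObj stand-in sketched after Example #1 (ids hypothetical):
#   cache = DatsObjCache()
#   full = cache.get_obj_or_ref("Material:GTEX-XXXX", lambda: DatsObj("Material", [("name", "GTEX-XXXX")]))
#   ref  = cache.get_obj_or_ref("Material:GTEX-XXXX", lambda: DatsObj("Material", [("name", "GTEX-XXXX")]))  # -> {'@id': ...}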
Example #4
    def make_anat_part(anat_id, anatomy_name):
        # EFO id
        if re.match(r'^EFO_\d+', anat_id):
            anatomy_identifier = OrderedDict([("identifier", anat_id),
                                              ("identifierSource", "EFO")])
            anatomy_alt_ids = [
                OrderedDict([(
                    "identifier",
                    "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form="
                    + str(anat_id)), ("identifierSource", "EFO")])
            ]
        # Uberon id
        else:
            anatomy_identifier = OrderedDict([("identifier",
                                               "UBERON:" + str(anat_id)),
                                              ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier",
                     "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON")
                ])
            ]

        # anatomical part
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])

        return anatomical_part
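
The EFO/Uberon branching can be tested without DatsObj, since the identifiers are plain OrderedDicts. A standalone version of just the id-building logic (the EFO accession below is hypothetical):

import re
from collections import OrderedDict

def make_anatomy_ids(anat_id):
    # mirror the EFO-vs-Uberon branch above, returning (identifier, alternateIdentifiers)
    if re.match(r'^EFO_\d+', anat_id):
        ident = OrderedDict([("identifier", anat_id),
                             ("identifierSource", "EFO")])
        alt_ids = [OrderedDict([
            ("identifier",
             "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + anat_id),
            ("identifierSource", "EFO")])]
    else:
        ident = OrderedDict([("identifier", "UBERON:" + anat_id),
                             ("identifierSource", "UBERON")])
        alt_ids = [OrderedDict([
            ("identifier", "http://purl.obolibrary.org/obo/UBERON_" + anat_id),
            ("identifierSource", "UBERON")])]
    return ident, alt_ids

print(make_anatomy_ids("0002190")[0])      # Uberon-style numeric id
print(make_anatomy_ids("EFO_0001234")[0])  # hypothetical EFO accession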
Example #5
def get_dats_id_aux(id_type, source, id, rel_type):
    url_prefix = ID_URL_PREFIXES[source]
    atts = [("identifier", url_prefix + id)]
    atts.append(("identifierSource", source))
    if rel_type is not None:
        atts.append(("relationType", rel_type))
    return DatsObj(id_type, atts)
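
get_dats_id_aux simply resolves a per-source URL prefix and assembles the attribute list. A self-contained sketch with a hypothetical ID_URL_PREFIXES table (the real one lives elsewhere in the module) and a plain dict standing in for DatsObj:

from collections import OrderedDict

# hypothetical prefix table; the real ID_URL_PREFIXES is defined elsewhere
ID_URL_PREFIXES = {
    "NCBI_Gene": "https://www.ncbi.nlm.nih.gov/gene/",
    "MGI": "http://www.informatics.jax.org/marker/",
}

def get_dats_id_aux(id_type, source, id, rel_type=None):
    atts = [("identifier", ID_URL_PREFIXES[source] + id),
            ("identifierSource", source)]
    if rel_type is not None:
        atts.append(("relationType", rel_type))
    return OrderedDict([("type", id_type)] + atts)  # dict in place of DatsObj

print(get_dats_id_aux("RelatedIdentifier", "NCBI_Gene", "675", "homologous_region"))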
Example #6
def get_ref_genomes(acc_d):
    ref_genomes = []

    for arf in AGR_ref_genomes:
        m = re.match(r'^(\w+_)(\d+\.\d+\.\d+)$', arf)
        if m is not None:
            ref_genome = {'id': m.group(1) + m.group(2)}
            ref_genome['versions'] = m.group(2)
            ref_genomes.append(ref_genome)

    # keep only the reference genomes whose ids appear in acc_d
    ref_genomes = [r for r in ref_genomes if r['id'] in acc_d]

    n_ref_genomes = len(ref_genomes)
    logging.info("found " + str(n_ref_genomes) +
                 " Reference genomes in AGR Directory")

    # convert ref_genomes to DATS Datasets
    datasets = []
    for r in ref_genomes:
        m = re.match(r'^\w+_(\d+\.\d+\.\d+)$', r['id'])
        if m is None:
            logging.fatal("unable to parse reference genome version from id " +
                          r['id'])
            sys.exit(1)
        version = m.group(1)

        types = [OrderedDict([("information", DNA_SEQUENCING_TYPE)])]

        creators = [NIH_NHLBI, NIH_NHGRI]

        # Dataset
        dataset = DatsObj(
            "Dataset",
            [("identifier", DatsObj("Identifier", [("identifier", r['id'])])),
             ("version", version), ("title", AGR_ref_genomes[r['id']]),
             ("storedIn", AGR_BUCKET), ("types", types), ("creators", creators)
             # ("dimensions", dimensions)
             ])
        datasets.append(dataset)

    return datasets
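
Both regexes above assume accessions of the form <assembly>_<major>.<minor>.<patch>. A quick standalone check of that parsing (the accession string is made up):

import re

def parse_ref_genome_acc(acc):
    m = re.match(r'^(\w+_)(\d+\.\d+\.\d+)$', acc)
    if m is None:
        return None
    # id is the full accession; version is the trailing dotted triple
    return {"id": m.group(1) + m.group(2), "version": m.group(2)}

print(parse_ref_genome_acc("GRCm38_1.0.4"))      # hypothetical accession
print(parse_ref_genome_acc("not_an_accession"))  # -> None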
Example #7
def get_dataset_json():
    # individual datasets corresponding to studies within GTEx
    data_subsets = []

    # pull studies from dbGaP
    data_subsets = get_dbgap_studies("phs000424")

    # parent GTEx Dataset that represents the entire GTEx program
    parent_gtex_dataset = DatsObj(
        "Dataset",
        [
            (
                "identifier",
                DatsObj(
                    "Identifier",
                    [
                        # GTEx value - "GTEx_Analysis_2016-01-15_v7_RNA-SEQ"
                        ("identifier",
                         "Genotype-Tissue Expression Project (GTEx)")
                    ])),
            ("title", "Genotype-Tissue Expression Project (GTEx)"),
            ("description", GTEX_DESCRIPTION),
            ("storedIn", DB_GAP),
            ("types", GTEX_V7_TYPES),
            ("creators", [NIH_NHGRI]),
            ("distributions", [
                DatsObj("DatasetDistribution",
                        [("access",
                          DatsObj("Access",
                                  [("landingPage", DBGAP_GTEX_QUERY_URL)]))])
            ]),
            ("hasPart", data_subsets)
        ])

    # TODO - add 'licenses', 'availability', 'dimensions', 'primaryPublications'?

    return parent_gtex_dataset
Example #8
def get_dataset_json():
    # individual datasets corresponding to studies within TOPMed
    data_subsets = []

    # pull studies from dbGaP
    data_subsets = get_dbgap_studies("topmed")

    # parent TOPMed dataset
    parent_topmed_dataset = DatsObj(
        "Dataset",
        [
            (
                "identifier",
                DatsObj("Identifier",
                        [("identifier", "TOPMed")])),
            ("title", "Trans-Omics for Precision Medicine (TOPMed)"),
            ("description", TOPMED_DESCRIPTION),
            ("storedIn", DB_GAP),
            ("types", TOPMED_TYPES),
            ("creators", [NIH_NHLBI]),
            ("distributions", [
                DatsObj("DatasetDistribution",
                        [("access",
                          DatsObj("Access",
                                  [("landingPage", DBGAP_TOPMED_QUERY_URL)]))])
            ]),
            ("hasPart", data_subsets)
        ])

    # TODO - add 'licenses', 'availability', 'dimensions', 'primaryPublications'?

    return parent_topmed_dataset
Example #9
def get_dataset_json(acc_d):

    # individual datasets corresponding to model organisms within AGR data dump
    data_subsets = []

    # pull genome build from AWS files
    data_subsets = get_ref_genomes(acc_d)

    # parent AGR reference genome dataset
    parent_agr_dataset = DatsObj("Dataset", [
        ("identifier", DatsObj("Identifier", [("identifier", "AGR")])),
        ("title", "Alliance of Genome Resources (AGR)"),
        ("description", AGR_DESCRIPTION), ("storedIn", AGR_BUCKET),
        ("types", AGR_TYPES), ("creators", [AGR]),
        ("distributions", [
            DatsObj("DatasetDistribution",
                    [("access",
                      DatsObj("Access", [("landingPage", AGR_DOWNLOAD_URL)]))])
        ]), ("hasPart", data_subsets)
    ])

    # TODO - add 'licenses', 'availability', 'dimensions', 'primaryPublications'?

    return parent_agr_dataset
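
Examples #7 through #9 are structurally identical: build a list of child Datasets, then wrap them in a single parent Dataset whose hasPart points at the children. Reduced to plain dicts, the shared pattern is roughly the schematic below (a sketch, not the project's API):

def make_parent_dataset(identifier, title, child_datasets, **extra_atts):
    # schematic of the parent/child Dataset pattern shared by the three functions above
    parent = {
        "identifier": {"identifier": identifier},
        "title": title,
        "hasPart": child_datasets,
    }
    parent.update(extra_atts)  # e.g. description, storedIn, types, creators, distributions
    return parent

# e.g. make_parent_dataset("TOPMed", "Trans-Omics for Precision Medicine (TOPMed)",
#                          get_dbgap_studies("topmed"), storedIn="dbGaP")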
Example #10
    def make_anat_part(anat_id, anatomy_name):
        # anatomical part
        anatomical_part = DatsObj("AnatomicalPart", [("name", anatomy_name)])

        if anat_id is not None:
            anatomy_identifier = OrderedDict([("identifier",
                                               "UBERON:" + str(anat_id)),
                                              ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier",
                     "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON")
                ])
            ]

            anatomical_part.set("identifier", anatomy_identifier)
            anatomical_part.set("alternateIdentifiers", anatomy_alt_ids)

        return anatomical_part
Example #11
def get_dataset_json():
    set_dataset_guids()
    # individual RNA-Seq datasets/files
    rnaseq_data_subsets = []

    # create DATS Dataset for each RNA-Seq data product
    for dss in RNASEQ_DATASETS:
        descr = dss["descr"]
        file = dss["file"]

        analysis = dss["analysis"]
        measures = analysis["measures"]
        uses = analysis["uses"]
        # "The name of the activity, usually one sentece or short description of the data analysis."
        analysis_name = analysis["name"]
        # "A textual narrative comprised of one or more statements describing the data analysis."
        #        analysis_descr = analysis["descr"]

        # DataAnalysis
        data_analysis = DatsObj(
            "DataAnalysis",
            [
                ("name", analysis_name),
                #                ("description", analysis_descr),
                ("measures", measures),
                ("uses", uses)
            ])

        # Dataset
        subset = DatsObj(
            "Dataset",
            id=dss["doi"],
            atts=[
                ("identifier",
                 DatsObj("Identifier",
                         [("identifier",
                           "GTEx_Analysis_2016-01-15_v7_RNA-SEQ_" + file)])),
                ("version", "v7"),
                ("dates", [GTEX_V7_RELEASE_DATE]),
                ("title", "GTEx v7 RNA-Seq Analysis, " + descr),
                ("storedIn", DB_GAP),
                ("types", [GTEX_V7_RNASEQ_TYPE]),
                ("creators", [GTEX_CONSORTIUM]),
                ("producedBy", data_analysis),
                # TODO - where does the actual filename belong?
                ("distributions", [
                    DatsObj("DatasetDistribution",
                            [("access",
                              DatsObj("Access",
                                      [("landingPage", GTEX_DATASETS_URL)]))])
                ]),
            ])
        rnaseq_data_subsets.append(subset)

    # parent RNA-Seq dataset
    parent_rnaseq_dataset = DatsObj(
        "Dataset",
        [("identifier",
          DatsObj("Identifier",
                  [("identifier", "GTEx_Analysis_2016-01-15_v7_RNA-SEQ")])),
         ("version", "v7"), ("dates", [GTEX_V7_RELEASE_DATE]),
         ("title", "GTEx v7 RNA-Seq Analysis"), ("storedIn", DB_GAP),
         ("types", [GTEX_V7_RNASEQ_TYPE]), ("creators", [GTEX_CONSORTIUM]),
         ("distributions", [
             DatsObj("DatasetDistribution", [
                 ("access",
                  DatsObj("Access", [("landingPage", GTEX_DATASETS_URL)]))
             ])
         ]), ("hasPart", rnaseq_data_subsets)])

    # parent GTEx dataset
    gtex_dataset = DatsObj(
        "Dataset",
        [
            ("identifier",
             DatsObj("Identifier", [("identifier", GTEX_DB_GAP_ID)])),
            ("version", "v7"),
            ("dates", [GTEX_V7_RELEASE_DATE]),
            ("title", "Genotype-Tissue Expression Project (GTEx)"),
            ("storedIn", DB_GAP),
            # TODO add types for parent GTEx project
            ("types", GTEX_V7_TYPES),
            ("creators", [GTEX_CONSORTIUM]),
            ("distributions", [
                DatsObj("DatasetDistribution", [
                    ("access",
                     DatsObj("Access", [("landingPage", GTEX_DB_GAP_URL)]))
                ])
            ]),
            ("hasPart", [parent_rnaseq_dataset])
        ])

    # TODO - add 'licenses', 'availability', 'dimensions', 'primaryPublications'?
    return gtex_dataset
Example #12
# "Illumina"
ILLUMINA_TYPE = OrderedDict([("value", "Illumina"),
                             ("valueIRI",
                              "http://purl.obolibrary.org/obo/OBI_0000759")])
# "DNA sequencing"
DNA_SEQUENCING_TYPE = OrderedDict([
    ("value", "DNA sequencing"),
    ("valueIRI", "http://purl.obolibrary.org/obo/OBI_0000626")
])
# "whole genome sequencing assay"
WGS_ASSAY_TYPE = OrderedDict([("value", "whole genome sequencing assay"),
                              ("valueIRI",
                               "http://purl.obolibrary.org/obo/OBI_0002117")])

# documentation for earlier version at https://data.broadinstitute.org/cancer/cga/tools/rnaseqc/RNA-SeQC_Help_v1.1.2.pdf
RNA_SEQ_QC = DatsObj("Software", [("name", "RNASeQC"), ("version", "v1.1.8")])

# gene read counts Dimension
GENE_READ_COUNTS_DIM = DatsObj("Dimension",
                               [("name", GENE_READ_COUNT_NAME),
                                ("description", "gene read counts"),
                                ("types", [COUNT_TYPE])])

TRANSCRIPT_READ_COUNTS_DIM = DatsObj(
    "Dimension", [("name", TRANSCRIPT_READ_COUNT_NAME),
                  ("description", "transcript read counts"),
                  ("types", [COUNT_TYPE])])

EXON_READ_COUNTS_DIM = DatsObj("Dimension",
                               [("name", EXON_READ_COUNT_NAME),
                                ("description", "exon read counts"),
                                ("types", [COUNT_TYPE])])
Example #13
def get_sample_dats_material(cache, dats_subject, p_sample, gh_sample,
                             var_lookup):
    samp_id = p_sample['SAMPID']['mapped_value']
    subj_id = p_sample['SUBJID']['mapped_value']

    # retrieve id reference for the Identifier of the DATS Dimension for the "all subjects" consent group version of the variable
    def get_var_id(name):
        return var_lookup[name]['dim'].get("identifier").getIdRef()

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = p_sample['SMUBRID']['mapped_value']
    if anat_id is None:
        print("No Uberon/anatomy ID specified for sample " + samp_id)
        sys.exit(1)

    anatomy_identifier = None
    anatomy_alt_ids = None
    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = p_sample['SMTSD']['mapped_value']

    def make_anat_part(anat_id, anatomy_name):
        # EFO id
        if re.match(r'^EFO_\d+', anat_id):
            anatomy_identifier = OrderedDict([("identifier", anat_id),
                                              ("identifierSource", "EFO")])
            anatomy_alt_ids = [
                OrderedDict([(
                    "identifier",
                    "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form="
                    + str(anat_id)), ("identifierSource", "EFO")])
            ]
        # Uberon id
        else:
            anatomy_identifier = OrderedDict([("identifier",
                                               "UBERON:" + str(anat_id)),
                                              ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier",
                     "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON")
                ])
            ]

        # anatomical part
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])

        return anatomical_part

    # use cached value for AnatomicalPart if possible
    anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
    anatomical_part = cache.get_obj_or_ref(
        anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))

    # use URI from GitHub GTEx id dump if available
    identifier = samp_id
    if gh_sample is not None:
        identifier = gh_sample['Destination URL']['raw_value']

    subj_key = ":".join(["Material", dats_subject.get("name")])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)

    # add sample characteristics from p_sample metadata
    sample_chars = []
    for key in p_sample:
        if re.match(r'^(subject|id)$', key):
            continue
        # TODO - currently including only a small subset of the available values for demonstration purposes
        if not re.match(r'^(SMATSSCR|SMRIN|SMMAPRT|SMGNSDTC)$', key):
            continue
        var = p_sample[key]
        #        print("got key=" + key + " var=" + str(var))
        mapped_val = var['mapped_value']
        char = DatsObj("Dimension",
                       [("name", util.get_value_annotation(key, cache)),
                        ("identifier", get_var_id(key)),
                        ("values", [mapped_val])])
        sample_chars.append(char)

    # biological/tissue sample
    biological_sample_material = DatsObj(
        "Material",
        [("name", samp_id), ("identifier", {
            "identifier": identifier
        }),
         ("description",
          anatomy_name + " specimen collected from subject " + subj_id),
         ("characteristics", sample_chars),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation("specimen", cache)]),
         ("derivesFrom", [dats_subj, anatomical_part])])

    # analysis freeze classification
    smafrze = p_sample['SMAFRZE']['mapped_value']
    # expected sequence type depending on data freeze classification
    expected_stype = None

    if smafrze == "RNASEQ":
        expected_stype = "RNA"
    elif smafrze == "WGS":
        expected_stype = "DNA"
    elif smafrze == "WES":
        expected_stype = "DNA"
    # Illumina OMNI SNP Array
    elif smafrze == "OMNI":
        expected_stype = "DNA"
    elif smafrze == "EXCLUDE":
        pass
    else:
        logging.fatal("unknown SMAFRZE " + smafrze)
        sys.exit(1)

    # sample type - DNA or RNA
    stype = None
    smnabtcht = p_sample['SMNABTCHT']['mapped_value']
    if re.match(r'^DNA ([iI]solation|[eE]xtraction).*', smnabtcht):
        stype = 'DNA'
    elif re.match(r'^RNA ([iI]solation|[eE]xtraction).*', smnabtcht):
        stype = 'RNA'
    elif re.match(
            r'DNA or RNA Extraction from Paxgene-derived Lysate Plate Based',
            smnabtcht):
        stype = 'RNA'
    elif re.match(r'Transfer To Matrix \(Manual\)', smnabtcht):
        stype = 'DNA'

    if stype is None:
        if expected_stype is not None:
            stype = expected_stype
        else:
            print("couldn't determine sequence type for smafrze=" + smafrze +
                  " smnabtcht=" + smnabtcht)
            return None
    else:
        if (expected_stype is not None) and (stype != expected_stype):
            logging.fatal("seq type " + stype +
                          " doesn't match expected stype " + expected_stype)
            sys.exit(1)

    # DNA or RNA extract
    dna_or_rna_material = DatsObj(
        "Material",
        [("name", stype + " from " + samp_id),
         ("description", "total " + stype + " extracted from " + anatomy_name +
          " specimen collected from subject " + subj_id),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation(stype + " extract", cache)]),
         ("derivesFrom", [biological_sample_material])])

    return dna_or_rna_material
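
The freeze/batch cross-check above has two independent signals: SMAFRZE implies the expected molecule type, and SMNABTCHT (the nucleic-acid isolation batch description) determines the actual one. The batch-type rules can be checked in isolation; the batch strings below are abbreviated/hypothetical:

import re

def stype_from_batch_type(smnabtcht):
    # same SMNABTCHT pattern rules as above
    if re.match(r'^DNA ([iI]solation|[eE]xtraction).*', smnabtcht):
        return 'DNA'
    if re.match(r'^RNA ([iI]solation|[eE]xtraction).*', smnabtcht):
        return 'RNA'
    if re.match(r'DNA or RNA Extraction from Paxgene-derived Lysate Plate Based',
                smnabtcht):
        return 'RNA'
    if re.match(r'Transfer To Matrix \(Manual\)', smnabtcht):
        return 'DNA'
    return None

print(stype_from_batch_type("RNA isolation from PAXgene tube"))  # -> RNA
print(stype_from_batch_type("DNA Extraction from whole blood"))  # -> DNA
print(stype_from_batch_type("something unexpected"))             # -> None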
Example #14
def get_dataset_json(gff3_path, human_homologs_path):
    # read human homologs from MGI human/mouse sequence file
    mgi2hgene_h = ccmm.mgd.human_homologs.read_mgd_mouse_human_seq_file(
        human_homologs_path)

    # read mouse features/genes from GFF3
    data = read_mgd_gff3(gff3_path)

    # molecular entities that represent genes and other top-level features of interest
    entities = []

    # counts of mouse genes/features with and without a human homolog
    n_homolog = 0
    n_no_homolog = 0
    n_genes = 0

    # relationType is a string/uri
    h_region = SO_TERMS['homologous_region'].get('valueIRI')

    # TODO - add taxonomy, either in 'isAbout' or in each individual gene (or both)

    for f in data['features']:
        if f['source'] == 'MGI':
            if re.match(r'^(gene|pseudogene|sequence_feature)$',
                        f['type']) and (f['bioType'] != 'DNA segment'):
                id = f['ID']
                id = re.sub(r'MGI:MGI:', 'MGI:', id)
                roles = None
                # is this a gene or a gene segment
                is_gene = re.match(r'gene', f['type']) or re.match(
                    r'gene', f['bioType'])
                if is_gene:
                    n_genes += 1

                # specify/map GFF3 feature type to role
                if f['type'] == 'sequence_feature':
                    bioType = re.sub(r' ', '_', f['bioType'])
                    roles = [SO_TERMS[bioType]]
                else:
                    roles = [SO_TERMS[f['type']]]

                # array of dimension or material
                characteristics = [
                    # chromosome
                    DatsObj("Dimension", [("name", {
                        'value': 'chromosome'
                    }), ("types", [SO_TERMS['chromosome']]),
                                          ("values", [f['seqid']])]),
                    # start coordinate
                    # end coordinate
                ]

                # strand
                # direction_attribute only allows 'forward' or 'reverse' so if the strand is unknown the characteristic is omitted
                if f['strand'] in STRAND_CHADO2SO:
                    # map chado strand to name of corresponding SO term
                    SO_strand = STRAND_CHADO2SO[f['strand']]
                    DATS_strand = DatsObj(
                        "Dimension",
                        [
                            ("name", {
                                'value': 'direction_attribute'
                            }),
                            ("types", [SO_TERMS['direction_attribute']]),
                            # in dimension_schema.json 'values' is an array with no other constraints, hence
                            # our use of a string rather than an explicit reference to the SO ID
                            ("values", [SO_strand])
                        ])
                    characteristics.append(DATS_strand)

                # dbxrefs
                alt_ids = []
                if 'Dbxref' in f:
                    dbxref = f['Dbxref']
                    dbxrefs = dbxref.rsplit(',')
                    for dbx in dbxrefs:
                        (src, delim, src_id) = dbx.partition(':')
                        alt_ids.append(get_dats_alternate_id(src, src_id))

                # unharmonized data/anything that doesn't map anywhere else
                extra_props = [
                    DatsObj("CategoryValuesPair",
                            [("category", "reference sequence"),
                             ("values", [f['seqid']])]),
                    DatsObj("CategoryValuesPair",
                            [("category", "start coordinate"),
                             ("values", [f['start']])]),
                    DatsObj("CategoryValuesPair",
                            [("category", "end coordinate"),
                             ("values", [f['end']])]),
                    DatsObj("CategoryValuesPair", [("category", "strand"),
                                                   ("values", [f['strand']])])
                ]

                # human homologs
                related_ids = []
                has_homolog = False

                if id in mgi2hgene_h:
                    hgene = mgi2hgene_h[id]
                    homologene_id = hgene['id']
                    human_genes = []

                    # add HomoloGene reference
                    related_ids.append(
                        get_dats_related_id("NCBI_HomoloGene", homologene_id,
                                            h_region))

                    if 'human' in hgene:
                        human_genes = hgene['human']
                        has_homolog = True
                    for human_gene in human_genes:
                        entrez_gene_id = human_gene['EntrezGene ID']
                        related_ids.append(
                            get_dats_related_id("NCBI_Gene", entrez_gene_id,
                                                h_region))

                if is_gene:
                    if has_homolog:
                        n_homolog += 1
                    else:
                        n_no_homolog += 1

                me = DatsObj('MolecularEntity', [
                    ("name", f['Name']),
                    ("identifier", get_dats_id("MGI", id)),
                    ("alternateIdentifiers", alt_ids),
                    ("relatedIdentifiers", related_ids),
                    ("characteristics", characteristics),
                    ("roles", roles),
                    ("extraProperties", extra_props),
                ])

                entities.append(me)

            else:
                logging.debug("skipped feature of type " + f['type'] +
                              " at line " + str(f['lnum']) + ": mgiName=" +
                              f['mgiName'] + ", bioType=" + f['bioType'])

    # parent MGD reference genome dataset
    parent_mgd_dataset = DatsObj("Dataset", [
        ("identifier",
         DatsObj("Identifier", [("identifier", "GRCm38-C57BL/6J"),
                                ("identifierSource", "MGI")])),
        ("title",
         "GRCm38-C57BL/6J reference genome, genes, and human orthologs"),
        ("description",
         "GRCm38-C57BL/6J reference genome, genes, and human orthologs based on MGI/MGD Unified Mouse Gene Catalog."
         ), ("storedIn", MGD), ("types", MGD_TYPES),
        ("creators",
         [MGI,
          DatsObj("Person", [("email", data['metadata']['Contact'])])]),
        ("distributions", [
            DatsObj("DatasetDistribution", [
                ("access",
                 DatsObj("Access", [("landingPage", MGD_SEQ_DOWNLOAD_URL)]))
            ])
        ]), ("version", data['metadata']['Last updated']),
        ("isAbout", entities)
    ])

    # TODO - add 'licenses', 'availability', 'dimensions', 'primaryPublications'?
    # TODO - add sub-Datasets for the individual MGI files that contributed to the DATS encoding?
    #   metadata['URL'] gives the FTP URI of the source data file

    logging.debug("human homolog found for " + str(n_homolog) + "/" +
                  str(n_genes) + " mouse (pseudo)genes")
    logging.debug("no human homolog found for " + str(n_no_homolog) + "/" +
                  str(n_genes) + " mouse (pseudo)genes")
    return parent_mgd_dataset
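
Two small transformations in this example are easy to verify on their own: MGI GFF3 ID attributes arrive double-prefixed ("MGI:MGI:...") and are collapsed to a single prefix, and Dbxref is a comma-separated list of source:id pairs split with partition(':') so that any further colons stay inside the id. A standalone check (the ids are illustrative):

import re

def normalize_mgi_id(raw_id):
    # collapse the doubled prefix emitted in MGI GFF3 ("MGI:MGI:nnnnn" -> "MGI:nnnnn")
    return re.sub(r'MGI:MGI:', 'MGI:', raw_id)

def parse_dbxrefs(dbxref):
    # split "source:id,source:id,..."; partition keeps colons inside the id intact
    pairs = []
    for dbx in dbxref.rsplit(','):
        src, _, src_id = dbx.partition(':')
        pairs.append((src, src_id))
    return pairs

print(normalize_mgi_id("MGI:MGI:97490"))  # -> MGI:97490
print(parse_dbxrefs("NCBI_Gene:12345,ENSEMBL:ENSMUSG00000000001"))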
Example #15
def get_single_sample_json(sample, dats_obj_cache):
    #    print("converting sample to json: " + str(sample))
    samp_id = sample['SAMPID']['mapped_value']
    subj_id = sample['SUBJID']['mapped_value']
    subject = sample['subject']

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = sample['SMUBRID']['mapped_value']
    if anat_id is None:
        print("No Uberon/anatomy ID specified for sample " + samp_id)
        sys.exit(1)

    anatomy_identifier = None
    anatomy_alt_ids = None
    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = sample['SMTSD']['mapped_value']

    # EFO id
    if re.match(r'^EFO_\d+', anat_id):
        anatomy_identifier = OrderedDict([("identifier", anat_id),
                                          ("identifierSource", "EFO")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier",
                 "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" +
                 str(anat_id)), ("identifierSource", "EFO")
            ])
        ]
    # Uberon id
    else:
        anatomy_identifier = OrderedDict([("identifier",
                                           "UBERON:" + str(anat_id)),
                                          ("identifierSource", "UBERON")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier",
                 "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                ("identifierSource", "UBERON")
            ])
        ]

    # anatomical part
    anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
    if anat_part_key in dats_obj_cache:
        anatomical_part = dats_obj_cache[anat_part_key]
    else:
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])
        dats_obj_cache[anat_part_key] = anatomical_part

    # human experimental subject/patient
    subject_sex = DatsObj(
        "Dimension", [("name", DatsObj("Annotation", [("value", "Gender")])),
                      ("description", "Gender of the subject"),
                      ("identifier",
                       DatsObj("Identifier", [("identifier", "SEX"),
                                              ("identifierSource", "GTEx")])),
                      ("values", [subject['SEX']['mapped_value']])])

    subject_age = DatsObj(
        "Dimension",
        [("name", DatsObj("Annotation", [("value", "Age range")])),
         ("description", "Age range of the subject"),
         ("identifier",
          DatsObj("Identifier", [("identifier", "AGE"),
                                 ("identifierSource", "GTEx")])),
         ("values", [subject['AGE']['mapped_value']])])

    subject_hardy_scale = DatsObj(
        "Dimension",
        [("name", DatsObj("Annotation", [("value", "Hardy scale")])),
         ("description", "Hardy scale death classification for the subject"),
         ("identifier",
          DatsObj("Identifier", [("identifier", "DTHHRDY"),
                                 ("identifierSource", "GTEx")])),
         ("values", [subject['DTHHRDY']['mapped_value']])])

    subject_characteristics = [subject_sex, subject_age, subject_hardy_scale]

    # human experimental subject/patient
    subj_key = ":".join(["Material", subj_id])
    if subj_key in dats_obj_cache:
        subject_material = dats_obj_cache[subj_key]
    else:
        subject_material = DatsObj(
            "Material", [("name", subj_id),
                         ("identifier", {
                             "identifier": subj_id
                         }), ("description", "GTEx subject " + subj_id),
                         ("characteristics", subject_characteristics),
                         ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
                         ("roles", util.get_donor_roles(dats_obj_cache))])
        dats_obj_cache[subj_key] = subject_material

    specimen_annot = util.get_annotation("specimen", dats_obj_cache)
    rna_extract_annot = util.get_annotation("RNA extract", dats_obj_cache)

    # biological/tissue sample
    sample_name = samp_id
    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name), ("identifier", {
            "identifier": samp_id
        }),
         ("description",
          anatomy_name + " specimen collected from subject " + subj_id),
         ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
         ("roles", [specimen_annot]),
         ("derivesFrom", [subject_material, anatomical_part])])

    # RNA extracted from tissue sample
    rna_material = DatsObj(
        "Material",
        [("name", "RNA from " + sample_name),
         ("description", "total RNA extracted from " + anatomy_name +
          " specimen collected from subject " + subj_id),
         ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
         ("roles", [rna_extract_annot]),
         ("derivesFrom", [biological_sample_material])])

    return rna_material
Example #16
def update_single_subject(cache, study, study_md, subj, subj_var_values,
                          use_all_dbgap_vars):

    # extract subject attributes
    gender = None
    age = None
    visit_year = None
    sys_bp = None
    dias_bp = None
    disease = {}
    disease['hypertension'] = "unknown"

    for name in subj_var_values:
        name_upper = name.upper()
        if name_upper == "GENDER" or name_upper == "SEX":
            gender = subj_var_values[name]['value'].lower()
        elif name_upper == "VISIT_AGE" or name_upper == "AGE" or name_upper == "AGE_ENROLL":  #need to confirm that these  allmean the same thing
            age = subj_var_values[name]['value']
        elif name_upper == "VISIT_YEAR":
            visit_year = subj_var_values[name]['value']
        elif name_upper == "SYSBP":
            sys_bp = subj_var_values[name]['value']
        elif name_upper == "DIASBP":
            dias_bp = subj_var_values[name]['value']
        elif name_upper == "HYPERTENSION" or name_upper == "MHHTN":
            if subj_var_values[name]['value'].lower(
            ) == "yes" or subj_var_values[name]['value'] == '1':
                disease['hypertension'] = "yes"
            elif re.match(r'\S', subj_var_values[name]['value']):
                disease['hypertension'] = "no"

    subject_characteristics = []
    subject_bearerOfDisease = []

    # harmonized/standardized characteristics
    if gender is not None:
        # TODO - this
        ss = None
        if gender == "2":
            ss = "female"
        if gender == "1":
            ss = "male"
        subject_sex = DatsObj(
            "Dimension",
            [("name", DatsObj("Annotation", [("value", "Gender")])),
             ("description", "Gender of the subject"), ("values", [ss])])
        subject_characteristics.append(subject_sex)

    if age is not None:
        subject_age = DatsObj(
            "Dimension", [("name", DatsObj("Annotation", [("value", "Age")])),
                          ("description", "Age of the subject"),
                          ("values", [age])])
        subject_characteristics.append(subject_age)

    if visit_year is not None:
        subject_visitYear = DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", "Visit year")])),
            ("description", "Year of visit, to use for longitudinal analysis"),
            ("values", [visit_year])
        ])
        subject_characteristics.append(subject_visitYear)

    if sys_bp is not None:
        subject_sysBP = DatsObj(
            "Dimension",
            [("name",
              DatsObj("Annotation", [("value", "Systolic blood pressure")])),
             ("description",
              "Systolic blood pressure of subject, measured in mmHg"),
             ("values", [sys_bp])])
        subject_characteristics.append(subject_sysBP)

    if dias_bp is not None:
        subject_diasBP = DatsObj(
            "Dimension",
            [("name",
              DatsObj("Annotation", [("value", "Diastolic blood pressure")])),
             ("description",
              "Diastolic blood pressure of subject, measured in mmHg"),
             ("values", [dias_bp])])
        subject_characteristics.append(subject_diasBP)

    if disease['hypertension'] != "unknown":
        disease_name = "hypertension"
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + str(disease_id)),
            ("identifierSource", "Disease Ontology")
        ])
        disease_alt_ids = [
            OrderedDict([
                ("identifier",
                 "http://purl.obolibrary.org/obo/DOID_" + str(disease_id)),
                ("identifierSource", "Disease Ontology")
            ])
        ]
        subject_hypertension = DatsObj(
            "Disease",
            [("name", "Hypertension"), ("identifier", disease_identifier),
             ("alternateIdentifiers", disease_alt_ids),
             ("diseaseStatus",
              DatsObj("Annotation", [("value", disease['hypertension'])]))])
        subject_bearerOfDisease.append(subject_hypertension)

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]

        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])

        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dim.setProperty(
                "identifier",
                var_value["var"]["dim"].get("identifier").getIdRef())

        return dim

    # add "raw" characteristics / DATS Dimensions for _all_ dbGaP subject metadata
    # create DATS Dimensions for dbGaP subject metadata
    if use_all_dbgap_vars:
        subject_dimensions = [
            make_var_dimension(vname, subj_var_values[vname])
            for vname in sorted(subj_var_values)
        ]
        subject_characteristics.extend(subject_dimensions)

    # update subject
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    subj.set("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")])
    subj.set("bearerOfDisease", subject_bearerOfDisease)

    # update characteristics
    chars = subj.get("characteristics")
    new_chars = []

    # remove values from public data release that will be overwritten
    for c in chars:
        name = c.get("name")
        # TODO - 'member of study group' is string, not Annotation
        if not isinstance(name, DatsObj) and name == 'member of study group':
            new_chars.append(c)
    new_chars.extend(subject_characteristics)
    subj.set("characteristics", new_chars)
Example #17
#!/usr/bin/env python3

from ccmm.dats.datsobj import DatsObj
import ccmm.dats.util as util
import ccmm.topmed.dna_extracts as dna_extracts
from collections import OrderedDict
import logging
import re
import sys

NIH_NHLBI = DatsObj("Organization", [(
    "name",
    "The National Institutes of Health's National Heart, Lung, and Blood Institute"
), ("abbreviation", "NHLBI")])

# Produce a DATS Material for a single sample.


def get_sample_dats_material(cache, dats_subject, study, study_md,
                             samp_var_values):

    # Almost all samples in the TOPMed WGS phase are blood samples, named "Blood", "Peripheral Blood", etc.
    # A few samples are saliva samples, probably due to sample collection issues.
    name = None
    if 'BODY_SITE' in samp_var_values:
        name = 'BODY_SITE'
    elif 'Body_Site' in samp_var_values:
        name = 'Body_Site'
    elif 'Body Site' in samp_var_values:
        name = 'Body Site'
Example #18
#HiSeq 2000
#HiSeq 2000
#GeneChip Human Gene 1.0 ST Array
#HiSeq 2000
#HiSeq X Ten

DBGAP_QUERY_URL_PREFIX = 'https://www.ncbi.nlm.nih.gov/gap/?term='
DBGAP_GTEX_QUERY_URL = DBGAP_QUERY_URL_PREFIX + 'phs000424'

HISEQ_TYPES = {
    "HiSeq 2000": util.get_annotation("Illumina HiSeq 2000"),
    "HiSeq X Ten": util.get_annotation("Illumina HiSeq X Ten"),
}

# TODO - duplicated from rnaseq_datasets.py
DB_GAP = DatsObj("DataRepository", [("name", "dbGaP")])

NIH_NHGRI = DatsObj("Organization",
                    [("name", "National Human Genome Research Institute"),
                     ("abbreviation", "NHGRI")])

# TODO - use DatsObjCache
cache = None
GTEX_V7_RNASEQ_TYPE = DatsObj(
    "DataType",
    [("information", util.get_annotation("transcription profiling", cache)),
     ("method", util.get_annotation("RNA-seq assay", cache)),
     ("platform", util.get_annotation("Illumina", cache))])

GTEX_V7_WGS_TYPE = DatsObj(
    "DataType",
    [("information", util.get_annotation("DNA sequencing", cache)),
     ("method", util.get_annotation("whole genome sequencing assay", cache)),
     ("platform", util.get_annotation("Illumina", cache))])
Example #19
def get_files_dats_datasets(cache, dats_samples_d, p_samples, gh_samples,
                            protected_cram_files, no_circular_links):
    file_datasets = []

    rnaseq_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("transcription profiling", cache)),
        ("method", util.get_annotation("RNA-seq assay", cache)),
        ("platform", util.get_annotation("Illumina", cache))
    ])

    def get_rnaseq_datatype():
        dkey = ".".join(["DataType", "RNA-seq"])
        return cache.get_obj_or_ref(dkey, lambda: rnaseq_datatype)

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay",
                                        cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    broad_key = ":".join(["Organization", "Broad Institute"])
    broad = cache.get_obj_or_ref(
        broad_key,
        lambda: DatsObj("Organization", [("name", "Broad Institute")]))
    creators = [broad]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_ds = cache.get_obj_or_ref(cram_ds_key,
                                   lambda: make_data_standard("CRAM"))

    crai_ds_key = ":".join(["DataStandard", "CRAI"])
    crai_ds = cache.get_obj_or_ref(crai_ds_key,
                                   lambda: make_data_standard("CRAI"))

    for sample_id in protected_cram_files:
        file = protected_cram_files[sample_id]
        material_type = None
        ds_types = None

        # determine file type
        if re.search(r'wgs\/', file['cram_file_aws']['raw_value']):
            material_type = 'DNA'
            ds_types = get_wgs_datatype()
            gcp_suffix = '_gcp'
        elif re.search(r'rnaseq\/', file['cram_file_aws']['raw_value']):
            material_type = 'RNA'
            ds_types = get_rnaseq_datatype()
            gcp_suffix = ''
        else:
            logging.fatal(
                "unable to determine material/sequence type from cram_file_aws="
                + file['cram_file_aws']['raw_value'])
            sys.exit(1)

        # RNA-Seq keys = sample_id	cram_file	cram_file_md5	cram_file_size	cram_index	cram_file_aws	cram_index_aws
        # WGS keys = same as above + firecloud_id
        cram_file = file['cram_file' + gcp_suffix]['raw_value']
        cram_file_md5 = file['cram_file_md5']['raw_value']

        # TODO - review the following encoding decisions:
        #  - storing .crai URI as relatedIdentifier of the DatasetDistribution for the .cram file
        #  - storing MD5 checksum of the .cram file as an extraProperty of the DatasetDistribution
        #  - storing firecloud_id as a relatedIdentifier of the Dataset (not the DatasetDistribution)

        # Google Cloud Platform / Google Storage copy
        gs_access = DatsObj(
            "Access",
            [("landingPage", file['cram_file' + gcp_suffix]['raw_value'])])
        gs_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_access),
                ("identifier",
                 DatsObj("Identifier",
                         [("identifier",
                           file['cram_file' + gcp_suffix]['raw_value'])])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier",
                              file['cram_index' + gcp_suffix]['raw_value']),
                             ("relationType", "cram_index")])
                ]),
                ("size", int(file['cram_file_size']['raw_value'])),
                # TODO - add unit for bytes, include IRI?
                #                ("unit", util.get_value_annotation("bytes", cache))
                ("conformsTo", [
                    cache.get_obj_or_ref(cram_ds_key,
                                         lambda: make_data_standard("CRAM"))
                ])
            ])

        # AWS / S3 copy
        s3_access = DatsObj(
            "Access", [("landingPage", file['cram_file_aws']['raw_value'])])
        s3_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_access),
                ("identifier",
                 DatsObj("Identifier", [
                     ("identifier", file['cram_file_aws']['raw_value'])
                 ])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier", [
                        ("identifier", file['cram_index_aws']['raw_value']),
                        ("relationType", "cram_index")
                    ])
                ]),
                ("size", int(file['cram_file_size']['raw_value'])),
                # TODO - add unit for bytes, include IRI?
                #                ("unit", util.get_value_annotation("bytes", cache))
                ("conformsTo", [
                    cache.get_obj_or_ref(cram_ds_key,
                                         lambda: make_data_standard("CRAM"))
                ])
            ])

        m = re.match(r'^.*\/([^\/]+)$', cram_file)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " +
                          cram_file)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [file['cram_file_md5']['raw_value']])])

        ds = DatsObj("Dataset", [
            ("distributions", [gs_distro, s3_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [ds_types]),
            ("creators", creators),
        ])

        # add firecloud_id for WGS
        if 'firecloud_id' in file:
            f_id = DatsObj("RelatedIdentifier",
                           [("identifier", file['firecloud_id']['raw_value']),
                            ("identifierSource", "FireCloud")])
            ds.set("relatedIdentifiers", [f_id])

        # input RNA/DNA extract that was sequenced
        if sample_id not in dats_samples_d:
            logging.fatal("no sample exists for " + sample_id +
                          " found in file " +
                          file['cram_file_aws']['raw_value'])
            sys.exit(1)

        dats_sample = dats_samples_d[sample_id]
        dats_samp_key = ":".join(["Material", dats_sample.get("name")])
        dats_samp = cache.get_obj_or_ref(dats_samp_key, lambda: dats_sample)

        da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_samp])
             #                ("uses", [])                          # software used
             ])

        if not no_circular_links:
            # circular link back to enclosing Dataset as the output
            da.set("output", [ds.getIdRef()])

        ds.set("producedBy", da)
        file_datasets.append(ds)

    return file_datasets
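
The filename extraction uses ^.*\/([^\/]+)$, i.e. everything after the last "/" in the URI. A quick standalone check with a made-up S3 URI (posixpath.basename would give the same result for URIs like these):

import re

def filename_from_uri(uri):
    m = re.match(r'^.*\/([^\/]+)$', uri)
    return None if m is None else m.group(1)

print(filename_from_uri("s3://example-bucket/wgs/NWD000001.b38.irc.v1.cram"))
# -> NWD000001.b38.irc.v1.cram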
Example #20
def make_consent_group(args, group_name, group_index, subject_l,
                       dats_subject_d):

    # find DATS subject that corresponds to each named subject
    dats_subjects_l = []
    # parallel array in which existing subjects are represented by idref
    dats_subjects_idrefs_l = []

    for s in subject_l:
        if s['SUBJID'] not in dats_subject_d:
            logging.warn(
                "GTEx subject " + s['SUBJID'] +
                " not found in public metadata, creating new subject Material")

            # create a new placeholder Material, register it in dats_subject_d, and add it to this group
            subject = DatsObj("Material",
                              [("name", s['SUBJID']), ("characteristics", []),
                               ("description", "GTEx subject " + s['SUBJID'])])
            dats_subject_d[s['SUBJID']] = subject
            dats_subjects_l.append(subject)
            dats_subjects_idrefs_l.append(subject)
        else:
            ds = dats_subject_d[s['SUBJID']]
            dats_subjects_l.append(ds)
            dats_subjects_idrefs_l.append(ds.getIdRef())

    # create StudyGroup and associated ConsentInfo

    # TODO - determine if/where to store group_index (0 or 1)

    # only 2 consent groups in GTEx study:
    #   0 - Subjects did not participate in the study, did not complete a consent document and
    #       are included only for the pedigree structure and/or genotype controls, such as HapMap subjects
    #   1 - General Research Use (GRU)
    consent_info = None
    if group_name == "General Research Use (GRU)":
        # Data Use Ontology for consent info - http://www.obofoundry.org/ontology/duo.html
        #  http://purl.obolibrary.org/obo/DUO_0000005 - "general research use and clinical care"
        #  "This primary category consent code indicates that use is allowed for health/medical/biomedical
        # purposes and other biological research, including the study of population origins or ancestry."
        consent_info = DatsObj(
            "ConsentInfo",
            [("name", group_name), ("abbreviation", "GRU"),
             ("description", group_name),
             ("relatedIdentifiers", [
                 DatsObj("RelatedIdentifier",
                         [("identifier",
                           "http://purl.obolibrary.org/obo/DUO_0000005")])
             ])])
    elif group_name == "Subjects did not participate in the study, did not complete a consent document and are included only for the pedigree structure and/or genotype controls, such as HapMap subjects":
        consent_info = DatsObj("ConsentInfo", [("name", group_name),
                                               ("description", group_name)])
    else:
        logging.fatal("unrecognized consent group " + group_name)
        sys.exit(1)

    group = DatsObj("StudyGroup", [("name", group_name),
                                   ("members", dats_subjects_idrefs_l),
                                   ("size", len(dats_subjects_idrefs_l)),
                                   ("consentInformation", [consent_info])])

    # create link back from each subject to the parent StudyGroup
    if args.no_circular_links:
        logging.warn(
            "not creating Subject level circular links because of --no_circular_links option"
        )
    else:
        for s in dats_subjects_l:
            cl = s.get("characteristics")
            cl.append(
                DatsObj("Dimension", [("name", "member of study group"),
                                      ("values", [group.getIdRef()])]))
    return group
Example #21
def get_subject_dats_material(cache, study, study_md, subj_var_values):

    # extract subject attributes
    gender = None
    age = None
    visit_year = None
    sys_bp = None
    dias_bp = None
    disease = {}
    disease['hypertension'] = "unknown"
    
    for name in subj_var_values:
        name_upper = name.upper()
        if name_upper == "GENDER" or name_upper == "SEX":
            gender = subj_var_values[name]['value'].lower()
        elif name_upper == "VISIT_AGE" or name_upper == "AGE" or name_upper == "AGE_ENROLL": #need to confirm that these  allmean the same thing
            age = subj_var_values[name]['value']
        elif name_upper == "VISIT_YEAR":
            visit_year = subj_var_values[name]['value']
        elif name_upper == "SYSBP":
            sys_bp = subj_var_values[name]['value']
        elif name_upper == "DIASBP":
            dias_bp = subj_var_values[name]['value']
        elif name_upper == "HYPERTENSION" or name_upper == "HIGHBLOODPRES":
            if subj_var_values[name]['value'].lower() == "yes" or subj_var_values[name]['value'] == '1':
                disease['hypertension'] = "yes"
            elif re.match(r'\S', subj_var_values[name]['value']):
                disease['hypertension'] = "no"

    subject_characteristics = []
    subject_bearerOfDisease = []

    # harmonized/standardized characteristics
    if gender is not None:
        subject_sex = DatsObj("Dimension", [
                ("name", DatsObj("Annotation", [("value", "Gender")])),
                ("description", "Gender of the subject"),
                ("values", [ gender ])
                ])
        subject_characteristics.append(subject_sex)

    if age is not None:
        subject_age = DatsObj("Dimension", [
                ("name", DatsObj("Annotation", [("value", "Age")])),
                ("description", "Age of the subject"),
                ("values", [ age ])
                ])
        subject_characteristics.append(subject_age)
    
    if visit_year is not None:
        subject_visitYear = DatsObj("Dimension", [
                ("name", DatsObj("Annotation", [("value", "Visit year")])),
                ("description", "Year of visit, to use for longitudinal analysis"),
                ("values", [ visit_year ])
                ])
        subject_characteristics.append(subject_visitYear)
    
    if sys_bp is not None:
        subject_sysBP = DatsObj("Dimension", [
                ("name", DatsObj("Annotation", [("value", "Systolic blood pressure")])),
                ("description", "Systolic blood pressure of subject, measured in mmHg"),
                ("values", [ sys_bp ])
                ])
        subject_characteristics.append(subject_sysBP)
        
    if dias_bp is not None:
        subject_diasBP = DatsObj("Dimension", [
                ("name", DatsObj("Annotation", [("value", "Diastolic blood pressure")])),
                ("description", "Diastolic blood pressure of subject, measured in mmHg"),
                ("values", [ dias_bp ])
                ])
        subject_characteristics.append(subject_diasBP)                                      
    
    if disease['hypertension'] != "unknown":
        disease_name = "hypertension"
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier",  "DOID:" + str(disease_id)),
            ("identifierSource", "Disease Ontology")])
        disease_alt_ids = [OrderedDict([
            ("identifier", "http://purl.obolibrary.org/obo/DOID_" + str(disease_id)),
            ("identifierSource", "Disease Ontology")])]
        subject_hypertension = DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus", DatsObj("Annotation", [("value", disease['hypertension'] ), ("valueIRI", "")])), 
            ])
        subject_bearerOfDisease.append(subject_hypertension)

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]

        dim = DatsObj("Dimension", 
                      [("name", DatsObj("Annotation", [( "value",  name )])), 
                       ("values", [ value ])
                       ])

        # find existing DATS identifier for the corresponding Dataset Dimension 
        if "var" in var_value:
            dbgap_var_dim = var_value["var"]["dim"]
            dim.setProperty("identifier", dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # create DATS Dimensions for dbGaP subject metadata
    subject_dimensions = [ make_var_dimension(vname, subj_var_values[vname]) for vname in sorted(subj_var_values) ]

    # "raw" characteristics from dbGaP metadata
    subject_characteristics.extend(subject_dimensions)
    
    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']

    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj("Material", [
            ("name", subj_id),
            ("identifier", { "identifier": subj_id }),
            ("alternateIdentifiers", [ util.get_alt_id(dbgap_subj_id, "dbGaP") ]),
            ("description", study_title + " subject " + subj_id),
            ("characteristics", subject_characteristics),
            ("bearerOfDisease", subject_bearerOfDisease),
            ("taxonomy", [ human_t ]),
            ("roles", util.get_donor_roles(cache))
            ])

    # add to the cache
    subj_key = ":".join(["Material", subj_id])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: subject_material)

    return dats_subj
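
# Hypothetical usage sketch (not from the original source): the function above
# expects subj_var_values to map dbGaP variable names to {'value': ...} dicts
# (optionally with a 'var' entry pointing back to the study-level Dimension).
# "cache", "study" and "study_md" are assumed from the surrounding pipeline.
#
#   subj_var_values = {
#       'SUBJECT_ID': {'value': 'SUBJ-1'},
#       'dbGaP_Subject_ID': {'value': '12345'},
#       'GENDER': {'value': 'Female'},
#       'AGE': {'value': '63'},
#   }
#   dats_subj = get_subject_dats_material(cache, study, study_md, subj_var_values)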
Exemplo n.º 22
0
def main():

    # input
    parser = argparse.ArgumentParser(
        description='Create DATS JSON for dbGaP GTEx public metadata.')
    parser.add_argument(
        '--output_file',
        required=True,
        help=
        'Output file path for the DATS JSON file containing the top-level DATS Dataset.'
    )
    parser.add_argument(
        '--dbgap_public_xml_path',
        required=True,
        help=
        'Path to directory that contains public dbGaP metadata files e.g., *.data_dict.xml and *.var_report.xml'
    )
    parser.add_argument(
        '--dbgap_protected_metadata_path',
        required=False,
        help=
        'Path to directory that contains access-controlled dbGaP tab-delimited metadata files.'
    )
    parser.add_argument(
        '--max_output_samples',
        required=False,
        type=int,
        help=
        'Impose a limit on the number of sample Materials in the output DATS. For testing purposes only.'
    )
    parser.add_argument('--subject_phenotypes_path',
                        default=V7_SUBJECT_PHENOTYPES_FILE,
                        required=False,
                        help='Path to ' + V7_SUBJECT_PHENOTYPES_FILE)
    parser.add_argument('--sample_attributes_path',
                        default=V7_SAMPLE_ATTRIBUTES_FILE,
                        required=False,
                        help='Path to ' + V7_SAMPLE_ATTRIBUTES_FILE)
    parser.add_argument(
        '--data_stewards_repo_path',
        default='data-stewards',
        required=False,
        help='Path to local copy of https://github.com/dcppc/data-stewards')
    parser.add_argument(
        '--no_circular_links',
        action='store_true',
        help=
        'Whether to disallow circular links/paths within the JSON-LD output.')
    parser.add_argument(
        '--use_all_dbgap_subject_vars',
        action='store_true',
        help=
        'Whether to store all available dbGaP variable values as characteristics of the DATS subject Materials.'
    )
    #    parser.add_argument('--use_all_dbgap_sample_vars', action='store_true', help ='Whether to store all available dbGaP variable values as characteristics of the DATS sample Materials.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)
    #    logging.basicConfig(level=logging.DEBUG)

    # read portal metadata for subjects and samples
    p_subjects = portal_files.read_subject_phenotypes_file(
        args.subject_phenotypes_path)
    p_samples = portal_files.read_sample_attributes_file(
        args.sample_attributes_path)
    portal_files.link_samples_to_subjects(p_samples, p_subjects)

    # read id dump and manifest files from GitHub data-stewards repo

    # id dumps
    subject_id_file = args.data_stewards_repo_path + "/gtex/v7/id_dumps/gtex_v7_subject_ids.txt"
    gh_subjects = github_files.read_subject_id_file(subject_id_file)
    sample_id_file = args.data_stewards_repo_path + "/gtex/v7/id_dumps/gtex_v7_sample_ids.txt"
    gh_samples = github_files.read_sample_id_file(sample_id_file)
    tissue_id_file = args.data_stewards_repo_path + "/gtex/v7/id_dumps/gtex_v7_tissue_ids.txt"
    gh_tissues = github_files.read_tissue_id_file(tissue_id_file)

    # manifest files
    protected_rnaseq_manifest = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/" + RNASEQ_MANIFEST_FILE
    protected_rnaseq_files = github_files.read_protected_rnaseq_manifest(
        protected_rnaseq_manifest)
    protected_wgs_manifest = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/" + WGS_MANIFEST_FILE
    protected_wgs_files = github_files.read_protected_wgs_manifest(
        protected_wgs_manifest)

    # DOIs
    rnaseq_dois_file = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/" + RNASEQ_DOIS_FILE
    rnaseq_dois = github_files.read_dois_manifest(rnaseq_dois_file)
    wgs_dois_file = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/" + WGS_DOIS_FILE
    wgs_dois = github_files.read_dois_manifest(wgs_dois_file)

    # compare GitHub manifest files with GitHub id dumps
    cross_check_ids(gh_subjects, gh_samples, protected_rnaseq_files,
                    protected_rnaseq_manifest, "RNA-Seq", "GitHub id dumps")
    cross_check_ids(gh_subjects, gh_samples, protected_wgs_files,
                    protected_wgs_manifest, "WGS", "GitHub id dumps")

    # compare GitHub manifest files with GTEx Portal metadata files
    cross_check_ids(p_subjects, p_samples, protected_rnaseq_files,
                    protected_rnaseq_manifest, "RNA-Seq",
                    "GTEx Portal metadata")
    cross_check_ids(p_subjects, p_samples, protected_wgs_files,
                    protected_wgs_manifest, "WGS", "GTEx Portal metadata")

    # create top-level dataset
    gtex_dataset = ccmm.gtex.wgs_datasets.get_dataset_json()

    # index dbGaP study Datasets by id
    dbgap_study_datasets_by_id = {}
    for tds in gtex_dataset.get("hasPart"):
        dbgap_study_id = tds.get("identifier").get("identifier")
        if dbgap_study_id in dbgap_study_datasets_by_id:
            logging.fatal("encountered duplicate study_id " + dbgap_study_id)
            sys.exit(1)
        m = re.match(r'^(phs\d+\.v\d+)\.p\d+$', dbgap_study_id)
        if m is None:
            logging.fatal("unable to parse study_id " + dbgap_study_id)
            sys.exit(1)
        dbgap_study_datasets_by_id[m.group(1)] = tds

    # read public dbGaP metadata
    pub_xp = args.dbgap_public_xml_path
    restricted_mp = args.dbgap_protected_metadata_path
    # read public metadata
    dbgap_study_pub_md = ccmm.gtex.public_metadata.read_study_metadata(pub_xp)
    # there should be exactly one study
    study_ids = list(dbgap_study_pub_md.keys())
    n_study_ids = len(study_ids)
    if n_study_ids != 1:
        logging.fatal("read " + str(n_study_ids) + " dbGaP studies from " +
                      pub_xp)
        sys.exit(1)
    study_id = study_ids[0]

    dbgap_study_dataset = dbgap_study_datasets_by_id[study_id]
    dbgap_study_md = dbgap_study_pub_md[study_id]
    sv = ccmm.gtex.public_metadata.add_study_vars(dbgap_study_dataset,
                                                  dbgap_study_md)
    dbgap_study_md['id_to_var'] = sv['id_to_var']
    dbgap_study_md['type_name_cg_to_var'] = sv['type_name_cg_to_var']

    # set 2nd level types to be the same as the top-level types: WGS and RNA-Seq
    dbgap_study_dataset.set("types", gtex_dataset.get("types"))

    # cache used to minimize duplication of JSON objects in JSON-LD output
    cache = DatsObjCache()

    # --------------------------
    # subjects
    # --------------------------

    # create subjects based on GTEx Portal subject phenotype file and GitHub data-stewards id dump
    dats_subjects_d = ccmm.gtex.subjects.get_subjects_dats_materials(
        cache, p_subjects, gh_subjects,
        dbgap_study_md['type_name_cg_to_var']['Subject_Phenotypes'])
    # sorted list of subjects
    dats_subjects_l = sorted([dats_subjects_d[s] for s in dats_subjects_d],
                             key=lambda s: s.get("name"))

    # TODO - add consent groups, of which GTEx has 2: 0=didn't participate, 1=General Research Use (GRU)

    # create StudyGroup that lists all the subjects
    logging.info("creating 'all subjects' StudyGroup containing " +
                 str(len(dats_subjects_l)) +
                 " subject(s) from public metadata")
    all_subjects = DatsObj(
        "StudyGroup",
        [
            ("name", "all subjects"),
            # subjects appear in full here, but id references will be used elsewhere in the instance:
            ("members", dats_subjects_l),
            ("size", len(dats_subjects_l))
        ])

    # create link back from each subject to the parent StudyGroup
    if args.no_circular_links:
        logging.warning(
            "not creating Subject-level circular links because of --no_circular_links option"
        )
    else:
        for s in dats_subjects_l:
            cl = s.get("characteristics")
            cl.append(
                DatsObj("Dimension", [("name", "member of study group"),
                                      ("values", [all_subjects.getIdRef()])]))

    dats_study = DatsObj("Study", [("name", "GTEx"),
                                   ("studyGroups", [all_subjects])])

    # link Study to Dataset
    dbgap_study_dataset.set("producedBy", dats_study)

    # --------------------------
    # sample Materials
    # --------------------------

    # create samples based on GTEx Portal sample attributes file and GitHub data-stewards id dump
    dats_samples_d = ccmm.gtex.samples.get_samples_dats_materials(
        cache, dats_subjects_d, p_samples, gh_samples,
        dbgap_study_md['type_name_cg_to_var']['Sample_Attributes'])
    # sorted list of samples
    dats_samples_l = sorted([dats_samples_d[s] for s in dats_samples_d],
                            key=lambda s: s.get("name"))
    if args.max_output_samples is not None:
        dats_samples_l = dats_samples_l[:args.max_output_samples]
        logging.warning("limiting output to " + str(len(dats_samples_l)) +
                        " sample(s) due to value of --max_output_samples")
    dbgap_study_dataset.set("isAbout", dats_samples_l)

    # --------------------------
    # file Datasets
    # --------------------------

    file_datasets_l = []

    # WGS CRAM
    wgs_dats_file_datasets_l = ccmm.gtex.samples.get_files_dats_datasets(
        cache, dats_samples_d, p_samples, gh_samples, protected_wgs_files,
        wgs_dois, args.no_circular_links)
    logging.info("adding Datasets for " + str(len(wgs_dats_file_datasets_l)) +
                 " WGS CRAM files")
    file_datasets_l.extend(wgs_dats_file_datasets_l)

    # RNA-Seq CRAM
    rnaseq_dats_file_datasets_l = ccmm.gtex.samples.get_files_dats_datasets(
        cache, dats_samples_d, p_samples, gh_samples, protected_rnaseq_files,
        rnaseq_dois, args.no_circular_links)
    logging.info("adding Datasets for " +
                 str(len(rnaseq_dats_file_datasets_l)) + " RNA-Seq CRAM files")
    file_datasets_l.extend(rnaseq_dats_file_datasets_l)

    dbgap_study_dataset.set("hasPart", file_datasets_l)

    # augment public (meta)data with restricted-access (meta)data
    if restricted_mp is not None:
        # create study groups and update subjects/samples with restricted phenotype data
        add_restricted_data(cache, args, dbgap_study_md, dats_subjects_l,
                            dats_samples_d, dats_study, study_id)

    # write Dataset to DATS JSON file
    with open(args.output_file, mode="w") as jf:
        jf.write(json.dumps(gtex_dataset, indent=2, cls=DATSEncoder))
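
# Example invocation (hypothetical script name and paths; flags as declared in
# the argparse setup above):
#
#   ./gtex_v7_to_dats.py \
#       --output_file=gtex_v7.json \
#       --dbgap_public_xml_path=./dbgap-data/phs000424.v7.p2 \
#       --data_stewards_repo_path=./data-stewards \
#       --max_output_samples=20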
Exemplo n.º 23
0
def get_single_dna_extract_json(cache, study, study_md, subj_var_values,
                                samp_var_values):
    # Almost all samples in the TOPMed WGS phase are blood samples, named "Blood", "Peripheral Blood", etc.
    # A few are saliva samples, probably due to sample collection issues.
    name = None
    if 'BODY_SITE' in samp_var_values:
        name = 'BODY_SITE'
    elif 'Body_Site' in samp_var_values:
        name = 'Body_Site'
    elif 'Body Site' in samp_var_values:
        name = 'Body Site'

    anat_id = None
    anatomy_name = None

    if name is not None:
        if "blood" in samp_var_values[name]['value'].lower():
            anatomy_name = "blood"
            anat_id = "0000178"
        elif samp_var_values[name]['value'].lower() == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + samp_var_values[name]['value'])
            sys.exit(1)

    if anat_id is not None:
        anatomy_identifier = OrderedDict([("identifier",
                                           "UBERON:" + str(anat_id)),
                                          ("identifierSource", "UBERON")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier",
                 "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                ("identifierSource", "UBERON")
            ])
        ]

    # extract subject attributes
    gender = None
    age = None
    visit_year = None
    sys_bp = None
    dias_bp = None
    disease = {}
    disease['hypertension'] = "unknown"

    for name in subj_var_values:
        name_upper = name.upper()
        if name_upper == "GENDER" or name_upper == "SEX":
            gender = subj_var_values[name]['value'].lower()
        elif name_upper == "VISIT_AGE" or name_upper == "AGE" or name_upper == "AGE_ENROLL":  #need to confirm that these  allmean the same thing
            age = subj_var_values[name]['value']
        elif name_upper == "VISIT_YEAR":
            visit_year = subj_var_values[name]['value']
        elif name_upper == "SYSBP":
            sys_bp = subj_var_values[name]['value']
        elif name_upper == "DIASBP":
            dias_bp = subj_var_values[name]['value']
        elif name_upper == "HYPERTENSION" or name_upper == "HIGHBLOODPRES":
            if subj_var_values[name]['value'].lower(
            ) == "yes" or subj_var_values[name]['value'] == '1':
                disease['hypertension'] = "yes"
            elif re.match(r'\S', subj_var_values[name]['value']):
                disease['hypertension'] = "no"

    # anatomical part
    anatomical_part = None
    if anatomy_name is not None:
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])

    subject_characteristics = []
    subject_bearerOfDisease = []

    # harmonized/standardized characteristics
    if gender is not None:
        subject_sex = DatsObj(
            "Dimension",
            [("name", DatsObj("Annotation", [("value", "Gender")])),
             ("description", "Gender of the subject"), ("values", [gender])])
        subject_characteristics.append(subject_sex)

    if age is not None:
        subject_age = DatsObj(
            "Dimension", [("name", DatsObj("Annotation", [("value", "Age")])),
                          ("description", "Age of the subject"),
                          ("values", [age])])
        subject_characteristics.append(subject_age)

    if visit_year is not None:
        subject_visitYear = DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", "Visit year")])),
            ("description", "Year of visit, to use for longitudinal analysis"),
            ("values", [visit_year])
        ])
        subject_characteristics.append(subject_visitYear)

    if sys_bp is not None:
        subject_sysBP = DatsObj(
            "Dimension",
            [("name",
              DatsObj("Annotation", [("value", "Systolic blood pressure")])),
             ("description",
              "Systolic blood pressure of subject, measured in mmHg"),
             ("values", [sys_bp])])
        subject_characteristics.append(subject_sysBP)

    if dias_bp is not None:
        subject_diasBP = DatsObj(
            "Dimension",
            [("name",
              DatsObj("Annotation", [("value", "Diastolic blood pressure")])),
             ("description",
              "Diastolic blood pressure of subject, measured in mmHg"),
             ("values", [dias_bp])])
        subject_characteristics.append(subject_diasBP)

    if disease['hypertension'] != "unknown":
        disease_name = "hypertension"
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + str(disease_id)),
            ("identifierSource", "Disease Ontology")
        ])
        disease_alt_ids = [
            OrderedDict([
                ("identifier",
                 "http://purl.obolibrary.org/obo/DOID_" + str(disease_id)),
                ("identifierSource", "Disease Ontology")
            ])
        ]
        subject_hypertension = DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus",
             DatsObj("Annotation", [("value", disease['hypertension']),
                                    ("valueIRI", "")])),
        ])
        subject_bearerOfDisease.append(subject_hypertension)

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]

        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])

        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            id = var_value["var"]["id"]
            dbgap_var_dim = study_md['dbgap_vars'][id]
            dim.setProperty("identifier",
                            dbgap_var_dim.get("identifier").getIdRef())

        return dim

    # create DATS Dimensions for dbGaP subject metadata
    subject_dimensions = [
        make_var_dimension(vname, subj_var_values[vname])
        for vname in sorted(subj_var_values)
    ]

    # create DATS Dimensions for dbGaP sample metadata
    sample_dimensions = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    # "raw" characteristics from dbGaP metadata
    subject_characteristics.extend(subject_dimensions)
    sample_characteristics = sample_dimensions

    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']

    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj(
        "Material",
        [("name", subj_id), ("identifier", {
            "identifier": subj_id
        }),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
         ("description", study_title + " subject " + subj_id),
         ("characteristics", subject_characteristics),
         ("bearerOfDisease", subject_bearerOfDisease), ("taxonomy", [human_t]),
         ("roles", util.get_donor_roles(cache))])

    # TODO - use DatsObjCache
    specimen_annot = util.get_annotation("specimen")
    dna_extract_annot = util.get_annotation("DNA extract")

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [subject_material]
    sample_descr = "specimen collected from subject " + subj_id
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr

    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name), ("identifier", {
            "identifier": samp_id
        }),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
         ("description", sample_descr),
         ("characteristics", sample_characteristics), ("taxonomy", [human_t]),
         ("roles", [specimen_annot]), ("derivesFrom", sample_derives_from)])

    # DNA extracted from tissue sample
    dna_descr = "DNA extracted from specimen collected from subject " + subj_id
    if anatomical_part is not None:
        dna_descr = "DNA extracted from " + anatomy_name + " specimen collected from subject " + subj_id

    dna_material = DatsObj("Material",
                           [("name", "DNA from " + sample_name),
                            ("description", dna_descr),
                            ("taxonomy", [human_t]),
                            ("roles", [dna_extract_annot]),
                            ("derivesFrom", [biological_sample_material])])

    return dna_material
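
# Hypothetical usage sketch (names "cache", "study", "study_md" and the two
# variable-value dicts are assumed from the surrounding pipeline): the returned
# Material forms a provenance chain DNA extract -> specimen -> subject /
# anatomical part that can be walked via "derivesFrom".
#
#   dna = get_single_dna_extract_json(cache, study, study_md,
#                                     subj_var_values, samp_var_values)
#   specimen = dna.get("derivesFrom")[0]
#   origins = specimen.get("derivesFrom")  # [subject Material, AnatomicalPart]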
Exemplo n.º 24
0
def get_sample_dats_material(cache, dats_subject, study, study_md,
                             samp_var_values):

    # Almost all samples in the TOPMed WGS phase are blood samples, named "Blood", "Peripheral Blood", etc.
    # A few are saliva samples, probably due to sample collection issues.
    name = None
    if 'BODY_SITE' in samp_var_values:
        name = 'BODY_SITE'
    elif 'Body_Site' in samp_var_values:
        name = 'Body_Site'
    elif 'Body Site' in samp_var_values:
        name = 'Body Site'

    anat_id = None
    anatomy_name = None

    if name is not None:
        if "blood" in samp_var_values[name]['value'].lower():
            anatomy_name = "blood"
            anat_id = "0000178"
        elif samp_var_values[name]['value'].lower() == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + samp_var_values[name]['value'])
            sys.exit(1)

    def make_anat_part(anat_id, anatomy_name):
        # anatomical part
        anatomical_part = DatsObj("AnatomicalPart", [("name", anatomy_name)])

        if anat_id is not None:
            anatomy_identifier = OrderedDict([("identifier",
                                               "UBERON:" + str(anat_id)),
                                              ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier",
                     "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON")
                ])
            ]

            anatomical_part.set("identifier", anatomy_identifier)
            anatomical_part.set("alternateIdentifiers", anatomy_alt_ids)

        return anatomical_part

    if anatomy_name is not None:
        # use cached value for AnatomicalPart if possible
        anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
        anatomical_part = cache.get_obj_or_ref(
            anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))
    else:
        anatomical_part = None

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]

        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])

        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dbgap_var_dim = var_value["var"]["dim"]
            dim.setProperty("identifier",
                            dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # create DATS Dimensions for dbGaP sample metadata
    sample_dimensions = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    sample_characteristics = sample_dimensions
    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']
    study_title = study.get("title")
    specimen_annot = util.get_annotation("specimen", cache)

    # corresponding DATS subject Material
    subj_key = ":".join(["Material", dats_subject.get("name")])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)
    dats_subj_name = dats_subject.get("name")

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [dats_subj]

    sample_descr = "specimen collected from subject " + dats_subj_name
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr

    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name), ("identifier", {
            "identifier": samp_id
        }),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
         ("description", sample_descr),
         ("characteristics", sample_characteristics),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [specimen_annot]), ("derivesFrom", sample_derives_from)])

    # RNA or DNA extracted from tissue sample
    stype = "DNA"
    # TODO - check if RNA, not DNA

    dna_or_rna_descr = stype + " extracted from specimen collected from subject " + dats_subj_name
    if anatomical_part is not None:
        dna_or_rna_descr = stype + " extracted from " + anatomy_name + " specimen collected from subject " + dats_subj_name

    dna_or_rna_material = DatsObj(
        "Material",
        [("name", stype + " from " + sample_name),
         ("description", dna_or_rna_descr),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation(stype + " extract", cache)]),
         ("derivesFrom", [biological_sample_material])])
    return dna_or_rna_material
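
# Sketch of the DatsObjCache idiom used above (semantics inferred from its
# usage in these examples; assumes a DatsObjCache instance named "cache" is in
# scope): the first lookup under a key stores and returns the full object,
# while later lookups return a compact id reference, avoiding duplicate
# JSON-LD nodes.
demo_key = ":".join(["AnatomicalPart", "blood"])
full_obj = cache.get_obj_or_ref(demo_key,
                                lambda: DatsObj("AnatomicalPart", [("name", "blood")]))
ref_only = cache.get_obj_or_ref(demo_key,
                                lambda: DatsObj("AnatomicalPart", [("name", "blood")]))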
Exemplo n.º 25
0
from ccmm.dats.datsobj import DatsObj
from collections import OrderedDict
import gzip
import logging
import re
import sys

AGR_DESCRIPTION = """The Alliance of Genome Resources (AGR) develops and maintains sustainable genome information resources \
that facilitate the use of diverse model organisms in understanding the genetic and genomic basis of \
human biology, health and disease."""

# List of AGR Genome builds
AGR_ref_genomes = {
    "MGI_1.0.4": "Mouse Genome (Mouse Genome Informatics)",
    "RGD_1.0.4": "Rat Genome (Rat Genome Database)"
}

AGR = DatsObj("Organization", [("abbreviation", "AGR"),
                               ("name", "Alliance of Genome Resources")])
AGR_BUCKET = DatsObj("DataRepository",
                     [("name", "AGR"),
                      ("description", "The Alliance AWS S3 BUCKET"),
                      ("publishers", [AGR])])

## Ontology for Biomedical Investigations
# "DNA sequencing"
DNA_SEQUENCING_TYPE = OrderedDict([
    ("value", "DNA sequencing"),
    ("valueIRI", "http://purl.obolibrary.org/obo/OBI_0000626")
])
WGS_ASSAY_TYPE = OrderedDict([("value", "whole genome sequencing assay"),
                              ("valueIRI",
                               "http://purl.obolibrary.org/obo/OBI_0002117")])
Exemplo n.º 26
0
def get_files_dats_datasets(cache, dats_samples_d, sample_manifest, file_guids,
                            no_circular_links):
    file_datasets_l = []

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay",
                                        cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    snp_datatype = DatsObj(
        "DataType", [("information", util.get_annotation("SNP", cache)),
                     ("method", util.get_annotation("SNP analysis", cache))])

    cnv_datatype = DatsObj(
        "DataType", [("information", util.get_annotation("CNV", cache)),
                     ("method", util.get_annotation("CNV analysis", cache))])

    def get_snp_datatype():
        dkey = ".".join(["DataType", "SNP"])
        return cache.get_obj_or_ref(dkey, lambda: snp_datatype)

    def get_cnv_datatype():
        dkey = ".".join(["DataType", "CNV"])
        return cache.get_obj_or_ref(dkey, lambda: cnv_datatype)

    nhlbi_key = ":".join(["Organization", "NHLBI"])
    nhlbi = cache.get_obj_or_ref(nhlbi_key, lambda: NIH_NHLBI)
    creators = [nhlbi]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_dstan = cache.get_obj_or_ref(cram_ds_key,
                                      lambda: make_data_standard("CRAM"))

    vcf_ds_key = ":".join(["DataStandard", "VCF"])
    vcf_dstan = cache.get_obj_or_ref(vcf_ds_key,
                                     lambda: make_data_standard("VCF"))

    n_samples = len(dats_samples_d)
    n_samples_found = 0

    for sample_id in dats_samples_d:
        dats_sample = dats_samples_d[sample_id]

        # look up corresponding file paths in manifest file
        if sample_id not in sample_manifest:
            logging.debug("sample not found in manifest - " + sample_id)
            continue

        n_samples_found += 1
        ms = sample_manifest[sample_id]

        material_type = 'DNA'
        wgs_type = get_wgs_datatype()
        snp_type = get_snp_datatype()
        cnv_type = get_cnv_datatype()

        # ------------------------------------------------
        # WGS sequence - CRAM and CRAI files
        # ------------------------------------------------

        def get_filename(gs_uri):
            m = re.match(r'^.*\/([^\/]+)$', gs_uri)
            if m is None:
                logging.fatal("unable to parse filename from " + gs_uri)
                sys.exit(1)
            filename = m.group(1)
            return filename

        gs_cram = ms['gs_cram']['mapped_value']
        gs_crai = ms['gs_crai']['mapped_value']

        # GUID lookup
        cram_file = get_filename(gs_cram)
        crai_file = get_filename(gs_crai)

        cram_doi = file_guids[cram_file]['Sodium_GUID']['raw_value']
        cram_size = file_guids[cram_file]['File size']['raw_value']
        cram_md5 = file_guids[cram_file]['md5sum']['raw_value']

        crai_doi = file_guids[crai_file]['Sodium_GUID']['raw_value']
        crai_md5 = file_guids[crai_file]['md5sum']['raw_value']

        # handle file size values with "e" in them (scientific notation)
        def filesize_to_int(size):
            if re.match(r'.*e.*', size):
                return int(float(size))
            return int(size)

        # Google Cloud Platform / Google Storage copy
        gs_cram_access = DatsObj("Access", [("accessURL", gs_cram)])
        gs_cram_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_cram_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", cram_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", crai_doi),
                             ("relationType", "cram_index")])
                ]),
                ("size", filesize_to_int(cram_size)),
                # TODO - add file size units
                ("conformsTo", [cram_dstan])
            ])

        # AWS / S3 copy
        s3_cram = ms['s3_cram']['mapped_value']
        s3_crai = ms['s3_crai']['mapped_value']
        s3_cram_access = DatsObj("Access", [("accessURL", s3_cram)])
        s3_cram_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_cram_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", cram_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", crai_doi),
                             ("relationType", "cram_index")])
                ]),
                ("size", filesize_to_int(cram_size)),
                # TODO - add file size units
                ("conformsTo", [cram_dstan])
            ])

        # filename was already extracted from the CRAM URI by get_filename()
        filename = cram_file

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [cram_md5])])

        cram_dataset = DatsObj("Dataset", [
            ("distributions", [gs_cram_distro, s3_cram_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [wgs_type]),
            ("creators", creators),
        ])

        cram_da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_sample.getIdRef()])
             #            ("uses", [])                          # software used
             ])

        cram_dataset.set("producedBy", cram_da)
        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            cram_da.set("output", [cram_dataset.getIdRef()])
        file_datasets_l.append(cram_dataset)

        # ------------------------------------------------
        # Variant calls - VCF and CSI files
        # ------------------------------------------------

        gs_vcf = ms['gs_vcf']['mapped_value']
        gs_csi = ms['gs_csi']['mapped_value']

        if gs_vcf is None:
            logging.warning("no VCF file found for " + sample_id)
            continue

        # GUID lookup
        vcf_file = get_filename(gs_vcf)
        csi_file = get_filename(gs_csi)

        vcf_doi = file_guids[vcf_file]['Sodium_GUID']['raw_value']
        vcf_size = file_guids[vcf_file]['File size']['raw_value']
        vcf_md5 = file_guids[vcf_file]['md5sum']['raw_value']

        csi_doi = file_guids[csi_file]['Sodium_GUID']['raw_value']
        csi_md5 = file_guids[csi_file]['md5sum']['raw_value']

        # Google Cloud Platform / Google Storage copy
        gs_vcf_access = DatsObj("Access", [("accessURL", gs_vcf)])
        gs_vcf_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_vcf_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", vcf_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", csi_doi),
                             ("relationType", "vcf_index")])
                ]),
                ("size", filesize_to_int(vcf_size)),
                # TODO - add file size units
                ("conformsTo", [vcf_dstan])
            ])

        # AWS / S3 copy
        s3_vcf = ms['s3_vcf']['mapped_value']
        s3_csi = ms['s3_csi']['mapped_value']
        s3_vcf_access = DatsObj("Access", [("accessURL", s3_vcf)])
        s3_vcf_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_vcf_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", vcf_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", csi_doi),
                             ("relationType", "vcf_index")])
                ]),
                ("size", filesize_to_int(vcf_size)),
                # TODO - add file size units
                ("conformsTo", [vcf_dstan])
            ])

        # filename was already extracted from the VCF URI by get_filename()
        filename = vcf_file

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [vcf_md5])])

        vcf_dataset = DatsObj("Dataset", [
            ("distributions", [gs_vcf_distro, s3_vcf_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [snp_type, cnv_type]),
            ("creators", creators),
        ])

        vcf_da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_sample.getIdRef()])
             #            ("uses", [])                          # software used
             ])

        vcf_dataset.set("producedBy", vcf_da)
        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            vcf_da.set("output", [vcf_dataset.getIdRef()])
        file_datasets_l.append(vcf_dataset)

    logging.info("found " + str(n_samples_found) + " / " + str(n_samples) +
                 " sample(s) in TOPMed file manifest")
    return file_datasets_l
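
# Standalone restatement of the size-parsing helper above, for illustration:
# manifest file sizes sometimes arrive in scientific notation ("1.2e9") and
# must still convert to an integer byte count.
import re

def filesize_to_int(size):
    # sizes containing an "e" are scientific notation; parse via float first
    if re.match(r'.*e.*', size):
        return int(float(size))
    return int(size)

assert filesize_to_int("123456") == 123456
assert filesize_to_int("1.2e9") == 1200000000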
Exemplo n.º 27
0
def get_single_dna_extract_json(study, subj_var_values, samp_var_values):

    # all samples in TOPMed WGS phase are blood samples
    if samp_var_values['BODY_SITE'] != 'Blood':
        logging.fatal(
            "encountered BODY_SITE other than 'Blood' in TOPMed sample metadata - "
            + samp_var_values['BODY_SITE'])
        sys.exit(1)

    anatomy_name = "blood"
    anat_id = "0000178"

    anatomy_identifier = OrderedDict([("identifier", "UBERON:" + str(anat_id)),
                                      ("identifierSource", "UBERON")])
    anatomy_alt_ids = [
        OrderedDict([("identifier",
                      "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                     ("identifierSource", "UBERON")])
    ]

    # extract subject attributes
    gender = None
    age = None
    for name in subj_var_values:
        if name == "GENDER":
            gender = subj_var_values[name].lower()
        elif name == "VISIT_AGE":
            age = subj_var_values[name]
    # TODO - determine what other subject attributes can be mapped directly to core DATS objects

    # place original dbGaP subject metadata into extraProperties
    # TODO - consider alternative of doing this only for un-harmonized metadata
    subj_extra_props = [
        DatsObj("CategoryValuesPair", [("category", xp),
                                       ("values", [subj_var_values[xp]])])
        for xp in sorted(subj_var_values)
    ]

    # extract sample attributes
    for name in samp_var_values:
        if name == 'SEQUENCING_CENTER':
            # TODO - determine which DATS objects (e.g., biological sample, DNA prep, sequence data) this property should attach to
            pass

    # TODO - determine what other subject attributes can be mapped directly to core DATS objects
    # e.g., IS_TUMOR -> bearerOfDisease ("the pathology affecting the material...")

    # place original dbGaP sample metadata into extraProperties
    samp_extra_props = [
        DatsObj("CategoryValuesPair", [("category", xp),
                                       ("values", [samp_var_values[xp]])])
        for xp in sorted(samp_var_values)
    ]

    # anatomical part
    anatomical_part = DatsObj("AnatomicalPart",
                              [("name", anatomy_name),
                               ("identifier", anatomy_identifier),
                               ("alternateIdentifiers", anatomy_alt_ids)])

    subject_sex = DatsObj("Dimension", [("name", {
        "value": "Gender"
    }), ("description", "Gender of the subject"), ("values", [gender])])

    subject_age = DatsObj("Dimension", [("name", {
        "value": "Age"
    }), ("description", "Age of the subject"), ("values", [age])])

    subject_characteristics = [subject_sex, subject_age]

    human_t = util.get_taxon_human()
    subj_id = subj_var_values['SUBJECT_ID']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']
    samp_id = samp_var_values['SAMPLE_ID']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']

    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj(
        "Material",
        [("name", subj_id), ("identifier", {
            "identifier": subj_id
        }),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
         ("description", study_title + " subject " + subj_id),
         ("characteristics", subject_characteristics), ("taxonomy", human_t),
         ("roles", util.get_donor_roles()),
         ("extraProperties", subj_extra_props)])

    # biological/tissue sample
    sample_name = samp_id
    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name), ("identifier", {
            "identifier": samp_id
        }),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
         ("description",
          anatomy_name + " specimen collected from subject " + subj_id),
         ("taxonomy", human_t),
         ("roles", [OrderedDict([("value", "specimen"), ("valueIRI", "")])]),
         ("derivesFrom", [subject_material, anatomical_part]),
         ("extraProperties", samp_extra_props)])

    # DNA extracted from tissue sample
    dna_material = DatsObj(
        "Material",
        [("name", "DNA from " + sample_name),
         ("description", "DNA extracted from " +
          anatomy_name + " specimen collected from subject " + subj_id),
         ("taxonomy", human_t),
         ("roles", [OrderedDict([("value", "DNA extract"),
                                 ("valueIRI", "")])]),
         ("derivesFrom", [biological_sample_material])])

    return dna_material
Exemplo n.º 28
0
def make_data_standard(format):
    # note: "cache" and "util" are free variables here; in the full example
    # this helper is defined inside a function where both are in scope
    return DatsObj("DataStandard",
                   [("name", format),
                    ("type", util.get_value_annotation("format", cache)),
                    ("description", format + " file format")])
Exemplo n.º 29
0
def get_dbgap_studies(qterm):
    studies = []
    study = None
    lnum = 0

    # Add newline before each occurrence of "Versions" if not already present
    lines = []
    for line in GTEX_STUDIES_STR.split('\n'):
        m = re.match(r'^(\S+.*)(Versions?.*)$', line)
        if m is None:
            lines.append(line)
        else:
            lines.append(m.group(1))
            lines.append(m.group(2))

    for line in lines:
        lnum += 1
        # blank line
        if re.match(r'^\s*$', line):
            continue
        # study id
        m = re.match(r'^(phs\S+)$', line)
        if m is not None:
            study = {'id': m.group(1)}
            studies.append(study)
            continue
        # study description
        m = re.match(r'^Genotype-Tissue Expression(.*)$', line)
        if m is not None:
            study['descr'] = m.group(1)
            continue
        # embargo release(s)
        m = re.match(r'^(Version.*)$', line)
        if m is not None:
            if 'versions' not in study:
                study['versions'] = []
            study['versions'].append(m.group(1))
            continue
        # details/participants/type of study
        m = re.match(r'^VDAS(\d+)(\D.*)Links$', line)
        if m is not None:
            study['n_participants'] = int(m.group(1))
            study['study_type'] = m.group(2)
            continue
        # platform
        m = re.match(r'^(HiSeq.*)$', line)
        if m is not None:
            study['platform'] = m.group(1)
            continue
        # parse error
        logging.fatal("unexpected content at line " + str(lnum) +
                      " of dbGaP studies: " + line)
        sys.exit(1)

    n_studies = len(studies)
    logging.info("found " + str(n_studies) + " GTEx study in dbGaP")

    # convert studies to DATS Datasets
    datasets = []
    for s in studies:
        m = re.match(r'^phs\d+\.(v\d+)\.p\d+$', s['id'])
        if m is None:
            logging.fatal(
                "unable to parse dataset/study version from study id " +
                s['id'])
            sys.exit(1)
        version = m.group(1)

        dimensions = [
            DatsObj("Dimension", [
                ("name", {
                    "value": "Actual Subject Count"
                }),
                ("description",
                 "The actual number of subjects entered into a clinical trial."
                 ), ("types", [util.get_annotation("Actual Subject Number")]),
                ("values", [s['n_participants']])
            ])
        ]

        types = [
            OrderedDict([
                ("information", util.get_annotation("DNA sequencing")),
                ("method",
                 util.get_annotation("whole genome sequencing assay")),
                ("platform", HISEQ_TYPES[s['platform']])
            ])
        ]

        # TODO - Specify creators and release date(s) of this particular dataset.
        #  This may require parsing some of the metadata files and/or documents.
        # TODO - required field - using NIH NHGRI as placeholder, but need to revisit and assign specific study-level creator
        creators = [NIH_NHGRI]

        # TODO - find better location for study_type?
        extra_props = [
            DatsObj("CategoryValuesPair", [("category", "study_type"),
                                           ("values", [s['study_type']])])
        ]

        # Dataset
        dataset = DatsObj(
            "Dataset",
            [
                ("identifier", DatsObj("Identifier",
                                       [("identifier", s['id'])])),
                ("version", version),
                #                ("dates", []),
                #("title", s['descr']),
                ("title",
                 "Genotype-Tissue Expression Project (GTEx) WGS and RNA-Seq data"
                 ),
                ("storedIn", DB_GAP),
                ("types", types),
                ("creators", creators),
                ("dimensions", dimensions),
                ("extraProperties", extra_props)
                #                ("producedBy", data_analysis),
                #               ("distributions", [DatsObj("DatasetDistribution", [
                #                                ("access", DatsObj("Access", [
                #                                            ("landingPage", GTEX_DATASETS_URL)
                #                                            ]))
                #                                ])]),
            ])

        datasets.append(dataset)

    return datasets
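
# Illustration of the "Versions" pre-splitting above, with a hypothetical
# scraped input line:
#
#   line = "phs000424.v7.p2Versions 1-7"
#   re.match(r'^(\S+.*)(Versions?.*)$', line).groups()
#   # -> ('phs000424.v7.p2', 'Versions 1-7')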
Exemplo n.º 30
0
#!/usr/bin/env python3

from ccmm.dats.datsobj import DatsObj
import ccmm.mgd.human_homologs
from collections import OrderedDict
import csv
import gzip
import logging
import re
import sys

EXPECTED_GENOME_BUILD = "GRCm38-C57BL/6J"
N_GFF_FIELDS = 9

MGI = DatsObj("Organization", [("abbreviation", "MGI"),
                               ("name", "Mouse Genome Informatics")])

MGD = DatsObj("DataRepository", [("name", "MGD"),
                                 ("description", "Mouse Genome Database"),
                                 ("publishers", [MGI])])

## Ontology for Biomedical Investigations
# "DNA sequencing"
DNA_SEQUENCING_TYPE = OrderedDict([
    ("value", "DNA sequencing"),
    ("valueIRI", "http://purl.obolibrary.org/obo/OBI_0000626")
])

MGD_TYPES = [
    DatsObj("DataType", [("information", DNA_SEQUENCING_TYPE)])
    # TODO - add other types e.g., genes, human homologs, other feature types