Example #1
def get_single_sample_json(sample, dats_obj_cache):
    #    print("converting sample to json: " + str(sample))
    samp_id = sample['SAMPID']['mapped_value']
    subj_id = sample['SUBJID']['mapped_value']
    subject = sample['subject']

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = sample['SMUBRID']['mapped_value']
    if anat_id is None:
        print("No Uberon/anatomy ID specified for sample " + samp_id)
        sys.exit(1)

    anatomy_identifier = None
    anatomy_alt_ids = None
    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = sample['SMTSD']['mapped_value']

    # EFO id
    if re.match(r'^EFO_\d+', anat_id):
        anatomy_identifier = OrderedDict([("identifier", anat_id),
                                          ("identifierSource", "EFO")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier",
                 "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" +
                 str(anat_id)), ("identifierSource", "EFO")
            ])
        ]
    # Uberon id
    else:
        anatomy_identifier = OrderedDict([("identifier",
                                           "UBERON:" + str(anat_id)),
                                          ("identifierSource", "UBERON")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier",
                 "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                ("identifierSource", "UBERON")
            ])
        ]

    # anatomical part
    anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
    if anat_part_key in dats_obj_cache:
        anatomical_part = dats_obj_cache[anat_part_key]
    else:
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])
        dats_obj_cache[anat_part_key] = anatomical_part

    # human experimental subject/patient
    subject_sex = DatsObj(
        "Dimension", [("name", DatsObj("Annotation", [("value", "Gender")])),
                      ("description", "Gender of the subject"),
                      ("identifier",
                       DatsObj("Identifier", [("identifier", "SEX"),
                                              ("identifierSource", "GTEx")])),
                      ("values", [subject['SEX']['mapped_value']])])

    subject_age = DatsObj(
        "Dimension",
        [("name", DatsObj("Annotation", [("value", "Age range")])),
         ("description", "Age range of the subject"),
         ("identifier",
          DatsObj("Identifier", [("identifier", "AGE"),
                                 ("identifierSource", "GTEx")])),
         ("values", [subject['AGE']['mapped_value']])])

    subject_hardy_scale = DatsObj(
        "Dimension",
        [("name", DatsObj("Annotation", [("value", "Hardy scale")])),
         ("description", "Hardy scale death classification for the subject"),
         ("identifier",
          DatsObj("Identifier", [("identifier", "DTHHRDY"),
                                 ("identifierSource", "GTEx")])),
         ("values", [subject['DTHHRDY']['mapped_value']])])

    subject_characteristics = [subject_sex, subject_age, subject_hardy_scale]

    # human experimental subject/patient
    subj_key = ":".join(["Material", subj_id])
    if subj_key in dats_obj_cache:
        subject_material = dats_obj_cache[subj_key]
    else:
        subject_material = DatsObj(
            "Material", [("name", subj_id),
                         ("identifier", {
                             "identifier": subj_id
                         }), ("description", "GTEx subject " + subj_id),
                         ("characteristics", subject_characteristics),
                         ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
                         ("roles", util.get_donor_roles(dats_obj_cache))])
        dats_obj_cache[subj_key] = subject_material

    specimen_annot = util.get_annotation("specimen", dats_obj_cache)
    rna_extract_annot = util.get_annotation("RNA extract", dats_obj_cache)

    # biological/tissue sample
    sample_name = samp_id
    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name), ("identifier", {
            "identifier": samp_id
        }),
         ("description",
          anatomy_name + " specimen collected from subject " + subj_id),
         ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
         ("roles", [specimen_annot]),
         ("derivesFrom", [subject_material, anatomical_part])])

    # RNA extracted from tissue sample
    rna_material = DatsObj(
        "Material",
        [("name", "RNA from " + sample_name),
         ("description", "total RNA extracted from " + anatomy_name +
          " specimen collected from subject " + subj_id),
         ("taxonomy", [util.get_taxon_human(dats_obj_cache)]),
         ("roles", [rna_extract_annot]),
         ("derivesFrom", [biological_sample_material])])

    return rna_material
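
# Hedged usage sketch (not from the original module): the record below is a
# fabricated illustration of the input shape this function reads; field names
# follow the GTEx attributes used above, and the sample/subject IDs and the
# UBERON id (0002190, subcutaneous adipose tissue) are assumptions.
def _example_single_sample_usage():
    dats_obj_cache = {}
    sample = {
        'SAMPID': {'mapped_value': 'GTEX-1117F-0226-SM-5GZZ7'},
        'SUBJID': {'mapped_value': 'GTEX-1117F'},
        'SMUBRID': {'mapped_value': '0002190'},
        'SMTSD': {'mapped_value': 'Adipose - Subcutaneous'},
        'subject': {
            'SEX': {'mapped_value': 'female'},
            'AGE': {'mapped_value': '60-69'},
            'DTHHRDY': {'mapped_value': '4'},
        },
    }
    # the cache is shared across calls so repeated AnatomicalPart/Material
    # objects are emitted once and referenced thereafter
    return get_single_sample_json(sample, dats_obj_cache)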
Example #2
def get_files_dats_datasets(cache, dats_samples_d, sample_manifest, file_guids,
                            no_circular_links):
    file_datasets_l = []

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay",
                                        cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    snp_datatype = DatsObj(
        "DataType", [("information", util.get_annotation("SNP", cache)),
                     ("method", util.get_annotation("SNP analysis", cache))])

    cnv_datatype = DatsObj(
        "DataType", [("information", util.get_annotation("CNV", cache)),
                     ("method", util.get_annotation("CNV analysis", cache))])

    def get_snp_datatype():
        dkey = ".".join(["DataType", "SNP"])
        return cache.get_obj_or_ref(dkey, lambda: snp_datatype)

    def get_cnv_datatype():
        dkey = ".".join(["DataType", "CNV"])
        return cache.get_obj_or_ref(dkey, lambda: cnv_datatype)

    nhlbi_key = ":".join(["Organization", "NHLBI"])
    nhlbi = cache.get_obj_or_ref(nhlbi_key, lambda: NIH_NHLBI)
    creators = [nhlbi]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_dstan = cache.get_obj_or_ref(cram_ds_key,
                                      lambda: make_data_standard("CRAM"))

    vcf_ds_key = ":".join(["DataStandard", "VCF"])
    vcf_dstan = cache.get_obj_or_ref(vcf_ds_key,
                                     lambda: make_data_standard("VCF"))

    n_samples = len(dats_samples_d)
    n_samples_found = 0

    for sample_id in dats_samples_d:
        dats_sample = dats_samples_d[sample_id]

        # look up corresponding file paths in manifest file
        if sample_id not in sample_manifest:
            logging.debug("sample not found in manifest - " + sample_id)
            continue

        n_samples_found += 1
        ms = sample_manifest[sample_id]

        material_type = 'DNA'
        wgs_type = get_wgs_datatype()
        snp_type = get_snp_datatype()
        cnv_type = get_cnv_datatype()

        # ------------------------------------------------
        # WGS sequence - CRAM and CRAI files
        # ------------------------------------------------

        def get_filename(gs_uri):
            m = re.match(r'^.*\/([^\/]+)$', gs_uri)
            if m is None:
                logging.fatal("unable to parse filename from " + gs_uri)
                sys.exit(1)
            filename = m.group(1)
            return filename

        gs_cram = ms['gs_cram']['mapped_value']
        gs_crai = ms['gs_crai']['mapped_value']

        # GUID lookup
        cram_file = get_filename(gs_cram)
        crai_file = get_filename(gs_crai)

        cram_doi = file_guids[cram_file]['Sodium_GUID']['raw_value']
        cram_size = file_guids[cram_file]['File size']['raw_value']
        cram_md5 = file_guids[cram_file]['md5sum']['raw_value']

        crai_doi = file_guids[crai_file]['Sodium_GUID']['raw_value']
        crai_md5 = file_guids[crai_file]['md5sum']['raw_value']

        # handle file size values with "e" in them (scientific notation,
        # e.g. "3.2e+10"), which int() alone would reject
        def filesize_to_int(size):
            if re.match(r'.*e.*', size):
                return int(float(size))
            return int(size)

        # Google Cloud Platform / Google Storage copy
        gs_cram_access = DatsObj("Access", [("accessURL", gs_cram)])
        gs_cram_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_cram_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", cram_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", crai_doi),
                             ("relationType", "cram_index")])
                ]),
                ("size", filesize_to_int(cram_size)),
                # TODO - add file size units
                ("conformsTo", [cram_dstan])
            ])

        # AWS / S3 copy
        s3_cram = ms['s3_cram']['mapped_value']
        s3_crai = ms['s3_crai']['mapped_value']
        s3_cram_access = DatsObj("Access", [("accessURL", s3_cram)])
        s3_cram_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_cram_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", cram_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", crai_doi),
                             ("relationType", "cram_index")])
                ]),
                ("size", filesize_to_int(cram_size)),
                # TODO - add file size units
                ("conformsTo", [cram_dstan])
            ])

        # filename was already parsed from gs_cram for the GUID lookup above
        filename = cram_file

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [cram_md5])])

        cram_dataset = DatsObj("Dataset", [
            ("distributions", [gs_cram_distro, s3_cram_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [wgs_type]),
            ("creators", creators),
        ])

        cram_da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_sample.getIdRef()])
             #            ("uses", [])                          # software used
             ])

        cram_dataset.set("producedBy", cram_da)
        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            cram_da.set("output", [cram_dataset.getIdRef()])
        file_datasets_l.append(cram_dataset)

        # ------------------------------------------------
        # Variant calls - VCF and CSI files
        # ------------------------------------------------

        gs_vcf = ms['gs_vcf']['mapped_value']
        gs_csi = ms['gs_csi']['mapped_value']

        if gs_vcf is None:
            logging.warning("no VCF file found for " + sample_id)
            continue

        # GUID lookup
        vcf_file = get_filename(gs_vcf)
        csi_file = get_filename(gs_csi)

        vcf_doi = file_guids[vcf_file]['Sodium_GUID']['raw_value']
        vcf_size = file_guids[vcf_file]['File size']['raw_value']
        vcf_md5 = file_guids[vcf_file]['md5sum']['raw_value']

        csi_doi = file_guids[csi_file]['Sodium_GUID']['raw_value']
        csi_md5 = file_guids[csi_file]['md5sum']['raw_value']

        # Google Cloud Platform / Google Storage copy
        gs_vcf_access = DatsObj("Access", [("accessURL", gs_vcf)])
        gs_vcf_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_vcf_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", vcf_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", csi_doi),
                             ("relationType", "vcf_index")])
                ]),
                ("size", filesize_to_int(vcf_size)),
                # TODO - add file size units
                ("conformsTo", [vcf_dstan])
            ])

        # AWS / S3 copy
        s3_vcf = ms['s3_vcf']['mapped_value']
        s3_csi = ms['s3_csi']['mapped_value']
        s3_vcf_access = DatsObj("Access", [("accessURL", s3_vcf)])
        s3_vcf_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_vcf_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", vcf_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", csi_doi),
                             ("relationType", "vcf_index")])
                ]),
                ("size", filesize_to_int(vcf_size)),
                # TODO - add file size units
                ("conformsTo", [vcf_dstan])
            ])

        # filename was already parsed from gs_vcf for the GUID lookup above
        filename = vcf_file

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [vcf_md5])])

        vcf_dataset = DatsObj("Dataset", [
            ("distributions", [gs_vcf_distro, s3_vcf_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [snp_type, cnv_type]),
            ("creators", creators),
        ])

        vcf_da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_sample.getIdRef()])
             #            ("uses", [])                          # software used
             ])

        vcf_dataset.set("producedBy", vcf_da)
        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            vcf_da.set("output", [vcf_dataset.getIdRef()])
        file_datasets_l.append(vcf_dataset)

    logging.info("found " + str(n_samples_found) + " / " + str(n_samples) +
                 " sample(s) in TOPMed file manifest")
    return file_datasets_l
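
# Hedged usage sketch (hypothetical caller): dats_samples_d maps sample ID to
# its DATS sample Material, and sample_manifest and file_guids are the parsed
# TOPMed manifest and GUID tables keyed by sample ID and filename as read
# above; all of these names are assumptions about the surrounding pipeline.
def _example_topmed_file_datasets(cache, dats_samples_d, sample_manifest,
                                  file_guids):
    # pass no_circular_links=True to suppress the DataAcquisition -> Dataset
    # back-links set near the end of the function
    return get_files_dats_datasets(cache, dats_samples_d, sample_manifest,
                                   file_guids, no_circular_links=False)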
Example #3
def get_single_dna_extract_json(cache, study, study_md, subj_var_values,
                                samp_var_values):
    # Almost all samples in the TOPMed WGS phase are blood samples, labeled
    # "Blood", "Peripheral Blood", etc. A few are saliva samples, probably
    # due to sample collection issues.
    name = None
    if 'BODY_SITE' in samp_var_values:
        name = 'BODY_SITE'
    elif 'Body_Site' in samp_var_values:
        name = 'Body_Site'
    elif 'Body Site' in samp_var_values:
        name = 'Body Site'

    anat_id = None
    anatomy_name = None

    if name is not None:
        if "blood" in samp_var_values[name]['value'].lower():
            anatomy_name = "blood"
            anat_id = "0000178"
        elif samp_var_values[name]['value'].lower() == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + samp_var_values[name]['value'])
            sys.exit(1)

    if anat_id is not None:
        anatomy_identifier = OrderedDict([("identifier",
                                           "UBERON:" + str(anat_id)),
                                          ("identifierSource", "UBERON")])
        anatomy_alt_ids = [
            OrderedDict([
                ("identifier",
                 "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                ("identifierSource", "UBERON")
            ])
        ]

    # extract subject attributes
    gender = None
    age = None
    visit_year = None
    sys_bp = None
    dias_bp = None
    disease = {}
    disease['hypertension'] = "unknown"

    for name in subj_var_values:
        name_upper = name.upper()
        if name_upper == "GENDER" or name_upper == "SEX":
            gender = subj_var_values[name]['value'].lower()
        elif name_upper == "VISIT_AGE" or name_upper == "AGE" or name_upper == "AGE_ENROLL":  #need to confirm that these  allmean the same thing
            age = subj_var_values[name]['value']
        elif name_upper == "VISIT_YEAR":
            visit_year = subj_var_values[name]['value']
        elif name_upper == "SYSBP":
            sys_bp = subj_var_values[name]['value']
        elif name_upper == "DIASBP":
            dias_bp = subj_var_values[name]['value']
        elif name_upper == "HYPERTENSION" or name_upper == "HIGHBLOODPRES":
            val = subj_var_values[name]['value']
            if val.lower() == "yes" or val == '1':
                disease['hypertension'] = "yes"
            elif re.match(r'\S', val):
                disease['hypertension'] = "no"

    # anatomical part
    anatomical_part = None
    if anatomy_name is not None:
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])

    subject_characteristics = []
    subject_bearerOfDisease = []

    # harmonized/standardized characteristics
    if gender is not None:
        subject_sex = DatsObj(
            "Dimension",
            [("name", DatsObj("Annotation", [("value", "Gender")])),
             ("description", "Gender of the subject"), ("values", [gender])])
        subject_characteristics.append(subject_sex)

    if age is not None:
        subject_age = DatsObj(
            "Dimension", [("name", DatsObj("Annotation", [("value", "Age")])),
                          ("description", "Age of the subject"),
                          ("values", [age])])
        subject_characteristics.append(subject_age)

    if visit_year is not None:
        subject_visitYear = DatsObj("Dimension", [
            ("name", DatsObj("Annotation", [("value", "Visit year")])),
            ("description", "Year of visit, to use for longitudinal analysis"),
            ("values", [visit_year])
        ])
        subject_characteristics.append(subject_visitYear)

    if sys_bp is not None:
        subject_sysBP = DatsObj(
            "Dimension",
            [("name",
              DatsObj("Annotation", [("value", "Systolic blood pressure")])),
             ("description",
              "Systolic blood pressure of subject, measured in mmHg"),
             ("values", [sys_bp])])
        subject_characteristics.append(subject_sysBP)

    if dias_bp is not None:
        subject_diasBP = DatsObj(
            "Dimension",
            [("name",
              DatsObj("Annotation", [("value", "Diastolic blood pressure")])),
             ("description",
              "Diastolic blood pressure of subject, measured in mmHg"),
             ("values", [dias_bp])])
        subject_characteristics.append(subject_diasBP)

    if disease['hypertension'] != "unknown":
        disease_name = "hypertension"
        disease_id = "10763"
        disease_identifier = OrderedDict([
            ("identifier", "DOID:" + str(disease_id)),
            ("identifierSource", "Disease Ontology")
        ])
        disease_alt_ids = [
            OrderedDict([
                ("identifier",
                 "http://purl.obolibrary.org/obo/DOID_" + str(disease_id)),
                ("identifierSource", "Disease Ontology")
            ])
        ]
        subject_hypertension = DatsObj("Disease", [
            ("name", "Hypertension"),
            ("identifier", disease_identifier),
            ("alternateIdentifiers", disease_alt_ids),
            ("diseaseStatus",
             DatsObj("Annotation", [("value", disease['hypertension']),
                                    ("valueIRI", "")])),
        ])
        subject_bearerOfDisease.append(subject_hypertension)

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]

        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])

        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            id = var_value["var"]["id"]
            dbgap_var_dim = study_md['dbgap_vars'][id]
            dim.setProperty("identifier",
                            dbgap_var_dim.get("identifier").getIdRef())

        return dim

    # create DATS Dimensions for dbGaP subject metadata
    subject_dimensions = [
        make_var_dimension(vname, subj_var_values[vname])
        for vname in sorted(subj_var_values)
    ]

    # create DATS Dimensions for dbGaP sample metadata
    sample_dimensions = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    # "raw" characteristics from dbGaP metadata
    subject_characteristics.extend(subject_dimensions)
    sample_characteristics = sample_dimensions

    human_t = util.get_taxon_human(cache)
    subj_id = subj_var_values['SUBJECT_ID']['value']
    dbgap_subj_id = subj_var_values['dbGaP_Subject_ID']['value']
    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']

    study_title = study.get("title")

    # human experimental subject/patient
    subject_material = DatsObj(
        "Material",
        [("name", subj_id), ("identifier", {
            "identifier": subj_id
        }),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_subj_id, "dbGaP")]),
         ("description", study_title + " subject " + subj_id),
         ("characteristics", subject_characteristics),
         ("bearerOfDisease", subject_bearerOfDisease), ("taxonomy", [human_t]),
         ("roles", util.get_donor_roles(cache))])

    # TODO - use DatsObjCache
    specimen_annot = util.get_annotation("specimen")
    dna_extract_annot = util.get_annotation("DNA extract")

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [subject_material]
    sample_descr = "specimen collected from subject " + subj_id
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr

    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name), ("identifier", {
            "identifier": samp_id
        }),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
         ("description", sample_descr),
         ("characteristics", sample_characteristics), ("taxonomy", [human_t]),
         ("roles", [specimen_annot]), ("derivesFrom", sample_derives_from)])

    # DNA extracted from tissue sample
    dna_descr = "DNA extracted from specimen collected from subject " + subj_id
    if anatomical_part is not None:
        dna_descr = "DNA extracted from " + anatomy_name + " specimen collected from subject " + subj_id

    dna_material = DatsObj("Material",
                           [("name", "DNA from " + sample_name),
                            ("description", dna_descr),
                            ("taxonomy", [human_t]),
                            ("roles", [dna_extract_annot]),
                            ("derivesFrom", [biological_sample_material])])

    return dna_material
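
# Hedged usage sketch (fabricated dbGaP-style variable values, illustration
# only): each variable is a dict with a 'value' key, matching the lookups
# above; 'Peripheral Blood' exercises the blood/UBERON:0000178 branch and
# 'HYPERTENSION' populates bearerOfDisease.
def _example_dna_extract_usage(cache, study, study_md):
    subj_var_values = {
        'SUBJECT_ID': {'value': 'SUBJ0001'},
        'dbGaP_Subject_ID': {'value': '123456'},
        'GENDER': {'value': 'Female'},
        'VISIT_AGE': {'value': '52'},
        'HYPERTENSION': {'value': 'yes'},
    }
    samp_var_values = {
        'SAMPLE_ID': {'value': 'SAMP0001'},
        'dbGaP_Sample_ID': {'value': '654321'},
        'BODY_SITE': {'value': 'Peripheral Blood'},
    }
    return get_single_dna_extract_json(cache, study, study_md,
                                       subj_var_values, samp_var_values)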
Example #4
def get_sample_dats_material(cache, dats_subject, study, study_md,
                             samp_var_values):

    # Almost all samples in the TOPMed WGS phase are blood samples, labeled
    # "Blood", "Peripheral Blood", etc. A few are saliva samples, probably
    # due to sample collection issues.
    name = None
    if 'BODY_SITE' in samp_var_values:
        name = 'BODY_SITE'
    elif 'Body_Site' in samp_var_values:
        name = 'Body_Site'
    elif 'Body Site' in samp_var_values:
        name = 'Body Site'

    anat_id = None
    anatomy_name = None

    if name is not None:
        if "blood" in samp_var_values[name]['value'].lower():
            anatomy_name = "blood"
            anat_id = "0000178"
        elif samp_var_values[name]['value'].lower() == "saliva":
            anatomy_name = "saliva"
            anat_id = "0001836"
        else:
            logging.fatal(
                "encountered BODY_SITE other than 'Blood' and 'Saliva' in TOPMed sample metadata - "
                + samp_var_values[name]['value'])
            sys.exit(1)

    def make_anat_part(anat_id, anatomy_name):
        # anatomical part
        anatomical_part = DatsObj("AnatomicalPart", [("name", anatomy_name)])

        if anat_id is not None:
            anatomy_identifier = OrderedDict([("identifier",
                                               "UBERON:" + str(anat_id)),
                                              ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier",
                     "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON")
                ])
            ]

            anatomical_part.set("identifier", anatomy_identifier)
            anatomical_part.set("alternateIdentifiers", anatomy_alt_ids)

        return anatomical_part

    if anatomy_name is not None:
        # use cached value for AnatomicalPart if possible
        anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
        anatomical_part = cache.get_obj_or_ref(
            anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))
    else:
        anatomical_part = None

    # create a DATS Dimension from a dbGaP variable value
    def make_var_dimension(name, var_value):
        value = var_value["value"]

        dim = DatsObj("Dimension",
                      [("name", DatsObj("Annotation", [("value", name)])),
                       ("values", [value])])

        # find existing DATS identifier for the corresponding Dataset Dimension
        if "var" in var_value:
            dbgap_var_dim = var_value["var"]["dim"]
            dim.setProperty("identifier",
                            dbgap_var_dim.get("identifier").getIdRef())
        return dim

    # create DATS Dimensions for dbGaP sample metadata
    sample_dimensions = [
        make_var_dimension(vname, samp_var_values[vname])
        for vname in sorted(samp_var_values)
    ]

    sample_characteristics = sample_dimensions
    samp_id = samp_var_values['SAMPLE_ID']['value']
    dbgap_samp_id = samp_var_values['dbGaP_Sample_ID']['value']
    study_title = study.get("title")
    specimen_annot = util.get_annotation("specimen", cache)

    # corresponding DATS subject Material
    subj_key = ":".join(["Material", dats_subject.get("name")])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)
    dats_subj_name = dats_subject.get("name")

    # biological/tissue sample
    sample_name = samp_id
    sample_derives_from = [dats_subj]

    sample_descr = "specimen collected from subject " + dats_subj_name
    if anatomical_part is not None:
        sample_derives_from.append(anatomical_part)
        sample_descr = anatomy_name + " " + sample_descr

    biological_sample_material = DatsObj(
        "Material",
        [("name", sample_name), ("identifier", {
            "identifier": samp_id
        }),
         ("alternateIdentifiers", [util.get_alt_id(dbgap_samp_id, "dbGaP")]),
         ("description", sample_descr),
         ("characteristics", sample_characteristics),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [specimen_annot]), ("derivesFrom", sample_derives_from)])

    # RNA or DNA extracted from tissue sample
    stype = "DNA"
    # TODO - check if RNA, not DNA

    dna_or_rna_descr = stype + " extracted from specimen collected from subject " + dats_subj_name
    if anatomical_part is not None:
        dna_or_rna_descr = stype + " extracted from " + anatomy_name + " specimen collected from subject " + dats_subj_name

    dna_or_rna_material = DatsObj(
        "Material",
        [("name", stype + " from " + sample_name),
         ("description", dna_or_rna_descr),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation(stype + " extract", cache)]),
         ("derivesFrom", [biological_sample_material])])
    return dna_or_rna_material
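
# Hedged usage sketch (hypothetical caller): dats_subject is the subject's
# DATS Material built elsewhere; the fabricated sample row exercises the
# cached-AnatomicalPart path for blood (UBERON:0000178).
def _example_sample_material_usage(cache, dats_subject, study, study_md):
    samp_var_values = {
        'SAMPLE_ID': {'value': 'SAMP0001'},
        'dbGaP_Sample_ID': {'value': '654321'},
        'BODY_SITE': {'value': 'Whole Blood'},
    }
    return get_sample_dats_material(cache, dats_subject, study, study_md,
                                    samp_var_values)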
Example #5
def get_dbgap_studies(qterm):
    studies = []
    study = None
    lnum = 0

    # Add newline before each occurrence of "Versions" if not already present
    lines = []
    for line in GTEX_STUDIES_STR.split('\n'):
        m = re.match(r'^(\S+.*)(Versions?.*)$', line)
        if m is None:
            lines.append(line)
        else:
            lines.append(m.group(1))
            lines.append(m.group(2))

    for line in lines:
        lnum += 1
        # blank line
        if re.match(r'^\s*$', line):
            continue
        # study id
        m = re.match(r'^(phs\S+)$', line)
        if m is not None:
            study = {'id': m.group(1)}
            studies.append(study)
            continue
        # study description
        m = re.match(r'^Genotype-Tissue Expression(.*)$', line)
        if m is not None:
            study['descr'] = m.group(1)
            continue
        # embargo release(s)
        m = re.match(r'^(Version.*)$', line)
        if m is not None:
            if 'versions' not in study:
                study['versions'] = []
            study['versions'].append(m.group(1))
            continue
        # details/participants/type of study
        m = re.match(r'^VDAS(\d+)(\D.*)Links$', line)
        if m is not None:
            study['n_participants'] = int(m.group(1))
            study['study_type'] = m.group(2)
            continue
        # platform
        m = re.match(r'^(HiSeq.*)$', line)
        if m is not None:
            study['platform'] = m.group(1)
            continue
        # parse error
        logging.fatal("unexpected content at line " + str(lnum) +
                      " of dbGaP studies: " + line)
        sys.exit(1)

    n_studies = len(studies)
    logging.info("found " + str(n_studies) + " GTEx study in dbGaP")

    # convert studies to DATS Datasets
    datasets = []
    for s in studies:
        m = re.match(r'^phs\d+\.(v\d+)\.p\d+$', s['id'])
        if m is None:
            logging.fatal(
                "unable to parse dataset/study version from study id " +
                s['id'])
            sys.exit(1)
        version = m.group(1)

        dimensions = [
            DatsObj("Dimension", [
                ("name", {
                    "value": "Actual Subject Count"
                }),
                ("description",
                 "The actual number of subjects entered into a clinical trial."
                 ), ("types", [util.get_annotation("Actual Subject Number")]),
                ("values", [s['n_participants']])
            ])
        ]

        types = [
            OrderedDict([
                ("information", util.get_annotation("DNA sequencing")),
                ("method",
                 util.get_annotation("whole genome sequencing assay")),
                ("platform", HISEQ_TYPES[s['platform']])
            ])
        ]

        # TODO - Specify creators and release date(s) of this particular dataset.
        #  This may require parsing some of the metadata files and/or documents.
        # TODO - required field - using NIH NHGRI as placeholder, but need to revisit and assign specific study-level creator
        creators = [NIH_NHGRI]

        # TODO - find better location for study_type?
        extra_props = [
            DatsObj("CategoryValuesPair", [("category", "study_type"),
                                           ("values", [s['study_type']])])
        ]

        # Dataset
        dataset = DatsObj(
            "Dataset",
            [
                ("identifier", DatsObj("Identifier",
                                       [("identifier", s['id'])])),
                ("version", version),
                #                ("dates", []),
                #("title", s['descr']),
                ("title",
                 "Genotype-Tissue Expression Project (GTEx) WGS and RNA-Seq data"
                 ),
                ("storedIn", DB_GAP),
                ("types", types),
                ("creators", creators),
                ("dimensions", dimensions),
                ("extraProperties", extra_props)
                #                ("producedBy", data_analysis),
                #               ("distributions", [DatsObj("DatasetDistribution", [
                #                                ("access", DatsObj("Access", [
                #                                            ("landingPage", GTEX_DATASETS_URL)
                #                                            ]))
                #                                ])]),
            ])

        datasets.append(dataset)

    return datasets
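
# Hedged usage sketch: get_dbgap_studies parses the embedded GTEX_STUDIES_STR
# dump rather than querying dbGaP, so qterm is effectively unused in the body;
# "phs000424" is the GTEx accession used in DBGAP_GTEX_QUERY_URL.
def _example_dbgap_studies_usage():
    for ds in get_dbgap_studies("phs000424"):
        print(ds.get("version"), ds.get("title"))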
Example #6
# TODO - incorporate:
#HumanOmni5-Quad
#HumanOmni2.5
#Infinium HumanExome BeadChip
#HiSeq 2000
#HiSeq 2000
#GeneChip Human Gene 1.0 ST Array
#HiSeq 2000
#HiSeq X Ten

DBGAP_QUERY_URL_PREFIX = 'https://www.ncbi.nlm.nih.gov/gap/?term='
DBGAP_GTEX_QUERY_URL = DBGAP_QUERY_URL_PREFIX + 'phs000424'

HISEQ_TYPES = {
    "HiSeq 2000": util.get_annotation("Illumina HiSeq 2000"),
    "HiSeq X Ten": util.get_annotation("Illumina HiSeq X Ten"),
}

# TODO - duplicated from rnaseq_datasets.py
DB_GAP = DatsObj("DataRepository", [("name", "dbGaP")])

NIH_NHGRI = DatsObj("Organization",
                    [("name", "National Human Genome Research Institute"),
                     ("abbreviation", "NHGRI")])

# TODO - use DatsObjCache
cache = None
GTEX_V7_RNASEQ_TYPE = DatsObj(
    "DataType",
    [("information", util.get_annotation("transcription profiling", cache)),
     # method/platform assumed to match the rnaseq DataType built in
     # get_files_dats_datasets; the snippet was truncated at this point
     ("method", util.get_annotation("RNA-seq assay", cache)),
     ("platform", util.get_annotation("Illumina", cache))])
Example #7
def get_files_dats_datasets(cache, dats_samples_d, p_samples, gh_samples,
                            protected_cram_files, no_circular_links):
    file_datasets = []

    rnaseq_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("transcription profiling", cache)),
        ("method", util.get_annotation("RNA-seq assay", cache)),
        ("platform", util.get_annotation("Illumina", cache))
    ])

    def get_rnaseq_datatype():
        dkey = ".".join(["DataType", "RNA-seq"])
        return cache.get_obj_or_ref(dkey, lambda: rnaseq_datatype)

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay",
                                        cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    broad_key = ":".join(["Organization", "Broad Institute"])
    broad = cache.get_obj_or_ref(
        broad_key,
        lambda: DatsObj("Organization", [("name", "Broad Institute")]))
    creators = [broad]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_ds = cache.get_obj_or_ref(cram_ds_key,
                                   lambda: make_data_standard("CRAM"))

    crai_ds_key = ":".join(["DataStandard", "CRAI"])
    crai_ds = cache.get_obj_or_ref(crai_ds_key,
                                   lambda: make_data_standard("CRAI"))

    for sample_id in protected_cram_files:
        file = protected_cram_files[sample_id]
        material_type = None
        ds_types = None

        # determine file type
        if re.search(r'wgs\/', file['cram_file_aws']['raw_value']):
            material_type = 'DNA'
            ds_types = get_wgs_datatype()
            gcp_suffix = '_gcp'
        elif re.search(r'rnaseq\/', file['cram_file_aws']['raw_value']):
            material_type = 'RNA'
            ds_types = get_rnaseq_datatype()
            gcp_suffix = ''
        else:
            logging.fatal(
                "unable to determine material/sequence type from cram_file_aws="
                + file['cram_file_aws']['raw_value'])
            sys.exit(1)

        # RNA-Seq manifest keys: sample_id, cram_file, cram_file_md5,
        #   cram_file_size, cram_index, cram_file_aws, cram_index_aws
        # WGS manifest keys: same as above, plus firecloud_id
        cram_file = file['cram_file' + gcp_suffix]['raw_value']
        cram_file_md5 = file['cram_file_md5']['raw_value']

        # TODO - review the following encoding decisions:
        #  - storing .crai URI as relatedIdentifier of the DatasetDistribution for the .cram file
        #  - storing MD5 checksum of the .cram file as an extraProperty of the DatasetDistribution
        #  - storing firecloud_id as a relatedIdentifier of the Dataset (not the DatasetDistribution)

        # Google Cloud Platform / Google Storage copy
        gs_access = DatsObj(
            "Access",
            [("landingPage", file['cram_file' + gcp_suffix]['raw_value'])])
        gs_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_access),
                ("identifier",
                 DatsObj("Identifier",
                         [("identifier",
                           file['cram_file' + gcp_suffix]['raw_value'])])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier",
                              file['cram_index' + gcp_suffix]['raw_value']),
                             ("relationType", "cram_index")])
                ]),
                ("size", int(file['cram_file_size']['raw_value'])),
                # TODO - add unit for bytes, include IRI?
                #                ("unit", util.get_value_annotation("bytes", cache))
                ("conformsTo", [
                    cache.get_obj_or_ref(cram_ds_key,
                                         lambda: make_data_standard("CRAM"))
                ])
            ])

        # AWS / S3 copy
        s3_access = DatsObj(
            "Access", [("landingPage", file['cram_file_aws']['raw_value'])])
        s3_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_access),
                ("identifier",
                 DatsObj("Identifier", [
                     ("identifier", file['cram_file_aws']['raw_value'])
                 ])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier", [
                        ("identifier", file['cram_index_aws']['raw_value']),
                        ("relationType", "cram_index")
                    ])
                ]),
                ("size", int(file['cram_file_size']['raw_value'])),
                # TODO - add unit for bytes, include IRI?
                #                ("unit", util.get_value_annotation("bytes", cache))
                ("conformsTo", [
                    cache.get_obj_or_ref(cram_ds_key,
                                         lambda: make_data_standard("CRAM"))
                ])
            ])

        m = re.match(r'^.*\/([^\/]+)$', cram_file)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " +
                          cram_file)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [file['cram_file_md5']['raw_value']])])

        ds = DatsObj("Dataset", [
            ("distributions", [gs_distro, s3_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [ds_types]),
            ("creators", creators),
        ])

        # add firecloud_id for WGS
        if 'firecloud_id' in file:
            f_id = DatsObj("RelatedIdentifier",
                           [("identifier", file['firecloud_id']['raw_value']),
                            ("identifierSource", "FireCloud")])
            ds.set("relatedIdentifiers", [f_id])

        # input RNA/DNA extract that was sequenced
        if sample_id not in dats_samples_d:
            logging.fatal("no sample exists for " + sample_id +
                          " found in file " +
                          file['cram_file_aws']['raw_value'])
            sys.exit(1)

        dats_sample = dats_samples_d[sample_id]
        dats_samp_key = ":".join(["Material", dats_sample.get("name")])
        dats_samp = cache.get_obj_or_ref(dats_samp_key, lambda: dats_sample)

        da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_samp])
             #                ("uses", [])                          # software used
             ])

        if not no_circular_links:
            # circular link back to enclosing Dataset as the output
            da.set("output", [ds.getIdRef()])

        ds.set("producedBy", da)
        file_datasets.append(ds)

    return file_datasets
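
# Hedged usage sketch (hypothetical driver): protected_cram_files maps sample
# ID to a manifest row with the cram_file*/cram_index*/cram_file_size/
# cram_file_md5 columns read above. Note that p_samples and gh_samples are
# accepted but not referenced in this function body.
def _example_gtex_file_datasets(cache, dats_samples_d, protected_cram_files):
    return get_files_dats_datasets(cache, dats_samples_d, p_samples=None,
                                   gh_samples=None,
                                   protected_cram_files=protected_cram_files,
                                   no_circular_links=False)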
Example #8
def get_sample_dats_material(cache, dats_subject, p_sample, gh_sample,
                             var_lookup):
    samp_id = p_sample['SAMPID']['mapped_value']
    subj_id = p_sample['SUBJID']['mapped_value']

    # retrieve id reference for the Identifier of the DATS Dimension for the "all subjects" consent group version of the variable
    def get_var_id(name):
        return var_lookup[name]['dim'].get("identifier").getIdRef()

    # Uberon id (or EFO id, contrary to the documentation)
    anat_id = p_sample['SMUBRID']['mapped_value']
    if anat_id is None:
        print("No Uberon/anatomy ID specified for sample " + samp_id)
        sys.exit(1)

    anatomy_identifier = None
    anatomy_alt_ids = None
    # TODO - query anatomy term from UBERON/EFO instead?
    anatomy_name = p_sample['SMTSD']['mapped_value']

    def make_anat_part(anat_id, anatomy_name):
        # EFO id
        if re.match(r'^EFO_\d+', anat_id):
            anatomy_identifier = OrderedDict([("identifier", anat_id),
                                              ("identifierSource", "EFO")])
            anatomy_alt_ids = [
                OrderedDict([(
                    "identifier",
                    "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form="
                    + str(anat_id)), ("identifierSource", "EFO")])
            ]
        # Uberon id
        else:
            anatomy_identifier = OrderedDict([("identifier",
                                               "UBERON:" + str(anat_id)),
                                              ("identifierSource", "UBERON")])
            anatomy_alt_ids = [
                OrderedDict([
                    ("identifier",
                     "http://purl.obolibrary.org/obo/UBERON_" + str(anat_id)),
                    ("identifierSource", "UBERON")
                ])
            ]

        # anatomical part
        anatomical_part = DatsObj("AnatomicalPart",
                                  [("name", anatomy_name),
                                   ("identifier", anatomy_identifier),
                                   ("alternateIdentifiers", anatomy_alt_ids)])

        return anatomical_part

    # use cached value for AnatomicalPart if possible
    anat_part_key = ":".join(["AnatomicalPart", anatomy_name])
    anatomical_part = cache.get_obj_or_ref(
        anat_part_key, lambda: make_anat_part(anat_id, anatomy_name))

    # use URI from GitHub GTEx id dump if available
    identifier = samp_id
    if gh_sample is not None:
        identifier = gh_sample['Destination URL']['raw_value']

    subj_key = ":".join(["Material", dats_subject.get("name")])
    dats_subj = cache.get_obj_or_ref(subj_key, lambda: dats_subject)

    # add sample characteristics from p_sample metadata
    sample_chars = []
    for key in p_sample:
        if re.match(r'^(subject|id)$', key):
            continue
        # TODO - currently including only a small subset of the available values for demonstration purposes
        if not re.match(r'^(SMATSSCR|SMRIN|SMMAPRT|SMGNSDTC)$', key):
            continue
        var = p_sample[key]
        #        print("got key=" + key + " var=" + str(var))
        mapped_val = var['mapped_value']
        char = DatsObj("Dimension",
                       [("name", util.get_value_annotation(key, cache)),
                        ("identifier", get_var_id(key)),
                        ("values", [mapped_val])])
        sample_chars.append(char)

    # biological/tissue sample
    biological_sample_material = DatsObj(
        "Material",
        [("name", samp_id), ("identifier", {
            "identifier": identifier
        }),
         ("description",
          anatomy_name + " specimen collected from subject " + subj_id),
         ("characteristics", sample_chars),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation("specimen", cache)]),
         ("derivesFrom", [dats_subj, anatomical_part])])

    # analysis freeze classification
    smafrze = p_sample['SMAFRZE']['mapped_value']
    # expected sequence type depending on data freeze classification
    expected_stype = None

    if smafrze == "RNASEQ":
        expected_stype = "RNA"
    elif smafrze == "WGS":
        expected_stype = "DNA"
    elif smafrze == "WES":
        expected_stype = "DNA"
    # Illumina OMNI SNP Array
    elif smafrze == "OMNI":
        expected_stype = "DNA"
    elif smafrze == "EXCLUDE":
        pass
    else:
        logging.fatal("unknown SMAFRZE " + smafrze)
        sys.exit(1)

    # sample type - DNA or RNA
    stype = None
    smnabtcht = p_sample['SMNABTCHT']['mapped_value']
    if re.match(r'^DNA ([iI]solation|[eE]xtraction).*', smnabtcht):
        stype = 'DNA'
    elif re.match(r'^RNA ([iI]solation|[eE]xtraction).*', smnabtcht):
        stype = 'RNA'
    elif re.match(
            r'DNA or RNA Extraction from Paxgene-derived Lysate Plate Based',
            smnabtcht):
        stype = 'RNA'
    elif re.match(r'Transfer To Matrix \(Manual\)', smnabtcht):
        stype = 'DNA'

    if stype is None:
        if expected_stype is not None:
            stype = expected_stype
        else:
            print("couldn't determine sequence type for smafrze=" + smafrze +
                  " smnabtcht=" + smnabtcht)
            return None
    else:
        if (expected_stype is not None) and (stype != expected_stype):
            logging.fatal("seq type " + stype +
                          " doesn't match expected stype " + expected_stype)
            sys.exit(1)

    # DNA or RNA extract
    dna_or_rna_material = DatsObj(
        "Material",
        [("name", stype + " from " + samp_id),
         ("description", "total " + stype + " extracted from " + anatomy_name +
          " specimen collected from subject " + subj_id),
         ("taxonomy", [util.get_taxon_human(cache)]),
         ("roles", [util.get_annotation(stype + " extract", cache)]),
         ("derivesFrom", [biological_sample_material])])

    return dna_or_rna_material
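
# Hedged usage sketch (fabricated GTEx attribute row, illustration only):
# only the columns read above are populated; none of the demonstration
# characteristics (SMATSSCR/SMRIN/SMMAPRT/SMGNSDTC) are present, so
# var_lookup is not consulted in this particular call.
def _example_gtex_sample_material(cache, dats_subject, var_lookup):
    p_sample = {
        'SAMPID': {'mapped_value': 'GTEX-1117F-0226-SM-5GZZ7'},
        'SUBJID': {'mapped_value': 'GTEX-1117F'},
        'SMUBRID': {'mapped_value': '0002190'},
        'SMTSD': {'mapped_value': 'Adipose - Subcutaneous'},
        'SMAFRZE': {'mapped_value': 'RNASEQ'},
        'SMNABTCHT': {'mapped_value': 'RNA Extraction From Paxgene-derived '
                                      'Lysate Plate Based'},
    }
    return get_sample_dats_material(cache, dats_subject, p_sample,
                                    gh_sample=None, var_lookup=var_lookup)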