Пример #1
0
    def _process_phenotype_data(self, limit):
        """
        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        gu = GraphUtils(curie_map.get())
        fname = '/'.join((self.rawdir, self.files['catalog']['file']))

        self.strain_hash = {}
        self.id_label_hash = {}
        genes_with_no_ids = set()
        stem_cell_class = 'CL:0000034'
        mouse_taxon = 'NCBITaxon:10090'
        geno = Genotype(g)
        with open(fname, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # skip the first 3 lines which are header, etc.
                if line_counter < 4:
                    continue

                (strain_id, strain_label, strain_type_symbol, strain_state,
                 mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
                 mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
                 mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
                 research_areas) = row

                if self.testMode and (strain_id not in self.test_ids):
                    continue

                # strip off stuff after the dash -
                # is the holding center important?
                # MMRRC:00001-UNC --> MMRRC:00001
                strain_id = re.sub(r'-\w+$', '', strain_id)

                self.id_label_hash[strain_id] = strain_label

                # get the variant or gene to save for later building of
                # the genotype
                if strain_id not in self.strain_hash:
                    self.strain_hash[strain_id] = {'variants': set(),
                                                   'genes': set()}

                # clean up the bad one
                if mgi_allele_id == 'multiple mutation':
                    logger.error("Erroneous gene id: %s", mgi_allele_id)
                    mgi_allele_id = ''

                if mgi_allele_id != '':
                    self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                    self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                    # use the following if needing to add the
                    # sequence alteration types
                    # var_type =
                    #   self._get_variant_type_from_abbrev(mutation_type)
                    # make a sequence alteration for this variant locus,
                    # and link the variation type to it
                    # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                    # if self.nobnodes:
                    #     sa_id = ':'+sa_id
                    # gu.addIndividualToGraph(g, sa_id, None, var_type)
                    # geno.addSequenceAlterationToVariantLocus(sa_id,
                    #                                          mgi_allele_id)

                # scrub out any spaces
                mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
                if mgi_gene_id.strip() != '':
                    if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                        mgi_gene_id = re.sub(r'Gene\s*ID:\s*', 'NCBIGene:',
                                             mgi_gene_id)
                    elif not re.match(r'MGI', mgi_gene_id):
                        logger.info("Gene id not recognized: %s", mgi_gene_id)
                        if re.match(r'\d+$', mgi_gene_id):
                            # assume that if it's all numbers, then it's MGI
                            mgi_gene_id = 'MGI:'+str(mgi_gene_id)
                            logger.info("Assuming numerics are MGI.")
                    self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                    self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

                # catch some errors -
                # some things have gene labels, but no identifiers - report
                if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                    logger.error(
                        "Gene label with no identifier for strain %s: %s",
                        strain_id, mgi_gene_symbol)
                    genes_with_no_ids.add(mgi_gene_symbol.strip())
                    # make a temp id for genes that aren't identified
                    # tmp_gene_id = '_'+mgi_gene_symbol
                    # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol
                    # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

                # split apart the mp ids
                # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
                # mp_ids are now a comma delimited list
                # with MP terms in brackets
                phenotype_ids = []
                if mp_ids != '':
                    for i in re.split(r',', mp_ids):
                        i = i.strip()
                        mps = re.search(r'\[(.*)\]', i)
                        if mps is not None:
                            mp_id = mps.group(1).strip()
                            phenotype_ids.append(mp_id)

                # pubmed ids are space delimited
                pubmed_ids = []
                if pubmed_nums.strip() != '':
                    for i in re.split(r'\s+', pubmed_nums):
                        pmid = 'PMID:'+i.strip()
                        pubmed_ids.append(pmid)
                        r = Reference(pmid,
                                      Reference.ref_types['journal_article'])
                        r.addRefToGraph(g)

                # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
                # is a good example of 4 genotype parts

                gu.addClassToGraph(g, mouse_taxon, None)
                if research_areas.strip() == '':
                    research_areas = None
                else:
                    research_areas = 'Research Areas: '+research_areas
                strain_type = mouse_taxon
                if strain_state == 'ES':
                    strain_type = stem_cell_class
                gu.addIndividualToGraph(
                    g, strain_id, strain_label, strain_type,
                    research_areas)  # an inst of mouse??
                gu.makeLeader(g, strain_id)

                # phenotypes are associated with the alleles
                for pid in phenotype_ids:
                    # assume the phenotype label is in the ontology
                    gu.addClassToGraph(g, pid, None)
                    if mgi_allele_id is not None and mgi_allele_id != '':
                        assoc = G2PAssoc(self.name, mgi_allele_id, pid,
                                         gu.object_properties['has_phenotype'])
                        for p in pubmed_ids:
                            assoc.add_source(p)
                        assoc.add_association_to_graph(g)
                    else:
                        logger.info("Phenotypes and no allele for %s",
                                    strain_id)

                if not self.testMode and (
                        limit is not None and line_counter > limit):
                    break

            # now that we've collected all of the variant information, build it
            # we don't know their zygosities
            for s in self.strain_hash:
                h = self.strain_hash.get(s)
                variants = h['variants']
                genes = h['genes']
                vl_set = set()
                # make variant loci for each gene
                if len(variants) > 0:
                    for v in variants:
                        vl_id = v
                        vl_symbol = self.id_label_hash[vl_id]
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        vl_set.add(vl_id)
                        if len(variants) == 1 and len(genes) == 1:
                            for gene in genes:
                                geno.addAlleleOfGene(vl_id, gene)
                        else:
                            geno.addAllele(vl_id, vl_symbol)
                else:  # len(vars) == 0
                    # it's just anonymous variants in some gene
                    for gene in genes:
                        vl_id = '_'+gene+'-VL'
                        vl_id = re.sub(r':', '', vl_id)
                        if self.nobnodes:
                            vl_id = ':'+vl_id
                        vl_symbol = self.id_label_hash[gene]+'<?>'
                        self.id_label_hash[vl_id] = vl_symbol
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        geno.addGene(gene, self.id_label_hash[gene])
                        geno.addAlleleOfGene(vl_id, gene)
                        vl_set.add(vl_id)

                # make the vslcs
                vl_list = sorted(vl_set)
                vslc_list = []
                for vl in vl_list:
                    # for unknown zygosity
                    vslc_id = '_'+re.sub(r'^_', '', vl)+'U'
                    vslc_id = re.sub(r':', '', vslc_id)
                    if self.nobnodes:
                        vslc_id = ':' + vslc_id
                    vslc_label = self.id_label_hash[vl] + '/?'
                    self.id_label_hash[vslc_id] = vslc_label
                    vslc_list.append(vslc_id)
                    geno.addPartsToVSLC(
                        vslc_id, vl, None, geno.zygosity['indeterminate'],
                        geno.object_properties['has_alternate_part'], None)
                    gu.addIndividualToGraph(
                        g, vslc_id, vslc_label,
                        geno.genoparts['variant_single_locus_complement'])
                if len(vslc_list) > 0:
                    if len(vslc_list) > 1:
                        gvc_id = '-'.join(vslc_list)
                        gvc_id = re.sub(r':', '', gvc_id)
                        if self.nobnodes:
                            gvc_id = ':'+gvc_id
                        gvc_label = \
                            '; '.join(self.id_label_hash[v] for v in vslc_list)
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])
                        for vslc_id in vslc_list:
                            geno.addVSLCtoParent(vslc_id, gvc_id)
                    else:
                        # the GVC == VSLC, so don't have to make an extra piece
                        gvc_id = vslc_list.pop()
                        gvc_label = self.id_label_hash[gvc_id]

                    genotype_label = gvc_label + ' [n.s.]'
                    bkgd_id = \
                        '_' + re.sub(r':', '', '-'.join(
                            (geno.genoparts['unspecified_genomic_background'],
                             s)))
                    genotype_id = '-'.join((gvc_id, bkgd_id))
                    if self.nobnodes:
                        bkgd_id = ':'+bkgd_id
                    geno.addTaxon(mouse_taxon, bkgd_id)
                    geno.addGenomicBackground(
                        bkgd_id, 'unspecified ('+s+')',
                        geno.genoparts['unspecified_genomic_background'],
                        "A placeholder for the " +
                        "unspecified genetic background for "+s)
                    geno.addGenomicBackgroundToGenotype(
                        bkgd_id, genotype_id,
                        geno.genoparts['unspecified_genomic_background'])
                    geno.addParts(
                        gvc_id, genotype_id,
                        geno.object_properties['has_alternate_part'])
                    geno.addGenotype(genotype_id, genotype_label)
                    gu.addTriple(
                        g, s, geno.object_properties['has_genotype'],
                        genotype_id)
                else:
                    # logger.debug(
                    #   "Strain %s is not making a proper genotype.", s)
                    pass

            gu.loadProperties(
                g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
            gu.loadProperties(
                g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
            gu.loadProperties(
                g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
            gu.loadAllProperties(g)

            logger.warning(
                "The following gene symbols did not list identifiers: %s",
                str(sorted(list(genes_with_no_ids))))

        return
Пример #2
0
    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        gu.loadAllProperties(g)
        f = Feature(None, None, None)
        f.loadAllProperties(g)

        gu.loadAllProperties(g)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:'+tax_num
        tax_label = 'Human'
        gu.addClassToGraph(g, tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)",
                        num_cols, expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                tax_num, 'CHR')
                            geno.addChromosomeInstance(
                                cytogenetic_loc, build_id, assembly, band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                assembly, 'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(
                        str(chr), build_id, assembly, chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph(g)
                f.addTaxonToFeature(g, tax_id)
                # make the ClinVarVariant the clique leader
                gu.makeLeader(g, seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(g, bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    gu.addSynonym(g, seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    gu.addSynonym(g, seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                    gu.addIndividualToGraph(g, dbsnp_id, None)
                    gu.addSameIndividual(g, seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:'+dbvar_num
                    gu.addIndividualToGraph(g, dbvar_id, None)
                    gu.addSameIndividual(g, seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        gu.addIndividualToGraph(g, rcv_id, None)
                        gu.addXref(g, seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    gu.addClassToGraph(g, gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_'+gene_num+'-'+variant_num
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    vl_label = allele_name
                    gu.addIndividualToGraph(
                        g, vl_id, vl_label, geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info(
                            "No gene listed for variant %d",
                            int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for p in pheno_list:
                        m = re.match(r"(Orphanet:ORPHA(?:\s*ORPHA)?)", p)
                        if m is not None and len(m.groups()) > 0:
                            p = re.sub(m.group(1), 'Orphanet:', p.strip())
                        elif re.match(r'SNOMED CT', p):
                            p = re.sub(r'SNOMED CT', 'SNOMED', p.strip())

                        assoc = G2PAssoc(self.name, seqalt_id, p.strip())
                        assoc.add_association_to_graph(g)

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:'+xrefid.split(':')[1]
                            gu.addIndividualToGraph(g, xrefid, None)
                            gu.addSameIndividual(g, seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            gu.addIndividualToGraph(g, xrefid, None)
                            gu.addSameIndividual(g, seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
        gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)

        logger.info("Finished parsing variants")

        return
Пример #3
0
    def _get_var_citations(self, limit):

        # Generated weekly, the first of the week
        # A tab-delimited report of citations associated with data in ClinVar,
        # connected to the AlleleID, the VariationID, and either rs# from dbSNP
        # or nsv in dbVar.
        #
        # AlleleID          int value  (xpath //Measure/@ID )
        # VariationID       ID ClinVar uses to anchor default display.
        #                   (xpath  //MeasureSet/@ID)
        # rs			    rs identifier from dbSNP
        # nsv				nsv identifier from dbVar
        # citation_source	The source of the citation, either PubMed,
        #                   PubMedCentral, or the NCBI Bookshelf
        # citation_id		The identifier used by that source

        gu = GraphUtils(curie_map.get())
        logger.info("Processing Citations for variants")
        line_counter = 0
        myfile = \
            '/'.join((self.rawdir, self.files['variant_citations']['file']))
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        with open(myfile, 'r', encoding="utf8") as f:
            filereader = csv.reader(f, delimiter='\t', quotechar='\"')

            for line in filereader:
                # skip comments
                line = line
                if re.match(r'^#', line[0]):
                    continue
                (allele_num, variant_num, rs_num, nsv_num, citation_source,
                 citation_id) = line

                line_counter += 1

                if self.testMode:
                    if int(variant_num) not in self.variant_ids:
                        continue

                if citation_id.strip() == '':
                    logger.info(
                        "Skipping blank citation for ClinVarVariant:%s",
                        str(variant_num))
                    continue

                # the citation for a variant is made to some kind of
                # combination of the ids here.
                # but i'm not sure which, we don't know what the
                # citation is for exactly, other than the variant.
                # so use mentions

                var_id = 'ClinVarVariant:'+variant_num

                # citation source: PubMed | PubMedCentral | citation_source
                # citation id:
                # format the citation id:
                ref_id = None
                if citation_source == 'PubMed':
                    ref_id = 'PMID:'+str(citation_id)
                    gu.makeLeader(g, ref_id)
                elif citation_source == 'PubMedCentral':
                    ref_id = 'PMCID:'+str(citation_id)
                if ref_id is not None:
                    r = Reference(
                        ref_id, Reference.ref_types['journal_article'])
                    r.addRefToGraph(g)
                    gu.addTriple(
                        g, ref_id, self.properties['is_about'], var_id)

                if not self.testMode \
                        and (limit is not None and line_counter > limit):
                    break

        logger.info("Finished processing citations for variants")

        return
Пример #4
0
    def process_catalog(self, limit=None):
        """
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files['catalog']['file']))
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        tax_id = 'NCBITaxon:9606'  # hardcode
        genome_version = 'GRCh38'  # hardcode

        # build a hashmap of genomic location to identifiers,
        # to try to get the equivalences

        loc_to_id_hash = {}

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    pass
                else:
                    line_counter += 1
                    (date_added_to_catalog, pubmed_num, first_author, pub_date,
                     journal, link, study_name, disease_or_trait,
                     initial_sample_description, replicate_sample_description,
                     region, chrom_num, chrom_pos, reported_gene_nums,
                     mapped_gene, upstream_gene_num, downstream_gene_num,
                     snp_gene_nums, upstream_gene_distance,
                     downstream_gene_distance, strongest_snp_risk_allele, snps,
                     merged, snp_id_current, context, intergenic_flag,
                     risk_allele_frequency, pvalue, pvalue_mlog, pvalue_text,
                     or_or_beta, confidence_interval_95,
                     platform_with_snps_passing_qc, cnv_flag, mapped_trait,
                     mapped_trait_uri) = row

                    intersect = \
                        list(set([str(i) for i in self.test_ids['gene']]) &
                             set(re.split(r',', snp_gene_nums)))
                    # skip if no matches found in test set
                    if self.testMode and len(intersect) == 0:
                        continue

# 06-May-2015	25917933	Zai CC	20-Nov-2014	J Psychiatr Res	http://europepmc.org/abstract/MED/25917933
# A genome-wide association study of suicide severity scores in bipolar disorder.
# Suicide in bipolar disorder
# 959 European ancestry individuals	NA
# 10p11.22	10	32704340	C10orf68, CCDC7, ITGB1	CCDC7
# rs7079041-A	rs7079041	0	7079041	intron	0		2E-6	5.698970
                    if chrom_num != '' and chrom_pos != '':
                        loc = 'chr'+str(chrom_num)+':'+str(chrom_pos)
                        if loc not in loc_to_id_hash:
                            loc_to_id_hash[loc] = set()
                    else:
                        loc = None

                    if re.search(r' x ', strongest_snp_risk_allele) \
                            or re.search(r',', strongest_snp_risk_allele):
                        # TODO deal with haplotypes
                        logger.warning(
                            "We can't deal with haplotypes yet: %s",
                            strongest_snp_risk_allele)
                        continue
                    elif re.match(r'rs', strongest_snp_risk_allele):
                        rs_id = 'dbSNP:'+strongest_snp_risk_allele.strip()
                        # remove the alteration
                    elif re.match(r'kgp', strongest_snp_risk_allele):
                        # FIXME this isn't correct
                        rs_id = 'dbSNP:'+strongest_snp_risk_allele.strip()
                        # http://www.1000genomes.org/faq/what-are-kgp-identifiers
                        # for some information
                        # They were created by Illumina for their genotyping
                        # platform before some variants identified during the
                        # pilot phase of the project had been assigned
                        # rs numbers.
                    elif re.match(r'chr', strongest_snp_risk_allele):
                        # like: chr10:106180121-G
                        rs_id = ':gwas-' + \
                            re.sub(
                                r':', '-', strongest_snp_risk_allele.strip())
                    elif strongest_snp_risk_allele.strip() == '':
                        # logger.debug(
                        #    "No strongest SNP risk allele for %s:\n%s",
                        #    pubmed_num, str(row))
                        # FIXME still consider adding in the EFO terms
                        # for what the study measured?
                        continue
                    else:
                        logger.warning(
                            "There's a snp id i can't manage: %s",
                            strongest_snp_risk_allele)
                        continue

                    alteration = re.search(r'-(.*)$', rs_id)
                    if alteration is not None \
                            and re.match(r'[ATGC]', alteration.group(1)):
                        # add variation to snp
                        pass  # TODO
                    rs_id = re.sub(r'-.*$', '', rs_id).strip()
                    if loc is not None:
                        loc_to_id_hash[loc].add(rs_id)

                    pubmed_id = 'PMID:'+pubmed_num

                    r = Reference(
                        pubmed_id, Reference.ref_types['journal_article'])
                    r.addRefToGraph(g)

                    # create the chromosome
                    chrom_id = makeChromID(chrom_num, genome_version, 'CHR')

                    # add the feature to the graph
                    snp_description = None
                    if risk_allele_frequency != '' and \
                            risk_allele_frequency != 'NR':
                        snp_description = \
                            str(risk_allele_frequency) + \
                            ' [risk allele frequency]'

                    f = Feature(
                        rs_id, strongest_snp_risk_allele.strip(),
                        Feature.types[r'SNP'], snp_description)
                    if chrom_num != '' and chrom_pos != '':
                        f.addFeatureStartLocation(chrom_pos, chrom_id)
                        f.addFeatureEndLocation(chrom_pos, chrom_id)
                    f.addFeatureToGraph(g)
                    f.addTaxonToFeature(g, tax_id)
                    # TODO consider adding allele frequency as property;
                    # but would need background info to do that

                    # also want to add other descriptive info about
                    # the variant from the context
                    for c in re.split(r';', context):
                        cid = self._map_variant_type(c.strip())
                        if cid is not None:
                            gu.addType(g, rs_id, cid)

                    # add deprecation information
                    if merged == 1 and str(snp_id_current.strip()) != '':
                        # get the current rs_id
                        current_rs_id = 'dbSNP:'
                        if not re.match(r'rs', snp_id_current):
                            current_rs_id += 'rs'
                        if loc is not None:
                            loc_to_id_hash[loc].append(current_rs_id)
                        current_rs_id += str(snp_id_current)
                        gu.addDeprecatedIndividual(g, rs_id, current_rs_id)
                        # TODO check on this
                        # should we add the annotations to the current
                        # or orig?
                        gu.makeLeader(g, current_rs_id)
                    else:
                        gu.makeLeader(g, rs_id)

                    # add the feature as a sequence alteration
                    # affecting various genes
                    # note that intronic variations don't necessarily list
                    # the genes such as for rs10448080  FIXME
                    if snp_gene_nums != '':
                        for s in re.split(r',', snp_gene_nums):
                            s = s.strip()
                            # still have to test for this,
                            # because sometimes there's a leading comma
                            if s != '':
                                gene_id = 'NCBIGene:'+s
                                geno.addAlleleOfGene(rs_id, gene_id)

                    # add the up and downstream genes if they are available
                    if upstream_gene_num != '':
                        downstream_gene_id = 'NCBIGene:'+downstream_gene_num
                        gu.addTriple(
                            g, rs_id,
                            Feature.object_properties[
                                r'upstream_of_sequence_of'],
                            downstream_gene_id)
                    if downstream_gene_num != '':
                        upstream_gene_id = 'NCBIGene:'+upstream_gene_num
                        gu.addTriple(
                            g, rs_id,
                            Feature.object_properties[
                                'downstream_of_sequence_of'],
                            upstream_gene_id)

                    description = 'A study of ' + disease_or_trait + \
                        ' in ' + initial_sample_description
                    if replicate_sample_description != '':
                        description = \
                            ' '.join(
                                (description, 'with',
                                 replicate_sample_description))
                    if platform_with_snps_passing_qc != '':
                        description = ' '.join(
                            (description, 'on platform',
                             platform_with_snps_passing_qc))
                    description = ' '.join((description, '(p='+pvalue+')'))

                    # make associations to the EFO terms; there can be >1
                    if mapped_trait_uri.strip() != '':
                        for t in re.split(r',', mapped_trait_uri):
                            t = t.strip()

                            cu = CurieUtil(curie_map.get())
                            tid = cu.get_curie(t)

                            assoc = G2PAssoc(
                                self.name, rs_id, tid,
                                gu.object_properties['contributes_to'])
                            assoc.add_source(pubmed_id)
                            # combinatorial evidence
                            # used in automatic assertion
                            eco_id = 'ECO:0000213'
                            assoc.add_evidence(eco_id)

                            # assoc.set_description(description)
                            # FIXME score should get added to provenance/study
                            # assoc.set_score(pvalue)
                            assoc.add_association_to_graph(g)

                    if not self.testMode and\
                            (limit is not None and line_counter > limit):
                        break

            Assoc(self.name).load_all_properties(g)

        # loop through the location hash,
        # and make all snps at that location equivalent
        for l in loc_to_id_hash:
            snp_ids = loc_to_id_hash[l]
            if len(snp_ids) > 1:
                logger.info("%s has >1 snp id: %s", l, str(snp_ids))
        return