Пример #1
0
    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if limit is not None:
            logger.info("Only parsing first %s rows fo each file", str(limit))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            g = self.testgraph
        else:
            g = self.graph

        tmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
        self._process_trait_mappings(tmap, limit)

        geno = Genotype(g)
        # organisms  = ['chicken']
        organisms = [
            'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

        for o in organisms:
            tax_id = self._get_tax_by_common_name(o)
            geno.addGenome(tax_id, o)
            build_id = None
            build = None

            k = o+'_bp'
            if k in self.files:
                file = self.files[k]['file']
                m = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', file)
                if m is None:
                    logger.error("Can't match a gff build")
                else:
                    build = m.group(1)
                    build_id = self._map_build_by_abbrev(build)
                    logger.info("Build = %s", build_id)
                    geno.addReferenceGenome(build_id, build, tax_id)
                if build_id is not None:
                    self._process_QTLs_genomic_location(
                        '/'.join((self.rawdir, file)), tax_id, build_id, build,
                        limit)

            k = o+'_cm'
            if k in self.files:
                file = self.files[k]['file']
                self._process_QTLs_genetic_location(
                    '/'.join((self.rawdir, file)), tax_id, o, limit)

        logger.info("Finished parsing")

        self.load_bindings()

        logger.info("Found %d nodes", len(self.graph))
        return
Пример #2
0
    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if limit is not None:
            logger.info("Only parsing first %s rows fo each file", str(limit))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            g = self.testgraph
        else:
            g = self.graph

        tmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
        self._process_trait_mappings(tmap, limit)

        geno = Genotype(g)
        # organisms  = ['chicken']
        organisms = [
            'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

        for o in organisms:
            tax_id = self._get_tax_by_common_name(o)
            geno.addGenome(tax_id, o)
            build_id = None
            build = None

            k = o + '_bp'
            if k in self.files:
                file = self.files[k]['file']
                m = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', file)
                if m is None:
                    logger.error("Can't match a gff build")
                else:
                    build = m.group(1)
                    build_id = self._map_build_by_abbrev(build)
                    logger.info("Build = %s", build_id)
                    geno.addReferenceGenome(build_id, build, tax_id)
                if build_id is not None:
                    self._process_QTLs_genomic_location(
                        '/'.join((self.rawdir, file)), tax_id, build_id, build,
                        limit)

            k = o+'_cm'
            if k in self.files:
                file = self.files[k]['file']
                self._process_QTLs_genetic_location(
                    '/'.join((self.rawdir, file)), tax_id, o, limit)

        logger.info("Finished parsing")
        return
Пример #3
0
    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species

        graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        logger.info("Adding equivalent assembly identifiers")
        for sp in self.species:
            tax_id = self.resolve(sp)
            txid_num = tax_id.split(':')[1]
            for key in self.files[txid_num]['assembly']:
                ucsc_id = key
                try:
                    ucsc_label = ucsc_id.split(':')[1]
                except IndexError:
                    logger.error('%s Assembly id:  "%s" is problematic', sp,
                                 key)
                    continue
                if key in self.localtt:
                    mapped_id = self.localtt[key]
                else:
                    logger.error(
                        '%s Assembly id:  "%s" is not in local translation table',
                        sp, key)

                mapped_label = mapped_id.split(':')[1]

                mapped_label = 'NCBI build ' + str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return
Пример #4
0
    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species

        graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        LOG.info("Adding equivalent assembly identifiers")
        for sp in self.species:
            tax_id = self.globaltt[sp]
            txid_num = tax_id.split(':')[1]
            for key in self.files[txid_num]['assembly']:
                ucsc_id = key
                try:
                    ucsc_label = ucsc_id.split(':')[1]
                except IndexError:
                    LOG.error('%s Assembly id:  "%s" is problematic', sp, key)
                    continue
                if key in self.localtt:
                    mapped_id = self.localtt[key]
                else:
                    LOG.error(
                        '%s Assembly id:  "%s" is not in local translation table',
                        sp, key)

                mapped_label = mapped_id.split(':')[1]

                mapped_label = 'NCBI build ' + str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return
Пример #5
0
    def _get_chrbands(self, limit, taxon):
        """
        :param limit:
        :return:

        """
        model = Model(self.graph)
        # TODO PYLINT figure out what limit was for and why it is unused
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        logger.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)
        monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

        # used to hold band definitions for a chr
        # in order to compute extent of encompasing bands

        mybands = {}
        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:'+taxon

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_id, None)
        model.addSynonym(taxon_id, genome_label)

        geno.addGenome(taxon_id, genome_label)

        # add the build and the taxon it's in
        build_num = self.files[taxon]['build_num']
        build_id = 'UCSC:'+build_num
        geno.addReferenceGenome(build_id, build_num, taxon_id)

        # process the bands
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match('^#', line):
                    continue

                # chr13	4500000	10000000	p12	stalk
                (scaffold, start, stop, band_num, rtype) = line.split('\t')
                line_counter += 1

                # NOTE some less-finished genomes have
                # placed and unplaced scaffolds
                # * Placed scaffolds:
                #       the scaffolds have been placed within a chromosome.
                # * Unlocalized scaffolds:
                #   although the chromosome within which the scaffold occurs
                #   is known, the scaffold's position or orientation
                #   is not known.
                # * Unplaced scaffolds:
                #   it is not known which chromosome the scaffold belongs to
                #
                # find out if the thing is a full on chromosome, or a scaffold:
                # ex: unlocalized scaffold: chr10_KL568008v1_random
                # ex: unplaced scaffold: chrUn_AABR07022428v1
                placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
                unlocalized_scaffold_pattern = \
                    placed_scaffold_pattern+r'_(\w+)_random'
                unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)'

                m = re.match(placed_scaffold_pattern+r'$', scaffold)
                if m is not None and len(m.groups()) == 1:
                    # the chromosome is the first match of the pattern
                    chrom_num = m.group(1)
                else:
                    # skip over anything that isn't a placed_scaffold
                    # at the class level
                    logger.info("Found non-placed chromosome %s", scaffold)
                    chrom_num = None

                m_chr_unloc = re.match(unlocalized_scaffold_pattern, scaffold)
                m_chr_unplaced = re.match(unplaced_scaffold_pattern, scaffold)

                scaffold_num = None
                if m:
                    pass
                elif m_chr_unloc is not None and\
                        len(m_chr_unloc.groups()) == 2:
                    chrom_num = m_chr_unloc.group(1)
                    scaffold_num = chrom_num+'_'+m_chr_unloc.group(2)
                elif m_chr_unplaced is not None and\
                        len(m_chr_unplaced.groups()) == 1:
                    scaffold_num = m_chr_unplaced.group(1)
                else:
                    logger.error(
                        "There's a chr pattern that we aren't matching: %s",
                        scaffold)

                if chrom_num is not None:
                    # the chrom class (generic) id
                    chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                    # first, add the chromosome class (in the taxon)
                    geno.addChromosomeClass(
                        chrom_num, taxon_id, self.files[taxon]['genome_label'])

                    # then, add the chromosome instance (from the given build)
                    geno.addChromosomeInstance(chrom_num, build_id, build_num,
                                               chrom_class_id)

                    # add the chr to the hashmap of coordinates for this build
                    # the chromosome coordinate space is itself
                    if chrom_num not in mybands.keys():
                        mybands[chrom_num] = {
                            'min': 0,
                            'max': int(stop),
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': Feature.types['chromosome']}

                if scaffold_num is not None:
                    # this will put the coordinates of the scaffold
                    # in the scaffold-space and make sure that the scaffold
                    # is part of the correct parent.
                    # if chrom_num is None,
                    # then it will attach it to the genome,
                    # just like a reg chrom
                    mybands[scaffold_num] = {
                        'min': start,
                        'max': stop,
                        'chr': scaffold_num,
                        'ref': build_id,
                        'parent': chrom_num,
                        'stain': None,
                        'type': Feature.types['assembly_component'],
                        'synonym': scaffold}

                if band_num is not None and band_num.strip() != '':
                    # add the specific band
                    mybands[chrom_num+band_num] = {'min': start,
                                                   'max': stop,
                                                   'chr': chrom_num,
                                                   'ref': build_id,
                                                   'parent': None,
                                                   'stain': None,
                                                   'type': None}

                    # add the staining intensity of the band
                    if re.match(r'g(neg|pos|var)', rtype):
                        mybands[chrom_num+band_num]['stain'] = \
                            Feature.types.get(rtype)

                    # get the parent bands, and make them unique
                    parents = list(
                        monochrom.make_parent_bands(band_num, set()))
                    # alphabetical sort will put them in smallest to biggest,
                    # so we reverse
                    parents.sort(reverse=True)
                    # print('parents of',chrom,band,':',parents)

                    if len(parents) > 0:
                        mybands[chrom_num+band_num]['parent'] = \
                            chrom_num+parents[0]
                else:
                    # TODO PYLINT why is 'parent'
                    # a list() a couple of lines up and a set() here?
                    parents = set()

                # loop through the parents and add them to the hash
                # add the parents to the graph, in hierarchical order
                # TODO PYLINT Consider using enumerate
                # instead of iterating with range and len
                for i in range(len(parents)):
                    rti = getChrPartTypeByNotation(parents[i])

                    pnum = chrom_num+parents[i]
                    sta = int(start)
                    sto = int(stop)
                    if pnum not in mybands.keys():
                        # add the parental band to the hash
                        b = {'min': min(sta, sto),
                             'max': max(sta, sto),
                             'chr': chrom_num,
                             'ref': build_id,
                             'parent': None,
                             'stain': None,
                             'type': rti}
                        mybands[pnum] = b
                    else:
                        # band already in the hash means it's a grouping band
                        # need to update the min/max coords
                        b = mybands.get(pnum)
                        b['min'] = min(sta, sto, b['min'])
                        b['max'] = max(sta, sto, b['max'])
                        mybands[pnum] = b

                        # also, set the max for the chrom
                        c = mybands.get(chrom_num)
                        c['max'] = max(sta, sto, c['max'])
                        mybands[chrom_num] = c

                    # add the parent relationships to each
                    if i < len(parents) - 1:
                        mybands[pnum]['parent'] = chrom_num+parents[i+1]
                    else:
                        # add the last one (p or q usually)
                        # as attached to the chromosome
                        mybands[pnum]['parent'] = chrom_num

        f.close()  # end looping through file

        # loop through the hash and add the bands to the graph
        for b in mybands.keys():
            myband = mybands.get(b)
            band_class_id = makeChromID(b, taxon, 'CHR')
            band_class_label = makeChromLabel(b, genome_label)
            band_build_id = makeChromID(b, build_num, 'MONARCH')
            band_build_label = makeChromLabel(b, build_num)
            # the build-specific chrom
            chrom_in_build_id = makeChromID(
                myband['chr'], build_num, 'MONARCH')
            # if it's != part, then add the class
            if myband['type'] != Feature.types['assembly_component']:
                model.addClassToGraph(band_class_id,
                                      band_class_label, myband['type'])
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   band_class_id)
            else:
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   myband['type'])
                if 'synonym' in myband:
                    model.addSynonym(band_build_id, myband['synonym'])

            if myband['parent'] is None:
                if myband['type'] == Feature.types['assembly_component']:
                    # since we likely don't know the chr,
                    # add it as a part of the build
                    geno.addParts(band_build_id, build_id)
            elif myband['type'] == Feature.types['assembly_component']:
                # geno.addParts(band_build_id, chrom_in_build_id)
                parent_chrom_in_build = makeChromID(myband['parent'],
                                                    build_num, 'MONARCH')
                bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

            # add the band as a feature
            # (which also instantiates the owl:Individual)
            bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
            bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
            if 'stain' in myband and myband['stain'] is not None:
                # TODO 'has_staining_intensity' being dropped by MB
                bfeature.addFeatureProperty(
                    Feature.properties['has_staining_intensity'],
                    myband['stain'])

            # type the band as a faldo:Region directly (add_region=False)
            # bfeature.setNoBNodes(self.nobnodes)
            # to come when we merge in ZFIN.py
            bfeature.addFeatureToGraph(False)

        return
Пример #6
0
    def _process_qtls_genetic_location(
            self, raw, txid, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:

        """
        aql_curie = self.files[common_name + '_cm']['curie']

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['quantitative trait analysis evidence']

        taxon_curie = 'NCBITaxon:' + txid

        LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (qtl_id,
                 qtl_symbol,
                 trait_name,
                 assotype,
                 empty,
                 chromosome,
                 position_cm,
                 range_cm,
                 flankmark_a2,
                 flankmark_a1,
                 peak_mark,
                 flankmark_b1,
                 flankmark_b2,
                 exp_id,
                 model_id,
                 test_base,
                 sig_level,
                 lod_score,
                 ls_mean,
                 p_values,
                 f_statistics,
                 variance,
                 bayes_value,
                 likelihood_ratio,
                 trait_id, dom_effect,
                 add_effect,
                 pubmed_id,
                 gene_id,
                 gene_id_src,
                 gene_id_type,
                 empty2) = row

                if self.test_mode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = common_name + 'QTL:' + qtl_id.strip()
                trait_id = ':'.join((aql_curie, trait_id.strip()))

                # Add QTL to graph
                feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
                feature.addTaxonToFeature(taxon_curie)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                # add a version of the chromosome which is defined as
                # the genetic map
                build_id = 'MONARCH:'+common_name.strip()+'-linkage'
                build_label = common_name+' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_curie)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                start = stop = None
                # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
                range_mb = re.split(r'\(', range_cm)
                if range_mb is not None:
                    range_cm = range_mb[0]

                if re.search(r'[0-9].*-.*[0-9]', range_cm):
                    range_parts = re.split(r'-', range_cm)

                    # check for poorly formed ranges
                    if len(range_parts) == 2 and\
                            range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [
                            int(float(x.strip())) for x in re.split(r'-', range_cm)]
                    else:
                        LOG.info(
                            "A cM range we can't handle for QTL %s: %s",
                            qtl_id, range_cm)
                elif position_cm != '':
                    match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                    if match is not None:
                        position_cm = match.group()
                        start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop
                # when schema can handle floats add in the genetic location
                # based on the range
                feature.addFeatureStartLocation(
                    start, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureEndLocation(
                    stop, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureToGraph()

                # sometimes there's a peak marker, like a rsid.
                # we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and \
                        re.match(r'rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:'+peak_mark.strip()

                    model.addIndividualToGraph(
                        dbsnp_id, None,
                        self.globaltt['sequence_alteration'])
                    model.addXref(qtl_id, dbsnp_id)

                gene_id = gene_id.replace('uncharacterized ', '').strip()
                if gene_id is not None and gene_id != '' and gene_id != '.'\
                        and re.fullmatch(r'[^ ]*', gene_id) is not None:

                    # we assume if no src is provided and gene_id is an integer,
                    # then it is an NCBI gene ... (okay, lets crank that back a notch)
                    if gene_id_src == '' and gene_id.isdigit() and \
                            gene_id in self.gene_info:
                        # LOG.info(
                        #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                        #    gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '' and gene_id.isdigit():
                        LOG.warning(
                            'Cold & Prickely saying %s is a NCBI gene for %s',
                            gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '':
                        LOG.error(
                            ' "%s" is a NOT NCBI gene for %s', gene_id, common_name)
                        gene_id_src = None

                    if gene_id_src == 'NCBIgene':
                        gene_id = 'NCBIGene:' + gene_id
                        # we will expect that these will get labels elsewhere
                        geno.addGene(gene_id, None)
                        # FIXME what is the right relationship here?
                        geno.addAffectedLocus(qtl_id, gene_id)

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id
                            vl_id = '_:' + re.sub(
                                r':', '', gene_id) + '-' + peak_mark.strip()
                            geno.addSequenceAlterationToVariantLocus(
                                dbsnp_id, vl_id)
                            geno.addAffectedLocus(vl_id, gene_id)

                # add the trait
                model.addClassToGraph(trait_id, trait_name)

                # Add publication
                reference = None
                if re.match(r'ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:'+pubmed_id.strip()
                    reference = Reference(graph, pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:' + pubmed_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])

                if reference is not None:
                    reference.addRefToGraph()

                # make the association to the QTL
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id, self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    scr = re.sub(r'<', '', p_values)
                    scr = re.sub(r',', '.', scr)  # international notation
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(
                        graph, self.name, dbsnp_id, trait_id,
                        self.globaltt['is marker for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        scr = re.sub(r'<', '', p_values)
                        scr = re.sub(r',', '.', scr)
                        if scr.isnumeric():
                            score = float(scr)
                            assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph()

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        LOG.info("Done with QTL genetic info")
        return
Пример #7
0
    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if limit is not None:
            LOG.info("Only parsing first %s rows fo each file", str(limit))

        LOG.info("Parsing files...")

        if self.test_only:
            self.test_mode = True
            graph = self.testgraph
        else:
            graph = self.graph

        traitmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
        self._process_trait_mappings(traitmap, limit)

        geno = Genotype(graph)
        animals = ['chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

        for common_name in animals:
            txid_num = self.resolve(common_name).split(':')[1]
            taxon_label = self.localtt[common_name]
            taxon_curie = self.globaltt[taxon_label]
            taxon_num = taxon_curie.split(':')[1]
            txid_num = taxon_num  # for now
            taxon_word = taxon_label.replace(' ', '_')
            gene_info_file = '/'.join((
                self.rawdir, self.files[taxon_word + '_info']['file']))
            self.gene_info = list()
            LOG.info('Ingesting %s', gene_info_file)
            with gzip.open(gene_info_file, 'rt') as gi_gz:
                filereader = csv.reader(gi_gz, delimiter='\t')
                for row in filereader:
                    if row[0][0] == '#':
                        continue
                    else:
                        self.gene_info.append(str(row[1]))  # tossing lots of good stuff
            LOG.info(
                'Gene Info for %s has %i enteries', common_name, len(self.gene_info))
            # LOG.info('Gene Info entery looks like %s', self.gene_info[5])

            build = None

            fname_bp = common_name + '_bp'
            if fname_bp in self.files:
                bpfile = self.files[fname_bp]['file']
                mch = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', bpfile)
                if mch is None:
                    LOG.error("Can't match a gff build to " + fname_bp)
                else:
                    build = mch.group(1)
                    build_id = self.localtt[build]
                    LOG.info("Build UCSC label is: %s", build_id)

                    # NCBI assembly curie is

                    geno.addReferenceGenome(build_id, build, txid_num)

                if build_id is not None:
                    self._process_qtls_genomic_location(
                        '/'.join((self.rawdir, bpfile)), txid_num, build_id, build,
                        common_name, limit)

            fname_cm = common_name + '_cm'
            if fname_cm in self.files:
                cmfile = self.files[fname_cm]['file']
                self._process_qtls_genetic_location(
                    '/'.join((self.rawdir, cmfile)), txid_num, common_name, limit)

        LOG.info("Finished parsing")
        return
Пример #8
0
    def _process_qtls_genetic_location(
            self, raw, src_key, txid, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:

        """
        aql_curie = self.files[src_key]['curie']
        common_name = common_name.strip()
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['quantitative trait analysis evidence']
        taxon_curie = 'NCBITaxon:' + txid

        LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # no header in these files, so no header checking
            col = self.files[src_key]['columns']
            col_len = len(col)
            for row in reader:
                if len(row) != col_len and ''.join(row[col_len:]) != '':
                    LOG.warning(
                        "Problem parsing %s line %i containing: \n%s\n"
                        "got %i cols but expected %i",
                        raw, reader.line_num, row, len(row), col_len)
                    # LOG.info(row)
                    continue

                qtl_id = row[col.index('QTL_ID')].strip()
                qtl_symbol = row[col.index('QTL_symbol')].strip()
                trait_name = row[col.index('Trait_name')].strip()
                # assotype = row[col.index('assotype')].strip()
                chromosome = row[col.index('Chromosome')].strip()
                position_cm = row[col.index('Position_cm')].strip()
                range_cm = row[col.index('range_cm')].strip()
                # flankmark_a2 = row[col.index('FlankMark_A2')].strip()
                # flankmark_a1 = row[col.index('FlankMark_A1')].strip()
                peak_mark = row[col.index('Peak_Mark')].strip()
                # flankmark_b1 = row[col.index('FlankMark_B1')].strip()
                # flankmark_b2 = row[col.index('FlankMark_B2')].strip()
                # exp_id = row[col.index('Exp_ID')].strip()
                # model_id = row[col.index('Model')].strip()
                # test_base = row[col.index('testbase')].strip()
                # sig_level = row[col.index('siglevel')].strip()
                # lod_score = row[col.index('LOD_score')].strip()
                # ls_mean = row[col.index('LS_mean')].strip()
                p_values = row[col.index('P_values')].strip()
                # f_statistics = row[col.index('F_Statistics')].strip()
                # variance = row[col.index('VARIANCE')].strip()
                # bayes_value = row[col.index('Bayes_value')].strip()
                # likelihood_ratio = row[col.index('LikelihoodR')].strip()
                trait_id = row[col.index('TRAIT_ID')].strip()
                # dom_effect = row[col.index('Dom_effect')].strip()
                # add_effect = row[col.index('Add_effect')].strip()
                pubmed_id = row[col.index('PUBMED_ID')].strip()
                gene_id = row[col.index('geneID')].strip()
                gene_id_src = row[col.index('geneIDsrc')].strip()
                # gene_id_type = row[col.index('geneIDtype')].strip()

                if self.test_mode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = common_name + 'QTL:' + qtl_id.strip()
                trait_id = ':'.join((aql_curie, trait_id.strip()))

                # Add QTL to graph
                feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
                feature.addTaxonToFeature(taxon_curie)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                # add a version of the chromosome which is defined as
                # the genetic map
                build_id = 'MONARCH:' + common_name + '-linkage'
                build_label = common_name + ' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_curie)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                start = stop = None
                # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
                range_mb = re.split(r'\(', range_cm)
                if range_mb is not None:
                    range_cm = range_mb[0]

                if re.search(r'[0-9].*-.*[0-9]', range_cm):
                    range_parts = re.split(r'-', range_cm)

                    # check for poorly formed ranges
                    if len(range_parts) == 2 and\
                            range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [
                            int(float(x.strip())) for x in re.split(r'-', range_cm)]
                    else:
                        LOG.info(
                            "A cM range we can't handle for QTL %s: %s",
                            qtl_id, range_cm)
                elif position_cm != '':
                    match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                    if match is not None:
                        position_cm = match.group()
                        start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop
                # when schema can handle floats add in the genetic location
                # based on the range
                feature.addFeatureStartLocation(
                    start, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureEndLocation(
                    stop, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureToGraph()

                # sometimes there's a peak marker, like a rsid.
                # we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and \
                        re.match(r'rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:' + peak_mark.strip()

                    model.addIndividualToGraph(
                        dbsnp_id, None, self.globaltt['sequence_alteration'])

                    model.addXref(
                        qtl_id, dbsnp_id, xref_category=blv.terms['SequenceVariant'])

                gene_id = gene_id.replace('uncharacterized ', '').strip()
                gene_id = gene_id.strip(',')  # for "100157483,"  in pig_QTLdata.txt
                if gene_id is not None and gene_id != '' and gene_id != '.'\
                        and re.fullmatch(r'[^ ]*', gene_id) is not None:

                    # we assume if no src is provided and gene_id is an integer,
                    # then it is an NCBI gene ... (okay, lets crank that back a notch)
                    if gene_id_src == '' and gene_id.isdigit() and \
                            gene_id in self.gene_info:
                        # LOG.info(
                        #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                        #    gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '' and gene_id.isdigit():
                        LOG.warning(
                            'Cold & Prickely saying %s is a NCBI gene for %s',
                            gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '':
                        LOG.error(
                            ' "%s" is a NOT NCBI gene for %s', gene_id, common_name)
                        gene_id_src = None

                    if gene_id_src == 'NCBIgene':
                        gene_id = 'NCBIGene:' + gene_id
                        # we will expect that these will get labels elsewhere
                        geno.addGene(gene_id, None)
                        # FIXME what is the right relationship here?
                        geno.addAffectedLocus(qtl_id, gene_id)

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id as a bnode
                            vl_id = self.make_id(re.sub(
                                r':', '', gene_id) + '-' + peak_mark.strip(), '_')
                            geno.addSequenceAlterationToVariantLocus(dbsnp_id, vl_id)
                            geno.addAffectedLocus(vl_id, gene_id)

                # add the trait
                model.addClassToGraph(
                    trait_id,
                    trait_name,
                    class_category=blv.terms['PhenotypicFeature'])

                # Add publication
                reference = None
                if re.match(r'ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:' + pubmed_id.strip()
                    reference = Reference(graph, pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:' + pubmed_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])

                if reference is not None:
                    reference.addRefToGraph()

                # make the association to the QTL
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id, self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    scr = re.sub(r'<', '', p_values)
                    scr = re.sub(r',', '.', scr)  # international notation
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(
                        graph, self.name, dbsnp_id, trait_id,
                        self.globaltt['is marker for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        scr = re.sub(r'<', '', p_values)
                        scr = re.sub(r',', '.', scr)
                        if scr.isnumeric():
                            score = float(scr)
                            assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph()

                # off by one - the following actually gives us (limit + 1) records
                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break

        LOG.info("Done with QTL genetic info")
Пример #9
0
    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if limit is not None:
            LOG.info("Only parsing first %s rows fo each file", str(limit))

        if self.test_only:
            self.test_mode = True
            graph = self.testgraph
        else:
            graph = self.graph

        trait_src_key = 'trait_mappings'
        traitmap = '/'.join((self.rawdir, self.files[trait_src_key]['file']))

        LOG.info("Parsing trait mapping  file %s", traitmap)
        self._process_trait_mappings(traitmap, trait_src_key, limit)

        geno = Genotype(graph)
        animals = ['chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

        for common_name in animals:
            txid_num = self.resolve(common_name).split(':')[1]
            taxon_label = self.localtt[common_name]
            taxon_curie = self.globaltt[taxon_label]
            taxon_num = taxon_curie.split(':')[1]
            txid_num = taxon_num  # for now
            taxon_word = taxon_label.strip().replace(' ', '_')
            src_key = taxon_word + '_info'
            gene_info_file = '/'.join((
                self.rawdir, self.files[src_key]['file']))
            self.gene_info = list()
            LOG.info('Ingesting %s', gene_info_file)
            with gzip.open(gene_info_file, 'rt') as gi_gz:
                reader = csv.reader(gi_gz, delimiter='\t')
                # skipping header checking, b/c not all of these gene_info files have
                # headers
                col = self.files[src_key]['columns']
                col_len = len(col)
                for row in reader:
                    if row[0][0] == '#':
                        # LOG.info(row)
                        continue
                    if len(row) != col_len and ''.join(row[col_len:]) != '':
                        LOG.warning(
                            "Problem parsing in %s row %i\n"
                            "got %s cols but expected %s",
                            gene_info_file, reader.line_num, len(row), col_len)
                        LOG.info(row)

                    self.gene_info.append(row[col.index('GeneID')])
            LOG.info(
                'Gene Info for %s has %i entries', common_name, len(self.gene_info))
            build = None

            fname_bp = common_name + '_bp'
            if fname_bp in self.files:
                bpfile = self.files[fname_bp]['file']
                mch = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', bpfile)
                if mch is None:
                    LOG.error("Can't match a gff build to " + fname_bp)
                else:
                    build = mch.group(1)
                    build_id = self.localtt[build]
                    LOG.info("Build UCSC label is: %s", build_id)

                    geno.addReferenceGenome(build_id, build, txid_num)

                if build_id is not None:
                    self._process_qtls_genomic_location(
                        '/'.join((self.rawdir, bpfile)),
                        fname_bp,
                        txid_num,
                        build_id,
                        build,
                        common_name,
                        limit
                    )

            fname_cm = common_name + '_cm'
            if fname_cm in self.files:
                cmfile = self.files[fname_cm]['file']
                self._process_qtls_genetic_location(
                    '/'.join((self.rawdir, cmfile)),
                    fname_cm,
                    txid_num,
                    common_name,
                    limit)

        LOG.info("Finished parsing")
Пример #10
0
    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species
        ucsc_assembly_id_map = {
            "9606": {
                "UCSC:hg38": "NCBIGenome:GRCh38",
                "UCSC:hg19": "NCBIGenome:GRCh37",
                "UCSC:hg18": "NCBIGenome:36.1",
                "UCSC:hg17": "NCBIGenome:35",
                "UCSC:hg16": "NCBIGenome:34",
                "UCSC:hg15": "NCBIGenome:33",
            },
            "7955": {
                "UCSC:danRer10": "NCBIGenome:GRCz10",
                "UCSC:danRer7": "NCBIGenome:Zv9",
                "UCSC:danRer6": "NCBIGenome:Zv8",
            },
            "10090": {
                "UCSC:mm10": "NCBIGenome:GRCm38",
                "UCSC:mm9": "NCBIGenome:37"
            },
            "9031": {
                "UCSC:galGal4": "NCBIAssembly:317958",
            },
            "9913": {
                "UCSC:bosTau7": "NCBIAssembly:GCF_000003205.5",
            },
            "9823": {
                "UCSC:susScr3": "NCBIAssembly:304498",
            },
            "9940": {
                "UCSC:oviAri3": "NCBIAssembly:GCF_000298735.1",
            },
            "9796": {
                "UCSC:equCab2": "NCBIAssembly:GCF_000002305.2",
            }
        }
        g = self.graph
        geno = Genotype(g)
        model = Model(g)
        logger.info("Adding equivalent assembly identifiers")
        for sp in ucsc_assembly_id_map:
            tax_num = sp
            tax_id = 'NCBITaxon:' + tax_num
            mappings = ucsc_assembly_id_map[sp]
            for i in mappings:
                ucsc_id = i
                ucsc_label = re.split(':', i)[1]
                mapped_id = mappings[i]
                mapped_label = re.split(':', mapped_id)[1]
                mapped_label = 'NCBI build ' + str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return
Пример #11
0
    def _get_chrbands(self, limit, src_key, genome_id):
        """
        :param limit:
        :return:

        """
        tax_num = src_key
        if limit is None:
            limit = sys.maxsize  # practical limit anyway
        model = Model(self.graph)
        line_num = 0
        myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)
        monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

        # used to hold band definitions for a chr
        # in order to compute extent of encompasing bands

        mybands = {}
        # build the organism's genome from the taxon
        genome_label = self.files[src_key]['genome_label']
        taxon_curie = 'NCBITaxon:' + tax_num
        species_name = self.globaltcid[taxon_curie]  # for logging

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_curie, None)
        model.addSynonym(taxon_curie, genome_label)

        geno.addGenome(taxon_curie, genome_label, genome_id)

        # add the build and the taxon it's in
        build_num = self.files[src_key]['build_num']
        build_id = 'UCSC:' + build_num
        geno.addReferenceGenome(build_id, build_num, taxon_curie)

        # cat (at least)  also has  chr[BDAECF]... hex? must be a back cat.
        if tax_num == self.localtt['Felis catus']:
            placed_scaffold_regex = re.compile(
                r'(chr(?:[BDAECF]\d+|X|Y|Z|W|M|))$')
        else:
            placed_scaffold_regex = re.compile(r'(chr(?:\d+|X|Y|Z|W|M))$')
        unlocalized_scaffold_regex = re.compile(r'_(\w+)_random')
        unplaced_scaffold_regex = re.compile(r'chr(Un(?:_\w+)?)')

        # process the bands
        col = self.files[src_key]['columns']

        with gzip.open(myfile, 'rb') as binreader:
            for line in binreader:
                line_num += 1
                # skip comments
                line = line.decode().strip()
                if line[0] == '#' or line_num > limit:
                    continue
                # chr13	4500000	10000000	p12	stalk
                row = line.split('\t')
                scaffold = row[col.index('chrom')].strip()
                start = row[col.index('chromStart')]
                stop = row[col.index('chromEnd')]
                band_num = row[col.index('name')].strip()
                rtype = row[col.index('gieStain')]

                # NOTE some less-finished genomes have
                # placed and unplaced scaffolds
                # * Placed scaffolds:
                #       the scaffolds have been placed within a chromosome.
                # * Unlocalized scaffolds:
                #   although the chromosome within which the scaffold occurs
                #   is known, the scaffold's position or orientation
                #   is not known.
                # * Unplaced scaffolds:
                #   it is not known which chromosome the scaffold belongs to
                #
                # find out if the thing is a full on chromosome, or a scaffold:
                # ex: unlocalized scaffold: chr10_KL568008v1_random
                # ex: unplaced scaffold: chrUn_AABR07022428v1

                mch = placed_scaffold_regex.match(scaffold)
                if mch is not None and len(mch.groups()) == 1:
                    # the chromosome is the first match of the pattern
                    chrom_num = mch.group(1)
                else:
                    # skip over anything that isn't a placed_scaffold at the class level
                    # LOG.info("Found non-placed chromosome %s", scaffold)
                    chrom_num = None

                m_chr_unloc = unlocalized_scaffold_regex.match(scaffold)
                m_chr_unplaced = unplaced_scaffold_regex.match(scaffold)

                scaffold_num = None
                if mch:
                    pass
                elif m_chr_unloc is not None and len(
                        m_chr_unloc.groups()) == 2:
                    chrom_num = m_chr_unloc.group(1)
                    scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
                elif m_chr_unplaced is not None and len(
                        m_chr_unplaced.groups()) == 1:
                    scaffold_num = m_chr_unplaced.group(1)
                # else:
                #    LOG.error(
                #        "There's a chr pattern that we aren't matching: %s", scaffold)

                if chrom_num is not None:
                    # the chrom class (generic) id
                    chrom_class_id = makeChromID(chrom_num, tax_num, 'CHR')

                    # first, add the chromosome class (in the taxon)
                    geno.addChromosomeClass(
                        chrom_num, taxon_curie,
                        self.files[src_key]['genome_label'])

                    # then, add the chromosome instance (from the given build)
                    geno.addChromosomeInstance(chrom_num, build_id, build_num,
                                               chrom_class_id)

                    # add the chr to the hashmap of coordinates for this build
                    # the chromosome coordinate space is itself
                    if chrom_num not in mybands.keys():
                        mybands[chrom_num] = {
                            'min': 0,
                            'max': int(stop),
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': self.globaltt['chromosome']
                        }
                elif scaffold_num is not None:
                    # this will put the coordinates of the scaffold
                    # in the scaffold-space and make sure that the scaffold
                    # is part of the correct parent.
                    # if chrom_num is None,
                    # then it will attach it to the genome,
                    # just like a reg chrom
                    mybands[scaffold_num] = {
                        'min': start,
                        'max': stop,
                        'chr': scaffold_num,
                        'ref': build_id,
                        'parent': chrom_num,
                        'stain': None,
                        'type': self.globaltt['assembly_component'],
                        'synonym': scaffold
                    }
                else:
                    LOG.info('%s line %i DROPPED chromosome/scaffold  %s',
                             species_name, line_num, scaffold)

                parents = list()

                # see it new types have showed up
                if rtype is not None and rtype not in [
                        'gneg', 'gpos25', 'gpos33', 'gpos50', 'gpos66',
                        'gpos75', 'gpos100', 'acen', 'gvar', 'stalk'
                ]:
                    LOG.info('Unknown gieStain type "%s" in %s at %i', rtype,
                             src_key, line_num)
                    self.globaltt[rtype]  # blow up

                if rtype == 'acen':  # hacky, revisit if ontology improves
                    rtype = self.localtt[rtype]

                if band_num is not None and band_num != '' and \
                        rtype is not None and rtype != '':
                    # add the specific band
                    mybands[chrom_num + band_num] = {
                        'min': start,
                        'max': stop,
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt[rtype],
                    }

                    # add the staining intensity of the band
                    # get the parent bands, and make them unique
                    parents = list(monochrom.make_parent_bands(
                        band_num, set()))
                    # alphabetical sort will put them in smallest to biggest,
                    # so we reverse
                    parents.sort(reverse=True)
                    # print('parents of',chrom,band,':',parents)

                    if len(parents) > 0:
                        mybands[chrom_num +
                                band_num]['parent'] = chrom_num + parents[0]
                    # else:   # band has no parents

                # loop through the parents and add them to the dict
                # add the parents to the graph, in hierarchical order
                # TODO PYLINT Consider using enumerate
                # instead of iterating with range and len
                for i in range(len(parents)):
                    rti = getChrPartTypeByNotation(parents[i], self.graph)

                    pnum = chrom_num + parents[i]
                    sta = int(start)
                    sto = int(stop)
                    if pnum is not None and pnum not in mybands.keys():
                        # add the parental band to the hash
                        bnd = {
                            'min': min(sta, sto),
                            'max': max(sta, sto),
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': rti
                        }
                        mybands[pnum] = bnd
                    elif pnum is not None:
                        # band already in the hash means it's a grouping band
                        # need to update the min/max coords
                        bnd = mybands.get(pnum)
                        bnd['min'] = min(sta, sto, bnd['min'])
                        bnd['max'] = max(sta, sto, bnd['max'])
                        mybands[pnum] = bnd

                        # also, set the max for the chrom
                        chrom = mybands.get(chrom_num)
                        chrom['max'] = max(sta, sto, chrom['max'])
                        mybands[chrom_num] = chrom
                    else:
                        LOG.error("pnum is None")
                    # add the parent relationships to each
                    if i < len(parents) - 1:
                        mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                    else:
                        # add the last one (p or q usually)
                        # as attached to the chromosome
                        mybands[pnum]['parent'] = chrom_num

        binreader.close()  # end looping through file

        # loop through the hash and add the bands to the graph
        for bnd in mybands.keys():
            myband = mybands.get(bnd)
            band_class_id = makeChromID(bnd, tax_num, 'CHR')
            band_class_label = makeChromLabel(bnd, genome_label)
            band_build_id = makeChromID(bnd, build_num, 'MONARCH')
            band_build_label = makeChromLabel(bnd, build_num)
            # the build-specific chrom
            chrom_in_build_id = makeChromID(myband['chr'], build_num,
                                            'MONARCH')
            # if it's != part, then add the class
            if myband['type'] != self.globaltt['assembly_component']:
                model.addClassToGraph(band_class_id, band_class_label,
                                      myband['type'])
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   band_class_id)
            else:
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   myband['type'])
                if 'synonym' in myband:
                    model.addSynonym(band_build_id, myband['synonym'])

            if myband['parent'] is None:
                if myband['type'] == self.globaltt['assembly_component']:
                    # since we likely don't know the chr,
                    # add it as a part of the build
                    geno.addParts(band_build_id, build_id)
            elif myband['type'] == self.globaltt['assembly_component']:
                # geno.addParts(band_build_id, chrom_in_build_id)
                parent_chrom_in_build = makeChromID(myband['parent'],
                                                    build_num, 'MONARCH')
                bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

            # add the band as a feature
            # (which also instantiates the owl:Individual)
            bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
            bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
            if 'stain' in myband and myband['stain'] is not None:
                bfeature.addFeatureProperty(
                    self.globaltt['has_sequence_attribute'], myband['stain'])

            # type the band as a faldo:Region directly (add_region=False)
            # bfeature.setNoBNodes(self.nobnodes)
            # to come when we merge in ZFIN.py
            bfeature.addFeatureToGraph(False)
Пример #12
0
    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if limit is not None:
            LOG.info("Only parsing first %s rows fo each file", str(limit))

        LOG.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            graph = self.testgraph
        else:
            graph = self.graph

        traitmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
        self._process_trait_mappings(traitmap, limit)

        geno = Genotype(graph)
        animals = ['chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

        for common_name in animals:
            txid_num = self.resolve(common_name).split(':')[1]
            taxon_label = self.localtt[common_name]
            taxon_curie = self.globaltt[taxon_label]
            taxon_num = taxon_curie.split(':')[1]
            txid_num = taxon_num  # for now
            taxon_word = taxon_label.replace(' ', '_')
            gene_info_file = '/'.join(
                (self.rawdir, self.files[taxon_word + '_info']['file']))
            self.gene_info = list()
            with gzip.open(gene_info_file, 'rt') as gi_gz:
                filereader = csv.reader(gi_gz, delimiter='\t')
                for row in filereader:
                    if row[0][0] == '#':
                        continue
                    else:
                        self.gene_info.append(str(row[1]))  # tossing lots of good stuff
            LOG.info(
                'Gene Info for %s has %i enteries', common_name, len(self.gene_info))
            # LOG.info('Gene Info entery looks like %s', self.gene_info[5])

            build = None

            fname_bp = common_name + '_bp'
            if fname_bp in self.files:
                bpfile = self.files[fname_bp]['file']
                mch = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', bpfile)
                if mch is None:
                    LOG.error("Can't match a gff build to " + fname_bp)
                else:
                    build = mch.group(1)
                    build_id = self._map_build_by_abbrev(build)
                    LOG.info("Build = %s", build_id)
                    geno.addReferenceGenome(build_id, build, txid_num)
                if build_id is not None:
                    self._process_qtls_genomic_location(
                        '/'.join((self.rawdir, bpfile)), txid_num, build_id, build,
                        common_name, limit)

            fname_cm = common_name + '_cm'
            if fname_cm in self.files:
                cmfile = self.files[fname_cm]['file']
                self._process_qtls_genetic_location(
                    '/'.join((self.rawdir, cmfile)), txid_num, common_name, limit)

        LOG.info("Finished parsing")
        return
Пример #13
0
    def _get_chrbands(self, limit, taxon):
        """
        :param limit:
        :return:

        """
        model = Model(self.graph)
        # TODO PYLINT figure out what limit was for and why it is unused
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        logger.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)
        monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

        # used to hold band definitions for a chr
        # in order to compute extent of encompasing bands

        mybands = {}
        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:' + taxon

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_id, None)
        model.addSynonym(taxon_id, genome_label)

        geno.addGenome(taxon_id, genome_label)

        # add the build and the taxon it's in
        build_num = self.files[taxon]['build_num']
        build_id = 'UCSC:' + build_num
        geno.addReferenceGenome(build_id, build_num, taxon_id)

        # process the bands
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match('^#', line):
                    continue

                # chr13	4500000	10000000	p12	stalk
                (scaffold, start, stop, band_num, rtype) = line.split('\t')
                line_counter += 1

                # NOTE some less-finished genomes have
                # placed and unplaced scaffolds
                # * Placed scaffolds:
                #       the scaffolds have been placed within a chromosome.
                # * Unlocalized scaffolds:
                #   although the chromosome within which the scaffold occurs
                #   is known, the scaffold's position or orientation
                #   is not known.
                # * Unplaced scaffolds:
                #   it is not known which chromosome the scaffold belongs to
                #
                # find out if the thing is a full on chromosome, or a scaffold:
                # ex: unlocalized scaffold: chr10_KL568008v1_random
                # ex: unplaced scaffold: chrUn_AABR07022428v1
                placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
                unlocalized_scaffold_pattern = placed_scaffold_pattern + r'_(\w+)_random'
                unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)'

                mch = re.match(placed_scaffold_pattern + r'$', scaffold)
                if mch is not None and len(mch.groups()) == 1:
                    # the chromosome is the first match of the pattern
                    chrom_num = mch.group(1)
                else:
                    # skip over anything that isn't a placed_scaffold
                    # at the class level
                    logger.info("Found non-placed chromosome %s", scaffold)
                    chrom_num = None

                m_chr_unloc = re.match(unlocalized_scaffold_pattern, scaffold)
                m_chr_unplaced = re.match(unplaced_scaffold_pattern, scaffold)

                scaffold_num = None
                if mch:
                    pass
                elif m_chr_unloc is not None and len(
                        m_chr_unloc.groups()) == 2:
                    chrom_num = m_chr_unloc.group(1)
                    scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
                elif m_chr_unplaced is not None and len(
                        m_chr_unplaced.groups()) == 1:
                    scaffold_num = m_chr_unplaced.group(1)
                else:
                    logger.error(
                        "There's a chr pattern that we aren't matching: %s",
                        scaffold)

                if chrom_num is not None:
                    # the chrom class (generic) id
                    chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                    # first, add the chromosome class (in the taxon)
                    geno.addChromosomeClass(chrom_num, taxon_id,
                                            self.files[taxon]['genome_label'])

                    # then, add the chromosome instance (from the given build)
                    geno.addChromosomeInstance(chrom_num, build_id, build_num,
                                               chrom_class_id)

                    # add the chr to the hashmap of coordinates for this build
                    # the chromosome coordinate space is itself
                    if chrom_num not in mybands.keys():
                        mybands[chrom_num] = {
                            'min': 0,
                            'max': int(stop),
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': self.globaltt['chromosome']
                        }

                if scaffold_num is not None:
                    # this will put the coordinates of the scaffold
                    # in the scaffold-space and make sure that the scaffold
                    # is part of the correct parent.
                    # if chrom_num is None,
                    # then it will attach it to the genome,
                    # just like a reg chrom
                    mybands[scaffold_num] = {
                        'min': start,
                        'max': stop,
                        'chr': scaffold_num,
                        'ref': build_id,
                        'parent': chrom_num,
                        'stain': None,
                        'type': self.globaltt['assembly_component'],
                        'synonym': scaffold
                    }

                if band_num is not None and band_num.strip() != '':
                    # add the specific band
                    mybands[chrom_num + band_num] = {
                        'min': start,
                        'max': stop,
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': None
                    }

                    # add the staining intensity of the band
                    if re.match(r'g(neg|pos|var)', rtype):
                        mybands[chrom_num +
                                band_num]['stain'] = self.resolve(rtype)

                    # get the parent bands, and make them unique
                    parents = list(monochrom.make_parent_bands(
                        band_num, set()))
                    # alphabetical sort will put them in smallest to biggest,
                    # so we reverse
                    parents.sort(reverse=True)
                    # print('parents of',chrom,band,':',parents)

                    if len(parents) > 0:
                        mybands[chrom_num +
                                band_num]['parent'] = chrom_num + parents[0]
                else:
                    # TODO PYLINT why is 'parent'
                    # a list() a couple of lines up and a set() here?
                    parents = set()

                # loop through the parents and add them to the hash
                # add the parents to the graph, in hierarchical order
                # TODO PYLINT Consider using enumerate
                # instead of iterating with range and len
                for i in range(len(parents)):
                    rti = getChrPartTypeByNotation(parents[i])

                    pnum = chrom_num + parents[i]
                    sta = int(start)
                    sto = int(stop)
                    if pnum not in mybands.keys():
                        # add the parental band to the hash
                        bnd = {
                            'min': min(sta, sto),
                            'max': max(sta, sto),
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': rti
                        }
                        mybands[pnum] = bnd
                    else:
                        # band already in the hash means it's a grouping band
                        # need to update the min/max coords
                        bnd = mybands.get(pnum)
                        bnd['min'] = min(sta, sto, bnd['min'])
                        bnd['max'] = max(sta, sto, bnd['max'])
                        mybands[pnum] = bnd

                        # also, set the max for the chrom
                        chrom = mybands.get(chrom_num)
                        chrom['max'] = max(sta, sto, chrom['max'])
                        mybands[chrom_num] = chrom

                    # add the parent relationships to each
                    if i < len(parents) - 1:
                        mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                    else:
                        # add the last one (p or q usually)
                        # as attached to the chromosome
                        mybands[pnum]['parent'] = chrom_num

        f.close()  # end looping through file

        # loop through the hash and add the bands to the graph
        for bnd in mybands.keys():
            myband = mybands.get(bnd)
            band_class_id = makeChromID(bnd, taxon, 'CHR')
            band_class_label = makeChromLabel(bnd, genome_label)
            band_build_id = makeChromID(bnd, build_num, 'MONARCH')
            band_build_label = makeChromLabel(bnd, build_num)
            # the build-specific chrom
            chrom_in_build_id = makeChromID(myband['chr'], build_num,
                                            'MONARCH')
            # if it's != part, then add the class
            if myband['type'] != self.globaltt['assembly_component']:
                model.addClassToGraph(band_class_id, band_class_label,
                                      myband['type'])
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   band_class_id)
            else:
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   myband['type'])
                if 'synonym' in myband:
                    model.addSynonym(band_build_id, myband['synonym'])

            if myband['parent'] is None:
                if myband['type'] == self.globaltt['assembly_component']:
                    # since we likely don't know the chr,
                    # add it as a part of the build
                    geno.addParts(band_build_id, build_id)
            elif myband['type'] == self.globaltt['assembly_component']:
                # geno.addParts(band_build_id, chrom_in_build_id)
                parent_chrom_in_build = makeChromID(myband['parent'],
                                                    build_num, 'MONARCH')
                bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

            # add the band as a feature
            # (which also instantiates the owl:Individual)
            bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
            bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
            if 'stain' in myband and myband['stain'] is not None:
                bfeature.addFeatureProperty(
                    self.globaltt['has_sequence_attribute'], myband['stain'])

            # type the band as a faldo:Region directly (add_region=False)
            # bfeature.setNoBNodes(self.nobnodes)
            # to come when we merge in ZFIN.py
            bfeature.addFeatureToGraph(False)

        return
Пример #14
0
    def _process_QTLs_genetic_location(self, raw, taxon_id, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        gu = GraphUtils(curie_map.get())
        eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence

        logger.info("Processing genetic location for %s", taxon_id)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome, position_cm, range_cm,
                 flankmark_a2, flankmark_a1, peak_mark, flankmark_b1, flankmark_b2, exp_id, model, test_base,
                 sig_level, lod_score, ls_mean, p_values, f_statistics, variance, bayes_value, likelihood_ratio,
                 trait_id, dom_effect, add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type, empty2) = row

                if self.testMode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = 'AQTL:'+qtl_id
                trait_id = 'AQTLTrait:'+trait_id

                # Add QTL to graph
                f = Feature(qtl_id, qtl_symbol, geno.genoparts['QTL'])
                f.addTaxonToFeature(g, taxon_id)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

                # add a version of the chromosome which is defined as the genetic map
                build_id = 'MONARCH:'+common_name.strip()+'-linkage'
                build_label = common_name+' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_id)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(chromosome, build_id, build_label, chrom_id)
                start = stop = None
                if re.search('-', range_cm):
                    range_parts = re.split('-', range_cm)
                    # check for poorly formed ranges
                    if len(range_parts) == 2 and range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [int(float(x.strip())) for x in re.split('-', range_cm)]
                    else:
                        logger.info("There's a cM range we can't handle for QTL %s: %s", qtl_id, range_cm)
                elif position_cm != '':
                    start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop when schema can handle floats
                # add in the genetic location based on the range
                f.addFeatureStartLocation(start, chrom_in_build_id, None, [Feature.types['FuzzyPosition']])
                f.addFeatureEndLocation(stop, chrom_in_build_id, None, [Feature.types['FuzzyPosition']])
                f.addFeatureToGraph(g)

                # sometimes there's a peak marker, like a rsid.  we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and re.match('rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:'+peak_mark.strip()

                    gu.addIndividualToGraph(g, dbsnp_id, None, geno.genoparts['sequence_alteration'])
                    gu.addXref(g, qtl_id, dbsnp_id)

                if gene_id is not None and gene_id != '' and gene_id != '.':
                    if gene_id_src == 'NCBIgene' or gene_id_src == '':  # we assume if no src is provided, it's NCBI
                        gene_id = 'NCBIGene:'+gene_id.strip()
                        geno.addGene(gene_id, None)  # we will expect that these labels provided elsewhere
                        geno.addAlleleOfGene(qtl_id, gene_id, geno.object_properties['feature_to_gene_relation'])   # FIXME what is the right relationship here?

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id
                            vl_id = '_' + re.sub(':', '', gene_id) + '-' + peak_mark
                            if self.nobnodes:
                                vl_id = ':' + vl_id
                            geno.addSequenceAlterationToVariantLocus(dbsnp_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)

                # add the trait
                gu.addClassToGraph(g, trait_id, trait_name)

                # Add publication
                r = None
                if re.match('ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:'+pubmed_id.strip()
                    r = Reference(pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:'+pubmed_id.strip()
                    r = Reference(pub_id, Reference.ref_types['journal_article'])

                if r is not None:
                    r.addRefToGraph(g)

                # make the association to the QTL
                assoc = G2PAssoc(self.name, qtl_id, trait_id, gu.object_properties['is_marker_for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    score = float(re.sub('<', '', p_values))
                    assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph(g)

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(self.name, dbsnp_id, trait_id, gu.object_properties['is_marker_for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        score = float(re.sub('<', '', p_values))
                        assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph(g)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        logger.info("Done with QTL genetic info")
        return
Пример #15
0
    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species
        ucsc_assembly_id_map = {
            "9606": {
                "UCSC:hg38": "NCBIGenome:GRCh38",
                "UCSC:hg19": "NCBIGenome:GRCh37",
                "UCSC:hg18": "NCBIGenome:36.1",
                "UCSC:hg17": "NCBIGenome:35",
                "UCSC:hg16": "NCBIGenome:34",
                "UCSC:hg15": "NCBIGenome:33",
                },
            "7955": {
                "UCSC:danRer10": "NCBIGenome:GRCz10",
                "UCSC:danRer7":	"NCBIGenome:Zv9",
                "UCSC:danRer6": "NCBIGenome:Zv8",
                },
            "10090": {
                "UCSC:mm10": "NCBIGenome:GRCm38",
                "UCSC:mm9":	"NCBIGenome:37"
            },
            "9031": {
                "UCSC:galGal4": "NCBIAssembly:317958",
                },
            "9913": {
                "UCSC:bosTau7": "NCBIAssembly:GCF_000003205.5",
                },
            "9823": {
                "UCSC:susScr3": "NCBIAssembly:304498",
                },
            "9940": {
                "UCSC:oviAri3": "NCBIAssembly:GCF_000298735.1",
                },
            "9796": {
                "UCSC:equCab2": "NCBIAssembly:GCF_000002305.2",
                }
        }
        g = self.graph
        geno = Genotype(g)
        model = Model(g)
        logger.info("Adding equivalent assembly identifiers")
        for sp in ucsc_assembly_id_map:
            tax_num = sp
            tax_id = 'NCBITaxon:'+tax_num
            mappings = ucsc_assembly_id_map[sp]
            for i in mappings:
                ucsc_id = i
                ucsc_label = re.split(':', i)[1]
                mapped_id = mappings[i]
                mapped_label = re.split(':', mapped_id)[1]
                mapped_label = 'NCBI build '+str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return
Пример #16
0
    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        geno = Genotype(g)
        f = Feature(g, None, None, None)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:' + tax_num
        tax_label = 'Human'
        model.addClassToGraph(tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)", num_cols,
                        expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)), tax_num,
                                'CHR')
                            geno.addChromosomeInstance(cytogenetic_loc,
                                                       build_id, assembly,
                                                       band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)), assembly,
                                'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(str(chr), build_id, assembly,
                                               chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    if re.match(r'^Gene:', gene_num):
                        gene_num = "NCBI" + gene_num
                    else:
                        gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph()
                f.addTaxonToFeature(tax_id)
                # make the ClinVarVariant the clique leader
                model.makeLeader(seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                    model.addIndividualToGraph(dbsnp_id, None)
                    model.addSameIndividual(seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:' + dbvar_num
                    model.addIndividualToGraph(dbvar_id, None)
                    model.addSameIndividual(seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        model.addIndividualToGraph(rcv_id, None)
                        model.addXref(seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    model.addClassToGraph(gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_' + gene_num + '-' + variant_num
                    if self.nobnodes:
                        vl_id = ':' + vl_id
                    vl_label = allele_name
                    model.addIndividualToGraph(vl_id, vl_label,
                                               geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info("No gene listed for variant %d",
                                    int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for phenotype in pheno_list:
                        m = re.match(r"(Orphanet:ORPHA(?:\s*ORPHA)?)",
                                     phenotype)
                        if m is not None and len(m.groups()) > 0:
                            phenotype = re.sub(m.group(1), 'Orphanet:',
                                               phenotype.strip())
                        elif re.match(r'ORPHA:\d+', phenotype):
                            phenotype = re.sub(r'^ORPHA', 'Orphanet',
                                               phenotype.strip())
                        elif re.match(r'Human Phenotype Ontology', phenotype):
                            phenotype = re.sub(r'^Human Phenotype Ontology',
                                               '', phenotype.strip())
                        elif re.match(r'SNOMED CT:\s?', phenotype):
                            phenotype = re.sub(r'SNOMED CT:\s?', 'SNOMED:',
                                               phenotype.strip())
                        elif re.match(r'^Gene:', phenotype):
                            continue

                        assoc = G2PAssoc(g, self.name, seqalt_id,
                                         phenotype.strip())
                        assoc.add_association_to_graph()

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:' + xrefid.split(':')[1]
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Finished parsing variants")

        return
Пример #17
0
    def _process_qtls_genetic_location(
            self, raw, txid, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:

        """
        if self.testMode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['quantitative trait analysis evidence']

        taxon_curie = 'NCBITaxon:' + txid

        LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (qtl_id,
                 qtl_symbol,
                 trait_name,
                 assotype,
                 empty,
                 chromosome,
                 position_cm,
                 range_cm,
                 flankmark_a2,
                 flankmark_a1,
                 peak_mark,
                 flankmark_b1,
                 flankmark_b2,
                 exp_id,
                 model_id,
                 test_base,
                 sig_level,
                 lod_score,
                 ls_mean,
                 p_values,
                 f_statistics,
                 variance,
                 bayes_value,
                 likelihood_ratio,
                 trait_id, dom_effect,
                 add_effect,
                 pubmed_id,
                 gene_id,
                 gene_id_src,
                 gene_id_type,
                 empty2) = row

                if self.testMode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = common_name + 'QTL:' + qtl_id.strip()
                trait_id = 'AQTLTrait:' + trait_id.strip()

                # Add QTL to graph
                feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
                feature.addTaxonToFeature(taxon_curie)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                # add a version of the chromosome which is defined as
                # the genetic map
                build_id = 'MONARCH:'+common_name.strip()+'-linkage'
                build_label = common_name+' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_curie)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                start = stop = None
                # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
                range_mb = re.split(r'\(', range_cm)
                if range_mb is not None:
                    range_cm = range_mb[0]

                if re.search(r'[0-9].*-.*[0-9]', range_cm):
                    range_parts = re.split(r'-', range_cm)

                    # check for poorly formed ranges
                    if len(range_parts) == 2 and\
                            range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [
                            int(float(x.strip())) for x in re.split(r'-', range_cm)]
                    else:
                        LOG.info(
                            "A cM range we can't handle for QTL %s: %s",
                            qtl_id, range_cm)
                elif position_cm != '':
                    match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                    if match is not None:
                        position_cm = match.group()
                        start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop
                # when schema can handle floats add in the genetic location
                # based on the range
                feature.addFeatureStartLocation(
                    start, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureEndLocation(
                    stop, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureToGraph()

                # sometimes there's a peak marker, like a rsid.
                # we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and \
                        re.match(r'rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:'+peak_mark.strip()

                    model.addIndividualToGraph(
                        dbsnp_id, None,
                        self.globaltt['sequence_alteration'])
                    model.addXref(qtl_id, dbsnp_id)

                gene_id = gene_id.replace('uncharacterized ', '').strip()
                if gene_id is not None and gene_id != '' and gene_id != '.'\
                        and re.fullmatch(r'[^ ]*', gene_id) is not None:

                    # we assume if no src is provided and gene_id is an integer,
                    # then it is an NCBI gene ... (okay, lets crank that back a notch)
                    if gene_id_src == '' and gene_id.isdigit() and \
                            gene_id in self.gene_info:
                        # LOG.info(
                        #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                        #    gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '' and gene_id.isdigit():
                        LOG.warning(
                            'Cold & Prickely saying %s is a NCBI gene for %s',
                            gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '':
                        LOG.error(
                            ' "%s" is a NOT NCBI gene for %s', gene_id, common_name)
                        gene_id_src = None

                    if gene_id_src == 'NCBIgene':
                        gene_id = 'NCBIGene:' + gene_id
                        # we will expect that these will get labels elsewhere
                        geno.addGene(gene_id, None)
                        # FIXME what is the right relationship here?
                        geno.addAffectedLocus(qtl_id, gene_id)

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id
                            vl_id = '_:' + re.sub(
                                r':', '', gene_id) + '-' + peak_mark.strip()
                            geno.addSequenceAlterationToVariantLocus(
                                dbsnp_id, vl_id)
                            geno.addAffectedLocus(vl_id, gene_id)

                # add the trait
                model.addClassToGraph(trait_id, trait_name)

                # Add publication
                reference = None
                if re.match(r'ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:'+pubmed_id.strip()
                    reference = Reference(graph, pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:' + pubmed_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])

                if reference is not None:
                    reference.addRefToGraph()

                # make the association to the QTL
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id, self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    scr = re.sub(r'<', '', p_values)
                    scr = re.sub(r',', '.', scr)  # international notation
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(
                        graph, self.name, dbsnp_id, trait_id,
                        self.globaltt['is marker for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        scr = re.sub(r'<', '', p_values)
                        scr = re.sub(r',', '.', scr)
                        if scr.isnumeric():
                            score = float(scr)
                            assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph()

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        LOG.info("Done with QTL genetic info")
        return
Пример #18
0
    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        geno = Genotype(g)
        f = Feature(g, None, None, None)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:'+tax_num
        tax_label = 'Human'
        model.addClassToGraph(tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)",
                        num_cols, expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                tax_num, 'CHR')
                            geno.addChromosomeInstance(
                                cytogenetic_loc, build_id, assembly, band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                assembly, 'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(
                        str(chr), build_id, assembly, chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    if re.match(r'^Gene:', gene_num):
                        gene_num = "NCBI" + gene_num
                    else:
                        gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph()
                f.addTaxonToFeature(tax_id)
                # make the ClinVarVariant the clique leader
                model.makeLeader(seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                    model.addIndividualToGraph(dbsnp_id, None)
                    model.addSameIndividual(seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:'+dbvar_num
                    model.addIndividualToGraph(dbvar_id, None)
                    model.addSameIndividual(seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        model.addIndividualToGraph(rcv_id, None)
                        model.addXref(seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    model.addClassToGraph(gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_'+gene_num+'-'+variant_num
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    vl_label = allele_name
                    model.addIndividualToGraph(
                        vl_id, vl_label, geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info(
                            "No gene listed for variant %d",
                            int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for phenotype in pheno_list:
                        m = re.match(
                            r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                        if m is not None and len(m.groups()) > 0:
                            phenotype = re.sub(
                                m.group(1), 'Orphanet:', phenotype.strip())
                        elif re.match(r'ORPHA:\d+', phenotype):
                            phenotype = re.sub(
                                r'^ORPHA', 'Orphanet', phenotype.strip())
                        elif re.match(r'Human Phenotype Ontology', phenotype):
                            phenotype = re.sub(
                                r'^Human Phenotype Ontology', '',
                                phenotype.strip())
                        elif re.match(r'SNOMED CT:\s?', phenotype):
                            phenotype = re.sub(
                                r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                        elif re.match(r'^Gene:', phenotype):
                            continue

                        assoc = G2PAssoc(
                            g, self.name, seqalt_id, phenotype.strip())
                        assoc.add_association_to_graph()

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:'+xrefid.split(':')[1]
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Finished parsing variants")

        return
Пример #19
0
    def _add_variant_cdna_variant_assoc_to_graph(self, row):
        """
        Generates relationships between variants and cDNA variants
        given a row of data
        :param iterable: row of data, see add_variant_info_to_graph()
                                      docstring for expected structure.
                                      Only applicable for structure 2.
        :return None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(self.graph)
        is_literal = True

        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna,
         cosmic_id, db_snp_id, genome_pos_start, genome_pos_end, ref_base,
         variant_base, primary_transcript_exons,
         primary_transcript_variant_sub_types, variant_type, chromosome,
         genome_build, build_version, build_date) = row

        variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

        # Add gene
        self._add_variant_gene_relationship(variant_id, variant_gene)

        # Transcript reference for nucleotide position
        transcript_curie = self._make_transcript_curie(transcript_id)

        # Make region IDs
        cdna_region_id = ":_{0}Region".format(transcript_curie)
        chrom_region_id = ":_{0}{1}Region-{2}-{3}".format(genome_build,
                                                          chromosome,
                                                          genome_pos_start,
                                                          genome_pos_end)

        # Add the genome build
        genome_label = "Human"
        build_id = "UCSC:{0}".format(genome_build)
        taxon_id = 'NCBITaxon:9606'
        geno.addGenome(taxon_id, genome_label)
        geno.addReferenceGenome(build_id, genome_build, taxon_id)

        # Add chromosome

        chrom_class_id = makeChromID(chromosome, '9606', 'CHR')  # the chrom class (generic) id
        chrom_instance_id = makeChromID(chromosome, build_id, 'MONARCH')

        # first, add the chromosome class (in the taxon)
        geno.addChromosomeClass(chromosome, taxon_id, 'Human')

        # then, add the chromosome instance (from the given build)
        geno.addChromosomeInstance(chromosome, build_id, genome_build, chrom_class_id)

        # Add variant coordinates in reference to chromosome
        self._add_feature_with_coords(variant_id,genome_pos_start,
                                      genome_pos_end, chrom_instance_id, chrom_region_id)

        # Add mutation coordinates in reference to gene
        self._add_feature_with_coords(variant_id, bp_pos,
                                      bp_pos, transcript_curie, cdna_region_id)

        # Add nucleotide mutation
        gu.addTriple(self.graph, variant_id,
                     geno.properties['reference_nucleotide'],
                     ref_base, is_literal)
        gu.addTriple(self.graph, variant_id,
                     geno.properties['altered_nucleotide'],
                     variant_base, is_literal)

        """
        Here we update any internal cgd variant IDS with a cosmic ID
        or dbSNP ID.  Alternatively we could do this using sql rather
        than a sparql update which may be safer
        """
        # Add SNP xrefs
        if cosmic_id is not None:
            cosmic_id_list = cosmic_id.split(', ')
            cosmic_curie_list = []
            for c_id in cosmic_id_list:
                cosmic_curie = re.sub(r'COSM(\d+)', r'COSMIC:\1', c_id)
                cosmic_curie_list.append(cosmic_curie)
                gu.addIndividualToGraph(self.graph, cosmic_curie, c_id,
                                        geno.genoparts['missense_variant'])

            # If there are multiple ids set them equivalent to the first
            for curie in cosmic_curie_list[1:]:
                gu.addSameIndividual(self.graph, cosmic_curie_list[0], curie)

            self._replace_entity(self.graph, variant_id, cosmic_curie_list[0], self.bindings)

        if db_snp_id is not None:
            db_snp_curie = re.sub(r'rs(\d+)', r'dbSNP:\1', db_snp_id)
            gu.addIndividualToGraph(self.graph, db_snp_curie, db_snp_id,
                                    geno.genoparts['missense_variant'])

            if cosmic_id is None:
                self._replace_entity(self.graph, variant_id, db_snp_curie, self.bindings)
            else:
                cosmic_id_list = cosmic_id.split(', ')
                for c_id in cosmic_id_list:
                    cosmic_curie = re.sub(r'COSM(\d+)', r'COSMIC:\1', c_id)
                    gu.addSameIndividual(self.graph, cosmic_curie, db_snp_curie)

        return