Python Genotype.addReferenceGenome примеры использования

Язык программирования: Python

Пространство имен/Пакет: dipper.models.Genotype

Класс/Тип: Genotype

Метод/Функция: addReferenceGenome

Примеров на hotexamples.com: 19

Python Genotype.addReferenceGenome - 19 примеров найдено. Это лучшие примеры Python кода для dipper.models.Genotype.Genotype.addReferenceGenome, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Genotype(30)

addTaxon(29)

addAffectedLocus(18)

addAlleleOfGene(14)

addChromosomeInstance(12)

addGene(12)

addGenome(11)

addReferenceGenome(11)

addGenotype(11)

addAllele(10)

addSequenceAlteration(7)

addParts(7)

addSequenceAlterationToVariantLocus(6)

addReagentTargetedGene(6)

addVSLCtoParent(5)

addGenomicBackgroundToGenotype(5)

addPartsToVSLC(5)

addChromosomeClass(4)

addGeneTargetingReagent(3)

addGeneProduct(3)

addSequenceDerivesFrom(2)

addGenomicBackground(2)

makeGenomeID(2)

addPolypeptide(1)

addDerivesFrom(1)

addTranscript(1)

make_experimental_model_with_genotype(1)

Пример #1

Показать файл

Файл: AnimalQTLdb.py Проект: JervenBolleman/dipper

    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if limit is not None:
            logger.info("Only parsing first %s rows fo each file", str(limit))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            g = self.testgraph
        else:
            g = self.graph

        tmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
        self._process_trait_mappings(tmap, limit)

        geno = Genotype(g)
        # organisms  = ['chicken']
        organisms = [
            'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

        for o in organisms:
            tax_id = self._get_tax_by_common_name(o)
            geno.addGenome(tax_id, o)
            build_id = None
            build = None

            k = o+'_bp'
            if k in self.files:
                file = self.files[k]['file']
                m = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', file)
                if m is None:
                    logger.error("Can't match a gff build")
                else:
                    build = m.group(1)
                    build_id = self._map_build_by_abbrev(build)
                    logger.info("Build = %s", build_id)
                    geno.addReferenceGenome(build_id, build, tax_id)
                if build_id is not None:
                    self._process_QTLs_genomic_location(
                        '/'.join((self.rawdir, file)), tax_id, build_id, build,
                        limit)

            k = o+'_cm'
            if k in self.files:
                file = self.files[k]['file']
                self._process_QTLs_genetic_location(
                    '/'.join((self.rawdir, file)), tax_id, o, limit)

        logger.info("Finished parsing")

        self.load_bindings()

        logger.info("Found %d nodes", len(self.graph))
        return

Пример #2

Показать файл

    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if limit is not None:
            logger.info("Only parsing first %s rows fo each file", str(limit))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            g = self.testgraph
        else:
            g = self.graph

        tmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
        self._process_trait_mappings(tmap, limit)

        geno = Genotype(g)
        # organisms  = ['chicken']
        organisms = [
            'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

        for o in organisms:
            tax_id = self._get_tax_by_common_name(o)
            geno.addGenome(tax_id, o)
            build_id = None
            build = None

            k = o + '_bp'
            if k in self.files:
                file = self.files[k]['file']
                m = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', file)
                if m is None:
                    logger.error("Can't match a gff build")
                else:
                    build = m.group(1)
                    build_id = self._map_build_by_abbrev(build)
                    logger.info("Build = %s", build_id)
                    geno.addReferenceGenome(build_id, build, tax_id)
                if build_id is not None:
                    self._process_QTLs_genomic_location(
                        '/'.join((self.rawdir, file)), tax_id, build_id, build,
                        limit)

            k = o+'_cm'
            if k in self.files:
                file = self.files[k]['file']
                self._process_QTLs_genetic_location(
                    '/'.join((self.rawdir, file)), tax_id, o, limit)

        logger.info("Finished parsing")
        return

Пример #3

Показать файл

    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species

        graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        logger.info("Adding equivalent assembly identifiers")
        for sp in self.species:
            tax_id = self.resolve(sp)
            txid_num = tax_id.split(':')[1]
            for key in self.files[txid_num]['assembly']:
                ucsc_id = key
                try:
                    ucsc_label = ucsc_id.split(':')[1]
                except IndexError:
                    logger.error('%s Assembly id:  "%s" is problematic', sp,
                                 key)
                    continue
                if key in self.localtt:
                    mapped_id = self.localtt[key]
                else:
                    logger.error(
                        '%s Assembly id:  "%s" is not in local translation table',
                        sp, key)

                mapped_label = mapped_id.split(':')[1]

                mapped_label = 'NCBI build ' + str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return

Пример #4

Показать файл

Файл: UCSCBands.py Проект: TomConlin/dipper

    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species

        graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        LOG.info("Adding equivalent assembly identifiers")
        for sp in self.species:
            tax_id = self.globaltt[sp]
            txid_num = tax_id.split(':')[1]
            for key in self.files[txid_num]['assembly']:
                ucsc_id = key
                try:
                    ucsc_label = ucsc_id.split(':')[1]
                except IndexError:
                    LOG.error('%s Assembly id:  "%s" is problematic', sp, key)
                    continue
                if key in self.localtt:
                    mapped_id = self.localtt[key]
                else:
                    LOG.error(
                        '%s Assembly id:  "%s" is not in local translation table',
                        sp, key)

                mapped_label = mapped_id.split(':')[1]

                mapped_label = 'NCBI build ' + str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return

Пример #5

Показать файл

Файл: UCSCBands.py Проект: DoctorBud/dipper

    def _get_chrbands(self, limit, taxon):
        """
        :param limit:
        :return:

        """
        model = Model(self.graph)
        # TODO PYLINT figure out what limit was for and why it is unused
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        logger.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)
        monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

        # used to hold band definitions for a chr
        # in order to compute extent of encompasing bands

        mybands = {}
        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:'+taxon

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_id, None)
        model.addSynonym(taxon_id, genome_label)

        geno.addGenome(taxon_id, genome_label)

        # add the build and the taxon it's in
        build_num = self.files[taxon]['build_num']
        build_id = 'UCSC:'+build_num
        geno.addReferenceGenome(build_id, build_num, taxon_id)

        # process the bands
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match('^#', line):
                    continue

                # chr13	4500000	10000000	p12	stalk
                (scaffold, start, stop, band_num, rtype) = line.split('\t')
                line_counter += 1

                # NOTE some less-finished genomes have
                # placed and unplaced scaffolds
                # * Placed scaffolds:
                #       the scaffolds have been placed within a chromosome.
                # * Unlocalized scaffolds:
                #   although the chromosome within which the scaffold occurs
                #   is known, the scaffold's position or orientation
                #   is not known.
                # * Unplaced scaffolds:
                #   it is not known which chromosome the scaffold belongs to
                #
                # find out if the thing is a full on chromosome, or a scaffold:
                # ex: unlocalized scaffold: chr10_KL568008v1_random
                # ex: unplaced scaffold: chrUn_AABR07022428v1
                placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
                unlocalized_scaffold_pattern = \
                    placed_scaffold_pattern+r'_(\w+)_random'
                unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)'

                m = re.match(placed_scaffold_pattern+r'$', scaffold)
                if m is not None and len(m.groups()) == 1:
                    # the chromosome is the first match of the pattern
                    chrom_num = m.group(1)
                else:
                    # skip over anything that isn't a placed_scaffold
                    # at the class level
                    logger.info("Found non-placed chromosome %s", scaffold)
                    chrom_num = None

                m_chr_unloc = re.match(unlocalized_scaffold_pattern, scaffold)
                m_chr_unplaced = re.match(unplaced_scaffold_pattern, scaffold)

                scaffold_num = None
                if m:
                    pass
                elif m_chr_unloc is not None and\
                        len(m_chr_unloc.groups()) == 2:
                    chrom_num = m_chr_unloc.group(1)
                    scaffold_num = chrom_num+'_'+m_chr_unloc.group(2)
                elif m_chr_unplaced is not None and\
                        len(m_chr_unplaced.groups()) == 1:
                    scaffold_num = m_chr_unplaced.group(1)
                else:
                    logger.error(
                        "There's a chr pattern that we aren't matching: %s",
                        scaffold)

                if chrom_num is not None:
                    # the chrom class (generic) id
                    chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                    # first, add the chromosome class (in the taxon)
                    geno.addChromosomeClass(
                        chrom_num, taxon_id, self.files[taxon]['genome_label'])

                    # then, add the chromosome instance (from the given build)
                    geno.addChromosomeInstance(chrom_num, build_id, build_num,
                                               chrom_class_id)

                    # add the chr to the hashmap of coordinates for this build
                    # the chromosome coordinate space is itself
                    if chrom_num not in mybands.keys():
                        mybands[chrom_num] = {
                            'min': 0,
                            'max': int(stop),
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': Feature.types['chromosome']}

                if scaffold_num is not None:
                    # this will put the coordinates of the scaffold
                    # in the scaffold-space and make sure that the scaffold
                    # is part of the correct parent.
                    # if chrom_num is None,
                    # then it will attach it to the genome,
                    # just like a reg chrom
                    mybands[scaffold_num] = {
                        'min': start,
                        'max': stop,
                        'chr': scaffold_num,
                        'ref': build_id,
                        'parent': chrom_num,
                        'stain': None,
                        'type': Feature.types['assembly_component'],
                        'synonym': scaffold}

                if band_num is not None and band_num.strip() != '':
                    # add the specific band
                    mybands[chrom_num+band_num] = {'min': start,
                                                   'max': stop,
                                                   'chr': chrom_num,
                                                   'ref': build_id,
                                                   'parent': None,
                                                   'stain': None,
                                                   'type': None}

                    # add the staining intensity of the band
                    if re.match(r'g(neg|pos|var)', rtype):
                        mybands[chrom_num+band_num]['stain'] = \
                            Feature.types.get(rtype)

                    # get the parent bands, and make them unique
                    parents = list(
                        monochrom.make_parent_bands(band_num, set()))
                    # alphabetical sort will put them in smallest to biggest,
                    # so we reverse
                    parents.sort(reverse=True)
                    # print('parents of',chrom,band,':',parents)

                    if len(parents) > 0:
                        mybands[chrom_num+band_num]['parent'] = \
                            chrom_num+parents[0]
                else:
                    # TODO PYLINT why is 'parent'
                    # a list() a couple of lines up and a set() here?
                    parents = set()

                # loop through the parents and add them to the hash
                # add the parents to the graph, in hierarchical order
                # TODO PYLINT Consider using enumerate
                # instead of iterating with range and len
                for i in range(len(parents)):
                    rti = getChrPartTypeByNotation(parents[i])

                    pnum = chrom_num+parents[i]
                    sta = int(start)
                    sto = int(stop)
                    if pnum not in mybands.keys():
                        # add the parental band to the hash
                        b = {'min': min(sta, sto),
                             'max': max(sta, sto),
                             'chr': chrom_num,
                             'ref': build_id,
                             'parent': None,
                             'stain': None,
                             'type': rti}
                        mybands[pnum] = b
                    else:
                        # band already in the hash means it's a grouping band
                        # need to update the min/max coords
                        b = mybands.get(pnum)
                        b['min'] = min(sta, sto, b['min'])
                        b['max'] = max(sta, sto, b['max'])
                        mybands[pnum] = b

                        # also, set the max for the chrom
                        c = mybands.get(chrom_num)
                        c['max'] = max(sta, sto, c['max'])
                        mybands[chrom_num] = c

                    # add the parent relationships to each
                    if i < len(parents) - 1:
                        mybands[pnum]['parent'] = chrom_num+parents[i+1]
                    else:
                        # add the last one (p or q usually)
                        # as attached to the chromosome
                        mybands[pnum]['parent'] = chrom_num

        f.close()  # end looping through file

        # loop through the hash and add the bands to the graph
        for b in mybands.keys():
            myband = mybands.get(b)
            band_class_id = makeChromID(b, taxon, 'CHR')
            band_class_label = makeChromLabel(b, genome_label)
            band_build_id = makeChromID(b, build_num, 'MONARCH')
            band_build_label = makeChromLabel(b, build_num)
            # the build-specific chrom
            chrom_in_build_id = makeChromID(
                myband['chr'], build_num, 'MONARCH')
            # if it's != part, then add the class
            if myband['type'] != Feature.types['assembly_component']:
                model.addClassToGraph(band_class_id,
                                      band_class_label, myband['type'])
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   band_class_id)
            else:
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   myband['type'])
                if 'synonym' in myband:
                    model.addSynonym(band_build_id, myband['synonym'])

            if myband['parent'] is None:
                if myband['type'] == Feature.types['assembly_component']:
                    # since we likely don't know the chr,
                    # add it as a part of the build
                    geno.addParts(band_build_id, build_id)
            elif myband['type'] == Feature.types['assembly_component']:
                # geno.addParts(band_build_id, chrom_in_build_id)
                parent_chrom_in_build = makeChromID(myband['parent'],
                                                    build_num, 'MONARCH')
                bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

            # add the band as a feature
            # (which also instantiates the owl:Individual)
            bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
            bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
            if 'stain' in myband and myband['stain'] is not None:
                # TODO 'has_staining_intensity' being dropped by MB
                bfeature.addFeatureProperty(
                    Feature.properties['has_staining_intensity'],
                    myband['stain'])

            # type the band as a faldo:Region directly (add_region=False)
            # bfeature.setNoBNodes(self.nobnodes)
            # to come when we merge in ZFIN.py
            bfeature.addFeatureToGraph(False)

        return

Пример #6

Показать файл

Файл: AnimalQTLdb.py Проект: TomConlin/dipper

    def _process_qtls_genetic_location(
            self, raw, txid, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:

        """
        aql_curie = self.files[common_name + '_cm']['curie']

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['quantitative trait analysis evidence']

        taxon_curie = 'NCBITaxon:' + txid

        LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (qtl_id,
                 qtl_symbol,
                 trait_name,
                 assotype,
                 empty,
                 chromosome,
                 position_cm,
                 range_cm,
                 flankmark_a2,
                 flankmark_a1,
                 peak_mark,
                 flankmark_b1,
                 flankmark_b2,
                 exp_id,
                 model_id,
                 test_base,
                 sig_level,
                 lod_score,
                 ls_mean,
                 p_values,
                 f_statistics,
                 variance,
                 bayes_value,
                 likelihood_ratio,
                 trait_id, dom_effect,
                 add_effect,
                 pubmed_id,
                 gene_id,
                 gene_id_src,
                 gene_id_type,
                 empty2) = row

                if self.test_mode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = common_name + 'QTL:' + qtl_id.strip()
                trait_id = ':'.join((aql_curie, trait_id.strip()))

                # Add QTL to graph
                feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
                feature.addTaxonToFeature(taxon_curie)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                # add a version of the chromosome which is defined as
                # the genetic map
                build_id = 'MONARCH:'+common_name.strip()+'-linkage'
                build_label = common_name+' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_curie)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                start = stop = None
                # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
                range_mb = re.split(r'\(', range_cm)
                if range_mb is not None:
                    range_cm = range_mb[0]

                if re.search(r'[0-9].*-.*[0-9]', range_cm):
                    range_parts = re.split(r'-', range_cm)

                    # check for poorly formed ranges
                    if len(range_parts) == 2 and\
                            range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [
                            int(float(x.strip())) for x in re.split(r'-', range_cm)]
                    else:
                        LOG.info(
                            "A cM range we can't handle for QTL %s: %s",
                            qtl_id, range_cm)
                elif position_cm != '':
                    match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                    if match is not None:
                        position_cm = match.group()
                        start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop
                # when schema can handle floats add in the genetic location
                # based on the range
                feature.addFeatureStartLocation(
                    start, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureEndLocation(
                    stop, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureToGraph()

                # sometimes there's a peak marker, like a rsid.
                # we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and \
                        re.match(r'rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:'+peak_mark.strip()

                    model.addIndividualToGraph(
                        dbsnp_id, None,
                        self.globaltt['sequence_alteration'])
                    model.addXref(qtl_id, dbsnp_id)

                gene_id = gene_id.replace('uncharacterized ', '').strip()
                if gene_id is not None and gene_id != '' and gene_id != '.'\
                        and re.fullmatch(r'[^ ]*', gene_id) is not None:

                    # we assume if no src is provided and gene_id is an integer,
                    # then it is an NCBI gene ... (okay, lets crank that back a notch)
                    if gene_id_src == '' and gene_id.isdigit() and \
                            gene_id in self.gene_info:
                        # LOG.info(
                        #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                        #    gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '' and gene_id.isdigit():
                        LOG.warning(
                            'Cold & Prickely saying %s is a NCBI gene for %s',
                            gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '':
                        LOG.error(
                            ' "%s" is a NOT NCBI gene for %s', gene_id, common_name)
                        gene_id_src = None

                    if gene_id_src == 'NCBIgene':
                        gene_id = 'NCBIGene:' + gene_id
                        # we will expect that these will get labels elsewhere
                        geno.addGene(gene_id, None)
                        # FIXME what is the right relationship here?
                        geno.addAffectedLocus(qtl_id, gene_id)

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id
                            vl_id = '_:' + re.sub(
                                r':', '', gene_id) + '-' + peak_mark.strip()
                            geno.addSequenceAlterationToVariantLocus(
                                dbsnp_id, vl_id)
                            geno.addAffectedLocus(vl_id, gene_id)

                # add the trait
                model.addClassToGraph(trait_id, trait_name)

                # Add publication
                reference = None
                if re.match(r'ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:'+pubmed_id.strip()
                    reference = Reference(graph, pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:' + pubmed_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])

                if reference is not None:
                    reference.addRefToGraph()

                # make the association to the QTL
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id, self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    scr = re.sub(r'<', '', p_values)
                    scr = re.sub(r',', '.', scr)  # international notation
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(
                        graph, self.name, dbsnp_id, trait_id,
                        self.globaltt['is marker for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        scr = re.sub(r'<', '', p_values)
                        scr = re.sub(r',', '.', scr)
                        if scr.isnumeric():
                            score = float(scr)
                            assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph()

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        LOG.info("Done with QTL genetic info")
        return

Пример #7

Показать файл

Файл: AnimalQTLdb.py Проект: TomConlin/dipper

    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if limit is not None:
            LOG.info("Only parsing first %s rows fo each file", str(limit))

        LOG.info("Parsing files...")

        if self.test_only:
            self.test_mode = True
            graph = self.testgraph
        else:
            graph = self.graph

        traitmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
        self._process_trait_mappings(traitmap, limit)

        geno = Genotype(graph)
        animals = ['chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

        for common_name in animals:
            txid_num = self.resolve(common_name).split(':')[1]
            taxon_label = self.localtt[common_name]
            taxon_curie = self.globaltt[taxon_label]
            taxon_num = taxon_curie.split(':')[1]
            txid_num = taxon_num  # for now
            taxon_word = taxon_label.replace(' ', '_')
            gene_info_file = '/'.join((
                self.rawdir, self.files[taxon_word + '_info']['file']))
            self.gene_info = list()
            LOG.info('Ingesting %s', gene_info_file)
            with gzip.open(gene_info_file, 'rt') as gi_gz:
                filereader = csv.reader(gi_gz, delimiter='\t')
                for row in filereader:
                    if row[0][0] == '#':
                        continue
                    else:
                        self.gene_info.append(str(row[1]))  # tossing lots of good stuff
            LOG.info(
                'Gene Info for %s has %i enteries', common_name, len(self.gene_info))
            # LOG.info('Gene Info entery looks like %s', self.gene_info[5])

            build = None

            fname_bp = common_name + '_bp'
            if fname_bp in self.files:
                bpfile = self.files[fname_bp]['file']
                mch = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', bpfile)
                if mch is None:
                    LOG.error("Can't match a gff build to " + fname_bp)
                else:
                    build = mch.group(1)
                    build_id = self.localtt[build]
                    LOG.info("Build UCSC label is: %s", build_id)

                    # NCBI assembly curie is

                    geno.addReferenceGenome(build_id, build, txid_num)

                if build_id is not None:
                    self._process_qtls_genomic_location(
                        '/'.join((self.rawdir, bpfile)), txid_num, build_id, build,
                        common_name, limit)

            fname_cm = common_name + '_cm'
            if fname_cm in self.files:
                cmfile = self.files[fname_cm]['file']
                self._process_qtls_genetic_location(
                    '/'.join((self.rawdir, cmfile)), txid_num, common_name, limit)

        LOG.info("Finished parsing")
        return

Пример #8

Показать файл

    def _process_qtls_genetic_location(
            self, raw, src_key, txid, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:

        """
        aql_curie = self.files[src_key]['curie']
        common_name = common_name.strip()
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['quantitative trait analysis evidence']
        taxon_curie = 'NCBITaxon:' + txid

        LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # no header in these files, so no header checking
            col = self.files[src_key]['columns']
            col_len = len(col)
            for row in reader:
                if len(row) != col_len and ''.join(row[col_len:]) != '':
                    LOG.warning(
                        "Problem parsing %s line %i containing: \n%s\n"
                        "got %i cols but expected %i",
                        raw, reader.line_num, row, len(row), col_len)
                    # LOG.info(row)
                    continue

                qtl_id = row[col.index('QTL_ID')].strip()
                qtl_symbol = row[col.index('QTL_symbol')].strip()
                trait_name = row[col.index('Trait_name')].strip()
                # assotype = row[col.index('assotype')].strip()
                chromosome = row[col.index('Chromosome')].strip()
                position_cm = row[col.index('Position_cm')].strip()
                range_cm = row[col.index('range_cm')].strip()
                # flankmark_a2 = row[col.index('FlankMark_A2')].strip()
                # flankmark_a1 = row[col.index('FlankMark_A1')].strip()
                peak_mark = row[col.index('Peak_Mark')].strip()
                # flankmark_b1 = row[col.index('FlankMark_B1')].strip()
                # flankmark_b2 = row[col.index('FlankMark_B2')].strip()
                # exp_id = row[col.index('Exp_ID')].strip()
                # model_id = row[col.index('Model')].strip()
                # test_base = row[col.index('testbase')].strip()
                # sig_level = row[col.index('siglevel')].strip()
                # lod_score = row[col.index('LOD_score')].strip()
                # ls_mean = row[col.index('LS_mean')].strip()
                p_values = row[col.index('P_values')].strip()
                # f_statistics = row[col.index('F_Statistics')].strip()
                # variance = row[col.index('VARIANCE')].strip()
                # bayes_value = row[col.index('Bayes_value')].strip()
                # likelihood_ratio = row[col.index('LikelihoodR')].strip()
                trait_id = row[col.index('TRAIT_ID')].strip()
                # dom_effect = row[col.index('Dom_effect')].strip()
                # add_effect = row[col.index('Add_effect')].strip()
                pubmed_id = row[col.index('PUBMED_ID')].strip()
                gene_id = row[col.index('geneID')].strip()
                gene_id_src = row[col.index('geneIDsrc')].strip()
                # gene_id_type = row[col.index('geneIDtype')].strip()

                if self.test_mode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = common_name + 'QTL:' + qtl_id.strip()
                trait_id = ':'.join((aql_curie, trait_id.strip()))

                # Add QTL to graph
                feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
                feature.addTaxonToFeature(taxon_curie)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                # add a version of the chromosome which is defined as
                # the genetic map
                build_id = 'MONARCH:' + common_name + '-linkage'
                build_label = common_name + ' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_curie)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                start = stop = None
                # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
                range_mb = re.split(r'\(', range_cm)
                if range_mb is not None:
                    range_cm = range_mb[0]

                if re.search(r'[0-9].*-.*[0-9]', range_cm):
                    range_parts = re.split(r'-', range_cm)

                    # check for poorly formed ranges
                    if len(range_parts) == 2 and\
                            range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [
                            int(float(x.strip())) for x in re.split(r'-', range_cm)]
                    else:
                        LOG.info(
                            "A cM range we can't handle for QTL %s: %s",
                            qtl_id, range_cm)
                elif position_cm != '':
                    match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                    if match is not None:
                        position_cm = match.group()
                        start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop
                # when schema can handle floats add in the genetic location
                # based on the range
                feature.addFeatureStartLocation(
                    start, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureEndLocation(
                    stop, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureToGraph()

                # sometimes there's a peak marker, like a rsid.
                # we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and \
                        re.match(r'rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:' + peak_mark.strip()

                    model.addIndividualToGraph(
                        dbsnp_id, None, self.globaltt['sequence_alteration'])

                    model.addXref(
                        qtl_id, dbsnp_id, xref_category=blv.terms['SequenceVariant'])

                gene_id = gene_id.replace('uncharacterized ', '').strip()
                gene_id = gene_id.strip(',')  # for "100157483,"  in pig_QTLdata.txt
                if gene_id is not None and gene_id != '' and gene_id != '.'\
                        and re.fullmatch(r'[^ ]*', gene_id) is not None:

                    # we assume if no src is provided and gene_id is an integer,
                    # then it is an NCBI gene ... (okay, lets crank that back a notch)
                    if gene_id_src == '' and gene_id.isdigit() and \
                            gene_id in self.gene_info:
                        # LOG.info(
                        #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                        #    gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '' and gene_id.isdigit():
                        LOG.warning(
                            'Cold & Prickely saying %s is a NCBI gene for %s',
                            gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '':
                        LOG.error(
                            ' "%s" is a NOT NCBI gene for %s', gene_id, common_name)
                        gene_id_src = None

                    if gene_id_src == 'NCBIgene':
                        gene_id = 'NCBIGene:' + gene_id
                        # we will expect that these will get labels elsewhere
                        geno.addGene(gene_id, None)
                        # FIXME what is the right relationship here?
                        geno.addAffectedLocus(qtl_id, gene_id)

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id as a bnode
                            vl_id = self.make_id(re.sub(
                                r':', '', gene_id) + '-' + peak_mark.strip(), '_')
                            geno.addSequenceAlterationToVariantLocus(dbsnp_id, vl_id)
                            geno.addAffectedLocus(vl_id, gene_id)

                # add the trait
                model.addClassToGraph(
                    trait_id,
                    trait_name,
                    class_category=blv.terms['PhenotypicFeature'])

                # Add publication
                reference = None
                if re.match(r'ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:' + pubmed_id.strip()
                    reference = Reference(graph, pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:' + pubmed_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])

                if reference is not None:
                    reference.addRefToGraph()

                # make the association to the QTL
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id, self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    scr = re.sub(r'<', '', p_values)
                    scr = re.sub(r',', '.', scr)  # international notation
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(
                        graph, self.name, dbsnp_id, trait_id,
                        self.globaltt['is marker for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        scr = re.sub(r'<', '', p_values)
                        scr = re.sub(r',', '.', scr)
                        if scr.isnumeric():
                            score = float(scr)
                            assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph()

                # off by one - the following actually gives us (limit + 1) records
                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break

        LOG.info("Done with QTL genetic info")

Пример #9

Показать файл

    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if limit is not None:
            LOG.info("Only parsing first %s rows fo each file", str(limit))

        if self.test_only:
            self.test_mode = True
            graph = self.testgraph
        else:
            graph = self.graph

        trait_src_key = 'trait_mappings'
        traitmap = '/'.join((self.rawdir, self.files[trait_src_key]['file']))

        LOG.info("Parsing trait mapping  file %s", traitmap)
        self._process_trait_mappings(traitmap, trait_src_key, limit)

        geno = Genotype(graph)
        animals = ['chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

        for common_name in animals:
            txid_num = self.resolve(common_name).split(':')[1]
            taxon_label = self.localtt[common_name]
            taxon_curie = self.globaltt[taxon_label]
            taxon_num = taxon_curie.split(':')[1]
            txid_num = taxon_num  # for now
            taxon_word = taxon_label.strip().replace(' ', '_')
            src_key = taxon_word + '_info'
            gene_info_file = '/'.join((
                self.rawdir, self.files[src_key]['file']))
            self.gene_info = list()
            LOG.info('Ingesting %s', gene_info_file)
            with gzip.open(gene_info_file, 'rt') as gi_gz:
                reader = csv.reader(gi_gz, delimiter='\t')
                # skipping header checking, b/c not all of these gene_info files have
                # headers
                col = self.files[src_key]['columns']
                col_len = len(col)
                for row in reader:
                    if row[0][0] == '#':
                        # LOG.info(row)
                        continue
                    if len(row) != col_len and ''.join(row[col_len:]) != '':
                        LOG.warning(
                            "Problem parsing in %s row %i\n"
                            "got %s cols but expected %s",
                            gene_info_file, reader.line_num, len(row), col_len)
                        LOG.info(row)

                    self.gene_info.append(row[col.index('GeneID')])
            LOG.info(
                'Gene Info for %s has %i entries', common_name, len(self.gene_info))
            build = None

            fname_bp = common_name + '_bp'
            if fname_bp in self.files:
                bpfile = self.files[fname_bp]['file']
                mch = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', bpfile)
                if mch is None:
                    LOG.error("Can't match a gff build to " + fname_bp)
                else:
                    build = mch.group(1)
                    build_id = self.localtt[build]
                    LOG.info("Build UCSC label is: %s", build_id)

                    geno.addReferenceGenome(build_id, build, txid_num)

                if build_id is not None:
                    self._process_qtls_genomic_location(
                        '/'.join((self.rawdir, bpfile)),
                        fname_bp,
                        txid_num,
                        build_id,
                        build,
                        common_name,
                        limit
                    )

            fname_cm = common_name + '_cm'
            if fname_cm in self.files:
                cmfile = self.files[fname_cm]['file']
                self._process_qtls_genetic_location(
                    '/'.join((self.rawdir, cmfile)),
                    fname_cm,
                    txid_num,
                    common_name,
                    limit)

        LOG.info("Finished parsing")

Пример #10

Показать файл

Файл: UCSCBands.py Проект: putmantime/dipper

    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species
        ucsc_assembly_id_map = {
            "9606": {
                "UCSC:hg38": "NCBIGenome:GRCh38",
                "UCSC:hg19": "NCBIGenome:GRCh37",
                "UCSC:hg18": "NCBIGenome:36.1",
                "UCSC:hg17": "NCBIGenome:35",
                "UCSC:hg16": "NCBIGenome:34",
                "UCSC:hg15": "NCBIGenome:33",
            },
            "7955": {
                "UCSC:danRer10": "NCBIGenome:GRCz10",
                "UCSC:danRer7": "NCBIGenome:Zv9",
                "UCSC:danRer6": "NCBIGenome:Zv8",
            },
            "10090": {
                "UCSC:mm10": "NCBIGenome:GRCm38",
                "UCSC:mm9": "NCBIGenome:37"
            },
            "9031": {
                "UCSC:galGal4": "NCBIAssembly:317958",
            },
            "9913": {
                "UCSC:bosTau7": "NCBIAssembly:GCF_000003205.5",
            },
            "9823": {
                "UCSC:susScr3": "NCBIAssembly:304498",
            },
            "9940": {
                "UCSC:oviAri3": "NCBIAssembly:GCF_000298735.1",
            },
            "9796": {
                "UCSC:equCab2": "NCBIAssembly:GCF_000002305.2",
            }
        }
        g = self.graph
        geno = Genotype(g)
        model = Model(g)
        logger.info("Adding equivalent assembly identifiers")
        for sp in ucsc_assembly_id_map:
            tax_num = sp
            tax_id = 'NCBITaxon:' + tax_num
            mappings = ucsc_assembly_id_map[sp]
            for i in mappings:
                ucsc_id = i
                ucsc_label = re.split(':', i)[1]
                mapped_id = mappings[i]
                mapped_label = re.split(':', mapped_id)[1]
                mapped_label = 'NCBI build ' + str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return

Пример #11

Показать файл

    def _get_chrbands(self, limit, src_key, genome_id):
        """
        :param limit:
        :return:

        """
        tax_num = src_key
        if limit is None:
            limit = sys.maxsize  # practical limit anyway
        model = Model(self.graph)
        line_num = 0
        myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)
        monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

        # used to hold band definitions for a chr
        # in order to compute extent of encompasing bands

        mybands = {}
        # build the organism's genome from the taxon
        genome_label = self.files[src_key]['genome_label']
        taxon_curie = 'NCBITaxon:' + tax_num
        species_name = self.globaltcid[taxon_curie]  # for logging

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_curie, None)
        model.addSynonym(taxon_curie, genome_label)

        geno.addGenome(taxon_curie, genome_label, genome_id)

        # add the build and the taxon it's in
        build_num = self.files[src_key]['build_num']
        build_id = 'UCSC:' + build_num
        geno.addReferenceGenome(build_id, build_num, taxon_curie)

        # cat (at least)  also has  chr[BDAECF]... hex? must be a back cat.
        if tax_num == self.localtt['Felis catus']:
            placed_scaffold_regex = re.compile(
                r'(chr(?:[BDAECF]\d+|X|Y|Z|W|M|))$')
        else:
            placed_scaffold_regex = re.compile(r'(chr(?:\d+|X|Y|Z|W|M))$')
        unlocalized_scaffold_regex = re.compile(r'_(\w+)_random')
        unplaced_scaffold_regex = re.compile(r'chr(Un(?:_\w+)?)')

        # process the bands
        col = self.files[src_key]['columns']

        with gzip.open(myfile, 'rb') as binreader:
            for line in binreader:
                line_num += 1
                # skip comments
                line = line.decode().strip()
                if line[0] == '#' or line_num > limit:
                    continue
                # chr13	4500000	10000000	p12	stalk
                row = line.split('\t')
                scaffold = row[col.index('chrom')].strip()
                start = row[col.index('chromStart')]
                stop = row[col.index('chromEnd')]
                band_num = row[col.index('name')].strip()
                rtype = row[col.index('gieStain')]

                # NOTE some less-finished genomes have
                # placed and unplaced scaffolds
                # * Placed scaffolds:
                #       the scaffolds have been placed within a chromosome.
                # * Unlocalized scaffolds:
                #   although the chromosome within which the scaffold occurs
                #   is known, the scaffold's position or orientation
                #   is not known.
                # * Unplaced scaffolds:
                #   it is not known which chromosome the scaffold belongs to
                #
                # find out if the thing is a full on chromosome, or a scaffold:
                # ex: unlocalized scaffold: chr10_KL568008v1_random
                # ex: unplaced scaffold: chrUn_AABR07022428v1

                mch = placed_scaffold_regex.match(scaffold)
                if mch is not None and len(mch.groups()) == 1:
                    # the chromosome is the first match of the pattern
                    chrom_num = mch.group(1)
                else:
                    # skip over anything that isn't a placed_scaffold at the class level
                    # LOG.info("Found non-placed chromosome %s", scaffold)
                    chrom_num = None

                m_chr_unloc = unlocalized_scaffold_regex.match(scaffold)
                m_chr_unplaced = unplaced_scaffold_regex.match(scaffold)

                scaffold_num = None
                if mch:
                    pass
                elif m_chr_unloc is not None and len(
                        m_chr_unloc.groups()) == 2:
                    chrom_num = m_chr_unloc.group(1)
                    scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
                elif m_chr_unplaced is not None and len(
                        m_chr_unplaced.groups()) == 1:
                    scaffold_num = m_chr_unplaced.group(1)
                # else:
                #    LOG.error(
                #        "There's a chr pattern that we aren't matching: %s", scaffold)

                if chrom_num is not None:
                    # the chrom class (generic) id
                    chrom_class_id = makeChromID(chrom_num, tax_num, 'CHR')

                    # first, add the chromosome class (in the taxon)
                    geno.addChromosomeClass(
                        chrom_num, taxon_curie,
                        self.files[src_key]['genome_label'])

                    # then, add the chromosome instance (from the given build)
                    geno.addChromosomeInstance(chrom_num, build_id, build_num,
                                               chrom_class_id)

                    # add the chr to the hashmap of coordinates for this build
                    # the chromosome coordinate space is itself
                    if chrom_num not in mybands.keys():
                        mybands[chrom_num] = {
                            'min': 0,
                            'max': int(stop),
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': self.globaltt['chromosome']
                        }
                elif scaffold_num is not None:
                    # this will put the coordinates of the scaffold
                    # in the scaffold-space and make sure that the scaffold
                    # is part of the correct parent.
                    # if chrom_num is None,
                    # then it will attach it to the genome,
                    # just like a reg chrom
                    mybands[scaffold_num] = {
                        'min': start,
                        'max': stop,
                        'chr': scaffold_num,
                        'ref': build_id,
                        'parent': chrom_num,
                        'stain': None,
                        'type': self.globaltt['assembly_component'],
                        'synonym': scaffold
                    }
                else:
                    LOG.info('%s line %i DROPPED chromosome/scaffold  %s',
                             species_name, line_num, scaffold)

                parents = list()

                # see it new types have showed up
                if rtype is not None and rtype not in [
                        'gneg', 'gpos25', 'gpos33', 'gpos50', 'gpos66',
                        'gpos75', 'gpos100', 'acen', 'gvar', 'stalk'
                ]:
                    LOG.info('Unknown gieStain type "%s" in %s at %i', rtype,
                             src_key, line_num)
                    self.globaltt[rtype]  # blow up

                if rtype == 'acen':  # hacky, revisit if ontology improves
                    rtype = self.localtt[rtype]

                if band_num is not None and band_num != '' and \
                        rtype is not None and rtype != '':
                    # add the specific band
                    mybands[chrom_num + band_num] = {
                        'min': start,
                        'max': stop,
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt[rtype],
                    }

                    # add the staining intensity of the band
                    # get the parent bands, and make them unique
                    parents = list(monochrom.make_parent_bands(
                        band_num, set()))
                    # alphabetical sort will put them in smallest to biggest,
                    # so we reverse
                    parents.sort(reverse=True)
                    # print('parents of',chrom,band,':',parents)

                    if len(parents) > 0:
                        mybands[chrom_num +
                                band_num]['parent'] = chrom_num + parents[0]
                    # else:   # band has no parents

                # loop through the parents and add them to the dict
                # add the parents to the graph, in hierarchical order
                # TODO PYLINT Consider using enumerate
                # instead of iterating with range and len
                for i in range(len(parents)):
                    rti = getChrPartTypeByNotation(parents[i], self.graph)

                    pnum = chrom_num + parents[i]
                    sta = int(start)
                    sto = int(stop)
                    if pnum is not None and pnum not in mybands.keys():
                        # add the parental band to the hash
                        bnd = {
                            'min': min(sta, sto),
                            'max': max(sta, sto),
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': rti
                        }
                        mybands[pnum] = bnd
                    elif pnum is not None:
                        # band already in the hash means it's a grouping band
                        # need to update the min/max coords
                        bnd = mybands.get(pnum)
                        bnd['min'] = min(sta, sto, bnd['min'])
                        bnd['max'] = max(sta, sto, bnd['max'])
                        mybands[pnum] = bnd

                        # also, set the max for the chrom
                        chrom = mybands.get(chrom_num)
                        chrom['max'] = max(sta, sto, chrom['max'])
                        mybands[chrom_num] = chrom
                    else:
                        LOG.error("pnum is None")
                    # add the parent relationships to each
                    if i < len(parents) - 1:
                        mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                    else:
                        # add the last one (p or q usually)
                        # as attached to the chromosome
                        mybands[pnum]['parent'] = chrom_num

        binreader.close()  # end looping through file

        # loop through the hash and add the bands to the graph
        for bnd in mybands.keys():
            myband = mybands.get(bnd)
            band_class_id = makeChromID(bnd, tax_num, 'CHR')
            band_class_label = makeChromLabel(bnd, genome_label)
            band_build_id = makeChromID(bnd, build_num, 'MONARCH')
            band_build_label = makeChromLabel(bnd, build_num)
            # the build-specific chrom
            chrom_in_build_id = makeChromID(myband['chr'], build_num,
                                            'MONARCH')
            # if it's != part, then add the class
            if myband['type'] != self.globaltt['assembly_component']:
                model.addClassToGraph(band_class_id, band_class_label,
                                      myband['type'])
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   band_class_id)
            else:
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   myband['type'])
                if 'synonym' in myband:
                    model.addSynonym(band_build_id, myband['synonym'])

            if myband['parent'] is None:
                if myband['type'] == self.globaltt['assembly_component']:
                    # since we likely don't know the chr,
                    # add it as a part of the build
                    geno.addParts(band_build_id, build_id)
            elif myband['type'] == self.globaltt['assembly_component']:
                # geno.addParts(band_build_id, chrom_in_build_id)
                parent_chrom_in_build = makeChromID(myband['parent'],
                                                    build_num, 'MONARCH')
                bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

            # add the band as a feature
            # (which also instantiates the owl:Individual)
            bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
            bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
            if 'stain' in myband and myband['stain'] is not None:
                bfeature.addFeatureProperty(
                    self.globaltt['has_sequence_attribute'], myband['stain'])

            # type the band as a faldo:Region directly (add_region=False)
            # bfeature.setNoBNodes(self.nobnodes)
            # to come when we merge in ZFIN.py
            bfeature.addFeatureToGraph(False)

Пример #12

Показать файл

    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if limit is not None:
            LOG.info("Only parsing first %s rows fo each file", str(limit))

        LOG.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            graph = self.testgraph
        else:
            graph = self.graph

        traitmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
        self._process_trait_mappings(traitmap, limit)

        geno = Genotype(graph)
        animals = ['chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

        for common_name in animals:
            txid_num = self.resolve(common_name).split(':')[1]
            taxon_label = self.localtt[common_name]
            taxon_curie = self.globaltt[taxon_label]
            taxon_num = taxon_curie.split(':')[1]
            txid_num = taxon_num  # for now
            taxon_word = taxon_label.replace(' ', '_')
            gene_info_file = '/'.join(
                (self.rawdir, self.files[taxon_word + '_info']['file']))
            self.gene_info = list()
            with gzip.open(gene_info_file, 'rt') as gi_gz:
                filereader = csv.reader(gi_gz, delimiter='\t')
                for row in filereader:
                    if row[0][0] == '#':
                        continue
                    else:
                        self.gene_info.append(str(row[1]))  # tossing lots of good stuff
            LOG.info(
                'Gene Info for %s has %i enteries', common_name, len(self.gene_info))
            # LOG.info('Gene Info entery looks like %s', self.gene_info[5])

            build = None

            fname_bp = common_name + '_bp'
            if fname_bp in self.files:
                bpfile = self.files[fname_bp]['file']
                mch = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', bpfile)
                if mch is None:
                    LOG.error("Can't match a gff build to " + fname_bp)
                else:
                    build = mch.group(1)
                    build_id = self._map_build_by_abbrev(build)
                    LOG.info("Build = %s", build_id)
                    geno.addReferenceGenome(build_id, build, txid_num)
                if build_id is not None:
                    self._process_qtls_genomic_location(
                        '/'.join((self.rawdir, bpfile)), txid_num, build_id, build,
                        common_name, limit)

            fname_cm = common_name + '_cm'
            if fname_cm in self.files:
                cmfile = self.files[fname_cm]['file']
                self._process_qtls_genetic_location(
                    '/'.join((self.rawdir, cmfile)), txid_num, common_name, limit)

        LOG.info("Finished parsing")
        return

Пример #13

Показать файл

    def _get_chrbands(self, limit, taxon):
        """
        :param limit:
        :return:

        """
        model = Model(self.graph)
        # TODO PYLINT figure out what limit was for and why it is unused
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        logger.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)
        monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

        # used to hold band definitions for a chr
        # in order to compute extent of encompasing bands

        mybands = {}
        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:' + taxon

        # add the taxon as a class.  adding the class label elsewhere
        model.addClassToGraph(taxon_id, None)
        model.addSynonym(taxon_id, genome_label)

        geno.addGenome(taxon_id, genome_label)

        # add the build and the taxon it's in
        build_num = self.files[taxon]['build_num']
        build_id = 'UCSC:' + build_num
        geno.addReferenceGenome(build_id, build_num, taxon_id)

        # process the bands
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match('^#', line):
                    continue

                # chr13	4500000	10000000	p12	stalk
                (scaffold, start, stop, band_num, rtype) = line.split('\t')
                line_counter += 1

                # NOTE some less-finished genomes have
                # placed and unplaced scaffolds
                # * Placed scaffolds:
                #       the scaffolds have been placed within a chromosome.
                # * Unlocalized scaffolds:
                #   although the chromosome within which the scaffold occurs
                #   is known, the scaffold's position or orientation
                #   is not known.
                # * Unplaced scaffolds:
                #   it is not known which chromosome the scaffold belongs to
                #
                # find out if the thing is a full on chromosome, or a scaffold:
                # ex: unlocalized scaffold: chr10_KL568008v1_random
                # ex: unplaced scaffold: chrUn_AABR07022428v1
                placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
                unlocalized_scaffold_pattern = placed_scaffold_pattern + r'_(\w+)_random'
                unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)'

                mch = re.match(placed_scaffold_pattern + r'$', scaffold)
                if mch is not None and len(mch.groups()) == 1:
                    # the chromosome is the first match of the pattern
                    chrom_num = mch.group(1)
                else:
                    # skip over anything that isn't a placed_scaffold
                    # at the class level
                    logger.info("Found non-placed chromosome %s", scaffold)
                    chrom_num = None

                m_chr_unloc = re.match(unlocalized_scaffold_pattern, scaffold)
                m_chr_unplaced = re.match(unplaced_scaffold_pattern, scaffold)

                scaffold_num = None
                if mch:
                    pass
                elif m_chr_unloc is not None and len(
                        m_chr_unloc.groups()) == 2:
                    chrom_num = m_chr_unloc.group(1)
                    scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
                elif m_chr_unplaced is not None and len(
                        m_chr_unplaced.groups()) == 1:
                    scaffold_num = m_chr_unplaced.group(1)
                else:
                    logger.error(
                        "There's a chr pattern that we aren't matching: %s",
                        scaffold)

                if chrom_num is not None:
                    # the chrom class (generic) id
                    chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                    # first, add the chromosome class (in the taxon)
                    geno.addChromosomeClass(chrom_num, taxon_id,
                                            self.files[taxon]['genome_label'])

                    # then, add the chromosome instance (from the given build)
                    geno.addChromosomeInstance(chrom_num, build_id, build_num,
                                               chrom_class_id)

                    # add the chr to the hashmap of coordinates for this build
                    # the chromosome coordinate space is itself
                    if chrom_num not in mybands.keys():
                        mybands[chrom_num] = {
                            'min': 0,
                            'max': int(stop),
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': self.globaltt['chromosome']
                        }

                if scaffold_num is not None:
                    # this will put the coordinates of the scaffold
                    # in the scaffold-space and make sure that the scaffold
                    # is part of the correct parent.
                    # if chrom_num is None,
                    # then it will attach it to the genome,
                    # just like a reg chrom
                    mybands[scaffold_num] = {
                        'min': start,
                        'max': stop,
                        'chr': scaffold_num,
                        'ref': build_id,
                        'parent': chrom_num,
                        'stain': None,
                        'type': self.globaltt['assembly_component'],
                        'synonym': scaffold
                    }

                if band_num is not None and band_num.strip() != '':
                    # add the specific band
                    mybands[chrom_num + band_num] = {
                        'min': start,
                        'max': stop,
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': None
                    }

                    # add the staining intensity of the band
                    if re.match(r'g(neg|pos|var)', rtype):
                        mybands[chrom_num +
                                band_num]['stain'] = self.resolve(rtype)

                    # get the parent bands, and make them unique
                    parents = list(monochrom.make_parent_bands(
                        band_num, set()))
                    # alphabetical sort will put them in smallest to biggest,
                    # so we reverse
                    parents.sort(reverse=True)
                    # print('parents of',chrom,band,':',parents)

                    if len(parents) > 0:
                        mybands[chrom_num +
                                band_num]['parent'] = chrom_num + parents[0]
                else:
                    # TODO PYLINT why is 'parent'
                    # a list() a couple of lines up and a set() here?
                    parents = set()

                # loop through the parents and add them to the hash
                # add the parents to the graph, in hierarchical order
                # TODO PYLINT Consider using enumerate
                # instead of iterating with range and len
                for i in range(len(parents)):
                    rti = getChrPartTypeByNotation(parents[i])

                    pnum = chrom_num + parents[i]
                    sta = int(start)
                    sto = int(stop)
                    if pnum not in mybands.keys():
                        # add the parental band to the hash
                        bnd = {
                            'min': min(sta, sto),
                            'max': max(sta, sto),
                            'chr': chrom_num,
                            'ref': build_id,
                            'parent': None,
                            'stain': None,
                            'type': rti
                        }
                        mybands[pnum] = bnd
                    else:
                        # band already in the hash means it's a grouping band
                        # need to update the min/max coords
                        bnd = mybands.get(pnum)
                        bnd['min'] = min(sta, sto, bnd['min'])
                        bnd['max'] = max(sta, sto, bnd['max'])
                        mybands[pnum] = bnd

                        # also, set the max for the chrom
                        chrom = mybands.get(chrom_num)
                        chrom['max'] = max(sta, sto, chrom['max'])
                        mybands[chrom_num] = chrom

                    # add the parent relationships to each
                    if i < len(parents) - 1:
                        mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                    else:
                        # add the last one (p or q usually)
                        # as attached to the chromosome
                        mybands[pnum]['parent'] = chrom_num

        f.close()  # end looping through file

        # loop through the hash and add the bands to the graph
        for bnd in mybands.keys():
            myband = mybands.get(bnd)
            band_class_id = makeChromID(bnd, taxon, 'CHR')
            band_class_label = makeChromLabel(bnd, genome_label)
            band_build_id = makeChromID(bnd, build_num, 'MONARCH')
            band_build_label = makeChromLabel(bnd, build_num)
            # the build-specific chrom
            chrom_in_build_id = makeChromID(myband['chr'], build_num,
                                            'MONARCH')
            # if it's != part, then add the class
            if myband['type'] != self.globaltt['assembly_component']:
                model.addClassToGraph(band_class_id, band_class_label,
                                      myband['type'])
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   band_class_id)
            else:
                bfeature = Feature(self.graph, band_build_id, band_build_label,
                                   myband['type'])
                if 'synonym' in myband:
                    model.addSynonym(band_build_id, myband['synonym'])

            if myband['parent'] is None:
                if myband['type'] == self.globaltt['assembly_component']:
                    # since we likely don't know the chr,
                    # add it as a part of the build
                    geno.addParts(band_build_id, build_id)
            elif myband['type'] == self.globaltt['assembly_component']:
                # geno.addParts(band_build_id, chrom_in_build_id)
                parent_chrom_in_build = makeChromID(myband['parent'],
                                                    build_num, 'MONARCH')
                bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

            # add the band as a feature
            # (which also instantiates the owl:Individual)
            bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
            bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
            if 'stain' in myband and myband['stain'] is not None:
                bfeature.addFeatureProperty(
                    self.globaltt['has_sequence_attribute'], myband['stain'])

            # type the band as a faldo:Region directly (add_region=False)
            # bfeature.setNoBNodes(self.nobnodes)
            # to come when we merge in ZFIN.py
            bfeature.addFeatureToGraph(False)

        return

Пример #14

Показать файл

Файл: AnimalQTLdb.py Проект: d3borah/dipper

    def _process_QTLs_genetic_location(self, raw, taxon_id, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        gu = GraphUtils(curie_map.get())
        eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence

        logger.info("Processing genetic location for %s", taxon_id)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome, position_cm, range_cm,
                 flankmark_a2, flankmark_a1, peak_mark, flankmark_b1, flankmark_b2, exp_id, model, test_base,
                 sig_level, lod_score, ls_mean, p_values, f_statistics, variance, bayes_value, likelihood_ratio,
                 trait_id, dom_effect, add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type, empty2) = row

                if self.testMode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = 'AQTL:'+qtl_id
                trait_id = 'AQTLTrait:'+trait_id

                # Add QTL to graph
                f = Feature(qtl_id, qtl_symbol, geno.genoparts['QTL'])
                f.addTaxonToFeature(g, taxon_id)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

                # add a version of the chromosome which is defined as the genetic map
                build_id = 'MONARCH:'+common_name.strip()+'-linkage'
                build_label = common_name+' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_id)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(chromosome, build_id, build_label, chrom_id)
                start = stop = None
                if re.search('-', range_cm):
                    range_parts = re.split('-', range_cm)
                    # check for poorly formed ranges
                    if len(range_parts) == 2 and range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [int(float(x.strip())) for x in re.split('-', range_cm)]
                    else:
                        logger.info("There's a cM range we can't handle for QTL %s: %s", qtl_id, range_cm)
                elif position_cm != '':
                    start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop when schema can handle floats
                # add in the genetic location based on the range
                f.addFeatureStartLocation(start, chrom_in_build_id, None, [Feature.types['FuzzyPosition']])
                f.addFeatureEndLocation(stop, chrom_in_build_id, None, [Feature.types['FuzzyPosition']])
                f.addFeatureToGraph(g)

                # sometimes there's a peak marker, like a rsid.  we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and re.match('rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:'+peak_mark.strip()

                    gu.addIndividualToGraph(g, dbsnp_id, None, geno.genoparts['sequence_alteration'])
                    gu.addXref(g, qtl_id, dbsnp_id)

                if gene_id is not None and gene_id != '' and gene_id != '.':
                    if gene_id_src == 'NCBIgene' or gene_id_src == '':  # we assume if no src is provided, it's NCBI
                        gene_id = 'NCBIGene:'+gene_id.strip()
                        geno.addGene(gene_id, None)  # we will expect that these labels provided elsewhere
                        geno.addAlleleOfGene(qtl_id, gene_id, geno.object_properties['feature_to_gene_relation'])   # FIXME what is the right relationship here?

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id
                            vl_id = '_' + re.sub(':', '', gene_id) + '-' + peak_mark
                            if self.nobnodes:
                                vl_id = ':' + vl_id
                            geno.addSequenceAlterationToVariantLocus(dbsnp_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)

                # add the trait
                gu.addClassToGraph(g, trait_id, trait_name)

                # Add publication
                r = None
                if re.match('ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:'+pubmed_id.strip()
                    r = Reference(pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:'+pubmed_id.strip()
                    r = Reference(pub_id, Reference.ref_types['journal_article'])

                if r is not None:
                    r.addRefToGraph(g)

                # make the association to the QTL
                assoc = G2PAssoc(self.name, qtl_id, trait_id, gu.object_properties['is_marker_for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    score = float(re.sub('<', '', p_values))
                    assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph(g)

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(self.name, dbsnp_id, trait_id, gu.object_properties['is_marker_for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        score = float(re.sub('<', '', p_values))
                        assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph(g)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        logger.info("Done with QTL genetic info")
        return

Пример #15

Показать файл

Файл: UCSCBands.py Проект: DoctorBud/dipper

    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species
        ucsc_assembly_id_map = {
            "9606": {
                "UCSC:hg38": "NCBIGenome:GRCh38",
                "UCSC:hg19": "NCBIGenome:GRCh37",
                "UCSC:hg18": "NCBIGenome:36.1",
                "UCSC:hg17": "NCBIGenome:35",
                "UCSC:hg16": "NCBIGenome:34",
                "UCSC:hg15": "NCBIGenome:33",
                },
            "7955": {
                "UCSC:danRer10": "NCBIGenome:GRCz10",
                "UCSC:danRer7":	"NCBIGenome:Zv9",
                "UCSC:danRer6": "NCBIGenome:Zv8",
                },
            "10090": {
                "UCSC:mm10": "NCBIGenome:GRCm38",
                "UCSC:mm9":	"NCBIGenome:37"
            },
            "9031": {
                "UCSC:galGal4": "NCBIAssembly:317958",
                },
            "9913": {
                "UCSC:bosTau7": "NCBIAssembly:GCF_000003205.5",
                },
            "9823": {
                "UCSC:susScr3": "NCBIAssembly:304498",
                },
            "9940": {
                "UCSC:oviAri3": "NCBIAssembly:GCF_000298735.1",
                },
            "9796": {
                "UCSC:equCab2": "NCBIAssembly:GCF_000002305.2",
                }
        }
        g = self.graph
        geno = Genotype(g)
        model = Model(g)
        logger.info("Adding equivalent assembly identifiers")
        for sp in ucsc_assembly_id_map:
            tax_num = sp
            tax_id = 'NCBITaxon:'+tax_num
            mappings = ucsc_assembly_id_map[sp]
            for i in mappings:
                ucsc_id = i
                ucsc_label = re.split(':', i)[1]
                mapped_id = mappings[i]
                mapped_label = re.split(':', mapped_id)[1]
                mapped_label = 'NCBI build '+str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return

Пример #16

Показать файл

Файл: ClinVar.py Проект: putmantime/dipper

    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        geno = Genotype(g)
        f = Feature(g, None, None, None)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:' + tax_num
        tax_label = 'Human'
        model.addClassToGraph(tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)", num_cols,
                        expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)), tax_num,
                                'CHR')
                            geno.addChromosomeInstance(cytogenetic_loc,
                                                       build_id, assembly,
                                                       band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)), assembly,
                                'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(str(chr), build_id, assembly,
                                               chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    if re.match(r'^Gene:', gene_num):
                        gene_num = "NCBI" + gene_num
                    else:
                        gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph()
                f.addTaxonToFeature(tax_id)
                # make the ClinVarVariant the clique leader
                model.makeLeader(seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                    model.addIndividualToGraph(dbsnp_id, None)
                    model.addSameIndividual(seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:' + dbvar_num
                    model.addIndividualToGraph(dbvar_id, None)
                    model.addSameIndividual(seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        model.addIndividualToGraph(rcv_id, None)
                        model.addXref(seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    model.addClassToGraph(gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_' + gene_num + '-' + variant_num
                    if self.nobnodes:
                        vl_id = ':' + vl_id
                    vl_label = allele_name
                    model.addIndividualToGraph(vl_id, vl_label,
                                               geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info("No gene listed for variant %d",
                                    int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for phenotype in pheno_list:
                        m = re.match(r"(Orphanet:ORPHA(?:\s*ORPHA)?)",
                                     phenotype)
                        if m is not None and len(m.groups()) > 0:
                            phenotype = re.sub(m.group(1), 'Orphanet:',
                                               phenotype.strip())
                        elif re.match(r'ORPHA:\d+', phenotype):
                            phenotype = re.sub(r'^ORPHA', 'Orphanet',
                                               phenotype.strip())
                        elif re.match(r'Human Phenotype Ontology', phenotype):
                            phenotype = re.sub(r'^Human Phenotype Ontology',
                                               '', phenotype.strip())
                        elif re.match(r'SNOMED CT:\s?', phenotype):
                            phenotype = re.sub(r'SNOMED CT:\s?', 'SNOMED:',
                                               phenotype.strip())
                        elif re.match(r'^Gene:', phenotype):
                            continue

                        assoc = G2PAssoc(g, self.name, seqalt_id,
                                         phenotype.strip())
                        assoc.add_association_to_graph()

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:' + xrefid.split(':')[1]
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Finished parsing variants")

        return

Пример #17

Показать файл

    def _process_qtls_genetic_location(
            self, raw, txid, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:

        """
        if self.testMode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['quantitative trait analysis evidence']

        taxon_curie = 'NCBITaxon:' + txid

        LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (qtl_id,
                 qtl_symbol,
                 trait_name,
                 assotype,
                 empty,
                 chromosome,
                 position_cm,
                 range_cm,
                 flankmark_a2,
                 flankmark_a1,
                 peak_mark,
                 flankmark_b1,
                 flankmark_b2,
                 exp_id,
                 model_id,
                 test_base,
                 sig_level,
                 lod_score,
                 ls_mean,
                 p_values,
                 f_statistics,
                 variance,
                 bayes_value,
                 likelihood_ratio,
                 trait_id, dom_effect,
                 add_effect,
                 pubmed_id,
                 gene_id,
                 gene_id_src,
                 gene_id_type,
                 empty2) = row

                if self.testMode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = common_name + 'QTL:' + qtl_id.strip()
                trait_id = 'AQTLTrait:' + trait_id.strip()

                # Add QTL to graph
                feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
                feature.addTaxonToFeature(taxon_curie)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                # add a version of the chromosome which is defined as
                # the genetic map
                build_id = 'MONARCH:'+common_name.strip()+'-linkage'
                build_label = common_name+' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_curie)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                start = stop = None
                # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
                range_mb = re.split(r'\(', range_cm)
                if range_mb is not None:
                    range_cm = range_mb[0]

                if re.search(r'[0-9].*-.*[0-9]', range_cm):
                    range_parts = re.split(r'-', range_cm)

                    # check for poorly formed ranges
                    if len(range_parts) == 2 and\
                            range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [
                            int(float(x.strip())) for x in re.split(r'-', range_cm)]
                    else:
                        LOG.info(
                            "A cM range we can't handle for QTL %s: %s",
                            qtl_id, range_cm)
                elif position_cm != '':
                    match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                    if match is not None:
                        position_cm = match.group()
                        start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop
                # when schema can handle floats add in the genetic location
                # based on the range
                feature.addFeatureStartLocation(
                    start, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureEndLocation(
                    stop, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureToGraph()

                # sometimes there's a peak marker, like a rsid.
                # we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and \
                        re.match(r'rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:'+peak_mark.strip()

                    model.addIndividualToGraph(
                        dbsnp_id, None,
                        self.globaltt['sequence_alteration'])
                    model.addXref(qtl_id, dbsnp_id)

                gene_id = gene_id.replace('uncharacterized ', '').strip()
                if gene_id is not None and gene_id != '' and gene_id != '.'\
                        and re.fullmatch(r'[^ ]*', gene_id) is not None:

                    # we assume if no src is provided and gene_id is an integer,
                    # then it is an NCBI gene ... (okay, lets crank that back a notch)
                    if gene_id_src == '' and gene_id.isdigit() and \
                            gene_id in self.gene_info:
                        # LOG.info(
                        #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                        #    gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '' and gene_id.isdigit():
                        LOG.warning(
                            'Cold & Prickely saying %s is a NCBI gene for %s',
                            gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '':
                        LOG.error(
                            ' "%s" is a NOT NCBI gene for %s', gene_id, common_name)
                        gene_id_src = None

                    if gene_id_src == 'NCBIgene':
                        gene_id = 'NCBIGene:' + gene_id
                        # we will expect that these will get labels elsewhere
                        geno.addGene(gene_id, None)
                        # FIXME what is the right relationship here?
                        geno.addAffectedLocus(qtl_id, gene_id)

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id
                            vl_id = '_:' + re.sub(
                                r':', '', gene_id) + '-' + peak_mark.strip()
                            geno.addSequenceAlterationToVariantLocus(
                                dbsnp_id, vl_id)
                            geno.addAffectedLocus(vl_id, gene_id)

                # add the trait
                model.addClassToGraph(trait_id, trait_name)

                # Add publication
                reference = None
                if re.match(r'ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:'+pubmed_id.strip()
                    reference = Reference(graph, pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:' + pubmed_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])

                if reference is not None:
                    reference.addRefToGraph()

                # make the association to the QTL
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id, self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    scr = re.sub(r'<', '', p_values)
                    scr = re.sub(r',', '.', scr)  # international notation
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(
                        graph, self.name, dbsnp_id, trait_id,
                        self.globaltt['is marker for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        scr = re.sub(r'<', '', p_values)
                        scr = re.sub(r',', '.', scr)
                        if scr.isnumeric():
                            score = float(scr)
                            assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph()

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        LOG.info("Done with QTL genetic info")
        return

Пример #18

Показать файл

Файл: ClinVar.py Проект: DoctorBud/dipper

    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        geno = Genotype(g)
        f = Feature(g, None, None, None)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:'+tax_num
        tax_label = 'Human'
        model.addClassToGraph(tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)",
                        num_cols, expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                tax_num, 'CHR')
                            geno.addChromosomeInstance(
                                cytogenetic_loc, build_id, assembly, band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                assembly, 'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(
                        str(chr), build_id, assembly, chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    if re.match(r'^Gene:', gene_num):
                        gene_num = "NCBI" + gene_num
                    else:
                        gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph()
                f.addTaxonToFeature(tax_id)
                # make the ClinVarVariant the clique leader
                model.makeLeader(seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                    model.addIndividualToGraph(dbsnp_id, None)
                    model.addSameIndividual(seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:'+dbvar_num
                    model.addIndividualToGraph(dbvar_id, None)
                    model.addSameIndividual(seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        model.addIndividualToGraph(rcv_id, None)
                        model.addXref(seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    model.addClassToGraph(gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_'+gene_num+'-'+variant_num
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    vl_label = allele_name
                    model.addIndividualToGraph(
                        vl_id, vl_label, geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info(
                            "No gene listed for variant %d",
                            int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for phenotype in pheno_list:
                        m = re.match(
                            r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                        if m is not None and len(m.groups()) > 0:
                            phenotype = re.sub(
                                m.group(1), 'Orphanet:', phenotype.strip())
                        elif re.match(r'ORPHA:\d+', phenotype):
                            phenotype = re.sub(
                                r'^ORPHA', 'Orphanet', phenotype.strip())
                        elif re.match(r'Human Phenotype Ontology', phenotype):
                            phenotype = re.sub(
                                r'^Human Phenotype Ontology', '',
                                phenotype.strip())
                        elif re.match(r'SNOMED CT:\s?', phenotype):
                            phenotype = re.sub(
                                r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                        elif re.match(r'^Gene:', phenotype):
                            continue

                        assoc = G2PAssoc(
                            g, self.name, seqalt_id, phenotype.strip())
                        assoc.add_association_to_graph()

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:'+xrefid.split(':')[1]
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Finished parsing variants")

        return

Пример #19

Показать файл

Файл: CGD.py Проект: zzygyx9119/mckb

    def _add_variant_cdna_variant_assoc_to_graph(self, row):
        """
        Generates relationships between variants and cDNA variants
        given a row of data
        :param iterable: row of data, see add_variant_info_to_graph()
                                      docstring for expected structure.
                                      Only applicable for structure 2.
        :return None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(self.graph)
        is_literal = True

        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna,
         cosmic_id, db_snp_id, genome_pos_start, genome_pos_end, ref_base,
         variant_base, primary_transcript_exons,
         primary_transcript_variant_sub_types, variant_type, chromosome,
         genome_build, build_version, build_date) = row

        variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

        # Add gene
        self._add_variant_gene_relationship(variant_id, variant_gene)

        # Transcript reference for nucleotide position
        transcript_curie = self._make_transcript_curie(transcript_id)

        # Make region IDs
        cdna_region_id = ":_{0}Region".format(transcript_curie)
        chrom_region_id = ":_{0}{1}Region-{2}-{3}".format(genome_build,
                                                          chromosome,
                                                          genome_pos_start,
                                                          genome_pos_end)

        # Add the genome build
        genome_label = "Human"
        build_id = "UCSC:{0}".format(genome_build)
        taxon_id = 'NCBITaxon:9606'
        geno.addGenome(taxon_id, genome_label)
        geno.addReferenceGenome(build_id, genome_build, taxon_id)

        # Add chromosome

        chrom_class_id = makeChromID(chromosome, '9606', 'CHR')  # the chrom class (generic) id
        chrom_instance_id = makeChromID(chromosome, build_id, 'MONARCH')

        # first, add the chromosome class (in the taxon)
        geno.addChromosomeClass(chromosome, taxon_id, 'Human')

        # then, add the chromosome instance (from the given build)
        geno.addChromosomeInstance(chromosome, build_id, genome_build, chrom_class_id)

        # Add variant coordinates in reference to chromosome
        self._add_feature_with_coords(variant_id,genome_pos_start,
                                      genome_pos_end, chrom_instance_id, chrom_region_id)

        # Add mutation coordinates in reference to gene
        self._add_feature_with_coords(variant_id, bp_pos,
                                      bp_pos, transcript_curie, cdna_region_id)

        # Add nucleotide mutation
        gu.addTriple(self.graph, variant_id,
                     geno.properties['reference_nucleotide'],
                     ref_base, is_literal)
        gu.addTriple(self.graph, variant_id,
                     geno.properties['altered_nucleotide'],
                     variant_base, is_literal)

        """
        Here we update any internal cgd variant IDS with a cosmic ID
        or dbSNP ID.  Alternatively we could do this using sql rather
        than a sparql update which may be safer
        """
        # Add SNP xrefs
        if cosmic_id is not None:
            cosmic_id_list = cosmic_id.split(', ')
            cosmic_curie_list = []
            for c_id in cosmic_id_list:
                cosmic_curie = re.sub(r'COSM(\d+)', r'COSMIC:\1', c_id)
                cosmic_curie_list.append(cosmic_curie)
                gu.addIndividualToGraph(self.graph, cosmic_curie, c_id,
                                        geno.genoparts['missense_variant'])

            # If there are multiple ids set them equivalent to the first
            for curie in cosmic_curie_list[1:]:
                gu.addSameIndividual(self.graph, cosmic_curie_list[0], curie)

            self._replace_entity(self.graph, variant_id, cosmic_curie_list[0], self.bindings)

        if db_snp_id is not None:
            db_snp_curie = re.sub(r'rs(\d+)', r'dbSNP:\1', db_snp_id)
            gu.addIndividualToGraph(self.graph, db_snp_curie, db_snp_id,
                                    geno.genoparts['missense_variant'])

            if cosmic_id is None:
                self._replace_entity(self.graph, variant_id, db_snp_curie, self.bindings)
            else:
                cosmic_id_list = cosmic_id.split(', ')
                for c_id in cosmic_id_list:
                    cosmic_curie = re.sub(r'COSM(\d+)', r'COSMIC:\1', c_id)
                    gu.addSameIndividual(self.graph, cosmic_curie, db_snp_curie)

        return