Пример #1
0
    def _process_haplotype(self, hap_id, hap_label, chrom_num, chrom_pos,
                           context, risk_allele_frequency, mapped_gene,
                           so_ontology):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency not in ['', 'NR']:
            hap_description = str(
                risk_allele_frequency) + ' [risk allele frequency]'

        model.addIndividualToGraph(hap_id, hap_label.strip(),
                                   self.globaltt['haplotype'], hap_description)
        geno.addTaxon(self.globaltt["H**o sapiens"], hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)

        # Not having four "PAX5" as a list might be better, but it breaks unit tests
        # mapped_genes = list(set(mapped_genes)) # make uniq
        # snp_labels = list(set(snp_labels)) # make uniq

        snp_curies = list()

        for snp in snp_labels:
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                LOG.info('cant find type for SNP in %s', snp)
                # make blank node
                snp_curie = self.make_id(snp, "_")
                model.addLabel(snp_curie, snp)
            elif snp_curie[0] == '_':  # arrived an unlabeled blanknode
                model.addLabel(snp_curie, snp)

            graph.addTriple(hap_id, self.globaltt['has_variant_part'],
                            snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        # check lengths of mutiple lists
        length = len(snp_curies)
        if not all(
                len(lst) == length for lst in
            [snp_labels, chrom_nums, chrom_positions, context_list]):
            LOG.warning(
                "Incongruous data field(s) for haplotype %s \n "
                "will not add snp details", hap_label)
        else:

            variant_in_gene_count = 0
            for index, snp_curie in enumerate(snp_curies):
                self._add_snp_to_graph(snp_curie, snp_labels[index],
                                       chrom_nums[index],
                                       chrom_positions[index],
                                       context_list[index])

                if mapped_genes and len(mapped_genes) != len(snp_labels):
                    LOG.warning("More mapped genes than snps,"
                                " cannot disambiguate for\n%s\n%s",
                                mapped_genes, snp_labels)  # hap_label)
                else:
                    so_class = self.resolve(context_list[index])
                    so_query = """
        SELECT ?variant_label
        WHERE {{
            {0} rdfs:subClassOf+ {1} ;
            rdfs:label ?variant_label .
        }}
                    """.format(so_class, self.globaltt['gene_variant'])

                    query_result = so_ontology.query(so_query)

                    gene_id = DipperUtil.get_hgnc_id_from_symbol(
                        mapped_genes[index])

                    if gene_id is not None and len(list(query_result)) == 1:
                        if context_list[index] in [
                                'upstream_gene_variant',
                                'downstream_gene_variant'
                        ]:
                            graph.addTriple(snp_curie,
                                            self.resolve(context_list[index]),
                                            gene_id)
                        else:
                            geno.addAffectedLocus(snp_curie, gene_id)
                            variant_in_gene_count += 1

            # Seperate in case we want to apply a different relation
            # If not this is redundant with triples added above
            if len(mapped_genes) == variant_in_gene_count and \
                    len(set(mapped_genes)) == 1:
                gene_id = DipperUtil.get_hgnc_id_from_symbol(mapped_genes[0])
                geno.addAffectedLocus(hap_id, gene_id)
Пример #2
0
    def _add_variant_gene_relationship(self, patient_var_map,
                                       gene_coordinate_map):
        """
        Right now it is unclear the best approach on how to connect
        variants to genes.  In most cases has_affected_locus/GENO:0000418
        is accurate; however, there are cases where a variant is in the intron
        on one gene and is purported to causally affect another gene down or
        upstream.  In these cases we must first disambiguate which gene
        is the affected locus, and which gene(s) are predicated to be
        causully influenced by (RO:0002566)

        UPDATE 8-30: In the latest dataset we no longer have 1-many mappings
        between variants and genes, but leaving this here in case we see
        these in the future

        The logic followed here is:
        if mutation type contains downstream/upstream and more than one
        gene of interest, investigate coordinates of all genes to
        see if we can disambiguate which genes are which
        :return: None
        """
        # genotype = Genotype(self.graph)
        dipper_util = DipperUtil()
        model = Model(self.graph)
        # Note this could be compressed in someway to remove one level of for looping
        for patient in patient_var_map:
            for variant_id, variant in patient_var_map[patient].items():
                variant_bnode = self.make_id("{0}".format(variant_id), "_")
                genes_of_interest = variant['genes_of_interest']
                if len(genes_of_interest) == 1:
                    # Assume variant is variant allele of gene
                    gene = genes_of_interest[0]
                    gene_id = dipper_util.get_hgnc_id_from_symbol(gene)
                    self._add_gene_to_graph(
                        gene, variant_bnode, gene_id,
                        self.globaltt['has_affected_feature'])

                elif re.search(r'upstream|downstream',
                               variant['type'],
                               flags=re.I):
                    # Attempt to disambiguate
                    ref_gene = []
                    up_down_gene = []
                    unmatched_genes = []
                    for gene in variant['genes_of_interest']:
                        if gene_id and gene_id != '' and gene_id in gene_coordinate_map:
                            if gene_coordinate_map[gene_id]['start'] \
                                    <= variant['position']\
                                    <= gene_coordinate_map[gene_id]['end']:
                                gene_info = {
                                    'symbol':
                                    gene,
                                    'strand':
                                    gene_coordinate_map[gene_id]['strand']
                                }
                                ref_gene.append(gene_info)
                            else:
                                up_down_gene.append(gene)
                        else:
                            unmatched_genes.append(gene)
                    if len(ref_gene) == 1:
                        self._add_gene_to_graph(
                            ref_gene[0]['symbol'], variant_bnode, gene_id,
                            self.globaltt['has_affected_feature'])

                        # update label with gene
                        gene_list = [ref_gene[0]['symbol']
                                     ]  # build label expects list
                        variant_label = self._build_variant_label(
                            variant['build'], variant['chromosome'],
                            variant['position'], variant['reference_allele'],
                            variant['variant_allele'], gene_list)
                        model.addLabel(variant_bnode, variant_label)

                    # In some cases there are multiple instances
                    # of same gene from dupe rows in the source
                    # Credit http://stackoverflow.com/a/3844832
                    elif len(ref_gene) > 0 and ref_gene[1:] == ref_gene[:-1]:
                        self._add_gene_to_graph(
                            ref_gene[0]['symbol'], variant_bnode, gene_id,
                            self.globaltt['has_affected_feature'])

                        # build label function expects list
                        gene_list = [ref_gene[0]['symbol']]
                        variant_label = self._build_variant_label(
                            variant['build'], variant['chromosome'],
                            variant['position'], variant['reference_allele'],
                            variant['variant_allele'], gene_list)
                        model.addLabel(variant_bnode, variant_label)

                    # Check if reference genes are on different strands
                    elif len(ref_gene) == 2:
                        strands = [st['strand'] for st in ref_gene]
                        if "minus" in strands and "plus" in strands:
                            for r_gene in ref_gene:
                                self._add_gene_to_graph(
                                    r_gene['symbol'], variant_bnode, gene_id,
                                    self.globaltt['has_affected_feature'])
                        else:
                            LOG.warning(
                                "unable to map intron variant to gene coordinates: %s",
                                variant)
                            for r_gene in ref_gene:
                                self._add_gene_to_graph(
                                    r_gene['symbol'], variant_bnode, gene_id,
                                    self.globaltt['causally_influences'])
                    elif re.search(r'intron', variant['type'], flags=re.I):
                        LOG.warning(
                            "unable to map intron variant to gene coordinates_2: %s",
                            variant)
                    for neighbor in up_down_gene:
                        self._add_gene_to_graph(
                            neighbor, variant_bnode, gene_id,
                            self.globaltt['causally_influences'])
                    # Unmatched genes are likely because we cannot map to an NCBIGene
                    # or we do not have coordinate information
                    for unmatched_gene in unmatched_genes:
                        self._add_gene_to_graph(
                            unmatched_gene, variant_bnode, gene_id,
                            self.globaltt['causally_influences'])

        return