def _add_variant_gene_relationship(self, patient_var_map, gene_coordinate_map):
    """
    Link every patient variant to its gene(s) of interest.

    Right now it is unclear the best approach on how to connect
    variants to genes.  In most cases has_affected_locus/GENO:0000418
    is accurate; however, there are cases where a variant is in the
    intron of one gene and is purported to causally affect another
    gene down or upstream.  In these cases we must first disambiguate
    which gene is the affected locus, and which gene(s) are predicted
    to be causally influenced by (RO:0002566)

    UPDATE 8-30: In the latest dataset we no longer have 1-many mappings
    between variants and genes, but leaving this here in case we see
    these in the future

    The logic followed here is:
    if mutation type contains downstream/upstream and more than one
    gene of interest, investigate coordinates of all genes to
    see if we can disambiguate which genes are which

    :param patient_var_map: mapping of patient -> {variant_id: variant};
        each variant is a mapping read here for the keys
        'genes_of_interest', 'type', 'position', 'build', 'chromosome',
        'reference_allele' and 'variant_allele'
    :param gene_coordinate_map: mapping of gene id ->
        {'start': ..., 'end': ..., 'strand': ...}
    :return: None
    """
    dipper_util = DipperUtil()
    model = Model(self.graph)

    for patient in patient_var_map:
        for variant_id, variant in patient_var_map[patient].items():
            variant_bnode = self.make_id("{0}".format(variant_id), "_")
            genes_of_interest = variant['genes_of_interest']

            if len(genes_of_interest) == 1:
                # Assume variant is variant allele of gene
                gene = genes_of_interest[0]
                gene_id = dipper_util.get_ncbi_id_from_symbol(gene)
                self._add_gene_to_graph(
                    gene, variant_bnode, gene_id,
                    self.globaltt['has_affected_feature'])
                continue

            if not re.search(
                    r'upstream|downstream', variant['type'], flags=re.I):
                continue

            # Attempt to disambiguate.  Every bucket keeps the id that was
            # resolved for *that* gene, so the relationships added below
            # never reuse an id left over from a different gene or variant.
            ref_gene = []         # gene span contains the variant position
            up_down_gene = []     # has coordinates, variant lies outside
            unmatched_genes = []  # no id resolved or no coordinates known
            for gene in genes_of_interest:
                gene_id = dipper_util.get_ncbi_id_from_symbol(gene)
                if gene_id and gene_id in gene_coordinate_map:
                    coords = gene_coordinate_map[gene_id]
                    if coords['start'] <= variant['position'] <= coords['end']:
                        ref_gene.append({
                            'symbol': gene,
                            'strand': coords['strand'],
                            'gene_id': gene_id,
                        })
                    else:
                        up_down_gene.append((gene, gene_id))
                else:
                    unmatched_genes.append((gene, gene_id))

            # In some cases there are multiple instances of the same gene
            # from dupe rows in the source; "all entries equal" also covers
            # the plain single-entry case.
            # Credit http://stackoverflow.com/a/3844832
            if ref_gene and ref_gene[1:] == ref_gene[:-1]:
                affected = ref_gene[0]
                self._add_gene_to_graph(
                    affected['symbol'], variant_bnode, affected['gene_id'],
                    self.globaltt['has_affected_feature'])
                # update label with gene; build label expects a list
                variant_label = self._build_variant_label(
                    variant['build'], variant['chromosome'],
                    variant['position'], variant['reference_allele'],
                    variant['variant_allele'], [affected['symbol']])
                model.addLabel(variant_bnode, variant_label)
            elif len(ref_gene) == 2:
                # Check if reference genes are on different strands
                strands = [entry['strand'] for entry in ref_gene]
                if "minus" in strands and "plus" in strands:
                    relation = self.globaltt['has_affected_feature']
                else:
                    logger.warning(
                        "unable to map intron variant to gene coordinates: %s",
                        variant)
                    relation = self.globaltt['causally_influences']
                for entry in ref_gene:
                    self._add_gene_to_graph(
                        entry['symbol'], variant_bnode, entry['gene_id'],
                        relation)
            elif re.search(r'intron', variant['type'], flags=re.I):
                logger.warning(
                    "unable to map intron variant to gene coordinates: %s",
                    variant)

            for neighbor, neighbor_id in up_down_gene:
                self._add_gene_to_graph(
                    neighbor, variant_bnode, neighbor_id,
                    self.globaltt['causally_influences'])

            # Unmatched genes are likely because we cannot map to an NCBIGene
            # or we do not have coordinate information
            for unmatched_gene, unmatched_id in unmatched_genes:
                self._add_gene_to_graph(
                    unmatched_gene, variant_bnode, unmatched_id,
                    self.globaltt['causally_influences'])
def _add_variant_gene_relationship(self, patient_var_map, gene_coordinate_map):
    """
    Connect each variant of each patient to the gene(s) it is reported with.

    Right now it is unclear the best approach on how to connect
    variants to genes.  In most cases has_affected_locus/GENO:0000418
    is accurate; however, there are cases where a variant is in the
    intron of one gene and is purported to causally affect another
    gene down or upstream.  In these cases we must first disambiguate
    which gene is the affected locus, and which gene(s) are predicted
    to be causally influenced by (RO:0002566)

    UPDATE 8-30: In the latest dataset we no longer have 1-many mappings
    between variants and genes, but leaving this here in case we see
    these in the future

    The logic followed here is:
    if mutation type contains downstream/upstream and more than one
    gene of interest, investigate coordinates of all genes to
    see if we can disambiguate which genes are which

    :param patient_var_map: mapping of patient -> {variant_id: variant};
        the variant keys read here are 'genes_of_interest', 'type',
        'position', 'build', 'chromosome', 'reference_allele' and
        'variant_allele'
    :param gene_coordinate_map: mapping of gene id -> mapping with
        'start', 'end' and 'strand'
    :return: None
    """
    dipper_util = DipperUtil()
    model = Model(self.graph)

    for patient in patient_var_map:
        for variant_id, variant in patient_var_map[patient].items():
            variant_bnode = self.make_id("{0}".format(variant_id), "_")
            genes_of_interest = variant['genes_of_interest']

            if len(genes_of_interest) == 1:
                # Assume variant is variant allele of gene
                gene = genes_of_interest[0]
                gene_id = dipper_util.get_ncbi_id_from_symbol(gene)
                self._add_gene_to_graph(
                    gene, variant_bnode, gene_id,
                    self.globaltt['has_affected_feature'])

            elif re.search(r'upstream|downstream', variant['type'], flags=re.I):
                # Attempt to disambiguate.  The id is resolved per gene and
                # stored with it; previously the loop tested an id that was
                # never assigned in this branch.
                ref_gene = []
                up_down_gene = []
                unmatched_genes = []
                for gene in genes_of_interest:
                    gene_id = dipper_util.get_ncbi_id_from_symbol(gene)
                    if not gene_id or gene_id not in gene_coordinate_map:
                        # no id resolved, or no coordinates for that id
                        unmatched_genes.append((gene, gene_id))
                        continue
                    span = gene_coordinate_map[gene_id]
                    if span['start'] <= variant['position'] <= span['end']:
                        ref_gene.append({
                            'symbol': gene,
                            'strand': span['strand'],
                            'gene_id': gene_id,
                        })
                    else:
                        up_down_gene.append((gene, gene_id))

                # A single reference gene, or several identical entries
                # coming from dupe rows in the source
                # Credit http://stackoverflow.com/a/3844832
                if ref_gene and ref_gene[1:] == ref_gene[:-1]:
                    locus = ref_gene[0]
                    self._add_gene_to_graph(
                        locus['symbol'], variant_bnode, locus['gene_id'],
                        self.globaltt['has_affected_feature'])
                    # build label function expects list
                    gene_list = [locus['symbol']]
                    variant_label = self._build_variant_label(
                        variant['build'], variant['chromosome'],
                        variant['position'], variant['reference_allele'],
                        variant['variant_allele'], gene_list)
                    model.addLabel(variant_bnode, variant_label)

                # Check if reference genes are on different strands
                elif len(ref_gene) == 2:
                    strands = [st['strand'] for st in ref_gene]
                    if "minus" in strands and "plus" in strands:
                        relation = self.globaltt['has_affected_feature']
                    else:
                        LOG.warning(
                            "unable to map intron variant to gene coordinates: %s",
                            variant)
                        relation = self.globaltt['causally_influences']
                    for r_gene in ref_gene:
                        self._add_gene_to_graph(
                            r_gene['symbol'], variant_bnode,
                            r_gene['gene_id'], relation)

                elif re.search(r'intron', variant['type'], flags=re.I):
                    LOG.warning(
                        "unable to map intron variant to gene coordinates_2: %s",
                        variant)

                for neighbor, neighbor_id in up_down_gene:
                    self._add_gene_to_graph(
                        neighbor, variant_bnode, neighbor_id,
                        self.globaltt['causally_influences'])

                # Unmatched genes are likely because we cannot map to an
                # NCBIGene or we do not have coordinate information
                for unmatched_gene, unmatched_id in unmatched_genes:
                    self._add_gene_to_graph(
                        unmatched_gene, variant_bnode, unmatched_id,
                        self.globaltt['causally_influences'])
def _process_haplotype(
        self, hap_id, hap_label, chrom_num, chrom_pos, context,
        risk_allele_frequency, mapped_gene, so_ontology):
    """
    Add a haplotype, its member SNPs and their gene relationships to the
    graph.

    ``hap_label``, ``chrom_num``, ``chrom_pos``, ``context`` and
    ``mapped_gene`` are ';'-delimited strings that are split and then
    read position by position, one entry per SNP.

    :param hap_id: identifier of the haplotype node
    :param hap_label: ';'-delimited SNP labels; also used (stripped) as
        the haplotype label
    :param chrom_num: ';'-delimited chromosome per SNP
    :param chrom_pos: ';'-delimited position per SNP
    :param context: ';'-delimited variant context per SNP
    :param risk_allele_frequency: added to the haplotype description
        unless it is '' or 'NR'
    :param mapped_gene: ';'-delimited gene symbol per SNP
    :param so_ontology: object whose ``query`` method accepts the SPARQL
        string built here and returns an iterable result
        (presumably an rdflib graph of SO -- confirm against caller)
    :raises ValueError: if a context cannot be mapped to an SO class
    :return: None
    """
    tax_id = 'NCBITaxon:9606'
    g = self.testgraph if self.testMode else self.graph
    geno = Genotype(g)
    model = Model(g)

    # add the feature to the graph
    hap_description = None
    if risk_allele_frequency != '' and risk_allele_frequency != 'NR':
        hap_description = \
            str(risk_allele_frequency) + ' [risk allele frequency]'
    model.addIndividualToGraph(
        hap_id, hap_label.strip(), Feature.types['haplotype'],
        hap_description)
    geno.addTaxon(tax_id, hap_id)

    snp_labels = re.split(r';\s?', hap_label)
    chrom_nums = re.split(r';\s?', chrom_num)
    chrom_positions = re.split(r';\s?', chrom_pos)
    context_list = re.split(r';\s?', context)
    mapped_genes = re.split(r';\s?', mapped_gene)

    snp_curies = []
    for snp in snp_labels:
        snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
        if snp_type is None:
            # make blank node
            snp_curie = self.make_id(snp, "_")
        g.addTriple(
            hap_id, geno.object_properties['has_variant_part'], snp_curie)
        snp_curies.append(snp_curie)

    # courtesy http://stackoverflow.com/a/16720915
    length = len(snp_labels)
    if not all(len(lst) == length
               for lst in (chrom_nums, chrom_positions, context_list)):
        logger.warning(
            "Unexpected data field for haplotype %s \n "
            "will not add snp details", hap_label)
        return

    # Genes can only be paired with SNPs when there is one gene per SNP.
    # The check does not depend on the SNP, so do it (and warn) once.
    genes_align = len(mapped_genes) == len(snp_labels)
    if not genes_align:
        logger.warning(
            "More mapped genes than snps, "
            "cannot disambiguate for %s", hap_label)

    # contexts that add a positional relation between the SNP and the gene
    flank_relations = {
        'upstream_gene_variant': 'upstream_of_sequence_of',
        'downstream_gene_variant': 'downstream_of_sequence_of',
    }

    variant_in_gene_count = 0
    for index, snp_curie in enumerate(snp_curies):
        snp_context = context_list[index]
        self._add_snp_to_graph(
            snp_curie, snp_labels[index], chrom_nums[index],
            chrom_positions[index], snp_context)

        if not genes_align:
            continue

        so_class = self._map_variant_type(snp_context)
        if so_class is None:
            raise ValueError(
                "Unknown SO class {} in haplotype {}"
                .format(snp_context, hap_label))

        so_query = """
            SELECT ?variant_label
            WHERE {{
                {0} rdfs:subClassOf+ SO:0001564 ;
                rdfs:label ?variant_label .
            }}
        """.format(so_class)
        in_gene = len(list(so_ontology.query(so_query))) > 0
        flank_key = flank_relations.get(snp_context)

        if not in_gene and flank_key is None:
            # nothing to attach; skip resolving the gene symbol entirely
            continue

        # resolve the symbol once and reuse it for every triple below
        gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[index])
        if gene_id is None:
            continue

        if in_gene:
            geno.addAffectedLocus(snp_curie, gene_id)
            geno.addAffectedLocus(hap_id, gene_id)
            variant_in_gene_count += 1
        if flank_key is not None:
            g.addTriple(
                snp_curie, Feature.object_properties[flank_key], gene_id)

    # Separate in case we want to apply a different relation
    # If not this is redundant with triples added above
    if len(mapped_genes) == variant_in_gene_count \
            and len(set(mapped_genes)) == 1:
        gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0])
        geno.addAffectedLocus(hap_id, gene_id)
def _process_haplotype(
        self, hap_id, hap_label, chrom_num, chrom_pos, context,
        risk_allele_frequency, mapped_gene, so_ontology):
    """
    Add a haplotype, its member SNPs and their gene relationships to the
    graph.

    ``hap_label``, ``chrom_num``, ``chrom_pos``, ``context`` and
    ``mapped_gene`` are ';'-delimited strings that are split and then
    read position by position, one entry per SNP.

    :param hap_id: identifier of the haplotype node
    :param hap_label: ';'-delimited SNP labels; also used (stripped) as
        the haplotype label
    :param chrom_num: ';'-delimited chromosome per SNP
    :param chrom_pos: ';'-delimited position per SNP
    :param context: ';'-delimited variant context per SNP; each entry is
        passed through ``self.resolve``
    :param risk_allele_frequency: added to the haplotype description
        unless it is '' or 'NR'
    :param mapped_gene: ';'-delimited gene symbol per SNP
    :param so_ontology: object whose ``query`` method accepts the SPARQL
        string built here and returns an iterable result
        (presumably an rdflib graph of SO -- confirm against caller)
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    # add the feature to the graph
    hap_description = None
    if risk_allele_frequency != '' and risk_allele_frequency != 'NR':
        hap_description = str(risk_allele_frequency) + ' [risk allele frequency]'

    model.addIndividualToGraph(
        hap_id, hap_label.strip(), self.globaltt['haplotype'], hap_description)
    # The key had been mangled to "H**o sapiens", which cannot be a label
    # in the translation table; restored to the taxon label.
    geno.addTaxon(self.globaltt["Homo sapiens"], hap_id)

    snp_labels = re.split(r';\s?', hap_label)
    chrom_nums = re.split(r';\s?', chrom_num)
    chrom_positions = re.split(r';\s?', chrom_pos)
    context_list = re.split(r';\s?', context)
    mapped_genes = re.split(r';\s?', mapped_gene)

    snp_curies = []
    for snp in snp_labels:
        snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
        if snp_type is None:
            # make blank node
            snp_curie = self.make_id(snp, "_")
        graph.addTriple(hap_id, self.globaltt['has_variant_part'], snp_curie)
        snp_curies.append(snp_curie)

    # courtesy http://stackoverflow.com/a/16720915
    length = len(snp_labels)
    if not all(len(lst) == length
               for lst in (chrom_nums, chrom_positions, context_list)):
        LOG.warning(
            "Unexpected data field for haplotype %s \n "
            "will not add snp details", hap_label)
        return

    # Genes can only be paired with SNPs when there is one gene per SNP.
    # The check does not depend on the SNP, so do it (and warn) once.
    genes_align = len(mapped_genes) == len(snp_labels)
    if not genes_align:
        LOG.warning(
            "More mapped genes than snps, cannot disambiguate for %s",
            hap_label)

    variant_in_gene_count = 0
    for index, snp_curie in enumerate(snp_curies):
        self._add_snp_to_graph(
            snp_curie, snp_labels[index], chrom_nums[index],
            chrom_positions[index], context_list[index])

        if not genes_align:
            continue

        so_class = self.resolve(context_list[index])
        # removed the '+' for recursive one-or-more rdfs:subClassOf paths
        # just so it did not return an empty graph
        so_query = """
            SELECT ?variant_label
            WHERE {{
                {0} rdfs:subClassOf {1} ;
                rdfs:label ?variant_label .
            }}
        """.format(so_class, self.globaltt['gene_variant'])
        in_gene = len(list(so_ontology.query(so_query))) == 1

        # resolve the symbol once and reuse it for every triple below
        gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[index])
        if gene_id is None:
            continue

        if in_gene:
            geno.addAffectedLocus(snp_curie, gene_id)
            geno.addAffectedLocus(hap_id, gene_id)
            variant_in_gene_count += 1
        # the resolved context doubles as the SNP -> gene predicate
        graph.addTriple(snp_curie, so_class, gene_id)

    # Separate in case we want to apply a different relation
    # If not this is redundant with triples added above
    if len(mapped_genes) == variant_in_gene_count and len(set(mapped_genes)) == 1:
        gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0])
        geno.addAffectedLocus(hap_id, gene_id)
def _process_haplotype(self, hap_id, hap_label, chrom_num, chrom_pos, context,
                       risk_allele_frequency, mapped_gene, so_ontology):
    """
    Record one haplotype and the SNPs that make it up.

    The label, chromosome, position, context and mapped-gene arguments
    are ';'-delimited strings; after splitting, entry ``i`` of each list
    describes SNP ``i``.

    :param hap_id: identifier of the haplotype node
    :param hap_label: ';'-delimited SNP labels; the stripped string is
        also the haplotype label
    :param chrom_num: ';'-delimited chromosome per SNP
    :param chrom_pos: ';'-delimited position per SNP
    :param context: ';'-delimited variant context per SNP; each entry is
        passed through ``self.resolve``
    :param risk_allele_frequency: becomes the haplotype description
        unless it is '' or 'NR'
    :param mapped_gene: ';'-delimited gene symbol per SNP
    :param so_ontology: object with a ``query`` method taking the SPARQL
        string built here and returning an iterable result
        (presumably an rdflib graph of SO -- confirm against caller)
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    geno = Genotype(graph)
    model = Model(graph)

    # add the feature to the graph
    if risk_allele_frequency != '' and risk_allele_frequency != 'NR':
        hap_description = str(
            risk_allele_frequency) + ' [risk allele frequency]'
    else:
        hap_description = None
    model.addIndividualToGraph(hap_id, hap_label.strip(),
                               self.globaltt['haplotype'], hap_description)
    # "H**o sapiens" was a mangled form of the taxon label and is not a
    # usable translation-table key; restored here.
    geno.addTaxon(self.globaltt["Homo sapiens"], hap_id)

    snp_labels = re.split(r';\s?', hap_label)
    chrom_nums = re.split(r';\s?', chrom_num)
    chrom_positions = re.split(r';\s?', chrom_pos)
    context_list = re.split(r';\s?', context)
    mapped_genes = re.split(r';\s?', mapped_gene)

    snp_curies = []
    for snp in snp_labels:
        snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
        if snp_type is None:
            # make blank node
            snp_curie = self.make_id(snp, "_")
        graph.addTriple(hap_id, self.globaltt['has_variant_part'],
                        snp_curie)
        snp_curies.append(snp_curie)

    # courtesy http://stackoverflow.com/a/16720915
    expected = len(snp_labels)
    per_snp_fields = (chrom_nums, chrom_positions, context_list)
    if any(len(field) != expected for field in per_snp_fields):
        LOG.warning(
            "Unexpected data field for haplotype %s \n "
            "will not add snp details", hap_label)
        return

    # One gene per SNP is required to pair them up; this does not vary
    # per SNP, so it is decided (and reported) a single time.
    one_gene_per_snp = len(mapped_genes) == expected
    if not one_gene_per_snp:
        LOG.warning(
            "More mapped genes than snps, cannot disambiguate for %s",
            hap_label)

    variant_in_gene_count = 0
    for index, snp_curie in enumerate(snp_curies):
        snp_context = context_list[index]
        self._add_snp_to_graph(snp_curie, snp_labels[index],
                               chrom_nums[index], chrom_positions[index],
                               snp_context)
        if not one_gene_per_snp:
            continue

        so_class = self.resolve(snp_context)
        # removed the '+' for recursive one-or-more rdfs:subClassOf paths
        # just so it did not return an empty graph
        so_query = """
            SELECT ?variant_label
            WHERE {{
                {0} rdfs:subClassOf {1} ;
                rdfs:label ?variant_label .
            }}
        """.format(so_class, self.globaltt['gene_variant'])
        query_result = so_ontology.query(so_query)
        is_gene_variant = len(list(query_result)) == 1

        # a single lookup serves both the affected-locus triples and the
        # context triple
        gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[index])
        if gene_id is not None:
            if is_gene_variant:
                geno.addAffectedLocus(snp_curie, gene_id)
                geno.addAffectedLocus(hap_id, gene_id)
                variant_in_gene_count += 1
            # the resolved context is also the SNP -> gene predicate
            graph.addTriple(snp_curie, so_class, gene_id)

    # Separate in case we want to apply a different relation
    # If not this is redundant with triples added above
    if len(mapped_genes) == variant_in_gene_count and len(
            set(mapped_genes)) == 1:
        gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0])
        geno.addAffectedLocus(hap_id, gene_id)