def _process_haplotype(
        self, hap_id, hap_label, chrom_num, chrom_pos, context,
        risk_allele_frequency, mapped_gene, so_ontology):
    """
    Add a haplotype and its component SNPs to the graph.

    The haplotype is added as an individual of type haplotype and given
    the human taxon.  ``hap_label``, ``chrom_num``, ``chrom_pos``,
    ``context`` and ``mapped_gene`` are semicolon-delimited strings that
    are split into parallel lists, one entry per SNP.

    Every SNP is linked to the haplotype with ``has_variant_part``.  SNP
    details and SNP-to-gene relations are only added when the chromosome,
    position and context lists have one entry per SNP; gene relations
    additionally require one mapped gene per SNP.

    :param hap_id: identifier of the haplotype node
    :param hap_label: semicolon-delimited SNP labels; also used
        (stripped) as the haplotype label
    :param chrom_num: semicolon-delimited chromosomes, one per SNP
    :param chrom_pos: semicolon-delimited positions, one per SNP
    :param context: semicolon-delimited variant context terms, one per SNP
    :param risk_allele_frequency: used as the haplotype description
        unless it is ``''`` or ``'NR'``
    :param mapped_gene: semicolon-delimited gene symbols
    :param so_ontology: object exposing ``query(sparql)``; used to check
        whether a context term is a subclass of gene_variant
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    # add the feature to the graph
    hap_description = None
    if risk_allele_frequency not in ('', 'NR'):
        hap_description = (
            str(risk_allele_frequency) + ' [risk allele frequency]')

    model.addIndividualToGraph(
        hap_id, hap_label.strip(), self.globaltt['haplotype'],
        hap_description)
    geno.addTaxon(self.globaltt['Homo sapiens'], hap_id)

    snp_labels = re.split(r';\s?', hap_label)
    chrom_nums = re.split(r';\s?', chrom_num)
    chrom_positions = re.split(r';\s?', chrom_pos)
    context_list = re.split(r';\s?', context)
    # Duplicates (e.g. four "PAX5") are deliberately kept: the lists are
    # matched by position and the unit tests rely on the repeated entries.
    mapped_genes = re.split(r';\s?', mapped_gene)

    snp_curies = []
    for snp in snp_labels:
        snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
        if snp_type is None:
            LOG.info('cant find type for SNP in %s', snp)
            # make blank node
            snp_curie = self.make_id(snp, "_")
            model.addLabel(snp_curie, snp)
        elif snp_curie.startswith('_'):
            # arrived an unlabeled blank node
            model.addLabel(snp_curie, snp)

        graph.addTriple(
            hap_id, self.globaltt['has_variant_part'], snp_curie)
        snp_curies.append(snp_curie)

    # The per-SNP fields are matched by index, so they must all have one
    # entry per SNP (snp_labels and snp_curies always agree in length).
    snp_count = len(snp_curies)
    if any(len(lst) != snp_count
           for lst in (chrom_nums, chrom_positions, context_list)):
        LOG.warning(
            "Incongruous data field(s) for haplotype %s \n "
            "will not add snp details", hap_label)
        return

    # re.split never returns an empty list, so a plain length comparison
    # is enough.  The check is loop-invariant: warn once, not once per SNP.
    genes_align = len(mapped_genes) == len(snp_labels)
    if not genes_align:
        LOG.warning(
            "More mapped genes than snps,"
            " cannot disambiguate for\n%s\n%s", mapped_genes, snp_labels)

    variant_in_gene_count = 0
    for index, snp_curie in enumerate(snp_curies):
        snp_context = context_list[index]
        self._add_snp_to_graph(
            snp_curie, snp_labels[index], chrom_nums[index],
            chrom_positions[index], snp_context)

        if not genes_align:
            continue

        so_class = self.resolve(snp_context)
        so_query = (
            " SELECT ?variant_label WHERE {{ {0} rdfs:subClassOf+ {1} ; "
            "rdfs:label ?variant_label . }} "
        ).format(so_class, self.globaltt['gene_variant'])
        query_result = so_ontology.query(so_query)

        gene_id = DipperUtil.get_hgnc_id_from_symbol(mapped_genes[index])
        if gene_id is None or len(list(query_result)) != 1:
            continue

        if snp_context in (
                'upstream_gene_variant', 'downstream_gene_variant'):
            # The context term itself is used as the predicate.
            graph.addTriple(snp_curie, so_class, gene_id)
        else:
            geno.addAffectedLocus(snp_curie, gene_id)
            variant_in_gene_count += 1

    # Separate in case we want to apply a different relation.
    # If not, this is redundant with the triples added above.
    if len(mapped_genes) == variant_in_gene_count and \
            len(set(mapped_genes)) == 1:
        gene_id = DipperUtil.get_hgnc_id_from_symbol(mapped_genes[0])
        geno.addAffectedLocus(hap_id, gene_id)
def _add_variant_gene_relationship(self, patient_var_map, gene_coordinate_map):
    """
    Connect each patient variant to its gene(s) of interest.

    Right now it is unclear what the best approach is for connecting
    variants to genes.  In most cases has_affected_locus/GENO:0000418
    is accurate; however, there are cases where a variant is in the
    intron of one gene and is purported to causally affect another gene
    down or upstream.  In these cases we must first disambiguate which
    gene is the affected locus, and which gene(s) are predicted to be
    causally influenced by (RO:0002566) the variant.

    UPDATE 8-30: In the latest dataset we no longer have 1-many mappings
    between variants and genes, but this is left in place in case we see
    them in the future.

    The logic followed here is:
    if the variant has exactly one gene of interest, relate it with
    has_affected_feature; otherwise, if the mutation type contains
    downstream/upstream, compare the variant position with the
    coordinates of every gene of interest to work out which gene
    contains the variant and which are neighbours.

    :param patient_var_map: mapping of patient to a mapping of
        variant id to variant dict; the variant dict is read for the keys
        'genes_of_interest', 'type', 'position', 'build', 'chromosome',
        'reference_allele' and 'variant_allele'
    :param gene_coordinate_map: mapping of gene id to a dict with the
        keys 'start', 'end' and 'strand'
    :return: None
    """
    dipper_util = DipperUtil()
    model = Model(self.graph)

    for variants in patient_var_map.values():
        for variant_id, variant in variants.items():
            variant_bnode = self.make_id("{0}".format(variant_id), "_")
            genes_of_interest = variant['genes_of_interest']

            if len(genes_of_interest) == 1:
                # Assume variant is variant allele of gene
                gene = genes_of_interest[0]
                gene_id = dipper_util.get_hgnc_id_from_symbol(gene)
                self._add_gene_to_graph(
                    gene, variant_bnode, gene_id,
                    self.globaltt['has_affected_feature'])
                continue

            if not re.search(
                    r'upstream|downstream', variant['type'], flags=re.I):
                continue

            # Attempt to disambiguate.  Every gene keeps its own id so
            # that the triples added below point at the right gene.
            ref_genes = []        # genes whose span contains the variant
            up_down_genes = []    # (symbol, id) with coordinates elsewhere
            unmatched_genes = []  # (symbol, id) without usable coordinates
            for gene in genes_of_interest:
                gene_id = dipper_util.get_hgnc_id_from_symbol(gene)
                if not gene_id or gene_id not in gene_coordinate_map:
                    unmatched_genes.append((gene, gene_id))
                    continue
                coords = gene_coordinate_map[gene_id]
                if coords['start'] <= variant['position'] <= coords['end']:
                    ref_genes.append({
                        'symbol': gene,
                        'strand': coords['strand'],
                        'id': gene_id,
                    })
                else:
                    up_down_genes.append((gene, gene_id))

            # A single reference gene, or several identical entries from
            # duplicated source rows (for one element both slices are
            # empty, so the comparison also covers that case).
            # Credit http://stackoverflow.com/a/3844832
            if ref_genes and ref_genes[1:] == ref_genes[:-1]:
                hit = ref_genes[0]
                self._add_gene_to_graph(
                    hit['symbol'], variant_bnode, hit['id'],
                    self.globaltt['has_affected_feature'])
                # update label with gene; build label expects a list
                variant_label = self._build_variant_label(
                    variant['build'], variant['chromosome'],
                    variant['position'], variant['reference_allele'],
                    variant['variant_allele'], [hit['symbol']])
                model.addLabel(variant_bnode, variant_label)

            # Check if reference genes are on different strands
            elif len(ref_genes) == 2:
                strands = [ref['strand'] for ref in ref_genes]
                if "minus" in strands and "plus" in strands:
                    for ref in ref_genes:
                        self._add_gene_to_graph(
                            ref['symbol'], variant_bnode, ref['id'],
                            self.globaltt['has_affected_feature'])
                else:
                    LOG.warning(
                        "unable to map intron variant to gene coordinates: %s",
                        variant)
                    for ref in ref_genes:
                        self._add_gene_to_graph(
                            ref['symbol'], variant_bnode, ref['id'],
                            self.globaltt['causally_influences'])

            elif re.search(r'intron', variant['type'], flags=re.I):
                LOG.warning(
                    "unable to map intron variant to gene coordinates_2: %s",
                    variant)

            for neighbor, neighbor_id in up_down_genes:
                self._add_gene_to_graph(
                    neighbor, variant_bnode, neighbor_id,
                    self.globaltt['causally_influences'])

            # Unmatched genes are likely because we cannot map the symbol
            # to a gene id or we do not have coordinate information
            for unmatched_gene, unmatched_id in unmatched_genes:
                self._add_gene_to_graph(
                    unmatched_gene, variant_bnode, unmatched_id,
                    self.globaltt['causally_influences'])
    return