def scrub(self): """ The XML file seems to have mixed-encoding; we scrub out the control characters from the file for processing. i.e.?i omia.xml:1555328.28: PCDATA invalid Char value 2 <field name="journal">Bulletin et Memoires de la Societe Centrale de Medic :return: """ logger.info( "Scrubbing out the nasty characters that break our parser.") myfile = '/'.join((self.rawdir, self.files['data']['file'])) tmpfile = '/'.join((self.rawdir, self.files['data']['file']+'.tmp.gz')) t = gzip.open(tmpfile, 'wb') du = DipperUtil() with gzip.open(myfile, 'rb') as f: filereader = io.TextIOWrapper(f, newline="") for l in filereader: l = du.remove_control_characters(l) + '\n' t.write(l.encode('utf-8')) t.close() # TEC I do not like this at all. original data must be preserved as is. # also may be heavy handed as chars which do not break the parser # are stripped as well (i.e. tabs and newlines) # move the temp file logger.info("Replacing the original data with the scrubbed file.") shutil.move(tmpfile, myfile) return
def _add_gene_equivalencies(self, xrefs, gene_id, taxon): """ Add equivalentClass and sameAs relationships Uses external resource map located in /resources/clique_leader.yaml to determine if an ID space is a clique leader """ clique_map = self.open_and_parse_yaml(self.resources['clique_leader']) if self.testMode: graph = self.testgraph else: graph = self.graph filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport'] # These will be made xrefs taxon_spec_xref_filters = {'10090': ['ENSEMBL'], '9606': ['ENSEMBL']} if taxon in taxon_spec_xref_filters: taxon_spec_filters = taxon_spec_xref_filters[taxon] else: taxon_spec_filters = [] model = Model(graph) # deal with the xrefs # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696 for ref in xrefs.strip().split('|'): xref_curie = self._cleanup_id(ref) if xref_curie is not None and xref_curie.strip() != '': if re.match(r'HPRD', xref_curie): # proteins are not == genes. model.addTriple(gene_id, self.properties['has_gene_product'], xref_curie) continue # skip some of these for now if xref_curie.split(':')[0] in filter_out: continue if xref_curie.split(':')[0] in taxon_spec_xref_filters: model.addXref(gene_id, xref_curie) if re.match(r'^OMIM', xref_curie): if DipperUtil.is_omim_disease(xref_curie): continue try: if self.class_or_indiv.get(gene_id) == 'C': model.addEquivalentClass(gene_id, xref_curie) if int(taxon) in clique_map: if clique_map[int(taxon)] == xref_curie.split( ':')[0]: model.makeLeader(xref_curie) elif clique_map[int(taxon)] == gene_id.split( ':')[0]: model.makeLeader(gene_id) else: model.addSameIndividual(gene_id, xref_curie) except AssertionError as e: logger.warn("Error parsing {0}: {1}".format(gene_id, e)) return
def _add_variant_trait_association(self, variant_id, mapped_trait_uri, mapped_trait, mondo_data, pubmed_id, description=None): if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) # Use mapped traits for labels, hope that labels do not contain commas mapped_traits = [trait.strip() for trait in mapped_trait.split(',')] mapped_trait_uris = [ iri.strip() for iri in mapped_trait_uri.split(',') ] if mapped_trait_uris: for index, trait in enumerate(mapped_trait_uris): trait_curie = trait.replace("http://www.ebi.ac.uk/efo/EFO_", "EFO:") if not DipperUtil.is_id_in_mondo(trait_curie, mondo_data): if re.match(r'^EFO', trait_curie): model.addClassToGraph( trait_curie, mapped_traits[index], self.globaltt['phenotype'], class_category=blv.terms['PhenotypicFeature']) LOG.debug("{} not in mondo".format(trait_curie)) else: LOG.debug("{} in mondo".format(trait_curie)) pubmed_curie = 'PMID:' + pubmed_id ref = Reference(graph, pubmed_curie, self.globaltt['journal article']) ref.addRefToGraph() if trait_curie is not None and trait_curie != '' and \ variant_id is not None and variant_id != '': assoc = G2PAssoc( graph, self.name, variant_id, trait_curie, model.globaltt['contributes to condition']) assoc.add_source(pubmed_curie) assoc.add_evidence(self.globaltt[ 'combinatorial evidence used in automatic assertion']) if description is not None: assoc.set_description(description) # FIXME score should get added to provenance/study # assoc.set_score(pvalue) assoc.add_association_to_graph()
def _add_gene_equivalencies(self, xrefs, gene_id, taxon): """ Add equivalentClass and sameAs relationships Uses external resource map located in /resources/clique_leader.yaml to determine if an NCBITaxon ID space is a clique leader """ clique_map = self.open_and_parse_yaml(self.resources['clique_leader']) if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport'] # deal with the dbxrefs # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696 for dbxref in xrefs.strip().split('|'): prefix = ':'.join(dbxref.split(':')[:-1]).strip() if prefix in self.localtt: prefix = self.localtt[prefix] dbxref_curie = ':'.join((prefix, dbxref.split(':')[-1])) if dbxref_curie is not None and prefix != '': if prefix == 'HPRD': # proteins are not == genes. model.addTriple(gene_id, self.globaltt['has gene product'], dbxref_curie) continue # skip some of these for now based on curie prefix if prefix in filter_out: continue if prefix == 'ENSEMBL': model.addXref(gene_id, dbxref_curie) if prefix == 'OMIM': if DipperUtil.is_omim_disease(dbxref_curie): continue try: if self.class_or_indiv.get(gene_id) == 'C': model.addEquivalentClass(gene_id, dbxref_curie) if taxon in clique_map: if clique_map[taxon] == prefix: model.makeLeader(dbxref_curie) elif clique_map[taxon] == gene_id.split(':')[0]: model.makeLeader(gene_id) else: model.addSameIndividual(gene_id, dbxref_curie) except AssertionError as err: LOG.warning("Error parsing %s: %s", gene_id, err) return
def _process_haplotype(self, hap_id, hap_label, chrom_num, chrom_pos, context, risk_allele_frequency, mapped_gene, so_ontology): if self.test_mode: graph = self.testgraph else: graph = self.graph geno = Genotype(graph) model = Model(graph) # add the feature to the graph hap_description = None if risk_allele_frequency not in ['', 'NR']: hap_description = str( risk_allele_frequency) + ' [risk allele frequency]' model.addIndividualToGraph(hap_id, hap_label.strip(), self.globaltt['haplotype'], hap_description) geno.addTaxon(self.globaltt["H**o sapiens"], hap_id) snp_labels = re.split(r';\s?', hap_label) chrom_nums = re.split(r';\s?', chrom_num) chrom_positions = re.split(r';\s?', chrom_pos) context_list = re.split(r';\s?', context) mapped_genes = re.split(r';\s?', mapped_gene) # Not having four "PAX5" as a list might be better, but it breaks unit tests # mapped_genes = list(set(mapped_genes)) # make uniq # snp_labels = list(set(snp_labels)) # make uniq snp_curies = list() for snp in snp_labels: snp_curie, snp_type = self._get_curie_and_type_from_id(snp) if snp_type is None: LOG.info('cant find type for SNP in %s', snp) # make blank node snp_curie = self.make_id(snp, "_") model.addLabel(snp_curie, snp) elif snp_curie[0] == '_': # arrived an unlabeled blanknode model.addLabel(snp_curie, snp) graph.addTriple(hap_id, self.globaltt['has_variant_part'], snp_curie) snp_curies.append(snp_curie) # courtesy http://stackoverflow.com/a/16720915 # check lengths of mutiple lists length = len(snp_curies) if not all( len(lst) == length for lst in [snp_labels, chrom_nums, chrom_positions, context_list]): LOG.warning( "Incongruous data field(s) for haplotype %s \n " "will not add snp details", hap_label) else: variant_in_gene_count = 0 for index, snp_curie in enumerate(snp_curies): self._add_snp_to_graph(snp_curie, snp_labels[index], chrom_nums[index], chrom_positions[index], context_list[index]) if mapped_genes and len(mapped_genes) != len(snp_labels): LOG.warning("More mapped genes than snps," " cannot disambiguate for\n%s\n%s", mapped_genes, snp_labels) # hap_label) else: so_class = self.resolve(context_list[index]) so_query = """ SELECT ?variant_label WHERE {{ {0} rdfs:subClassOf+ {1} ; rdfs:label ?variant_label . }} """.format(so_class, self.globaltt['gene_variant']) query_result = so_ontology.query(so_query) gene_id = DipperUtil.get_hgnc_id_from_symbol( mapped_genes[index]) if gene_id is not None and len(list(query_result)) == 1: if context_list[index] in [ 'upstream_gene_variant', 'downstream_gene_variant' ]: graph.addTriple(snp_curie, self.resolve(context_list[index]), gene_id) else: geno.addAffectedLocus(snp_curie, gene_id) variant_in_gene_count += 1 # Seperate in case we want to apply a different relation # If not this is redundant with triples added above if len(mapped_genes) == variant_in_gene_count and \ len(set(mapped_genes)) == 1: gene_id = DipperUtil.get_hgnc_id_from_symbol(mapped_genes[0]) geno.addAffectedLocus(hap_id, gene_id)
def _process_genes(self, limit=None): if self.testMode: graph = self.testgraph else: graph = self.graph geno = Genotype(graph) model = Model(graph) raw = '/'.join((self.rawdir, self.files['genes']['file'])) line_counter = 0 logger.info("Processing HGNC genes") with open(raw, 'r', encoding="utf8") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n . for row in filereader: (hgnc_id, symbol, name, locus_group, locus_type, status, location, location_sortable, alias_symbol, alias_name, prev_symbol, prev_name, gene_family, gene_family_id, date_approved_reserved, date_symbol_changed, date_name_changed, date_modified, entrez_id, ensembl_gene_id, vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids, pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase, homeodb, snornabase, bioparadigms_slc, orphanet, pseudogene_org, horde_id, merops, imgt, iuphar, kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id, intermediate_filament_db, rna_central_ids) = row line_counter += 1 # skip header if line_counter <= 1: continue if self.testMode and entrez_id != '' and \ int(entrez_id) not in self.gene_ids: continue if name == '': name = None gene_type_id = self.resolve(locus_type, False) # withdrawn -> None? if gene_type_id != locus_type: model.addClassToGraph(hgnc_id, symbol, gene_type_id, name) if locus_type == 'withdrawn': model.addDeprecatedClass(hgnc_id) else: model.makeLeader(hgnc_id) if entrez_id != '': model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id) if ensembl_gene_id != '': model.addEquivalentClass(hgnc_id, 'ENSEMBL:' + ensembl_gene_id) if omim_id != '' and "|" not in omim_id: omim_curie = 'OMIM:' + omim_id if not DipperUtil.is_omim_disease(omim_curie): model.addEquivalentClass(hgnc_id, omim_curie) geno.addTaxon(self.hs_txid, hgnc_id) # add pubs as "is about" if pubmed_id != '': for p in re.split(r'\|', pubmed_id.strip()): if str(p) != '': graph.addTriple('PMID:' + str(p.strip()), self.globaltt['is_about'], hgnc_id) # add chr location # sometimes two are listed, like: 10p11.2 or 17q25 # -- there are only 2 of these FRA10A and MPFD # sometimes listed like "1 not on reference assembly" # sometimes listed like 10q24.1-q24.3 # sometimes like 11q11 alternate reference locus band = chrom = None chr_pattern = r'(\d+|X|Y|Z|W|MT)[pq$]' chr_match = re.match(chr_pattern, location) if chr_match is not None and len(chr_match.groups()) > 0: chrom = chr_match.group(1) chrom_id = makeChromID(chrom, self.hs_txid, 'CHR') band_pattern = r'([pq][A-H\d]?\d?(?:\.\d+)?)' band_match = re.search(band_pattern, location) feat = Feature(graph, hgnc_id, None, None) if band_match is not None and len(band_match.groups()) > 0: band = band_match.group(1) band = chrom + band # add the chr band as the parent to this gene # as a feature but assume that the band is created # as a class with properties elsewhere in Monochrom band_id = makeChromID(band, self.hs_txid, 'CHR') model.addClassToGraph(band_id, None) feat.addSubsequenceOfFeature(band_id) else: model.addClassToGraph(chrom_id, None) feat.addSubsequenceOfFeature(chrom_id) if not self.testMode and limit is not None and line_counter > limit: break # end loop through file return
def _process_data(self, source, limit=None): """ This function will process the data files from Coriell. We make the assumption that any alleles listed are variants (alternates to w.t.) Triples: (examples) :NIGMSrepository a CLO_0000008 #repository label : NIGMS Human Genetic Cell Repository foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8 line_id a CL_0000057, #fibroblast line derives_from patient_id part_of :NIGMSrepository RO:model_of OMIM:disease_id patient id a foaf:person, label: "fibroblast from patient 12345 with disease X" member_of family_id #what is the right thing here? SIO:race EFO:caucasian #subclass of EFO:0001799 in_taxon NCBITaxon:9606 dc:description Literal(remark) RO:has_phenotype OMIM:disease_id GENO:has_genotype genotype_id family_id a owl:NamedIndividual foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM" genotype_id a intrinsic_genotype GENO:has_alternate_part allelic_variant_id we don't necessarily know much about the genotype, other than the allelic variant. also there's the sex here pub_id mentions cell_line_id :param raw: :param limit: :return: """ raw = '/'.join((self.rawdir, self.files[source]['file'])) LOG.info("Processing Data from %s", raw) if self.testMode: # set the graph to build graph = self.testgraph else: graph = self.graph family = Family(graph) model = Model(graph) line_counter = 1 geno = Genotype(graph) diputil = DipperUtil() col = self.files[source]['columns'] # affords access with # x = row[col.index('x')].strip() with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"') # we can keep a close watch on changing file formats fileheader = next(filereader, None) fileheader = [c.lower() for c in fileheader] if col != fileheader: # assert LOG.error('Expected %s to have columns: %s', raw, col) LOG.error('But Found %s to have columns: %s', raw, fileheader) raise AssertionError('Incomming data headers have changed.') for row in filereader: line_counter += 1 if len(row) != len(col): LOG.warning('Expected %i values but find %i in row %i', len(col), len(row), line_counter) continue # (catalog_id, description, omim_number, sample_type, # cell_line_available, dna_in_stock, dna_ref, gender, age, # race, ethnicity, affected, karyotype, relprob, mutation, # gene, family_id, collection, url, cat_remark, pubmed_ids, # family_member, variant_id, dbsnp_id, species) = row # example: # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No, # ,Female,26 YR,Caucasian,,,, # parent,,,39,NIGMS Human Genetic Cell Repository, # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003, # 46;XX; clinically normal mother of a child with Hurler syndrome; # proband not in Repository,, # 2,,18343,H**o sapiens catalog_id = row[col.index('catalog_id')].strip() if self.testMode and catalog_id not in self.test_lines: # skip rows not in our test lines, when in test mode continue # ########### BUILD REQUIRED VARIABLES ########### # Make the cell line ID cell_line_id = 'Coriell:' + catalog_id # Map the cell/sample type cell_type = self.resolve(row[col.index('sample_type')].strip()) # on fail cell_type = self.globaltt['cell'] ? # Make a cell line label collection = row[col.index('collection')].strip() line_label = collection.partition(' ')[0] + '-' + catalog_id # Map the repository/collection repository = self.localtt[collection] # patients are uniquely identified by one of: # dbsnp id (which is == an individual haplotype) # family id + family member (if present) OR # probands are usually family member zero # cell line id # since some patients have >1 cell line derived from them, # we must make sure that the genotype is attached to # the patient, and can be inferred to the cell line # examples of repeated patients are: # famid=1159, member=1; fam=152,member=1 # Make the patient ID # make an anonymous patient patient_id = '_:person' fam_id = row[col.index('fam')].strip() fammember = row[col.index('fammember')].strip() if fam_id != '': patient_id = '-'.join((patient_id, fam_id, fammember)) else: # make an anonymous patient patient_id = '-'.join((patient_id, catalog_id)) # properties of the individual patients: sex, family id, # member/relproband, description descriptions are # really long and ugly SCREAMING text, so need to clean up # the control cases are so odd with this labeling scheme; # but we'll deal with it as-is for now. description = row[col.index('description')].strip() short_desc = (description.split(';')[0]).capitalize() gender = row[col.index('gender')].strip().lower() affected = row[col.index('affected')].strip() relprob = row[col.index('relprob')].strip() if affected == '': affected = 'unspecified' elif affected in self.localtt: affected = self.localtt[affected] else: LOG.warning('Novel Affected status %s at row: %i of %s', affected, line_counter, raw) patient_label = ' '.join((affected, gender, relprob)) if relprob == 'proband': patient_label = ' '.join( (patient_label.strip(), 'with', short_desc)) else: patient_label = ' '.join( (patient_label.strip(), 'of proband with', short_desc)) # ############# BUILD THE CELL LINE ############# # Adding the cell line as a typed individual. cell_line_reagent_id = self.globaltt['cell line'] model.addIndividualToGraph(cell_line_id, line_label, cell_line_reagent_id) # add the equivalent id == dna_ref dna_ref = row[col.index('dna_ref')].strip() if dna_ref != '' and dna_ref != catalog_id: equiv_cell_line = 'Coriell:' + dna_ref # some of the equivalent ids are not defined # in the source data; so add them model.addIndividualToGraph(equiv_cell_line, None, cell_line_reagent_id) model.addSameIndividual(cell_line_id, equiv_cell_line) # Cell line derives from patient geno.addDerivesFrom(cell_line_id, patient_id) geno.addDerivesFrom(cell_line_id, cell_type) # Cell line a member of repository family.addMember(repository, cell_line_id) cat_remark = row[col.index('cat_remark')].strip() if cat_remark != '': model.addDescription(cell_line_id, cat_remark) # Cell age_at_sampling # TODO add the age nodes when modeled properly in #78 # if (age != ''): # this would give a BNode that is an instance of Age. # but i don't know how to connect # the age node to the cell line? we need to ask @mbrush # age_id = '_'+re.sub('\s+','_',age) # gu.addIndividualToGraph( # graph,age_id,age,self.globaltt['age']) # gu.addTriple( # graph,age_id,self.globaltt['has measurement value'],age, # True) # ############# BUILD THE PATIENT ############# # Add the patient ID as an individual. model.addPerson(patient_id, patient_label) # TODO map relationship to proband as a class # (what ontology?) # Add race of patient # FIXME: Adjust for subcategories based on ethnicity field # EDIT: There are 743 different entries for ethnicity... # Too many to map? # Add ethnicity as literal in addition to the mapped race? # Adjust the ethnicity txt (if using) # to initial capitalization to remove ALLCAPS # TODO race should go into the individual's background # and abstracted out to the Genotype class punting for now. # if race != '': # mapped_race = self.resolve(race) # if mapped_race is not None: # gu.addTriple( # g,patient_id,self.globaltt['race'], mapped_race) # model.addSubClass( # mapped_race,self.globaltt['ethnic_group']) # ############# BUILD THE FAMILY ############# # Add triples for family_id, if present. if fam_id != '': family_comp_id = 'CoriellFamily:' + fam_id family_label = ' '.join( ('Family of proband with', short_desc)) # Add the family ID as a named individual model.addIndividualToGraph(family_comp_id, family_label, self.globaltt['family']) # Add the patient as a member of the family family.addMemberOf(patient_id, family_comp_id) # ############# BUILD THE GENOTYPE ############# # the important things to pay attention to here are: # karyotype = chr rearrangements (somatic?) # mutation = protein-level mutation as a label, # often from omim # gene = gene symbol - TODO get id # variant_id = omim variant ids (; delimited) # dbsnp_id = snp individual ids = full genotype? # note GM00633 is a good example of chromosomal variation # - do we have enough to capture this? # GM00325 has both abnormal karyotype and variation # make an assumption that if the taxon is blank, # that it is human! species = row[col.index('species')].strip() if species is None or species == '': species = 'H**o sapiens' taxon = self.resolve(species) # if there's a dbSNP id, # this is actually the individual's genotype genotype_id = None genotype_label = None dbsnp_id = row[col.index('dbsnp_id')].strip() if dbsnp_id != '': genotype_id = 'dbSNPIndividual:' + dbsnp_id omim_map = {} gvc_id = None # some of the karyotypes are encoded # with terrible hidden codes. remove them here # i've seen a <98> character karyotype = row[col.index('karyotype')].strip() karyotype = diputil.remove_control_characters(karyotype) karyotype_id = None if karyotype.strip() != '': karyotype_id = '_:' + re.sub('MONARCH:', '', self.make_id(karyotype)) # add karyotype as karyotype_variation_complement model.addIndividualToGraph( karyotype_id, karyotype, self.globaltt['karyotype_variation_complement']) # TODO break down the karyotype into parts # and map into GENO. depends on #77 # place the karyotype in a location(s). karyo_chrs = self._get_affected_chromosomes_from_karyotype( karyotype) for chrom in karyo_chrs: chr_id = makeChromID(chrom, taxon, 'CHR') # add an anonymous sequence feature, # each located on chr karyotype_feature_id = '-'.join((karyotype_id, chrom)) karyotype_feature_label = \ 'some karyotype alteration on chr' + str(chrom) feat = Feature(graph, karyotype_feature_id, karyotype_feature_label, self.globaltt['sequence_alteration']) feat.addFeatureStartLocation(None, chr_id) feat.addFeatureToGraph() geno.addParts(karyotype_feature_id, karyotype_id, self.globaltt['has_variant_part']) gene = row[col.index('gene')].strip() mutation = row[col.index('mutation')].strip() if gene != '': vl = gene + '(' + mutation + ')' # fix the variant_id so it's always in the same order variant_id = row[col.index('variant_id')].strip() vids = variant_id.split(';') variant_id = ';'.join(sorted(list(set(vids)))) if karyotype.strip() != '' and not self._is_normal_karyotype( karyotype): gvc_id = karyotype_id if variant_id != '': gvc_id = '_:' + variant_id.replace(';', '-') + '-' \ + re.sub(r'\w*:', '', karyotype_id) if mutation.strip() != '': gvc_label = '; '.join((vl, karyotype)) else: gvc_label = karyotype elif variant_id.strip() != '': gvc_id = '_:' + variant_id.replace(';', '-') gvc_label = vl else: # wildtype? pass # add the karyotype to the gvc. # use reference if normal karyotype karyo_rel = self.globaltt['has_variant_part'] if self._is_normal_karyotype(karyotype): karyo_rel = self.globaltt['has_reference_part'] if karyotype_id is not None \ and not self._is_normal_karyotype(karyotype) \ and gvc_id is not None and karyotype_id != gvc_id: geno.addParts(karyotype_id, gvc_id, karyo_rel) if variant_id.strip() != '': # split the variants & add them as part of the genotype # we don't necessarily know their zygosity, # just that they are part of the genotype variant ids # are from OMIM, so prefix as such we assume that the # sequence alts will be defined in OMIM not here # TODO sort the variant_id list, if the omim prefix is # the same, then assume it's the locus make a hashmap # of the omim id to variant id list; # then build the genotype hashmap is also useful for # removing the "genes" from the list of "phenotypes" # will hold gene/locus id to variant list omim_map = {} locus_num = None for var in variant_id.split(';'): # handle omim-style and odd var ids # like 610661.p.R401X mch = re.match(r'(\d+)\.+(.*)', var.strip()) if mch is not None and len(mch.groups()) == 2: (locus_num, var_num) = mch.groups() if locus_num is not None and locus_num not in omim_map: omim_map[locus_num] = [var_num] else: omim_map[locus_num] += [var_num] for omim in omim_map: # gene_id = 'OMIM:' + omim # TODO unused vslc_id = '_:' + '-'.join( [omim + '.' + a for a in omim_map.get(omim)]) vslc_label = vl # we don't really know the zygosity of # the alleles at all. # so the vslcs are just a pot of them model.addIndividualToGraph( vslc_id, vslc_label, self.globaltt['variant single locus complement']) for var in omim_map.get(omim): # this is actually a sequence alt allele1_id = 'OMIM:' + omim + '.' + var geno.addSequenceAlteration(allele1_id, None) # assume that the sa -> var_loc -> gene # is taken care of in OMIM geno.addPartsToVSLC( vslc_id, allele1_id, None, self.globaltt['indeterminate'], self.globaltt['has_variant_part']) if vslc_id != gvc_id: geno.addVSLCtoParent(vslc_id, gvc_id) if affected == 'unaffected': # let's just say that this person is wildtype model.addType(patient_id, self.globaltt['wildtype']) elif genotype_id is None: # make an anonymous genotype id (aka blank node) genotype_id = '_:geno' + catalog_id.strip() # add the gvc if gvc_id is not None: model.addIndividualToGraph( gvc_id, gvc_label, self.globaltt['genomic_variation_complement']) # add the gvc to the genotype if genotype_id is not None: if affected == 'unaffected': rel = self.globaltt['has_reference_part'] else: rel = self.globaltt['has_variant_part'] geno.addParts(gvc_id, genotype_id, rel) if karyotype_id is not None \ and self._is_normal_karyotype(karyotype): if gvc_label is not None and gvc_label != '': genotype_label = '; '.join((gvc_label, karyotype)) elif karyotype is not None: genotype_label = karyotype if genotype_id is None: genotype_id = karyotype_id else: geno.addParts(karyotype_id, genotype_id, self.globaltt['has_reference_part']) else: genotype_label = gvc_label # use the catalog id as the background genotype_label += ' [' + catalog_id.strip() + ']' if genotype_id is not None and gvc_id is not None: # only add the genotype if it has some parts geno.addGenotype(genotype_id, genotype_label, self.globaltt['intrinsic_genotype']) geno.addTaxon(taxon, genotype_id) # add that the patient has the genotype # TODO check if the genotype belongs to # the cell line or to the patient graph.addTriple(patient_id, self.globaltt['has_genotype'], genotype_id) else: geno.addTaxon(taxon, patient_id) # TODO: Add sex/gender (as part of the karyotype?) # = row[col.index('')].strip() # ############# DEAL WITH THE DISEASES ############# omim_num = row[col.index('omim_num')].strip() # we associate the disease to the patient if affected == 'affected' and omim_num != '': for d in omim_num.split(';'): if d is not None and d != '': # if the omim number is in omim_map, # then it is a gene not a pheno # TEC - another place to use the mimTitle omim # classifier omia & genereviews are using if d not in omim_map: disease_id = 'OMIM:' + d.strip() # assume the label is taken care of in OMIM model.addClassToGraph(disease_id, None) # add the association: # the patient has the disease assoc = G2PAssoc(graph, self.name, patient_id, disease_id) assoc.add_association_to_graph() # this line is a model of this disease # TODO abstract out model into # it's own association class? graph.addTriple(cell_line_id, self.globaltt['is model of'], disease_id) else: LOG.info('drop gene %s from disease list', d) # ############# ADD PUBLICATIONS ############# pubmed_ids = row[col.index('pubmed_ids')].strip() if pubmed_ids != '': for s in pubmed_ids.split(';'): pubmed_id = 'PMID:' + s.strip() ref = Reference(graph, pubmed_id) ref.setType(self.globaltt['journal article']) ref.addRefToGraph() graph.addTriple(pubmed_id, self.globaltt['mentions'], cell_line_id) if not self.testMode and (limit is not None and line_counter > limit): break return
def _add_variant_gene_relationship(self, patient_var_map, gene_coordinate_map): """ Right now it is unclear the best approach on how to connect variants to genes. In most cases has_affected_locus/GENO:0000418 is accurate; however, there are cases where a variant is in the intron on one gene and is purported to causally affect another gene down or upstream. In these cases we must first disambiguate which gene is the affected locus, and which gene(s) are predicated to be causully influenced by (RO:0002566) UPDATE 8-30: In the latest dataset we no longer have 1-many mappings between variants and genes, but leaving this here in case we see these in the future The logic followed here is: if mutation type contains downstream/upstream and more than one gene of interest, investigate coordinates of all genes to see if we can disambiguate which genes are which :return: None """ # genotype = Genotype(self.graph) dipper_util = DipperUtil() model = Model(self.graph) # Note this could be compressed in someway to remove one level of for looping for patient in patient_var_map: for variant_id, variant in patient_var_map[patient].items(): variant_bnode = self.make_id("{0}".format(variant_id), "_") genes_of_interest = variant['genes_of_interest'] if len(genes_of_interest) == 1: # Assume variant is variant allele of gene gene = genes_of_interest[0] gene_id = dipper_util.get_hgnc_id_from_symbol(gene) self._add_gene_to_graph( gene, variant_bnode, gene_id, self.globaltt['has_affected_feature']) elif re.search(r'upstream|downstream', variant['type'], flags=re.I): # Attempt to disambiguate ref_gene = [] up_down_gene = [] unmatched_genes = [] for gene in variant['genes_of_interest']: if gene_id and gene_id != '' and gene_id in gene_coordinate_map: if gene_coordinate_map[gene_id]['start'] \ <= variant['position']\ <= gene_coordinate_map[gene_id]['end']: gene_info = { 'symbol': gene, 'strand': gene_coordinate_map[gene_id]['strand'] } ref_gene.append(gene_info) else: up_down_gene.append(gene) else: unmatched_genes.append(gene) if len(ref_gene) == 1: self._add_gene_to_graph( ref_gene[0]['symbol'], variant_bnode, gene_id, self.globaltt['has_affected_feature']) # update label with gene gene_list = [ref_gene[0]['symbol'] ] # build label expects list variant_label = self._build_variant_label( variant['build'], variant['chromosome'], variant['position'], variant['reference_allele'], variant['variant_allele'], gene_list) model.addLabel(variant_bnode, variant_label) # In some cases there are multiple instances # of same gene from dupe rows in the source # Credit http://stackoverflow.com/a/3844832 elif len(ref_gene) > 0 and ref_gene[1:] == ref_gene[:-1]: self._add_gene_to_graph( ref_gene[0]['symbol'], variant_bnode, gene_id, self.globaltt['has_affected_feature']) # build label function expects list gene_list = [ref_gene[0]['symbol']] variant_label = self._build_variant_label( variant['build'], variant['chromosome'], variant['position'], variant['reference_allele'], variant['variant_allele'], gene_list) model.addLabel(variant_bnode, variant_label) # Check if reference genes are on different strands elif len(ref_gene) == 2: strands = [st['strand'] for st in ref_gene] if "minus" in strands and "plus" in strands: for r_gene in ref_gene: self._add_gene_to_graph( r_gene['symbol'], variant_bnode, gene_id, self.globaltt['has_affected_feature']) else: LOG.warning( "unable to map intron variant to gene coordinates: %s", variant) for r_gene in ref_gene: self._add_gene_to_graph( r_gene['symbol'], variant_bnode, gene_id, self.globaltt['causally_influences']) elif re.search(r'intron', variant['type'], flags=re.I): LOG.warning( "unable to map intron variant to gene coordinates_2: %s", variant) for neighbor in up_down_gene: self._add_gene_to_graph( neighbor, variant_bnode, gene_id, self.globaltt['causally_influences']) # Unmatched genes are likely because we cannot map to an NCBIGene # or we do not have coordinate information for unmatched_gene in unmatched_genes: self._add_gene_to_graph( unmatched_gene, variant_bnode, gene_id, self.globaltt['causally_influences']) return
def _process_omim2gene(self, limit=None): """ This method maps the OMIM IDs and KEGG gene ID. Currently split based on the link_type field. Equivalent link types are mapped as gene XRefs. Reverse link types are mapped as disease to gene associations. Original link types are currently skipped. Triples created: <kegg_gene_id> is a Gene <omim_gene_id> is a Gene <kegg_gene_id>> hasXref <omim_gene_id> <assoc_id> has subject <omim_disease_id> <assoc_id> has object <kegg_gene_id> :param limit: :return: """ LOG.info("Processing OMIM to KEGG gene") if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) line_counter = 0 geno = Genotype(graph) raw = '/'.join((self.rawdir, self.files['omim2gene']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (kegg_gene_id, omim_id, link_type) = row if self.test_mode and kegg_gene_id not in self.test_ids['genes']: continue kegg_gene_id = 'KEGG-' + kegg_gene_id.strip() omim_id = re.sub(r'omim', 'OMIM', omim_id) if link_type == 'equivalent': # these are genes! # so add them as a class then make equivalence model.addClassToGraph(omim_id, None) geno.addGene(kegg_gene_id, None) if not DipperUtil.is_omim_disease(omim_id): model.addEquivalentClass(kegg_gene_id, omim_id) elif link_type == 'reverse': # make an association between an OMIM ID & the KEGG gene ID # we do this with omim ids because # they are more atomic than KEGG ids alt_locus_id = self._make_variant_locus_id(kegg_gene_id, omim_id) alt_label = self.label_hash[alt_locus_id] model.addIndividualToGraph( alt_locus_id, alt_label, self.globaltt['variant_locus']) geno.addAffectedLocus(alt_locus_id, kegg_gene_id) model.addBlankNodeAnnotation(alt_locus_id) # Add the disease to gene relationship. rel = self.globaltt['is marker for'] assoc = G2PAssoc(graph, self.name, alt_locus_id, omim_id, rel) assoc.add_association_to_graph() elif link_type == 'original': # these are sometimes a gene, and sometimes a disease LOG.info( 'Unable to handle original link for %s-%s', kegg_gene_id, omim_id) else: # don't know what these are LOG.warning( 'Unhandled link type for %s-%s: %s', kegg_gene_id, omim_id, link_type) if (not self.test_mode) and ( limit is not None and line_counter > limit): break LOG.info("Done with OMIM to KEGG gene") return
def _process_haplotype( self, hap_id, hap_label, chrom_num, chrom_pos, context, risk_allele_frequency, mapped_gene, so_ontology): tax_id = 'NCBITaxon:9606' if self.testMode: g = self.testgraph else: g = self.graph geno = Genotype(g) model = Model(g) # add the feature to the graph hap_description = None if risk_allele_frequency != '' and \ risk_allele_frequency != 'NR': hap_description = \ str(risk_allele_frequency) + \ ' [risk allele frequency]' model.addIndividualToGraph(hap_id, hap_label.strip(), Feature.types['haplotype'], hap_description) geno.addTaxon(tax_id, hap_id) snp_labels = re.split(r';\s?', hap_label) chrom_nums = re.split(r';\s?', chrom_num) chrom_positions = re.split(r';\s?', chrom_pos) context_list = re.split(r';\s?', context) mapped_genes = re.split(r';\s?', mapped_gene) snp_curies = list() for index, snp in enumerate(snp_labels): snp_curie, snp_type = self._get_curie_and_type_from_id(snp) if snp_type is None: # make blank node snp_curie = self.make_id(snp, "_") g.addTriple(hap_id, geno.object_properties['has_variant_part'], snp_curie) snp_curies.append(snp_curie) # courtesy http://stackoverflow.com/a/16720915 length = len(snp_labels) if not all(len(lst) == length for lst in [chrom_nums, chrom_positions, context_list]): logger.warn( "Unexpected data field for haplotype {} \n " "will not add snp details".format(hap_label)) return variant_in_gene_count = 0 for index, snp_curie in enumerate(snp_curies): self._add_snp_to_graph( snp_curie, snp_labels[index], chrom_nums[index], chrom_positions[index], context_list[index]) if len(mapped_genes) == len(snp_labels): so_class = self._map_variant_type(context_list[index]) if so_class is None: raise ValueError("Unknown SO class {} in haplotype {}" .format(context_list[index], hap_label)) so_query = """ SELECT ?variant_label WHERE {{ {0} rdfs:subClassOf+ SO:0001564 ; rdfs:label ?variant_label . }} """.format(so_class) query_result = so_ontology.query(so_query) if len(list(query_result)) > 0: gene_id = DipperUtil.get_ncbi_id_from_symbol( mapped_genes[index]) if gene_id is not None: geno.addAffectedLocus(snp_curie, gene_id) geno.addAffectedLocus(hap_id, gene_id) variant_in_gene_count += 1 if context_list[index] == 'upstream_gene_variant': gene_id = DipperUtil.get_ncbi_id_from_symbol( mapped_genes[index]) if gene_id is not None: g.addTriple( snp_curie, Feature.object_properties[ 'upstream_of_sequence_of'], gene_id) elif context_list[index] == 'downstream_gene_variant': gene_id = DipperUtil.get_ncbi_id_from_symbol( mapped_genes[index]) if gene_id is not None: g.addTriple( snp_curie, Feature.object_properties[ 'downstream_of_sequence_of'], gene_id) else: logger.warn("More mapped genes than snps, " "cannot disambiguate for {}".format(hap_label)) # Seperate in case we want to apply a different relation # If not this is redundant with triples added above if len(mapped_genes) == variant_in_gene_count \ and len(set(mapped_genes)) == 1: gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0]) geno.addAffectedLocus(hap_id, gene_id) return
def _process_haplotype(self, hap_id, hap_label, chrom_num, chrom_pos, context, risk_allele_frequency, mapped_gene, so_ontology): if self.test_mode: graph = self.testgraph else: graph = self.graph geno = Genotype(graph) model = Model(graph) # add the feature to the graph hap_description = None if risk_allele_frequency != '' and risk_allele_frequency != 'NR': hap_description = str( risk_allele_frequency) + ' [risk allele frequency]' model.addIndividualToGraph(hap_id, hap_label.strip(), self.globaltt['haplotype'], hap_description) geno.addTaxon(self.globaltt["H**o sapiens"], hap_id) snp_labels = re.split(r';\s?', hap_label) chrom_nums = re.split(r';\s?', chrom_num) chrom_positions = re.split(r';\s?', chrom_pos) context_list = re.split(r';\s?', context) mapped_genes = re.split(r';\s?', mapped_gene) snp_curies = list() for index, snp in enumerate(snp_labels): snp_curie, snp_type = self._get_curie_and_type_from_id(snp) if snp_type is None: # make blank node snp_curie = self.make_id(snp, "_") graph.addTriple(hap_id, self.globaltt['has_variant_part'], snp_curie) snp_curies.append(snp_curie) # courtesy http://stackoverflow.com/a/16720915 length = len(snp_labels) if not all( len(lst) == length for lst in [chrom_nums, chrom_positions, context_list]): LOG.warning( "Unexpected data field for haplotype %s \n " "will not add snp details", hap_label) return variant_in_gene_count = 0 for index, snp_curie in enumerate(snp_curies): self._add_snp_to_graph(snp_curie, snp_labels[index], chrom_nums[index], chrom_positions[index], context_list[index]) if len(mapped_genes) == len(snp_labels): so_class = self.resolve(context_list[index]) # removed the '+' for recursive one-or-more rdfs:subClassOf paths # just so it did not return an empty graph so_query = """ SELECT ?variant_label WHERE {{ {0} rdfs:subClassOf {1} ; rdfs:label ?variant_label . }} """.format(so_class, self.globaltt['gene_variant']) query_result = so_ontology.query(so_query) if len(list(query_result)) == 1: gene_id = DipperUtil.get_ncbi_id_from_symbol( mapped_genes[index]) if gene_id is not None: geno.addAffectedLocus(snp_curie, gene_id) geno.addAffectedLocus(hap_id, gene_id) variant_in_gene_count += 1 gene_id = DipperUtil.get_ncbi_id_from_symbol( mapped_genes[index]) if gene_id is not None: graph.addTriple(snp_curie, self.resolve(context_list[index]), gene_id) else: LOG.warning( "More mapped genes than snps, cannot disambiguate for %s", hap_label) # Seperate in case we want to apply a different relation # If not this is redundant with triples added above if len(mapped_genes) == variant_in_gene_count and len( set(mapped_genes)) == 1: gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0]) geno.addAffectedLocus(hap_id, gene_id) return