def _get_gene_history(self, limit):
    """
    Record discontinued genes from the gzipped gene_history file.

    For every retained row the discontinued gene id is added as a class
    (labelled with its old symbol) and marked as deprecated in favour of
    the current gene id.  The old symbol is also attached to the current
    gene as a synonym.

    A row is ignored when it is a comment (leading '#'), when either gene
    id column is '-', when running in test mode and the gene is not in
    ``self.gene_ids``, or when the taxon is not in ``self.tax_ids``.

    :param limit: outside of test mode, stop once more than this many
        rows have been retained; None disables the cap
    :return: None
    """
    graph = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())

    logger.info("Processing Gene records")
    history_path = '/'.join(
        (self.rawdir, self.files['gene_history']['file']))
    logger.info("FILE: %s", history_path)

    retained = 0
    with gzip.open(history_path, 'rb') as handle:
        for raw_line in handle:
            record = raw_line.decode().strip()
            # skip comments
            if record.startswith('#'):
                continue

            (tax_num, gene_num, discontinued_num,
             discontinued_symbol, discontinued_date) = record.split('\t')

            # rows without a replacement (or without an old id) say nothing
            if '-' in (gene_num, discontinued_num):
                continue
            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if int(tax_num) not in self.tax_ids:
                continue

            retained += 1
            current_id = 'NCBIGene:' + gene_num
            retired_id = 'NCBIGene:' + discontinued_num

            # both genes become classes; only the retired one gets a label
            gu.addClassToGraph(graph, current_id, None)
            gu.addClassToGraph(graph, retired_id, discontinued_symbol)

            # the current gene replaces the retired one ...
            gu.addDeprecatedClass(graph, retired_id, [current_id])
            # ... and inherits its symbol as a synonym
            gu.addSynonym(graph, current_id, discontinued_symbol)

            if not self.testMode and limit is not None \
                    and retained > limit:
                break

    return
def _process_straininfo(self, limit):
    """
    Add the strains listed in the straininfo CSV to the graph.

    Each row becomes an individual of the mouse taxon (NCBITaxon:10090),
    identified as ``MPD-strain:<mpd_strainid>`` and labelled with the
    strain name.  The short name (if any) is added as a synonym, the
    id -> label pair is cached in ``self.idlabel_hash``, the vendor stock
    number is linked (same-individual for JAX and RIKEN, xref otherwise)
    and the panel, when present, is stored as a description.

    :param limit: accepted for signature parity with the other parsers;
        it is not consulted here
    :return: None
    """
    graph = self.testgraph if self.testMode else self.graph

    logger.info("Processing measurements ...")
    strain_path = '/'.join(
        (self.rawdir, self.files['straininfo']['file']))
    mouse_taxon = 'NCBITaxon:10090'
    gu = GraphUtils(curie_map.get())

    with open(strain_path, 'r') as handle:
        reader = csv.reader(handle, delimiter=',', quotechar='\"')
        handle.readline()  # discard the header row
        for row in reader:
            # e.g. C57BL/6J,J,000664,,7,IN,225,17,,
            #      http://jaxmice.jax.org/strain/000664.html
            (strain_name, vendor, stocknum, panel, mpd_strainid,
             straintype, n_proj, n_snp_datasets, mpdshortname, url) = row

            # NOTE(review): the test filter is keyed on 'MPD:' while the
            # graph id below uses 'MPD-strain:' -- confirm against the
            # contents of self.test_ids
            if self.testMode and \
                    'MPD:' + str(mpd_strainid) not in self.test_ids:
                continue

            # the strain is an instance of the taxon
            strain_id = 'MPD-strain:' + str(mpd_strainid)
            gu.addIndividualToGraph(
                graph, strain_id, strain_name, mouse_taxon)

            short_name = mpdshortname.strip()
            if short_name != '':
                gu.addSynonym(graph, strain_id, short_name)
            self.idlabel_hash[strain_id] = strain_name

            # tie the strain to the vendor's stock number
            if stocknum != '':
                if vendor == 'J':
                    gu.addSameIndividual(
                        graph, strain_id, 'JAX:' + stocknum)
                elif vendor == 'Rbrc':
                    # RIKEN stock numbers may already carry the prefix
                    gu.addSameIndividual(
                        graph, strain_id,
                        'RBRC:' + stocknum.replace('RBRC', ''))
                else:
                    if url != '':
                        gu.addXref(graph, strain_id, url, True)
                    if vendor != '':
                        gu.addXref(
                            graph, strain_id,
                            ':'.join((vendor, stocknum)), True)

            # add the panel information
            # TODO make the panels as a resource collection
            if panel != '':
                gu.addDescription(graph, strain_id, panel + ' [panel]')

    return
def _process_ortholog_classes(self, limit=None):
    """
    Add the KEGG orthology classes to the graph.

    The input is a two-column, tab-delimited file of
    (orthology class id, ';'-separated names).

    Triples created:
    <orthology_class_id> is a class
    <orthology_class_id> has label <first name>
    <orthology_class_id> has synonym <each remaining name>
    <orthology_class_id> has description <last name>

    Synonyms and the description are only written when more than one
    name is present.  The first name is the label and is therefore not
    repeated as a synonym.

    :param limit: outside of test mode, stop once more than this many
        rows have been read; None disables the cap
    :return: None
    """
    logger.info("Processing ortholog classes")
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (orthology_class_id, orthology_class_name) = row

            # the test filter is applied to the raw (unprefixed) id
            if self.testMode and \
                    orthology_class_id not in \
                    self.test_ids['ortholog_classes']:
                continue

            # FIXME: What's the proper route for this?
            # The orthology class is essentially a KEGG gene ID
            # that is species agnostic.
            # Add the ID and label as a class.
            # Would it be considered a gene as well?
            other_labels = orthology_class_name.split(';')
            # the first one is the label we'll use
            orthology_label = other_labels[0]

            orthology_class_id = 'KEGG-' + orthology_class_id.strip()

            orthology_type = OrthologyAssoc.terms['gene_family']
            gu.addClassToGraph(
                g, orthology_class_id, orthology_label, orthology_type)

            if len(other_labels) > 1:
                # add the rest as synonyms; the first entry is already
                # the class label, so it is skipped here
                for s in other_labels[1:]:
                    gu.addSynonym(g, orthology_class_id, s)

                # add the last one as the description
                gu.addDescription(g, orthology_class_id, other_labels[-1])

            if (not self.testMode) and \
                    (limit is not None and line_counter > limit):
                break

    logger.info("Done with ortholog classes")
    return
def process_gene_ids(self, limit):
    """
    Load gene identifiers from the gzipped gene_ids CSV.

    Rows look like ``6239,WBGene00000001,aap-1,Y110A7A.10,Live``.  Each
    gene becomes a ``WormBase:`` class typed as a gene and labelled with
    its symbol (falling back to the synonym column, then to no label).
    Genes whose status is ``Dead`` are marked deprecated.  Every gene is
    linked to its ``NCBITaxon:`` id and, when the synonym column is
    non-empty, receives that value as a synonym.

    :param limit: outside of test mode, stop once more than this many
        rows have been read; None disables the cap
    :return: None
    """
    gene_file = '/'.join((self.rawdir, self.files['gene_ids']['file']))
    graph = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())

    logger.info("Processing Gene IDs")
    geno = Genotype(graph)

    with gzip.open(gene_file, 'rb') as gz:
        text_stream = io.TextIOWrapper(gz, newline="")
        reader = csv.reader(text_stream, delimiter=',', quotechar='\"')
        for rows_seen, row in enumerate(reader, start=1):
            taxon_num, gene_num, gene_symbol, gene_synonym, live = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            taxon_id = 'NCBITaxon:' + taxon_num
            gene_id = 'WormBase:' + gene_num

            # symbol, else synonym, else no label at all
            label = gene_symbol or gene_synonym or None
            gu.addClassToGraph(
                graph, gene_id, label, Genotype.genoparts['gene'])

            if live == 'Dead':
                gu.addDeprecatedClass(graph, gene_id)
            geno.addTaxon(taxon_id, gene_id)
            if gene_synonym != '':
                gu.addSynonym(graph, gene_id, gene_synonym)

            if not self.testMode \
                    and limit is not None and rows_seen > limit:
                break

    return
def _get_titles(self, limit):
    """
    Read the tab-delimited titles file and add one class per book.

    After a single header row, every row is unpacked as
    ``(shortname, title, nbk_num)``.  The NBK number is always recorded
    in ``self.book_ids``.  The ``GeneReviews:<nbk_num>`` class (labelled
    with the title, with the short name as a synonym) is only written to
    the graph while the row number, counting the header as row 1, is
    below ``limit``.

    :param limit: rows numbered at or above this value are collected in
        ``self.book_ids`` but not written to the graph; None writes all
    :return: None
    """
    titles_path = '/'.join((self.rawdir, self.files['titles']['file']))
    gu = GraphUtils(curie_map.get())

    with open(titles_path, 'r', encoding='latin-1') as handle:
        reader = csv.reader(handle, delimiter='\t', quotechar='\"')
        for row_number, row in enumerate(reader, start=1):
            if row_number == 1:
                # header row
                continue

            shortname, title, nbk_num = row
            gr_id = 'GeneReviews:' + nbk_num
            self.book_ids.add(nbk_num)  # a global set of the book nums

            if limit is not None and row_number >= limit:
                continue
            gu.addClassToGraph(self.graph, gr_id, title)
            gu.addSynonym(self.graph, gr_id, shortname)

    return
def _get_gene_info(self, limit):
    """
    Loop through the gzipped gene_info file and create the genes as
    classes, typed via ``self._map_type_of_gene``.

    For each retained row this adds the gene's label (unless the symbol
    is 'NEWENTRY'), its name and alternate labels as synonyms, and its
    alternate ids as equivalent classes.  HPRD ids are added as gene
    products instead, and Vega / IMGT/GENE-DB ids are skipped.  The gene
    is placed as a subsequence of its chromosome band when the map
    location looks like a regular band, otherwise of the chromosome
    itself.  Finally the gene is linked to its taxon.

    Genes listed on more than one chromosome (other than the human
    X|Y pseudo-autosomal case) are logged and abandoned at that point.

    :param limit: outside of test mode, stop once more than this many
        rows have been retained; None disables the cap
    :return: None
    """
    gu = GraphUtils(curie_map.get())

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)

    # the file is read compressed; it is not unzipped first
    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
    logger.info("FILE: %s", myfile)

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', str(tax_num)))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        # label added elsewhere
        gu.addClassToGraph(g, tax_id, None)

    # a "regular" cytogenetic band such as 9q34.11; compiled once, as a
    # raw string so the backslashes are not invalid str escapes
    # TODO we probably need a different regex per organism
    band_pattern = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')

    with gzip.open(myfile, 'rb') as f:
        for line in f:
            # skip comments
            line = line.decode().strip()
            if line.startswith('#'):
                continue

            # `chrom` rather than `chr`, to leave the builtin alone
            (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
             map_loc, desc, gtype, authority_symbol, name,
             nomenclature_status, other_designations,
             modification_date) = line.split('\t')

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_type_id = self._map_type_of_gene(gtype)

            if symbol == 'NEWENTRY':
                label = None
            else:
                label = symbol

            # TODO might have to figure out if things aren't genes,
            # and make them individuals
            # genes are handled specially here because they are classes,
            # not individuals
            gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)

            if name != '-':
                gu.addSynonym(g, gene_id, name)
            if synonyms.strip() != '-':
                for s in synonyms.split('|'):
                    gu.addSynonym(
                        g, gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])
            if other_designations.strip() != '-':
                for s in other_designations.split('|'):
                    gu.addSynonym(
                        g, gene_id, s.strip(),
                        Assoc.annotation_properties['hasRelatedSynonym'])

            # deal with the xrefs, e.g.
            # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|
            # HPRD:11479|Vega:OTTHUMG00000020696
            if xrefs.strip() != '-':
                for r in xrefs.strip().split('|'):
                    fixedr = self._cleanup_id(r)
                    if fixedr is not None and fixedr.strip() != '':
                        if fixedr.startswith('HPRD'):
                            # proteins are not == genes.
                            gu.addTriple(
                                g, gene_id,
                                self.properties['has_gene_product'],
                                fixedr)
                        else:
                            # skip some of these for now
                            if fixedr.split(':')[0] not in \
                                    ['Vega', 'IMGT/GENE-DB']:
                                gu.addEquivalentClass(g, gene_id, fixedr)

            # edge cases of id | symbol | chr | map_loc:
            # 263    AMD1P2    X|Y   with Xq28 and Yq12
            # 438    ASMT      X|Y   with Xp22.3 or Yp11.3    # in PAR
            # 419    ART3      4     with 4q21.1|4p15.1-p14
            #     no idea why there's two bands listed
            #     - possibly 2 assemblies
            # 28227  PPP2R3B   X|Y   Xp22.33; Yp11.3          # in PAR
            # 619538 OMS 10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            #     this is of "unknown" type == susceptibility
            # 101928066 LOC101928066 1|Un -     # unlocated scaffold
            # 11435  Chrna1    2   2 C3|2 43.76 cM   # mouse --> 2C3
            # 11548  Adra1b    11  11 B1.1|11 25.81 cM
            #     mouse --> 11B1.1
            # 11717  Ampd3     7   7 57.85 cM|7 E2-E3    # mouse
            # 14421  B4galnt1  10  10 D3|10 74.5 cM      # mouse
            # 323212 wu:fb92e12   19|20   -              # fish
            # 323368 ints10       6|18    -              # fish
            # 323666 wu:fc06e02   11|23   -              # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed; with the exception of human X|Y,
            # i will only take those that align to one chr

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if chrom != '-' and chrom != '':
                if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                    # this means that there's uncertainty in the mapping.
                    # skip it
                    # TODO we'll need to figure out how to deal with
                    # >1 loc mapping
                    logger.info(
                        '%s is non-uniquely mapped to %s. Skipping for now.',
                        gene_id, chrom)
                    # NOTE(review): this also bypasses the addTaxon call
                    # and the limit check below for this row -- confirm
                    # that is intended
                    continue

                if chrom == 'X; Y':
                    # rewrite the PAR regions for processing
                    chrom = 'X|Y'

                # do this in a loop to allow PAR regions like X|Y
                for c in chrom.split('|'):
                    # assume that the chromosome label
                    # will get added elsewhere
                    geno.addChromosomeClass(c, tax_id, None)
                    mychrom = makeChromID(c, tax_num, 'CHR')
                    # temporarily use the taxnum
                    # for the disambiguating label
                    mychrom_syn = makeChromLabel(c, tax_num)
                    gu.addSynonym(g, mychrom, mychrom_syn)

                    if band_pattern.match(map_loc) is not None:
                        # this matches the regular kind of chrs,
                        # so make that kind of band
                        # the map_loc already has the chromosome in it;
                        # strip it first (plain prefix test, so the
                        # chromosome name is never treated as a regex)
                        if map_loc.startswith(c):
                            bid = map_loc[len(c):]
                        else:
                            bid = map_loc
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                        # assume its type will be added elsewhere
                        band = Feature(maploc_id, None, None)
                        band.addFeatureToGraph(g)
                        # add the band as the containing feature
                        gu.addTriple(
                            g, gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            maploc_id)
                    else:
                        # TODO handle these cases
                        # examples are: 15q11-q22, Xp21.2-p11.23,
                        # 15q22-qter, 10q11.1-q24,
                        # 12p13.3-p13.2|12p13-p12, 1p13.3|1p21.3-p13.1,
                        # 12cen-q21, 22q13.3|22q13.3
                        logger.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        gu.addTriple(
                            g, gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            mychrom)

            geno.addTaxon(tax_id, gene_id)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    return
class Monochrom(Source):
    """
    This class will leverage the GENO ontology and modeling patterns to
    build an ontology of chromosomes for any species. These classes
    represent major structural pieces of chromosomes which are often
    universally referenced, using physical properties/observations that
    remain constant over different genome builds (such as banding
    patterns and arms). The idea is to create a scaffold upon which we
    can hang build-specific chromosomal coordinates, and reason across
    them.

    In general, this will take the cytogenetic bands files from UCSC, and
    create missing grouping classes, in order to build the partonomy from
    a very specific chromosomal band up through the chromosome itself and
    enable overlap and containment queries. We use RO:subsequence_of as
    our relationship between nested chromosomal parts. For example,
    13q21.31 ==> 13q21.31, 13q21.3, 13q21, 13q2, 13q, 13

    The species handled are the taxa keyed in ``files`` below.

    Because this is a universal framework to represent the chromosomal
    structure of any species, we must mint identifiers for each
    chromosome and part. We differentiate species by first creating a
    species-specific genome, then for each species-specific chromosome we
    include the NCBI taxon number together with the chromosome number,
    like: ```<species number>chr<num><band>```. For 13q21.31, this would
    be 9606chr13q21.31.
    We then create triples for a given band like:
    <pre>
    CHR:9606chr1p36.33 rdf[type] SO:chromosome_band
    CHR:9606chr1p36.3 subsequence_of CHR:9606chr1p36
    </pre>
    where any band in the file is an instance of a chr_band (or a more
    specific type), and is a subsequence of its containing region.

    We determine the containing regions of the band by parsing the
    band-string; since each alphanumeric is a significant "place", we can
    split it with the shorter strings being parents of the longer string.

    Since this is small, and we have not limited other items in our test
    set to a small region, we simply use the whole graph (genome) for
    testing purposes, and copy the main graph to the test graph.

    Since this Dipper class is building an ONTOLOGY, rather than
    instance-level data, we must also include domain and range
    constraints, and other owl-isms.

    TODO: any species by commandline argument

    We are currently mapping these to the **CHR idspace**,
    but this is NOT YET APPROVED and is subject to change.
    """

    files = {
        '9606': {
            'file': '9606cytoBand.txt.gz',
            'url': MCDL + '/hg19/database/cytoBand.txt.gz',
            'build_num': 'hg19',
            'genome_label': 'Human'
        },
        '10090': {
            'file': '10090cytoBand.txt.gz',
            'url': MCDL + '/mm10/database/cytoBandIdeo.txt.gz',
            'build_num': 'mm10',
            'genome_label': 'Mouse'
        },
        # Note that there are no bands, arms or staining components
        # for the following genomes at the moment
        '7955': {
            'file': '7955cytoBand.txt.gz',
            'url': MCDL + '/danRer10/database/cytoBandIdeo.txt.gz',
            'build_num': 'danRer10',
            'genome_label': 'Zebrafish'
        },
        '10116': {
            'file': '10116cytoBand.txt.gz',
            'url': MCDL + '/rn6/database/cytoBandIdeo.txt.gz',
            'build_num': 'rn6',
            'genome_label': 'Rat'
        },
        '9913': {
            'file': 'bosTau7cytoBand.txt.gz',
            'url': MCDL + '/bosTau7/database/cytoBandIdeo.txt.gz',
            'build_num': 'bosTau7',
            'genome_label': 'cow'
        },
        '9031': {
            'file': 'galGal4cytoBand.txt.gz',
            'url': MCDL + '/galGal4/database/cytoBandIdeo.txt.gz',
            'build_num': 'galGal4',
            'genome_label': 'chicken'
        },
        '9823': {
            'file': 'susScr3cytoBand.txt.gz',
            'url': MCDL + '/susScr3/database/cytoBandIdeo.txt.gz',
            'build_num': 'susScr3',
            'genome_label': 'pig'
        },
        '9940': {
            'file': 'oviAri3cytoBand.txt.gz',
            'url': MCDL + '/oviAri3/database/cytoBandIdeo.txt.gz',
            'build_num': 'oviAri3',
            'genome_label': 'sheep'
        },
        '9796': {
            'file': 'equCab2cytoBand.txt.gz',
            'url': MCDL + '/equCab2/database/cytoBandIdeo.txt.gz',
            'build_num': 'equCab2',
            'genome_label': 'horse'
        },
    }

    # UCSC region/stain code (or internal part name) -> feature type id
    region_type_map = {
        'acen': Feature.types['centromere'],
        'gvar': Feature.types['chromosome_band'],
        'stalk': Feature.types['chromosome_band'],
        'gneg': Feature.types['chromosome_band'],
        'gpos100': Feature.types['chromosome_band'],
        'gpos25': Feature.types['chromosome_band'],
        'gpos33': Feature.types['chromosome_band'],
        'gpos50': Feature.types['chromosome_band'],
        'gpos66': Feature.types['chromosome_band'],
        'gpos75': Feature.types['chromosome_band'],
        'chromosome': Feature.types['chromosome'],
        'chromosome_arm': Feature.types['chromosome_arm'],
        'chromosome_band': Feature.types['chromosome_band'],
        'chromosome_part': Feature.types['chromosome_part']
    }

    def __init__(self, tax_ids=None):
        """
        :param tax_ids: iterable of NCBI taxon numbers to build; when
            None, every taxon listed in the defaults below is used.
            Any taxon without an entry in ``files`` raises an Exception.
        """
        super().__init__('monochrom')

        self.tax_ids = tax_ids
        self.load_bindings()
        self.gu = GraphUtils(curie_map.get())

        # Defaults
        if self.tax_ids is None:
            self.tax_ids = [
                9606, 10090, 7955, 10116, 9913, 9031, 9823, 9940, 9796]

        self._check_tax_ids()

        # TODO add license
        self.dataset = Dataset(
            'monochrom', 'Monarch Chromosome Ontology',
            'http://monarchinitiative.org', None,
            'http://creativecommons.org/licenses/by/4.0/')

        return

    def fetch(self, is_dl_forced=False):
        """
        Retrieve the source files via ``self.get_files``.

        :param is_dl_forced: passed straight through to ``get_files``
        :return: None
        """
        self.get_files(is_dl_forced)
        return

    def parse(self, limit=None):
        """
        Build the band partonomy for every configured taxon.

        :param limit: row cap forwarded to ``_get_chrbands`` for each
            taxon; None disables the cap
        :return: None
        """
        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        for taxon in self.tax_ids:
            self._get_chrbands(limit, str(taxon))

        self.load_core_bindings()
        self.load_bindings()

        # using the full graph as the test here
        self.testgraph = self.graph
        logger.info("Found %d nodes", len(self.graph))
        logger.info("Done parsing files.")

        return

    def _get_chrbands(self, limit, taxon):
        """
        For the given taxon, read its chr band file and build the
        partonomy.  The coordinate columns are read but not used.

        :param limit: stop once more than this many non-comment rows
            have been read and processed; None disables the cap
        :param taxon: NCBI taxon number as a string (a key of ``files``)
        :return: None
        """
        rows_read = 0
        band_file = '/'.join((self.rawdir, self.files[taxon]['file']))
        logger.info("Processing Chr bands from FILE: %s", band_file)
        geno = Genotype(self.graph)

        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:' + taxon

        # add the taxon as a class.  adding the class label elsewhere
        self.gu.addClassToGraph(self.graph, taxon_id, None)
        self.gu.addSynonym(self.graph, taxon_id, genome_label)

        self.gu.loadObjectProperties(self.graph, Feature.object_properties)

        genome_id = geno.makeGenomeID(taxon_id)
        geno.addGenome(taxon_id, genome_label)
        self.gu.addOWLPropertyClassRestriction(
            self.graph, genome_id,
            Genotype.object_properties['in_taxon'], taxon_id)

        # NOTE
        # some less-finished genomes have placed and unplaced scaffolds
        # * Placed scaffolds:
        #       Scaffold has an oriented location within a chromosome.
        # * Unlocalized scaffolds:
        #       scaffold's chromosome is known,
        #       scaffold's position, orientation or both is not known.
        # * Unplaced scaffolds:
        #       it is not known which chromosome the scaffold belongs to.
        # ex: unlocalized scaffold: chr10_KL568008v1_random
        # ex: unplaced scaffold: chrUn_AABR07022428v1
        placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'
        whole_chromosome = re.compile(placed_scaffold_pattern + r'$')

        subsequence_of = 'is_subsequence_of'
        has_subsequence = 'has_subsequence'

        with gzip.open(band_file, 'rb') as handle:
            for raw_line in handle:
                # skip comments
                record = raw_line.decode().strip()
                if re.match(r'^#', record):
                    continue

                # chr13	4500000	10000000	p12	stalk
                (chrom, start, stop, band, rtype) = record.split('\t')
                rows_read += 1

                # let's skip over anything that isn't a placed_scaffold
                # at the class level
                if whole_chromosome.match(chrom) is None:
                    logger.info("Skipping non-placed chromosome %s", chrom)
                    continue

                # the chrom class, taxon as the reference
                cclassid = makeChromID(chrom, taxon, 'CHR')

                # add the chromosome as a class
                geno.addChromosomeClass(chrom, taxon_id, genome_label)
                self.gu.addOWLPropertyClassRestriction(
                    self.graph, cclassid,
                    self.gu.object_properties['member_of'], genome_id)

                # add the band(region) as a class
                maplocclass_id = cclassid + band
                maplocclass_label = makeChromLabel(
                    chrom + band, genome_label)
                if band is not None and band.strip() != '':
                    region_type_id = self.map_type_of_region(rtype)
                    self.gu.addClassToGraph(
                        self.graph, maplocclass_id, maplocclass_label,
                        region_type_id)
                else:
                    region_type_id = Feature.types['chromosome']

                # add the staining intensity of the band
                if not re.match(r'g(neg|pos|var)', rtype):
                    logger.warning('staining type not found: %s', rtype)
                elif region_type_id not in [
                        Feature.types['chromosome_band'],
                        Feature.types['chromosome_subband']]:
                    # usually happens if it's a chromosome because
                    # they don't actually have banding info
                    logger.info(
                        "feature type %s != chr band", region_type_id)
                else:
                    stain_type = Feature.types.get(rtype)
                    if stain_type is not None:
                        self.gu.addOWLPropertyClassRestriction(
                            self.graph, maplocclass_id,
                            Feature.properties['has_staining_intensity'],
                            stain_type)

                # get the unique parent bands; a reverse alphabetical
                # sort orders them from most to least specific
                parents = sorted(
                    self.make_parent_bands(band, set()), reverse=True)

                # add the parents to the graph, in hierarchical order
                # TODO this is somewhat inefficient due to
                # re-adding upper-level nodes when iterating over the file
                for idx, parent_band in enumerate(parents):
                    pclassid = cclassid + parent_band  # class chr parts
                    pclass_label = makeChromLabel(
                        chrom + parent_band, genome_label)
                    rti = getChrPartTypeByNotation(parent_band)
                    self.gu.addClassToGraph(
                        self.graph, pclassid, pclass_label, rti)

                    # each part sits inside the next, less specific one;
                    # the last (p or q usually) sits in the chromosome
                    if idx < len(parents) - 1:
                        container_id = cclassid + parents[idx + 1]
                    else:
                        container_id = cclassid

                    # add the subsequence stuff as restrictions
                    self.gu.addOWLPropertyClassRestriction(
                        self.graph, pclassid,
                        Feature.object_properties[subsequence_of],
                        container_id)
                    self.gu.addOWLPropertyClassRestriction(
                        self.graph, container_id,
                        Feature.object_properties[has_subsequence],
                        pclassid)

                # connect the band here to the first one in the parent list
                if len(parents) > 0:
                    nearest_id = cclassid + parents[0]
                    self.gu.addOWLPropertyClassRestriction(
                        self.graph, maplocclass_id,
                        Feature.object_properties[subsequence_of],
                        nearest_id)
                    self.gu.addOWLPropertyClassRestriction(
                        self.graph, nearest_id,
                        Feature.object_properties[has_subsequence],
                        maplocclass_id)

                if limit is not None and rows_read > limit:
                    break

        self.gu.loadAllProperties(self.graph)

        # TODO figure out the staining intensities
        # for the encompassing bands

        return

    def make_parent_bands(self, band, child_bands):
        """
        Collect the grouping bands that contain ``band``.

        The band is shortened one character at a time (dropping a
        trailing '.') for as long as it still starts with an arm letter
        followed by a band character.  Every shortened form is added to
        ``child_bands``, e.g.
        q21.31 ==> q21.3, q21, q2, q

        :param band: band notation without the chromosome prefix
        :param child_bands: set to add the containing bands to
            (mutated in place)
        :return: ``child_bands``, or a new empty set when ``band`` is
            empty
        """
        if len(band) == 0:
            return set()

        current = band
        while re.match(r'([pq][A-H\d]+(?:\.\d+)?)', current):
            current = re.sub(r'\.$', '', str(current[0:len(current)-1]))
            child_bands.add(current)

        return child_bands

    def map_type_of_region(self, regiontype):
        """
        Translate a region/stain code into its feature type id.

        Note that "stalk" refers to the short arm of acrocentric
        chromosomes chr13,14,15,21,22 for human.

        :param regiontype: a key of ``region_type_map``
        :return: the mapped type id; unmapped codes are logged and fall
            back to ``Feature.types['chromosome_part']``
        """
        fallback = Feature.types['chromosome_part']

        if regiontype not in self.region_type_map:
            logger.warning(
                "Unmapped code %s. Defaulting to chr_part 'SO:0000830'.",
                regiontype)
            return fallback

        return self.region_type_map.get(regiontype)

    def _check_tax_ids(self):
        """
        Verify that every requested taxon has an entry in ``files``.

        :raises Exception: for the first taxon that is not supported
        """
        for taxon in self.tax_ids:
            if str(taxon) in self.files:
                continue
            raise Exception("Taxon " + str(taxon) +
                            " not supported by source Monochrom")

    def getTestSuite(self):
        """
        :return: None; no test suite is wired up for this source yet
        """
        # import unittest
        # from tests.test_ucscbands import UCSCBandsTestCase
        # unittest.TestLoader().loadTestsFromTestCase(UCSCBandsTestCase)
        return None
def _process_diseasegene(self, limit):
    """
    Parse the disease-gene XML file and add gene-to-disorder
    associations to the graph.

    For every ``Disorder`` element the disorder is added as a class.
    Each ``DisorderGeneAssociation`` then yields the gene as a typed
    class with its synonyms, an anonymous "variant locus" individual
    that is an allele of that gene, a G2P association from the variant
    locus to the disorder (with an ECO evidence code chosen from the
    association status), and equivalences to the gene's Ensembl, HGNC
    and OMIM references.  Associations whose type cannot be mapped by
    ``self._map_rel_id`` are logged and skipped.

    :param limit: outside of test mode, stop once more than this many
        disorders have been processed; None disables the cap
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    geno = Genotype(g)
    gu = GraphUtils(curie_map.get())

    myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    # external reference source name -> curie prefix; other sources are
    # skipped for now
    xref_prefixes = {
        'Ensembl': 'ENSEMBL:',
        'HGNC': 'HGNC:',
        'OMIM': 'OMIM:',
    }

    # PYLINT complains iterparse deprecated,
    # but as of py 3.4 only the optional & unsupplied parse arg is.
    for event, elem in ET.iterparse(myfile):
        if elem.tag != 'Disorder':
            continue

        # get the element name and id, ignores element name
        # id = elem.get('id') # some internal identifier
        disorder_num = elem.find('OrphaNumber').text
        disorder_id = 'Orphanet:' + str(disorder_num)

        if self.testMode and \
                disorder_id not in \
                config.get_config()['test_ids']['disease']:
            continue

        # count processed disorders so that `limit` actually applies
        line_counter += 1

        disorder_label = elem.find('Name').text

        # make a hash of internal gene id to type for later lookup
        gene_iid_to_type = {}
        gene_list = elem.find('GeneList')
        for gene in gene_list.findall('Gene'):
            gene_iid = gene.get('id')
            gene_type = gene.find('GeneType').get('id')
            gene_iid_to_type[gene_iid] = gene_type

        # assuming that these are in the ontology
        gu.addClassToGraph(g, disorder_id, disorder_label)

        assoc_list = elem.find('DisorderGeneAssociationList')
        for a in assoc_list.findall('DisorderGeneAssociation'):
            gene_iid = a.find('.//Gene').get('id')
            gene_name = a.find('.//Gene/Name').text
            gene_symbol = a.find('.//Gene/Symbol').text
            gene_num = a.find('./Gene/OrphaNumber').text
            gene_id = 'Orphanet:' + str(gene_num)
            gene_type_id = \
                self._map_gene_type_id(gene_iid_to_type[gene_iid])
            gu.addClassToGraph(
                g, gene_id, gene_symbol, gene_type_id, gene_name)
            syn_list = a.find('./Gene/SynonymList')
            if int(syn_list.get('count')) > 0:
                for s in syn_list.findall('./Synonym'):
                    gu.addSynonym(g, gene_id, s.text)

            dgtype = a.find('DisorderGeneAssociationType').get('id')
            rel_id = self._map_rel_id(dgtype)
            dg_label = \
                a.find('./DisorderGeneAssociationType/Name').text
            if rel_id is None:
                logger.warning(
                    "Cannot map association type (%s) to RO " +
                    "for association (%s | %s). Skipping.",
                    dg_label, disorder_label, gene_symbol)
                continue

            alt_locus_id = '_' + gene_num + '-' + disorder_num + 'VL'
            alt_label = \
                ' '.join(('some variant of', gene_symbol.strip(),
                          'that is a', dg_label.lower(),
                          disorder_label))
            if self.nobnodes:
                alt_locus_id = ':' + alt_locus_id
            gu.addIndividualToGraph(g, alt_locus_id, alt_label,
                                    geno.genoparts['variant_locus'])
            geno.addAlleleOfGene(alt_locus_id, gene_id)

            # consider typing the gain/loss-of-function variants like:
            # http://sequenceontology.org/browser/current_svn/term/SO:0002054
            # http://sequenceontology.org/browser/current_svn/term/SO:0002053

            # use "assessed" status to issue an evidence code
            # FIXME I think that these codes are sub-optimal
            status_code = \
                a.find('DisorderGeneAssociationStatus').get('id')
            # imported automatically asserted information
            # used in automatic assertion
            eco_id = 'ECO:0000323'
            # Assessed
            # TODO are these internal ids stable between releases?
            if status_code == '17991':
                # imported manually asserted information
                # used in automatic assertion
                eco_id = 'ECO:0000322'
            # Non-traceable author statement ECO_0000034
            # imported information in automatic assertion ECO_0000313

            assoc = G2PAssoc(self.name, alt_locus_id,
                             disorder_id, rel_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph(g)

            rlist = a.find('./Gene/ExternalReferenceList')
            for r in rlist.findall('ExternalReference'):
                prefix = xref_prefixes.get(r.find('Source').text)
                if prefix is None:
                    # unrecognised source: emit nothing for this
                    # reference (rather than repeating the previous one)
                    continue
                eqid = prefix + r.find('Reference').text
                gu.addClassToGraph(g, eqid, None)
                gu.addEquivalentClass(g, gene_id, eqid)

        elem.clear()  # discard the element

        # same guard as the sibling parsers: the cap only applies outside
        # of test mode, and breaking (not returning) keeps the property
        # loading below
        if not self.testMode and \
                limit is not None and line_counter > limit:
            break

    gu.loadProperties(
        g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
    gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
    gu.loadAllProperties(g)

    return
class OMIA(Source):
    """
    This is the parser for the
    [Online Mendelian Inheritance in Animals
    (OMIA)](http://omia.angis.org.au),
    from which we process inherited disorders, other (single-locus) traits,
    and genes in >200 animal species (other than human and mouse and rats).

    We generate the omia graph to include the following information:
    * genes
    * animal taxonomy, and breeds as instances of those taxa
        (breeds are akin to "strains" in other taxa)
    * animal diseases, along with species-specific subtypes of those diseases
    * publications (and their mapping to PMIDs, if available)
    * gene-to-phenotype associations (via an anonymous variant-locus)
    * breed-to-phenotype associations

    We make links between OMIA and OMIM in two ways:
    1.  mappings between OMIA and OMIM are created as OMIA --> hasdbXref OMIM
    2.  mappings between a breed and OMIA disease are created
        to be a model for the mapped OMIM disease,
        IF AND ONLY IF it is a 1:1 mapping.
        there are some 1:many mappings,
        and these often happen if the OMIM item is a gene.

    Because many of these species are not covered in
    the PANTHER orthology datafiles, we also pull any orthology
    relationships from the gene_group files from NCBI.

    """

    files = {
        'data': {
            'file': 'omia.xml.gz',
            'url': 'http://omia.angis.org.au/dumps/omia.xml.gz'},
    }

    def __init__(self):
        """Set up the dataset record, lookup tables and test identifiers."""
        Source.__init__(self, 'omia')

        self.load_bindings()

        self.dataset = Dataset(
            'omia', 'Online Mendelian Inheritance in Animals',
            'http://omia.angis.org.au', None, None,
            'http://sydney.edu.au/disclaimer.shtml')

        # internal table key --> external identifier, one dict per table
        self.id_hash = {
            'article': {},
            'phene': {},
            'breed': {},
            'taxon': {},
            'gene': {}
        }
        # identifier --> label, filled while the class tables are processed
        self.label_hash = {}
        self.gu = GraphUtils(curie_map.get())
        # used to store the omia to omim phene mappings
        self.omia_omim_map = {}
        # used to store the unique genes that have phenes
        # (for fetching orthology)
        self.annotated_genes = set()

        self.test_ids = {
            'disease': [
                'OMIA:001702', 'OMIA:001867', 'OMIA:000478', 'OMIA:000201',
                'OMIA:000810', 'OMIA:001400'],
            'gene': [
                492297, 434, 492296, 3430235, 200685834, 394659996,
                200685845, 28713538, 291822383],
            'taxon': [9691, 9685, 9606, 9615, 9913, 93934, 37029, 9627, 9825],
            # to be filled in during parsing of breed table
            # for lookup by breed-associations
            'breed': []
        }
        # to store a map of omia ids and any molecular info
        # to write a report for curation
        self.stored_omia_mol_gen = {}
        self.g = self.graph
        self.geno = Genotype(self.g)

    def fetch(self, is_dl_forced=False):
        """
        Fetch the OMIA dump plus the NCBI gene_group file that is later
        used for orthology lookups.

        :param is_dl_forced: passed through to self.get_files
        :return:

        """
        self.get_files(is_dl_forced)

        ncbi = NCBIGene()
        # ncbi.fetch()
        gene_group = ncbi.files['gene_group']
        self.fetch_from_url(
            gene_group['url'],
            '/'.join((ncbi.rawdir, gene_group['file'])), False)

    def parse(self, limit=None):
        """
        Scrub the raw file, run the three passes over it (species, classes,
        associations), add orthologs for the annotated genes and write the
        molecular-genetics curation report.

        :param limit: handed on to each of the three processing passes
        :return:

        """
        # names of tables to iterate - probably don't need all these:
        # Article_Breed, Article_Keyword, Article_Gene, Article_Keyword,
        # Article_People, Article_Phene, Articles, Breed, Breed_Phene,
        # Genes_gb, Group_Categories, Group_MPO, Inherit_Type, Keywords,
        # Landmark, Lida_Links, OMIA_Group, OMIA_author, Omim_Xref, People,
        # Phene, Phene_Gene, Publishers, Resources, Species_gb, Synonyms

        self.scrub()

        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        if self.testMode:
            self.g = self.testgraph
        else:
            self.g = self.graph
        self.geno = Genotype(self.g)

        # we do three passes through the file
        # first process species (two others reference this one)
        self.process_species(limit)

        # then, process the breeds, genes, articles, and other static stuff
        self.process_classes(limit)

        # next process the association data
        self.process_associations(limit)

        # process the vertebrate orthology for genes
        # that are annotated with phenotypes
        ncbi = NCBIGene()
        ncbi.add_orthologs_by_gene_group(self.g, self.annotated_genes)

        self.load_core_bindings()
        self.load_bindings()

        logger.info("Done parsing.")

        self.write_molgen_report()

    def scrub(self):
        """
        The XML file seems to have mixed-encoding;
        we scrub out the control characters
        from the file for processing.

        The cleaned copy is written to a temporary gzip file which then
        replaces the original data file.

        :return:

        """
        logger.info(
            "Scrubbing out the nasty characters that break our parser.")

        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        tmpfile = '/'.join((self.rawdir, self.files['data']['file']+'.tmp.gz'))
        du = DipperUtil()
        # both handles are context-managed so they are closed (and the
        # gzip trailer is flushed) even when scrubbing fails part-way
        with gzip.open(tmpfile, 'wb') as scrubbed, \
                gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")
            for line in filereader:
                line = du.remove_control_characters(line) + '\n'
                scrubbed.write(line.encode('utf-8'))

        # move the temp file
        logger.info("Replacing the original data with the scrubbed file.")
        shutil.move(tmpfile, myfile)

    # ###################### XML LOOPING FUNCTIONS ##################

    def process_species(self, limit):
        """
        Loop through the xml file and process the species.
        We add elements to the graph, and store the
        id-to-label in the label_hash dict.
        :param limit:
        :return:

        """
        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")
            filereader.readline()  # remove the xml declaration line
            for event, elem in ET.iterparse(filereader):
                # Species ids are == genbank species ids!
                self.process_xml_table(
                    elem, 'Species_gb', self._process_species_table_row,
                    limit)

    def process_classes(self, limit):
        """
        Loop through the xml file and process the articles,
        breed, genes, phenes, and phenotype-grouping classes.
        We add elements to the graph,
        and store the id-to-label in the label_hash dict,
        along with the internal key-to-external id in the id_hash dict.
        The latter are referenced in the association processing functions.

        :param limit:
        :return:

        """
        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")
            filereader.readline()  # remove the xml declaration line

            parser = ET.XMLParser(encoding='utf-8')

            for event, elem in ET.iterparse(filereader, parser=parser):
                self.process_xml_table(
                    elem, 'Articles', self._process_article_row, limit)
                self.process_xml_table(
                    elem, 'Breed', self._process_breed_row, limit)
                self.process_xml_table(
                    elem, 'Genes_gb', self._process_gene_row, limit)
                self.process_xml_table(
                    elem, 'OMIA_Group', self._process_omia_group_row, limit)
                self.process_xml_table(
                    elem, 'Phene', self._process_phene_row, limit)
                self.process_xml_table(
                    elem, 'Omim_Xref', self._process_omia_omim_map, limit)

        # post-process the omia-omim associations to filter out the genes
        # (keep only phenotypes/diseases)
        self.clean_up_omim_genes()

    def process_associations(self, limit):
        """
        Loop through the xml file and process the article-breed,
        article-phene, breed-phene, phene-gene associations,
        and the external links to LIDA.

        :param limit:
        :return:

        """
        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")
            filereader.readline()  # remove the xml declaration line
            for event, elem in ET.iterparse(filereader):
                self.process_xml_table(
                    elem, 'Article_Breed', self._process_article_breed_row,
                    limit)
                self.process_xml_table(
                    elem, 'Article_Phene', self._process_article_phene_row,
                    limit)
                self.process_xml_table(
                    elem, 'Breed_Phene', self._process_breed_phene_row,
                    limit)
                self.process_xml_table(
                    elem, 'Lida_Links', self._process_lida_links_row, limit)
                self.process_xml_table(
                    elem, 'Phene_Gene', self._process_phene_gene_row, limit)
                self.process_xml_table(
                    elem, 'Group_MPO', self._process_group_mpo_row, limit)

    # ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################

    def _is_test_gene(self, gene_key):
        """
        Tell whether a gene key from the XML belongs to the test gene set.

        Row values are strings while self.test_ids['gene'] holds ints,
        so the key is converted before the membership test.

        :param gene_key: the 'gene_id' value of a table row
        :return: True if the key is numeric and listed in the test genes

        """
        try:
            return int(gene_key) in self.test_ids['gene']
        except (TypeError, ValueError):
            return False

    def _process_species_table_row(self, row):
        """
        Add the taxon class and remember its label
        (the common name when present, else the scientific name).

        :param row: dict of a Species_gb row
        :return:

        """
        # gb_species_id, sci_name, com_name, added_by, date_modified
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        sci_name = row['sci_name']
        com_name = row['com_name']

        if self.testMode and \
                (int(row['gb_species_id']) not in self.test_ids['taxon']):
            return

        self.gu.addClassToGraph(self.g, tax_id, sci_name)
        if com_name != '':
            self.gu.addSynonym(self.g, tax_id, com_name)
            self.label_hash[tax_id] = com_name  # for lookup later
        else:
            self.label_hash[tax_id] = sci_name

    def _process_breed_row(self, row):
        """
        Add the breed as an individual of its taxon and record its
        identifier and label for the association pass.

        :param row: dict of a Breed row
        :return:

        """
        # in test mode, keep all breeds of our test species
        if self.testMode and \
                (int(row['gb_species_id']) not in self.test_ids['taxon']):
            return

        # save the breed keys in the test_ids for later processing
        self.test_ids['breed'].append(int(row['breed_id']))

        breed_id = self.make_breed_id(row['breed_id'])

        self.id_hash['breed'][row['breed_id']] = breed_id
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        breed_label = row['breed_name']
        species_label = self.label_hash.get(tax_id)
        if species_label is not None:
            breed_label = breed_label + ' ('+species_label+')'

        self.gu.addIndividualToGraph(self.g, breed_id, breed_label, tax_id)
        self.label_hash[breed_id] = breed_label

    def _process_phene_row(self, row):
        """
        Add the species-specific phene as a subclass of its OMIA group,
        together with its descriptions, taxon restriction and inheritance.

        :param row: dict of a Phene row
        :return:

        """
        sp_phene_label = row['phene_name']
        if sp_phene_label == '':
            sp_phene_label = None
        if 'omia_id' not in row:
            logger.info("omia_id not present for %s", row['phene_id'])
            # key the fallback id on this row's phene_id so that
            # distinct phenes do not collapse onto a single identifier
            omia_id = self._make_internal_id('phene', row['phene_id'])
        else:
            omia_id = 'OMIA:'+str(row['omia_id'])

        if self.testMode and not (
                int(row['gb_species_id']) in self.test_ids['taxon'] and
                omia_id in self.test_ids['disease']):
            return

        # add to internal hash store for later lookup
        self.id_hash['phene'][row['phene_id']] = omia_id

        descr = row['summary']
        if descr == '':
            descr = None

        # omia label
        omia_label = self.label_hash.get(omia_id)

        # add the species-specific subclass (TODO please review this choice)
        gb_species_id = row['gb_species_id']

        if gb_species_id != '':
            sp_phene_id = '-'.join((omia_id, gb_species_id))
        else:
            logger.error(
                "No species supplied in species-specific phene table for %s",
                omia_id)
            return

        species_id = 'NCBITaxon:'+str(gb_species_id)
        species_label = self.label_hash.get(species_id)
        if sp_phene_label is None and \
                omia_label is not None and species_label is not None:
            sp_phene_label = ' '.join((omia_label, 'in', species_label))
        self.gu.addClassToGraph(
            self.g, sp_phene_id, sp_phene_label, omia_id, descr)
        # add to internal hash store for later lookup
        # (replaces the group-level id stored above)
        self.id_hash['phene'][row['phene_id']] = sp_phene_id
        self.label_hash[sp_phene_id] = sp_phene_label
        # add each of the following descriptions,
        # if they are populated, with a tag at the end.
        for item in [
                'clin_feat', 'history', 'pathology', 'mol_gen', 'control']:
            if row[item] is not None and row[item] != '':
                self.gu.addDescription(
                    self.g, sp_phene_id, row[item] + ' ['+item+']')
        # if row['symbol'] is not None:  # species-specific
        # CHECK ME - sometimes spaces or gene labels
        #     gu.addSynonym(g, sp_phene, row['symbol'])

        self.gu.addOWLPropertyClassRestriction(
            self.g, sp_phene_id, self.gu.object_properties['in_taxon'],
            species_id)

        # add inheritance as an association
        inheritance_id = self._map_inheritance_term_id(row['inherit'])
        if inheritance_id is not None:
            assoc = DispositionAssoc(self.name, sp_phene_id, inheritance_id)
            assoc.add_association_to_graph(self.g)

        if row['characterised'] == 'Yes':
            self.stored_omia_mol_gen[omia_id] = {
                'mol_gen': row['mol_gen'],
                'map_info': row['map_info'],
                'species': row['gb_species_id']}

    def write_molgen_report(self):
        """
        Write the stored molecular-genetics notes as a tab-delimited
        file (omia_molgen_report.txt in self.outdir) for curation.

        :return:

        """
        import csv
        logger.info("Writing G2P report for OMIA")
        f = '/'.join((self.outdir, 'omia_molgen_report.txt'))

        with open(f, 'w', newline='\n') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t')
            # write header
            h = ['omia_id', 'molecular_description', 'mapping_info', 'species']
            writer.writerow(h)
            for phene in self.stored_omia_mol_gen:
                writer.writerow((str(phene),
                                 self.stored_omia_mol_gen[phene]['mol_gen'],
                                 self.stored_omia_mol_gen[phene]['map_info'],
                                 self.stored_omia_mol_gen[phene]['species']))

        logger.info(
            "Wrote %d potential G2P descriptions for curation to %s",
            len(self.stored_omia_mol_gen), f)

    def _process_article_row(self, row):
        """
        Add the article as a reference and record its identifier
        (the PMID when one is given, else an internal id).

        :param row: dict of an Articles row
        :return:

        """
        # don't bother in test mode
        if self.testMode:
            return

        iarticle_id = self._make_internal_id('article', row['article_id'])
        self.id_hash['article'][row['article_id']] = iarticle_id
        rtype = None
        if row['journal'] != '':
            rtype = Reference.ref_types['journal_article']
        r = Reference(iarticle_id, rtype)

        if row['title'] is not None:
            r.setTitle(row['title'].strip())
        if row['year'] is not None:
            r.setYear(row['year'])
        r.addRefToGraph(self.g)

        if row['pubmed_id'] is not None:
            pmid = 'PMID:'+str(row['pubmed_id'])
            self.id_hash['article'][row['article_id']] = pmid
            self.gu.addSameIndividual(self.g, iarticle_id, pmid)
            self.gu.addComment(self.g, pmid, iarticle_id)

    def _process_omia_group_row(self, row):
        """
        Add the OMIA group as a class, placed under the disease class
        mapped from its group category (DOID:4 when nothing is mapped).

        :param row: dict of an OMIA_Group row
        :return:

        """
        omia_id = 'OMIA:'+row['omia_id']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        group_name = row['group_name']
        group_summary = row['group_summary']

        group_category = row.get('group_category')
        disease_id = \
            self.map_omia_group_category_to_ontology_id(group_category)
        if disease_id is not None:
            self.gu.addClassToGraph(self.g, disease_id, None)
            if disease_id == 'MP:0008762':  # embryonic lethal
                # add this as a phenotype association
                # add embryonic onset
                assoc = D2PAssoc(self.name, omia_id, disease_id)
                assoc.add_association_to_graph(self.g)
                # a phenotype is not used as the superclass below
                disease_id = None
        else:
            logger.info(
                "No disease superclass defined for %s:  %s",
                omia_id, group_name)
            # default to general disease  FIXME this may not be desired
            disease_id = 'DOID:4'

        if group_summary == '':
            group_summary = None
        if group_name == '':
            group_name = None

        self.gu.addClassToGraph(
            self.g, omia_id, group_name, disease_id, group_summary)

        self.label_hash[omia_id] = group_name

    def _process_gene_row(self, row):
        """
        Add the gene as a class in its taxon and record its identifier
        and symbol for the association pass.

        :param row: dict of a Genes_gb row
        :return:

        """
        # row values are strings; the test ids are ints
        if self.testMode and not self._is_test_gene(row['gene_id']):
            return
        gene_id = 'NCBIGene:'+str(row['gene_id'])
        self.id_hash['gene'][row['gene_id']] = gene_id
        gene_label = row['symbol']
        self.label_hash[gene_id] = gene_label
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        gene_type_id = NCBIGene.map_type_of_gene(row['gene_type'])
        self.gu.addClassToGraph(self.g, gene_id, gene_label, gene_type_id)
        self.geno.addTaxon(tax_id, gene_id)

    def _process_article_breed_row(self, row):
        """
        Link an article to the breed it is about.

        :param row: dict of an Article_Breed row
        :return:

        """
        # article_id, breed_id, added_by
        # don't bother putting these into the test... too many!

        # and int(row['breed_id']) not in self.test_ids['breed']:
        if self.testMode:
            return

        article_id = self.id_hash['article'].get(row['article_id'])
        breed_id = self.id_hash['breed'].get(row['breed_id'])

        # there's some missing data (article=6038).  in that case skip
        if article_id is None:
            logger.warning("Missing article key %s", str(row['article_id']))
            return
        # likewise never emit a triple whose object is None
        if breed_id is None:
            logger.warning("Missing breed key %s", str(row['breed_id']))
            return

        self.gu.addTriple(
            self.g, article_id, self.gu.object_properties['is_about'],
            breed_id)

    def _process_article_phene_row(self, row):
        """
        Linking articles to species-specific phenes.

        :param row:
        :return:

        """
        # article_id, phene_id, added_by
        # look up the article in the hashmap
        phenotype_id = self.id_hash['phene'].get(row['phene_id'])
        article_id = self.id_hash['article'].get(row['article_id'])

        omia_id = self._get_omia_id_from_phene_id(phenotype_id)
        if (self.testMode and omia_id not in self.test_ids['disease']) \
                or phenotype_id is None or article_id is None:
            return

        # make a triple, where the article is about the phenotype
        self.gu.addTriple(
            self.g, article_id,
            self.gu.object_properties['is_about'], phenotype_id)

    def _process_breed_phene_row(self, row):
        """
        Associate a breed with a phene and, for every OMIM id mapped to
        the phene's OMIA group, state that the breed models that disease.

        :param row: dict of a Breed_Phene row
        :return:

        """
        # Linking disorders/characteristic to breeds
        # breed_id, phene_id, added_by
        breed_id = self.id_hash['breed'].get(row['breed_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        # get the omia id
        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if (self.testMode and not (
                omia_id in self.test_ids['disease'] and
                int(row['breed_id']) in self.test_ids['breed'])) \
                or breed_id is None or phene_id is None:
            return

        # FIXME we want a different relationship here
        assoc = G2PAssoc(
            self.name, breed_id, phene_id,
            self.gu.object_properties['has_phenotype'])
        assoc.add_association_to_graph(self.g)

        # add that the breed is a model of the human disease
        # use the omia-omim mappings for this
        # we assume that we have already scrubbed out the genes
        # from the omim list, so we can make the model associations here

        omim_ids = self.omia_omim_map.get(omia_id)
        eco_id = "ECO:0000214"   # biological aspect of descendant evidence
        if omim_ids is not None and len(omim_ids) > 0:
            if len(omim_ids) > 1:
                logger.info(
                    "There's 1:many omia:omim mapping: %s, %s",
                    omia_id, str(omim_ids))
            for i in omim_ids:
                assoc = G2PAssoc(
                    self.name, breed_id, i,
                    self.gu.object_properties['model_of'])
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph(self.g)
                aid = assoc.get_association_id()

                breed_label = self.label_hash.get(breed_id)
                if breed_label is None:
                    breed_label = "this breed"

                # the species is the parenthesised part of the breed label
                m = re.search(r'\((.*)\)', breed_label)
                sp_label = m.group(1) if m else ''

                phene_label = self.label_hash.get(phene_id)
                if phene_label is None:
                    phene_label = "phenotype"
                elif sp_label != '' and phene_label.endswith(sp_label):
                    # some of the labels we made already include the species;
                    # remove it to make a cleaner desc.
                    # Every string ends with '', so an empty species label
                    # must not reach this branch; the label is escaped
                    # because it is data, not a pattern.
                    phene_label = re.sub(
                        r' in '+re.escape(sp_label), '', phene_label)
                desc = ' '.join(
                    ("High incidence of", phene_label, "in", breed_label,
                     "suggests it to be a model of disease", i + "."))
                self.gu.addDescription(self.g, aid, desc)

    def _process_lida_links_row(self, row):
        """
        Add the LIDA url as a cross-reference of the OMIA group.

        :param row: dict of a Lida_Links row
        :return:

        """
        # lidaurl, omia_id, added_by
        omia_id = 'OMIA:'+row['omia_id']
        lidaurl = row['lidaurl']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        self.gu.addXref(self.g, omia_id, lidaurl, True)

    def _process_phene_gene_row(self, row):
        """
        Associate "some variant of" a gene with a phene, and remember
        the gene for the later orthology lookup.

        :param row: dict of a Phene_Gene row
        :return:

        """
        gene_id = self.id_hash['gene'].get(row['gene_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        omia_id = self._get_omia_id_from_phene_id(phene_id)

        # row values are strings; the test ids are ints
        if self.testMode and not (
                omia_id in self.test_ids['disease'] and
                self._is_test_gene(row['gene_id'])):
            return

        if gene_id is None:
            return

        # occasionally some phenes are missing!  (ex: 406)
        if phene_id is None:
            logger.warning("Phene id %s is missing", str(row['phene_id']))
            return

        gene_label = self.label_hash[gene_id]
        # some variant of gene_id has phenotype d
        vl = '_'+re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL'
        if self.nobnodes:
            vl = ':'+vl
        self.geno.addAllele(vl, 'some variant of ' + gene_label)
        self.geno.addAlleleOfGene(vl, gene_id)
        assoc = G2PAssoc(self.name, vl, phene_id)
        assoc.add_association_to_graph(self.g)

        # add the gene id to the set of annotated genes
        # for later lookup by orthology
        self.annotated_genes.add(gene_id)

    def _process_omia_omim_map(self, row):
        """
        Links OMIA groups to OMIM equivalents.
        :param row:
        :return:

        """
        # omia_id, omim_id, added_by
        omia_id = 'OMIA:'+row['omia_id']
        omim_id = 'OMIM:'+row['omim_id']

        # also store this for use when we say that a given animal is
        # a model of a disease
        if omia_id not in self.omia_omim_map:
            self.omia_omim_map[omia_id] = set()
        self.omia_omim_map[omia_id].add(omim_id)

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        self.gu.addXref(self.g, omia_id, omim_id)

    def map_omia_group_category_to_ontology_id(self, category_num):
        """
        Using the category number in the OMIA_groups table,
        map them to a disease id.
        This may be superceeded by other MONDO methods.

        Platelet disorders will be more specific once
        https://github.com/obophenotype/human-disease-ontology/issues/46
        is fulfilled.

        :param category_num: category key; anything int() accepts, or None
        :return: the mapped identifier, or None when the category is
            missing, not numeric, unknown, or deliberately unmapped

        """
        category_map = {
            1: 'DOID:0014667',      # Inborn error of metabolism
            2: 'MESH:D004392',      # Dwarfism
            3: 'DOID:1682',         # congenital heart disease
            4: 'DOID:74',           # blood system disease
            5: 'DOID:3211',         # lysosomal storage disease
            6: 'DOID:16',           # integumentary system disease
            # --> retinal degeneration ==> OMIA:000830
            7: 'DOID:8466',         # progressive retinal atrophy
            8: 'DOID:0050572',      # Cone–rod dystrophy
            9: 'MESH:C536122',      # stationary night blindness
            10: 'Orphanet:98553',   # developmental retinal disorder
            11: 'DOID:5679',        # retinal disorder
            12: 'Orphanet:90771',   # Disorder of Sex Development
            # - what to do about this one?
            13: 'MP:0008762',       # embryonic lethal
            # - not sure what to do with this
            14: None,               # blood group
            # FIXME make me more specific
            15: 'DOID:2218',        # intrinsic platelet disorder
            # FIXME make me more specific
            16: 'DOID:2218',        # extrinsic platelet disorder
            17: None                # transgenic ???
        }

        # an empty or non-numeric category is treated as unknown
        # rather than aborting the parse
        try:
            category_key = int(category_num)
        except (TypeError, ValueError):
            category_key = None

        disease_id = None
        if category_key is not None and category_key in category_map:
            disease_id = category_map.get(category_key)
            logger.info(
                "Found %s for category %s",
                str(disease_id), str(category_num))
        else:
            logger.info(
                "There's a group category I don't know anything about: %s",
                str(category_num))

        return disease_id

    def _process_group_mpo_row(self, row):
        """
        Make OMIA to MP associations
        :param row:
        :return:

        """
        omia_id = 'OMIA:'+row['omia_id']
        mpo_num = int(row['MPO_no'])
        mpo_id = 'MP:'+str(mpo_num).zfill(7)

        assoc = D2PAssoc(self.name, omia_id, mpo_id)
        assoc.add_association_to_graph(self.g)

    def clean_up_omim_genes(self):
        """
        Drop from the omia-to-omim map every OMIM id that the OMIM
        lookup does not report back as a phenotype entry.

        :return:

        """
        omim = OMIM()
        # get all the omim ids
        # NOTE(review): these ids carry the 'OMIM:' prefix added in
        # _process_omia_omim_map - confirm that OMIM.process_entries
        # accepts prefixed ids.
        allomimids = set()
        for omia in self.omia_omim_map:
            allomimids.update(self.omia_omim_map[omia])

        entries_that_are_phenotypes = omim.process_entries(
            list(allomimids), filter_keep_phenotype_entry_ids, None, None)
        logger.info(
            "Filtered out %d/%d entries that are genes or features",
            len(allomimids)-len(entries_that_are_phenotypes), len(allomimids))

        # now iterate again and remove those non-phenotype ids
        removed_count = 0
        for omia in self.omia_omim_map:
            ids = self.omia_omim_map[omia]
            cleanids = {i for i in ids if i in entries_that_are_phenotypes}
            # keep track of how many we've removed
            removed_count += len(ids) - len(cleanids)
            self.omia_omim_map[omia] = cleanids

        logger.info(
            "Removed %d omim ids from the omia-to-omim map", removed_count)

    def _make_internal_id(self, prefix, key):
        """
        Build an internal identifier of the form '_omia<prefix>key<key>',
        with a leading ':' when self.nobnodes is set.

        :param prefix: table name fragment, e.g. 'article' or 'phene'
        :param key: the table's internal key
        :return: the identifier string

        """
        iid = '_'+''.join(('omia', prefix, 'key', str(key)))
        if self.nobnodes:
            iid = ':'+iid

        return iid

    def make_breed_id(self, key):
        """
        :param key: the breed table's internal key
        :return: the key prefixed with 'OMIA-breed:'

        """
        breed_id = 'OMIA-breed:'+str(key)

        return breed_id

    @staticmethod
    def _get_omia_id_from_phene_id(phene_id):
        """
        Extract the leading 'OMIA:<digits>' group id from a
        species-specific phene id.

        :param phene_id: phene identifier, or None
        :return: the group id, or None when there is no such prefix

        """
        omia_id = None
        if phene_id is not None:
            m = re.match(r'OMIA:\d+', str(phene_id))
            if m:
                omia_id = m.group(0)

        return omia_id

    @staticmethod
    def _map_inheritance_term_id(inheritance_symbol):
        """
        Map an OMIA inheritance symbol to a GENO term id.
        A warning is logged for symbols that have no mapped term.

        :param inheritance_symbol: value of the 'inherit' column
        :return: the GENO id, or None

        """
        inherit_map = {
            'A':  None,  # Autosomal
            'ACD': 'GENO:0000143',  # Autosomal co-dominant
            'ADV': None,  # autosomal dominant with variable expressivity
            'AID': 'GENO:0000259',  # autosomal incompletely dominant
            'ASD': 'GENO:0000145',  # autosomal semi-dominant
            # autosomal recessive, semi-lethal
            # using generic autosomal recessive
            'ASL': 'GENO:0000150',
            'D': 'GENO:0000147',  # autosomal dominant
            'M': None,  # multifactorial
            'MAT': None,  # Maternal
            # probably autosomal recessive
            # using generic autosomal recessive
            'PR':  'GENO:0000150',
            'R': 'GENO:0000150',  # Autosomal Recessive
            # Recessive Embryonic Lethal
            # using plain recessive
            'REL': 'GENO:0000148',
            # Autosomal Recessive Lethal
            # using plain autosomal recessive
            'RL': 'GENO:0000150',
            'S': 'GENO:0000146',  # Sex-linked   <--using allosomal dominant
            'SLi': None,  # Sex-limited
            'UD': 'GENO:0000144',  # Dominant
            'X': None,  # x-linked    # HP:0001417 ?
            # X-linked Dominant     <-- temp using allosomal dominant  FIXME
            'XLD': 'GENO:0000146',
            # X-linked Recessive    <-- temp using allosomal recessive  FIXME
            'XLR': 'GENO:0000149',
            'Y': None,  # Y-linked
            'Z': None,  # Z-linked
            # Z-linked recessive    <-- temp using allosomal recessive  FIXME
            'ZR': 'GENO:0000149',
            '999': None,  # Z-linked incompletely dominant
        }

        inheritance_id = inherit_map.get(inheritance_symbol)
        if inheritance_id is None and inheritance_symbol is not None:
            logger.warning(
                "No inheritance id is mapped for %s", inheritance_symbol)

        return inheritance_id

    def getTestSuite(self):
        """
        :return: a unittest suite loaded from tests.test_omia.OMIATestCase

        """
        import unittest
        from tests.test_omia import OMIATestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(OMIATestCase)

        return test_suite
def _get_equivids(self, limit):
    """
    Read the GeneReviews-to-OMIM id map and add it to the graph.

    The file processed here is of the format:
    #NBK_id GR_shortname    OMIM
    NBK1103 trimethylaminuria       136132
    NBK1103 trimethylaminuria       602079
    NBK1104 cdls    122470

    Each row maps one GeneReviews (NBK) id to one OMIM id.  The relation
    is 1:many, and some of the OMIM ids are genes rather than diseases,
    so the coupling is kept loose: the NBK entries are assumed to be
    higher-level grouping classes, and the OMIM ids that the OMIM lookup
    keeps as phenotypes are attached to them with addSubclass.
    (That assumption is poor for OMIM ids that are really genes,
    but those cannot be told apart while reading this file.)

    Every id in self.book_ids is also attached to DOID:4.

    :param limit: outside test mode, reading stops once the line count
        exceeds this value; it is also handed to the OMIM lookup
    :return:

    """
    idmap_path = '/'.join((self.rawdir, self.files['idmap']['file']))
    graph_utils = GraphUtils(curie_map.get())
    line_counter = 0

    # we look some stuff up in OMIM, so initialize here
    omim = OMIM()
    omim_nums_by_book = {}
    seen_omim_nums = set()
    with open(idmap_path, 'r', encoding="utf8") as csvfile:
        for row in csv.reader(csvfile, delimiter='\t', quotechar='\"'):
            line_counter += 1
            if line_counter == 1:
                # the first line is the header
                continue
            nbk_num, shortname, omim_num = row
            gr_id = 'GeneReviews:'+nbk_num
            omim_id = 'OMIM:'+omim_num

            # in test mode only the listed test ids are kept
            if self.testMode and not (
                    len(self.test_ids) > 0 and omim_id in self.test_ids):
                continue

            # sometimes there's bad omim nums
            if len(omim_num) > 6:
                logger.warning(
                    "OMIM number incorrectly formatted "
                    "in row %d; skipping:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # collect the mappings now; they are turned into
            # subclass statements after the OMIM lookup below
            omim_nums_by_book.setdefault(nbk_num, set()).add(omim_num)

            # add the class along with the shortname
            graph_utils.addClassToGraph(self.graph, gr_id, None)
            graph_utils.addSynonym(self.graph, gr_id, shortname)

            seen_omim_nums.add(omim_num)

            if limit is not None and not self.testMode \
                    and line_counter > limit:
                break

    # get the omim ids that are not genes
    phenotype_entries = omim.process_entries(
        list(seen_omim_nums), filter_keep_phenotype_entry_ids,
        None, None, limit)

    logger.info("Filtered out %d/%d entries that are genes or features",
                len(seen_omim_nums)-len(phenotype_entries),
                len(seen_omim_nums))

    for nbk_num in self.book_ids:
        gr_id = 'GeneReviews:'+nbk_num
        for omim_num in omim_nums_by_book.get(nbk_num, ()):
            omim_id = 'OMIM:'+omim_num
            # only omim ids that are not genes get linked
            # to the gene reviews class
            if omim_id not in phenotype_entries:
                continue
            graph_utils.addClassToGraph(self.graph, omim_id, None)
            graph_utils.addSubclass(self.graph, gr_id, omim_id)
        # add this as a generic subclass of DOID:4
        graph_utils.addSubclass(self.graph, 'DOID:4', gr_id)
    return
class UCSCBands(Source): """ This will take the UCSC defintions of cytogenic bands and create the nested structures to enable overlap and containment queries. We use ```Monochrom.py``` to create the OWL-classes of the chromosomal parts. Here, we simply worry about the instance-level values for particular genome builds. Given a chr band definition, the nested containment structures look like: 13q21.31 ==> 13q21.31, 13q21.3, 13q21, 13q2, 13q, 13 We determine the containing regions of the band by parsing the band-string; since each alphanumeric is a significant "place", we can split it with the shorter strings being parents of the longer string Here we create build-specific chroms, which are instances of the classes produced from ```Monochrom.py```. You can instantiate any number of builds for a genome. We leverage the Faldo model here for region definitions, and map each of the chromosomal parts to SO. We differentiate the build by adding the build id to the identifier prior to the chromosome number. These then are instances of the species-specific chromosomal class. The build-specific chromosomes are created like: <pre> <build number>chr<num><band> with triples for a given band like: :hg19chr1p36.33 rdf[type] SO:chromosome_band, faldo:Region, CHR:9606chr1p36.33 :hg19chr1p36.33 subsequence_of :hg19chr1p36.3 :hg19chr1p36.33 faldo:location [ a faldo:BothStrandPosition faldo:begin 0, faldo:end 2300000, faldo:reference 'hg19'] </pre> where any band in the file is an instance of a chr_band (or a more specific type), is a subsequence of it's containing region, \ and is located in the specified coordinates. We do not have a separate graph for testing. 
TODO: any species by commandline argument """ files = { # TODO accommodate multiple builds per species '9606': { 'file': 'hg19cytoBand.txt.gz', 'url': 'http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz', 'build_num': 'hg19', 'genome_label': 'Human' }, '10090': { 'file': 'mm10cytoBand.txt.gz', 'url': 'http://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/cytoBandIdeo.txt.gz', 'build_num': 'mm10', 'genome_label': 'Mouse' }, # Note that there are no bands, # arms or staining components for the species below at the moment '7955': { 'file': 'danRer10cytoBand.txt.gz', 'url': 'http://hgdownload.cse.ucsc.edu/goldenPath/danRer10/database/cytoBandIdeo.txt.gz', 'build_num': 'danRer10', 'genome_label': 'Zebrafish' }, '9913': { 'file': 'bosTau7cytoBand.txt.gz', 'url': 'http://hgdownload.cse.ucsc.edu/goldenPath/bosTau7/database/cytoBandIdeo.txt.gz', 'build_num': 'bosTau7', 'genome_label': 'cow' }, '9031': { 'file': 'galGal4cytoBand.txt.gz', 'url': 'http://hgdownload.cse.ucsc.edu/goldenPath/galGal4/database/cytoBandIdeo.txt.gz', 'build_num': 'galGal4', 'genome_label': 'chicken' }, '9823': { 'file': 'susScr3cytoBand.txt.gz', 'url': 'http://hgdownload.cse.ucsc.edu/goldenPath/susScr3/database/cytoBandIdeo.txt.gz', 'build_num': 'susScr3', 'genome_label': 'pig' }, '9940': { 'file': 'oviAri3cytoBand.txt.gz', 'url': 'http://hgdownload.cse.ucsc.edu/goldenPath/oviAri3/database/cytoBandIdeo.txt.gz', 'build_num': 'oviAri3', 'genome_label': 'sheep' }, '9796': { 'file': 'equCab2cytoBand.txt.gz', 'url': 'http://hgdownload.cse.ucsc.edu/goldenPath/equCab2/database/cytoBandIdeo.txt.gz', 'build_num': 'equCab2', 'genome_label': 'horse' }, # TODO rainbow trout, 8022, when available } def __init__(self, tax_ids=None): super().__init__('ucscbands') self.tax_ids = tax_ids self.load_bindings() self.gu = GraphUtils(curie_map.get()) # Defaults if self.tax_ids is None: # self.tax_ids = [9606, 10090, 7955] self.tax_ids = [9606, 10090, 7955, 9913, 9031, 9823, 9940, 9796] # TODO add 
other species as defaults self._check_tax_ids() self.dataset = Dataset('ucscbands', 'UCSC Cytogenic Bands', 'http://hgdownload.cse.ucsc.edu', None, 'http://genome.ucsc.edu/license/') # data-source specific warnings # (will be removed when issues are cleared) return def fetch(self, is_dl_forced=False): self.get_files(is_dl_forced) return def parse(self, limit=None): if limit is not None: logger.info("Only parsing first %d rows", limit) logger.info("Parsing files...") if self.testOnly: self.testMode = True for taxon in self.tax_ids: self._get_chrbands(limit, str(taxon)) self._create_genome_builds() self.load_core_bindings() self.load_bindings() # using the full graph as the test here self.testgraph = self.graph logger.info("Found %d nodes", len(self.graph)) logger.info("Done parsing files.") return def _get_chrbands(self, limit, taxon): """ :param limit: :return: """ # TODO PYLINT figure out what limit was for and why it is unused line_counter = 0 myfile = '/'.join((self.rawdir, self.files[taxon]['file'])) logger.info("Processing Chr bands from FILE: %s", myfile) geno = Genotype(self.graph) monochrom = Monochrom() # used to hold band definitions for a chr # in order to compute extent of encompasing bands mybands = {} # build the organism's genome from the taxon genome_label = self.files[taxon]['genome_label'] taxon_id = 'NCBITaxon:'+taxon # add the taxon as a class. 
adding the class label elsewhere self.gu.addClassToGraph(self.graph, taxon_id, None) self.gu.addSynonym(self.graph, taxon_id, genome_label) self.gu.loadObjectProperties(self.graph, Feature.object_properties) self.gu.loadProperties(self.graph, Feature.data_properties, self.gu.DATAPROP) self.gu.loadAllProperties(self.graph) geno.addGenome(taxon_id, genome_label) # add the build and the taxon it's in build_num = self.files[taxon]['build_num'] build_id = 'UCSC:'+build_num geno.addReferenceGenome(build_id, build_num, taxon_id) # process the bands with gzip.open(myfile, 'rb') as f: for line in f: # skip comments line = line.decode().strip() if re.match('^#', line): continue # chr13 4500000 10000000 p12 stalk (scaffold, start, stop, band_num, rtype) = line.split('\t') line_counter += 1 # NOTE some less-finished genomes have # placed and unplaced scaffolds # * Placed scaffolds: # the scaffolds have been placed within a chromosome. # * Unlocalized scaffolds: # although the chromosome within which the scaffold occurs # is known, the scaffold's position or orientation # is not known. 
# * Unplaced scaffolds: # it is not known which chromosome the scaffold belongs to # # find out if the thing is a full on chromosome, or a scaffold: # ex: unlocalized scaffold: chr10_KL568008v1_random # ex: unplaced scaffold: chrUn_AABR07022428v1 placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))' unlocalized_scaffold_pattern = placed_scaffold_pattern+r'_(\w+)_random' unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)' m = re.match(placed_scaffold_pattern+r'$', scaffold) if m is not None and len(m.groups()) == 1: # the chromosome is the first match of the pattern chrom_num = m.group(1) else: # skip over anything that isn't a placed_scaffold # at the class level logger.info("Found non-placed chromosome %s", scaffold) chrom_num = None m_chr_unloc = re.match(unlocalized_scaffold_pattern, scaffold) m_chr_unplaced = re.match(unplaced_scaffold_pattern, scaffold) scaffold_num = None if m: pass elif m_chr_unloc is not None and len(m_chr_unloc.groups()) == 2: chrom_num = m_chr_unloc.group(1) scaffold_num = chrom_num+'_'+m_chr_unloc.group(2) elif m_chr_unplaced is not None and len(m_chr_unplaced.groups()) == 1: scaffold_num = m_chr_unplaced.group(1) else: logger.error("There's a chr pattern that we aren't matching: %s", scaffold) if chrom_num is not None: # the chrom class (generic) id chrom_class_id = makeChromID(chrom_num, taxon, 'CHR') # first, add the chromosome class (in the taxon) geno.addChromosomeClass(chrom_num, taxon_id, self.files[taxon]['genome_label']) # then, add the chromosome instance (from the given build) geno.addChromosomeInstance(chrom_num, build_id, build_num, chrom_class_id) # add the chr to the hashmap of coordinates for this build # the chromosome coordinate space is itself if chrom_num not in mybands.keys(): mybands[chrom_num] = {'min': 0, 'max': int(stop), 'chr': chrom_num, 'ref': build_id, 'parent': None, 'stain': None, 'type': Feature.types['chromosome']} if scaffold_num is not None: # this will put the coordinates of the scaffold # in the 
scaffold-space and make sure that the scaffold # is part of the correct parent. # if chrom_num is None, # then it will attach it to the genome, # just like a reg chrom mybands[scaffold_num] = {'min': start, 'max': stop, 'chr': scaffold_num, 'ref': build_id, 'parent': chrom_num, 'stain': None, 'type': Feature.types['assembly_component'], 'synonym': scaffold} if band_num is not None and band_num.strip() != '': # add the specific band mybands[chrom_num+band_num] = {'min': start, 'max': stop, 'chr': chrom_num, 'ref': build_id, 'parent': None, 'stain': None, 'type': None} # add the staining intensity of the band if re.match(r'g(neg|pos|var)', rtype): mybands[chrom_num+band_num]['stain'] = Feature.types.get(rtype) # get the parent bands, and make them unique parents = list(monochrom.make_parent_bands(band_num, set())) # alphabetical sort will put them in smallest to biggest, # so we reverse parents.sort(reverse=True) # print('parents of',chrom,band,':',parents) if len(parents) > 0: mybands[chrom_num+band_num]['parent'] = chrom_num+parents[0] else: # TODO PYLINT why is 'parent' # a list() a couple of lines up and a set() here? 
parents = set() # loop through the parents and add them to the hash # add the parents to the graph, in hierarchical order # TODO PYLINT Consider using enumerate # instead of iterating with range and len for i in range(len(parents)): rti = getChrPartTypeByNotation(parents[i]) pnum = chrom_num+parents[i] sta = int(start) sto = int(stop) if pnum not in mybands.keys(): # add the parental band to the hash b = {'min': min(sta, sto), 'max': max(sta, sto), 'chr': chrom_num, 'ref': build_id, 'parent': None, 'stain': None, 'type': rti} mybands[pnum] = b else: # band already in the hash means it's a grouping band # need to update the min/max coords b = mybands.get(pnum) b['min'] = min(sta, sto, b['min']) b['max'] = max(sta, sto, b['max']) mybands[pnum] = b # also, set the max for the chrom c = mybands.get(chrom_num) c['max'] = max(sta, sto, c['max']) mybands[chrom_num] = c # add the parent relationships to each if i < len(parents) - 1: mybands[pnum]['parent'] = chrom_num+parents[i+1] else: # add the last one (p or q usually) # as attached to the chromosome mybands[pnum]['parent'] = chrom_num f.close() # end looping through file # loop through the hash and add the bands to the graph for b in mybands.keys(): myband = mybands.get(b) band_class_id = makeChromID(b, taxon, 'CHR') band_class_label = makeChromLabel(b, genome_label) band_build_id = makeChromID(b, build_num, 'MONARCH') band_build_label = makeChromLabel(b, build_num) # the build-specific chrom chrom_in_build_id = makeChromID(myband['chr'], build_num, 'MONARCH') # if it's != part, then add the class if myband['type'] != Feature.types['assembly_component']: self.gu.addClassToGraph(self.graph, band_class_id, band_class_label, myband['type']) bfeature = Feature(band_build_id, band_build_label, band_class_id) else: bfeature = Feature(band_build_id, band_build_label, myband['type']) if 'synonym' in myband: self.gu.addSynonym(self.graph, band_build_id, myband['synonym']) if myband['parent'] is None: if myband['type'] == 
Feature.types['assembly_component']: # since we likely don't know the chr, # add it as a part of the build geno.addParts(band_build_id, build_id) elif myband['type'] == Feature.types['assembly_component']: # geno.addParts(band_build_id, chrom_in_build_id) parent_chrom_in_build = makeChromID(myband['parent'], build_num, 'MONARCH') bfeature.addSubsequenceOfFeature(self.graph, parent_chrom_in_build) # add the band as a feature # (which also instantiates the owl:Individual) bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id) bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id) if 'stain' in myband and myband['stain'] is not None: # TODO TEC I recall 'has_staining_intensity' being dropped by MB bfeature.addFeatureProperty(self.graph, Feature.properties['has_staining_intensity'], myband['stain']) # type the band as a faldo:Region directly (add_region=False) # bfeature.setNoBNodes(self.nobnodes) # to come when we merge in ZFIN.py bfeature.addFeatureToGraph(self.graph, False) return def _create_genome_builds(self): """ Various resources will map variations to either UCSC (hg*) or to NCBI assemblies. Here we create the equivalences between them. 
Data taken from: https://genome.ucsc.edu/FAQ/FAQreleases.html#release1 :return: """ # TODO add more species ucsc_assembly_id_map = { "9606": { "UCSC:hg38": "NCBIGenome:GRCh38", "UCSC:hg19": "NCBIGenome:GRCh37", "UCSC:hg18": "NCBIGenome:36.1", "UCSC:hg17": "NCBIGenome:35", "UCSC:hg16": "NCBIGenome:34", "UCSC:hg15": "NCBIGenome:33", }, "7955": { "UCSC:danRer10": "NCBIGenome:GRCz10", "UCSC:danRer7": "NCBIGenome:Zv9", "UCSC:danRer6": "NCBIGenome:Zv8", }, "10090": { "UCSC:mm10": "NCBIGenome:GRCm38", "UCSC:mm9": "NCBIGenome:37" }, "9031": { "UCSC:galGal4": "NCBIAssembly:317958", }, "9913": { "UCSC:bosTau7": "NCBIAssembly:GCF_000003205.5", }, "9823": { "UCSC:susScr3": "NCBIAssembly:304498", }, "9940": { "UCSC:oviAri3": "NCBIAssembly:GCF_000298735.1", }, "9796": { "UCSC:equCab2": "NCBIAssembly:GCF_000002305.2", } } g = self.graph geno = Genotype(g) logger.info("Adding equivalent assembly identifiers") for sp in ucsc_assembly_id_map: tax_num = sp tax_id = 'NCBITaxon:'+tax_num mappings = ucsc_assembly_id_map[sp] for i in mappings: ucsc_id = i ucsc_label = re.split(':', i)[1] mapped_id = mappings[i] mapped_label = re.split(':', mapped_id)[1] mapped_label = 'NCBI build '+str(mapped_label) geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id) geno.addReferenceGenome(mapped_id, mapped_label, tax_id) self.gu.addSameIndividual(g, ucsc_id, mapped_id) return def _check_tax_ids(self): for taxon in self.tax_ids: if str(taxon) not in self.files: raise Exception("Taxon " + str(taxon) + " not supported" " by source UCSCBands") def getTestSuite(self): import unittest from tests.test_ucscbands import UCSCBandsTestCase test_suite = unittest.TestLoader().loadTestsFromTestCase(UCSCBandsTestCase) return test_suite
def _get_variants(self, limit):
    """
    Parse the ClinVar variant_summary file and add its variants to the graph.

    For each data row this:
      * registers the row's assembly as a reference genome of human,
      * adds the chromosome (class and build-specific instance) or,
        when no chromosome is given, the single cytogenetic band,
      * creates the variant as a Feature, with start/end locations when
        a chromosome is known,
      * adds the HGVS expressions as synonyms,
      * adds dbSNP/dbVar ids as same-individuals and RCV accessions as xrefs,
      * ties the variant to its gene through a variant locus,
      * adds one variant-to-phenotype association per phenotype id,
      * adds OMIM allelic variant and HGMD ids from the "other ids" column
        as same-individuals.

    :param limit: when not None (and not in test mode), stop after more than
        ``limit`` data rows have been processed
    :return: None
    :raises ValueError: when a data row does not hold exactly 28
        tab-separated columns (the mismatch is logged first)
    """
    gu = GraphUtils(curie_map.get())

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)
    gu.loadAllProperties(g)
    # a throw-away Feature is only used to load the Feature properties
    Feature(None, None, None).loadAllProperties(g)
    # NOTE(review): second call kept from the original; it looks redundant
    gu.loadAllProperties(g)

    # add the taxon and the genome
    tax_num = '9606'  # HARDCODE
    tax_id = 'NCBITaxon:'+tax_num
    tax_label = 'Human'
    gu.addClassToGraph(g, tax_id, None)
    geno.addGenome(tax_id, None)  # label gets added elsewhere

    # the test disease ids do not change from row to row; build the set once
    test_disease_ids = set()
    if self.testMode:
        test_disease_ids = {str(i) for i in self.disease_ids}

    expected_numcols = 28

    # not unzipping the file
    logger.info("Processing Variant records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
    with gzip.open(myfile, 'rb') as variant_file:
        for line in variant_file:
            # skip comments
            line = line.decode().strip()
            if line.startswith('#'):
                continue

            # Columns of the file, in order:
            # AlleleID               integer value as stored in the AlleleID field in ClinVar (//Measure/@ID in the XML)
            # Type                   character, the type of variation
            # Name                   character, the preferred name for the variation
            # GeneID                 integer, GeneID in NCBI's Gene database
            # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
            # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
            #                        for the mapping between the terms listed here and the integers in the .VCF files, see
            #                        http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
            # RS# (dbSNP)            integer, rs# in dbSNP
            # nsv (dbVar)            character, the NSV identifier for the region in dbVar
            # RCVaccession           character, list of RCV accessions that report this variant
            # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
            # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
            # Origin                 character, list of all allelic origins for this variation
            # Assembly               character, name of the assembly on which locations are based
            # Chromosome             character, chromosomal location
            # Start                  integer, starting location, in pter->qter orientation
            # Stop                   integer, end location, in pter->qter orientation
            # Cytogenetic            character, ISCN band
            # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
            #                        and their relationship to the star graphics ClinVar displays on its web pages,
            #                        see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
            # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
            # HGVS(p.)               character, RefSeq protein-based HGVS expression
            # NumberSubmitters       integer, number of submissions with this variant
            # LastEvaluated          datetime, the latest time any submitter reported clinical significance
            # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
            #                        (NOTE: if ACMG, not a specific to the allele but to the Gene)
            # OtherIDs               character, list of other identifiers or sources of information about this variant
            # VariantID              integer, the value used to build the URL for the current default report,
            #                        e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
            #
            # a crude check that there's an expected number of cols.
            # if not, log it; the unpacking below then raises ValueError
            # because the file format changed.
            fields = line.split('\t')
            num_cols = len(fields)
            if num_cols != expected_numcols:
                logger.error("Unexpected number of columns in raw file (%d actual vs %d expected)",
                             num_cols, expected_numcols)

            (allele_num, allele_type, allele_name, gene_num, gene_symbol,
             clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
             tested_in_gtr, phenotype_ids, origin, assembly, chrom, start,
             stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
             number_of_submitters, last_eval, guidelines, other_ids,
             variant_num, reference_allele, alternate_allele,
             categories) = fields

            # #### set filter=None in init if you don't want to have a filter
            # if self.filter is not None:
            #    if ((self.filter == 'taxids' and (int(tax_num) not in self.tax_ids))
            #            or (self.filter == 'geneids' and (int(gene_num) not in self.gene_ids))):
            #        continue
            # #### end filter

            line_counter += 1

            pheno_list = []
            if phenotype_ids != '-':
                # trim any leading/trailing semicolons/commas
                phenotype_ids = re.sub('^[;,]', '', phenotype_ids)
                phenotype_ids = re.sub('[;,]$', '', phenotype_ids)
                pheno_list = re.split('[,;]', phenotype_ids)

            if self.testMode:
                # keep the row only if its gene, variant, or one of its
                # phenotypes is among the test ids
                if int(gene_num) not in self.gene_ids \
                        and int(variant_num) not in self.variant_ids \
                        and test_disease_ids.isdisjoint(pheno_list):
                    continue

            # TODO may need to switch on assembly to create correct assembly/build identifiers
            build_id = ':'.join(('NCBIGenome', assembly))

            # make the reference genome build
            geno.addReferenceGenome(build_id, assembly, tax_id)

            allele_type_id = self._map_type_of_allele(allele_type)

            # both are reset for every row so that a row without a chromosome
            # can never pick up the chromosome of a previous row
            bandinbuild_id = None
            chrinbuild_id = None
            if str(chrom) == '':
                # check cytogenic location
                band = str(cytogenetic_loc)
                if band.strip() != '':
                    # use cytogenic location to get the approximate location
                    # strangely, they still put an assembly number even when there's no numeric location
                    if '-' not in band:
                        # pass the band itself (a string), as is done for
                        # chromosomes below, rather than a one-element list
                        band_id = makeChromID(band, tax_num, 'CHR')
                        geno.addChromosomeInstance(cytogenetic_loc, build_id, assembly, band_id)
                        bandinbuild_id = makeChromID(band, assembly, 'MONARCH')
                    else:
                        # can't deal with ranges yet
                        pass
            else:
                # add the human chromosome class to the graph, and add the build-specific version of it
                chr_id = makeChromID(str(chrom), tax_num, 'CHR')
                geno.addChromosomeClass(str(chrom), tax_id, tax_label)
                geno.addChromosomeInstance(str(chrom), build_id, assembly, chr_id)
                chrinbuild_id = makeChromID(str(chrom), assembly, 'MONARCH')

            seqalt_id = ':'.join(('ClinVarVariant', variant_num))
            gene_id = None
            if str(gene_num) != '-1' and str(gene_num) != 'more than 10':  # they use -1 to indicate unknown gene
                gene_id = ':'.join(('NCBIGene', str(gene_num)))

            # FIXME there are some "variants" that are actually haplotypes
            # probably will get taken care of when we switch to processing the xml
            # for example, variant_num = 38562
            # but there's no way to tell if it's a haplotype in the csv data
            # so the dbsnp or dbvar should probably be primary, and the variant num be the vslc,
            # with each of the dbsnps being added to it

            # todo clinical significance needs to be mapped to a list of terms
            # first, make the variant:
            feature = Feature(seqalt_id, allele_name, allele_type_id)

            # coordinates are only meaningful relative to a known chromosome
            if chrinbuild_id is not None:
                if start != '-' and start.strip() != '':
                    feature.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    feature.addFeatureEndLocation(stop, chrinbuild_id)

            feature.addFeatureToGraph(g)

            if bandinbuild_id is not None:
                feature.addSubsequenceOfFeature(g, bandinbuild_id)

            # CHECK - this makes the assumption that there is only one affected chromosome per variant
            # what happens with chromosomal rearrangement variants?  shouldn't both chromosomes be here?

            # add the hgvs as synonyms
            if hgvs_c != '-' and hgvs_c.strip() != '':
                gu.addSynonym(g, seqalt_id, hgvs_c)
            if hgvs_p != '-' and hgvs_p.strip() != '':
                gu.addSynonym(g, seqalt_id, hgvs_p)

            # add the dbsnp and dbvar ids as equivalent
            if dbsnp_num != '-' and int(dbsnp_num) != -1:
                dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                gu.addIndividualToGraph(g, dbsnp_id, None)
                gu.addSameIndividual(g, seqalt_id, dbsnp_id)
            if dbvar_num != '-':
                dbvar_id = 'dbVar:'+dbvar_num
                gu.addIndividualToGraph(g, dbvar_id, None)
                gu.addSameIndividual(g, seqalt_id, dbvar_id)

            # TODO - not sure if this is right... add as xref?
            # the rcv is like the combo of the phenotype with the variant
            if rcv_nums != '-':
                for rcv_num in re.split(';', rcv_nums):
                    rcv_id = 'ClinVar:'+rcv_num
                    gu.addIndividualToGraph(g, rcv_id, None)
                    gu.addXref(g, seqalt_id, rcv_id)

            if gene_id is not None:
                # add the gene
                gu.addClassToGraph(g, gene_id, gene_symbol)
                # make a variant locus
                vl_id = '_'+gene_num+'-'+variant_num
                if self.nobnodes:
                    vl_id = ':'+vl_id
                vl_label = allele_name
                gu.addIndividualToGraph(g, vl_id, vl_label, geno.genoparts['variant_locus'])
                geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                geno.addAlleleOfGene(vl_id, gene_id)
            else:
                # some basic reporting
                # the parenthesised symbol has to be a capture group,
                # otherwise there is nothing to report
                gmatch = re.search(r'\((\w+)\)', allele_name)
                if gmatch is not None:
                    logger.info("Gene found in allele label, but no id provided: %s", gmatch.group(1))
                elif re.match('more than 10', gene_symbol):
                    logger.info("More than 10 genes found; need to process XML to fetch (variant=%d)", int(variant_num))
                else:
                    logger.info("No gene listed for variant %d", int(variant_num))

            # parse the list of "phenotypes" which are diseases.  add them as an association
            # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
            # the list is both semicolon delimited and comma delimited, but i don't know why!
            # some are bad, like: Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
            if phenotype_ids != '-':
                for p in pheno_list:
                    m = re.match(r"(Orphanet:ORPHA(?:\s*ORPHA)?)", p)
                    if m is not None and len(m.groups()) > 0:
                        p = re.sub(m.group(1), 'Orphanet:', p.strip())
                    elif re.match('SNOMED CT', p):
                        p = re.sub('SNOMED CT', 'SNOMED', p.strip())

                    assoc = G2PAssoc(self.name, seqalt_id, p.strip())
                    assoc.add_association_to_graph(g)

            if other_ids != '-':
                id_list = other_ids.split(',')
                # process the "other ids"
                # ex: CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                # TODO make more xrefs
                for xrefid in id_list:
                    prefix = xrefid.split(':')[0].strip()
                    if prefix == 'OMIM Allelic Variant':
                        xrefid = 'OMIM:'+xrefid.split(':')[1]
                        gu.addIndividualToGraph(g, xrefid, None)
                        gu.addSameIndividual(g, seqalt_id, xrefid)
                    elif prefix == 'HGMD':
                        gu.addIndividualToGraph(g, xrefid, None)
                        gu.addSameIndividual(g, seqalt_id, xrefid)
                    elif prefix == 'dbVar' and dbvar_num == xrefid.split(':')[1].strip():
                        pass  # skip over this one
                    elif re.search(r'\s', prefix):
                        pass
                        # logger.debug('xref prefix has a space: %s', xrefid)
                    else:
                        # should be a good clean prefix
                        # note that HGMD variants are in here as Xrefs because we can't resolve URIs for them
                        # logger.info("Adding xref: %s", xrefid)
                        # gu.addXref(g, seqalt_id, xrefid)
                        # logger.info("xref prefix to add: %s", xrefid)
                        pass

            if not self.testMode and limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
    gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)

    logger.info("Finished parsing variants")

    return
def process_gaf(self, file, limit, id_map=None):
    """
    Parse a gzipped, tab-delimited GO annotation (GAF) file.

    For each usable row the gene is added as a class (with description,
    synonyms and taxon), and an association between the gene and the GO
    term is added with its evidence code, sources and relationship.
    For IMP annotations that carry 'with/from' items, a phenotype
    association (to ``<go_id>PHENOTYPE``) is derived per item as well.

    Rows are skipped when they are comments (leading '!'), are empty,
    miss a required field, carry a NOT qualifier, or are UniProtKB rows
    that do not map to exactly one gene id through ``id_map``.

    :param file: path of the gzipped GAF file
    :param limit: when not None (and not in test mode), stop after more
        than ``limit`` lines have been read
    :param id_map: optional mapping of UniProtKB accession to a list of
        gene ids; when it is None every UniProtKB row is skipped
    :return: None
    :raises ValueError: when a non-comment row does not have 17 columns
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    gu = GraphUtils(curie_map.get())
    geno = Genotype(g)
    logger.info("Processing Gene Associations from %s", file)
    line_counter = 0

    # the two helpers are independent of each other, so both are created
    # when both taxa are requested
    zfin = wbase = None
    if 7955 in self.tax_ids:
        zfin = ZFIN()
    if 6239 in self.tax_ids:
        wbase = WormBase()

    # GO aspect -> relationship; does not depend on the row
    aspect_rel_map = {
        'P': gu.object_properties['involved_in'],  # involved in
        'F': gu.object_properties['enables'],  # enables
        'C': gu.object_properties['part_of']  # part of
    }

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # blank lines carry nothing to unpack
            if not row:
                continue
            # comments start with exclamation
            if ''.join(row).startswith('!'):
                continue
            (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
             with_or_from, aspect, gene_name, gene_synonym, object_type,
             taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            if (db == '' or gene_num == '' or gene_symbol == '' or
                    go_id == '' or ref == '' or eco_symbol == '' or
                    aspect == '' or object_type == '' or taxon == '' or
                    date == '' or assigned_by == ''):
                # the row text is passed as an argument so that a '%' in
                # the data cannot be mistaken for a format directive
                logger.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            db = self.clean_db_prefix(db)
            uniprotid = None
            gene_id = None
            if db == 'UniProtKB':
                # id_map is optional, so it has to be checked before use
                mapped_ids = None
                if id_map is not None:
                    mapped_ids = id_map.get(gene_num)
                # only an unambiguous mapping (exactly one gene) is usable
                if mapped_ids is None or len(mapped_ids) != 1:
                    # logger.warning(
                    #   "Skipping gene id mapped for >1 gene %s -> %s",
                    #    gene_num, str(mapped_ids))
                    continue
                gene_id = mapped_ids[0]
                uniprotid = ':'.join((db, gene_num))
                gene_num = re.sub(r'\w+\:', '', gene_id)
            elif db == 'MGI':
                gene_num = re.sub(r'MGI:', '', gene_num)
                gene_id = ':'.join((db, gene_num))
                gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
            else:
                gene_id = ':'.join((db, gene_num))

            if self.testMode \
                    and not(
                        re.match(r'NCBIGene', gene_id) and
                        int(gene_num) in self.test_ids):
                continue

            gu.addClassToGraph(g, gene_id, gene_symbol)
            if gene_name != '':
                gu.addDescription(g, gene_id, gene_name)
            if gene_synonym != '':
                for s in re.split(r'\|', gene_synonym):
                    gu.addSynonym(g, gene_id, s.strip())
            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                logger.info(">1 taxon (%s) on line %d.  skipping",
                            taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            # normalise the references once; they are used both for the
            # GO association and for any derived phenotype associations
            sources = []
            for r in re.split(r'\|', ref):
                r = r.strip()
                if r == '':
                    continue
                prefix = re.split(r':', r)[0]
                # swap only the leading prefix; the same letters may occur
                # again inside the local part of the identifier
                r = self.clean_db_prefix(prefix) + r[len(prefix):]
                r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                sources.append(r)

            for source in sources:
                reference = Reference(source)
                if re.match(r'PMID', source):
                    ref_type = Reference.ref_types['journal_article']
                    reference.setType(ref_type)
                reference.addRefToGraph(g)
                assoc.add_source(source)

            # TODO add the source of the annotations from assigned by?

            if aspect not in aspect_rel_map:
                logger.error("Aspect not recognized: %s", aspect)

            rel = aspect_rel_map.get(aspect)
            if aspect == 'F' and re.search(r'contributes_to', qualifier):
                rel = gu.object_properties['contributes_to']
            assoc.set_relationship(rel)
            if uniprotid is not None:
                assoc.set_description('Mapped from '+uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used

            assoc.add_association_to_graph(g)

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                phenotypeid = go_id+'PHENOTYPE'
                # create phenotype associations
                for i in re.split(r'\|', with_or_from):
                    if i == '' or \
                            re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)',
                                i):
                        logger.warning(
                            "Don't know what having a uniprot id " +
                            "in the 'with' column means of %s",
                            uniprotid)
                        continue
                    i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                    i = re.sub(r'WB:', 'WormBase:', i)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', i):
                        if zfin is None:
                            # zebrafish was not among the requested taxa
                            logger.warning(
                                "No ZFIN helper available to make a "
                                "reagent-targeted gene for %s; skipping",
                                i)
                            continue
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, i, self.nobnodes)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        g2p_assoc = G2PAssoc(
                            self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', i):
                        if wbase is None:
                            # worm was not among the requested taxa
                            logger.warning(
                                "No WormBase helper available to make a "
                                "reagent-targeted gene for %s; skipping",
                                i)
                            continue
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(
                                gene_id, i, self.nobnodes)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        g2p_assoc = G2PAssoc(
                            self.name, targeted_gene_id, phenotypeid)
                    else:
                        g2p_assoc = G2PAssoc(self.name, i, phenotypeid)

                    for source in sources:
                        g2p_assoc.add_source(source)
                    # experimental phenotypic evidence
                    g2p_assoc.add_evidence("ECO:0000059")
                    g2p_assoc.add_association_to_graph(g, self.nobnodes)
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    return
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    This table contains the Elements of Morphology data that has been
    screen-scraped into DISCO.
    Note that foaf:depiction is inverse of foaf:depicts relationship.

    Since it is bad form to have two definitions,
    we concatenate the two into one string.

    Triples:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            OIO:hasRelatedSynonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    Everything is added to ``self.graph``. Empty definitions and empty
    synonym entries (e.g. from a trailing ';') are not added.

    :param raw: path of the tab-delimited file; its first line is a header
    :param limit: when not None, stop after more than ``limit`` rows
    :return: None
    :raises ValueError: when a non-empty row does not have 18 columns
    """

    def as_sentence(text):
        # strip first, so that trailing whitespace cannot hide a period
        # that is already there (which used to produce "text..")
        text = text.strip()
        if text != '' and not text.endswith('.'):
            text += '.'
        return text

    gu = GraphUtils(curie_map.get())
    line_counter = 0
    # newline='' is what the csv module asks for when handed a file object
    with open(raw, 'r', newline='') as f1:
        f1.readline()  # read the header row; skip
        filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
        for line in filereader:
            # a blank line has no columns to unpack
            if not line:
                continue
            line_counter += 1
            (morphology_term_id, morphology_term_num,
             morphology_term_label, morphology_term_url,
             terminology_category_label, terminology_category_url,
             subcategory, objective_definition, subjective_definition,
             comments, synonyms, replaces, small_figure_url,
             large_figure_url, e_uid, v_uid, v_uuid,
             v_last_modified) = line

            # note:
            # e_uid v_uuid v_last_modified terminology_category_url
            # subcategory v_uid morphology_term_num
            # terminology_category_label hp_label notes
            # are currently unused.

            # Add morphology term to graph as a class
            # with label, type, and description.
            gu.addClassToGraph(self.graph, morphology_term_id,
                               morphology_term_label)

            # Assemble the description text:
            # objective part first, then the subjective part,
            # each closed with a period
            definition = ' '.join(
                (as_sentence(objective_definition),
                 as_sentence(subjective_definition))).strip()

            if definition != '':
                gu.addDefinition(
                    self.graph, morphology_term_id, definition)

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction

            # do we want both images?
            # morphology_term_id has depiction small_figure_url
            if small_figure_url != '':
                gu.addDepiction(self.graph, morphology_term_id,
                                small_figure_url)

            # morphology_term_id has depiction large_figure_url
            if large_figure_url != '':
                gu.addDepiction(self.graph, morphology_term_id,
                                large_figure_url)

            # morphology_term_id has comment comments
            if comments != '':
                gu.addComment(self.graph, morphology_term_id,
                              comments.strip())

            # exact synonyms (; delimited)
            if synonyms != '':
                for s in synonyms.split(';'):
                    s = s.strip()
                    if s != '':
                        gu.addSynonym(
                            self.graph, morphology_term_id, s,
                            gu.properties['hasExactSynonym'])

            # morphology_term_id hasRelatedSynonym replaces (; delimited)
            if replaces != '' and replaces != synonyms:
                for s in replaces.split(';'):
                    s = s.strip()
                    if s != '':
                        gu.addSynonym(
                            self.graph, morphology_term_id, s,
                            gu.properties['hasRelatedSynonym'])

            # morphology_term_id has page morphology_term_url
            gu.addPage(self.graph, morphology_term_id,
                       morphology_term_url)

            if limit is not None and line_counter > limit:
                break
    return
def _process_ortholog_classes(self, limit=None):
    """
    This method add the KEGG orthology classes to the graph.

    The class name column is a list of labels delimited by ';' or ','.
    The first label becomes the class label, the remaining ones are
    added as synonyms, and the last one is also used as the description.
    If there's an embedded enzyme commission number in the description,
    that is added as an xref.

    Triples created:
    <orthology_class_id> is a class
    <orthology_class_id> has label <orthology_symbols>
    <orthology_class_id> has description <orthology_description>

    :param limit: when not None (and not in test mode), stop after more
        than ``limit`` rows have been read
    :return: None
    """

    logger.info("Processing ortholog classes")
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (orthology_class_id, orthology_class_name) = row

            if self.testMode and \
                    orthology_class_id not in \
                    self.test_ids['orthology_classes']:
                continue

            # The orthology class is essentially a KEGG gene ID
            # that is species agnostic.
            # Add the ID and label as a gene family class

            other_labels = re.split(r'[;,]', orthology_class_name)
            # the first one is the label we'll use
            orthology_label = other_labels[0].strip()

            orthology_class_id = 'KEGG-'+orthology_class_id.strip()

            orthology_type = OrthologyAssoc.terms['gene_family']
            gu.addClassToGraph(g, orthology_class_id, orthology_label,
                               orthology_type)
            if len(other_labels) > 1:
                # add the rest as synonyms;
                # the first one is already the label, so it is skipped
                for s in other_labels[1:]:
                    gu.addSynonym(g, orthology_class_id, s.strip())

                # add the last one as the description;
                # splitting on the delimiter leaves a leading blank
                d = other_labels[-1].strip()
                gu.addDescription(g, orthology_class_id, d)

                # add the enzyme commission number (EC:1.2.99.5)as an xref
                # sometimes there's two, like [EC:1.3.5.1 1.3.5.4]
                # can also have a dash, like EC:1.10.3.-
                # NOTE(review): the pattern accepts any run of 5-7
                # digit/dot/dash tokens, not only text inside [EC:...]
                for ecm in re.findall(r'((?:\d+|\.|-){5,7})', d):
                    gu.addXref(g, orthology_class_id, 'EC:'+ecm)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    logger.info("Done with ortholog classes")
    return
def _process_diseasegene(self, limit):
    """
    Parse the Orphanet disease-gene XML file.

    For every <Disorder> element the disorder is added as a class, and
    for each of its <DisorderGeneAssociation> children:
      * the gene is added as a class, with its synonyms,
      * a variant locus ("some variant of <gene> that is a ...") is added
        as an individual and made an allele of the gene,
      * an association between that variant locus and the disorder is
        added, with an evidence code chosen from the association status,
      * Ensembl, HGNC and OMIM external references of the gene are added
        as equivalent classes.
    Associations whose type cannot be mapped to a relationship are
    logged and skipped.

    :param limit: when not None (and not in test mode), stop after more
        than ``limit`` disorders have been processed
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    geno = Genotype(g)

    gu = GraphUtils(curie_map.get())

    # sources of external references that are turned into equivalent
    # classes, with the prefix to use; the others are skipped for now
    eq_prefix_by_source = {
        "Ensembl": "ENSEMBL",
        "HGNC": "HGNC",
        "OMIM": "OMIM",
    }

    myfile = "/".join((self.rawdir, self.files["disease-gene"]["file"]))

    for _event, elem in ET.iterparse(myfile):
        if elem.tag != "Disorder":
            continue

        # get the element name and id
        # id = elem.get('id') # some internal identifier
        disorder_num = elem.find("OrphaNumber").text

        disorder_id = "Orphanet:" + str(disorder_num)

        if self.testMode and \
                disorder_id not in \
                config.get_config()["test_ids"]["disease"]:
            continue

        # count the disorders actually processed, so that the limit
        # check at the end of the loop has something to compare against
        line_counter += 1

        disorder_label = elem.find("Name").text

        # make a hash of internal gene id to type for later lookup
        gene_iid_to_type = {}
        gene_list = elem.find("GeneList")
        for gene in gene_list.findall("Gene"):
            gene_iid = gene.get("id")
            gene_type = gene.find("GeneType").get("id")
            gene_iid_to_type[gene_iid] = gene_type

        # assuming that these are in the ontology
        gu.addClassToGraph(g, disorder_id, disorder_label)

        assoc_list = elem.find("DisorderGeneAssociationList")
        for a in assoc_list.findall("DisorderGeneAssociation"):
            gene_iid = a.find(".//Gene").get("id")
            gene_name = a.find(".//Gene/Name").text
            gene_symbol = a.find(".//Gene/Symbol").text
            gene_num = a.find("./Gene/OrphaNumber").text
            gene_id = "Orphanet:" + str(gene_num)
            gene_type_id = \
                self._map_gene_type_id(gene_iid_to_type[gene_iid])
            gu.addClassToGraph(
                g, gene_id, gene_symbol, gene_type_id, gene_name)
            syn_list = a.find("./Gene/SynonymList")
            if int(syn_list.get("count")) > 0:
                for s in syn_list.findall("./Synonym"):
                    gu.addSynonym(g, gene_id, s.text)

            dgtype = a.find("DisorderGeneAssociationType").get("id")
            rel_id = self._map_rel_id(dgtype)
            dg_label = a.find("./DisorderGeneAssociationType/Name").text
            if rel_id is None:
                logger.warning(
                    "Cannot map association type (%s) to RO for association (%s | %s).  Skipping.",
                    dg_label,
                    disorder_label,
                    gene_symbol,
                )
                continue

            alt_locus_id = "_" + gene_num + "-" + disorder_num + "VL"
            alt_label = " ".join(
                ("some variant of", gene_symbol.strip(), "that is a",
                 dg_label.lower(), disorder_label)
            )
            if self.nobnodes:
                alt_locus_id = ":" + alt_locus_id
            gu.addIndividualToGraph(
                g, alt_locus_id, alt_label,
                geno.genoparts["variant_locus"])
            geno.addAlleleOfGene(alt_locus_id, gene_id)

            # consider typing the gain/loss-of-function variants like:
            # http://sequenceontology.org/browser/current_svn/term/SO:0002054
            # http://sequenceontology.org/browser/current_svn/term/SO:0002053

            # use "assessed" status to issue an evidence code
            # FIXME I think that these codes are sub-optimal
            status_code = a.find("DisorderGeneAssociationStatus").get("id")
            # imported automatically asserted information
            # used in automatic assertion
            eco_id = "ECO:0000323"
            # Assessed
            # TODO are these internal ids stable between releases?
            if status_code == "17991":
                # imported manually asserted information
                # used in automatic assertion
                eco_id = "ECO:0000322"
            # Non-traceable author statement ECO_0000034
            # imported information in automatic assertion ECO_0000313

            assoc = G2PAssoc(self.name, alt_locus_id, disorder_id, rel_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph(g)

            rlist = a.find("./Gene/ExternalReferenceList")
            for r in rlist.findall("ExternalReference"):
                # looked up per reference, so that an unsupported source
                # never re-uses the id of the reference before it
                eq_prefix = eq_prefix_by_source.get(r.find("Source").text)
                if eq_prefix is None:
                    continue  # skip the others for now
                eqid = eq_prefix + ":" + r.find("Reference").text
                gu.addClassToGraph(g, eqid, None)
                gu.addEquivalentClass(g, gene_id, eqid)

        elem.clear()  # discard the element

        # same convention as the other parsers: the limit only applies
        # outside of test mode; break (not return) so that the
        # properties below still get loaded
        if not self.testMode and \
                limit is not None and line_counter > limit:
            break

    gu.loadProperties(
        g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
    gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
    gu.loadAllProperties(g)

    return
def _process_genes(self, limit=None):
    """
    Process the KEGG gene IDs from the human (hsa) gene listing.

    Each row of the tab-delimited file holds a gene id and a label.
    The label is semicolon-delimited (symbols; name; gene class), and the
    symbols part is itself a comma-delimited list, for example:
    DST, BP240, BPA, BPAG1; dystonin; K10382 dystonin

    The first symbol becomes the gene label; the remaining symbols are
    added as synonyms.  The long-form gene name (second semicolon field)
    is added as a definition.
    This is hardcoded to process human genes only.

    Triples created:
    <gene_id> is a SO:gene
    <gene_id> rdfs:label <gene_name>

    :param limit: when not in test mode, stop once more than this many
        rows have been read; None means no limit
    :return:

    """

    logger.info("Processing genes")
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())
    geno = Genotype(g)
    raw = '/'.join((self.rawdir, self.files['hsa_genes']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (gene_id, gene_name) = row

            gene_id = 'KEGG-'+gene_id.strip()

            # split "symbol1, symbol2; long name; class" into its parts
            gene_stuff = gene_name.split(';')
            symbollist = [s.strip() for s in gene_stuff[0].split(',')]
            first_symbol = symbollist[0]

            # remember the label even for genes skipped in test mode
            if gene_id not in self.label_hash:
                self.label_hash[gene_id] = first_symbol

            if self.testMode and gene_id not in self.test_ids['genes']:
                continue

            # Add the gene as a class.
            geno.addGene(gene_id, first_symbol)

            # add the long name as the description
            if len(gene_stuff) > 1:
                gu.addDefinition(g, gene_id, gene_stuff[1].strip())

            # add the rest of the symbols as synonyms; the first symbol
            # is already the label, so it is deliberately skipped here
            for synonym in symbollist[1:]:
                gu.addSynonym(g, gene_id, synonym)

            # TODO add the KO here?

            if (not self.testMode) and (
                    limit is not None and line_counter > limit):
                break

    logger.info("Done with genes")
    return
def process_feature_loc(self, limit):
    """
    Read the gzipped, tab-delimited feature file and add each retained
    feature, together with its start/end location on its chromosome,
    to the graph.

    Each data row has nine columns:
    chrom, db, feature type, start, end, score, strand, phase, attributes
    where attributes is a ';'-separated list of key=value pairs, e.g.
    ID=Gene:WBGene00023193;Name=WBGene00023193;biotype=snoRNA;Alias=Y74C9A.6
    variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T

    Only a fixed set of feature types is kept.  The feature identifier is
    taken from the ID attribute (WBGene/WBVar/WBsf ids only; gmap and
    landmark rows are skipped), else from the variation attribute, else
    from a Name attribute beginning with WBsf.  Rows carrying a
    polymorphism attribute are skipped.

    :param limit: when not in test mode, stop once more than this many
        rows of a retained feature type have been seen; None means
        no limit
    :return:

    """

    raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())
    logger.info("Processing Feature location and attributes")
    line_counter = 0
    geno = Genotype(g)
    # strains seen per variation; collected here but not yet consumed
    # anywhere in this method
    strain_to_variant_map = {}
    build_num = self.version_num
    build_id = 'WormBase:'+build_num

    # note biological_regions include balancers
    # other options here: promoter, regulatory_region, reagent
    wanted_feature_types = frozenset((
        'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
        'duplication', 'enhancer', 'binding_site',
        'biological_region', 'complex_substitution',
        'substitution', 'insertion', 'inverted_repeat'))

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            # skip comment rows
            if ''.join(row).startswith('#'):
                continue
            (chrom, db, feature_type_label, start, end, score, strand,
             phase, attributes) = row

            if feature_type_label not in wanted_feature_types:
                continue

            line_counter += 1

            # parse "k1=v1;k2=v2"; split on the first '=' only so values
            # containing '=' survive, and ignore segments without a key
            # (e.g. one left behind by a trailing ';')
            attribute_dict = {}
            if attributes != '':
                attribute_dict = dict(
                    item.split("=", 1)
                    for item in attributes.replace('"', '').split(";")
                    if "=" in item)

            fid = flabel = desc = None
            if 'ID' in attribute_dict:
                fid = attribute_dict.get('ID')
                if re.search(r'WB(Gene|Var|sf)', fid):
                    fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                elif re.match(r'(gmap|landmark)', fid):
                    continue
                else:
                    logger.info('other identifier %s', fid)
                    fid = None
            elif 'variation' in attribute_dict:
                fid = 'WormBase:'+attribute_dict.get('variation')
                flabel = attribute_dict.get('public_name')
                sub = attribute_dict.get('substitution')
                ins = attribute_dict.get('insertion')
                # if both are present the insertion wins
                desc = ''
                if sub is not None:
                    desc = 'substitution='+sub
                if ins is not None:
                    desc = 'insertion='+ins

                # keep track of the strains with this variation,
                # for later processing
                strain_list = attribute_dict.get('strain')
                if strain_list is not None:
                    for s in strain_list.split(','):
                        strain_to_variant_map.setdefault(
                            s.strip(), set()).add(fid)

            # if feature_type_label == 'RNAi_reagent':
            # Target=WBRNAi00096030 1 4942
            # this will tell us where the RNAi is actually binding;
            # it will be the reagent-targeted-gene that has a position,
            # (i think)
            # TODO finish the RNAi binding location

            name = attribute_dict.get('Name')
            polymorphism = attribute_dict.get('polymorphism')

            if fid is None:
                if name is not None and re.match(r'WBsf', name):
                    fid = 'WormBase:'+name
                    name = None
                else:
                    continue

            if self.testMode \
                    and fid.replace('WormBase:', '') \
                    not in self.test_ids['gene']+self.test_ids['allele']:
                continue

            # these really aren't that interesting
            if polymorphism is not None:
                continue

            # literal substring test: name is data, not a regex pattern
            if name is not None and name not in fid:
                if flabel is None:
                    flabel = name
                else:
                    gu.addSynonym(g, fid, name)

            if desc is not None:
                gu.addDescription(g, fid, desc)

            alias = attribute_dict.get('Alias')
            biotype = attribute_dict.get('biotype')
            note = attribute_dict.get('Note')
            other_name = attribute_dict.get('other_name')
            # add whichever of the two alternate names are present
            for n in [alias, other_name]:
                if n is not None:
                    gu.addSynonym(g, fid, n)

            ftype = self.get_feature_type_by_class_and_biotype(
                feature_type_label, biotype)

            chr_id = makeChromID(chrom, build_id, 'CHR')
            geno.addChromosomeInstance(chrom, build_id, build_num)

            f = Feature(fid, flabel, ftype)
            f.addFeatureStartLocation(start, chr_id, strand)
            f.addFeatureEndLocation(end, chr_id, strand)

            # only genes are passed to the graph flagged as classes
            feature_is_class = feature_type_label == 'gene'

            f.addFeatureToGraph(g, True, None, feature_is_class)

            if note is not None:
                gu.addDescription(g, fid, note)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

            # RNAi reagents look like:
            # I RNAi_primary RNAi_reagent 4184 10232 . + . Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
            # I RNAi_primary RNAi_reagent 5693 9391 . + . Target=WBRNAi00066135 1 3699 +;laboratory=CH

            # TODO TF binding sites and network:
            # I TF_binding_site_region TF_binding_site 1861 2048 . + . Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
            # I TF_binding_site_region TF_binding_site 3403 4072 . + . Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1

    return
def _process_all(self, limit):
    """
    This takes the list of omim identifiers from the omim.txt.Z file,
    and iteratively queries the omim api for the json-formatted data.
    This will create OMIM classes, with the label, definition, and
    some synonyms.
    If an entry is "removed", it is added as a deprecated class.
    If an entry is "moved", it is deprecated and consider annotations
    are added.

    Additionally, we extract:
    *phenotypicSeries ids as superclasses
    *equivalent ids for Orphanet and UMLS

    If set to testMode, it will write only those items in the test_ids
    to the testgraph.

    Requests are made in batches of 20 identifiers and throttled so that
    no more than 4 requests are issued per second.

    :param limit: when not in test mode, the maximum number of omim
        identifiers to fetch; None means all of them
    :return:

    """
    omimids = self._get_omim_ids()  # store the set of omim identifiers

    omimparams = {
        'format': 'json',
        'include': 'all',
        # you will need to add the API key into the conf.json file, like:
        # keys : { 'omim' : '<your api key here>' }
        'apiKey': config.get_config()['keys']['omim'],
    }

    # http://api.omim.org/api/entry?mimNumber=100100&include=all

    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())

    it = 0  # for counting

    # note that you can only do request batches of 20
    # see info about "Limits" at http://omim.org/help/api
    groupsize = 20
    if not self.testMode and limit is not None:
        # just in case the limit is larger than the number of records,
        # max it out
        max_entries = min(limit, len(omimids))
    else:
        max_entries = len(omimids)

    entry_url = '/'.join((self.OMIM_API, 'entry'))
    # the test ids do not change between batches, so build the set once
    test_id_set = set()
    if self.testMode:
        test_id_set = {str(i) for i in self.test_ids}

    # TODO write the json to local files - make the assumption that
    # downloads within 24 hrs are the same
    # now, loop through the omim numbers and pull the records as json docs
    while it < max_entries:
        end = min(max_entries, it+groupsize)
        batch = omimids[it:end]
        # iterate through the omim ids list,
        # and fetch from the OMIM api in batches of 20

        if self.testMode:
            intersect = list(test_id_set & set(batch))
            if not intersect:
                # none of the test ids are in this batch; skip the fetch
                it += groupsize
                continue
            logger.info("found test ids: %s", intersect)
            omimparams['mimNumber'] = ','.join(intersect)
        else:
            omimparams['mimNumber'] = ','.join(batch)

        url = entry_url + '?' + urllib.parse.urlencode(omimparams)
        logger.info('fetching: %s', url)

        # close the HTTP response as soon as the body has been read
        with urllib.request.urlopen(url) as d:
            resp = d.read().decode()
        request_time = datetime.now()
        it += groupsize

        myjson = json.loads(resp)
        entries = myjson['omim']['entryList']

        geno = Genotype(g)

        # add genome and taxon
        tax_num = '9606'
        tax_id = 'NCBITaxon:9606'
        tax_label = 'Human'

        geno.addGenome(tax_id, str(tax_num))  # tax label can get added elsewhere
        gu.addClassToGraph(g, tax_id, None)   # label added elsewhere

        for e in entries:
            entry = e['entry']

            # get the numbers, labels, and descriptions
            omimnum = entry['mimNumber']
            titles = entry['titles']
            label = titles['preferredTitle']

            # "preferredTitle": "PFEIFFER SYNDROME",
            # "alternativeTitles": "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
            # "includedTitles": "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"
            other_labels = []
            if 'alternativeTitles' in titles:
                other_labels += self._get_alt_labels(
                    titles['alternativeTitles'])
            if 'includedTitles' in titles:
                other_labels += self._get_alt_labels(
                    titles['includedTitles'])

            # remove the abbreviation (comes after the ;) from the
            # preferredTitle, and add it as a synonym
            abbrev = None
            label_parts = label.split(';')
            if len(label_parts) > 1:
                abbrev = label_parts[1].strip()
            newlabel = self._cleanup_label(label)

            description = self._get_description(entry)
            omimid = 'OMIM:'+str(omimnum)

            if entry['status'] == 'removed':
                gu.addDeprecatedClass(g, omimid)
                continue

            # NOTE(review): the original nesting was lost; everything
            # below is treated as applying to non-removed entries only,
            # since omimtype is assigned only for those - confirm.
            omimtype = self._get_omimtype(entry)
            # this uses our cleaned-up label
            gu.addClassToGraph(g, omimid, newlabel, omimtype)

            # add the original OMIM label as a synonym
            gu.addSynonym(g, omimid, label)

            # add the alternate labels and includes as synonyms
            for other_label in other_labels:
                gu.addSynonym(g, omimid, other_label)

            # for OMIM, we're adding the description as a definition
            gu.addDefinition(g, omimid, description)
            if abbrev is not None:
                gu.addSynonym(g, omimid, abbrev)

            # if this is a genetic locus (but not sequenced)
            # then add the chrom loc info
            if omimtype == Genotype.genoparts['biological_region'] \
                    and entry.get('geneMapExists'):
                genemap = entry['geneMap']
                if 'cytoLocation' in genemap:
                    # parse the cytoloc. add this omim thing as a
                    # subsequence of the cytofeature, e.g. 18p11.3-p11.2
                    # for now, just take the first one
                    # FIXME add the other end of the range,
                    # but not sure how to do that
                    # not sure if saying subsequence of feature
                    # is the right relationship
                    cytoloc = genemap['cytoLocation'].split('-')[0]
                    f = Feature(omimid, None, None)
                    if 'chromosome' in genemap:
                        geno.addChromosomeClass(
                            str(genemap['chromosome']), tax_id, tax_label)
                        loc = makeChromID(cytoloc, tax_num, 'CHR')
                        # this is the chr band
                        gu.addClassToGraph(g, loc, cytoloc)
                        f.addSubsequenceOfFeature(g, loc)
                        f.addFeatureToGraph(g)

            # check if moved, if so, make it deprecated and
            # replaced/consider class to the other thing(s)
            # some entries have been moved to multiple other entries and
            # use the joining raw word "and":
            # 612479 is movedto: "603075 and 603029"
            # others use a comma-delimited list, like:
            # 610402 is movedto: "609122,300870"
            if entry['status'] == 'moved':
                moved_to = str(entry['movedTo'])
                # split on either delimiter, clean up whitespace,
                # and add the OMIM prefix to the numeric portion
                fixedids = [
                    'OMIM:'+part.strip()
                    for part in re.split(r'and|,', moved_to)
                    if part.strip()]
                gu.addDeprecatedClass(g, omimid, fixedids)

            self._get_phenotypicseries_parents(entry, g)
            self._get_mappedids(entry, g)
            self._get_pubs(entry, g)
            self._get_process_allelic_variants(entry, g)

        # end iterating over batch of entries

        # can't have more than 4 req per sec,
        # so wait the remaining time, if necessary
        dt = datetime.now() - request_time
        rem = 0.25 - dt.total_seconds()
        if rem > 0:
            logger.info("waiting %.3f sec", rem)
            # rem is already in seconds, which is what time.sleep expects
            time.sleep(rem)

    gu.loadAllProperties(g)

    return