def _get_gene_history(self, limit):
    """
    Add discontinued NCBI gene ids to the graph as deprecated classes.

    Loops through the gzipped, tab-delimited gene_history file.  For every
    record that passes the filters below, the old (discontinued) gene id is
    added as a deprecated class whose replacement is the new gene id, and
    the old gene symbol is added as a synonym of the new gene.

    A record is skipped when:
      * the line is blank or is a comment (starts with '#');
      * either the current or the discontinued gene number is '-';
      * in test mode, the current gene number is not in ``self.gene_ids``;
      * the taxon number is not in ``self.tax_ids``.

    :param limit: if not None (and not in test mode), stop once the number
        of accepted records exceeds this value
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    g = self.testgraph if self.testMode else self.graph

    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
    logger.info("FILE: %s", myfile)
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            line = line.decode().strip()
            # skip comments; also skip blank lines, which would otherwise
            # fail the five-column unpacking below
            if not line or line.startswith('#'):
                continue
            (tax_num, gene_num, discontinued_num, discontinued_symbol,
             _discontinued_date) = line.split('\t')

            # a '-' means there is no replacement (or no old id) to record
            if gene_num == '-' or discontinued_num == '-':
                continue
            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if int(tax_num) not in self.tax_ids:
                continue

            # only records that pass every filter count toward the limit
            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

            # add the two genes
            gu.addClassToGraph(g, gene_id, None)
            gu.addClassToGraph(g, discontinued_gene_id, discontinued_symbol)

            # add the new gene id to replace the old gene id
            gu.addDeprecatedClass(g, discontinued_gene_id, [gene_id])

            # also add the old symbol as a synonym of the new gene
            gu.addSynonym(g, gene_id, discontinued_symbol)

            if (not self.testMode) and \
                    (limit is not None and line_counter > limit):
                break

    return
def process_gene_ids(self, limit):
    """
    Load gene identifiers from the gzipped, comma-delimited gene_ids file.

    Each row carries five columns: taxon number, gene number, gene symbol,
    gene synonym and a live/dead status, e.g.
    ``6239,WBGene00000001,aap-1,Y110A7A.10,Live``.

    For every row a 'WormBase:'-prefixed gene class is added, labelled with
    the symbol (falling back to the synonym, or no label when both are
    empty).  Rows whose status is 'Dead' are additionally marked as
    deprecated; the 'NCBITaxon:'-prefixed taxon is attached to the gene;
    and a non-empty synonym is added as a synonym.

    In test mode only genes listed in ``self.test_ids['gene']`` are
    processed and ``limit`` is ignored.

    :param limit: if not None (and not in test mode), stop once the number
        of rows read exceeds this value
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))
    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())

    logger.info("Processing Gene IDs")
    geno = Genotype(g)
    with gzip.open(raw, 'rb') as csvfile:
        text_stream = io.TextIOWrapper(csvfile, newline="")
        filereader = csv.reader(
            text_stream, delimiter=',', quotechar='\"')
        # the row count starts at 1 and includes rows skipped in test mode
        for line_counter, row in enumerate(filereader, start=1):
            (taxon_num, gene_num, gene_symbol, gene_synonym, live) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            taxon_id = 'NCBITaxon:' + taxon_num
            gene_id = 'WormBase:' + gene_num

            # prefer the symbol, then the synonym, then no label at all
            gene_label = gene_symbol or gene_synonym or None

            gu.addClassToGraph(
                g, gene_id, gene_label, Genotype.genoparts['gene'])
            if live == 'Dead':
                gu.addDeprecatedClass(g, gene_id)
            geno.addTaxon(taxon_id, gene_id)
            if gene_synonym != '':
                gu.addSynonym(g, gene_id, gene_synonym)

            # the limit never applies in test mode or when unset
            if self.testMode or limit is None:
                continue
            if line_counter > limit:
                break

    return
def _process_genes(self, limit=None):
    """
    Add HGNC genes from the tab-delimited genes file to the graph.

    The first row of the file is treated as a header and skipped.  For
    each remaining row:
      * the HGNC id is added as a class labelled with its symbol, typed by
        ``self._get_gene_type(locus_type)``, with the gene name passed as
        the final argument (None when empty);
      * genes whose locus type is 'withdrawn' are marked deprecated;
      * non-empty Entrez and Ensembl ids are added as equivalent classes
        ('NCBIGene:' / 'ENSEMBL:' prefixes);
      * the gene is attached to taxon 'NCBITaxon:9606';
      * each pipe-separated PubMed id is linked to the gene with the
        'is_about' property;
      * the cytogenetic location is parsed and the gene is added as a
        subsequence of the chromosome band, or of the whole chromosome
        when no band is given.

    In test mode, rows with an Entrez id not in ``self.gene_ids`` are
    skipped and ``limit`` is ignored.

    :param limit: if not None (and not in test mode), stop once the number
        of rows read (header included) exceeds this value
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    g = self.testgraph if self.testMode else self.graph
    geno = Genotype(g)
    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    line_counter = 0
    logger.info("Processing HGNC genes")

    # Compiled once, outside the row loop.
    # The chromosome must be followed by an arm (p/q) or the end of the
    # string.  '$' has to sit outside a character set: inside '[pq$]' it
    # is a literal dollar sign, so chromosome-only locations never matched.
    chr_regex = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
    band_regex = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id,
             ensembl_gene_id, vega_id, ucsc_id, ena, refseq_accession,
             ccds_id, uniprot_ids, pubmed_id, mgd_id, rgd_id, lsdb,
             cosmic, omim_id, mirbase, homeodb, snornabase,
             bioparadigms_slc, orphanet, pseudogene_org, horde_id,
             merops, imgt, iuphar, kznf_gene_catalog, mamit_trnadb, cd,
             lncrnadb, enzyme_id, intermediate_filament_db) = row

            line_counter += 1

            # skip header
            if line_counter <= 1:
                continue

            if self.testMode and entrez_id != '' \
                    and int(entrez_id) not in self.gene_ids:
                continue

            if name == '':
                name = None
            gene_type_id = self._get_gene_type(locus_type)
            gu.addClassToGraph(g, hgnc_id, symbol, gene_type_id, name)
            if locus_type == 'withdrawn':
                gu.addDeprecatedClass(g, hgnc_id)
            if entrez_id != '':
                gu.addEquivalentClass(
                    g, hgnc_id, 'NCBIGene:' + entrez_id)
            if ensembl_gene_id != '':
                gu.addEquivalentClass(
                    g, hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
            geno.addTaxon('NCBITaxon:9606', hgnc_id)

            # add pubs as "is about"
            for pmid in pubmed_id.split('|'):
                # strip before testing so a whitespace-only entry does not
                # produce a bare 'PMID:' identifier
                pmid = pmid.strip()
                if pmid:
                    gu.addTriple(
                        g, 'PMID:' + pmid,
                        gu.object_properties['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_regex.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                band_match = band_regex.search(location)
                f = Feature(hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, 'NCBITaxon:9606', 'CHR')
                    gu.addClassToGraph(g, band_id, None)
                    f.addSubsequenceOfFeature(g, band_id)
                else:
                    # no band given: fall back to the whole chromosome
                    chrom_id = makeChromID(chrom, 'NCBITaxon:9606', 'CHR')
                    gu.addClassToGraph(g, chrom_id, None)
                    f.addSubsequenceOfFeature(g, chrom_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    # end loop through file
    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    return
def _process_all(self, limit):
    """
    Fetch OMIM entries from the OMIM API and add them to the graph.

    This takes the list of omim identifiers returned by
    ``self._get_omim_ids()`` and iteratively queries the omim api, in
    batches of 20, for the json-formatted data.

    For each entry this creates an OMIM class with a cleaned-up label, the
    definition, and synonyms (original label, alternative/included titles,
    and the abbreviation that follows ';' in the preferred title).
    If an entry is "removed", it is only added as a deprecated class.
    If an entry is "moved", it is deprecated with the ids listed in its
    ``movedTo`` field passed along as the replacement/consider ids.
    Entries typed as a biological region get their cytogenetic location
    added as a parent feature when a gene map is present.

    Additional data is extracted for non-removed entries by the helper
    methods ``_get_phenotypicseries_parents``, ``_get_mappedids``,
    ``_get_pubs`` and ``_get_process_allelic_variants``.

    If set to testMode, it will write only those items in the test_ids to
    the testgraph, and ``limit`` is ignored.

    The API key is read from the configuration; add it to the conf.json
    file like:  keys : { 'omim' : '<your api key here>' }

    :param limit: if not None (and not in test mode), the maximum number
        of omim identifiers to fetch
    :return: None
    """
    omimids = self._get_omim_ids()  # store the set of omim identifiers

    # http://api.omim.org/api/entry?mimNumber=100100&include=all
    omimparams = {
        'format': 'json',
        'include': 'all',
        'apiKey': config.get_config()['keys']['omim'],
    }

    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())

    it = 0  # index of the first omim id of the current batch

    # note that you can only do request batches of 20
    # see info about "Limits" at http://omim.org/help/api
    groupsize = 20
    if not self.testMode and limit is not None:
        # in case the limit is larger than the number of records, max it out
        max_ids = min(limit, len(omimids))
    else:
        max_ids = len(omimids)

    # build the set of test ids once rather than once per batch
    test_ids = {str(i) for i in self.test_ids} if self.testMode else set()

    entry_url = '/'.join((self.OMIM_API, 'entry'))

    # TODO write the json to local files -
    # make the assumption that downloads within 24 hrs are the same

    # now, loop through the omim numbers and pull the records as json docs
    while it < max_ids:
        end = min(max_ids, it + groupsize)
        if self.testMode:
            # only request those ids in this batch that are test ids;
            # sorted so the request is deterministic
            intersect = sorted(test_ids.intersection(omimids[it:end]))
            if not intersect:
                it += groupsize
                continue
            logger.info("found test ids: %s", intersect)
            omimparams['mimNumber'] = ','.join(intersect)
        else:
            omimparams['mimNumber'] = ','.join(omimids[it:end])

        url = entry_url + '?%s' % urllib.parse.urlencode(omimparams)
        # NOTE: the logged url includes the apiKey query parameter
        logger.info('fetching: %s', url)

        # close the HTTP response as soon as the body has been read
        with urllib.request.urlopen(url) as d:
            resp = d.read().decode()
        request_time = datetime.now()

        it += groupsize

        myjson = json.loads(resp)
        entries = myjson['omim']['entryList']

        geno = Genotype(g)

        # add genome and taxon
        tax_num = '9606'
        tax_id = 'NCBITaxon:9606'
        tax_label = 'Human'

        geno.addGenome(tax_id, str(tax_num))  # tax label added elsewhere
        gu.addClassToGraph(g, tax_id, None)   # label added elsewhere

        for e in entries:
            entry = e['entry']

            # get the numbers, labels, and descriptions
            omimnum = entry['mimNumber']
            titles = entry['titles']
            label = titles['preferredTitle']

            other_labels = []
            if 'alternativeTitles' in titles:
                other_labels += self._get_alt_labels(
                    titles['alternativeTitles'])
            if 'includedTitles' in titles:
                other_labels += self._get_alt_labels(
                    titles['includedTitles'])

            # example titles:
            # "preferredTitle": "PFEIFFER SYNDROME",
            # "alternativeTitles":
            #   "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
            # "includedTitles":
            #   "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"

            # the abbreviation (comes after the ;) in the preferredTitle
            # is added as a synonym
            abbrev = None
            label_parts = label.split(';')
            if len(label_parts) > 1:
                abbrev = label_parts[1].strip()
            newlabel = self._cleanup_label(label)

            description = self._get_description(entry)
            omimid = 'OMIM:' + str(omimnum)

            if entry['status'] == 'removed':
                gu.addDeprecatedClass(g, omimid)
                continue

            omimtype = self._get_omimtype(entry)
            # this uses our cleaned-up label
            gu.addClassToGraph(g, omimid, newlabel, omimtype)

            # add the original OMIM label as a synonym
            gu.addSynonym(g, omimid, label)

            # add the alternate labels and includes as synonyms
            for other_label in other_labels:
                gu.addSynonym(g, omimid, other_label)

            # for OMIM, we're adding the description as a definition
            gu.addDefinition(g, omimid, description)
            if abbrev is not None:
                gu.addSynonym(g, omimid, abbrev)

            # if this is a genetic locus (but not sequenced)
            # then add the chrom loc info
            if omimtype == Genotype.genoparts['biological_region'] \
                    and entry.get('geneMapExists'):
                genemap = entry['geneMap']
                if 'cytoLocation' in genemap:
                    # parse the cytoloc, e.g. 18p11.3-p11.2, and add this
                    # omim thing as a subsequence of the cytofeature.
                    # for now, just take the first one
                    # FIXME add the other end of the range,
                    # but not sure how to do that
                    # not sure if saying subsequence of feature
                    # is the right relationship
                    cytoloc = genemap['cytoLocation'].split('-')[0]
                    f = Feature(omimid, None, None)
                    if 'chromosome' in genemap:
                        geno.addChromosomeClass(
                            str(genemap['chromosome']), tax_id, tax_label)
                        loc = makeChromID(cytoloc, tax_num, 'CHR')
                        # this is the chr band
                        gu.addClassToGraph(g, loc, cytoloc)
                        f.addSubsequenceOfFeature(g, loc)
                        f.addFeatureToGraph(g)

            # check if moved, if so, make it deprecated and
            # replaced/consider class to the other thing(s).
            # some entries have been moved to multiple other entries
            # and use the joining raw word "and":
            #   612479 is movedto: "603075 and 603029"
            # others use a comma-delimited list, like:
            #   610402 is movedto: "609122,300870"
            if entry['status'] == 'moved':
                moved_to = str(entry['movedTo'])
                # split on either separator, clean up whitespace, and add
                # the OMIM prefix to each non-empty numeric portion
                fixedids = [
                    'OMIM:' + newid.strip()
                    for newid in re.split(r'and|,', moved_to)
                    if newid.strip()]
                gu.addDeprecatedClass(g, omimid, fixedids)

            self._get_phenotypicseries_parents(entry, g)
            self._get_mappedids(entry, g)
            self._get_pubs(entry, g)
            self._get_process_allelic_variants(entry, g)

        # end iterating over batch of entries

        # can't have more than 4 req per sec,
        # so wait the remaining time, if necessary
        dt = datetime.now() - request_time
        rem = 0.25 - dt.total_seconds()
        if rem > 0:
            logger.info("waiting %.3f sec", rem)
            # rem is already in seconds, which is what time.sleep expects
            time.sleep(rem)

    gu.loadAllProperties(g)

    return