# NOTE: these functions assume module-level imports for os, ftplib.FTP and
# the hub's shared helpers (tab2list, tab2dict, is_int, loadobj, dump,
# dict_convert, file_newer, TAXONOMY), which live elsewhere in the package.

def select_species(self):
    import tempfile
    outfile = tempfile.mktemp() + '.txt.gz'
    try:
        self.logger.info('Downloading "dataset_names.txt.gz"...')
        out_f = open(outfile, 'wb')
        ftp = FTP(self.__class__.ENSEMBL_FTP_HOST)
        ftp.login()
        species_file = '/pub/metazoa/release-%s/mysql/metazoa_mart_%s/dataset_names.txt.gz' % (
            self.release, self.release)
        ftp.retrbinary("RETR " + species_file, out_f.write)
        out_f.close()
        self.logger.info('Done.')
        # load saved file
        self.logger.info('Parsing "dataset_names.txt.gz"...')
        species_li = tab2list(outfile, (0, 4, 5), header=0)
        # reorder columns to (species_name, common_name, taxid)
        species_li = [[x[0], x[2], x[1]] for x in species_li]
        # convert taxid to int, or None when it is not a valid integer
        species_li = [x[:-1] + [int(x[-1]) if is_int(x[-1]) else None]
                      for x in species_li]
        self.logger.info('Done.')
    finally:
        os.remove(outfile)
    import pprint
    # dump the parsed list at debug level (was logged at error level by mistake)
    self.logger.debug('\n %s', pprint.pformat(species_li))
    return species_li
def get_all_species(self):
    import tempfile
    outfile = tempfile.mktemp() + '.txt.gz'
    try:
        self.logger.info('Downloading "species.txt.gz"...')
        out_f = open(outfile, 'wb')
        ftp = FTP(self.__class__.ENSEMBL_FTP_HOST)
        ftp.login()
        species_file = '/pub/release-%s/mysql/ensembl_production_%s/species.txt.gz' % (
            self.release, self.release)
        ftp.retrbinary("RETR " + species_file, out_f.write)
        out_f.close()
        self.logger.info('Done.')
        # load saved file
        self.logger.info('Parsing "species.txt.gz"...')
        # columns: db_name, common_name, taxid
        species_li = tab2list(outfile, (1, 2, 7), header=0)
        # convert taxid to int, or None when it is not a valid integer
        species_li = [x[:-1] + [int(x[-1]) if is_int(x[-1]) else None]
                      for x in species_li]
        # as of Ensembl 87, there are also mouse strains; keep only the "original" one
        species_li = [s for s in species_li
                      if not s[0].startswith("mus_musculus_")]
        self.logger.info('Done.')
    finally:
        os.remove(outfile)
    return species_li
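# `is_int` is one of the shared helpers assumed above; a minimal sketch of
# the behavior relied on here (kept as a comment to avoid shadowing the real
# helper imported at module level):
#
#   def is_int(s):
#       """Return True if s can be converted to an int."""
#       try:
#           int(s)
#           return True
#       except (TypeError, ValueError):
#           return False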
def _select_species(self):
    """
    Return a list of tuples containing the species to download data for:
    [(species_name1, common_name1, taxid1), (species_name2, common_name2, taxid2), ...]
    """
    import tempfile
    outfile = tempfile.mktemp() + '.txt.gz'
    try:
        self.logger.info('Downloading Species List...')
        out_f = open(outfile, 'wb')
        ftp = FTP(self.__class__.ENSEMBL_FTP_HOST)
        ftp.login()
        species_file = self.get_species_file()
        ftp.retrbinary("RETR " + species_file, out_f.write)
        out_f.close()
        self.logger.info('Done.')
        # load saved file
        self.logger.info('Loading Species List...')
        species_li = tab2list(outfile, (0, 4, 5), header=0)
        # reorder columns to (species_name, common_name, taxid)
        species_li = [[x[0], x[2], x[1]] for x in species_li]
        # convert taxid to int, or None when it is not a valid integer
        species_li = [x[:-1] + [int(x[-1]) if is_int(x[-1]) else None]
                      for x in species_li]
        self.logger.info('Done.')
    finally:
        os.remove(outfile)
    import pprint
    self.logger.debug('\n %s', pprint.pformat(species_li))
    return species_li
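# The species helpers above all rely on a `tab2list` utility from the hub's
# shared code. A minimal sketch of the behavior assumed here -- read a
# (gzipped) tab-separated file, skip `header` rows, keep the given 0-based
# column(s), optionally filtering raw rows with `includefn` (comment-only
# sketch, not the real implementation):
#
#   import csv
#   import gzip
#
#   def tab2list(datafile, cols, header=1, includefn=None):
#       opener = gzip.open if datafile.endswith('.gz') else open
#       with opener(datafile, 'rt') as in_f:
#           rows = list(csv.reader(in_f, delimiter='\t'))[header:]
#           if includefn:
#               rows = [r for r in rows if includefn(r)]
#           if isinstance(cols, int):
#               return [r[cols] for r in rows]
#           return [[r[i] for i in cols] for r in rows]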
def load_data(self, data_folder):
    """
    Load gene data from NCBI's gene2refseq.gz file.
    Parse genomic position data, keeping only rows whose taxid appears in
    the list produced by get_ref_microbe_taxids(), used as a lookup table.
    :return: a generator yielding one document per gene
    """
    taxids_file = os.path.join(data_folder, "../ref_microbe_taxids.pyobj")
    datafile = os.path.join(data_folder, 'gene2refseq.gz')
    taxids = loadobj(taxids_file)
    taxid_set = set(taxids)

    def _includefn(ld):
        return ld[0] in taxid_set  # match taxid from taxid_set

    cols_included = [0, 1, 7, 9, 10, 11]  # 0-based col idx
    gene2genomic_pos_li = tab2list(datafile, cols_included, header=1,
                                   includefn=_includefn)
    last_id = None
    for gene in gene2genomic_pos_li:
        strand = 1 if gene[5] == '+' else -1
        _id = gene[1]
        mgi_dict = {
            '_id': _id,
            'genomic_pos': {
                'entrezgene': _id,
                'start': int(gene[3]),
                'end': int(gene[4]),
                'chr': gene[2],
                'strand': strand
            }
        }
        if _id != last_id:  # rows with dup _id will be skipped
            yield mgi_dict
        last_id = _id
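# For reference, each document yielded by load_data() has this shape
# (the values below are illustrative, not taken from the real gene2refseq.gz):
#
#   {
#       '_id': '1246500',
#       'genomic_pos': {
#           'entrezgene': '1246500',
#           'start': 363,
#           'end': 1442,
#           'chr': 'NC_002127.1',
#           'strand': 1
#       }
#   }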
def get_geneid_d(data_folder, species_li=None, load_cache=True,
                 save_cache=True, only_for={}):
    '''Return a dictionary mapping current/retired geneids to current geneids.
       This is useful when other annotations that were mapped to geneids may
       contain retired gene ids. If species_li is None, genes from all species
       are loaded. Note that all ids are of int type.
    '''
    if species_li:
        taxid_set = set(
            [TAXONOMY[species]["tax_id"] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(data_folder)

    # check cache file
    _cache_file = 'geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
            file_newer(_cache_file, 'gene_info.gz') and \
            file_newer(_cache_file, 'gene_history.gz'):
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(data_folder, 'gene_info.gz')
    if species_li:
        # note: "only_for and ..." here would discard every row when only_for
        # is empty, so the empty case must pass through
        species_filter = lambda ld: int(ld[0]) in taxid_set and \
            (not only_for or ld[1] in only_for)
    elif only_for:
        species_filter = lambda ld: ld[1] in only_for
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))

    DATAFILE = os.path.join(data_folder, 'gene_history.gz')
    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    # includefn makes sure the taxid belongs to species_li and keeps only
    # entries whose mapped-to geneid exists in the gene_info list
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,
                            includefn=_includefn)

    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    # TODO: this fills memory with key==value entries...
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
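# Usage sketch for get_geneid_d(), remapping possibly-retired Entrez gene ids
# to their current ones (the path and the 'human' species key are
# illustrative; species_li entries must be keys of the TAXONOMY mapping):
#
#   geneid_d = get_geneid_d('/path/to/entrez_data', species_li=['human'])
#   candidate_ids = [1017, 12345]   # may include retired ids (values illustrative)
#   current = [geneid_d[g] for g in candidate_ids if g in geneid_d]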