def get_ref_microbe_taxids():
    """Download the latest bacterial genome assembly summary from the NCBI
    genome FTP site and build the list of taxids of bacterial reference
    and representative genomes.

    The taxid list is also dumped to a timestamped
    ``ref_microbe_taxids_<ts>.pyobj`` file via :func:`dump`.

    :return: list of taxid strings (column 6 of assembly_summary.txt)
    """
    import csv
    import urllib.request

    urlbase = 'ftp://ftp.ncbi.nlm.nih.gov'
    urlextension = '/genomes/refseq/bacteria/assembly_summary.txt'
    assembly = urllib.request.urlopen(urlbase + urlextension)
    datareader = csv.reader(assembly.read().decode().splitlines(),
                            delimiter="\t")
    taxid = []
    for row in datareader:
        # Skip comment/header lines. The previous test
        # (len(row) == 1 and startswith("#")) missed the tab-delimited
        # column-header line ("# assembly_accession\tbioproject\t...")
        # which splits into many fields; testing the first field alone
        # is robust for every "#"-prefixed line.
        if row and row[0].startswith("#"):
            continue
        # row[4] = refseq_category, row[5] = taxid
        if row[4] in ['reference genome', 'representative genome']:
            taxid.append(row[5])
    ts = get_timestamp()
    dump(taxid, "ref_microbe_taxids_{}.pyobj".format(ts))
    return taxid
def get_ref_microbe_taxids():
    """Download the latest bacterial genome assembly summary from the NCBI
    genome FTP site and return the taxids of the bacterial reference and
    representative genomes. The list is also dumped to a timestamped
    ``ref_microbe_taxids_<ts>.pyobj`` file.

    :return: list of taxid strings
    """
    import csv
    import urllib.request

    urlbase = 'ftp://ftp.ncbi.nlm.nih.gov'
    urlextension = '/genomes/refseq/bacteria/assembly_summary.txt'
    assembly = urllib.request.urlopen(urlbase + urlextension)
    datareader = csv.reader(assembly.read().decode().splitlines(),
                            delimiter="\t")
    taxid = []
    for row in datareader:
        # Robustly skip every "#"-prefixed line: the old check
        # (len(row) == 1) did not match the tab-delimited column-header
        # line, which splits into many fields.
        if row and row[0].startswith("#"):
            continue
        # column 5 (refseq_category) selects the genome class,
        # column 6 is the taxid we collect
        if row[4] in ['reference genome', 'representative genome']:
            taxid.append(row[5])
    ts = get_timestamp()
    dump(taxid, "ref_microbe_taxids_{}.pyobj".format(ts))
    return taxid
def diff_worker_old_vs_new(id_list_old, new_db_col_names, batch_num, diff_folder):
    """Compute the "delete" side of a diff for one batch of ids.

    Ids from the old collection that no longer exist in the new one are
    recorded as deletions; if any are found the diff payload is dumped to
    ``<diff_folder>/<batch_num>.pyobj`` along with its md5 checksum.

    :return: summary dict with add/update/delete counts (and diff-file info
             when a file was written)
    """
    new_backend = create_backend(new_db_col_names)
    found_docs = new_backend.mget_from_ids(id_list_old)
    found_ids = [doc['_id'] for doc in found_docs]
    deleted_ids = list(set(id_list_old) - set(found_ids))
    diff_path = os.path.join(diff_folder, "%s.pyobj" % str(batch_num))
    payload = {
        'delete': deleted_ids,
        'add': [],
        'update': [],
        'source': new_backend.target_name,
        'timestamp': get_timestamp(),
    }
    summary = {"add": 0, "update": 0, "delete": len(deleted_ids)}
    if deleted_ids:
        dump(payload, diff_path)
        # compute md5 so when downloaded, users can check integrity
        summary["diff_file"] = {
            "name": os.path.basename(diff_path),
            "md5sum": md5sum(diff_path),
        }
    return summary
def _download(__metadata__):
    """Fetch the CPDB pathway-gene tables for human, mouse and yeast.

    Each species URL is taken from ``__metadata__['__url_<species>__']`` and
    saved as ``CPDB_pathways_genes_<species>.tab`` inside a timestamped
    folder next to DATA_FOLDER.
    """
    # alias avoids shadowing confusion with this function's own name
    from utils.dataload import download as fetch
    target_dir = os.path.join(os.path.split(DATA_FOLDER)[0], get_timestamp())
    for organism in ('human', 'mouse', 'yeast'):
        source_url = __metadata__['__url_{}__'.format(organism)]
        fetch(source_url, target_dir,
              'CPDB_pathways_genes_{}.tab'.format(organism))
def switch_collection(self):
    '''After a successful load, promote temp_collection to the regular
    collection name; the previous regular collection is renamed with a
    timestamped "archive" suffix so it is kept for reference.
    '''
    temp = self.temp_collection
    # guard clause: nothing to promote if no (non-empty) temp collection
    if not (temp and temp.count() > 0):
        print("Error: load data first.")
        return
    if self.collection.count() > 0:
        # keep the existing data under an archive name before promoting
        archive_name = '_'.join([self.__collection__, 'archive',
                                 get_timestamp(), get_random_string()])
        self.collection.rename(archive_name, dropTarget=True)
    temp.rename(self.__collection__)
def switch_collection(self):
    '''After a successful load, promote the temp collection to the regular
    collection name; any existing regular collection is renamed with a
    timestamped "archive" suffix for archiving purposes.

    Raises ResourceError when there is no (non-empty) temp collection.
    '''
    temp_name = self.temp_collection_name
    if not (temp_name and self.db[temp_name].count() > 0):
        raise ResourceError("No temp collection (or it's empty)")
    if self.collection.count() > 0:
        # archive the current collection before promoting the temp one
        archive_name = '_'.join([self.collection_name, 'archive',
                                 get_timestamp(), get_random_string()])
        self.collection.rename(archive_name, dropTarget=True)
    self.db[temp_name].rename(self.collection_name)
def switch_collection(self):
    '''Promote temp_collection to the regular collection name after a
    successful load. An existing regular collection is first renamed to a
    timestamped archive name so it is preserved.
    '''
    staged = self.temp_collection
    if staged and staged.count() > 0:
        if self.collection.count() > 0:
            # preserve current data under "<name>_archive_<ts>_<rand>"
            backup_name = '_'.join([self.__collection__, 'archive',
                                    get_timestamp(), get_random_string()])
            self.collection.rename(backup_name, dropTarget=True)
        staged.rename(self.__collection__)
    else:
        print("Error: load data first.")
def backup(folder=".", archive=None):
    """
    Dump the whole hub_db database into the given folder. "archive" can be
    passed to choose the target filename; otherwise a random one is
    generated. Note: this doesn't back up source/merge data, just the
    internal data used by the hub.

    :return: path of the written backup file
    """
    # get database name (ie. hub_db internal database)
    db_name = get_src_dump().database.name
    getters = (get_src_dump, get_src_master, get_src_build,
               get_src_build_config, get_data_plugin, get_api,
               get_cmd, get_event, get_hub_config)
    # snapshot every internal collection as a plain list of documents
    # (renamed from "dump" to avoid shadowing the dump() helper)
    snapshot = {}
    for getter in getters:
        col = getter()
        snapshot[col.name] = list(col.find())
    if not archive:
        archive = "backup_%s_%s.pyobj" % (get_timestamp(), get_random_string())
    path = os.path.join(folder, archive)
    dumpobj(snapshot, path)
    return path
def diff_worker_new_vs_old(id_list_new, old_db_col_names, new_db_col_names,
                           batch_num, diff_folder, diff_func, exclude=None,
                           selfcontained=False):
    """Compute the "add"/"update" side of a diff for one batch of ids.

    Ids present only in the new collection are additions; ids present in
    both are compared with ``diff_func`` to produce update patches. When
    anything changed, the diff payload is dumped to
    ``<diff_folder>/<batch_num>.pyobj`` with its md5 checksum.

    :param exclude: attribute names to ignore when diffing (default: none).
    :param selfcontained: if True, store full documents (not just ids) for
        additions so the diff file can be applied without the source db.
    :return: summary dict with add/update/delete counts (and diff-file info
             when a file was written)
    """
    # None sentinel instead of a mutable [] default (shared across calls)
    if exclude is None:
        exclude = []
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    docs_common = old.mget_from_ids(id_list_new)
    ids_common = [_doc['_id'] for _doc in docs_common]
    id_in_new = list(set(id_list_new) - set(ids_common))
    _updates = []
    if len(ids_common) > 0:
        _updates = diff_func(old, new, list(ids_common), exclude_attrs=exclude)
    file_name = os.path.join(diff_folder, "%s.pyobj" % str(batch_num))
    _result = {
        'add': id_in_new,
        'update': _updates,
        'delete': [],
        'source': new.target_name,
        'timestamp': get_timestamp()
    }
    if selfcontained:
        # store whole docs, not just ids, so the diff applies standalone
        _result["add"] = new.mget_from_ids(id_in_new)
    summary = {"add": len(id_in_new), "update": len(_updates), "delete": 0}
    if len(_updates) != 0 or len(id_in_new) != 0:
        dump(_result, file_name)
        # compute md5 so when downloaded, users can check integreity
        md5 = md5sum(file_name)
        summary["diff_file"] = {
            "name": os.path.basename(file_name),
            "md5sum": md5
        }
    return summary
def generate_target_name(self, build_config_name):
    """Build a unique, lowercased target collection name of the form
    ``<config>_<timestamp>_<random>``.

    :param build_config_name: name of the build configuration (required)
    :raises ValueError: if build_config_name is None
    """
    # explicit check rather than `assert`: asserts are stripped under -O
    if build_config_name is None:
        raise ValueError("build_config_name is required")
    return '{}_{}_{}'.format(build_config_name, get_timestamp(),
                             get_random_string()).lower()
def generate_target_name(self, build_config_name):
    """Return a unique, lowercased target collection name of the form
    ``genedoc_<config>_<timestamp>_<random>``."""
    parts = (build_config_name, get_timestamp(), get_random_string())
    return 'genedoc_{}_{}_{}'.format(*parts).lower()
def _get_target_name(self):
    """Return a unique, lowercased target collection name derived from the
    current build config: ``genedoc_<config>_<timestamp>_<random>``."""
    config_name = self._build_config['name']
    return 'genedoc_{}_{}_{}'.format(config_name, get_timestamp(),
                                     get_random_string()).lower()