def log_building_start(self):
    logfile = None   # defined even when merge_logging is off, since it's recorded below
    if self.merge_logging:
        # setup logging
        logfile = 'databuild_{}_{}.log'.format(
            'genedoc' + '_' + self._build_config['name'],
            time.strftime('%Y%m%d'))
        logfile = os.path.join(self.log_folder, logfile)
        setup_logfile(logfile)
    src_build = getattr(self, 'src_build', None)
    if src_build:
        #src_build.update({'_id': self._build_config['_id']}, {"$unset": {"build": ""}})
        # push a new build-status record onto the config's "build" list
        d = {
            'status': 'building',
            'started_at': datetime.now(),
            'logfile': logfile,
            'target_backend': self.target.name
        }
        if self.target.name == 'mongodb':
            d['target'] = self.target.target_collection.name
        elif self.target.name == 'es':
            d['target'] = self.target.target_esidxer.ES_INDEX_NAME
        logging.info(pformat(d))
        src_build.update({'_id': self._build_config['_id']},
                         {"$push": {'build': d}})
        _cfg = src_build.find_one({'_id': self._build_config['_id']})
        if len(_cfg['build']) > self.max_build_status:
            # remove the first (oldest) build status record
            src_build.update({'_id': self._build_config['_id']},
                             {"$pop": {'build': -1}})
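# log_building_start() and the dump scripts below all route their output
# through setup_logfile(), which is defined elsewhere in this codebase.
# A minimal sketch of what such a helper can look like, assuming it simply
# attaches a FileHandler to the root logger; the real implementation may
# handle formatting and duplicate handlers differently.
import logging

def setup_logfile(logfile):
    """Mirror subsequent logging calls to `logfile` (sketch)."""
    handler = logging.FileHandler(logfile)
    handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logging.getLogger().addHandler(handler)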
def main(no_confirm=True):
    src_dump = get_src_dump()
    download_list = get_file_list_for_download()
    if len(download_list) == 0:
        logging.info("No newer file found. Abort now.")
        sys.exit(0)

    doc = src_dump.find_one({'_id': 'ucsc'})
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)

    logfile = os.path.join(DATA_FOLDER, 'ucsc_dump.log')
    setup_logfile(logfile)

    # mark the download starts
    # `timestamp` and `latest_lastmodified` are module-level values defined
    # elsewhere in this script
    doc = {
        '_id': 'ucsc',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': latest_lastmodified,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(download_list, no_confirm)

    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ucsc'}, {'$set': _updates})
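# The status documents store human-readable elapsed times from timesofar(t0).
# The helper is imported from this codebase's utilities; a minimal sketch of
# what it can look like (the real one's exact formatting may differ):
import time

def timesofar(t0):
    """Return the time elapsed since t0 as a short human-readable string."""
    t = time.time() - t0
    h, rem = divmod(int(t), 3600)
    m, s = divmod(rem, 60)
    if h:
        return '{}h{}m{}s'.format(h, m, s)
    if m:
        return '{}m{}s'.format(m, s)
    return '{:.1f}s'.format(t)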
def main_cron(no_confirm=True):
    '''Set no_confirm to True to run this script automatically without intervention.'''
    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)

    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'],
                                 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or
                len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
    setup_logfile(logfile)

    # mark the download starts
    doc = {
        '_id': 'ensembl',
        'release': mart_version,
        'timestamp': time.strftime('%Y%m%d'),
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))
        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        sys.stdout.close()

    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
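# Several scripts gate overwriting a non-empty DATA_FOLDER on ask(...) == 'Y'.
# The helper is not shown here; a minimal sketch, assuming it keeps prompting
# until the user types one of the offered letters:
def ask(prompt, options='YN'):
    """Prompt until the user enters one of the option letters (sketch)."""
    options = options.upper()
    while True:
        answer = input('{} [{}] '.format(prompt, '/'.join(options))).strip().upper()
        if answer in options:
            return answer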
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention.
    refseq_release = get_refseq_release()
    logging.info("Checking latest refseq release:\t%s", refseq_release)

    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        # probe for the data file of the release we already have on disk
        data_file = os.path.join(doc['data_folder'],
                                 'complete.{}.rna.gbff.gz'.format(doc['release']))
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or
                len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'refseq_dump.log')
    setup_logfile(logfile)

    # mark the download starts
    doc = {
        '_id': 'refseq',
        'release': refseq_release,
        'timestamp': time.strftime('%Y%m%d'),
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        sys.stdout.close()

    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
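# get_refseq_release() returns the current RefSeq release number. A minimal
# sketch, assuming it reads NCBI's published RELEASE_NUMBER file; the helper
# this script actually imports may obtain it differently.
from urllib.request import urlopen

def get_refseq_release():
    """Fetch the current RefSeq release number from NCBI (sketch)."""
    url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER'
    with urlopen(url) as res:
        return int(res.read().decode().strip())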
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention.
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or
                len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    # mark the download starts
    src_dump = get_src_dump()
    doc = {
        '_id': 'entrez',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()

    # mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
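# Each dumper finishes by setting pending_to_upload: True on its src_dump
# document. A downstream uploader can discover finished dumps by querying
# that flag; the query below matches the documents written above, while the
# helper itself is illustrative:
def find_pending_uploads(src_dump):
    """Yield (_id, data_folder) for dumps that finished but aren't uploaded yet."""
    for doc in src_dump.find({'pending_to_upload': True, 'status': 'success'}):
        yield doc['_id'], doc['data_folder']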
def update_from_temp_collections(config, no_confirm=False, use_parallel=False):
    t0 = time.time()
    sc = GeneDocSyncer(config)
    new_src_li = sc.get_new_source_list()
    if not new_src_li:
        logging.info("No new source collections need to update. Abort now.")
        return

    logging.info("Found {} new source collections to update:".format(len(new_src_li)))
    logging.info("\n".join(['\t' + x for x in new_src_li]))
    if no_confirm or ask('Continue?') == 'Y':
        logfile = 'databuild_sync_{}_{}.log'.format(config, time.strftime('%Y%m%d'))
        logfile = os.path.join(LOG_FOLDER, logfile)
        setup_logfile(logfile)

        for src in new_src_li:
            t0 = time.time()
            logging.info("Current source collection: %s" % src)
            ts = _get_timestamp(src, as_str=True)
            logging.info("Calculating changes... ")
            changes = sc.get_changes(src, use_parallel=use_parallel)
            logging.info("Done")
            get_changes_stats(changes)
            if no_confirm or ask("Continue to save changes...") == 'Y':
                if config == 'genedoc_mygene':
                    dumpfile = 'changes_{}.pyobj'.format(ts)
                else:
                    dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
                dump(changes, dumpfile)
                dumpfile_key = 'genedoc_changes/' + dumpfile
                logging.info('Saving to S3: "{}"... '.format(dumpfile_key))
                send_s3_file(dumpfile, dumpfile_key)
                logging.info('Done.')
                #os.remove(dumpfile)
            if no_confirm or ask("Continue to apply changes...") == 'Y':
                sc.apply_changes(changes)
                sc.verify_changes(changes)
            logging.info('=' * 20)
            logging.info("Finished. %s" % timesofar(t0))
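# dump(changes, dumpfile) serializes the changes object to a .pyobj file
# before send_s3_file() ships it to S3. A minimal pickle-based sketch,
# assuming that is what the helper does; the real one may also compress.
import pickle

def dump(obj, filename, protocol=pickle.HIGHEST_PROTOCOL):
    """Serialize obj to filename with pickle (sketch)."""
    with open(filename, 'wb') as out_f:
        pickle.dump(obj, out_f, protocol=protocol)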
def main(no_confirm=True):
    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'exac'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILES_PATH[0])
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or
                len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'exac_dump.log')
    setup_logfile(logfile)

    # mark the download starts
    doc = {
        '_id': 'exac',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': lastmodified,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(no_confirm)

    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'exac'}, {'$set': _updates})
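# check_lastmodified() decides whether the remote file is newer than the last
# recorded dump. One common implementation is a HEAD request for the
# Last-Modified header; a sketch under that assumption. The URL below is a
# hypothetical placeholder, not this script's actual source.
from email.utils import parsedate_to_datetime
from urllib.request import Request, urlopen

def check_lastmodified(url='https://example.org/path/to/datafile'):  # hypothetical URL
    """Return the remote file's Last-Modified time as 'YYYYMMDD' (sketch)."""
    req = Request(url, method='HEAD')
    with urlopen(req) as res:
        lastmod = parsedate_to_datetime(res.headers['Last-Modified'])
    return lastmod.strftime('%Y%m%d')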
def main(no_confirm=True):
    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'uniprot'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILE_PATH)
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or
                len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'uniprot_dump.log')
    setup_logfile(logfile)

    # mark the download starts
    doc = {
        '_id': 'uniprot',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': lastmodified,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(no_confirm)

    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'uniprot'}, {'$set': _updates})
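# Every dumper here obtains the shared status collection via get_src_dump().
# A minimal pymongo-based sketch of such an accessor; host/port/db names are
# placeholders, not this project's actual config. Note the scripts above use
# the legacy save()/update() collection methods, so they assume an older
# pymongo release.
from pymongo import MongoClient

def get_src_dump(host='localhost', port=27017, dbname='src_db'):  # placeholder config
    """Return the `src_dump` collection used to track dump status (sketch)."""
    client = MongoClient(host, port)
    return client[dbname]['src_dump']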