def __init__(self, resource: resources_description.ResourceDescription, thread_pool: SharedThreadPool,
             progress_records_base_path=os.getcwd(), on_start=None, on_doc=None, on_done=None,
             iterate_all=False, redo_failed_chunks=False):
    super().__init__()
    self.resource = resource
    # Per-resource progress records, failed-chunk file and count file live under a
    # common 'ws-iterator-progress' directory.
    base_path = os.path.join(progress_records_base_path, 'ws-iterator-progress')
    self.progress_records_base_path = os.path.join(base_path, self.resource.res_name)
    os.makedirs(self.progress_records_base_path, exist_ok=True)
    self.fail_file = os.path.join(base_path, self.resource.res_name + '-failed-chunks')
    self.count_file = os.path.join(base_path, self.resource.res_name + '-count')
    # Build the paginated web-service URL template: {0} = resource name, {1} = offset.
    ws_endpoint = resources_description.WS_URL_TO_USE
    parsed_url = urlparse(ws_endpoint)
    self.domain = parsed_url.scheme + '://' + parsed_url.netloc
    self.base_url_path = parsed_url.path + '/{0}.json?offset={1}&limit=' + str(ResourceIterator.LIMIT)
    self.stop = False
    signal_handler.add_termination_handler(self.stop_iterator)
    self.scheduled_tasks = []
    self.total_count = 0
    self.iterated_count = 0
    self.count_future = None
    self.progress_bar = None
    self.thread_pool = thread_pool
    self.scheduled_tasks_count = 0
    self.sync_lock = Lock()
    self.iterate_all = iterate_all
    self.redo_failed_chunks = redo_failed_chunks
    # Callable parameters check
    self.on_start = None
    if on_start:
        if callable(on_start):
            self.on_start = on_start
        else:
            raise Exception('on_start parameter is not callable')
    self.on_doc = None
    if on_doc:
        if callable(on_doc):
            self.on_doc = on_doc
        else:
            raise Exception('on_doc parameter is not callable')
    self.on_done = None
    if on_done:
        if callable(on_done):
            self.on_done = on_done
        else:
            raise Exception('on_done parameter is not callable')
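# --- Hedged usage sketch (not part of the class) ---
# Shows one way the constructor above might be wired up. `activity_res` stands for any
# resources_description.ResourceDescription instance, and the `handle_doc` signature is
# an assumption: the constructor only checks that the callbacks are callable.
def handle_doc(doc):
    print(doc)

shared_pool = SharedThreadPool(max_workers=4, label='ws-iterator-pool')
activity_iterator = ResourceIterator(
    activity_res,                       # assumed ResourceDescription instance
    shared_pool,
    progress_records_base_path='/tmp',  # progress files go under /tmp/ws-iterator-progress
    on_doc=handle_doc,
    iterate_all=True,
    redo_failed_chunks=False,
)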
def __init__(self, max_workers=10, label='SharedThreadPool'):
    super().__init__()
    self.max_workers = max_workers
    # Allow up to 1000 queued tasks per worker before callers must wait.
    self.max_queue_size = max_workers * 1000
    self.current_tasks_queue = []
    self.thread_pool = ThreadPoolExecutor(max_workers=max_workers)
    self.stop_thread_pool = False
    self.label = label
    signal_handler.add_termination_handler(self.stop_pool)
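# --- Hedged sizing note (a sketch, not part of the class) ---
# The constructor above bounds the pending-task queue at 1000 tasks per worker thread,
# so the ceiling grows linearly with max_workers.
pool = SharedThreadPool(max_workers=8, label='replication-pool')
assert pool.max_queue_size == 8 * 1000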
def run(self):
    signal_handler.add_termination_handler(self.stop_submitter)
    self.submission_pb = progress_bar_handler.get_new_progressbar('ES-bulk-submitter', 1)
    self.submission_pool.start()
    cur_low_counts = 0
    while not self.stop_submission:
        max_count = self.get_max_queue_count()
        # Flush when the largest queue holds at least five full bulk requests, or when a
        # non-empty queue has stayed below that threshold for more than ten polls.
        if max_count >= self.max_docs_per_request * 5 or (max_count > 0 and cur_low_counts > 10):
            cur_low_counts = 0
            self.check_and_submit_queues()
        else:
            if max_count > 0:
                cur_low_counts += 1
            time.sleep(1)
        sys.stderr.flush()
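# --- Illustrative restatement of the flush rule in run() above (a sketch) ---
# Queues are flushed when the largest queue already holds five full bulk requests, or
# when a non-empty queue has stayed below that threshold for more than ten consecutive
# one-second polls. The default of 1000 docs per request is an assumption for illustration.
def should_flush(max_count, low_polls, max_docs_per_request=1000):
    return max_count >= max_docs_per_request * 5 or (max_count > 0 and low_polls > 10)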
def get_new_progressbar(name, max_val=1) -> ProgressBar:
    global PROGRESS_BAR_IDX, PROGRESS_BAR_REQUESTED
    # On the first bar: register cleanup handlers and clear the terminal so the
    # positioned writers start from a known screen state.
    if PROGRESS_BAR_IDX == 0:
        signal_handler.add_termination_handler(on_exit)
        atexit.register(on_exit, *[None, None])
    PROGRESS_BAR_REQUESTED = True
    if PROGRESS_BAR_IDX == 0:
        print(term.clear)
    # Each bar writes to its own terminal row.
    writer = Writer((0, PROGRESS_BAR_IDX))
    p_bar = ProgressBar(
        widgets=[
            name + ': ',
            Counter(format='%(value)d out of %(max_value)d'),
            ' ', Percentage(),
            ' ', Bar(),
            ' ', ETA()
        ],
        fd=writer,
        max_value=max_val
    ).start(max_value=max_val)
    PROGRESS_BAR_IDX += 1
    return p_bar
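# --- Hedged usage sketch for get_new_progressbar ---
# Each call claims the next terminal row, so bars created from different threads can
# update side by side. update() and finish() are the standard progressbar2 ProgressBar calls.
pb = get_new_progressbar('activity-download', max_val=250)
for i in range(250):
    pb.update(i + 1)
pb.finish()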
CACHING_PB_COUNT = 0
RDKIT_CACHE = {}
INDIGO_CACHE = {}
SVG_FAILURES = {}
STOP_SCAN = False
BASE_CACHE_PATH = os.path.join(os.getcwd(), 'svg-files-cache')
os.makedirs(BASE_CACHE_PATH, exist_ok=True)


def stop_scan(signal, frame):
    global STOP_SCAN
    STOP_SCAN = True
    es_util.stop_scan(signal, frame)


signal_handler.add_termination_handler(stop_scan)


@synchronized
def register_fail(molecule_chembl_id, framework):
    # Record which rendering framework failed for a given molecule.
    global SVG_FAILURES
    if molecule_chembl_id not in SVG_FAILURES:
        SVG_FAILURES[molecule_chembl_id] = []
    SVG_FAILURES[molecule_chembl_id].append(framework)


def get_svg_by_chembl_id(molecule_chembl_id, indigo=False):
    global BASE_WS_URL, CACHING_PB, CACHING_PB_COUNT, RDKIT_CACHE, INDIGO_CACHE, STOP_SCAN, BASE_CACHE_PATH, \
        SVG_FAILURES
    if STOP_SCAN:
        return None
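# --- Hedged usage sketch for get_svg_by_chembl_id ---
# Requests both renderings for one molecule; CHEMBL25 is an arbitrary example ID.
# Only the STOP_SCAN early-return is shown above, so treating None as "no SVG available"
# is an assumption about the rest of the function.
rdkit_svg = get_svg_by_chembl_id('CHEMBL25')
indigo_svg = get_svg_by_chembl_id('CHEMBL25', indigo=True)
if rdkit_svg is None:
    print('No RDKit SVG for CHEMBL25', file=sys.stderr)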
def main():
    t_ini = time.time()
    parser = argparse.ArgumentParser(
        description="Denormalize ChEMBL data existing in Elastic Search")
    parser.add_argument("--host",
                        dest="es_host",
                        help="Elastic Search Hostname or IP address.",
                        default="localhost")
    parser.add_argument("--user",
                        dest="es_user",
                        help="Elastic Search username.",
                        default=None)
    parser.add_argument("--password",
                        dest="es_password",
                        help="Elastic Search username password.",
                        default=None)
    parser.add_argument("--port",
                        dest="es_port",
                        help="Elastic Search port.",
                        default=9200)
    parser.add_argument(
        "--unichem",
        dest="denormalize_unichem",
        help="If included will denormalize the UniChem related data.",
        action="store_true",
    )
    parser.add_argument(
        "--activity",
        dest="denormalize_activity",
        help="If included will denormalize the configured activity related data.",
        action="store_true",
    )
    parser.add_argument(
        "--compound_hierarchy",
        dest="denormalize_compound_hierarchy",
        help="If included will denormalize the Compound Hierarchy data.",
        action="store_true",
    )
    parser.add_argument(
        "--mechanism_and_drug_indication",
        dest="denormalize_mechanism_and_drug_indication",
        help="If included will denormalize the Mechanism and Drug Indication data.",
        action="store_true",
    )
    args = parser.parse_args()

    es_util.setup_connection(args.es_host, args.es_port, args.es_user, args.es_password)
    es_util.bulk_submitter.start()
    signal_handler.add_termination_handler(es_util.stop_scan)

    # Exactly one denormalization mode runs per invocation; with no flags the default is
    # to denormalize everything except activity data.
    dn_type = None
    if args.denormalize_compound_hierarchy:
        denormalize_compound_hierarchy()
        dn_type = 'COMPOUND-HIERARCHY'
    elif args.denormalize_activity:
        denormalize_activity()
        dn_type = 'ACTIVITY'
    elif args.denormalize_unichem:
        denormalize_unichem()
        dn_type = 'UNICHEM'
    elif args.denormalize_mechanism_and_drug_indication:
        denormalize_mechanism_and_drug_indication()
        dn_type = 'MECHANISMS-AND-DRUG-INDICATION'
    else:
        denormalize_all_but_activity()
        dn_type = 'ALL-NO-ACTIVITY'

    end_msg = 'DENORMALIZATION FOR "{}" FINISHED'.format(dn_type)
    es_util.bulk_submitter.join()
    glados.es.ws2es.progress_bar_handler.write_after_progress_bars()

    total_time = time.time() - t_ini
    sec = timedelta(seconds=total_time)
    d = datetime(1, 1, 1) + sec
    print(end_msg, file=sys.stderr)
    print(
        "Finished in: {0} Day(s), {1} Hour(s), {2} Minute(s) and {3} Second(s)"
        .format(d.day - 1, d.hour, d.minute, d.second),
        file=sys.stderr)
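# --- Example invocations (a sketch; the script name denormalize.py is an assumption,
# the flags are the ones defined in main() above) ---
#   python denormalize.py --host my-es-host --port 9200 --activity
#   python denormalize.py --host my-es-host --compound_hierarchy
#   python denormalize.py                      # no flags: ALL-NO-ACTIVITY mode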
        resource_desc.res_name)
    doc_ids = list(dn_dict.keys())
    p_bar = progress_bar_handler.get_new_progressbar(progressbar_name, len(dn_dict))
    entity_dn_count = 0
    for doc_id_i in doc_ids:
        if DenormalizationHandler.STOP:
            return
        update_doc, update_size = get_update_script_and_size(doc_id_i, dn_dict[doc_id_i])
        # Index instead of update if requested
        if do_index:
            es_util.index_doc_bulk(resource_desc.idx_name, doc_id_i, update_doc)
        else:
            es_util.update_doc_bulk(resource_desc.idx_name, doc_id_i, doc=update_doc)
        entity_dn_count += 1
        p_bar.update(entity_dn_count)
    es_util.bulk_submitter.finish_current_queues()
    p_bar.finish()


signal_handler.add_termination_handler(DenormalizationHandler.stop_denormalization)
import glados.es.ws2es.signal_handler as signal_handler
import requests
import sys
import zlib

BASE_EBI_URL = 'https://www.ebi.ac.uk'
UNICHEM_FTP_URL = 'http://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/src_id1/src1src{0}.txt.gz'

STOP_LOAD = False


def stop_unichem(signal, frame):
    global STOP_LOAD
    STOP_LOAD = True


signal_handler.add_termination_handler(stop_unichem)

UNICHEM_MAPPING = {
    'properties': {
        '_metadata': {
            'properties': {
                'unichem': {
                    'properties': {
                        'id': DefaultMappings.ID,
                        'src_name': DefaultMappings.KEYWORD,
                        'link': DefaultMappings.NO_INDEX_KEYWORD,
                        'src_url': DefaultMappings.NO_INDEX_KEYWORD,
                    }
                }
            }
        }
    }
}
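# --- Hedged sketch of the UniChem FTP URL template above ---
# The {0} placeholder is the UniChem source id; 22 is an arbitrary example value.
example_url = UNICHEM_FTP_URL.format(22)
# -> http://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/src_id1/src1src22.txt.gz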
def main():
    t_ini = time.time()
    parser = argparse.ArgumentParser(
        description="Replicate ChEMBL data existing in Elastic Search from origin to destination."
    )
    parser.add_argument("--delete_indexes", dest="delete_indexes",
                        help="Delete indexes if they exist already in the elastic cluster.",
                        action="store_true")
    parser.add_argument("--skip-update-mappings", dest="skip_update_mappings",
                        help="Does not attempt to update the mappings in the destination cluster.",
                        action="store_true")
    parser.add_argument("--monitoring", dest="monitoring",
                        help="Replicate the monitoring indexes.",
                        action="store_true")
    parser.add_argument("--resource", dest="es_resource",
                        help="Resource to iterate; if not specified, all the resources are iterated.",
                        default=None)
    parser.add_argument("--es-major-version-origin", dest="es_major_version_origin",
                        help="Elastic Search class to use for the origin cluster.",
                        default=CURRENT_ES_VERSION)
    parser.add_argument("--host-origin", dest="es_host_origin",
                        help="Elastic Search Hostname or IP address for the origin cluster.",
                        default="localhost")
    parser.add_argument("--user-origin", dest="es_user_origin",
                        help="Elastic Search username for the origin cluster.",
                        default=None)
    parser.add_argument("--password-origin", dest="es_password_origin",
                        help="Elastic Search username password for the origin cluster.",
                        default=None)
    parser.add_argument("--port-origin", dest="es_port_origin",
                        help="Elastic Search port for the origin cluster.",
                        default=9200)
    parser.add_argument("--host-destination", dest="es_host_destination",
                        help="Elastic Search Hostname or IP address for the destination cluster.",
                        default="localhost")
    parser.add_argument("--user-destination", dest="es_user_destination",
                        help="Elastic Search username for the destination cluster.",
                        default=None)
    parser.add_argument("--password-destination", dest="es_password_destination",
                        help="Elastic Search username password for the destination cluster.",
                        default=None)
    parser.add_argument("--port-destination", dest="es_port_destination",
                        help="Elastic Search port for the destination cluster.",
                        default=9200)
    args = parser.parse_args()

    # The origin cluster may run an older Elastic Search major version than the destination.
    try:
        args.es_major_version_origin = int(args.es_major_version_origin)
        assert args.es_major_version_origin <= CURRENT_ES_VERSION
    except (ValueError, AssertionError):
        traceback.print_exc()
        print(
            'ERROR: Major version for elastic "{0}" is not valid, it must be an integer lower than or equal to {1}.'
            .format(args.es_major_version_origin, CURRENT_ES_VERSION),
            file=sys.stderr
        )
        sys.exit(1)

    print('ORIGIN:')
    print(args.es_host_origin, args.es_port_origin, args.es_user_origin)
    print('DESTINATION:')
    print(args.es_host_destination, args.es_port_destination, args.es_user_destination)

    selected_resources = None
    if args.es_resource:
        selected_resources = args.es_resource.split(',')

    resources_to_run = resources_description.ALL_MONITORING_RESOURCES if args.monitoring else \
        resources_description.ALL_RELEASE_RESOURCES
    if selected_resources:
        resources_to_run = []
        for resource_i_str in selected_resources:
            resource_i = resources_description.RESOURCES_BY_RES_NAME.get(resource_i_str, None)
            if resource_i is None:
                print('Unknown resource {0}'.format(resource_i_str), file=sys.stderr)
                sys.exit(1)
            resources_to_run.append(resource_i)

    if args.es_host_origin == args.es_host_destination and args.es_port_origin == args.es_port_destination:
        print('ERROR: Origin and destination clusters are the same.')
        return

    if args.delete_indexes:
        if not query_yes_no("This procedure will delete and create all indexes again in the destination server.\n"
                            "Do you want to proceed?", default="no"):
            return

    es_util_origin = ESUtil(es_major_version=args.es_major_version_origin)
    es_util_origin.setup_connection(
        args.es_host_origin, args.es_port_origin, args.es_user_origin, args.es_password_origin
    )
    es_util_destination = ESUtil()
    es_util_destination.setup_connection(
        args.es_host_destination, args.es_port_destination, args.es_user_destination, args.es_password_destination
    )

    ping_failed = False
    if not es_util_origin.ping():
        print('ERROR: Ping failed to origin cluster.', file=sys.stderr)
        ping_failed = True
    if not es_util_destination.ping():
        print('ERROR: Ping failed to destination cluster.', file=sys.stderr)
        ping_failed = True
    if ping_failed:
        return

    es_util_destination.bulk_submitter.start()
    signal_handler.add_termination_handler(es_util_origin.stop_scan)
    signal_handler.add_termination_handler(es_util_destination.stop_scan)
    signal_handler.add_termination_handler(es_util_destination.bulk_submitter.stop_submitter)

    replicate_clusters(
        es_util_origin, es_util_destination, resources_to_run=resources_to_run,
        delete_dest_idx=args.delete_indexes, skip_update_mappings=args.skip_update_mappings
    )
    es_util_destination.bulk_submitter.finish_current_queues()
    es_util_destination.bulk_submitter.join()
    pbh.write_after_progress_bars()

    end_msg = 'REPLICATION FINISHED'
    total_time = time.time() - t_ini
    sec = timedelta(seconds=total_time)
    d = datetime(1, 1, 1) + sec
    print(end_msg, file=sys.stderr)
    print(
        "Finished in: {0} Day(s), {1} Hour(s), {2} Minute(s) and {3} Second(s)"
        .format(d.day - 1, d.hour, d.minute, d.second),
        file=sys.stderr
    )
    check_origin_vs_destination_counts(es_util_origin, es_util_destination, resources_to_run=resources_to_run)
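# --- Example invocation (a sketch; the script name replicate.py is an assumption,
# the flags are the ones defined in main() above) ---
#   python replicate.py \
#       --host-origin es-origin.example.org --port-origin 9200 \
#       --host-destination es-dest.example.org --port-destination 9200 \
#       --resource molecule,target --delete_indexes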