def main():
    """Main entry point.

    Starts the node subprocesses, merges their sorted output streams
    into one sorted stream, and manages them until shutdown.

    Returns 0 for success (exit code), 1 on unhandled exception.
    """
    exit_code = 0
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    node_generators = _start_subprocesses(halt_event)
    # heapq.merge yields the generators' (already sorted) outputs
    # as a single sorted stream
    merge_manager = heapq.merge(*node_generators)

    try:
        _manage_subprocesses(halt_event, merge_manager)
    except Exception as instance:
        log.exception(instance)
        exit_code = 1

    return exit_code
def main():
    """Main entry point: run the test suite against the zfec server.

    Connects a REQ socket to the zfec server and runs the tests over it.
    Returns 0 on success, 1 if the test run raises.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    zeromq_context = zmq.Context()

    req_socket = zeromq_context.socket(zmq.REQ)
    # bounded linger so close() cannot block forever on unsent messages
    req_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting req socket to {0}".format(_zfec_server_address))
    req_socket.connect(_zfec_server_address)

    exit_code = 0
    try:
        success_count, failure_count = _run_tests(req_socket)
    except Exception as instance:
        log.exception(instance)
        exit_code = 1
    else:
        log.info("terminates normally {0} successes {1} failures".format(
            success_count, failure_count))
    finally:
        req_socket.close()
        zeromq_context.term()

    return exit_code
def main():
    """Main entry point.

    Starts the node subprocesses, then blocks until the halt event is
    set (normally by a signal) and terminates all subprocesses.

    Returns 0 for success (exit code), 1 on unhandled exception.
    """
    exit_code = 0
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    node_subprocesses = _start_subprocesses(halt_event)

    try:
        # block until a signal (or error) sets the halt event
        halt_event.wait()
    except Exception as instance:
        log.exception(instance)
        exit_code = 1

    for node_subprocess in node_subprocesses:
        node_subprocess.terminate()

    return exit_code
def main():
    """Main entry point for the garbage collector.

    Scans candidate partitions, evaluates which segment rows are
    collectable, and archives the collectable rows.

    Returns 0 for success, -1 if the database connection fails,
    -2 if collection fails (exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        return -1

    zmq_context = zmq.Context()
    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0
    collectable_segment_ids = io.StringIO()
    partition_count = 0
    collectable_count = 0
    try:
        versioned_collections = get_versioned_collections()
        for partition in generate_candidate_partitions(connection):
            partition_count += 1
            # a partition is versioned if its collection is in the
            # versioned set
            versioned_collection = \
                partition[0].collection_id in versioned_collections
            count = _evaluate_partition(collectable_segment_ids,
                                        partition,
                                        versioned_collection)
            collectable_count += count

        archive_collectable_segment_rows(connection,
                                         collectable_segment_ids,
                                         options.max_node_offline_time)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info(
            "found {0:,} candidates, collected {1:,} segments".format(
                partition_count, collectable_count))
        log.info("program terminates normally")
    finally:
        # bug fix: the buffer was only closed on the success path,
        # leaking it when collection raised; cleanup now always runs
        collectable_segment_ids.close()
        connection.close()
        event_push_client.close()
        zmq_context.term()

    return return_code
def init_setup():
    """Initialize logging and spawn the global router.

    Side effect: binds the module-global _ROUTER and schedules its
    init() on the gevent hub.
    """
    initialize_logging(LOG_PATH)
    log = logging.getLogger("init_setup")
    log.info("setup start")

    global _ROUTER
    _ROUTER = Router()
    # spawn_later(0.0, ...) defers init until the hub next runs
    gevent.spawn_later(0.0, _ROUTER.init)

    log.info("setup complete")
def main():
    """Main entry point for the cache-update subscriber.

    Receives multipart messages (topic, meta, optional data) on a SUB
    socket and feeds them to _process_one_event until halted.

    Returns 0 for normal termination (usually SIGTERM), 1 on error.
    """
    return_value = 0
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    memcached_client = memcache.Client(_memcached_nodes)

    zeromq_context = zmq.Context()
    sub_socket = _create_sub_socket(zeromq_context)

    # last sequence number seen per channel; None until the first event
    expected_sequence = {
        _cache_update_channel : None,
    }

    while not halt_event.is_set():
        try:
            topic = sub_socket.recv()
            assert sub_socket.rcvmore
            meta = sub_socket.recv()
            # the data frame is optional
            if sub_socket.rcvmore:
                data = sub_socket.recv()
            else:
                data = ""
            _process_one_event(memcached_client,
                               expected_sequence,
                               topic,
                               meta,
                               data)
        except KeyboardInterrupt: # convenience for testing
            log.info("keyboard interrupt: terminating normally")
            halt_event.set()
        except zmq.ZMQError as zmq_error:
            if is_interrupted_system_call(zmq_error) and halt_event.is_set():
                log.info("interrupted system call - ok at shutdown")
            else:
                log.exception("zeromq error processing request")
                return_value = 1
            halt_event.set()
        except Exception:
            log.exception("error processing request")
            return_value = 1
            halt_event.set()

    sub_socket.close()
    zeromq_context.term()

    # fix typo: message previously read "program teminates"
    log.info("program terminates: return value = {0}".format(return_value))
    return return_value
def main():
    """Main entry point for the handoff client.

    Finds conjoined and segment handoff rows destined for the named
    node and processes them.

    Returns 0 for success, 1 on unhandled exception (exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    args = parse_commandline()

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_push_client = EventPushClient(zeromq_context, "handoff_client")
    event_push_client.info("program-start", "handoff_client starts")

    return_code = 0
    node_databases = None
    try:
        node_dict = get_node_ids(args.node_name)
        node_databases = get_node_databases()
        conjoined_rows, segment_rows = \
            get_handoff_rows(node_databases, node_dict[args.node_name])
        log.info("found {0} conjoined and {1} segment handoffs".format(
            len(conjoined_rows), len(segment_rows)))
        if len(conjoined_rows) > 0:
            process_conjoined_rows(halt_event,
                                   args,
                                   node_databases,
                                   conjoined_rows)
        if len(segment_rows) > 0:
            process_segment_rows(halt_event,
                                 zeromq_context,
                                 args,
                                 node_dict,
                                 node_databases,
                                 segment_rows)
    except Exception as instance:
        # fix typo: message previously read "Uhandled exception"
        log.exception("Unhandled exception {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return_code = 1

    if node_databases is not None:
        for connection in node_databases.values():
            connection.close()

    event_push_client.close()
    zeromq_context.term()

    log.info("program terminates return_code = {0}".format(return_code))
    return return_code
def main():
    """Main entry point for the cache-update subscriber.

    Pulls multipart events (topic, meta, optional data frame) off a SUB
    socket and hands each one to _process_one_event until the halt
    event is set.

    Returns 0 for normal termination (usually SIGTERM), 1 on error.
    """
    return_value = 0
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    memcached_client = memcache.Client(_memcached_nodes)

    zeromq_context = zmq.Context()
    sub_socket = _create_sub_socket(zeromq_context)

    # per-channel expected sequence number; None until the first event
    expected_sequence = {
        _cache_update_channel: None,
    }

    while not halt_event.is_set():
        try:
            topic = sub_socket.recv()
            assert sub_socket.rcvmore
            meta = sub_socket.recv()
            # a trailing data frame may or may not be present
            if sub_socket.rcvmore:
                data = sub_socket.recv()
            else:
                data = ""
            _process_one_event(memcached_client,
                               expected_sequence,
                               topic,
                               meta,
                               data)
        except KeyboardInterrupt: # convenience for testing
            log.info("keyboard interrupt: terminating normally")
            halt_event.set()
        except zmq.ZMQError as zmq_error:
            if is_interrupted_system_call(zmq_error) and halt_event.is_set():
                log.info("interrupted system call - ok at shutdown")
            else:
                log.exception("zeromq error processing request")
                return_value = 1
            halt_event.set()
        except Exception:
            log.exception("error processing request")
            return_value = 1
            halt_event.set()

    sub_socket.close()
    zeromq_context.term()

    # fix typo: message previously read "program teminates"
    log.info("program terminates: return value = {0}".format(return_value))
    return return_value
def main():
    """Main entry point for the garbage collector.

    Walks candidate partitions, decides which segment rows are
    collectable, and archives the collectable rows.

    Returns 0 for success, -1 if the database connection fails,
    -2 if collection fails (exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        return -1

    zmq_context = zmq.Context()
    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0
    collectable_segment_ids = io.StringIO()
    partition_count = 0
    collectable_count = 0
    try:
        versioned_collections = get_versioned_collections()
        for partition in generate_candidate_partitions(connection):
            partition_count += 1
            # versioned iff the partition's collection is in the set
            versioned_collection = \
                partition[0].collection_id in versioned_collections
            count = _evaluate_partition(collectable_segment_ids,
                                        partition,
                                        versioned_collection)
            collectable_count += count

        archive_collectable_segment_rows(connection,
                                         collectable_segment_ids,
                                         options.max_node_offline_time)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("found {0:,} candidates, collected {1:,} segments".format(
            partition_count, collectable_count))
        log.info("program terminates normally")
    finally:
        # bug fix: close() previously ran only on the success path,
        # leaking the StringIO buffer when collection raised
        collectable_segment_ids.close()
        connection.close()
        event_push_client.close()
        zmq_context.term()

    return return_code
def main():
    """Main entry point for the web public reader server.

    Returns 0 on normal termination, -1 on unhandled exception.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")

    halt_event = Event()
    gevent.signal(signal.SIGTERM, _signal_handler_closure(halt_event))

    try:
        web_public_reader = WebPublicReaderServer(halt_event)
        web_public_reader.start()
    # fix: 'except Exception, instance' is Python-2-only syntax
    # (SyntaxError on Python 3); the 'as' form works on 2.6+ and 3.x
    except Exception as instance:
        log.exception(str(instance))
        return -1

    # explicit success code; previously fell through returning None
    # (sys.exit(None) also exits 0, but explicit is clearer)
    return 0
def main():
    """Main entry point for the value-file garbage collector.

    Unlinks totally-unused and unreachable value files, then rewrites
    the remaining value files to reclaim space.

    Returns 0 for success, -1 on database connection failure,
    -2 if collection fails (exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception:
        log.exception("Exception connecting to database")
        return -1

    zmq_context = zmq.Context()
    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0
    try:
        total_unused_value_file_size = unlink_totally_unused_value_files(
            connection, _repository_path)
        unreachable_value_file_size = unlink_unreachable_value_files(
            connection, _repository_path)
        ref_generator = generate_value_file_references(options, connection)
        savings = rewrite_value_files(options,
                                      connection,
                                      _repository_path,
                                      ref_generator)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("program terminates normally")
        # report the space reclaimed by each phase
        event_push_client.info(
            "rewrite complete",
            "garbage_collector finished",
            unused_value_file_bytes_reclaimed=total_unused_value_file_size,
            unreachable_value_file_bytes_reclaimed=unreachable_value_file_size,
            rewrite_value_file_savings=savings
        )

    connection.close()
    event_push_client.close()
    zmq_context.term()

    return return_code
def main():
    """Main entry point for the value-file garbage collector.

    Reclaims space in three phases: drop totally-unused value files,
    drop unreachable value files, then rewrite what remains.

    Returns 0 for success, -1 on database connection failure,
    -2 if collection fails (exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception:
        log.exception("Exception connecting to database")
        return -1

    zmq_context = zmq.Context()
    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0
    try:
        total_unused_value_file_size = \
            unlink_totally_unused_value_files(connection, _repository_path)
        unreachable_value_file_size = \
            unlink_unreachable_value_files(connection, _repository_path)
        ref_generator = generate_value_file_references(options, connection)
        savings = rewrite_value_files(
            options, connection, _repository_path, ref_generator)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("program terminates normally")
        # publish the per-phase reclaimed byte counts
        event_push_client.info(
            "rewrite complete",
            "garbage_collector finished",
            unused_value_file_bytes_reclaimed=total_unused_value_file_size,
            unreachable_value_file_bytes_reclaimed=unreachable_value_file_size,
            rewrite_value_file_savings=savings)

    connection.close()
    event_push_client.close()
    zmq_context.term()

    return return_code
def main():
    """Main processing module for the web monitor.

    Reads pinger configuration from the path in argv[1], starts a redis
    sink greenlet plus one Pinger greenlet per config entry, then waits
    on the halt event (set by SIGTERM or a sink failure).

    Returns the module-global _return_code (0 normally, 1 on error).
    """
    global _return_code
    initialize_logging(_log_path)
    log = logging.getLogger("main")

    halt_event = Event()
    gevent.signal(signal.SIGTERM, _handle_sigterm, halt_event)

    log.info("program starts")

    redis_queue = gevent.queue.Queue()

    greenlets = list()
    try:
        config_path = sys.argv[1]
        log.info("reading config from '{0}'".format(config_path))
        with open(config_path) as input_file:
            config = json.load(input_file)

        # the sink drains the queue the pingers feed
        redis_sink = WebMonitorRedisSink(halt_event, redis_queue)
        redis_sink.link_exception(_redis_exception_closure(halt_event))
        redis_sink.start()
        greenlets.append(redis_sink)

        for config_entry in config:
            pinger = Pinger(halt_event,
                            _polling_interval,
                            redis_queue,
                            config_entry)
            pinger.link_exception(_unhandled_greenlet_exception)
            pinger.start()
            greenlets.append(pinger)
    except Exception as instance:
        log.exception(instance)
        _return_code = 1

    # wait here while the pingers do their job
    halt_event.wait()

    for entry in greenlets:
        entry.join(timeout=3.0)

    log.info("program terminates return code {0}".format(_return_code))
    return _return_code
def setUp(self):
    """Reset collection stats and launch the space accounting server."""
    initialize_logging(_log_path)
    self.tearDown()  # clear out any old stats

    space_accounting_database = SpaceAccountingDatabase()
    space_accounting_database.clear_collection_stats(_collection_id)
    space_accounting_database.commit()

    self._space_accounting_server_process = start_space_accounting_server(
        _local_node_name,
        _space_accounting_server_address,
        _space_accounting_pipeline_address)

    # the server must still be running: poll() is None while alive
    poll_result = poll_process(self._space_accounting_server_process)
    self.assertEqual(poll_result, None)
def main():
    """Main entry point for cluster repair.

    Connects a REQ socket to the zfec server, starts the read and
    write subprocesses, and runs the repair loop until done or halted.

    Returns 0 for success, -3 on unhandled exception (exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_repair")
    event_push_client.info("program-start", "cluster_repair starts")

    zfec_server_req_socket = zmq_context.socket(zmq.REQ)
    zfec_server_req_socket.setsockopt(zmq.LINGER, 1000)
    log.info("connecting req socket to {0}".format(_zfec_server_address))
    zfec_server_req_socket.connect(_zfec_server_address)

    read_subprocess = _start_read_subprocess()
    write_subprocess = _start_write_subprocess()

    try:
        _repair_cluster(halt_event,
                        zfec_server_req_socket,
                        read_subprocess,
                        write_subprocess)
    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -3
    finally:
        # always reap the helper subprocesses and release zeromq state
        read_subprocess.terminate()
        write_subprocess.terminate()
        event_push_client.close()
        zfec_server_req_socket.close()
        zmq_context.term()

    log.info("program terminates normally")
    return 0
def main():
    """Main entry point for cluster simulator.

    Returns 0 on normal exit, 1 if the sanity check fails.
    """
    args = parse_cmdline()
    config = ClusterConfig(args)
    # fix: 'print x' statements are Python-2-only syntax; the
    # parenthesized form works on both 2.x and 3.x
    print(repr(args))

    if not sanity_check(config):
        return 1

    if config.createnew:
        ensure_paths(config)
        old_config = config
    else:
        old_config = config
        config = ClusterConfig.load(config)

    if old_config.logprune:
        remove_files(config.log_path)

    # save() sets createnew to false
    createnew = config.createnew

    if old_config.createnew and not config.systemdb:
        config.database_users.update(create_database(config))
        print("Saving config to %s" % (config.config_path, ))
        config.save()
    elif not config.systemdb:
        start_database(config)

    os.environ.update(dict(config.env_for_cluster()))

    log_path = os.path.join(config.log_path, _log_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    # fix typo: message previously read "progam starts"
    log.info("program starts")

    log.info("entering main loop")
    command_interpreter = CommandInterpreter(config, createnew)
    if old_config.start:
        command_interpreter.do_start("all")
    print("Web servers at: %s" % (", ".join(config.web_server_urls), ))
    command_interpreter.cmdloop("sim.nimbus.io")
    log.info("leaving main loop")

    log.info("program ends normally")
    return 0
def main():
    """Main entry point for the handoff client.

    Looks up conjoined and segment handoff rows for the node named on
    the command line and processes each group.

    Returns 0 for success, 1 on unhandled exception (exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    args = parse_commandline()

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_push_client = EventPushClient(zeromq_context, "handoff_client")
    event_push_client.info("program-start", "handoff_client starts")

    return_code = 0
    node_databases = None
    try:
        node_dict = get_node_ids(args.node_name)
        node_databases = get_node_databases()
        conjoined_rows, segment_rows = \
            get_handoff_rows(node_databases, node_dict[args.node_name])
        log.info("found {0} conjoined and {1} segment handoffs".format(
            len(conjoined_rows), len(segment_rows)))
        if len(conjoined_rows) > 0:
            process_conjoined_rows(halt_event,
                                   args,
                                   node_databases,
                                   conjoined_rows)
        if len(segment_rows) > 0:
            process_segment_rows(halt_event,
                                 zeromq_context,
                                 args,
                                 node_dict,
                                 node_databases,
                                 segment_rows)
    except Exception as instance:
        # fix typo: message previously read "Uhandled exception"
        log.exception("Unhandled exception {0}".format(instance))
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_code = 1

    if node_databases is not None:
        for connection in node_databases.values():
            connection.close()

    event_push_client.close()
    zeromq_context.term()

    log.info("program terminates return_code = {0}".format(return_code))
    return return_code
def main():
    """Main entry point for cluster simulator.

    Returns 0 on normal exit, 1 if the sanity check fails.
    """
    args = parse_cmdline()
    config = ClusterConfig(args)
    # fix: bare 'print' statements are Python-2-only syntax; the
    # parenthesized form is valid on both 2.x and 3.x
    print(repr(args))

    if not sanity_check(config):
        return 1

    if config.createnew:
        ensure_paths(config)
        old_config = config
    else:
        old_config = config
        config = ClusterConfig.load(config)

    if old_config.logprune:
        remove_files(config.log_path)

    # save() sets createnew to false
    createnew = config.createnew

    if old_config.createnew and not config.systemdb:
        config.database_users.update(create_database(config))
        print("Saving config to %s" % (config.config_path, ))
        config.save()
    elif not config.systemdb:
        start_database(config)

    os.environ.update(dict(config.env_for_cluster()))

    log_path = os.path.join(config.log_path, _log_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    # fix typo: message previously read "progam starts"
    log.info("program starts")

    log.info("entering main loop")
    command_interpreter = CommandInterpreter(config, createnew)
    if old_config.start:
        command_interpreter.do_start("all")
    print("Web servers at: %s" % (", ".join(config.web_server_urls), ))
    command_interpreter.cmdloop("sim.nimbus.io")
    log.info("leaving main loop")

    log.info("program ends normally")
    return 0
def main():
    """Main entry point for the cluster inspector.

    Pulls segment data from all nodes into a clean work area and
    audits it.

    Returns 0 for success, -1 if halted during the pull, -3 on
    unhandled exception (exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_inspector")
    event_push_client.info("program-start", "cluster_inspector starts")

    # if there's any wreckage from a previous run, clear it out
    if os.path.exists(_work_dir):
        log.info("removing old {0}".format(_work_dir))
        shutil.rmtree(_work_dir)
    os.mkdir(_work_dir)

    return_code = 0
    try:
        pull_segments_from_nodes(halt_event, _work_dir)

        if halt_event.is_set():
            log.info("halt_event set (1): exiting")
            return_code = -1
        else:
            audit_segments(halt_event, _work_dir)
    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return_code = -3
    finally:
        # bug fix: the early 'return -1' / 'return -3' previously
        # skipped this cleanup, leaking the push client and the
        # zeromq context
        event_push_client.close()
        zmq_context.term()

    if return_code == 0:
        log.info("program terminates normally")
    return return_code
def main():
    """Main entry point for the cluster inspector.

    Rebuilds the work area, pulls segment data from all nodes into it,
    and audits the result.

    Returns 0 for success, -1 if halted during the pull, -3 on
    unhandled exception (exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_inspector")
    event_push_client.info("program-start", "cluster_inspector starts")

    # if there's any wreckage from a previous run, clear it out
    if os.path.exists(_work_dir):
        log.info("removing old {0}".format(_work_dir))
        shutil.rmtree(_work_dir)
    os.mkdir(_work_dir)

    return_code = 0
    try:
        pull_segments_from_nodes(halt_event, _work_dir)

        if halt_event.is_set():
            log.info("halt_event set (1): exiting")
            return_code = -1
        else:
            audit_segments(halt_event, _work_dir)
    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_code = -3
    finally:
        # bug fix: the early returns previously skipped this cleanup,
        # leaking the push client and the zeromq context
        event_push_client.close()
        zmq_context.term()

    if return_code == 0:
        log.info("program terminates normally")
    return return_code
def setUp(self):
    """Clear old collection stats and start the space accounting server."""
    initialize_logging(_log_path)
    self.tearDown()  # clear out any old stats

    space_accounting_database = SpaceAccountingDatabase()
    space_accounting_database.clear_collection_stats(_collection_id)
    space_accounting_database.commit()

    self._space_accounting_server_process = \
        start_space_accounting_server(_local_node_name,
                                      _space_accounting_server_address,
                                      _space_accounting_pipeline_address)

    # poll() returns None while the process is alive; assert it started
    poll_result = poll_process(self._space_accounting_server_process)
    self.assertEqual(poll_result, None)
def main():
    """Main entry point for cluster repair.

    Wires up the zfec server REQ socket, launches the read and write
    subprocesses, and drives the repair loop to completion.

    Returns 0 for success, -3 on unhandled exception (exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_repair")
    event_push_client.info("program-start", "cluster_repair starts")

    zfec_server_req_socket = zmq_context.socket(zmq.REQ)
    zfec_server_req_socket.setsockopt(zmq.LINGER, 1000)
    log.info("connecting req socket to {0}".format(_zfec_server_address))
    zfec_server_req_socket.connect(_zfec_server_address)

    read_subprocess = _start_read_subprocess()
    write_subprocess = _start_write_subprocess()

    try:
        _repair_cluster(halt_event,
                        zfec_server_req_socket,
                        read_subprocess,
                        write_subprocess)
    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return -3
    finally:
        # always stop the helpers and tear down zeromq state
        read_subprocess.terminate()
        write_subprocess.terminate()
        event_push_client.close()
        zfec_server_req_socket.close()
        zmq_context.term()

    log.info("program terminates normally")
    return 0
def main():
    """Main entry point.

    Sets up state, runs the pollster loop until the halt event is set,
    then tears down.

    Returns 0 on success, 1 if any phase raised.
    """
    returncode = 0
    initialize_logging(_log_path)
    log = logging.getLogger("main")

    state = _create_state()
    set_signal_handler(state["halt-event"])

    try:
        _setup(state)
    # idiom fix: replaces 'except Exception:' + sys.exc_info()[1];
    # 'except ... as' is valid on Python 2.6+ and 3.x
    except Exception as instance:
        log.exception("unhandled exception in _setup")
        log.critical("unhandled exception in _setup {0}".format(
            instance))
        state["halt-event"].set()
        returncode = 1

    log.debug("start halt_event loop")
    while not state["halt-event"].is_set():
        try:
            state["pollster"].run(state["halt-event"])
        except Exception as instance:
            log.exception("unhandled exception in pollster")
            log.critical("unhandled exception in pollster {0}".format(
                instance))
            state["halt-event"].set()
            returncode = 1
    log.debug("end halt_event loop")

    try:
        _tear_down(state)
    except Exception as instance:
        log.exception("unhandled exception in _tear_down")
        log.critical("unhandled exception in _tear_down {0}".format(
            instance))
        returncode = 1

    return returncode
def main():
    """Main entry point for the zfec server.

    Binds a REP socket (instance selected by optional argv[1]) and
    services requests until the halt event is set.

    Returns 0 for normal termination (usually SIGTERM), 1 on error.
    """
    return_value = 0

    # optional command-line argument selects the server instance number
    if len(sys.argv) == 1:
        server_number = 0
    else:
        server_number = int(sys.argv[1])

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name,
                                         server_number)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    rep_socket = _bind_rep_socket(zeromq_context)

    try:
        while not halt_event.is_set():
            _process_one_request(rep_socket)
    except ZfecServerInterrupedSystemCall:
        if halt_event.is_set():
            # fix typo: message previously read "program teminates"
            log.info("program terminates normally with interrupted "
                     "system call")
            return 0
        log.exception("error processing request")
        return_value = 1
    except Exception:
        log.exception("error processing request")
        return_value = 1
    else:
        # fix typo: message previously read "program teminates"
        log.info("program terminates normally")
    finally:
        rep_socket.close()
        zeromq_context.term()

    return return_value
def main():
    """Main entry point: process repair entries from one source node.

    argv[1] is an index into the node tables, selecting which node's
    anti-entropy data reader to connect to.

    Returns 0 on success, 1 on unhandled exception.
    """
    index = int(sys.argv[1])
    source_node_name = _node_names[index]
    data_reader_anti_entropy_address = \
        _data_reader_anti_entropy_addresses[index]

    log_path = "{0}/nimbusio_cluster_repair_data_reader_{1}_to_{2}.log".format(
        os.environ["NIMBUSIO_LOG_DIR"], source_node_name, _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts: reading from node {0}".format(source_node_name))

    zeromq_context = zmq.Context()

    req_socket = zeromq_context.socket(zmq.REQ)
    # bounded linger so close() cannot hang on unsent messages
    req_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting req socket to {0}".format(
        data_reader_anti_entropy_address))
    req_socket.connect(data_reader_anti_entropy_address)

    return_value = 0
    try:
        audit_records_processed = _process_repair_entries(index,
                                                          source_node_name,
                                                          req_socket)
    except Exception as instance:
        log.exception(instance)
        return_value = 1
    else:
        log.info("terminates normally {0} audit records processed".format(
            audit_records_processed))
    finally:
        req_socket.close()
        zeromq_context.term()

    return return_value
def main():
    """Main entry point: pull segment data from one node into work_dir.

    argv supplies [work_dir, index]; the index selects the source node
    and its database coordinates.

    Returns 0 on success, -1 on database connection failure,
    -2 if the pull fails.
    """
    work_dir, index_str = sys.argv[1:]
    index = int(index_str)
    node_name = _node_names[index]
    database_host = _node_database_hosts[index]
    database_port = _node_database_ports[index]
    database_password = _node_database_passwords[index]

    log_path = "{0}/nimbusio_segment_puller_from_{1}_to_{2}.log".format(
        os.environ["NIMBUSIO_LOG_DIR"], node_name, _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts: work_dir={0}, index={1}, {2}".format(
        work_dir, index, node_name))

    try:
        connection = get_node_connection(node_name,
                                         database_password,
                                         database_host,
                                         database_port)
    except Exception as instance:
        log.exception("Unable to connect to database {0}".format(instance))
        return -1

    try:
        _pull_segment_data(connection, work_dir, node_name)
        _pull_damaged_segment_data(connection, work_dir, node_name)
    except Exception as instance:
        log.exception("_pull_segment_data failed {0}".format(instance))
        return -2
    finally:
        connection.close()

    log.info("program terminates normally")
    return 0
def main():
    """Main entry point: copy one node's segment data into work_dir.

    Command line: [work_dir, index]; index picks the source node and
    the credentials for its database.

    Returns 0 on success, -1 if the database connection fails,
    -2 if pulling the data fails.
    """
    work_dir, index_str = sys.argv[1:]
    index = int(index_str)
    node_name = _node_names[index]
    database_host = _node_database_hosts[index]
    database_port = _node_database_ports[index]
    database_password = _node_database_passwords[index]

    log_path = "{0}/nimbusio_segment_puller_from_{1}_to_{2}.log".format(
        os.environ["NIMBUSIO_LOG_DIR"], node_name, _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts: work_dir={0}, index={1}, {2}".format(
        work_dir, index, node_name))

    try:
        connection = get_node_connection(node_name,
                                         database_password,
                                         database_host,
                                         database_port)
    except Exception as instance:
        log.exception("Unable to connect to database {0}".format(instance))
        return -1

    try:
        # regular segment data first, then the damaged-segment records
        _pull_segment_data(connection, work_dir, node_name)
        _pull_damaged_segment_data(connection, work_dir, node_name)
    except Exception as instance:
        log.exception("_pull_segment_data failed {0}".format(instance))
        return -2
    finally:
        connection.close()

    log.info("program terminates normally")
    return 0
def main():
    """Main entry point for the defragger.

    Loops until the halt event is set: (re)acquires a database
    connection when needed, runs one defrag pass per iteration inside a
    transaction, and sleeps between idle passes.

    Returns 0 for success (exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")

    connection = None
    file_space_info = None
    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception("database exception",
                                            str(value),
                                            exctype=exctype.__name__)
                log.exception("Exception connecting to database")
                halt_event.wait(_database_retry_interval)
                continue

            file_space_info = load_file_space_info(connection)
            file_space_sanity_check(file_space_info, _repository_path)

        # try one defrag pass
        bytes_defragged = 0
        connection.begin_transaction()
        try:
            bytes_defragged = _defrag_pass(connection,
                                           file_space_info,
                                           event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(unhandled_exception_topic,
                                        str(instance),
                                        exctype=instance.__class__.__name__)
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:
            # exit if we're done and asked to do single pass
            if int(os.environ.get('NIMBUSIO_EXIT_WHEN_DONE', '0')):
                halt_event.set()
            # close the database connection
            if connection is not None:
                connection.close()
                connection = None
            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()

    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
def main():
    """Main entry point for the node inspector.

    Parses the max value file time, walks work batches from the local
    database inside one transaction, and processes each batch.

    Returns 0 for success, -1 on any failure (exit code).
    """
    global _max_value_file_time
    initialize_logging(_log_path)
    log = logging.getLogger("main")

    try:
        _max_value_file_time = parse_timedelta_str(_max_value_file_time_str)
    except Exception as instance:
        log.exception("Unable to parse '{0}' {1}".format(
            _max_value_file_time_str, instance))
        return -1

    log.info("program starts; max_value_file_time = {0}".format(
        _max_value_file_time))

    zmq_context = zmq.Context()
    event_push_client = EventPushClient(zmq_context, "node_inspector")
    event_push_client.info("program-start", "node_inspector starts")

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -1

    known_value_files = dict()

    # bug fix: 'batch' was referenced in the except clause below, but
    # would be unbound (NameError) if generate_work raised before
    # yielding its first batch
    batch = None
    connection.begin_transaction()
    try:
        for batch in generate_work(connection):
            _process_work_batch(connection, known_value_files, batch)
    except Exception as instance:
        connection.rollback()
        log.exception("Exception processing batch {0} {1}".format(
            batch, instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -1
    else:
        connection.commit()
    finally:
        connection.close()

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
def main():
    """Main entry point for the service availability monitor.

    Starts the ping subprocesses and processes their status messages
    from a PULL socket until the halt event is set.

    Returns 0 for normal termination (usually SIGTERM), 1 on error.
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    prepare_ipc_path(_pull_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    pull_socket = _bind_pull_socket(zeromq_context)

    event_push_client = EventPushClient(zeromq_context,
                                        "service_availability")
    event_push_client.info("program-starts",
                           "service availability monitor starts")

    message_count = 0
    # bug fix: initialized before the try so the cleanup below cannot
    # raise NameError if _start_ping_processes fails
    ping_process_dict = dict()
    try:
        ping_process_dict = _start_ping_processes(halt_event)

        while not halt_event.is_set():
            # periodically verify the ping subprocesses are still alive
            if message_count % len(ping_process_dict) == 0:
                for ping_process in ping_process_dict.values():
                    poll_subprocess(ping_process.process)
            message = pull_socket.recv_pyobj()
            assert not pull_socket.rcvmore
            _process_one_message(message,
                                 ping_process_dict,
                                 event_push_client)
            message_count += 1
    except KeyboardInterrupt: # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program terminating normally; interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        # fix typo: message previously read "program teminating"
        log.info("program terminating normally")

    log.debug("terminating subprocesses")
    _terminate_ping_processes(ping_process_dict)
    pull_socket.close()
    event_push_client.close()
    zeromq_context.term()

    return return_value
def main():
    """
    main entry point for the retrieve-source database pool controller

    Binds PULL and ROUTER sockets, launches database pool worker
    subprocesses, and dispatches work between them until SIGTERM.

    returns 0 for normal termination (usually SIGTERM), 1 on error
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    # shared state handed to _read_pull_socket / _read_router_socket
    # NOTE(review): resources.halt_event is a *fresh* Event(), not the
    # signal-handled halt_event above — confirm this is intentional
    resources = \
        _resources_tuple(halt_event=Event(),
                         zeromq_context=zeromq_context,
                         reply_push_sockets=dict(),
                         pull_socket=zeromq_context.socket(zmq.PULL),
                         io_controller_push_socket=\
                            zeromq_context.socket(zmq.PUSH),
                         router_socket=zeromq_context.socket(zmq.ROUTER),
                         event_push_client=\
                            EventPushClient(zeromq_context,
                                            "rs_db_pool_controller"),
                         active_retrieves=dict(),
                         pending_work_queue=deque(),
                         available_ident_queue=deque())

    log.debug("binding to {0}".format(db_controller_pull_socket_uri))
    resources.pull_socket.bind(db_controller_pull_socket_uri)

    log.debug("connecting to {0}".format(io_controller_pull_socket_uri))
    resources.io_controller_push_socket.connect(io_controller_pull_socket_uri)

    # LINGER bounds how long close() blocks flushing unsent replies
    resources.router_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("binding to {0}".format(db_controller_router_socket_uri))
    resources.router_socket.bind(db_controller_router_socket_uri)

    # we poll the sockets for readability, we assume we can always
    # write to the router socket
    poller = zmq.Poller()
    poller.register(resources.pull_socket, zmq.POLLIN | zmq.POLLERR)
    poller.register(resources.router_socket, zmq.POLLIN| zmq.POLLERR)

    worker_processes = list()
    for index in range(_worker_count):
        worker_processes.append(_launch_database_pool_worker(index+1))

    last_report_time = 0.0
    try:
        while not halt_event.is_set():
            # restart any worker subprocess that has died
            for worker_process in worker_processes:
                poll_subprocess(worker_process)
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)
                if active_socket is resources.pull_socket:
                    _read_pull_socket(resources)
                elif active_socket is resources.router_socket:
                    _read_router_socket(resources)
                else:
                    log.error("unknown socket {0}".format(active_socket))

            # periodic queue-size report to the event publisher
            # NOTE: "active_retrives" typo lives in the runtime string
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                report_message = \
                    "{0:,} active_retrives, " \
                    "{1:,} pending_work_queue entries, " \
                    "{2:,} available_ident_queue entries" \
                    "".format(len(resources.active_retrieves),
                              len(resources.pending_work_queue),
                              len(resources.available_ident_queue))
                log.info(report_message)
                resources.event_push_client.info(
                    "queue_sizes",
                    report_message,
                    active_retrieves=len(resources.active_retrieves),
                    pending_work_queue=len(resources.pending_work_queue),
                    available_ident_queue=len(resources.available_ident_queue))
                last_report_time = current_time
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            # SIGTERM interrupted a blocking zmq call: normal shutdown
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(unhandled_exception_topic,
                                                  "zeromq_error",
                                                  exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(unhandled_exception_topic,
                                              str(instance),
                                              exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        # terminate workers first, then close every socket before
        # terminating the shared zeromq context
        for worker_process in worker_processes:
            terminate_subprocess(worker_process)
        resources.pull_socket.close()
        resources.io_controller_push_socket.close()
        resources.router_socket.close()
        for push_socket in resources.reply_push_sockets.values():
            push_socket.close()
        resources.event_push_client.close()
        zeromq_context.term()

    return return_value
def main(
    log_path,
    state,
    pre_loop_actions,
    post_loop_actions,
    exception_action = None,
    # NOTE: a single shared default Event is acceptable here because main()
    # runs once per process; pass your own halt_event for finer control
    halt_event = Event(),
):
    """
    Run the main entry point of a time-queue driven process.

    Wraps the event loop driven by the time_queue.

    log_path
        The full path to the log file for this process

    state
        State object (usually a dict) passed to callback functions

    pre_loop_actions
        A list of functions to be run before the event loop starts.
        Function arguments are ``(halt_event, state)``.
        Functions may return a list of tuples to be added to the time
        queue ``(callback_function, start_time, )``.
        In nimbus.io processes this is conventionally a single function
        called ``_startup``, but it can have any name.

    post_loop_actions
        A list of functions to be run after the event loop terminates
        (``halt_event`` set). Function argument is ``(state)``; the
        return value is ignored. Conventionally named ``_tear_down``.

    exception_action
        A function to be executed when the event loop catches an
        exception from a callback function. It takes ``(state)`` as an
        argument; its return is ignored. In nimbus.io processes this is
        used to push a zeromq message to an event publisher.

    halt_event
        (optional) a ``threading.Event`` set when SIGTERM is detected,
        used to terminate the event loop; also passed to callbacks.

    returns 0 for normal termination, nonzero for failure
    """
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("start")

    while not halt_event.is_set():
        try:
            _run_until_halt(
                state, pre_loop_actions, post_loop_actions, halt_event
            )
        except Exception as instance:
            # fixed: was Python-2-only 'except Exception, instance:' and
            # 'print >> sys.stderr, ...'; now matches the py3 sibling
            log.exception(instance)
            print(instance.__class__.__name__, str(instance),
                  file=sys.stderr)
            if exception_action is not None:
                exception_action(state)
            return 12

    # consistent with the sibling implementation: explicit success code
    # instead of falling off the end (implicit None)
    log.info("normal termination")
    return 0
        # tail of a setUp/teardown-style method whose def is outside this view
        self.tearDown()

    def tearDown(self):
        # no per-test cleanup required
        pass

    def test_invalid_url(self):
        """try a URL we can't parse"""
        # parse_url returns None for an unparseable URL
        result = parse_url("GET", "pork")
        self.assertEqual(result, None)

    def test_valid_urls(self):
        """run through all the URLs we should be able to parse"""
        for method, url, expected_action in _valid_urls_with_actions:
            result = parse_url(method, url)
            # failure messages include (method, url) for diagnosis
            self.assertNotEqual(result, None, (method, url, ))
            action, match_object = result
            self.assertEqual(
                action, expected_action, (action, expected_action, method, url, )
            )
            # each action has a validator for its regex match object
            self.assertTrue(
                _match_object_dispatch_table[action](match_object),
                (method, url,)
            )

if __name__ == "__main__":
    initialize_logging(_log_path)
    unittest.main()
def main():
    """
    main entry point for a retrieve-source IO worker subprocess

    Command line: <program> volume_name worker_number
    Connects a DEALER socket to the IO controller, then loops requesting
    and servicing work until SIGTERM, periodically closing unused files.

    returns 0 for normal termination (usually SIGTERM), 1 on error
    """
    return_value = 0

    volume_name = sys.argv[1]
    worker_number = int(sys.argv[2])

    # "/" in the volume name would split the log path into directories
    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         volume_name.replace("/", "_"),
                                         worker_number,
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_source_name = "rs_io_worker_{0}_{1}".format(volume_name,
                                                      worker_number)

    # shared state handed to the worker helper functions; file_cache is an
    # LRU of open file handles bounded by _max_file_cache_size
    resources = \
        _resources_tuple(halt_event=halt_event,
                         zeromq_context=zeromq_context,
                         reply_push_sockets=dict(),
                         event_push_client=EventPushClient(zeromq_context,
                                                           event_source_name),
                         dealer_socket=zeromq_context.socket(zmq.DEALER),
                         file_cache=LRUCache(_max_file_cache_size))

    # LINGER bounds how long close() blocks flushing unsent messages
    resources.dealer_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting to {0}".format(io_controller_router_socket_uri))
    resources.dealer_socket.connect(io_controller_router_socket_uri)

    last_close_pass_time = time.time()
    try:
        while not halt_event.is_set():
            # an occasional pass that closes any open files that haven't
            # been used
            current_time = time.time()
            elapsed_time = current_time - last_close_pass_time
            if elapsed_time > _unused_file_close_interval:
                _make_close_pass(resources, current_time)
                last_close_pass_time = current_time
            _send_work_request(resources, volume_name)
            _process_request(resources)
    except InterruptedSystemCall:
        if halt_event.is_set():
            # SIGTERM interrupted a blocking zmq call: normal shutdown
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(
                unhandled_exception_topic,
                "Interrupted zeromq system call",
                exctype="InterruptedSystemCall")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        # close every socket before terminating the zeromq context
        resources.dealer_socket.close()
        for push_socket in resources.reply_push_sockets.values():
            push_socket.close()
        resources.event_push_client.close()
        resources.zeromq_context.term()

    return return_value
def main(
    log_path,
    state,
    pre_loop_actions,
    post_loop_actions,
    exception_action=None,
    # NOTE: a single shared default Event is acceptable here because main()
    # runs once per process; pass your own halt_event for finer control
    halt_event=Event(),
):
    """
    Run the main entry point of a time-queue driven process.

    Wraps the event loop driven by the time_queue.

    log_path
        The full path to the log file for this process

    state
        State object (usually a dict) passed to callback functions

    pre_loop_actions
        A list of functions to be run before the event loop starts.
        Function arguments are ``(halt_event, state)``.
        Functions may return a list of tuples to be added to the time
        queue ``(callback_function, start_time, )``.
        In nimbus.io processes this is conventionally a single function
        called ``_startup``, but it can have any name.

    post_loop_actions
        A list of functions to be run after the event loop terminates
        (``halt_event`` set). Function argument is ``(state)``; the
        return value is ignored. Conventionally named ``_tear_down``.

    exception_action
        A function to be executed when the event loop catches an
        exception from a callback function. It takes ``(state)`` as an
        argument; its return is ignored. In nimbus.io processes this is
        used to push a zeromq message to an event publisher.

    halt_event
        (optional) a ``threading.Event`` set when SIGTERM is detected,
        used to terminate the event loop; also passed to callbacks.

    returns 0 for normal termination, nonzero for failure
    """
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("start")

    while not halt_event.is_set():
        try:
            _run_until_halt(state,
                            pre_loop_actions,
                            post_loop_actions,
                            halt_event)
        except Exception as instance:
            # idiom fix: bind the exception directly instead of the
            # py2.5-era 'instance = sys.exc_info()[1]' dance
            log.exception(instance)
            if exception_action is not None:
                exception_action(state)
            return 12

    log.info("normal termination")
    return 0
def main():
    """
    main entry point for retrieve_source

    Launches the database pool controller and IO controller subprocesses,
    then services REP-socket requests until SIGTERM, reporting request
    counts periodically.

    returns 0 for normal termination (usually SIGTERM), 1 on error
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    # make sure the ipc directories/files for internal sockets exist
    for internal_socket_uri in internal_socket_uri_list:
        prepare_ipc_path(internal_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    database_pool_controller = _launch_database_pool_controller()
    io_controller = _launch_io_controller()

    zeromq_context = zmq.Context()
    rep_socket = _bind_rep_socket(zeromq_context)
    db_controller_push_socket = \
        _connect_db_controller_push_socket(zeromq_context)

    event_push_client = EventPushClient(zeromq_context, "retrieve_source")
    event_push_client.info("program-starts", "retrieve source starts")

    # we poll the sockets for readability, we assume we can always
    # write to the push client sockets
    poller = zmq.Poller()
    poller.register(rep_socket, zmq.POLLIN | zmq.POLLERR)

    last_report_time = 0.0
    request_count = 0
    try:
        while not halt_event.is_set():
            # restart either controller subprocess if it has died
            poll_subprocess(database_pool_controller)
            poll_subprocess(io_controller)

            # we've only registered one socket, so we could use an 'if' here,
            # but this 'for' works ok and it has the same form as the other
            # places where we use poller
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)

                assert active_socket is rep_socket

                _process_one_request(rep_socket, db_controller_push_socket)

                request_count += 1

            # periodic request-count report; counter resets each interval
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                report_message = "{0:,} requests".format(request_count)
                log.info(report_message)
                event_push_client.info("request_count",
                                       report_message,
                                       request_count=request_count)
                last_report_time = current_time
                request_count = 0
    except KeyboardInterrupt: # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            # SIGTERM interrupted a blocking zmq call: normal shutdown
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        # stop subprocesses, then close sockets before terminating context
        terminate_subprocess(database_pool_controller)
        terminate_subprocess(io_controller)
        rep_socket.close()
        db_controller_push_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
def main():
    """
    main entry point for retrieve_source

    Launches the database pool controller and IO controller subprocesses,
    then services REP-socket requests until SIGTERM, reporting request
    counts periodically.

    NOTE(review): this function appears to be a near-duplicate of another
    retrieve_source main() in this file, differing only in line wrapping —
    candidates for consolidation.

    returns 0 for normal termination (usually SIGTERM), 1 on error
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    # make sure the ipc directories/files for internal sockets exist
    for internal_socket_uri in internal_socket_uri_list:
        prepare_ipc_path(internal_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    database_pool_controller = _launch_database_pool_controller()
    io_controller = _launch_io_controller()

    zeromq_context = zmq.Context()
    rep_socket = _bind_rep_socket(zeromq_context)
    db_controller_push_socket = _connect_db_controller_push_socket(zeromq_context)

    event_push_client = EventPushClient(zeromq_context, "retrieve_source")
    event_push_client.info("program-starts", "retrieve source starts")

    # we poll the sockets for readability, we assume we can always
    # write to the push client sockets
    poller = zmq.Poller()
    poller.register(rep_socket, zmq.POLLIN | zmq.POLLERR)

    last_report_time = 0.0
    request_count = 0
    try:
        while not halt_event.is_set():
            # restart either controller subprocess if it has died
            poll_subprocess(database_pool_controller)
            poll_subprocess(io_controller)

            # we've only registered one socket, so we could use an 'if' here,
            # but this 'for' works ok and it has the same form as the other
            # places where we use poller
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)

                assert active_socket is rep_socket

                _process_one_request(rep_socket, db_controller_push_socket)

                request_count += 1

            # periodic request-count report; counter resets each interval
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                report_message = "{0:,} requests".format(request_count)
                log.info(report_message)
                event_push_client.info("request_count",
                                       report_message,
                                       request_count=request_count)
                last_report_time = current_time
                request_count = 0
    except KeyboardInterrupt: # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            # SIGTERM interrupted a blocking zmq call: normal shutdown
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        # stop subprocesses, then close sockets before terminating context
        terminate_subprocess(database_pool_controller)
        terminate_subprocess(io_controller)
        rep_socket.close()
        db_controller_push_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
def main():
    """
    main entry point for a retrieve-source database pool worker subprocess

    Command line: <program> worker_number
    Connects a DEALER socket to the pool controller and the node-local
    database, then processes one transaction per loop until SIGTERM.

    returns 0 for normal termination (usually SIGTERM), 1 on error
    """
    return_value = 0

    worker_number = int(sys.argv[1])

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         worker_number,
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_source_name = "rs_dbpool_worker_{0}".format(worker_number)
    event_push_client = EventPushClient(zeromq_context, event_source_name)

    # LINGER bounds how long close() blocks flushing unsent messages
    dealer_socket = zeromq_context.socket(zmq.DEALER)
    dealer_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting to {0}".format(db_controller_router_socket_uri))
    dealer_socket.connect(db_controller_router_socket_uri)

    log.debug("opening local database connection")
    database_connection = get_node_local_connection()

    try:
        # announce availability to the controller, then serve transactions
        _send_initial_work_request(dealer_socket)
        while not halt_event.is_set():
            _process_one_transaction(dealer_socket,
                                     database_connection,
                                     event_push_client)
    except InterruptedSystemCall:
        if halt_event.is_set():
            # SIGTERM interrupted a blocking zmq call: normal shutdown
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "Interrupted zeromq system call",
                                        exctype="InterruptedSystemCall")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        # close the database and every socket before terminating context
        database_connection.close()
        dealer_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
def main():
    """
    main entry point for the defragger

    Loops until SIGTERM: (re)connects to the node-local database as
    needed, runs one defrag pass per transaction, and sleeps between
    passes that move no bytes.

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    signal.signal(signal.SIGTERM, _create_signal_handler(halt_event))

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")

    connection = None
    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception(
                    "database exception",
                    str(value),
                    exctype=exctype.__name__
                )
                log.exception("Exception connecting to database")
                # back off before retrying the connection
                halt_event.wait(_database_retry_interval)
                continue

        # start a transaction
        # NOTE(review): raw "begin" here vs begin_transaction() in the
        # sibling defragger main — presumably equivalent; confirm
        connection.execute("begin")

        # try one defrag pass
        bytes_defragged = 0
        try:
            bytes_defragged = _defrag_pass(connection, event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(
                unhandled_exception_topic,
                str(instance),
                exctype=instance.__class__.__name__
            )
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:

            # close the database connection
            if connection is not None:
                connection.close()
                connection = None

            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()

    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()
    log.info("program terminates normally")
    return 0
def main():
    """
    main entry point for the defragger (file-space aware variant)

    Loops until SIGTERM: (re)connects to the node-local database as
    needed, sanity-checks file-space info, runs one defrag pass per
    transaction, and sleeps between passes that move no bytes. Setting
    NIMBUSIO_EXIT_WHEN_DONE makes an idle pass terminate the program.

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")

    connection = None
    file_space_info = None
    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception(
                    "database exception",
                    str(value),
                    exctype=exctype.__name__
                )
                log.exception("Exception connecting to database")
                # back off before retrying the connection
                halt_event.wait(_database_retry_interval)
                continue

            # refresh file-space info on every (re)connect and verify it
            # matches the on-disk repository layout
            file_space_info = load_file_space_info(connection)
            file_space_sanity_check(file_space_info, _repository_path)

        # try one defrag pass
        bytes_defragged = 0
        connection.begin_transaction()
        try:
            bytes_defragged = _defrag_pass(connection,
                                           file_space_info,
                                           event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(
                unhandled_exception_topic,
                str(instance),
                exctype=instance.__class__.__name__
            )
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:

            # exit if we're done and asked to do single pass
            if int(os.environ.get('NIMBUSIO_EXIT_WHEN_DONE', '0')):
                halt_event.set()

            # close the database connection
            if connection is not None:
                connection.close()
                connection = None

            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()

    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()
    log.info("program terminates normally")
    return 0
def main():
    """
    main entry point for the redis stats collector

    Under a central-database advisory lock, aggregates ops-accounting
    stats from each storage node's Redis instance, flushes them to the
    central database in one transaction, then deletes the processed keys
    from Redis.

    return 0 for success, 1 on any failure (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_push_client = EventPushClient(zeromq_context,
                                        "redis_stats_collector")
    event_push_client.info("program-start", "flush_stats_from_redis starts")

    # don't flush anything newer than 1 minute ago
    current_time = datetime.utcnow()
    timestamp_cutoff = current_time - timedelta(minutes=1)

    return_code = 0
    central_db_connection = None

    # rows accumulated across all nodes for one bulk insert
    collection_ops_accounting_rows = list()

    # values to be added to the dedupe table
    new_dedupes = list()

    # keys to be deleted (a list for each node)
    node_keys_processed = [list() for _ in _node_names]

    try:
        central_db_connection = get_central_connection()

        # On startup, the program connects to the central database and tries
        # to acquire a pg_advisory_lock appropriate for this program and the
        # data center it is running in using the pg_try_advisory_lock function.
        # If it cannot acquire the lock, it notes the status of the lock
        # and exits. This central locking mechanism lets us avoid single points
        # of failure by configuring the program to run on multiple nodes.
        with advisory_lock(central_db_connection, "redis_stats_collector"):
            node_dict = _retrieve_node_dict(central_db_connection)
            for node_name, keys_processed in \
                zip(_node_names, node_keys_processed):
                node_id = node_dict[node_name]
                log.debug("processing node {0} node_id={1}".format(node_name,
                                                                   node_id))

                # The program then selects into memory all recently collected
                # keys from the central database table
                # collection_ops_accounting_flush_dedupe and stores them in a
                # dedupe set. This set allows runs of the collection/flush
                # program to be idempotent across some time period (
                # but we won't keep the list of old keys forever.)
                dedupe_set = _retrieve_dedupe_set(central_db_connection,
                                                  node_id)

                # The program then visits the Redis instance on every storage
                # node in the local data center, collecting the data from all
                # past stats keys -- aggregating it into the program's memory.
                # The aggregation should involve buckets for each
                # storage_node_id and redis key, corresponding to the columns
                # in the database.
                _process_one_node(node_name,
                                  node_dict[node_name],
                                  timestamp_cutoff,
                                  dedupe_set,
                                  collection_ops_accounting_rows,
                                  new_dedupes,
                                  keys_processed)

            # After collecting past keys from every storage node,
            # inside a central database transaction:
            # 1. Insert the collected stats into the central database
            #    collection_ops_accounting
            # 2. Insert collected keys into recently collected keys
            #    collection_ops_accounting_flush_dedupe.
            # 3. commit transaction
            log.debug("updating central database")
            central_db_connection.begin_transaction()
            try:
                _insert_accounting_rows(central_db_connection,
                                        collection_ops_accounting_rows)
                _insert_dedupe_rows(central_db_connection,
                                    timestamp_cutoff,
                                    new_dedupes)
            except Exception:
                central_db_connection.rollback()
                raise
            else:
                central_db_connection.commit()

            # Then revisit the Redis nodes, and delete the keys we flushed
            # into the database, and any keys we skipped because they were
            # found in the dedupe set.
            for node_name, keys_processed in zip(_node_names,
                                                 node_keys_processed):
                _remove_processed_keys(node_name, keys_processed)
    except Exception as instance:
        log.exception("Uhandled exception {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return_code = 1

    if central_db_connection is not None:
        central_db_connection.close()

    event_push_client.close()
    zeromq_context.term()
    log.info("program terminates return_code = {0}".format(return_code))
    return return_code
            # tail of an archive-key-final request message whose opening
            # (method def and message dict start) is outside this view;
            # segment-* and file-* fields mirror each other because this
            # test archives the whole file as a single segment
            "segment-size" : file_size,
            "segment-adler32" : file_adler32,
            "segment-md5-digest": b64encode(file_md5.digest()),
            "file-size" : file_size,
            "file-adler32" : file_adler32,
            "file-hash" : b64encode(file_md5.digest()),
            "handoff-node-name" : None,
        }
        reply = send_request_and_get_reply(
            _local_node_name,
            _data_writer_address,
            _local_node_name,
            _client_address,
            message,
            data=content_item
        )
        # the data writer must acknowledge the archive as successful
        self.assertEqual(reply["message-id"], message_id)
        self.assertEqual(reply["message-type"], "archive-key-final-reply")
        self.assertEqual(reply["result"], "success")

        # then destroy the key we just archived and expect success;
        # include the server's error message in the failure output
        reply = self._destroy(
            collection_id, key, destroy_timestamp, segment_num
        )
        self.assertEqual(reply["result"], "success", reply["error-message"])

if __name__ == "__main__":
    initialize_logging(_log_path)
    unittest.main()
def main():
    """
    main entry point for the retrieve-source IO controller

    Binds PULL and ROUTER sockets, launches _worker_count IO worker
    subprocesses per storage volume, and dispatches work between them
    until SIGTERM.

    returns 0 for normal termination (usually SIGTERM), 1 on error
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    # shared state handed to _read_pull_socket / _read_router_socket
    # NOTE(review): resources.halt_event is a *fresh* Event(), not the
    # signal-handled halt_event above — confirm this is intentional
    resources = \
        _resources_tuple(halt_event=Event(),
                         volume_by_space_id=_volume_name_by_space_id(),
                         pull_socket=zeromq_context.socket(zmq.PULL),
                         router_socket=zeromq_context.socket(zmq.ROUTER),
                         event_push_client=\
                            EventPushClient(zeromq_context,
                                            "rs_io_controller"),
                         pending_work_by_volume=defaultdict(deque),
                         available_ident_by_volume=defaultdict(deque))

    log.debug("binding to {0}".format(io_controller_pull_socket_uri))
    resources.pull_socket.bind(io_controller_pull_socket_uri)

    # LINGER bounds how long close() blocks flushing unsent replies
    resources.router_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("binding to {0}".format(io_controller_router_socket_uri))
    resources.router_socket.bind(io_controller_router_socket_uri)

    # we poll the sockets for readability, we assume we can always
    # write to the router socket
    poller = zmq.Poller()
    poller.register(resources.pull_socket, zmq.POLLIN | zmq.POLLERR)
    poller.register(resources.router_socket, zmq.POLLIN | zmq.POLLERR)

    # one group of workers per distinct volume name
    worker_processes = list()
    for volume_name in set(resources.volume_by_space_id.values()):
        for index in range(_worker_count):
            worker_processes.append(_launch_io_worker(volume_name, index + 1))

    last_report_time = 0.0
    try:
        while not halt_event.is_set():
            # restart any worker subprocess that has died
            for worker_process in worker_processes:
                poll_subprocess(worker_process)
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)
                if active_socket is resources.pull_socket:
                    _read_pull_socket(resources)
                elif active_socket is resources.router_socket:
                    _read_router_socket(resources)
                else:
                    log.error("unknown socket {0}".format(active_socket))

            # periodic pending-work report summed over all volumes
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                pending_work = 0
                for volume_queue in resources.pending_work_by_volume.values():
                    pending_work += len(volume_queue)
                report_message = \
                    "{0:,} pending_work entries".format(pending_work)
                log.info(report_message)
                resources.event_push_client.info("queue_sizes",
                                                 report_message,
                                                 pending_work=pending_work)
                last_report_time = current_time
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            # SIGTERM interrupted a blocking zmq call: normal shutdown
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(unhandled_exception_topic,
                                                  "zeromq_error",
                                                  exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        # terminate workers first, then close every socket before
        # terminating the shared zeromq context
        for worker_process in worker_processes:
            terminate_subprocess(worker_process)
        resources.pull_socket.close()
        resources.router_socket.close()
        resources.event_push_client.close()
        zeromq_context.term()

    return return_value