def _handle_assignment_message(self, pplan):
    """Apply a physical plan received through a new-instance assignment message.

    A ``PhysicalPlanHelper`` is built for ``pplan``. When no plan is held yet
    the helper is pushed to the Heron instance as a fresh assignment; when a
    plan is already held and the component name and task id are unchanged, the
    helper is forwarded as a state change. The helper is then stored on
    ``self._pplan_helper``.

    :param pplan: physical plan carried by the incoming message
    :raises RuntimeError: if a plan is already held and the component name or
                          task id of the new plan differs from it
    """
    Log.debug("In handle_assignment_message() of STStmgrClient, Physical Plan: \n%s", str(pplan))

    incoming = PhysicalPlanHelper(
        pplan, self.instance.instance_id, self.heron_instance_cls.topo_pex_file_abs_path
    )
    current = self._pplan_helper

    if current is None:
        # First plan seen by this client.
        Log.info("Received a new Physical Plan")
        Log.info("Push the new pplan_helper to Heron Instance")
        self.heron_instance_cls.handle_assignment_msg(incoming)
    elif (
        current.my_component_name != incoming.my_component_name
        or current.my_task_id != incoming.my_task_id
    ):
        raise RuntimeError("Our Assignment has changed. We will die to pick it.")
    else:
        # Same assignment as before: only the topology state may differ.
        Log.info("Received a new Physical Plan with the same assignment -- State Change")
        Log.info(
            "Old state: %s, new state: %s.",
            current.get_topology_state(),
            incoming.get_topology_state(),
        )
        self.heron_instance_cls.handle_state_change_msg(incoming)

    self._pplan_helper = incoming
def poll(self, timeout=0.0):
    """Modified version of poll() from asyncore module

    Collects the file descriptors registered in ``self.sock_map`` that are
    readable and/or writable, adds the wake-up pipe ``self.pipe_r`` to the
    read set and calls ``select.select()`` on them.

    As visible in this version, the descriptor lists returned by ``select()``
    are not processed any further.

    :param timeout: timeout in seconds handed to ``select.select()``
    :raises select.error: for any select failure other than ``EINTR``
    """
    if self.sock_map is None:
        Log.warning("Socket map is not registered to Gateway Looper")
    readable_lst = []
    writable_lst = []
    error_lst = []

    if self.sock_map is not None:
        for fd, obj in self.sock_map.items():
            is_r = obj.readable()
            is_w = obj.writable()
            if is_r:
                readable_lst.append(fd)
            if is_w and not obj.accepting:
                writable_lst.append(fd)
            if is_r or is_w:
                error_lst.append(fd)

    # Add wakeup fd
    readable_lst.append(self.pipe_r)

    # Lazy %-args: the (potentially large) socket map is only rendered when
    # debug logging is actually enabled.
    Log.debug("Will select() with timeout: %s, with map: %s", timeout, self.sock_map)
    try:
        readable_lst, writable_lst, error_lst = \
            select.select(readable_lst, writable_lst, error_lst, timeout)
    except select.error as err:
        # "except X as err" is valid on Python 2.6+ and 3; str(err) replaces
        # the removed ``err.message`` attribute.
        Log.debug("Trivial error: %s", err)
        if err.args[0] != errno.EINTR:
            raise
        else:
            return
def get(self):
    """Return a slice of a container file, fetched from that container's shell port.

    Reads cluster, role, environ, topology, container, path, offset and length
    from the request, looks up the stream manager ``stmgr-<container>`` in the
    topology's physical plan and asks its ``/filedata`` endpoint for the data.
    The decoded JSON body is written as the success response; any exception is
    logged at debug level and written as an error response.
    """
    try:
        cluster = self.get_argument_cluster()
        role = self.get_argument_role()
        environ = self.get_argument_environ()
        topology_name = self.get_argument_topology()
        container = self.get_argument(constants.PARAM_CONTAINER)
        path = self.get_argument(constants.PARAM_PATH)
        offset = self.get_argument_offset()
        length = self.get_argument_length()

        info = self.tracker.getTopologyInfo(topology_name, cluster, role, environ)

        stmgr_key = "stmgr-" + container
        stmgr_entry = info["physical_plan"]["stmgrs"][stmgr_key]
        url_parts = (stmgr_entry["host"], stmgr_entry["shell_port"], path, offset, length)
        file_data_url = "http://%s:%d/filedata/%s?offset=%s&length=%s" % url_parts

        fetcher = tornado.httpclient.AsyncHTTPClient()
        reply = yield fetcher.fetch(file_data_url)
        payload = json.loads(reply.body)
        self.write_success_response(payload)
        self.finish()
    except Exception as error:
        Log.debug(traceback.format_exc())
        self.write_error_response(error)
def start_heron_tools(masters, cl_args):
    '''
    Start Heron tracker and UI

    Waits for the first master to come up, submits the Heron Tools job through
    nomad (over ssh when that master is not the local machine) and then waits
    for the "heron-tools" job to start. Exits the process with -1 when the
    submit command returns a non-zero code.

    :param masters: collection of master hosts; only the first one is used
    :param cl_args: parsed command line arguments
    '''
    single_master = list(masters)[0]
    wait_for_master_to_start(single_master)

    cmd = "%s run %s >> /tmp/heron_tools_start.log 2>&1 &" \
          % (get_nomad_path(cl_args), get_heron_tools_job_file(cl_args))
    Log.info("Starting Heron Tools on %s" % single_master)

    if not is_self(single_master):
        cmd = ssh_remote_execute(cmd, single_master, cl_args)
    Log.debug(cmd)
    pid = subprocess.Popen(cmd,
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)

    # communicate() drains both pipes while waiting for the child. Calling
    # wait() first can deadlock once the child fills a pipe buffer.
    output = pid.communicate()
    return_code = pid.returncode
    Log.debug("return code: %s output: %s" % (return_code, output))
    if return_code != 0:
        Log.error("Failed to start Heron Tools on %s with error:\n%s" % (single_master, output[1]))
        sys.exit(-1)

    wait_for_job_to_start(single_master, "heron-tools")
    Log.info("Done starting Heron Tools")
def _send_metrics_messages(self):
    """Send every message waiting in ``self.out_queue`` while connected.

    Does nothing when ``self.connected`` is false. Each polled item must be a
    ``MetricPublisherPublishMessage``.
    """
    if not self.connected:
        return

    while True:
        if self.out_queue.is_empty():
            break
        outgoing = self.out_queue.poll()
        assert isinstance(outgoing, metrics_pb2.MetricPublisherPublishMessage)
        Log.debug("Sending metric message: %s" % str(outgoing))
        self.send_message(outgoing)
def register_watch(self, callback):
    """
    Register ``callback`` as a watch and return the UUID it is stored under.

    The callback is invoked once, with this object as its only argument, at
    registration time. If that first call raises, the watch is not registered
    and None is returned. None is also returned when no unused UUID could be
    generated within the retry budget. The returned UUID is the key of the
    callback in ``self.watches``.
    """
    # A freshly generated UUID colliding with an existing key is very
    # unlikely; retry a few times just in case.
    attempts_left = 5

    while attempts_left > 0:
        attempts_left -= 1
        watch_id = uuid.uuid4()
        if watch_id in self.watches:
            continue

        Log.info("Registering a watch with uid: " + str(watch_id))
        try:
            callback(self)
        except Exception as error:
            Log.error("Caught exception while triggering callback: " + str(error))
            Log.debug(traceback.format_exc())
            return None

        self.watches[watch_id] = callback
        return watch_id

    return None
def get_jobs(cl_args, nomad_addr):
    """Return the decoded JSON job list from the nomad HTTP API at ``nomad_addr``.

    Queries ``/v1/jobs`` on port 4646. Any status other than 200 is logged and
    the process exits with -1. ``cl_args`` is accepted but not used here.
    """
    jobs_url = "http://%s:4646/v1/jobs" % nomad_addr
    response = requests.get(jobs_url)

    request_failed = response.status_code != 200
    if request_failed:
        Log.error("Failed to get list of jobs")
        Log.debug("Response: %s" % response)
        sys.exit(-1)

    return response.json()
def start_slave_nodes(slaves, cl_args):
    '''
    Start slave nodes

    Launches the nomad agent on every slave (over ssh for remote hosts), then
    collects the result of each launch command. If any command returned a
    non-zero code, all failures are logged and the process exits with -1.

    :param slaves: iterable of slave hosts
    :param cl_args: parsed command line arguments
    '''
    pids = []
    for slave in slaves:
        Log.info("Starting slave on %s" % slave)
        cmd = "%s agent -config %s >> /tmp/nomad_client.log 2>&1 &" \
              % (get_nomad_path(cl_args), get_nomad_slave_config_file(cl_args))
        if not is_self(slave):
            cmd = ssh_remote_execute(cmd, slave, cl_args)
        Log.debug(cmd)
        pid = subprocess.Popen(cmd,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        pids.append({"pid": pid, "dest": slave})

    errors = []
    for entry in pids:
        pid = entry["pid"]
        # communicate() drains both pipes while waiting for the child. Calling
        # wait() first can deadlock once the child fills a pipe buffer.
        output = pid.communicate()
        return_code = pid.returncode
        Log.debug("return code: %s output: %s" % (return_code, output))
        if return_code != 0:
            errors.append("Failed to start slave on %s with error:\n%s" % (entry["dest"], output[1]))

    if errors:
        for error in errors:
            Log.error(error)
        sys.exit(-1)

    Log.info("Done starting slaves")
def get(self):
    """Stream a container file to the client as an attachment.

    Resolves the stream manager ``stmgr-<container>`` from the topology's
    physical plan, sets a ``Content-Disposition`` header named after the last
    path component and relays the chunks returned by the shell's ``/download``
    endpoint. Any exception is logged at debug level and written as an error
    response.
    """
    try:
        cluster = self.get_argument_cluster()
        role = self.get_argument_role()
        environ = self.get_argument_environ()
        topology_name = self.get_argument_topology()
        container = self.get_argument(constants.PARAM_CONTAINER)
        path = self.get_argument(constants.PARAM_PATH)

        info = self.tracker.getTopologyInfo(topology_name, cluster, role, environ)
        stmgr_key = "stmgr-" + container
        stmgr_entry = info["physical_plan"]["stmgrs"][stmgr_key]
        url_parts = (stmgr_entry["host"], stmgr_entry["shell_port"], path)
        file_download_url = "http://%s:%d/download/%s" % url_parts
        Log.debug("download file url: %s", file_download_url)

        # The path is read again from the request for the attachment name.
        path = self.get_argument("path")
        filename = path.rsplit("/", 1)[-1]
        self.set_header("Content-Disposition", "attachment; filename=%s" % filename)

        def relay_chunk(chunk):
            # Pass every received chunk straight through to the client.
            self.write(chunk)
            self.flush()

        fetcher = tornado.httpclient.AsyncHTTPClient()
        yield fetcher.fetch(file_download_url, streaming_callback=relay_chunk)
        self.finish()
    except Exception as error:
        Log.debug(traceback.format_exc())
        self.write_error_response(error)
def on_connect(self, status):
    """Handle the result of a connection attempt to the Metrics Manager.

    On failure a reconnect is scheduled after the configured interval and
    nothing else is done; on success the register request is sent.

    :param status: status code of the connection attempt
    """
    Log.debug("In on_connect of MetricsManagerClient")
    if status != StatusCode.OK:
        Log.error("Error connecting to Metrics Manager with status: %s" % str(status))
        retry_interval = float(self.sys_config[constants.INSTANCE_RECONNECT_METRICSMGR_INTERVAL_SEC])
        self.looper.register_timer_task_in_sec(self.start_connect, retry_interval)
        # Not connected: do not fall through and send a register request.
        return
    self._send_register_req()
def start_api_server(masters, cl_args):
    '''
    Start the Heron API server

    Waits for the first master to come up, submits the API server job through
    nomad (over ssh when that master is not the local machine) and then waits
    for the "apiserver" job to start. Exits the process with -1 when the
    submit command returns a non-zero code.

    :param masters: collection of master hosts; only the first one is used
    :param cl_args: parsed command line arguments
    '''
    # make sure nomad cluster is up
    single_master = list(masters)[0]
    wait_for_master_to_start(single_master)

    cmd = "%s run %s >> /tmp/apiserver_start.log 2>&1 &" \
          % (get_nomad_path(cl_args), get_apiserver_job_file(cl_args))
    Log.info("Starting Heron API Server on %s" % single_master)

    if not is_self(single_master):
        cmd = ssh_remote_execute(cmd, single_master, cl_args)
    Log.debug(cmd)
    pid = subprocess.Popen(cmd,
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)

    # communicate() drains both pipes while waiting for the child. Calling
    # wait() first can deadlock once the child fills a pipe buffer.
    output = pid.communicate()
    return_code = pid.returncode
    Log.debug("return code: %s output: %s" % (return_code, output))
    if return_code != 0:
        Log.error("Failed to start apiserver on %s with error:\n%s" % (single_master, output[1]))
        sys.exit(-1)

    wait_for_job_to_start(single_master, "apiserver")
    Log.info("Done starting Heron API Server")
def getComponentException(self, tmaster, component_name, instances=[], callback=None):
    """
    Get all (last 1000) exceptions for 'component_name' of the topology.
    Returns an Array of exception logs on success.
    Returns json with message on failure.

    This is a generator: results are delivered by raising
    ``tornado.gen.Return``. It returns without a value when ``tmaster`` or its
    host / stats port is missing.

    :param tmaster: object exposing ``host`` and ``stats_port``
    :param component_name: component whose exceptions are requested
    :param instances: optional instance ids to restrict the request to
                      (the default list is only read, never mutated)
    :param callback: not used by this function
    :raises Exception: when the HTTP request fails with a tornado HTTPError
    """
    if not tmaster or not tmaster.host or not tmaster.stats_port:
        return

    exception_request = tmaster_pb2.ExceptionLogRequest()
    exception_request.component_name = component_name
    if len(instances) > 0:
        exception_request.instances.extend(instances)
    request_str = exception_request.SerializeToString()

    port = str(tmaster.stats_port)
    host = tmaster.host
    url = "http://{0}:{1}/exceptions".format(host, port)
    request = tornado.httpclient.HTTPRequest(url,
                                             body=request_str,
                                             method='POST',
                                             request_timeout=5)
    Log.debug('Making HTTP call to fetch exceptions url: %s', url)
    try:
        client = tornado.httpclient.AsyncHTTPClient()
        result = yield client.fetch(request)
        Log.debug("HTTP call complete.")
    except tornado.httpclient.HTTPError as e:
        raise Exception(str(e))

    # Check the response code - error if it is in 400s or 500s
    responseCode = result.code
    if responseCode >= 400:
        # The code is an int; convert it before concatenating, otherwise
        # building the message itself raises TypeError.
        message = "Error in getting exceptions from Tmaster, code: " + str(responseCode)
        Log.error(message)
        raise tornado.gen.Return({
            "message": message
        })

    # Parse the response from tmaster.
    exception_response = tmaster_pb2.ExceptionLogResponse()
    exception_response.ParseFromString(result.body)

    if exception_response.status.status == common_pb2.NOTOK:
        if exception_response.status.HasField("message"):
            raise tornado.gen.Return({
                "message": exception_response.status.message
            })

    # Send response
    ret = []
    for exception_log in exception_response.exceptions:
        ret.append({'hostname': exception_log.hostname,
                    'instance_id': exception_log.instance_id,
                    'stack_trace': exception_log.stacktrace,
                    'lasttime': exception_log.lasttime,
                    'firsttime': exception_log.firsttime,
                    'count': str(exception_log.count),
                    'logging': exception_log.logging})
    raise tornado.gen.Return(ret)
def get_logical_plan(cluster, env, topology, role):
    """Blocking wrapper around ``API.get_logical_plan``.

    Runs the call to completion on the tornado IOLoop instance and returns its
    result. Exceptions are logged at debug level and re-raised.
    """
    io_loop = tornado.ioloop.IOLoop.instance()

    def fetch_plan():
        return API.get_logical_plan(cluster, env, topology, role)

    try:
        plan = io_loop.run_sync(fetch_plan)
    except Exception:
        Log.debug(traceback.format_exc())
        raise
    return plan
def get_cluster_topologies(cluster):
    """Blocking wrapper around ``API.get_cluster_topologies``.

    Runs the call to completion on the tornado IOLoop instance and returns its
    result. Exceptions are logged at debug level and re-raised.
    """
    io_loop = tornado.ioloop.IOLoop.instance()

    def fetch_topologies():
        return API.get_cluster_topologies(cluster)

    try:
        topologies = io_loop.run_sync(fetch_topologies)
    except Exception:
        Log.debug(traceback.format_exc())
        raise
    return topologies
def get_cluster_role_env_topologies(cluster, role, env):
    """Blocking wrapper around ``API.get_cluster_role_env_topologies``.

    Runs the call to completion on the tornado IOLoop instance and returns its
    result. Exceptions are logged at debug level and re-raised.
    """
    io_loop = tornado.ioloop.IOLoop.instance()

    def fetch_topologies():
        return API.get_cluster_role_env_topologies(cluster, role, env)

    try:
        topologies = io_loop.run_sync(fetch_topologies)
    except Exception:
        Log.debug(traceback.format_exc())
        raise
    return topologies
def get_topology_metrics(*args):
    """Blocking wrapper around ``API.get_comp_metrics``.

    All positional arguments are forwarded unchanged. The call is run to
    completion on the tornado IOLoop instance; exceptions are logged at debug
    level and re-raised.
    """
    io_loop = tornado.ioloop.IOLoop.instance()

    def fetch_metrics():
        return API.get_comp_metrics(*args)

    try:
        metrics = io_loop.run_sync(fetch_metrics)
    except Exception:
        Log.debug(traceback.format_exc())
        raise
    return metrics
def set_topology_context(self, metrics_collector):
    """Build a new ``TopologyContext`` and store it on ``self.context``.

    The context config is the topology config updated with this component's
    own config entries.

    :param metrics_collector: passed through to ``TopologyContext``
    """
    Log.debug("Setting topology context")

    merged_config = self.get_topology_config()
    component_config = self._get_dict_from_config(self.my_component.config)
    merged_config.update(component_config)

    task_map = self._get_task_to_comp_map()
    self.context = TopologyContext(
        merged_config,
        self.pplan.topology,
        task_map,
        self.my_task_id,
        metrics_collector,
    )
def register_on_message(self, msg_builder):
    """Register a protobuf message builder for messages this client wants to receive.

    The builder is called once to obtain a sample message; the builder is then
    stored in ``self.registered_message_map`` under that message's
    ``DESCRIPTOR.full_name``.

    :param msg_builder: callable creating the protobuf message to receive
    """
    sample = msg_builder()
    Log.debug("In register_on_message(): %s" % sample.DESCRIPTOR.full_name)
    type_name = sample.DESCRIPTOR.full_name
    self.registered_message_map[type_name] = msg_builder
def get_clusters():
    """Blocking wrapper around ``API.get_clusters``.

    Runs the call to completion on the tornado IOLoop instance and returns its
    result. Exceptions are logged at debug level and re-raised.
    """
    io_loop = tornado.ioloop.IOLoop.instance()

    def fetch_clusters():
        return API.get_clusters()

    try:
        clusters = io_loop.run_sync(fetch_clusters)
    except Exception:
        Log.debug(traceback.format_exc())
        raise
    return clusters
def on_connect(self, status):
    """Handle the result of a connection attempt to the Stream Manager.

    On failure a reconnect is scheduled after the configured interval; on
    success the message handlers are registered and the register request is
    sent.

    :param status: status code of the connection attempt
    """
    Log.debug("In on_connect of STStmgrClient")

    if status != StatusCode.OK:
        Log.error("Error connecting to Stream Manager with status: %s", str(status))
        interval_key = constants.INSTANCE_RECONNECT_STREAMMGR_INTERVAL_SEC
        delay_sec = float(self.sys_config[interval_key])
        self.looper.register_timer_task_in_sec(self.start_connect, delay_sec)
    else:
        self._register_msg_to_handle()
        self._send_register_req()
def _send_metrics_messages(self):
    """Send every queued metrics message and record what was sent.

    Does nothing when ``self.connected`` is false. For each message sent, the
    gateway metrics are updated with its byte size and with its metric and
    exception counts.
    """
    if not self.connected:
        return

    while not self.out_queue.is_empty():
        outgoing = self.out_queue.poll()
        assert isinstance(outgoing, metrics_pb2.MetricPublisherPublishMessage)
        Log.debug("Sending metric message: %s" % str(outgoing))
        self.send_message(outgoing)

        sent_bytes = outgoing.ByteSize()
        self.gateway_metrics.update_sent_metrics_size(sent_bytes)
        metric_count = len(outgoing.metrics)
        exception_count = len(outgoing.exceptions)
        self.gateway_metrics.update_sent_metrics(metric_count, exception_count)
def on_response(self, status, context, response):
    """Dispatch a response received from the Metrics Manager.

    Only ``MetricPublisherRegisterResponse`` is handled.

    :raises RuntimeError: if ``status`` is not OK or the response is of any
                          other kind
    """
    Log.debug("In on_response with status: %s, with context: %s" % (str(status), str(context)))

    if status != StatusCode.OK:
        raise RuntimeError("Response from Metrics Manager not OK")

    if not isinstance(response, metrics_pb2.MetricPublisherRegisterResponse):
        Log.error("Unknown kind of response received: %s" % response.DESCRIPTOR.full_name)
        raise RuntimeError("Unknown kind of response received from Metrics Manager")

    self._handle_register_response(response)
def heron_pex(topology_pex, topology_class_name, tmp_dir):
    """Load a topology class from a pex file and call its ``write(tmp_dir)``.

    :param topology_pex: path of the pex file
    :param topology_class_name: name of the topology class to import
    :param tmp_dir: argument handed to the class's ``write``
    :raises RuntimeError: if loading, importing or writing fails; the
                          original traceback is printed first
    """
    Log.debug("Importing %s from %s" % (topology_class_name, topology_pex))
    try:
        pex_loader.load_pex(topology_pex)
        loaded_class = pex_loader.import_and_get_class(topology_pex, topology_class_name)
        loaded_class.write(tmp_dir)
    except Exception:
        traceback.print_exc()
        raise RuntimeError("Topology pex failed to be loaded. Bailing out...")
def run(command, parser, cl_args, unknown_args):
    '''
    Run the activate command by delegating to ``cli_helper.run``.

    :param command: command name, forwarded to ``cli_helper.run``
    :param parser: not used here
    :param cl_args: parsed command line arguments, forwarded
    :param unknown_args: not used here
    :return: whatever ``cli_helper.run`` returns
    '''
    Log.debug("Activate Args: %s", cl_args)
    action = "activate topology"
    outcome = cli_helper.run(command, cl_args, action)
    return outcome
def get_positional_args(self):
    """Map ``'--<dest>'`` to ``<dest>`` for this parser's positional store actions.

    An action counts as positional when it is an ``argparse._StoreAction``
    with no option strings. The ``'cluster/[role]/[env]'`` positional is left
    out of the map.

    :return: a ``collections.defaultdict`` holding the mapping
    """
    positional_args_map = collections.defaultdict(dict)
    for action in self._actions:
        # pylint: disable=protected-access
        if not isinstance(action, argparse._StoreAction):
            continue
        if len(action.option_strings) != 0:
            continue
        if action.dest == 'cluster/[role]/[env]':
            continue
        positional_args_map['--' + action.dest] = action.dest
        Log.debug("get_positional_args : key: %s, dest : %s", action, action.dest)
    return positional_args_map
def get_component_metrics(component, cluster, env, topology, role):
    """Blocking call returning the ``"metrics"`` entry of a component's metrics.

    Requests every query from ``metric_queries()`` with an empty instance list
    and the time range ``[0, -1]``. Exceptions are logged at debug level and
    re-raised.
    """
    queries = metric_queries()
    try:
        response = get_topology_metrics(
            cluster, env, topology, component, [], queries, [0, -1], role)
        metrics = response["metrics"]
    except Exception:
        Log.debug(traceback.format_exc())
        raise
    return metrics
def _handle_register_response(self, response):
    """Process a register response (RegisterInstanceResponse).

    When the response carries a physical plan it is handed to
    ``_handle_assignment_message``.

    :raises RuntimeError: if the response status is not OK
    """
    ok_status = common_pb2.StatusCode.Value("OK")
    if response.status.status != ok_status:
        raise RuntimeError("Stream Manager returned a not OK response for register")
    Log.info("We registered ourselves to the Stream Manager")

    if not response.HasField("pplan"):
        Log.debug("Received a register response with no pplan")
        return

    Log.info("Handling assignment message from response")
    self._handle_assignment_message(response.pplan)
def start_connect(self):
    """Create a TCP socket and start connecting to ``self.endpoint``.

    ``loop()`` method needs to be called after this.
    """
    Log.debug("In start_connect() of %s" % self._get_classname())
    # TODO: specify buffer size, exception handling
    family, kind = socket.AF_INET, socket.SOCK_STREAM
    self.create_socket(family, kind)

    # when ready, handle_connect is called
    self.connect(self.endpoint)
def _gather_metrics(self, time_bucket_in_sec):
    """Collect the metrics registered for one time bucket and queue them.

    Nothing happens for a bucket with no registered metric names. Otherwise
    one ``MetricPublisherPublishMessage`` is filled through
    ``_gather_one_metric``, offered to ``self.out_metrics`` and the timer task
    for the bucket is registered again.

    :param time_bucket_in_sec: key into ``time_bucket_in_sec_to_metrics_name``
    """
    if time_bucket_in_sec not in self.time_bucket_in_sec_to_metrics_name:
        return

    publish_msg = metrics_pb2.MetricPublisherPublishMessage()
    for metric_name in self.time_bucket_in_sec_to_metrics_name[time_bucket_in_sec]:
        Log.debug("Will call gather_one_metric with %s", metric_name)
        self._gather_one_metric(metric_name, publish_msg)

    assert publish_msg.IsInitialized()
    self.out_metrics.offer(publish_msg)

    # schedule ourselves again
    self._register_timer_task(time_bucket_in_sec)
def get(self):
    """Write the topology info of the requested topology as the response.

    Cluster, role, environ and topology name come from the request. Any
    exception is logged at debug level and written as an error response.
    """
    try:
        cluster = self.get_argument_cluster()
        role = self.get_argument_role()
        environ = self.get_argument_environ()
        topology_name = self.get_argument_topology()
        info = self.tracker.getTopologyInfo(topology_name, cluster, role, environ)
        self.write_success_response(info)
    except Exception as error:
        Log.debug(traceback.format_exc())
        self.write_error_response(error)
def on_connect(self, status):
    """Handle the result of a connection attempt to the Stream Manager.

    On failure a reconnect is scheduled after the configured interval; on
    success the message handlers are registered and the register request is
    sent.

    :param status: status code of the connection attempt
    """
    Log.debug("In on_connect of STStmgrClient")

    if status != StatusCode.OK:
        Log.error("Error connecting to Stream Manager with status: %s" % str(status))
        interval_key = constants.INSTANCE_RECONNECT_STREAMMGR_INTERVAL_SEC
        delay_sec = float(self.sys_config[interval_key])
        self.looper.register_timer_task_in_sec(self.start_connect, delay_sec)
    else:
        self._register_msg_to_handle()
        self._send_register_req()
def _handle_register_response(self, response):
    """Process a register response (RegisterInstanceResponse).

    When the response carries a physical plan it is handed to
    ``_handle_assignment_message``.

    :raises RuntimeError: if the response status is not OK
    """
    expected_status = common_pb2.StatusCode.Value("OK")
    if response.status.status != expected_status:
        raise RuntimeError("Stream Manager returned a not OK response for register")
    Log.info("We registered ourselves to the Stream Manager")

    carries_plan = response.HasField("pplan")
    if carries_plan:
        Log.info("Handling assignment message from response")
        self._handle_assignment_message(response.pplan)
        return
    Log.debug("Received a register response with no pplan")
def start_connect(self):
    """Create a TCP socket, mark the client as connecting and connect to ``self.endpoint``.

    ``loop()`` method needs to be called after this.
    """
    Log.debug("In start_connect() of %s" % self._get_classname())
    # TODO: specify buffer size, exception handling
    family, kind = socket.AF_INET, socket.SOCK_STREAM
    self.create_socket(family, kind)

    # when ready, handle_connect is called
    self._connecting = True
    self.connect(self.endpoint)
def on_response(self, status, context, response):
    """Dispatch a response received from the Metrics Manager.

    Only ``MetricPublisherRegisterResponse`` is handled.

    :raises RuntimeError: if ``status`` is not OK or the response is of any
                          other kind
    """
    Log.debug("In on_response with status: %s, with context: %s" % (str(status), str(context)))

    if status != StatusCode.OK:
        raise RuntimeError("Response from Metrics Manager not OK")

    is_register_response = isinstance(response, metrics_pb2.MetricPublisherRegisterResponse)
    if is_register_response:
        self._handle_register_response(response)
        return

    Log.error("Unknown kind of response received: %s" % response.DESCRIPTOR.full_name)
    raise RuntimeError("Unknown kind of response received from Metrics Manager")
def _send_metrics_messages(self):
    """Send every queued metrics message and record what was sent.

    Does nothing when ``self.connected`` is false. For each message sent, the
    gateway metrics are updated with its byte size and with its metric and
    exception counts.
    """
    if not self.connected:
        return

    while True:
        if self.out_queue.is_empty():
            break

        outgoing = self.out_queue.poll()
        assert isinstance(outgoing, metrics_pb2.MetricPublisherPublishMessage)
        Log.debug("Sending metric message: %s" % str(outgoing))
        self.send_message(outgoing)

        # Account for the message that was just sent.
        self.gateway_metrics.update_sent_metrics_size(outgoing.ByteSize())
        self.gateway_metrics.update_sent_metrics(
            len(outgoing.metrics), len(outgoing.exceptions))
def get_positional_args(self):
    """Map ``'--<dest>'`` to ``<dest>`` for this parser's positional store actions.

    An action counts as positional when it is an ``argparse._StoreAction``
    with no option strings. The ``'cluster/[role]/[env]'`` positional is left
    out of the map.

    :return: a ``collections.defaultdict`` holding the mapping
    """
    skipped_dest = 'cluster/[role]/[env]'
    positional_args_map = collections.defaultdict(dict)

    for action in self._actions:
        # pylint: disable=protected-access
        is_positional_store = (
            isinstance(action, argparse._StoreAction)
            and len(action.option_strings) == 0
        )
        if is_positional_store and action.dest != skipped_dest:
            positional_args_map['--' + action.dest] = action.dest
            Log.debug("get_positional_args : key: %s, dest : %s", action, action.dest)

    return positional_args_map
def on_connect(self, status):
    """Handle the result of a connection attempt to the Metrics Manager.

    On failure a reconnect is scheduled after the configured interval; on
    success the register request is sent.

    :param status: status code of the connection attempt
    """
    Log.debug("In on_connect of MetricsManagerClient")

    if status != StatusCode.OK:
        Log.error(f"Error connecting to Metrics Manager with status: {str(status)}")
        interval_key = constants.INSTANCE_RECONNECT_METRICSMGR_INTERVAL_SEC
        delay_sec = float(self.sys_config[interval_key])
        self.looper.register_timer_task_in_sec(self.start_connect, delay_sec)
    else:
        self._send_register_req()
def heron_class(class_name, lib_jars, extra_jars=None, args=None, java_defines=None):
    '''
    Launch a heron Java class in a subprocess and wrap it in a ProcessResult.

    The class path is built from ``extra_jars + lib_jars`` and the
    ``HERON_OPTIONS`` environment variable is set from the heron config.

    :param class_name: Java class to run
    :param lib_jars: list of library jars
    :param extra_jars: optional list of additional jars (default: none)
    :param args: optional arguments for the class (default: none)
    :param java_defines: optional values passed as ``-D`` options (default: none)
    :return: ``ProcessResult`` wrapping the started process
    '''
    # Missing optional params become empty lists.
    extra_jars = [] if extra_jars is None else extra_jars
    args = [] if args is None else args
    java_defines = [] if java_defines is None else java_defines

    # The java -D options have to be part of the argument list itself.
    all_args = [config.get_java_path(), "-client", "-Xmx1g"]
    all_args.extend('-D' + opt for opt in java_defines)
    all_args.extend(["-cp", config.get_classpath(extra_jars + lib_jars)])
    all_args.append(class_name)
    all_args.extend(args)

    # set heron_config environment variable
    heron_env = os.environ.copy()
    heron_env['HERON_OPTIONS'] = opts.get_heron_config()

    # print the verbose message
    Log.debug("Invoking class using command: ``%s''", ' '.join(all_args))
    Log.debug("Heron options: {%s}", str(heron_env["HERON_OPTIONS"]))

    # stdout carries what the Java program sends back; stderr carries extra
    # information such as debugging messages
    child = subprocess.Popen(all_args, env=heron_env, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, bufsize=1)
    return ProcessResult(child)
def __init__(self, socket_map):
    """Set up a GatewayLooper.

    :param socket_map: socket map used for asyncore.dispatcher
    """
    super().__init__()
    self.sock_map = socket_map

    # Pipe used for wake up select
    read_end, write_end = os.pipe()
    self.pipe_r = read_end
    self.pipe_w = write_end

    self.started = time.time()
    Log.debug("Gateway Looper started time: " + str(time.asctime()))
def update_config_files(cl_args):
    """Regenerate the templated config files from the parsed roles.

    Runs, in order, the slave hcl, scheduler yaml, uploader yaml, apiserver
    hcl, state manager yaml and heron tools hcl templating helpers.

    :param cl_args: parsed command line arguments
    """
    Log.info("Updating config files...")

    roles = read_and_parse_roles(cl_args)
    Log.debug("roles: %s" % roles)
    masters = list(roles[Role.MASTERS])
    zookeepers = list(roles[Role.ZOOKEEPERS])

    # (templating helper, host lists it receives after cl_args)
    templating_steps = (
        (template_slave_hcl, (masters,)),
        (template_scheduler_yaml, (masters,)),
        (template_uploader_yaml, (masters,)),
        (template_apiserver_hcl, (masters, zookeepers)),
        (template_statemgr_yaml, (zookeepers,)),
        (template_heron_tools_hcl, (masters, zookeepers)),
    )
    for apply_template, host_lists in templating_steps:
        apply_template(cl_args, *host_lists)
def update_slave_config_files(cl_args):
    '''
    Template the uploader and apiserver config files for the slave servers.

    Returns without doing anything when no slaves are configured. Otherwise
    the first slave is substituted into the uploader yaml template and into
    the apiserver nomad job template, and both results are written under
    ``cl_args["config_path"]``.

    :param cl_args: parsed command line arguments
    '''
    roles = read_and_parse_roles(cl_args)
    slaves = list(roles[SET.SLAVES])

    if not slaves:
        return

    Log.debug("Templating files for slaves...")

    # update apiserver location
    single_slave = slaves[0]

    uploader_template_path = "%s/standalone/templates/uploader.template.yaml" \
                             % cl_args["config_path"]
    with open(uploader_template_path, 'r') as tf:
        uploader_template = tf.read()
    uploader_yaml = uploader_template.replace(
        "<http_uploader_uri>",
        "http://%s:9000/api/v1/file/upload" % single_slave)

    uploader_output_path = "%s/standalone/uploader.yaml" % cl_args["config_path"]
    with open(uploader_output_path, 'w') as tf:
        tf.write(uploader_yaml)
        tf.truncate()

    # Api server nomad job def
    apiserver_template_path = "%s/standalone/templates/apiserver.template.hcl" \
                              % cl_args["config_path"]
    with open(apiserver_template_path, 'r') as tf:
        apiserver_template = tf.read()
    apiserver_hcl = apiserver_template.replace(
        "<heron_apiserver_hostname>",
        '"%s"' % get_hostname(single_slave, cl_args))

    # The executable location depends on whether the slave is this machine.
    if is_self(single_slave):
        executable = '"%s/heron-apiserver"' % config.get_heron_bin_dir()
    else:
        executable = '"%s/.heron/bin/heron-apiserver"' \
                     % get_remote_home(single_slave, cl_args)
    apiserver_hcl = apiserver_hcl.replace("<heron_apiserver_executable>", executable)

    apiserver_output_path = "%s/standalone/resources/apiserver.hcl" % cl_args["config_path"]
    with open(apiserver_output_path, 'w') as tf:
        tf.write(apiserver_hcl)
        tf.truncate()
def import_and_get_class(path_to_pex, python_class_name):
    """Imports and load a class from a given pex file path and python class name

    For example, if you want to get a class called `Sample` in
    /some-path/sample.pex/heron/examples/src/python/sample.py,
    ``path_to_pex`` needs to be ``/some-path/sample.pex``, and
    ``python_class_name`` needs to be ``heron.examples.src.python.sample.Sample``

    Class names starting with ``heron.`` are first resolved through
    ``resolve_heron_suffix_issue``; if that fails, the error is logged and the
    regular absolute import is used as a fallback.

    :param path_to_pex: path of the pex file
    :param python_class_name: fully qualified name of the class
    :return: the attribute named by the last component of ``python_class_name``
    """
    abs_path_to_pex = os.path.abspath(path_to_pex)

    Log.debug(f"Add a pex to the path: {abs_path_to_pex}")
    Log.debug(f"In import_and_get_class with cls_name: {python_class_name}")
    split = python_class_name.split('.')
    from_path = '.'.join(split[:-1])
    import_name = split[-1]

    Log.debug(f"From path: {from_path}, import name: {import_name}")

    # Resolve duplicate package suffix problem (heron.), if the top level package name is heron
    if python_class_name.startswith("heron."):
        try:
            mod = resolve_heron_suffix_issue(abs_path_to_pex, python_class_name)
            return getattr(mod, import_name)
        except Exception:
            # Only ordinary errors trigger the fallback; KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            Log.error(
                f"Could not resolve class {python_class_name} with special handling"
            )

    mod = __import__(from_path, fromlist=[import_name], level=0)
    Log.debug(f"Imported module: {str(mod)}")
    return getattr(mod, import_name)
def distribute_package(roles, cl_args):
    '''
    Pack the heron directory into a tar file and copy it to all master and slave nodes.

    :param roles: mapping holding the master and slave host sets
    :param cl_args: parsed command line arguments; ``heron_dir`` is the
                    directory that gets packed
    '''
    Log.info("Distributing heron package to nodes (this might take a while)...")
    master_nodes = roles[Role.MASTERS]
    slave_nodes = roles[Role.SLAVES]

    # Only the generated name is kept; the temporary file object itself is
    # not held on to.
    archive_path = tempfile.NamedTemporaryFile(suffix=".tmp").name
    Log.debug("TAR file %s to %s" % (cl_args["heron_dir"], archive_path))
    make_tarfile(archive_path, cl_args["heron_dir"])

    targets = master_nodes.union(slave_nodes)
    scp_package(archive_path, targets, cl_args)
def __init__(self, looper, metrics_host, port, instance, out_metrics, in_stream, out_stream,
             sock_map, socket_options, gateway_metrics, sys_config):
    """Set up the client and register its periodic tasks.

    ``looper``, ``metrics_host``, ``port``, ``sock_map`` and
    ``socket_options`` go to ``HeronClient.__init__``; the remaining
    arguments are stored on the instance (``out_metrics`` as
    ``self.out_queue``).
    """
    HeronClient.__init__(self, looper, metrics_host, port, sock_map, socket_options)

    # Collaborators kept for later use.
    self.instance = instance
    self.gateway_metrics = gateway_metrics
    self.sys_config = sys_config

    # Queue and streams this client works with.
    self.out_queue = out_metrics
    self.in_stream = in_stream
    self.out_stream = out_stream

    self._add_metrics_client_tasks()
    Log.debug('start updating in and out stream metrics')
    self._update_in_out_stream_metrics_tasks()
def import_and_get_class(path_to_pex, python_class_name):
    """Imports and load a class from a given pex file path and python class name

    For example, if you want to get a class called `Sample` in
    /some-path/sample.pex/heron/examples/src/python/sample.py,
    ``path_to_pex`` needs to be ``/some-path/sample.pex``, and
    ``python_class_name`` needs to be ``heron.examples.src.python.sample.Sample``

    Class names starting with ``heron.`` are resolved through
    ``resolve_heron_suffix_issue``; everything else goes through an absolute
    import.

    :param path_to_pex: path of the pex file
    :param python_class_name: fully qualified name of the class
    :return: the attribute named by the last component of ``python_class_name``
    """
    abs_path_to_pex = os.path.abspath(path_to_pex)

    Log.debug("Add a pex to the path: %s" % abs_path_to_pex)
    Log.debug("In import_and_get_class with cls_name: %s" % python_class_name)
    split = python_class_name.split('.')
    from_path = '.'.join(split[:-1])
    import_name = split[-1]

    Log.debug("From path: %s, import name: %s" % (from_path, import_name))

    # Resolve duplicate package suffix problem (heron.), if the top level package name is heron
    if python_class_name.startswith("heron."):
        mod = resolve_heron_suffix_issue(abs_path_to_pex, python_class_name)
        return getattr(mod, import_name)

    # level=0 requests an absolute import. The previous level=-1 is only
    # accepted by Python 2; Python 3 raises ValueError for negative levels.
    mod = __import__(from_path, fromlist=[import_name], level=0)
    Log.debug("Imported module: %s" % str(mod))
    return getattr(mod, import_name)
def on_topologies_watch(state_manager: StateManager, topologies: List[str]) -> None:
    """Reconcile the cached topologies with the names reported by a state watch.

    Cached topologies of this state manager that are no longer reported are
    removed; reported names that are not cached yet are added.

    :param state_manager: state manager whose watch fired
    :param topologies: topology names currently reported
    """
    reported = set(topologies)
    Log.info("State watch triggered for topologies.")
    Log.debug("Topologies: %s", reported)

    known = set(t.name for t in self.get_stmgr_topologies(state_manager.name))
    Log.debug("Existing topologies: %s", known)

    stale = known - reported
    for stale_name in stale:
        Log.info("Removing topology: %s in rootpath: %s", stale_name, state_manager.rootpath)
        self.remove_topology(stale_name, state_manager.name)

    fresh = reported - known
    for fresh_name in fresh:
        self.add_new_topology(state_manager, fresh_name)
def poll(self, timeout=0.0):
    """Modified version of poll() from asyncore module

    Runs one ``select()`` round over the dispatchers in ``self.sock_map`` plus
    the wake-up pipe ``self.pipe_r``. Data written to the wake-up pipe is read
    and discarded; ready dispatchers are handed to asyncore's read, write and
    exception handlers.

    :param timeout: timeout in seconds handed to ``select.select()``
    :raises select.error: for any select failure other than ``EINTR``
    """
    if self.sock_map is None:
        Log.warning("Socket map is not registered to Gateway Looper")
    readable_lst = []
    writable_lst = []
    error_lst = []

    if self.sock_map is not None:
        for fd, obj in self.sock_map.items():
            is_r = obj.readable()
            is_w = obj.writable()
            if is_r:
                readable_lst.append(fd)
            if is_w and not obj.accepting:
                writable_lst.append(fd)
            if is_r or is_w:
                error_lst.append(fd)

    # Add wakeup fd
    readable_lst.append(self.pipe_r)

    # Lazy %-args: the (potentially large) socket map and fd lists are only
    # rendered when debug logging is actually enabled.
    Log.debug("Will select() with timeout: %s, with map: %s", timeout, self.sock_map)
    try:
        readable_lst, writable_lst, error_lst = \
            select.select(readable_lst, writable_lst, error_lst, timeout)
    except select.error as err:
        Log.debug("Trivial error: %s", err)
        if err.args[0] != errno.EINTR:
            raise
        else:
            return
    Log.debug("Selected [r]: %s [w]: %s [e]: %s", readable_lst, writable_lst, error_lst)

    if self.pipe_r in readable_lst:
        Log.debug("Read from pipe")
        os.read(self.pipe_r, 1024)
        readable_lst.remove(self.pipe_r)

    if self.sock_map is not None:
        for fd in readable_lst:
            obj = self.sock_map.get(fd)
            if obj is None:
                continue
            asyncore.read(obj)

        for fd in writable_lst:
            obj = self.sock_map.get(fd)
            if obj is None:
                continue
            asyncore.write(obj)

        for fd in error_lst:
            obj = self.sock_map.get(fd)
            if obj is None:
                continue
            # pylint: disable=W0212
            asyncore._exception(obj)
def get(self):
    """Write the ``"metadata"`` entry of the requested topology's info as the response.

    Cluster, role, environ and topology name come from the request. Any
    exception is logged and written as an error response.
    """
    try:
        cluster = self.get_argument_cluster()
        role = self.get_argument_role()
        environ = self.get_argument_environ()
        topology_name = self.get_argument_topology()
        info = self.tracker.getTopologyInfo(topology_name, cluster, role, environ)
        self.write_success_response(info["metadata"])
    except Exception as error:
        Log.error("Exception when handling GET request '/topologies/metadata'")
        Log.debug(traceback.format_exc())
        self.write_error_response(error)
def offer(self, item):
    """Try to put ``item`` into the buffer without blocking.

    On success the consumer callback, if one is set, is invoked and True is
    returned. When the buffer is full (``Queue.Full``), the condition is
    logged at debug level and False is returned.
    """
    try:
        # non-blocking
        self._buffer.put(item, block=False)
        if self._consumer_callback is not None:
            self._consumer_callback()
    except Queue.Full:
        Log.debug("%s: Full in offer()" % str(self))
        return False
    else:
        return True
def poll(self):
    """Take the next item from the buffer without blocking.

    On success the producer callback, if one is set, is invoked and the item
    is returned.

    :raises Queue.Empty: when the buffer is empty (also logged at debug level)
    """
    try:
        # non-blocking
        item = self._buffer.get(block=False)
        if self._producer_callback is not None:
            self._producer_callback()
    except Queue.Empty:
        Log.debug("%s: Empty in poll()" % str(self))
        raise Queue.Empty
    else:
        return item
def getInstancePid(topology_info, instance_id):
    """
    Fetch the body of heron-shell's ``/pid/<instance_id>`` endpoint.

    This function is used by other modules, and so it is not a part of the
    class. It is a generator: the response body is delivered by raising
    ``tornado.gen.Return``.

    :raises Exception: when the HTTP request fails with a tornado HTTPError
    """
    try:
        fetcher = tornado.httpclient.AsyncHTTPClient()
        shell_endpoint = utils.make_shell_endpoint(topology_info, instance_id)
        pid_url = "%s/pid/%s" % (shell_endpoint, instance_id)
        Log.debug("HTTP call for url: %s", pid_url)
        reply = yield fetcher.fetch(pid_url)
        raise tornado.gen.Return(reply.body)
    except tornado.httpclient.HTTPError as http_error:
        raise Exception(str(http_error))
def _flush_remaining(self):
    """Push any buffered data and control tuple sets to the stream and reset the buffers.

    The data tuple set is flushed first (also resetting its byte counter),
    then the control tuple set.
    """
    has_data = self.current_data_tuple_set is not None
    if has_data:
        Log.debug("In flush_remaining() - flush data tuple set")
        data_msg = self.make_tuple_set()
        data_msg.data.CopyFrom(self.current_data_tuple_set)
        self._push_tuple_to_stream(data_msg)
        self.current_data_tuple_set = None
        self.current_data_tuple_size_in_bytes = 0

    has_control = self.current_control_tuple_set is not None
    if has_control:
        Log.debug("In flush_remaining() - flush control tuple set")
        control_msg = self.make_tuple_set()
        control_msg.control.CopyFrom(self.current_control_tuple_set)
        self._push_tuple_to_stream(control_msg)
        self.current_control_tuple_set = None
def get(self):
    """Write the result of ``getInstancePid`` for the requested instance as the response.

    Cluster, role, environ, topology name and instance come from the request.
    Any exception is logged at debug level and written as an error response.
    """
    try:
        cluster = self.get_argument_cluster()
        role = self.get_argument_role()
        environ = self.get_argument_environ()
        topology_name = self.get_argument_topology()
        instance = self.get_argument_instance()
        info = self.tracker.get_topology_info(topology_name, cluster, role, environ)
        pid_result = yield getInstancePid(info, instance)
        self.write_success_response(pid_result)
    except Exception as error:
        Log.debug(traceback.format_exc())
        self.write_error_response(error)
def _save_or_remove(config, cluster):
    """Persist ``config`` as the cluster's YAML config file, or remove that file.

    A truthy ``config`` is dumped to the cluster config file (creating the
    config directory when needed). A falsy ``config`` removes an existing
    cluster config file; a failure to remove it is ignored on purpose.

    :param config: configuration to store; falsy means "nothing to store"
    :param cluster: cluster whose config file is written or removed
    """
    cluster_config_file = get_cluster_config_file(cluster)
    if config:
        Log.debug("saving config file: %s", cluster_config_file)
        config_directory = get_config_directory(cluster)
        if not os.path.isdir(config_directory):
            os.makedirs(config_directory)
        with open(cluster_config_file, 'wb') as cf:
            # The file is opened in binary mode, so ask PyYAML for encoded
            # bytes. Without an explicit encoding, Python 3 produces str,
            # which a binary stream rejects.
            yaml.dump(config, cf, default_flow_style=False, encoding='utf-8')
    else:
        if os.path.isfile(cluster_config_file):
            try:
                os.remove(cluster_config_file)
            except OSError:
                # best effort: a file that cannot be removed is left behind
                pass
def on_topologies_watch(state_manager, topologies) -> None:
    """Reconcile the cached topologies with the names reported by a state watch.

    Cached topologies of this state manager that are no longer reported are
    removed; reported names that are not cached yet are added. Both passes
    keep the order of the original sequences.

    :param state_manager: state manager whose watch fired
    :param topologies: topology names currently reported
    """
    Log.info("State watch triggered for topologies.")
    Log.debug("Topologies: " + str(topologies))
    cached_names = [t.name for t in self.get_stmgr_topologies(state_manager.name)]
    Log.debug(f"Existing topologies: {cached_names}")

    # Sets give constant-time membership tests; the loops below would
    # otherwise scan a list for every name.
    reported = set(topologies)
    cached = set(cached_names)

    for name in cached_names:
        if name not in reported:
            Log.info("Removing topology: %s in rootpath: %s",
                     name, state_manager.rootpath)
            self.remove_topology(name, state_manager.name)

    for name in topologies:
        if name not in cached:
            self.add_new_topology(state_manager, name)
def download(uri, cluster):
    """Run heron-downloader.sh for ``uri`` into a new temp directory.

    :param uri: value for the downloader's ``-u`` option
    :param cluster: cluster whose conf directory is passed with ``-p``
    :return: path of the first directory entry ending in .jar, .tar, .tar.gz,
             .pex, .dylib or .so, or None when there is no such entry
    """
    workdir = tempfile.mkdtemp()
    cmd = [
        config.get_heron_bin_dir() + "/heron-downloader.sh",
        "-u " + uri,
        "-f " + workdir,
        "-d " + config.get_heron_dir(),
        "-p " + config.get_heron_cluster_conf_dir(cluster, config.get_heron_conf_dir()),
        "-m local",
    ]
    Log.debug("download uri command: %s", cmd)
    subprocess.call(cmd)

    suffix = (".jar", ".tar", ".tar.gz", ".pex", ".dylib", ".so")
    first_match = next(
        (entry for entry in os.listdir(workdir) if entry.endswith(suffix)), None)
    if first_match is None:
        return None
    return os.path.join(workdir, first_match)
def heron_class(class_name, lib_jars, extra_jars=None, args=None, java_defines=None):
    '''
    Run a heron Java class in a subprocess and wait for it to finish.

    The class path is built from ``lib_jars + extra_jars`` and the
    ``HERON_OPTIONS`` environment variable is set from the heron config.

    :param class_name: Java class to run
    :param lib_jars: list of library jars
    :param extra_jars: optional list of additional jars (default: none)
    :param args: optional arguments for the class (default: none)
    :param java_defines: optional values passed as ``-D`` options (default: none)
    :raises RuntimeError: if the subprocess exits with a non-zero status
    '''
    # Missing optional params become empty lists.
    extra_jars = [] if extra_jars is None else extra_jars
    args = [] if args is None else args
    java_defines = [] if java_defines is None else java_defines

    # The java -D options have to be part of the argument list itself.
    all_args = [config.get_java_path(), "-client", "-Xmx1g"]
    all_args.extend('-D' + opt for opt in java_defines)
    all_args.extend(["-cp", config.get_classpath(lib_jars + extra_jars)])
    all_args.append(class_name)
    all_args.extend(args)

    # set heron_config environment variable
    heron_env = os.environ.copy()
    heron_env['HERON_OPTIONS'] = opts.get_heron_config()

    # print the verbose message
    Log.debug('$> %s' % ' '.join(all_args))
    Log.debug('Heron options: %s' % str(heron_env["HERON_OPTIONS"]))

    # invoke the command with subprocess and fail on a non-zero status
    exit_status = subprocess.call(all_args, env=heron_env)
    if exit_status != 0:
        raise RuntimeError("User main failed with status %d. Bailing out..." % exit_status)
def parse_known_args(self, args=None, namespace=None):
    """Parse known args, rejecting optional spellings of positional arguments.

    For every parser other than ``heron`` itself, a leftover argument that
    matches ``--<positional dest>`` (for example ``--topology-name xyz`` for
    the positional ``topology-name``) is refused.

    :return: ``(namespace, remaining_args)`` as produced by the parent parser
    :raises ValueError: if a leftover argument names a positional argument
    """
    parsed, leftover = super(HeronArgumentParser, self).parse_known_args(args, namespace)
    positional_args_map = self.get_positional_args()

    if self.prog == 'heron':
        return parsed, leftover

    ## sub parser specific validation
    Log.debug('sub parser expansion %s %s', self.prog, leftover)
    for token in leftover:
        if token in positional_args_map:
            raise ValueError(
                'positional argument for command {} : {} specified in heronrc'.format(
                    self.prog, positional_args_map[token]))
    return parsed, leftover