def start_cluster(cl_args):
    '''
    Bring up a Heron standalone cluster.

    Reads the role assignments, exits with status -1 if any role has no
    servers, templates the config files, distributes the package unless the
    only node is this machine, and then starts masters, slaves, the API
    server and Heron tools in that order.

    :param cl_args: parsed command line arguments, passed through to helpers
    '''
    role_map = read_and_parse_roles(cl_args)
    masters = role_map[Role.MASTERS]
    slaves = role_map[Role.SLAVES]
    zookeepers = role_map[Role.ZOOKEEPERS]

    Log.info("Roles:")
    Log.info(" - Master Servers: %s" % list(masters))
    Log.info(" - Slave Servers: %s" % list(slaves))
    Log.info(" - Zookeeper Servers: %s" % list(zookeepers))

    # every role must have at least one server
    if not masters:
        Log.error("No master servers specified!")
        sys.exit(-1)
    if not slaves:
        Log.error("No slave servers specified!")
        sys.exit(-1)
    if not zookeepers:
        Log.error("No zookeeper servers specified!")
        sys.exit(-1)

    # make sure configs are templated
    update_config_files(cl_args)

    # the package only needs to be shipped when the deployment is not
    # limited to this very machine
    all_nodes = list(masters.union(slaves))
    if len(all_nodes) != 1 or not is_self(all_nodes[0]):
        distribute_package(role_map, cl_args)

    start_master_nodes(masters, cl_args)
    start_slave_nodes(slaves, cl_args)
    start_api_server(masters, cl_args)
    start_heron_tools(masters, cl_args)
    Log.info("Heron standalone cluster complete!")
def _emit_terminal_if_needed(self):
    """Emit the terminal tuple on the control stream once the spout is done
    and no tuples remain to be completed; otherwise only log the state."""
    Log.info("is_done: %s, tuples_to_complete: %s" % (self.is_done, self.tuples_to_complete))

    # nothing to emit until the spout is done AND every tuple is completed
    if not self.is_done or self.tuples_to_complete != 0:
        return

    Log.info("Emitting terminals to downstream")
    terminal_tuple = [integ_const.INTEGRATION_TEST_TERMINAL]
    super(IntegrationTestSpout, self).emit(
        terminal_tuple,
        stream=integ_const.INTEGRATION_TEST_CONTROL_STREAM_ID)
def start_slave_nodes(slaves, cl_args):
    '''
    Start slave nodes

    Launches the nomad agent command for every slave (through ssh for
    remote hosts), then collects the result of every launch. If any launch
    returned a non-zero code, all errors are logged and the process exits
    with status -1.

    :param slaves: iterable of slave hosts
    :param cl_args: parsed command line arguments, passed through to helpers
    '''
    launched = []
    for slave in slaves:
        Log.info("Starting slave on %s" % slave)
        cmd = "%s agent -config %s >> /tmp/nomad_client.log 2>&1 &" \
              % (get_nomad_path(cl_args), get_nomad_slave_config_file(cl_args))
        if not is_self(slave):
            cmd = ssh_remote_execute(cmd, slave, cl_args)
        Log.debug(cmd)
        proc = subprocess.Popen(cmd,
                                shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        launched.append({"pid": proc, "dest": slave})

    errors = []
    for entry in launched:
        proc = entry["pid"]
        # communicate() drains both pipes AND waits for the process; calling
        # wait() first can deadlock once a pipe buffer fills up.
        output = proc.communicate()
        return_code = proc.returncode
        Log.debug("return code: %s output: %s" % (return_code, output))
        if return_code != 0:
            errors.append("Failed to start slave on %s with error:\n%s"
                          % (entry["dest"], output[1]))

    if errors:
        for error in errors:
            Log.error(error)
        sys.exit(-1)

    Log.info("Done starting slaves")
def __init__(self, pplan_helper, in_stream, out_stream, looper):
    """Set up the spout instance: initial PAUSED state, metrics, serializer,
    acking/timeout settings read from the cluster config, in-flight tuple
    bookkeeping, and finally the user's spout implementation.

    Raises RuntimeError when the physical plan helper is not for a spout.
    """
    super(SpoutInstance, self).__init__(pplan_helper, in_stream, out_stream, looper)
    self.topology_state = topology_pb2.TopologyState.Value("PAUSED")

    if not self.pplan_helper.is_spout:
        raise RuntimeError("No spout in physicial plan")

    ctx = self.pplan_helper.context
    self.spout_metrics = SpoutMetrics(self.pplan_helper)
    self.serializer = SerializerHelper.get_serializer(ctx)

    # acking related
    self.acking_enabled = ctx.get_cluster_config().get(
        api_constants.TOPOLOGY_ENABLE_ACKING, False)
    self.enable_message_timeouts = ctx.get_cluster_config().get(
        api_constants.TOPOLOGY_ENABLE_MESSAGE_TIMEOUTS)
    Log.info("Enable ACK: %s" % str(self.acking_enabled))
    Log.info("Enable Message Timeouts: %s" % str(self.enable_message_timeouts))

    # map <tuple_info.key -> tuple_info>, ordered by insertion time
    self.in_flight_tuples = collections.OrderedDict()
    self.immediate_acks = collections.deque()
    self.total_tuples_emitted = 0

    # load user's spout class and instantiate it with this instance as delegate
    user_spout_class = super(SpoutInstance, self).load_py_instance(is_spout=True)
    self.spout_impl = user_spout_class(delegate=self)
def fail(self, tup):
    """Forward a fail to the parent bolt while fewer tuples have been
    processed than received, and count the tuple as processed."""
    Log.info("Trying to do a fail. tuples processed: %d, received: %d"
             % (self.tuples_processed, self.tuple_received))

    # every received tuple has already been accounted for
    if not self.tuples_processed < self.tuple_received:
        return

    super(IntegrationTestBolt, self).fail(tup)
    self.tuples_processed += 1
def start_api_server(masters, cl_args):
    '''
    Start the Heron API server

    Waits for the first master to be up, submits the API server job through
    nomad (via ssh when the master is remote), exits with status -1 if the
    command fails, and then waits for the "apiserver" job to start.

    :param masters: collection of master hosts; the first one is used
    :param cl_args: parsed command line arguments, passed through to helpers
    '''
    # make sure nomad cluster is up
    single_master = list(masters)[0]
    wait_for_master_to_start(single_master)

    cmd = "%s run %s >> /tmp/apiserver_start.log 2>&1 &" \
          % (get_nomad_path(cl_args), get_apiserver_job_file(cl_args))
    Log.info("Starting Heron API Server on %s" % single_master)

    if not is_self(single_master):
        cmd = ssh_remote_execute(cmd, single_master, cl_args)
    Log.debug(cmd)

    proc = subprocess.Popen(cmd,
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    # communicate() drains both pipes AND waits for the process; calling
    # wait() first can deadlock once a pipe buffer fills up.
    output = proc.communicate()
    return_code = proc.returncode
    Log.debug("return code: %s output: %s" % (return_code, output))
    if return_code != 0:
        Log.error("Failed to start apiserver on %s with error:\n%s" % (single_master, output[1]))
        sys.exit(-1)

    wait_for_job_to_start(single_master, "apiserver")
    Log.info("Done starting Heron API Server")
def prepare(self, context, component, stream, target_tasks):
    """Log the source component, stream id and target tasks, then remember
    the target tasks on this grouping instance."""
    template = ("In prepare of SampleCustomGrouping, "
                "with src component: %s, "
                "with stream id: %s, "
                "with target tasks: %s")
    Log.info(template % (component, stream, str(target_tasks)))
    self.target_tasks = target_tasks
def _add_spout_task(self):
    """Register the spout's wakeup task with the looper and, when message
    timeouts are enabled, start looking for timed-out tuples."""
    Log.info("Adding spout task...")

    def spout_task():
        # don't do anything when topology is paused
        if not self._is_topology_running():
            return

        if not self._should_produce_tuple():
            self.spout_metrics.update_out_queue_full_count()
        else:
            self._produce_tuple()
            self.output_helper.send_out_tuples()
            # so emitted tuples would be added to buffer now
            self.looper.wake_up()

        if not self.acking_enabled:
            self._do_immediate_acks()
        else:
            self._read_tuples_and_execute()
            pending = len(self.in_flight_tuples)
            self.spout_metrics.update_pending_tuples_count(pending)

        if self._is_continue_to_work():
            self.looper.wake_up()

    self.looper.add_wakeup_task(spout_task)

    # look for the timeout's tuples
    if self.enable_message_timeouts:
        self._look_for_timeouts()
def register_watch(self, callback):
    """
    Register `callback` as a watch and return the UUID it is stored under.
    The UUID can later be used to unregister the watch.

    The callback is invoked once, with this object as its only argument,
    at registration time. If that first invocation raises, the error is
    logged, the watch is NOT stored and None is returned. None is also
    returned when no unused UUID could be generated within the retry budget.
    """
    # Retry in case a generated UUID is already in use, just in case...
    max_attempts = 5

    for _ in range(max_attempts):
        candidate = uuid.uuid4()
        if candidate in self.watches:
            continue

        Log.info("Registering a watch with uid: " + str(candidate))
        try:
            callback(self)
        except Exception as err:
            Log.error("Caught exception while triggering callback: " + str(err))
            Log.debug(traceback.format_exc())
            return None

        self.watches[candidate] = callback
        return candidate

    return None
def start_heron_tools(masters, cl_args):
    '''
    Start Heron tracker and UI

    Waits for the first master to be up, submits the Heron tools job through
    nomad (via ssh when the master is remote), exits with status -1 if the
    command fails, and then waits for the "heron-tools" job to start.

    :param masters: collection of master hosts; the first one is used
    :param cl_args: parsed command line arguments, passed through to helpers
    '''
    single_master = list(masters)[0]
    wait_for_master_to_start(single_master)

    cmd = "%s run %s >> /tmp/heron_tools_start.log 2>&1 &" \
          % (get_nomad_path(cl_args), get_heron_tools_job_file(cl_args))
    Log.info("Starting Heron Tools on %s" % single_master)

    if not is_self(single_master):
        cmd = ssh_remote_execute(cmd, single_master, cl_args)
    Log.debug(cmd)

    proc = subprocess.Popen(cmd,
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    # communicate() drains both pipes AND waits for the process; calling
    # wait() first can deadlock once a pipe buffer fills up.
    output = proc.communicate()
    return_code = proc.returncode
    Log.debug("return code: %s output: %s" % (return_code, output))
    if return_code != 0:
        Log.error("Failed to start Heron Tools on %s with error:\n%s" % (single_master, output[1]))
        sys.exit(-1)

    wait_for_job_to_start(single_master, "heron-tools")
    Log.info("Done starting Heron Tools")
def launch_topologies(cl_args, topology_file, tmp_dir):
    '''
    Launch topologies

    Every `*.defn` file found in `tmp_dir` is parsed as a topology
    definition and launched.

    :param cl_args: parsed command line arguments
    :param topology_file: topology file handed to the launcher
    :param tmp_dir: directory where the submitter wrote the .defn files
    :return: list of launch results; NOTE a single SimpleResult (not a list)
             is returned when no definition is found or one cannot be loaded
    '''
    # the submitter would have written the .defn file to the tmp_dir
    defn_files = glob.glob(tmp_dir + '/*.defn')

    if not defn_files:
        return SimpleResult(Status.HeronError, "No topologies found under %s" % tmp_dir)

    results = []
    for defn_file in defn_files:
        # load the topology definition from the file
        topology_defn = topology_pb2.Topology()
        try:
            # `with` closes the handle even when parsing fails
            with open(defn_file, "rb") as handle:
                topology_defn.ParseFromString(handle.read())
        except Exception as e:
            err_context = "Cannot load topology definition '%s': %s" % (defn_file, e)
            return SimpleResult(Status.HeronError, err_context)

        # launch the topology
        Log.info("Launching topology: '%s'", topology_defn.name)
        res = launch_a_topology(
            cl_args, tmp_dir, topology_file, defn_file, topology_defn.name)
        results.append(res)

    return results
def unregister_watch(self, uid):
    """
    Remove the watch stored under the given UUID.

    A UUID that is not present in the watches is silently ignored.
    """
    uid_text = str(uid)
    Log.info("Unregister a watch with uid: " + uid_text)
    # pop with a default so an unknown uid does not raise
    self.watches.pop(uid, None)
def handle_initiate_stateful_checkpoint(self, ckptmsg, component):
    """Handle an initiate-checkpoint message.

    Raises RuntimeError when this instance is not stateful. Stateful
    components get `pre_save` called with the checkpoint id; in every case
    the current state is then admitted for that checkpoint id.
    """
    checkpoint_id = ckptmsg.checkpoint_id
    Log.info("Received initiate state checkpoint message for %s" % checkpoint_id)

    if not self.is_stateful:
        raise RuntimeError("Received state checkpoint message but we are not stateful topology")

    if not isinstance(component, StatefulComponent):
        Log.info("Trying to checkponit a non stateful component. Send empty state")
    else:
        component.pre_save(checkpoint_id)

    self.admit_ckpt_state(checkpoint_id, self._stateful_state)
def _post_result_to_server(self, json_result):
    """POST `json_result` to the parsed result URL and return the HTTP
    status code of the response.

    A non-200 response is logged together with its body. The connection is
    always closed before returning.
    """
    conn = httplib.HTTPConnection(self.parsed_url.netloc)
    try:
        conn.request("POST", self.parsed_url.path, json_result)
        response = conn.getresponse()
        if response.status == 200:
            Log.info("HTTP POST successful")
        else:
            # the body must be read before the connection is closed
            Log.severe("HTTP POST failed, response code: %d, response: %s"
                       % (response.status, response.read()))
        return response.status
    finally:
        # release the socket on success and on failure alike
        conn.close()
def emit(self, tup, stream=Stream.DEFAULT_STREAM_ID, anchors=None,
         direct_task=None, need_task_ids=False):
    """Emit through the parent bolt.

    When `tup` is None, a list copy of the tuple currently being processed
    is emitted instead.
    """
    Log.info("emitting tuple: %s", tup)

    # fall back to the tuple currently being processed
    payload = list(self.current_tuple_processing) if tup is None else tup

    super(IntegrationTestBolt, self).emit(payload,
                                          stream=stream,
                                          anchors=anchors,
                                          direct_task=direct_task,
                                          need_task_ids=need_task_ids)
def setTopologyInfo(self, topology):
    """
    Rebuild the cached, API-facing representation of `topology` from its
    stored states and store it under (topology name, state manager name).

    Nothing is cached when the topology has no execution state.
    """
    # Execution state is the most basic info; without it the rest of the
    # things don't matter.
    if not topology.execution_state:
        Log.info("No execution state found for: " + topology.name)
        return

    Log.info("Setting topology info for topology: " + topology.name)

    # presence flags for the optional pieces of state
    has_physical_plan = bool(topology.physical_plan)
    has_tmaster_location = bool(topology.tmaster)
    has_scheduler_location = bool(topology.scheduler_location)

    info = {
        "name": topology.name,
        "id": topology.id,
        "logical_plan": None,
        "physical_plan": None,
        "execution_state": None,
        "tmaster_location": None,
        "scheduler_location": None,
    }

    exec_state = self.extract_execution_state(topology)
    exec_state["has_physical_plan"] = has_physical_plan
    exec_state["has_tmaster_location"] = has_tmaster_location
    exec_state["has_scheduler_location"] = has_scheduler_location
    exec_state["status"] = topology.get_status()

    info["metadata"] = self.extract_metadata(topology)
    info["runtime_state"] = self.extract_runtime_state(topology)
    info["execution_state"] = exec_state
    info["logical_plan"] = self.extract_logical_plan(topology)
    info["physical_plan"] = self.extract_physical_plan(topology)
    info["tmaster_location"] = self.extract_tmaster(topology)
    info["scheduler_location"] = self.extract_scheduler_location(topology)

    cache_key = (topology.name, topology.state_manager_name)
    self.topologyInfos[cache_key] = info
def on_incoming_message(self, message):
    """Record the packet size and dispatch the message by type.

    Raises RuntimeError for any message that is neither an assignment
    message nor a tuple message.
    """
    self.gateway_metrics.received_packet(message.ByteSize())

    if isinstance(message, stmgr_pb2.NewInstanceAssignmentMessage):
        Log.info("Handling assignment message from direct NewInstanceAssignmentMessage")
        self._handle_assignment_message(message.pplan)
        return

    if isinstance(message, stmgr_pb2.TupleMessage):
        self._handle_new_tuples(message)
        return

    raise RuntimeError("Unknown kind of message received from Stream Manager")
def addNewTopology(self, state_manager, topologyName):
    """
    Create a Topology for `topologyName`, add it to the local cache, and
    register watches so the cached info follows any change.
    """
    topology = Topology(topologyName, state_manager.name)
    Log.info("Adding new topology: %s, state_manager: %s",
             topologyName, state_manager.name)
    self.topologies.append(topology)

    # Register a watch on topology and change
    # the topologyInfo on any new change.
    topology.register_watch(self.setTopologyInfo)

    def make_watch(log_prefix, setter):
        """Build a watch callback that logs, stores the data via `setter`,
        and notes when the data was empty."""
        def on_change(data):
            Log.info(log_prefix + topologyName)
            setter(data)
            if not data:
                Log.debug("No data to be set")
        return on_change

    on_topology_pplan = make_watch(
        "Watch triggered for topology pplan: ", topology.set_physical_plan)
    on_topology_packing_plan = make_watch(
        "Watch triggered for topology packing plan: ", topology.set_packing_plan)
    on_topology_execution_state = make_watch(
        "Watch triggered for topology execution state: ", topology.set_execution_state)
    on_topology_tmaster = make_watch(
        "Watch triggered for topology tmaster: ", topology.set_tmaster)
    on_topology_scheduler_location = make_watch(
        "Watch triggered for topology scheduler location: ", topology.set_scheduler_location)

    # Set watches on the pplan, packing plan, execution_state, tmaster and
    # scheduler_location.
    state_manager.get_pplan(topologyName, on_topology_pplan)
    state_manager.get_packing_plan(topologyName, on_topology_packing_plan)
    state_manager.get_execution_state(topologyName, on_topology_execution_state)
    state_manager.get_tmaster(topologyName, on_topology_tmaster)
    state_manager.get_scheduler_location(topologyName, on_topology_scheduler_location)
def _handle_register_response(self, response):
    """Called when a register response (RegisterInstanceResponse) arrives.

    Raises RuntimeError when the response status is not OK. When the
    response carries a pplan, it is handled as an assignment message.
    """
    ok_code = common_pb2.StatusCode.Value("OK")
    if response.status.status != ok_code:
        raise RuntimeError("Stream Manager returned a not OK response for register")
    Log.info("We registered ourselves to the Stream Manager")

    if not response.HasField("pplan"):
        Log.debug("Received a register response with no pplan")
        return

    Log.info("Handling assignment message from response")
    self._handle_assignment_message(response.pplan)
def next_tuple(self):
    """Run one execution of the user's spout unless already done, and emit
    the terminal (if needed) once this execution made the spout done."""
    if self.is_done:
        return

    self.max_executions -= 1
    Log.info("max executions: %d" % self.max_executions)
    self.user_spout.next_tuple()

    # still more executions to go
    if not self.is_done:
        return

    self._emit_terminal_if_needed()
    Log.info("This topology is finished.")
def launch_topology_server(cl_args, topology_file, topology_defn_file, topology_name):
    '''
    Launch a topology given topology jar, its definition file and configurations
    by submitting it to the API server.

    :param cl_args: parsed command line arguments
    :param topology_file: path of the topology file to upload
    :param topology_defn_file: path of the topology definition file to upload
    :param topology_name: name of the topology
    :return: SimpleResult carrying the status and the error/success contexts
    '''
    service_apiurl = cl_args['service_url'] + rest.ROUTE_SIGNATURES['submit'][1]
    service_method = rest.ROUTE_SIGNATURES['submit'][0]
    data = dict(
        name=topology_name,
        cluster=cl_args['cluster'],
        role=cl_args['role'],
        environment=cl_args['environ'],
        user=cl_args['submit_user'],
    )

    Log.info(str(cl_args))

    overrides = dict()
    if 'config_property' in cl_args:
        overrides = config.parse_override_config(cl_args['config_property'])
    if overrides:
        data.update(overrides)

    if cl_args['dry_run']:
        data["dry_run"] = True

    err_ctxt = "Failed to launch topology '%s' %s" % (topology_name, launch_mode_msg(cl_args))
    succ_ctxt = "Successfully launched topology '%s' %s" % (topology_name, launch_mode_msg(cl_args))

    # `with` guarantees both upload handles are closed on every exit path
    with open(topology_defn_file, 'rb') as defn_handle, \
            open(topology_file, 'rb') as topology_handle:
        files = dict(
            definition=defn_handle,
            topology=topology_handle,
        )
        try:
            r = service_method(service_apiurl, data=data, files=files)
            # compare status codes by value; `is` on ints only works by
            # accident of CPython's small-integer cache
            ok = r.status_code == requests.codes.ok
            created = r.status_code == requests.codes.created
            s = Status.Ok if created or ok else Status.HeronError
            if s is Status.HeronError:
                Log.error(r.json().get('message', "Unknown error from API server %d" % r.status_code))
            elif ok:
                # this case happens when we request a dry_run
                print(r.json().get("response"))
        except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as err:
            Log.error(err)
            return SimpleResult(Status.HeronError, err_ctxt, succ_ctxt)

    return SimpleResult(s, err_ctxt, succ_ctxt)
def initialize(self, config, context):
    """Load and wrap the user's spout named in the config, read the maximum
    number of executions, reset the completion counter and initialize the
    user's spout.

    Raises RuntimeError when the config names no user spout class.
    """
    spout_classpath = config.get(integ_const.USER_SPOUT_CLASSPATH, None)
    if spout_classpath is None:
        raise RuntimeError("User defined integration test spout was not found")

    pex_path = context.get_topology_pex_path()
    spout_cls = self._load_user_spout(pex_path, spout_classpath)
    self.user_spout = spout_cls(delegate=self)

    self.max_executions = config.get(integ_const.USER_MAX_EXECUTIONS,
                                     integ_const.MAX_EXECUTIONS)
    assert isinstance(self.max_executions, int) and self.max_executions > 0
    Log.info("Max executions: %d" % self.max_executions)

    self.tuples_to_complete = 0
    self.user_spout.initialize(config, context)
def initialize(self, config, context):
    """Load and wrap the user's spout named in the config, read the maximum
    number of executions, reset the completion counter and initialize the
    user's spout.

    Raises RuntimeError when the config names no user spout class.
    """
    spout_classpath = config.get(integ_const.USER_SPOUT_CLASSPATH, None)
    if spout_classpath is None:
        raise RuntimeError("User defined integration test spout was not found")

    pex_path = context.get_topology_pex_path()
    spout_cls = self._load_user_spout(pex_path, spout_classpath)
    self.user_spout = spout_cls(delegate=self)

    self.max_executions = config.get(integ_const.USER_MAX_EXECUTIONS,
                                     integ_const.MAX_EXECUTIONS)
    assert isinstance(self.max_executions, int) and self.max_executions > 0
    Log.info("Max executions: %d", self.max_executions)

    self.tuples_to_complete = 0
    self.user_spout.initialize(config, context)
def write_finished_data(self):
    """Serialize the collected result to JSON and POST it to the result
    server, retrying once when the first response is not 200.

    Raises RuntimeError (chained to the underlying error) when posting
    fails or both attempts return a non-200 code.
    """
    json_result = json.dumps(self.result)
    Log.info("Actual result: %s", json_result)
    Log.info("Posting actual result to %s", self.http_post_url)
    try:
        response_code = self._post_result_to_server(json_result)
        if response_code != 200:
            # try again
            response_code = self._post_result_to_server(json_result)
            if response_code != 200:
                raise RuntimeError(f"Response code: {response_code}")
    except Exception as e:
        # Python 3 exceptions have no `.message`; format the exception
        # itself and keep the original as the cause.
        raise RuntimeError(f"Posting result to server failed with: {e}") from e
def update_config_files(cl_args):
    """Re-template every config file from the current role assignments."""
    Log.info("Updating config files...")

    role_map = read_and_parse_roles(cl_args)
    Log.debug("roles: %s" % role_map)

    master_hosts = list(role_map[Role.MASTERS])
    zookeeper_hosts = list(role_map[Role.ZOOKEEPERS])

    # templates that only depend on the masters
    template_slave_hcl(cl_args, master_hosts)
    template_scheduler_yaml(cl_args, master_hosts)
    template_uploader_yaml(cl_args, master_hosts)
    # templates that need the zookeepers as well
    template_apiserver_hcl(cl_args, master_hosts, zookeeper_hosts)
    template_statemgr_yaml(cl_args, zookeeper_hosts)
    template_heron_tools_hcl(cl_args, master_hosts, zookeeper_hosts)
def write_finished_data(self):
    """Serialize the collected result to JSON and POST it to the result
    server, retrying once when the first response is not 200.

    Raises RuntimeError when posting fails or both attempts return a
    non-200 code.
    """
    json_result = json.dumps(self.result)
    Log.info("Actual result: %s" % json_result)
    Log.info("Posting actual result to %s" % self.http_post_url)
    try:
        response_code = self._post_result_to_server(json_result)
        if response_code != 200:
            # try again
            response_code = self._post_result_to_server(json_result)
            if response_code != 200:
                raise RuntimeError("Response code: %d" % response_code)
    except Exception as e:
        # `e.message` is deprecated and absent on Python 3; str(e) works on
        # both Python 2 and Python 3.
        raise RuntimeError("Posting result to server failed with: %s" % str(e))
def on_incoming_message(self, message):
    """Record the packet size and dispatch the message by type.

    Raises RuntimeError for any message that is neither an assignment
    message nor a tuple message.
    """
    packet_size = message.ByteSize()
    self.gateway_metrics.received_packet(packet_size)

    if isinstance(message, stmgr_pb2.NewInstanceAssignmentMessage):
        Log.info("Handling assignment message from direct NewInstanceAssignmentMessage")
        self._handle_assignment_message(message.pplan)
        return

    if isinstance(message, stmgr_pb2.TupleMessage):
        self._handle_new_tuples(message)
        return

    raise RuntimeError("Unknown kind of message received from Stream Manager")
def _handle_register_response(self, response):
    """Called when a register response (RegisterInstanceResponse) arrives.

    Raises RuntimeError when the response status is not OK. Otherwise marks
    this client as registered and, when the response carries a pplan,
    handles it as an assignment message.
    """
    ok_code = common_pb2.StatusCode.Value("OK")
    if response.status.status != ok_code:
        raise RuntimeError("Stream Manager returned a not OK response for register")

    Log.info("We registered ourselves to the Stream Manager")
    self.is_registered = True

    if not response.HasField("pplan"):
        Log.debug("Received a register response with no pplan")
        return

    Log.info("Handling assignment message from response")
    self._handle_assignment_message(response.pplan)
def main():
    """Build the `heron` argument parser, parse the known command line
    arguments and log the parse results."""
    parser_options = dict(
        prog='heron',
        epilog=help_epilog,
        formatter_class=config.SubcommandHelpFormatter,
        fromfile_prefix_chars='@',
        add_help=False,
        rcfile="./.heronrc",
    )
    arg_parser = HeronArgumentParser(**parser_options)
    arg_parser.add_subparsers(title="Available commands", metavar='<command> <options>')

    known_args, extra_args = arg_parser.parse_known_args()
    Log.info("parse results args: %s unknown: %s ", known_args, extra_args)
def parse_cluster_role_env(cluster_role_env, config_path):
    """Parse cluster/[role]/[environ], supply default, if not provided, not required

    :param cluster_role_env: string of the form cluster[/role[/environ]]
    :param config_path: config directory; must exist
    :return: tuple (cluster, role, environ)
    :raises Exception: when `config_path` is not a directory, or when the
        client config marks role/environ as required but it was not given

    Exits with status 1 when any of the three resulting parts is empty.
    """
    parts = cluster_role_env.split('/')[:3]
    Log.info("Using config file under %s" % config_path)
    if not os.path.isdir(config_path):
        Log.error("Config path cluster directory does not exist: %s" % config_path)
        raise Exception("Invalid config path")

    # if cluster/role/env is not completely provided, check further
    if len(parts) < 3:
        cli_conf_file = os.path.join(config_path, CLIENT_YAML)

        # if client conf doesn't exist, use default value
        if not os.path.isfile(cli_conf_file):
            if len(parts) == 1:
                parts.append(getpass.getuser())
            if len(parts) == 2:
                parts.append(ENVIRON)
        else:
            with open(cli_conf_file, 'r') as conf_file:
                # NOTE(review): yaml.load without an explicit Loader can
                # construct arbitrary objects; consider yaml.safe_load if
                # this file can ever come from an untrusted source.
                tmp_confs = yaml.load(conf_file)

            # the return value of yaml.load can be None if conf_file is an empty file
            cli_confs = {}
            if tmp_confs is not None:
                cli_confs = tmp_confs
            else:
                # parenthesized form prints the same on Python 2 and 3
                print("Failed to read: %s due to it is empty" % (CLIENT_YAML))

            # if role is required but not provided, raise exception
            if len(parts) == 1:
                if cli_confs.get(IS_ROLE_REQUIRED) is True:
                    raise Exception("role required but not provided (cluster/role/env = %s). See %s in %s"
                                    % (cluster_role_env, IS_ROLE_REQUIRED, CLIENT_YAML))
                parts.append(getpass.getuser())

            # if environ is required but not provided, raise exception
            if len(parts) == 2:
                if cli_confs.get(IS_ENV_REQUIRED) is True:
                    raise Exception("environ required but not provided (cluster/role/env = %s). See %s in %s"
                                    % (cluster_role_env, IS_ENV_REQUIRED, CLIENT_YAML))
                parts.append(ENVIRON)

    # if cluster or role or environ is empty, print
    if len(parts[0]) == 0 or len(parts[1]) == 0 or len(parts[2]) == 0:
        print("Failed to parse")
        sys.exit(1)

    return (parts[0], parts[1], parts[2])
def addNewTopology(self, state_manager, topologyName):
    """
    Create a Topology for `topologyName`, add it to the local cache, and
    register watches so the cached info follows any change.
    """
    topology = Topology(topologyName, state_manager.name)
    Log.info("Adding new topology: %s, state_manager: %s",
             topologyName, state_manager.name)
    self.topologies.append(topology)

    # Register a watch on topology and change
    # the topologyInfo on any new change.
    topology.register_watch(self.setTopologyInfo)

    def make_watch(log_prefix, setter):
        """Build a watch callback that logs, stores the data via `setter`,
        and notes when the data was empty."""
        def on_change(data):
            Log.info(log_prefix + topologyName)
            setter(data)
            if not data:
                Log.debug("No data to be set")
        return on_change

    on_topology_pplan = make_watch(
        "Watch triggered for topology pplan: ", topology.set_physical_plan)
    on_topology_execution_state = make_watch(
        "Watch triggered for topology execution state: ", topology.set_execution_state)
    on_topology_tmaster = make_watch(
        "Watch triggered for topology tmaster: ", topology.set_tmaster)
    on_topology_scheduler_location = make_watch(
        "Watch triggered for topology scheduler location: ", topology.set_scheduler_location)

    # Set watches on the pplan, execution_state, tmaster and scheduler_location.
    state_manager.get_pplan(topologyName, on_topology_pplan)
    state_manager.get_execution_state(topologyName, on_topology_execution_state)
    state_manager.get_tmaster(topologyName, on_topology_tmaster)
    state_manager.get_scheduler_location(topologyName, on_topology_scheduler_location)
def on_topologies_watch(state_manager: StateManager, topologies: List[str]) -> None:
    """Reconcile the cached topologies of `state_manager` with the watched
    list: drop the ones that disappeared, add the ones that are new."""
    topologies = set(topologies)
    Log.info("State watch triggered for topologies.")
    Log.debug("Topologies: %s", topologies)

    cached_names = {
        cached.name
        for cached in self.get_stmgr_topologies(state_manager.name)
    }
    Log.debug("Existing topologies: %s", cached_names)

    # cached but no longer reported
    for name in cached_names.difference(topologies):
        Log.info("Removing topology: %s in rootpath: %s",
                 name, state_manager.rootpath)
        self.remove_topology(name, state_manager.name)

    # reported but not cached yet
    for name in topologies.difference(cached_names):
        self.add_new_topology(state_manager, name)
def distribute_package(roles, cl_args):
    '''
    distribute Heron packages to all nodes

    Packs cl_args["heron_dir"] into a temporary tar file and copies it to
    every master and slave node.

    :param roles: mapping of Role -> set of hosts
    :param cl_args: parsed command line arguments
    '''
    import os  # local import: only needed to close the mkstemp descriptor

    Log.info("Distributing heron package to nodes (this might take a while)...")
    masters = roles[Role.MASTERS]
    slaves = roles[Role.SLAVES]

    # mkstemp creates the file atomically and keeps it on disk, whereas
    # NamedTemporaryFile(...).name returns the path of a file that is deleted
    # as soon as the unreferenced object is collected, leaving a racy name.
    handle, tar_file = tempfile.mkstemp(suffix=".tmp")
    os.close(handle)
    Log.debug("TAR file %s to %s" % (cl_args["heron_dir"], tar_file))
    make_tarfile(tar_file, cl_args["heron_dir"])

    dist_nodes = masters.union(slaves)
    scp_package(tar_file, dist_nodes, cl_args)
def launch_topologies(cl_args, topology_file, tmp_dir):
    '''
    Launch topologies

    Every `*.defn` file found in `tmp_dir` is parsed as a topology
    definition and launched, either through the API server or directly,
    depending on cl_args['deploy_mode'].

    :param cl_args: parsed command line arguments
    :param topology_file: topology file handed to the launcher
    :param tmp_dir: directory where the submitter wrote the .defn files
    :return: list of launch results; NOTE a single SimpleResult (not a list)
             is returned when no definition is found or one cannot be loaded
    '''
    # the submitter would have written the .defn file to the tmp_dir
    defn_files = glob.glob(tmp_dir + '/*.defn')

    if not defn_files:
        return SimpleResult(Status.HeronError, "No topologies found under %s" % tmp_dir)

    results = []
    for defn_file in defn_files:
        # load the topology definition from the file
        topology_defn = topology_pb2.Topology()
        try:
            # `with` closes the handle even when parsing fails
            with open(defn_file, "rb") as handle:
                topology_defn.ParseFromString(handle.read())
        except Exception as e:
            err_context = "Cannot load topology definition '%s': %s" % (defn_file, e)
            return SimpleResult(Status.HeronError, err_context)

        # log topology and components configurations
        Log.debug("Topology config: %s", topology_defn.topology_config)
        Log.debug("Component config:")
        for spout in topology_defn.spouts:
            Log.debug("%s => %s", spout.comp.name, spout.comp.config)
        for bolt in topology_defn.bolts:
            Log.debug("%s => %s", bolt.comp.name, bolt.comp.config)

        # launch the topology
        Log.info("Launching topology: '%s'%s", topology_defn.name, launch_mode_msg(cl_args))

        # check if we have to do server or direct based deployment
        if cl_args['deploy_mode'] == config.SERVER_MODE:
            res = launch_topology_server(
                cl_args, topology_file, defn_file, topology_defn.name)
        else:
            res = launch_a_topology(
                cl_args, tmp_dir, topology_file, defn_file, topology_defn.name)
        results.append(res)

    return results
def main():
    """Build the `heron` argument parser, parse the known command line
    arguments and log the parse results."""
    cli_parser = HeronArgumentParser(prog='heron',
                                     epilog=help_epilog,
                                     formatter_class=config.SubcommandHelpFormatter,
                                     fromfile_prefix_chars='@',
                                     add_help=False,
                                     rcfile="./.heronrc")
    cli_parser.add_subparsers(title="Available commands",
                              metavar='<command> <options>')

    parsed = cli_parser.parse_known_args()
    recognized, leftover = parsed
    Log.info("parse results args: %s unknown: %s ", recognized, leftover)
def _handle_assignment_message(self, pplan):
    """Called when new NewInstanceAssignmentMessage arrives.

    The first physical plan is pushed to the Heron instance as an
    assignment; a later plan with the same component name and task id is
    pushed as a state change. Raises RuntimeError when a later plan changes
    the component name or task id.
    """
    Log.debug("In handle_assignment_message() of STStmgrClient, Physical Plan: \n%s", str(pplan))
    new_helper = PhysicalPlanHelper(pplan,
                                    self.instance.instance_id,
                                    self.heron_instance_cls.topo_pex_file_abs_path)

    current = self._pplan_helper
    if current is not None:
        same_component = current.my_component_name == new_helper.my_component_name
        if not same_component or current.my_task_id != new_helper.my_task_id:
            raise RuntimeError("Our Assignment has changed. We will die to pick it.")

    if current is not None:
        Log.info("Received a new Physical Plan with the same assignment -- State Change")
        Log.info("Old state: %s, new state: %s.",
                 current.get_topology_state(),
                 new_helper.get_topology_state())
        self.heron_instance_cls.handle_state_change_msg(new_helper)
    else:
        Log.info("Received a new Physical Plan")
        Log.info("Push the new pplan_helper to Heron Instance")
        self.heron_instance_cls.handle_assignment_msg(new_helper)

    self._pplan_helper = new_helper
def _handle_assignment_message(self, pplan):
    """Called when new NewInstanceAssignmentMessage arrives.

    The first physical plan is pushed to the Heron instance as an
    assignment; a later plan with the same component name and task id is
    pushed as a state change. Raises RuntimeError when a later plan changes
    the component name or task id.
    """
    Log.debug("In handle_assignment_message() of STStmgrClient, Physical Plan: \n%s", str(pplan))
    instance_id = self.instance.instance_id
    pex_path = self.heron_instance_cls.topo_pex_file_abs_path
    new_helper = PhysicalPlanHelper(pplan, instance_id, pex_path)

    previous = self._pplan_helper
    if previous is not None:
        same_component = previous.my_component_name == new_helper.my_component_name
        if not same_component or previous.my_task_id != new_helper.my_task_id:
            raise RuntimeError("Our Assignment has changed. We will die to pick it.")

    if previous is not None:
        Log.info("Received a new Physical Plan with the same assignment -- State Change")
        Log.info("Old state: %s, new state: %s.",
                 previous.get_topology_state(),
                 new_helper.get_topology_state())
        self.heron_instance_cls.handle_state_change_msg(new_helper)
    else:
        Log.info("Received a new Physical Plan")
        Log.info("Push the new pplan_helper to Heron Instance")
        self.heron_instance_cls.handle_assignment_msg(new_helper)

    self._pplan_helper = new_helper
def handle_initiate_stateful_checkpoint(self, ckptmsg, component):
    """Handle an initiate-checkpoint message.

    Raises RuntimeError when this instance is not stateful. Stateful
    components get `pre_save` called with the checkpoint id; in every case
    the current state is then admitted for that checkpoint id.
    """
    ckpt_id = ckptmsg.checkpoint_id
    Log.info("Received initiate state checkpoint message for %s" % ckpt_id)

    if not self.is_stateful:
        raise RuntimeError("Received state checkpoint message but we are not stateful topology")

    if not isinstance(component, StatefulComponent):
        Log.info("Trying to checkponit a non stateful component. Send empty state")
    else:
        component.pre_save(ckpt_id)

    self.admit_ckpt_state(ckpt_id, self._stateful_state)
def emit(self, tup, tup_id=None, stream=Stream.DEFAULT_STREAM_ID,
         direct_task=None, need_task_ids=None):
    """Emits from this integration test spout

    Overriden method which will be called when user's spout calls emit().
    Counts the tuple as one to complete and substitutes the mock message id
    when the caller supplied no tuple id.
    """
    # if is_control True -> control stream should not count
    self.tuples_to_complete += 1

    if tup_id is not None:
        _tup_id = tup_id
    else:
        Log.info("Add tup_id for tuple: %s" % str(tup))
        _tup_id = integ_const.INTEGRATION_TEST_MOCK_MESSAGE_ID

    super(IntegrationTestSpout, self).emit(tup, _tup_id, stream, direct_task, need_task_ids)
def on_topologies_watch(state_manager, topologies) -> None:
    """Reconcile the cached topologies of `state_manager` with the watched
    list: drop the ones that disappeared, add the ones that are new."""
    Log.info("State watch triggered for topologies.")
    Log.debug("Topologies: " + str(topologies))

    cached_names = [
        cached.name
        for cached in self.get_stmgr_topologies(state_manager.name)
    ]
    Log.debug(f"Existing topologies: {cached_names}")

    # cached but no longer reported
    for name in cached_names:
        if name in topologies:
            continue
        Log.info("Removing topology: %s in rootpath: %s",
                 name, state_manager.rootpath)
        self.remove_topology(name, state_manager.name)

    # reported but not cached yet
    for name in topologies:
        if name in cached_names:
            continue
        self.add_new_topology(state_manager, name)
def stop_cluster(cl_args):
    '''
    teardown the cluster

    Best-effort: deletes every job known to the first master, then on each
    master/slave node kills the heron-nomad processes and removes
    /tmp/slave and /tmp/master. Command failures are only logged.

    :param cl_args: parsed command line arguments, passed through to helpers
    '''
    Log.info("Terminating cluster...")

    roles = read_and_parse_roles(cl_args)
    masters = roles[Role.MASTERS]
    slaves = roles[Role.SLAVES]
    dist_nodes = masters.union(slaves)

    # stop all jobs
    if masters:
        try:
            single_master = list(masters)[0]
            jobs = get_jobs(cl_args, single_master)
            for job in jobs:
                job_id = job["ID"]
                Log.info("Terminating job %s" % job_id)
                delete_job(cl_args, job_id, single_master)
        except (Exception, SystemExit):
            # Teardown stays best-effort. SystemExit is still swallowed
            # because the helpers may exit on failure (other functions in
            # this file do) -- TODO confirm. KeyboardInterrupt now propagates.
            Log.debug("Error stopping jobs")
            Log.debug(sys.exc_info()[0])

    for node in dist_nodes:
        Log.info("Terminating processes on %s" % node)
        if not is_self(node):
            # "\\$" keeps the literal backslash (so the remote shell sees
            # \$2) without relying on an invalid "\$" escape sequence
            cmd = "ps aux | grep heron-nomad | awk '{print \\$2}' " \
                  "| xargs kill"
            cmd = ssh_remote_execute(cmd, node, cl_args)
        else:
            cmd = "ps aux | grep heron-nomad | awk '{print $2}' " \
                  "| xargs kill"
        Log.debug(cmd)
        proc = subprocess.Popen(cmd,
                                shell=True,
                                universal_newlines=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        # communicate() drains both pipes AND waits for the process; calling
        # wait() first can deadlock once a pipe buffer fills up.
        output = proc.communicate()
        return_code = proc.returncode
        Log.debug("return code: %s output: %s" % (return_code, output))

        Log.info("Cleaning up directories on %s" % node)
        cmd = "rm -rf /tmp/slave ; rm -rf /tmp/master"
        if not is_self(node):
            cmd = ssh_remote_execute(cmd, node, cl_args)
        Log.debug(cmd)
        proc = subprocess.Popen(cmd,
                                shell=True,
                                universal_newlines=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        output = proc.communicate()
        return_code = proc.returncode
        Log.debug("return code: %s output: %s" % (return_code, output))
def emit(self, tup, tup_id=None, stream=Stream.DEFAULT_STREAM_ID,
         direct_task=None, need_task_ids=None):
    """Emit a tuple from the integration test spout.

    Overrides the parent ``emit()`` that the user's spout calls. Every call
    increments ``self.tuples_to_complete``, and a tuple emitted without an id
    is given the integration-test mock message id before being forwarded to
    the parent implementation.

    NOTE(review): the original comment mentions skipping the count for a
    control stream ("is_control"), but no such check exists here -- confirm
    whether that is intended.
    """
    # if is_control True -> control stream should not count
    self.tuples_to_complete += 1

    if tup_id is not None:
        effective_id = tup_id
    else:
        Log.info("Add tup_id for tuple: %s", str(tup))
        effective_id = integ_const.INTEGRATION_TEST_MOCK_MESSAGE_ID

    super().emit(tup, effective_id, stream, direct_task, need_task_ids)
def on_topologies_watch(state_manager, topologies):
    """Reconcile the cached topologies with the names reported by a watch.

    Cached topologies whose name is missing from ``topologies`` are removed,
    and reported names that are not cached yet are added.

    Note: ``self`` is not a parameter; it is resolved from the surrounding
    scope (presumably this is a closure defined inside a method).

    :param state_manager: state manager the watch fired for; its ``name`` and
        ``rootpath`` attributes are read here
    :param topologies: collection of topology names currently reported
    """
    Log.info("State watch triggered for topologies.")
    Log.debug("Topologies: " + str(topologies))
    existing_topologies = self.getTopologiesForStateLocation(state_manager.name)
    # Build a real list. On Python 3 map() returns a one-shot iterator: the
    # removal loop would exhaust it, every membership test in the add loop
    # would then be False (re-adding all topologies), and the debug line
    # would print a map object instead of the names.
    existing_names = [topology.name for topology in existing_topologies]
    Log.debug("Existing topologies: " + str(existing_names))

    for name in existing_names:
        if name not in topologies:
            Log.info("Removing topology: %s in rootpath: %s",
                     name, state_manager.rootpath)
            self.removeTopology(name, state_manager.name)

    for name in topologies:
        if name not in existing_names:
            self.addNewTopology(state_manager, name)
def run(command, parser, cl_args, unknown_args):
    """Run the update command for a topology.

    Builds the argument list for the runtime manager and invokes it through
    ``execute.heron_class``.

    :param command: command name forwarded as ``--command``
    :param parser: unused here
    :param cl_args: parsed command line arguments (dict-like); must contain
        'topology-name', 'cluster', 'role', 'environ', 'config_path',
        'override_config_file' and 'component_parallelism'
    :param unknown_args: unused here
    :return: True on success, False if anything raised while building the
        arguments or running the runtime manager
    """
    topology_name = cl_args['topology-name']
    try:
        new_args = [
            "--cluster", cl_args['cluster'],
            "--role", cl_args['role'],
            "--environment", cl_args['environ'],
            "--heron_home", config.get_heron_dir(),
            "--config_path", cl_args['config_path'],
            "--override_config_file", cl_args['override_config_file'],
            "--release_file", config.get_heron_release_file(),
            "--topology_name", topology_name,
            "--command", command,
            "--component_parallelism", ','.join(cl_args['component_parallelism']),
        ]

        if Log.getEffectiveLevel() == logging.DEBUG:
            new_args.append("--verbose")

        lib_jars = config.get_heron_libs(jars.scheduler_jars() +
                                         jars.statemgr_jars() +
                                         jars.packing_jars())

        # invoke the runtime manager to update the topology
        execute.heron_class('com.twitter.heron.scheduler.RuntimeManagerMain',
                            lib_jars,
                            extra_jars=[],
                            args=new_args)

    except Exception:
        # format_exc() takes no exception argument; its first parameter is
        # `limit`, so passing the exception object raised a TypeError from
        # inside this handler on Python 3 and hid the real failure.
        Log.error('Failed to update topology \'%s\': %s',
                  topology_name, traceback.format_exc())
        return False

    Log.info('Successfully updated topology \'%s\'' % topology_name)
    return True
def _gather_one_metric(self, name, message):
    """Read-and-reset one metric and add its value(s) to ``message``.

    * ``None`` value: nothing is added.
    * dict value: one datum per entry, named ``"<name>/<key>"``; entries with
      a ``None`` key or value are skipped and logged.
    * anything else: a single datum named ``name``.

    :param name: key into ``self.metrics_map``
    :param message: message object handed on to ``_add_data_to_message``
    """
    metric_value = self.metrics_map[name].get_value_and_reset()
    Log.debug("In gather_one_metric with name: %s, and value: %s",
              name, str(metric_value))

    if metric_value is None:
        return

    if not isinstance(metric_value, dict):
        self._add_data_to_message(message, name, metric_value)
        return

    for key, value in list(metric_value.items()):
        if key is not None and value is not None:
            # Add each entry exactly once. The previous code issued this call
            # twice with the same key (built once with "+" and once with
            # "%"), so every multi-value datum was reported in duplicate.
            self._add_data_to_message(message, "%s/%s" % (name, str(key)), value)
        else:
            Log.info("When gathering metric: %s, <%s:%s> is not a valid key-value to output "
                     "as metric. Skipping...", name, str(key), str(value))
def _handle_packet(self, packet):
    """Decode a packet and route it as a response or an incoming message.

    Per the original author's note this is only called once
    ``packet.is_complete`` is True.

    * REQID registered in ``self.context_map``: the packet is the response to
      an earlier request. It is parsed into the stored response message and
      passed to ``on_response``; a parse failure or an uninitialized message
      closes the connection and calls ``on_error``.
    * REQID is zero: the packet is a plain message. It is built from the
      registered message map and passed to ``on_incoming_message``.
    * otherwise: the REQID is unknown (possibly a response that already timed
      out) and is only logged.

    :raises RuntimeError: when a plain message cannot be built or handled
    """
    typename, reqid, serialized_msg = HeronProtocol.decode_packet(packet)
    # `in` instead of dict.has_key(), which does not exist on Python 3.
    if reqid in self.context_map:
        # this incoming packet has the response of a request
        context = self.context_map.pop(reqid)
        response_msg = self.response_message_map.pop(reqid)

        try:
            response_msg.ParseFromString(serialized_msg)
        except Exception as e:
            # str(e) instead of e.message, which Python 3 exceptions lack.
            Log.error("Invalid Packet Error: %s" % str(e))
            self._handle_close()
            self.on_error()
            return

        if response_msg.IsInitialized():
            self.on_response(StatusCode.OK, context, response_msg)
        else:
            Log.error("Response not initialized")
            self._handle_close()
            self.on_error()
    elif reqid.is_zero():
        # this is a Message -- no need to send back response
        try:
            if typename not in self.registered_message_map:
                raise ValueError("%s is not registered in message map" % typename)
            msg_builder = self.registered_message_map[typename]
            message = msg_builder()
            message.ParseFromString(serialized_msg)
            if message.IsInitialized():
                self.on_incoming_message(message)
            else:
                raise RuntimeError("Message not initialized")
        except Exception as e:
            Log.error("Error when handling message packet: %s" % str(e))
            Log.error(traceback.format_exc())
            raise RuntimeError("Problem reading message")
    else:
        # might be a timeout response
        Log.info("In handle_packet(): Received message whose REQID is not registered: %s"
                 % str(reqid))
def on_incoming_message(self, message):
    """Record the received packet size, then dispatch on the message type.

    The type checks run in a fixed order and the first match wins. An
    assignment message is unwrapped to its ``pplan`` before being handled;
    every other supported message is handed over as-is.

    :raises RuntimeError: if the message is none of the supported types
    """
    self.gateway_metrics.update_received_packet(message.ByteSize())

    if isinstance(message, stmgr_pb2.NewInstanceAssignmentMessage):
        Log.info("Handling assignment message from direct NewInstanceAssignmentMessage")
        self._handle_assignment_message(message.pplan)
        return

    if isinstance(message, tuple_pb2.HeronTupleSet2):
        self._handle_new_tuples_2(message)
        return

    if isinstance(message, ckptmgr_pb2.StartInstanceStatefulProcessing):
        self._handle_start_stateful_processing(message)
        return

    if isinstance(message, ckptmgr_pb2.RestoreInstanceStateRequest):
        self._handle_restore_instance_state(message)
        return

    if isinstance(message, ckptmgr_pb2.InitiateStatefulCheckpoint):
        self._handle_initiate_stateful_checkpoint(message)
        return

    raise RuntimeError("Unknown kind of message received from Stream Manager")
def add_new_topology(self, state_manager, topology_name: str) -> None:
    """Create a Topology, cache it, and hook it up to the state manager.

    The new topology is appended to ``self.topologies`` and its setter
    methods are then passed to the state manager getters as callbacks (the
    original comments describe these as watches on the pplan, packing plan,
    execution state, tmanager and scheduler location).

    NOTE(review): an earlier comment here spoke of populating the cache
    before making the topology addressable, yet the append happens before
    the callbacks are registered -- confirm the intended order.
    """
    tracked = Topology(topology_name, state_manager.name, self.config)
    Log.info("Adding new topology: %s, state_manager: %s",
             topology_name, state_manager.name)
    self.topologies.append(tracked)

    # (state manager getter, topology setter) pairs, registered in order.
    watch_pairs = (
        (state_manager.get_pplan, tracked.set_physical_plan),
        (state_manager.get_packing_plan, tracked.set_packing_plan),
        (state_manager.get_execution_state, tracked.set_execution_state),
        (state_manager.get_tmanager, tracked.set_tmanager),
        (state_manager.get_scheduler_location, tracked.set_scheduler_location),
    )
    for fetch, on_change in watch_pairs:
        fetch(topology_name, on_change)
def wait_for_master_to_start(single_master):
    '''
    Wait for a nomad master to start.

    Polls http://<single_master>:4646/v1/status/leader once per second until
    it answers with HTTP 200. Exits the process with status -1 once the
    attempt counter has passed 10 without success.

    :param single_master: host name or address of the master to poll
    '''
    i = 0
    while True:
        try:
            r = requests.get("http://%s:4646/v1/status/leader" % single_master)
            if r.status_code == 200:
                break
        except Exception:
            # Not reachable yet: keep polling. A bare "except:" here also
            # swallowed KeyboardInterrupt, so Ctrl-C could not abort the wait.
            Log.debug(sys.exc_info()[0])
        Log.info("Waiting for cluster to come up... %s" % i)
        time.sleep(1)
        if i > 10:
            Log.error("Failed to start Nomad Cluster!")
            sys.exit(-1)
        i = i + 1
def run(command, parser, cl_args, unknown_args, action):
    '''
    Helper function to take an action on a topology.

    Builds the argument list for the runtime manager and invokes it through
    execute.heron_class.

    :param command: command name forwarded as --command
    :param parser: unused here
    :param cl_args: parsed command line arguments (dict-like); must contain
        'topology-name', 'cluster', 'role', 'environ', 'config_path' and
        'override_config_file'
    :param unknown_args: unused here
    :param action: description of action taken, used in log messages
    :return: True on success, False if building the arguments or running the
        runtime manager raised
    '''
    # Looked up before the try block so the error handler below can always
    # name the topology. Previously a failure before this assignment made the
    # handler itself raise UnboundLocalError.
    topology_name = cl_args['topology-name']
    try:
        new_args = [
            "--cluster", cl_args['cluster'],
            "--role", cl_args['role'],
            "--environment", cl_args['environ'],
            "--heron_home", config.get_heron_dir(),
            "--config_path", cl_args['config_path'],
            "--override_config_file", cl_args['override_config_file'],
            "--release_file", config.get_heron_release_file(),
            "--topology_name", topology_name,
            "--command", command,
        ]

        if Log.getEffectiveLevel() == logging.DEBUG:
            new_args.append("--verbose")

        lib_jars = config.get_heron_libs(jars.scheduler_jars() + jars.statemgr_jars())

        # invoke the runtime manager to run the requested command
        execute.heron_class(
            'com.twitter.heron.scheduler.RuntimeManagerMain',
            lib_jars,
            extra_jars=[],
            args=new_args
        )

    except Exception as ex:
        Log.error('Failed to %s \'%s\'' % (action, topology_name))
        # Keep the cause available for troubleshooting instead of dropping it.
        Log.debug('Cause: %s', ex)
        return False

    Log.info('Successfully executed %s \'%s\'' % (action, topology_name))
    return True
def wait_for_job_to_start(single_master, job):
    '''
    Wait for a Nomad job to start.

    Polls http://<single_master>:4646/v1/job/<job> once per second until it
    answers with HTTP 200 and a JSON "Status" of "running". Exits the process
    with status -1 once the attempt counter has passed 20 without success.

    :param single_master: host name or address of the master to query
    :param job: id of the job to wait for
    '''
    i = 0
    while True:
        try:
            r = requests.get("http://%s:4646/v1/job/%s" % (single_master, job))
            if r.status_code == 200 and r.json()["Status"] == "running":
                break
            # Not running yet: fall through to the retry logic. This replaces
            # raising a RuntimeError purely to reach the except clause.
            Log.debug("Job %s is not running yet (HTTP %s)" % (job, r.status_code))
        except Exception:
            # Request or JSON decoding failed: keep polling. A bare "except:"
            # here also swallowed KeyboardInterrupt, blocking Ctrl-C.
            Log.debug(sys.exc_info()[0])
        Log.info("Waiting for %s to come up... %s" % (job, i))
        time.sleep(1)
        if i > 20:
            Log.error("Failed to start Nomad Cluster!")
            sys.exit(-1)
        i = i + 1
def launch_topologies(cl_args, topology_file, tmp_dir):
    '''
    Launch every topology whose definition file is found in tmp_dir.

    :param cl_args: parsed command line arguments, forwarded to
        launch_a_topology
    :param topology_file: topology file, forwarded to launch_a_topology
    :param tmp_dir: directory that is searched for "*.defn" files
    :return: None
    :raises Exception: if no definition file exists, if a definition file
        cannot be read or parsed, or (re-raised) if a launch fails
    '''
    # the submitter would have written the .defn file to the tmp_dir
    defn_files = glob.glob(tmp_dir + '/*.defn')

    if not defn_files:
        raise Exception("No topologies found")

    for defn_file in defn_files:
        # load the topology definition from the file
        topology_defn = topology_pb2.Topology()
        try:
            # "with" closes the handle even when parsing raises; the previous
            # explicit close() was skipped on failure.
            with open(defn_file, "rb") as handle:
                topology_defn.ParseFromString(handle.read())
        except Exception as ex:
            # Chain the cause so the underlying I/O or parse error is kept.
            raise Exception(
                "Could not open and parse topology defn file %s" % defn_file) from ex

        # launch the topology
        try:
            Log.info("Launching topology '%s'" % topology_defn.name)
            launch_a_topology(cl_args, tmp_dir, topology_file, defn_file)
            Log.info("Topology '%s' launched successfully" % topology_defn.name)
        except Exception as ex:
            Log.exception("Failed to launch topology '%s' because %s"
                          % (topology_defn.name, str(ex)))
            raise
def add_new_topology(self, state_manager, topology_name: str) -> None:
    """Create a Topology, cache it, and hook it up to the state manager.

    The new topology is appended to ``self.topologies``, then
    ``self.set_topology_info`` is registered on it through
    ``register_watch``. Finally its setter methods are passed to the state
    manager getters as callbacks (the original comments describe these as
    watches on the pplan, packing plan, execution state, tmanager and
    scheduler location).
    """
    tracked = Topology(topology_name, state_manager.name)
    Log.info("Adding new topology: %s, state_manager: %s",
             topology_name, state_manager.name)
    self.topologies.append(tracked)

    # Refresh the topology_info whenever the topology reports a change.
    tracked.register_watch(self.set_topology_info)

    # (state manager getter, topology setter) pairs, registered in order.
    watch_pairs = (
        (state_manager.get_pplan, tracked.set_physical_plan),
        (state_manager.get_packing_plan, tracked.set_packing_plan),
        (state_manager.get_execution_state, tracked.set_execution_state),
        (state_manager.get_tmanager, tracked.set_tmanager),
        (state_manager.get_scheduler_location, tracked.set_scheduler_location),
    )
    for fetch, on_change in watch_pairs:
        fetch(topology_name, on_change)
def on_incoming_message(self, message):
    """Record the received packet size, then dispatch on the message type.

    The type checks run in a fixed order and the first match wins. An
    assignment message is unwrapped to its ``pplan`` before being handled;
    every other supported message is handed over as-is. Any exception raised
    while selecting or running the handler (including the RuntimeError for an
    unsupported type) is logged with its traceback and ends the process with
    exit status 1.
    """
    self.gateway_metrics.update_received_packet(message.ByteSize())
    try:
        # Pick the handler and its argument first, then make a single call.
        if isinstance(message, stmgr_pb2.NewInstanceAssignmentMessage):
            Log.info("Handling assignment message from direct NewInstanceAssignmentMessage")
            handler, payload = self._handle_assignment_message, message.pplan
        elif isinstance(message, tuple_pb2.HeronTupleSet2):
            handler, payload = self._handle_new_tuples_2, message
        elif isinstance(message, ckptmgr_pb2.StartInstanceStatefulProcessing):
            handler, payload = self._handle_start_stateful_processing, message
        elif isinstance(message, ckptmgr_pb2.RestoreInstanceStateRequest):
            handler, payload = self._handle_restore_instance_state, message
        elif isinstance(message, ckptmgr_pb2.InitiateStatefulCheckpoint):
            handler, payload = self._handle_initiate_stateful_checkpoint, message
        else:
            raise RuntimeError("Unknown kind of message received from Stream Manager")
        handler(payload)
    except Exception as e:
        failure_text = "Error happened while handling a message from stmgr: " + str(e)
        Log.error(failure_text)
        Log.error(traceback.format_exc())
        sys.exit(1)
def scp_package(package_file, destinations, cl_args):
    '''
    scp the package to every remote destination and extract it there.

    For each destination that is not the local machine, one shell command is
    started that copies package_file to /tmp/heron.tar.gz on that host and
    then, over ssh, recreates ~/.heron and untars the package into it. All
    commands are started first and collected afterwards. If any of them
    fails, the errors are logged and the process exits with status -1.

    :param package_file: path of the local package to copy
    :param destinations: iterable of destination hosts
    :param cl_args: parsed command line arguments, forwarded to the scp/ssh
        command builders
    '''
    pids = []
    for dest in destinations:
        if is_self(dest):
            continue
        Log.info("Server: %s" % dest)
        file_path = "/tmp/heron.tar.gz"
        dest_file_path = "%s:%s" % (dest, file_path)

        remote_cmd = "rm -rf ~/.heron && mkdir ~/.heron " \
                     "&& tar -xzvf %s -C ~/.heron --strip-components 1" % (file_path)
        cmd = '%s && %s' \
              % (scp_cmd(package_file, dest_file_path, cl_args),
                 ssh_remote_execute(remote_cmd, dest, cl_args))
        Log.debug(cmd)
        pid = subprocess.Popen(cmd,
                               shell=True,
                               universal_newlines=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        pids.append({"pid": pid, "dest": dest})

    errors = []
    for entry in pids:
        pid = entry["pid"]
        # communicate() drains stdout/stderr while waiting for the child.
        # Calling wait() first can deadlock once the verbose tar listing
        # fills the pipe buffer.
        output = pid.communicate()
        return_code = pid.returncode
        Log.debug("return code: %s output: %s" % (return_code, output))
        if return_code != 0:
            errors.append("Failed to scp package to %s with error:\n%s"
                          % (entry["dest"], output[1]))

    if errors:
        for error in errors:
            Log.error(error)
        sys.exit(-1)

    Log.info("Done distributing packages")
def __init__(self, pplan_helper, in_stream, out_stream, looper):
    """Set up a bolt instance on top of the base instance.

    After the base initializer has run, this rejects a physical plan that
    describes a spout, creates the bolt metrics and the serializer, reads the
    acking flag from the cluster config (default False) and instantiates the
    user's bolt class with this instance as its delegate.

    :raises RuntimeError: if the physical plan helper reports a spout
    """
    super(BoltInstance, self).__init__(pplan_helper, in_stream, out_stream, looper)

    if self.pplan_helper.is_spout:
        raise RuntimeError("No bolt in physical plan")

    # bolt_config is auto-typed, not <str -> str> only
    topology_context = self.pplan_helper.context
    self.bolt_metrics = BoltMetrics(self.pplan_helper)
    self.serializer = SerializerHelper.get_serializer(topology_context)

    # acking related
    cluster_config = topology_context.get_cluster_config()
    self.acking_enabled = cluster_config.get(constants.TOPOLOGY_ENABLE_ACKING, False)
    Log.info("Enable ACK: %s" % str(self.acking_enabled))

    # load the user's bolt class and instantiate it
    user_bolt_class = super(BoltInstance, self).load_py_instance(is_spout=False)
    self.bolt_impl = user_bolt_class(delegate=self)