def run_keys_from_partial_values(
    self,
    run_key_values,
    group_uuid=None,
    experiment_uuid=None,
):
    """Return stored run keys whose key values contain `run_key_values`.

    The partial values are wrapped as `{"attributes": {"key_values": ...}}`,
    serialized, and matched against the `data` column with the SQL `@>`
    operator (presumably a JSON containment match -- confirm against the
    database in use).

    NOTE: This probably does not work well with arrays.

    Args:
        run_key_values: Partial key values that a run key must contain.
        group_uuid: Optional. Forwarded to `_create_uuid_query_terms`; it
            should restrict the results to items of that group.
        experiment_uuid: Optional. Forwarded to `_create_uuid_query_terms`;
            it should restrict the results to items of that experiment.

    Returns:
        A list with the deserialized `data` of every matching row.
    """
    cls_term, cls_bindings = self._create_item_cls_query_term(runs.RunKey)
    uuid_terms, uuid_bindings = self._create_uuid_query_terms(
        group_uuid=group_uuid,
        experiment_uuid=experiment_uuid,
    )

    partial_item = serialization.serialize(
        {"attributes": {"key_values": run_key_values}}
    )

    where_clause = f"data @> %s AND {cls_term} AND {uuid_terms}"
    query = f"SELECT data FROM {ITEMS_TABLE} WHERE {where_clause}"
    # The binding order must follow the placeholder order in the query.
    bindings = (partial_item,) + cls_bindings + uuid_bindings

    with self._cursor() as cursor:
        cursor.execute(query, bindings)
        rows = cursor.fetchall()

    return [serialization.deserialize(row[0]) for row in rows]
def replace_item(self, item_uuid, new_item):
    """Overwrite the stored data of the item with uuid `item_uuid`.

    Args:
        item_uuid: The uuid of the row to update. Must be truthy.
        new_item: The item whose serialized form replaces the `data` column.

    Returns:
        The `item_uuid` that was passed in.
    """
    assert item_uuid, "Update needs a non-empty item_uuid."

    statement = f"UPDATE {ITEMS_TABLE} SET data = %s WHERE uuid = %s"
    params = (serialization.serialize(new_item), item_uuid)
    with self._cursor() as cursor:
        cursor.execute(statement, params)

    return item_uuid
def _add_start_supervisor_script(execution_items, executor_params,
                                 launch_params, instance_name):
    """Build the shell script text that starts the supervisor.

    The returned script stores the base64-encoded execution items and
    executor params in `~/.execution_items` and `~/.execution_params`,
    creates the supervisor logs directory, and launches the supervisor
    in the background with stdout/stderr redirected into that directory.

    Args:
        execution_items: Iterable of execution items to hand to the
            supervisor.
        executor_params: Params object providing `create_onstart_cmd()` and
            `copy(...)`.
        launch_params: Provides `supervisor_logs_dir`, `python_binary`,
            `supervisor_main` and `datacenter`.
        instance_name: Value for the `--gce_instance_name` flag.

    Returns:
        The script as a single newline-joined string.
    """

    def _serialize_to_b64(obj):
        # Serialized text -> utf-8 bytes -> base64 -> text again.
        raw = serialization.serialize(obj).encode("utf-8")
        return base64.b64encode(raw).decode("utf-8")

    executor_params = executor_params.copy(
        entire_on_start_cmd=executor_params.create_onstart_cmd()
    )

    # We serialize each execution item so that we can pass the string to the
    # worker on the supervisor. Thus we do not need to download experiment's
    # dependencies on the supervisor.
    serialized_items = []
    for item in execution_items:
        serialized_items.append(serialization.serialize(item))

    items_b64 = _serialize_to_b64(serialized_items)
    params_b64 = _serialize_to_b64(executor_params)

    logs_dir = launch_params.supervisor_logs_dir
    stdout_path = os.path.join(logs_dir, "logs.stdout")
    stderr_path = os.path.join(logs_dir, "logs.stderr")

    supervisor_cmd = " ".join([
        launch_params.python_binary,
        launch_params.supervisor_main,
        f"--gce_instance_name='{instance_name}'",
        f"--zone='{launch_params.datacenter}'",
        "--execution_items_file=~/.execution_items",
        "--executor_params_file=~/.execution_params",
        f"1>{stdout_path} 2>{stderr_path} &",
    ])

    script_lines = [
        # Persist the execution items.
        f"EXE_ITEMS_B64='{items_b64}'",
        "echo $EXE_ITEMS_B64 > ~/.execution_items",
        # Persist the executor params.
        f"EXE_PARAMS_B64='{params_b64}'",
        "echo $EXE_PARAMS_B64 > ~/.execution_params",
        # The logs directory must exist before the redirects in the command.
        f"mkdir -p {logs_dir}",
        supervisor_cmd,
    ]
    return "\n".join(script_lines)
def launch(execution_items, longleaf_params: LongleafParams):
    """Serialize the execution items and hand them to the launcher.

    A fresh launch id (hex form of a uuid4) is generated for every call.

    Args:
        execution_items: Iterable of execution items to launch.
        longleaf_params: Params passed both to `_change_storage_params` for
            each item and to `launcher.launch`.
    """
    launch_id = uuidlib.uuid4().hex

    # We serialize each execution item so that we can pass the string to the
    # worker on the supervisor. Thus we do not need to download experiment's
    # dependencies on the supervisor.
    serialized_items = []
    for item in execution_items:
        adjusted_item = _change_storage_params(item, longleaf_params)
        serialized_items.append(serialization.serialize(adjusted_item))

    launcher.launch(serialized_items, longleaf_params, launch_id)
def main_loop():
    """Accept supervisor connections forever and process their messages.

    Listens on 127.0.0.1 at `FLAGS.port`. For each accepted connection,
    messages are received until the peer closes it (EOFError), after which
    the next connection is accepted. Every PROCESS_ITEM message is run
    through `entrypoint.worker_run` and answered with a SUCCESS response.

    Raises:
        ValueError: If a message with any other type is received.
    """
    address = ("127.0.0.1", FLAGS.port)
    with connection.Listener(address) as listener:
        while True:
            with listener.accept() as conn:
                while True:
                    # Try to keep these flushed for exit logging.
                    sys.stdout.flush()
                    sys.stderr.flush()

                    try:
                        logging.info("Waiting for message from supervisor.")
                        raw_msg = conn.recv()
                        logging.info("Message received.")
                    except EOFError:
                        # The peer closed this connection; go back to accept.
                        logging.warning("[NOT FATAL] EOFError on conn.recv()")
                        break

                    msg = serialization.deserialize(raw_msg)
                    logging.info(f"Incoming msg: {msg}")

                    # NOTE: KILL messages are not supported; PROCESS_ITEM is
                    # the only message type that is handled.
                    if not (msg.type == messages.MessageType.PROCESS_ITEM):
                        raise ValueError(
                            f"Message received with unknown type {msg.type}.")

                    exe_item = msg.content.execution_item
                    # The item may arrive still serialized as a string.
                    if isinstance(exe_item, str):
                        exe_item = serialization.deserialize(exe_item)

                    pretty_item = serialization.serialize(exe_item, indent=2)
                    logging.info(f"Processing execution item: {pretty_item}")

                    entrypoint.worker_run(**exe_item.worker_run_kwargs)

                    success = messages.ItemProcessed(
                        status=messages.ResponseStatus.SUCCESS)
                    response = messages.Message(
                        type=messages.MessageType.PROCESS_ITEM,
                        content=success,
                    )
                    logging.info("Successfully processed execution item")

                    ser_res = serialization.serialize(response)
                    logging.info("Sending response to supervisor.")
                    conn.send(ser_res)

                    logging.info("Clearing keras session.")
                    tf.keras.backend.clear_session()
def _process_execution_item(self, item):
    """Send `item` over `self._conn` and wait for the response.

    Args:
        item: The execution item to wrap in a `ProcessItem` message.

    Returns:
        The response received from the connection, after it has been
        checked by `_assert_good_response_status`.
    """
    # TODO: Add some nice logging and handle some failures (see Vast AI for example).
    request = messages.ProcessItem.from_execution_item(item)
    self._conn.send(serialization.serialize(request))
    logging.info("sent")

    response = self._conn.recv()
    self._assert_good_response_status(response)
    logging.info("Successfully processed an item.")
    return response
def main(_):
    """Run the Longleaf worker: connect to the listener and process items.

    Connects to `FLAGS.listener_host`:`FLAGS.listener_port`, announces itself
    with "CONNECTED", and then handles one message per loop iteration until
    the other end closes the connection. Every PROCESS_ITEM message is run
    through `entrypoint.worker_run` and answered with a SUCCESS response.

    Args:
        _: Unused positional argument (presumably argv from the app runner).

    Raises:
        ValueError: If a message with any type other than PROCESS_ITEM is
            received.
    """
    endpoint = f"{FLAGS.listener_host}:{FLAGS.listener_port}"

    logging.info("Longleaf worker started.")
    logging.info(f"Waiting to connect to {endpoint}")
    conn = connection.Client((FLAGS.listener_host, FLAGS.listener_port))
    logging.info(f"Connected to {endpoint}")
    conn.send("CONNECTED")

    while True:
        # Receive exactly one message per iteration, and do it inside the
        # try block so that a closed connection ends the loop cleanly. An
        # extra recv() here would discard a message and leave the worker
        # waiting for one the supervisor never sends.
        try:
            logging.info("Waiting for message from supervisor.")
            raw_msg = conn.recv()
            logging.info("Message received.")
        except EOFError:
            logging.warning("[NOT FATAL] EOFError on conn.recv()")
            break

        msg = serialization.deserialize(raw_msg)
        logging.info(f"Incoming msg: {msg}")

        if msg.type != messages.MessageType.PROCESS_ITEM:
            raise ValueError(f"Message received with unknown type {msg.type}.")

        exe_item = msg.content.execution_item
        # The item may arrive still serialized as a string.
        if isinstance(exe_item, str):
            exe_item = serialization.deserialize(exe_item)
        logging.info(
            f"Processing execution item: {serialization.serialize(exe_item, indent=2)}"
        )

        entrypoint.worker_run(**exe_item.worker_run_kwargs)

        response = messages.Message(
            type=messages.MessageType.PROCESS_ITEM,
            content=messages.ItemProcessed(
                status=messages.ResponseStatus.SUCCESS),
        )
        logging.info("Successfully processed execution item")

        ser_res = serialization.serialize(response)
        logging.info("Sending response to supervisor.")
        conn.send(ser_res)

        logging.info("Clearing keras session.")
        tf.keras.backend.clear_session()
def store_item(self, item):
    """Insert `item` as a new row and return the uuid assigned to it.

    The row holds the new uuid, this object's `group_uuid`,
    `experiment_uuid` and `run_uuid`, and the serialized item.

    Args:
        item: The item to serialize and store.

    Returns:
        The uuid generated by `self.new_uuid()` for the stored item.
    """
    item_uuid = self.new_uuid()
    serialized = serialization.serialize(item)

    statement = f"INSERT INTO {ITEMS_TABLE} VALUES (%s, %s, %s, %s, %s)"
    # Values are positional: they follow the table's column order.
    row = (
        item_uuid,
        self.group_uuid,
        self.experiment_uuid,
        self.run_uuid,
        serialized,
    )
    with self._cursor() as cursor:
        cursor.execute(statement, row)

    return item_uuid
def accept_item(self, item):
    """Send `item` to the worker and block until it reports back.

    The worker state is switched to PROCESSING while the item is in flight
    and back to ACCEPTING after a successful response.

    Args:
        item: The execution item to process. Must not be None.

    Returns:
        `self` on success. If the connection hits EOFError while waiting
        for the response, the item is reported through
        `self._supervisor.handle_failed_item` and the result of
        `self.kill()` is returned instead.

    Raises:
        AssertionError: If the worker is not in the ACCEPTING state or
            `item` is None.
        ValueError: If the worker responds with a non-SUCCESS status.
    """
    assert self.state == _WorkerStates.ACCEPTING
    assert item is not None

    self.state = _WorkerStates.PROCESSING

    msg = messages.Message(
        type=messages.MessageType.PROCESS_ITEM,
        content=messages.ProcessItem(execution_item=item),
    )
    ser_msg = serialization.serialize(msg)

    logging.info(f"Sending execution item to worker {self._uuid}.")
    start_time = time.time()
    self._conn.send(ser_msg)

    try:
        response = self._conn.recv()
    except EOFError as e:
        logging.error(
            f"Worker {self._uuid} received EOFError. Instance {self._instance._json}."
        )
        logging.exception(e)
        self._supervisor.handle_failed_item(item)
        return self.kill()

    elapsed_seconds = time.time() - start_time
    elapsed_nice = str(datetime.timedelta(seconds=elapsed_seconds))

    logging.info(f"Received response from worker {self._uuid}.")
    # Elapsed will be formatted as "hh:mm:ss.fractions".
    logging.info(f"The worker processed the item in {elapsed_nice}.")

    response = serialization.deserialize(response)
    status = response.content.status
    if status != messages.ResponseStatus.SUCCESS:
        # TODO: Handle failures.
        # The status lives on the message content, so report it from there;
        # the message itself is only built with `type` and `content`.
        raise ValueError(
            f"Unsuccessful item processing with status {status}."
        )

    logging.info("Successfully processed an item.")
    self.state = _WorkerStates.ACCEPTING
    return self
def _create_singularity_command(
    longleaf_params, supervisor_params, launch_id, execution_items
):
    """Build the command text that sets up and starts the supervisor.

    The result has two lines: the setup command from
    `setup_util.create_setup_command`, followed by the Python invocation of
    the supervisor main with the params, execution items and launch id
    passed as flags.

    Args:
        longleaf_params: Params providing `project_params` and
            `storage_params`; also passed base64-encoded to the supervisor.
        supervisor_params: Provides `supervisor_main` and is forwarded to the
            setup command.
        launch_id: Value for the `--launch_id` flag.
        execution_items: Items passed base64-encoded to the supervisor.

    Returns:
        The setup command and the Python command joined by a newline.
    """
    setup_cmd = setup_util.create_setup_command(
        supervisor_params,
        longleaf_params.project_params,
        longleaf_params.storage_params,
    )

    # `_to_base64` does the serialize + base64 step for both payloads, so
    # nothing needs to be encoded separately here.
    python_cmd = [
        PYTHON,
        supervisor_params.supervisor_main,
        f"--longleaf_params_base64={_to_base64(longleaf_params)}",
        f"--execution_items_base64={_to_base64(execution_items)}",
        f"--launch_id={launch_id}",
    ]

    return "\n".join([setup_cmd, " ".join(python_cmd)])
def _to_base64(data):
    """Serialize `data` and return the result as base64 text.

    Args:
        data: Any object accepted by `serialization.serialize`.

    Returns:
        A str holding the base64 encoding of the utf-8 encoded serialized
        form.
    """
    serialized_bytes = serialization.serialize(data).encode("utf-8")
    encoded_bytes = base64.b64encode(serialized_bytes)
    return encoded_bytes.decode("utf-8")