Example #1
    def run_keys_from_partial_values(
        self,
        run_key_values,
        group_uuid=None,
        experiment_uuid=None,
    ):
        # NOTE: This probably does not work well with arrays.

        # All of the UUIDs are optional and should restrict the results
        # to only items associated with the respective group/experiment.
        cls_term, cls_bindings = self._create_item_cls_query_term(runs.RunKey)
        uuid_terms, uuid_bindings = self._create_uuid_query_terms(
            group_uuid=group_uuid,
            experiment_uuid=experiment_uuid,
        )
        run_key_binding = {"attributes": {"key_values": run_key_values}}
        run_key_binding = serialization.serialize(run_key_binding)

        query = (f"SELECT data FROM {ITEMS_TABLE} WHERE "
                 f"data @> %s AND {cls_term} AND {uuid_terms}")
        bindings = (run_key_binding, ) + cls_bindings + uuid_bindings

        with self._cursor() as c:
            c.execute(query, bindings)
            rows = c.fetchall()

        items = [serialization.deserialize(row[0]) for row in rows]
        return items
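
A hedged usage sketch of the query method above; the storage instance, the partial key values, and the experiment UUID are assumptions for illustration, not part of the original code:

# Hypothetical call site: "storage" is assumed to be an instance of the
# class that defines run_keys_from_partial_values.
matching_run_keys = storage.run_keys_from_partial_values(
    run_key_values={"learning_rate": 1e-3},
    experiment_uuid=experiment_uuid,
)
for run_key in matching_run_keys:
    print(run_key)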
Example #2
    def replace_item(self, item_uuid, new_item):
        assert item_uuid, "Update needs a non-empty item_uuid."
        ser_item = serialization.serialize(new_item)
        with self._cursor() as c:
            c.execute(
                f"UPDATE {ITEMS_TABLE} SET data = %s WHERE uuid = %s",
                (ser_item, item_uuid),
            )
        return item_uuid
Example #3
def _add_start_supervisor_script(execution_items, executor_params,
                                 launch_params, instance_name):
    on_start_cmd = executor_params.create_onstart_cmd()
    executor_params = executor_params.copy(entire_on_start_cmd=on_start_cmd)

    # We serialize each execution item so that we can pass the strings to the
    # worker on the supervisor. Thus we do not need to download the
    # experiment's dependencies on the supervisor.
    execution_items = [
        serialization.serialize(item) for item in execution_items
    ]

    execution_items = serialization.serialize(execution_items).encode("utf-8")
    executor_params = serialization.serialize(executor_params).encode("utf-8")

    execution_items = base64.b64encode(execution_items).decode("utf-8")
    executor_params = base64.b64encode(executor_params).decode("utf-8")

    logs_dir = launch_params.supervisor_logs_dir
    stdout = os.path.join(logs_dir, "logs.stdout")
    stderr = os.path.join(logs_dir, "logs.stderr")

    cmd = [
        launch_params.python_binary,
        launch_params.supervisor_main,
        f"--gce_instance_name='{instance_name}'",
        f"--zone='{launch_params.datacenter}'",
        "--execution_items_file=~/.execution_items",
        "--executor_params_file=~/.execution_params",
        f"1>{stdout} 2>{stderr} &",
    ]
    cmd = " ".join(cmd)
    script = [
        f"EXE_ITEMS_B64='{execution_items}'",
        "echo $EXE_ITEMS_B64 > ~/.execution_items",
        #
        f"EXE_PARAMS_B64='{executor_params}'",
        "echo $EXE_PARAMS_B64 > ~/.execution_params",
        #
        f"mkdir -p {logs_dir}",
        cmd,
    ]
    return "\n".join(script)
Example #4
def launch(execution_items, longleaf_params: LongleafParams):
    launch_id = uuidlib.uuid4().hex

    # We serialize each execution item so that we can pass the strings to the
    # worker on the supervisor. Thus we do not need to download the
    # experiment's dependencies on the supervisor.
    execution_items = [
        serialization.serialize(_change_storage_params(item, longleaf_params))
        for item in execution_items
    ]

    launcher.launch(execution_items, longleaf_params, launch_id)
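
_change_storage_params is not shown in these examples; a hedged sketch of what it might do, inferred only from its name, its call site, and the .copy(...) pattern used in Example #3 (the attribute names are assumptions):

# Hypothetical implementation; "storage_params" and the copy() signature
# are assumptions, not the original code.
def _change_storage_params(execution_item, longleaf_params):
    # Point the item at the storage configured for the Longleaf launch.
    return execution_item.copy(storage_params=longleaf_params.storage_params)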
Example #5
def main_loop():
    address = ("127.0.0.1", FLAGS.port)
    with connection.Listener(address) as listener:
        while True:
            with listener.accept() as conn:
                while True:
                    # Try to keep these flushed for exit logging.
                    sys.stdout.flush()
                    sys.stderr.flush()

                    try:
                        logging.info("Waiting for message from supervisor.")
                        msg = conn.recv()
                        logging.info("Message received.")
                    except EOFError:
                        logging.warning("[NOT FATAL] EOFError on conn.recv()")
                        break

                    msg = serialization.deserialize(msg)
                    logging.info(f"Incoming msg: {msg}")

                    if msg.type == messages.MessageType.PROCESS_ITEM:
                        exe_item = msg.content.execution_item
                        if isinstance(exe_item, str):
                            exe_item = serialization.deserialize(exe_item)

                        logging.info(
                            f"Processing execution item: {serialization.serialize(exe_item, indent=2)}"
                        )
                        entrypoint.worker_run(**exe_item.worker_run_kwargs)

                        response = messages.Message(
                            type=messages.MessageType.PROCESS_ITEM,
                            content=messages.ItemProcessed(
                                status=messages.ResponseStatus.SUCCESS),
                        )
                        logging.info("Successfully processed execution item")

                        ser_res = serialization.serialize(response)
                        logging.info("Sending response to supervisor.")
                        conn.send(ser_res)

                        logging.info("Clearing keras session.")
                        tf.keras.backend.clear_session()

                    # NOTE: I don't think I support this, so commenting it out.
                    # elif msg.type == messages.MessageType.KILL:
                    #     return

                    else:
                        raise ValueError(
                            f"Message received with unknown type {msg.type}.")
Example #6
    def _process_execution_item(self, item):
        # TODO: Add some nice logging and handle some failures (see Vast AI for example).
        msg = messages.ProcessItem.from_execution_item(item)
        ser_msg = serialization.serialize(msg)

        self._conn.send(ser_msg)
        logging.info("Sent execution item message to worker.")

        response = self._conn.recv()
        self._assert_good_response_status(response)

        logging.info("Successfully processed an item.")
        return response
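
_assert_good_response_status is referenced above but not shown; a minimal sketch consistent with the status check done in Example #9 (an assumption, not the original helper):

    # Hypothetical helper: deserializes the raw response from the worker
    # and validates its status field.
    def _assert_good_response_status(self, ser_response):
        response = serialization.deserialize(ser_response)
        status = response.content.status
        if status != messages.ResponseStatus.SUCCESS:
            raise ValueError(f"Bad response status: {status}.")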
Example #7
def main(_):
    logging.info("Longleaf worker started.")
    logging.info(
        f"Waiting to connect to {FLAGS.listener_host}:{FLAGS.listener_port}")

    conn = connection.Client((FLAGS.listener_host, FLAGS.listener_port))
    logging.info(f"Connected to {FLAGS.listener_host}:{FLAGS.listener_port}")

    conn.send("CONNECTED")

    while True:
        try:
            logging.info("Waiting for message from supervisor.")
            msg = conn.recv()
            logging.info("Message received.")
        except EOFError:
            logging.warning("[NOT FATAL] EOFError on conn.recv()")
            break

        msg = serialization.deserialize(msg)
        logging.info(f"Incoming msg: {msg}")

        if msg.type == messages.MessageType.PROCESS_ITEM:
            exe_item = msg.content.execution_item
            if isinstance(exe_item, str):
                exe_item = serialization.deserialize(exe_item)

            logging.info(
                f"Processing execution item: {serialization.serialize(exe_item, indent=2)}"
            )
            entrypoint.worker_run(**exe_item.worker_run_kwargs)

            response = messages.Message(
                type=messages.MessageType.PROCESS_ITEM,
                content=messages.ItemProcessed(
                    status=messages.ResponseStatus.SUCCESS),
            )
            logging.info("Successfully processed execution item")

            ser_res = serialization.serialize(response)
            logging.info("Sending response to supervisor.")
            conn.send(ser_res)

            logging.info("Clearing keras session.")
            tf.keras.backend.clear_session()

        else:
            raise ValueError(f"Message received with unknown type {msg.type}.")
Example #8
    def store_item(self, item):
        item_uuid = self.new_uuid()
        ser_item = serialization.serialize(item)
        with self._cursor() as c:
            c.execute(
                f"INSERT INTO {ITEMS_TABLE} VALUES (%s, %s, %s, %s, %s)",
                (
                    item_uuid,
                    self.group_uuid,
                    self.experiment_uuid,
                    self.run_uuid,
                    ser_item,
                ),
            )
        return item_uuid
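
The INSERT above supplies five positional values, and the other storage methods select data and filter by uuid and by JSONB containment (data @> %s). A hedged sketch of a PostgreSQL schema consistent with those statements; only the uuid and data column names appear in the original queries, the rest are assumptions:

# Hypothetical DDL consistent with the statements in these examples.
CREATE_ITEMS_TABLE_DDL = f"""
    CREATE TABLE IF NOT EXISTS {ITEMS_TABLE} (
        uuid TEXT PRIMARY KEY,
        group_uuid TEXT,
        experiment_uuid TEXT,
        run_uuid TEXT,
        data JSONB
    )
"""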
Example #9
    def accept_item(self, item):
        assert self.state == _WorkerStates.ACCEPTING
        assert item is not None

        self.state = _WorkerStates.PROCESSING

        msg = messages.Message(
            type=messages.MessageType.PROCESS_ITEM,
            content=messages.ProcessItem(execution_item=item),
        )

        ser_msg = serialization.serialize(msg)
        logging.info(f"Sending execution item to worker {self._uuid}.")
        start_time = time.time()
        self._conn.send(ser_msg)

        try:
            response = self._conn.recv()
        except EOFError as e:
            logging.error(
                f"Worker {self._uuid} received EOFError. Instance {self._instance._json}."
            )
            logging.exception(e)
            self._supervisor.handle_failed_item(item)
            return self.kill()

        elapsed_seconds = time.time() - start_time
        elapsed_nice = str(datetime.timedelta(seconds=elapsed_seconds))

        logging.info(f"Received response from worker {self._uuid}.")
        # Elapsed will be formatted as "hh:mm:ss.fractions".
        logging.info(f"The worker processed the item in {elapsed_nice}.")
        response = serialization.deserialize(response)

        if response.content.status != messages.ResponseStatus.SUCCESS:
            # TODO: Handle failures.
            raise ValueError(
                f"Unsuccessful item processing with status {response.status}."
            )

        logging.info("Successfully processed an item.")

        self.state = _WorkerStates.ACCEPTING

        return self
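
_WorkerStates is referenced in accept_item but not defined in these examples; a minimal sketch of an enum consistent with the states used above (the exact member set is an assumption):

import enum


# Hypothetical state enum: only ACCEPTING and PROCESSING appear in the
# original code; any further members are assumptions.
class _WorkerStates(enum.Enum):
    ACCEPTING = "accepting"
    PROCESSING = "processing"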
Example #10
def _create_singularity_command(
    longleaf_params, supervisor_params, launch_id, execution_items
):
    setup_cmd = setup_util.create_setup_command(
        supervisor_params,
        longleaf_params.project_params,
        longleaf_params.storage_params,
    )

    python_cmd = [
        PYTHON,
        supervisor_params.supervisor_main,
        f"--longleaf_params_base64={_to_base64(longleaf_params)}",
        f"--execution_items_base64={_to_base64(execution_items)}",
        f"--launch_id={launch_id}",
    ]
    cmd = [
        setup_cmd,
        " ".join(python_cmd),
    ]
    return "\n".join(cmd)
Example #11
def _to_base64(data):
    ser = serialization.serialize(data)
    return base64.b64encode(ser.encode("utf-8")).decode("utf-8")
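
A hedged sketch of the inverse helper that the supervisor main would need to decode --longleaf_params_base64 and --execution_items_base64; the function name is an assumption:

# Hypothetical inverse of _to_base64; reverses the encode/serialize steps.
def _from_base64(b64_data):
    ser = base64.b64decode(b64_data).decode("utf-8")
    return serialization.deserialize(ser)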