def post(req: func.HttpRequest) -> func.HttpResponse:
    """Handle a node event request.

    Parses a ``NodeEventEnvelope`` from the request, wraps a bare
    ``NodeStateUpdate`` or ``WorkerEvent`` payload in a ``NodeEvent``, and
    hands the event to ``on_state_update`` or ``on_worker_event``.

    Returns ``BoolResult(result=True)`` once a handler has run, or an
    ``INVALID_REQUEST`` error response when the payload is not a recognised
    event or carries neither a state update nor a worker event.
    """
    envelope = parse_request(NodeEventEnvelope, req)
    if isinstance(envelope, Error):
        return not_ok(envelope, context=ERROR_CONTEXT)

    payload = envelope.event
    logging.info(
        "node event: machine_id: %s event: %s",
        envelope.machine_id,
        payload,
    )

    # Bring every accepted payload shape into a single NodeEvent.
    if isinstance(payload, NodeEvent):
        event = payload
    elif isinstance(payload, NodeStateUpdate):
        event = NodeEvent(state_update=payload)
    elif isinstance(payload, WorkerEvent):
        event = NodeEvent(worker_event=payload)
    else:
        event = None

    if event is not None:
        if event.state_update:
            on_state_update(envelope.machine_id, event.state_update)
            return ok(BoolResult(result=True))
        if event.worker_event:
            on_worker_event(envelope.machine_id, event.worker_event)
            return ok(BoolResult(result=True))

    # Unknown payload type, or a NodeEvent with nothing set.
    invalid = Error(code=ErrorCode.INVALID_REQUEST, errors=["invalid node event"])
    return not_ok(invalid, context=ERROR_CONTEXT)
def patch(req: func.HttpRequest) -> func.HttpResponse:
    """Reset the proxy for a region by moving it to ``VmState.stopping``.

    Returns ``BoolResult(result=True)`` when a proxy was found and saved in
    the stopping state, ``BoolResult(result=False)`` when ``Proxy.get``
    returned ``None``, or an error response when the request fails to parse.
    """
    request = parse_request(ProxyReset, req)
    if isinstance(request, Error):
        return not_ok(request, context="ProxyReset")

    proxy = Proxy.get(request.region)
    if proxy is None:
        return ok(BoolResult(result=False))

    proxy.state = VmState.stopping
    proxy.save()
    return ok(BoolResult(result=True))
def patch(req: func.HttpRequest) -> func.HttpResponse:
    """Reset every proxy found for a region.

    Each proxy returned by ``Proxy.search`` for the requested region is
    moved to ``VmState.stopping`` via ``set_state``.

    Returns a ``BoolResult`` whose ``result`` is the truthiness of the
    search result, or an error response when the request fails to parse.
    """
    request = parse_request(ProxyReset, req)
    if isinstance(request, Error):
        return not_ok(request, context="ProxyReset")

    matches = Proxy.search(query={"region": [request.region]})
    for entry in matches:
        entry.set_state(VmState.stopping)

    return ok(BoolResult(result=bool(matches)))
def delete(req: func.HttpRequest) -> func.HttpResponse:
    """Delete a corpus container by name.

    Returns a ``BoolResult`` carrying whatever ``delete_container`` returned
    for the ``StorageType.corpus`` container, or an error response when the
    request fails to parse.
    """
    request = parse_request(ContainerDelete, req)
    if isinstance(request, Error):
        return not_ok(request, context="container delete")

    container_name = request.name
    logging.info("container - deleting %s", container_name)
    deleted = delete_container(container_name, StorageType.corpus)
    return ok(BoolResult(result=deleted))
def delete(req: func.HttpRequest) -> func.HttpResponse:
    """Delete a job template by name.

    Looks the template up in ``JobTemplateIndex`` and, when it exists,
    deletes it.

    Returns ``BoolResult(result=True)`` when an entry was found (and
    deleted), ``BoolResult(result=False)`` when no entry has that name, or
    an error response when the request fails to parse.
    """
    request = parse_request(JobTemplateDelete, req)
    if isinstance(request, Error):
        return not_ok(request, context="JobTemplateDelete")

    entry = JobTemplateIndex.get(request.name)
    found = entry is not None
    if found:
        # The lookup alone does not remove anything; without this call the
        # handler would report success while leaving the template in place.
        entry.delete()

    return ok(BoolResult(result=found))
def delete(req: func.HttpRequest) -> func.HttpResponse:
    """Delete a single queued message for a node.

    The request is parsed with ``parse_uri`` as a ``NodeCommandDelete`` and
    the message is removed via ``NodeMessage.delete_messages``.

    Returns ``BoolResult(result=True)``, or an error response when the
    request fails to parse.
    """
    request = parse_uri(NodeCommandDelete, req)
    if isinstance(request, Error):
        return not_ok(request, context="NodeCommandDelete")

    message_ids = [request.message_id]
    NodeMessage.delete_messages(request.machine_id, message_ids)
    return ok(BoolResult(result=True))
def on_worker_event(machine_id: UUID, event: WorkerEvent) -> func.HttpResponse:
    """Apply a worker event to the task and node it refers to.

    A ``running`` event marks the task as running and the node as busy
    (unless either is already past that point), records the node/task
    pairing and calls ``task.on_start()``. A ``done`` event records a failed
    exit status on the task, moves the task to ``stopping`` and the node to
    ``done`` (again unless already past that point), and removes the
    node/task pairing. In both cases the task and node are saved and a
    ``TaskEvent`` is stored.

    Args:
        machine_id: Identifier of the node that sent the event.
        event: The worker event; exactly one of ``running``/``done`` is
            expected to be set.

    Returns:
        An ``ok`` response wrapping ``BoolResult(result=True)``.

    Raises:
        RequestException: If the event carries neither ``running`` nor
            ``done``.
    """
    if event.running:
        task_id = event.running.task_id
    elif event.done:
        task_id = event.done.task_id
    else:
        # Reject unknown events before any lookup. Otherwise `task_id` would
        # be unbound and the lookup below would fail with UnboundLocalError
        # instead of the intended request error.
        err = Error(
            code=ErrorCode.INVALID_REQUEST,
            errors=["invalid worker event type"],
        )
        raise RequestException(err)

    task = get_task_checked(task_id)
    node = get_node_checked(machine_id)
    node_task = NodeTasks(
        machine_id=machine_id, task_id=task_id, state=NodeTaskState.running
    )

    if event.running:
        if task.state not in TaskState.shutting_down():
            task.state = TaskState.running
        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.busy
        node_task.save()
        task.on_start()
    else:
        # `event.done` is set: the validation above guarantees it.
        #
        # Only record exit status if the task isn't already shutting down.
        #
        # It's ok for the agent to fail because resources vanish out from
        # underneath it during deletion.
        if task.state not in TaskState.shutting_down():
            exit_status = event.done.exit_status

            if not exit_status.success:
                logging.error("task failed: status = %s", exit_status)

                task.error = Error(
                    code=ErrorCode.TASK_FAILED,
                    errors=[
                        "task failed. exit_status = %s" % exit_status,
                        event.done.stdout,
                        event.done.stderr,
                    ],
                )

            task.state = TaskState.stopping

        if node.state not in NodeState.ready_for_reset():
            node.state = NodeState.done

        node_task.delete()

    task.save()
    node.save()

    task_event = TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event)
    task_event.save()
    return ok(BoolResult(result=True))
def delete(req: func.HttpRequest) -> func.HttpResponse:
    """Delete a single queued message for a node, if it exists.

    Returns ``BoolResult(result=True)`` whether or not a message was found,
    or an error response when the request fails to parse.
    """
    request = parse_request(NodeCommandDelete, req)
    if isinstance(request, Error):
        return not_ok(request, context="NodeCommandDelete")

    found = NodeMessage.get(request.machine_id, request.message_id)
    if found:
        found.delete()

    return ok(BoolResult(result=True))
def delete(req: func.HttpRequest) -> func.HttpResponse:
    """Request shutdown of a pool identified by name.

    The pool is looked up with ``Pool.get_by_name`` and asked to shut down
    via ``set_shutdown``, forwarding the request's ``now`` flag.

    Returns ``BoolResult(result=True)``, or an error response when the
    request fails to parse or the pool lookup returns an ``Error``.
    """
    request = parse_request(PoolStop, req)
    if isinstance(request, Error):
        return not_ok(request, context="PoolDelete")

    target = Pool.get_by_name(request.name)
    if isinstance(target, Error):
        return not_ok(target, context="pool stop")

    target.set_shutdown(now=request.now)
    return ok(BoolResult(result=True))
def post(req: func.HttpRequest) -> func.HttpResponse:
    """Store an uploaded job template under the requested name.

    Returns ``BoolResult(result=True)`` on success, or an error response
    when the request fails to parse or ``save()`` returns an ``Error``.
    """
    request = parse_request(JobTemplateUpload, req)
    if isinstance(request, Error):
        return not_ok(request, context="JobTemplateUpload")

    template_entry = JobTemplateIndex(name=request.name, template=request.template)
    outcome = template_entry.save()
    if isinstance(outcome, Error):
        return not_ok(outcome, context="JobTemplateUpload")

    return ok(BoolResult(result=True))
def delete(req: func.HttpRequest) -> func.HttpResponse:
    """Delete a webhook by id.

    Returns ``BoolResult(result=True)`` after deleting the webhook, or an
    error response when the request fails to parse or the lookup returns an
    ``Error``.
    """
    request = parse_request(WebhookGet, req)
    if isinstance(request, Error):
        return not_ok(request, context="webhook delete")

    webhook_id = request.webhook_id
    logging.info("deleting webhook: %s", webhook_id)

    webhook = Webhook.get_by_id(webhook_id)
    if isinstance(webhook, Error):
        return not_ok(webhook, context="webhook delete")

    webhook.delete()
    return ok(BoolResult(result=True))
def on_state_update(machine_id: UUID, state: NodeState) -> func.HttpResponse:
    """Record a state reported by a node.

    The update is accepted when the reported state is ``NodeState.init`` or
    the node's current state is not in ``NodeState.ready_for_reset()``; an
    accepted update is saved only if it changes the node's state. Any other
    update is logged and ignored.

    Args:
        machine_id: Identifier of the reporting node.
        state: The state the node reports.

    Returns:
        An ``ok`` response wrapping ``BoolResult(result=True)`` in every case.
    """
    node = get_node_checked(machine_id)

    accepted = (
        state == NodeState.init
        or node.state not in NodeState.ready_for_reset()
    )
    if not accepted:
        logging.info("ignoring state updates from the node: %s: %s", machine_id, state)
    elif node.state != state:
        node.state = state
        node.save()

    return ok(BoolResult(result=True))
def post(req: func.HttpRequest) -> func.HttpResponse:
    """Create a task from a ``TaskConfig`` request.

    Steps, in order: parse the request, resolve the caller via
    ``parse_jwt_token``, validate the config with ``check_config``, stop
    early with ``BoolResult(result=True)`` when a ``dryrun`` query parameter
    is present, verify the job exists and is in ``enabled`` or ``init``
    state, verify every prerequisite task can be looked up, then create the
    task.

    Returns the created task wrapped in ``ok``, or an error response for the
    first step that fails.
    """
    request = parse_request(TaskConfig, req)
    if isinstance(request, Error):
        return not_ok(request, context="task create")

    user_info = parse_jwt_token(req)
    if isinstance(user_info, Error):
        return not_ok(user_info, context="task create")

    try:
        check_config(request)
    except TaskConfigError as err:
        config_error = Error(code=ErrorCode.INVALID_REQUEST, errors=[str(err)])
        return not_ok(config_error, context="task create")

    # A dry run only validates; nothing below is looked up or created.
    if "dryrun" in req.params:
        return ok(BoolResult(result=True))

    job = Job.get(request.job_id)
    if job is None:
        missing_job = Error(
            code=ErrorCode.INVALID_REQUEST, errors=["unable to find job"]
        )
        return not_ok(missing_job, context=request.job_id)

    if job.state not in (JobState.enabled, JobState.init):
        bad_state = Error(
            code=ErrorCode.UNABLE_TO_ADD_TASK_TO_JOB,
            errors=["unable to add a job in state: %s" % job.state.name],
        )
        return not_ok(bad_state, context=job.job_id)

    # `prereq_tasks` may be empty or unset; treat both as "no prerequisites".
    for prereq_id in request.prereq_tasks or []:
        prereq = Task.get_by_task_id(prereq_id)
        if isinstance(prereq, Error):
            return not_ok(prereq, context="task create prerequisite")

    created = Task.create(config=request, job_id=request.job_id, user_info=user_info)
    if isinstance(created, Error):
        return not_ok(created, context="task create invalid pool")

    return ok(created)
def patch(req: func.HttpRequest) -> func.HttpResponse:
    """Stop the node identified by the request's machine id.

    Returns ``BoolResult(result=True)`` after calling ``node.stop()``, an
    ``UNABLE_TO_FIND`` error response when no node matches, or an error
    response when the request fails to parse.
    """
    request = parse_request(NodeGet, req)
    if isinstance(request, Error):
        return not_ok(request, context="NodeRestart")

    node = Node.get_by_machine_id(request.machine_id)
    if node:
        node.stop()
        return ok(BoolResult(result=True))

    not_found = Error(code=ErrorCode.UNABLE_TO_FIND, errors=["unable to find node"])
    return not_ok(not_found, context=request.machine_id)
def patch(req: func.HttpRequest) -> func.HttpResponse:
    """Replace the template of an existing job template entry.

    Returns ``BoolResult(result=True)`` after saving the entry, an
    ``UNABLE_TO_UPDATE`` error response when no entry has the requested
    name, or an error response when the request fails to parse.
    """
    request = parse_request(JobTemplateUpdate, req)
    if isinstance(request, Error):
        return not_ok(request, context="JobTemplateUpdate")

    existing = JobTemplateIndex.get(request.name)
    if existing is not None:
        existing.template = request.template
        existing.save()
        return ok(BoolResult(result=True))

    missing = Error(code=ErrorCode.UNABLE_TO_UPDATE, errors=["no such job template"])
    return not_ok(missing, context="JobTemplateUpdate")
def post(req: func.HttpRequest) -> func.HttpResponse:
    """Update settings on a node.

    ``debug_keep_node`` is copied onto the node only when the request
    supplies a value (i.e. it is not ``None``); the node is then saved.

    Returns ``BoolResult(result=True)``, an ``UNABLE_TO_FIND`` error
    response when no node matches the machine id, or an error response when
    the request fails to parse.
    """
    request = parse_request(NodeUpdate, req)
    if isinstance(request, Error):
        return not_ok(request, context="NodeUpdate")

    node = Node.get_by_machine_id(request.machine_id)
    if not node:
        not_found = Error(
            code=ErrorCode.UNABLE_TO_FIND, errors=["unable to find node"]
        )
        return not_ok(not_found, context=request.machine_id)

    keep_flag = request.debug_keep_node
    if keep_flag is not None:
        node.debug_keep_node = keep_flag

    node.save()
    return ok(BoolResult(result=True))
def delete(req: func.HttpRequest) -> func.HttpResponse:
    """Remove a debug proxy forward and refresh the affected proxies.

    ``ProxyForward.remove_forward`` returns the regions touched by the
    removal; for each one the proxy is fetched with ``Proxy.get_or_create``
    and, when one is returned, its config is saved again.

    Returns ``BoolResult(result=True)``, or an error response when the
    request fails to parse.
    """
    request = parse_request(ProxyDelete, req)
    if isinstance(request, Error):
        return not_ok(request, context="debug_proxy delete")

    affected_regions = ProxyForward.remove_forward(
        scaleset_id=request.scaleset_id,
        machine_id=request.machine_id,
        dst_port=request.dst_port,
    )

    for region in affected_regions:
        region_proxy = Proxy.get_or_create(region)
        if not region_proxy:
            continue
        region_proxy.save_proxy_config()

    return ok(BoolResult(result=True))
def post(req: func.HttpRequest) -> func.HttpResponse:
    """Add an SSH public key to a node.

    Returns ``BoolResult(result=True)`` on success, an ``UNABLE_TO_FIND``
    error response when no node matches the machine id, or an error response
    when the request fails to parse or ``add_ssh_public_key`` returns an
    ``Error``.
    """
    request = parse_request(NodeAddSshKey, req)
    if isinstance(request, Error):
        return not_ok(request, context="NodeAddSshKey")

    node = Node.get_by_machine_id(request.machine_id)
    if not node:
        not_found = Error(
            code=ErrorCode.UNABLE_TO_FIND, errors=["unable to find node"]
        )
        return not_ok(not_found, context=request.machine_id)

    outcome = node.add_ssh_public_key(public_key=request.public_key)
    if isinstance(outcome, Error):
        return not_ok(outcome, context="NodeAddSshKey")

    return ok(BoolResult(result=True))
def on_state_update(
    machine_id: UUID,
    state_update: NodeStateUpdate,
) -> func.HttpResponse:
    """Record a state update reported by a node.

    The update is accepted when the reported state is ``NodeState.init`` or
    the node's current state is not in ``NodeState.ready_for_reset()``;
    otherwise it is logged and ignored. An accepted update that changes the
    node's state is saved. When the new state is ``setting_up`` and the
    update carries data, each listed task is moved to ``setting_up`` (unless
    it is already ``running``) and a ``NodeTasks`` record in ``setting_up``
    state is saved for it.

    Args:
        machine_id: Identifier of the reporting node.
        state_update: The reported state plus optional per-state data.

    Returns:
        An ``ok`` response wrapping ``BoolResult(result=True)`` in every case.
    """
    state = state_update.state
    node = get_node_checked(machine_id)

    accepted = (
        state == NodeState.init
        or node.state not in NodeState.ready_for_reset()
    )
    if not accepted:
        logging.info("ignoring state updates from the node: %s: %s", machine_id, state)
        return ok(BoolResult(result=True))

    # Nothing to do when the node already reports this state.
    if node.state == state:
        return ok(BoolResult(result=True))

    node.state = state
    node.save()

    # The data field will be required in the future.
    # For now, it is optional for back compat.
    if state != NodeState.setting_up or not state_update.data:
        return ok(BoolResult(result=True))

    for task_id in state_update.data.tasks:
        task = get_task_checked(task_id)

        # The task state may be `running` if it has `vm_count` > 1, and
        # another node is concurrently executing the task. If so, leave
        # the state as-is, to represent the max progress made.
        #
        # Other states we would want to preserve are excluded by the
        # acceptance check at the top of this function.
        if task.state != TaskState.running:
            task.state = TaskState.setting_up

            # We don't yet call `on_start()` for the task.
            # This will happen once we see a worker event that
            # reports it as `running`.
            task.save()

        # Note: we set the node task state to `setting_up`, even though
        # the task itself may be `running`.
        NodeTasks(
            machine_id=machine_id,
            task_id=task_id,
            state=NodeTaskState.setting_up,
        ).save()

    return ok(BoolResult(result=True))
def delete(req: func.HttpRequest) -> func.HttpResponse:
    """Stop a scaleset, either immediately or gracefully.

    The scaleset is moved to ``ScalesetState.halt`` when the request's
    ``now`` flag is truthy and to ``ScalesetState.shutdown`` otherwise, then
    saved.

    Returns ``BoolResult(result=True)``, or an error response when the
    request fails to parse or the scaleset lookup returns an ``Error``.
    """
    request = parse_request(ScalesetStop, req)
    if isinstance(request, Error):
        return not_ok(request, context="ScalesetDelete")

    scaleset = Scaleset.get_by_id(request.scaleset_id)
    if isinstance(scaleset, Error):
        return not_ok(scaleset, context="scaleset stop")

    scaleset.state = ScalesetState.halt if request.now else ScalesetState.shutdown
    scaleset.save()

    # NOTE(review): auth is cleared after the save and the scaleset is not
    # part of the response, so this has no visible effect in this handler.
    # Confirm whether it is still needed.
    scaleset.auth = None
    return ok(BoolResult(result=True))
def post(req: func.HttpRequest) -> func.HttpResponse:
    """Handle a node event request by delegating to ``process``.

    Returns ``BoolResult(result=True)`` on success, or an error response
    when the request fails to parse or ``process`` returns an ``Error``
    (which is also logged).
    """
    envelope = parse_request(NodeEventEnvelope, req)
    if isinstance(envelope, Error):
        return not_ok(envelope, context="node event")

    event_json = envelope.event.json(exclude_none=True)
    logging.info(
        "node event: machine_id: %s event: %s",
        envelope.machine_id,
        event_json,
    )

    outcome = process(envelope)
    if not isinstance(outcome, Error):
        return ok(BoolResult(result=True))

    logging.error(
        "unable to process agent event. envelope:%s error:%s", envelope, outcome
    )
    return not_ok(outcome, context="node event")
def patch(req: func.HttpRequest) -> func.HttpResponse:
    """Reimage a node: stop it as done and clear its debug-keep flag.

    The caller must pass ``check_require_admins``. The node is stopped with
    ``done=True``; if ``debug_keep_node`` was set it is reset to ``False``
    and the node is saved.

    Returns ``BoolResult(result=True)``, an ``UNABLE_TO_FIND`` error
    response when no node matches the machine id, or an error response when
    the request fails to parse or the admin check returns an ``Error``.
    """
    request = parse_request(NodeGet, req)
    if isinstance(request, Error):
        return not_ok(request, context="NodeReimage")

    admin_check = check_require_admins(req)
    if isinstance(admin_check, Error):
        return not_ok(admin_check, context="NodeReimage")

    node = Node.get_by_machine_id(request.machine_id)
    if not node:
        not_found = Error(
            code=ErrorCode.UNABLE_TO_FIND, errors=["unable to find node"]
        )
        return not_ok(not_found, context=request.machine_id)

    node.stop(done=True)

    # Only touch storage when the flag actually needs clearing.
    if node.debug_keep_node:
        node.debug_keep_node = False
        node.save()

    return ok(BoolResult(result=True))