def test_new_stopped_status_after_stopping(self):
    """Check which conditions are recorded once the run is ``STOPPING``.

    The test expects a ``RUNNING`` condition sent after ``STOPPING`` to leave
    the stored conditions unchanged, and a ``STOPPED`` condition to be added.
    """

    def push(status_type, **extra):
        # Build the condition right before sending it, then reload the run.
        new_run_status(
            self.run,
            condition=V1StatusCondition.get_condition(
                type=status_type, status=True, reason="foo", **extra
            ),
        )
        self.run.refresh_from_db()

    push(V1Statuses.STOPPING)
    assert len(self.run.status_conditions) == 1

    # A running condition while stopping: the count is expected to stay at 1.
    push(V1Statuses.RUNNING)
    assert len(self.run.status_conditions) == 1

    # A stopped condition (with a message) is expected to be recorded.
    push(V1Statuses.STOPPED, message="New message")
    assert len(self.run.status_conditions) == 2
def test_status_transition(self):
    """Alternate between running and warning conditions after scheduling.

    Every step is expected to add exactly one entry to
    ``self.run.status_conditions``.
    """

    def push(status_type, **extra):
        # Send a freshly built condition and reload the run from the database.
        new_run_status(
            self.run,
            condition=V1StatusCondition.get_condition(
                type=status_type, status=True, reason="foo", **extra
            ),
        )
        self.run.refresh_from_db()

    push(V1Statuses.SCHEDULED)
    assert len(self.run.status_conditions) == 1

    # running -> warning -> running -> warning, each with the same message.
    follow_ups = [
        (V1Statuses.RUNNING, 2),
        (V1Statuses.WARNING, 3),
        (V1Statuses.RUNNING, 4),
        (V1Statuses.WARNING, 5),
    ]
    for status_type, expected_count in follow_ups:
        push(status_type, message="New message")
        assert len(self.run.status_conditions) == expected_count
def runs_prepare(run_id: int, run: Optional[BaseRun] = None, eager: bool = False) -> bool:
    """Compile a run and record the resulting status condition.

    Args:
        run_id: Identifier passed to ``get_run`` and used in the log message.
        run: Optional run instance forwarded to ``get_run``; defaults to
            ``None`` so callers may pass only ``run_id``.
        eager: Forwarded to ``resolver.resolve``; when true, ``runs_start`` is
            called right after a successful compilation.

    Returns:
        ``True`` only when the run compiled, is not pending and ``eager`` is
        false. ``False`` in every other case (run not found, status not
        compilable, compilation failure, pending run, or eager start).
    """
    run = get_run(run_id=run_id, run=run)
    if not run:
        return False

    if not LifeCycle.is_compilable(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.COMPILED,
        )
        return False

    def _mark_failed(message: str) -> bool:
        # Record a FAILED condition and return the value the caller returns.
        failure = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="SchedulerPrepare",
            message=message,
        )
        new_run_status(run=run, condition=failure)
        return False

    try:
        compiled_at = now()
        # The resolved values are not used here; the unpacking is kept so the
        # call still expects a 2-tuple.
        _, _ = resolver.resolve(run=run, compiled_at=compiled_at, eager=eager)
    except PolyaxonCompilerError as e:
        return _mark_failed(f"Failed to compile.\n{e}")
    except Exception as e:
        # Any other error is reported on the run instead of propagating.
        return _mark_failed(f"Compiler received an internal error.\n{e}")

    condition = V1StatusCondition.get_condition(
        type=V1Statuses.COMPILED,
        status="True",
        reason="SchedulerPrepare",
        message="Run is compiled",
        last_update_time=compiled_at,
    )
    new_run_status(run=run, condition=condition)

    if run.pending:
        return False

    if eager:
        runs_start(run_id=run.id, run=run)
        return False

    return True
def resume_run(
    run: BaseRun,
    user_id: int = None,
    name: str = None,
    description: str = None,
    content: str = None,
    readme: str = None,
    tags: List[str] = None,
) -> BaseRun:
    """Re-initialise ``run`` from its raw content and mark it as resuming.

    Each falsy override falls back to the value currently stored on ``run``;
    ``content`` is passed to ``operations.init_run`` as ``override``. The
    fields of the instance returned by ``init_run`` are copied onto ``run``,
    the run is saved, and a ``RESUMING`` condition is recorded.

    Returns:
        The same ``run`` object that was passed in.
    """
    _, instance = operations.init_run(
        project_id=run.project_id,
        user_id=user_id or run.user_id,
        name=name or run.name,
        description=description or run.description,
        readme=readme or run.readme,
        op_spec=V1Operation.read(run.raw_content),
        tags=tags or run.tags,
        override=content,
    )

    # Copy the re-initialised fields back onto the existing run.
    for field in (
        "user_id",
        "name",
        "description",
        "readme",
        "content",
        "raw_content",
        "tags",
    ):
        setattr(run, field, getattr(instance, field))
    run.save()

    resuming = V1StatusCondition.get_condition(type=V1Statuses.RESUMING, status=True)
    new_run_status(run, condition=resuming)
    return run
def test_new_run_status_created(self, auditor_record):
    """A ``CREATED`` condition is expected to trigger no auditor record."""
    created = V1StatusCondition.get_condition(type=V1Statuses.CREATED, status=True)
    new_run_status(self.run, condition=created)
    assert auditor_record.call_count == 0
def test_get(self):
    """GET the endpoint before and after adding status conditions.

    The response is expected to match the serializer output each time and to
    list 0, then 1, then 2 status conditions.
    """

    def fetch_and_check(expected_conditions):
        resp = self.client.get(self.url)
        assert resp.status_code == status.HTTP_200_OK
        payload = resp.data
        assert len(payload["status_conditions"]) == expected_conditions
        assert payload == self.serializer_class(self.object).data

    # No status has been recorded yet.
    fetch_and_check(0)

    running = V1StatusCondition.get_condition(type=V1Statuses.RUNNING, status=True)
    new_run_status(self.object, condition=running)
    self.object.refresh_from_db()
    fetch_and_check(1)

    new_run_stop_status(run=self.object, message="foo")
    self.object.refresh_from_db()
    fetch_and_check(2)
def create_run(
    project_id: int,
    user_id: int,
    name: str = None,
    description: str = None,
    readme: str = None,
    tags: List[str] = None,
    raw_content: str = None,
) -> BaseRun:
    """Create a non-managed job run with an initial ``CREATED`` condition.

    Args:
        project_id: Project the run belongs to.
        user_id: User the run belongs to.
        name: Optional run name.
        description: Optional run description.
        readme: Optional readme text.
        tags: Optional list of tags (annotated as strings, matching
            ``resume_run``).
        raw_content: Optional raw content stored on the run.

    Returns:
        The instance created through ``get_run_model().objects.create``.
    """
    # The run starts with a single serialized "created" condition.
    created_condition = V1StatusCondition.get_condition(
        type=V1Statuses.CREATED,
        status="True",
        reason="PolyaxonRunCreated",
        message="Run is created",
    )
    return get_run_model().objects.create(
        project_id=project_id,
        user_id=user_id,
        name=name,
        description=description,
        readme=readme,
        tags=tags,
        kind=V1RunKind.JOB,
        is_managed=False,
        raw_content=raw_content,
        status_conditions=[created_condition.to_dict()],
    )
def setUp(self):
    """Run the parent setup, then put ``self.object`` into ``STOPPED``."""
    super().setUp()
    stopped = V1StatusCondition.get_condition(type=V1Statuses.STOPPED, status=True)
    new_run_status(self.object, condition=stopped)
def runs_start(run_id: int, run: Optional[BaseRun] = None):
    """Queue a compiled, managed run and start it through the manager.

    Args:
        run_id: Identifier passed to ``get_run`` and used in the log message.
        run: Optional run instance forwarded to ``get_run``; defaults to
            ``None`` so callers may pass only ``run_id``.

    Nothing happens when the run cannot be resolved, is not managed, or is not
    in a compiled status. Otherwise a ``QUEUED`` condition is recorded and, when
    running in-cluster for a service or job, ``manager.start`` is called. A
    ``PolyaxonK8SError`` is converted into a ``FAILED`` condition.
    """
    run = get_run(run_id=run_id, run=run)
    if not run:
        return

    if not run.is_managed:
        return

    if not LifeCycle.is_compiled(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.QUEUED,
        )
        return

    condition = V1StatusCondition.get_condition(
        type=V1Statuses.QUEUED,
        status="True",
        reason="PolyaxonRunQueued",
        message="Run is queued",
    )
    new_run_status(run=run, condition=condition)

    try:
        in_cluster = conf.get(K8S_IN_CLUSTER)
        if in_cluster and (run.is_service or run.is_job):
            manager.start(
                content=run.content,
                owner_name=run.project.owner.name,
                project_name=run.project.name,
                run_name=run.name,
                run_uuid=run.uuid.hex,
                run_kind=run.kind,
                namespace=conf.get(K8S_NAMESPACE),
                in_cluster=in_cluster,
                default_auth=False,
            )
    except PolyaxonK8SError as e:
        # Report the start failure on the run instead of raising.
        condition = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="PolyaxonRunFailed",
            message="Could not start the job {}".format(e),
        )
        new_run_status(run=run, condition=condition)
def runs_start(run_id: int, run: Optional[BaseRun] = None):
    """Queue a compiled, managed run and start it through the manager.

    Args:
        run_id: Identifier passed to ``get_run`` and used in the log message.
        run: Optional run instance forwarded to ``get_run``; defaults to
            ``None`` so callers may pass only ``run_id``.

    Nothing happens when the run cannot be resolved, is not managed, or is not
    in a compiled status. Otherwise a ``QUEUED`` condition is recorded and, when
    running in-cluster for a service or job, ``manager.start`` is called. Any
    exception raised while starting is converted into a ``FAILED`` condition
    whose message includes the exception repr and the traceback.
    """
    run = get_run(run_id=run_id, run=run)
    if not run:
        return

    if not run.is_managed:
        return

    if not LifeCycle.is_compiled(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.QUEUED,
        )
        return

    condition = V1StatusCondition.get_condition(
        type=V1Statuses.QUEUED,
        status="True",
        reason="PolyaxonRunQueued",
        message="Run is queued",
    )
    new_run_status(run=run, condition=condition)

    def _log_error(exc: Exception, message: str = None):
        # Record a FAILED condition describing ``exc`` on the run.
        message = message or "Could not start the operation.\n"
        message += "error: {}\n{}".format(repr(exc), traceback.format_exc())
        cond = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="PolyaxonRunFailed",
            message=message,
        )
        new_run_status(run=run, condition=cond)

    try:
        in_cluster = conf.get(K8S_IN_CLUSTER)
        if in_cluster and (run.is_service or run.is_job):
            manager.start(
                content=run.content,
                owner_name=run.project.owner.name,
                project_name=run.project.name,
                run_name=run.name,
                run_uuid=run.uuid.hex,
                run_kind=run.kind,
                namespace=conf.get(K8S_NAMESPACE),
                in_cluster=in_cluster,
                default_auth=False,
            )
        return
    except (PolyaxonK8SError, ApiException) as e:
        _log_error(
            exc=e, message="Kubernetes manager could not start the operation.\n"
        )
    except PolypodException as e:
        _log_error(exc=e, message="Failed converting the run manifest.\n")
    except Exception as e:
        # Last-resort handler: the failure is reported on the run, not raised.
        _log_error(exc=e, message="Failed with unknown exception.\n")
def test_start_run(self, manager_start):
    """Starting a compiled run is expected to call the manager once."""
    experiment = RunFactory(project=self.project, user=self.user)
    compiled = V1StatusCondition.get_condition(type=V1Statuses.COMPILED, status=True)
    new_run_status(run=experiment, condition=compiled)

    runs_start(run_id=experiment.id)

    assert manager_start.call_count == 1
def test_status_update_results_in_new_updated_at_datetime(self):
    """Each new status is expected to move ``updated_at`` forward.

    The same ``STARTING`` condition is sent twice; ``self.run.updated_at`` must
    be strictly later after each call.
    """
    previous = self.run.updated_at
    for _ in range(2):
        # Create new status
        starting = V1StatusCondition.get_condition(
            type=V1Statuses.STARTING, status=True
        )
        new_run_status(self.run, condition=starting)
        assert previous < self.run.updated_at
        previous = self.run.updated_at
def new_run_stop_status(run, message):
    """Record a ``STOPPED`` condition on ``run``.

    A truthy ``message`` is appended to the default "Run is stopped" text.
    """
    if message:
        text = f"Run is stopped; {message}"
    else:
        text = "Run is stopped"

    stopped = V1StatusCondition.get_condition(
        type=V1Statuses.STOPPED,
        status="True",
        reason="PolyaxonRunStopped",
        message=text,
    )
    new_run_status(run=run, condition=stopped)
def _log_error(exc: Exception, message: str = None):
    """Record a ``FAILED`` condition that describes ``exc``.

    The condition message is ``message`` (or a default prefix when it is
    falsy) followed by the exception repr and the current traceback.

    Note: ``run`` is not a parameter; it is resolved from the surrounding scope.
    """
    prefix = message or "Could not start the operation.\n"
    details = prefix + "error: {}\n{}".format(repr(exc), traceback.format_exc())
    failed = V1StatusCondition.get_condition(
        type=V1Statuses.FAILED,
        status="True",
        reason="PolyaxonRunFailed",
        message=details,
    )
    new_run_status(run=run, condition=failed)
def test_new_status_equality(self):
    """Repeated conditions of the same type are expected not to accumulate.

    Re-sending ``SCHEDULED`` (even with a different message) must keep a single
    stored condition; a ``RUNNING`` condition must add a second one.
    """

    def push(status_type, **extra):
        # Send a freshly built condition and reload the run from the database.
        new_run_status(
            self.run,
            condition=V1StatusCondition.get_condition(
                type=status_type, status=True, reason="foo", **extra
            ),
        )
        self.run.refresh_from_db()

    push(V1Statuses.SCHEDULED)
    assert len(self.run.status_conditions) == 1

    # Same condition
    push(V1Statuses.SCHEDULED)
    assert len(self.run.status_conditions) == 1

    # Same type, different message
    push(V1Statuses.SCHEDULED, message="New message")
    assert len(self.run.status_conditions) == 1

    # New condition type
    push(V1Statuses.RUNNING, message="New message")
    assert len(self.run.status_conditions) == 2
def log_agent_status(self, status: str, reason: str = None, message: str = None):
    """Send a status condition for this agent through the API client.

    Builds a condition from ``status``, ``reason`` and ``message`` and posts it
    with ``create_agent_status`` using ``async_req=True``.
    """
    body = {
        "condition": V1StatusCondition.get_condition(
            type=status, status=True, reason=reason, message=message
        )
    }
    self.client.agents_v1.create_agent_status(
        owner=self.owner,
        uuid=self.agent_uuid,
        body=body,
        async_req=True,
    )
def test_new_run_status_scheduled(self, auditor_record):
    """A ``SCHEDULED`` condition is expected to record one new-status event."""
    scheduled = V1StatusCondition.get_condition(
        type=V1Statuses.SCHEDULED, status=True
    )
    new_run_status(self.run, condition=scheduled)

    assert auditor_record.call_count == 1
    # The auditor must have been called with keyword arguments only.
    positional, keywords = auditor_record.call_args
    assert positional == ()
    assert keywords["event_type"] == run_events.RUN_NEW_STATUS
def create_one(self):
    """Create a run via the parent class, set it running, then stop it."""
    run = super().create_one()
    running = V1StatusCondition.get_condition(
        type=V1Statuses.RUNNING,
        status="True",
        reason="Run is running",
        message="foo",
    )
    new_run_status(run, running)
    new_run_stop_status(run, "stopping")
    return run
def create_status(view, serializer):
    """Record the condition carried by ``serializer`` on ``view.run``.

    Nothing is recorded when the validated data is empty, when it has no
    truthy ``"condition"`` entry, or when the built condition is falsy.
    """
    # The result of is_valid() is not checked; only validated_data is used.
    serializer.is_valid()
    payload = serializer.validated_data
    if not payload:
        return

    raw_condition = payload.get("condition")
    if not raw_condition:
        return

    condition = V1StatusCondition.get_condition(**raw_condition)
    if condition:
        new_run_status(run=view.run, condition=condition)
def stop_run(view, request, *args, **kwargs):
    """Handle a stop request for ``view.run``.

    A run whose status is already done is left untouched. Otherwise a
    ``STOPPING`` condition is recorded and the view audits the request. The
    response is an empty 200 in both cases.
    """
    if not LifeCycle.is_done(view.run.status):
        stopping = V1StatusCondition.get_condition(
            type=V1Statuses.STOPPING,
            status="True",
            reason="PolyaxonRunStopping",
            message="User requested to stop the run.",
        )
        new_run_status(run=view.run, condition=stopping)
        view.audit(request, *args, **kwargs)

    return Response(status=status.HTTP_200_OK, data={})
def stop_runs(view, request, actor, *args, **kwargs):
    """Stop the runs of ``view.project`` listed under ``"uuids"`` in the request.

    Runs whose status is in ``LifeCycle.SAFE_STOP_VALUES`` get a ``STOPPED``
    condition directly. After that, the remaining runs whose status is not in
    ``LifeCycle.DONE_OR_IN_PROGRESS_VALUES`` get a ``STOPPING`` condition and an
    audit record each. Always returns an empty 200 response.
    """
    run_model = get_run_model()

    # Immediate stop
    safe_to_stop = run_model.objects.filter(
        project=view.project, uuid__in=request.data.get("uuids", [])
    ).filter(status__in=LifeCycle.SAFE_STOP_VALUES)
    stopped = V1StatusCondition.get_condition(
        type=V1Statuses.STOPPED,
        status="True",
        reason="EventHandler",
        message="User requested to stop the run.",
    )
    bulk_new_run_status(safe_to_stop, stopped)

    # Built only after the immediate stop above has been applied.
    still_active = run_model.objects.filter(
        project=view.project, uuid__in=request.data.get("uuids", [])
    ).exclude(status__in=LifeCycle.DONE_OR_IN_PROGRESS_VALUES)
    runs = list(still_active)
    stopping = V1StatusCondition.get_condition(
        type=V1Statuses.STOPPING,
        status="True",
        reason="EventHandler",
        message="User requested to stop the run.",
    )
    bulk_new_run_status(runs, stopping)

    for run in runs:
        auditor.record(
            event_type=RUN_STOPPED_ACTOR,
            instance=run,
            actor_id=actor.id,
            actor_name=actor.username,
            owner_id=view.project.owner_id,
            owner_name=view.owner_name,
            project_name=view.project_name,
        )

    return Response(status=status.HTTP_200_OK, data={})
def runs_prepare(run_id: int, run: Optional[BaseRun] = None, eager: bool = False):
    """Compile a run and record the resulting status condition.

    Args:
        run_id: Identifier passed to ``get_run`` and used in the log message.
        run: Optional run instance forwarded to ``get_run``; defaults to
            ``None`` so callers may pass only ``run_id``.
        eager: When true, ``runs_start`` is called right after a successful
            compilation.

    Returns:
        Always ``None``. A ``PolyaxonCompilerError`` is recorded as a
        ``FAILED`` condition on the run instead of being raised.
    """
    run = get_run(run_id=run_id, run=run)
    if not run:
        return None

    if not LifeCycle.is_compilable(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.COMPILED,
        )
        return None

    try:
        compiled_at = now()
        # The resolved values are not used here; the unpacking is kept so the
        # call still expects a 2-tuple.
        _, _ = resolver.resolve(run=run, compiled_at=compiled_at)
    except PolyaxonCompilerError as e:
        condition = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="PolyaxonRunFailed",
            message=f"Run compilation error: {e}",
        )
        new_run_status(run=run, condition=condition)
        return None

    condition = V1StatusCondition.get_condition(
        type=V1Statuses.COMPILED,
        status="True",
        reason="PolyaxonRunCompiler",
        message="Run is compiled",
        last_update_time=compiled_at,
    )
    new_run_status(run=run, condition=condition)

    if eager:
        runs_start(run_id=run.id, run=run)
    return None
def test_resume_undone_run(self):
    """Resuming a run that is still ``RUNNING`` is expected to be rejected.

    The endpoint must answer 400, send nothing to the workers, and leave the
    number of runs unchanged.
    """
    running = V1StatusCondition.get_condition(type=V1Statuses.RUNNING, status=True)
    new_run_status(self.object, condition=running)

    payload = {}
    assert self.queryset.count() == 1
    with patch("polycommon.workers.send") as workers_send:
        resp = self.client.post(self.url + "resume/", payload)

    assert resp.status_code == status.HTTP_400_BAD_REQUEST
    assert workers_send.call_count == 0
    assert self.queryset.count() == 1
def new_run_stopping_status(run, message) -> bool:
    """Record a ``STOPPING`` condition on ``run`` unless it is already done.

    "Done" is decided by ``LifeCycle.is_done(run.status, progressing=True)``.
    A truthy ``message`` is appended to the default "Run is stopping" text.

    Returns:
        ``False`` when the run was considered done and nothing was recorded,
        ``True`` after the condition has been recorded.
    """
    already_done = LifeCycle.is_done(run.status, progressing=True)
    if already_done:
        return False

    if message:
        text = f"Run is stopping; {message}"
    else:
        text = "Run is stopping"

    stopping = V1StatusCondition.get_condition(
        type=V1Statuses.STOPPING,
        status="True",
        reason="PolyaxonRunStopping",
        message=text,
    )
    new_run_status(run=run, condition=stopping)
    return True
def test_new_run_status_succeeded(self, auditor_record):
    """A ``SUCCEEDED`` condition is expected to record three events in order.

    The expected event types are new-status, succeeded and done, each recorded
    with keyword arguments only.
    """
    succeeded = V1StatusCondition.get_condition(
        type=V1Statuses.SUCCEEDED, status=True
    )
    new_run_status(self.run, condition=succeeded)

    assert auditor_record.call_count == 3
    calls = auditor_record.call_args_list
    expected_events = [
        run_events.RUN_NEW_STATUS,
        run_events.RUN_SUCCEEDED,
        run_events.RUN_DONE,
    ]
    # Index 0 of a recorded call holds positional args, index 1 the kwargs.
    for position in range(3):
        assert calls[position][0] == ()
    for position, event_type in enumerate(expected_events):
        assert calls[position][1]["event_type"] == event_type
def test_equality_apply(self):
    """Filter runs by status with an equality condition and its negation.

    With a single ``SCHEDULED`` run, ``eq`` is expected to match on
    ``SCHEDULED`` only, and the negated condition on ``SUCCEEDED`` only.
    """
    eq_cond = EqualityCondition(op="eq")
    neq_cond = EqualityCondition(op="eq", negation=True)

    scheduled = V1StatusCondition.get_condition(
        type=V1Statuses.SCHEDULED, status=True
    )
    new_run_status(run=self.run, condition=scheduled)

    def matching(cond, status_value):
        # Number of runs selected by ``cond`` for the given status value.
        return cond.apply(
            queryset=Run.objects,
            name="status",
            params=status_value,
            query_backend=Q,
            timezone=settings.TIME_ZONE,
        ).count()

    # eq
    assert matching(eq_cond, V1Statuses.SCHEDULED) == 1
    assert matching(eq_cond, V1Statuses.SUCCEEDED) == 0

    # neq
    assert matching(neq_cond, V1Statuses.SCHEDULED) == 0
    assert matching(neq_cond, V1Statuses.SUCCEEDED) == 1
def test_prepare_run_of_already_skipped_run(self, mock_resolve):
    """A new run is expected to compile even when another run was skipped."""
    mock_resolve.return_value = (None, MagicMock(cache=V1Cache(disable=False)))

    # First run: marked as skipped.
    skipped_run = RunFactory(project=self.project, user=self.user)
    skipped = V1StatusCondition.get_condition(type=V1Statuses.SKIPPED, status=True)
    new_run_status(run=skipped_run, condition=skipped)

    # Second run: prepared and expected to end up compiled.
    new_experiment = RunFactory(project=self.project, user=self.user)
    runs_prepare(run_id=new_experiment.id)
    new_experiment.refresh_from_db()

    assert new_experiment.status == V1Statuses.COMPILED
def setUp(self):
    """Create a project with four running runs and build the stop URL."""
    super().setUp()
    self.project = ProjectFactory()

    # Create every run first, then move each of them to RUNNING.
    created = []
    for _ in range(4):
        created.append(self.factory_class(project=self.project, user=self.user))
    self.objects = created

    for run in self.objects:
        running = V1StatusCondition.get_condition(
            type=V1Statuses.RUNNING, status=True
        )
        new_run_status(run=run, condition=running)

    self.url = "/{}/{}/{}/runs/stop/".format(
        API_V1, self.user.username, self.project.name
    )
def notify(kind, owner, project, run_uuid, run_name, condition):
    """Notifier command."""
    # Imports are kept local to the function, as in the original.
    import ujson

    from polyaxon.lifecycle import V1StatusCondition
    from polyaxon.notifiers import NOTIFIERS, NotificationSpec

    # ``condition`` arrives as a JSON string: decode it, then build the
    # condition object from the decoded mapping.
    condition_data = ujson.loads(condition)
    parsed_condition = V1StatusCondition.get_condition(**condition_data)

    spec = NotificationSpec(
        kind=kind,
        owner=owner,
        project=project,
        uuid=run_uuid,
        name=run_name,
        condition=parsed_condition,
    )
    # Dispatch to the notifier registered for this kind.
    notifier = NOTIFIERS[kind]
    notifier.execute(notification=spec)
def set_entity_status(entity, condition: V1StatusCondition):
    """Apply ``condition`` to ``entity`` and update its conditions list.

    ``entity.status`` is set to ``condition.type``. If the entity already has
    conditions and the last one compares equal to ``condition``, that entry is
    replaced; otherwise the condition is appended. An entity without
    conditions gets a one-element list.

    Args:
        entity: Object exposing ``status`` and ``status_conditions``.
        condition: Condition to apply. ``None`` leaves the entity untouched
            (previously this raised ``AttributeError`` before the guard ran).

    Returns:
        The same ``entity`` object.
    """
    # Guard first: reading ``condition.type`` on None would raise.
    if condition is None:
        return entity

    entity.status = condition.type

    # A falsy (but not None) condition only updates the status, as before.
    if not condition:
        return entity

    if entity.status_conditions:
        history = to_list(entity.status_conditions, check_none=True)
        last_condition = V1StatusCondition.get_condition(**history[-1])
        if last_condition == condition:
            # Same condition as the latest entry: replace instead of growing.
            history[-1] = condition.to_dict()
        else:
            history.append(condition.to_dict())
    else:
        history = [condition.to_dict()]

    # ``history`` always holds at least one entry at this point.
    entity.status_conditions = history
    return entity