def test_status_transition(self):
    """Every change of status type should append one entry to ``status_conditions``."""
    # Initial scheduled condition (no message)
    scheduled = V1StatusCondition.get_condition(
        type=V1Statuses.SCHEDULED, status=True, reason="foo"
    )
    new_run_status(self.run, condition=scheduled)
    self.run.refresh_from_db()
    assert len(self.run.status_conditions) == 1

    # Alternate running / warning conditions; each one is expected to be recorded
    alternating_types = (
        V1Statuses.RUNNING,
        V1Statuses.WARNING,
        V1Statuses.RUNNING,
        V1Statuses.WARNING,
    )
    for expected_count, status_type in enumerate(alternating_types, start=2):
        next_condition = V1StatusCondition.get_condition(
            type=status_type,
            status=True,
            reason="foo",
            message="New message",
        )
        new_run_status(self.run, condition=next_condition)
        self.run.refresh_from_db()
        assert len(self.run.status_conditions) == expected_count
def setUp(self):
    """Run the parent fixture setup, then record a STOPPED condition on ``self.object``."""
    super().setUp()
    stopped = V1StatusCondition.get_condition(type=V1Statuses.STOPPED, status=True)
    new_run_status(self.object, condition=stopped)
def test_new_run_status_created(self, auditor_record):
    """A CREATED condition must not trigger any auditor record call."""
    created = V1StatusCondition.get_condition(type=V1Statuses.CREATED, status=True)
    new_run_status(self.run, condition=created)
    assert auditor_record.call_count == 0
def test_get(self):
    """The endpoint should return the serialized object with its current status conditions."""

    def fetch_and_check(expected_conditions):
        # GET the resource and compare it with the serializer output
        response = self.client.get(self.url)
        assert response.status_code == status.HTTP_200_OK
        payload = response.data
        assert len(payload["status_conditions"]) == expected_conditions
        assert payload == self.serializer_class(self.object).data

    # No condition recorded yet
    fetch_and_check(0)

    # One condition after moving to running
    running = V1StatusCondition.get_condition(type=V1Statuses.RUNNING, status=True)
    new_run_status(self.object, condition=running)
    self.object.refresh_from_db()
    fetch_and_check(1)

    # Two conditions after the stop status
    new_run_stop_status(run=self.object, message="foo")
    self.object.refresh_from_db()
    fetch_and_check(2)
def resume_run(
    run: BaseRun,
    user_id: int = None,
    name: str = None,
    description: str = None,
    content: str = None,
    readme: str = None,
    tags: List[str] = None,
) -> BaseRun:
    """Re-initialize ``run`` from its raw content and record a RESUMING condition.

    Each optional argument overrides the matching value stored on ``run``;
    a falsy argument falls back to the run's current value. ``content`` is
    passed to ``operations.init_run`` as the ``override``.

    Returns:
        The same ``run`` instance, saved with the refreshed fields.
    """
    spec = V1Operation.read(run.raw_content)
    # Only the initialized instance is used; the compiled operation is discarded.
    _, fresh = operations.init_run(
        project_id=run.project_id,
        user_id=user_id or run.user_id,
        name=name or run.name,
        description=description or run.description,
        readme=readme or run.readme,
        op_spec=spec,
        tags=tags or run.tags,
        override=content,
    )

    # Copy the re-initialized values back onto the existing run.
    copied_fields = (
        "user_id",
        "name",
        "description",
        "readme",
        "content",
        "raw_content",
        "tags",
    )
    for field_name in copied_fields:
        setattr(run, field_name, getattr(fresh, field_name))
    run.save()

    resuming = V1StatusCondition.get_condition(type=V1Statuses.RESUMING, status=True)
    new_run_status(run, condition=resuming)
    return run
def runs_start(run_id: int, run: Optional[BaseRun] = None):
    """Mark a compiled, managed run as QUEUED and ask the manager to start it.

    Args:
        run_id: Identifier of the run; passed to ``get_run`` and used in the
            log message.
        run: Optional run instance forwarded to ``get_run``. Defaults to
            ``None`` so callers that only hold an id can omit it.

    Returns:
        Always ``None``. The function returns early when no run is resolved,
        when the run is not managed, or when its status is not compiled.
        Errors raised while starting are not propagated: they are recorded on
        the run as a FAILED status condition.
    """
    run = get_run(run_id=run_id, run=run)
    if not run:
        return

    if not run.is_managed:
        return

    if not LifeCycle.is_compiled(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.QUEUED,
        )
        return

    condition = V1StatusCondition.get_condition(
        type=V1Statuses.QUEUED,
        status="True",
        reason="PolyaxonRunQueued",
        message="Run is queued",
    )
    new_run_status(run=run, condition=condition)

    def _log_error(exc: Exception, message: str = None):
        # Record the failure, including the traceback, as a FAILED condition.
        message = message or "Could not start the operation.\n"
        message += "error: {}\n{}".format(repr(exc), traceback.format_exc())
        cond = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="PolyaxonRunFailed",
            message=message,
        )
        new_run_status(run=run, condition=cond)

    try:
        in_cluster = conf.get(K8S_IN_CLUSTER)
        # Only in-cluster services and jobs are handed to the manager here.
        if in_cluster and (run.is_service or run.is_job):
            manager.start(
                content=run.content,
                owner_name=run.project.owner.name,
                project_name=run.project.name,
                run_name=run.name,
                run_uuid=run.uuid.hex,
                run_kind=run.kind,
                namespace=conf.get(K8S_NAMESPACE),
                in_cluster=in_cluster,
                default_auth=False,
            )
        return
    except (PolyaxonK8SError, ApiException) as e:
        _log_error(
            exc=e, message="Kubernetes manager could not start the operation.\n"
        )
    except PolypodException as e:
        _log_error(exc=e, message="Failed converting the run manifest.\n")
    except Exception as e:
        # Catch-all so any start failure ends up on the run as FAILED.
        _log_error(exc=e, message="Failed with unknown exception.\n")
def test_start_run(self, manager_start):
    """Starting a run that is in the COMPILED status should call the manager once."""
    compiled_run = RunFactory(project=self.project, user=self.user)
    compiled = V1StatusCondition.get_condition(type=V1Statuses.COMPILED, status=True)
    new_run_status(run=compiled_run, condition=compiled)

    runs_start(run_id=compiled_run.id)

    assert manager_start.call_count == 1
def _log_error(exc: Exception, message: str = None):
    """Record ``exc`` on the run as a FAILED status condition.

    The condition message is ``message`` (or a generic default when it is
    falsy) followed by the exception repr and the current traceback.
    ``run`` is not a parameter: it is resolved from the enclosing scope.
    """
    prefix = message or "Could not start the operation.\n"
    details = "error: {}\n{}".format(repr(exc), traceback.format_exc())
    failed = V1StatusCondition.get_condition(
        type=V1Statuses.FAILED,
        status="True",
        reason="PolyaxonRunFailed",
        message=prefix + details,
    )
    new_run_status(run=run, condition=failed)
def test_new_run_status_scheduled(self, auditor_record):
    """A SCHEDULED condition should emit exactly one RUN_NEW_STATUS event."""
    scheduled = V1StatusCondition.get_condition(
        type=V1Statuses.SCHEDULED, status=True
    )
    new_run_status(self.run, condition=scheduled)

    assert auditor_record.call_count == 1
    positional, keyword = auditor_record.call_args
    # The event is recorded with keyword arguments only
    assert positional == ()
    assert keyword["event_type"] == run_events.RUN_NEW_STATUS
def create_status(view, serializer):
    """Create a new status on ``view.run`` from the serializer's ``condition`` payload.

    Nothing happens when the validated data is empty, when it carries no
    ``condition`` entry, or when the built condition is falsy. The result of
    ``serializer.is_valid()`` is not inspected.
    """
    serializer.is_valid()
    payload = serializer.validated_data
    if not payload:
        return

    raw_condition = payload.get("condition")
    if not raw_condition:
        return

    built = V1StatusCondition.get_condition(**raw_condition)
    if built:
        new_run_status(run=view.run, condition=built)
def create_one(self):
    """Create one run via the parent factory, mark it running, then apply the stop status."""
    created = super().create_one()
    new_run_status(
        created,
        V1StatusCondition.get_condition(
            type=V1Statuses.RUNNING,
            status="True",
            reason="Run is running",
            message="foo",
        ),
    )
    new_run_stop_status(created, "stopping")
    return created
def stop_run(view, request, *args, **kwargs):
    """Record a STOPPING condition on ``view.run`` unless the run is already done.

    The response is always ``200 OK`` with an empty body. The status update
    and the audit call are skipped when the run's status is already done.
    """
    if not LifeCycle.is_done(view.run.status):
        stopping = V1StatusCondition.get_condition(
            type=V1Statuses.STOPPING,
            status="True",
            reason="PolyaxonRunStopping",
            message="User requested to stop the run.",
        )
        new_run_status(run=view.run, condition=stopping)
        view.audit(request, *args, **kwargs)
    return Response(status=status.HTTP_200_OK, data={})
def test_resume_undone_run(self):
    """Resuming a run that is still RUNNING should be rejected with a 400 and send nothing."""
    running = V1StatusCondition.get_condition(type=V1Statuses.RUNNING, status=True)
    new_run_status(self.object, condition=running)

    payload = {}
    assert self.queryset.count() == 1
    resume_url = self.url + "resume/"
    with patch("polycommon.workers.send") as workers_send:
        response = self.client.post(resume_url, payload)

    assert response.status_code == status.HTTP_400_BAD_REQUEST
    assert workers_send.call_count == 0
    # No additional run was created
    assert self.queryset.count() == 1
def test_new_run_status_succeeded(self, auditor_record):
    """A SUCCEEDED condition should emit new-status, succeeded and done events, in that order."""
    succeeded = V1StatusCondition.get_condition(
        type=V1Statuses.SUCCEEDED, status=True
    )
    new_run_status(self.run, condition=succeeded)

    assert auditor_record.call_count == 3
    recorded_calls = auditor_record.call_args_list

    # Every event is recorded with keyword arguments only
    for recorded_call in recorded_calls:
        assert recorded_call[0] == ()

    expected_events = (
        run_events.RUN_NEW_STATUS,
        run_events.RUN_SUCCEEDED,
        run_events.RUN_DONE,
    )
    for recorded_call, expected_event in zip(recorded_calls, expected_events):
        assert recorded_call[1]["event_type"] == expected_event
def test_equality_apply(self):
    """``EqualityCondition`` should filter runs by status, with and without negation."""
    eq_cond = EqualityCondition(op="eq")
    neq_cond = EqualityCondition(op="eq", negation=True)
    scheduled = V1StatusCondition.get_condition(
        type=V1Statuses.SCHEDULED, status=True
    )
    new_run_status(run=self.run, condition=scheduled)

    def matching_runs(cond, status_value):
        # Apply the condition on the status field and count the matches
        filtered = cond.apply(
            queryset=Run.objects,
            name="status",
            params=status_value,
            query_backend=Q,
            timezone=settings.TIME_ZONE,
        )
        return filtered.count()

    # eq
    assert matching_runs(eq_cond, V1Statuses.SCHEDULED) == 1
    assert matching_runs(eq_cond, V1Statuses.SUCCEEDED) == 0

    # neq
    assert matching_runs(neq_cond, V1Statuses.SCHEDULED) == 0
    assert matching_runs(neq_cond, V1Statuses.SUCCEEDED) == 1
def test_prepare_run_of_already_skipped_run(self, mock_resolve):
    """Preparing a new run should end in COMPILED even when another run is SKIPPED."""
    # The mocked resolver returns a compiled operation with caching enabled
    compiled_operation = MagicMock(cache=V1Cache(disable=False))
    mock_resolve.return_value = (None, compiled_operation)

    skipped_run = RunFactory(project=self.project, user=self.user)
    skipped = V1StatusCondition.get_condition(type=V1Statuses.SKIPPED, status=True)
    new_run_status(run=skipped_run, condition=skipped)

    fresh_run = RunFactory(project=self.project, user=self.user)
    runs_prepare(run_id=fresh_run.id)
    fresh_run.refresh_from_db()

    assert fresh_run.status == V1Statuses.COMPILED
def setUp(self):
    """Create a project with four RUNNING runs and build the bulk stop URL."""
    super().setUp()
    self.project = ProjectFactory()

    created = []
    for _ in range(4):
        created.append(self.factory_class(project=self.project, user=self.user))
    self.objects = created

    # Move every run to the running status
    for run in self.objects:
        running = V1StatusCondition.get_condition(
            type=V1Statuses.RUNNING, status=True
        )
        new_run_status(run=run, condition=running)

    self.url = "/{}/{}/{}/runs/stop/".format(
        API_V1, self.user.username, self.project.name
    )
def runs_start(run_id: int, run: Optional[BaseRun] = None):
    """Mark a compiled, managed run as QUEUED and ask the manager to start it.

    Args:
        run_id: Identifier of the run; passed to ``get_run`` and used in the
            log message.
        run: Optional run instance forwarded to ``get_run``. Defaults to
            ``None`` so callers that only hold an id can omit it.

    Returns:
        Always ``None``. The function returns early when no run is resolved,
        when the run is not managed, or when its status is not compiled.
        A ``PolyaxonK8SError`` raised while starting is not propagated: it is
        recorded on the run as a FAILED status condition.
    """
    run = get_run(run_id=run_id, run=run)
    if not run:
        return

    if not run.is_managed:
        return

    if not LifeCycle.is_compiled(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.QUEUED,
        )
        return

    condition = V1StatusCondition.get_condition(
        type=V1Statuses.QUEUED,
        status="True",
        reason="PolyaxonRunQueued",
        message="Run is queued",
    )
    new_run_status(run=run, condition=condition)

    try:
        in_cluster = conf.get(K8S_IN_CLUSTER)
        # Only in-cluster services and jobs are handed to the manager here.
        if in_cluster and (run.is_service or run.is_job):
            manager.start(
                content=run.content,
                owner_name=run.project.owner.name,
                project_name=run.project.name,
                run_name=run.name,
                run_uuid=run.uuid.hex,
                run_kind=run.kind,
                namespace=conf.get(K8S_NAMESPACE),
                in_cluster=in_cluster,
                default_auth=False,
            )
    except PolyaxonK8SError as e:
        condition = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="PolyaxonRunFailed",
            message="Could not start the job {}".format(e),
        )
        new_run_status(run=run, condition=condition)
def runs_prepare(
    run_id: int, run: Optional[BaseRun] = None, eager: bool = False
) -> bool:
    """Compile a run and record the resulting status condition.

    Args:
        run_id: Identifier of the run; passed to ``get_run`` and used in the
            log message.
        run: Optional run instance forwarded to ``get_run``. Defaults to
            ``None`` so callers that only hold an id can omit it.
        eager: When true, the run is started immediately through
            ``runs_start`` after a successful, approved compilation.

    Returns:
        ``True`` only when the run was compiled, is approved, and ``eager``
        is false. ``False`` in every other case: no run resolved, status not
        compilable, compilation error (recorded as a FAILED condition), run
        not approved, or eager start already performed here.
    """
    run = get_run(run_id=run_id, run=run)
    if not run:
        return False

    if not LifeCycle.is_compilable(run.status):
        _logger.info(
            "Run `%s` cannot transition from `%s` to `%s`.",
            run_id,
            run.status,
            V1Statuses.COMPILED,
        )
        return False

    # Taken before resolving so the COMPILED condition carries the same timestamp.
    compiled_at = now()
    try:
        resolver.resolve(run=run, compiled_at=compiled_at)
    except PolyaxonCompilerError as e:
        condition = V1StatusCondition.get_condition(
            type=V1Statuses.FAILED,
            status="True",
            reason="PolyaxonRunFailed",
            message=f"Run compilation error: {e}",
        )
        new_run_status(run=run, condition=condition)
        return False

    condition = V1StatusCondition.get_condition(
        type=V1Statuses.COMPILED,
        status="True",
        reason="PolyaxonRunCompiler",
        message="Run is compiled",
        last_update_time=compiled_at,
    )
    new_run_status(run=run, condition=condition)

    if not run.is_approved:
        return False

    if eager:
        # The start happens here, so False is returned after starting.
        runs_start(run_id=run.id, run=run)
        return False

    return True
def test_status_update_results_in_new_updated_at_datetime(self):
    """Each new status, even a repeated STARTING one, should move ``updated_at`` forward."""
    for _ in range(2):
        previous_updated_at = self.run.updated_at
        # Create new status
        starting = V1StatusCondition.get_condition(
            type=V1Statuses.STARTING, status=True
        )
        new_run_status(self.run, condition=starting)
        assert previous_updated_at < self.run.updated_at
def test_delete_from_running_status_archives_and_schedules_stop(self):
    """Deleting a RUNNING run should send the delete task and hide the run from ``objects``."""
    running = V1StatusCondition.get_condition(type=V1Statuses.RUNNING, status=True)
    new_run_status(self.object, condition=running)
    assert self.model_class.objects.count() == 1

    with patch("polycommon.workers.send") as workers_send:
        response = self.client.delete(self.url)

    # Exactly one task is sent, and it is the delete task
    assert workers_send.call_count == 1
    sent_tasks = {sent_call[0][0] for sent_call in workers_send.call_args_list}
    assert sent_tasks == {CoreSchedulerCeleryTasks.RUNS_DELETE}
    assert response.status_code == status.HTTP_204_NO_CONTENT

    # Deleted: gone from the default manager but still counted by ``all``
    assert self.model_class.objects.count() == 0
    assert self.model_class.all.count() == 1
def test_new_stopped_status_after_stopping(self):
    """After STOPPING, a RUNNING condition is not recorded but a STOPPED one is."""
    steps = (
        # Initial stopping condition
        (dict(type=V1Statuses.STOPPING, status=True, reason="foo"), 1),
        # A running condition after stopping should not add an entry
        (dict(type=V1Statuses.RUNNING, status=True, reason="foo"), 1),
        # The stopped condition should be appended
        (
            dict(
                type=V1Statuses.STOPPED,
                status=True,
                reason="foo",
                message="New message",
            ),
            2,
        ),
    )
    for condition_kwargs, expected_count in steps:
        condition = V1StatusCondition.get_condition(**condition_kwargs)
        new_run_status(self.run, condition=condition)
        self.run.refresh_from_db()
        assert len(self.run.status_conditions) == expected_count
def test_new_status_equality(self):
    """Conditions with an unchanged status type should not add entries; a new type should."""
    steps = (
        # First scheduled condition
        (dict(type=V1Statuses.SCHEDULED, status=True, reason="foo"), 1),
        # Same condition
        (dict(type=V1Statuses.SCHEDULED, status=True, reason="foo"), 1),
        # Same type, different message
        (
            dict(
                type=V1Statuses.SCHEDULED,
                status=True,
                reason="foo",
                message="New message",
            ),
            1,
        ),
        # New condition type
        (
            dict(
                type=V1Statuses.RUNNING,
                status=True,
                reason="foo",
                message="New message",
            ),
            2,
        ),
    )
    for condition_kwargs, expected_count in steps:
        condition = V1StatusCondition.get_condition(**condition_kwargs)
        new_run_status(self.run, condition=condition)
        self.run.refresh_from_db()
        assert len(self.run.status_conditions) == expected_count
def test_new_status_set_start_date(self):
    """Check how ``started_at``, ``finished_at`` and ``run_time`` follow status changes."""

    def assert_not_finished():
        assert self.run.finished_at is None
        assert self.run.run_time is None

    def assert_timing(expected_started_at, expected_finished_at):
        # Start/finish dates are stable and run_time is derived from them
        assert self.run.started_at == expected_started_at
        assert self.run.finished_at == expected_finished_at
        elapsed = self.run.finished_at - self.run.started_at
        assert self.run.run_time == elapsed.seconds

    def condition_type(index):
        return self.run.status_conditions[index]["type"]

    # No status change
    assert self.run.started_at is None
    created = V1StatusCondition.get_condition(type=V1Statuses.CREATED, status=True)
    new_run_status(self.run, condition=created)
    assert self.run.started_at is None
    assert_not_finished()

    scheduled = V1StatusCondition.get_condition(
        type=V1Statuses.SCHEDULED, status=True
    )
    new_run_status(self.run, condition=scheduled)
    self.run.refresh_from_db()
    assert self.run.started_at is None
    assert_not_finished()

    # Set a running status
    self.run.status = V1Statuses.CREATED
    starting = V1StatusCondition.get_condition(type=V1Statuses.STARTING, status=True)
    new_run_status(self.run, condition=starting)
    self.run.refresh_from_db()
    assert self.run.started_at is not None
    assert_not_finished()
    started_at = self.run.started_at

    self.run.status = V1Statuses.CREATED
    running = V1StatusCondition.get_condition(type=V1Statuses.RUNNING, status=True)
    new_run_status(self.run, condition=running)
    assert self.run.started_at == started_at
    assert_not_finished()
    assert len(self.run.status_conditions) == 3

    # First stopped status: sets the finish date
    stopped_first = V1StatusCondition.get_condition(
        type=V1Statuses.STOPPED,
        status="True",
        reason="Run is stopped",
        message="zombie error",
    )
    new_run_status(self.run, stopped_first)
    self.run.refresh_from_db()
    assert self.run.started_at == started_at
    assert self.run.finished_at is not None
    finished_at = self.run.finished_at
    assert self.run.run_time == (self.run.finished_at - self.run.started_at).seconds
    assert len(self.run.status_conditions) == 4
    assert condition_type(3) == V1Statuses.STOPPED
    assert self.run.status_conditions[3]["message"] == "zombie error"
    assert self.run.status_conditions[3]["reason"] == "Run is stopped"

    # Failed status after stopped: dates must not move
    failed = V1StatusCondition.get_condition(
        type=V1Statuses.FAILED,
        status="True",
        reason="Run failed",
        message="some error",
    )
    new_run_status(self.run, failed)
    self.run.refresh_from_db()
    assert_timing(started_at, finished_at)
    assert len(self.run.status_conditions) == 5
    assert condition_type(3) == V1Statuses.STOPPED
    assert condition_type(4) == V1Statuses.FAILED

    # Update the stopped status
    stopped_again = V1StatusCondition.get_condition(
        type=V1Statuses.STOPPED,
        status="True",
        reason="Run failed",
        message="some error",
    )
    new_run_status(self.run, stopped_again)
    self.run.refresh_from_db()
    assert_timing(started_at, finished_at)
    assert len(self.run.status_conditions) == 6
    assert condition_type(3) == V1Statuses.STOPPED
    assert condition_type(4) == V1Statuses.FAILED
    assert condition_type(5) == V1Statuses.STOPPED
    assert self.run.status_conditions[5]["message"] == "some error"
    assert self.run.status_conditions[5]["reason"] == "Run failed"
def test_range_apply(self):
    """``ValueCondition`` should filter runs by status for eq / in and their negations."""
    # Three runs with the statuses FAILED, STOPPED and RUNNING
    failed = V1StatusCondition.get_condition(type=V1Statuses.FAILED, status=True)
    new_run_status(self.run, condition=failed)
    run2 = RunFactory(project=self.project)
    stopped = V1StatusCondition.get_condition(type=V1Statuses.STOPPED, status=True)
    new_run_status(run2, condition=stopped)
    run3 = RunFactory(project=self.project)
    running = V1StatusCondition.get_condition(type=V1Statuses.RUNNING, status=True)
    new_run_status(run3, condition=running)

    eq_cond = ValueCondition(op="eq")
    neq_cond = ValueCondition(op="eq", negation=True)
    in_cond = ValueCondition(op="in")
    nin_cond = ValueCondition(op="in", negation=True)

    # (condition, params, expected number of matching runs)
    cases = (
        # eq
        (eq_cond, V1Statuses.STOPPED, 1),
        (eq_cond, "foo", 0),
        # neq
        (neq_cond, V1Statuses.STOPPED, 2),
        (neq_cond, "doo", 3),
        # in
        (in_cond, [V1Statuses.STOPPED, V1Statuses.RUNNING], 2),
        (in_cond, [V1Statuses.STOPPED, V1Statuses.RESUMING], 1),
        (in_cond, [V1Statuses.RESUMING, V1Statuses.SKIPPED], 0),
        (in_cond, [V1Statuses.FAILED, V1Statuses.STOPPED, V1Statuses.RUNNING], 3),
        # nin
        (nin_cond, [V1Statuses.STOPPED, V1Statuses.RUNNING], 1),
        (nin_cond, [V1Statuses.STOPPED, V1Statuses.RESUMING], 2),
        (nin_cond, [V1Statuses.RESUMING, V1Statuses.SKIPPED], 3),
        (nin_cond, [V1Statuses.FAILED, V1Statuses.STOPPED, V1Statuses.RUNNING], 0),
    )
    for cond, params, expected_count in cases:
        queryset = cond.apply(
            queryset=Run.objects,
            name="status",
            params=params,
            query_backend=Q,
            timezone=settings.TIME_ZONE,
        )
        assert queryset.count() == expected_count