def __init__(
    self,
    owner: str = None,
    project: str = None,
    run_uuid: str = None,
    client: PolyaxonClient = None,
):
    """Resolve the owner/project/run this client targets and set up the API client.

    Args:
        owner: Optional owner name.
        project: Optional project name.
        run_uuid: Optional run uuid; passed through `get_run_or_local`.
        client: Optional pre-built `PolyaxonClient`. When falsy, a new one is
            created unless `settings.CLIENT_CONFIG.is_offline` is set.

    Raises:
        PolyaxonClientException: If no project can be resolved, or if the
            owner or the project ends up empty.
    """
    try:
        full_name = get_project_full_name(owner=owner, project=project)
        owner, project = get_project_or_local(full_name)
    except PolyaxonClientException:
        # Best effort: keep the caller-provided values and try the run info below.
        pass

    if project is None:
        if not settings.CLIENT_CONFIG.is_managed:
            raise PolyaxonClientException("Please provide a valid project.")
        owner, project, env_run_uuid = get_run_info()
        # An explicitly passed run uuid takes precedence over the one from run info.
        run_uuid = run_uuid or env_run_uuid

    if not (owner and project):
        raise PolyaxonClientException("Please provide a valid project with owner.")

    self.client = client
    if not self.client and not settings.CLIENT_CONFIG.is_offline:
        self.client = PolyaxonClient()

    self._owner = owner
    self._project = project
    self._run_uuid = get_run_or_local(run_uuid)
    self._run_data = polyaxon_sdk.V1Run()
    self._namespace = None
def make_and_create_run(
    self, run_data: Tuple[str, str, str, str], default_auth: bool = False
):
    """Build a run resource with `make_run_resource` and hand it to the spawner.

    Args:
        run_data: Tuple indexed as (run instance, run kind, run name, content).
        default_auth: Forwarded unchanged to `make_run_resource`.

    Errors raised by the spawner are only logged here: a 409 `ApiException`
    and any other API error each get an info message, and every other
    exception is logged together with its traceback.
    """
    owner, project, uuid = get_run_info(run_instance=run_data[0])
    resource = self.make_run_resource(
        owner_name=owner,
        project_name=project,
        run_name=run_data[2],
        run_uuid=uuid,
        content=run_data[3],
        default_auth=default_auth,
    )
    if not resource:
        return

    try:
        self.spawner.create(run_uuid=uuid, run_kind=run_data[1], resource=resource)
    except ApiException as api_error:
        if api_error.status == 409:
            note = "Run already running, triggering an apply mechanism."
        else:
            note = "Run submission error."
        logger.info(note)
    except Exception as error:
        template = (
            "Run could not be cleaned. Agent failed converting run manifest: {}\n{}"
        )
        logger.info(template.format(repr(error), traceback.format_exc()))
def apply_run(self, run_data: Tuple[str, str, str, str]):
    """Apply a run's resource through the spawner and report the outcome.

    Args:
        run_data: Tuple indexed as (run instance, run kind, run name, content).

    Nothing is applied when `prepare_run_resource` returns a falsy value.
    On any exception the run is logged as failed and then cleaned.
    """
    owner, project, uuid = get_run_info(run_instance=run_data[0])
    resource = self.prepare_run_resource(
        owner_name=owner,
        project_name=project,
        run_name=run_data[2],
        run_uuid=uuid,
        content=run_data[3],
    )
    if not resource:
        return

    # Identification kwargs shared by the status-logging helpers.
    run_ref = {"run_owner": owner, "run_project": project, "run_uuid": uuid}
    try:
        self.spawner.apply(run_uuid=uuid, run_kind=run_data[1], resource=resource)
        self.log_run_running(**run_ref)
    except Exception as error:
        self.log_run_failed(exc=error, **run_ref)
        self.clean_run(run_uuid=uuid, run_kind=run_data[1])
def create_run(self, run_data: Tuple[str, str, str, str]):
    """Create a run through the spawner and report the outcome.

    Args:
        run_data: Tuple indexed as (run instance, run kind, run name, content).

    Behavior:
        * Nothing is created when `prepare_run_resource` returns a falsy
          value (same guard as `apply_run`).
        * On success the run is logged as scheduled.
        * A 409 `ApiException` falls back to `apply_run`.
        * Any other `ApiException`, and any other exception, marks the run
          as failed instead of being dropped silently.
    """
    run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
    resource = self.prepare_run_resource(
        owner_name=run_owner,
        project_name=run_project,
        run_name=run_data[2],
        run_uuid=run_uuid,
        content=run_data[3],
    )
    if not resource:
        # No resource to submit; do not call the spawner with an empty value.
        return

    try:
        self.spawner.create(
            run_uuid=run_uuid, run_kind=run_data[1], resource=resource
        )
        self.log_run_scheduled(
            run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
        )
    except ApiException as e:
        if e.status == 409:
            logger.info("Run already running, triggering an apply mechanism.")
            self.apply_run(run_data=run_data)
        else:
            # Previously swallowed: surface non-conflict API errors as failures.
            logger.info("Run submission error.")
            self.log_run_failed(
                run_owner=run_owner,
                run_project=run_project,
                run_uuid=run_uuid,
                exc=e,
            )
    except Exception as e:
        self.log_run_failed(
            run_owner=run_owner, run_project=run_project, run_uuid=run_uuid, exc=e
        )
def create_code_repo(
    repo_path: str, url: str, revision: str, connection: str = None
):
    """Set up a git repo at `repo_path` and log its code reference as lineage.

    Args:
        repo_path: Path passed to the git helpers.
        url: Repository url; its clone url is derived with `get_clone_url`.
        revision: Optional revision; checked out only when truthy.
        connection: Optional connection name stored on the logged artifact.

    Raises:
        PolyaxonContainerException: If the url cannot be parsed, or if the
            run info cannot be resolved while the API is enabled.
    """
    try:
        clone_url = get_clone_url(url)
    except Exception as e:
        raise PolyaxonContainerException("Error parsing url: {}.".format(url)) from e

    clone_git_repo(repo_path=repo_path, url=clone_url)
    set_remote(repo_path=repo_path, url=url)
    if revision:
        checkout_revision(repo_path=repo_path, revision=revision)

    # Lineage is only logged when the API is enabled.
    if settings.CLIENT_CONFIG.no_api:
        return

    try:
        owner, project, run_uuid = get_run_info()
    except PolyaxonClientException as e:
        raise PolyaxonContainerException(e)

    code_ref = get_code_reference(path=repo_path, url=url)
    lineage = V1RunArtifact(
        name=code_ref.get("commit"),
        kind=V1ArtifactKind.CODEREF,
        connection=connection,
        summary=code_ref,
        is_input=True,
    )
    run_client = RunClient(owner=owner, project=project, run_uuid=run_uuid)
    run_client.log_artifact_lineage(lineage)
def log_suggestions(suggestions: List[Dict]):
    """Log `suggestions` as outputs of the current run.

    Does nothing when `settings.CLIENT_CONFIG.no_api` is set.

    Raises:
        PolyaxonContainerException: If the run info cannot be resolved.
    """
    # Imports are kept local to the function, as in the original implementation.
    from polyaxon import settings
    from polyaxon.client import RunClient
    from polyaxon.env_vars.getters import get_run_info
    from polyaxon.exceptions import PolyaxonClientException, PolyaxonContainerException

    if settings.CLIENT_CONFIG.no_api:
        return

    try:
        owner, project, run_uuid = get_run_info()
    except PolyaxonClientException as e:
        raise PolyaxonContainerException(e)

    run_client = RunClient(owner=owner, project=project, run_uuid=run_uuid)
    run_client.log_outputs(suggestions=suggestions)
def stop_run(self, run_data: Tuple[str, str]):
    """Ask the spawner to stop a run and report the outcome.

    Args:
        run_data: Tuple indexed as (run instance, run kind).

    Behavior:
        * A 404 `ApiException` is treated as "already gone": the run is
          logged as stopped.
        * Any other `ApiException` is logged as a warning instead of being
          dropped silently; the run status is left untouched because the
          outcome of the stop request is unknown.
        * Any other exception marks the run as failed.
    """
    run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
    try:
        self.spawner.stop(run_uuid=run_uuid, run_kind=run_data[1])
    except ApiException as e:
        if e.status == 404:
            logger.info("Run does not exist anymore, it could have been stopped.")
            self.log_run_stopped(
                run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
            )
        else:
            # Keep a trace of unexpected API errors so failed stops are visible.
            logger.warning(
                "Agent failed stopping run %s: %s", run_uuid, repr(e)
            )
    except Exception as e:
        self.log_run_failed(
            run_owner=run_owner,
            run_project=run_project,
            run_uuid=run_uuid,
            exc=e,
            message="Agent failed stopping run.\n",
        )
def create_auth_context():
    """Initialize the auth context for the current run, with retries.

    Resolves the run info, then calls `impersonate` up to three times,
    sleeping 2s and 3s between attempts. No sleep or "retrying" message is
    emitted after the final failed attempt.

    Raises:
        PolyaxonContainerException: If the run info cannot be resolved, or if
            every impersonation attempt fails with `PolyaxonClientException`.
    """
    try:
        owner, project, run_uuid = get_run_info()
    except PolyaxonClientException as e:
        raise PolyaxonContainerException(e)

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            impersonate(owner=owner, project=project, run_uuid=run_uuid)
        except PolyaxonClientException:
            if attempt == max_attempts:
                # Out of attempts: fail right away instead of sleeping first.
                break
            print("Could not establish connection, retrying ...")
            # Linear back-off, same delays as before (2s, then 3s).
            time.sleep(attempt + 1)
        else:
            print("Auth context initialized.")
            return

    raise PolyaxonContainerException("Init job did not succeed authenticating job.")
def _submit_run(self, run_data: Tuple[str, str, str, str], sync_api=True):
    """Submit a run to the spawner, optionally syncing its status to the API.

    Args:
        run_data: Tuple indexed as (run instance, run kind, run name, content).
        sync_api: When true, the scheduled status and failures caused by
            non-API exceptions are logged through the status helpers.

    A 409 `ApiException` falls back to `apply_run`; any other `ApiException`
    is logged as failed regardless of `sync_api`.
    """
    owner, project, uuid = get_run_info(run_instance=run_data[0])
    resource = self.prepare_run_resource(
        owner_name=owner,
        project_name=project,
        run_name=run_data[2],
        run_uuid=uuid,
        content=run_data[3],
    )
    if not resource:
        return

    # Identification kwargs shared by the status-logging helpers.
    run_ref = {"run_owner": owner, "run_project": project, "run_uuid": uuid}
    try:
        self.spawner.create(run_uuid=uuid, run_kind=run_data[1], resource=resource)
        if sync_api:
            self.log_run_scheduled(**run_ref)
    except ApiException as api_error:
        if api_error.status != 409:
            logger.info("Run submission error.")
            self.log_run_failed(exc=api_error, **run_ref)
        else:
            logger.info("Run already running, triggering an apply mechanism.")
            self.apply_run(run_data=run_data)
    except Exception as error:
        if sync_api:
            self.log_run_failed(exc=error, **run_ref)
def create_dockerfile_lineage(dockerfile_path: str, summary: Dict):
    """Log a dockerfile as an input artifact of the current run.

    Args:
        dockerfile_path: Path to the dockerfile; nothing happens when falsy.
        summary: Summary stored on the logged artifact.

    Does nothing when `settings.CLIENT_CONFIG.no_api` is set.

    Raises:
        PolyaxonContainerException: If the run info cannot be resolved.
    """
    if not dockerfile_path:
        return

    artifact_name = os.path.basename(dockerfile_path)
    if settings.CLIENT_CONFIG.no_api:
        return

    try:
        owner, project, run_uuid = get_run_info()
    except PolyaxonClientException as e:
        raise PolyaxonContainerException(e)

    lineage = V1RunArtifact(
        name=artifact_name,
        kind=V1ArtifactKind.DOCKERFILE,
        path=RunClient.get_rel_asset_path(dockerfile_path),
        summary=summary,
        is_input=True,
    )
    run_client = RunClient(owner=owner, project=project, run_uuid=run_uuid)
    run_client.log_artifact_lineage(lineage)
def delete_run(self, run_data: Tuple[str, str, str, str]):
    """Clean a run and, when content is provided, make and create a new run from it.

    Args:
        run_data: Tuple indexed as (run instance, run kind, run name, content).
    """
    # Only the uuid is needed here; owner and project are intentionally unused.
    _owner, _project, uuid = get_run_info(run_instance=run_data[0])
    self.clean_run(run_uuid=uuid, run_kind=run_data[1])

    content = run_data[3]
    if not content:
        return
    self.make_and_create_run(run_data)
async def start_sidecar(
    container_id: str,
    sleep_interval: int,
    sync_interval: int,
    monitor_outputs: bool,
    monitor_logs: bool,
):
    """Run the async sidecar loop that syncs logs and outputs for a pod.

    The loop polls `k8s_manager.is_pod_running` every `sleep_interval`
    seconds and calls the monitor every `sync_interval` iterations. It stops
    once the container is no longer running or after repeated API errors,
    then runs one final monitor pass.

    Args:
        container_id: Container passed to `is_pod_running`.
        sleep_interval: Seconds to sleep between polls.
        sync_interval: Normalized through `get_sync_interval`; number of loop
            iterations between monitor passes.
        monitor_outputs: When true, artifacts and summaries are synced.
        monitor_logs: When true, logs are synced.

    Raises:
        PolyaxonContainerException: If the pod id environment variable is
            missing or the run info cannot be resolved.
    """
    sync_interval = get_sync_interval(
        interval=sync_interval, sleep_interval=sleep_interval
    )
    try:
        pod_id = os.environ[POLYAXON_KEYS_K8S_POD_ID]
    except KeyError as e:
        raise PolyaxonContainerException(
            "Please make sure that this job has been "
            "started by Polyaxon with all required context."
        ) from e

    try:
        owner, project, run_uuid = get_run_info()
    except PolyaxonClientException as e:
        raise PolyaxonContainerException(e)

    client = RunClient(owner=owner, project=project, run_uuid=run_uuid)
    k8s_manager = AsyncK8SManager(namespace=CLIENT_CONFIG.namespace, in_cluster=True)
    await k8s_manager.setup()

    # From here on the manager is set up: always close it, even when the pod
    # lookup, the polling loop or the final monitor pass raises.
    try:
        pod = await k8s_manager.get_pod(pod_id, reraise=True)
        retry = 1
        is_running = True
        counter = 0
        state = {
            "last_artifacts_check": None,
            "last_logs_check": None,
        }

        async def monitor():
            # Reads `is_running` from the enclosing scope at call time.
            if monitor_logs:
                await sync_logs(
                    run_uuid=run_uuid,
                    k8s_manager=k8s_manager,
                    pod=pod,
                    last_time=None,
                    stream=True,
                    is_running=is_running,
                )
            if monitor_outputs:
                last_check = state["last_artifacts_check"]
                state["last_artifacts_check"] = sync_artifacts(
                    last_check=last_check,
                    run_uuid=run_uuid,
                )
                sync_summaries(
                    last_check=last_check,
                    run_uuid=run_uuid,
                    client=client,
                )

        while is_running and retry <= 3:
            await asyncio.sleep(sleep_interval)
            try:
                is_running = await k8s_manager.is_pod_running(pod_id, container_id)
            except ApiException as e:
                retry += 1
                logger.info("Exception %s", repr(e))
                logger.info("Sleeping ...")
                await asyncio.sleep(retry)
                # Skip syncing: the running state is unknown for this iteration.
                continue

            logger.debug("Syncing ...")
            if is_running:
                # A successful poll resets the consecutive-error budget.
                retry = 1

            counter += 1
            if counter == sync_interval:
                counter = 0
                try:
                    await monitor()
                except Exception as e:
                    # Best effort: a failed sync must not stop the sidecar.
                    logger.warning("Polyaxon sidecar error: %s", repr(e))

        # Final pass to pick up anything produced since the last sync.
        await monitor()
        logger.info("Cleaning non main containers")
    finally:
        if k8s_manager:
            await k8s_manager.close()
def start_sidecar(
    container_id: str,
    sleep_interval: int,
    sync_interval: int,
    monitor_outputs: bool,
    monitor_logs: bool,
):
    """Run the blocking sidecar loop that syncs outputs and logs for a pod.

    The loop polls `is_pod_running` every `sleep_interval` seconds and calls
    the monitor every `sync_interval` iterations. It stops once the container
    is no longer running or after more than three consecutive API errors,
    then runs one final monitor pass.

    Args:
        container_id: Container passed to `is_pod_running` and `sync_logs`.
        sleep_interval: Seconds to sleep between polls.
        sync_interval: Normalized through `get_sync_interval`; number of loop
            iterations between monitor passes.
        monitor_outputs: When true, artifacts and summaries are synced.
        monitor_logs: When true, logs are synced.

    Raises:
        PolyaxonContainerException: If the run info cannot be resolved or
            `CLIENT_CONFIG.pod_id` is empty.
    """
    sync_interval = get_sync_interval(
        interval=sync_interval, sleep_interval=sleep_interval
    )
    try:
        owner, project, run_uuid = get_run_info()
    except PolyaxonClientException as e:
        raise PolyaxonContainerException(e)

    client = RunClient(owner=owner, project=project, run_uuid=run_uuid)
    pod_id = CLIENT_CONFIG.pod_id
    if not pod_id:
        raise PolyaxonContainerException(
            "Please make sure that this job has been "
            "started by Polyaxon with all required context."
        )

    k8s_manager = K8SManager(namespace=CLIENT_CONFIG.namespace, in_cluster=True)
    retry = 1
    is_running = True
    counter = 0
    state = {
        "last_artifacts_check": None,
        "last_logs_check": None,
    }

    def monitor():
        if monitor_outputs:
            last_check = state["last_artifacts_check"]
            state["last_artifacts_check"] = sync_artifacts(
                last_check=last_check,
                run_uuid=run_uuid,
            )
            sync_summaries(
                last_check=last_check,
                run_uuid=run_uuid,
                client=client,
            )
        if monitor_logs:
            state["last_logs_check"] = sync_logs(
                k8s_manager=k8s_manager,
                client=client,
                last_check=state["last_logs_check"],
                run_uuid=run_uuid,
                pod_id=pod_id,
                container_id=container_id,
                owner=owner,
                project=project,
            )

    while is_running and retry <= 3:
        time.sleep(sleep_interval)
        try:
            is_running = is_pod_running(k8s_manager, pod_id, container_id)
        except ApiException as e:
            retry += 1
            logger.info("Exception %s", repr(e))
            logger.info("Sleeping ...")
            time.sleep(1 * retry)
            # Without this, the stale `is_running` from the previous poll
            # would reset `retry` below and the retry limit would never trip.
            continue

        logger.debug("Syncing ...")
        if is_running:
            # A successful poll resets the consecutive-error budget.
            retry = 1

        counter += 1
        if counter == sync_interval:
            counter = 0
            try:
                monitor()
            except Exception as e:
                # Best effort: a failed sync must not stop the sidecar.
                # `%s` (not `%e`, a float format) so the record can be rendered.
                logger.warning("Polyaxon sidecar error: %s", e)

    # Final pass to pick up anything produced since the last sync.
    monitor()
    logger.info("Cleaning non main containers")
def test_run_info_checks_is_managed(self):
    """`get_run_info` raises when the client config is not managed."""
    settings.CLIENT_CONFIG.is_managed = False
    self.assertRaises(PolyaxonClientException, get_run_info)