def process(self, pool: "ThreadPoolExecutor") -> polyaxon_sdk.V1AgentStateResponse:
    """Fetch the agent state and submit one pool task per pending run.

    Args:
        pool: executor that receives the submitted tasks.

    Returns:
        The object returned by `self.get_state()` on success, or a fresh
        `polyaxon_sdk.V1AgentStateResponse()` when no state was found or
        when any exception was raised while processing.
    """
    try:
        agent_state = self.get_state()
        # Check for a missing state before reading any attribute from it;
        # otherwise the attribute access raises and the "no state" case is
        # reported as an error by the handler below.
        if not agent_state:
            logger.info("No state was found.")
            return polyaxon_sdk.V1AgentStateResponse()

        if agent_state.compatible_updates:
            self.sync_compatible_updates(agent_state.compatible_updates)
        logger.info("Starting runs submission process.")

        state = agent_state.state
        for run_data in state.schedules or []:
            pool.submit(self.create_run_and_sync, run_data)
        for run_data in state.queued or []:
            pool.submit(self.create_run_and_sync, run_data)
        for run_data in state.stopping or []:
            pool.submit(self.stop_run, run_data)
        for run_data in state.apply or []:
            pool.submit(self.apply_run, run_data)
        for run_data in state.deleting or []:
            pool.submit(self.delete_run, run_data)
        for run_data in state.hooks or []:
            pool.submit(self.make_and_create_run, run_data)
        for run_data in state.watchdogs or []:
            pool.submit(self.make_and_create_run, run_data)
        for run_data in state.tuners or []:
            # Tuners pass True as the second positional argument of
            # `make_and_create_run`.
            pool.submit(self.make_and_create_run, run_data, True)
        return agent_state
    except Exception as exc:
        # Best effort: a failing cycle is logged and reported as empty.
        logger.error(exc)
        return polyaxon_sdk.V1AgentStateResponse()
def make_run_resource(
    self,
    owner_name: str,
    project_name: str,
    run_name: str,
    run_uuid: str,
    content: str,
    default_auth=False,
) -> Dict:
    """Convert a run manifest through `converter.make_and_convert`.

    All arguments are forwarded unchanged as keyword arguments.

    Returns:
        Whatever `converter.make_and_convert` returns, or `None` when the
        conversion raised (the failure is logged, not propagated).
    """
    conversion_kwargs = dict(
        owner_name=owner_name,
        project_name=project_name,
        run_name=run_name,
        run_uuid=run_uuid,
        content=content,
        default_auth=default_auth,
    )
    try:
        return converter.make_and_convert(**conversion_kwargs)
    except PolypodException as err:
        details = (repr(err), traceback.format_exc())
        logger.info(
            "Run could not be cleaned. Agent failed converting run manifest: {}\n{}".format(
                *details
            )
        )
    except Exception as err:
        details = (repr(err), traceback.format_exc())
        logger.info(
            "Agent failed during compilation with unknown exception: {}\n{}".format(
                *details
            )
        )
    return None
def _on_error(ws, error): if isinstance(error, (KeyboardInterrupt, SystemExit)): logger.info( "Quitting... The session will be running in the background.") else: logger.debug("Termination cause: %s", error) logger.debug("Session disconnected.")
def clean_run(self, run_uuid: str, run_kind: str):
    """Clean and then stop a run through the spawner.

    Failures are logged and never propagated to the caller.

    Args:
        run_uuid: identifier forwarded to the spawner.
        run_kind: kind forwarded to the spawner.
    """
    try:
        self.spawner.clean(run_uuid=run_uuid, run_kind=run_kind)
        self.spawner.stop(run_uuid=run_uuid, run_kind=run_kind)
    except ApiException as e:
        if e.status == 404:
            logger.info("Run does not exist.")
        else:
            # Any other API status used to be dropped without a trace;
            # report it the same way as unexpected failures.
            logger.info(
                "Run could not be cleaned: {}\n{}".format(
                    repr(e), traceback.format_exc()
                )
            )
    except Exception as e:
        logger.info(
            "Run could not be cleaned: {}\n{}".format(repr(e), traceback.format_exc())
        )
def untar(self, filename, delete_tar=True, extract_path=None):
    """Extract a tar archive and optionally delete it afterwards.

    Args:
        filename: path of the archive to open with `tarfile.open`.
        delete_tar: when true, remove `filename` after extraction.
        extract_path: destination directory; defaults to ".".

    Returns:
        `filename`, unchanged (even when the archive has been removed).
    """
    extract_path = extract_path or "."
    logger.info("Untarring the contents of the file ...")
    # The context manager closes the archive even when extraction raises.
    # NOTE(review): extractall trusts member paths; confirm archives only
    # come from a trusted source.
    with tarfile.open(filename) as tar:
        tar.extractall(extract_path)
    if delete_tar:
        logger.info("Cleaning up the tar file ...")
        os.remove(filename)
    return filename
def log_run_running(self, run_owner: str, run_project: str, run_uuid: str) -> None:
    """Record the RUNNING status for a run and log the same message."""
    note = "Run changes were applied by the agent."
    status_update = {
        "run_owner": run_owner,
        "run_project": run_project,
        "run_uuid": run_uuid,
        "status": V1Statuses.RUNNING,
        "reason": "PolyaxonAgentRunActionRunning",
        "message": note,
    }
    self.log_run_status(**status_update)
    logger.info(note)
def log_run_scheduled(self, run_owner: str, run_project: str, run_uuid: str) -> None:
    """Record the SCHEDULED status for a run and log the same message."""
    note = "Run was scheduled by the agent."
    status_update = {
        "run_owner": run_owner,
        "run_project": run_project,
        "run_uuid": run_uuid,
        "status": V1Statuses.SCHEDULED,
        "reason": "PolyaxonAgentRunActionScheduled",
        "message": note,
    }
    self.log_run_status(**status_update)
    logger.info(note)
def get_gc_credentials(
    key_path=None, keyfile_dict=None, scopes=None, context_path: Optional[str] = None
):
    """
    Returns the Credentials object for Google API

    Each of `key_path`, `keyfile_dict` and `scopes` falls back to the matching
    `get_*` helper (called with `context_path`) when it is not provided.

    Args:
        key_path: path to a `.json` service account key file.
        keyfile_dict: service account info, as a mapping or a JSON string.
        scopes: comma separated string of scopes, or an iterable of scope
            strings; `DEFAULT_SCOPES` is used when nothing resolves.
        context_path: forwarded to the fallback helpers.

    Raises:
        PolyaxonStoresException: the key file does not end with `.json`, or
            the key data raised a `ValueError` while being parsed or loaded.
    """
    key_path = key_path or get_key_path(context_path=context_path)
    keyfile_dict = keyfile_dict or get_keyfile_dict(context_path=context_path)
    scopes = scopes or get_scopes(context_path=context_path)

    if scopes is None:
        scopes = DEFAULT_SCOPES
    elif isinstance(scopes, str):
        scopes = [s.strip() for s in scopes.split(",")]
    else:
        # Already a collection of scopes: calling `.split` on it would fail.
        scopes = [s.strip() for s in scopes]

    if not key_path and not keyfile_dict:
        # Look for default GC path
        if os.path.exists(CONTEXT_MOUNT_GC):
            key_path = CONTEXT_MOUNT_GC

    if not key_path and not keyfile_dict:
        logger.info(
            "Getting connection using `google.auth.default()` "
            "since no key file is defined for hook."
        )
        credentials, _ = google.auth.default(scopes=scopes)
    elif key_path:
        # Get credentials from a JSON file.
        if key_path.endswith(".json"):
            logger.info("Getting connection using a JSON key file.")
            credentials = Credentials.from_service_account_file(
                os.path.abspath(key_path), scopes=scopes
            )
        else:
            raise PolyaxonStoresException("Unrecognised extension for key file.")
    else:
        # Get credentials from JSON data.
        try:
            if not isinstance(keyfile_dict, Mapping):
                keyfile_dict = json.loads(keyfile_dict)

            # Convert escaped newlines to actual newlines if any.
            keyfile_dict["private_key"] = keyfile_dict["private_key"].replace(
                "\\n", "\n"
            )

            credentials = Credentials.from_service_account_info(
                keyfile_dict, scopes=scopes
            )
        except ValueError as e:
            # Keep the original error attached as the explicit cause.
            raise PolyaxonStoresException("Invalid key JSON.") from e

    return credentials
def untar_file(filename: str = None, delete_tar: bool = True, extract_path: str = None):
    """Extract an archive into a directory derived from its name.

    The destination is `get_path(extract_path or ".", <stem>)`, where the
    stem is the part of `filename` before the first ".tar.gz". The directory
    is prepared with `check_or_create_path` before extraction.

    Returns:
        The directory the archive was extracted into.
    """
    base_dir = extract_path or "."
    archive_stem = filename.split(".tar.gz")[0]
    extract_path = get_path(base_dir, archive_stem)
    check_or_create_path(extract_path, is_dir=True)

    logger.info("Untarring the contents of the file ...")
    with tarfile.open(filename) as archive:
        archive.extractall(extract_path)

    if not delete_tar:
        return extract_path

    logger.info("Cleaning up the tar file ...")
    os.remove(filename)
    return extract_path
def check_bucket(self, bucket_name):
    """
    Checks if a bucket exists.

    Args:
        bucket_name: `str`. Name of the bucket

    Returns:
        `True` when `head_bucket` succeeds; `False` when it raises a
        `ClientError` (whose error message is logged).
    """
    try:
        self.connection.head_bucket(Bucket=bucket_name)
    except ClientError as err:
        logger.info(err.response["Error"]["Message"])
        return False
    else:
        return True
def check_blob(self, blob, bucket_name=None):
    """
    Checks for the existence of a file in Google Cloud Storage.

    Args:
        blob: `str`. the path to the object to check in the Google cloud storage bucket.
        bucket_name: `str`. Name of the bucket in which the file is stored

    Returns:
        The truthiness of `self.get_blob(...)`, or `False` when the lookup
        raised (the exception is logged).
    """
    try:
        exists = bool(self.get_blob(blob=blob, bucket_name=bucket_name))
    except Exception as err:
        logger.info("Block does not exist %s", err)
        return False
    return exists
def handle_iteration(
    client: RunClient,
    suggestions: List[Dict] = None,
):
    """Log freshly generated suggestions as run outputs.

    Nothing is sent when `suggestions` is empty or missing. If logging the
    outputs fails, the run is marked failed through `client.log_failed` and
    the exception is logged as a warning instead of being raised.
    """
    if not suggestions:
        logger.warning("No new suggestions were created")
        return

    try:
        logger.info("Generated new {} suggestions".format(len(suggestions)))
        sanitized = [sanitize_dict(item) for item in suggestions]
        client.log_outputs(suggestions=sanitized, async_req=False)
    except Exception as err:
        failure = "Polyaxon tuner failed logging iteration definition: {}\n{}".format(
            repr(err), traceback.format_exc()
        )
        client.log_failed(reason="PolyaxonTunerIteration", message=failure)
        logger.warning(err)
def check_key(self, key, bucket_name=None):
    """
    Checks if a key exists in a bucket

    Args:
        key: `str`. S3 key that will point to the file
        bucket_name: `str`. Name of the bucket in which the file is stored

    Returns:
        `True` when `head_object` succeeds; `False` when it raises a
        `ClientError` (whose error message is logged).
    """
    if not bucket_name:
        # Without an explicit bucket, `key` is split by `parse_s3_url`.
        bucket_name, key = self.parse_s3_url(key)

    try:
        self.connection.head_object(Bucket=bucket_name, Key=key)
    except ClientError as err:
        logger.info(err.response["Error"]["Message"])
        return False
    return True
def stop_run(self, run_data: Tuple[str, str]):
    """Stop a run through the spawner and report the outcome.

    Args:
        run_data: `run_data[0]` is passed to `get_run_info` to resolve the
            owner, project and uuid; `run_data[1]` is used as the run kind.

    A 404 from the API is treated as "already stopped" and the run is logged
    as stopped. Unexpected exceptions mark the run as failed. Other API
    errors are only logged; no run status is changed for them.
    """
    run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
    try:
        self.spawner.stop(run_uuid=run_uuid, run_kind=run_data[1])
    except ApiException as e:
        if e.status == 404:
            logger.info("Run does not exist anymore, it could have been stopped.")
            self.log_run_stopped(
                run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
            )
        else:
            # This case used to be dropped silently; at least leave a trace.
            logger.info("Agent failed stopping run: %s", repr(e))
    except Exception as e:
        self.log_run_failed(
            run_owner=run_owner,
            run_project=run_project,
            run_uuid=run_uuid,
            exc=e,
            message="Agent failed stopping run.\n",
        )
def _register(self):
    """Start the agent: check its state, sync, and mark it as running.

    Returns early, without syncing or registering the exit hook, when the
    fetched state reports `V1Statuses.STOPPED`. On `ApiException` or
    `HTTPError` the failure is reported with `log_agent_failed` and the
    process exits with status 1. Otherwise `self._wait` is registered with
    `atexit`.
    """
    logger.info("Agent is starting.")
    try:
        agent_state = self.get_state()
        if agent_state.status == V1Statuses.STOPPED:
            # The adjacent literals are concatenated as-is, so each one
            # needs its own trailing space.
            logger.info(
                "Agent has been stopped from the platform, "
                "but the deployment is still running. "
                "Please either set the agent to starting or teardown the agent deployment."
            )
            return
        self.sync()
        self.log_agent_running()
    except (ApiException, HTTPError) as e:
        self.log_agent_failed(
            message="Could not start the agent {}.".format(repr(e))
        )
        sys.exit(1)
    atexit.register(self._wait)
def _submit_run(self, run_data: Tuple[str, str, str, str], sync_api=True):
    """Prepare a run resource and create it through the spawner.

    Args:
        run_data: index 0 is resolved with `get_run_info`, index 1 is the
            run kind, index 2 the run name and index 3 the content.
        sync_api: when true, log the scheduled status on success and the
            failed status on unexpected exceptions.

    Returns early when no resource could be prepared. An API 409 triggers
    `apply_run`; any other API error marks the run as failed.
    """
    run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
    resource = self.prepare_run_resource(
        owner_name=run_owner,
        project_name=run_project,
        run_name=run_data[2],
        run_uuid=run_uuid,
        content=run_data[3],
    )
    if not resource:
        return

    run_ref = dict(run_owner=run_owner, run_project=run_project, run_uuid=run_uuid)
    try:
        self.spawner.create(
            run_uuid=run_uuid, run_kind=run_data[1], resource=resource
        )
        if sync_api:
            self.log_run_scheduled(**run_ref)
    except ApiException as err:
        if err.status != 409:
            logger.info("Run submission error.")
            self.log_run_failed(**run_ref, exc=err)
        else:
            logger.info("Run already running, triggering an apply mechanism.")
            self.apply_run(run_data=run_data)
    except Exception as err:
        if sync_api:
            self.log_run_failed(**run_ref, exc=err)
def start(self) -> None:
    """Run the agent loop until the exit event is set.

    Each cycle waits on the exit event, processes the agent state with a
    shared thread pool, checks the status, pings, and computes the next
    wait from `self.sleep_interval` or `get_wait(index)`. `self.end()` is
    always called on the way out.
    """
    try:
        with exit_context() as exit_event:
            index = 0
            workers = get_pool_workers()

            with ThreadPoolExecutor(workers) as pool:
                logger.debug("Thread pool Workers: {}".format(workers))
                timeout = self.sleep_interval or get_wait(index)
                while not exit_event.wait(timeout=timeout):
                    index += 1
                    agent_state = self.process(pool)
                    self._check_status(agent_state)
                    # `process` may hand back an empty response; presumably
                    # its `state` is unset in that case, so do not
                    # dereference it blindly and kill the loop.
                    state = getattr(agent_state, "state", None)
                    if state is not None and state.full:
                        index = 2
                    self.ping()
                    timeout = self.sleep_interval or get_wait(index)
                    logger.info("Sleeping for {} seconds".format(timeout))
    finally:
        self.end()
def convert_to_image_init(x):
    """Reduce a legacy mapping-style image definition to its name.

    Args:
        x: the image value. Non-mapping values are first passed through
            `convert_to_dict`; when that fails, `x` is returned untouched.

    Returns:
        `x.get("name")` for the legacy format, or the original `x` when it
        could not be converted.

    Raises:
        PolyaxonSchemaError: the (converted) value has no "name" entry.
    """
    if not isinstance(x, Mapping):
        try:
            # NOTE(review): `key` is not defined in this block; presumably it
            # comes from an enclosing scope. Verify against the caller.
            x = convert_to_dict(x, key)
        except Exception:
            # Not convertible: hand the value back as-is. Catching
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            return x

    if "name" not in x:
        raise PolyaxonSchemaError(
            "Polyaxon received an image that does not contain an image"
        )
    logger.info(
        "Polyaxon received a legacy image format. "
        "The operation will not run correctly"
    )
    return x.get("name")
def process(self, pool: "ThreadPoolExecutor") -> polyaxon_sdk.V1AgentStateResponse:
    """Fetch the agent state and submit queued, stopping and apply runs.

    Returns:
        The object returned by `self.get_state()`, or a fresh
        `polyaxon_sdk.V1AgentStateResponse()` when no state was found or an
        exception was raised.
    """
    # (state attribute, name of the handler method submitted for each entry)
    dispatch = (
        ("queued", "create_run"),
        ("stopping", "stop_run"),
        ("apply", "apply_run"),
    )
    try:
        agent_state = self.get_state()
        if not agent_state:
            logger.info("No state was found.")
            return polyaxon_sdk.V1AgentStateResponse()

        logger.info("Starting runs submission process.")
        state = agent_state.state
        for bucket, handler_name in dispatch:
            for run_data in getattr(state, bucket) or []:
                pool.submit(getattr(self, handler_name), run_data)
        return agent_state
    except Exception as exc:
        logger.error(exc)
        return polyaxon_sdk.V1AgentStateResponse()
def make_and_create_run(
    self, run_data: Tuple[str, str, str, str], default_auth: bool = False
):
    """Build a run resource with `make_run_resource` and create it.

    Args:
        run_data: index 0 is resolved with `get_run_info`, index 1 is the
            run kind, index 2 the run name and index 3 the content.
        default_auth: forwarded to `make_run_resource`.

    Returns early when no resource was produced. Every failure of the
    spawner call is only logged.
    """
    run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
    resource = self.make_run_resource(
        owner_name=run_owner,
        project_name=run_project,
        run_name=run_data[2],
        run_uuid=run_uuid,
        content=run_data[3],
        default_auth=default_auth,
    )
    if not resource:
        return

    try:
        self.spawner.create(
            run_uuid=run_uuid, run_kind=run_data[1], resource=resource
        )
    except ApiException as err:
        already_running = err.status == 409
        if already_running:
            outcome = "Run already running, triggering an apply mechanism."
        else:
            outcome = "Run submission error."
        logger.info(outcome)
    except Exception as err:
        details = (repr(err), traceback.format_exc())
        logger.info(
            "Run could not be cleaned. Agent failed converting run manifest: {}\n{}".format(
                *details
            )
        )
def _on_close(ws): logger.info("Session ended")
def login(token, username, password):
    """Login to Polyaxon."""
    # Two paths produce `access_auth`: username/password against the API, or
    # a token (given, or pasted after opening the token page). The resolved
    # token is then used to load the user and persist the auth config.
    polyaxon_client = PolyaxonClient()
    if username and not token:
        # Use user or email / password login
        if not password:
            # No password supplied: prompt for it without echoing the input.
            password = click.prompt(
                "Please enter your password", type=str, hide_input=True
            )
            password = password.strip()
            if not password:
                logger.info(
                    "You entered an empty string. "
                    "Please make sure you enter your password correctly."
                )
                sys.exit(1)

        try:
            body = V1Credentials(username=username, password=password)
            access_auth = polyaxon_client.auth_v1.login(body=body)
        except (ApiException, HTTPError) as e:
            # A failed login purges both stored configs before exiting.
            AuthConfigManager.purge()
            CliConfigManager.purge()
            handle_cli_error(e, message="Could not login.")
            sys.exit(1)

        if not access_auth.token:
            Printer.print_error("Failed to login")
            return
    else:
        if not token:
            # No token given: open the token page (the confirmation aborts
            # on refusal, per `abort=True`) and ask the user to paste it.
            token_url = "{}/profile/token".format(
                clean_host(polyaxon_client.config.host)
            )
            click.confirm(
                "Authentication token page will now open in your browser. Continue?",
                abort=True,
                default=True,
            )

            click.launch(token_url)
            logger.info("Please copy and paste the authentication token.")
            token = click.prompt(
                "This is an invisible field. Paste token and press ENTER",
                type=str,
                hide_input=True,
            )

        if not token:
            # Still nothing usable: explain and stop without touching config.
            logger.info(
                "Empty token received. "
                "Make sure your shell is handling the token appropriately."
            )
            logger.info(
                "See docs for help: http://polyaxon.com/docs/polyaxon_cli/commands/auth"
            )
            return

        # Only surrounding spaces are stripped from the token.
        access_auth = polyaxon_sdk.models.V1Auth(token=token.strip(" "))

    # Set user
    try:
        # Stored auth is purged before the new token is tried.
        AuthConfigManager.purge()
        polyaxon_client = PolyaxonClient(token=access_auth.token)
        user = polyaxon_client.users_v1.get_user()
    except (ApiException, HTTPError) as e:
        handle_cli_error(e, message="Could not load user info.")
        sys.exit(1)
    access_token = AccessTokenConfig(username=user.username, token=access_auth.token)
    AuthConfigManager.set_config(access_token)
    polyaxon_client.config.token = access_auth.token
    Printer.print_success("Login successful")

    # Reset current cli
    server_versions = get_server_versions(polyaxon_client=polyaxon_client)
    current_version = get_current_version()
    log_handler = get_log_handler(polyaxon_client=polyaxon_client)
    CliConfigManager.reset(
        check_count=0,
        current_version=current_version,
        server_versions=server_versions.to_dict(),
        log_handler=log_handler,
    )
def end(self):
    """Flag the agent as gracefully shutting down and log it."""
    setattr(self, "_graceful_shutdown", True)
    shutdown_note = "Agent is shutting down."
    logger.info(shutdown_note)
def _exit_handler(*args, **kwargs) -> None:
    """Log the interrupt and set `exit_event`; all arguments are ignored."""
    interrupt_note = "Keyboard Interrupt received, exiting pool."
    logger.info(interrupt_note)
    # `exit_event` is not defined in this block; it is resolved from the
    # surrounding scope at call time.
    exit_event.set()
def end(self, sleep: int = None):
    """Flag the agent as gracefully shutting down.

    Args:
        sleep: when truthy, block for that many seconds; otherwise only log
            the shutdown message.
    """
    self._graceful_shutdown = True
    if not sleep:
        logger.info("Agent is shutting down.")
        return
    time.sleep(sleep)
def tuner():
    """Log that new suggestions are being created; does nothing else."""
    progress_note = "Creating new suggestions..."
    logger.info(progress_note)
async def start_sidecar(
    container_id: str,
    sleep_interval: int,
    sync_interval: int,
    monitor_outputs: bool,
    monitor_logs: bool,
):
    """Poll the pod's container and periodically sync logs and artifacts.

    Args:
        container_id: container passed to `is_pod_running`.
        sleep_interval: seconds slept between polls.
        sync_interval: combined with `sleep_interval` by `get_sync_interval`
            into the number of polls between two syncs.
        monitor_outputs: when true, sync artifacts and summaries.
        monitor_logs: when true, sync logs.

    Raises:
        PolyaxonContainerException: the pod id environment variable is
            missing, or the run info could not be resolved.

    The loop ends when the container stops running or after repeated API
    failures (`retry` exceeding 3). A final sync always follows the loop and
    the k8s manager is closed on every exit path after setup.
    """
    sync_interval = get_sync_interval(
        interval=sync_interval, sleep_interval=sleep_interval
    )
    try:
        pod_id = os.environ[POLYAXON_KEYS_K8S_POD_ID]
    except KeyError as e:
        raise PolyaxonContainerException(
            "Please make sure that this job has been "
            "started by Polyaxon with all required context."
        ) from e

    try:
        owner, project, run_uuid = get_run_info()
    except PolyaxonClientException as e:
        # Chain explicitly, consistent with the KeyError handling above.
        raise PolyaxonContainerException(e) from e

    client = RunClient(owner=owner, project=project, run_uuid=run_uuid)
    k8s_manager = AsyncK8SManager(namespace=CLIENT_CONFIG.namespace, in_cluster=True)
    await k8s_manager.setup()
    try:
        pod = await k8s_manager.get_pod(pod_id, reraise=True)

        retry = 1
        is_running = True
        counter = 0
        state = {
            "last_artifacts_check": None,
            "last_logs_check": None,
        }

        async def monitor():
            if monitor_logs:
                await sync_logs(
                    run_uuid=run_uuid,
                    k8s_manager=k8s_manager,
                    pod=pod,
                    last_time=None,
                    stream=True,
                    is_running=is_running,
                )
            if monitor_outputs:
                # NOTE(review): these two calls are not awaited; confirm they
                # are plain (non-coroutine) functions.
                last_check = state["last_artifacts_check"]
                state["last_artifacts_check"] = sync_artifacts(
                    last_check=last_check,
                    run_uuid=run_uuid,
                )
                sync_summaries(
                    last_check=last_check,
                    run_uuid=run_uuid,
                    client=client,
                )

        while is_running and retry <= 3:
            await asyncio.sleep(sleep_interval)
            try:
                is_running = await k8s_manager.is_pod_running(pod_id, container_id)
            except ApiException as e:
                retry += 1
                logger.info("Exception %s", repr(e))
                logger.info("Sleeping ...")
                await asyncio.sleep(retry)
                # Skip the rest of the cycle so `retry` is not reset below.
                continue

            logger.debug("Syncing ...")
            if is_running:
                retry = 1

            counter += 1
            if counter == sync_interval:
                counter = 0
                try:
                    await monitor()
                except Exception as e:
                    logger.warning("Polyaxon sidecar error: %s", repr(e))

        # One last sync after the loop ends.
        await monitor()
        logger.info("Cleaning non main containers")
    finally:
        # Release the manager even when the loop or the final sync raises.
        if k8s_manager:
            await k8s_manager.close()
def start_sidecar(
    container_id: str,
    sleep_interval: int,
    sync_interval: int,
    monitor_outputs: bool,
    monitor_logs: bool,
):
    """Poll the pod's container and periodically sync artifacts and logs.

    Args:
        container_id: container passed to `is_pod_running` and `sync_logs`.
        sleep_interval: seconds slept between polls.
        sync_interval: combined with `sleep_interval` by `get_sync_interval`
            into the number of polls between two syncs.
        monitor_outputs: when true, sync artifacts and summaries.
        monitor_logs: when true, sync logs.

    Raises:
        PolyaxonContainerException: the run info could not be resolved, or
            `CLIENT_CONFIG.pod_id` is empty.

    The loop ends when the container stops running or after repeated API
    failures (`retry` exceeding 3); a final sync always follows.
    """
    sync_interval = get_sync_interval(
        interval=sync_interval, sleep_interval=sleep_interval
    )

    try:
        owner, project, run_uuid = get_run_info()
    except PolyaxonClientException as e:
        raise PolyaxonContainerException(e)

    client = RunClient(owner=owner, project=project, run_uuid=run_uuid)

    pod_id = CLIENT_CONFIG.pod_id
    if not pod_id:
        raise PolyaxonContainerException(
            "Please make sure that this job has been "
            "started by Polyaxon with all required context."
        )

    k8s_manager = K8SManager(namespace=CLIENT_CONFIG.namespace, in_cluster=True)
    retry = 1
    is_running = True
    counter = 0
    state = {
        "last_artifacts_check": None,
        "last_logs_check": None,
    }

    def monitor():
        if monitor_outputs:
            last_check = state["last_artifacts_check"]
            state["last_artifacts_check"] = sync_artifacts(
                last_check=last_check,
                run_uuid=run_uuid,
            )
            sync_summaries(
                last_check=last_check,
                run_uuid=run_uuid,
                client=client,
            )

        if monitor_logs:
            state["last_logs_check"] = sync_logs(
                k8s_manager=k8s_manager,
                client=client,
                last_check=state["last_logs_check"],
                run_uuid=run_uuid,
                pod_id=pod_id,
                container_id=container_id,
                owner=owner,
                project=project,
            )

    while is_running and retry <= 3:
        time.sleep(sleep_interval)
        try:
            is_running = is_pod_running(k8s_manager, pod_id, container_id)
        except ApiException as e:
            retry += 1
            time.sleep(1 * retry)
            logger.info("Exception %s" % repr(e))
            logger.info("Sleeping ...")
            # Skip the rest of the cycle: falling through would reset
            # `retry` (is_running is still true) and the retry bound of the
            # loop could never trigger.
            continue

        logger.debug("Syncing ...")
        if is_running:
            retry = 1

        counter += 1
        if counter == sync_interval:
            counter = 0
            try:
                monitor()
            except Exception as e:
                # "%s", not "%e": the latter is a float conversion and fails
                # to format an exception.
                logger.warning("Polyaxon sidecar error: %s", e)

    # One last sync after the loop ends.
    monitor()
    logger.info("Cleaning non main containers")