Пример #1
0
    def process(
            self,
            pool: "ThreadPoolExecutor") -> polyaxon_sdk.V1AgentStateResponse:
        """Fetch the agent state and hand every pending run action to `pool`.

        Args:
            pool: executor that receives one task per run entry found in the
                agent state.

        Returns:
            The object returned by `self.get_state()`, or an empty
            `V1AgentStateResponse` when no state was found or when any error
            occurred while processing it.
        """
        try:
            agent_state = self.get_state()
            # Check for a missing state before reading any attribute from it,
            # so that case is reported as "No state was found." instead of
            # surfacing as an AttributeError in the generic handler below.
            if not agent_state:
                logger.info("No state was found.")
                return polyaxon_sdk.V1AgentStateResponse()

            if agent_state.compatible_updates:
                self.sync_compatible_updates(agent_state.compatible_updates)

            logger.info("Starting runs submission process.")

            state = agent_state.state
            # Each section may be None, hence the `or []` fallbacks.
            for run_data in state.schedules or []:
                pool.submit(self.create_run_and_sync, run_data)
            for run_data in state.queued or []:
                pool.submit(self.create_run_and_sync, run_data)
            for run_data in state.stopping or []:
                pool.submit(self.stop_run, run_data)
            for run_data in state.apply or []:
                pool.submit(self.apply_run, run_data)
            for run_data in state.deleting or []:
                pool.submit(self.delete_run, run_data)
            for run_data in state.hooks or []:
                pool.submit(self.make_and_create_run, run_data)
            for run_data in state.watchdogs or []:
                pool.submit(self.make_and_create_run, run_data)
            for run_data in state.tuners or []:
                # The extra positional argument is forwarded to
                # `make_and_create_run` after `run_data`.
                pool.submit(self.make_and_create_run, run_data, True)
            return agent_state
        except Exception as exc:
            # Best effort: a failed cycle is logged and answered with an empty
            # response rather than propagated to the caller.
            logger.error(exc)
            return polyaxon_sdk.V1AgentStateResponse()
Пример #2
0
 def make_run_resource(
     self,
     owner_name: str,
     project_name: str,
     run_name: str,
     run_uuid: str,
     content: str,
     default_auth=False,
 ) -> Dict:
     """Build a run resource by delegating to `converter.make_and_convert`.

     All arguments are forwarded unchanged as keyword arguments. Failures are
     logged rather than raised; in that case nothing is returned (None).
     """
     conversion_kwargs = dict(
         owner_name=owner_name,
         project_name=project_name,
         run_name=run_name,
         run_uuid=run_uuid,
         content=content,
         default_auth=default_auth,
     )
     try:
         resource = converter.make_and_convert(**conversion_kwargs)
     except PolypodException as err:
         # The traceback must be formatted while the exception is being handled.
         failure = (
             "Run could not be cleaned. Agent failed converting run manifest: {}\n{}"
         ).format(repr(err), traceback.format_exc())
         logger.info(failure)
         return None
     except Exception as err:
         failure = (
             "Agent failed during compilation with unknown exception: {}\n{}"
         ).format(repr(err), traceback.format_exc())
         logger.info(failure)
         return None
     return resource
Пример #3
0
 def _on_error(ws, error):
     if isinstance(error, (KeyboardInterrupt, SystemExit)):
         logger.info(
             "Quitting... The session will be running in the background.")
     else:
         logger.debug("Termination cause: %s", error)
         logger.debug("Session disconnected.")
Пример #4
0
 def clean_run(self, run_uuid: str, run_kind: str):
     """Clean and then stop a run through the spawner, logging any failure.

     Args:
         run_uuid: identifier of the run, forwarded to the spawner.
         run_kind: kind of the run, forwarded to the spawner.

     No exception derived from `Exception` escapes this method; failures are
     only logged.
     """
     try:
         self.spawner.clean(run_uuid=run_uuid, run_kind=run_kind)
         self.spawner.stop(run_uuid=run_uuid, run_kind=run_kind)
     except ApiException as e:
         if e.status == 404:
             logger.info("Run does not exist.")
         else:
             # Other API errors used to be dropped without a trace; log them
             # the same way as unexpected exceptions.
             logger.info("Run could not be cleaned: {}\n{}".format(
                 repr(e), traceback.format_exc()))
     except Exception as e:
         logger.info("Run could not be cleaned: {}\n{}".format(
             repr(e), traceback.format_exc()))
Пример #5
0
 def untar(self, filename, delete_tar=True, extract_path=None):
     """Extract the tar archive `filename` and optionally delete it afterwards.

     Args:
         filename: path of the tar archive to extract.
         delete_tar: when true, remove the archive once it has been extracted.
         extract_path: directory to extract into; defaults to ".".

     Returns:
         `filename`, unchanged (even when the archive has been deleted).
     """
     extract_path = extract_path or "."
     logger.info("Untarring the contents of the file ...")
     # The context manager closes the archive even when extraction fails.
     # NOTE(review): extractall() trusts the member paths stored in the
     # archive; confirm archives only come from a trusted source.
     with tarfile.open(filename) as tar:
         tar.extractall(extract_path)
     if delete_tar:
         logger.info("Cleaning up the tar file ...")
         os.remove(filename)
     return filename
Пример #6
0
 def log_run_running(self, run_owner: str, run_project: str, run_uuid: str) -> None:
     """Record the RUNNING status for a run and echo the message to the log."""
     status_message = "Run changes were applied by the agent."
     status_payload = dict(
         run_owner=run_owner,
         run_project=run_project,
         run_uuid=run_uuid,
         status=V1Statuses.RUNNING,
         reason="PolyaxonAgentRunActionRunning",
         message=status_message,
     )
     # The status call comes first; the local log line only follows if it
     # did not raise.
     self.log_run_status(**status_payload)
     logger.info(status_message)
Пример #7
0
 def log_run_scheduled(self, run_owner: str, run_project: str,
                       run_uuid: str) -> None:
     """Record the SCHEDULED status for a run and echo the message to the log."""
     status_message = "Run was scheduled by the agent."
     status_payload = dict(
         run_owner=run_owner,
         run_project=run_project,
         run_uuid=run_uuid,
         status=V1Statuses.SCHEDULED,
         reason="PolyaxonAgentRunActionScheduled",
         message=status_message,
     )
     # The status call comes first; the local log line only follows if it
     # did not raise.
     self.log_run_status(**status_payload)
     logger.info(status_message)
Пример #8
0
def get_gc_credentials(
    key_path=None, keyfile_dict=None, scopes=None, context_path: Optional[str] = None
):
    """
    Returns the Credentials object for Google API.

    Resolution order:
        1. `key_path` (argument, else `get_key_path`): must end with ".json".
        2. `keyfile_dict` (argument, else `get_keyfile_dict`): a mapping or a
           JSON string holding the service account info.
        3. `CONTEXT_MOUNT_GC`, if that path exists and neither of the above
           is set.
        4. `google.auth.default()` otherwise.

    Args:
        key_path: path to a JSON key file.
        keyfile_dict: service account info, as a mapping or a JSON string.
            A mapping passed by the caller is not modified.
        scopes: comma-separated scopes string; falls back to `get_scopes`
            and then to `DEFAULT_SCOPES`.
        context_path: forwarded to the `get_*` lookup helpers.

    Raises:
        PolyaxonStoresException: if the key file does not have a ".json"
            extension or the key JSON cannot be parsed.
    """
    key_path = key_path or get_key_path(context_path=context_path)
    keyfile_dict = keyfile_dict or get_keyfile_dict(context_path=context_path)
    scopes = scopes or get_scopes(context_path=context_path)

    if scopes is not None:
        scopes = [s.strip() for s in scopes.split(",")]
    else:
        scopes = DEFAULT_SCOPES

    if not key_path and not keyfile_dict:
        # Look for default GC path
        if os.path.exists(CONTEXT_MOUNT_GC):
            key_path = CONTEXT_MOUNT_GC

    if not key_path and not keyfile_dict:
        logger.info(
            "Getting connection using `google.auth.default()` "
            "since no key file is defined for hook."
        )
        credentials, _ = google.auth.default(scopes=scopes)
    elif key_path:
        # Get credentials from a JSON file.
        if key_path.endswith(".json"):
            logger.info("Getting connection using a JSON key file.")
            credentials = Credentials.from_service_account_file(
                os.path.abspath(key_path), scopes=scopes
            )
        else:
            raise PolyaxonStoresException("Unrecognised extension for key file.")
    else:
        # Get credentials from JSON data.
        try:
            if isinstance(keyfile_dict, Mapping):
                # Work on a copy: the private key is rewritten below and the
                # caller's mapping must not change as a side effect.
                keyfile_dict = dict(keyfile_dict)
            else:
                keyfile_dict = json.loads(keyfile_dict)

            # Convert escaped newlines to actual newlines if any.
            keyfile_dict["private_key"] = keyfile_dict["private_key"].replace(
                "\\n", "\n"
            )

            credentials = Credentials.from_service_account_info(
                keyfile_dict, scopes=scopes
            )
        except ValueError as e:  # JSON decoding errors are ValueError subclasses
            raise PolyaxonStoresException("Invalid key JSON.") from e

    return credentials
Пример #9
0
def untar_file(filename: str = None, delete_tar: bool = True, extract_path: str = None):
    """Extract `filename` into a directory named after the archive.

    The target directory is `get_path(base, stem)`, where `base` is
    `extract_path` (or "." when falsy) and `stem` is the part of `filename`
    before the first ".tar.gz". It is created via `check_or_create_path`.

    Returns:
        The directory the archive was extracted into.
    """
    base_dir = extract_path if extract_path else "."
    archive_stem = filename.split(".tar.gz")[0]
    extract_path = get_path(base_dir, archive_stem)
    check_or_create_path(extract_path, is_dir=True)
    logger.info("Untarring the contents of the file ...")
    with tarfile.open(filename) as archive:
        archive.extractall(extract_path)
    if not delete_tar:
        return extract_path
    logger.info("Cleaning up the tar file ...")
    os.remove(filename)
    return extract_path
Пример #10
0
    def check_bucket(self, bucket_name):
        """
        Checks if a bucket exists.

        Args:
            bucket_name: `str`. Name of the bucket

        Returns:
            True when `head_bucket` succeeds; False when it raises
            `ClientError` (the error message is logged).
        """
        try:
            self.connection.head_bucket(Bucket=bucket_name)
        except ClientError as err:
            logger.info(err.response["Error"]["Message"])
            return False
        return True
Пример #11
0
    def check_blob(self, blob, bucket_name=None):
        """
        Checks for the existence of a file in Google Cloud Storage.

        Args:
            blob: `str`. the path to the object to check in the Google cloud storage bucket.
            bucket_name: `str`. Name of the bucket in which the file is stored

        Returns:
            The truthiness of `self.get_blob(...)`; False if that call raises.
        """
        try:
            return bool(self.get_blob(blob=blob, bucket_name=bucket_name))
        except Exception as e:
            # Any lookup failure is treated as "not found" (best effort).
            # The message used to say "Block"; the object checked is a blob.
            logger.info("Blob does not exist %s", e)
            return False
Пример #12
0
def handle_iteration(
    client: RunClient,
    suggestions: List[Dict] = None,
):
    """Log the generated suggestions as outputs of the run behind `client`.

    With no suggestions only a warning is logged. If logging the outputs
    fails, the failure is reported through `client.log_failed` and the
    exception is logged as a warning instead of being raised.
    """
    if suggestions:
        try:
            logger.info("Generated new {} suggestions".format(len(suggestions)))
            cleaned = [sanitize_dict(item) for item in suggestions]
            client.log_outputs(suggestions=cleaned, async_req=False)
        except Exception as err:
            failure = "Polyaxon tuner failed logging iteration definition: {}\n{}".format(
                repr(err), traceback.format_exc())
            client.log_failed(reason="PolyaxonTunerIteration", message=failure)
            logger.warning(err)
        return
    logger.warning("No new suggestions were created")
Пример #13
0
    def check_key(self, key, bucket_name=None):
        """
        Checks if a key exists in a bucket

        Args:
            key: `str`. S3 key that will point to the file
            bucket_name: `str`. Name of the bucket in which the file is stored.
                When falsy, bucket and key are both derived from `key` via
                `self.parse_s3_url`.

        Returns:
            True when `head_object` succeeds; False when it raises
            `ClientError` (the error message is logged).
        """
        if bucket_name:
            target_bucket, target_key = bucket_name, key
        else:
            target_bucket, target_key = self.parse_s3_url(key)

        try:
            self.connection.head_object(Bucket=target_bucket, Key=target_key)
        except ClientError as err:
            logger.info(err.response["Error"]["Message"])
            return False
        return True
Пример #14
0
 def stop_run(self, run_data: Tuple[str, str]):
     """Stop a run through the spawner and report the outcome.

     Args:
         run_data: `run_data[0]` is the run instance passed to `get_run_info`
             and `run_data[1]` is the run kind passed to the spawner.

     A 404 from the API is treated as "already stopped" and recorded with
     `log_run_stopped`. Other API errors are logged. Any other exception is
     recorded with `log_run_failed`.
     """
     run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
     try:
         self.spawner.stop(run_uuid=run_uuid, run_kind=run_data[1])
     except ApiException as e:
         if e.status == 404:
             logger.info("Run does not exist anymore, it could have been stopped.")
             self.log_run_stopped(
                 run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
             )
         else:
             # Other API errors used to be dropped without a trace. The run
             # status is left untouched; the failure is only made visible.
             logger.warning(
                 "Agent could not stop run %s: %s", run_uuid, repr(e)
             )
     except Exception as e:
         self.log_run_failed(
             run_owner=run_owner,
             run_project=run_project,
             run_uuid=run_uuid,
             exc=e,
             message="Agent failed stopping run.\n",
         )
Пример #15
0
 def _register(self):
     """Start the agent: sync, mark it as running and register the exit hook.

     If the fetched state reports `V1Statuses.STOPPED`, a notice is logged and
     the method returns without syncing or registering anything. An
     `ApiException` or `HTTPError` is recorded with `log_agent_failed` and the
     process exits with status 1.
     """
     logger.info("Agent is starting.")
     try:
         agent_state = self.get_state()
         if agent_state.status == V1Statuses.STOPPED:
             # Adjacent literals are concatenated verbatim, so each fragment
             # carries its own trailing space.
             logger.info(
                 "Agent has been stopped from the platform, "
                 "but the deployment is still running. "
                 "Please either set the agent to starting or teardown the agent deployment."
             )
             return
         self.sync()
         self.log_agent_running()
     except (ApiException, HTTPError) as e:
         self.log_agent_failed(
             message="Could not start the agent {}.".format(repr(e))
         )
         sys.exit(1)
     atexit.register(self._wait)
Пример #16
0
    def _submit_run(self, run_data: Tuple[str, str, str, str], sync_api=True):
        """Prepare a run resource and create it through the spawner.

        Args:
            run_data: `(run instance, run kind, run name, content)`, read by
                index.
            sync_api: when true, the scheduled/failed status is reported via
                `log_run_scheduled` / `log_run_failed`.

        Nothing is submitted when `prepare_run_resource` returns a falsy
        value. A 409 from the API triggers `apply_run` instead.
        """
        run_owner, run_project, run_uuid = get_run_info(
            run_instance=run_data[0])
        resource = self.prepare_run_resource(
            owner_name=run_owner,
            project_name=run_project,
            run_name=run_data[2],
            run_uuid=run_uuid,
            content=run_data[3],
        )
        if not resource:
            return

        try:
            self.spawner.create(run_uuid=run_uuid,
                                run_kind=run_data[1],
                                resource=resource)
            if sync_api:
                self.log_run_scheduled(run_owner=run_owner,
                                       run_project=run_project,
                                       run_uuid=run_uuid)
        except ApiException as e:
            if e.status == 409:
                logger.info(
                    "Run already running, triggering an apply mechanism.")
                self.apply_run(run_data=run_data)
            else:
                logger.info("Run submission error.")
                # NOTE(review): unlike the generic handler below, this branch
                # reports the failure regardless of `sync_api` - confirm that
                # this is intended.
                self.log_run_failed(
                    run_owner=run_owner,
                    run_project=run_project,
                    run_uuid=run_uuid,
                    exc=e,
                )
        except Exception as e:
            if sync_api:
                self.log_run_failed(
                    run_owner=run_owner,
                    run_project=run_project,
                    run_uuid=run_uuid,
                    exc=e,
                )
            else:
                # With API sync disabled the failure used to vanish without
                # a trace; keep at least a local record of it.
                logger.warning(
                    "Run submission failed: %s", repr(e), exc_info=True)
Пример #17
0
    def start(self) -> None:
        """Run the agent loop until the exit event is set.

        Each cycle processes the agent state on a thread pool, checks the
        status, pings, and then waits `self.sleep_interval` seconds (or
        `get_wait(index)` when no interval is set). `self.end()` is always
        called on the way out.
        """
        try:
            with exit_context() as exit_event:
                index = 0
                workers = get_pool_workers()

                with ThreadPoolExecutor(workers) as pool:
                    logger.debug("Thread pool Workers: {}".format(workers))
                    timeout = self.sleep_interval or get_wait(index)
                    while not exit_event.wait(timeout=timeout):
                        index += 1
                        agent_state = self.process(pool)
                        self._check_status(agent_state)
                        # The response may come back without a populated
                        # `state`; guard the access so one bad cycle does not
                        # terminate the loop.
                        state = getattr(agent_state, "state", None)
                        if state is not None and state.full:
                            index = 2
                        self.ping()
                        timeout = self.sleep_interval or get_wait(index)
                        logger.info("Sleeping for {} seconds".format(timeout))
        finally:
            self.end()
Пример #18
0
    def convert_to_image_init(x):
        """Return the `name` entry of a legacy image definition.

        A non-mapping value is first passed through `convert_to_dict`; if
        that conversion fails the value is returned unchanged.

        Raises:
            PolyaxonSchemaError: if the (converted) value has no "name" key.
        """
        if not isinstance(x, Mapping):
            try:
                x = convert_to_dict(x, key)
            except Exception:
                # Not convertible: hand the value back untouched. Only
                # ordinary errors are caught, so KeyboardInterrupt and
                # SystemExit still propagate.
                return x

        if "name" not in x:
            raise PolyaxonSchemaError(
                "Polyaxon received an image that does not contain an image")
        logger.info("Polyaxon received a legacy image format. "
                    "The operation will not run correctly")
        return x.get("name")
Пример #19
0
    def process(self, pool: "ThreadPoolExecutor") -> polyaxon_sdk.V1AgentStateResponse:
        """Fetch the agent state and submit its pending runs to `pool`.

        Returns the fetched state, or an empty `V1AgentStateResponse` when no
        state was found or an error occurred.
        """
        # (state attribute, handler method name), in submission order.
        dispatch = (
            ("queued", "create_run"),
            ("stopping", "stop_run"),
            ("apply", "apply_run"),
        )
        try:
            agent_state = self.get_state()

            if not agent_state:
                logger.info("No state was found.")
                return polyaxon_sdk.V1AgentStateResponse()
            logger.info("Starting runs submission process.")

            state = agent_state.state
            for section, handler_name in dispatch:
                for run_data in getattr(state, section) or []:
                    pool.submit(getattr(self, handler_name), run_data)
            return agent_state
        except Exception as exc:
            logger.error(exc)
            return polyaxon_sdk.V1AgentStateResponse()
Пример #20
0
    def make_and_create_run(self,
                            run_data: Tuple[str, str, str, str],
                            default_auth: bool = False):
        """Build a run resource and create it through the spawner.

        `run_data` is read by index: run instance, run kind, run name and
        content. Nothing is created when `make_run_resource` returns a falsy
        value. Errors raised by the spawner are logged, never raised.
        """
        owner, project, uuid = get_run_info(
            run_instance=run_data[0])
        resource = self.make_run_resource(
            owner_name=owner,
            project_name=project,
            run_name=run_data[2],
            run_uuid=uuid,
            content=run_data[3],
            default_auth=default_auth,
        )
        if resource:
            try:
                self.spawner.create(run_uuid=uuid,
                                    run_kind=run_data[1],
                                    resource=resource)
            except ApiException as api_err:
                # NOTE(review): the 409 message mentions an apply mechanism,
                # but nothing is applied here - confirm the intent.
                if api_err.status == 409:
                    notice = "Run already running, triggering an apply mechanism."
                else:
                    notice = "Run submission error."
                logger.info(notice)
            except Exception as err:
                # NOTE(review): the wording looks borrowed from a cleanup or
                # conversion path - confirm it fits a creation failure.
                logger.info(
                    "Run could not be cleaned. Agent failed converting run manifest: {}\n{}"
                    .format(repr(err), traceback.format_exc()))
Пример #21
0
 def _on_close(ws):
     logger.info("Session ended")
Пример #22
0
def login(token, username, password):
    """Login to Polyaxon.

    With a `username` and no `token`, logs in with username/password
    (prompting for the password if needed). Otherwise a token is used,
    prompting for one (after opening the token page in the browser) when
    none was given. On success the user info is loaded, the access token is
    stored through `AuthConfigManager` and the CLI config is reset.

    Exits with status 1 when the password is empty or when the API calls
    fail; returns early when no usable token is obtained.
    """
    polyaxon_client = PolyaxonClient()
    if username and not token:
        # Use user or email / password login
        if not password:
            password = click.prompt(
                "Please enter your password", type=str, hide_input=True
            )
            password = password.strip()
            if not password:
                logger.info(
                    "You entered an empty string. "
                    "Please make sure you enter your password correctly."
                )
                sys.exit(1)

        try:
            body = V1Credentials(username=username, password=password)
            access_auth = polyaxon_client.auth_v1.login(body=body)
        except (ApiException, HTTPError) as e:
            AuthConfigManager.purge()
            CliConfigManager.purge()
            handle_cli_error(e, message="Could not login.")
            sys.exit(1)

        if not access_auth.token:
            Printer.print_error("Failed to login")
            return
    else:
        if not token:
            token_url = "{}/profile/token".format(
                clean_host(polyaxon_client.config.host)
            )
            click.confirm(
                "Authentication token page will now open in your browser. Continue?",
                abort=True,
                default=True,
            )

            click.launch(token_url)
            logger.info("Please copy and paste the authentication token.")
            token = click.prompt(
                "This is an invisible field. Paste token and press ENTER",
                type=str,
                hide_input=True,
            )

        # Normalise before the emptiness check, as is done for the password
        # above, so a whitespace-only token is reported as empty instead of
        # being sent to the API.
        if token:
            token = token.strip()

        if not token:
            logger.info(
                "Empty token received. "
                "Make sure your shell is handling the token appropriately."
            )
            logger.info(
                "See docs for help: http://polyaxon.com/docs/polyaxon_cli/commands/auth"
            )
            return

        access_auth = polyaxon_sdk.models.V1Auth(token=token)

    # Set user
    try:
        AuthConfigManager.purge()
        polyaxon_client = PolyaxonClient(token=access_auth.token)
        user = polyaxon_client.users_v1.get_user()
    except (ApiException, HTTPError) as e:
        handle_cli_error(e, message="Could not load user info.")
        sys.exit(1)
    access_token = AccessTokenConfig(username=user.username, token=access_auth.token)
    AuthConfigManager.set_config(access_token)
    polyaxon_client.config.token = access_auth.token
    Printer.print_success("Login successful")

    # Reset current cli
    server_versions = get_server_versions(polyaxon_client=polyaxon_client)
    current_version = get_current_version()
    log_handler = get_log_handler(polyaxon_client=polyaxon_client)
    CliConfigManager.reset(
        check_count=0,
        current_version=current_version,
        server_versions=server_versions.to_dict(),
        log_handler=log_handler,
    )
Пример #23
0
 def end(self):
     """Flag the shutdown as graceful and log that the agent is stopping."""
     shutdown_notice = "Agent is shutting down."
     self._graceful_shutdown = True
     logger.info(shutdown_notice)
Пример #24
0
 def _exit_handler(*args, **kwargs) -> None:
     """Log the interrupt and set `exit_event`; all arguments are ignored."""
     interrupt_notice = "Keyboard Interrupt received, exiting pool."
     logger.info(interrupt_notice)
     exit_event.set()
Пример #25
0
 def end(self, sleep: int = None):
     """Flag the shutdown as graceful, then either sleep or log the shutdown.

     Args:
         sleep: seconds to sleep before returning. When falsy, no sleep
             happens and the shutdown message is logged instead.
     """
     self._graceful_shutdown = True
     if not sleep:
         logger.info("Agent is shutting down.")
         return
     time.sleep(sleep)
Пример #26
0
def tuner():
    """Log that new suggestions are being created; does nothing else."""
    progress_notice = "Creating new suggestions..."
    logger.info(progress_notice)
Пример #27
0
async def start_sidecar(
    container_id: str,
    sleep_interval: int,
    sync_interval: int,
    monitor_outputs: bool,
    monitor_logs: bool,
):
    """Watch a container and periodically sync its logs and outputs.

    Args:
        container_id: container whose running state is polled.
        sleep_interval: seconds slept between two polls.
        sync_interval: passed through `get_sync_interval`; the result is the
            number of polls between two syncs.
        monitor_outputs: when true, artifacts and summaries are synced.
        monitor_logs: when true, logs are synced.

    The loop ends once the container is no longer running or after repeated
    `ApiException` failures (while `retry <= 3`). A final sync is always
    attempted afterwards, and the k8s manager is closed on every exit path
    once it has been set up.

    Raises:
        PolyaxonContainerException: if the pod id is missing from the
            environment or the run info cannot be resolved.
    """
    sync_interval = get_sync_interval(
        interval=sync_interval, sleep_interval=sleep_interval
    )
    try:
        pod_id = os.environ[POLYAXON_KEYS_K8S_POD_ID]
    except KeyError as e:
        raise PolyaxonContainerException(
            "Please make sure that this job has been "
            "started by Polyaxon with all required context."
        ) from e

    try:
        owner, project, run_uuid = get_run_info()
    except PolyaxonClientException as e:
        raise PolyaxonContainerException(e) from e

    client = RunClient(owner=owner, project=project, run_uuid=run_uuid)
    k8s_manager = AsyncK8SManager(namespace=CLIENT_CONFIG.namespace, in_cluster=True)
    await k8s_manager.setup()

    # From here on the manager must be closed whatever happens.
    try:
        pod = await k8s_manager.get_pod(pod_id, reraise=True)

        retry = 1
        is_running = True
        counter = 0
        state = {
            "last_artifacts_check": None,
            "last_logs_check": None,
        }

        async def monitor():
            # `is_running` is read from the enclosing scope at call time.
            if monitor_logs:
                await sync_logs(
                    run_uuid=run_uuid,
                    k8s_manager=k8s_manager,
                    pod=pod,
                    last_time=None,
                    stream=True,
                    is_running=is_running,
                )
            if monitor_outputs:
                last_check = state["last_artifacts_check"]
                state["last_artifacts_check"] = sync_artifacts(
                    last_check=last_check,
                    run_uuid=run_uuid,
                )
                sync_summaries(
                    last_check=last_check,
                    run_uuid=run_uuid,
                    client=client,
                )

        while is_running and retry <= 3:
            await asyncio.sleep(sleep_interval)
            try:
                is_running = await k8s_manager.is_pod_running(pod_id, container_id)
            except ApiException as e:
                retry += 1
                logger.info("Exception %s", repr(e))
                logger.info("Sleeping ...")
                # Back off a little longer after each consecutive failure.
                await asyncio.sleep(retry)
                continue

            logger.debug("Syncing ...")
            if is_running:
                # A successful poll resets the failure counter.
                retry = 1

            counter += 1
            if counter == sync_interval:
                counter = 0
                try:
                    await monitor()
                except Exception as e:
                    logger.warning("Polyaxon sidecar error: %s", repr(e))

        # Final sync to pick up whatever was produced since the last one.
        await monitor()
        logger.info("Cleaning non main containers")
    finally:
        await k8s_manager.close()
Пример #28
0
def start_sidecar(
    container_id: str,
    sleep_interval: int,
    sync_interval: int,
    monitor_outputs: bool,
    monitor_logs: bool,
):
    """Watch a container and periodically sync its outputs and logs.

    Args:
        container_id: container whose running state is polled.
        sleep_interval: seconds slept between two polls.
        sync_interval: passed through `get_sync_interval`; the result is the
            number of polls between two syncs.
        monitor_outputs: when true, artifacts and summaries are synced.
        monitor_logs: when true, logs are synced.

    The loop ends once the container is no longer running or after repeated
    consecutive `ApiException` failures (while `retry <= 3`). A final sync
    is always attempted afterwards.

    Raises:
        PolyaxonContainerException: if the run info cannot be resolved or no
            pod id is configured.
    """
    sync_interval = get_sync_interval(interval=sync_interval,
                                      sleep_interval=sleep_interval)

    try:
        owner, project, run_uuid = get_run_info()
    except PolyaxonClientException as e:
        raise PolyaxonContainerException(e) from e

    client = RunClient(owner=owner, project=project, run_uuid=run_uuid)
    pod_id = CLIENT_CONFIG.pod_id
    if not pod_id:
        raise PolyaxonContainerException(
            "Please make sure that this job has been "
            "started by Polyaxon with all required context.")

    k8s_manager = K8SManager(namespace=CLIENT_CONFIG.namespace,
                             in_cluster=True)
    retry = 1
    is_running = True
    counter = 0
    state = {
        "last_artifacts_check": None,
        "last_logs_check": None,
    }

    def monitor():
        if monitor_outputs:
            last_check = state["last_artifacts_check"]
            state["last_artifacts_check"] = sync_artifacts(
                last_check=last_check,
                run_uuid=run_uuid,
            )
            sync_summaries(
                last_check=last_check,
                run_uuid=run_uuid,
                client=client,
            )

        if monitor_logs:
            state["last_logs_check"] = sync_logs(
                k8s_manager=k8s_manager,
                client=client,
                last_check=state["last_logs_check"],
                run_uuid=run_uuid,
                pod_id=pod_id,
                container_id=container_id,
                owner=owner,
                project=project,
            )

    while is_running and retry <= 3:
        time.sleep(sleep_interval)
        try:
            is_running = is_pod_running(k8s_manager, pod_id, container_id)
        except ApiException as e:
            retry += 1
            logger.info("Exception %s", repr(e))
            logger.info("Sleeping ...")
            # Back off a little longer after each consecutive failure.
            time.sleep(1 * retry)
            # Skip the rest of the cycle: `is_running` still holds the value
            # from the last successful poll, and falling through would reset
            # `retry`, so the retry limit would never be reached.
            continue

        logger.debug("Syncing ...")
        if is_running:
            # A successful poll resets the failure counter.
            retry = 1

        counter += 1
        if counter == sync_interval:
            counter = 0
            try:
                monitor()
            except Exception as e:
                # `%s`, not `%e`: the argument is an exception, not a float.
                logger.warning("Polyaxon sidecar error: %s", e)

    # Final sync to pick up whatever was produced since the last one.
    monitor()
    logger.info("Cleaning non main containers")