示例#1
0
 def k8s_manager(self):
     """Return the cached ``AsyncK8SManager``, building it when none is set.

     A new manager is created from ``self.namespace`` and ``self.in_cluster``
     whenever ``self._k8s_manager`` is falsy, then stored for later calls.
     """
     manager = self._k8s_manager
     if manager:
         return manager

     manager = AsyncK8SManager(
         namespace=self.namespace,
         in_cluster=self.in_cluster,
     )
     self._k8s_manager = manager
     return manager
示例#2
0
async def collect_logs(request):
    """Collect a run's logs from Kubernetes and upload them.

    Reads ``run_uuid`` from the request path, looks up the matching k8s
    operation, queries its logs and uploads them. The k8s manager is closed
    as soon as the cluster queries are done, including when the resource is
    missing or a query raises.

    Args:
        request: incoming request; must provide ``path_params["run_uuid"]``.

    Returns:
        An empty ``Response`` when no logs were found, otherwise a
        ``Response`` carrying a background task that runs ``clean_tmp_logs``
        for the run.

    Raises:
        HTTPException: 400 when the k8s operation is not found or when
            uploading the logs fails.
    """
    run_uuid = request.path_params["run_uuid"]
    resource_name = get_resource_name(run_uuid=run_uuid)
    k8s_manager = AsyncK8SManager(
        namespace=settings.CLIENT_CONFIG.namespace,
        in_cluster=settings.CLIENT_CONFIG.in_cluster,
    )
    await k8s_manager.setup()
    try:
        k8s_operation = await get_k8s_operation(
            k8s_manager=k8s_manager, resource_name=resource_name
        )
        if not k8s_operation:
            raise HTTPException(
                detail="Run's logs was not collected, resource was not found.",
                status_code=status.HTTP_400_BAD_REQUEST,
            )
        operation_logs, _ = await query_k8s_operation_logs(
            instance=run_uuid, k8s_manager=k8s_manager, last_time=None
        )
    finally:
        # Release the manager on every path, not only on success.
        await k8s_manager.close()

    if not operation_logs:
        return Response()

    try:
        await upload_logs(run_uuid=run_uuid, logs=operation_logs)
    except Exception as e:
        # Chain the original error so the traceback keeps the root cause.
        raise HTTPException(
            detail=
            "Run's logs was not collected, an error was raised while uploading the data %s."
            % e,
            status_code=status.HTTP_400_BAD_REQUEST,
        ) from e
    task = BackgroundTask(clean_tmp_logs, run_uuid=run_uuid)
    return Response(background=task)
示例#3
0
async def collect_logs(request):
    """Collect a run's logs from Kubernetes and upload them in the background.

    Reads ``owner``, ``project`` and ``run_uuid`` from the request path,
    resolves the run instance and the k8s operation, and queries the
    operation logs. The k8s manager is closed once the cluster queries are
    done, including when the resource is missing or a query raises.

    Args:
        request: incoming request; must provide ``owner``, ``project`` and
            ``run_uuid`` in ``path_params``.

    Returns:
        An empty ``Response`` when no logs were found, otherwise a
        ``Response`` whose background task calls ``upload_logs`` with the
        collected logs.

    Raises:
        HTTPException: 400 when the k8s operation is not found.
    """
    owner = request.path_params["owner"]
    project = request.path_params["project"]
    run_uuid = request.path_params["run_uuid"]
    resource_name = get_resource_name(run_uuid=run_uuid)
    operation = get_run_instance(owner=owner,
                                 project=project,
                                 run_uuid=run_uuid)
    k8s_manager = AsyncK8SManager(
        namespace=settings.CLIENT_CONFIG.namespace,
        in_cluster=settings.CLIENT_CONFIG.in_cluster,
    )
    await k8s_manager.setup()
    try:
        k8s_operation = await get_k8s_operation(
            k8s_manager=k8s_manager, resource_name=resource_name
        )
        if not k8s_operation:
            raise HTTPException(
                detail="Run's logs was not collected, resource was not found.",
                status_code=status.HTTP_400_BAD_REQUEST,
            )
        operation_logs, _ = await get_k8s_operation_logs(
            operation=operation, k8s_manager=k8s_manager, last_time=None
        )
    finally:
        # Release the manager on every path, not only on success.
        await k8s_manager.close()

    if not operation_logs:
        return Response()

    task = BackgroundTask(upload_logs, run_uuid=run_uuid, logs=operation_logs)
    return Response(background=task)
示例#4
0
async def get_logs(request):
    """Return a page of logs for a run, live from k8s or from the archive.

    Path params ``owner``, ``project`` and ``run_uuid`` identify the run.
    Query params:
      * ``force``: when truthy, archived logs are fetched with
        ``check_cache=False``.
      * ``last_time``: timestamp parsed with ``dt_parser`` and passed on to
        the k8s log query.
      * ``last_file``: archive cursor; when set, k8s is not queried at all.

    The k8s manager, when one was created, is always closed before the
    response is built, even if a query raises.

    Returns:
        ``UJSONResponse`` with the dict form of a ``V1Logs`` holding
        ``last_time``, ``last_file`` and ``logs``.
    """
    owner = request.path_params["owner"]
    project = request.path_params["project"]
    run_uuid = request.path_params["run_uuid"]
    force = to_bool(request.query_params.get("force"), handle_none=True)
    resource_name = get_resource_name(run_uuid=run_uuid)
    operation = get_run_instance(owner=owner,
                                 project=project,
                                 run_uuid=run_uuid)
    # Parse the query string once and reuse it for both cursors.
    query = QueryParams(request.url.query)
    last_time = query.get("last_time")
    if last_time:
        last_time = dt_parser.parse(last_time).astimezone()
    last_file = query.get("last_file")

    k8s_manager = None
    k8s_operation = None
    if not last_file:
        k8s_manager = AsyncK8SManager(
            namespace=settings.CLIENT_CONFIG.namespace,
            in_cluster=settings.CLIENT_CONFIG.in_cluster,
        )
        await k8s_manager.setup()

    try:
        if k8s_manager is not None:
            k8s_operation = await get_k8s_operation(
                k8s_manager=k8s_manager, resource_name=resource_name
            )

        if not last_file and k8s_operation:
            last_file = None
            operation_logs, last_time = await get_k8s_operation_logs(
                operation=operation,
                last_time=last_time,
                k8s_manager=k8s_manager,
                stream=True,
            )
            # A completed operation has nothing more to stream.
            if k8s_operation["status"].get("completionTime"):
                last_time = None
        elif last_time:  # Streaming should stop
            last_file = None
            last_time = None
            operation_logs = []
        else:
            last_time = None
            operation_logs, last_file = await get_archived_operation_logs(
                run_uuid=run_uuid, last_file=last_file, check_cache=not force
            )
    finally:
        # Release the manager on every path, not only on success.
        if k8s_manager is not None:
            await k8s_manager.close()

    response = V1Logs(last_time=last_time,
                      last_file=last_file,
                      logs=operation_logs)
    return UJSONResponse(response.to_dict())
示例#5
0
async def collect_logs(request: Request) -> Response:
    """Collect a run's logs from Kubernetes and upload them.

    Reads ``run_uuid`` and ``run_kind`` from the request path, looks up the
    matching k8s operation, queries its logs and uploads them. The k8s
    manager is closed once the cluster queries are done, including on the
    early "resource was not found" return and when a query raises.

    Args:
        request: incoming request; must provide ``run_uuid`` and ``run_kind``
            in ``path_params``.

    Returns:
        * 400 ``UJSONResponse`` with an ``errors`` payload when the k8s
          operation is not found or the upload fails.
        * 404 ``Response`` when no logs were found.
        * ``Response`` with a ``clean_tmp_logs`` background task when
          ``settings.AGENT_CONFIG.is_replica`` is truthy.
        * 200 ``Response`` otherwise.
    """
    run_uuid = request.path_params["run_uuid"]
    run_kind = request.path_params["run_kind"]
    resource_name = get_resource_name_for_kind(run_uuid=run_uuid,
                                               run_kind=run_kind)
    k8s_manager = AsyncK8SManager(
        namespace=settings.CLIENT_CONFIG.namespace,
        in_cluster=settings.CLIENT_CONFIG.in_cluster,
    )
    await k8s_manager.setup()
    try:
        k8s_operation = await get_k8s_operation(
            k8s_manager=k8s_manager, resource_name=resource_name
        )
        if not k8s_operation:
            errors = "Run's logs was not collected, resource was not found."
            logger.warning(errors)
            return UJSONResponse(
                content={"errors": errors},
                status_code=status.HTTP_400_BAD_REQUEST,
            )
        operation_logs, _ = await query_k8s_operation_logs(
            instance=run_uuid, k8s_manager=k8s_manager, last_time=None
        )
    finally:
        # Runs for the early return above as well as for exceptions.
        await k8s_manager.close()

    if not operation_logs:
        return Response(status_code=status.HTTP_404_NOT_FOUND)

    try:
        await upload_logs(run_uuid=run_uuid, logs=operation_logs)
    except Exception as e:
        # Best-effort endpoint: report the failure to the caller instead of
        # propagating it.
        errors = (
            "Run's logs was not collected, an error was raised while uploading the data %s."
            % e)
        logger.warning(errors)
        return UJSONResponse(
            content={"errors": errors},
            status_code=status.HTTP_400_BAD_REQUEST,
        )
    if settings.AGENT_CONFIG.is_replica:
        task = BackgroundTask(clean_tmp_logs, run_uuid=run_uuid)
        return Response(background=task)
    return Response(status_code=status.HTTP_200_OK)
示例#6
0
async def get_logs(request: Request) -> UJSONResponse:
    """Return a page of logs for a run, live/tmp or from the archive.

    Path param ``run_uuid`` identifies the run. Query params:
      * ``force``: when truthy, archived logs are fetched with
        ``check_cache=False``.
      * ``last_time``: when present, it is parsed with ``parse_datetime``
        and logs are read from the k8s operation if it exists, otherwise via
        ``get_tmp_operation_logs``.
      * ``last_file``: archive cursor, used only when ``last_time`` is absent.

    The k8s manager is always closed after the live lookup, even if a query
    raises.

    Returns:
        ``UJSONResponse`` with the dict form of a ``V1Logs`` holding
        ``last_time``, ``last_file``, ``logs`` and ``files``.
    """
    run_uuid = request.path_params["run_uuid"]
    force = to_bool(request.query_params.get("force"), handle_none=True)
    # Parse the query string once and reuse it for both cursors.
    query = QueryParams(request.url.query)
    last_time = query.get("last_time")
    if last_time:
        last_time = parse_datetime(last_time).astimezone()
    last_file = query.get("last_file")
    files = []

    if last_time:
        resource_name = get_resource_name(run_uuid=run_uuid)

        k8s_manager = AsyncK8SManager(
            namespace=settings.CLIENT_CONFIG.namespace,
            in_cluster=settings.CLIENT_CONFIG.in_cluster,
        )
        await k8s_manager.setup()
        try:
            k8s_operation = await get_k8s_operation(
                k8s_manager=k8s_manager, resource_name=resource_name
            )
            if k8s_operation:
                operation_logs, last_time = await get_operation_logs(
                    k8s_manager=k8s_manager,
                    k8s_operation=k8s_operation,
                    instance=run_uuid,
                    last_time=last_time,
                )
            else:
                operation_logs, last_time = await get_tmp_operation_logs(
                    run_uuid=run_uuid, last_time=last_time
                )
        finally:
            # Release the manager on every path, not only on success.
            await k8s_manager.close()

    else:
        operation_logs, last_file, files = await get_archived_operation_logs(
            run_uuid=run_uuid, last_file=last_file, check_cache=not force
        )
    response = V1Logs(
        last_time=last_time, last_file=last_file, logs=operation_logs, files=files
    )
    return UJSONResponse(response.to_dict())
示例#7
0
async def start_sidecar(
    container_id: str,
    sleep_interval: int,
    sync_interval: int,
    monitor_outputs: bool,
    monitor_logs: bool,
) -> None:
    """Run the sidecar loop: poll the pod and periodically sync logs/outputs.

    The loop sleeps ``sleep_interval`` seconds between polls of
    ``k8s_manager.is_pod_running`` and triggers a sync every
    ``sync_interval`` polls (after normalisation by ``get_sync_interval``).
    It stops when the container is no longer running or after repeated
    ``ApiException`` failures, then performs one final sync. The k8s manager
    is always closed on exit, including when an error is raised.

    Args:
        container_id: container passed to ``is_pod_running``.
        sleep_interval: seconds to sleep between polls.
        sync_interval: requested number of polls between syncs.
        monitor_outputs: when true, sync artifacts and summaries.
        monitor_logs: when true, sync logs.

    Raises:
        PolyaxonContainerException: when the pod id environment variable is
            missing or the run info cannot be resolved.
    """
    sync_interval = get_sync_interval(
        interval=sync_interval, sleep_interval=sleep_interval
    )
    try:
        pod_id = os.environ[POLYAXON_KEYS_K8S_POD_ID]
    except KeyError as e:
        raise PolyaxonContainerException(
            "Please make sure that this job has been "
            "started by Polyaxon with all required context."
        ) from e

    try:
        owner, project, run_uuid = get_run_info()
    except PolyaxonClientException as e:
        # Chain the client error so the root cause stays in the traceback.
        raise PolyaxonContainerException(e) from e

    client = RunClient(owner=owner, project=project, run_uuid=run_uuid)
    k8s_manager = AsyncK8SManager(namespace=CLIENT_CONFIG.namespace, in_cluster=True)
    await k8s_manager.setup()
    try:
        pod = await k8s_manager.get_pod(pod_id, reraise=True)

        retry = 1
        is_running = True
        counter = 0
        state = {
            "last_artifacts_check": None,
            "last_logs_check": None,
        }

        async def monitor():
            # Reads ``is_running`` from the enclosing scope at call time, so
            # the final call after the loop sees the last polled value.
            if monitor_logs:
                await sync_logs(
                    run_uuid=run_uuid,
                    k8s_manager=k8s_manager,
                    pod=pod,
                    last_time=None,
                    stream=True,
                    is_running=is_running,
                )
            if monitor_outputs:
                last_check = state["last_artifacts_check"]
                state["last_artifacts_check"] = sync_artifacts(
                    last_check=last_check,
                    run_uuid=run_uuid,
                )
                sync_summaries(
                    last_check=last_check,
                    run_uuid=run_uuid,
                    client=client,
                )

        while is_running and retry <= 3:
            await asyncio.sleep(sleep_interval)
            try:
                is_running = await k8s_manager.is_pod_running(pod_id, container_id)
            except ApiException as e:
                # Back off a little longer after each consecutive failure.
                retry += 1
                logger.info("Exception %r", e)
                logger.info("Sleeping ...")
                await asyncio.sleep(retry)
                continue

            logger.debug("Syncing ...")
            if is_running:
                # A successful poll resets the failure budget.
                retry = 1

            counter += 1
            if counter == sync_interval:
                counter = 0
                try:
                    await monitor()
                except Exception as e:
                    # Periodic syncs are best-effort; keep the loop alive.
                    logger.warning("Polyaxon sidecar error: %r", e)

        # Final sync after the loop ends; errors here propagate to the caller.
        await monitor()
        logger.info("Cleaning non main containers")
    finally:
        # Release the manager on every path, not only on success.
        await k8s_manager.close()