Example #1
        def wrap(f):
            if iscoroutinefunction is not None and iscoroutinefunction(f):
                r = AsyncRetrying(*dargs, **dkw)
            elif (tornado and hasattr(tornado.gen, "is_coroutine_function")
                  and tornado.gen.is_coroutine_function(f)):
                r = TornadoRetrying(*dargs, **dkw)
            else:
                r = Retrying(*dargs, **dkw)

            return r.wraps(f)
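
This `wrap` helper is the dispatch step inside tenacity's `retry` decorator: it picks `AsyncRetrying` for coroutine functions, `TornadoRetrying` for Tornado coroutines, and plain `Retrying` otherwise. A minimal sketch of how it is typically reached, assuming only that tenacity is installed (the function and return value below are illustrative):

import asyncio
from tenacity import retry, stop_after_attempt, wait_fixed

@retry(stop=stop_after_attempt(3), wait=wait_fixed(0.1), reraise=True)
async def fetch_value() -> int:
    # fetch_value is a coroutine function, so wrap() selects AsyncRetrying
    return 42

asyncio.run(fetch_value())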
Example #2
        def wrap(f):
            if asyncio and asyncio.iscoroutinefunction(f):
                r = AsyncRetrying(*dargs, **dkw)
            elif tornado and hasattr(tornado.gen, 'is_coroutine_function') \
                    and tornado.gen.is_coroutine_function(f):
                r = TornadoRetrying(*dargs, **dkw)
            else:
                r = Retrying(*dargs, **dkw)

            return r.wraps(f)
Example #3
        def wrap(f):
            if asyncio and asyncio.iscoroutinefunction(f):
                r = AsyncRetrying(*dargs, **dkw)
            elif tornado and hasattr(tornado.gen, 'is_coroutine_function') \
                    and tornado.gen.is_coroutine_function(f):
                r = TornadoRetrying(*dargs, **dkw)
            else:
                r = Retrying(*dargs, **dkw)

            return r.wraps(f)
Example #4
async def test_local_dask_gateway_server(
        loop: AbstractEventLoop, local_dask_gateway_server: DaskGatewayServer):
    async with Gateway(
            local_dask_gateway_server.address,
            local_dask_gateway_server.proxy_address,
            asynchronous=True,
            auth=auth.BasicAuth("pytest_user",
                                local_dask_gateway_server.password),
    ) as gateway:
        print(f"--> {gateway=} created")
        cluster_options = await gateway.cluster_options()
        gateway_versions = await gateway.get_versions()
        clusters_list = await gateway.list_clusters()
        print(f"--> {gateway_versions=}, {cluster_options=}, {clusters_list=}")
        for option in cluster_options.items():
            print(f"--> {option=}")

        async with gateway.new_cluster() as cluster:
            assert cluster
            print(
                f"--> created new cluster {cluster=}, {cluster.scheduler_info=}"
            )
            NUM_WORKERS = 10
            await cluster.scale(NUM_WORKERS)
            print(f"--> scaling cluster {cluster=} to {NUM_WORKERS} workers")
            async for attempt in AsyncRetrying(reraise=True,
                                               wait=wait_fixed(0.24),
                                               stop=stop_after_delay(30)):
                with attempt:
                    print(
                        f"cluster {cluster=} now has {len(cluster.scheduler_info.get('workers', []))} workers"
                    )
                    assert len(cluster.scheduler_info.get("workers", {})) == NUM_WORKERS

            async with cluster.get_client() as client:
                print(f"--> created new client {client=}, submitting a job")
                res = await client.submit(lambda x: x + 1, 1)  # type: ignore
                assert res == 2

            print(f"--> scaling cluster {cluster=} back to 0")
            await cluster.scale(0)

            async for attempt in AsyncRetrying(reraise=True,
                                               wait=wait_fixed(0.24),
                                               stop=stop_after_delay(30)):
                with attempt:
                    print(
                        f"cluster {cluster=} now has {len(cluster.scheduler_info.get('workers', []))} workers"
                    )
                    assert len(cluster.scheduler_info.get("workers", {})) == 0
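
The pattern above recurs throughout these examples: poll inside `AsyncRetrying` and `assert` the condition, so a failed assertion triggers another attempt until the stop condition is reached. A generic helper distilled from it could look like this (a sketch; the name `wait_until` and its parameters are ours, not from the original code):

from tenacity import AsyncRetrying, stop_after_delay, wait_fixed

async def wait_until(predicate, timeout_s: float = 30.0, interval_s: float = 0.25):
    # by default tenacity retries on any exception, including AssertionError
    async for attempt in AsyncRetrying(reraise=True,
                                       stop=stop_after_delay(timeout_s),
                                       wait=wait_fixed(interval_s)):
        with attempt:
            assert predicate(), "condition not met yet"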
Example #5
        def wrap(f):
            if isinstance(f, retry_base):
                warnings.warn(
                    ("Got retry_base instance ({cls}) as callable argument, " +
                     "this will probably hang indefinitely (did you mean " +
                     "retry={cls}(...)?)").format(cls=f.__class__.__name__))
            if iscoroutinefunction is not None and iscoroutinefunction(f):
                r = AsyncRetrying(*dargs, **dkw)
            elif (tornado and hasattr(tornado.gen, "is_coroutine_function")
                  and tornado.gen.is_coroutine_function(f)):
                r = TornadoRetrying(*dargs, **dkw)
            else:
                r = Retrying(*dargs, **dkw)

            return r.wraps(f)
Example #6
async def _is_registry_reachable(registry_settings: RegistrySettings) -> None:
    async for attempt in AsyncRetrying(
            wait=wait_fixed(1),
            stop=stop_after_attempt(1),
            before_sleep=before_sleep_log(logger, logging.INFO),
            reraise=True,
    ):
        with attempt:
            async with httpx.AsyncClient() as client:
                params = {}
                if registry_settings.REGISTRY_AUTH:
                    params["auth"] = (
                        registry_settings.REGISTRY_USER,
                        registry_settings.REGISTRY_PW.get_secret_value(),
                    )

                protocol = "https" if registry_settings.REGISTRY_SSL else "http"
                url = f"{protocol}://{registry_settings.api_url}/"
                logger.info("Registry test url='%s'", url)
                response = await client.get(url, timeout=1, **params)
                reachable = (response.status_code == status.HTTP_200_OK
                             and response.json() == {})
                if not reachable:
                    logger.error("Response: %s", response)
                    error_message = (
                        f"Could not reach registry {registry_settings.api_url} "
                        f"auth={registry_settings.REGISTRY_AUTH}")
                    raise _RegistryNotReachableException(error_message)
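
Note that `stop_after_attempt(1)` stops after the first attempt, so this `AsyncRetrying` loop runs the body exactly once; with `reraise=True` a failure propagates directly instead of being wrapped in a `tenacity.RetryError`.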
Example #7
async def _assert_and_wait_for_pipeline_state(
    client: TestClient,
    project_id: str,
    expected_state: RunningState,
    expected_api_response: ExpectedResponse,
):
    url_project_state = client.app.router["state_project"].url_for(
        project_id=project_id)
    assert url_project_state == URL(f"/{API_VTAG}/projects/{project_id}/state")
    async for attempt in AsyncRetrying(
            reraise=True,
            stop=stop_after_delay(120),
            wait=wait_fixed(5),
            retry=retry_if_exception_type(AssertionError),
    ):
        with attempt:
            print(
                f"--> waiting for pipeline to complete with {expected_state=} attempt {attempt.retry_state.attempt_number}..."
            )
            resp = await client.get(f"{url_project_state}")
            data, error = await assert_status(resp, expected_api_response.ok)
            assert "state" in data
            assert "value" in data["state"]
            received_study_state = RunningState(data["state"]["value"])
            print(f"<-- received pipeline state: {received_study_state=}")
            assert received_study_state == expected_state
            print(
                f"--> pipeline completed with state {received_study_state=}! "
                f"That's great: {json_dumps(attempt.retry_state.retry_object.statistics)}",
            )
Example #8
async def test_publish_event(dask_client: distributed.Client):
    dask_pub = distributed.Pub("some_topic")
    dask_sub = distributed.Sub("some_topic")
    async for attempt in AsyncRetrying(
            reraise=True,
            retry=retry_if_exception_type(AssertionError),
            wait=wait_fixed(0.01),
            stop=stop_after_delay(60),
    ):
        with attempt:
            print(
                f"waiting for subscribers... attempt={attempt.retry_state.attempt_number}"
            )
            assert dask_pub.subscribers
            print("we do have subscribers!")

    event_to_publish = TaskLogEvent(job_id="some_fake_job_id", log="the log")
    publish_event(dask_pub=dask_pub, event=event_to_publish)
    # NOTE: this test runs a sync dask client,
    # and the CI sometimes has difficulties running this in a reasonable time,
    # hence the long timeout
    message = dask_sub.get(timeout=1)
    assert message is not None
    received_task_log_event = TaskLogEvent.parse_raw(message)  # type: ignore
    assert received_task_log_event == event_to_publish
Example #9
async def test_publish_to_user(
    logged_user: Dict[str, Any],
    other_project_id: UUIDStr,
    other_node_uuid: str,
    #
    socketio_subscriber_handlers: NamedTuple,
    publish_some_messages_in_rabbit: Callable[
        [UserID, UUIDStr, UUIDStr, int],
        Awaitable[Tuple[LogMessages, ProgressMessages, InstrumMessages]],
    ],
):
    mock_log_handler, mock_node_update_handler = socketio_subscriber_handlers

    # publish messages with correct user id, but no project
    log_messages, _, _ = await publish_some_messages_in_rabbit(
        logged_user["id"],
        other_project_id,
        other_node_uuid,
        NUMBER_OF_MESSAGES,
    )

    async for attempt in AsyncRetrying(**RETRY_POLICY):
        with attempt:
            assert mock_log_handler.call_count == (NUMBER_OF_MESSAGES)

    for mock_call, expected_message in zip(mock_log_handler.call_args_list,
                                           log_messages):
        value = mock_call[0]
        deserialized_value = json.loads(value[0])
        assert deserialized_value == json.loads(
            expected_message.json(include={"node_id", "messages"}))
    mock_node_update_handler.assert_not_called()
Example #10
async def test_publish_about_users_projects_node(
    logged_user: Dict[str, Any],
    user_project: Dict[str, Any],
    #
    socketio_subscriber_handlers: NamedTuple,
    publish_some_messages_in_rabbit: Callable[
        [UserID, UUIDStr, UUIDStr, int],
        Awaitable[Tuple[LogMessages, ProgressMessages, InstrumMessages]],
    ],
):
    mock_log_handler, mock_node_update_handler = socketio_subscriber_handlers

    # publish message with correct user id, project node
    node_uuid = list(user_project["workbench"])[0]
    log_messages, _, _ = await publish_some_messages_in_rabbit(
        logged_user["id"],
        user_project["uuid"],
        node_uuid,
        NUMBER_OF_MESSAGES,
    )

    async for attempt in AsyncRetrying(**RETRY_POLICY):
        with attempt:
            assert mock_log_handler.call_count == (NUMBER_OF_MESSAGES)
            assert mock_node_update_handler.call_count == (NUMBER_OF_MESSAGES)

    for mock_call, expected_message in zip(mock_log_handler.call_args_list,
                                           log_messages):
        value = mock_call[0]
        deserialized_value = json.loads(value[0])
        assert deserialized_value == json.loads(
            expected_message.json(include={"node_id", "messages"}))

    # mock_log_handler.assert_has_calls(log_calls, any_order=True)
    mock_node_update_handler.assert_called()
    assert mock_node_update_handler.call_count == (NUMBER_OF_MESSAGES)
Example #11
async def wait_till_service_healthy(service_name: str, endpoint: URL):

    log.info(
        "Connecting to %s",
        f"{service_name=} at {endpoint=}",
    )
    async for attempt in AsyncRetrying(
            # randomizing healthchecks sampling helps parallel execution
            wait=wait_random(1, 2),
            # sets the timeout for a service to become healthy
            stop=stop_after_delay(2 * MINUTE),
            before_sleep=before_sleep_log(log, logging.WARNING),
            reraise=True,
    ):
        with attempt:
            async with aiohttp.ClientSession(
                    timeout=_ONE_SEC_TIMEOUT) as session:
                async with session.get(endpoint) as response:
                    # NOTE: health-check endpoints require only a status code 200
                    # (see e.g. services/web/server/docker/healthcheck.py)
                    # regardless of the payload content
                    assert (
                        response.status == 200
                    ), f"Connection to {service_name=} at {endpoint=} failed with {response=}"

            log.info(
                "Connection to %s succeeded [%s]",
                f"{service_name=} at {endpoint=}",
                json.dumps(attempt.retry_state.retry_object.statistics),
            )
Example #12
async def test_websocket_disconnected_remove_or_maintain_files_based_on_role(
    client,
    logged_user,
    empty_user_project,
    mocked_director_v2_api,
    create_dynamic_service_mock,
    client_session_id_factory: Callable[[], str],
    socketio_client_factory: Callable,
    # asyncpg_storage_system_mock,
    storage_subsystem_mock,  # when guest user logs out garbage is collected
    expect_call: bool,
    exp_save_state: bool,
):
    set_service_deletion_delay(SERVICE_DELETION_DELAY, client.server.app)
    # login - logged_user fixture
    # create empty study - empty_user_project fixture
    # create dynamic service - create_dynamic_service_mock fixture
    service = await create_dynamic_service_mock(logged_user["id"],
                                                empty_user_project["uuid"])
    # create websocket
    client_session_id1 = client_session_id_factory()
    sio: socketio.AsyncClient = await socketio_client_factory(
        client_session_id1)
    # open project in client 1
    await open_project(client, empty_user_project["uuid"], client_session_id1)
    # logout
    logout_url = client.app.router["auth_logout"].url_for()
    r = await client.post(logout_url,
                          json={"client_session_id": client_session_id1})
    assert r.url_obj.path == logout_url.path
    await assert_status(r, web.HTTPOk)

    # ensure sufficient time is wasted here
    await asyncio.sleep(SERVICE_DELETION_DELAY + 1)
    await garbage_collector.collect_garbage(client.app)

    # assert dynamic service is removed
    calls = [
        call(
            app=client.server.app,
            save_state=exp_save_state,
            service_uuid=service["service_uuid"],
        )
    ]
    mocked_director_v2_api["director_v2_core.stop_service"].assert_has_calls(
        calls)

    # this call is done async, so wait a bit here to ensure it is correctly done
    async for attempt in AsyncRetrying(reraise=True,
                                       stop=stop_after_delay(10)):
        with attempt:
            if expect_call:
                # make sure `delete_project` is called
                storage_subsystem_mock[1].assert_called_once()
                # make sure `delete_user` is called
                # asyncpg_storage_system_mock.assert_called_once()
            else:
                # make sure `delete_project` not called
                storage_subsystem_mock[1].assert_not_called()
Example #13
async def _wait_for_call(mocked_fct):
    async for attempt in AsyncRetrying(
            stop=stop_after_delay(10),
            wait=wait_random(0, 1),
            retry=retry_if_exception_type(AssertionError),
            reraise=True,
    ):
        with attempt:
            print(f"waiting for call in mocked fct {mocked_fct}, "
                  f"Attempt={attempt.retry_state.attempt_number}")
            mocked_fct.assert_called_once()
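
A hypothetical usage of this helper (a sketch; it assumes the tenacity imports shown in the example plus the standard library):

import asyncio
from unittest.mock import MagicMock

async def _example():
    mocked_fct = MagicMock()
    # schedule a background call half a second from now...
    asyncio.get_running_loop().call_later(0.5, mocked_fct)
    # ...and poll until assert_called_once() passes
    await _wait_for_call(mocked_fct)

asyncio.run(_example())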
Example #14
async def assert_and_wait_for_pipeline_status(
    client: httpx.AsyncClient,
    url: AnyHttpUrl,
    user_id: UserID,
    project_uuid: UUID,
    wait_for_states: Optional[List[RunningState]] = None,
) -> ComputationTaskGet:
    if not wait_for_states:
        wait_for_states = [
            RunningState.SUCCESS,
            RunningState.FAILED,
            RunningState.ABORTED,
        ]
    MAX_TIMEOUT_S = 5 * MINUTE

    async def check_pipeline_state() -> ComputationTaskGet:
        response = await client.get(url, params={"user_id": user_id})
        assert (
            response.status_code == status.HTTP_202_ACCEPTED
        ), f"response code is {response.status_code}, error: {response.text}"
        task_out = ComputationTaskGet.parse_obj(response.json())
        assert task_out.id == project_uuid
        assert task_out.url == f"{client.base_url}/v2/computations/{project_uuid}"
        print(
            f"Pipeline '{project_uuid=}' current task out is '{task_out=}'",
        )
        assert wait_for_states
        assert (
            task_out.state in wait_for_states
        ), f"current task state is '{task_out.state}', not in any of {wait_for_states}"
        return task_out

    start = time.monotonic()
    async for attempt in AsyncRetrying(
        stop=stop_after_delay(MAX_TIMEOUT_S),
        wait=wait_fixed(2),
        retry=retry_if_exception_type(AssertionError),
        reraise=True,
    ):
        elapsed_s = time.monotonic() - start
        with attempt:
            print(
                f"Waiting for pipeline '{project_uuid=}' state to be one of: {wait_for_states=}, attempt={attempt.retry_state.attempt_number}, time={elapsed_s}s"
            )
            task_out = await check_pipeline_state()
            print(
                f"Pipeline '{project_uuid=}' state succesfuly became '{task_out.state}'\n{json.dumps(attempt.retry_state.retry_object.statistics, indent=2)}, time={elapsed_s}s"
            )

            return task_out

    # this is only to satisfy pylance
    raise AssertionError("No computation task generated!")
Example #15
 async def _check_all_services_are_running():
     async for attempt in AsyncRetrying(
             wait=wait_fixed(5),
             stop=stop_after_delay(8 * MINUTE),
             before_sleep=before_sleep_log(log, logging.INFO),
             reraise=True,
     ):
         with attempt:
             await asyncio.gather(*[
                 asyncio.get_event_loop().run_in_executor(
                     None, assert_service_is_running, service)
                 for service in docker_client.services.list()
             ])
Example #16
async def test_interactive_services_removed_after_logout(
    client: TestClient,
    logged_user: Dict[str, Any],
    empty_user_project: Dict[str, Any],
    mocked_director_v2_api: Dict[str, mock.MagicMock],
    create_dynamic_service_mock,
    client_session_id_factory: Callable[[], str],
    socketio_client_factory: Callable,
    storage_subsystem_mock: MockedStorageSubsystem,  # when guest user logs out garbage is collected
    director_v2_service_mock: aioresponses,
    expected_save_state: bool,
):
    # login - logged_user fixture
    # create empty study - empty_user_project fixture
    # create dynamic service - create_dynamic_service_mock fixture
    service = await create_dynamic_service_mock(logged_user["id"],
                                                empty_user_project["uuid"])
    # create websocket
    client_session_id1 = client_session_id_factory()
    sio = await socketio_client_factory(client_session_id1)
    # open project in client 1
    await open_project(client, empty_user_project["uuid"], client_session_id1)
    # logout
    logout_url = client.app.router["auth_logout"].url_for()
    r = await client.post(f"{logout_url}",
                          json={"client_session_id": client_session_id1})
    assert r.url_obj.path == logout_url.path
    await assert_status(r, web.HTTPOk)

    # check result performed by background task
    await asyncio.sleep(SERVICE_DELETION_DELAY + 1)
    await garbage_collector_core.collect_garbage(client.app)

    # assert dynamic service is removed (this is done in a fire-and-forget way, so give it a bit of leeway)
    async for attempt in AsyncRetrying(reraise=True,
                                       stop=stop_after_attempt(10),
                                       wait=wait_fixed(1)):
        with attempt:
            logger.warning(
                "Waiting for stop to have been called service_uuid=%s, save_state=%s",
                service["service_uuid"],
                expected_save_state,
            )
            mocked_director_v2_api[
                "director_v2_core.stop_service"].assert_awaited_with(
                    app=client.server.app,
                    service_uuid=service["service_uuid"],
                    save_state=expected_save_state,
                )
Example #17
 async def create(
     cls,
     app: FastAPI,
     settings: DaskSchedulerSettings,
     endpoint: AnyUrl,
     authentication: ClusterAuthentication,
 ) -> "DaskClient":
     logger.info(
         "Initiating connection to %s with auth: %s",
         f"dask-scheduler/gateway at {endpoint}",
         authentication,
     )
     async for attempt in AsyncRetrying(
             reraise=True,
             before_sleep=before_sleep_log(logger, logging.WARNING),
             wait=wait_fixed(0.3),
             stop=stop_after_attempt(3),
     ):
         with attempt:
             logger.debug(
                 "Connecting to %s, attempt %s...",
                 endpoint,
                 attempt.retry_state.attempt_number,
             )
             dask_subsystem = await _create_internal_client_based_on_auth(
                 endpoint, authentication)
             check_scheduler_status(dask_subsystem.client)
             instance = cls(
                 app=app,
                 dask_subsystem=dask_subsystem,
                 settings=settings,
                 cancellation_dask_pub=distributed.Pub(
                     TaskCancelEvent.topic_name(),
                     client=dask_subsystem.client),
             )
             logger.info(
                 "Connection to %s succeeded [%s]",
                 f"dask-scheduler/gateway at {endpoint}",
                 json.dumps(attempt.retry_state.retry_object.statistics),
             )
             logger.info(
                 "Scheduler info:\n%s",
                 json.dumps(dask_subsystem.client.scheduler_info(),
                            indent=2),
             )
             return instance
     # this is to satisfy pylance
     raise ValueError("Could not create client")
Example #18
        def wrap(f: WrappedFn) -> WrappedFn:
            if isinstance(f, retry_base):
                warnings.warn(
                    f"Got retry_base instance ({f.__class__.__name__}) as callable argument, "
                    f"this will probably hang indefinitely (did you mean retry={f.__class__.__name__}(...)?)"
                )
            if iscoroutinefunction(f):
                r: "BaseRetrying" = AsyncRetrying(*dargs, **dkw)
            elif tornado and hasattr(
                    tornado.gen, "is_coroutine_function"
            ) and tornado.gen.is_coroutine_function(f):
                r = TornadoRetrying(*dargs, **dkw)
            else:
                r = Retrying(*dargs, **dkw)

            return r.wraps(f)
Example #19
async def _request_director_v2(
    app: web.Application,
    method: str,
    url: URL,
    expected_status: Type[web.HTTPSuccessful] = web.HTTPOk,
    headers: Optional[Dict[str, str]] = None,
    data: Optional[Any] = None,
    **kwargs,
) -> DataBody:

    try:
        async for attempt in AsyncRetrying(**DEFAULT_RETRY_POLICY):
            with attempt:
                session = get_client_session(app)
                async with session.request(method,
                                           url,
                                           headers=headers,
                                           json=data,
                                           **kwargs) as response:
                    payload = (await response.json()
                               if response.content_type == "application/json"
                               else await response.text())
                    # NOTE:
                    # - sometimes `director-v0` (via redirects) replies
                    #   in plain text and this is considered an error
                    # - `director-v2` and `director-v0` can reply with 204 no content
                    if response.status != expected_status.status_code or isinstance(
                            payload, str):
                        raise DirectorServiceError(response.status,
                                                   reason=f"{payload}")
                    return payload

    # TODO: enrich with https://docs.aiohttp.org/en/stable/client_reference.html#hierarchy-of-exceptions
    except asyncio.TimeoutError as err:
        raise DirectorServiceError(
            web.HTTPServiceUnavailable.status_code,
            reason=f"request to director-v2 timed-out: {err}",
        ) from err

    except aiohttp.ClientError as err:
        raise DirectorServiceError(
            web.HTTPServiceUnavailable.status_code,
            reason=f"request to director-v2 service unexpected error {err}",
        ) from err
    log.error("Unexpected result calling %s, %s", f"{url=}", f"{method=}")
    raise DirectorServiceError(web.HTTPClientError.status_code,
                               reason="Unexpected client error")
Example #20
async def setup_director(app: FastAPI) -> None:
    if settings := app.state.settings.CATALOG_DIRECTOR:
        # init client-api
        logger.debug("Setup director at %s...", settings.base_url)
        director_client = DirectorApi(base_url=settings.base_url, app=app)

        # check that the director is accessible
        async for attempt in AsyncRetrying(**director_startup_retry_policy):
            with attempt:
                if not await director_client.is_responsive():
                    raise ValueError("Director-v0 is not responsive")

                logger.info(
                    "Connection to director-v0 succeded [%s]",
                    json_dumps(attempt.retry_state.retry_object.statistics),
                )

        app.state.director_api = director_client
Example #21
async def _assert_and_wait_for_comp_task_states_to_be_transmitted_in_projects(
    project_id: str,
    postgres_session: sa.orm.session.Session,
):

    async for attempt in AsyncRetrying(
            reraise=True,
            stop=stop_after_delay(120),
            wait=wait_fixed(5),
            retry=retry_if_exception_type(AssertionError),
    ):
        with attempt:
            print(
                f"--> waiting for pipeline results to move to projects table, attempt {attempt.retry_state.attempt_number}..."
            )
            comp_tasks_in_db: Dict[NodeIdStr, Any] = _get_computational_tasks_from_db(
                project_id, postgres_session)
            workbench_in_db: Dict[NodeIdStr, Any] = _get_project_workbench_from_db(
                project_id, postgres_session)
            for node_id, node_values in comp_tasks_in_db.items():
                assert (
                    node_id in workbench_in_db
                ), f"node {node_id=} is missing from workbench {json_dumps(workbench_in_db, indent=2)}"

                node_in_project_table = workbench_in_db[node_id]

                # if this one is in, the other should also be but let's check it carefully
                assert node_values.run_hash
                assert "runHash" in node_in_project_table
                assert node_values.run_hash == node_in_project_table["runHash"]

                assert node_values.state
                assert "state" in node_in_project_table
                assert "currentStatus" in node_in_project_table["state"]
                # NOTE: beware that the comp_tasks has StateType and Workbench has RunningState (sic)
                assert (DB_TO_RUNNING_STATE[node_values.state].value ==
                        node_in_project_table["state"]["currentStatus"])
            print(
                "--> tasks were properly transferred! "
                f"That's great: {json_dumps(attempt.retry_state.retry_object.statistics)}",
            )
Example #22
async def assert_service_is_available(  # pylint: disable=redefined-outer-name
        exposed_port: PositiveInt, is_legacy: bool, service_uuid: str) -> None:
    service_address = (f"http://{get_ip()}:{exposed_port}/x/{service_uuid}"
                       if is_legacy else f"http://{get_ip()}:{exposed_port}")
    print(f"checking service @ {service_address}")

    async for attempt in AsyncRetrying(wait=wait_fixed(1),
                                       stop=stop_after_attempt(60),
                                       reraise=True):
        with attempt:
            async with httpx.AsyncClient() as client:
                response = await client.get(service_address)
                print(
                    f"{SEPARATOR}\nAttempt={attempt.retry_state.attempt_number}"
                )
                print(
                    f"Body:\n{response.text}\nHeaders={response.headers}\n{SEPARATOR}"
                )
                assert response.status_code == httpx.codes.OK, response.text
Example #23
async def assemble_cached_indexes(app: web.Application):
    """
    Currently the static resources contain 3 folders: osparc, s4l, tis.
    Each of them contains an index.html to be served as the root of the site
    for each type of frontend.

    These 3 indexes are cached on startup.
    """
    settings: StaticWebserverModuleSettings = get_plugin_settings(app)
    cached_indexes: Dict[str, str] = {}

    session: ClientSession = get_client_session(app)

    for frontend_name in FRONTEND_APPS_AVAILABLE:
        url = URL(settings.STATIC_WEBSERVER_URL) / frontend_name
        log.info("Fetching index from %s", url)
        try:
            body = ""
            # web-static server might still not be up
            async for attempt in AsyncRetrying(
                **_STATIC_WEBSERVER_RETRY_ON_STARTUP_POLICY):
                with attempt:
                    response = await session.get(url, raise_for_status=True)
                    body = await response.text()

        except ClientError as err:
            log.error("Could not fetch index from static server: %s", err)

            # ANE: Yes this is supposed to fail the boot process
            raise RuntimeError(
                f"Could not fetch index at {str(url)}. Stopping application boot"
            ) from err
        else:
            # fixes relative paths
            body = body.replace(f"../resource/{frontend_name}",
                                f"resource/{frontend_name}")
            body = body.replace("boot.js", f"{frontend_name}/boot.js")

            log.info("Storing index for %s", url)
            cached_indexes[frontend_name] = body

    app[APP_FRONTEND_CACHED_INDEXES_KEY] = cached_indexes
Example #24
async def _assert_wait_for_task_status(
    job_id: str,
    dask_client: DaskClient,
    expected_status: RunningState,
    timeout: Optional[int] = None,
):
    async for attempt in AsyncRetrying(
        reraise=True,
        stop=stop_after_delay(timeout or _ALLOW_TIME_FOR_GATEWAY_TO_CREATE_WORKERS),
        wait=wait_fixed(1),
    ):
        with attempt:
            print(
                f"waiting for task to be {expected_status=}, "
                f"Attempt={attempt.retry_state.attempt_number}"
            )
            current_task_status = await dask_client.get_task_status(job_id)
            assert isinstance(current_task_status, RunningState)
            print(f"{current_task_status=} vs {expected_status=}")
            assert current_task_status == expected_status
Example #25
    async def _create_client(address: str) -> aioredis.Redis:
        client: Optional[aioredis.Redis] = None

        async for attempt in AsyncRetrying(
                stop=stop_after_delay(1 * _MINUTE),
                wait=wait_fixed(_WAIT_SECS),
                before_sleep=before_sleep_log(log, logging.WARNING),
                reraise=True,
        ):
            with attempt:
                client = await aioredis.create_redis_pool(address,
                                                          encoding="utf-8")
                log.info(
                    "Connection to %s succeeded with %s [%s]",
                    f"redis at {address=}",
                    f"{client=}",
                    json.dumps(attempt.retry_state.retry_object.statistics),
                )
        assert client  # nosec
        return client
Example #26
async def test_listen_comp_tasks_task(
    mock_project_subsystem: Dict,
    comp_task_listening_task: None,
    client,
    update_values: Dict[str, Any],
    expected_calls: List[str],
    task_class: NodeClass,
):
    db_engine: aiopg.sa.Engine = client.app[APP_DB_ENGINE_KEY]
    async with db_engine.acquire() as conn:
        # let's put some stuff in there now
        result = await conn.execute(
            comp_tasks.insert()
            .values(outputs=json.dumps({}), node_class=task_class)
            .returning(literal_column("*"))
        )
        row: RowProxy = await result.fetchone()
        task = dict(row)

        # let's update some values
        await conn.execute(
            comp_tasks.update()
            .values(**update_values)
            .where(comp_tasks.c.task_id == task["task_id"])
        )

        # tests whether listener gets hooked calls executed
        for call_name, mocked_call in mock_project_subsystem.items():
            if call_name in expected_calls:
                async for attempt in AsyncRetrying(
                    wait=wait_fixed(1),
                    stop=stop_after_delay(10),
                    retry=retry_if_exception_type(AssertionError),
                    before_sleep=before_sleep_log(logger, logging.INFO),
                    reraise=True,
                ):
                    with attempt:
                        mocked_call.assert_awaited()

            else:
                mocked_call.assert_not_called()
Example #27
async def ensure_volume_cleanup(docker_client: aiodocker.Docker,
                                node_uuid: str) -> None:
    async def _get_volume_names() -> Set[str]:
        volumes_list = await docker_client.volumes.list()
        volume_names: Set[str] = {x["Name"] for x in volumes_list["Volumes"]}
        return volume_names

    for volume_name in await _get_volume_names():
        if volume_name.startswith(f"dy-sidecar_{node_uuid}"):
            # the docker volume tends to still be in use, and it takes a while
            # for it to be removed once released
            async for attempt in AsyncRetrying(
                    reraise=False,
                    stop=stop_after_attempt(15),
                    wait=wait_fixed(5),
            ):
                with attempt:
                    # if volume is still found raise an exception
                    # by the time this finishes all volumes should have been removed
                    if volume_name in await _get_volume_names():
                        raise _VolumeNotExpectedError(volume_name)
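
With `reraise=False`, exhausting the 15 attempts raises `tenacity.RetryError` wrapping the last `_VolumeNotExpectedError`, rather than the original exception itself.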
Example #28
 async def create_client(url) -> aioredis.Redis:
     # create redis client
     client: Optional[aioredis.Redis] = None
     async for attempt in AsyncRetrying(
             stop=stop_after_delay(1 * _MINUTE),
             wait=wait_fixed(_WAIT_SECS),
             before_sleep=before_sleep_log(log, logging.WARNING),
             reraise=True,
     ):
         with attempt:
             client = await aioredis.create_redis_pool(url,
                                                       encoding="utf-8")
             if not client:
                 raise ValueError(
                     f"Expected aioredis client instance, got {client}")
             log.info(
                 "Connection to %s succeeded [%s]",
                 f"redis at {endpoint=}",
                 json.dumps(attempt.retry_state.retry_object.statistics),
             )
     assert client  # nosec
     return client
Example #29
async def test_creating_new_project_from_template_and_disconnecting_does_not_create_project(
    client: TestClient,
    logged_user: Dict[str, Any],
    primary_group: Dict[str, str],
    standard_groups: List[Dict[str, str]],
    template_project: Dict[str, Any],
    expected: ExpectedResponse,
    catalog_subsystem_mock: Callable,
    slow_storage_subsystem_mock: MockedStorageSubsystem,
    project_db_cleaner: None,
):
    catalog_subsystem_mock([template_project])
    # create a project from another and disconnect while doing this by timing out
    # POST /v0/projects
    create_url = client.app.router["create_projects"].url_for()
    assert str(create_url) == f"{API_PREFIX}/projects"
    create_url = create_url.with_query(from_template=template_project["uuid"])
    with pytest.raises(asyncio.TimeoutError):
        await client.post(f"{create_url}", json={}, timeout=5)

    # let's check that there are no new project created, after timing out
    list_url = client.app.router["list_projects"].url_for()
    assert str(list_url) == API_PREFIX + "/projects"
    list_url = list_url.with_query(type="user")
    resp = await client.get(f"{list_url}")
    data, *_ = await assert_status(
        resp,
        expected.ok,
    )
    assert not data

    # NOTE: even though the client request timed out, the handler code keeps
    # running in the server, which is why we need to retry here
    async for attempt in AsyncRetrying(
        reraise=True, stop=stop_after_delay(20), wait=wait_fixed(1)
    ):
        with attempt:
            slow_storage_subsystem_mock.delete_project.assert_called_once()
Example #30
async def _wait_till_rabbit_responsive(url: str) -> None:
    async for attempt in AsyncRetrying(
            **RabbitMQRetryPolicyUponInitialization().kwargs):
        with attempt:
            connection = await aio_pika.connect(url, timeout=1.0)
            await connection.close()
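
The `**RabbitMQRetryPolicyUponInitialization().kwargs` idiom bundles the tenacity keyword arguments in a reusable policy object. A minimal sketch of that pattern, with illustrative names and values rather than the actual implementation:

import logging
from tenacity import before_sleep_log, stop_after_delay, wait_fixed

log = logging.getLogger(__name__)

class RetryPolicyUponInitialization:
    """Bundles a tenacity retry policy for waiting on a booting service."""
    def __init__(self, logger: logging.Logger = log) -> None:
        self.kwargs = dict(
            wait=wait_fixed(2),
            stop=stop_after_delay(60),
            before_sleep=before_sleep_log(logger, logging.WARNING),
            reraise=True,
        )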
Example #31
async def assert_service_is_running(
        service_id: str,
        docker,
        *,
        max_running_delay=1 * MINUTE
) -> Tuple[List[TaskDict], TenacityStatsDict]:
    MAX_WAIT = 5
    assert max_running_delay > 3 * MAX_WAIT

    #
    # The retry-policy constraints in this test
    # the time a service takes since it is deployed by the swarm
    # until it is running (i.e. started and healthy)
    #
    retry_policy = dict(
        # wait_random instead of wait_fixed in order to help parallel execution in asyncio.gather
        wait=wait_random(1, MAX_WAIT),
        stop=stop_after_delay(max_running_delay),
        before_sleep=before_sleep_log(log, logging.INFO),
        reraise=True,
    )

    async for attempt in AsyncRetrying(**retry_policy):
        with attempt:

            # service
            service: ServiceDict = await docker.services.inspect(service_id)

            assert service_id == service["ID"]

            service_name = service["Spec"]["Name"]
            num_replicas = int(
                get_from_dict(service,
                              "Spec.Mode.Replicated.Replicas",
                              default=1))

            # tasks in a service
            tasks: List[TaskDict] = await docker.tasks.list(
                filters={"service": service_name})

            tasks_current_state = [task["Status"]["State"] for task in tasks]
            num_running = sum(current == "running"
                              for current in tasks_current_state)

            # assert condition
            is_running: bool = num_replicas == num_running

            error_msg = ""
            if not is_running:
                # lazy composes error msg
                logs_lines = await docker.services.logs(
                    service_id,
                    follow=False,
                    timestamps=True,
                    tail=50,  # SEE *_docker_logs artifacts for details
                )
                log_str = " ".join(logs_lines)
                tasks_json = json.dumps(
                    [
                        copy_from_dict(
                            task,
                            include={
                                "ID":...,
                                "CreatedAt":...,
                                "UpdatedAt":...,
                                "Spec": {
                                    "ContainerSpec": {"Image"}
                                },
                                "Status": {"Timestamp", "State"},
                                "DesiredState":...,
                            },
                        ) for task in tasks
                    ],
                    indent=1,
                )
                error_msg = (
                    f"{service_name=} has {tasks_current_state=}, but expected at least {num_replicas=} running. "
                    f"Details:\n"
                    f"tasks={tasks_json}\n"
                    f"logs={log_str}\n")

            assert is_running, error_msg

            log.info(
                "Connection to %s succeded [%s]",
                service_name,
                json.dumps(attempt.retry_state.retry_object.statistics),
            )

            return tasks, attempt.retry_state.retry_object.statistics
    assert False  # never reached