def wrap(f):
    if iscoroutinefunction is not None and iscoroutinefunction(f):
        r = AsyncRetrying(*dargs, **dkw)
    elif (tornado and hasattr(tornado.gen, "is_coroutine_function")
          and tornado.gen.is_coroutine_function(f)):
        r = TornadoRetrying(*dargs, **dkw)
    else:
        r = Retrying(*dargs, **dkw)
    return r.wraps(f)
def wrap(f):
    if asyncio and asyncio.iscoroutinefunction(f):
        r = AsyncRetrying(*dargs, **dkw)
    elif tornado and hasattr(tornado.gen, 'is_coroutine_function') \
            and tornado.gen.is_coroutine_function(f):
        r = TornadoRetrying(*dargs, **dkw)
    else:
        r = Retrying(*dargs, **dkw)
    return r.wraps(f)
async def test_local_dask_gateway_server(
    loop: AbstractEventLoop, local_dask_gateway_server: DaskGatewayServer
):
    async with Gateway(
        local_dask_gateway_server.address,
        local_dask_gateway_server.proxy_address,
        asynchronous=True,
        auth=auth.BasicAuth("pytest_user", local_dask_gateway_server.password),
    ) as gateway:
        print(f"--> {gateway=} created")
        cluster_options = await gateway.cluster_options()
        gateway_versions = await gateway.get_versions()
        clusters_list = await gateway.list_clusters()
        print(f"--> {gateway_versions=}, {cluster_options=}, {clusters_list=}")
        for option in cluster_options.items():
            print(f"--> {option=}")

        async with gateway.new_cluster() as cluster:
            assert cluster
            print(f"--> created new cluster {cluster=}, {cluster.scheduler_info=}")
            NUM_WORKERS = 10
            await cluster.scale(NUM_WORKERS)
            print(f"--> scaling cluster {cluster=} to {NUM_WORKERS} workers")
            async for attempt in AsyncRetrying(
                reraise=True, wait=wait_fixed(0.24), stop=stop_after_delay(30)
            ):
                with attempt:
                    print(
                        f"cluster {cluster=} now has {len(cluster.scheduler_info.get('workers', []))} workers"
                    )
                    # NOTE: default must be a container, len(0) would raise TypeError
                    assert len(cluster.scheduler_info.get("workers", {})) == NUM_WORKERS

            async with cluster.get_client() as client:
                print(f"--> created new client {client=}, submitting a job")
                res = await client.submit(lambda x: x + 1, 1)  # type: ignore
                assert res == 2

            print(f"--> scaling cluster {cluster=} back to 0")
            await cluster.scale(0)
            async for attempt in AsyncRetrying(
                reraise=True, wait=wait_fixed(0.24), stop=stop_after_delay(30)
            ):
                with attempt:
                    print(
                        f"cluster {cluster=} now has {len(cluster.scheduler_info.get('workers', []))} workers"
                    )
                    assert len(cluster.scheduler_info.get("workers", {})) == 0
def wrap(f):
    if isinstance(f, retry_base):
        warnings.warn(
            ("Got retry_base instance ({cls}) as callable argument, " +
             "this will probably hang indefinitely (did you mean " +
             "retry={cls}(...)?)").format(cls=f.__class__.__name__))
    if iscoroutinefunction is not None and iscoroutinefunction(f):
        r = AsyncRetrying(*dargs, **dkw)
    elif (tornado and hasattr(tornado.gen, "is_coroutine_function")
          and tornado.gen.is_coroutine_function(f)):
        r = TornadoRetrying(*dargs, **dkw)
    else:
        r = Retrying(*dargs, **dkw)
    return r.wraps(f)
async def _is_registry_reachable(registry_settings: RegistrySettings) -> None:
    async for attempt in AsyncRetrying(
        wait=wait_fixed(1),
        stop=stop_after_attempt(1),
        before_sleep=before_sleep_log(logger, logging.INFO),
        reraise=True,
    ):
        with attempt:
            async with httpx.AsyncClient() as client:
                params = {}
                if registry_settings.REGISTRY_AUTH:
                    params["auth"] = (
                        registry_settings.REGISTRY_USER,
                        registry_settings.REGISTRY_PW.get_secret_value(),
                    )

                protocol = "https" if registry_settings.REGISTRY_SSL else "http"
                url = f"{protocol}://{registry_settings.api_url}/"

                logging.info("Registry test url ='%s'", url)
                response = await client.get(url, timeout=1, **params)
                reachable = (
                    response.status_code == status.HTTP_200_OK
                    and response.json() == {}
                )
                if not reachable:
                    logger.error("Response: %s", response)
                    error_message = (
                        f"Could not reach registry {registry_settings.api_url} "
                        f"auth={registry_settings.REGISTRY_AUTH}"
                    )
                    raise _RegistryNotReachableException(error_message)
async def _assert_and_wait_for_pipeline_state(
    client: TestClient,
    project_id: str,
    expected_state: RunningState,
    expected_api_response: ExpectedResponse,
):
    url_project_state = client.app.router["state_project"].url_for(
        project_id=project_id
    )
    assert url_project_state == URL(f"/{API_VTAG}/projects/{project_id}/state")
    async for attempt in AsyncRetrying(
        reraise=True,
        stop=stop_after_delay(120),
        wait=wait_fixed(5),
        retry=retry_if_exception_type(AssertionError),
    ):
        with attempt:
            print(
                f"--> waiting for pipeline to complete with {expected_state=} "
                f"attempt {attempt.retry_state.attempt_number}..."
            )
            resp = await client.get(f"{url_project_state}")
            data, error = await assert_status(resp, expected_api_response.ok)
            assert "state" in data
            assert "value" in data["state"]
            received_study_state = RunningState(data["state"]["value"])
            print(f"<-- received pipeline state: {received_study_state=}")
            assert received_study_state == expected_state
            print(
                f"--> pipeline completed with state {received_study_state=}! "
                f"That's great: {json_dumps(attempt.retry_state.retry_object.statistics)}",
            )
async def test_publish_event(dask_client: distributed.Client):
    dask_pub = distributed.Pub("some_topic")
    dask_sub = distributed.Sub("some_topic")
    async for attempt in AsyncRetrying(
        reraise=True,
        retry=retry_if_exception_type(AssertionError),
        wait=wait_fixed(0.01),
        stop=stop_after_delay(60),
    ):
        with attempt:
            print(
                f"waiting for subscribers... attempt={attempt.retry_state.attempt_number}"
            )
            assert dask_pub.subscribers
            print("we do have subscribers!")

    event_to_publish = TaskLogEvent(job_id="some_fake_job_id", log="the log")
    publish_event(dask_pub=dask_pub, event=event_to_publish)

    # NOTE: this test runs a sync dask client, and the CI sometimes has
    # difficulties running this in a reasonable time, hence the long timeout
    message = dask_sub.get(timeout=1)
    assert message is not None
    received_task_log_event = TaskLogEvent.parse_raw(message)  # type: ignore
    assert received_task_log_event == event_to_publish
async def test_publish_to_user(
    logged_user: Dict[str, Any],
    other_project_id: UUIDStr,
    other_node_uuid: str,
    #
    socketio_subscriber_handlers: NamedTuple,
    publish_some_messages_in_rabbit: Callable[
        [UserID, UUIDStr, UUIDStr, int],
        Awaitable[Tuple[LogMessages, ProgressMessages, InstrumMessages]],
    ],
):
    mock_log_handler, mock_node_update_handler = socketio_subscriber_handlers

    # publish messages with correct user id, but no project
    log_messages, _, _ = await publish_some_messages_in_rabbit(
        logged_user["id"],
        other_project_id,
        other_node_uuid,
        NUMBER_OF_MESSAGES,
    )
    async for attempt in AsyncRetrying(**RETRY_POLICY):
        with attempt:
            assert mock_log_handler.call_count == NUMBER_OF_MESSAGES

    for mock_call, expected_message in zip(
        mock_log_handler.call_args_list, log_messages
    ):
        value = mock_call[0]
        deserialized_value = json.loads(value[0])
        assert deserialized_value == json.loads(
            expected_message.json(include={"node_id", "messages"})
        )

    mock_node_update_handler.assert_not_called()
async def test_publish_about_users_projects_node(
    logged_user: Dict[str, Any],
    user_project: Dict[str, Any],
    #
    socketio_subscriber_handlers: NamedTuple,
    publish_some_messages_in_rabbit: Callable[
        [UserID, UUIDStr, UUIDStr, int],
        Awaitable[Tuple[LogMessages, ProgressMessages, InstrumMessages]],
    ],
):
    mock_log_handler, mock_node_update_handler = socketio_subscriber_handlers

    # publish message with correct user id, project and node
    node_uuid = list(user_project["workbench"])[0]
    log_messages, _, _ = await publish_some_messages_in_rabbit(
        logged_user["id"],
        user_project["uuid"],
        node_uuid,
        NUMBER_OF_MESSAGES,
    )
    async for attempt in AsyncRetrying(**RETRY_POLICY):
        with attempt:
            assert mock_log_handler.call_count == NUMBER_OF_MESSAGES
            assert mock_node_update_handler.call_count == NUMBER_OF_MESSAGES

    for mock_call, expected_message in zip(
        mock_log_handler.call_args_list, log_messages
    ):
        value = mock_call[0]
        deserialized_value = json.loads(value[0])
        assert deserialized_value == json.loads(
            expected_message.json(include={"node_id", "messages"})
        )

    # mock_log_handler.assert_has_calls(log_calls, any_order=True)
    mock_node_update_handler.assert_called()
    assert mock_node_update_handler.call_count == NUMBER_OF_MESSAGES
async def wait_till_service_healthy(service_name: str, endpoint: URL):
    log.info(
        "Connecting to %s",
        f"{service_name=} at {endpoint=}",
    )
    async for attempt in AsyncRetrying(
        # randomizing healthchecks sampling helps parallel execution
        wait=wait_random(1, 2),
        # sets the timeout for a service to become healthy
        stop=stop_after_delay(2 * MINUTE),
        before_sleep=before_sleep_log(log, logging.WARNING),
        reraise=True,
    ):
        with attempt:
            async with aiohttp.ClientSession(timeout=_ONE_SEC_TIMEOUT) as session:
                async with session.get(endpoint) as response:
                    # NOTE: health-check endpoints require only a status code 200
                    # (see e.g. services/web/server/docker/healthcheck.py)
                    # regardless of the payload content
                    assert (
                        response.status == 200
                    ), f"Connection to {service_name=} at {endpoint=} failed with {response=}"

    log.info(
        "Connection to %s succeeded [%s]",
        f"{service_name=} at {endpoint=}",
        json.dumps(attempt.retry_state.retry_object.statistics),
    )
async def test_websocket_disconnected_remove_or_maintain_files_based_on_role(
    client,
    logged_user,
    empty_user_project,
    mocked_director_v2_api,
    create_dynamic_service_mock,
    client_session_id_factory: Callable[[], str],
    socketio_client_factory: Callable,
    # asyncpg_storage_system_mock,
    storage_subsystem_mock,  # when guest user logs out garbage is collected
    expect_call: bool,
    exp_save_state: bool,
):
    set_service_deletion_delay(SERVICE_DELETION_DELAY, client.server.app)
    # login - logged_user fixture
    # create empty study - empty_user_project fixture
    # create dynamic service - create_dynamic_service_mock fixture
    service = await create_dynamic_service_mock(
        logged_user["id"], empty_user_project["uuid"]
    )

    # create websocket
    client_session_id1 = client_session_id_factory()
    sio: socketio.AsyncClient = await socketio_client_factory(client_session_id1)

    # open project in client 1
    await open_project(client, empty_user_project["uuid"], client_session_id1)

    # logout
    logout_url = client.app.router["auth_logout"].url_for()
    r = await client.post(logout_url, json={"client_session_id": client_session_id1})
    assert r.url_obj.path == logout_url.path
    await assert_status(r, web.HTTPOk)

    # ensure sufficient time is wasted here
    await asyncio.sleep(SERVICE_DELETION_DELAY + 1)
    await garbage_collector.collect_garbage(client.app)

    # assert dynamic service is removed
    calls = [
        call(
            app=client.server.app,
            save_state=exp_save_state,
            service_uuid=service["service_uuid"],
        )
    ]
    mocked_director_v2_api["director_v2_core.stop_service"].assert_has_calls(calls)

    # this call is done async, so wait a bit here to ensure it is correctly done
    async for attempt in AsyncRetrying(reraise=True, stop=stop_after_delay(10)):
        with attempt:
            if expect_call:
                # make sure `delete_project` is called
                storage_subsystem_mock[1].assert_called_once()
                # make sure `delete_user` is called
                # asyncpg_storage_system_mock.assert_called_once()
            else:
                # make sure `delete_project` is not called
                storage_subsystem_mock[1].assert_not_called()
async def _wait_for_call(mocked_fct):
    async for attempt in AsyncRetrying(
        stop=stop_after_delay(10),
        wait=wait_random(0, 1),
        retry=retry_if_exception_type(AssertionError),
        reraise=True,
    ):
        with attempt:
            print(
                f"waiting for call in mocked fct {mocked_fct}, "
                f"Attempt={attempt.retry_state.attempt_number}"
            )
            mocked_fct.assert_called_once()
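# A minimal usage sketch for the `_wait_for_call` helper above; `Mock` and the
# `_trigger_later` callback are illustrative assumptions, not part of the
# original test suite.
import asyncio
from unittest.mock import Mock

async def _example_wait_for_call_usage() -> None:
    mocked_fct = Mock()

    async def _trigger_later() -> None:
        # simulate the system under test invoking the mock after a short delay
        await asyncio.sleep(0.5)
        mocked_fct("some", "args")

    task = asyncio.create_task(_trigger_later())
    # polls until the mock has been called exactly once, re-raising the last
    # AssertionError if the 10s budget of the retry policy runs out
    await _wait_for_call(mocked_fct)
    await task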
async def assert_and_wait_for_pipeline_status(
    client: httpx.AsyncClient,
    url: AnyHttpUrl,
    user_id: UserID,
    project_uuid: UUID,
    wait_for_states: Optional[List[RunningState]] = None,
) -> ComputationTaskGet:
    if not wait_for_states:
        wait_for_states = [
            RunningState.SUCCESS,
            RunningState.FAILED,
            RunningState.ABORTED,
        ]
    MAX_TIMEOUT_S = 5 * MINUTE

    async def check_pipeline_state() -> ComputationTaskGet:
        response = await client.get(url, params={"user_id": user_id})
        assert (
            response.status_code == status.HTTP_202_ACCEPTED
        ), f"response code is {response.status_code}, error: {response.text}"
        task_out = ComputationTaskGet.parse_obj(response.json())
        assert task_out.id == project_uuid
        assert task_out.url == f"{client.base_url}/v2/computations/{project_uuid}"
        print(f"Pipeline '{project_uuid=}' current task out is '{task_out=}'")
        assert wait_for_states
        assert (
            task_out.state in wait_for_states
        ), f"current task state is '{task_out.state}', not in any of {wait_for_states}"
        return task_out

    start = time.monotonic()
    async for attempt in AsyncRetrying(
        stop=stop_after_delay(MAX_TIMEOUT_S),
        wait=wait_fixed(2),
        retry=retry_if_exception_type(AssertionError),
        reraise=True,
    ):
        elapsed_s = time.monotonic() - start
        with attempt:
            print(
                f"Waiting for pipeline '{project_uuid=}' state to be one of: {wait_for_states=}, "
                f"attempt={attempt.retry_state.attempt_number}, time={elapsed_s}s"
            )
            task_out = await check_pipeline_state()
            print(
                f"Pipeline '{project_uuid=}' state successfully became '{task_out.state}'\n"
                f"{json.dumps(attempt.retry_state.retry_object.statistics, indent=2)}, time={elapsed_s}s"
            )
            return task_out

    # this is only to satisfy pylance
    raise AssertionError("No computation task generated!")
async def _check_all_services_are_running():
    async for attempt in AsyncRetrying(
        wait=wait_fixed(5),
        stop=stop_after_delay(8 * MINUTE),
        before_sleep=before_sleep_log(log, logging.INFO),
        reraise=True,
    ):
        with attempt:
            await asyncio.gather(
                *[
                    asyncio.get_event_loop().run_in_executor(
                        None, assert_service_is_running, service
                    )
                    for service in docker_client.services.list()
                ]
            )
async def test_interactive_services_removed_after_logout(
    client: TestClient,
    logged_user: Dict[str, Any],
    empty_user_project: Dict[str, Any],
    mocked_director_v2_api: Dict[str, mock.MagicMock],
    create_dynamic_service_mock,
    client_session_id_factory: Callable[[], str],
    socketio_client_factory: Callable,
    storage_subsystem_mock: MockedStorageSubsystem,  # when guest user logs out garbage is collected
    director_v2_service_mock: aioresponses,
    expected_save_state: bool,
):
    # login - logged_user fixture
    # create empty study - empty_user_project fixture
    # create dynamic service - create_dynamic_service_mock fixture
    service = await create_dynamic_service_mock(
        logged_user["id"], empty_user_project["uuid"]
    )

    # create websocket
    client_session_id1 = client_session_id_factory()
    sio = await socketio_client_factory(client_session_id1)

    # open project in client 1
    await open_project(client, empty_user_project["uuid"], client_session_id1)

    # logout
    logout_url = client.app.router["auth_logout"].url_for()
    r = await client.post(f"{logout_url}", json={"client_session_id": client_session_id1})
    assert r.url_obj.path == logout_url.path
    await assert_status(r, web.HTTPOk)

    # check result performed by background task
    await asyncio.sleep(SERVICE_DELETION_DELAY + 1)
    await garbage_collector_core.collect_garbage(client.app)

    # assert dynamic service is removed (this is done in a fire/forget way, so give a bit of leeway)
    async for attempt in AsyncRetrying(
        reraise=True, stop=stop_after_attempt(10), wait=wait_fixed(1)
    ):
        with attempt:
            logger.warning(
                "Waiting for stop to have been called service_uuid=%s, save_state=%s",
                service["service_uuid"],
                expected_save_state,
            )
            mocked_director_v2_api["director_v2_core.stop_service"].assert_awaited_with(
                app=client.server.app,
                service_uuid=service["service_uuid"],
                save_state=expected_save_state,
            )
async def create(
    cls,
    app: FastAPI,
    settings: DaskSchedulerSettings,
    endpoint: AnyUrl,
    authentication: ClusterAuthentication,
) -> "DaskClient":
    logger.info(
        "Initiating connection to %s with auth: %s",
        f"dask-scheduler/gateway at {endpoint}",
        authentication,
    )
    async for attempt in AsyncRetrying(
        reraise=True,
        before_sleep=before_sleep_log(logger, logging.WARNING),
        wait=wait_fixed(0.3),
        stop=stop_after_attempt(3),
    ):
        with attempt:
            logger.debug(
                "Connecting to %s, attempt %s...",
                endpoint,
                attempt.retry_state.attempt_number,
            )
            dask_subsystem = await _create_internal_client_based_on_auth(
                endpoint, authentication
            )
            check_scheduler_status(dask_subsystem.client)
            instance = cls(
                app=app,
                dask_subsystem=dask_subsystem,
                settings=settings,
                cancellation_dask_pub=distributed.Pub(
                    TaskCancelEvent.topic_name(), client=dask_subsystem.client
                ),
            )
            logger.info(
                "Connection to %s succeeded [%s]",
                f"dask-scheduler/gateway at {endpoint}",
                json.dumps(attempt.retry_state.retry_object.statistics),
            )
            logger.info(
                "Scheduler info:\n%s",
                json.dumps(dask_subsystem.client.scheduler_info(), indent=2),
            )
            return instance

    # this is to satisfy pylance
    raise ValueError("Could not create client")
def wrap(f: WrappedFn) -> WrappedFn:
    if isinstance(f, retry_base):
        warnings.warn(
            f"Got retry_base instance ({f.__class__.__name__}) as callable argument, "
            f"this will probably hang indefinitely (did you mean retry={f.__class__.__name__}(...)?)"
        )
    if iscoroutinefunction(f):
        r: "BaseRetrying" = AsyncRetrying(*dargs, **dkw)
    elif (
        tornado
        and hasattr(tornado.gen, "is_coroutine_function")
        and tornado.gen.is_coroutine_function(f)
    ):
        r = TornadoRetrying(*dargs, **dkw)
    else:
        r = Retrying(*dargs, **dkw)
    return r.wraps(f)
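# A rough sketch of where a `wrap` dispatcher like the ones above typically lives:
# tenacity's `retry(...)` decorator builds `wrap` as a closure over the decorator
# arguments (`dargs`, `dkw`) and applies it to the decorated callable. The two
# functions below are illustrative only; the point is that a plain `def` is routed
# through `Retrying` while an `async def` is routed through `AsyncRetrying`, so the
# wrapped coroutine is awaited as usual.
from tenacity import retry, stop_after_attempt, wait_fixed

@retry(stop=stop_after_attempt(3), wait=wait_fixed(0.1))
def flaky_sync_call() -> int:
    # dispatched to Retrying.wraps(...)
    return 42

@retry(stop=stop_after_attempt(3), wait=wait_fixed(0.1))
async def flaky_async_call() -> int:
    # dispatched to AsyncRetrying.wraps(...) because iscoroutinefunction(f) is true
    return 42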
async def _request_director_v2(
    app: web.Application,
    method: str,
    url: URL,
    expected_status: Type[web.HTTPSuccessful] = web.HTTPOk,
    headers: Optional[Dict[str, str]] = None,
    data: Optional[Any] = None,
    **kwargs,
) -> DataBody:
    try:
        async for attempt in AsyncRetrying(**DEFAULT_RETRY_POLICY):
            with attempt:
                session = get_client_session(app)
                async with session.request(
                    method, url, headers=headers, json=data, **kwargs
                ) as response:
                    payload = (
                        await response.json()
                        if response.content_type == "application/json"
                        else await response.text()
                    )

                    # NOTE:
                    # - sometimes `director-v0` (via redirects) replies
                    #   in plain text and this is considered an error
                    # - `director-v2` and `director-v0` can reply with 204 no content
                    if response.status != expected_status.status_code or isinstance(
                        payload, str
                    ):
                        raise DirectorServiceError(response.status, reason=f"{payload}")
                    return payload

    # TODO: enrich with https://docs.aiohttp.org/en/stable/client_reference.html#hierarchy-of-exceptions
    except asyncio.TimeoutError as err:
        raise DirectorServiceError(
            web.HTTPServiceUnavailable.status_code,
            reason=f"request to director-v2 timed-out: {err}",
        ) from err

    except aiohttp.ClientError as err:
        raise DirectorServiceError(
            web.HTTPServiceUnavailable.status_code,
            reason=f"request to director-v2 service unexpected error {err}",
        ) from err

    log.error("Unexpected result calling %s, %s", f"{url=}", f"{method=}")
    raise DirectorServiceError(
        web.HTTPClientError.status_code, reason="Unexpected client error"
    )
async def setup_director(app: FastAPI) -> None:
    if settings := app.state.settings.CATALOG_DIRECTOR:
        # init client-api
        logger.debug("Setup director at %s...", settings.base_url)
        director_client = DirectorApi(base_url=settings.base_url, app=app)

        # check that the director is accessible
        async for attempt in AsyncRetrying(**director_startup_retry_policy):
            with attempt:
                if not await director_client.is_responsive():
                    raise ValueError("Director-v0 is not responsive")

                logger.info(
                    "Connection to director-v0 succeeded [%s]",
                    json_dumps(attempt.retry_state.retry_object.statistics),
                )

        app.state.director_api = director_client
async def _assert_and_wait_for_comp_task_states_to_be_transmitted_in_projects(
    project_id: str,
    postgres_session: sa.orm.session.Session,
):
    async for attempt in AsyncRetrying(
        reraise=True,
        stop=stop_after_delay(120),
        wait=wait_fixed(5),
        retry=retry_if_exception_type(AssertionError),
    ):
        with attempt:
            print(
                f"--> waiting for pipeline results to move to projects table, "
                f"attempt {attempt.retry_state.attempt_number}..."
            )
            comp_tasks_in_db: Dict[NodeIdStr, Any] = _get_computational_tasks_from_db(
                project_id, postgres_session
            )
            workbench_in_db: Dict[NodeIdStr, Any] = _get_project_workbench_from_db(
                project_id, postgres_session
            )
            for node_id, node_values in comp_tasks_in_db.items():
                assert (
                    node_id in workbench_in_db
                ), f"node {node_id=} is missing from workbench {json_dumps(workbench_in_db, indent=2)}"

                node_in_project_table = workbench_in_db[node_id]

                # if this one is in, the other should also be, but let's check it carefully
                assert node_values.run_hash
                assert "runHash" in node_in_project_table
                assert node_values.run_hash == node_in_project_table["runHash"]

                assert node_values.state
                assert "state" in node_in_project_table
                assert "currentStatus" in node_in_project_table["state"]
                # NOTE: beware that comp_tasks uses StateType and the workbench uses RunningState (sic)
                assert (
                    DB_TO_RUNNING_STATE[node_values.state].value
                    == node_in_project_table["state"]["currentStatus"]
                )
            print(
                "--> tasks were properly transferred! "
                f"That's great: {json_dumps(attempt.retry_state.retry_object.statistics)}",
            )
async def assert_service_is_available(  # pylint: disable=redefined-outer-name
    exposed_port: PositiveInt, is_legacy: bool, service_uuid: str
) -> None:
    service_address = (
        f"http://{get_ip()}:{exposed_port}/x/{service_uuid}"
        if is_legacy
        else f"http://{get_ip()}:{exposed_port}"
    )
    print(f"checking service @ {service_address}")

    async for attempt in AsyncRetrying(
        wait=wait_fixed(1), stop=stop_after_attempt(60), reraise=True
    ):
        with attempt:
            async with httpx.AsyncClient() as client:
                response = await client.get(service_address)
                print(f"{SEPARATOR}\nAttempt={attempt.retry_state.attempt_number}")
                print(f"Body:\n{response.text}\nHeaders={response.headers}\n{SEPARATOR}")
                assert response.status_code == httpx.codes.OK, response.text
async def assemble_cached_indexes(app: web.Application):
    """
    Currently the static resources contain 3 folders: osparc, s4l, tis.
    Each of them contains an index.html to be served as the root of the site
    for each type of frontend.

    These 3 indexes are cached on startup.
    """
    settings: StaticWebserverModuleSettings = get_plugin_settings(app)
    cached_indexes: Dict[str, str] = {}

    session: ClientSession = get_client_session(app)
    for frontend_name in FRONTEND_APPS_AVAILABLE:
        url = URL(settings.STATIC_WEBSERVER_URL) / frontend_name
        log.info("Fetching index from %s", url)
        try:
            body = ""
            # the web-static server might still not be up
            async for attempt in AsyncRetrying(
                **_STATIC_WEBSERVER_RETRY_ON_STARTUP_POLICY
            ):
                with attempt:
                    response = await session.get(url, raise_for_status=True)
                    body = await response.text()

        except ClientError as err:
            log.error("Could not fetch index from static server: %s", err)

            # ANE: yes, this is supposed to fail the boot process
            raise RuntimeError(
                f"Could not fetch index at {str(url)}. Stopping application boot"
            ) from err
        else:
            # fixes relative paths
            body = body.replace(f"../resource/{frontend_name}", f"resource/{frontend_name}")
            body = body.replace("boot.js", f"{frontend_name}/boot.js")

            log.info("Storing index for %s", url)
            cached_indexes[frontend_name] = body

    app[APP_FRONTEND_CACHED_INDEXES_KEY] = cached_indexes
async def _assert_wait_for_task_status(
    job_id: str,
    dask_client: DaskClient,
    expected_status: RunningState,
    timeout: Optional[int] = None,
):
    async for attempt in AsyncRetrying(
        reraise=True,
        stop=stop_after_delay(timeout or _ALLOW_TIME_FOR_GATEWAY_TO_CREATE_WORKERS),
        wait=wait_fixed(1),
    ):
        with attempt:
            print(
                f"waiting for task to be {expected_status=}, "
                f"Attempt={attempt.retry_state.attempt_number}"
            )
            current_task_status = await dask_client.get_task_status(job_id)
            assert isinstance(current_task_status, RunningState)
            print(f"{current_task_status=} vs {expected_status=}")
            assert current_task_status == expected_status
async def _create_client(address: str) -> aioredis.Redis:
    client: Optional[aioredis.Redis] = None
    async for attempt in AsyncRetrying(
        stop=stop_after_delay(1 * _MINUTE),
        wait=wait_fixed(_WAIT_SECS),
        before_sleep=before_sleep_log(log, logging.WARNING),
        reraise=True,
    ):
        with attempt:
            client = await aioredis.create_redis_pool(address, encoding="utf-8")
            log.info(
                "Connection to %s succeeded with %s [%s]",
                f"redis at {address=}",
                f"{client=}",
                json.dumps(attempt.retry_state.retry_object.statistics),
            )

    assert client  # nosec
    return client
async def test_listen_comp_tasks_task(
    mock_project_subsystem: Dict,
    comp_task_listening_task: None,
    client,
    update_values: Dict[str, Any],
    expected_calls: List[str],
    task_class: NodeClass,
):
    db_engine: aiopg.sa.Engine = client.app[APP_DB_ENGINE_KEY]
    async with db_engine.acquire() as conn:
        # let's put some stuff in there now
        result = await conn.execute(
            comp_tasks.insert()
            .values(outputs=json.dumps({}), node_class=task_class)
            .returning(literal_column("*"))
        )
        row: RowProxy = await result.fetchone()
        task = dict(row)

        # let's update some values
        await conn.execute(
            comp_tasks.update()
            .values(**update_values)
            .where(comp_tasks.c.task_id == task["task_id"])
        )

        # tests whether the listener gets the hooked calls executed
        for call_name, mocked_call in mock_project_subsystem.items():
            if call_name in expected_calls:
                async for attempt in AsyncRetrying(
                    wait=wait_fixed(1),
                    stop=stop_after_delay(10),
                    retry=retry_if_exception_type(AssertionError),
                    before_sleep=before_sleep_log(logger, logging.INFO),
                    reraise=True,
                ):
                    with attempt:
                        mocked_call.assert_awaited()
            else:
                mocked_call.assert_not_called()
async def ensure_volume_cleanup(
    docker_client: aiodocker.Docker, node_uuid: str
) -> None:
    async def _get_volume_names() -> Set[str]:
        volumes_list = await docker_client.volumes.list()
        volume_names: Set[str] = {x["Name"] for x in volumes_list["Volumes"]}
        return volume_names

    for volume_name in await _get_volume_names():
        if volume_name.startswith(f"dy-sidecar_{node_uuid}"):
            # docker volumes tend to still be in use for a while,
            # so it takes a bit until they can be removed
            async for attempt in AsyncRetrying(
                reraise=False,
                stop=stop_after_attempt(15),
                wait=wait_fixed(5),
            ):
                with attempt:
                    # if the volume is still found, raise an exception;
                    # by the time this finishes, all volumes should have been removed
                    if volume_name in await _get_volume_names():
                        raise _VolumeNotExpectedError(volume_name)
async def create_client(url) -> aioredis.Redis:
    # create redis client
    client: Optional[aioredis.Redis] = None
    async for attempt in AsyncRetrying(
        stop=stop_after_delay(1 * _MINUTE),
        wait=wait_fixed(_WAIT_SECS),
        before_sleep=before_sleep_log(log, logging.WARNING),
        reraise=True,
    ):
        with attempt:
            client = await aioredis.create_redis_pool(url, encoding="utf-8")
            if not client:
                raise ValueError(f"Expected aioredis client instance, got {client}")
            log.info(
                "Connection to %s succeeded [%s]",
                f"redis at {url=}",
                json.dumps(attempt.retry_state.retry_object.statistics),
            )

    assert client  # nosec
    return client
async def test_creating_new_project_from_template_and_disconnecting_does_not_create_project(
    client: TestClient,
    logged_user: Dict[str, Any],
    primary_group: Dict[str, str],
    standard_groups: List[Dict[str, str]],
    template_project: Dict[str, Any],
    expected: ExpectedResponse,
    catalog_subsystem_mock: Callable,
    slow_storage_subsystem_mock: MockedStorageSubsystem,
    project_db_cleaner: None,
):
    catalog_subsystem_mock([template_project])

    # create a project from another one and disconnect while doing this by timing out
    # POST /v0/projects
    create_url = client.app.router["create_projects"].url_for()
    assert str(create_url) == f"{API_PREFIX}/projects"
    create_url = create_url.with_query(from_template=template_project["uuid"])
    with pytest.raises(asyncio.TimeoutError):
        await client.post(f"{create_url}", json={}, timeout=5)

    # let's check that no new project was created after timing out
    list_url = client.app.router["list_projects"].url_for()
    assert str(list_url) == API_PREFIX + "/projects"
    list_url = list_url.with_query(type="user")
    resp = await client.get(f"{list_url}")
    data, *_ = await assert_status(
        resp,
        expected.ok,
    )
    assert not data

    # NOTE: even after the client timed out, the code shall still run
    # in the server, which is why we need to retry here
    async for attempt in AsyncRetrying(
        reraise=True, stop=stop_after_delay(20), wait=wait_fixed(1)
    ):
        with attempt:
            slow_storage_subsystem_mock.delete_project.assert_called_once()
async def _wait_till_rabbit_responsive(url: str) -> None:
    async for attempt in AsyncRetrying(
        **RabbitMQRetryPolicyUponInitialization().kwargs
    ):
        with attempt:
            connection = await aio_pika.connect(url, timeout=1.0)
            await connection.close()
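# The tenacity kwargs above are hidden behind `RabbitMQRetryPolicyUponInitialization().kwargs`;
# below is a minimal inline equivalent for illustration. The wait/stop values are
# assumptions, not taken from that class's implementation.
import logging
import aio_pika
from tenacity import AsyncRetrying, before_sleep_log, stop_after_delay, wait_fixed

log = logging.getLogger(__name__)

async def _wait_till_rabbit_responsive_inline(url: str) -> None:
    async for attempt in AsyncRetrying(
        wait=wait_fixed(2),  # assumed pacing between connection attempts
        stop=stop_after_delay(60),  # assumed overall budget for rabbit to come up
        before_sleep=before_sleep_log(log, logging.WARNING),
        reraise=True,
    ):
        with attempt:
            connection = await aio_pika.connect(url, timeout=1.0)
            await connection.close()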
async def assert_service_is_running(
    service_id: str, docker, *, max_running_delay=1 * MINUTE
) -> Tuple[List[TaskDict], TenacityStatsDict]:
    MAX_WAIT = 5
    assert max_running_delay > 3 * MAX_WAIT

    #
    # The retry-policy in this test constrains the time a service takes,
    # from when it is deployed by the swarm until it is running
    # (i.e. started and healthy)
    #
    retry_policy = dict(
        # wait_random instead of wait_fixed in order to help parallel execution in asyncio.gather
        wait=wait_random(1, MAX_WAIT),
        stop=stop_after_delay(max_running_delay),
        before_sleep=before_sleep_log(log, logging.INFO),
        reraise=True,
    )

    async for attempt in AsyncRetrying(**retry_policy):
        with attempt:
            # service
            service: ServiceDict = await docker.services.inspect(service_id)
            assert service_id == service["ID"]

            service_name = service["Spec"]["Name"]
            num_replicas = int(
                get_from_dict(service, "Spec.Mode.Replicated.Replicas", default=1)
            )

            # tasks in a service
            tasks: List[TaskDict] = await docker.tasks.list(
                filters={"service": service_name}
            )
            tasks_current_state = [task["Status"]["State"] for task in tasks]
            num_running = sum(current == "running" for current in tasks_current_state)

            # assert condition
            is_running: bool = num_replicas == num_running

            error_msg = ""
            if not is_running:
                # lazily composes the error msg
                logs_lines = await docker.services.logs(
                    service_id,
                    follow=False,
                    timestamps=True,
                    tail=50,  # SEE *_docker_logs artifacts for details
                )
                log_str = " ".join(logs_lines)
                tasks_json = json.dumps(
                    [
                        copy_from_dict(
                            task,
                            include={
                                "ID": ...,
                                "CreatedAt": ...,
                                "UpdatedAt": ...,
                                "Spec": {"ContainerSpec": {"Image"}},
                                "Status": {"Timestamp", "State"},
                                "DesiredState": ...,
                            },
                        )
                        for task in tasks
                    ],
                    indent=1,
                )
                error_msg = (
                    f"{service_name=} has {tasks_current_state=}, but expected at least {num_replicas=} running. "
                    "Details:\n"
                    f"tasks={tasks_json}\n"
                    f"logs={log_str}\n"
                )

            assert is_running, error_msg

            log.info(
                "Connection to %s succeeded [%s]",
                service_name,
                json.dumps(attempt.retry_state.retry_object.statistics),
            )
            return tasks, attempt.retry_state.retry_object.statistics

    assert False  # never reached