def test_update_failed(some_data_set: DownloadableContent):
    """Applying a FAILED result leaves dst empty and copies type and message."""
    failed_result = FetcherResult(FetcherStatus.FAILED, FetchedType.FILE, ERROR)

    failed_result.update(some_data_set)

    assert not some_data_set.dst
    assert some_data_set.type == FetchedType.FILE
    assert some_data_set.message == ERROR
def on_content_locked(content: DownloadableContent, lock: RWLock):
    """Start the fetch of ``content`` once its write lock is held.

    The content size is estimated exactly once, inside the guarded block,
    so that an estimator failure is always reported through ``on_done`` and
    always releases the lock. On success a ZooKeeper node is created for
    the fetch, its state handler is installed and the fetch is dispatched.

    :param content: the content to download
    :param lock: the write lock held for ``content``; released when the
        fetch is done or when the size estimation fails
    """

    def _on_done_and_unlock(content: DownloadableContent):
        # Notify the caller first, then clean up and release the lock.
        on_done(content)
        self._download_dispatcher.cleanup(content, event)
        lock.release()

    try:
        content.size_info = self._size_estimator(content.src)
    except Exception as e:
        msg = f"Failed to estimate the size of content {content.src}: {str(e)}"
        logger.exception(f"{msg}")
        FetcherResult(FetcherStatus.FAILED, None, msg).update(content)
        # Nothing was dispatched yet, so only notify and unlock.
        on_done(content)
        lock.release()
        return

    # This node will be killed if I die
    zk_node_path = self._get_node_path(event.client_id, event.action_id, content)
    self._zk.create(zk_node_path, DownloadManager.INITIAL_DATA, ephemeral=True, makepath=True)

    self.__handle_node_state(zk_node_path, _on_done_and_unlock, content)

    self._download_dispatcher.dispatch_fetch(content, event, zk_node_path)
def test_fetcher_updates_zk(mock_http_to_s3, mock_update_zk_node):
    """With the HTTP transfer mocked, the node is updated with a DONE/FILE result."""
    job_config = FetcherJobConfig(
        HTTP_SRC,
        DST,
        zk_node_path=ZK_NODE_PATH,
        zookeeper_ensemble_hosts=ZK_ENSEMBLE,
    )
    expected_result = FetcherResult(FetcherStatus.DONE, FetchedType.FILE, SUCCESS_MESSAGE)

    retrying_fetch(job_config)

    mock_update_zk_node.assert_called_with(ZK_NODE_PATH, ZK_ENSEMBLE, expected_result)
def test_fetcher_updates_zk_fail(mock_http_to_s3_client_error, mock_update_zk_node):
    """With the client-error fixture active, the node is updated with a FAILED result."""
    job_config = FetcherJobConfig(
        HTTP_SRC,
        DST,
        zk_node_path=ZK_NODE_PATH,
        zookeeper_ensemble_hosts=ZK_ENSEMBLE,
    )
    expected_result = FetcherResult(status=FetcherStatus.FAILED, message=FILE_NOT_FOUND)

    retrying_fetch(job_config)

    mock_update_zk_node.assert_called_with(ZK_NODE_PATH, ZK_ENSEMBLE, expected_result)
def test_fetcher_updates_zk_with_directory(mock_s3_to_s3, mock_update_zk_node):
    """When the S3 transfer reports a directory, the DONE result carries DIRECTORY."""
    # It's a directory
    mock_s3_to_s3.return_value = FetchedType.DIRECTORY
    job_config = FetcherJobConfig(
        S3_SRC,
        DST,
        zk_node_path=ZK_NODE_PATH,
        zookeeper_ensemble_hosts=ZK_ENSEMBLE,
    )
    expected_result = FetcherResult(FetcherStatus.DONE, FetchedType.DIRECTORY, SUCCESS_MESSAGE)

    retrying_fetch(job_config)

    mock_update_zk_node.assert_called_with(ZK_NODE_PATH, ZK_ENSEMBLE, expected_result)
def test_fetcher_updates_zk_once(mock_http_to_s3_server_error, mock_update_zk_node):
    """With a single allowed attempt, a server error is reported as FAILED."""
    single_attempt = RetryConfig(max_attempts=1)
    job_config = FetcherJobConfig(
        HTTP_SRC,
        DST,
        zk_node_path=ZK_NODE_PATH,
        zookeeper_ensemble_hosts=ZK_ENSEMBLE,
        retry=single_attempt,
    )
    expected_result = FetcherResult(status=FetcherStatus.FAILED, message=SERVER_ERROR)

    retrying_fetch(job_config)

    mock_update_zk_node.assert_called_with(ZK_NODE_PATH, ZK_ENSEMBLE, expected_result)
def retrying_fetch(cfg: FetcherJobConfig):
    """Run the fetch with retries and publish the outcome via ``_update_zk_node``.

    Only ``RetryableError`` triggers another attempt; the back-off and the
    attempt limit come from ``cfg.retry``. A FAILED result carrying the
    error text is published when the fetch ends with a ``RetryableError``
    or ``UnRetryableError``; otherwise a DONE result with the fetched type
    is published.

    :param cfg: fetcher job configuration, including the retry settings
    """
    retry_settings = cfg.retry

    def _is_retryable(exc) -> bool:
        return isinstance(exc, RetryableError)

    @retry(
        retry_on_exception=_is_retryable,
        wait_exponential_multiplier=retry_settings.exp_multiplier,
        wait_exponential_max=retry_settings.exp_max,
        stop_max_attempt_number=retry_settings.max_attempts,
    )
    def _retry_fetch(cfg) -> FetchedType:
        return _fetch(cfg)

    try:
        fetched_type = _retry_fetch(cfg)
    except (RetryableError, UnRetryableError) as ex:
        logger.exception("Download error. Unretryable or out of attempts")
        failure = FetcherResult(status=FetcherStatus.FAILED, message=str(ex))
        _update_zk_node(cfg, failure)
    else:
        success = FetcherResult(status=FetcherStatus.DONE, message=SUCCESS_MESSAGE, type=fetched_type)
        _update_zk_node(cfg, success)
def __handle_node_state(self, zk_node_path: str, on_done: DownloadOnDone, content: DownloadableContent):
    """Read the fetch state stored in a ZooKeeper node and finish the fetch if it is final.

    The node is read with a callback that forwards later events to
    ``__on_zk_changed``. When the stored status is final, the result is
    applied to ``content``, the node is deleted and ``on_done`` is called.

    :param zk_node_path: path of the node holding the serialized FetcherResult
    :param on_done: callback invoked with ``content`` once the state is final
    :param content: the content whose fetch is being tracked
    """

    def _on_zk_changed(evt):
        self.__on_zk_changed(evt, on_done, content)

    raw_state, _stat = self._zk.get(zk_node_path, _on_zk_changed)

    result: FetcherResult = FetcherResult.from_binary(raw_state)
    logger.info("Fetch request %s result = %s", content, result)

    if not result.status.final:
        # Not finished yet; nothing more to do on this read.
        return

    result.update(content)

    # We clean up
    self._zk.delete(zk_node_path)
    on_done(content)
def _update_nodes_to_cancel(self, client_id: str, action_id: str) -> int:
    """Rewrite every non-final fetch node of an action to CANCELED.

    Each child of the action's node is read, and unless its status is
    already final, written back as CANCELED with the version that was read.
    A version conflict makes the node be re-read and the write retried.
    Nodes that disappear in the meantime are skipped.

    :param client_id: client owning the action
    :param action_id: action whose fetch nodes should be canceled
    :return: number of nodes successfully written as CANCELED
    """
    # As always with stop-flags, we can face a bunch of race conditions
    zk_node_path = self._get_node_path(client_id, action_id)
    canceled_count = 0

    try:
        for child in self._zk.get_children(zk_node_path):
            abs_path = zk_node_path + "/" + child
            logger.info(f"Updating node {abs_path}")

            try:
                while True:
                    raw_state, stat = self._zk.get(abs_path)
                    state: FetcherResult = FetcherResult.from_binary(raw_state)

                    # The guy is final - it will not take long for us to cancel it.
                    # The job is finished.
                    # So now we are in a race with a zookeeper listener, that will pass the results downstream.
                    if state.status.final:
                        logger.info(f"{abs_path}: not to be canceled - already finished")
                        break

                    state.status = FetcherStatus.CANCELED
                    canceled_payload = state.to_binary()
                    try:
                        self._zk.set(abs_path, canceled_payload, version=stat.version)
                    except BadVersionError:
                        # Somebody changed the node after our read: read it again and retry.
                        logger.info(f"{abs_path}: the node was updated meanwhile")
                        continue
                    else:
                        canceled_count += 1

                    logger.info(f"{abs_path}: canceled")
                    break
            except NoNodeError:
                # The task was just finished - status was reported to customer and the node got deleted.
                # OK. It's not our deal anymore
                logger.info(f"{abs_path}: the node was deleted meanwhile")
    except NoNodeError:
        # Absorb NoNodeError
        logger.info(f"{zk_node_path}: node not found")

    return canceled_count
import pytest
from pytest import fixture

from bai_kafka_utils.events import FetcherStatus, FetchedType, DownloadableContent
from bai_zk_utils.states import FetcherResult

ERROR = "Error"

# Reference result and the binary payload it is compared against in the tests below.
FETCHER_DONE_RESULT = FetcherResult(FetcherStatus.DONE, FetchedType.FILE, "Success")

STATE_DONE_BIN = b'{"status": "DONE", "type": "FILE", "message": "Success"}'
STATE_RUNNING_BIN = b'{"status": "RUNNING"}'
STATE_STRANGE_BIN = b'{"status": "STRANGE"}'


def test_serialize_state():
    """Serializing the reference DONE result yields the reference payload."""
    serialized = FETCHER_DONE_RESULT.to_binary()
    assert STATE_DONE_BIN == serialized


def test_deserialize_state():
    """Deserializing the reference payload yields the reference DONE result."""
    deserialized = FetcherResult.from_binary(STATE_DONE_BIN)
    assert deserialized == FETCHER_DONE_RESULT


def test_deserialize_state_final():
    """The status deserialized from the DONE payload is final."""
    assert FetcherResult.from_binary(STATE_DONE_BIN).status.final
def update_zk_node(zk_node_path: str, zookeeper_ensemble: str, state: FetcherResult):
    """Write the serialized ``state`` into an existing ZooKeeper node.

    A dedicated client is started for the single write and is always
    stopped afterwards, including when the write raises.

    :param zk_node_path: path of the node to update
    :param zookeeper_ensemble: hosts string handed to ``KazooClient``
    :param state: result to store; serialized with ``to_binary()``
    :raises Exception: whatever the client raises while writing is
        propagated after the client has been stopped
    """
    zk = KazooClient(hosts=zookeeper_ensemble)
    zk.start()
    try:
        zk.set(zk_node_path, state.to_binary())
    finally:
        # Do not leak the connection when the write fails.
        zk.stop()
def test_deserialize_state_final():
    """The status deserialized from STATE_DONE_BIN is final."""
    done_status = FetcherResult.from_binary(STATE_DONE_BIN).status
    assert done_status.final
def test_deserialize_state():
    """Deserializing STATE_DONE_BIN yields FETCHER_DONE_RESULT."""
    deserialized = FetcherResult.from_binary(STATE_DONE_BIN)
    assert deserialized == FETCHER_DONE_RESULT
def test_deserialize_state_not_final():
    """The status deserialized from STATE_RUNNING_BIN is not final."""
    running_status = FetcherResult.from_binary(STATE_RUNNING_BIN).status
    assert not running_status.final
from kazoo.client import KazooClient
from unittest.mock import patch, create_autospec

from bai_zk_utils import zk_client
from bai_zk_utils.states import FetcherResult
from bai_kafka_utils.events import FetcherStatus
from bai_zk_utils.zk_client import update_zk_node

# The message is passed by keyword so it cannot land in another positional field.
FETCHER_RESULT = FetcherResult(FetcherStatus.DONE, message="Success")

ZK_NODE_PATH = "/zk/path"
ZK_ENSEMBLE = "Z1"


@patch.object(zk_client, "KazooClient")
def test_update_zk_node(mockKazooClient):
    """update_zk_node starts a client, writes the serialized result and stops the client."""
    mock_zk_client = mockKazooClient.return_value = create_autospec(KazooClient)

    update_zk_node(ZK_NODE_PATH, ZK_ENSEMBLE, FETCHER_RESULT)

    mockKazooClient.assert_called_with(hosts=ZK_ENSEMBLE)
    mock_zk_client.start.assert_called_once()
    mock_zk_client.set.assert_called_with(ZK_NODE_PATH, FETCHER_RESULT.to_binary())
    mock_zk_client.stop.assert_called_once()
def test_deserialize_state_strange():
    """Deserializing STATE_STRANGE_BIN raises."""
    strange_payload = STATE_STRANGE_BIN
    with pytest.raises(Exception):
        FetcherResult.from_binary(strange_payload)
class DownloadManager:
    """Coordinates content downloads through ZooKeeper nodes.

    For every fetch the manager takes a write lock on the content, records
    the fetch state in a ZooKeeper node, hands the actual transfer to the
    download dispatcher and reports back through a callback once the node
    holds a final state.
    """

    @staticmethod
    def __get_node_path(client_id: str, action_id: str = None, content: DownloadableContent = None) -> str:
        """Build the node path ``/downloads/<client>[/<action>[/<content md5>]]``."""
        # MD5 has impact on the node - so different locks etc.
        path = f"/downloads/{client_id}"
        if action_id:
            path += f"/{action_id}"
            if content:
                path += f"/{md5sum(str(content))}"
        return path

    # Serialized state every freshly created fetch node starts with.
    INITIAL_DATA = FetcherResult(FetcherStatus.PENDING).to_binary()

    @staticmethod
    def _set_failed(content: DownloadableContent, message: str):
        """Mark ``content`` as FAILED with ``message`` and clear its destination."""
        content.message = message
        content.status = FetcherStatus.FAILED
        content.dst = None

    def __init__(
        self,
        zk: KazooClient,
        download_dispatcher: DownloadDispatcher,
        lock_manager: RWLockManager,
        get_node_path: NodePathSource = None,
        size_estimator: ContentSizeEstimator = None,
    ):
        """
        :param zk: ZooKeeper client used for the fetch state nodes
        :param download_dispatcher: performs, cleans up and cancels the fetches
        :param lock_manager: provides the per-content write lock
        :param get_node_path: optional override of the node path builder
        :param size_estimator: optional override of the content size
            estimator; defaults to ``estimate_fetch_size``
        """
        self._zk = zk
        self._download_dispatcher = download_dispatcher
        self._get_node_path = get_node_path or DownloadManager.__get_node_path
        self._lock_manager = lock_manager
        self._size_estimator = size_estimator or estimate_fetch_size

    def start(self) -> None:
        """Start the ZooKeeper client."""
        logger.info("Start")
        self._zk.start()

    def fetch(self, content: DownloadableContent, event: BenchmarkEvent, on_done: DownloadOnDone) -> None:
        """Request the download of ``content`` on behalf of ``event``.

        The work starts once the write lock for ``content`` is acquired.
        The content size is estimated exactly once, inside a guarded block,
        so that an estimator failure is always reported through ``on_done``
        and always releases the lock.

        :param content: the content to download
        :param event: the event requesting the download; supplies the
            client and action ids for the node path
        :param on_done: callback invoked with ``content`` when the fetch
            reached a final state or could not be started
        """
        logger.info("Fetch request %s", content)

        def on_content_locked(content: DownloadableContent, lock: RWLock):
            def _on_done_and_unlock(content: DownloadableContent):
                # Notify the caller first, then clean up and release the lock.
                on_done(content)
                self._download_dispatcher.cleanup(content, event)
                lock.release()

            try:
                content.size_info = self._size_estimator(content.src)
            except Exception as e:
                msg = f"Failed to estimate the size of content {content.src}: {str(e)}"
                logger.exception(f"{msg}")
                FetcherResult(FetcherStatus.FAILED, None, msg).update(content)
                # Nothing was dispatched yet, so only notify and unlock.
                on_done(content)
                lock.release()
                return

            # This node will be killed if I die
            zk_node_path = self._get_node_path(event.client_id, event.action_id, content)
            self._zk.create(zk_node_path, DownloadManager.INITIAL_DATA, ephemeral=True, makepath=True)

            self.__handle_node_state(zk_node_path, _on_done_and_unlock, content)

            self._download_dispatcher.dispatch_fetch(content, event, zk_node_path)

        self._lock_manager.acquire_write_lock(content, on_content_locked)

    def __on_zk_changed(self, event: WatchedEvent, on_done: DownloadOnDone, content: DownloadableContent):
        """React to an event on a fetch node by re-reading its state.

        A DELETED event is not followed up; it is only logged as an error
        when the content has no status yet.
        """
        if event.type == EventType.DELETED:
            if not content.status:  # Something not final - and deleted???
                logger.error("Deleted node %s for the not finalized content %s", event.path, content)
                # TODO More sophisticated handling of that?
            return

        self.__handle_node_state(event.path, on_done, content)

    def __handle_node_state(self, zk_node_path: str, on_done: DownloadOnDone, content: DownloadableContent):
        """Read the state stored in a fetch node and finish the fetch if it is final.

        The node is read with a callback that forwards later events to
        ``__on_zk_changed``. When the stored status is final, the result is
        applied to ``content``, the node is deleted and ``on_done`` is called.
        """

        def _on_zk_changed(evt):
            self.__on_zk_changed(evt, on_done, content)

        data, _ = self._zk.get(zk_node_path, _on_zk_changed)

        result: FetcherResult = FetcherResult.from_binary(data)
        logger.info("Fetch request %s result = %s", content, result)

        if result.status.final:
            result.update(content)

            # We clean up
            self._zk.delete(zk_node_path)
            on_done(content)

    def stop(self) -> None:
        """Stop the ZooKeeper client."""
        logger.info("Stop")
        self._zk.stop()

    def cancel(self, client_id: str, action_id: str) -> Tuple[List[str], int]:
        """Cancel all fetches of an action.

        :return: a pair of the dispatcher's ``cancel_all`` result and the
            number of nodes written as CANCELED
        """
        logger.info(f"Canceling action {client_id}/{action_id}")
        return (
            self._download_dispatcher.cancel_all(client_id, action_id),
            self._update_nodes_to_cancel(client_id, action_id),
        )

    def _update_nodes_to_cancel(self, client_id: str, action_id: str) -> int:
        """Rewrite every non-final fetch node of an action to CANCELED.

        Each child node is written back with the version that was read; a
        version conflict makes the node be re-read and the write retried.
        Nodes that disappear in the meantime are skipped.

        :return: number of nodes successfully written as CANCELED
        """
        # As always with stop-flags, we can face a bunch of race conditions
        zk_node_path = self._get_node_path(client_id, action_id)

        number_of_nodes_updated = 0

        try:
            for child in self._zk.get_children(zk_node_path):
                abs_path = zk_node_path + "/" + child

                logger.info(f"Updating node {abs_path}")

                try:
                    while True:
                        data, zk_stat = self._zk.get(abs_path)

                        result: FetcherResult = FetcherResult.from_binary(data)

                        # The guy is final - it will not take long for us to cancel it.
                        # The job is finished.
                        # So now we are in a race with a zookeeper listener, that will pass the results downstream.
                        if result.status.final:
                            logger.info(f"{abs_path}: not to be canceled - already finished")
                            break
                        result.status = FetcherStatus.CANCELED

                        new_data = result.to_binary()
                        try:
                            self._zk.set(abs_path, new_data, version=zk_stat.version)
                            number_of_nodes_updated += 1
                        except BadVersionError:
                            # Changed after our read: read it again and retry.
                            logger.info(f"{abs_path}: the node was updated meanwhile")
                            continue
                        logger.info(f"{abs_path}: canceled")
                        break

                except NoNodeError:
                    logger.info(f"{abs_path}: the node was deleted meanwhile")
                    # The task was just finished - status was reported to customer and the node got deleted.
                    # OK. It's not our deal anymore
                    continue
        except NoNodeError:
            # Absorb NoNodeError
            logger.info(f"{zk_node_path}: node not found")

        return number_of_nodes_updated
def _mock_result_binary(status: FetcherStatus, msg: str = None):
    """Return the binary form of a FetcherResult with the given status and message.

    The message is passed by keyword so it cannot end up in another
    positional field of ``FetcherResult``.

    :param status: status of the mocked result
    :param msg: optional message of the mocked result
    :return: the result serialized with ``to_binary()``
    """
    return FetcherResult(status, message=msg).to_binary()