def s3_estimate_size(src: str, s3: Any = None) -> ContentSizeInfo:
    """Estimate the size of the content at an S3 url.

    Tries to treat the url as a single object first; if that fails or the
    object is empty, falls back to summing all objects under the key prefix
    (a "folder").

    :param src: s3://bucket/key url of the object or folder
    :param s3: optional boto3 S3 service resource; created on demand if omitted
    :return: ContentSizeInfo with total size, file count and largest file size
    :raises S3Error: if listing the objects under the prefix fails
    """
    s3 = s3 or boto3.resource("s3")

    dst_url = urlparse(src)
    bucket_name = dst_url.netloc
    key = dst_url.path[1:]  # strip the leading "/" of the url path

    bucket = s3.Bucket(bucket_name)

    # Are we able to access the key on its own?
    obj = bucket.Object(key)
    try:
        if obj.content_length > 0:
            return ContentSizeInfo(obj.content_length, 1, obj.content_length)
    except ClientError:
        # Not fatal - the key may denote a "folder" rather than an object.
        # Lazy %-formatting so the message is only built if the level is enabled.
        logger.info("Failed to get content_length for %s. May be not an object at all", obj)

    cnt = 0
    total_size = 0
    max_size = 0
    try:
        for sub_obj in bucket.objects.filter(Prefix=obj.key):
            if not sub_obj.size:
                # Skip zero-sized entries (e.g. folder placeholder keys)
                continue
            cnt += 1
            total_size += sub_obj.size
            max_size = max(max_size, sub_obj.size)
    except ClientError as e:
        raise S3Error(str(e)) from e
    return ContentSizeInfo(int(total_size), cnt, int(max_size))
def fetcher_event(descriptor_as_adict) -> FetcherBenchmarkEvent:
    """Build a FetcherBenchmarkEvent fixture wrapping the given descriptor."""
    dataset = DownloadableContent(
        src="http://someserver.com/somedata.zip",
        dst=DATASET_S3_URI,
        path="/mount/path",
        id=DATASET_ID,
        size_info=ContentSizeInfo(total_size=42, file_count=1, max_size=42),
        type=FetchedType.FILE,
    )
    doc = BenchmarkDoc(contents=descriptor_as_adict.to_dict(), doc="", sha1="SHA")
    payload = FetcherPayload(toml=doc, scripts=SCRIPTS, datasets=[dataset])
    return FetcherBenchmarkEvent(
        action_id=ACTION_ID,
        message_id="MESSAGE_ID",
        client_id="CLIENT_ID",
        client_version="CLIENT_VERSION",
        client_username="******",
        authenticated=False,
        tstamp=42,
        visited=[],
        type="PRODUCER_TOPIC",
        payload=payload,
    )
def test_http_estimator(mock_curl):
    """http_estimate_size issues a body-less request and reads Content-Length."""
    result = http_estimate_size(SOME_DATASET_SRC)

    expected_opts = [
        call(pycurl.URL, SOME_DATASET_SRC),
        call(pycurl.NOBODY, 1),
        call(pycurl.HEADER, 1),
    ]
    mock_curl.setopt.assert_has_calls(expected_opts, any_order=True)
    mock_curl.getinfo.assert_called_with(pycurl.CONTENT_LENGTH_DOWNLOAD)

    assert result == ContentSizeInfo(DATA_SIZE, 1, DATA_SIZE)
def http_estimate_size(src) -> ContentSizeInfo:
    """Estimate the size of the content behind an HTTP(S) url.

    Performs a body-less (HEAD-like) request and reads the Content-Length
    reported by the server.

    :param src: url of the content
    :return: ContentSizeInfo describing a single file of the reported length
    """
    curl = pycurl.Curl()
    try:
        curl.setopt(pycurl.URL, src)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        curl.setopt(pycurl.CONNECTTIMEOUT, 30)
        curl.setopt(pycurl.TIMEOUT, 60)  # 60s should be enough to send HEAD and get back
        curl.setopt(pycurl.HEADER, 1)
        curl.setopt(pycurl.NOBODY, 1)  # no body - only the headers are needed
        http_perform(curl)
        # NOTE(review): getinfo returns -1.0 when the server reports no length;
        # that value is passed through unchanged - confirm callers handle it.
        content_length = curl.getinfo(pycurl.CONTENT_LENGTH_DOWNLOAD)
    finally:
        # The original leaked the handle; always release libcurl resources.
        curl.close()
    return ContentSizeInfo(int(content_length), 1, int(content_length))
def test_http_estimator():
    """A huge remote file should report its full size as a single file."""
    expected = ContentSizeInfo(HUGE_SIZE, 1, HUGE_SIZE)
    assert http_estimate_size(BIG_FILE) == expected
def test_s3_estimator_folder(mock_s3_with_folder):
    """An s3 folder url is sized by summing the objects under its prefix."""
    expected = ContentSizeInfo(SOME_SIZE, 1, SOME_SIZE)
    assert s3_estimate_size(SOME_S3_FOLDER, mock_s3_with_folder) == expected
def test_s3_estimator_file(mock_s3_with_file):
    """A single s3 object is sized directly from its content_length."""
    expected = ContentSizeInfo(SOME_SIZE, 1, SOME_SIZE)
    assert s3_estimate_size(SOME_S3_FILE, mock_s3_with_file) == expected
def test_s3_with_a_folder(s3_with_a_folder):
    """A folder of FILE_COUNT equal files reports the aggregated totals."""
    result = s3_estimate_size(S3_FOLDER, s3_with_a_folder)
    assert result == ContentSizeInfo(FILE_SIZE * FILE_COUNT, FILE_COUNT, FILE_SIZE)
def test_s3_with_a_file(s3_with_a_file):
    """A single file reports itself as one file of FILE_SIZE."""
    result = s3_estimate_size(S3_SINGLE_FILE, s3_with_a_file)
    assert result == ContentSizeInfo(FILE_SIZE, 1, FILE_SIZE)
# This test can be executed from IDE # API boundary test - should just not fail starting the job - the job itself can fail from bai_k8s_utils.kubernetes_tests_client import KubernetesTestUtilsClient S3_DST = "s3://dst" SOMEDATA_BIG = "http://*****:*****@pytest.mark.parametrize("size_info", [BIG_SIZE, SMALL_SIZE], ids=["big", "small"]) def test_kubernetes_client( k8s_dispatcher: KubernetesDispatcher, benchmark_event_dummy_payload: BenchmarkEvent, k8s_test_client: KubernetesTestUtilsClient, fetcher_job_config: FetcherJobConfig, size_info: ContentSizeInfo, ): data_set = DownloadableContent(src=SOMEDATA_BIG, path="/mount/path", dst=S3_DST, md5=None, size_info=size_info) k8s_dispatcher.dispatch_fetch(data_set, benchmark_event_dummy_payload, "/data/sets/fake")
    DownloadDispatcher,
    DownloadOnDone,
    ContentSizeEstimator,
)

# Shared fixture constants for the dispatcher tests.
FILE_SIZE = 42
ZK_VERSION = 1
CLIENT_ID = "CLIENT_ID"
ACTION_ID = "ACTION_ID"
SOME_PATH = "/some/path"
SOME_SIZE_INFO = ContentSizeInfo(FILE_SIZE, 1, FILE_SIZE)


def mock_size_estimator(src: str) -> ContentSizeInfo:
    """Deterministic stand-in estimator: always reports SOME_SIZE_INFO."""
    return SOME_SIZE_INFO


@fixture
def failing_size_estimator() -> ContentSizeEstimator:
    """Estimator mock that raises UnRetryableError on every call."""
    mock = create_autospec(ContentSizeEstimator)
    mock.side_effect = UnRetryableError()
    return mock


def data_set_to_path(client_id: str, action_id: str = None,
# Fixture constants for the fetcher job dispatcher tests.
NODE_SELECTOR = {"label1": "val1", "label2": "val2"}
NAMESPACE = "internal"
PULL_POLICY = "OnFailure"
RESTART_POLICY = "OnFailure"
TTL = 42  # job ttl value passed to FetcherJobConfig - units not shown here; presumably seconds

SMALL_DATA_SET_SIZE = 1 * MB
MIN_VOLUME_SIZE_MB = 64
# Two fixture sizes: one well below and one exactly at the minimum volume size.
SMALL_DATA_SET_SIZE_INFO = ContentSizeInfo(SMALL_DATA_SET_SIZE, 1, SMALL_DATA_SET_SIZE)
BIG_DATA_SET_SIZE_INFO = ContentSizeInfo(MIN_VOLUME_SIZE_MB * MB, 1, MIN_VOLUME_SIZE_MB * MB)

FETCHER_JOB_CONFIG = FetcherJobConfig(
    namespace=NAMESPACE,
    image=FETCHER_JOB_IMAGE,
    node_selector=NODE_SELECTOR,
    pull_policy=PULL_POLICY,
    ttl=TTL,
    restart_policy=RESTART_POLICY,
    volume=FetcherVolumeConfig(MIN_VOLUME_SIZE_MB),
)

KUBECONFIG = "path/cfg"