def test_regression_s3_utils_short_name_c4_706():
    # Environment long names work (at least in legacy CGAP)
    s3Utils(env="fourfront-mastertest")
    with known_bug_expected(jira_ticket="C4-706", fixed=True, error_class=ClientError):
        # Short names are not allowed.
        s3Utils(env="mastertest")

def test_s3_utils_environment_variable_use():
    with pytest.raises(SynonymousEnvironmentVariablesMismatched):
        with override_environ(GLOBAL_BUCKET_ENV='should-be-unused', GLOBAL_ENV_BUCKET='inconsistently-unused'):
            # If we do the simple-minded version of this, the environment variable doesn't matter
            s3Utils(sys_bucket='foo')
            with pytest.raises(SynonymousEnvironmentVariablesMismatched):
                # If we don't initialize the sys_bucket, we have to go through the smart protocols
                # and expect environment variables to be in order.
                s3Utils()

def mocked_s3_integration(integrated_names=None, zip_suffix="", ffenv=None):
    """ This does common setup of some mocks needed by zip testing. """
    zip_path_key = "zip_path" + zip_suffix
    zip_filename_key = "zip_filename" + zip_suffix
    b3 = MockBoto3()
    if not ffenv:
        ffenv = integrated_names['ffenv'] if integrated_names else None
    with mock.patch.object(s3_utils_module, "boto3", b3):
        s3_connection = s3Utils(env=ffenv)
        if integrated_names is not None:
            # Not needed when mocked.
            # s3_connection.s3_delete_dir(prefix)
            # In our mock, this won't exist already on S3 like in the integrated version of this test,
            # so we have to pre-load to our mock S3 manually. -kmp 13-Jan-2021
            s3_connection.s3.upload_file(Filename=integrated_names[zip_path_key],
                                         Bucket=s3_connection.outfile_bucket,
                                         Key=integrated_names[zip_filename_key])
            s3_connection.s3.put_object(Bucket=s3_connection.outfile_bucket,
                                        Key=integrated_names['filename'],
                                        Body=str.encode('thisisatest'))
        yield s3_connection

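# --- Hedged usage sketch (not from the source) ---
# mocked_s3_integration above is a generator, so callers would typically wrap it with
# contextlib.contextmanager and consume it in a `with` block so the boto3 patch stays active
# for the duration of the test body. The test below is a hypothetical illustration:
# `integrated_names` is assumed to be a pytest fixture supplying the keys used above
# ('ffenv', 'filename', 'zip_path', 'zip_filename'), and the mock S3 client is assumed to
# serve back objects it was given via upload_file/put_object.
import contextlib


@contextlib.contextmanager
def mocked_s3_integration_context(integrated_names=None, zip_suffix="", ffenv=None):
    # Delegate to the generator so setup and teardown bracket the caller's block.
    yield from mocked_s3_integration(integrated_names=integrated_names,
                                     zip_suffix=zip_suffix, ffenv=ffenv)


def test_zip_file_is_preloaded_into_mock_s3(integrated_names):  # hypothetical test name
    with mocked_s3_integration_context(integrated_names=integrated_names) as s3_connection:
        # The zip file uploaded during setup should be visible in the mocked outfile bucket.
        assert s3_connection.does_key_exist(integrated_names['zip_filename'])
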
def test_s3utils_delete_key():
    sample_key_name = "--- reserved_key_name_for_unit_testing ---"
    util = s3Utils(env='fourfront-mastertest')
    with mock.patch.object(util.s3, "delete_object") as mock_delete_object:

        def make_mocked_delete_object(expected_bucket, expected_key):
            def mocked_delete_object(Bucket, Key):  # noQA - AWS chooses the arg names
                assert Bucket == expected_bucket
                assert Key == expected_key
            return mocked_delete_object

        mock_delete_object.side_effect = make_mocked_delete_object(expected_bucket=util.outfile_bucket,
                                                                   expected_key=sample_key_name)
        util.delete_key(sample_key_name)  # This won't err if everything went well
        assert mock_delete_object.call_count == 1

        explicit_bucket = '--- reserved_bucket_name_for_unit_testing ---'
        mock_delete_object.side_effect = make_mocked_delete_object(expected_bucket=explicit_bucket,
                                                                   expected_key=sample_key_name)
        util.delete_key(sample_key_name, bucket=explicit_bucket)
        assert mock_delete_object.call_count == 2

def test_s3utils_s3_put():
    util = s3Utils(env='fourfront-mastertest')
    some_content_type = "text/plain"
    with mock.patch("mimetypes.guess_type") as mock_guess_type:
        mock_guess_type.return_value = [some_content_type]
        with mock.patch.object(util.s3, "put_object") as mock_put_object:

            def mocked_put_object(**kwargs):
                return kwargs

            mock_put_object.side_effect = mocked_put_object
            item = {'a': 1, 'b': 2}
            some_key = 'some-key'
            assert util.s3_put(item, upload_key=some_key) == {
                "Body": item,
                "Bucket": util.outfile_bucket,
                "Key": some_key,
                "ContentType": some_content_type,
            }
            some_acl = 'some-acl'
            assert util.s3_put(item, upload_key=some_key, acl=some_acl) == {
                "Body": item,
                "Bucket": util.outfile_bucket,
                "Key": some_key,
                "ContentType": some_content_type,
                "ACL": some_acl,
            }

def test_s3utils_s3_put_secret():
    util = s3Utils(env='fourfront-mastertest')
    standard_algorithm = "AES256"
    environmental_key = 'environmental-key'
    with override_environ(S3_ENCRYPT_KEY=environmental_key):
        with mock.patch.object(util.s3, "put_object") as mock_put_object:

            def mocked_put_object(**kwargs):
                return kwargs

            mock_put_object.side_effect = mocked_put_object
            item = {'a': 1, 'b': 2}
            some_key = 'some-key'
            some_secret = 'some-secret'
            assert util.s3_put_secret(item, keyname=some_key) == {
                "Body": item,
                "Bucket": util.sys_bucket,
                "Key": some_key,
                "SSECustomerKey": environmental_key,
                "SSECustomerAlgorithm": standard_algorithm,
            }
            some_bucket = 'some-bucket'
            assert util.s3_put_secret(item, keyname=some_key, bucket=some_bucket) == {
                "Body": item,
                "Bucket": some_bucket,
                "Key": some_key,
                "SSECustomerKey": environmental_key,
                "SSECustomerAlgorithm": standard_algorithm,
            }
            assert util.s3_put_secret(item, keyname=some_key, secret=some_secret) == {
                "Body": item,
                "Bucket": util.sys_bucket,
                "Key": some_key,
                "SSECustomerKey": some_secret,
                "SSECustomerAlgorithm": standard_algorithm,
            }

def __init__(self, fs_environ, fs_environ_info, test=False, use_es=True, host=None):
    # FOURSIGHT information
    self.fs_env = fs_environ
    es = ESConnection(index=fs_environ_info.get('bucket'), host=host) if use_es else None
    self.connections = {
        's3': S3Connection(fs_environ_info.get('bucket')),
        'es': es
    }
    # FOURFRONT information
    self.ff_server = fs_environ_info['fourfront']
    self.ff_env = fs_environ_info['ff_env']
    self.ff_es = fs_environ_info['es']
    if not test:
        self.ff_s3 = s3Utils(env=self.ff_env)
        try:
            # TODO: make this configurable from env variables?
            self.ff_keys = self.ff_s3.get_access_keys('access_key_foursight')
        except Exception as e:
            raise Exception(
                'Could not initiate connection to Fourfront; it is probably a bad ff_env. '
                'You gave: %s. Error message: %s' % (self.ff_env, str(e)))
        # ensure ff_keys has server, and make sure it does not end with '/'
        if 'server' not in self.ff_keys:
            server = self.ff_server[:-1] if self.ff_server.endswith('/') else self.ff_server
            self.ff_keys['server'] = server
    else:
        self.ff_s3 = None
        self.ff_keys = None

def integrated_s3_info(integrated_names):
    """ Ensure the test files are present in the s3 sys bucket of the integrated environment
        (probably 'fourfront-mastertest') and return some info on them
    """
    test_filename = integrated_names['filename']
    s3_obj = s3Utils(env=INTEGRATED_ENV)
    # for now, always upload these files
    s3_obj.s3.put_object(Bucket=s3_obj.outfile_bucket, Key=test_filename,
                         Body=str.encode('thisisatest'))
    s3_obj.s3.upload_file(Filename=integrated_names['zip_path'],
                          Bucket=s3_obj.outfile_bucket,
                          Key=integrated_names['zip_filename'])
    s3_obj.s3.upload_file(Filename=integrated_names['zip_path2'],
                          Bucket=s3_obj.outfile_bucket,
                          Key=integrated_names['zip_filename2'])
    return {
        's3Obj': s3_obj,
        'filename': test_filename,
        'zip_filename': integrated_names['zip_filename'],
        'zip_filename2': integrated_names['zip_filename2'],
    }

def update_config(config, app_name, input_files, parameters):
    if config['instance_type'] != '' and config['ebs_size'] != 0 and config['EBS_optimized'] != '':
        pass
    else:
        input_size_in_bytes = dict()
        for argname, f in input_files.items():
            bucket = f['bucket_name']
            s3 = s3_utils.s3Utils(bucket, bucket, bucket)
            if isinstance(f['object_key'], list):
                size = []
                for key in f['object_key']:
                    try:
                        size.append(s3.get_file_size(key, bucket))
                    except Exception:
                        raise Exception("Can't get input file size")
            else:
                try:
                    size = s3.get_file_size(f['object_key'], bucket)
                except Exception:
                    raise Exception("Can't get input file size")
            input_size_in_bytes.update({str(argname): size})

        print({"input_size_in_bytes": input_size_in_bytes})
        try:
            res = B.benchmark(app_name, {'input_size_in_bytes': input_size_in_bytes,
                                         'parameters': parameters})
        except Exception:
            # If benchmarking fails, report whatever partial result exists (usually none).
            try:
                res_info = str(res)
            except NameError:
                res_info = "None"
            raise Exception("Benchmarking not working. : {}".format(res_info))

        if res is not None:
            logger.info(str(res))
            instance_type = res['aws']['recommended_instance_type']
            ebs_size = 10 if res['total_size_in_GB'] < 10 else int(res['total_size_in_GB']) + 1
            ebs_opt = res['aws']['EBS_optimized']
            if config['instance_type'] == '':
                config['instance_type'] = instance_type
            if config['ebs_size'] == 0:
                config['ebs_size'] = ebs_size
            if config['EBS_optimized'] == '':
                config['EBS_optimized'] = ebs_opt
        elif config['instance_type'] == '':
            raise Exception("instance type cannot be determined nor given")
        elif config['ebs_size'] == 0:
            raise Exception("ebs_size cannot be determined nor given")
        elif config['EBS_optimized'] == '':
            raise Exception("EBS_optimized cannot be determined nor given")

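# Hedged usage sketch (illustrative values, not from the source): update_config mutates
# `config` in place, filling only the fields left blank ('' or 0) from the benchmark result;
# fields the caller pre-sets are kept as given. The app name, bucket, and key below are
# hypothetical.
#
# config = {'instance_type': '', 'ebs_size': 0, 'EBS_optimized': ''}
# input_files = {'input_fastq': {'bucket_name': 'some-raw-file-bucket',
#                                'object_key': 'some-file.fastq.gz'}}
# update_config(config, 'md5', input_files, parameters={})
# # config now holds the recommended instance_type, ebs_size, and EBS_optimized.
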
def test_s3utils_get_keys_for_staging():
    # TODO: I'm not sure what this is testing, so it's hard to rewrite
    # But I fear this use of env 'staging' implies the GA test environment has overbroad privilege.
    # We should make this work without access to 'staging'.
    # -kmp 13-Jan-2021
    util = s3Utils(env='staging')
    keys = util.get_ff_key()
    assert keys['server'] == 'http://staging.4dnucleome.org'

def test_s3utils_get_google_key():
    util = s3Utils(env='staging')
    keys = util.get_google_key()
    assert isinstance(keys, dict)
    assert keys['type'] == 'service_account'
    assert keys["project_id"] == "fourdn-fourfront"
    for dict_key in ['private_key_id', 'private_key', 'client_email', 'client_id',
                     'auth_uri', 'client_x509_cert_url']:
        assert keys[dict_key]

def test_s3utils_get_higlass_key_integrated():
    # TODO: I'm not sure what this is testing, so it's hard to rewrite
    # But I fear this use of env 'staging' implies the GA test environment has overbroad privilege.
    # We should make this work without access to 'staging'.
    # -kmp 13-Jan-2021
    util = s3Utils(env='staging')
    keys = util.get_higlass_key()
    assert isinstance(keys, dict)
    assert 3 == len(keys.keys())

def test_s3utils_get_jupyterhub_key():
    # TODO: I'm not sure what this is testing, so it's hard to rewrite
    # But I fear this use of env 'data' implies the GA test environment has overbroad privilege.
    # We should make this work without access to 'data'.
    # -kmp 13-Jan-2021
    util = s3Utils(env='data')
    key = util.get_jupyterhub_key()
    assert 'secret' in key
    assert key['server'] == 'https://jupyter.4dnucleome.org'

def test_s3utils_get_keys_for_data():
    util = s3Utils(env='data')
    keys = util.get_access_keys()
    assert keys['server'] == 'https://data.4dnucleome.org'
    # make sure we have keys for foursight and tibanna as well
    keys_tb = util.get_access_keys('access_key_tibanna')
    assert keys_tb['key'] != keys['key']
    assert keys_tb['server'] == keys['server']
    keys_fs = util.get_access_keys('access_key_foursight')
    assert keys_fs['key'] != keys_tb['key'] != keys['key']
    assert keys_fs['server'] == keys['server']

def __init__(self, env, ff_keys=None, sbg_keys=None, settings=None):
    self.env = env
    self.s3 = s3Utils(env=env)
    if not ff_keys:
        ff_keys = self.s3.get_access_keys()
    self.ff_keys = ff_keys
    if not settings:
        settings = {}
    self.settings = settings

def test_it():
    # As long as sys_bucket= is given in the s3Utils() call, it will just fill the slots
    # with given values and won't try to do anything smart.
    s = s3Utils(outfile_bucket, sys_bucket, raw_file_bucket)
    assert s.outfile_bucket == outfile_bucket
    assert s.sys_bucket == sys_bucket
    assert s.raw_file_bucket == raw_file_bucket
    assert s.blob_bucket is None
    assert s.metadata_bucket is None
    assert s.tibanna_cwls_bucket is None
    assert s.tibanna_output_bucket is None

    s = s3Utils(sys_bucket=sys_bucket)
    assert s.outfile_bucket is None
    assert s.sys_bucket == sys_bucket
    assert s.raw_file_bucket is None
    assert s.blob_bucket is None
    assert s.metadata_bucket is None
    assert s.tibanna_cwls_bucket is None
    assert s.tibanna_output_bucket is None

def test_s3utils_get_access_keys_with_old_style_default():
    util = s3Utils(env='fourfront-mastertest')
    with mock.patch.object(util, "get_key") as mock_get_key:
        actual_key = {'key': 'some-key', 'server': 'some-server'}

        def mocked_get_key(keyfile_name):
            ignored(keyfile_name)
            key_wrapper = {'default': actual_key}
            return key_wrapper

        mock_get_key.side_effect = mocked_get_key
        key = util.get_access_keys()
        assert key == actual_key

def test_s3utils_get_key_non_json_data():
    util = s3Utils(env='fourfront-mastertest')
    non_json_string = '1 { 2 3 >'
    with mock.patch.object(util.s3, "get_object") as mock_get_object:
        mock_get_object.return_value = {'Body': io.BytesIO(bytes(non_json_string, encoding='utf-8'))}
        assert util.get_key() == non_json_string
    with mock.patch.object(util.s3, "get_object") as mock_get_object:
        mock_get_object.return_value = {'Body': io.StringIO(non_json_string)}
        assert util.get_key() == non_json_string

def __init__(self, bucket, key, accession=None, filesize=None, md5=None, file_format=None):
    self.bucket = bucket
    self.key = key
    self.s3 = s3Utils(self.bucket, self.bucket, self.bucket)
    self.accession = accession
    self.filesize = filesize
    self.md5 = md5
    self.file_format = file_format

def test_prd(ff_production_envname):
    util = s3Utils(env=ff_production_envname)
    actual_props = {
        'sys_bucket': util.sys_bucket,
        'outfile_bucket': util.outfile_bucket,
        'raw_file_bucket': util.raw_file_bucket,
        'url': util.url,
    }
    assert actual_props == {
        'sys_bucket': 'elasticbeanstalk-fourfront-webprod-system',
        'outfile_bucket': 'elasticbeanstalk-fourfront-webprod-wfoutput',
        'raw_file_bucket': 'elasticbeanstalk-fourfront-webprod-files',
        'url': FF_PUBLIC_URL_PRD,
    }

def test_prd(cgap_production_envname):
    util = s3Utils(env=cgap_production_envname)
    actual_props = {
        'sys_bucket': util.sys_bucket,
        'outfile_bucket': util.outfile_bucket,
        'raw_file_bucket': util.raw_file_bucket,
        'url': util.url,
    }
    assert actual_props == {
        'sys_bucket': 'elasticbeanstalk-fourfront-cgap-system',
        'outfile_bucket': 'elasticbeanstalk-fourfront-cgap-wfoutput',
        'raw_file_bucket': 'elasticbeanstalk-fourfront-cgap-files',
        'url': _CGAP_MGB_PUBLIC_URL_PRD,
    }

def __init__(self, bucket, key, runner, accession=None, output_type=None, filesize=None, md5=None):
    self.bucket = bucket
    self.key = key
    self.s3 = s3_utils.s3Utils(self.bucket, self.bucket, self.bucket)
    self.runner = runner
    self.accession = accession
    self.output_type = output_type
    self.filesize = filesize
    self.md5 = md5

def test_s3_utils_buckets_modern():
    env_name = 'fourfront-cgapfoo'
    es_server_short = "some-es-server.com:443"
    es_server_https = "https://some-es-server.com:443"
    with mock.patch("boto3.client"):
        with mock.patch.object(s3_utils_module.EnvManager, "fetch_health_page_json") as mock_fetch:
            mock_fetch.return_value = {
                "elasticsearch": es_server_short,
                "system_bucket": "the-system-bucket",
                "processed_file_bucket": "the-output-file-bucket",
                "file_upload_bucket": "the-raw-file-bucket",
                "blob-bucket": "the-blob-bucket",
                "metadata_bundles_bucket": "the-metadata-bundles-bucket",
                "tibanna_cwls_bucket": "the-tibanna-cwls-bucket",
                "tibanna_output_bucket": "the-tibanna-output-bucket",
                "s3_encrypt_key_id": "my-encrypt-key",
            }
            s = s3Utils(env=env_name)
            assert s.outfile_bucket != 'the-output-file-bucket'
            assert s.sys_bucket != 'the-system-bucket'
            assert s.raw_file_bucket != 'the-raw-file-bucket'
            assert s.blob_bucket != 'the-blob-bucket'
            assert s.metadata_bucket != 'the-metadata-bundles-bucket'
            assert s.tibanna_cwls_bucket != 'the-tibanna-cwls-bucket'
            assert s.tibanna_output_bucket != 'the-tibanna-output-bucket'

            assert s.outfile_bucket == 'elasticbeanstalk-fourfront-cgapfoo-wfoutput'
            assert s.sys_bucket == 'elasticbeanstalk-fourfront-cgapfoo-system'
            assert s.raw_file_bucket == 'elasticbeanstalk-fourfront-cgapfoo-files'
            assert s.blob_bucket == 'elasticbeanstalk-fourfront-cgapfoo-blobs'
            assert s.metadata_bucket == 'elasticbeanstalk-fourfront-cgapfoo-metadata-bundles'
            assert s.tibanna_cwls_bucket == 'tibanna-cwls'
            assert s.tibanna_output_bucket == 'tibanna-output'
            assert s.s3_encrypt_key_id == 'my-encrypt-key'

            e = s.env_manager
            assert e.s3 == s.s3
            # This mock is not elaborate enough for testing how e.portal_url is set up.
            # assert e.portal_url = ...
            assert e.es_url == es_server_https
            assert e.env_name == env_name

def __init__(self, ff_access_keys=None, google_api_key=None, s3UtilsInstance=None,
             extra_config=DEFAULT_GOOGLE_API_CONFIG):
    """Authenticate with Google APIs and initialize sub-class instances."""
    if s3UtilsInstance is None:
        self._s3Utils = s3_utils.s3Utils(env='data')  # Google API Keys are stored on production bucket only ATM.
    else:
        self._s3Utils = s3UtilsInstance

    if google_api_key is None:
        self._api_key = self._s3Utils.get_google_key()
        if not self._api_key:
            raise Exception("Failed to get Google API key from S3.")
    else:
        self._api_key = google_api_key

    if not GoogleAPISyncer.validate_api_key_format(self._api_key):
        raise Exception("Google API Key is in invalid format.")

    self.extra_config = extra_config
    self.credentials = Credentials.from_service_account_info(
        self._api_key,
        scopes=self.extra_config.get('scopes', DEFAULT_GOOGLE_API_CONFIG['scopes'])
    )

    # These are required only for POSTing/GETing data for TrackingInfo items.
    if ff_access_keys is None:
        ff_access_keys = self._s3Utils.get_access_keys()

    self.server = ff_access_keys['server']
    self.access_key = {
        "key": ff_access_keys['key'],
        "secret": ff_access_keys['secret']
    }

    # Init sub-class objects
    self.analytics = GoogleAPISyncer.AnalyticsAPI(self)
    self.sheets = GoogleAPISyncer.SheetsAPI(self)
    self.docs = GoogleAPISyncer.DocsAPI(self)

def handler(event, context):
    """ Somewhere in the event data there should be a jobid. """
    RESPONSE_JSON_CONTENT_INCLUSION_LIMIT = 30000  # strictly it is 32,768 but just to be safe.

    # s3 bucket that stores the output
    bucket_name = event['config']['log_bucket']
    s3 = s3_utils.s3Utils(bucket_name, bucket_name, bucket_name)

    # info about the jobby job
    jobid = event['jobid']
    job_started = "%s.job_started" % jobid
    job_success = "%s.success" % jobid
    job_error = "%s.error" % jobid
    job_log = "%s.log" % jobid
    postrunjson = "%s.postrun.json" % jobid
    job_log_location = "https://s3.amazonaws.com/%s/%s" % (bucket_name, job_log)
    postrunjson_location = "https://s3.amazonaws.com/%s/%s" % (bucket_name, postrunjson)

    # check that this job has started; fail if not
    if not s3.does_key_exist(job_started):
        raise EC2StartingException("Failed to find jobid %s, ec2 is probably still booting" % jobid)

    # check whether the job had an error; report if so
    if s3.does_key_exist(job_error):
        raise AWSEMJobErrorException("Job encountered an error; check log at %s" % job_log_location)

    # check whether the job has completed; if not, throw a retry error
    if s3.does_key_exist(job_success):
        if not s3.does_key_exist(postrunjson):
            raise Exception("Postrun json not found at %s" % postrunjson_location)
        postrunjsoncontent = json.loads(s3.read_s3(postrunjson))
        if len(str(postrunjsoncontent)) + len(str(event)) < RESPONSE_JSON_CONTENT_INCLUSION_LIMIT:
            event['postrunjson'] = postrunjsoncontent
        else:
            event['postrunjson'] = {'log': 'postrun json not included due to data size limit',
                                    'Job': {'Output': postrunjsoncontent['Job']['Output']}}
        print("completed successfully")
        return event
    else:
        raise StillRunningException("job %s still running" % jobid)

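# Hedged usage sketch (illustrative event, not from the source): the Lambda-style handler above
# expects the job id and the log bucket inside the incoming event; the field values below are
# hypothetical. On success it returns the event (possibly with 'postrunjson' attached); while
# the job is still running it raises StillRunningException, which a retry/polling loop would
# catch and try again later.
#
# event = {'jobid': 'a1b2c3d4e5f6', 'config': {'log_bucket': 'tibanna-output'}}
# try:
#     event = handler(event, context=None)
# except StillRunningException:
#     pass  # poll again on the next invocation
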
def BUCKET_NAME(env, filetype):
    global _BUCKET_NAME_PROCESSED_FILES
    global _BUCKET_NAME_RAW_FILES
    global _BUCKET_NAME_SYS
    global _BUCKET_NAME_LOG
    global _BUCKET_NAME_CWL

    # use cache
    if filetype == 'FileProcessed' and env in _BUCKET_NAME_PROCESSED_FILES:
        return _BUCKET_NAME_PROCESSED_FILES[env]
    if filetype in ['FileFastq', 'FileReference', 'FileMicroscopy'] and env in _BUCKET_NAME_RAW_FILES:
        return _BUCKET_NAME_RAW_FILES[env]
    if filetype == 'system' and env in _BUCKET_NAME_SYS:  # system bucket
        return _BUCKET_NAME_SYS[env]
    if filetype == 'log' and env in _BUCKET_NAME_LOG:  # log bucket
        return _BUCKET_NAME_LOG[env]
    if filetype == 'cwl' and env in _BUCKET_NAME_CWL:
        return _BUCKET_NAME_CWL[env]

    # no cache
    if filetype == 'log' and AWS_ACCOUNT_NUMBER == '643366669028':  # 4dn-dcic account
        _BUCKET_NAME_LOG[env] = 'tibanna-output'
    else:
        s3 = s3Utils(env=env)
        _BUCKET_NAME_PROCESSED_FILES[env] = s3.outfile_bucket
        _BUCKET_NAME_RAW_FILES[env] = s3.raw_file_bucket
        _BUCKET_NAME_SYS[env] = s3.sys_bucket
        _BUCKET_NAME_LOG[env] = s3.tibanna_output_bucket
        _BUCKET_NAME_CWL[env] = s3.tibanna_cwls_bucket

    if filetype == 'FileProcessed':
        return _BUCKET_NAME_PROCESSED_FILES[env]
    elif filetype in ['FileFastq', 'FileReference', 'FileMicroscopy']:
        return _BUCKET_NAME_RAW_FILES[env]
    elif filetype == 'system':
        return _BUCKET_NAME_SYS[env]
    elif filetype == 'cwl':
        return _BUCKET_NAME_CWL[env]
    else:  # log
        return _BUCKET_NAME_LOG[env]

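# Hedged usage sketch (the env name is illustrative, not from the source): the first call for a
# given env triggers one s3Utils lookup and fills all of the module-level caches above;
# subsequent calls for the same env are answered from those dicts without touching S3.
#
# out_bucket = BUCKET_NAME('fourfront-webdev', 'FileProcessed')   # populates the caches
# raw_bucket = BUCKET_NAME('fourfront-webdev', 'FileFastq')       # served from cache
# log_bucket = BUCKET_NAME('fourfront-webdev', 'log')             # served from cache
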
def integrated_ff():
    """ Object that contains keys and ff_env for integrated environment """
    integrated = {}
    s3 = s3Utils(env=INTEGRATED_ENV)
    integrated['ff_key'] = s3.get_access_keys()
    integrated['higlass_key'] = s3.get_higlass_key()
    integrated['ff_env'] = INTEGRATED_ENV
    integrated['es_url'] = INTEGRATED_ES
    # do this to make sure env is up (will error if not)
    res = authorized_request(integrated['ff_key']['server'],  # noQA - PyCharm fears the ['server'] part won't be there.
                             auth=integrated['ff_key'])
    if res.status_code != 200:
        raise Exception('Environment %s is not ready for integrated status. Requesting '
                        'the homepage gave status of: %s' % (INTEGRATED_ENV, res.status_code))
    return integrated

def __init__(self, bucket, key, runner, argument_type=None, filesize=None, md5=None,
             format_if_extra=None, is_extra=False):
    self.bucket = bucket
    self.key = key
    self.s3 = s3Utils(self.bucket, self.bucket, self.bucket)
    self.runner = runner
    self.argument_type = argument_type
    self.filesize = filesize
    self.md5 = md5
    self.format_if_extra = format_if_extra
    if self.format_if_extra or is_extra:
        self.is_extra = True
    else:
        self.is_extra = False

def s3(check_task_input):
    bucket_name = check_task_input['config']['log_bucket']
    return s3_utils.s3Utils(bucket_name, bucket_name, bucket_name)

def s3_utils(used_env):
    return s3Utils(env=used_env)