def test_it_assumes_role_for_session_where_given(mock_sts, mock_boto):
    mock_sts.assume_role.return_value = {
        "Credentials": {
            "AccessKeyId": "a",
            "SecretAccessKey": "b",
            "SessionToken": "c",
        }
    }
    get_session(assume_role_arn="arn:aws:iam:accountid::role/rolename")
    mock_sts.assume_role.assert_called_with(
        RoleArn="arn:aws:iam:accountid::role/rolename", RoleSessionName=ANY
    )
    mock_boto.session.Session.assert_called_with(
        aws_access_key_id="a",
        aws_secret_access_key="b",
        aws_session_token="c",
    )
def _aggregate_stats(events):
    # Aggregate per-event counters (queries and object updates) into a single
    # Counter of job-level statistics
    stats = Counter({})
    for event in events:
        event_name = event["EventName"]
        event_data = event.get("EventData", {})
        if event_name in ["QuerySucceeded", "QueryFailed"]:
            stats += Counter({
                "TotalQueryCount": 1,
                "TotalQuerySucceededCount": 1 if event_name == "QuerySucceeded" else 0,
                "TotalQueryFailedCount": 1 if event_name == "QueryFailed" else 0,
                "TotalQueryScannedInBytes": event_data.get("Statistics", {}).get("DataScannedInBytes", 0),
                "TotalQueryTimeInMillis": event_data.get("Statistics", {}).get("EngineExecutionTimeInMillis", 0),
            })
        if event_name in ["ObjectUpdated", "ObjectUpdateFailed", "ObjectRollbackFailed"]:
            stats += Counter({
                "TotalObjectUpdatedCount": 1 if event_name == "ObjectUpdated" else 0,
                "TotalObjectUpdateFailedCount": 1 if event_name == "ObjectUpdateFailed" else 0,
                "TotalObjectRollbackFailedCount": 1 if event_name == "ObjectRollbackFailed" else 0,
            })
        if event_name == "ObjectUpdateFailed":
            # TODO: this can only work when the DeleteOldVersions flag is True for all involved data mappers
            try:
                data_access_role_arn = event_data["Message"].get(
                    "RoleArn", getenv("DataAccessRoleArn"))
                session = get_session(data_access_role_arn)
                client = session.client("s3")
                s3_object = event_data["Message"]["Object"]
                input_bucket, input_key = parse_s3_url(s3_object)
                revert_last(client, input_bucket, input_key)
            except Exception:
                logger.exception("Unable to revert last")
            try:
                purge_queue(deletion_queue_url)
            except Exception:
                logger.exception("Unable to purge queue")
    return stats
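# A minimal usage sketch for _aggregate_stats. The event payloads below are
# hypothetical examples shaped like the query/object events handled above;
# they are illustrative only and not taken from a real job history.
def _example_aggregate_stats_usage():
    events = [
        {
            "EventName": "QuerySucceeded",
            "EventData": {
                "Statistics": {
                    "DataScannedInBytes": 1024,
                    "EngineExecutionTimeInMillis": 250,
                }
            },
        },
        {"EventName": "ObjectUpdated", "EventData": {}},
    ]
    stats = _aggregate_stats(events)
    assert stats["TotalQueryCount"] == 1
    assert stats["TotalQuerySucceededCount"] == 1
    assert stats["TotalObjectUpdatedCount"] == 1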
def execute(queue_url, message_body, receipt_handle):
    logger.info("Message received")
    queue = get_queue(queue_url)
    msg = queue.Message(receipt_handle)
    try:
        # Parse and validate incoming message
        validate_message(message_body)
        body = json.loads(message_body)
        session = get_session(body.get("RoleArn"))
        client = session.client("s3")
        cols, object_path, job_id = itemgetter("Columns", "Object", "JobId")(body)
        input_bucket, input_key = parse_s3_url(object_path)
        validate_bucket_versioning(client, input_bucket)
        creds = session.get_credentials().get_frozen_credentials()
        s3 = s3fs.S3FileSystem(
            key=creds.access_key,
            secret=creds.secret_key,
            token=creds.token,
            default_cache_type="none",
            requester_pays=True,
            default_fill_cache=False,
            version_aware=True,
        )
        # Download the object in-memory and convert to PyArrow NativeFile
        logger.info("Downloading and opening %s object in-memory", object_path)
        with s3.open(object_path, "rb") as f:
            source_version = f.version_id
            logger.info("Using object version %s as source", source_version)
            infile = load_parquet(f)
            # Write new file in-memory
            logger.info("Generating new parquet file without matches")
            out_sink, stats = delete_matches_from_file(infile, cols)
        if stats["DeletedRows"] == 0:
            raise ValueError(
                "The object {} was processed successfully but no rows required deletion"
                .format(object_path))
        with pa.BufferReader(out_sink.getvalue()) as output_buf:
            new_version = save(s3, client, output_buf, input_bucket, input_key,
                               source_version)
        logger.info("New object version: %s", new_version)
        verify_object_versions_integrity(client, input_bucket, input_key,
                                         source_version, new_version)
        if body.get("DeleteOldVersions"):
            logger.info(
                "Deleting object {} versions older than version {}".format(
                    input_key, new_version))
            delete_old_versions(client, input_bucket, input_key, new_version)
        msg.delete()
        emit_deletion_event(body, stats)
    except (KeyError, ArrowException) as e:
        err_message = "Parquet processing error: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except IOError as e:
        err_message = "Unable to retrieve object: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except MemoryError as e:
        err_message = "Insufficient memory to work on object: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except ClientError as e:
        err_message = "ClientError: {}".format(str(e))
        if e.operation_name == "PutObjectAcl":
            err_message += ". Redacted object uploaded successfully but unable to restore WRITE ACL"
        if e.operation_name == "ListObjectVersions":
            err_message += ". Could not verify redacted object version integrity"
        handle_error(msg, message_body, err_message)
    except ValueError as e:
        err_message = "Unprocessable message: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except DeleteOldVersionsError as e:
        err_message = "Unable to delete previous versions: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except IntegrityCheckFailedError as e:
        err_description, client, bucket, key, version_id = e.args
        err_message = "Object version integrity check failed: {}".format(err_description)
        handle_error(msg, message_body, err_message)
        rollback_object_version(
            client,
            bucket,
            key,
            version_id,
            on_error=lambda err: handle_error(None, "{}", err, "ObjectRollbackFailed", False),
        )
    except Exception as e:
        err_message = "Unknown error during message processing: {}".format(str(e))
        handle_error(msg, message_body, err_message)
def test_it_returns_default_session(mock_sts):
    resp = get_session()
    mock_sts.assume_role.assert_not_called()
    assert isinstance(resp, Session)
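# A minimal get_session sketch consistent with the two tests above (illustrative
# only, not necessarily the project's actual implementation): with no role ARN it
# returns a default boto3 Session; otherwise it calls STS AssumeRole and builds a
# Session from the temporary credentials. The default session name is hypothetical.
import boto3


def get_session_sketch(assume_role_arn=None, role_session_name="example-session"):
    if not assume_role_arn:
        return boto3.session.Session()
    sts = boto3.client("sts")
    resp = sts.assume_role(RoleArn=assume_role_arn, RoleSessionName=role_session_name)
    creds = resp["Credentials"]
    return boto3.session.Session(
        aws_access_key_id=creds["AccessKeyId"],
        aws_secret_access_key=creds["SecretAccessKey"],
        aws_session_token=creds["SessionToken"],
    )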
import boto3
import boto_utils
from aws_requests_auth.aws_auth import AWSRequestsAuth
from elasticsearch import Elasticsearch, RequestsHttpConnection
import configparser

config = configparser.ConfigParser()
config.read("../../config.ini")
aws_config = config["AWS"]

session = boto_utils.get_session()
credentials = session.get_credentials().get_frozen_credentials()
awsauth = AWSRequestsAuth(
    aws_access_key=credentials.access_key,
    aws_secret_access_key=credentials.secret_key,
    aws_token=credentials.token,
    aws_host=aws_config["es_host"],
    aws_region=session.region_name,
    aws_service="es",
)

# use the requests connection_class and pass in our custom auth class
es = Elasticsearch(
    hosts=[{"host": aws_config["es_host"], "port": 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)
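# A small usage sketch of the authenticated client above, kept commented out so the
# setup script has no side effects. The "jobs" index name and query are hypothetical
# examples, not part of the original configuration:
#
#     resp = es.search(index="jobs", body={"query": {"match_all": {}}, "size": 5})
#     for hit in resp["hits"]["hits"]:
#         print(hit["_id"], hit["_source"])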