def _start_retries(self, retry_data, obj, recovering=False):
    """
    Saves the current payload, with modified retry information, to DynamoDB
    so that a query can pick up the items, and re-execute the payload at a
    future point.

    :param retry_data: a dict like {'system_context': {...}, 'user_context': {...}}
    :param obj: a dict
    """
    retry_system_context = retry_data[PAYLOAD.SYSTEM_CONTEXT]
    serialized = json.dumps(retry_data, **json_dumps_additional_kwargs())

    for primary in [True, False]:
        try:
            # save the retry entity
            # https://www.awsarchitectureblog.com/2015/03/backoff.html
            # "full jitter"
            cap, base, attempt = 60., 1., retry_system_context[SYSTEM_CONTEXT.RETRIES]
            sleep = random.uniform(0, min(cap, base * 2 ** attempt))
            return start_retries(self,
                                 time.time() + sleep,
                                 serialized,
                                 primary=primary,
                                 recovering=recovering)

        except ClientError:
            # log an error to at least expose the error
            self._queue_error(
                ERRORS.ERROR,
                'Unable to save last payload for retry (primary=%s).' % primary,
                exc_info=True)
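# Illustration only, not part of the library: a minimal standalone sketch of the
# "full jitter" backoff referenced in the comments above, showing how the sleep
# window grows with the retry count before the cap takes over.
import random

cap, base = 60., 1.
for attempt in range(8):
    upper = min(cap, base * 2 ** attempt)          # doubles each attempt: 1, 2, 4, 8, ...
    sleep = random.uniform(0, upper)               # actual sleep is drawn uniformly from [0, upper]
    print('attempt=%d upper=%.1f sleep=%.2f' % (attempt, upper, sleep))
# from attempt 6 onward the upper bound is pinned at the 60 second cap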
def _store_checkpoint(self, obj):
    """
    Saves the last response from Context._send_next_event_for_dispatch
    so that a terminated machine can be started back up using the saved
    information.

    :param obj: a dict.
    """
    # save the last successful dispatch to aws. on kinesis, the sent data looks like
    # {u'ShardId': u'shardId-000000000000', u'SequenceNumber': u'49559000...18786'} and thus
    # has sufficient information to go and seek the record directly from kinesis, and to
    # restart the fsm using the saved state.
    if obj.get(OBJ.SENT):

        for primary in [True, False]:
            try:
                return store_checkpoint(
                    self,
                    json.dumps(obj[OBJ.SENT], **json_dumps_additional_kwargs()),
                    primary=primary)

            except ClientError:
                # if unable to save the last sent message, then recovery/checkpointing
                # will be missing the most recently executed state. recovering may be
                # complicated, especially since the last transition has been marked as
                # successfully dispatched
                self._queue_error(
                    ERRORS.ERROR,
                    'Unable to save last sent data (primary=%s).' % primary,
                    exc_info=True)
def get_sns_record(self):
    return {
        'eventSource': 'aws:sns',
        'Sns': {
            'Message': json.dumps({"mess": "age"}, **json_dumps_additional_kwargs())
        }
    }
def get_kinesis_record(self):
    return {
        'eventSource': 'aws:kinesis',
        'kinesis': {
            'data': base64.b64encode(
                json.dumps({'machine_name': 'barfoo'},
                           **json_dumps_additional_kwargs()).encode('utf-8'))
        }
    }
def lambda_step_handler(lambda_event, lambda_context):
    """
    AWS Lambda handler for executing state machines.

    :param lambda_event: a dict event from AWS Lambda
    :param lambda_context: a dict context from AWS Lambda
    :return: a dict event to pass along to AWS Step Functions orchestration
    """
    obj = {OBJ.SOURCE: AWS.STEP_FUNCTION, OBJ.LAMBDA_CONTEXT: lambda_context}
    payload = json.dumps(lambda_event,
                         **json_dumps_additional_kwargs())  # Step Functions just passes it straight through
    return _process_payload_step(payload, obj)
def search_for_machine(filename='fsm.yaml'):
    """
    Searches the .yaml hierarchy for the correct machine.

    :param filename: a path to a fsm.yaml file
    :return:
    """
    for machine_dict in get_current_configuration(filename=filename)[CONFIG.MACHINES]:

        if CONFIG.IMPORT in machine_dict:
            search_for_machine(filename=machine_dict[CONFIG.IMPORT])
            continue

        if machine_dict[CONFIG.NAME] == args.machine_name:
            data = output_machine_dict(machine_dict)
            print(json.dumps(data, indent=2, **json_dumps_additional_kwargs()))
            return
def lambda_api_handler(lambda_event, lambda_context):
    """
    AWS Lambda handler for executing state machines.

    :param lambda_event: a dict event from AWS Lambda
    :param lambda_context: a dict context from AWS Lambda
    """
    try:
        obj = {OBJ.SOURCE: AWS.GATEWAY, OBJ.LAMBDA_CONTEXT: lambda_context}
        payload = json.dumps(lambda_event,
                             **json_dumps_additional_kwargs())  # API Gateway just passes it straight through
        _process_payload(payload, obj)

    # in batch mode, we don't want a single error to cause the entire batch
    # to retry. for that reason, we have opted to gobble all the errors here
    # and handle retries within the fsm dispatch code.
    except Exception:
        lambda_event = AWS_LAMBDA.REDACTED
        logger.exception('Critical error handling lambda: %s', lambda_event)
def _dispatch_to_current_state(self, event, obj):
    """
    Dispatches the event to the current state, then sends the next event
    onto Kinesis/DynamoDB for subsequent processing.

    :param event: a str event.
    :param obj: a dict.
    """
    # dispatch the event using the user context only
    next_event = self.current_state.dispatch(self, event, obj)

    # dispatch local transitions without enqueueing more messages
    while next_event \
            and self.current_state \
            and self.current_state.get_transition(next_event) \
            and self.current_state.get_transition(next_event).local:
        next_event = self.current_state.dispatch(self, next_event, obj)

    # if there are more events
    if next_event:

        # make a full copy
        ctx = Context.from_payload_dict(self.to_payload_dict())
        ctx.steps += 1
        ctx.retries = 0
        ctx.current_event = next_event
        serialized = json.dumps(ctx.to_payload_dict(), **json_dumps_additional_kwargs())

        # dispatch the next event to aws kinesis/dynamodb
        sent = self._send_next_event_for_dispatch(serialized, obj)

        # things are falling off the rails
        if not sent:
            self._queue_error(
                ERRORS.DISPATCH,
                'System error during dispatch. Failover to retry stream.')
            sent = self._send_next_event_for_dispatch(serialized, obj, recovering=True)

        obj[OBJ.SENT] = sent
def start_state_machines(machine_name, user_contexts, correlation_ids=None,
                         current_state=STATE.PSEUDO_INIT, current_event=STATE.PSEUDO_INIT,
                         additional_delay_seconds=0):
    """
    Insert a bulk AWS SQS/Kinesis/SNS/DynamoDB/... message that will kick off
    several state machines.

    :param machine_name: a str name for the machine to start.
    :param user_contexts: a list of dicts of initial data for the state machines.
    :param correlation_ids: a list of guids for the fsms, or a list of Nones if the
      system should define them automatically.
    :param current_state: the state to start the machines in.
    :param current_event: the event to start the machines with.
    :param additional_delay_seconds: number of seconds to insert between state transitions
      (for streams that support delay)
    """
    all_data = []
    correlation_ids = correlation_ids or [uuid.uuid4().hex for i in range(len(user_contexts))]

    for i, user_context in enumerate(user_contexts):
        correlation_id = correlation_ids[i]
        started_at = int(time.time())
        system_context = {
            SYSTEM_CONTEXT.STARTED_AT: started_at,
            SYSTEM_CONTEXT.MACHINE_NAME: machine_name,
            SYSTEM_CONTEXT.CURRENT_STATE: current_state,
            SYSTEM_CONTEXT.CURRENT_EVENT: current_event,
            SYSTEM_CONTEXT.STEPS: 0,
            SYSTEM_CONTEXT.RETRIES: 0,
            SYSTEM_CONTEXT.CORRELATION_ID: correlation_id,
            SYSTEM_CONTEXT.ADDITIONAL_DELAY_SECONDS: additional_delay_seconds
        }
        payload = {
            PAYLOAD.VERSION: PAYLOAD.DEFAULT_VERSION,
            PAYLOAD.SYSTEM_CONTEXT: system_context,
            PAYLOAD.USER_CONTEXT: user_context
        }
        all_data.append(json.dumps(payload, **json_dumps_additional_kwargs()))

    send_next_events_for_dispatch(None, all_data, correlation_ids)
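# Usage sketch for the bulk API (illustrative only; the machine name, contexts and
# correlation ids below are hypothetical, not taken from the library's docs).
start_state_machines('tracer',
                     [{'customer_id': 1}, {'customer_id': 2}],
                     correlation_ids=['corr-1', 'corr-2'])  # omit to auto-generate guids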
def test_process_payload_step(self, mock_FSM):
    payload = json.dumps(
        {
            'system_context': {
                'machine_name': 'barfoo',
                'current_state': 'foobar',
                'stream': 's',
                'table': 't',
                'topic': 'z',
                'metrics': 'm'
            },
            'user_context': {}
        },
        **json_dumps_additional_kwargs())
    obj = {}
    mock_FSM.return_value.create_FSM_instance.return_value\
        .system_context.return_value.get.return_value = 'pseudo-init'
    _process_payload_step(payload, obj)
    mock_FSM.return_value.create_FSM_instance.assert_called_with(
        'barfoo',
        initial_system_context={
            'topic': 'z',
            'machine_name': 'barfoo',
            'stream': 's',
            'current_state': 'foobar',
            'metrics': 'm',
            'table': 't'
        },
        initial_user_context={},
        initial_state_name='foobar')
    mock_FSM.return_value.create_FSM_instance.return_value.current_state.dispatch.assert_called_with(
        mock_FSM.return_value.create_FSM_instance.return_value,
        'pseudo-init',
        {
            'payload': '{"system_context": {"current_state": "foobar", "machine_name": '
                       '"barfoo", "metrics": "m", "stream": "s", "table": "t", '
                       '"topic": "z"}, "user_context": {}}'
        })
    self.assertEqual({'payload': payload}, obj)
def start_state_machine(machine_name, initial_context, correlation_id=None,
                        current_state=STATE.PSEUDO_INIT, current_event=STATE.PSEUDO_INIT,
                        additional_delay_seconds=0):
    """
    Insert an AWS SQS/Kinesis/SNS/DynamoDB/... message that will kick off a state machine.

    :param machine_name: a str name for the machine to start.
    :param initial_context: a dict of initial data for the state machine.
    :param correlation_id: the guid for the fsm, or None if the system should
      define it automatically.
    :param current_state: the state to start the machine in.
    :param current_event: the event to start the machine with.
    :param additional_delay_seconds: number of seconds to insert between state transitions
      (for streams that support delay)
    """
    correlation_id = correlation_id or uuid.uuid4().hex
    system_context = {
        SYSTEM_CONTEXT.STARTED_AT: int(time.time()),
        SYSTEM_CONTEXT.MACHINE_NAME: machine_name,
        SYSTEM_CONTEXT.CURRENT_STATE: current_state,
        SYSTEM_CONTEXT.CURRENT_EVENT: current_event,
        SYSTEM_CONTEXT.STEPS: 0,
        SYSTEM_CONTEXT.RETRIES: 0,
        SYSTEM_CONTEXT.CORRELATION_ID: correlation_id,
        SYSTEM_CONTEXT.ADDITIONAL_DELAY_SECONDS: additional_delay_seconds
    }
    payload = {
        PAYLOAD.VERSION: PAYLOAD.DEFAULT_VERSION,
        PAYLOAD.SYSTEM_CONTEXT: system_context,
        PAYLOAD.USER_CONTEXT: initial_context
    }
    send_next_event_for_dispatch(
        None,
        json.dumps(payload, **json_dumps_additional_kwargs()),
        correlation_id)
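# Usage sketch for the single-machine API (illustrative only). The machine name and
# context keys are hypothetical, and the import path assumes the client module layout
# of aws-lambda-fsm.
from aws_lambda_fsm.client import start_state_machine

start_state_machine('tracer', {'customer_id': 123})  # starts one machine in the pseudo-init state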
try:
    client.start(container=container)
    stdout = client.logs(container, stdout=True, stream=True)
    for line in stdout:
        sys.stdout.write(line)
    stderr = client.logs(container, stderr=True, stream=True)
    for line in stderr:
        sys.stderr.write(line)
    return_code = client.wait(container)

except Exception:
    logging.exception('')
    raise

finally:

    if not environment:
        sys.stderr.write(FATAL_ENVIRONMENT_ERROR)
        sys.exit(1)

    # FSM_CONTEXT is the environment variable used by aws_lambda_fsm.utils.ECSTaskEntryAction
    event = DONE_EVENT if return_code == 0 else FAIL_EVENT
    payload_encoded = environment[ENVIRONMENT.FSM_CONTEXT]
    payload = json.loads(base64.b64decode(payload_encoded), **json_loads_additional_kwargs())
    payload[PAYLOAD.SYSTEM_CONTEXT][SYSTEM_CONTEXT.CURRENT_EVENT] = event
    serialized = json.dumps(payload, **json_dumps_additional_kwargs())

    send_next_event_for_dispatch(
        None,
        serialized,
        payload[PAYLOAD.SYSTEM_CONTEXT][SYSTEM_CONTEXT.CORRELATION_ID]
    )
def test_json_dumps_additional_kwargs_using_settings(self, mock_settings):
    mock_settings.JSON_DUMPS_ADDITIONAL_KWARGS = {'default': lambda x: "foobar"}
    self.assertEquals({'sort_keys', 'default'},
                      set(json_dumps_additional_kwargs().keys()))
    self.assertEquals("foobar",
                      json_dumps_additional_kwargs()['default']('~~~'))
def get_sqs_record(self):
    return {
        'eventSource': 'aws:sqs',
        'body': json.dumps({"mess": "age"}, **json_dumps_additional_kwargs())
    }
def increment_error_counters(self, data, dimensions):
    self.errors.send(
        json.dumps((data, dimensions), **json_dumps_additional_kwargs()))
    return {'test': 'stub'}
# create the lambda event
lambda_event = {AWS_LAMBDA.Records: []}

# populate the lambda event
for sqs_message in sqs_messages:
    body = sqs_message[AWS_SQS.MESSAGE.Body]
    tmp = {
        AWS_LAMBDA.EventSource: AWS_LAMBDA.EVENT_SOURCE.SQS,
        AWS_LAMBDA.SQS_RECORD.BODY: body
    }
    lambda_event[AWS_LAMBDA.Records].append(tmp)

# and call the handler with the records
if args.lambda_command:
    serialized = json.dumps(lambda_event, **json_dumps_additional_kwargs())
    quoted = shellquote(serialized)
    subprocess.call(['/bin/bash', '-c', args.lambda_command + " " + quoted])
else:
    lambda_handler(lambda_event, lambda_context)

# after processing, the SQS messages need to be deleted
response = sqs_conn.delete_message_batch(
    QueueUrl=sqs_queue_url,
    Entries=[
        {
            AWS_SQS.MESSAGE.Id: str(i),
            AWS_SQS.MESSAGE.ReceiptHandle: sqs_message[AWS_SQS.MESSAGE.ReceiptHandle]
        }
        for i, sqs_message in enumerate(sqs_messages)
    ])
def test_json_dumps_additional_kwargs_defaults(self):
    self.assertEquals({'sort_keys', 'default'},
                      set(json_dumps_additional_kwargs().keys()))
    self.assertEquals("<not_serializable>",
                      json_dumps_additional_kwargs()['default']('~~~'))
def execute(self, context, obj):
    """
    Action that launches an ECS task. The API for using this class is as follows:

    {
      'context_var': 'context_value',                 # normal context variable
      'task_details': {                               # dictionary of all the states that run images
        'state_name_1': {                             # first state name (as in fsm.yaml)
          # cluster to run image for state_name_1
          'cluster_arn': 'arn:aws:ecs:region:1234567890:cluster/foobar',
          'container_image': 'host/corp/image:12345'  # image for state_name_1
        },
        'state_name_2': {                             # second state name (as in fsm.yaml)
          'cluster_arn': 'arn:aws:ecs:eu-west-1:1234567890:cluster/foobar',
          'container_image': 'host/corp/image:12345',
          'runner_task_definition': 'my_runner',      # alternative docker image runner task name
          'runner_container_name': 'my_runner'        # alternative docker image runner container name
        }
      },
      'clone_aws_credentials': True                   # flag to copy aws creds from local environment
                                                      # to the container overrides - makes for easier
                                                      # local testing. alternatively, just add permanent
                                                      # credentials to your runner task.
    }

    :param context: an aws_lambda_fsm.fsm.Context instance
    :param obj: a dict
    :return: a string event, or None
    """
    # construct a version of the context that can be base64 encoded
    # and stuffed into an environment variable for the container program.
    # all the container program needs to do is extract this data, add
    # an event, and send the message onto sqs/kinesis/... since this is an
    # ENTRY action, we inspect the current transition for the state we
    # will be in AFTER this code executes.
    ctx = Context.from_payload_dict(context.to_payload_dict())
    ctx.current_state = context.current_transition.target
    ctx.steps += 1
    fsm_context = base64.b64encode(
        json.dumps(ctx.to_payload_dict(), **json_dumps_additional_kwargs()))

    # now finally launch the ECS task using all the data from above
    # as well as tasks etc. specified when the state machine was run.
    state_to_task_details_map = context[TASK_DETAILS_KEY]
    task_details = state_to_task_details_map[context.current_transition.target.name]

    # this is the image the user wants to run
    cluster_arn = task_details[CLUSTER_ARN_KEY]
    container_image = task_details[CONTAINER_IMAGE_KEY]

    # this is the task that will run that image
    task_definition = task_details.get(RUNNER_TASK_DEFINITION_KEY, DEFAULT_RUNNER_TASK_NAME)
    container_name = task_details.get(RUNNER_CONTAINER_NAME_KEY, DEFAULT_RUNNER_CONTAINER_NAME)

    # setup the environment for the ECS task. this first set of variables
    # is used by the docker container runner image.
    environment = {
        ENVIRONMENT.FSM_CONTEXT: fsm_context,
        ENVIRONMENT.FSM_DOCKER_IMAGE: container_image
    }

    # this second set of variables is used by the actual docker image that
    # does actual stuff (pdf processing etc.)
    for name, value in task_details.get(ENVIRONMENT_KEY, {}).items():
        environment[name] = value

    # store the environment and record the guid.
    guid, _ = store_environment(context, environment)

    # stuff the guid and a couple of stream settings into the task
    # overrides. the guid allows the FSM_CONTEXT to be loaded from
    # storage, and the FSM_PRIMARY_STREAM_SOURCE allows the call
    # to send_next_event_for_dispatch to succeed.
    env = [{
        AWS_ECS.CONTAINER_OVERRIDES.ENVIRONMENT.NAME: ENVIRONMENT.FSM_ENVIRONMENT_GUID_KEY,
        AWS_ECS.CONTAINER_OVERRIDES.ENVIRONMENT.VALUE: guid
    }, {
        AWS_ECS.CONTAINER_OVERRIDES.ENVIRONMENT.NAME: ENVIRONMENT.FSM_PRIMARY_STREAM_SOURCE,
        AWS_ECS.CONTAINER_OVERRIDES.ENVIRONMENT.VALUE: get_primary_stream_source() or ''
    }, {
        AWS_ECS.CONTAINER_OVERRIDES.ENVIRONMENT.NAME: ENVIRONMENT.FSM_SECONDARY_STREAM_SOURCE,
        AWS_ECS.CONTAINER_OVERRIDES.ENVIRONMENT.VALUE: get_secondary_stream_source() or ''
    }]

    # this is for local testing
    if context.get(CLONE_AWS_CREDENTIALS_KEY):
        _testing(env)

    # get an ECS connection and start a task.
    conn = get_connection(cluster_arn)

    # run the task
    conn.run_task(
        cluster=cluster_arn,
        taskDefinition=task_definition,
        overrides={
            AWS_ECS.CONTAINER_OVERRIDES.KEY: [
                {
                    AWS_ECS.CONTAINER_OVERRIDES.CONTAINER_NAME: container_name,
                    AWS_ECS.CONTAINER_OVERRIDES.ENVIRONMENT.KEY: env
                }
            ]
        })

    # entry actions do not return events
    return None
def test_custom_encoder(self, mock_settings):
    mock_settings.JSON_DUMPS_ADDITIONAL_KWARGS = {'cls': Encoder}
    self.assertEquals('B', json.dumps("A", **json_dumps_additional_kwargs()))
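# The Encoder class used above is defined elsewhere in the test module; a hypothetical
# stand-in consistent with the assertion (illustrative only, not the repo's definition):
import json

class Encoder(json.JSONEncoder):
    def encode(self, o):
        # always encode to the literal string 'B', so json.dumps("A", cls=Encoder) == 'B'
        return 'B'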