def handle_state_request_constraint_encountered(self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid):
    """ Close a spot request that hit a constraint and ask the master to resubmit it

    The constraint was encountered after the spot request was initiated but before
    the request was fulfilled, i.e. the time limit expired.  The request row is
    marked closed, then a TYPE_RESUBMIT_FAILED_REQUEST message is queued for the master.

    :param spot_request_msg: its spot_master_uuid and spot_request_uuid are copied into the resubmit message
    :param spot_request_item: spot request row that gets the partial save
    :param spot_request_uuid: used only to build the log message header
    :param spot_master_uuid: not read here; the master uuid is taken from spot_request_msg
    """
    logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'handle_state_request_constraint_encountered')

    # Close the row: final state code, no longer open, end timestamp in epoch seconds
    closed_row_values = {
        TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete,
        TableSpotRequest.is_open: 0,
        TableSpotRequest.ts_end: int(time.time()),
    }
    spot_request_row_partial_save(
        self.spot_request_table_name,
        spot_request_item,
        closed_row_values,
        region_name=self.region_name,
        profile_name=self.profile_name,
    )

    # Ask the master to create a new spot request based on the one that just failed
    resubmit_msg = SpotMasterMsg(
        spot_master_uuid=spot_request_msg.spot_master_uuid,
        spot_master_msg_type=SpotMasterMsg.TYPE_RESUBMIT_FAILED_REQUEST,
        spot_request_uuid=spot_request_msg.spot_request_uuid,
    )
    resubmit_attributes = create_microsvc_message_attributes(
        awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageResubmitFailedRequest
    )
    master_queue = SqsMessageDurable(self.spot_master_queue_name, self.region_name, profile_name=self.profile_name)
    master_queue.send_message(resubmit_msg.to_json(), message_attributes=resubmit_attributes)
def handle_state_instance_force_termination_pending(self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid):
    """ Close a spot request whose instance is being force-terminated and ask the master to resubmit it

    AWS has started the termination process for this instance, i.e. the price has
    increased; this is the beginning of the two minute warning before forced
    termination.  The request row is marked closed, then a
    TYPE_RESUBMIT_FAILED_REQUEST message is queued for the master.

    :param spot_request_msg: its spot_master_uuid and spot_request_uuid are copied into the resubmit message
    :param spot_request_item: spot request row that gets the partial save
    :param spot_request_uuid: used only to build the log message header
    :param spot_master_uuid: not read here; the master uuid is taken from spot_request_msg
    """
    logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'handle_state_instance_force_termination_pending')

    # Close the row: final state code, no longer open, end timestamp in epoch seconds
    closed_row_values = {
        TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete,
        TableSpotRequest.is_open: 0,
        TableSpotRequest.ts_end: int(time.time()),
    }
    spot_request_row_partial_save(
        self.spot_request_table_name,
        spot_request_item,
        closed_row_values,
        region_name=self.region_name,
        profile_name=self.profile_name,
    )

    # Ask the master to create a new spot request based on the one that just failed
    resubmit_msg = SpotMasterMsg(
        spot_master_uuid=spot_request_msg.spot_master_uuid,
        spot_master_msg_type=SpotMasterMsg.TYPE_RESUBMIT_FAILED_REQUEST,
        spot_request_uuid=spot_request_msg.spot_request_uuid,
    )
    resubmit_attributes = create_microsvc_message_attributes(
        awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageResubmitFailedRequest
    )
    master_queue = SqsMessageDurable(self.spot_master_queue_name, self.region_name, profile_name=self.profile_name)
    master_queue.send_message(resubmit_msg.to_json(), message_attributes=resubmit_attributes)
def process(self, message):
    """ Record a completed spot request and notify the master

    Writes the completion info carried by the message (timestamp, return code and,
    when non-empty, stdout/stderr) to the SpotRequestItem in DynamoDB, then queues
    a TYPE_INCR_INSTANCE_SUCCESS_CNT message so the master can determine if the job
    has completed.  The incoming message is deleted only after both steps succeed;
    any StandardError is logged and swallowed, leaving the message undeleted.

    :param message: SQS Message instance
    """
    # Bound before the try so the except handler can always build its log header,
    # even when the failure happens before the item has been fetched
    spot_request_item = None
    try:
        spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
        spot_request_item = get_spot_request_item(
            self.spot_request_table_name,
            spot_request_msg.spot_request_uuid,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        pairs = spot_request_msg.name_value_pairs
        ts_cmd_complete = pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_COMPLETE_TIMESTAMP]
        cmd_returncode = pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_RETURNCODE]
        cmd_std_out = pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_STD_OUT]
        cmd_std_err = pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_STD_ERR]
        key_value_pairs = {
            TableSpotRequest.is_open: 0,
            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete,
            TableSpotRequest.ts_cmd_complete: ts_cmd_complete,
            TableSpotRequest.cmd_returncode: cmd_returncode,
        }
        # stdout/stderr are only saved when there is something to save
        if cmd_std_out is not None and len(cmd_std_out) > 0:
            key_value_pairs[TableSpotRequest.cmd_std_out] = cmd_std_out
        if cmd_std_err is not None and len(cmd_std_err) > 0:
            key_value_pairs[TableSpotRequest.cmd_std_err] = cmd_std_err
        spot_request_row_partial_save(
            self.spot_request_table_name,
            spot_request_item,
            key_value_pairs,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        # let the Master increment the completion count to determine if the job is complete
        master_msg_incr_instance_success = SpotMasterMsg(
            spot_master_uuid=spot_request_msg.spot_master_uuid,
            spot_master_msg_type=SpotMasterMsg.TYPE_INCR_INSTANCE_SUCCESS_CNT,
        )
        message_attributes = create_microsvc_message_attributes(
            awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageIncrSuccessCnt
        )
        spot_master_sqs_message_durable = SqsMessageDurable(
            self.spot_master_queue_name, self.region_name, profile_name=self.profile_name
        )
        spot_master_sqs_message_durable.send_message(
            master_msg_incr_instance_success.to_json(), message_attributes=message_attributes
        )
        self.spot_request_sqs_message_durable.delete_message(message)
    except StandardError as e:
        # The item-based header needs the fetched item; without it log with no
        # header rather than raising a second error that would hide the first
        if spot_request_item is not None:
            msg_hdr = fmt_request_item_msg_hdr(spot_request_item)
        else:
            msg_hdr = ""
        logger.error(msg_hdr + "Exiting SpotRequestDispatcher due to exception")
        logger.error(msg_hdr + str(e))
        logger.error(msg_hdr + traceback.format_exc())
def submit_spot_batch_job(argv):
    """ Submit a user's spot batch job

    Sends one SQS message of type TYPE_SUBMIT_BATCH to the spot master queue,
    carrying the raw contents of the Batch Job parm file and, when given, the
    User parm file.  Command line layout as read from sys.argv:
      [1] log configuration file (required)
      [2] Batch Job parm file (required)
      [3] User Job parm file (optional, only read when exactly four arguments are present)
    Exits with status 8 on a missing required argument or on any StandardError.

    :param argv: not read by this function; the arguments are taken from sys.argv
    """
    import logging.config

    arg_count = len(sys.argv)
    if arg_count == 1:
        print('ERROR: Missing log configuration file, first argument must be path/name.ext of the log configuration file')
        sys.exit(8)
    logging.config.fileConfig(sys.argv[1], disable_existing_loggers=False)
    logger = logging.getLogger(__name__)

    if arg_count == 2:
        logger.error('ERROR: Missing Batch Job Parm file, second argument must be path/name.ext of the log Batch Job Parm file')
        sys.exit(8)

    try:
        logger.info("Starting")

        path_batch_job_parm_file = sys.argv[2]
        path_user_job_parm_file = sys.argv[3] if arg_count == 4 else None

        with open(path_batch_job_parm_file) as batch_parm_file:
            raw_batch_job_parm_item = batch_parm_file.read()

        raw_user_job_parm_item = None
        if path_user_job_parm_file is not None:
            with open(path_user_job_parm_file) as user_parm_file:
                raw_user_job_parm_item = user_parm_file.read()

        # The parsed batch job parms supply the region and profile used to reach the master queue
        batch_job_parm_item = BatchJobParmItem(stringParmFile=raw_batch_job_parm_item)
        master_queue = SqsMessageDurable(
            awsspotbatch.common.const.SPOT_MASTER_QUEUE_NAME,
            batch_job_parm_item.primary_region_name,
            profile_name=batch_job_parm_item.profile_name,
        )

        spot_master_uuid = str(uuid.uuid1())
        logger.info('Submitting test batch message, spot_master_uuid=' + spot_master_uuid)
        submit_batch_msg = SpotMasterMsg(
            spot_master_uuid=spot_master_uuid,
            spot_master_msg_type=SpotMasterMsg.TYPE_SUBMIT_BATCH,
            raw_batch_job_parm_item=raw_batch_job_parm_item,
            raw_user_job_parm_item=raw_user_job_parm_item,
        )
        submit_batch_attributes = create_microsvc_message_attributes(
            awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageSubmitBatch
        )
        master_queue.send_message(submit_batch_msg.to_json(), message_attributes=submit_batch_attributes)
        logger.info('Completed Successfully')

    except StandardError as e:
        logger.error(e)
        logger.error(traceback.format_exc())
        sys.exit(8)
def receive_test_data(self):
    """ Drain the spot master queue, logging the type of each message

    Messages are received one at a time, parsed as SpotMasterMsg, logged and
    deleted, until receive_message returns a value equal to None.
    Exits with status 8 on any StandardError.
    """
    try:
        test_queue = SqsMessageDurable(
            self.spot_master_queue_name, region_name=self.region_name, profile_name=self.profile_name
        )
        # iter(callable, sentinel) keeps calling receive_message until the result equals None
        for received in iter(test_queue.receive_message, None):
            parsed_msg = SpotMasterMsg(raw_json=received.get_body())
            logger.info('spot_master_msg: type=' + parsed_msg.spot_master_msg_type)
            test_queue.delete_message(received)

    except StandardError as e:
        logger.error(e)
        logger.error(traceback.format_exc())
        sys.exit(8)
def process(self, message):
    """ Try to submit another Spot Request based on the one that just failed

    Loads the master item and the failed request item from DynamoDB and asks
    resubmit_failed_request_spot_instance for a replacement spot instance request.
      - When one is returned, a TYPE_SPOT_REQUEST_INITIATED message (new request
        uuid, attempt number incremented by one) is queued on the spot request queue.
      - When None is returned, the original message body is re-queued on the master
        queue with a delay so it will be reprocessed later.
    In both cases the incoming message is then deleted.  Any StandardError is
    logged and swallowed, leaving the incoming message undeleted.

    :param message: SQS Message instance
    """
    # Bound before the try so the except handler can always build a log header,
    # whichever statement failed
    spot_master_uuid = None
    spot_master_item = None
    try:
        spot_master_msg = SpotMasterMsg(raw_json=message.get_body())
        spot_master_uuid = spot_master_msg.spot_master_uuid
        logger.info(fmt_master_uuid_msg_hdr(spot_master_uuid) + 'process_resubmit_failed_request')
        dynamodb_conn = boto.dynamodb2.connect_to_region(self.region_name, profile_name=self.profile_name)
        spot_master_table = Table(self.spot_master_table_name, connection=dynamodb_conn)
        spot_master_item = spot_master_table.get_item(spot_master_uuid=spot_master_uuid)
        spot_request_table = Table(self.spot_request_table_name, connection=dynamodb_conn)
        failed_spot_request_item = spot_request_table.get_item(spot_request_uuid=spot_master_msg.spot_request_uuid)

        # Request spot instance
        spot_instance_request = self.resubmit_failed_request_spot_instance(
            spot_master_item, failed_spot_request_item, dynamodb_conn
        )

        if spot_instance_request is not None:
            # Queue up a SpotRequestMsg for the replacement request
            spot_request_uuid = str(uuid.uuid1())
            spot_request_msg = SpotRequestMsg(
                spot_request_uuid=spot_request_uuid,
                spot_master_uuid=spot_master_item[TableSpotMaster.spot_master_uuid],
                spot_request_msg_type=SpotRequestMsg.TYPE_SPOT_REQUEST_INITIATED,
                spot_request_id=spot_instance_request.id,
            )
            spot_request_msg.name_value_pairs[SpotRequestMsg.PAIR_NAME_SPOT_PRICE] = str(spot_instance_request.price)
            spot_request_msg.name_value_pairs[SpotRequestMsg.PAIR_NAME_INSTANCE_USERNAME] = spot_master_item[TableSpotMaster.instance_username]
            spot_request_msg.name_value_pairs[SpotRequestMsg.PAIR_NAME_ATTEMPT_NUMBER] = int(failed_spot_request_item[TableSpotRequest.attempt_number] + 1)

            spot_request_sqs_message_durable = SqsMessageDurable(
                self.spot_request_queue_name, self.region_name, profile_name=self.profile_name
            )
            spot_request_sqs_message_durable.send_message(
                spot_request_msg.to_json(),
                message_attributes=create_microsvc_message_attributes(
                    awsspotbatch.common.const.MICROSVC_REQUEST_CLASSNAME_SpotRequestMessageSpotRequestInitiated
                ),
            )
        else:
            # No instances available - resubmit this message with a delay timer so it will get reprocessed in future
            logger.warning(
                fmt_master_uuid_msg_hdr(spot_master_uuid)
                + 'No spot instances available, will try again in '
                + str(awsspotbatch.common.const.NO_SPOT_INSTANCES_AVAILABLE_RECHECK_MINUTES)
                + ' minutes'
            )
            delay_seconds = awsspotbatch.common.const.NO_SPOT_INSTANCES_AVAILABLE_RECHECK_MINUTES * 60
            self.spot_master_sqs_message_durable.send_message(
                message.get_body(),
                message_attributes=create_microsvc_message_attributes(
                    awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageResubmitFailedRequest
                ),
                delay_seconds=delay_seconds,
            )

        # Either a new request or a delayed retry has been queued, so this message is done
        self.spot_master_sqs_message_durable.delete_message(message)

    except StandardError as e:
        # Use the most specific header available; the item-based one only exists
        # once the master item has been fetched
        if spot_master_item is not None:
            msg_hdr = fmt_master_item_msg_hdr(spot_master_item)
        elif spot_master_uuid is not None:
            msg_hdr = fmt_master_uuid_msg_hdr(spot_master_uuid)
        else:
            msg_hdr = ''
        logger.error(msg_hdr + str(e))
        logger.error(msg_hdr + traceback.format_exc())
def send_test_data(self):
    """ Send two test messages sharing one new spot_master_uuid to the spot master queue

    A TYPE_SUBMIT_BATCH message is sent first, then a TYPE_CHECK_STATUS message.
    Exits with status 8 on any StandardError.
    """
    try:
        spot_master_uuid = str(uuid.uuid1())
        # Both messages are built before the queue wrapper is created; sent in this order
        test_msgs = [
            SpotMasterMsg(spot_master_uuid, msg_type)
            for msg_type in (SpotMasterMsg.TYPE_SUBMIT_BATCH, SpotMasterMsg.TYPE_CHECK_STATUS)
        ]
        test_queue = SqsMessageDurable(
            self.spot_master_queue_name, region_name=self.region_name, profile_name=self.profile_name
        )
        for test_msg in test_msgs:
            test_queue.send_message(test_msg.to_json())

    except StandardError as e:
        logger.error(e)
        logger.error(traceback.format_exc())
        sys.exit(8)
def handle_state_master_role_policy_in_progress(self, spot_master_item, dynamodb_conn):
    """ Once the Policy is added to the Role, request spot instances and queue a message per request

    Returns without changing anything while the policy is not yet reported as
    added.  Otherwise spot instances are requested; each returned request gets a
    TYPE_SPOT_REQUEST_INITIATED message (attempt number 1) on the spot request
    queue and the master moves to waiting_for_instances_complete.  When no
    requests are returned the master moves to no_instances_available.

    :param spot_master_item: master row; supplies role/policy names, price, username and master uuid
    :param dynamodb_conn: not read by this method
    """
    logger.info(fmt_master_item_msg_hdr(spot_master_item) + 'handle_state_master_role_policy_in_progress')

    iam_conn = awsext.iam.connect_to_region(self.region_name, profile_name=self.profile_name)
    policy_attached = iam_conn.is_role_policy_added(
        role_name=spot_master_item[TableSpotMaster.role_name],
        policy_name=spot_master_item[TableSpotMaster.policy_name],
    )
    if not policy_attached:
        return

    # For some bizarre timing reason, is_role_policy_added can return True but the spot request
    # fails on IAM role not attached to instance profile - give it a few seconds to clear
    time.sleep(5)

    # Request spot instances
    spot_instance_requests = submit_request_spot_instances(spot_master_item, self.region_name, self.profile_name)

    if spot_instance_requests != None:
        next_state_code = SpotMasterStateCode.waiting_for_instances_complete
        # One SpotRequestMsg per spot request - this will manage all states for SpotRequest
        request_queue = SqsMessageDurable(self.spot_request_queue_name, self.region_name, profile_name=self.profile_name)
        for spot_instance_request in spot_instance_requests:
            initiated_msg = SpotRequestMsg(
                spot_request_uuid=str(uuid.uuid1()),
                spot_master_uuid=spot_master_item[TableSpotMaster.spot_master_uuid],
                spot_request_msg_type=SpotRequestMsg.TYPE_SPOT_REQUEST_INITIATED,
                spot_request_id=spot_instance_request.id,
            )
            pairs = initiated_msg.name_value_pairs
            pairs[SpotRequestMsg.PAIR_NAME_SPOT_PRICE] = str(spot_master_item[TableSpotMaster.cheapest_price])
            pairs[SpotRequestMsg.PAIR_NAME_INSTANCE_USERNAME] = spot_master_item[TableSpotMaster.instance_username]
            pairs[SpotRequestMsg.PAIR_NAME_ATTEMPT_NUMBER] = 1
            initiated_attributes = create_microsvc_message_attributes(
                awsspotbatch.common.const.MICROSVC_REQUEST_CLASSNAME_SpotRequestMessageSpotRequestInitiated
            )
            request_queue.send_message(initiated_msg.to_json(), message_attributes=initiated_attributes)
    else:
        next_state_code = SpotMasterStateCode.no_instances_available

    spot_master_row_partial_save(
        self.spot_master_table_name,
        spot_master_item,
        {TableSpotMaster.spot_master_state_code: next_state_code},
        region_name=self.region_name,
        profile_name=self.profile_name,
    )
def run(self):
    """ Read Master messages and launch a Master microservice for each one

    The microservice is chosen by the message's service_class_name message
    attribute.  The loop ends only when self.is_shutdown is true.
    """
    master_queue = SqsMessageDurable(self.spot_master_queue_name, self.region_name, profile_name=self.profile_name)
    while True:
        try:
            logger.info('SpotMasterDispatcher loop')
            if self.is_shutdown:
                logger.info('Shutting down SpotMasterDispatcher' )
                break
            message = master_queue.receive_message(message_attributes=['service_class_name'])
            if message == None:
                # nothing received this time around, poll again
                continue
            service_class_name = message.message_attributes['service_class_name']['string_value']
            logger.info('Launching ' + service_class_name)
            launcher = SpotMasterMicrosvcLauncher(service_class_name, message, self)
            launcher.start()
        except StandardError as e:
            # NOTE(review): the log text says "Exiting" but the loop carries on with the next iteration
            logger.error('Exiting SpotMasterDispatcher due to exception' )
            logger.error(e)
            logger.error(traceback.format_exc())
def main():
    """ Development entry point: purge both queues, start both dispatchers and submit one batch

    Command line layout as read from sys.argv:
      [1] master parm item file
      [2], [3] passed to create_spot_master_msg_batch_submit
    Logging is configured from a fixed relative path.  Blocks until the master
    dispatcher is joined; exits with status 8 on any StandardError.
    """
    import logging.config
    logging.config.fileConfig('../../../../config/consoleandfile.conf', disable_existing_loggers=False)
    logger = logging.getLogger(__name__)

    try:
        logger.info('Starting')
        master_parm_item = MasterParmItem(sys.argv[1])
        region_name = master_parm_item.region_name
        profile_name = master_parm_item.profile_name

        master_queue = SqsMessageDurable(
            awsspotbatch.common.const.SPOT_MASTER_QUEUE_NAME, region_name, profile_name=profile_name
        )
        request_queue = SqsMessageDurable(
            awsspotbatch.common.const.SPOT_REQUEST_QUEUE_NAME, region_name, profile_name=profile_name
        )
        # TEST TEST TEST - only during development
        master_queue.purge_queue()
        request_queue.purge_queue()

        spot_master_dispatcher = SpotMasterDispatcher(region_name=region_name, profile_name=profile_name)
        spot_request_dispatcher = SpotRequestDispatcher(region_name=region_name, profile_name=profile_name)
        spot_master_dispatcher.start()
        spot_request_dispatcher.start()

        # With both dispatchers running, submit the batch described by the remaining arguments
        batch_submit_msg = create_spot_master_msg_batch_submit(sys.argv[2], sys.argv[3])
        batch_submit_attributes = create_microsvc_message_attributes(
            awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageSubmitBatch
        )
        master_queue.send_message(batch_submit_msg.to_json(), message_attributes=batch_submit_attributes)

        spot_master_dispatcher.join()
        logger.info('Completed Successfully')

    except StandardError as e:
        logger.error(e)
        logger.error(traceback.format_exc())
        sys.exit(8)
def launch_spot_batch_service():
    """ Launch the master and request dispatchers

    Intended to run on a t2.micro EC2 instance, in the future this will be in AWS Lambda
    1. Multiple instances can (and should) be launched concurrently, i.e. in different AZ's
    2. Install as a service that starts at system boot, this is detailed in the README

    Command line layout as read from sys.argv:
      [1] log configuration file (required)
      [2] master parm item json file (required)
      [3] the literal 'purge' to purge both queues before starting (optional)
    Blocks until the master dispatcher is joined; exits with status 8 on a
    missing required argument or on any StandardError.
    """
    if len(sys.argv) == 1:
        print('ERROR: Missing log configuration file, first argument must be path/name.ext of the log configuration file')
        sys.exit(8)
    logging.config.fileConfig(sys.argv[1], disable_existing_loggers=False)
    logger = logging.getLogger(__name__)

    try:
        banner = '**********************************'
        logger.info(banner)
        logger.info('Starting SpotBatchMgr Version: ' + awsspotbatch.Version)
        logger.info(banner)

        if len(sys.argv) == 2:
            logger.error('Missing master parm item file, second argument must be path/name.ext of master parm item json file')
            sys.exit(8)
        master_parm_item = MasterParmItem(sys.argv[2])
        is_purge_queues = len(sys.argv) > 3 and sys.argv[3] == 'purge'
        region_name = master_parm_item.region_name
        profile_name = master_parm_item.profile_name

        master_queue = SqsMessageDurable(
            awsspotbatch.common.const.SPOT_MASTER_QUEUE_NAME, region_name, profile_name=profile_name
        )
        request_queue = SqsMessageDurable(
            awsspotbatch.common.const.SPOT_REQUEST_QUEUE_NAME, region_name, profile_name=profile_name
        )
        if is_purge_queues:
            master_queue.purge_queue()
            request_queue.purge_queue()

        spot_master_dispatcher = SpotMasterDispatcher(region_name=region_name, profile_name=profile_name)
        spot_request_dispatcher = SpotRequestDispatcher(region_name=region_name, profile_name=profile_name)
        spot_master_dispatcher.start()
        logger.info("Started: spot_master_dispatcher")
        spot_request_dispatcher.start()
        logger.info("Started: spot_request_dispatcher")

        # Only the master dispatcher is waited on
        spot_master_dispatcher.join()

    except StandardError as e:
        logger.error(e)
        logger.error(traceback.format_exc())
        sys.exit(8)
def main():
    """ Run the user's batch script on the spot instance and report the outcome to the spot request queue

    Command line layout as read from sys.argv:
      [1] spot client parm file (required)
    Steps:
      1. Start the SpotInstanceStatusThread for this spot request.
      2. Run script_name_args as a child process, capturing stdout and stderr.
      3. Send a TYPE_INSTANCE_BATCH_PROCESS_COMPLETE message with the completion
         timestamp, return code and trimmed stdout/stderr, then shut the thread down.
    On any StandardError a TYPE_INSTANCE_BATCH_PROCESS_START_EXCEPTION message
    with the exception text and traceback is sent instead (when the parm file
    was loaded, since it supplies the queue name) and the process exits with
    status 8.
    """
    if len(sys.argv) < 2:
        print('Invalid format, execution cancelled')
        print('Correct format: python awsspotbatch.spotclientlaunch <parmFile.json>')
        sys.exit(8)

    logging.basicConfig(format='%(asctime)s [%(levelname)s] [%(module)s] [%(funcName)s] [%(message)s]', level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Bound before the try so the except handler knows how far start-up got
    # instead of failing on an unbound name
    spot_client_parm_item = None
    spot_instance_status_thread = None
    is_status_thread_started = False
    try:
        spot_client_parm_item = SpotClientParmItem(pathInParmFile=sys.argv[1])
        logger.info('Starting, region_name=' + spot_client_parm_item.region_name)
        spot_instance_status_thread = SpotInstanceStatusThread(
            spot_client_parm_item.spot_request_queue_name,
            spot_client_parm_item.region_name,
            spot_request_uuid=spot_client_parm_item.spot_request_uuid,
            spot_master_uuid=spot_client_parm_item.spot_master_uuid,
            spot_request_id=spot_client_parm_item.spot_request_id,
        )
        spot_instance_status_thread.start()
        is_status_thread_started = True

        child_process = subprocess.Popen(
            spot_client_parm_item.script_name_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        std_out, std_err = child_process.communicate()
        returncode = child_process.returncode
        std_out, std_err = awsspotbatch.common.util.trimStdOutErrSqsPayload(std_out, std_err)

        sqs_message_send_durable = SqsMessageDurable(
            spot_client_parm_item.spot_request_queue_name, spot_client_parm_item.region_name
        )
        message_attributes = create_microsvc_message_attributes(
            awsspotbatch.common.const.MICROSVC_REQUEST_CLASSNAME_SpotRequestMessageInstanceBatchProcessComplete
        )
        sqs_message_send_durable.send_message(
            SpotRequestMsg(
                spot_request_uuid=spot_client_parm_item.spot_request_uuid,
                spot_master_uuid=spot_client_parm_item.spot_master_uuid,
                spot_request_msg_type=SpotRequestMsg.TYPE_INSTANCE_BATCH_PROCESS_COMPLETE,
                spot_request_id=spot_client_parm_item.spot_request_id,
                name_value_pairs={
                    SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_COMPLETE_TIMESTAMP: int(time.time()),
                    SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_RETURNCODE: str(returncode),
                    SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_STD_OUT: std_out,
                    SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_STD_ERR: std_err,
                },
            ).to_json(),
            message_attributes=message_attributes,
        )
        spot_instance_status_thread.shutdown()
        spot_instance_status_thread.join(60)
        logger.info('Completed Successfully, child_process returncode=' + str(returncode))

    except StandardError as e:
        # Capture once so the logged and the reported traceback are the same text
        exception_traceback = traceback.format_exc()
        if spot_instance_status_thread is not None:
            spot_instance_status_thread.is_shutdown = True
        # Every arg followed by '|'; %s formatting copes with non-string args
        # such as the integer errno carried by an OSError from Popen
        message = ''.join('%s|' % (arg,) for arg in e.args)
        logger.error(message)
        logger.error(exception_traceback)

        if spot_client_parm_item is None:
            # The parm file never loaded, so the queue name and uuids needed to
            # report the failure are unknown - logging is all that can be done
            sys.exit(8)

        sqs_message_send_durable = SqsMessageDurable(
            spot_client_parm_item.spot_request_queue_name, spot_client_parm_item.region_name
        )
        message_attributes = create_microsvc_message_attributes(
            awsspotbatch.common.const.MICROSVC_REQUEST_CLASSNAME_SpotRequestMessageInstanceBatchProcessStartException
        )
        sqs_message_send_durable.send_message(
            SpotRequestMsg(
                spot_request_uuid=spot_client_parm_item.spot_request_uuid,
                spot_master_uuid=spot_client_parm_item.spot_master_uuid,
                spot_request_msg_type=SpotRequestMsg.TYPE_INSTANCE_BATCH_PROCESS_START_EXCEPTION,
                spot_request_id=spot_client_parm_item.spot_request_id,
                name_value_pairs={
                    SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_COMPLETE_TIMESTAMP: int(time.time()),
                    SpotRequestMsg.PAIR_NAME_INSTANCE_BATCH_PROCESS_START_EXCEPTION_MESSAGE: message,
                    SpotRequestMsg.PAIR_NAME_INSTANCE_BATCH_PROCESS_START_EXCEPTION_TRACEBACK: exception_traceback,
                },
            ).to_json(),
            message_attributes=message_attributes,
        )
        # Only wait on the status thread if start() completed
        if is_status_thread_started:
            spot_instance_status_thread.join(60)
        sys.exit(8)