def process( self, message ) :
    """
    Process a CheckStatus message: load the Spot Master row, run the handler that
    matches its current state and, unless cleanup is complete, queue the next
    CheckStatus message.

    Errors of type StandardError are logged (message and traceback) and not re-raised.

    :param message: SQS Message instance

    """
    # Bound before the try block so the except handler can always build its log
    # header, even when parsing the message body is what failed
    spot_master_uuid = None
    try:
        spot_master_msg = SpotMasterMsg( raw_json=message.get_body() )
        spot_master_uuid = spot_master_msg.spot_master_uuid
        logger.info( fmt_master_uuid_msg_hdr( spot_master_uuid ) + 'process_check_status' )

        # Get master row from DynamoDB and process based on state
        dynamodb_conn = boto.dynamodb2.connect_to_region( self.region_name, profile_name=self.profile_name )
        spot_master_table = Table( self.spot_master_table_name, connection=dynamodb_conn )
        spot_master_item = spot_master_table.get_item( spot_master_uuid=spot_master_uuid )
        master_state_code = spot_master_item[ TableSpotMaster.spot_master_state_code ]
        logger.info( fmt_master_uuid_msg_hdr( spot_master_uuid ) + 'master state=' + master_state_code )

        next_status_msg_delay_secs = 60
        is_send_master_msg_check_status = True

        # Take the timestamp once so the in-memory item and the persisted row agree
        ts_last_state_check = int( time.time() )
        spot_master_item[ TableSpotMaster.ts_last_state_check ] = ts_last_state_check
        spot_master_row_partial_save( self.spot_master_table_name, spot_master_item,
                                      {TableSpotMaster.ts_last_state_check:ts_last_state_check},
                                      region_name=self.region_name, profile_name=self.profile_name )

        # Process based on the current Master State; the "in progress" resource
        # states are re-checked after 5 seconds instead of the default 60
        if SpotMasterStateCode.master_resources_in_progress == master_state_code:
            self.handle_state_master_resources_in_progress( spot_master_item )
            next_status_msg_delay_secs = 5
        elif SpotMasterStateCode.master_role_policy_in_progress == master_state_code:
            self.handle_state_master_role_policy_in_progress( spot_master_item, dynamodb_conn )
            next_status_msg_delay_secs = 5
        elif SpotMasterStateCode.waiting_for_instances_complete == master_state_code:
            self.handle_state_waiting_for_instances_complete( spot_master_item )
        elif SpotMasterStateCode.waiting_for_instances_terminated == master_state_code:
            self.handle_state_waiting_for_instances_terminated( spot_master_item )
        elif SpotMasterStateCode.waiting_for_master_resources_terminated == master_state_code:
            self.handle_state_waiting_for_master_resources_terminated( spot_master_item )
            next_status_msg_delay_secs = 5
        elif SpotMasterStateCode.cleanup_in_progress == master_state_code:
            self.handle_state_cleanup_in_progress( spot_master_item )
        elif SpotMasterStateCode.cleanup_complete == master_state_code:
            self.handle_state_cleanup_complete( spot_master_item )
            # Terminal state - no further CheckStatus messages are queued
            is_send_master_msg_check_status = False

        # NOTE(review): assumed to run for every state (the message is replaced by a
        # fresh CheckStatus message below), not only for cleanup_complete - confirm
        self.spot_master_sqs_message_durable.delete_message(message)

        if is_send_master_msg_check_status:
            spot_master_msg_check_status = SpotMasterMsg( spot_master_uuid=spot_master_uuid,
                                                          spot_master_msg_type=SpotMasterMsg.TYPE_CHECK_STATUS )
            message_attributes = create_microsvc_message_attributes( awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageCheckStatus )
            self.spot_master_sqs_message_durable.send_message( spot_master_msg_check_status.to_json(),
                                                               delay_seconds=next_status_msg_delay_secs,
                                                               message_attributes=message_attributes )

    except StandardError as e:
        # Only build the uuid header when the uuid was actually parsed
        msg_hdr = fmt_master_uuid_msg_hdr( spot_master_uuid ) if spot_master_uuid is not None else ''
        logger.error( msg_hdr + str(e) )
        logger.error( msg_hdr + traceback.format_exc() )
def process( self, message ) :
    """
    Try to submit another Spot Request based on the one that just failed.

    When a new spot instance request is obtained, a SpotRequestInitiated message is
    queued for it. Otherwise this message is re-queued with a delay so the resubmit
    is attempted again later. In both cases the incoming message is deleted.

    Errors of type StandardError are logged (message and traceback) and not re-raised.

    :param message: SQS Message instance

    """
    # Bound before the try block so the except handler never hits an unbound name
    # when the failure happens before the master row has been fetched
    spot_master_uuid = None
    spot_master_item = None
    try:
        spot_master_msg = SpotMasterMsg( raw_json=message.get_body() )
        spot_master_uuid = spot_master_msg.spot_master_uuid
        logger.info( fmt_master_uuid_msg_hdr( spot_master_uuid ) + 'process_resubmit_failed_request')

        dynamodb_conn = boto.dynamodb2.connect_to_region( self.region_name, profile_name=self.profile_name )
        spot_master_table = Table( self.spot_master_table_name, connection=dynamodb_conn )
        spot_master_item = spot_master_table.get_item( spot_master_uuid=spot_master_uuid )
        spot_request_table = Table( self.spot_request_table_name, connection=dynamodb_conn )
        failed_spot_request_item = spot_request_table.get_item( spot_request_uuid=spot_master_msg.spot_request_uuid )

        # Request spot instance
        spot_instance_request = self.resubmit_failed_request_spot_instance( spot_master_item, failed_spot_request_item, dynamodb_conn )

        if spot_instance_request is not None:
            # Queue up a SpotRequestMsg for the new request, carrying the bumped attempt number
            spot_request_uuid = str(uuid.uuid1())
            spot_request_msg = SpotRequestMsg( spot_request_uuid=spot_request_uuid,
                                               spot_master_uuid=spot_master_item[ TableSpotMaster.spot_master_uuid ],
                                               spot_request_msg_type=SpotRequestMsg.TYPE_SPOT_REQUEST_INITIATED,
                                               spot_request_id=spot_instance_request.id )
            spot_request_msg.name_value_pairs[ SpotRequestMsg.PAIR_NAME_SPOT_PRICE ] = str( spot_instance_request.price )
            spot_request_msg.name_value_pairs[ SpotRequestMsg.PAIR_NAME_INSTANCE_USERNAME ] = spot_master_item[ TableSpotMaster.instance_username ]
            spot_request_msg.name_value_pairs[ SpotRequestMsg.PAIR_NAME_ATTEMPT_NUMBER ] = int( failed_spot_request_item[ TableSpotRequest.attempt_number ] + 1 )

            spot_request_sqs_message_durable = SqsMessageDurable( self.spot_request_queue_name, self.region_name, profile_name=self.profile_name )
            spot_request_sqs_message_durable.send_message( spot_request_msg.to_json(),
                                                           message_attributes=create_microsvc_message_attributes( awsspotbatch.common.const.MICROSVC_REQUEST_CLASSNAME_SpotRequestMessageSpotRequestInitiated ) )
        else:
            # No instances available - resubmit this message with a delay timer so it will get reprocessed in future
            logger.warning( fmt_master_uuid_msg_hdr( spot_master_uuid ) + 'No spot instances available, will try again in ' +
                            str(awsspotbatch.common.const.NO_SPOT_INSTANCES_AVAILABLE_RECHECK_MINUTES) + ' minutes')
            delay_seconds = awsspotbatch.common.const.NO_SPOT_INSTANCES_AVAILABLE_RECHECK_MINUTES * 60
            self.spot_master_sqs_message_durable.send_message( message.get_body(),
                                                               message_attributes=create_microsvc_message_attributes( awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageResubmitFailedRequest ),
                                                               delay_seconds=delay_seconds )

        # Both outcomes queued a follow-up message, so the incoming one is done
        self.spot_master_sqs_message_durable.delete_message(message)

    except StandardError as e:
        # Prefer the item-based header; fall back to the uuid, then to no header,
        # depending on how far processing got before the failure
        if spot_master_item is not None:
            msg_hdr = fmt_master_item_msg_hdr( spot_master_item )
        elif spot_master_uuid is not None:
            msg_hdr = fmt_master_uuid_msg_hdr( spot_master_uuid )
        else:
            msg_hdr = ''
        logger.error( msg_hdr + str(e) )
        logger.error( msg_hdr + traceback.format_exc() )
def process( self, message ) :
    """
    User has requested Spot Batch instances - this is the initial entry point for a user request.

    When a cheapest subnet is found, the batch job parameters are stored, the master
    resources (key pair, security group, role/instance profile) are created, the
    master row is written, a CheckStatus message is queued and the incoming message
    is deleted. When none is found, the message is left on the queue with its
    visibility timeout set to 5 minutes so it is retried later.

    Errors of type StandardError are logged (message and traceback) and not re-raised.

    :param message: SQS Message instance

    """
    try:
        spot_master_msg = SpotMasterMsg( raw_json=message.get_body() )
        spot_master_uuid = spot_master_msg.spot_master_uuid
        logger.info( fmt_master_uuid_msg_hdr( spot_master_uuid ) + 'process_submit_batch')
        dynamodb_conn = boto.dynamodb2.connect_to_region( self.region_name, profile_name=self.profile_name )
        vpc_conn = awsext.vpc.connect_to_region( self.region_name, profile_name=self.profile_name )
        ec2_conn = awsext.ec2.connect_to_region( self.region_name, profile_name=self.profile_name )
        iam_conn = awsext.iam.connect_to_region( self.region_name, profile_name=self.profile_name )

        batch_job_parm_item = BatchJobParmItem( stringParmFile=spot_master_msg.raw_batch_job_parm_item )
        cheapest_subnet_id, cheapest_price, spot_cheapest_item = find_cheapest_subnet_price( batch_job_parm_item,profile_name=self.profile_name )

        if cheapest_subnet_id is not None:
            logger.info( fmt_master_uuid_msg_hdr( spot_master_uuid ) + 'Starting spot batch job' )
            put_batch_job_parm_item( spot_master_uuid, self.spot_batch_job_parm_table_name, spot_master_msg,
                                     region_name=self.region_name, profile_name=self.profile_name )
            spot_master_state_code = SpotMasterStateCode.master_resources_in_progress
            subnet = vpc_conn.get_all_subnets( subnet_ids=[cheapest_subnet_id] )[0]
            cheapest_vpc_id = subnet.vpc_id
            cheapest_region_name = spot_cheapest_item.region.name
            cheapest_zone_name = spot_cheapest_item.zone.name

            unique_key_pair = ec2_conn.create_unique_key_pair( 'spotkp_' )
            # Store the key for later use in SSH
            rsa_key_encoded = awsspotbatch.common.util.encode( awsspotbatch.common.util.kp_enc_key, unique_key_pair.material )
            put_rsa_key_item( spot_master_uuid, self.spot_rsa_key_table_name, rsa_key_encoded,
                              region_name=self.region_name, profile_name=self.profile_name )

            unique_security_group = vpc_conn.create_unique_security_group( cheapest_vpc_id, 'spotsg_' )
            policy = batch_job_parm_item.policy_statements
            security_group_inbound_rule_items_serialized = batch_job_parm_item.serialized_inbound_rule_items
            role_instance_profile_item = iam_conn.create_unique_role_instance_profile( policy=policy,
                                                                                       role_name_prefix=awsspotbatch.common.const.ROLE_NAME_PREFIX,
                                                                                       policy_name_prefix=awsspotbatch.common.const.POLICY_NAME_PREFIX )
            # instance_profile_name, role_name, policy_name

            self.create_master_row( dynamodb_conn, batch_job_parm_item,
                                    spot_master_msg=spot_master_msg,
                                    spot_master_uuid=spot_master_uuid,
                                    cheapest_vpc_id=cheapest_vpc_id,
                                    cheapest_subnet_id=cheapest_subnet_id,
                                    cheapest_region_name=cheapest_region_name,
                                    cheapest_zone_name=cheapest_zone_name,
                                    cheapest_price=cheapest_price,
                                    unique_key_pair=unique_key_pair,
                                    unique_security_group=unique_security_group,
                                    role_instance_profile_item=role_instance_profile_item,
                                    security_group_inbound_rule_items_serialized=security_group_inbound_rule_items_serialized,
                                    spot_master_state_code=spot_master_state_code )

            # submit CheckStatus msg to check on completion of master resources
            self.send_check_status( spot_master_uuid )
            self.spot_master_sqs_message_durable.delete_message(message)

        else:
            logger.warning( fmt_master_uuid_msg_hdr( spot_master_uuid ) + 'No spot instances currently available, will retry in 5 minutes')
            # At this point, the SpotMasterMessageSubmitBatch message is deliberately not
            # deleted; changing its visibility timeout to 5 minutes leaves it on the queue
            # to be processed again later
            message.change_visibility( (5*60) )

    except StandardError as e:
        logger.error( str(e) )
        logger.error( traceback.format_exc() )