def main():
    killer = GracefulKiller()
    evalai = EvalAI_Interface(
        AUTH_TOKEN=AUTH_TOKEN,
        EVALAI_API_SERVER=EVALAI_API_SERVER,
        QUEUE_NAME=QUEUE_NAME,
    )
    challenge = evalai.get_challenge_by_queue_name()
    logger.info("Deploying Worker for {}".format(challenge["title"]))

    # Resolve the EKS cluster backing this challenge and prepare the
    # Kubernetes API clients.
    cluster_details = evalai.get_aws_eks_cluster_details(challenge.get("id"))
    cluster_name = cluster_details.get("name")
    cluster_endpoint = cluster_details.get("cluster_endpoint")
    api_instance = get_api_client(cluster_name, cluster_endpoint, challenge, evalai)
    install_gpu_drivers(api_instance)

    while True:
        # Poll the SQS queue for new submission messages.
        message = evalai.get_message_from_sqs_queue()
        message_body = message.get("body")
        if message_body:
            submission_pk = message_body.get("submission_pk")
            challenge_pk = message_body.get("challenge_pk")
            phase_pk = message_body.get("phase_pk")
            submission = evalai.get_submission_by_pk(submission_pk)
            if submission:
                # Recreate the Kubernetes API clients for this message.
                api_instance = get_api_object(
                    cluster_name, cluster_endpoint, challenge, evalai
                )
                core_v1_api_instance = get_core_v1_api_object(
                    cluster_name, cluster_endpoint, challenge, evalai
                )
                if submission.get("status") in ("finished", "failed", "cancelled"):
                    # Fetch the last job name from the list as it is the
                    # latest running job.
                    job_name = submission.get("job_name")[-1]
                    delete_job(api_instance, job_name)
                    message_receipt_handle = message.get("receipt_handle")
                    evalai.delete_message_from_sqs_queue(message_receipt_handle)
                elif submission.get("status") == "running":
                    job_name = submission.get("job_name")[-1]
                    update_failed_jobs_and_send_logs(
                        api_instance,
                        core_v1_api_instance,
                        evalai,
                        job_name,
                        submission_pk,
                        challenge_pk,
                        phase_pk,
                    )
                else:
                    logger.info("Processing message body: {0}".format(message_body))
                    challenge_phase = evalai.get_challenge_phase_by_pk(
                        challenge_pk, phase_pk
                    )
                    process_submission_callback(
                        api_instance, message_body, challenge_phase, evalai
                    )
        if killer.kill_now:
            break
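# All three variants of main() rely on a GracefulKiller helper checked via
# killer.kill_now. The class below is a minimal sketch of the usual
# signal-trap pattern it is assumed to follow; the actual implementation in
# the worker module may differ.
import signal


class GracefulKiller:
    kill_now = False

    def __init__(self):
        # Trap SIGINT and SIGTERM so the polling loop can finish the current
        # message before the worker exits.
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True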
def main():
    killer = GracefulKiller()
    evalai = EvalAI_Interface(
        AUTH_TOKEN=AUTH_TOKEN,
        EVALAI_API_SERVER=EVALAI_API_SERVER,
        QUEUE_NAME=QUEUE_NAME,
    )
    logger.info(
        "Deploying Worker for {}".format(
            evalai.get_challenge_by_queue_name()["title"]
        )
    )

    while True:
        logger.info("Fetching new messages from the queue {}".format(QUEUE_NAME))
        message = evalai.get_message_from_sqs_queue()
        message_body = message.get("body")
        if message_body:
            submission_pk = message_body.get("submission_pk")
            submission = evalai.get_submission_by_pk(submission_pk)
            if submission:
                if submission.get("status") in ("finished", "failed"):
                    # Fetch the last job name from the list as it is the
                    # latest running job.
                    job_name = submission.get("job_name")[-1]
                    # batch_v1 is expected to be a module-level Kubernetes
                    # BatchV1Api client defined alongside this function.
                    delete_job(batch_v1, job_name)
                    message_receipt_handle = message.get("receipt_handle")
                    evalai.delete_message_from_sqs_queue(message_receipt_handle)
                elif submission.get("status") == "running":
                    # Leave the message on the queue while the job is running.
                    continue
                else:
                    logger.info("Processing message body: {0}".format(message_body))
                    process_submission_callback(message_body, evalai)
        if killer.kill_now:
            break
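# This variant passes a module-level batch_v1 client to delete_job(). Below is
# a minimal sketch of how such a client is typically constructed with the
# official kubernetes Python client; the in-cluster/kubeconfig fallback shown
# here is an assumption, not necessarily how the worker configures it.
from kubernetes import client, config

try:
    # Inside a pod, credentials come from the mounted service account.
    config.load_incluster_config()
except config.ConfigException:
    # Outside a cluster, fall back to the local kubeconfig file.
    config.load_kube_config()

batch_v1 = client.BatchV1Api()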
def main():
    killer = GracefulKiller()
    evalai = EvalAI_Interface(
        AUTH_TOKEN=AUTH_TOKEN,
        EVALAI_API_SERVER=EVALAI_API_SERVER,
        QUEUE_NAME=QUEUE_NAME,
    )
    challenge = evalai.get_challenge_by_queue_name()
    logger.info("Deploying Worker for {}".format(challenge["title"]))

    # Resolve the EKS cluster backing this challenge and set up the
    # Kubernetes API clients.
    cluster_details = evalai.get_aws_eks_cluster_details(challenge.get("id"))
    cluster_name = cluster_details.get("name")
    cluster_endpoint = cluster_details.get("cluster_endpoint")
    api_instance_client = get_api_client(
        cluster_name, cluster_endpoint, challenge, evalai
    )
    install_gpu_drivers(api_instance_client)
    api_instance = get_api_object(cluster_name, cluster_endpoint, challenge, evalai)
    core_v1_api_instance = get_core_v1_api_object(
        cluster_name, cluster_endpoint, challenge, evalai
    )

    if challenge.get("is_static_dataset_code_upload"):
        # Create and mount the script volume. script_config_map_name is
        # expected to be defined at module level.
        script_config_map = create_script_config_map(script_config_map_name)
        create_configmap(core_v1_api_instance, script_config_map)

    submission_meta = {
        "submission_time_limit": challenge.get("submission_time_limit")
    }

    while True:
        time.sleep(2)
        message = evalai.get_message_from_sqs_queue()
        message_body = message.get("body")
        if message_body:
            # For static dataset code-upload challenges, only handle messages
            # flagged as code-upload submissions; defer the rest.
            if challenge.get("is_static_dataset_code_upload") and not message_body.get(
                "is_static_dataset_code_upload_submission"
            ):
                time.sleep(35)
                continue
            # Refresh the Kubernetes API clients for each message.
            api_instance = get_api_object(
                cluster_name, cluster_endpoint, challenge, evalai
            )
            core_v1_api_instance = get_core_v1_api_object(
                cluster_name, cluster_endpoint, challenge, evalai
            )
            message_body["submission_meta"] = submission_meta
            submission_pk = message_body.get("submission_pk")
            challenge_pk = message_body.get("challenge_pk")
            phase_pk = message_body.get("phase_pk")
            submission = evalai.get_submission_by_pk(submission_pk)
            if submission:
                if submission.get("status") in ("finished", "failed", "cancelled"):
                    # Resolve the receipt handle before the try block so it is
                    # also available in the exception handler.
                    message_receipt_handle = message.get("receipt_handle")
                    try:
                        # Fetch the last job name from the list as it is the
                        # latest running job.
                        job_name = submission.get("job_name")[-1]
                        delete_job(api_instance, job_name)
                        evalai.delete_message_from_sqs_queue(message_receipt_handle)
                    except Exception as e:
                        logger.exception(
                            "Failed to delete submission job: {}".format(e)
                        )
                        # Delete the message from the SQS queue to avoid
                        # re-triggering the job delete.
                        evalai.delete_message_from_sqs_queue(message_receipt_handle)
                elif submission.get("status") == "running":
                    job_name = submission.get("job_name")[-1]
                    update_failed_jobs_and_send_logs(
                        api_instance,
                        core_v1_api_instance,
                        evalai,
                        job_name,
                        submission_pk,
                        challenge_pk,
                        phase_pk,
                        message,
                    )
                else:
                    logger.info("Processing message body: {0}".format(message_body))
                    challenge_phase = evalai.get_challenge_phase_by_pk(
                        challenge_pk, phase_pk
                    )
                    process_submission_callback(
                        api_instance, message_body, challenge_phase, evalai
                    )
        if killer.kill_now:
            break
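# A minimal sketch of how a worker module like this is typically started; the
# exact log message and any restart/retry wrapper around main() are
# assumptions, not taken from the source.
if __name__ == "__main__":
    main()
    logger.info("Quitting Submission Worker.")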