def delete_service_by_challenge_pk(challenge):
    """
    Deletes the workers service of a challenge.

    Before deleting, it scales down the number of workers in the service to 0,
    then proceeds to delete the service.

    Parameters:
    challenge (<class 'challenges.models.Challenge'>): The challenge object
        whose service and task definition are being deleted.

    Returns:
    dict: The response returned by the delete_service method from boto3
    """
    client = get_boto3_client("ecs", aws_keys)
    queue_name = challenge.queue
    service_name = "{}_service".format(queue_name)
    # Build the arguments as a plain dict rather than formatting and
    # eval()-ing a string template, which is fragile and unsafe.
    kwargs = {
        "cluster": COMMON_SETTINGS_DICT["CLUSTER"],
        "service": service_name,
        "force": True,
    }
    try:
        if challenge.workers != 0:
            # Scale the service down to zero running tasks before deleting it.
            response = update_service_by_challenge_pk(client, challenge, 0, False)
            if response["ResponseMetadata"]["HTTPStatusCode"] != HTTPStatus.OK:
                return response
        response = client.delete_service(**kwargs)
        if response["ResponseMetadata"]["HTTPStatusCode"] == HTTPStatus.OK:
            challenge.workers = None
            challenge.save()
            client.deregister_task_definition(taskDefinition=challenge.task_def_arn)
            challenge.task_def_arn = ""
            challenge.save()
        return response
    except ClientError as e:
        logger.exception(e)
        return e.response

def scale_workers(queryset, num_of_tasks):
    """
    The function called by the admin action method to scale all the selected
    workers. Calls the service_manager method. Before calling, checks if the
    target scaling number differs from the current one.

    Parameters:
    queryset (<class 'django.db.models.query.QuerySet'>): The queryset of selected challenges in the django admin page.
    num_of_tasks (int): The number of workers to scale each challenge's service to.

    Returns:
    dict: keys-> 'count': the number of workers successfully scaled.
                 'failures': a dict of all the failures with their error messages and the challenge pk
    """
    client = get_boto3_client("ecs", aws_keys)
    count = 0
    failures = []
    for challenge in queryset:
        if challenge.workers is None:
            response = "Please start worker(s) before scaling."
            failures.append({"message": response, "challenge_pk": challenge.pk})
            continue
        if num_of_tasks == challenge.workers:
            response = "Please scale to a different number. Challenge has {} worker(s).".format(num_of_tasks)
            failures.append({"message": response, "challenge_pk": challenge.pk})
            continue
        response = service_manager(client, challenge=challenge, num_of_tasks=num_of_tasks)
        if response["ResponseMetadata"]["HTTPStatusCode"] != HTTPStatus.OK:
            failures.append({"message": response["Error"], "challenge_pk": challenge.pk})
            continue
        count += 1
    return {"count": count, "failures": failures}

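# Usage sketch (hypothetical, not part of the original module): wiring
# scale_workers() into a Django admin action. The action name and the way
# "num_of_tasks" is read from the request are illustrative assumptions.
def _example_scale_selected_workers(modeladmin, request, queryset):
    num_of_tasks = int(request.POST.get("num_of_tasks", 1))
    result = scale_workers(queryset, num_of_tasks)
    modeladmin.message_user(
        request,
        "Scaled {} worker service(s); {} failure(s).".format(
            result["count"], len(result["failures"])
        ),
    )
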
def generate_presigned_url(file_key_on_s3, challenge_pk):
    """
    Function to get the presigned url to upload a file to s3

    Arguments:
        file_key_on_s3 {string} -- The S3 key for the file to be uploaded
        challenge_pk {int} -- challenge pk for which credentials are to be fetched

    Returns:
        response_data {dict} -- Dict containing the presigned_url or the error if the request failed
    """
    if settings.DEBUG or settings.TEST:
        return
    try:
        aws_keys = get_aws_credentials_for_challenge(challenge_pk)
        s3 = get_boto3_client("s3", aws_keys)
        response = s3.generate_presigned_url(
            "put_object",
            Params={
                "Bucket": aws_keys["AWS_STORAGE_BUCKET_NAME"],
                "Key": file_key_on_s3,
            },
            ExpiresIn=settings.PRESIGNED_URL_EXPIRY_TIME,
            HttpMethod="PUT",
        )
        response_data = {"presigned_url": response}
        return response_data
    except Exception as e:
        logger.exception(e)
        response_data = {"error": "Could not fetch presigned url."}
        return response_data

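# Usage sketch (hypothetical, not part of the original module): consuming the
# URL returned by generate_presigned_url(). Assumes the third-party
# "requests" package; the file path and challenge pk are illustrative.
def _example_upload_with_presigned_url(local_path, file_key_on_s3, challenge_pk):
    import requests

    response_data = generate_presigned_url(file_key_on_s3, challenge_pk)
    if not response_data or "presigned_url" not in response_data:
        return response_data
    with open(local_path, "rb") as fh:
        # The URL was signed for HTTP PUT, so the upload must use PUT.
        upload = requests.put(response_data["presigned_url"], data=fh)
    upload.raise_for_status()
    return {"status_code": upload.status_code}
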
def create_eks_nodegroup(challenge, cluster_name):
    """
    Creates a nodegroup when an EKS cluster is created by the EvalAI admin

    Arguments:
        challenge {str} -- JSON-serialized challenge instance of the model calling the post hook
        cluster_name {str} -- name of eks cluster
    """
    for obj in serializers.deserialize("json", challenge):
        challenge_obj = obj.object
    nodegroup_name = "{0}-nodegroup".format(challenge_obj.title.replace(" ", "-"))
    client = get_boto3_client("eks", aws_keys)
    # TODO: Move the hardcoded cluster configuration such as the
    # instance_type, subnets, AMI to challenge configuration later.
    try:
        client.create_nodegroup(
            clusterName=cluster_name,
            nodegroupName=nodegroup_name,
            scalingConfig={"minSize": 1, "maxSize": 10, "desiredSize": 1},
            diskSize=100,
            subnets=[VPC_DICT["SUBNET_1"], VPC_DICT["SUBNET_2"]],
            instanceTypes=["g4dn.xlarge"],
            amiType="AL2_x86_64_GPU",
            nodeRole=settings.EKS_NODEGROUP_ROLE_ARN,
        )
    except ClientError as e:
        logger.exception(e)
        # "response" would be unbound if create_nodegroup raised, so bail out
        # here instead of returning it.
        return
    waiter = client.get_waiter("nodegroup_active")
    waiter.wait(clusterName=cluster_name, nodegroupName=nodegroup_name)

def complete_s3_multipart_file_upload(parts, upload_id, file_key_on_s3, challenge_pk):
    """
    Function to complete the multipart upload of s3 files using presigned urls

    Arguments:
        parts {List} -- List of S3 ETag and PartNumber for each uploaded chunk
        upload_id {string} -- Unique upload id for multipart file upload
        file_key_on_s3 {string} -- The S3 key for the file to be uploaded
        challenge_pk {int} -- challenge pk for which credentials are to be fetched

    Returns:
        response_data {dict} -- Dict containing the S3 response for the completed
            upload or the error if the request failed
    """
    if settings.DEBUG:
        return
    response_data = {}
    try:
        aws_keys = get_aws_credentials_for_challenge(challenge_pk)
        s3 = get_boto3_client("s3", aws_keys)
        response_data = s3.complete_multipart_upload(
            Bucket=aws_keys["AWS_STORAGE_BUCKET_NAME"],
            Key=file_key_on_s3,
            MultipartUpload={"Parts": parts},
            UploadId=upload_id,
        )
    except Exception as e:
        logger.exception(e)
        response_data = {"error": "Could not complete the multipart upload."}
    return response_data

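# Shape of the "parts" argument (illustrative values): one ETag/PartNumber
# pair per uploaded chunk, in ascending part order, exactly as S3 expects in
# CompleteMultipartUpload. The ETag is returned by S3 when each part is PUT.
#
# parts = [
#     {"ETag": '"9b2cf535f27731c974343645a3985328"', "PartNumber": 1},
#     {"ETag": '"6f5902ac237024bdd0c176cb93063dc4"', "PartNumber": 2},
# ]
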
def get_logs_from_cloudwatch(log_group_name, log_stream_prefix, start_time, end_time, pattern):
    """
    To fetch logs of a container from cloudwatch within a specific time frame.
    """
    client = get_boto3_client("logs", aws_keys)
    logs = []
    if settings.DEBUG:
        logs = [
            "The worker logs in the development environment are available on the terminal. Please use `docker-compose logs -f worker` to view the logs."
        ]
    else:
        try:
            response = client.filter_log_events(
                logGroupName=log_group_name,
                logStreamNamePrefix=log_stream_prefix,
                startTime=start_time,
                endTime=end_time,
                filterPattern=pattern,
            )
            for event in response["events"]:
                logs.append(event["message"])
        except Exception as e:
            # Only botocore's ClientError carries a "response" attribute;
            # guard the lookup so other exception types don't raise
            # AttributeError here.
            error_code = getattr(e, "response", {}).get("Error", {}).get("Code")
            if error_code == "ResourceNotFoundException":
                return logs
            logger.exception(e)
            return [
                f"There is an error in displaying logs. Please find the full error traceback here {e}"
            ]
    return logs

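# Usage sketch (hypothetical log group name and stream prefix):
# filter_log_events expects start/end times in milliseconds since the Unix
# epoch, so the caller must convert accordingly.
def _example_fetch_last_hour_of_logs():
    import time

    end_time = int(time.time() * 1000)
    start_time = end_time - 60 * 60 * 1000  # one hour ago
    return get_logs_from_cloudwatch(
        log_group_name="challenge-pk-1-workers",  # hypothetical name
        log_stream_prefix="worker",  # hypothetical prefix
        start_time=start_time,
        end_time=end_time,
        pattern="",  # empty pattern matches all events
    )
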
def start_workers(queryset):
    """
    The function called by the admin action method to start all the selected
    workers. Calls the service_manager method. Before calling, checks if all
    the workers are inactive.

    Parameters:
    queryset (<class 'django.db.models.query.QuerySet'>): The queryset of selected challenges in the django admin page.

    Returns:
    dict: keys-> 'count': the number of workers successfully started.
                 'failures': a dict of all the failures with their error messages and the challenge pk
    """
    client = get_boto3_client("ecs", aws_keys)
    count = 0
    failures = []
    for challenge in queryset:
        if (challenge.workers == 0) or (challenge.workers is None):
            response = service_manager(client, challenge=challenge, num_of_tasks=1)
            if response["ResponseMetadata"]["HTTPStatusCode"] != HTTPStatus.OK:
                failures.append({
                    "message": response["Error"],
                    "challenge_pk": challenge.pk,
                })
                continue
            count += 1
        else:
            response = "Please select challenges with inactive workers only."
            failures.append({
                "message": response,
                "challenge_pk": challenge.pk,
            })
    return {"count": count, "failures": failures}

def restart_workers(queryset):
    """
    The function called by the admin action method to restart all the selected
    workers. Calls the service_manager method with force_new_deployment=True.
    Before calling, verifies that the challenge worker(s) is(are) active.

    Parameters:
    queryset (<class 'django.db.models.query.QuerySet'>): The queryset of selected challenges in the django admin page.

    Returns:
    dict: keys-> 'count': the number of workers successfully restarted.
                 'failures': a dict of all the failures with their error messages and the challenge pk
    """
    client = get_boto3_client("ecs", aws_keys)
    count = 0
    failures = []
    for challenge in queryset:
        if (challenge.workers is not None) and (challenge.workers > 0):
            response = service_manager(
                client,
                challenge,
                num_of_tasks=challenge.workers,
                force_new_deployment=True,
            )
            if response["ResponseMetadata"]["HTTPStatusCode"] != HTTPStatus.OK:
                failures.append({"message": response["Error"], "challenge_pk": challenge.pk})
                continue
            count += 1
        else:
            response = "Please select challenges with active workers only."
            failures.append({"message": response, "challenge_pk": challenge.pk})
    return {"count": count, "failures": failures}

def create_eks_nodegroup(challenge, cluster_name):
    """
    Creates a nodegroup when an EKS cluster is created by the EvalAI admin

    Arguments:
        challenge {str} -- JSON-serialized challenge instance of the model calling the post hook
        cluster_name {str} -- name of eks cluster
    """
    from .utils import get_aws_credentials_for_challenge

    for obj in serializers.deserialize("json", challenge):
        challenge_obj = obj.object
    environment_suffix = "{}-{}".format(challenge_obj.pk, settings.ENVIRONMENT)
    nodegroup_name = "{}-{}-nodegroup".format(
        challenge_obj.title.replace(" ", "-"), environment_suffix
    )
    challenge_aws_keys = get_aws_credentials_for_challenge(challenge_obj.pk)
    client = get_boto3_client("eks", challenge_aws_keys)
    cluster_meta = get_code_upload_setup_meta_for_challenge(challenge_obj.pk)
    # TODO: Move the hardcoded cluster configuration such as the
    # instance_type, subnets, AMI to challenge configuration later.
    try:
        response = client.create_nodegroup(
            clusterName=cluster_name,
            nodegroupName=nodegroup_name,
            scalingConfig={
                "minSize": challenge_obj.min_worker_instance,
                "maxSize": challenge_obj.max_worker_instance,
                "desiredSize": challenge_obj.desired_worker_instance,
            },
            diskSize=challenge_obj.worker_disk_size,
            subnets=[cluster_meta["SUBNET_1"], cluster_meta["SUBNET_2"]],
            instanceTypes=[challenge_obj.worker_instance_type],
            amiType=challenge_obj.worker_ami_type,
            nodeRole=cluster_meta["EKS_NODEGROUP_ROLE_ARN"],
        )
        logger.info("Nodegroup create: {}".format(response))
    except ClientError as e:
        logger.exception(e)
        return
    waiter = client.get_waiter("nodegroup_active")
    waiter.wait(clusterName=cluster_name, nodegroupName=nodegroup_name)
    construct_and_send_eks_cluster_creation_mail(challenge_obj)
    # starting the code-upload-worker
    client = get_boto3_client("ecs", aws_keys)
    client_token = client_token_generator(challenge_obj.pk)
    create_service_by_challenge_pk(client, challenge_obj, client_token)

def restart_workers(queryset):
    """
    The function called by the admin action method to restart all the selected
    workers. Calls the service_manager method. Before calling, verifies that
    the challenge worker(s) is(are) active.

    Parameters:
    queryset (<class 'django.db.models.query.QuerySet'>): The queryset of selected challenges in the django admin page.

    Returns:
    dict: keys-> 'count': the number of workers successfully restarted.
                 'failures': a dict of all the failures with their error messages and the challenge pk
    """
    if settings.DEBUG:
        failures = []
        for challenge in queryset:
            failures.append(
                {
                    "message": "Workers cannot be restarted on AWS ECS service in development environment",
                    "challenge_pk": challenge.pk,
                }
            )
        return {"count": 0, "failures": failures}
    client = get_boto3_client("ecs", aws_keys)
    count = 0
    failures = []
    for challenge in queryset:
        if challenge.is_docker_based and not challenge.is_static_dataset_code_upload:
            response = "Sorry. This feature is not available for code upload/docker based challenges."
            failures.append({"message": response, "challenge_pk": challenge.pk})
        elif (challenge.workers is not None) and (challenge.workers > 0):
            response = service_manager(
                client,
                challenge=challenge,
                num_of_tasks=challenge.workers,
                force_new_deployment=True,
            )
            if response["ResponseMetadata"]["HTTPStatusCode"] != HTTPStatus.OK:
                failures.append(
                    {"message": response["Error"], "challenge_pk": challenge.pk}
                )
                continue
            count += 1
        else:
            response = "Please select challenges with active workers only."
            failures.append({"message": response, "challenge_pk": challenge.pk})
    return {"count": count, "failures": failures}

def get_signed_url_for_submission_related_file(request):
    """Returns S3 signed URL for a particular file residing on S3 bucket

    Arguments:
        request {object} -- Request object

    Returns:
        Response object -- Response object with appropriate response code (200/400/403/404)
    """
    # Assumption: file will be stored in this format: 'team_{id}/submission_{id}/.../file.log'
    bucket = request.query_params.get("bucket", None)
    key = request.query_params.get("key", None)
    if not bucket or not key:
        response_data = {"error": "key and bucket names can't be empty"}
        return Response(response_data, status=status.HTTP_400_BAD_REQUEST)
    try:
        splits = key.split("/")
        participant_team_id, submission_id = (
            splits[0].replace("team_", ""),
            splits[1].replace("submission_", ""),
        )
    except Exception:
        response_data = {
            "error": "Invalid file path format. Please try again with correct file path format."
        }
        return Response(response_data, status=status.HTTP_400_BAD_REQUEST)
    participant_team = get_participant_team_model(participant_team_id)
    submission = get_submission_model(submission_id)
    challenge_pk = submission.challenge_phase.challenge.pk
    if submission.participant_team != participant_team:
        response_data = {"error": "You are not authorized to access this file."}
        return Response(response_data, status=status.HTTP_403_FORBIDDEN)
    if is_user_part_of_participant_team(
        request.user, participant_team
    ) or is_user_a_host_of_challenge(request.user, challenge_pk):
        aws_keys = get_aws_credentials_for_challenge(challenge_pk)
        s3 = get_boto3_client("s3", aws_keys)
        url = s3.generate_presigned_url(
            ClientMethod="get_object",
            Params={"Bucket": bucket, "Key": key},
        )
        response_data = {"signed_url": url}
        return Response(response_data, status=status.HTTP_200_OK)
    else:
        response_data = {"error": "You are not authorized to access this file."}
        return Response(response_data, status=status.HTTP_403_FORBIDDEN)

def delete_log_group(log_group_name):
    # Log groups only exist on AWS; skip in the development environment.
    if not settings.DEBUG:
        try:
            client = get_boto3_client("logs", aws_keys)
            client.delete_log_group(logGroupName=log_group_name)
        except Exception as e:
            logger.exception(e)

def generate_presigned_url_for_multipart_upload(file_key_on_s3, challenge_pk, num_parts):
    """
    Function to get the presigned urls to upload a file to s3 in chunks

    Arguments:
        file_key_on_s3 {string} -- The S3 key for the file to be uploaded
        challenge_pk {int} -- challenge pk for which credentials are to be fetched
        num_parts {int} -- number of chunks the file will be uploaded in

    Returns:
        response_data {dict} -- Dict containing the presigned_urls and upload_id,
            or the error if the request failed
    """
    if settings.DEBUG:
        return
    response_data = {}
    try:
        aws_keys = get_aws_credentials_for_challenge(challenge_pk)
        s3 = get_boto3_client("s3", aws_keys)
        response = s3.create_multipart_upload(
            Bucket=aws_keys["AWS_STORAGE_BUCKET_NAME"],
            Key=file_key_on_s3,
            ACL="public-read",
        )
        upload_id = response["UploadId"]
        presigned_urls = []
        for part_number in range(1, num_parts + 1):
            presigned_url = s3.generate_presigned_url(
                ClientMethod="upload_part",
                Params={
                    "Bucket": aws_keys["AWS_STORAGE_BUCKET_NAME"],
                    "Key": file_key_on_s3,
                    "UploadId": upload_id,
                    "PartNumber": part_number,
                },
                ExpiresIn=settings.PRESIGNED_URL_EXPIRY_TIME,
            )
            presigned_urls.append({"partNumber": part_number, "url": presigned_url})
        response_data = {
            "presigned_urls": presigned_urls,
            "upload_id": upload_id,
        }
    except Exception as e:
        logger.exception(e)
        response_data = {"error": "Could not fetch presigned urls."}
    return response_data

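# Usage sketch (hypothetical, not part of the original module): uploading a
# local file in chunks through the presigned URLs above and completing the
# upload with the collected ETags. Assumes the third-party "requests"
# package; the 100 MB chunk size is an illustrative choice (S3 requires all
# parts except the last to be at least 5 MB).
def _example_multipart_upload(local_path, file_key_on_s3, challenge_pk, chunk_size=100 * 1024 * 1024):
    import math
    import os

    import requests

    num_parts = math.ceil(os.path.getsize(local_path) / chunk_size)
    data = generate_presigned_url_for_multipart_upload(
        file_key_on_s3, challenge_pk, num_parts
    )
    parts = []
    with open(local_path, "rb") as fh:
        for item in data["presigned_urls"]:
            chunk = fh.read(chunk_size)
            response = requests.put(item["url"], data=chunk)
            response.raise_for_status()
            # S3 returns each part's ETag in a response header; it must be
            # echoed back when completing the upload.
            parts.append(
                {"ETag": response.headers["ETag"], "PartNumber": item["partNumber"]}
            )
    return complete_s3_multipart_file_upload(
        parts, data["upload_id"], file_key_on_s3, challenge_pk
    )
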
def get_or_create_ecr_repository(name, aws_keys):
    """Get or create AWS ECR Repository

    Arguments:
        name {string} -- name of ECR repository

    Keyword Arguments:
        aws_keys {dict} -- AWS keys where the ECR repositories will be created

    Returns:
        tuple -- Contains repository dict and boolean field to represent
            whether ECR repository was created
        Eg: (
                {
                    'repositoryArn': 'arn:aws:ecr:us-east-1:1234567890:repository/some-repository-name',
                    'registryId': '1234567890',
                    'repositoryName': 'some-repository-name',
                    'repositoryUri': '1234567890.dkr.ecr.us-east-1.amazonaws.com/some-repository-name',
                    'createdAt': datetime.datetime(2019, 2, 6, 9, 12, 5, tzinfo=tzlocal())
                },
                False
            )
    """
    repository, created = None, False
    client = get_boto3_client("ecr", aws_keys)
    try:
        response = client.describe_repositories(
            registryId=aws_keys.get("AWS_ACCOUNT_ID"), repositoryNames=[name]
        )
        repository = response["repositories"][0]
    except ClientError as e:
        if e.response["Error"]["Code"] in ("RepositoryNotFoundException", "400"):
            response = client.create_repository(repositoryName=name)
            repository = response["repository"]
            created = True
        else:
            logger.exception(e)
    return (repository, created)

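# Usage sketch (hypothetical team name): the returned repository dict carries
# the "repositoryUri" that participants would push their images to.
#
# repository, created = get_or_create_ecr_repository("participant-team-42", aws_keys)
# docker_repository_uri = repository["repositoryUri"]
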
def create_federated_user(name, repository, aws_keys):
    """Create AWS federated user

    Arguments:
        name {string} -- Name of participant team for which federated user is to be created
        repository {string} -- Name of the AWS ECR repository to which user should be granted permission

    Returns:
        dict -- Dict containing user related credentials such as access_key_id, access_secret etc.
        Eg: {
                'Credentials': {
                    'AccessKeyId': 'ABCDEFGHIJKLMNOPQRTUVWXYZ',
                    'SecretAccessKey': 'NMgBB75gfVBCDEFGHIJK8g00qVyyzQW+4XjJGQALMNOPQRSTUV',
                    'SessionToken': 'FQoGZX.....',
                    'Expiration': datetime.datetime(2019, 2, 7, 5, 43, 58, tzinfo=tzutc())
                },
                'FederatedUser': {
                    'FederatedUserId': '1234567890:test-user',
                    'Arn': 'arn:aws:sts::1234567890:federated-user/test-user'
                },
                'PackedPolicySize': 28,
                'ResponseMetadata': {
                    'RequestId': 'fb47f78b-2a92-11e9-84b9-33527429b818',
                    'HTTPStatusCode': 200,
                    'HTTPHeaders': {
                        'x-amzn-requestid': 'fb47f78b-2a92-11e9-84b9-33527429b818',
                        'content-type': 'text/xml',
                        'content-length': '1245',
                        'date': 'Thu, 07 Feb 2019 04:43:57 GMT'
                    },
                    'RetryAttempts': 0
                }
            }
    """
    AWS_ACCOUNT_ID = aws_keys.get("AWS_ACCOUNT_ID")
    AWS_REGION = aws_keys.get("AWS_REGION")
    policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Action": "ecr:*",
                "Resource": "arn:aws:ecr:{}:{}:repository/{}".format(
                    AWS_REGION, AWS_ACCOUNT_ID, repository
                ),
            },
            {
                "Effect": "Allow",
                "Action": ["ecr:GetAuthorizationToken"],
                "Resource": "*",
            },
        ],
    }
    client = get_boto3_client("sts", aws_keys)
    response = client.get_federation_token(
        Name=convert_to_aws_federated_user_format(name),
        Policy=json.dumps(policy),
        DurationSeconds=43200,
    )
    return response

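# Usage sketch (hypothetical, not part of the original module): exchanging the
# temporary federated credentials for an ECR authorization token, e.g. to run
# "docker login". Assumes boto3 is importable, as elsewhere in this module.
def _example_ecr_login_with_federated_user(name, repository, aws_keys):
    import base64

    import boto3

    federated = create_federated_user(name, repository, aws_keys)
    creds = federated["Credentials"]
    ecr = boto3.client(
        "ecr",
        region_name=aws_keys.get("AWS_REGION"),
        aws_access_key_id=creds["AccessKeyId"],
        aws_secret_access_key=creds["SecretAccessKey"],
        aws_session_token=creds["SessionToken"],
    )
    auth = ecr.get_authorization_token()["authorizationData"][0]
    # The token decodes to "AWS:<password>"; the username is always "AWS".
    username, password = base64.b64decode(auth["authorizationToken"]).decode().split(":")
    return username, password, auth["proxyEndpoint"]
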
def create_eks_cluster(challenge):
    """
    Called when Challenge is approved by the EvalAI admin;
    calls the create_eks_nodegroup function

    Arguments:
        challenge {str} -- JSON-serialized challenge instance of the model calling the post hook
    """
    from .models import ChallengeEvaluationCluster

    for obj in serializers.deserialize("json", challenge):
        challenge_obj = obj.object
    cluster_name = "{0}-cluster".format(challenge_obj.title.replace(" ", "-"))
    if challenge_obj.approved_by_admin and challenge_obj.is_docker_based:
        client = get_boto3_client("eks", aws_keys)
        try:
            response = client.create_cluster(
                name=cluster_name,
                version="1.15",
                roleArn=settings.EKS_CLUSTER_ROLE_ARN,
                resourcesVpcConfig={
                    "subnetIds": [VPC_DICT["SUBNET_1"], VPC_DICT["SUBNET_2"]],
                    "securityGroupIds": [VPC_DICT["SUBNET_SECURITY_GROUP"]],
                },
            )
            waiter = client.get_waiter("cluster_active")
            waiter.wait(name=cluster_name)
            # creating kubeconfig
            cluster = client.describe_cluster(name=cluster_name)
            cluster_cert = cluster["cluster"]["certificateAuthority"]["data"]
            cluster_ep = cluster["cluster"]["endpoint"]
            cluster_config = {
                "apiVersion": "v1",
                "kind": "Config",
                "clusters": [
                    {
                        "cluster": {
                            "server": str(cluster_ep),
                            "certificate-authority-data": str(cluster_cert),
                        },
                        "name": "kubernetes",
                    }
                ],
                "contexts": [
                    {
                        "context": {"cluster": "kubernetes", "user": "******"},
                        "name": "aws",
                    }
                ],
                "current-context": "aws",
                "preferences": {},
                "users": [
                    {
                        "name": "aws",
                        "user": {
                            "exec": {
                                "apiVersion": "client.authentication.k8s.io/v1alpha1",
                                "command": "heptio-authenticator-aws",
                                "args": ["token", "-i", cluster_name],
                            }
                        },
                    }
                ],
            }
            # Write in YAML.
            config_text = yaml.dump(cluster_config, default_flow_style=False)
            config_file = NamedTemporaryFile(delete=True)
            config_file.write(config_text.encode())
            ChallengeEvaluationCluster.objects.create(
                challenge=challenge_obj,
                name=cluster_name,
                cluster_endpoint=cluster_ep,
                cluster_ssl=cluster_cert,
            )
            # Creating nodegroup
            create_eks_nodegroup.delay(challenge, cluster_name)
            return response
        except ClientError as e:
            logger.exception(e)
            return

def create_eks_cluster_subnets(challenge):
    """
    Creates the VPC, subnets, and related networking resources for the
    challenge's EKS cluster

    Arguments:
        challenge {str} -- JSON-serialized challenge instance of the model calling the post hook
    """
    from .models import ChallengeEvaluationCluster
    from .serializers import ChallengeEvaluationClusterSerializer
    from .utils import get_aws_credentials_for_challenge

    for obj in serializers.deserialize("json", challenge):
        challenge_obj = obj.object
    challenge_aws_keys = get_aws_credentials_for_challenge(challenge_obj.pk)
    environment_suffix = "{}-{}".format(challenge_obj.pk, settings.ENVIRONMENT)
    client = get_boto3_client("ec2", challenge_aws_keys)
    vpc_ids = []
    try:
        response = client.create_vpc(CidrBlock=challenge_obj.vpc_cidr)
        vpc_ids.append(response["Vpc"]["VpcId"])
    except ClientError as e:
        logger.exception(e)
        return
    waiter = client.get_waiter("vpc_available")
    waiter.wait(VpcIds=vpc_ids)
    # Create internet gateway and attach to vpc
    try:
        # Enable DNS resolution for VPC
        response = client.modify_vpc_attribute(
            EnableDnsHostnames={"Value": True}, VpcId=vpc_ids[0]
        )
        response = client.create_internet_gateway()
        internet_gateway_id = response["InternetGateway"]["InternetGatewayId"]
        client.attach_internet_gateway(
            InternetGatewayId=internet_gateway_id, VpcId=vpc_ids[0]
        )
        # Create and attach route table
        response = client.create_route_table(VpcId=vpc_ids[0])
        route_table_id = response["RouteTable"]["RouteTableId"]
        client.create_route(
            DestinationCidrBlock="0.0.0.0/0",
            GatewayId=internet_gateway_id,
            RouteTableId=route_table_id,
        )
        # Create subnets
        subnet_ids = []
        response = client.create_subnet(
            CidrBlock=challenge_obj.subnet_1_cidr,
            AvailabilityZone="us-east-1a",
            VpcId=vpc_ids[0],
        )
        subnet_1_id = response["Subnet"]["SubnetId"]
        subnet_ids.append(subnet_1_id)
        response = client.create_subnet(
            CidrBlock=challenge_obj.subnet_2_cidr,
            AvailabilityZone="us-east-1b",
            VpcId=vpc_ids[0],
        )
        subnet_2_id = response["Subnet"]["SubnetId"]
        subnet_ids.append(subnet_2_id)
        waiter = client.get_waiter("subnet_available")
        waiter.wait(SubnetIds=subnet_ids)
        # Creating managed node group needs subnets to auto-assign IPv4
        for subnet_id in subnet_ids:
            response = client.modify_subnet_attribute(
                MapPublicIpOnLaunch={"Value": True},
                SubnetId=subnet_id,
            )
        # Associate route table with subnets
        response = client.associate_route_table(
            RouteTableId=route_table_id, SubnetId=subnet_1_id
        )
        response = client.associate_route_table(
            RouteTableId=route_table_id, SubnetId=subnet_2_id
        )
        # Create security group
        response = client.create_security_group(
            GroupName="EvalAI code upload challenge",
            Description="EvalAI code upload challenge worker group",
            VpcId=vpc_ids[0],
        )
        security_group_id = response["GroupId"]
        response = client.create_security_group(
            GroupName="evalai-code-upload-challenge-efs-{}".format(environment_suffix),
            Description="EKS nodegroup EFS",
            VpcId=vpc_ids[0],
        )
        efs_security_group_id = response["GroupId"]
        # Allow NFS traffic (port 2049) from within the VPC
        response = client.authorize_security_group_ingress(
            GroupId=efs_security_group_id,
            IpPermissions=[
                {
                    "FromPort": 2049,
                    "IpProtocol": "tcp",
                    "IpRanges": [{"CidrIp": challenge_obj.vpc_cidr}],
                    "ToPort": 2049,
                }
            ],
        )
        # Create EFS
        efs_client = get_boto3_client("efs", challenge_aws_keys)
        efs_creation_token = str(uuid.uuid4())[:64]
        response = efs_client.create_file_system(
            CreationToken=efs_creation_token,
        )
        efs_id = response["FileSystemId"]
        challenge_evaluation_cluster = ChallengeEvaluationCluster.objects.get(
            challenge=challenge_obj
        )
        serializer = ChallengeEvaluationClusterSerializer(
            challenge_evaluation_cluster,
            data={
                "vpc_id": vpc_ids[0],
                "internet_gateway_id": internet_gateway_id,
                "route_table_id": route_table_id,
                "security_group_id": security_group_id,
                "subnet_1_id": subnet_1_id,
                "subnet_2_id": subnet_2_id,
                "efs_security_group_id": efs_security_group_id,
                "efs_id": efs_id,
                "efs_creation_token": efs_creation_token,
            },
            partial=True,
        )
        if serializer.is_valid():
            serializer.save()
        # Create eks cluster
        create_eks_cluster.delay(challenge)
    except ClientError as e:
        logger.exception(e)
        return

def create_eks_cluster(challenge):
    """
    Called when Challenge is approved by the EvalAI admin;
    calls the create_eks_nodegroup function

    Arguments:
        challenge {str} -- JSON-serialized challenge instance of the model calling the post hook
    """
    from .models import ChallengeEvaluationCluster
    from .serializers import ChallengeEvaluationClusterSerializer
    from .utils import get_aws_credentials_for_challenge

    for obj in serializers.deserialize("json", challenge):
        challenge_obj = obj.object
    environment_suffix = "{}-{}".format(challenge_obj.pk, settings.ENVIRONMENT)
    cluster_name = "{}-{}-cluster".format(
        challenge_obj.title.replace(" ", "-"), environment_suffix
    )
    if challenge_obj.approved_by_admin and challenge_obj.is_docker_based:
        challenge_aws_keys = get_aws_credentials_for_challenge(challenge_obj.pk)
        client = get_boto3_client("eks", challenge_aws_keys)
        cluster_meta = get_code_upload_setup_meta_for_challenge(challenge_obj.pk)
        try:
            response = client.create_cluster(
                name=cluster_name,
                version="1.16",
                roleArn=cluster_meta["EKS_CLUSTER_ROLE_ARN"],
                resourcesVpcConfig={
                    "subnetIds": [
                        cluster_meta["SUBNET_1"],
                        cluster_meta["SUBNET_2"],
                    ],
                    "securityGroupIds": [cluster_meta["SUBNET_SECURITY_GROUP"]],
                },
            )
            waiter = client.get_waiter("cluster_active")
            waiter.wait(name=cluster_name)
            # creating kubeconfig
            cluster = client.describe_cluster(name=cluster_name)
            cluster_cert = cluster["cluster"]["certificateAuthority"]["data"]
            cluster_ep = cluster["cluster"]["endpoint"]
            cluster_config = {
                "apiVersion": "v1",
                "kind": "Config",
                "clusters": [
                    {
                        "cluster": {
                            "server": str(cluster_ep),
                            "certificate-authority-data": str(cluster_cert),
                        },
                        "name": "kubernetes",
                    }
                ],
                "contexts": [
                    {
                        "context": {"cluster": "kubernetes", "user": "******"},
                        "name": "aws",
                    }
                ],
                "current-context": "aws",
                "preferences": {},
                "users": [
                    {
                        "name": "aws",
                        "user": {
                            "exec": {
                                "apiVersion": "client.authentication.k8s.io/v1alpha1",
                                "command": "heptio-authenticator-aws",
                                "args": ["token", "-i", cluster_name],
                            }
                        },
                    }
                ],
            }
            # Write in YAML.
            config_text = yaml.dump(cluster_config, default_flow_style=False)
            config_file = NamedTemporaryFile(delete=True)
            config_file.write(config_text.encode())
            challenge_evaluation_cluster = ChallengeEvaluationCluster.objects.get(
                challenge=challenge_obj
            )
            efs_client = get_boto3_client("efs", challenge_aws_keys)
            # Create mount targets for subnets
            mount_target_ids = []
            response = efs_client.create_mount_target(
                FileSystemId=challenge_evaluation_cluster.efs_id,
                SubnetId=challenge_evaluation_cluster.subnet_1_id,
                SecurityGroups=[challenge_evaluation_cluster.efs_security_group_id],
            )
            mount_target_ids.append(response["MountTargetId"])
            response = efs_client.create_mount_target(
                FileSystemId=challenge_evaluation_cluster.efs_id,
                SubnetId=challenge_evaluation_cluster.subnet_2_id,
                SecurityGroups=[challenge_evaluation_cluster.efs_security_group_id],
            )
            mount_target_ids.append(response["MountTargetId"])
            serializer = ChallengeEvaluationClusterSerializer(
                challenge_evaluation_cluster,
                data={
                    "name": cluster_name,
                    "cluster_endpoint": cluster_ep,
                    "cluster_ssl": cluster_cert,
                    "efs_mount_target_ids": mount_target_ids,
                },
                partial=True,
            )
            if serializer.is_valid():
                serializer.save()
            # Creating nodegroup
            create_eks_nodegroup.delay(challenge, cluster_name)
            return response
        except ClientError as e:
            logger.exception(e)
            return

def setup_eks_cluster(challenge):
    """
    Creates EKS and NodeGroup ARN roles

    Arguments:
        challenge {str} -- JSON-serialized challenge instance of the model calling the post hook
    """
    from .models import ChallengeEvaluationCluster
    from .serializers import ChallengeEvaluationClusterSerializer
    from .utils import get_aws_credentials_for_challenge

    for obj in serializers.deserialize("json", challenge):
        challenge_obj = obj.object
    challenge_aws_keys = get_aws_credentials_for_challenge(challenge_obj.pk)
    client = get_boto3_client("iam", challenge_aws_keys)
    environment_suffix = "{}-{}".format(challenge_obj.pk, settings.ENVIRONMENT)
    eks_role_name = "evalai-code-upload-eks-role-{}".format(environment_suffix)
    eks_arn_role = None
    try:
        response = client.create_role(
            RoleName=eks_role_name,
            Description="Amazon EKS cluster role with managed policy",
            AssumeRolePolicyDocument=json.dumps(settings.EKS_CLUSTER_TRUST_RELATION),
        )
        eks_arn_role = response["Role"]["Arn"]
    except ClientError as e:
        logger.exception(e)
        return
    waiter = client.get_waiter("role_exists")
    waiter.wait(RoleName=eks_role_name)
    try:
        # Attach AWS managed EKS cluster policy to the role
        response = client.attach_role_policy(
            RoleName=eks_role_name,
            PolicyArn=settings.EKS_CLUSTER_POLICY,
        )
    except ClientError as e:
        logger.exception(e)
        return
    node_group_role_name = "evalai-code-upload-nodegroup-role-{}".format(
        environment_suffix
    )
    node_group_arn_role = None
    try:
        response = client.create_role(
            RoleName=node_group_role_name,
            Description="Amazon EKS node group role with managed policy",
            AssumeRolePolicyDocument=json.dumps(
                settings.EKS_NODE_GROUP_TRUST_RELATION
            ),
        )
        node_group_arn_role = response["Role"]["Arn"]
    except ClientError as e:
        logger.exception(e)
        return
    waiter = client.get_waiter("role_exists")
    waiter.wait(RoleName=node_group_role_name)
    task_execution_policies = settings.EKS_NODE_GROUP_POLICIES
    for policy_arn in task_execution_policies:
        try:
            # Attach AWS managed EKS worker node policy to the role
            response = client.attach_role_policy(
                RoleName=node_group_role_name,
                PolicyArn=policy_arn,
            )
        except ClientError as e:
            logger.exception(e)
            return
    # Create custom ECR all access policy and attach to node_group_role
    ecr_all_access_policy_name = "AWS-ECR-Full-Access-{}".format(environment_suffix)
    ecr_all_access_policy_arn = None
    try:
        response = client.create_policy(
            PolicyName=ecr_all_access_policy_name,
            PolicyDocument=json.dumps(settings.ECR_ALL_ACCESS_POLICY_DOCUMENT),
        )
        ecr_all_access_policy_arn = response["Policy"]["Arn"]
        waiter = client.get_waiter("policy_exists")
        waiter.wait(PolicyArn=ecr_all_access_policy_arn)
        # Attach custom ECR policy
        response = client.attach_role_policy(
            RoleName=node_group_role_name, PolicyArn=ecr_all_access_policy_arn
        )
    except ClientError as e:
        logger.exception(e)
        return
    try:
        challenge_evaluation_cluster = ChallengeEvaluationCluster.objects.get(
            challenge=challenge_obj
        )
        serializer = ChallengeEvaluationClusterSerializer(
            challenge_evaluation_cluster,
            data={
                "eks_arn_role": eks_arn_role,
                "node_group_arn_role": node_group_arn_role,
                "ecr_all_access_policy_arn": ecr_all_access_policy_arn,
            },
            partial=True,
        )
        if serializer.is_valid():
            serializer.save()
        # Create eks cluster vpc and subnets
        create_eks_cluster_subnets.delay(challenge)
    except Exception as e:
        logger.exception(e)
        return