def add_asg_fleet(self, scope: BaseApp, cluster: Cluster, fleet) -> List[AutoScalingGroup]:
    created_fleets: List[AutoScalingGroup] = []

    node_labels = fleet.get('nodeLabels', {})
    node_labels["fleetName"] = fleet.get('name')
    node_labels_as_str = ','.join(map('='.join, node_labels.items()))

    # Source of tweaks: https://kubedex.com/90-days-of-aws-eks-in-production/
    kubelet_extra_args = ' '.join([
        # Add node labels
        f'--node-labels {node_labels_as_str}' if len(node_labels_as_str) else '',

        # Reserve resources for Kubernetes system daemons like the kubelet, container runtime,
        # node problem detector, etc.
        '--kube-reserved cpu=250m,memory=1Gi,ephemeral-storage=1Gi',

        # Reserve resources for vital system functions, such as sshd, udev.
        '--system-reserved cpu=250m,memory=0.2Gi,ephemeral-storage=1Gi',

        # Start evicting pods from this node once these thresholds are crossed.
        '--eviction-hard memory.available<0.2Gi,nodefs.available<10%',
    ])

    cluster_sg = SecurityGroup.from_security_group_id(
        self,
        'eks-cluster-sg',
        security_group_id=cluster.cluster_security_group_id)

    asg_tags = {
        "k8s.io/cluster-autoscaler/enabled": "true",
        f"k8s.io/cluster-autoscaler/{cluster.cluster_name}": "owned",
    }

    # To autoscale the cluster correctly, the auto scaling groups must not span availability
    # zones (otherwise AZ rebalancing can terminate instances unexpectedly), so we create
    # one ASG per subnet.
    for counter, subnet in enumerate(cluster.vpc.private_subnets):
        asg: AutoScalingGroup = cluster.add_capacity(
            id=scope.prefixed_str(f'{fleet.get("name")}-{counter}'),
            instance_type=InstanceType(fleet.get('instanceType')),
            min_capacity=fleet.get('autoscaling', {}).get('minInstances'),
            max_capacity=fleet.get('autoscaling', {}).get('maxInstances'),
            bootstrap_options=BootstrapOptions(
                kubelet_extra_args=kubelet_extra_args,
            ),
            spot_price=str(fleet.get('spotPrice')) if fleet.get('spotPrice') else None,
            vpc_subnets=SubnetSelection(subnets=[subnet]),
        )
        created_fleets.append(asg)
        self._add_userdata_production_tweaks(asg)

        for key, value in asg_tags.items():
            Tag.add(asg, key, value)

    return created_fleets
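For reference, this is the shape of the fleet dictionary that add_asg_fleet expects, inferred from the keys it reads; the concrete values below are made up for illustration.

# Hypothetical fleet definition; only the keys are implied by add_asg_fleet above.
example_fleet = {
    'name': 'workers',                                      # becomes the fleetName node label
    'nodeLabels': {'role': 'worker'},                       # extra labels passed via --node-labels
    'instanceType': 'm5.large',
    'autoscaling': {'minInstances': 1, 'maxInstances': 10},
    'spotPrice': 0.1,                                       # omit or set to None for on-demand instances
}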
def get_file_system(scope: Construct) -> FileSystem:
    config = get_volume_config()
    stack_name = config.stack_name
    security_group = SecurityGroup.from_security_group_id(
        scope, 'nfs_security_group',
        security_group_id=Fn.import_value(stack_name + 'SecurityGroupId'))
    return FileSystem.from_file_system_attributes(
        scope, 'filesystem',
        file_system_id=Fn.import_value(stack_name + 'FileSystemId'),
        security_group=security_group)
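get_file_system resolves its ids through CloudFormation exports, so the stack that owns the EFS volume must export values named "<stack_name>SecurityGroupId" and "<stack_name>FileSystemId". The sketch below shows one way such an exporting stack could look; the class name, construct ids, and the assumption that it creates the file system and security group itself are illustrative, not taken from the original code.

from aws_cdk import core, aws_ec2, aws_efs

# A minimal sketch of the exporting stack; only the export names must match the
# Fn.import_value() calls in get_file_system above.
class VolumeStack(core.Stack):
    def __init__(self, scope: core.Construct, id: str, vpc: aws_ec2.Vpc, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)
        security_group = aws_ec2.SecurityGroup(self, 'NfsSecurityGroup', vpc=vpc)
        file_system = aws_efs.FileSystem(self, 'FileSystem', vpc=vpc,
                                         security_group=security_group)
        core.CfnOutput(self, 'FileSystemIdOutput',
                       value=file_system.file_system_id,
                       export_name=self.stack_name + 'FileSystemId')
        core.CfnOutput(self, 'SecurityGroupIdOutput',
                       value=security_group.security_group_id,
                       export_name=self.stack_name + 'SecurityGroupId')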
def __init__(self, scope: core.Construct, id: str, deploy_env: str, vpc: aws_ec2.Vpc,
             db_redis_stack: RdsElasticacheEfsStack, config: dict, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)
    self.config = config
    self.deploy_env = deploy_env
    self.db_port = DB_PORT

    # Cannot map volumes to Fargate task definitions yet - so this is done via Boto3,
    # since CDK does not support it yet: https://github.com/aws/containers-roadmap/issues/825
    # self.efs_file_system_id = db_redis_stack.efs_file_system_id

    cluster_name = get_cluster_name(deploy_env)
    self.cluster = ecs.Cluster(self, cluster_name, cluster_name=cluster_name, vpc=vpc)

    pwd_secret = ecs.Secret.from_ssm_parameter(
        StringParameter.from_secure_string_parameter_attributes(
            self, f"dbpwd-{deploy_env}", version=1, parameter_name="postgres_pwd"))
    self.secrets = {"POSTGRES_PASSWORD": pwd_secret}

    environment = {
        "EXECUTOR": "Celery",
        "POSTGRES_HOST": db_redis_stack.db_host,
        "POSTGRES_PORT": str(self.db_port),
        "POSTGRES_DB": "airflow",
        "POSTGRES_USER": self.config["dbadmin"],
        "REDIS_HOST": db_redis_stack.redis_host,
        "VISIBILITY_TIMEOUT": str(self.config["celery_broker_visibility_timeout"]),
    }

    image_asset = DockerImageAsset(self, "AirflowImage", directory="build",
                                   repository_name=config["ecr_repo_name"])
    self.image = ecs.ContainerImage.from_docker_image_asset(image_asset)

    # Web server - this initializes the DB, so it must happen first.
    self.web_service = self.airflow_web_service(environment)

    # https://github.com/aws/aws-cdk/issues/1654
    self.web_service_sg().connections.allow_to_default_port(
        db_redis_stack.postgres_db, 'allow PG')

    redis_port_info = Port(protocol=Protocol.TCP, string_representation="allow to redis",
                           from_port=REDIS_PORT, to_port=REDIS_PORT)
    worker_port_info = Port(protocol=Protocol.TCP, string_representation="allow to worker",
                            from_port=AIRFLOW_WORKER_PORT, to_port=AIRFLOW_WORKER_PORT)

    redis_sg = SecurityGroup.from_security_group_id(
        self, id=f"Redis-SG-{deploy_env}",
        security_group_id=db_redis_stack.redis.vpc_security_group_ids[0])
    bastion_sg = db_redis_stack.bastion.connections.security_groups[0]

    self.web_service_sg().connections.allow_to(redis_sg, redis_port_info, 'allow Redis')
    self.web_service_sg().connections.allow_to_default_port(db_redis_stack.efs_file_system)

    # Scheduler
    self.scheduler_service = self.create_scheduler_ecs_service(environment)
    # Worker
    self.worker_service = self.worker_service(environment)

    self.scheduler_sg().connections.allow_to_default_port(
        db_redis_stack.postgres_db, 'allow PG')
    self.scheduler_sg().connections.allow_to(redis_sg, redis_port_info, 'allow Redis')
    self.scheduler_sg().connections.allow_to_default_port(db_redis_stack.efs_file_system)

    self.worker_sg().connections.allow_to_default_port(
        db_redis_stack.postgres_db, 'allow PG')
    self.worker_sg().connections.allow_to(redis_sg, redis_port_info, 'allow Redis')
    self.worker_sg().connections.allow_to_default_port(db_redis_stack.efs_file_system)

    # When you start an Airflow worker, Airflow starts a tiny web server subprocess to
    # serve the worker's local log files to the main Airflow web server, which then builds
    # pages and sends them to users. This defines the port on which the logs are served.
    # It needs to be unused and reachable from the main web server so it can connect to
    # the workers.
    self.web_service_sg().connections.allow_to(self.worker_sg(), worker_port_info,
                                               'web service to worker')
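The constructor only reads a few keys from the config dict it receives. A minimal sketch of its expected shape, with purely illustrative values, could look like this.

# Hypothetical config for the Airflow ECS stack; only the keys are implied by the code above.
config = {
    "dbadmin": "airflow",                        # POSTGRES_USER passed to the containers
    "celery_broker_visibility_timeout": 7200,    # seconds, forwarded as VISIBILITY_TIMEOUT
    "ecr_repo_name": "airflow",                  # ECR repository for the Docker image asset
}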