Example #1
def addSearchGuard(template, role, subnet, keyname, secgroup, profilename):
    profile = InstanceProfile("sgprofile" + profilename,
                              Path="/",
                              Roles=[Ref(role)])
    template.add_resource(profile)
    instance = Instance(
        "sg" + profilename,
        InstanceType="m4.xlarge",
        ImageId=FindInMap("RegionToAmi", Ref("AWS::Region"), "stable"),
        DisableApiTermination=False,
        IamInstanceProfile=Ref(profile),
        KeyName=Ref(keyname),
        Monitoring=False,
        InstanceInitiatedShutdownBehavior="stop",
        UserData=userdata.from_file("src/bootstrap.sh"),
        NetworkInterfaces=[
            NetworkInterfaceProperty(DeviceIndex=0,
                                     Description="Primary network interface",
                                     SubnetId=Ref(subnet),
                                     DeleteOnTermination=True,
                                     AssociatePublicIpAddress=True,
                                     GroupSet=[Ref(secgroup)])
        ],
        Tags=[
            Tag("Name", "Search Guard " + profilename),
            Tag("sgnodetag", profilename)
        ],
        EbsOptimized=False,
        BlockDeviceMappings=[
            BlockDeviceMapping(DeviceName="/dev/sda1",
                               Ebs=EBSBlockDevice(VolumeSize=25))
        ])
    template.add_resource(instance)
    return instance
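
A minimal usage sketch (hypothetical, not part of the original source): it assumes the surrounding stack already defines the "RegionToAmi" mapping and the src/bootstrap.sh user-data file referenced above, and passes the role name, subnet, key pair and security group in as template parameters.

from troposphere import Parameter, Template

template = Template()
role = template.add_parameter(Parameter("SgRoleName", Type="String"))
subnet = template.add_parameter(Parameter("SgSubnetId", Type="AWS::EC2::Subnet::Id"))
keyname = template.add_parameter(Parameter("SgKeyName", Type="AWS::EC2::KeyPair::KeyName"))
secgroup = template.add_parameter(Parameter("SgSecurityGroupId", Type="AWS::EC2::SecurityGroup::Id"))
addSearchGuard(template, role, subnet, keyname, secgroup, "node1")
print(template.to_json())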
Example #2
    def add_launch_config(self):
        '''
        Add autoscaling launch configuration
        '''
        self.cfn_template.add_resource(
            LaunchConfiguration(
                title=constants.INST_LC,
                AssociatePublicIpAddress=False,
                BlockDeviceMappings=[
                    BlockDeviceMapping(
                        DeviceName='/dev/sda1',
                        Ebs=EBSBlockDevice(
                            DeleteOnTermination=True,
                            VolumeSize=100,
                            VolumeType='gp2'
                        )
                    )
                ],
                IamInstanceProfile=Ref(constants.INST_PROFILE),
                ImageId=Ref('AmiId'),
                InstanceType=Ref('InstanceType'),
                SecurityGroups=[
                    Ref(constants.SSH_SG),
                    ImportValue(Sub('${Environment}-AppSecurityGroup')),
                ],
                UserData=Base64(
                    Sub(constants.USERDATA)
                )
            )
        )
        return self.cfn_template
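
A hedged follow-up sketch (not part of the original source): the launch configuration created above is typically attached to an autoscaling group by name. It assumes from troposphere.autoscaling import AutoScalingGroup alongside the Ref/ImportValue/Sub helpers already used above; the resource title, sizes, and the '${Environment}-PrivateSubnet' export name are illustrative assumptions, while constants.INST_LC is the same constant used above.

    def add_auto_scaling_group(self):
        '''
        Attach the launch configuration to an autoscaling group (sketch)
        '''
        self.cfn_template.add_resource(
            AutoScalingGroup(
                'AppAutoScalingGroup',
                LaunchConfigurationName=Ref(constants.INST_LC),
                MinSize=1,
                MaxSize=3,
                VPCZoneIdentifier=[
                    ImportValue(Sub('${Environment}-PrivateSubnet'))  # assumed single-subnet export
                ]
            )
        )
        return self.cfn_template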
Example #3
    def add_vpnInstance(self):
        t = self.template

        self.vpnInstance = t.add_resource(
            Instance(
                "OpenVPNInstance",
                ImageId=Ref(self.amiParam),
                SecurityGroupIds=[Ref(self.openVPNSecurityGroup)],
                SubnetId=Ref(self.vpnSubnetParam),
                KeyName=Ref(self.keyPairParam),
                InstanceType=Ref(self.instanceTypeParam),
                BlockDeviceMappings=[
                    BlockDeviceMapping(
                        DeviceName="/dev/sda1",
                        Ebs=EBSBlockDevice(
                            VolumeSize=Ref(self.volumeSizeParam)))
                ],
                UserData=Base64(
                    Join("", [
                        "admin_user="******"\n", "admin_pw=", self.sceptreUserData['vpnAdminPw'],
                        "\n", "reroute_gw=1\n", "reroute_dns=1\n"
                    ])),
                Tags=self.defaultTags +
                [Tag('Name', Join("", [self.namePrefix, 'OpenVPNInstance']))]))
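
A hedged sketch of how a class like this is usually wired up in Sceptre (the wrapper class name is hypothetical; only the sceptre_handler entry point is Sceptre's own convention, and it is how self.sceptreUserData values such as vpnAdminPw get injected):

def sceptre_handler(sceptre_user_data):
    # Sceptre calls this function with the stack's sceptre_user_data dict
    stack = VpnStack(sceptre_user_data)  # hypothetical wrapper class that calls add_vpnInstance()
    return stack.template.to_json()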
Example #4
    def test_using_ephemeral_instance_store(self):
        '''
        The EC2 instance is launched from an EBS-backed root volume (which is the default).
        Ephemeral instance stores aren't standalone resources like EBS volumes; they are part of your EC2 instance.
        Ephemeral storage is predefined by instance type.
        '''
        test_stack_name = 'TestInstanceStore'
        init_cf_env(test_stack_name)
        ###

        t = Template()
        sg = ts_add_security_group(t)
        instance = ts_add_instance_with_public_ip(t, Ref(sg))
        instance.InstanceType = 'm5ad.large'  # ephemeral storage is predefined by instance type
        instance.BlockDeviceMappings = [  # use block device mapping to specify device
            BlockDeviceMapping(
                DeviceName='/dev/xvda',  # EBS root volume (OS lives here)
                Ebs=EBSBlockDevice(VolumeSize=20, VolumeType='gp2')),
        ]
        t.add_output([
            Output(
                "PublicIP",
                Value=GetAtt(instance, "PublicIp"),
            ),
        ])
        dump_template(t, True)
        create_stack(test_stack_name, t)
        outputs = get_stack_outputs(test_stack_name)
        public_ip = get_output_value(outputs, 'PublicIP')
        stdout = run(f'ssh {SSH_OPTIONS} ec2-user@{public_ip} sudo fdisk -l')
        stdout = run(f'ssh {SSH_OPTIONS} ec2-user@{public_ip} lsblk')
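        # On m5ad.large the instance-store SSD typically enumerates as /dev/nvme1n1 next to the EBS root (nvme0n1); confirm with the lsblk output above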
        run(f'ssh {SSH_OPTIONS} ec2-user@{public_ip} sudo mkfs -t ext4 /dev/nvme1n1'
            )
        run(f'ssh {SSH_OPTIONS} ec2-user@{public_ip} sudo mkdir /mnt/volume/')
        run(f'ssh {SSH_OPTIONS} ec2-user@{public_ip} sudo mount /dev/nvme1n1 /mnt/volume/'
            )
        # write performance comparison
        run(f'ssh {SSH_OPTIONS} ec2-user@{public_ip} sudo dd if=/dev/zero of=/mnt/volume/tempfile bs=1M count=1024'
            )
        run(
            f'ssh {SSH_OPTIONS} ec2-user@{public_ip} "echo 3 | sudo tee /proc/sys/vm/drop_caches"',
            True)
        run(f'ssh {SSH_OPTIONS} ec2-user@{public_ip} sudo dd if=/dev/zero of=/tempfile bs=1M count=1024'
            )  # write to ebs
        # read performance comparison
        run(f'ssh {SSH_OPTIONS} ec2-user@{public_ip} sudo dd if=/mnt/volume/tempfile of=/dev/null bs=1M count=1024'
            )
        run(
            f'ssh {SSH_OPTIONS} ec2-user@{public_ip} "echo 3 | sudo tee /proc/sys/vm/drop_caches"',
            True)
        run(f'ssh {SSH_OPTIONS} ec2-user@{public_ip} sudo dd if=/tempfile of=/dev/null bs=1M count=1024'
            )
Example #5
def my_block_device_mappings_ebs(count, devicenamebase, volumesize, volumetype):
    block_device_mappings_ebs = []
    block_device_mappings_ebs.append(my_block_device_mappings_root("/dev/sd", "100", "gp2"))
    for i in range(count):
        block_device_mappings_ebs.append(
            BlockDeviceMapping(
                DeviceName=devicenamebase + chr(i + 98),  # 'b', 'c', 'd', ...
                Ebs=EBSBlockDevice(
                    VolumeSize=volumesize,
                    VolumeType=volumetype,
                    DeleteOnTermination=True,
                )))
    return block_device_mappings_ebs
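
A minimal usage sketch (hypothetical values; my_block_device_mappings_root is assumed to be defined elsewhere in the same module, as in the original source):

# Root mapping plus two 500 GB gp2 data volumes on /dev/sdb and /dev/sdc
mappings = my_block_device_mappings_ebs(2, "/dev/sd", "500", "gp2")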
Example #6
    def block_devices(self):
        """ Get block devices for the EC2 instances """
        block_devices = []
        for block in self.instance_base.get('ebs_mounts', []):
            block_devices.append(BlockDeviceMapping(
                DeviceName=block.get('device_name'),
                Ebs=EBSBlockDevice(
                    VolumeSize=block.get('size', 8),
                    DeleteOnTermination=True,
                    VolumeType='gp2'
                )
            ))

        return block_devices
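
For reference, a sketch of the payload this method expects (key names taken from the .get() calls above; the surrounding class and its instance_base attribute are assumed):

# Illustrative instance_base structure consumed by block_devices()
instance_base = {
    'ebs_mounts': [
        {'device_name': '/dev/xvdb', 'size': 50},
        {'device_name': '/dev/xvdc'},  # size omitted, falls back to the 8 GB default
    ]
}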
Example #7
def main(**params):
    try:
        # Metadata
        t = Template()
        t.set_version("2010-09-09")
        t.set_description(
            "(SOCA) - Base template to deploy compute nodes. Version 2.6.0")
        allow_anonymous_data_collection = params["MetricCollectionAnonymous"]
        debug = False
        mip_usage = False
        instances_list = params[
            "InstanceType"]  # list of instance types; use "+" to specify more than one type
        asg_lt = asg_LaunchTemplate()
        ltd = LaunchTemplateData("NodeLaunchTemplateData")
        mip = MixedInstancesPolicy()
        stack_name = Ref("AWS::StackName")

        # Begin LaunchTemplateData
        UserData = '''#!/bin/bash -ex

# Configure the proxy
value="''' + params['ProxyCACert'] + '''"
echo $value >  /etc/pki/ca-trust/source/anchors/proxyCA.pem
update-ca-trust

cat <<EOF > /etc/profile.d/proxy.sh
proxy_url="http://''' + params['ProxyPrivateDnsName'] + ''':3128/"

export HTTP_PROXY=\$proxy_url
export HTTPS_PROXY=\$proxy_url
export http_proxy=\$proxy_url
export https_proxy=\$proxy_url

# No proxy:
# Comma separated list of destinations that shouldn't go to the proxy.
# - EC2 metadata service
# - Private IP address ranges (VPC local)
export NO_PROXY="''' + params['NoProxy'] + '''"
export no_proxy=\$NO_PROXY

export REQUESTS_CA_BUNDLE=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem
EOF

source /etc/profile.d/proxy.sh

cat <<EOF > /etc/yum.repos.d/10_proxy.conf
[main]
proxy=http://''' + params['ProxyPrivateDnsName'] + ''':3128/
EOF

if grep -q 'Amazon Linux release 2' /etc/system-release; then
    BASE_OS=amazonlinux2
elif grep -q 'CentOS Linux release 7' /etc/system-release; then
    BASE_OS=centos7
else
    BASE_OS=rhel7
fi

# Install pip and awscli
export PATH=$PATH:/usr/local/bin
if [[ "$BASE_OS" == "centos7" ]] || [[ "$BASE_OS" == "rhel7" ]];
then
     yum install -y python3-pip
     PIP=$(which pip3)
     $PIP install awscli
else
     yum install -y python3-pip
     PIP=$(which pip3)
     $PIP install awscli
fi

# Configure using ansible
# If not amazon linux then the proxy needs to be set up before ansible can be installed.
# The playbooks are downloaded from S3 using the S3 VPC endpoint, so they don't require the proxy.
if ! yum list installed ansible &> /dev/null; then
    if [ $BASE_OS == "amazonlinux2" ]; then
        amazon-linux-extras install -y ansible2
    else
        yum -y install ansible
    fi
fi
aws s3 cp --recursive s3://''' + params['S3Bucket'] + '''/''' + params[
            'S3InstallFolder'] + '''/playbooks/ /root/playbooks/
cd /root/playbooks
ansible-playbook computeNode.yml -e Region=''' + params[
                'Region'] + ''' -e Domain=''' + params[
                    'SocaDomain'] + ''' -e S3InstallBucket=''' + params[
                        'S3Bucket'] + ''' -e S3InstallFolder=''' + params[
                            'S3InstallFolder'] + ''' -e ClusterId=''' + params[
                                'ClusterId'] + ''' -e NoProxy=''' + params[
                                    'NoProxy'] + ''' -e NodeType=''' + params[
                                        'NodeType'] + ''' >> /root/ansible.log 2>&1

if [[ "$BASE_OS" == "centos7" ]] || [[ "$BASE_OS" == "rhel7" ]];
then
     yum install -y nfs-utils # enforce install of nfs-utils
fi

if [[ "$BASE_OS" == "amazonlinux2" ]];
    then
        /usr/sbin/update-motd --disable
fi

GET_INSTANCE_TYPE=$(curl http://169.254.169.254/latest/meta-data/instance-type)
echo export "SOCA_CONFIGURATION="''' + str(params['ClusterId']
                                           ) + '''"" >> /etc/environment
echo export "SOCA_BASE_OS="$BASE_OS"" >> /etc/environment
echo export "SOCA_JOB_QUEUE="''' + str(params['JobQueue']
                                       ) + '''"" >> /etc/environment
echo export "SOCA_JOB_OWNER="''' + str(params['JobOwner']
                                       ) + '''"" >> /etc/environment
echo export "SOCA_JOB_NAME="''' + str(params['JobName']
                                      ) + '''"" >> /etc/environment
echo export "SOCA_JOB_PROJECT="''' + str(params['JobProject']
                                         ) + '''"" >> /etc/environment
echo export "SOCA_VERSION="''' + str(params['Version']
                                     ) + '''"" >> /etc/environment
echo export "SOCA_JOB_EFA="''' + str(params['Efa']).lower(
                                     ) + '''"" >> /etc/environment
echo export "SOCA_JOB_ID="''' + str(params['JobId']
                                    ) + '''"" >> /etc/environment
echo export "SOCA_SCRATCH_SIZE=''' + str(
                                        params['ScratchSize']
                                    ) + '''" >> /etc/environment
echo export "SOCA_INSTALL_BUCKET="''' + str(
                                        params['S3Bucket']
                                    ) + '''"" >> /etc/environment
echo export "SOCA_INSTALL_BUCKET_FOLDER="''' + str(
                                        params['S3InstallFolder']
                                    ) + '''"" >> /etc/environment
echo export "SOCA_FSX_LUSTRE_BUCKET="''' + str(
                                        params['FSxLustreConfiguration']
                                        ['fsx_lustre']
                                    ).lower() + '''"" >> /etc/environment
echo export "SOCA_FSX_LUSTRE_DNS="''' + str(
                                        params['FSxLustreConfiguration']
                                        ['existing_fsx']
                                    ).lower() + '''"" >> /etc/environment
echo export "SOCA_INSTANCE_TYPE=$GET_INSTANCE_TYPE" >> /etc/environment
echo export "SOCA_INSTANCE_HYPERTHREADING="''' + str(
                                        params['ThreadsPerCore']
                                    ).lower() + '''"" >> /etc/environment
echo export "SOCA_SYSTEM_METRICS="''' + str(params['SystemMetrics']).lower(
                                    ) + '''"" >> /etc/environment
echo export "SOCA_ESDOMAIN_ENDPOINT="''' + str(
                                        params['ESDomainEndpoint']
                                    ).lower() + '''"" >> /etc/environment


echo export "SOCA_HOST_SYSTEM_LOG="/apps/soca/''' + str(
                                        params['ClusterId']
                                    ) + '''/cluster_node_bootstrap/logs/''' + str(
                                        params['JobId']
                                    ) + '''/$(hostname -s)"" >> /etc/environment
echo export "AWS_STACK_ID=${AWS::StackName}" >> /etc/environment
echo export "AWS_DEFAULT_REGION=''' + params[
                                        'Region'] + '''" >> /etc/environment


source /etc/environment
AWS=$(which aws)

# Give yum permission to the user on this specific machine
echo "''' + params['JobOwner'] + ''' ALL=(ALL) /bin/yum" >> /etc/sudoers

mkdir -p /apps
mkdir -p /data

# Mount EFS
echo "''' + params['EFSDataDns'] + ''':/ /data nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 0 0" >> /etc/fstab
echo "''' + params['EFSAppsDns'] + ''':/ /apps nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 0 0" >> /etc/fstab
EFS_MOUNT=0
mount -a 
while [[ $? -ne 0 ]] && [[ $EFS_MOUNT -lt 5 ]]
  do
    SLEEP_TIME=$(( RANDOM % 60 ))
    echo "Failed to mount EFS, retrying in $SLEEP_TIME seconds and Loop $EFS_MOUNT/5..."
    sleep $SLEEP_TIME
    ((EFS_MOUNT++))
    mount -a
  done

# Configure Chrony
yum remove -y ntp
yum install -y chrony
mv /etc/chrony.conf  /etc/chrony.conf.original
echo -e """
# use the local instance NTP service, if available
server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4

# Use public servers from the pool.ntp.org project.
# Please consider joining the pool (http://www.pool.ntp.org/join.html).
# !!! [BEGIN] SOCA REQUIREMENT
# You will need to open UDP egress traffic on your security group if you want to enable public pool
#pool 2.amazon.pool.ntp.org iburst
# !!! [END] SOCA REQUIREMENT
# Record the rate at which the system clock gains/losses time.
driftfile /var/lib/chrony/drift

# Allow the system clock to be stepped in the first three updates
# if its offset is larger than 1 second.
makestep 1.0 3

# Specify file containing keys for NTP authentication.
keyfile /etc/chrony.keys

# Specify directory for log files.
logdir /var/log/chrony

# save data between restarts for fast re-load
dumponexit
dumpdir /var/run/chrony
""" > /etc/chrony.conf
systemctl enable chronyd

# Prepare  Log folder
mkdir -p $SOCA_HOST_SYSTEM_LOG
chmod +x /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNodePostReboot.sh
echo "@reboot /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNodePostReboot.sh >> $SOCA_HOST_SYSTEM_LOG/ComputeNodePostReboot.log 2>&1" | crontab -
$AWS s3 cp s3://$SOCA_INSTALL_BUCKET/$SOCA_INSTALL_BUCKET_FOLDER/scripts/config.cfg /root/
chmod +x /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNode.sh
/apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNode.sh ''' + params[
                                            'SchedulerHostname'] + ''' >> $SOCA_HOST_SYSTEM_LOG/ComputeNode.sh.log 2>&1'''

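        # Use a Spot Fleet when a spot price is requested and the job needs more than
        # one instance or spans more than one instance type; otherwise an Auto Scaling
        # Group is created in the else branch further below.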
        SpotFleet = (params["SpotPrice"] is not False) and (
            int(params["DesiredCapacity"]) > 1 or len(instances_list) > 1)
        ltd.EbsOptimized = True
        for instance in instances_list:
            if "t2." in instance:
                ltd.EbsOptimized = False

            # metal + t2 does not support CpuOptions
            unsupported = ["t2.", "metal"]
            if all(itype not in instance
                   for itype in unsupported) and (SpotFleet is False
                                                  or len(instances_list) == 1):
                # Spotfleet with multiple instance types doesn't support CpuOptions
                # So we can't add CpuOptions if SpotPrice is specified and when multiple instances are specified
                ltd.CpuOptions = CpuOptions(
                    CoreCount=int(params["CoreCount"]),
                    ThreadsPerCore=1
                    if params["ThreadsPerCore"] is False else 2)

        ltd.IamInstanceProfile = IamInstanceProfile(
            Arn=params["ComputeNodeInstanceProfileArn"])
        ltd.KeyName = params["SSHKeyPair"]
        ltd.ImageId = params["ImageId"]
        if params["SpotPrice"] is not False and params[
                "SpotAllocationCount"] is False:
            ltd.InstanceMarketOptions = InstanceMarketOptions(
                MarketType="spot",
                SpotOptions=SpotOptions(
                    MaxPrice=Ref("AWS::NoValue") if params["SpotPrice"]
                    == "auto" else str(params["SpotPrice"])
                    # auto -> cap at OD price
                ))
        ltd.InstanceType = instances_list[0]
        ltd.NetworkInterfaces = [
            NetworkInterfaces(InterfaceType="efa" if params["Efa"] is not False
                              else Ref("AWS::NoValue"),
                              DeleteOnTermination=True,
                              DeviceIndex=0,
                              Groups=[params["SecurityGroupId"]])
        ]
        ltd.UserData = Base64(Sub(UserData))
        ltd.BlockDeviceMappings = [
            LaunchTemplateBlockDeviceMapping(
                DeviceName="/dev/xvda"
                if params["BaseOS"] == "amazonlinux2" else "/dev/sda1",
                Ebs=EBSBlockDevice(VolumeSize=params["RootSize"],
                                   VolumeType="gp2",
                                   DeleteOnTermination="false"
                                   if params["KeepEbs"] is True else "true",
                                   Encrypted=True))
        ]
        if int(params["ScratchSize"]) > 0:
            ltd.BlockDeviceMappings.append(
                BlockDeviceMapping(
                    DeviceName="/dev/xvdbx",
                    Ebs=EBSBlockDevice(
                        VolumeSize=params["ScratchSize"],
                        VolumeType="io1"
                        if int(params["VolumeTypeIops"]) > 0 else "gp2",
                        Iops=params["VolumeTypeIops"]
                        if int(params["VolumeTypeIops"]) > 0 else
                        Ref("AWS::NoValue"),
                        DeleteOnTermination="false"
                        if params["KeepEbs"] is True else "true",
                        Encrypted=True)))
        ltd.TagSpecifications = [
            ec2.TagSpecifications(
                ResourceType="instance",
                Tags=base_Tags(
                    Name=str(params["ClusterId"]) + "-compute-job-" +
                    str(params["JobId"]),
                    _soca_JobId=str(params["JobId"]),
                    _soca_JobName=str(params["JobName"]),
                    _soca_JobQueue=str(params["JobQueue"]),
                    _soca_StackId=stack_name,
                    _soca_JobOwner=str(params["JobOwner"]),
                    _soca_JobProject=str(params["JobProject"]),
                    _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                    _soca_KeepForever=str(params["KeepForever"]).lower(),
                    _soca_ClusterId=str(params["ClusterId"]),
                    _soca_NodeType="soca-compute-node"))
        ]
        # End LaunchTemplateData

        # Begin Launch Template Resource
        lt = LaunchTemplate("NodeLaunchTemplate")
        lt.LaunchTemplateName = params["ClusterId"] + "-" + str(
            params["JobId"])
        lt.LaunchTemplateData = ltd
        t.add_resource(lt)
        # End Launch Template Resource

        if SpotFleet is True:
            # SpotPrice is defined and DesiredCapacity > 1 or need to try more than 1 instance_type
            # Create SpotFleet

            # Begin SpotFleetRequestConfigData Resource
            sfrcd = ec2.SpotFleetRequestConfigData()
            sfrcd.AllocationStrategy = params["SpotAllocationStrategy"]
            sfrcd.ExcessCapacityTerminationPolicy = "noTermination"
            sfrcd.IamFleetRole = params["SpotFleetIAMRoleArn"]
            sfrcd.InstanceInterruptionBehavior = "terminate"
            if params["SpotPrice"] != "auto":
                sfrcd.SpotPrice = str(params["SpotPrice"])
            sfrcd.TargetCapacity = params["DesiredCapacity"]
            sfrcd.Type = "maintain"
            sfltc = ec2.LaunchTemplateConfigs()
            sflts = ec2.LaunchTemplateSpecification(LaunchTemplateId=Ref(lt),
                                                    Version=GetAtt(
                                                        lt,
                                                        "LatestVersionNumber"))
            sfltc.LaunchTemplateSpecification = sflts
            sfltc.Overrides = []
            for subnet in params["SubnetId"]:
                for instance in instances_list:
                    sfltc.Overrides.append(
                        ec2.LaunchTemplateOverrides(InstanceType=instance,
                                                    SubnetId=subnet))
            sfrcd.LaunchTemplateConfigs = [sfltc]
            TagSpecifications = ec2.SpotFleetTagSpecification(
                ResourceType="spot-fleet-request",
                Tags=base_Tags(
                    Name=str(params["ClusterId"]) + "-compute-job-" +
                    str(params["JobId"]),
                    _soca_JobId=str(params["JobId"]),
                    _soca_JobName=str(params["JobName"]),
                    _soca_JobQueue=str(params["JobQueue"]),
                    _soca_StackId=stack_name,
                    _soca_JobOwner=str(params["JobOwner"]),
                    _soca_JobProject=str(params["JobProject"]),
                    _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                    _soca_KeepForever=str(params["KeepForever"]).lower(),
                    _soca_ClusterId=str(params["ClusterId"]),
                    _soca_NodeType="soca-compute-node"))
            # End SpotFleetRequestConfigData Resource

            # Begin SpotFleet Resource
            spotfleet = ec2.SpotFleet("SpotFleet")
            spotfleet.SpotFleetRequestConfigData = sfrcd
            t.add_resource(spotfleet)
            # End SpotFleet Resource
        else:

            asg_lt.LaunchTemplateSpecification = LaunchTemplateSpecification(
                LaunchTemplateId=Ref(lt),
                Version=GetAtt(lt, "LatestVersionNumber"))

            asg_lt.Overrides = []
            for instance in instances_list:
                asg_lt.Overrides.append(
                    LaunchTemplateOverrides(InstanceType=instance))

            # Begin InstancesDistribution
            if params["SpotPrice"] is not False and \
                    params["SpotAllocationCount"] is not False and \
                    (int(params["DesiredCapacity"]) - int(params["SpotAllocationCount"])) > 0:
                mip_usage = True
                idistribution = InstancesDistribution()
                idistribution.OnDemandAllocationStrategy = "prioritized"  # only supported value
                idistribution.OnDemandBaseCapacity = params[
                    "DesiredCapacity"] - params["SpotAllocationCount"]
                idistribution.OnDemandPercentageAboveBaseCapacity = "0"  # force the other instances to be SPOT
                idistribution.SpotMaxPrice = Ref(
                    "AWS::NoValue") if params["SpotPrice"] == "auto" else str(
                        params["SpotPrice"])
                idistribution.SpotAllocationStrategy = params[
                    'SpotAllocationStrategy']
                mip.InstancesDistribution = idistribution

            # End MixedPolicyInstance

            # Begin AutoScalingGroup Resource
            asg = AutoScalingGroup("AutoScalingComputeGroup")
            asg.DependsOn = "NodeLaunchTemplate"
            if mip_usage is True or len(instances_list) > 1:
                mip.LaunchTemplate = asg_lt
                asg.MixedInstancesPolicy = mip

            else:
                asg.LaunchTemplate = LaunchTemplateSpecification(
                    LaunchTemplateId=Ref(lt),
                    Version=GetAtt(lt, "LatestVersionNumber"))

            asg.MinSize = int(params["DesiredCapacity"])
            asg.MaxSize = int(params["DesiredCapacity"])
            asg.VPCZoneIdentifier = params["SubnetId"]

            if params["PlacementGroup"] is True:
                pg = PlacementGroup("ComputeNodePlacementGroup")
                pg.Strategy = "cluster"
                t.add_resource(pg)
                asg.PlacementGroup = Ref(pg)

            asg.Tags = Tags(
                Name=str(params["ClusterId"]) + "-compute-job-" +
                str(params["JobId"]),
                _soca_JobId=str(params["JobId"]),
                _soca_JobName=str(params["JobName"]),
                _soca_JobQueue=str(params["JobQueue"]),
                _soca_StackId=stack_name,
                _soca_JobOwner=str(params["JobOwner"]),
                _soca_JobProject=str(params["JobProject"]),
                _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                _soca_KeepForever=str(params["KeepForever"]).lower(),
                _soca_ClusterId=str(params["ClusterId"]),
                _soca_NodeType="soca-compute-node")
            t.add_resource(asg)
            # End AutoScalingGroup Resource

        # Begin FSx for Lustre
        if params["FSxLustreConfiguration"]["fsx_lustre"] is not False:
            if params["FSxLustreConfiguration"]["existing_fsx"] is False:
                fsx_lustre = FileSystem("FSxForLustre")
                fsx_lustre.FileSystemType = "LUSTRE"
                fsx_lustre.StorageCapacity = params["FSxLustreConfiguration"][
                    "capacity"]
                fsx_lustre.SecurityGroupIds = [params["SecurityGroupId"]]
                fsx_lustre.SubnetIds = params["SubnetId"]
                fsx_lustre_configuration = LustreConfiguration()
                fsx_lustre_configuration.DeploymentType = params[
                    "FSxLustreConfiguration"]["deployment_type"].upper()
                if params["FSxLustreConfiguration"]["deployment_type"].upper(
                ) == "PERSISTENT_1":
                    fsx_lustre_configuration.PerUnitStorageThroughput = params[
                        "FSxLustreConfiguration"]["per_unit_throughput"]

                if params["FSxLustreConfiguration"]["s3_backend"] is not False:
                    fsx_lustre_configuration.ImportPath = params[
                        "FSxLustreConfiguration"]["import_path"] if params[
                            "FSxLustreConfiguration"][
                                "import_path"] is not False else params[
                                    "FSxLustreConfiguration"]["s3_backend"]
                    fsx_lustre_configuration.ExportPath = params[
                        "FSxLustreConfiguration"]["import_path"] if params[
                            "FSxLustreConfiguration"][
                                "import_path"] is not False else params[
                                    "FSxLustreConfiguration"][
                                        "s3_backend"] + "/" + params[
                                            "ClusterId"] + "-fsxoutput/job-" + params[
                                                "JobId"] + "/"

                fsx_lustre.LustreConfiguration = fsx_lustre_configuration
                fsx_lustre.Tags = base_Tags(
                    # False disables PropagateAtLaunch
                    Name=str(params["ClusterId"] + "-compute-job-" +
                             params["JobId"]),
                    _soca_JobId=str(params["JobId"]),
                    _soca_JobName=str(params["JobName"]),
                    _soca_JobQueue=str(params["JobQueue"]),
                    _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                    _soca_StackId=stack_name,
                    _soca_JobOwner=str(params["JobOwner"]),
                    _soca_JobProject=str(params["JobProject"]),
                    _soca_KeepForever=str(params["KeepForever"]).lower(),
                    _soca_FSx="true",
                    _soca_ClusterId=str(params["ClusterId"]),
                )
                t.add_resource(fsx_lustre)
        # End FSx For Lustre

        # Begin Custom Resource
        # Change Mapping to No if you want to disable this
        if allow_anonymous_data_collection is True:
            metrics = CustomResourceSendAnonymousMetrics("SendAnonymousData")
            metrics.ServiceToken = params["SolutionMetricLambda"]
            metrics.DesiredCapacity = str(params["DesiredCapacity"])
            metrics.InstanceType = str(params["InstanceType"])
            metrics.Efa = str(params["Efa"])
            metrics.ScratchSize = str(params["ScratchSize"])
            metrics.RootSize = str(params["RootSize"])
            metrics.SpotPrice = str(params["SpotPrice"])
            metrics.BaseOS = str(params["BaseOS"])
            metrics.StackUUID = str(params["StackUUID"])
            metrics.KeepForever = str(params["KeepForever"])
            metrics.FsxLustre = str(params["FSxLustreConfiguration"])
            metrics.TerminateWhenIdle = str(params["TerminateWhenIdle"])
            metrics.Dcv = "false"
            t.add_resource(metrics)
        # End Custom Resource

        if debug is True:
            print(t.to_json())

        # Tags must use "soca:<Key>" syntax
        template_output = t.to_yaml().replace("_soca_", "soca:")
        return {'success': True, 'output': template_output}

    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        return {
            'success':
            False,
            'output':
            'cloudformation_builder.py: ' +
            (str(e) + ': error :' + str(exc_type) + ' ' + str(fname) + ' ' +
             str(exc_tb.tb_lineno))
        }
Example #8
def create_template(num_masters, num_agents, num_publicAgents):

    #outfilename = "test.json"
    outfilename = "cf_" + str(num_masters) + "." + str(num_agents) + "." + str(
        num_publicAgents) + ".json"

    # Create the Template
    t = Template()

    t.add_version('2010-09-09')

    t.add_description('Creates a set of servers for DC/OS using the CentOS 7.3 AMI.  Creates a boot server to host the DC/OS installer and a NAT instance for outbound connections from private agents.  Creates ' + str(num_masters) + ' Master(s), '  \
                      + str(num_agents) + ' Private Agent(s), and ' + str(num_publicAgents) + ' Public Agent(s).  After creating the stack, log into the boot server and run the DC/OS bash script installer for AWS')

    # Amazon Linux AMI 2016.09.1.20170119 x86_64 VPC NAT HVM EBS
    # amzn-ami-vpc-nat-hvm-2016.09.1.20170119-x86_64-ebs -
    # ami-dd3dd7cb us-east-1 (N. Virginia)
    # ami-564b6e33  us-east-2 (Ohio)
    # ami-7d54061d us-west-1 (N. Cal)
    # ami-3b6fd05b us-west-2 (Oregon)
    t.add_mapping(
        'NATAmi', {
            'us-east-1': {
                'default': 'ami-dd3dd7cb'
            },
            'us-east-2': {
                'default': 'ami-564b6e33'
            },
            'us-west-1': {
                'default': 'ami-7d54061d'
            },
            'us-west-2': {
                'default': 'ami-3b6fd05b'
            },
        })

    # The c73 AMI pre created and deployed on each region
    t.add_mapping(
        'c73Ami', {
            'us-east-1': {
                'default': 'ami-46c1b650'
            },
            'us-east-2': {
                'default': 'ami-18f8df7d'
            },
            'us-west-1': {
                'default': 'ami-f5d7f195'
            },
            'us-west-2': {
                'default': 'ami-f4533694'
            },
        })

    # CloudFormation Parameters

    # Sometimes when I deployed the stack on us-east-1, it would fail in availability zone us-east-1c with an error that the instance type is not supported in that AZ.  I added this parameter to pin all of the components to one AZ for now
    avzone_param = t.add_parameter(
        Parameter(
            "AVZoneName",
            ConstraintDescription='Must be the name of an an Availability Zone',
            Description='Name of an Availability Zone',
            Type='AWS::EC2::AvailabilityZone::Name',
        ))

    # Every agent will get a data drive of this size
    dataDriveSizeGB_param = t.add_parameter(
        Parameter(
            "dataDriveSizeGB",
            Default="100",
            MinValue=20,
            MaxValue=1000,
            Description=
            'Size of data drive to add to private agents from 20 to 1000GB',
            Type='Number'))

    # The key will be added to the centos user so you can login to centos using the key
    keyname_param = t.add_parameter(
        Parameter(
            "KeyName",
            ConstraintDescription=
            'Must be the name of an existing EC2 KeyPair.',
            Description=
            'Name of an existing EC2 KeyPair to enable SSH access to the instance',
            Type='AWS::EC2::KeyPair::KeyName',
        ))

    # While you can allow everyone, it's more secure to allow just a single machine or subnet of machines; the web port will also be opened to this CIDR
    sshlocation_param = t.add_parameter(
        Parameter(
            "sshlocation",
            Type="String",
            Description=
            "Subnet allowed to ssh to these servers. 0.0.0.0/0 to allow all."))

    # Instance type for Master
    instanceTypeMaster_param = t.add_parameter(
        Parameter(
            'InstanceTypeMaster',
            Type='String',
            Description='EC2 instance type for ' + str(num_masters) +
            ' Masters(s)',
            Default='m4.xlarge',
            AllowedValues=[
                't2.xlarge',
                't2.2xlarge',
                'm4.xlarge',
                'm4.2xlarge',
                'm4.4xlarge',
                'm4.10xlarge',
                'c4.xlarge',
                'c4.2xlarge',
                'c4.4xlarge',
                'c4.8xlarge',
            ],
            ConstraintDescription='Must be a valid EC2 instance type.',
        ))

    # Instance type for Agents
    instanceTypeAgent_param = t.add_parameter(
        Parameter(
            'InstanceTypeAgent',
            Type='String',
            Description='EC2 instance type for ' + str(num_agents) +
            ' Private Agent(s)',
            Default='m4.2xlarge',
            AllowedValues=[
                't2.xlarge',
                't2.2xlarge',
                'm4.xlarge',
                'm4.2xlarge',
                'm4.4xlarge',
                'm4.10xlarge',
                'c4.xlarge',
                'c4.2xlarge',
                'c4.4xlarge',
                'c4.8xlarge',
            ],
            ConstraintDescription='Must be a valid EC2 instance type.',
        ))

    # Instance type for Public Agents
    instanceTypePublicAgent_param = t.add_parameter(
        Parameter(
            'InstanceTypePublicAgent',
            Type='String',
            Description='EC2 instance type for ' + str(num_publicAgents) +
            ' Public Agent(s)',
            Default='m4.xlarge',
            AllowedValues=[
                't2.xlarge',
                't2.2xlarge',
                'm4.xlarge',
                'm4.2xlarge',
                'm4.4xlarge',
                'm4.10xlarge',
                'c4.xlarge',
                'c4.2xlarge',
                'c4.4xlarge',
                'c4.8xlarge',
            ],
            ConstraintDescription='Must be a valid EC2 instance type.',
        ))

    # Adding Resources

    ref_stack_id = Ref('AWS::StackId')
    ref_region = Ref('AWS::Region')
    ref_stack_name = Ref('AWS::StackName')

    # Create VPC
    nm = 'vpc'
    vpc = t.add_resource(
        VPC(nm,
            CidrBlock='10.10.0.0/16',
            EnableDnsSupport=True,
            EnableDnsHostnames=True,
            Tags=Tags(Application=ref_stack_id,
                      Name=Join("", [Ref('AWS::StackName'), "-", nm]))))

    # Create Subnet for Masters
    nm = 'mastersSubnet'
    subnetMasters = t.add_resource(
        Subnet(nm,
               AvailabilityZone=Ref(avzone_param),
               CidrBlock='10.10.0.0/24',
               VpcId=Ref(vpc),
               Tags=Tags(Application=ref_stack_id,
                         Name=Join("", [Ref('AWS::StackName'), "-", nm]))))

    # Create Subnet for Agents
    nm = 'agentsSubnet'
    subnetAgents = t.add_resource(
        Subnet(nm,
               AvailabilityZone=Ref(avzone_param),
               CidrBlock='10.10.16.0/24',
               VpcId=Ref(vpc),
               Tags=Tags(Application=ref_stack_id,
                         Name=Join("", [Ref('AWS::StackName'), "-", nm]))))

    # Create Subnet for Public Agents
    nm = 'publicAgentsSubnet'
    subnetPublicAgents = t.add_resource(
        Subnet(nm,
               AvailabilityZone=Ref(avzone_param),
               CidrBlock='10.10.32.0/24',
               VpcId=Ref(vpc),
               Tags=Tags(Application=ref_stack_id,
                         Name=Join("", [Ref('AWS::StackName'), "-", nm]))))

    # Create Gateway; route to the outside world (Internet)
    nm = 'ig'
    internetGateway = t.add_resource(
        InternetGateway(nm,
                        Tags=Tags(Application=ref_stack_id,
                                  Name=Join(
                                      "", [Ref('AWS::StackName'), "-", nm]))))

    # Attach Gateway to VPC
    nm = 'gatewayAttachment'
    gatewayAttachment = t.add_resource(
        VPCGatewayAttachment(nm,
                             VpcId=Ref(vpc),
                             InternetGatewayId=Ref(internetGateway)))

    # Create Route Table
    nm = 'routeTable'
    routeTable = t.add_resource(
        RouteTable(nm,
                   VpcId=Ref(vpc),
                   Tags=Tags(Application=ref_stack_id,
                             Name=Join("", [Ref('AWS::StackName'), "-", nm]))))

    # Add Routes

    # Allow all outbound traffic
    nm = 'route'
    route = t.add_resource(
        Route(
            nm,
            DependsOn=gatewayAttachment.title,
            GatewayId=Ref(internetGateway),
            DestinationCidrBlock='0.0.0.0/0',
            RouteTableId=Ref(routeTable),
        ))

    # Associate RouteTable to Master and Public Subnets
    nm = 'subnetRTAMasters'
    subnetRouteTableAssociation = t.add_resource(
        SubnetRouteTableAssociation(
            nm,
            SubnetId=Ref(subnetMasters),
            RouteTableId=Ref(routeTable),
        ))

    nm = 'subnetRTAPublicAgents'
    subnetRouteTableAssociation = t.add_resource(
        SubnetRouteTableAssociation(
            nm,
            SubnetId=Ref(subnetPublicAgents),
            RouteTableId=Ref(routeTable),
        ))

    # Create Security Group (General access to ssh and internal connections between masters, agents, and public agents)
    nm = 'securityGroup'
    securityGroup = t.add_resource(
        SecurityGroup(nm,
                      GroupDescription='Security Group',
                      SecurityGroupIngress=[
                          SecurityGroupRule(IpProtocol='tcp',
                                            FromPort='22',
                                            ToPort='22',
                                            CidrIp=Ref(sshlocation_param)),
                          SecurityGroupRule(IpProtocol='-1',
                                            CidrIp='10.10.0.0/16')
                      ],
                      VpcId=Ref(vpc),
                      Tags=Tags(Application=ref_stack_id,
                                Name=Join("",
                                          [Ref('AWS::StackName'), "-", nm]))))

    # Create Security Group Public Agents
    nm = 'securityGroupPublicAgents'
    publicAgentsSG = t.add_resource(
        SecurityGroup(nm,
                      GroupDescription='Security Group Public Agents',
                      SecurityGroupIngress=[
                          SecurityGroupRule(IpProtocol='tcp',
                                            FromPort='80',
                                            ToPort='80',
                                            CidrIp='0.0.0.0/0'),
                          SecurityGroupRule(IpProtocol='tcp',
                                            FromPort='443',
                                            ToPort='443',
                                            CidrIp='0.0.0.0/0'),
                          SecurityGroupRule(IpProtocol='tcp',
                                            FromPort='10000',
                                            ToPort='10010',
                                            CidrIp='0.0.0.0/0'),
                          SecurityGroupRule(IpProtocol='tcp',
                                            FromPort='9090',
                                            ToPort='9090',
                                            CidrIp='0.0.0.0/0')
                      ],
                      VpcId=Ref(vpc),
                      Tags=Tags(Application=ref_stack_id,
                                Name=Join("",
                                          [Ref('AWS::StackName'), "-", nm]))))

    # Create Security Group Masters Allow Access from sshlocation param as test
    nm = 'securityGroupMasters'
    mastersSG = t.add_resource(
        SecurityGroup(nm,
                      GroupDescription='Security Group Masters',
                      SecurityGroupIngress=[
                          SecurityGroupRule(IpProtocol='tcp',
                                            FromPort='80',
                                            ToPort='80',
                                            CidrIp=Ref(sshlocation_param)),
                          SecurityGroupRule(IpProtocol='tcp',
                                            FromPort='443',
                                            ToPort='443',
                                            CidrIp=Ref(sshlocation_param))
                      ],
                      VpcId=Ref(vpc),
                      Tags=Tags(Application=ref_stack_id,
                                Name=Join("",
                                          [Ref('AWS::StackName'), "-", nm]))))

    if useNatInstance:
        # **** Also change in natRoute ****
        # Create NAT instance; This allows private agents to get out to the Internet
        nm = 'nat'
        nat = t.add_resource(
            Instance(
                nm,
                SourceDestCheck="false",
                ImageId=FindInMap("NATAmi", Ref("AWS::Region"), "default"),
                InstanceType="m4.large",
                AvailabilityZone=Ref(avzone_param),
                KeyName=Ref(keyname_param),
                DependsOn=internetGateway.title,
                NetworkInterfaces=[
                    NetworkInterfaceProperty(GroupSet=[Ref(securityGroup)],
                                             AssociatePublicIpAddress='true',
                                             DeviceIndex='0',
                                             DeleteOnTermination='true',
                                             SubnetId=Ref(subnetMasters),
                                             PrivateIpAddress='10.10.0.9')
                ],
                BlockDeviceMappings=[
                    BlockDeviceMapping(DeviceName="/dev/xvda",
                                       Ebs=EBSBlockDevice(
                                           DeleteOnTermination='true', ))
                ],
                Tags=Tags(Application=ref_stack_id,
                          Name=Join("", [Ref('AWS::StackName'), "-", nm]))))
    else:
        # Create Elastic IP for NatGateay
        nm = 'natIP'
        nat_eip = t.add_resource(EIP(
            nm,
            Domain="vpc",
        ))

        # Create NAT Gateway
        nm = 'natGateway'
        nat = t.add_resource(
            NatGateway(
                nm,
                AllocationId=GetAtt(nat_eip, 'AllocationId'),
                SubnetId=Ref(subnetMasters),
            ))

    # Create Route Table for NAT
    nm = 'natRouteTable'
    routeTableNAT = t.add_resource(
        RouteTable(nm,
                   VpcId=Ref(vpc),
                   Tags=Tags(Application=ref_stack_id,
                             Name=Join("", [Ref('AWS::StackName'), "-", nm]))))

    # Associate Agent Subnet to NAT
    nm = 'subnetRTAAgents'
    subnetRouteTableAssociation = t.add_resource(
        SubnetRouteTableAssociation(
            nm,
            SubnetId=Ref(subnetAgents),
            RouteTableId=Ref(routeTableNAT),
        ))

    # Add Routes (Agents can reach out anywhere)
    nm = 'natRoute'
    if useNatInstance:
        route = t.add_resource(
            Route(
                nm,
                RouteTableId=Ref(routeTableNAT),
                DestinationCidrBlock='0.0.0.0/0',
                InstanceId=Ref(nat),
            ))
    else:
        route = t.add_resource(
            Route(
                nm,
                RouteTableId=Ref(routeTableNAT),
                DestinationCidrBlock='0.0.0.0/0',
                NatGatewayId=Ref(nat),
            ))

    # ****************************************
    # NOTE: I am using static PrivateIpAddress values; this may not be a good choice, but it simplified the install script.  The range of IPs for the masters and agents is limited to a /24 subnet and I start at 11.
    #      With this configuration the max number of agents is around 240.
    # ****************************************

    # Create boot instance
    # Installs on AWS have so far taken longer than on Azure; it takes about 10 minutes for the boot server to configure.
    # I tried several instance types from t2.micro to m4.large; all take about 10 minutes for boot to load.  The docker start of mesosphere/dcos-genconf seems to take longer than it did on Azure.
    nm = 'boot'
    boot = t.add_resource(
        Instance(nm,
                 ImageId=FindInMap("c73Ami", Ref("AWS::Region"), "default"),
                 InstanceType="m4.xlarge",
                 AvailabilityZone=Ref(avzone_param),
                 KeyName=Ref(keyname_param),
                 NetworkInterfaces=[
                     NetworkInterfaceProperty(GroupSet=[Ref(securityGroup)],
                                              AssociatePublicIpAddress='true',
                                              DeviceIndex='0',
                                              DeleteOnTermination='true',
                                              SubnetId=Ref(subnetMasters),
                                              PrivateIpAddress='10.10.0.10')
                 ],
                 BlockDeviceMappings=[
                     BlockDeviceMapping(DeviceName="/dev/sda1",
                                        Ebs=EBSBlockDevice(
                                            VolumeSize="100",
                                            DeleteOnTermination='true',
                                        ))
                 ],
                 Tags=Tags(Application=ref_stack_id,
                           Name=Join("", [Ref('AWS::StackName'), "-", nm]))))

    # Create master instance(s)
    masters = []
    i = 1
    while i <= num_masters:
        nm = 'm' + str(i)
        private_ip = "10.10.0." + str(i + 10)
        instance = t.add_resource(
            Instance(nm,
                     ImageId=FindInMap("c73Ami", Ref("AWS::Region"),
                                       "default"),
                     InstanceType=Ref(instanceTypeMaster_param),
                     AvailabilityZone=Ref(avzone_param),
                     KeyName=Ref(keyname_param),
                     NetworkInterfaces=[
                         NetworkInterfaceProperty(
                             GroupSet=[Ref(securityGroup),
                                       Ref(mastersSG)],
                             AssociatePublicIpAddress='true',
                             DeviceIndex='0',
                             DeleteOnTermination='true',
                             SubnetId=Ref(subnetMasters),
                             PrivateIpAddress=private_ip)
                     ],
                     BlockDeviceMappings=[
                         BlockDeviceMapping(DeviceName="/dev/sda1",
                                            Ebs=EBSBlockDevice(
                                                VolumeSize="100",
                                                DeleteOnTermination='true',
                                            ))
                     ],
                     Tags=Tags(Application=ref_stack_id,
                               Name=Join("",
                                         [Ref('AWS::StackName'), "-", nm]))))
        masters.append(instance)
        i += 1

    # Create agent instance(s)
    i = 1
    while i <= num_agents:

        nm = 'a' + str(i)
        private_ip = "10.10.16." + str(i + 10)

        instance = t.add_resource(
            Instance(
                nm,
                ImageId=FindInMap("c73Ami", Ref("AWS::Region"), "default"),
                InstanceType=Ref(instanceTypeAgent_param),
                AvailabilityZone=Ref(avzone_param),
                KeyName=Ref(keyname_param),
                NetworkInterfaces=[
                    NetworkInterfaceProperty(GroupSet=[Ref(securityGroup)],
                                             AssociatePublicIpAddress='false',
                                             DeviceIndex='0',
                                             DeleteOnTermination='true',
                                             SubnetId=Ref(subnetAgents),
                                             PrivateIpAddress=private_ip)
                ],
                BlockDeviceMappings=[
                    BlockDeviceMapping(DeviceName="/dev/sda1",
                                       Ebs=EBSBlockDevice(
                                           VolumeSize="100",
                                           DeleteOnTermination='true',
                                       ))
                ],
                Tags=Tags(Application=ref_stack_id,
                          Name=Join("", [Ref('AWS::StackName'), "-", nm]))))

        volume = t.add_resource(
            Volume(nm + "data",
                   AvailabilityZone=Ref(avzone_param),
                   Size=Ref(dataDriveSizeGB_param),
                   Tags=Tags(
                       Application=ref_stack_id,
                       Name=Join("",
                                 [Ref('AWS::StackName'), "-", nm + "data"]))))

        volattach = t.add_resource(
            VolumeAttachment(nm + "dataattach",
                             InstanceId=Ref(instance),
                             VolumeId=Ref(volume),
                             Device="/dev/sdc"))

        i += 1

    # Create public agent instance(s)
    publicAgents = []
    i = 1
    nm = "p1"
    while i <= num_publicAgents:
        nm = 'p' + str(i)
        private_ip = "10.10.32." + str(i + 10)
        instance = t.add_resource(
            Instance(
                nm,
                ImageId=FindInMap("c73Ami", Ref("AWS::Region"), "default"),
                InstanceType=Ref(instanceTypePublicAgent_param),
                AvailabilityZone=Ref(avzone_param),
                KeyName=Ref(keyname_param),
                NetworkInterfaces=[
                    NetworkInterfaceProperty(
                        GroupSet=[Ref(securityGroup),
                                  Ref(publicAgentsSG)],
                        AssociatePublicIpAddress='true',
                        DeviceIndex='0',
                        DeleteOnTermination='true',
                        SubnetId=Ref(subnetPublicAgents),
                        PrivateIpAddress=private_ip)
                ],
                BlockDeviceMappings=[
                    BlockDeviceMapping(DeviceName="/dev/sda1",
                                       Ebs=EBSBlockDevice(
                                           VolumeSize="100",
                                           DeleteOnTermination='true',
                                       ))
                ],
                Tags=Tags(Application=ref_stack_id,
                          Name=Join("", [Ref('AWS::StackName'), "-", nm]))))
        publicAgents.append(instance)
        i += 1

    # Load Balancer Masters
    nm = "masters"
    elasticLBMasters = t.add_resource(
        elb.LoadBalancer(
            nm,
            Instances=[Ref(r) for r in masters],
            Subnets=[Ref(subnetMasters)],
            SecurityGroups=[Ref(mastersSG)],
            CrossZone=False,
            Listeners=[
                elb.Listener(
                    LoadBalancerPort="80",
                    InstancePort="80",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="443",
                    InstancePort="443",
                    Protocol="TCP",
                ),
            ],
            # Health check on port 80, which should be listening after DC/OS has been installed.
            HealthCheck=elb.HealthCheck(
                Target="TCP:80",
                HealthyThreshold="2",
                UnhealthyThreshold="2",
                Interval="30",
                Timeout="5",
            ),
            Tags=Tags(Application=ref_stack_id,
                      Name=Join("", [Ref('AWS::StackName'), "-", nm]))))

    # Load Balancer Public Agents
    nm = "publicagents"
    elasticLBPublicAgents = t.add_resource(
        elb.LoadBalancer(
            nm,
            #AvailabilityZones=GetAZs(""),
            Instances=[Ref(r) for r in publicAgents],
            Subnets=[Ref(subnetPublicAgents)],
            SecurityGroups=[Ref(publicAgentsSG)],
            CrossZone=False,
            Listeners=[
                elb.Listener(
                    LoadBalancerPort="10000",
                    InstancePort="10000",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="10001",
                    InstancePort="10001",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="10002",
                    InstancePort="10002",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="10003",
                    InstancePort="10003",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="10004",
                    InstancePort="10004",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="10005",
                    InstancePort="10005",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="10006",
                    InstancePort="10006",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="10007",
                    InstancePort="10007",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="10008",
                    InstancePort="10008",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="10009",
                    InstancePort="10009",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="10010",
                    InstancePort="10010",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="9090",
                    InstancePort="9090",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="80",
                    InstancePort="80",
                    Protocol="TCP",
                ),
                elb.Listener(
                    LoadBalancerPort="443",
                    InstancePort="443",
                    Protocol="TCP",
                )
            ],
            # I've added a health check on port 9090; it becomes healthy after Marathon-LB is installed.
            HealthCheck=elb.HealthCheck(
                Target="TCP:9090",
                HealthyThreshold="2",
                UnhealthyThreshold="2",
                Interval="30",
                Timeout="5",
            ),
            Tags=Tags(Application=ref_stack_id,
                      Name=Join("", [Ref('AWS::StackName'), "-", nm]))))

    # Outputs
    t.add_output(
        Output("BootServer",
               Description="Name/IP of Boot Server",
               Value=Join(
                   "/",
                   [GetAtt(boot, "PublicDnsName"),
                    GetAtt(boot, "PublicIp")])))

    t.add_output(
        Output("MastersURL",
               Description="URL of the Masters",
               Value=Join(
                   "",
                   ["http://", GetAtt(elasticLBMasters, "DNSName")])))

    t.add_output(
        Output(
            "PublicAgentsURL",
            Description="URL of the Public Agents haproxy stats.",
            Value=Join("", [
                "http://",
                GetAtt(elasticLBPublicAgents, "DNSName"), ":9090/haproxy?stats"
            ])))

    # Write json to file
    jsonStr = t.to_json()

    with open(outfilename, "w") as fout:
        fout.write(jsonStr)

    # Print the json to screen
    print(jsonStr)
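
The eleven near-identical listeners for ports 10000-10010 above could also be generated programmatically instead of being written out by hand. A minimal sketch, assuming the same troposphere.elasticloadbalancing import aliased as elb; the variable names are illustrative:

# Forward ports 10000-10010 1:1, plus 9090 (haproxy stats), 80 and 443.
public_agent_ports = list(range(10000, 10011)) + [9090, 80, 443]
public_agent_listeners = [
    elb.Listener(LoadBalancerPort=str(port),
                 InstancePort=str(port),
                 Protocol="TCP")
    for port in public_agent_ports
]
# The list can then be passed as Listeners=public_agent_listeners when building
# the public agents load balancer.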
예제 #9
def main(**params):
    try:
        # Metadata
        t = Template()
        t.set_version("2010-09-09")
        t.set_description("(SOCA) - Base template to deploy compute nodes. Version 2.7.2")
        allow_anonymous_data_collection = params["MetricCollectionAnonymous"]
        debug = False
        mip_usage = False
        instances_list = params["InstanceType"]  # list of instance types; use "+" to specify more than one type
        asg_lt = asg_LaunchTemplate()
        ltd = LaunchTemplateData("NodeLaunchTemplateData")
        mip = MixedInstancesPolicy()
        stack_name = Ref("AWS::StackName")

        # Begin LaunchTemplateData
        UserData = '''#!/bin/bash -x
export PATH=$PATH:/usr/local/bin
if [[ "''' + params['BaseOS'] + '''" == "centos7" ]] || [[ "''' + params['BaseOS'] + '''" == "rhel7" ]];
then
     yum install -y python3-pip
     PIP=$(which pip3)
     $PIP install awscli
     yum install -y nfs-utils # enforce install of nfs-utils
else
     yum install -y python3-pip
     PIP=$(which pip3)
     $PIP install awscli
fi
if [[ "''' + params['BaseOS'] + '''" == "amazonlinux2" ]];
    then
        /usr/sbin/update-motd --disable
fi

GET_INSTANCE_TYPE=$(curl http://169.254.169.254/latest/meta-data/instance-type)
echo export "SOCA_CONFIGURATION="''' + str(params['ClusterId']) + '''"" >> /etc/environment
echo export "SOCA_BASE_OS="''' + str(params['BaseOS']) + '''"" >> /etc/environment
echo export "SOCA_JOB_QUEUE="''' + str(params['JobQueue']) + '''"" >> /etc/environment
echo export "SOCA_JOB_OWNER="''' + str(params['JobOwner']) + '''"" >> /etc/environment
echo export "SOCA_JOB_NAME="''' + str(params['JobName']) + '''"" >> /etc/environment
echo export "SOCA_JOB_PROJECT="''' + str(params['JobProject']) + '''"" >> /etc/environment
echo export "SOCA_VERSION="''' + str(params['Version']) + '''"" >> /etc/environment
echo export "SOCA_JOB_EFA="''' + str(params['Efa']).lower() + '''"" >> /etc/environment
echo export "SOCA_JOB_ID="''' + str(params['JobId']) + '''"" >> /etc/environment
echo export "SOCA_SCRATCH_SIZE=''' + str(params['ScratchSize']) + '''" >> /etc/environment
echo export "SOCA_INSTALL_BUCKET="''' + str(params['S3Bucket']) + '''"" >> /etc/environment
echo export "SOCA_INSTALL_BUCKET_FOLDER="''' + str(params['S3InstallFolder']) + '''"" >> /etc/environment
echo export "SOCA_FSX_LUSTRE_BUCKET="''' + str(params['FSxLustreConfiguration']['fsx_lustre']).lower() + '''"" >> /etc/environment
echo export "SOCA_FSX_LUSTRE_DNS="''' + str(params['FSxLustreConfiguration']['existing_fsx']).lower() + '''"" >> /etc/environment
echo export "SOCA_INSTANCE_TYPE=$GET_INSTANCE_TYPE" >> /etc/environment
echo export "SOCA_INSTANCE_HYPERTHREADING="''' + str(params['ThreadsPerCore']).lower() + '''"" >> /etc/environment
echo export "SOCA_SYSTEM_METRICS="''' + str(params['SystemMetrics']).lower() + '''"" >> /etc/environment
echo export "SOCA_ESDOMAIN_ENDPOINT="''' + str(params['ESDomainEndpoint']).lower() + '''"" >> /etc/environment
echo export "SOCA_AUTH_PROVIDER="''' + str(params['AuthProvider']).lower() + '''"" >> /etc/environment
echo export "SOCA_HOST_SYSTEM_LOG="/apps/soca/''' + str(params['ClusterId']) + '''/cluster_node_bootstrap/logs/''' + str(params['JobId']) + '''/$(hostname -s)"" >> /etc/environment
echo export "AWS_STACK_ID=${AWS::StackName}" >> /etc/environment
echo export "AWS_DEFAULT_REGION=${AWS::Region}" >> /etc/environment


source /etc/environment
AWS=$(command -v aws)

# Give yum permission to the user on this specific machine
echo "''' + params['JobOwner'] + ''' ALL=(ALL) /bin/yum" >> /etc/sudoers

# Mount File system
mkdir -p /apps
mkdir -p /data

FS_DATA_PROVIDER='''+params['FileSystemDataProvider']+'''
FS_DATA='''+params['FileSystemData']+'''
FS_APPS_PROVIDER='''+params['FileSystemAppsProvider']+'''
FS_APPS='''+params['FileSystemApps']+'''

if [[ "$FS_DATA_PROVIDER" == "fsx_lustre" ]] || [[ "$FS_APPS_PROVIDER" == "fsx_lustre" ]]; then
    if [[ -z "$(rpm -qa lustre-client)" ]]; then
        # Install FSx for Lustre Client
        if [[ "$SOCA_BASE_OS" == "amazonlinux2" ]]; then
            amazon-linux-extras install -y lustre2.10
        else
            kernel=$(uname -r)
            machine=$(uname -m)
            echo "Found kernel version: $kernel running on: $machine"
            yum -y install wget
            if [[ $kernel == *"3.10.0-957"*$machine ]]; then
                yum -y install https://downloads.whamcloud.com/public/lustre/lustre-2.10.8/el7/client/RPMS/x86_64/kmod-lustre-client-2.10.8-1.el7.x86_64.rpm
                yum -y install https://downloads.whamcloud.com/public/lustre/lustre-2.10.8/el7/client/RPMS/x86_64/lustre-client-2.10.8-1.el7.x86_64.rpm
            elif [[ $kernel == *"3.10.0-1062"*$machine ]]; then
                wget https://fsx-lustre-client-repo-public-keys.s3.amazonaws.com/fsx-rpm-public-key.asc -O /tmp/fsx-rpm-public-key.asc
                rpm --import /tmp/fsx-rpm-public-key.asc
                wget https://fsx-lustre-client-repo.s3.amazonaws.com/el/7/fsx-lustre-client.repo -O /etc/yum.repos.d/aws-fsx.repo
                sed -i 's#7#7.7#' /etc/yum.repos.d/aws-fsx.repo
                yum clean all
                yum install -y kmod-lustre-client lustre-client
            elif [[ $kernel == *"3.10.0-1127"*$machine ]]; then
                wget https://fsx-lustre-client-repo-public-keys.s3.amazonaws.com/fsx-rpm-public-key.asc -O /tmp/fsx-rpm-public-key.asc
                rpm --import /tmp/fsx-rpm-public-key.asc
                wget https://fsx-lustre-client-repo.s3.amazonaws.com/el/7/fsx-lustre-client.repo -O /etc/yum.repos.d/aws-fsx.repo
                sed -i 's#7#7.8#' /etc/yum.repos.d/aws-fsx.repo
                yum clean all
                yum install -y kmod-lustre-client lustre-client
            elif [[ $kernel == *"3.10.0-1160"*$machine ]]; then
                wget https://fsx-lustre-client-repo-public-keys.s3.amazonaws.com/fsx-rpm-public-key.asc -O /tmp/fsx-rpm-public-key.asc
                rpm --import /tmp/fsx-rpm-public-key.asc
                wget https://fsx-lustre-client-repo.s3.amazonaws.com/el/7/fsx-lustre-client.repo -O /etc/yum.repos.d/aws-fsx.repo
                yum clean all
                yum install -y kmod-lustre-client lustre-client
            elif [[ $kernel == *"4.18.0-193"*$machine ]]; then
                # FSX for Lustre on aarch64 is supported only on 4.18.0-193
                wget https://fsx-lustre-client-repo-public-keys.s3.amazonaws.com/fsx-rpm-public-key.asc -O /tmp/fsx-rpm-public-key.asc
                rpm --import /tmp/fsx-rpm-public-key.asc
                wget https://fsx-lustre-client-repo.s3.amazonaws.com/centos/7/fsx-lustre-client.repo -O /etc/yum.repos.d/aws-fsx.repo
                yum clean all
                yum install -y kmod-lustre-client lustre-client
            else
                echo "ERROR: Can't install FSx for Lustre client as kernel version: $kernel isn't matching expected versions: (x86_64: 3.10.0-957, -1062, -1127, -1160, aarch64: 4.18.0-193)!"
            fi
        fi
    fi
fi

if [[ "$FS_DATA_PROVIDER" == "efs" ]]; then
    echo "$FS_DATA:/ /data nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 0 0" >> /etc/fstab
elif [[ "$FS_DATA_PROVIDER" == "fsx_lustre" ]]; then
    FSX_ID=$(echo $FS_DATA | cut -d. -f1)
    FSX_DATA_MOUNT_NAME=$($AWS fsx describe-file-systems --file-system-ids $FSX_ID  --query FileSystems[].LustreConfiguration.MountName --output text)
    echo "$FS_DATA@tcp:/$FSX_DATA_MOUNT_NAME /data lustre defaults,noatime,flock,_netdev 0 0" >> /etc/fstab
fi

if [[ "$FS_APPS_PROVIDER" == "efs" ]]; then
    echo "$FS_APPS:/ /apps nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 0 0" >> /etc/fstab
elif [[ "$FS_APPS_PROVIDER" == "fsx_lustre" ]]; then
    FSX_ID=$(echo $FS_APPS | cut -d. -f1)
    FSX_APPS_MOUNT_NAME=$($AWS fsx describe-file-systems --file-system-ids $FSX_ID  --query FileSystems[].LustreConfiguration.MountName --output text)
    echo "$FS_APPS@tcp:/$FSX_APPS_MOUNT_NAME /apps lustre defaults,noatime,flock,_netdev 0 0" >> /etc/fstab
fi

FS_MOUNT=0
mount -a 
while [[ $? -ne 0 ]] && [[ $FS_MOUNT -lt 5 ]]
do
    SLEEP_TIME=$(( RANDOM % 60 ))
    echo "Failed to mount FS, retrying in $SLEEP_TIME seconds and Loop $FS_MOUNT/5..."
    sleep $SLEEP_TIME
    ((FS_MOUNT++))
    mount -a
done

# Configure Chrony
yum remove -y ntp
yum install -y chrony
mv /etc/chrony.conf  /etc/chrony.conf.original
echo -e """
# use the local instance NTP service, if available
server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4

# Use public servers from the pool.ntp.org project.
# Please consider joining the pool (http://www.pool.ntp.org/join.html).
# !!! [BEGIN] SOCA REQUIREMENT
# You will need to open UDP egress traffic on your security group if you want to enable public pool
#pool 2.amazon.pool.ntp.org iburst
# !!! [END] SOCA REQUIREMENT
# Record the rate at which the system clock gains/losses time.
driftfile /var/lib/chrony/drift

# Allow the system clock to be stepped in the first three updates
# if its offset is larger than 1 second.
makestep 1.0 3

# Specify file containing keys for NTP authentication.
keyfile /etc/chrony.keys

# Specify directory for log files.
logdir /var/log/chrony

# save data between restarts for fast re-load
dumponexit
dumpdir /var/run/chrony
""" > /etc/chrony.conf
systemctl enable chronyd

# Prepare  Log folder
mkdir -p $SOCA_HOST_SYSTEM_LOG
echo "@reboot /bin/bash /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNodePostReboot.sh >> $SOCA_HOST_SYSTEM_LOG/ComputeNodePostReboot.log 2>&1" | crontab -
cp /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/config.cfg /root/
/bin/bash /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNode.sh ''' + params['SchedulerHostname'] + ''' >> $SOCA_HOST_SYSTEM_LOG/ComputeNode.sh.log 2>&1'''

        # Specify the security groups to assign to the compute nodes. Max 5 per instance
        security_groups = [params["SecurityGroupId"]]
        if params["AdditionalSecurityGroupIds"]:
            for sg_id in params["AdditionalSecurityGroupIds"]:
                security_groups.append(sg_id)

        # Specify the IAM instance profile to use
        instance_profile = params["ComputeNodeInstanceProfileArn"] if params["CustomIamInstanceProfile"] is False else params["CustomIamInstanceProfile"]

        SpotFleet = True if ((params["SpotPrice"] is not False) and (params["SpotAllocationCount"] is False) and (int(params["DesiredCapacity"]) > 1 or len(instances_list)>1)) else False
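        # i.e. a Spot Fleet (rather than an Auto Scaling group) is only used when a spot
        # price is requested, no fixed SpotAllocationCount is given, and either more than
        # one instance is desired or more than one instance type was listed.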
        ltd.EbsOptimized = True
        for instance in instances_list:
            if "t2." in instance:
                ltd.EbsOptimized = False

            # metal + t2 does not support CpuOptions
            unsupported = ["t2.", "metal"]
            if all(itype not in instance for itype in unsupported) and (SpotFleet is False or len(instances_list) == 1):
                # Spotfleet with multiple instance types doesn't support CpuOptions
                # So we can't add CpuOptions if SpotPrice is specified and when multiple instances are specified
                ltd.CpuOptions = CpuOptions(
                    CoreCount=int(params["CoreCount"]),
                    ThreadsPerCore=1 if params["ThreadsPerCore"] is False else 2)

        ltd.IamInstanceProfile = IamInstanceProfile(Arn=instance_profile)
        ltd.KeyName = params["SSHKeyPair"]
        ltd.ImageId = params["ImageId"]
        if params["SpotPrice"] is not False and params["SpotAllocationCount"] is False:
            ltd.InstanceMarketOptions = InstanceMarketOptions(
                MarketType="spot",
                SpotOptions=SpotOptions(
                    MaxPrice=Ref("AWS::NoValue") if params["SpotPrice"] == "auto" else str(params["SpotPrice"])
                    # auto -> cap at OD price
                )
            )
        ltd.InstanceType = instances_list[0]
        ltd.NetworkInterfaces = [NetworkInterfaces(
            InterfaceType="efa" if params["Efa"] is not False else Ref("AWS::NoValue"),
            DeleteOnTermination=True,
            DeviceIndex=0,
            Groups=security_groups
        )]
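        # Sub() resolves the ${AWS::StackName} and ${AWS::Region} references embedded in
        # the UserData script; plain shell variables ($PIP, $SOCA_CONFIGURATION, ...) use
        # the brace-less $VAR form and are therefore left untouched by CloudFormation.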
        ltd.UserData = Base64(Sub(UserData))
        ltd.BlockDeviceMappings = [
            LaunchTemplateBlockDeviceMapping(
                DeviceName="/dev/xvda" if params["BaseOS"] == "amazonlinux2" else "/dev/sda1",
                Ebs=EBSBlockDevice(
                    VolumeSize=params["RootSize"],
                    VolumeType="gp3",
                    DeleteOnTermination="false" if params["KeepEbs"] is True else "true",
                    Encrypted=True))
        ]
        if int(params["ScratchSize"]) > 0:
            ltd.BlockDeviceMappings.append(
                LaunchTemplateBlockDeviceMapping(
                    DeviceName="/dev/xvdbx",
                    Ebs=EBSBlockDevice(
                        VolumeSize=params["ScratchSize"],
                        VolumeType="io1" if int(params["VolumeTypeIops"]) > 0 else "gp3",
                        Iops=params["VolumeTypeIops"] if int(params["VolumeTypeIops"]) > 0 else Ref("AWS::NoValue"),
                        DeleteOnTermination="false" if params["KeepEbs"] is True else "true",
                        Encrypted=True))
            )
        ltd.TagSpecifications = [ec2.TagSpecifications(
            ResourceType="instance",
            Tags = base_Tags(
                Name=str(params["ClusterId"]) + "-compute-job-" + str(params["JobId"]),
                _soca_JobId=str(params["JobId"]),
                _soca_JobName=str(params["JobName"]),
                _soca_JobQueue=str(params["JobQueue"]),
                _soca_StackId=stack_name,
                _soca_JobOwner=str(params["JobOwner"]),
                _soca_JobProject=str(params["JobProject"]),
                _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                _soca_KeepForever=str(params["KeepForever"]).lower(),
                _soca_ClusterId=str(params["ClusterId"]),
                _soca_NodeType="soca-compute-node"))]
        # End LaunchTemplateData

        # Begin Launch Template Resource
        lt = LaunchTemplate("NodeLaunchTemplate")
        lt.LaunchTemplateName = params["ClusterId"] + "-" + str(params["JobId"])
        lt.LaunchTemplateData = ltd
        t.add_resource(lt)
        # End Launch Template Resource

        if SpotFleet is True:
            # SpotPrice is defined, SpotAllocationCount is not, and either DesiredCapacity > 1 or more than one instance type must be tried
            # Create SpotFleet

            # Begin SpotFleetRequestConfigData Resource
            sfrcd = ec2.SpotFleetRequestConfigData()
            sfrcd.AllocationStrategy = params["SpotAllocationStrategy"]
            sfrcd.ExcessCapacityTerminationPolicy = "noTermination"
            sfrcd.IamFleetRole = params["SpotFleetIAMRoleArn"]
            sfrcd.InstanceInterruptionBehavior = "terminate"
            if params["SpotPrice"] != "auto":
                sfrcd.SpotPrice = str(params["SpotPrice"])
            sfrcd.SpotMaintenanceStrategies = ec2.SpotMaintenanceStrategies(
                    CapacityRebalance=ec2.SpotCapacityRebalance(ReplacementStrategy="launch"))
            sfrcd.TargetCapacity = params["DesiredCapacity"]
            sfrcd.Type = "maintain"
            sfltc = ec2.LaunchTemplateConfigs()
            sflts = ec2.LaunchTemplateSpecification(
                    LaunchTemplateId=Ref(lt),
                    Version=GetAtt(lt, "LatestVersionNumber"))
            sfltc.LaunchTemplateSpecification = sflts
            sfltc.Overrides = []
            for subnet in params["SubnetId"]:
                for index, instance in enumerate(instances_list):
                    if params["WeightedCapacity"] is not False:
                        sfltc.Overrides.append(ec2.LaunchTemplateOverrides(
                                InstanceType = instance,
                                SubnetId = subnet,
                                WeightedCapacity = params["WeightedCapacity"][index]))
                    else:
                        sfltc.Overrides.append(ec2.LaunchTemplateOverrides(
                                InstanceType = instance,
                                SubnetId = subnet))
            sfrcd.LaunchTemplateConfigs = [sfltc]
            TagSpecifications = ec2.SpotFleetTagSpecification(
                ResourceType="spot-fleet-request",
                Tags=base_Tags(
                Name=str(params["ClusterId"]) + "-compute-job-" + str(params["JobId"]),
                _soca_JobId=str(params["JobId"]),
                _soca_JobName=str(params["JobName"]),
                _soca_JobQueue=str(params["JobQueue"]),
                _soca_StackId=stack_name,
                _soca_JobOwner=str(params["JobOwner"]),
                _soca_JobProject=str(params["JobProject"]),
                _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                _soca_KeepForever=str(params["KeepForever"]).lower(),
                _soca_ClusterId=str(params["ClusterId"]),
                _soca_NodeType="soca-compute-node"))
            # End SpotFleetRequestConfigData Resource

            # Begin SpotFleet Resource
            spotfleet = ec2.SpotFleet("SpotFleet")
            spotfleet.SpotFleetRequestConfigData = sfrcd
            t.add_resource(spotfleet)
            # End SpotFleet Resource
        else:

            asg_lt.LaunchTemplateSpecification = LaunchTemplateSpecification(
                LaunchTemplateId=Ref(lt),
                Version=GetAtt(lt, "LatestVersionNumber")
            )

            asg_lt.Overrides = []
            for index, instance in enumerate(instances_list):
                if params["WeightedCapacity"] is not False:
                    mip_usage = True
                    asg_lt.Overrides.append(LaunchTemplateOverrides(
                        InstanceType=instance,
                        WeightedCapacity=str(params["WeightedCapacity"][index])))
                else:
                    asg_lt.Overrides.append(LaunchTemplateOverrides(
                        InstanceType=instance))

            # Begin InstancesDistribution
            if params["SpotPrice"] is not False and \
                    params["SpotAllocationCount"] is not False and \
                    (int(params["DesiredCapacity"]) - int(params["SpotAllocationCount"])) > 0:
                mip_usage = True
                idistribution = InstancesDistribution()
                idistribution.OnDemandAllocationStrategy = "prioritized"  # only supported value
                idistribution.OnDemandBaseCapacity = params["DesiredCapacity"] - params["SpotAllocationCount"]
                idistribution.OnDemandPercentageAboveBaseCapacity = "0"  # force the other instances to be SPOT
                idistribution.SpotMaxPrice = Ref("AWS::NoValue") if params["SpotPrice"] == "auto" else str(
                    params["SpotPrice"])
                idistribution.SpotAllocationStrategy = params['SpotAllocationStrategy']
                mip.InstancesDistribution = idistribution

            # End MixedPolicyInstance

            # Begin AutoScalingGroup Resource
            asg = AutoScalingGroup("AutoScalingComputeGroup")
            asg.DependsOn = "NodeLaunchTemplate"
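            # Use a MixedInstancesPolicy when an on-demand/spot split or weighted
            # capacities apply, or when several instance types were requested;
            # otherwise attach the launch template to the group directly.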
            if mip_usage is True or len(instances_list) > 1:
                mip.LaunchTemplate = asg_lt
                asg.MixedInstancesPolicy = mip

            else:
                asg.LaunchTemplate = LaunchTemplateSpecification(
                    LaunchTemplateId=Ref(lt),
                    Version=GetAtt(lt, "LatestVersionNumber"))

            asg.MinSize = int(params["DesiredCapacity"])
            asg.MaxSize = int(params["DesiredCapacity"])
            asg.VPCZoneIdentifier = params["SubnetId"]
            asg.CapacityRebalance = False

            if params["PlacementGroup"] is True:
                pg = PlacementGroup("ComputeNodePlacementGroup")
                pg.Strategy = "cluster"
                t.add_resource(pg)
                asg.PlacementGroup = Ref(pg)

            asg.Tags = Tags(
                Name=str(params["ClusterId"]) + "-compute-job-" + str(params["JobId"]),
                _soca_JobId=str(params["JobId"]),
                _soca_JobName=str(params["JobName"]),
                _soca_JobQueue=str(params["JobQueue"]),
                _soca_StackId=stack_name,
                _soca_JobOwner=str(params["JobOwner"]),
                _soca_JobProject=str(params["JobProject"]),
                _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                _soca_KeepForever=str(params["KeepForever"]).lower(),
                _soca_ClusterId=str(params["ClusterId"]),
                _soca_NodeType="soca-compute-node")
            t.add_resource(asg)
            # End AutoScalingGroup Resource

        # Begin FSx for Lustre
        if params["FSxLustreConfiguration"]["fsx_lustre"] is not False:
            if params["FSxLustreConfiguration"]["existing_fsx"] is False:
                fsx_lustre = FileSystem("FSxForLustre")
                fsx_lustre.FileSystemType = "LUSTRE"
                fsx_lustre.StorageCapacity = params["FSxLustreConfiguration"]["capacity"]
                fsx_lustre.SecurityGroupIds = security_groups
                fsx_lustre.SubnetIds = params["SubnetId"]
                fsx_lustre_configuration = LustreConfiguration()
                fsx_lustre_configuration.DeploymentType = params["FSxLustreConfiguration"]["deployment_type"].upper()
                if params["FSxLustreConfiguration"]["deployment_type"].upper() == "PERSISTENT_1":
                    fsx_lustre_configuration.PerUnitStorageThroughput = params["FSxLustreConfiguration"]["per_unit_throughput"]

                if params["FSxLustreConfiguration"]["s3_backend"] is not False:
                    fsx_lustre_configuration.ImportPath = params["FSxLustreConfiguration"]["import_path"] if params["FSxLustreConfiguration"]["import_path"] is not False else params["FSxLustreConfiguration"]["s3_backend"]
                    fsx_lustre_configuration.ExportPath = params["FSxLustreConfiguration"]["import_path"] if params["FSxLustreConfiguration"]["import_path"] is not False else params["FSxLustreConfiguration"]["s3_backend"] + "/" + params["ClusterId"] + "-fsxoutput/job-" +  params["JobId"] + "/"

                fsx_lustre.LustreConfiguration = fsx_lustre_configuration
                fsx_lustre.Tags = base_Tags(
                    # False disables PropagateAtLaunch
                    Name=str(params["ClusterId"] + "-compute-job-" + params["JobId"]),
                    _soca_JobId=str(params["JobId"]),
                    _soca_JobName=str(params["JobName"]),
                    _soca_JobQueue=str(params["JobQueue"]),
                    _soca_TerminateWhenIdle=str(params["TerminateWhenIdle"]),
                    _soca_StackId=stack_name,
                    _soca_JobOwner=str(params["JobOwner"]),
                    _soca_JobProject=str(params["JobProject"]),
                    _soca_KeepForever=str(params["KeepForever"]).lower(),
                    _soca_FSx="true",
                    _soca_ClusterId=str(params["ClusterId"]),
                )
                t.add_resource(fsx_lustre)
        # End FSx For Lustre

        # Begin Custom Resource
        # Change Mapping to No if you want to disable this
        if allow_anonymous_data_collection is True:
            metrics = CustomResourceSendAnonymousMetrics("SendAnonymousData")
            metrics.ServiceToken = params["SolutionMetricsLambda"]
            metrics.DesiredCapacity = str(params["DesiredCapacity"])
            metrics.InstanceType = str(params["InstanceType"])
            metrics.Efa = str(params["Efa"])
            metrics.ScratchSize = str(params["ScratchSize"])
            metrics.RootSize = str(params["RootSize"])
            metrics.SpotPrice = str(params["SpotPrice"])
            metrics.BaseOS = str(params["BaseOS"])
            metrics.StackUUID = str(params["StackUUID"])
            metrics.KeepForever = str(params["KeepForever"])
            metrics.FsxLustre = str(params["FSxLustreConfiguration"])
            metrics.TerminateWhenIdle = str(params["TerminateWhenIdle"])
            metrics.Dcv = "false"
            t.add_resource(metrics)
        # End Custom Resource

        if debug is True:
            print(t.to_json())

        # Tags must use "soca:<Key>" syntax
        template_output = t.to_yaml().replace("_soca_", "soca:")
        return {'success': True,
                'output': template_output}

    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        return {'success': False,
                'output': 'cloudformation_builder.py: ' + (
                            str(e) + ': error :' + str(exc_type) + ' ' + str(fname) + ' ' + str(exc_tb.tb_lineno))}
예제 #10
for role in roles:
    # Don't name this resource, since named resources require a full stack destroy.
    launchConfig = t.add_resource(
        LaunchConfiguration(
            "launchconfig" + role.upper(),
            ImageId=rolemap[role]["instance"]["ami"],
            SecurityGroups=[Ref("defaultSG"),
                            Ref(serverSecurityGroup)],
            InstanceType=rolemap[role]["instance"]["type"],
            IamInstanceProfile=Ref("iamCodeDeploy"),
            AssociatePublicIpAddress=True,
            KeyName=Ref("keyName"),
            BlockDeviceMappings=[
                BlockDeviceMapping(DeviceName="/dev/xvda",
                                   Ebs=EBSBlockDevice(DeleteOnTermination=True,
                                                      VolumeType="gp2",
                                                      VolumeSize=10))
            ],
            UserData=Base64(
                Join('', [
                    '#!/bin/bash\n',
                    'sudo apt-get install -y wget\n',
                    'wget https://aws-codedeploy-',
                    awsRegion,
                    '.s3.amazonaws.com/latest/install\n',
                    'chmod +x ./install\n',
                    'sudo ./install auto\n',
                ]))))

    loadbalancer = []
    targetgroup = []
예제 #11
                              ToPort='8898',
                              CidrIp='0.0.0.0/0')
        ],
    ))

ec2Instance = t.add_resource(
    Instance(
        "Ec2Instance",
        InstanceType=Ref(instanceType_param),
        SecurityGroupIds=[Ref(ec2SecurityGroup)],
        ImageId=Ref(imageId_param),
        KeyName=Ref(keyname_param),
        BlockDeviceMappings=[
            BlockDeviceMapping(DeviceName="/dev/sda1",
                               Ebs=EBSBlockDevice(
                                   VolumeSize="128",
                                   VolumeType="gp2",
                               )),
        ],
        Tags=Tags(Application=ref_stack_id, Network="Public", Rev="0.0.3"),
    ))

ec2EIP = t.add_resource(
    EIP(
        "ec2EIP",
        InstanceId=Ref(ec2Instance),
        Domain='vpc',
    ))

hostRecordSet = t.add_resource(
    RecordSetType(
        "hostRecordSet",
예제 #12
        "write_files:\n",
        "  - path: /etc/environment\n",
        "    content: |\n",
        "        REGION_NAME=", Ref("AWS::Region"), "\n",
        "        CLUSTER_NAME=", Ref(ClusterName), "\n",
        "        PRIVATEIP=$private_ipv4\n",
        "        PUBLICIP=$public_ipv4\n"
        ]
    )),
    KeyName=Ref(SecurityKeyName),
    SecurityGroups=[Ref(CoreOSSecurityGroup)],
    InstanceType=Ref(AutoScalingGroupInstanceType),
    BlockDeviceMappings=[
      BlockDeviceMapping(
        DeviceName="/dev/xvda",
        Ebs=EBSBlockDevice(VolumeSize="150")
      )
    ],
    ImageId=FindInMap("CoreOSImageRegionMap", Ref("AWS::Region"), "AMI")
))

CoreOSServerAutoScale = t.add_resource(AutoScalingGroup(
    "CoreOSServerAutoScale",
    DesiredCapacity= Ref(AutoScalingSize),
    LoadBalancerNames=[Ref(ElasticLoadBalancer)],
    MinSize=Ref(AutoScalingSize),
    MaxSize=Ref(AutoScalingSize),
    VPCZoneIdentifier=[Ref(privateBSubnet),Ref(privateASubnet)],
    LaunchConfigurationName=Ref(CoreOSLaunchConfig)
))
예제 #13
    def add_resources(self):
        self.runner_ssm_role = self.template.add_resource(
            Role(
                "RunnerSsmRole",
                Path="/",
                ManagedPolicyArns=[
                    "arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM"
                ],
                AssumeRolePolicyDocument=aws.Policy(Statement=[
                    Statement(
                        Action=[sts.AssumeRole],
                        Effect=Allow,
                        Principal=Principal("Service", ["ec2.amazonaws.com"]),
                    )
                ]),
            ))

        self.runner_ssm_instanceprofile = self.template.add_resource(
            InstanceProfile("RunnerSsmInstanceProfile",
                            Path="/",
                            Roles=[Ref(self.runner_ssm_role)]))

        self.runner_launch_config = self.template.add_resource(
            LaunchConfiguration(
                "RunnerLaunchConfiguration",
                UserData=Base64(
                    Join(
                        "",
                        [
                            "#!/bin/bash\n",
                            "#####install ssm######\n",
                            "yum install -y amazon-ssm-agent\n",
                            "systemctl enable amazon-ssm-agent\n",
                            "systemctl start amazon-ssm-agent\n",
                            "####install docker####\n",
                            "yum install -y docker\n",
                            "systemctl enable docker\n",
                            "systemctl start docker\n",
                            "####install runner####\n",
                            "yum install -y wget\n",
                            "wget -O /usr/local/bin/gitlab-runner ",
                            "https://gitlab-runner-downloads.s3.amazonaws.com/v",
                            Ref(self.runner_version),
                            "/binaries/gitlab-runner-linux-amd64\n",
                            "ln -s /usr/local/bin/gitlab-runner ",
                            "/usr/bin/gitlab-runner\n",
                            "chmod +x /usr/local/bin/gitlab-runner\n",
                            "useradd --comment 'GitLab Runner' ",
                            "--create-home gitlab-runner --shell /bin/bash\n",
                            "/usr/local/bin/gitlab-runner install ",
                            "--user=gitlab-runner "
                            "--working-directory=/home/gitlab-runner\n",
                            "systemctl enable gitlab-runner\n",
                            "systemctl start gitlab-runner\n",
                            "####register runner####\n",
                            "gitlab-runner register ",
                            "--config=/etc/gitlab-runner/config.toml ",
                            "--request-concurrency=",
                            Ref(self.runner_job_concurrency),
                            " ",
                            "--tag-list=",
                            Ref(self.runner_tag_list),
                            " ",
                            "--non-interactive ",
                            "--registration-token=",
                            Ref(self.runner_register_token),
                            " ",
                            "--run-untagged=true ",
                            "--locked=false ",
                            "--url=",
                            Ref(self.runner_gitlab_url),
                            " ",
                            "--executor=docker ",
                            "--docker-image=alpine:latest ",
                            "--docker-privileged=true\n",
                            "####create unregister script####\n",
                            "TOKEN=$(gitlab-runner list 2>&1 | grep Executor | ",
                            "awk '{ print $4 }' | awk -F= '{ print $2 }')\n",
                            "URL=$(gitlab-runner list 2>&1 | grep Executor | ",
                            "awk '{ print $5 }' | awk -F= '{ print $2 }')\n",
                            "echo gitlab-runner unregister ",
                            "--url $URL --token $TOKEN > /unregister.sh\n",
                            "chmod +x /unregister.sh",
                        ],
                    )),
                ImageId=Ref(self.runner_ami_id),
                KeyName=Ref(self.runner_key_pair),
                BlockDeviceMappings=[
                    BlockDeviceMapping(
                        DeviceName="/dev/xvda",
                        Ebs=EBSBlockDevice(
                            VolumeSize=Ref(self.runner_volume_size)),
                    )
                ],
                SecurityGroups=[Ref(self.runner_security_group)],
                InstanceType=Ref(self.runner_server_instance_type),
                IamInstanceProfile=GetAtt(self.runner_ssm_instanceprofile,
                                          "Arn"),
            ))

        self.runner_autoscaling_group = self.template.add_resource(
            AutoScalingGroup(
                "RunnerAutoscalingGroup",
                DesiredCapacity=Ref(self.runner_desired_count),
                LaunchConfigurationName=Ref(self.runner_launch_config),
                MinSize=Ref(self.runner_min_count),
                MaxSize=Ref(self.runner_max_count),
                VPCZoneIdentifier=Split(",", Ref(self.runner_subnets)),
                Tags=[Tag("Name", "gitlab-runner-created-by-asg", True)],
            ))
예제 #14
    ]
)

template.add_resource(ecs_instance_profile)


launch_template = LaunchTemplate(
    region.replace("-", "") + "ecslivelaunchtemplate",
    LaunchTemplateName = "ecs-live-launch-template",
    LaunchTemplateData = LaunchTemplateData(
        ImageId = image_id,
        BlockDeviceMappings = [
            LaunchTemplateBlockDeviceMapping(
                DeviceName = "/dev/xvda",
                Ebs = EBSBlockDevice(
                    "ecsliveblockdevice",
                    VolumeSize = 30
                )
            )
        ],
        CreditSpecification = LaunchTemplateCreditSpecification(
            CpuCredits = "Unlimited"
        ),
        InstanceType = "t3.micro",
        IamInstanceProfile = IamInstanceProfile(
            region.replace("-", "") + "ecsliveiaminstanceprofile",
            Arn = GetAtt(ecs_instance_profile, "Arn")
        ),
        KeyName = "live-eu-west-1",
        SecurityGroupIds = [
            GetAtt(security_group, "GroupId")
        ],
예제 #15
                files=InitFiles({
                    '/etc/cfn/cfn-hup.conf': cfn_hup,
                    '/etc/cfn/hooks.d/cfn-auto-reloader.conf': reloader,
                    '/etc/docker/daemon.json': docker,
                })),
            'services': {
                'sysvinit': {
                    'cfn': cfn_service,
                    'docker': docker_service,
                }
            },
        })),
    BlockDeviceMappings=[
        BlockDeviceMapping(DeviceName='/dev/xvdh',
                           Ebs=EBSBlockDevice(
                               VolumeSize='50',
                               VolumeType='standard',
                           ))
    ])
t.add_resource(netkan_instance)

t.add_resource(
    RecordSetType(
        "NetKANDns",
        HostedZoneId=ZONE_ID,
        Comment="NetKAN Bot DNS",
        Name=BOT_FQDN,
        Type="A",
        TTL="900",
        ResourceRecords=[GetAtt('NetKANCompute', "PublicIp")],
    ))
예제 #16
def my_block_device_mappings_root(devicenamebase, volumesize, volumetype):
    # Root volume mapping, e.g. devicenamebase="/dev/sd" yields DeviceName="/dev/sda1"
    block_device_mappings_root = BlockDeviceMapping(
        DeviceName=devicenamebase + "a1",
        Ebs=EBSBlockDevice(VolumeSize=volumesize, VolumeType=volumetype))
    return block_device_mappings_root
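
A minimal usage sketch for the helper above; the device name base, volume size, and volume type shown are illustrative values:

# "/dev/sd" + "a1" -> root device "/dev/sda1", 100 GiB gp2 volume
root_mapping = my_block_device_mappings_root("/dev/sd", 100, "gp2")
# e.g. Instance("MyInstance", ..., BlockDeviceMappings=[root_mapping])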
예제 #17
def add_launch_template(template, hosts_sg):
    """Function to create a launch template.

    :param template: ECS Cluster template
    :type template: troposphere.Template
    :param hosts_sg: security group for the EC2 hosts
    :type hosts_sg: troposphere.ec2.SecurityGroup

    :return: launch_template
    :rtype: troposphere.ec2.LaunchTemplate
    """
    # from troposphere.cloudformation import (
    #     WaitCondition, WaitConditionHandle
    # )
    # Deactivated these conditions, given the cluster could run with no EC2 at all.
    # This is a tricky condition to implement, as the WaitCondition and Handle cannot be
    # created on a CFN update, only at the very creation of the stack.
    # wait_handle = WaitConditionHandle(
    #     'BootstrapHandle',
    #     template=template
    # )
    # WaitCondition(
    #     'BootStrapCondition',
    #     template=template,
    #     DependsOn=[hosts_role],
    #     Handle=Ref(wait_handle),
    #     Timeout='900'
    # )

    launch_template = LaunchTemplate(
        "LaunchTemplate",
        template=template,
        Metadata=cloudformation.Metadata(
            cloudformation.Init(
                cloudformation.InitConfigSets(
                    default=["awspackages", "dockerconfig", "ecsconfig", "awsservices"]
                ),
                awspackages=cloudformation.InitConfig(
                    packages={"yum": {"awslogs": [], "amazon-ssm-agent": []}},
                    commands={
                        "001-check-packages": {"command": "rpm -qa | grep amazon"},
                        "002-check-packages": {"command": "rpm -qa | grep aws"},
                    },
                ),
                awsservices=cloudformation.InitConfig(
                    services={
                        "sysvinit": {
                            "amazon-ssm-agent": {"enabled": True, "ensureRunning": True}
                        }
                    }
                ),
                dockerconfig=cloudformation.InitConfig(
                    commands={
                        "001-stop-docker": {"command": "systemctl stop docker"},
                        "098-reload-systemd": {"command": "systemctl daemon-reload"},
                    },
                    files={
                        "/etc/sysconfig/docker": {
                            "owner": "root",
                            "group": "root",
                            "mode": "644",
                            "content": Join(
                                "\n",
                                [
                                    "DAEMON_MAXFILES=1048576",
                                    Join(
                                        " ",
                                        ["OPTIONS=--default-ulimit nofile=1024:4096"],
                                    ),
                                    "DAEMON_PIDFILE_TIMEOUT=10",
                                    "#EOF",
                                    "",
                                ],
                            ),
                        }
                    },
                    services={
                        "sysvinit": {
                            "docker": {
                                "enabled": True,
                                "ensureRunning": True,
                                "files": ["/etc/sysconfig/docker"],
                                "commands": ["098-reload-systemd"],
                            }
                        }
                    },
                ),
                ecsconfig=cloudformation.InitConfig(
                    files={
                        "/etc/ecs/ecs.config": {
                            "owner": "root",
                            "group": "root",
                            "mode": "644",
                            "content": Join(
                                "\n",
                                [
                                    Sub(f"ECS_CLUSTER=${{{CLUSTER_NAME_T}}}"),
                                    "ECS_ENABLE_TASK_IAM_ROLE=true",
                                    "ECS_ENABLE_SPOT_INSTANCE_DRAINING=true",
                                    "ECS_ENABLE_TASK_IAM_ROLE_NETWORK_HOST=true",
                                    "ECS_ENABLE_CONTAINER_METADATA=true",
                                    "ECS_ENABLE_UNTRACKED_IMAGE_CLEANUP=true",
                                    "ECS_UPDATES_ENABLED=true",
                                    "ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION=15m",
                                    "ECS_IMAGE_CLEANUP_INTERVAL=10m",
                                    "ECS_NUM_IMAGES_DELETE_PER_CYCLE=100",
                                    "ECS_ENABLE_TASK_ENI=true",
                                    "ECS_AWSVPC_BLOCK_IMDS=true",
                                    "ECS_TASK_METADATA_RPS_LIMIT=300,400",
                                    "ECS_ENABLE_AWSLOGS_EXECUTIONROLE_OVERRIDE=true",
                                    'ECS_AVAILABLE_LOGGING_DRIVERS=["awslogs", "json-file"]',
                                    "#EOF",
                                ],
                            ),
                        }
                    },
                    commands={
                        "0001-restartecs": {
                            "command": "systemctl --no-block restart ecs"
                        }
                    },
                ),
            )
        ),
        LaunchTemplateData=LaunchTemplateData(
            BlockDeviceMappings=[
                LaunchTemplateBlockDeviceMapping(
                    DeviceName="/dev/xvda",
                    Ebs=EBSBlockDevice(DeleteOnTermination=True, Encrypted=True),
                )
            ],
            ImageId=Ref(compute_params.ECS_AMI_ID),
            InstanceInitiatedShutdownBehavior="terminate",
            IamInstanceProfile=IamInstanceProfile(
                Arn=Sub(f"${{{HOST_PROFILE_T}.Arn}}")
            ),
            TagSpecifications=[
                TagSpecifications(
                    ResourceType="instance",
                    Tags=Tags(
                        Name=Sub(f"EcsNodes-${{{CLUSTER_NAME_T}}}"),
                        StackName=Ref("AWS::StackName"),
                        StackId=Ref("AWS::StackId"),
                    ),
                )
            ],
            InstanceType="m5a.large",
            Monitoring=Monitoring(Enabled=True),
            SecurityGroupIds=[GetAtt(hosts_sg, "GroupId")],
            UserData=Base64(
                Join(
                    "\n",
                    [
                        "#!/usr/bin/env bash",
                        "export PATH=$PATH:/opt/aws/bin",
                        "cfn-init -v || yum install aws-cfn-bootstrap -y",
                        Sub(
                            f"cfn-init --region ${{AWS::Region}} -r LaunchTemplate -s ${{AWS::StackName}}"
                        ),
                        # 'if [ $? -ne 0 ]; then',
                        # Sub(f'cfn-signal -e 1 -r "Failed to bootstrap" \'${{{wait_handle.title}}}\''),
                        # 'halt',
                        # 'else',
                        # Sub(f'cfn-signal -e 0 -r "Successfully bootstrapped" \'${{{wait_handle.title}}}\''),
                        # 'fi',
                        "# EOF",
                    ],
                )
            ),
        ),
        LaunchTemplateName=Ref(CLUSTER_NAME_T),
    )
    return launch_template
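
A minimal usage sketch for add_launch_template, assuming a fresh troposphere Template and a security group for the container hosts; the "VpcId" parameter reference and resource names are illustrative, and the parameters referenced inside the function (ECS AMI ID, cluster name, host profile) would still need to be added to the template:

from troposphere import Ref, Template
from troposphere.ec2 import SecurityGroup

template = Template()
hosts_sg = template.add_resource(
    SecurityGroup("EcsHostsSg",
                  GroupDescription="Security group for the ECS container hosts",
                  VpcId=Ref("VpcId")))
launch_template = add_launch_template(template, hosts_sg)
print(template.to_yaml())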
예제 #18
def emit_configuration():
    vpc = cfn.vpcs[0]
    region = Ref("AWS::Region")

    chefserver_instance_class = template.add_parameter(
        Parameter(
            'ChefServerInstanceType',
            Type='String',
            Default='t2.medium',
            Description='Chef Server instance type',
            AllowedValues=cfn.usable_instances(),
            ConstraintDescription='Instance size must be a valid instance type'
        ))

    # Create IAM role for the chefserver instance
    # load the policies
    default_policy = json.loads(
        cfn.load_template("default_policy.json.j2", {
            "env": CLOUDENV,
            "cloud": CLOUDNAME,
            "region": "us-east-1"
        }))

    chefserver_role_name = '.'.join(['chefserver', CLOUDNAME, CLOUDENV])
    chefserver_iam_role = template.add_resource(
        Role("ChefServerIamRole",
             AssumeRolePolicyDocument=ASSUME_ROLE_POLICY,
             Path="/",
             Policies=[
                 Policy(PolicyName="ChefServerPolicy",
                        PolicyDocument=json.loads(
                            cfn.load_template(
                                "chefserver_policy.json.j2", {
                                    "env": CLOUDENV,
                                    "cloud": CLOUDNAME,
                                    "region": "us-east-1"
                                }))),
                 Policy(PolicyName="ChefserverDefaultPolicy",
                        PolicyDocument=default_policy)
             ],
             DependsOn=vpc.title))

    chefserver_instance_profile = template.add_resource(
        InstanceProfile("chefserverInstanceProfile",
                        Path="/",
                        Roles=[Ref(chefserver_iam_role)],
                        DependsOn=chefserver_iam_role.title))

    chefserver_user_data = cfn.load_template("chefserver-init.bash.j2", {
        "env": CLOUDENV,
        "cloud": CLOUDNAME,
        "deploy": "chefserver"
    })

    chefserver_ingress_rules = [
        SecurityGroupRule(IpProtocol=p[0],
                          CidrIp='{0}.0.0/16'.format(CIDR_PREFIX),
                          FromPort=p[1],
                          ToPort=p[1]) for p in [('tcp', 80), ('tcp', 443)]
    ]

    chefserver_sg = template.add_resource(
        SecurityGroup("ChefServer",
                      GroupDescription="Security Group for the Chef server",
                      VpcId=Ref(vpc),
                      SecurityGroupIngress=chefserver_ingress_rules,
                      DependsOn=vpc.title))

    chefserver_name = cfn.sanitize_id("ChefServer", CLOUDNAME, CLOUDENV)
    chefserver_instance = template.add_resource(
        Instance(chefserver_name,
                 DependsOn=vpc.title,
                 InstanceType=Ref(chefserver_instance_class),
                 KeyName=Ref(cfn.keyname),
                 SourceDestCheck=False,
                 ImageId=FindInMap('RegionMap', region, int(cfn.Amis.EBS)),
                 NetworkInterfaces=[
                     NetworkInterfaceProperty(
                         Description='Network interface for {0}'.format(
                             chefserver_name),
                         GroupSet=[Ref(chefserver_sg)],
                         SubnetId=Ref(
                             cfn.get_vpc_subnets(vpc,
                                                 cfn.SubnetTypes.PLATFORM)[0]),
                         AssociatePublicIpAddress=True,
                         DeviceIndex=0,
                         DeleteOnTermination=True)
                 ],
                 BlockDeviceMappings=[
                     BlockDeviceMapping(DeviceName="/dev/sda1",
                                        Ebs=EBSBlockDevice(
                                            VolumeSize=50,
                                            DeleteOnTermination=False))
                 ]))
def main(**params):
    try:
        # Metadata
        t = Template()
        t.set_version("2010-09-09")
        t.set_description("(SOCA) - Base template to deploy compute nodes.")
        allow_anonymous_data_collection = params["MetricCollectionAnonymous"]
        debug = False
        mip_usage = False
        instances_list = params["InstanceType"].split("+")
        asg_lt = asg_LaunchTemplate()
        ltd = LaunchTemplateData("NodeLaunchTemplateData")
        mip = MixedInstancesPolicy()
        stack_name = Ref("AWS::StackName")

        # Begin LaunchTemplateData
        UserData = '''#!/bin/bash -xe
export PATH=$PATH:/usr/local/bin
if [[ "''' + params['BaseOS'] + '''" == "centos7" ]] || [[ "''' + params['BaseOS'] + '''" == "rhel7" ]];
    then
        EASY_INSTALL=$(which easy_install-2.7)
        $EASY_INSTALL pip
        PIP=$(which pip2.7)
        $PIP install awscli
        yum install -y nfs-utils # enforce install of nfs-utils
else
     # Upgrade awscli on ALI (do not use yum)
     EASY_INSTALL=$(which easy_install-2.7)
     $EASY_INSTALL pip
     PIP=$(which pip)
     $PIP install awscli --upgrade 
fi
if [[ "''' + params['BaseOS'] + '''" == "amazonlinux2" ]];
    then
        /usr/sbin/update-motd --disable
fi

GET_INSTANCE_TYPE=$(curl http://169.254.169.254/latest/meta-data/instance-type)
echo export "SOCA_CONFIGURATION="''' + str(params['ClusterId']) + '''"" >> /etc/environment
echo export "SOCA_BASE_OS="''' + str(params['BaseOS']) + '''"" >> /etc/environment
echo export "SOCA_JOB_QUEUE="''' + str(params['JobQueue']) + '''"" >> /etc/environment
echo export "SOCA_JOB_OWNER="''' + str(params['JobOwner']) + '''"" >> /etc/environment
echo export "SOCA_JOB_NAME="''' + str(params['JobName']) + '''"" >> /etc/environment
echo export "SOCA_JOB_PROJECT="''' + str(params['JobProject']) + '''"" >> /etc/environment
echo export "SOCA_VERSION="''' + str(params['Version']) + '''"" >> /etc/environment
echo export "SOCA_JOB_EFA="''' + str(params['Efa']).lower() + '''"" >> /etc/environment
echo export "SOCA_JOB_ID="''' + str(params['JobId']) + '''"" >> /etc/environment
echo export "SOCA_SCRATCH_SIZE=''' + str(params['ScratchSize']) + '''" >> /etc/environment
echo export "SOCA_INSTALL_BUCKET="''' + str(params['S3Bucket']) + '''"" >> /etc/environment
echo export "SOCA_INSTALL_BUCKET_FOLDER="''' + str(params['S3InstallFolder']) + '''"" >> /etc/environment
echo export "SOCA_FSX_LUSTRE_BUCKET="''' + str(params['FSxLustreConfiguration']['fsx_lustre']).lower() + '''"" >> /etc/environment
echo export "SOCA_FSX_LUSTRE_DNS="''' + str(params['FSxLustreConfiguration']['existing_fsx']).lower() + '''"" >> /etc/environment
echo export "SOCA_INSTANCE_TYPE=$GET_INSTANCE_TYPE" >> /etc/environment
echo export "SOCA_INSTANCE_HYPERTHREADING="''' + str(params['ThreadsPerCore']).lower() + '''"" >> /etc/environment
echo export "SOCA_HOST_SYSTEM_LOG="/apps/soca/''' + str(params['ClusterId']) + '''/cluster_node_bootstrap/logs/''' + str(params['JobId']) + '''/$(hostname -s)"" >> /etc/environment
echo export "AWS_STACK_ID=${AWS::StackName}" >> /etc/environment
echo export "AWS_DEFAULT_REGION=${AWS::Region}" >> /etc/environment


source /etc/environment
AWS=$(which aws)

# Give yum permission to the user on this specific machine
echo "''' + params['JobOwner'] + ''' ALL=(ALL) /bin/yum" >> /etc/sudoers

mkdir -p /apps
mkdir -p /data

# Mount EFS
echo "''' + params['EFSDataDns'] + ''':/ /data nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 0 0" >> /etc/fstab
echo "''' + params['EFSAppsDns'] + ''':/ /apps nfs4 nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 0 0" >> /etc/fstab
mount -a 

# Configure NTP
yum remove -y ntp
yum install -y chrony
mv /etc/chrony.conf  /etc/chrony.conf.original
echo -e """
# use the local instance NTP service, if available
server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4

# Use public servers from the pool.ntp.org project.
# Please consider joining the pool (http://www.pool.ntp.org/join.html).
# !!! [BEGIN] SOCA REQUIREMENT
# You will need to open UDP egress traffic on your security group if you want to enable public pool
#pool 2.amazon.pool.ntp.org iburst
# !!! [END] SOCA REQUIREMENT
# Record the rate at which the system clock gains/losses time.
driftfile /var/lib/chrony/drift

# Allow the system clock to be stepped in the first three updates
# if its offset is larger than 1 second.
makestep 1.0 3

# Specify file containing keys for NTP authentication.
keyfile /etc/chrony.keys

# Specify directory for log files.
logdir /var/log/chrony

# save data between restarts for fast re-load
dumponexit
dumpdir /var/run/chrony
""" > /etc/chrony.conf
systemctl enable chronyd

# Prepare  Log folder
mkdir -p $SOCA_HOST_SYSTEM_LOG
echo "@reboot /bin/bash /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNodePostReboot.sh >> $SOCA_HOST_SYSTEM_LOG/ComputeNodePostInstall.log 2>&1" | crontab -
$AWS s3 cp s3://$SOCA_INSTALL_BUCKET/$SOCA_INSTALL_BUCKET_FOLDER/scripts/config.cfg /root/
/bin/bash /apps/soca/$SOCA_CONFIGURATION/cluster_node_bootstrap/ComputeNode.sh ''' + params['SchedulerHostname'] + ''' >> $SOCA_HOST_SYSTEM_LOG/ComputeNode.sh.log 2>&1'''

        ltd.EbsOptimized = True
        for instance in instances_list:
            if "t2." in instance:
                ltd.EbsOptimized = False
            else:
                # t2 does not support CpuOptions
                ltd.CpuOptions = CpuOptions(
                    CoreCount=int(params["CoreCount"]),
                    ThreadsPerCore=1 if params["ThreadsPerCore"] is False else 2)

        ltd.IamInstanceProfile = IamInstanceProfile(Arn=params["ComputeNodeInstanceProfileArn"])
        ltd.KeyName = params["SSHKeyPair"]
        ltd.ImageId = params["ImageId"]
        if params["SpotPrice"] is not False and params["SpotAllocationCount"] is False:
            ltd.InstanceMarketOptions = InstanceMarketOptions(
                MarketType="spot",
                SpotOptions=SpotOptions(
                    MaxPrice=Ref("AWS::NoValue") if params["SpotPrice"] == "auto" else str(params["SpotPrice"])
                    # auto -> cap at OD price
                )
            )
        ltd.InstanceType = instances_list[0]
        ltd.NetworkInterfaces = [NetworkInterfaces(
            InterfaceType="efa" if params["Efa"] is not False else Ref("AWS::NoValue"),
            DeleteOnTermination=True,
            DeviceIndex=0,
            Groups=[params["SecurityGroupId"]]
        )]
        ltd.UserData = Base64(Sub(UserData))
        ltd.BlockDeviceMappings = [
            BlockDeviceMapping(
                DeviceName="/dev/xvda" if params["BaseOS"] == "amazonlinux2" else "/dev/sda1",
                Ebs=EBSBlockDevice(
                    VolumeSize=params["RootSize"],
                    VolumeType="gp2",
                    DeleteOnTermination="false" if params["KeepEbs"] is True else "true",
                    Encrypted=True))
        ]
        if int(params["ScratchSize"]) > 0:
            ltd.BlockDeviceMappings.append(
                BlockDeviceMapping(
                    DeviceName="/dev/xvdbx",
                    Ebs=EBSBlockDevice(
                        VolumeSize=params["ScratchSize"],
                        VolumeType="io1" if int(params["VolumeTypeIops"]) > 0 else "gp2",
                        Iops=params["VolumeTypeIops"] if int(params["VolumeTypeIops"]) > 0 else Ref("AWS::NoValue"),
                        DeleteOnTermination="false" if params["KeepEbs"] is True else "true",
                        Encrypted=True))
            )
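        # Worked example (hypothetical values): ScratchSize=600 with VolumeTypeIops=3000 adds a
        # 600 GiB io1 volume at 3000 IOPS on /dev/xvdbx; with VolumeTypeIops=0 the same request
        # falls back to gp2 and the Iops property is dropped via AWS::NoValue.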
        # End LaunchTemplateData

        # Begin Launch Template Resource
        lt = LaunchTemplate("NodeLaunchTemplate")
        lt.LaunchTemplateName = params["ClusterId"] + "-" + str(params["JobId"])
        lt.LaunchTemplateData = ltd
        t.add_resource(lt)
        # End Launch Template Resource

        asg_lt.LaunchTemplateSpecification = LaunchTemplateSpecification(
            LaunchTemplateId=Ref(lt),
            Version=GetAtt(lt, "LatestVersionNumber")
        )

        asg_lt.Overrides = []
        for instance in instances_list:
            asg_lt.Overrides.append(LaunchTemplateOverrides(
                InstanceType=instance))
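        # Worked example (hypothetical values): instances_list=["c5.18xlarge", "m5.24xlarge"]
        # produces two LaunchTemplateOverrides entries, letting the ASG choose either type
        # while reusing the single launch template defined above.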

        # Begin InstancesDistribution
        if params["SpotPrice"] is not False and \
                params["SpotAllocationCount"] is not False and \
                (params["DesiredCapacity"] - params["SpotAllocationCount"]) > 0:
            mip_usage = True
            idistribution = InstancesDistribution()
            idistribution.OnDemandAllocationStrategy = "prioritized"  # only supported value
            idistribution.OnDemandBaseCapacity = params["DesiredCapacity"] - params["SpotAllocationCount"]
            idistribution.OnDemandPercentageAboveBaseCapacity = "0"  # force the other instances to be SPOT
            idistribution.SpotMaxPrice = Ref("AWS::NoValue") if params["SpotPrice"] == "auto" else str(
                params["SpotPrice"])
            idistribution.SpotAllocationStrategy = params['SpotAllocationStrategy']
            mip.InstancesDistribution = idistribution
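            # Worked example (hypothetical values): DesiredCapacity=10 and SpotAllocationCount=4
            # gives OnDemandBaseCapacity=6; with OnDemandPercentageAboveBaseCapacity="0" the
            # remaining 4 instances are provisioned as Spot under the configured allocation strategy.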

        # End MixedPolicyInstance

        # Begin FSx for Lustre
        if params["FSxLustreConfiguration"]["fsx_lustre"] is not False:
            if params["FSxLustreConfiguration"]["existing_fsx"] is False:
                fsx_lustre = FileSystem("FSxForLustre")
                fsx_lustre.FileSystemType = "LUSTRE"
                fsx_lustre.StorageCapacity = params["FSxLustreConfiguration"]["capacity"]
                fsx_lustre.SecurityGroupIds = [params["SecurityGroupId"]]
                fsx_lustre.SubnetIds = params["SubnetId"]

                if params["FSxLustreConfiguration"]["s3_backend"] is not False:
                    fsx_lustre_configuration = LustreConfiguration()
                    fsx_lustre_configuration.ImportPath = (
                        params["FSxLustreConfiguration"]["import_path"]
                        if params["FSxLustreConfiguration"]["import_path"] is not False
                        else params["FSxLustreConfiguration"]["s3_backend"])
                    fsx_lustre_configuration.ExportPath = (
                        params["FSxLustreConfiguration"]["import_path"]
                        if params["FSxLustreConfiguration"]["import_path"] is not False
                        else params["FSxLustreConfiguration"]["s3_backend"] + "/" + params["ClusterId"] + "-fsxoutput/job-" + params["JobId"] + "/")
                    fsx_lustre.LustreConfiguration = fsx_lustre_configuration
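                    # Worked example (hypothetical values): s3_backend="s3://my-bucket" with no
                    # import_path set yields ImportPath="s3://my-bucket" and
                    # ExportPath="s3://my-bucket/<ClusterId>-fsxoutput/job-<JobId>/".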

                fsx_lustre.Tags = base_Tags(
                    # False disables PropagateAtLaunch
                    Name=str(params["ClusterId"] + "-compute-job-" + params["JobId"]),
                    _soca_JobId=str(params["JobId"]),
                    _soca_JobName=str(params["JobName"]),
                    _soca_JobQueue=str(params["JobQueue"]),
                    _soca_StackId=stack_name,
                    _soca_JobOwner=str(params["JobOwner"]),
                    _soca_JobProject=str(params["JobProject"]),
                    _soca_KeepForever=str(params["KeepForever"]).lower(),
                    _soca_FSx="true",
                    _soca_ClusterId=str(params["ClusterId"]),
                )
                t.add_resource(fsx_lustre)
        # End FSx For Lustre

        # Begin AutoScalingGroup Resource
        asg = AutoScalingGroup("AutoScalingComputeGroup")
        asg.DependsOn = "NodeLaunchTemplate"
        if mip_usage is True or len(instances_list) > 1:
            mip.LaunchTemplate = asg_lt
            asg.MixedInstancesPolicy = mip

        else:
            asg.LaunchTemplate = LaunchTemplateSpecification(
                LaunchTemplateId=Ref(lt),
                Version=GetAtt(lt, "LatestVersionNumber"))

        asg.MinSize = int(params["DesiredCapacity"])
        asg.MaxSize = int(params["DesiredCapacity"])
        asg.VPCZoneIdentifier = params["SubnetId"]

        if params["PlacementGroup"] is True:
            pg = PlacementGroup("ComputeNodePlacementGroup")
            pg.Strategy = "cluster"
            t.add_resource(pg)
            asg.PlacementGroup = Ref(pg)
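        # The "cluster" strategy packs instances close together inside a single Availability
        # Zone, which tightly coupled MPI/EFA jobs generally want, at the cost of stricter
        # capacity constraints; it is therefore only requested when PlacementGroup is True.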

        asg.Tags = Tags(
            Name=str(params["ClusterId"]) + "-compute-job-" + str(params["JobId"]),
            _soca_JobId=str(params["JobId"]),
            _soca_JobName=str(params["JobName"]),
            _soca_JobQueue=str(params["JobQueue"]),
            _soca_StackId=stack_name,
            _soca_JobOwner=str(params["JobOwner"]),
            _soca_JobProject=str(params["JobProject"]),
            _soca_KeepForever=str(params["KeepForever"]).lower(),
            _soca_ClusterId=str(params["ClusterId"]),
            _soca_NodeType="soca-compute-node")
        t.add_resource(asg)
        # End AutoScalingGroup Resource

        # Begin Custom Resource
        # Change Mapping to No if you want to disable this
        if allow_anonymous_data_collection is True:
            metrics = CustomResourceSendAnonymousMetrics("SendAnonymousData")
            metrics.ServiceToken = params["SolutionMetricLambda"]
            metrics.DesiredCapacity = str(params["DesiredCapacity"])
            metrics.InstanceType = str(params["InstanceType"])
            metrics.Efa = str(params["Efa"])
            metrics.ScratchSize = str(params["ScratchSize"])
            metrics.RootSize = str(params["RootSize"])
            metrics.SpotPrice = str(params["SpotPrice"])
            metrics.BaseOS = str(params["BaseOS"])
            metrics.StackUUID = str(params["StackUUID"])
            metrics.KeepForever = str(params["KeepForever"])
            metrics.FsxLustre = str(params["FSxLustreConfiguration"])
            t.add_resource(metrics)
            # End Custom Resource

        if debug is True:
            print(t.to_json())

        # Tags must use "soca:<Key>" syntax
        template_output = t.to_yaml().replace("_soca_", "soca:")
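        # Worked example: a tag key written as "_soca_JobId" on the troposphere objects above is
        # emitted as "soca:JobId" in the rendered YAML (":" is not valid in a Python keyword
        # argument, hence the placeholder prefix).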
        return {'success': True,
                'output': template_output}

    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        return {'success': False,
                'output': 'cloudformation_builder.py: ' + (
                            str(e) + ': error :' + str(exc_type) + ' ' + str(fname) + ' ' + str(exc_tb.tb_lineno))}
Example #20
0
# load balancer
elb_listener_80 = Listener(config['name'] + 'Ssl')
elb_listener_80.InstancePort = 80
elb_listener_80.LoadBalancerPort = 80
elb_listener_80.Protocol = 'HTTP'
elb_listener_80.InstanceProtocol = 'HTTP'

load_balancer = LoadBalancer(config['name'] + "Elb")
load_balancer.CrossZone = True
load_balancer.Listeners = [elb_listener_80]
load_balancer.Subnets = [Ref(subnet.title) for subnet in app_subnets]
load_balancer.SecurityGroups = [Ref(elb_sg)]
t.add_resource(load_balancer)
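# A hedged sketch (not part of the original example): an ELB usually also carries a health
# check, e.g. with troposphere.elasticloadbalancing.HealthCheck:
# load_balancer.HealthCheck = HealthCheck(
#     Target='HTTP:80/', Interval='30', Timeout='5',
#     HealthyThreshold='3', UnhealthyThreshold='5')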

# launch configuration for consul server
consul_block_device = EBSBlockDevice(config['name'] + 'Ebs')
consul_block_device.DeleteOnTermination = config['consul_launch_config']['block_device']['delete_on_termination']

consul_block_device_mapping = BlockDeviceMapping(config['name'] + 'ConsulBlockDeviceMapping')
consul_block_device_mapping.DeviceName = '/dev/sda1'
consul_block_device_mapping.Ebs = consul_block_device

consul_launch_config = LaunchConfiguration(config['name'] + 'ConsulLaunchConfig')
consul_launch_config.AssociatePublicIpAddress = True
consul_launch_config.EbsOptimized = config['consul_launch_config']['ebs_optimized']
consul_launch_config.ImageId = config['consul_launch_config']['image_id']
consul_launch_config.KeyName = config['consul_launch_config']['key_name']
consul_launch_config.InstanceType = config['consul_launch_config']['instance_type']
consul_launch_config.BlockDeviceMappings = [consul_block_device_mapping]
consul_launch_config.SecurityGroups = [Ref(config['name'] + 'homeSsh'), Ref(consul_sg)]
t.add_resource(consul_launch_config)
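# A hedged sketch (not part of the original example): the launch configuration only takes
# effect once an AutoScalingGroup references it, e.g. with troposphere.autoscaling.AutoScalingGroup:
# consul_asg = AutoScalingGroup(config['name'] + 'ConsulAsg')
# consul_asg.LaunchConfigurationName = Ref(consul_launch_config)
# consul_asg.MinSize = 3       # a typical Consul server quorum; value is hypothetical
# consul_asg.MaxSize = 3
# consul_asg.VPCZoneIdentifier = [Ref(subnet.title) for subnet in app_subnets]
# t.add_resource(consul_asg)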