Пример #1
0
    def test_allow_string_cluster(self):
        """Construct an EMR cluster from string-typed values and serialise it.

        Instance counts, scaling capacities and ``VisibleToAllUsers`` are
        all supplied as strings. Nothing is asserted explicitly: the test
        succeeds when building the cluster and calling ``to_dict()`` do not
        raise.
        """
        price_ref = "2"
        cond = "WithSpotPrice"

        def autoscaling(rule_set):
            # Both instance groups use the same "1".."3" capacity window and
            # differ only in the rule set requested from generate_rules().
            return emr.AutoScalingPolicy(
                Constraints=emr.ScalingConstraints(MinCapacity="1",
                                                   MaxCapacity="3"),
                Rules=self.generate_rules(rule_set))

        app_names = ("Hadoop", "Hive", "Mahout", "Pig", "Spark")
        applications = [emr.Application(Name=app) for app in app_names]

        bootstrap_action = emr.BootstrapActionConfig(
            Name='Dummy bootstrap action',
            ScriptBootstrapAction=emr.ScriptBootstrapActionConfig(
                Path='file:/usr/share/aws/emr/scripts/install-hue',
                Args=["dummy", "parameter"]))

        core_site = emr.Configuration(
            Classification="core-site",
            ConfigurationProperties={
                'hadoop.security.groups.cache.secs': '250',
            })

        # The master group is built first so generate_rules() is invoked in
        # the same order as before: master, then core.
        master_group = emr.InstanceGroupConfigProperty(
            InstanceCount="1",
            InstanceType=M4_LARGE,
            AutoScalingPolicy=autoscaling("MasterAutoScalingPolicy"))

        core_group = emr.InstanceGroupConfigProperty(
            Name="Core Instance",
            # BidPrice and Market are both selected by the same condition.
            BidPrice=If(cond, Ref(price_ref), Ref("AWS::NoValue")),
            Market=If(cond, "SPOT", "ON_DEMAND"),
            InstanceCount="1",
            InstanceType=M4_LARGE,
            AutoScalingPolicy=autoscaling("CoreAutoScalingPolicy"))

        instances = emr.JobFlowInstancesConfig(
            Ec2KeyName="KeyName",
            Ec2SubnetId="SubnetId",
            MasterInstanceGroup=master_group,
            CoreInstanceGroup=core_group)

        cluster = emr.Cluster(
            'Cluster',
            # AdditionalInfo="Additional Info",
            Applications=applications,
            BootstrapActions=[bootstrap_action],
            Configurations=[core_site],
            Instances=instances,
            JobFlowRole="EMRJobFlowRole",
            LogUri="s3://cluster-logs",
            Name="EMR Cluster",
            ReleaseLabel="emr-5.5.0",
            ServiceRole="EMRServiceRole",
            AutoScalingRole="EMR_AutoScaling_DefaultRole",
            VisibleToAllUsers="true",
            Tags=Tags(Name="EMR Sample Cluster"))

        cluster.to_dict()
Пример #2
0
                JobFlowRole=Ref(emr_instance_profile),
                ServiceRole=Ref(emr_service_role),
                Instances=emr.JobFlowInstancesConfig(
                    MasterInstanceGroup=emr.InstanceGroupConfigProperty(
                        Name="Master Instance",
                        InstanceCount="1",
                        InstanceType="m3.xlarge",
                        Market="ON_DEMAND"),
                    CoreInstanceGroup=emr.InstanceGroupConfigProperty(
                        Name="Core Instance",
                        BidPrice="20",
                        InstanceCount="1",
                        InstanceType="m3.xlarge",
                        Market="SPOT")),
                Applications=[
                    emr.Application(Name="Hadoop"),
                    emr.Application(Name="Hive"),
                    emr.Application(Name="Mahout"),
                    emr.Application(Name="Pig"),
                    emr.Application(Name="Spark")
                ],
                VisibleToAllUsers="true",
                Tags=Tags(Name="EMR Sample Cluster")))

step = template.add_resource(
    emr.Step('TestStep',
             Name="TestStep",
             ActionOnFailure='CONTINUE',
             HadoopJarStep=emr.HadoopJarStepConfig(
                 Args=["5", "10"],
                 Jar="s3://emr-cfn-test/hadoop-mapreduce-examples-2.6.0.jar",
Пример #3
0
    def test_allow_string_cluster(self):
        """Build a cluster with a security configuration plus a task group.

        The first half constructs an ``emr.SecurityConfiguration`` and an
        ``emr.Cluster`` that references it, passing counts, capacities and
        ``VisibleToAllUsers`` as strings, then serialises the cluster with
        ``to_dict()``. The second half builds a three-rule autoscaling
        policy and attaches it to a task ``emr.InstanceGroupConfig`` that
        references the cluster. Nothing is asserted explicitly; the test
        succeeds when none of these steps raises.
        """
        def group_autoscaling(rule_set):
            # Master and core groups use the same "1".."3" capacity window
            # and differ only in the rule set taken from generate_rules().
            return emr.AutoScalingPolicy(
                Constraints=emr.ScalingConstraints(MinCapacity="1",
                                                   MaxCapacity="3"),
                Rules=self.generate_rules(rule_set))

        def scaling_rule(name, description, adjustment_type, adjustment,
                         comparison, metric, threshold):
            # Each rule pairs one simple scaling adjustment with one
            # CloudWatch alarm; 300 is used for both CoolDown and Period,
            # and the alarm carries a single JobFlowId dimension.
            return emr.ScalingRule(
                Name=name,
                Description=description,
                Action=emr.ScalingAction(
                    SimpleScalingPolicyConfiguration=(
                        emr.SimpleScalingPolicyConfiguration(
                            AdjustmentType=adjustment_type,
                            CoolDown=300,
                            ScalingAdjustment=adjustment))),
                Trigger=emr.ScalingTrigger(
                    CloudWatchAlarmDefinition=emr.CloudWatchAlarmDefinition(
                        ComparisonOperator=comparison,
                        MetricName=metric,
                        Period=300,
                        Threshold=threshold,
                        Dimensions=[
                            emr.MetricDimension(Key='JobFlowId',
                                                Value='${emr.clusterId}')
                        ])))

        # `security_configuration` is not defined in this method; it is
        # expected to be provided by the enclosing module.
        sec_config = emr.SecurityConfiguration(
            'emrsecurityconfiguration',
            Name="EMRSecurityConfiguration",
            SecurityConfiguration=security_configuration)

        price_ref = "2"
        cond = "WithSpotPrice"

        app_names = ("Hadoop", "Hive", "Mahout", "Pig", "Spark")
        applications = [emr.Application(Name=app) for app in app_names]

        bootstrap_action = emr.BootstrapActionConfig(
            Name='Dummy bootstrap action',
            ScriptBootstrapAction=emr.ScriptBootstrapActionConfig(
                Path='file:/usr/share/aws/emr/scripts/install-hue',
                Args=["dummy", "parameter"]))

        core_site = emr.Configuration(
            Classification="core-site",
            ConfigurationProperties={
                'hadoop.security.groups.cache.secs': '250',
            })

        # Master before core keeps the generate_rules() call order intact.
        master_group = emr.InstanceGroupConfigProperty(
            InstanceCount="1",
            InstanceType=M4_LARGE,
            AutoScalingPolicy=group_autoscaling("MasterAutoScalingPolicy"))

        core_group = emr.InstanceGroupConfigProperty(
            Name="Core Instance",
            # BidPrice and Market are both selected by the same condition.
            BidPrice=If(cond, Ref(price_ref), Ref("AWS::NoValue")),
            Market=If(cond, "SPOT", "ON_DEMAND"),
            InstanceCount="1",
            InstanceType=M4_LARGE,
            AutoScalingPolicy=group_autoscaling("CoreAutoScalingPolicy"))

        instances = emr.JobFlowInstancesConfig(
            Ec2KeyName="KeyName",
            Ec2SubnetId="SubnetId",
            MasterInstanceGroup=master_group,
            CoreInstanceGroup=core_group)

        cluster = emr.Cluster(
            'Cluster',
            # AdditionalInfo="Additional Info",
            Applications=applications,
            BootstrapActions=[bootstrap_action],
            Configurations=[core_site],
            Instances=instances,
            JobFlowRole="EMRJobFlowRole",
            LogUri="s3://cluster-logs",
            Name="EMR Cluster",
            ReleaseLabel="emr-5.5.0",
            SecurityConfiguration=Ref(sec_config),
            ServiceRole="EMRServiceRole",
            AutoScalingRole="EMR_AutoScaling_DefaultRole",
            VisibleToAllUsers="true",
            Tags=Tags(Name="EMR Sample Cluster"))

        cluster.to_dict()

        # The task-group policy uses integer capacities, unlike the string
        # capacities used for the master and core groups above.
        task_constraints = emr.ScalingConstraints(MinCapacity=0,
                                                  MaxCapacity=5)
        task_rules = [
            scaling_rule('ScaleUpContainerPending',
                         'Scale up on over-provisioned containers',
                         emr.CHANGE_IN_CAPACITY, 1,
                         'GREATER_THAN', 'ContainerPendingRatio', 0.75),
            # This rule passes the adjustment type as a plain string
            # rather than the emr constant, exactly as before.
            scaling_rule('ScaleUpMemory',
                         'Scale up on low memory',
                         'CHANGE_IN_CAPACITY', 1,
                         'LESS_THAN', 'YARNMemoryAvailablePercentage', 15),
            scaling_rule('ScaleDownMemory',
                         'Scale down on high memory',
                         emr.CHANGE_IN_CAPACITY, -1,
                         'GREATER_THAN', 'YARNMemoryAvailablePercentage',
                         75),
        ]
        task_policy = emr.AutoScalingPolicy(Constraints=task_constraints,
                                            Rules=task_rules)

        # The instance group is only constructed; the result is discarded.
        emr.InstanceGroupConfig(
            'TaskInstanceGroup',
            AutoScalingPolicy=task_policy,
            InstanceCount=0,
            InstanceType=M4_LARGE,
            InstanceRole='TASK',
            Market='ON_DEMAND',
            Name='Task Instance',
            JobFlowId=Ref(cluster))
Пример #4
0
                            "PYTHONPATH":
                            os.path.join('/home/hadoop/miniconda',
                                         'bin/python') +
                            ":/usr/lib/spark/python/:$PYTHONPATH",
                            "PYSPARK_DRIVER_PYTHON":
                            os.path.join('/home/hadoop/miniconda',
                                         'bin/python'),
                            "SPARK_HOME":
                            "/usr/lib/spark",
                            "PYTHONHASHSEED":
                            "123"
                        })
                ]),
        ],
        Applications=[
            emr.Application(Name=app) for app in cfg['applications']
        ],
        VisibleToAllUsers='true',
        Tags=Tags(Name='jupyter-cluster')))

# Outputs: a reference to the cluster resource and the cluster's
# MasterPublicDNS attribute.
template.add_output(
    [
        Output(
            "EMRCluster",
            Description="EMRCluster",
            Value=Ref(cluster),
        ),
        # NOTE(review): this Description repeats the text used for the
        # output above; possibly a copy-paste leftover -- confirm.
        Output(
            "EMRClusterMasterDNS",
            Description="EMRCluster",
            Value=GetAtt(cluster, "MasterPublicDNS"),
        ),
    ]
)

# Render the finished template as 4-space-indented JSON and print it.
template_json = template.to_json(indent=4)
print(template_json)
Пример #5
0
def build(ssh_keypair_name):
    """Build a CloudFormation template for an EMR cluster in its own VPC.

    The template declares, in order: the ``KeyName`` and ``SSHLocation``
    parameters, a VPC with one subnet routed through an internet gateway,
    a network ACL with TCP allow rules, a security group admitting port 22
    from ``SSHLocation``, the EMR service and job-flow IAM roles with an
    instance profile, and finally the EMR cluster itself.

    Args:
        ssh_keypair_name: Used as the default value of the ``KeyName``
            template parameter.

    Returns:
        The populated ``Template``. Its single output, ``MasterPublicDNS``,
        exposes the cluster attribute of the same name.
    """
    template = Template()
    template.set_version("2010-09-09")

    # --- Parameters -------------------------------------------------------
    # The descriptions are written as adjacent string literals. A backslash
    # continuation inside the literal would pull the next source line's
    # indentation into the emitted description text.
    keyname_param = template.add_parameter(
        Parameter(
            "KeyName",
            ConstraintDescription=(
                "must be the name of an existing EC2 KeyPair."),
            Description=(
                "Name of an existing EC2 KeyPair to enable SSH access to "
                "the instance"),
            Type="AWS::EC2::KeyPair::KeyName",
            Default=ssh_keypair_name,
        ))

    sshlocation_param = template.add_parameter(
        Parameter(
            "SSHLocation",
            # NOTE(review): the leading space is carried over unchanged
            # from the original text; confirm whether it is intentional.
            Description=(
                " The IP address range that can be used to SSH to the EC2 "
                "instances"),
            Type="String",
            MinLength="9",
            MaxLength="18",
            Default="0.0.0.0/0",
            AllowedPattern=(
                r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})/(\d{1,2})"),
            ConstraintDescription=(
                "must be a valid IP CIDR range of the form x.x.x.x/x."),
        ))

    # --- Network: VPC, subnet, internet gateway and default route ---------
    vpc = template.add_resource(VPC("VPC", CidrBlock="10.0.0.0/16"))

    subnet = template.add_resource(
        Subnet(
            "Subnet",
            CidrBlock="10.0.0.0/24",
            VpcId=Ref(vpc),
        ))

    internet_gateway = template.add_resource(
        InternetGateway("InternetGateway"))
    attach_gateway = template.add_resource(
        VPCGatewayAttachment("AttachGateway",
                             VpcId=Ref(vpc),
                             InternetGatewayId=Ref(internet_gateway)))
    route_table = template.add_resource(
        RouteTable("RouteTable", VpcId=Ref(vpc)))

    template.add_resource(
        Route(
            "Route",
            # The route is declared as depending on the gateway attachment.
            DependsOn=attach_gateway,
            GatewayId=Ref(internet_gateway),
            DestinationCidrBlock="0.0.0.0/0",
            RouteTableId=Ref(route_table),
        ))

    template.add_resource(
        SubnetRouteTableAssociation(
            "SubnetRouteTableAssociation",
            SubnetId=Ref(subnet),
            RouteTableId=Ref(route_table),
        ))

    # --- Network ACL ------------------------------------------------------
    network_acl = template.add_resource(
        NetworkAcl("NetworkAcl", VpcId=Ref(vpc)))

    # Every entry is an "allow" rule for protocol "6" from/to 0.0.0.0/0;
    # only the columns below vary:
    # (title, rule number, first port, last port, egress flag)
    acl_rules = (
        ("InboundSSHNetworkAclEntry",
         "101", "22", "22", "false"),
        ("InboundResponsePortsNetworkAclEntry",
         "102", "1024", "65535", "false"),
        ("OutBoundResponsePortsNetworkAclEntry",
         "103", "1024", "65535", "true"),
        ("OutBoundHTTPPortsNetworkAclEntry",
         "104", "80", "80", "true"),
        ("OutBoundHTTPSPortsNetworkAclEntry",
         "105", "443", "443", "true"),
        ("OutBoundSSHPortsNetworkAclEntry",
         "106", "22", "22", "true"),
    )
    for title, rule_number, from_port, to_port, egress in acl_rules:
        template.add_resource(
            NetworkAclEntry(
                title,
                NetworkAclId=Ref(network_acl),
                RuleNumber=rule_number,
                Protocol="6",  # IP protocol number 6 is TCP
                PortRange=PortRange(To=to_port, From=from_port),
                Egress=egress,
                RuleAction="allow",
                CidrBlock="0.0.0.0/0",
            ))

    template.add_resource(
        SubnetNetworkAclAssociation("SubnetNetworkAclAssociation",
                                    SubnetId=Ref(subnet),
                                    NetworkAclId=Ref(network_acl)))

    # --- Security group ---------------------------------------------------
    emr_security_group = template.add_resource(
        SecurityGroup(
            "EMRSecurityGroup",
            GroupDescription="Enable SSH access via port 22",
            SecurityGroupIngress=[
                SecurityGroupRule(IpProtocol="tcp",
                                  FromPort="22",
                                  ToPort="22",
                                  CidrIp=Ref(sshlocation_param)),
            ],
            VpcId=Ref(vpc),
        ))

    # --- IAM roles and instance profile -----------------------------------
    emr_service_role = template.add_resource(
        iam.Role(
            "EMRServiceRole",
            AssumeRolePolicyDocument={
                "Statement": [{
                    "Effect": "Allow",
                    "Principal": {
                        "Service": ["elasticmapreduce.amazonaws.com"]
                    },
                    "Action": ["sts:AssumeRole"],
                }]
            },
            ManagedPolicyArns=[
                "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceRole"
            ],
        ))

    emr_job_flow_role = template.add_resource(
        iam.Role(
            "EMRJobFlowRole",
            AssumeRolePolicyDocument={
                "Statement": [{
                    "Effect": "Allow",
                    "Principal": {
                        "Service": ["ec2.amazonaws.com"]
                    },
                    "Action": ["sts:AssumeRole"]
                }]
            },
            ManagedPolicyArns=[
                "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"
            ],
        ))

    emr_instance_profile = template.add_resource(
        iam.InstanceProfile("EMRInstanceProfile",
                            Roles=[Ref(emr_job_flow_role)]))

    # --- EMR cluster ------------------------------------------------------
    cluster = template.add_resource(
        emr.Cluster(
            "EMRCluster",
            ReleaseLabel="emr-6.0.0",
            # JobFlowRole points at the instance profile wrapping the
            # job-flow role, not at the role resource itself.
            JobFlowRole=Ref(emr_instance_profile),
            ServiceRole=Ref(emr_service_role),
            Instances=emr.JobFlowInstancesConfig(
                Ec2KeyName=Ref(keyname_param),
                Ec2SubnetId=Ref(subnet),
                # The same security group is used for both node roles.
                EmrManagedMasterSecurityGroup=Ref(emr_security_group),
                EmrManagedSlaveSecurityGroup=Ref(emr_security_group),
                MasterInstanceGroup=emr.InstanceGroupConfigProperty(
                    Name="Master Instance",
                    InstanceCount="1",
                    InstanceType=M4_LARGE,
                    Market="ON_DEMAND",
                ),
                CoreInstanceGroup=emr.InstanceGroupConfigProperty(
                    Name="Core Instance",
                    EbsConfiguration=emr.EbsConfiguration(
                        EbsBlockDeviceConfigs=[
                            emr.EbsBlockDeviceConfigs(
                                VolumeSpecification=emr.VolumeSpecification(
                                    SizeInGB="10", VolumeType="gp2"),
                                VolumesPerInstance="1",
                            )
                        ],
                        EbsOptimized="true",
                    ),
                    InstanceCount="1",
                    InstanceType=M4_LARGE,
                ),
            ),
            Applications=[emr.Application(Name="Spark")],
            VisibleToAllUsers="true",
        ))

    template.add_output(
        [Output("MasterPublicDNS", Value=GetAtt(cluster, "MasterPublicDNS"))])
    return template