예제 #1
0
    def make_job(self,
                 role_name,
                 num_tasks=1,
                 skip_existing_job_validation=False,
                 **kwargs):
        """Return a Job for ``role_name``, reusing or launching EC2 instances.

        If instances already exist for the job name they are reused (stopped
        ones are started and waited for); otherwise ``num_tasks`` new
        instances are launched and tagged with per-task names. The resulting
        Job is appended to ``self.jobs`` and returned.

        Args:
            role_name: Role used, together with ``self.name``, to build the
                job name.
            num_tasks: Number of instances the job should consist of.
            skip_existing_job_validation: If True, don't check that an
                existing job on the server has the same number of tasks as
                requested.
            **kwargs: Launch parameters. ``instance_type`` must be passed
                here directly; the rest are read after merging with
                ``self.kwargs``:
                ``ami`` / ``ami_name`` (exactly one is required when
                launching), ``availability_zone`` (falls back to the ``ZONE``
                environment variable), ``placement_group``,
                ``use_placement_group`` (use ``self.placement_group_name``;
                mutually exclusive with ``placement_group``),
                ``install_script``, ``skip_efs_mount``, ``linux_type``
                (default ``'ubuntu'``), ``user_data`` (must start with
                ``#!/bin/bash``), ``ebs`` (passed as ``BlockDeviceMappings``),
                ``use_spot``, ``monitoring`` (default True).

        Returns:
            The newly created Job.

        Raises:
            AssertionError: On inconsistent arguments, a task-count mismatch
                with an existing job, or an unknown availability zone.
            KeyError: If ``instance_type`` is missing, or no availability
                zone is given and ``ZONE`` is not set.
            SystemExit: With status 1 if ``create_instances`` fails.
        """
        assert num_tasks >= 0

        # TODO: document launch parameters
        job_name = u.format_job_name(role_name, self.name)
        # NOTE(review): instance_type is read from the call-site kwargs before
        # the run-level defaults in self.kwargs are merged in, so it must be
        # passed explicitly to this call -- confirm this is intended.
        instance_type = kwargs['instance_type']
        instances = u.lookup_aws_instances(job_name,
                                           instance_type=instance_type)
        kwargs = u.merge_kwargs(kwargs, self.kwargs)
        ami = kwargs.get('ami', '')
        ami_name = kwargs.get('ami_name', '')
        availability_zone = kwargs.get('availability_zone', '')
        if not availability_zone:
            availability_zone = os.environ['ZONE']
        placement_group = kwargs.get('placement_group', '')

        # automatically generated placement_group_name
        use_placement_group = kwargs.get('use_placement_group', False)
        assert not use_placement_group or placement_group == ''
        if use_placement_group:
            placement_group = self.placement_group_name

        install_script = kwargs.get('install_script', '')
        skip_efs_mount = kwargs.get('skip_efs_mount', False)
        linux_type = kwargs.get('linux_type', 'ubuntu')
        # TODO: use heuristics to tell linux type from AMI name
        user_data = kwargs.get('user_data', '')

        if user_data:
            assert user_data.startswith('#!/bin/bash')

        ebs = kwargs.get('ebs', '')
        use_spot = kwargs.get('use_spot', False)
        monitoring = kwargs.get('monitoring', True)

        # Always install tmux on Amazon linux types. User data is only run as
        # a shell script when it begins with a '#!' line, so supply one when
        # the caller gave no script, and keep the command on its own line
        # instead of gluing it onto the caller's last line.
        if linux_type == 'amazon':
            if not user_data:
                user_data = '#!/bin/bash\n'
            elif not user_data.endswith('\n'):
                user_data += '\n'
            user_data += 'sudo yum install tmux -y'

        if user_data:
            user_data += '\necho userdata_ok >> /tmp/is_initialized\n'

        # TODO: also make sure instance type is the same
        if instances:
            if not skip_existing_job_validation:
                assert len(instances) == num_tasks, (
                    "Found job with same name %s(%s), but number of tasks %d doesn't match requested %d, kill job manually."
                    %
                    (job_name, instances[0].state, len(instances), num_tasks))

            print("Found existing job " + job_name)

            # Kick off all stopped instances first so they boot in parallel,
            # then block on each one in turn.
            restarted = []
            for instance in instances:
                if instance.state['Name'] == 'stopped':
                    instance.start()
                    restarted.append(instance)

            for instance in restarted:
                print("Waiting for instance %s to start" % instance.id)
                instance.wait_until_running()
                # Refresh cached attributes (state, addresses) that were
                # loaded while the instance was still stopped.
                instance.reload()

            print(instances)
        else:
            print("Launching new job %s into VPC %s" %
                  (job_name, u.get_resource_name()))

            assert not (
                ami and ami_name
            ), "Must have only one of ami and ami_name, got " + ami + ", " + ami_name
            assert ami or ami_name, "Must specify at least one of ami and ami_name"
            if ami_name:
                ami = u.lookup_ami_id(ami_name).id
            security_group = u.get_security_group_dict()[u.get_resource_name()]

            keypair = u.get_keypair_dict()[u.get_keypair_name()]
            vpc = u.get_vpc_dict()[u.get_resource_name()]
            subnet_dict = u.get_subnet_dict(vpc)
            region = u.get_region()
            assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION=%s)" % (
                availability_zone, region, ', '.join(
                    subnet_dict.keys()), availability_zone[:-1])
            subnet = subnet_dict[availability_zone]
            ec2 = u.create_ec2_resource()
            u.maybe_create_placement_group(placement_group)

            self.log("Requesting %d %s" % (num_tasks, instance_type))

            args = {
                'ImageId': ami,
                'InstanceType': instance_type,
                'MinCount': num_tasks,
                'MaxCount': num_tasks,
                'KeyName': keypair.name
            }

            # storage setup
            if ebs:
                args['BlockDeviceMappings'] = ebs
            # network setup
            # TODO: get rid of zone? Zone seems to be required for constructor
            # that allows to enable AssociatePublicIpAddress field
            args['NetworkInterfaces'] = [{
                'SubnetId': subnet.id,
                'DeviceIndex': 0,
                'AssociatePublicIpAddress': True,
                'Groups': [security_group.id]
            }]

            placement_arg = {'AvailabilityZone': availability_zone}
            if placement_group:
                placement_arg['GroupName'] = placement_group
            args['Placement'] = placement_arg

            if monitoring:
                args['Monitoring'] = {'Enabled': True}
            args['UserData'] = user_data

            if use_spot:
                instances = u.create_spot_instances(args)
            else:
                try:
                    instances = ec2.create_instances(**args)
                except Exception as e:
                    print(f"Instance creation failed with ({e})")
                    print("Account number: ", u.get_account_number())
                    print("Region: ", u.get_region())
                    # Exit with a failure status so calling scripts can tell
                    # the launch did not succeed.
                    sys.exit(1)

            assert instances
            assert len(instances) == num_tasks

            # TODO: make instances match their launch indices. This way
            # tasks can figure out which # they are
            for (task_num, instance) in enumerate(instances):
                # Tagging can fail right after launch (e.g.
                # InvalidInstanceID.NotFound), so retry until it succeeds.
                while True:
                    try:
                        task_name = u.format_task_name(task_num, job_name)
                        instance.create_tags(Tags=u.make_name(task_name))
                        break
                    except Exception as e:
                        self.log(
                            "create_tags failed with %s, retrying in %d seconds"
                            % (str(e), TIMEOUT_SEC))
                        time.sleep(TIMEOUT_SEC)

        job = Job(self,
                  job_name,
                  instances=instances,
                  install_script=install_script,
                  linux_type=linux_type,
                  user_data=user_data,
                  skip_efs_mount=skip_efs_mount)
        self.jobs.append(job)
        return job
예제 #2
0
    def make_job(self, role_name, num_tasks=1, **kwargs):
        """Build the Job for ``role_name``, launching instances if none exist.

        Call-site ``kwargs`` are merged with ``self.kwargs``. ``ami``,
        ``instance_type`` and ``availability_zone`` are required;
        ``placement_group``, ``install_script``, ``skip_efs_mount``,
        ``linux_type`` (default ``'ubuntu'``) and ``user_data`` are optional.

        When instances are already registered under the job name, their count
        must equal ``num_tasks``. Otherwise ``num_tasks`` instances are
        created and each is tagged with its task name. The resulting Job is
        appended to ``self.jobs`` and returned.
        """
        assert num_tasks >= 0

        # TODO: document launch parameters
        job_name = u.format_job_name(role_name, self.name)
        instances = u.lookup_aws_instances(job_name)
        options = u.merge_kwargs(kwargs, self.kwargs)

        # Mandatory settings.
        ami = options['ami']
        instance_type = options['instance_type']
        availability_zone = options['availability_zone']

        # Optional settings with their defaults.
        placement_group = options.get('placement_group', '')
        install_script = options.get('install_script', '')
        skip_efs_mount = options.get('skip_efs_mount', False)
        linux_type = options.get('linux_type', 'ubuntu')
        user_data = options.get('user_data', '')

        # Append a marker command to any caller-supplied script.
        if user_data:
            user_data = user_data + '\necho userdata_ok >> /tmp/is_initialized\n'

        # TODO: also make sure instance type is the same
        if not instances:
            print("Launching new job %s into VPC %s" %
                  (job_name, u.get_resource_name()))

            security_groups = u.get_security_group_dict()
            security_group = security_groups[u.get_resource_name()]
            keypairs = u.get_keypair_dict()
            keypair = keypairs[u.get_keypair_name()]
            vpcs = u.get_vpc_dict()
            vpc = vpcs[u.get_resource_name()]
            subnets = u.get_subnet_dict(vpc)
            region = u.get_region()
            assert availability_zone in subnets, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION)" % (
                availability_zone, region, ', '.join(subnets.keys()))
            subnet = subnets[availability_zone]
            ec2 = u.create_ec2_resource()
            u.maybe_create_placement_group(placement_group)

            self.log("Requesting %d %s" % (num_tasks, instance_type))

            placement = {'AvailabilityZone': availability_zone}
            if placement_group:
                placement['GroupName'] = placement_group

            launch_spec = {
                'ImageId': ami,
                'InstanceType': instance_type,
                'MinCount': num_tasks,
                'MaxCount': num_tasks,
                'KeyName': keypair.name,
                # network setup
                'NetworkInterfaces': [{
                    'SubnetId': subnet.id,
                    'DeviceIndex': 0,
                    'AssociatePublicIpAddress': True,
                    'Groups': [security_group.id]
                }],
                'Placement': placement,
                'UserData': user_data,
            }

            instances = ec2.create_instances(**launch_spec)
            assert len(instances) == num_tasks

            # Assign proper names to tasks, retrying each one until tagging
            # succeeds.
            for launched in instances:
                tagged = False
                while not tagged:
                    try:
                        # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                        task_name = u.format_task_name(
                            launched.ami_launch_index, role_name, self.name)
                        # TODO: use instance.create_tags instead like in create_resources.py
                        ec2.create_tags(Resources=[launched.id],
                                        Tags=u.make_name(task_name))
                        tagged = True
                    except Exception as err:
                        self.log(
                            "create_tags failed with %s, retrying in %d seconds"
                            % (str(err), TIMEOUT_SEC))
                        time.sleep(TIMEOUT_SEC)
        else:
            found_count = len(instances)
            assert found_count == num_tasks, (
                "Found job with same name, but number of tasks %d doesn't match requested %d, kill job manually."
                % (found_count, num_tasks))
            print("Found existing job " + job_name)

        job = Job(self,
                  job_name,
                  instances=instances,
                  install_script=install_script,
                  linux_type=linux_type,
                  user_data=user_data,
                  skip_efs_mount=skip_efs_mount)
        self.jobs.append(job)
        return job