Example #1
    def _createSecurityGroup(cls, ctx, name):
        def groupNotFound(e):
            retry = (e.status == 400 and 'does not exist in default VPC' in e.body)
            return retry

        # security group create/get. ssh + all ports open within the group
        try:
            web = ctx.ec2.create_security_group(name, 'Toil appliance security group')
        except EC2ResponseError as e:
            if e.status == 400 and 'already exists' in e.body:
                pass  # group exists- nothing to do
            else:
                raise
        else:
            for attempt in retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    # open port 22 for ssh-ing
                    web.authorize(ip_protocol='tcp', from_port=22, to_port=22, cidr_ip='0.0.0.0/0')
            for attempt in retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    # the following authorizes all port access within the web security group
                    web.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=web)
            for attempt in retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    # open port 5050-5051 for mesos web interface
                    web.authorize(ip_protocol='tcp', from_port=5050, to_port=5051, cidr_ip='0.0.0.0/0')
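Note: every snippet on this page drives the same retry() helper from Toil. It is a generator that yields one context manager per attempt; an attempt whose with-block finishes cleanly stops the generator, while an exception matching the predicate is swallowed, slept on, and retried until the timeout expires. The sketch below illustrates that contract only; it is modeled loosely on the older toil.lib.retry helper, and the real implementation differs in its details (argument validation, logging, delay handling).

import time
from contextlib import contextmanager

def retry(delays=(0, 1, 1, 4, 16, 64), timeout=300, predicate=lambda e: False):
    # Sketch only: yield one context manager per attempt. A clean exit stops
    # the generator; an exception matching `predicate` is retried (after a
    # sleep) as long as the overall `timeout` has not been exceeded.
    go = [None]                          # non-empty while another attempt is needed
    expiration = time.time() + timeout

    @contextmanager
    def attempt(delay):
        try:
            yield
        except Exception as e:
            if predicate(e) and time.time() + delay < expiration:
                time.sleep(delay)        # retryable and still within the deadline
            else:
                raise                    # not retryable, or out of time
        else:
            go.pop()                     # success: stop yielding attempts

    delays = iter(delays)
    delay = next(delays)
    while go:
        yield attempt(delay)
        delay = next(delays, delay)      # reuse the last delay once exhausted

Callers then write exactly what the examples on this page show: for attempt in retry(...): with attempt: <the call to protect>.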
Example #2
 def _createSecurityGroup(cls, ctx, name, vpcSubnet=None):
     def groupNotFound(e):
         retry = (e.status == 400 and 'does not exist in default VPC' in e.body)
         return retry
     vpcId = None
     if vpcSubnet:
         conn = boto.connect_vpc(region=ctx.ec2.region)
         subnets = conn.get_all_subnets(subnet_ids=[vpcSubnet])
         if len(subnets) > 0:
             vpcId = subnets[0].vpc_id
     # security group create/get. ssh + all ports open within the group
     try:
         web = ctx.ec2.create_security_group(name, 'Toil appliance security group', vpc_id=vpcId)
     except EC2ResponseError as e:
         if e.status == 400 and 'already exists' in e.body:
             pass  # group exists- nothing to do
         else:
             raise
     else:
         for attempt in retry(predicate=groupNotFound, timeout=300):
             with attempt:
                 # open port 22 for ssh-ing
                 web.authorize(ip_protocol='tcp', from_port=22, to_port=22, cidr_ip='0.0.0.0/0')
         for attempt in retry(predicate=groupNotFound, timeout=300):
             with attempt:
                 # the following authorizes all port access within the web security group
                 web.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=web)
     out = []
     for sg in ctx.ec2.get_all_security_groups():
         if sg.name == name and (vpcId is None or sg.vpc_id == vpcId):
             out.append(sg)
     return out
Example #3
 def _createSecurityGroup(cls, ctx, name, vpcSubnet=None):
     def groupNotFound(e):
         retry = (e.status == 400 and 'does not exist in default VPC' in e.body)
         return retry
     vpcId = None
     if vpcSubnet:
         conn = boto.connect_vpc(region=ctx.ec2.region)
         subnets = conn.get_all_subnets(subnet_ids=[vpcSubnet])
         if len(subnets) > 0:
             vpcId = subnets[0].vpc_id
     # security group create/get. ssh + all ports open within the group
     try:
         web = ctx.ec2.create_security_group(name, 'Toil appliance security group', vpc_id=vpcId)
     except EC2ResponseError as e:
         if e.status == 400 and 'already exists' in e.body:
             pass  # group exists- nothing to do
         else:
             raise
     else:
         for attempt in retry(predicate=groupNotFound, timeout=300):
             with attempt:
                 # open port 22 for ssh-ing
                 web.authorize(ip_protocol='tcp', from_port=22, to_port=22, cidr_ip='0.0.0.0/0')
         for attempt in retry(predicate=groupNotFound, timeout=300):
             with attempt:
                 # the following authorizes all port access within the web security group
                 web.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=web)
     out = []
     for sg in ctx.ec2.get_all_security_groups():
         if sg.name == name and (vpcId is None or sg.vpc_id == vpcId):
             out.append(sg)
     return out
Example #4
 def _getNodesInCluster(cls,
                        ctx,
                        clusterName,
                        preemptable=False,
                        both=False):
     for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
         with attempt:
             pendingInstances = ctx.ec2.get_only_instances(
                 filters={
                     'instance.group-name': clusterName,
                     'instance-state-name': 'pending'
                 })
     for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
         with attempt:
             runningInstances = ctx.ec2.get_only_instances(
                 filters={
                     'instance.group-name': clusterName,
                     'instance-state-name': 'running'
                 })
     instances = set(pendingInstances)
     if not preemptable and not both:
         return [
             x for x in instances.union(set(runningInstances))
             if x.spot_instance_request_id is None
         ]
     elif preemptable and not both:
         return [
             x for x in instances.union(set(runningInstances))
             if x.spot_instance_request_id is not None
         ]
     elif both:
         return [x for x in instances.union(set(runningInstances))]
Example #5
    def _getProfileARN(cls, ctx):
        def addRoleErrors(e):
            return e.status == 404

        def throttleError(e):
            return isinstance(
                e, BotoServerError
            ) and e.status == 400 and e.error_code == 'Throttling'

        def truncExpBackoff():
            # as recommended here https://forums.aws.amazon.com/thread.jspa?messageID=406788#406788
            yield 0
            t = 1
            while t < 1024:
                yield t
                t *= 2
            while True:
                yield t

        for attempt in retry(delays=truncExpBackoff(),
                             predicate=throttleError):
            with attempt:
                roleName = 'toil'
                policy = dict(iam_full=iamFullPolicy,
                              ec2_full=ec2FullPolicy,
                              s3_full=s3FullPolicy,
                              sbd_full=sdbFullPolicy)
                iamRoleName = ctx.setup_iam_ec2_role(role_name=roleName,
                                                     policies=policy)

                try:
                    profile = ctx.iam.get_instance_profile(iamRoleName)
                except BotoServerError as e:
                    if e.status == 404:
                        profile = ctx.iam.create_instance_profile(iamRoleName)
                        profile = profile.create_instance_profile_response.create_instance_profile_result
                    else:
                        raise
                else:
                    profile = profile.get_instance_profile_response.get_instance_profile_result
                profile = profile.instance_profile
                profile_arn = profile.arn

                if len(profile.roles) > 1:
                    raise RuntimeError(
                        'Did not expect profile to contain more than one role')
                elif len(profile.roles) == 1:
                    # this should be profile.roles[0].role_name
                    if profile.roles.member.role_name == iamRoleName:
                        return profile_arn
                    else:
                        ctx.iam.remove_role_from_instance_profile(
                            iamRoleName, profile.roles.member.role_name)
                for attempt in retry(predicate=addRoleErrors):
                    with attempt:
                        ctx.iam.add_role_to_instance_profile(
                            iamRoleName, iamRoleName)
        return profile_arn
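The truncExpBackoff generator above implements the truncated exponential backoff the comment refers to: it yields 0 once, then doubles from 1 up to 512, and finally repeats 1024 forever. Pulling the generator out of the method, its first values can be checked like this (a quick sanity-check sketch, not part of the original code):

from itertools import islice
print(list(islice(truncExpBackoff(), 13)))
# -> [0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 1024]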
Example #6
    def addNodes(self, nodeType, numNodes, preemptable):
        instanceType = ec2_instance_types[nodeType]
        bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self.nodeStorage)
        arn = self._getProfileARN(self.ctx)
        keyPath = '' if not self.config or not self.config.sseKey else self.config.sseKey
        entryPoint = 'mesos-slave' if not self.config or not self.config.sseKey else "waitForKey.sh"
        workerData = dict(role='worker',
                          image=applianceSelf(),
                          entrypoint=entryPoint,
                          sshKey=self.masterPublicKey,
                          args=workerArgs.format(ip=self.leaderIP, preemptable=preemptable, keyPath=keyPath))
        userData = awsUserData.format(**workerData)
        sgs = [sg for sg in self.ctx.ec2.get_all_security_groups() if sg.name == self.clusterName]
        kwargs = {'key_name': self.keyName,
                  'security_group_ids': [sg.id for sg in sgs],
                  'instance_type': instanceType.name,
                  'user_data': userData,
                  'block_device_map': bdm,
                  'instance_profile_arn': arn,
                  'placement': getCurrentAWSZone()}
        kwargs["subnet_id"] = self.subnetID if self.subnetID else self._getClusterInstance(self.instanceMetaData).subnet_id

        instancesLaunched = []

        for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
            with attempt:
                # after we start launching instances we want to ensure the full setup is done
                # the biggest obstacle is AWS request throttling, so we retry on these errors at
                # every request in this method
                if not preemptable:
                    logger.info('Launching %s non-preemptable nodes', numNodes)
                    instancesLaunched = create_ondemand_instances(self.ctx.ec2, image_id=self._discoverAMI(self.ctx),
                                                                  spec=kwargs, num_instances=numNodes)
                else:
                    logger.info('Launching %s preemptable nodes', numNodes)
                    kwargs['placement'] = getSpotZone(self.spotBids[nodeType], instanceType.name, self.ctx)
                    # force generator to evaluate
                    instancesLaunched = list(create_spot_instances(ec2=self.ctx.ec2,
                                                                   price=self.spotBids[nodeType],
                                                                   image_id=self._discoverAMI(self.ctx),
                                                                   tags={'clusterName': self.clusterName},
                                                                   spec=kwargs,
                                                                   num_instances=numNodes,
                                                                   tentative=True)
                                             )
                    # flatten the list
                    instancesLaunched = [item for sublist in instancesLaunched for item in sublist]

        for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
            with attempt:
                wait_instances_running(self.ctx.ec2, instancesLaunched)

        # request throttling retry happens internally to these two methods to ensure proper granularity
        AWSProvisioner._addTags(instancesLaunched, self.tags)
        self._propagateKey(instancesLaunched)

        logger.info('Launched %s new instance(s)', numNodes)
        return len(instancesLaunched)
Example #7
    def addNodes(self, numNodes, preemptable):
        instanceType = self._getInstanceType(preemptable)
        bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self.nodeStorage)
        arn = self._getProfileARN(self.ctx)
        keyPath = '' if not self.config or not self.config.sseKey else self.config.sseKey
        entryPoint = 'mesos-slave' if not self.config or not self.config.sseKey else "waitForKey.sh"
        workerData = dict(role='worker',
                          image=applianceSelf(),
                          entrypoint=entryPoint,
                          sshKey=self.masterPublicKey,
                          args=workerArgs.format(ip=self.leaderIP, preemptable=preemptable, keyPath=keyPath))
        userData = awsUserData.format(**workerData)
        sgs = [sg for sg in self.ctx.ec2.get_all_security_groups() if sg.name == self.clusterName]
        kwargs = {'key_name': self.keyName,
                  'security_group_ids': [sg.id for sg in sgs],
                  'instance_type': instanceType.name,
                  'user_data': userData,
                  'block_device_map': bdm,
                  'instance_profile_arn': arn,
                  'placement': getCurrentAWSZone()}
        kwargs["subnet_id"] = self.subnetID if self.subnetID else self._getClusterInstance(self.instanceMetaData).subnet_id

        instancesLaunched = []

        for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
            with attempt:
                # after we start launching instances we want to ensure the full setup is done
                # the biggest obstacle is AWS request throttling, so we retry on these errors at
                # every request in this method
                if not preemptable:
                    logger.info('Launching %s non-preemptable nodes', numNodes)
                    instancesLaunched = create_ondemand_instances(self.ctx.ec2, image_id=self._discoverAMI(self.ctx),
                                                                  spec=kwargs, num_instances=numNodes)
                else:
                    logger.info('Launching %s preemptable nodes', numNodes)
                    kwargs['placement'] = getSpotZone(self.spotBid, instanceType.name, self.ctx)
                    # force generator to evaluate
                    instancesLaunched = list(create_spot_instances(ec2=self.ctx.ec2,
                                                                   price=self.spotBid,
                                                                   image_id=self._discoverAMI(self.ctx),
                                                                   tags={'clusterName': self.clusterName},
                                                                   spec=kwargs,
                                                                   num_instances=numNodes,
                                                                   tentative=True)
                                             )
                    # flatten the list
                    instancesLaunched = [item for sublist in instancesLaunched for item in sublist]

        for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
            with attempt:
                wait_instances_running(self.ctx.ec2, instancesLaunched)

        # request throttling retry happens internally to these two methods to ensure proper granularity
        AWSProvisioner._addTags(instancesLaunched, self.tags)
        self._propagateKey(instancesLaunched)

        logger.info('Launched %s new instance(s)', numNodes)
        return len(instancesLaunched)
Example #8
    def _getProfileARN(cls, ctx):
        def addRoleErrors(e):
            return e.status == 404

        def throttleError(e):
            return isinstance(e, BotoServerError) and e.status == 400 and e.error_code == 'Throttling'

        def truncExpBackoff():
            # as recommended here https://forums.aws.amazon.com/thread.jspa?messageID=406788#406788
            yield 0
            t = 1
            while t < 1024:
                yield t
                t *= 2
            while True:
                yield t

        for attempt in retry(delays=truncExpBackoff(), predicate=throttleError):
            with attempt:
                roleName = 'toil'
                policy = dict(iam_full=iamFullPolicy, ec2_full=ec2FullPolicy,
                              s3_full=s3FullPolicy, sbd_full=sdbFullPolicy)
                iamRoleName = ctx.setup_iam_ec2_role(role_name=roleName, policies=policy)

                try:
                    profile = ctx.iam.get_instance_profile(iamRoleName)
                except BotoServerError as e:
                    if e.status == 404:
                        profile = ctx.iam.create_instance_profile(iamRoleName)
                        profile = profile.create_instance_profile_response.create_instance_profile_result
                    else:
                        raise
                else:
                    profile = profile.get_instance_profile_response.get_instance_profile_result
                profile = profile.instance_profile
                profile_arn = profile.arn

                if len(profile.roles) > 1:
                    raise RuntimeError('Did not expect profile to contain more than one role')
                elif len(profile.roles) == 1:
                    # this should be profile.roles[0].role_name
                    if profile.roles.member.role_name == iamRoleName:
                        return profile_arn
                    else:
                        ctx.iam.remove_role_from_instance_profile(iamRoleName,
                                                                  profile.roles.member.role_name)
                for attempt in retry(predicate=addRoleErrors):
                    with attempt:
                        ctx.iam.add_role_to_instance_profile(iamRoleName, iamRoleName)
        return profile_arn
Example #9
File: docker.py Project: hufh/toil
def _dockerKill(containerName, action):
    """
    Deprecated.  Kills the specified container.
    :param str containerName: The name of the container created by docker_call
    :param int action: What action should be taken on the container?
    """
    running = containerIsRunning(containerName)
    if running is None:
        # This means that the container doesn't exist.  We will see this if the
        # container was run with --rm and has already exited before this call.
        logger.info(
            'The container with name "%s" appears to have already been '
            'removed.  Nothing to '
            'do.', containerName)
    else:
        if action in (None, FORGO):
            logger.info(
                'The container with name %s continues to exist as we '
                'were asked to forgo a '
                'post-job action on it.', containerName)
        else:
            logger.info(
                'The container with name %s exists. Running '
                'user-specified defer functions.', containerName)
            if running and action >= STOP:
                logger.info('Stopping container "%s".', containerName)
                for attempt in retry(predicate=dockerPredicate):
                    with attempt:
                        subprocess.check_call(
                            ['docker', 'stop', containerName])
            else:
                logger.info('The container "%s" was not found to be running.',
                            containerName)
            if action >= RM:
                # If the container was run with --rm, then stop will most likely
                # remove the container.  We first check if it is running then
                # remove it.
                running = containerIsRunning(containerName)
                if running is not None:
                    logger.info('Removing container "%s".', containerName)
                    for attempt in retry(predicate=dockerPredicate):
                        with attempt:
                            subprocess.check_call(
                                ['docker', 'rm', '-f', containerName])
                else:
                    logger.info(
                        'Container "%s" was not found on the system.'
                        'Nothing to remove.', containerName)
Example #10
    def _getProfileARN(cls, ctx):
        def addRoleErrors(e):
            return e.status == 404
        roleName = 'toil'
        policy = dict(iam_full=iamFullPolicy, ec2_full=ec2FullPolicy,
                      s3_full=s3FullPolicy, sbd_full=sdbFullPolicy)
        iamRoleName = ctx.setup_iam_ec2_role(role_name=roleName, policies=policy)

        try:
            profile = ctx.iam.get_instance_profile(iamRoleName)
        except BotoServerError as e:
            if e.status == 404:
                profile = ctx.iam.create_instance_profile(iamRoleName)
                profile = profile.create_instance_profile_response.create_instance_profile_result
            else:
                raise
        else:
            profile = profile.get_instance_profile_response.get_instance_profile_result
        profile = profile.instance_profile
        profile_arn = profile.arn

        if len(profile.roles) > 1:
            raise RuntimeError('Did not expect profile to contain more than one role')
        elif len(profile.roles) == 1:
            # this should be profile.roles[0].role_name
            if profile.roles.member.role_name == iamRoleName:
                return profile_arn
            else:
                ctx.iam.remove_role_from_instance_profile(iamRoleName,
                                                          profile.roles.member.role_name)
        for attempt in retry(predicate=addRoleErrors):
            with attempt:
                ctx.iam.add_role_to_instance_profile(iamRoleName, iamRoleName)
        return profile_arn
Example #11
 def _getNodesInCluster(cls, ctx, clusterName, preemptable=False, both=False):
     for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
         with attempt:
             pendingInstances = ctx.ec2.get_only_instances(filters={'instance.group-name': clusterName,
                                                                    'instance-state-name': 'pending'})
     for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
         with attempt:
             runningInstances = ctx.ec2.get_only_instances(filters={'instance.group-name': clusterName,
                                                                    'instance-state-name': 'running'})
     instances = set(pendingInstances)
     if not preemptable and not both:
         return [x for x in instances.union(set(runningInstances)) if x.spot_instance_request_id is None]
     elif preemptable and not both:
         return [x for x in instances.union(set(runningInstances)) if x.spot_instance_request_id is not None]
     elif both:
         return [x for x in instances.union(set(runningInstances))]
Example #12
 def _getClusterInstance(self, md):
     zone = getCurrentAWSZone()
     region = Context.availability_zone_re.match(zone).group(1)
     conn = boto.ec2.connect_to_region(region)
     for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
         with attempt:
             return conn.get_all_instances(instance_ids=[md["instance-id"]])[0].instances[0]
Example #13
    def _getProfileARN(cls, ctx):
        def addRoleErrors(e):
            return e.status == 404
        roleName = 'toil'
        policy = dict(iam_full=iamFullPolicy, ec2_full=ec2FullPolicy,
                      s3_full=s3FullPolicy, sbd_full=sdbFullPolicy)
        iamRoleName = ctx.setup_iam_ec2_role(role_name=roleName, policies=policy)

        try:
            profile = ctx.iam.get_instance_profile(iamRoleName)
        except BotoServerError as e:
            if e.status == 404:
                profile = ctx.iam.create_instance_profile(iamRoleName)
                profile = profile.create_instance_profile_response.create_instance_profile_result
            else:
                raise
        else:
            profile = profile.get_instance_profile_response.get_instance_profile_result
        profile = profile.instance_profile
        profile_arn = profile.arn

        if len(profile.roles) > 1:
            raise RuntimeError('Did not expect profile to contain more than one role')
        elif len(profile.roles) == 1:
            # this should be profile.roles[0].role_name
            if profile.roles.member.role_name == iamRoleName:
                return profile_arn
            else:
                ctx.iam.remove_role_from_instance_profile(iamRoleName,
                                                          profile.roles.member.role_name)
        for attempt in retry(predicate=addRoleErrors):
            with attempt:
                ctx.iam.add_role_to_instance_profile(iamRoleName, iamRoleName)
        return profile_arn
Example #14
def _containerIsRunning(container_name):
    """
    Checks whether the container is running or not.
    :param container_name: Name of the container being checked.
    :returns: True if running, False if not running, None if the container doesn't exist.
    :rtype: bool
    """
    try:
        for attempt in retry(predicate=dockerPredicate):
            with attempt:
                output = subprocess.check_output([
                    'docker', 'inspect', '--format', '{{.State.Running}}',
                    container_name
                ]).strip()
    except subprocess.CalledProcessError:
        # This will be raised if the container didn't exist.
        _logger.debug(
            "'docker inspect' failed. Assuming container %s doesn't exist.",
            container_name,
            exc_info=True)
        return None
    if output == 'true':
        return True
    elif output == 'false':
        return False
    else:
        raise RuntimeError("Got unexpected value for State.Running (%s)" %
                           output)
Example #15
def _fixPermissions(tool, workDir):
    """
    Fix permission of a mounted Docker directory by reusing the tool to change ownership.
    Docker natively runs as root inside the container, and files written to the
    mounted directory are implicitly owned by root.

    :param list baseDockerCall: Docker run parameters
    :param str tool: Name of tool
    :param str workDir: Path of work directory to recursively chown
    """
    if os.geteuid() == 0:
        # we're running as root so this chown is redundant
        return

    baseDockerCall = [
        'docker', 'run', '--log-driver=none', '-v',
        os.path.abspath(workDir) + ':/data', '--rm', '--entrypoint=chown'
    ]
    stat = os.stat(workDir)
    command = baseDockerCall + [tool] + [
        '-R', '{}:{}'.format(stat.st_uid, stat.st_gid), '/data'
    ]
    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            subprocess.check_call(command)
Example #16
    def destroyCluster(cls, clusterName, zone=None):
        def expectedShutdownErrors(e):
            return e.status == 400 and 'dependent object' in e.body

        ctx = cls._buildContext(clusterName=clusterName, zone=zone)
        instances = cls.__getNodesInCluster(ctx, clusterName, both=True)
        spotIDs = cls._getSpotRequestIDs(ctx, clusterName)
        if spotIDs:
            ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
        instancesToTerminate = awsFilterImpairedNodes(instances, ctx.ec2)
        if instancesToTerminate:
            cls._deleteIAMProfiles(instances=instancesToTerminate, ctx=ctx)
            cls._terminateInstances(instances=instancesToTerminate, ctx=ctx)
        if len(instances) == len(instancesToTerminate):
            logger.info('Deleting security group...')
            for attempt in retry(timeout=300, predicate=expectedShutdownErrors):
                with attempt:
                    try:
                        ctx.ec2.delete_security_group(name=clusterName)
                    except BotoServerError as e:
                        if e.error_code == 'InvalidGroup.NotFound':
                            pass
                        else:
                            raise
            logger.info('... Successfully deleted security group')
        else:
            assert len(instances) > len(instancesToTerminate)
            # the security group can't be deleted until all nodes are terminated
            logger.warning('The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes '
                           'have failed health checks. As a result, the security group & IAM '
                           'roles will not be deleted.')
Example #17
    def destroyCluster(cls, clusterName, zone=None):
        def expectedShutdownErrors(e):
            return e.status == 400 and 'dependent object' in e.body

        ctx = cls._buildContext(clusterName=clusterName, zone=zone)
        instances = cls.__getNodesInCluster(ctx, clusterName, both=True)
        spotIDs = cls._getSpotRequestIDs(ctx, clusterName)
        if spotIDs:
            ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
        instancesToTerminate = awsFilterImpairedNodes(instances, ctx.ec2)
        if instancesToTerminate:
            cls._deleteIAMProfiles(instances=instancesToTerminate, ctx=ctx)
            cls._terminateInstances(instances=instancesToTerminate, ctx=ctx)
        if len(instances) == len(instancesToTerminate):
            logger.info('Deleting security group...')
            for attempt in retry(timeout=300, predicate=expectedShutdownErrors):
                with attempt:
                    try:
                        ctx.ec2.delete_security_group(name=clusterName)
                    except BotoServerError as e:
                        if e.error_code == 'InvalidGroup.NotFound':
                            pass
                        else:
                            raise
            logger.info('... Successfully deleted security group')
        else:
            assert len(instances) > len(instancesToTerminate)
            # the security group can't be deleted until all nodes are terminated
            logger.warning('The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes '
                           'have failed health checks. As a result, the security group & IAM '
                           'roles will not be deleted.')
Example #18
 def _addTags(cls, instances, tags):
     for instance in instances:
         for key, value in iteritems(tags):
             for attempt in retry(
                     predicate=AWSProvisioner._throttlePredicate):
                 with attempt:
                     instance.add_tag(key, value)
Example #19
 def _getClusterInstance(self, md):
     zone = getCurrentAWSZone()
     region = Context.availability_zone_re.match(zone).group(1)
     conn = boto.ec2.connect_to_region(region)
     for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
         with attempt:
             return conn.get_all_instances(instance_ids=[md["instance-id"]])[0].instances[0]
Example #20
def retry_ec2(retry_after=a_short_time,
              retry_for=10 * a_short_time,
              retry_while=not_found):
    t = retry_after
    return retry(delays=(t, t, t * 2, t * 4),
                 timeout=retry_for,
                 predicate=retry_while)
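A hypothetical use of this wrapper (the names ctx and instanceId below are assumptions for illustration, not part of the snippet): keep re-describing a freshly launched instance while EC2 still reports it as not found, i.e. until eventual consistency catches up or the retry_for budget runs out.

for attempt in retry_ec2(retry_after=5, retry_for=60, retry_while=not_found):
    with attempt:
        # re-fetch until the instance becomes visible or the 60 s budget expires
        instance = ctx.ec2.get_only_instances(instance_ids=[instanceId])[0]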
Example #21
 def _propagateKey(self, instances):
     if not self.config or not self.config.sseKey:
         return
     for node in instances:
         for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
             with attempt:
                 # since we're going to be rsyncing into the appliance we need the appliance to be running first
                 ipAddress = self._waitForNode(node, 'toil_worker')
                 self._rsyncNode(ipAddress, [self.config.sseKey, ':' + self.config.sseKey], applianceName='toil_worker')
Example #22
 def _propagateKey(self, instances):
     if not self.config or not self.config.sseKey:
         return
     for node in instances:
         for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
             with attempt:
                 # since we're going to be rsyncing into the appliance we need the appliance to be running first
                 ipAddress = self._waitForNode(node, 'toil_worker')
                 self._rsyncNode(ipAddress, [self.config.sseKey, ':' + self.config.sseKey], applianceName='toil_worker')
Example #23
    def setNodeCount(self, numNodes, preemptable=False, force=False):
        """
        Attempt to grow or shrink the number of preemptable or non-preemptable worker nodes in
        the cluster to the given value, or as close a value as possible, and, after performing
        the necessary additions or removals of worker nodes, return the resulting number of
        preemptable or non-preemptable nodes currently in the cluster.

        :param int numNodes: Desired size of the cluster

        :param bool preemptable: whether the added nodes will be preemptable, i.e. whether they
               may be removed spontaneously by the underlying platform at any time.

        :param bool force: If False, the provisioner is allowed to deviate from the given number
               of nodes. For example, when downsizing a cluster, a provisioner might leave nodes
               running if they have active jobs running on them.

        :rtype: int :return: the number of nodes in the cluster after making the necessary
                adjustments. This value should be, but is not guaranteed to be, close or equal to
                the `numNodes` argument. It represents the closest possible approximation of the
                actual cluster size at the time this method returns.
        """
        for attempt in retry(predicate=self.retryPredicate):
            with attempt:
                workerInstances = self._getWorkersInCluster(preemptable)
                numCurrentNodes = len(workerInstances)
                delta = numNodes - numCurrentNodes
                if delta > 0:
                    log.info(
                        'Adding %i %s nodes to get to desired cluster size of %i.',
                        delta,
                        'preemptable' if preemptable else 'non-preemptable',
                        numNodes)
                    numNodes = numCurrentNodes + self._addNodes(
                        workerInstances,
                        numNodes=delta,
                        preemptable=preemptable)
                elif delta < 0:
                    log.info(
                        'Removing %i %s nodes to get to desired cluster size of %i.',
                        -delta,
                        'preemptable' if preemptable else 'non-preemptable',
                        numNodes)
                    numNodes = numCurrentNodes - self._removeNodes(
                        workerInstances,
                        numNodes=-delta,
                        preemptable=preemptable,
                        force=force)
                else:
                    log.info(
                        'Cluster already at desired size of %i. Nothing to do.',
                        numNodes)
        return numNodes
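A hypothetical call sketch (the provisioner variable name and call site are assumptions): ask for ten preemptable workers and use the return value, which may legitimately differ from the request, as the authoritative cluster size.

clusterSize = provisioner.setNodeCount(numNodes=10, preemptable=True)
log.info('Cluster now has %i preemptable workers', clusterSize)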
Example #24
def _dockerKill(containerName, action):
    """
    Kills the specified container.
    :param str containerName: The name of the container created by docker_call
    :param int action: What action should be taken on the container?  See `defer=` in
           :func:`docker_call`
    """
    running = _containerIsRunning(containerName)
    if running is None:
        # This means that the container doesn't exist.  We will see this if the container was run
        # with --rm and has already exited before this call.
        _logger.info('The container with name "%s" appears to have already been removed.  Nothing to '
                  'do.', containerName)
    else:
        if action in (None, FORGO):
            _logger.info('The container with name %s continues to exist as we were asked to forgo a '
                      'post-job action on it.', containerName)
        else:
            _logger.info('The container with name %s exists. Running user-specified defer functions.',
                         containerName)
            if running and action >= STOP:
                _logger.info('Stopping container "%s".', containerName)
                for attempt in retry(predicate=dockerPredicate):
                    with attempt:
                        subprocess.check_call(['docker', 'stop', containerName])
            else:
                _logger.info('The container "%s" was not found to be running.', containerName)
            if action >= RM:
                # If the container was run with --rm, then stop will most likely remove the
                # container.  We first check if it is running then remove it.
                running = _containerIsRunning(containerName)
                if running is not None:
                    _logger.info('Removing container "%s".', containerName)
                    for attempt in retry(predicate=dockerPredicate):
                        with attempt:
                            subprocess.check_call(['docker', 'rm', '-f', containerName])
                else:
                    _logger.info('The container "%s" was not found on the system.  Nothing to remove.',
                                 containerName)
Example #25
    def _createSecurityGroup(cls, ctx, name):
        def groupNotFound(e):
            retry = (e.status == 400 and 'does not exist in default VPC' in e.body)
            return retry

        # security group create/get. ssh + all ports open within the group
        try:
            web = ctx.ec2.create_security_group(name, 'Toil appliance security group')
        except EC2ResponseError as e:
            if e.status == 400 and 'already exists' in e.body:
                pass  # group exists- nothing to do
            else:
                raise
        else:
            for attempt in retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    # open port 22 for ssh-ing
                    web.authorize(ip_protocol='tcp', from_port=22, to_port=22, cidr_ip='0.0.0.0/0')
            for attempt in retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    # the following authorizes all port access within the web security group
                    web.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=web)
Example #26
    def _download(self, dstFile):
        """
        Download this resource from its URL to the given file object.

        :type dstFile: io.BytesIO|io.FileIO
        """
        for attempt in retry(predicate=lambda e: isinstance(e, HTTPError) and e.code == 400):
            with attempt:
                with closing(urlopen(self.url)) as content:
                    buf = content.read()
        contentHash = hashlib.md5(buf)
        assert contentHash.hexdigest() == self.contentHash
        dstFile.write(buf)
Example #27
    def _discoverAMI(cls, ctx):
        def descriptionMatches(ami):
            return ami.description is not None and 'stable 1235.4.0' in ami.description
        coreOSAMI = os.environ.get('TOIL_AWS_AMI')
        if coreOSAMI is not None:
            return coreOSAMI
        # that ownerID corresponds to coreOS

        for attempt in retry(predicate=lambda e: isinstance(e, SSLError)):
            # SSLError is thrown when get_all_images times out
            with attempt:
                amis = ctx.ec2.get_all_images(owners=['679593333241'])

        coreOSAMI = [ami for ami in amis if descriptionMatches(ami)]
        logger.debug('Found the following matching AMIs: %s', coreOSAMI)
        assert len(coreOSAMI) == 1
        return coreOSAMI.pop().id
Example #28
    def _discoverAMI(cls, ctx):
        def descriptionMatches(ami):
            return ami.description is not None and 'stable 1235.4.0' in ami.description
        coreOSAMI = os.environ.get('TOIL_AWS_AMI')
        if coreOSAMI is not None:
            return coreOSAMI
        # that ownerID corresponds to coreOS

        for attempt in retry(predicate=lambda e: isinstance(e, SSLError)):
            # SSLError is thrown when get_all_images times out
            with attempt:
                amis = ctx.ec2.get_all_images(owners=['679593333241'])

        coreOSAMI = [ami for ami in amis if descriptionMatches(ami)]
        logger.debug('Found the following matching AMIs: %s', coreOSAMI)
        assert len(coreOSAMI) == 1
        return coreOSAMI.pop().id
Example #29
    def destroyCluster(self):
        """
        Terminate instances and delete the profile and security group.
        """
        assert self._ctx

        def expectedShutdownErrors(e):
            return e.status == 400 and 'dependent object' in e.body

        instances = self._getNodesInCluster(nodeType=None, both=True)
        spotIDs = self._getSpotRequestIDs()
        if spotIDs:
            self._ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
        instancesToTerminate = awsFilterImpairedNodes(instances, self._ctx.ec2)
        vpcId = None
        if instancesToTerminate:
            vpcId = instancesToTerminate[0].vpc_id
            self._deleteIAMProfiles(instances=instancesToTerminate)
            self._terminateInstances(instances=instancesToTerminate)
        if len(instances) == len(instancesToTerminate):
            logger.info('Deleting security group...')
            removed = False
            for attempt in retry(timeout=300,
                                 predicate=expectedShutdownErrors):
                with attempt:
                    for sg in self._ctx.ec2.get_all_security_groups():
                        if sg.name == self.clusterName and vpcId and sg.vpc_id == vpcId:
                            try:
                                self._ctx.ec2.delete_security_group(
                                    group_id=sg.id)
                                removed = True
                            except BotoServerError as e:
                                if e.error_code == 'InvalidGroup.NotFound':
                                    pass
                                else:
                                    raise
            if removed:
                logger.info('... Successfully deleted security group')
        else:
            assert len(instances) > len(instancesToTerminate)
            # the security group can't be deleted until all nodes are terminated
            logger.warning(
                'The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes '
                'have failed health checks. As a result, the security group & IAM '
                'roles will not be deleted.')
Example #30
def _fixPermissions(tool, workDir):
    """
    Fix permission of a mounted Docker directory by reusing the tool to change ownership.
    Docker natively runs as root inside the container, and files written to the
    mounted directory are implicitly owned by root.

    :param list baseDockerCall: Docker run parameters
    :param str tool: Name of tool
    :param str workDir: Path of work directory to recursively chown
    """
    if os.geteuid() == 0:
        # we're running as root so this chown is redundant
        return

    baseDockerCall = ['docker', 'run', '--log-driver=none',
                      '-v', os.path.abspath(workDir) + ':/data', '--rm', '--entrypoint=chown']
    stat = os.stat(workDir)
    command = baseDockerCall + [tool] + ['-R', '{}:{}'.format(stat.st_uid, stat.st_gid), '/data']
    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            subprocess.check_call(command)
Example #31
    def setNodeCount(self, numNodes, preemptable=False, force=False):
        """
        Attempt to grow or shrink the number of preemptable or non-preemptable worker nodes in
        the cluster to the given value, or as close a value as possible, and, after performing
        the necessary additions or removals of worker nodes, return the resulting number of
        preemptable or non-preemptable nodes currently in the cluster.

        :param int numNodes: Desired size of the cluster

        :param bool preemptable: whether the added nodes will be preemptable, i.e. whether they
               may be removed spontaneously by the underlying platform at any time.

        :param bool force: If False, the provisioner is allowed to deviate from the given number
               of nodes. For example, when downsizing a cluster, a provisioner might leave nodes
               running if they have active jobs running on them.

        :rtype: int :return: the number of worker nodes in the cluster after making the necessary
                adjustments. This value should be, but is not guaranteed to be, close or equal to
                the `numNodes` argument. It represents the closest possible approximation of the
                actual cluster size at the time this method returns.
        """
        for attempt in retry(predicate=self.scaler.provisioner.retryPredicate):
            with attempt:
                workerInstances = self.getNodes(preemptable=preemptable)
                numCurrentNodes = len(workerInstances)
                delta = numNodes - numCurrentNodes
                if delta > 0:
                    logger.info('Adding %i %s nodes to get to desired cluster size of %i.', delta, 'preemptable' if preemptable else 'non-preemptable', numNodes)
                    numNodes = numCurrentNodes + self._addNodes(numNodes=delta,
                                                                preemptable=preemptable)
                elif delta < 0:
                    logger.info('Removing %i %s nodes to get to desired cluster size of %i.', -delta, 'preemptable' if preemptable else 'non-preemptable', numNodes)
                    numNodes = numCurrentNodes - self._removeNodes(workerInstances,
                                                                   numNodes=-delta,
                                                                   preemptable=preemptable,
                                                                   force=force)
                else:
                    logger.info('Cluster already at desired size of %i. Nothing to do.', numNodes)
        return numNodes
Example #32
    def __setup_entity_policies( self, entity_name, policies,
                                 list_policies, delete_policy, get_policy, put_policy ):
        # Delete superfluous policies
        policy_names = set( list_policies( entity_name ).policy_names )
        for policy_name in policy_names.difference( set( policies.keys( ) ) ):
            delete_policy( entity_name, policy_name )

        # Create expected policies
        for policy_name, policy in policies.iteritems( ):
            current_policy = None
            try:
                current_policy = json.loads( urllib.unquote(
                    get_policy( entity_name, policy_name ).policy_document ) )
            except BotoServerError as e:
                if e.status == 404 and e.error_code == 'NoSuchEntity':
                    pass
                else:
                    raise
            if current_policy != policy:
                for attempt in retry(predicate=throttlePredicate):
                    with attempt:
                        put_policy( entity_name, policy_name, json.dumps( policy ) )
Example #33
def _containerIsRunning(container_name):
    """
    Checks whether the container is running or not.
    :param container_name: Name of the container being checked.
    :returns: True if running, False if not running, None if the container doesn't exist.
    :rtype: bool
    """
    try:
        for attempt in retry(predicate=dockerPredicate):
            with attempt:
                output = subprocess.check_output(['docker', 'inspect', '--format', '{{.State.Running}}',
                                                  container_name]).strip()
    except subprocess.CalledProcessError:
        # This will be raised if the container didn't exist.
        _logger.debug("'docker inspect' failed. Assuming container %s doesn't exist.", container_name,
                   exc_info=True)
        return None
    if output == 'true':
        return True
    elif output == 'false':
        return False
    else:
        raise RuntimeError("Got unexpected value for State.Running (%s)" % output)
Example #34
 def _terminateIDs(cls, instanceIDs, ctx):
     logger.info('Terminating instance(s): %s', instanceIDs)
     for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
         with attempt:
             ctx.ec2.terminate_instances(instance_ids=instanceIDs)
     logger.info('Instance(s) terminated.')
Example #35
    def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
        assert self._leaderPrivateIP
        if preemptable and not spotBid:
            if self._spotBidsMap and nodeType in self._spotBidsMap:
                spotBid = self._spotBidsMap[nodeType]
            else:
                raise RuntimeError(
                    "No spot bid given for a preemptable node request.")
        instanceType = ec2_instance_types[nodeType]
        bdm = self._getBlockDeviceMapping(instanceType,
                                          rootVolSize=self._nodeStorage)
        arn = self._getProfileARN()

        keyPath = self._sseKey if self._sseKey else None
        userData = self._getCloudConfigUserData('worker',
                                                self._masterPublicKey, keyPath,
                                                preemptable)
        sgs = [
            sg for sg in self._ctx.ec2.get_all_security_groups()
            if sg.name == self.clusterName
        ]
        kwargs = {
            'key_name': self._keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': instanceType.name,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': arn,
            'placement': self._zone,
            'subnet_id': self._subnetID
        }

        instancesLaunched = []

        for attempt in retry(predicate=awsRetryPredicate):
            with attempt:
                # after we start launching instances we want to ensure the full setup is done
                # the biggest obstacle is AWS request throttling, so we retry on these errors at
                # every request in this method
                if not preemptable:
                    logger.info('Launching %s non-preemptable nodes', numNodes)
                    instancesLaunched = create_ondemand_instances(
                        self._ctx.ec2,
                        image_id=self._discoverAMI(),
                        spec=kwargs,
                        num_instances=numNodes)
                else:
                    logger.info('Launching %s preemptable nodes', numNodes)
                    kwargs['placement'] = getSpotZone(spotBid,
                                                      instanceType.name,
                                                      self._ctx)
                    # force generator to evaluate
                    instancesLaunched = list(
                        create_spot_instances(
                            ec2=self._ctx.ec2,
                            price=spotBid,
                            image_id=self._discoverAMI(),
                            tags={'clusterName': self.clusterName},
                            spec=kwargs,
                            num_instances=numNodes,
                            tentative=True))
                    # flatten the list
                    instancesLaunched = [
                        item for sublist in instancesLaunched
                        for item in sublist
                    ]

        for attempt in retry(predicate=awsRetryPredicate):
            with attempt:
                wait_instances_running(self._ctx.ec2, instancesLaunched)

        AWSProvisioner._addTags(instancesLaunched, self._tags)
        if self._sseKey:
            for i in instancesLaunched:
                self._waitForIP(i)
                node = Node(publicIP=i.ip_address,
                            privateIP=i.private_ip_address,
                            name=i.id,
                            launchTime=i.launch_time,
                            nodeType=i.instance_type,
                            preemptable=preemptable,
                            tags=i.tags)
                node.waitForNode('toil_worker')
                node.coreRsync([self._sseKey, ':' + self._sseKey],
                               applianceName='toil_worker')
        logger.info('Launched %s new instance(s)', numNodes)
        return len(instancesLaunched)
Example #36
def _docker(job,
            tool,
            parameters=None,
            workDir=None,
            dockerParameters=None,
            outfile=None,
            checkOutput=False,
            defer=None):
    """
    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools).
    :param list[str] parameters: Command line arguments to be passed to the tool.
           If list of lists: list[list[str]], then treat as successive commands chained with pipe.
    :param str workDir: Directory to mount into the container via `-v`. Destination convention is /data
    :param list[str] dockerParameters: Parameters to pass to Docker. Default parameters are `--rm`,
            `--log-driver none`, and the mountpoint `-v work_dir:/data` where /data is the destination convention.
             These defaults are removed if dockerParameters is passed, so be sure to pass them if they are desired.
    :param file outfile: Pipe output of Docker call to file handle
    :param bool checkOutput: When True, this function returns docker's output.
    :param int defer: What action should be taken on the container upon job completion?
           FORGO (0) will leave the container untouched.
           STOP (1) will attempt to stop the container with `docker stop` (useful for debugging).
           RM (2) will stop the container and then forcefully remove it from the system
           using `docker rm -f`. This is the default behavior if defer is set to None.
    """
    if parameters is None:
        parameters = []
    if workDir is None:
        workDir = os.getcwd()

    # Setup the outgoing subprocess call for docker
    baseDockerCall = ['docker', 'run']
    if dockerParameters:
        baseDockerCall += dockerParameters
    else:
        baseDockerCall += [
            '--rm', '--log-driver', 'none', '-v',
            os.path.abspath(workDir) + ':/data'
        ]

    # Ensure the user has passed a valid value for defer
    require(defer in (None, FORGO, STOP, RM),
            'Please provide a valid value for defer.')

    # Get container name which is needed for _dockerKill
    try:
        if any('--name' in x for x in baseDockerCall):
            if any('--name=' in x for x in baseDockerCall):
                containerName = [
                    x.split('=')[1] for x in baseDockerCall if '--name' in x
                ][0]
            else:
                containerName = baseDockerCall[baseDockerCall.index('--name') +
                                               1]
        else:
            containerName = _getContainerName(job)
    except ValueError:
        containerName = _getContainerName(job)
        baseDockerCall.extend(['--name', containerName])
    except IndexError:
        raise RuntimeError(
            "Couldn't parse Docker's `--name=` option, check parameters: " +
            str(dockerParameters))

    # Defer the container on-exit action
    if '--rm' in baseDockerCall and defer is None:
        defer = RM
    if '--rm' in baseDockerCall and defer is not RM:
        _logger.warn(
            '--rm being passed to docker call but defer not set to dockerCall.RM, defer set to: '
            + str(defer))
    job.defer(_dockerKill, containerName, action=defer)
    # Defer the permission fixing function which will run after this job concludes.
    # We call this explicitly later on in this function, but we defer it as well to handle unexpected job failure.
    job.defer(_fixPermissions, tool, workDir)

    # Make subprocess call

    # If parameters is list of lists, treat each list as separate command and chain with pipes
    if len(parameters) > 0 and type(parameters[0]) is list:
        # When piping, all arguments now get merged into a single string to bash -c.
        # We try to support spaces in paths by wrapping them all in quotes first.
        chain_params = [
            ' '.join(p) for p in [map(pipes.quote, q) for q in parameters]
        ]
        call = baseDockerCall + [
            '--entrypoint', '/bin/bash', tool, '-c', ' | '.join(chain_params)
        ]
    else:
        call = baseDockerCall + [tool] + parameters
    _logger.info("Calling docker with " + repr(call))

    params = {}
    if outfile:
        params['stdout'] = outfile
    if checkOutput:
        callMethod = subprocess.check_output
    else:
        callMethod = subprocess.check_call

    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            out = callMethod(call, **params)

    _fixPermissions(tool=tool, workDir=workDir)
    return out
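
A minimal usage sketch of the helper above (not from the source; the wrapper name, file names, and image tag are assumptions). It uses the list-of-lists form to chain two samtools commands with a pipe and relies on defer=RM to clean up the container if the job dies unexpectedly:

def flagstatExample(job, workDir, outPath):
    # workDir is assumed to already contain input.bam; it is mounted at /data inside the container
    with open(outPath, 'wb') as outfile:
        _docker(job,
                tool='quay.io/ucsc_cgl/samtools',
                # list of lists: the two commands are joined with '|' and run via bash -c
                parameters=[['samtools', 'view', '-b', '/data/input.bam'],
                            ['samtools', 'flagstat', '-']],
                workDir=workDir,
                outfile=outfile,   # container stdout (the flagstat report) is written to outPath
                defer=RM)          # stop and force-remove the container when the job finishes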
Exemplo n.º 37
0
def retry_azure(
        delays=(0, 1, 1, 4, 16, 64), timeout=300,
        predicate=defaultRetryPredicate):
    return retry(delays=delays, timeout=timeout, predicate=predicate)
Exemplo n.º 38
0
def retry_azure(delays=(0, 1, 1, 4, 16, 64), timeout=300, predicate=defaultRetryPredicate):
    return retry(delays=delays, timeout=timeout, predicate=predicate)
Exemplo n.º 39
0
def retry_s3(delays=default_delays, timeout=default_timeout, predicate=retryable_s3_errors):
    return retry(delays=delays, timeout=timeout, predicate=predicate)
Exemplo n.º 40
0
def retry_ec2(retry_after=a_short_time, retry_for=10 * a_short_time, retry_while=not_found):
    t = retry_after
    return retry(delays=(t, t, t * 2, t * 4), timeout=retry_for, predicate=retry_while)
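
The retry_azure, retry_s3, and retry_ec2 wrappers above all return the same retry() generator, so call sites share one pattern: iterate the attempts and run the flaky call inside the attempt context manager. A minimal sketch, assuming a boto EC2 connection named ec2_conn and an instanceId variable (both placeholders):

for attempt in retry_ec2(retry_after=5, retry_for=60):
    with attempt:
        # Exceptions matching the predicate (here not_found) trigger another delayed attempt
        instance = ec2_conn.get_only_instances(instance_ids=[instanceId])[0]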
Exemplo n.º 41
0
 def _deleteRootEBS(cls, ebsIDs, ctx):
     for volumeID in ebsIDs:
         for attempt in retry(predicate=AWSProvisioner.throttlePredicate):
             with attempt:
                 ctx.ec2.delete_volume(volumeID)
Exemplo n.º 42
0
 def _terminateIDs(cls, instanceIDs, ctx):
     logger.info('Terminating instance(s): %s', instanceIDs)
     for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
         with attempt:
             ctx.ec2.terminate_instances(instance_ids=instanceIDs)
     logger.info('Instance(s) terminated.')
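
Both AWS snippets above retry on AWSProvisioner._throttlePredicate, which is not shown in this listing. A hedged sketch of what such a predicate typically checks (an assumption, not the project's actual implementation):

from boto.exception import BotoServerError

def _throttlePredicate(e):
    # Retry only on AWS request-rate errors; let everything else propagate
    return (isinstance(e, BotoServerError)
            and e.error_code in ('Throttling', 'RequestLimitExceeded'))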
Exemplo n.º 43
0
def _docker(job,
            tool,
            parameters=None,
            workDir=None,
            dockerParameters=None,
            outfile=None,
            checkOutput=False,
            defer=None):
    """
    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools).
    :param list[str] parameters: Command line arguments to be passed to the tool.
           If list of lists: list[list[str]], then treat as successive commands chained with pipe.
    :param str workDir: Directory to mount into the container via `-v`. Destination convention is /data
    :param list[str] dockerParameters: Parameters to pass to Docker. Default parameters are `--rm`,
            `--log-driver none`, and the mountpoint `-v work_dir:/data` where /data is the destination convention.
             These defaults are removed if dockerParameters is passed, so be sure to pass them if they are desired.
    :param file outfile: Pipe output of Docker call to file handle
    :param bool checkOutput: When True, this function returns docker's output.
    :param int defer: What action should be taken on the container upon job completion?
           FORGO (0) will leave the container untouched.
           STOP (1) will attempt to stop the container with `docker stop` (useful for debugging).
           RM (2) will stop the container and then forcefully remove it from the system
           using `docker rm -f`. This is the default behavior if defer is set to None.
    """
    if parameters is None:
        parameters = []
    if workDir is None:
        workDir = os.getcwd()

    # Setup the outgoing subprocess call for docker
    baseDockerCall = ['docker', 'run']
    if dockerParameters:
        baseDockerCall += dockerParameters
    else:
        baseDockerCall += ['--rm', '--log-driver', 'none', '-v',
                           os.path.abspath(workDir) + ':/data']

    # Ensure the user has passed a valid value for defer
    require(defer in (None, FORGO, STOP, RM),
            'Please provide a valid value for defer.')

    # Get container name which is needed for _dockerKill
    try:
        if any('--name' in x for x in baseDockerCall):
            if any('--name=' in x for x in baseDockerCall):
                containerName = [x.split('=')[1] for x in baseDockerCall if '--name' in x][0]
            else:
                containerName = baseDockerCall[baseDockerCall.index('--name') + 1]
        else:
            containerName = _getContainerName(job)
            baseDockerCall.extend(['--name', containerName])
    except ValueError:
        containerName = _getContainerName(job)
        baseDockerCall.extend(['--name', containerName])
    except IndexError:
        raise RuntimeError("Couldn't parse Docker's `--name=` option, check parameters: " + str(dockerParameters))

    # Defer the container on-exit action
    if '--rm' in baseDockerCall and defer is None:
        defer = RM
    if '--rm' in baseDockerCall and defer is not RM:
        _logger.warn('--rm being passed to docker call but defer not set to dockerCall.RM, defer set to: ' + str(defer))
    job.defer(_dockerKill, containerName, action=defer)
    # Defer the permission fixing function which will run after this job concludes.
    # We call this explicitly later on in this function, but we defer it as well to handle unexpected job failure.
    job.defer(_fixPermissions, tool, workDir)

    # Make subprocess call

    # If parameters is list of lists, treat each list as separate command and chain with pipes
    if len(parameters) > 0 and type(parameters[0]) is list:
        # When piping, all arguments now get merged into a single string to bash -c.
        # We try to support spaces in paths by wrapping them all in quotes first.
        chain_params = [' '.join(p) for p in [list(map(pipes.quote, q)) for q in parameters]]
        # Use bash's set -eo pipefail to detect and abort on a failure in any command in the chain
        call = baseDockerCall + ['--entrypoint', '/bin/bash',  tool, '-c',
                                 'set -eo pipefail && {}'.format(' | '.join(chain_params))]
    else:
        call = baseDockerCall + [tool] + parameters
    _logger.info("Calling docker with " + repr(call))

    params = {}
    if outfile:
        params['stdout'] = outfile
    if checkOutput:
        callMethod = subprocess.check_output
    else:
        callMethod = subprocess.check_call

    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            out = callMethod(call, **params)

    _fixPermissions(tool=tool, workDir=workDir)
    return out
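
For illustration only (the work directory, container name, and image are made up), the piped branch above turns parameters=[['samtools', 'view', '-b', '/data/in.bam'], ['samtools', 'flagstat', '-']] into a command roughly like:

call = ['docker', 'run', '--rm', '--log-driver', 'none',
        '-v', '/tmp/work:/data', '--name', 'example-container',
        '--entrypoint', '/bin/bash', 'quay.io/ucsc_cgl/samtools', '-c',
        'set -eo pipefail && samtools view -b /data/in.bam | samtools flagstat -']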
Exemplo n.º 44
0
    def setNodeCount(self, nodeType, numNodes, preemptable=False, force=False):
        """
        Attempt to grow or shrink the number of preemptable or non-preemptable worker nodes in
        the cluster to the given value, or as close a value as possible, and, after performing
        the necessary additions or removals of worker nodes, return the resulting number of
        preemptable or non-preemptable nodes currently in the cluster.

        :param str nodeType: The node type to add or remove.

        :param int numNodes: Desired size of the cluster

        :param bool preemptable: whether the added nodes will be preemptable, i.e. whether they
               may be removed spontaneously by the underlying platform at any time.

        :param bool force: If False, the provisioner is allowed to deviate from the given number
               of nodes. For example, when downsizing a cluster, a provisioner might leave nodes
               running if they have active jobs running on them.

        :rtype: int
        :return: the number of worker nodes in the cluster after making the necessary
                adjustments. This value should be, but is not guaranteed to be, close or equal to
                the `numNodes` argument. It represents the closest possible approximation of the
                actual cluster size at the time this method returns.
        """
        for attempt in retry(predicate=self.provisioner.retryPredicate):
            with attempt:
                workerInstances = self.getNodes(preemptable=preemptable)
                logger.info("Cluster contains %i instances" %
                            len(workerInstances))
                # Reduce to nodes of the correct type
                workerInstances = {
                    node: workerInstances[node]
                    for node in workerInstances if node.nodeType == nodeType
                }
                ignoredNodes = [
                    node for node in workerInstances
                    if node.privateIP in self.ignoredNodes
                ]
                numIgnoredNodes = len(ignoredNodes)
                numCurrentNodes = len(workerInstances)
                logger.info(
                    "Cluster contains %i instances of type %s (%i ignored and draining jobs until "
                    "they can be safely terminated)" %
                    (numCurrentNodes, nodeType, numIgnoredNodes))
                if not force:
                    delta = numNodes - (numCurrentNodes - numIgnoredNodes)
                else:
                    delta = numNodes - numCurrentNodes
                if delta > 0:
                    if numIgnoredNodes > 0:
                        # We can un-ignore a few nodes to compensate for the additional nodes we want.
                        numNodesToUnignore = min(delta, numIgnoredNodes)
                        logger.info(
                            'Unignoring %i nodes because we want to scale back up again.'
                            % numNodesToUnignore)
                        delta -= numNodesToUnignore
                        for node in ignoredNodes[:numNodesToUnignore]:
                            self.ignoredNodes.remove(node.privateIP)
                            self.leader.batchSystem.unignoreNode(
                                node.privateIP)
                    logger.info(
                        'Adding %i %s nodes to get to desired cluster size of %i.',
                        delta,
                        'preemptable' if preemptable else 'non-preemptable',
                        numNodes)
                    numNodes = numCurrentNodes + self._addNodes(
                        nodeType, numNodes=delta, preemptable=preemptable)
                elif delta < 0:
                    logger.info(
                        'Removing %i %s nodes to get to desired cluster size of %i.',
                        -delta,
                        'preemptable' if preemptable else 'non-preemptable',
                        numNodes)
                    numNodes = numCurrentNodes - self._removeNodes(
                        workerInstances,
                        nodeType=nodeType,
                        numNodes=-delta,
                        preemptable=preemptable,
                        force=force)
                else:
                    if not force:
                        logger.info(
                            'Cluster (minus ignored nodes) already at desired size of %i. Nothing to do.',
                            numNodes)
                    else:
                        logger.info(
                            'Cluster already at desired size of %i. Nothing to do.',
                            numNodes)
        return numNodes
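
A hedged sketch of how a caller might drive setNodeCount (the scaler object and node type string below are assumptions, not from the source):

desired = 10
actual = scaler.setNodeCount(nodeType='c3.8xlarge', numNodes=desired, preemptable=False)
if actual != desired:
    logger.info('Asked for %i nodes, cluster settled at %i', desired, actual)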
Exemplo n.º 45
0
def retry_s3(delays=default_delays,
             timeout=default_timeout,
             predicate=retryable_s3_errors):
    return retry(delays=delays, timeout=timeout, predicate=predicate)
Exemplo n.º 46
0
 def wrapper(*args, **kwargs):
     for attempt in retry(delays=truncExpBackoff(),
                          timeout=300,
                          predicate=googleRetryPredicate):
         with attempt:
             return f(*args, **kwargs)
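
The wrapper above is the inner function of a retry decorator. A minimal sketch of how such a decorator could be assembled and applied (the googleRetry name and the decorated function are assumptions):

from functools import wraps

def googleRetry(f):
    @wraps(f)
    def wrapper(*args, **kwargs):
        for attempt in retry(delays=truncExpBackoff(),
                             timeout=300,
                             predicate=googleRetryPredicate):
            with attempt:
                return f(*args, **kwargs)
    return wrapper

@googleRetry
def createInstance(driver, **kwargs):
    # Any exception matching googleRetryPredicate is retried with truncated exponential backoff
    return driver.create_node(**kwargs)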
Exemplo n.º 47
0
 def _addTags(cls, instances, tags):
     for instance in instances:
         for key, value in iteritems(tags):
             for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
                 with attempt:
                     instance.add_tag(key, value)
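
A hedged usage sketch for the tagging helper above (the instance query and tag values are placeholders):

instances = ctx.ec2.get_only_instances(filters={'tag:clusterName': clusterName})
AWSProvisioner._addTags(instances, {'Owner': 'toil', 'clusterName': clusterName})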