Exemplo n.º 1
0
    def stress(self, *args):
        """Provision test instances and launch the stress-test script on each.

        The optional first positional argument is the number of test
        instances to run on (converted with ``int``; defaults to 2). A value
        less than or equal to zero is reported and nothing else is done.

        If ``self.test_instances`` does not already hold exactly that many
        instances, every existing test instance is terminated and dropped
        from ``self.instances``, and a fresh set is created through
        ``self._pool``. Afterwards the stress-test command is started on each
        test instance via ``utils.runbg``, with up to 5 attempts per instance.
        """
        num_instances = 2

        if len(args) > 0:
            num_instances = int(args[0])

        if num_instances <= 0:
            # the format argument was previously never supplied (a literal
            # "[%s]" was logged) and execution fell through; report and stop
            utils.log("[%s] invalid number of instances to run stress tests on" % self)
            return

        test_instances = self.test_instances

        if len(test_instances) != num_instances:
            if len(test_instances) > 0:
                utils.log("[%s] removing %d stale test instances before create can occur" % (self, len(test_instances)))
                stale_ids = set()

                # remove stale test instances
                for instance in test_instances:
                    stale_ids.add(instance.instance_id)
                    instance.terminate()

                # build a real list so that the extend() below always works
                self.instances = [inst for inst in self.instances if inst.instance_id not in stale_ids]

            utils.log("[%s] creating %d test instances" % (self, num_instances))

            # create new test instances
            test_instances = []
            for i in xrange(num_instances):
                config = {"name": "test%d" % i, "roles": ["test"], "instance_type": "m1.small"}

                instance = AWSInstance(self, config)
                test_instances.append(instance)
                self._pool.spawn(instance.create)

            # block until every spawned create() has finished
            self._pool.join()

            self.instances.extend(test_instances)
            utils.log("[%s] done creating %d test instances; initiating tests..." % (self, num_instances))

        env.user = "******"
        env.key_filename = ["keys/test-keypair"]

        # TODO: test just this portion

        # the command is identical for every instance, so build it once
        test_cmd = "/stamped/stamped/platform/tests/stampede/StressTests.py"
        log = "/stamped/logs/test.log"
        cmd = "sudo nohup bash -c '. /stamped/bin/activate && python %s >& %s < /dev/null' &" % (test_cmd, log)
        num_retries = 5

        for instance in test_instances:
            # NOTE(review): the result of utils.runbg is compared against 0
            # here, while elsewhere its result is used as a handle that is
            # .wait()-ed on -- confirm which contract runbg actually has.
            for _ in xrange(num_retries):
                if 0 == utils.runbg(instance.public_dns_name, env.user, cmd):
                    break
            else:
                # every attempt failed; report it rather than silently moving on
                utils.log(
                    "[%s] warning: unable to start stress tests on %s after %d attempts"
                    % (self, instance, num_retries)
                )
Exemplo n.º 2
0
    def clear_cache(self, *args):
        """Restart memcached on every instance in ``self.mem_server_instances``.

        The restart command is launched on all servers through
        ``utils.runbg`` before any of them is waited on. Positional arguments
        are accepted but have no effect on what is run.
        """
        cmd = "sudo /bin/bash -c 'restart memcached'"

        # restart memcached across all memcached servers
        pending = [
            (instance, utils.runbg(instance.public_dns_name, env.user, cmd))
            for instance in self.mem_server_instances
        ]

        for instance, proc in pending:
            # NOTE(review): assumes wait() returns the command's exit status
            # (as subprocess.Popen.wait does) -- confirm against utils.runbg.
            ret = proc.wait()
            if ret:
                # previously the result was discarded, hiding failed restarts
                utils.log("[%s] warning: failure restarting memcached on %s (status %s)" % (self, instance, ret))
Exemplo n.º 3
0
    def update(self, *args, **kwargs):
        """Run the bootstrap update script on every instance in ``self.instances``.

        Args:
            *args: if the first positional argument is the string "force",
                ``--force`` is passed to update.py and all instances are
                updated at once without touching their ELBs; otherwise the
                instances are updated one at a time.
            **kwargs: ``branch`` (optional) is forwarded to update.py as
                ``--branch <branch>``.

        In the one-at-a-time mode each instance is deregistered from its ELB
        (when it has one), updated, re-registered, and then polled until the
        ELB reports it as "InService". When an update fails, or the instance
        does not become healthy before the timeout, the operator is prompted
        through ``utils.get_input``:

        * after a failed update, "n" skips to the next instance (leaving the
          current one deregistered) and "a" stops the whole update;
        * after a health-check timeout, "n" or "a" stops the whole update and
          any other answer moves on to the next instance.
        """
        force = len(args) >= 1 and args[0] == "force"
        utils.log("[%s] updating %d instances" % (self, len(self.instances)))

        branch = kwargs.get("branch", None)

        cmd = "sudo /bin/bash -c '. /stamped/bin/activate && python /stamped/bootstrap/bin/update.py%s%s'" % (
            " --force" if force else "",
            " --branch %s" % branch if branch is not None else "",
        )
        separator = "-" * 80

        if force:
            # update all instances in parallel: launch everything first, then
            # wait on each launched command in turn
            pending = [
                (instance, utils.runbg(instance.public_dns_name, env.user, cmd))
                for instance in self.instances
            ]

            for instance, proc in pending:
                # NOTE(review): assumes wait() returns the command's exit
                # status (as subprocess.Popen.wait does) -- confirm against
                # utils.runbg.
                ret = proc.wait()
                if ret:
                    # previously the result was discarded, hiding failures
                    utils.log("[%s] warning: failure updating %s" % (self, instance))
        else:
            # update all instances synchronously, removing them one-at-a-time from
            # their respective ELBs and readding them once we're sure that the
            # update was applied successfully and the resulting instance is healthy
            for instance in self.instances:
                utils.log()
                utils.log(separator)
                utils.log("[%s] UPDATING %s" % (self, instance))

                # TODO: this logic doesn't account for the case where an instance
                # may belong to multiple ELBs. NOTE that this scenario will never
                # arise in our current stack architecture, but I'm leaving this
                # note in here just in case that assumption changes in the future.
                elb = self._get_elb(instance)

                # only deregister instance if it belongs to a non-trivial ELB
                deregister = elb is not None  # and len(elb.instances) > 1)

                if deregister:
                    utils.log("[%s] temporarily deregistering %s from %s" % (self, instance, elb))
                    elb.deregister_instances([instance.instance_id])

                    # TODO: this sleep shouldn't be necessary since the instance
                    # is definitely removed from the ELB at this point, but without
                    # pausing, the ELB seems to ignore performing a new health check
                    # before successfully re-registering the instance. pausing here
                    # effectively ensures that the state of the instance will be
                    # set to OutOfService s.t. the health check must be passed
                    # before the instance is considered InService after instance
                    # re-registration.
                    #
                    # NOTE: an additional advantage of pausing here is that the
                    # instance update script may restart certain daemons, and a
                    # small pause after removing the instance from its ELB should
                    # give the instance's daemons a chance to finish handling any
                    # in-progress requests (e.g., gunicorn / nginx).
                    time.sleep(10)

                # apply update synchronously
                with settings(host_string=instance.public_dns_name):
                    try:
                        result = run(cmd, pty=False, shell=True)
                        status = result.return_code
                    except Exception:
                        # if run fails, ask the user whether or not to continue instead of aborting
                        status = 1

                if 0 != status:
                    utils.log("[%s] warning: failure updating %s" % (self, instance))

                    confirmation = utils.get_input()
                    if deregister and (confirmation == "n" or confirmation == "a"):
                        utils.log("[%s] warning: not re-registering %s with %s" % (self, instance, elb))

                    if confirmation == "n":
                        continue
                    elif confirmation == "a":
                        return

                if deregister:
                    utils.log("[%s] %s re-registering with %s" % (self, instance, elb))
                    elb.register_instances([instance.instance_id])

                    utils.log("[%s] %s is waiting to come back online..." % (self, instance))

                    # TODO: infer max timeout from health check settings
                    timeout = 600
                    delay = 2

                    # wait for the instance to come back online with the ELB
                    while True:
                        try:
                            health = elb.get_instance_health([instance.instance_id])[0]

                            if health.state == "InService":
                                utils.log("[%s] %s is back online with elb %s..." % (self, instance, elb))
                                break
                        except Exception as e:
                            # substitute a placeholder so the logging below
                            # still has a state and description to report
                            health = utils.AttributeDict(dict(state="error retrieving health", description=str(e)))

                        utils.log("[%s] %s is '%s' (%s)" % (self, instance, health.state, health.description))

                        # instance is not in service yet; sleep for a bit before retrying
                        # (only the sleep time is counted against the timeout budget)
                        timeout -= delay
                        if timeout <= 0:
                            utils.log(
                                "[%s] %s timed out with elb %s (state=%s, desc=%s)..."
                                % (self, instance, elb, health.state, health.description)
                            )

                            confirmation = utils.get_input()
                            if confirmation == "n" or confirmation == "a":
                                return
                            else:
                                break

                        time.sleep(delay)

                utils.log("[%s] successfully updated %s" % (self, instance))