def start(self, collection, docker, ping, database_name):
    """Launches a cAdvisor container on the instance."""
    options = self.options
    volumes = {
        '/': {'bind': '/rootfs', 'ro': True},
        '/var/run': {'bind': '/var/run', 'ro': False},
        '/sys': {'bind': '/sys', 'ro': True},
        '/var/lib/docker': {'bind': '/var/lib/docker', 'ro': True}
    }

    logger.debug("cAdvisor: Writing stats to %s" % database_name)
    command_args = " ".join([
        "-storage_driver=influxdb",
        "-log_dir=/",
        "-storage_driver_db=%s" % quote(database_name),
        "-storage_driver_host=%s:%d" % (quote(options.host), options.port),
        "-storage_driver_user=%s" % quote(options.user),
        "-storage_driver_password=%s" % quote(options.password),
        "-storage_driver_secure=%d" % options.secure,
        # TODO: Calculate based on the run time.
        "-storage_driver_buffer_duration=5s"
    ])
    yield docker.run_containers(collection, self.info.name, None,
                                command_args, volumes,
                                ports={8080: 8080})
    yield self.wait(collection, ping)

def _run_complete(self, session, mgr, future):
    logger.debug('Run Plan completed')
    try:
        response = future.result()
        logger.debug("Run response of: %s", response)
    except:
        logger.error("Run did an exception", exc_info=True)

def run(instance, tries=0):
    dns = getattr(instance.state, "dns_server", [])
    docker = instance.state.docker

    added_env = "\n".join([
        "HOST_IP=%s" % instance.instance.ip_address,
        "STATSD_HOST=%s" % instance.instance.private_ip_address,
        "STATSD_PORT=8125"])

    if env:
        _env = env + "\n" + added_env
    else:
        _env = added_env

    _env = self.substitute_names(_env, _env)
    container_env = _env.split("\n")
    container_args = self.substitute_names(command_args, _env)

    try:
        return docker.run_container(
            container_name, container_env, container_args,
            volumes, ports, dns=dns)
    except Exception as exc:
        logger.debug("Exception with run_container: %s", exc)
        if tries > 3:
            logger.debug("Giving up on running container.")
            return False
        docker.stop_container(container_name)
        return run(instance, tries=tries + 1)

async def wait(self, collection, interval=60, timeout=600):
    """Waits till docker is available on every instance in the
    collection."""
    end = time.time() + timeout

    not_responded = self.not_responding_instances(collection)

    def get_container(inst):
        try:
            inst.state.docker.get_containers()
            inst.state.docker.responded = True
        except DOCKER_RETRY_EXC:
            logger.debug("Docker not ready yet on %s",
                         str(inst.instance.id))
        except Exception as exc:
            logger.debug("Got exception on %s: %r",
                         str(inst.instance.id), exc)

    # Attempt to fetch until they've all responded
    while not_responded and time.time() < end:
        await gen.multi([collection.execute(get_container, x)
                         for x in not_responded])

        # Update the not_responded
        not_responded = self.not_responding_instances(collection)

        if not_responded:
            await collection.wait(interval)

    # Prune the non-responding
    logger.debug("Pruning %d non-responding instances.",
                 len(not_responded))
    await collection.remove_instances(not_responded)

def wait(self, collection, interval=5, timeout=600):
    """Waits till docker is available on every instance in the
    collection."""
    end = time.time() + timeout

    not_responded = self.not_responding_instances(collection)

    def get_container(inst):
        try:
            inst.state.docker.get_containers()
            inst.state.docker.responded = True
        except Exception:
            pass

    # Attempt to fetch until they've all responded
    while not_responded and time.time() < end:
        yield [collection.execute(get_container, x)
               for x in not_responded]

        # Update the not_responded
        not_responded = self.not_responding_instances(collection)

        if not_responded:
            yield collection.wait(interval)

    # Prune the non-responding
    logger.debug("Pruning %d non-responding instances.",
                 len(not_responded))
    collection.remove_instances(not_responded)

def get_amis(region):
    logger.debug("Working in %s" % region)
    try:
        conn = connect_to_region(
            region,
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            port=port, is_secure=is_secure)

        filters = {}
        if owner_id is not None and use_filters:
            filters["owner-id"] = owner_id

        images = conn.get_all_images(filters=filters)

        # The last two highest sorted are the pvm and hvm instance id's
        # what is this 899.4 ??? XXX
        # images = sorted([x for x in images if "899.4" in x.name],
        #                 key=lambda x: x.name)[-2:]
        images = sorted(images, key=lambda x: x.name)[-2:]
        AWS_AMI_IDS[region] = {x.virtualization_type: x for x in images}
        logger.debug("%s populated" % region)
    except Exception as exc:
        logger.exception('Could not get all images in %s' % region)
        errors.append(exc)

def _recover(self):
    """Recover allocated instances from EC2."""
    recovered_instances = defaultdict(list)

    # Recover every region at once
    instancelist = yield [self._recover_region(x) for x in AWS_REGIONS]

    logger.debug("Found %s instances to recover.",
                 sum(map(len, instancelist)))

    for instances in instancelist:
        for instance in instances:
            tags = instance.tags

            # If this has been 'pending' too long, we put it in the main
            # instance pool for later reaping
            if not available_instance(instance):
                self._instances[instance.region.name].append(instance)
                continue

            if tags.get("RunId") and tags.get("Uuid"):
                # Put allocated instances into a recovery pool separate
                # from unallocated
                inst_key = (tags["RunId"], tags["Uuid"])
                recovered_instances[inst_key].append(instance)
            else:
                self._instances[instance.region.name].append(instance)
    self._recovered = recovered_instances

def setup_database(session, db_file):
    """Helper function to setup the initial database based off a json
    file"""
    logger.debug("Verifying database setup.")
    with open(db_file) as fp:
        data = json.load(fp, object_pairs_hook=OrderedDict)

    # Verify the project exists
    project = session.query(Project).filter_by(name=data["name"]).first()
    if not project:
        project = Project(name=data["name"])
        session.add(project)
        session.commit()
    logger.debug("Project ID: %s", project.uuid)

    # Key plans by name to look them up quickly if they exist
    existing = {plan.name: plan for plan in project.plans}

    # Verify every strategy exists
    for plan in data["plans"]:
        ex_plan = existing.get(plan["name"])
        if ex_plan:
            logger.debug("Found plan: %s, UUID: %s", ex_plan.name,
                         ex_plan.uuid)
            continue

        new_plan = Plan.from_json(plan)
        project.plans.append(new_plan)
        session.commit()
        logger.debug("Added plan: %s, UUID: %s", new_plan.name,
                     new_plan.uuid)
    logger.debug("Finished database setup.")

def makedirs(sftp, dirname, mode=511):
    """Creates a directory with the given dirname and mode on a remote
    server, including any intermediate-level directories."""
    if not dirname:
        raise OSError('Missing directory name')

    dirnames = deque([dirname])
    while True:
        dirname, basename = os.path.split(dirname)
        if not basename:
            dirname, basename = os.path.split(dirname)
        if not dirname or not basename:
            break
        dirnames.appendleft(dirname)

    for dirname in dirnames:
        try:
            attrs = sftp.stat(dirname)
        except OSError:
            logger.debug("Creating directory %s..." % dirname)
            sftp.mkdir(dirname, mode)
            continue
        if not stat.S_ISDIR(attrs.st_mode):
            raise OSError("%s exists and is not a directory" % dirname)

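# Illustrative sketch (not part of the original module): makedirs() only
# needs an object exposing stat() and mkdir(), so a paramiko SFTP client
# satisfies it. The host, user, and key path below are hypothetical.
def _example_remote_makedirs():
    import paramiko

    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect("core-host.example.com", username="core",
                key_filename="/home/user/.ssh/loads.pem")
    try:
        sftp = ssh.open_sftp()
        # Walks each path component from the root down, creating any
        # that are missing and skipping levels that already exist.
        makedirs(sftp, "/home/core/heka/output", mode=0o755)
        sftp.close()
    finally:
        ssh.close()
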
def _start_set(self, setlink):
    if setlink.collection.started:
        return

    setlink.collection.started = True

    # Start cadvisor
    database_name = "%s-cadvisor" % self.run.id
    logger.debug("Starting up cadvisor on the hosts")
    yield self.helpers.cadvisor.start(
        setlink.collection, self.helpers.docker, self.helpers.ping,
        database_name)

    # Start heka
    yield self.helpers.heka.start(setlink.collection,
                                  self.helpers.docker,
                                  self.helpers.ping)

    # Startup local DNS if needed
    if self._use_dns:
        yield self.helpers.dns.start(setlink.collection, self._dns_map)

    # Startup the testers
    yield self.helpers.docker.run_containers(
        setlink.collection,
        container_name=setlink.meta.container_name,
        env=setlink.meta.environment_data,
        command_args=setlink.meta.additional_command_args,
        local_dns=self._use_dns
    )

def start(self, collection, docker, ping, database_name): """Launches Heka containers on all instances.""" if not self.options: logger.debug("Heka not configured") return config_file = HEKA_CONFIG_TEMPLATE.substitute( remote_addr=join_host_port(self.options.host, self.options.port), remote_secure=self.options.secure and "true" or "false", influx_addr=join_host_port(self.influx.host, self.influx.port), influx_db=database_name) volumes = {'/home/core/heka': {'bind': '/heka', 'ro': False}} ports = {(8125, "udp"): 8125, 4352: 4352} # Upload heka config to all the instances def upload_files(inst): with StringIO(config_file) as fl: self.sshclient.upload_file(inst.instance, fl, "/home/core/heka/config.toml") yield collection.map(upload_files) logger.debug("Launching Heka...") yield docker.run_containers(collection, self.info.name, None, "hekad -config=/heka/config.toml", volumes=volumes, ports=ports) def ping_heka(inst): health_url = "http://%s:4352/" % inst.instance.ip_address yield ping.ping(health_url) yield collection.map(ping_heka)
def wait_for_running(self, interval=5, timeout=600):
    """Wait for all the instances to be running. Instances unable to
    load will be removed."""
    def update_state(inst):
        try:
            inst.instance.update()
        except Exception:
            # Updating state can fail, it happens
            pass

    end_time = time.time() + timeout
    pending = self.pending_instances()

    while time.time() < end_time and pending:
        # Update the state of all the pending instances
        yield [self.execute(update_state, inst) for inst in pending]
        pending = self.pending_instances()

        # Wait if there's pending to check again
        if pending:
            yield self.wait(interval)

    # Remove everything that isn't running by now
    dead = self.dead_instances() + self.pending_instances()

    # Don't wait for the future that kills them
    logger.debug("Removing %d dead instances that wouldn't run.",
                 len(dead))
    self.remove_instances(dead)
    return True

async def is_done(self, docker) -> bool:
    """Determine if finished or pending termination"""
    # If we haven't been started, we can't be done
    if not self.step_record.started_at:
        return False

    # If we're already stopped, then we're obviously done
    if self.ec2_collection.finished:
        return True

    run = self.step_record.run
    container_name = run.interpolate(
        self.step.container_name, self.step.environment_data)

    # If the collection has no instances running the container, it's done
    instances_running = await docker.is_running(
        self.ec2_collection,
        container_name,
        prune=self.step.prune_running
    )
    if not instances_running:
        inst_info = []
        for inst, info in self._instance_debug_info().items():
            inst_info.append(inst)
            inst_info.append(pformat(info))
        logger.debug("No instances running, collection done.")
        logger.debug("Instance information:\n%s", '\n'.join(inst_info))
        return True

    # Remove instances that stopped responding
    await self.ec2_collection.remove_dead_instances()

    # Otherwise return whether we should be stopped
    return self.step_record.should_stop()

async def _is_done(self, setlink):
    """Given a StepRecordLink, determine if the collection has
    finished or should be terminated."""
    # If we haven't been started, we can't be done
    if not setlink.step_record.started_at:
        return False

    # If we're already stopped, then we're obviously done
    if setlink.ec2_collection.finished:
        return True

    # If the collection has no instances running the container, it's done
    docker = self.helpers.docker
    container_name = setlink.step.container_name
    instances_running = await docker.is_running(
        setlink.ec2_collection,
        container_name,
        prune=setlink.step.prune_running)
    if not instances_running:
        inst_info = []
        for inst, info in self._instance_debug_info(setlink).items():
            inst_info.append(inst)
            inst_info.append(pformat(info))
        logger.debug("No instances running, collection done.")
        logger.debug("Instance information:\n%s", '\n'.join(inst_info))
        return True

    # Remove instances that stopped responding
    await setlink.ec2_collection.remove_dead_instances()

    # Otherwise return whether we should be stopped
    return setlink.step_record.should_stop()

def load(instance, tries=0):
    docker = instance.state.docker
    has_container = docker.has_image(container_name)
    if has_container:
        return

    if container_url:
        client = self.sshclient.connect(instance.instance)
        try:
            output = docker.import_container(client, container_url)
        finally:
            client.close()
    else:
        output = docker.pull_container(container_name)

    if not docker.has_image(container_name):
        if tries > 3:
            logger.debug("Can't load container, retries exceeded.")
            return False
        logger.debug("Unable to load container: %s. Retrying.", output)
        return load(instance, tries + 1)
    return output

def _print_status(self):
    while True:
        if not len(self._runs):
            logger.debug("Status: No runs in progress.")
        for uuid, mgr in self._runs.items():
            run = mgr.run
            logger.debug("Run state for %s: %s - %s", run.uuid,
                         status_to_text(mgr.state), mgr.state_description)
        yield gen.Task(self.loop.add_timeout, time.time() + 10)

def get_container(inst):
    try:
        inst.state.docker.get_containers()
        inst.state.docker.responded = True
    except DOCKER_RETRY_EXC:
        logger.debug("Docker not ready yet on %s",
                     str(inst.instance.id))
    except Exception as exc:
        logger.debug("Got exception on %s: %r",
                     str(inst.instance.id), exc)

def initialize(self):
    """Fully initialize the AWS pool and dependencies, recover existing
    instances, etc.

    :returns: A future that will require the loop running to retrieve.

    """
    logger.debug("Pulling CoreOS AMI info...")
    populate_ami_ids(self.access_key, self.secret_key, port=self.port,
                     owner_id=self.owner_id)
    return self._recover()

def initialize(self):
    """Fully initialize the AWS pool and dependencies, recover existing
    instances, etc.

    :returns: A future that will require the loop running to retrieve.

    """
    logger.debug("Pulling CoreOS AMI info...")
    populate_ami_ids(self.access_key, self.secret_key, port=self.port,
                     owner_id=self.owner_id,
                     use_filters=self.use_filters)
    return self._recover()

def ___retry(*args, **kw):
    attempt = 1
    while attempt < attempts:
        try:
            return func(*args, **kw)
        except Exception:
            logger.debug('Failed (%d/%d)' % (attempt, attempts),
                         exc_info=True)
            attempt += 1
    # Retries exhausted; make the final attempt and let any exception
    # propagate to the caller.
    return func(*args, **kw)

def has_container(instance):
    try:
        all_containers = instance.state.docker.get_containers()
    except:
        if prune:
            msg = ("Lost contact with a container on %s, "
                   "marking dead.")
            logger.debug(msg % instance.id)
            instance.state.nonresponsive = True
        return not prune
    return any(container_name in cont["Image"]
               for cont in all_containers.values())

async def _get_steps(self):
    """Request all the step instances needed from the pool

    This is a separate method as both the recover run and new run
    will need to run this identically.

    """
    logger.debug('Getting steps & collections')
    steps = self.run.plan.steps
    collections = await gen.multi([
        self._pool.request_instances(
            self.run.uuid, s.uuid,
            count=s.instance_count,
            inst_type=s.instance_type,
            region=s.instance_region,
            plan=self.run.plan.name,
            owner=self.run.owner,
            run_max_time=s.run_delay + s.run_max_time)
        for s in steps
    ])

    try:
        # First, setup some dicts, all keyed by step.uuid
        steps_by_uuid = {x.uuid: x for x in steps}
        step_records_by_uuid = {
            x.step.uuid: x for x in self.run.step_records
        }

        # Link the step/step_record/ec2_collection under a single
        # StepRecordLink tuple
        for coll in collections:
            step = steps_by_uuid[coll.uuid]
            step_record = step_records_by_uuid[coll.uuid]
            setlink = StepRecordLink(step_record, step, coll)
            self._set_links.append(setlink)
    except Exception:
        # Ensure we return collections if something bad happened
        logger.error("Got an exception in runner, returning instances",
                     exc_info=True)

        try:
            await gen.multi(
                [self._pool.release_instances(x) for x in collections])
        except:
            logger.error("Wat? Got an error returning instances.",
                         exc_info=True)

        # Clear out the setlinks to make sure they aren't cleaned up
        # again
        self._set_links = []

async def _initialize(self):
    # Initialize all the collections, this needs to always be done
    # just in case we're recovering
    await self._get_steps()

    # Skip if we're running
    if self.state == RUNNING:
        return

    # Wait for the collections to come up
    self.state_description = "Waiting for running instances."
    await gen.multi(
        [x.ec2_collection.wait_for_running() for x in self._set_links])

    # Setup docker on the collections
    docker = self.helpers.docker
    await gen.multi([
        docker.setup_collection(x.ec2_collection)
        for x in self._set_links
    ])

    # Wait for docker on all the collections to come up
    self.state_description = "Waiting for docker"
    await gen.multi([
        docker.wait(x.ec2_collection, timeout=360)
        for x in self._set_links
    ])

    # Pull the base containers we need (for heka)
    self.state_description = "Pulling base container images"
    for container in self.base_containers:
        logger.debug("Pulling base container " + container.name)
        await gen.multi([
            docker.load_containers(x.ec2_collection, container.name,
                                   container.url)
            for x in self._set_links
        ])

    logger.debug("Pulling containers for this step.")
    # Pull the appropriate containers for every collection
    self.state_description = "Pulling step images"
    await gen.multi([
        docker.load_containers(x.ec2_collection, x.step.container_name,
                               x.step.container_url)
        for x in self._set_links
    ])

    self.state_description = ""

    self.run.state = RUNNING
    self.run.started_at = datetime.utcnow()
    self._db_session.commit()
    log_threadid("Now running.")

def _region_conn(self, region=None):
    if region in self._conns:
        return self._conns[region]

    # Setup a connection
    logger.debug("Requesting connection for region: %s", region)
    conn = yield self._executor.submit(
        connect_to_region, region,
        aws_access_key_id=self.access_key,
        aws_secret_access_key=self.secret_key,
        port=self.port, is_secure=self.is_secure)

    self._conns[region] = conn
    logger.debug("Returning connection for region: %s", region)
    return conn

def request_instances(self, run_id, uuid, count=1, inst_type="t1.micro",
                      region="us-west-2"):
    """Allocate a collection of instances.

    :param run_id: Run ID for these instances
    :param uuid: UUID to use for this collection
    :param count: How many instances to allocate
    :param type: EC2 Instance type the instances should be
    :param region: EC2 region to allocate the instances in
    :returns: Collection of allocated instances
    :rtype: :ref:`EC2Collection`

    """
    if region not in AWS_REGIONS:
        raise LoadsException("Unknown region: %s" % region)

    # First attempt to recover instances for this run/uuid
    instances = self._locate_recovered_instances(run_id, uuid)
    remaining_count = count - len(instances)

    # Add any more remaining that should be used
    instances.extend(
        self._locate_existing_instances(remaining_count, inst_type,
                                        region)
    )

    conn = yield self._region_conn(region)

    # Determine if we should allocate more instances
    num = count - len(instances)
    if num > 0:
        new_instances = yield self._allocate_instances(
            conn, num, inst_type, region)
        logger.debug("Allocated instances: %s", new_instances)
        instances.extend(new_instances)

    # Tag all the instances
    if self.use_filters:
        yield self._executor.submit(
            conn.create_tags,
            [x.id for x in instances],
            {
                "Name": "loads-%s" % self.broker_id,
                "Project": "loads",
                "RunId": run_id,
                "Uuid": uuid
            }
        )
    return EC2Collection(run_id, uuid, conn, instances, self._loop)

def main(sysargs=None):
    """Parses arguments and starts up the loads-broker.

    This daemon runs in the foreground.

    """
    args, parser = _parse(sysargs)
    set_logger(debug=args.debug)
    loop = tornado.ioloop.IOLoop.instance()

    if args.aws_endpoints is not None:
        os.environ['BOTO_ENDPOINTS'] = args.aws_endpoints

    # an empty string means we don't filter by owner id
    # we translate this to None
    aws_owner_id = args.aws_owner_id and args.aws_owner_id or None
    aws_access_key = os.environ.get('AWS_ACCESS_KEY_ID')
    aws_secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

    heka_options = HekaOptions(args.heka_host,
                               args.heka_port,
                               args.heka_secure)

    if args.no_influx:
        influx_options = None
    else:
        influx_options = InfluxOptions(args.influx_host,
                                       args.influx_port,
                                       args.influx_user,
                                       args.influx_password,
                                       args.influx_secure)

    application.broker = Broker(args.name, loop, args.database,
                                args.ssh_key,
                                heka_options, influx_options,
                                aws_port=args.aws_port,
                                aws_owner_id=aws_owner_id,
                                aws_use_filters=not args.aws_skip_filters,
                                aws_access_key=aws_access_key,
                                aws_secret_key=aws_secret_key,
                                initial_db=args.initial_db)

    logger.debug('Listening on port %d...' % args.port)
    application.listen(args.port)

    try:
        loop.start()
    except KeyboardInterrupt:
        logger.debug('Bye')

def main(sysargs=None):
    args, parser = _parse(sysargs)
    set_logger(debug=args.debug)
    c = Client(args.host, args.port, args.scheme)

    if not hasattr(args, 'func'):
        args.func = _COMMANDS['info']

    args.func = args.func(c.session, c.root)

    try:
        res = args.func(args)
        print(json.dumps(res))
    except requests.exceptions.ConnectionError as e:
        logger.debug('Cannot connect => ' + str(e))

def ___retry(*args, **kw):
    attempt = 0
    while True:
        attempt += 1
        try:
            result = func(*args, **kw)
        except Exception as exc:
            if (on_exception is None or
                    not on_exception(exc) or
                    attempt == attempts):
                logger.debug('Failed (%d/%d)' % (attempt, attempts),
                             exc_info=True)
                raise
        else:
            if (on_result is None or
                    not on_result(result) or
                    attempt == attempts):
                return result

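# Hedged sketch (the enclosing factory is not shown in the original source):
# ___retry above closes over func, attempts, on_exception, and on_result,
# so it is presumably produced by a decorator factory along these lines.
# The names and defaults here are assumptions for illustration only.
def _example_retry(attempts=3, on_exception=None, on_result=None):
    import functools

    def decorator(func):
        @functools.wraps(func)
        def ___retry(*args, **kw):
            attempt = 0
            while True:
                attempt += 1
                try:
                    result = func(*args, **kw)
                except Exception as exc:
                    # Re-raise unless the caller opted in to retrying
                    # this exception and attempts remain.
                    if (on_exception is None or not on_exception(exc)
                            or attempt == attempts):
                        raise
                else:
                    # Return unless the caller asked for a retry based
                    # on the result and attempts remain.
                    if (on_result is None or not on_result(result)
                            or attempt == attempts):
                        return result
        return ___retry
    return decorator

# e.g.:  @_example_retry(attempts=3, on_result=lambda r: not r)
#        def list_containers(client):
#            return client.get_containers()
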
def __init__(self, broker_id, access_key=None, secret_key=None,
             key_pair="loads", security="loads", max_idle=600,
             user_data=None, io_loop=None, port=None,
             owner_id="595879546273", use_filters=True):
    self.owner_id = owner_id
    self.use_filters = use_filters
    self.broker_id = broker_id
    self.access_key = access_key
    self.secret_key = secret_key
    self.max_idle = max_idle
    self.key_pair = key_pair
    self.security = security
    self.user_data = user_data
    self._instances = defaultdict(list)
    self._tag_filters = {"tag:Name": "loads-%s" % self.broker_id,
                         "tag:Project": "loads"}
    self._conns = {}
    self._recovered = {}
    self._executor = concurrent.futures.ThreadPoolExecutor(15)
    self._loop = io_loop or tornado.ioloop.IOLoop.instance()
    self.port = port
    # see https://github.com/boto/boto/issues/2617
    if port is not None:
        self.is_secure = port == 443
    else:
        self.is_secure = True

    # Asynchronously initialize ourself when the pool runs
    self._loop.add_future(
        self.initialize(),
        lambda x: logger.debug("Finished initializing. %s", x.result())
    )
    self.ready = Future()

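# Hedged example of constructing the pool (the broker name and credential
# sources are hypothetical). The constructor schedules initialize() on the
# IOLoop, so recovery of existing EC2 instances begins once the loop runs.
def _example_build_pool():
    import os
    import tornado.ioloop

    loop = tornado.ioloop.IOLoop.instance()
    return EC2Pool("demo-broker",
                   access_key=os.environ.get("AWS_ACCESS_KEY_ID"),
                   secret_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
                   key_pair="loads", security="loads",
                   io_loop=loop)
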
def _recover_region(self, region):
    """Recover all the instances in a region"""
    conn = yield self._region_conn(region)

    logger.debug("Requesting instances for %s", region)

    if self.use_filters:
        filters = self._tag_filters
    else:
        filters = {}

    instances = yield self._executor.submit(
        conn.get_only_instances,
        filters=filters)

    logger.debug("Finished requesting instances for %s", region)
    return instances

def safe_run_container(self, name: str, *args, **kwargs) -> Any:
    """Call run_container until it succeeds

    Max of 5 tries w/ attempts to stop potential zombie containers.

    """
    for i in range(5):
        try:
            return self.run_container(name, *args, **kwargs)
        except Exception:
            logger.debug("Exception with run_container (%s)", name,
                         exc_info=True)
            if i == 4:
                logger.debug("Giving up on running container.")
                raise
            self.stop_container(name)

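# Hedged usage sketch: docker_helper stands in for an instance of the class
# defining safe_run_container/run_container/stop_container above; the image
# name, command, and env values are hypothetical. Extra positional and
# keyword arguments are simply forwarded to run_container on each attempt.
def _example_safe_run(docker_helper):
    return docker_helper.safe_run_container(
        "loads/example-test:latest",
        "./run_test.sh",
        env={"STATSD_PORT": "8125"})
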
def _is_done(self, setlink):
    """Given a ContainerSetLink, determine if the collection has
    finished or should be terminated."""
    # If we haven't been started, we can't be done
    if not setlink.running.started_at:
        return False

    # If the collection has no instances running the container, it's done
    docker = self.helpers.docker
    container_name = setlink.meta.container_name
    instances_running = yield docker.is_running(setlink.collection,
                                                container_name)
    if not instances_running:
        logger.debug("No instances running, collection done.")
        return True

    # Otherwise return whether we should be stopped
    return setlink.running.should_stop()

def _initialize(self):
    # Initialize all the collections, this needs to always be done
    # just in case we're recovering
    yield self._get_container_sets()

    # Skip if we're running
    if self.state == RUNNING:
        return

    # Wait for the collections to come up
    self.state_description = "Waiting for running instances."
    yield [x.collection.wait_for_running() for x in self._set_links]

    # Setup docker on the collections
    docker = self.helpers.docker
    yield [docker.setup_collection(x.collection)
           for x in self._set_links]

    # Wait for docker on all the collections to come up
    self.state_description = "Waiting for docker"
    yield [docker.wait(x.collection, timeout=120)
           for x in self._set_links]

    logger.debug("Pulling base containers: heka/cadvisor")

    # Pull the base containers we need (for heka / cadvisor)
    self.state_description = "Pulling base container images"
    for container in self.base_containers:
        yield [docker.load_containers(x.collection, container.name,
                                      container.url)
               for x in self._set_links]

    logger.debug("Pulling containers for this set.")
    # Pull the appropriate containers for every collection
    self.state_description = "Pulling container set images"
    yield [docker.load_containers(x.collection, x.meta.container_name,
                                  x.meta.container_url)
           for x in self._set_links]

    self.state_description = ""

    self.run.state = RUNNING
    self.run.started_at = datetime.utcnow()
    self._db_session.commit()
    log_threadid("Now running.")

def run(instance, tries=0):
    dns = getattr(instance.state, "dns_server", [])
    docker = instance.state.docker
    rinstance = instance.instance

    extra = [
        ("HOST_IP", rinstance.ip_address),
        ("PRIVATE_IP", rinstance.private_ip_address),
        ("STATSD_HOST", rinstance.private_ip_address),
        ("STATSD_PORT", "8125")]

    extra_env = env.copy()
    extra_env.update(extra)
    _env = {
        self.substitute_names(k, extra_env):
            self.substitute_names(v, extra_env)
        for k, v in extra_env.items()
    }

    if command is None:
        _command = None
    else:
        _command = self.substitute_names(command, _env)

    _volumes = {}
    for host, volume in volumes.items():
        binding = volume.copy()
        binding["bind"] = self.substitute_names(
            binding.get("bind", host), _env)
        _volumes[self.substitute_names(host, _env)] = binding

    try:
        return docker.run_container(
            name, _command,
            env=_env, volumes=_volumes, ports=ports,
            dns=dns, pid_mode=pid_mode)
    except Exception as exc:
        logger.debug("Exception with run_container: %s", exc)
        if tries > 3:
            logger.debug("Giving up on running container.")
            return False
        docker.stop_container(name)
        return run(instance, tries=tries + 1)

async def _start_step_containers(self, docker):
    """Startup the testers"""
    # XXX: run env should more likely override step env
    run = self.step_record.run
    env = run.environment_data or {}
    env.update(self.step.environment_data)
    env['CONTAINER_ID'] = self.step.uuid
    logger.debug("Starting step: %s", self.ec2_collection.uuid)
    container_name = run.interpolate(
        self.step.container_name, self.step.environment_data)
    await docker.run_containers(
        self.ec2_collection,
        container_name,
        self.step.additional_command_args,
        env=env,
        ports=self.step.port_mapping or {},
        volumes=self.step.volume_mapping or {},
        delay=self.step.node_delay,
    )

async def _start_base_containers(self, helpers, dns_map,
                                 influxdb_options):
    # Reload sysctl because coreos doesn't reload this right
    await helpers.ssh.reload_sysctl(self.ec2_collection)

    # Start Watcher
    await helpers.watcher.start(self.ec2_collection, helpers.docker)

    if self.is_monitored:
        await helpers.telegraf.start(
            self.ec2_collection,
            helpers.docker,
            influxdb_options,
            step=self.step.name,
            type_=self.step.docker_series
        )

    # Startup local DNS if needed
    if self.ec2_collection.local_dns:
        logger.debug("Starting up DNS")
        await helpers.dns.start(self.ec2_collection, dns_map)

async def remove_instances(self, ec2_instances):
    """Remove an instance entirely."""
    if not ec2_instances:
        return

    instances = [i.instance for i in ec2_instances]
    for inst in ec2_instances:
        self.instances.remove(inst)

    instance_ids = [x.id for x in instances]

    try:
        # Remove the tags
        await self.execute(self.conn.create_tags, instance_ids,
                           {"RunId": "", "Uuid": ""})
    except Exception:
        logger.debug("Error detagging instances, continuing.",
                     exc_info=True)

    try:
        logger.debug("Terminating instances %s" % str(instance_ids))
        # Nuke them
        await self.execute(self.conn.terminate_instances, instance_ids)
    except Exception:
        logger.debug("Error terminating instances.", exc_info=True)

def _start_set(self, setlink):
    if setlink.collection.started:
        return

    setlink.collection.started = True

    # Start cadvisor
    database_name = "%s-cadvisor" % self.run.uuid
    logger.debug("Starting up cadvisor on the hosts")
    yield self.helpers.cadvisor.start(
        setlink.collection, self.helpers.docker, self.helpers.ping,
        database_name)

    # Start heka
    yield self.helpers.heka.start(setlink.collection,
                                  self.helpers.docker,
                                  self.helpers.ping,
                                  self.run.uuid)

    # Startup local DNS if needed
    if setlink.collection.local_dns:
        logger.debug("Starting up DNS")
        yield self.helpers.dns.start(setlink.collection, self._dns_map)

    # Startup the testers
    env = "\n".join([dict2str(self.run_env),
                     setlink.meta.environment_data,
                     "CONTAINER_ID=%s" % setlink.meta.uuid])
    logger.debug("Starting container set: %s", setlink.collection.uuid)
    yield self.helpers.docker.run_containers(
        setlink.collection,
        container_name=setlink.meta.container_name,
        env=env,
        command_args=setlink.meta.additional_command_args,
        ports=setlink.meta.port_mapping or {}
    )

def new_run(cls, run_helpers, db_session, pool, io_loop, plan_uuid,
            run_uuid=None, additional_env=None, owner=None):
    """Create a new run manager for the given strategy name

    This creates a new run for this strategy and initializes it.

    :param db_session: SQLAlchemy database session
    :param pool: AWS EC2Pool instance to allocate from
    :param io_loop: A tornado io loop
    :param plan_uuid: The strategy UUID to use for this run
    :param run_uuid: Use the provided run_uuid instead of generating one
    :param additional_env: Additional env args to use in container set
                           interpolation
    :returns: New RunManager in the process of being initialized,
              along with a future tracking the run.

    """
    # Create the run for this manager
    logger.debug('Starting a new run manager')
    run = Run.new_run(db_session, plan_uuid, owner)
    if run_uuid:
        run.uuid = run_uuid
    db_session.add(run)
    db_session.commit()

    log_threadid("Committed new session.")

    run_manager = cls(run_helpers, db_session, pool, io_loop, run)
    if additional_env:
        run_manager.run_env.update(additional_env)
    future = gen.convert_yielded(run_manager.start())
    return run_manager, future

def _cleanup(self, exc=False):
    if exc:
        # Ensure we try and shut them down
        logger.debug("Exception occurred, ensure containers terminated.",
                     exc_info=True)
        try:
            yield [self._stop_set(s) for s in self._set_links]
        except Exception:
            logger.error("Le sigh, error shutting down instances.",
                         exc_info=True)

    # Ensure we always release the collections we used
    logger.debug("Returning collections")

    try:
        yield [self._pool.release_instances(x.collection)
               for x in self._set_links]
    except Exception:
        logger.error("Embarrassing, error returning instances.",
                     exc_info=True)

    self._set_links = []

def _run(self):
    # Skip if we're not running
    if self.state != RUNNING:
        return

    while True:
        if self.abort:
            logger.debug("Aborted, exiting run loop.")
            break

        stop = yield self._check_containers()
        if stop:
            break

        # Now we sleep for a bit
        yield gen.Task(self._loop.add_timeout,
                       time.time() + self.sleep_time)

    # We're done running, time to terminate
    self.run.state = TERMINATING
    self.run.completed_at = datetime.utcnow()
    self._db_session.commit()

async def _run(self):
    # Skip if we're not running
    if self.state != RUNNING:
        return

    # Main run loop
    while True:
        if self.abort:
            logger.debug("Aborted, exiting run loop.")
            break

        stop = await self._check_steps()
        if stop:
            break

        # Now we sleep for a bit
        await gen.Task(self._loop.add_timeout,
                       time.time() + self.sleep_time)

    # We're done running, time to terminate
    self.run.state = TERMINATING
    self.run.completed_at = datetime.utcnow()
    self._db_session.commit()

async def start(self, collection, docker):
    """Launches Watcher containers on all instances."""
    if not self.options:
        logger.debug("Watcher not configured")
        return

    bind = {'bind': '/var/run/docker.sock', 'ro': False}
    volumes = {'/var/run/docker.sock': bind}
    ports = {}
    env = {
        'AWS_ACCESS_KEY_ID': self.options['AWS_ACCESS_KEY_ID'] or "",
        'AWS_SECRET_ACCESS_KEY':
            self.options['AWS_SECRET_ACCESS_KEY'] or ""
    }

    logger.debug("Launching Watcher...")
    await docker.run_containers(collection, self.info.name,
                                "python ./watch.py", env=env,
                                volumes=volumes, ports=ports,
                                pid_mode="host")

async def _start_step(self, setlink):
    setlink.ec2_collection.started = True

    # Reload sysctl because coreos doesn't reload this right
    await self.helpers.ssh.reload_sysctl(setlink.ec2_collection)

    # Start Watcher
    await self.helpers.watcher.start(setlink.ec2_collection,
                                     self.helpers.docker)

    # Start heka
    await self.helpers.heka.start(setlink.ec2_collection,
                                  self.helpers.docker,
                                  self.helpers.ping,
                                  "db" + self.run.uuid.replace('-', ''),
                                  series=setlink.step.docker_series)

    # Startup local DNS if needed
    if setlink.ec2_collection.local_dns:
        logger.debug("Starting up DNS")
        await self.helpers.dns.start(setlink.ec2_collection,
                                     self._dns_map)

    # Startup the testers
    env = self.run_env.copy()
    env.update(setlink.step.environment_data)
    env['CONTAINER_ID'] = setlink.step.uuid
    logger.debug("Starting step: %s", setlink.ec2_collection.uuid)
    await self.helpers.docker.run_containers(
        setlink.ec2_collection,
        setlink.step.container_name,
        setlink.step.additional_command_args,
        env=env,
        ports=setlink.step.port_mapping or {},
        volumes=setlink.step.volume_mapping or {},
        delay=setlink.step.node_delay,
    )

async def _cleanup(self, exc=False):
    if exc:
        # Ensure we try and shut them down
        logger.debug("Exception occurred, ensure containers terminated.",
                     exc_info=True)
        try:
            await gen.multi([self._stop_step(s) for s in self._set_links])
        except Exception:
            logger.error("Le sigh, error shutting down instances.",
                         exc_info=True)

    # Ensure we always release the collections we used
    logger.debug("Returning collections")

    try:
        await gen.multi([
            self._pool.release_instances(x.ec2_collection)
            for x in self._set_links
        ])
    except Exception:
        logger.error("Embarrassing, error returning instances.",
                     exc_info=True)

    self._set_links = []

def load(instance):
    def debug(msg):
        logger.debug("[%s] %s" % (instance.instance.id, msg))

    docker = instance.state.docker
    has_container = docker.has_image(container_name)
    if has_container and "latest" not in container_name:
        return

    if container_url:
        debug("Importing %s" % container_url)
        with self.sshclient.connect(instance.instance) as client:
            output = docker.import_container(client, container_url)
        if output:
            logger.debug(output)
    else:
        debug("Pulling %r" % container_name)
        output = docker.pull_container(container_name)

    if not image_loaded(docker, container_name):
        debug("Docker does not have %s" % container_name)
        return False
    return output

async def _recover(self):
    """Recover allocated instances from EC2."""
    recovered_instances = defaultdict(list)

    # Recover every region at once
    instancelist = await gen.multi(
        [self._recover_region(x) for x in AWS_REGIONS])

    logger.debug("Found %s instances to look at for recovery.",
                 sum(map(len, instancelist)))

    allocated = 0
    not_used = 0

    for instances in instancelist:
        for instance in instances:
            # skipping terminated instances
            if instance.state == 'terminated':
                continue
            tags = instance.tags
            region = instance.region.name
            logger.debug('- %s (%s)' % (instance.id, region))

            # If this has been 'pending' too long, we put it in the main
            # instance pool for later reaping
            if not available_instance(instance):
                self._instances[region].append(instance)
                continue

            if tags.get("RunId") and tags.get("Uuid"):
                # Put allocated instances into a recovery pool separate
                # from unallocated
                inst_key = (tags["RunId"], tags["Uuid"])
                recovered_instances[inst_key].append(instance)
                allocated += 1
            else:
                self._instances[region].append(instance)
                not_used += 1

    logger.debug("%d instances were allocated to a run" % allocated)
    logger.debug("%d instances were not used" % not_used)

    self._recovered = recovered_instances

def __init__(self, name, io_loop, sqluri, ssh_key,
             heka_options, influx_options,
             aws_port=None, aws_owner_id="595879546273",
             aws_use_filters=True, aws_access_key=None,
             aws_secret_key=None, initial_db=None):
    self.name = name
    logger.debug("loads-broker (%s)", self.name)

    self.loop = io_loop
    self._base_env = BASE_ENV.copy()
    self.watcher_options = {
        'AWS_ACCESS_KEY_ID': aws_access_key,
        'AWS_SECRET_ACCESS_KEY': aws_secret_key
    }

    user_data = _DEFAULTS["user_data"]
    if user_data is not None and os.path.exists(user_data):
        with open(user_data) as f:
            user_data = f.read()

    self.influx_options = influx_options

    if influx_options is None:
        self.influx = None
    else:
        influx_args = {
            "host": influx_options.host,
            "port": influx_options.port,
            "username": influx_options.user,
            "password": influx_options.password,
            "database": "loads"
        }

        if influx_options.secure:
            influx_args["ssl"] = True
            influx_args["verify_ssl"] = True

        if InfluxDBClient is None:
            raise ImportError('You need to install the influx lib')

        self.influx = InfluxDBClient(**influx_args)

    self.pool = aws.EC2Pool(self.name, user_data=user_data,
                            io_loop=self.loop, port=aws_port,
                            owner_id=aws_owner_id,
                            use_filters=aws_use_filters,
                            access_key=aws_access_key,
                            secret_key=aws_secret_key)

    # Utilities used by RunManager
    ssh = SSH(ssh_keyfile=ssh_key)
    self.run_helpers = run_helpers = RunHelpers()
    run_helpers.ping = Ping(self.loop)
    run_helpers.docker = Docker(ssh)
    run_helpers.dns = DNSMasq(DNSMASQ_INFO, run_helpers.docker)
    run_helpers.heka = Heka(HEKA_INFO, ssh=ssh, options=heka_options,
                            influx=influx_options)
    run_helpers.watcher = Watcher(WATCHER_INFO,
                                  options=self.watcher_options)
    run_helpers.ssh = ssh

    self.db = Database(sqluri, echo=True)

    # Run managers keyed by uuid
    self._runs = {}

    # Ensure the db is setup
    if initial_db:
        setup_database(self.db.session(), initial_db)

def debug(self, msg):
    logger.debug('[uuid:%s] %s' % (self.uuid, msg))

def _initialized(self, future):
    # Run the result to ensure we raise an exception if any occurred
    logger.debug("Finished initializing: %s.", future.result())
    self.ready.set_result(True)

def log_threadid(msg):
    """Log a message, including the thread ID"""
    thread_id = threading.currentThread().ident
    logger.debug("Msg: %s, ThreadID: %s", msg, thread_id)

async def request_instances(self,
                            run_id: str,
                            uuid: str,
                            count=1,
                            inst_type="t1.micro",
                            region="us-west-2",
                            allocate_missing=True,
                            plan: Optional[str] = None,
                            owner: Optional[str] = None,
                            run_max_time: Optional[int] = None):
    """Allocate a collection of instances.

    :param run_id: Run ID for these instances
    :param uuid: UUID to use for this collection
    :param count: How many instances to allocate
    :param type: EC2 Instance type the instances should be
    :param region: EC2 region to allocate the instances in
    :param allocate_missing:
        If there's insufficient existing instances for this uuid,
        whether existing or new instances should be allocated to the
        collection.
    :param plan: Name of the instances' plan
    :param owner: Owner name of the instances
    :param run_max_time: Maximum expected run-time of instances in
                         seconds
    :returns: Collection of allocated instances
    :rtype: :class:`EC2Collection`

    """
    if region not in AWS_REGIONS:
        raise LoadsException("Unknown region: %s" % region)

    # First attempt to recover instances for this run/uuid
    instances = self._locate_recovered_instances(run_id, uuid)
    remaining_count = count - len(instances)

    conn = await self._region_conn(region)

    # If existing/new are not being allocated, the recovered are
    # already tagged, so we're done.
    if not allocate_missing:
        return EC2Collection(run_id, uuid, conn, instances, self._loop)

    # Add any more remaining that should be used
    instances.extend(
        self._locate_existing_instances(remaining_count, inst_type,
                                        region)
    )

    # Determine if we should allocate more instances
    num = count - len(instances)
    if num > 0:
        new_instances = await self._allocate_instances(
            conn, num, inst_type, region)
        logger.debug("Allocated instances%s: %s",
                     " (Owner: %s)" % owner if owner else "",
                     new_instances)
        instances.extend(new_instances)

    # Tag all the instances
    if self.use_filters:
        tags = {
            "Name": "loads-{}{}".format(self.broker_id,
                                        "-" + plan if plan else ""),
            "Project": "loads",
            "RunId": run_id,
            "Uuid": uuid,
        }
        if owner:
            tags["Owner"] = owner
        if run_max_time is not None:
            self._tag_for_reaping(tags, run_max_time)

        # Sometimes, we can get instance data back before the AWS
        # API fully recognizes it, so we wait as needed.
        async def tag_instance(instance):
            retries = 0
            while True:
                try:
                    await self._run_in_executor(
                        conn.create_tags, [instance.id], tags)
                    break
                except:
                    if retries > 5:
                        raise
                    retries += 1
                    await gen.Task(self._loop.add_timeout,
                                   time.time() + 1)
        await gen.multi([tag_instance(x) for x in instances])
    return EC2Collection(run_id, uuid, conn, instances, self._loop)

def _set_state(self, state):
    self._state_description = state
    if state:
        logger.debug(state)

async def start(self, collection, docker, ping, database_name,
                series=None):
    """Launches Heka containers on all instances."""
    if not self.options:
        logger.debug("Heka not configured")
        return

    volumes = {
        '/home/core/heka': {
            'bind': '/heka',
            'ro': False
        },
        # '/proc': {'bind': '/proc', 'ro': False}
    }
    ports = {(8125, "udp"): 8125, 4352: 4352}

    series_name = ""
    if series:
        series_name = "%s." % series

    # Upload heka config to all the instances
    def upload_files(inst):
        hostname = "%s%s" % (series_name,
                             inst.instance.ip_address.replace('.', '_'))
        if self.influx:
            config_file = HEKA_CONFIG_TEMPLATE.substitute(
                remote_addr=join_host_port(self.options.host,
                                           self.options.port),
                remote_secure=self.options.secure and "true" or "false",
                influx_addr=join_host_port(self.influx.host,
                                           self.influx.port),
                influx_db=database_name,
                hostname=hostname)
        else:
            config_file = HEKA_NOINFLUX_TEMPLATE.substitute(
                remote_addr=join_host_port(self.options.host,
                                           self.options.port),
                remote_secure=self.options.secure and "true" or "false",
                hostname=hostname)
        with StringIO(config_file) as fl:
            self.sshclient.upload_file(inst.instance, fl,
                                       "/home/core/heka/config.toml")
    await collection.map(upload_files)

    logger.debug("Launching Heka...")
    await docker.run_containers(collection, self.info.name,
                                "hekad -config=/heka/config.toml",
                                volumes=volumes, ports=ports,
                                pid_mode="host")

    await gen.multi([
        ping.ping("http://%s:4352/" % inst.instance.ip_address)
        for inst in collection.instances
    ])