def error_rerun_handler(self, exc, errors, email, cloud_id):
    """Decide the retry countdown after repeated cloud polling failures.

    Returns a countdown in seconds while retries remain; once the cloud
    has been unresponsive for roughly half an hour, disables it and
    notifies the user.
    """
    from mist.io.methods import notify_user

    failures = len(errors)
    # Early failures: simply retry once the cached result goes stale.
    if failures < 6:
        return self.result_fresh
    user = user_from_email(email)
    if failures == 6:
        # Unresponsive for about a minute: warn in-app (no email yet).
        notify_user(user, 'Cloud %s does not respond'
                    % user.clouds[cloud_id].title,
                    email_notify=False, cloud_id=cloud_id)
    # Back-off schedule covering roughly 30 more minutes of retries.
    backoff = (60, 60, 120, 300, 600, 600)
    attempt = failures - 6
    if attempt < len(backoff):
        return backoff[attempt]
    # Schedule exhausted: disable the cloud and escalate to the user.
    with user.lock_n_load():
        user.clouds[cloud_id].enabled = False
        user.save()
    notify_user(user, "Cloud %s disabled after not responding for 30mins"
                % user.clouds[cloud_id].title,
                email_notify=True, cloud_id=cloud_id)
    log_event(user.email, 'incident', action='disable_cloud',
              cloud_id=cloud_id, error="Cloud unresponsive")
def error_rerun_handler(self, exc, errors, email, backend_id):
    """Compute the next retry delay for a failing backend poll.

    Yields increasing delays for about 30 minutes; if the backend is
    still unresponsive afterwards it is disabled and the user notified.
    """
    from mist.io.methods import notify_user

    if len(errors) < 6:
        # Less than a minute of failures so far: retry at staleness.
        return self.result_fresh
    user = user_from_email(email)
    if len(errors) == 6:
        # Roughly one minute unresponsive: notify in-app only.
        notify_user(user, 'Backend %s does not respond'
                    % user.backends[backend_id].title,
                    email_notify=False, backend_id=backend_id)
    retry_schedule = [60, 60, 120, 300, 600, 600]  # ~30 min in total
    step = len(errors) - 6
    if step >= len(retry_schedule):
        # Out of retries: disable the backend, email the user, log it.
        with user.lock_n_load():
            user.backends[backend_id].enabled = False
            user.save()
        notify_user(user,
                    "Backend %s disabled after not responding for 30mins"
                    % user.backends[backend_id].title,
                    email_notify=True, backend_id=backend_id)
        log_event(user.email, 'incident', action='disable_backend',
                  backend_id=backend_id, error="Backend unresponsive")
        return
    return retry_schedule[step]
def create_machine_async(email, backend_id, key_id, machine_name, location_id,
                         image_id, size_id, script, image_extra, disk,
                         image_name, size_name, location_name, ips, monitoring,
                         networks, docker_env, docker_command,
                         script_id=None, script_params=None, quantity=1,
                         persist=False, job_id=None,
                         docker_port_bindings=None, docker_exposed_ports=None,
                         hostname='', plugins=None):
    """Create ``quantity`` machines in parallel on the given backend.

    Spawns a small thread pool and calls ``create_machine`` once per
    machine, naming them ``<machine_name>-1 .. -N``.  Every creation's
    outcome (including failures) is reported through ``log_event``.

    :param email: owner's email, resolved to a User object
    :param backend_id: backend to create the machines on
    :param quantity: number of machines to create
    :param job_id: correlation id for log events; generated when missing
    Remaining parameters are forwarded to ``create_machine``.
    """
    from multiprocessing.dummy import Pool as ThreadPool
    from mist.io.methods import create_machine
    from mist.io.exceptions import MachineCreationError
    # Fix: the original used mutable default arguments ({}), which are
    # shared across calls; use None sentinels and normalize them here.
    # (Both parameters are currently accepted but not forwarded.)
    if docker_port_bindings is None:
        docker_port_bindings = {}
    if docker_exposed_ports is None:
        docker_exposed_ports = {}
    log.warn('MULTICREATE ASYNC %d' % quantity)
    # Event logging is only available in multi-user deployments.
    if multi_user:
        from mist.core.helpers import log_event
    else:
        log_event = lambda *args, **kwargs: None
    job_id = job_id or uuid.uuid4().hex
    log_event(email, 'job', 'async_machine_creation_started', job_id=job_id,
              backend_id=backend_id, script=script, script_id=script_id,
              script_params=script_params, monitoring=monitoring,
              persist=persist, quantity=quantity)
    THREAD_COUNT = 5
    pool = ThreadPool(THREAD_COUNT)
    # Machines are named <machine_name>-1 ... <machine_name>-N.
    names = ['%s-%d' % (machine_name, i) for i in range(1, quantity + 1)]
    user = user_from_email(email)
    # One (args, kwargs) pair per machine; 22 is the default SSH port.
    specs = []
    for name in names:
        specs.append((
            (user, backend_id, key_id, name, location_id, image_id, size_id,
             script, image_extra, disk, image_name, size_name, location_name,
             ips, monitoring, networks, docker_env, docker_command, 22,
             script_id, script_params, job_id),
            {'hostname': hostname, 'plugins': plugins}
        ))

    def create_machine_wrapper(args_kwargs):
        # Run one creation and always log the outcome, success or failure.
        args, kwargs = args_kwargs
        error = False
        try:
            node = create_machine(*args, **kwargs)
        except MachineCreationError as exc:
            error = str(exc)
        except Exception as exc:
            error = repr(exc)
        finally:
            name = args[3]
            log_event(email, 'job', 'machine_creation_finished', job_id=job_id,
                      backend_id=backend_id, machine_name=name, error=error)

    pool.map(create_machine_wrapper, specs)
    pool.close()
    pool.join()
def create_machine_wrapper(args_kwargs):
    """Run a single ``create_machine`` call and always log its outcome.

    Expects a (positional_args, keyword_args) pair.  The names
    ``create_machine``, ``MachineCreationError``, ``log_event``,
    ``email``, ``job_id`` and ``backend_id`` come from the enclosing
    scope.
    """
    positional, keyword = args_kwargs
    error = False
    try:
        node = create_machine(*positional, **keyword)
    except MachineCreationError as exc:
        error = str(exc)
    except Exception as exc:
        error = repr(exc)
    finally:
        # positional[3] is the machine name by calling convention.
        name = positional[3]
        log_event(email, 'job', 'machine_creation_finished', job_id=job_id,
                  backend_id=backend_id, machine_name=name, error=error)
def post_deploy_steps(self, email, backend_id, machine_id, monitoring, command,
                      key_id=None, username=None, password=None, port=22,
                      script_id='', script_params='', job_id=None, hostname='',
                      plugins=None, post_script_id='', post_script_params=''):
    """Celery task: run post-deployment steps on a newly created machine.

    Connects over SSH, probes the machine, optionally creates a DNS A
    record, runs either a stored script (multi-user) or a raw command,
    enables monitoring when requested and finally runs an optional
    post-script.  Retries itself via ``self.retry`` while the node or
    its public IP is not yet available.

    :param email: owner's email; resolved to a User object
    :param backend_id: backend the machine was created on
    :param machine_id: provider id of the machine
    :param monitoring: if truthy, enable monitoring after deployment
    :param command: raw shell command used when no ``script_id`` applies
    :param key_id, username, password, port: SSH connection parameters
    :param job_id: correlation id for log events; generated when missing
    :param hostname: if given, create a DNS A record pointing at the host
    :param plugins: monitoring plugins passed to ``enable_monitoring``
    """
    from mist.io.methods import connect_provider, probe_ssh_only
    from mist.io.methods import notify_user, notify_admin
    from mist.io.methods import create_dns_a_record
    # Multi-user (mist.core) deployments get richer helpers; otherwise
    # event logging becomes a no-op.
    if multi_user:
        from mist.core.methods import enable_monitoring
        from mist.core.tasks import run_script
        from mist.core.helpers import log_event
    else:
        from mist.io.methods import enable_monitoring
        log_event = lambda *args, **kwargs: None
    job_id = job_id or uuid.uuid4().hex
    user = user_from_email(email)
    tmp_log = lambda msg, *args: log.error('Post deploy: %s' % msg, *args)
    tmp_log('Entering post deploy steps for %s %s %s',
            user.email, backend_id, machine_id)
    try:
        # find the node we're looking for and get its hostname
        node = None
        try:
            conn = connect_provider(user.backends[backend_id])
            nodes = conn.list_nodes()  # TODO: use cache
            for n in nodes:
                if n.id == machine_id:
                    node = n
                    break
            tmp_log('run list_machines')
        except:
            # Listing nodes failed (provider error); retry shortly.
            raise self.retry(exc=Exception(), countdown=10, max_retries=10)
        if node and len(node.public_ips):
            # filter out IPv6 addresses
            ips = filter(lambda ip: ':' not in ip, node.public_ips)
            host = ips[0]
        else:
            # No public IPv4 yet; machine may still be booting.
            tmp_log('ip not found, retrying')
            raise self.retry(exc=Exception(), countdown=60, max_retries=20)
        try:
            from mist.io.shell import Shell
            shell = Shell(host)
            # connect with ssh even if no command, to create association
            # to be able to enable monitoring
            tmp_log('attempting to connect to shell')
            key_id, ssh_user = shell.autoconfigure(
                user, backend_id, node.id, key_id, username, password, port
            )
            tmp_log('connected to shell')
            result = probe_ssh_only(user, backend_id, machine_id, host=None,
                                    key_id=key_id, ssh_user=ssh_user,
                                    shell=shell)
            # Fields shared by every log event emitted below.
            log_dict = {
                'email': email,
                'event_type': 'job',
                'backend_id': backend_id,
                'machine_id': machine_id,
                'job_id': job_id,
                'host': host,
                'key_id': key_id,
                'ssh_user': ssh_user,
            }
            log_event(action='probe', result=result, **log_dict)
            backend = user.backends[backend_id]
            # NOTE(review): msg is built but never used in this function.
            msg = "Backend:\n Name: %s\n Id: %s\n" % (backend.title,
                                                      backend_id)
            msg += "Machine:\n Name: %s\n Id: %s\n" % (node.name, node.id)
            if hostname:
                # Best-effort DNS record creation; failure is only logged.
                try:
                    record = create_dns_a_record(user, hostname, host)
                    hostname = '.'.join((record.name, record.zone.domain))
                    log_event(action='create_dns_a_record',
                              hostname=hostname, **log_dict)
                except Exception as exc:
                    log_event(action='create_dns_a_record', error=str(exc),
                              **log_dict)
            error = False
            if script_id and multi_user:
                # A stored script takes precedence over a raw command.
                tmp_log('will run script_id %s', script_id)
                ret = run_script.run(
                    user.email, script_id, backend_id, machine_id,
                    params=script_params, host=host, job_id=job_id
                )
                error = ret['error']
                tmp_log('executed script_id %s', script_id)
            elif command:
                tmp_log('will run command %s', command)
                log_event(action='deployment_script_started', command=command,
                          **log_dict)
                start_time = time()
                retval, output = shell.command(command)
                tmp_log('executed command %s', command)
                execution_time = time() - start_time
                output = output.decode('utf-8','ignore')
                title = "Deployment script %s" % ('failed' if retval
                                                  else 'succeeded')
                # Non-zero exit status marks the deployment as failed.
                error = retval > 0
                notify_user(user, title,
                            backend_id=backend_id,
                            machine_id=machine_id,
                            machine_name=node.name,
                            command=command,
                            output=output,
                            duration=execution_time,
                            retval=retval,
                            error=retval > 0)
                log_event(action='deployment_script_finished',
                          error=retval > 0,
                          return_value=retval,
                          command=command,
                          stdout=output,
                          **log_dict)
            shell.disconnect()
            if monitoring:
                # Best-effort; a failure is reported but does not abort.
                try:
                    enable_monitoring(
                        user, backend_id, node.id,
                        name=node.name,
                        dns_name=node.extra.get('dns_name',''),
                        public_ips=ips,
                        no_ssh=False,
                        dry=False,
                        job_id=job_id,
                        plugins=plugins,
                        deploy_async=False,
                    )
                except Exception as e:
                    print repr(e)
                    error = True
                    notify_user(
                        user,
                        "Enable monitoring failed for machine %s"
                        % machine_id,
                        repr(e))
                    notify_admin(
                        'Enable monitoring on creation failed for user %s machine %s: %r'
                        % (email, machine_id, e))
                    log_event(action='enable_monitoring_failed',
                              error=repr(e), **log_dict)
            if post_script_id and multi_user:
                tmp_log('will run post_script_id %s', post_script_id)
                ret = run_script.run(
                    user.email, post_script_id, backend_id, machine_id,
                    params=post_script_params, host=host, job_id=job_id,
                    action_prefix='post_',
                )
                error = ret['error']
                # NOTE(review): logs script_id here; post_script_id was
                # probably intended — confirm before changing.
                tmp_log('executed post_script_id %s', script_id)
            log_event(action='post_deploy_finished', error=error, **log_dict)
        except (ServiceUnavailableError, SSHException) as exc:
            # Transient SSH/service problem: let celery retry the task.
            tmp_log(repr(exc))
            raise self.retry(exc=exc, countdown=60, max_retries=15)
    except Exception as exc:
        tmp_log(repr(exc))
        if str(exc).startswith('Retry'):
            # Retry exceptions raised above must propagate to celery.
            raise
        notify_user(user, "Deployment script failed for machine %s"
                    % machine_id)
        notify_admin(
            "Deployment script failed for machine %s in backend %s by user %s"
            % (machine_id, backend_id, email), repr(exc))
        log_event(
            email=email,
            event_type='job',
            action='post_deploy_finished',
            backend_id=backend_id,
            machine_id=machine_id,
            enable_monitoring=bool(monitoring),
            command=command,
            error="Couldn't connect to run post deploy steps.",
            job_id=job_id
        )
def post_deploy_steps(self, email, backend_id, machine_id, monitoring, command,
                      key_id=None, username=None, password=None, port=22):
    """Celery task: run an optional deployment command over SSH and enable
    monitoring on a newly created machine.

    Earlier variant of the post-deploy task: waits (via ``self.retry``)
    until the node has a public IPv4, runs ``command`` through an SSH
    shell when given, then enables monitoring when requested.
    """
    # NOTE(review): ssh_command and this enable_monitoring import appear
    # unused; the conditional import below decides which enable_monitoring
    # is actually bound.
    from mist.io.methods import ssh_command, connect_provider, enable_monitoring
    from mist.io.methods import notify_user, notify_admin
    if multi_user:
        from mist.core.methods import enable_monitoring
        from mist.core.helpers import log_event
    else:
        from mist.io.methods import enable_monitoring
        log_event = lambda *args, **kwargs: None
    user = user_from_email(email)
    try:
        # find the node we're looking for and get its hostname
        conn = connect_provider(user.backends[backend_id])
        nodes = conn.list_nodes()
        node = None
        for n in nodes:
            if n.id == machine_id:
                node = n
                break
        if node and len(node.public_ips):
            # filter out IPv6 addresses
            ips = filter(lambda ip: ':' not in ip, node.public_ips)
            host = ips[0]
        else:
            # No public IPv4 yet; retry in two minutes.
            raise self.retry(exc=Exception(), countdown=120, max_retries=5)
        try:
            from mist.io.shell import Shell
            shell = Shell(host)
            # connect with ssh even if no command, to create association
            # to be able to enable monitoring
            key_id, ssh_user = shell.autoconfigure(user, backend_id, node.id,
                                                   key_id, username, password,
                                                   port)
            backend = user.backends[backend_id]
            # NOTE(review): msg is built but never used in this function.
            msg = "Backend:\n Name: %s\n Id: %s\n" % (backend.title,
                                                      backend_id)
            msg += "Machine:\n Name: %s\n Id: %s\n" % (node.name, node.id)
            if command:
                # Fields shared by the start/finish log events.
                log_dict = {
                    'email': email,
                    'event_type': 'job',
                    'backend_id': backend_id,
                    'machine_id': machine_id,
                    'job_id': uuid.uuid4().hex,
                    'command': command,
                    'host': host,
                    'key_id': key_id,
                    'ssh_user': ssh_user,
                }
                log_event(action='deployment_script_started', **log_dict)
                start_time = time()
                retval, output = shell.command(command)
                execution_time = time() - start_time
                output = output.decode('utf-8', 'ignore')
                title = "Deployment script %s" % ('failed' if retval
                                                  else 'succeeded')
                notify_user(user, title,
                            backend_id=backend_id,
                            machine_id=machine_id,
                            machine_name=node.name,
                            command=command,
                            output=output,
                            duration=execution_time,
                            retval=retval,
                            error=retval > 0)
                log_event(action='deployment_script_finished',
                          error=retval > 0,
                          return_value=retval,
                          stdout=output,
                          **log_dict)
            shell.disconnect()
            if monitoring:
                # Best-effort; failures are reported but do not abort.
                try:
                    enable_monitoring(
                        user, backend_id, node.id,
                        name=node.name,
                        dns_name=node.extra.get('dns_name', ''),
                        public_ips=ips,
                        no_ssh=False,
                        dry=False,
                    )
                except Exception as e:
                    print repr(e)
                    notify_user(
                        user,
                        "Enable monitoring failed for machine %s (%s)"
                        % (node.name, node.id),
                        repr(e))
                    notify_admin(
                        'Enable monitoring on creation failed for user %s machine %s: %r'
                        % (email, node.name, e))
        except (ServiceUnavailableError, SSHException) as exc:
            # Transient SSH/service problem: let celery retry the task.
            raise self.retry(exc=exc, countdown=60, max_retries=5)
    except Exception as exc:
        if str(exc).startswith('Retry'):
            # Retry exceptions must propagate to celery.
            raise
        notify_user(
            user,
            "Deployment script failed for machine %s after 5 retries"
            % node.id)
        notify_admin(
            "Deployment script failed for machine %s in backend %s by user %s after 5 retries"
            % (node.id, backend_id, email), repr(exc))
        log_event(
            email=email,
            event_type='job',
            action='deployment_script_failed',
            backend_id=backend_id,
            machine_id=machine_id,
            enable_monitoring=bool(monitoring),
            command=command,
            error="Couldn't connect to run post deploy steps (5 attempts).",
        )
def create_machine_async(email, cloud_id, key_id, machine_name, location_id,
                         image_id, size_id, script, image_extra, disk,
                         image_name, size_name, location_name, ips, monitoring,
                         networks, docker_env, docker_command,
                         script_id='', script_params='', post_script_id='',
                         post_script_params='', quantity=1, persist=False,
                         job_id=None, docker_port_bindings=None,
                         docker_exposed_ports=None, azure_port_bindings='',
                         hostname='', plugins=None, disk_size=None,
                         disk_path=None, cloud_init='',
                         associate_floating_ip=False,
                         associate_floating_ip_subnet=None, project_id=None):
    """Create ``quantity`` machines in parallel on the given cloud.

    Spawns a small thread pool and calls ``create_machine`` once per
    machine, naming them ``<machine_name>-1 .. -N``.  Every creation's
    outcome (including failures) is reported through ``log_event``.

    :param email: owner's email, resolved to a User object
    :param cloud_id: cloud to create the machines on
    :param quantity: number of machines to create
    :param job_id: correlation id for log events; generated when missing
    Remaining parameters are forwarded to ``create_machine``.
    """
    from multiprocessing.dummy import Pool as ThreadPool
    from mist.io.methods import create_machine
    from mist.io.exceptions import MachineCreationError
    # Fix: the original used mutable default arguments ({}), which are
    # shared across calls; use None sentinels and normalize them here.
    # (Both parameters are currently accepted but not forwarded.)
    if docker_port_bindings is None:
        docker_port_bindings = {}
    if docker_exposed_ports is None:
        docker_exposed_ports = {}
    log.warn('MULTICREATE ASYNC %d' % quantity)
    # Event logging is only available in multi-user deployments.
    if multi_user:
        from mist.core.helpers import log_event
    else:
        log_event = lambda *args, **kwargs: None
    job_id = job_id or uuid.uuid4().hex
    log_event(email, 'job', 'async_machine_creation_started', job_id=job_id,
              cloud_id=cloud_id, script=script, script_id=script_id,
              script_params=script_params, monitoring=monitoring,
              persist=persist, quantity=quantity)
    THREAD_COUNT = 5
    pool = ThreadPool(THREAD_COUNT)
    # Machines are named <machine_name>-1 ... <machine_name>-N.
    names = ['%s-%d' % (machine_name, i) for i in range(1, quantity + 1)]
    user = user_from_email(email)
    # One (args, kwargs) pair per machine; 22 is the default SSH port.
    specs = []
    for name in names:
        specs.append(
            ((user, cloud_id, key_id, name, location_id, image_id, size_id,
              script, image_extra, disk, image_name, size_name, location_name,
              ips, monitoring, networks, docker_env, docker_command, 22,
              script_id, script_params, job_id),
             {'hostname': hostname,
              'plugins': plugins,
              'post_script_id': post_script_id,
              'post_script_params': post_script_params,
              'azure_port_bindings': azure_port_bindings,
              'associate_floating_ip': associate_floating_ip,
              'cloud_init': cloud_init,
              'disk_size': disk_size,
              'disk_path': disk_path,
              'project_id': project_id}))

    def create_machine_wrapper(args_kwargs):
        # Run one creation and always log the outcome, success or failure.
        args, kwargs = args_kwargs
        error = False
        try:
            node = create_machine(*args, **kwargs)
        except MachineCreationError as exc:
            error = str(exc)
        except Exception as exc:
            error = repr(exc)
        finally:
            name = args[3]
            log_event(email, 'job', 'machine_creation_finished', job_id=job_id,
                      cloud_id=cloud_id, machine_name=name, error=error)

    pool.map(create_machine_wrapper, specs)
    pool.close()
    pool.join()
def post_deploy_steps(self, email, cloud_id, machine_id, monitoring, command,
                      key_id=None, username=None, password=None, port=22,
                      script_id='', script_params='', job_id=None, hostname='',
                      plugins=None, post_script_id='', post_script_params=''):
    """Celery task: run post-deployment steps on a newly created machine.

    Cloud-terminology variant: connects over SSH, probes the machine,
    optionally creates a DNS A record, runs a stored script (multi-user)
    or a raw command, enables monitoring and runs an optional
    post-script.  Retries itself via ``self.retry`` while the node or
    its public IP is not yet available.

    :param email: owner's email; resolved to a User object
    :param cloud_id: cloud the machine was created on
    :param machine_id: provider id of the machine
    :param monitoring: if truthy, enable monitoring after deployment
    :param command: raw shell command used when no ``script_id`` applies
    :param key_id, username, password, port: SSH connection parameters
    :param job_id: correlation id for log events; generated when missing
    :param hostname: if given, create a DNS A record pointing at the host
    :param plugins: monitoring plugins passed to ``enable_monitoring``
    """
    from mist.io.methods import connect_provider, probe_ssh_only
    from mist.io.methods import notify_user, notify_admin
    from mist.io.methods import create_dns_a_record
    # Multi-user (mist.core) deployments get richer helpers; otherwise
    # event logging becomes a no-op.
    if multi_user:
        from mist.core.methods import enable_monitoring
        from mist.core.tasks import run_script
        from mist.core.helpers import log_event
    else:
        from mist.io.methods import enable_monitoring
        log_event = lambda *args, **kwargs: None
    job_id = job_id or uuid.uuid4().hex
    user = user_from_email(email)
    tmp_log = lambda msg, *args: log.error('Post deploy: %s' % msg, *args)
    tmp_log('Entering post deploy steps for %s %s %s',
            user.email, cloud_id, machine_id)
    try:
        # find the node we're looking for and get its hostname
        node = None
        try:
            conn = connect_provider(user.clouds[cloud_id])
            nodes = conn.list_nodes()  # TODO: use cache
            for n in nodes:
                if n.id == machine_id:
                    node = n
                    break
            tmp_log('run list_machines')
        except:
            # Listing nodes failed (provider error); retry shortly.
            raise self.retry(exc=Exception(), countdown=10, max_retries=10)
        if node and len(node.public_ips):
            # filter out IPv6 addresses
            ips = filter(lambda ip: ':' not in ip, node.public_ips)
            host = ips[0]
        else:
            # No public IPv4 yet; machine may still be booting.
            tmp_log('ip not found, retrying')
            raise self.retry(exc=Exception(), countdown=60, max_retries=20)
        try:
            from mist.io.shell import Shell
            shell = Shell(host)
            # connect with ssh even if no command, to create association
            # to be able to enable monitoring
            tmp_log('attempting to connect to shell')
            key_id, ssh_user = shell.autoconfigure(user, cloud_id, node.id,
                                                   key_id, username, password,
                                                   port)
            tmp_log('connected to shell')
            result = probe_ssh_only(user, cloud_id, machine_id, host=None,
                                    key_id=key_id, ssh_user=ssh_user,
                                    shell=shell)
            # Fields shared by every log event emitted below.
            log_dict = {
                'email': email,
                'event_type': 'job',
                'cloud_id': cloud_id,
                'machine_id': machine_id,
                'job_id': job_id,
                'host': host,
                'key_id': key_id,
                'ssh_user': ssh_user,
            }
            log_event(action='probe', result=result, **log_dict)
            cloud = user.clouds[cloud_id]
            # NOTE(review): msg is built but never used in this function.
            msg = "Cloud:\n Name: %s\n Id: %s\n" % (cloud.title, cloud_id)
            msg += "Machine:\n Name: %s\n Id: %s\n" % (node.name, node.id)
            if hostname:
                # Best-effort DNS record creation; failure is only logged.
                try:
                    record = create_dns_a_record(user, hostname, host)
                    hostname = '.'.join((record.name, record.zone.domain))
                    log_event(action='create_dns_a_record',
                              hostname=hostname, **log_dict)
                except Exception as exc:
                    log_event(action='create_dns_a_record', error=str(exc),
                              **log_dict)
            error = False
            if script_id and multi_user:
                # A stored script takes precedence over a raw command.
                tmp_log('will run script_id %s', script_id)
                ret = run_script.run(user.email, script_id, cloud_id,
                                     machine_id, params=script_params,
                                     host=host, job_id=job_id)
                error = ret['error']
                tmp_log('executed script_id %s', script_id)
            elif command:
                tmp_log('will run command %s', command)
                log_event(action='deployment_script_started', command=command,
                          **log_dict)
                start_time = time()
                retval, output = shell.command(command)
                tmp_log('executed command %s', command)
                execution_time = time() - start_time
                output = output.decode('utf-8', 'ignore')
                title = "Deployment script %s" % ('failed' if retval
                                                  else 'succeeded')
                # Non-zero exit status marks the deployment as failed.
                error = retval > 0
                notify_user(user, title,
                            cloud_id=cloud_id,
                            machine_id=machine_id,
                            machine_name=node.name,
                            command=command,
                            output=output,
                            duration=execution_time,
                            retval=retval,
                            error=retval > 0)
                log_event(action='deployment_script_finished',
                          error=retval > 0,
                          return_value=retval,
                          command=command,
                          stdout=output,
                          **log_dict)
            shell.disconnect()
            if monitoring:
                # Best-effort; a failure is reported but does not abort.
                try:
                    enable_monitoring(
                        user, cloud_id, node.id,
                        name=node.name,
                        dns_name=node.extra.get('dns_name', ''),
                        public_ips=ips,
                        no_ssh=False,
                        dry=False,
                        job_id=job_id,
                        plugins=plugins,
                        deploy_async=False,
                    )
                except Exception as e:
                    print repr(e)
                    error = True
                    notify_user(
                        user,
                        "Enable monitoring failed for machine %s"
                        % machine_id,
                        repr(e))
                    notify_admin(
                        'Enable monitoring on creation failed for user %s machine %s: %r'
                        % (email, machine_id, e))
                    log_event(action='enable_monitoring_failed',
                              error=repr(e), **log_dict)
            if post_script_id and multi_user:
                tmp_log('will run post_script_id %s', post_script_id)
                ret = run_script.run(
                    user.email, post_script_id, cloud_id, machine_id,
                    params=post_script_params, host=host, job_id=job_id,
                    action_prefix='post_',
                )
                error = ret['error']
                # NOTE(review): logs script_id here; post_script_id was
                # probably intended — confirm before changing.
                tmp_log('executed post_script_id %s', script_id)
            log_event(action='post_deploy_finished', error=error, **log_dict)
        except (ServiceUnavailableError, SSHException) as exc:
            # Transient SSH/service problem: let celery retry the task.
            tmp_log(repr(exc))
            raise self.retry(exc=exc, countdown=60, max_retries=15)
    except Exception as exc:
        tmp_log(repr(exc))
        if str(exc).startswith('Retry'):
            # Retry exceptions raised above must propagate to celery.
            raise
        notify_user(user, "Deployment script failed for machine %s"
                    % machine_id)
        notify_admin(
            "Deployment script failed for machine %s in cloud %s by user %s"
            % (machine_id, cloud_id, email), repr(exc))
        log_event(email=email,
                  event_type='job',
                  action='post_deploy_finished',
                  cloud_id=cloud_id,
                  machine_id=machine_id,
                  enable_monitoring=bool(monitoring),
                  command=command,
                  error="Couldn't connect to run post deploy steps.",
                  job_id=job_id)
def create_machine_async(email, cloud_id, key_id, machine_name, location_id,
                         image_id, size_id, script, image_extra, disk,
                         image_name, size_name, location_name, ips, monitoring,
                         networks, docker_env, docker_command,
                         script_id='', script_params='', post_script_id='',
                         post_script_params='', quantity=1, persist=False,
                         job_id=None, docker_port_bindings=None,
                         docker_exposed_ports=None, azure_port_bindings='',
                         hostname='', plugins=None, disk_size=None,
                         disk_path=None, cloud_init='',
                         associate_floating_ip=False,
                         associate_floating_ip_subnet=None, project_id=None,
                         bare_metal=False, hourly=True, cronjob=None):
    """Create ``quantity`` machines in parallel on the given cloud.

    Spawns a small thread pool and calls ``create_machine`` once per
    machine.  A single machine keeps the requested name; multiple
    machines are named ``<machine_name>-1 .. -N``.  Every creation's
    outcome (including failures and the new machine id when available)
    is reported through ``log_event``.

    :param email: owner's email, resolved to a User object
    :param cloud_id: cloud to create the machines on
    :param quantity: number of machines to create
    :param job_id: correlation id for log events; generated when missing
    :param cronjob: scheduling info forwarded to ``create_machine``
    Remaining parameters are forwarded to ``create_machine``.
    """
    from multiprocessing.dummy import Pool as ThreadPool
    from mist.io.methods import create_machine
    from mist.io.exceptions import MachineCreationError
    # Fix: the original used mutable default arguments ({}), which are
    # shared across calls; use None sentinels and normalize them here.
    # (docker_port_bindings/docker_exposed_ports are not forwarded.)
    if docker_port_bindings is None:
        docker_port_bindings = {}
    if docker_exposed_ports is None:
        docker_exposed_ports = {}
    if cronjob is None:
        cronjob = {}
    log.warn('MULTICREATE ASYNC %d' % quantity)
    # Event logging is only available in multi-user deployments.
    if multi_user:
        from mist.core.helpers import log_event
    else:
        log_event = lambda *args, **kwargs: None
    job_id = job_id or uuid.uuid4().hex
    # A single machine keeps the exact requested name; multiples get -N.
    if quantity == 1:
        names = [machine_name]
    else:
        names = ['%s-%d' % (machine_name, i) for i in range(1, quantity + 1)]
    log_event(email, 'job', 'async_machine_creation_started', job_id=job_id,
              cloud_id=cloud_id, script=script, script_id=script_id,
              script_params=script_params, monitoring=monitoring,
              persist=persist, quantity=quantity, key_id=key_id,
              machine_names=names)
    THREAD_COUNT = 5
    pool = ThreadPool(THREAD_COUNT)
    user = user_from_email(email)
    # One (args, kwargs) pair per machine; 22 is the default SSH port.
    specs = []
    for name in names:
        specs.append((
            (user, cloud_id, key_id, name, location_id, image_id, size_id,
             script, image_extra, disk, image_name, size_name, location_name,
             ips, monitoring, networks, docker_env, docker_command, 22,
             script_id, script_params, job_id),
            {'hostname': hostname,
             'plugins': plugins,
             'post_script_id': post_script_id,
             'post_script_params': post_script_params,
             'azure_port_bindings': azure_port_bindings,
             'associate_floating_ip': associate_floating_ip,
             'cloud_init': cloud_init,
             'disk_size': disk_size,
             'disk_path': disk_path,
             'project_id': project_id,
             'cronjob': cronjob}
        ))

    def create_machine_wrapper(args_kwargs):
        # Run one creation; always emit a finished event carrying any
        # error and the new machine's id when available.
        args, kwargs = args_kwargs
        error = False
        node = {}
        try:
            node = create_machine(*args, **kwargs)
        except MachineCreationError as exc:
            error = str(exc)
        except Exception as exc:
            error = repr(exc)
        finally:
            name = args[3]
            # NOTE(review): assumes create_machine returns a dict-like
            # object exposing .get — confirm against mist.io.methods.
            log_event(email, 'job', 'machine_creation_finished', job_id=job_id,
                      cloud_id=cloud_id, machine_name=name, error=error,
                      machine_id=node.get('id', ''))

    pool.map(create_machine_wrapper, specs)
    pool.close()
    pool.join()
def post_deploy_steps(self, email, backend_id, machine_id, monitoring, command,
                      key_id=None, username=None, password=None, port=22):
    """Celery task: run an optional deployment command over SSH and enable
    monitoring on a newly created machine.

    Earlier variant of the post-deploy task: waits (via ``self.retry``)
    until the node has a public IPv4, runs ``command`` through an SSH
    shell when given, then enables monitoring when requested.
    """
    # NOTE(review): ssh_command and this enable_monitoring import appear
    # unused; the conditional import below decides which enable_monitoring
    # is actually bound.
    from mist.io.methods import ssh_command, connect_provider, enable_monitoring
    from mist.io.methods import notify_user, notify_admin
    if multi_user:
        from mist.core.methods import enable_monitoring
        from mist.core.helpers import log_event
    else:
        from mist.io.methods import enable_monitoring
        log_event = lambda *args, **kwargs: None
    user = user_from_email(email)
    try:
        # find the node we're looking for and get its hostname
        conn = connect_provider(user.backends[backend_id])
        nodes = conn.list_nodes()
        node = None
        for n in nodes:
            if n.id == machine_id:
                node = n
                break
        if node and len(node.public_ips):
            # filter out IPv6 addresses
            ips = filter(lambda ip: ':' not in ip, node.public_ips)
            host = ips[0]
        else:
            # No public IPv4 yet; retry in two minutes.
            raise self.retry(exc=Exception(), countdown=120, max_retries=5)
        try:
            from mist.io.shell import Shell
            shell = Shell(host)
            # connect with ssh even if no command, to create association
            # to be able to enable monitoring
            key_id, ssh_user = shell.autoconfigure(
                user, backend_id, node.id, key_id, username, password, port
            )
            backend = user.backends[backend_id]
            # NOTE(review): msg is built but never used in this function.
            msg = "Backend:\n Name: %s\n Id: %s\n" % (backend.title,
                                                      backend_id)
            msg += "Machine:\n Name: %s\n Id: %s\n" % (node.name, node.id)
            if command:
                # Fields shared by the start/finish log events.
                log_dict = {
                    'email': email,
                    'event_type': 'job',
                    'backend_id': backend_id,
                    'machine_id': machine_id,
                    'job_id': uuid.uuid4().hex,
                    'command': command,
                    'host': host,
                    'key_id': key_id,
                    'ssh_user': ssh_user,
                }
                log_event(action='deployment_script_started', **log_dict)
                start_time = time()
                retval, output = shell.command(command)
                execution_time = time() - start_time
                output = output.decode('utf-8','ignore')
                title = "Deployment script %s" % ('failed' if retval
                                                  else 'succeeded')
                notify_user(user, title,
                            backend_id=backend_id,
                            machine_id=machine_id,
                            machine_name=node.name,
                            command=command,
                            output=output,
                            duration=execution_time,
                            retval=retval,
                            error=retval > 0)
                log_event(action='deployment_script_finished',
                          error=retval > 0,
                          return_value=retval,
                          stdout=output,
                          **log_dict)
            shell.disconnect()
            if monitoring:
                # Best-effort; failures are reported but do not abort.
                try:
                    enable_monitoring(
                        user, backend_id, node.id,
                        name=node.name,
                        dns_name=node.extra.get('dns_name',''),
                        public_ips=ips,
                        no_ssh=False,
                        dry=False,
                    )
                except Exception as e:
                    print repr(e)
                    notify_user(
                        user,
                        "Enable monitoring failed for machine %s (%s)"
                        % (node.name, node.id),
                        repr(e))
                    notify_admin(
                        'Enable monitoring on creation failed for user %s machine %s: %r'
                        % (email, node.name, e))
        except (ServiceUnavailableError, SSHException) as exc:
            # Transient SSH/service problem: let celery retry the task.
            raise self.retry(exc=exc, countdown=60, max_retries=5)
    except Exception as exc:
        if str(exc).startswith('Retry'):
            # Retry exceptions must propagate to celery.
            raise
        notify_user(
            user,
            "Deployment script failed for machine %s after 5 retries"
            % node.id)
        notify_admin(
            "Deployment script failed for machine %s in backend %s by user %s after 5 retries"
            % (node.id, backend_id, email), repr(exc))
        log_event(
            email=email,
            event_type='job',
            action='deployment_script_failed',
            backend_id=backend_id,
            machine_id=machine_id,
            enable_monitoring=bool(monitoring),
            command=command,
            error="Couldn't connect to run post deploy steps (5 attempts).",
        )