def enable_monitoring(owner, cloud_id, machine_id, no_ssh=False, dry=False,
                      job_id='', deploy_async=True, plugins=None):
    """Turn monitoring on for a single machine.

    The machine is resolved from `cloud_id` / `machine_id` among the
    non-deleted clouds of `owner`. The monitoring method is taken from the
    cloud's default, then the cloud owner's default, then
    `config.DEFAULT_MONITORING_METHOD`.

    When `dry` is set, the function returns the result dict before any
    `machine.save()` call is made. Otherwise the machine is flagged as
    monitored, Traefik's configuration is reset and, unless `no_ssh` is
    set, the Telegraf installation task is invoked (through `.delay` when
    `deploy_async` is true, directly otherwise). With `no_ssh` set, no task
    is invoked and the returned commands are meant to be run manually.

    If resetting Traefik raises, the installation status is marked as
    'failed', `hasmonitoring` is reverted to False, the machine is saved
    and the exception is re-raised.

    Returns a dict with `extra_vars`, one `<os_type>_command` entry per
    command returned by `machine.monitoring.get_commands()`, `command`
    (copy of `unix_command`), `job` when `no_ssh` is false, and `job_id`
    whenever it is non-empty.

    Raises NotFoundError if the cloud or the machine cannot be found, and
    a plain Exception if the selected method is not a Telegraf one.

    """
    telegraf_methods = ('telegraf-influxdb', 'telegraf-graphite')

    log.info("%s: Enabling monitoring for machine '%s' in cloud '%s'.",
             owner.id, machine_id, cloud_id)

    # Resolve the cloud first, then the machine inside that cloud.
    try:
        cloud = Cloud.objects.get(owner=owner, id=cloud_id, deleted=None)
    except Cloud.DoesNotExist:
        raise NotFoundError('Cloud does not exist')
    try:
        machine = Machine.objects.get(cloud=cloud, machine_id=machine_id)
    except Machine.DoesNotExist:
        raise NotFoundError("Machine %s doesn't exist" % machine_id)

    monitoring = machine.monitoring
    if monitoring.hasmonitoring:
        # Only a warning: execution continues past this point.
        log.warning(
            "%s: Monitoring is already enabled for machine '%s' "
            "in cloud '%s'.", owner.id, machine_id, cloud_id)

    # Pick the method and remember when it last changed.
    previous_method = monitoring.method
    monitoring.method = (machine.cloud.default_monitoring_method or
                         machine.cloud.owner.default_monitoring_method or
                         config.DEFAULT_MONITORING_METHOD)
    assert monitoring.method in config.MONITORING_METHODS
    if monitoring.method != previous_method:
        monitoring.method_since = datetime.datetime.now()

    # Only the Telegraf-based methods are handled below.
    if monitoring.method not in telegraf_methods:
        raise Exception("Invalid monitoring method")

    # Assemble the response: extra vars plus one command per OS type.
    result = {
        'extra_vars': {'uuid': machine.id, 'monitor': config.INFLUX['host']},
    }
    result.update(
        ('%s_command' % os_type, cmd)
        for os_type, cmd in monitoring.get_commands().items()
    )
    # Kept for backwards compatibility.
    result['command'] = result['unix_command']

    if dry:
        return result

    # Start from a fresh installation status and flag the machine.
    monitoring.installation_status = InstallationStatus()
    status = monitoring.installation_status
    status.started_at = time.time()
    status.state = 'preparing'
    status.manual = no_ssh
    monitoring.hasmonitoring = True
    machine.save()
    trigger_session_update(owner, ['monitoring'])

    # Reset Traefik's configuration; on failure record it and bail out.
    try:
        if monitoring.method in telegraf_methods:
            traefik.reset_config()
    except Exception as err:
        failed = monitoring.installation_status
        failed.state = 'failed'
        failed.error_msg = repr(err)
        failed.finished_at = time.time()
        monitoring.hasmonitoring = False
        machine.save()
        trigger_session_update(owner, ['monitoring'])
        raise

    # 'installing' for the manual flow, 'pending' for the SSH flow.
    monitoring.installation_status.state = (
        'installing' if no_ssh else 'pending'
    )
    machine.save()
    trigger_session_update(owner, ['monitoring'])

    if not no_ssh:
        # A caller-supplied job_id yields no job name; otherwise a new
        # job_id is generated and the job is named after this action.
        job = None if job_id else 'enable_monitoring'
        job_id = job_id or uuid.uuid4().hex
        result['job'] = job

        if monitoring.method not in telegraf_methods:
            raise Exception("Invalid monitoring method")

        # Install Telegraf, via `.delay` when deploy_async is set.
        install = mist.api.monitoring.tasks.install_telegraf
        if deploy_async:
            install = install.delay
        install(machine.id, job, job_id, plugins)

    if job_id:
        result['job_id'] = job_id

    return result
def disable_monitoring(owner, cloud_id, machine_id, no_ssh=False, job_id=''):
    """Turn monitoring off for a single machine.

    The machine is resolved from `cloud_id` / `machine_id` among the
    non-deleted clouds of `owner`. Unless `no_ssh` is set, the Telegraf
    uninstallation task is scheduled for machines using a Telegraf-based
    method. Rules belonging to the machine's owner are then cleaned up,
    the machine is saved with monitoring switched off, and Traefik's
    configuration is reset. A failure of that reset is logged, not raised.

    Returns a dict with `job` when `no_ssh` is false, and `job_id`
    whenever it is non-empty.

    Raises NotFoundError if the cloud or the machine cannot be found, and
    BadRequestError if the machine does not have monitoring enabled.

    """
    telegraf_methods = ('telegraf-influxdb', 'telegraf-graphite')

    log.info("%s: Disabling monitoring for machine '%s' in cloud '%s'.",
             owner.id, machine_id, cloud_id)

    # Resolve the cloud first, then the machine inside that cloud.
    try:
        cloud = Cloud.objects.get(owner=owner, id=cloud_id, deleted=None)
    except Cloud.DoesNotExist:
        raise NotFoundError('Cloud does not exist')
    try:
        machine = Machine.objects.get(cloud=cloud, machine_id=machine_id)
    except Machine.DoesNotExist:
        raise NotFoundError("Machine %s doesn't exist" % machine_id)

    monitoring = machine.monitoring
    if not monitoring.hasmonitoring:
        raise BadRequestError('Machine does not have monitoring enabled')

    response = {}
    if not no_ssh:
        # A caller-supplied job_id yields no job name; otherwise a new
        # job_id is generated and the job is named after this action.
        job = None if job_id else 'disable_monitoring'
        job_id = job_id or uuid.uuid4().hex
        response['job'] = job

        if monitoring.method in telegraf_methods:
            # Schedule the removal of Telegraf.
            uninstall = mist.api.monitoring.tasks.uninstall_telegraf
            uninstall.delay(machine.id, job, job_id)

    if job_id:
        response['job_id'] = job_id

    # Rule cleanup: a rule that includes only this machine is deleted;
    # for any other rule an attempt is made to drop the machine from it.
    for rule in Rule.objects(owner_id=machine.owner.id):
        if not rule.ctl.includes_only(machine):
            rule.ctl.maybe_remove(machine)
            continue
        rule.delete()

    # Persist the "monitoring off" state.
    monitoring.hasmonitoring = False
    monitoring.activated_at = 0
    machine.save()

    # Best effort: a failing Traefik reset must not fail the whole call.
    try:
        if monitoring.method in telegraf_methods:
            traefik.reset_config()
    except Exception as err:
        log.error(
            "Exception %s while asking monitor server in disable_monitoring",
            err)

    trigger_session_update(owner, ['monitoring'])

    return response
def reset_traefik_config():
    """Reset the Traefik configuration when fetching it fails.

    Calls `_get_config()`; if that raises, the error is logged and
    `reset_config()` is called. Nothing is returned.

    """
    # NOTE(review): the reset is taken to belong to the failure path only;
    # confirm against the intended layout of this function.
    try:
        _get_config()
    except Exception as err:
        log.error(err)
        reset_config()