예제 #1
0
def enable_monitoring(owner, cloud_id, machine_id, no_ssh=False, dry=False,
                      job_id='', deploy_async=True, plugins=None):
    """Switch monitoring on for a single machine.

    The machine is looked up by `machine_id` inside the (non-deleted) cloud
    `cloud_id` belonging to `owner`. The monitoring method is taken from the
    cloud, then from the cloud's owner, then from
    `config.DEFAULT_MONITORING_METHOD`.

    :param owner: owner of the cloud; also notified of session updates.
    :param cloud_id: id of the cloud the machine lives in.
    :param machine_id: value of the machine's `machine_id` field.
    :param no_ssh: when False, the Telegraf installation task is invoked
        (per the original documentation, deployment happens over SSH).
        When True no task is run and the caller is expected to run the
        returned command manually.
    :param dry: when True, return the commands without saving anything or
        touching the installation status.
    :param job_id: existing job id to attach the installation to. When empty
        and `no_ssh` is False, a fresh id is generated and the job is named
        'enable_monitoring'.
    :param deploy_async: when True the installation task is dispatched via
        its `.delay` attribute, otherwise it is called directly.
    :param plugins: passed through unchanged to the installation task.

    :returns: dict holding 'extra_vars', one '<os_type>_command' entry per
        item of `machine.monitoring.get_commands()`, 'command' (same value
        as 'unix_command'), plus 'job' (only when `no_ssh` is False) and
        'job_id' (only when non-empty) for non-dry runs.

    :raises NotFoundError: if the cloud or the machine cannot be found.
    :raises AssertionError: if the selected method is not listed in
        `config.MONITORING_METHODS` (only while assertions are enabled).
    :raises Exception: if the selected method is not a Telegraf one; any
        error from `traefik.reset_config()` is re-raised after the failure
        has been recorded on the machine.

    """
    telegraf_methods = ('telegraf-influxdb', 'telegraf-graphite')

    log.info("%s: Enabling monitoring for machine '%s' in cloud '%s'.",
             owner.id, machine_id, cloud_id)

    # Resolve the cloud first, then the machine inside it.
    try:
        cloud = Cloud.objects.get(owner=owner, id=cloud_id, deleted=None)
    except Cloud.DoesNotExist:
        raise NotFoundError('Cloud does not exist')
    try:
        machine = Machine.objects.get(cloud=cloud, machine_id=machine_id)
    except Machine.DoesNotExist:
        raise NotFoundError("Machine %s doesn't exist" % machine_id)

    if machine.monitoring.hasmonitoring:
        # Only a warning: the rest of the flow still runs.
        log.warning("%s: Monitoring is already enabled for machine '%s' "
                    "in cloud '%s'.", owner.id, machine_id, cloud_id)

    # Pick the monitoring method: cloud setting, owner setting, global
    # default - in that order of precedence.
    previous_method = machine.monitoring.method
    chosen_method = machine.cloud.default_monitoring_method
    if not chosen_method:
        chosen_method = machine.cloud.owner.default_monitoring_method
    if not chosen_method:
        chosen_method = config.DEFAULT_MONITORING_METHOD
    machine.monitoring.method = chosen_method
    assert machine.monitoring.method in config.MONITORING_METHODS

    # Remember when the method was last switched.
    if previous_method != machine.monitoring.method:
        machine.monitoring.method_since = datetime.datetime.now()

    # Only Telegraf-based methods are handled here.
    if machine.monitoring.method not in telegraf_methods:
        raise Exception("Invalid monitoring method")

    # Assemble the response: extra vars plus one command per OS type.
    ret_dict = {
        'extra_vars': {'uuid': machine.id,
                       'monitor': config.INFLUX['host']},
    }
    ret_dict.update(
        ('%s_command' % os_type, cmd)
        for os_type, cmd in machine.monitoring.get_commands().items()
    )
    # 'command' is kept for backwards compatibility.
    ret_dict['command'] = ret_dict['unix_command']

    if dry:
        # Dry run: nothing has been persisted up to this point.
        return ret_dict

    # Start from a clean installation status.
    machine.monitoring.installation_status = InstallationStatus()
    status = machine.monitoring.installation_status
    status.started_at = time.time()
    status.state = 'preparing'
    status.manual = no_ssh
    machine.monitoring.hasmonitoring = True

    machine.save()
    trigger_session_update(owner, ['monitoring'])

    # Refresh the traefik configuration; on failure record the error, turn
    # monitoring back off and let the exception propagate.
    try:
        if machine.monitoring.method in telegraf_methods:
            traefik.reset_config()
    except Exception as exc:
        failed_status = machine.monitoring.installation_status
        failed_status.state = 'failed'
        failed_status.error_msg = repr(exc)
        failed_status.finished_at = time.time()
        machine.monitoring.hasmonitoring = False
        machine.save()
        trigger_session_update(owner, ['monitoring'])
        raise

    # Manual installs go straight to 'installing'; automated ones stay
    # 'pending' until the installation task picks them up.
    machine.monitoring.installation_status.state = (
        'installing' if no_ssh else 'pending')
    machine.save()
    trigger_session_update(owner, ['monitoring'])

    if not no_ssh:
        # A caller-supplied job id means we are part of an existing job, so
        # no job name is reported; otherwise start a new job.
        job = None if job_id else 'enable_monitoring'
        job_id = job_id or uuid.uuid4().hex
        ret_dict['job'] = job

        if machine.monitoring.method not in telegraf_methods:
            raise Exception("Invalid monitoring method")

        # Install Telegraf, asynchronously unless told otherwise.
        installer = mist.api.monitoring.tasks.install_telegraf
        installer = installer.delay if deploy_async else installer
        installer(machine.id, job, job_id, plugins)

    if job_id:
        ret_dict['job_id'] = job_id

    return ret_dict
예제 #2
0
def disable_monitoring(owner, cloud_id, machine_id, no_ssh=False, job_id=''):
    """Switch monitoring off for a single machine.

    The machine is looked up by `machine_id` inside the (non-deleted) cloud
    `cloud_id` belonging to `owner`.

    :param owner: owner of the cloud; also notified of the session update.
    :param cloud_id: id of the cloud the machine lives in.
    :param machine_id: value of the machine's `machine_id` field.
    :param no_ssh: when False, the Telegraf uninstall task is scheduled (per
        the original documentation, the agent is removed over SSH). When
        True no uninstall task is scheduled.
    :param job_id: existing job id to attach the uninstall to. When empty
        and `no_ssh` is False, a fresh id is generated and the job is named
        'disable_monitoring'.

    :returns: dict with 'job' (only when `no_ssh` is False) and 'job_id'
        (only when non-empty).

    :raises NotFoundError: if the cloud or the machine cannot be found.
    :raises BadRequestError: if monitoring is not enabled on the machine.

    """
    telegraf_methods = ('telegraf-influxdb', 'telegraf-graphite')

    log.info("%s: Disabling monitoring for machine '%s' in cloud '%s'.",
             owner.id, machine_id, cloud_id)

    # Resolve the cloud first, then the machine inside it.
    try:
        cloud = Cloud.objects.get(owner=owner, id=cloud_id, deleted=None)
    except Cloud.DoesNotExist:
        raise NotFoundError('Cloud does not exist')
    try:
        machine = Machine.objects.get(cloud=cloud, machine_id=machine_id)
    except Machine.DoesNotExist:
        raise NotFoundError("Machine %s doesn't exist" % machine_id)

    if not machine.monitoring.hasmonitoring:
        raise BadRequestError('Machine does not have monitoring enabled')

    ret_dict = {}
    if not no_ssh:
        # A caller-supplied job id means we are part of an existing job, so
        # no job name is reported; otherwise start a new job.
        job = None if job_id else 'disable_monitoring'
        job_id = job_id or uuid.uuid4().hex
        ret_dict['job'] = job

        if machine.monitoring.method in telegraf_methods:
            # Schedule the removal of Telegraf.
            uninstall = mist.api.monitoring.tasks.uninstall_telegraf
            uninstall.delay(machine.id, job, job_id)
    if job_id:
        ret_dict['job_id'] = job_id

    # Detach the machine from the owner's rules. A rule that refers to this
    # machine alone is deleted; any other rule is asked to drop the machine
    # from the resources it refers to.
    for rule in Rule.objects(owner_id=machine.owner.id):
        if not rule.ctl.includes_only(machine):
            rule.ctl.maybe_remove(machine)
            continue
        rule.delete()

    # Persist the "monitoring off" state.
    machine.monitoring.hasmonitoring = False
    machine.monitoring.activated_at = 0
    machine.save()

    # Refresh the traefik configuration. Failures are only logged:
    # monitoring has already been disabled in the database.
    try:
        if machine.monitoring.method in telegraf_methods:
            traefik.reset_config()
    except Exception as exc:
        log.error("Exception %s while asking monitor server in "
                  "disable_monitoring", exc)

    trigger_session_update(owner, ['monitoring'])
    return ret_dict
예제 #3
0
def reset_traefik_config():
    """Call `reset_config()` if the current configuration cannot be fetched.

    `_get_config()` is used purely as a probe and its result is discarded.
    Any `Exception` it raises is logged at error level and followed by a
    call to `reset_config()`. When the probe succeeds nothing else happens.
    Errors raised by `reset_config()` itself are not handled here.

    """
    try:
        _get_config()
    except Exception as err:
        # Fetching failed: log why, then fall back to a reset.
        log.error(err)
        reset_config()