Example #1
    def register(self, uuid=None, hostname=None, ip_address=None):
        if not uuid:
            uuid = settings.SYSTEM_UUID
        if not hostname:
            hostname = settings.CLUSTER_HOST_ID
        with advisory_lock('instance_registration_%s' % hostname):
            if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                # detect any instances with the same IP address.
                # if one exists, set it to None
                inst_conflicting_ip = self.filter(ip_address=ip_address).exclude(hostname=hostname)
                if inst_conflicting_ip.exists():
                    for other_inst in inst_conflicting_ip:
                        other_hostname = other_inst.hostname
                        other_inst.ip_address = None
                        other_inst.save(update_fields=['ip_address'])
                        logger.warning("IP address {0} conflict detected, ip address unset for host {1}.".format(ip_address, other_hostname))

            instance = self.filter(hostname=hostname)
            if instance.exists():
                instance = instance.get()
                if instance.ip_address != ip_address:
                    instance.ip_address = ip_address
                    instance.save(update_fields=['ip_address'])
                    return (True, instance)
                else:
                    return (False, instance)
            instance = self.create(uuid=uuid,
                                   hostname=hostname,
                                   ip_address=ip_address,
                                   capacity=0)
        return (True, instance)
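All of the snippets on this page coordinate through the `advisory_lock` context manager; in AWX it appears to be imported from the `django_pglocks` package. A minimal sketch of the two acquisition modes used throughout (the lock name and the helper are illustrative):

from django_pglocks import advisory_lock

def do_registration():  # hypothetical stand-in for the critical section
    pass

# Blocking mode: wait until the lock is free, then run the body.
with advisory_lock('instance_registration_node1'):
    do_registration()

# Non-blocking mode: wait=False yields a boolean instead of blocking,
# letting callers bail out when another process holds the lock.
with advisory_lock('instance_registration_node1', wait=False) as acquired:
    if acquired:
        do_registration()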
Example #2
def migrate_json_fields(table, expensive, columns):
    logger.warning(f"Migrating json fields: {table} {columns}")

    with advisory_lock(f'json_migration_{table}', wait=False) as acquired:
        if not acquired:
            return

        from django.db.migrations.executor import MigrationExecutor

        # If Django is currently running migrations, wait until it is done.
        while True:
            executor = MigrationExecutor(connection)
            if not executor.migration_plan(executor.loader.graph.leaf_nodes()):
                break
            time.sleep(60)

        if expensive:
            migrate_json_fields_expensive(table, columns)
        else:
            with connection.cursor() as cursor:
                # Comma-separate the per-column actions; ALTER ... TYPE needs
                # a USING clause when there is no assignment cast to jsonb.
                column_expr = ", ".join(f"ALTER COLUMN {colname} TYPE jsonb USING {colname}::jsonb"
                                        for colname in columns)
                cursor.execute(f"ALTER TABLE {table} {column_expr};")

    logger.warning(f"Migration of {table} to jsonb is finished")
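For context, a call to the helper above might look like the following sketch; the table and column names are hypothetical, not taken from the snippet:

# Hypothetical invocation: convert main_jobevent.event_data to jsonb
# using the cheap single-statement path.
migrate_json_fields('main_jobevent', expensive=False, columns=['event_data'])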
Example #3
 def handle(self, *args, **options):
     # TODO: remove in 3.3
     if options.get('name'):
         warnings.warn(
             "`--name` is deprecated in favor of `--hostname`, and will be removed in release 3.3."
         )
         if options.get('hostname'):
             raise CommandError("Cannot accept both --name and --hostname.")
         options['hostname'] = options['name']
     hostname = options.get('hostname')
     if not hostname:
         raise CommandError("--hostname is a required argument")
     with advisory_lock('instance_registration_%s' % hostname):
         instance = Instance.objects.filter(hostname=hostname)
         if instance.exists():
             instance.delete()
             print("Instance Removed")
             result = subprocess.Popen(
                 "rabbitmqctl forget_cluster_node rabbitmq@{}".format(
                     hostname),
                 shell=True).wait()
             if result != 0:
                 print(
                     "Node deprovisioning may have failed when attempting to "
                     "remove the RabbitMQ instance {} from the cluster".
                     format(hostname))
             else:
                 print('Successfully deprovisioned {}'.format(hostname))
             print('(changed: True)')
         else:
             print('No instance found matching name {}'.format(hostname))
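The Popen(...).wait() call above runs a string command through the shell; the same step can be written with subprocess.run and an argument list, avoiding shell=True. A sketch, not the original code:

import subprocess

hostname = 'awx-node-1.example.com'  # illustrative
# Same rabbitmqctl invocation as above, without spawning a shell.
result = subprocess.run(['rabbitmqctl', 'forget_cluster_node', 'rabbitmq@{}'.format(hostname)])
if result.returncode != 0:
    print('Node deprovisioning may have failed for {}'.format(hostname))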
Example #4
 def handle(self, *args, **options):
     # TODO: remove in 3.3
     hostname = options.get('hostname')
     if not hostname:
         raise CommandError("--hostname is a required argument")
     with advisory_lock('instance_registration_%s' % hostname):
         instance = Instance.objects.filter(hostname=hostname)
         if instance.exists():
             isolated = instance.first().is_isolated()
             instance.delete()
             print("Instance Removed")
             if isolated:
                 print('Successfully deprovisioned {}'.format(hostname))
             else:
                 result = subprocess.Popen(
                     "rabbitmqctl forget_cluster_node rabbitmq@{}".format(
                         hostname),
                     shell=True).wait()
                 if result != 0:
                     print(
                         "Node deprovisioning may have failed when attempting to "
                         "remove the RabbitMQ instance {} from the cluster".
                         format(hostname))
                 else:
                     print('Successfully deprovisioned {}'.format(hostname))
             print('(changed: True)')
         else:
             print('No instance found matching name {}'.format(hostname))
Example #5
    def register(self):
        with advisory_lock('cluster_policy_lock'):
            with transaction.atomic():
                changed2 = False
                changed3 = False
                (ig, created, changed1) = self.get_create_update_instance_group()
                if created:
                    print("Creating instance group {}".format(ig.name))
                else:
                    print("Instance Group already registered {}".format(ig.name))

                if self.controller:
                    (ig_ctrl, changed2) = self.update_instance_group_controller(ig)
                    if changed2:
                        print("Set controller group {} on {}.".format(self.controller, self.queuename))

                try:
                    (instances, changed3) = self.add_instances_to_group(ig)
                    for i in instances:
                        print("Added instance {} to {}".format(i.hostname, ig.name))
                except InstanceNotFound as e:
                    self.instance_not_found_err = e

        if any([changed1, changed2, changed3]):
            print('(changed: True)')
Example #6
def inspect_execution_nodes(instance_list):
    with advisory_lock('inspect_execution_nodes_lock', wait=False):
        node_lookup = {inst.hostname: inst for inst in instance_list}

        ctl = get_receptor_ctl()
        mesh_status = ctl.simple_command('status')

        nowtime = now()
        workers = mesh_status['Advertisements']
        for ad in workers:
            hostname = ad['NodeID']

            if hostname in node_lookup:
                instance = node_lookup[hostname]
            else:
                logger.warning(
                    f"Unrecognized node advertising on mesh: {hostname}")
                continue

            # Control-plane nodes are dealt with via local_health_check instead.
            if instance.node_type in ('control', 'hybrid'):
                continue

            was_lost = instance.is_lost(ref_time=nowtime)
            last_seen = parse_date(ad['Time'])

            if instance.last_seen and instance.last_seen >= last_seen:
                continue
            instance.last_seen = last_seen
            instance.save(update_fields=['last_seen'])

            # Only execution nodes should be dealt with by execution_node_health_check
            if instance.node_type == 'hop':
                if was_lost and (not instance.is_lost(ref_time=nowtime)):
                    logger.warning(
                        f'Hop node {hostname} has rejoined the receptor mesh')
                    instance.save_health_data(errors='')
                continue

            if was_lost:
                # if the instance *was* lost, but has appeared again,
                # attempt to re-establish the initial capacity and version
                # check
                logger.warning(
                    f'Execution node attempting to rejoin as instance {hostname}.'
                )
                execution_node_health_check.apply_async([hostname])
            elif instance.capacity == 0 and instance.enabled:
                # nodes with a proven connection that still need remediation run health checks at a reduced frequency
                if not instance.last_health_check or (
                        nowtime - instance.last_health_check).total_seconds(
                        ) >= settings.EXECUTION_NODE_REMEDIATION_CHECKS:
                    # Periodically re-run the health check of errored nodes, in case someone fixed it
                    # TODO: perhaps decrease the frequency of these checks
                    logger.debug(
                        f'Restarting health check for execution node {hostname} with known errors.'
                    )
                    execution_node_health_check.apply_async([hostname])
Example #7
File: system.py Project: mahak/awx
def awx_periodic_scheduler():
    with advisory_lock('awx_periodic_scheduler_lock', wait=False) as acquired:
        if acquired is False:
            logger.debug("Not running periodic scheduler, another task holds lock")
            return
        logger.debug("Starting periodic scheduler")

        run_now = now()
        state = TowerScheduleState.get_solo()
        last_run = state.schedule_last_run
        logger.debug("Last scheduler run was: %s", last_run)
        state.schedule_last_run = run_now
        state.save()

        old_schedules = Schedule.objects.enabled().before(last_run)
        for schedule in old_schedules:
            schedule.update_computed_fields()
        schedules = Schedule.objects.enabled().between(last_run, run_now)

        invalid_license = False
        try:
            access_registry[Job](None).check_license(quiet=True)
        except PermissionDenied as e:
            invalid_license = e

        for schedule in schedules:
            template = schedule.unified_job_template
            schedule.update_computed_fields()  # To update next_run timestamp.
            if template.cache_timeout_blocked:
                logger.warning("Cache timeout is in the future, bypassing schedule for template %s" % str(template.id))
                continue
            try:
                job_kwargs = schedule.get_job_kwargs()
                new_unified_job = schedule.unified_job_template.create_unified_job(**job_kwargs)
                logger.debug('Spawned {} from schedule {}-{}.'.format(new_unified_job.log_format, schedule.name, schedule.pk))

                if invalid_license:
                    new_unified_job.status = 'failed'
                    new_unified_job.job_explanation = str(invalid_license)
                    new_unified_job.save(update_fields=['status', 'job_explanation'])
                    new_unified_job.websocket_emit_status("failed")
                    raise invalid_license
                can_start = new_unified_job.signal_start()
            except Exception:
                logger.exception('Error spawning scheduled job.')
                continue
            if not can_start:
                new_unified_job.status = 'failed'
                new_unified_job.job_explanation = gettext_noop(
                    "Scheduled job could not start because it "
                    "was not in the right state or required manual credentials"
                )
                new_unified_job.save(update_fields=['status', 'job_explanation'])
                new_unified_job.websocket_emit_status("failed")
            emit_channel_notification('schedules-changed', dict(id=schedule.id, group_name="schedules"))
        state.save()
Example #8
 def schedule(self):
     # Lock
     with advisory_lock('task_manager_lock', wait=False) as acquired:
         with transaction.atomic():
             if acquired is False:
                 logger.debug("Not running scheduler, another task holds lock")
                 return
             logger.debug("Starting Scheduler")
             with task_manager_bulk_reschedule():
                 self._schedule()
Example #9
    def register(self, uuid=None, hostname=None, ip_address=None, node_type='hybrid', defaults=None):
        if not hostname:
            hostname = settings.CLUSTER_HOST_ID

        with advisory_lock('instance_registration_%s' % hostname):
            if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                # detect any instances with the same IP address.
                # if one exists, set it to None
                inst_conflicting_ip = self.filter(ip_address=ip_address).exclude(hostname=hostname)
                if inst_conflicting_ip.exists():
                    for other_inst in inst_conflicting_ip:
                        other_hostname = other_inst.hostname
                        other_inst.ip_address = None
                        other_inst.save(update_fields=['ip_address'])
                        logger.warning("IP address {0} conflict detected, ip address unset for host {1}.".format(ip_address, other_hostname))

            # Return existing instance that matches hostname or UUID (default to UUID)
            if uuid is not None and uuid != UUID_DEFAULT and self.filter(uuid=uuid).exists():
                instance = self.filter(uuid=uuid)
            else:
                # if instance was not retrieved by uuid and hostname was, use the hostname
                instance = self.filter(hostname=hostname)

            # Return existing instance
            if instance.exists():
                instance = instance.first()  # in the unusual occasion that there is more than one, only get one
                update_fields = []
                # if instance was retrieved by uuid and hostname has changed, update hostname
                if instance.hostname != hostname:
                    logger.warning("passed in hostname {0} is different from the original hostname {1}, updating to {0}".format(hostname, instance.hostname))
                    instance.hostname = hostname
                    update_fields.append('hostname')
                # if any other fields are to be updated
                if instance.ip_address != ip_address:
                    instance.ip_address = ip_address
                    update_fields.append('ip_address')
                if instance.node_type != node_type:
                    instance.node_type = node_type
                    update_fields.append('node_type')
                if update_fields:
                    instance.save(update_fields=update_fields)
                    return (True, instance)
                else:
                    return (False, instance)

            # Create new instance, and fill in default values
            create_defaults = dict(capacity=0)
            if defaults is not None:
                create_defaults.update(defaults)
            uuid_option = {}
            if uuid is not None:
                uuid_option = dict(uuid=uuid)
            if node_type == 'execution' and 'version' not in create_defaults:
                create_defaults['version'] = RECEPTOR_PENDING
            instance = self.create(hostname=hostname, ip_address=ip_address, node_type=node_type, **create_defaults, **uuid_option)
        return (True, instance)
Example #10
    def register(self,
                 uuid=None,
                 hostname=None,
                 ip_address=None,
                 node_type='hybrid',
                 defaults=None):
        if not hostname:
            hostname = settings.CLUSTER_HOST_ID
        with advisory_lock('instance_registration_%s' % hostname):
            if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                # detect any instances with the same IP address.
                # if one exists, set it to None
                inst_conflicting_ip = self.filter(
                    ip_address=ip_address).exclude(hostname=hostname)
                if inst_conflicting_ip.exists():
                    for other_inst in inst_conflicting_ip:
                        other_hostname = other_inst.hostname
                        other_inst.ip_address = None
                        other_inst.save(update_fields=['ip_address'])
                        logger.warning(
                            "IP address {0} conflict detected, ip address unset for host {1}."
                            .format(ip_address, other_hostname))

            # Return existing instance that matches hostname
            instance = self.filter(hostname=hostname)
            if instance.exists():
                instance = instance.get()
                update_fields = []
                if instance.ip_address != ip_address:
                    instance.ip_address = ip_address
                    update_fields.append('ip_address')
                if instance.node_type != node_type:
                    instance.node_type = node_type
                    update_fields.append('node_type')
                if update_fields:
                    instance.save(update_fields=update_fields)
                    return (True, instance)
                else:
                    return (False, instance)

            # Create new instance, and fill in default values
            create_defaults = dict(capacity=0)
            if defaults is not None:
                create_defaults.update(defaults)
            uuid_option = {}
            if uuid is not None:
                uuid_option = dict(uuid=uuid)
            if node_type == 'execution' and 'version' not in create_defaults:
                create_defaults['version'] = RECEPTOR_PENDING
            instance = self.create(hostname=hostname,
                                   ip_address=ip_address,
                                   node_type=node_type,
                                   **create_defaults,
                                   **uuid_option)
        return (True, instance)
Example #11
 def handle(self, **options):
     queuename = options.get('queuename')
     if not queuename:
         raise CommandError("Specify `--queuename` to use this command.")
     changed = False
     with advisory_lock('instance_group_registration_%s' % queuename):
         ig = InstanceGroup.objects.filter(name=queuename)
         control_ig = None
         if options.get('controller'):
             control_ig = InstanceGroup.objects.filter(
                 name=options.get('controller')).first()
         if ig.exists():
             print("Instance Group already registered {}".format(
                 ig[0].name))
             ig = ig[0]
             if control_ig and ig.controller_id != control_ig.pk:
                 ig.controller = control_ig
                 ig.save()
                 print("Set controller group {} on {}.".format(
                     control_ig.name, ig.name))
                 changed = True
         else:
             print("Creating instance group {}".format(queuename))
             ig = InstanceGroup(
                 name=queuename,
                 policy_instance_percentage=options.get('instance_percent'),
                 policy_instance_minimum=options.get('instance_minimum'))
             if control_ig:
                 ig.controller = control_ig
             ig.save()
             changed = True
         hostname_list = []
         if options.get('hostnames'):
             hostname_list = options.get('hostnames').split(",")
         instance_list = [x.strip() for x in hostname_list if x]
         for inst_name in instance_list:
             instance = Instance.objects.filter(hostname=inst_name)
             if instance.exists() and instance[0] not in ig.instances.all():
                 ig.instances.add(instance[0])
                 print("Added instance {} to {}".format(
                     instance[0].hostname, ig.name))
                 changed = True
             elif not instance.exists():
                 print("Instance does not exist: {}".format(inst_name))
                 if changed:
                     print('(changed: True)')
                 sys.exit(1)
             else:
                 print("Instance already registered {}".format(
                     instance[0].hostname))
         ig.policy_instance_list = instance_list
         ig.save()
         if changed:
             print('(changed: True)')
Example #12
 def register(self, uuid=None, hostname=None):
     if not uuid:
         uuid = settings.SYSTEM_UUID
     if not hostname:
         hostname = settings.CLUSTER_HOST_ID
     with advisory_lock('instance_registration_%s' % hostname):
         instance = self.filter(hostname=hostname)
         if instance.exists():
             return (False, instance[0])
         instance = self.create(uuid=uuid, hostname=hostname, capacity=0)
     return (True, instance)
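A usage sketch for this manager method; the hostname is illustrative. The return value is a (changed, instance) tuple, with changed True only when a new row was created:

changed, instance = Instance.objects.register(hostname='awx-node-1.example.com')
if changed:
    print('(changed: True)')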
Example #13
    def handle(self, **options):
        instance_not_found_err = None
        queuename = options.get('queuename')
        if not queuename:
            raise CommandError("Specify `--queuename` to use this command.")
        ctrl = options.get('controller')
        inst_per = options.get('instance_percent')
        inst_min = options.get('instance_minimum')
        hostname_list = []
        if options.get('hostnames'):
            hostname_list = options.get('hostnames').split(",")

        with advisory_lock(
                six.text_type('instance_group_registration_{}').format(
                    queuename)):
            changed2 = False
            changed3 = False
            (ig, created, changed1) = self.get_create_update_instance_group(
                queuename, inst_per, inst_min)
            if created:
                print(
                    six.text_type("Creating instance group {}").format(
                        ig.name))
            else:
                print(
                    six.text_type(
                        "Instance Group already registered {}").format(
                            ig.name))

            if ctrl:
                (ig_ctrl,
                 changed2) = self.update_instance_group_controller(ig, ctrl)
                if changed2:
                    print(
                        six.text_type("Set controller group {} on {}.").format(
                            ctrl, queuename))

            try:
                (instances,
                 changed3) = self.add_instances_to_group(ig, hostname_list)
                for i in instances:
                    print(
                        six.text_type("Added instance {} to {}").format(
                            i.hostname, ig.name))
            except InstanceNotFound as e:
                instance_not_found_err = e

        if any([changed1, changed2, changed3]):
            print('(changed: True)')

        if instance_not_found_err:
            print(instance_not_found_err.message)
            sys.exit(1)
Example #14
 def _register_hostname(self, hostname):
     if not hostname:
         return
     with advisory_lock('instance_registration_%s' % hostname):
         instance = Instance.objects.filter(hostname=hostname)
         if instance.exists():
             print("Instance already registered {}".format(
                 instance[0].hostname))
             return
         instance = Instance(uuid=self.uuid, hostname=hostname)
         instance.save()
     print('Successfully registered instance {}'.format(hostname))
     self.changed = True
Example #15
 def handle(self, **options):
     queuename = options.get('queuename')
     if not queuename:
         raise CommandError('Must specify `--queuename` in order to use command.')
     with advisory_lock('instance_group_registration_%s' % queuename):
         ig = InstanceGroup.objects.filter(name=queuename)
         if not ig.exists():
             print("Instance group doesn't exist")
             sys.exit(1)
         ig = ig.first()
         ig.delete()
         print("Instance Group Removed")
         print('(changed: True)')
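Assuming this handle() backs a management command named unregister_queue (an assumption; the snippet does not name it), it could also be driven from Python through Django's call_command:

from django.core.management import call_command

# Hypothetical command name; queuename maps to the --queuename option above.
call_command('unregister_queue', queuename='default')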
Example #16
    def schedule(self):
        # Lock
        with advisory_lock('task_manager_lock', wait=False) as acquired:
            with transaction.atomic():
                if acquired is False:
                    logger.debug("Not running scheduler, another task holds lock")
                    return
                logger.debug("Starting Scheduler")

                finished_wfjs = self._schedule()

                # Operations whose queries rely on modifications made during the atomic scheduling session
                for wfj in WorkflowJob.objects.filter(id__in=finished_wfjs):
                    wfj.send_notification_templates('succeeded' if wfj.status == 'successful' else 'failed')
Example #17
 def handle(self, *args, **options):
     # TODO: remove in 3.3
     hostname = options.get('hostname')
     if not hostname:
         raise CommandError("--hostname is a required argument")
     with advisory_lock('instance_registration_%s' % hostname):
         instance = Instance.objects.filter(hostname=hostname)
         if instance.exists():
             instance.delete()
             print("Instance Removed")
             print('Successfully deprovisioned {}'.format(hostname))
             print('(changed: True)')
         else:
             print('No instance found matching name {}'.format(hostname))
Example #18
 def schedule(self):
     # Lock
     with advisory_lock('task_manager_lock', wait=False) as acquired:
         with transaction.atomic():
             if acquired is False:
                 logger.debug("Not running scheduler, another task holds lock")
                 return
             logger.debug("Starting Scheduler")
             with task_manager_bulk_reschedule():
                 # if sigterm due to timeout, still record metrics
                 signal.signal(signal.SIGTERM, self.record_aggregate_metrics_and_exit)
                 self._schedule()
                 self.record_aggregate_metrics()
             logger.debug("Finishing Scheduler")
Example #19
    def register(self):
        with advisory_lock('cluster_policy_lock'):
            with transaction.atomic():
                changed2 = False
                (ig, created, changed1) = self.get_create_update_instance_group()
                if created:
                    print("Creating instance group {}".format(ig.name))
                else:
                    print("Instance Group already registered {}".format(ig.name))

                try:
                    (instances, changed2) = self.add_instances_to_group(ig)
                    for i in instances:
                        print("Added instance {} to {}".format(i.hostname, ig.name))
                except InstanceNotFound as e:
                    self.instance_not_found_err = e

        if changed1 or changed2:
            print('(changed: True)')
Example #20
 def register(self, uuid=None, hostname=None, ip_address=None):
     if not uuid:
         uuid = settings.SYSTEM_UUID
     if not hostname:
         hostname = settings.CLUSTER_HOST_ID
     with advisory_lock('instance_registration_%s' % hostname):
         instance = self.filter(hostname=hostname)
         if instance.exists():
             instance = instance.get()
             if instance.ip_address != ip_address:
                 instance.ip_address = ip_address
                 instance.save(update_fields=['ip_address'])
                 return (True, instance)
             else:
                 return (False, instance)
         instance = self.create(uuid=uuid,
                                hostname=hostname,
                                ip_address=ip_address,
                                capacity=0)
     return (True, instance)
Example #21
    def schedule(self):
        # Lock
        with task_manager_bulk_reschedule():
            with advisory_lock(f"{self.prefix}_lock", wait=False) as acquired:
                with transaction.atomic():
                    if acquired is False:
                        logger.debug(
                            f"Not running {self.prefix} scheduler, another task holds lock"
                        )
                        return
                    logger.debug(f"Starting {self.prefix} Scheduler")
                    # if sigterm due to timeout, still record metrics
                    signal.signal(signal.SIGTERM,
                                  self.record_aggregate_metrics_and_exit)
                    self._schedule()
                    commit_start = time.time()

                if self.prefix == "task_manager":
                    self.subsystem_metrics.set(f"{self.prefix}_commit_seconds",
                                               time.time() - commit_start)
                self.record_aggregate_metrics()
                logger.debug(f"Finishing {self.prefix} Scheduler")
Example #22
File: system.py Project: mahak/awx
def apply_cluster_membership_policies():
    from awx.main.signals import disable_activity_stream

    started_waiting = time.time()
    with advisory_lock('cluster_policy_lock', wait=True):
        lock_time = time.time() - started_waiting
        if lock_time > 1.0:
            to_log = logger.info
        else:
            to_log = logger.debug
        to_log('Waited {} seconds to obtain lock name: cluster_policy_lock'.format(lock_time))
        started_compute = time.time()
        # Hop nodes should never get assigned to an InstanceGroup.
        all_instances = list(Instance.objects.exclude(node_type='hop').order_by('id'))
        all_groups = list(InstanceGroup.objects.prefetch_related('instances'))

        total_instances = len(all_instances)
        actual_groups = []
        actual_instances = []
        Group = namedtuple('Group', ['obj', 'instances', 'prior_instances'])
        Node = namedtuple('Node', ['obj', 'groups'])

        # Process policy instance list first, these will represent manually managed memberships
        instance_hostnames_map = {inst.hostname: inst for inst in all_instances}
        for ig in all_groups:
            group_actual = Group(obj=ig, instances=[], prior_instances=[instance.pk for instance in ig.instances.all()])  # obtained in prefetch
            for hostname in ig.policy_instance_list:
                if hostname not in instance_hostnames_map:
                    logger.info("Unknown instance {} in {} policy list".format(hostname, ig.name))
                    continue
                inst = instance_hostnames_map[hostname]
                group_actual.instances.append(inst.id)
                # NOTE: arguable behavior: policy-list-group is not added to
                # instance's group count for consideration in minimum-policy rules
            if group_actual.instances:
                logger.debug("Policy List, adding Instances {} to Group {}".format(group_actual.instances, ig.name))

            actual_groups.append(group_actual)

        # Process Instance minimum policies next, since it represents a concrete lower bound to the
        # number of instances to make available to instance groups
        actual_instances = [Node(obj=i, groups=[]) for i in all_instances if i.managed_by_policy]
        logger.debug("Total instances: {}, available for policy: {}".format(total_instances, len(actual_instances)))
        for g in sorted(actual_groups, key=lambda x: len(x.instances)):
            exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control'
            policy_min_added = []
            for i in sorted(actual_instances, key=lambda x: len(x.groups)):
                if i.obj.node_type == exclude_type:
                    continue  # never place execution instances in controlplane group or control instances in other groups
                if len(g.instances) >= g.obj.policy_instance_minimum:
                    break
                if i.obj.id in g.instances:
                    # If the instance is already _in_ the group, it was
                    # applied earlier via the policy list
                    continue
                g.instances.append(i.obj.id)
                i.groups.append(g.obj.id)
                policy_min_added.append(i.obj.id)
            if policy_min_added:
                logger.debug("Policy minimum, adding Instances {} to Group {}".format(policy_min_added, g.obj.name))

        # Finally, process instance policy percentages
        for g in sorted(actual_groups, key=lambda x: len(x.instances)):
            exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control'
            candidate_pool_ct = sum(1 for i in actual_instances if i.obj.node_type != exclude_type)
            if not candidate_pool_ct:
                continue
            policy_per_added = []
            for i in sorted(actual_instances, key=lambda x: len(x.groups)):
                if i.obj.node_type == exclude_type:
                    continue
                if i.obj.id in g.instances:
                    # If the instance is already _in_ the group, it was
                    # applied earlier via a minimum policy or policy list
                    continue
                if 100 * float(len(g.instances)) / candidate_pool_ct >= g.obj.policy_instance_percentage:
                    break
                g.instances.append(i.obj.id)
                i.groups.append(g.obj.id)
                policy_per_added.append(i.obj.id)
            if policy_per_added:
                logger.debug("Policy percentage, adding Instances {} to Group {}".format(policy_per_added, g.obj.name))

        # Determine if any changes need to be made
        needs_change = False
        for g in actual_groups:
            if set(g.instances) != set(g.prior_instances):
                needs_change = True
                break
        if not needs_change:
            logger.debug('Cluster policy no-op finished in {} seconds'.format(time.time() - started_compute))
            return

        # On a differential basis, apply instances to groups
        with transaction.atomic():
            with disable_activity_stream():
                for g in actual_groups:
                    if g.obj.is_container_group:
                        logger.debug('Skipping containerized group {} for policy calculation'.format(g.obj.name))
                        continue
                    instances_to_add = set(g.instances) - set(g.prior_instances)
                    instances_to_remove = set(g.prior_instances) - set(g.instances)
                    if instances_to_add:
                        logger.debug('Adding instances {} to group {}'.format(list(instances_to_add), g.obj.name))
                        g.obj.instances.add(*instances_to_add)
                    if instances_to_remove:
                        logger.debug('Removing instances {} from group {}'.format(list(instances_to_remove), g.obj.name))
                        g.obj.instances.remove(*instances_to_remove)
        logger.debug('Cluster policy computation finished in {} seconds'.format(time.time() - started_compute))
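A worked example of the percentage cut-off in the loop above: with a candidate pool of 10 instances and policy_instance_percentage = 30, assignment stops once the third instance joins, since 100 * 3 / 10 >= 30. A standalone check:

candidate_pool_ct = 10
policy_instance_percentage = 30
assigned = 0
# Mirrors the loop's stopping condition on group membership percentage.
while 100 * float(assigned) / candidate_pool_ct < policy_instance_percentage:
    assigned += 1
assert assigned == 3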
Example #23
    def perform_update(self, options, data, inventory_update):
        """Shared method for both awx-manage CLI updates and inventory updates
        from the tasks system.

        This saves the inventory data to the database, calling load_into_database
        but also wraps that method in a host of options processing
        """
        # outside of the normal options, these are needed as part of the programmatic interface
        self.inventory = inventory_update.inventory
        self.inventory_source = inventory_update.inventory_source
        self.inventory_update = inventory_update

        # the update options, could be parser object or dict
        self.overwrite = bool(options.get('overwrite', False))
        self.overwrite_vars = bool(options.get('overwrite_vars', False))
        self.enabled_var = options.get('enabled_var', None)
        self.enabled_value = options.get('enabled_value', None)
        self.group_filter = options.get('group_filter', None) or r'^.+$'
        self.host_filter = options.get('host_filter', None) or r'^.+$'
        self.exclude_empty_groups = bool(options.get('exclude_empty_groups', False))
        self.instance_id_var = options.get('instance_id_var', None)

        try:
            self.group_filter_re = re.compile(self.group_filter)
        except re.error:
            raise CommandError('invalid regular expression for --group-filter')
        try:
            self.host_filter_re = re.compile(self.host_filter)
        except re.error:
            raise CommandError('invalid regular expression for --host-filter')

        begin = time.time()

        # Since perform_update can be invoked either through the awx-manage CLI
        # or from the task system, we need to create a new lock at this level
        # (even though inventory_import.Command.handle -- which calls
        # perform_update -- has its own lock, inventory_ID_import)
        with advisory_lock('inventory_{}_perform_update'.format(self.inventory.id)):

            try:
                self.check_license()
            except PermissionDenied as e:
                self.mark_license_failure(save=True)
                raise e

            try:
                # Check the per-org host limits
                self.check_org_host_limit()
            except PermissionDenied as e:
                self.mark_org_limits_failure(save=True)
                raise e

            if settings.SQL_DEBUG:
                queries_before = len(connection.queries)

            # Update inventory update for this command line invocation.
            with ignore_inventory_computed_fields():
                # TODO: move this to before perform_update
                iu = self.inventory_update
                if iu.status != 'running':
                    with transaction.atomic():
                        self.inventory_update.status = 'running'
                        self.inventory_update.save()

            logger.info('Processing JSON output...')
            inventory = MemInventory(group_filter_re=self.group_filter_re, host_filter_re=self.host_filter_re)
            inventory = dict_to_mem_data(data, inventory=inventory)

            logger.info('Loaded %d groups, %d hosts', len(inventory.all_group.all_groups), len(inventory.all_group.all_hosts))

            if self.exclude_empty_groups:
                inventory.delete_empty_groups()

            self.all_group = inventory.all_group

            if settings.DEBUG:
                # depending on inventory source, this output can be
                # *exceedingly* verbose - crawling a deeply nested
                # inventory/group data structure and printing metadata about
                # each host and its memberships
                #
                # it's easy for this scale of data to overwhelm pexpect,
                # (and it's likely only useful for purposes of debugging the
                # actual inventory import code), so only print it if we have to:
                # https://github.com/ansible/ansible-tower/issues/7414#issuecomment-321615104
                self.all_group.debug_tree()

            with batch_role_ancestor_rebuilding():
                # If using with transaction.atomic() with try ... catch,
                # with transaction.atomic() must be inside the try section of the code as per Django docs
                try:
                    # Ensure that this is managed as an atomic SQL transaction,
                    # and thus properly rolled back if there is an issue.
                    with transaction.atomic():
                        # Merge/overwrite inventory into database.
                        if settings.SQL_DEBUG:
                            logger.warning('loading into database...')
                        with ignore_inventory_computed_fields():
                            if getattr(settings, 'ACTIVITY_STREAM_ENABLED_FOR_INVENTORY_SYNC', True):
                                self.load_into_database()
                            else:
                                with disable_activity_stream():
                                    self.load_into_database()
                            if settings.SQL_DEBUG:
                                queries_before2 = len(connection.queries)
                            self.inventory.update_computed_fields()
                        if settings.SQL_DEBUG:
                            logger.warning('update computed fields took %d queries', len(connection.queries) - queries_before2)

                        # Check if the license is valid.
                        # If the license is not valid, a CommandError will be thrown,
                        # and inventory update will be marked as invalid.
                        # with transaction.atomic() will roll back the changes.
                        license_fail = True
                        self.check_license()

                        # Check the per-org host limits
                        license_fail = False
                        self.check_org_host_limit()
                except PermissionDenied as e:
                    if license_fail:
                        self.mark_license_failure(save=True)
                    else:
                        self.mark_org_limits_failure(save=True)
                    raise e

                if settings.SQL_DEBUG:
                    logger.warning('Inventory import completed for %s in %0.1fs', self.inventory_source.name, time.time() - begin)
                else:
                    logger.info('Inventory import completed for %s in %0.1fs', self.inventory_source.name, time.time() - begin)

            # If we're in debug mode, then log the queries and time
            # used to do the operation.
            if settings.SQL_DEBUG:
                queries_this_import = connection.queries[queries_before:]
                sqltime = sum(float(x['time']) for x in queries_this_import)
                logger.warning('Inventory import required %d queries taking %0.3fs', len(queries_this_import), sqltime)
Example #24
File: core.py Project: timkids/awx
def gather(dest=None, module=None, subset=None, since=None, until=None, collection_type='scheduled'):
    """
    Gather all defined metrics and write them as JSON files in a .tgz

    :param dest:    the (optional) absolute path to write a compressed tarball
    :param module:  the module to search for registered analytic collector
                    functions; defaults to awx.main.analytics.collectors
    :param subset:  (optional) list of collector names to restrict gathering to
    :param since:   (optional) start of the collection interval
    :param until:   (optional) end of the collection interval
    :param collection_type: 'scheduled' (the default) ships the collected data;
                    'dry-run' gathers locally without shipping
    """
    log_level = logging.ERROR if collection_type != 'scheduled' else logging.DEBUG

    if not _valid_license():
        logger.log(log_level, "Invalid License provided, or No License Provided")
        return None

    if collection_type != 'dry-run':
        if not settings.INSIGHTS_TRACKING_STATE:
            logger.log(log_level, "Automation Analytics not enabled. Use --dry-run to gather locally without sending.")
            return None

        if not (settings.AUTOMATION_ANALYTICS_URL and settings.REDHAT_USERNAME and settings.REDHAT_PASSWORD):
            logger.log(log_level, "Not gathering analytics, configuration is invalid. Use --dry-run to gather locally without sending.")
            return None

    with advisory_lock('gather_analytics_lock', wait=False) as acquired:
        if not acquired:
            logger.log(log_level, "Not gathering analytics, another task holds lock")
            return None

        from awx.conf.models import Setting
        from awx.main.analytics import collectors
        from awx.main.signals import disable_activity_stream

        logger.debug("Last analytics run was: {}".format(settings.AUTOMATION_ANALYTICS_LAST_GATHER))

        try:
            since, until, last_gather = calculate_collection_interval(since, until)
        except ValueError:
            return None

        last_entries = Setting.objects.filter(key='AUTOMATION_ANALYTICS_LAST_ENTRIES').first()
        last_entries = json.loads((last_entries.value if last_entries is not None else '') or '{}', object_hook=datetime_hook)

        collector_module = module if module else collectors
        collector_list = [
            func
            for name, func in inspect.getmembers(collector_module)
            if inspect.isfunction(func) and hasattr(func, '__awx_analytics_key__') and (not subset or name in subset)
        ]
        if not any(c.__awx_analytics_key__ == 'config' for c in collector_list):
            # In order to ship to analytics, we must include the output of the built-in 'config' collector.
            collector_list.append(collectors.config)

        json_collectors = [func for func in collector_list if func.__awx_analytics_type__ == 'json']
        csv_collectors = [func for func in collector_list if func.__awx_analytics_type__ == 'csv']

        dest = pathlib.Path(dest or tempfile.mkdtemp(prefix='awx_analytics'))
        gather_dir = dest.joinpath('stage')
        gather_dir.mkdir(mode=0o700)
        tarfiles = []
        succeeded = True

        # These json collectors are pretty compact, so collect all of them before shipping to analytics.
        data = {}
        for func in json_collectors:
            key = func.__awx_analytics_key__
            filename = f'{key}.json'
            try:
                last_entry = max(last_entries.get(key) or last_gather, until - timedelta(weeks=4))
                results = (func(since or last_entry, collection_type=collection_type, until=until), func.__awx_analytics_version__)
                json.dumps(results, cls=DjangoJSONEncoder)  # throwaway check to see if the data is json-serializable
                data[filename] = results
            except Exception:
                logger.exception("Could not generate metric {}".format(filename))
        if data:
            if data.get('config.json') is None:
                logger.error("'config' collector data is missing.")
                return None

            tgzfile = package(dest.parent, data, until)
            if tgzfile is not None:
                tarfiles.append(tgzfile)
                if collection_type != 'dry-run':
                    if ship(tgzfile):
                        with disable_activity_stream():
                            for filename in data:
                                key = filename.replace('.json', '')
                                last_entries[key] = max(last_entries[key], until) if last_entries.get(key) else until
                            settings.AUTOMATION_ANALYTICS_LAST_ENTRIES = json.dumps(last_entries, cls=DjangoJSONEncoder)
                    else:
                        succeeded = False

        for func in csv_collectors:
            key = func.__awx_analytics_key__
            filename = f'{key}.csv'
            try:
                # These slicer functions may return a generator. The `since` parameter is
                # allowed to be None, and will fall back to LAST_ENTRIES[key] or to
                # LAST_GATHER (truncated appropriately to match the 4-week limit).
                if func.__awx_expensive__:
                    slices = func.__awx_expensive__(key, since, until, last_gather)
                else:
                    slices = collectors.trivial_slicing(key, since, until, last_gather)

                for start, end in slices:
                    files = func(start, full_path=gather_dir, until=end)

                    if not files:
                        if collection_type != 'dry-run':
                            with disable_activity_stream():
                                entry = last_entries.get(key)
                                last_entries[key] = max(entry, end) if entry and type(entry) == type(end) else end
                                settings.AUTOMATION_ANALYTICS_LAST_ENTRIES = json.dumps(last_entries, cls=DjangoJSONEncoder)
                        continue

                    slice_succeeded = True
                    for fpath in files:
                        payload = {filename: (fpath, func.__awx_analytics_version__)}

                        payload['config.json'] = data.get('config.json')
                        if payload['config.json'] is None:
                            logger.error("'config' collector data is missing, and is required to ship.")
                            return None

                        tgzfile = package(dest.parent, payload, until)
                        if tgzfile is not None:
                            tarfiles.append(tgzfile)
                            if collection_type != 'dry-run':
                                if not ship(tgzfile):
                                    slice_succeeded, succeeded = False, False
                                    break

                    if slice_succeeded and collection_type != 'dry-run':
                        with disable_activity_stream():
                            entry = last_entries.get(key)
                            last_entries[key] = max(entry, end) if entry and type(entry) == type(end) else end
                            settings.AUTOMATION_ANALYTICS_LAST_ENTRIES = json.dumps(last_entries, cls=DjangoJSONEncoder)
            except Exception:
                succeeded = False
                logger.exception("Could not generate metric {}".format(filename))

        if collection_type != 'dry-run':
            if succeeded:
                for fpath in tarfiles:
                    if os.path.exists(fpath):
                        os.remove(fpath)
            with disable_activity_stream():
                if not settings.AUTOMATION_ANALYTICS_LAST_GATHER or until > settings.AUTOMATION_ANALYTICS_LAST_GATHER:
                    # `AUTOMATION_ANALYTICS_LAST_GATHER` is set whether collection succeeds or fails;
                    # if collection fails because of a persistent, underlying issue and we do not set last_gather,
                    # we risk the collectors hitting an increasingly greater workload while the underlying issue
                    # remains unresolved. Put simply, if collection fails, we just move on.

                    # All that said, `AUTOMATION_ANALYTICS_LAST_GATHER` plays a much smaller role in determining
                    # what is actually collected than it used to; collectors now mostly rely on their respective entry
                    # under `last_entries` to determine what should be collected.
                    settings.AUTOMATION_ANALYTICS_LAST_GATHER = until

        shutil.rmtree(dest, ignore_errors=True)  # clean up individual artifact files
        if not tarfiles:
            # No data was collected
            logger.warning("No data from {} to {}".format(since or last_gather, until))
            return None

        return tarfiles
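Based on the signature and docstring above, a local dry run (gather without shipping) might look like this sketch; the returned paths depend on the temporary directory chosen at runtime:

# Dry-run collection returns the tarball paths, or None when nothing
# was collected or a prerequisite (license, settings) failed.
tarballs = gather(collection_type='dry-run')
for path in tarballs or []:
    print(path)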
Example #25
    def handle(self, *args, **options):
        # Load inventory and related objects from database.
        inventory_name = options.get('inventory_name', None)
        inventory_id = options.get('inventory_id', None)
        if inventory_name and inventory_id:
            raise CommandError('--inventory-name and --inventory-id are mutually exclusive')
        elif not inventory_name and not inventory_id:
            raise CommandError('--inventory-name or --inventory-id is required')

        with advisory_lock('inventory_{}_import'.format(inventory_id)):
            # Obtain rest of the options needed to run update
            raw_source = options.get('source', None)
            if not raw_source:
                raise CommandError('--source is required')
            verbosity = int(options.get('verbosity', 1))
            self.set_logging_level(verbosity)

            # Load inventory object based on name or ID.
            if inventory_id:
                q = dict(id=inventory_id)
            else:
                q = dict(name=inventory_name)
            try:
                inventory = Inventory.objects.get(**q)
            except Inventory.DoesNotExist:
                raise CommandError('Inventory with %s = %s cannot be found' % list(q.items())[0])
            except Inventory.MultipleObjectsReturned:
                raise CommandError('Inventory with %s = %s returned multiple results' % list(q.items())[0])
            logger.info('Updating inventory %d: %s' % (inventory.pk, inventory.name))

            # Create ad-hoc inventory source and inventory update objects
            with ignore_inventory_computed_fields():
                source = Command.get_source_absolute_path(raw_source)

                inventory_source, created = InventorySource.objects.get_or_create(
                    inventory=inventory,
                    source='file',
                    source_path=os.path.abspath(source),
                    overwrite=bool(options.get('overwrite', False)),
                    overwrite_vars=bool(options.get('overwrite_vars', False)),
                )
                inventory_update = inventory_source.create_inventory_update(
                    _eager_fields=dict(job_args=json.dumps(sys.argv), job_env=dict(os.environ.items()), job_cwd=os.getcwd())
                )

            data = AnsibleInventoryLoader(source=source, verbosity=verbosity).load()

            logger.debug('Finished loading from source: %s', source)

            status, tb, exc = 'error', '', None
            try:
                self.perform_update(options, data, inventory_update)
                status = 'successful'
            except Exception as e:
                exc = e
                if isinstance(e, KeyboardInterrupt):
                    status = 'canceled'
                else:
                    tb = traceback.format_exc()

            with ignore_inventory_computed_fields():
                inventory_update = InventoryUpdate.objects.get(pk=inventory_update.pk)
                inventory_update.result_traceback = tb
                inventory_update.status = status
                inventory_update.save(update_fields=['status', 'result_traceback'])
                inventory_source.status = status
                inventory_source.save(update_fields=['status'])

        if exc:
            logger.error(str(exc))
            if isinstance(exc, CommandError):
                sys.exit(1)
            raise exc