Example #1
def _delete_cloud_accounts(cloud_accounts):
    """
    Delete the given list of CloudAccount objects.

    Args:
        cloud_accounts (list[CloudAccount]): cloud accounts to delete

    """
    for cloud_account in cloud_accounts:
        # Lock on the user level, so that a single user can only have one task
        # running at a time.
        #
        # The select_for_update() lock has been moved from the CloudAccount to the
        # UserTaskLock. We should release the UserTaskLock with each
        # cloud_account.delete action.
        #
        # Using the UserTaskLock *should* fix the issue of Django not getting a
        # row-level lock in the DB for each CloudAccount we want to delete until
        # after all of the pre_delete logic completes.
        with lock_task_for_user_ids([cloud_account.user.id]):
            # Call delete on the CloudAccount queryset instead of the specific
            # cloud_account. Why? A queryset delete does not raise DoesNotExist
            # exceptions if the cloud_account has already been deleted.
            # If we call delete on a nonexistent cloud_account, we run into trouble
            # with Django rollback and our task lock.
            # See https://gitlab.com/cloudigrade/cloudigrade/-/merge_requests/811
            try:
                cloud_account.refresh_from_db()
                CloudAccount.objects.filter(id=cloud_account.id).delete()
            except CloudAccount.DoesNotExist:
                logger.info(_("Cloud Account %s has already been deleted"),
                            cloud_account)
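
The queryset-versus-instance delete distinction described in the comments above is easy to demonstrate in isolation. A minimal sketch, reusing the CloudAccount model from this example (the id value is hypothetical):

    account = CloudAccount.objects.get(id=42)
    CloudAccount.objects.filter(id=42).delete()  # deletes the row

    # A second queryset delete is a harmless no-op: it matches zero rows
    # and returns (0, {}) instead of raising.
    CloudAccount.objects.filter(id=42).delete()

    # The instance-level path raises once the row is gone, which is what
    # the refresh_from_db() call above guards against.
    account.refresh_from_db()  # raises CloudAccount.DoesNotExist
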
Example #2
    def test_lock_task_for_user_ids_updates_usertasklock(self):
        """Assert UserTaskLock is updated by context manager."""
        user = util_helper.generate_test_user()
        UserTaskLock.objects.create(user=user)
        with lock_task_for_user_ids([user.id]) as locks:
            for lock in locks:
                self.assertEqual(lock.locked, True)
        lock = UserTaskLock.objects.get(user=user)
        self.assertEqual(lock.locked, False)
Example #3
    def test_lock_task_for_user_ids_create_usertasklock(self):
        """Assert UserTaskLock is created by context manager."""
        user1 = util_helper.generate_test_user()
        user2 = util_helper.generate_test_user()

        with lock_task_for_user_ids([user1.id, user2.id]):
            locks = UserTaskLock.objects.all()
            for lock in locks:
                self.assertEqual(lock.locked, True)
        locks = UserTaskLock.objects.all()
        for lock in locks:
            self.assertEqual(lock.locked, False)
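
Examples #2 and #3 pin down the observable behavior of lock_task_for_user_ids: it creates any missing UserTaskLock rows, holds them with locked=True inside the block, and leaves them unlocked afterward. A minimal sketch of a context manager with that behavior, assuming it wraps a transaction and uses select_for_update() as the comments in Example #1 suggest (the actual cloudigrade implementation may differ):

    from contextlib import contextmanager

    from django.db import transaction

    @contextmanager
    def lock_task_for_user_ids(user_ids):
        """Acquire row-level UserTaskLock locks for the given users."""
        with transaction.atomic():
            # Create any missing lock rows (Example #3 relies on this).
            for user_id in user_ids:
                UserTaskLock.objects.get_or_create(user_id=user_id)
            # SELECT ... FOR UPDATE blocks until no other transaction holds
            # these rows, so each user runs at most one task at a time.
            locks = UserTaskLock.objects.select_for_update().filter(
                user__id__in=user_ids
            )
            locks.update(locked=True)
            yield locks
            # An exception inside the block rolls the transaction back,
            # which also undoes locked=True; this line only runs on success.
            locks.update(locked=False)
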
Example #4
    def save(self, *args, **kwargs):
        """Save this image and delete any related ConcurrentUsage objects."""
        concurrent_usages = ConcurrentUsage.objects.filter(
            potentially_related_runs__in=Run.objects.filter(machineimage=self)
        )
        if concurrent_usages.exists():
            # Lock all users that depend on this machineimage
            user_ids = set(
                Instance.objects.filter(machine_image=self).values_list(
                    "cloud_account__user__id", flat=True
                )
            )

            with lock_task_for_user_ids(user_ids):
                concurrent_usages.delete()
        return super().save(*args, **kwargs)
Example #5
    def test_delete_clount_lock_exception(self, mock_notify_sources):
        """Test that an exception when deleting a clount inside a lock rolls back."""
        aws_account_id = util_helper.generate_dummy_aws_account_id()
        arn = util_helper.generate_dummy_arn(account_id=aws_account_id)
        account = api_helper.generate_cloud_account(
            arn=arn,
            aws_account_id=aws_account_id,
            name="test",
            generate_verify_task=False,
        )
        UserTaskLock.objects.create(user=account.user)
        with self.assertRaises(transaction.TransactionManagementError):
            with lock_task_for_user_ids([account.user.id]):
                CloudAccount.objects.filter(id=account.id).delete()
                raise transaction.TransactionManagementError

        self.assertEqual(CloudAccount.objects.all().count(), 1)
        self.assertFalse(UserTaskLock.objects.get(user=account.user).locked)
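
The rollback asserted here follows directly from lock_task_for_user_ids opening a transaction (as sketched after Example #3): any exception raised inside the block aborts the whole transaction, including the delete. The same effect with a bare atomic block and a hypothetical id:

    from django.db import transaction

    try:
        with transaction.atomic():
            CloudAccount.objects.filter(id=42).delete()
            raise RuntimeError("anything raised here aborts the transaction")
    except RuntimeError:
        pass

    # The delete was rolled back, so the row still exists.
    assert CloudAccount.objects.filter(id=42).exists()
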
Example #6
    def test_delete_clount_lock(self, mock_notify_sources,
                                mock_delete_cloudtrail):
        """Test that deleting an clount inside a lock is successful."""
        aws_account_id = util_helper.generate_dummy_aws_account_id()
        arn = util_helper.generate_dummy_arn(account_id=aws_account_id)
        account = api_helper.generate_cloud_account(
            arn=arn,
            aws_account_id=aws_account_id,
            name="test",
            generate_verify_task=False,
        )

        with lock_task_for_user_ids([account.user.id]):
            CloudAccount.objects.filter(id=account.id).delete()

        self.assertEqual(CloudAccount.objects.all().count(), 0)
        self.assertFalse(
            UserTaskLock.objects.filter(user=account.user).exists())
Example #7
    def save(self, *args, **kwargs):
        """Save this image and delete any related ConcurrentUsage objects."""
        concurrent_usages = ConcurrentUsage.objects.filter(
            potentially_related_runs__in=Run.objects.filter(machineimage=self))
        if concurrent_usages.exists():
            # Lock all users that depend on this machineimage
            user_ids = set(
                Instance.objects.filter(machine_image=self).values_list(
                    "cloud_account__user__id", flat=True))

            with lock_task_for_user_ids(user_ids):
                logger.info(
                    "Removing %(num_usages)d related ConcurrentUsage objects "
                    "related to Run %(run)s.",
                    {
                        "num_usages": concurrent_usages.count(),
                        "run": str(self)
                    },
                )
                concurrent_usages.delete()
        return super().save(*args, **kwargs)
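
A small point about the logging style used here and throughout these examples: the mapping is passed as the logging argument rather than %-formatted into the message, so interpolation is deferred until a handler actually emits the record and is skipped entirely when the level is filtered out. A minimal illustration:

    import logging

    logger = logging.getLogger(__name__)

    # The dict is passed as the single logging argument; the %(name)s
    # placeholders are filled in only if the record is actually emitted.
    logger.info(
        "Removing %(num_usages)d ConcurrentUsage objects for %(user)s.",
        {"num_usages": 3, "user": "example-user"},
    )
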
Example #8
def _save_cloudtrail_activity(instance_events, ami_tag_events,
                              described_instances, described_images):
    """
    Save new images and instances events found via CloudTrail to the DB.

    The order of operations here generally looks like:

        1. Save new images.
        2. Save tag changes for images.
        3. Save new instances.
        4. Save events for instances.

    Note:
        Nothing should be reaching out to AWS APIs in this function! We should
        have all the necessary information already, and this function saves all
        of it atomically in a single transaction.

    Args:
        instance_events (list[CloudTrailInstanceEvent]): found instance events
        ami_tag_events (list[CloudTrailImageTagEvent]): found ami tag events
        described_instances (dict): described new-to-us AWS instances keyed by
            EC2 instance ID
        described_images (dict): described new-to-us AMIs keyed by AMI ID

    Returns:
        dict: Only the new images that were created in the process.

    """
    # Log some basic information about what we're saving.
    log_prefix = "analyzer"

    # Lock all user accounts related to the instance events being processed.
    # A user can only run one task at a time.
    all_user_ids = set(
        [
            AwsCloudAccount.objects.get(
                aws_account_id=instance_event.aws_account_id
            ).cloud_account.get().user.id
            for instance_event in instance_events
        ]
        + [
            AwsCloudAccount.objects.get(
                aws_account_id=ami_tag_event.aws_account_id
            ).cloud_account.get().user.id
            for ami_tag_event in ami_tag_events
        ]
    )
    with lock_task_for_user_ids(all_user_ids):

        # Save instances and their events.
        all_ec2_instance_ids = set([
            instance_event.ec2_instance_id
            for instance_event in instance_events
            if instance_event.ec2_instance_id is not None
        ])
        logger.info(
            _("%(prefix)s: EC2 Instance IDs found: %(all_ec2_instance_ids)s"),
            {
                "prefix": log_prefix,
                "all_ec2_instance_ids": all_ec2_instance_ids
            },
        )

        all_ami_ids = set([
            instance_event.ec2_ami_id for instance_event in instance_events
            if instance_event.ec2_ami_id is not None
        ] + [
            ami_tag_event.ec2_ami_id for ami_tag_event in ami_tag_events
            if ami_tag_event.ec2_ami_id is not None
        ] + [ec2_ami_id for ec2_ami_id in described_images.keys()])
        logger.info(
            _("%(prefix)s: EC2 AMI IDs found: %(all_ami_ids)s"),
            {
                "prefix": log_prefix,
                "all_ami_ids": all_ami_ids
            },
        )

        # Which images have the Windows platform?
        windows_ami_ids = {
            ami_id
            for ami_id, described_ami in described_images.items()
            if is_windows(described_ami)
        }
        logger.info(
            _("%(prefix)s: Windows AMI IDs found: %(windows_ami_ids)s"),
            {
                "prefix": log_prefix,
                "windows_ami_ids": windows_ami_ids
            },
        )

        # Which images need tag state changes?
        ocp_tagged_ami_ids, ocp_untagged_ami_ids = _extract_ami_ids_by_tag_change(
            ami_tag_events, OPENSHIFT_TAG)
        logger.info(
            _("%(prefix)s: AMIs found tagged for OCP: %(ocp_tagged_ami_ids)s"),
            {
                "prefix": log_prefix,
                "ocp_tagged_ami_ids": ocp_tagged_ami_ids
            },
        )
        logger.info(
            _("%(prefix)s: AMIs found untagged for OCP: %(ocp_untagged_ami_ids)s"
              ),
            {
                "prefix": log_prefix,
                "ocp_untagged_ami_ids": ocp_untagged_ami_ids
            },
        )

        rhel_tagged_ami_ids, rhel_untagged_ami_ids = _extract_ami_ids_by_tag_change(
            ami_tag_events, RHEL_TAG)
        logger.info(
            _("%(prefix)s: AMIs found tagged for RHEL: %(rhel_tagged_ami_ids)s"
              ),
            {
                "prefix": log_prefix,
                "rhel_tagged_ami_ids": rhel_tagged_ami_ids
            },
        )
        logger.info(
            _("%(prefix)s: AMIs found untagged for RHEL: %(rhel_untagged_ami_ids)s"
              ),
            {
                "prefix": log_prefix,
                "rhel_untagged_ami_ids": rhel_untagged_ami_ids
            },
        )

        # Create only the new images.
        new_images = {}
        for ami_id, described_image in described_images.items():
            owner_id = Decimal(described_image["OwnerId"])
            name = described_image["Name"]
            architecture = described_image.get("Architecture")
            windows = ami_id in windows_ami_ids
            rhel_detected_by_tag = ami_id in rhel_tagged_ami_ids
            openshift_detected = ami_id in ocp_tagged_ami_ids
            region = described_image["found_in_region"]

            logger.info(
                _("%(prefix)s: Saving new AMI ID %(ami_id)s in region %(region)s"
                  ),
                {
                    "prefix": log_prefix,
                    "ami_id": ami_id,
                    "region": region
                },
            )
            awsimage, new = save_new_aws_machine_image(
                ami_id,
                name,
                owner_id,
                rhel_detected_by_tag,
                openshift_detected,
                windows,
                region,
                architecture,
            )

            image = awsimage.machine_image.get()
            if new and image.status != image.INSPECTED:
                new_images[ami_id] = awsimage

        # Create "unavailable" images for AMIs we saw referenced but that we either
        # don't have in our models or could not describe from AWS.
        seen_ami_ids = set([
            described_instance["ImageId"]
            for described_instance in described_instances.values()
            if described_instance.get("ImageId") is not None
        ] + [
            ami_tag_event.ec2_ami_id for ami_tag_event in ami_tag_events
            if ami_tag_event.ec2_ami_id is not None
        ] + [
            instance_event.ec2_ami_id for instance_event in instance_events
            if instance_event.ec2_ami_id is not None
        ])
        described_ami_ids = set(described_images.keys())
        known_ami_ids = set(
            image.ec2_ami_id for image in AwsMachineImage.objects.filter(
                ec2_ami_id__in=list(seen_ami_ids - described_ami_ids)))
        unavailable_ami_ids = seen_ami_ids - described_ami_ids - known_ami_ids
        for ami_id in unavailable_ami_ids:
            logger.info(
                _("Missing image data for %s; creating UNAVAILABLE stub image."
                  ), ami_id)
            with transaction.atomic():
                awsmachineimage = AwsMachineImage.objects.create(
                    ec2_ami_id=ami_id)
                MachineImage.objects.create(status=MachineImage.UNAVAILABLE,
                                            content_object=awsmachineimage)
                awsmachineimage.machine_image.get()

        # Update images with openshift tag changes.
        if ocp_tagged_ami_ids:
            MachineImage.objects.filter(
                aws_machine_image__ec2_ami_id__in=ocp_tagged_ami_ids).update(
                    openshift_detected=True)
        if ocp_untagged_ami_ids:
            MachineImage.objects.filter(
                aws_machine_image__ec2_ami_id__in=ocp_untagged_ami_ids).update(
                    openshift_detected=False)

        # Update images with RHEL tag changes.
        if rhel_tagged_ami_ids:
            MachineImage.objects.filter(
                aws_machine_image__ec2_ami_id__in=rhel_tagged_ami_ids).update(
                    rhel_detected_by_tag=True)
        if rhel_untagged_ami_ids:
            MachineImage.objects.filter(
                aws_machine_image__ec2_ami_id__in=rhel_untagged_ami_ids
            ).update(rhel_detected_by_tag=False)

        # Save instances and their events.
        for ((ec2_instance_id, region, aws_account_id),
             events) in itertools.groupby(
                 instance_events,
                 key=lambda e: (e.ec2_instance_id, e.region, e.aws_account_id),
             ):
            events = list(events)

            if ec2_instance_id in described_instances:
                instance_data = described_instances[ec2_instance_id]
            else:
                instance_data = {
                    "InstanceId": ec2_instance_id,
                    "ImageId": events[0].ec2_ami_id,
                    "SubnetId": events[0].subnet_id,
                }
            logger.info(
                _("%(prefix)s: Saving new EC2 instance ID %(ec2_instance_id)s "
                  "for AWS account ID %(aws_account_id)s in region %(region)s"
                  ),
                {
                    "prefix": log_prefix,
                    "ec2_instance_id": ec2_instance_id,
                    "aws_account_id": aws_account_id,
                    "region": region,
                },
            )

            awsaccount = AwsCloudAccount.objects.get(
                aws_account_id=aws_account_id)
            account = awsaccount.cloud_account.get()
            instance = save_instance(account, instance_data, region)

            # Build a list of event data
            events_info = _build_events_info_for_saving(
                account, instance, events)
            save_instance_events(instance, instance_data, events_info)

    return new_images
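
One caveat about the itertools.groupby call near the end of this function: groupby only merges consecutive items that share a key, so instance_events presumably arrives sorted by (ec2_instance_id, region, aws_account_id); unsorted input would split one instance's events across several groups. A minimal illustration:

    import itertools

    events = [("i-1", "run"), ("i-2", "run"), ("i-1", "stop")]

    # Without this sort, groupby would yield "i-1" twice because its two
    # items are not adjacent in the list.
    events.sort(key=lambda event: event[0])
    for instance_id, group in itertools.groupby(events, key=lambda event: event[0]):
        print(instance_id, [action for _, action in group])
    # i-1 ['run', 'stop']
    # i-2 ['run']
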
Example #9
def initial_aws_describe_instances(account_id):
    """
    Fetch and save instances data found upon AWS cloud account creation.

    Args:
        account_id (int): the AwsAccount id
    """
    try:
        aws_account = AwsCloudAccount.objects.get(pk=account_id)
    except AwsCloudAccount.DoesNotExist:
        logger.warning(
            _("AwsCloudAccount id %s could not be found for initial describe"),
            account_id,
        )
        # This can happen if a customer creates and then quickly deletes their
        # cloud account before this async task has started to run. Early exit!
        return

    account = aws_account.cloud_account.get()
    if not account.is_enabled:
        logger.warning(
            _("AwsCloudAccount id %s is not enabled; skipping initial describe"
              ),
            account_id,
        )
    # This can happen if a customer creates and then quickly disables their
        # cloud account before this async task has started to run. Early exit!
        return
    arn = aws_account.account_arn

    session = aws.get_session(arn)
    instances_data = aws.describe_instances_everywhere(session)

    try:
        user_id = account.user.id
    except User.DoesNotExist:
        logger.info(
            _("User for account id %s has already been deleted; "
              "skipping initial describe."),
            account_id,
        )
        # This can happen if a customer creates and then quickly deletes their
        # cloud account before this async task has started to run. If the user has
        # no other cloud accounts the user will also be deleted. Early exit!
        return

    # Lock the task at a user level. A user can only run one task at a time.
    with lock_task_for_user_ids([user_id]):
        try:
            # Explicitly "get" the related AwsCloudAccount before proceeding.
            # We do this at the start of this transaction in case the account has been
            # deleted during the potentially slow describe_instances_everywhere above.
            # If this fails, we'll jump to the except block to log an important warning.
            AwsCloudAccount.objects.get(pk=account_id)

            create_missing_power_off_aws_instance_events(
                account, instances_data)
            new_ami_ids = create_new_machine_images(session, instances_data)
            logger.info(
                _("Created new machine images include: %(new_ami_ids)s"),
                {"new_ami_ids": new_ami_ids},
            )
            create_initial_aws_instance_events(account, instances_data)
        except AwsCloudAccount.DoesNotExist:
            logger.warning(
                _("AwsCloudAccount id %s could not be found to save newly "
                  "discovered images and instances"),
                account_id,
            )
            # This can happen if a customer deleted their cloud account between
            # the start of this function and here. The AWS calls for
            # describe_instances_everywhere may be slow and are not within this
            # transaction. That's why we have to check again after it.
            return

    messages = generate_aws_ami_messages(instances_data, new_ami_ids)
    for message in messages:
        start_image_inspection(str(arn), message["image_id"],
                               message["region"])
Example #10
def calculate_max_concurrent_usage_task(self, date, user_id):  # noqa: C901
    """
    Schedule a task to calculate maximum concurrent usage of RHEL instances.

    Args:
        self (celery.Task): The bound task. With this we can retry if necessary.
        date (str): the day during which we are measuring usage.
            Celery serializes the date as a string in the format "%Y-%m-%dT%H:%M:%S".
        user_id (int): required filter on user

    Returns:
        ConcurrentUsage for the given date and user ID.

    """
    task_id = self.request.id
    date = date_parser.parse(date).date()

    # Temporary logger.info to help diagnose retry issues.
    logger.info(
        "retries is %(retries)s for id %(id)s user_id %(user_id)s and date %(date)s.",
        {
            "retries": self.request.retries,
            "id": task_id,
            "user_id": user_id,
            "date": date,
        },
    )

    # If the user does not exist, all the related ConcurrentUsage
    # objects should also have been removed, so we can exit early.
    if not User.objects.filter(id=user_id).exists():
        return

    try:
        # Lock the task at a user level. A user can only run one task at a time.
        # Since this both starts a transaction and blocks any others from starting, we
        # can be reasonably confident that there are no other tasks processing for the
        # same user and date at the same time.
        with lock_task_for_user_ids([user_id]):
            try:
                calculation_task = ConcurrentUsageCalculationTask.objects.get(
                    task_id=task_id)
            except ConcurrentUsageCalculationTask.DoesNotExist:
                # It's possible but unlikely this task was deleted since its task was
                # delayed. Since the same user still exists, try scheduling a new task.
                logger.warning(
                    "ConcurrentUsageCalculationTask not found for task ID %(task_id)s! "
                    "Scheduling a new task for user_id %(user_id)s and date %(date)s.",
                    {
                        "task_id": task_id,
                        "user_id": user_id,
                        "date": date
                    },
                )
                schedule_concurrent_calculation_task(date, user_id)
                return

            if calculation_task.status != ConcurrentUsageCalculationTask.SCHEDULED:
                # It's possible but unlikely that something else has changed the status
                # of this task. If it's not currently SCHEDULED, log and return early.
                logger.info(
                    "ConcurrentUsageCalculationTask for task ID %(task_id)s for "
                    "user_id %(user_id)s and date %(date)s has status "
                    "%(status)s which is not SCHEDULED.",
                    {
                        "user_id": user_id,
                        "date": date,
                        "task_id": task_id,
                        "status": calculation_task.status,
                    },
                )
                return

            calculate_max_concurrent_usage(date, user_id)

            calculation_task.status = ConcurrentUsageCalculationTask.COMPLETE
            calculation_task.save()
            logger.info(
                "Completed calculate_max_concurrent_usage_task for user_id %(user_id)s "
                "and date %(date)s (task_id %(task_id)s).",
                {
                    "user_id": user_id,
                    "date": date,
                    "task_id": task_id
                },
            )
            return
    except Exception as unknown_exception:
        # It's unclear exactly what other exceptions might arise, but just to be safe,
        # let's log the trace, set the task's status to ERROR, and re-raise it.
        logger.warning(unknown_exception, exc_info=True)
        # Use this objects.filter().update() pattern so that we don't risk raising an
        # IntegrityError in case the object has somehow been deleted.
        ConcurrentUsageCalculationTask.objects.filter(task_id=task_id).update(
            status=ConcurrentUsageCalculationTask.ERROR)
        raise unknown_exception
Example #11
def calculate_max_concurrent_usage_task(self, date, user_id):
    """
    Schedule a task to calculate maximum concurrent usage of RHEL instances.

    Args:
        self (celery.Task): The bound task. With this we can retry if necessary.
        date (str): the day during which we are measuring usage.
            Celery serializes the date as a string in the format "%Y-%m-%dT%H:%M:%S".
        user_id (int): required filter on user

    Returns:
        ConcurrentUsage for the given date and user ID.

    """
    # Temporary logger.info to help diagnose retry issues.
    logger.info(
        "retries is %(retries)s for id %(id)s user_id %(user_id)s and date %(date)s.",
        {
            "retries": self.request.retries,
            "id": self.request.id,
            "user_id": user_id,
            "date": date,
        },
    )

    # If the user does not exist, all the related ConcurrentUsage
    # objects should also have been removed, so we can exit early.
    if not User.objects.filter(id=user_id).exists():
        return

    date = date_parser.parse(date).date()

    # If there is already a calculate_max_concurrent_usage task running for the
    # given user and date, then retry this task later.
    running_tasks = ConcurrentUsageCalculationTask.objects.filter(
        date=date,
        user__id=user_id,
        status=ConcurrentUsageCalculationTask.RUNNING)
    if running_tasks:
        logger.info(
            "calculate_max_concurrent_usage_task for user_id %(user_id)s "
            "and date %(date)s is already running. The current task will "
            "be retried later.",
            {
                "user_id": user_id,
                "date": date
            },
        )
        for task in running_tasks:
            logger.info("already running task %(task)s", {"task": task})
        self.retry()

    logger.info(
        "Running calculate_max_concurrent_usage_task for user_id %(user_id)s "
        "and date %(date)s.",
        {
            "user_id": user_id,
            "date": date
        },
    )

    # Set task to running
    task_id = self.request.id
    try:
        calculation_task = ConcurrentUsageCalculationTask.objects.get(
            task_id=task_id)
    except ConcurrentUsageCalculationTask.DoesNotExist:
        # This probably shouldn't happen, but this error suggests that it does:
        # https://sentry.io/organizations/cloudigrade/issues/2299804963/
        # Until we can figure out the root cause of tasks going missing, let's log an
        # error here with details and schedule a new calculation task.
        logger.error(
            'ConcurrentUsageCalculationTask not found for task ID "%(task_id)s"! '
            "Scheduling a new task for user_id %(user_id)s and date %(date)s.",
            {
                "task_id": task_id,
                "user_id": user_id,
                "date": date
            },
        )
        schedule_concurrent_calculation_task(date, user_id)
        return

    calculation_task.status = ConcurrentUsageCalculationTask.RUNNING
    calculation_task.save()

    try:
        # Lock the task at a user level. A user can only run one task at a time.
        # If another user task is already running, then don't start the
        # concurrent usage calculation task
        with lock_task_for_user_ids([user_id]):
            calculate_max_concurrent_usage(date, user_id)
    except Exception:
        calculation_task.status = ConcurrentUsageCalculationTask.ERROR
        calculation_task.save()
        raise

    calculation_task.status = ConcurrentUsageCalculationTask.COMPLETE
    calculation_task.save()
    logger.info(
        "Completed calculate_max_concurrent_usage_task for user_id %(user_id)s "
        "and date %(date)s.",
        {
            "user_id": user_id,
            "date": date
        },
    )
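
Examples #10 and #11 both rely on self.request and self.retry(), which means the function is registered as a bound Celery task; the decorator simply is not shown in these snippets. A minimal sketch of how such a task is typically declared (the retry settings here are illustrative, not taken from cloudigrade):

    from celery import shared_task

    @shared_task(bind=True, default_retry_delay=30, max_retries=5)
    def calculate_max_concurrent_usage_task(self, date, user_id):
        # bind=True passes the task instance as `self`, which exposes
        # self.request (task id, retry count) and self.retry().
        ...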