示例#1
0
class Task(TaskMixin, models.Model):
    """ Business tasks from project

    A task stores the imported ``data`` payload plus optional ``meta`` and
    tracks labeling progress through ``is_labeled`` and ``overlap``.
    Short-lived per-user locks are created as ``TaskLock`` rows and queried
    through the ``locks`` relation.
    """
    id = models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID', db_index=True)
    data = JSONField('data', null=False, help_text='User imported or uploaded data for a task. Data is formatted according to '
                                                   'the project label config. You can find examples of data for your project '
                                                   'on the Import page in the Label Studio Data Manager UI.')
    meta = JSONField('meta', null=True, default=dict,
                     help_text='Meta is user imported (uploaded) data and can be useful as input for an ML '
                               'Backend for embeddings, advanced vectors, and other info. It is passed to '
                               'ML during training/predicting steps.')
    project = models.ForeignKey('projects.Project', related_name='tasks', on_delete=models.CASCADE, null=True,
                                help_text='Project ID for this task')
    created_at = models.DateTimeField(_('created at'), auto_now_add=True, help_text='Time a task was created')
    updated_at = models.DateTimeField(_('updated at'), auto_now=True, help_text='Last time a task was updated')
    is_labeled = models.BooleanField(_('is_labeled'), default=False,
                                     help_text='True if the number of annotations for this task is greater than or equal '
                                               'to the number of maximum_completions for the project', db_index=True)
    overlap = models.IntegerField(_('overlap'), default=1, db_index=True,
                                  help_text='Number of distinct annotators that processed the current task')
    file_upload = models.ForeignKey(
        'data_import.FileUpload', on_delete=models.SET_NULL, null=True, blank=True, related_name='tasks',
        help_text='Uploaded file used as data source for this task'
    )
    # Names of the fields that reset_updates() / bulk_reset_updates() restore
    # to their model defaults.
    updates = ['is_labeled']

    objects = TaskManager()  # task manager by default
    prepared = PreparedTaskManager()  # task manager with filters, ordering, etc for data_manager app

    class Meta:
        db_table = 'task'
        ordering = ['-updated_at']
        indexes = [
            models.Index(fields=['project', 'is_labeled']),
            models.Index(fields=['id', 'overlap']),
            models.Index(fields=['overlap']),
            models.Index(fields=['is_labeled'])
        ]

    @property
    def file_upload_name(self):
        """Base name of the uploaded source file.

        Returns None when the task has no ``file_upload`` (the foreign key is
        nullable), instead of failing on attribute access.
        """
        if self.file_upload is None:
            return None
        return os.path.basename(self.file_upload.file.name)

    @classmethod
    def get_locked_by(cls, user, project=None, tasks=None):
        """ Retrieve the task locked by specified user. Returns None if the specified user didn't lock anything.

        :param user: user whose unexpired lock is looked up
        :param project: when given, search locks on tasks of this project
        :param tasks: task queryset to search; used only when ``project`` is None
        :raises ValueError: when neither ``project`` nor ``tasks`` is passed
        """
        if project is None and tasks is None:
            # ValueError is a subclass of Exception, so handlers written for
            # the previous bare Exception keep working.
            raise ValueError('Neither project or tasks passed to get_locked_by')

        if project is not None:
            lock = TaskLock.objects.filter(user=user, expire_at__gt=now(), task__project=project).first()
            return lock.task if lock else None

        locked_tasks = tasks.filter(locks__user=user, locks__expire_at__gt=now())[:1]
        if locked_tasks:
            return locked_tasks[0]
        return None

    def has_lock(self, user=None):
        """Check whether current task has been locked by some user

        The task counts as locked when the number of unexpired locks plus the
        number of counted annotations reaches ``overlap``. Counted annotations
        are the non-ground-truth ones, except cancelled annotations that were
        not completed by ``user``.
        """
        num_locks = self.num_locks
        num_annotations = self.annotations.filter(ground_truth=False)\
            .exclude(Q(was_cancelled=True) & ~Q(completed_by=user)).count()

        num = num_locks + num_annotations
        if num > self.overlap:
            # More takes than allowed: report it, the task is still treated as locked.
            logger.error(
                f"Num takes={num} > overlap={self.overlap} for task={self.id} - it's a bug",
                extra=dict(
                    lock_ttl=self.get_lock_ttl(),
                    num_locks=num_locks,
                    num_annotations=num_annotations,
                )
            )
        return num >= self.overlap

    @property
    def num_locks(self):
        """Number of locks on this task that have not expired yet."""
        return self.locks.filter(expire_at__gt=now()).count()

    def get_lock_ttl(self):
        """Return the lock lifetime in seconds.

        Uses ``settings.TASK_LOCK_TTL`` when it is not None, otherwise
        ``settings.TASK_LOCK_MIN_TTL``.
        """
        if settings.TASK_LOCK_TTL is not None:
            return settings.TASK_LOCK_TTL
        return settings.TASK_LOCK_MIN_TTL

    def has_permission(self, user):
        """Delegate the permission check to the task's project."""
        return self.project.has_permission(user)

    def clear_expired_locks(self):
        """Delete every lock of this task whose expiration time has passed."""
        self.locks.filter(expire_at__lt=now()).delete()

    def set_lock(self, user):
        """Lock current task by specified user. Lock lifetime is taken from `get_lock_ttl()`.

        A lock is created only while the number of unexpired locks is below
        ``overlap``; otherwise an error is logged and no lock is created.
        Expired locks are removed in both cases.
        """
        num_locks = self.num_locks
        if num_locks < self.overlap:
            lock_ttl = self.get_lock_ttl()
            expire_at = now() + datetime.timedelta(seconds=lock_ttl)
            TaskLock.objects.create(task=self, user=user, expire_at=expire_at)
            logger.debug(f'User={user} acquires a lock for the task={self} ttl: {lock_ttl}')
        else:
            logger.error(
                f"Current number of locks for task {self.id} is {num_locks}, but overlap={self.overlap}: "
                f"that's a bug because this task should not be taken in a label stream (task should be locked)")
        self.clear_expired_locks()

    def release_lock(self, user=None):
        """Release lock for the task.

        If ``user`` is given, only that user's locks are deleted; otherwise
        all locks of the task are deleted. Expired locks are removed afterwards.
        """
        if user is not None:
            self.locks.filter(user=user).delete()
        else:
            self.locks.all().delete()
        self.clear_expired_locks()

    def get_storage_link(self):
        """Return the first one-to-one related object whose name matches '.*io_storages_'."""
        # TODO: how to get neatly any storage class here?
        return find_first_one_to_one_related_field_by_prefix(self, '.*io_storages_')

    def resolve_uri(self, task_data, proxy=True):
        """Return ``task_data`` with URLs made accessible to the client.

        When ``proxy`` is true and the project has both ``task_data_login`` and
        ``task_data_password`` set, a new dict is returned in which every URL
        string is replaced by a link to the project's file-proxy endpoint.
        Otherwise resolution is delegated to the storage associated with the
        task, and ``task_data`` is returned unchanged when there is none.

        :param task_data: dict of task field name -> value
        :param proxy: allow rewriting URLs to the project file proxy
        """
        from urllib.parse import quote

        if proxy and self.project.task_data_login and self.project.task_data_password:
            protected_data = {}
            for key, value in task_data.items():
                if isinstance(value, str) and string_is_url(value):
                    # Percent-encode the target: a raw URL containing '&', '?'
                    # or '#' would otherwise be cut short when the proxy
                    # endpoint parses its own query string.
                    path = reverse('projects-file-proxy', kwargs={'pk': self.project.pk}) + '?url=' + quote(value)
                    value = urljoin(settings.HOSTNAME, path)
                protected_data[key] = value
            return protected_data

        # Try resolve URLs via storage associated with that task
        storage = self._get_task_storage(task_data)
        if storage:
            return storage.resolve_task_data_uri(task_data)
        return task_data

    def _get_storage_by_task_data(self, task_data):
        """Return this project's storage for the first data value matching an import storage class.

        Returns None when no value matches.
        """
        from io_storages.models import get_import_storage_by_url

        for url in task_data.values():
            # Task data may hold int, float, dict or list values; only strings
            # can be URLs, so everything else is skipped before matching.
            if not isinstance(url, str):
                continue
            storage_class = get_import_storage_by_url(url)
            if storage_class:
                # Only first matched storage is returned - no way to specify {"url1": "s3://", "url2": "gs://"}
                return storage_class.objects.filter(project=self.project).first()
        return None

    def _get_task_storage(self, task_data):
        """Pick the storage used to resolve URLs for this task.

        Order of preference: the task's own storage link, a default
        S3ImportStorage when the USE_DEFAULT_S3_STORAGE env flag is set, then a
        project storage matched against the values of ``task_data``.
        """
        # maybe task has storage link
        storage_link = self.get_storage_link()
        if storage_link:
            return storage_link.storage

        # or try global storage settings (only s3 for now)
        if get_env('USE_DEFAULT_S3_STORAGE', default=False, is_bool=True):
            # TODO: this is used to access global environment storage settings.
            # We may use more than one and non-default S3 storage (like GCS, Azure)
            from io_storages.s3.models import S3ImportStorage
            return S3ImportStorage()

        return self._get_storage_by_task_data(task_data)

    def update_is_labeled(self):
        """Set is_labeled field according to annotations*.count >= overlap

        Counts annotations matching ``Q_finished_annotations`` that are not
        ground truth. Only the in-memory attribute is changed; nothing is saved.
        """
        n = self.annotations.filter(Q_finished_annotations & Q(ground_truth=False)).count()
        self.is_labeled = n >= self.overlap

    def reset_updates(self):
        """ Reset updates to default from model for one task.
            We need it in duplicate project or total deletion of annotations

            Only in-memory attributes are changed; nothing is saved.
        """
        for field in Task._meta.fields:
            if field.name in Task.updates:
                setattr(self, field.name, field.default)

    @staticmethod
    def bulk_reset_updates(project):
        """ Bulk reset updates to default, it's a fast way to reset all tasks in project

            All fields listed in ``Task.updates`` are reset with one UPDATE
            statement instead of one statement per field.
        """
        defaults = {
            field.name: field.default
            for field in Task._meta.fields
            if field.name in Task.updates
        }
        if defaults:
            project.tasks.update(**defaults)

    @staticmethod
    def bulk_update_is_labeled(project):
        """ Fast way to update only is_labeled.
            Prefer to use Django 2.2 bulk_update(), see bulk_update_field('is_labeled')

            get all project.tasks as subquery
            Subquery(
                with coalesce get the first non-null value (count(annotations), or 0)
                make condition
                add temp field pre_is_labeled as condition values
            )
            update all tasks with Subquery
        """
        tasks = project.tasks.filter(pk=OuterRef('pk'))
        # Annotations counted here: not cancelled and not ground truth.
        count = Coalesce(Count(
            'annotations', filter=Q(annotations__was_cancelled=False) & Q(annotations__ground_truth=False)), Value(0))
        condition = Case(
            When(overlap__lte=count, then=Value(True)),
            default=Value(False),
            output_field=models.BooleanField(null=False)
        )
        results = tasks.annotate(pre_is_labeled=condition).values('pre_is_labeled')
        project.tasks.update(is_labeled=Subquery(results))

    def delete_url(self):
        """Return the URL of the 'tasks:task-delete' route for this task."""
        return reverse('tasks:task-delete', kwargs={'pk': self.pk})

    def completion_for_ground_truth(self):
        """ 1 Get ground_truth completion if task has it, else
            2 Get first completion created by owner of project,
            3 Or the first of somebody if no owner's items.
            It's used for ground_truth selection right on data manager page

            Returns None when the task has no annotations.
        """
        if not self.annotations.exists():
            return None

        # ground_truth already exist; first() returns None when nothing
        # matches, so a separate exists() query is not needed
        ground_truth_annotation = self.annotations.filter(ground_truth=True).first()
        if ground_truth_annotation is not None:
            return ground_truth_annotation

        # owner annotation
        owner_annotation = self.annotations.filter(completed_by=self.project.created_by).first()
        if owner_annotation is not None:
            return owner_annotation

        # annotator annotation
        return self.annotations.first()

    def increase_project_summary_counters(self):
        """Add this task's data columns to the project summary, if the project has one."""
        if hasattr(self.project, 'summary'):
            summary = self.project.summary
            summary.update_data_columns([self])

    def decrease_project_summary_counters(self):
        """Remove this task's data columns from the project summary, if the project has one."""
        if hasattr(self.project, 'summary'):
            summary = self.project.summary
            summary.remove_data_columns([self])

    def ensure_unique_groundtruth(self, annotation_id):
        """Clear the ground_truth flag on every annotation of this task except ``annotation_id``."""
        self.annotations.exclude(id=annotation_id).update(ground_truth=False)
示例#2
0
class Task(TaskMixin, models.Model):
    """ Business tasks from project

    A task stores the imported ``data`` payload plus optional ``meta`` and
    tracks labeling progress through ``is_labeled`` and ``overlap``.
    Short-lived per-user locks are created as ``TaskLock`` rows and queried
    through the ``locks`` relation.
    """
    id = models.AutoField(auto_created=True,
                          primary_key=True,
                          serialize=False,
                          verbose_name='ID',
                          db_index=True)
    data = JSONField(
        'data',
        null=False,
        help_text=
        'User imported or uploaded data for a task. Data is formatted according to '
        'the project label config. You can find examples of data for your project '
        'on the Import page in the Label Studio Data Manager UI.')
    meta = JSONField(
        'meta',
        null=True,
        default=dict,
        help_text=
        'Meta is user imported (uploaded) data and can be useful as input for an ML '
        'Backend for embeddings, advanced vectors, and other info. It is passed to '
        'ML during training/predicting steps.')
    project = models.ForeignKey('projects.Project',
                                related_name='tasks',
                                on_delete=models.CASCADE,
                                null=True,
                                help_text='Project ID for this task')
    created_at = models.DateTimeField(_('created at'),
                                      auto_now_add=True,
                                      help_text='Time a task was created')
    updated_at = models.DateTimeField(_('updated at'),
                                      auto_now=True,
                                      help_text='Last time a task was updated')
    updated_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        related_name='updated_tasks',
        on_delete=models.SET_NULL,
        null=True,
        verbose_name=_('updated by'),
        help_text='Last annotator or reviewer who updated this task')
    is_labeled = models.BooleanField(
        _('is_labeled'),
        default=False,
        help_text=
        'True if the number of annotations for this task is greater than or equal '
        'to the number of maximum_completions for the project',
        db_index=True)
    overlap = models.IntegerField(
        _('overlap'),
        default=1,
        db_index=True,
        help_text=
        'Number of distinct annotators that processed the current task')
    file_upload = models.ForeignKey(
        'data_import.FileUpload',
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='tasks',
        help_text='Uploaded file used as data source for this task')
    # The default of 0 means "not assigned yet"; save() replaces it.
    inner_id = models.BigIntegerField(
        _('inner id'),
        default=0,
        db_index=True,
        null=True,
        help_text='Internal task ID in the project, starts with 1')
    updates = ['is_labeled']

    objects = TaskManager()  # task manager by default
    prepared = PreparedTaskManager(
    )  # task manager with filters, ordering, etc for data_manager app

    class Meta:
        db_table = 'task'
        ordering = ['-updated_at']
        indexes = [
            models.Index(fields=['project', 'is_labeled']),
            models.Index(fields=['id', 'project']),
            models.Index(fields=['id', 'overlap']),
            models.Index(fields=['overlap']),
            models.Index(fields=['is_labeled'])
        ]

    @property
    def file_upload_name(self):
        """Base name of the uploaded source file.

        Returns None when the task has no ``file_upload`` (the foreign key is
        nullable), instead of failing on attribute access.
        """
        if self.file_upload is None:
            return None
        return os.path.basename(self.file_upload.file.name)

    @classmethod
    def get_locked_by(cls, user, project=None, tasks=None):
        """ Retrieve the task locked by specified user. Returns None if the specified user didn't lock anything.

        :param user: user whose unexpired lock is looked up
        :param project: when given, search locks on tasks of this project
        :param tasks: task queryset to search; used only when ``project`` is None
        :raises ValueError: when neither ``project`` nor ``tasks`` is passed
        """
        if project is None and tasks is None:
            # ValueError is a subclass of Exception, so handlers written for
            # the previous bare Exception keep working.
            raise ValueError('Neither project or tasks passed to get_locked_by')

        if project is not None:
            lock = TaskLock.objects.filter(user=user,
                                           expire_at__gt=now(),
                                           task__project=project).first()
            return lock.task if lock else None

        locked_tasks = tasks.filter(locks__user=user,
                                    locks__expire_at__gt=now())[:1]
        if locked_tasks:
            return locked_tasks[0]
        return None

    def has_lock(self, user=None):
        """Check whether current task has been locked by some user

        The task counts as locked when the number of unexpired locks plus the
        number of counted annotations reaches ``overlap``. Which annotations
        are counted depends on the project's ``skip_queue`` mode:

        * REQUEUE_FOR_ME: non-ground-truth annotations that are not cancelled
          and were completed by ``user``;
        * any other mode: non-ground-truth annotations, except cancelled ones
          that were not completed by ``user``.
        """
        num_locks = self.num_locks
        candidates = self.annotations.filter(ground_truth=False)
        if self.project.skip_queue == self.project.SkipQueue.REQUEUE_FOR_ME:
            num_annotations = candidates.exclude(
                Q(was_cancelled=True) | ~Q(completed_by=user)).count()
        else:
            num_annotations = candidates.exclude(
                Q(was_cancelled=True) & ~Q(completed_by=user)).count()

        num = num_locks + num_annotations
        if num > self.overlap:
            # More takes than allowed: report it, the task is still treated as locked.
            logger.error(
                f"Num takes={num} > overlap={self.overlap} for task={self.id} - it's a bug",
                extra=dict(
                    lock_ttl=self.get_lock_ttl(),
                    num_locks=num_locks,
                    num_annotations=num_annotations,
                ))
        result = num >= self.overlap
        logger.debug(
            f'Task {self} locked: {result}; num_locks: {num_locks} num_annotations: {num_annotations}'
        )
        return result

    @property
    def num_locks(self):
        """Number of locks on this task that have not expired yet."""
        return self.locks.filter(expire_at__gt=now()).count()

    def get_lock_ttl(self):
        """Return the lock lifetime in seconds.

        Uses ``settings.TASK_LOCK_TTL`` when it is not None, otherwise
        ``settings.TASK_LOCK_MIN_TTL``.
        """
        if settings.TASK_LOCK_TTL is not None:
            return settings.TASK_LOCK_TTL
        return settings.TASK_LOCK_MIN_TTL

    def has_permission(self, user):
        """Delegate the permission check to the task's project."""
        return self.project.has_permission(user)

    def clear_expired_locks(self):
        """Delete every lock of this task whose expiration time has passed."""
        self.locks.filter(expire_at__lt=now()).delete()

    def set_lock(self, user):
        """Lock current task by specified user. Lock lifetime is taken from `get_lock_ttl()`.

        A lock is created only while the number of unexpired locks is below
        ``overlap``; otherwise an error is logged and no lock is created.
        Expired locks are removed in both cases.
        """
        num_locks = self.num_locks
        if num_locks < self.overlap:
            lock_ttl = self.get_lock_ttl()
            expire_at = now() + datetime.timedelta(seconds=lock_ttl)
            TaskLock.objects.create(task=self, user=user, expire_at=expire_at)
            logger.debug(
                f'User={user} acquires a lock for the task={self} ttl: {lock_ttl}'
            )
        else:
            logger.error(
                f"Current number of locks for task {self.id} is {num_locks}, but overlap={self.overlap}: "
                f"that's a bug because this task should not be taken in a label stream (task should be locked)"
            )
        self.clear_expired_locks()

    def release_lock(self, user=None):
        """Release lock for the task.

        If ``user`` is given, only that user's locks are deleted; otherwise
        all locks of the task are deleted. Expired locks are removed afterwards.
        """
        if user is not None:
            self.locks.filter(user=user).delete()
        else:
            self.locks.all().delete()
        self.clear_expired_locks()

    def get_storage_link(self):
        """Return the first one-to-one related object whose name matches '.*io_storages_'."""
        # TODO: how to get neatly any storage class here?
        return find_first_one_to_one_related_field_by_prefix(
            self, '.*io_storages_')

    @staticmethod
    def is_upload_file(filename):
        """Return True when ``filename`` is a string starting with ``settings.UPLOAD_DIR + '/'``."""
        if not isinstance(filename, str):
            return False
        return filename.startswith(settings.UPLOAD_DIR + '/')

    def resolve_uri(self, task_data, project):
        """Return ``task_data`` with URLs made accessible to the client.

        When the project has both ``task_data_login`` and ``task_data_password``
        set, a new dict is returned in which every URL string is replaced by a
        link to the project's file-proxy endpoint. Otherwise ``task_data`` is
        updated in place and returned: uploaded files are mapped to their
        storage URL and other values are resolved through the task's storage
        or the first project import storage that can resolve them.

        :param task_data: dict of task field name -> value
        :param project: project whose credentials and storages are used
        """
        if project.task_data_login and project.task_data_password:
            protected_data = {}
            for key, value in task_data.items():
                if isinstance(value, str) and string_is_url(value):
                    path = reverse('projects-file-proxy',
                                   kwargs={'pk': project.pk
                                           }) + '?url=' + quote(value)
                    value = urljoin(settings.HOSTNAME, path)
                protected_data[key] = value
            return protected_data

        storage_objects = project.get_all_storage_objects(type_='import')
        # The `storage` property re-runs the storage-link lookup and the env
        # check on every access, and its result does not depend on the field,
        # so read it once instead of once per field.
        task_storage = self.storage

        # try resolve URLs via storage associated with that task
        for field in task_data:
            value = task_data[field]

            # file saved in django file storage
            if settings.CLOUD_FILE_STORAGE_ENABLED and self.is_upload_file(value):
                # permission check: resolve uploaded files to the project only
                file_upload = FileUpload.objects.filter(project=project,
                                                        file=value)
                if file_upload.exists():
                    task_data[field] = default_storage.url(name=value)
                # it's very rare case, e.g. user tried to reimport exported file from another project
                # or user wrote his django storage path manually
                else:
                    task_data[field] = value + '?not_uploaded_project_file'
                continue

            # project storage
            storage = task_storage or self._get_storage_by_url(
                value, storage_objects)
            if storage:
                try:
                    resolved_uri = storage.resolve_uri(value)
                except Exception as exc:
                    # Best effort: a storage failure leaves the original value
                    # in place instead of failing the whole task.
                    logger.error(exc, exc_info=True)
                    resolved_uri = None
                if resolved_uri:
                    task_data[field] = resolved_uri
        return task_data

    def _get_storage_by_url(self, url, storage_objects):
        """Return the first storage object that can resolve ``url``, or None.

        :param url: task data value; only strings are checked
        :param storage_objects: iterable of storage objects to try, in order
        """
        # check url is string because task can have int, float, dict, list
        # and 'can_resolve_url' will fail
        if not isinstance(url, str):
            return None

        for storage_object in storage_objects:
            if storage_object.can_resolve_url(url):
                return storage_object
        return None

    @property
    def storage(self):
        """Storage attached to this task, or None.

        Prefers the task's own storage link; falls back to a default
        S3ImportStorage when the USE_DEFAULT_S3_STORAGE env flag is set.
        """
        # maybe task has storage link
        storage_link = self.get_storage_link()
        if storage_link:
            return storage_link.storage

        # or try global storage settings (only s3 for now)
        if get_env('USE_DEFAULT_S3_STORAGE', default=False, is_bool=True):
            # TODO: this is used to access global environment storage settings.
            # We may use more than one and non-default S3 storage (like GCS, Azure)
            from io_storages.s3.models import S3ImportStorage
            return S3ImportStorage()

        return None

    @property
    def completed_annotations(self):
        """Annotations that we take into account when set completed status to the task

        With the IGNORE_SKIPPED skip-queue mode all annotations are returned;
        otherwise only those matching ``Q_finished_annotations``.
        """
        if self.project.skip_queue == self.project.SkipQueue.IGNORE_SKIPPED:
            return self.annotations
        return self.annotations.filter(Q_finished_annotations)

    def update_is_labeled(self):
        """Recompute ``is_labeled`` in memory via ``_get_is_labeled_value()``; nothing is saved."""
        self.is_labeled = self._get_is_labeled_value()

    def increase_project_summary_counters(self):
        """Add this task's data columns to the project summary, if the project has one."""
        if hasattr(self.project, 'summary'):
            summary = self.project.summary
            summary.update_data_columns([self])

    def decrease_project_summary_counters(self):
        """Remove this task's data columns from the project summary, if the project has one."""
        if hasattr(self.project, 'summary'):
            summary = self.project.summary
            summary.remove_data_columns([self])

    def ensure_unique_groundtruth(self, annotation_id):
        """Clear the ground_truth flag on every annotation of this task except ``annotation_id``."""
        self.annotations.exclude(id=annotation_id).update(ground_truth=False)

    def save(self, *args, **kwargs):
        """Assign ``inner_id`` when it is still 0 and the feature flag is on, then save.

        The new value is one more than the ``inner_id`` of the project task
        that sorts first by descending ``inner_id``.
        """
        if flag_set('ff_back_2070_inner_id_12052022_short',
                    self.project.organization.created_by):
            if self.inner_id == 0:
                task = Task.objects.filter(
                    project=self.project).order_by("-inner_id").first()
                # NOTE(review): with no existing task this yields inner_id 2,
                # although the field help text says numbering starts with 1 -
                # confirm whether that is intended.
                max_inner_id = 1
                if task:
                    max_inner_id = task.inner_id
                # inner_id is nullable, so the row picked above may hold NULL;
                # keep the new task unnumbered as well instead of raising
                # TypeError on `None + 1`.
                self.inner_id = None if max_inner_id is None else max_inner_id + 1
        super().save(*args, **kwargs)