class Task(TaskMixin, models.Model):
    """ Business tasks from project
    """
    id = models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID', db_index=True)
    data = JSONField('data', null=False,
                     help_text='User imported or uploaded data for a task. Data is formatted according to '
                               'the project label config. You can find examples of data for your project '
                               'on the Import page in the Label Studio Data Manager UI.')
    meta = JSONField('meta', null=True, default=dict,
                     help_text='Meta is user imported (uploaded) data and can be useful as input for an ML '
                               'Backend for embeddings, advanced vectors, and other info. It is passed to '
                               'ML during training/predicting steps.')
    project = models.ForeignKey('projects.Project', related_name='tasks', on_delete=models.CASCADE, null=True,
                                help_text='Project ID for this task')
    created_at = models.DateTimeField(_('created at'), auto_now_add=True, help_text='Time a task was created')
    updated_at = models.DateTimeField(_('updated at'), auto_now=True, help_text='Last time a task was updated')
    is_labeled = models.BooleanField(_('is_labeled'), default=False,
                                     help_text='True if the number of annotations for this task is greater than or equal '
                                               'to the number of maximum_completions for the project', db_index=True)
    overlap = models.IntegerField(_('overlap'), default=1, db_index=True,
                                  help_text='Number of distinct annotators that processed the current task')
    file_upload = models.ForeignKey(
        'data_import.FileUpload', on_delete=models.SET_NULL, null=True, blank=True, related_name='tasks',
        help_text='Uploaded file used as data source for this task'
    )
    # Field names reset by reset_updates() / bulk_reset_updates()
    updates = ['is_labeled']

    objects = TaskManager()  # task manager by default
    prepared = PreparedTaskManager()  # task manager with filters, ordering, etc for data_manager app

    class Meta:
        db_table = 'task'
        ordering = ['-updated_at']
        indexes = [
            models.Index(fields=['project', 'is_labeled']),
            models.Index(fields=['id', 'overlap']),
            models.Index(fields=['overlap']),
            models.Index(fields=['is_labeled'])
        ]

    @property
    def file_upload_name(self):
        """Base filename of the uploaded file backing this task.

        NOTE(review): raises AttributeError if file_upload is NULL (field is
        nullable) — callers are expected to check first.
        """
        return os.path.basename(self.file_upload.file.name)

    @classmethod
    def get_locked_by(cls, user, project=None, tasks=None):
        """ Retrieve the task locked by specified user. Returns None if the specified user didn't lock anything.
        """
        lock = None
        if project is not None:
            lock = TaskLock.objects.filter(user=user, expire_at__gt=now(), task__project=project).first()
        elif tasks is not None:
            locked_tasks = tasks.filter(locks__user=user, locks__expire_at__gt=now())[:1]
            if locked_tasks:
                return locked_tasks[0]
        else:
            raise Exception('Neither project or tasks passed to get_locked_by')

        if lock:
            return lock.task

    def has_lock(self, user=None):
        """Check whether current task has been locked by some user"""
        num_locks = self.num_locks
        # Count non-ground-truth annotations, excluding those cancelled by
        # someone other than `user` (a user's own cancelled annotation still
        # counts against them).
        num_annotations = self.annotations.filter(ground_truth=False)\
            .exclude(Q(was_cancelled=True) & ~Q(completed_by=user)).count()
        num = num_locks + num_annotations
        if num > self.overlap:
            logger.error(
                f"Num takes={num} > overlap={self.overlap} for task={self.id} - it's a bug",
                extra=dict(
                    lock_ttl=self.get_lock_ttl(),
                    num_locks=num_locks,
                    num_annotations=num_annotations,
                )
            )
        return num >= self.overlap

    @property
    def num_locks(self):
        """Number of currently active (non-expired) locks on this task."""
        return self.locks.filter(expire_at__gt=now()).count()

    def get_lock_ttl(self):
        """Lock lifetime in seconds: TASK_LOCK_TTL if set, else TASK_LOCK_MIN_TTL."""
        if settings.TASK_LOCK_TTL is not None:
            return settings.TASK_LOCK_TTL
        return settings.TASK_LOCK_MIN_TTL

    def has_permission(self, user):
        """Delegate permission check to the owning project."""
        return self.project.has_permission(user)

    def clear_expired_locks(self):
        """Delete all locks on this task whose expiry time has passed."""
        self.locks.filter(expire_at__lt=now()).delete()

    def set_lock(self, user):
        """Lock current task by specified user. Lock lifetime is set by `expire_in_secs`"""
        num_locks = self.num_locks
        if num_locks < self.overlap:
            lock_ttl = self.get_lock_ttl()
            expire_at = now() + datetime.timedelta(seconds=lock_ttl)
            TaskLock.objects.create(task=self, user=user, expire_at=expire_at)
            logger.debug(f'User={user} acquires a lock for the task={self} ttl: {lock_ttl}')
        else:
            logger.error(
                f"Current number of locks for task {self.id} is {num_locks}, but overlap={self.overlap}: "
                f"that's a bug because this task should not be taken in a label stream (task should be locked)")
        self.clear_expired_locks()

    def release_lock(self, user=None):
        """Release lock for the task.
        If user specified, it checks whether lock is released by the user who previously has locked that task"""
        if user is not None:
            self.locks.filter(user=user).delete()
        else:
            self.locks.all().delete()
        self.clear_expired_locks()

    def get_storage_link(self):
        # TODO: how to get neatly any storage class here?
        return find_first_one_to_one_related_field_by_prefix(self, '.*io_storages_')

    def resolve_uri(self, task_data, proxy=True):
        """Rewrite URL values in task_data so the client can access them.

        When the project uses protected data (login/password set) and proxy is
        enabled, URLs are routed through the project's file-proxy view;
        otherwise URLs are resolved via the storage associated with this task.
        """
        if proxy and self.project.task_data_login and self.project.task_data_password:
            from urllib.parse import quote

            protected_data = {}
            for key, value in task_data.items():
                if isinstance(value, str) and string_is_url(value):
                    # Bug fix: percent-encode the URL before embedding it in the
                    # query string — otherwise '&', '#', '?' or '+' inside the
                    # value corrupts the proxy request.
                    path = reverse('projects-file-proxy', kwargs={'pk': self.project.pk}) + '?url=' + quote(value)
                    value = urljoin(settings.HOSTNAME, path)
                protected_data[key] = value
            return protected_data
        else:
            # Try resolve URLs via storage associated with that task
            storage = self._get_task_storage(task_data)
            if storage:
                return storage.resolve_task_data_uri(task_data)
            return task_data

    def _get_storage_by_task_data(self, task_data):
        """Find an import storage of this project matching any URL in task_data."""
        from io_storages.models import get_import_storage_by_url
        for url in task_data.values():
            storage_class = get_import_storage_by_url(url)
            if storage_class:
                # Only first matched storage is returned - no way to specify {"url1": "s3://", "url2": "gs://"}
                return storage_class.objects.filter(project=self.project).first()

    def _get_task_storage(self, task_data):
        """Resolve the storage to use for this task, or None."""
        # maybe task has storage link
        storage_link = self.get_storage_link()
        if storage_link:
            return storage_link.storage
        # or try global storage settings (only s3 for now)
        elif get_env('USE_DEFAULT_S3_STORAGE', default=False, is_bool=True):
            # TODO: this is used to access global environment storage settings.
            # We may use more than one and non-default S3 storage (like GCS, Azure)
            from io_storages.s3.models import S3ImportStorage
            return S3ImportStorage()
        # fall back to matching a project storage against the task's URLs
        storage = self._get_storage_by_task_data(task_data)
        if storage:
            return storage

    def update_is_labeled(self):
        """Set is_labeled field according to annotations*.count > overlap
        """
        n = self.annotations.filter(Q_finished_annotations & Q(ground_truth=False)).count()
        # self.is_labeled = n >= self.project.maximum_annotations
        self.is_labeled = n >= self.overlap

    def reset_updates(self):
        """ Reset updates to default from model for one task.
            We need it in duplicate project or total deletion of annotations
        """
        for field in Task._meta.fields:
            if field.name in Task.updates:
                setattr(self, field.name, field.default)

    @staticmethod
    def bulk_reset_updates(project):
        """ Bulk reset updates to default, it's a fast way to reset all tasks in project
        """
        for field in Task._meta.fields:
            if field.name in Task.updates:
                project.tasks.update(**{field.name: field.default})

    @staticmethod
    def bulk_update_is_labeled(project):
        """ Fast way to update only is_labeled.
            Prefer to use Django 2.2 bulk_update(), see bulk_update_field('is_labeled')

            get all project.tasks as subquery
            Subquery(
                w coalesce get the first non-null value (count(annotations), or 0)
                make condition
                add temp field pre_is_labeled as condtion
                values
            )
            update all tasks with Subquery
        """
        tasks = project.tasks.filter(pk=OuterRef('pk'))
        count = Coalesce(Count(
            'annotations',
            filter=Q(annotations__was_cancelled=False) & Q(annotations__ground_truth=False)), Value(0))
        condition = Case(
            When(overlap__lte=count, then=Value(True)),
            default=Value(False),
            output_field=models.BooleanField(null=False)
        )
        results = tasks.annotate(pre_is_labeled=condition).values('pre_is_labeled')
        project.tasks.update(is_labeled=Subquery(results))

    def delete_url(self):
        """URL of the delete endpoint for this task."""
        return reverse('tasks:task-delete', kwargs={'pk': self.pk})

    def completion_for_ground_truth(self):
        """ 1 Get ground_truth completion if task has it, else
            2 Get first completion created by owner of project,
            3 Or the first of somebody if no owner's items.
            It's used for ground_truth selection right on data manager page
        """
        if not self.annotations.exists():
            return None

        # ground_truth already exist
        ground_truth_annotations = self.annotations.filter(ground_truth=True)
        if ground_truth_annotations.exists():
            return ground_truth_annotations.first()

        # owner annotation
        owner_annotations = self.annotations.filter(completed_by=self.project.created_by)
        if owner_annotations.count() > 0:
            return owner_annotations.first()

        # annotator annotation
        return self.annotations.first()

    def increase_project_summary_counters(self):
        """Add this task's data columns to the project summary (if it exists)."""
        if hasattr(self.project, 'summary'):
            summary = self.project.summary
            summary.update_data_columns([self])

    def decrease_project_summary_counters(self):
        """Remove this task's data columns from the project summary (if it exists)."""
        if hasattr(self.project, 'summary'):
            summary = self.project.summary
            summary.remove_data_columns([self])

    def ensure_unique_groundtruth(self, annotation_id):
        """Keep only annotation_id as ground truth; unset the flag on all others."""
        self.annotations.exclude(id=annotation_id).update(ground_truth=False)
class Task(TaskMixin, models.Model):
    """ Business tasks from project
    """
    id = models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID', db_index=True)
    data = JSONField(
        'data',
        null=False,
        help_text='User imported or uploaded data for a task. Data is formatted according to '
                  'the project label config. You can find examples of data for your project '
                  'on the Import page in the Label Studio Data Manager UI.')
    meta = JSONField(
        'meta',
        null=True,
        default=dict,
        help_text='Meta is user imported (uploaded) data and can be useful as input for an ML '
                  'Backend for embeddings, advanced vectors, and other info. It is passed to '
                  'ML during training/predicting steps.')
    project = models.ForeignKey('projects.Project', related_name='tasks', on_delete=models.CASCADE, null=True,
                                help_text='Project ID for this task')
    created_at = models.DateTimeField(_('created at'), auto_now_add=True, help_text='Time a task was created')
    updated_at = models.DateTimeField(_('updated at'), auto_now=True, help_text='Last time a task was updated')
    updated_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        related_name='updated_tasks',
        on_delete=models.SET_NULL,
        null=True,
        verbose_name=_('updated by'),
        help_text='Last annotator or reviewer who updated this task')
    is_labeled = models.BooleanField(
        _('is_labeled'),
        default=False,
        help_text='True if the number of annotations for this task is greater than or equal '
                  'to the number of maximum_completions for the project',
        db_index=True)
    overlap = models.IntegerField(
        _('overlap'),
        default=1,
        db_index=True,
        help_text='Number of distinct annotators that processed the current task')
    file_upload = models.ForeignKey(
        'data_import.FileUpload',
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='tasks',
        help_text='Uploaded file used as data source for this task')
    inner_id = models.BigIntegerField(
        _('inner id'),
        default=0,
        db_index=True,
        null=True,
        help_text='Internal task ID in the project, starts with 1')
    # Field names reset when annotations are wiped (see TaskMixin usage elsewhere)
    updates = ['is_labeled']

    objects = TaskManager()  # task manager by default
    prepared = PreparedTaskManager()  # task manager with filters, ordering, etc for data_manager app

    class Meta:
        db_table = 'task'
        ordering = ['-updated_at']
        indexes = [
            models.Index(fields=['project', 'is_labeled']),
            models.Index(fields=['id', 'project']),
            models.Index(fields=['id', 'overlap']),
            models.Index(fields=['overlap']),
            models.Index(fields=['is_labeled'])
        ]

    @property
    def file_upload_name(self):
        """Base filename of the uploaded file backing this task.

        NOTE(review): raises AttributeError if file_upload is NULL (field is
        nullable) — callers are expected to check first.
        """
        return os.path.basename(self.file_upload.file.name)

    @classmethod
    def get_locked_by(cls, user, project=None, tasks=None):
        """ Retrieve the task locked by specified user. Returns None if the specified user didn't lock anything.
        """
        lock = None
        if project is not None:
            lock = TaskLock.objects.filter(user=user, expire_at__gt=now(), task__project=project).first()
        elif tasks is not None:
            locked_tasks = tasks.filter(locks__user=user, locks__expire_at__gt=now())[:1]
            if locked_tasks:
                return locked_tasks[0]
        else:
            raise Exception('Neither project or tasks passed to get_locked_by')

        if lock:
            return lock.task

    def has_lock(self, user=None):
        """Check whether current task has been locked by some user"""
        num_locks = self.num_locks
        if self.project.skip_queue == self.project.SkipQueue.REQUEUE_FOR_ME:
            # REQUEUE_FOR_ME: only annotations that were neither cancelled nor
            # made by another user count toward the overlap
            num_annotations = self.annotations.filter(ground_truth=False).exclude(
                Q(was_cancelled=True) | ~Q(completed_by=user)).count()
        else:
            # default: exclude only annotations cancelled by someone other than
            # `user` (a user's own cancelled annotation still counts)
            num_annotations = self.annotations.filter(ground_truth=False).exclude(
                Q(was_cancelled=True) & ~Q(completed_by=user)).count()

        num = num_locks + num_annotations
        if num > self.overlap:
            logger.error(
                f"Num takes={num} > overlap={self.overlap} for task={self.id} - it's a bug",
                extra=dict(
                    lock_ttl=self.get_lock_ttl(),
                    num_locks=num_locks,
                    num_annotations=num_annotations,
                ))
        result = bool(num >= self.overlap)
        logger.debug(
            f'Task {self} locked: {result}; num_locks: {num_locks} num_annotations: {num_annotations}'
        )
        return result

    @property
    def num_locks(self):
        """Number of currently active (non-expired) locks on this task."""
        return self.locks.filter(expire_at__gt=now()).count()

    def get_lock_ttl(self):
        """Lock lifetime in seconds: TASK_LOCK_TTL if set, else TASK_LOCK_MIN_TTL."""
        if settings.TASK_LOCK_TTL is not None:
            return settings.TASK_LOCK_TTL
        return settings.TASK_LOCK_MIN_TTL

    def has_permission(self, user):
        """Delegate permission check to the owning project."""
        return self.project.has_permission(user)

    def clear_expired_locks(self):
        """Delete all locks on this task whose expiry time has passed."""
        self.locks.filter(expire_at__lt=now()).delete()

    def set_lock(self, user):
        """Lock current task by specified user. Lock lifetime is set by `expire_in_secs`"""
        num_locks = self.num_locks
        if num_locks < self.overlap:
            lock_ttl = self.get_lock_ttl()
            expire_at = now() + datetime.timedelta(seconds=lock_ttl)
            TaskLock.objects.create(task=self, user=user, expire_at=expire_at)
            logger.debug(
                f'User={user} acquires a lock for the task={self} ttl: {lock_ttl}'
            )
        else:
            logger.error(
                f"Current number of locks for task {self.id} is {num_locks}, but overlap={self.overlap}: "
                f"that's a bug because this task should not be taken in a label stream (task should be locked)"
            )
        self.clear_expired_locks()

    def release_lock(self, user=None):
        """Release lock for the task.
        If user specified, it checks whether lock is released by the user who previously has locked that task"""
        if user is not None:
            self.locks.filter(user=user).delete()
        else:
            self.locks.all().delete()
        self.clear_expired_locks()

    def get_storage_link(self):
        # TODO: how to get neatly any storage class here?
        return find_first_one_to_one_related_field_by_prefix(self, '.*io_storages_')

    @staticmethod
    def is_upload_file(filename):
        """True if filename is a path inside the local upload directory."""
        if not isinstance(filename, str):
            return False
        return filename.startswith(settings.UPLOAD_DIR + '/')

    def resolve_uri(self, task_data, project):
        """Rewrite URL values in task_data so the client can access them.

        Protected projects (login/password set) get proxied URLs; otherwise
        uploaded files are resolved via django storage and other URLs via the
        project's import storages. Mutates and returns task_data in the
        non-protected branch.
        """
        if project.task_data_login and project.task_data_password:
            protected_data = {}
            for key, value in task_data.items():
                if isinstance(value, str) and string_is_url(value):
                    path = reverse('projects-file-proxy', kwargs={'pk': project.pk}) + '?url=' + quote(value)
                    value = urljoin(settings.HOSTNAME, path)
                protected_data[key] = value
            return protected_data
        else:
            storage_objects = project.get_all_storage_objects(type_='import')

            # try resolve URLs via storage associated with that task
            for field in task_data:
                # file saved in django file storage
                if settings.CLOUD_FILE_STORAGE_ENABLED and self.is_upload_file(task_data[field]):
                    # permission check: resolve uploaded files to the project only
                    file_upload = FileUpload.objects.filter(project=project, file=task_data[field])
                    if file_upload.exists():
                        task_data[field] = default_storage.url(name=task_data[field])
                    # it's very rare case, e.g. user tried to reimport exported file from another project
                    # or user wrote his django storage path manually
                    else:
                        task_data[field] = task_data[field] + '?not_uploaded_project_file'
                    continue

                # project storage
                storage = self.storage or self._get_storage_by_url(task_data[field], storage_objects)
                if storage:
                    try:
                        resolved_uri = storage.resolve_uri(task_data[field])
                    except Exception as exc:
                        # best-effort: keep the original value on resolver failure
                        logger.error(exc, exc_info=True)
                        resolved_uri = None
                    if resolved_uri:
                        task_data[field] = resolved_uri
            return task_data

    def _get_storage_by_url(self, url, storage_objects):
        """Find the first compatible storage and returns pre-signed URL"""
        from io_storages.models import get_storage_classes

        for storage_object in storage_objects:
            # check url is string because task can have int, float, dict, list
            # and 'can_resolve_url' will fail
            if isinstance(url, str) and storage_object.can_resolve_url(url):
                return storage_object

    @property
    def storage(self):
        """Storage associated with this task via storage link or env default, else None."""
        # maybe task has storage link
        storage_link = self.get_storage_link()
        if storage_link:
            return storage_link.storage
        # or try global storage settings (only s3 for now)
        elif get_env('USE_DEFAULT_S3_STORAGE', default=False, is_bool=True):
            # TODO: this is used to access global environment storage settings.
            # We may use more than one and non-default S3 storage (like GCS, Azure)
            from io_storages.s3.models import S3ImportStorage
            return S3ImportStorage()

    @property
    def completed_annotations(self):
        """Annotations that we take into account when set completed status to the task"""
        if self.project.skip_queue == self.project.SkipQueue.IGNORE_SKIPPED:
            return self.annotations
        else:
            return self.annotations.filter(Q_finished_annotations)

    def update_is_labeled(self):
        """Recompute the is_labeled flag (value computed in TaskMixin)."""
        self.is_labeled = self._get_is_labeled_value()

    def increase_project_summary_counters(self):
        """Add this task's data columns to the project summary (if it exists)."""
        if hasattr(self.project, 'summary'):
            summary = self.project.summary
            summary.update_data_columns([self])

    def decrease_project_summary_counters(self):
        """Remove this task's data columns from the project summary (if it exists)."""
        if hasattr(self.project, 'summary'):
            summary = self.project.summary
            summary.remove_data_columns([self])

    def ensure_unique_groundtruth(self, annotation_id):
        """Keep only annotation_id as ground truth; unset the flag on all others."""
        self.annotations.exclude(id=annotation_id).update(ground_truth=False)

    def save(self, *args, **kwargs):
        """Assign a sequential per-project inner_id before saving (feature-flagged)."""
        if flag_set('ff_back_2070_inner_id_12052022_short', self.project.organization.created_by):
            if self.inner_id == 0:
                task = Task.objects.filter(project=self.project).order_by("-inner_id").first()
                # Bug fix: previously max_inner_id started at 1 even for an empty
                # project, so the very first task got inner_id=2. Start from 0 so
                # inner ids begin at 1, as the field's help_text promises.
                # NOTE(review): task.inner_id may be NULL on legacy rows (field is
                # null=True) — the original code had the same TypeError exposure.
                max_inner_id = task.inner_id if task else 0
                self.inner_id = max_inner_id + 1
        super().save(*args, **kwargs)