def _make_response(self, next_task, request, use_task_lock=True):
    """Build the API response for the chosen next task.

    Optionally locks the task for the requesting user, triggers ML
    inference on it, serializes it, and strips annotations that were not
    created by the requesting user (or that are ground-truth/honeypot).
    """
    requesting_user = request.user
    project = next_task.project

    if use_task_lock:
        # set lock for the task with TTL 3x time more then current average lead time (or 1 hour by default)
        next_task.set_lock(request.user)

    # call machine learning api and format response
    if project.show_collab_predictions:
        for ml_backend in project.ml_backends.all():
            ml_backend.predict_one_task(next_task)

    # serialize task
    serializer = TaskWithAnnotationsAndPredictionsAndDraftsSerializer(
        next_task,
        context={
            'request': request,
            'project': project,
            'resolve_uri': True,
            'proxy': bool_from_request(request.GET, 'proxy', True),
        },
    )
    payload = serializer.data

    # keep only the requesting user's own annotations,
    # excluding ground-truth and honeypot ones
    payload['annotations'] = [
        ann for ann in payload.get('annotations', [])
        if ann.get('completed_by') == requesting_user.id
        and not (ann.get('ground_truth') or ann.get('honeypot'))
    ]
    return Response(payload)
class ProjectNextTaskAPI(generics.RetrieveAPIView):
    """get: Get next task to label

    Get the next task for labeling. If you enable Machine Learning in
    your project, the response might include a "predictions" field. It
    contains a machine learning prediction result for this task.
    """
    permission_classes = (IsAuthenticated, ProjectNextTaskAPIPermissions)
    serializer_class = TaskWithAnnotationsAndPredictionsAndDraftsSerializer  # using it for swagger API docs

    def _get_random_unlocked(self, task_query, upper_limit=None):
        # get random task from task query, ignoring locked tasks;
        # upper_limit bounds the random index range (defaults to the full count)
        n = task_query.count()
        if n > 0:
            upper_limit = upper_limit or n
            random_indices = np.random.permutation(upper_limit)
            task_query_only = task_query.only('overlap', 'id')
            for i in random_indices:
                try:
                    task = task_query_only[int(i)]
                except IndexError as exc:
                    # queryset may have shrunk concurrently; log and try the next index
                    logger.error(
                        f'Task query out of range for {int(i)}, count={task_query_only.count()}. '
                        f'Reason: {exc}', exc_info=True)
                except Exception as exc:
                    logger.error(exc, exc_info=True)
                else:
                    try:
                        # re-fetch under row lock; skip_locked makes locked rows raise DoesNotExist
                        task = Task.objects.select_for_update(
                            skip_locked=True).get(pk=task.id)
                        if not task.has_lock():
                            return task
                    except Task.DoesNotExist:
                        logger.debug('Task with id {} locked'.format(task.id))
        # implicit None when no unlocked task is found

    def _get_first_unlocked(self, tasks_query):
        # Skip tasks that are locked due to being taken by collaborators
        for task in tasks_query.all():
            try:
                task = Task.objects.select_for_update(skip_locked=True).get(
                    pk=task.id)
                if not task.has_lock():
                    return task
            except Task.DoesNotExist:
                logger.debug('Task with id {} locked'.format(task.id))

    def _try_ground_truth(self, tasks, project):
        """Returns task from ground truth set"""
        ground_truth = Annotation.objects.filter(
            task=OuterRef('pk'), ground_truth=True)
        not_solved_tasks_with_ground_truths = tasks.annotate(
            has_ground_truths=Exists(ground_truth)).filter(
                has_ground_truths=True)
        if not_solved_tasks_with_ground_truths.exists():
            # sequential projects take the first unlocked task; others pick randomly
            if project.sampling == project.SEQUENCE:
                return self._get_first_unlocked(
                    not_solved_tasks_with_ground_truths)
            return self._get_random_unlocked(
                not_solved_tasks_with_ground_truths)

    def _try_tasks_with_overlap(self, tasks):
        """Filter out tasks without overlap (doesn't return next task)"""
        # returns a (None, queryset) pair so callers can unpack like the other _try_* helpers
        tasks_with_overlap = tasks.filter(overlap__gt=1)
        if tasks_with_overlap.exists():
            return None, tasks_with_overlap
        else:
            return None, tasks.filter(overlap=1)

    def _try_breadth_first(self, tasks):
        """Try to find tasks with maximum amount of annotations, since we
        are trying to label tasks as fast as possible
        """
        # =======
        # This commented part is trying to solve breadth-first in a bit different way:
        # it selects first task where any amount of annotations have been already created
        # we've left it here to be able to select it through the project settings later
        # =======
        # annotations = Annotation.objects.filter(task=OuterRef('pk'), ground_truth=False)
        # not_solved_tasks_labeling_started = tasks.annotate(labeling_started=Exists(annotations))
        # not_solved_tasks_labeling_started_true = not_solved_tasks_labeling_started.filter(labeling_started=True)
        # if not_solved_tasks_labeling_started_true.exists():
        #     # try to complete tasks that are already in progress
        #     next_task = self._get_random(not_solved_tasks_labeling_started_true)
        #     return next_task

        tasks = tasks.annotate(annotations_count=Count('annotations'))
        max_annotations_count = tasks.aggregate(
            Max('annotations_count'))['annotations_count__max']
        if max_annotations_count == 0:
            # there is no any labeled tasks found
            return

        # find any task with maximal amount of created annotations
        not_solved_tasks_labeling_started = tasks.annotate(
            reach_max_annotations_count=Case(When(
                annotations_count=max_annotations_count, then=Value(True)),
                default=Value(False),
                output_field=BooleanField()))
        not_solved_tasks_labeling_with_max_annotations = not_solved_tasks_labeling_started.filter(
            reach_max_annotations_count=True)
        if not_solved_tasks_labeling_with_max_annotations.exists():
            # try to complete tasks that are already in progress
            return self._get_random_unlocked(
                not_solved_tasks_labeling_with_max_annotations)

    def _try_uncertainty_sampling(self, tasks, project, user_solved_tasks_array):
        # active-learning sampling: prefer tasks from the user's least-solved
        # prediction cluster with the lowest prediction score
        task_with_current_predictions = tasks.filter(
            predictions__model_version=project.model_version)
        if task_with_current_predictions.exists():
            logger.debug('Use uncertainty sampling')

            # collect all clusters already solved by user, count number of solved task in them
            user_solved_clusters = project.prepared_tasks.filter(
                pk__in=user_solved_tasks_array).annotate(
                    cluster=Max('predictions__cluster')).values_list('cluster', flat=True)
            user_solved_clusters = Counter(user_solved_clusters)

            # order each task by the count of how many tasks solved in its cluster
            cluster_num_solved_map = [
                When(predictions__cluster=k, then=v)
                for k, v in user_solved_clusters.items()
            ]

            num_tasks_with_current_predictions = task_with_current_predictions.count()  # WARNING! this call doesn't work after consequent annotate
            if cluster_num_solved_map:
                task_with_current_predictions = task_with_current_predictions.annotate(
                    cluster_num_solved=Case(*cluster_num_solved_map,
                                            default=0,
                                            output_field=DecimalField()))
                # next task is chosen from least solved cluster and with lowest prediction score
                possible_next_tasks = task_with_current_predictions.order_by(
                    'cluster_num_solved', 'predictions__score')
            else:
                possible_next_tasks = task_with_current_predictions.order_by(
                    'predictions__score')

            num_annotators = project.annotators().count()
            if num_annotators > 1 and num_tasks_with_current_predictions > 0:
                # try to randomize tasks to avoid concurrent labeling between several annotators
                next_task = self._get_random_unlocked(
                    possible_next_tasks,
                    upper_limit=min(num_annotators + 1,
                                    num_tasks_with_current_predictions))
            else:
                next_task = self._get_first_unlocked(possible_next_tasks)
        else:
            # uncertainty sampling fallback: choose by random sampling
            logger.debug(
                f'Uncertainty sampling fallbacks to random sampling '
                f'(current project.model_version={str(project.model_version)})'
            )
            next_task = self._get_random_unlocked(tasks)
        return next_task

    def _make_response(self, next_task, request, use_task_lock=True):
        """Once next task has chosen, this function triggers inference
        and prepare the API response"""
        user = request.user
        project = next_task.project

        if use_task_lock:
            # set lock for the task with TTL 3x time more then current average lead time (or 1 hour by default)
            next_task.set_lock(request.user)

        # call machine learning api and format response
        if project.show_collab_predictions:
            for ml_backend in project.ml_backends.all():
                ml_backend.predict_one_task(next_task)

        # serialize task
        context = {
            'request': request,
            'project': project,
            'resolve_uri': True,
            'proxy': bool_from_request(request.GET, 'proxy', True)
        }
        serializer = TaskWithAnnotationsAndPredictionsAndDraftsSerializer(
            next_task, context=context)
        response = serializer.data

        # keep only this user's own annotations, dropping ground-truth/honeypot ones
        annotations = []
        for c in response.get('annotations', []):
            if c.get('completed_by') == user.id and not (c.get('ground_truth') or c.get('honeypot')):
                annotations.append(c)
        response['annotations'] = annotations

        return Response(response)

    @swagger_auto_schema(
        tags=['Projects'],
        responses={
            200: TaskWithAnnotationsAndPredictionsAndDraftsSerializer()
        })
    def get(self, request, *args, **kwargs):
        # Entry point: resolves the project, builds the candidate queryset,
        # then walks the sampling strategies in priority order until one
        # yields an unlocked task.
        project = get_object_with_check_and_log(request, Project,
                                                pk=self.kwargs['pk'])
        # TODO: LSE option
        # if not project.is_published:
        #     raise PermissionDenied('Project is not published.')
        self.check_object_permissions(request, project)
        user = request.user

        # support actions api call from actions/next_task.py
        if hasattr(self, 'prepared_tasks'):
            project.prepared_tasks = self.prepared_tasks
            external_prepared_tasks_used = True
        # get prepared tasks from request params (filters, selected items)
        else:
            project.prepared_tasks = get_prepared_queryset(
                self.request, project)
            external_prepared_tasks_used = False

        # detect solved and not solved tasks
        user_solved_tasks_array = user.annotations.filter(
            ground_truth=False).filter(Q(task__isnull=False)).values_list(
                'task__pk', flat=True)

        with conditional_atomic():
            not_solved_tasks = project.prepared_tasks.\
                exclude(pk__in=user_solved_tasks_array).filter(is_labeled=False)
            not_solved_tasks_count = not_solved_tasks.count()

            # return nothing if there are no tasks remain
            if not_solved_tasks_count == 0:
                raise NotFound(
                    f'There are no tasks remaining to be annotated by the user={user}'
                )
            logger.debug(
                f'{not_solved_tasks_count} tasks that still need to be annotated for user={user}'
            )

            # ordered by data manager
            if external_prepared_tasks_used:
                next_task = not_solved_tasks.first()
                if not next_task:
                    raise NotFound('No more tasks found')
                return self._make_response(next_task, request)

            # If current user has already lock one task - return it (without setting the lock again)
            next_task = Task.get_locked_by(user, project)
            if next_task:
                return self._make_response(next_task, request,
                                           use_task_lock=False)

            if project.show_ground_truth_first:
                logger.debug(
                    f'User={request.user} tries ground truth from {not_solved_tasks_count} tasks'
                )
                next_task = self._try_ground_truth(not_solved_tasks, project)
                if next_task:
                    return self._make_response(next_task, request)

            if project.show_overlap_first:
                # don't output anything - just filter tasks with overlap
                logger.debug(
                    f'User={request.user} tries overlap first from {not_solved_tasks_count} tasks'
                )
                _, not_solved_tasks = self._try_tasks_with_overlap(
                    not_solved_tasks)

            # if there any tasks in progress (with maximum number of annotations), randomly sampling from them
            logger.debug(
                f'User={request.user} tries depth first from {not_solved_tasks_count} tasks'
            )
            next_task = self._try_breadth_first(not_solved_tasks)
            if next_task:
                return self._make_response(next_task, request)

            # fall back to the project's configured sampling strategy
            if project.sampling == project.UNCERTAINTY:
                logger.debug(
                    f'User={request.user} tries uncertainty sampling from {not_solved_tasks_count} tasks'
                )
                next_task = self._try_uncertainty_sampling(
                    not_solved_tasks, project, user_solved_tasks_array)
            elif project.sampling == project.UNIFORM:
                logger.debug(
                    f'User={request.user} tries random sampling from {not_solved_tasks_count} tasks'
                )
                next_task = self._get_random_unlocked(not_solved_tasks)
            elif project.sampling == project.SEQUENCE:
                logger.debug(
                    f'User={request.user} tries sequence sampling from {not_solved_tasks_count} tasks'
                )
                next_task = self._get_first_unlocked(
                    not_solved_tasks.all().order_by('id'))

            if next_task:
                return self._make_response(next_task, request)
            else:
                raise NotFound(
                    f'There exist some unsolved tasks for the user={user}, but they seem to be locked by another users'
                )
return super(ProjectAPI, self).put(request, *args, **kwargs) @method_decorator( name='get', decorator=swagger_auto_schema( tags=['Projects'], operation_summary='Get next task to label', operation_description=""" Get the next task for labeling. If you enable Machine Learning in your project, the response might include a "predictions" field. It contains a machine learning prediction result for this task. """, responses={ 200: TaskWithAnnotationsAndPredictionsAndDraftsSerializer() }) ) # leaving this method decorator info in case we put it back in swagger API docs class ProjectNextTaskAPI(generics.RetrieveAPIView): permission_required = all_permissions.tasks_view serializer_class = TaskWithAnnotationsAndPredictionsAndDraftsSerializer # using it for swagger API docs swagger_schema = None # this endpoint doesn't need to be in swagger API docs def _get_random_unlocked(self, task_query, upper_limit=None): # get random task from task query, ignoring locked tasks n = task_query.count() if n > 0: upper_limit = upper_limit or n random_indices = np.random.permutation(upper_limit) task_query_only = task_query.only('overlap', 'id')