Exemplo n.º 1
0
def user_full_name_change_listener(sender, **kwargs):
    """
    Listener that starts an assignee reindex when a user's full name changes.

    Reads ``instance`` and ``old_instance`` from ``kwargs``. Nothing happens when
    ``old_instance`` is absent or when both objects report the same
    ``get_full_name()``. Otherwise ``reindex_assignee_for_all_documents_in_system``
    is handed to ``call_task_func`` with ``(instance.pk,)`` and no user id.

    :param sender: unused.
    :param kwargs: expected to carry ``instance`` and ``old_instance``.
    """
    previous_user = kwargs.get('old_instance')
    if previous_user is None:
        return

    current_user = kwargs.get('instance')
    name_changed = previous_user.get_full_name() != current_user.get_full_name()
    if not name_changed:
        return

    # Imported lazily, only on the path that actually starts the task.
    from apps.task.tasks import call_task_func
    from apps.rawdb.tasks import reindex_assignee_for_all_documents_in_system
    reindex_args = (current_user.pk,)
    call_task_func(reindex_assignee_for_all_documents_in_system, reindex_args, None)
Exemplo n.º 2
0
def project_name_change_listener(sender, **kwargs):
    """
    Listener that starts a project-wide document reindex when a project is renamed.

    Reads ``instance`` and ``old_instance`` from ``kwargs``. Nothing happens when
    ``old_instance`` is absent or when the ``name`` attribute is the same on both.
    Otherwise ``reindex_all_project_documents`` is handed to ``call_task_func``
    with ``(instance.pk,)`` and no user id.

    :param sender: unused.
    :param kwargs: expected to carry ``instance`` and ``old_instance``.
    """
    previous_project = kwargs.get('old_instance')
    if previous_project is None:
        return

    current_project = kwargs.get('instance')
    renamed = current_project.name != previous_project.name
    if not renamed:
        return

    # Imported lazily, only on the path that actually starts the task.
    from apps.task.tasks import call_task_func
    from apps.rawdb.tasks import reindex_all_project_documents
    reindex_args = (current_project.pk,)
    call_task_func(reindex_all_project_documents, reindex_args, None)
def run_detect_field_values_for_document(document_id: int):
    """
    Start field-value detection for one document through ``call_task_func``.

    ``DetectFieldValues.detect_field_values_for_document`` is passed with the
    argument tuple ``(document_id, False, False)``, no user id and
    ``visible=False``.

    :param document_id: id forwarded as the first task argument.
    """
    from apps.task.tasks import call_task_func
    from apps.document.tasks import DetectFieldValues

    detect_func = DetectFieldValues.detect_field_values_for_document
    # The meaning of the two False flags is defined by the detector, not in this function.
    detect_args = (document_id, False, False)
    call_task_func(detect_func, detect_args, None, visible=False)
Exemplo n.º 4
0
def document_fields_change_listener_impl(_sender,
                                         signal,
                                         log: ProcessLogger,
                                         document_event: str,
                                         document: Document,
                                         field_handlers: Dict[str,
                                                              FieldHandler],
                                         fields_before: Optional[Dict],
                                         fields_after: Optional[Dict],
                                         changed_by_user: User = None):
    """
    Start notification processing for a user-made document field change.

    The function returns without starting anything when:
      * ``changed_by_user`` is falsy (changes made by the system are ignored);
      * both ``fields_before`` and ``fields_after`` are empty (an error is logged);
      * ``APP_VAR_DISABLE_EVENT_NOTIFICATIONS.val`` is truthy.

    Otherwise ``process_notifications_on_document_change`` is handed to
    ``call_task_func`` with the event name, the document pk, both field
    snapshots and the user's pk; the user's pk is also passed as the task user id.

    :param _sender: unused.
    :param signal: unused.
    :param log: logger used to report the "both snapshots empty" case.
    :param document_event: event name forwarded to the notification task.
    :param document: document whose ``pk`` is forwarded.
    :param field_handlers: unused in this function.
    :param fields_before: field values before the change, may be empty/None.
    :param fields_after: field values after the change, may be empty/None.
    :param changed_by_user: user who made the change; falsy means a system change.
    """
    from apps.task.tasks import call_task_func
    from apps.notifications.tasks import process_notifications_on_document_change

    # we ignore changes made by system at the moment
    if not changed_by_user:
        return

    if not (fields_before or fields_after):
        log.error(
            'Document fields changed event appeared with both "before" and "after" fields empty.'
        )
        return

    from apps.notifications.app_vars import APP_VAR_DISABLE_EVENT_NOTIFICATIONS
    notifications_enabled = not APP_VAR_DISABLE_EVENT_NOTIFICATIONS.val
    if notifications_enabled:
        notification_args = (document_event, document.pk, fields_before,
                             fields_after, changed_by_user.pk)
        call_task_func(process_notifications_on_document_change,
                       notification_args,
                       changed_by_user.pk)
def auto_reindex_not_tracked(task: ExtendedTask,
                             document_type_code: str = None,
                             force: bool = False):
    """
    Start ``manual_reindex`` for every document type that needs it.

    Does nothing when ``APP_VAR_DISABLE_RAW_DB_CACHING.val`` is truthy. Otherwise
    the document type with the given code (or all document types when the code is
    None) is examined, and ``manual_reindex`` is started for a type when either:
      * ``adapt_table_structure`` returns a truthy value for it, or
      * ``there_are_non_indexed_docs_not_planned_to_index`` is truthy and
        ``any_other_reindex_task`` finds no entry for this task's request id and
        the type's code.

    Each started reindex is preceded by an info message on ``task`` and runs with
    the argument tuple ``(document_type.code, False)`` under ``task.task.user_id``.

    :param task: running task; provides logging, ``name``, ``request.id`` and ``task``.
    :param document_type_code: restrict the check to this type code; None means all types.
    :param force: forwarded to ``adapt_table_structure`` and reflected in the log message.
    """
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return

    if document_type_code is None:
        doc_types = DocumentType.objects.all()
    else:
        doc_types = [DocumentType.objects.get(code=document_type_code)]

    task_logger = CeleryTaskLogger(task)
    task_record = task.task

    for doc_type in doc_types:
        if adapt_table_structure(task_logger, doc_type, force=force):
            forced_note = ', forced' if force else ''
            reason = (f'Re-index from auto_reindex_not_tracked, {task.name}, '
                      f'for {doc_type}{forced_note}')
        elif there_are_non_indexed_docs_not_planned_to_index(doc_type, task_logger) \
                and not any_other_reindex_task(task.request.id, doc_type.code).exists():
            reason = (f'auto_reindex_not_tracked({doc_type.code}): '
                      f'there_are_non_indexed_docs_not_planned_to_index')
        else:
            # Neither condition holds: leave this document type alone.
            continue

        task.log_info(reason)
        call_task_func(manual_reindex, (doc_type.code, False),
                       task_record.user_id)
Exemplo n.º 6
0
def review_status_save_listener(sender, **kwargs):
    """
    Listener that starts a status-name reindex when a review status is renamed.

    Reads ``instance`` and ``old_instance`` from ``kwargs``. Nothing happens when
    ``old_instance`` is absent or when the ``name`` attribute is the same on both.
    Otherwise ``reindex_status_name_for_all_documents_in_system`` is handed to
    ``call_task_func`` with ``(instance.pk,)`` and no user id.

    :param sender: unused.
    :param kwargs: expected to carry ``instance`` and ``old_instance``.
    """
    previous_status = kwargs.get('old_instance')
    if previous_status is None:
        return

    current_status = kwargs.get('instance')
    renamed = current_status.name != previous_status.name
    if not renamed:
        return

    # Imported lazily, only on the path that actually starts the task.
    from apps.task.tasks import call_task_func
    from apps.rawdb.tasks import reindex_status_name_for_all_documents_in_system
    reindex_args = (current_status.pk,)
    call_task_func(reindex_status_name_for_all_documents_in_system, reindex_args, None)
Exemplo n.º 7
0
def reindex_on_doc_type_change(document_type: DocumentType):
    """
    Start ``auto_reindex_not_tracked`` for a document type on the serial queue.

    Does nothing when ``APP_VAR_DISABLE_RAW_DB_CACHING.val`` is truthy. Otherwise
    the task is handed to ``call_task_func`` with ``(document_type.code,)``, no
    user id and ``queue=settings.CELERY_QUEUE_SERIAL``.

    :param document_type: document type whose ``code`` is passed to the task.
    """
    from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
    caching_enabled = not APP_VAR_DISABLE_RAW_DB_CACHING.val
    if caching_enabled:
        from apps.rawdb.tasks import auto_reindex_not_tracked
        from apps.task.tasks import call_task_func
        reindex_args = (document_type.code,)
        call_task_func(auto_reindex_not_tracked,
                       reindex_args,
                       None,
                       queue=settings.CELERY_QUEUE_SERIAL)
Exemplo n.º 8
0
def plan_process_document_changed(doc_id: int,
                                  system_fields_changed: FieldSpec = True,
                                  generic_fields_changed: FieldSpec = True,
                                  user_fields_changed: bool = True,
                                  changed_by_user_id: int = None):
    """
    Start ``process_document_changed`` for one document through ``call_task_func``.

    All five parameters are forwarded, in order, as the task's argument tuple;
    ``changed_by_user_id`` is additionally passed as the task user id.

    :param doc_id: id of the changed document.
    :param system_fields_changed: forwarded unchanged to the task.
    :param generic_fields_changed: forwarded unchanged to the task.
    :param user_fields_changed: forwarded unchanged to the task.
    :param changed_by_user_id: id of the user who made the change, or None.
    """
    task_args = (
        doc_id,
        system_fields_changed,
        generic_fields_changed,
        user_fields_changed,
        changed_by_user_id,
    )
    call_task_func(process_document_changed, task_args, changed_by_user_id)
Exemplo n.º 9
0
def run_detect_field_values_for_document(dcptrs: DocDetectFieldValuesParams,
                                         user: User = None):
    """
    Start field-value detection through ``call_task_func`` with a parameters object.

    ``DetectFieldValues.detect_field_values_for_document`` is passed with the
    single-element argument tuple ``(dcptrs,)`` and ``visible=False``. The task
    user id is ``user.pk`` when ``user`` is truthy, otherwise None.

    :param dcptrs: detection parameters forwarded as the only task argument.
    :param user: optional user on whose behalf the task is started.
    """
    from apps.task.tasks import call_task_func
    from apps.document.tasks import DetectFieldValues

    detect_func = DetectFieldValues.detect_field_values_for_document
    detect_args = (dcptrs,)
    if user:
        owner_id = user.pk
    else:
        owner_id = None
    call_task_func(detect_func, detect_args, user_id=owner_id, visible=False)
Exemplo n.º 10
0
 def start_task(self, data):
     """
     Start ``manual_reindex`` from submitted task data.

     Reads from ``data``:
       * ``document_type`` - optional object; its ``code`` is used when truthy, else None;
       * ``recreate_tables`` - passed as the force flag, False when missing or falsy;
       * ``project`` - optional object; its ``pk`` is used when truthy, else None;
       * ``user_id`` - required key, passed as the task user id.

     :param data: mapping with the keys listed above.
     """
     doc_type = data.get('document_type', {})
     if doc_type:
         doc_type_code = doc_type.code
     else:
         doc_type_code = None

     force = data.get('recreate_tables')
     if not force:
         force = False

     project = data.get('project')
     project_id = None  # type: Optional[int]
     if project:
         project_id = project.pk

     reindex_args = (doc_type_code, force, project_id)
     call_task_func(manual_reindex, reindex_args, data['user_id'])
Exemplo n.º 11
0
def cache_doc_fields_task_impl(_sender, signal, documents, status_name: str,
                               changed_by_user: User):
    """
    Write a new status for the given documents, then start field caching for them.

    The pks of ``documents`` are collected and passed with ``status_name`` to
    ``RawDbRepository.update_documents_status``. After that
    ``cache_fields_for_docs_queryset`` is handed to ``call_task_func`` with
    ``(documents, changed_by_user, False, True, True, None)`` under the user's pk.

    :param _sender: unused.
    :param signal: unused.
    :param documents: queryset-like object supporting ``values_list('pk', flat=True)``.
    :param status_name: status name passed to the repository update.
    :param changed_by_user: user who made the change; its ``pk`` is the task user id.
    """
    from apps.rawdb.repository.raw_db_repository import RawDbRepository
    from apps.rawdb.tasks import cache_fields_for_docs_queryset
    from apps.task.tasks import call_task_func

    raw_db = RawDbRepository()
    document_pks = list(documents.values_list('pk', flat=True))
    raw_db.update_documents_status(document_pks, status_name)

    # The positional flags are interpreted by cache_fields_for_docs_queryset.
    caching_args = (documents, changed_by_user, False, True, True, None)
    call_task_func(cache_fields_for_docs_queryset, caching_args, changed_by_user.pk)
def run_detect_field_values_for_document(document_id: int):
    """
    Start field-value detection for one document in every custom app that offers it.

    For each name in ``urls.custom_apps`` the already-imported module
    ``apps.<name>.tasks`` is looked up in ``sys.modules``. When it exposes a truthy
    ``DetectFieldValues`` attribute that has ``detect_field_values_for_document``,
    that callable is handed to ``call_task_func`` with ``(document_id, False)``,
    no user id and ``visible=False``. Apps whose tasks module is not loaded, or
    which lack the detector, are skipped.

    :param document_id: id forwarded as the first task argument.
    """
    from urls import custom_apps
    from apps.task.tasks import call_task_func

    method_name = 'detect_field_values_for_document'
    for app_name in custom_apps:
        # Only modules that are already imported are considered; nothing is imported here.
        tasks_module = sys.modules.get('apps.%s.tasks' % app_name)
        detector = getattr(tasks_module, 'DetectFieldValues', None)
        if not detector or not hasattr(detector, method_name):
            continue
        call_task_func(getattr(detector, method_name),
                       (document_id, False),
                       None,
                       visible=False)
Exemplo n.º 13
0
def reindex_on_field_change(document_field: DocumentField):
    """
    Start ``auto_reindex_not_tracked`` for the document type that owns a field.

    Does nothing when ``APP_VAR_DISABLE_RAW_DB_CACHING.val`` is truthy or when
    ``document_field.document_type`` is falsy. Otherwise the task is handed to
    ``call_task_func`` with ``(document_type.code,)``, no user id and
    ``queue=settings.CELERY_QUEUE_SERIAL``. A ``DocumentField.DoesNotExist``
    raised while doing so is silently ignored.

    :param document_field: field whose ``document_type`` should be reindexed.
    """
    from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return

    from apps.rawdb.tasks import auto_reindex_not_tracked
    from apps.task.tasks import call_task_func
    from apps.document.models import DocumentField

    # NOTE(review): the only lookup visible in the guarded code is
    # document_field.document_type; confirm DocumentField.DoesNotExist is really
    # the exception raised when that related row is missing.
    try:
        owning_type = document_field.document_type
        if owning_type:
            call_task_func(auto_reindex_not_tracked,
                           (owning_type.code,),
                           None,
                           queue=settings.CELERY_QUEUE_SERIAL)
    except DocumentField.DoesNotExist:
        return
Exemplo n.º 14
0
def plan_reindex_tasks_in_chunks(all_doc_ids: Iterable,
                                 changed_by_user_id: int = None,
                                 cache_system_fields: FieldSpec = True,
                                 cache_generic_fields: FieldSpec = True,
                                 cache_user_fields: bool = True):
    """
    Plan reindexing of documents as a series of main tasks.

    The ids are split into chunks of ``DOC_NUM_PER_MAIN_TASK`` and one
    ``index_documents`` main task is started per chunk, so that no single task
    carries an argument list large enough to overload rabbitmq. The main tasks
    show up in the admin task list and may themselves fan out into sub-tasks.

    :param all_doc_ids: ids of the documents to reindex.
    :param changed_by_user_id: forwarded to each task and used as its user id.
    :param cache_system_fields: forwarded unchanged to ``index_documents``.
    :param cache_generic_fields: forwarded unchanged to ``index_documents``.
    :param cache_user_fields: forwarded unchanged to ``index_documents``.
    """
    for id_batch in chunks(all_doc_ids, DOC_NUM_PER_MAIN_TASK):
        batch_args = (
            id_batch,
            changed_by_user_id,
            cache_system_fields,
            cache_generic_fields,
            cache_user_fields,
        )
        call_task_func(index_documents, batch_args, changed_by_user_id)
Exemplo n.º 15
0
def plan_process_documents_status_changed(doc_ids: Iterable, new_status_id: int, changed_by_user_id: int):
    """
    Plan processing of a status change for a set of documents.

    The ids are split into chunks of ``DOC_NUMBER_PER_MAIN_TASK`` and one
    ``process_documents_status_changed`` main task is started per chunk, so that
    a very large id set cannot overload rabbitmq. Each main task is shown in the
    admin task list and may start any number of sub-tasks.

    :param doc_ids: ids of the documents whose status changed.
    :param new_status_id: id of the new status, forwarded to each task.
    :param changed_by_user_id: forwarded to each task and used as its user id.
    :return: None
    """
    for id_batch in chunks(doc_ids, DOC_NUMBER_PER_MAIN_TASK):
        batch_args = (id_batch, new_status_id, changed_by_user_id)
        call_task_func(process_documents_status_changed, batch_args, changed_by_user_id)
Exemplo n.º 16
0
def update_documents_assignees_impl(_sender, signal, documents,
                                    assignee_id: int, changed_by_user: User):
    """
    Reassign documents and start field caching with the previous assignee values.

    Before the update, each document's current ``assignee_id`` and assignee
    ``username`` (empty string when there is no assignee) are recorded, keyed by
    document pk. The new assignee is then written through
    ``RawDbRepository.update_documents_assignees`` and
    ``cache_fields_for_docs_queryset`` is handed to ``call_task_func`` with the
    recorded old values as its last argument, under the user's pk.

    :param _sender: unused.
    :param signal: unused.
    :param documents: iterable queryset-like object supporting ``values_list``.
    :param assignee_id: id of the new assignee.
    :param changed_by_user: user who made the change; its ``pk`` is the task user id.
    """
    from apps.rawdb.repository.raw_db_repository import RawDbRepository
    from apps.rawdb.tasks import cache_fields_for_docs_queryset
    from apps.task.tasks import call_task_func

    raw_db = RawDbRepository()
    document_pks = list(documents.values_list('pk', flat=True))

    # Snapshot must be taken before the repository update overwrites the assignee.
    previous_values = {}
    for doc in documents:
        doc_pk = doc.pk
        previous_assignee_id = doc.assignee_id
        previous_assignee = doc.assignee
        previous_values[doc_pk] = {
            'assignee_id': previous_assignee_id,
            'assignee_name': previous_assignee.username if previous_assignee else '',
        }

    raw_db.update_documents_assignees(document_pks, assignee_id)

    caching_args = (documents, changed_by_user, False, True, True, None,
                    previous_values)
    call_task_func(cache_fields_for_docs_queryset, caching_args,
                   changed_by_user.pk)
Exemplo n.º 17
0
def _trigger_retraining_model(document, field, user_id):
    """
    Start field-detector model training if retraining on annotation change is enabled.

    Does nothing unless ``settings.ANNOTATOR_RETRAIN_MODEL_ON_ANNOTATIONS_CHANGE``
    is truthy. Otherwise ``TrainDocumentFieldDetectorModel.train_model_for_field``
    is handed to ``call_task_func`` with
    ``(document.document_type_id, field.uid, None, True)`` under ``user_id``.

    :param document: document whose ``document_type_id`` is passed to the task.
    :param field: field whose ``uid`` is passed to the task.
    :param user_id: id used as the task user id.
    """
    if not settings.ANNOTATOR_RETRAIN_MODEL_ON_ANNOTATIONS_CHANGE:
        return

    training_args = (document.document_type_id, field.uid, None, True)
    call_task_func(TrainDocumentFieldDetectorModel.train_model_for_field,
                   training_args,
                   user_id=user_id)
Exemplo n.º 18
0
    def process(self, **kwargs):
        """
        Remove an upload session together with its tasks, documents and stored files,
        then start a reindex of the session's project.

        Steps, each followed by a ``track_timelog`` entry:
          1. purge every main task whose metadata references the session;
          2. bulk-delete the session's documents, with up to 3 attempts 60 seconds apart;
          3. delete the session's files and then the session folder from file storage;
          4. delete the ``UploadSession`` row;
          5. start ``reindex_all_project_documents`` for the session's project.

        If every delete attempt in step 2 fails, the error is logged and the
        remaining steps still run.

        :param kwargs: must contain ``session_id``.
        :raises KeyError: when ``session_id`` is missing from ``kwargs``.
        :raises Exception: whatever ``UploadSession.objects.get`` raises when no
            session matches, or the explicit "Couldn't find session" error.
        """
        session_id = kwargs['session_id']
        session = UploadSession.objects.get(pk=session_id)

        # 1. Purge Tasks
        self.track_timelog('')
        session_tasks = Task.objects.main_tasks().filter(metadata__session_id=session_id)
        self.log_info(f'Purge {session_tasks.count()} session tasks.')
        for a_task in session_tasks:
            try:
                purge_task(a_task.id)
            except Exception:
                # Deliberate best-effort: the task may already have been deleted as a
                # subtask. Only Exception is caught so that SystemExit and
                # KeyboardInterrupt still propagate.
                pass
        self.track_timelog('1 - purge tasks')

        # 2. Remove Documents+
        document_ids = \
            list(Document.objects.filter(upload_session_id=session_id).values_list('pk', flat=True))
        self.log_info(f'Remove {len(document_ids)} documents')
        from apps.document.repository.document_bulk_delete import get_document_bulk_delete

        # TODO: WHY it fails with
        # psycopg2.errors.ForeignKeyViolation: update or delete
        # on table "document_textunit" violates foreign key constraint
        attempts = 3
        delay = 60
        attempts_made = 0
        delete_manager = get_document_bulk_delete()
        error_logged = False

        for attempt in range(1, attempts + 1):
            attempts_made = attempt
            try:
                delete_manager.delete_documents(document_ids)
                break
            except Exception as e:
                # The full traceback is logged once; later failures only get the short line.
                if not error_logged:
                    self.log_error('Error while deleting documents', exc_info=e)
                    error_logged = True
                self.log_info(f'Attempt #{attempt} of {attempts} to delete documents failed, retry')

                # Waiting only makes sense when another attempt will follow.
                if attempt < attempts:
                    time.sleep(delay)
        self.track_timelog(f'2 - bulk delete for {len(document_ids)} documents')
        if attempts_made > 1:
            self.log_error(f'{attempts_made} of {attempts} tried to delete documents')

        # 3. Remove files
        file_storage_exists = file_storage.document_exists(session_id)
        self.log_info(f'File Storage exists: {file_storage_exists}')

        # These counters only track the deletion of the session folder itself;
        # the individual files removed in the loop below are not counted.
        files_removed, failed_removing = (0, 0)
        if file_storage_exists:
            files = file_storage.list_documents(session_id)
            self.log_info(f'Remove {len(files)} files from File Storage.')
            for file_path in files:
                file_storage.delete_document(file_path)
            try:
                file_storage.delete_document(session_id)
                files_removed += 1
            except Exception:
                # TODO: removing folders through LocalStorage is not implemented
                failed_removing += 1
        self.track_timelog(f'3 - remove files ({files_removed} removed, {failed_removing} failed)')

        # 4. Remove Upload Session
        # NOTE(review): a standard Django manager's get() raises DoesNotExist instead
        # of returning a falsy value, so this guard is presumably unreachable - confirm.
        if not session:
            raise Exception(f"Couldn't find session by id ({session_id})")

        self.log_info(f'Remove session uid="{session_id}".')
        # Keep a reference to the project before the session row is deleted.
        project = session.project
        session.delete()
        self.track_timelog('4 - delete session')

        # 5. Reindex Project
        self.log_info(f'Reindex project id="{project.id}" documents.')
        from apps.rawdb.tasks import reindex_all_project_documents
        call_task_func(reindex_all_project_documents, (project.pk,), None)
        self.track_timelog('5 - reindex project')
Exemplo n.º 19
0
    def process(self,
                document_type: DocumentType = None,
                project_ids=None,
                document_name: str = None,
                do_not_run_for_modified_documents=True,
                do_not_write=False,
                **kwargs):
        """
        Run field-value detection as sub-tasks, for one document or for a filtered set.

        First ``auto_reindex_not_tracked`` is handed to ``call_task_func`` on the
        serial queue with ``run_after_sub_tasks_finished=True`` and this task as its
        main task. Then:

          * if ``kwargs['document_id']`` is truthy, a single sub-task is started for
            that document with the arguments ``(document_id, False, True)`` and the
            method returns;
          * otherwise active documents are selected by the first matching filter -
            ``document_name``, then ``project_ids``, then the document types that
            have fields - and one sub-task per document is started with the
            arguments ``(doc_id, do_not_write, True)``.

        When none of the candidate document types has fields, no type filter is
        applied and every active document that passes the other checks is processed.

        :param document_type: document type instance, or a dict with a ``pk`` key
            that is resolved to an instance; None means all document types.
        :param project_ids: optional collection of project ids to restrict to. The
            default used to be the ``list`` type itself, which is truthy and ended up
            in the ``project_id__in`` filter; it is now None.
        :param document_name: optional exact document name to restrict to.
        :param do_not_run_for_modified_documents: when truthy, skip documents that
            have a field value with ``created_by`` set or ``removed_by_user`` True.
        :param do_not_write: forwarded to each per-document sub-task.
        :param kwargs: may contain ``document_id`` for single-document mode.
        """
        self.log_info("Going to detect document field values based on "
                      "the pre-coded regexps and field values entered by users...")

        if isinstance(document_type, dict):
            document_type = DocumentType.objects.get(pk=document_type['pk'])

        # reindex document grid fields cache after detecting fields
        from apps.rawdb.tasks import auto_reindex_not_tracked
        doc_type_code = document_type.code \
            if document_type and hasattr(document_type, 'code') else None
        call_task_func(auto_reindex_not_tracked, (doc_type_code,), None,
                       queue=settings.CELERY_QUEUE_SERIAL,
                       run_after_sub_tasks_finished=True,
                       main_task_id=self.request.id)

        document_id = kwargs.get('document_id')
        if document_id:
            self.set_push_steps(1)

            self.run_sub_tasks('Detect Field Values For Single Document',
                               DetectFieldValues.detect_field_values_for_document,
                               [(document_id, False, True)])
            self.push()
            return

        # A separate loop variable keeps the document_type parameter intact.
        candidate_types = [document_type] if document_type else DocumentType.objects.all()
        document_type_pks = []
        for candidate_type in candidate_types:
            if candidate_type.pk and candidate_type.fields.exists():
                document_type_pks.append(candidate_type.pk)
            else:
                self.log_info(f'Can not find any fields assigned to document type: {candidate_type}')

        # document_id is always falsy here (single-document mode returned above),
        # so it takes no part in the filter chain.
        qs = Document.objects.filter(status__is_active=True)
        if document_name:
            qs = qs.filter(name=document_name)
        elif project_ids:
            qs = qs.filter(project_id__in=project_ids)
        elif document_type_pks:
            qs = qs.filter(document_type_id__in=document_type_pks)

        # filter out modified documents
        if do_not_run_for_modified_documents:
            modified_document_ids = DocumentFieldValue.objects \
                .filter(Q(created_by__isnull=False) | Q(removed_by_user=True)) \
                .distinct('document_id') \
                .values_list('document_id')
            qs = qs.exclude(pk__in=Subquery(modified_document_ids))

        detect_field_values_for_document_args = []
        source_data = []
        for doc_id, source, name in qs.values_list('id', 'source', 'name'):
            detect_field_values_for_document_args.append((doc_id, do_not_write, True))
            # One display string per sub-task: "source/name", or just the name.
            source_data.append(f'{source}/{name}' if source else name)

        self.run_sub_tasks('Detect Field Values For Each Document',
                           DetectFieldValues.detect_field_values_for_document,
                           detect_field_values_for_document_args, source_data)

        task_count = len(detect_field_values_for_document_args)
        if task_count > 0:
            self.log_info(f'Found {task_count} documents')
        else:
            self.log_info('No documents found')
Exemplo n.º 20
0
 def get_json_data(self, request, *args, **kwargs):
     """
     Start the ``clean_tasks`` task on the serial queue and return a status message.

     The task is handed to ``call_task_func`` with no arguments, under the
     requesting user's pk.

     :param request: request whose ``user.pk`` is used as the task user id.
     :return: the string ``'Cleaning task started.'``.
     """
     requester_id = request.user.pk
     call_task_func(clean_tasks,
                    (),
                    requester_id,
                    queue=settings.CELERY_QUEUE_SERIAL)
     status_message = 'Cleaning task started.'
     return status_message
Exemplo n.º 21
0
 def start_task(self, data):
     """
     Start ``manual_reindex`` from submitted task data.

     Reads from ``data``:
       * ``document_type`` - optional object; its ``code`` is used when truthy, else None;
       * ``recreate_tables`` - passed as the force flag, False when missing or falsy;
       * ``user_id`` - required key, passed as the task user id.

     :param data: mapping with the keys listed above.
     """
     doc_type = data.get('document_type', {})
     if doc_type:
         doc_type_code = doc_type.code
     else:
         doc_type_code = None

     force = data.get('recreate_tables')
     if not force:
         force = False

     reindex_args = (doc_type_code, force)
     call_task_func(manual_reindex, reindex_args, data['user_id'])