Example No. 1
def manual_reindex(task: ExtendedTask,
                   document_type_code: str = None,
                   force: bool = False,
                   project_id: Optional[int] = None):
    from apps.rawdb.app_vars import APP_VAR_DISABLE_RAW_DB_CACHING
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        task.log_info(
            'Document caching to raw tables is disabled in Commons / App Vars')
        return
    run_parameters = {'document type': document_type_code}
    if project_id:
        run_parameters['project'] = project_id
    if force:
        run_parameters['force'] = True
    ptrs_str = ', '.join([f'{p}={run_parameters[p]}' for p in run_parameters])

    msg = f'manual_reindex called for {ptrs_str}. ' \
          f'Task: {task.task_name}, main id: {task.main_task_id}'
    log = CeleryTaskLogger(task)
    log.info(msg)
    adapt_tables_and_reindex(task, document_type_code, force, True, project_id)
    def train_model_for_dirty_field(task: ExtendedTask,
                                    dirty_field_id: Any) -> None:
        dirty_field = DocumentTypeField.objects \
            .filter(pk=dirty_field_id) \
            .prefetch_related('document_type', 'document_field')[0]

        if dirty_field.can_retrain():
            dirty_field.dirty = False
            dirty_field.save()
            document_type = dirty_field.document_type
            field = dirty_field.document_field
            train_docs_count = field_detection_utils.get_approved_documents_number(
                document_type, field, None)
            if train_docs_count >= settings.ML_TRAIN_DATA_SET_GROUP_LEN:
                new_model = field_detection.train_document_field_detector_model(
                    CeleryTaskLogger(task), document_type, field, None)
                if new_model:
                    ClassifierModel.objects.filter(
                        document_type=document_type,
                        document_field=field).delete()
                    new_model.save()
def auto_reindex_not_tracked(task: ExtendedTask,
                             document_type_code: str = None,
                             force: bool = False):
    if APP_VAR_DISABLE_RAW_DB_CACHING.val:
        return
    document_types = [DocumentType.objects.get(code=document_type_code)] \
        if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)

    task_model = task.task

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log, document_type, force=force)
        if reindex_needed:
            call_task_func(manual_reindex, (document_type.code, False),
                           task_model.user_id)
        else:
            if there_are_non_indexed_docs_not_planned_to_index(document_type) \
                    and not any_other_reindex_task(task.request.id, document_type.code).exists():
                call_task_func(manual_reindex, (document_type.code, False),
                               task_model.user_id)
Example No. 4
def process_document_changed(task: ExtendedTask,
                             doc_id: int,
                             system_fields_changed: FieldSpec = True,
                             generic_fields_changed: FieldSpec = True,
                             user_fields_changed: bool = True,
                             changed_by_user_id: int = None):
    from apps.document.repository.document_field_repository import DocumentFieldRepository

    dfr = DocumentFieldRepository()

    doc = Document.objects.get(pk=doc_id)  # type: Document
    changed_by_user = User.objects.get(pk=changed_by_user_id) if changed_by_user_id is not None else None
    if DocumentSystemField.status.specified_in(system_fields_changed):
        dfr.delete_hidden_field_values_if_needed(doc, event_sender=task)
    fire_document_changed(sender=task,
                          log=CeleryTaskLogger(task),
                          document=doc,
                          changed_by_user=changed_by_user,
                          document_initial_load=False,
                          system_fields_changed=system_fields_changed,
                          generic_fields_changed=generic_fields_changed,
                          user_fields_changed=user_fields_changed)
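The system_fields_changed / generic_fields_changed arguments above are FieldSpec values: True appears to mean "all fields changed", while a collection lists the changed field codes explicitly. A rough, self-contained sketch of such a specified_in check (the exact FieldSpec semantics are assumed from the call sites, not taken from its source):

from typing import Collection, Union

FieldSpecLike = Union[bool, Collection[str]]  # assumed shape, for illustration only

def specified_in(field_code: str, spec: FieldSpecLike) -> bool:
    # True -> every field counts as changed; False/None/empty -> nothing changed;
    # otherwise spec is an explicit collection of changed field codes.
    if spec is True:
        return True
    if not spec:
        return False
    return field_code in spec

assert specified_in('status', True)
assert not specified_in('status', ['assignee_id'])
assert specified_in('assignee_id', ['assignee_id'])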
Example No. 5
    def detect_field_values_for_document(task: ExtendedTask,
                                         detect_ptrs: DocDetectFieldValuesParams):
        doc = Document.all_objects.get(pk=detect_ptrs.document_id)
        log = CeleryTaskLogger(task)

        # If the document is in one of completed statuses then
        # the detected values won't be stored even if do_not_write = False.
        # But caching should go as usual.
        dfvs = field_detection \
            .detect_and_cache_field_values_for_document(
                log,
                doc,
                changed_by_user=task.task.user,
                save=not detect_ptrs.do_not_write,
                clear_old_values=detect_ptrs.clear_old_values,
                updated_field_codes=detect_ptrs.updated_field_codes,
                skip_modified_values=detect_ptrs.skip_modified_values,
                task=field_detection.detect_and_cache_field_values_for_document)

        task.log_info(f'Detected {len(dfvs)} field values for document ' +
                      f'#{detect_ptrs.document_id} ({doc.name})',
                      extra={Document.LOG_FIELD_DOC_ID: str(doc.pk),
                             Document.LOG_FIELD_DOC_NAME: doc.name})
Example No. 6
def cache_fields_for_docs_queryset(
        task: ExtendedTask,
        doc_qr,
        changed_by_user: User = None,
        document_initial_load: bool = False,
        generic_fields_changed: bool = True,
        user_fields_changed: bool = True,
        pre_detected_field_values: Optional[Dict[str, Any]] = None,
        old_field_values: Dict[int, Dict[str, Any]] = None):
    from apps.rawdb.field_value_tables import cache_document_fields
    old_field_values = old_field_values or {}
    for doc in doc_qr.select_related('document_type', 'project',
                                     'status'):  # type: Document
        log = CeleryTaskLogger(task)
        cache_document_fields(log=log,
                              document=doc,
                              cache_generic_fields=generic_fields_changed,
                              cache_user_fields=user_fields_changed,
                              pre_detected_field_codes_to_suggested_values=pre_detected_field_values,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              old_field_values=old_field_values.get(doc.pk))
Example No. 7
    def load_doc(task: ExtendedTask,
                 document: Document,
                 document_fields: Dict,
                 run_detect_field_values: bool,
                 filed_owners: dict = None):
        filed_owners = filed_owners if filed_owners else {}
        fields_to_values = LoadDocumentWithFields.load_field_values(
            task, document, document_fields, filed_owners)
        log = CeleryTaskLogger(task)

        with transaction.atomic():
            new_document = document.pk is None
            document.save(force_insert=new_document)
            if not new_document:
                DocumentFieldValue.objects \
                    .filter(document=document,
                            removed_by_user=False,
                            created_by__isnull=True,
                            modified_by__isnull=True) \
                    .delete()

            for field, values in fields_to_values.items():
                field_detection.save_detected_values(document, field, values)

            if run_detect_field_values:
                field_detection.detect_and_cache_field_values_for_document(
                    log, document, True)
            else:
                dfvs = field_detection.detect_and_cache_field_values_for_document(
                    log, document, False)
                field_value_cache.cache_field_values(document,
                                                     dfvs,
                                                     save=True,
                                                     log=log)

        task.log_info('Loaded {0} field values for document #{1} ({2})'.format(
            len(fields_to_values), document.pk, document.name))
    def send_notifications_packet(ntfs: List[DocumentNotification],
                                  event: str,
                                  task: BaseTask):
        documents_data = list(Document.all_objects.filter(
            pk__in={d.document_id for d in ntfs}))  # type: List[Document]
        doc_type_by_id = {dt.document_type.pk: dt.document_type for dt in documents_data}
        doc_types = [doc_type_by_id[pk] for pk in doc_type_by_id]

        doc_by_id = {}  # type: Dict[int, Document]
        for doc in documents_data:
            doc_by_id[doc.pk] = doc

        users = User.objects.filter(pk__in={d.changed_by_user_id for d in ntfs})
        user_by_id = {u.pk: u for u in users}

        handlers_by_doctype = {d: build_field_handlers(d, include_annotation_fields=False)
                               for d in doc_types}  # type: Dict[DocumentType, List[RawdbFieldHandler]]

        log = CeleryTaskLogger(task)

        # { (doc_type, event,) : [notification0, notification1, ...], ... }
        messages_by_subscr_key = {}  # type: Dict[Tuple[str, str], List[DocumentNotification]]
        # { (doc_type, event,) : [DocumentNotificationSubscription0, ... ], ... }
        subscr_by_key = {}  # type: Dict[Tuple[str, str], List[DocumentNotificationSubscription]]

        for ntf in ntfs:
            if ntf.document_id not in doc_by_id:
                continue
            document = doc_by_id[ntf.document_id]
            key = (document.document_type, ntf.event,)
            if key in messages_by_subscr_key:
                messages_by_subscr_key[key].append(ntf)
            else:
                subscriptions = DocumentNotificationSubscription.objects \
                    .filter(enabled=True,
                            document_type=document.document_type,
                            event=event,
                            recipients__isnull=False) \
                    .select_related('specified_user', 'specified_role') \
                    .prefetch_related(Prefetch('user_fields',
                                               queryset=DocumentField.objects.all().order_by('order')))
                subscr_by_key[key] = subscriptions
                messages_by_subscr_key[key] = [ntf]

        notifications_to_send = []  # type: List[RenderedNotification]

        for key in messages_by_subscr_key:
            messages = messages_by_subscr_key[key]
            subscriptions = subscr_by_key[key]
            for sub in subscriptions:
                for msg_pack in chunks(messages, sub.max_stack):
                    # render pack of notifications or just one notification
                    if len(msg_pack) < 2:
                        # render single notification
                        if msg_pack[0].document_id not in doc_by_id or \
                                not doc_by_id[msg_pack[0].document_id]:
                            raise Exception(f'Error in send_notifications_packet(1): doc '
                                            f'with id={msg_pack[0].document_id} was not obtained')
                        document = doc_by_id[msg_pack[0].document_id]
                        handlers = handlers_by_doctype[document.document_type]
                        user = user_by_id[msg_pack[0].changed_by_user_id]

                        try:
                            notification = NotificationRenderer.render_notification(
                                msg_pack[0].package_id,
                                sub,
                                DocumentNotificationSource(
                                    document=document,
                                    field_handlers=handlers,
                                    field_values=msg_pack[0].field_values,
                                    changes=msg_pack[0].changes,
                                    changed_by_user=user))
                            if notification:
                                notifications_to_send.append(notification)
                        except Exception as e:
                            log.error(f'Error in send_notifications_packet(1), '
                                      f'sending render_notification()', exc_info=e)
                    else:
                        not_sources = []  # type: List[DocumentNotificationSource]
                        # render pack of notifications in a single message
                        for msg in msg_pack:
                            if msg.document_id not in doc_by_id or \
                                    not doc_by_id[msg.document_id]:
                                raise Exception(f'Error in send_notifications_packet({len(msg_pack)}): doc '
                                                f'with id={msg.document_id} was not obtained')

                            document = doc_by_id[msg.document_id]
                            handlers = handlers_by_doctype[document.document_type]
                            user = user_by_id[msg.changed_by_user_id]
                            not_src = DocumentNotificationSource(
                                document=document,
                                field_handlers=handlers,
                                field_values=msg.field_values,
                                changes=msg.changes,
                                changed_by_user=user)
                            not_sources.append(not_src)
                        try:
                            notifications = NotificationRenderer.render_notification_pack(
                                [m.package_id for m in msg_pack],
                                sub, not_sources)
                            notifications_to_send += notifications
                        except Exception as e:
                            log.error(f'Error in send_notifications_packet(), '
                                      f'sending render_notification_pack()', exc_info=e)

        log.info(f'notification.send({len(notifications_to_send)})')
        for notification in notifications_to_send:
            notification.send(log=log)
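The batching logic above groups notifications by a (document type, event) key and then slices each group by the subscription's max_stack before rendering. A stripped-down sketch of that pattern, with the project's chunks() helper re-implemented for illustration:

from collections import defaultdict
from typing import Any, Dict, List, Tuple

def chunks(items: List[Any], size: int):
    # illustrative stand-in for the chunks() helper used above
    size = max(size or 1, 1)
    for i in range(0, len(items), size):
        yield items[i:i + size]

# toy notifications: (doc_type_code, event, document_id)
ntfs = [('CONTRACT', 'changed', 1), ('CONTRACT', 'changed', 2), ('LEASE', 'assigned', 3)]

messages_by_key: Dict[Tuple[str, str], List[Any]] = defaultdict(list)
for ntf in ntfs:
    messages_by_key[(ntf[0], ntf[1])].append(ntf)

max_stack = 2  # would come from the DocumentNotificationSubscription
for key, messages in messages_by_key.items():
    for msg_pack in chunks(messages, max_stack):
        kind = 'single notification' if len(msg_pack) < 2 else f'pack of {len(msg_pack)}'
        print(key, '->', kind)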
def process_notifications_on_document_change(task: ExtendedTask,
                                             document_event: str,
                                             document_id: int,
                                             fields_before: Optional[Dict],
                                             fields_after: Optional[Dict],
                                             changed_by_user_id: int):
    document_type_id = Document.all_objects.filter(pk=document_id).values_list(
        'document_type', flat=True).first()  # type: str
    document_type = DocumentType.objects.get(pk=document_type_id)  # type: DocumentType
    changed_by_user = User.objects.get(pk=changed_by_user_id)  # type: User
    field_handlers = build_field_handlers(document_type,
                                          include_annotation_fields=False)  # List[RawdbFieldHandler]
    field_handlers_by_field_code = {h.field_code: h for h in field_handlers}  # Dict[str, RawdbFieldHandler]

    log_msgs = []
    package_id = uuid.uuid4().hex

    if document_event == DocumentEvent.CREATED.value or fields_before is None:
        if fields_after.get(FIELD_CODE_ASSIGNEE_ID) is not None:
            send_notification(package_id=package_id,
                              event=DocumentAssignedEvent.code,
                              document_id=document_id,
                              field_values=fields_after,
                              changed_by_user=changed_by_user)

        send_notification(package_id=package_id,
                          event=DocumentLoadedEvent.code,
                          document_id=document_id,
                          field_values=fields_after,
                          changed_by_user=changed_by_user)
    elif document_event == DocumentEvent.DELETED.value:
        send_notification(package_id=package_id,
                          event=DocumentDeletedEvent.code,
                          document_id=document_id,
                          field_values=fields_before,
                          changed_by_user=changed_by_user)
    else:
        changes = dict()
        for field_code, old_value in fields_before.items():
            if field_code not in field_handlers_by_field_code:
                continue
            new_value = fields_after.get(field_code)
            if not values_look_equal(old_value, new_value):
                changes[field_code] = (old_value, new_value)
                log_msgs.append(format_values_difference(field_code, old_value, new_value))

        if not changes:
            return

        if len(log_msgs) > 0:
            msgs_str = 'Following fields are different:\n    ' + '\n    '.join(log_msgs)
            log = CeleryTaskLogger(task)
            log.info(msgs_str)

        if FIELD_CODE_ASSIGNEE_ID in changes:
            send_notification(package_id=package_id,
                              event=DocumentAssignedEvent.code,
                              document_id=document_id,
                              field_values=fields_after,
                              changes=changes,
                              changed_by_user=changed_by_user)

        send_notification(package_id=package_id,
                          event=DocumentChangedEvent.code,
                          document_id=document_id,
                          field_values=fields_after,
                          changes=changes,
                          changed_by_user=changed_by_user)
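The change-detection branch above is essentially a before/after diff over the field codes that have handlers; a self-contained sketch of that diff (plain != is used here where the task relies on the project's values_look_equal helper):

from typing import Any, Dict, Set, Tuple

def diff_field_values(before: Dict[str, Any], after: Dict[str, Any],
                      handled_codes: Set[str]) -> Dict[str, Tuple[Any, Any]]:
    # Collect (old, new) pairs for handled fields whose values differ.
    changes = {}
    for code, old_value in before.items():
        if code not in handled_codes:
            continue
        new_value = after.get(code)
        if old_value != new_value:  # the real task uses values_look_equal()
            changes[code] = (old_value, new_value)
    return changes

print(diff_field_values({'assignee_id': 1, 'status': 'Loaded'},
                        {'assignee_id': 2, 'status': 'Loaded'},
                        {'assignee_id', 'status'}))  # {'assignee_id': (1, 2)}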
Example No. 10
def refresh_materialized_view(_celery_task, view_name: str):
    mat_views_repo = MaterializedViews()
    mat_views_repo.refresh_materialized_view(CeleryTaskLogger(_celery_task), view_name)
Example No. 11
    def process(self, **kwargs):
        dst_field = kwargs['field']
        dst_field = DocumentField.objects.filter(pk=dst_field['pk']) \
            .prefetch_related('depends_on_fields') \
            .select_related(DST_FIELD_SIMILARITY_CONFIG_ATTR) \
            .first()  # type: DocumentField

        if not dst_field:
            raise RuntimeError('Document field not found: {0}'.format(kwargs['field']))

        proj = kwargs['project']
        proj_id = proj['pk'] if proj else None  # type:Optional[int]
        doc_query = Document.objects.filter(document_type=dst_field.document_type,
                                            project_id=proj_id) if proj_id \
            else Document.objects.filter(document_type=dst_field.document_type)

        config = getattr(dst_field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: DocumentSimilarityConfig

        config.self_validate()

        similarity_threshold = config.similarity_threshold
        feature_vector_fields = list(dst_field.depends_on_fields.all())
        feature_vector_field_codes = [f.code for f in feature_vector_fields]

        self.log_info('{field}: Min similarity: {threshold}'
                      .format(field=dst_field.code, threshold=similarity_threshold))

        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()
        qr_doc_ids = doc_query.values_list('pk', flat=True)
        doc_ids_to_code_to_value = field_repo \
            .get_field_code_to_python_value_multiple_docs(document_type_id=dst_field.document_type_id,
                                                          doc_ids=qr_doc_ids,
                                                          field_codes_only=feature_vector_field_codes)

        field_values_list = list()
        for doc_id, values in doc_ids_to_code_to_value:
            values[FIELD_CODE_DOC_ID] = doc_id
            field_values_list.append(values)

        total_docs = len(field_values_list)

        self.set_push_steps(int(5 + total_docs / 100))

        self.push()
        self.log_info(
            '{field}: Building feature vectors for {n} documents'.format(field=dst_field.code, n=total_docs))

        vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)
        feature_vectors = vectorizer.fit_transform(field_values_list)

        self.push()
        self.log_info('{field}: Finding similar documents (similarity >= {threshold})'
                      .format(field=dst_field.code, threshold=similarity_threshold))

        doc_ids_to_values = defaultdict(set)
        for x, doc_a_field_values in enumerate(field_values_list):
            doc_a_pk = doc_a_field_values[FIELD_CODE_DOC_ID]
            similarities = cosine_similarity(feature_vectors[x], feature_vectors)
            for y, doc_b_field_values in enumerate(field_values_list):
                doc_b_pk = doc_b_field_values[FIELD_CODE_DOC_ID]
                if doc_a_pk == doc_b_pk:
                    continue
                similarity = similarities[0, y]
                if similarity < similarity_threshold:
                    continue
                doc_ids_to_values[doc_a_pk].add(doc_b_pk)
                doc_ids_to_values[doc_b_pk].add(doc_a_pk)
            if x % 100 == 0:
                self.log_info('{field}: Checked for similarity {x} documents of {n}'
                              .format(field=dst_field.code, x=x + 1, n=total_docs))
                self.push()

        self.push()
        self.log_info('{field}: Found {n} similar documents. Storing links into the document fields.'
                      .format(field=dst_field.code, n=len(doc_ids_to_values)))

        doc_ids_to_values = {doc_id: list(v) if v else None for doc_id, v in doc_ids_to_values.items()}
        field_repo.store_values_one_field_many_docs_no_ants(field=dst_field, doc_ids_to_values=doc_ids_to_values)

        log = CeleryTaskLogger(self)
        for doc_id in doc_ids_to_values.keys():
            try:
                doc = Document.objects.get(pk=doc_id)
                signals.fire_document_changed(log=log, document=doc, changed_by_user=None, system_fields_changed=False,
                                              generic_fields_changed=False, user_fields_changed=[dst_field.code])
            except Exception as ex:
                self.log_error(f'Unable to fire doc id change event for doc #{doc_id}', exc_info=ex)
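At its core the similarity pass above vectorizes per-document field values and links every pair of documents whose cosine similarity reaches the configured threshold. A toy, self-contained version of that loop, with a TfidfVectorizer standing in for the project's document_feature_vector_pipeline:

from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = {1: 'master services agreement with acme',
        2: 'services agreement with acme corp',
        3: 'employee stock option plan'}
similarity_threshold = 0.3  # stand-in for config.similarity_threshold

doc_ids = list(docs)
feature_vectors = TfidfVectorizer().fit_transform([docs[i] for i in doc_ids])

doc_ids_to_values = defaultdict(set)
for x, doc_a_pk in enumerate(doc_ids):
    similarities = cosine_similarity(feature_vectors[x], feature_vectors)
    for y, doc_b_pk in enumerate(doc_ids):
        if doc_a_pk == doc_b_pk or similarities[0, y] < similarity_threshold:
            continue
        doc_ids_to_values[doc_a_pk].add(doc_b_pk)
        doc_ids_to_values[doc_b_pk].add(doc_a_pk)

print(dict(doc_ids_to_values))  # documents 1 and 2 link to each other; 3 stays unlinked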
Example No. 12
    def sync_imanage_document(task: ExtendedTask, imanage_config_id: int, imanage_doc_id: str):
        task.log_info('Synchronizing iManage document #{0} of config #{1}'.format(imanage_doc_id, imanage_config_id))
        imanage_doc = IManageDocument.objects \
            .filter(imanage_config_id=imanage_config_id, imanage_doc_id=imanage_doc_id) \
            .select_related('imanage_config').get()
        file_storage = get_file_storage()
        try:
            imanage_config = imanage_doc.imanage_config
            log = CeleryTaskLogger(task)
            project = imanage_config.resolve_dst_project(imanage_doc.imanage_doc_data, log)
            project_id = project.pk

            assignee = imanage_config.resolve_assignee(imanage_doc.imanage_doc_data, log)
            assignee_id = assignee.pk if assignee else None
            task.log_info('Assignee resolved to: {0}'.format(assignee.get_full_name() if assignee else '<no assignee>'))

            task.log_info('Downloading iManage document contents into a temp file...')
            auth_token = imanage_config.login()
            filename, response = imanage_config.load_document(auth_token, imanage_doc_id)

            upload_session_id = str(uuid.uuid4())
            filename = get_valid_filename(filename)
            rel_filepath = os.path.join(upload_session_id, filename)

            _, ext = os.path.splitext(filename) if filename else ('', '')
            with buffer_contents_into_temp_file(response, ext) as temp_fn:

                # upload file to file storage
                with open(temp_fn, 'rb') as f:
                    file_storage.mk_doc_dir(upload_session_id)
                    file_storage.write_document(rel_filepath, f)

                kwargs = {
                    'document_type_id': imanage_config.document_type_id,
                    'project_id': project_id,
                    'assignee_id': assignee_id,
                    'user_id': get_main_admin_user().pk,
                    'propagate_exception': True,
                    'run_standard_locators': True,
                    'metadata': {},
                    'do_not_check_exists': True
                }

                pre_defined_fields = None
                if imanage_doc.imanage_doc_data and imanage_config.imanage_to_contraxsuite_field_binding:
                    pre_defined_fields = dict()
                    for imanage_field_code, contraxsuite_field_code \
                            in dict(imanage_config.imanage_to_contraxsuite_field_binding).items():
                        imanage_field_value = imanage_doc.imanage_doc_data.get(imanage_field_code)
                        if imanage_field_value:
                            pre_defined_fields[contraxsuite_field_code] = imanage_field_value
                            task.log_info('Assigning iManage field {0} to Contraxsuite field {1}: {2}'
                                          .format(imanage_field_code, contraxsuite_field_code, imanage_field_value))
                        else:
                            task.log_info('iManage field {0} has no value assigned.'
                                          .format(imanage_field_code))
                else:
                    task.log_info('No binding of iManage fields to Contraxsuite fields.')

                document_id = LoadDocuments \
                    .create_document_local(task, temp_fn, rel_filepath, kwargs,
                                           return_doc_id=True,
                                           pre_defined_doc_fields_code_to_python_val=pre_defined_fields)

                if document_id:
                    task.log_info('Created Contraxsuite document #{0}'.format(document_id))
                    imanage_doc.document_id = document_id
                    imanage_doc.last_sync_date = timezone.now()
                    imanage_doc.save(update_fields=['document_id', 'last_sync_date'])
                else:
                    task.log_error('Unable to create Contraxsuite document for '
                                   'iManage document #{0}'.format(imanage_doc_id))
                    raise RuntimeError('No document loaded.')
        except Exception as ex:
            msg = render_error('Unable to synchronize iManage document #{0}'.format(imanage_doc_id), ex)
            task.log_error(msg)
            imanage_doc.import_problem = True
            imanage_doc.save(update_fields=['import_problem'])
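The field binding used above is a plain code-to-code mapping applied over the raw iManage document data, skipping empty values; a minimal sketch with made-up field codes (the real mapping lives in imanage_config.imanage_to_contraxsuite_field_binding):

# hypothetical binding and document data, for illustration only
binding = {'custom1': 'contract_type', 'author': 'responsible_attorney'}
imanage_doc_data = {'custom1': 'NDA', 'author': '', 'extension': 'docx'}

pre_defined_fields = {}
for imanage_field_code, contraxsuite_field_code in binding.items():
    imanage_field_value = imanage_doc_data.get(imanage_field_code)
    if imanage_field_value:  # fields with no value are only logged, not assigned
        pre_defined_fields[contraxsuite_field_code] = imanage_field_value

print(pre_defined_fields)  # {'contract_type': 'NDA'}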
def adapt_tables_and_reindex(task: ExtendedTask,
                             document_type_code: str = None,
                             force_recreate_tables: bool = False,
                             force_reindex: bool = False):
    """
    "RawDB: Reindex" task
    Checks whether the raw table holding the field values of a document type needs to be altered to match
    the changed field structure, and triggers document reindexing if needed.

    This task should always be executed in the "serial" queue (used for Celery Beat) to avoid parallel modifications
    on the same table.
    See settings.py/CELERY_BEAT_SCHEDULE
    :param task:
    :param document_type_code: Document type code or None to check all doc types.
    :param force_recreate_tables: Force re-creating tables and re-indexing from scratch.
    :param force_reindex: Force re-indexing of all docs even if the table was not altered.
    :return:
    """
    document_types = [DocumentType.objects.get(code=document_type_code)] \
        if document_type_code is not None else DocumentType.objects.all()
    log = CeleryTaskLogger(task)

    for document_type in document_types:
        reindex_needed = adapt_table_structure(log,
                                               document_type,
                                               force=force_recreate_tables)

        if force_recreate_tables:
            # If "force" is requested - we cancel all currently planned re-index tasks
            # and plan (re-plan) reindexing for all documents of this document type.
            for prev_task in any_other_reindex_task(task.request.id,
                                                    document_type.code):
                purge_task(prev_task)
            args = [(ids, ) for ids in get_all_doc_ids(document_type.uid, 20)]
            task.log_info(
                f'Initiating re-index for all documents of {document_type.code} '
                + f'- forced table re-creation.')
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
        elif reindex_needed or force_reindex:
            comment = 'forced' if force_reindex else 'reindex needed'
            task.log_info(
                f'Raw DB table for document type {document_type.code} ' +
                f'has been altered ({comment}), task "{task.task_name}".\n' +
                f'Initiating re-index for all documents of this document type.'
            )
            # If we altered the field structure then we need to re-index all docs of this type.
            # If no need to force - we plan re-index tasks only
            # for those documents for which they are not planned yet
            args = [
                (ids, )
                for ids in get_all_doc_ids_not_planned_to_index_by_doc_type(
                    document_type.uid, 20)
            ]
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
        else:
            # If we did not alter the table but there are non-indexed docs of this type,
            # then we trigger the re-index task making it index the non-indexed docs only.
            # In this case we don't stop other re-index tasks, but this task itself may be
            # stopped later if a full reindex is requested.

            # It makes sense to only plan re-indexing for those docs which are:
            # - not indexed
            # - have no re-index planned for them
            task.log_info(
                f'Initiating re-index for all documents of {document_type.code} '
                + f'- documents not indexed and not yet planned for indexing.')
            args = [(ids, )
                    for ids in non_indexed_doc_ids_not_planned_to_index(
                        document_type, 20)]
            task.run_sub_tasks('Reindex set of documents',
                               cache_document_fields_for_doc_ids_tracked, args)
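The fan-out above passes each pack of roughly 20 document ids as a one-element args tuple to a sub-task; a small sketch of that argument-building step, with a local helper standing in for get_all_doc_ids(document_type.uid, 20):

from typing import Iterable, List, Tuple

def chunk_doc_ids(doc_ids: Iterable[int], pack_size: int = 20) -> List[List[int]]:
    # stand-in for get_all_doc_ids(...), which yields document ids in packs
    ids = list(doc_ids)
    return [ids[i:i + pack_size] for i in range(0, len(ids), pack_size)]

args: List[Tuple[List[int]]] = [(ids,) for ids in chunk_doc_ids(range(45), 20)]
print([len(pack[0]) for pack in args])  # [20, 20, 5]
# task.run_sub_tasks('Reindex set of documents',
#                    cache_document_fields_for_doc_ids_tracked, args)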
Example No. 14
def import_document_type(json_bytes: bytes, save: bool,
                         auto_fix_validation_errors: bool,
                         remove_missed_in_dump_objects: bool,
                         source_version: int,
                         task: ExtendedTask) -> DocumentType:
    tasks = Task.objects \
        .get_active_user_tasks() \
        .exclude(pk=task.task.pk) \
        .exclude(name__in=[task_names.TASK_NAME_REFRESH_MATERIALIZED_VIEW,
                           task_names.TASK_NAME_CLEAN_ALL_TASKS,
                           task_names.TASK_NAME_CHECK_EMAIL_POOL]) \
        .distinct('name') \
        .order_by('name') \
        .values_list('name', flat=True)

    tasks = list(tasks)
    if tasks:
        msg = f'The following user tasks are running: {", ".join(tasks)}. ' + \
              'This import can cause their crashing because of document ' + \
              'type / field structure changes.'

        raise RuntimeError(msg)

    # check data contains version
    json_str = json_bytes.decode('utf-8')
    json_dict = json.loads(json_str)

    sm = SchemeMigration()
    if isinstance(json_dict, dict):
        # {"version":"75","data":[{"model": ... ]}
        version = json_dict.get('version')
        records = sm.migrate_model_records(json_dict['data'], int(version),
                                           CURRENT_VERSION)
        json_str = json.dumps(records)
    elif source_version != CURRENT_VERSION:
        json_str = sm.migrate_json(json_str, source_version, CURRENT_VERSION)

    for doc_type_subclass in DESERIALIZED_OBJECT_CLASSES:
        doc_type_subclass.init_static()

    objects = serializers.deserialize("json", json_str)
    document_type = None
    pk_to_field = {}
    field_detectors = []
    other_objects = []
    logger = CeleryTaskLogger(task)
    for deserialized_object in objects:
        obj = deserialized_object.object
        if isinstance(obj, DocumentType):
            if document_type is not None:
                raise RuntimeError('More than one document type was detected')
            document_type = DeserializedDocumentType(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                remove_missed_in_dump_objects=remove_missed_in_dump_objects,
                logger=logger)
        elif isinstance(obj, DocumentField):
            field = DeserializedDocumentField(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                remove_missed_in_dump_objects=remove_missed_in_dump_objects,
                logger=logger)
            pk_to_field[field.pk] = field
        elif isinstance(obj, DocumentFieldDetector):
            field_detector = DeserializedDocumentFieldDetector(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                logger=logger)
            field_detectors.append(field_detector)
        elif isinstance(obj, DocumentFieldCategory):
            category = DeserializedDocumentFieldCategory(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                logger=logger)
            other_objects.append(category)
        elif isinstance(obj, DocumentFieldFamily):
            family = DeserializedDocumentFieldFamily(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                logger=logger)
            other_objects.append(family)
        else:
            raise RuntimeError('Unknown model')

    if document_type is None:
        raise RuntimeError('Unable to find document type')

    conflicting_document_type = DocumentType.objects \
        .filter(code=document_type.object.code) \
        .exclude(pk=document_type.pk) \
        .first()
    if conflicting_document_type is not None:
        err_msg = f'Unable to import document type #{document_type.pk} "{document_type.object.code}". ' +\
                  f'Database already contains a document type #{conflicting_document_type.pk} ' + \
                  f'with code "{conflicting_document_type.code}"'
        raise RuntimeError(err_msg)

    for field_detector in field_detectors:
        field = pk_to_field.get(field_detector.field_pk)
        if field is not None:
            field.add_dependent_object(field_detector)
        else:
            raise RuntimeError(f'Unknown field #{field_detector.field_pk}')

    for field in pk_to_field.values():
        if field.document_type_pk == document_type.pk:
            document_type.add_dependent_object(field)
        else:
            raise RuntimeError(f'Field #{field.pk} references unknown document type #{field.document_type_pk}')

    for obj in other_objects:
        document_type.add_dependent_object(obj)

    logger.info(f'Validation of {document_type.object.code} ...')
    validation_errors = document_type.validate()
    logger.info(f'Validation of {document_type.object.code} is finished')
    if validation_errors:
        task.log_error(
            f'{len(validation_errors)} VALIDATION ERRORS HAVE OCCURRED DURING VALIDATION OF {document_type.object.code}.'
        )
        for index, validation_error in enumerate(validation_errors):
            # for different timestamps
            sleep(0.001)
            task.log_error(f'VALIDATION ERROR {index + 1}. {validation_error}')
        raise ValidationError(
            f'Validation errors have occurred during import of {document_type.object.code}'
        )

    if save:
        logger.info(f'Import of {document_type.object.code} ...')
        with transaction.atomic():
            document_type.save()
        logger.info(f'Import of {document_type.object.code} is finished')

    return document_type.object
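The import above accepts either a bare list of serialized records or a dict wrapper that carries its own schema version ({"version": "75", "data": [...]}). A small sketch of that branching, with the SchemeMigration calls reduced to comments:

import json

def extract_records(json_bytes: bytes, source_version: int):
    payload = json.loads(json_bytes.decode('utf-8'))
    if isinstance(payload, dict):
        # wrapped format: {"version": "75", "data": [{"model": ...}, ...]}
        # the real code then runs SchemeMigration().migrate_model_records(...)
        return payload['data'], int(payload['version'])
    # bare list format: the version comes from the caller and
    # SchemeMigration().migrate_json(...) runs if it differs from CURRENT_VERSION
    return payload, source_version

print(extract_records(b'{"version": "75", "data": []}', 76))  # ([], 75)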
Example No. 15
    def test_field_detector_model(task: ExtendedTask, field_id,
                                  document_id) -> dict:
        document = Document.objects.get(pk=document_id)  # type: Document
        field = DocumentField.objects.get(pk=field_id)  # type: DocumentField

        expected_dfvs = field_detection.detect_and_cache_field_values(
            CeleryTaskLogger(task), document, field,
            save=False)  # type: List[DetectedFieldValue]
        actual_dfvs = list(
            DocumentFieldValue.objects.filter(
                document=document, field=field,
                removed_by_user=False).all())  # type: List[DocumentFieldValue]

        if field.is_value_aware():
            # dates, numbers, etc.
            expected_field_values = field_detection.merge_detected_field_values_to_python_value(
                expected_dfvs)
            expected_field_value = expected_field_values.get(field.code)

            actual_field_values = merge_document_field_values_to_python_value(
                actual_dfvs)
            actual_field_value = actual_field_values.get(field.code)

            matches = bool(expected_field_value == actual_field_value)
        else:
            # related-info, etc.
            expected_set = {
                'text_unit_' + str(dfv.text_unit.id)
                for dfv in expected_dfvs if dfv.text_unit
            }
            expected_field_value = '; '.join(sorted(expected_set))

            actual_set = {
                'text_unit_' + str(dfv.text_unit.id)
                for dfv in actual_dfvs if dfv.text_unit
            }
            actual_field_value = '; '.join(sorted(actual_set))
            matches = bool(expected_set == actual_set)

        if not matches:
            found_in_text = [
                dfv.text_unit.text for dfv in expected_dfvs
                if dfv.text_unit and dfv.text_unit.text
            ] if expected_dfvs else []
            found_in_text_msg = ''
            if found_in_text:
                found_in_text_msg = '\nDetected in text:\n-----\n{0}\n-----'.format(
                    '\n---\n'.join(found_in_text))
            task.log_info(
                '{3} Test doc: {0} (Doc id: {6}, Project: {5}). '
                'Detected: {1}. Real: {2}.{4}'.format(
                    document.name, expected_field_value, actual_field_value,
                    '[  OK  ]' if matches else '[ ERR  ]', found_in_text_msg,
                    document.project.name if document.project else '',
                    document.id))

        text_units_number = TextUnit.objects.filter(
            document=document, unit_type=field.text_unit_type).count()

        return {
            'text_units_number':
            text_units_number,
            'value_matches_expected':
            matches,
            'actual_field_value':
            actual_field_value if field.is_choice_field() else None
        }
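For the non-value-aware branch above, the check reduces to comparing the set of text unit ids where the field was detected against the set linked to the stored values; a self-contained sketch:

expected_text_unit_ids = {101, 102}  # units where the detector found the field
actual_text_unit_ids = {101, 103}    # units behind the stored DocumentFieldValue rows

expected_field_value = '; '.join('text_unit_' + str(i) for i in sorted(expected_text_unit_ids))
actual_field_value = '; '.join('text_unit_' + str(i) for i in sorted(actual_text_unit_ids))
matches = expected_text_unit_ids == actual_text_unit_ids

print(matches, '|', expected_field_value, '|', actual_field_value)
# False | text_unit_101; text_unit_102 | text_unit_101; text_unit_103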
Example No. 16
    def process(self, **kwargs):
        if self.PARAM_CONFIG in kwargs:
            config_id = kwargs[self.PARAM_CONFIG]['pk']
        else:
            config_id = kwargs[self.PARAM_CONFIG_ID]

        if self.PARAM_USER in kwargs:
            user_ids = {kwargs[self.PARAM_USER]['pk']}
        else:
            user_ids = kwargs.get(self.PARAM_USER_IDS)

        run_date = kwargs.get(self.PARAM_RUN_DATE)
        run_date_specified = run_date is not None

        if isinstance(run_date, str):
            run_date = parse(run_date)

        run_date = run_date or datetime.datetime.now(tz=tzlocal.get_localzone())

        run_even_if_not_enabled = bool(kwargs.get(self.PARAM_RUN_EVEN_IF_NOT_ENABLED))

        config = DocumentDigestConfig.objects \
            .filter(pk=config_id).select_related('for_role', 'for_user').first()  # type: DocumentDigestConfig
        if not config:
            self.log_error('{1} not found: #{0}'.format(config_id, DocumentDigestConfig.__name__))
            return

        if not config.enabled and not run_even_if_not_enabled:
            self.log_info('{1} #{0} is disabled.'.format(config_id, DocumentDigestConfig.__name__))
            return

        tz_msg = ' at timezone {0}'.format(run_date.tzname()) if run_date_specified else ''
        self.log_info('Rendering and sending {what} #{pk} ({doc_filter}) for date "{run_date}" to {n} users{tz_msg}'
                      .format(what=DocumentDigestConfig.__name__,
                              pk=config.pk, doc_filter=config.documents_filter, n=len(user_ids), run_date=run_date,
                              tz_msg=tz_msg))

        if user_ids:
            users_qr = User.objects.filter(pk__in=user_ids)
        elif config.for_role_id is not None:
            users_qr = User.objects.filter(role_id=config.for_role_id)
        elif config.for_user_id is not None:
            users_qr = User.objects.filter(pk=config.for_user_id)
        else:
            self.log_error('{what} #{config_id} specifies neither for_user nor for_role.'
                           .format(what=DocumentDigestConfig.__name__, config_id=config.pk))
            return

        log = CeleryTaskLogger(self)
        for user in users_qr:  # type: User
            if config.for_user_id != user.id and (config.for_role_id is None or config.for_role_id != user.role_id):
                self.log_error('{what} #{what_id} is not applicable for user {user_name} (#{user_id})'
                               .format(what=DocumentDigestConfig.__name__,
                                       what_id=config.pk,
                                       user_name=user.get_full_name(),
                                       user_id=user.pk))
                continue

            try:
                digest = render_digest(config=config, dst_user=user, run_date=run_date)
                if digest:
                    digest.send(log)
            except Exception as e:
                self.log_error(f'Unable to send {RenderedDigest.__name__}.\n'
                               f'Config: #{config.pk}\n'
                               f'Dst user: {user.get_full_name()} #{user.pk}\n'
                               f'Run date: {run_date}', exc_info=e)
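The run_date handling above accepts a datetime, a date string, or nothing at all (falling back to "now" in the local timezone); a small self-contained sketch of that normalization using the same dateutil and tzlocal helpers:

import datetime

import tzlocal
from dateutil.parser import parse

def normalize_run_date(run_date=None) -> datetime.datetime:
    # str -> parsed datetime, None -> now in the local timezone, datetime -> unchanged
    if isinstance(run_date, str):
        run_date = parse(run_date)
    return run_date or datetime.datetime.now(tz=tzlocal.get_localzone())

print(normalize_run_date('2021-03-01 09:00'))
print(normalize_run_date().tzname())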
Example No. 17
def cache_document_fields_for_doc_ids(_task: ExtendedTask, doc_ids: Set):
    for doc in Document.objects.filter(pk__in=doc_ids):
        log = CeleryTaskLogger(_task)
        field_value_cache.cache_generic_values(doc)
        suggested_values = field_detection.detect_and_cache_field_values_for_document(log, doc, False)
        field_value_cache.cache_field_values(doc, suggested_values, save=True)
Example No. 18
def cache_document_fields_for_doc_ids(_task: ExtendedTask, doc_ids: List):
    for doc in Document.all_objects.filter(pk__in=doc_ids):
        log = CeleryTaskLogger(_task)
        detect_and_cache_field_values_for_document(
            log, doc, False, clear_old_values=False)
Example No. 19
    def test_field_detector_model(task: ExtendedTask, field_id, document_id) -> dict:
        document = Document.objects.get(pk=document_id)  # type: Document
        field = DocumentField.objects.get(pk=field_id)  # type: DocumentField
        typed_field = TypedField.by(field)

        expected_field_value_dto = field_detection.detect_field_value(
            CeleryTaskLogger(task), document, field)  # type: FieldValueDTO

        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()

        if typed_field.requires_value:
            # dates, numbers, etc.
            actual_field_value_dict = field_repo \
                .get_field_code_to_python_value(document_type_id=document.document_type_id,
                                                doc_id=document_id,
                                                field_codes_only={field.code})

            actual_field_value = actual_field_value_dict.get(field.code) if actual_field_value_dict else None
            expected_field_value = expected_field_value_dto.field_value if expected_field_value_dto else None

            matches = bool(expected_field_value == actual_field_value)
        else:
            expected_set = set()
            # related-info, etc. - comparing by annotations - exact comparison
            if expected_field_value_dto.annotations:
                for ant_dto in expected_field_value_dto.annotations:
                    text_unit_id = field_repo.find_text_unit_id_by_location(
                        document,
                        field,
                        ant_dto.location_in_doc_start,
                        ant_dto.location_in_doc_end)
                    if not text_unit_id:
                        continue
                    expected_set.add('text_unit_' + str(text_unit_id))
            expected_field_value_dto = '; '.join(sorted(expected_set))

            actual_dfvs = FieldAnnotation.objects.filter(
                document_type_id=document.document_type_id,
                doc_id=document_id,
                field_id=field.pk)
            actual_set = {'text_unit_' + str(dfv.text_unit.id) for dfv in actual_dfvs if dfv.text_unit}
            actual_field_value = '; '.join(sorted(actual_set))
            matches = bool(expected_set == actual_set)

        if not matches:
            found_in_text = [dfv.text_unit.text
                             for dfv in expected_field_value_dto
                             if dfv.text_unit and dfv.text_unit.text] if expected_field_value_dto else []
            found_in_text_msg = ''
            if found_in_text:
                found_in_text_msg = '\nDetected in text:\n-----\n{0}\n-----'.format('\n---\n'.join(found_in_text))
            task.log_info('{3} Test doc: {0} (Doc id: {6}, Project: {5}). '
                          'Detected: {1}. Real: {2}.{4}'
                          .format(document.name,
                                  expected_field_value_dto,
                                  actual_field_value,
                                  '[  OK  ]' if matches else '[ ERR  ]',
                                  found_in_text_msg,
                                  document.project.name if document.project else '',
                                  document.pk))

        text_units_number = TextUnit.objects.filter(document=document, unit_type=field.text_unit_type).count()

        return {
            'text_units_number': text_units_number,
            'value_matches_expected': matches,
            'actual_field_value': actual_field_value if typed_field.is_choice_field else None
        }
Example No. 20
    def process(self, **kwargs):

        n_clusters = kwargs.get('n_clusters', 3)
        method = kwargs.get('method', 'kmeans')
        cluster_by = kwargs.get('cluster_by', 'term')

        self.project_clustering_id = kwargs.get('project_clustering_id')
        project_clustering = ProjectClustering.objects.get(
            pk=self.project_clustering_id)
        project_clustering.status = PENDING
        project_clustering.task = self.task
        project_clustering.save()

        project = project_clustering.project

        self.log_info('Start clustering documents for project id={}'.format(
            project.id))
        self.log_info('Clustering method: "{}", n_clusters={}'.format(
            method, n_clusters))
        self.log_info('Cluster by: {}'.format(str(cluster_by)))

        self.set_push_steps(4)

        self.push()

        # clear previous clusters, their tasks and cluster sessions
        project.drop_clusters(
            exclude_task_ids={self.request.id},
            exclude_project_clustering_id=self.project_clustering_id)
        self.push()

        cluster_model = ClusterDocuments(project_id=project.id,
                                         cluster_algorithm=method,
                                         n_clusters=n_clusters,
                                         cluster_by=cluster_by,
                                         use_default_name=True)
        result = cluster_model.run()

        project_clustering.metadata = result.metadata
        project_clustering.save()
        project_clustering.document_clusters.add(
            *result.metadata['cluster_obj_ids'])

        self.push()
        self.log_info('Clustering completed. Updating document cache.')

        log = CeleryTaskLogger(self)
        for doc in Document.objects.filter(project__pk=project.id):
            signals.fire_document_changed(
                sender=self,
                log=log,
                document=doc,
                changed_by_user=None,
                system_fields_changed=False,
                user_fields_changed=False,
                generic_fields_changed=[DocumentGenericField.cluster_id.value])

        project_clustering.status = SUCCESS
        project_clustering.save()

        self.push()
        self.log_info('Finished.')
        return result.metadata
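ClusterDocuments hides the actual clustering behind the method/n_clusters/cluster_by parameters; a toy sketch of a "kmeans over TF-IDF features" run, offered as an illustration of that idea rather than the project's ClusterDocuments implementation:

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ['service agreement between acme and widgets inc',
         'master service agreement acme',
         'employee handbook vacation policy',
         'handbook policy for employees']
n_clusters = 2  # the task above defaults to kwargs.get('n_clusters', 3)

features = TfidfVectorizer(stop_words='english').fit_transform(texts)
labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(features)
print(labels)  # the two agreement docs and the two handbook docs typically form separate clusters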
Example No. 21
def process_notifications_on_document_change(task: ExtendedTask,
                                             document_event: str, document_id,
                                             fields_before: Optional[Dict],
                                             fields_after: Optional[Dict],
                                             changed_by_user_id):
    log = CeleryTaskLogger(task)

    document = Document.objects.filter(pk=document_id).select_related(
        'document_type').first()  # type: Document
    document_type = document.document_type
    changed_by_user = User.objects.get(pk=changed_by_user_id)
    field_handlers = build_field_handlers(document_type,
                                          include_suggested_fields=False)
    field_handlers_by_field_code = {h.field_code: h
                                    for h in field_handlers
                                    }  # Dict[str, FieldHandler]
    already_sent_user_ids = set()

    log_msgs = []

    if document_event == DocumentEvent.CREATED.value:
        if fields_after.get(FIELD_CODE_ASSIGNEE_ID) is not None:
            send_notification(event=DocumentAssignedEvent.code,
                              log=log,
                              already_sent_user_ids=already_sent_user_ids,
                              document=document,
                              field_handlers=field_handlers,
                              field_values=fields_after,
                              changed_by_user=changed_by_user)

        send_notification(event=DocumentLoadedEvent.code,
                          log=log,
                          already_sent_user_ids=already_sent_user_ids,
                          document=document,
                          field_handlers=field_handlers,
                          field_values=fields_after,
                          changed_by_user=changed_by_user)
    elif document_event == DocumentEvent.DELETED.value:
        send_notification(event=DocumentDeletedEvent.code,
                          log=log,
                          already_sent_user_ids=already_sent_user_ids,
                          document=document,
                          field_handlers=field_handlers,
                          field_values=fields_before,
                          changed_by_user=changed_by_user)
    else:
        changes = dict()
        for field_code, old_value in fields_before.items():
            if field_code not in field_handlers_by_field_code \
                    or field_handlers_by_field_code[field_code].is_suggested:
                continue
            new_value = fields_after.get(field_code)
            if not values_look_equal(old_value, new_value):
                changes[field_code] = (old_value, new_value)
                log_msgs.append(
                    format_values_difference(field_code, old_value, new_value))

        if not changes:
            return

        if len(log_msgs) > 0:
            msgs_str = 'Following fields are different:\n    ' + '\n    '.join(
                log_msgs)
            log = CeleryTaskLogger(task)
            log.info(msgs_str)

        if FIELD_CODE_ASSIGNEE_ID in changes:
            send_notification(event=DocumentAssignedEvent.code,
                              log=log,
                              already_sent_user_ids=already_sent_user_ids,
                              document=document,
                              field_handlers=field_handlers,
                              field_values=fields_after,
                              changes=changes,
                              changed_by_user=changed_by_user)

        send_notification(event=DocumentChangedEvent.code,
                          log=log,
                          already_sent_user_ids=already_sent_user_ids,
                          document=document,
                          field_handlers=field_handlers,
                          field_values=fields_after,
                          changes=changes,
                          changed_by_user=changed_by_user)
Example No. 22
    def process(self, **kwargs):
        self.log_info(
            'Going to train document field based on the datasets stored in DB...')

        document_type_field_id = kwargs.get('document_type_field_id')
        skip_training = kwargs.get('skip_training')
        use_only_confirmed_field_values_for_training = kwargs.get('use_only_confirmed_field_values_for_training')
        train_data_project_ids = kwargs.get('train_data_project_ids')

        skip_testing = kwargs.get('skip_testing')
        use_only_confirmed_field_values_for_testing = kwargs.get('use_only_confirmed_field_values_for_testing')
        test_data_projects_ids = kwargs.get('test_data_projects_ids')

        document_type_field = DocumentTypeField.objects.get(pk=document_type_field_id)
        document_type = document_type_field.document_type
        field = document_type_field.document_field  # type: DocumentField

        if not field.is_detectable():
            self.log_info('Field {0} is not detectable. Nothing to train and/or test.'.format(field.code))
            return

        new_model = None

        if not skip_training:
            if train_data_project_ids:
                self.log_info('Training model on the specified projects...')
            else:
                self.log_info('No training projects specified. '
                              'Training model on all user-confirmed field values in the system...')

            new_model = field_detection \
                .train_document_field_detector_model(CeleryTaskLogger(self),
                                                     document_type,
                                                     field,
                                                     train_data_project_ids,
                                                     use_only_confirmed_field_values_for_training)
            if new_model:
                ClassifierModel.objects.filter(document_type=document_type, document_field=field).delete()
                new_model.save()

                if new_model.classifier_accuracy_report_in_sample:
                    self.log_info('Sklearn test report for in-sample docs:\n{0}'
                                  .format(new_model.classifier_accuracy_report_in_sample))

                if new_model.classifier_accuracy_report_out_of_sample:
                    self.log_info('Sklearn test report for out-of-sample docs:\n{0}'
                                  .format(new_model.classifier_accuracy_report_out_of_sample))
            else:
                self.log_info('No model trained. '
                              'Probably the detection strategy of field {0} does not allow training'.format(field.code))

        if skip_testing:
            return

        if not test_data_projects_ids:
            self.log_info('No test projects specified. Skipping the testing step.')
            return
        else:
            if not use_only_confirmed_field_values_for_testing:
                test_document_ids = Document.objects \
                    .filter(project_id__in=test_data_projects_ids, document_type_id=document_type.pk) \
                    .values_list('pk', flat=True)
            else:
                test_document_ids = set(field_detection_utils
                                        .get_qs_active_modified_document_ids(document_type,
                                                                             field,
                                                                             test_data_projects_ids))
                test_document_ids.update(set(field_detection_utils
                                             .get_qs_finished_document_ids(document_type,
                                                                           test_data_projects_ids)))

            self.log_info('Testing field detection document-by-document...')
            test_tasks_args = []
            for test_document_id in test_document_ids:
                test_tasks_args.append((field.uid, test_document_id))

            if test_tasks_args:
                self.run_sub_tasks('Test Field Detector Model', TrainAndTest.test_field_detector_model,
                                   test_tasks_args)

                self.run_after_sub_tasks_finished('Join Field Detector Model Tests',
                                                  TrainAndTest.join_field_detector_model_tests,
                                                  [(field.uid, document_type.uid, new_model.pk if new_model else None)])
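
A rough, hedged sketch of the kwargs shape this process() reads (key names are taken from the code above; the ids, project lists, and any scheduling call are placeholders, not values or APIs from this codebase):

# Hypothetical kwargs for the train-and-test task above; the ids are made up.
train_and_test_kwargs = {
    'document_type_field_id': 123,
    'skip_training': False,
    'use_only_confirmed_field_values_for_training': True,
    'train_data_project_ids': [1, 2],       # None/empty -> train on all confirmed values
    'skip_testing': False,
    'use_only_confirmed_field_values_for_testing': False,
    'test_data_projects_ids': [3],          # None/empty -> the testing step is skipped
}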
Exemplo n.º 23
0
    def process(self, **kwargs):

        n_clusters = kwargs.get('n_clusters')
        method = kwargs.get('method')
        project_id = kwargs.get('project_id')

        project_clustering_id = kwargs.get('project_clustering_id')
        project_clustering = ProjectClustering.objects.get(
            pk=project_clustering_id) if project_clustering_id else None
        if not project_clustering:
            raise RuntimeError(
                'ProjectClustering object not found, id={}'.format(project_clustering_id))
        project_clustering.task = self.task
        project_clustering.save()

        project = project_clustering.project

        self.log_info(
            'Start clustering documents for project id={}'.format(project_id))
        self.log_info('Clustering method: "{}", n_clusters={}'.format(
            method, n_clusters))

        self.set_push_steps(4)

        # get documents data
        documents = Document.objects.filter(project_id=project_id)
        id_name_map = {k: v for k, v in documents.values_list('id', 'name')}
        docs_count = len(id_name_map)

        # cluster by full text
        if kwargs.get('cluster_by') == 'full_text':
            docs = np.array(documents.values_list('pk', 'full_text'))
            pks, data = docs[:, 0], docs[:, 1]

            # if fit_transform() prunes too many terms and raises a ValueError,
            # retry with a progressively higher max_df (0.50 up to 1.00 in 0.05 steps)
            for max_df in range(50, 101, 5):
                max_df = max_df / 100.0
                try:
                    vectorizer = TfidfVectorizer(max_df=max_df,
                                                 max_features=100,
                                                 min_df=2,
                                                 stop_words='english',
                                                 use_idf=True)
                    X = vectorizer.fit_transform(data)
                except ValueError as e:
                    if 'Try a lower min_df or a higher max_df' in str(e):
                        continue
                    else:
                        raise e
                break

            terms = vectorizer.get_feature_names()

        # Cluster by terms
        else:
            id_field = 'id'
            prop_field = 'textunit__termusage__term__term'
            # filter non-null, null
            qs = documents.filter(textunit__termusage__isnull=False)
            if not qs.exists():
                raise RuntimeError(
                    'No terms detected in the documents; try re-running the terms parser.'
                )
            # get values
            ann_cond = dict(prop_count=Count(prop_field))
            qs = qs.values(id_field,
                           prop_field).annotate(**ann_cond).distinct()
            # get data
            df = pd.DataFrame(list(qs)).dropna()
            null_qs = documents.exclude(textunit__termusage__isnull=False)
            if null_qs.exists():
                null_df = pd.DataFrame(list(
                    null_qs.values('id'))).set_index('id')
                df = df.join(null_df, how='outer', on='id')
            df = df.pivot(index=id_field,
                          columns=prop_field,
                          values='prop_count').fillna(0)

            X = df.values  # dense document-term count matrix
            # convert CountVec into TFvec
            tf_transformer = TfidfTransformer(use_idf=False).fit(X)
            X = tf_transformer.transform(X)

            pks = df.index.tolist()
            terms = df.columns.tolist()

        if method == 'Birch':
            m = Birch(n_clusters=n_clusters,
                      threshold=0.5,
                      branching_factor=50)
        elif method == 'MiniBatchKMeans':
            m = MiniBatchKMeans(n_clusters=n_clusters,
                                init='k-means++',
                                n_init=1,
                                init_size=100,
                                batch_size=100,
                                verbose=False)
        else:
            method = 'KMeans'
            m = KMeans(n_clusters=n_clusters,
                       init='k-means++',
                       max_iter=100,
                       n_init=1,
                       verbose=False)

        m.fit(X)
        self.push()

        X = X.toarray()
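        # project the document vectors to 2D with PCA so documents and
        # cluster centers can be placed on a scatter chart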
        pca = PCA(n_components=2).fit(X)
        data2d = pca.transform(X)

        if method == 'DBSCAN':
            clusters = m.labels_
            cluster_labels = set(clusters)
            # reshape cluster labels
            if -1 in cluster_labels:
                cluster_labels = [i + 1 for i in cluster_labels]
            cluster_terms = cluster_labels
            centers2d = None
        else:
            if method == 'Birch':
                cluster_centers = m.subcluster_centers_
            else:
                cluster_centers = m.cluster_centers_

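            # for each cluster, order term indices by descending centroid weight;
            # the top terms become the human-readable cluster label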
            order_centroids = cluster_centers.argsort()[:, ::-1]
            clusters = m.labels_.tolist()
            cluster_labels = set(clusters)
            _n_clusters = len(cluster_labels)
            cluster_terms = [[terms[ind] for ind in order_centroids[i, :10]]
                             for i in range(_n_clusters)]
            centers2d = pca.transform(cluster_centers)

        points_data = [{
            'document_id': pks[i],
            'document_name': id_name_map[pks[i]],
            'coord': data2d[i].tolist(),
            'cluster_id': str(clusters[i])
        } for i in range(docs_count)]

        self.push()

        clusters_data = {}
        created_date = now()
        for cluster_id in cluster_labels:
            cluster_label = cluster_terms[cluster_id]
            if isinstance(cluster_label, list):
                cluster_label = '-'.join(cluster_label[:5])
            cluster = DocumentCluster.objects.create(
                cluster_id=cluster_id,
                name='Default({})'.format(project.pk if project else None),
                self_name=cluster_label,
                description='Cluster Project (id={}) with Multiple Contract Types'.format(project_id),
                cluster_by='all',
                using=method,
                created_date=created_date)
            cluster_documents = [
                i['document_id'] for i in points_data
                if i['cluster_id'] == str(cluster_id)
            ]
            cluster.documents.set(cluster_documents)
            clusters_data[str(cluster_id)] = dict(
                cluster_obj_id=cluster.pk,
                cluster_terms=cluster_terms[cluster_id],
                centroid_coord=centers2d[cluster_id].tolist()
                if centers2d is not None else None)
            project_clustering.document_clusters.add(cluster)

        result = {
            'method': method,
            'n_clusters': n_clusters,
            'points_data': points_data,
            'clusters_data': clusters_data
        }
        project_clustering.metadata = result
        project_clustering.save()

        self.push()
        self.log_info('Clustering completed. Updating document cache.')

        log = CeleryTaskLogger(self)
        for doc in Document.objects.filter(project__pk=project_id):
            field_value_cache.cache_generic_values(doc, log=log)

        self.push()
        self.log_info('Finished.')
        return result
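
Stripped of the Django plumbing, the full-text branch above is a standard TF-IDF + KMeans + PCA pipeline. A minimal, self-contained sketch of that technique (toy corpus and parameter values are assumed, not taken from the project):

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

# toy corpus standing in for Document.full_text values
docs = [
    'lease agreement between landlord and tenant',
    'tenant shall pay rent to the landlord monthly',
    'employment agreement between employer and employee',
    'the employee shall report to the employer',
]

vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)
X = vectorizer.fit_transform(docs)

km = KMeans(n_clusters=2, init='k-means++', n_init=1, max_iter=100)
labels = km.fit_predict(X)

# 2D coordinates for plotting documents and cluster centers
pca = PCA(n_components=2).fit(X.toarray())
points_2d = pca.transform(X.toarray())
centers_2d = pca.transform(km.cluster_centers_)

# top terms per cluster: highest-weighted features of each centroid
terms = vectorizer.get_feature_names_out()
order = km.cluster_centers_.argsort()[:, ::-1]
cluster_terms = [[terms[i] for i in order[c, :3]] for c in range(2)]

for c, top in enumerate(cluster_terms):
    print(c, '-'.join(top), centers_2d[c].round(2).tolist())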
Exemplo n.º 24
0
def import_document_type(json_bytes: bytes, save: bool,
                         auto_fix_validation_errors: bool,
                         remove_missed_in_dump_objects: bool,
                         task: ExtendedTask) -> DocumentType:
    tasks = Task.objects \
        .get_active_user_tasks() \
        .exclude(pk=task.task.pk) \
        .distinct('name') \
        .order_by('name') \
        .values_list('name', flat=True)
    tasks = list(tasks)
    if tasks:
        msg = 'The following user tasks are running: {0}. This import can cause their crashing because of document' \
              ' type / field structure changes.'.format(', '.join(tasks))
        raise RuntimeError(msg)

    objects = serializers.deserialize("json", json_bytes.decode("utf-8"))
    document_type = None
    pk_to_field = {}
    field_detectors = []
    other_objects = []
    logger = CeleryTaskLogger(task)
    for deserialized_object in objects:
        obj = deserialized_object.object
        if isinstance(obj, DocumentType):
            if document_type is not None:
                raise RuntimeError('More than one document type was detected')
            document_type = DeserializedDocumentType(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                remove_missed_in_dump_objects=remove_missed_in_dump_objects,
                logger=logger)
        elif isinstance(obj, DocumentField):
            field = DeserializedDocumentField(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                remove_missed_in_dump_objects=remove_missed_in_dump_objects,
                logger=logger)
            pk_to_field[field.pk] = field
        elif isinstance(obj, DocumentFieldDetector):
            field_detector = DeserializedDocumentFieldDetector(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                logger=logger)
            field_detectors.append(field_detector)
        elif isinstance(obj, DocumentFieldCategory):
            category = DeserializedDocumentFieldCategory(
                deserialized_object,
                auto_fix_validation_errors=auto_fix_validation_errors,
                logger=logger)
            other_objects.append(category)
        else:
            raise RuntimeError('Unknown model')

    if document_type is None:
        raise RuntimeError('Unable to find document type')

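    # refuse to import when the database already contains a different document type
    # (another pk) using the same code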
    conflicting_document_type = DocumentType.objects \
        .filter(code=document_type.object.code) \
        .exclude(pk=document_type.pk) \
        .first()
    if conflicting_document_type is not None:
        err_msg = 'Unable to import document type #{0} "{1}". Database already contains a document type #{2}' \
                  ' with code "{3}"'.format(document_type.pk,
                                            document_type.object.code,
                                            conflicting_document_type.pk,
                                            conflicting_document_type.code)
        raise RuntimeError(err_msg)

    for field_detector in field_detectors:
        field = pk_to_field.get(field_detector.field_pk)
        if field is not None:
            field.add_dependent_object(field_detector)
        else:
            raise RuntimeError('Unknown field #{0}'.format(
                field_detector.field_pk))

    for field in pk_to_field.values():
        if field.document_type_pk == document_type.pk:
            document_type.add_dependent_object(field)
        else:
            raise RuntimeError('Field #{0} references unknown document type #{1}'.format(
                field.pk, field.document_type_pk))

    for obj in other_objects:
        document_type.add_dependent_object(obj)

    logger.info('Validation of {0} ...'.format(document_type.object.code))
    validation_errors = document_type.validate()
    logger.info('Validation of {0} is finished'.format(
        document_type.object.code))
    if validation_errors:
        task.log_error(
            '{0} VALIDATION ERRORS HAVE OCCURRED DURING VALIDATION OF {1}.'.
            format(len(validation_errors), document_type.object.code))
        for index, validation_error in enumerate(validation_errors):
            # for different timestamps
            sleep(0.001)
            task.log_error('VALIDATION ERROR {0}. {1}'.format(
                index + 1, str(validation_error)))
        raise ValidationError(
            'Validation errors have occurred during import of {0}'.format(
                document_type.object.code))

    if save:
        logger.info('Import of {0} ...'.format(document_type.object.code))
        with transaction.atomic():
            document_type.save()
        logger.info('Import of {0} is finished'.format(
            document_type.object.code))

    return document_type.object
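
The import above is built on Django's serialization framework: serializers.deserialize() yields wrapper objects whose .object attribute is the not-yet-saved model instance, which is then dispatched by type. A minimal sketch of that underlying pattern (the dump content is assumed, and a configured Django project is required):

from django.core import serializers

def group_dump_by_model(json_bytes: bytes):
    # each item is a DeserializedObject; .object is the model instance,
    # .save() would persist it together with its m2m data
    by_model = {}
    for deserialized in serializers.deserialize('json', json_bytes.decode('utf-8')):
        obj = deserialized.object
        by_model.setdefault(type(obj).__name__, []).append(deserialized)
    return by_model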
Exemplo n.º 25
0
    def check_email_pool(_task) -> None:
        log = CeleryTaskLogger(_task)
        errors = []  # type: List[Tuple[str, Any]]
        for event in EmailNotificationPool.DOC_NOTIFICATION_EVENTS:
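            # pooled notifications for this event are stored in ObjectStorage under
            # keys prefixed with "<CACHE_DOC_NOTIFICATION_PREFIX><event>"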
            cache_key = f'{CACHE_DOC_NOTIFICATION_PREFIX}{event}'
            try:
                cached_msgs_count = ObjectStorage.objects.filter(
                    pk__startswith=cache_key).count()
                if not cached_msgs_count:
                    continue
                if cached_msgs_count < EmailNotificationPool.batch_size:
                    # batch is not full yet: send it anyway only if the oldest pooled
                    # message has been waiting longer than batch_seconds
                    oldest_msg_time = ObjectStorage.objects.filter(
                        pk__startswith=cache_key).aggregate(
                            Min('last_updated'))['last_updated__min']
                    delta = now() - oldest_msg_time
                    if delta.seconds < EmailNotificationPool.batch_seconds:
                        continue

                ntfs = []  # type:List[DocumentNotification]
                for raw_msg in ObjectStorage.objects.filter(
                        pk__startswith=cache_key):  # type: ObjectStorage
                    try:
                        msg = pickle.loads(
                            raw_msg.data)  # type: DocumentNotification
                        ntfs.append(msg)
                    except Exception as e:
                        er_msg = 'check_email_pool() - error unpickling raw_msg.data'
                        log.error(er_msg)
                        errors.append((
                            er_msg,
                            e,
                        ))

                if not ntfs:
                    continue
            except Exception as e:
                log.error(
                    f'Error in check_email_pool(), extracting pool messages: {e}'
                )
                continue
            try:
                log.info(f'send_notifications_packet({len(ntfs)})')
                EmailNotificationPool.send_notifications_packet(
                    ntfs, event, _task)
            except Exception as e:
                log.error(f'Error in check_email_pool(), sending package: {e}')
                errors.append((
                    'Error in check_email_pool(), sending package',
                    e,
                ))

            try:
                ObjectStorage.objects.filter(pk__startswith=cache_key).delete()
            except Exception as e:
                log.error(
                    f'Error in check_email_pool(), deleting pool objects: {e}')
                errors.append((
                    'Error in check_email_pool(), deleting pool objects',
                    e,
                ))
                continue
        if errors:
            er_msg = '\n'.join([m for m, _ in errors])
            raise RuntimeError(er_msg) from errors[0][1]
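
The pooling pattern used here, stripped of the ORM details, is: pickle each notification under a per-event key prefix, and flush the pool either when it reaches batch_size or when its oldest entry has waited longer than batch_seconds. A self-contained sketch of that logic (an in-memory dict stands in for ObjectStorage; names and thresholds are assumed):

import pickle
import time
from typing import Any, Dict, List, Tuple

BATCH_SIZE = 10
BATCH_SECONDS = 60

# key -> (stored_at, pickled payload); a stand-in for the ObjectStorage table
pool: Dict[str, Tuple[float, bytes]] = {}

def pool_notification(event: str, notification: Any) -> None:
    key = f'doc_notification:{event}:{time.time_ns()}'
    pool[key] = (time.time(), pickle.dumps(notification))

def drain_event(event: str) -> List[Any]:
    prefix = f'doc_notification:{event}:'
    items = {k: v for k, v in pool.items() if k.startswith(prefix)}
    if not items:
        return []
    oldest = min(stored_at for stored_at, _ in items.values())
    # keep pooling until the batch is full or the oldest entry has waited long enough
    if len(items) < BATCH_SIZE and time.time() - oldest < BATCH_SECONDS:
        return []
    notifications = [pickle.loads(data) for _, data in items.values()]
    for k in items:
        del pool[k]
    return notifications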