Example #1
def send_email(log: ProcessLogger, dst_user, subject: str, txt: str, html: str, image_dir: str, cc: Set[str] = None):
    if not dst_user.email:
        log.error('Destination user {0} has no email assigned'.format(dst_user.get_full_name()))
        return

    try:
        email = EmailMultiAlternatives(subject=subject,
                                       body=txt,
                                       cc=list(cc) if cc else None,
                                       from_email=settings.DEFAULT_FROM_EMAIL,
                                       to=['"{0}" <{1}>'.format(dst_user.get_full_name(), dst_user.email)])
        if html:
            images = [m.group(3) for m in RE_SRC_ATTACHMENT.finditer(html)]
            email_html = RE_SRC_ATTACHMENT.sub(r'\1cid:\3\4', html)
            email.attach_alternative(email_html, 'text/html')

            for image_fn in images:
                data = get_notification_template_resource(os.path.join(image_dir, image_fn))
                mime_type = get_predefined_mime_type(image_fn)
                try:
                    img = MIMEImage(data, _subtype=mime_type) if mime_type else MIMEImage(data)
                except TypeError as e:
                    raise RuntimeError(f"Couldn't guess MIME type for tile {image_fn}") from e
                img.add_header('Content-Id', '<' + image_fn + '>')
                img.add_header("Content-Disposition", "inline", filename=image_fn)
                email.attach(img)

        email.send(fail_silently=False)
    except Exception as caused_by:
        log.error(f'Unable to send email to user "{dst_user.get_full_name()}" (#{dst_user.pk})',
                  exc_info=caused_by)
Example #2
 def save_summary(self, log: ProcessLogger, user_id):
     # save DocumentTermUsage
     if self.located_usage_entities and TermUsage in self.located_usage_entities:
         term_usages = self.located_usage_entities[TermUsage]
         # update DocumentTermUsage records
         doc_term_usgs = {}  # type: Dict[Tuple[int, int], DocumentTermUsage]
         for tu in term_usages:  # type: TermUsage
             key = (tu.text_unit.document_id, tu.term.pk,)
             doc_usg = doc_term_usgs.get(key)
             if doc_usg:
                 doc_usg.count += 1
             else:
                 doc_usg = DocumentTermUsage()
                 doc_usg.document_id = tu.text_unit.document_id
                 doc_usg.term_id = tu.term.pk
                 doc_usg.count = 1
                 doc_term_usgs[key] = doc_usg
         if doc_term_usgs:
             doc_term_usgs_lst = [v for _, v in doc_term_usgs.items()]
             try:
                 with transaction.atomic():
                     DocumentTermUsage.objects.bulk_create(doc_term_usgs_lst, ignore_conflicts=True)
             except Exception as e:
                 log.error(f'Unable to store {len(doc_term_usgs)} DocumentTermUsage records.\n',
                           exc_info=e)
Example #3
def _build_insert_clause(log: ProcessLogger, table_name: str,
                         handlers: List[field_handlers.FieldHandler],
                         document: Document,
                         fields_to_python_values: Dict[str, Any]) -> SQLClause:
    insert_clauses = list()

    for handler in handlers:  # type: field_handlers.FieldHandler
        python_value = fields_to_python_values.get(handler.field_code)
        try:
            insert_clause = handler.get_pg_sql_insert_clause(
                document.language, python_value)  # type: SQLInsertClause
            insert_clauses.append(insert_clause)
        except Exception as ex:
            msg = render_error('Unable to cache field values.\n'
                               'Document: {0} (#{1}).\n'
                               'Field: {2}'.format(document.name, document.id,
                                                   handler.field_code),
                               caused_by=ex)
            log.error(msg)

    columns_clause, values_clause = SQLInsertClause.join(insert_clauses)

    insert_clause = format_clause(
        'insert into "{table_name}" ({columns}) '
        'values ({values}) on conflict ({column_document_id}) '
        'do update set ({columns}) = ({values})',
        table_name=table_name,
        columns=columns_clause,
        values=values_clause,
        column_document_id=FIELD_CODE_DOC_ID)

    return insert_clause
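For orientation, with hypothetical column names the clause built above renders an upsert of roughly this shape (illustration only):

example_sql = (
    'insert into "doc_fields_contract" (document_id, party, term) '
    'values (%s, %s, %s) '
    'on conflict (document_id) '
    'do update set (document_id, party, term) = (%s, %s, %s)'
)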
Example #4
def document_fields_change_listener_impl(_sender,
                                         signal,
                                         log: ProcessLogger,
                                         document_event: str,
                                         document: Document,
                                         field_handlers: Dict[str,
                                                              FieldHandler],
                                         fields_before: Optional[Dict],
                                         fields_after: Optional[Dict],
                                         changed_by_user: User = None):
    from apps.task.tasks import call_task_func
    from apps.notifications.tasks import process_notifications_on_document_change
    if not changed_by_user:
        # we ignore changes made by system at the moment
        return

    if not fields_before and not fields_after:
        log.error(
            'Document fields changed event appeared with both "before" and "after" fields empty.'
        )
        return

    from apps.notifications.app_vars import APP_VAR_DISABLE_EVENT_NOTIFICATIONS
    if APP_VAR_DISABLE_EVENT_NOTIFICATIONS.val:
        return
    call_task_func(process_notifications_on_document_change,
                   (document_event, document.pk, fields_before, fields_after,
                    changed_by_user.pk), changed_by_user.pk)
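A hedged sketch of how such a listener could be wired to a Django signal; the signal object below is hypothetical, not part of the example:

from django.dispatch import Signal

# Hypothetical signal; the project defines its own "document fields changed" signal elsewhere.
document_fields_changed = Signal()
document_fields_changed.connect(document_fields_change_listener_impl)

# The sender side would then pass the keyword arguments the listener expects, roughly:
# document_fields_changed.send(sender=None, log=log, document_event=DocumentEvent.CHANGED.value,
#                              document=doc, field_handlers=handlers, fields_before=before,
#                              fields_after=after, changed_by_user=user)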
Example #5
    def save(self, log: ProcessLogger, user_id):
        try:
            with transaction.atomic():
                if self.processed_text_unit_ids:
                    TextUnitTag.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()
                    for entity_class in self.processed_usage_entity_classes:
                        entity_class.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()

                count = 0
                for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                    if entities:
                        entity_class.objects.bulk_create(entities, ignore_conflicts=True)
                        count += len(entities)

                tag_models = list()
                from apps.document.app_vars import LOCATE_TEXTUNITTAGS
                if LOCATE_TEXTUNITTAGS.val:
                    for text_unit_id, tags in self.tags.items():
                        for tag in tags:
                            tag_models.append(TextUnitTag(user_id=user_id,
                                                          text_unit_id=text_unit_id,
                                                          tag=tag))
                    TextUnitTag.objects.bulk_create(tag_models, ignore_conflicts=True)
                log.info(
                    'Stored {0} usage entities and {1} tags for {2} text units'.format(
                        count, len(tag_models), len(self.processed_text_unit_ids)))
        except Exception as e:
            entities_str = '\n'.join([str(e) for e in self.processed_usage_entity_classes])
            log.error(f'Unable to store location results.\n'
                      f'Text unit ids: {self.processed_text_unit_ids}\n'
                      f'Usage models caused the problem:\n{entities_str}', exc_info=e)
        self.save_summary(log, user_id)
Example #6
    def save(self, log: ProcessLogger, user_id):
        try:
            with transaction.atomic():
                if self.processed_text_unit_ids:
                    if not self.document_initial_load:
                        TextUnitTag.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()
                        for entity_class in self.processed_usage_entity_classes:
                            entity_class.objects.filter(text_unit_id__in=self.processed_text_unit_ids).delete()

                tag_models = list()
                from apps.document.app_vars import LOCATE_TEXTUNITTAGS
                tags_saved = 0
                if LOCATE_TEXTUNITTAGS.val:
                    for text_unit_id, tags in self.tags.items():
                        for tag in tags:
                            tag_models.append(TextUnitTag(user_id=user_id,
                                                          text_unit_id=text_unit_id,
                                                          tag=tag))
                    tags_saved = SafeBulkCreate.bulk_create(TextUnitTag.objects.bulk_create, tag_models)

            # save "_usage" objects
            count = 0
            for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                if not entities:
                    continue
                count += SafeBulkCreate.bulk_create(entity_class.objects, entities)

            log.info(
                'Stored {0} usage entities and {1} tags for {2} text units'.format(
                    count, tags_saved, len(self.processed_text_unit_ids)))
        except Exception as e:
            entities_str = '\n'.join([str(e) for e in self.processed_usage_entity_classes])
            log.error(f'Unable to store location results.\n'
                      f'Text unit ids: {self.processed_text_unit_ids}\n'
                      f'Usage models caused the problem:\n{entities_str}', exc_info=e)
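SafeBulkCreate is not shown in the example; a minimal sketch of what such a helper could look like (an assumption, not the project's actual implementation) - chunked bulk_create with a per-object fallback, returning the number of saved records:

class SafeBulkCreate:
    @staticmethod
    def bulk_create(manager_or_method, objects, chunk_size: int = 100) -> int:
        # Accept either a manager (entity_class.objects) or a bound bulk_create method,
        # as both call styles appear in the example above.
        create = getattr(manager_or_method, 'bulk_create', manager_or_method)
        saved = 0
        for i in range(0, len(objects), chunk_size):
            chunk = objects[i:i + chunk_size]
            try:
                create(chunk, ignore_conflicts=True)
                saved += len(chunk)
            except Exception:
                # Fall back to per-object saves so one bad row does not lose the whole chunk.
                for obj in chunk:
                    try:
                        obj.save()
                        saved += 1
                    except Exception:
                        continue
        return saved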
Example #7
 def try_parsing(self, log: ProcessLogger, locate_results: LocationResults,
                 text: str, text_unit_id: int, text_unit_lang: str,
                 document_id: int, document_project_id: int, **kwargs):
     if not text:
         return
     start = datetime.datetime.now()
     try:
         parse_results = self.parse(log, text, text_unit_id, text_unit_lang,
                                    locate_results.document_initial_load,
                                    **kwargs)  # type: ParseResults
         if parse_results:
             parse_results.update_doc_project_ids(document_id,
                                                  document_project_id)
             locate_results.collect(self, text_unit_id, parse_results)
         elapsed = (datetime.datetime.now() - start).total_seconds()
         LocatingPerformanceMeter().add_record(str(type(self).__name__),
                                               elapsed, text_unit_id, text)
     except Exception as e:
         log.error(
             f'Exception caught while trying to run locator on a text unit.\n'
             f'Locator: {self.__class__.__name__}\n'
             f'Text unit id: {text_unit_id}\n'
             f'Text: {text[:1024]}\n'
             f'Text unit language: {text_unit_lang}\n',
             exc_info=e)
Example #8
    def refresh_materialized_view(self, log: ProcessLogger, view_name: str):
        """
        Refresh the specified materialized view and delete all refresh requests older or equal to the last request date
        taken at this method start.

        Additionally this method acquires a PG advisory lock to prevent
        parallel refreshing of the same view.
        The lock is used by the planning routine which tries to acquire the lock
        to prevent re-planning the same refresh if it is already running.
        :param view_name:
        :param log
        :return:
        """
        try:
            with connection.cursor() as cursor:
                cursor.execute(f'update {TABLE_M_VIEW} '
                               'set status=%s where view_name=%s;',
                               [MaterializedView.VIEW_STATUS_UPDATING, view_name])
        except Exception as e:
            log.error(f'Error saving updated status for view "{view_name}": {e}')

        with transaction.atomic():
            with connection.cursor() as cursor:
                if not self.advisory_lock_by_relation_name(cursor, view_name):
                    log.info(f'Canceled refreshing materialized view: {view_name}. '
                             f'Unable to acquire the advisory lock.')
                    cursor.execute(f'update {TABLE_M_VIEW} '
                                   'set status=%s where view_name=%s;',
                                   [MaterializedView.VIEW_STATUS_UPDATED, view_name])
                    return
                log.info(f'Refreshing materialized view: {view_name}.')
                cursor.execute('select max(request_date) '
                               f'from {TABLE_M_VIEW_REQUEST} '
                               'where view_name = %s;', [view_name])
                row = cursor.fetchone()
                request_date = row[0] if row else None

                concurrency_clause = ''
                from apps.materialized_views.app_vars import CONCURRENCY_UPDATE
                if CONCURRENCY_UPDATE.val:
                    concurrency_clause = ' CONCURRENTLY'
                cursor.execute(f'refresh materialized view{concurrency_clause} {view_name};')

                if request_date is not None:
                    cursor.execute(f'delete from {TABLE_M_VIEW_REQUEST} '
                                   'where view_name = %s and request_date <= %s',
                                   [view_name, request_date])
                else:
                    cursor.execute(f'delete from {TABLE_M_VIEW_REQUEST} '
                                   'where view_name = %s',
                                   [view_name])
                dt_now = timezone.now()
                cursor.execute(f'insert into {TABLE_M_VIEW} '
                               '(view_name, refresh_date, status) '
                               'values (%s, %s, %s) '
                               'on conflict (view_name) do update set refresh_date = %s, '
                               'status = %s;',
                               [view_name, dt_now, MaterializedView.VIEW_STATUS_UPDATED,
                                dt_now, MaterializedView.VIEW_STATUS_UPDATED])
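advisory_lock_by_relation_name() is not shown here; a minimal sketch, assuming the lock key is derived from the view name via hashtext() (the real implementation may differ):

    def advisory_lock_by_relation_name(self, cursor, relation_name: str) -> bool:
        # pg_try_advisory_xact_lock() returns immediately: True if the lock was acquired,
        # False if another transaction (e.g. a parallel refresh or the planner) holds it.
        # The lock is released automatically when the surrounding transaction ends.
        cursor.execute('select pg_try_advisory_xact_lock(hashtext(%s));', [relation_name])
        return cursor.fetchone()[0]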
Example #9
 def try_parsing(self, log: ProcessLogger, locate_results: LocationResults, text: str,
                 text_unit_id: int, text_unit_lang: str, **kwargs):
     try:
         parse_results = self.parse(log, text, text_unit_id, text_unit_lang, **kwargs)  # type: ParseResults
         if parse_results:
             locate_results.collect(self, text_unit_id, parse_results)
     except Exception as e:
         log.error(f'Exception caught while trying to run locator on a text unit.\n'
                   f'Locator: {self.__class__.__name__}\n'
                   f'Text unit id: {text_unit_id}\n'
                   f'Text: {text[:1024]}\n'
                   f'Text unit language: {text_unit_lang}\n', exc_info=e)
Example #10
    def save(self, log: ProcessLogger, user_id):
        try:
            with transaction.atomic():
                if self.processed_text_unit_ids:
                    TextUnitTag.objects.filter(
                        text_unit_id__in=self.processed_text_unit_ids).delete()
                    for entity_class in self.processed_usage_entity_classes:
                        entity_class.objects.filter(
                            text_unit_id__in=self.processed_text_unit_ids
                        ).delete()

                count = 0
                for entity_class, entities in self.located_usage_entities.items():  # type: Type[Usage], List[Usage]
                    if entities:
                        entity_class.objects.bulk_create(entities,
                                                         ignore_conflicts=True)
                        count += len(entities)

                tag_models = list()
                for text_unit_id, tags in self.tags.items():
                    for tag in tags:
                        tag_models.append(
                            TextUnitTag(user_id=user_id,
                                        text_unit_id=text_unit_id,
                                        tag=tag))
                TextUnitTag.objects.bulk_create(tag_models,
                                                ignore_conflicts=True)
                log.info(
                    'Stored {0} usage entities and {1} tags for {2} text units'
                    .format(count, len(tag_models),
                            len(self.processed_text_unit_ids)))
        except:
            msg = render_error(
                'Unable to store location results.\n'
                'Text unit ids: {text_unit_ids}\n'
                'Usage models caused the problem:\n{entities}'.format(
                    text_unit_ids=self.processed_text_unit_ids,
                    entities='\n'.join([
                        str(e) for e in self.processed_usage_entity_classes
                    ])))
            log.error(msg)
Example #11
 def try_parsing(self, log: ProcessLogger, locate_results: LocationResults,
                 text: str, text_unit_id: int, text_unit_lang: str,
                 **kwargs):
     try:
         parse_results = self.parse(text, text_unit_id, text_unit_lang,
                                    **kwargs)  # type: ParseResults
         locate_results.collect(self, text_unit_id, parse_results)
     except:
         msg = render_error(
             'Exception caught while trying to run locator on a text unit.\n'
             'Locator: {locator}\n'
             'Text unit id: {text_unit_id}\n'
             'Text: {text}\n'
             'Text unit language: {text_unit_lang}\n'.format(
                 locator=self.__class__.__name__,
                 text_unit_id=text_unit_id,
                 text=text[:1024],
                 text_unit_lang=text_unit_lang))
         log.error(msg)
Example #12
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        log.debug('detect_field_value: csv_regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        detectors = cls.detecting_cache.get_detectors(
            field.pk,
            lambda msg, er: log.error(msg, field_code=field.code, exc_info=er))
        if not detectors:
            return None

        is_multichoice = field.type == MultiChoiceField.type_code
        doc_text = cls.get_document_text(doc)

        annotations = []

        for detector in detectors:
            found_item = detector.find_value(doc_text)
            if not found_item:
                continue

            # TODO: implement reading values from full text (TextParts.FULL.value)
            # as it is done now, or from text units - paragraphs or sentences
            # based on field.text_unit_type - for other detector.text_part options
            """            
            if detector.text_part == TextParts.BEFORE_REGEXP.value:
                return matching_string[:begin], 0, begin
            elif detector.text_part == TextParts.AFTER_REGEXP.value:
                return matching_string[end:], end, len(text)
            elif detector.text_part == TextParts.INSIDE_REGEXP.value:
                return matching_string[begin:end], begin, end
            else:
                return text, 0, len(text)
            """

            # starting position has to be shifted backward by 1 symbol for FE
            ant = AnnotationDTO(annotation_value=found_item[0],
                                location_in_doc_start=max(
                                    found_item[1] - 1, 0),
                                location_in_doc_end=found_item[2],
                                extraction_hint_name='')
            if not is_multichoice:
                return FieldValueDTO(field_value=found_item[0],
                                     annotations=[ant])
            else:
                annotations.append(ant)

        if annotations:
            f_val = [a.annotation_value for a in annotations]
            return FieldValueDTO(field_value=f_val, annotations=annotations)
        return None
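The loop above assumes detector.find_value() returns a (value, start, end) triple; a small illustration with hypothetical values:

found_item = ('New York', 120, 128)   # value, start and end offsets in doc_text
# After the 1-symbol backward shift for the front end, the AnnotationDTO spans 119..128
# and carries 'New York' as its annotation_value.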
Example #13
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None,
                                               skip_modified_values: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param ignore_field_codes:
    :param document_initial_load:
    :param updated_field_codes: if set, we search for changed and dependent fields only
    :param skip_modified_values: don't overwrite field values already modified by the user
    :return:
    """
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    if save and document.status and not document.status.is_active:
        raise RuntimeError(f'Detecting field values for completed documents is not permitted.\n'
                           f'Document: {document.name} (#{document.pk})')

    document_type = document.document_type  # type: DocumentType

    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)

    fields_and_deps = [(f.code, set(f.get_depends_on_codes()) or set()) for f in all_fields]
    dependent_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    log.info(f'Detecting field values for document {document.name} (#{document.pk}), save={save}.\n'
             f'Updated fields: {updated_field_codes or "All"}.\n'
             f'Dependent fields to be detected: {dependent_fields or "All"}.\n'
             f'Ignored fields: {ignore_field_codes}.')

    if updated_field_codes:
        sorted_codes = [c for c in sorted_codes
                        if c in dependent_fields and (not ignore_field_codes or c not in ignore_field_codes)]
    elif ignore_field_codes:
        sorted_codes = [c for c in sorted_codes if c not in ignore_field_codes]

    current_field_values = {f.code: None for f in all_fields}
    # We could fetch values only for the fields required by sorted_codes (taking further
    # dependencies into account), or simply fetch all fields' values (field_codes_only=None).
    actual_field_values = field_repo.get_field_code_to_python_value(document_type_id=document_type.pk,
                                                                    doc_id=document.pk,
                                                                    field_codes_only=None)
    current_field_values.update(actual_field_values)

    res = list()

    detecting_field_status = []  # type:List[str]
    detection_errors = []  # type:List[Tuple[str, str, Exception, Any]]

    # do not touch field values modified by user
    skip_codes = set()
    if skip_modified_values:
        skip_codes = set(list(FieldValue.objects.filter(
            modified_by__isnull=False, document_id=document.pk).values_list('field__code', flat=True)))
        if updated_field_codes:  # these fields have to be deleted despite being set by user
            # updated_field_ids = DocumentField.objects.filter(code__in=updated_field_codes).values_list('pk', flat=True)
            skip_codes -= set(updated_field_codes)

    if clear_old_values:
        field_repo.delete_document_field_values(document.pk,
                                                list(skip_codes),
                                                updated_field_codes)

    for field_code in sorted_codes:
        if field_code in skip_codes:
            continue
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            new_field_value_dto = field_detection_strategy.detect_field_value(log=log,
                                                                              doc=document,
                                                                              field=field,
                                                                              field_code_to_value=current_field_values)

            if not new_field_value_dto:
                detecting_field_status.append(f"No new value's gotten for '{field.code}'")
                continue
            if is_unit_limit_exceeded(new_field_value_dto, field, document):
                continue

            detecting_field_status.append(
                f"{format_value_short_str(new_field_value_dto.field_value)} for '{field.code}'")

            # now merge the detection results with the current DB state
            if save:
                # user = None here to store detected values as owned by system allowing further overwriting
                field_value, annotations = field_repo.update_field_value_with_dto(document=document,
                                                                                  field=field,
                                                                                  field_value_dto=new_field_value_dto,
                                                                                  user=None)

                # and update the field value of this field which may be used for detection of fields depending on it
                current_field_values[field.code] = typed_field.field_value_json_to_python(field_value.value)

            # If save is not requested then do not update current_field_values.
            # Most likely in this case we detect only a few requested fields, and trying to comply
            # with the dependency tree makes little sense.
        except Exception as e:
            # Additionally logging here because the further compound exception will not contain the full stack trace.
            log.error(f'Exception caught while detecting value of field {field.code} ({typed_field.type_code})',
                      exc_info=e)
            detection_errors.append((field.code, typed_field.type_code, e, sys.exc_info()))

    if save:
        if updated_field_codes:
            user_fields_changed_set = set(updated_field_codes)
            if dependent_fields:
                user_fields_changed_set.update(dependent_fields)
            user_fields_changed = list(user_fields_changed_set)  # type: FieldSpec
        else:
            user_fields_changed = True

        fire_document_changed(sender=detect_and_cache_field_values_for_document,
                              log=log,
                              document=document,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              system_fields_changed=system_fields_changed,
                              generic_fields_changed=generic_fields_changed,
                              user_fields_changed=user_fields_changed)
        if dependent_fields:
            msg = f'Recalculating dependent fields for {document.name}: '  # dependent_fields
            msg += ', '.join(dependent_fields)
            msg += '.\n\nSource fields data: \n'
            msg += '; '.join([f'"{k}": "{format_value_short_str(current_field_values[k])}"'
                              for k in current_field_values])
            msg += '.\n\nCalculation results:\n'
            msg += '\n'.join(detecting_field_status)
            log.info(msg)

    if detection_errors:
        fields_str = ', '.join([f'{e[0]} ({e[1]})' for e in detection_errors])
        msg = f'There were errors while detecting fields:\n{fields_str}\n' + \
              f'for document {document.name} (#{document.pk}, type {document_type.code})\n'
        for f_code, f_type, ex, ex_stack in detection_errors:
            msg += f'\n{f_code}, {f_type}: {ex}'
        raise FieldDetectionError(msg)

    return res
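For orientation, a small illustration of the fields_and_deps / get_dependent_fields contract assumed above (field codes are hypothetical; the helpers themselves are defined elsewhere in the project):

fields_and_deps = [('party', set()),
                   ('party_address', {'party'}),   # depends on 'party'
                   ('effective_date', set())]
updated_field_codes = ['party']
# get_dependent_fields(fields_and_deps, {'party'}) is expected to cover 'party_address'
# (plus the updated code itself), so 'effective_date' drops out of sorted_codes above.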
Example #14
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param document_initial_load:
    :param updated_field_codes: if set, we search for changed and dependent fields only
    :return:
    """

    save_cache = save
    save_detected = save
    if save and document.status and not document.status.is_active:
        log.info('Storing detected field values is forbidden for a document with "completed"'
                 ' status, document #{} ({})'.format(document.id, document.name))
        save_detected = False

    document_type = document.document_type  # type: DocumentType

    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)

    fields_and_deps = [(f.code, f.get_depends_on_codes() or set()) for f in all_fields]
    required_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None

    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    res = list()
    for field_code in sorted_codes:
        if ignore_field_codes and field_code in ignore_field_codes:
            continue
        if required_fields and field_code not in required_fields:
            continue

        field = all_fields_code_to_field[field_code]  # type: DocumentField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        try:
            field_vals = field_value_cache.cache_field_values(document, None, save=False)
            detected_values = field_detection_strategy.detect_field_values(log,
                                                                           document,
                                                                           field,
                                                                           field_vals)  # type: List[DetectedFieldValue]
        except Exception as e:
            msg = 'Unable to detect field value.\n' \
                  'Document type: {0}\n' \
                  'Document: {1}\n' \
                  'Field: {2}'.format(document_type.code, document.pk, field.code)
            log.error(render_error(msg, e))
            raise e

        if save_detected and clear_old_values:
            # Delete previously detected values
            # to avoid accumulating garbage on each iteration.
            DocumentFieldValue.objects \
                .filter(document=document,
                        field=field,
                        removed_by_user=False,
                        created_by__isnull=True,
                        modified_by__isnull=True) \
                .exclude(field__value_detection_strategy=DocumentField.VD_DISABLED) \
                .delete()

        if detected_values:
            res.extend(detected_values)
            if save_detected:
                save_detected_values(document, field, detected_values)

    if save_cache:
        field_value_cache.cache_field_values(document, suggested_field_values=res,
                                             save=True, log=log,
                                             changed_by_user=changed_by_user,
                                             system_fields_changed=system_fields_changed,
                                             generic_fields_changed=generic_fields_changed,
                                             document_initial_load=document_initial_load)

    return res
Example #15
def cache_document_fields(
        log: ProcessLogger,
        document: Document,
        cache_generic_fields: bool = True,
        cache_user_fields: bool = True,
        pre_detected_field_codes_to_suggested_values: Optional[Dict[
            str, Any]] = None,
        document_initial_load: bool = False,
        changed_by_user: User = None):
    document_type = document.document_type
    table_name = doc_fields_table_name(document_type.code)

    cache_suggested_fields = pre_detected_field_codes_to_suggested_values is not None

    handlers = build_field_handlers(document_type,
                                    table_name,
                                    include_generic_fields=True,
                                    include_user_fields=True,
                                    include_suggested_fields=True)

    system_field_handlers = list()  # type: List[field_handlers.FieldHandler]
    generic_field_handlers = list()  # type: List[field_handlers.FieldHandler]
    user_field_handers = list()  # type: List[field_handlers.FieldHandler]
    user_suggested_field_handlers = list()

    for h in handlers:
        if h.field_code in FIELD_CODES_SYSTEM:
            system_field_handlers.append(h)
        elif h.field_code in _FIELD_CODES_GENERIC:
            generic_field_handlers.append(h)
        elif h.is_suggested:
            user_suggested_field_handlers.append(h)
        else:
            user_field_handers.append(h)

    insert_field_handlers = list()  # type: List[field_handlers.FieldHandler]
    field_to_python_values = dict()
    _fill_system_fields_to_python_values(document, field_to_python_values)
    insert_field_handlers += system_field_handlers

    if cache_generic_fields:
        _fill_generic_fields_to_python_values(document, field_to_python_values)
        insert_field_handlers += generic_field_handlers

    if cache_user_fields:
        if user_field_handers:
            insert_field_handlers += user_field_handers
            real_document_field_values = DocumentFieldValue.objects \
                .filter(document=document, field__code__in={h.field_code for h in user_field_handers}) \
                .exclude(removed_by_user=True) \
                .select_related('field')  # type: List[DocumentFieldValue]

            for dfv in real_document_field_values:
                field_type = field_types.FIELD_TYPES_REGISTRY[dfv.field.type]
                field_to_python_values[
                    dfv.field.code] = field_type.merge_multi_python_values(
                        field_to_python_values.get(dfv.field.code),
                        dfv.python_value)

            if cache_suggested_fields and pre_detected_field_codes_to_suggested_values is not None:
                insert_field_handlers += user_suggested_field_handlers
                for field_code, python_value in pre_detected_field_codes_to_suggested_values.items(
                ):
                    field_to_python_values[field_code +
                                           '_suggested'] = python_value

    insert_clause = _build_insert_clause(log, table_name,
                                         insert_field_handlers, document,
                                         field_to_python_values)

    with connection.cursor() as cursor:
        document_fields_before = _get_document_fields(cursor=cursor,
                                                      document_id=document.pk,
                                                      table_name=table_name,
                                                      handlers=handlers)
        try:
            cursor.execute(insert_clause.sql, insert_clause.params)
        except:
            import sys
            etype, evalue, _ = sys.exc_info()
            log.error(
                'Error {etype}: {evalue}\n'
                'in cache_document_fields(doc_id={document_id})\nSQL: {sql}\nParams: {ptrs}.\n\n'
                .format(etype=etype,
                        evalue=evalue,
                        document_id=document.pk,
                        sql=insert_clause.sql,
                        ptrs=insert_clause.params))
            raise

    inserted_document_fields = {
        h.field_code: h.python_value_to_indexed_field_value(
            field_to_python_values.get(h.field_code))
        for h in insert_field_handlers
    }

    document_fields_after = dict(
        document_fields_before) if document_fields_before else dict()
    document_fields_after.update(inserted_document_fields)
    fire_document_fields_changed(
        cache_document_fields,
        log=log,
        document_event=DocumentEvent.CREATED.value
        if document_initial_load else DocumentEvent.CHANGED.value,
        document=document,
        field_handlers={h.field_code: h
                        for h in handlers},
        fields_before=document_fields_before,
        fields_after=document_fields_after,
        changed_by_user=changed_by_user)
Example #16
    def train_model(cls, log: ProcessLogger, field: DocumentField, train_data_sets: List[List[dict]],
                    split_and_log_out_of_sample_test_report: bool = False) -> ClassifierModel:
        typed_field = TypedField.by(field)
        df = pd.DataFrame.from_records(train_data_sets.pop(0))
        # add transferred external data
        for train_data in train_data_sets:
            df = df.append(pd.DataFrame.from_records(train_data))

        df['target_name'] = df.apply(lambda row: encode_category(
            field.code,
            row.value if typed_field.is_choice_field else None,
            row.extraction_hint), axis=1)

        df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

        df = df.append(
            [{'text_unit__textunittext__text': i} for i in
             cls.get_no_field_text_units(field.document_type, field.text_unit_type)])

        df['target_index'] = df['target_index'].fillna(0).astype('int')
        df['target_name'] = df['target_name'].fillna(SkLearnClassifierModel.EMPTY_CAT_NAME).astype(
            'str')
        df['user_input'] = df['modified_by'].fillna(0).astype('bool')

        res_df = pd.DataFrame()

        for group_index, group_df in df.groupby('target_index'):
            if group_df.shape[0] > settings.ML_TRAIN_DATA_SET_GROUP_LEN:
                group_df = shuffle(
                    group_df.sort_values('user_input', ascending=False)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
            res_df = res_df.append(group_df)
        res_df = shuffle(res_df)

        target_names = sorted(res_df['target_name'].unique())

        if field.classifier_init_script:
            try:
                clf = cls.init_classifier(field)
            except Exception as e:
                log.error(f'Unable to initialize classifier for field {field.code}. '
                          f'Classifier init script: {field.classifier_init_script}', exc_info=e)
                # Without a successfully initialized classifier there is nothing to train with below.
                raise
        else:
            clf = SGDClassifier(loss='hinge', penalty='l2',
                                alpha=1e-3, max_iter=5, tol=None, n_jobs=-1,
                                class_weight='balanced')

        log.info(f'Classifier initialized: {clf}')

        text_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode', analyzer='word',
                                                      stop_words='english',
                                                      tokenizer=word_position_tokenizer)),
                             ('tfidf', TfidfTransformer()),
                             ('clf', clf),
                             ])
        x = res_df['text_unit__textunittext__text']
        y = res_df['target_index']

        if split_and_log_out_of_sample_test_report:
            x_train, x_test_os, y_train, y_test_os = train_test_split(x, y, test_size=0.2, random_state=42)
        else:
            x_train, x_test_os, y_train, y_test_os = x, None, y, None

        sklearn_model = text_clf.fit(x_train, y_train)

        model = SkLearnClassifierModel(sklearn_model=sklearn_model, target_names=target_names)

        classifier_model = ClassifierModel()
        classifier_model.set_trained_model_obj(model)
        classifier_model.document_field = field

        classifier_model.classifier_accuracy_report_in_sample = \
            classification_report(y,
                                  text_clf.predict(x),
                                  target_names=target_names)

        if y_test_os is not None and x_test_os is not None:
            classifier_model.classifier_accuracy_report_out_of_sample = \
                classification_report(y_test_os,
                                      text_clf.predict(x_test_os),
                                      target_names=target_names)

        return classifier_model
Example #17
def check_task_health(log: ProcessLogger, restart_task_func: Callable[[str],
                                                                      None]):
    """
    Find and process unhealthy tasks - the tasks which are hanging in PENDING while there is at least one
    free worker of each kind (default, high, doc_load).
    This is intended to wait silently until all other tasks processed and next re-send the hanged PENDING tasks.


    Goal state: if there are PENDING tasks which are not known by any worker
                - there should not be free workers of all types.
    """
    start_time = time()

    inspect_start_time = time()
    celery_stats = get_celery_stats()
    inspect_time_spent = time() - inspect_start_time

    if not celery_stats.free_workers_available_of_any_kind:
        log.info(
            f'Task health check: there are no workers at all or at least some kind of worker is still busy.\n'
            f'Not checking for the hung tasks.\n'
            f'Celery inspect time: {inspect_time_spent:.3f}s\n')
        return

    query_time_start = time()

    # There is at least one free worker of each kind.
    # This means there should be no PENDING tasks not known to workers.
    # Increasing bad health check counter for the PENDING tasks not known to workers.
    Task.objects \
        .filter(own_status='PENDING', bad_health_check_num__lt=TASK_BAD_HEALTH_CHECK_RETRIES) \
        .exclude(queue=settings.CELERY_QUEUE_SERIAL) \
        .exclude(name__in=settings.EXCLUDE_FROM_TRACKING) \
        .exclude(pk__in=celery_stats.tasks_on_workers) \
        .update(bad_health_check_num=F('bad_health_check_num') + 1)

    # Set bad counter to zero for all tasks on workers
    Task.objects \
        .filter(pk__in=celery_stats.tasks_on_workers) \
        .exclude(bad_health_check_num=0) \
        .update(bad_health_check_num=0)

    # Restarts those having the counter >= threshold

    to_restart = list(
        Task.objects.filter(
            own_status='PENDING',
            bad_health_check_num=TASK_BAD_HEALTH_CHECK_RETRIES).values_list(
                'pk', 'name'))

    query_time_spent = time() - query_time_start

    restarted_tasks = list()
    could_not_restart_tasks = list()
    for task_id, task_name in to_restart:
        try:
            restart_task_func(task_id)
            restarted_tasks.append((task_id, task_name))
        except Exception as ex:
            log.error(f'Unable to restart task {task_name} ({task_id})',
                      exc_info=ex)
            could_not_restart_tasks.append((task_id, task_name))

    restarted_msg = '\n'.join(
        task_id + " " + task_name
        for task_id, task_name in restarted_tasks) if restarted_tasks else 'no'
    problem_restarting_msg = '\n'.join(
        task_id + " " + task_name for task_id, task_name in
        could_not_restart_tasks) if could_not_restart_tasks else 'no'
    log.info(
        f'Checked task health. Found {len(to_restart)} unhealthy tasks.\n'
        f'Total time: {time() - start_time:.3f}s\n'
        f'Celery inspect time: {inspect_time_spent:.3f}s\n'
        f'DB query time: {query_time_spent:.3f}s\n'
        f'Restarted tasks:\n{restarted_msg}\n'
        f'Could not restart tasks:\n{problem_restarting_msg}',
        extra={'log_unhealthy_tasks': bool(to_restart)})
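A minimal, hypothetical way to invoke the check (the stub and the logger instance below are illustrative assumptions, not part of the example):

def _resend_task_stub(task_id: str) -> None:
    # In the real project this callable would re-submit the Celery task with the given id.
    print(f'Re-sending task {task_id}')

# process_logger is assumed to be an existing ProcessLogger implementation.
check_task_health(log=process_logger, restart_task_func=_resend_task_stub)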