Пример #1
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect a field value by running the field's trained classifier over text units.

        The document's full text is checked against the field's stop words
        first; a stop-word hit is returned immediately. Otherwise the text
        units of type ``field.text_unit_type`` are visited ordered by
        ``location_start`` then ``pk`` and each one is handed to
        ``cls.predict_and_extract_value``.

        Args:
            log: Logger used to report a missing classifier model.
            doc: Document whose text units are scanned.
            field: Field being detected; supplies the unit type and limits.
            field_code_to_value: Not used by this strategy.

        Returns:
            A ``FieldValueDTO`` holding the stop-word value, the first
            annotation value (fields that are not ``MultiValueField``) or the
            combined value of all annotations (``MultiValueField``); ``None``
            when no annotation was produced.

        Raises:
            ClassifierModel.DoesNotExist: Re-raised after logging when no
                classifier model is stored for the field.
        """
        stop_words_hit, stop_words_value = detect_with_stop_words_by_field_and_full_text(
            field, doc.full_text)
        if stop_words_hit:
            return FieldValueDTO(field_value=stop_words_value)

        try:
            trained_model = ClassifierModel.objects \
                .get(document_field=field) \
                .get_trained_model_obj()
            typed_field: TypedField = TypedField.by(field)
            collected: List[AnnotationDTO] = []

            unit_query = TextUnit.objects.filter(document=doc)
            unit_query = unit_query.filter(unit_type=field.text_unit_type)
            unit_query = unit_query.order_by('location_start', 'pk')

            counted = 0
            for unit in unit_query.iterator():
                if field.detect_limit_count:
                    # update_units_counted is defined elsewhere; presumably it
                    # advances the counter in the field's limit unit - confirm.
                    counted = FieldDetectionStrategy.update_units_counted(
                        field, counted, unit)
                    if counted > field.detect_limit_count:
                        break

                ant = cls.predict_and_extract_value(sklearn_model=trained_model,
                                                    typed_field=typed_field,
                                                    document=doc,
                                                    field=field,
                                                    text_unit=unit)
                if ant is None:
                    # Nothing predicted for this unit; note that the character
                    # counter below is not advanced in this case.
                    continue

                # Stop scanning once an annotation starts beyond the character limit.
                if (field.detect_limit_count
                        and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR
                        and ant.location_in_doc_start > field.detect_limit_count):
                    break

                collected.append(ant)
                if not isinstance(typed_field, MultiValueField):
                    # Single-value field: the first annotation wins.
                    return FieldValueDTO(field_value=ant.annotation_value,
                                         annotations=collected)

                if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                    counted += len(unit.text)

            if not collected:
                return None

            merged_value = typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in collected])
            return FieldValueDTO(field_value=merged_value, annotations=collected)

        except ClassifierModel.DoesNotExist as e:
            log.info(f'Classifier model does not exist for field: {field.code}')
            raise e
    def maybe_detect_with_stop_words(
            cls, field: DocumentField, doc: Document,
            cached_fields: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Try to resolve the field from stop words found in the fields it depends on.

        The truthy cached values of the fields listed in
        ``field.depends_on_fields`` are stringified, joined with newlines and
        passed to ``detect_with_stop_words_by_field_and_full_text``.

        Args:
            field: Field being detected; must define ``stop_words`` for any
                detection to happen.
            doc: Document passed through to the stop-word detector.
            cached_fields: Mapping of field code to its already known value.

        Returns:
            A ``FieldValueDTO`` with the detected value converted through
            ``field_value_python_to_json`` when the stop words matched,
            otherwise ``None``.
        """
        if not field.stop_words:
            return None

        # Check the cache before querying the dependency codes: when it offers
        # nothing the result is None anyway, so the database round trip would
        # be wasted. any() over a dict tests its keys, so this is also true
        # for an empty dict.
        if not any(cached_fields):
            return None

        depends_on_field_codes: List[str] = list(
            field.depends_on_fields.all().values_list('code', flat=True))

        # Missing or falsy dependency values contribute no text.
        depends_on_full_text = [
            str(v) for v in map(cached_fields.get, depends_on_field_codes) if v
        ]

        detected_with_stop_words, detected_field_value = \
            detect_with_stop_words_by_field_and_full_text(field=field,
                                                          doc=doc,
                                                          full_text='\n'.join(depends_on_full_text))
        if not detected_with_stop_words:
            return None

        return FieldValueDTO(field_value=TypedField.by(
            field).field_value_python_to_json(detected_field_value))
Пример #3
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> FieldValueDTO:
        """Compute the field value by evaluating the field's formula.

        Only the values of fields named by ``field.get_depends_on_codes()``
        are made available to the formula. When the field defines stop words,
        the stringified dependency values are checked against them first and
        a hit short-circuits the formula.

        Args:
            log: Logger receiving debug traces.
            doc: Document the value is detected for.
            field: Field being detected; supplies the formula and stop words.
            field_code_to_value: Known values of other fields, keyed by code.

        Returns:
            A ``FieldValueDTO`` holding either the stop-word value or the
            formula result converted with ``field_value_python_to_json``.

        Raises:
            ValueError: If the field has no formula, or the formula result is
                rejected by ``is_python_field_value_ok``.
        """
        formula = field.formula

        if not formula:
            raise ValueError(
                f'No formula specified for field {field.code} (#{field.uid})')

        depends_on_field_codes = field.get_depends_on_codes() or set()

        # Restrict the formula's inputs to the declared dependencies.
        field_code_to_value = {
            c: v
            for c, v in field_code_to_value.items()
            if c in depends_on_field_codes
        }

        if field.stop_words:
            depends_on_full_text = '\n'.join(
                [str(v) for v in field_code_to_value.values()])
            log.debug(
                'detect_field_value: formula_based_field_detection, checking stop words, '
                + f'field {field.code}({field.pk}), document #{doc.pk}')
            detected_with_stop_words, detected_values \
                = detect_with_stop_words_by_field_and_full_text(field,
                                                                doc,
                                                                depends_on_full_text)
            if detected_with_stop_words:
                # Wrap the stop-word result so callers always receive the
                # declared FieldValueDTO; previously a bare value/list was
                # returned on this path only.
                return FieldValueDTO(field_value=detected_values)
        else:
            log.debug('detect_field_value: formula_based_field_detection, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')

        v = cls.calc_formula(field_code=field.code,
                             formula=formula,
                             depends_on_field_to_value=field_code_to_value,
                             convert_decimals_to_floats=field.
                             convert_decimals_to_floats_in_formula_args)
        typed_field = TypedField.by(field)

        # We don't accept formulas returning values of the wrong type, to avoid
        # further confusion and wrong formulas being written in future.
        # For example, for multi-choice fields the formula should return a list
        # and not a string, so the admin understands that this value replaces
        # the whole set/list of strings rather than adding one more string.
        if typed_field.is_choice_field and typed_field.multi_value:
            if v and isinstance(v, str):
                # An "outdated" formula returns a string instead of a set/list.
                # It is tolerated silently here: the next time the formula (or
                # other detection method) is edited, its author will have to
                # return a list or set.
                v = [v]

        if not typed_field.is_python_field_value_ok(v):
            raise ValueError(
                f'Formula of field {field.code} returned value not suitable for this field:\n{v}'
            )
        v = typed_field.field_value_python_to_json(v)
        return FieldValueDTO(field_value=v)
Пример #4
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect the field value by applying field detectors to dependent field values.

        Only values of fields named by ``field.get_depends_on_codes()`` are
        considered. Stop words (when the field defines any) are tried first
        against those values joined with newlines. Then every truthy value is
        stringified and offered to each ``DocumentFieldDetector`` of the field.

        Args:
            log: Not used by this strategy.
            doc: Document passed to the stop-word check and value extraction.
            field: Field being detected.
            field_code_to_value: Known values of other fields, keyed by code.

        Returns:
            A ``FieldValueDTO`` with the stop-word value, the first matched
            value (fields that are not ``MultiValueField``) or the value built
            from all matches (``MultiValueField``, also when nothing matched);
            ``None`` when a field that is not ``MultiValueField`` has no match.
        """
        allowed_codes = field.get_depends_on_codes()
        field_code_to_value = {
            code: val
            for code, val in field_code_to_value.items()
            if code in allowed_codes
        }

        if field.stop_words:
            joined_values = '\n'.join(
                str(val) for val in field_code_to_value.values())
            stop_hit, stop_value = detect_with_stop_words_by_field_and_full_text(
                field, doc, joined_values)
            if stop_hit:
                return FieldValueDTO(field_value=stop_value)

        detector_rows = DocumentFieldDetector.objects.filter(field=field)
        matchers = [DetectorFieldMatcher(row) for row in detector_rows]
        typed_field: TypedField = TypedField.by(field)
        multi_valued = isinstance(typed_field, MultiValueField)

        found: List = []

        for raw_value in field_code_to_value.values():
            if not raw_value:
                continue
            candidate_text = str(raw_value)
            for matcher in matchers:  # type: DetectorFieldMatcher
                piece = matcher.matching_string(candidate_text,
                                                text_is_sentence=False)
                if piece is None:
                    continue

                matched_text = piece[0]
                value = matcher.get_validated_detected_value(field)
                if typed_field.requires_value:
                    # Let the typed field extract the value from the matched text.
                    hint = matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint = typed_field.get_or_extract_value(
                        doc, value, hint, matched_text)
                    if value is None:
                        continue

                value = typed_field.annotation_value_python_to_json(value)
                if not multi_valued:
                    # Single-value field: the first match wins.
                    return FieldValueDTO(field_value=value)
                found.append(value)

        if not multi_valued:
            return None

        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(found))
Пример #5
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect the field value with the field's trained classifier.

        The document's full text is checked against the field's stop words
        first. Otherwise the text units returned by ``cls.text_unit_repo``
        (reduced by ``reduce_textunits_by_detection_limit``) are passed one by
        one to ``cls.predict_and_extract_value``.

        Args:
            log: Logger receiving the debug trace and the missing-model notice.
            doc: Document whose text units are scanned.
            field: Field being detected.
            field_code_to_value: Not used by this strategy.

        Returns:
            A ``FieldValueDTO`` with the stop-word value, the first annotation
            value (fields that are not ``MultiValueField``) or the value built
            from all annotations; ``None`` when no annotation was produced.

        Raises:
            ClassifierModel.DoesNotExist: Re-raised after logging when no
                classifier model is stored for the field.
        """
        log.debug(
            'detect_field_value: regexps_and_text_based_ml_field_value, '
            f'field {field.code}({field.pk}), document #{doc.pk}')

        repo = cls.text_unit_repo
        full_text: str = doc.full_text
        typed_field: TypedField = TypedField.by(field)
        single_valued = not isinstance(typed_field, MultiValueField)

        stop_hit, stop_value = detect_with_stop_words_by_field_and_full_text(
            field, full_text)
        if stop_hit:
            return FieldValueDTO(field_value=stop_value)

        candidate_units = repo.get_doc_text_units(doc, field.text_unit_type)
        candidate_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            candidate_units, field)

        try:
            trained_model = ClassifierModel.objects \
                .get(document_field=field) \
                .get_trained_model_obj()
            found: List[AnnotationDTO] = []

            for unit in candidate_units.iterator():  # type: TextUnit
                ant = cls.predict_and_extract_value(
                    sklearn_model=trained_model,
                    typed_field=typed_field,
                    document=doc,
                    field=field,
                    text=unit.text,
                    location_start=unit.location_start,
                    location_end=unit.location_end)
                if ant is None:
                    continue
                found.append(ant)
                if single_valued:
                    # Single-value field: the first annotation wins.
                    return FieldValueDTO(field_value=ant.annotation_value,
                                         annotations=found)

            if not found:
                return None

            merged_value = typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in found])
            return FieldValueDTO(field_value=merged_value, annotations=found)

        except ClassifierModel.DoesNotExist as e:
            log.info(
                f'Classifier model does not exist for field: {field.code}')
            raise e
Пример #6
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect the field value by applying the field's detectors to text units.

        The document's full text is checked against the field's stop words
        first. Otherwise each text unit (after
        ``reduce_textunits_by_detection_limit``) is passed to
        ``cls.extract_from_textunit`` together with the field's detectors.

        Args:
            log: Logger receiving the debug trace.
            doc: Document whose text units are scanned.
            field: Field being detected.
            field_code_to_value: Not used by this strategy.

        Returns:
            A ``FieldValueDTO`` with the stop-word value, the value of the
            first annotation of the first matching unit (fields that are not
            ``MultiValueField``) or the value built from all annotations
            (``MultiValueField``); ``None`` when nothing was extracted.
        """
        try:
            log.debug('detect_field_value: regexps_field_detection, ' +
                      f'field {field.code}({field.pk}), document #{doc.pk}')
        except AttributeError:
            # NOTE(review): presumably tolerates stub log/field/doc objects
            # that lack these attributes - confirm before tightening.
            pass

        ants: List[AnnotationDTO] = []
        depends_on_full_text: str = doc.full_text
        typed_field: TypedField = TypedField.by(field)
        text_unit_repo: TextUnitRepository = cls.text_unit_repo
        field_detector_repo: FieldDetectorRepository = cls.field_detector_repo

        detected_with_stop_words, detected_value = \
            detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)
        qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
            qs_text_units, field)

        field_detectors = field_detector_repo.get_field_detectors(field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]

        for text_unit in qs_text_units:
            unit_ants = cls.extract_from_textunit(text_unit, field, detectors)
            if not unit_ants:
                continue
            if not isinstance(typed_field, MultiValueField):
                # Single-value field: the first unit with annotations wins.
                return FieldValueDTO(field_value=unit_ants[0].annotation_value,
                                     annotations=unit_ants)
            ants += unit_ants

        if not ants:
            return None

        # Only multi-value fields can reach this point with annotations:
        # single-value fields either returned inside the loop or left `ants`
        # empty, so the former single-value fallback here was unreachable.
        field_value = typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants])
        return FieldValueDTO(field_value=field_value, annotations=ants)
Пример #7
0
    def detect_field_value(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
        """Detect the field value by matching the field's detectors against text units.

        The document's full text is checked against the field's stop words
        first. Otherwise every text unit of type ``field.text_unit_type`` is
        offered to every detector of the field, subject to the field's
        detection limit (``detect_limit_count`` / ``detect_limit_unit``).

        Args:
            log: Logger receiving the debug trace.
            doc: Document whose text units are scanned.
            field: Field being detected.
            field_code_to_value: Not used by this strategy.

        Returns:
            A ``FieldValueDTO`` with the stop-word value, the first matched
            annotation (fields that are not ``MultiValueField``) or the value
            built from all annotations (``MultiValueField``); ``None`` when
            nothing matched.

        Raises:
            CaughtErrorWhileApplyingFieldDetector: Wraps any exception raised
                while applying a detector to a text unit, including the
                ``ValueExtractionFunctionThrownException`` raised when value
                extraction fails.
        """
        text_unit_repo = cls.text_unit_repo
        field_detector_repo = cls.field_detector_repo
        log.debug('detect_field_value: regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        depends_on_full_text = doc.full_text
        detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(
            field, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

        qs_text_units = text_unit_repo.get_doc_text_units(
            doc, field.text_unit_type)

        field_detectors = field_detector_repo.get_field_detectors(field)
        detectors = [DetectorFieldMatcher(d) for d in field_detectors]

        typed_field: TypedField = TypedField.by(field)
        ants: List[AnnotationDTO] = []
        units_counted = 0

        for text_unit in qs_text_units:  # type: TextUnit
            if field.detect_limit_count:
                # update_units_counted is defined elsewhere; presumably it
                # advances the counter in the field's limit unit - confirm.
                units_counted = FieldDetectionStrategy.update_units_counted(
                    field, units_counted, text_unit)
                if units_counted > field.detect_limit_count:
                    break

            for field_detector in detectors:
                try:
                    matching_piece = field_detector.matching_string(
                        text_unit.textunittext.text,
                        text_is_sentence=text_unit.is_sentence())
                    if matching_piece is None:
                        continue

                    # Apply the character limit only when a limit is actually
                    # configured, like the other limit checks in this method;
                    # a zero/None count with the CHAR unit used to drop
                    # matches or fail on the comparison.
                    # matching_piece[1] is presumably the match offset inside
                    # the text unit - confirm.
                    if field.detect_limit_count \
                            and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                        if field.detect_limit_count < units_counted + matching_piece[1]:
                            continue

                    matching_string = matching_piece[0]
                    value = field_detector.get_validated_detected_value(
                        field)
                    hint_name = None
                    if typed_field.requires_value:
                        hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                        try:
                            value, hint_name = typed_field \
                                .get_or_extract_value(doc,
                                                      value,
                                                      hint_name,
                                                      matching_string)
                        except Exception as e:
                            raise ValueExtractionFunctionThrownException(
                                f'Value extraction function has thrown an exception.\n'
                                f'Document: {doc.name} (#{doc.pk})\n'
                                f'Value: {value}\n'
                                f'Extraction hint: {hint_name}\n'
                                f'Matching string:\n'
                                f'{matching_string}') from e
                        if value is None:
                            continue

                    annotation_value = typed_field.annotation_value_python_to_json(
                        value)
                    ant = AnnotationDTO(
                        annotation_value=annotation_value,
                        location_in_doc_start=text_unit.location_start,
                        location_in_doc_end=text_unit.location_end,
                        extraction_hint_name=hint_name)

                    if not isinstance(typed_field, MultiValueField):
                        # Single-value field: the first match wins.
                        return FieldValueDTO(
                            field_value=ant.annotation_value,
                            annotations=[ant])
                    ants.append(ant)
                except Exception as e:
                    # Add document / detector / text-unit context to any failure.
                    raise CaughtErrorWhileApplyingFieldDetector(
                        f'Exception caught while trying to apply field detector.\n'
                        f'Document: {doc.name} (#{doc.pk})\n'
                        f'Field detector: #{field_detector.detector.pk}\n'
                        f'{field_detector.detector.include_regexps}\n'
                        f'Text unit: #{text_unit.pk}\n'
                        f'{text_unit.text[:300]}') from e

            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                units_counted += len(text_unit.text)

        if not ants:
            return None

        return FieldValueDTO(field_value=typed_field.
                             build_json_field_value_from_json_ant_values(
                                 [a.annotation_value for a in ants]),
                             annotations=ants)