Exemplo n.º 1
0
    def detect_field_values(cls,
                            log: ProcessLogger,
                            doc: Document,
                            field: DocumentField,
                            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Detect the value of a formula-based field from the fields it depends on.

        The method assumes field detection runs in dependency order, i.e. the
        values of every field listed in ``field.depends_on_fields`` are already
        calculated / detected and stored for ``doc``.

        :param log: logger of the running process (not used by this implementation).
        :param doc: document whose stored field values feed the formula.
        :param field: field to detect; must have a non-empty ``formula``.
        :param cached_fields: part of the detector interface (not used by this
            implementation).
        :return: the values produced by the stop-word check when it reports a
            detection (possibly an empty list), otherwise a one-element list
            holding the result of ``cls.calc_formula``.
        :raises ValueError: if the field has no formula.
        """
        expression = field.formula
        if not expression:
            raise ValueError('No formula specified for field {0} (#{1})'.format(field.code, field.uid))

        dependencies = list(field.depends_on_fields.all())

        # Only values the user has not removed, restricted to the dependency fields.
        stored_values = list(
            doc.documentfieldvalue_set
            .filter(removed_by_user=False)
            .filter(field__in=dependencies))
        merged = merge_document_field_values_to_python_value(stored_values)

        # Every dependency gets an entry, None when nothing is stored for it.
        values_by_code = {}
        for dependency in dependencies:
            values_by_code[dependency.code] = merged.get(dependency.code)

        if field.stop_words:
            joined_text = '\n'.join(str(value) for value in values_by_code.values())
            stopped, stop_word_values = detect_with_stop_words_by_field_and_full_text(field, joined_text)
            if stopped:
                return stop_word_values if stop_word_values else list()

        computed = cls.calc_formula(field.code, field.type, expression, values_by_code)
        return [DetectedFieldValue(field, computed)]
Exemplo n.º 2
0
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect the value of a formula-based field from the fields it depends on.

        The method assumes field detection runs in dependency order, i.e. the
        values of every field listed in ``field.depends_on_fields`` are already
        calculated / detected and stored for ``doc``.

        :param log: logger of the running process (not used by this implementation).
        :param doc: document whose stored field values feed the formula.
        :param field: field to detect; must have a non-empty ``formula``.
        :return: a one-element list holding the result of ``cls.calc_formula``.
        :raises ValueError: if the field has no formula.
        """
        expression = field.formula
        if not expression:
            message = 'No formula specified for field {0} (#{1})'.format(
                field.code, field.uid)
            raise ValueError(message)

        dependencies = list(field.depends_on_fields.all())

        # Only values the user has not removed, restricted to the dependency fields.
        stored_values = list(
            doc.documentfieldvalue_set
            .filter(removed_by_user=False)
            .filter(field__in=dependencies))
        merged = merge_document_field_values_to_python_value(stored_values)

        # Every dependency gets an entry, None when nothing is stored for it.
        values_by_code = {}
        for dependency in dependencies:
            values_by_code[dependency.code] = merged.get(dependency.code)

        computed = cls.calc_formula(field.code, field.type, expression,
                                    values_by_code)
        return [DetectedFieldValue(field, computed)]
    def test_field_detector_model(task: ExtendedTask, field_id,
                                  document_id) -> dict:
        """Compare freshly detected values of a field with the stored ones for a document.

        Detection is run with ``save=False`` and its result is compared with the
        ``DocumentFieldValue`` rows stored for the same document and field that
        are not marked as removed by the user. A mismatch is reported through
        ``task.log_info``.

        :param task: task used for logging.
        :param field_id: primary key of the ``DocumentField`` to test.
        :param document_id: primary key of the ``Document`` to test on.
        :return: dict with ``text_units_number`` (count of the document's text
            units of the field's text unit type) and ``value_matches_expected``
            (whether detected and stored values agree).
        """
        document = Document.objects.get(pk=document_id)  # type: Document
        field = DocumentField.objects.get(pk=field_id)  # type: DocumentField

        expected_dfvs = field_detection.detect_and_cache_field_values(
            CeleryTaskLogger(task), document, field,
            save=False)  # type: List[DetectedFieldValue]
        actual_dfvs = list(
            DocumentFieldValue.objects.filter(
                document=document, field=field,
                removed_by_user=False))  # type: List[DocumentFieldValue]

        if field.is_value_aware():
            # dates, numbers, etc.: compare the merged python values
            expected_field_values = field_detection.merge_detected_field_values_to_python_value(
                expected_dfvs)
            expected_field_value = expected_field_values.get(field.code)

            actual_field_values = merge_document_field_values_to_python_value(
                actual_dfvs)
            actual_field_value = actual_field_values.get(field.code)

            matches = bool(expected_field_value == actual_field_value)
        else:
            # related-info etc.: compare the sets of text units the values point to
            expected_set = {
                'text_unit_' + str(dfv.text_unit.id)
                for dfv in expected_dfvs if dfv.text_unit
            }
            expected_field_value = '; '.join(sorted(expected_set))

            actual_set = {
                'text_unit_' + str(dfv.text_unit.id)
                for dfv in actual_dfvs if dfv.text_unit
            }
            actual_field_value = '; '.join(sorted(actual_set))
            matches = bool(expected_set == actual_set)

        if not matches:
            # A detected value may carry no text unit (the comparisons above
            # already guard for that), so skip such values instead of failing
            # with AttributeError while building the log message.
            detected_text = ''
            if expected_dfvs:
                detected_text = '\n---\n'.join(
                    dfv.text_unit.text
                    for dfv in expected_dfvs if dfv.text_unit)

            # This branch only runs on a mismatch, so the status marker is
            # always the error one.
            task.log_info(
                '{3} Test doc: {0} (Project: {5}). '
                'Detected: {1}. Real: {2}.\nDetected in text:-----\n{4}\n-----'
                .format(
                    document.name, expected_field_value, actual_field_value,
                    '[ ERR  ]', detected_text,
                    document.project.name if document.project else ''))

        text_units_number = TextUnit.objects.filter(
            document=document, unit_type=field.text_unit_type).count()

        return {
            'text_units_number': text_units_number,
            'value_matches_expected': matches
        }
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect values of a field by running its detectors over the values it depends on.

        The stored, not user-removed values of the fields listed in
        ``field.depends_on_fields`` are converted to strings one by one and
        offered to every ``DocumentFieldDetector`` registered for the document's
        type and this field. For fields that are neither multi-value nor choice
        fields the search stops at the first detected value.

        :param log: logger of the running process (not used by this implementation).
        :param doc: document whose stored field values are inspected.
        :param field: field to detect values for.
        :return: list of detected values, empty when nothing matched.
        """
        dependencies = list(field.depends_on_fields.all())

        # Only values the user has not removed, restricted to the dependency fields.
        stored_values = list(
            doc.documentfieldvalue_set
            .filter(removed_by_user=False)
            .filter(field__in=dependencies))
        merged = merge_document_field_values_to_python_value(stored_values)

        # Every dependency gets an entry, None when nothing is stored for it.
        values_by_code = {}
        for dependency in dependencies:
            values_by_code[dependency.code] = merged.get(dependency.code)

        document_type = doc.document_type

        detectors = DocumentFieldDetector.objects.filter(
            document_type=document_type, field=field)
        adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

        results = list()  # type: List[DetectedFieldValue]

        for raw_value in values_by_code.values():
            if not raw_value:
                continue
            source_text = str(raw_value)

            for detector in detectors:
                if not detector.matches(source_text):
                    continue

                value = detector.detected_value
                hint_name = None
                if adapter.value_aware:
                    # The type adapter resolves the actual value from the text.
                    hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = adapter.get_or_extract_value(
                        doc, field, value, hint_name, source_text)
                    if value is None:
                        continue

                results.append(
                    DetectedFieldValue(field, value, None, hint_name))

                # Single-value fields need only the first hit of this text.
                if not adapter.multi_value and not field.is_choice_field():
                    break

            # ... and need no further dependency values once something is found.
            if results and not adapter.multi_value \
                    and not field.is_choice_field():
                break

        return results
Exemplo n.º 5
0
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect values of a field by running its detectors over the values it depends on.

        The stored, not user-removed values of the fields listed in
        ``field.depends_on_fields`` are collected first. If the field has stop
        words, they are checked against the joined dependency values and a
        positive result is returned right away. Otherwise each dependency value
        is converted to a string and offered to every ``DocumentFieldDetector``
        of this field. For fields that are neither multi-value nor choice fields
        the search stops at the first detected value.

        :param log: logger of the running process (not used by this implementation).
        :param doc: document whose stored field values are inspected.
        :param field: field to detect values for.
        :return: list of detected values, empty when nothing matched.
        """
        dependencies = list(field.depends_on_fields.all())

        # Only values the user has not removed, restricted to the dependency fields.
        stored_values = list(
            doc.documentfieldvalue_set
            .filter(removed_by_user=False)
            .filter(field__in=dependencies))
        merged = merge_document_field_values_to_python_value(stored_values)

        # Every dependency gets an entry, None when nothing is stored for it.
        values_by_code = {}
        for dependency in dependencies:
            values_by_code[dependency.code] = merged.get(dependency.code)

        if field.stop_words:
            joined_text = '\n'.join(
                str(value) for value in values_by_code.values())
            stopped, stop_word_values \
                = detect_with_stop_words_by_field_and_full_text(field, joined_text)
            if stopped:
                return stop_word_values if stop_word_values else list()

        detectors = DocumentFieldDetector.objects.filter(field=field)
        adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

        results = list()  # type: List[DetectedFieldValue]

        for raw_value in values_by_code.values():
            if not raw_value:
                continue
            source_text = str(raw_value)

            for detector in detectors:  # type: DocumentFieldDetector
                matched_text = detector.matching_string(
                    source_text, text_is_sentence=False)
                if matched_text is None:
                    continue

                value = detector.get_validated_detected_value(field)
                hint_name = None
                if adapter.requires_value:
                    # The type adapter resolves the actual value from the matched text.
                    hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = adapter.get_or_extract_value(
                        doc, field, value, hint_name, matched_text)
                    if value is None:
                        continue

                results.append(
                    DetectedFieldValue(field, value, None, hint_name))

                # Single-value fields need only the first hit of this text.
                if not adapter.multi_value and not field.is_choice_field():
                    break

            # ... and need no further dependency values once something is found.
            if results and not adapter.multi_value \
                    and not field.is_choice_field():
                break

        return results