def detect_field_values(cls, log: ProcessLogger, doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
        """Detect values of a python-coded field in ``doc``.

        The python-coded implementation is looked up in
        ``PYTHON_CODED_FIELDS_REGISTRY`` by ``field.python_coded_field``.
        Depending on its ``by_sentence`` flag it is run either on every text
        unit of ``field.text_unit_type`` or once on the full document text.

        :param log: process logger (not used by this implementation).
        :param doc: document to search in.
        :param field: field whose values are detected.
        :return: detected values; at most one unless the field type is
            multi-value or the field is a choice field.
        :raises RuntimeError: if the python-coded field is not registered, or
            if a value found in the full text starts outside any sentence.
        """
        coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not coded_field:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))
        type_adapter = FIELD_TYPES_REGISTRY[field.type]  # type: FieldType

        def first_value_is_enough() -> bool:
            # Single-value, non-choice fields stop at the first detected value.
            return not (type_adapter.multi_value or field.is_choice_field())

        results = []  # type: List[DetectedFieldValue]

        if not coded_field.by_sentence:
            # Whole-document mode: locations are document-wide offsets and have
            # to be converted into offsets inside the containing text unit.
            for value, start, end in coded_field.get_values(doc.full_text) or []:
                # NOTE(review): unit type is hard-coded to 'sentence' here while
                # the per-unit mode uses field.text_unit_type - confirm intent.
                unit = TextUnit.objects.filter(
                    document=doc,
                    unit_type='sentence',
                    location_start__lte=start,
                    location_end__gte=start).first()  # type: TextUnit
                if not unit:
                    raise RuntimeError(
                        'Python coded field {0} detected a value in document {1} at '
                        'location [{2};{3}] but the start of location does not belong to any '
                        'text unit object in DB.\n'
                        'It can not be. Something is broken.'.format(
                            field.python_coded_field, doc, start, end))
                length = end - start
                start = start - unit.location_start
                end = start + length
                results.append(
                    DetectedFieldValue(field, value, unit, None, start, end))
                if first_value_is_enough():
                    return results
            return results

        units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        for unit in units.iterator():
            for value, start, end in coded_field.get_values(unit.text) or []:
                results.append(
                    DetectedFieldValue(field, value, unit, None, start, end))
                if first_value_is_enough():
                    return results

        return results
Exemplo n.º 2
0
    def detect_field_values(cls,
                            log: ProcessLogger,
                            doc: Document,
                            field: DocumentField,
                            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Calculate the value of a formula-based field from the fields it depends on.

        This method assumes that field detection already goes in the required
        order and dependencies of this field are already calculated / detected.

        :param log: process logger (not used by this implementation).
        :param doc: document whose stored field values feed the formula.
        :param field: formula field to calculate.
        :param cached_fields: accepted for interface compatibility; not used here.
        :return: the stop-words result when stop words matched (possibly an
            empty list), otherwise a single-item list with the formula result.
        :raises ValueError: if the field has no formula.
        """
        formula = field.formula

        if not formula:
            raise ValueError('No formula specified for field {0} (#{1})'.format(field.code, field.uid))

        depends_on_fields = list(field.depends_on_fields.all())

        qs_document_field_values = doc.documentfieldvalue_set \
            .filter(removed_by_user=False) \
            .filter(field__in=depends_on_fields)

        merged_values = merge_document_field_values_to_python_value(list(qs_document_field_values))

        # Every dependency gets a key, with None for values which are absent.
        field_code_to_value = {f.code: merged_values.get(f.code) for f in depends_on_fields}

        if field.stop_words:
            # Skip absent values: str(None) would inject the literal text "None"
            # into the text the stop words are matched against.
            depends_on_full_text = '\n'.join(
                str(v) for v in field_code_to_value.values() if v is not None)
            detected_with_stop_words, detected_values \
                = detect_with_stop_words_by_field_and_full_text(field,
                                                                depends_on_full_text)
            if detected_with_stop_words:
                return detected_values or list()

        v = cls.calc_formula(field.code, field.type, formula, field_code_to_value)
        return [DetectedFieldValue(field, v)]
Exemplo n.º 3
0
    def predict_and_extract_value(cls, sklearn_model: SkLearnClassifierModel,
                                  field_type_adapter: FieldType,
                                  document: Document,
                                  field: DocumentField,
                                  text_unit: TextUnit) -> Optional[DetectedFieldValue]:
        """Run the classifier on one text unit and build a detected value for ``field``.

        :param sklearn_model: trained model passed through to ``cls.predict_value``.
        :param field_type_adapter: type adapter of ``field``.
        :param document: document the text unit belongs to.
        :param field: field the caller is interested in.
        :param text_unit: text unit to classify.
        :return: a detected value if the prediction is for ``field``, else ``None``.
        """
        predicted_uid, predicted_value, predicted_hint = cls.predict_value(sklearn_model, text_unit)

        # The prediction is for some other field (or for nothing).
        if predicted_uid != field.uid:
            return None

        # Types which are not value-aware only record where the match happened.
        if not field_type_adapter.value_aware:
            return DetectedFieldValue(field, None, text_unit)

        hint = predicted_hint or ValueExtractionHint.TAKE_FIRST.name
        extracted_value, hint = field_type_adapter.get_or_extract_value(
            document, field, predicted_value, hint, text_unit.text)
        return DetectedFieldValue(field, extracted_value, text_unit, hint)
Exemplo n.º 4
0
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Calculate the value of a formula-based field from the fields it depends on.

        This method assumes that field detection already goes in the required
        order and dependencies of this field are already calculated / detected.

        :param log: process logger (not used by this implementation).
        :param doc: document whose stored field values feed the formula.
        :param field: formula field to calculate.
        :return: single-item list with the formula result.
        :raises ValueError: if the field has no formula.
        """
        formula = field.formula
        if not formula:
            raise ValueError(
                'No formula specified for field {0} (#{1})'.format(
                    field.code, field.uid))

        dependencies = list(field.depends_on_fields.all())

        # Only values which the user has not removed take part in the formula.
        stored_values = list(
            doc.documentfieldvalue_set
            .filter(removed_by_user=False)
            .filter(field__in=dependencies))
        merged = merge_document_field_values_to_python_value(stored_values)

        # Every dependency gets a key, with None for values which are absent.
        formula_inputs = {}
        for dependency in dependencies:
            formula_inputs[dependency.code] = merged.get(dependency.code)

        result = cls.calc_formula(field.code, field.type, formula, formula_inputs)
        return [DetectedFieldValue(field, result)]
Exemplo n.º 5
0
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Predict the field value with the classifier trained for this field.

        This method assumes that field detection already goes in the required
        order and dependencies of this field are already calculated / detected.

        :param log: process logger (not used by this implementation).
        :param doc: document; its ``field_values`` are the classifier input.
        :param field: field to predict the value for.
        :return: single-item list; the value is ``None`` when the predicted
            index is outside of the model's categories.
        :raises ClassifierModel.DoesNotExist: if no model is stored for this
            document type and field.
        """
        document_type = doc.document_type  # type: DocumentType

        # A missing model propagates as ClassifierModel.DoesNotExist; the former
        # "except ...: raise e" wrapper added nothing and was removed.
        classifier_model = ClassifierModel.objects \
            .get(document_type=document_type, document_field=field)
        obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]

        model = obj['model']
        categories = obj['categories']

        predicted = model.predict([doc.field_values])
        target_index = predicted[0]

        target_name = categories[target_index] if 0 <= target_index < len(
            categories) else None

        return [DetectedFieldValue(field, target_name)]
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Predict the field value from the cached field values of the document.

        Stop words are tried first via ``cls.maybe_detect_with_stop_words``; any
        non-``None`` result of that call is returned as is.

        :param log: process logger (not used by this implementation).
        :param doc: document being processed (not read by this implementation).
        :param field: field to predict the value for.
        :param cached_fields: field values used as the classifier input.
        :return: stop-words result, or a single-item list whose value is
            ``None`` when the predicted index is outside of the categories.
        :raises ClassifierModel.DoesNotExist: if no model is stored for the field.
        """
        detected_values = cls.maybe_detect_with_stop_words(
            field, cached_fields)
        if detected_values is not None:
            return detected_values

        # A missing model propagates as ClassifierModel.DoesNotExist; the former
        # "except ...: raise e" wrapper added nothing and was removed.
        classifier_model = ClassifierModel.objects.get(document_field=field)
        obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]

        model = obj['model']
        categories = obj['categories']

        predicted = model.predict([cached_fields])
        target_index = predicted[0]

        target_name = categories[target_index] if 0 <= target_index < len(
            categories) else None

        return [DetectedFieldValue(field, target_name)]
Exemplo n.º 7
0
 def _maybe_add_val(f, v, owner):
     """Record ``v`` as a detected value of field ``f`` owned by ``owner``.

     ``None`` values are ignored. Results are collected in ``fields_to_values``,
     which comes from the enclosing scope (not visible in this block).
     """
     if v is not None:
         detected = DetectedFieldValue(f, v, user=owner)
         existing = fields_to_values.get(f)
         if existing:
             existing.append(detected)
         else:
             # A missing or empty entry is replaced with a fresh list.
             fields_to_values[f] = [detected]
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect field values by running field detectors on dependent field values.

        The values of the fields this field depends on are converted to strings
        and each one is checked against the detectors configured for the
        document type and field.

        :param log: process logger (not used by this implementation).
        :param doc: document whose stored field values are examined.
        :param field: field to detect values for.
        :return: detected values; at most one unless the field type is
            multi-value or the field is a choice field.
        """
        dependencies = list(field.depends_on_fields.all())

        stored_values = doc.documentfieldvalue_set \
            .filter(removed_by_user=False) \
            .filter(field__in=dependencies)
        merged = merge_document_field_values_to_python_value(list(stored_values))
        values_by_code = {dep.code: merged.get(dep.code) for dep in dependencies}

        detectors = DocumentFieldDetector.objects.filter(
            document_type=doc.document_type, field=field)
        type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

        def first_value_is_enough() -> bool:
            # Single-value, non-choice fields stop at the first detected value.
            return not (type_adapter.multi_value or field.is_choice_field())

        results = []  # type: List[DetectedFieldValue]

        for raw_value in values_by_code.values():
            if not raw_value:
                continue
            source_text = str(raw_value)

            for detector in detectors:
                if not detector.matches(source_text):
                    continue

                value = detector.detected_value
                hint_name = None
                if type_adapter.value_aware:
                    hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, source_text)
                    if value is None:
                        # Detector matched but no value could be extracted.
                        continue

                results.append(DetectedFieldValue(field, value, None, hint_name))
                if first_value_is_enough():
                    break

            if results and first_value_is_enough():
                break

        return results
Exemplo n.º 9
0
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect field values by running field detectors on the document text units.

        Stop words are checked against the full document text first; when they
        match, their result is returned and no detector is run.

        :param log: process logger (not used by this implementation).
        :param doc: document to search in.
        :param field: field to detect values for.
        :return: detected values; at most one unless the field type is
            multi-value or the field is a choice field.
        """
        stop_words_matched, stop_word_values = \
            detect_with_stop_words_by_field_and_full_text(field, doc.full_text)
        if stop_words_matched:
            return stop_word_values or []

        units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        detectors = DocumentFieldDetector.objects.filter(
            document_type=doc.document_type, field=field)

        type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

        def first_value_is_enough() -> bool:
            # Single-value, non-choice fields stop at the first detected value.
            return not (type_adapter.multi_value or field.is_choice_field())

        results = []  # type: List[DetectedFieldValue]

        for unit in units.iterator():
            for detector in detectors:
                if not detector.matches(unit.text):
                    continue

                value = detector.detected_value
                hint_name = None
                if type_adapter.value_aware:
                    hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, unit.text)
                    if value is None:
                        # Detector matched but no value could be extracted.
                        continue

                results.append(DetectedFieldValue(field, value, unit, hint_name))
                if first_value_is_enough():
                    break

            if results and first_value_is_enough():
                break

        return results
Exemplo n.º 10
0
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Detect field values by matching field detectors against text units.

        Stop words are checked against the full document text first; when they
        match, their result is returned and no detector is run. Text units and
        detectors are obtained through the repositories of
        ``RegexpsOnlyFieldDetectionStrategy``.

        :param log: process logger (not used by this implementation).
        :param doc: document to search in.
        :param field: field to detect values for.
        :param cached_fields: accepted for interface compatibility; not used here.
        :return: detected values; at most one unless the field type is
            multi-value or the field is a choice field.
        """
        stop_words_matched, stop_word_values = \
            detect_with_stop_words_by_field_and_full_text(field, doc.full_text)
        if stop_words_matched:
            return stop_word_values or []

        strategy = RegexpsOnlyFieldDetectionStrategy
        units = strategy.text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
        matchers = [
            DetectorFieldMatcher(detector)
            for detector in strategy.field_detector_repo.get_field_detectors(field)
        ]

        type_adapter = field.get_field_type()  # type: FieldType

        def first_value_is_enough() -> bool:
            # Single-value, non-choice fields stop at the first detected value.
            return not (type_adapter.multi_value or field.is_choice_field())

        results = []  # type: List[DetectedFieldValue]

        for unit in units:  # type: TextUnit
            for matcher in matchers:
                matched_text = matcher.matching_string(
                    unit.text, text_is_sentence=unit.is_sentence())
                if matched_text is None:
                    continue

                value = matcher.get_validated_detected_value(field)
                hint_name = None
                if type_adapter.requires_value:
                    hint_name = matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, matched_text)
                    if value is None:
                        # Detector matched but no value could be extracted.
                        continue

                results.append(DetectedFieldValue(field, value, unit, hint_name))
                if first_value_is_enough():
                    break

            if results and first_value_is_enough():
                break

        return results
Exemplo n.º 11
0
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Detect field values by matching field detectors against text units.

        Stop words are checked against the full document text first; when they
        match, their result is returned and no detector is run.

        :param log: process logger (not used by this implementation).
        :param doc: document to search in.
        :param field: field to detect values for.
        :param cached_fields: accepted for interface compatibility; not used here.
        :return: detected values; at most one unless the field type is
            multi-value or the field is a choice field.
        """
        stop_words_matched, stop_word_values = \
            detect_with_stop_words_by_field_and_full_text(field, doc.full_text)
        if stop_words_matched:
            return stop_word_values or []

        units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        detectors = DocumentFieldDetector.objects.filter(field=field)

        type_adapter = field.get_field_type()  # type: FieldType

        def first_value_is_enough() -> bool:
            # Single-value, non-choice fields stop at the first detected value.
            return not (type_adapter.multi_value or field.is_choice_field())

        results = []  # type: List[DetectedFieldValue]

        for unit in units.iterator():  # type: TextUnit
            for detector in detectors:
                matched_text = detector.matching_string(
                    unit.text, text_is_sentence=unit.is_sentence())
                if matched_text is None:
                    continue

                value = detector.get_validated_detected_value(field)
                hint_name = None
                if type_adapter.requires_value:
                    hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, matched_text)
                    if value is None:
                        # Detector matched but no value could be extracted.
                        continue

                results.append(DetectedFieldValue(field, value, unit, hint_name))
                if first_value_is_enough():
                    break

            if results and first_value_is_enough():
                break

        return results
def detect_with_stop_words_by_field_and_full_text(field: DocumentField, full_text: str) -> Tuple[bool, Optional[List]]:
    """Try to determine the field value from its stop words.

    :param field: field whose ``stop_words`` are applied.
    :param full_text: text the stop words are matched against.
    :return: ``(False, None)`` when the field requires text annotations, has no
        usable stop words, or nothing matched; ``(True, None)`` when a stop word
        matched without a value; otherwise ``(True, [DetectedFieldValue])``.
    """
    if field.requires_text_annotations:
        return False, None

    compiled_stop_words = compile_stop_words(field.stop_words)
    if not compiled_stop_words:
        return False, None

    type_adapter = field.get_field_type()  # type: FieldType
    matched, raw_value = detect_value_with_stop_words(compiled_stop_words, full_text)
    if not matched:
        return False, None

    if raw_value is not None:
        converted = type_adapter.extract_from_possible_value_text(field, raw_value)
        return True, [DetectedFieldValue(field, converted)]
    return True, None
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Predict the field value, falling back to the "unsure" choice on low confidence.

        Stop words are tried first via ``cls.maybe_detect_with_stop_words``; any
        non-``None`` result of that call is returned as is. Otherwise the most
        probable category is taken and compared with its threshold.

        :param log: process logger (not used by this implementation).
        :param doc: document being processed (not read by this implementation).
        :param field: field to predict the value for.
        :param cached_fields: field values used as the classifier input.
        :return: stop-words result, or a single-item list with the predicted
            value or ``field.unsure_choice_value``.
        :raises ClassifierModel.DoesNotExist: if no model is stored for the field.
        """
        # If changing this code make sure you update similar code in notebooks/demo/Train and Debug Decision Tree...

        detected_values = cls.maybe_detect_with_stop_words(
            field, cached_fields)
        if detected_values is not None:
            return detected_values

        # A missing model propagates as ClassifierModel.DoesNotExist; the former
        # "except ...: raise e" wrapper added nothing and was removed.
        classifier_model = ClassifierModel.objects.get(document_field=field)
        obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]

        model = obj['model']
        categories = obj['categories']

        category_probabilities = model.predict_proba([cached_fields])[0]

        # Index of the most probable category (first one wins on ties).
        target_index = max(range(len(category_probabilities)),
                           key=category_probabilities.__getitem__)
        target_probability = category_probabilities[target_index]

        predicted_value = categories[
            target_index] if 0 <= target_index < len(categories) else None

        if predicted_value is None:
            target_name = field.unsure_choice_value
        else:
            # Note: a falsy per-value threshold (e.g. 0) also falls back to
            # the default because of the "or".
            threshold = (field.unsure_thresholds_by_value or {}) \
                            .get(predicted_value) or DocumentField.DEFAULT_UNSURE_THRESHOLD

            target_name = predicted_value if target_probability >= threshold else field.unsure_choice_value

        return [DetectedFieldValue(field, target_name)]
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Predict the field value with the classifier, after trying stop words.

        When the field has stop words, they are matched against the non-empty
        values of the fields this field depends on, joined with newlines. If
        they match, their result is returned and the classifier is not used.

        :param log: process logger (not used by this implementation).
        :param doc: document; its ``field_values`` are the classifier input.
        :param field: field to predict the value for.
        :return: stop-words result, or a single-item list whose value is
            ``None`` when the predicted index is outside of the categories.
        :raises ClassifierModel.DoesNotExist: if no model is stored for this
            document type and field.
        """
        if field.stop_words:
            depends_on_fields = list(field.depends_on_fields.all())
            depends_on_full_text = []

            for df in depends_on_fields:  # type: DocumentField
                field_type_adapter = FIELD_TYPES_REGISTRY[
                    df.type]  # type: FieldType
                v = field_type_adapter.merged_db_value_to_python(
                    doc.field_values.get(df.uid))
                if v:
                    depends_on_full_text.append(str(v))

            detected_with_stop_words, detected_values = \
                detect_with_stop_words_by_field_and_full_text(field, '\n'.join(depends_on_full_text))
            if detected_with_stop_words:
                return detected_values or list()

        document_type = doc.document_type  # type: DocumentType

        # A missing model propagates as ClassifierModel.DoesNotExist; the former
        # "except ...: raise e" wrapper added nothing and was removed.
        classifier_model = ClassifierModel.objects \
            .get(document_type=document_type, document_field=field)
        obj = classifier_model.get_trained_model_obj()  # type: Dict[str, Any]

        model = obj['model']
        categories = obj['categories']

        predicted = model.predict([doc.field_values])
        target_index = predicted[0]

        target_name = categories[target_index] if 0 <= target_index < len(
            categories) else None

        return [DetectedFieldValue(field, target_name)]
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Detect values of a python-coded field in ``doc``.

        The python-coded implementation is looked up in
        ``PYTHON_CODED_FIELDS_REGISTRY`` by ``field.python_coded_field``.
        Depending on its ``detect_per_text_unit`` flag it is run either on every
        text unit of ``field.text_unit_type`` or once on the full document text.

        :param log: process logger, passed through to ``get_values``.
        :param doc: document to search in.
        :param field: field whose values are detected.
        :param cached_fields: accepted for interface compatibility; not used here.
        :return: detected values; at most one unless the field type is
            multi-value or the field is a choice field.
        :raises RuntimeError: if the python-coded field is not registered, if a
            value has no location although the field requires text annotations,
            or if a located value starts outside any text unit.
        """
        coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not coded_field:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))
        type_adapter = field.get_field_type()  # type: FieldType

        def first_value_is_enough() -> bool:
            # Single-value, non-choice fields stop at the first detected value.
            return not (type_adapter.multi_value or field.is_choice_field())

        results = []  # type: List[DetectedFieldValue]

        if not coded_field.detect_per_text_unit:
            # Whole-document mode: locations (when given) are document-wide
            # offsets and are converted into offsets inside the text unit.
            for value, start, end in coded_field.get_values(log, field, doc, doc.full_text) or []:
                has_location = start is not None and end is not None

                if field.requires_text_annotations and not has_location:
                    raise RuntimeError(
                        'Python coded field {0} detected a value in document {1} at '
                        'undefined location but the field requires text annotation (and location).\n'
                        'This should not happen. Something is broken.'.format(
                            field.python_coded_field, doc))

                if has_location:
                    unit = TextUnit.objects.filter(
                        document=doc,
                        unit_type=field.text_unit_type,
                        location_start__lte=start,
                        location_end__gte=start).first()  # type: TextUnit
                    if not unit:
                        raise RuntimeError(
                            'Python coded field {0} detected a value in document {1} at '
                            'location [{2};{3}] but the start of location does not belong to any '
                            'text unit object in DB.\n'
                            'This should not happen. Something is broken.'.
                            format(field.python_coded_field, doc, start, end))
                    length = end - start
                    start = start - unit.location_start
                    end = start + length
                else:
                    # A partially specified location is dropped altogether.
                    unit = None
                    start = None
                    end = None

                results.append(
                    DetectedFieldValue(field, value, unit, None, start, end))
                if first_value_is_enough():
                    return results
            return results

        units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        for unit in units.iterator():
            for value, start, end in coded_field.get_values(log, field, doc, unit.text) or []:
                results.append(
                    DetectedFieldValue(field, value, unit, None, start, end))
                if first_value_is_enough():
                    return results

        return results
Exemplo n.º 16
0
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect field values by running field detectors on dependent field values.

        When the field has stop words, they are matched first against the values
        of the fields this field depends on, joined with newlines; if they
        match, their result is returned and no detector is run.

        :param log: process logger (not used by this implementation).
        :param doc: document whose stored field values are examined.
        :param field: field to detect values for.
        :return: stop-words result, or detected values; at most one unless the
            field type is multi-value or the field is a choice field.
        """
        depends_on_fields = list(field.depends_on_fields.all())

        qs_document_field_values = doc.documentfieldvalue_set \
            .filter(removed_by_user=False) \
            .filter(field__in=depends_on_fields)

        merged_values = merge_document_field_values_to_python_value(
            list(qs_document_field_values))

        # Every dependency gets a key, with None for values which are absent.
        field_code_to_value = {
            f.code: merged_values.get(f.code)
            for f in depends_on_fields
        }

        if field.stop_words:
            # Skip absent values: str(None) would inject the literal text "None"
            # into the text the stop words are matched against.
            depends_on_full_text = '\n'.join(
                str(v) for v in field_code_to_value.values() if v is not None)
            detected_with_stop_words, detected_values \
                = detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
            if detected_with_stop_words:
                return detected_values or list()

        field_detectors = DocumentFieldDetector.objects.filter(field=field)
        field_type_adapter = FIELD_TYPES_REGISTRY.get(
            field.type)  # type: FieldType

        detected_values = list()  # type: List[DetectedFieldValue]

        for depends_on_value in field_code_to_value.values():
            if not depends_on_value:
                continue
            depends_on_value = str(depends_on_value)
            for field_detector in field_detectors:  # type: DocumentFieldDetector
                matching_string = field_detector.matching_string(
                    depends_on_value, text_is_sentence=False)
                if matching_string is not None:
                    value = field_detector.get_validated_detected_value(field)
                    hint_name = None
                    if field_type_adapter.requires_value:
                        hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                        value, hint_name = field_type_adapter \
                            .get_or_extract_value(doc,
                                                  field, value,
                                                  hint_name,
                                                  matching_string)
                        if value is None:
                            # Detector matched but no value could be extracted.
                            continue

                    detected_values.append(
                        DetectedFieldValue(field, value, None, hint_name))

                    # Single-value, non-choice fields stop at the first value.
                    if not (field_type_adapter.multi_value
                            or field.is_choice_field()):
                        break

            if detected_values and not (field_type_adapter.multi_value
                                        or field.is_choice_field()):
                break

        return detected_values