def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by running the field's trained classifier over the document text units.

    Stop words are checked first against ``doc.full_text``; when they match, the value
    they produce is returned right away. Otherwise text units of type
    ``field.text_unit_type`` are visited ordered by ``location_start`` and ``pk``, and
    ``cls.predict_and_extract_value`` is asked for an annotation for each of them.

    :param log: logger; only used to report a missing classifier model
    :param doc: document being processed
    :param field: field whose value is detected
    :param field_code_to_value: not used by this implementation
    :return: for a non multi-value field - DTO built from the first annotation found;
             for a multi-value field - DTO combining all annotations found;
             None when no annotation was produced
    :raises ClassifierModel.DoesNotExist: re-raised after logging when the field has no model
    """
    # NOTE(review): other call sites in this file pass `doc` as the second argument
    # of this helper - confirm which signature is current.
    stop_words_hit, stop_words_value = detect_with_stop_words_by_field_and_full_text(
        field, doc.full_text)
    if stop_words_hit:
        return FieldValueDTO(field_value=stop_words_value)

    try:
        model_record = ClassifierModel.objects.get(document_field=field)
        trained_model = model_record.get_trained_model_obj()
        typed_field: TypedField = TypedField.by(field)
        is_multi_value = isinstance(typed_field, MultiValueField)

        found: List[AnnotationDTO] = []

        text_units = TextUnit.objects.filter(document=doc)
        text_units = text_units.filter(unit_type=field.text_unit_type)
        text_units = text_units.order_by('location_start', 'pk')

        limit = field.detect_limit_count
        limited_by_chars = limit and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR

        counted = 0
        for unit in text_units.iterator():
            if limit:
                counted = FieldDetectionStrategy.update_units_counted(field, counted, unit)
                if counted > limit:
                    break

            annotation = cls.predict_and_extract_value(sklearn_model=trained_model,
                                                       typed_field=typed_field,
                                                       document=doc,
                                                       field=field,
                                                       text_unit=unit)
            if annotation is None:
                continue

            # With a character limit, stop at the first annotation starting beyond it.
            if limited_by_chars and annotation.location_in_doc_start > limit:
                break

            found.append(annotation)
            if not is_multi_value:
                # The first annotation wins for single-value fields.
                return FieldValueDTO(field_value=annotation.annotation_value, annotations=found)

            if limited_by_chars:
                counted += len(unit.text)

        if not found:
            return None

        json_values = [a.annotation_value for a in found]
        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(json_values),
            annotations=found)

    except ClassifierModel.DoesNotExist as e:
        log.info(f'Classifier model does not exist for field: {field.code}')
        raise e
def maybe_detect_with_stop_words(cls,
                                 field: DocumentField,
                                 doc: Document,
                                 cached_fields: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Try to resolve the field value with stop words applied to the values of the fields it depends on.

    The truthy cached values of the fields listed in ``field.depends_on_fields`` are
    converted to strings, joined with newlines and handed to
    ``detect_with_stop_words_by_field_and_full_text``.

    :param field: field being detected; nothing is done unless ``field.stop_words`` is truthy
    :param doc: document, passed through to the stop words helper
    :param cached_fields: mapping of field code to its already known value
    :return: DTO holding the json-converted detected value when the helper reports a detection,
             None otherwise
    """
    if not field.stop_words:
        return None

    # Cheap guard goes first so that the depends-on query below is not executed
    # when there is nothing to look values up in. any() over a dict iterates its
    # keys, i.e. this is true when there is no truthy key; None is treated the
    # same way instead of raising TypeError.
    if cached_fields is None or not any(cached_fields):
        return None

    depends_on_field_codes = list(
        field.depends_on_fields.all().values_list('code', flat=True))  # type: List[str]

    cached_values = (cached_fields.get(code) for code in depends_on_field_codes)
    depends_on_full_text = '\n'.join(str(v) for v in cached_values if v)

    detected_with_stop_words, detected_field_value = \
        detect_with_stop_words_by_field_and_full_text(field=field,
                                                      doc=doc,
                                                      full_text=depends_on_full_text)
    if not detected_with_stop_words:
        return None

    typed_field = TypedField.by(field)
    return FieldValueDTO(field_value=typed_field.field_value_python_to_json(detected_field_value))
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> FieldValueDTO:
    """
    Calculate the field value with the formula configured for the field.

    Only the values of the fields returned by ``field.get_depends_on_codes()`` are made
    available to the formula. When the field has stop words they are checked first against
    the newline-joined string forms of these values; a stop words hit short-circuits
    the formula.

    :param log: logger used for debug messages
    :param doc: document being processed
    :param field: field whose value is calculated
    :param field_code_to_value: mapping of field code to its current value
    :return: DTO with the value found by stop words, or with the json-converted formula result
    :raises ValueError: when the field has no formula, or the formula result is rejected by
                        ``TypedField.is_python_field_value_ok``
    """
    formula = field.formula
    if not formula:
        raise ValueError(f'No formula specified for field {field.code} (#{field.uid})')

    depends_on_field_codes = field.get_depends_on_codes() or set()
    field_code_to_value = {c: v for c, v in field_code_to_value.items()
                           if c in depends_on_field_codes}

    if field.stop_words:
        depends_on_full_text = '\n'.join([str(v) for v in field_code_to_value.values()])
        log.debug('detect_field_value: formula_based_field_detection, checking stop words, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        detected_with_stop_words, detected_values \
            = detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            # Wrap the value in a DTO: the method is declared to return FieldValueDTO
            # and the other detection strategies do the same for a stop words hit.
            # Previously a bare value / empty list was returned here.
            return FieldValueDTO(field_value=detected_values)
    else:
        log.debug('detect_field_value: formula_based_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')

    v = cls.calc_formula(
        field_code=field.code,
        formula=formula,
        depends_on_field_to_value=field_code_to_value,
        convert_decimals_to_floats=field.convert_decimals_to_floats_in_formula_args)
    typed_field = TypedField.by(field)

    # We don't accept formulas returning values of wrong type to avoid further confusion and
    # creating wrong formulas in future.
    # For example for multi-choice fields the formula should return a list and not a string
    # to ensure the admin understands that this value will replace the whole set/list of
    # strings and not just add one more string to the value.
    if typed_field.is_choice_field and typed_field.multi_value:
        if v and isinstance(v, str):
            # An "outdated" formula is incorrect and returns a string instead of a
            # set / list, but we don't warn the user: when they update this formula
            # (or other detection method) they will be forced to write code returning
            # a list or a set.
            v = [v]

    if not typed_field.is_python_field_value_ok(v):
        raise ValueError(
            f'Formula of field {field.code} returned value not suitable for this field:\n{v}')

    v = typed_field.field_value_python_to_json(v)
    return FieldValueDTO(field_value=v)
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by applying the field detectors to the values of the fields it depends on.

    Only the values of the fields returned by ``field.get_depends_on_codes()`` are examined.
    When the field has stop words they are checked first against the newline-joined string
    forms of these values. Then every truthy value is converted to a string and matched
    against each detector stored for the field.

    :param log: not used by this implementation
    :param doc: document being processed
    :param field: field whose value is detected
    :param field_code_to_value: mapping of field code to its current value
    :return: for a non multi-value field - DTO with the first detected value, or None when
             nothing was detected; for a multi-value field - DTO built from all detected
             values (built from an empty list when nothing was detected)
    """
    # get_depends_on_codes() may give nothing back (the formula-based strategy guards
    # against that as well) - fall back to an empty set so the membership test below
    # does not fail on None.
    depends_on_fields = field.get_depends_on_codes() or set()
    field_code_to_value = {c: v for c, v in field_code_to_value.items()
                           if c in depends_on_fields}

    if field.stop_words:
        depends_on_full_text = '\n'.join([str(v) for v in field_code_to_value.values()])
        detected_with_stop_words, detected_value \
            = detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

    field_detectors = DocumentFieldDetector.objects.filter(field=field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]
    typed_field = TypedField.by(field)  # type: TypedField
    is_multi_value = isinstance(typed_field, MultiValueField)

    values = []  # type: List

    for depends_on_value in field_code_to_value.values():
        if not depends_on_value:
            continue
        depends_on_value = str(depends_on_value)

        for detector_field_matcher in detectors:  # type: DetectorFieldMatcher
            matching_piece = detector_field_matcher.matching_string(depends_on_value,
                                                                    text_is_sentence=False)
            if matching_piece is None:
                continue

            matching_string = matching_piece[0]
            value = detector_field_matcher.get_validated_detected_value(field)
            if typed_field.requires_value:
                hint_name = detector_field_matcher.extraction_hint \
                    or ValueExtractionHint.TAKE_FIRST.name
                value, hint_name = typed_field.get_or_extract_value(
                    doc, value, hint_name, matching_string)
                if value is None:
                    continue

            value = typed_field.annotation_value_python_to_json(value)
            if not is_multi_value:
                # The first detected value wins for single-value fields.
                return FieldValueDTO(field_value=value)
            values.append(value)

    if is_multi_value:
        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(values))
    return None
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by running the field's trained classifier over the document text units.

    Stop words are checked first against ``doc.full_text``. Text units come from
    ``cls.text_unit_repo`` and are narrowed down by
    ``FieldDetectionStrategy.reduce_textunits_by_detection_limit``; every unit is then
    passed to ``cls.predict_and_extract_value``.

    :param log: logger used for debug / info messages
    :param doc: document being processed
    :param field: field whose value is detected
    :param field_code_to_value: not used by this implementation
    :return: for a non multi-value field - DTO built from the first annotation found;
             for a multi-value field - DTO combining all annotations found;
             None when no annotation was produced
    :raises ClassifierModel.DoesNotExist: re-raised after logging when the field has no model
    """
    log.debug('detect_field_value: regexps_and_text_based_ml_field_value, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')

    found: List[AnnotationDTO] = []
    repo = cls.text_unit_repo
    full_text: str = doc.full_text
    typed_field: TypedField = TypedField.by(field)

    # NOTE(review): other call sites in this file pass `doc` as the second argument
    # of this helper - confirm which signature is current.
    stop_words_hit, stop_words_value = \
        detect_with_stop_words_by_field_and_full_text(field, full_text)
    if stop_words_hit:
        return FieldValueDTO(field_value=stop_words_value)

    text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
        repo.get_doc_text_units(doc, field.text_unit_type), field)

    try:
        model_record = ClassifierModel.objects.get(document_field=field)
        trained_model = model_record.get_trained_model_obj()

        for unit in text_units.iterator():  # type: TextUnit
            annotation = cls.predict_and_extract_value(sklearn_model=trained_model,
                                                       typed_field=typed_field,
                                                       document=doc,
                                                       field=field,
                                                       text=unit.text,
                                                       location_start=unit.location_start,
                                                       location_end=unit.location_end)
            if annotation is None:
                continue

            found.append(annotation)
            if not isinstance(typed_field, MultiValueField):
                # The first annotation wins for single-value fields.
                return FieldValueDTO(field_value=annotation.annotation_value,
                                     annotations=found)

        if not found:
            return None

        json_values = [a.annotation_value for a in found]
        return FieldValueDTO(
            field_value=typed_field.build_json_field_value_from_json_ant_values(json_values),
            annotations=found)

    except ClassifierModel.DoesNotExist as e:
        log.info(f'Classifier model does not exist for field: {field.code}')
        raise e
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by applying the field detectors to the document text units.

    Stop words are checked first against ``doc.full_text``. Text units come from
    ``cls.text_unit_repo`` and are narrowed down by
    ``FieldDetectionStrategy.reduce_textunits_by_detection_limit``; annotations for each
    unit are produced by ``cls.extract_from_textunit``.

    :param log: logger used for a debug message
    :param doc: document being processed
    :param field: field whose value is detected
    :param field_code_to_value: not used by this implementation
    :return: for a non multi-value field - DTO built from the annotations of the first text
             unit that produced any; for a multi-value field - DTO combining the annotations
             of all text units; None when no annotation was produced
    """
    try:
        log.debug('detect_field_value: regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
    except AttributeError:
        # Deliberately best-effort: a failure to write the debug message must not
        # break detection. NOTE(review): presumably covers callers passing no
        # usable logger - confirm.
        pass

    ants: List[AnnotationDTO] = []
    depends_on_full_text: str = doc.full_text
    typed_field: TypedField = TypedField.by(field)
    text_unit_repo: TextUnitRepository = cls.text_unit_repo
    field_detector_repo: FieldDetectorRepository = cls.field_detector_repo

    detected_with_stop_words, detected_value = \
        detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
        qs_text_units, field)

    field_detectors = field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]

    for text_unit in qs_text_units:
        unit_ants = cls.extract_from_textunit(text_unit, field, detectors)
        if not unit_ants:
            continue
        if not isinstance(typed_field, MultiValueField):
            # Single-value field: the first text unit with annotations decides the value.
            return FieldValueDTO(field_value=unit_ants[0].annotation_value,
                                 annotations=unit_ants)
        ants += unit_ants

    if not ants:
        return None

    # Annotations are accumulated only for multi-value fields (single-value fields
    # return from inside the loop), so the former single-value branch here could
    # never run and has been dropped.
    field_value = typed_field.build_json_field_value_from_json_ant_values(
        [a.annotation_value for a in ants])
    return FieldValueDTO(field_value=field_value, annotations=ants)
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by applying the field detectors to the document text units.

    Stop words are checked first against ``doc.full_text``. Then every detector of the
    field is matched against every text unit of type ``field.text_unit_type`` until the
    detection limit configured on the field (``detect_limit_count`` /
    ``detect_limit_unit``) is exceeded.

    :param log: logger used for a debug message
    :param doc: document being processed
    :param field: field whose value is detected
    :param field_code_to_value: not used by this implementation
    :return: for a non multi-value field - DTO built from the first annotation found;
             for a multi-value field - DTO combining all annotations found;
             None when no annotation was produced
    :raises CaughtErrorWhileApplyingFieldDetector: wraps any exception raised while a
            detector is applied to a text unit, including the
            ValueExtractionFunctionThrownException raised for a failing value extraction
    """
    text_unit_repo = cls.text_unit_repo
    field_detector_repo = cls.field_detector_repo
    log.debug('detect_field_value: regexps_field_detection, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')

    depends_on_full_text = doc.full_text
    # NOTE(review): other call sites in this file pass `doc` as the second argument
    # of this helper - confirm which signature is current.
    detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)

    field_detectors = field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]

    typed_field = TypedField.by(field)  # type: TypedField
    ants = []  # type: List[AnnotationDTO]
    units_counted = 0

    for text_unit in qs_text_units:  # type: TextUnit
        if field.detect_limit_count:
            units_counted = FieldDetectionStrategy.update_units_counted(
                field, units_counted, text_unit)
            if units_counted > field.detect_limit_count:
                break

        for field_detector in detectors:
            try:
                matching_piece = field_detector.matching_string(
                    text_unit.textunittext.text,
                    text_is_sentence=text_unit.is_sentence())
                if matching_piece is None:
                    continue

                # The character limit applies only when a limit count is actually set,
                # same as the other limit checks in this method. Without this guard a
                # zero count rejected matches and a None count raised TypeError.
                if field.detect_limit_count \
                        and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                    if field.detect_limit_count < units_counted + matching_piece[1]:
                        continue

                matching_string = matching_piece[0]
                value = field_detector.get_validated_detected_value(field)
                hint_name = None
                if typed_field.requires_value:
                    hint_name = field_detector.extraction_hint \
                        or ValueExtractionHint.TAKE_FIRST.name
                    try:
                        value, hint_name = typed_field.get_or_extract_value(
                            doc, value, hint_name, matching_string)
                    except Exception as e:
                        raise ValueExtractionFunctionThrownException(
                            f'Value extraction function has thrown an exception.\n'
                            f'Document: {doc.name} (#{doc.pk})\n'
                            f'Value: {value}\n'
                            f'Extraction hint: {hint_name}\n'
                            f'Matching string:\n'
                            f'{matching_string}') from e
                    if value is None:
                        continue

                annotation_value = typed_field.annotation_value_python_to_json(value)
                ant = AnnotationDTO(annotation_value=annotation_value,
                                    location_in_doc_start=text_unit.location_start,
                                    location_in_doc_end=text_unit.location_end,
                                    extraction_hint_name=hint_name)

                if not isinstance(typed_field, MultiValueField):
                    # The first annotation wins for single-value fields.
                    return FieldValueDTO(field_value=ant.annotation_value,
                                         annotations=[ant])
                ants.append(ant)
            except Exception as e:
                raise CaughtErrorWhileApplyingFieldDetector(
                    f'Exception caught while trying to apply field detector.\n'
                    f'Document: {doc.name} (#{doc.pk})\n'
                    f'Field detector: #{field_detector.detector.pk}\n'
                    f'{field_detector.detector.include_regexps}\n'
                    f'Text unit: #{text_unit.pk}\n'
                    f'{text_unit.text[:300]}') from e

        if field.detect_limit_count \
                and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            units_counted += len(text_unit.text)

    if not ants:
        return None

    return FieldValueDTO(
        field_value=typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants]),
        annotations=ants)