def detect_field_value(
        cls,
        log: ProcessLogger,
        doc: Document,
        field: DocumentField,
        field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by delegating to the python-coded field class
    registered under ``field.python_coded_field``.

    :param log: logger used for debug output; also passed to the python-coded field.
    :param doc: document the value is detected for.
    :param field: field being detected; ``field.python_coded_field`` is the registry key.
    :param field_code_to_value: current field values, forwarded to the python-coded
        field as ``cur_field_code_to_value``.
    :return: the DTO produced by the python-coded field, or None if it produced nothing.
    :raises RuntimeError: if no python-coded field is registered under the key, or
        its type differs from the type of ``field``.
    :raises ValueError: if the produced value does not pass
        ``typed_field.is_json_field_value_ok``.
    """
    python_coded_field: PythonCodedField = PYTHON_CODED_FIELDS_REGISTRY.get(
        field.python_coded_field)
    if not python_coded_field:
        raise RuntimeError('Unknown python-coded field: {0}'.format(
            field.python_coded_field))

    typed_field: TypedField = TypedField.by(field)
    if python_coded_field.type != typed_field.type_code:
        raise RuntimeError(
            f'Python-coded field {python_coded_field.__class__.__name__} is '
            f'for fields of type {python_coded_field.type} and field {field.code} '
            f'is of type {typed_field.type_code}')

    log.debug('detect_field_value: python_coded_field_detection, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')
    field_value_dto = python_coded_field.get_value(
        log=log,
        field=field,
        doc=doc,
        cur_field_code_to_value=field_code_to_value)

    # This method is declared as returning Optional[FieldValueDTO]; a python-coded
    # field that detected nothing must not crash on the attribute access below.
    if field_value_dto is None:
        return None

    if not typed_field.is_json_field_value_ok(field_value_dto.field_value):
        raise ValueError(
            f'Python coded field class {field.python_coded_field} returned value not suitable for '
            f'field {field.code} ({typed_field.type_code})')
    return field_value_dto
def detect_field_value(
        cls,
        log: ProcessLogger,
        doc: Document,
        field: DocumentField,
        field_code_to_value: Dict[str, Any]) -> FieldValueDTO:
    """
    Calculate the field value from ``field.formula`` using the values of the
    fields this field depends on.

    If the field has stop words, they are checked first against the string
    representations of the depends-on values joined with newlines; when the
    stop words match, the value they yield is returned and the formula is not
    evaluated.

    :param log: logger used for debug output.
    :param doc: document the value is detected for.
    :param field: field being detected; must have a non-empty ``formula``.
    :param field_code_to_value: values of already detected fields by field code;
        only the codes from ``field.get_depends_on_codes()`` are used.
    :return: DTO with the JSON representation of the detected value.
    :raises ValueError: if the field has no formula, or the formula result does not
        pass ``typed_field.is_python_field_value_ok``.
    """
    formula = field.formula
    if not formula:
        raise ValueError(
            f'No formula specified for field {field.code} (#{field.uid})')

    depends_on_field_codes = field.get_depends_on_codes() or set()
    field_code_to_value = {
        code: value
        for code, value in field_code_to_value.items()
        if code in depends_on_field_codes
    }

    if field.stop_words:
        depends_on_full_text = '\n'.join(
            str(value) for value in field_code_to_value.values())
        log.debug(
            'detect_field_value: formula_based_field_detection, checking stop words, ' +
            f'field {field.code}({field.pk}), document #{doc.pk}')
        detected_with_stop_words, detected_values \
            = detect_with_stop_words_by_field_and_full_text(
                field, doc, depends_on_full_text)
        if detected_with_stop_words:
            # Wrap the stop-words value in a DTO: this method is declared to return
            # FieldValueDTO, and the other detect_field_value variants in this file
            # return the stop-words value the same way, not as a bare list.
            return FieldValueDTO(field_value=detected_values)
    else:
        log.debug('detect_field_value: formula_based_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')

    v = cls.calc_formula(
        field_code=field.code,
        formula=formula,
        depends_on_field_to_value=field_code_to_value,
        convert_decimals_to_floats=field.convert_decimals_to_floats_in_formula_args)
    typed_field = TypedField.by(field)

    # We don't accept formulas returning values of wrong type to avoid further confusion and
    # creating wrong formulas in future.
    # For example for multi-choice fields the formula should return a list and not a string
    # to ensure the admin understands that this value will replace the whole set/list of strings and not
    # just add one more string to the value.
    if typed_field.is_choice_field and typed_field.multi_value:
        if v and isinstance(v, str):
            # "outdated" formula is incorrect and returns string instead of
            # set / list, but we don't warn user: when he updates this formula
            # (or other detection method) he'll be forced to write code, returning
            # list or set.
            v = [v]

    if not typed_field.is_python_field_value_ok(v):
        raise ValueError(
            f'Formula of field {field.code} returned value not suitable for this field:\n{v}'
        )
    v = typed_field.field_value_python_to_json(v)
    return FieldValueDTO(field_value=v)
def detect_field_value(
        cls,
        log: ProcessLogger,
        doc: Document,
        field: DocumentField,
        field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by running the field's trained classifier model
    over the document's text units.

    Stop words are checked against ``doc.full_text`` first and win if they match.
    Otherwise every text unit of type ``field.text_unit_type`` (reduced by the
    field's detection limit) is passed to ``cls.predict_and_extract_value``.
    For a field that is not a ``MultiValueField`` the first annotation found
    is returned at once; otherwise all annotations are merged into one value.

    :param log: logger used for debug/info output.
    :param doc: document the value is detected for.
    :param field: field being detected.
    :param field_code_to_value: not used by this method.
    :return: DTO with the detected value, or None if nothing was detected.
    :raises ClassifierModel.DoesNotExist: re-raised after logging when it is
        raised while loading the model or processing text units.
    """
    log.debug(
        'detect_field_value: regexps_and_text_based_ml_field_value, ' +
        f'field {field.code}({field.pk}), document #{doc.pk}')

    found: List[AnnotationDTO] = []
    repo = cls.text_unit_repo
    full_text: str = doc.full_text
    typed_field: TypedField = TypedField.by(field)

    # NOTE(review): called with (field, text) here while other variants in this
    # file also pass `doc` - verify against the helper's signature.
    stop_words_hit, stop_words_value = \
        detect_with_stop_words_by_field_and_full_text(field, full_text)
    if stop_words_hit:
        return FieldValueDTO(field_value=stop_words_value)

    text_units = repo.get_doc_text_units(doc, field.text_unit_type)
    text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
        text_units, field)

    try:
        model_record = ClassifierModel.objects.get(document_field=field)
        sklearn_model = model_record.get_trained_model_obj()
        single_valued = not isinstance(typed_field, MultiValueField)

        for unit in text_units.iterator():  # type: TextUnit
            ant = cls.predict_and_extract_value(
                sklearn_model=sklearn_model,
                typed_field=typed_field,
                document=doc,
                field=field,
                text=unit.text,
                location_start=unit.location_start,
                location_end=unit.location_end)
            if ant is not None:
                found.append(ant)
                if single_valued:
                    # Single-value field: the first hit is the answer.
                    return FieldValueDTO(field_value=ant.annotation_value,
                                         annotations=found)

        if found:
            merged = typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in found])
            return FieldValueDTO(field_value=merged, annotations=found)
        return None
    except ClassifierModel.DoesNotExist as e:
        log.info(
            f'Classifier model does not exist for field: {field.code}')
        raise e
def detect_field_value(
        cls,
        log: ProcessLogger,
        doc: Document,
        field: DocumentField,
        field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by applying the cached detectors of the field
    to the document text.

    For a field whose type is not the multi-choice type the first match is
    returned immediately; for a multi-choice field the values of all matching
    detectors are collected into a list.

    :param log: logger used for debug output and for reporting detector errors.
    :param doc: document the value is detected for.
    :param field: field being detected.
    :param field_code_to_value: not used by this method.
    :return: DTO with the detected value and annotations, or None when there are
        no detectors or nothing matched.
    """
    log.debug('detect_field_value: csv_regexps_field_detection, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')

    def report_error(msg, er):
        # Error callback handed to the detector cache.
        return log.error(msg, field_code=field.code, exc_info=er)

    detectors = cls.detecting_cache.get_detectors(field.pk, report_error)
    if not detectors:
        return None

    multichoice = field.type == MultiChoiceField.type_code
    text = cls.get_document_text(doc)
    collected = []

    for detector in detectors:
        match = detector.find_value(text)
        if not match:
            continue

        # TODO: implement reading values from full text (TextParts.FULL.value)
        # as it is done now, or from text units - paragraphs or sentences
        # based on field.text_unit_type - for other detector.text_part options
        """
        if detector.text_part == TextParts.BEFORE_REGEXP.value:
            return matching_string[:begin], 0, begin
        elif detector.text_part == TextParts.AFTER_REGEXP.value:
            return matching_string[end:], end, len(text)
        elif detector.text_part == TextParts.INSIDE_REGEXP.value:
            return matching_string[begin:end], begin, end
        else:
            return text, 0, len(text)
        """

        # starting position has to be shifted backward by 1 symbol for FE
        start_for_frontend = max(match[1] - 1, 0)
        ant = AnnotationDTO(annotation_value=match[0],
                            location_in_doc_start=start_for_frontend,
                            location_in_doc_end=match[2],
                            extraction_hint_name='')
        if not multichoice:
            return FieldValueDTO(field_value=match[0], annotations=[ant])
        collected.append(ant)

    if not collected:
        return None
    values = [a.annotation_value for a in collected]
    return FieldValueDTO(field_value=values, annotations=collected)
def detect_field_value(
        cls,
        log: ProcessLogger,
        doc: Document,
        field: DocumentField,
        field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by applying the field's detectors to the document's
    text units.

    Stop words are checked against ``doc.full_text`` first and win if they match.
    Otherwise the text units of type ``field.text_unit_type`` (reduced by the
    field's detection limit) are scanned with ``cls.extract_from_textunit``.
    For a field that is not a ``MultiValueField`` the first text unit producing
    annotations decides the value; for a multi-value field the annotations of
    all text units are merged into one value.

    :param log: logger used for debug output.
    :param doc: document the value is detected for.
    :param field: field being detected.
    :param field_code_to_value: not used by this method.
    :return: DTO with the detected value and annotations, or None if nothing matched.
    """
    try:
        log.debug('detect_field_value: regexps_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
    except AttributeError:
        # Logging is best-effort here: an AttributeError raised while building or
        # emitting the message must not abort detection.
        # NOTE(review): presumably covers callers passing a stub/None logger - confirm.
        pass

    ants: List[AnnotationDTO] = []
    depends_on_full_text: str = doc.full_text
    typed_field: TypedField = TypedField.by(field)
    text_unit_repo: TextUnitRepository = cls.text_unit_repo
    field_detector_repo: FieldDetectorRepository = cls.field_detector_repo

    detected_with_stop_words, detected_value = \
        detect_with_stop_words_by_field_and_full_text(
            field, doc, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    qs_text_units = text_unit_repo.get_doc_text_units(
        doc, field.text_unit_type)
    qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
        qs_text_units, field)

    field_detectors = field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]

    # Loop-invariant, so it is evaluated once instead of per text unit.
    is_multi_value = isinstance(typed_field, MultiValueField)

    for text_unit in qs_text_units:
        unit_ants = cls.extract_from_textunit(text_unit, field, detectors)
        if not unit_ants:
            continue
        if not is_multi_value:
            # Single-value field: the first text unit with a match decides the value.
            return FieldValueDTO(field_value=unit_ants[0].annotation_value,
                                 annotations=unit_ants)
        ants.extend(unit_ants)

    if not ants:
        return None

    # Annotations are only accumulated for multi-value fields (single-value fields
    # return from inside the loop), so the former single-value branch that followed
    # the loop could never run and has been removed.
    field_value = typed_field.build_json_field_value_from_json_ant_values(
        [a.annotation_value for a in ants])
    return FieldValueDTO(field_value=field_value, annotations=ants)
def detect_field_value(
        cls,
        log: ProcessLogger,
        doc: Document,
        field: DocumentField,
        field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by matching the field's detectors against the
    document's text units, honouring the field's detection limit.

    Stop words are checked against ``doc.full_text`` first and win if they match.
    Then each text unit of type ``field.text_unit_type`` is tried against every
    detector. For a field that is not a ``MultiValueField`` the first annotation
    is returned immediately; otherwise all annotations are merged into one value.

    :param log: logger used for debug output.
    :param doc: document the value is detected for.
    :param field: field being detected; ``detect_limit_count`` / ``detect_limit_unit``
        bound how much of the document is scanned (a falsy count means no limit).
    :param field_code_to_value: not used by this method.
    :return: DTO with the detected value and annotations, or None if nothing matched.
    :raises CaughtErrorWhileApplyingFieldDetector: wraps any exception raised while
        applying a detector to a text unit (including
        ``ValueExtractionFunctionThrownException`` raised when value extraction fails).
    """
    text_unit_repo = cls.text_unit_repo
    field_detector_repo = cls.field_detector_repo
    log.debug('detect_field_value: regexps_field_detection, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')

    depends_on_full_text = doc.full_text
    # NOTE(review): called with (field, text) here while other variants in this
    # file also pass `doc` - verify against the helper's signature.
    detected_with_stop_words, detected_value = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    qs_text_units = text_unit_repo.get_doc_text_units(
        doc, field.text_unit_type)
    field_detectors = field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]

    typed_field: TypedField = TypedField.by(field)
    ants: List[AnnotationDTO] = []
    units_counted = 0

    for text_unit in qs_text_units:  # type: TextUnit
        if field.detect_limit_count:
            units_counted = FieldDetectionStrategy.update_units_counted(
                field, units_counted, text_unit)
            if units_counted > field.detect_limit_count:
                break

        for field_detector in detectors:
            try:
                matching_piece = field_detector.matching_string(
                    text_unit.textunittext.text,
                    text_is_sentence=text_unit.is_sentence())
                if matching_piece is None:
                    continue

                # Skip a match lying beyond the character limit. The limit only
                # applies when a count is configured: without this guard a count
                # of 0 rejected every match and a count of None raised TypeError,
                # unlike the other limit checks in this method.
                if field.detect_limit_count \
                        and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR \
                        and field.detect_limit_count < units_counted + matching_piece[1]:
                    continue

                matching_string = matching_piece[0]
                value = field_detector.get_validated_detected_value(field)
                hint_name = None
                if typed_field.requires_value:
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    try:
                        value, hint_name = typed_field \
                            .get_or_extract_value(doc, value, hint_name, matching_string)
                    except Exception as e:
                        raise ValueExtractionFunctionThrownException(
                            f'Value extraction function has thrown an exception.\n'
                            f'Document: {doc.name} (#{doc.pk})\n'
                            f'Value: {value}\n'
                            f'Extraction hint: {hint_name}\n'
                            f'Matching string:\n'
                            f'{matching_string}') from e
                    if value is None:
                        continue

                annotation_value = typed_field.annotation_value_python_to_json(
                    value)
                ant = AnnotationDTO(
                    annotation_value=annotation_value,
                    location_in_doc_start=text_unit.location_start,
                    location_in_doc_end=text_unit.location_end,
                    extraction_hint_name=hint_name)

                if not isinstance(typed_field, MultiValueField):
                    return FieldValueDTO(
                        field_value=ant.annotation_value,
                        annotations=[ant])
                ants.append(ant)
            except Exception as e:
                raise CaughtErrorWhileApplyingFieldDetector(
                    f'Exception caught while trying to apply field detector.\n'
                    f'Document: {doc.name} (#{doc.pk})\n'
                    f'Field detector: #{field_detector.detector.pk}\n'
                    f'{field_detector.detector.include_regexps}\n'
                    f'Text unit: #{text_unit.pk}\n'
                    f'{text_unit.text[:300]}') from e

        if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            units_counted += len(text_unit.text)

    if not ants:
        return None
    return FieldValueDTO(
        field_value=typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants]),
        annotations=ants)
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by running the field's trained classifier model
    over the document's text units, honouring the field's detection limit.

    Stop words are checked against ``doc.full_text`` first and win if they match.
    Otherwise the text units of type ``field.text_unit_type`` are read ordered by
    ``location_start`` and ``pk`` and passed to ``cls.predict_and_extract_value``.
    For a field that is not a ``MultiValueField`` the first annotation found
    is returned at once; otherwise all annotations are merged into one value.

    :param log: logger used for debug/info output.
    :param doc: document the value is detected for.
    :param field: field being detected.
    :param field_code_to_value: not used by this method.
    :return: DTO with the detected value, or None if nothing was detected.
    :raises ClassifierModel.DoesNotExist: re-raised after logging when it is
        raised while loading the model or processing text units.
    """
    full_text = doc.full_text
    # NOTE(review): called with (field, text) here while other variants in this
    # file also pass `doc` - verify against the helper's signature.
    stop_words_hit, stop_words_value = \
        detect_with_stop_words_by_field_and_full_text(field, full_text)
    if stop_words_hit:
        return FieldValueDTO(field_value=stop_words_value)

    try:
        log.debug('detect_field_value: regexps_and_text_based_ml_field_value, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')

        model_record = ClassifierModel.objects.get(document_field=field)
        sklearn_model = model_record.get_trained_model_obj()
        typed_field: TypedField = TypedField.by(field)
        found: List[AnnotationDTO] = []

        text_units = (TextUnit.objects
                      .filter(document=doc)
                      .filter(unit_type=field.text_unit_type)
                      .order_by('location_start', 'pk'))

        counted = 0
        for unit in text_units.iterator():
            if field.detect_limit_count:
                counted = FieldDetectionStrategy.update_units_counted(
                    field, counted, unit)
                if counted > field.detect_limit_count:
                    break

            ant = cls.predict_and_extract_value(sklearn_model=sklearn_model,
                                                typed_field=typed_field,
                                                document=doc,
                                                field=field,
                                                text_unit=unit)
            if ant is None:
                # Note: the character counter below is not advanced for this unit.
                continue

            if field.detect_limit_count \
                    and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR \
                    and ant.location_in_doc_start > field.detect_limit_count:
                break

            found.append(ant)
            if not isinstance(typed_field, MultiValueField):
                # Single-value field: the first hit is the answer.
                return FieldValueDTO(field_value=ant.annotation_value,
                                     annotations=found)

            if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
                counted += len(unit.text)

        if found:
            merged = typed_field.build_json_field_value_from_json_ant_values(
                [a.annotation_value for a in found])
            return FieldValueDTO(field_value=merged, annotations=found)
        return None
    except ClassifierModel.DoesNotExist as e:
        log.info(f'Classifier model does not exist for field: {field.code}')
        raise e
def detect_field_value(
        cls,
        log: ProcessLogger,
        doc: Document,
        field: DocumentField,
        field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value with the MLflow model at ``field.mlflow_model_uri``.

    The model input is a one-row DataFrame built from ``field_code_to_value``
    plus a ``text`` column. When ``field.mlflow_detect_on_document_level`` is
    set the model is run once on ``doc.text``; otherwise it is run on each text
    unit of type ``field.text_unit_type`` (reduced by the field's detection limit).

    :param log: logger used for debug output.
    :param doc: document the value is detected for.
    :param field: field being detected.
    :param field_code_to_value: current field values, passed to the model as columns.
    :return: DTO with the detected value, or None if nothing was detected.
    :raises ValueError: on text-unit level, if the extracted value does not pass
        ``typed_field.is_python_annotation_value_ok``.
    """
    found: List[AnnotationDTO] = []
    typed_field: TypedField = TypedField.by(field)
    repo = cls.text_unit_repo

    def first_prediction(text):
        # Build the single-row model input and return the first element of the output.
        row = dict(field_code_to_value)
        row['text'] = text
        frame = pd.DataFrame([row])
        output = MLFlowModelManager().predict(field.mlflow_model_uri, frame)
        return output[0]

    if field.mlflow_detect_on_document_level:
        log.debug(
            'detect_field_value: mlflow_field_detection on doc level, ' +
            f'field {field.code}({field.pk}), document #{doc.pk}')
        doc_text = doc.text
        value = first_prediction(doc_text)
        if not value:
            return None
        value, _hint = typed_field.get_or_extract_value(
            doc, value, ValueExtractionHint.TAKE_FIRST.name, doc_text)
        return FieldValueDTO(field_value=value) if value else None

    text_units = repo.get_doc_text_units(doc, field.text_unit_type)
    text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
        text_units, field)

    log.debug(
        'detect_field_value: mlflow_field_detection on text unit level, ' +
        f'field {field.code}({field.pk}), document #{doc.pk}')

    for unit in text_units.iterator():
        value = first_prediction(unit.text)
        if value is None:
            continue

        if typed_field.requires_value:
            # For the field types expecting a value the mlflow model must return either a value or None.
            value, hint_name = typed_field.get_or_extract_value(
                doc, value, ValueExtractionHint.TAKE_FIRST.name, unit.text)
            if not typed_field.is_python_annotation_value_ok(value):
                raise ValueError(
                    f'ML model of field {field.code} ({typed_field.type_code}) returned '
                    f'annotation value not suitable for this field:\n'
                    f'{value}')
            ant = AnnotationDTO(
                annotation_value=typed_field.annotation_value_python_to_json(value),
                location_in_doc_start=unit.location_start,
                location_in_doc_end=unit.location_end,
                extraction_hint_name=hint_name)
        elif value:
            # For the related-info fields the mlflow model must return 0 or 1
            # where 1 means the text unit matches the field.
            ant = AnnotationDTO(
                annotation_value=None,
                location_in_doc_start=unit.location_start,
                location_in_doc_end=unit.location_end,
                extraction_hint_name=None)
        else:
            continue

        found.append(ant)
        if not isinstance(typed_field, MultiValueField):
            # Single-value field: the first hit is the answer.
            return FieldValueDTO(field_value=ant.annotation_value,
                                 annotations=found)

    if not found:
        return None
    merged = typed_field.build_json_field_value_from_json_ant_values(
        [a.annotation_value for a in found])
    return FieldValueDTO(field_value=merged, annotations=found)
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value with the MLflow model at ``field.mlflow_model_uri``,
    honouring the field's detection limit on text-unit level.

    The model input is a one-row DataFrame built from ``field_code_to_value``
    plus a ``text`` column. When ``field.mlflow_detect_on_document_level`` is
    set the model is run once on ``doc.text``; otherwise it is run on the text
    units of type ``field.text_unit_type`` ordered by ``location_start`` and ``pk``.

    :param log: logger used for debug output.
    :param doc: document the value is detected for.
    :param field: field being detected.
    :param field_code_to_value: current field values, passed to the model as columns.
    :return: DTO with the detected value, or None if nothing was detected.
    :raises ValueError: on text-unit level, if the extracted value does not pass
        ``typed_field.is_python_annotation_value_ok``.
    """
    typed_field: TypedField = TypedField.by(field)

    def first_prediction(text):
        # Build the single-row model input and return the first element of the output.
        row = dict(field_code_to_value)
        row['text'] = text
        frame = pd.DataFrame([row])
        output = MLFlowModelManager().predict(field.mlflow_model_uri, frame)
        return output[0]

    if field.mlflow_detect_on_document_level:
        log.debug('detect_field_value: mlflow_field_detection on doc level, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        doc_text = doc.text
        value = first_prediction(doc_text)
        if not value:
            return None
        value, _hint = typed_field.get_or_extract_value(
            doc, value, ValueExtractionHint.TAKE_FIRST.name, doc_text)
        return FieldValueDTO(field_value=value) if value else None

    found: List[AnnotationDTO] = []
    text_units = (TextUnit.objects
                  .filter(document=doc)
                  .filter(unit_type=field.text_unit_type)
                  .select_related('textunittext')
                  .order_by('location_start', 'pk')
                  .defer('textunittext__text_tsvector'))
    counted = 0

    log.debug('detect_field_value: mlflow_field_detection on text unit level, ' +
              f'field {field.code}({field.pk}), document #{doc.pk}')

    for unit in text_units.iterator():
        if field.detect_limit_count:
            counted = FieldDetectionStrategy.update_units_counted(
                field, counted, unit)
            if counted > field.detect_limit_count:
                break

        value = first_prediction(unit.text)
        if value is None:
            continue

        if typed_field.requires_value:
            # For the field types expecting a value the mlflow model must return either a value or None.
            value, hint_name = typed_field.get_or_extract_value(
                doc, value, ValueExtractionHint.TAKE_FIRST.name, unit.text)
            if not typed_field.is_python_annotation_value_ok(value):
                raise ValueError(f'ML model of field {field.code} ({typed_field.type_code}) returned '
                                 f'annotation value not suitable for this field:\n'
                                 f'{value}')
            ant = AnnotationDTO(
                annotation_value=typed_field.annotation_value_python_to_json(value),
                location_in_doc_start=unit.location_start,
                location_in_doc_end=unit.location_end,
                extraction_hint_name=hint_name)
        elif value:
            # For the related-info fields the mlflow model must return 0 or 1
            # where 1 means the text unit matches the field.
            ant = AnnotationDTO(annotation_value=None,
                                location_in_doc_start=unit.location_start,
                                location_in_doc_end=unit.location_end,
                                extraction_hint_name=None)
        else:
            # Note: the character counter below is not advanced for this unit.
            continue

        if field.detect_limit_count \
                and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR \
                and ant.location_in_doc_start > field.detect_limit_count:
            break

        found.append(ant)
        if not isinstance(typed_field, MultiValueField):
            # Single-value field: the first hit is the answer.
            return FieldValueDTO(field_value=ant.annotation_value,
                                 annotations=found)

        if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            counted += len(unit.text)

    if not found:
        return None
    merged = typed_field.build_json_field_value_from_json_ant_values(
        [a.annotation_value for a in found])
    return FieldValueDTO(field_value=merged, annotations=found)