def predict_and_extract_value(cls,
                              sklearn_model: SkLearnClassifierModel,
                              typed_field: TypedField,
                              document: Document,
                              field: DocumentField,
                              text_unit: TextUnit) -> Optional[AnnotationDTO]:
    """
    Run the classifier on one text unit and turn a hit into an annotation.

    Args:
        sklearn_model (SkLearnClassifierModel): model passed to ``cls.predict_value``.
        typed_field (TypedField): typed wrapper used to extract, validate and
            JSON-convert the value.
        document (Document): document forwarded to the value extraction call.
        field (DocumentField): field whose ``code`` the prediction must equal.
        text_unit (TextUnit): unit being classified; its location bounds the
            resulting annotation.

    Returns:
        Optional[AnnotationDTO]: ``None`` when the predicted field code is not
        ``field.code``; an annotation without a value for field types that do
        not require one; otherwise an annotation carrying the JSON-converted value.

    Raises:
        ValueError: the extracted value is rejected by
            ``typed_field.is_python_annotation_value_ok``.
    """
    predicted_code, value, hint_name = cls.predict_value(sklearn_model, text_unit)

    # The model voted for some other field (or none): nothing to annotate.
    if predicted_code != field.code:
        return None

    # Field types without a value only mark the location of the text unit.
    if not typed_field.requires_value:
        return AnnotationDTO(annotation_value=None,
                             location_in_doc_start=text_unit.location_start,
                             location_in_doc_end=text_unit.location_end,
                             extraction_hint_name=None)

    if not hint_name:
        hint_name = ValueExtractionHint.TAKE_FIRST.name
    value, hint_name = typed_field.get_or_extract_value(
        document, value, hint_name, text_unit.text)

    if not typed_field.is_python_annotation_value_ok(value):
        raise ValueError(f'ML model of field {field.code} ({typed_field.type_code}) returned '
                         f'annotation value not suitable for this field:\n'
                         f'{value}')

    return AnnotationDTO(annotation_value=typed_field.annotation_value_python_to_json(value),
                         location_in_doc_start=text_unit.location_start,
                         location_in_doc_end=text_unit.location_end,
                         extraction_hint_name=hint_name)
def pack_parsed_value(cls,
                      typed_field: TypedField,
                      value: Any,
                      loc_start: int,
                      loc_end: int):
    """
    Wrap a single parsed value into a FieldValueDTO with one annotation.

    Args:
        typed_field (TypedField): converts the python value to its JSON form.
        value (Any): python value to pack.
        loc_start (int): annotation start location in the document.
        loc_end (int): annotation end location in the document.

    Returns:
        FieldValueDTO: the JSON-converted value, used both as the field value
        and as the value of its only annotation.
    """
    json_value = typed_field.field_value_python_to_json(value)
    annotation = AnnotationDTO(annotation_value=json_value,
                               location_in_doc_start=loc_start,
                               location_in_doc_end=loc_end)
    return FieldValueDTO(field_value=json_value, annotations=[annotation])
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by running the cached CSV-regexp detectors on the document text.

    Args:
        log (ProcessLogger): receives the debug message and any errors reported
            while detectors are fetched from the cache.
        doc (Document): document whose text is searched.
        field (DocumentField): field being detected.
        field_code_to_value (Dict[str, Any]): not used by this strategy.

    Returns:
        Optional[FieldValueDTO]: ``None`` when there are no detectors or nothing
        matched. For a non-multichoice field, the first match. For a
        multichoice field, the list of all matched values with their annotations.
    """
    log.debug('detect_field_value: csv_regexps_field_detection, '
              f'field {field.code}({field.pk}), document #{doc.pk}')

    detectors = cls.detecting_cache.get_detectors(
        field.pk,
        lambda msg, er: log.error(msg, field_code=field.code, exc_info=er))
    if not detectors:
        return None

    is_multichoice = field.type == MultiChoiceField.type_code
    doc_text = cls.get_document_text(doc)
    annotations = []

    for detector in detectors:
        found_item = detector.find_value(doc_text)
        if not found_item:
            continue

        # TODO: implement reading values from full text (TextParts.FULL.value)
        # as it is done now, or from text units - paragraphs or sentences
        # based on field.text_unit_type - for other detector.text_part options.
        # Sketch of the intended text_part handling (not implemented):
        #   if detector.text_part == TextParts.BEFORE_REGEXP.value:
        #       return matching_string[:begin], 0, begin
        #   elif detector.text_part == TextParts.AFTER_REGEXP.value:
        #       return matching_string[end:], end, len(text)
        #   elif detector.text_part == TextParts.INSIDE_REGEXP.value:
        #       return matching_string[begin:end], begin, end
        #   else:
        #       return text, 0, len(text)

        # found_item is indexed as (value, start, end).
        found_value = found_item[0]
        # starting position has to be shifted backward by 1 symbol for FE,
        # but never below the beginning of the document
        ant = AnnotationDTO(annotation_value=found_value,
                            location_in_doc_start=max(found_item[1] - 1, 0),
                            location_in_doc_end=found_item[2],
                            extraction_hint_name='')

        # Single-value fields stop at the first matching detector.
        if not is_multichoice:
            return FieldValueDTO(field_value=found_value, annotations=[ant])
        annotations.append(ant)

    if not annotations:
        return None
    return FieldValueDTO(field_value=[a.annotation_value for a in annotations],
                         annotations=annotations)
def get_value(self,
              log: ProcessLogger,
              field: DocumentField,
              doc: Document,
              cur_field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Return the value found in the first text unit accepted by ``find_value_in_text_unit``.

    Text units of the document with the field's ``text_unit_type`` are visited
    ordered by ``location_start`` and then ``pk``. The first unit reported as
    found yields the result.

    Args:
        log (ProcessLogger): forwarded to ``find_value_in_text_unit``.
        field (DocumentField): field being detected.
        doc (Document): document whose text units are scanned.
        cur_field_code_to_value (Dict[str, Any]): not used by this method.

    Returns:
        Optional[FieldValueDTO]: the JSON-converted value with a single
        annotation spanning the matching text unit, or ``None`` when no text
        unit produced a value.

    Raises:
        Exception: the field type is multi-value.
    """
    typed_field = TypedField.by(field)  # type: TypedField
    if typed_field.multi_value:
        raise Exception(f'Python coded field {self.__class__.__name__} supports only single-value field types and '
                        f'{typed_field.type_code} is multi-value')

    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .order_by('location_start', 'pk')

    for text_unit in qs_text_units.iterator():  # type: TextUnit
        found, value = self.find_value_in_text_unit(log, field, doc, text_unit)
        if not found:
            continue
        value = typed_field.field_value_python_to_json(value)
        ant = AnnotationDTO(annotation_value=value,
                            location_in_doc_start=text_unit.location_start,
                            location_in_doc_end=text_unit.location_end)
        # Annotations are passed as a list: a set literal would require
        # AnnotationDTO to be hashable and could not be indexed by consumers.
        return FieldValueDTO(field_value=value, annotations=[ant])

    return None
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value by applying the field's regexp detectors to the document's text units.

    Stop-word detection on the full document text runs first. If it reports a
    detection, its value is returned without annotations. Otherwise every text
    unit is matched against every detector, subject to the field's optional
    detection limit.

    Args:
        log (ProcessLogger): receives the debug message.
        doc (Document): document being processed.
        field (DocumentField): field being detected.
        field_code_to_value (Dict[str, Any]): not used by this strategy.

    Returns:
        Optional[FieldValueDTO]: ``None`` when nothing matched. For a field type
        that is not a ``MultiValueField``, the first annotation found. Otherwise
        the combined value built from all collected annotations.

    Raises:
        CaughtErrorWhileApplyingFieldDetector: any exception raised while a
            detector is applied to a text unit (including a wrapped
            ``ValueExtractionFunctionThrownException``).
    """
    text_unit_repo = cls.text_unit_repo
    field_detector_repo = cls.field_detector_repo
    log.debug('detect_field_value: regexps_field_detection, '
              f'field {field.code}({field.pk}), document #{doc.pk}')

    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_value = \
        detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
    if detected_with_stop_words:
        return FieldValueDTO(field_value=detected_value)

    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    field_detectors = field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]

    typed_field = TypedField.by(field)  # type: TypedField
    ants = []  # type: List[AnnotationDTO]
    units_counted = 0

    for text_unit in qs_text_units:  # type: TextUnit
        if field.detect_limit_count:
            units_counted = FieldDetectionStrategy.update_units_counted(
                field, units_counted, text_unit)
            if units_counted > field.detect_limit_count:
                break

        for field_detector in detectors:
            try:
                matching_piece = field_detector.matching_string(
                    text_unit.textunittext.text,
                    text_is_sentence=text_unit.is_sentence())
                if matching_piece is None:
                    continue

                # Skip matches lying beyond the character limit. The limit
                # applies only when a non-zero count is configured, the same
                # condition the other limit checks in this method use; an
                # unset count must not be compared against an int.
                if (field.detect_limit_count
                        and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR
                        and field.detect_limit_count < units_counted + matching_piece[1]):
                    continue

                matching_string = matching_piece[0]
                value = field_detector.get_validated_detected_value(field)
                hint_name = None

                if typed_field.requires_value:
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    try:
                        value, hint_name = typed_field.get_or_extract_value(
                            doc, value, hint_name, matching_string)
                    except Exception as e:
                        raise ValueExtractionFunctionThrownException(
                            f'Value extraction function has thrown an exception.\n'
                            f'Document: {doc.name} (#{doc.pk})\n'
                            f'Value: {value}\n'
                            f'Extraction hint: {hint_name}\n'
                            f'Matching string:\n'
                            f'{matching_string}') from e
                    # The detector matched but no value could be extracted.
                    if value is None:
                        continue

                annotation_value = typed_field.annotation_value_python_to_json(value)
                ant = AnnotationDTO(
                    annotation_value=annotation_value,
                    location_in_doc_start=text_unit.location_start,
                    location_in_doc_end=text_unit.location_end,
                    extraction_hint_name=hint_name)

                # Single-value fields are done after the first annotation.
                if not isinstance(typed_field, MultiValueField):
                    return FieldValueDTO(field_value=ant.annotation_value,
                                         annotations=[ant])
                ants.append(ant)
            except Exception as e:
                raise CaughtErrorWhileApplyingFieldDetector(
                    f'Exception caught while trying to apply field detector.\n'
                    f'Document: {doc.name} (#{doc.pk})\n'
                    f'Field detector: #{field_detector.detector.pk}\n'
                    f'{field_detector.detector.include_regexps}\n'
                    f'Text unit: #{text_unit.pk}\n'
                    f'{text_unit.text[:300]}') from e

        if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            units_counted += len(text_unit.text)

    if not ants:
        return None

    return FieldValueDTO(
        field_value=typed_field.build_json_field_value_from_json_ant_values(
            [a.annotation_value for a in ants]),
        annotations=ants)
def extract_from_textunit(cls,
                          text_unit: TextUnit,
                          field: DocumentField,
                          detectors: List[DetectorFieldMatcher]) -> List[AnnotationDTO]:
    """
    Apply every detector to the text of one TextUnit and collect the annotations.

    For field types that require a value, the value is extracted from the
    matching string via the typed field; a match yielding no value is skipped.

    Args:
        text_unit (TextUnit): unit whose ``textunittext.text`` is searched and
            whose location bounds each annotation.
        field (DocumentField): field being detected.
        detectors (List[DetectorFieldMatcher]): detectors to try, in order.

    Returns:
        List[AnnotationDTO]: a one-element list holding the first annotation
        when the field type is not a ``MultiValueField``; otherwise every
        annotation found (possibly empty).

    Raises:
        ValueExtractionFunctionThrownException: value extraction failed.
        CaughtErrorWhileApplyingFieldDetector: any other failure while a
            detector was applied.
    """
    collected = []
    src_text = text_unit.textunittext.text
    typed_field: TypedField = TypedField.by(field)

    for field_detector in detectors:
        try:
            matching_and_location = field_detector.matching_string(
                src_text, text_unit.is_sentence())
            if matching_and_location is None:
                continue

            hint_name = None
            matching_string = matching_and_location[0]
            value = field_detector.get_validated_detected_value(field)

            if typed_field.requires_value:
                hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                try:
                    value, hint_name = typed_field.get_or_extract_value(
                        document=text_unit.document,
                        possible_value=value,
                        possible_hint=hint_name,
                        text=matching_string)
                except Exception as e:
                    raise ValueExtractionFunctionThrownException(
                        f'Document: {text_unit.document.name} (#{text_unit.document.pk})\n'
                        f'TextUnit: {text_unit.unit_type} (#{text_unit.pk})\n'
                        f'Value: {value}\n'
                        f'Extraction hint: {hint_name}\n'
                        f'Matching string:\n'
                        f'{matching_string}', e) from e
                if value is None:
                    continue

            found = AnnotationDTO(
                annotation_value=typed_field.annotation_value_python_to_json(value),
                location_in_doc_start=text_unit.location_start,
                location_in_doc_end=text_unit.location_end,
                extraction_hint_name=hint_name)

            # Single-value field types need only the first annotation.
            if not isinstance(typed_field, MultiValueField):
                return [found]
            collected.append(found)
        except ValueExtractionFunctionThrownException:
            # Already carries the extraction context; let it through untouched.
            raise
        except Exception as e:
            raise CaughtErrorWhileApplyingFieldDetector(
                f'Document: {text_unit.document.name} (#{text_unit.document.pk})\n'
                f'TextUnit: {text_unit.unit_type} (#{text_unit.pk})\n'
                f'Field detector: #{field_detector.detector.pk}\n'
                f'{field_detector.detector.include_regexps}\n'
                f'Text: {src_text[:300]}', e) from e

    return collected
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value with the field's MLFlow model.

    When ``field.mlflow_detect_on_document_level`` is set, the model is run once
    on the whole document text. Otherwise it is run on each text unit of the
    field's ``text_unit_type``, honouring the field's detection limit.

    Args:
        log (ProcessLogger): not used by this method.
        doc (Document): document being processed.
        field (DocumentField): field being detected; supplies the model URI
            and the detection limit settings.
        field_code_to_value (Dict[str, Any]): values copied into the model
            input next to the ``'text'`` entry.

    Returns:
        Optional[FieldValueDTO]: ``None`` when nothing was detected. On document
        level, the extracted value without annotations. On text unit level, the
        first annotation for field types that are not ``MultiValueField``, or
        the combined value of all annotations otherwise.

    Raises:
        ValueError: the model returned a value the typed field rejects.
    """
    typed_field = TypedField.by(field)  # type: TypedField

    if field.mlflow_detect_on_document_level:
        text = doc.text
        features_df = pd.DataFrame([dict(field_code_to_value, text=text)])
        prediction = MLFlowModelManager().predict(field.mlflow_model_uri, features_df)
        value = prediction[0]
        if not value:
            return None

        hint_name = ValueExtractionHint.TAKE_FIRST.name
        value, hint_name = typed_field.get_or_extract_value(doc, value, hint_name, text)
        return FieldValueDTO(field_value=value) if value else None

    ants = []  # type: List[AnnotationDTO]
    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .select_related('textunittext') \
        .order_by('location_start', 'pk')

    units_counted = 0
    for text_unit in qs_text_units.iterator():
        if field.detect_limit_count:
            units_counted = FieldDetectionStrategy.update_units_counted(
                field, units_counted, text_unit)
            if units_counted > field.detect_limit_count:
                break

        features_df = pd.DataFrame([dict(field_code_to_value, text=text_unit.text)])
        prediction = MLFlowModelManager().predict(field.mlflow_model_uri, features_df)
        value = prediction[0]
        if value is None:
            continue

        if typed_field.requires_value:
            # For the field types expecting a value the mlflow model must return either a value or None.
            hint_name = ValueExtractionHint.TAKE_FIRST.name
            value, hint_name = typed_field.get_or_extract_value(
                doc, value, hint_name, text_unit.text)
            if not typed_field.is_python_annotation_value_ok(value):
                raise ValueError(
                    f'ML model of field {field.code} ({typed_field.type_code}) returned '
                    f'annotation value not suitable for this field:\n'
                    f'{value}')
            ant = AnnotationDTO(
                annotation_value=typed_field.annotation_value_python_to_json(value),
                location_in_doc_start=text_unit.location_start,
                location_in_doc_end=text_unit.location_end,
                extraction_hint_name=hint_name)
        elif value:
            # For the related-info fields the mlflow model must return 0 or 1
            # where 1 means the text unit matches the field.
            ant = AnnotationDTO(
                annotation_value=None,
                location_in_doc_start=text_unit.location_start,
                location_in_doc_end=text_unit.location_end,
                extraction_hint_name=None)
        else:
            # Falsy answer for a related-info field: this unit does not match.
            continue

        if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            # Annotations starting past the character limit end the scan.
            if ant.location_in_doc_start > field.detect_limit_count:
                break

        ants.append(ant)
        if not isinstance(typed_field, MultiValueField):
            return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

        if field.detect_limit_count and field.detect_limit_unit == DocumentField.DETECT_LIMIT_CHAR:
            units_counted += len(text_unit.text)

    if not ants:
        return None

    combined_value = typed_field.build_json_field_value_from_json_ant_values(
        [a.annotation_value for a in ants])
    return FieldValueDTO(field_value=combined_value, annotations=ants)
def detect_field_value(cls,
                       log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """
    Detect the field value with the field's MLFlow model, using the text unit repository.

    When ``field.mlflow_detect_on_document_level`` is set, the model is run once
    on the whole document text. Otherwise it is run on each text unit left after
    ``FieldDetectionStrategy.reduce_textunits_by_detection_limit``.

    Args:
        log (ProcessLogger): receives a debug message naming the detection level.
        doc (Document): document being processed.
        field (DocumentField): field being detected; supplies the model URI.
        field_code_to_value (Dict[str, Any]): values copied into the model
            input next to the ``'text'`` entry.

    Returns:
        Optional[FieldValueDTO]: ``None`` when nothing was detected. On document
        level, the extracted value without annotations. On text unit level, the
        first annotation for field types that are not ``MultiValueField``, or
        the combined value of all annotations otherwise.

    Raises:
        ValueError: the model returned a value the typed field rejects.
    """
    ants: List[AnnotationDTO] = []
    typed_field: TypedField = TypedField.by(field)
    text_unit_repo = cls.text_unit_repo

    if field.mlflow_detect_on_document_level:
        log.debug('detect_field_value: mlflow_field_detection on doc level, '
                  f'field {field.code}({field.pk}), document #{doc.pk}')
        text = doc.text
        features_df = pd.DataFrame([dict(field_code_to_value, text=text)])
        prediction = MLFlowModelManager().predict(field.mlflow_model_uri, features_df)
        value = prediction[0]
        if not value:
            return None

        hint_name = ValueExtractionHint.TAKE_FIRST.name
        value, hint_name = typed_field.get_or_extract_value(doc, value, hint_name, text)
        return FieldValueDTO(field_value=value) if value else None

    qs_text_units = text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    qs_text_units = FieldDetectionStrategy.reduce_textunits_by_detection_limit(
        qs_text_units, field)

    log.debug('detect_field_value: mlflow_field_detection on text unit level, '
              f'field {field.code}({field.pk}), document #{doc.pk}')

    for text_unit in qs_text_units.iterator():
        features_df = pd.DataFrame([dict(field_code_to_value, text=text_unit.text)])
        prediction = MLFlowModelManager().predict(field.mlflow_model_uri, features_df)
        value = prediction[0]
        if value is None:
            continue

        if typed_field.requires_value:
            # For the field types expecting a value the mlflow model must return either a value or None.
            hint_name = ValueExtractionHint.TAKE_FIRST.name
            value, hint_name = typed_field.get_or_extract_value(
                doc, value, hint_name, text_unit.text)
            if not typed_field.is_python_annotation_value_ok(value):
                raise ValueError(
                    f'ML model of field {field.code} ({typed_field.type_code}) returned '
                    f'annotation value not suitable for this field:\n'
                    f'{value}')
            ant = AnnotationDTO(
                annotation_value=typed_field.annotation_value_python_to_json(value),
                location_in_doc_start=text_unit.location_start,
                location_in_doc_end=text_unit.location_end,
                extraction_hint_name=hint_name)
        elif value:
            # For the related-info fields the mlflow model must return 0 or 1
            # where 1 means the text unit matches the field.
            ant = AnnotationDTO(
                annotation_value=None,
                location_in_doc_start=text_unit.location_start,
                location_in_doc_end=text_unit.location_end,
                extraction_hint_name=None)
        else:
            # Falsy answer for a related-info field: this unit does not match.
            continue

        ants.append(ant)
        # Single-value field types are done after the first annotation.
        if not isinstance(typed_field, MultiValueField):
            return FieldValueDTO(field_value=ant.annotation_value, annotations=ants)

    if not ants:
        return None

    combined_value = typed_field.build_json_field_value_from_json_ant_values(
        [a.annotation_value for a in ants])
    return FieldValueDTO(field_value=combined_value, annotations=ants)