def test_order_field_detection(self) -> None:
    """Detect a field over a two-sentence mock document and expect exactly one hit.

    Swaps the class-level repositories of RegexpsOnlyFieldDetectionStrategy
    for in-memory mocks, runs detection, and restores the originals even if
    detection raises.
    """
    init_field_type_registry()
    doc = self.setup_document()

    field = DocumentField()
    field.requires_text_annotations = False
    field.stop_words = None
    field.text_unit_type = 'sentences'

    # Two mock text units attached to the mock document.
    text_unit_repo = MockTextUnitRepository()
    text_unit_repo.units = [TextUnit(), TextUnit()]
    text_unit_repo.units[0].text = "But those cushion's velvet lining"
    text_unit_repo.units[1].text = "She shall press! Ah! Nevermore..."
    for tu in text_unit_repo.units:
        tu.document = doc
        tu.unit_type = field.text_unit_type

    detect_repo = MockFieldDetectorRepository()
    detector = self.make_doc_field_detector()
    detect_repo.detectors = [detector]

    # Patch the strategy's repositories; restore in `finally` to avoid
    # leaking mocks into other tests.
    old_tu_repo = RegexpsOnlyFieldDetectionStrategy.text_unit_repo
    RegexpsOnlyFieldDetectionStrategy.text_unit_repo = text_unit_repo
    old_repo_detect = RegexpsOnlyFieldDetectionStrategy.field_detector_repo
    RegexpsOnlyFieldDetectionStrategy.field_detector_repo = detect_repo
    try:
        detected = RegexpsOnlyFieldDetectionStrategy.\
            detect_field_value(None, doc, field)
    finally:
        RegexpsOnlyFieldDetectionStrategy.text_unit_repo = old_tu_repo
        RegexpsOnlyFieldDetectionStrategy.field_detector_repo = old_repo_detect
    self.assertEqual(1, len(detected))
def apply_simple_config(log: ProcessLogger,
                        document_field: DocumentField,
                        csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool):
    """Create naive field detectors for ``document_field`` from a CSV config.

    Each CSV row becomes one DocumentFieldDetector whose detected value is the
    row's first column and whose include regexps are the lower-cased non-empty
    cells of the row.

    :param log: progress/info logger.
    :param document_field: field the detectors are created for.
    :param csv: raw CSV bytes; first column holds the detected values.
    :param drop_previous_field_detectors: delete previously imported
        simple-config detectors of this field first.
    :param update_field_choice_values: replace the field's choices with the
        sorted unique values of the first CSV column.
    :raises ValueError: if the CSV has no rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    if df.shape[0] < 1 or df.shape[1] < 1:
        raise ValueError('Config csv contains no data')
    row_num = df.shape[0]

    if update_field_choice_values:
        choices = df[df.columns[0]].dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_field.document_type, df.shape[0]))
    # One progress step per ~10 rows.
    log.set_progress_steps_number(int(row_num / 10) + 1)
    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()
    for index, row in df.iterrows():
        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        detector.detected_value = row[0]
        detector.include_regexps = '\n'.join(row.dropna()).lower()
        detector.save()
        if index % 10 == 0:
            log.step_progress()
    log.info('Done.')
def detect_field_values_for_python_coded_field(
        document: Document,
        field: DocumentField,
        sentence_text_units: List[TextUnit],
        do_not_write: bool) -> int:
    """Detect values for a python-coded field and save them via
    DetectFieldValues.save_detected_values.

    :param document: document to detect values in.
    :param field: field configured with a python-coded implementation.
    :param sentence_text_units: pre-loaded sentence units (used in
        per-sentence mode).
    :param do_not_write: dry-run flag passed through to the save routine.
    :returns: number of values saved (as reported by save_detected_values).
    :raises RuntimeError: if the python-coded field is unknown or a detected
        location maps to no text unit.
    """
    python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
        field.python_coded_field)  # type: PythonCodedField
    if not python_coded_field:
        raise RuntimeError('Unknown python-coded field: {0}'.format(
            field.python_coded_field))
    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]  # type: FieldType
    detected_values = list()  # type: List[DetectedFieldValue]
    if python_coded_field.by_sentence:
        for text_unit in sentence_text_units:
            for value, location_start, location_end in python_coded_field.get_values(
                    text_unit.text) or []:
                detected_values.append(
                    DetectedFieldValue(text_unit, value, None, location_start,
                                       location_end))
                # Single-valued non-choice field: first hit wins.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    return DetectFieldValues.save_detected_values(
                        document, field, field_type_adapter, detected_values,
                        do_not_write)
    else:
        for value, location_start, location_end in python_coded_field.get_values(
                document.full_text) or []:
            # Map the document-level location onto the sentence containing its start.
            text_unit = TextUnit.objects.filter(
                document=document,
                unit_type='sentence',
                location_start__lte=location_start,
                location_end__gte=location_start).first()  # type: TextUnit
            if not text_unit:
                raise RuntimeError(
                    'Python coded field {0} detected a value in document {1} at '
                    'location [{2};{3}] but the start of location does not belong to any '
                    'text unit object in DB.\n'
                    'It can not be. Something is broken.'.format(
                        field.python_coded_field, document, location_start,
                        location_end))
            # Rebase document-level offsets to text-unit-relative offsets.
            location_length = location_end - location_start
            location_start = location_start - text_unit.location_start
            location_end = location_start + location_length
            detected_values.append(
                DetectedFieldValue(text_unit, value, None, location_start,
                                   location_end))
            if not (field_type_adapter.multi_value or field.is_choice_field()):
                return DetectFieldValues.save_detected_values(
                    document, field, field_type_adapter, detected_values,
                    do_not_write)
    return DetectFieldValues.save_detected_values(document, field,
                                                  field_type_adapter,
                                                  detected_values, do_not_write)
def detect_field_values(cls, log: ProcessLogger, doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Detect values by running the field's regexp detectors over the string
    representations of the document's depends-on field values.

    :returns: list of DetectedFieldValue (no text unit attached — the match
        source is another field's value, not document text).
    """
    depends_on_fields = list(field.depends_on_fields.all())
    # Current (not user-removed) values of the fields this one depends on.
    qs_document_field_values = doc.documentfieldvalue_set \
        .filter(removed_by_user=False) \
        .filter(field__in=depends_on_fields)
    field_code_to_value = merge_document_field_values_to_python_value(
        list(qs_document_field_values))
    # Ensure every depends-on code is present (missing ones map to None).
    field_code_to_value = {
        f.code: field_code_to_value.get(f.code)
        for f in depends_on_fields
    }
    document_type = doc.document_type
    field_detectors = DocumentFieldDetector.objects.filter(
        document_type=document_type, field=field)
    field_type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

    detected_values = list()  # type: List[DetectedFieldValue]
    for depends_on_value in field_code_to_value.values():
        if not depends_on_value:
            continue
        depends_on_value = str(depends_on_value)
        for field_detector in field_detectors:
            if field_detector.matches(depends_on_value):
                value = field_detector.detected_value
                hint_name = None
                if field_type_adapter.value_aware:
                    # Extract an actual value from the matched string.
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter \
                        .get_or_extract_value(doc, field, value, hint_name,
                                              depends_on_value)
                    if value is None:
                        continue
                detected_values.append(
                    DetectedFieldValue(field, value, None, hint_name))
                # Single-valued non-choice fields stop at the first hit.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    break
        if detected_values and not (field_type_adapter.multi_value
                                    or field.is_choice_field()):
            break
    return detected_values
def detect_field_values(cls, log: ProcessLogger, doc: Document, field: DocumentField) -> List[DetectedFieldValue]: python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get( field.python_coded_field) # type: PythonCodedField if not python_coded_field: raise RuntimeError('Unknown python-coded field: {0}'.format( field.python_coded_field)) field_type_adapter = FIELD_TYPES_REGISTRY[ field.type] # type: FieldType detected_values = list() # type: List[DetectedFieldValue] if python_coded_field.by_sentence: qs_text_units = TextUnit.objects \ .filter(document=doc) \ .filter(unit_type=field.text_unit_type) \ .order_by('location_start', 'pk') for text_unit in qs_text_units.iterator(): for value, location_start, location_end in python_coded_field.get_values( text_unit.text) or []: detected_values.append( DetectedFieldValue(field, value, text_unit, None, location_start, location_end)) if not (field_type_adapter.multi_value or field.is_choice_field()): return detected_values else: for value, location_start, location_end in python_coded_field.get_values( doc.full_text) or []: text_unit = TextUnit.objects.filter( document=doc, unit_type='sentence', location_start__lte=location_start, location_end__gte=location_start).first() # type: TextUnit if not text_unit: raise RuntimeError( 'Python coded field {0} detected a value in document {1} at ' 'location [{2};{3}] but the start of location does not belong to any ' 'text unit object in DB.\n' 'It can not be. Something is broken.'.format( field.python_coded_field, doc, location_start, location_end)) location_length = location_end - location_start location_start = location_start - text_unit.location_start location_end = location_start + location_length detected_values.append( DetectedFieldValue(field, value, text_unit, None, location_start, location_end)) if not (field_type_adapter.multi_value or field.is_choice_field()): return detected_values return detected_values
def detect_field_values(
        cls, log: ProcessLogger, doc: Document, field: DocumentField,
        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Detect values by matching the field's regexp detectors against the
    document's text units, with a stop-words short-circuit first.

    Text units and detectors come from the strategy's pluggable repositories
    (swappable in tests).
    """
    depends_on_full_text = doc.full_text
    # Stop words may fully resolve the field without scanning text units.
    detected_with_stop_words, detected_values = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return detected_values or list()

    qs_text_units = RegexpsOnlyFieldDetectionStrategy.\
        text_unit_repo.get_doc_text_units(doc, field.text_unit_type)

    field_detectors = RegexpsOnlyFieldDetectionStrategy.\
        field_detector_repo.get_field_detectors(field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]

    field_type_adapter = field.get_field_type()  # type: FieldType
    detected_values = list()  # type: List[DetectedFieldValue]

    for text_unit in qs_text_units:  # type: TextUnit
        for field_detector in detectors:
            matching_string = field_detector.matching_string(
                text_unit.text, text_is_sentence=text_unit.is_sentence())
            if matching_string is not None:
                value = field_detector.get_validated_detected_value(field)
                hint_name = None
                if field_type_adapter.requires_value:
                    # Extract the actual value from the matched string.
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter \
                        .get_or_extract_value(doc, field, value, hint_name,
                                              matching_string)
                    if value is None:
                        continue
                detected_values.append(
                    DetectedFieldValue(field, value, text_unit, hint_name))
                # Single-valued non-choice fields stop at the first hit.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    break
        if detected_values and not (field_type_adapter.multi_value
                                    or field.is_choice_field()):
            break
    return detected_values
def detect_field_values(cls, log: ProcessLogger, doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Detect values by matching the field's DB-stored regexp detectors
    against the document's text units, with a stop-words short-circuit first.
    """
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_values = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return detected_values or list()

    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .order_by('location_start', 'pk')

    document_type = doc.document_type
    field_detectors = DocumentFieldDetector.objects.filter(
        document_type=document_type, field=field)
    field_type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

    detected_values = list()  # type: List[DetectedFieldValue]
    for text_unit in qs_text_units.iterator():
        for field_detector in field_detectors:
            if field_detector.matches(text_unit.text):
                value = field_detector.detected_value
                hint_name = None
                if field_type_adapter.value_aware:
                    # Extract an actual value from the matching text unit.
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter \
                        .get_or_extract_value(doc, field, value, hint_name,
                                              text_unit.text)
                    if value is None:
                        continue
                detected_values.append(
                    DetectedFieldValue(field, value, text_unit, hint_name))
                # Single-valued non-choice fields stop at the first hit.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    break
        if detected_values and not (field_type_adapter.multi_value
                                    or field.is_choice_field()):
            break
    return detected_values
def detect_field_values(
        cls, log: ProcessLogger, doc: Document, field: DocumentField,
        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Detect values via the field's regexp detectors (matching_string API)
    over the document's text units; stop words are checked first.
    """
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_values = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return detected_values or list()

    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .order_by('location_start', 'pk')

    field_detectors = DocumentFieldDetector.objects.filter(field=field)
    field_type_adapter = field.get_field_type()  # type: FieldType

    detected_values = list()  # type: List[DetectedFieldValue]
    for text_unit in qs_text_units.iterator():  # type: TextUnit
        for field_detector in field_detectors:
            matching_string = field_detector.matching_string(
                text_unit.text, text_is_sentence=text_unit.is_sentence())
            if matching_string is not None:
                value = field_detector.get_validated_detected_value(field)
                hint_name = None
                if field_type_adapter.requires_value:
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter \
                        .get_or_extract_value(doc, field, value, hint_name,
                                              matching_string)
                    if value is None:
                        continue
                detected_values.append(
                    DetectedFieldValue(field, value, text_unit, hint_name))
                # Single-valued non-choice fields stop at the first hit.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    break
        if detected_values and not (field_type_adapter.multi_value
                                    or field.is_choice_field()):
            break
    return detected_values
def detect_field_values_with_model(classifier_model, document: Document, field: DocumentField,
                                   sentence_text_units: List[TextUnit],
                                   do_not_write: bool) -> int:
    """Run a trained classifier over the given text units and save the
    values it predicts.

    :param classifier_model: wrapper exposing the trained sklearn model.
    :param document: document being processed.
    :param field: field whose values are predicted.
    :param sentence_text_units: text units to classify.
    :param do_not_write: dry-run flag forwarded to save_detected_values.
    :returns: result of DetectFieldValues.save_detected_values.
    """
    sklearn_model = classifier_model.get_trained_model_obj()
    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]

    # A field that is neither multi-valued nor a choice field keeps only
    # the first predicted value.
    stop_after_first = not (field_type_adapter.multi_value or field.is_choice_field())

    predictions = []  # type: List[DetectedFieldValue]
    for unit in sentence_text_units:
        value, hint_name = DetectFieldValues.predict_and_extract_value(
            sklearn_model=sklearn_model,
            field_type_adapter=field_type_adapter,
            document=document,
            field=field,
            text_unit=unit)
        if value is None:
            continue
        predictions.append(DetectedFieldValue(unit, value, hint_name))
        if stop_after_first:
            break

    return DetectFieldValues.save_detected_values(document, field,
                                                  field_type_adapter,
                                                  predictions, do_not_write)
def detect_field_values(cls, log: ProcessLogger, doc: Document, field: DocumentField) -> List[DetectedFieldValue]: document_type = doc.document_type # type: DocumentType try: classifier_model = ClassifierModel.objects \ .get(document_type=document_type, document_field=field) sklearn_model = classifier_model.get_trained_model_obj() field_type_adapter = FIELD_TYPES_REGISTRY[field.type] detected_values = list() # type: List[DetectedFieldValue] qs_text_units = TextUnit.objects \ .filter(document=doc) \ .filter(unit_type=field.text_unit_type) \ .order_by('location_start', 'pk') for text_unit in qs_text_units.iterator(): detected_value = cls.predict_and_extract_value( sklearn_model=sklearn_model, field_type_adapter=field_type_adapter, document=doc, field=field, text_unit=text_unit) if detected_value is None: continue detected_values.append(detected_value) if not (field_type_adapter.multi_value or field.is_choice_field()): break return detected_values except ClassifierModel.DoesNotExist as e: log.info('Classifier model does not exist for field: {0}'.format( field.code)) raise e
def __init__(self, text: str, field_type: str):
    """Build an in-memory fixture: a document, a field of ``field_type``,
    one text unit containing ``text`` and a pre-configured field detector
    wrapped into a DetectorFieldMatcher.

    :param text: text of the single text unit.
    :param field_type: type code assigned to the fixture field.
    """
    self.document = Document()
    self.field = DocumentField()
    self.field.type = field_type
    self.text_unit = TextUnit()
    self.text_unit.document = self.document
    self.text_unit.textunittext = TextUnitText()
    self.text_unit.textunittext.text = text
    # Non-zero start offset — presumably to exercise offset-rebasing math;
    # confirm against the tests using this fixture.
    self.text_unit.location_start = 1001
    self.text_unit.location_end = self.text_unit.location_start + len(text)
    self.detector = DocumentFieldDetector()
    self.detector.regexps_pre_process_lower = True
    self.detector.include_regexps = 'at\\s{1,5}least\\s{1,5}(two|2).{1,15}unaffiliated.{1,15}lenders\n' + \
        '(two|2).{1,30}lenders.{1,200}(not.{1,50}affiliate|affiliate.{1,100}(one|1|single))'
    self.detector.definition_words = 'required lenders\nrequired revolving lenders\n' + \
        'required revolving credit lenders\nrequired term lenders\n' + \
        'requisite lenders\nrequisite revolving lenders\n' + \
        'required class lenders\nrequired ddtl lenders'
    self.detector.detected_value = 'AFFILIATED'
    self.detector.text_part = TextParts.FULL.value
    self.detector.extraction_hint = ValueExtractionHint.TAKE_FIRST
    self.matcher = DetectorFieldMatcher(self.detector)
def detect_field_value(log: ProcessLogger,
                       doc: Document,
                       field: DocumentField,
                       save: bool = False) -> Optional[FieldValueDTO]:
    """Detect a value for one field of a document using the strategy
    configured on the field.

    :param log: process logger forwarded to the strategy.
    :param doc: document to detect the value in.
    :param field: field whose detection strategy is applied.
    :param save: when True and something was detected, persist the DTO.
    :returns: the detected FieldValueDTO or None.
    """
    field_repo = DocumentFieldRepository()

    # Fall back to the disabled strategy when none is configured.
    strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
        field.value_detection_strategy] \
        if field.value_detection_strategy else STRATEGY_DISABLED

    doc_field_values = None
    depends_on_codes = set(field.get_depends_on_codes())
    if depends_on_codes:
        # Pre-load python values of the fields this field depends on.
        doc_field_values = field_repo.get_field_code_to_python_value(
            document_type_id=doc.document_type_id,
            doc_id=doc.pk,
            field_codes_only=depends_on_codes)

    dto = strategy.detect_field_value(log, doc, field, doc_field_values)
    if save and dto is not None:
        field_repo.update_field_value_with_dto(document=doc,
                                               field=field,
                                               field_value_dto=dto,
                                               user=None)
    return dto
def detect_field_value(
        cls, log: ProcessLogger, doc: Document, field: DocumentField,
        field_code_to_value: Dict[str, Any]) -> FieldValueDTO:
    """Detect a field value by evaluating the field's formula against the
    values of the fields it depends on.

    :raises ValueError: if no formula is configured or the formula returns a
        value unsuitable for the field type.
    """
    formula = field.formula
    if not formula:
        raise ValueError(
            f'No formula specified for field {field.code} (#{field.uid})')

    depends_on_field_codes = field.get_depends_on_codes() or set()
    # Keep only the values the formula actually depends on.
    field_code_to_value = {
        c: v
        for c, v in field_code_to_value.items()
        if c in depends_on_field_codes
    }

    if field.stop_words:
        depends_on_full_text = '\n'.join(
            [str(v) for v in field_code_to_value.values()])
        log.debug(
            'detect_field_value: formula_based_field_detection, checking stop words, '
            + f'field {field.code}({field.pk}), document #{doc.pk}')
        detected_with_stop_words, detected_values \
            = detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            # NOTE(review): this returns a list although the signature
            # promises FieldValueDTO — confirm callers handle both shapes.
            return detected_values or list()
    else:
        log.debug('detect_field_value: formula_based_field_detection, ' +
                  f'field {field.code}({field.pk}), document #{doc.pk}')

    v = cls.calc_formula(field_code=field.code,
                         formula=formula,
                         depends_on_field_to_value=field_code_to_value,
                         convert_decimals_to_floats=field.
                         convert_decimals_to_floats_in_formula_args)
    typed_field = TypedField.by(field)

    # We don't accept formulas returning values of wrong type to avoid further confusion and
    # creating wrong formulas in future.
    # For example for multi-choice fields the formula should return a list and not a string
    # to ensure the admin understands that this value will replace the whole set/list of strings and not
    # just add one more string to the value.
    if typed_field.is_choice_field and typed_field.multi_value:
        if v and isinstance(v, str):
            # "outdated" formula is incorrect and returns string instead of
            # set / list, but we don't warn user: when he updates this formula
            # (or other detection method) he'll be forced to write code, returning
            # list or set.
            v = [v]

    if not typed_field.is_python_field_value_ok(v):
        raise ValueError(
            f'Formula of field {field.code} returned value not suitable for this field:\n{v}'
        )
    v = typed_field.field_value_python_to_json(v)
    return FieldValueDTO(field_value=v)
def detect_values_in_document(self, text_units: List[TextUnitMock],
                              detector: DocumentFieldDetector):
    """Run RegexpsOnlyFieldDetectionStrategy over mock text units with a
    single detector and return whatever it detects.

    Temporarily swaps the strategy's class-level repositories for mocks and
    restores them afterwards (even on failure).
    """
    init_field_type_registry()
    doc = self.setup_document(text_units)

    field = DocumentField()
    field.requires_text_annotations = False
    field.stop_words = None
    field.text_unit_type = 'sentences'
    field.type = 'multi_choice'
    field.allow_values_not_specified_in_choices = True

    text_unit_repo = MockTextUnitRepository()
    text_unit_repo.units = text_units
    for tu in text_unit_repo.units:
        tu.document = doc
        tu.unit_type = field.text_unit_type

    detect_repo = MockFieldDetectorRepository()
    detect_repo.detectors = [detector]

    # Patch the strategy's repositories; restore in `finally`.
    old_tu_repo = RegexpsOnlyFieldDetectionStrategy.text_unit_repo
    RegexpsOnlyFieldDetectionStrategy.text_unit_repo = text_unit_repo
    old_repo_detect = RegexpsOnlyFieldDetectionStrategy.field_detector_repo
    RegexpsOnlyFieldDetectionStrategy.field_detector_repo = detect_repo
    try:
        detected = RegexpsOnlyFieldDetectionStrategy. \
            detect_field_value(None, doc, field, {})
    finally:
        RegexpsOnlyFieldDetectionStrategy.text_unit_repo = old_tu_repo
        RegexpsOnlyFieldDetectionStrategy.field_detector_repo = old_repo_detect
    return detected
def _get_invalid_choices(self, saved_field: DocumentField) -> set:
    """Return choice values present on the previously saved field but no
    longer present on the field being edited (``self.object``).

    :param saved_field: the persisted version of the field.
    :returns: set of choice values that would become invalid.
    """
    previous_choices = set(saved_field.get_choice_values())
    current_choices = set(self.object.get_choice_values())
    return previous_choices - current_choices
def detect_field_value(
        cls, log: ProcessLogger, doc: Document, field: DocumentField,
        field_code_to_value: Dict[str, Any]) -> Optional[FieldValueDTO]:
    """Detect a field value by matching the field's detectors against the
    string values of the fields it depends on.

    :returns: a FieldValueDTO, or None when nothing matched and the field is
        not multi-valued.
    """
    depends_on_fields = field.get_depends_on_codes()
    field_code_to_value = {
        c: v
        for c, v in field_code_to_value.items() if c in depends_on_fields
    }

    if field.stop_words:
        # Stop words can short-circuit detection over the concatenated
        # depends-on values.
        depends_on_full_text = '\n'.join(
            [str(v) for v in field_code_to_value.values()])
        detected_with_stop_words, detected_value \
            = detect_with_stop_words_by_field_and_full_text(field, doc, depends_on_full_text)
        if detected_with_stop_words:
            return FieldValueDTO(field_value=detected_value)

    field_detectors = DocumentFieldDetector.objects.filter(field=field)
    detectors = [DetectorFieldMatcher(d) for d in field_detectors]
    typed_field = TypedField.by(field)  # type: TypedField

    values = list()  # type: List

    for depends_on_value in field_code_to_value.values():
        if not depends_on_value:
            continue
        depends_on_value = str(depends_on_value)
        for detector_field_matcher in detectors:  # type: DetectorFieldMatcher
            matching_piece = detector_field_matcher.matching_string(
                depends_on_value, text_is_sentence=False)
            if matching_piece is not None:
                matching_string = matching_piece[0]
                value = detector_field_matcher.get_validated_detected_value(
                    field)
                if typed_field.requires_value:
                    hint_name = detector_field_matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = typed_field \
                        .get_or_extract_value(doc, value, hint_name,
                                              matching_string)
                    if value is None:
                        continue
                value = typed_field.annotation_value_python_to_json(value)
                if not isinstance(typed_field, MultiValueField):
                    # Single-valued field: first match wins.
                    return FieldValueDTO(field_value=value)
                else:
                    values.append(value)

    if isinstance(typed_field, MultiValueField):
        return FieldValueDTO(
            field_value=typed_field.
            build_json_field_value_from_json_ant_values(values))
    else:
        return None
def save_detected_values(document: Document, field: DocumentField,
                         field_type_adapter: FieldType,
                         detected_values: List[DetectedFieldValue],
                         do_not_write: bool):
    """Persist detected values for a field; always refreshes the document's
    field-value cache afterwards.

    :param do_not_write: when True, count values but skip actual writes.
    :returns: 0 for no values; 1 for a single-valued choice field;
        otherwise the number of detected values.  NOTE(review): when a
        single-valued choice field has detections but none matches a
        configured choice, the function implicitly returns None — confirm
        callers tolerate that.
    """
    if len(detected_values) == 0:
        return 0
    try:
        if field.is_choice_field() and not field_type_adapter.multi_value:
            # Single-valued choice field: save the first detected value in
            # the order of the field's configured choice list.
            values_order = field.get_choice_values()
            for choice_value in values_order:
                for dv in detected_values:
                    if choice_value == dv.value:
                        if not do_not_write:
                            field_type_adapter.save_value(
                                document,
                                field,
                                dv.get_annotation_start(),
                                dv.get_annotation_end(),
                                dv.get_annotation_text(),
                                dv.text_unit,
                                dv.value,
                                user=None,
                                allow_overwriting_user_data=False,
                                extraction_hint=dv.hint_name)
                        return 1
        else:
            for dv in detected_values:
                if not do_not_write:
                    field_type_adapter.save_value(
                        document,
                        field,
                        dv.get_annotation_start(),
                        dv.get_annotation_end(),
                        dv.get_annotation_text(),
                        dv.text_unit,
                        dv.value,
                        user=None,
                        allow_overwriting_user_data=False,
                        extraction_hint=dv.hint_name)
            return len(detected_values)
    finally:
        # Keep the cached field values in sync regardless of outcome.
        document.cache_field_values()
def clean(self):
    """Form-level validation: check python-coded field consistency and
    evaluate the formula against example values of the depends-on fields.

    Adds field errors for known failure modes; raises ValidationError for
    unexpected formula errors.
    """
    field_code = self.cleaned_data.get('code')
    formula = self.cleaned_data.get('formula')
    type_code = self.cleaned_data.get('type')
    depends_on_fields = self.cleaned_data.get('depends_on_fields') or []
    # Example values to feed the formula during validation.
    fields_to_values = {field: FIELD_TYPES_REGISTRY[field.type].example_json_value(field)
                        for field in depends_on_fields}

    python_coded_field_code = self.cleaned_data.get('python_coded_field')
    if python_coded_field_code:
        python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(python_coded_field_code)
        if not python_coded_field:
            self.add_error('python_coded_field',
                           'Unknown Python-coded field: {0}'.format(python_coded_field_code))
        else:
            # The declared field type must match the python-coded field's type.
            if type_code != python_coded_field.type:
                self.add_error('type', 'Python-coded field {0} is of type {1} but {2} is specified'
                                       ' as the field type'.format(python_coded_field.title,
                                                                   python_coded_field.type,
                                                                   type_code))

    if not formula or not formula.strip() or not type_code:
        return

    try:
        DocumentField.calc_formula(field_code, type_code, formula, fields_to_values)
    except DocumentFieldFormulaError as ex:
        # Known formula error: report a readable multi-line field error.
        base_error_class = type(ex.base_error).__name__
        base_error_msg = str(ex.base_error)
        lines = list()
        lines.append("Error caught while trying to execute formula on example values:")
        for field_name in ex.field_values:
            lines.append('{0}={1}'.format(field_name, ex.field_values[field_name]))
        lines.append("{0}. {1} in formula of field '{2}' at line {3}".format(base_error_class,
                                                                             base_error_msg,
                                                                             ex.field_code,
                                                                             ex.line_number))
        self.add_error('formula', lines)
    except Exception:
        trace = traceback.format_exc()
        raise forms.ValidationError(
            'Tried to eval formula on example values:\n{0}\nGot error:\n{1}'.format(
                str(fields_to_values), trace))
    return self.cleaned_data
def detect_field_values(
        cls, log: ProcessLogger, doc: Document, field: DocumentField,
        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Detect values with a trained classifier model, after a stop-words
    short-circuit over the document's full text.

    :raises ClassifierModel.DoesNotExist: re-raised after logging when no
        model was trained for the field.
    """
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_values = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return detected_values or list()

    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)
        sklearn_model = classifier_model.get_trained_model_obj()
        field_type_adapter = field.get_field_type()

        detected_values = list()  # type: List[DetectedFieldValue]

        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        for text_unit in qs_text_units.iterator():
            detected_value = cls.predict_and_extract_value(
                sklearn_model=sklearn_model,
                field_type_adapter=field_type_adapter,
                document=doc,
                field=field,
                text_unit=text_unit)
            if detected_value is None:
                continue
            detected_values.append(detected_value)
            # Single-valued non-choice fields stop at the first prediction.
            if not (field_type_adapter.multi_value or field.is_choice_field()):
                break
        return detected_values
    except ClassifierModel.DoesNotExist as e:
        log.info('Classifier model does not exist for field: {0}'.format(
            field.code))
        raise e
def clean(self):
    """Validate the field's formula by evaluating it against example values
    generated for each depends-on field.

    Returns early (None) when there is nothing to validate; raises
    ValidationError when the formula fails to evaluate.
    """
    data = self.cleaned_data
    formula = data.get('formula')
    type_code = data.get('type')
    if not formula or not formula.strip() or not type_code:
        return

    # Build example values for every field this formula depends on.
    sample_values = {}
    for dep_field in data.get('depends_on_fields') or []:
        sample_values[dep_field] = \
            FIELD_TYPES_REGISTRY[dep_field.type].example_json_value(dep_field)

    try:
        DocumentField.calc_formula(type_code, formula, sample_values)
    except Exception:
        raise forms.ValidationError(
            'Tried to eval formula on example values:\n{0}\nGot error:\n{1}'
            .format(str(sample_values), traceback.format_exc()))
    return self.cleaned_data
def save_detected_values(document: Document, field: DocumentField,
                         detected_values: List[DetectedFieldValue]):
    """Persist detected values for a field.

    :returns: 0 for no values; 1 for a single-valued choice field (first
        value matching the configured choice order wins); otherwise the
        number of detected values.  NOTE(review): when a single-valued
        choice field has detections but none matches a configured choice,
        the function implicitly returns None — confirm callers tolerate it.
    """
    if len(detected_values) == 0:
        return 0

    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]  # type: FieldType

    if field.is_choice_field() and not field_type_adapter.multi_value:
        # Single-valued choice field: save only the first detected value,
        # following the order of the field's choice list.
        values_order = field.get_choice_values()
        for choice_value in values_order:
            for dv in detected_values:
                if choice_value == dv.value:
                    field_type_adapter.save_value(
                        document,
                        field,
                        dv.get_annotation_start(),
                        dv.get_annotation_end(),
                        dv.get_annotation_text(),
                        dv.text_unit,
                        dv.value,
                        user=dv.user,
                        allow_overwriting_user_data=dv.user is not None,
                        extraction_hint=dv.hint_name)
                    return 1
    else:
        for dv in detected_values:
            field_type_adapter.save_value(
                document,
                field,
                dv.get_annotation_start(),
                dv.get_annotation_end(),
                dv.get_annotation_text(),
                dv.text_unit,
                dv.value,
                user=dv.user,
                allow_overwriting_user_data=dv.user is not None,
                extraction_hint=dv.hint_name)
        return len(detected_values)
def clean(self):
    """Validate the field's formula by evaluating it against example values
    of the depends-on fields, reporting readable errors.

    DocumentFieldFormulaError becomes a multi-line 'formula' field error;
    any other failure raises ValidationError with a traceback.
    """
    field_code = self.cleaned_data.get('code')
    formula = self.cleaned_data.get('formula')
    type_code = self.cleaned_data.get('type')
    if not formula or not formula.strip() or not type_code:
        return
    depends_on_fields = self.cleaned_data.get('depends_on_fields') or []
    # Example values to feed the formula during validation.
    fields_to_values = {
        field: FIELD_TYPES_REGISTRY[field.type].example_json_value(field)
        for field in depends_on_fields
    }
    try:
        DocumentField.calc_formula(field_code, type_code, formula,
                                   fields_to_values)
    except DocumentFieldFormulaError as ex:
        base_error_class = type(ex.base_error).__name__
        base_error_msg = str(ex.base_error)
        lines = list()
        lines.append(
            "Error caught while trying to execute formula on example values:"
        )
        for field_name in ex.field_values:
            lines.append('{0}={1}'.format(field_name,
                                          ex.field_values[field_name]))
        lines.append(
            "{0}. {1} in formula of field '{2}' at line {3}".format(
                base_error_class, base_error_msg, ex.field_code,
                ex.line_number))
        self.add_error('formula', lines)
    except Exception:
        trace = traceback.format_exc()
        raise forms.ValidationError(
            'Tried to eval formula on example values:\n{0}\nGot error:\n{1}'
            .format(str(fields_to_values), trace))
    return self.cleaned_data
def detect_with_stop_words_by_field_and_full_text(field: DocumentField,
                                                  full_text: str) -> Tuple[bool, Optional[List]]:
    """Try detecting a field value via the field's stop words.

    :param field: field whose stop words are applied.
    :param full_text: text the stop words are matched against.
    :returns: (detected, values) — ``(False, None)`` when stop words do not
        apply or did not match; ``(True, None)`` when they matched without a
        value; otherwise ``(True, [DetectedFieldValue])``.
    """
    no_detection = (False, None)

    # Stop words never apply to fields requiring text annotations.
    if field.requires_text_annotations:
        return no_detection

    stop_words = compile_stop_words(field.stop_words)
    if not stop_words:
        return no_detection

    adapter = field.get_field_type()  # type: FieldType
    detected, raw_value = detect_value_with_stop_words(stop_words, full_text)
    if not detected:
        return no_detection
    if raw_value is None:
        return True, None
    extracted = adapter.extract_from_possible_value_text(field, raw_value)
    return True, [DetectedFieldValue(field, extracted)]
def make_doc_field(**kwargs) -> DocumentField:
    """Build an in-memory DocumentField for tests, filling sensible defaults
    for any attribute not passed by the caller.

    :param kwargs: DocumentField constructor arguments; missing ones are
        defaulted below.
    :returns: an unsaved DocumentField instance.
    """
    doc_field_attributes = {
        'requires_text_annotations': kwargs.get('requires_text_annotations', False),
        'stop_words': kwargs.get('stop_words'),
        'text_unit_type': kwargs.get('text_unit_type', 'sentence'),
        'type': kwargs.get('type', 'multi_choice'),
        'choices': kwargs.get('choices', 'brown fox\nbrown box\nfrown fox'),
        # BUG FIX: the next three defaults previously looked up
        # kwargs.get('choices', ...) (copy-paste error), so passing
        # 'choices' silently overrode them with the choices string.
        # Each now reads its own key.
        'allow_values_not_specified_in_choices':
            kwargs.get('allow_values_not_specified_in_choices', True),
        'detect_limit_unit': kwargs.get('detect_limit_unit', 'UNIT'),
        'detect_limit_count': kwargs.get('detect_limit_count', 0)
    }
    # Only fill attributes the caller did not provide explicitly.
    for k, v in doc_field_attributes.items():
        if k not in kwargs:
            kwargs[k] = v
    return DocumentField(**kwargs)
def detect_field_values_with_regexps(document: Document, field: DocumentField,
                                     sentence_text_units: List[TextUnit],
                                     do_not_write: bool) -> int:
    """Detect values by matching the field's regexp detectors against the
    given text units and save them via DetectFieldValues.save_detected_values.

    :param do_not_write: dry-run flag forwarded to the save routine.
    :returns: result of save_detected_values.
    """
    document_type = document.document_type
    field_detectors = DocumentFieldDetector.objects.filter(
        document_type=document_type, field=field)
    field_type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

    detected_values = list()  # type: List[DetectedFieldValue]
    for text_unit in sentence_text_units:
        for field_detector in field_detectors:
            if field_detector.matches(text_unit.text):
                value = field_detector.detected_value
                hint_name = None
                if field_type_adapter.value_aware:
                    # Extract the actual value from the matching sentence.
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter \
                        .get_or_extract_value(document, field, value,
                                              hint_name, text_unit.text)
                    if value is None:
                        continue
                detected_values.append(
                    DetectedFieldValue(text_unit, value, hint_name))
                # Single-valued non-choice fields stop at the first hit.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    break
    return DetectFieldValues.save_detected_values(document, field,
                                                  field_type_adapter,
                                                  detected_values,
                                                  do_not_write)
def manual_test_columns(self):
    """Check calculate_doctype_cache_columns against a mocked field repo:
    baseline column count, then the count after changing a field's type.
    """
    user_fields = [DocumentField(), DocumentField()]
    user_fields[0].code = 'k_one'
    user_fields[0].type = 'int'
    user_fields[1].code = 'k_ten'
    user_fields[1].type = 'multi_choice'
    repo = DocumentFieldRepositoryMock()
    repo.fields = user_fields

    doc_type = DocumentType.objects.get(code='k_fields_depend')
    f_count = calculate_doctype_cache_columns(doc_type, [], repo)
    self.assertEqual(35, f_count)

    # old type was 'bigint'
    f_new = DocumentField()
    f_new.code = 'k_ten'
    f_new.type = 'linked_documents'
    f_count = calculate_doctype_cache_columns(doc_type, [f_new], repo)
    self.assertEqual(36, f_count)
def detect_field_values(
        cls, log: ProcessLogger, doc: Document, field: DocumentField,
        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Detect field values by running the field's registered python-coded detector.

    Two modes, selected by the detector's ``detect_per_text_unit`` flag:
    per-text-unit mode feeds each text unit's text to the detector; whole-
    document mode feeds ``doc.full_text`` and then maps each reported
    location back onto the text unit containing it.

    :param log: process logger for detector diagnostics
    :param doc: document to scan
    :param field: field whose ``python_coded_field`` names the detector
    :param cached_fields: cached field values (not read by this strategy)
    :return: detected values; truncated to the first hit for single-value,
             non-choice fields
    :raises RuntimeError: unknown python-coded field name; a value with no
             location on a field that requires text annotations; or a
             location that falls outside every text unit in the DB
    """
    python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
        field.python_coded_field)  # type: PythonCodedField
    if not python_coded_field:
        raise RuntimeError('Unknown python-coded field: {0}'.format(
            field.python_coded_field))
    field_type_adapter = field.get_field_type()  # type: FieldType
    detected_values = list()  # type: List[DetectedFieldValue]
    if python_coded_field.detect_per_text_unit:
        # Per-unit mode: scan units of the field's unit type in document order.
        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')
        for text_unit in qs_text_units.iterator():
            for value, location_start, location_end \
                    in python_coded_field.get_values(log, field, doc, text_unit.text) or []:
                detected_values.append(
                    DetectedFieldValue(field, value, text_unit, None,
                                       location_start, location_end))
                # Single-value, non-choice fields stop at the first hit.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    return detected_values
    else:
        # Whole-document mode: detector reports absolute offsets into full_text.
        for value, location_start, location_end \
                in python_coded_field.get_values(log, field, doc, doc.full_text) or []:
            if field.requires_text_annotations and (
                    location_start is None or location_end is None):
                raise RuntimeError(
                    'Python coded field {0} detected a value in document {1} at '
                    'undefined location but the field requires text annotation (and location).\n'
                    'This should not happen. Something is broken.'.format(
                        field.python_coded_field, doc))
            if location_start is not None and location_end is not None:
                # Find the text unit whose span covers the start offset.
                text_unit = TextUnit.objects.filter(
                    document=doc,
                    unit_type=field.text_unit_type,
                    location_start__lte=location_start,
                    location_end__gte=location_start).first()  # type: TextUnit
                if not text_unit:
                    raise RuntimeError(
                        'Python coded field {0} detected a value in document {1} at '
                        'location [{2};{3}] but the start of location does not belong to any '
                        'text unit object in DB.\n'
                        'This should not happen. Something is broken.'.
                        format(field.python_coded_field, doc, location_start,
                               location_end))
                # Rebase the absolute offsets to be relative to the text unit.
                # Length is captured BEFORE location_start is rebased, so the
                # rebased end is computed from the rebased start + length.
                location_length = location_end - location_start
                location_start = location_start - text_unit.location_start
                location_end = location_start + location_length
            else:
                # No location reported (allowed because the annotation check
                # above already passed): store the value without a unit.
                text_unit = None
                location_start = None
                location_end = None
            detected_values.append(
                DetectedFieldValue(field, value, text_unit, None,
                                   location_start, location_end))
            # Single-value, non-choice fields stop at the first hit.
            if not (field_type_adapter.multi_value or field.is_choice_field()):
                return detected_values
    return detected_values
class LoggerMock(ProcessLogger): def info(self, message: str): print(message) def error(self, message: str, field_code: str = None, exc_info: Exception = None): if field_code: message = f'{field_code}: {message or "error"}' if exc_info: message += f'\nException: {exc_info}' print(message) doc_field = DocumentField() logger = LoggerMock() def setup_mock(): doc_field.uid = 'ABCDEF' doc_field.code = 'client' csv_text = """ ,value,pattern 0,"Big Bank & Company (004578) (Knight, Bobby (Charlotte); Bryant, Koby (Charlotte); Williams, Gary (Charlotte); Johnson, Magic (Charlotte); Lobo, Rebecca (Charlotte))","\bbig\s{1,5}bank\s{1,5}.{1,5}\s{1,5}company\s{1,5}(004578)\b" 1,"Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))","family\s{1,5}name\s{1,5}\(173437\)" 2,"Financial Services & Co. (015607) (Spelling, Tori (Chicago); Priestley, Jason (Dallas); Perry, Luke (New York); Doherty, Shannon (Chicago); Garth, Jenny (Chicago))","\bfinancial\s{1,5}services\s{1,5}.{1,5}(015607)\b" 3,"Food Wholsale, Inc. (056230) (Jenner, Bruce (Chicago))","\bfood\s{1,5}wholsale,(056230)\b" 4,"All Eyes Communications (018951) (Moore, Michael (New York); Tarantino, Quentin (San Francisco); Lee, Spike (New York); Levinson, Barry (Charlotte))","\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b" 5,"Joe Smith Archives, LLC d/b/a Foxtrot (085292) (Flay, Bobby (New York))","\bfoxtrot\s{1,5}(085292)\b
def process(self, **kwargs):
    """Normalize field codes of all document types inside one transaction.

    For every DocumentField of every DocumentType: escape the code into a
    DB-safe column name (truncated to DOCUMENT_FIELD_CODE_MAX_LEN); resolve
    collisions by appending a ``_N`` counter; keep ``long_code`` and the
    jiphy-translated ``hide_until_js`` in sync; and rewrite the document
    type's field-code aliases to point at the renamed codes. A CSV summary
    of all renames is written to the log at the end.
    """
    with transaction.atomic():
        csv_log = list()  # type: List[Tuple[str, str, str]]
        for document_type in DocumentType.objects.all():  # type: DocumentType
            changed_field_codes = dict()  # type: Dict[str, str]
            field_code_use_counts = dict()  # type: Dict[str, int]
            # First pass: seed usage counters. A code like 'foo2' bumps the
            # counter of its base 'foo' so a later collision on 'foo' starts
            # numbering after the highest existing suffix.
            for code in DocumentField.objects \
                    .filter(document_type=document_type) \
                    .order_by('order', 'code') \
                    .values_list('code', flat=True):
                field_code_use_counts[code] = 1
                m = self.RE_FIELD_CODE_NUM.fullmatch(code)
                if m:
                    base = m.group(1)
                    num = int(m.group(2))
                    old_num = field_code_use_counts.get(base) or 0
                    field_code_use_counts[base] = max(old_num, num)
            # Second pass: rename fields whose code is not already DB-safe.
            for field in DocumentField.objects \
                    .filter(document_type=document_type) \
                    .order_by('order', 'code'):  # type: DocumentField
                field_code_escaped = escape_column_name(field.code)[:DOCUMENT_FIELD_CODE_MAX_LEN]
                if field.code == field_code_escaped:
                    # Code already safe: just refresh long_code if stale.
                    field_code_use_counts[field.code] = \
                        (field_code_use_counts.get(field.code) or 0) + 1
                    long_code = DocumentField.get_long_code(field, document_type)
                    if field.long_code != long_code:
                        self.log_info('Updating field long code {0} to {1}'
                                      .format(field.long_code, long_code))
                        field.long_code = long_code
                        field.save(update_fields={'long_code'})
                else:
                    field_code_use_count = field_code_use_counts.get(field_code_escaped)
                    if field_code_use_count is not None:
                        # Escaped code collides with an existing one: append
                        # '_N' with the next counter value.
                        field_code_use_counts[field_code_escaped] = field_code_use_count + 1
                        counter_str = str(field_code_use_count)
                        # make next repeated column name to be column1, column2, ...
                        # make it fitting into N chars by cutting the field code on the required
                        # number of chars to fit the num
                        field_code_escaped = field_code_escaped[:DOCUMENT_FIELD_CODE_MAX_LEN - len(counter_str) - 1] \
                                             + '_' + counter_str
                    else:
                        field_code_use_counts[field_code_escaped] \
                            = (field_code_use_counts.get(field_code_escaped) or 0) + 1
                    self.log_info('Updating field {0}.{1} to {2}'
                                  .format(document_type.code, field.code, field_code_escaped))
                    changed_field_codes[field.code] = field_code_escaped
                    csv_log.append((document_type.code, field.code, field_code_escaped))
                    field.code = field_code_escaped
                    field.long_code = DocumentField.get_long_code(field, document_type)
                    field.save(update_fields={'code', 'long_code'})
                # Keep the JS translation of the hide-until expression in sync.
                hide_until_js = jiphy.to.javascript(field.hide_until_python) \
                    if field.hide_until_python else ''
                if hide_until_js != field.hide_until_js:
                    field.hide_until_js = hide_until_js
                    self.log_info('Updating hide_until_js for field {0}.{1}'
                                  .format(document_type.code, field.code))
                    field.save(update_fields={'hide_until_js'})
            # Re-point aliases at the renamed field codes.
            if len(changed_field_codes) > 0 and document_type.field_code_aliases:
                updated_aliases = {k: changed_field_codes.get(v) or v
                                   for k, v in document_type.field_code_aliases.items()}
                self.log_info('Updating field code aliases of document type {0}"\n{1}'
                              .format(document_type.code, updated_aliases))
                document_type.field_code_aliases = updated_aliases
                document_type.save(update_fields={'field_code_aliases'})
    # Emit a CSV summary of every rename performed above.
    output = io.StringIO()
    writer = csv.writer(output)
    writer.writerow(('Document Type', 'Old Field Code', 'New Field Code'))
    for r in csv_log:
        writer.writerow(r)
    self.log_info('\n\n\n------------------\n'
                  'Changed fields csv:\n' + output.getvalue() + '\n------------------')
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Detect values for a field that depends on other fields' values.

    Collects the (non-removed) values of the fields this field depends on,
    optionally short-circuits via stop-word detection over their combined
    text, then runs the field's detectors against each dependency value.

    :param log: process logger (not used directly here)
    :param doc: document whose dependency field values are read
    :param field: the dependent field to detect
    :return: detected values; scanning stops early for single-value,
             non-choice fields once a value is found
    """
    depends_on_fields = list(field.depends_on_fields.all())
    qs_document_field_values = doc.documentfieldvalue_set \
        .filter(removed_by_user=False) \
        .filter(field__in=depends_on_fields)
    field_code_to_value = merge_document_field_values_to_python_value(
        list(qs_document_field_values))
    # Re-key so every dependency appears, with None for missing values.
    field_code_to_value = {
        f.code: field_code_to_value.get(f.code)
        for f in depends_on_fields
    }
    if field.stop_words:
        # Stop-word pre-check over the concatenated dependency values; when
        # it fires, its result replaces the detector scan entirely.
        depends_on_full_text = '\n'.join(
            [str(v) for v in field_code_to_value.values()])
        detected_with_stop_words, detected_values \
            = detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
        if detected_with_stop_words:
            return detected_values or list()
    field_detectors = DocumentFieldDetector.objects.filter(field=field)
    field_type_adapter = FIELD_TYPES_REGISTRY.get(
        field.type)  # type: FieldType
    detected_values = list()  # type: List[DetectedFieldValue]
    for depends_on_value in field_code_to_value.values():
        if not depends_on_value:
            continue
        depends_on_value = str(depends_on_value)
        for field_detector in field_detectors:  # type: DocumentFieldDetector
            matching_string = field_detector.matching_string(
                depends_on_value, text_is_sentence=False)
            if matching_string is not None:
                value = field_detector.get_validated_detected_value(field)
                hint_name = None
                if field_type_adapter.requires_value:
                    # Extraction hint defaults to TAKE_FIRST when the
                    # detector does not specify one.
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter \
                        .get_or_extract_value(doc, field, value, hint_name, matching_string)
                    if value is None:
                        continue
                detected_values.append(
                    DetectedFieldValue(field, value, None, hint_name))
                # Single-value, non-choice fields: one hit per dependency value.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    break
        # ...and stop scanning further dependency values once anything found.
        if detected_values and not (field_type_adapter.multi_value
                                    or field.is_choice_field()):
            break
    return detected_values