def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """Compute the value of a formula-based field from the fields it depends on.

    This method assumes that field detection already goes in the required
    order and that the dependencies of this field are already
    calculated / detected.

    Args:
        log: Logger passed in by the caller; not used by this method.
        doc: Document whose stored (not removed by user) field values are
            read for the fields listed in ``field.depends_on_fields``.
        field: Field to calculate. Its ``formula`` must be non-empty.
        cached_fields: Accepted for interface compatibility; not read here.

    Returns:
        If ``field.stop_words`` is set and the stop-word helper reports a
        detection, the values it produced (or an empty list). Otherwise a
        single-element list wrapping the result of ``cls.calc_formula``.

    Raises:
        ValueError: If the field has no formula.
    """
    formula = field.formula
    if not formula:
        raise ValueError('No formula specified for field {0} (#{1})'.format(field.code, field.uid))

    depends_on_fields = list(field.depends_on_fields.all())

    qs_document_field_values = doc.documentfieldvalue_set \
        .filter(removed_by_user=False) \
        .filter(field__in=depends_on_fields)

    field_code_to_value = merge_document_field_values_to_python_value(list(qs_document_field_values))

    # Make sure every dependency has an entry, even when no value is stored
    # for it (the entry is None in that case).
    field_code_to_value = {f.code: field_code_to_value.get(f.code) for f in depends_on_fields}

    if field.stop_words:
        # Missing values are skipped: str(None) would inject the literal word
        # "None" into the text and could falsely trigger a stop word.
        depends_on_full_text = '\n'.join(
            str(v) for v in field_code_to_value.values() if v is not None)
        detected_with_stop_words, detected_values \
            = detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
        if detected_with_stop_words:
            return detected_values or list()

    v = cls.calc_formula(field.code, field.type, formula, field_code_to_value)
    return [DetectedFieldValue(field, v)]
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Evaluate the formula of ``field`` against the values of its dependencies.

    This method assumes that field detection already goes in the required
    order and that the dependencies of this field are already
    calculated / detected.

    Args:
        log: Logger passed in by the caller; not used by this method.
        doc: Document whose stored field values (not removed by user) are
            read for the fields in ``field.depends_on_fields``.
        field: Field to calculate; must have a non-empty ``formula``.

    Returns:
        A single-element list wrapping the result of ``cls.calc_formula``.

    Raises:
        ValueError: If the field has no formula.
    """
    formula = field.formula
    if not formula:
        message = 'No formula specified for field {0} (#{1})'.format(field.code, field.uid)
        raise ValueError(message)

    dependencies = list(field.depends_on_fields.all())

    stored_values = doc.documentfieldvalue_set \
        .filter(removed_by_user=False) \
        .filter(field__in=dependencies)
    merged_values = merge_document_field_values_to_python_value(list(stored_values))

    # Every dependency gets an entry; the ones without a stored value map to None.
    formula_inputs = {}
    for dependency in dependencies:
        formula_inputs[dependency.code] = merged_values.get(dependency.code)

    result = cls.calc_formula(field.code, field.type, formula, formula_inputs)
    return [DetectedFieldValue(field, result)]
def test_field_detector_model(task: ExtendedTask, field_id, document_id) -> dict:
    """Compare freshly detected values of a field against the stored ones.

    Runs field detection for the given document/field without saving and
    compares the result with the stored ``DocumentFieldValue`` rows that are
    not removed by the user. A mismatch is reported through ``task.log_info``.

    Args:
        task: Task used for logging (directly and via ``CeleryTaskLogger``).
        field_id: Primary key of the ``DocumentField`` to test.
        document_id: Primary key of the ``Document`` to test on.

    Returns:
        A dict with ``text_units_number`` (count of the document's text units
        of the field's ``text_unit_type``) and ``value_matches_expected``
        (whether detected and stored values agree).
    """
    document = Document.objects.get(pk=document_id)  # type: Document
    field = DocumentField.objects.get(pk=field_id)  # type: DocumentField

    expected_dfvs = field_detection.detect_and_cache_field_values(
        CeleryTaskLogger(task), document, field, save=False)  # type: List[DetectedFieldValue]
    actual_dfvs = list(DocumentFieldValue.objects.filter(
        document=document,
        field=field,
        removed_by_user=False).all())  # type: List[DocumentFieldValue]

    if field.is_value_aware():
        # dates, numbers, e.t.c.
        expected_field_values = field_detection.merge_detected_field_values_to_python_value(expected_dfvs)
        expected_field_value = expected_field_values.get(field.code)

        actual_field_values = merge_document_field_values_to_python_value(actual_dfvs)
        actual_field_value = actual_field_values.get(field.code)

        matches = bool(expected_field_value == actual_field_value)
    else:
        # related-info e.t.c.: compare the sets of text units the values point to
        expected_set = {'text_unit_' + str(dfv.text_unit.id) for dfv in expected_dfvs if dfv.text_unit}
        expected_field_value = '; '.join(sorted(expected_set))

        actual_set = {'text_unit_' + str(dfv.text_unit.id) for dfv in actual_dfvs if dfv.text_unit}
        actual_field_value = '; '.join(sorted(actual_set))

        matches = bool(expected_set == actual_set)

    if not matches:
        # Detected values are not guaranteed to carry a text unit (the set
        # comprehensions above already guard for that), so skip the ones
        # without it instead of failing with AttributeError while logging.
        detected_in_text = '\n---\n'.join(
            [dfv.text_unit.text for dfv in expected_dfvs if dfv.text_unit]) if expected_dfvs else ''

        # The status marker always resolves to the error label in this branch.
        task.log_info('{3} Test doc: {0} (Project: {5}). '
                      'Detected: {1}. Real: {2}.\nDetected in text:-----\n{4}\n-----'
                      .format(document.name,
                              expected_field_value,
                              actual_field_value,
                              '[ OK ]' if matches else '[ ERR ]',
                              detected_in_text,
                              document.project.name if document.project else ''))

    text_units_number = TextUnit.objects.filter(document=document, unit_type=field.text_unit_type).count()

    return {
        'text_units_number': text_units_number,
        'value_matches_expected': matches
    }
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Detect values of ``field`` by running its detectors over dependency values.

    The stored values (not removed by user) of the fields in
    ``field.depends_on_fields`` are converted to strings and matched against
    the ``DocumentFieldDetector`` rows registered for the document's type and
    this field.

    Args:
        log: Logger passed in by the caller; not used by this method.
        doc: Document whose stored field values are read.
        field: Field to detect values for.

    Returns:
        The detected values. Unless the field type is multi-value or the field
        is a choice field, detection stops after the first detected value.
    """
    dependencies = list(field.depends_on_fields.all())

    stored_values = doc.documentfieldvalue_set \
        .filter(removed_by_user=False) \
        .filter(field__in=dependencies)
    merged_values = merge_document_field_values_to_python_value(list(stored_values))

    # Every dependency gets an entry; the ones without a stored value map to None.
    field_code_to_value = {}
    for dependency in dependencies:
        field_code_to_value[dependency.code] = merged_values.get(dependency.code)

    document_type = doc.document_type
    field_detectors = DocumentFieldDetector.objects.filter(document_type=document_type, field=field)
    field_type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

    def single_value_only() -> bool:
        # Evaluated lazily, at the same points where the check is needed.
        return not (field_type_adapter.multi_value or field.is_choice_field())

    results = list()  # type: List[DetectedFieldValue]

    for raw_value in field_code_to_value.values():
        if not raw_value:
            continue
        source_text = str(raw_value)

        for detector in field_detectors:
            if not detector.matches(source_text):
                continue

            value = detector.detected_value
            hint_name = None

            if field_type_adapter.value_aware:
                hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                value, hint_name = field_type_adapter.get_or_extract_value(
                    doc, field, value, hint_name, source_text)
                if value is None:
                    # Nothing could be obtained for this detector - try the next one.
                    continue

            results.append(DetectedFieldValue(field, value, None, hint_name))

            if single_value_only():
                break

        if results and single_value_only():
            break

    return results
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """Detect values of ``field`` by running its detectors over dependency values.

    The stored values (not removed by user) of the fields in
    ``field.depends_on_fields`` are converted to strings and matched against
    the ``DocumentFieldDetector`` rows registered for this field. When the
    field has stop words, they are checked first.

    Args:
        log: Logger passed in by the caller; not used by this method.
        doc: Document whose stored field values are read.
        field: Field to detect values for.

    Returns:
        If ``field.stop_words`` is set and the stop-word helper reports a
        detection, the values it produced (or an empty list). Otherwise the
        values produced by the field detectors. Unless the field type is
        multi-value or the field is a choice field, detection stops after the
        first detected value.
    """
    depends_on_fields = list(field.depends_on_fields.all())

    qs_document_field_values = doc.documentfieldvalue_set \
        .filter(removed_by_user=False) \
        .filter(field__in=depends_on_fields)

    field_code_to_value = merge_document_field_values_to_python_value(list(qs_document_field_values))

    # Make sure every dependency has an entry, even when no value is stored
    # for it (the entry is None in that case).
    field_code_to_value = {f.code: field_code_to_value.get(f.code) for f in depends_on_fields}

    if field.stop_words:
        # Missing values are skipped, in line with the detection loop below:
        # str(None) would inject the literal word "None" into the text and
        # could falsely trigger a stop word.
        depends_on_full_text = '\n'.join(
            str(v) for v in field_code_to_value.values() if v is not None)
        detected_with_stop_words, detected_values \
            = detect_with_stop_words_by_field_and_full_text(field, depends_on_full_text)
        if detected_with_stop_words:
            return detected_values or list()

    field_detectors = DocumentFieldDetector.objects.filter(field=field)
    field_type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

    detected_values = list()  # type: List[DetectedFieldValue]

    for depends_on_value in field_code_to_value.values():
        if not depends_on_value:
            continue
        depends_on_value = str(depends_on_value)

        for field_detector in field_detectors:  # type: DocumentFieldDetector
            matching_string = field_detector.matching_string(depends_on_value, text_is_sentence=False)
            if matching_string is not None:
                value = field_detector.get_validated_detected_value(field)
                hint_name = None
                if field_type_adapter.requires_value:
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter \
                        .get_or_extract_value(doc, field, value, hint_name, matching_string)
                    if value is None:
                        # Nothing could be obtained for this detector - try the next one.
                        continue

                detected_values.append(DetectedFieldValue(field, value, None, hint_name))

                # Single-value fields take the first detected value only.
                if not (field_type_adapter.multi_value or field.is_choice_field()):
                    break

        # Leave the outer loop as well once a single-value field got its value.
        if detected_values and not (field_type_adapter.multi_value or field.is_choice_field()):
            break

    return detected_values