def cache_field_values(doc: Document,
                       suggested_field_values: Optional[List[DetectedFieldValue]],
                       save: bool = True) -> Dict[str, Any]:
    """
    Loads DocumentFieldValue objects from DB, merges them to get python field values of their fields for the document,
    converts them to the sortable DB-aware form and saves them to Document.field_values.
    :param doc:
    :param save:
    :param suggested_field_values:
    :return:
    """
    document_type = doc.document_type  # type: DocumentType

    # TODO: get/save field value for specific field
    all_fields = list(document_type.fields.all())

    fields_to_field_values = {f: None for f in all_fields}

    for fv in doc.documentfieldvalue_set.all():
        if fv.removed_by_user:
            continue
        field = fv.field
        field_type = FIELD_TYPES_REGISTRY[fv.field.type]  # type: FieldType
        fields_to_field_values[field] = field_type \
            .merge_multi_python_values(fields_to_field_values.get(field), fv.python_value)

    field_uids_to_field_values_db = {}

    for f in all_fields:  # type: DocumentField
        field_type = FIELD_TYPES_REGISTRY[f.type]  # type: FieldType
        v = fields_to_field_values[f]
        field_uids_to_field_values_db[f.uid] = field_type.merged_python_value_to_db(v)

    if suggested_field_values:
        field_codes_to_suggested_values = \
            merge_detected_field_values_to_python_value(suggested_field_values)  # type: Dict[str, Any]
    else:
        field_codes_to_suggested_values = None

    for f in all_fields:  # type: DocumentField
        field_type = f.get_field_type()  # type: FieldType
        if f.is_detectable():
            suggested_field_uid = Document.get_suggested_field_uid(f.uid)
            if field_codes_to_suggested_values:
                suggested_value_db = field_type.merged_python_value_to_db(
                    field_codes_to_suggested_values.get(f.code))
            else:
                suggested_value_db = doc.field_values.get(suggested_field_uid) if doc.field_values else None

            field_uids_to_field_values_db[suggested_field_uid] = suggested_value_db

    if save:
        doc.field_values = field_uids_to_field_values_db
        doc.save()

    return field_uids_to_field_values_db

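# Hedged usage sketch for cache_field_values() above. Only the call signature comes from
# the function itself; how the document is obtained and the surrounding Django setup are
# assumptions for illustration (module paths are not shown in these snippets).
def recache_document_fields(doc_pk):
    # Load an existing document (assumes the same Document model used above).
    doc = Document.objects.get(pk=doc_pk)

    # Recompute the per-field DB-aware values from the stored DocumentFieldValue rows.
    # Passing suggested_field_values=None keeps previously suggested values untouched.
    field_values_db = cache_field_values(doc, None, save=True)

    # The returned dict maps field uid -> sortable DB-aware value.
    return field_values_db
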
def _fill_system_fields_to_python_values(document: Document,
                                         field_to_python_values: Dict[str, List]):
    field_to_python_values[FIELD_CODE_DOC_ID] = document.id
    field_to_python_values[FIELD_CODE_DOC_NAME] = document.name
    field_to_python_values[FIELD_CODE_DOC_TITLE] = document.title
    field_to_python_values[FIELD_CODE_IS_REVIEWED] = document.is_reviewed()
    field_to_python_values[FIELD_CODE_IS_COMPLETED] = document.is_completed()
    field_to_python_values[FIELD_CODE_DOC_FULL_TEXT] = \
        document.full_text[:settings.RAW_DB_FULL_TEXT_SEARCH_CUT_ABOVE_TEXT_LENGTH] \
        if document.full_text else None
    field_to_python_values[FIELD_CODE_DOC_FULL_TEXT_LENGTH] = \
        len(document.full_text) if document.full_text else 0
    project = document.project
    field_to_python_values[FIELD_CODE_PROJECT_ID] = project.pk if project is not None else None
    field_to_python_values[FIELD_CODE_PROJECT_NAME] = project.name if project is not None else None
    field_to_python_values[FIELD_CODE_ASSIGNEE_ID] = document.assignee_id
    field_to_python_values[FIELD_CODE_ASSIGNEE_NAME] = \
        document.assignee.get_full_name() if document.assignee else None
    field_to_python_values[FIELD_CODE_CREATE_DATE] = document.history.last().history_date
    field_to_python_values[FIELD_CODE_ASSIGN_DATE] = document.assign_date
    field_to_python_values[FIELD_CODE_STATUS_NAME] = document.status.name if document.status else None

def test_detect_field_value(self):
    doc = Document()
    doc.pk = 'A'

    CsvRegexpsFieldDetectionStrategyMock.text_by_doc_id = {
        doc.pk: """
        Collateral: Enigma Corp
        Client ref: "Diane" D.O.O.
        """
    }
    found_entity = CsvRegexpsFieldDetectionStrategyMock.detect_field_value(
        logger, doc, doc_field, {})
    self.assertIsNone(found_entity)

    CsvRegexpsFieldDetectionStrategyMock.text_by_doc_id = {
        doc.pk: """
        Collateral: Family Name (173437)
        Client ref: "Diane" D.O.O.
        """
    }
    found_entity = CsvRegexpsFieldDetectionStrategyMock.detect_field_value(
        logger, doc, doc_field, {})
    self.assertIsNotNone(found_entity)

def load_doc(task: ExtendedTask,
             document: Document,
             document_fields: Dict,
             run_detect_field_values: bool,
             field_owners: dict = None):
    field_owners = field_owners if field_owners else {}
    fields_to_values = LoadDocumentWithFields.load_field_values(task, document, document_fields, field_owners)
    log = CeleryTaskLogger(task)

    with transaction.atomic():
        new_document = document.pk is None
        document.save(force_insert=new_document)
        if not new_document:
            DocumentFieldValue.objects \
                .filter(document=document,
                        removed_by_user=False,
                        created_by__isnull=True,
                        modified_by__isnull=True) \
                .delete()
        for field, values in fields_to_values.items():
            field_detection.save_detected_values(document, field, values)
        if run_detect_field_values:
            field_detection.detect_and_cache_field_values_for_document(log, document, True)
        else:
            dfvs = field_detection.detect_and_cache_field_values_for_document(log, document, False)
            field_value_cache.cache_field_values(document, dfvs, save=True)

    task.log_info('Loaded {0} field values for document #{1} ({2})'
                  .format(len(fields_to_values), document.pk, document.name))

def process(self, **kwargs):
    ant_uids = kwargs.get('ids')
    status_id = kwargs.get('status_id')

    # for preventing "connection already closed"
    TaskUtils.prepare_task_execution()

    ann_status = FieldAnnotationStatus.objects.get(pk=status_id)
    user = User.objects.get(pk=kwargs.get('user_id'))
    true_annotations = FieldAnnotation.objects.filter(uid__in=ant_uids)
    false_annotations = FieldAnnotationFalseMatch.objects.filter(uid__in=ant_uids)

    if ann_status.is_rejected:
        from apps.document.repository.document_field_repository import DocumentFieldRepository
        field_repo = DocumentFieldRepository()
        for ant in true_annotations:
            field_repo.delete_field_annotation_and_update_field_value(ant, user)
    else:
        import apps.document.repository.document_field_repository as dfr
        field_repo = dfr.DocumentFieldRepository()
        field_repo.update_field_annotations_by_ant_ids(
            ant_uids, [(f'{FIELD_CODE_STATUS_ID}', status_id)])
        if false_annotations:
            for false_ant in false_annotations:
                field_repo.restore_field_annotation_and_update_field_value(
                    false_ant, status_id, user)

    ant_docs = set(FieldAnnotation.objects.filter(
        uid__in=ant_uids).values_list('document_id', flat=True))
    false_ant_docs = set(FieldAnnotationFalseMatch.objects.filter(
        uid__in=ant_uids).values_list('document_id', flat=True))
    ant_docs.update(false_ant_docs)
    Document.reset_status_from_annotations(ann_status=ann_status, document_ids=list(ant_docs))

def cache_generic_values(doc: Document, save: bool = True, log: ProcessLogger = None):
    doc.generic_data = get_generic_values(doc)
    if save:
        doc.save(update_fields=['generic_data'])
        events.on_document_change(events.DocumentChangedEvent(log=log,
                                                              document=doc,
                                                              system_fields_changed=False,
                                                              generic_fields_changed=True,
                                                              user_fields_changed=False,
                                                              pre_detected_field_values=None))

def make_documents(cls, doc_ids: List[int]):
    texts = TEST_TEXTS
    docs: List[Document] = []
    text_index = 0
    for id in doc_ids:
        doc = Document()
        doc.pk = id
        doc.project_id = 1
        doc.documenttext = DocumentText()
        doc.documenttext.full_text = texts[text_index]
        text_index += 1
        docs.append(doc)
    return docs

def pdf_by_document(self, document: Document):
    limit = document.account.current_limit
    fname, mimet, data = self.generate(
        document.load_data().dict[:limit],
        document.layout,
        document.background,
        document.get_variables(),
    )
    ftype = fname.split(".")[-1]
    resp = HttpResponse(data, content_type='application/octet-stream')
    resp['Content-Disposition'] = 'attachment; filename="{}.{}"'.format(document.name, ftype)
    return resp

def cache_generic_values(doc: Document,
                         save: bool = True,
                         log: ProcessLogger = None,
                         fire_doc_changed_event: bool = True):
    doc.generic_data = get_generic_values(doc)
    if save:
        doc.save(update_fields=['generic_data'])
    if fire_doc_changed_event:
        signals.fire_document_changed(sender=cache_generic_values,
                                      log=log,
                                      document=doc,
                                      system_fields_changed=False,
                                      generic_fields_changed=True,
                                      user_fields_changed=False,
                                      pre_detected_field_values=None)

def __init__(self, text: str, field_type: str):
    self.document = Document()
    self.field = DocumentField()
    self.field.type = field_type
    self.text_unit = TextUnit()
    self.text_unit.document = self.document
    self.text_unit.textunittext = TextUnitText()
    self.text_unit.textunittext.text = text
    self.text_unit.location_start = 1001
    self.text_unit.location_end = self.text_unit.location_start + len(text)
    self.detector = DocumentFieldDetector()
    self.detector.regexps_pre_process_lower = True
    self.detector.include_regexps = 'at\\s{1,5}least\\s{1,5}(two|2).{1,15}unaffiliated.{1,15}lenders\n' + \
                                    '(two|2).{1,30}lenders.{1,200}(not.{1,50}affiliate|affiliate.{1,100}(one|1|single))'
    self.detector.definition_words = 'required lenders\nrequired revolving lenders\n' + \
                                     'required revolving credit lenders\nrequired term lenders\n' + \
                                     'requisite lenders\nrequisite revolving lenders\n' + \
                                     'required class lenders\nrequired ddtl lenders'
    self.detector.detected_value = 'AFFILIATED'
    self.detector.text_part = TextParts.FULL.value
    self.detector.extraction_hint = ValueExtractionHint.TAKE_FIRST
    self.matcher = DetectorFieldMatcher(self.detector)

def process(self, **kwargs):
    self.log_info('Going to load document with fields...')
    document_name = kwargs.get('document_name')
    project = Project.objects.get(pk=kwargs.get('project_id'))  # type: Project
    run_detect_field_values = bool(kwargs.get('run_detect_field_values'))
    document_fields = kwargs.get('document_fields') or {}  # type: Dict

    if document_fields:
        document = Document(
            name=document_name,
            project=project,
            document_type=project.type,
            metadata={'parsed_by': None}
        )
        LoadDocumentWithFields.load_doc(self, document, document_fields, run_detect_field_values)

    path = kwargs['source_data']
    if path:
        self.log_info('Parse {0} at {1}'.format(path, file_access_handler))
        file_list = file_access_handler.list(path)
        self.log_info("Detected {0} files. Added {0} subtasks.".format(len(file_list)))
        if len(file_list) == 0:
            raise RuntimeError('Wrong file or directory name or directory is empty: {}'
                               .format(path))
        load_docs_args = [(file_path, project.id, run_detect_field_values)
                          for file_path in file_list]
        self.run_sub_tasks('Load Each Document',
                           LoadDocumentWithFields.create_document,
                           load_docs_args,
                           file_list)

def detect_and_cache_field_values_for_document(log: ProcessLogger, document: Document, save: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as
    Document.field_values. These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :return:
    """
    save_cache = save
    save_detected = save
    if save and document.status and not document.status.is_active:
        log.info('Forbidden storing detected field values for document with "completed"'
                 ' status, document #{} ({})'.format(document.id, document.name))
        save_detected = False

    document_type = document.document_type  # type: DocumentType
    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)

    fields_and_deps = [(f.code, f.get_depends_on_codes() or set()) for f in all_fields]
    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    field_values_pre_cached = False

    res = list()
    for field_code in sorted_codes:
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        if not field_values_pre_cached \
                and field_detection_strategy.uses_cached_document_field_values(field):
            # Pre-cache Document.field_values structure for the usage in field detection strategies
            document.field_values = field_value_cache.cache_field_values(document, None, save=False)
            field_values_pre_cached = True

        detected_values = field_detection_strategy.detect_field_values(
            log, document, field)  # type: List[DetectedFieldValue]
        if detected_values:
            res.extend(detected_values)
            if save_detected:
                save_detected_values(document, field, detected_values)

    if save_cache:
        field_value_cache.cache_field_values(document, res, save=True, log=log)

    return res

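# Hedged usage sketch for detect_and_cache_field_values_for_document() above, showing the
# "dry run" pattern also used by load_doc later in these snippets: detect without saving,
# then cache the suggestions explicitly. Logger construction and the helper name below
# are assumptions for illustration.
def preview_detected_values(task, doc):
    log = CeleryTaskLogger(task)  # task logger wrapper as used elsewhere in this code

    # Dry run: detect values but do not store DocumentFieldValue objects (save=False).
    detected = detect_and_cache_field_values_for_document(log, doc, save=False)

    # Cache the suggested values onto Document.field_values explicitly.
    field_value_cache.cache_field_values(doc, detected, save=True)
    return detected
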
def cache_generic_values(doc: Document, save: bool = True):
    document_qs = Document.objects.filter(pk=doc.pk) \
        .annotate(cluster_id=Max('documentcluster'),
                  parties=StringAgg('textunit__partyusage__party__name', delimiter=', ', distinct=True),
                  max_currency_amount=Max('textunit__currencyusage__amount'),
                  max_currency_name=Max('textunit__currencyusage__currency'),
                  min_date=Min('textunit__dateusage__date'),
                  max_date=Max('textunit__dateusage__date'))
    values = document_qs.values('cluster_id', 'parties',
                                'max_currency_amount', 'max_currency_name',
                                'min_date', 'max_date').first()
    doc.generic_data = values
    if save:
        doc.save(update_fields=['generic_data'])

def save_detected_values(document: Document, field: DocumentField, field_type_adapter: FieldType,
                         detected_values: List[DetectedFieldValue], do_not_write: bool):
    if len(detected_values) == 0:
        return 0

    try:
        if field.is_choice_field() and not field_type_adapter.multi_value:
            values_order = field.get_choice_values()
            for choice_value in values_order:
                for dv in detected_values:
                    if choice_value == dv.value:
                        if not do_not_write:
                            field_type_adapter.save_value(document,
                                                          field,
                                                          dv.get_annotation_start(),
                                                          dv.get_annotation_end(),
                                                          dv.get_annotation_text(),
                                                          dv.text_unit,
                                                          dv.value,
                                                          user=None,
                                                          allow_overwriting_user_data=False,
                                                          extraction_hint=dv.hint_name)
                        return 1
        else:
            for dv in detected_values:
                if not do_not_write:
                    field_type_adapter.save_value(document,
                                                  field,
                                                  dv.get_annotation_start(),
                                                  dv.get_annotation_end(),
                                                  dv.get_annotation_text(),
                                                  dv.text_unit,
                                                  dv.value,
                                                  user=None,
                                                  allow_overwriting_user_data=False,
                                                  extraction_hint=dv.hint_name)
            return len(detected_values)
    finally:
        document.cache_field_values()

def load_doc(task: ExtendedTask,
             document: Document,
             field_values_alias_to_value: Dict[str, Any],
             run_detect_field_values: bool,
             field_owners: Dict[str, User] = None):
    field_owners = field_owners if field_owners else {}
    fields_to_values = LoadDocumentWithFields.load_field_values(task, document, field_values_alias_to_value)
    log = CeleryTaskLogger(task)

    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    with transaction.atomic():
        new_document = document.pk is None
        document.save(force_insert=new_document)
        DocumentMetadata.objects.create(document=document, metadata={'parsed_by': None})

        for field, value_dto in fields_to_values.items():
            field_repo.update_field_value_with_dto(document=document,
                                                   field=field,
                                                   field_value_dto=value_dto,
                                                   user=field_owners.get(field.code))

        if run_detect_field_values:
            field_detection.detect_and_cache_field_values_for_document(log=log,
                                                                       document=document,
                                                                       save=True,
                                                                       clear_old_values=False)
        else:
            signals.fire_document_changed(sender=task,
                                          log=log,
                                          document=document,
                                          changed_by_user=None,
                                          document_initial_load=True,
                                          system_fields_changed=True,
                                          generic_fields_changed=True,
                                          user_fields_changed=True)

    task.log_info('Loaded {0} field values for document #{1} ({2}): {3}'
                  .format(len(fields_to_values), document.pk, document.name,
                          ';\n'.join(f'{f}: {dto.field_value}' for f, dto in fields_to_values.items())))

def set_value_from_selection(self, doc: Document, value: str):
    doc.address = value
    g = geocoder.google(doc.address)
    if g.ok:
        doc.address_latitude = g.lat
        doc.address_longitude = g.lng
        doc.address_country = g.country_long
        doc.address_state_province = g.province_long
    elif g.status and 'ZERO' in g.status:
        # Google does not know such address - probably we detected it wrong.
        doc.address_state_province = None
        doc.address_country = None
        doc.address_longitude = None
        doc.address_latitude = None
    else:
        print('Unable to detect address via Google geocoder: {0}'.format(g.status))
    return doc.address

def filter_queryset(self, queryset) -> Any:
    queryset = queryset.filter(document__processed=True, document__delete_pending=False)
    # perm check - use only allowed docs
    user_id = self.extra_kwargs.get('user_id')
    if user_id:
        allowed_document_ids = Document.get_allowed_document_ids(user_id)
        queryset = queryset.filter(document_id__in=allowed_document_ids)
    if not self.project_id:
        return queryset
    project_ids = [self.project_id] if isinstance(self.project_id, (int, str)) else self.project_id or []
    return queryset.filter(document__project_id__in=project_ids)

def filter_queryset(self, queryset) -> Any:
    # TODO: this is copied from TextUnitFeatures, consider to inherit from 2 parents?
    queryset = queryset.filter(document__processed=True, document__delete_pending=False)
    # perm check - use only allowed docs
    user_id = self.extra_kwargs.get('user_id')
    if user_id:
        allowed_document_ids = Document.get_allowed_document_ids(user_id)
        queryset = queryset.filter(document_id__in=allowed_document_ids)
    if not self.project_id:
        return queryset
    project_ids = [self.project_id] if isinstance(self.project_id, (int, str)) else self.project_id or []
    return queryset.filter(document__project_id__in=project_ids)

def create_document(task: ExtendedTask, uri: str, project_id, run_detect_field_values):
    with file_access_handler.get_local_fn(uri) as (fn, file_name):
        task.task.title = 'Load Document: {0}'.format(uri)
        task.log_extra = {'log_document_name': uri}

        with open(fn, encoding='utf-8') as data_file:
            data = json.loads(data_file.read())
            project = Project.objects.get(pk=project_id)
            document_type = project.type
            document = Document(
                name=file_name,
                project=project,
                document_type=document_type,
                metadata={'parsed_by': None}
            )
            LoadDocumentWithFields.load_doc(task, document, data, run_detect_field_values)

def detect_and_cache_field_values(log: ProcessLogger,
                                  doc: Document,
                                  field: DocumentField,
                                  save: bool = True) -> Optional[List[DetectedFieldValue]]:
    strategy = FIELD_DETECTION_STRATEGY_REGISTRY[field.value_detection_strategy] \
        if field.value_detection_strategy else STRATEGY_DISABLED

    if strategy.uses_cached_document_field_values(field):
        # Pre-cache Document.field_values structure for the usage in field detection strategies
        doc.field_values = field_value_cache.cache_field_values(doc, None, save=False)

    detected_values = strategy.detect_field_values(log, doc, field)

    if save:
        save_detected_values(doc, field, detected_values)
        field_value_cache.cache_field_values(doc, detected_values, save=True, log=log)

    return detected_values

def create_document(task: ExtendedTask, uri: str, project_id, run_detect_field_values):
    file_storage = get_file_storage()
    with file_storage.get_document_as_local_fn(uri) as (fn, file_name):
        task.task.title = 'Load Document: {0}'.format(uri)
        task.log_extra = {'log_document_name': uri}

        with open(fn, encoding='utf-8') as data_file:
            data = json.loads(data_file.read())
            project = Project.objects.get(pk=project_id)
            document_type = project.type
            document = Document(
                name=file_name,
                project=project,
                document_type=document_type,
            )
            LoadDocumentWithFields.load_doc(task=task,
                                            document=document,
                                            field_values_alias_to_value=data,
                                            run_detect_field_values=run_detect_field_values)

def _fill_system_fields_to_python_values(document: Document,
                                         field_to_python_values: Dict[str, List]):
    field_to_python_values[_FIELD_CODE_DOC_ID] = [document.id]
    field_to_python_values[_FIELD_CODE_DOC_NAME] = [document.name]
    field_to_python_values[_FIELD_CODE_DOC_TITLE] = [document.title]
    field_to_python_values[_FIELD_CODE_IS_REVIEWED] = [document.is_reviewed()]
    field_to_python_values[_FIELD_CODE_DOC_FULL_TEXT] = \
        [document.full_text[:settings.RAW_DB_FULL_TEXT_SEARCH_CUT_ABOVE_TEXT_LENGTH]
         if document.full_text else None]
    field_to_python_values[_FIELD_CODE_DOC_FULL_TEXT_LENGTH] = [
        len(document.full_text) if document.full_text else 0
    ]
    field_to_python_values[_FIELD_CODE_PROJECT_ID] = [document.project_id]
    field_to_python_values[_FIELD_CODE_ASSIGNEE_NAME] = [
        document.assignee.get_full_name() if document.assignee else None
    ]
    field_to_python_values[_FIELD_CODE_STATUS_NAME] = [
        document.status.name if document.status else None
    ]

def import_document(self, values: Dict[str, Any]):
    doc = Document()
    doc.name = values['name']
    doc.description = values['description']
    doc.source = values['source']
    doc.source_type = values['source_type']
    doc.paragraphs = values['paragraphs']
    doc.sentences = values['sentences']
    doc.title = values['title']
    doc.document_type_id = self.document_types[str(values['document_type_id'])]
    doc.project_id = self.project_ids[values['project_id']]
    doc.status_id = str(values['status_id'])
    doc.language = values['language']
    doc.file_size = values['file_size']
    if not pd.isnull(values['assign_date']):
        doc.assign_date = values['assign_date']
    doc.delete_pending = values['delete_pending'] == 't'
    doc.processed = values['processed'] == 't'
    doc.folder = values['folder']
    doc.document_class = values['document_class']
    doc.fields_dirty = values['fields_dirty']
    if not pd.isnull(values['assignee_id']):
        doc.assignee = self.target_user
    doc.source_path = values['source_path']
    doc.save()
    self.document_ids[values['id']] = doc.pk
    self.document_src_paths[doc.pk] = doc.source_path
    self.initially_loaded_docs.append(doc.pk)

def cache_field_values(doc: Document,
                       suggested_field_values: Optional[List[DetectedFieldValue]],
                       save: bool = True,
                       log: ProcessLogger = None,
                       changed_by_user: User = None,
                       system_fields_changed: bool = False,
                       generic_fields_changed: bool = False,
                       document_initial_load: bool = False) -> Dict[str, Any]:
    """
    Loads DocumentFieldValue objects from DB, merges them to get python field values of their fields for the document,
    converts them to the sortable DB-aware form and saves them to Document.field_values.
    :param doc:
    :param save:
    :param suggested_field_values:
    :param log:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param document_initial_load:
    :return:
    """
    document_type = doc.document_type  # type: DocumentType

    # TODO: get/save field value for specific field
    all_fields = list(document_type.fields.all())
    related_info_field_uids = {f.uid for f in all_fields if f.is_related_info_field()}

    fields_to_field_values = {f: None for f in all_fields}

    for fv in doc.documentfieldvalue_set.all():
        if fv.removed_by_user:
            continue
        field = fv.field
        field_type = fv.field.get_field_type()  # type: FieldType
        fields_to_field_values[field] = field_type \
            .merge_multi_python_values(fields_to_field_values.get(field), fv.python_value)

    field_uids_to_field_values_db = {}

    for f in all_fields:  # type: DocumentField
        field_type = f.get_field_type()  # type: FieldType
        v = fields_to_field_values[f]
        field_uids_to_field_values_db[f.uid] = field_type.merged_python_value_to_db(v)

    if suggested_field_values:
        field_codes_to_suggested_values = \
            merge_detected_field_values_to_python_value(suggested_field_values)  # type: Dict[str, Any]
    else:
        field_codes_to_suggested_values = None

    for f in all_fields:  # type: DocumentField
        field_type = f.get_field_type()  # type: FieldType
        if f.is_detectable():
            suggested_field_uid = Document.get_suggested_field_uid(f.uid)
            if field_codes_to_suggested_values:
                suggested_value_db = field_type.merged_python_value_to_db(
                    field_codes_to_suggested_values.get(f.code))
            else:
                suggested_value_db = field_uids_to_field_values_db.get(suggested_field_uid)

            # suggested_value_db can be list, None or int, Iterable validation should be here
            if isinstance(suggested_value_db, Iterable) and f.is_related_info_field():
                suggested_value_db = len(suggested_value_db)

            field_uids_to_field_values_db[suggested_field_uid] = suggested_value_db

    if save:
        signals.fire_document_changed(sender=cache_field_values,
                                      changed_by_user=changed_by_user,
                                      log=log,
                                      document=doc,
                                      system_fields_changed=system_fields_changed,
                                      generic_fields_changed=generic_fields_changed,
                                      user_fields_changed=True,
                                      pre_detected_field_values=field_codes_to_suggested_values,
                                      document_initial_load=document_initial_load)

    return field_uids_to_field_values_db

def set_value_from_selection(self, doc: Document, value: str):
    if self.field_type == FieldType.FIELD_TYPE_CONCRETE_STRING:
        doc.__setattr__(self.field, value)
    elif self.field_type == FieldType.FIELD_TYPE_CONCRETE_FLOAT:
        try:
            doc.__setattr__(self.field, float(value))
        except ValueError:
            nums = list(extractors.find_numbers(value)) if value else None
            doc.__setattr__(self.field, nums[0] if nums else None)
    elif self.field_type == FieldType.FIELD_TYPE_CONCRETE_INTEGER:
        try:
            doc.__setattr__(self.field, int(value))
        except ValueError:
            nums = list(extractors.find_numbers(value)) if value else None
            doc.__setattr__(self.field, nums[0] if nums else None)
    elif self.field_type == FieldType.FIELD_TYPE_CONCRETE_DATE:
        d = dateparser.parse(value) if value else None
        if d:
            doc.__setattr__(self.field, d)
        else:
            dates = list(get_dates(value)) if value else None
            doc.__setattr__(self.field, dates[0] if dates else None)
    return doc.__getattribute__(self.field)

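# Hedged sketch of how the type-dispatching set_value_from_selection() above could be driven.
# The mapper object, its attribute values and the example field name are illustrative only;
# the dispatch and fallback behavior come from the method itself.
from types import SimpleNamespace

date_mapper = SimpleNamespace(field='closing_date',
                              field_type=FieldType.FIELD_TYPE_CONCRETE_DATE)

# doc = Document()
# set_value_from_selection(date_mapper, doc, 'executed on January 3, 2019')
# dateparser.parse handles the phrase; if it returned None, get_dates() would be tried
# and the first extracted date (if any) would be stored on doc.closing_date.
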
def setup_document(self) -> Document:
    doc = Document()
    return doc

def cache_field_values(doc: Document,
                       suggested_field_values: Optional[List[DetectedFieldValue]],
                       save: bool = True,
                       log: ProcessLogger = None) -> Dict[str, Any]:
    """
    Loads DocumentFieldValue objects from DB, merges them to get python field values of their fields for the document,
    converts them to the sortable DB-aware form and saves them to Document.field_values.
    :param doc:
    :param save:
    :param suggested_field_values:
    :param log:
    :return:
    """
    document_type = doc.document_type  # type: DocumentType

    # TODO: get/save field value for specific field
    all_fields = list(document_type.fields.all())
    related_info_field_uids = {f.uid for f in all_fields if f.is_related_info_field()}

    fields_to_field_values = {f: None for f in all_fields}

    for fv in doc.documentfieldvalue_set.all():
        if fv.removed_by_user:
            continue
        field = fv.field
        field_type = FIELD_TYPES_REGISTRY[fv.field.type]  # type: FieldType
        fields_to_field_values[field] = field_type \
            .merge_multi_python_values(fields_to_field_values.get(field), fv.python_value)

    field_uids_to_field_values_db = {}

    for f in all_fields:  # type: DocumentField
        field_type = FIELD_TYPES_REGISTRY[f.type]  # type: FieldType
        v = fields_to_field_values[f]
        field_uids_to_field_values_db[f.uid] = field_type.merged_python_value_to_db(v)

    if suggested_field_values:
        field_codes_to_suggested_values = \
            merge_detected_field_values_to_python_value(suggested_field_values)  # type: Dict[str, Any]
    else:
        field_codes_to_suggested_values = None

    for f in all_fields:  # type: DocumentField
        field_type = f.get_field_type()  # type: FieldType
        if f.is_detectable():
            suggested_field_uid = Document.get_suggested_field_uid(f.uid)
            if field_codes_to_suggested_values:
                suggested_value_db = field_type.merged_python_value_to_db(
                    field_codes_to_suggested_values.get(f.code))
            else:
                suggested_value_db = doc.field_values.get(suggested_field_uid) if doc.field_values else None

            field_uids_to_field_values_db[suggested_field_uid] = suggested_value_db

    if save:
        doc.field_values = {
            uid: len(value) if uid in related_info_field_uids and value is not None else value
            for uid, value in field_uids_to_field_values_db.items()
        }
        doc.save()
        events.on_document_change(
            events.DocumentChangedEvent(log=log,
                                        document=doc,
                                        system_fields_changed=False,
                                        generic_fields_changed=False,
                                        user_fields_changed=True,
                                        pre_detected_field_values=field_codes_to_suggested_values))

    return field_uids_to_field_values_db

def set_value_from_selection(self, doc: Document, value: str):
    doc.address = value
    return doc.address

def get_value(self, doc: Document):
    return doc.__getattribute__(self.field)

def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as
    Document.field_values. These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user:
    :param system_fields_changed:
    :param generic_fields_changed:
    :param document_initial_load:
    :param ignore_field_codes:
    :return:
    """
    save_cache = save
    save_detected = save
    if save and document.status and not document.status.is_active:
        log.info('Forbidden storing detected field values for document with "completed"'
                 ' status, document #{} ({})'.format(document.id, document.name))
        save_detected = False

    document_type = document.document_type  # type: DocumentType
    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)

    fields_and_deps = [(f.code, f.get_depends_on_codes() or set()) for f in all_fields]
    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    field_values_pre_cached = False

    res = list()
    for field_code in sorted_codes:
        if ignore_field_codes and field_code in ignore_field_codes:
            continue

        field = all_fields_code_to_field[field_code]  # type: DocumentField
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy

        if not field_values_pre_cached \
                and field_detection_strategy.uses_cached_document_field_values(field):
            # Pre-cache Document.field_values structure for the usage in field detection strategies
            document.field_values = field_value_cache.cache_field_values(document, None, save=False)
            field_values_pre_cached = True

        try:
            detected_values = field_detection_strategy.detect_field_values(
                log, document, field)  # type: List[DetectedFieldValue]
        except Exception as e:
            msg = '''Unable to detect field value.
Document type: {0}
Document: {1}
Field: {2}'''.format(document_type.code, document.pk, field.code)
            log.error(render_error(msg, e))
            raise e

        if save_detected and clear_old_values:
            # Delete previously detected values
            # to avoid accumulating garbage on each iteration.
            DocumentFieldValue.objects \
                .filter(document=document,
                        field=field,
                        removed_by_user=False,
                        created_by__isnull=True,
                        modified_by__isnull=True) \
                .delete()

        if detected_values:
            res.extend(detected_values)
            if save_detected:
                save_detected_values(document, field, detected_values)

    if save_cache:
        field_value_cache.cache_field_values(document,
                                             suggested_field_values=res,
                                             save=True,
                                             log=log,
                                             changed_by_user=changed_by_user,
                                             system_fields_changed=system_fields_changed,
                                             generic_fields_changed=generic_fields_changed,
                                             document_initial_load=document_initial_load)

    return res

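# Hedged usage sketch for the extended detect_and_cache_field_values_for_document() variant
# above. The wrapper name and how the logger/document are obtained are assumptions; the
# keyword arguments are exactly those defined in the signature.
def redetect_except_manual_fields(log: ProcessLogger, doc: Document, skip_codes: Set[str]):
    # Re-run detection, dropping previously auto-detected values first
    # (clear_old_values=True) and leaving the listed field codes untouched.
    return detect_and_cache_field_values_for_document(log,
                                                      doc,
                                                      save=True,
                                                      clear_old_values=True,
                                                      ignore_field_codes=skip_codes)
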