def process(self, **kwargs):
    """Spawn sub-tasks checking FieldValue and FieldAnnotation records of one document field.

    kwargs:
        document_field: dict with the field's 'pk', or None to check all fields;
        delete_broken: passed through to sub-tasks to delete invalid records.
    """
    field_arg = kwargs.get('document_field')
    field_id = field_arg['pk'] if field_arg else None
    delete_broken = kwargs.get('delete_broken')

    import apps.document.repository.document_field_repository as dfr
    repo = dfr.DocumentFieldRepository()

    # (id queryset, record label, sub-task method) for both record kinds
    check_specs = [
        (repo.get_field_value_ids_by_doc_field(field_id), 'FieldValue', self.check_field_values),
        (repo.get_fieldant_ids_by_doc_field(field_id), 'FieldAnnotation', self.check_annotations),
    ]
    for qs_ids, label, sub_task in check_specs:
        total = qs_ids.count()
        # fan out in batches of 100 ids per sub-task
        for batch in chunks(qs_ids.values_list('pk', flat=True), 100):
            pk_list = list(batch)
            self.run_sub_tasks(f'Check {label}s', sub_task, [(pk_list, delete_broken)])
            self.log_info(f'Sub-tasks started for {len(pk_list)} {label}s of total {total}')
def process(self, **kwargs):
    """Apply a new status to the given field annotations and refresh their documents.

    Rejected status deletes the "true" annotations (re-deriving the field values);
    any other status is written onto the annotations and previously rejected
    ("false match") annotations are restored.
    """
    uids = kwargs.get('ids')
    status_id = kwargs.get('status_id')
    # re-initialize DB connections to prevent "connection already closed"
    TaskUtils.prepare_task_execution()
    status = FieldAnnotationStatus.objects.get(pk=status_id)
    user = User.objects.get(pk=kwargs.get('user_id'))

    from apps.document.repository.document_field_repository import DocumentFieldRepository
    repo = DocumentFieldRepository()

    if status.is_rejected:
        # rejecting: remove each true annotation and update its field value
        for annotation in FieldAnnotation.objects.filter(uid__in=uids):
            repo.delete_field_annotation_and_update_field_value(annotation, user)
    else:
        repo.update_field_annotations_by_ant_ids(
            uids, [(f'{FIELD_CODE_STATUS_ID}', status_id)])
        # restore any annotations that had been marked as false matches
        for false_match in FieldAnnotationFalseMatch.objects.filter(uid__in=uids):
            repo.restore_field_annotation_and_update_field_value(false_match, status_id, user)

    # documents touched via either true or false-match annotations
    affected_doc_ids = set(FieldAnnotation.objects.filter(
        uid__in=uids).values_list('document_id', flat=True))
    affected_doc_ids |= set(FieldAnnotationFalseMatch.objects.filter(
        uid__in=uids).values_list('document_id', flat=True))
    Document.reset_status_from_annotations(ann_status=status,
                                           document_ids=list(affected_doc_ids))
def _validate_critical_properties_changed(self, context: dict) -> None:
    """Raise ValidationError when the edit changes the field's document type or value type.

    Does nothing when there is no previously saved field or nothing critical changed.
    The error message reports how many existing values would be invalidated.
    """
    import apps.document.repository.document_field_repository as dfr
    repo = dfr.DocumentFieldRepository()
    saved_field = self._get_saved_field(context)
    if not saved_field:
        return

    old_doc_type_pk = self.to_str_if_uuid(self._get_document_type_pk(saved_field))
    new_doc_type_pk = self.to_str_if_uuid(self.document_type_pk)
    old_type = saved_field.type
    new_type = self.object.type

    problems = []
    if old_doc_type_pk != new_doc_type_pk:
        problems.append(f'Document type has changed, old document type id is #{old_doc_type_pk}'
                        f', new document type id is #{self.document_type_pk}. ')
    if old_type != new_type:
        problems.append(f'Field type has changed, old field type is '
                        f'"{self._get_field_type_title(old_type)}", '
                        f'new field type is "{self._get_field_type_title(new_type)}". ')
    if not problems:
        return

    err_msg = f'Unable to update field #{self.pk} "{self.object.code}". ' + ''.join(problems)
    values_count = repo.get_count_by_field(self.object.pk)
    user_values_count = 0
    detected_values_count = 0
    if values_count > 0:
        user_values_count = repo.get_doc_field_values_filtered_count(self.object.pk)
        detected_values_count = self._get_detected_values_count(values_count, user_values_count)
    err_msg += ('Existing document field values become invalid and will be removed. '
                f'User entered values {user_values_count}, '
                f'automatically detected values {detected_values_count}. '
                'You need to set force auto-fixes option to continue '
                '(this option will remove all values for this field) or make manual updates.')
    raise ValidationError(err_msg)
def check_field_values(task: ExtendedTask, field_value_ids: List[int], delete_broken: bool = False):
    """Validate stored FieldValue JSON against each field's typed definition.

    Every value whose JSON does not pass the type check is handed to
    FindBrokenDocumentFieldValues.process_broken (which may delete it
    when delete_broken is set).
    """
    import apps.document.repository.document_field_repository as dfr
    repo = dfr.DocumentFieldRepository()
    for field_value in repo.get_field_values_by_ids(field_value_ids):
        doc_field = field_value.field  # type: DocumentField
        if not TypedField.by(doc_field).is_json_field_value_ok(field_value.value):
            FindBrokenDocumentFieldValues.process_broken(task, field_value, delete_broken)
def delete_document_history_by_ids(self, ids: List[int]) -> None:
    """Delete history records related to the given documents, then purge
    parties and clusters left without any references.

    :param ids: document primary keys whose history records are removed
    """
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()
    field_repo.delete_documents_history_values(ids)
    # BUG FIX: the original built this queryset but never called .delete(),
    # so TextUnitNote history was silently kept while all sibling histories
    # (DocumentNote, Document) were purged
    TextUnitNote.history.filter(text_unit__document_id__in=ids).delete()
    DocumentNote.history.filter(document_id__in=ids).delete()
    Document.history.filter(id__in=ids).delete()
    # delete Party and Cluster records that no longer reference anything
    Party.objects.filter(partyusage__isnull=True).delete()
    DocumentCluster.objects.filter(documents__isnull=True).delete()
    TextUnitCluster.objects.filter(text_units__isnull=True).delete()
def load_doc(task: ExtendedTask,
             document: Document,
             field_values_alias_to_value: Dict[str, Any],
             run_detect_field_values: bool,
             field_owners: Dict[str, User] = None):
    """Persist a document together with its pre-parsed field values.

    Saves the document (inserting when it has no pk yet), stores each provided
    field value attributed to its owner, then either runs field detection or
    fires the document-changed signal. All DB writes happen in one transaction.
    """
    owners = field_owners if field_owners else {}
    fields_to_values = LoadDocumentWithFields.load_field_values(task, document, field_values_alias_to_value)
    log = CeleryTaskLogger(task)
    import apps.document.repository.document_field_repository as dfr
    repo = dfr.DocumentFieldRepository()
    with transaction.atomic():
        is_new = document.pk is None
        document.save(force_insert=is_new)
        DocumentMetadata.objects.create(document=document, metadata={'parsed_by': None})
        for doc_field, dto in fields_to_values.items():
            repo.update_field_value_with_dto(document=document,
                                             field=doc_field,
                                             field_value_dto=dto,
                                             user=owners.get(doc_field.code))
        if run_detect_field_values:
            field_detection.detect_and_cache_field_values_for_document(log=log,
                                                                       document=document,
                                                                       save=True,
                                                                       clear_old_values=False)
        else:
            signals.fire_document_changed(sender=task,
                                          log=log,
                                          document=document,
                                          changed_by_user=None,
                                          document_initial_load=True,
                                          system_fields_changed=True,
                                          generic_fields_changed=True,
                                          user_fields_changed=True)
    task.log_info('Loaded {0} field values for document #{1} ({2}): {3}'
                  .format(len(fields_to_values), document.pk, document.name,
                          ';\n'.join(f'{f}: {dto.field_value}' for f, dto in fields_to_values.items())))
def detect_and_cache_field_values_for_document(log: ProcessLogger,
                                               document: Document,
                                               save: bool = True,
                                               clear_old_values: bool = True,
                                               changed_by_user: User = None,
                                               system_fields_changed: bool = False,
                                               generic_fields_changed: bool = False,
                                               document_initial_load: bool = False,
                                               ignore_field_codes: Set[str] = None,
                                               updated_field_codes: List[str] = None,
                                               skip_modified_values: bool = True):
    """
    Detects field values for a document and stores their DocumentFieldValue objects as well as Document.field_value.
    These two should always be consistent.
    :param log:
    :param document:
    :param save:
    :param clear_old_values:
    :param changed_by_user
    :param system_fields_changed
    :param generic_fields_changed
    :param ignore_field_codes
    :param document_initial_load
    :param updated_field_codes - if set, we search for changed and dependent fields only
    :param skip_modified_values - don't overwrite field values overwritten by user
    :return:
    """
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    # detection is forbidden on documents whose workflow status is completed (inactive)
    if save and document.status and not document.status.is_active:
        raise RuntimeError(f'Detecting field values for completed documents is not permitted.\n'
                           f'Document: {document.name} (#{document.pk})')

    document_type = document.document_type  # type: DocumentType
    all_fields = document_type.fields \
        .all() \
        .prefetch_related(Prefetch('depends_on_fields', queryset=DocumentField.objects.only('uid').all()))

    all_fields = list(all_fields)
    # (field code, codes it depends on) pairs feed both dependency expansion and ordering
    fields_and_deps = [(f.code, set(f.get_depends_on_codes()) or set()) for f in all_fields]
    # None means "recalculate everything"; otherwise only fields affected by the update
    dependent_fields = get_dependent_fields(fields_and_deps, set(updated_field_codes)) \
        if updated_field_codes else None
    # detection order respects dependencies: a field is detected after the fields it depends on
    sorted_codes = order_field_detection(fields_and_deps)
    all_fields_code_to_field = {f.code: f for f in all_fields}  # type: Dict[str, DocumentField]

    log.info(f'Detecting field values for document {document.name} (#{document.pk}), save={save}.\n'
             f'Updated fields: {updated_field_codes or "All"}.\n'
             f'Dependent fields to be detected: {dependent_fields or "All"}.\n'
             f'Ignored fields: {ignore_field_codes}.')
    if updated_field_codes:
        # narrow detection to updated+dependent fields, minus the ignored ones
        sorted_codes = [c for c in sorted_codes
                        if c in dependent_fields and (not ignore_field_codes or c not in ignore_field_codes)]
    elif ignore_field_codes:
        sorted_codes = [c for c in sorted_codes if c not in ignore_field_codes]

    current_field_values = {f.code: None for f in all_fields}
    # we may get values for fields required for sorted_codes, regarding
    # further dependencies
    # or we may just get all fields' values (field_codes_only=None)
    actual_field_values = field_repo.get_field_code_to_python_value(document_type_id=document_type.pk,
                                                                   doc_id=document.pk,
                                                                   field_codes_only=None)
    current_field_values.update(actual_field_values)

    res = list()

    detecting_field_status = []  # type:List[str]
    detection_errors = []  # type:List[Tuple[str, str, Exception, Any]]

    # do not touch field values modified by user
    skip_codes = set()
    if skip_modified_values:
        skip_codes = set(list(FieldValue.objects.filter(
            modified_by__isnull=False, document_id=document.pk).values_list('field__code', flat=True)))
        if updated_field_codes:
            # these fields have to be deleted despite being set by user
            # updated_field_ids = DocumentField.objects.filter(code__in=updated_field_codes).values_list('pk', flat=True)
            skip_codes -= set(updated_field_codes)

    if clear_old_values:
        field_repo.delete_document_field_values(document.pk,
                                                list(skip_codes),
                                                updated_field_codes)

    for field_code in sorted_codes:
        if field_code in skip_codes:
            continue
        field = all_fields_code_to_field[field_code]  # type: DocumentField
        typed_field = TypedField.by(field)  # type: TypedField
        # strategy is selected per-field via its configured value_detection_strategy
        field_detection_strategy = FIELD_DETECTION_STRATEGY_REGISTRY[
            field.value_detection_strategy]  # type: FieldDetectionStrategy
        try:
            new_field_value_dto = field_detection_strategy.detect_field_value(log=log,
                                                                              doc=document,
                                                                              field=field,
                                                                              field_code_to_value=current_field_values)
            if not new_field_value_dto:
                detecting_field_status.append(f"No new value's gotten for '{field.code}'")
                continue
            if is_unit_limit_exceeded(new_field_value_dto, field, document):
                continue
            detecting_field_status.append(
                f"{format_value_short_str(new_field_value_dto.field_value)} for '{field.code}'")
            # now merge the detection results with the current DB state
            if save:
                # user = None here to store detected values as owned by system allowing further overwriting
                field_value, annotations = field_repo.update_field_value_with_dto(document=document,
                                                                                 field=field,
                                                                                 field_value_dto=new_field_value_dto,
                                                                                 user=None)
                # and update the field value of this field which may be used for detection of fields depending on it
                current_field_values[field.code] = typed_field.field_value_json_to_python(field_value.value)
            # If save is not requested then do not update current_field_values.
            # Most likely in this case we detect only few requested fields and trying to comply the dependency
            # tree makes no big sense.
        except Exception as e:
            # errors are collected and re-raised together after the loop so one failing
            # field does not abort detection of the remaining fields
            # Additionally logging here because the further compound exception will not contain the full stack trace.
            log.error(f'Exception caught while detecting value of field {field.code} ({typed_field.type_code})',
                      exc_info=e)
            detection_errors.append((field.code, typed_field.type_code, e, sys.exc_info()))

    if save:
        if updated_field_codes:
            user_fields_changed_set = set(updated_field_codes)
            if dependent_fields:
                user_fields_changed_set.update(dependent_fields)
            user_fields_changed = list(user_fields_changed_set)  # type: FieldSpec
        else:
            user_fields_changed = True
        fire_document_changed(sender=detect_and_cache_field_values_for_document,
                              log=log,
                              document=document,
                              changed_by_user=changed_by_user,
                              document_initial_load=document_initial_load,
                              system_fields_changed=system_fields_changed,
                              generic_fields_changed=generic_fields_changed,
                              user_fields_changed=user_fields_changed)
        if dependent_fields:
            msg = f'Recalculating dependent fields for {document.name}: '  # dependent_fields
            msg += ', '.join(dependent_fields)
            msg += '.\n\nSource fields data: \n'
            msg += '; '.join([f'"{k}": "{format_value_short_str(current_field_values[k])}"'
                              for k in current_field_values])
            msg += '.\n\nCalculation results:\n'
            msg += '\n'.join(detecting_field_status)
            log.info(msg)

    if detection_errors:
        fields_str = ', '.join([f'{e[0]} ({e[1]})' for e in detection_errors])
        msg = f'There were errors while detecting fields:\n{fields_str}\n' + \
              f'for document {document.name} (#{document.pk}, type {document_type.code})\n'
        for f_code, f_type, ex, ex_stack in detection_errors:
            msg += f'\n{f_code}, {f_type}: {ex}'
        raise FieldDetectionError(msg)

    # NOTE(review): res is never appended to anywhere above — callers always
    # receive an empty list; confirm whether results were meant to be collected
    return res
def get_field_values_dump() -> str:
    """Serialize all annotated field values into a JSON string for external transfer."""
    import apps.document.repository.document_field_repository as dfr
    repo = dfr.DocumentFieldRepository()
    rows = repo.get_annotated_values_for_dump()
    transfer_objects = [ExternalFieldValue(**row) for row in rows]
    return core_serializers.serialize('json', transfer_objects)
def process(self, **kwargs):
    """Find documents whose feature vectors are similar above a configured
    threshold and store the mutual document links into the destination field.

    kwargs:
        field: dict with the destination DocumentField's 'pk';
        project: optional dict with the project's 'pk' to narrow the document set.
    :raises RuntimeError: when the destination field cannot be found.
    """
    dst_field = kwargs['field']
    dst_field = DocumentField.objects.filter(pk=dst_field['pk']) \
        .prefetch_related('depends_on_fields') \
        .select_related(DST_FIELD_SIMILARITY_CONFIG_ATTR) \
        .first()  # type: DocumentField
    if not dst_field:
        raise RuntimeError('Document field not found: {0}'.format(kwargs['field']))
    proj = kwargs['project']
    proj_id = proj['pk'] if proj else None  # type:Optional[int]

    doc_query = Document.objects.filter(document_type=dst_field.document_type, project_id=proj_id) if proj_id \
        else Document.objects.filter(document_type=dst_field.document_type)

    config = getattr(dst_field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: DocumentSimilarityConfig
    config.self_validate()

    similarity_threshold = config.similarity_threshold
    # feature vectors are built from the values of the fields this field depends on
    feature_vector_fields = list(dst_field.depends_on_fields.all())
    feature_vector_field_codes = [f.code for f in feature_vector_fields]

    self.log_info('{field}: Min similarity: {threshold}'
                  .format(field=dst_field.code, threshold=similarity_threshold))

    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    qr_doc_ids = doc_query.values_list('pk', flat=True)
    doc_ids_to_code_to_value = field_repo \
        .get_field_code_to_python_value_multiple_docs(document_type_id=dst_field.document_type_id,
                                                      doc_ids=qr_doc_ids,
                                                      field_codes_only=feature_vector_field_codes)

    field_values_list = list()
    for doc_id, values in doc_ids_to_code_to_value:
        values[FIELD_CODE_DOC_ID] = doc_id
        field_values_list.append(values)

    total_docs = len(field_values_list)
    self.set_push_steps(int(5 + total_docs / 100))
    self.push()
    self.log_info(
        '{field}: Building feature vectors for {n} documents'.format(field=dst_field.code, n=total_docs))

    vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)
    feature_vectors = vectorizer.fit_transform(field_values_list)
    self.push()
    self.log_info('{field}: Finding similar documents (similarity >= {threshold})'
                  .format(field=dst_field.code, threshold=similarity_threshold))

    doc_ids_to_values = defaultdict(set)
    for x, doc_a_field_values in enumerate(field_values_list):
        doc_a_pk = doc_a_field_values[FIELD_CODE_DOC_ID]
        # cosine similarity of document x against all documents at once
        similarities = cosine_similarity(feature_vectors[x], feature_vectors)
        for y, doc_b_field_values in enumerate(field_values_list):
            doc_b_pk = doc_b_field_values[FIELD_CODE_DOC_ID]
            if doc_a_pk == doc_b_pk:
                continue
            similarity = similarities[0, y]
            if similarity < similarity_threshold:
                continue
            # similarity links are symmetric — record both directions
            doc_ids_to_values[doc_a_pk].add(doc_b_pk)
            doc_ids_to_values[doc_b_pk].add(doc_a_pk)
        if x % 100 == 0:
            self.log_info('{field}: Checked for similarity {x} documents of {n}'
                          .format(field=dst_field.code, x=x + 1, n=total_docs))
            self.push()

    self.push()
    self.log_info('{field}: Found {n} similar documents. Storing links into the document fields.'
                  .format(field=dst_field.code, n=len(doc_ids_to_values)))

    # BUG FIX: the original iterated the dict directly ("for doc_id, v in doc_ids_to_values"),
    # which yields only keys (ints) and raises TypeError on unpacking; use .items()
    doc_ids_to_values = {doc_id: list(v) if v else None for doc_id, v in doc_ids_to_values.items()}
    field_repo.store_values_one_field_many_docs_no_ants(field=dst_field, doc_ids_to_values=doc_ids_to_values)

    log = CeleryTaskLogger(self)
    for doc_id in doc_ids_to_values.keys():
        try:
            doc = Document.objects.get(pk=doc_id)
            signals.fire_document_changed(log=log,
                                          document=doc,
                                          changed_by_user=None,
                                          system_fields_changed=False,
                                          generic_fields_changed=False,
                                          user_fields_changed=[dst_field.code])
        except Exception as ex:
            self.log_error(f'Unable to fire doc id change event for doc #{doc_id}', exc_info=ex)
def process(self,
            document_type: DocumentType = None,
            project_ids=None,
            document_name: str = None,
            do_not_run_for_modified_documents=True,
            do_not_write=False,
            **kwargs):
    """Detect field values for documents selected by type / project / name / id,
    fanning out one sub-task per document.

    :param document_type: DocumentType instance or dict with 'pk'; None means all types
    :param project_ids: optional list of project ids to restrict the document set
    :param document_name: optional exact document name filter
    :param do_not_run_for_modified_documents: skip documents whose values were changed by users
    :param do_not_write: detect without persisting results
    kwargs may carry 'document_id' to run for a single document only.
    """
    # BUG FIX: the original default was "project_ids=list" — the builtin *type* list,
    # which is truthy, so calling without project_ids filtered by
    # project_id__in=<class 'list'> and crashed; None is the correct "not given" default
    self.log_info(
        "Going to detect document field values based on "
        "the pre-coded regexps and field values entered by users...")

    if isinstance(document_type, dict):
        document_type = DocumentType.objects.get(pk=document_type['pk'])

    # reindex document grid fields cache after detecting fields
    from apps.rawdb.tasks import auto_reindex_not_tracked
    doc_type_code = document_type.code \
        if document_type and hasattr(document_type, 'code') else None
    call_task_func(auto_reindex_not_tracked, (doc_type_code, ),
                   None, queue=settings.CELERY_QUEUE_SERIAL,
                   run_after_sub_tasks_finished=True,
                   main_task_id=self.request.id)

    document_id = kwargs.get('document_id')
    if document_id:
        # single-document mode: one sub-task, then done
        self.set_push_steps(1)
        dcptrs = DocDetectFieldValuesParams(document_id, False, True)
        self.run_sub_tasks(
            'Detect Field Values For Single Document',
            DetectFieldValues.detect_field_values_for_document,
            [(dcptrs, )])
        self.push()
        return

    task_count = 0
    document_types = [document_type] if document_type else DocumentType.objects.all()
    document_type_pks = []
    for document_type in document_types:
        if document_type.pk and document_type.fields.exists():
            document_type_pks.append(document_type.pk)
        else:
            self.log_info(
                'Can not find any fields assigned to document type: {0}'.format(document_type))

    detect_field_values_for_document_args = []
    source_data = []

    qs = Document.objects.filter(status__is_active=True)
    if document_name:
        qs = qs.filter(name=document_name)
    elif document_id:
        # unreachable: a document_id triggers the early return above; kept for safety
        qs = qs.filter(pk=document_id)
    elif project_ids:
        qs = qs.filter(project_id__in=project_ids)
    elif document_type_pks:
        qs = qs.filter(document_type_id__in=document_type_pks)

    # filter out modified documents
    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()
    if do_not_run_for_modified_documents:
        modified_document_ids = field_repo.get_removed_fieldvals_doc_ids()
        qs = qs.exclude(pk__in=Subquery(modified_document_ids))

    for doc_id, source, name in qs.values_list('id', 'source', 'name'):
        dcptrs = DocDetectFieldValuesParams(doc_id, do_not_write, True)
        detect_field_values_for_document_args.append((dcptrs, ))
        if source:
            source_data.append('{0}/{1}'.format(source, name))
        else:
            source_data.append(name)
        task_count += 1

    self.run_sub_tasks('Detect Field Values For Each Document',
                       DetectFieldValues.detect_field_values_for_document,
                       detect_field_values_for_document_args,
                       source_data)
    if task_count > 0:
        self.log_info('Found {0} documents'.format(task_count))
    else:
        self.log_info('No documents found')
def test_field_detector_model(task: ExtendedTask, field_id, document_id) -> dict:
    """Run field detection for one (document, field) pair and compare the detected
    value against the value currently stored, returning a small report dict.

    Returns: {'text_units_number': int, 'value_matches_expected': bool,
              'actual_field_value': value or None (only filled for choice fields)}.
    """
    document = Document.objects.get(pk=document_id)  # type: Document
    field = DocumentField.objects.get(pk=field_id)  # type: DocumentField
    typed_field = TypedField.by(field)

    # freshly detected value to compare against the stored one
    expected_field_value_dto = field_detection.detect_field_value(
        CeleryTaskLogger(task), document, field)  # type: FieldValueDTO

    import apps.document.repository.document_field_repository as dfr
    field_repo = dfr.DocumentFieldRepository()

    if typed_field.requires_value:
        # dates, numbers, e.t.c. — compared by the python value itself
        actual_field_value_dict = field_repo \
            .get_field_code_to_python_value(document_type_id=document.document_type_id,
                                            doc_id=document_id,
                                            field_codes_only={field.code})
        actual_field_value = actual_field_value_dict.get(field.code) if actual_field_value_dict else None
        expected_field_value = expected_field_value_dto.field_value if expected_field_value_dto else None
        matches = bool(expected_field_value == actual_field_value)
    else:
        expected_set = set()
        # related-info e.t.c. - comparing by annotations - exact comparing
        if expected_field_value_dto.annotations:
            for ant_dto in expected_field_value_dto.annotations:
                # map each detected annotation span to the text unit containing it
                text_unit_id = field_repo.find_text_unit_id_by_location(
                    document, field, ant_dto.location_in_doc_start, ant_dto.location_in_doc_end)
                if not text_unit_id:
                    continue
                expected_set.add('text_unit_' + str(text_unit_id))
        # NOTE(review): the DTO variable is rebound to a plain string here and later
        # iterated as if it held annotation objects (see found_in_text below) — verify
        expected_field_value_dto = '; '.join(sorted(expected_set))
        # NOTE(review): 'doc_id' as a filter kwarg on FieldAnnotation looks unusual
        # (siblings filter by document_id) — confirm against the model definition
        actual_dfvs = FieldAnnotation.objects.filter(
            document_type_id=document.document_type_id, doc_id=document_id, field_id=field.pk)
        actual_set = {'text_unit_' + str(dfv.text_unit.id) for dfv in actual_dfvs if dfv.text_unit}
        actual_field_value = '; '.join(sorted(actual_set))
        matches = bool(expected_set == actual_set)

    if not matches:
        # NOTE(review): iterating expected_field_value_dto only makes sense when it still
        # holds annotation-like objects; after the rebinding above it is a string — verify
        found_in_text = [dfv.text_unit.text for dfv in expected_field_value_dto
                         if dfv.text_unit and dfv.text_unit.text] if expected_field_value_dto else []
        found_in_text_msg = ''
        if found_in_text:
            found_in_text_msg = '\nDetected in text:\n-----\n{0}\n-----'.format('\n---\n'.join(found_in_text))
        # NOTE(review): the '[ OK ]' branch of this ternary is dead while the log call
        # sits inside "if not matches" — possibly the log was meant to run unconditionally
        task.log_info('{3} Test doc: {0} (Doc id: {6}, Project: {5}). '
                      'Detected: {1}. Real: {2}.{4}'
                      .format(document.name,
                              expected_field_value_dto,
                              actual_field_value,
                              '[ OK ]' if matches else '[ ERR ]',
                              found_in_text_msg,
                              document.project.name if document.project else '',
                              document.pk))

    text_units_number = TextUnit.objects.filter(document=document, unit_type=field.text_unit_type).count()

    return {
        'text_units_number': text_units_number,
        'value_matches_expected': matches,
        'actual_field_value': actual_field_value if typed_field.is_choice_field else None
    }