# Example #1 (chunk marker; original extraction artifact: "示例#1" / "0")
    def _validate_choice_values_removed(self, context: dict) -> None:
        """Raise ValidationError if the imported config removes choice values
        (or unsets the "allow values not specified in choices" flag) while
        values that would become invalid still exist for this choice field.

        :param context: import context used to look up the previously saved field.
        :raises ValidationError: when the change would invalidate existing values
            and force auto-fixes is not an option the user has confirmed yet.
        """
        saved_field = self._get_saved_field(context)
        # Only relevant when both the saved and the imported field are choice fields.
        if not saved_field or not saved_field.is_choice_field() \
                or not self.object.is_choice_field():
            return
        err_msg = ''
        invalid_choices = self._get_invalid_choices(saved_field)
        if self._is_allow_values_not_specified_in_choices_was_unset(
                saved_field):
            # fixed typo in user-facing message: "the the config" -> "the config"
            err_msg += '"Allow values not specified in choices" flag is unset in the config being imported. '
        if invalid_choices:
            quoted_choices = ['"{0}"'.format(invalid_choice)
                              for invalid_choice in invalid_choices]
            err_msg += 'The following choice values are missing in the config being imported: {0}. ' \
                .format(', '.join(quoted_choices))

        if err_msg:
            # Build the queryset once and reuse it for both counts
            # (the original built it twice).
            invalid_values_qs = self.object.get_invalid_choice_values()
            invalid_values_count = invalid_values_qs.count()
            user_values_count = 0
            detected_values_count = 0
            if invalid_values_count > 0:
                user_values_count = DocumentFieldValue.filter_user_values(
                    invalid_values_qs).count()
                detected_values_count = self._get_detected_values_count(
                    invalid_values_count, user_values_count)
            err_msg += 'Number of invalid values: user entered values {0}, automatically detected values {1}.' \
                       ' You need to set force auto-fixes option to continue (this option will remove all invalid' \
                       ' values) or make manual updates.'.format(user_values_count, detected_values_count)
            err_msg = 'Unable to update field #{0} "{1}". {2}'.format(
                self.pk, self.object.code, err_msg)
            raise ValidationError(err_msg)
    def _validate_critical_properties_changed(self, context: dict) -> None:
        """Raise ValidationError if critical field properties (document type
        or field type) differ between the previously saved field and the one
        being imported.

        Such a change invalidates every existing value of this field, so the
        error message reports how many user-entered and automatically
        detected values would be removed.

        :param context: import context used to look up the previously saved field.
        :raises ValidationError: when document type or field type has changed.
        """
        saved_field = self._get_saved_field(context)
        if not saved_field:
            return
        err_msg = ''
        new_field_type = self.object.type
        old_document_type_pk = self.to_str_if_uuid(self._get_document_type_pk(saved_field))
        new_document_type_pk = self.to_str_if_uuid(self.document_type_pk)
        old_field_type = saved_field.type
        if old_document_type_pk != new_document_type_pk:
            err_msg += 'Document type has changed, old document type id is #{0}, new document type id is #{1}. ' \
                .format(old_document_type_pk, self.document_type_pk)
        if old_field_type != new_field_type:
            err_msg += 'Field type has changed, old field type is "{0}", new field type is "{1}". ' \
                .format(self._get_field_type_title(old_field_type), self._get_field_type_title(new_field_type))
        if err_msg:
            err_msg = 'Unable to update field #{0} "{1}". {2}'.format(self.pk, self.object.code, err_msg)
            # Build the queryset once and reuse it for both counts
            # (the original constructed the same filter twice).
            values_qs = DocumentFieldValue.objects.filter(field=self.object)
            values_count = values_qs.count()
            user_values_count = 0
            detected_values_count = 0
            if values_count > 0:
                user_values_count = DocumentFieldValue.filter_user_values(values_qs).count()
                detected_values_count = self._get_detected_values_count(values_count, user_values_count)
            err_msg += 'Existing document field values become invalid and will be removed. User entered values {0},' \
                       ' automatically detected values {1}. You need to set force auto-fixes option to continue' \
                       ' (this option will remove all values for this field) or make manual updates.' \
                .format(user_values_count, detected_values_count)

            raise ValidationError(err_msg)
 def _maybe_save_reverse_similarity_value(self, log: ProcessLogger,
                                          field: DocumentField,
                                          document: Document,
                                          other_doc_id) -> bool:
     """Store the reverse similarity link (other_doc -> document) unless it
     already exists, refreshing the cached fields of the other document.

     :param log: process logger passed through to the field cache update.
     :param field: similarity field the link is stored under.
     :param document: document the reverse link should point back to.
     :param other_doc_id: pk of the document that should hold the reverse link.
     :return: True if a new reverse value was created, False if it already existed.
     """
     if DocumentFieldValue.objects.filter(document_id=other_doc_id,
                                          field=field,
                                          value=document.pk).exists():
         # Reverse link already present - nothing to do.
         # (Original was annotated -> bool but never returned a value.)
         return False
     other_document = Document.all_objects.get(pk=other_doc_id)
     DocumentFieldValue(document=other_document,
                        field=field,
                        value=document.pk).save()
     cache_document_fields(log=log,
                           document=other_document,
                           cache_generic_fields=False,
                           cache_user_fields=True)
     return True
# Example #4 (chunk marker; original extraction artifact: "示例#4" / "0")
    def process(self, **kwargs):
        """Find pairs of similar documents for a similarity document field and
        store mutual links between them as DocumentFieldValue rows.

        Builds a feature vector per document from the field's dependency
        fields, computes pairwise cosine similarity and, for every pair at or
        above the configured threshold, stores one link in each direction.
        Previously stored links for the field are deleted first.

        :param kwargs: must contain 'field' - a dict with the 'pk' of the
            destination DocumentField.
        :raises RuntimeError: if the destination field does not exist.
        """
        dst_field = kwargs['field']
        dst_field = DocumentField.objects.filter(pk=dst_field['pk']) \
            .prefetch_related('depends_on_fields') \
            .select_related(DST_FIELD_SIMILARITY_CONFIG_ATTR) \
            .first()  # type: DocumentField

        if not dst_field:
            raise RuntimeError('Document field not found: {0}'.format(kwargs['field']))

        config = getattr(dst_field, DST_FIELD_SIMILARITY_CONFIG_ATTR)  # type: DocumentSimilarityConfig

        config.self_validate()

        similarity_threshold = config.similarity_threshold
        feature_vector_fields = dst_field.depends_on_fields.all()
        feature_vector_field_codes = {f.code for f in feature_vector_fields}.union({FIELD_CODE_DOC_ID})

        self.log_info('{field}: Min similarity: {threshold}'
                      .format(field=dst_field.code, threshold=similarity_threshold))

        rawdb = RawDbRepository()
        field_values_list = list(rawdb.get_field_values(document_type=dst_field.document_type,
                                                        field_codes=feature_vector_field_codes))

        total_docs = len(field_values_list)

        self.set_push_steps(int(5 + total_docs / 100))

        self.push()
        self.log_info(
            '{field}: Building feature vectors for {n} documents'.format(field=dst_field.code, n=total_docs))

        vectorizer = document_feature_vector_pipeline(feature_vector_fields, use_field_codes=True)
        feature_vectors = vectorizer.fit_transform(field_values_list)

        self.push()
        self.log_info('{field}: Finding similar documents (similarity >= {threshold})'
                      .format(field=dst_field.code, threshold=similarity_threshold))

        dfvs = list()
        for x, doc_a_field_values in enumerate(field_values_list):
            doc_a_pk = doc_a_field_values[FIELD_CODE_DOC_ID]
            similarities = cosine_similarity(feature_vectors[x], feature_vectors)
            # Cosine similarity is symmetric, so visit only the upper triangle
            # (y > x). The previous full scan visited each unordered pair twice
            # and therefore appended every link in both directions twice,
            # producing duplicate rows in bulk_create.
            for y in range(x + 1, total_docs):
                doc_b_pk = field_values_list[y][FIELD_CODE_DOC_ID]
                if doc_a_pk == doc_b_pk:
                    continue
                similarity = similarities[0, y]
                if similarity < similarity_threshold:
                    continue
                # Store the link in both directions, once each.
                dfvs.append(DocumentFieldValue(document_id=doc_a_pk, value=doc_b_pk, field_id=dst_field.pk))
                dfvs.append(DocumentFieldValue(document_id=doc_b_pk, value=doc_a_pk, field_id=dst_field.pk))
            if x % 100 == 0:
                self.log_info('{field}: Checked for similarity {x} documents of {n}'
                              .format(field=dst_field.code, x=x + 1, n=total_docs))
                self.push()

        self.push()
        self.log_info('{field}: Found {n} similar documents. Storing links into the document fields.'
                      .format(field=dst_field.code, n=len(dfvs)))

        # Delete previously stored links in batches to keep each DELETE's
        # IN-clause bounded.
        del_doc_batch_size = 100
        for i in range(0, len(field_values_list), del_doc_batch_size):
            DocumentFieldValue.objects \
                .filter(field_id=dst_field.pk) \
                .filter(document_id__in={field_values[FIELD_CODE_DOC_ID] for field_values
                                         in field_values_list[i: i + del_doc_batch_size]}) \
                .delete()
        DocumentFieldValue.objects.bulk_create(dfvs)
        self.push()