def get_user_data(
         cls, field: DocumentField,
         project_ids: Optional[List[str]]) -> Optional[List[dict]]:
     """
     Load field data for the documents selected for ``field``.

     Document ids are gathered from two ``field_detection_utils`` helpers:
     the "active modified" ids for ``field`` and the "finished" ids for
     ``field.document_type``. Both receive ``project_ids`` unchanged.
     Ids returned by both helpers are passed on only once.

     :param field: document field the data is collected for
     :param project_ids: forwarded as-is to both id helpers
     :return: the result of
         ``cls.FIELD_REPOSITORY.get_documents_fields_by_doc_ids`` for the
         merged ids and ``settings.ML_TRAIN_DATA_SET_GROUP_LEN``
     """
     modified_ids_source = field_detection_utils.get_qs_active_modified_document_ids(
         field, project_ids)
     finished_ids_source = field_detection_utils.get_qs_finished_document_ids(
         field.document_type, project_ids)

     # Feed both id sources into one set so duplicates collapse.
     unique_ids = set(modified_ids_source)
     unique_ids.update(finished_ids_source)

     return cls.FIELD_REPOSITORY.get_documents_fields_by_doc_ids(
         list(unique_ids), settings.ML_TRAIN_DATA_SET_GROUP_LEN)
    def get_user_data(
            cls, field: DocumentField,
            project_ids: Optional[List[str]]) -> Optional[List[dict]]:
        """
        Return the ``field_values`` of documents selected for ``field``.

        A document is selected when its pk is produced by either of two
        ``field_detection_utils`` helpers: the "active modified" ids for
        ``field`` or the "finished" ids for ``field.document_type``. Both
        helpers receive ``project_ids`` unchanged.

        :param field: document field the data is collected for
        :param project_ids: forwarded as-is to both id helpers
        :return: list of ``field_values`` column values, at most
            ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` items
        """
        qs_modified_document_ids = field_detection_utils.get_qs_active_modified_document_ids(
            field, project_ids)
        qs_finished_document_ids = field_detection_utils.get_qs_finished_document_ids(
            field.document_type, project_ids)

        # Each subquery must be the right-hand side of its own ``pk__in``
        # lookup; the two lookups are then OR-ed as Q objects. Passing a Q
        # object as the value of ``pk__in`` is not a valid filter.
        in_selected_documents = Q(pk__in=Subquery(qs_modified_document_ids)) \
            | Q(pk__in=Subquery(qs_finished_document_ids))

        field_values = Document.objects \
            .filter(in_selected_documents) \
            .values_list('field_values', flat=True)

        return list(field_values[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
def get_user_data(field: DocumentField,
                  project_ids: Optional[List[str]]) -> List[dict]:
    """
    Fetch stored values of ``field`` as plain dicts.

    Only ``DocumentFieldValue`` rows that belong to ``field``, have a text
    unit, are not flagged ``removed_by_user`` and whose document id is
    produced by one of the two ``field_detection_utils`` id helpers are
    returned.

    :param field: document field whose values are loaded
    :param project_ids: forwarded as-is to both id helpers
    :return: dicts with keys ``created_by``, ``text_unit__text``, ``value``
        and ``extraction_hint``, ordered by ``created_by`` and limited to
        ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` items
    """
    modified_ids = field_detection_utils.get_qs_active_modified_document_ids(
        field, project_ids)
    finished_ids = field_detection_utils.get_qs_finished_document_ids(
        field.document_type, project_ids)

    in_modified_documents = Q(document__in=Subquery(modified_ids))
    in_finished_documents = Q(document__in=Subquery(finished_ids))

    # All positional conditions are AND-ed by ``filter``; only the document
    # membership check is an OR of the two id sources.
    conditions = (
        Q(field=field),
        Q(text_unit__isnull=False),
        in_modified_documents | in_finished_documents,
        Q(removed_by_user=False),
    )

    rows = DocumentFieldValue.objects.filter(*conditions)
    rows = rows.values('created_by', 'text_unit__text', 'value', 'extraction_hint')
    rows = rows.order_by('created_by')

    return list(rows[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
    def process(self, **kwargs):
        """
        Train and/or test the field detector model of one document type field.

        Keyword arguments read from ``kwargs``:

        * ``document_type_field_id`` - pk of the ``DocumentTypeField`` to
          work on;
        * ``skip_training`` - when truthy, no model is trained;
        * ``use_only_confirmed_field_values_for_training`` - forwarded to
          ``field_detection.train_document_field_detector_model``;
        * ``train_data_project_ids`` - forwarded to the same training call;
        * ``skip_testing`` - when truthy, the testing step is skipped;
        * ``use_only_confirmed_field_values_for_testing`` - when truthy, test
          documents are taken from the ``field_detection_utils`` id helpers
          instead of from all documents of the test projects;
        * ``test_data_projects_ids`` - projects providing the test documents;
          testing is skipped when this is empty.

        A newly trained model replaces the stored ``ClassifierModel`` rows
        for the same document type and field. Testing is scheduled as one
        sub-task per test document, followed by a join task.
        """
        self.log_info(
            'Going to train document field based on the datasets stored in DB...'
        )

        document_type_field_id = kwargs.get('document_type_field_id')
        skip_training = kwargs.get('skip_training')
        use_only_confirmed_field_values_for_training = kwargs.get(
            'use_only_confirmed_field_values_for_training')
        train_data_project_ids = kwargs.get('train_data_project_ids')

        skip_testing = kwargs.get('skip_testing')
        use_only_confirmed_field_values_for_testing = kwargs.get(
            'use_only_confirmed_field_values_for_testing')
        test_data_projects_ids = kwargs.get('test_data_projects_ids')

        document_type_field = DocumentTypeField.objects.get(
            pk=document_type_field_id)
        document_type = document_type_field.document_type
        field = document_type_field.document_field  # type: DocumentField

        if not field.is_detectable():
            self.log_info(
                'Field {0} is not detectable. Nothing to train and/or test.'.
                format(field.code))
            # The message announces a no-op, so stop here instead of falling
            # through into the training and testing steps.
            return

        new_model = None

        if not skip_training:
            if train_data_project_ids:
                self.log_info('Training model on the specified projects...')
            else:
                self.log_info(
                    'No training projects specified. '
                    'Training model on all user-confirmed field values in the system...'
                )

            new_model = field_detection \
                .train_document_field_detector_model(CeleryTaskLogger(self),
                                                     document_type,
                                                     field,
                                                     train_data_project_ids,
                                                     use_only_confirmed_field_values_for_training)
            if new_model:
                # Drop the previously stored model(s) for this document
                # type / field pair before saving the new one.
                ClassifierModel.objects.filter(document_type=document_type,
                                               document_field=field).delete()
                new_model.save()

                if new_model.classifier_accuracy_report_in_sample:
                    self.log_info(
                        'Sklearn test report for in-sample docs:\n{0}'.format(
                            new_model.classifier_accuracy_report_in_sample))

                if new_model.classifier_accuracy_report_out_of_sample:
                    self.log_info(
                        'Sklearn test report for out-of-sample docs:\n{0}'.
                        format(new_model.
                               classifier_accuracy_report_out_of_sample))
            else:
                self.log_info(
                    'No model trained. '
                    'Probably the detection strategy of field {0} does not allow training'
                    .format(field.code))

        if skip_testing:
            return

        if not test_data_projects_ids:
            self.log_info(
                'No test projects specified. Skiping the testing step.')
            return

        if not use_only_confirmed_field_values_for_testing:
            # Test on every document of this type in the test projects.
            test_document_ids = Document.objects \
                .filter(project_id__in=test_data_projects_ids, document_type_id=document_type.pk) \
                .values_list('pk', flat=True)
        else:
            # NOTE(review): other call sites visible in this file pass only
            # (field, project_ids) to this helper - confirm which signature
            # applies here.
            test_document_ids = set(
                field_detection_utils.get_qs_active_modified_document_ids(
                    document_type, field, test_data_projects_ids))
            test_document_ids.update(
                set(
                    field_detection_utils.get_qs_finished_document_ids(
                        document_type, test_data_projects_ids)))

        self.log_info('Testing field detection document-by-document...')
        test_tasks_args = [(field.uid, test_document_id)
                           for test_document_id in test_document_ids]

        if test_tasks_args:
            self.run_sub_tasks('Test Field Detector Model',
                               TrainAndTest.test_field_detector_model,
                               test_tasks_args)

            # The join task gets the pk of the freshly trained model, or
            # None when no model was trained in this run.
            self.run_after_sub_tasks_finished(
                'Join Field Detector Model Tests',
                TrainAndTest.join_field_detector_model_tests,
                [(field.uid, document_type.uid,
                  new_model.pk if new_model else None)])