def get_user_data(cls,
                  field: DocumentField,
                  project_ids: Optional[List[str]]) -> Optional[List[dict]]:
    """Load field data for the documents selected for the given field.

    The document ids come from two helper querysets:
    ``get_qs_active_modified_document_ids(field, project_ids)`` and
    ``get_qs_finished_document_ids(field.document_type, project_ids)``.
    Ids present in both are passed to the repository only once.

    :param field: document field the data is collected for.
    :param project_ids: project ids forwarded to both helper calls.
    :return: whatever ``FIELD_REPOSITORY.get_documents_fields_by_doc_ids``
        returns for the de-duplicated ids and
        ``settings.ML_TRAIN_DATA_SET_GROUP_LEN``.
    """
    modified_ids = field_detection_utils.get_qs_active_modified_document_ids(
        field, project_ids)
    finished_ids = field_detection_utils.get_qs_finished_document_ids(
        field.document_type, project_ids)

    # Evaluate both id sources (modified first, then finished) and drop duplicates.
    combined_ids = [*modified_ids, *finished_ids]
    unique_ids = list(set(combined_ids))

    return cls.FIELD_REPOSITORY.get_documents_fields_by_doc_ids(
        unique_ids, settings.ML_TRAIN_DATA_SET_GROUP_LEN)
def get_user_data(cls,
                  field: DocumentField,
                  project_ids: Optional[List[str]]) -> Optional[List[dict]]:
    """Return the ``field_values`` of the documents selected for the given field.

    A document is selected when its primary key is produced by either
    ``get_qs_active_modified_document_ids(field, project_ids)`` or
    ``get_qs_finished_document_ids(field.document_type, project_ids)``.

    :param field: document field the data is collected for.
    :param project_ids: project ids forwarded to both helper calls.
    :return: list of ``field_values`` column values, at most
        ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` entries long.
    """
    qs_modified_document_ids = field_detection_utils.get_qs_active_modified_document_ids(
        field, project_ids)
    qs_finished_document_ids = field_detection_utils.get_qs_finished_document_ids(
        field.document_type, project_ids)

    # Each Q must carry its own ``pk__in`` lookup. Passing ``Q(Subquery(...))``
    # as the *value* of ``pk__in`` is not a valid filter expression.
    selected_documents = (Q(pk__in=Subquery(qs_modified_document_ids))
                          | Q(pk__in=Subquery(qs_finished_document_ids)))

    field_values = Document.objects \
        .filter(selected_documents) \
        .values_list('field_values', flat=True)

    # Slice before materializing so the limit is applied to the query itself.
    return list(field_values[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
def get_user_data(field: DocumentField,
                  project_ids: Optional[List[str]]) -> List[dict]:
    """Collect stored values of ``field`` from the selected documents.

    Only values that are attached to a text unit and not flagged as removed
    by the user are returned, and only for documents produced by
    ``get_qs_active_modified_document_ids`` or ``get_qs_finished_document_ids``.

    :param field: document field whose values are loaded.
    :param project_ids: project ids forwarded to both helper calls.
    :return: list of dicts with the keys ``created_by``, ``text_unit__text``,
        ``value`` and ``extraction_hint``, ordered by ``created_by`` and
        limited to ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` rows.
    """
    modified_docs = field_detection_utils.get_qs_active_modified_document_ids(field, project_ids)
    finished_docs = field_detection_utils.get_qs_finished_document_ids(field.document_type, project_ids)

    # The document must come from either of the two id sources.
    in_selected_documents = (Q(document__in=Subquery(modified_docs))
                             | Q(document__in=Subquery(finished_docs)))

    matching_values = DocumentFieldValue.objects.filter(
        Q(field=field),
        Q(text_unit__isnull=False),
        in_selected_documents,
        Q(removed_by_user=False))

    rows = matching_values.values('created_by', 'text_unit__text', 'value', 'extraction_hint')
    limited_rows = rows.order_by('created_by')[:settings.ML_TRAIN_DATA_SET_GROUP_LEN]
    return list(limited_rows)
def process(self, **kwargs):
    """Train and/or test the field detector model of one document type field.

    Values read from ``kwargs``:

    * ``document_type_field_id`` - primary key of the ``DocumentTypeField``
      whose document type and field are processed.
    * ``skip_training`` - when truthy, no model is trained.
    * ``use_only_confirmed_field_values_for_training`` - forwarded to
      ``train_document_field_detector_model``.
    * ``train_data_project_ids`` - forwarded to
      ``train_document_field_detector_model``.
    * ``skip_testing`` - when truthy, the testing step is skipped.
    * ``use_only_confirmed_field_values_for_testing`` - when truthy, test
      documents are taken from the "active modified" and "finished" id
      helpers instead of from all documents of the test projects.
    * ``test_data_projects_ids`` - projects providing the test documents;
      when empty, the testing step is skipped.

    A newly trained model replaces the existing ``ClassifierModel`` rows of
    the same document type and field. Testing is scheduled as one sub-task
    per test document followed by a joining task.
    """
    self.log_info('Going to train document field based on the datasets stored in DB...')

    document_type_field_id = kwargs.get('document_type_field_id')
    skip_training = kwargs.get('skip_training')
    use_only_confirmed_field_values_for_training = kwargs.get(
        'use_only_confirmed_field_values_for_training')
    train_data_project_ids = kwargs.get('train_data_project_ids')

    skip_testing = kwargs.get('skip_testing')
    use_only_confirmed_field_values_for_testing = kwargs.get(
        'use_only_confirmed_field_values_for_testing')
    test_data_projects_ids = kwargs.get('test_data_projects_ids')

    document_type_field = DocumentTypeField.objects.get(pk=document_type_field_id)
    document_type = document_type_field.document_type
    field = document_type_field.document_field  # type: DocumentField

    if not field.is_detectable():
        self.log_info('Field {0} is not detectable. Nothing to train and/or test.'.format(field.code))
        # The message states there is nothing to do, so stop here instead of
        # falling through into the training and testing steps.
        return

    new_model = None

    if not skip_training:
        if train_data_project_ids:
            self.log_info('Training model on the specified projects...')
        else:
            self.log_info('No training projects specified. '
                          'Training model on all user-confirmed field values in the system...')

        new_model = field_detection.train_document_field_detector_model(
            CeleryTaskLogger(self),
            document_type,
            field,
            train_data_project_ids,
            use_only_confirmed_field_values_for_training)

        if new_model:
            # Replace the previously stored model(s) for this type/field pair.
            ClassifierModel.objects.filter(document_type=document_type, document_field=field).delete()
            new_model.save()

            if new_model.classifier_accuracy_report_in_sample:
                self.log_info('Sklearn test report for in-sample docs:\n{0}'
                              .format(new_model.classifier_accuracy_report_in_sample))

            if new_model.classifier_accuracy_report_out_of_sample:
                self.log_info('Sklearn test report for out-of-sample docs:\n{0}'
                              .format(new_model.classifier_accuracy_report_out_of_sample))
        else:
            self.log_info('No model trained. '
                          'Probably the detection strategy of field {0} does not allow training'
                          .format(field.code))

    if skip_testing:
        return

    if not test_data_projects_ids:
        self.log_info('No test projects specified. Skiping the testing step.')
        return

    if not use_only_confirmed_field_values_for_testing:
        test_document_ids = Document.objects \
            .filter(project_id__in=test_data_projects_ids, document_type_id=document_type.pk) \
            .values_list('pk', flat=True)
    else:
        # NOTE(review): confirm the argument lists of these helpers against
        # field_detection_utils; they are passed through here unchanged.
        test_document_ids = set(
            field_detection_utils.get_qs_active_modified_document_ids(
                document_type, field, test_data_projects_ids))
        test_document_ids.update(
            field_detection_utils.get_qs_finished_document_ids(
                document_type, test_data_projects_ids))

    self.log_info('Testing field detection document-by-document...')

    # One sub-task argument tuple per test document.
    test_tasks_args = [(field.uid, test_document_id) for test_document_id in test_document_ids]

    if test_tasks_args:
        self.run_sub_tasks('Test Field Detector Model',
                           TrainAndTest.test_field_detector_model,
                           test_tasks_args)

        self.run_after_sub_tasks_finished(
            'Join Field Detector Model Tests',
            TrainAndTest.join_field_detector_model_tests,
            [(field.uid, document_type.uid, new_model.pk if new_model else None)])