def apply_simple_config(log: ProcessLogger,
                        document_field: DocumentField,
                        csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool):
    """Create naive field detectors for a document field from a CSV config.

    The CSV is parsed with ``pd.read_csv`` defaults (first line is the
    header) and every cell is read as a string. Each data row produces one
    ``DocumentFieldDetector``: the value of the first column is stored as the
    detected value, and all non-empty cells of the row (the first column
    included), lower-cased and joined by newlines, become the include regexps.

    Args:
        log: Logger used for info messages and progress reporting.
        document_field: Field the created detectors are attached to.
        csv: Raw bytes of the CSV config.
        drop_previous_field_detectors: When true, detectors of this field in
            the ``FD_CATEGORY_IMPORTED_SIMPLE_CONFIG`` category are deleted
            before the new ones are created.
        update_field_choice_values: When true, ``document_field.choices`` is
            replaced by the sorted unique non-empty values of the first
            column and the field is saved.

    Raises:
        ValueError: If the parsed CSV has no data rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    row_num, col_num = df.shape
    if row_num < 1 or col_num < 1:
        raise ValueError('Config csv contains no data')

    if update_field_choice_values:
        first_column = df[df.columns[0]]
        choices = first_column.dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_field.document_type, row_num))

    # A step is reported for rows 0, 10, 20, ... so the number of steps is
    # ceil(row_num / 10); the declared total must match that exactly.
    log.set_progress_steps_number((row_num + 9) // 10)

    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()

    # Count rows by position: the DataFrame index label is only guaranteed to
    # be a 0-based integer when pandas assigns the default RangeIndex.
    for position, (_, row) in enumerate(df.iterrows()):
        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        # ``iloc`` makes the access explicitly positional; ``row[0]`` on a
        # Series indexed by column names is deprecated in pandas.
        detector.detected_value = row.iloc[0]
        detector.include_regexps = '\n'.join(row.dropna()).lower()
        detector.save()
        if position % 10 == 0:
            log.step_progress()

    log.info('Done.')
def train_document_field_detector_model(
        cls,
        log: ProcessLogger,
        field: DocumentField,
        train_data_project_ids: Optional[List],
        use_only_confirmed_field_values: bool = False
) -> Optional[ClassifierModel]:
    """Train a classifier predicting a choice field from other field values.

    Train data is a list of per-document dicts of field values. The value
    stored under ``field.uid`` is removed from each dict and used as the
    target; the rest of the dict is the feature row. The shuffled data is
    split 20/80 into an out-of-sample test part and a train part, the
    pipeline is fitted on the train part, and classification reports are
    produced for the out-of-sample part and for an equally sized slice of
    the train part (in-sample).

    Args:
        log: Logger used for info messages and progress reporting.
        field: Field to train the model for; its type must be a
            ``ChoiceField``.
        train_data_project_ids: Projects whose documents supply train data.
            When non-empty and ``use_only_confirmed_field_values`` is false,
            ``field_values`` of up to
            ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` documents of these
            projects are used; otherwise ``cls.get_user_data`` is used.
        use_only_confirmed_field_values: Forces the ``cls.get_user_data``
            source even when project ids are given.

    Returns:
        A ``ClassifierModel`` bound to ``field`` with both accuracy reports
        and the trained model object set. ``save()`` is not called here.

    Raises:
        ValueError: If the field type is not a ``ChoiceField``.
        RuntimeError: If no train data was found.
    """
    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]  # type: FieldType

    # NOTE(review): 7 steps are declared but only 5 ``step_progress()`` calls
    # are made below - confirm whether the caller accounts for the rest.
    log.set_progress_steps_number(7)
    log.info('Training model for field #{0} ({1})...'.format(
        field.pk, field.code))

    # Classifier: values of dependencies -> value of this field
    # Field types supported: only choice fields
    if not isinstance(field_type_adapter, ChoiceField):
        raise ValueError(
            'Field-based ML supports only choice fields but field {0} (#{1}) is of type {2}'
            .format(field.code, field.uid, field_type_adapter.code))

    # Lets find good values of depends-on fields suitable for using as train data.
    if train_data_project_ids and not use_only_confirmed_field_values:
        train_data = list(
            Document.objects
            .filter(project_id__in=train_data_project_ids)
            .values_list('field_values', flat=True)
            [:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
    else:
        train_data = list(cls.get_user_data(field, train_data_project_ids))

    if not train_data:
        raise RuntimeError(
            'Not enough train data for field {0} (#{1}). '
            'Need at least {2} approved or changed documents of type {3}.'.
            format(field.code, field.uid,
                   settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                   field.document_type.code))

    depends_on_fields_types = cls.get_depends_on_uid_code_type(field)
    depends_on_fields_types = cls.remove_empty_fields(
        depends_on_fields_types, train_data)

    pipeline, feature_names_funcs = cls.build_pipeline(
        field, depends_on_fields_types)  # type: Pipeline, List[Callable]

    categories = sorted([c.strip() for c in field.choices.split('\n')])
    category_names_to_indexes = {c: i for i, c in enumerate(categories)}
    # Targets that are empty or not among the choices share one extra class.
    unknown_category_idx = len(categories)

    log.step_progress()
    log.info(
        'Collecting feature rows from train and test documents in dict form...'
    )

    # When tried to use sklearn shuffling something went wrong, leaving manual methods for a while.
    random.shuffle(train_data)

    # TODO: use sklearn methods for splitting train/test data and shuffling
    test_size = 0.2

    train_feature_data = list()
    train_target_data = list()

    for doc_field_values in train_data:
        # Take the target out of the feature row. ``pop`` with a default
        # tolerates documents that have no value stored for this field at
        # all, where a plain ``del`` would raise KeyError.
        field_value = doc_field_values.pop(field.uid, None)
        field_value_idx = category_names_to_indexes.get(
            field_value) if field_value else None
        if field_value_idx is None:
            field_value_idx = unknown_category_idx
        train_feature_data.append(doc_field_values)
        train_target_data.append(field_value_idx)

    is_index = math.floor(test_size * len(train_data))

    # First 20% is held out for the out-of-sample test ...
    test_oos_feature_data = train_feature_data[:is_index]
    test_oos_target_data = train_target_data[:is_index]

    # ... the remainder is the actual train set ...
    train_feature_data = train_feature_data[is_index:]
    train_target_data = train_target_data[is_index:]

    # ... and an equally sized head of the train set is the in-sample test.
    test_is_feature_data = train_feature_data[:is_index]
    test_is_target_data = train_target_data[:is_index]

    log.step_progress()
    log.info('Training the model...')
    model = pipeline.fit(train_feature_data, train_target_data)

    log.step_progress()
    log.info('Testing the model...')

    cm = ClassifierModel()
    cm.document_field = field

    # Pass the label indexes explicitly so that ``target_names`` always lines
    # up with them. Without ``labels`` sklearn derives the label set from the
    # data, which mismatches ``categories`` whenever a test slice lacks some
    # category or contains the extra "unknown" class.
    report_labels = list(range(len(categories)))

    predicted_oos = pipeline.predict(test_oos_feature_data)
    cm.classifier_accuracy_report_out_of_sample = classification_report(
        test_oos_target_data,
        predicted_oos,
        labels=report_labels,
        target_names=categories)

    predicted_is = pipeline.predict(test_is_feature_data)
    cm.classifier_accuracy_report_in_sample = classification_report(
        test_is_target_data,
        predicted_is,
        labels=report_labels,
        target_names=categories)

    log.step_progress()
    log.info('Saving ClassifierModel instance...')

    feature_names = []
    for f in feature_names_funcs:
        feature_names.extend(f())

    cm.set_trained_model_obj({
        'model': model,
        'categories': categories,
        'feature_names': feature_names
    })

    log.step_progress()
    log.info('Finished.')

    return cm