Exemplo n.º 1
0
def apply_simple_config(log: ProcessLogger, document_field: DocumentField,
                        csv: bytes, drop_previous_field_detectors: bool,
                        update_field_choice_values: bool):
    """Create field detectors for ``document_field`` from a simple CSV config.

    The CSV is parsed with ``pandas.read_csv`` (all cells as strings). Each
    data row produces one ``DocumentFieldDetector``: the first cell of the row
    is the value to detect, and all non-empty cells of the row (the first one
    included), joined by newlines and lower-cased, become the include regexps.

    Args:
        log: Logger used for info messages and progress reporting.
        document_field: Field the new detectors are attached to. Its
            ``choices`` are overwritten and the field is saved when
            ``update_field_choice_values`` is true.
        csv: Raw CSV content.
        drop_previous_field_detectors: When true, detectors of this field
            having category ``FD_CATEGORY_IMPORTED_SIMPLE_CONFIG`` are deleted
            before the new ones are created.
        update_field_choice_values: When true, the field choices are replaced
            with the sorted unique non-empty values of the first CSV column.

    Raises:
        ValueError: If the parsed CSV has no data rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    row_num, col_num = df.shape
    if row_num < 1 or col_num < 1:
        raise ValueError('Config csv contains no data')

    if update_field_choice_values:
        first_column = df.iloc[:, 0]
        choices = first_column.dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_field.document_type, row_num))
    # Progress is reported once per 10 rows (see the loop below).
    log.set_progress_steps_number(int(row_num / 10) + 1)

    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()

    for index, row in df.iterrows():
        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        # The row is labelled by the CSV header strings, so the first cell
        # has to be taken by position: an integer key passed to ``row[...]``
        # relies on a deprecated positional fallback in pandas.
        detector.detected_value = row.iloc[0]
        detector.include_regexps = '\n'.join(row.dropna()).lower()
        detector.save()
        if index % 10 == 0:
            log.step_progress()
    log.info('Done.')
    def train_document_field_detector_model(
        cls,
        log: ProcessLogger,
        field: DocumentField,
        train_data_project_ids: Optional[List],
        use_only_confirmed_field_values: bool = False
    ) -> Optional[ClassifierModel]:
        """Train and evaluate a classifier predicting ``field`` from other field values.

        Train data are dicts of document field values. When project ids are
        given and ``use_only_confirmed_field_values`` is false they are read
        from ``Document.field_values`` of those projects (at most
        ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` documents); otherwise they
        come from ``cls.get_user_data``. The data are shuffled, the first 20%
        are held out as the out-of-sample test set, and the pipeline is fitted
        on the rest. The in-sample report uses the same number of rows taken
        from the start of the training set.

        The target of each row is the index of the field's value in the sorted
        list of the field's choices; a missing or unknown value maps to the
        extra index ``len(categories)``.

        Args:
            log: Logger used for info messages and progress reporting.
            field: Field to train the model for; its type must be a choice
                field.
            train_data_project_ids: Ids of projects whose documents are used
                as train data; may be empty or ``None``.
            use_only_confirmed_field_values: When true, train data always come
                from ``cls.get_user_data``.

        Returns:
            A ``ClassifierModel`` bound to ``field`` carrying both accuracy
            reports and the trained model object. This method does not call
            ``save()`` on it.

        Raises:
            ValueError: If ``field`` is not a choice field.
            RuntimeError: If no train data were found.
        """
        field_type_adapter = FIELD_TYPES_REGISTRY[
            field.type]  # type: FieldType

        log.set_progress_steps_number(7)
        log.info('Training model for field #{0} ({1})...'.format(
            field.pk, field.code))

        # Classifier: values of dependencies -> value of this field
        # Field types supported: only choice fields
        if not isinstance(field_type_adapter, ChoiceField):
            raise ValueError(
                'Field-based ML supports only choice fields but field {0} (#{1}) is of type {2}'
                .format(field.code, field.uid, field_type_adapter.code))

        # Let's find good values of depends-on fields suitable for use as train data.
        if train_data_project_ids and not use_only_confirmed_field_values:
            train_data = list(
                Document.objects
                .filter(project_id__in=train_data_project_ids)
                .values_list('field_values', flat=True)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
        else:
            train_data = list(cls.get_user_data(field, train_data_project_ids))

        if not train_data:
            raise RuntimeError(
                'Not enough train data for field {0} (#{1}). '
                'Need at least {2} approved or changed documents of type {3}.'.
                format(field.code, field.uid,
                       settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                       field.document_type.code))

        depends_on_fields_types = cls.get_depends_on_uid_code_type(field)
        depends_on_fields_types = cls.remove_empty_fields(
            depends_on_fields_types, train_data)

        pipeline, feature_names_funcs = cls.build_pipeline(
            field, depends_on_fields_types)  # type: Pipeline, List[Callable]

        categories = sorted([c.strip() for c in field.choices.split('\n')])
        category_names_to_indexes = {c: i for i, c in enumerate(categories)}
        # Index used as the target for documents with no value or with a value
        # that is not among the field's choices.
        unknown_category_idx = len(categories)
        # Explicit label list for the accuracy reports, aligned one-to-one
        # with ``categories``. Without it classification_report derives the
        # labels from the data, and the derived count differs from
        # len(categories) whenever the split lacks some category or contains
        # ``unknown_category_idx``; scikit-learn can then raise or pair names
        # with the wrong labels.
        report_labels = list(range(len(categories)))

        log.step_progress()
        log.info(
            'Collecting feature rows from train and test documents in dict form...'
        )

        #  When tried to use sklearn shuffling something went wrong, leaving manual methods for a while.
        random.shuffle(train_data)

        #  TODO: use sklearn methods for splitting train/test data and shuffling

        test_size = 0.2

        train_feature_data = list()
        train_target_data = list()

        for doc_field_values in train_data:
            # Take the target value out of the feature dict. ``pop`` with a
            # default tolerates documents that have no entry for this field
            # (a plain ``del`` raised KeyError for them).
            field_value = doc_field_values.pop(field.uid, None)

            field_value_idx = category_names_to_indexes.get(
                field_value) if field_value else None
            if field_value_idx is None:
                field_value_idx = unknown_category_idx

            train_feature_data.append(doc_field_values)
            train_target_data.append(field_value_idx)

        # Number of rows held out for testing.
        is_index = math.floor(test_size * len(train_data))

        # Out-of-sample test set: the first ``is_index`` rows, never trained on.
        test_oos_feature_data = train_feature_data[:is_index]
        test_oos_target_data = train_target_data[:is_index]

        train_feature_data = train_feature_data[is_index:]
        train_target_data = train_target_data[is_index:]

        # In-sample test set: the same number of rows taken from the train set.
        test_is_feature_data = train_feature_data[:is_index]
        test_is_target_data = train_target_data[:is_index]

        log.step_progress()
        log.info('Training the model...')
        model = pipeline.fit(train_feature_data, train_target_data)

        log.step_progress()

        log.info('Testing the model...')
        cm = ClassifierModel()
        cm.document_field = field

        predicted_oos = pipeline.predict(test_oos_feature_data)
        cm.classifier_accuracy_report_out_of_sample = classification_report(
            test_oos_target_data, predicted_oos,
            labels=report_labels, target_names=categories)

        predicted_is = pipeline.predict(test_is_feature_data)
        cm.classifier_accuracy_report_in_sample = classification_report(
            test_is_target_data, predicted_is,
            labels=report_labels, target_names=categories)

        log.step_progress()
        log.info('Saving ClassifierModel instance...')

        feature_names = []
        for f in feature_names_funcs:
            feature_names.extend(f())

        cm.set_trained_model_obj({
            'model': model,
            'categories': categories,
            'feature_names': feature_names
        })
        log.step_progress()
        log.info('Finished.')
        return cm