Пример #1
0
 def calc_formula(self,
                  field_code,
                  type_code,
                  formula,
                  fields_to_values,
                  form_field,
                  formula_name='formula'):
     """Trial-run ``formula`` on example values and report failures on the form.

     The evaluation is delegated to
     ``FormulaBasedFieldDetectionStrategy.calc_formula``; its result is
     discarded, only failures matter here.

     :param field_code: code of the field the formula belongs to.
     :param type_code: type code of that field.
     :param formula: formula source to evaluate.
     :param fields_to_values: example values passed to the formula.
     :param form_field: form field name the error is attached to.
     :param formula_name: label used for the formula in error messages.
     :raises forms.ValidationError: on any failure other than
         ``DocumentFieldFormulaError``; the message embeds the traceback.
     """
     try:
         FormulaBasedFieldDetectionStrategy.calc_formula(
             field_code, type_code, formula, fields_to_values)
     except DocumentFieldFormulaError as formula_err:
         cause = formula_err.base_error
         cause_kind = type(cause).__name__
         cause_text = str(cause)

         # Header line, then one "name=value" line per example value,
         # then a summary of what failed and where.
         report = [
             'Error caught while trying to execute {0} on example values:'.
             format(formula_name)
         ]
         report.extend(
             '{0}={1}'.format(name, formula_err.field_values[name])
             for name in formula_err.field_values)
         report.append(
             "{0}. {1} in {2} of field '{3}' at line {4}".format(
                 cause_kind, cause_text, formula_name,
                 formula_err.field_code, formula_err.line_number))
         self.add_error(form_field, report)
     except Exception:
         # Anything unexpected is surfaced as a validation error carrying
         # the full traceback text.
         trace_text = traceback.format_exc()
         message = (
             'Tried to eval {0} on example values:\n{1}\nGot error:\n{2}'.
             format(formula_name, str(fields_to_values), trace_text))
         raise forms.ValidationError(message)
 def detect_field_values(cls, log: ProcessLogger, doc: Document,
                         field: DocumentField) -> List[DetectedFieldValue]:
     """Detect values via the parent strategy, with a formula-based fallback.

     The parent implementation is tried first. If it raises
     ``ClassifierModel.DoesNotExist``, detection is delegated to
     ``FormulaBasedFieldDetectionStrategy.detect_field_values`` with the
     same arguments.
     """
     try:
         detected = super().detect_field_values(log, doc, field)
     except ClassifierModel.DoesNotExist:
         detected = FormulaBasedFieldDetectionStrategy.detect_field_values(
             log, doc, field)
     return detected
Пример #3
0
    def clean(self):
        """Run cross-field validation for the form.

        Checks, in order:

        1. ``stop_words`` compile and can be applied to a dummy text.
        2. ``classifier_init_script`` is accepted by
           ``FieldBasedMLOnlyFieldDetectionStrategy.init_classifier_impl``.
        3. The field and its dependencies are accepted by
           ``order_field_detection``.
        4. ``python_coded_field`` (if set) is registered and its type matches
           the selected field type.
        5. ``formula`` (if set together with a type) evaluates on example
           values of the fields it depends on.

        Problems are attached to the form via ``add_error``.

        :returns: ``self.cleaned_data``.
        :raises forms.ValidationError: if the formula trial run fails with
            anything other than ``DocumentFieldFormulaError``.
        """
        field_code = self.cleaned_data.get('code')
        formula = self.cleaned_data.get('formula')
        type_code = self.cleaned_data.get('type')
        depends_on_fields = list(
            self.cleaned_data.get('depends_on_fields') or [])

        # A field that failed its own validation is missing from
        # cleaned_data. Indexing it directly would raise KeyError and turn a
        # normal validation failure into a server error, so the dependent
        # checks are only run when the value is actually present.
        if 'stop_words' in self.cleaned_data:
            try:
                stop_words = compile_stop_words(
                    self.cleaned_data['stop_words'])
                # Result is irrelevant; the call only proves the compiled
                # stop words are usable.
                detect_value_with_stop_words(stop_words, 'dummy text')
            except Exception as err:
                self.add_error('stop_words', str(err))

        if 'classifier_init_script' in self.cleaned_data:
            try:
                FieldBasedMLOnlyFieldDetectionStrategy.init_classifier_impl(
                    field_code, self.cleaned_data['classifier_init_script'])
            except ScriptError as err:
                self.add_error('classifier_init_script',
                               str(err).split('\n'))

        # 'xxx' is a placeholder key used when the field has no code.
        fields_and_deps = {
            field_code or 'xxx': {f.code for f in depends_on_fields}
        }
        fields_and_deps = self._extract_field_and_deps(depends_on_fields,
                                                       fields_and_deps)
        try:
            order_field_detection(list(fields_and_deps.items()))
        except ValueError as ve:
            # Reported as a non-field error.
            self.add_error(None, str(ve))

        # Example values for the formula trial run below.
        fields_to_values = {
            field.code:
            FIELD_TYPES_REGISTRY[field.type].example_python_value(field)
            for field in depends_on_fields
        }

        python_coded_field_code = self.cleaned_data.get('python_coded_field')
        if python_coded_field_code:
            python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
                python_coded_field_code)
            if not python_coded_field:
                self.add_error(
                    'python_coded_field',
                    'Unknown Python-coded field: {0}'.format(
                        python_coded_field_code))
            elif type_code != python_coded_field.type:
                self.add_error(
                    'type',
                    'Python-coded field {0} is of type {1} but {2} is specified'
                    ' as the field type'.format(python_coded_field.title,
                                                python_coded_field.type,
                                                type_code))

        # Nothing to evaluate without both a non-blank formula and a type.
        if not formula or not formula.strip() or not type_code:
            return self.cleaned_data

        try:
            FormulaBasedFieldDetectionStrategy.calc_formula(
                field_code, type_code, formula, fields_to_values)
        except DocumentFieldFormulaError as ex:
            base_error_class = type(ex.base_error).__name__
            base_error_msg = str(ex.base_error)
            lines = [
                "Error caught while trying to execute formula on example values:"
            ]
            lines.extend(
                '{0}={1}'.format(field_name, ex.field_values[field_name])
                for field_name in ex.field_values)
            lines.append(
                "{0}. {1} in formula of field '{2}' at line {3}".format(
                    base_error_class, base_error_msg, ex.field_code,
                    ex.line_number))
            self.add_error('formula', lines)
        except Exception:
            trace = traceback.format_exc()
            raise forms.ValidationError(
                'Tried to eval formula on example values:\n{0}\nGot error:\n{1}'
                .format(str(fields_to_values), trace))

        return self.cleaned_data
from apps.document.fields_detection.regexps_and_text_based_ml_field_detection import \
    RegexpsAndTextBasedMLFieldDetectionStrategy, TextBasedMLFieldDetectionStrategy
from apps.document.fields_detection.regexps_field_detection import RegexpsOnlyFieldDetectionStrategy, \
    FieldBasedRegexpsDetectionStrategy
from apps.document.fields_processing import field_value_cache
from apps.document.fields_processing.field_processing_utils import merge_detected_field_values_to_python_value, \
    order_field_detection
from apps.document.models import ClassifierModel
from apps.document.models import Document, DocumentType, DocumentField

# Single shared instance of the "disabled" strategy; it is also registered
# in the list below.
STRATEGY_DISABLED = DisabledFieldDetectionStrategy()

# One instance of every field detection strategy known to this module.
_FIELD_DETECTION_STRATEGIES = [
    FieldBasedMLOnlyFieldDetectionStrategy(),
    FormulaAndFieldBasedMLFieldDetectionStrategy(),
    FormulaBasedFieldDetectionStrategy(),
    RegexpsOnlyFieldDetectionStrategy(),
    RegexpsAndTextBasedMLFieldDetectionStrategy(),
    TextBasedMLFieldDetectionStrategy(),
    PythonCodedFieldDetectionStrategy(),
    FieldBasedRegexpsDetectionStrategy(),
    STRATEGY_DISABLED,
]

# Strategy instances keyed by their ``code`` attribute.
FIELD_DETECTION_STRATEGY_REGISTRY = {
    strategy.code: strategy
    for strategy in _FIELD_DETECTION_STRATEGIES
}


def train_document_field_detector_model(
    log: ProcessLogger,