def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect values of ``field`` from the values of the fields it depends on.

        The stored, not user-removed values of ``field.depends_on_fields`` are
        merged into python values; each non-empty one is converted to ``str``
        and tested against the field detectors configured for the document's
        type and this field.

        ``cls`` and ``log`` are not used by this implementation.

        :param doc: document whose stored field values are inspected.
        :param field: field to detect values for.
        :return: detected values; scanning stops after the first one when the
            field type is neither multi-value nor a choice field.
        """
        dependencies = list(field.depends_on_fields.all())

        stored_values = doc.documentfieldvalue_set \
            .filter(removed_by_user=False) \
            .filter(field__in=dependencies)

        merged = merge_document_field_values_to_python_value(
            list(stored_values))

        # One entry per dependency, keyed by the dependency's code.
        value_by_code = {dep.code: merged.get(dep.code) for dep in dependencies}

        document_type = doc.document_type
        field_detectors = DocumentFieldDetector.objects.filter(
            document_type=document_type, field=field)
        field_type_adapter = FIELD_TYPES_REGISTRY.get(
            field.type)  # type: FieldType

        detected_values = []  # type: List[DetectedFieldValue]

        for raw_value in value_by_code.values():
            if not raw_value:
                continue
            source_text = str(raw_value)

            for detector in field_detectors:
                if not detector.matches(source_text):
                    continue

                value = detector.detected_value
                hint_name = None
                if field_type_adapter.value_aware:
                    hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, source_text)
                    if value is None:
                        continue

                detected_values.append(
                    DetectedFieldValue(field, value, None, hint_name))

                # Single-value, non-choice fields need only the first hit.
                if not field_type_adapter.multi_value \
                        and not field.is_choice_field():
                    break

            # Leave the outer loop too once a single-value field has its value.
            if detected_values \
                    and not field_type_adapter.multi_value \
                    and not field.is_choice_field():
                break

        return detected_values
Пример #2
0
    def detect_field_values_for_python_coded_field(
            document: Document, field: DocumentField,
            sentence_text_units: List[TextUnit], do_not_write: bool) -> int:
        """Run the python-coded detector registered for ``field`` and save the result.

        When the detector works ``by_sentence`` it is applied to the text of
        every unit in ``sentence_text_units``. Otherwise it is applied to
        ``document.full_text`` and every reported location is mapped to the
        sentence text unit containing its start, with the location made
        relative to that unit.

        :param document: document being processed.
        :param field: field whose ``python_coded_field`` names the detector.
        :param sentence_text_units: text units scanned in ``by_sentence`` mode.
        :param do_not_write: forwarded to ``save_detected_values``.
        :return: whatever ``DetectFieldValues.save_detected_values`` returns.
        :raises RuntimeError: if the detector is not registered, or a reported
            location does not start inside any sentence text unit.
        """
        coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not coded_field:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))
        field_type_adapter = FIELD_TYPES_REGISTRY[
            field.type]  # type: FieldType

        def collect():
            """Gather detected values, stopping early for single-value fields."""
            found = []  # type: List[DetectedFieldValue]

            if coded_field.by_sentence:
                for unit in sentence_text_units:
                    for value, start, end in coded_field.get_values(unit.text) or []:
                        found.append(
                            DetectedFieldValue(unit, value, None, start, end))
                        if not field_type_adapter.multi_value \
                                and not field.is_choice_field():
                            return found
                return found

            for value, start, end in coded_field.get_values(document.full_text) or []:
                unit = TextUnit.objects.filter(
                    document=document,
                    unit_type='sentence',
                    location_start__lte=start,
                    location_end__gte=start).first()  # type: TextUnit
                if not unit:
                    raise RuntimeError(
                        'Python coded field {0} detected a value in document {1} at '
                        'location [{2};{3}] but the start of location does not belong to any '
                        'text unit object in DB.\n'
                        'It can not be. Something is broken.'.format(
                            field.python_coded_field, document, start, end))
                # Re-base the document-wide location onto the text unit.
                relative_start = start - unit.location_start
                relative_end = relative_start + (end - start)
                found.append(
                    DetectedFieldValue(unit, value, None, relative_start,
                                       relative_end))
                if not field_type_adapter.multi_value \
                        and not field.is_choice_field():
                    return found
            return found

        detected_values = collect()
        return DetectFieldValues.save_detected_values(
            document, field, field_type_adapter, detected_values, do_not_write)
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect values of ``field`` with its registered python-coded detector.

        In ``by_sentence`` mode the detector is applied to each text unit of
        type ``field.text_unit_type`` (ordered by ``location_start``, ``pk``).
        Otherwise it is applied to ``doc.full_text`` and each location is
        re-based onto the sentence text unit containing its start.

        ``cls`` and ``log`` are not used by this implementation.

        :return: detected values; only the first one when the field type is
            neither multi-value nor a choice field.
        :raises RuntimeError: if the detector is not registered, or a reported
            location does not start inside any sentence text unit.
        """
        coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not coded_field:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))
        field_type_adapter = FIELD_TYPES_REGISTRY[
            field.type]  # type: FieldType

        found = []  # type: List[DetectedFieldValue]

        if not coded_field.by_sentence:
            for value, start, end in coded_field.get_values(doc.full_text) or []:
                # NOTE(review): unit type is hard-coded to 'sentence' here while
                # the per-unit branch uses field.text_unit_type - confirm intended.
                unit = TextUnit.objects.filter(
                    document=doc,
                    unit_type='sentence',
                    location_start__lte=start,
                    location_end__gte=start).first()  # type: TextUnit
                if not unit:
                    raise RuntimeError(
                        'Python coded field {0} detected a value in document {1} at '
                        'location [{2};{3}] but the start of location does not belong to any '
                        'text unit object in DB.\n'
                        'It can not be. Something is broken.'.format(
                            field.python_coded_field, doc, start, end))
                # Re-base the document-wide location onto the text unit.
                relative_start = start - unit.location_start
                relative_end = relative_start + (end - start)
                found.append(
                    DetectedFieldValue(field, value, unit, None,
                                       relative_start, relative_end))
                if not field_type_adapter.multi_value \
                        and not field.is_choice_field():
                    return found
            return found

        units = (TextUnit.objects
                 .filter(document=doc)
                 .filter(unit_type=field.text_unit_type)
                 .order_by('location_start', 'pk'))

        for unit in units.iterator():
            for value, start, end in coded_field.get_values(unit.text) or []:
                found.append(
                    DetectedFieldValue(field, value, unit, None, start, end))
                if not field_type_adapter.multi_value \
                        and not field.is_choice_field():
                    return found

        return found
Пример #4
0
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect values of ``field`` in ``doc`` using stop words, then field detectors.

        The full document text is first passed to
        ``detect_with_stop_words_by_field_and_full_text``; if that reports a
        detection its values (or an empty list) are returned immediately.
        Otherwise every text unit of type ``field.text_unit_type`` is tested
        against the detectors configured for the document's type and field.

        ``cls`` and ``log`` are not used by this implementation.

        :return: detected values; scanning stops after the first one when the
            field type is neither multi-value nor a choice field.
        """
        full_text = doc.full_text
        stop_words_hit, detected_values = detect_with_stop_words_by_field_and_full_text(
            field, full_text)
        if stop_words_hit:
            return detected_values or []

        units = (TextUnit.objects
                 .filter(document=doc)
                 .filter(unit_type=field.text_unit_type)
                 .order_by('location_start', 'pk'))
        document_type = doc.document_type

        field_detectors = DocumentFieldDetector.objects.filter(
            document_type=document_type, field=field)

        field_type_adapter = FIELD_TYPES_REGISTRY.get(
            field.type)  # type: FieldType

        detected_values = []  # type: List[DetectedFieldValue]

        for unit in units.iterator():
            for detector in field_detectors:
                if not detector.matches(unit.text):
                    continue

                value = detector.detected_value
                hint_name = None
                if field_type_adapter.value_aware:
                    hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, unit.text)
                    if value is None:
                        continue

                detected_values.append(
                    DetectedFieldValue(field, value, unit, hint_name))

                # Single-value, non-choice fields need only the first hit.
                if not field_type_adapter.multi_value \
                        and not field.is_choice_field():
                    break

            # Leave the text-unit loop too once a single-value field is filled.
            if detected_values \
                    and not field_type_adapter.multi_value \
                    and not field.is_choice_field():
                break

        return detected_values
Пример #5
0
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Detect values of ``field`` in ``doc`` using stop words, then regexp detectors.

        The full document text is first passed to
        ``detect_with_stop_words_by_field_and_full_text``; if that reports a
        detection its values (or an empty list) are returned immediately.
        Otherwise text units and field detectors are fetched through the
        repositories on ``RegexpsOnlyFieldDetectionStrategy`` and each unit is
        tested with a ``DetectorFieldMatcher`` per detector.

        ``cls``, ``log`` and ``cached_fields`` are not used by this
        implementation.

        :return: detected values; scanning stops after the first one when the
            field type is neither multi-value nor a choice field.
        """
        full_text = doc.full_text
        stop_words_hit, detected_values = detect_with_stop_words_by_field_and_full_text(
            field, full_text)
        if stop_words_hit:
            return detected_values or []

        units = RegexpsOnlyFieldDetectionStrategy.text_unit_repo \
            .get_doc_text_units(doc, field.text_unit_type)

        raw_detectors = RegexpsOnlyFieldDetectionStrategy.field_detector_repo \
            .get_field_detectors(field)
        matchers = [DetectorFieldMatcher(detector) for detector in raw_detectors]

        field_type_adapter = field.get_field_type()  # type: FieldType

        detected_values = []  # type: List[DetectedFieldValue]

        for unit in units:  # type: TextUnit
            for matcher in matchers:
                matched_text = matcher.matching_string(
                    unit.text, text_is_sentence=unit.is_sentence())
                if matched_text is None:
                    continue

                value = matcher.get_validated_detected_value(field)
                hint_name = None
                if field_type_adapter.requires_value:
                    hint_name = matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, matched_text)
                    if value is None:
                        continue

                detected_values.append(
                    DetectedFieldValue(field, value, unit, hint_name))

                # Single-value, non-choice fields need only the first hit.
                if not field_type_adapter.multi_value \
                        and not field.is_choice_field():
                    break

            # Leave the text-unit loop too once a single-value field is filled.
            if detected_values \
                    and not field_type_adapter.multi_value \
                    and not field.is_choice_field():
                break

        return detected_values
Пример #6
0
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Detect values of ``field`` in ``doc`` using stop words, then field detectors.

        The full document text is first passed to
        ``detect_with_stop_words_by_field_and_full_text``; if that reports a
        detection its values (or an empty list) are returned immediately.
        Otherwise every text unit of type ``field.text_unit_type`` (ordered by
        ``location_start``, ``pk``) is tested against all
        ``DocumentFieldDetector`` rows of the field.

        ``cls``, ``log`` and ``cached_fields`` are not used by this
        implementation.

        :return: detected values; scanning stops after the first one when the
            field type is neither multi-value nor a choice field.
        """
        full_text = doc.full_text
        stop_words_hit, detected_values = detect_with_stop_words_by_field_and_full_text(
            field, full_text)
        if stop_words_hit:
            return detected_values or []

        units = (TextUnit.objects
                 .filter(document=doc)
                 .filter(unit_type=field.text_unit_type)
                 .order_by('location_start', 'pk'))

        field_detectors = DocumentFieldDetector.objects.filter(field=field)

        field_type_adapter = field.get_field_type()  # type: FieldType

        detected_values = []  # type: List[DetectedFieldValue]

        for unit in units.iterator():  # type: TextUnit
            for detector in field_detectors:
                matched_text = detector.matching_string(
                    unit.text, text_is_sentence=unit.is_sentence())
                if matched_text is None:
                    continue

                value = detector.get_validated_detected_value(field)
                hint_name = None
                if field_type_adapter.requires_value:
                    hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, matched_text)
                    if value is None:
                        continue

                detected_values.append(
                    DetectedFieldValue(field, value, unit, hint_name))

                # Single-value, non-choice fields need only the first hit.
                if not field_type_adapter.multi_value \
                        and not field.is_choice_field():
                    break

            # Leave the text-unit loop too once a single-value field is filled.
            if detected_values \
                    and not field_type_adapter.multi_value \
                    and not field.is_choice_field():
                break

        return detected_values
Пример #7
0
    def detect_field_values_with_model(classifier_model, document: Document,
                                       field: DocumentField,
                                       sentence_text_units: List[TextUnit],
                                       do_not_write: bool) -> int:
        """Detect values of ``field`` with a trained classifier and save them.

        Each unit in ``sentence_text_units`` is passed to
        ``DetectFieldValues.predict_and_extract_value``; units for which the
        returned value is ``None`` are skipped. For a field type that is
        neither multi-value nor a choice field, scanning stops after the
        first detected value.

        :param classifier_model: object providing ``get_trained_model_obj()``.
        :param do_not_write: forwarded to ``save_detected_values``.
        :return: whatever ``DetectFieldValues.save_detected_values`` returns.
        """
        trained_model = classifier_model.get_trained_model_obj()
        field_type_adapter = FIELD_TYPES_REGISTRY[field.type]

        found = []  # type: List[DetectedFieldValue]
        for unit in sentence_text_units:
            value, hint_name = DetectFieldValues.predict_and_extract_value(
                sklearn_model=trained_model,
                field_type_adapter=field_type_adapter,
                document=document,
                field=field,
                text_unit=unit)
            if value is not None:
                found.append(DetectedFieldValue(unit, value, hint_name))
                if not field_type_adapter.multi_value \
                        and not field.is_choice_field():
                    break

        return DetectFieldValues.save_detected_values(
            document, field, field_type_adapter, found, do_not_write)
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect values of ``field`` in ``doc`` with the stored classifier model.

        Loads the ``ClassifierModel`` registered for the document's type and
        the field, then runs ``cls.predict_and_extract_value`` over the
        document's text units of type ``field.text_unit_type`` (ordered by
        ``location_start``, ``pk``). Units whose prediction is ``None`` are
        skipped.

        :param log: logger used to report a missing classifier model.
        :param doc: document to scan.
        :param field: field to detect values for.
        :return: detected values; only the first one when the field type is
            neither multi-value nor a choice field.
        :raises ClassifierModel.DoesNotExist: when no model is stored for the
            document type and field (logged, then re-raised).
        """
        document_type = doc.document_type  # type: DocumentType

        # Only the lookup can legitimately report a missing model, so it is
        # the only statement guarded by the handler.
        try:
            classifier_model = ClassifierModel.objects \
                .get(document_type=document_type, document_field=field)
        except ClassifierModel.DoesNotExist:
            log.info('Classifier model does not exist for field: {0}'.format(
                field.code))
            # Bare raise re-raises the active exception unchanged.
            raise

        sklearn_model = classifier_model.get_trained_model_obj()
        field_type_adapter = FIELD_TYPES_REGISTRY[field.type]

        detected_values = []  # type: List[DetectedFieldValue]

        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        for text_unit in qs_text_units.iterator():
            detected_value = cls.predict_and_extract_value(
                sklearn_model=sklearn_model,
                field_type_adapter=field_type_adapter,
                document=doc,
                field=field,
                text_unit=text_unit)
            if detected_value is None:
                continue
            detected_values.append(detected_value)
            # Single-value, non-choice fields need only the first hit.
            if not (field_type_adapter.multi_value
                    or field.is_choice_field()):
                break

        return detected_values
Пример #9
0
    def save_detected_values(document: Document, field: DocumentField,
                             field_type_adapter: FieldType,
                             detected_values: List[DetectedFieldValue],
                             do_not_write: bool) -> int:
        """Persist detected values of ``field`` and report how many were accepted.

        For a choice field whose type is not multi-value, the choices from
        ``field.get_choice_values()`` are walked in order and only the first
        detected value equal to a choice is saved. For every other field all
        detected values are saved.

        ``document.cache_field_values()`` is always called once saving was
        attempted, even if saving raises.

        :param document: document the values belong to.
        :param field: field the values were detected for.
        :param field_type_adapter: adapter whose ``save_value`` stores a value.
        :param detected_values: values to store; an empty list is a no-op.
        :param do_not_write: when true nothing is stored, but the count is
            still computed and the document cache is still refreshed.
        :return: number of values accepted: 0 for empty input or when no
            detected value matches any choice, 1 for a matched single choice,
            otherwise ``len(detected_values)``.
        """
        if not detected_values:
            return 0

        def store(dv):
            """Save one detected value unless writing is disabled."""
            if do_not_write:
                return
            field_type_adapter.save_value(
                document,
                field,
                dv.get_annotation_start(),
                dv.get_annotation_end(),
                dv.get_annotation_text(),
                dv.text_unit,
                dv.value,
                user=None,
                allow_overwriting_user_data=False,
                extraction_hint=dv.hint_name)

        try:
            if field.is_choice_field() and not field_type_adapter.multi_value:
                # Choice order decides which detected value wins.
                for choice_value in field.get_choice_values():
                    for dv in detected_values:
                        if choice_value == dv.value:
                            store(dv)
                            return 1
                # No detected value matched a choice: report zero explicitly
                # instead of falling through and returning None.
                return 0

            for dv in detected_values:
                store(dv)
            return len(detected_values)
        finally:
            document.cache_field_values()
Пример #10
0
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Detect values of ``field`` in ``doc`` using stop words, then the classifier.

        The full document text is first passed to
        ``detect_with_stop_words_by_field_and_full_text``; if that reports a
        detection its values (or an empty list) are returned immediately.
        Otherwise the ``ClassifierModel`` stored for the field is loaded and
        ``cls.predict_and_extract_value`` is run over the document's text
        units of type ``field.text_unit_type`` (ordered by ``location_start``,
        ``pk``). Units whose prediction is ``None`` are skipped.

        ``cached_fields`` is not used by this implementation.

        :param log: logger used to report a missing classifier model.
        :return: detected values; only the first one when the field type is
            neither multi-value nor a choice field.
        :raises ClassifierModel.DoesNotExist: when no model is stored for the
            field (logged, then re-raised).
        """
        depends_on_full_text = doc.full_text
        detected_with_stop_words, detected_values = detect_with_stop_words_by_field_and_full_text(
            field, depends_on_full_text)
        if detected_with_stop_words:
            return detected_values or []

        # Only the lookup can legitimately report a missing model, so it is
        # the only statement guarded by the handler.
        try:
            classifier_model = ClassifierModel.objects.get(
                document_field=field)
        except ClassifierModel.DoesNotExist:
            log.info('Classifier model does not exist for field: {0}'.format(
                field.code))
            # Bare raise re-raises the active exception unchanged.
            raise

        sklearn_model = classifier_model.get_trained_model_obj()
        field_type_adapter = field.get_field_type()

        detected_values = []  # type: List[DetectedFieldValue]

        qs_text_units = TextUnit.objects \
            .filter(document=doc) \
            .filter(unit_type=field.text_unit_type) \
            .order_by('location_start', 'pk')

        for text_unit in qs_text_units.iterator():
            detected_value = cls.predict_and_extract_value(
                sklearn_model=sklearn_model,
                field_type_adapter=field_type_adapter,
                document=doc,
                field=field,
                text_unit=text_unit)
            if detected_value is None:
                continue
            detected_values.append(detected_value)
            # Single-value, non-choice fields need only the first hit.
            if not (field_type_adapter.multi_value
                    or field.is_choice_field()):
                break

        return detected_values
Пример #11
0
    def detect_field_values_with_regexps(document: Document,
                                         field: DocumentField,
                                         sentence_text_units: List[TextUnit],
                                         do_not_write: bool) -> int:
        """Detect values of ``field`` with its field detectors and save them.

        Every unit in ``sentence_text_units`` is tested against the
        ``DocumentFieldDetector`` rows configured for the document's type and
        the field. For value-aware field types the final value is obtained via
        ``field_type_adapter.get_or_extract_value``; a ``None`` result discards
        the match.

        For a field type that is neither multi-value nor a choice field,
        scanning stops at the first detected value, consistent with the other
        detection routines in this file.

        :param do_not_write: forwarded to ``save_detected_values``.
        :return: whatever ``DetectFieldValues.save_detected_values`` returns.
        """
        document_type = document.document_type
        field_detectors = DocumentFieldDetector.objects.filter(
            document_type=document_type, field=field)
        field_type_adapter = FIELD_TYPES_REGISTRY.get(
            field.type)  # type: FieldType

        detected_values = []  # type: List[DetectedFieldValue]

        for text_unit in sentence_text_units:

            for field_detector in field_detectors:
                if not field_detector.matches(text_unit.text):
                    continue

                value = field_detector.detected_value
                hint_name = None
                if field_type_adapter.value_aware:
                    hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter \
                        .get_or_extract_value(document,
                                              field, value,
                                              hint_name,
                                              text_unit.text)
                    if value is None:
                        continue

                detected_values.append(
                    DetectedFieldValue(text_unit, value, hint_name))

                if not (field_type_adapter.multi_value
                        or field.is_choice_field()):
                    break

            # The break above only leaves the detector loop; without this
            # check a single-value field would collect one value per text unit.
            if detected_values and not (field_type_adapter.multi_value
                                        or field.is_choice_field()):
                break

        return DetectFieldValues.save_detected_values(document, field,
                                                      field_type_adapter,
                                                      detected_values,
                                                      do_not_write)
def save_detected_values(document: Document, field: DocumentField,
                         detected_values: List[DetectedFieldValue]) -> int:
    """Persist detected values of ``field`` and report how many were accepted.

    For a choice field whose type is not multi-value, the choices from
    ``field.get_choice_values()`` are walked in order and only the first
    detected value equal to a choice is saved. For every other field all
    detected values are saved.

    Each value is saved with its own ``user``; overwriting user data is
    allowed only when that user is set.

    :param document: document the values belong to.
    :param field: field the values were detected for.
    :param detected_values: values to store; an empty list is a no-op.
    :return: number of values accepted: 0 for empty input or when no detected
        value matches any choice, 1 for a matched single choice, otherwise
        ``len(detected_values)``.
    """
    if not detected_values:
        return 0

    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]  # type: FieldType

    def store(dv):
        """Save one detected value through the field type adapter."""
        field_type_adapter.save_value(
            document,
            field,
            dv.get_annotation_start(),
            dv.get_annotation_end(),
            dv.get_annotation_text(),
            dv.text_unit,
            dv.value,
            user=dv.user,
            allow_overwriting_user_data=dv.user is not None,
            extraction_hint=dv.hint_name)

    if field.is_choice_field() and not field_type_adapter.multi_value:
        # Choice order decides which detected value wins.
        for choice_value in field.get_choice_values():
            for dv in detected_values:
                if choice_value == dv.value:
                    store(dv)
                    return 1
        # No detected value matched a choice: report zero explicitly instead
        # of falling through and returning None.
        return 0

    for dv in detected_values:
        store(dv)
    return len(detected_values)
    def detect_field_values(
            cls, log: ProcessLogger, doc: Document, field: DocumentField,
            cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
        """Detect values of ``field`` with its registered python-coded detector.

        When the detector has ``detect_per_text_unit`` set, it is applied to
        each text unit of type ``field.text_unit_type`` (ordered by
        ``location_start``, ``pk``). Otherwise it is applied to
        ``doc.full_text``; a value with both location bounds is attached to
        the text unit containing its start, with the location re-based onto
        that unit, and a value without a complete location is recorded with no
        text unit and no location.

        ``cls`` and ``cached_fields`` are not used by this implementation.

        :return: detected values; only the first one when the field type is
            neither multi-value nor a choice field.
        :raises RuntimeError: if the detector is not registered, if the field
            requires text annotations but a value has no complete location, or
            if a location does not start inside any text unit.
        """
        coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
            field.python_coded_field)  # type: PythonCodedField
        if not coded_field:
            raise RuntimeError('Unknown python-coded field: {0}'.format(
                field.python_coded_field))
        field_type_adapter = field.get_field_type()  # type: FieldType

        found = []  # type: List[DetectedFieldValue]

        if not coded_field.detect_per_text_unit:
            for value, start, end \
                    in coded_field.get_values(log, field, doc, doc.full_text) or []:
                has_location = start is not None and end is not None
                if field.requires_text_annotations and not has_location:
                    raise RuntimeError(
                        'Python coded field {0} detected a value in document {1} at '
                        'undefined location but the field requires text annotation (and location).\n'
                        'This should not happen. Something is broken.'.format(
                            field.python_coded_field, doc))

                if has_location:
                    unit = TextUnit.objects.filter(
                        document=doc,
                        unit_type=field.text_unit_type,
                        location_start__lte=start,
                        location_end__gte=start).first()  # type: TextUnit
                    if not unit:
                        raise RuntimeError(
                            'Python coded field {0} detected a value in document {1} at '
                            'location [{2};{3}] but the start of location does not belong to any '
                            'text unit object in DB.\n'
                            'This should not happen. Something is broken.'.
                            format(field.python_coded_field, doc, start, end))
                    # Re-base the document-wide location onto the text unit.
                    relative_start = start - unit.location_start
                    relative_end = relative_start + (end - start)
                else:
                    unit = None
                    relative_start = None
                    relative_end = None

                found.append(
                    DetectedFieldValue(field, value, unit, None,
                                       relative_start, relative_end))
                if not field_type_adapter.multi_value \
                        and not field.is_choice_field():
                    return found
            return found

        units = (TextUnit.objects
                 .filter(document=doc)
                 .filter(unit_type=field.text_unit_type)
                 .order_by('location_start', 'pk'))

        for unit in units.iterator():
            for value, start, end \
                    in coded_field.get_values(log, field, doc, unit.text) or []:
                found.append(
                    DetectedFieldValue(field, value, unit, None, start, end))
                if not field_type_adapter.multi_value \
                        and not field.is_choice_field():
                    return found

        return found
Пример #14
0
    def detect_field_values(cls, log: ProcessLogger, doc: Document,
                            field: DocumentField) -> List[DetectedFieldValue]:
        """Detect values of ``field`` from the values of the fields it depends on.

        The stored, not user-removed values of ``field.depends_on_fields`` are
        merged into python values. If the field has stop words, the values are
        joined into one text and passed to
        ``detect_with_stop_words_by_field_and_full_text``; a reported detection
        is returned immediately. Otherwise each non-empty value is converted
        to ``str`` and tested against all ``DocumentFieldDetector`` rows of
        the field.

        ``cls`` and ``log`` are not used by this implementation.

        :return: detected values; scanning stops after the first one when the
            field type is neither multi-value nor a choice field.
        """
        dependencies = list(field.depends_on_fields.all())

        stored_values = doc.documentfieldvalue_set \
            .filter(removed_by_user=False) \
            .filter(field__in=dependencies)

        merged = merge_document_field_values_to_python_value(
            list(stored_values))

        # One entry per dependency, keyed by the dependency's code.
        value_by_code = {dep.code: merged.get(dep.code) for dep in dependencies}

        if field.stop_words:
            # NOTE(review): every value goes through str(), so a missing value
            # appears as the literal text 'None' here - confirm intended.
            joined_text = '\n'.join(
                [str(item) for item in value_by_code.values()])
            stop_words_hit, detected_values \
                = detect_with_stop_words_by_field_and_full_text(field, joined_text)
            if stop_words_hit:
                return detected_values or []

        field_detectors = DocumentFieldDetector.objects.filter(field=field)
        field_type_adapter = FIELD_TYPES_REGISTRY.get(
            field.type)  # type: FieldType

        detected_values = []  # type: List[DetectedFieldValue]

        for raw_value in value_by_code.values():
            if not raw_value:
                continue
            source_text = str(raw_value)

            for detector in field_detectors:  # type: DocumentFieldDetector
                matched_text = detector.matching_string(
                    source_text, text_is_sentence=False)
                if matched_text is None:
                    continue

                value = detector.get_validated_detected_value(field)
                hint_name = None
                if field_type_adapter.requires_value:
                    hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                    value, hint_name = field_type_adapter.get_or_extract_value(
                        doc, field, value, hint_name, matched_text)
                    if value is None:
                        continue

                detected_values.append(
                    DetectedFieldValue(field, value, None, hint_name))

                # Single-value, non-choice fields need only the first hit.
                if not field_type_adapter.multi_value \
                        and not field.is_choice_field():
                    break

            # Leave the outer loop too once a single-value field has its value.
            if detected_values \
                    and not field_type_adapter.multi_value \
                    and not field.is_choice_field():
                break

        return detected_values
Пример #15
0
def train_model(document_type: DocumentType, field: DocumentField, train_data_sets: List[dict]) -> ClassifierModel:
    """Train a text classifier for ``field`` and wrap it in a ``ClassifierModel``.

    Steps:

    1. Combine all ``train_data_sets`` into one frame and label every row with
       a category name built by ``encode_category`` (the row value is part of
       the category only for choice fields).
    2. Add the texts from ``get_no_field_text_units`` as examples of the empty
       category (index 0, name ``SkLearnClassifierModel.EMPTY_CAT_NAME``).
    3. Cap every category at ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` rows,
       preferring rows that have ``created_by`` set, then shuffle.
    4. Fit a CountVectorizer -> TfidfTransformer -> SGDClassifier pipeline on
       80% of the rows and build accuracy reports for the 20% hold-out
       ("out of sample") and for a 20% slice of the training part
       ("in sample").

    The returned model is not saved here.

    :param document_type: document type the model is trained for.
    :param field: field the model is trained for.
    :param train_data_sets: non-empty list of record collections accepted by
        ``pandas.DataFrame.from_records``; the list itself is not modified.
    :return: a populated ``ClassifierModel`` instance.
    :raises IndexError: if ``train_data_sets`` is empty.
    """
    # Index and slice instead of pop(0) so the caller's list stays intact.
    frames = [pd.DataFrame.from_records(train_data_sets[0])]
    # add transferred external data
    frames.extend(pd.DataFrame.from_records(train_data)
                  for train_data in train_data_sets[1:])
    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    df = pd.concat(frames)

    is_choice_field = field.is_choice_field()
    df['target_name'] = df.apply(lambda row: encode_category(
        field.pk,
        row.value if is_choice_field else None,
        row.extraction_hint), axis=1)

    df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

    # Rows without target columns become the empty category below.
    no_field_rows = [{'text_unit__text': text}
                     for text in get_no_field_text_units(document_type, field)]
    if no_field_rows:
        df = pd.concat([df, pd.DataFrame(no_field_rows)])

    df['target_index'] = df['target_index'].fillna(0).astype('int')
    df['target_name'] = df['target_name'].fillna(SkLearnClassifierModel.EMPTY_CAT_NAME).astype(
        'str')
    df['user_input'] = df['created_by'].fillna(0).astype('bool')

    sampled_groups = []
    for _, group_df in df.groupby('target_index'):
        if group_df.shape[0] > settings.ML_TRAIN_DATA_SET_GROUP_LEN:
            # Keep user-entered rows first when the group has to be truncated.
            group_df = shuffle(
                group_df.sort_values('user_input', ascending=False)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
        sampled_groups.append(group_df)
    res_df = shuffle(pd.concat(sampled_groups) if sampled_groups else pd.DataFrame())

    target_names = sorted(res_df['target_name'].unique())

    text_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode', analyzer='word',
                                                  stop_words='english',
                                                  tokenizer=word_position_tokenizer)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, max_iter=5, tol=None, n_jobs=-1,
                                               class_weight='balanced')),
                         ])
    x = res_df['text_unit__text']
    y = res_df['target_index']

    x_train, x_test_os, y_train, y_test_os = train_test_split(x, y, test_size=0.2, random_state=42)
    # The in-sample test set is a slice of the data the model is fitted on.
    _x_train, x_test_is, _y_train, y_test_is = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

    sklearn_model = text_clf.fit(x_train, y_train)

    model = SkLearnClassifierModel(sklearn_model=sklearn_model, target_names=target_names)

    classifier_model = ClassifierModel()

    classifier_model.set_trained_model_obj(model)
    classifier_model.document_field = field
    classifier_model.document_type = document_type

    predicted_os = text_clf.predict(x_test_os)
    predicted_is = text_clf.predict(x_test_is)

    classifier_model.classifier_accuracy_report_out_of_sample = classification_report(y_test_os,
                                                                                      predicted_os,
                                                                                      target_names=target_names)
    classifier_model.classifier_accuracy_report_in_sample = classification_report(y_test_is,
                                                                                  predicted_is,
                                                                                  target_names=target_names)

    return classifier_model