def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """
    Detect values of ``field`` by running its regexp detectors over the
    current values of the fields it depends on.

    The stored values of ``field.depends_on_fields`` (excluding those removed
    by the user) are merged into python values, stringified and matched
    against every detector configured for the document type and field.

    :param log: process logger (not used by this implementation).
    :param doc: document whose stored field values are inspected.
    :param field: field to detect values for.
    :return: detected values; each is built with ``None`` as its third
        constructor argument. For fields that are neither multi-value nor
        choice fields at most one value is returned.
    """
    source_fields = list(field.depends_on_fields.all())

    stored_values = list(
        doc.documentfieldvalue_set
        .filter(removed_by_user=False)
        .filter(field__in=source_fields))
    merged_values = merge_document_field_values_to_python_value(stored_values)

    # One entry per dependency; dependencies without a stored value map to None.
    field_code_to_value = {
        source_field.code: merged_values.get(source_field.code)
        for source_field in source_fields
    }

    field_detectors = DocumentFieldDetector.objects.filter(
        document_type=doc.document_type, field=field)
    field_type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

    def _first_value_is_enough() -> bool:
        # Evaluated lazily, only once something has actually been detected.
        return not (field_type_adapter.multi_value or field.is_choice_field())

    detected_values = []  # type: List[DetectedFieldValue]

    for raw_value in field_code_to_value.values():
        if not raw_value:
            continue
        source_text = str(raw_value)

        for detector in field_detectors:
            if not detector.matches(source_text):
                continue

            value = detector.detected_value
            hint_name = None
            if field_type_adapter.value_aware:
                hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                value, hint_name = field_type_adapter.get_or_extract_value(
                    doc, field, value, hint_name, source_text)
                if value is None:
                    continue

            detected_values.append(DetectedFieldValue(field, value, None, hint_name))

            if _first_value_is_enough():
                break

        # Leave the outer loop as well once a single-value field got its value.
        if detected_values and _first_value_is_enough():
            break

    return detected_values
def detect_field_values_for_python_coded_field(
        document: Document,
        field: DocumentField,
        sentence_text_units: List[TextUnit],
        do_not_write: bool) -> int:
    """
    Detect values of a python-coded field and hand them to
    ``DetectFieldValues.save_detected_values``.

    If the python-coded field works ``by_sentence`` it is run on each of the
    given text units; otherwise it is run once on the full document text and
    every found location is mapped to the sentence text unit containing its
    start, with the location converted to be relative to that text unit.

    :param document: document being processed.
    :param field: field whose ``python_coded_field`` names the detector.
    :param sentence_text_units: text units scanned in the by-sentence mode.
    :param do_not_write: passed through to ``save_detected_values``.
    :return: whatever ``DetectFieldValues.save_detected_values`` returns.
    :raises RuntimeError: if the python-coded field is not registered, or a
        full-text detection starts outside of every sentence text unit.
    """
    python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
        field.python_coded_field)  # type: PythonCodedField
    if not python_coded_field:
        raise RuntimeError('Unknown python-coded field: {0}'.format(field.python_coded_field))

    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]  # type: FieldType
    detected_values = []  # type: List[DetectedFieldValue]

    def _save():
        return DetectFieldValues.save_detected_values(
            document, field, field_type_adapter, detected_values, do_not_write)

    def _first_value_is_enough() -> bool:
        return not (field_type_adapter.multi_value or field.is_choice_field())

    if python_coded_field.by_sentence:
        for text_unit in sentence_text_units:
            found = python_coded_field.get_values(text_unit.text) or []
            for value, start, end in found:
                detected_values.append(DetectedFieldValue(text_unit, value, None, start, end))
                if _first_value_is_enough():
                    return _save()
        return _save()

    found_in_document = python_coded_field.get_values(document.full_text) or []
    for value, doc_start, doc_end in found_in_document:
        text_unit = TextUnit.objects.filter(
            document=document,
            unit_type='sentence',
            location_start__lte=doc_start,
            location_end__gte=doc_start).first()  # type: TextUnit
        if not text_unit:
            raise RuntimeError(
                'Python coded field {0} detected a value in document {1} at '
                'location [{2};{3}] but the start of location does not belong to any '
                'text unit object in DB.\n'
                'It can not be. Something is broken.'.format(
                    field.python_coded_field, document, doc_start, doc_end))

        # Convert document-wide coordinates into coordinates inside the text unit.
        unit_start = doc_start - text_unit.location_start
        unit_end = unit_start + (doc_end - doc_start)

        detected_values.append(DetectedFieldValue(text_unit, value, None, unit_start, unit_end))
        if _first_value_is_enough():
            return _save()

    return _save()
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """
    Detect values of ``field`` with its registered python-coded detector.

    In ``by_sentence`` mode the detector is applied to every text unit of the
    field's text unit type, ordered by location. Otherwise it is applied to
    the full document text and each found location is attached to the
    sentence text unit containing its start, re-based to that text unit.

    :param log: process logger (not used by this implementation).
    :param doc: document to search in.
    :param field: field whose ``python_coded_field`` names the detector.
    :return: detected values; a single-element list as soon as the first value
        is found for fields that are neither multi-value nor choice fields.
    :raises RuntimeError: if the python-coded field is not registered, or a
        full-text detection starts outside of every sentence text unit.
    """
    python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
        field.python_coded_field)  # type: PythonCodedField
    if not python_coded_field:
        raise RuntimeError('Unknown python-coded field: {0}'.format(field.python_coded_field))

    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]  # type: FieldType
    detected_values = []  # type: List[DetectedFieldValue]

    def _first_value_is_enough() -> bool:
        return not (field_type_adapter.multi_value or field.is_choice_field())

    if python_coded_field.by_sentence:
        text_units = (TextUnit.objects
                      .filter(document=doc)
                      .filter(unit_type=field.text_unit_type)
                      .order_by('location_start', 'pk'))
        for text_unit in text_units.iterator():
            found = python_coded_field.get_values(text_unit.text) or []
            for value, start, end in found:
                detected_values.append(
                    DetectedFieldValue(field, value, text_unit, None, start, end))
                if _first_value_is_enough():
                    return detected_values
        return detected_values

    found_in_document = python_coded_field.get_values(doc.full_text) or []
    for value, doc_start, doc_end in found_in_document:
        text_unit = TextUnit.objects.filter(
            document=doc,
            unit_type='sentence',
            location_start__lte=doc_start,
            location_end__gte=doc_start).first()  # type: TextUnit
        if not text_unit:
            raise RuntimeError(
                'Python coded field {0} detected a value in document {1} at '
                'location [{2};{3}] but the start of location does not belong to any '
                'text unit object in DB.\n'
                'It can not be. Something is broken.'.format(
                    field.python_coded_field, doc, doc_start, doc_end))

        # Convert document-wide coordinates into coordinates inside the text unit.
        unit_start = doc_start - text_unit.location_start
        unit_end = unit_start + (doc_end - doc_start)

        detected_values.append(
            DetectedFieldValue(field, value, text_unit, None, unit_start, unit_end))
        if _first_value_is_enough():
            return detected_values

    return detected_values
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """
    Detect values of ``field`` using stop words first and regexp detectors
    second.

    If ``detect_with_stop_words_by_field_and_full_text`` reports a hit on the
    full document text, its values (or an empty list) are returned right away.
    Otherwise every text unit of the field's text unit type is matched, in
    location order, against the detectors configured for the document type.

    :param log: process logger (not used by this implementation).
    :param doc: document to search in.
    :param field: field to detect values for.
    :return: detected values; at most one for fields that are neither
        multi-value nor choice fields.
    """
    stop_words_hit, stop_words_values = detect_with_stop_words_by_field_and_full_text(
        field, doc.full_text)
    if stop_words_hit:
        return stop_words_values or list()

    text_units = (TextUnit.objects
                  .filter(document=doc)
                  .filter(unit_type=field.text_unit_type)
                  .order_by('location_start', 'pk'))

    field_detectors = DocumentFieldDetector.objects.filter(
        document_type=doc.document_type, field=field)
    field_type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

    def _first_value_is_enough() -> bool:
        # Evaluated lazily, only once something has actually been detected.
        return not (field_type_adapter.multi_value or field.is_choice_field())

    detected_values = []  # type: List[DetectedFieldValue]

    for text_unit in text_units.iterator():
        for detector in field_detectors:
            if not detector.matches(text_unit.text):
                continue

            value = detector.detected_value
            hint_name = None
            if field_type_adapter.value_aware:
                hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                value, hint_name = field_type_adapter.get_or_extract_value(
                    doc, field, value, hint_name, text_unit.text)
                if value is None:
                    continue

            detected_values.append(DetectedFieldValue(field, value, text_unit, hint_name))

            if _first_value_is_enough():
                break

        # Stop scanning further text units once a single-value field got its value.
        if detected_values and _first_value_is_enough():
            break

    return detected_values
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """
    Detect values of ``field`` using stop words first and regexp detectors
    second, reading text units and detectors through the repositories of
    ``RegexpsOnlyFieldDetectionStrategy``.

    If the stop-word helper reports a hit on the full document text, its
    values (or an empty list) are returned right away. Otherwise each text
    unit is matched against each detector wrapped in ``DetectorFieldMatcher``;
    the matched string is what gets passed to the field type for value
    extraction.

    :param log: process logger (not used by this implementation).
    :param doc: document to search in.
    :param field: field to detect values for.
    :param cached_fields: not used by this implementation.
    :return: detected values; at most one for fields that are neither
        multi-value nor choice fields.
    """
    stop_words_hit, stop_words_values = detect_with_stop_words_by_field_and_full_text(
        field, doc.full_text)
    if stop_words_hit:
        return stop_words_values or list()

    strategy = RegexpsOnlyFieldDetectionStrategy
    text_units = strategy.text_unit_repo.get_doc_text_units(doc, field.text_unit_type)
    matchers = [DetectorFieldMatcher(detector)
                for detector in strategy.field_detector_repo.get_field_detectors(field)]
    field_type_adapter = field.get_field_type()  # type: FieldType

    def _first_value_is_enough() -> bool:
        # Evaluated lazily, only once something has actually been detected.
        return not (field_type_adapter.multi_value or field.is_choice_field())

    detected_values = []  # type: List[DetectedFieldValue]

    for text_unit in text_units:  # type: TextUnit
        for matcher in matchers:
            matching_string = matcher.matching_string(
                text_unit.text, text_is_sentence=text_unit.is_sentence())
            if matching_string is None:
                continue

            value = matcher.get_validated_detected_value(field)
            hint_name = None
            if field_type_adapter.requires_value:
                hint_name = matcher.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                value, hint_name = field_type_adapter.get_or_extract_value(
                    doc, field, value, hint_name, matching_string)
                if value is None:
                    continue

            detected_values.append(DetectedFieldValue(field, value, text_unit, hint_name))

            if _first_value_is_enough():
                break

        # Stop scanning further text units once a single-value field got its value.
        if detected_values and _first_value_is_enough():
            break

    return detected_values
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """
    Detect values of ``field`` using stop words first and regexp detectors
    second.

    If the stop-word helper reports a hit on the full document text, its
    values (or an empty list) are returned right away. Otherwise every text
    unit of the field's text unit type is matched, in location order, against
    all detectors of the field; the matched string is what gets passed to the
    field type for value extraction.

    :param log: process logger (not used by this implementation).
    :param doc: document to search in.
    :param field: field to detect values for.
    :param cached_fields: not used by this implementation.
    :return: detected values; at most one for fields that are neither
        multi-value nor choice fields.
    """
    stop_words_hit, stop_words_values = detect_with_stop_words_by_field_and_full_text(
        field, doc.full_text)
    if stop_words_hit:
        return stop_words_values or list()

    text_units = (TextUnit.objects
                  .filter(document=doc)
                  .filter(unit_type=field.text_unit_type)
                  .order_by('location_start', 'pk'))
    field_detectors = DocumentFieldDetector.objects.filter(field=field)
    field_type_adapter = field.get_field_type()  # type: FieldType

    def _first_value_is_enough() -> bool:
        # Evaluated lazily, only once something has actually been detected.
        return not (field_type_adapter.multi_value or field.is_choice_field())

    detected_values = []  # type: List[DetectedFieldValue]

    for text_unit in text_units.iterator():  # type: TextUnit
        for detector in field_detectors:
            matching_string = detector.matching_string(
                text_unit.text, text_is_sentence=text_unit.is_sentence())
            if matching_string is None:
                continue

            value = detector.get_validated_detected_value(field)
            hint_name = None
            if field_type_adapter.requires_value:
                hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                value, hint_name = field_type_adapter.get_or_extract_value(
                    doc, field, value, hint_name, matching_string)
                if value is None:
                    continue

            detected_values.append(DetectedFieldValue(field, value, text_unit, hint_name))

            if _first_value_is_enough():
                break

        # Stop scanning further text units once a single-value field got its value.
        if detected_values and _first_value_is_enough():
            break

    return detected_values
def detect_field_values_with_model(classifier_model,
                                   document: Document,
                                   field: DocumentField,
                                   sentence_text_units: List[TextUnit],
                                   do_not_write: bool) -> int:
    """
    Detect values of ``field`` with a trained classifier and hand them to
    ``DetectFieldValues.save_detected_values``.

    Each text unit is passed to ``DetectFieldValues.predict_and_extract_value``;
    text units for which it yields a ``None`` value are skipped.

    :param classifier_model: object providing ``get_trained_model_obj()``.
    :param document: document being processed.
    :param field: field to detect values for.
    :param sentence_text_units: text units to classify, in the given order.
    :param do_not_write: passed through to ``save_detected_values``.
    :return: whatever ``DetectFieldValues.save_detected_values`` returns.
    """
    sklearn_model = classifier_model.get_trained_model_obj()
    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]

    detected_values = []  # type: List[DetectedFieldValue]

    for text_unit in sentence_text_units:
        value, hint_name = DetectFieldValues.predict_and_extract_value(
            sklearn_model=sklearn_model,
            field_type_adapter=field_type_adapter,
            document=document,
            field=field,
            text_unit=text_unit)

        if value is not None:
            detected_values.append(DetectedFieldValue(text_unit, value, hint_name))
            # Single-value, non-choice fields only need the first prediction.
            if not (field_type_adapter.multi_value or field.is_choice_field()):
                break

    return DetectFieldValues.save_detected_values(
        document, field, field_type_adapter, detected_values, do_not_write)
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """
    Detect values of ``field`` with the classifier model trained for the
    document type and field.

    Every text unit of the field's text unit type is passed, in location
    order, to ``cls.predict_and_extract_value``; ``None`` results are skipped.

    :param log: process logger; receives an info message when no classifier
        model exists.
    :param doc: document to search in.
    :param field: field to detect values for.
    :return: detected values; at most one for fields that are neither
        multi-value nor choice fields.
    :raises ClassifierModel.DoesNotExist: if no classifier model is stored for
        the document type and field (logged, then re-raised).
    """
    document_type = doc.document_type  # type: DocumentType

    # Only the lookup can raise DoesNotExist, so only the lookup is guarded.
    try:
        classifier_model = ClassifierModel.objects.get(
            document_type=document_type, document_field=field)
    except ClassifierModel.DoesNotExist:
        log.info('Classifier model does not exist for field: {0}'.format(field.code))
        raise

    sklearn_model = classifier_model.get_trained_model_obj()
    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]

    detected_values = []  # type: List[DetectedFieldValue]

    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .order_by('location_start', 'pk')

    for text_unit in qs_text_units.iterator():
        detected_value = cls.predict_and_extract_value(
            sklearn_model=sklearn_model,
            field_type_adapter=field_type_adapter,
            document=doc,
            field=field,
            text_unit=text_unit)
        if detected_value is None:
            continue

        detected_values.append(detected_value)

        # Single-value, non-choice fields only need the first prediction.
        if not (field_type_adapter.multi_value or field.is_choice_field()):
            break

    return detected_values
def save_detected_values(document: Document,
                         field: DocumentField,
                         field_type_adapter: FieldType,
                         detected_values: List[DetectedFieldValue],
                         do_not_write: bool) -> int:
    """
    Save detected values of ``field`` through its field type adapter.

    For a choice field that is not multi-value only one value is saved: the
    detected value matching the earliest entry of ``field.get_choice_values()``.
    For every other field all detected values are saved.

    ``document.cache_field_values()`` is called afterwards in all cases except
    for empty input, including when saving raises.

    :param document: document the values belong to.
    :param field: field the values were detected for.
    :param field_type_adapter: adapter providing ``multi_value`` and
        ``save_value``.
    :param detected_values: values to save; nothing is done if empty.
    :param do_not_write: if true, ``save_value`` is never called but the
        returned count is computed as usual.
    :return: number of values selected for saving: 0 for empty input or when
        no detected value is among the choices of a single-value choice field,
        1 for a matched single-value choice field, ``len(detected_values)``
        otherwise.
    """
    if not detected_values:
        return 0

    def _persist(dv) -> None:
        if do_not_write:
            return
        field_type_adapter.save_value(document,
                                      field,
                                      dv.get_annotation_start(),
                                      dv.get_annotation_end(),
                                      dv.get_annotation_text(),
                                      dv.text_unit,
                                      dv.value,
                                      user=None,
                                      allow_overwriting_user_data=False,
                                      extraction_hint=dv.hint_name)

    try:
        if field.is_choice_field() and not field_type_adapter.multi_value:
            # The order of the configured choices decides which value wins.
            for choice_value in field.get_choice_values():
                for dv in detected_values:
                    if choice_value == dv.value:
                        _persist(dv)
                        return 1
            # Nothing matched: report zero saved values instead of falling
            # through and implicitly returning None.
            return 0

        for dv in detected_values:
            _persist(dv)
        return len(detected_values)
    finally:
        document.cache_field_values()
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """
    Detect values of ``field`` using stop words first and the field's trained
    classifier model second.

    If the stop-word helper reports a hit on the full document text, its
    values (or an empty list) are returned right away and no classifier model
    is loaded. Otherwise every text unit of the field's text unit type is
    passed, in location order, to ``cls.predict_and_extract_value``; ``None``
    results are skipped.

    :param log: process logger; receives an info message when no classifier
        model exists.
    :param doc: document to search in.
    :param field: field to detect values for.
    :param cached_fields: not used by this implementation.
    :return: detected values; at most one for fields that are neither
        multi-value nor choice fields.
    :raises ClassifierModel.DoesNotExist: if no classifier model is stored for
        the field (logged, then re-raised).
    """
    depends_on_full_text = doc.full_text
    detected_with_stop_words, detected_values = detect_with_stop_words_by_field_and_full_text(
        field, depends_on_full_text)
    if detected_with_stop_words:
        return detected_values or list()

    # Only the lookup can raise DoesNotExist, so only the lookup is guarded.
    try:
        classifier_model = ClassifierModel.objects.get(document_field=field)
    except ClassifierModel.DoesNotExist:
        log.info('Classifier model does not exist for field: {0}'.format(field.code))
        raise

    sklearn_model = classifier_model.get_trained_model_obj()
    field_type_adapter = field.get_field_type()

    detected_values = []  # type: List[DetectedFieldValue]

    qs_text_units = TextUnit.objects \
        .filter(document=doc) \
        .filter(unit_type=field.text_unit_type) \
        .order_by('location_start', 'pk')

    for text_unit in qs_text_units.iterator():
        detected_value = cls.predict_and_extract_value(
            sklearn_model=sklearn_model,
            field_type_adapter=field_type_adapter,
            document=doc,
            field=field,
            text_unit=text_unit)
        if detected_value is None:
            continue

        detected_values.append(detected_value)

        # Single-value, non-choice fields only need the first prediction.
        if not (field_type_adapter.multi_value or field.is_choice_field()):
            break

    return detected_values
def detect_field_values_with_regexps(document: Document,
                                     field: DocumentField,
                                     sentence_text_units: List[TextUnit],
                                     do_not_write: bool) -> int:
    """
    Detect values of ``field`` with its regexp detectors and hand them to
    ``DetectFieldValues.save_detected_values``.

    Each text unit is matched against every detector configured for the
    document type and field. For fields that are neither multi-value nor
    choice fields detection stops at the first value found, so that at most
    one value is collected for a single-value field.

    :param document: document being processed.
    :param field: field to detect values for.
    :param sentence_text_units: text units to scan, in the given order.
    :param do_not_write: passed through to ``save_detected_values``.
    :return: whatever ``DetectFieldValues.save_detected_values`` returns.
    """
    document_type = document.document_type
    field_detectors = DocumentFieldDetector.objects.filter(
        document_type=document_type, field=field)
    field_type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

    detected_values = []  # type: List[DetectedFieldValue]

    for text_unit in sentence_text_units:
        for field_detector in field_detectors:
            if not field_detector.matches(text_unit.text):
                continue

            value = field_detector.detected_value
            hint_name = None
            if field_type_adapter.value_aware:
                hint_name = field_detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                value, hint_name = field_type_adapter.get_or_extract_value(
                    document, field, value, hint_name, text_unit.text)
                if value is None:
                    continue

            detected_values.append(DetectedFieldValue(text_unit, value, hint_name))

            if not (field_type_adapter.multi_value or field.is_choice_field()):
                break

        # The inner break only leaves the detector loop; without this check a
        # single-value field would keep collecting one value per text unit.
        if detected_values and not (field_type_adapter.multi_value or field.is_choice_field()):
            break

    return DetectFieldValues.save_detected_values(
        document, field, field_type_adapter, detected_values, do_not_write)
def save_detected_values(document: Document,
                         field: DocumentField,
                         detected_values: List[DetectedFieldValue]) -> int:
    """
    Save detected values of ``field`` through its field type adapter.

    For a choice field that is not multi-value only one value is saved: the
    detected value matching the earliest entry of ``field.get_choice_values()``.
    For every other field all detected values are saved. Each value is saved
    on behalf of its own ``user``; overwriting user data is allowed only when
    that user is set.

    :param document: document the values belong to.
    :param field: field the values were detected for.
    :param detected_values: values to save; nothing is done if empty.
    :return: number of saved values: 0 for empty input or when no detected
        value is among the choices of a single-value choice field, 1 for a
        matched single-value choice field, ``len(detected_values)`` otherwise.
    """
    if not detected_values:
        return 0

    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]  # type: FieldType

    def _persist(dv) -> None:
        field_type_adapter.save_value(document,
                                      field,
                                      dv.get_annotation_start(),
                                      dv.get_annotation_end(),
                                      dv.get_annotation_text(),
                                      dv.text_unit,
                                      dv.value,
                                      user=dv.user,
                                      allow_overwriting_user_data=dv.user is not None,
                                      extraction_hint=dv.hint_name)

    if field.is_choice_field() and not field_type_adapter.multi_value:
        # The order of the configured choices decides which value wins.
        for choice_value in field.get_choice_values():
            for dv in detected_values:
                if choice_value == dv.value:
                    _persist(dv)
                    return 1
        # Nothing matched: report zero saved values instead of falling
        # through and implicitly returning None.
        return 0

    for dv in detected_values:
        _persist(dv)
    return len(detected_values)
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField,
                        cached_fields: Dict[str, Any]) -> List[DetectedFieldValue]:
    """
    Detect values of ``field`` with its registered python-coded detector.

    In ``detect_per_text_unit`` mode the detector is applied to every text
    unit of the field's text unit type, ordered by location. Otherwise it is
    applied to the full document text; a value that comes with both location
    bounds is attached to the text unit containing its start (with the
    location re-based to that text unit), a value without them is stored with
    no text unit and no location.

    :param log: process logger, passed on to the python-coded detector.
    :param doc: document to search in.
    :param field: field whose ``python_coded_field`` names the detector.
    :param cached_fields: not used by this implementation.
    :return: detected values; a single-element list as soon as the first value
        is found for fields that are neither multi-value nor choice fields.
    :raises RuntimeError: if the python-coded field is not registered, if a
        value lacks a location although the field requires text annotations,
        or if a location starts outside of every matching text unit.
    """
    python_coded_field = PYTHON_CODED_FIELDS_REGISTRY.get(
        field.python_coded_field)  # type: PythonCodedField
    if not python_coded_field:
        raise RuntimeError('Unknown python-coded field: {0}'.format(field.python_coded_field))

    field_type_adapter = field.get_field_type()  # type: FieldType
    detected_values = []  # type: List[DetectedFieldValue]

    def _first_value_is_enough() -> bool:
        return not (field_type_adapter.multi_value or field.is_choice_field())

    if python_coded_field.detect_per_text_unit:
        text_units = (TextUnit.objects
                      .filter(document=doc)
                      .filter(unit_type=field.text_unit_type)
                      .order_by('location_start', 'pk'))
        for text_unit in text_units.iterator():
            found = python_coded_field.get_values(log, field, doc, text_unit.text) or []
            for value, start, end in found:
                detected_values.append(
                    DetectedFieldValue(field, value, text_unit, None, start, end))
                if _first_value_is_enough():
                    return detected_values
        return detected_values

    found_in_document = python_coded_field.get_values(log, field, doc, doc.full_text) or []
    for value, doc_start, doc_end in found_in_document:
        has_location = doc_start is not None and doc_end is not None

        if field.requires_text_annotations and not has_location:
            raise RuntimeError(
                'Python coded field {0} detected a value in document {1} at '
                'undefined location but the field requires text annotation (and location).\n'
                'This should not happen. Something is broken.'.format(
                    field.python_coded_field, doc))

        if has_location:
            text_unit = TextUnit.objects.filter(
                document=doc,
                unit_type=field.text_unit_type,
                location_start__lte=doc_start,
                location_end__gte=doc_start).first()  # type: TextUnit
            if not text_unit:
                raise RuntimeError(
                    'Python coded field {0} detected a value in document {1} at '
                    'location [{2};{3}] but the start of location does not belong to any '
                    'text unit object in DB.\n'
                    'This should not happen. Something is broken.'.format(
                        field.python_coded_field, doc, doc_start, doc_end))

            # Convert document-wide coordinates into coordinates inside the text unit.
            unit_start = doc_start - text_unit.location_start
            unit_end = unit_start + (doc_end - doc_start)
        else:
            text_unit = None
            unit_start = None
            unit_end = None

        detected_values.append(
            DetectedFieldValue(field, value, text_unit, None, unit_start, unit_end))
        if _first_value_is_enough():
            return detected_values

    return detected_values
def detect_field_values(cls,
                        log: ProcessLogger,
                        doc: Document,
                        field: DocumentField) -> List[DetectedFieldValue]:
    """
    Detect values of ``field`` from the values of the fields it depends on,
    using stop words first and regexp detectors second.

    The stored values of ``field.depends_on_fields`` (excluding those removed
    by the user) are merged into python values. If the field has stop words,
    the stringified values joined by newlines are checked by the stop-word
    helper and its result is returned on a hit. Otherwise each non-empty value
    is stringified and matched against all detectors of the field; the matched
    string is what gets passed to the field type for value extraction.

    :param log: process logger (not used by this implementation).
    :param doc: document whose stored field values are inspected.
    :param field: field to detect values for.
    :return: detected values; each is built with ``None`` as its third
        constructor argument. For fields that are neither multi-value nor
        choice fields at most one value is returned.
    """
    source_fields = list(field.depends_on_fields.all())

    stored_values = list(
        doc.documentfieldvalue_set
        .filter(removed_by_user=False)
        .filter(field__in=source_fields))
    merged_values = merge_document_field_values_to_python_value(stored_values)

    # One entry per dependency; dependencies without a stored value map to None.
    field_code_to_value = {
        source_field.code: merged_values.get(source_field.code)
        for source_field in source_fields
    }

    if field.stop_words:
        # NOTE(review): missing values are stringified too, so the text may
        # contain the literal 'None' - confirm this is intended.
        stop_words_text = '\n'.join([str(v) for v in field_code_to_value.values()])
        stop_words_hit, stop_words_values = detect_with_stop_words_by_field_and_full_text(
            field, stop_words_text)
        if stop_words_hit:
            return stop_words_values or list()

    field_detectors = DocumentFieldDetector.objects.filter(field=field)
    field_type_adapter = FIELD_TYPES_REGISTRY.get(field.type)  # type: FieldType

    def _first_value_is_enough() -> bool:
        # Evaluated lazily, only once something has actually been detected.
        return not (field_type_adapter.multi_value or field.is_choice_field())

    detected_values = []  # type: List[DetectedFieldValue]

    for raw_value in field_code_to_value.values():
        if not raw_value:
            continue
        source_text = str(raw_value)

        for detector in field_detectors:  # type: DocumentFieldDetector
            matching_string = detector.matching_string(source_text, text_is_sentence=False)
            if matching_string is None:
                continue

            value = detector.get_validated_detected_value(field)
            hint_name = None
            if field_type_adapter.requires_value:
                hint_name = detector.extraction_hint or ValueExtractionHint.TAKE_FIRST.name
                value, hint_name = field_type_adapter.get_or_extract_value(
                    doc, field, value, hint_name, matching_string)
                if value is None:
                    continue

            detected_values.append(DetectedFieldValue(field, value, None, hint_name))

            if _first_value_is_enough():
                break

        # Leave the outer loop as well once a single-value field got its value.
        if detected_values and _first_value_is_enough():
            break

    return detected_values
def train_model(document_type: DocumentType,
                field: DocumentField,
                train_data_sets: List[dict]) -> ClassifierModel:
    """
    Train a text classifier for ``field`` and wrap it into an unsaved
    ``ClassifierModel``.

    The training frame is built from all given data sets plus the rows
    returned by ``get_no_field_text_units`` (which get target index 0 and the
    empty category name). Each target group is capped at
    ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` rows, preferring rows with a
    non-empty ``created_by``. The pipeline is fitted on an 80% split; accuracy
    reports are computed for the held-out 20% ("out of sample") and for a 20%
    subset of the training split ("in sample").

    :param document_type: document type the model is trained for.
    :param field: field the model is trained for.
    :param train_data_sets: non-empty list of record collections accepted by
        ``pandas.DataFrame.from_records``; the list itself is not modified.
    :return: ``ClassifierModel`` with the trained model object, field,
        document type and both accuracy reports set.
    :raises IndexError: if ``train_data_sets`` is empty.
    """
    # Build the frames without popping from the caller's list.
    frames = [pd.DataFrame.from_records(records) for records in train_data_sets]
    df = frames[0]
    if len(frames) > 1:
        # add transferred external data
        df = pd.concat(frames)

    is_choice_field = field.is_choice_field()
    df['target_name'] = df.apply(
        lambda row: encode_category(field.pk,
                                    row.value if is_choice_field else None,
                                    row.extraction_hint),
        axis=1)

    # Index 0 is reserved for the "no field" rows appended below.
    df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

    no_field_rows = [{'text_unit__text': text}
                     for text in get_no_field_text_units(document_type, field)]
    if no_field_rows:
        df = pd.concat([df, pd.DataFrame(no_field_rows)])

    df['target_index'] = df['target_index'].fillna(0).astype('int')
    df['target_name'] = df['target_name'].fillna(
        SkLearnClassifierModel.EMPTY_CAT_NAME).astype('str')
    df['user_input'] = df['created_by'].fillna(0).astype('bool')

    group_frames = []
    for _group_index, group_df in df.groupby('target_index'):
        if group_df.shape[0] > settings.ML_TRAIN_DATA_SET_GROUP_LEN:
            # Keep user-entered rows first when a group has to be truncated.
            group_df = shuffle(
                group_df.sort_values('user_input', ascending=False)[
                    :settings.ML_TRAIN_DATA_SET_GROUP_LEN])
        group_frames.append(group_df)
    res_df = shuffle(pd.concat(group_frames) if group_frames else pd.DataFrame())

    target_names = sorted(res_df['target_name'].unique())

    text_clf = Pipeline([
        ('vect', CountVectorizer(strip_accents='unicode',
                                 analyzer='word',
                                 stop_words='english',
                                 tokenizer=word_position_tokenizer)),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge',
                              penalty='l2',
                              alpha=1e-3,
                              max_iter=5,
                              tol=None,
                              n_jobs=-1,
                              class_weight='balanced')),
    ])

    x = res_df['text_unit__text']
    y = res_df['target_index']

    x_train, x_test_os, y_train, y_test_os = train_test_split(
        x, y, test_size=0.2, random_state=42)
    # The "in sample" test set is a subset of the data the model is fitted on.
    _x_train, x_test_is, _y_train, y_test_is = train_test_split(
        x_train, y_train, test_size=0.2, random_state=42)

    sklearn_model = text_clf.fit(x_train, y_train)

    model = SkLearnClassifierModel(sklearn_model=sklearn_model, target_names=target_names)

    classifier_model = ClassifierModel()
    classifier_model.set_trained_model_obj(model)
    classifier_model.document_field = field
    classifier_model.document_type = document_type

    predicted_os = text_clf.predict(x_test_os)
    predicted_is = text_clf.predict(x_test_is)

    classifier_model.classifier_accuracy_report_out_of_sample = classification_report(
        y_test_os, predicted_os, target_names=target_names)
    classifier_model.classifier_accuracy_report_in_sample = classification_report(
        y_test_is, predicted_is, target_names=target_names)

    return classifier_model