def apply_simple_config(log: ProcessLogger,
                        document_field: DocumentField,
                        csv: bytes,
                        drop_previous_field_detectors: bool,
                        update_field_choice_values: bool):
    """Create one DocumentFieldDetector per data row of a CSV config.

    The CSV is parsed with ``pd.read_csv`` defaults (so its first line is
    consumed as the header) and every cell is read as a string. For each
    data row the first cell becomes the detector's ``detected_value`` and
    all non-empty cells of the row (the first one included), joined with
    newlines and lower-cased, become its ``include_regexps``.

    :param log: logger used for info messages and progress reporting.
    :param document_field: field the created detectors are attached to.
    :param csv: raw bytes of the CSV config.
    :param drop_previous_field_detectors: when true, detectors of this field
        with category ``FD_CATEGORY_IMPORTED_SIMPLE_CONFIG`` are deleted
        before the new ones are created.
    :param update_field_choice_values: when true, ``document_field.choices``
        is overwritten with the sorted distinct non-empty values of the
        first column and the field is saved.
    :raises ValueError: if the parsed CSV has no rows or no columns.
    """
    df = pd.read_csv(io.BytesIO(csv), dtype=str)
    if df.shape[0] < 1 or df.shape[1] < 1:
        raise ValueError('Config csv contains no data')
    row_num = df.shape[0]

    if update_field_choice_values:
        first_column = df[df.columns[0]]
        choices = first_column.dropna().drop_duplicates().sort_values().tolist()
        document_field.choices = '\n'.join(choices)
        document_field.save()

    log.info(
        'Creating {2} naive field detectors for document field {0} and document type {1}...'
        .format(document_field, document_field.document_type, row_num))
    log.set_progress_steps_number(row_num // 10 + 1)

    if drop_previous_field_detectors:
        DocumentFieldDetector.objects.filter(
            field=document_field,
            category=FD_CATEGORY_IMPORTED_SIMPLE_CONFIG).delete()

    for index, row in df.iterrows():
        detector = DocumentFieldDetector()
        detector.category = FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        detector.field = document_field
        detector.regexps_pre_process_lower = True
        # Explicit positional access: the row is indexed by the CSV column
        # labels, so a bare ``row[0]`` depends on the deprecated
        # integer-key positional fallback of Series.__getitem__.
        # NOTE(review): a row whose first cell is empty yields NaN here --
        # confirm whether such rows should be skipped or rejected.
        detector.detected_value = row.iloc[0]
        detector.include_regexps = '\n'.join(row.dropna()).lower()
        detector.save()
        # Report progress once per ten rows.
        if index % 10 == 0:
            log.step_progress()

    log.info('Done.')
def save_detector_settings(
        self,
        detectors_by_value: Dict[str, List[str]]) -> None:
    """Persist the detector patterns collected for this document field.

    When ``self.save_in_csv_format`` is set, the whole mapping is handed to
    ``self.save_detector_settings_csv`` and nothing else is done here
    (per the original comment, that path saves all "pattern: value" records
    into DocumentFieldMultilineRegexDetector).

    Otherwise one DocumentFieldDetector record is saved per key of
    ``detectors_by_value``; when ``self.drop_previous_field_detectors`` is
    set, the field's existing detectors of the imported-simple-config
    category are deleted first.

    :param detectors_by_value: maps a detected value to the list of
        include-regexp strings stored (newline-joined) on its detector.
    """
    if self.save_in_csv_format:
        self.save_detector_settings_csv(detectors_by_value)
        return

    # Optionally clear the previously imported detectors of this field.
    if self.drop_previous_field_detectors:
        stale_detectors = DocumentFieldDetector.objects.filter(
            field=self.document_field,
            category=self.FD_CATEGORY_IMPORTED_SIMPLE_CONFIG)
        stale_detectors.delete()

    # One detector record per detected value.
    for detected_value, patterns in detectors_by_value.items():
        record = DocumentFieldDetector()
        record.category = self.FD_CATEGORY_IMPORTED_SIMPLE_CONFIG
        record.field = self.document_field
        record.regexps_pre_process_lower = True
        record.detected_value = detected_value
        record.include_regexps = '\n'.join(patterns)
        record.save()