def on_batch_end(self, batch, logs):
    dt = time.time() - self.iter_start_time
    self.iter_start_time = time.time()
    self.dt_stats.push(dt)
    self.loss_stats.push(logs['loss'])
    self.checkpoint_params.iter += 1
    if self.display > 0 and self.checkpoint_params.iter % self.display == 0:
        # apply postprocessing to display the true output
        cer, target, decoded = self._generate(1)
        self.ler_stats.push(cer)
        pred_sentence = self.text_post_proc.apply("".join(self.codec.decode(decoded[0])))
        gt_sentence = self.text_post_proc.apply("".join(self.codec.decode(target[0])))

        if self.display_epochs:
            print("#{:08f}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                self.checkpoint_params.iter / self.steps_per_epoch,
                self.loss_stats.mean(), self.ler_stats.mean(), self.dt_stats.mean()))
        else:
            print("#{:08d}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                self.checkpoint_params.iter,
                self.loss_stats.mean(), self.ler_stats.mean(), self.dt_stats.mean()))

        # Insert Unicode LTR/RTL embedding marks for bidi support
        lr = "\u202A\u202B"
        print(" PRED: '{}{}{}'".format(lr[bidi.get_base_level(pred_sentence)], pred_sentence, "\u202C"))
        print(" TRUE: '{}{}{}'".format(lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))
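# A minimal standalone sketch of the direction-mark trick used throughout the
# display code in this collection: python-bidi's get_base_level() returns 0
# for an LTR paragraph and 1 for an RTL one, which indexes into a two-char
# string of embedding marks (U+202A LRE, U+202B RLE); U+202C (PDF) pops the
# embedding again. Assumes the python-bidi package; sample strings are
# illustrative only.
from bidi.algorithm import get_base_level


def wrap_for_terminal(sentence: str) -> str:
    lr = "\u202A\u202B"  # LEFT-TO-RIGHT EMBEDDING, RIGHT-TO-LEFT EMBEDDING
    return "{}{}{}".format(lr[get_base_level(sentence)], sentence, "\u202C")


if __name__ == "__main__":
    print(wrap_for_terminal("hello world"))               # base level 0 -> LRE...PDF
    print(wrap_for_terminal("\u05e9\u05dc\u05d5\u05dd"))  # Hebrew, base level 1 -> RLE...PDF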
def print_evaluate(self, sample: Sample, data, print_fn):
    targets, outputs = sample.targets, sample.outputs
    pred_sentence = outputs.sentence
    gt_sentence = targets['sentence']
    lr = "\u202A\u202B"
    cer = Levenshtein.distance(pred_sentence, gt_sentence) / len(gt_sentence)
    print_fn("\n CER: {}".format(cer) +
             "\n PRED: '{}{}{}'".format(lr[bidi.get_base_level(pred_sentence)], pred_sentence, "\u202C") +
             "\n TRUE: '{}{}{}'".format(lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))
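# The CER printed by the print_evaluate variants here is just the Levenshtein
# edit distance normalized by the ground-truth length. A tiny sketch, assuming
# the python-Levenshtein package; note the division by len(gt), so an empty
# ground truth raises ZeroDivisionError, exactly as in the snippets.
import Levenshtein


def cer(pred: str, gt: str) -> float:
    return Levenshtein.distance(pred, gt) / len(gt)


assert cer("hallo", "hello") == 0.2  # one substitution over five characters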
def display(self, train_cer, train_loss, train_dt, iter, steps_per_epoch,
            display_epochs, example_pred, example_gt):
    if display_epochs:
        print("#{:08f}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
            iter / steps_per_epoch, train_loss, train_cer, train_dt))
    else:
        print("#{:08d}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
            iter, train_loss, train_cer, train_dt))
    lr = "\u202A\u202B"
    print(" PRED: '{}{}{}'".format(lr[bidi.get_base_level(example_pred)], example_pred, "\u202C"))
    print(" TRUE: '{}{}{}'".format(lr[bidi.get_base_level(example_gt)], example_gt, "\u202C"))
def print_evaluate(self, sample: Sample, data, print_fn=print):
    targets, outputs = sample.targets, sample.outputs
    gt_sentence = targets['sentence']
    lr = "\u202A\u202B"
    s = ""
    pred_sentence = outputs.sentence
    cer = Levenshtein.distance(pred_sentence, gt_sentence) / len(gt_sentence)
    s += ("\n PRED (CER={:.2f}): '{}{}{}'".format(
              cer, lr[bidi.get_base_level(pred_sentence)], pred_sentence, "\u202C") +
          "\n TRUE:            '{}{}{}'".format(
              lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))
    print_fn(s)
def print_evaluate(self, inputs: Dict[str, AnyNumpy], outputs: Prediction,
                   targets: Dict[str, AnyNumpy], data: 'CalamariData', print_fn):
    pred_sentence = outputs.sentence
    gt_sentence = targets['sentence']
    lr = "\u202A\u202B"
    cer = Levenshtein.distance(pred_sentence, gt_sentence) / len(gt_sentence)
    print_fn("\n CER: {}".format(cer) +
             "\n PRED: '{}{}{}'".format(lr[bidi.get_base_level(pred_sentence)], pred_sentence, "\u202C") +
             "\n TRUE: '{}{}{}'".format(lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))
def bidi_record(record):
    """
    Reorders a record using the Unicode BiDi algorithm.

    Models trained for RTL or mixed scripts still emit classes in LTR order
    requiring reordering for proper display.

    Args:
        record (kraken.rpred.ocr_record)

    Returns:
        kraken.rpred.ocr_record
    """
    storage = bd.get_empty_storage()
    base_level = bd.get_base_level(record.prediction)
    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    bd.get_embedding_levels(record.prediction, storage)
    bd.explicit_embed_and_overrides(storage)
    bd.resolve_weak_types(storage)
    bd.resolve_neutral_types(storage, False)
    bd.resolve_implicit_levels(storage, False)
    for i, j in enumerate(record):
        storage['chars'][i]['record'] = j
    bd.reorder_resolved_levels(storage, False)
    bd.apply_mirroring(storage, False)
    prediction = u''
    cuts = []
    confidences = []
    for ch in storage['chars']:
        prediction = prediction + ch['record'][0]
        cuts.append(ch['record'][1])
        confidences.append(ch['record'][2])
    return ocr_record(prediction, cuts, confidences)
def getBiDiInfo(text, *, upper_is_rtl=False, base_dir=None, debug=False):
    """
    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).

    Set `base_dir` to 'L' or 'R' to override the calculated base_level.

    Set `debug` to True to display (using sys.stderr) the steps taken with
    the algorithm.

    Returns the info dict (`storage`) with resolved levels, original indices,
    and chars reordered into display order.
    """
    storage = get_empty_storage()

    if base_dir is None:
        base_level = get_base_level(text, upper_is_rtl)
    else:
        base_level = PARAGRAPH_LEVELS[base_dir]

    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    get_embedding_levels(text, storage, upper_is_rtl, debug)

    assert len(text) == len(storage["chars"])
    for index, (ch, chInfo) in enumerate(zip(text, storage["chars"])):
        assert ch == chInfo["ch"]
        chInfo["index"] = index

    explicit_embed_and_overrides(storage, debug)
    resolve_weak_types(storage, debug)
    resolve_neutral_types(storage, debug)
    resolve_implicit_levels(storage, debug)
    reorder_resolved_levels(storage, debug)

    return storage
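# A short usage sketch for getBiDiInfo above; it assumes the same
# bidi.algorithm imports the function itself relies on (get_empty_storage,
# get_base_level, PARAGRAPH_LEVELS, ...). With upper_is_rtl=True, upper-case
# Latin is treated as strong RTL -- the debugging convention of the BiDi
# reference implementation -- so 'MEANS CAR' behaves like an RTL run.
info = getBiDiInfo("car MEANS CAR.", upper_is_rtl=True)
print(info["base_dir"])                     # 'L' (first strong char is lowercase Latin)
print([c["ch"] for c in info["chars"]])     # chars in display order after reordering
print([c["index"] for c in info["chars"]])  # original logical indices of each char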
def print_evaluate(self, inputs: Dict[str, AnyNumpy], outputs: Prediction,
                   targets: Dict[str, AnyNumpy], data, print_fn=print):
    gt_sentence = targets['sentence']
    lr = "\u202A\u202B"
    s = ""
    pred_sentence = outputs.sentence
    cer = Levenshtein.distance(pred_sentence, gt_sentence) / len(gt_sentence)
    s += ("\n PRED (CER={:.2f}): '{}{}{}'".format(
              cer, lr[bidi.get_base_level(pred_sentence)], pred_sentence, "\u202C") +
          "\n TRUE:            '{}{}{}'".format(
              lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))
    print_fn(s)
def get_display_mod(unicode_or_str, encoding='utf-8', upper_is_rtl=False,
                    base_dir=None, debug=False):
    """Accepts unicode or string. In case it's a string, `encoding` is needed
    as the algorithm works on unicode (default: "utf-8").

    Set `upper_is_rtl` to True to treat upper case chars as strong 'R' for
    debugging (default: False).

    Set `base_dir` to 'L' or 'R' to override the calculated base_level.

    Set `debug` to True to display (using sys.stderr) the steps taken with the
    algorithm.

    Returns the display layout, either as unicode or `encoding` encoded
    string.
    """
    storage = bidi.get_empty_storage()

    # utf-8? we need unicode
    if isinstance(unicode_or_str, six.text_type):
        text = unicode_or_str
        decoded = False
    else:
        text = unicode_or_str.decode(encoding)
        decoded = True

    if base_dir is None:
        base_level = bidi.get_base_level(text, upper_is_rtl)
    else:
        base_level = bidi.PARAGRAPH_LEVELS[base_dir]

    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    bidi.get_embedding_levels(text, storage, upper_is_rtl, debug)
    bidi.explicit_embed_and_overrides(storage, debug)
    bidi.resolve_weak_types(storage, debug)
    bidi.resolve_neutral_types(storage, debug)
    bidi.resolve_implicit_levels(storage, debug)
    bidi.reorder_resolved_levels(storage, debug)

    # Commented out from original code:
    # bidi.apply_mirroring(storage, debug)
    # print_storage_chars(storage)
    # chars = storage['chars']
    # display = u''.join([_ch['ch'] for _ch in chars])
    display = print_storage_chars(storage)

    if decoded:
        return display.encode(encoding)
    else:
        return display
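# get_display_mod above is a patched copy of python-bidi's stock entry point
# (it skips mirroring and routes the result through print_storage_chars). For
# comparison, the unmodified upstream call is simply the following; the
# example string and output come from the python-bidi documentation.
from bidi.algorithm import get_display

print(get_display(u'car is THE CAR in arabic', upper_is_rtl=True))
# -> u'car is RAC EHT in arabic'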
def bidi_record(record: ocr_record, base_dir=None) -> ocr_record:
    """
    Reorders a record using the Unicode BiDi algorithm.

    Models trained for RTL or mixed scripts still emit classes in LTR order
    requiring reordering for proper display.

    Args:
        record (kraken.rpred.ocr_record)

    Returns:
        kraken.rpred.ocr_record
    """
    storage = bd.get_empty_storage()
    if base_dir not in ('L', 'R'):
        base_level = bd.get_base_level(record.prediction)
    else:
        base_level = {'L': 0, 'R': 1}[base_dir]
    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    bd.get_embedding_levels(record.prediction, storage)
    bd.explicit_embed_and_overrides(storage)
    bd.resolve_weak_types(storage)
    bd.resolve_neutral_types(storage, False)
    bd.resolve_implicit_levels(storage, False)
    for i, j in enumerate(record):
        storage['chars'][i]['record'] = j
    bd.reorder_resolved_levels(storage, False)
    bd.apply_mirroring(storage, False)
    prediction = ''
    cuts = []
    confidences = []
    for ch in storage['chars']:
        # code point may have been mirrored
        prediction = prediction + ch['ch']
        cuts.append(ch['record'][1])
        confidences.append(ch['record'][2])
    # carry over whole line information
    if record.type == 'baselines':
        line = {'boundary': record.line, 'baseline': record.baseline}
    else:
        line = record.line
    rec = ocr_record(prediction, cuts, confidences, line)
    rec.tags = record.tags
    rec.base_dir = base_dir
    return rec
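# A runnable toy check of the record plumbing bidi_record relies on: each
# record iterates as (char, cut, confidence) triples, which is what fills the
# storage['chars'][i]['record'] slots above. FakeRecord is a stand-in for
# kraken.rpred.ocr_record, only for illustrating that iteration protocol; it
# is not kraken API and its cut values are made up.
class FakeRecord:
    def __init__(self, prediction, cuts, confidences):
        self.prediction = prediction
        self.cuts = cuts
        self.confidences = confidences

    def __iter__(self):
        # yields (char, cut, confidence) per code point, like ocr_record
        return iter(zip(self.prediction, self.cuts, self.confidences))


rec = FakeRecord("ab", [(0, 10), (10, 20)], [0.9, 0.8])
assert list(rec) == [('a', (0, 10), 0.9), ('b', (10, 20), 0.8)]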
def train(self, progress_bar=False):
    """
    Launch the training

    Parameters
    ----------
    progress_bar : bool
        Show or hide any progress bar
    """
    checkpoint_params = self.checkpoint_params

    train_start_time = time.time() + self.checkpoint_params.total_time

    self.dataset.load_samples(processes=1, progress_bar=progress_bar)
    datas, txts = self.dataset.train_samples(skip_empty=checkpoint_params.skip_invalid_gt)
    if len(datas) == 0:
        raise Exception("Empty dataset is not allowed. Check if the data is at the correct location")

    if self.validation_dataset:
        self.validation_dataset.load_samples(processes=1, progress_bar=progress_bar)
        validation_datas, validation_txts = self.validation_dataset.train_samples(
            skip_empty=checkpoint_params.skip_invalid_gt)
        if len(validation_datas) == 0:
            raise Exception("Validation dataset is empty. Provide valid validation data for early stopping.")
    else:
        validation_datas, validation_txts = [], []

    # preprocessing steps
    texts = self.txt_preproc.apply(txts, processes=checkpoint_params.processes, progress_bar=progress_bar)
    datas = self.data_preproc.apply(datas, processes=checkpoint_params.processes, progress_bar=progress_bar)
    validation_txts = self.txt_preproc.apply(validation_txts, processes=checkpoint_params.processes,
                                             progress_bar=progress_bar)
    validation_datas = self.data_preproc.apply(validation_datas, processes=checkpoint_params.processes,
                                               progress_bar=progress_bar)

    # compute the codec
    codec = self.codec if self.codec else Codec.from_texts(texts, whitelist=self.codec_whitelist)

    # data augmentation on preprocessed data
    if self.data_augmenter:
        datas, texts = self.data_augmenter.augment_datas(datas, texts,
                                                         n_augmentations=self.n_augmentations,
                                                         processes=checkpoint_params.processes,
                                                         progress_bar=progress_bar)

        # TODO: validation data augmentation
        # validation_datas, validation_txts = self.data_augmenter.augment_datas(
        #     validation_datas, validation_txts, n_augmentations=0,
        #     processes=checkpoint_params.processes, progress_bar=progress_bar)

    # create backend
    network_params = checkpoint_params.model.network
    network_params.features = checkpoint_params.model.line_height
    network_params.classes = len(codec)
    if self.weights:
        # if we load the weights, take care of codec changes as well
        with open(self.weights + '.json', 'r') as f:
            restore_checkpoint_params = json_format.Parse(f.read(), CheckpointParams())
            restore_model_params = restore_checkpoint_params.model

        # checks
        if restore_model_params.line_height != network_params.features:
            raise Exception("The model to restore has a line height of {} but a line height of {} is requested".format(
                restore_model_params.line_height, network_params.features))

        # create codec of the same type
        restore_codec = codec.__class__(restore_model_params.codec.charset)
        # the codec changes as tuple (deletions/insertions), and the new codec is the changed old one
        codec_changes = restore_codec.align(codec)
        codec = restore_codec
        print("Codec changes: {} deletions, {} appends".format(len(codec_changes[0]), len(codec_changes[1])))
        # The actual weight/bias matrix will be changed after loading the old weights
    else:
        codec_changes = None

    # store the new codec
    checkpoint_params.model.codec.charset[:] = codec.charset
    print("CODEC: {}".format(codec.charset))

    # compute the labels with (new/current) codec
    labels = [codec.encode(txt) for txt in texts]

    backend = create_backend_from_proto(
        network_params,
        weights=self.weights,
    )
    train_net = backend.create_net(restore=None, weights=self.weights,
                                   graph_type="train", batch_size=checkpoint_params.batch_size)
    test_net = backend.create_net(restore=None, weights=self.weights,
                                  graph_type="test", batch_size=checkpoint_params.batch_size)
    train_net.set_data(datas, labels)
    test_net.set_data(validation_datas, validation_txts)
    if codec_changes:
        # only required on one net, since the other shares the same variables
        train_net.realign_model_labels(*codec_changes)

    train_net.prepare()
    test_net.prepare()

    loss_stats = RunningStatistics(checkpoint_params.stats_size, checkpoint_params.loss_stats)
    ler_stats = RunningStatistics(checkpoint_params.stats_size, checkpoint_params.ler_stats)
    dt_stats = RunningStatistics(checkpoint_params.stats_size, checkpoint_params.dt_stats)

    early_stopping_enabled = self.validation_dataset is not None \
        and checkpoint_params.early_stopping_frequency > 0 \
        and checkpoint_params.early_stopping_nbest > 1
    early_stopping_best_accuracy = checkpoint_params.early_stopping_best_accuracy
    early_stopping_best_cur_nbest = checkpoint_params.early_stopping_best_cur_nbest
    early_stopping_best_at_iter = checkpoint_params.early_stopping_best_at_iter

    early_stopping_predictor = Predictor(codec=codec, text_postproc=self.txt_postproc,
                                         network=test_net)

    # Start the actual training
    # ====================================================================================

    iter = checkpoint_params.iter

    # helper function to write a checkpoint
    def make_checkpoint(base_dir, prefix, version=None):
        if version:
            checkpoint_path = os.path.abspath(os.path.join(base_dir, "{}{}.ckpt".format(prefix, version)))
        else:
            checkpoint_path = os.path.abspath(os.path.join(base_dir, "{}{:08d}.ckpt".format(prefix, iter + 1)))
        print("Storing checkpoint to '{}'".format(checkpoint_path))
        train_net.save_checkpoint(checkpoint_path)
        checkpoint_params.iter = iter
        checkpoint_params.loss_stats[:] = loss_stats.values
        checkpoint_params.ler_stats[:] = ler_stats.values
        checkpoint_params.dt_stats[:] = dt_stats.values
        checkpoint_params.total_time = time.time() - train_start_time
        checkpoint_params.early_stopping_best_accuracy = early_stopping_best_accuracy
        checkpoint_params.early_stopping_best_cur_nbest = early_stopping_best_cur_nbest
        checkpoint_params.early_stopping_best_at_iter = early_stopping_best_at_iter

        with open(checkpoint_path + ".json", 'w') as f:
            f.write(json_format.MessageToJson(checkpoint_params))

        return checkpoint_path

    try:
        last_checkpoint = None
        n_infinite_losses = 0
        n_max_infinite_losses = 5

        # Training loop, can be interrupted by early stopping
        for iter in range(iter, checkpoint_params.max_iters):
            checkpoint_params.iter = iter

            iter_start_time = time.time()
            result = train_net.train_step()

            if not np.isfinite(result['loss']):
                n_infinite_losses += 1
                if n_max_infinite_losses == n_infinite_losses:
                    print("Error: Loss is not finite! Trying to restart from last checkpoint.")
                    if not last_checkpoint:
                        raise Exception("No checkpoint written yet. Training must be stopped.")
                    else:
                        # reload also non trainable weights, such as solver-specific variables
                        train_net.load_weights(last_checkpoint, restore_only_trainable=False)
                        continue
                else:
                    continue

            n_infinite_losses = 0

            loss_stats.push(result['loss'])
            ler_stats.push(result['ler'])
            dt_stats.push(time.time() - iter_start_time)

            if iter % checkpoint_params.display == 0:
                # apply postprocessing to display the true output
                pred_sentence = self.txt_postproc.apply("".join(codec.decode(result["decoded"][0])))
                gt_sentence = self.txt_postproc.apply("".join(codec.decode(result["gt"][0])))

                print("#{:08d}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                    iter, loss_stats.mean(), ler_stats.mean(), dt_stats.mean()))

                # Insert Unicode LTR/RTL embedding marks for bidi support
                lr = "\u202A\u202B"
                print(" PRED: '{}{}{}'".format(lr[bidi.get_base_level(pred_sentence)], pred_sentence, "\u202C"))
                print(" TRUE: '{}{}{}'".format(lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))

            if (iter + 1) % checkpoint_params.checkpoint_frequency == 0:
                last_checkpoint = make_checkpoint(checkpoint_params.output_dir,
                                                  checkpoint_params.output_model_prefix)

            if early_stopping_enabled and (iter + 1) % checkpoint_params.early_stopping_frequency == 0:
                print("Checking early stopping model")

                out = early_stopping_predictor.predict_raw(validation_datas, progress_bar=progress_bar,
                                                           apply_preproc=False)
                pred_texts = [d.sentence for d in out]
                pred_texts = self.txt_preproc.apply(pred_texts, processes=checkpoint_params.processes,
                                                    progress_bar=progress_bar)
                result = Evaluator.evaluate(gt_data=validation_txts, pred_data=pred_texts,
                                            progress_bar=progress_bar)
                accuracy = 1 - result["avg_ler"]

                if accuracy > early_stopping_best_accuracy:
                    early_stopping_best_accuracy = accuracy
                    early_stopping_best_cur_nbest = 1
                    early_stopping_best_at_iter = iter + 1
                    # overwrite as best model
                    last_checkpoint = make_checkpoint(
                        checkpoint_params.early_stopping_best_model_output_dir,
                        prefix="",
                        version=checkpoint_params.early_stopping_best_model_prefix,
                    )
                    print("Found better model with accuracy of {:%}".format(early_stopping_best_accuracy))
                else:
                    early_stopping_best_cur_nbest += 1
                    print("No better model found. Currently accuracy of {:%} at iter {} (remaining nbest = {})".format(
                        early_stopping_best_accuracy, early_stopping_best_at_iter,
                        checkpoint_params.early_stopping_nbest - early_stopping_best_cur_nbest))

                if accuracy > 0 and early_stopping_best_cur_nbest >= checkpoint_params.early_stopping_nbest:
                    print("Early stopping now.")
                    break

    except KeyboardInterrupt as e:
        print("Storing interrupted checkpoint")
        make_checkpoint(checkpoint_params.output_dir,
                        checkpoint_params.output_model_prefix,
                        "interrupted")
        raise e

    print("Total time {}s for {} iterations.".format(time.time() - train_start_time, iter))
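# The loss/ler/dt smoothing in the trainers relies on RunningStatistics(
# stats_size, initial_values), a fixed-size running window with push(), mean()
# and a .values view. A minimal stand-in sketch using collections.deque --
# behavior inferred from the call sites, not the project's actual class.
from collections import deque


class RunningWindow:
    def __init__(self, size, values=()):
        self._window = deque(values, maxlen=size)

    def push(self, value):
        self._window.append(value)  # oldest value drops out once full

    def mean(self):
        return sum(self._window) / len(self._window) if self._window else 0.0

    @property
    def values(self):
        return list(self._window)


stats = RunningWindow(3)
for loss in (1.0, 0.5, 0.25, 0.125):
    stats.push(loss)
assert stats.values == [0.5, 0.25, 0.125]  # window keeps only the last 3 pushes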
def run(args):
    # check if loading a json file
    if len(args.files) == 1 and args.files[0].endswith("json"):
        import json
        with open(args.files[0], 'r') as f:
            json_args = json.load(f)
            for key, value in json_args.items():
                setattr(args, key, value)

    # checks
    if args.extended_prediction_data_format not in ["pred", "json"]:
        raise Exception("Only 'pred' and 'json' are allowed extended prediction data formats")

    # add json as extension, resolve wildcard, expand user, ... and remove .json again
    args.checkpoint = [(cp if cp.endswith(".json") else cp + ".json") for cp in args.checkpoint]
    args.checkpoint = glob_all(args.checkpoint)
    args.checkpoint = [cp[:-5] for cp in args.checkpoint]

    # create voter
    voter_params = VoterParams()
    voter_params.type = VoterParams.Type.Value(args.voter.upper())
    voter = voter_from_proto(voter_params)

    # load files
    input_image_files = glob_all(args.files)
    if args.text_files:
        args.text_files = glob_all(args.text_files)

    # skip invalid files and remove them, there won't be predictions of invalid files
    dataset = create_dataset(
        args.dataset,
        DataSetMode.PREDICT,
        input_image_files,
        args.text_files,
        skip_invalid=True,
        remove_invalid=True,
        args={'text_index': args.pagexml_text_index},
    )

    print("Found {} files in the dataset".format(len(dataset)))
    if len(dataset) == 0:
        raise Exception("Empty dataset provided. Check your files argument (got {})!".format(args.files))

    # predict for all models
    predictor = MultiPredictor(checkpoints=args.checkpoint, batch_size=args.batch_size,
                               processes=args.processes)
    do_prediction = predictor.predict_dataset(dataset, progress_bar=not args.no_progress_bars)

    avg_sentence_confidence = 0
    n_predictions = 0

    # output the voted results to the appropriate files
    for result, sample in do_prediction:
        n_predictions += 1
        for i, p in enumerate(result):
            p.prediction.id = "fold_{}".format(i)

        # vote the results (if only one model is given, this will just return the sentences)
        prediction = voter.vote_prediction_result(result)
        prediction.id = "voted"
        sentence = prediction.sentence
        avg_sentence_confidence += prediction.avg_char_probability
        if args.verbose:
            lr = "\u202A\u202B"
            print("{}: '{}{}{}'".format(sample['id'], lr[get_base_level(sentence)], sentence, "\u202C"))

        output_dir = args.output_dir

        dataset.store_text(sentence, sample, output_dir=output_dir, extension=".pred.txt")

        if args.extended_prediction_data:
            ps = Predictions()
            ps.line_path = sample['image_path'] if 'image_path' in sample else sample['id']
            ps.predictions.extend([prediction] + [r.prediction for r in result])
            output_dir = output_dir if output_dir else os.path.dirname(ps.line_path)
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)

            if args.extended_prediction_data_format == "pred":
                with open(os.path.join(output_dir, sample['id'] + ".pred"), 'wb') as f:
                    f.write(ps.SerializeToString())
            elif args.extended_prediction_data_format == "json":
                with open(os.path.join(output_dir, sample['id'] + ".json"), 'w') as f:
                    # remove logits
                    for prediction in ps.predictions:
                        prediction.logits.rows = 0
                        prediction.logits.cols = 0
                        prediction.logits.data[:] = []

                    f.write(MessageToJson(ps, including_default_value_fields=True))
            else:
                raise Exception("Unknown prediction format.")

    print("Average sentence confidence: {:.2%}".format(avg_sentence_confidence / n_predictions))

    dataset.store()
    print("All files written")
def predict_books(
    self,
    books,
    checkpoint,
    cachefile=None,
    pageupload=True,
    text_index=1,
    pred_all=False,
):
    keras.backend.clear_session()
    if type(books) == str:
        books = [books]
    if type(checkpoint) == str:
        checkpoint = [checkpoint]
    checkpoint = [(cp if cp.endswith(".json") else cp + ".json") for cp in checkpoint]
    checkpoint = glob_all(checkpoint)
    checkpoint = [cp[:-5] for cp in checkpoint]

    if cachefile is None:
        cachefile = self.cachefile

    verbose = False
    lids = list(
        lids_from_books(
            books,
            cachefile,
            complete_only=False,
            skip_commented=False,
            new_only=not pred_all,
        )
    )
    data = Nsh5(cachefile=cachefile, lines=lids)

    predparams = PredictorParams()
    predparams.device.gpus = [n for n, _ in enumerate(list_physical_devices("GPU"))]

    predictor = MultiPredictor.from_paths(
        checkpoints=checkpoint,
        voter_params=VoterParams(),
        predictor_params=predparams,
    )

    newprcs = []
    for prc in predictor.data.params.pre_proc.processors:
        prc = deepcopy(prc)
        if isinstance(prc, FinalPreparationProcessorParams):
            prc.normalize, prc.invert, prc.transpose = False, False, True
            newprcs.append(prc)
        elif isinstance(prc, PrepareSampleProcessorParams):
            newprcs.append(prc)
    predictor.data.params.pre_proc.processors = newprcs

    do_prediction = predictor.predict(data)
    pipeline = predictor.data.get_or_create_pipeline(predictor.params.pipeline, data)
    reader = pipeline.reader()
    if len(reader) == 0:
        raise Exception("Empty dataset provided. Check your lines (got {})!".format(lids))

    avg_sentence_confidence = 0
    n_predictions = 0

    reader.prepare_store()
    samples = []
    sentences = []
    # output the voted results to the appropriate files
    for s in do_prediction:
        _, (_, prediction), meta = s.inputs, s.outputs, s.meta
        sample = reader.sample_by_id(meta["id"])
        n_predictions += 1
        sentence = prediction.sentence

        avg_sentence_confidence += prediction.avg_char_probability
        if verbose:
            lr = "\u202A\u202B"
            logger.info("{}: '{}{}{}'".format(meta["id"], lr[get_base_level(sentence)], sentence, "\u202C"))

        samples.append(sample)
        sentences.append(sentence)
        reader.store_text(sentence, sample, output_dir=None, extension=None)

    logger.info("Average sentence confidence: {:.2%}".format(avg_sentence_confidence / n_predictions))

    if pageupload:
        ocrdata = {}
        for lname, text in reader.predictions.items():
            _, b, p, ln = lname.split("/")
            if b not in ocrdata:
                ocrdata[b] = {}
            if p not in ocrdata[b]:
                ocrdata[b][p] = {}
            ocrdata[b][p][ln] = text

        data = {"ocrdata": ocrdata, "index": text_index}
        self.get_session().post(
            self.baseurl + "/_ocrdata",
            data=gzip.compress(json.dumps(data).encode("utf-8")),
            headers={
                "Content-Type": "application/json;charset=UTF-8",
                "Content-Encoding": "gzip",
            },
        )
        logger.info("Results uploaded")
    else:
        reader.store()
        logger.info("All prediction files written")
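# The pageupload branch above posts a gzip-compressed JSON body with an
# explicit Content-Encoding header. A self-contained round-trip of that
# encoding (stdlib only; the nested book/page/line dict shape mirrors the
# ocrdata structure built above, with made-up keys):
import gzip
import json

payload = {"ocrdata": {"book1": {"page1": {"line1": "some text"}}}, "index": 1}
body = gzip.compress(json.dumps(payload).encode("utf-8"))
assert json.loads(gzip.decompress(body).decode("utf-8")) == payload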
def _run_train(self, train_net, test_net, codec, train_start_time, progress_bar):
    checkpoint_params = self.checkpoint_params
    validation_dataset = test_net.input_dataset
    iters_per_epoch = max(1, int(train_net.input_dataset.epoch_size() / checkpoint_params.batch_size))

    loss_stats = RunningStatistics(checkpoint_params.stats_size, checkpoint_params.loss_stats)
    ler_stats = RunningStatistics(checkpoint_params.stats_size, checkpoint_params.ler_stats)
    dt_stats = RunningStatistics(checkpoint_params.stats_size, checkpoint_params.dt_stats)

    display = checkpoint_params.display
    display_epochs = display <= 1
    if display <= 0:
        display = 0  # to not display anything
    elif display_epochs:
        display = max(1, int(display * iters_per_epoch))  # relative to epochs
    else:
        display = max(1, int(display))  # iterations

    checkpoint_frequency = checkpoint_params.checkpoint_frequency
    early_stopping_frequency = checkpoint_params.early_stopping_frequency
    if early_stopping_frequency < 0:
        # set early stopping frequency to half epoch
        early_stopping_frequency = int(0.5 * iters_per_epoch)
    elif 0 < early_stopping_frequency <= 1:
        early_stopping_frequency = int(early_stopping_frequency * iters_per_epoch)  # relative to epochs
    else:
        early_stopping_frequency = int(early_stopping_frequency)
    early_stopping_frequency = max(1, early_stopping_frequency)

    if checkpoint_frequency < 0:
        checkpoint_frequency = early_stopping_frequency
    elif 0 < checkpoint_frequency <= 1:
        checkpoint_frequency = int(checkpoint_frequency * iters_per_epoch)  # relative to epochs
    else:
        checkpoint_frequency = int(checkpoint_frequency)

    early_stopping_enabled = self.validation_dataset is not None \
        and checkpoint_params.early_stopping_frequency > 0 \
        and checkpoint_params.early_stopping_nbest > 1
    early_stopping_best_accuracy = checkpoint_params.early_stopping_best_accuracy
    early_stopping_best_cur_nbest = checkpoint_params.early_stopping_best_cur_nbest
    early_stopping_best_at_iter = checkpoint_params.early_stopping_best_at_iter

    early_stopping_predictor = Predictor(codec=codec, text_postproc=self.txt_postproc,
                                         network=test_net)

    # Start the actual training
    # ====================================================================================

    iter = checkpoint_params.iter

    # helper function to write a checkpoint
    def make_checkpoint(base_dir, prefix, version=None):
        if version:
            checkpoint_path = os.path.abspath(os.path.join(base_dir, "{}{}.ckpt".format(prefix, version)))
        else:
            checkpoint_path = os.path.abspath(os.path.join(base_dir, "{}{:08d}.ckpt".format(prefix, iter + 1)))
        print("Storing checkpoint to '{}'".format(checkpoint_path))
        train_net.save_checkpoint(checkpoint_path)
        checkpoint_params.version = Checkpoint.VERSION
        checkpoint_params.iter = iter
        checkpoint_params.loss_stats[:] = loss_stats.values
        checkpoint_params.ler_stats[:] = ler_stats.values
        checkpoint_params.dt_stats[:] = dt_stats.values
        checkpoint_params.total_time = time.time() - train_start_time
        checkpoint_params.early_stopping_best_accuracy = early_stopping_best_accuracy
        checkpoint_params.early_stopping_best_cur_nbest = early_stopping_best_cur_nbest
        checkpoint_params.early_stopping_best_at_iter = early_stopping_best_at_iter

        with open(checkpoint_path + ".json", 'w') as f:
            f.write(json_format.MessageToJson(checkpoint_params))

        return checkpoint_path

    try:
        last_checkpoint = None
        n_infinite_losses = 0
        n_max_infinite_losses = 5

        # Training loop, can be interrupted by early stopping
        for iter in range(iter, checkpoint_params.max_iters):
            checkpoint_params.iter = iter

            iter_start_time = time.time()
            result = train_net.train_step()

            if not np.isfinite(result['loss']):
                n_infinite_losses += 1
                if n_max_infinite_losses == n_infinite_losses:
                    print("Error: Loss is not finite! Trying to restart from last checkpoint.")
                    if not last_checkpoint:
                        raise Exception("No checkpoint written yet. Training must be stopped.")
                    else:
                        # reload also non trainable weights, such as solver-specific variables
                        train_net.load_weights(last_checkpoint, restore_only_trainable=False)
                        continue
                else:
                    continue

            n_infinite_losses = 0

            loss_stats.push(result['loss'])
            ler_stats.push(result['ler'])
            dt_stats.push(time.time() - iter_start_time)

            if display > 0 and iter % display == 0:
                # apply postprocessing to display the true output
                pred_sentence = self.txt_postproc.apply("".join(codec.decode(result["decoded"][0])))
                gt_sentence = self.txt_postproc.apply("".join(codec.decode(result["gt"][0])))

                if display_epochs:
                    print("#{:08f}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                        iter / iters_per_epoch, loss_stats.mean(), ler_stats.mean(), dt_stats.mean()))
                else:
                    print("#{:08d}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                        iter, loss_stats.mean(), ler_stats.mean(), dt_stats.mean()))

                # Insert Unicode LTR/RTL embedding marks for bidi support
                lr = "\u202A\u202B"
                print(" PRED: '{}{}{}'".format(lr[bidi.get_base_level(pred_sentence)], pred_sentence, "\u202C"))
                print(" TRUE: '{}{}{}'".format(lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))

            if checkpoint_frequency > 0 and (iter + 1) % checkpoint_frequency == 0:
                last_checkpoint = make_checkpoint(checkpoint_params.output_dir,
                                                  checkpoint_params.output_model_prefix)

            if early_stopping_enabled and (iter + 1) % early_stopping_frequency == 0:
                print("Checking early stopping model")

                out_gen = early_stopping_predictor.predict_input_dataset(validation_dataset,
                                                                         progress_bar=progress_bar)
                result = Evaluator.evaluate_single_list(map(
                    Evaluator.evaluate_single_args,
                    map(lambda d: tuple(self.txt_preproc.apply([''.join(d.ground_truth), d.sentence])),
                        out_gen)))
                accuracy = 1 - result["avg_ler"]

                if accuracy > early_stopping_best_accuracy:
                    early_stopping_best_accuracy = accuracy
                    early_stopping_best_cur_nbest = 1
                    early_stopping_best_at_iter = iter + 1
                    # overwrite as best model
                    last_checkpoint = make_checkpoint(
                        checkpoint_params.early_stopping_best_model_output_dir,
                        prefix="",
                        version=checkpoint_params.early_stopping_best_model_prefix,
                    )
                    print("Found better model with accuracy of {:%}".format(early_stopping_best_accuracy))
                else:
                    early_stopping_best_cur_nbest += 1
                    print("No better model found. Currently accuracy of {:%} at iter {} (remaining nbest = {})".format(
                        early_stopping_best_accuracy, early_stopping_best_at_iter,
                        checkpoint_params.early_stopping_nbest - early_stopping_best_cur_nbest))

                if accuracy > 0 and early_stopping_best_cur_nbest >= checkpoint_params.early_stopping_nbest:
                    print("Early stopping now.")
                    break

                if accuracy >= 1:
                    print("Reached perfect score on validation set. Early stopping now.")
                    break

    except KeyboardInterrupt as e:
        print("Storing interrupted checkpoint")
        make_checkpoint(checkpoint_params.output_dir,
                        checkpoint_params.output_model_prefix,
                        "interrupted")
        raise e

    print("Total time {}s for {} iterations.".format(time.time() - train_start_time, iter))
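# The display/checkpoint/early-stopping frequencies in both _run_train
# variants share one convention: a negative value means 'use the default', a
# value in (0, 1] is a fraction of an epoch, and anything larger is an
# absolute iteration count. A compact sketch of that convention (hypothetical
# helper name, not project API; 0 is treated as 'disabled'):
def resolve_frequency(value, iters_per_epoch, default_iters):
    if value < 0:
        return default_iters
    if value == 0:
        return 0  # disabled
    if value <= 1:
        return max(1, int(value * iters_per_epoch))  # relative to epochs
    return max(1, int(value))  # absolute iterations


assert resolve_frequency(-1, 1000, 500) == 500   # default: half an epoch here
assert resolve_frequency(0.5, 1000, 500) == 500  # 0.5 epochs -> 500 iterations
assert resolve_frequency(200, 1000, 500) == 200  # plain iteration count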
def _run_train(self, train_net, test_net, codec, train_start_time, progress_bar):
    checkpoint_params = self.checkpoint_params
    validation_dataset = test_net.input_dataset
    iters_per_epoch = max(1, int(len(train_net.input_dataset) / checkpoint_params.batch_size))

    loss_stats = RunningStatistics(checkpoint_params.stats_size, checkpoint_params.loss_stats)
    ler_stats = RunningStatistics(checkpoint_params.stats_size, checkpoint_params.ler_stats)
    dt_stats = RunningStatistics(checkpoint_params.stats_size, checkpoint_params.dt_stats)

    display = checkpoint_params.display
    display_epochs = display <= 1
    if display <= 0:
        display = 0  # to not display anything
    elif display_epochs:
        display = max(1, int(display * iters_per_epoch))  # relative to epochs
    else:
        display = max(1, int(display))  # iterations

    checkpoint_frequency = checkpoint_params.checkpoint_frequency
    early_stopping_frequency = checkpoint_params.early_stopping_frequency
    if early_stopping_frequency < 0:
        # set early stopping frequency to half epoch
        early_stopping_frequency = int(0.5 * iters_per_epoch)
    elif 0 < early_stopping_frequency <= 1:
        early_stopping_frequency = int(early_stopping_frequency * iters_per_epoch)  # relative to epochs
    else:
        early_stopping_frequency = int(early_stopping_frequency)

    if checkpoint_frequency < 0:
        checkpoint_frequency = early_stopping_frequency
    elif 0 < checkpoint_frequency <= 1:
        checkpoint_frequency = int(checkpoint_frequency * iters_per_epoch)  # relative to epochs
    else:
        checkpoint_frequency = int(checkpoint_frequency)

    early_stopping_enabled = self.validation_dataset is not None \
        and checkpoint_params.early_stopping_frequency > 0 \
        and checkpoint_params.early_stopping_nbest > 1
    early_stopping_best_accuracy = checkpoint_params.early_stopping_best_accuracy
    early_stopping_best_cur_nbest = checkpoint_params.early_stopping_best_cur_nbest
    early_stopping_best_at_iter = checkpoint_params.early_stopping_best_at_iter

    early_stopping_predictor = Predictor(codec=codec, text_postproc=self.txt_postproc,
                                         network=test_net)

    # Start the actual training
    # ====================================================================================

    iter = checkpoint_params.iter

    # helper function to write a checkpoint
    def make_checkpoint(base_dir, prefix, version=None):
        if version:
            checkpoint_path = os.path.abspath(os.path.join(base_dir, "{}{}.ckpt".format(prefix, version)))
        else:
            checkpoint_path = os.path.abspath(os.path.join(base_dir, "{}{:08d}.ckpt".format(prefix, iter + 1)))
        print("Storing checkpoint to '{}'".format(checkpoint_path))
        train_net.save_checkpoint(checkpoint_path)
        checkpoint_params.version = Checkpoint.VERSION
        checkpoint_params.iter = iter
        checkpoint_params.loss_stats[:] = loss_stats.values
        checkpoint_params.ler_stats[:] = ler_stats.values
        checkpoint_params.dt_stats[:] = dt_stats.values
        checkpoint_params.total_time = time.time() - train_start_time
        checkpoint_params.early_stopping_best_accuracy = early_stopping_best_accuracy
        checkpoint_params.early_stopping_best_cur_nbest = early_stopping_best_cur_nbest
        checkpoint_params.early_stopping_best_at_iter = early_stopping_best_at_iter

        with open(checkpoint_path + ".json", 'w') as f:
            f.write(json_format.MessageToJson(checkpoint_params))

        return checkpoint_path

    try:
        last_checkpoint = None
        n_infinite_losses = 0
        n_max_infinite_losses = 5

        # Training loop, can be interrupted by early stopping
        for iter in range(iter, checkpoint_params.max_iters):
            checkpoint_params.iter = iter

            iter_start_time = time.time()
            result = train_net.train_step()

            if not np.isfinite(result['loss']):
                n_infinite_losses += 1
                if n_max_infinite_losses == n_infinite_losses:
                    print("Error: Loss is not finite! Trying to restart from last checkpoint.")
                    if not last_checkpoint:
                        raise Exception("No checkpoint written yet. Training must be stopped.")
                    else:
                        # reload also non trainable weights, such as solver-specific variables
                        train_net.load_weights(last_checkpoint, restore_only_trainable=False)
                        continue
                else:
                    continue

            n_infinite_losses = 0

            loss_stats.push(result['loss'])
            ler_stats.push(result['ler'])
            dt_stats.push(time.time() - iter_start_time)

            if display > 0 and iter % display == 0:
                # apply postprocessing to display the true output
                pred_sentence = self.txt_postproc.apply("".join(codec.decode(result["decoded"][0])))
                gt_sentence = self.txt_postproc.apply("".join(codec.decode(result["gt"][0])))

                if display_epochs:
                    print("#{:08f}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                        iter / iters_per_epoch, loss_stats.mean(), ler_stats.mean(), dt_stats.mean()))
                else:
                    print("#{:08d}: loss={:.8f} ler={:.8f} dt={:.8f}s".format(
                        iter, loss_stats.mean(), ler_stats.mean(), dt_stats.mean()))

                # Insert Unicode LTR/RTL embedding marks for bidi support
                lr = "\u202A\u202B"
                print(" PRED: '{}{}{}'".format(lr[bidi.get_base_level(pred_sentence)], pred_sentence, "\u202C"))
                print(" TRUE: '{}{}{}'".format(lr[bidi.get_base_level(gt_sentence)], gt_sentence, "\u202C"))

            if checkpoint_frequency > 0 and (iter + 1) % checkpoint_frequency == 0:
                last_checkpoint = make_checkpoint(checkpoint_params.output_dir,
                                                  checkpoint_params.output_model_prefix)

            if early_stopping_enabled and (iter + 1) % early_stopping_frequency == 0:
                print("Checking early stopping model")

                out_gen = early_stopping_predictor.predict_input_dataset(validation_dataset,
                                                                         progress_bar=progress_bar)
                result = Evaluator.evaluate_single_list(map(
                    Evaluator.evaluate_single_args,
                    map(lambda d: tuple(self.txt_preproc.apply([''.join(d.ground_truth), d.sentence])),
                        out_gen)))
                accuracy = 1 - result["avg_ler"]

                if accuracy > early_stopping_best_accuracy:
                    early_stopping_best_accuracy = accuracy
                    early_stopping_best_cur_nbest = 1
                    early_stopping_best_at_iter = iter + 1
                    # overwrite as best model
                    last_checkpoint = make_checkpoint(
                        checkpoint_params.early_stopping_best_model_output_dir,
                        prefix="",
                        version=checkpoint_params.early_stopping_best_model_prefix,
                    )
                    print("Found better model with accuracy of {:%}".format(early_stopping_best_accuracy))
                else:
                    early_stopping_best_cur_nbest += 1
                    print("No better model found. Currently accuracy of {:%} at iter {} (remaining nbest = {})".format(
                        early_stopping_best_accuracy, early_stopping_best_at_iter,
                        checkpoint_params.early_stopping_nbest - early_stopping_best_cur_nbest))

                if accuracy > 0 and early_stopping_best_cur_nbest >= checkpoint_params.early_stopping_nbest:
                    print("Early stopping now.")
                    break

                if accuracy >= 1:
                    print("Reached perfect score on validation set. Early stopping now.")
                    break

    except KeyboardInterrupt as e:
        print("Storing interrupted checkpoint")
        make_checkpoint(checkpoint_params.output_dir,
                        checkpoint_params.output_model_prefix,
                        "interrupted")
        raise e

    print("Total time {}s for {} iterations.".format(time.time() - train_start_time, iter))
def run(args: PredictArgs):
    # check if loading a json file
    # TODO: support running from JSON
    # if len(args.files) == 1 and args.files[0].endswith("json"):
    #     import json
    #     with open(args.files[0], 'r') as f:
    #         json_args = json.load(f)
    #         for key, value in json_args.items():
    #             setattr(args, key, value)

    # checks
    if args.extended_prediction_data_format not in ["pred", "json"]:
        raise Exception("Only 'pred' and 'json' are allowed extended prediction data formats")

    # add json as extension, resolve wildcard, expand user, ... and remove .json again
    args.checkpoint = [(cp if cp.endswith(".json") else cp + ".json") for cp in args.checkpoint]
    args.checkpoint = glob_all(args.checkpoint)
    args.checkpoint = [cp[:-5] for cp in args.checkpoint]

    # create ctc decoder
    prepare_ctc_decoder_params(args.ctc_decoder)

    # predict for all models
    from calamari_ocr.ocr.predict.predictor import MultiPredictor
    predictor = MultiPredictor.from_paths(
        checkpoints=args.checkpoint,
        voter_params=args.voter,
        predictor_params=args.predictor,
    )
    do_prediction = predictor.predict(args.data)
    pipeline: CalamariPipeline = predictor.data.get_or_create_pipeline(predictor.params.pipeline, args.data)
    reader = pipeline.reader()
    if len(reader) == 0:
        raise Exception("Empty dataset provided. Check your command line arguments or if the provided files are empty.")

    avg_sentence_confidence = 0
    n_predictions = 0

    reader.prepare_store()

    # output the voted results to the appropriate files
    for s in do_prediction:
        _, (result, prediction), meta = s.inputs, s.outputs, s.meta
        sample = reader.sample_by_id(meta["id"])
        n_predictions += 1
        sentence = prediction.sentence

        avg_sentence_confidence += prediction.avg_char_probability
        if args.verbose:
            lr = "\u202A\u202B"
            logger.info("{}: '{}{}{}'".format(meta["id"], lr[get_base_level(sentence)], sentence, "\u202C"))

        output_dir = args.output_dir if args.output_dir else os.path.dirname(prediction.line_path)

        reader.store_text_prediction(prediction, meta["id"], output_dir=output_dir)

        if args.extended_prediction_data:
            ps = Predictions()
            ps.line_path = sample["image_path"] if "image_path" in sample else sample["id"]
            ps.predictions.extend([prediction] + [r.prediction for r in result])
            output_dir = output_dir if output_dir else os.path.dirname(ps.line_path)
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)

            if args.extended_prediction_data_format == "pred":
                data = zlib.compress(ps.to_json(indent=2, ensure_ascii=False).encode("utf-8"))
            elif args.extended_prediction_data_format == "json":
                # remove logits
                for p in ps.predictions:
                    p.logits = None

                data = ps.to_json(indent=2)
            else:
                raise Exception("Unknown prediction format.")

            reader.store_extended_prediction(
                data,
                sample,
                output_dir=output_dir,
                extension=args.extended_prediction_data_format,
            )

    logger.info("Average sentence confidence: {:.2%}".format(avg_sentence_confidence / n_predictions))

    reader.store()
    logger.info("All prediction files written")
def run(args):
    # checks
    if args.extended_prediction_data_format not in ["pred", "json"]:
        raise Exception("Only 'pred' and 'json' are allowed extended prediction data formats")

    # add json as extension, resolve wildcard, expand user, ... and remove .json again
    args.checkpoint = [(cp if cp.endswith(".json") else cp + ".json") for cp in args.checkpoint]
    args.checkpoint = glob_all(args.checkpoint)
    args.checkpoint = [cp[:-5] for cp in args.checkpoint]

    # create voter
    voter_params = VoterParams()
    voter_params.type = VoterParams.Type.Value(args.voter.upper())
    voter = voter_from_proto(voter_params)

    # load files
    files = glob.glob(args.files)
    dataset = AbbyyDataSet(files, skip_invalid=True, remove_invalid=False, binary=args.binary)
    dataset.load_samples(processes=args.processes, progress_bar=not args.no_progress_bars)

    print("Found {} files in the dataset".format(len(dataset)))
    if len(dataset) == 0:
        raise Exception("Empty dataset provided. Check your files argument (got {})!".format(args.files))

    # predict for all models
    predictor = MultiPredictor(checkpoints=args.checkpoint, batch_size=args.batch_size,
                               processes=args.processes)
    do_prediction = predictor.predict_dataset(dataset, progress_bar=not args.no_progress_bars)

    # output the voted results to the appropriate files
    input_image_files = []
    # create input_image_files list for next loop
    for page in dataset.book.pages:
        for fo in page.getFormats():
            input_image_files.append(page.imgFile)

    for (result, sample), filepath in zip(do_prediction, input_image_files):
        for i, p in enumerate(result):
            p.prediction.id = "fold_{}".format(i)

        # vote the results (if only one model is given, this will just return the sentences)
        prediction = voter.vote_prediction_result(result)
        prediction.id = "voted"
        sentence = prediction.sentence

        if args.verbose:
            lr = "\u202A\u202B"
            print("{}: '{}{}{}'".format(sample['id'], lr[get_base_level(sentence)], sentence, "\u202C"))

        output_dir = args.output_dir if args.output_dir else os.path.dirname(filepath)

        sample["format"].text = sentence

        if args.extended_prediction_data:
            ps = Predictions()
            ps.line_path = filepath
            ps.predictions.extend([prediction] + [r.prediction for r in result])

            if args.extended_prediction_data_format == "pred":
                with open(os.path.join(output_dir, sample['id'] + ".pred"), 'wb') as f:
                    f.write(ps.SerializeToString())
            elif args.extended_prediction_data_format == "json":
                with open(os.path.join(output_dir, sample['id'] + ".json"), 'w') as f:
                    # remove logits
                    for prediction in ps.predictions:
                        prediction.logits.rows = 0
                        prediction.logits.cols = 0
                        prediction.logits.data[:] = []

                    f.write(MessageToJson(ps, including_default_value_fields=True))
            else:
                raise Exception("Unknown prediction format.")

    w = XMLWriter(output_dir, os.path.dirname(filepath), dataset.book)
    w.write()

    print("All files written")
def run(args):
    # check if loading a json file
    if len(args.files) == 1 and args.files[0].endswith("json"):
        import json
        with open(args.files[0], 'r') as f:
            json_args = json.load(f)
            for key, value in json_args.items():
                setattr(args, key, value)

    # checks
    if args.extended_prediction_data_format not in ["pred", "json"]:
        raise Exception("Only 'pred' and 'json' are allowed extended prediction data formats")

    # add json as extension, resolve wildcard, expand user, ... and remove .json again
    args.checkpoint = [(cp if cp.endswith(".json") else cp + ".json") for cp in args.checkpoint]
    args.checkpoint = glob_all(args.checkpoint)
    args.checkpoint = [cp[:-5] for cp in args.checkpoint]
    args.extension = args.extension if args.extension else DataSetType.pred_extension(args.dataset)

    # create ctc decoder
    ctc_decoder_params = create_ctc_decoder_params(args)

    # create voter
    voter_params = VoterParams()
    voter_params.type = VoterType(args.voter)

    # load files
    input_image_files = glob_all(args.files)
    if args.text_files:
        args.text_files = glob_all(args.text_files)

    # skip invalid files and remove them, there won't be predictions of invalid files
    predict_params = PipelineParams(
        type=args.dataset,
        skip_invalid=True,
        remove_invalid=True,
        files=input_image_files,
        text_files=args.text_files,
        data_reader_args=FileDataReaderArgs(
            pad=args.dataset_pad,
            text_index=args.pagexml_text_index,
        ),
        batch_size=args.batch_size,
        num_processes=args.processes,
    )

    # predict for all models
    # TODO: Use CTC Decoder params
    from calamari_ocr.ocr.predict.predictor import MultiPredictor
    predictor = MultiPredictor.from_paths(
        checkpoints=args.checkpoint,
        voter_params=voter_params,
        predictor_params=PredictorParams(silent=True, progress_bar=not args.no_progress_bars))
    do_prediction = predictor.predict(predict_params)
    pipeline: CalamariPipeline = predictor.data.get_predict_data(predict_params)
    reader = pipeline.reader()
    if len(reader) == 0:
        raise Exception("Empty dataset provided. Check your files argument (got {})!".format(args.files))

    avg_sentence_confidence = 0
    n_predictions = 0

    reader.prepare_store()

    # output the voted results to the appropriate files
    for s in do_prediction:
        inputs, (result, prediction), meta = s.inputs, s.outputs, s.meta
        sample = reader.sample_by_id(meta['id'])
        n_predictions += 1
        sentence = prediction.sentence

        avg_sentence_confidence += prediction.avg_char_probability
        if args.verbose:
            lr = "\u202A\u202B"
            logger.info("{}: '{}{}{}'".format(meta['id'], lr[get_base_level(sentence)], sentence, "\u202C"))

        output_dir = args.output_dir

        reader.store_text(sentence, sample, output_dir=output_dir, extension=args.extension)

        if args.extended_prediction_data:
            ps = Predictions()
            ps.line_path = sample['image_path'] if 'image_path' in sample else sample['id']
            ps.predictions.extend([prediction] + [r.prediction for r in result])
            output_dir = output_dir if output_dir else os.path.dirname(ps.line_path)
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)

            if args.extended_prediction_data_format == "pred":
                data = zlib.compress(ps.to_json(indent=2, ensure_ascii=False).encode('utf-8'))
            elif args.extended_prediction_data_format == "json":
                # remove logits
                for p in ps.predictions:
                    p.logits = None

                data = ps.to_json(indent=2)
            else:
                raise Exception("Unknown prediction format.")

            reader.store_extended_prediction(data, sample, output_dir=output_dir,
                                             extension=args.extended_prediction_data_format)

    logger.info("Average sentence confidence: {:.2%}".format(avg_sentence_confidence / n_predictions))

    reader.store(args.extension)
    logger.info("All prediction files written")
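# In the two newer run() variants above, the extended ".pred" output is
# zlib-compressed JSON rather than a serialized protobuf as in the older
# variants. A stdlib round-trip of that container format (the payload dict is
# illustrative only, not the real Predictions schema):
import json
import zlib

blob = zlib.compress(json.dumps({"predictions": []}, indent=2, ensure_ascii=False).encode("utf-8"))
restored = json.loads(zlib.decompress(blob).decode("utf-8"))
assert restored == {"predictions": []}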