def ocr_box_file(self, box_fname):
    """Run OCR on one box file and write the results beside it.

    Three text files are written: the most likely reading taken as is
    (``.ml.txt``), the best reading before n-grams are applied
    (``.nogram.txt``) and the best reading with n-grams (``.gram.txt``).
    Logging is directed to ``.<loglevelname>.log`` at ``self.loglevel``.

    Args:
        box_fname: Path of the input box file.  A trailing ``.box`` is
            swapped for each output suffix; for any other name the suffix
            is appended, so the input file itself is never overwritten.
    """
    # Set up the names of output files.
    # Only a trailing '.box' is stripped.  str.replace() would also rewrite
    # '.box' inside directory names and, for a name without '.box', would
    # hand back the input path unchanged -- which the writes below (mode
    # "w") would then truncate.
    box_ext = '.box'
    if box_fname.endswith(box_ext):
        stem = box_fname[:-len(box_ext)]
    else:
        stem = box_fname

    asis_fname = stem + '.ml.txt'
    nogram_out_fname = stem + '.nogram.txt'
    ngram_out_fname = stem + '.gram.txt'
    log_fname = stem + '.{}.log'.format(self.loglevelname)

    # NOTE: basicConfig() does nothing once the root logger has handlers,
    # so only the first call in a process actually selects the log file.
    logging.basicConfig(filename=log_fname, level=self.loglevel, filemode="w")

    # Read Bantries & get Most likely output
    bf = BantryFile(box_fname)
    with open(asis_fname, 'w', encoding='utf-8') as f:
        f.write(post_process(bf.text))

    # Process using ngrams, keeping the with- and without-ngram readings
    # of every line side by side.
    ngrammed_lines, notgrammed_lines = [], []
    for linenum in range(bf.num_lines):
        print("Line ", linenum)
        line_bantries = bf.get_line_bantires(linenum)
        gramgraph = GramGraph(line_bantries)
        gramgraph.process_tree()
        notgrammed_lines.append(gramgraph.get_best_apriori_str())
        ngrammed_lines.append(gramgraph.get_best_str())

    nogram_out = post_process("\n".join(notgrammed_lines))
    with open(nogram_out_fname, 'w', encoding='utf-8') as out_file:
        out_file.write(nogram_out)

    ngram_out = post_process("\n".join(ngrammed_lines))
    with open(ngram_out_fname, 'w', encoding='utf-8') as out_file:
        out_file.write(ngram_out)

    # Tell the user where everything went.
    print("Input : ", box_fname)
    print("As is output : ", asis_fname)
    print("Without ngram : ", nogram_out_fname)
    print("With ngram : ", ngram_out_fname)
    print("Log : ", log_fname)
# NOTE(review): `replace`, `loglevel` and the *_fname inputs are defined
# outside this excerpt -- presumably earlier in the enclosing function.

# Name the log after the textual level.  logging.getLevelName() is the
# public API for what the private logging._levelToName mapping holds.
log_fname = replace('.{}.log'.format(logging.getLevelName(loglevel)).lower())
asis_fname = replace('.ml.txt')
nogram_out_fname = replace('.nogram.txt')
ngram_out_fname = replace('.gram.txt')

logging.basicConfig(filename=log_fname, level=loglevel, filemode="w")

############################## Set-up scaler, classifier, ngram etc.
Bantry.scaler = ScalerFactory(scaler_fname)
Bantry.classifier = Classifier(nnet_fname, labels_fname, logbase=1)
# The same Ngram instance is shared by Bantry and GramGraph.
ng = Ngram(ngram_fname)
Bantry.ngram = ng
GramGraph.set_ngram(ng)

############################## Read Bantries & get Most likely output
bf = BantryFile(banti_fname)
with open(asis_fname, 'w', encoding='utf-8') as f:
    f.write(post_process(bf.text))

############################## Process using ngrams
# Collect, per line, the best string with and without the ngram model.
ngrammed_lines, notgrammed_lines = [], []
for linenum in range(bf.num_lines):
    print("Line ", linenum)
    line_bantries = bf.get_line_bantires(linenum)
    gramgraph = GramGraph(line_bantries)
    gramgraph.process_tree()
    notgrammed_lines.append(gramgraph.get_best_apriori_str())
    ngrammed_lines.append(gramgraph.get_best_str())

nogram_out = post_process("\n".join(notgrammed_lines))
else: decent = np.arange(self.nclasses) return [(ch, logprobs[i]) for i in decent for ch in self.unichars[i]] if __name__ == "__main__": import sys from scaler import ScalerFactory from bantry import Bantry, BantryFile banti_file_name = sys.argv[1] if len(sys.argv) > 1 else "sample_images/praasa.box" nnet_file = sys.argv[2] if len(sys.argv) > 2 else "library/nn.pkl" scaler_prms_file = sys.argv[3] if len(sys.argv) > 3 else "scalings/relative48.scl" labellings_file = sys.argv[4] if len(sys.argv) > 4 else "labellings/alphacodes.lbl" Bantry.scaler = ScalerFactory(scaler_prms_file) Bantry.classifier = Classifier(nnet_file, labellings_file, logbase=10, only_top=3) bf = BantryFile(banti_file_name) for linenum in range(bf.num_lines): print('*' * 80) line_bantries = bf.get_line_bantires(linenum) for bantree in line_bantries: print(bantree.scaled) for char, logprob in bantree.likelies: print(np.exp(logprob), char) print(bf.text)