def ocr_box_file(self, box_fname):
    """OCR one segmented .box file.

    Writes three text outputs next to the input (as-is / without ngram /
    with ngram), configures a per-file log, and prints the output paths.

    :param box_fname: path to the input ``.box`` file; all output names
        are derived from it by swapping the extension.
    """
    # Derive sibling output names by swapping the '.box' extension.
    # (A def, not a lambda assignment — PEP 8 E731.)
    def derived(suffix):
        return box_fname.replace('.box', suffix)

    asis_fname = derived('.ml.txt')
    nogram_out_fname = derived('.nogram.txt')
    ngram_out_fname = derived('.gram.txt')
    log_fname = derived('.{}.log'.format(self.loglevelname))
    logging.basicConfig(filename=log_fname, level=self.loglevel, filemode="w")

    # Read Bantries & write the most-likely ("as is") output.
    bf = BantryFile(box_fname)
    with open(asis_fname, 'w', encoding='utf-8') as f:
        f.write(post_process(bf.text))

    # Decode each line twice: from a-priori scores alone, and with ngrams.
    ngrammed_lines, notgrammed_lines = [], []
    for linenum in range(bf.num_lines):
        print("Line ", linenum)
        line_bantries = bf.get_line_bantires(linenum)
        gramgraph = GramGraph(line_bantries)
        gramgraph.process_tree()
        notgrammed_lines.append(gramgraph.get_best_apriori_str())
        ngrammed_lines.append(gramgraph.get_best_str())

    nogram_out = post_process("\n".join(notgrammed_lines))
    with open(nogram_out_fname, 'w', encoding='utf-8') as out_file:
        out_file.write(nogram_out)

    ngram_out = post_process("\n".join(ngrammed_lines))
    with open(ngram_out_fname, 'w', encoding='utf-8') as out_file:
        out_file.write(ngram_out)

    print("Input : ", box_fname)
    print("As is output : ", asis_fname)
    print("Without ngram : ", nogram_out_fname)
    print("With ngram : ", ngram_out_fname)
    print("Log : ", log_fname)
if __name__ == "__main__":
    # Demo driver: classify one .box file and dump per-line ngram decoding
    # details (top path nodes per graph node, best strings with and without
    # the language model).
    import sys
    from scaler import ScalerFactory
    from bantry import Bantry, BantryFile
    from classifier import Classifier
    from ngram import Ngram

    # Positional arguments, each falling back to the bundled sample data.
    nnet_file = sys.argv[1] if len(sys.argv) > 1 else "library/nn.pkl"
    banti_file_name = sys.argv[2] if len(sys.argv) > 2 else "sample_images/praasa.box"
    scaler_prms_file = sys.argv[3] if len(sys.argv) > 3 else "scalings/relative48.scl"
    labellings_file = sys.argv[4] if len(sys.argv) > 4 else "labellings/alphacodes.lbl"
    ngram_file = "library/mega.123.pkl"

    # Wire the class-level collaborators before building the BantryFile.
    Bantry.scaler = ScalerFactory(scaler_prms_file)
    Bantry.classifier = Classifier(nnet_file, labellings_file, logbase=1)
    bf = BantryFile(banti_file_name)
    ngram = Ngram(ngram_file)
    GramGraph.set_ngram(ngram)

    for linenum in range(bf.num_lines):
        print('*' * 80)
        bantires = bf.get_line_bantires(linenum)
        gramgraph = GramGraph(bantires)
        gramgraph.process_tree()
        gramgraph.find_top_ngram_paths()
        # Fix: the old `for node, children in enumerate(...)` bound
        # `children` without ever using it; iterate the indices directly.
        for node in range(len(gramgraph.lchildren)):
            print(gramgraph.top_pathnodes_at(node, 1))
        print(gramgraph.get_best_str('|'))
        print(gramgraph.get_best_apriori_str('|'))
# Derive output/log file names from the input name via `replace` (defined
# earlier in this file — TODO confirm it swaps the '.box' extension).
# Fix: use the public logging.getLevelName() instead of the private
# logging._levelToName mapping; identical result for registered levels.
log_fname = replace('.{}.log'.format(logging.getLevelName(loglevel)).lower())
asis_fname = replace('.ml.txt')
nogram_out_fname = replace('.nogram.txt')
ngram_out_fname = replace('.gram.txt')
logging.basicConfig(filename=log_fname, level=loglevel, filemode="w")

############################## Set-up scaler, classifier, ngram etc.
Bantry.scaler = ScalerFactory(scaler_fname)
Bantry.classifier = Classifier(nnet_fname, labels_fname, logbase=1)
ng = Ngram(ngram_fname)
Bantry.ngram = ng
GramGraph.set_ngram(ng)

############################## Read Bantries & get Most likely output
bf = BantryFile(banti_fname)
with open(asis_fname, 'w', encoding='utf-8') as f:
    f.write(post_process(bf.text))

############################## Process using ngrams
# Decode each line twice: a-priori scores only, and with the ngram model.
ngrammed_lines, notgrammed_lines = [], []
for linenum in range(bf.num_lines):
    print("Line ", linenum)
    line_bantries = bf.get_line_bantires(linenum)
    gramgraph = GramGraph(line_bantries)
    gramgraph.process_tree()
    notgrammed_lines.append(gramgraph.get_best_apriori_str())
    ngrammed_lines.append(gramgraph.get_best_str())
nogram_out = post_process("\n".join(notgrammed_lines))
# Load the three inputs: scaler parameters (Python literal), trained
# network parameters (pickle), and label->unicode mappings (Python literal).
with open(scaler_prms_file, 'r') as sfp:
    scaler_prms = ast.literal_eval(sfp.read())
with open(nnet_prms_file_name, 'rb') as nnet_prms_file:
    # NOTE(review): pickle.load on an external file executes arbitrary code
    # when unpickling — only load trusted model files.
    nnet_prms = pickle.load(nnet_prms_file)
with open(labelings_file_name, encoding='utf-8') as labels_fp:
    labellings = ast.literal_eval(labels_fp.read())
# print(labellings)
chars = LabelToUnicodeConverter(labellings).onecode

############################################# Init Network
Bantry.scaler = ScalerFactory(scaler_prms)
bf = BantryFile(banti_file_name)
# Force batch size 1 so the test model processes one glyph at a time.
nnet_prms['training_params']['BATCH_SZ'] = 1
ntwk = NeuralNet(**nnet_prms)
tester = ntwk.get_data_test_model(go_nuts=True)

############################################# Image saver
# Output directory named after the network file with its last 7 characters
# (presumably a '.pkl'-style suffix — TODO confirm) stripped.
dir_name = os.path.basename(nnet_prms_file_name)[:-7] + '/'
if not os.path.exists(dir_name):
    os.makedirs(dir_name)
# namer(index, char, i) -> e.g. '007_x_03.png' inside dir_name.
namer = (dir_name + '{:03d}_{}_{:02d}.png').format
print("Look for me in :", dir_name)


def saver(outs, ch, debug=True):
    # NOTE(review): this definition is truncated in the visible source —
    # the loop body continues beyond this chunk. Left as-is.
    saver.index += 1
    for i, out in enumerate(outs):
from bantry import Bantry, BantryFile
from classifier import Classifier
from ngram import Ngram

# Command-line overrides, defaulting to the bundled sample data.
# (`sys` and `ScalerFactory` are expected to be imported earlier in this
# file — they are referenced below but not imported in this chunk.)
nnet_file = sys.argv[1] if len(sys.argv) > 1 else "library/nn.pkl"
banti_file_name = sys.argv[2] if len(
    sys.argv) > 2 else "sample_images/praasa.box"
scaler_prms_file = sys.argv[3] if len(
    sys.argv) > 3 else "scalings/relative48.scl"
labellings_file = sys.argv[4] if len(
    sys.argv) > 4 else "labellings/alphacodes.lbl"
ngram_file = "library/mega.123.pkl"

# Wire the class-level collaborators before building the BantryFile.
Bantry.scaler = ScalerFactory(scaler_prms_file)
Bantry.classifier = Classifier(nnet_file, labellings_file, logbase=1)
bf = BantryFile(banti_file_name)
ngram = Ngram(ngram_file)
GramGraph.set_ngram(ngram)

for linenum in range(bf.num_lines):
    print('*' * 80)
    bantires = bf.get_line_bantires(linenum)
    gramgraph = GramGraph(bantires)
    gramgraph.process_tree()
    gramgraph.find_top_ngram_paths()
    # Fix: the old `for node, children in enumerate(...)` bound `children`
    # without ever using it; iterate the indices directly.
    for node in range(len(gramgraph.lchildren)):
        print(gramgraph.top_pathnodes_at(node, 1))
    print(gramgraph.get_best_str('|'))
    print(gramgraph.get_best_apriori_str('|'))