def ocr_box_file(self, box_fname):
    """OCR one ``.box`` file and write three text outputs beside it.

    Output files (derived by replacing the ``.box`` suffix):
      * ``.ml.txt``     -- the as-is (most likely) recognized text
      * ``.nogram.txt`` -- best a-priori string per line (no ngram rescoring)
      * ``.gram.txt``   -- best string per line after ngram rescoring
    A ``.<loglevelname>.log`` file is also written via ``logging.basicConfig``.

    :param box_fname: path to the input ``.box`` file.
    """
    # Derive output file names from the input name.
    # PEP 8 (E731): use a def instead of assigning a lambda to a name.
    def derived(suffix):
        return box_fname.replace('.box', suffix)

    asis_fname = derived('.ml.txt')
    nogram_out_fname = derived('.nogram.txt')
    ngram_out_fname = derived('.gram.txt')
    log_fname = derived('.{}.log'.format(self.loglevelname))
    logging.basicConfig(filename=log_fname, level=self.loglevel,
                        filemode="w")

    # Read Bantries & get the most likely ("as is") output.
    bf = BantryFile(box_fname)
    with open(asis_fname, 'w', encoding='utf-8') as f:
        f.write(post_process(bf.text))

    # Process each line twice: with and without ngram rescoring.
    ngrammed_lines, notgrammed_lines = [], []
    for linenum in range(bf.num_lines):
        print("Line ", linenum)
        # NOTE(review): "bantires" is the external API's own spelling.
        line_bantries = bf.get_line_bantires(linenum)
        gramgraph = GramGraph(line_bantries)
        gramgraph.process_tree()
        notgrammed_lines.append(gramgraph.get_best_apriori_str())
        ngrammed_lines.append(gramgraph.get_best_str())

    nogram_out = post_process("\n".join(notgrammed_lines))
    with open(nogram_out_fname, 'w', encoding='utf-8') as out_file:
        out_file.write(nogram_out)

    ngram_out = post_process("\n".join(ngrammed_lines))
    with open(ngram_out_fname, 'w', encoding='utf-8') as out_file:
        out_file.write(ngram_out)

    print("Input : ", box_fname)
    print("As is output : ", asis_fname)
    print("Without ngram : ", nogram_out_fname)
    print("With ngram : ", ngram_out_fname)
    print("Log : ", log_fname)
if __name__ == "__main__":
    import sys
    from scaler import ScalerFactory
    from bantry import Bantry, BantryFile
    from classifier import Classifier
    from ngram import Ngram

    def _cli_arg(position, fallback):
        # Positional CLI argument, or the bundled sample default.
        return sys.argv[position] if len(sys.argv) > position else fallback

    nnet_file = _cli_arg(1, "library/nn.pkl")
    banti_file_name = _cli_arg(2, "sample_images/praasa.box")
    scaler_prms_file = _cli_arg(3, "scalings/relative48.scl")
    labellings_file = _cli_arg(4, "labellings/alphacodes.lbl")
    ngram_file = "library/mega.123.pkl"

    # Wire the scaler, classifier and ngram model into the pipeline classes.
    Bantry.scaler = ScalerFactory(scaler_prms_file)
    Bantry.classifier = Classifier(nnet_file, labellings_file, logbase=1)
    page = BantryFile(banti_file_name)
    GramGraph.set_ngram(Ngram(ngram_file))

    # Dump, per line: the top path through each node, then the best string
    # with and without ngram rescoring ('|' separates candidates).
    for line_no in range(page.num_lines):
        print('*' * 80)
        graph = GramGraph(page.get_line_bantires(line_no))
        graph.process_tree()
        graph.find_top_ngram_paths()
        for node, _children in enumerate(graph.lchildren):
            print(graph.top_pathnodes_at(node, 1))
        print(graph.get_best_str('|'))
        print(graph.get_best_apriori_str('|'))
# NOTE(review): continuation of a top-level OCR script; names such as
# nnet_fname, labels_fname, ngram_fname, banti_fname, asis_fname,
# nogram_out_fname and ngram_out_fname are defined earlier, outside this view.
Bantry.classifier = Classifier(nnet_fname, labels_fname, logbase=1)
ng = Ngram(ngram_fname)
# Share one ngram model between the glyph source and the line graph.
Bantry.ngram = ng
GramGraph.set_ngram(ng)

############################## Read Bantries & get Most likely output
bf = BantryFile(banti_fname)
with open(asis_fname, 'w', encoding='utf-8') as f:
    f.write(post_process(bf.text))

############################## Process using ngrams
# Collect, per line, the best string after ngram rescoring and the best
# a-priori string without it.
ngrammed_lines, notgrammed_lines = [], []
for linenum in range(bf.num_lines):
    print("Line ", linenum)
    line_bantries = bf.get_line_bantires(linenum)
    gramgraph = GramGraph(line_bantries)
    gramgraph.process_tree()
    notgrammed_lines.append(gramgraph.get_best_apriori_str())
    ngrammed_lines.append(gramgraph.get_best_str())

nogram_out = post_process("\n".join(notgrammed_lines))
with open(nogram_out_fname, 'w', encoding='utf-8') as out_file:
    out_file.write(nogram_out)

ngram_out = post_process("\n".join(ngrammed_lines))
with open(ngram_out_fname, 'w', encoding='utf-8') as out_file:
    out_file.write(ngram_out)

print("Input : ", banti_fname)
print("As is output : ", asis_fname)