示例#1
0
    def ocr_box_file(self, box_fname):
        """OCR one box file and write its text outputs next to it.

        Four files are derived from ``box_fname`` by substituting its
        ``.box`` part: ``.ml.txt`` (post-processed ``BantryFile.text``),
        ``.nogram.txt`` (per-line ``get_best_apriori_str()`` results),
        ``.gram.txt`` (per-line ``get_best_str()`` results) and
        ``.<loglevelname>.log`` (the logging output file).

        Args:
            box_fname: Path of the input box file, normally containing
                ``.box``.

        Returns:
            None. Results are written to disk and the file names are
            printed to stdout.
        """
        def replace(s):
            # Substitute only the LAST '.box'.  A plain str.replace() would
            # rewrite every occurrence (e.g. inside a directory name) and,
            # when '.box' is absent, return the input path unchanged -- the
            # log and output files would then overwrite the input file.
            head, found, tail = box_fname.rpartition('.box')
            if not found:
                return box_fname + s
            return head + s + tail

        # Set up the names of output files
        asis_fname = replace('.ml.txt')
        nogram_out_fname = replace('.nogram.txt')
        ngram_out_fname = replace('.gram.txt')

        log_fname = replace('.{}.log'.format(self.loglevelname))
        # NOTE: logging.basicConfig() does nothing when the root logger
        # already has handlers, so only the first call in a process picks
        # the log file.
        logging.basicConfig(filename=log_fname,
                            level=self.loglevel,
                            filemode="w")

        # Read Bantries & get Most likely output
        bf = BantryFile(box_fname)
        with open(asis_fname, 'w', encoding='utf-8') as f:
            f.write(post_process(bf.text))

        # Process using ngrams: one GramGraph per line, collecting both the
        # a-priori best string and the best string after process_tree().
        ngrammed_lines, notgrammed_lines = [], []
        for linenum in range(bf.num_lines):
            print("Line ", linenum)
            line_bantries = bf.get_line_bantires(linenum)
            gramgraph = GramGraph(line_bantries)
            gramgraph.process_tree()
            notgrammed_lines.append(gramgraph.get_best_apriori_str())
            ngrammed_lines.append(gramgraph.get_best_str())

        nogram_out = post_process("\n".join(notgrammed_lines))
        with open(nogram_out_fname, 'w', encoding='utf-8') as out_file:
            out_file.write(nogram_out)

        ngram_out = post_process("\n".join(ngrammed_lines))
        with open(ngram_out_fname, 'w', encoding='utf-8') as out_file:
            out_file.write(ngram_out)

        print("Input : ", box_fname)
        print("As is output : ", asis_fname)
        print("Without ngram : ", nogram_out_fname)
        print("With ngram : ", ngram_out_fname)
        print("Log : ", log_fname)
示例#2
0
    def __init__(self,
                 nnet_fname,
                 scaler_fname,
                 labels_fname,
                 ngram_fname,
                 logbase=1,
                 loglevel=logging.INFO,):
        """Store the configuration and wire up the shared OCR components.

        Besides setting instance attributes, this assigns the class-level
        attributes ``Bantry.scaler``, ``Bantry.classifier`` and
        ``Bantry.ngram`` and calls ``GramGraph.set_ngram``, so it affects
        every user of those classes in the process.

        Args:
            nnet_fname: Passed to ``Classifier`` as its first argument.
            scaler_fname: Passed to ``ScalerFactory``.
            labels_fname: Passed to ``Classifier`` as its second argument.
            ngram_fname: Passed to ``Ngram``.
            logbase: Forwarded to ``Classifier`` as ``logbase``.
            loglevel: Numeric logging level; its lower-cased name is kept
                in ``self.loglevelname``.
        """
        self.nnet_fname = nnet_fname
        self.scaler_fname = scaler_fname
        self.labels_fname = labels_fname
        self.ngram_fname = ngram_fname
        self.logbase = logbase
        self.loglevel = loglevel
        # Use the public accessor rather than the private
        # logging._levelToName table: same result for the standard levels,
        # and custom numeric levels yield "level N" instead of a KeyError.
        self.loglevelname = logging.getLevelName(loglevel).lower()

        Bantry.scaler = ScalerFactory(scaler_fname)
        Bantry.classifier = Classifier(nnet_fname, labels_fname,
                                       logbase=logbase)
        # One Ngram instance is shared by this object, Bantry and GramGraph.
        self.ng = Ngram(ngram_fname)
        Bantry.ngram = self.ng
        GramGraph.set_ngram(self.ng)
示例#3
0
    def __init__(
        self,
        nnet_fname,
        scaler_fname,
        labels_fname,
        ngram_fname,
        logbase=1,
        loglevel=logging.INFO,
    ):
        """Record the settings and install the shared scaler/classifier/ngram.

        Side effects beyond the instance: the class attributes
        ``Bantry.scaler``, ``Bantry.classifier`` and ``Bantry.ngram`` are
        assigned, and ``GramGraph.set_ngram`` is called.

        Args:
            nnet_fname: First argument given to ``Classifier``.
            scaler_fname: Argument given to ``ScalerFactory``.
            labels_fname: Second argument given to ``Classifier``.
            ngram_fname: Argument given to ``Ngram``.
            logbase: Passed through to ``Classifier`` as ``logbase``.
            loglevel: Numeric logging level; ``self.loglevelname`` holds
                its lower-cased name.
        """
        self.nnet_fname = nnet_fname
        self.scaler_fname = scaler_fname
        self.labels_fname = labels_fname
        self.ngram_fname = ngram_fname
        self.logbase = logbase
        self.loglevel = loglevel
        # logging.getLevelName() is the documented way to map a level to
        # its name; unlike the private logging._levelToName dict it does
        # not raise KeyError for levels that were never registered.
        self.loglevelname = logging.getLevelName(loglevel).lower()

        Bantry.scaler = ScalerFactory(scaler_fname)
        Bantry.classifier = Classifier(nnet_fname,
                                       labels_fname,
                                       logbase=logbase)
        # The same Ngram object is handed to Bantry and GramGraph.
        self.ng = Ngram(ngram_fname)
        Bantry.ngram = self.ng
        GramGraph.set_ngram(self.ng)
示例#4
0
    def ocr_box_file(self, box_fname):
        """Process a single box file and write the recognised text to disk.

        Output names are built from ``box_fname`` by swapping its ``.box``
        part for ``.ml.txt`` (post-processed ``BantryFile.text``),
        ``.nogram.txt`` (joined ``get_best_apriori_str()`` results),
        ``.gram.txt`` (joined ``get_best_str()`` results) and
        ``.<loglevelname>.log`` (logging output).

        Args:
            box_fname: Path of the box file to read, normally containing
                ``.box``.

        Returns:
            None. The outputs are files; their names are printed to stdout.
        """
        def replace(s):
            # Only the last '.box' is swapped.  str.replace() would touch
            # every occurrence in the path and, if there is none, hand back
            # the input path itself so the outputs clobber the input file.
            head, found, tail = box_fname.rpartition('.box')
            if not found:
                return box_fname + s
            return head + s + tail

        # Set up the names of output files
        asis_fname = replace('.ml.txt')
        nogram_out_fname = replace('.nogram.txt')
        ngram_out_fname = replace('.gram.txt')

        log_fname = replace('.{}.log'.format(self.loglevelname))
        # NOTE: logging.basicConfig() is a no-op once the root logger has
        # handlers, so a later call in the same process keeps the earlier
        # log file.
        logging.basicConfig(filename=log_fname,
                            level=self.loglevel,
                            filemode="w")

        # Read Bantries & get Most likely output
        bf = BantryFile(box_fname)
        with open(asis_fname, 'w', encoding='utf-8') as f:
            f.write(post_process(bf.text))

        # Process using ngrams: build a GramGraph for each line and keep
        # both of the strings it reports.
        ngrammed_lines, notgrammed_lines = [], []
        for linenum in range(bf.num_lines):
            print("Line ", linenum)
            line_bantries = bf.get_line_bantires(linenum)
            gramgraph = GramGraph(line_bantries)
            gramgraph.process_tree()
            notgrammed_lines.append(gramgraph.get_best_apriori_str())
            ngrammed_lines.append(gramgraph.get_best_str())

        nogram_out = post_process("\n".join(notgrammed_lines))
        with open(nogram_out_fname, 'w', encoding='utf-8') as out_file:
            out_file.write(nogram_out)

        ngram_out = post_process("\n".join(ngrammed_lines))
        with open(ngram_out_fname, 'w', encoding='utf-8') as out_file:
            out_file.write(ngram_out)

        print("Input : ", box_fname)
        print("As is output : ", asis_fname)
        print("Without ngram : ", nogram_out_fname)
        print("With ngram : ", ngram_out_fname)
        print("Log : ", log_fname)
示例#5
0
}.get(sys.argv[-1][:2].lower(), logging.INFO)

# Derive every output/log file name from the input name by swapping '.box'.
# NOTE(review): str.replace() rewrites every '.box' occurrence, and if the
# name contains none the derived names equal banti_fname itself -- confirm
# callers always pass a '*.box' path.
replace = lambda s: banti_fname.replace('.box', s)
# The log name embeds the level name; the whole formatted suffix is
# lower-cased.
log_fname = replace('.{}.log'.format(logging._levelToName[loglevel]).lower())
asis_fname = replace('.ml.txt')
nogram_out_fname = replace('.nogram.txt')
ngram_out_fname = replace('.gram.txt')

# filemode="w" truncates any existing log file of the same name.
logging.basicConfig(filename=log_fname, level=loglevel, filemode="w")

############################## Set-up scaler, classifier, ngram etc.
# These are class-level assignments, shared by all Bantry instances.
Bantry.scaler = ScalerFactory(scaler_fname)
Bantry.classifier = Classifier(nnet_fname, labels_fname, logbase=1)
# The same Ngram object is given to both Bantry and GramGraph.
ng = Ngram(ngram_fname)
Bantry.ngram = ng
GramGraph.set_ngram(ng)

############################## Read Bantries & get Most likely output
bf = BantryFile(banti_fname)
with open(asis_fname, 'w', encoding='utf-8') as f:
    f.write(post_process(bf.text))

############################## Process using ngrams
# Per-line results are collected here; the visible part of the loop only
# appends to notgrammed_lines.
ngrammed_lines, notgrammed_lines = [], []

for linenum in range(bf.num_lines):
    print("Line ", linenum)
    line_bantries = bf.get_line_bantires(linenum)
    gramgraph = GramGraph(line_bantries)
    gramgraph.process_tree()
    notgrammed_lines.append(gramgraph.get_best_apriori_str())