def __init__(self):
        """Parse CLI args and derive NARRE data/model file paths for training.

        Reads the dataset location from the parsed arguments, builds the
        train/valid/test/para file paths under <data_dir>/narre, and creates a
        timestamped record-file path for this run.
        """
        self.parser = get_params()
        self.args = self.parser.parse_args()

        self.root_data_dir = self.args.root_data_dir
        self.data_name = self.args.dataset
        self.data_dir = self.root_data_dir + '/' + self.data_name

        print("gpu_index", self.args.gpu_index)
        # NOTE(review): tf.device(...) outside a `with` block is a no-op — it
        # builds a device context manager that is never entered, so it does not
        # actually pin ops to this GPU. Confirm whether a `with` was intended.
        tf.device('/gpu:{}'.format(self.args.gpu_index))

        # NOTE(review): hardcoded GPU list ignores args.gpu_index above —
        # presumably machine-specific; verify before reusing on another host.
        os.environ['CUDA_VISIBLE_DEVICES'] = "1,4,7,8"

        self.model_type = 'narre'
        self.model_dir = self.data_dir + '/' + self.model_type
        self.train_input = self.model_dir + '/' + self.model_type + '.train'
        self.valid_input = self.model_dir + '/' + self.model_type + '.valid'
        self.test_input = self.model_dir + '/' + self.model_type + '.test'
        self.model_para = self.model_dir + '/' + self.model_type + '.para'

        # Unix timestamp of "now" — makes the record filename unique per run.
        dtime = datetime.datetime.now()
        un_time = int(time.mktime(dtime.timetuple()))
        self.record_file = self.model_dir + '/' + self.model_type + '_' + str(
            un_time) + '.record'

        self.batch_size = self.args.batch_size
        self.num_epoches = self.args.num_epoches
        self.dropout_keep_prob = self.args.dropout_keep_prob
# --- 示例 #2 (Example 2) ---
def sym_wordcorrect(conf, uncorrected_dir, corrected_dir):
    """Correct OCR files from inputdir specified in config.ini - using word level SymSpell.

    Args:
        conf: Config section; indexed via the key tuple from util.get_params.
        uncorrected_dir: Directory containing the uncorrected OCR novels.
        corrected_dir: Directory to write per-novel corrected output into.
    """
    print("Initialize SymSpell")
    sym_spell = SymSpell()
    param_tuple, param_str = util.get_params(conf)
    # param_tuple[1] names the config key that holds the dictionary path.
    dictionary_path = conf[param_tuple[1]]
    # term_index=0, count_index=1: dictionary lines are "<term> <count>".
    sym_spell.load_dictionary(dictionary_path, 0, 1)

    # Sort novels, just because; then correct each novel
    sorted_novels = sorted_listdir(uncorrected_dir)
    for novel in sorted_novels:
        novel_str = get_novel_string(novel, uncorrected_dir)
        # Correct individual words using SymSpell
        corrected_novel_str = word_correct_text(novel_str, sym_spell)
        # Create output folder if not exists and write to file.
        outfolder = os.path.join(corrected_dir, novel)
        # exist_ok=True is the idiomatic form of the old try/except FileExistsError.
        os.makedirs(outfolder, exist_ok=True)
        outpath = os.path.join(outfolder,
                               os.path.basename(novel) + '.corrected.txt')
        print(outpath)
        with open(outpath, 'w') as f:
            f.write(corrected_novel_str + "\n")
def main():
    """Load the 'eval' config section and run the gold-VRT error analysis."""
    parser = configparser.ConfigParser()
    parser.read(os.path.join(ROOT_PATH, 'config', 'config.ini'))
    eval_conf = parser['eval']
    *_, param_str = util.get_params(eval_conf)
    # Generate various paths and create them if necessary.
    # TODO Does this still work ..?
    paths = EvalPaths(eval_conf, param_str)
    analyze_gold_vrt(paths.annotated_gold_vrt_path, eval_conf,
                     paths.analyses_dir, param_str, n_datasets=5)
# --- 示例 #4 (Example 4) ---
    def __init__(self):
        """Parse CLI args and derive all dataset and NARRE model file paths.

        Builds input CSV paths, review/vocab pickle paths, sequence-length
        limits, and the per-model train/valid/test/para paths, creating the
        model directory if needed.
        """
        self.parser = get_params()
        self.args = self.parser.parse_args()
        self.model_type = 'narre'

        self.root_data_dir = self.args.root_data_dir
        self.data_name = self.args.dataset
        # os.path.join used consistently (original mixed '+'-concat and join).
        self.data_dir = os.path.join(self.root_data_dir, self.data_name)

        self.train_file = os.path.join(self.data_dir,
                                       self.data_name + '_train.csv')
        self.valid_file = os.path.join(self.data_dir,
                                       self.data_name + '_valid.csv')
        self.test_file = os.path.join(self.data_dir,
                                      self.data_name + '_test.csv')
        self.user_reviews_file = os.path.join(self.data_dir, 'user_review')
        self.item_reviews_file = os.path.join(self.data_dir, 'item_review')
        self.user_rids_file = os.path.join(self.data_dir, 'user_rid')
        self.item_rids_file = os.path.join(self.data_dir, 'item_rid')
        self.vocab_file_bert = os.path.join(self.data_dir, 'vocab.txt')
        self.vocab_file = os.path.join(self.data_dir, 'vocab.pk')

        # Max number of reviews per user/item and max tokens per review.
        self.u_max_num = self.args.u_max_num
        self.u_max_len = self.args.u_max_len
        self.i_max_num = self.args.i_max_num
        self.i_max_len = self.args.i_max_len

        self.model_dir = os.path.join(self.data_dir, self.model_type)
        # exist_ok avoids the check-then-create race of the original
        # `if not os.path.exists(...): os.makedirs(...)`.
        os.makedirs(self.model_dir, exist_ok=True)
        self.train_input = os.path.join(self.model_dir,
                                        self.model_type + '.train')
        self.valid_input = os.path.join(self.model_dir,
                                        self.model_type + '.valid')
        self.test_input = os.path.join(self.model_dir,
                                       self.model_type + '.test')
        self.model_para = os.path.join(self.model_dir,
                                       self.model_type + '.para')
# --- 示例 #5 (Example 5) ---
def main():
    """Run the OCR pipeline."""
    starttime = datetime.now()
    config = util.get_config()

    # Generate various paths and create them if necessary.
    conf = util.Confs(config).corrconf
    *_, param_str = util.get_params(conf)
    pth = util.CorrPaths(conf)

    # Which OCR traineddata should be used?
    # Note! frk.traineddata must be downloaded from tessdata_fast in order to work:
    # https://github.com/tesseract-ocr/tessdata_fast/blob/master/frk.traineddata
    # Same for dan.traineddata: https://github.com/tesseract-ocr/tessdata_fast/blob/master/dan.traineddata
    # fraktur.traineddata can be downloaded from tessdata_best:
    # https://github.com/tesseract-ocr/tessdata_best/blob/master/script/Fraktur.traineddata
    traineddata_labels = ['Fraktur', 'dan', 'frk']
    # One Tesseract output dir per traineddata model.
    tess_outdirs = [
        os.path.join(pth.fulloutputdir, f'tess_out_{label}')
        for label in traineddata_labels
    ]
    uncorrected_dir = os.path.join(pth.fulloutputdir, conf['base_ocr'])
    corrected_dir = os.path.join(pth.fulloutputdir, param_str)

    # Steps of the pipeline. Set options in the config file for which processing steps to perform.
    if conf.getboolean('run_make_dictionary'):
        make_dic(conf['metadir'])
    if conf.getboolean('run_pdf2img'):
        pdfs2imgs(pth.frakturpaths, pth.img_dir, int(conf['split_size']))
    if conf.getboolean('run_ocr'):
        do_ocr(pth.img_dir, pth.fulloutputdir, traineddata_labels)
    # The correction steps chain: after each enabled step, its output becomes
    # the input (uncorrected_dir) of the next step — order matters here.
    if conf.getboolean('correct_easy'):
        correct_easy_fraktur_errors(uncorrected_dir, corrected_dir)
        uncorrected_dir = corrected_dir
    if conf.getboolean('correct_hard'):
        correct_hard_fraktur_errors(uncorrected_dir, pth.fulloutputdir,
                                    corrected_dir)
        uncorrected_dir = corrected_dir
    if conf.getboolean('sym_wordcorrect'):
        sym_wordcorrect(conf, uncorrected_dir, corrected_dir)
    # TODO Will it make any sense to employ SymSpell at the bigram level? Probably not?
    # if conf.getboolean('make_basic_gold_vrt'):
    #     gold_vrt_gen = generate_novels_vrt(corrpaths.gold_novels_dir, corrpaths.corp_label)
    #     write_novels_vrt(gold_vrt_gen, corrpaths.basic_gold_vrt_path)
    # if conf.getboolean('annotate_gold_vrt'):
    #     text_annotation_generator = generate_gold_annotations(corrpaths.basic_gold_vrt_path, corrpaths.ocr_kb_dir,
    #                                                           conf['texton_out_dir'], corrpaths.corp_label, tess_outdirs,
    #                                                           [corrected_dir], conf)  # TODO single dir instead of list of dirs?
    #     write_annotated_gold_vrt(text_annotation_generator, corrpaths.local_annotated_gold_vrt_path)
    #     shutil.copy(corrpaths.local_annotated_gold_vrt_path, corrpaths.annotated_gold_vrt_path)
    # if conf.getboolean('analyze_errors'):
    #     # TODO Not very transparent error when n_datasets is wrong.
    #     analyze_gold_vrt(corrpaths.annotated_gold_vrt_path, conf, corrpaths.analyses_dir, param_str, n_datasets=5)
    # if conf.getboolean('write_korp_configs'):
    #     util.write_frakturgold_mode(conf['frakturgold_mode_template'],
    #                                 conf['gold_vrt_p_attrs'],
    #                                 conf['frakturgold_mode_outpath'])
    #     shutil.copy(conf['frakturgold_mode_outpath'], os.path.join(corrpaths.vrt_dir, 'memo_frakturgold_mode.js'))
    #     util.write_frakturgold_encodescript(conf['frakturgold_encode_template'],
    #                                         corrpaths.annotated_outdir,
    #                                         conf['gold_vrt_p_attrs'],
    #                                         conf['frakturgold_encode_outpath'])
    #     shutil.copy(conf['frakturgold_encode_outpath'], os.path.join(corrpaths.vrt_dir, 'encode_MEMO_fraktur_gold.sh'))
    # if conf.getboolean('write_word'):
    #     pass

    # Report wall-clock duration of the whole pipeline run.
    endtime = datetime.now()
    elapsed = endtime - starttime
    print(f"Start: {starttime.strftime('%H:%M:%S')}")
    print(f"End:   {endtime.strftime('%H:%M:%S')}")
    print(f"Elapsed: {elapsed}")