def __init__(self):
    """Parse CLI arguments and derive all NARRE training file paths.

    Reads --root_data_dir/--dataset/--gpu_index/--batch_size/--num_epoches/
    --dropout_keep_prob from the parser returned by get_params() and builds
    the <data_dir>/narre/narre.{train,valid,test,para} paths plus a
    timestamped .record file.
    """
    self.parser = get_params()
    self.args = self.parser.parse_args()
    self.root_data_dir = self.args.root_data_dir
    self.data_name = self.args.dataset
    self.data_dir = os.path.join(self.root_data_dir, self.data_name)
    print("gpu_index", self.args.gpu_index)
    # BUG FIX: the original called `tf.device('/gpu:N')` as a bare statement,
    # which is a no-op (tf.device only takes effect when entered as a context
    # manager), and then hard-coded CUDA_VISIBLE_DEVICES to "1,4,7,8",
    # silently ignoring the configurable --gpu_index.  Select the requested
    # GPU via CUDA_VISIBLE_DEVICES so --gpu_index actually works.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(self.args.gpu_index)
    self.model_type = 'narre'
    self.model_dir = os.path.join(self.data_dir, self.model_type)
    self.train_input = os.path.join(self.model_dir, self.model_type + '.train')
    self.valid_input = os.path.join(self.model_dir, self.model_type + '.valid')
    self.test_input = os.path.join(self.model_dir, self.model_type + '.test')
    self.model_para = os.path.join(self.model_dir, self.model_type + '.para')
    # Timestamp (local-time epoch seconds, same value the original computed
    # via datetime -> mktime) so successive runs never overwrite each other.
    un_time = int(time.time())
    self.record_file = os.path.join(
        self.model_dir, self.model_type + '_' + str(un_time) + '.record')
    self.batch_size = self.args.batch_size
    self.num_epoches = self.args.num_epoches
    self.dropout_keep_prob = self.args.dropout_keep_prob
def sym_wordcorrect(conf, uncorrected_dir, corrected_dir):
    """Correct OCR files from inputdir specified in config.ini - using word level SymSpell.

    Args:
        conf: config section; conf[param_tuple[1]] must hold the path to the
            SymSpell frequency dictionary (term/count per line).
        uncorrected_dir: directory containing one OCR'ed novel per entry.
        corrected_dir: output root; each novel gets its own subfolder with a
            '<novel>.corrected.txt' file.
    """
    print("Initialize SymSpell")
    sym_spell = SymSpell()
    # Only the parameter tuple is needed here; the second return value
    # (param_str) was unused in the original.
    param_tuple, _ = util.get_params(conf)
    # param_tuple[1] names the config key holding the dictionary path.
    dictionary_path = conf[param_tuple[1]]
    # Column 0 = term, column 1 = frequency count.
    sym_spell.load_dictionary(dictionary_path, 0, 1)
    # Sort novels, just because; then correct each novel
    sorted_novels = sorted_listdir(uncorrected_dir)
    for novel in sorted_novels:
        novel_str = get_novel_string(novel, uncorrected_dir)
        # Correct individual words using SymSpell
        corrected_novel_str = word_correct_text(novel_str, sym_spell)
        # Create output folder if not exists and write to file.
        # Idiomatic fix: exist_ok=True replaces the original
        # try/except FileExistsError dance.
        outfolder = os.path.join(corrected_dir, novel)
        os.makedirs(outfolder, exist_ok=True)
        outpath = os.path.join(outfolder, os.path.basename(novel) + '.corrected.txt')
        print(outpath)
        with open(outpath, 'w') as f:
            f.write(corrected_novel_str + "\n")
def main():
    """Entry point: evaluate the annotated gold VRT using config.ini's [eval] section."""
    parser = configparser.ConfigParser()
    parser.read(os.path.join(ROOT_PATH, 'config', 'config.ini'))
    eval_conf = parser['eval']
    *_, param_str = util.get_params(eval_conf)
    # Generate various paths and create them if necessary.
    # TODO Does this still work ..?
    paths = EvalPaths(eval_conf, param_str)
    analyze_gold_vrt(paths.annotated_gold_vrt_path,
                     eval_conf,
                     paths.analyses_dir,
                     param_str,
                     n_datasets=5)
def __init__(self):
    """Parse CLI arguments and build all dataset/model file paths for NARRE preprocessing.

    Derives the rating-split CSVs, per-user/per-item review files, vocab
    files and the narre model input/parameter paths under
    <root_data_dir>/<dataset>/, creating the model directory if needed.
    """
    self.parser = get_params()
    self.args = self.parser.parse_args()
    self.model_type = 'narre'
    self.root_data_dir = self.args.root_data_dir
    self.data_name = self.args.dataset
    # Consistency fix: build every path with os.path.join instead of mixing
    # it with bare '/' concatenation as the original did.
    self.data_dir = os.path.join(self.root_data_dir, self.data_name)
    # Raw rating splits.
    self.train_file = os.path.join(self.data_dir, self.data_name + '_train.csv')
    self.valid_file = os.path.join(self.data_dir, self.data_name + '_valid.csv')
    self.test_file = os.path.join(self.data_dir, self.data_name + '_test.csv')
    # Per-user / per-item review texts and review-id files.
    self.user_reviews_file = os.path.join(self.data_dir, 'user_review')
    self.item_reviews_file = os.path.join(self.data_dir, 'item_review')
    self.user_rids_file = os.path.join(self.data_dir, 'user_rid')
    self.item_rids_file = os.path.join(self.data_dir, 'item_rid')
    self.vocab_file_bert = os.path.join(self.data_dir, 'vocab.txt')
    self.vocab_file = os.path.join(self.data_dir, 'vocab.pk')
    # Truncation limits: max reviews per user/item, max tokens per review.
    self.u_max_num = self.args.u_max_num
    self.u_max_len = self.args.u_max_len
    self.i_max_num = self.args.i_max_num
    self.i_max_len = self.args.i_max_len
    self.model_dir = os.path.join(self.data_dir, self.model_type)
    # Idiomatic fix: exist_ok=True avoids the exists()/makedirs race in the
    # original `if not os.path.exists(...): os.makedirs(...)`.
    os.makedirs(self.model_dir, exist_ok=True)
    self.train_input = os.path.join(self.model_dir, self.model_type + '.train')
    self.valid_input = os.path.join(self.model_dir, self.model_type + '.valid')
    self.test_input = os.path.join(self.model_dir, self.model_type + '.test')
    self.model_para = os.path.join(self.model_dir, self.model_type + '.para')
def main():
    """Run the OCR pipeline.

    Each stage is toggled by a boolean in the correction config section.
    The correction stages chain: after each one runs, `uncorrected_dir` is
    re-pointed at its output so the next enabled stage consumes the
    previous stage's results.
    """
    starttime = datetime.now()
    config = util.get_config()
    # Generate various paths and create them if necessary.
    conf = util.Confs(config).corrconf
    *_, param_str = util.get_params(conf)
    pth = util.CorrPaths(conf)
    # Which OCR traineddata should be used?
    # Note! frk.traineddata must be downloaded from tessdata_fast in order to work:
    # https://github.com/tesseract-ocr/tessdata_fast/blob/master/frk.traineddata
    # Same for dan.traineddata: https://github.com/tesseract-ocr/tessdata_fast/blob/master/dan.traineddata
    # fraktur.traineddata can be downloaded from tessdata_best:
    # https://github.com/tesseract-ocr/tessdata_best/blob/master/script/Fraktur.traineddata
    traineddata_labels = ['Fraktur', 'dan', 'frk']
    # One Tesseract output dir per traineddata model.
    tess_outdirs = [os.path.join(pth.fulloutputdir, f'tess_out_{label}') for label in traineddata_labels]
    # Start from the OCR output named by conf['base_ocr']; final corrected
    # output goes in a folder named after the parameter string.
    uncorrected_dir = os.path.join(pth.fulloutputdir, conf['base_ocr'])
    corrected_dir = os.path.join(pth.fulloutputdir, param_str)
    # Steps of the pipeline. Set options in the config file for which processing steps to perform.
    if conf.getboolean('run_make_dictionary'):
        make_dic(conf['metadir'])
    if conf.getboolean('run_pdf2img'):
        pdfs2imgs(pth.frakturpaths, pth.img_dir, int(conf['split_size']))
    if conf.getboolean('run_ocr'):
        do_ocr(pth.img_dir, pth.fulloutputdir, traineddata_labels)
    if conf.getboolean('correct_easy'):
        correct_easy_fraktur_errors(uncorrected_dir, corrected_dir)
        # Chain: the next stage reads this stage's output.
        uncorrected_dir = corrected_dir
    if conf.getboolean('correct_hard'):
        correct_hard_fraktur_errors(uncorrected_dir, pth.fulloutputdir, corrected_dir)
        uncorrected_dir = corrected_dir
    if conf.getboolean('sym_wordcorrect'):
        sym_wordcorrect(conf, uncorrected_dir, corrected_dir)
    # TODO Will it make any sense to employ SymSpell at the bigram level? Probably not?
    # if conf.getboolean('make_basic_gold_vrt'):
    #     gold_vrt_gen = generate_novels_vrt(corrpaths.gold_novels_dir, corrpaths.corp_label)
    #     write_novels_vrt(gold_vrt_gen, corrpaths.basic_gold_vrt_path)
    # if conf.getboolean('annotate_gold_vrt'):
    #     text_annotation_generator = generate_gold_annotations(corrpaths.basic_gold_vrt_path, corrpaths.ocr_kb_dir,
    #                                                           conf['texton_out_dir'], corrpaths.corp_label, tess_outdirs,
    #                                                           [corrected_dir], conf)  # TODO single dir instead of list of dirs?
    #     write_annotated_gold_vrt(text_annotation_generator, corrpaths.local_annotated_gold_vrt_path)
    #     shutil.copy(corrpaths.local_annotated_gold_vrt_path, corrpaths.annotated_gold_vrt_path)
    # if conf.getboolean('analyze_errors'):
    #     # TODO Not very transparent error when n_datasets is wrong.
    #     analyze_gold_vrt(corrpaths.annotated_gold_vrt_path, conf, corrpaths.analyses_dir, param_str, n_datasets=5)
    # if conf.getboolean('write_korp_configs'):
    #     util.write_frakturgold_mode(conf['frakturgold_mode_template'],
    #                                 conf['gold_vrt_p_attrs'],
    #                                 conf['frakturgold_mode_outpath'])
    #     shutil.copy(conf['frakturgold_mode_outpath'], os.path.join(corrpaths.vrt_dir, 'memo_frakturgold_mode.js'))
    #     util.write_frakturgold_encodescript(conf['frakturgold_encode_template'],
    #                                         corrpaths.annotated_outdir,
    #                                         conf['gold_vrt_p_attrs'],
    #                                         conf['frakturgold_encode_outpath'])
    #     shutil.copy(conf['frakturgold_encode_outpath'], os.path.join(corrpaths.vrt_dir, 'encode_MEMO_fraktur_gold.sh'))
    # if conf.getboolean('write_word'):
    #     pass
    # Report wall-clock timing for the whole pipeline run.
    endtime = datetime.now()
    elapsed = endtime - starttime
    print(f"Start: {starttime.strftime('%H:%M:%S')}")
    print(f"End: {endtime.strftime('%H:%M:%S')}")
    print(f"Elapsed: {elapsed}")