class TesseractTrainerTest(unittest.TestCase):
    """Check the files produced by each step of a ``TesseractTrainer`` run.

    A single trainer is created once in ``setUpClass`` and shared by every
    test.  The test methods are numbered because unittest runs them sorted by
    name; presumably each step relies on the files left behind by the
    previous one (TODO confirm against ``TesseractTrainer``).
    """

    # Names of the files the tests expect the training steps to produce.
    GENERATED_DURING_TRAINING = ['unicharset', 'pffmtable', 'Microfeat',
                                 'inttemp', 'normproto', 'mfunicharset']

    @classmethod
    def setUpClass(cls):
        """Set up a shared trainer with default arguments and the file prefix."""
        cls.trainer = TesseractTrainer(dictionary_name='test',
                                       text='text',
                                       font_name='helveticanarrow',
                                       font_path='./font/Helvetica-Narrow.otf',
                                       font_size=df.FONT_SIZE,
                                       exp_number=df.EXP_NUMBER,
                                       font_properties=df.FONT_PROPERTIES,
                                       tessdata_path=df.TESSDATA_PATH,
                                       word_list=df.WORD_LIST,
                                       verbose=False)
        # Common prefix of the .tif/.box/.tr files checked by the first tests.
        cls.prefix = '%s.%s.exp%d' % (cls.trainer.dictionary_name,
                                      cls.trainer.font_name,
                                      cls.trainer.exp_number)

    @classmethod
    def tearDownClass(cls):
        """Delete all temporary files created during the tests."""
        leftovers = glob.glob(cls.trainer.dictionary_name + '.' +
                              cls.trainer.font_name + '*')
        for leftover in leftovers:
            os.remove(leftover)
        # The traineddata file only exists if the combine step succeeded;
        # guard the removal so a failed run is not masked by a teardown error.
        traineddata = cls.trainer.dictionary_name + '.traineddata'
        if os.path.exists(traineddata):
            os.remove(traineddata)

    def assertFileExists(self, f):
        """Fail with ``AssertionError`` if the file ``f`` cannot be opened."""
        try:
            # Open and immediately close: only the ability to open matters.
            with open(f):
                pass
        except IOError:
            raise AssertionError('The file %s does not exist.' % (f))

    def assertFileDoesNotExist(self, f):
        """Fail with ``AssertionError`` if the file ``f`` can be opened."""
        try:
            handle = open(f)
        except IOError:
            return
        handle.close()
        raise AssertionError('The file %s does exist.' % (f))

    def test1_generate_boxfile(self):
        """ Test if the tif and box files are correctly created after
        executing the self.trainer._generate_boxfile() method.
        """
        self.trainer._generate_boxfile()
        self.assertFileExists(self.prefix + '.tif')
        self.assertFileExists(self.prefix + '.box')

    def test2_train_on_boxfile(self):
        """ Test if the tr file is correctly created after executing the
        self.trainer._train_on_boxfile() method.
        """
        self.trainer._train_on_boxfile()
        self.assertFileExists(self.prefix + '.tr')

    def test3_compute_character_set(self):
        """ Test if the unicharset file is correctly created after executing
        the self.trainer._compute_character_set() method.
        """
        self.trainer._compute_character_set()
        self.assertFileExists('unicharset')

    def test4_clustering(self):
        """ Test if the mfunicharset, inttemp, Microfeat and pffmtable files
        are correctly created after executing the self.trainer._clustering()
        method.
        """
        self.trainer._clustering()
        self.assertFileExists('mfunicharset')
        self.assertFileExists('inttemp')
        self.assertFileExists('Microfeat')
        self.assertFileExists('pffmtable')

    def test5_normalize(self):
        """ Test if the normproto file is correctly created after executing
        the self.trainer._normalize() method.
        """
        self.trainer._normalize()
        self.assertFileExists('normproto')

    def test6_rename_files(self):
        """ Check if all generated files were renamed to
        'self.trainer.dictionary_name'.old_name after executing the
        self.trainer._rename_files() method.
        """
        self.trainer._rename_files()
        for filename in self.GENERATED_DURING_TRAINING:
            # Compare by equality: a substring test would also skip
            # 'unicharset', which is contained in 'mfunicharset'.
            if filename != 'mfunicharset':  # mfunicharset does not need to be renamed
                self.assertFileExists(self.trainer.dictionary_name + '.' + filename)
                self.assertFileDoesNotExist(filename)

    def test7_combine_data(self):
        """ Test if the traineddata file is correctly created after executing
        the self.trainer._combine_data() method.
        """
        self.trainer._combine_data()
        self.assertFileExists(self.trainer.dictionary_name + '.traineddata')

    def test8_clean(self):
        """ Test if all the generated files were removed after executing the
        self.trainer.clean() method.
        """
        self.trainer.clean()
        for filename in self.GENERATED_DURING_TRAINING:
            # Equality, not substring membership (see test6_rename_files).
            if filename != 'mfunicharset':
                # Renamed files carry the dictionary name as a prefix.
                self.assertFileDoesNotExist(self.trainer.dictionary_name + '.' + filename)
            else:
                # mfunicharset is never renamed, so check its original name.
                self.assertFileDoesNotExist(filename)
# Remaining command-line options; each value option falls back to the
# matching default from ``df`` when it is not given.
parser.add_argument(
    '--experience_number', '-e',
    action='store',
    type=int,
    default=df.EXP_NUMBER,
    help="The number of the training experience.",
)
parser.add_argument(
    '--font-properties', '-f',
    action='store',
    type=str,
    default=df.FONT_PROPERTIES,
    help="The path of a file containing font properties for a list of training fonts.",
)
parser.add_argument(
    '--font-size', '-s',
    action='store',
    type=int,
    default=df.FONT_SIZE,
    help="The font size of the training font, in px.",
)
parser.add_argument(
    '--tessdata-path', '-p',
    action='store',
    type=str,
    default=df.TESSDATA_PATH,
    help="The path of the tessdata/ directory on your filesystem.",
)
parser.add_argument(
    '--word_list', '-w',
    action='store',
    type=str,
    default=df.WORD_LIST,
    help="The path of a file containing a list of frequent words.",
)
parser.add_argument(
    '--verbose', '-v',
    action='store_true',
    help="Use this argument if you want to display the training output.",
)

args = parser.parse_args()

# Validate the parsed arguments before any training work starts.
perform_security_checks(args)

# Training process: build the trainer from the command-line values.
trainer = TesseractTrainer(
    dictionary_name=args.tesseract_lang,
    text=args.training_text,
    font_name=args.font_name,
    font_path=args.font_path,
    font_size=args.font_size,
    exp_number=args.experience_number,
    font_properties=args.font_properties,
    tessdata_path=args.tessdata_path,
    word_list=args.word_list,
    verbose=args.verbose,
)

# Generate a multipage tif from args.training_text, train on it and
# produce a traineddata file.
trainer.training()

# Remove every file generated during training except the traineddata file.
trainer.clean()

# Copy the traineddata file to the tessdata/ directory.
trainer.add_trained_data()