class TesseractTrainerTest(unittest.TestCase):
    """ Step-by-step tests of the TesseractTrainer training pipeline.

    A single trainer instance is shared by all tests. The test methods are
    numbered because unittest runs them in alphabetical order of their names;
    presumably each training step relies on the files written by the previous
    one, so the tests are not meant to be run in isolation.
    """

    # Names of the files checked by the rename and clean tests.
    GENERATED_DURING_TRAINING = ['unicharset', 'pffmtable', 'Microfeat', 'inttemp', 'normproto', 'mfunicharset']

    @classmethod
    def setUpClass(cls):
        """ Setup a trainer with default arguments, shared by every test of the class. """
        cls.trainer = TesseractTrainer(dictionary_name='test',
                                       text='text',
                                       font_name='helveticanarrow',
                                       font_path='./font/Helvetica-Narrow.otf',
                                       font_size=df.FONT_SIZE,
                                       exp_number=df.EXP_NUMBER,
                                       font_properties=df.FONT_PROPERTIES,
                                       tessdata_path=df.TESSDATA_PATH,
                                       word_list=df.WORD_LIST,
                                       verbose=False)
        # Common prefix of the tif/box/tr files: <dictionary>.<font>.exp<number>
        cls.prefix = '%s.%s.exp%d' % (cls.trainer.dictionary_name, cls.trainer.font_name, cls.trainer.exp_number)

    @classmethod
    def tearDownClass(cls):
        """ Delete all temporary files created during tests """
        tempfiles = glob.glob(cls.trainer.dictionary_name + '.' + cls.trainer.font_name + '*')
        for tempfile in tempfiles:
            os.remove(tempfile)
        # The traineddata file only exists if the combine step succeeded; guard the
        # removal so that a failed test is not followed by a spurious teardown error.
        traineddata = cls.trainer.dictionary_name + '.traineddata'
        if os.path.exists(traineddata):
            os.remove(traineddata)

    def assertFileExists(self, f):
        """ Raise an AssertionError if the file `f` cannot be opened for reading. """
        try:
            handle = open(f)
        except IOError:
            raise AssertionError('The file %s does not exist.' % (f))
        handle.close()  # do not leak the file descriptor

    def assertFileDoesNotExist(self, f):
        """ Raise an AssertionError if the file `f` can be opened for reading. """
        try:
            handle = open(f)
        except IOError:
            return
        handle.close()  # do not leak the file descriptor
        raise AssertionError('The file %s does exist.' % (f))

    def test1_generate_boxfile(self):
        """ Test if the tif and box files are correctly created after executing the self.trainer._generate_boxfile() method. """
        self.trainer._generate_boxfile()
        self.assertFileExists(self.prefix + '.tif')
        self.assertFileExists(self.prefix + '.box')

    def test2_train_on_boxfile(self):
        """ Test if the tr file is correctly created after executing the self.trainer._train_on_boxfile() method. """
        self.trainer._train_on_boxfile()
        self.assertFileExists(self.prefix + '.tr')

    def test3_compute_character_set(self):
        """ Test if the unicharset file is correctly created after executing the self.trainer._compute_character_set() method. """
        self.trainer._compute_character_set()
        self.assertFileExists('unicharset')

    def test4_clustering(self):
        """ Test if the mfunicharset, inttemp, Microfeat and pffmtable files are correctly created
            after executing the self.trainer._clustering() method.
        """
        self.trainer._clustering()
        self.assertFileExists('mfunicharset')
        self.assertFileExists('inttemp')
        self.assertFileExists('Microfeat')
        self.assertFileExists('pffmtable')

    def test5_normalize(self):
        """ Test if the normproto file is correctly created after executing the self.trainer._normalize() method. """
        self.trainer._normalize()
        self.assertFileExists('normproto')

    def test6_rename_files(self):
        """ Check if all generated files were renamed to 'self.trainer.dictionary_name'.old_name
            after executing the self.trainer._rename_files() method.
        """
        self.trainer._rename_files()
        for filename in self.GENERATED_DURING_TRAINING:
            # Compare with != and not with `not in`: on strings `in` is a substring
            # test, and 'unicharset' is a substring of 'mfunicharset', so the
            # unicharset file would silently be skipped as well.
            if filename != 'mfunicharset':  # mfunicharset does not need to be renamed
                self.assertFileExists(self.trainer.dictionary_name + '.' + filename)
                self.assertFileDoesNotExist(filename)

    def test7_combine_data(self):
        """ Test if the traineddata file is correctly created after executing the self.trainer._combine_data() method. """
        self.trainer._combine_data()
        self.assertFileExists(self.trainer.dictionary_name + '.traineddata')

    def test8_clean(self):
        """ Test if all the generated files were removed after executing the self.trainer.clean() method. """
        self.trainer.clean()
        for filename in self.GENERATED_DURING_TRAINING:
            # Equality, not substring membership (see test6_rename_files).
            if filename != 'mfunicharset':
                # Renamed files are checked under their '<dictionary_name>.' name.
                self.assertFileDoesNotExist(self.trainer.dictionary_name + '.' + filename)
            else:
                # mfunicharset is checked under its original name.
                self.assertFileDoesNotExist(filename)
예제 #2
0
    parser.add_argument('--experience_number', '-e', type=int, action='store', default=df.EXP_NUMBER,
        help="The number of the training experience.")
    parser.add_argument('--font-properties', '-f', type=str, action='store', default=df.FONT_PROPERTIES,
        help="The path of a file containing font properties for a list of training fonts.")
    parser.add_argument('--font-size', '-s', type=int, action='store', default=df.FONT_SIZE,
        help="The font size of the training font, in px.")
    parser.add_argument('--tessdata-path', '-p', type=str, action='store', default=df.TESSDATA_PATH,
        help="The path of the tessdata/ directory on your filesystem.")
    parser.add_argument('--word_list', '-w', type=str, action='store', default=df.WORD_LIST,
        help="The path of a file containing a list of frequent words.")
    parser.add_argument('--verbose', '-v', action='store_true',
        help="Use this argument if you want to display the training output.")
    args = parser.parse_args()

    perform_security_checks(args)  # Check validity of args

    # Training process
    trainer = TesseractTrainer(dictionary_name=args.tesseract_lang,
                                text=args.training_text,
                                font_name=args.font_name,
                                font_path=args.font_path,
                                font_size=args.font_size,
                                exp_number=args.experience_number,
                                font_properties=args.font_properties,
                                tessdata_path=args.tessdata_path,
                                word_list=args.word_list,
                                verbose=args.verbose)
    trainer.training()  # generate a multipage tif from args.training_text, train on it and generate a traineddata file
    trainer.clean()  # remove all files generated in the training process (except the traineddata file)
    trainer.add_trained_data()  # copy the traineddata file to the tessdata/ directory