Example #1
    def translate(self):
        """TODO: Docstring for translate.
        :returns: TODO

        """
        def wait():
            # Runs in the worker thread: block until the translation
            # process finishes, then restore the UI.
            trans.wait()
            self.reset_ui()
        trainer = self.trainers[self.data['model']]
        model_path = self.get_model_path()
        state_path = self.get_state_path()
        in_path = self.data['input_file']
        out_path = self.data['output_file']
        glog.log('Model: %s' % model_path)
        glog.log('State: %s' % state_path)
        glog.log('File to translate: %s' % in_path)
        glog.log('File to save translation: %s' % out_path)
        trans = start_trans(trainer, state_path, model_path, in_path, out_path)
        # Wait for the translation in a daemon thread so the UI stays responsive.
        trans_thread = Thread(target=wait)
        trans_thread.daemon = True
        self.btn_back['state'] = tk.DISABLED
        self.btn_trans['text'] = 'Translating...'
        self.btn_trans['state'] = tk.DISABLED
        trans_thread.start()
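
The method relies on start_trans() returning a handle whose wait() blocks until the external translator exits, and on a daemon thread to keep the Tk main loop responsive. Below is a minimal standalone sketch of the same pattern, assuming the translator is launched as a subprocess; run_translator and the button arguments are illustrative assumptions, not part of the original code:

import subprocess
import tkinter as tk
from threading import Thread

def run_translator(cmd, btn_back, btn_trans):
    """Start the external translator and restore the buttons when it exits."""
    proc = subprocess.Popen(cmd)  # non-blocking start of the translator process

    def wait():
        proc.wait()  # block only in the background thread
        # Route the widget update through the Tk event loop instead of
        # touching the widgets from the worker thread.
        btn_trans.after(0, lambda: (
            btn_back.config(state=tk.NORMAL),
            btn_trans.config(state=tk.NORMAL, text='Translate')))

    btn_back.config(state=tk.DISABLED)
    btn_trans.config(state=tk.DISABLED, text='Translating...')
    Thread(target=wait, daemon=True).start()
    return proc

The original example calls self.reset_ui() directly from the worker thread; the after() call above is just one common way to route the widget update back through the Tk event loop.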
Example #2
    def preprocess(self):
        """TODO: Docstring for preprocess.
        :returns: TODO

        """
        glog.log('Preparing text data')
        glog.log('Copying source text files to project directory')
        copyfile(self.conf['text1_path'], self.prel('lang1.txt'))
        copyfile(self.conf['text2_path'], self.prel('lang2.txt'))

        glog.log('Before tokenization preprocessing')
        if self.conf['norm_punctuation']:
            glog.log('Normalize punctuation')
            with ChangeInPlace(self.prel('lang1.txt')) as (pin, pout):
                preprocess.norm_punct(pin, pout)
            with ChangeInPlace(self.prel('lang2.txt')) as (pin, pout):
                preprocess.norm_punct(pin, pout)

        if self.conf['clean_corpus']:
            glog.log('Clean corpus')
            with ChangeInPlace(self.prel('lang1.txt')) as (p1in, p1out),\
                    ChangeInPlace(self.prel('lang2.txt')) as (p2in, p2out):
                preprocess.filter_by_words_len(
                    p1in, p2in, p1out, p2out, self.conf['clean_corpus_n'])

        glog.log('Tokenization')
        preprocess.tokenize(
            self.conf['lang1'],
            self.prel('lang1.txt'),
            self.prel('lang1.tok.txt'))
        preprocess.tokenize(
            self.conf['lang2'],
            self.prel('lang2.txt'),
            self.prel('lang2.tok.txt'))
        glog.log('After tokenization preprocessing')

        if self.conf['lower_casing']:
            glog.log('Lower casing')
            with ChangeInPlace(self.prel('lang1.tok.txt')) as (pin, pout):
                preprocess.lowercase(pin, pout)
            with ChangeInPlace(self.prel('lang2.tok.txt')) as (pin, pout):
                preprocess.lowercase(pin, pout)

        glog.log('Preprocess')
        preprocess.preprocess(
            self.prel('vocab.lang1.pkl'),
            self.prel('binarized_text.lang1.pkl'),
            self.prel('lang1.tok.txt'))
        preprocess.preprocess(
            self.prel('vocab.lang2.pkl'),
            self.prel('binarized_text.lang2.pkl'),
            self.prel('lang2.tok.txt'))

        glog.log('Invert dict')
        preprocess.invert_dict(
            self.prel('vocab.lang1.pkl'),
            self.prel('ivocab.lang1.pkl'))
        preprocess.invert_dict(
            self.prel('vocab.lang2.pkl'),
            self.prel('ivocab.lang2.pkl'))

        glog.log('Convert pkl-hdf5')
        preprocess.convert_dict(
            self.prel('binarized_text.lang1.pkl'),
            self.prel('binarized_text.lang1.h5'))
        preprocess.convert_dict(
            self.prel('binarized_text.lang2.pkl'),
            self.prel('binarized_text.lang2.h5'))

        glog.log('Shuffle hdf5')
        preprocess.shuffle_hdf5(
            self.prel('binarized_text.lang1.h5'),
            self.prel('binarized_text.lang2.h5'),
            self.prel('binarized_text.lang1.shuffle.h5'),
            self.prel('binarized_text.lang2.shuffle.h5'))
        self.set_state(1)
        signal('trainer.jobfinished').send()
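
ChangeInPlace is used above as a context manager that yields a read handle for the current file and a write handle for its replacement, so each preprocessing step can rewrite a file "in place". Its implementation is not shown in the example; the following is a minimal sketch of such a helper under that assumption (the name change_in_place and the atomic-replace behaviour are guesses, not the original API):

import os
import tempfile
from contextlib import contextmanager

@contextmanager
def change_in_place(path):
    """Yield (read_handle, write_handle); swap the new content into `path` on success."""
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or '.')
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as fout, \
                open(path, 'r', encoding='utf-8') as fin:
            yield fin, fout
        os.replace(tmp_path, path)  # atomically replace the original file
    except Exception:
        os.remove(tmp_path)  # leave the original file untouched on failure
        raise

Under this assumption, `with change_in_place(path) as (pin, pout):` reads the old contents from pin, writes the transformed contents to pout, and replaces the file only if the block completes without raising, which matches how ChangeInPlace is used in the example.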