def translate(self):
    """Start a translation job and lock the UI until it has finished.

    The trainer is looked up in ``self.trainers`` by ``self.data['model']``;
    the input and output file paths are read from ``self.data``.  All paths
    are logged, the job is started through ``start_trans`` and the back and
    translate buttons are disabled.  A daemon thread then waits on the job
    and calls ``self.reset_ui()`` once it is done.

    :returns: None

    """
    engine = self.trainers[self.data['model']]
    model_file = self.get_model_path()
    state_file = self.get_state_path()
    source_file = self.data['input_file']
    target_file = self.data['output_file']

    # Log every path involved, in a fixed order.
    for template, value in (
            ('Model: %s', model_file),
            ('State: %s', state_file),
            ('File to translate: %s', source_file),
            ('File to save translation: %s', target_file)):
        glog.log(template % value)

    job = start_trans(engine, state_file, model_file,
                      source_file, target_file)

    def finish_when_done():
        # Runs on the watcher thread: block until the job ends, then
        # hand the UI back to the user.
        job.wait()
        self.reset_ui()

    watcher = Thread(target=finish_when_done)
    watcher.daemon = True

    # Disable the controls before the watcher starts, so reset_ui() can
    # never run ahead of these assignments.
    self.btn_back['state'] = tk.DISABLED
    self.btn_trans['text'] = 'Translating...'
    self.btn_trans['state'] = tk.DISABLED
    watcher.start()
def preprocess(self):
    """Run the whole text preparation pipeline for both languages.

    Steps, in order:

    1. copy the two source texts (``text1_path`` / ``text2_path`` from
       ``self.conf``) to ``lang1.txt`` / ``lang2.txt``;
    2. optionally normalize punctuation (``norm_punctuation``) and filter
       the corpus by word count (``clean_corpus`` / ``clean_corpus_n``);
    3. tokenize, then optionally lower-case (``lower_casing``);
    4. build vocabularies and binarized texts, invert the vocabularies,
       convert the binarized texts to hdf5 and shuffle them.

    Every file name is resolved through ``self.prel`` (presumably a
    project-relative path helper -- confirm against its definition).
    When all steps are done the state is set to ``1`` and the
    ``trainer.jobfinished`` signal is sent.

    :returns: None

    """
    glog.log('Preparing text data')

    glog.log('Copying source text files to project directory')
    for conf_key, target in (('text1_path', 'lang1.txt'),
                             ('text2_path', 'lang2.txt')):
        copyfile(self.conf[conf_key], self.prel(target))

    glog.log('Before tokenization preprocessing')
    if self.conf['norm_punctuation']:
        glog.log('Normalize punctuation')
        for name in ('lang1.txt', 'lang2.txt'):
            with ChangeInPlace(self.prel(name)) as (src, dst):
                preprocess.norm_punct(src, dst)

    if self.conf['clean_corpus']:
        glog.log('Clean corpus')
        # Both files are rewritten together so the filter can process
        # the two sides in a single call.
        with ChangeInPlace(self.prel('lang1.txt')) as (src1, dst1):
            with ChangeInPlace(self.prel('lang2.txt')) as (src2, dst2):
                preprocess.filter_by_words_len(
                    src1, src2, dst1, dst2, self.conf['clean_corpus_n'])

    glog.log('Tokenization')
    for lang_key, raw, tokenized in (
            ('lang1', 'lang1.txt', 'lang1.tok.txt'),
            ('lang2', 'lang2.txt', 'lang2.tok.txt')):
        preprocess.tokenize(
            self.conf[lang_key], self.prel(raw), self.prel(tokenized))

    glog.log('After tokenization preprocessing')
    if self.conf['lower_casing']:
        glog.log('Lower casing')
        for name in ('lang1.tok.txt', 'lang2.tok.txt'):
            with ChangeInPlace(self.prel(name)) as (src, dst):
                preprocess.lowercase(src, dst)

    glog.log('Preprocess')
    for vocab, binarized, tokenized in (
            ('vocab.lang1.pkl', 'binarized_text.lang1.pkl',
             'lang1.tok.txt'),
            ('vocab.lang2.pkl', 'binarized_text.lang2.pkl',
             'lang2.tok.txt')):
        preprocess.preprocess(
            self.prel(vocab), self.prel(binarized), self.prel(tokenized))

    glog.log('Invert dict')
    for vocab, inverted in (('vocab.lang1.pkl', 'ivocab.lang1.pkl'),
                            ('vocab.lang2.pkl', 'ivocab.lang2.pkl')):
        preprocess.invert_dict(self.prel(vocab), self.prel(inverted))

    glog.log('Convert pkl-hdf5')
    for pickled, hdf5 in (
            ('binarized_text.lang1.pkl', 'binarized_text.lang1.h5'),
            ('binarized_text.lang2.pkl', 'binarized_text.lang2.h5')):
        preprocess.convert_dict(self.prel(pickled), self.prel(hdf5))

    glog.log('Shuffle hdf5')
    # Argument order: both inputs first, then both shuffled outputs.
    preprocess.shuffle_hdf5(*[
        self.prel(name) for name in (
            'binarized_text.lang1.h5',
            'binarized_text.lang2.h5',
            'binarized_text.lang1.shuffle.h5',
            'binarized_text.lang2.shuffle.h5')])

    self.set_state(1)
    signal('trainer.jobfinished').send()