def main2(input_file, output_file):
    with open(input_file) as f_in:
        data = json.load(f_in)
    tmp = []
    tr = Translator()
    for elem in data['data']:
        # keep only words that have at least one translation;
        # take the first part of speech and its first translation
        trans = tr.translate(elem)
        if not trans:
            continue
        key = list(trans.keys())[0]
        val = trans[key][0]
        tmp.append({'word': elem, 'translations': {key: val}})
    data['data'] = tmp
    with open(output_file, 'w+') as f_out:
        f_out.write(json.dumps(data))
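# Usage sketch for main2 (file names are hypothetical, not part of the original
# script): the input JSON is expected to look like {"data": ["cat", "dog", ...]},
# and the output keeps each word together with its first available translation.
if __name__ == '__main__':
    main2('words.json', 'words_translated.json')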
def post(self):
    try:
        data = json.loads(self.request.body)
    except json.JSONDecodeError:
        logging.warning(f"got incorrect body: {self.request.body}")
        self.write(json.dumps({'error': 'incorrect-format'}))
        return

    if 'word' not in data:
        logging.warning(f"got incorrect body: {self.request.body}")
        self.write(json.dumps({'error': 'incorrect-format'}))
        return

    word = data['word'].lower().strip()
    if not check_english_word(word):
        logging.warning(f"got incorrect word: {word}")
        self.write(json.dumps({'error': 'incorrect-format'}))
        return

    try:
        translation = Translator().translate(word)
    except RuntimeError:
        logging.error("translation error")
        self.write(json.dumps({'error': 'translation-error'}))
        return

    logging.debug(f"got translation {translation} for word {data['word']}")
    self.write(
        json.dumps({
            'result': 'ok',
            'data': {
                'translations': translation
            }
        }))
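# The handler above relies on a check_english_word helper that is not shown here.
# A minimal stand-in (an assumption for illustration, not the project's actual
# implementation) that accepts a single lowercase Latin-alphabet token:
import re


def check_english_word(word: str) -> bool:
    # one token, ASCII letters only, optionally with internal hyphens/apostrophes
    return bool(re.fullmatch(r"[a-z]+(?:[-'][a-z]+)*", word))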
def main4(input_file, output_file):
    with open(input_file) as f_in:
        data = json.load(f_in)
    tmp = []
    for elem in data['data']:
        # skip multi-word entries
        if ' ' in elem['word']:
            continue
        # keep only the first part of speech and its first translation
        key = list(elem['translations'].keys())[0]
        val = elem['translations'][key][0]
        tmp.append({
            'word': elem['word'].lower(),
            'translations': {
                key: [val]
            }
        })
    data['data'] = tmp
    with open(output_file, 'w+') as f_out:
        f_out.write(json.dumps(data))
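# Illustrative round trip for main4 (the sample contents and file names are
# invented for this sketch; the real input comes from the earlier translation step):
if __name__ == '__main__':
    sample = {
        'data': [
            {'word': 'Cat', 'translations': {'noun': ['gato', 'gata']}},
            {'word': 'guinea pig', 'translations': {'noun': ['cobaya']}},
        ]
    }
    with open('raw.json', 'w') as f:
        json.dump(sample, f)
    # drops the multi-word entry and keeps only {'noun': ['gato']} for 'cat'
    main4('raw.json', 'clean.json')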
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = "0" parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) grandfatherdir = os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, parentdir) sys.path.insert(0, grandfatherdir) from utils.query_util import tokenize from utils.translator import Translator import dmn.char.dmn_data_utils as dmn_data_utils from dmn.char.dmn_plus import Config, DMN_PLUS translator = Translator() EPOCH = 5 def prepare_data(args, config): # train, valid, word_embedding, word2vec, updated_embedding, max_q_len, max_input_len, max_sen_len, \ # num_supporting_facts, vocab_size, candidate_size, candid2idx, \ # idx2candid, w2idx, idx2w = dmn_data_utils.load_data( # config, split_sentences=True) train_data, val_data, test_data, metadata = dmn_data_utils.load_data( config, split_sentences=True) # metadata = dict() data = dict() data['train'] = train_data data['valid'] = val_data
def train(self):
    """ Main training method for the Trainer class """
    print("Starting training for {} epoch(s)".format(self.max_num_epochs -
                                                     self.epoch))
    if not self.params.boost_warmup:
        hard_training_instances = []
    for epoch in range(self.max_num_epochs):
        self.epoch = epoch
        print("Epoch {}/{}".format(epoch + 1, self.max_num_epochs))

        # train the model on the train set
        epoch_start_time = time.time()

        # Make a copy of train_iter, add new examples to it (if boost==True),
        # and pass it into train_epoch()
        data_iterator = self.train_iter

        # If boost==True and epochs are past warmup, perform boosting
        if self.params.boost and epoch + 1 > self.params.boost_warmup:
            print("Boosting....")
            # make `Example` objects for all hard training instances
            example_objs = self.create_example_objs(hard_training_instances)

            # Add the new hard training instances to the original training data,
            # thereby `boosting` the dataset with hard training examples
            existing_data = self.train_iter.data()
            existing_data.extend(example_objs)

            # Create a new Dataset and iterator on the boosted data
            data_iterator = self.create_boosted_dataset(existing_data)

        train_loss_avg, hard_training_instances = self.train_epoch(
            data_iterator)

        # write epoch statistics to Tensorboard
        self.summary_writer.add_scalar("train/avg_loss_per_epoch",
                                       train_loss_avg, self.epoch)
        self.summary_writer.add_scalar("train/avg_perplexity_epoch",
                                       math.exp(train_loss_avg), self.epoch)

        epoch_end_time = time.time()
        epoch_mins, epoch_secs = self.epoch_time(epoch_start_time,
                                                 epoch_end_time)
        print(
            f'Epoch: {epoch+1:02} | Avg Train Loss: {train_loss_avg} | Perplexity: {math.exp(train_loss_avg)} | Time: {epoch_mins}m {epoch_secs}s'
        )

        # validate the model on the dev set
        val_start_time = time.time()
        val_loss_avg = self.validate()
        val_end_time = time.time()
        val_mins, val_secs = self.epoch_time(val_start_time, val_end_time)

        # write validation statistics to Tensorboard
        self.summary_writer.add_scalar("val/loss", val_loss_avg, self.epoch)
        self.summary_writer.add_scalar("val/perplexity",
                                       math.exp(val_loss_avg), self.epoch)

        # every `decode_every_num_epochs` epochs, write out translations using
        # Greedy Decoding to Tensorboard
        if (self.epoch + 1) % self.decode_every_num_epochs == 0:
            print("Performing Greedy Decoding...")
            num_translations = 5
            dev_iter = copy.copy(self.dev_iter)
            decoder = Translator(model=self.model,
                                 dev_iter=list(dev_iter)[:num_translations],
                                 params=self.params,
                                 device=self.params.device)
            translations = decoder.greedy_decode(max_len=100)
            translations = [
                " ".join(translation) for translation in translations
            ]
            for translation in translations:
                self.summary_writer.add_text("transformer/translation",
                                             translation, self.epoch)

        print(
            f'Avg Val Loss: {val_loss_avg} | Val Perplexity: {math.exp(val_loss_avg)} | Time: {val_mins}m {val_secs}s'
        )
        print('\n')

        # use a scheduler to decay the learning rate if validation loss hasn't improved
        if self.scheduler is not None:
            self.scheduler.step(val_loss_avg)

        is_best = val_loss_avg < self.best_val_loss
        optim_dict = self.optimizer._optimizer.state_dict() if isinstance(
            self.optimizer,
            ScheduledOptimizer) else self.optimizer.state_dict()

        # save checkpoint
        self.save_checkpoint(
            {
                "epoch": epoch + 1,
                "state_dict": self.model.state_dict(),
                "optim_dict": optim_dict
            },
            is_best=is_best,
            checkpoint=self.params.model_dir + "/checkpoints/")

        if is_best:
            print("- Found new lowest loss!")
            self.best_val_loss = val_loss_avg
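# The loop above assumes an epoch_time helper on the Trainer; a minimal version
# (a sketch, not necessarily the project's exact implementation) just splits the
# elapsed wall-clock time into whole minutes and seconds:
def epoch_time(self, start_time, end_time):
    elapsed = end_time - start_time
    elapsed_mins = int(elapsed / 60)
    elapsed_secs = int(elapsed - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs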
    replace_unk=False,
    phrase_table='',
    verbose=True,
    dump_beam='',
    n_best=1,
    batch_type='sents',
    gpu=0)

fields, model, model_opt = load_test_model(opt, args)
scorer = GNMTGlobalScorer.from_opt(opt)
out_file = codecs.open(opt.output, 'w+', 'utf-8')
translator = Translator.from_opt(model,
                                 fields,
                                 opt,
                                 model_opt,
                                 args,
                                 global_scorer=scorer,
                                 out_file=out_file,
                                 report_align=opt.report_align,
                                 report_score=False,
                                 logger=None)
res = []
n = 1
with open(args.input_file, 'r') as f:
    lines = f.readlines()
lines = [line.strip() for line in lines]
translated = translator.translate(lines, batch_size=args.batch_size)
for i in range(len(translated[1])):
    res.append(translated[1][i][0])
if args.output_file:
def main(params, greedy, beam_size, test):
    """
    The main function for decoding a trained MT model

    Arguments:
        params: parameters related to the `model` that is being decoded
        greedy: whether or not to do greedy decoding
        beam_size: size of beam if doing beam search
        test: whether to decode the test set instead of the dev set
    """
    print("Loading dataset...")
    _, dev_iter, test_iterator, DE, EN = load_dataset(params.data_path,
                                                      params.train_batch_size,
                                                      params.dev_batch_size)
    de_size, en_size = len(DE.vocab), len(EN.vocab)
    print("[DE Vocab Size]: {}, [EN Vocab Size]: {}".format(de_size, en_size))

    params.src_vocab_size = de_size
    params.tgt_vocab_size = en_size
    params.sos_index = EN.vocab.stoi["<s>"]
    params.pad_token = EN.vocab.stoi["<pad>"]
    params.eos_index = EN.vocab.stoi["</s>"]
    params.itos = EN.vocab.itos

    device = torch.device('cuda' if params.cuda else 'cpu')
    params.device = device

    # make the Seq2Seq model
    model = make_seq2seq_model(params)

    # load the saved model for evaluation
    if params.average > 1:
        print("Averaging the last {} checkpoints".format(params.average))
        checkpoint = {}
        checkpoint["state_dict"] = average_checkpoints(params.model_dir,
                                                       params.average)
        model = Trainer.load_checkpoint(model, checkpoint)
    else:
        model_path = os.path.join(params.model_dir + "checkpoints/",
                                  params.model_file)
        print("Restoring parameters from {}".format(model_path))
        model = Trainer.load_checkpoint(model, model_path)

    # evaluate on the test set
    if test:
        print("Doing Beam Search on the Test Set")
        test_decoder = Translator(model, test_iterator, params, device)
        test_beam_search_outputs = test_decoder.beam_decode(
            beam_width=beam_size)
        test_decoder.output_decoded_translations(
            test_beam_search_outputs,
            "beam_search_outputs_size_test={}.en".format(beam_size))
        return

    # instantiate a Translator object to translate the SRC language to the TRG
    # language using Greedy/Beam Decoding
    decoder = Translator(model, dev_iter, params, device)

    if greedy:
        print("Doing Greedy Decoding...")
        greedy_outputs = decoder.greedy_decode(max_len=100)
        decoder.output_decoded_translations(greedy_outputs,
                                            "greedy_outputs.en")

        print("Evaluating BLEU Score on Greedy Translation...")
        subprocess.call([
            './utils/eval.sh',
            params.model_dir + "outputs/greedy_outputs.en"
        ])

    if beam_size:
        print("Doing Beam Search...")
        beam_search_outputs = decoder.beam_decode(beam_width=beam_size)
        decoder.output_decoded_translations(
            beam_search_outputs,
            "beam_search_outputs_size={}.en".format(beam_size))

        print("Evaluating BLEU Score on Beam Search Translation")
        subprocess.call([
            './utils/eval.sh',
            params.model_dir +
            "outputs/beam_search_outputs_size={}.en".format(beam_size)
        ])
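# The BLEU evaluation above shells out to ./utils/eval.sh; an equivalent in-process
# check (a sketch using sacrebleu, not the project's own script) compares a file of
# decoded hypotheses against a reference file, both assumed to already exist:
def compute_bleu(hypothesis_file, reference_file):
    import sacrebleu

    with open(hypothesis_file) as f:
        hyps = [line.strip() for line in f]
    with open(reference_file) as f:
        refs = [line.strip() for line in f]
    # corpus_bleu takes the hypotheses and a list of reference streams
    return sacrebleu.corpus_bleu(hyps, [refs]).score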