def exec(self):
    """Run decoding over the whole evaluation set.

    Dispatches on ``self.decode_mode``: beam search is declared but not
    implemented yet (fails fast), while greedy decoding chooses between a
    single-process batched decoder and multi-threaded per-utterance
    decoding depending on ``self.batch_size`` / ``self.njobs``.
    """
    if self.decode_mode != 'greedy':
        logger.notice(f"Start decoding with beam search (with beam size: {self.config['solver']['beam_decode']['beam_size']})")
        # Beam search is not implemented; fail fast.  (Fix: the original
        # assigned self._decode = self.beam_decode AFTER this raise, which
        # was unreachable dead code — removed.)
        raise NotImplementedError(f"{self.decode_mode} haven't supported yet")
    else:
        logger.notice("Start greedy decoding")
        if self.batch_size > 1:
            dev = 'gpu' if self.use_gpu else 'cpu'
            logger.log(f"Number of utterance batches to decode: {len(self.eval_set)}, decoding with {self.batch_size} batch_size using {dev}")
            self._decode = self.batch_greedy_decode
            # Batched decoding is single-process; force njobs to 1 so the
            # parallel branch below is skipped.
            self.njobs = 1
        else:
            logger.log(f"Number of utterances to decode: {len(self.eval_set)}, decoding with {self.njobs} threads using cpu")
            self._decode = self.greedy_decode

    if self.njobs > 1:
        try:
            _ = Parallel(n_jobs=self.njobs)(
                delayed(self._decode)(i, x, ilen, y, olen)
                for i, (x, ilen, y, olen) in enumerate(self.eval_set))
            #NOTE: cannot log comet here, since it cannot serialize
        except KeyboardInterrupt:
            logger.warning("Decoding stopped")
        else:
            logger.notice("Decoding done")
            # self.comet_exp.log_other('status','decoded')
    else:
        tbar = get_bar(total=len(self.eval_set), leave=True)
        for cur_b, (xs, ilens, ys, olens) in enumerate(self.eval_set):
            self.batch_greedy_decode(xs, ilens, ys, olens)
            tbar.update(1)
        # Fix: close the progress bar and report completion, mirroring the
        # parallel branch above.
        tbar.close()
        logger.notice("Decoding done")
def train(self):
    """Step-driven (multi-task) pretraining loop.

    Runs until ``self.global_step`` reaches ``self.max_step``, drawing one
    batch per step from ``self.data_container`` and periodically
    evaluating/checkpointing.  A KeyboardInterrupt saves a checkpoint and
    marks the dashboard status as interrupted.
    """
    try:
        while self.global_step < self.max_step:
            # One progress bar per evaluation interval.
            tbar = get_bar(total=self.eval_ival, \
                desc=f"Step {self.global_step}", leave=True)
            for _ in range(self.eval_ival):
                #TODO: we can add sampling method to compare Meta and Multi fair
                # idx identifies the sampled task and is also passed as the
                # accent index to the inner train step.
                idx, (x, ilens, ys, olens) = self.data_container.get_item()[0]
                batch_size = len(ys)
                info = self._train(idx, x, ilens, ys, olens, accent_idx=idx)
                self.train_info.add(info, batch_size)
                # Clip gradients; skip the optimizer step entirely if the
                # gradient norm came back NaN.
                grad_norm = nn.utils.clip_grad_norm_(
                    self.asr_model.parameters(), GRAD_CLIP)
                if math.isnan(grad_norm):
                    logger.warning(
                        f"grad norm NaN @ step {self.global_step}")
                else:
                    self.asr_opt.step()
                # TransformerOptimizer exposes a scheduled lr worth logging.
                if isinstance(self.asr_opt, TransformerOptimizer):
                    self.log_msg(self.asr_opt.lr)
                else:
                    self.log_msg()
                self.check_evaluate()
                self.global_step += 1
                self.dashboard.step()
                # Release batch tensors before the next iteration.
                del x, ilens, ys, olens
                tbar.update(1)
                if self.global_step % self.save_ival == 0:
                    self.save_per_steps()
            self.dashboard.check()
            tbar.close()
    except KeyboardInterrupt:
        logger.warning("Pretraining stopped")
        self.save_per_steps()
        self.dashboard.set_status('pretrained(SIGINT)')
    else:
        logger.notice("Pretraining completed")
        self.dashboard.set_status('pretrained')
def evaluate(self):
    """Evaluate on every accent's dev loader, then aggregate the results.

    Per-accent running statistics are logged individually, then merged into
    one averaged dict; best-WER and best-CER checkpoints are updated from
    the aggregate.  Restores the model to train mode on exit.
    """
    self.asr_model.eval()
    self.write_tr_logs()
    # One running-average dict per pretraining task/accent.
    dev_info_ls = [
        RunningAvgDict(decay_rate=1.)
        for _ in range(self.num_pretrain)
    ]
    for idx, dev_loader in enumerate(self.data_container.dev_loaders):
        tbar = get_bar(
            total=len(dev_loader),
            desc=f"Eval on {self.accents[idx]} @ step {self.global_step}")
        with torch.no_grad():
            for cur_b, (x, ilens, ys, olens) in enumerate(dev_loader):
                # Skip utterances longer than the configured cap
                # (presumably to bound eval memory/time — TODO confirm).
                if ilens.max() > self.dev_max_ilen:
                    tbar.update(1)
                    continue
                batch_size = len(ys)
                info = self._eval(idx, x, ilens, ys, olens)
                dev_info_ls[idx].add(info, batch_size)
                if cur_b % self.log_ival == 0:
                    logger.log_info(dev_info_ls[idx], prefix='test')
                # Release batch tensors before the next iteration.
                del x, ilens, ys, olens
                tbar.update(1)
        logger.flush()
        tbar.close()
        self.dashboard.log_info(f"dev_{self.accents[idx]}", dev_info_ls[idx])
        self.write_dev_logs(f"dev_{self.accents[idx]}", dev_info_ls[idx])
    # Merge per-accent stats into a single aggregate dict.
    dev_avg_info = RunningAvgDict(decay_rate=1.0)
    for dev_info in dev_info_ls:
        dev_avg_info.add({k: float(v) for k, v in dev_info.items()})
    self.dashboard.log_info("dev", dev_avg_info)
    self.write_dev_logs("dev_avg", dev_avg_info)
    cur_cer = float(dev_avg_info['cer'])
    cur_wer = float(dev_avg_info['wer'])
    # WER drives the primary best checkpoint; CER saves stats only.
    if cur_wer < self.best_wer:
        self.best_wer = cur_wer
        self.save_best_model()
    if cur_cer < self.best_cer:
        self.best_cer = cur_cer
        self.save_best_model('cer', only_stat=True)
    self.asr_model.train()
def train(self):
    """Epoch-driven training loop over ``self.train_set``.

    Evaluates once up front, then trains until ``self.max_epoch``,
    optionally re-evaluating after every epoch.  A KeyboardInterrupt
    triggers a final evaluation and marks the run interrupted.
    """
    # Baseline evaluation before any training step.
    self.evaluate()
    try:
        if self.save_verbose:
            self.save_init()
        while self.ep < self.max_epoch:
            tbar = get_bar(total=len(self.train_set), \
                desc=f"Epoch {self.ep}", leave=True)
            for cur_b, (x, ilens, ys, olens) in enumerate(self.train_set):
                batch_size = len(ys)
                info = self._train(cur_b, x, ilens, ys, olens)
                self.train_info.add(info, batch_size)
                # Clip gradients; skip the optimizer step entirely if the
                # gradient norm came back NaN.
                grad_norm = nn.utils.clip_grad_norm_(
                    self.asr_model.parameters(), GRAD_CLIP)
                if math.isnan(grad_norm):
                    logger.warning(
                        f"grad norm NaN @ step {self.global_step}")
                else:
                    self.asr_opt.step()
                # TransformerOptimizer exposes a scheduled lr worth logging.
                if isinstance(self.asr_opt, TransformerOptimizer):
                    self.log_msg(self.asr_opt.lr)
                else:
                    self.log_msg()
                self.check_evaluate()
                self.global_step += 1
                self.dashboard.step()
                # Release batch tensors before the next iteration.
                del x, ilens, ys, olens
                tbar.update(1)
            self.ep += 1
            self.save_per_epoch()
            self.dashboard.check()
            tbar.close()
            if self.eval_every_epoch:
                self.evaluate()
    except KeyboardInterrupt:
        logger.warning("Training stopped")
        self.evaluate()
        self.dashboard.set_status('trained(SIGINT)')
    else:
        logger.notice("Training completed")
        self.dashboard.set_status('trained')
def train(self):
    """Meta-learning pretraining loop (inner-loop adapt, outer-loop update).

    Each global step shuffles the task list, and for each of the first
    ``self.meta_batch_size`` tasks: adapts on ``self.meta_k`` support
    batches (``run_task``), evaluates on one held-out batch, and
    accumulates a partial meta update; a final meta update is applied
    after all tasks in the step.
    """
    try:
        task_ids = list(range(self.num_pretrain))
        while self.global_step < self.max_step:
            # One progress bar per evaluation interval.
            tbar = get_bar(total=self.eval_ival, \
                desc=f"Step {self.global_step}", leave=True)
            for _ in range(self.eval_ival):
                shuffle(task_ids)
                #FIXME: Here split to inner-train and inner-test (should observe whether the performance drops)
                for accent_id in task_ids[:self.meta_batch_size]:
                    # inner-loop learn: adapt on k support batches.
                    tr_batches = self.data_container.get_item(accent_id, self.meta_k)
                    self.run_task(tr_batches)
                    # inner-loop test: one query batch from the same task.
                    val_batch = self.data_container.get_item(accent_id)[0]
                    # val_batch is (idx, (x, ilens, ys, olens)); ys is at [1][2].
                    batch_size = len(val_batch[1][2])
                    info = self._train(val_batch[0], *val_batch[1], accent_idx=val_batch[0])
                    grad_norm = nn.utils.clip_grad_norm_(
                        self.asr_model.parameters(), GRAD_CLIP)
                    # NOTE: unlike the plain trainer, a NaN norm is only
                    # logged here — the partial meta update still runs.
                    if math.isnan(grad_norm):
                        logger.warning(f"grad norm NaN @ step {self.global_step} on {self.accents[accent_id]}, ignore...")
                    self._partial_meta_update()
                    del val_batch
                    self.train_info.add(info, batch_size)
                # outer-loop update across the sampled tasks.
                self._final_meta_update()
                self.log_msg(self.meta_opt.lr)
                self.check_evaluate()
                self.global_step += 1
                self.dashboard.step()
                tbar.update(1)
                if self.global_step % self.save_ival == 0:
                    self.save_per_steps()
            self.dashboard.check()
            tbar.close()
    except KeyboardInterrupt:
        logger.warning("Pretraining stopped")
        self.save_per_steps()
        self.dashboard.set_status('pretrained(SIGINT)')
    else:
        logger.notice("Pretraining completed")
        self.dashboard.set_status('pretrained')
def evaluate(self):
    """Run one evaluation pass over ``self.dev_set``.

    Accumulates running metrics, publishes them to the dashboard/logs,
    refreshes the best-WER/best-CER checkpoints, feeds the dev loss to the
    LR scheduler (if any), and restores the model to train mode.
    """
    self.asr_model.eval()
    metrics = RunningAvgDict(decay_rate=1.)
    progress = get_bar(total=len(self.dev_set),
                       desc=f"Eval @step{self.global_step}", leave=True)

    with torch.no_grad():
        for batch_idx, batch in enumerate(self.dev_set):
            feats, feat_lens, labels, label_lens = batch
            # Guard clause: skip utterances exceeding the length cap.
            if feat_lens.max() > self.dev_max_ilen:
                progress.update(1)
                continue
            n_utts = len(labels)
            step_info = self._eval(batch_idx, feats, feat_lens,
                                   labels, label_lens)
            metrics.add(step_info, n_utts)
            if batch_idx % self.log_ival == 0:
                logger.log_info(metrics, prefix='test')
            del feats, feat_lens, labels, label_lens
            progress.update(1)

    logger.flush()
    progress.close()
    self.dashboard.log_info('dev', metrics)
    self.write_logs(metrics)

    # Refresh best checkpoints: WER is primary, CER saves stats only.
    current_cer, current_wer = float(metrics['cer']), float(metrics['wer'])
    if current_wer < self.best_wer:
        self.best_wer = current_wer
        self.save_best_model()
    if current_cer < self.best_cer:
        self.best_cer = current_cer
        self.save_best_model('cer', only_stat=True)

    if self.lr_scheduler is not None:
        self.lr_scheduler.step(float(metrics['loss']))
    self.asr_model.train()