def update(self, labels, preds): check_label_shapes(labels, preds) if self.is_logging: log = LogUtil.getInstance().getlogger() labelUtil = LabelUtil.getInstance() self.batch_loss = 0. for label, pred in zip(labels, preds): label = label.asnumpy() pred = pred.asnumpy() seq_length = len(pred) / int( int(self.batch_size) / int(self.num_gpu)) for i in range(int(int(self.batch_size) / int(self.num_gpu))): l = remove_blank(label[i]) p = [] for k in range(int(seq_length)): p.append( np.argmax(pred[ k * int(int(self.batch_size) / int(self.num_gpu)) + i])) p = pred_best(p) l_distance = levenshtein_distance(l, p) self.total_n_label += len(l) self.total_l_dist += l_distance this_cer = float(l_distance) / float(len(l)) if self.is_logging: log.info("label: %s " % (labelUtil.convert_num_to_word(l))) log.info( "pred : %s , cer: %f (distance: %d/ label length: %d)" % (labelUtil.convert_num_to_word(p), this_cer, l_distance, len(l))) self.num_inst += 1 self.sum_metric += this_cer if self.is_epoch_end: loss = ctc_loss(l, pred, i, int(seq_length), int(self.batch_size), int(self.num_gpu)) self.batch_loss += loss if self.is_logging: log.info("loss: %f " % loss) self.total_ctc_loss += self.batch_loss
def update(self, labels, preds): check_label_shapes(labels, preds) if self.is_logging: log = LogUtil.getInstance().getlogger() labelUtil = LabelUtil.getInstance() self.batch_loss = 0. for label, pred in zip(labels, preds): label = label.asnumpy() pred = pred.asnumpy() seq_length = len(pred) / int(int(self.batch_size) / int(self.num_gpu)) for i in range(int(int(self.batch_size) / int(self.num_gpu))): l = remove_blank(label[i]) p = [] for k in range(int(seq_length)): p.append(np.argmax(pred[k * int(int(self.batch_size) / int(self.num_gpu)) + i])) p = pred_best(p) l_distance = levenshtein_distance(l, p) self.total_n_label += len(l) self.total_l_dist += l_distance this_cer = float(l_distance) / float(len(l)) if self.is_logging: log.info("label: %s " % (labelUtil.convert_num_to_word(l))) log.info("pred : %s , cer: %f (distance: %d/ label length: %d)" % ( labelUtil.convert_num_to_word(p), this_cer, l_distance, len(l))) self.num_inst += 1 self.sum_metric += this_cer if self.is_epoch_end: loss = ctc_loss(l, pred, i, int(seq_length), int(self.batch_size), int(self.num_gpu)) self.batch_loss += loss if self.is_logging: log.info("loss: %f " % loss) self.total_ctc_loss += self.batch_loss
def do_training(args, module, data_train, data_val, begin_epoch=0): from distutils.dir_util import mkpath from log_util import LogUtil log = LogUtil.getInstance().getlogger() mkpath(os.path.dirname(get_checkpoint_path(args))) #seq_len = args.config.get('arch', 'max_t_count') batch_size = args.config.getint('common', 'batch_size') save_checkpoint_every_n_epoch = args.config.getint('common', 'save_checkpoint_every_n_epoch') save_checkpoint_every_n_batch = args.config.getint('common', 'save_checkpoint_every_n_batch') enable_logging_train_metric = args.config.getboolean('train', 'enable_logging_train_metric') enable_logging_validation_metric = args.config.getboolean('train', 'enable_logging_validation_metric') contexts = parse_contexts(args) num_gpu = len(contexts) eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_validation_metric,is_epoch_end=True) # mxboard setting loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_train_metric,is_epoch_end=False) optimizer = args.config.get('optimizer', 'optimizer') learning_rate = args.config.getfloat('train', 'learning_rate') learning_rate_annealing = args.config.getfloat('train', 'learning_rate_annealing') mode = args.config.get('common', 'mode') num_epoch = args.config.getint('train', 'num_epoch') clip_gradient = args.config.getfloat('optimizer', 'clip_gradient') weight_decay = args.config.getfloat('optimizer', 'weight_decay') save_optimizer_states = args.config.getboolean('train', 'save_optimizer_states') show_every = args.config.getint('train', 'show_every') optimizer_params_dictionary = json.loads(args.config.get('optimizer', 'optimizer_params_dictionary')) kvstore_option = args.config.get('common', 'kvstore_option') n_epoch=begin_epoch is_bucketing = args.config.getboolean('arch', 'is_bucketing') if clip_gradient == 0: clip_gradient = None if is_bucketing and mode == 'load': model_file = args.config.get('common', 'model_file') model_name = os.path.splitext(model_file)[0] model_num_epoch = int(model_name[-4:]) model_path = 'checkpoints/' + str(model_name[:-5]) symbol, data_names, label_names = module(1600) model = STTBucketingModule( sym_gen=module, default_bucket_key=data_train.default_bucket_key, context=contexts) data_train.reset() model.bind(data_shapes=data_train.provide_data, label_shapes=data_train.provide_label, for_training=True) _, arg_params, aux_params = mx.model.load_checkpoint(model_path, model_num_epoch) model.set_params(arg_params, aux_params) module = model else: module.bind(data_shapes=data_train.provide_data, label_shapes=data_train.provide_label, for_training=True) if begin_epoch == 0 and mode == 'train': module.init_params(initializer=get_initializer(args)) lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate) def reset_optimizer(force_init=False): optimizer_params = {'lr_scheduler': lr_scheduler, 'clip_gradient': clip_gradient, 'wd': weight_decay} optimizer_params.update(optimizer_params_dictionary) module.init_optimizer(kvstore=kvstore_option, optimizer=optimizer, optimizer_params=optimizer_params, force_init=force_init) if mode == "train": reset_optimizer(force_init=True) else: reset_optimizer(force_init=False) data_train.reset() data_train.is_first_epoch = True #mxboard setting mxlog_dir = args.config.get('common', 'mxboard_log_dir') summary_writer = SummaryWriter(mxlog_dir) while True: if n_epoch >= num_epoch: break loss_metric.reset() log.info('---------train---------') for nbatch, data_batch in enumerate(data_train): module.forward_backward(data_batch) module.update() # mxboard setting if (nbatch + 1) % show_every == 0: module.update_metric(loss_metric, data_batch.label) #summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), nbatch) if (nbatch+1) % save_checkpoint_every_n_batch == 0: log.info('Epoch[%d] Batch[%d] SAVE CHECKPOINT', n_epoch, nbatch) module.save_checkpoint(prefix=get_checkpoint_path(args)+"n_epoch"+str(n_epoch)+"n_batch", epoch=(int((nbatch+1)/save_checkpoint_every_n_batch)-1), save_optimizer_states=save_optimizer_states) # commented for Libri_sample data set to see only train cer log.info('---------validation---------') data_val.reset() eval_metric.reset() for nbatch, data_batch in enumerate(data_val): # when is_train = False it leads to high cer when batch_norm module.forward(data_batch, is_train=True) module.update_metric(eval_metric, data_batch.label) # mxboard setting val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value() log.info("Epoch[%d] val cer=%f (%d / %d)", n_epoch, val_cer, int(val_n_label - val_l_dist), val_n_label) curr_acc = val_cer summary_writer.add_scalar('CER validation', val_cer, n_epoch) assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric' data_train.reset() data_train.is_first_epoch = False # mxboard setting train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value() summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch) summary_writer.add_scalar('CER train', train_cer, n_epoch) # save checkpoints if n_epoch % save_checkpoint_every_n_epoch == 0: log.info('Epoch[%d] SAVE CHECKPOINT', n_epoch) module.save_checkpoint(prefix=get_checkpoint_path(args), epoch=n_epoch, save_optimizer_states=save_optimizer_states) n_epoch += 1 lr_scheduler.learning_rate=learning_rate/learning_rate_annealing log.info('FINISH')
from stt_io_iter import STTIter from label_util import LabelUtil from log_util import LogUtil import numpy as np from stt_datagenerator import DataGenerator from stt_metric import STTMetric from stt_bi_graphemes_util import generate_bi_graphemes_dictionary from stt_bucketing_module import STTBucketingModule from stt_io_bucketingiter import BucketSTTIter sys.path.insert(0, "../../python") # os.environ['MXNET_ENGINE_TYPE'] = "NaiveEngine" os.environ['MXNET_ENGINE_TYPE'] = "ThreadedEnginePerDevice" os.environ['MXNET_ENABLE_GPU_P2P'] = "0" logUtil = LogUtil.getInstance() class WHCS: width = 0 height = 0 channel = 0 stride = 0 class ConfigLogger(object): def __init__(self, log): self.__log = log def __call__(self, config): self.__log.info("Config:")
def __init__(self): self._log = LogUtil.getInstance().getlogger() self._log.debug("LabelUtil init")
def do_training(args, module, data_train, data_val, begin_epoch=0): from distutils.dir_util import mkpath from log_util import LogUtil log = LogUtil.getInstance().getlogger() mkpath(os.path.dirname(get_checkpoint_path(args))) #seq_len = args.config.get('arch', 'max_t_count') batch_size = args.config.getint('common', 'batch_size') save_checkpoint_every_n_epoch = args.config.getint( 'common', 'save_checkpoint_every_n_epoch') save_checkpoint_every_n_batch = args.config.getint( 'common', 'save_checkpoint_every_n_batch') enable_logging_train_metric = args.config.getboolean( 'train', 'enable_logging_train_metric') enable_logging_validation_metric = args.config.getboolean( 'train', 'enable_logging_validation_metric') contexts = parse_contexts(args) num_gpu = len(contexts) eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_validation_metric, is_epoch_end=True) # mxboard setting loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_train_metric, is_epoch_end=False) optimizer = args.config.get('optimizer', 'optimizer') learning_rate = args.config.getfloat('train', 'learning_rate') learning_rate_annealing = args.config.getfloat('train', 'learning_rate_annealing') mode = args.config.get('common', 'mode') num_epoch = args.config.getint('train', 'num_epoch') clip_gradient = args.config.getfloat('optimizer', 'clip_gradient') weight_decay = args.config.getfloat('optimizer', 'weight_decay') save_optimizer_states = args.config.getboolean('train', 'save_optimizer_states') show_every = args.config.getint('train', 'show_every') optimizer_params_dictionary = json.loads( args.config.get('optimizer', 'optimizer_params_dictionary')) kvstore_option = args.config.get('common', 'kvstore_option') n_epoch = begin_epoch is_bucketing = args.config.getboolean('arch', 'is_bucketing') if clip_gradient == 0: clip_gradient = None if is_bucketing and mode == 'load': model_file = args.config.get('common', 'model_file') model_name = os.path.splitext(model_file)[0] model_num_epoch = int(model_name[-4:]) model_path = 'checkpoints/' + str(model_name[:-5]) symbol, data_names, label_names = module(1600) model = STTBucketingModule( sym_gen=module, default_bucket_key=data_train.default_bucket_key, context=contexts) data_train.reset() model.bind(data_shapes=data_train.provide_data, label_shapes=data_train.provide_label, for_training=True) _, arg_params, aux_params = mx.model.load_checkpoint( model_path, model_num_epoch) model.set_params(arg_params, aux_params) module = model else: module.bind(data_shapes=data_train.provide_data, label_shapes=data_train.provide_label, for_training=True) if begin_epoch == 0 and mode == 'train': module.init_params(initializer=get_initializer(args)) lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate) def reset_optimizer(force_init=False): optimizer_params = { 'lr_scheduler': lr_scheduler, 'clip_gradient': clip_gradient, 'wd': weight_decay } optimizer_params.update(optimizer_params_dictionary) module.init_optimizer(kvstore=kvstore_option, optimizer=optimizer, optimizer_params=optimizer_params, force_init=force_init) if mode == "train": reset_optimizer(force_init=True) else: reset_optimizer(force_init=False) data_train.reset() data_train.is_first_epoch = True #mxboard setting mxlog_dir = args.config.get('common', 'mxboard_log_dir') summary_writer = SummaryWriter(mxlog_dir) while True: if n_epoch >= num_epoch: break loss_metric.reset() log.info('---------train---------') for nbatch, data_batch in enumerate(data_train): module.forward_backward(data_batch) module.update() # mxboard setting if (nbatch + 1) % show_every == 0: module.update_metric(loss_metric, data_batch.label) #summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), nbatch) if (nbatch + 1) % save_checkpoint_every_n_batch == 0: log.info('Epoch[%d] Batch[%d] SAVE CHECKPOINT', n_epoch, nbatch) module.save_checkpoint( prefix=get_checkpoint_path(args) + "n_epoch" + str(n_epoch) + "n_batch", epoch=(int( (nbatch + 1) / save_checkpoint_every_n_batch) - 1), save_optimizer_states=save_optimizer_states) # commented for Libri_sample data set to see only train cer log.info('---------validation---------') data_val.reset() eval_metric.reset() for nbatch, data_batch in enumerate(data_val): # when is_train = False it leads to high cer when batch_norm module.forward(data_batch, is_train=True) module.update_metric(eval_metric, data_batch.label) # mxboard setting val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value() log.info("Epoch[%d] val cer=%f (%d / %d)", n_epoch, val_cer, int(val_n_label - val_l_dist), val_n_label) curr_acc = val_cer summary_writer.add_scalar('CER validation', val_cer, n_epoch) assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric' data_train.reset() data_train.is_first_epoch = False # mxboard setting train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value( ) summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch) summary_writer.add_scalar('CER train', train_cer, n_epoch) # save checkpoints if n_epoch % save_checkpoint_every_n_epoch == 0: log.info('Epoch[%d] SAVE CHECKPOINT', n_epoch) module.save_checkpoint(prefix=get_checkpoint_path(args), epoch=n_epoch, save_optimizer_states=save_optimizer_states) n_epoch += 1 lr_scheduler.learning_rate = learning_rate / learning_rate_annealing log.info('FINISH')