def run(self):
    logger.debug('loader %d start' % self.thread_id)

    while True:
        items = list()

        # fill up to one batch worth of (input, label) pairs from the dataset
        for _ in range(self.batch_size):
            if self.index >= self.dataset_count:
                break

            input, label = self.dataset.get_item(self.index)

            if input is not None:
                items.append((input, label))

            self.index += 1

        if len(items) == 0:
            # an empty batch signals the consumer that this loader is finished
            batch = self.create_empty_batch()
            self.queue.put(batch)
            break

        random.shuffle(items)

        batch = self.collate_fn(items)
        self.queue.put(batch)

    logger.debug('loader %d stop' % self.thread_id)
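# A minimal sketch (not the project's actual launcher) of how several loader threads
# running run() above can feed one training loop through a shared queue. The loader
# class name, its constructor arguments, `trainset`, and `collate_fn` are assumptions
# made for illustration only.
def _example_launch_loaders(loader_cls, trainset, collate_fn, worker_num=4, batch_size=32):
    import queue

    train_queue = queue.Queue(maxsize=worker_num * 2)

    # each worker pulls items from its share of the dataset and pushes collated batches;
    # when it runs out of data it pushes one empty batch as a stop signal (see run() above)
    loaders = [
        loader_cls(dataset=trainset, queue=train_queue, batch_size=batch_size,
                   thread_id=idx, collate_fn=collate_fn)
        for idx in range(worker_num)
    ]
    for loader in loaders:
        loader.start()

    # the trainer consuming train_queue counts `worker_num` empty batches before it stops
    return train_queue, loaders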
def supervised_train(model, config, epoch, total_time_step, queue, criterion, optimizer,
                     device, train_begin, worker_num, print_every=10, teacher_forcing_ratio=0.90):
    r"""
    Args:
        train_begin: train begin time
        total_time_step: total number of time steps in one epoch
        epoch (int): present epoch
        config (Config): configuration
        model (torch.nn.Module): model to be trained
        optimizer (torch.optim): optimizer for training
        teacher_forcing_ratio (float): probability that teacher forcing will be used (default: 0.90)
        print_every (int): number of time steps between log outputs
        queue (queue.Queue): queue for threading
        criterion (torch.nn): one of PyTorch's loss functions.
            Refer to http://pytorch.org/docs/master/nn.html#loss-functions for a list of them.
        device (torch.device): device used ('cuda' or 'cpu')
        worker_num (int): the number of cpu cores used

    Returns: loss, cer
        - **loss** (float): loss of present epoch
        - **cer** (float): character error rate
    """
    epoch_loss_total = 0.
    print_loss_total = 0.
    total_num = 0
    total_dist = 0
    total_length = 0
    time_step = 0
    decay_speed = 1.0

    RAMPUP_POWER = 3
    RAMPUP_PERIOD = 3000
    EXP_DECAY_PERIOD = total_time_step * 3

    model.train()
    begin = epoch_begin = time.time()

    while True:
        # LR warm-up
        if config.use_multistep_lr and epoch == 0 and time_step < RAMPUP_PERIOD:
            set_lr(optimizer, lr=config.high_plateau_lr * ((time_step + 1) / RAMPUP_PERIOD) ** RAMPUP_POWER)

        # LR exponential decay
        if config.use_multistep_lr and epoch in (1, 2, 3):
            decay_rate = config.low_plateau_lr / config.high_plateau_lr
            decay_speed *= decay_rate ** (1 / EXP_DECAY_PERIOD)
            set_lr(optimizer, config.high_plateau_lr * decay_speed)

        feats, scripts, feat_lens, target_lens = queue.get()

        if feats.shape[0] == 0:
            # an empty feats batch means one loader has finished
            worker_num -= 1
            logger.debug('left train_loader: %d' % worker_num)

            if worker_num == 0:
                break
            else:
                continue

        inputs = feats.to(device)
        scripts = scripts.to(device)
        targets = scripts[:, 1:]

        model.module.flatten_parameters()
        y_hat, logit = model(inputs, scripts, teacher_forcing_ratio=teacher_forcing_ratio)

        loss = criterion(logit.contiguous().view(-1, logit.size(-1)), targets.contiguous().view(-1))
        epoch_loss_total += loss.item()
        print_loss_total += loss.item()
        total_num += sum(feat_lens)

        dist, length = get_distance(targets, y_hat, id2char, EOS_TOKEN)
        total_dist += dist
        total_length += length

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        time_step += 1
        torch.cuda.empty_cache()

        if time_step % print_every == 0:
            current = time.time()
            elapsed = current - begin
            epoch_elapsed = (current - epoch_begin) / 60.0
            train_elapsed = (current - train_begin) / 3600.0

            logger.info('timestep: {:4d}/{:4d}, loss: {:.4f}, cer: {:.2f}, elapsed: {:.2f}s {:.2f}m {:.2f}h'.format(
                time_step, total_time_step, print_loss_total / print_every,
                total_dist / total_length, elapsed, epoch_elapsed, train_elapsed)
            )
            print_loss_total = 0
            begin = time.time()

        if time_step % 1000 == 0:
            save_step_result(train_step_result, epoch_loss_total / total_num, total_dist / total_length)

        if time_step % 10000 == 0:
            torch.save(model, "./data/weight_file/epoch_%s_step_%s.pt" % (str(epoch), str(time_step)))

    logger.info('train() completed')
    return epoch_loss_total / total_num, total_dist / total_length
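# The warm-up and decay branches above delegate to set_lr(). A minimal sketch of what
# such a helper typically does, assuming it simply overwrites the learning rate of every
# parameter group in place (an assumption; the project's actual implementation may differ):
def set_lr(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr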
def supervised_train(model, hparams, epoch, total_time_step, queue, criterion, optimizer,
                     device, train_begin, worker_num, print_time_step=10, teacher_forcing_ratio=0.90):
    """
    Args:
        model (torch.nn.Module): model to be trained
        optimizer (torch.optim): optimizer for training
        teacher_forcing_ratio (float): probability that teacher forcing will be used (default: 0.90)
        print_time_step (int): number of time steps between log outputs
        queue (queue.Queue): queue for threading
        criterion (torch.nn): one of PyTorch's loss functions.
            Refer to http://pytorch.org/docs/master/nn.html#loss-functions for a list of them.
        device (torch.device): device used ('cuda' or 'cpu')
        worker_num (int): the number of cpu cores used

    Returns: loss, cer
        - **loss** (float): loss of present epoch
        - **cer** (float): character error rate
    """
    total_loss = 0.
    total_num = 0
    total_dist = 0
    total_length = 0
    total_sent_num = 0
    time_step = 0

    model.train()
    begin = epoch_begin = time.time()

    while True:
        if hparams.use_multistep_lr and epoch == 0 and time_step < 1000:
            ramp_up(optimizer, time_step, hparams)

        if hparams.use_multistep_lr and epoch == 1:
            exp_decay(optimizer, total_time_step, hparams)

        feats, targets, feat_lengths, label_lengths = queue.get()

        if feats.shape[0] == 0:
            # an empty feats batch means one loader has finished
            worker_num -= 1
            logger.debug('left train_loader: %d' % worker_num)

            if worker_num == 0:
                break
            else:
                continue

        optimizer.zero_grad()

        feats = feats.to(device)
        targets = targets.to(device)
        target = targets[:, 1:]

        model.module.flatten_parameters()
        y_hat, logit = model(feats, targets, teacher_forcing_ratio=teacher_forcing_ratio)

        loss = criterion(logit.contiguous().view(-1, logit.size(-1)), target.contiguous().view(-1))
        total_loss += loss.item()
        total_num += sum(feat_lengths)

        dist, length = get_distance(target, y_hat, id2char, EOS_TOKEN)
        total_dist += dist
        total_length += length
        total_sent_num += target.size(0)

        loss.backward()
        optimizer.step()

        if time_step % print_time_step == 0:
            current = time.time()
            elapsed = current - begin
            epoch_elapsed = (current - epoch_begin) / 60.0
            train_elapsed = (current - train_begin) / 3600.0

            logger.info('timestep: {:4d}/{:4d}, loss: {:.4f}, cer: {:.2f}, elapsed: {:.2f}s {:.2f}m {:.2f}h'.format(
                time_step, total_time_step, total_loss / total_num,
                total_dist / total_length, elapsed, epoch_elapsed, train_elapsed))
            begin = time.time()

        if time_step % 1000 == 0:
            save_step_result(train_step_result, total_loss / total_num, total_dist / total_length)

        if time_step % 10000 == 0:
            torch.save(model, "model.pt")
            torch.save(model, "./data/weight_file/epoch_%s_step_%s.pt" % (str(epoch), str(time_step)))

        time_step += 1
        supervised_train.cumulative_batch_count += 1
        torch.cuda.empty_cache()  # free GPU memory; if you have enough GPU memory, delete this line

    loss = total_loss / total_num
    cer = total_dist / total_length

    logger.info('train() completed')
    return loss, cer
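# This version calls ramp_up() and exp_decay() instead of the inline scheduling arithmetic
# used in the version above. The sketch below is an assumption patterned on that arithmetic:
# polynomial warm-up to hparams.high_plateau_lr, then per-step exponential decay towards
# hparams.low_plateau_lr. RAMPUP_PERIOD mirrors the `time_step < 1000` guard above; none
# of this is confirmed by the code in this file.
RAMPUP_PERIOD = 1000
RAMPUP_POWER = 3

def ramp_up(optimizer, time_step, hparams):
    # polynomial warm-up towards the high-plateau learning rate
    lr = hparams.high_plateau_lr * ((time_step + 1) / RAMPUP_PERIOD) ** RAMPUP_POWER
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def exp_decay(optimizer, total_time_step, hparams):
    # one multiplicative decay step; after roughly total_time_step calls the learning rate
    # has moved from high_plateau_lr down to low_plateau_lr
    decay_rate = (hparams.low_plateau_lr / hparams.high_plateau_lr) ** (1 / total_time_step)
    for param_group in optimizer.param_groups:
        param_group['lr'] *= decay_rate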
def supervised_train(model, queue, perplexity, optimizer, device, print_every, epoch,
                     teacher_forcing_ratio, worker_num, total_time_step, train_begin):
    print_loss_total = 0  # Reset every print_every
    epoch_loss_total = 0  # Reset every epoch
    total_num = 0
    time_step = 0

    model.train()
    begin = epoch_begin = time.time()

    while True:
        loss = perplexity
        inputs, targets, input_lens, target_lens = queue.get()

        if inputs.shape[0] == 0:
            # an empty feats batch means one loader has finished
            worker_num -= 1
            logger.debug('left train_loader: %d' % worker_num)

            if worker_num == 0:
                break
            else:
                continue

        inputs = inputs.to(device)
        targets = targets.to(device)

        model.module.flatten_parameters()
        outputs = model(inputs, teacher_forcing_ratio=teacher_forcing_ratio)

        # Get loss
        loss.reset()
        for step, step_output in enumerate(outputs):
            batch_size = targets.size(0)
            loss.eval_batch(step_output.contiguous().view(batch_size, -1), targets[:, step])

        # Backpropagation
        model.zero_grad()
        loss.backward()
        optimizer.step()

        loss = loss.get_loss()
        epoch_loss_total += loss
        print_loss_total += loss
        total_num += sum(input_lens)
        time_step += 1
        torch.cuda.empty_cache()

        if time_step % print_every == 0:
            current = time.time()
            elapsed = current - begin
            epoch_elapsed = (current - epoch_begin) / 60.0
            train_elapsed = (current - train_begin) / 3600.0

            logger.info('timestep: {:4d}/{:4d}, perplexity: {:.4f}, elapsed: {:.2f}s {:.2f}m {:.2f}h'.format(
                time_step, total_time_step, print_loss_total / print_every,
                elapsed, epoch_elapsed, train_elapsed))
            print_loss_total = 0
            begin = time.time()

        if time_step % 50000 == 0:
            torch.save(model, "./data/epoch%s_%s.pt" % (str(epoch), str(time_step)))

    logger.info('train() completed')
    return epoch_loss_total / total_num
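# The trainer above drives its `perplexity` criterion through reset() / eval_batch() /
# backward() / get_loss(). A minimal sketch of an object honoring that interface, assuming
# the decoder emits per-step log-probabilities and that index 0 is padding (both
# assumptions; the project's actual criterion may differ):
import math

import torch.nn as nn

class PerplexityLoss:
    """Hypothetical stand-in for the `perplexity` criterion used above."""

    def __init__(self, ignore_index=0):
        self.ignore_index = ignore_index
        self.criterion = nn.NLLLoss(ignore_index=ignore_index, reduction='sum')
        self.acc_loss = 0.
        self.norm_term = 0

    def reset(self):
        self.acc_loss = 0.
        self.norm_term = 0

    def eval_batch(self, step_log_probs, step_targets):
        # step_log_probs: (batch, vocab) log-probabilities for one decoder time step
        self.acc_loss = self.acc_loss + self.criterion(step_log_probs, step_targets)
        self.norm_term += int((step_targets != self.ignore_index).sum())

    def backward(self):
        self.acc_loss.backward()

    def get_loss(self):
        # perplexity = exp(mean negative log-likelihood over non-padding tokens)
        return math.exp(self.acc_loss.item() / max(self.norm_term, 1))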