def train(opt, train_loader, model, epoch):
    # average meters to record the training statistics
    batch_time = AverageMeter()
    data_time = AverageMeter()
    train_logger = LogCollector()

    # switch to train mode
    model.train_start()

    progbar = Progbar(train_loader.dataset.length)
    end = time.time()
    for i, train_data in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # make sure train logger is used
        model.logger = train_logger

        # Update the model
        b_size, loss = model.train_emb(*train_data)

        # print loss
        progbar.add(b_size, values=[("loss", loss)])

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # Record logs in tensorboard
        tb_logger.log_value('epoch', epoch, step=model.Eiters)
        tb_logger.log_value('step', i, step=model.Eiters)
        tb_logger.log_value('batch_time', batch_time.val, step=model.Eiters)
        tb_logger.log_value('data_time', data_time.val, step=model.Eiters)
        model.logger.tb_log(tb_logger, step=model.Eiters)
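
# Usage sketch (an assumption, not part of the original code): the module-level
# tb_logger used above is consistent with the `tensorboard_logger` package,
# which is configured once per run and then logs named scalars against a step.

import tensorboard_logger as tb_logger

tb_logger.configure('runs/experiment_1', flush_secs=5)  # hypothetical log dir
tb_logger.log_value('loss', 0.42, step=1)
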
def fit(self, X, y, batch_size=128, nb_epoch=100, verbose=1,
        validation_split=0., validation_data=None, shuffle=True,
        show_accuracy=False):
    y = standardize_y(y)

    do_validation = False
    if validation_data:
        try:
            X_val, y_val = validation_data
        except:
            raise Exception("Invalid format for validation data; "
                            "provide a tuple (X_val, y_val).")
        do_validation = True
        y_val = standardize_y(y_val)
        if verbose:
            print "Train on %d samples, validate on %d samples" % (len(y), len(y_val))
    else:
        if 0 < validation_split < 1:
            # If a validation split size is given (e.g. validation_split=0.2),
            # split X into a smaller X and X_val,
            # and y into a smaller y and y_val.
            do_validation = True
            split_at = int(len(X) * (1 - validation_split))
            (X, X_val) = (X[0:split_at], X[split_at:])
            (y, y_val) = (y[0:split_at], y[split_at:])
            if verbose:
                print "Train on %d samples, validate on %d samples" % (len(y), len(y_val))

    index_array = np.arange(len(X))
    for epoch in range(nb_epoch):
        if verbose:
            print 'Epoch', epoch
        if shuffle:
            np.random.shuffle(index_array)

        batches = make_batches(len(X), batch_size)
        progbar = Progbar(target=len(X))
        for batch_index, (batch_start, batch_end) in enumerate(batches):
            batch_ids = index_array[batch_start:batch_end]
            X_batch = X[batch_ids]
            y_batch = y[batch_ids]

            if show_accuracy:
                loss, acc = self._train_with_acc(X_batch, y_batch)
            else:
                loss = self._train(X_batch, y_batch)

            # logging
            if verbose:
                is_last_batch = (batch_index == len(batches) - 1)
                if not is_last_batch or not do_validation:
                    if show_accuracy:
                        progbar.update(batch_end, [('loss', loss), ('acc.', acc)])
                    else:
                        progbar.update(batch_end, [('loss', loss)])
                else:
                    if show_accuracy:
                        val_loss, val_acc = self.test(X_val, y_val, accuracy=True)
                        progbar.update(batch_end, [('loss', loss), ('acc.', acc),
                                                   ('val. loss', val_loss),
                                                   ('val. acc.', val_acc)])
                    else:
                        val_loss = self.test(X_val, y_val, accuracy=False)
                        progbar.update(batch_end, [('loss', loss),
                                                   ('val. loss', val_loss)])
def process(options, collection):
    rootpath = options.rootpath
    feature = options.feature
    pooling = options.pooling
    overwrite = options.overwrite

    pooling_func = get_pooling_func(pooling)
    feat_dir = os.path.join(rootpath, collection, 'FeatureData', feature)
    res_dir = os.path.join(rootpath, collection, 'FeatureData',
                           '%s_%s' % (pooling, feature))

    if os.path.exists(res_dir):
        if overwrite:
            logger.info("%s exists. overwrite", res_dir)
        else:
            logger.info("%s exists. quit", res_dir)
            return 0

    feat_file = BigFile(feat_dir)

    # group frame ids by video id, e.g. "video1_5" -> video "video1", frame 5
    video2frames = {}
    for frame_id in feat_file.names:
        video_id, frame_index = frame_id.rsplit('_', 1)
        frame_index = int(frame_index)
        video2frames.setdefault(video_id, []).append(frame_id)

    if not os.path.exists(res_dir):
        os.makedirs(res_dir)

    res_binary_file = os.path.join(res_dir, 'feature.bin')
    fw = open(res_binary_file, 'wb')
    videoset = []

    pbar = Progbar(len(video2frames))
    for video_id, frame_id_list in video2frames.iteritems():
        renamed, vectors = feat_file.read(frame_id_list)
        name2vec = dict(zip(renamed, vectors))
        frame_id_list.sort(key=lambda v: int(v.rsplit('_', 1)[-1]))

        feat_matrix = np.zeros((len(renamed), len(vectors[0])))
        for i, frame_id in enumerate(frame_id_list):
            feat_matrix[i, :] = name2vec[frame_id]

        video_vec = pooling_func(feat_matrix)
        video_vec.astype(np.float32).tofile(fw)
        videoset.append(video_id)
        pbar.add(1)
    fw.close()

    fw = open(os.path.join(res_dir, 'id.txt'), 'w')
    fw.write(' '.join(videoset))
    fw.close()

    fw = open(os.path.join(res_dir, 'shape.txt'), 'w')
    fw.write('%d %d' % (len(videoset), len(video_vec)))
    fw.close()
    logger.info("%s pooling -> %dx%d video feature file",
                pooling, len(videoset), len(video_vec))
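
# Illustrative sketch only: the project's get_pooling_func is not shown here.
# A minimal version consistent with how pooling_func is called above (a 2-D
# frames-by-dims matrix reduced to one video-level vector) could look like:

def get_pooling_func(pooling):
    # pool over the frame axis (axis 0) of a (num_frames, feat_dim) matrix
    if pooling == 'avg':
        return lambda feat_matrix: feat_matrix.mean(axis=0)
    if pooling == 'max':
        return lambda feat_matrix: feat_matrix.max(axis=0)
    raise ValueError('unknown pooling style: %s' % pooling)
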
def fit_epoch(self, train_data, batch_size=None, incl_progbar=True):
    '''Fit on training data for an epoch'''
    if incl_progbar:
        progbar = Progbar(
            target=len(train_data) * batch_size if batch_size else len(train_data))
    for (word_id_batch, tag_id_batch, deprel_id_batch), class_batch in train_data:
        loss = self.fit_batch(
            word_id_batch, tag_id_batch, deprel_id_batch, class_batch)
        if incl_progbar:
            progbar.add(word_id_batch.shape[0], [("Cross-entropy", loss)])
def process(options, collection, featnames):
    rootpath = options.rootpath
    target_featname = featnames
    featnames = featnames.split('+')

    target_feat_dir = os.path.join(rootpath, collection, 'FeatureData',
                                   target_featname)
    if os.path.exists(target_feat_dir):
        if options.overwrite:
            logger.info('%s exists! overwrite.', target_feat_dir)
        else:
            logger.info('%s exists! quit.', target_feat_dir)
            sys.exit(0)
    else:
        os.makedirs(target_feat_dir)

    target_binary_file = os.path.join(target_feat_dir, 'feature.bin')
    target_id_file = os.path.join(target_feat_dir, 'id.txt')

    feat_dim = 0
    img_ids = []
    featfiles = []
    for i, feat in enumerate(featnames):
        feat_dir = os.path.join(rootpath, collection, 'FeatureData', feat)
        featfile = BigFile(feat_dir)
        feat_dim += featfile.ndims
        if i == 0:
            img_ids = featfile.names
        else:
            assert len(img_ids) == len(featfile.names) and set(img_ids) == set(
                featfile.names), '%s not match target feature' % feat
        featfiles.append(featfile)

    # concatenate the features of each image and dump them in binary form
    # (note: open in 'wb' mode since numpy's tofile writes raw bytes)
    with open(target_binary_file, 'wb') as fw:
        progbar = Progbar(len(img_ids))
        for im in img_ids:
            target_feat_vec = []
            for feat in featfiles:
                vec = feat.read_one(im)
                target_feat_vec.extend(vec)
            vec = np.array(target_feat_vec, dtype=np.float32)
            vec.tofile(fw)
            progbar.add(1)

    with open(target_id_file, 'w') as fw:
        fw.write(' '.join(img_ids))

    with open(os.path.join(target_feat_dir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(img_ids), feat_dim))
    logger.info('%s: (%d, %d)', target_featname, len(img_ids), feat_dim)
def fit(self, X, y, batch_size=128, nb_epoch=100, verbose=1):
    y = standardize_y(y)
    for epoch in range(nb_epoch):
        if verbose:
            print 'Epoch', epoch
        nb_batch = len(X)/batch_size + 1
        progbar = Progbar(target=len(X))
        for batch_index in range(0, nb_batch):
            batch = range(batch_index*batch_size,
                          min(len(X), (batch_index+1)*batch_size))
            if not batch:
                break
            loss = self._train(X[batch], y[batch])
            if verbose:
                progbar.update(batch[-1]+1, [('loss', loss)])
def fit(self, X, y, batch_size=128, nb_epoch=100, verbose=1):
    y = standardize_y(y)
    for epoch in range(nb_epoch):
        if verbose:
            print('Epoch', epoch)
        nb_batch = len(X) // batch_size + 1
        progbar = Progbar(target=len(X))
        for batch_index in range(0, nb_batch):
            batch = range(batch_index * batch_size,
                          min(len(X), (batch_index + 1) * batch_size))
            if not batch:
                break
            loss = self._train(X[batch], y[batch])
            if verbose:
                progbar.update(batch[-1] + 1, [('loss', loss)])
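
# A minimal sketch of how the Keras-style Progbar used throughout these
# snippets behaves (assuming keras.utils.generic_utils.Progbar): update()
# moves the bar to an absolute position, add() advances it by an increment,
# and the reported values are running averages over what has been seen.

import time
from keras.utils.generic_utils import Progbar

progbar = Progbar(target=1000)
for i in range(0, 1000, 100):
    time.sleep(0.01)  # stand-in for one training step
    progbar.update(i + 100, [('loss', 1.0 / (i + 1))])
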
def predict_proba(self, X, batch_size=128, verbose=1):
    batches = make_batches(len(X), batch_size)
    if verbose:
        progbar = Progbar(target=len(X))
    for batch_index, (batch_start, batch_end) in enumerate(batches):
        X_batch = X[batch_start:batch_end]
        batch_preds = self._predict(X_batch)

        if batch_index == 0:
            shape = (len(X),) + batch_preds.shape[1:]
            preds = np.zeros(shape)
        preds[batch_start:batch_end] = batch_preds

        if verbose:
            progbar.update(batch_end)
    return preds
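
# The first batch determines the full output shape, so `preds` is preallocated
# once instead of being grown inside the loop. A hypothetical call site:
#
#   probs = model.predict_proba(X_test, batch_size=256)
#   labels = probs.argmax(axis=-1)
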
def process(options, collection, feat_name):
    overwrite = options.overwrite
    rootpath = options.rootpath

    feature_dir = os.path.join(rootpath, collection, 'feature')
    resdir = os.path.join(rootpath, collection, 'FeatureData', feat_name)

    train_csv = os.path.join(rootpath, collection, 'split', 'train.csv')
    val_csv = os.path.join(rootpath, collection, 'split', 'val.csv')
    test_csv = os.path.join(rootpath, collection, 'split', 'test.csv')

    train_val_test_set = []
    train_val_test_set.extend(map(str.strip, open(train_csv).readlines()))
    train_val_test_set.extend(map(str.strip, open(val_csv).readlines()))
    train_val_test_set.extend(map(str.strip, open(test_csv).readlines()))

    target_feat_file = os.path.join(resdir, 'id.feature.txt')
    if checkToSkip(os.path.join(resdir, 'feature.bin'), overwrite):
        sys.exit(0)
    makedirsforfile(target_feat_file)

    frame_count = []
    print 'Processing %s - %s' % (collection, feat_name)
    with open(target_feat_file, 'w') as fw_feat:
        progbar = Progbar(len(train_val_test_set))
        for d in train_val_test_set:
            feat_file = os.path.join(feature_dir, d, '%s-%s.npy' % (d, feat_name))
            feats = np.load(feat_file)
            if len(feats.shape) == 1:
                # video-level feature
                dim = feats.shape[0]
                fw_feat.write('%s %s\n' % (d, ' '.join(['%.6f' % x for x in feats])))
            elif len(feats.shape) == 2:
                # frame-level feature
                frames, dim = feats.shape
                frame_count.append(frames)
                for i in range(frames):
                    frame_id = d + '_' + str(i)
                    fw_feat.write('%s %s\n' % (
                        frame_id, ' '.join(['%.6f' % x for x in feats[i]])))
            progbar.add(1)

    text2bin(dim, [target_feat_file], resdir, 1)
    os.system('rm %s' % target_feat_file)
def fit(self, X, y, batch_size=128, nb_epoch=100, verbose=1,
        validation_split=0., shuffle=True):
    # If a validation split size is given (e.g. validation_split=0.2),
    # split X into a smaller X and X_val,
    # and y into a smaller y and y_val.
    y = standardize_y(y)

    do_validation = False
    if validation_split > 0 and validation_split < 1:
        do_validation = True
        split_at = int(len(X) * (1 - validation_split))
        (X, X_val) = (X[0:split_at], X[split_at:])
        (y, y_val) = (y[0:split_at], y[split_at:])
        if verbose:
            print "Train on %d samples, validate on %d samples" % (len(y), len(y_val))

    index_array = np.arange(len(X))
    for epoch in range(nb_epoch):
        if verbose:
            print 'Epoch', epoch
        if shuffle:
            np.random.shuffle(index_array)

        nb_batch = len(X)/batch_size + 1
        progbar = Progbar(target=len(X))
        for batch_index in range(0, nb_batch):
            batch_start = batch_index*batch_size
            batch_end = min(len(X), (batch_index+1)*batch_size)
            batch_ids = index_array[batch_start:batch_end]
            X_batch = X[batch_ids]
            y_batch = y[batch_ids]
            loss = self._train(X_batch, y_batch)

            if verbose:
                is_last_batch = (batch_index == nb_batch - 1)
                if not is_last_batch or not do_validation:
                    progbar.update(batch_end, [('loss', loss)])
                else:
                    progbar.update(batch_end, [('loss', loss),
                                               ('val. loss', self.test(X_val, y_val))])
def evaluate(self, X, y, batch_size=128, show_accuracy=False, verbose=1):
    y = standardize_y(y)
    if show_accuracy:
        tot_acc = 0.
    tot_score = 0.

    batches = make_batches(len(X), batch_size)
    progbar = Progbar(target=len(X))
    for batch_index, (batch_start, batch_end) in enumerate(batches):
        X_batch = X[batch_start:batch_end]
        y_batch = y[batch_start:batch_end]

        if show_accuracy:
            loss, acc = self._test_with_acc(X_batch, y_batch)
            tot_acc += acc
        else:
            loss = self._test(X_batch, y_batch)
        tot_score += loss

        if verbose:
            if show_accuracy:
                progbar.update(batch_end, [('loss', loss), ('acc.', acc)])
            else:
                progbar.update(batch_end, [('loss', loss)])

    if show_accuracy:
        return tot_score/len(batches), tot_acc/len(batches)
    else:
        return tot_score/len(batches)
def fit(self, X, y, batch_size=128, nb_epoch=100, verbose=1,
        validation_split=0., shuffle=True):
    # If a validation split size is given (e.g. validation_split=0.2),
    # split X into a smaller X and X_val,
    # and y into a smaller y and y_val.
    y = standardize_y(y)

    do_validation = False
    if validation_split > 0 and validation_split < 1:
        do_validation = True
        split_at = int(len(X) * (1 - validation_split))
        (X, X_val) = (X[0:split_at], X[split_at:])
        (y, y_val) = (y[0:split_at], y[split_at:])
        if verbose:
            print "Train on %d samples, validate on %d samples" % (
                len(y), len(y_val))

    index_array = np.arange(len(X))
    for epoch in range(nb_epoch):
        if verbose:
            print 'Epoch', epoch
        if shuffle:
            np.random.shuffle(index_array)

        nb_batch = int(np.ceil(len(X) / float(batch_size)))
        progbar = Progbar(target=len(X))
        for batch_index in range(0, nb_batch):
            batch_start = batch_index * batch_size
            batch_end = min(len(X), (batch_index + 1) * batch_size)
            if shuffle:
                batch_ids = index_array[batch_start:batch_end]
            else:
                batch_ids = slice(batch_start, batch_end)
            X_batch = X[batch_ids]
            y_batch = y[batch_ids]
            loss = self._train(X_batch, y_batch)

            if verbose:
                is_last_batch = (batch_index == nb_batch - 1)
                if not is_last_batch or not do_validation:
                    progbar.update(batch_end, [('loss', loss)])
                else:
                    progbar.update(
                        batch_end,
                        [('loss', loss), ('val. loss', self.test(X_val, y_val))])
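
# For reference, a make_batches consistent with how the snippets above consume
# it, yielding (start, end) index pairs that cover len(X) exactly (a sketch of
# the helper under that assumption, not copied from the original source):

import numpy as np

def make_batches(size, batch_size):
    nb_batch = int(np.ceil(size / float(batch_size)))
    return [(i * batch_size, min(size, (i + 1) * batch_size))
            for i in range(nb_batch)]
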
def train(model, criterion, criterion_st, data_loader, optimizer, epoch):
    model = model.train()
    epoch_time = 0
    avg_linear_loss = 0
    avg_mel_loss = 0
    avg_stop_loss = 0
    avg_attn_loss = 0
    print(" | > Epoch {}/{}".format(epoch, c.epochs))
    progbar = Progbar(len(data_loader.dataset) / c.batch_size)
    progbar_display = {}
    for num_iter, data in enumerate(data_loader):
        start_time = time.time()

        # setup input data
        text_input = data[0]
        text_lengths = data[1]
        linear_spec = data[2]
        mel_spec = data[3]
        mel_lengths = data[4]
        stop_target = data[5]

        current_step = num_iter + args.restore_step + \
            epoch * len(data_loader) + 1

        # setup lr
        current_lr = lr_decay(c.lr, current_step, c.warmup_steps)
        for params_group in optimizer.param_groups:
            params_group['lr'] = current_lr

        optimizer.zero_grad()

        stop_target = stop_target.view(text_input.shape[0],
                                       stop_target.size(1) // c.r, -1)
        stop_target = (stop_target.sum(2) > 0.0).float()

        # dispatch data to GPU
        if use_cuda:
            text_input = text_input.cuda()
            mel_spec = mel_spec.cuda()
            mel_lengths = mel_lengths.cuda()
            linear_spec = linear_spec.cuda()
            stop_target = stop_target.cuda()

        # create attention mask
        if c.mk > 0.0:
            N = text_input.shape[1]
            T = mel_spec.shape[1] // c.r
            M = create_attn_mask(N, T, 0.03)
            mk = mk_decay(c.mk, c.epochs, epoch)

        # forward pass
        mel_output, linear_output, alignments, stop_tokens = \
            model.forward(text_input, mel_spec)

        # loss computation
        mel_loss = criterion(mel_output, mel_spec, mel_lengths)
        linear_loss = criterion(linear_output, linear_spec, mel_lengths)
        stop_loss = criterion_st(stop_tokens, stop_target)
        if c.priority_freq:
            linear_loss = 0.5 * linear_loss \
                + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                  linear_spec[:, :, :n_priority_freq],
                                  mel_lengths)
        loss = mel_loss + linear_loss + stop_loss

        if c.mk > 0.0:
            attention_loss = criterion(alignments, M, mel_lengths)
            loss += mk * attention_loss
            avg_attn_loss += attention_loss.item()
            progbar_display['attn_loss'] = attention_loss.item()

        # backpass and check the grad norm
        loss.backward()
        grad_norm, skip_flag = check_update(model, 0.5, 100)
        if skip_flag:
            optimizer.zero_grad()
            print(" | > Iteration skipped!!")
            continue
        optimizer.step()

        step_time = time.time() - start_time
        epoch_time += step_time

        progbar_display['total_loss'] = loss.item()
        progbar_display['linear_loss'] = linear_loss.item()
        progbar_display['mel_loss'] = mel_loss.item()
        progbar_display['stop_loss'] = stop_loss.item()
        progbar_display['grad_norm'] = grad_norm.item()

        # update
        progbar.update(num_iter + 1, values=list(progbar_display.items()))
        avg_linear_loss += linear_loss.item()
        avg_mel_loss += mel_loss.item()
        avg_stop_loss += stop_loss.item()

        # Plot Training Iter Stats
        tb.add_scalar('TrainIterLoss/TotalLoss', loss.item(), current_step)
        tb.add_scalar('TrainIterLoss/LinearLoss', linear_loss.item(), current_step)
        tb.add_scalar('TrainIterLoss/MelLoss', mel_loss.item(), current_step)
        tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                      current_step)
        tb.add_scalar('Params/GradNorm', grad_norm, current_step)
        tb.add_scalar('Time/StepTime', step_time, current_step)

        if current_step % c.save_step == 0:
            if c.checkpoint:
                # save model
                save_checkpoint(model, optimizer, linear_loss.item(),
                                OUT_PATH, current_step, epoch)

            # Diagnostic visualizations
            const_spec = linear_output[0].data.cpu().numpy()
            gt_spec = linear_spec[0].data.cpu().numpy()
            const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap)
            gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap)
            tb.add_image('Visual/Reconstruction', const_spec, current_step)
            tb.add_image('Visual/GroundTruth', gt_spec, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Visual/Alignment', align_img, current_step)

            # Sample audio
            audio_signal = linear_output[0].data.cpu().numpy()
            data_loader.dataset.ap.griffin_lim_iters = 60
            audio_signal = data_loader.dataset.ap.inv_spectrogram(audio_signal.T)
            try:
                tb.add_audio('SampleAudio', audio_signal, current_step,
                             sample_rate=c.sample_rate)
            except:
                # error adding the audio sample to tensorboard; skip it
                pass

    avg_linear_loss /= (num_iter + 1)
    avg_mel_loss /= (num_iter + 1)
    avg_stop_loss /= (num_iter + 1)
    avg_total_loss = avg_mel_loss + avg_linear_loss + avg_stop_loss

    # Plot Training Epoch Stats
    tb.add_scalar('TrainEpochLoss/TotalLoss', avg_total_loss, current_step)
    tb.add_scalar('TrainEpochLoss/LinearLoss', avg_linear_loss, current_step)
    tb.add_scalar('TrainEpochLoss/StopLoss', avg_stop_loss, current_step)
    tb.add_scalar('TrainEpochLoss/MelLoss', avg_mel_loss, current_step)
    if c.mk > 0:
        avg_attn_loss /= (num_iter + 1)
        tb.add_scalar('TrainEpochLoss/AttnLoss', avg_attn_loss, current_step)
    tb.add_scalar('Time/EpochTime', epoch_time, epoch)
    epoch_time = 0
    return avg_linear_loss, current_step
def main(args):
    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(_, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signal, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(0)
    signal.signal(signal.SIGINT, signal_handler)

    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner)

    model = Tacotron(c.embedding_size,
                     c.hidden_size,
                     c.num_mels,
                     c.num_freq,
                     c.r)
    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    try:
        checkpoint = torch.load(
            os.path.join(CHECKPOINT_PATH,
                         'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
    except:
        print("\n > Starting a new training\n")

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    for epoch in range(c.epochs):
        dataloader = DataLoader(dataset, batch_size=c.batch_size,
                                shuffle=True, collate_fn=dataset.collate_fn,
                                drop_last=True, num_workers=32)
        progbar = Progbar(len(dataset) / c.batch_size)

        for i, data in enumerate(dataloader):
            text_input = data[0]
            magnitude_input = data[1]
            mel_input = data[2]

            current_step = i + args.restore_step + epoch * len(dataloader) + 1

            optimizer.zero_grad()

            try:
                mel_input = np.concatenate(
                    (np.zeros([c.batch_size, 1, c.num_mels], dtype=np.float32),
                     mel_input[:, 1:, :]), axis=1)
            except:
                raise TypeError("not same dimension")

            if use_cuda:
                text_input_var = Variable(torch.from_numpy(text_input).type(
                    torch.cuda.LongTensor), requires_grad=False).cuda()
                mel_input_var = Variable(torch.from_numpy(mel_input).type(
                    torch.cuda.FloatTensor), requires_grad=False).cuda()
                mel_spec_var = Variable(torch.from_numpy(mel_input).type(
                    torch.cuda.FloatTensor), requires_grad=False).cuda()
                linear_spec_var = Variable(
                    torch.from_numpy(magnitude_input).type(
                        torch.cuda.FloatTensor), requires_grad=False).cuda()
            else:
                text_input_var = Variable(torch.from_numpy(text_input).type(
                    torch.LongTensor), requires_grad=False)
                mel_input_var = Variable(torch.from_numpy(mel_input).type(
                    torch.FloatTensor), requires_grad=False)
                mel_spec_var = Variable(torch.from_numpy(mel_input).type(
                    torch.FloatTensor), requires_grad=False)
                linear_spec_var = Variable(
                    torch.from_numpy(magnitude_input).type(torch.FloatTensor),
                    requires_grad=False)

            mel_output, linear_output, alignments = \
                model.forward(text_input_var, mel_input_var)

            mel_loss = criterion(mel_output, mel_spec_var)
            linear_loss = torch.abs(linear_output - linear_spec_var)
            linear_loss = 0.5 * torch.mean(linear_loss) \
                + 0.5 * torch.mean(linear_loss[:, :n_priority_freq, :])
            loss = mel_loss + linear_loss
            loss = loss.cuda()

            start_time = time.time()

            loss.backward()
            nn.utils.clip_grad_norm(model.parameters(), 1.)
            optimizer.step()

            time_per_step = time.time() - start_time
            progbar.update(i, values=[('total_loss', loss.data[0]),
                                      ('linear_loss', linear_loss.data[0]),
                                      ('mel_loss', mel_loss.data[0])])

            if current_step % c.save_step == 0:
                checkpoint_path = 'checkpoint_{}.pth.tar'.format(current_step)
                checkpoint_path = os.path.join(OUT_PATH, checkpoint_path)
                save_checkpoint({'model': model.state_dict(),
                                 'optimizer': optimizer.state_dict(),
                                 'step': current_step,
                                 'total_loss': loss.data[0],
                                 'linear_loss': linear_loss.data[0],
                                 'mel_loss': mel_loss.data[0],
                                 'date': datetime.date.today().strftime("%B %d, %Y")},
                                checkpoint_path)
                print(" > Checkpoint is saved : {}".format(checkpoint_path))

            if current_step in c.decay_step:
                optimizer = adjust_learning_rate(optimizer, current_step)
def train(model, criterion, criterion_st, data_loader, optimizer, optimizer_st, epoch):
    model = model.train()
    epoch_time = 0
    avg_linear_loss = 0
    avg_mel_loss = 0
    avg_stop_loss = 0
    print(" | > Epoch {}/{}".format(epoch, c.epochs))
    progbar = Progbar(len(data_loader.dataset) / c.batch_size)
    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
    for num_iter, data in enumerate(data_loader):
        start_time = time.time()

        # setup input data
        text_input = data[0]
        text_lengths = data[1]
        linear_input = data[2]
        mel_input = data[3]
        mel_lengths = data[4]
        stop_targets = data[5]

        # set stop targets view, we predict a single stop token per r frames prediction
        stop_targets = stop_targets.view(text_input.shape[0],
                                         stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()

        current_step = num_iter + args.restore_step + \
            epoch * len(data_loader) + 1

        # setup lr
        current_lr = lr_decay(c.lr, current_step, c.warmup_steps)
        current_lr_st = lr_decay(c.lr, current_step, c.warmup_steps)
        for params_group in optimizer.param_groups:
            params_group['lr'] = current_lr
        for params_group in optimizer_st.param_groups:
            params_group['lr'] = current_lr_st

        optimizer.zero_grad()
        optimizer_st.zero_grad()

        # dispatch data to GPU
        if use_cuda:
            text_input = text_input.cuda()
            mel_input = mel_input.cuda()
            mel_lengths = mel_lengths.cuda()
            linear_input = linear_input.cuda()
            stop_targets = stop_targets.cuda()

        # forward pass
        mel_output, linear_output, alignments, stop_tokens = \
            model.forward(text_input, mel_input)

        # loss computation
        stop_loss = criterion_st(stop_tokens, stop_targets)
        mel_loss = criterion(mel_output, mel_input, mel_lengths)
        linear_loss = 0.5 * criterion(linear_output, linear_input, mel_lengths) \
            + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                              linear_input[:, :, :n_priority_freq],
                              mel_lengths)
        loss = mel_loss + linear_loss

        # backpass and check the grad norm for spec losses
        loss.backward(retain_graph=True)
        grad_norm, skip_flag = check_update(model, 0.5, 100)
        if skip_flag:
            optimizer.zero_grad()
            print(" | > Iteration skipped!!")
            continue
        optimizer.step()

        # backpass and check the grad norm for stop loss
        stop_loss.backward()
        grad_norm_st, skip_flag = check_update(model.module.decoder.stopnet, 0.5, 100)
        if skip_flag:
            optimizer_st.zero_grad()
            print(" | > Iteration skipped for stopnet!!")
            continue
        optimizer_st.step()

        step_time = time.time() - start_time
        epoch_time += step_time

        # update
        progbar.update(num_iter + 1, values=[('total_loss', loss.item()),
                                             ('linear_loss', linear_loss.item()),
                                             ('mel_loss', mel_loss.item()),
                                             ('stop_loss', stop_loss.item()),
                                             ('grad_norm', grad_norm.item()),
                                             ('grad_norm_st', grad_norm_st.item())])
        avg_linear_loss += linear_loss.item()
        avg_mel_loss += mel_loss.item()
        avg_stop_loss += stop_loss.item()

        # Plot Training Iter Stats
        tb.add_scalar('TrainIterLoss/TotalLoss', loss.item(), current_step)
        tb.add_scalar('TrainIterLoss/LinearLoss', linear_loss.item(), current_step)
        tb.add_scalar('TrainIterLoss/MelLoss', mel_loss.item(), current_step)
        tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                      current_step)
        tb.add_scalar('Params/GradNorm', grad_norm, current_step)
        tb.add_scalar('Params/GradNormSt', grad_norm_st, current_step)
        tb.add_scalar('Time/StepTime', step_time, current_step)

        if current_step % c.save_step == 0:
            if c.checkpoint:
                # save model
                save_checkpoint(model, optimizer, linear_loss.item(),
                                OUT_PATH, current_step, epoch)

            # Diagnostic visualizations
            const_spec = linear_output[0].data.cpu().numpy()
            gt_spec = linear_input[0].data.cpu().numpy()
            const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap)
            gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap)
            tb.add_image('Visual/Reconstruction', const_spec, current_step)
            tb.add_image('Visual/GroundTruth', gt_spec, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Visual/Alignment', align_img, current_step)

            # Sample audio
            audio_signal = linear_output[0].data.cpu().numpy()
            data_loader.dataset.ap.griffin_lim_iters = 60
            audio_signal = data_loader.dataset.ap.inv_spectrogram(audio_signal.T)
            try:
                tb.add_audio('SampleAudio', audio_signal, current_step,
                             sample_rate=c.sample_rate)
            except:
                # error adding the audio sample to tensorboard; skip it
                pass

    avg_linear_loss /= (num_iter + 1)
    avg_mel_loss /= (num_iter + 1)
    avg_stop_loss /= (num_iter + 1)
    avg_total_loss = avg_mel_loss + avg_linear_loss + avg_stop_loss

    # Plot Training Epoch Stats
    tb.add_scalar('TrainEpochLoss/TotalLoss', avg_total_loss, current_step)
    tb.add_scalar('TrainEpochLoss/LinearLoss', avg_linear_loss, current_step)
    tb.add_scalar('TrainEpochLoss/MelLoss', avg_mel_loss, current_step)
    tb.add_scalar('TrainEpochLoss/StopLoss', avg_stop_loss, current_step)
    tb.add_scalar('Time/EpochTime', epoch_time, epoch)
    epoch_time = 0
    return avg_linear_loss, current_step
def evaluate(model, criterion, criterion_st, data_loader, current_step):
    model = model.eval()
    epoch_time = 0
    avg_linear_loss = 0
    avg_mel_loss = 0
    avg_stop_loss = 0
    print(" | > Validation")
    progbar = Progbar(len(data_loader.dataset) / c.batch_size)
    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
    with torch.no_grad():
        for num_iter, data in enumerate(data_loader):
            start_time = time.time()

            # setup input data
            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[2]
            mel_input = data[3]
            mel_lengths = data[4]
            stop_targets = data[5]

            # set stop targets view, we predict a single stop token per r frames prediction
            stop_targets = stop_targets.view(text_input.shape[0],
                                             stop_targets.size(1) // c.r, -1)
            stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()

            # dispatch data to GPU
            if use_cuda:
                text_input = text_input.cuda()
                mel_input = mel_input.cuda()
                mel_lengths = mel_lengths.cuda()
                linear_input = linear_input.cuda()
                stop_targets = stop_targets.cuda()

            # forward pass
            mel_output, linear_output, alignments, stop_tokens = \
                model.forward(text_input, mel_input)

            # loss computation
            stop_loss = criterion_st(stop_tokens, stop_targets)
            mel_loss = criterion(mel_output, mel_input, mel_lengths)
            linear_loss = 0.5 * criterion(linear_output, linear_input, mel_lengths) \
                + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                  linear_input[:, :, :n_priority_freq],
                                  mel_lengths)
            loss = mel_loss + linear_loss + stop_loss

            step_time = time.time() - start_time
            epoch_time += step_time

            # update
            progbar.update(num_iter + 1, values=[('total_loss', loss.item()),
                                                 ('linear_loss', linear_loss.item()),
                                                 ('mel_loss', mel_loss.item()),
                                                 ('stop_loss', stop_loss.item())])
            avg_linear_loss += linear_loss.item()
            avg_mel_loss += mel_loss.item()
            avg_stop_loss += stop_loss.item()

    # Diagnostic visualizations
    idx = np.random.randint(mel_input.shape[0])
    const_spec = linear_output[idx].data.cpu().numpy()
    gt_spec = linear_input[idx].data.cpu().numpy()
    align_img = alignments[idx].data.cpu().numpy()

    const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap)
    gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap)
    align_img = plot_alignment(align_img)

    tb.add_image('ValVisual/Reconstruction', const_spec, current_step)
    tb.add_image('ValVisual/GroundTruth', gt_spec, current_step)
    tb.add_image('ValVisual/ValidationAlignment', align_img, current_step)

    # Sample audio
    audio_signal = linear_output[idx].data.cpu().numpy()
    data_loader.dataset.ap.griffin_lim_iters = 60
    audio_signal = data_loader.dataset.ap.inv_spectrogram(audio_signal.T)
    try:
        tb.add_audio('ValSampleAudio', audio_signal, current_step,
                     sample_rate=c.sample_rate)
    except:
        # error adding the audio sample to tensorboard; skip it
        pass

    # compute average losses
    avg_linear_loss /= (num_iter + 1)
    avg_mel_loss /= (num_iter + 1)
    avg_stop_loss /= (num_iter + 1)
    avg_total_loss = avg_mel_loss + avg_linear_loss + avg_stop_loss

    # Plot Learning Stats
    tb.add_scalar('ValEpochLoss/TotalLoss', avg_total_loss, current_step)
    tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss, current_step)
    tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step)
    tb.add_scalar('ValEpochLoss/StopLoss', avg_stop_loss, current_step)
    return avg_linear_loss
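
# The criterion used above takes a lengths argument, i.e. an L1 loss that
# ignores padded frames. A minimal PyTorch sketch of the idea (an assumption
# about the interface, not the repo's exact implementation):

import torch

def masked_l1(output, target, lengths):
    # mask of shape (batch, time, 1): 1.0 for frames within each sequence length
    mask = (torch.arange(target.size(1), device=lengths.device)[None, :]
            < lengths[:, None]).unsqueeze(2).float()
    # average the absolute error over valid frames and feature dims only
    return (torch.abs(output - target) * mask).sum() / (mask.sum() * target.size(2))
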
def dl_progress(count, block_size, total_size):
    global progbar
    if progbar is None:
        progbar = Progbar(total_size)
    else:
        progbar.update(count * block_size)
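
# dl_progress matches the reporthook signature of urllib's urlretrieve
# (count, block_size, total_size), so it can be passed in directly; a sketch
# with a placeholder URL (Python 2 urllib assumed, as in the snippets above):

import urllib

progbar = None  # module-level state consumed by dl_progress
urllib.urlretrieve('http://example.com/data.zip', 'data.zip', dl_progress)
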
def process(opt):
    rootpath = opt.rootpath
    collection = opt.collection
    feature = opt.feature
    stride = opt.stride
    overwrite = opt.overwrite
    pooling_style = opt.pooling_style

    feat_path = os.path.join(rootpath, collection, "FeatureData", feature)
    output_dir = os.path.join(rootpath, collection, "FeatureData",
                              '%s-' % pooling_style + feature + "-stride%s" % stride)
    feat_combined_file = os.path.join(output_dir, "id_feat.txt")
    if checkToSkip(os.path.join(output_dir, "feature.bin"), overwrite):
        sys.exit(0)
    makedirsforfile(feat_combined_file)

    print "Generating augmented frame-level features and applying pooling..."

    feat_data = BigFile(feat_path)
    video2fmnos = {}
    for frame_id in feat_data.names:
        data = frame_id.strip().split("_")
        video_id = '_'.join(data[:-1])
        fm_no = data[-1]
        video2fmnos.setdefault(video_id, []).append(int(fm_no))

    video2frames = {}
    for video_id, fmnos in video2fmnos.iteritems():
        for fm_no in sorted(fmnos):
            video2frames.setdefault(video_id, []).append(video_id + "_" + str(fm_no))

    stride = map(int, stride.strip().split('-'))
    f_auger = Frame_Level_Augmenter(stride)

    video2subvideo = {}
    fout = open(feat_combined_file, 'w')
    progbar = Progbar(len(video2frames))
    for video in video2frames:
        frame_ids = video2frames[video]

        # output the whole video-level feature
        video2subvideo.setdefault(video, []).append(video)
        renamed, feats = feat_data.read(frame_ids)
        if pooling_style == 'avg':
            feat_vec = np.array(feats).mean(axis=0)
        elif pooling_style == 'max':
            feat_vec = np.array(feats).max(axis=0)
        fout.write(video + " " + " ".join(map(str, feat_vec)) + '\n')

        # output the sub-video-level features
        counter = 0
        aug_index = f_auger.get_aug_index(len(frame_ids))  # get augmented frame list
        for sub_index in aug_index:
            sub_frames = [frame_ids[idx] for idx in sub_index]
            renamed, sub_feats = feat_data.read(sub_frames)
            if pooling_style == 'avg':
                feat_vec = np.array(sub_feats).mean(axis=0)
            elif pooling_style == 'max':
                feat_vec = np.array(sub_feats).max(axis=0)
            video2subvideo.setdefault(video, []).append(video + "_sub%d" % counter)
            fout.write(video + "_sub%d" % counter + " " +
                       " ".join(map(str, feat_vec)) + '\n')
            counter += 1
        progbar.add(1)
    fout.close()

    f = open(os.path.join(output_dir, "video2subvideo.txt"), 'w')
    f.write(str(video2subvideo))
    f.close()

    text2bin(len(feat_vec), [feat_combined_file], output_dir, 1)
    os.system('rm %s' % feat_combined_file)
def process(options, collection):
    rootpath = options.rootpath
    oversample = options.oversample
    model_prefix = os.path.join(rootpath, options.model_prefix)

    sub_mean = model_prefix.find('resnext-101_rbps13k') >= 0
    logger.info('subtract mean? %d', sub_mean)

    layer = 'flatten0_output'
    # changing the batch size yields slightly different feature vectors,
    # so stick to a batch size of 1.
    batch_size = 1
    feat_name = get_feat_name(model_prefix, layer, oversample)
    feat_dir = os.path.join(rootpath, collection, 'FeatureData', feat_name)
    id_file = os.path.join(feat_dir, 'id.txt')
    feat_file = os.path.join(feat_dir, 'id.feature.txt')

    for x in [id_file, feat_file]:
        if os.path.exists(x):
            if not options.overwrite:
                logger.info('%s exists. skip', x)
                return 0
            else:
                logger.info('%s exists. overwrite', x)

    id_path_file = os.path.join(rootpath, collection, 'id.imagepath.txt')
    data = map(str.strip, open(id_path_file).readlines())
    img_ids = [x.split()[0] for x in data]
    filenames = [x.split()[1] for x in data]

    fe_mod = get_feat_extractor(model_prefix=model_prefix, gpuid=options.gpu,
                                oversample=oversample)
    if fe_mod is None:
        return 0

    if not os.path.exists(feat_dir):
        os.makedirs(feat_dir)

    fails_id_path = []
    fw = open(feat_file, 'w')

    im2path = zip(img_ids, filenames)
    success = 0
    fail = 0
    start_time = time.time()
    logger.info('%d images, %d done, %d to do', len(img_ids), 0, len(img_ids))
    progbar = Progbar(len(im2path))
    for i, (imgid, impath) in enumerate(im2path):
        try:
            imid, features = extract_mxnet_feat(fe_mod, imgid, impath,
                                                sub_mean, oversample)
            fw.write('%s %s\n' % (imid, ' '.join(['%g' % x for x in features])))
            success += 1
        except Exception as e:
            fail += 1
            logger.error('failed to process %s', impath)
            logger.info('%d success, %d fail', success, fail)
            fails_id_path.append((imgid, impath))
        finally:
            progbar.add(1)

    logger.info('%d success, %d fail', success, fail)
    elapsed_time = time.time() - start_time
    logger.info('total running time %s',
                time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
    fw.close()

    if len(fails_id_path) > 0:
        fail_fw = open(os.path.join(rootpath, collection, 'feature.fails.txt'), 'w')
        for (imgid, impath) in fails_id_path:
            fail_fw.write('%s %s\n' % (imgid, impath))
        fail_fw.close()
def train(model, criterion, data_loader, optimizer, epoch):
    model = model.train()
    epoch_time = 0
    avg_linear_loss = 0
    avg_mel_loss = 0
    print(" | > Epoch {}/{}".format(epoch, c.epochs))
    progbar = Progbar(len(data_loader.dataset) / c.batch_size)
    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
    for num_iter, data in enumerate(data_loader):
        start_time = time.time()

        # setup input data
        text_input = data[0]
        text_lengths = data[1]
        linear_input = data[2]
        mel_input = data[3]
        mel_lengths = data[4]

        current_step = num_iter + args.restore_step + \
            epoch * len(data_loader) + 1

        # setup lr
        current_lr = lr_decay(c.lr, current_step, c.warmup_steps)
        for params_group in optimizer.param_groups:
            params_group['lr'] = current_lr

        optimizer.zero_grad()

        # convert inputs to variables
        text_input_var = Variable(text_input)
        mel_spec_var = Variable(mel_input)
        mel_lengths_var = Variable(mel_lengths)
        linear_spec_var = Variable(linear_input, volatile=True)

        # dispatch data to GPU
        if use_cuda:
            text_input_var = text_input_var.cuda()
            mel_spec_var = mel_spec_var.cuda()
            mel_lengths_var = mel_lengths_var.cuda()
            linear_spec_var = linear_spec_var.cuda()

        # forward pass
        mel_output, linear_output, alignments = \
            model.forward(text_input_var, mel_spec_var)

        # loss computation
        mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var)
        linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \
            + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                              linear_spec_var[:, :, :n_priority_freq],
                              mel_lengths_var)
        loss = mel_loss + linear_loss

        # backpass and check the grad norm
        loss.backward()
        grad_norm, skip_flag = check_update(model, 0.5, 100)
        if skip_flag:
            optimizer.zero_grad()
            print(" | > Iteration skipped!!")
            continue
        optimizer.step()

        step_time = time.time() - start_time
        epoch_time += step_time

        # update
        progbar.update(num_iter + 1, values=[('total_loss', loss.data[0]),
                                             ('linear_loss', linear_loss.data[0]),
                                             ('mel_loss', mel_loss.data[0]),
                                             ('grad_norm', grad_norm)])
        avg_linear_loss += linear_loss.data[0]
        avg_mel_loss += mel_loss.data[0]

        # Plot Training Iter Stats
        tb.add_scalar('TrainIterLoss/TotalLoss', loss.data[0], current_step)
        tb.add_scalar('TrainIterLoss/LinearLoss', linear_loss.data[0], current_step)
        tb.add_scalar('TrainIterLoss/MelLoss', mel_loss.data[0], current_step)
        tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                      current_step)
        tb.add_scalar('Params/GradNorm', grad_norm, current_step)
        tb.add_scalar('Time/StepTime', step_time, current_step)

        if current_step % c.save_step == 0:
            if c.checkpoint:
                # save model
                save_checkpoint(model, optimizer, linear_loss.data[0],
                                OUT_PATH, current_step, epoch)

            # Diagnostic visualizations
            const_spec = linear_output[0].data.cpu().numpy()
            gt_spec = linear_spec_var[0].data.cpu().numpy()
            const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap)
            gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap)
            tb.add_image('Visual/Reconstruction', const_spec, current_step)
            tb.add_image('Visual/GroundTruth', gt_spec, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Visual/Alignment', align_img, current_step)

            # Sample audio
            audio_signal = linear_output[0].data.cpu().numpy()
            data_loader.dataset.ap.griffin_lim_iters = 60
            audio_signal = data_loader.dataset.ap.inv_spectrogram(audio_signal.T)
            try:
                tb.add_audio('SampleAudio', audio_signal, current_step,
                             sample_rate=c.sample_rate)
            except:
                # error adding the audio sample to tensorboard; skip it
                pass

    avg_linear_loss /= (num_iter + 1)
    avg_mel_loss /= (num_iter + 1)
    avg_total_loss = avg_mel_loss + avg_linear_loss

    # Plot Training Epoch Stats
    tb.add_scalar('TrainEpochLoss/TotalLoss', avg_total_loss, current_step)
    tb.add_scalar('TrainEpochLoss/LinearLoss', avg_linear_loss, current_step)
    tb.add_scalar('TrainEpochLoss/MelLoss', avg_mel_loss, current_step)
    tb.add_scalar('Time/EpochTime', epoch_time, epoch)
    epoch_time = 0
    return avg_linear_loss, current_step
def main(args):
    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(_, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # save config to a tmp place to be loaded by subsequent modules.
    file_name = str(os.getpid())
    tmp_path = os.path.join("/tmp/", file_name + '_tts')
    pickle.dump(c, open(tmp_path, "wb"))

    # setup tensorboard
    LOG_DIR = OUT_PATH
    tb = SummaryWriter(LOG_DIR)

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signal, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(1)
    signal.signal(signal.SIGINT, signal_handler)

    # Setup the dataset
    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner,
                              c.num_mels,
                              c.min_level_db,
                              c.frame_shift_ms,
                              c.frame_length_ms,
                              c.preemphasis,
                              c.ref_level_db,
                              c.num_freq,
                              c.power)
    dataloader = DataLoader(dataset, batch_size=c.batch_size,
                            shuffle=True, collate_fn=dataset.collate_fn,
                            drop_last=True, num_workers=c.num_loader_workers)

    # setup the model
    model = Tacotron(c.embedding_size,
                     c.hidden_size,
                     c.num_mels,
                     c.num_freq,
                     c.r)

    # plot model on tensorboard
    dummy_input = dataset.get_dummy_data()
    ## TODO: onnx does not support RNN fully yet
    # model_proto_path = os.path.join(OUT_PATH, "model.proto")
    # onnx.export(model, dummy_input, model_proto_path, verbose=True)
    # tb.add_graph_onnx(model_proto_path)

    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    if args.restore_step:
        checkpoint = torch.load(os.path.join(
            args.restore_path, 'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
        start_epoch = checkpoint['step'] // len(dataloader)
        best_loss = checkpoint['linear_loss']
    else:
        start_epoch = 0
        print("\n > Starting a new training")

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    # lr_scheduler = ReduceLROnPlateau(optimizer, factor=c.lr_decay,
    #                                  patience=c.lr_patience, verbose=True)
    epoch_time = 0
    best_loss = float('inf')
    for epoch in range(0, c.epochs):
        print("\n | > Epoch {}/{}".format(epoch, c.epochs))
        progbar = Progbar(len(dataset) / c.batch_size)
        for num_iter, data in enumerate(dataloader):
            start_time = time.time()

            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[2]
            mel_input = data[3]

            current_step = num_iter + args.restore_step + \
                epoch * len(dataloader) + 1

            # setup lr
            current_lr = lr_decay(c.lr, current_step)
            for params_group in optimizer.param_groups:
                params_group['lr'] = current_lr

            optimizer.zero_grad()

            # Add a single frame of zeros to Mel Specs for better end detection
            # try:
            #     mel_input = np.concatenate((np.zeros(
            #         [c.batch_size, 1, c.num_mels], dtype=np.float32),
            #         mel_input[:, 1:, :]), axis=1)
            # except:
            #     raise TypeError("not same dimension")

            # convert inputs to variables
            text_input_var = Variable(text_input)
            mel_spec_var = Variable(mel_input)
            linear_spec_var = Variable(linear_input, volatile=True)

            # sort sequences by length.
            # TODO: might be unnecessary
            sorted_lengths, indices = torch.sort(
                text_lengths.view(-1), dim=0, descending=True)
            sorted_lengths = sorted_lengths.long().numpy()

            text_input_var = text_input_var[indices]
            mel_spec_var = mel_spec_var[indices]
            linear_spec_var = linear_spec_var[indices]

            if use_cuda:
                text_input_var = text_input_var.cuda()
                mel_spec_var = mel_spec_var.cuda()
                linear_spec_var = linear_spec_var.cuda()

            mel_output, linear_output, alignments = \
                model.forward(text_input_var, mel_spec_var,
                              input_lengths=torch.autograd.Variable(
                                  torch.cuda.LongTensor(sorted_lengths)))

            mel_loss = criterion(mel_output, mel_spec_var)
            # linear_loss = torch.abs(linear_output - linear_spec_var)
            # linear_loss = 0.5 * torch.mean(linear_loss) + 0.5 * \
            #     torch.mean(linear_loss[:, :n_priority_freq, :])
            linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
                + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                  linear_spec_var[:, :, :n_priority_freq])
            loss = mel_loss + linear_loss
            # loss = loss.cuda()

            loss.backward()
            grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)  ## TODO: maybe no need
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            progbar.update(num_iter + 1, values=[('total_loss', loss.data[0]),
                                                 ('linear_loss', linear_loss.data[0]),
                                                 ('mel_loss', mel_loss.data[0]),
                                                 ('grad_norm', grad_norm)])

            # Plot Learning Stats
            tb.add_scalar('Loss/TotalLoss', loss.data[0], current_step)
            tb.add_scalar('Loss/LinearLoss', linear_loss.data[0], current_step)
            tb.add_scalar('Loss/MelLoss', mel_loss.data[0], current_step)
            tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                          current_step)
            tb.add_scalar('Params/GradNorm', grad_norm, current_step)
            tb.add_scalar('Time/StepTime', step_time, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Attn/Alignment', align_img, current_step)

            if current_step % c.save_step == 0:
                if c.checkpoint:
                    # save model
                    save_checkpoint(model, optimizer, linear_loss.data[0],
                                    OUT_PATH, current_step, epoch)

                # Diagnostic visualizations
                const_spec = linear_output[0].data.cpu().numpy()
                gt_spec = linear_spec_var[0].data.cpu().numpy()
                const_spec = plot_spectrogram(const_spec, dataset.ap)
                gt_spec = plot_spectrogram(gt_spec, dataset.ap)
                tb.add_image('Spec/Reconstruction', const_spec, current_step)
                tb.add_image('Spec/GroundTruth', gt_spec, current_step)

                align_img = alignments[0].data.cpu().numpy()
                align_img = plot_alignment(align_img)
                tb.add_image('Attn/Alignment', align_img, current_step)

                # Sample audio
                audio_signal = linear_output[0].data.cpu().numpy()
                dataset.ap.griffin_lim_iters = 60
                audio_signal = dataset.ap.inv_spectrogram(audio_signal.T)
                try:
                    tb.add_audio('SampleAudio', audio_signal, current_step,
                                 sample_rate=c.sample_rate)
                except:
                    print("\n > Error at audio signal on TB!!")
                    print(audio_signal.max())
                    print(audio_signal.min())

        # average loss after the epoch
        avg_epoch_loss = np.mean(
            progbar.sum_values['linear_loss'][0] /
            max(1, progbar.sum_values['linear_loss'][1]))
        best_loss = save_best_model(model, optimizer, avg_epoch_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
        # lr_scheduler.step(loss.data[0])
        tb.add_scalar('Time/EpochTime', epoch_time, epoch)
        epoch_time = 0
def evaluate(model, criterion, data_loader, current_step):
    model = model.eval()
    epoch_time = 0
    avg_linear_loss = 0
    avg_mel_loss = 0
    print(" | > Validation")
    progbar = Progbar(len(data_loader.dataset) / c.batch_size)
    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
    for num_iter, data in enumerate(data_loader):
        start_time = time.time()

        # setup input data
        text_input = data[0]
        text_lengths = data[1]
        linear_input = data[2]
        mel_input = data[3]
        mel_lengths = data[4]

        # convert inputs to variables
        text_input_var = Variable(text_input)
        mel_spec_var = Variable(mel_input)
        mel_lengths_var = Variable(mel_lengths)
        linear_spec_var = Variable(linear_input, volatile=True)

        # dispatch data to GPU
        if use_cuda:
            text_input_var = text_input_var.cuda()
            mel_spec_var = mel_spec_var.cuda()
            mel_lengths_var = mel_lengths_var.cuda()
            linear_spec_var = linear_spec_var.cuda()

        # forward pass
        mel_output, linear_output, alignments = \
            model.forward(text_input_var, mel_spec_var)

        # loss computation
        mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var)
        linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \
            + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                              linear_spec_var[:, :, :n_priority_freq],
                              mel_lengths_var)
        loss = mel_loss + linear_loss

        step_time = time.time() - start_time
        epoch_time += step_time

        # update
        progbar.update(num_iter + 1, values=[('total_loss', loss.data[0]),
                                             ('linear_loss', linear_loss.data[0]),
                                             ('mel_loss', mel_loss.data[0])])
        avg_linear_loss += linear_loss.data[0]
        avg_mel_loss += mel_loss.data[0]

    # Diagnostic visualizations
    idx = np.random.randint(mel_input.shape[0])
    const_spec = linear_output[idx].data.cpu().numpy()
    gt_spec = linear_spec_var[idx].data.cpu().numpy()
    align_img = alignments[idx].data.cpu().numpy()

    const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap)
    gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap)
    align_img = plot_alignment(align_img)

    tb.add_image('ValVisual/Reconstruction', const_spec, current_step)
    tb.add_image('ValVisual/GroundTruth', gt_spec, current_step)
    tb.add_image('ValVisual/ValidationAlignment', align_img, current_step)

    # Sample audio
    audio_signal = linear_output[idx].data.cpu().numpy()
    data_loader.dataset.ap.griffin_lim_iters = 60
    audio_signal = data_loader.dataset.ap.inv_spectrogram(audio_signal.T)
    try:
        tb.add_audio('ValSampleAudio', audio_signal, current_step,
                     sample_rate=c.sample_rate)
    except:
        # error adding the audio sample to tensorboard; skip it
        pass

    # compute average losses
    avg_linear_loss /= (num_iter + 1)
    avg_mel_loss /= (num_iter + 1)
    avg_total_loss = avg_mel_loss + avg_linear_loss

    # Plot Learning Stats
    tb.add_scalar('ValEpochLoss/TotalLoss', avg_total_loss, current_step)
    tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss, current_step)
    tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step)
    return avg_linear_loss