def test_FFTNetModelStep(self):
    print(" ---- Test FFTNetModel step forward ----")
    net = FFTNetModel(hid_channels=256, out_channels=256, n_layers=11,
                      cond_channels=80)
    time_start = time.time()
    for i in range(1024):
        x = torch.rand(1, 1, 1)
        cx = torch.rand(1, 80, 1)
        out = net.forward_step(x, cx)
    time_avg = (time.time() - time_start) / 1024
    print("> Avg time per step inference on CPU: {}".format(time_avg))
    # after 1024 steps only the first buffer queue has been filled
    assert abs(net.layers[0].buffer.queue1.sum().item()) > 0
    assert abs(net.layers[0].buffer.queue2.sum().item()) == 0
    # on GPU
    net = FFTNetModel(hid_channels=256, out_channels=256, n_layers=11,
                      cond_channels=80)
    net.cuda()
    time_start = time.time()
    for i in range(1024):
        x = torch.rand(1, 1, 1)
        cx = torch.rand(1, 80, 1)
        out = net.forward_step(x.cuda(), cx.cuda())
    time_avg = (time.time() - time_start) / 1024
    print("> Avg time per step inference on GPU: {}".format(time_avg))
    assert abs(net.layers[0].buffer.queue1.sum().item()) > 0
    assert abs(net.layers[0].buffer.queue2.sum().item()) == 0
    # check the second queue
    net = FFTNetModel(hid_channels=256, out_channels=256, n_layers=11,
                      cond_channels=80)
    time_start = time.time()
    for i in range(1025):
        x = torch.rand(1, 1, 1)
        cx = torch.rand(1, 80, 1)
        out = net.forward_step(x, cx)
    # after 1025 steps only the newest slot of the second queue is populated
    assert abs(net.layers[0].buffer.queue1.sum().item()) > 0
    assert abs(net.layers[0].buffer.queue2.sum().item()) > 0
    assert abs(net.layers[0].buffer.queue2[:, :, :-1].sum().item()) == 0
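# A self-contained illustration (not the actual FFTNet buffer class) of the
# shift-register behaviour the asserts above rely on: queue2 stays all-zero
# until the first real sample has traversed queue1, which with a 1024-slot
# first queue happens on step 1025. All names here are hypothetical.
class TwoStageBuffer:
    def __init__(self, channels=1, length=1024):
        self.queue1 = torch.zeros(1, channels, length)
        self.queue2 = torch.zeros(1, channels, length)

    def push(self, sample):
        # the oldest value of queue1 spills into the newest slot of queue2
        spill = self.queue1[:, :, :1]
        self.queue1 = torch.cat([self.queue1[:, :, 1:], sample], dim=2)
        self.queue2 = torch.cat([self.queue2[:, :, 1:], spill], dim=2)

buf = TwoStageBuffer()
for _ in range(1024):
    buf.push(torch.rand(1, 1, 1))
assert buf.queue1.sum().item() > 0               # first queue has content
assert buf.queue2.sum().item() == 0              # second queue still empty
buf.push(torch.rand(1, 1, 1))                    # step 1025
assert buf.queue2[:, :, -1].sum().item() > 0     # only the newest slot filled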
# setup TensorBoard
tb = SummaryWriter(OUT_PATH)

# create the FFTNet model
model = FFTNetModel(hid_channels=256, out_channels=256, n_layers=c.num_quant,
                    cond_channels=80)
criterion = MaskedCrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=c.lr)
num_params = count_parameters(model)
print(" > Model has {} parameters".format(num_params))

if use_cuda:
    model.cuda()
    criterion.cuda()

# the two datasets below extend torch.utils.data.Dataset to create the batches;
# each batch is a tuple of three elements: wav, mels, audio file name
train_dataset = LJSpeechDataset(
    os.path.join(c.data_path, "mels", "meta_fftnet_train.csv"),
    os.path.join(c.data_path, "mels"), c.sample_rate, c.num_mels, c.num_freq,
    c.min_level_db, c.frame_shift_ms, c.frame_length_ms, c.preemphasis,
    c.ref_level_db, c.num_quant, c.min_wav_len, c.max_wav_len, False)
val_dataset = LJSpeechDataset(
    os.path.join(c.data_path, "mels", "meta_fftnet_val.csv"),
    os.path.join(c.data_path, "mels"), c.sample_rate, c.num_mels, c.num_freq,
    # remaining arguments assumed to mirror the training dataset
    c.min_level_db, c.frame_shift_ms, c.frame_length_ms, c.preemphasis,
    c.ref_level_db, c.num_quant, c.min_wav_len, c.max_wav_len, False)
print(" ---- Test FFTNetModel step forward ----") net = FFTNetModel(hid_channels=256, out_channels=256, n_layers=11, cond_channels=80) net.eval() print(" > Number of model params: ", count_parameters(net)) x = torch.rand(1, 1, 1) cx = torch.rand(1, 80, 1) time_start = time.time() with torch.no_grad(): for i in tqdm(range(20000)): out = net.forward_step(x, cx) time_avg = (time.time() - time_start) / 20000 print("> Avg time per step inference on CPU: {}".format(time_avg)) # on GPU net = FFTNetModel(hid_channels=256, out_channels=256, n_layers=11, cond_channels=80) net.cuda() net.eval() x = torch.rand(1, 1, 1).cuda() cx = torch.rand(1, 80, 1).cuda() time_start = time.time() for i in tqdm(range(20000)): out = net.forward_step(x, cx) time_avg = (time.time() - time_start) / 20000 print("> Avg time per step inference on GPU: {}".format(time_avg))