from tensorflow.keras.losses import MeanAbsoluteError
from tensorflow.keras.optimizers.schedules import PiecewiseConstantDecay

from data import VCTK
# edsr2 and EDSRTrainer are project-local; the module paths below are assumed.
from model import edsr2
from train import EDSRTrainer


def main():
    batch = 8
    epoch = 20
    loss = MeanAbsoluteError()
    learning_rate = PiecewiseConstantDecay(boundaries=[100000], values=[1e-4, 5e-5])
    res_blocks = [
        15, 15, 15, 15, 15,
        9, 9, 9, 9, 9,
        5, 5, 5, 5, 5,
        3, 3, 3, 3, 3
    ]
    checkpoint_dir = './ckpt/edsr'

    # Load data
    ds_train = VCTK(subset='train').dataset()
    ds_valid = VCTK(subset='valid').dataset()

    # Build model
    edsr_model = edsr2(scale=4, res_blocks=res_blocks, res_block_scaling=0.7)

    # Train
    edsr_trainer = EDSRTrainer(model=edsr_model,
                               loss=loss,
                               learning_rate=learning_rate,
                               checkpoint_dir=checkpoint_dir)
    edsr_trainer.train(train_dataset=ds_train,
                       valid_dataset=ds_valid,
                       batch=batch,
                       epoch=epoch)

    edsr_model.save_weights(
        f'./weights/EDSR_16000_{len(res_blocks)}res_{batch}batch_{epoch}epochs_tanh_entropy_glorot_uniform.h5'
    )
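# Minimal entry-point guard, assuming this module is meant to be run directly as a script.
if __name__ == '__main__':
    main()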
import tensorflow as tf

from data import VCTK

# Load a batch of VCTK MFCC features and print them for inspection.
data = VCTK(batch_size=20)

sess = tf.InteractiveSession()
# The VCTK input tensor is queue-based (see the QueueRunner note in the training scripts),
# so the queue runners must be started before eval() or the session will block.
tf.train.start_queue_runners(sess=sess)

x = data.mfcc
u = x.eval()
print(u)
import sugartensor as tf   # sugartensor wraps TensorFlow and provides the sg_* helpers used below
from data import VCTK

#
# hyper parameters
#

batch_size = 1     # batch size
num_blocks = 3     # dilated blocks
num_dim = 128      # latent dimension

#
# inputs
#

# VCTK corpus input tensor ( with QueueRunner )
data = VCTK(batch_size=batch_size, data_path='/Volumes/Warehouse/VCTK-Corpus/', mode='train')

# vocabulary size
voca_size = data.voca_size

# mfcc feature of audio
x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 40))

# sequence length excluding zero-padding
seq_len = tf.not_equal(x.sg_sum(dims=2), 0.).sg_int().sg_sum(dims=1)

#
# encode graph ( atrous convolution )
#
import sugartensor as tf   # sugartensor wraps TensorFlow and provides the sg_* helpers used below
from data import VCTK

# set log level to debug
tf.sg_verbosity(10)

#
# hyper parameters
#

batch_size = 16    # batch size
num_blocks = 3     # dilated blocks
num_dim = 128      # latent dimension

#
# inputs
#

# VCTK corpus input tensor ( with QueueRunner )
data = VCTK(batch_size=batch_size)

# vocabulary size
voca_size = data.voca_size

# mfcc feature of audio
x = data.mfcc

# sequence length excluding zero-padding
seq_len = tf.not_equal(x.sg_sum(dims=2), 0.).sg_int().sg_sum(dims=1)

# target sentence label
y = data.label

#
# encode graph ( atrous convolution )
#
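# Aside: the seq_len computation above rewritten with plain TensorFlow ops instead of the
# sugartensor sg_* helpers; an equivalent sketch, not part of the original graph.
# Frames whose MFCC features sum to zero are treated as zero-padding and excluded.
seq_len_plain = tf.reduce_sum(
    tf.cast(tf.not_equal(tf.reduce_sum(x, axis=2), 0.), tf.int32),
    axis=1)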
import os

from data import VCTK


def main(data_dir, p_name):
    index = 1  # sample index
    person = os.path.join(data_dir, p_name)

    # Get audio files
    filenames = [fname for fname in os.listdir(person)]

    # Play sample files
    # play_sound(get_wav_name(person, filenames[index]))

    # Read file
    # rate, data = wavfile.read(get_wav_name(person, filenames[index]))
    # print(data.min())
    # text = os.path.join(person, filenames[index])
    # file = tf.io.read_file(text)
    # data, rate = tf.audio.decode_wav(file)
    # # print(type(data))

    """ Inspect the downsampled signal in the frequency domain """
    # F = np.fft.fft(data.numpy().ravel())
    # mag = np.abs(F)
    # freq = rate.numpy() / len(mag)
    # w = np.arange(0, len(mag)) * freq
    # plt.plot(w, mag)
    # plt.show()
    # print(mag.min(), mag.max())

    # TODO: adjust the scale (required if the audio was decoded with decode_wav)
    # data = data.numpy()
    # print(((2**15) * data).astype('int16').min(), rate)
    # downsampled_data = signal.decimate(data.numpy().ravel(), 12)
    # downsampled_data = signal.decimate(downsampled_data, 2)
    # downsampled_data = downsampled_data * (2**15)
    # print(len(downsampled_data) / 2000)
    # print(downsampled_data.reshape((-1,)).shape)
    # wavfile.write('test1.wav', 2000, downsampled_data.astype('int16'))

    # a = [[1, 2, 3, 4], [2, 3, 4], [3, 5, 6], [4, 5, 6], [5, 1]]
    # a = np.array(a)
    # # print(type(a[0]))
    # file = h5py.File('./data/test.h5', 'w')
    # dt = h5py.vlen_dtype(np.dtype('int16'))
    # dataset = file.create_dataset('test', shape=(5,), dtype=dt)
    # for i, d in enumerate(a):
    #     dataset[i] = d
    # file.close()
    # TODO: save an HDF5 file whose elements have variable lengths

    # file = h5py.File('./data/test.h5', 'r')
    # data = file['test'][...]
    # d_gen = data_generator(data)
    # for file in d_gen:
    #     print(file.dtype)

    # Iterate over the high-resolution / low-resolution dataset pair and count the examples.
    ds_hr, ds_lr = VCTK().dataset()
    i = 1
    for hr, lr in zip(ds_hr.repeat(1), ds_lr.repeat(1)):
        if i % 1000 == 0:
            print(i)
        i += 1
        # print(hr.shape, lr.shape)
    print(i)
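# A minimal sketch of the variable-length HDF5 round-trip that the TODO above refers to,
# assuming h5py >= 2.9 (for h5py.vlen_dtype); the file path, dataset name, and function
# names below are illustrative, not part of the original code.
import numpy as np
import h5py


def write_vlen_int16(path, arrays):
    # Store a list of variable-length int16 arrays as a single HDF5 dataset.
    with h5py.File(path, 'w') as f:
        dt = h5py.vlen_dtype(np.dtype('int16'))
        dset = f.create_dataset('audio', shape=(len(arrays),), dtype=dt)
        for i, arr in enumerate(arrays):
            dset[i] = arr


def read_vlen_int16(path):
    # Read the arrays back; each element comes out as its own 1-D int16 array.
    with h5py.File(path, 'r') as f:
        return f['audio'][...]


# Example usage:
# write_vlen_int16('./data/test.h5', [[1, 2, 3, 4], [2, 3, 4], [5, 1]])
# print(read_vlen_int16('./data/test.h5'))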
import sugartensor as tf   # sugartensor wraps TensorFlow and provides the sg_* helpers used below
from data import VCTK

# set log level to debug
tf.sg_verbosity(10)

#
# hyper parameters
#

batch_size = 1     # batch size
num_blocks = 3     # dilated blocks
num_dim = 128      # latent dimension

#
# inputs
#

# VCTK corpus input tensor ( with QueueRunner )
data = VCTK(vocabulary_loading=True)

# vocabulary size
voca_size = data.voca_size

# mfcc feature of audio
x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))

# sequence length excluding zero-padding
seq_len = tf.not_equal(x.sg_sum(dims=2), 0.).sg_int().sg_sum(dims=1)

#
# encode graph ( atrous convolution )
#
import sugartensor as tf   # sugartensor wraps TensorFlow and provides the sg_* helpers used below
from data import VCTK

# set log level to debug
tf.sg_verbosity(10)

#
# hyper parameters
#

batch_size = 1     # batch size
num_blocks = 3     # dilated blocks
num_dim = 128      # latent dimension

#
# inputs
#

# VCTK corpus input tensor ( with QueueRunner )
data = VCTK()

# vocabulary size
voca_size = data.voca_size

# mfcc feature of audio
x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))

# sequence length excluding zero-padding
seq_len = tf.not_equal(x.sg_sum(dims=2), 0.).sg_int().sg_sum(dims=1)

#
# encode graph ( atrous convolution )
#