Example #1
def main():
    batch = 8
    epoch = 20
    loss = MeanAbsoluteError()
    learning_rate = PiecewiseConstantDecay(boundaries=[100000],
                                           values=[1e-4, 5e-5])
    res_blocks = [
        15, 15, 15, 15, 15, 9, 9, 9, 9, 9, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3
    ]
    checkpoint_dir = './ckpt/edsr'

    # load the data
    ds_train = VCTK(subset='train').dataset()
    ds_valid = VCTK(subset='valid').dataset()

    # build the model
    edsr_model = edsr2(scale=4, res_blocks=res_blocks, res_block_scaling=0.7)

    # train the model
    edsr_trainer = EDSRTrainer(model=edsr_model,
                               loss=loss,
                               learning_rate=learning_rate,
                               checkpoint_dir=checkpoint_dir)
    edsr_trainer.train(train_dataset=ds_train,
                       valid_dataset=ds_valid,
                       batch=batch,
                       epoch=epoch)

    edsr_model.save_weights(
        f'./weights/EDSR_16000_{len(res_blocks)}res_{batch}batch_{epoch}epochs_tanh_entropy_glorot_uniform.h5'
    )
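# --- illustrative addition, not part of the original example ---
# A minimal sketch of reloading the weights saved above for inference,
# assuming the same edsr2 builder and res_blocks configuration; the
# filename simply mirrors what save_weights() wrote with batch=8, epoch=20.
def load_trained_edsr():
    res_blocks = [
        15, 15, 15, 15, 15, 9, 9, 9, 9, 9, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3
    ]
    model = edsr2(scale=4, res_blocks=res_blocks, res_block_scaling=0.7)
    model.load_weights(
        f'./weights/EDSR_16000_{len(res_blocks)}res_8batch_20epochs_tanh_entropy_glorot_uniform.h5'
    )
    return model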
Example #2
import tensorflow as tf
from data import VCTK

data = VCTK(batch_size = 20)

sess = tf.InteractiveSession()
x = data.mfcc

# data.mfcc is fed by a QueueRunner (see the other examples), so the queue
# threads have to be started before the tensor can be evaluated
tf.train.start_queue_runners(sess=sess)

u = x.eval()
print(u)
Example #3
import sugartensor as tf  # the sg_* helpers used below come from sugartensor
from data import VCTK

#
# hyper parameters
#

batch_size = 1  # batch size
num_blocks = 3  # dilated blocks
num_dim = 128  # latent dimension

#
# inputs
#

# VCTK corpus input tensor ( with QueueRunner )
data = VCTK(batch_size=batch_size,
            data_path='/Volumes/Warehouse/VCTK-Corpus/',
            mode='train')

# vocabulary size
voca_size = data.voca_size

# mfcc feature of audio
x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 40))

# sequence length except zero-padding
seq_len = tf.not_equal(x.sg_sum(dims=2), 0.).sg_int().sg_sum(dims=1)

#
# encode graph ( atrous convolution )
#
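# --- illustrative addition, not part of the original example ---
# The snippet stops at the "encode graph ( atrous convolution )" banner.
# Below is a hedged sketch of the gated residual block such an encoder is
# typically assembled from, assuming sugartensor's sg_aconv1d / sg_conv1d
# helpers; the layer arguments are illustrative, not taken from the source.
def res_block(tensor, size, rate, dim=num_dim):
    # dilated (atrous) filter and gate convolutions
    conv_filter = tensor.sg_aconv1d(size=size, rate=rate, act='tanh', bn=True)
    conv_gate = tensor.sg_aconv1d(size=size, rate=rate, act='sigmoid', bn=True)
    # gated activation
    out = conv_filter * conv_gate
    # 1x1 convolution back to the residual dimension
    out = out.sg_conv1d(size=1, dim=dim, act='tanh', bn=True)
    # residual connection plus skip output
    return out + tensor, out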
Example #4
import sugartensor as tf  # the sg_* helpers used below come from sugartensor
from data import VCTK

tf.sg_verbosity(10)

#
# hyper parameters
#

batch_size = 16  # batch size
num_blocks = 3  # dilated blocks
num_dim = 128  # latent dimension

#
# inputs
#

# VCTK corpus input tensor ( with QueueRunner )
data = VCTK(batch_size=batch_size)

# vocabulary size
voca_size = data.voca_size

# mfcc feature of audio
x = data.mfcc

# sequence length except zero-padding
seq_len = tf.not_equal(x.sg_sum(dims=2), 0.).sg_int().sg_sum(dims=1)

# target sentence label
y = data.label

#
# encode graph ( atrous convolution )
Example #5
import os


def main(data_dir, p_name):
    index = 1  # sample index
    person = os.path.join(data_dir, p_name)
    # # Get Audio files
    filenames = os.listdir(person)
    # # Play sample files
    # # play_sound(get_wav_name(person, filenames[index]))
    # # Read file
    # rate, data = wavfile.read(get_wav_name(person, filenames[index]))
    # print(data.min())
    # text = os.path.join(person,filenames[index])
    # file = tf.io.read_file(text)
    # data, rate = tf.audio.decode_wav(file)
    #
    # print(type(data))
    """ 다운샘플링한거 주파수영역에서 보기 """
    # F = np.fft.fft(data.numpy().ravel())
    # mag = np.abs(F)
    # freq = rate.numpy()/len(mag)
    # w = np.arange(0, len(mag)) * freq
    # plt.plot(w,mag)
    # plt.show()
    # print(mag.min(), mag.max())

    # # TODO: rescale the samples (required if decode_wav was used)
    # data = data.numpy()
    # print(((2**15)*data).astype('int16').min(), rate)

    # downsampled_data = signal.decimate(data.numpy().ravel(), 12)
    # downsampled_data = signal.decimate(downsampled_data, 2)
    # downsampled_data = downsampled_data * (2**15)
    # print(len(downsampled_data) / 2000)

    # print(downsampled_data.reshape((-1,)).shape)

    # wavfile.write('test1.wav', 2000, downsampled_data.astype('int16'))

    # a = [[1,2,3,4], [2,3,4], [3,5,6], [4,5,6], [5,1]]
    # a = np.array(a)
    # # print(type(a[0]))
    # file = h5py.File('./data/test.h5', 'w')
    # dt = h5py.vlen_dtype(np.dtype('int16'))
    # dataset = file.create_dataset('test', shape=(5,), dtype=dt)
    # for i, d in enumerate(a):
    #     dataset[i] = d
    # file.close()

    # TODO: save an hdf5 file whose elements have variable lengths
    # file = h5py.File('./data/test.h5', 'r')
    # data = file['test'][...]
    # d_gen = data_generator(data)
    # for file in d_gen:
    #     print(file.dtype)

    ds_hr, ds_lr = VCTK().dataset()

    i = 1
    for hr, lr in zip(ds_hr.repeat(1), ds_lr.repeat(1)):
        if i % 1000 == 0:
            print(i)
        i += 1
        # print(hr.shape, lr.shape)
    print(i)
Example #6
import sugartensor as tf  # the sg_* helpers used below come from sugartensor
from data import VCTK

tf.sg_verbosity(10)

#
# hyper parameters
#

batch_size = 1     # batch size
num_blocks = 3     # dilated blocks
num_dim = 128      # latent dimension

#
# inputs
#

# VCTK corpus input tensor ( with QueueRunner )
data = VCTK(vocabulary_loading=True)

# vocabulary size
voca_size = data.voca_size

# mfcc feature of audio
x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))

# sequence length except zero-padding
seq_len = tf.not_equal(x.sg_sum(dims=2), 0.).sg_int().sg_sum(dims=1)


#
# encode graph ( atrous convolution )
#
Example #7
import sugartensor as tf  # the sg_* helpers used below come from sugartensor
from data import VCTK

tf.sg_verbosity(10)

#
# hyper parameters
#

batch_size = 1  # batch size
num_blocks = 3  # dilated blocks
num_dim = 128  # latent dimension

#
# inputs
#

# VCTK corpus input tensor ( with QueueRunner )
data = VCTK()

# vocabulary size
voca_size = data.voca_size

# mfcc feature of audio
x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))

# sequence length except zero-padding
seq_len = tf.not_equal(x.sg_sum(dims=2), 0.).sg_int().sg_sum(dims=1)

#
# encode graph ( atrous convolution )
#