Example #1
def discrete_collate(batch):
    """collate function used for discrete wav output, such as 9-bit, mulaw-discrete, etc.
    """
    
    pad = 2
    mel_win = hp.seq_len // hp.hop_size + 2 * pad
    max_offsets = [x[0].shape[-1] - (mel_win + 2 * pad) for x in batch]
    mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
    sig_offsets = [(offset + pad) * hp.hop_size for offset in mel_offsets]
    
    mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] \
            for i, x in enumerate(batch)]
    
    coarse = [x[1][sig_offsets[i]:sig_offsets[i] + hp.seq_len + 1] \
              for i, x in enumerate(batch)]
    
    mels = np.stack(mels).astype(np.float32)
    coarse = np.stack(coarse).astype(np.int64)
    
    mels = torch.FloatTensor(mels)
    coarse = torch.LongTensor(coarse)
    if hp.input_type == 'bits':
        x_input = 2 * coarse[:, :hp.seq_len].float() / (2**hp.bits - 1.) - 1.
    elif hp.input_type == 'mulaw':
        x_input = inv_mulaw_quantize(coarse[:, :hp.seq_len], hp.mulaw_quantize_channels)
    else:
        raise ValueError("unknown input_type: {}".format(hp.input_type))
    
    y_coarse = coarse[:, 1:]
    
    return x_input, mels, y_coarse
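
For reference, a minimal numpy sketch of the inverse mu-law quantization these examples call (an assumed standard formulation; each project above ships its own torch/tf implementation, so the name is suffixed to mark it as hypothetical):

import numpy as np

def inv_mulaw_quantize_sketch(y, quantize_channels=256):
    # map integer codes [0, mu] to [-1, 1], then invert the companding
    mu = quantize_channels - 1
    y = 2 * np.asarray(y, dtype=np.float32) / mu - 1.0
    return np.sign(y) * ((1.0 + mu) ** np.abs(y) - 1.0) / mu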
Example #2
        def body(time, current_inputs, final_outputs, current_input_buffers,
                 current_c_buffers):
            # we need to shift the condition window by one step
            current_c = c[:, time:time + 1, :] if c is not None else None

            current_outputs = current_inputs
            new_input_buffers = []
            new_c_buffers = []

            for layer, current_input_buffer, current_c_buffer in zip(
                    self.fft_layers, current_input_buffers, current_c_buffers):
                current_outputs, out_input_buffer, out_c_buffer = layer.incremental_forward(
                    inputs=current_outputs,
                    c=current_c,
                    input_buffers=current_input_buffer,
                    c_buffers=current_c_buffer,
                )
                new_input_buffers.append(out_input_buffer)
                new_c_buffers.append(out_c_buffer)

            current_outputs = self.out_layer(current_outputs)

            posterior = tf.nn.softmax(tf.reshape(current_outputs, [1, -1]),
                                      axis=-1)

            # dist = tf.distributions.Categorical(probs=posterior)
            # sample = tf.cast(dist.sample(), tf.int32)

            sample = tf.py_func(np.random.choice, [
                np.arange(self.hp.quantize_channels), 1, True,
                tf.reshape(posterior, [-1])
            ], tf.int64)
            sample = tf.reshape(sample, [-1])

            # sample = tf.argmax(posterior, axis=-1)

            decode_sample = utils.inv_mulaw_quantize(sample,
                                                     self.hp.quantize_channels)
            final_outputs = final_outputs.write(time, decode_sample)

            if utils.is_mulaw_quantize(self.hp.input_type):
                next_sample = tf.one_hot(tf.cast(sample, tf.int32),
                                         self.hp.quantize_channels)
            else:
                next_sample = decode_sample

            next_time = time + 1
            next_inputs = current_inputs[:, 1:, :]
            if test_inputs is not None:
                next_sample = tf.reshape(test_inputs[:, next_time],
                                         [1, 1, self.in_channels])
            else:
                next_sample = tf.reshape(next_sample, [1, 1, self.in_channels])

            next_inputs = tf.concat(
                [next_inputs, tf.cast(next_sample, tf.float32)], axis=1)

            return next_time, next_inputs, final_outputs, new_input_buffers, new_c_buffers
Example #3
def synthesize(mel_sp, save_path, weight_path):
    """Generate a waveform from a mel spectrogram and save it as a wav file."""
    wavenet = WaveNet(hparams.num_mels, hparams.upsample_scales)
    wavenet.load_weights(weight_path)
    mel_sp = tf.expand_dims(mel_sp, axis=0)

    outputs = wavenet.synthesis(mel_sp)
    outputs = np.squeeze(outputs)
    outputs = inv_mulaw_quantize(outputs)

    save_wav(outputs, save_path, hparams.sampling_rate)
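
A hedged usage sketch for synthesize (the .npy and checkpoint paths are placeholders; the mel shape is assumed to match what wavenet.synthesis expects):

import numpy as np

mel_sp = np.load("mel.npy")  # hypothetical precomputed mel spectrogram
synthesize(mel_sp, save_path="out.wav", weight_path="wavenet_weights.h5")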
Example #4
def discrete_collate(batch):
    """collate function used for discrete wav output, such as 9-bit, mulaw-discrete, etc.
    """

    pad = 2
    mel_win = hp.seq_len // hp.hop_size + 2 * pad
    max_offsets = [x[0].shape[-1] - (mel_win + 2 * pad) for x in batch]
    mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
    sig_offsets = [(offset + pad) * hp.hop_size for offset in mel_offsets]

    mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] \
            for i, x in enumerate(batch)]

    coarse = [x[1][sig_offsets[i]:sig_offsets[i] + hp.wav_seq_factor * hp.seq_len + 1] \
              for i, x in enumerate(batch)]

    mels = np.stack(mels).astype(np.float32)
    try:
        coarse = np.stack(coarse).astype(np.int64)
    except ValueError:
        # np.stack fails when a truncated wav file yields a short segment;
        # patch each bad entry by copying its nearest good neighbor
        sz = np.max([len(c) for c in coarse])
        c_errs = [n for n, c in enumerate(coarse) if len(c) != sz]
        for c_err in c_errs:
            # walk backwards to the nearest good index
            # (negative indexing wraps around)
            idx = c_err - 1
            while idx in c_errs:
                idx -= 1
            coarse[c_err] = coarse[idx].copy()
            mels[c_err] = mels[idx].copy()
        coarse = np.stack(coarse).astype(np.int64)

    mels = torch.FloatTensor(mels)
    coarse = torch.LongTensor(coarse)
    if hp.input_type == 'bits':
        x_input = 2 * coarse[:, :hp.wav_seq_factor *
                             hp.seq_len].float() / (2**hp.bits - 1.) - 1.
    elif hp.input_type == 'mulaw':
        x_input = inv_mulaw_quantize(
            coarse[:, :hp.wav_seq_factor * hp.seq_len],
            hp.mulaw_quantize_channels)
    else:
        raise ValueError("unknown input_type: {}".format(hp.input_type))

    y_coarse = coarse[:, 1:]

    return x_input, mels, y_coarse
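
A minimal sketch of wiring either collate function into a PyTorch DataLoader, assuming a Dataset whose __getitem__ returns (mel, quantized_wav) pairs (dataset and batch_size are placeholders):

from torch.utils.data import DataLoader

loader = DataLoader(dataset,  # hypothetical Dataset of (mel, wav) pairs
                    batch_size=32,
                    shuffle=True,
                    collate_fn=discrete_collate)
for x_input, mels, y_coarse in loader:
    pass  # x_input and mels feed the model; y_coarse is the one-step-shifted target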
Example #5
File: model.py  Project: kastnerkyle/stexp
    def batch_generate(self, mels):
        """mel should be of shape [batch_size x 80 x mel_length]
        """
        self.eval()
        output = []
        rnn1 = self.get_gru_cell(self.rnn1)
        rnn2 = self.get_gru_cell(self.rnn2)
        b_size = mels.shape[0]
        assert len(mels.shape) == 3, "mels should have shape [batch_size x 80 x mel_length]"
        
        with torch.no_grad():
            x = torch.zeros(b_size, 1).cuda()
            h1 = torch.zeros(b_size, self.rnn_dims).cuda()
            h2 = torch.zeros(b_size, self.rnn_dims).cuda()
            
            mels = torch.FloatTensor(mels).cuda()
            mels, aux = self.upsample(mels)
            
            aux_idx = [self.aux_dims * i for i in range(5)]
            a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
            a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
            a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
            a4 = aux[:, :, aux_idx[3]:aux_idx[4]]
            
            seq_len = mels.size(1)
            
            for i in tqdm(range(seq_len)):

                m_t = mels[:, i, :]
                a1_t = a1[:, i, :]
                a2_t = a2[:, i, :]
                a3_t = a3[:, i, :]
                a4_t = a4[:, i, :]
                
                x = torch.cat([x, m_t, a1_t], dim=1)
                x = self.I(x)
                h1 = rnn1(x, h1)
                
                x = x + h1
                inp = torch.cat([x, a2_t], dim=1)
                h2 = rnn2(inp, h2)
                
                x = x + h2
                x = torch.cat([x, a3_t], dim=1)
                x = F.relu(self.fc1(x))
                
                x = torch.cat([x, a4_t], dim=1)
                x = F.relu(self.fc2(x))
                x = self.fc3(x)
                if hp.input_type == 'raw':
                    sample = sample_from_beta_dist(x.unsqueeze(0))
                elif hp.input_type == 'mixture':
                    sample = sample_from_discretized_mix_logistic(x.unsqueeze(-1), hp.log_scale_min)
                elif hp.input_type == 'bits':
                    posterior = F.softmax(x, dim=1).view(b_size, -1)
                    distrib = torch.distributions.Categorical(posterior)
                    sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
                elif hp.input_type == 'mulaw':
                    posterior = F.softmax(x, dim=1).view(b_size, -1)
                    distrib = torch.distributions.Categorical(posterior)
                    sample = inv_mulaw_quantize(distrib.sample(), hp.mulaw_quantize_channels, True)
                output.append(sample.view(-1))
                x = sample.view(b_size, 1)
        output = torch.stack(output).cpu().numpy()
        self.train()
        # output has shape [seq_len x batch_size]; swap axes and flatten
        # to merge the segments into one wav of length batch_size * seq_len
        assert output.shape[1] == b_size
        output = output.swapaxes(1, 0).reshape(-1)
        return output
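
A hedged call sketch: batch_generate expects several equal-length mel windows stacked along the batch axis and already merges the generated segments into one waveform (mel, the window width w, and model are assumptions):

import numpy as np

windows = np.stack([mel[:, i * w:(i + 1) * w] for i in range(4)])  # [4, 80, w]
wav = model.batch_generate(windows)  # 1-D waveform covering all four windows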
Example #6
File: model.py  Project: kastnerkyle/stexp
    def generate(self, mels, DEVICE="cuda"):
        self.eval()
        output = []
        rnn1 = self.get_gru_cell(self.rnn1)
        rnn2 = self.get_gru_cell(self.rnn2)
        
        with torch.no_grad():
            x = torch.zeros(1, 1).to(DEVICE)
            h1 = torch.zeros(1, self.rnn_dims).to(DEVICE)
            h2 = torch.zeros(1, self.rnn_dims).to(DEVICE)
            
            mels = torch.FloatTensor(mels).to(DEVICE).unsqueeze(0)
            mels, aux = self.upsample(mels)
            
            aux_idx = [self.aux_dims * i for i in range(5)]
            a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
            a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
            a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
            a4 = aux[:, :, aux_idx[3]:aux_idx[4]]
            
            seq_len = mels.size(1)
            
            for i in tqdm(range(seq_len)):

                m_t = mels[:, i, :]
                a1_t = a1[:, i, :]
                a2_t = a2[:, i, :]
                a3_t = a3[:, i, :]
                a4_t = a4[:, i, :]
                
                x = torch.cat([x, m_t, a1_t], dim=1)
                x = self.I(x)
                h1 = rnn1(x, h1)
                
                x = x + h1
                inp = torch.cat([x, a2_t], dim=1)
                h2 = rnn2(inp, h2)
                
                x = x + h2
                x = torch.cat([x, a3_t], dim=1)
                x = F.relu(self.fc1(x))
                
                x = torch.cat([x, a4_t], dim=1)
                x = F.relu(self.fc2(x))
                x = self.fc3(x)
                if hp.input_type == 'raw':
                    if hp.distribution == 'beta':
                        sample = sample_from_beta_dist(x.unsqueeze(0))
                    elif hp.distribution == 'gaussian':
                        sample = sample_from_gaussian(x.unsqueeze(0))
                elif hp.input_type == 'mixture':
                    sample = sample_from_discretized_mix_logistic(x.unsqueeze(-1), hp.log_scale_min)
                elif hp.input_type == 'bits':
                    posterior = F.softmax(x, dim=1).view(-1)
                    distrib = torch.distributions.Categorical(posterior)
                    sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
                elif hp.input_type == 'mulaw':
                    posterior = F.softmax(x, dim=1).view(-1)
                    distrib = torch.distributions.Categorical(posterior)
                    sample = inv_mulaw_quantize(distrib.sample(), hp.mulaw_quantize_channels, True)
                output.append(sample.view(-1))
                x = sample.view(1, 1).float().to(DEVICE)
        output = torch.stack(output).cpu().numpy()
        self.train()
        return output
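
And a hedged single-utterance sketch (soundfile, the mel source, and the sample rate are assumptions, not part of the source):

import soundfile as sf

wav = model.generate(mel)  # mel: [80 x mel_length] conditioning features
sf.write("generated.wav", wav, 22050)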
Example #7
    def generate(self, mels, target=11000, overlap=550, batched=True):

        self.eval()
        output = []

        rnn1 = self.get_gru_cell(self.rnn1)

        with torch.no_grad():
            mels = torch.FloatTensor(mels).cuda().unsqueeze(0)
            mels = self.pad_tensor(mels.transpose(1, 2), pad=hp.pad, side='both')

            mels, aux = self.upsample(mels.transpose(1, 2))

            if batched:
                mels = self.fold_with_overlap(mels, target, overlap)
                aux = self.fold_with_overlap(aux, target, overlap)

            b_size, seq_len, _ = mels.size()

            h1 = torch.zeros(b_size, self.rnn_dims).cuda()

            x = torch.zeros(b_size, 1).cuda()

            d = self.aux_dims
            aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(2)]

            for i in range(seq_len):

                m_t = mels[:, i, :]

                a1_t, a2_t = (a[:, i, :] for a in aux_split)

                x = torch.cat([x, m_t, a1_t[:, :-1]], dim=1)
                x = self.I(x)
                h1 = rnn1(x, h1)

                x = x + h1
                x = torch.cat([x, a2_t], dim=1)
                x = F.relu(self.fc1(x))
                #x = F.relu(self.fc2(x))
                x = self.fc3(x)

                if hp.input_type == 'raw':
                    sample = sample_from_beta_dist(x.unsqueeze(0)).view(-1)
                elif hp.input_type == 'mixture':
                    sample = sample_from_discretized_mix_logistic(x.unsqueeze(-1), hp.log_scale_min)
                elif hp.input_type == 'bits':
                    posterior = F.softmax(x, dim=1)
                    distrib = torch.distributions.Categorical(posterior)
                    sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
                elif hp.input_type == 'mulaw':
                    posterior = F.softmax(x, dim=1)
                    distrib = torch.distributions.Categorical(posterior)
                    sample = inv_mulaw_quantize(distrib.sample(), hp.mulaw_quantize_channels, True)

                output.append(sample)
                x = sample.unsqueeze(-1)

        output = torch.stack(output).transpose(0, 1)
        output = output.cpu().numpy()

        if batched:
            output = self.xfade_and_unfold(output, target, overlap)
        else:
            output = output[0]

        self.train()
        return output
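
A hedged call sketch: with batched=True the mel sequence is folded into overlapping windows, generated in parallel, and cross-faded back together by xfade_and_unfold (mel is an assumption; target and overlap follow the defaults above):

wav = model.generate(mel, target=11000, overlap=550, batched=True)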
Example #8
    def incremental_forward(self,
                            c=None,
                            g=None,
                            test_inputs=None,
                            targets=None):
        if g is not None:
            raise NotImplementedError("global condition is not added now!")

        # use zeros as the initial inputs
        inputs = tf.zeros([1, 1], dtype=tf.float32)
        if utils.is_mulaw_quantize(self.hp.input_type):
            inputs = utils.mulaw_quantize(inputs, self.hp.quantize_channels)
            inputs = tf.one_hot(tf.cast(inputs, tf.int32),
                                self.hp.quantize_channels)
        else:
            inputs = tf.expand_dims(inputs, axis=-1)

        # check whether the condition needs to be upsampled
        if c is not None and self.upsample_conv is not None:
            c = tf.expand_dims(c, axis=-1)  # [B T cin_channels 1]
            for transposed_conv in self.upsample_conv:
                c = transposed_conv(c)
            c = tf.squeeze(c, axis=-1)  # [B new_T cin_channels]

        # apply zero padding to condition
        if c is not None:
            c_shape = tf.shape(c)
            padding_c = tf.zeros(
                [c_shape[0], self.receptive_filed, c_shape[-1]])
            c = tf.concat([padding_c, c], axis=1)
            # create c_buffers
            c_buffers = [
                tf.zeros([1, 2**i // 2 + 1, self.hp.cin_channels])
                for i in range(self.hp.n_layers, 0, -1)
            ]

        synthesis_length = tf.shape(c)[1]

        initial_time = tf.constant(0, dtype=tf.int32)

        initial_outputs_ta = tf.TensorArray(dtype=tf.float32,
                                            size=0,
                                            dynamic_size=True)

        input_buffers = [
            self._convert_type(tf.zeros([1, 2**self.hp.n_layers // 2 + 1]))
        ]
        for i in range(self.hp.n_layers - 1, 0, -1):
            input_buffers.append(
                self._convert_type(tf.zeros([1, 2**i // 2 + 1])))

        def condition(time, unused_initial_input, unused_final_outputs,
                      unused_input_buffers, unused_c_buffers):
            return tf.less(time, synthesis_length)

        def body(time, current_inputs, final_outputs, current_input_buffers,
                 current_c_buffers):
            # we need to shift the condition window by one step
            current_c = c[:, time:time + 1, :] if c is not None else None

            current_outputs = current_inputs
            new_input_buffers = []
            new_c_buffers = []

            for layer, current_input_buffer, current_c_buffer in zip(
                    self.fft_layers, current_input_buffers, current_c_buffers):
                current_outputs, out_input_buffer, out_c_buffer = layer.incremental_forward(
                    inputs=current_outputs,
                    c=current_c,
                    input_buffers=current_input_buffer,
                    c_buffers=current_c_buffer,
                )
                new_input_buffers.append(out_input_buffer)
                new_c_buffers.append(out_c_buffer)

            current_outputs = self.out_layer(current_outputs)

            posterior = tf.nn.softmax(tf.reshape(current_outputs, [1, -1]),
                                      axis=-1)

            # dist = tf.distributions.Categorical(probs=posterior)
            # sample = tf.cast(dist.sample(), tf.int32)

            sample = tf.py_func(np.random.choice, [
                np.arange(self.hp.quantize_channels), 1, True,
                tf.reshape(posterior, [-1])
            ], tf.int64)
            sample = tf.reshape(sample, [-1])

            # sample = tf.argmax(posterior, axis=-1)

            decode_sample = utils.inv_mulaw_quantize(sample,
                                                     self.hp.quantize_channels)
            final_outputs = final_outputs.write(time, decode_sample)

            if utils.is_mulaw_quantize(self.hp.input_type):
                next_sample = tf.one_hot(tf.cast(sample, tf.int32),
                                         self.hp.quantize_channels)
            else:
                next_sample = decode_sample

            next_time = time + 1
            next_inputs = current_inputs[:, 1:, :]
            if test_inputs is not None:
                next_sample = tf.reshape(test_inputs[:, next_time],
                                         [1, 1, self.in_channels])
            else:
                next_sample = tf.reshape(next_sample, [1, 1, self.in_channels])

            next_inputs = tf.concat(
                [next_inputs, tf.cast(next_sample, tf.float32)], axis=1)

            return next_time, next_inputs, final_outputs, new_input_buffers, new_c_buffers

        result = tf.while_loop(condition,
                               body,
                               loop_vars=[
                                   initial_time, inputs, initial_outputs_ta,
                                   input_buffers, c_buffers
                               ],
                               parallel_iterations=32,
                               swap_memory=True)

        outputs_ta = result[2]
        outputs = outputs_ta.stack()
        self.eval_outputs = outputs
        self.eval_targets = utils.inv_mulaw_quantize(
            targets,
            self.hp.quantize_channels) if targets is not None else None
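
For symmetry with the inverse transform sketched under Example #1, a minimal numpy sketch of the forward mu-law quantization that utils.mulaw_quantize presumably performs (assumed standard formulation, not the project's code):

import numpy as np

def mulaw_quantize_sketch(x, quantize_channels=256):
    # compand [-1, 1] audio with mu-law, then map to integer codes [0, mu]
    mu = quantize_channels - 1
    x = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    return ((x + 1) / 2 * mu + 0.5).astype(np.int64)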