Example #1
    def _convert_type(self, inputs):
        if utils.is_mulaw_quantize(self.hp.input_type):
            # mu-law quantize the waveform and one-hot encode the class indices
            inputs = utils.mulaw_quantize(inputs, self.hp.quantize_channels)
            inputs = tf.one_hot(tf.cast(inputs, tf.int32),
                                self.hp.quantize_channels)
        else:
            # raw input: just add a channel axis
            inputs = tf.expand_dims(inputs, axis=-1)
        return inputs
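
For reference, here is a minimal sketch of what the utils.is_mulaw_quantize / utils.mulaw_quantize helpers used above are assumed to do (the project's actual utilities may differ); inv_mulaw_quantize is included because the later examples call it.

import numpy as np

def is_mulaw_quantize(input_type):
    # assumption: the project flags mu-law-quantized inputs with this string
    return input_type == "mulaw-quantize"

def mulaw_quantize(x, quantize_channels=256):
    # mu-law companding of x in [-1, 1], then mapping to integer bins [0, mu]
    mu = quantize_channels - 1
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    return ((y + 1) / 2 * mu + 0.5).astype(np.int64)

def inv_mulaw_quantize(y, quantize_channels=256):
    # undo the quantization, then the companding
    mu = quantize_channels - 1
    x = 2 * np.asarray(y, dtype=np.float64) / mu - 1
    return np.sign(x) * ((1 + mu) ** np.abs(x) - 1) / mu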
Example #2
    def __getitem__(self, index):
        entry = self.metadata[index]
        m = np.load(entry[2].strip())
        wav = np.load(entry[1].strip())

        if hp.input_type == 'raw' or hp.input_type == 'mixture':
            wav = wav.astype(np.float32)
        elif hp.input_type == 'mulaw':
            # np.int was removed in recent NumPy; use a concrete integer dtype
            wav = mulaw_quantize(wav, hp.mulaw_quantize_channels).astype(np.int64)
        elif hp.input_type == 'bits':
            wav = quantize(wav).astype(np.int64)
        else:
            raise ValueError("hp.input_type {} not recognized".format(hp.input_type))
        return m, wav
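
The quantize() call in the 'bits' branch is not shown in this example; a plausible sketch, with the bit depth and scaling as assumptions rather than project values, is a plain linear quantizer:

import numpy as np

def quantize(wav, bits=9):
    # hypothetical linear quantizer for the 'bits' input type:
    # map floats in [-1, 1] to integers in [0, 2**bits - 1]
    return np.floor((wav + 1.0) * (2 ** bits - 1) / 2 + 0.5).astype(np.int64)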
Example #3
    def get_one_example(self):
        for meta in self._metadata:
            audio_file = meta[0]
            input_data = np.load(os.path.join(self.data_dir, audio_file))
            if self.use_local:
                mel_file = meta[1]
                local_feature = np.load(os.path.join(self.data_dir, mel_file))
            else:
                local_feature = False
            # ===== To Do ===== #
            global_feature = False
            # adjust the time steps of the audio and the local condition
            max_time_step = self._limit_time()
            input_data, local_feature = self._adjust_time_step(input_data, local_feature, max_time_step)
            # make sure the target is mu-law encoded
            if utils.is_mulaw_quantize(self._hparams.input_type):
                target_data = input_data
            else:
                target_data = utils.mulaw_quantize(input_data, self._hparams.quantize_channels)

            input_length = len(input_data)
            yield input_data, target_data, input_length, local_feature, global_feature
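
Assuming a mu-law-quantized input_type with local conditioning enabled, the generator above can be wrapped in a tf.data pipeline; the dtypes below are assumptions inferred from what the snippet yields, and feeder stands in for an instance of the surrounding class.

import tensorflow as tf

dataset = tf.data.Dataset.from_generator(
    feeder.get_one_example,
    # inputs/targets are integer sample arrays, the local feature is a float
    # mel matrix, and the unused global feature stays a plain bool
    output_types=(tf.int32, tf.int32, tf.int32, tf.float32, tf.bool))

for inputs, targets, length, local_cond, _ in dataset.take(2):
    print(inputs.shape, targets.shape, int(length), local_cond.shape)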
Example #4
    def synthesis(self, c):
        # upsample the local condition so there is one frame per output sample
        c = tf.expand_dims(c, axis=-1)
        c = self.upsample_network(c)
        c = tf.transpose(tf.squeeze(c, axis=-1), perm=[0, 2, 1])  # [B, T, cin]

        batch_size, time_len, _ = c.shape
        # start generation from the mu-law code for silence (zero amplitude)
        initial_value = mulaw_quantize(0, 256)
        inputs = tf.one_hot(indices=initial_value, depth=256, dtype=tf.float32)
        inputs = tf.tile(tf.reshape(inputs, [1, 1, 256]), [batch_size, 1, 1])

        outputs = []
        for i in range(time_len):
            c_t = tf.expand_dims(c[:, i, :], axis=1)

            x = self.first_layer(inputs, is_synthesis=True)

            skips = None
            for block in self.residual_blocks:
                x, h = block.synthesis_feed(x, c_t)

                if skips is not None:
                    skips = skips + h
                else:
                    skips = h

            x = skips
            for layer in self.final_layers:
                x = layer(x, is_synthesis=True)

            x = tf.argmax(tf.squeeze(x, axis=1), axis=-1)
            x = tf.one_hot(x, depth=256)
            # restore the time axis so the next step again receives [batch, 1, 256]
            inputs = tf.expand_dims(x, axis=1)

            outputs.append(tf.argmax(x, axis=1).numpy())

        outputs = np.array(outputs)

        return np.transpose(outputs, [1, 0])
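
To turn the returned class indices back into audio, pass them through the inverse of the 256-way mu-law quantizer used for the initial input (see the inv_mulaw_quantize sketch under Example #1); model, mel, and the 22050 Hz sample rate below are illustrative assumptions.

import numpy as np
import soundfile as sf
import tensorflow as tf

# mel is assumed to be a [batch, num_mels, frames] condition array,
# matching the expand/upsample/transpose at the top of synthesis()
samples = model.synthesis(tf.convert_to_tensor(mel, dtype=tf.float32))
waveform = inv_mulaw_quantize(samples[0], 256)  # indices -> floats in [-1, 1]
sf.write("generated.wav", waveform.astype(np.float32), 22050)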
Example #5
    def incremental_forward(self,
                            c=None,
                            g=None,
                            test_inputs=None,
                            targets=None):
        if g is not None:
            raise NotImplementedError("global conditioning is not supported yet")

        # use zero as the initial input
        inputs = tf.zeros([1, 1], dtype=tf.float32)
        if utils.is_mulaw_quantize(self.hp.input_type):
            inputs = utils.mulaw_quantize(inputs, self.hp.quantize_channels)
            inputs = tf.one_hot(tf.cast(inputs, tf.int32),
                                self.hp.quantize_channels)
        else:
            inputs = tf.expand_dims(inputs, axis=-1)

        # upsample the local condition if an upsampling network is provided
        if c is not None and self.upsample_conv is not None:
            c = tf.expand_dims(c, axis=-1)  # [B T cin_channels 1]
            for transposed_conv in self.upsample_conv:
                c = transposed_conv(c)
            c = tf.squeeze(c, axis=-1)  # [B new_T cin_channels]

        # apply zero padding to condition
        if c is not None:
            c_shape = tf.shape(c)
            padding_c = tf.zeros(
                [c_shape[0], self.receptive_filed, c_shape[-1]])
            c = tf.concat([padding_c, c], axis=1)
            # create c_buffers
            c_buffers = [
                tf.zeros([1, 2**i // 2 + 1, self.hp.cin_channels])
                for i in range(self.hp.n_layers, 0, -1)
            ]

        synthesis_length = tf.shape(c)[1]

        initial_time = tf.constant(0, dtype=tf.int32)

        initial_outputs_ta = tf.TensorArray(dtype=tf.float32,
                                            size=0,
                                            dynamic_size=True)

        input_buffers = [
            self._convert_type(tf.zeros([1, 2**self.hp.n_layers // 2 + 1]))
        ]
        for i in range(self.hp.n_layers - 1, 0, -1):
            input_buffers.append(
                self._convert_type(tf.zeros([1, 2**i // 2 + 1])))

        def condition(time, unused_initial_input, unused_final_outputs,
                      unused_input_buffers, unused_c_buffers):
            return tf.less(time, synthesis_length)

        def body(time, current_inputs, final_outputs, current_input_buffers,
                 current_c_buffers):
            # we need to shift the condition by one step
            current_c = c[:, time:time + 1, :] if c is not None else None

            current_outputs = current_inputs
            new_input_buffers = []
            new_c_buffers = []

            for layer, current_input_buffer, current_c_buffer in zip(
                    self.fft_layers, current_input_buffers, current_c_buffers):
                current_outputs, out_input_buffer, out_c_buffer = layer.incremental_forward(
                    inputs=current_outputs,
                    c=current_c,
                    input_buffers=current_input_buffer,
                    c_buffers=current_c_buffer,
                )
                new_input_buffers.append(out_input_buffer)
                new_c_buffers.append(out_c_buffer)

            current_outputs = self.out_layer(current_outputs)

            posterior = tf.nn.softmax(tf.reshape(current_outputs, [1, -1]),
                                      axis=-1)

            # dist = tf.distributions.Categorical(probs=posterior)
            # sample = tf.cast(dist.sample(), tf.int32)

            sample = tf.py_func(np.random.choice, [
                np.arange(self.hp.quantize_channels), 1, True,
                tf.reshape(posterior, [-1])
            ], tf.int64)
            sample = tf.reshape(sample, [-1])

            # sample = tf.argmax(posterior, axis=-1)

            decode_sample = utils.inv_mulaw_quantize(sample,
                                                     self.hp.quantize_channels)
            final_outputs = final_outputs.write(time, decode_sample)

            if utils.is_mulaw_quantize(self.hp.input_type):
                next_sample = tf.one_hot(tf.cast(sample, tf.int32),
                                         self.hp.quantize_channels)
            else:
                next_sample = decode_sample

            next_time = time + 1
            next_inputs = current_inputs[:, 1:, :]
            if test_inputs is not None:
                next_sample = tf.reshape(test_inputs[:, next_time],
                                         [1, 1, self.in_channels])
            else:
                next_sample = tf.reshape(next_sample, [1, 1, self.in_channels])

            next_inputs = tf.concat(
                [next_inputs, tf.cast(next_sample, tf.float32)], axis=1)

            return next_time, next_inputs, final_outputs, new_input_buffers, new_c_buffers

        result = tf.while_loop(condition,
                               body,
                               loop_vars=[
                                   initial_time, inputs, initial_outputs_ta,
                                   input_buffers, c_buffers
                               ],
                               parallel_iterations=32,
                               swap_memory=True)

        outputs_ta = result[2]
        outputs = outputs_ta.stack()
        self.eval_outputs = outputs
        self.eval_targets = utils.inv_mulaw_quantize(
            targets,
            self.hp.quantize_channels) if targets is not None else None
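
This method builds a TF1-style graph (tf.while_loop, tf.py_func, TensorArray) and stores its result on self.eval_outputs instead of returning it, so driving it looks roughly like the sketch below; the placeholder name, hp handle, and session setup are assumptions (under TF2 the same calls live in tf.compat.v1).

import tensorflow as tf

# hypothetical driver: mel_placeholder feeds the local condition c,
# and mel_np is a NumPy array of mel frames for one utterance
mel_placeholder = tf.placeholder(tf.float32, [1, None, hp.cin_channels])
model.incremental_forward(c=mel_placeholder)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    audio = sess.run(model.eval_outputs, feed_dict={mel_placeholder: mel_np})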