示例#1
0
def initialize_networks(args, device):
    """Create the encoder/decoder/discriminator pair for domains A and B.

    Every network is built from the sizes stored on ``args``, moved to
    ``device`` and reported through ``utils.print_network``.

    Args:
      args: Namespace-like object providing ``in_ngc``, ``out_ngc``,
        ``ngf``, ``in_ndc``, ``out_ndc``, ``ndf`` and ``img_size``.
      device: Target handed to each network's ``.to(...)``.

    Returns:
      The list ``[En_A, En_B, De_A, De_B, Disc_A, Disc_B]``.
    """

    def build_encoder():
        return networks.encoder(in_nc=args.in_ngc,
                                nf=args.ngf,
                                img_size=args.img_size).to(device)

    def build_decoder():
        return networks.decoder(out_nc=args.out_ngc, nf=args.ngf).to(device)

    def build_discriminator():
        return networks.discriminator(in_nc=args.in_ndc,
                                      out_nc=args.out_ndc,
                                      nf=args.ndf,
                                      img_size=args.img_size).to(device)

    # List elements are evaluated left to right, so the networks are still
    # constructed in the order En_A, En_B, De_A, De_B, Disc_A, Disc_B.
    all_networks = [
        build_encoder(),
        build_encoder(),
        build_decoder(),
        build_decoder(),
        build_discriminator(),
        build_discriminator(),
    ]

    print('---------- Networks initialized -------------')
    for net in all_networks:
        utils.print_network(net)
    print('-----------------------------------------------')

    return all_networks
示例#2
0
def generator(speaker_embedding,
              inputs,
              is_training=True,
              scope_name='generator',
              reuse=None):
    '''Generate features conditioned on a speaker embedding.

    Args:
      speaker_embedding: A `Tensor` with type `float32` contains speaker
        information, shaped [N, E] or already expanded to [N, 1, E].
      inputs: A `Tensor` with type `float32` contains speech features.
      is_training: Boolean, whether to train or inference.
      scope_name: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A decoded `Tensor` with aim speaker.
      vae mu vector.
      vae log_var vector.
    '''
    with tf.variable_scope(scope_name, reuse=reuse):
        sample, mu, log_var = encoder(inputs,
                                      is_training=is_training,
                                      scope='vae_encoder')  # [N, T, E]

        # tf.tile needs one multiple per input dimension, so the three
        # multiples below only work on a rank-3 tensor. Insert the time
        # axis when the documented [N, E] form is passed; an embedding that
        # is already [N, 1, E] goes through untouched.
        speaker_embedding = tf.convert_to_tensor(speaker_embedding)
        if speaker_embedding.shape.ndims == 2:
            speaker_embedding = tf.expand_dims(speaker_embedding,
                                               axis=1)  # [N, 1, E]

        # Repeat the embedding along time so it can be joined to every frame.
        speaker_embedding = tf.tile(speaker_embedding,
                                    [1, tf.shape(sample)[1], 1])  # [N, T, E]
        encoded = tf.concat((speaker_embedding, sample),
                            axis=-1)  # [N, T, E+G]
        outputs = decoder(encoded,
                          is_training=is_training,
                          scope='vae_decoder')
        return outputs, mu, log_var  # [N, T, C]
def convert_encoder_to_pb():
    """Freeze the encoder into ``inference/encoder.pb``.

    Builds the encoder on a uint8 image placeholder named ``images``,
    exposes its ``Relu_1_1`` ... ``Relu_5_1`` features as identity ops with
    those same names, maps the ``vgg_19/`` checkpoint scope of
    ``pretrained/vgg_19.ckpt`` onto ``encoder/``, folds the variables into
    constants and writes the serialized graph. The node count of the final
    graph is printed.
    """
    output_names = ['Relu_{}_1'.format(X) for X in range(1, 6)]

    session_config = tf.ConfigProto()
    session_config.gpu_options.visible_device_list = GPU_TO_USE

    export_graph = tf.Graph()
    with export_graph.as_default():
        raw_images = tf.placeholder(dtype=tf.uint8,
                                    shape=[None, None, None, 3],
                                    name='images')
        endpoints = encoder(tf.to_float(raw_images))

        # Give every exported activation a node named after its key.
        for node_name in output_names:
            tf.identity(endpoints[node_name], node_name)

        tf.train.init_from_checkpoint('pretrained/vgg_19.ckpt',
                                      {'vgg_19/': 'encoder/'})

        with tf.Session(graph=export_graph, config=session_config) as sess:
            sess.run(tf.global_variables_initializer())

            # The identity ops above are the only nodes that must survive.
            frozen_graph_def = tf.graph_util.convert_variables_to_constants(
                sess,
                export_graph.as_graph_def(),
                output_node_names=output_names)
            pruned_graph_def = tf.graph_util.remove_training_nodes(
                frozen_graph_def, protected_nodes=output_names)

            with tf.gfile.GFile('inference/encoder.pb', 'wb') as f:
                f.write(pruned_graph_def.SerializeToString())
            print('%d ops in the final graph.' % len(pruned_graph_def.node))
示例#4
0
def train_autoencoder(X_dir, Y_dir, batch_size, dim, X_channels, Y_channels,
                      log_dir, shuffle, **kwargs):
    """Build and fit an image-to-label encoder/decoder model.

    Args:
      X_dir: Passed to ``load_dataset`` as the first directory.
      Y_dir: Passed to ``load_dataset`` as the second directory.
      batch_size: Batch size handed to both data generators.
      dim: Iterable of spatial sizes; it is unpacked in front of
        ``X_channels`` to form the model input shape.
      X_channels: Channel count of the model input.
      Y_channels: Number of output features of the decoder.
      log_dir: Directory given to both TensorBoard callbacks.
      shuffle: Forwarded to the training and validation generators.
      **kwargs: Accepted but not used.
    """
    # Dataset
    partition = partition_dataset(load_dataset(X_dir, Y_dir))

    # Generators (training first, then validation)
    generators = {
        split: DataGenerator(partition[split], batch_size, dim, X_channels,
                             Y_channels, shuffle)
        for split in ('train', 'validation')
    }
    training_generator = generators['train']
    validation_generator = generators['validation']

    # Design model
    input_img = Input(shape=tuple(dim) + (X_channels, ))
    encoder_img = encoder(n_features=8)
    decoder_lbl = decoder(n_output_features=Y_channels, n_features=8)
    # TODO Put res_net between encoder and decoder for image to label
    # translation; for now the latent code is decoded directly.
    restored_lbl = decoder_lbl(encoder_img(input_img))
    img2lbl = Model(input_img, restored_lbl)
    img2lbl.compile(optimizer='adadelta', loss='mean_squared_error')

    # Print summary; the count reported below is len(trainable_weights).
    img2lbl.summary()
    n_trainable = len(img2lbl.trainable_weights)
    print('Model contains a total of %d trainable layers.\n' % n_trainable)

    # Train model
    tbi_callback = TensorBoardImage(log_dir=log_dir,
                                    validation_data=validation_generator)
    tb_callback = TensorBoard(log_dir=log_dir)
    fit_options = dict(generator=training_generator,
                       validation_data=validation_generator,
                       epochs=50,
                       callbacks=[tb_callback, tbi_callback],
                       use_multiprocessing=True,
                       workers=2)
    img2lbl.fit_generator(**fit_options)
def generator(inputs, is_training=True, scope_name='generator', reuse=None):
    """Encode ``inputs`` and decode the sampled latent.

    Args:
      inputs: Forwarded to ``encoder``.
      is_training: Forwarded to both ``encoder`` and ``decoder``.
      scope_name: Name of the enclosing ``variable_scope``.
      reuse: ``reuse`` argument of that ``variable_scope``.

    Returns:
      The tuple ``(outputs, mu, log_var)``: the decoder result and the two
      extra values returned by the encoder.
    """
    with tf.variable_scope(scope_name, reuse=reuse):
        encoded = encoder(inputs,
                          is_training=is_training,
                          scope='vae_encoder')
        sample, mu, log_var = encoded  # sample: [N, T, E]

        # Speaker conditioning is disabled in this variant: the embedding is
        # neither tiled along time (tf.tile repeats a tensor a given number
        # of times along each dimension) nor concatenated to the sample, so
        # the decoder consumes the latent sample directly.
        outputs = decoder(sample, is_training=is_training, scope='vae_decoder')

        return outputs, mu, log_var  # [N, T, C]
def get_encoding_mu_logvar(encoding_image):
    """Computes the encoding_image's style noise parameters.

    The encoder is run inside the "generator" variable scope with
    AUTO_REUSE.

    Args:
      encoding_image: [B, H, W, 4] input RGBD image to run Infinite Nature on
    Returns:
      tuple of tensors ([B, z_dim], [B, z_dim]) corresponding to mu and
      logvar normal parameters.
    """
    generator_scope = tf.compat.v1.variable_scope(
        "generator", reuse=tf.compat.v1.AUTO_REUSE)
    with generator_scope:
        return networks.encoder(encoding_image)
示例#7
0
    def forward_pass(self, inputs, is_training, reuse=False):
        """Run the encoder and then the decoder on ``inputs``.

        Args:
          inputs: Forwarded to ``encoder`` as its ``input``.
          is_training: Forwarded to both networks.
          reuse: Forwarded to both networks.

        Returns:
          ``(enc_z, enc_mean, enc_Sigma, dec_x, dec_mean)``: the three
          encoder results followed by the two decoder results.
        """
        # Options shared by the two network calls.
        shared = dict(reuse=reuse, is_training=is_training)

        encoded = encoder(self.opts,
                          input=inputs,
                          output_dim=2 * self.opts['zdim'],
                          scope='encoder',
                          **shared)
        enc_z, enc_mean, enc_Sigma = encoded

        decoded = decoder(self.opts,
                          input=enc_z,
                          output_dim=self.output_dim,
                          scope='decoder',
                          **shared)
        dec_x, dec_mean = decoded

        return enc_z, enc_mean, enc_Sigma, dec_x, dec_mean
示例#8
0
    def forward_pass(self, inputs, is_training, reuse=False):
        """Performs a full encode / sample / decode pass over the model.

        inputs:                                 [batch,imgdim]
        return:
        enc_cat_logits:                         [batch,K]
        enc_z/enc_gauss_mean/enc_gauss_Sigma:   [batch,K,zdim]
        dec_mean, dec_Sigma:                    [batch,K,imgdim]

        """
        # Options shared by the encoder and decoder calls.
        shared = dict(reuse=reuse, is_training=is_training)

        # Encode
        encoded = encoder(self.opts,
                          input=inputs,
                          cat_output_dim=self.opts['nmixtures'],
                          gaus_output_dim=2 * self.opts['zdim'],
                          scope='encoder',
                          **shared)
        enc_cat_logits, enc_gauss_mean, enc_gauss_Sigma = encoded

        # Split the Gaussian parameters into one row per mixture component.
        per_component = [-1, self.opts['nmixtures'], self.opts['zdim']]
        enc_gauss_mean = tf.reshape(enc_gauss_mean, per_component)
        enc_gauss_Sigma = tf.reshape(enc_gauss_Sigma, per_component)

        # Sample from every component: [batch,nmixtures,zdim]
        enc_z = sample_all_gmm(self.opts, enc_gauss_mean, enc_gauss_Sigma)
        # Components are folded into the leading axis before decoding.
        enc_z_flat = tf.reshape(enc_z, [-1, self.opts['zdim']])

        # Decode
        decoded = decoder(self.opts,
                          input=enc_z_flat,
                          output_dim=self.output_dim,
                          scope='decoder',
                          **shared)
        dec_mean, dec_Sigma = decoded

        # Restore the component axis on the decoder outputs.
        flat_data_dim = np.prod(datashapes[self.opts['dataset']])
        outshape = [-1, self.opts['nmixtures'], flat_data_dim]
        dec_mean = tf.reshape(dec_mean, outshape)
        dec_Sigma = tf.reshape(dec_Sigma, outshape)

        return (enc_cat_logits, enc_z, enc_gauss_mean, enc_gauss_Sigma,
                dec_mean, dec_Sigma)
示例#9
0
    def __init__(self, options):
        """Assemble the sub-networks, losses, optimizers and schedulers.

        Args:
          options: Configuration object. It is passed to the network
            constructors and to ``get_scheduler``; its ``lr`` and ``init``
            attributes are read here.
        """
        super(ArtGAN, self).__init__()

        # Sub-networks
        self.encoder = encoder(options)
        self.decoder = decoder(options)
        self.discriminator = discriminator(options)

        # Every listed discriminator prediction gets a weight of 1.
        self.discriminator_weight = {
            key: 1.
            for key in ("pred_1", "pred_2", "pred_4", "pred_6", "pred_7")
        }

        # Loss criteria, all with mean reduction
        self.loss = nn.BCEWithLogitsLoss(reduction='mean')
        self.mse = nn.MSELoss(reduction='mean')
        self.abs = nn.L1Loss(reduction='mean')

        def build_adam(params):
            # Only parameters that require gradients are optimized.
            trainable = [p for p in params if p.requires_grad]
            return torch.optim.Adam(trainable,
                                    lr=options.lr,
                                    betas=(0.5, 0.999),
                                    weight_decay=0.0001,
                                    amsgrad=True)

        # One optimizer for the discriminator, one for encoder + decoder.
        discriminator_params = list(self.discriminator.parameters())
        generator_params = list(self.encoder.parameters())
        generator_params.extend(self.decoder.parameters())
        self.dis_opt = build_adam(discriminator_params)
        self.gen_opt = build_adam(generator_params)
        self.dis_scheduler = get_scheduler(self.dis_opt, options)
        self.gen_scheduler = get_scheduler(self.gen_opt, options)

        # Network weight initialization: the whole model first, then the
        # 'gaussian' initializer is applied to the discriminator afterwards.
        self.apply(weights_init(options.init))
        self.discriminator.apply(weights_init('gaussian'))

        # Loss values start at zero.
        self.gener_loss = torch.tensor(0.)
        self.discr_loss = torch.tensor(0.)
示例#10
0
    def __init__(self, mode="train"):
        """
        Build the model graph for the requested mode.

        :param mode: "train" reads its inputs from ``get_batch``;
            "synthesize" feeds text and mels through placeholders; any other
            value is handled as evaluation and additionally gets magnitude
            and file-name placeholders. Loss, optimizer and summaries are
            only set up for "train" and "eval".
        """
        print("Loading your model...")

        # Start every training-related attribute at None.
        self.mode = mode
        self.global_step = None
        self.mel_loss = None
        self.mag_loss = None
        self.learning_rate = None
        self.optimizer = None
        self.merged = None
        self.gradients = None
        self.clipped = None
        self.gvs = None
        self.opt_train = None

        # Only the "train" mode counts as training.
        self.is_training = mode == "train"

        print("Loading inputs...")
        if self.is_training:
            (self.txt, self.mels, self.mags, self.file_names,
             self.num_batch) = get_batch()
        else:
            # Text and mel placeholders are common to synthesize and eval.
            self.txt = tf.placeholder(tf.int32, shape=(None, None))
            self.mels = tf.placeholder(tf.float32,
                                       shape=(None, None,
                                              N_MELS * REDUCTION_FACTOR))
            if mode != "synthesize":  # eval
                self.mags = tf.placeholder(tf.float32,
                                           shape=(None, None, 1 + N_FFT // 2))
                self.file_names = tf.placeholder(tf.string, shape=(None, ))

        # Decoder inputs: the mels shifted one step along axis 1 behind an
        # all-zero first frame, keeping only the last N_MELS channels.
        go_frame = tf.zeros_like(self.mels[:, :1, :])
        shifted_mels = tf.concat((go_frame, self.mels[:, :-1, :]), 1)
        self.decoder_inputs = shifted_mels[:, :, -N_MELS:]

        # Networks
        with tf.variable_scope("Networks"):
            print("Loading the encoder...")
            self.memory = encoder(self.txt, is_training=self.is_training)

            print("Loading the decoder...")
            decoded = decoder(self.decoder_inputs,
                              self.memory,
                              is_training=self.is_training)
            self.mel_hat, self.alignments = decoded

            print("Loading the post CBHG module...")
            self.mags_hat = cbhg_helper(self.mel_hat,
                                        N_MELS,
                                        is_training=self.is_training,
                                        post=True)

        print("Audio out")
        # Waveform for the first predicted magnitude spectrogram.
        self.audio_out = tf.py_func(spectrogram2wav, [self.mags_hat[0]],
                                    tf.float32)

        # Training and evaluation only; nothing more to build otherwise.
        if mode not in ("train", "eval"):
            return

        print("Generating Loss...")
        self.loss = self.get_loss()

        print("Getting the optimizer ready...")
        self.optimize()

        print("Setting up your summary...")
        self.summarize()
示例#11
0
    def __init__(self, training=True):
        """Build the encoder/decoder/converter graph in a fresh ``tf.Graph``.

        Args:
          training: If True, inputs come from ``get_batch`` and the losses,
            optimizer and summaries are created. Otherwise fixed-size
            placeholders are used and only the forward networks are built.
        """
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Data Feeding
            ## x: Text. (N, T_x), int32
            ## y1: Reduced melspectrogram. (N, T_y//r, n_mels*r) float32
            ## y2: Reduced dones. (N, T_y//r,) int32
            ## z: Magnitude. (N, T_y, n_fft//2+1) float32
            if not training:  # Evaluation
                self.x = tf.placeholder(tf.int32,
                                        shape=(hp.batch_size, hp.T_x))
                self.y1 = tf.placeholder(tf.float32,
                                         shape=(hp.batch_size, hp.T_y // hp.r,
                                                hp.n_mels * hp.r))
                self.prev_max_attentions = tf.placeholder(
                    tf.int32, shape=(hp.batch_size, ))
            else:
                (self.x, self.y1, self.y2, self.z,
                 self.num_batch) = get_batch()
                self.prev_max_attentions = tf.constant([0] * hp.batch_size)

            # Get decoder inputs: feed last frames only (N, T_y//r, n_mels).
            # The targets are shifted one step behind an all-zero frame.
            go_frame = tf.zeros_like(self.y1[:, :1, -hp.n_mels:])
            shifted_targets = self.y1[:, :-1, -hp.n_mels:]
            self.decoder_input = tf.concat((go_frame, shifted_targets), 1)

            # Networks
            with tf.variable_scope("net"):
                # Encoder. keys: (N, T_x, e), vals: (N, T_x, e)
                encoded = encoder(self.x, training=training, scope="encoder")
                self.keys, self.vals, self.masks = encoded

                # Decoder. mel_output: (N, T_y/r, n_mels*r),
                # done_output: (N, T_y/r, 2), decoder_output: (N, T_y/r, e),
                # alignments: (N, T_y, T_x)
                decoded = decoder(self.decoder_input,
                                  self.keys,
                                  self.vals,
                                  self.masks,
                                  self.prev_max_attentions,
                                  training=training,
                                  scope="decoder",
                                  reuse=None)
                (self.mel_output, self.done_output, self.decoder_output,
                 self.alignments, self.max_attentions) = decoded

                # Restore shape. converter_input: (N, T_y, e/r)
                restored = tf.reshape(self.decoder_output,
                                      (hp.batch_size, hp.T_y, -1))
                self.converter_input = normalize(restored,
                                                 type=hp.norm_type,
                                                 training=training,
                                                 activation_fn=tf.nn.relu)

                # Converter. mag_output: (N, T_y, 1+n_fft//2)
                self.mag_output = converter(self.converter_input,
                                            training=training,
                                            scope="converter")

            if training:
                # Loss: mel L1 + "done" cross entropy + magnitude L1
                mel_error = tf.abs(self.mel_output - self.y1)
                self.loss1_mae = tf.reduce_mean(mel_error)
                done_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.done_output, labels=self.y2)
                self.loss1_ce = tf.reduce_mean(done_xent)
                mag_error = tf.abs(self.mag_output - self.z)
                self.loss2 = tf.reduce_mean(mag_error)
                self.loss = self.loss1_mae + self.loss1_ce + self.loss2

                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

                ## gradient clipping: by value first, then by norm
                self.gvs = self.optimizer.compute_gradients(self.loss)
                self.clipped = [
                    (tf.clip_by_norm(
                        tf.clip_by_value(grad, -1. * hp.max_grad_val,
                                         hp.max_grad_val),
                        hp.max_grad_norm), var) for grad, var in self.gvs
                ]
                self.train_op = self.optimizer.apply_gradients(
                    self.clipped, global_step=self.global_step)

                # Summary
                for tag, value in (('loss', self.loss),
                                   ('loss1_mae', self.loss1_mae),
                                   ('loss1_ce', self.loss1_ce),
                                   ('loss2', self.loss2)):
                    tf.summary.scalar(tag, value)

                self.merged = tf.summary.merge_all()
示例#12
0
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
])
# Data loaders: the training splits use args.batch_size, the test splits
# load one sample per batch.
dataset_root = os.path.join('data', args.dataset)
train_loader_A = utils.data_load(dataset_root, 'trainA', transform,
                                 args.batch_size, shuffle=True,
                                 drop_last=True)
train_loader_B = utils.data_load(dataset_root, 'trainB', transform,
                                 args.batch_size, shuffle=True,
                                 drop_last=True)
test_loader_A = utils.data_load(dataset_root, 'testA', transform, 1,
                                shuffle=True, drop_last=True)
test_loader_B = utils.data_load(dataset_root, 'testB', transform, 1,
                                shuffle=True, drop_last=True)

print('------------ Datasets -------------')
for loader_label, loader in (('TrainA:', train_loader_A),
                             ('TrainB:', train_loader_B),
                             ('TestA:', test_loader_A),
                             ('TestB:', test_loader_B)):
    print(loader_label, len(loader))
print('-------------- End ----------------')

# network: one encoder, decoder and discriminator per domain (A and B)
En_A = networks.encoder(in_nc=args.in_ngc,
                        nf=args.ngf,
                        img_size=args.img_size).to(device)
En_B = networks.encoder(in_nc=args.in_ngc,
                        nf=args.ngf,
                        img_size=args.img_size).to(device)
De_A = networks.decoder(out_nc=args.out_ngc, nf=args.ngf).to(device)
De_B = networks.decoder(out_nc=args.out_ngc, nf=args.ngf).to(device)
Disc_A = networks.discriminator(in_nc=args.in_ndc,
                                out_nc=args.out_ndc,
                                nf=args.ndf,
                                img_size=args.img_size).to(device)
Disc_B = networks.discriminator(in_nc=args.in_ndc,
                                out_nc=args.out_ndc,
                                nf=args.ndf,
                                img_size=args.img_size).to(device)

# Put every network into training mode.
for network in (En_A, En_B, De_A, De_B, Disc_A, Disc_B):
    network.train()

print('---------- Networks initialized -------------')
for network in (En_A, En_B, De_A):
    utils.print_network(network)
示例#13
0
    def __init__(self, mode="train"):
        """Build the text-to-spectrogram graph for the given mode.

        Args:
          mode: "train" reads inputs from ``get_batch``; "eval" uses
            placeholders for text, mels, magnitudes and file names; any
            other value (synthesis) only gets text and mel placeholders.
            Loss, optimizer and summaries are built for "train" and "eval".
        """
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()

        # Set phase
        is_training = mode == "train"

        # Graph
        # Data Feeding
        # x: Text. (N, Tx)
        # y: Reduced melspectrogram. (N, Ty//r, n_mels*r)
        # z: Magnitude. (N, Ty, n_fft//2+1)
        if mode == "train":
            (self.x, self.y, self.z, self.fnames,
             self.num_batch) = get_batch()
        else:
            # Text and mel placeholders are shared by eval and synthesis.
            self.x = tf.placeholder(tf.int32, shape=(None, None))
            self.y = tf.placeholder(tf.float32,
                                    shape=(None, None, hp.n_mels * hp.r))
            if mode == "eval":
                self.z = tf.placeholder(tf.float32,
                                        shape=(None, None, 1 + hp.n_fft // 2))
                self.fnames = tf.placeholder(tf.string, shape=(None, ))

        # Get encoder/decoder inputs
        self.encoder_inputs = embed(self.x, len(hp.vocab),
                                    hp.embed_size)  # (N, T_x, E)
        # Targets shifted one step behind an all-zero frame.
        go_frame = tf.zeros_like(self.y[:, :1, :])
        shifted_targets = tf.concat((go_frame, self.y[:, :-1, :]),
                                    1)  # (N, Ty/r, n_mels*r)
        # feed last frames only (N, Ty/r, n_mels)
        self.decoder_inputs = shifted_targets[:, :, -hp.n_mels:]

        # Networks
        with tf.variable_scope("net"):
            # Encoder
            self.memory = encoder(self.encoder_inputs,
                                  is_training=is_training)  # (N, T_x, E)

            # Decoder1
            decoded = decoder1(self.decoder_inputs,
                               self.memory,
                               is_training=is_training)
            self.y_hat, self.alignments = decoded  # (N, T_y//r, n_mels*r)

            # Decoder2 or postprocessing
            self.z_hat = decoder2(
                self.y_hat,
                is_training=is_training)  # (N, T_y//r, (1+n_fft//2)*r)

        # monitor
        self.audio = tf.py_func(spectrogram2wav, [self.z_hat[0]], tf.float32)

        # Synthesis needs nothing beyond the forward graph.
        if mode not in ("train", "eval"):
            return

        # Loss: L1 on the mels plus L1 on the magnitudes
        self.loss1 = tf.reduce_mean(tf.abs(self.y_hat - self.y))
        self.loss2 = tf.reduce_mean(tf.abs(self.z_hat - self.z))
        self.loss = self.loss1 + self.loss2

        # Training Scheme
        self.global_step = tf.Variable(0,
                                       name='global_step',
                                       trainable=False)
        self.lr = learning_rate_decay(hp.lr, global_step=self.global_step)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

        ## gradient clipping
        self.gvs = self.optimizer.compute_gradients(self.loss)
        self.clipped = [(tf.clip_by_norm(grad, 5.), var)
                        for grad, var in self.gvs]
        self.train_op = self.optimizer.apply_gradients(
            self.clipped, global_step=self.global_step)

        # Summary
        # NOTE(review): loss2 has no scalar summary of its own here.
        for tag, value in (('{}/loss1', self.loss1),
                           ('{}/loss', self.loss),
                           ('{}/lr', self.lr)):
            tf.summary.scalar(tag.format(mode), value)

        for tag, spectrogram in (("{}/mel_gt", self.y),
                                 ("{}/mel_hat", self.y_hat),
                                 ("{}/mag_gt", self.z),
                                 ("{}/mag_hat", self.z_hat)):
            tf.summary.image(tag.format(mode),
                             tf.expand_dims(spectrogram, -1),
                             max_outputs=1)

        tf.summary.audio("{}/sample".format(mode),
                         tf.expand_dims(self.audio, 0), hp.sr)
        self.merged = tf.summary.merge_all()
def model_fn(features, labels, mode, params, config):
    """
    Build the encode / decode / re-encode graph for `tf.estimator`.

    The images are encoded, restored by the decoder and encoded again; the
    loss combines the image reconstruction error, the feature error scaled
    by ``params['lambda']`` and the regularization loss.

    Args:
      features: The input images.
      labels: Not used.
      mode: A `tf.estimator.ModeKeys` value; PREDICT is rejected.
      params: Dict read for 'feature_to_use', 'pretrained_checkpoint',
        'weight_decay', 'lambda', 'initial_learning_rate', 'num_steps' and
        'end_learning_rate'.
      config: Not used.

    Returns:
      A `tf.estimator.EstimatorSpec` carrying eval metrics in EVAL mode or
      the train op in TRAIN mode.
    """
    images = features
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # build the main graph
    layer_name = params['feature_to_use']  # Relu_X_1
    encoding = encoder(images)[layer_name]
    restored_images = decoder(encoding, layer_name)
    encoding_of_restored_images = encoder(restored_images)[layer_name]

    # use a pretrained backbone network
    if is_training:
        with tf.name_scope('init_from_checkpoint'):
            tf.train.init_from_checkpoint(params['pretrained_checkpoint'],
                                          {'vgg_19/': 'encoder/'})

    assert mode != tf.estimator.ModeKeys.PREDICT

    # add L2 regularization
    with tf.name_scope('weight_decay'):
        add_weight_decay(params['weight_decay'])
        regularization_loss = tf.losses.get_regularization_loss()

    # Both L2 losses are divided by 255 times the batch size.
    batch_size = tf.to_float(tf.shape(images)[0])
    normalizer = 255.0 * batch_size
    image_error = tf.nn.l2_loss(images - restored_images)
    reconstruction_loss = image_error / normalizer
    feature_error = tf.nn.l2_loss(encoding - encoding_of_restored_images)
    features_loss = feature_error / normalizer

    tf.losses.add_loss(reconstruction_loss)
    tf.losses.add_loss(params['lambda'] * features_loss)
    for tag, value in (('regularization_loss', regularization_loss),
                       ('reconstruction_loss', reconstruction_loss),
                       ('features_loss', features_loss)):
        tf.summary.scalar(tag, value)
    total_loss = tf.losses.get_total_loss(add_regularization_losses=True)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            'val_reconstruction_loss': tf.metrics.mean(reconstruction_loss),
            'val_features_loss': tf.metrics.mean(features_loss)
        }
        return tf.estimator.EstimatorSpec(mode,
                                          loss=total_loss,
                                          eval_metric_ops=eval_metric_ops)

    assert mode == tf.estimator.ModeKeys.TRAIN

    with tf.variable_scope('learning_rate'):
        global_step = tf.train.get_global_step()
        # power=1.0 makes the polynomial schedule a linear decay
        learning_rate = tf.train.polynomial_decay(
            params['initial_learning_rate'],
            global_step,
            params['num_steps'],
            params['end_learning_rate'],
            power=1.0)
        tf.summary.scalar('learning_rate', learning_rate)

    with tf.variable_scope('optimizer'):
        adam = tf.train.AdamOptimizer(learning_rate, beta1=0.9, beta2=0.999)
        train_op = adam.minimize(total_loss, global_step=global_step)

    return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
示例#15
0
    def __init__(self, mode="train"):
        """Build the text-to-spectrogram graph with a guided attention loss.

        Args:
          mode: "train" reads inputs from ``get_batch``; "eval" uses
            placeholders for text, mels, magnitudes and file names; any
            other value (synthesis) only gets text and mel placeholders.
            Loss, optimizer and summaries are built for "train" and "eval".
        """
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()

        # Set phase
        is_training = mode == "train"

        # Graph
        # Data Feeding
        # x: Text. (N, Tx)
        # y: Reduced melspectrogram. (N, Ty//r, n_mels*r)
        # z: Magnitude. (N, Ty, n_fft//2+1)
        if mode == "train":
            self.x, self.y, self.z, self.fnames, self.num_batch = get_batch()
        elif mode == "eval":
            self.x = tf.placeholder(tf.int32, shape=(None, None))
            self.y = tf.placeholder(tf.float32,
                                    shape=(None, None, hp.n_mels * hp.r))
            self.z = tf.placeholder(tf.float32,
                                    shape=(None, None, 1 + hp.n_fft // 2))
            self.fnames = tf.placeholder(tf.string, shape=(None, ))
        else:  # Synthesize
            self.x = tf.placeholder(tf.int32, shape=(None, None))
            self.y = tf.placeholder(tf.float32,
                                    shape=(None, None, hp.n_mels * hp.r))

        # Get encoder/decoder inputs
        self.encoder_inputs = embed(self.x, len(hp.vocab),
                                    hp.embed_size)  # (N, T_x, E)
        # Targets shifted one step behind an all-zero frame.
        self.decoder_inputs = tf.concat(
            (tf.zeros_like(self.y[:, :1, :]), self.y[:, :-1, :]),
            1)  # (N, Ty/r, n_mels*r)
        # feed last frames only (N, Ty/r, n_mels)
        self.decoder_inputs = self.decoder_inputs[:, :, -hp.n_mels:]

        # Networks
        with tf.variable_scope("net"):
            # Encoder
            self.memory = encoder(self.encoder_inputs,
                                  is_training=is_training)  # (N, T_x, E)

            # Decoder1: the plain decoder when schedule_prob truncates to 1,
            # the scheduled variant otherwise.
            if int(hp.schedule_prob) == 1:
                self.y_hat, self.alignments = decoder1(self.decoder_inputs,
                                                       self.memory,
                                                       is_training=is_training)
            else:
                self.y_hat, self.alignments = decoder1_scheduled(
                    self.decoder_inputs,
                    self.memory,
                    is_training=is_training,
                    schedule=hp.schedule_prob)  # (N, T_y//r, n_mels*r)

            # Guided attention loss: each attention weight is multiplied by
            # W = 1 - exp(-(n/N - t/T)^2 / (2 g^2)), which is zero on the
            # diagonal n/N == t/T, and the products are averaged.
            batch_size, N, T = tf.shape(self.alignments)[0], tf.shape(
                self.alignments)[1], tf.shape(self.alignments)[2]
            g = 0.2
            Ns = tf.tile(tf.expand_dims(tf.range(N) / N, 1),
                         [1, T])  # shape: [N, T]
            Ts = tf.tile(tf.expand_dims(tf.range(T) / T, 0),
                         [N, 1])  # shape: [N, T]
            W = tf.ones([N, T]) - tf.exp(
                -1 * (tf.cast(tf.square(Ns - Ts), tf.float32) /
                      (2 * tf.square(g))))
            nearly_diagonal_constraint = tf.multiply(
                self.alignments,
                tf.tile(tf.expand_dims(W, 0), [batch_size, 1, 1]))
            self.guided_attn_loss = tf.reduce_mean(nearly_diagonal_constraint)

            # Decoder2 or postprocessing
            self.z_hat = decoder2(
                self.y_hat,
                is_training=is_training)  # (N, T_y//r, (1+n_fft//2)*r)

        # monitor
        self.audio_h = tf.py_func(spectrogram2wav, [self.z_hat[0]], tf.float32)
        # The ground-truth waveform feeds the "sample_gt" summary below,
        # which is emitted in eval as well as train, so it must exist in
        # both modes (self.z is defined in both).
        if mode in ("train", "eval"):
            self.audio_gt = tf.py_func(spectrogram2wav, [self.z[0]],
                                       tf.float32)

        if mode in ("train", "eval"):
            # Loss
            self.loss1 = tf.reduce_mean(tf.abs(self.y_hat - self.y))
            self.loss2 = tf.reduce_mean(tf.abs(self.z_hat - self.z))

            if hp.guided_attention:
                self.loss = self.loss1 + self.loss2 + self.guided_attn_loss
            else:
                self.loss = self.loss1 + self.loss2

            # Training Scheme
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)
            self.lr = learning_rate_decay(hp.lr, global_step=self.global_step)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

            ## gradient clipping
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                grad = tf.clip_by_norm(grad, 5.)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            if hp.guided_attention:
                tf.summary.scalar('{}/guided_attention_loss'.format(mode),
                                  self.guided_attn_loss)
            tf.summary.scalar('{}/loss1'.format(mode), self.loss1)
            tf.summary.scalar('{}/loss2'.format(mode), self.loss2)
            tf.summary.scalar('{}/loss'.format(mode), self.loss)
            tf.summary.scalar('{}/lr'.format(mode), self.lr)

            tf.summary.image("{}/mel_gt".format(mode),
                             tf.expand_dims(self.y, -1),
                             max_outputs=1)
            tf.summary.image("{}/mel_hat".format(mode),
                             tf.expand_dims(self.y_hat, -1),
                             max_outputs=1)
            tf.summary.image("{}/mag_gt".format(mode),
                             tf.expand_dims(self.z, -1),
                             max_outputs=1)
            tf.summary.image("{}/mag_hat".format(mode),
                             tf.expand_dims(self.z_hat, -1),
                             max_outputs=1)
            tf.summary.image("{}/attention".format(mode),
                             tf.expand_dims(self.alignments, -1),
                             max_outputs=1)

            tf.summary.audio("{}/sample_hat".format(mode),
                             tf.expand_dims(self.audio_h, 0), hp.sr)
            tf.summary.audio("{}/sample_gt".format(mode),
                             tf.expand_dims(self.audio_gt, 0), hp.sr)
            self.merged = tf.summary.merge_all()
示例#16
0
    def __init__(self, mode="train"):
        '''Build the encoder -> vq -> decoder graph and, in train mode, its losses.

        Args:
          mode: Either "train" or "eval". "train" reads its inputs from
            `get_batch()` and adds the losses, gradients and `train_op`;
            any other value feeds the graph from fixed-shape placeholders.
        '''
        # Set flag
        training = mode == "train"

        # Graph
        # Data Feeding
        ## x: Quantized wav. (B, T, 1) int32
        ## wavs: Raw wav. (B, length) float32
        ## speakers: Speaker ids. (B,). [0, 108]. int32.
        if mode == "train":
            self.x, self.wavs, self.speaker_ids, self.num_batch = get_batch()
            self.y = self.x
        else:  # test
            self.x = tf.placeholder(tf.int32, shape=(2, 63488, 1))
            self.y = tf.placeholder(tf.int32, shape=(2, 63488, 1))
            self.speaker_ids = tf.placeholder(tf.int32, shape=(2,))

        # inputs:
        self.encoder_inputs = tf.to_float(self.x)
        self.decoder_inputs = tf.to_float(self.y)
        # Shift the decoder input one step along axis 1: prepend a zero frame
        # and drop the last frame.
        self.decoder_inputs = tf.concat(
            (tf.zeros_like(self.decoder_inputs[:, :1, :]),
             self.decoder_inputs[:, :-1, :]), 1)

        # speaker embedding
        self.speakers = tf.one_hot(self.speaker_ids, len(hp.speakers))  # (B, len(speakers))

        # encoder
        self.z_e = encoder(self.encoder_inputs)  # (B, T', D)

        # vq
        self.z_q = vq(self.z_e)  # (B, T', D)

        # decoder: y -> reconstructed logits.
        self.y_logits = decoder(self.decoder_inputs, self.speakers, self.z_q)  # (B, T, Q)
        self.y_hat = tf.argmax(self.y_logits, -1)  # (B, T)

        # monitor: decode the first two predicted sequences of the batch
        self.sample0 = tf.py_func(mu_law_decode, [self.y_hat[0]], tf.float32)
        self.sample1 = tf.py_func(mu_law_decode, [self.y_hat[1]], tf.float32)

        # speech samples
        # NOTE: self.wavs is only assigned in train mode, so re-enabling the
        # two "original" summaries below needs a mode guard.
        # tf.summary.audio('{}/original1'.format(mode), self.wavs[:1], hp.sr, 1)
        # tf.summary.audio('{}/original2'.format(mode), self.wavs[1:], hp.sr, 1)
        tf.summary.audio('{}/sample0'.format(mode), tf.expand_dims(self.sample0, 0), hp.sr, 1)
        tf.summary.audio('{}/sample1'.format(mode), tf.expand_dims(self.sample1, 0), hp.sr, 1)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        if training:
            # Squeeze only the trailing channel axis. A bare tf.squeeze would
            # also drop the batch or time axis whenever it has size 1 and
            # break the (B, T) label shape expected by the cross entropy.
            labels = tf.squeeze(self.y, axis=-1)
            self.dec_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.y_logits, labels=labels))
            # stop_gradient on z_e: this term only produces gradients via z_q.
            self.vq_loss = tf.reduce_mean(
                tf.squared_difference(tf.stop_gradient(self.z_e), self.z_q))
            # stop_gradient on z_q: this term only produces gradients via z_e.
            self.enc_loss = hp.beta * tf.reduce_mean(
                tf.squared_difference(self.z_e, tf.stop_gradient(self.z_q)))

            # decoder grads
            decoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "decoder")
            decoder_grads = tf.gradients(self.dec_loss, decoder_vars)
            decoder_grads_vars = list(zip(decoder_grads, decoder_vars))

            # embedding variables grads
            embed_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "vq")
            embed_grads = tf.gradients(self.dec_loss + self.vq_loss, embed_vars)
            embed_grads_vars = list(zip(embed_grads, embed_vars))

            # encoder grads: the gradient of dec_loss w.r.t. z_q is used as the
            # upstream gradient of z_e, then the enc_loss gradient is added.
            encoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoder")
            transferred_grads = tf.gradients(self.dec_loss, self.z_q)
            encoder_grads = [
                tf.gradients(self.z_e, var, transferred_grads)[0]
                + tf.gradients(self.enc_loss, var)[0]
                for var in encoder_vars
            ]
            encoder_grads_vars = list(zip(encoder_grads, encoder_vars))

            # total grads
            self.grads_vars = decoder_grads_vars + embed_grads_vars + encoder_grads_vars

            # Training Scheme
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            # Summary
            tf.summary.scalar('train/dec_loss', self.dec_loss)
            tf.summary.scalar('train/vq_loss', self.vq_loss)
            tf.summary.scalar('train/enc_loss', self.enc_loss)

            # tf.summary.scalar("lr", self.lr)

            # gradient clipping (element-wise to [-1, 1])
            self.clipped = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in self.grads_vars]

            # Run any ops registered in UPDATE_OPS before applying gradients.
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                self.train_op = self.optimizer.apply_gradients(self.clipped, global_step=self.global_step)

        # Summary
        self.merged = tf.summary.merge_all()
示例#17
0
文件: vae.py 项目: katalinic/betaVAE
def build_enc_dec_connection(observation, constants):
    """Wire the encoder to the decoder through a noise-perturbed latent.

    The encoder output is unpacked as ``(mean, logstd)``. The latent is
    ``mean + exp(logstd) * noise`` with ``noise`` drawn by
    ``tf.random_normal`` in the shape of ``mean``, and is fed to the decoder.

    Args:
      observation: Input passed to ``encoder``.
      constants: Second argument passed to ``encoder``; not used elsewhere.

    Returns:
      ``((mean, logstd), decoder_output)``.
    """
    mean, logstd = encoder(observation, constants)
    noise = tf.random_normal(tf.shape(mean))
    latent = mean + tf.exp(logstd) * noise
    return (mean, logstd), decoder(latent)
示例#18
0
    def __init__(self, training=True):
        """Build the encoder / decoder / converter graph in a private tf.Graph.

        Args:
          training: When True, inputs come from `get_batch()` and the losses,
            clipped-gradient `train_op` and merged summaries are added.
            When False, `x`, `y1` and `prev_max_attentions_li` are
            placeholders and no loss or optimizer is built.
        """
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()

        # Graph
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Data Feeding
            ## x: Text. (N, Tx), int32
            ## y1: Melspectrogram. (N, Ty, n_mels) float32
            ## y2: Dones. (N, Ty) int32
            ## z: Magnitude. (N, Ty, n_fft//2+1) float32
            if training:
                self.x, self.y1, self.y2, self.z = get_batch()
                self.prev_max_attentions_li = tf.ones(shape=(hp.dec_layers,
                                                             hp.batch_size),
                                                      dtype=tf.int32)
            else:  # Inference
                self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.Tx))
                self.y1 = tf.placeholder(tf.float32,
                                         shape=(hp.batch_size, hp.Ty // hp.r,
                                                hp.n_mels * hp.r))
                self.prev_max_attentions_li = tf.placeholder(tf.int32,
                                                             shape=(
                                                                 hp.dec_layers,
                                                                 hp.batch_size,
                                                             ))

            # Get decoder inputs: feed last frames only (N, Ty, n_mels).
            # A zero frame is prepended and the final step dropped, so the
            # input is y1 shifted by one step along axis 1.
            self.decoder_input = tf.concat((tf.zeros_like(
                self.y1[:, :1, -hp.n_mels:]), self.y1[:, :-1, -hp.n_mels:]), 1)

            # Networks
            with tf.variable_scope("encoder"):
                self.keys, self.vals = encoder(self.x,
                                               training=training)  # (N, Tx, e)

            with tf.variable_scope("decoder"):
                # mel_logits: (N, Ty, n_mels)
                # done_output: (N, Ty, 2),
                # decoder_output: (N, Ty, e)
                # alignments_li: dec_layers*(Tx, Ty)
                # max_attentions_li: dec_layers*(N, T_y)
                self.mel_logits, self.done_output, self.decoder_output, self.alignments_li, self.max_attentions_li = decoder(
                    self.decoder_input,
                    self.keys,
                    self.vals,
                    self.prev_max_attentions_li,
                    training=training)
                self.mel_output = tf.nn.sigmoid(self.mel_logits)

            with tf.variable_scope("converter"):
                # Restore shape
                self.converter_input = tf.reshape(self.decoder_output,
                                                  (-1, hp.Ty, hp.embed_size))
                self.converter_input = fc_block(
                    self.converter_input,
                    hp.converter_channels,
                    activation_fn=tf.nn.relu,
                    training=training)  # (N, Ty, v)

                # Converter
                self.mag_logits = converter(
                    self.converter_input,
                    training=training)  # (N, Ty, 1+n_fft//2)
                self.mag_output = tf.nn.sigmoid(self.mag_logits)

            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)
            if training:
                # Loss: L1 on mels and magnitudes, cross entropy on dones.
                self.loss_mels = tf.reduce_mean(
                    tf.abs(self.mel_output - self.y1))
                self.loss_dones = tf.reduce_mean(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=self.done_output, labels=self.y2))
                self.loss_mags = tf.reduce_mean(
                    tf.abs(self.mag_output - self.z))
                self.loss = self.loss_mels + self.loss_dones + self.loss_mags

                # Training Scheme
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
                ## gradient clipping
                self.gvs = self.optimizer.compute_gradients(self.loss)
                self.clipped = []
                for grad, var in self.gvs:
                    # compute_gradients reports None for variables the loss
                    # does not depend on; the clip ops reject None, so such
                    # pairs are passed through for apply_gradients to skip.
                    if grad is not None:
                        grad = tf.clip_by_value(grad, -1. * hp.max_grad_val,
                                                hp.max_grad_val)
                        grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                    self.clipped.append((grad, var))
                self.train_op = self.optimizer.apply_gradients(
                    self.clipped, global_step=self.global_step)

                # Summary
                tf.summary.scalar('Train_Loss/LOSS', self.loss)
                tf.summary.scalar('Train_Loss/mels', self.loss_mels)
                tf.summary.scalar('Train_Loss/dones', self.loss_dones)
                tf.summary.scalar('Train_Loss/mags', self.loss_mags)

                self.merged = tf.summary.merge_all()
示例#19
0
    def __init__(self, config=None, training=True):
        """Assemble the encoder, decoder and converter inside `self.graph`.

        Args:
          config: Forwarded to `get_batch` when `training` is True.
          training: True builds the input pipeline, the three losses, the
            clipped-gradient `train_op` and the merged summaries. False
            builds batch-size-1 placeholders for `x`, `y1` and
            `prev_max_attentions_li` instead.
        """
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Data Feeding
            ## x: Text. (N, T_x), int32
            ## y1: Reduced melspectrogram. (N, T_y//r, n_mels*r) float32
            ## y2: Reduced dones. (N, T_y//r,) int32
            ## z: Magnitude. (N, T_y, n_fft//2+1) float32
            if training:
                batch = get_batch(config)
                (self.origx, self.x, self.y1, self.y2, self.y3,
                 self.num_batch) = batch
                self.prev_max_attentions_li = tf.ones(
                    shape=(hp.dec_layers, hp.batch_size), dtype=tf.int32)
            else:  # Evaluation
                self.x = tf.placeholder(tf.int32, shape=(1, hp.T_x))
                self.y1 = tf.placeholder(
                    tf.float32,
                    shape=(1, hp.T_y // hp.r, hp.n_mels * hp.r))
                self.prev_max_attentions_li = tf.placeholder(
                    tf.int32, shape=(hp.dec_layers, 1))

            # Decoder inputs: last n_mels channels only, shifted one step
            # along axis 1 with a zero frame in front.
            go_frame = tf.zeros_like(self.y1[:, :1, -hp.n_mels:])
            previous_frames = self.y1[:, :-1, -hp.n_mels:]
            self.decoder_input = tf.concat((go_frame, previous_frames), 1)

            # Networks
            with tf.variable_scope("encoder"):
                self.keys, self.vals = encoder(self.x, training=training)

            with tf.variable_scope("decoder"):
                decoder_results = decoder(self.decoder_input,
                                          self.keys,
                                          self.vals,
                                          self.prev_max_attentions_li,
                                          training=training)
                (self.mel_logits, self.done_output, self.decoder_output,
                 self.alignments_li, self.max_attentions_li) = decoder_results
                self.mel_output = tf.nn.sigmoid(self.mel_logits)

            with tf.variable_scope("converter"):
                # Restore shape, then project through a ReLU fc block.
                restored_shape = (-1, hp.T_y, hp.embed_size // hp.r)
                self.converter_input = tf.reshape(self.decoder_output,
                                                  restored_shape)
                self.converter_input = fc_block(self.converter_input,
                                                hp.converter_channels,
                                                activation_fn=tf.nn.relu,
                                                training=training)

                # Converter
                self.mag_logits = converter(self.converter_input,
                                            training=training)
                self.mag_output = tf.nn.sigmoid(self.mag_logits)

            self.global_step = tf.Variable(0, name='global_step',
                                           trainable=False)

            if training:
                # Loss: L1 on mel and magnitude outputs, cross entropy on dones.
                self.loss1 = tf.reduce_mean(tf.abs(self.mel_output - self.y1))
                done_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.done_output, labels=self.y2)
                self.loss2 = tf.reduce_mean(done_xent)
                self.loss3 = tf.reduce_mean(tf.abs(self.mag_output - self.y3))
                self.loss = self.loss1 + self.loss2 + self.loss3

                # Training Scheme
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
                self.gvs = self.optimizer.compute_gradients(self.loss)

                def _clip(gradient):
                    # None gradients are passed through untouched.
                    if gradient is None:
                        return gradient
                    gradient = tf.clip_by_value(
                        gradient, -1. * hp.max_grad_val, hp.max_grad_val)
                    return tf.clip_by_norm(gradient, hp.max_grad_norm)

                self.clipped = [(_clip(gradient), variable)
                                for gradient, variable in self.gvs]
                self.train_op = self.optimizer.apply_gradients(
                    self.clipped, global_step=self.global_step)

                # Summary
                histograms = (
                    ('mel_output', self.mel_output),
                    ('mel_actual', self.y1),
                    ('done_output', self.done_output),
                    ('done_actual', self.y2),
                    ('mag_output', self.mag_output),
                    ('mag_actual', self.y3),
                )
                for tag, tensor in histograms:
                    tf.summary.histogram(tag, tensor)

                scalars = (
                    ('loss', self.loss),
                    ('loss1', self.loss1),
                    ('loss2', self.loss2),
                    ('loss3', self.loss3),
                )
                for tag, tensor in scalars:
                    tf.summary.scalar(tag, tensor)

                self.merged = tf.summary.merge_all()
示例#20
0
    def __init__(self, mode="train"):
        """Build the placeholder-fed encoder -> vq -> decoder graph.

        Args:
          mode: "train" adds the three losses, the per-component gradients
            and `train_op`. Any other value skips them and instead passes
            `z_q` through `transposed_conv` under the "decoder" scope.

        A `tf.summary.FileWriter` on `hp.logdir + '/train'` is created in
        every mode.
        """
        training = mode == "train"

        self.x = tf.placeholder(tf.int32, shape=[hp.batch_size, hp.T, 1])
        self.y = self.x
        self.encoder_inputs = tf.one_hot(tf.squeeze(self.x, axis=-1),
                                         hp.Q,
                                         dtype=tf.float32)
        self.speaker_id = tf.placeholder(tf.int32, shape=[
            hp.batch_size,
        ])
        self.speakers = tf.one_hot(self.speaker_id,
                                   len(hp.speakers),
                                   dtype=tf.float32)

        # encoder
        self.z_e = encoder(self.encoder_inputs)  # (B, T', D)

        # vq
        self.z_q = vq(self.z_e)  # (B, T', D)

        # decoder: y -> reconstructed logits.
        self.y_logits = decoder(self.encoder_inputs, self.speakers,
                                self.z_q)  # (B, T-receptivefield+1, Q)
        # monitor
        # self.sample0 = tf.py_func(mu_law_decode, [self.y_hat[0]], tf.float32)
        # self.sample1 = tf.py_func(mu_law_decode, [self.y_hat[1]], tf.float32)

        # speech samples
        # tf.summary.audio('{}/original1'.format(mode), self.wavs[:1], hp.sr, 1)
        # tf.summary.audio('{}/original2'.format(mode), self.wavs[1:], hp.sr, 1)
        # tf.summary.audio('{}/sample0'.format(mode), tf.expand_dims(self.sample0, 0), hp.sr, 1)
        # tf.summary.audio('{}/sample1'.format(mode), tf.expand_dims(self.sample1, 0), hp.sr, 1)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        if training:
            # Drop the first (hp.dilations[-1] * hp.size - 1) time steps of
            # the targets -- presumably so they line up with the shorter
            # decoder output; confirm against the decoder's receptive field.
            self.y = tf.slice(self.y, [0, hp.dilations[-1] * hp.size - 1, 0],
                              [-1, -1, -1])
            self.y = tf.squeeze(self.y, axis=2)
            self.dec_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.y_logits, labels=self.y))
            # stop_gradient on z_e: this term only produces gradients via z_q.
            self.vq_loss = tf.reduce_mean(
                tf.squared_difference(tf.stop_gradient(self.z_e), self.z_q))
            # stop_gradient on z_q: this term only produces gradients via z_e.
            self.enc_loss = hp.beta * tf.reduce_mean(
                tf.squared_difference(self.z_e, tf.stop_gradient(self.z_q)))
            # decoder grads
            decoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             "decoder")
            decoder_grads = tf.gradients(self.dec_loss, decoder_vars)
            decoder_grads_vars = list(zip(decoder_grads, decoder_vars))

            # embedding variables grads
            embed_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           "vq")
            embed_grads = tf.gradients(self.vq_loss, embed_vars)
            embed_grads_vars = list(zip(embed_grads, embed_vars))

            # encoder grads: the gradient of dec_loss w.r.t. z_q is used as the
            # upstream gradient of z_e, then the enc_loss gradient is added.
            encoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             "encoder")
            transferred_grads = tf.gradients(self.dec_loss, self.z_q)
            encoder_grads = [
                tf.gradients(self.z_e, var, transferred_grads)[0] +
                tf.gradients(self.enc_loss, var)[0] for var in encoder_vars
            ]
            encoder_grads_vars = list(zip(encoder_grads, encoder_vars))

            # total grads
            grads_vars = decoder_grads_vars + embed_grads_vars + encoder_grads_vars

            # Training Scheme
            optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)

            # Summary
            tf.summary.scalar('train/dec_loss', self.dec_loss)
            tf.summary.scalar('train/vq_loss', self.vq_loss)
            tf.summary.scalar('train/enc_loss', self.enc_loss)

            # tf.summary.scalar("lr", self.lr)

            # gradient clipping: keep EVERY (grad, var) pair that has a
            # gradient. Rebinding self.clipped inside the loop left only the
            # last pair, so apply_gradients updated a single variable.
            self.clipped = [(tf.clip_by_value(grad, -1., 1.), var)
                            for grad, var in grads_vars
                            if grad is not None]

            # Run any ops registered in UPDATE_OPS before applying gradients.
            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                self.train_op = optimizer.apply_gradients(
                    self.clipped, global_step=self.global_step)

        # Summary
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(hp.logdir + '/train')
        if not training:
            with tf.variable_scope('decoder'):
                self.z_q = transposed_conv(self.z_q)
示例#21
0
    def __init__(self, config=None, training=True, train_form='Both'):
        """Build the sub-networks selected by `train_form` inside `self.graph`.

        Args:
          config: Forwarded to `get_batch` when `training` is True.
          training: True builds the input pipeline, losses, clipped-gradient
            `train_op` and summaries; False builds batch-size-1 placeholders.
          train_form: 'Both' builds encoder/decoder plus a converter fed by
            `mel_output`; 'Encoder' builds encoder/decoder only; 'Converter'
            builds only the converter, fed directly by `y1`.
        """
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()
        self.graph = tf.Graph()
        with self.graph.as_default():
            if training:
                batch = get_batch(config, train_form)
                (self.origx, self.x, self.y1, self.y2, self.y3,
                 self.num_batch) = batch
                self.prev_max_attentions_li = tf.ones(
                    shape=(hp.dec_layers, self.num_batch), dtype=tf.int32)
            else:  # Evaluation
                self.x = tf.placeholder(tf.int32, shape=(1, hp.T_x))
                self.y1 = tf.placeholder(
                    tf.float32,
                    shape=(1, hp.T_y // hp.r, hp.n_mels * hp.r))
                self.prev_max_attentions_li = tf.placeholder(
                    tf.int32, shape=(hp.dec_layers, 1))

            if train_form != 'Converter':
                # Decoder inputs: last n_mels channels only, shifted one step
                # along axis 1 with a zero frame in front.
                go_frame = tf.zeros_like(self.y1[:, :1, -hp.n_mels:])
                previous_frames = self.y1[:, :-1, -hp.n_mels:]
                self.decoder_input = tf.concat((go_frame, previous_frames), 1)

                # Networks
                with tf.variable_scope("encoder"):
                    self.encoded = encoder(self.x, training=training)

                with tf.variable_scope("decoder"):
                    decoder_results = decoder(self.decoder_input,
                                              self.encoded,
                                              self.prev_max_attentions_li,
                                              training=training)
                    (self.mel_logits, self.done_output,
                     self.max_attentions_li) = decoder_results
                    self.mel_output = tf.nn.sigmoid(self.mel_logits)

            if train_form in ('Both', 'Converter'):
                with tf.variable_scope("converter"):
                    # 'Both' converts the predicted mels; 'Converter' is fed
                    # y1 directly.
                    if train_form == 'Both':
                        self.converter_input = self.mel_output
                    else:
                        self.converter_input = self.y1
                    self.mag_logits = converter(self.converter_input,
                                                training=training)
                    self.mag_output = tf.nn.sigmoid(self.mag_logits)

            self.global_step = tf.Variable(0, name='global_step',
                                           trainable=False)

            if training:
                # Loss terms, built only for the sub-networks that exist.
                if train_form != 'Converter':
                    mel_error = tf.abs(self.mel_output - self.y1)
                    self.loss1 = tf.reduce_mean(mel_error)
                    if hp.include_dones:
                        done_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                            logits=self.done_output, labels=self.y2)
                        self.loss2 = tf.reduce_mean(done_xent)
                if train_form != 'Encoder':
                    mag_error = tf.abs(self.mag_output - self.y3)
                    self.loss3 = tf.reduce_mean(mag_error)

                # Total loss for the chosen form.
                if train_form == 'Both':
                    self.loss = (self.loss1 + self.loss2 + self.loss3
                                 if hp.include_dones
                                 else self.loss1 + self.loss3)
                elif train_form == 'Encoder':
                    self.loss = (self.loss1 + self.loss2
                                 if hp.include_dones
                                 else self.loss1)
                else:
                    self.loss = self.loss3

                # Training Scheme
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
                self.gvs = self.optimizer.compute_gradients(self.loss)

                def _clip(gradient):
                    # None gradients are passed through untouched.
                    if gradient is None:
                        return gradient
                    gradient = tf.clip_by_value(
                        gradient, -1. * hp.max_grad_val, hp.max_grad_val)
                    return tf.clip_by_norm(gradient, hp.max_grad_norm)

                self.clipped = [(_clip(gradient), variable)
                                for gradient, variable in self.gvs]
                self.train_op = self.optimizer.apply_gradients(
                    self.clipped, global_step=self.global_step)

                # Summary
                tf.summary.scalar('loss', self.loss)

                if train_form != 'Converter':
                    for tag, tensor in (('mel_output', self.mel_output),
                                        ('mel_actual', self.y1)):
                        tf.summary.histogram(tag, tensor)
                    tf.summary.scalar('loss1', self.loss1)
                    if hp.include_dones:
                        for tag, tensor in (('done_output', self.done_output),
                                            ('done_actual', self.y2)):
                            tf.summary.histogram(tag, tensor)
                        tf.summary.scalar('loss2', self.loss2)
                if train_form != 'Encoder':
                    for tag, tensor in (('mag_output', self.mag_output),
                                        ('mag_actual', self.y3)):
                        tf.summary.histogram(tag, tensor)
                    tf.summary.scalar('loss3', self.loss3)

                self.merged = tf.summary.merge_all()
示例#22
0
    def __init__(self, training=True):
        """Build the encoder / decoder / converter graph in a private tf.Graph.

        Args:
          training: When True, inputs come from `get_batch()` and the losses,
            clipped-gradient `train_op` and merged summaries are added.
            When False, `x`, `y1` and `prev_max_attentions_li` are
            placeholders and no loss or optimizer is built.
        """
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()

        # Graph
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Data Feeding
            ## x: Text. (N, Tx), int32
            ## y1: Reduced melspectrogram. (N, Ty//r, n_mels*r) float32
            ## y2: Reduced dones. (N, Ty//r,) int32
            ## z: Magnitude. (N, Ty, n_fft//2+1) float32
            if training:
                self.x, self.y1, self.y2, self.z, self.num_batch = get_batch()
                self.prev_max_attentions_li = tf.ones(shape=(hp.dec_layers, hp.batch_size), dtype=tf.int32)
            else:  # Inference
                self.x = tf.placeholder(tf.int32, shape=(hp.batch_size, hp.Tx))
                self.y1 = tf.placeholder(tf.float32, shape=(hp.batch_size, hp.Ty//hp.r, hp.n_mels*hp.r))
                self.prev_max_attentions_li = tf.placeholder(tf.int32, shape=(hp.dec_layers, hp.batch_size,))

            # Get decoder inputs: feed last frames only (N, Ty//r, n_mels).
            # A zero frame is prepended and the final step dropped, so the
            # input is y1 shifted by one step along axis 1.
            self.decoder_input = tf.concat((tf.zeros_like(self.y1[:, :1, -hp.n_mels:]), self.y1[:, :-1, -hp.n_mels:]), 1)

            # Networks
            with tf.variable_scope("encoder"):
                self.keys, self.vals = encoder(self.x, training=training)  # (N, Tx, e)

            with tf.variable_scope("decoder"):
                # mel_logits: (N, Ty/r, n_mels*r)
                # done_output: (N, Ty/r, 2),
                # decoder_output: (N, Ty/r, e)
                # alignments_li: dec_layers*(Tx, Ty/r)
                # max_attentions_li: dec_layers*(N, T_y/r)
                self.mel_logits, self.done_output, self.decoder_output, self.alignments_li, self.max_attentions_li \
                    = decoder(self.decoder_input,
                             self.keys,
                             self.vals,
                             self.prev_max_attentions_li,
                             training=training)
                self.mel_output = tf.nn.sigmoid(self.mel_logits)

            with tf.variable_scope("converter"):
                # Restore shape
                self.converter_input = tf.reshape(self.decoder_output, (-1, hp.Ty, hp.embed_size//hp.r))
                self.converter_input = fc_block(self.converter_input,
                                                hp.converter_channels,
                                                activation_fn=tf.nn.relu,
                                                training=training)  # (N, Ty, v)

                # Converter
                self.mag_logits = converter(self.converter_input, training=training)  # (N, Ty, 1+n_fft//2)
                self.mag_output = tf.nn.sigmoid(self.mag_logits)

            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            if training:
                # Loss: L1 on mels and magnitudes, cross entropy on dones.
                self.loss_mels = tf.reduce_mean(tf.abs(self.mel_output - self.y1))
                self.loss_dones = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.done_output, labels=self.y2))
                self.loss_mags = tf.reduce_mean(tf.abs(self.mag_output - self.z))
                self.loss = self.loss_mels + self.loss_dones + self.loss_mags

                # Training Scheme
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
                ## gradient clipping
                self.gvs = self.optimizer.compute_gradients(self.loss)
                self.clipped = []
                for grad, var in self.gvs:
                    # compute_gradients reports None for variables the loss
                    # does not depend on; the clip ops reject None, so such
                    # pairs are passed through for apply_gradients to skip.
                    if grad is not None:
                        grad = tf.clip_by_value(grad, -1. * hp.max_grad_val, hp.max_grad_val)
                        grad = tf.clip_by_norm(grad, hp.max_grad_norm)
                    self.clipped.append((grad, var))
                self.train_op = self.optimizer.apply_gradients(self.clipped, global_step=self.global_step)

                # Summary
                tf.summary.scalar('Train_Loss/LOSS', self.loss)
                tf.summary.scalar('Train_Loss/mels', self.loss_mels)
                tf.summary.scalar('Train_Loss/dones', self.loss_dones)
                tf.summary.scalar('Train_Loss/mags', self.loss_mags)

                self.merged = tf.summary.merge_all()
示例#23
0
    def __init__(self, mode="train"):
        """Build the model graph for the requested mode.

        Args:
          mode: "train" takes its inputs from `get_batch()`. "eval" creates
            placeholders for text, mel, magnitude and file names. Any other
            value is treated as synthesis and creates placeholders for text
            and mel only.

        The loss, optimizer, `train_op` and summaries are only created when
        `mode` is "train" or "eval".
        """
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()

        # Set phase
        is_training = mode == "train"

        # Graph
        # Data Feeding
        # x: Text. (N, Tx)
        # y: Reduced melspectrogram. (N, Ty//r, n_mels*r)
        # z: Magnitude. (N, Ty, n_fft//2+1)
        if mode == "train":
            self.x, self.y, self.z, self.fnames, self.num_batch = get_batch()
        elif mode == "eval":
            self.x = tf.placeholder(tf.int32, shape=(None, None))
            self.y = tf.placeholder(tf.float32,
                                    shape=(None, None, hp.n_mels * hp.r))
            self.z = tf.placeholder(tf.float32,
                                    shape=(None, None, 1 + hp.n_fft // 2))
            self.fnames = tf.placeholder(tf.string, shape=(None, ))
        else:  # Synthesize
            # No magnitude target here: self.z is never defined in this mode.
            self.x = tf.placeholder(tf.int32, shape=(None, None))
            self.y = tf.placeholder(tf.float32,
                                    shape=(None, None, hp.n_mels * hp.r))

        # Get encoder/decoder inputs
        self.encoder_inputs = embed(self.x, len(hp.vocab),
                                    hp.embed_size)  # (N, T_x, E)
        # Shift the targets right by one step, with a zero frame in front.
        self.decoder_inputs = tf.concat(
            (tf.zeros_like(self.y[:, :1, :]), self.y[:, :-1, :]),
            1)  # (N, Ty/r, n_mels*r)
        # feed last frames only (N, Ty/r, n_mels)
        self.decoder_inputs = self.decoder_inputs[:, :, -hp.n_mels:]

        # Networks
        with tf.variable_scope("net"):
            # Encoder
            self.memory = encoder(self.encoder_inputs,
                                  is_training=is_training)  # (N, T_x, E)

            # Decoder1
            self.y_hat, self.alignments = decoder1(
                self.decoder_inputs, self.memory,
                is_training=is_training)  # (N, T_y//r, n_mels*r)
            # Decoder2 or postprocessing
            self.z_hat = decoder2(
                self.y_hat,
                is_training=is_training)  # (N, T_y//r, (1+n_fft//2)*r)

        # monitor: run spectrogram2wav on the first item of the predicted
        # magnitudes so it can be written out as an audio summary.
        self.audio = tf.py_func(spectrogram2wav, [self.z_hat[0]], tf.float32)

        if mode in ("train", "eval"):
            # Loss: mean absolute error on mel (loss1) and magnitude (loss2).
            self.loss1 = tf.reduce_mean(tf.abs(self.y_hat - self.y))
            self.loss2 = tf.reduce_mean(tf.abs(self.z_hat - self.z))
            self.loss = self.loss1 + self.loss2

            # Training Scheme
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)
            self.lr = learning_rate_decay(hp.lr, global_step=self.global_step)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

            ## gradient clipping
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                # compute_gradients yields None for variables the loss does
                # not depend on; clip_by_norm cannot convert None to a tensor,
                # so pass such pairs through and let apply_gradients skip them.
                if grad is not None:
                    grad = tf.clip_by_norm(grad, 5.)
                self.clipped.append((grad, var))
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            tf.summary.scalar('{}/loss1'.format(mode), self.loss1)
            tf.summary.scalar('{}/loss'.format(mode), self.loss)
            tf.summary.scalar('{}/lr'.format(mode), self.lr)

            tf.summary.image("{}/mel_gt".format(mode),
                             tf.expand_dims(self.y, -1),
                             max_outputs=1)
            tf.summary.image("{}/mel_hat".format(mode),
                             tf.expand_dims(self.y_hat, -1),
                             max_outputs=1)
            tf.summary.image("{}/mag_gt".format(mode),
                             tf.expand_dims(self.z, -1),
                             max_outputs=1)
            tf.summary.image("{}/mag_hat".format(mode),
                             tf.expand_dims(self.z_hat, -1),
                             max_outputs=1)

            tf.summary.audio("{}/sample".format(mode),
                             tf.expand_dims(self.audio, 0), hp.sr)
            self.merged = tf.summary.merge_all()
示例#24
0
        features_quickdraw_dict = feature_gen(
            model=model_sketchy,
            loader=loader_quick_draw,
            dump_location=params.path_quickdraw_features)

    else:
        features_quickdraw_dict = pickle_load(params.path_quickdraw_features)
        print('quickdraw features file found. Loading completed')

    ## Generate a file containing glove vector corresponding to the classes being used.
    if (os.path.isfile(params.path_glove_vector) == False):
        generate_glove_vector()

    ## Load the z_encoder for both image and sketch. If saved model are not found, train and saved the model.
    if (not os.path.isfile(params.path_z_encoder_image)):
        z_encoder_image = encoder(in_dim=params.x_dim, z_dim=params.glove_dim)
        cuda(z_encoder_image)
        z_encoder_image = train_z_encoder(
            encoder_model=z_encoder_image,
            feature_dict=features_image_dict,
            dump_location=params.path_z_encoder_image)

    else:
        z_encoder_image = torch.load(params.path_z_encoder_image)
        cuda(z_encoder_image)

    if (not os.path.isfile(params.path_z_encoder_sketchy)):
        z_encoder_sketchy = encoder(in_dim=params.x_dim,
                                    z_dim=params.glove_dim)
        cuda(z_encoder_sketchy)
        z_encoder_sketchy = train_z_encoder(