Example #1
    def __init__(self, is_training):
        # Placeholders
        self.word = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_word')
        self.word_vec = tf.placeholder(dtype=tf.float32, shape=[None, FLAGS.word_size], name='word_vec')
        self.pos1 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos1')
        self.pos2 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos2')
        self.length = tf.placeholder(dtype=tf.int32, shape=[None], name='input_length')
        self.mask = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_mask')
        self.label = tf.placeholder(dtype=tf.int32, shape=[None], name='label')
        self.label_for_select = tf.placeholder(dtype=tf.int32, shape=[None], name='label_for_select')
        self.scope = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size + 1], name='scope')    
        self.weights = tf.placeholder(dtype=tf.float32, shape=[FLAGS.batch_size])

        # Network
        self.embedding = Embedding(is_training, self.word_vec, self.word, self.pos1, self.pos2)
        self.encoder = Encoder(is_training, length=self.length, mask=self.mask)
        self.selector = Selector(is_training, self.scope, self.label_for_select)
        self.classifier = Classifier(is_training, self.label, self.weights)

        # Metrics 
        self.acc_NA = Accuracy()
        self.acc_not_NA = Accuracy()
        self.acc_total = Accuracy()
        self.step = 0
        
        # Session
        self.sess = None
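
The constructor above only builds the graph; a minimal sketch of feeding these placeholders in a TF1 session follows. Here `model`, `train_op`, and the `batch_*` arrays are assumptions standing in for what the repo's training loop provides.

import tensorflow as tf

# Minimal sketch, assuming TF1 graph execution and a `model` instance of the
# class above; the batch_* arrays come from the repo's data loader.
feed_dict = {
    model.word: batch_word,                          # int32 [batch, max_length]
    model.word_vec: batch_word_vec,                  # float32 [vocab, word_size]
    model.pos1: batch_pos1,
    model.pos2: batch_pos2,
    model.length: batch_length,                      # int32 [batch]
    model.mask: batch_mask,
    model.label: batch_label,
    model.label_for_select: batch_label_for_select,
    model.scope: batch_scope,                        # int32 [batch_size + 1]
    model.weights: batch_weights,                    # float32 [batch_size]
}
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # sess.run(train_op, feed_dict=feed_dict)  # train_op is built elsewhere in the repo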
Example #2
    def __init__(self, is_training, use_bag=True):
        self.use_bag = use_bag
        # Placeholders
        self.word = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_word')
        #self.word_vec = tf.placeholder(dtype=tf.float32, shape=[None, FLAGS.word_size], name='word_vec')
        self.pos1 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos1')
        self.pos2 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos2')
        self.length = tf.placeholder(dtype=tf.int32, shape=[None], name='input_length')
        self.mask = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_mask')
        self.label = tf.placeholder(dtype=tf.int32, shape=[None], name='label')
        self.label_for_select = tf.placeholder(dtype=tf.int32, shape=[None], name='label_for_select')
        self.scope = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size + 1], name='scope')    
        self.weights = tf.placeholder(dtype=tf.float32, shape=[FLAGS.batch_size])

        self.data_word_vec = np.load(os.path.join(FLAGS.export_path, 'vec.npy'))

        # Network
        self.embedding = Embedding(is_training, self.data_word_vec, self.word, self.pos1, self.pos2)
        self.encoder = Encoder(is_training, FLAGS.drop_prob)
        self.selector = Selector(FLAGS.num_classes, is_training, FLAGS.drop_prob)
        self.classifier = Classifier(is_training, self.label, self.weights)

        # Metrics 
        self.acc_NA = Accuracy()
        self.acc_not_NA = Accuracy()
        self.acc_total = Accuracy()
        self.step = 0
        
        # Session
        self.sess = None
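
This variant loads the pretrained word vectors from vec.npy rather than feeding them through a placeholder. A minimal sketch of producing such a file; the shape, path, and random data are purely illustrative.

import os
import numpy as np

# Hypothetical export: vec.npy holds a float32 matrix of pretrained word
# vectors with shape [vocab_size, FLAGS.word_size].
word_vec_matrix = np.random.randn(40000, 50).astype(np.float32)
np.save(os.path.join('data', 'vec.npy'), word_vec_matrix)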
Example #3
	def __init__(self, is_training):
		# Placeholders

		self.word = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_word')
		self.pos1 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos1')
		self.pos2 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos2')
		self.length = tf.placeholder(dtype=tf.int32, shape=[None], name='input_length')
		self.mask = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_mask')
		self.label = tf.placeholder(dtype=tf.int32, shape=[None], name='label')
		self.label_for_select = tf.placeholder(dtype=tf.int32, shape=[None], name='label_for_select')
		self.scope = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size + 1], name='scope')
		self.weights = tf.placeholder(dtype=tf.float32, shape=[None])
		self.data_word_vec = np.load(os.path.join(FLAGS.export_path, 'vec.npy'))
		self.reward_holder = tf.placeholder(dtype=tf.float32, shape=[None], name='reward')
		self.action_holder = tf.placeholder(dtype=tf.int32, shape=[None], name='action')

		# Network
		self.embedding = Embedding(is_training, self.data_word_vec, self.word, self.pos1, self.pos2)
		self.encoder = Encoder(is_training, FLAGS.drop_prob)
		self.selector = Selector(FLAGS.num_classes, is_training, FLAGS.drop_prob)
		self.classifier = Classifier(is_training, self.label, self.weights)
		# Compute
		self.word_embedding = self.embedding.word_embedding()
		self.pos_embedding = self.embedding.pos_embedding()
		self.embedding = self.embedding.concat_embedding(self.word_embedding, self.pos_embedding)
		self.x = self.encoder.cnn(self.embedding, FLAGS.hidden_size, self.mask, activation=tf.nn.relu)
		self.logit, self.repre = self.selector.no_bag(self.x)

		self.outputvalue = self.classifier.outputvalue(self.logit)
		self.output = self.classifier.output(self.logit)
		self.softmax_outputs = tf.nn.softmax(self.logit)

		self.action_onehot = tf.one_hot(indices=self.action_holder, depth=FLAGS.num_classes, dtype=tf.float32)
		self.step = tf.multiply(self.softmax_outputs, self.action_onehot)
		self.action_outputvalue = tf.reduce_sum(self.step, axis=1)
		self.temp = tf.log(self.action_outputvalue) * self.reward_holder
		self.loss = -tf.reduce_mean(tf.log(self.action_outputvalue) * self.reward_holder)  # the tensor inside reduce_mean holds the per-sentence losses (188 sentences)
		self.loss_pre = self.classifier.softmax_cross_entropy(self.logit)

		self.optimizer_pre = tf.train.GradientDescentOptimizer(0.5)
		self.grads_and_vars = self.optimizer_pre.compute_gradients(self.loss_pre)
		self.train_op = self.optimizer_pre.apply_gradients(self.grads_and_vars)
		self.optimizer = tf.train.AdamOptimizer(0.0001)
		self.train_op_rl = self.optimizer.minimize(self.loss)
		self.tvars = tf.trainable_variables()
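
Here train_op minimizes a pretraining cross-entropy while train_op_rl applies a REINFORCE-style policy-gradient update. A minimal sketch of one RL step; `sess`, `model`, and the sampled reward/action arrays are assumptions supplied by the surrounding agent code.

# Minimal sketch, assuming a TF1 session and sampled actions/rewards.
feed_dict = {
    model.reward_holder: rewards,   # float32 [num_sentences]
    model.action_holder: actions,   # int32  [num_sentences]
    # ...plus the word/pos1/pos2/mask/length feeds used by the forward pass
}
_, rl_loss = sess.run([model.train_op_rl, model.loss], feed_dict=feed_dict)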
Example #4

    def __init__(self, theta_encoder_len=1, lead_num=1):
        super(Model_nefnet, self).__init__()

        self.lead_num = lead_num

        self.W_encoder = Encoder(backbone='resnet34',
                                 in_channel=lead_num,
                                 use_first_pool=True,
                                 lead_num=lead_num,
                                 init_channels=128)
        self.theta_encoder = ThetaEncoder(encoder_len=theta_encoder_len)

        self.mlp1 = nn.Linear((2 * theta_encoder_len + 1) * 4, 128)
        self.mlp2 = nn.Linear((2 * theta_encoder_len + 1) * 4, 256 * 1)

        self.w_feature_extractor = nn.Sequential(nn.Conv1d(128, 128, 3, 1, 1),
                                                 nn.ReLU(inplace=True))

        self.w_conv = nn.Sequential(
            BasicBlock(128 * self.lead_num,
                       128 * self.lead_num,
                       1,
                       groups=self.lead_num))

        self.z1_conv = nn.Sequential(
            BasicBlock(64 * self.lead_num,
                       128 * self.lead_num,
                       1,
                       groups=self.lead_num))
        self.z2_conv1 = nn.Sequential(
            BasicBlock(64 * self.lead_num,
                       128 * self.lead_num,
                       1,
                       groups=self.lead_num))
        self.z2_conv2 = nn.Sequential(
            BasicBlock(128 * 7 * self.lead_num,
                       128 * 7 * self.lead_num,
                       1,
                       groups=self.lead_num * 7),
            nn.ConvTranspose1d(128 * 7 * self.lead_num,
                               128 * 7 * self.lead_num // 2,
                               kernel_size=2,
                               stride=2,
                               groups=self.lead_num * 7),
            BasicBlock(128 * 7 * self.lead_num // 2,
                       128 * 7 * self.lead_num,
                       1,
                       groups=self.lead_num * 7),
        )

        self.decoder = nn.Sequential(
            nn.Upsample(scale_factor=2, mode='linear', align_corners=False),
            DoubleConv(256 * 1, 128),
            nn.Upsample(scale_factor=2, mode='linear', align_corners=False),
            DoubleConv(128, 64), nn.Conv1d(64, 1, 3, padding=1))
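
The groups=self.lead_num arguments keep each input lead in its own channel group, so leads are processed independently by shared-shape convolutions. A self-contained sketch of that pattern with plain torch.nn; the lead count and tensor sizes are illustrative.

import torch
import torch.nn as nn

# Grouped 1-D convolution: with groups=lead_num, channels belonging to
# different leads never mix, which is the pattern the *_conv blocks above use.
lead_num = 12                                          # hypothetical lead count
conv = nn.Conv1d(128 * lead_num, 128 * lead_num, kernel_size=1, groups=lead_num)
x = torch.randn(2, 128 * lead_num, 64)                 # (batch, channels, time)
print(conv(x).shape)                                   # torch.Size([2, 1536, 64])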
Example #5
    def __init__(self, dataset_path, max_size):

        self.lang_code = dataset_path.split('/')[-1].split('.')[0]

        self.dataset, self.dataset_validation, self.inp_tokenizer, self.out_tokenizer = load_data(
            dataset_path, max_size)

        self.encoder = Encoder(data.vocab_inp_size, data.embedding_dim,
                               data.units, data.batch_size)
        self.decoder = Decoder(data.vocab_tar_size, data.embedding_dim,
                               data.units, data.batch_size)
        self.optimizer = tf.keras.optimizers.Adam()

        self.checkpoint_dir = './training_checkpoints_' + self.lang_code
        self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "ckpt")
        checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                         encoder=self.encoder,
                                         decoder=self.decoder)
        self.checkpoint = checkpoint
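
A minimal sketch of the save/restore cycle for the tf.train.Checkpoint built above; `trainer` is a hypothetical instance of this class.

import tensorflow as tf

# Save after an epoch, then restore the latest checkpoint later.
trainer.checkpoint.save(file_prefix=trainer.checkpoint_prefix)
trainer.checkpoint.restore(tf.train.latest_checkpoint(trainer.checkpoint_dir))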
Example #6
    def __init__(self, is_training, use_bag=True):
        self.use_bag = use_bag
        self.is_training = is_training
        # Placeholders
        self.word = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_word')
        #self.word_vec = tf.placeholder(dtype=tf.float32, shape=[None, FLAGS.word_size], name='word_vec')
        self.pos1 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos1')
        self.pos2 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos2')
        self.length = tf.placeholder(dtype=tf.int32, shape=[None], name='input_length')
        self.mask = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_mask')
        self.label = tf.placeholder(dtype=tf.int32, shape=[None], name='label')
        self.label_for_select = tf.placeholder(dtype=tf.int32, shape=[None], name='label_for_select')
        self.scope = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size + 1], name='scope')
        self.weights = tf.placeholder(dtype=tf.float32, shape=[FLAGS.batch_size])
        self.data_word_vec = np.load(os.path.join(FLAGS.export_path, 'vec.npy'))
        # GCN
        self.ent2id = tf.placeholder(dtype=tf.int32, name='ent2id')
        self.features = tf.sparse_placeholder(dtype=tf.float32, name='kg_features')
        adj_name = ['h2r_adj', 'r2t_adj', 'self_adj']
        self.supports = [tf.sparse_placeholder(dtype=tf.float32, name=adj_name[i]) for i in range(3)]
        self.gcn_dims = [100, 85, 70, 53]
        self.num_features_nonzero = tf.placeholder(tf.int32)

        # Network
        self.embedding = Embedding(is_training, self.data_word_vec, self.word, self.pos1, self.pos2)
        self.encoder = Encoder(is_training, FLAGS.drop_prob)
        self.gcn = GCN(is_training, FLAGS.gcn_drop_prob, FLAGS.num_classes, self.gcn_dims)
        self.selector = Selector(FLAGS.num_classes, is_training, FLAGS.drop_prob)
        self.classifier = Classifier(is_training, self.label, self.weights)


        # Metrics
        self.acc_NA = Accuracy()
        self.acc_not_NA = Accuracy()
        self.acc_total = Accuracy()
        self.step = 0

        # Session
        self.sess = None
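
Unlike the dense placeholders, tf.sparse_placeholder must be fed a tf.SparseTensorValue rather than a plain array. A minimal sketch for one of the GCN adjacency feeds; the indices, values, and shape are illustrative.

import numpy as np
import tensorflow as tf

# A tiny sparse adjacency matrix in (indices, values, dense_shape) form.
adj = tf.SparseTensorValue(
    indices=np.array([[0, 1], [1, 2]], dtype=np.int64),
    values=np.array([1.0, 1.0], dtype=np.float32),
    dense_shape=np.array([3, 3], dtype=np.int64))
feed_dict = {model.supports[0]: adj}   # likewise for r2t_adj and self_adj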
Example #7
    def __init__(self, args):
        super(ContrastiveEncoder, self).__init__()

        self.representation_layer = args.representation_layer
        self.encoder = Encoder(args, *ENCODER_PARAMS)
        if self.encoder.multiscale:
            self.skip_1 = nn.Linear(ENCODER_FILTERS[0], ENCODER_FILTERS[-1])
            self.skip_2 = nn.Linear(ENCODER_FILTERS[1], ENCODER_FILTERS[-1])

        p_in_dim = self.encoder.out_dim
        p_mid_dim = int(self.encoder.out_dim)

        self.projection = nn.ModuleList([
            nn.Sequential(nn.Linear(p_in_dim, p_mid_dim), nn.ReLU()),
            nn.Linear(p_mid_dim, LATENT_SIZE)
        ])

        # what will be used for downstream
        if self.representation_layer == 0:
            self.latent_size = p_in_dim
        elif self.representation_layer == 1:
            self.latent_size = p_mid_dim
        else:
            raise ValueError('representation_layer must be 0 or 1')
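
The two entries in self.projection form a SimCLR-style projection head: applying them in order yields the contrastive embedding, while representation_layer picks which intermediate feature is reused downstream. A self-contained sketch; the dimensions are illustrative.

import torch
import torch.nn as nn

P_IN, P_MID, LATENT_SIZE = 512, 512, 128   # hypothetical dimensions
projection = nn.ModuleList([
    nn.Sequential(nn.Linear(P_IN, P_MID), nn.ReLU()),
    nn.Linear(P_MID, LATENT_SIZE),
])
h = torch.randn(4, P_IN)     # encoder output (representation_layer == 0)
mid = projection[0](h)       # hidden feature (representation_layer == 1)
z = projection[1](mid)       # contrastive embedding
print(z.shape)               # torch.Size([4, 128])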
Example #8
    def __init__(self, is_training):
        # Placeholders

        self.word = tf.placeholder(dtype=tf.int32,
                                   shape=[None, FLAGS.max_length],
                                   name='input_word')
        self.pos1 = tf.placeholder(dtype=tf.int32,
                                   shape=[None, FLAGS.max_length],
                                   name='input_pos1')
        self.pos2 = tf.placeholder(dtype=tf.int32,
                                   shape=[None, FLAGS.max_length],
                                   name='input_pos2')
        self.length = tf.placeholder(dtype=tf.int32,
                                     shape=[None],
                                     name='input_length')
        self.mask = tf.placeholder(dtype=tf.int32,
                                   shape=[None, FLAGS.max_length],
                                   name='input_mask')
        self.scope = tf.placeholder(dtype=tf.int32,
                                    shape=[FLAGS.batch_size + 1],
                                    name='scope')
        self.bag_label = tf.placeholder(dtype=tf.int32,
                                        shape=[None],
                                        name='bag_label')
        self.sentence_label = tf.placeholder(dtype=tf.int32,
                                             shape=[None],
                                             name='sentence_label')
        self.label_weights = tf.placeholder(dtype=tf.float32,
                                            shape=[FLAGS.batch_size])
        self.data_word_vec = np.load(os.path.join(FLAGS.export_path,
                                                  'vec.npy'))

        # Network
        self.embedding = Embedding(is_training, self.data_word_vec, self.word,
                                   self.pos1, self.pos2)
        self.encoder = Encoder(is_training, FLAGS.drop_prob)
        self.selector = Selector(FLAGS.num_classes, is_training,
                                 FLAGS.drop_prob)
        self.classifier = Classifier(is_training, self.bag_label,
                                     self.label_weights)
        # Compute
        self.word_embedding = self.embedding.word_embedding()
        self.pos_embedding = self.embedding.pos_embedding()
        self.embedding = self.embedding.concat_embedding(
            self.word_embedding, self.pos_embedding)
        self.x = self.encoder.pcnn(self.embedding,
                                   FLAGS.hidden_size,
                                   self.mask,
                                   activation=tf.nn.relu)
        self.logit, self.repre = self.selector.attention(
            self.x, self.scope, self.sentence_label)

        # Used to judge whether ds or the selected labels are better; not part of the optimization
        self.label_onehot = tf.one_hot(indices=self.bag_label,
                                       depth=FLAGS.num_classes,
                                       dtype=tf.float32)
        self.bag_loss_temp = tf.nn.softmax_cross_entropy_with_logits(
            labels=self.label_onehot, logits=self.logit)
        self.bag_loss = tf.reshape(self.bag_loss_temp, [1, -1])
        self.loss_mean = tf.reduce_mean(self.bag_loss)
        # Compute the reward
        self.softmax_output = tf.nn.softmax(self.logit)
        self.reward = tf.log(
            tf.reduce_sum(self.label_onehot * self.softmax_output, axis=1))
        # self.loss_mine = -tf.reduce_mean(self.reward, axis=0)  # same as the loss below, just without the weights
        # Gradient-descent optimization
        self.loss = self.classifier.softmax_cross_entropy(self.logit)
        #self.loss_one = self.classifier.softmax_cross_entropy(self.logit_one)
        self.output = self.classifier.output(self.logit)
        #self.output_one = self.classifier.output(self.logit_one)
        self.outputvalue = self.classifier.outputvalue(self.logit)
        self.test_output = tf.argmax(self.logit, 1)  # which relation is predicted
        self.test_outputvalue = tf.reduce_max(self.logit, axis=1)  # score of the predicted relation
        # Optimizer
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        tf.summary.scalar('learning_rate', FLAGS.learning_rate)
        self.optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
        self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
        self.train_op = self.optimizer.apply_gradients(
            self.grads_and_vars, global_step=self.global_step)
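
A minimal sketch of one gradient step on the bag-level cross-entropy defined above, assuming a TF1 session; the `batch_*` arrays are assumptions standing in for the repo's data loader.

feed_dict = {
    model.word: batch_word, model.pos1: batch_pos1, model.pos2: batch_pos2,
    model.length: batch_length, model.mask: batch_mask,
    model.scope: batch_scope, model.bag_label: batch_bag_label,
    model.sentence_label: batch_sentence_label,
    model.label_weights: batch_label_weights,
}
_, step, loss = sess.run(
    [model.train_op, model.global_step, model.loss], feed_dict=feed_dict)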
Example #9
    def __init__(
        self,
        idim: int,
        odim: int,
        adim: int = 384,
        aheads: int = 4,
        positionwise_layer_type: str = "linear",
        positionwise_conv_kernel_size: int = 3,
        # encoder
        eunits: int = 1536,
        elayers: int = 6,
        transformer_enc_dropout_rate: float = 0.0,
        transformer_enc_positional_dropout_rate: float = 0.0,
        transformer_enc_atten_dropout_rate: float = 0.0,
        encoder_normalized_before: bool = True,
        encoder_concat_after: bool = False,
        # variance
        pitch_embed_kernel_size: int = 1,
        pitch_embed_dropout: float = 0.0,
        energy_embed_kernel_size: int = 1,
        energy_embed_dropout: float = 0.0,
        duration_predictor_layers: int = 2,
        duration_predictor_chans: int = 256,
        duration_predictor_kernel_size: int = 3,
        duration_predictor_dropout_rate: float = 0.1,
        # decoder
        dlayers: int = 6,
        dunits: int = 1536,
        transformer_dec_dropout_rate: float = 0.1,
        transformer_dec_positional_dropout_rate: float = 0.1,
        transformer_dec_atten_dropout_rate: float = 0.1,
        decoder_normalized_before: bool = False,
        decoder_concat_after: bool = False,
        reduction_factor: int = 1,
        # postnet
        postnet_layers: int = 5,
        postnet_filts: int = 5,
        postnet_chans: int = 256,
        postnet_dropout_rate: float = 0.5,
        # init
        transformer_init: str = "pytorch",
        initial_encoder_alpha: float = 1.0,
        initial_decoder_alpha: float = 1.0,
        # other
        use_masking: bool = True,
        use_batch_norm: bool = True,
        use_scaled_pos_enc: bool = True,
    ):
        super(FeedForwardTransformer, self).__init__()
        self.use_scaled_pos_enc = use_scaled_pos_enc
        self.reduction_factor = reduction_factor
        self.odim = odim
        self.use_masking = use_masking

        self.reporter = Reporter()

        # encoder
        pos_enc_class = ScaledPositionalEncoding if use_scaled_pos_enc else PositionalEncoding

        padding_idx: int = 0
        encoder_input_layer = nn.Embedding(num_embeddings=idim,
                                           embedding_dim=adim,
                                           padding_idx=padding_idx)
        self.encoder = Encoder(
            input_layer=encoder_input_layer,
            attention_dim=adim,
            attention_heads=aheads,
            linear_units=eunits,
            num_blocks=elayers,
            dropout_rate=transformer_enc_dropout_rate,
            positional_dropout_rate=transformer_enc_positional_dropout_rate,
            attention_dropout_rate=transformer_enc_atten_dropout_rate,
            pos_enc_class=pos_enc_class,
            normalized_before=encoder_normalized_before,
            concate_after=encoder_concat_after,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size)
        self.variance_adaptor = VarianceAdaptor(
            adim=adim,
            pitch_dim=4,
            energy_dim=1,
            pitch_embed_kernel_size=pitch_embed_kernel_size,
            pitch_embed_dropout_rate=pitch_embed_dropout,
            energy_embed_kernel_size=energy_embed_kernel_size,
            energy_embed_dropout_rate=energy_embed_dropout,
            duration_predictor_layers=duration_predictor_layers,
            duration_predictor_chans=duration_predictor_chans,
            duration_predictor_kernel_size=duration_predictor_kernel_size,
            duration_predictor_dropout_rate=duration_predictor_dropout_rate)
        self.decoder = Encoder(
            input_layer=None,
            attention_dim=adim,
            attention_heads=aheads,
            linear_units=dunits,
            num_blocks=dlayers,
            dropout_rate=transformer_dec_dropout_rate,
            positional_dropout_rate=transformer_dec_positional_dropout_rate,
            attention_dropout_rate=transformer_dec_atten_dropout_rate,
            pos_enc_class=pos_enc_class,
            normalized_before=decoder_normalized_before,
            concate_after=decoder_concat_after,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size)
        self.feat_out = nn.Linear(in_features=adim,
                                  out_features=odim * reduction_factor)

        self.postnet = None if postnet_layers == 0 else PostNet(
            in_dim=idim,
            out_dim=odim,
            n_layers=postnet_layers,
            n_chans=postnet_chans,
            n_filts=postnet_filts,
            use_batch_norm=use_batch_norm,
            dropout_rate=postnet_dropout_rate)
        self._reset_parameters(init_type=transformer_init,
                               init_enc_alpha=initial_encoder_alpha,
                               init_dec_alpha=initial_decoder_alpha)
        self.duration_criterion = DurationPredictorLoss()
        self.mse_criterion = nn.MSELoss()
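
A minimal construction sketch for this FastSpeech-style model; idim is the input symbol vocabulary size and odim the output feature dimension, and both values here are illustrative.

# Hypothetical sizes: 78 input symbols, 80-dim output features.
model = FeedForwardTransformer(idim=78, odim=80)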
Example #10
class FeedForwardTransformer(nn.Module):
    def __init__(
        self,
        idim: int,
        odim: int,
        adim: int = 384,
        aheads: int = 4,
        positionwise_layer_type: str = "linear",
        positionwise_conv_kernel_size: int = 3,
        # encoder
        eunits: int = 1536,
        elayers: int = 6,
        transformer_enc_dropout_rate: float = 0.0,
        transformer_enc_positional_dropout_rate: float = 0.0,
        transformer_enc_atten_dropout_rate: float = 0.0,
        encoder_normalized_before: bool = True,
        encoder_concat_after: bool = False,
        # variance
        pitch_embed_kernel_size: int = 1,
        pitch_embed_dropout: float = 0.0,
        energy_embed_kernel_size: int = 1,
        energy_embed_dropout: float = 0.0,
        duration_predictor_layers: int = 2,
        duration_predictor_chans: int = 256,
        duration_predictor_kernel_size: int = 3,
        duration_predictor_dropout_rate: float = 0.1,
        # decoder
        dlayers: int = 6,
        dunits: int = 1536,
        transformer_dec_dropout_rate: float = 0.1,
        transformer_dec_positional_dropout_rate: float = 0.1,
        transformer_dec_atten_dropout_rate: float = 0.1,
        decoder_normalized_before: bool = False,
        decoder_concat_after: bool = False,
        reduction_factor: int = 1,
        # postnet
        postnet_layers: int = 5,
        postnet_filts: int = 5,
        postnet_chans: int = 256,
        postnet_dropout_rate: float = 0.5,
        # init
        transformer_init: str = "pytorch",
        initial_encoder_alpha: float = 1.0,
        initial_decoder_alpha: float = 1.0,
        # other
        use_masking: bool = True,
        use_batch_norm: bool = True,
        use_scaled_pos_enc: bool = True,
    ):
        super(FeedForwardTransformer, self).__init__()
        self.use_scaled_pos_enc = use_scaled_pos_enc
        self.reduction_factor = reduction_factor
        self.odim = odim
        self.use_masking = use_masking

        self.reporter = Reporter()

        # encoder
        pos_enc_class = ScaledPositionalEncoding if use_scaled_pos_enc else PositionalEncoding

        padding_idx: int = 0
        encoder_input_layer = nn.Embedding(num_embeddings=idim,
                                           embedding_dim=adim,
                                           padding_idx=padding_idx)
        self.encoder = Encoder(
            input_layer=encoder_input_layer,
            attention_dim=adim,
            attention_heads=aheads,
            linear_units=eunits,
            num_blocks=elayers,
            dropout_rate=transformer_enc_dropout_rate,
            positional_dropout_rate=transformer_enc_positional_dropout_rate,
            attention_dropout_rate=transformer_enc_atten_dropout_rate,
            pos_enc_class=pos_enc_class,
            normalized_before=encoder_normalized_before,
            concate_after=encoder_concat_after,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size)
        self.variance_adaptor = VarianceAdaptor(
            adim=adim,
            pitch_dim=4,
            energy_dim=1,
            pitch_embed_kernel_size=pitch_embed_kernel_size,
            pitch_embed_dropout_rate=pitch_embed_dropout,
            energy_embed_kernel_size=energy_embed_kernel_size,
            energy_embed_dropout_rate=energy_embed_dropout,
            duration_predictor_layers=duration_predictor_layers,
            duration_predictor_chans=duration_predictor_chans,
            duration_predictor_kernel_size=duration_predictor_kernel_size,
            duration_predictor_dropout_rate=duration_predictor_dropout_rate)
        self.decoder = Encoder(
            input_layer=None,
            attention_dim=adim,
            attention_heads=aheads,
            linear_units=dunits,
            num_blocks=dlayers,
            dropout_rate=transformer_dec_dropout_rate,
            positional_dropout_rate=transformer_dec_positional_dropout_rate,
            attention_dropout_rate=transformer_dec_atten_dropout_rate,
            pos_enc_class=pos_enc_class,
            normalized_before=decoder_normalized_before,
            concate_after=decoder_concat_after,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size)
        self.feat_out = nn.Linear(in_features=adim,
                                  out_features=odim * reduction_factor)

        self.postnet = None if postnet_layers == 0 else PostNet(
            in_dim=idim,
            out_dim=odim,
            n_layers=postnet_layers,
            n_chans=postnet_chans,
            n_filts=postnet_filts,
            use_batch_norm=use_batch_norm,
            dropout_rate=postnet_dropout_rate)
        self._reset_parameters(init_type=transformer_init,
                               init_enc_alpha=initial_encoder_alpha,
                               init_dec_alpha=initial_decoder_alpha)
        self.duration_criterion = DurationPredictorLoss()
        self.mse_criterion = nn.MSELoss()

    def _source_mask(self, ilens: torch.LongTensor):
        x_masks = make_non_pad_mask(ilens).to(self.feat_out.weight.device)
        return x_masks.unsqueeze(-2) & x_masks.unsqueeze(-1)

    def _reset_parameters(self,
                          init_type,
                          init_enc_alpha: float = 1.0,
                          init_dec_alpha: float = 1.0):
        initialize(self, init_type)
        if self.use_scaled_pos_enc:
            self.encoder.embed[-1].alpha.data = torch.tensor(init_enc_alpha)
            self.decoder.embed[-1].alpha.data = torch.tensor(init_dec_alpha)

    def _forward(self,
                 xs: torch.FloatTensor,
                 ilens: torch.LongTensor,
                 olens: torch.LongTensor = None,
                 ds: torch.LongTensor = None,
                 ps: torch.FloatTensor = None,
                 es: torch.FloatTensor = None,
                 in_masks: torch.LongTensor = None,
                 out_masks: torch.LongTensor = None,
                 is_inference: bool = False):
        x_masks = self._source_mask(ilens)
        hs, _ = self.encoder.forward(xs, x_masks)

        # ignore spk embedding

        d_masks = ~in_masks if in_masks is not None else None
        v_masks = ~out_masks if out_masks is not None else None
        if is_inference:
            hs, d_outs, p_outs, e_outs = self.variance_adaptor.inference(
                hs, ilens, d_masks, v_masks)
        else:
            hs, d_outs, p_outs, e_outs = self.variance_adaptor.forward(
                hs, ds, ilens, ps, es, d_masks, v_masks)

        # forward decoder
        if olens is not None:
            if self.reduction_factor > 1:
                olens_in = olens.new(
                    [olen // self.reduction_factor for olen in olens])
            else:
                olens_in = olens
            h_masks = self._source_mask(olens_in)
        else:
            h_masks = None
        zs, _ = self.decoder.forward(hs, h_masks)
        before_outs = self.feat_out.forward(zs).view(zs.shape[0], -1,
                                                     self.odim)

        # postnet
        if self.postnet is None:
            after_outs = before_outs
        else:
            after_outs = before_outs + self.postnet(before_outs.transpose(
                1, 2)).transpose(1, 2)

        if is_inference:
            return before_outs, after_outs
        else:
            return before_outs, after_outs, d_outs, p_outs, e_outs

    def forward(self, xs: torch.FloatTensor, ilens: torch.LongTensor,
                ys: torch.FloatTensor, olens: torch.LongTensor,
                ds: torch.FloatTensor, ps: torch.FloatTensor,
                es: torch.FloatTensor):
        # rm padded part
        xs = xs[:, :max(ilens)]
        ys = ys[:, :max(olens)]
        ds = ds[:, :max(ilens)]
        ps = ps[:, :max(olens)]
        es = es[:, :max(olens)]

        in_masks = make_non_pad_mask(ilens).to(xs.device)
        out_masks = make_non_pad_mask(olens).unsqueeze(-1).to(ys.device)
        # ignore spk embedding

        before_outs, after_outs, d_outs, p_outs, e_outs = \
            self._forward(xs, ilens, olens, ds, ps, es, in_masks=in_masks, out_masks=out_masks, is_inference=False)

        if self.reduction_factor > 1:
            olens = olens.new(
                [olen - olen % self.reduction_factor for olen in olens])
            max_olen = max(olens)
            ys = ys[:, :max_olen]

        if self.use_masking:
            d_outs = d_outs.masked_select(in_masks)
            ds = ds.masked_select(in_masks)
            before_outs = before_outs.masked_select(out_masks)
            after_outs = after_outs.masked_select(out_masks)
            ys = ys.masked_select(out_masks)
            p_outs = p_outs.masked_select(out_masks)
            e_outs = e_outs.masked_select(out_masks)
            ps = ps.masked_select(out_masks)
            es = es.masked_select(out_masks)

        # calculate loss
        if self.postnet is None:
            l1_loss = F.l1_loss(after_outs, ys)
        else:
            l1_loss = F.l1_loss(after_outs, ys) + F.l1_loss(before_outs, ys)
        duration_loss = self.duration_criterion(d_outs, ds)
        pitch_loss = self.mse_criterion(p_outs, ps)
        energy_loss = self.mse_criterion(e_outs, es)

        loss = l1_loss + duration_loss + pitch_loss + energy_loss
        # report loss
        report_keys = [{
            "l1_loss": l1_loss.item()
        }, {
            "duration_loss": duration_loss.item()
        }, {
            "pitch_loss": pitch_loss.item()
        }, {
            "energy_loss": energy_loss.item()
        }, {
            "loss": loss.item()
        }]

        if self.use_scaled_pos_enc:
            report_keys += [
                {
                    "encoder_alpha": self.encoder.embed[-1].alpha.data.item()
                },
                {
                    "decoder_alpha": self.decoder.embed[-1].alpha.data.item()
                },
            ]
        self.reporter.report(report_keys)
        return loss

    def inference(self, x: torch.LongTensor, y: torch.FloatTensor):
        ilens = torch.LongTensor([x.shape[0]]).to(x.device)
        xs = x.unsqueeze(0)
        in_masks = make_non_pad_mask(ilens).to(xs.device)
        _, outs = self._forward(xs,
                                ilens,
                                in_masks=in_masks,
                                is_inference=True)
        return outs[0]

    # for reporting attentions
    def calculate_all_attentions(self, xs: torch.FloatTensor,
                                 ilens: torch.LongTensor,
                                 ys: torch.FloatTensor,
                                 olens: torch.LongTensor, ds: torch.LongTensor,
                                 ps: torch.FloatTensor, es: torch.FloatTensor):
        with torch.no_grad():
            # remove unnecessary padded part
            xs = xs[:, :max(ilens)]
            ds = ds[:, :max(ilens)]
            ys = ys[:, :max(olens)]
            ps = ps[:, :max(olens)]
            es = es[:, :max(olens)]
            in_masks = make_non_pad_mask(ilens).to(xs.device)
            out_masks = make_non_pad_mask(olens).unsqueeze(-1).to(xs.device)
            outs = self._forward(xs,
                                 ilens,
                                 olens,
                                 ds,
                                 ps,
                                 es,
                                 in_masks,
                                 out_masks,
                                 is_inference=False)[0]

        att_ws_dict = dict()
        for name, m in self.named_modules():
            if isinstance(m, MultiHeadedAttention):
                atten = m.atten.cpu().numpy()
                if "encoder" in name:
                    atten = [
                        a[:, :l, :l] for a, l in zip(atten, ilens.tolist())
                    ]
                elif "decoder" in name:
                    if "src" in name:
                        atten = [
                            a[:, :ol, :il] for a, il, ol in zip(
                                atten, ilens.tolist(), olens.tolist())
                        ]
                    elif "self" in name:
                        atten = [
                            a[:, :l, :l]
                            for a, l in zip(atten, olens.tolist())
                        ]
                    else:
                        logging.warning(f"unknown attention module: {name}")
                else:
                    logging.warning(f"unknown attention module: {name}")
                att_ws_dict[name] = atten
        att_ws_dict["predicted_fbank"] = [
            m[:l].T for m, l in zip(outs.cpu().numpy(), olens.tolist())
        ]
        return att_ws_dict

    @property
    def attention_plot_class(self):
        return TTSPlot

    @property
    def base_plot_keys(self):
        plot_keys = ["loss", "l1_loss", "duration_loss"]
        if self.use_scaled_pos_enc:
            plot_keys += ["encoder_alpha", "decoder_alpha"]
        return plot_keys
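
A minimal inference sketch for the full class above; x is a LongTensor of input symbol ids, and the second argument of inference goes unused at synthesis time. The sizes are illustrative.

import torch

model = FeedForwardTransformer(idim=78, odim=80)   # hypothetical sizes
model.eval()
x = torch.randint(1, 78, (42,))                    # [T_text] symbol ids
with torch.no_grad():
    feats = model.inference(x, None)               # [T_feats, odim]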
Example #11
def main():
    """Runs the main training loop

  Creates tensorboard visualizations and saves models after each epoch
  """
    args = PARSER.parse_args()

    start = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

    print('Started training with arguments {}'.format(sys.argv))

    np.random.seed(args.random_seed)

    training_files_0 = []
    with open(args.training_files_0, 'r') as training_file_reader_0:
        training_files_0 = training_file_reader_0.readlines()
        training_files_0 = [
            training_file.strip() for training_file in training_files_0
        ]

    training_files_1 = []
    with open(args.training_files_1, 'r') as training_file_reader_1:
        training_files_1 = training_file_reader_1.readlines()
        training_files_1 = [
            training_file.strip() for training_file in training_files_1
        ]

    training_files_2 = []
    with open(args.training_files_2, 'r') as training_file_reader_2:
        training_files_2 = training_file_reader_2.readlines()
        training_files_2 = [
            training_file.strip() for training_file in training_files_2
        ]

    training_dataset_0 = get_deserialized_dataset(
        training_files_0,
        args.training_record_byte_size,
        scale_data=args.scale_data)
    training_dataset_0 = training_dataset_0.shuffle(
        buffer_size=args.shuffle_buffer_size, seed=args.random_seed)
    training_dataset_0 = training_dataset_0.batch(args.batch_size)

    training_dataset_1 = get_deserialized_dataset(
        training_files_1,
        args.training_record_byte_size,
        scale_data=args.scale_data)
    training_dataset_1 = training_dataset_1.shuffle(
        buffer_size=args.shuffle_buffer_size, seed=args.random_seed)
    training_dataset_1 = training_dataset_1.batch(args.batch_size)

    training_dataset_2 = get_deserialized_dataset(
        training_files_2,
        args.training_record_byte_size,
        scale_data=args.scale_data)
    training_dataset_2 = training_dataset_2.shuffle(
        buffer_size=args.shuffle_buffer_size, seed=args.random_seed)
    training_dataset_2 = training_dataset_2.batch(args.batch_size)

    waveform_inputs = Input(shape=(44100, 1), name='waveform_inputs')
    encoded_data = Encoder(args.encoder_blocks,
                           args.encoder_layers,
                           args.encoder_channels,
                           args.encoder_kernel_size,
                           args.encoder_pool,
                           name='encoder')(waveform_inputs)
    classified_data = Classifier(args.n_classes,
                                 channels=args.classifier_channels,
                                 kernel_size=args.classifier_kernel_size,
                                 classifier_layers=args.classifier_layers,
                                 rate=args.classifier_dropout_rate,
                                 name='classifier')(encoded_data)

    classifier_model = Model(inputs=waveform_inputs, outputs=classified_data)

    def classifier_loss(target_genres, pred_logits):
        return sparse_categorical_crossentropy(target_genres,
                                               pred_logits,
                                               from_logits=True)

    classifier_optimizer = tf.keras.optimizers.Adam()
    classifier_loss_history = []

    def classifier_train_step(waveform_list, genres_list):
        """Performs a step of the classifier model
    arguments will be lists of tensors, with
    each element being from a different genre
    """
        waveforms = tf.concat(waveform_list, 0)
        genres = tf.concat(genres_list, 0)

        with tf.GradientTape() as tape:
            logits = classifier_model(waveforms, training=True)
            loss_value = classifier_loss(genres, logits)

        classifier_loss_history.append(loss_value.numpy().mean())
        grads = tape.gradient(loss_value, classifier_model.trainable_variables)
        classifier_optimizer.apply_gradients(
            zip(grads, classifier_model.trainable_variables))

    def transformer_loss(target_waveform, pred_waveform, target_genres,
                         pred_genres):
        waveform_loss = sparse_categorical_crossentropy(target_waveform,
                                                        pred_waveform,
                                                        from_logits=True)
        genre_loss = sparse_categorical_crossentropy(target_genres,
                                                     pred_genres,
                                                     from_logits=True)

        return tf.reduce_sum(waveform_loss,
                             axis=-1) - 0.01 * 44100 * genre_loss

    transformed_0_data = Decoder(args.decoder_blocks,
                                 args.decoder_layers,
                                 args.decoder_residual_channels,
                                 args.decoder_skip_channels,
                                 args.decoder_kernel_size,
                                 name='decoder_0')(waveform_inputs,
                                                   encoded_data)
    transformer_0_model = Model(inputs=waveform_inputs,
                                outputs=[transformed_0_data, classified_data])
    transformer_0_optimizer = tf.keras.optimizers.Adam()
    transformer_0_loss_history = []

    def transformer_0_train_step(augmented_waveforms, waveforms, genres):
        """Performs a step of a transformer model
    """
        with tf.GradientTape() as tape:
            waveform_logits, genre_logits = transformer_0_model(
                augmented_waveforms, training=True)
            loss_value = transformer_loss(waveforms, waveform_logits, genres,
                                          genre_logits)

        transformer_0_loss_history.append(loss_value.numpy().mean())
        grads = tape.gradient(loss_value,
                              transformer_0_model.trainable_variables)
        transformer_0_optimizer.apply_gradients(
            zip(grads, transformer_0_model.trainable_variables))

    transformed_1_data = Decoder(args.decoder_blocks,
                                 args.decoder_layers,
                                 args.decoder_residual_channels,
                                 args.decoder_skip_channels,
                                 args.decoder_kernel_size,
                                 name='decoder_1')(waveform_inputs,
                                                   encoded_data)
    transformer_1_model = Model(inputs=waveform_inputs,
                                outputs=[transformed_1_data, classified_data])
    transformer_1_optimizer = tf.keras.optimizers.Adam()
    transformer_1_loss_history = []

    def transformer_1_train_step(augmented_waveforms, waveforms, genres):
        """Performs a step of a transformer model
    """
        with tf.GradientTape() as tape:
            waveform_logits, genre_logits = transformer_1_model(
                augmented_waveforms, training=True)
            loss_value = transformer_loss(waveforms, waveform_logits, genres,
                                          genre_logits)

        transformer_1_loss_history.append(loss_value.numpy().mean())
        grads = tape.gradient(loss_value,
                              transformer_1_model.trainable_variables)
        transformer_1_optimizer.apply_gradients(
            zip(grads, transformer_1_model.trainable_variables))

    transformed_2_data = Decoder(args.decoder_blocks,
                                 args.decoder_layers,
                                 args.decoder_residual_channels,
                                 args.decoder_skip_channels,
                                 args.decoder_kernel_size,
                                 name='decoder_2')(waveform_inputs,
                                                   encoded_data)
    transformer_2_model = Model(inputs=waveform_inputs,
                                outputs=[transformed_2_data, classified_data])
    transformer_2_optimizer = tf.keras.optimizers.Adam()
    transformer_2_loss_history = []

    def transformer_2_train_step(augmented_waveforms, waveforms, genres):
        """Performs a step of a transformer model
    """
        with tf.GradientTape() as tape:
            waveform_logits, genre_logits = transformer_2_model(
                augmented_waveforms, training=True)
            loss_value = transformer_loss(waveforms, waveform_logits, genres,
                                          genre_logits)

        transformer_2_loss_history.append(loss_value.numpy().mean())
        grads = tape.gradient(loss_value,
                              transformer_2_model.trainable_variables)
        transformer_2_optimizer.apply_gradients(
            zip(grads, transformer_2_model.trainable_variables))

    log_dir = "logs/fit/" + start
    models_dir = 'models/' + start
    os.mkdir(models_dir)

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                          write_images=False)
    tensorboard_callback.set_model(classifier_model)
    summary_writer = tf.summary.create_file_writer(log_dir + '/train')

    def train(epochs):
        global_steps = 0
        for epoch in range(epochs):
            for (batch, \
                 ((genre_code_0, waveform_0, augmented_waveform_0), \
                  (genre_code_1, waveform_1, augmented_waveform_1), \
                  (genre_code_2, waveform_2, augmented_waveform_2))) \
                 in enumerate(zip(training_dataset_0, training_dataset_1, training_dataset_2)):
                print('Epoch {} batch {} fit:'.format(epoch, batch))

                classifier_start = time.time()
                classifier_train_step([
                    augmented_waveform_0, augmented_waveform_1,
                    augmented_waveform_2
                ], [genre_code_0, genre_code_1, genre_code_2])
                classifier_time = time.time() - classifier_start

                print('\tClassifier, fit time {}, loss {}'.format(
                    classifier_time, classifier_loss_history[-1]))

                transformer_0_start = time.time()
                transformer_0_train_step(augmented_waveform_0, waveform_0,
                                         genre_code_0)
                transformer_0_time = time.time() - transformer_0_start

                print('\tTransformer 0, fit time {}, loss {}'.format(
                    transformer_0_time, transformer_0_loss_history[-1]))

                transformer_1_start = time.time()
                transformer_1_train_step(augmented_waveform_1, waveform_1,
                                         genre_code_1)
                transformer_1_time = time.time() - transformer_1_start

                print('\tTransformer 1, fit time {}, loss {}'.format(
                    transformer_1_time, transformer_1_loss_history[-1]))

                transformer_2_start = time.time()
                transformer_2_train_step(augmented_waveform_2, waveform_2,
                                         genre_code_2)
                transformer_2_time = time.time() - transformer_2_start
                print('\tTransformer 2, fit time {}, loss {}'.format(
                    transformer_2_time, transformer_2_loss_history[-1]))

                if (batch + 1) % 100 == 0:
                    with summary_writer.as_default():
                        tf.summary.scalar('classifier loss',
                                          classifier_loss_history[-1],
                                          step=global_steps)
                        tf.summary.scalar('transformer 0 loss',
                                          transformer_0_loss_history[-1],
                                          step=global_steps)
                        tf.summary.scalar('transformer 1 loss',
                                          transformer_1_loss_history[-1],
                                          step=global_steps)
                        tf.summary.scalar('transformer 2 loss',
                                          transformer_2_loss_history[-1],
                                          step=global_steps)
                        tf.summary.flush()
                global_steps += 1

            # save weights every epoch
            print('Saving model weights')
            classifier_model_save_str = '/classifier_weights.{:d}-{:.2f}.h5'.format(
                epoch, classifier_loss_history[-1])
            classifier_model.save_weights(models_dir +
                                          classifier_model_save_str)
            print(
                'Finished training epoch {}, epoch losses are:'.format(epoch))
            print('\tClassifier loss = {}'.format(classifier_loss_history[-1]))
            print('\tTransformer 0 loss = {}'.format(
                transformer_0_loss_history[-1]))
            print('\tTransformer 1 loss = {}'.format(
                transformer_1_loss_history[-1]))
            print('\tTransformer 2 loss = {}'.format(
                transformer_2_loss_history[-1]))

            transformer_0_save_str = '/transformer_0_weights.{:d}-{:.2f}.h5'
            transformer_0_save_str = transformer_0_save_str.format(
                epoch, transformer_0_loss_history[-1])
            transformer_0_model.save_weights(models_dir +
                                             transformer_0_save_str)

            transformer_1_save_str = '/transformer_1_weights.{:d}-{:.2f}.h5'
            transformer_1_save_str = transformer_1_save_str.format(
                epoch, transformer_1_loss_history[-1])
            transformer_1_model.save_weights(models_dir +
                                             transformer_1_save_str)

            transformer_2_save_str = '/transformer_2_weights.{:d}-{:.2f}.h5'
            transformer_2_save_str = transformer_2_save_str.format(
                epoch, transformer_2_loss_history[-1])
            transformer_2_model.save_weights(models_dir +
                                             transformer_2_save_str)

            with summary_writer.as_default():
                tf.summary.scalar('classifier loss',
                                  classifier_loss_history[-1],
                                  step=global_steps)
                tf.summary.scalar('transformer 0 loss',
                                  transformer_0_loss_history[-1],
                                  step=global_steps)
                tf.summary.scalar('transformer 1 loss',
                                  transformer_1_loss_history[-1],
                                  step=global_steps)
                tf.summary.scalar('transformer 2 loss',
                                  transformer_2_loss_history[-1],
                                  step=global_steps)
            tf.summary.flush()

    train(args.max_epochs)
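
The loop above writes per-epoch .h5 weight files; a minimal sketch of reloading one later for inference. The filename is illustrative, and the same architecture must be rebuilt before loading.

# Rebuild the same architecture, then load the saved per-epoch weights.
classifier_model.load_weights(models_dir + '/classifier_weights.0-1.23.h5')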