def __init__(self, is_training):
    """Build the TF graph inputs and sub-modules for the relation-extraction model.

    This variant feeds the word-embedding matrix at run time through the
    ``word_vec`` placeholder (compare the sibling constructors that load it
    from ``vec.npy``).

    Args:
        is_training: bool forwarded to every sub-module so they can toggle
            training-only behaviour (e.g. dropout).
    """
    # Place Holder
    # Token ids, padded/truncated to FLAGS.max_length.
    self.word = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_word')
    # Word-embedding matrix supplied at run time (FLAGS.word_size per word).
    self.word_vec = tf.placeholder(dtype=tf.float32, shape=[None, FLAGS.word_size], name='word_vec')
    # Position features for the two entities — presumably relative offsets; TODO confirm.
    self.pos1 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos1')
    self.pos2 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos2')
    # True (unpadded) length of each sentence.
    self.length = tf.placeholder(dtype=tf.int32, shape=[None], name='input_length')
    self.mask = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_mask')
    # `label` drives the classifier loss; `label_for_select` drives the selector.
    self.label = tf.placeholder(dtype=tf.int32, shape=[None], name='label')
    self.label_for_select = tf.placeholder(dtype=tf.int32, shape=[None], name='label_for_select')
    # scope has batch_size + 1 entries — presumably consecutive pairs delimit
    # each bag's sentence range; verify against the Selector implementation.
    self.scope = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size + 1], name='scope')
    # Per-bag loss weights.
    self.weights = tf.placeholder(dtype=tf.float32, shape=[FLAGS.batch_size])
    # Network
    self.embedding = Embedding(is_training, self.word_vec, self.word, self.pos1, self.pos2)
    self.encoder = Encoder(is_training, length=self.length, mask=self.mask)
    self.selector = Selector(is_training, self.scope, self.label_for_select)
    self.classifier = Classifier(is_training, self.label, self.weights)
    # Metrics: accuracy tracked separately for NA / non-NA labels plus overall.
    self.acc_NA = Accuracy()
    self.acc_not_NA = Accuracy()
    self.acc_total = Accuracy()
    self.step = 0
    # Session (created later by the training/eval driver).
    self.sess = None
def __init__(self, is_training, use_bag=True): self.use_bag = use_bag # Place Holder self.word = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_word') #self.word_vec = tf.placeholder(dtype=tf.float32, shape=[None, FLAGS.word_size], name='word_vec') self.pos1 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos1') self.pos2 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos2') self.length = tf.placeholder(dtype=tf.int32, shape=[None], name='input_length') self.mask = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_mask') self.label = tf.placeholder(dtype=tf.int32, shape=[None], name='label') self.label_for_select = tf.placeholder(dtype=tf.int32, shape=[None], name='label_for_select') self.scope = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size + 1], name='scope') self.weights = tf.placeholder(dtype=tf.float32, shape=[FLAGS.batch_size]) self.data_word_vec = np.load(os.path.join(FLAGS.export_path, 'vec.npy')) # Network self.embedding = Embedding(is_training, self.data_word_vec, self.word, self.pos1, self.pos2) self.encoder = Encoder(is_training, FLAGS.drop_prob) self.selector = Selector(FLAGS.num_classes, is_training, FLAGS.drop_prob) self.classifier = Classifier(is_training, self.label, self.weights) # Metrics self.acc_NA = Accuracy() self.acc_not_NA = Accuracy() self.acc_total = Accuracy() self.step = 0 # Session self.sess = None
def __init__(self, is_training):
    """Build the TF graph for a sentence-level (no-bag) CNN model trained
    with both supervised cross-entropy and a REINFORCE-style policy loss.

    Args:
        is_training: bool forwarded to sub-modules to toggle training-only
            behaviour (e.g. dropout).
    """
    # Place Holder
    self.word = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_word')
    self.pos1 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos1')
    self.pos2 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos2')
    self.length = tf.placeholder(dtype=tf.int32, shape=[None], name='input_length')
    self.mask = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_mask')
    self.label = tf.placeholder(dtype=tf.int32, shape=[None], name='label')
    self.label_for_select = tf.placeholder(dtype=tf.int32, shape=[None], name='label_for_select')
    self.scope = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size + 1], name='scope')
    # Note: unlike the sibling constructors, weights is unbatched ([None]).
    self.weights = tf.placeholder(dtype=tf.float32, shape=[None])
    # Pre-trained word-embedding matrix exported by the preprocessing step.
    self.data_word_vec = np.load(os.path.join(FLAGS.export_path, 'vec.npy'))
    # RL inputs: per-sample reward and the action (relation id) that was taken.
    self.reward_holder = tf.placeholder(dtype=tf.float32, shape=[None], name='reward')
    self.action_holder = tf.placeholder(dtype=tf.int32, shape=[None], name='action')
    # Network
    self.embedding = Embedding(is_training, self.data_word_vec, self.word, self.pos1, self.pos2)
    self.encoder = Encoder(is_training, FLAGS.drop_prob)
    self.selector = Selector(FLAGS.num_classes, is_training, FLAGS.drop_prob)
    self.classifier = Classifier(is_training, self.label, self.weights)
    # compute
    self.word_embedding = self.embedding.word_embedding()
    self.pos_embedding = self.embedding.pos_embedding()
    # NOTE: this rebinds self.embedding from the Embedding module to its
    # output tensor; the module object is no longer reachable afterwards.
    self.embedding = self.embedding.concat_embedding(self.word_embedding, self.pos_embedding)
    self.x = self.encoder.cnn(self.embedding, FLAGS.hidden_size, self.mask, activation=tf.nn.relu)
    self.logit, self.repre = self.selector.no_bag(self.x)
    self.outputvalue = self.classifier.outputvalue(self.logit)
    self.output = self.classifier.output(self.logit)
    # Probability of each class, and a one-hot mask for the chosen action.
    self.softmax_outputs = tf.nn.softmax(self.logit)
    self.action_onehot = tf.one_hot(indices=self.action_holder, depth=FLAGS.num_classes, dtype=tf.float32)
    # Select the probability assigned to the taken action.
    self.step = tf.multiply(self.softmax_outputs, self.action_onehot)
    self.action_outputvalue = tf.reduce_sum(self.step, axis=1)
    self.temp = tf.log(self.action_outputvalue) * self.reward_holder
    # Policy-gradient loss; the tensor inside reduce_mean holds the
    # per-sentence losses (188 sentences).
    self.loss = -tf.reduce_mean(tf.log(self.action_outputvalue) * self.reward_holder)
    # Supervised pre-training path (plain cross-entropy + SGD).
    self.loss_pre = self.classifier.softmax_cross_entropy(self.logit)
    self.optimizer_pre = tf.train.GradientDescentOptimizer(0.5)
    self.grads_and_vars = self.optimizer_pre.compute_gradients(self.loss_pre)
    self.train_op = self.optimizer_pre.apply_gradients(self.grads_and_vars)
    # RL fine-tuning path (Adam on the policy loss).
    self.optimizer = tf.train.AdamOptimizer(0.0001)
    self.train_op_rl = self.optimizer.minimize(self.loss)
    self.tvars = tf.trainable_variables()
def __init__(self, theta_encoder_len=1, lead_num=1):
    """Construct the Model_nefnet sub-modules.

    Args:
        theta_encoder_len: encoder length for the theta branch; it sets the
            MLP input width via (2 * theta_encoder_len + 1) * 4.
        lead_num: number of input leads/channels; grouped convolutions below
            keep per-lead processing separate.
    """
    super(Model_nefnet, self).__init__()
    self.lead_num = lead_num
    # Waveform encoder; per-lead input with a ResNet-34 backbone.
    self.W_encoder = Encoder(backbone='resnet34', in_channel=lead_num,
                             use_first_pool=True, lead_num=lead_num,
                             init_channels=128)
    self.theta_encoder = ThetaEncoder(encoder_len=theta_encoder_len)
    # Two heads over the theta encoding: 128-d and 256-d feature vectors.
    self.mlp1 = nn.Linear((2 * theta_encoder_len + 1) * 4, 128)
    self.mlp2 = nn.Linear((2 * theta_encoder_len + 1) * 4, 256 * 1)
    self.w_feature_extractor = nn.Sequential(
        nn.Conv1d(128, 128, 3, 1, 1),
        nn.ReLU(inplace=True))
    # groups=lead_num keeps each lead's channels independent.
    self.w_conv = nn.Sequential(
        BasicBlock(128 * self.lead_num, 128 * self.lead_num, 1, groups=self.lead_num))
    self.z1_conv = nn.Sequential(
        BasicBlock(64 * self.lead_num, 128 * self.lead_num, 1, groups=self.lead_num))
    self.z2_conv1 = nn.Sequential(
        BasicBlock(64 * self.lead_num, 128 * self.lead_num, 1, groups=self.lead_num))
    # The z2 branch works on 7 sub-bands per lead (hence the *7 widths and
    # groups), with a transposed conv doubling the temporal resolution.
    self.z2_conv2 = nn.Sequential(
        BasicBlock(128 * 7 * self.lead_num, 128 * 7 * self.lead_num, 1,
                   groups=self.lead_num * 7),
        nn.ConvTranspose1d(128 * 7 * self.lead_num, 128 * 7 * self.lead_num // 2,
                           kernel_size=2, stride=2, groups=self.lead_num * 7),
        BasicBlock(128 * 7 * self.lead_num // 2, 128 * 7 * self.lead_num, 1,
                   groups=self.lead_num * 7),
    )
    # Upsampling decoder: 4x temporal upsample down to a single output channel.
    self.decoder = nn.Sequential(
        nn.Upsample(scale_factor=2, mode='linear', align_corners=False),
        DoubleConv(256 * 1, 128),
        nn.Upsample(scale_factor=2, mode='linear', align_corners=False),
        DoubleConv(128, 64),
        nn.Conv1d(64, 1, 3, padding=1))
def __init__(self, dataset_path, max_size):
    """Load a dataset and assemble the seq2seq (encoder/decoder) trainer.

    Args:
        dataset_path: path to the dataset file; the language code is derived
            from the file name (e.g. 'data/fr.txt' -> 'fr').
        max_size: maximum dataset size, forwarded to load_data.
    """
    self.lang_code = dataset_path.split('/')[-1].split('.')[0]
    self.dataset, self.dataset_validation, self.inp_tokenizer, self.out_tokenizer = load_data(
        dataset_path, max_size)
    # NOTE(review): `data` is not defined in this scope — presumably a
    # module-level config/object; verify its vocab sizes match the
    # tokenizers loaded above.
    self.encoder = Encoder(data.vocab_inp_size, data.embedding_dim, data.units, data.batch_size)
    self.decoder = Decoder(data.vocab_tar_size, data.embedding_dim, data.units, data.batch_size)
    self.optimizer = tf.keras.optimizers.Adam()
    # Checkpoints are written per language, e.g. ./training_checkpoints_fr/ckpt-*.
    self.checkpoint_dir = './training_checkpoints_' + self.lang_code
    self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                     encoder=self.encoder,
                                     decoder=self.decoder)
    self.checkpoint = checkpoint
def __init__(self, is_training, use_bag=True):
    """Build the TF graph inputs and sub-modules for the relation-extraction
    model, augmented with a knowledge-graph GCN branch.

    Args:
        is_training: bool forwarded to sub-modules to toggle training-only
            behaviour (e.g. dropout).
        use_bag: whether bag-level (multi-instance) mode is enabled.
    """
    self.use_bag = use_bag
    self.is_training = is_training
    # Place Holder
    self.word = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_word')
    # Word vectors are loaded from disk below; kept for reference:
    # self.word_vec = tf.placeholder(dtype=tf.float32, shape=[None, FLAGS.word_size], name='word_vec')
    self.pos1 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos1')
    self.pos2 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos2')
    self.length = tf.placeholder(dtype=tf.int32, shape=[None], name='input_length')
    self.mask = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_mask')
    self.label = tf.placeholder(dtype=tf.int32, shape=[None], name='label')
    self.label_for_select = tf.placeholder(dtype=tf.int32, shape=[None], name='label_for_select')
    # batch_size + 1 entries — presumably consecutive pairs delimit each bag.
    self.scope = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size + 1], name='scope')
    self.weights = tf.placeholder(dtype=tf.float32, shape=[FLAGS.batch_size])
    self.data_word_vec = np.load(os.path.join(FLAGS.export_path, 'vec.npy'))
    # Gcn: knowledge-graph inputs (sparse features plus three adjacency
    # matrices: head-to-relation, relation-to-tail, and self loops).
    self.ent2id = tf.placeholder(dtype=tf.int32, name='ent2id')
    self.features = tf.sparse_placeholder(dtype=tf.float32, name='kg_features')
    adj_name = ['h2r_adj', 'r2t_adj', 'self_adj']
    self.supports = [tf.sparse_placeholder(dtype=tf.float32, name=adj_name[i]) for i in range(3)]
    # Layer widths of the GCN stack.
    self.gcn_dims = [100, 85, 70, 53]
    self.num_features_nonzero = tf.placeholder(tf.int32)
    # Network
    self.embedding = Embedding(is_training, self.data_word_vec, self.word, self.pos1, self.pos2)
    self.encoder = Encoder(is_training, FLAGS.drop_prob)
    self.gcn = GCN(is_training, FLAGS.gcn_drop_prob, FLAGS.num_classes, self.gcn_dims)
    self.selector = Selector(FLAGS.num_classes, is_training, FLAGS.drop_prob)
    self.classifier = Classifier(is_training, self.label, self.weights)
    # Metrics
    self.acc_NA = Accuracy()
    self.acc_not_NA = Accuracy()
    self.acc_total = Accuracy()
    self.step = 0
    # Session (created later by the training/eval driver).
    self.sess = None
def __init__(self, args):
    """Construct the contrastive wrapper around the backbone Encoder.

    Builds the encoder, optional multiscale skip projections, and a
    two-stage projection head, and records which layer's width is exposed
    for downstream use.

    Args:
        args: configuration namespace. Must provide ``representation_layer``
            (0 = encoder output width, 1 = projection hidden width) plus
            whatever ``Encoder`` itself consumes.

    Raises:
        ValueError: if ``args.representation_layer`` is not 0 or 1.
    """
    super(ContrastiveEncoder, self).__init__()
    self.representation_layer = args.representation_layer
    self.encoder = Encoder(args, *ENCODER_PARAMS)
    # Multiscale encoders expose intermediate feature maps; map each to the
    # final filter width so they can be combined with the last stage.
    if self.encoder.multiscale:
        self.skip_1 = nn.Linear(ENCODER_FILTERS[0], ENCODER_FILTERS[-1])
        self.skip_2 = nn.Linear(ENCODER_FILTERS[1], ENCODER_FILTERS[-1])
    p_in_dim = self.encoder.out_dim
    p_mid_dim = int(self.encoder.out_dim)
    # Projection head: hidden Linear+ReLU stage, then a linear map into the
    # contrastive latent space.
    self.projection = nn.ModuleList([
        nn.Sequential(nn.Linear(p_in_dim, p_mid_dim), nn.ReLU()),
        nn.Linear(p_mid_dim, LATENT_SIZE)
    ])
    # Width of the representation used for downstream tasks.
    if self.representation_layer == 0:
        self.latent_size = p_in_dim
    elif self.representation_layer == 1:
        self.latent_size = p_mid_dim
    else:
        # Was `assert 1 == 0`, which is silently stripped under `python -O`;
        # raise explicitly so an invalid config always fails loudly.
        raise ValueError(
            "representation_layer must be 0 or 1, got "
            f"{self.representation_layer!r}")
def __init__(self, is_training):
    """Build the TF graph for a bag-level model (PCNN encoder + selective
    attention), including a reward signal for sentence selection and a
    gradient-descent training path.

    Args:
        is_training: bool forwarded to sub-modules to toggle training-only
            behaviour (e.g. dropout).
    """
    # Place Holder
    self.word = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_word')
    self.pos1 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos1')
    self.pos2 = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_pos2')
    self.length = tf.placeholder(dtype=tf.int32, shape=[None], name='input_length')
    self.mask = tf.placeholder(dtype=tf.int32, shape=[None, FLAGS.max_length], name='input_mask')
    # batch_size + 1 entries — presumably consecutive pairs delimit each bag.
    self.scope = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size + 1], name='scope')
    # Bag-level gold labels and per-sentence labels used by the selector.
    self.bag_label = tf.placeholder(dtype=tf.int32, shape=[None], name='bag_label')
    self.sentence_label = tf.placeholder(dtype=tf.int32, shape=[None], name='sentence_label')
    self.label_weights = tf.placeholder(dtype=tf.float32, shape=[FLAGS.batch_size])
    self.data_word_vec = np.load(os.path.join(FLAGS.export_path, 'vec.npy'))
    # Network
    self.embedding = Embedding(is_training, self.data_word_vec, self.word, self.pos1, self.pos2)
    self.encoder = Encoder(is_training, FLAGS.drop_prob)
    self.selector = Selector(FLAGS.num_classes, is_training, FLAGS.drop_prob)
    self.classifier = Classifier(is_training, self.bag_label, self.label_weights)
    # compute
    self.word_embedding = self.embedding.word_embedding()
    self.pos_embedding = self.embedding.pos_embedding()
    # NOTE: this rebinds self.embedding from the Embedding module to its
    # output tensor; the module object is no longer reachable afterwards.
    self.embedding = self.embedding.concat_embedding(
        self.word_embedding, self.pos_embedding)
    self.x = self.encoder.pcnn(self.embedding, FLAGS.hidden_size, self.mask,
                               activation=tf.nn.relu)
    self.logit, self.repre = self.selector.attention(
        self.x, self.scope, self.sentence_label)
    # Used only to compare whether the DS labels or the selected labels are
    # better; not involved in optimisation.
    self.label_onehot = tf.one_hot(indices=self.bag_label,
                                   depth=FLAGS.num_classes,
                                   dtype=tf.float32)
    self.bag_loss_temp = tf.nn.softmax_cross_entropy_with_logits(
        labels=self.label_onehot, logits=self.logit)
    self.bag_loss = tf.reshape(self.bag_loss_temp, [1, -1])
    self.loss_mean = tf.reduce_mean(self.bag_loss)
    # Compute the reward: log-probability assigned to the gold bag label.
    self.softmax_output = tf.nn.softmax(self.logit)
    self.reward = tf.log(
        tf.reduce_sum(self.label_onehot * self.softmax_output, axis=1))
    # self.loss_mine = -tf.reduce_mean(self.reward, axis=0)  # same as the loss, just without the weights
    # Gradient-descent training path.
    self.loss = self.classifier.softmax_cross_entropy(self.logit)
    # self.loss_one = self.classifier.softmax_cross_entropy(self.logit_one)
    self.output = self.classifier.output(self.logit)
    # self.output_one = self.classifier.output(self.logit_one)
    self.outputvalue = self.classifier.outputvalue(self.logit)
    self.test_output = tf.argmax(self.logit, 1)  # predicted relation id
    # Max logit of the prediction (an unnormalised score, not a probability).
    self.test_outputvalue = tf.reduce_max(self.logit, axis=1)
    # Optimizer
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    tf.summary.scalar('learning_rate', FLAGS.learning_rate)
    self.optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
    self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
    self.train_op = self.optimizer.apply_gradients(
        self.grads_and_vars, global_step=self.global_step)
def __init__(
        self,
        idim: int,
        odim: int,
        adim: int = 384,
        aheads: int = 4,
        positionwise_layer_type: str = "linear",
        positionwise_conv_kernel_size: int = 3,
        # encoder
        eunits: int = 1536,
        elayers: int = 6,
        transformer_enc_dropout_rate: float = 0.0,
        transformer_enc_positional_dropout_rate: float = 0.0,
        transformer_enc_atten_dropout_rate: float = 0.0,
        encoder_normalized_before: bool = True,
        encoder_concat_after: bool = False,
        # variance
        pitch_embed_kernel_size: int = 1,
        pitch_embed_dropout: float = 0.0,
        energy_embed_kernel_size: int = 1,
        energy_embed_dropout: float = 0.0,
        duration_predictor_layers: int = 2,
        duration_predictor_chans: int = 256,
        duration_predictor_kernel_size: int = 3,
        duration_predictor_dropout_rate: float = 0.1,
        # decoder
        dlayers: int = 6,
        dunits: int = 1536,
        transformer_dec_dropout_rate: float = 0.1,
        transformer_dec_positional_dropout_rate: float = 0.1,
        transformer_dec_atten_dropout_rate: float = 0.1,
        decoder_normalized_before: bool = False,
        decoder_concat_after: bool = False,
        reduction_factor: int = 1,
        # postnet
        postnet_layers: int = 5,
        postnet_filts: int = 5,
        postnet_chans: int = 256,
        postnet_dropout_rate: float = 0.5,
        # init
        transformer_init: str = "pytorch",
        initial_encoder_alpha: float = 1.0,
        initial_decoder_alpha: float = 1.0,
        # other
        use_masking: bool = True,
        use_batch_norm: bool = True,
        use_scaled_pos_enc: bool = True,
):
    """Initialise the feed-forward Transformer TTS model.

    Wires together: a token-embedding Transformer encoder, a variance
    adaptor (duration/pitch/energy), a decoder-side Transformer stack
    (built from the same Encoder class with no input layer), an output
    linear projection, and an optional convolutional postnet.

    Args:
        idim: input vocabulary size (token embedding count).
        odim: output feature dimension per frame.
        adim: attention (model) dimension shared by encoder and decoder.
        aheads: number of attention heads.
        reduction_factor: number of output frames predicted per decoder
            step; feat_out emits odim * reduction_factor values.
        use_scaled_pos_enc: use ScaledPositionalEncoding instead of the
            plain PositionalEncoding.
        (remaining keyword arguments configure the encoder, variance
        adaptor, decoder, postnet, and parameter initialisation as their
        names indicate.)
    """
    super(FeedForwardTransformer, self).__init__()
    self.use_scaled_pos_enc = use_scaled_pos_enc
    self.reduction_factor = reduction_factor
    self.odim = odim
    self.use_masking = use_masking
    self.reporter = Reporter()
    # encoder
    pos_enc_class = ScaledPositionalEncoding if use_scaled_pos_enc else PositionalEncoding
    padding_idx: int = 0
    encoder_input_layer = nn.Embedding(num_embeddings=idim,
                                       embedding_dim=adim,
                                       padding_idx=padding_idx)
    self.encoder = Encoder(
        input_layer=encoder_input_layer,
        attention_dim=adim,
        attention_heads=aheads,
        linear_units=eunits,
        num_blocks=elayers,
        dropout_rate=transformer_enc_dropout_rate,
        positional_dropout_rate=transformer_enc_positional_dropout_rate,
        attention_dropout_rate=transformer_enc_atten_dropout_rate,
        pos_enc_class=pos_enc_class,
        normalized_before=encoder_normalized_before,
        # `concate_after` spelling matches the Encoder API as used here.
        concate_after=encoder_concat_after,
        positionwise_layer_type=positionwise_layer_type,
        positionwise_conv_kernel_size=positionwise_conv_kernel_size)
    self.variance_adaptor = VarianceAdaptor(
        adim=adim,
        pitch_dim=4,
        energy_dim=1,
        pitch_embed_kernel_size=pitch_embed_kernel_size,
        pitch_embed_dropout_rate=pitch_embed_dropout,
        energy_embed_kernel_size=energy_embed_kernel_size,
        energy_embed_dropout_rate=energy_embed_dropout,
        duration_predictor_layers=duration_predictor_layers,
        duration_predictor_chans=duration_predictor_chans,
        duration_predictor_kernel_size=duration_predictor_kernel_size,
        duration_predictor_dropout_rate=duration_predictor_dropout_rate)
    # The "decoder" is another Encoder stack with no input layer: it consumes
    # the length-regulated hidden states directly.
    self.decoder = Encoder(
        input_layer=None,
        attention_dim=adim,
        attention_heads=aheads,
        linear_units=dunits,
        num_blocks=dlayers,
        dropout_rate=transformer_dec_dropout_rate,
        positional_dropout_rate=transformer_dec_positional_dropout_rate,
        attention_dropout_rate=transformer_dec_atten_dropout_rate,
        pos_enc_class=pos_enc_class,
        normalized_before=decoder_normalized_before,
        concate_after=decoder_concat_after,
        positionwise_layer_type=positionwise_layer_type,
        positionwise_conv_kernel_size=positionwise_conv_kernel_size)
    # Projects decoder states to reduction_factor output frames at once.
    self.feat_out = nn.Linear(in_features=adim,
                              out_features=odim * reduction_factor)
    # NOTE(review): PostNet receives in_dim=idim although it refines
    # odim-sized spectrogram frames — confirm PostNet ignores in_dim.
    self.postnet = None if postnet_layers == 0 else PostNet(
        in_dim=idim,
        out_dim=odim,
        n_layers=postnet_layers,
        n_chans=postnet_chans,
        n_filts=postnet_filts,
        use_batch_norm=use_batch_norm,
        dropout_rate=postnet_dropout_rate)
    self._reset_parameters(init_type=transformer_init,
                           init_enc_alpha=initial_encoder_alpha,
                           init_dec_alpha=initial_decoder_alpha)
    self.duration_criterion = DurationPredictorLoss()
    self.mse_criterion = nn.MSELoss()
class FeedForwardTransformer(nn.Module):
    """Feed-forward Transformer TTS model: encoder + variance adaptor
    (duration/pitch/energy) + decoder + optional postnet, trained with
    L1 on spectrogram frames plus MSE/duration losses."""

    def __init__(
            self,
            idim: int,
            odim: int,
            adim: int = 384,
            aheads: int = 4,
            positionwise_layer_type: str = "linear",
            positionwise_conv_kernel_size: int = 3,
            # encoder
            eunits: int = 1536,
            elayers: int = 6,
            transformer_enc_dropout_rate: float = 0.0,
            transformer_enc_positional_dropout_rate: float = 0.0,
            transformer_enc_atten_dropout_rate: float = 0.0,
            encoder_normalized_before: bool = True,
            encoder_concat_after: bool = False,
            # variance
            pitch_embed_kernel_size: int = 1,
            pitch_embed_dropout: float = 0.0,
            energy_embed_kernel_size: int = 1,
            energy_embed_dropout: float = 0.0,
            duration_predictor_layers: int = 2,
            duration_predictor_chans: int = 256,
            duration_predictor_kernel_size: int = 3,
            duration_predictor_dropout_rate: float = 0.1,
            # decoder
            dlayers: int = 6,
            dunits: int = 1536,
            transformer_dec_dropout_rate: float = 0.1,
            transformer_dec_positional_dropout_rate: float = 0.1,
            transformer_dec_atten_dropout_rate: float = 0.1,
            decoder_normalized_before: bool = False,
            decoder_concat_after: bool = False,
            reduction_factor: int = 1,
            # postnet
            postnet_layers: int = 5,
            postnet_filts: int = 5,
            postnet_chans: int = 256,
            postnet_dropout_rate: float = 0.5,
            # init
            transformer_init: str = "pytorch",
            initial_encoder_alpha: float = 1.0,
            initial_decoder_alpha: float = 1.0,
            # other
            use_masking: bool = True,
            use_batch_norm: bool = True,
            use_scaled_pos_enc: bool = True,
    ):
        """Build all sub-modules.

        Args:
            idim: input vocabulary size (token embedding count).
            odim: output feature dimension per frame.
            adim: attention (model) dimension shared by encoder and decoder.
            reduction_factor: output frames predicted per decoder step.
            (remaining keyword arguments configure the encoder, variance
            adaptor, decoder, postnet, and initialisation as named.)
        """
        super(FeedForwardTransformer, self).__init__()
        self.use_scaled_pos_enc = use_scaled_pos_enc
        self.reduction_factor = reduction_factor
        self.odim = odim
        self.use_masking = use_masking
        self.reporter = Reporter()
        # encoder
        pos_enc_class = ScaledPositionalEncoding if use_scaled_pos_enc else PositionalEncoding
        padding_idx: int = 0
        encoder_input_layer = nn.Embedding(num_embeddings=idim,
                                           embedding_dim=adim,
                                           padding_idx=padding_idx)
        self.encoder = Encoder(
            input_layer=encoder_input_layer,
            attention_dim=adim,
            attention_heads=aheads,
            linear_units=eunits,
            num_blocks=elayers,
            dropout_rate=transformer_enc_dropout_rate,
            positional_dropout_rate=transformer_enc_positional_dropout_rate,
            attention_dropout_rate=transformer_enc_atten_dropout_rate,
            pos_enc_class=pos_enc_class,
            normalized_before=encoder_normalized_before,
            # `concate_after` spelling matches the Encoder API as used here.
            concate_after=encoder_concat_after,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size)
        self.variance_adaptor = VarianceAdaptor(
            adim=adim,
            pitch_dim=4,
            energy_dim=1,
            pitch_embed_kernel_size=pitch_embed_kernel_size,
            pitch_embed_dropout_rate=pitch_embed_dropout,
            energy_embed_kernel_size=energy_embed_kernel_size,
            energy_embed_dropout_rate=energy_embed_dropout,
            duration_predictor_layers=duration_predictor_layers,
            duration_predictor_chans=duration_predictor_chans,
            duration_predictor_kernel_size=duration_predictor_kernel_size,
            duration_predictor_dropout_rate=duration_predictor_dropout_rate)
        # The "decoder" is another Encoder stack with no input layer: it
        # consumes the length-regulated hidden states directly.
        self.decoder = Encoder(
            input_layer=None,
            attention_dim=adim,
            attention_heads=aheads,
            linear_units=dunits,
            num_blocks=dlayers,
            dropout_rate=transformer_dec_dropout_rate,
            positional_dropout_rate=transformer_dec_positional_dropout_rate,
            attention_dropout_rate=transformer_dec_atten_dropout_rate,
            pos_enc_class=pos_enc_class,
            normalized_before=decoder_normalized_before,
            concate_after=decoder_concat_after,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size)
        # Projects decoder states to reduction_factor output frames at once.
        self.feat_out = nn.Linear(in_features=adim,
                                  out_features=odim * reduction_factor)
        # NOTE(review): PostNet receives in_dim=idim although it refines
        # odim-sized spectrogram frames — confirm PostNet ignores in_dim.
        self.postnet = None if postnet_layers == 0 else PostNet(
            in_dim=idim,
            out_dim=odim,
            n_layers=postnet_layers,
            n_chans=postnet_chans,
            n_filts=postnet_filts,
            use_batch_norm=use_batch_norm,
            dropout_rate=postnet_dropout_rate)
        self._reset_parameters(init_type=transformer_init,
                               init_enc_alpha=initial_encoder_alpha,
                               init_dec_alpha=initial_decoder_alpha)
        self.duration_criterion = DurationPredictorLoss()
        self.mse_criterion = nn.MSELoss()

    def _source_mask(self, ilens: torch.LongTensor):
        """Build a square self-attention mask from sequence lengths."""
        x_masks = make_non_pad_mask(ilens).to(self.feat_out.weight.device)
        # Outer AND of the padding mask with itself yields (B, T, T).
        return x_masks.unsqueeze(-2) & x_masks.unsqueeze(-1)

    def _reset_parameters(self, init_type, init_enc_alpha: float = 1.0,
                          init_dec_alpha: float = 1.0):
        """Initialise weights and (optionally) positional-encoding alphas."""
        initialize(self, init_type)
        if self.use_scaled_pos_enc:
            # The last element of `embed` is the scaled positional encoding.
            self.encoder.embed[-1].alpha.data = torch.tensor(init_enc_alpha)
            self.decoder.embed[-1].alpha.data = torch.tensor(init_dec_alpha)

    def _forward(self,
                 xs: torch.FloatTensor,
                 ilens: torch.LongTensor,
                 olens: torch.LongTensor = None,
                 ds: torch.LongTensor = None,
                 ps: torch.FloatTensor = None,
                 es: torch.FloatTensor = None,
                 in_masks: torch.LongTensor = None,
                 out_masks: torch.LongTensor = None,
                 is_inference: bool = False):
        """Shared forward pass for training and inference.

        Returns (before_outs, after_outs) at inference time, or
        (before_outs, after_outs, d_outs, p_outs, e_outs) during training.
        """
        x_masks = self._source_mask(ilens)
        hs, _ = self.encoder.forward(xs, x_masks)  # ignore spk embedding
        # Variance-adaptor masks are the inverse of the padding masks.
        d_masks = ~in_masks if in_masks is not None else None
        v_masks = ~out_masks if out_masks is not None else None
        if is_inference:
            # Durations/pitch/energy are predicted rather than supplied.
            hs, d_outs, p_outs, e_outs = self.variance_adaptor.inference(
                hs, ilens, d_masks, v_masks)
        else:
            hs, d_outs, p_outs, e_outs = self.variance_adaptor.forward(
                hs, ds, ilens, ps, es, d_masks, v_masks)
        # forward decoder
        if olens is not None:
            if self.reduction_factor > 1:
                # Decoder operates on reduced-length output sequences.
                olens_in = olens.new(
                    [olen // self.reduction_factor for olen in olens])
            else:
                olens_in = olens
            h_masks = self._source_mask(olens_in)
        else:
            h_masks = None
        zs, _ = self.decoder.forward(hs, h_masks)
        before_outs = self.feat_out.forward(zs).view(zs.shape[0], -1, self.odim)
        # postnet: residual refinement of the raw frames (if configured).
        if self.postnet is None:
            after_outs = before_outs
        else:
            after_outs = before_outs + self.postnet(before_outs.transpose(
                1, 2)).transpose(1, 2)
        if is_inference:
            return before_outs, after_outs
        else:
            return before_outs, after_outs, d_outs, p_outs, e_outs

    def forward(self, xs: torch.FloatTensor, ilens: torch.LongTensor,
                ys: torch.FloatTensor, olens: torch.LongTensor,
                ds: torch.FloatTensor, ps: torch.FloatTensor,
                es: torch.FloatTensor):
        """Compute the training loss for a padded batch.

        Args:
            xs/ilens: padded token ids and their lengths.
            ys/olens: padded target frames and their lengths.
            ds, ps, es: target durations, pitch, and energy.

        Returns:
            Scalar loss (L1 + duration + pitch + energy); also reports the
            individual terms via the reporter.
        """
        # rm padded part
        xs = xs[:, :max(ilens)]
        ys = ys[:, :max(olens)]
        ds = ds[:, :max(ilens)]
        ps = ps[:, :max(olens)]
        es = es[:, :max(olens)]
        in_masks = make_non_pad_mask(ilens).to(xs.device)
        out_masks = make_non_pad_mask(olens).unsqueeze(-1).to(ys.device)
        # ignore spk embedding
        before_outs, after_outs, d_outs, p_outs, e_outs = \
            self._forward(xs, ilens, olens, ds, ps, es,
                          in_masks=in_masks, out_masks=out_masks,
                          is_inference=False)
        if self.reduction_factor > 1:
            # Trim targets so their length is divisible by reduction_factor.
            olens = olens.new(
                [olen - olen % self.reduction_factor for olen in olens])
            max_olen = max(olens)
            ys = ys[:, :max_olen]
        if self.use_masking:
            # Keep only non-padded positions for every loss term.
            d_outs = d_outs.masked_select(in_masks)
            ds = ds.masked_select(in_masks)
            before_outs = before_outs.masked_select(out_masks)
            after_outs = after_outs.masked_select(out_masks)
            ys = ys.masked_select(out_masks)
            p_outs = p_outs.masked_select(out_masks)
            e_outs = e_outs.masked_select(out_masks)
            ps = ps.masked_select(out_masks)
            es = es.masked_select(out_masks)
        # calculate loss
        if self.postnet is None:
            l1_loss = F.l1_loss(after_outs, ys)
        else:
            # Supervise both the raw and postnet-refined outputs.
            l1_loss = F.l1_loss(after_outs, ys) + F.l1_loss(before_outs, ys)
        duration_loss = self.duration_criterion(d_outs, ds)
        pitch_loss = self.mse_criterion(p_outs, ps)
        energy_loss = self.mse_criterion(e_outs, es)
        loss = l1_loss + duration_loss + pitch_loss + energy_loss
        # report loss
        report_keys = [{
            "l1_loss": l1_loss.item()
        }, {
            "duration_loss": duration_loss.item()
        }, {
            "pitch_loss": pitch_loss.item()
        }, {
            "energy_loss": energy_loss.item()
        }, {
            "loss": loss.item()
        }]
        if self.use_scaled_pos_enc:
            report_keys += [
                {
                    "encoder_alpha": self.encoder.embed[-1].alpha.data.item()
                },
                {
                    "decoder_alpha": self.decoder.embed[-1].alpha.data.item()
                },
            ]
        self.reporter.report(report_keys)
        return loss

    def inference(self, x: torch.LongTensor, y: torch.FloatTensor):
        """Generate output frames for a single unbatched token sequence.

        Note: `y` is accepted for API compatibility but not used here.
        """
        ilens = torch.LongTensor([x.shape[0]]).to(x.device)
        xs = x.unsqueeze(0)
        in_masks = make_non_pad_mask(ilens).to(xs.device)
        _, outs = self._forward(xs, ilens, in_masks=in_masks,
                                is_inference=True)
        # Strip the batch dimension again.
        return outs[0]

    # for reporting attentions
    def calculate_all_attentions(self, xs: torch.FloatTensor,
                                 ilens: torch.LongTensor,
                                 ys: torch.FloatTensor,
                                 olens: torch.LongTensor,
                                 ds: torch.LongTensor, ps: torch.FloatTensor,
                                 es: torch.FloatTensor):
        """Collect attention weights from every MultiHeadedAttention module,
        trimmed to the true sequence lengths, plus the predicted fbank."""
        with torch.no_grad():
            # remove unnecessary padded part
            xs = xs[:, :max(ilens)]
            ds = ds[:, :max(ilens)]
            ys = ys[:, :max(olens)]
            ps = ps[:, :max(olens)]
            es = es[:, :max(olens)]
            in_masks = make_non_pad_mask(ilens).to(xs.device)
            out_masks = make_non_pad_mask(olens).unsqueeze(-1).to(xs.device)
            outs = self._forward(xs, ilens, olens, ds, ps, es, in_masks,
                                 out_masks, is_inference=False)[0]
            att_ws_dict = dict()
            for name, m in self.named_modules():
                if isinstance(m, MultiHeadedAttention):
                    atten = m.atten.cpu().numpy()
                    if "encoder" in name:
                        atten = [
                            a[:, :l, :l] for a, l in zip(atten, ilens.tolist())
                        ]
                    elif "decoder" in name:
                        if "src" in name:
                            atten = [
                                a[:, :ol, :il] for a, il, ol in zip(
                                    atten, ilens.tolist(), olens.tolist())
                            ]
                        elif "self" in name:
                            atten = [
                                a[:, :l, :l]
                                for a, l in zip(atten, olens.tolist())
                            ]
                        else:
                            logging.warning(f"unknown attention module: {name}")
                    else:
                        logging.warning(f"unknown attention module: {name}")
                    att_ws_dict[name] = atten
            att_ws_dict["predicted_fbank"] = [
                m[:l].T for m, l in zip(outs.cpu().numpy(), olens.tolist())
            ]
        return att_ws_dict

    @property
    def attention_plot_class(self):
        # Plot helper class used by the training framework.
        return TTSPlot

    @property
    def base_plot_keys(self):
        """Loss keys (plus alpha values when scaled pos-enc is on) to plot."""
        plot_keys = ["loss", "l1_loss", "duration_loss"]
        if self.use_scaled_pos_enc:
            plot_keys += ["encoder_alpha", "decoder_alpha"]
        return plot_keys
def main():
    """Runs the main training loop

    Creates tensorboard visualizations and saves models after each epoch.
    The three genre datasets / decoders / train steps that were previously
    copy-pasted three times are factored into helpers and loops; all printed
    strings, summary tags, and file names are unchanged.
    """
    args = PARSER.parse_args()
    start = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    print('Started training with arguments {}'.format(sys.argv))
    np.random.seed(args.random_seed)

    def read_file_list(path):
        # One training-file path per line; strip trailing whitespace/newlines.
        with open(path, 'r') as reader:
            return [line.strip() for line in reader.readlines()]

    def build_dataset(file_list):
        # Deserialize, shuffle (seeded for reproducibility), and batch.
        dataset = get_deserialized_dataset(file_list,
                                           args.training_record_byte_size,
                                           scale_data=args.scale_data)
        dataset = dataset.shuffle(buffer_size=args.shuffle_buffer_size,
                                  seed=args.random_seed)
        return dataset.batch(args.batch_size)

    # One dataset per genre, built identically.
    training_datasets = [
        build_dataset(read_file_list(path))
        for path in (args.training_files_0, args.training_files_1,
                     args.training_files_2)
    ]

    # Shared encoder trunk + genre classifier head.
    waveform_inputs = Input(shape=(44100, 1), name='waveform_inputs')
    encoded_data = Encoder(args.encoder_blocks,
                           args.encoder_layers,
                           args.encoder_channels,
                           args.encoder_kernel_size,
                           args.encoder_pool,
                           name='encoder')(waveform_inputs)
    classified_data = Classifier(args.n_classes,
                                 channels=args.classifier_channels,
                                 kernel_size=args.classifier_kernel_size,
                                 classifier_layers=args.classifier_layers,
                                 rate=args.classifier_dropout_rate,
                                 name='classifier')(encoded_data)
    classifier_model = Model(inputs=waveform_inputs, outputs=classified_data)

    def classifier_loss(target_genres, pred_logits):
        return sparse_categorical_crossentropy(target_genres,
                                               pred_logits,
                                               from_logits=True)

    classifier_optimizer = tf.keras.optimizers.Adam()
    classifier_loss_history = []

    def classifier_train_step(waveform_list, genres_list):
        """Performs a step of the classifier model

        arguments will be lists of tensors, with each element being from a
        different genre
        """
        waveforms = tf.concat(waveform_list, 0)
        genres = tf.concat(genres_list, 0)
        with tf.GradientTape() as tape:
            logits = classifier_model(waveforms, training=True)
            loss_value = classifier_loss(genres, logits)
        classifier_loss_history.append(loss_value.numpy().mean())
        grads = tape.gradient(loss_value, classifier_model.trainable_variables)
        classifier_optimizer.apply_gradients(
            zip(grads, classifier_model.trainable_variables))

    def transformer_loss(target_waveform, pred_waveform, target_genres,
                         pred_genres):
        # Reconstruction term minus a (scaled) genre-confusion term:
        # the transformer is rewarded for fooling the classifier.
        waveform_loss = sparse_categorical_crossentropy(target_waveform,
                                                        pred_waveform,
                                                        from_logits=True)
        genre_loss = sparse_categorical_crossentropy(target_genres,
                                                     pred_genres,
                                                     from_logits=True)
        return tf.reduce_sum(waveform_loss,
                             axis=-1) - 0.01 * 44100 * genre_loss

    def make_transformer(index):
        # Build one decoder head (decoder_<index>), its model, optimizer,
        # loss history, and train-step closure. Only the layer name differs
        # between the three transformers.
        transformed_data = Decoder(args.decoder_blocks,
                                   args.decoder_layers,
                                   args.decoder_residual_channels,
                                   args.decoder_skip_channels,
                                   args.decoder_kernel_size,
                                   name='decoder_{}'.format(index))(
                                       waveform_inputs, encoded_data)
        model = Model(inputs=waveform_inputs,
                      outputs=[transformed_data, classified_data])
        optimizer = tf.keras.optimizers.Adam()
        loss_history = []

        def train_step(augmented_waveforms, waveforms, genres):
            """Performs a step of a transformer model"""
            with tf.GradientTape() as tape:
                waveform_logits, genre_logits = model(augmented_waveforms,
                                                      training=True)
                loss_value = transformer_loss(waveforms, waveform_logits,
                                              genres, genre_logits)
            loss_history.append(loss_value.numpy().mean())
            grads = tape.gradient(loss_value, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        return model, train_step, loss_history

    # (model, train_step, loss_history) triple per genre transformer.
    transformers = [make_transformer(i) for i in range(3)]

    log_dir = "logs/fit/" + start
    models_dir = 'models/' + start
    # makedirs (vs. mkdir) also creates the 'models/' parent when missing.
    os.makedirs(models_dir)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                          write_images=False)
    tensorboard_callback.set_model(classifier_model)
    summary_writer = tf.summary.create_file_writer(log_dir + '/train')

    def log_losses(step):
        # Write the latest loss of every model to tensorboard.
        with summary_writer.as_default():
            tf.summary.scalar('classifier loss',
                              classifier_loss_history[-1],
                              step=step)
            for i, (_, _, history) in enumerate(transformers):
                tf.summary.scalar('transformer {} loss'.format(i),
                                  history[-1],
                                  step=step)
            tf.summary.flush()

    def train(epochs):
        global_steps = 0
        for epoch in range(epochs):
            # Iterate the three genre datasets in lockstep; each yields
            # (genre_code, waveform, augmented_waveform) batches.
            for batch, batches in enumerate(zip(*training_datasets)):
                genre_codes = [b[0] for b in batches]
                waveforms = [b[1] for b in batches]
                augmented_waveforms = [b[2] for b in batches]
                print('Epoch {} batch {} fit:'.format(epoch, batch))
                classifier_start = time.time()
                classifier_train_step(augmented_waveforms, genre_codes)
                classifier_time = time.time() - classifier_start
                print('\tClassifier, fit time {}, loss {}'.format(
                    classifier_time, classifier_loss_history[-1]))
                for i, (_, train_step, history) in enumerate(transformers):
                    step_start = time.time()
                    train_step(augmented_waveforms[i], waveforms[i],
                               genre_codes[i])
                    step_time = time.time() - step_start
                    print('\tTransformer {}, fit time {}, loss {}'.format(
                        i, step_time, history[-1]))
                if (batch + 1) % 100 == 0:
                    log_losses(global_steps)
                global_steps += 1
            # save weights every epoch
            print('Saving model weights')
            classifier_model_save_str = \
                '/classifier_weights.{:d}-{:.2f}.h5'.format(
                    epoch, classifier_loss_history[-1])
            classifier_model.save_weights(models_dir +
                                          classifier_model_save_str)
            print(
                'Finished training epoch {}, epoch losses are:'.format(epoch))
            print('\tClassifier loss = {}'.format(classifier_loss_history[-1]))
            for i, (_, _, history) in enumerate(transformers):
                print('\tTransformer {} loss = {}'.format(i, history[-1]))
            for i, (model, _, history) in enumerate(transformers):
                transformer_save_str = \
                    '/transformer_{}_weights.{:d}-{:.2f}.h5'.format(
                        i, epoch, history[-1])
                model.save_weights(models_dir + transformer_save_str)
            # Per-epoch tensorboard snapshot of the final batch losses.
            log_losses(global_steps)

    train(args.max_epochs)