def blstm_layer(self, input):
    """Run ``input`` through a stacked bidirectional RNN, then dropout.

    ``self.num_layers`` forward cells and the same number of backward cells
    are created with ``get_rnn_cell(self.rnn_size, self.dropout_rate)`` and
    passed to ``stack_bidirectional_dynamic_rnn`` inside the ``rnn_layer``
    variable scope, using ``self.lengths`` as the sequence lengths.

    :param input: tensor fed to the RNN stack (presumably
        ``[batch, time, features]`` -- TODO confirm against the caller).
    :return: the RNN output after ``tf.layers.dropout``.
    """

    def build_cells():
        # One independent cell per layer; called once per direction.
        return [
            get_rnn_cell(self.rnn_size, self.dropout_rate)
            for _ in range(self.num_layers)
        ]

    with tf.variable_scope('rnn_layer'):
        forward_cells = build_cells()
        backward_cells = build_cells()
        rnn_output, _fw_states, _bw_states = stack_bidirectional_dynamic_rnn(
            forward_cells,
            backward_cells,
            input,
            sequence_length=self.lengths,
            dtype=tf.float32)
        merged = tf.concat(rnn_output, axis=2)
        # NOTE(review): the drop rate is 1 - self.dropout_rate, which implies
        # self.dropout_rate is a keep probability -- confirm.
        dropped = tf.layers.dropout(
            merged, 1. - self.dropout_rate, training=self.is_training)
    return dropped
def rnn_layers(x, seq_length, training, hidden_num=100, layer_num=3, class_n=5):
    """Build a stacked bidirectional LSTM followed by a per-frame classifier.

    Args:
        x: Input tensor for the RNN (presumably [batch_size, max_time,
            channel] -- TODO confirm against the caller).
        seq_length: Passed as ``sequence_length`` to
            ``stack_bidirectional_dynamic_rnn``.
        training: Accepted for interface compatibility; not used by this
            implementation (the batch-norm cells that used it are disabled).
        hidden_num: Number of units of every LSTM cell.
        layer_num: Number of stacked layers per direction.
        class_n: Number of output classes.

    Returns:
        Logits tensor reshaped to [batch_size, max_time, class_n].
    """
    cells_fw = []
    cells_bw = []
    for _ in range(layer_num):
        # Forward and backward cells are created alternately, as before.
        cells_fw.append(LSTMCell(hidden_num))
        cells_bw.append(LSTMCell(hidden_num))

    with tf.variable_scope('BDLSTM_rnn') as scope:
        lasth, _, _ = stack_bidirectional_dynamic_rnn(
            cells_fw=cells_fw,
            cells_bw=cells_bw,
            inputs=x,
            sequence_length=seq_length,
            dtype=tf.float32,
            scope=scope)

    # lasth is reshaped below as [batch_size, max_time, 2, hidden_num].
    # Use the static dimensions when they are known; fall back to the runtime
    # shape otherwise, because tf.reshape cannot take None as a dimension.
    static_batch, static_time = lasth.get_shape().as_list()[:2]
    batch_size = static_batch if static_batch is not None else tf.shape(lasth)[0]
    max_time = static_time if static_time is not None else tf.shape(lasth)[1]

    with tf.variable_scope('rnn_fnn_layer'):
        weight_out = tf.Variable(
            tf.truncated_normal([2, hidden_num],
                                stddev=np.sqrt(2.0 / (2 * hidden_num))),
            name='weights')
        biases_out = tf.Variable(tf.zeros([hidden_num]), name='bias')
        weight_class = tf.Variable(
            tf.truncated_normal([hidden_num, class_n],
                                stddev=np.sqrt(2.0 / hidden_num)),
            name='weights_class')
        bias_class = tf.Variable(tf.zeros([class_n]), name='bias_class')

        # Split the last axis into two halves of hidden_num and merge them
        # with a learned element-wise weighted sum.
        lasth_rs = tf.reshape(lasth, [batch_size, max_time, 2, hidden_num],
                              name='lasth_rs')
        lasth_output = tf.nn.bias_add(
            tf.reduce_sum(tf.multiply(lasth_rs, weight_out), axis=2),
            biases_out,
            name='lasth_bias_add')
        # Flatten batch and time so the class projection is one matmul.
        lasth_output_rs = tf.reshape(lasth_output,
                                     [batch_size * max_time, hidden_num],
                                     name='lasto_rs')
        logits = tf.reshape(
            tf.nn.bias_add(tf.matmul(lasth_output_rs, weight_class),
                           bias_class),
            [batch_size, max_time, class_n],
            name="rnn_logits_rs")
        variable_summaries(weight_class)
        variable_summaries(biases_out)
    return logits
def _createStackBidirectionalDynamicRNN(self,
                                        use_gpu,
                                        use_shape,
                                        use_state_tuple,
                                        initial_states_fw=None,
                                        initial_states_bw=None,
                                        scope=None):
    """Build a two-layer stacked bidirectional dynamic LSTM test graph.

    Stores the layer sizes in ``self.layers`` and the created cells in
    ``self.cells_fw`` / ``self.cells_bw``, and asserts the static output shape.

    Note: ``use_gpu`` and ``use_state_tuple`` are not read in this body; the
    cells are always built with ``state_is_tuple=False``.

    Returns:
        Tuple ``(input_value, inputs, outputs, st_fw, st_bw,
        sequence_length)``, where ``input_value`` is a random
        ``(batch_size, input_size)`` numpy array and ``inputs`` is a list
        holding the same placeholder ``max_length`` times.
    """
    input_size, batch_size, max_length = 5, 2, 8
    self.layers = [2, 3]

    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, seed=self._seed)
    sequence_length = array_ops.placeholder(dtypes.int64)

    def make_cells():
        # One LSTM cell per entry of self.layers.
        return [
            rnn_cell.LSTMCell(
                num_units,
                input_size,
                initializer=initializer,
                state_is_tuple=False) for num_units in self.layers
        ]

    self.cells_fw = make_cells()
    self.cells_bw = make_cells()

    # A single placeholder object is repeated for every time step.
    step_shape = (batch_size, input_size) if use_shape else (None, input_size)
    step_input = array_ops.placeholder(dtypes.float32, shape=step_shape)
    inputs = [step_input] * max_length

    # Stack along a new leading time axis, then move batch to the front.
    inputs_c = array_ops.transpose(array_ops.stack(inputs), [1, 0, 2])

    outputs, st_fw, st_bw = contrib_rnn.stack_bidirectional_dynamic_rnn(
        self.cells_fw,
        self.cells_bw,
        inputs_c,
        initial_states_fw=initial_states_fw,
        initial_states_bw=initial_states_bw,
        dtype=dtypes.float32,
        sequence_length=sequence_length,
        scope=scope)

    # Expected static shape: (batch_size or None, max_length, 2 * layers[-1]).
    expected_batch = batch_size if use_shape else None
    output_shape = [expected_batch, max_length, 2 * self.layers[-1]]
    self.assertAllEqual(outputs.get_shape().as_list(), output_shape)

    input_value = np.random.randn(batch_size, input_size)
    return input_value, inputs, outputs, st_fw, st_bw, sequence_length
def _build_rnn_op(self):
    """Build the bidirectional RNN over ``self.word_emb``.

    Uses ``stack_bidirectional_dynamic_rnn`` when ``self.cfg["use_stack_rnn"]``
    is truthy and ``bidirectional_dynamic_rnn`` otherwise. The first element
    of the result is concatenated on the last axis, passed through dropout,
    stored in ``self.rnn_outs``, and its static shape is printed.
    """
    with tf.variable_scope("bi_directional_rnn"):
        cell_fw = self._create_rnn_cell()
        cell_bw = self._create_rnn_cell()

        # Both builders are called with the same arguments here.
        if self.cfg["use_stack_rnn"]:
            birnn = stack_bidirectional_dynamic_rnn
        else:
            birnn = bidirectional_dynamic_rnn
        encoded, *_states = birnn(
            cell_fw,
            cell_bw,
            self.word_emb,
            dtype=tf.float32,
            sequence_length=self.seq_len)

        encoded = tf.concat(encoded, axis=-1)
        self.rnn_outs = tf.layers.dropout(
            encoded, rate=self.drop_rate, training=self.is_train)
        print("rnn output shape: {}".format(
            self.rnn_outs.get_shape().as_list()))
def __call__(self,
             inputs,
             seq_len,
             return_last_state=False,
             time_major=False):
    """Apply the stacked bidirectional RNN to ``inputs``.

    Args:
        inputs: Input tensor; flattened with ``flatten(inputs, keep=2)``
            before the RNN (per the original comments this yields
            ``[-1, max_time, dim]`` -- the helper is defined elsewhere).
        seq_len: Sequence lengths; flattened with ``flatten(seq_len, keep=0)``.
        return_last_state: If true, return the concatenated ``.h`` states of
            the last forward and backward layers, not the per-step outputs.
        time_major: Must be false; time-major input is not supported.

    Returns:
        The RNN result restored to the leading shape of ``inputs`` through
        ``reconstruct``.

    Raises:
        AssertionError: If ``time_major`` is true.
    """
    assert not time_major, "StackBiRNN class cannot support time_major currently"
    with tf.variable_scope(self.scope):
        flat_inputs = flatten(inputs, keep=2)
        seq_len = flatten(seq_len, keep=0)
        # Fix: the backward stack must be self.cells_bw. The original passed
        # self.cells_fw for both directions.
        outputs, states_fw, states_bw = stack_bidirectional_dynamic_rnn(
            self.cells_fw,
            self.cells_bw,
            flat_inputs,
            sequence_length=seq_len,
            dtype=tf.float32)
        if return_last_state:
            # states_fw / states_bw hold one final state per layer; take the
            # hidden state (.h) of the top layer in each direction.
            last_layer = self.num_layers - 1
            h_fw = states_fw[last_layer].h
            h_bw = states_bw[last_layer].h
            output = tf.concat([h_fw, h_bw], axis=-1)
            # remove_shape=1 drops the max_time dimension on reconstruction.
            output = reconstruct(output, ref=inputs, keep=2, remove_shape=1)
        else:
            output = tf.concat(outputs, axis=-1)
            output = reconstruct(output, ref=inputs, keep=2)
        return output
def encode(self, inputs, sequence_length, **kwargs):
    """Encode ``inputs`` with a stacked bidirectional RNN.

    Sets a uniform initializer in ``[-init_scale, init_scale]`` on the current
    variable scope, builds forward and backward cells from
    ``self.params["rnn_cell"]``, and runs
    ``rnn.stack_bidirectional_dynamic_rnn``. Extra ``kwargs`` are forwarded to
    that call.

    Returns:
        An ``EncoderOutput`` whose ``outputs`` and ``attention_values`` are
        the RNN outputs and whose ``final_state`` is the
        ``(forward, backward)`` state pair.
    """
    init_scale = self.params["init_scale"]
    tf.get_variable_scope().set_initializer(
        tf.random_uniform_initializer(-init_scale, init_scale))

    # Build both cells first, then unpack each into its per-layer list.
    forward_cell = training_utils.get_rnn_cell(**self.params["rnn_cell"])
    backward_cell = training_utils.get_rnn_cell(**self.params["rnn_cell"])
    forward_layers = _unpack_cell(forward_cell)
    backward_layers = _unpack_cell(backward_cell)

    outputs_concat, state_fw, state_bw = rnn.stack_bidirectional_dynamic_rnn(
        cells_fw=forward_layers,
        cells_bw=backward_layers,
        inputs=inputs,
        dtype=tf.float32,
        sequence_length=sequence_length,
        **kwargs)

    return EncoderOutput(
        outputs=outputs_concat,
        final_state=(state_fw, state_bw),
        attention_values=outputs_concat,
        attention_values_length=sequence_length)
def encode(self, inputs, sequence_length, **kwargs):
    """Run the stacked bidirectional RNN encoder over ``inputs``.

    The current variable scope receives a uniform initializer bounded by
    ``self.params["init_scale"]``. Forward and backward cells are both built
    from ``self.params["rnn_cell"]`` and unpacked with ``_unpack_cell``.
    Additional ``kwargs`` go to ``rnn.stack_bidirectional_dynamic_rnn``.

    Returns:
        ``EncoderOutput`` with the RNN outputs (also used as attention
        values), the ``(forward, backward)`` final states, and
        ``sequence_length`` as the attention length.
    """
    bound = self.params["init_scale"]
    scope = tf.get_variable_scope()
    scope.set_initializer(tf.random_uniform_initializer(-bound, bound))

    cell_fw, cell_bw = [
        training_utils.get_rnn_cell(**self.params["rnn_cell"])
        for _ in range(2)
    ]
    cells_fw, cells_bw = _unpack_cell(cell_fw), _unpack_cell(cell_bw)

    rnn_result = rnn.stack_bidirectional_dynamic_rnn(
        cells_fw=cells_fw,
        cells_bw=cells_bw,
        inputs=inputs,
        dtype=tf.float32,
        sequence_length=sequence_length,
        **kwargs)
    outputs_concat, final_fw, final_bw = rnn_result

    return EncoderOutput(
        outputs=outputs_concat,
        final_state=(final_fw, final_bw),
        attention_values=outputs_concat,
        attention_values_length=sequence_length)
def _createStackBidirectionalDynamicRNN(self,
                                        use_gpu,
                                        use_shape,
                                        use_state_tuple,
                                        initial_states_fw=None,
                                        initial_states_bw=None,
                                        scope=None):
    """Create the stacked bidirectional dynamic LSTM graph used by the tests.

    Side effects: sets ``self.layers``, ``self.cells_fw`` and
    ``self.cells_bw``, and asserts the static shape of the RNN output.

    Note: ``use_gpu`` and ``use_state_tuple`` are not read in this body; every
    cell is created with ``state_is_tuple=False``.

    Returns:
        ``(input_value, inputs, outputs, st_fw, st_bw, sequence_length)``.
        ``inputs`` is a list containing one placeholder repeated
        ``max_length`` times; ``input_value`` is a random numpy array of shape
        ``(batch_size, input_size)``.
    """
    self.layers = [2, 3]
    input_size = 5
    batch_size = 2
    max_length = 8

    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, seed=self._seed)
    sequence_length = array_ops.placeholder(dtypes.int64)

    def lstm_stack():
        # Builds one LSTM cell per configured layer width.
        stack = []
        for num_units in self.layers:
            stack.append(
                core_rnn_cell_impl.LSTMCell(
                    num_units,
                    input_size,
                    initializer=initializer,
                    state_is_tuple=False))
        return stack

    self.cells_fw = lstm_stack()
    self.cells_bw = lstm_stack()

    if use_shape:
        placeholder_shape = (batch_size, input_size)
    else:
        placeholder_shape = (None, input_size)
    # The same placeholder is reused for every time step.
    inputs = [array_ops.placeholder(dtypes.float32,
                                    shape=placeholder_shape)] * max_length

    # Stack on a new leading axis, then transpose to batch-major.
    stacked_inputs = array_ops.stack(inputs)
    inputs_c = array_ops.transpose(stacked_inputs, [1, 0, 2])

    outputs, st_fw, st_bw = rnn.stack_bidirectional_dynamic_rnn(
        self.cells_fw,
        self.cells_bw,
        inputs_c,
        initial_states_fw=initial_states_fw,
        initial_states_bw=initial_states_bw,
        dtype=dtypes.float32,
        sequence_length=sequence_length,
        scope=scope)

    # Output shape is (batch_size, max_length, 2 * layers[-1]); the batch
    # dimension is only static when use_shape is set.
    output_shape = [
        batch_size if use_shape else None, max_length, 2 * self.layers[-1]
    ]
    self.assertAllEqual(outputs.get_shape().as_list(), output_shape)

    input_value = np.random.randn(batch_size, input_size)
    return input_value, inputs, outputs, st_fw, st_bw, sequence_length
def __call__(self, inputs, seq_len):
    """Apply the stacked bidirectional RNN and return its outputs.

    The forward/backward cell lists ``self.cells_fw`` / ``self.cells_bw`` are
    run over ``inputs`` inside the ``self.scope`` variable scope. The final
    states returned by the RNN are discarded.
    """
    with tf.variable_scope(self.scope):
        rnn_result = stack_bidirectional_dynamic_rnn(
            self.cells_fw,
            self.cells_bw,
            inputs,
            sequence_length=seq_len,
            dtype=tf.float32)
    # Only the first element (the outputs) is of interest to callers.
    output, *_final_states = rnn_result
    return output
def _build(self, inputs, lengths):
    """Run the stacked bidirectional RNN over ``inputs``.

    The per-layer cells are taken from the private ``_cells`` attribute of
    ``self.cell_fw`` and ``self.cell_bw``.

    Returns:
        ``(outputs, final_state)`` where ``final_state`` is the tuple
        ``(final_fw_state, final_bw_state)``. The two states are paired, not
        concatenated.
    """
    rnn_result = rnn.stack_bidirectional_dynamic_rnn(
        cells_fw=self.cell_fw._cells,
        cells_bw=self.cell_bw._cells,
        inputs=inputs,
        sequence_length=lengths,
        dtype=tf.float32)
    outputs, final_fw_state, final_bw_state = rnn_result
    return outputs, (final_fw_state, final_bw_state)
def build_net_aux(self, inputs, lengths):
    """Build the auxiliary network that produces a speaker embedding.

    Pipeline: reshape ``inputs`` to ``[batch_size, -1, input_size]``, run a
    stacked bidirectional LSTM (scope ``blstm_aux``), apply a ReLU layer
    (``layer2_aux``) and a linear layer (``layer3_aux``) per frame, then sum
    over the frame axis and divide by ``self._lengths_aux``.

    Args:
        inputs: Auxiliary input features.
        lengths: Sequence lengths passed to the RNN.

    Returns:
        ``spk_embed``: the length-normalized sum of the per-frame outputs.
    """
    cfg = self._config
    hidden = cfg.aux_hidden_size
    outputs = tf.reshape(inputs, [cfg.batch_size, -1, cfg.input_size])

    # BLSTM layer
    with tf.variable_scope('blstm_aux'):
        # Dropout is only wrapped around cells outside inference.
        use_dropout = not self._infer and cfg.keep_prob < 1.0

        def lstm_cell():
            base = tf.contrib.rnn.BasicLSTMCell(hidden)
            if use_dropout:
                return tf.contrib.rnn.DropoutWrapper(
                    base, output_keep_prob=cfg.keep_prob)
            return base

        def multi_cell():
            # tf.nn.rnn_cell.MultiRNNCell in r1.12
            return tf.contrib.rnn.MultiRNNCell(
                [lstm_cell() for _ in range(cfg.rnn_num_layers)],
                state_is_tuple=True)

        lstm_fw_cell = multi_cell()
        lstm_bw_cell = multi_cell()
        fw_layers = self._unpack_cell(lstm_fw_cell)
        bw_layers = self._unpack_cell(lstm_bw_cell)

        outputs, _fw_final, _bw_final = rnn.stack_bidirectional_dynamic_rnn(
            cells_fw=fw_layers,
            cells_bw=bw_layers,
            inputs=outputs,
            dtype=tf.float32,
            sequence_length=lengths)
        # Flatten batch and time so the dense layers are plain matmuls.
        outputs = tf.reshape(outputs, [-1, 2 * hidden])

    with tf.variable_scope('layer2_aux'):
        weights2, biases2 = self._weight_and_bias(2 * hidden, hidden)
        outputs = tf.nn.relu(tf.matmul(outputs, weights2) + biases2)

    with tf.variable_scope('layer3_aux'):
        weights3, biases3 = self._weight_and_bias(hidden, cfg.aux_output_size)
        outputs = tf.matmul(outputs, weights3) + biases3
        outputs = tf.reshape(outputs,
                             [cfg.batch_size, -1, cfg.aux_output_size])

    # Average over the frames to get the speaker embedding.
    # NOTE(review): the divisor is self._lengths_aux, not the `lengths`
    # argument -- confirm the two always agree.
    frame_sum = tf.reduce_sum(outputs, 1)
    frame_count = tf.reshape(tf.to_float(self._lengths_aux), (-1, 1))
    spk_embed = frame_sum / frame_count
    return spk_embed
def my_rnn_layers(x,
                  seq_length,
                  training,
                  hidden_num=200,
                  layer_num=5,
                  class_n=5,
                  cell='BNLSTM',
                  dtype=tf.float32):
    """Generate stacked bidirectional RNN layers.

    Args:
        x (Float): A 3D-Tensor of shape [batch_size,max_time,channel]
        seq_length (Int): A 1D-Tensor of shape [batch_size], real length of
            each sequence.
        training (Boolean): A 0D-Tensor indicating whether the graph is in
            training mode; only passed to the 'BNLSTM' cells.
        hidden_num (int, optional): Defaults to 200. Size of the hidden state
            of every cell.
        layer_num (int, optional): Defaults to 5. Number of layers in the RNN.
        class_n (int, optional): Defaults to 5. Not used by this function.
        cell (str): One of 'LSTM', 'GRU', 'BNLSTM', the RNN cell used.
            BNLSTM stands for batch-normalization LSTM cell.
        dtype: Data type passed to the RNN.

    Returns:
        lasth: The first element returned by
        ``stack_bidirectional_dynamic_rnn`` (the RNN outputs). No
        classification layer is applied here.

    Raises:
        ValueError: If ``cell`` is not a recognized cell type (raised while
            building the first layer).
    """

    def new_cell():
        # Builds a single cell of the requested kind.
        if cell == 'LSTM':
            return LSTMCell(hidden_num)
        if cell == 'GRU':
            return GRUCell(hidden_num)
        if cell == 'BNLSTM':
            return BNLSTMCell(hidden_num, training=training)
        raise ValueError("Cell type unrecognized.")

    cells_fw = []
    cells_bw = []
    for _ in range(layer_num):
        # Forward cell first, then backward cell, layer by layer.
        cells_fw.append(new_cell())
        cells_bw.append(new_cell())

    with tf.variable_scope('BDLSTM_rnn') as scope:
        rnn_result = stack_bidirectional_dynamic_rnn(
            cells_fw=cells_fw,
            cells_bw=cells_bw,
            inputs=x,
            sequence_length=seq_length,
            dtype=dtype,
            scope=scope)
    lasth, _, _ = rnn_result
    return lasth
def encode(self, inputs, sequence_length, **kwargs):
    """Encode ``inputs`` with a stacked bidirectional RNN.

    Forward and backward cells are built from ``self.params["rnn_cell"]`` and
    unpacked into per-layer lists with ``_unpack_cell``. Extra ``kwargs`` are
    forwarded to ``rnn.stack_bidirectional_dynamic_rnn``.

    Returns:
        An ``EncoderOutput`` carrying the RNN outputs (also used as attention
        values), the ``(forward, backward)`` final-state pair, and
        ``sequence_length`` as the attention length.
    """
    cell_fw = training_utils.get_rnn_cell(**self.params["rnn_cell"])
    cell_bw = training_utils.get_rnn_cell(**self.params["rnn_cell"])

    outputs_concat, state_fw, state_bw = rnn.stack_bidirectional_dynamic_rnn(
        cells_fw=_unpack_cell(cell_fw),
        cells_bw=_unpack_cell(cell_bw),
        inputs=inputs,
        dtype=tf.float32,
        sequence_length=sequence_length,
        **kwargs)

    encoder_fields = dict(
        outputs=outputs_concat,
        final_state=(state_fw, state_bw),
        attention_values=outputs_concat,
        attention_values_length=sequence_length)
    return EncoderOutput(**encoder_fields)
def _CreateCudnnCompatibleCanonicalRNN(rnn, inputs, is_bidi=False, scope=None):
    """Build a canonical (non-cuDNN) RNN matching the configuration of ``rnn``.

    The cell type is chosen from ``rnn.rnn_mode``; ``rnn.num_units`` and
    ``rnn.num_layers`` size the stack. Inputs are treated as time-major.

    Args:
        rnn: Object exposing ``rnn_mode``, ``num_units`` and ``num_layers``.
        inputs: Time-major input tensor.
        is_bidi: Build a stacked bidirectional RNN when true.
        scope: Variable scope forwarded to the RNN builder.

    Returns:
        The result of ``rnn_lib.dynamic_rnn`` when unidirectional, otherwise
        ``(outputs, (output_state_fw, output_state_bw))``.

    Raises:
        ValueError: If the mode is not one of the supported cuDNN modes.
    """
    mode, num_units, num_layers = rnn.rnn_mode, rnn.num_units, rnn.num_layers

    # To reuse cuDNN-trained models, must use cudnn compatible rnn cells.
    if mode == CUDNN_LSTM:

        def single_cell():
            return cudnn_rnn_ops.CudnnCompatibleLSTMCell(num_units)
    elif mode == CUDNN_GRU:

        def single_cell():
            return cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units)
    elif mode == CUDNN_RNN_TANH:

        def single_cell():
            return rnn_cell_impl.BasicRNNCell(num_units, math_ops.tanh)
    elif mode == CUDNN_RNN_RELU:

        def single_cell():
            return rnn_cell_impl.BasicRNNCell(num_units, gen_nn_ops.relu)
    else:
        raise ValueError("%s is not supported!" % mode)

    def cell_stack():
        # One fresh cell per layer.
        return [single_cell() for _ in range(num_layers)]

    if is_bidi:
        cells_fw = cell_stack()
        cells_bw = cell_stack()
        bidi_result = contrib_rnn_lib.stack_bidirectional_dynamic_rnn(
            cells_fw,
            cells_bw,
            inputs,
            dtype=dtypes.float32,
            time_major=True,
            scope=scope)
        outputs, output_state_fw, output_state_bw = bidi_result
        return outputs, (output_state_fw, output_state_bw)

    cell = rnn_cell_impl.MultiRNNCell(cell_stack())
    return rnn_lib.dynamic_rnn(
        cell, inputs, dtype=dtypes.float32, time_major=True, scope=scope)
def _build_model_op(self):
    """Build the encoder, the optional attention layer and the projection.

    Steps, all driven by ``self.cfg``:
      1. Bidirectional RNN over ``self.word_emb`` (stacked variant when
         ``use_stack_rnn``), followed by dropout, an optional residual
         projection of the word embeddings and optional layer normalization.
      2. Optional attention: ``"self_attention"`` (multi-head) or
         ``"normal_attention"`` (an ``AttentionCell`` run time-major).
      3. Dense projection to ``self.tag_vocab_size``, stored in
         ``self.logits``.
    """
    cfg = self.cfg

    def normalize_if_enabled(tensor):
        # Layer normalization is applied only when configured.
        return layer_normalize(tensor) if cfg["use_layer_norm"] else tensor

    with tf.variable_scope("bi_directional_rnn"):
        cell_fw = self._create_rnn_cell()
        cell_bw = self._create_rnn_cell()
        if cfg["use_stack_rnn"]:
            birnn = stack_bidirectional_dynamic_rnn
        else:
            birnn = bidirectional_dynamic_rnn
        rnn_outs, *_states = birnn(
            cell_fw,
            cell_bw,
            self.word_emb,
            dtype=tf.float32,
            sequence_length=self.seq_len)
        rnn_outs = tf.concat(rnn_outs, axis=-1)
        rnn_outs = tf.layers.dropout(
            rnn_outs, rate=self.drop_rate, training=self.is_train)
        if cfg["use_residual"]:
            # Project the embeddings to 2 * num_units so they can be added.
            word_project = tf.layers.dense(
                self.word_emb, units=2 * cfg["num_units"], use_bias=False)
            rnn_outs = rnn_outs + word_project
        outputs = normalize_if_enabled(rnn_outs)

    if cfg["use_attention"] == "self_attention":
        with tf.variable_scope("self_attention"):
            attn_outs = multi_head_attention(
                outputs,
                outputs,
                cfg["num_heads"],
                cfg["attention_size"],
                drop_rate=self.drop_rate,
                is_train=self.is_train)
            if cfg["use_residual"]:
                attn_outs = attn_outs + outputs
            outputs = normalize_if_enabled(attn_outs)
            print("self-attention output shape: {}".format(
                outputs.get_shape().as_list()))
    elif cfg["use_attention"] == "normal_attention":
        with tf.variable_scope("normal_attention"):
            # Swap the first two axes: the attention cell runs time-major.
            context = tf.transpose(outputs, [1, 0, 2])
            p_context = tf.layers.dense(
                outputs, units=2 * cfg["num_units"], use_bias=False)
            p_context = tf.transpose(p_context, [1, 0, 2])
            attn_cell = AttentionCell(cfg["num_units"], context, p_context)
            attn_outs, _ = dynamic_rnn(
                attn_cell,
                context,
                sequence_length=self.seq_len,
                time_major=True,
                dtype=tf.float32)
            # Swap the axes back.
            outputs = tf.transpose(attn_outs, [1, 0, 2])
            print("attention output shape: {}".format(
                outputs.get_shape().as_list()))

    with tf.variable_scope("project"):
        self.logits = tf.layers.dense(
            outputs, units=self.tag_vocab_size, use_bias=True)
def encode(self, inputs, sequence_length, **kwargs):
    """Encode ``inputs`` with a stacked bidirectional RNN split across devices.

    Mutates ``self.params["rnn_cell"]``: ``distributed`` is set to ``False``
    and ``device_name`` is set before each cell is built. The forward cell
    uses ``getDeviceName(0)``. The backward cell uses
    ``getDeviceName(num_layers)``, or ``getDeviceName(1)`` if that resolves to
    the same name as device 0.

    Returns:
        An ``EncoderOutput`` with the RNN outputs (also used as attention
        values), the ``(forward, backward)`` final states, and
        ``sequence_length`` as the attention length.
    """
    tf.get_variable_scope().set_initializer(
        tf.random_uniform_initializer(-self.params["init_scale"],
                                      self.params["init_scale"]))

    cell_params = self.params["rnn_cell"]
    cell_params["distributed"] = False

    # Forward cell on the first device.
    cell_params["device_name"] = training_utils.getDeviceName(0)
    cell_fw = training_utils.get_rnn_cell(**cell_params)

    # Backward cell on the device indexed by num_layers, falling back to
    # device 1 so that it does not share the forward cell's device.
    cell_params["device_name"] = training_utils.getDeviceName(
        cell_params["num_layers"])
    if cell_params["device_name"] == training_utils.getDeviceName(0):
        cell_params["device_name"] = training_utils.getDeviceName(1)
    cell_bw = training_utils.get_rnn_cell(**cell_params)

    cells_fw = _unpack_cell(cell_fw)
    cells_bw = _unpack_cell(cell_bw)

    outputs_concat, state_fw, state_bw = rnn.stack_bidirectional_dynamic_rnn(
        cells_fw=cells_fw,
        cells_bw=cells_bw,
        inputs=inputs,
        dtype=tf.float32,
        sequence_length=sequence_length,
        **kwargs)

    return EncoderOutput(
        outputs=outputs_concat,
        final_state=(state_fw, state_bw),
        attention_values=outputs_concat,
        attention_values_length=sequence_length)
def _CreateCudnnCompatibleCanonicalRNN(rnn, inputs, is_bidi=False, scope=None):
    """Create a canonical RNN equivalent to the cuDNN RNN described by ``rnn``.

    Args:
        rnn: Object providing ``rnn_mode``, ``num_units`` and ``num_layers``.
        inputs: Time-major input tensor.
        is_bidi: When true, build a stacked bidirectional RNN.
        scope: Variable scope passed through to the RNN builder.

    Returns:
        For the unidirectional case, whatever ``rnn_lib.dynamic_rnn`` returns.
        For the bidirectional case,
        ``(outputs, (output_state_fw, output_state_bw))``.

    Raises:
        ValueError: If ``rnn.rnn_mode`` is not a supported mode.
    """
    mode = rnn.rnn_mode
    num_units = rnn.num_units
    num_layers = rnn.num_layers

    # To reuse cuDNN-trained models, must use cudnn compatible rnn cells.
    cell_factories = (
        (CUDNN_LSTM,
         lambda: cudnn_rnn_ops.CudnnCompatibleLSTMCell(num_units)),
        (CUDNN_GRU,
         lambda: cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units)),
        (CUDNN_RNN_TANH,
         lambda: rnn_cell_impl.BasicRNNCell(num_units, math_ops.tanh)),
        (CUDNN_RNN_RELU,
         lambda: rnn_cell_impl.BasicRNNCell(num_units, gen_nn_ops.relu)),
    )
    for supported_mode, factory in cell_factories:
        if mode == supported_mode:
            single_cell = factory
            break
    else:
        raise ValueError("%s is not supported!" % mode)

    if not is_bidi:
        layers = [single_cell() for _ in range(num_layers)]
        return rnn_lib.dynamic_rnn(
            rnn_cell_impl.MultiRNNCell(layers),
            inputs,
            dtype=dtypes.float32,
            time_major=True,
            scope=scope)

    cells_fw = [single_cell() for _ in range(num_layers)]
    cells_bw = [single_cell() for _ in range(num_layers)]
    outputs, output_state_fw, output_state_bw = (
        contrib_rnn_lib.stack_bidirectional_dynamic_rnn(
            cells_fw,
            cells_bw,
            inputs,
            dtype=dtypes.float32,
            time_major=True,
            scope=scope))
    return outputs, (output_state_fw, output_state_bw)
def __call__(self, inputs, seq_len):
    """Run ``inputs`` through the stacked bidirectional RNN.

    Executes inside the ``self.scope`` variable scope with ``self.cells_fw``
    and ``self.cells_bw``. Only the RNN outputs are returned; the final
    forward/backward states are dropped.
    """
    rnn_kwargs = dict(sequence_length=seq_len, dtype=tf.float32)
    with tf.variable_scope(self.scope):
        output, *_states = stack_bidirectional_dynamic_rnn(
            self.cells_fw, self.cells_bw, inputs, **rnn_kwargs)
        return output
def rnn_layers(x,
               seq_length,
               training,
               hidden_num=100,
               layer_num=3,
               class_n=5,
               cell='LSTM',
               dtype=tf.float32):
    """Generate RNN layers followed by a per-frame classification head.

    Args:
        x (Float): A 3D-Tensor of shape [batch_size,max_time,channel]
        seq_length (Int): A 1D-Tensor of shape [batch_size], real length of
            each sequence.
        training (Boolean): A 0D-Tensor indicating whether the graph is in
            training mode; only passed to the 'BNLSTM' cells.
        hidden_num (int, optional): Defaults to 100. Size of the hidden state
            of each cell; the RNN output is reshaped below as two halves of
            this size.
        layer_num (int, optional): Defaults to 3. Number of layers in the RNN.
        class_n (int, optional): Defaults to 5. Number of output classes.
        cell (str): One of 'LSTM', 'GRU', 'BNLSTM', the RNN cell used.
            BNLSTM stands for batch-normalization LSTM cell.
        dtype: Data type of the RNN and of the created variables.

    Returns:
        logits: A 3D Tensor of shape [batch_size, max_time, class_n]

    Raises:
        ValueError: If ``cell`` is not a recognized cell type.
    """
    cells_fw = []
    cells_bw = []
    for _ in range(layer_num):
        if cell == 'LSTM':
            cell_fw = LSTMCell(hidden_num)
            cell_bw = LSTMCell(hidden_num)
        elif cell == 'GRU':
            cell_fw = GRUCell(hidden_num)
            cell_bw = GRUCell(hidden_num)
        elif cell == 'BNLSTM':
            cell_fw = BNLSTMCell(hidden_num, training=training)
            cell_bw = BNLSTMCell(hidden_num, training=training)
        else:
            raise ValueError("Cell type unrecognized.")
        cells_fw.append(cell_fw)
        cells_bw.append(cell_bw)

    with tf.variable_scope('BDLSTM_rnn') as scope:
        lasth, _, _ = stack_bidirectional_dynamic_rnn(
            cells_fw=cells_fw,
            cells_bw=cells_bw,
            inputs=x,
            sequence_length=seq_length,
            dtype=dtype,
            scope=scope)

    # Shape of lasth: [batch_size, max_time, hidden_num * 2].
    # The batch size is always taken from the runtime shape. The time
    # dimension uses the static value when known and falls back to the
    # runtime shape otherwise, because tf.reshape cannot take None.
    batch_size = tf.shape(lasth)[0]
    max_time = lasth.get_shape().as_list()[1]
    if max_time is None:
        max_time = tf.shape(lasth)[1]

    with tf.variable_scope('rnn_fnn_layer'):
        weight_out = _variable_on_cpu(
            name='weights',
            shape=[2, hidden_num],
            initializer=tf.truncated_normal_initializer(
                stddev=np.sqrt(2.0 / (2 * hidden_num))),
            dtype=dtype)
        biases_out = _variable_on_cpu(
            name='bias',
            shape=[hidden_num],
            initializer=tf.zeros_initializer(),
            dtype=dtype)
        weight_class = _variable_on_cpu(
            name='weights_class',
            shape=[hidden_num, class_n],
            initializer=tf.truncated_normal_initializer(
                stddev=np.sqrt(2.0 / hidden_num)),
            dtype=dtype)
        bias_class = _variable_on_cpu(
            name='bias_class',
            shape=[class_n],
            initializer=tf.zeros_initializer(),
            dtype=dtype)

        # Split the last axis into two halves of hidden_num and merge them
        # with a learned element-wise weighted sum.
        lasth_rs = tf.reshape(lasth, [batch_size, max_time, 2, hidden_num],
                              name='lasth_rs')
        lasth_output = tf.nn.bias_add(
            tf.reduce_sum(tf.multiply(lasth_rs, weight_out), axis=2),
            biases_out,
            name='lasth_bias_add')
        # Flatten batch and time so the class projection is one matmul.
        lasth_output_rs = tf.reshape(lasth_output,
                                     [batch_size * max_time, hidden_num],
                                     name='lasto_rs')
        logits = tf.reshape(
            tf.nn.bias_add(tf.matmul(lasth_output_rs, weight_class),
                           bias_class),
            [batch_size, max_time, class_n],
            name="rnn_logits_rs")
    return logits
def build_net(self):
    """Build the separation network and store its outputs on ``self``.

    Stages, each controlled by ``self._config``:
      * optional tanh dense layer (scope ``forward1``) when ``dense_layer``
        equals ``'true'`` (case-insensitive);
      * optional grid-LSTM stack plus a linear reduction (scope ``tflstm``)
        when ``tflstm_size > 0``;
      * stacked bidirectional LSTM (scope ``blstm``);
      * two mask estimation layers (scope ``forward2``), ReLU or sigmoid.

    Sets ``self._mask1``, ``self._mask2``, ``self._sep1``, ``self._sep2`` and
    ``self.saver``.
    """
    cfg = self._config
    outputs = self._inputs

    # Feed-forward layer, not used (set to false) when grid lstm is applied.
    if cfg.dense_layer.lower() == 'true':
        with tf.variable_scope('forward1'):
            outputs = tf.reshape(outputs, [-1, cfg.input_size])
            outputs = tf.layers.dense(
                outputs,
                units=cfg.rnn_size,
                activation=tf.nn.tanh,
                reuse=tf.get_variable_scope().reuse)
            outputs = tf.reshape(outputs, [cfg.batch_size, -1, cfg.rnn_size])

    # Grid lstm layer and a linear reduction layer.
    if cfg.tflstm_size > 0:
        with tf.variable_scope('tflstm'):
            # Number of frequency blocks implied by the feature window and
            # the frequency skip.
            num_blocks = int((cfg.input_size - cfg.tffeature_size) /
                             cfg.tffrequency_skip + 1)

            def tflstm_cell():
                return tf.contrib.rnn.GridLSTMCell(
                    cfg.tflstm_size,
                    use_peepholes=True,
                    share_time_frequency_weights=True,
                    cell_clip=5.0,
                    feature_size=cfg.tffeature_size,
                    frequency_skip=cfg.tffrequency_skip,
                    num_frequency_blocks=[num_blocks])

            cell = tf.contrib.rnn.MultiRNNCell(
                [tflstm_cell() for _ in range(cfg.tflstm_layers)],
                state_is_tuple=True)
            initial_state = cell.zero_state(cfg.batch_size, tf.float32)
            outputs, _final_state = tf.nn.dynamic_rnn(
                cell,
                outputs,
                dtype=tf.float32,
                sequence_length=self._lengths,
                initial_state=initial_state)

            tflstm_output_size = 2 * cfg.tflstm_size * num_blocks
            outputs = tf.reshape(outputs, [-1, tflstm_output_size])
            weights, biases = self._weight_and_bias(
                'linear', tflstm_output_size, cfg.rnn_size)
            outputs = tf.matmul(outputs, weights) + biases
            outputs = tf.reshape(outputs, [cfg.batch_size, -1, cfg.rnn_size])

    # BLSTM layer
    with tf.variable_scope('blstm'):
        # Dropout wraps each cell only outside inference.
        apply_dropout = not self._infer and cfg.keep_prob < 1.0

        def layer_cell():
            # tf.nn.rnn_cell.BasicLSTMCell in r1.12
            base = tf.contrib.rnn.BasicLSTMCell(cfg.rnn_size)
            if apply_dropout:
                return tf.contrib.rnn.DropoutWrapper(
                    base, output_keep_prob=cfg.keep_prob)
            return base

        def multi_cell():
            # tf.nn.rnn_cell.MultiRNNCell in r1.12
            return tf.contrib.rnn.MultiRNNCell(
                [layer_cell() for _ in range(cfg.rnn_num_layers)],
                state_is_tuple=True)

        lstm_fw_cell = multi_cell()
        lstm_bw_cell = multi_cell()
        fw_layers = self._unpack_cell(lstm_fw_cell)
        bw_layers = self._unpack_cell(lstm_bw_cell)
        outputs, _fw_final, _bw_final = rnn.stack_bidirectional_dynamic_rnn(
            cells_fw=fw_layers,
            cells_bw=bw_layers,
            inputs=outputs,
            dtype=tf.float32,
            sequence_length=self._lengths)

    # Mask estimation layer
    with tf.variable_scope('forward2'):
        blstm_output_size = 2 * cfg.rnn_size
        outputs = tf.reshape(outputs, [-1, blstm_output_size])
        weights1, biases1 = self._weight_and_bias(
            'mask1', blstm_output_size, cfg.output_size)
        weights2, biases2 = self._weight_and_bias(
            'mask2', blstm_output_size, cfg.output_size)

        if cfg.mask_type.lower() == 'relu':
            activation = tf.nn.relu
        else:
            activation = tf.nn.sigmoid
        mask1 = activation(tf.matmul(outputs, weights1) + biases1)
        mask2 = activation(tf.matmul(outputs, weights2) + biases2)

        mask_shape = [cfg.batch_size, -1, cfg.output_size]
        self._mask1 = tf.reshape(mask1, mask_shape)
        self._mask2 = tf.reshape(mask2, mask_shape)

        # Apply each mask to the mixture to get the separated outputs.
        self._sep1 = self._mask1 * self._mixed
        self._sep2 = self._mask2 * self._mixed

    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=30)
def __init__(self,
             x_mag_spec_batch,
             lengths_batch,
             y_mag_spec_batch=None,
             theta_x_batch=None,
             theta_y_batch=None,
             behavior='train'):
    '''
    Build the mask-estimation graph; behavior = 'train/validation/infer'.

    Stages, in order: spectrum normalisation, optional input batch norm,
    stacked bidirectional LSTM/GRU, fully connected mask layer, optional
    CBHG post-processing, loss (skipped for infer) and the clipped-gradient
    Adam train op (train only).

    Args:
      x_mag_spec_batch: input magnitude spectrum batch.
      lengths_batch: per-utterance lengths, passed as `sequence_length`;
        its first dimension is used as the batch size.
      y_mag_spec_batch: label magnitude spectrum; required unless infer.
      theta_x_batch: input phase; required unless infer.
      theta_y_batch: label phase; required unless infer.
      behavior: compared against self.train / self.validation / self.infer.

    Raises:
      ValueError: if FLAGS.PARAM.INPUT_TYPE is not 'mag'/'logmag' or
        FLAGS.PARAM.MODEL_TYPE is not BLSTM/BGRU (any letter case).

    NOTE(review): the source of this method was flattened; the extent of
    each `with tf.variable_scope(...)` block was reconstructed -- confirm
    variable names against existing checkpoints.
    '''
    if behavior != self.infer:
        assert (y_mag_spec_batch is not None)
        assert (theta_x_batch is not None)
        assert (theta_y_batch is not None)

    # Normalised once so every later check agrees on letter case.
    model_type = FLAGS.PARAM.MODEL_TYPE.upper()

    self._log_bias = tf.get_variable(
        'logbias', [1],
        trainable=FLAGS.PARAM.LOG_BIAS_TRAINABLE,
        initializer=tf.constant_initializer(FLAGS.PARAM.INIT_LOG_BIAS))
    self._real_logbias = self._log_bias + FLAGS.PARAM.MIN_LOG_BIAS
    self._x_mag_spec = x_mag_spec_batch
    self._norm_x_mag_spec = norm_mag_spec(self._x_mag_spec,
                                          FLAGS.PARAM.MAG_NORM_MAX)
    self._norm_x_logmag_spec = norm_logmag_spec(self._x_mag_spec,
                                                FLAGS.PARAM.MAG_NORM_MAX,
                                                self._log_bias,
                                                FLAGS.PARAM.MIN_LOG_BIAS)
    # NOTE(review): y_mag_spec_batch may be None in infer mode; whether the
    # norm helpers accept None is not visible here -- confirm.
    self._y_mag_spec = y_mag_spec_batch
    self._norm_y_mag_spec = norm_mag_spec(self._y_mag_spec,
                                          FLAGS.PARAM.MAG_NORM_MAX)
    self._norm_y_logmag_spec = norm_logmag_spec(self._y_mag_spec,
                                                FLAGS.PARAM.MAG_NORM_MAX,
                                                self._log_bias,
                                                FLAGS.PARAM.MIN_LOG_BIAS)
    self._lengths = lengths_batch
    self._batch_size = tf.shape(self._lengths)[0]
    self._x_theta = theta_x_batch
    self._y_theta = theta_y_batch
    self._model_type = FLAGS.PARAM.MODEL_TYPE

    if FLAGS.PARAM.INPUT_TYPE == 'mag':
        self.net_input = self._norm_x_mag_spec
    elif FLAGS.PARAM.INPUT_TYPE == 'logmag':
        self.net_input = self._norm_x_logmag_spec
    else:
        # Previously fell through and failed below with an AttributeError.
        raise ValueError(
            "Unsupported INPUT_TYPE: %r" % (FLAGS.PARAM.INPUT_TYPE,))

    if FLAGS.PARAM.LABEL_TYPE == 'mag':
        self._y_labels = self._norm_y_mag_spec
    elif FLAGS.PARAM.LABEL_TYPE == 'logmag':
        self._y_labels = self._norm_y_logmag_spec

    outputs = self.net_input

    if FLAGS.PARAM.INPUT_BN:
        with tf.variable_scope('Batch_Norm_Layer'):
            if_BRN = (FLAGS.PARAM.MVN_TYPE == 'BRN')
            if FLAGS.PARAM.SELF_BN:
                outputs = tf.layers.batch_normalization(outputs,
                                                        training=True,
                                                        renorm=if_BRN)
            else:
                outputs = tf.layers.batch_normalization(
                    outputs,
                    training=(behavior == self.train
                              or behavior == self.validation),
                    renorm=if_BRN)

    # Wrap cells with output dropout everywhere except inference.
    lstm_attn_cell = lstm_cell
    if behavior != self.infer and FLAGS.PARAM.KEEP_PROB < 1.0:

        def lstm_attn_cell(n_units, n_proj, act):
            return tf.contrib.rnn.DropoutWrapper(
                lstm_cell(n_units, n_proj, act),
                output_keep_prob=FLAGS.PARAM.KEEP_PROB)

    GRU_attn_cell = GRU_cell
    if behavior != self.infer and FLAGS.PARAM.KEEP_PROB < 1.0:

        def GRU_attn_cell(n_units, act):
            return tf.contrib.rnn.DropoutWrapper(
                GRU_cell(n_units, act),
                output_keep_prob=FLAGS.PARAM.KEEP_PROB)

    if model_type == 'BLSTM':
        with tf.variable_scope('BLSTM'):
            lstm_fw_cell = tf.contrib.rnn.MultiRNNCell([
                lstm_attn_cell(FLAGS.PARAM.RNN_SIZE,
                               FLAGS.PARAM.LSTM_num_proj,
                               FLAGS.PARAM.LSTM_ACTIVATION)
                for _ in range(FLAGS.PARAM.RNN_LAYER)
            ], state_is_tuple=True)
            lstm_bw_cell = tf.contrib.rnn.MultiRNNCell([
                lstm_attn_cell(FLAGS.PARAM.RNN_SIZE,
                               FLAGS.PARAM.LSTM_num_proj,
                               FLAGS.PARAM.LSTM_ACTIVATION)
                for _ in range(FLAGS.PARAM.RNN_LAYER)
            ], state_is_tuple=True)
            fw_cell = lstm_fw_cell._cells
            bw_cell = lstm_bw_cell._cells
            result = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=fw_cell,
                cells_bw=bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=self._lengths)
            outputs, fw_final_states, bw_final_states = result
    elif model_type == 'BGRU':
        with tf.variable_scope('BGRU'):
            gru_fw_cell = tf.contrib.rnn.MultiRNNCell([
                GRU_attn_cell(FLAGS.PARAM.RNN_SIZE,
                              FLAGS.PARAM.LSTM_ACTIVATION)
                for _ in range(FLAGS.PARAM.RNN_LAYER)
            ], state_is_tuple=True)
            gru_bw_cell = tf.contrib.rnn.MultiRNNCell([
                GRU_attn_cell(FLAGS.PARAM.RNN_SIZE,
                              FLAGS.PARAM.LSTM_ACTIVATION)
                for _ in range(FLAGS.PARAM.RNN_LAYER)
            ], state_is_tuple=True)
            fw_cell = gru_fw_cell._cells
            bw_cell = gru_bw_cell._cells
            result = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=fw_cell,
                cells_bw=bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=self._lengths)
            outputs, fw_final_states, bw_final_states = result
    else:
        # Previously fell through and failed below with a NameError.
        raise ValueError(
            "Unsupported MODEL_TYPE: %r" % (FLAGS.PARAM.MODEL_TYPE,))

    self.fw_final_state = fw_final_states
    self.bw_final_state = bw_final_states

    # region full connection get mask
    # calcu rnn output size: both directions are concatenated, and a BLSTM
    # with a projection emits LSTM_num_proj features per direction.
    per_direction = FLAGS.PARAM.RNN_SIZE
    if model_type == 'BLSTM' and FLAGS.PARAM.LSTM_num_proj is not None:
        per_direction = FLAGS.PARAM.LSTM_num_proj
    in_size = 2 * per_direction

    outputs = tf.reshape(outputs, [-1, in_size])
    out_size = FLAGS.PARAM.OUTPUT_SIZE
    with tf.variable_scope('fullconnectOut'):
        weights = tf.get_variable(
            'weights1', [in_size, out_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        biases = tf.get_variable(
            'biases1', [out_size],
            initializer=tf.constant_initializer(FLAGS.PARAM.INIT_MASK_VAL))

    if FLAGS.PARAM.TIME_NOSOFTMAX_ATTENTION:
        with tf.variable_scope('fullconnectCoef'):
            weights_coef = tf.get_variable(
                'weights_coef', [in_size, 1],
                initializer=tf.random_normal_initializer(mean=1.0,
                                                         stddev=0.01))
            biases_coef = tf.get_variable(
                'biases_coef', [1],
                initializer=tf.constant_initializer(0.0))
        raw_mask = tf.reshape(
            tf.matmul(outputs, weights) + biases,
            [self._batch_size, -1, FLAGS.PARAM.OUTPUT_SIZE])  # [batch,time,fre]
        batch_coef_vec = tf.nn.relu(
            tf.reshape(
                tf.matmul(outputs, weights_coef) + biases_coef,
                [self._batch_size, -1]))  # [batch, time]
        mask = tf.multiply(
            raw_mask,
            tf.reshape(batch_coef_vec, [self._batch_size, -1, 1]))
    else:
        if FLAGS.PARAM.POST_BN:
            linear_out = tf.matmul(outputs, weights)
            with tf.variable_scope('POST_Batch_Norm_Layer'):
                if_BRN = (FLAGS.PARAM.MVN_TYPE == 'BRN')
                if FLAGS.PARAM.SELF_BN:
                    linear_out = tf.layers.batch_normalization(
                        linear_out, training=True, renorm=if_BRN)
                else:
                    linear_out = tf.layers.batch_normalization(
                        linear_out,
                        training=(behavior == self.train
                                  or behavior == self.validation),
                        renorm=if_BRN)
                # These reuse the names 'weights1'/'biases1', so they must
                # live in this scope to avoid clashing with 'fullconnectOut'.
                weights2 = tf.get_variable(
                    'weights1', [out_size, out_size],
                    initializer=tf.random_normal_initializer(stddev=0.01))
                biases2 = tf.get_variable(
                    'biases1', [out_size],
                    initializer=tf.constant_initializer(
                        FLAGS.PARAM.INIT_MASK_VAL))
                linear_out = tf.matmul(linear_out, weights2) + biases2
        else:
            linear_out = tf.matmul(outputs, weights) + biases
        mask = linear_out
        if FLAGS.PARAM.ReLU_MASK:
            mask = tf.nn.relu(linear_out)
    # endregion

    self._mask = tf.reshape(
        mask, [self._batch_size, -1, FLAGS.PARAM.OUTPUT_SIZE])

    if FLAGS.PARAM.TRAINING_MASK_POSITION == 'mag':
        self._y_estimation = self._mask * (self._norm_x_mag_spec +
                                           FLAGS.PARAM.SPEC_EST_BIAS)
    elif FLAGS.PARAM.TRAINING_MASK_POSITION == 'logmag':
        self._y_estimation = self._mask * (self._norm_x_logmag_spec +
                                           FLAGS.PARAM.SPEC_EST_BIAS)

    # region get infer spec
    # _y_mag_estimation is the estimated mag_spec;
    # _y_estimation is the loss target (mag_spec or logmag_spec).
    if FLAGS.PARAM.DECODING_MASK_POSITION == 'mag':
        self._y_mag_estimation = rm_norm_mag_spec(
            self._mask * (self._norm_x_mag_spec + FLAGS.PARAM.SPEC_EST_BIAS),
            FLAGS.PARAM.MAG_NORM_MAX)
    elif FLAGS.PARAM.DECODING_MASK_POSITION == 'logmag':
        self._y_mag_estimation = rm_norm_logmag_spec(
            self._mask * (self._norm_x_logmag_spec +
                          FLAGS.PARAM.SPEC_EST_BIAS),
            FLAGS.PARAM.MAG_NORM_MAX,
            self._log_bias,
            FLAGS.PARAM.MIN_LOG_BIAS)
    # endregion

    # region prepare y_estimation
    # Convert the estimate into the label domain when the mask was applied
    # in the other domain.
    if FLAGS.PARAM.TRAINING_MASK_POSITION != FLAGS.PARAM.LABEL_TYPE:
        if FLAGS.PARAM.LABEL_TYPE == 'mag':
            self._y_estimation = normedLogmag2normedMag(
                self._y_estimation, FLAGS.PARAM.MAG_NORM_MAX,
                self._log_bias, FLAGS.PARAM.MIN_LOG_BIAS)
        elif FLAGS.PARAM.LABEL_TYPE == 'logmag':
            self._y_estimation = normedMag2normedLogmag(
                self._y_estimation, FLAGS.PARAM.MAG_NORM_MAX,
                self._log_bias, FLAGS.PARAM.MIN_LOG_BIAS)
    # endregion

    # region CBHG
    if FLAGS.PARAM.USE_CBHG_POST_PROCESSING:
        # All kernel sizes from 1 to cbhg_kernels will be used in the
        # convolution bank of CBHG to act as "K-grams"
        cbhg_kernels = 8
        cbhg_conv_channels = 128  # Channels of the convolution bank
        cbhg_pool_size = 2  # pooling size of the CBHG
        # projection channels of the CBHG (1st projection, 2nd is
        # automatically set to num_mels)
        cbhg_projection = 256
        cbhg_projection_kernel_size = 3  # kernel_size of the CBHG projections
        cbhg_highwaynet_layers = 4  # Number of HighwayNet layers
        # Number of units used in HighwayNet fully connected layers
        cbhg_highway_units = 128
        # Number of GRU units used in bidirectional RNN of CBHG block.
        # CBHG output is 2x rnn_units in shape
        cbhg_rnn_units = 128
        batch_norm_position = 'before'
        is_training = bool(behavior == self.train)
        post_cbhg = CBHG(cbhg_kernels,
                         cbhg_conv_channels,
                         cbhg_pool_size,
                         [cbhg_projection, FLAGS.PARAM.OUTPUT_SIZE],
                         cbhg_projection_kernel_size,
                         cbhg_highwaynet_layers,
                         cbhg_highway_units,
                         cbhg_rnn_units,
                         batch_norm_position,
                         is_training,
                         name='CBHG_postnet')

        # [batch_size, decoder_steps(mel_frames), cbhg_channels]
        # Keep the pre-CBHG estimate: DOUBLE_LOSS also penalises it.
        self._cbhg_inputs_y_est = self._y_estimation
        cbhg_outputs = post_cbhg(self._y_estimation, None)

        frame_projector = FrameProjection(FLAGS.PARAM.OUTPUT_SIZE,
                                          scope='CBHG_proj_to_spec')
        self._y_estimation = frame_projector(cbhg_outputs)

        if FLAGS.PARAM.DECODING_MASK_POSITION != FLAGS.PARAM.TRAINING_MASK_POSITION:
            print(
                'DECODING_MASK_POSITION must be equal to TRAINING_MASK_POSITION when use CBHG post processing.'
            )
            exit(-1)
        if FLAGS.PARAM.DECODING_MASK_POSITION == 'mag':
            self._y_mag_estimation = rm_norm_mag_spec(
                self._y_estimation, FLAGS.PARAM.MAG_NORM_MAX)
        elif FLAGS.PARAM.DECODING_MASK_POSITION == 'logmag':
            self._y_mag_estimation = rm_norm_logmag_spec(
                self._y_estimation, FLAGS.PARAM.MAG_NORM_MAX,
                self._log_bias, FLAGS.PARAM.MIN_LOG_BIAS)
    # endregion

    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=30)
    if behavior == self.infer:
        return

    # region get labels LOSS
    # Labels
    if FLAGS.PARAM.MASK_TYPE == 'PSM':
        self._y_labels *= tf.cos(self._x_theta - self._y_theta)
    elif FLAGS.PARAM.MASK_TYPE == 'fixPSM':
        self._y_labels *= (1.0 + tf.cos(self._x_theta - self._y_theta)) * 0.5
    elif FLAGS.PARAM.MASK_TYPE == 'AcutePM':
        self._y_labels *= tf.nn.relu(tf.cos(self._x_theta - self._y_theta))
    elif FLAGS.PARAM.MASK_TYPE == 'PowFixPSM':
        self._y_labels *= tf.pow(
            tf.abs((1.0 + tf.cos(self._x_theta - self._y_theta)) * 0.5),
            FLAGS.PARAM.POW_FIX_PSM_COEF)
    elif FLAGS.PARAM.MASK_TYPE == 'IRM':
        pass
    else:
        tf.logging.error('Mask type error.')
        exit(-1)

    # LOSS
    loss_name = FLAGS.PARAM.LOSS_FUNC_FOR_MAG_SPEC
    if loss_name == 'SPEC_MSE':  # log_mag and mag MSE
        self._loss = loss.reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels)
        if FLAGS.PARAM.USE_CBHG_POST_PROCESSING and FLAGS.PARAM.DOUBLE_LOSS:
            self._loss = (
                FLAGS.PARAM.CBHG_LOSS_COEF1 *
                loss.reduce_sum_frame_batchsize_MSE(self._cbhg_inputs_y_est,
                                                    self._y_labels) +
                FLAGS.PARAM.CBHG_LOSS_COEF2 * self._loss)
    elif loss_name == 'MFCC_SPEC_MSE':
        self._loss1, self._loss2 = loss.balanced_MFCC_AND_SPEC_MSE(
            self._y_estimation, self._y_labels,
            self._y_mag_estimation, self._y_mag_spec)
        self._loss = (FLAGS.PARAM.SPEC_LOSS_COEF * self._loss1 +
                      FLAGS.PARAM.MFCC_LOSS_COEF * self._loss2)
    elif loss_name == 'MEL_MAG_MSE':
        self._loss1, self._loss2 = loss.balanced_MEL_AND_SPEC_MSE(
            self._y_estimation, self._y_labels,
            self._y_mag_estimation, self._y_mag_spec)
        self._loss = (FLAGS.PARAM.SPEC_LOSS_COEF * self._loss1 +
                      FLAGS.PARAM.MEL_LOSS_COEF * self._loss2)
    elif loss_name == "SPEC_MSE_LOWF_EN":
        self._loss = loss.reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels)
    elif loss_name == "FAIR_SPEC_MSE":
        self._loss = loss.fair_reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels)
    elif loss_name == "SPEC_MSE_FLEXIBLE_POW_C":
        self._loss = loss.reduce_sum_frame_batchsize_MSE_EmphasizeLowerValue(
            self._y_estimation, self._y_labels, FLAGS.PARAM.POW_COEF)
    elif loss_name == "RELATED_MSE":
        self._loss = loss.relative_reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels,
            FLAGS.PARAM.RELATED_MSE_IGNORE_TH)
    elif loss_name == "AUTO_RELATED_MSE":
        self._loss = loss.auto_ingore_relative_reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels,
            FLAGS.PARAM.AUTO_RELATED_MSE_AXIS_FIT_DEG)
    elif loss_name == "AUTO_RELATED_MSE2":
        self._loss = loss.auto_ingore_relative_reduce_sum_frame_batchsize_MSE_v2(
            self._y_estimation,
            self._y_labels,
            FLAGS.PARAM.AUTO_RELATED_MSE_AXIS_FIT_DEG,
            FLAGS.PARAM.LINEAR_BROKER,
        )
    elif loss_name == "AUTO_RELATED_MSE3":
        self._loss = loss.auto_ingore_relative_reduce_sum_frame_batchsize_MSE_v3(
            self._y_estimation, self._y_labels,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS3_A,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS3_B,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS3_C1,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS3_C2)
    elif loss_name == "AUTO_RELATED_MSE4":
        self._loss = loss.auto_ingore_relative_reduce_sum_frame_batchsize_MSE_v4(
            self._y_estimation, self._y_labels,
            FLAGS.PARAM.AUTO_RELATED_MSE_AXIS_FIT_DEG)
    elif loss_name == "AUTO_RELATED_MSE5":
        self._loss = loss.auto_ingore_relative_reduce_sum_frame_batchsize_MSE_v5(
            self._y_estimation, self._y_labels)
    elif loss_name == "AUTO_RELATED_MSE6":
        self._loss = loss.auto_ingore_relative_reduce_sum_frame_batchsize_MSE_v6(
            self._y_estimation, self._y_labels,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS6_A,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS6_B,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS6_C1,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS6_C2)
    elif loss_name == "AUTO_RELATED_MSE7":
        self._loss = loss.auto_ingore_relative_reduce_sum_frame_batchsize_MSE_v7(
            self._y_estimation, self._y_labels,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS7_A1,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS7_A2,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS7_B,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS7_C1,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS7_C2)
    elif loss_name == "AUTO_RELATED_MSE8":
        self._loss = loss.auto_ingore_relative_reduce_sum_frame_batchsize_MSE_v8(
            self._y_estimation, self._y_labels,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS8_A,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS8_B,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS8_C1,
            FLAGS.PARAM.AUTO_RELATIVE_LOSS8_C2)
    elif loss_name == "AUTO_RELATED_MSE_USE_COS":
        self._loss = loss.cos_auto_ingore_relative_reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels,
            FLAGS.PARAM.COS_AUTO_RELATED_MSE_W)
    elif loss_name == 'MEL_AUTO_RELATED_MSE':
        # type(y_estimation) = FLAGS.PARAM.LABEL_TYPE
        self._loss = loss.MEL_AUTO_RELATIVE_MSE(
            self._y_estimation, self._norm_y_mag_spec,
            FLAGS.PARAM.MEL_NUM,
            FLAGS.PARAM.AUTO_RELATED_MSE_AXIS_FIT_DEG)
    else:
        print('Loss type error.')
        exit(-1)
    # endregion

    if behavior == self.validation:
        # val model cannot train.
        return

    self._lr = tf.Variable(0.0, trainable=False)  #TODO
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      FLAGS.PARAM.CLIP_NORM)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    # The learning rate is changed at run time by feeding this placeholder
    # and running `_lr_update`.
    self._new_lr = tf.placeholder(tf.float32,
                                  shape=[],
                                  name='new_learning_rate')
    self._lr_update = tf.assign(self._lr, self._new_lr)
def __init__(self, config, inputs, labels, lengths, infer=False):
    """Build an (B)LSTM graph with a sigmoid output layer.

    Args:
        config: hyper-parameter object; this method reads input_size,
            rnn_size, rnn_num_layers, batch_size, keep_prob, model_type,
            embedding_option, output_size and max_grad_norm.  When `infer`
            is true its batch_size is overwritten with 1.
        inputs: input feature tensor; reshaped to [-1, config.input_size].
        labels: target tensor; averaged over axis 1 for embedding options
            1 and 2.
        lengths: per-utterance lengths, passed as `sequence_length`.
        infer: when true, dropout, the loss and the train op are skipped.

    NOTE(review): the source of this method was flattened; the extent of
    each `with tf.variable_scope(...)` block was reconstructed -- confirm
    variable names against existing checkpoints.
    """
    self._inputs = inputs
    self._labels = labels
    self._lengths = lengths
    self._model_type = config.model_type
    if infer:
        # if infer, we prefer to run one utterance one time.
        config.batch_size = 1

    outputs = self._inputs
    ## This first layer-- feed forward layer
    ## Transform the input to the right size before feed into RNN
    with tf.variable_scope('forward1'):
        outputs = tf.reshape(outputs, [-1, config.input_size])
        outputs = tf.layers.dense(outputs,
                                  units=config.rnn_size,
                                  activation=tf.nn.tanh,
                                  reuse=tf.get_variable_scope().reuse)
        outputs = tf.reshape(outputs,
                             [config.batch_size, -1, config.rnn_size])

    use_dropout = not infer and config.keep_prob < 1.0

    ## Configure the LSTM or BLSTM model
    ## For BLSTM, we use the BasicLSTMCell.For LSTM, we use LSTMCell.
    ## You can change them and test the performance...
    if config.model_type.lower() == 'blstm':
        with tf.variable_scope('blstm'):

            def blstm_cell():
                # A fresh cell object per layer and per direction.  The
                # previous `[cell] * n` put one object in every slot, but
                # the layers see different input widths and the two
                # directions must not share one cell.
                cell = tf.contrib.rnn.BasicLSTMCell(config.rnn_size)
                if use_dropout:
                    cell = tf.contrib.rnn.DropoutWrapper(
                        cell, output_keep_prob=config.keep_prob)
                return cell

            lstm_fw_cell = tf.contrib.rnn.MultiRNNCell(
                [blstm_cell() for _ in range(config.rnn_num_layers)])
            lstm_bw_cell = tf.contrib.rnn.MultiRNNCell(
                [blstm_cell() for _ in range(config.rnn_num_layers)])
            lstm_fw_cell = _unpack_cell(lstm_fw_cell)
            lstm_bw_cell = _unpack_cell(lstm_bw_cell)
            result = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=lstm_fw_cell,
                cells_bw=lstm_bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=self._lengths)
            outputs, fw_final_states, bw_final_states = result

    if config.model_type.lower() == 'lstm':
        with tf.variable_scope('lstm'):

            def lstm_cell():
                return tf.contrib.rnn.LSTMCell(
                    config.rnn_size,
                    forget_bias=1.0,
                    use_peepholes=True,
                    initializer=tf.contrib.layers.xavier_initializer(),
                    state_is_tuple=True,
                    activation=tf.tanh)

            attn_cell = lstm_cell
            if use_dropout:

                def attn_cell():
                    return tf.contrib.rnn.DropoutWrapper(
                        lstm_cell(), output_keep_prob=config.keep_prob)

            cell = tf.contrib.rnn.MultiRNNCell(
                [attn_cell() for _ in range(config.rnn_num_layers)],
                state_is_tuple=True)
            self._initial_state = cell.zero_state(config.batch_size,
                                                  tf.float32)
            outputs, state = tf.nn.dynamic_rnn(
                cell,
                outputs,
                dtype=tf.float32,
                sequence_length=self._lengths,
                initial_state=self.initial_state)
            self._final_state = state

    ## Feed forward layer. Transform the RNN output to the right output size
    with tf.variable_scope('forward2'):
        # The bidirectional stack concatenates both directions.
        if self._model_type.lower() == 'blstm':
            in_size = 2 * config.rnn_size
        else:
            in_size = config.rnn_size

        if config.embedding_option == 0:
            # no embedding, frame by frame
            outputs = tf.reshape(outputs, [-1, in_size])
        else:
            outputs = tf.reshape(outputs,
                                 [config.batch_size, -1, in_size])
            if config.embedding_option == 1:
                # last frame embedding
                # http://sqrtf.com/fetch-rnn-encoder-last-output-using-tf-gather_nd/
                ind = tf.subtract(self._lengths, tf.constant(1))
                batch_range = tf.range(config.batch_size)
                indices = tf.stack([batch_range, ind], axis=1)
                outputs = tf.gather_nd(outputs, indices)
                self._labels = tf.reduce_mean(self._labels, 1)
            elif config.embedding_option == 2:
                # mean pooling
                outputs = tf.reduce_mean(outputs, 1)
                self._labels = tf.reduce_mean(self._labels, 1)

        out_size = config.output_size
        weights1 = tf.get_variable(
            'weights1', [in_size, out_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        biases1 = tf.get_variable(
            'biases1', [out_size],
            initializer=tf.constant_initializer(0.0))
        outputs = tf.matmul(outputs, weights1) + biases1
        if config.embedding_option == 0:
            outputs = tf.reshape(outputs, [config.batch_size, -1, out_size])
        self._outputs = tf.nn.sigmoid(outputs)

    # Ability to save the model
    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=30)
    if infer:
        return

    # Compute loss(CE); `outputs` holds the pre-sigmoid logits here.
    self._loss = tf.losses.sigmoid_cross_entropy(self._labels, outputs)
    # A graph built under a reusing variable scope gets no optimizer.
    if tf.get_variable_scope().reuse:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    self._new_lr = tf.placeholder(tf.float32,
                                  shape=[],
                                  name='new_learning_rate')
    self._lr_update = tf.assign(self._lr, self._new_lr)
def __init__(self, config, inputs, labels, lengths, genders, infer=False):
    """Build a two-output mask network trained with a permutation-minimum loss.

    The labels tensor is split along its last axis at config.output_size
    into two targets.  Two ReLU masks are estimated from the (B)LSTM output
    and multiplied with the input to give `_cleaned1` / `_cleaned2`.  The
    loss evaluates both assignments of outputs to targets and keeps the
    smaller cost per utterance.

    NOTE(review): the source of this method was flattened; the extent of
    each `with tf.variable_scope(...)` block was reconstructed -- confirm
    variable names against existing checkpoints.
    """
    self._inputs = inputs
    self._mixed = inputs
    n_out = config.output_size
    self._labels1 = tf.slice(labels, [0, 0, 0], [-1, -1, n_out])
    self._labels2 = tf.slice(labels, [0, 0, n_out], [-1, -1, -1])
    self._lengths = lengths
    self._genders = genders
    self._model_type = config.model_type

    net = self._inputs
    ## This first layer-- feed forward layer
    ## Transform the input to the right size before feed into RNN
    with tf.variable_scope('forward1'):
        net = tf.reshape(net, [-1, config.input_size])
        net = tf.layers.dense(net,
                              units=config.rnn_size,
                              activation=tf.nn.tanh,
                              reuse=tf.get_variable_scope().reuse)
        net = tf.reshape(net, [config.batch_size, -1, config.rnn_size])

    # Output dropout is only applied outside inference.
    with_dropout = not infer and config.keep_prob < 1.0

    def make_cell():
        base = tf.contrib.rnn.LSTMCell(
            config.rnn_size,
            forget_bias=1.0,
            use_peepholes=True,
            initializer=tf.contrib.layers.xavier_initializer(),
            state_is_tuple=True,
            activation=tf.tanh)
        if with_dropout:
            return tf.contrib.rnn.DropoutWrapper(
                base, output_keep_prob=config.keep_prob)
        return base

    def make_stack():
        return tf.contrib.rnn.MultiRNNCell(
            [make_cell() for _ in range(config.rnn_num_layers)],
            state_is_tuple=True)

    kind = config.model_type.lower()
    if kind == 'blstm':
        with tf.variable_scope('blstm'):
            fw_cells = _unpack_cell(make_stack())
            bw_cells = _unpack_cell(make_stack())
            net, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=fw_cells,
                cells_bw=bw_cells,
                inputs=net,
                dtype=tf.float32,
                sequence_length=self._lengths)
    elif kind == 'lstm':
        with tf.variable_scope('lstm'):
            stack = make_stack()
            self._initial_state = stack.zero_state(config.batch_size,
                                                   tf.float32)
            net, last_state = tf.nn.dynamic_rnn(
                stack,
                net,
                dtype=tf.float32,
                sequence_length=self._lengths,
                initial_state=self.initial_state)
            self._final_state = last_state

    ## Feed forward layer. Transform the RNN output to the right output size
    with tf.variable_scope('forward2'):
        # The bidirectional stack concatenates both directions.
        in_size = config.rnn_size
        if self._model_type.lower() == 'blstm':
            in_size = 2 * config.rnn_size
        net = tf.reshape(net, [-1, in_size])

        w_init = tf.random_normal_initializer(stddev=0.01)
        weights1 = tf.get_variable('weights1', [in_size, n_out],
                                   initializer=w_init)
        biases1 = tf.get_variable('biases1', [n_out],
                                  initializer=tf.constant_initializer(0.0))
        weights2 = tf.get_variable(
            'weights2', [in_size, n_out],
            initializer=tf.random_normal_initializer(stddev=0.01))
        biases2 = tf.get_variable('biases2', [n_out],
                                  initializer=tf.constant_initializer(0.0))

        flat_mask1 = tf.nn.relu(tf.matmul(net, weights1) + biases1)
        flat_mask2 = tf.nn.relu(tf.matmul(net, weights2) + biases2)
        self._activations1 = tf.reshape(
            flat_mask1, [config.batch_size, -1, config.output_size])
        self._activations2 = tf.reshape(
            flat_mask2, [config.batch_size, -1, config.output_size])

        # Each mask is applied to the input features.
        self._cleaned1 = self._activations1 * self._mixed
        self._cleaned2 = self._activations2 * self._mixed

    # Ability to save the model
    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=30)
    if infer:
        return

    def summed_sq_err(estimate, target):
        # Squared error summed over axis 1.
        return tf.reduce_sum(tf.pow(estimate - target, 2), 1)

    # Cost of pairing outputs with targets in the given order ...
    cost1 = tf.reduce_mean(
        summed_sq_err(self._cleaned1, self._labels1) +
        summed_sq_err(self._cleaned2, self._labels2), 1)
    # ... and in the swapped order.
    cost2 = tf.reduce_mean(
        summed_sq_err(self._cleaned2, self._labels1) +
        summed_sq_err(self._cleaned1, self._labels2), 1)

    # idx is 1.0 where the swapped pairing is cheaper, so the sum below
    # picks the smaller of the two costs per example.
    idx = tf.cast(cost1 > cost2, tf.float32)
    self._loss = tf.reduce_sum(idx * cost2 + (1 - idx) * cost1)
    # A graph built under a reusing variable scope gets no optimizer.
    if tf.get_variable_scope().reuse:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    train_vars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(
        tf.gradients(self.loss, train_vars), config.max_grad_norm)
    adam = tf.train.AdamOptimizer(self.lr)
    self._train_op = adam.apply_gradients(zip(clipped, train_vars))

    self._new_lr = tf.placeholder(tf.float32,
                                  shape=[],
                                  name='new_learning_rate')
    self._lr_update = tf.assign(self._lr, self._new_lr)
def __init__(self,
             x_mag_spec_batch,
             lengths_batch,
             y_mag_spec_batch=None,
             theta_x_batch=None,
             theta_y_batch=None,
             behavior='train'):
    '''
    Build the two-network "training in turn" graph;
    behavior = 'train/validation/infer'.

    A first bidirectional RNN ('logbias_net') predicts the log bias used
    to normalise log-magnitude spectra; a second one ('mask_net') takes the
    normalised log-magnitude input and predicts a ReLU mask.  Each network
    has its own loss, learning rate and Adam optimizer, and only updates
    variables whose name contains its scope name.

    Args:
      x_mag_spec_batch: input magnitude spectrum batch.
      lengths_batch: per-utterance lengths, passed as `sequence_length`;
        its first dimension is used as the batch size.
      y_mag_spec_batch: label magnitude spectrum; required unless infer.
      theta_x_batch: input phase; required unless infer.
      theta_y_batch: label phase; required unless infer.
      behavior: compared against self.train / self.validation / self.infer.

    NOTE(review): the source of this method was flattened; the extent of
    each `with tf.variable_scope(...)` block was reconstructed so that all
    variables of a network live under its scope -- confirm against
    existing checkpoints.
    '''
    if behavior != self.infer:
        assert (y_mag_spec_batch is not None)
        assert (theta_x_batch is not None)
        assert (theta_y_batch is not None)

    self._x_mag_spec = x_mag_spec_batch
    self._norm_x_mag_spec = norm_mag_spec(self._x_mag_spec,
                                          FLAGS.PARAM.MAG_NORM_MAX)
    self._y_mag_spec = y_mag_spec_batch
    self._norm_y_mag_spec = norm_mag_spec(self._y_mag_spec,
                                          FLAGS.PARAM.MAG_NORM_MAX)
    self._lengths = lengths_batch
    self._batch_size = tf.shape(self._lengths)[0]
    self._x_theta = theta_x_batch
    self._y_theta = theta_y_batch
    self._model_type = FLAGS.PARAM.MODEL_TYPE

    if FLAGS.PARAM.INPUT_TYPE == 'mag':
        self.logbias_net_input = self._norm_x_mag_spec
    elif FLAGS.PARAM.INPUT_TYPE == 'logmag':
        tf.logging.error(
            "Training_In_Turn_Model: NNET input must be magnitude spectrum."
        )
        exit(-1)

    # region training dropout
    lstm_attn_cell = lstm_cell
    if behavior != self.infer and FLAGS.PARAM.KEEP_PROB < 1.0:

        def lstm_attn_cell(n_units, n_proj, act):
            return tf.contrib.rnn.DropoutWrapper(
                lstm_cell(n_units, n_proj, act),
                output_keep_prob=FLAGS.PARAM.KEEP_PROB)

    GRU_attn_cell = GRU_cell
    if behavior != self.infer and FLAGS.PARAM.KEEP_PROB < 1.0:

        def GRU_attn_cell(n_units, act):
            return tf.contrib.rnn.DropoutWrapper(
                GRU_cell(n_units, act),
                output_keep_prob=FLAGS.PARAM.KEEP_PROB)
    # endregion

    # region logbias net
    with tf.variable_scope('logbias_net'):
        logbias_net_outputs = self.logbias_net_input
        if FLAGS.PARAM.MODEL_TYPE.upper() == 'BLSTM':
            with tf.variable_scope('BLSTM_logbias'):
                lstm_fw_cell = tf.contrib.rnn.MultiRNNCell([
                    lstm_attn_cell(FLAGS.PARAM.RNN_SIZE_LOGBIAS,
                                   FLAGS.PARAM.LSTM_num_proj_LOGBIAS,
                                   FLAGS.PARAM.LSTM_ACTIVATION_LOGBIAS)
                    for _ in range(FLAGS.PARAM.RNN_LAYER_LOGBIAS)
                ], state_is_tuple=True)
                lstm_bw_cell = tf.contrib.rnn.MultiRNNCell([
                    lstm_attn_cell(FLAGS.PARAM.RNN_SIZE_LOGBIAS,
                                   FLAGS.PARAM.LSTM_num_proj_LOGBIAS,
                                   FLAGS.PARAM.LSTM_ACTIVATION_LOGBIAS)
                    for _ in range(FLAGS.PARAM.RNN_LAYER_LOGBIAS)
                ], state_is_tuple=True)
                fw_cell_logbiasnet = lstm_fw_cell._cells
                bw_cell_logbiasnet = lstm_bw_cell._cells
        if FLAGS.PARAM.MODEL_TYPE.upper() == 'BGRU':
            with tf.variable_scope('BGRU_logbias'):
                gru_fw_cell = tf.contrib.rnn.MultiRNNCell([
                    GRU_attn_cell(FLAGS.PARAM.RNN_SIZE_LOGBIAS,
                                  FLAGS.PARAM.LSTM_ACTIVATION_LOGBIAS)
                    for _ in range(FLAGS.PARAM.RNN_LAYER_LOGBIAS)
                ], state_is_tuple=True)
                gru_bw_cell = tf.contrib.rnn.MultiRNNCell([
                    GRU_attn_cell(FLAGS.PARAM.RNN_SIZE_LOGBIAS,
                                  FLAGS.PARAM.LSTM_ACTIVATION_LOGBIAS)
                    for _ in range(FLAGS.PARAM.RNN_LAYER_LOGBIAS)
                ], state_is_tuple=True)
                fw_cell_logbiasnet = gru_fw_cell._cells
                bw_cell_logbiasnet = gru_bw_cell._cells

        # dynamic rnn
        result = rnn.stack_bidirectional_dynamic_rnn(
            cells_fw=fw_cell_logbiasnet,
            cells_bw=bw_cell_logbiasnet,
            inputs=logbias_net_outputs,
            dtype=tf.float32,
            sequence_length=self._lengths)
        logbias_net_outputs, fw_final_states, bw_final_states = result

        # NOTE(review): this width ignores LSTM_num_proj_LOGBIAS; if a
        # projection is configured the RNN output is presumably
        # 2 * num_proj wide -- confirm.
        logbias_biRnn_out_size = FLAGS.PARAM.RNN_SIZE_LOGBIAS * 2
        attend_fea = sum_attention(logbias_net_outputs,
                                   logbias_biRnn_out_size, 1024)

        with tf.variable_scope('fullconnectSuitableLogbias'):
            weights_logbias_fc = tf.get_variable(
                'weights_logbias_fc', [logbias_biRnn_out_size, 1],
                initializer=tf.random_normal_initializer(stddev=0.01))
            biases_logbias_fc = tf.get_variable(
                'biases_logbias_fc', [1],
                initializer=tf.constant_initializer(0.0))
            logbias_net_out = tf.expand_dims(
                tf.matmul(attend_fea, weights_logbias_fc) + biases_logbias_fc,
                axis=-1)  # [batch,1,1]
            self._log_bias = tf.nn.relu(logbias_net_out +
                                        FLAGS.PARAM.INIT_LOG_BIAS)

        self._real_logbias = tf.add(self._log_bias,
                                    FLAGS.PARAM.MIN_LOG_BIAS)
    # endregion

    self._norm_x_logmag_spec = norm_logmag_spec(self._x_mag_spec,
                                                FLAGS.PARAM.MAG_NORM_MAX,
                                                self._log_bias,
                                                FLAGS.PARAM.MIN_LOG_BIAS)
    self._norm_y_logmag_spec = norm_logmag_spec(self._y_mag_spec,
                                                FLAGS.PARAM.MAG_NORM_MAX,
                                                self._log_bias,
                                                FLAGS.PARAM.MIN_LOG_BIAS)

    # region mask net
    with tf.variable_scope('mask_net'):
        mask_net_outputs = self._norm_x_logmag_spec
        if FLAGS.PARAM.MODEL_TYPE.upper() == 'BLSTM':
            with tf.variable_scope('BLSTM_mask'):
                lstm_fw_cell = tf.contrib.rnn.MultiRNNCell([
                    lstm_attn_cell(FLAGS.PARAM.RNN_SIZE_MASK,
                                   FLAGS.PARAM.LSTM_num_proj_MASK,
                                   FLAGS.PARAM.LSTM_ACTIVATION_MASK)
                    for _ in range(FLAGS.PARAM.RNN_LAYER_MASK)
                ], state_is_tuple=True)
                lstm_bw_cell = tf.contrib.rnn.MultiRNNCell([
                    lstm_attn_cell(FLAGS.PARAM.RNN_SIZE_MASK,
                                   FLAGS.PARAM.LSTM_num_proj_MASK,
                                   FLAGS.PARAM.LSTM_ACTIVATION_MASK)
                    for _ in range(FLAGS.PARAM.RNN_LAYER_MASK)
                ], state_is_tuple=True)
                fw_cell_masknet = lstm_fw_cell._cells
                bw_cell_masknet = lstm_bw_cell._cells
        if FLAGS.PARAM.MODEL_TYPE.upper() == 'BGRU':
            with tf.variable_scope('BGRU_mask'):
                # Use the mask-net hyper-parameters.  The generic RNN_SIZE /
                # RNN_LAYER / LSTM_ACTIVATION were used here before, which
                # disagrees with the RNN_SIZE_MASK-based reshape below.
                gru_fw_cell = tf.contrib.rnn.MultiRNNCell([
                    GRU_attn_cell(FLAGS.PARAM.RNN_SIZE_MASK,
                                  FLAGS.PARAM.LSTM_ACTIVATION_MASK)
                    for _ in range(FLAGS.PARAM.RNN_LAYER_MASK)
                ], state_is_tuple=True)
                gru_bw_cell = tf.contrib.rnn.MultiRNNCell([
                    GRU_attn_cell(FLAGS.PARAM.RNN_SIZE_MASK,
                                  FLAGS.PARAM.LSTM_ACTIVATION_MASK)
                    for _ in range(FLAGS.PARAM.RNN_LAYER_MASK)
                ], state_is_tuple=True)
                fw_cell_masknet = gru_fw_cell._cells
                bw_cell_masknet = gru_bw_cell._cells

        # dynamic rnn
        result = rnn.stack_bidirectional_dynamic_rnn(
            cells_fw=fw_cell_masknet,
            cells_bw=bw_cell_masknet,
            inputs=mask_net_outputs,
            dtype=tf.float32,
            sequence_length=self._lengths)
        mask_net_outputs, fw_final_states, bw_final_states = result

        mask_biRnn_output_size = FLAGS.PARAM.RNN_SIZE_MASK * 2
        flatten_outputs = tf.reshape(mask_net_outputs,
                                     [-1, mask_biRnn_output_size])
        out_size = FLAGS.PARAM.OUTPUT_SIZE
        with tf.variable_scope('fullconnectMask'):
            weights = tf.get_variable(
                'weights1', [mask_biRnn_output_size, out_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            biases = tf.get_variable(
                'biases1', [out_size],
                initializer=tf.constant_initializer(0.0))
        mask = tf.nn.relu(tf.matmul(flatten_outputs, weights) + biases)
        self._mask = tf.reshape(
            mask, [self._batch_size, -1, FLAGS.PARAM.OUTPUT_SIZE])
    # endregion

    # region prepare y_estimation and y_labels
    self._y_mag_labels = self._norm_y_mag_spec
    self._y_logmag_labels = self._norm_y_logmag_spec
    # The mask is applied in one domain and the estimate is converted to
    # the other, so both losses below have an estimate to compare against.
    if FLAGS.PARAM.TRAINING_MASK_POSITION == 'mag':
        self._y_normed_mag_estimation = self._mask * self._norm_x_mag_spec
        self._y_normed_logmag_estimation = normedMag2normedLogmag(
            self._y_normed_mag_estimation, FLAGS.PARAM.MAG_NORM_MAX,
            self._log_bias, FLAGS.PARAM.MIN_LOG_BIAS)
    elif FLAGS.PARAM.TRAINING_MASK_POSITION == 'logmag':
        self._y_normed_logmag_estimation = self._mask * self._norm_x_logmag_spec
        self._y_normed_mag_estimation = normedLogmag2normedMag(
            self._y_normed_logmag_estimation, FLAGS.PARAM.MAG_NORM_MAX,
            self._log_bias, FLAGS.PARAM.MIN_LOG_BIAS)

    if FLAGS.PARAM.MASK_TYPE == 'PSM':
        self._y_mag_labels *= tf.cos(self._x_theta - self._y_theta)
        self._y_logmag_labels *= tf.cos(self._x_theta - self._y_theta)
    elif FLAGS.PARAM.MASK_TYPE == 'IRM':
        pass
    else:
        tf.logging.error('Mask type error.')
        exit(-1)

    # region get infer spec
    # This mismatch is only reported; graph construction continues.
    if FLAGS.PARAM.DECODING_MASK_POSITION != FLAGS.PARAM.TRAINING_MASK_POSITION:
        print(
            'Error, DECODING_MASK_POSITION should be equal to TRAINING_MASK_POSITION when use training_in_turn_model.'
        )
    # _y_mag_estimation is the estimated mag_spec.
    if FLAGS.PARAM.DECODING_MASK_POSITION == 'mag':
        self._y_mag_estimation = rm_norm_mag_spec(
            self._y_normed_mag_estimation, FLAGS.PARAM.MAG_NORM_MAX)
    elif FLAGS.PARAM.DECODING_MASK_POSITION == 'logmag':
        self._y_mag_estimation = rm_norm_logmag_spec(
            self._y_normed_logmag_estimation, FLAGS.PARAM.MAG_NORM_MAX,
            self._log_bias, FLAGS.PARAM.MIN_LOG_BIAS)
    # endregion
    # endregion

    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=30)
    if behavior == self.infer:
        return

    # region get LOSS
    # The logbias net is scored on the magnitude estimate, the mask net on
    # the log-magnitude estimate.
    if FLAGS.PARAM.LOSS_FUNC_FOR_MAG_SPEC == 'SPEC_MSE':  # log_mag and mag MSE
        self._logbiasnet_loss = loss.relative_reduce_sum_frame_batchsize_MSE(
            self._y_normed_mag_estimation, self._y_mag_labels, 1e-6)
        self._masknet_loss = loss.reduce_sum_frame_batchsize_MSE(
            self._y_normed_logmag_estimation, self._y_logmag_labels)
    elif FLAGS.PARAM.LOSS_FUNC_FOR_MAG_SPEC == "SPEC_MSE_FLEXIBLE_POW_C":
        self._logbiasnet_loss = loss.reduce_sum_frame_batchsize_MSE_EmphasizeLowerValue(
            self._y_normed_mag_estimation, self._y_mag_labels,
            FLAGS.PARAM.POW_COEF)
        self._masknet_loss = loss.reduce_sum_frame_batchsize_MSE_EmphasizeLowerValue(
            self._y_normed_logmag_estimation, self._y_logmag_labels,
            FLAGS.PARAM.POW_COEF)
    else:
        print('Loss type error.')
        exit(-1)
    # endregion

    if behavior == self.validation:
        # val model cannot train.
        return

    self._lr_logbiasnet = tf.Variable(0.0, trainable=False)
    self._lr_masknet = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    # Variables are assigned to a network by the scope name in their name.
    logbias_vars = [var for var in tvars if 'logbias_net' in var.name]
    mask_vars = [var for var in tvars if 'mask_net' in var.name]
    logbiasnet_grads, _ = tf.clip_by_global_norm(
        tf.gradients(self._logbiasnet_loss, logbias_vars),
        FLAGS.PARAM.CLIP_NORM)
    masknet_grads, _ = tf.clip_by_global_norm(
        tf.gradients(self._masknet_loss, mask_vars),
        FLAGS.PARAM.CLIP_NORM)
    optimizer_logbiasnet = tf.train.AdamOptimizer(self.lr_logbiasnet)
    optimizer_masknet = tf.train.AdamOptimizer(self.lr_masknet)
    train_logbiasnet = optimizer_logbiasnet.apply_gradients(
        zip(logbiasnet_grads, logbias_vars))
    train_masknet = optimizer_masknet.apply_gradients(
        zip(masknet_grads, mask_vars))

    # NOTE(review): any other TRAIN_TYPE leaves `_train_op` unset.
    if FLAGS.PARAM.TRAIN_TYPE == 'BOTH':
        self._train_op = [train_logbiasnet, train_masknet]
    elif FLAGS.PARAM.TRAIN_TYPE == 'LOGBIASNET':
        self._train_op = train_logbiasnet
    elif FLAGS.PARAM.TRAIN_TYPE == 'MASKNET':
        self._train_op = train_masknet

    self._new_lr_logbiasnet = tf.placeholder(tf.float32,
                                             shape=[],
                                             name='new_learning_rate1')
    self._new_lr_masknet = tf.placeholder(tf.float32,
                                          shape=[],
                                          name='new_learning_rate2')
    self._lr_update = [
        tf.assign(self._lr_logbiasnet, self._new_lr_logbiasnet),
        tf.assign(self._lr_masknet, self._new_lr_masknet)
    ]
def __init__(self,
             x_mag_spec_batch,
             lengths_batch,
             y_mag_spec_batch=None,
             theta_x_batch=None,
             theta_y_batch=None,
             behavior='train'):
    '''
    Build the mask-estimation graph on individually normalized magnitude
    spectra, and (depending on `behavior`) the loss and the training op.

    behavior = 'train/validation/infer'
      - infer:      graph + saver only; labels and phases may be None.
      - validation: additionally builds self._loss.
      - train:      additionally builds the learning-rate variable, the
                    clipped-gradient Adam train op and the lr-update op.

    Args:
      x_mag_spec_batch: noisy magnitude spectrum batch (a [batch, time, fre]
        layout is suggested by the reshape of the mask below -- TODO confirm).
      lengths_batch: per-utterance lengths, passed as `sequence_length`.
      y_mag_spec_batch: clean magnitude spectrum batch; required unless
        behavior is infer.
      theta_x_batch / theta_y_batch: phases of the noisy / clean signal;
        required unless behavior is infer, used for the 'PSM' mask type.

    NOTE(review): the nesting of the `with` blocks was reconstructed from a
    whitespace-collapsed source; confirm variable-scope names against
    existing checkpoints ('fullconnectCoef' is assumed to be a sibling of
    'fullconnectOut', not nested inside it).
    '''
    has_labels = y_mag_spec_batch is not None
    if behavior != self.infer:
        assert has_labels
        assert (theta_x_batch is not None)
        assert (theta_y_batch is not None)

    self._log_bias = tf.get_variable(
        'logbias', [1],
        trainable=FLAGS.PARAM.LOG_BIAS_TRAINABLE,
        initializer=tf.constant_initializer(FLAGS.PARAM.INIT_LOG_BIAS))
    self._real_logbias = self._log_bias + FLAGS.PARAM.MIN_LOG_BIAS

    # Per-batch statistics of the noisy input and its normalized version.
    self._x_mag_spec = x_mag_spec_batch
    self.indi_mean_x, self.indi_var_x = tf.nn.moments(
        self._x_mag_spec, axes=FLAGS.PARAM.BN_KEEP_DIMS, keep_dims=True)
    self._norm_x_mag_spec = indi_norm_mag_spec(
        self._x_mag_spec, self.indi_mean_x, self.indi_var_x)

    # Label statistics only exist when labels were supplied. In infer mode
    # the signature allows None, which tf.nn.moments cannot consume.
    self._y_mag_spec = y_mag_spec_batch
    if has_labels:
        self.indi_mean_y, self.indi_var_y = tf.nn.moments(
            self._y_mag_spec, axes=FLAGS.PARAM.BN_KEEP_DIMS, keep_dims=True)
        self._norm_y_mag_spec = indi_norm_mag_spec(
            self._y_mag_spec, self.indi_mean_y, self.indi_var_y)
    else:
        self.indi_mean_y = None
        self.indi_var_y = None
        self._norm_y_mag_spec = None

    self._lengths = lengths_batch
    self._batch_size = tf.shape(self._lengths)[0]

    self._x_theta = theta_x_batch
    self._y_theta = theta_y_batch
    self._model_type = FLAGS.PARAM.MODEL_TYPE
    model_type = FLAGS.PARAM.MODEL_TYPE.upper()

    self.net_input = self._norm_x_mag_spec
    self._y_labels = self._norm_y_mag_spec

    outputs = self.net_input

    # Wrap cells with output dropout everywhere except in infer mode.
    use_dropout = behavior != self.infer and FLAGS.PARAM.KEEP_PROB < 1.0

    lstm_attn_cell = lstm_cell
    if use_dropout:
        def lstm_attn_cell(n_units, n_proj, act):
            return tf.contrib.rnn.DropoutWrapper(
                lstm_cell(n_units, n_proj, act),
                output_keep_prob=FLAGS.PARAM.KEEP_PROB)

    GRU_attn_cell = GRU_cell
    if use_dropout:
        def GRU_attn_cell(n_units, act):
            return tf.contrib.rnn.DropoutWrapper(
                GRU_cell(n_units, act),
                output_keep_prob=FLAGS.PARAM.KEEP_PROB)

    n_layers = FLAGS.PARAM.RNN_LAYER
    if model_type == 'BLSTM':
        with tf.variable_scope('BLSTM'):
            # One independent cell per layer and direction; plain lists are
            # what stack_bidirectional_dynamic_rnn expects.
            fw_cell = [
                lstm_attn_cell(FLAGS.PARAM.RNN_SIZE,
                               FLAGS.PARAM.LSTM_num_proj,
                               FLAGS.PARAM.LSTM_ACTIVATION)
                for _ in range(n_layers)
            ]
            bw_cell = [
                lstm_attn_cell(FLAGS.PARAM.RNN_SIZE,
                               FLAGS.PARAM.LSTM_num_proj,
                               FLAGS.PARAM.LSTM_ACTIVATION)
                for _ in range(n_layers)
            ]
            outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=fw_cell,
                cells_bw=bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=self._lengths)
    elif model_type == 'BGRU':
        with tf.variable_scope('BGRU'):
            fw_cell = [
                GRU_attn_cell(FLAGS.PARAM.RNN_SIZE,
                              FLAGS.PARAM.LSTM_ACTIVATION)
                for _ in range(n_layers)
            ]
            bw_cell = [
                GRU_attn_cell(FLAGS.PARAM.RNN_SIZE,
                              FLAGS.PARAM.LSTM_ACTIVATION)
                for _ in range(n_layers)
            ]
            outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=fw_cell,
                cells_bw=bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=self._lengths)

    # region full connection get mask
    # calcu rnn output size
    in_size = FLAGS.PARAM.RNN_SIZE
    if model_type.startswith('B'):  # bidirection
        in_size = FLAGS.PARAM.RNN_SIZE * 2
        # Compare case-insensitively, exactly like the branch that built the
        # network, so a lower-case MODEL_TYPE still sees the projection size.
        if model_type == 'BLSTM' and FLAGS.PARAM.LSTM_num_proj is not None:
            in_size = 2 * FLAGS.PARAM.LSTM_num_proj

    outputs = tf.reshape(outputs, [-1, in_size])
    out_size = FLAGS.PARAM.OUTPUT_SIZE
    with tf.variable_scope('fullconnectOut'):
        weights = tf.get_variable(
            'weights1', [in_size, out_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        biases = tf.get_variable(
            'biases1', [out_size],
            initializer=tf.constant_initializer(FLAGS.PARAM.INIT_MASK_VAL))

    if FLAGS.PARAM.TIME_NOSOFTMAX_ATTENTION:
        with tf.variable_scope('fullconnectCoef'):
            weights_coef = tf.get_variable(
                'weights_coef', [in_size, 1],
                initializer=tf.random_normal_initializer(mean=1.0,
                                                         stddev=0.01))
            biases_coef = tf.get_variable(
                'biases_coef', [1],
                initializer=tf.constant_initializer(0.0))
        raw_mask = tf.reshape(
            tf.matmul(outputs, weights) + biases,
            [self._batch_size, -1, FLAGS.PARAM.OUTPUT_SIZE])  # [batch,time,fre]
        batch_coef_vec = tf.nn.relu(
            tf.reshape(tf.matmul(outputs, weights_coef) + biases_coef,
                       [self._batch_size, -1]))  # [batch, time]
        # Scale every frame of the raw mask by its non-negative coefficient.
        mask = tf.multiply(
            raw_mask,
            tf.reshape(batch_coef_vec, [self._batch_size, -1, 1]))
    else:
        mask = tf.nn.relu(tf.matmul(outputs, weights) + biases)
    # endregion

    self._mask = tf.reshape(
        mask, [self._batch_size, -1, FLAGS.PARAM.OUTPUT_SIZE])

    # region get infer spec
    masked_spec = self._mask * self._norm_x_mag_spec
    if not FLAGS.PARAM.USE_ESTIMATED_MEAN_VAR:
        tmp_mean, tmp_var = self.indi_mean_x, self.indi_var_x
    else:
        tmp_mean, tmp_var = tf.nn.moments(
            masked_spec, axes=FLAGS.PARAM.BN_KEEP_DIMS, keep_dims=True)
    # _y_mag_estimation is estimated mag_spec
    # _y_estimation is the loss target (normalized masked spectrum)
    self._y_mag_estimation = rm_indi_norm_mag_spec(masked_spec,
                                                   tmp_mean, tmp_var)
    # endregion

    # region prepare y_estimation and y_labels
    self._y_estimation = masked_spec

    if FLAGS.PARAM.MASK_TYPE == 'PSM':
        # Phase-sensitive weighting of the labels; skipped when infer mode
        # was given no labels/phases (nothing to weight).
        if (has_labels and theta_x_batch is not None
                and theta_y_batch is not None):
            self._y_labels *= tf.cos(self._x_theta - self._y_theta)
    elif FLAGS.PARAM.MASK_TYPE == 'IRM':
        pass
    else:
        tf.logging.error('Mask type error.')
        exit(-1)

    if FLAGS.PARAM.TRAINING_MASK_POSITION != FLAGS.PARAM.LABEL_TYPE:
        print(
            "error, FLAGS.PARAM.TRAINING_MASK_POSITION != FLAGS.PARAM.LABEL_TYPE."
        )
        exit(-1)
    # endregion

    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=30)
    if behavior == self.infer:
        return

    # region get LOSS
    loss_type = FLAGS.PARAM.LOSS_FUNC_FOR_MAG_SPEC
    if loss_type in ('SPEC_MSE', 'SPEC_MSE_LOWF_EN'):  # log_mag and mag MSE
        self._loss = loss.reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels)
    elif loss_type == "FAIR_SPEC_MSE":
        self._loss = loss.fair_reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels)
    elif loss_type == "SPEC_MSE_FLEXIBLE_POW_C":
        self._loss = loss.reduce_sum_frame_batchsize_MSE_EmphasizeLowerValue(
            self._y_estimation, self._y_labels, FLAGS.PARAM.POW_COEF)
    elif loss_type == "RELATED_MSE":
        self._loss = loss.relative_reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels,
            FLAGS.PARAM.RELATED_MSE_IGNORE_TH)
    elif loss_type == "AUTO_RELATED_MSE":
        self._loss = loss.auto_ingore_relative_reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels,
            FLAGS.PARAM.AUTO_RELATED_MSE_AXIS_FIT_DEG)
    else:
        print('Loss type error.')
        exit(-1)
    # endregion

    if behavior == self.validation:
        # val model cannot train.
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      FLAGS.PARAM.CLIP_NORM)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Learning rate is changed from outside by feeding this placeholder.
    self._new_lr = tf.placeholder(tf.float32, shape=[],
                                  name='new_learning_rate')
    self._lr_update = tf.assign(self._lr, self._new_lr)
def __init__(self, config, inputs_cmvn, inputs, labels1, labels2, lengths,
             infer=False):
    '''
    Build a two-output mask network (BLSTM or LSTM) with a probabilistic
    permutation-invariant MSE loss and, when training, the Adam train op.

    Args:
      config: hyper-parameter object; this constructor reads model_type,
        batch_size, input_size, rnn_size, rnn_num_layers, keep_prob,
        output_size, czt_dim and max_grad_norm from it. NOTE: when `infer`
        is true, config.batch_size is overwritten with 1.
      inputs_cmvn: network input features.
      inputs: mixture features the two masks are applied to (only the part
        from index config.czt_dim on the last axis is used).
      labels1, labels2: targets of the two outputs.
      lengths: sequence lengths passed to the RNN.
      infer: if true, no dropout, no loss and no train op are built.

    NOTE(review): the nesting of the `with` blocks was reconstructed from a
    whitespace-collapsed source; confirm scope names against checkpoints.
    '''
    self._inputs = inputs_cmvn
    self._mixed = inputs
    self._labels1 = labels1
    self._labels2 = labels2
    self._lengths = lengths
    self._model_type = config.model_type

    if infer:  # if infer, we prefer to run one utterance one time.
        config.batch_size = 1

    outputs = self._inputs
    # This first layer-- feed forward layer
    # Transform the input to the right size before feed into RNN
    with tf.variable_scope('forward1'):
        outputs = tf.reshape(outputs, [-1, config.input_size])
        outputs = tf.layers.dense(
            outputs,
            units=config.rnn_size,
            activation=tf.nn.tanh,
            kernel_initializer=glorot_uniform_initializer(),
            reuse=tf.get_variable_scope().reuse)
        outputs = tf.reshape(
            outputs, [config.batch_size, -1, config.rnn_size])

    use_dropout = not infer and config.keep_prob < 1.0

    # Configure the LSTM or BLSTM model
    # For BLSTM, we use the BasicLSTMCell.For LSTM, we use LSTMCell.
    if config.model_type.lower() == 'blstm':
        with tf.variable_scope('blstm'):
            def blstm_cell():
                # A fresh cell per layer and direction. Repeating a single
                # cell object ([cell] * n) makes every layer/direction share
                # (or fail to share) one set of weights.
                cell = tf.contrib.rnn.BasicLSTMCell(config.rnn_size)
                if use_dropout:
                    cell = tf.contrib.rnn.DropoutWrapper(
                        cell, output_keep_prob=config.keep_prob)
                return cell

            lstm_fw_cell = tf.contrib.rnn.MultiRNNCell(
                [blstm_cell() for _ in range(config.rnn_num_layers)])
            lstm_bw_cell = tf.contrib.rnn.MultiRNNCell(
                [blstm_cell() for _ in range(config.rnn_num_layers)])
            lstm_fw_cell = _unpack_cell(lstm_fw_cell)
            lstm_bw_cell = _unpack_cell(lstm_bw_cell)
            outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=lstm_fw_cell,
                cells_bw=lstm_bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=self._lengths)

    if config.model_type.lower() == 'lstm':
        with tf.variable_scope('lstm'):
            def lstm_cell():
                return tf.contrib.rnn.LSTMCell(
                    config.rnn_size,
                    forget_bias=1.0,
                    use_peepholes=True,
                    initializer=tf.contrib.layers.xavier_initializer(),
                    state_is_tuple=True,
                    activation=tf.tanh)

            attn_cell = lstm_cell
            if use_dropout:
                def attn_cell():
                    return tf.contrib.rnn.DropoutWrapper(
                        lstm_cell(), output_keep_prob=config.keep_prob)

            cell = tf.contrib.rnn.MultiRNNCell(
                [attn_cell() for _ in range(config.rnn_num_layers)],
                state_is_tuple=True)
            self._initial_state = cell.zero_state(
                config.batch_size, tf.float32)
            outputs, state = tf.nn.dynamic_rnn(
                cell,
                outputs,
                dtype=tf.float32,
                sequence_length=self._lengths,
                initial_state=self.initial_state)
            self._final_state = state

    # Feed forward layer. Transform the RNN output to the right output size
    with tf.variable_scope('forward2'):
        if self._model_type.lower() == 'blstm':
            in_size = 2 * config.rnn_size  # fw and bw outputs concatenated
        else:
            in_size = config.rnn_size
        outputs = tf.reshape(outputs, [-1, in_size])
        out_size = config.output_size

        weights1 = tf.get_variable(
            'weights1', [in_size, out_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        biases1 = tf.get_variable(
            'biases1', [out_size],
            initializer=tf.constant_initializer(0.0))
        weights2 = tf.get_variable(
            'weights2', [in_size, out_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        biases2 = tf.get_variable(
            'biases2', [out_size],
            initializer=tf.constant_initializer(0.0))
        mask1 = tf.nn.sigmoid(tf.matmul(outputs, weights1) + biases1)
        mask2 = tf.nn.sigmoid(tf.matmul(outputs, weights2) + biases2)

        self._activations1 = tf.reshape(
            mask1, [config.batch_size, -1, config.output_size])
        self._activations2 = tf.reshape(
            mask2, [config.batch_size, -1, config.output_size])

        self._cleaned1 = self._activations1 * \
            self._mixed[:, :, config.czt_dim:]
        self._cleaned2 = self._activations2 * \
            self._mixed[:, :, config.czt_dim:]

    # Ability to save the model
    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=30)

    if infer:
        return

    # Compute loss(Mse) for both output/label assignments.
    cost1 = tf.reduce_mean(
        tf.reduce_sum(tf.pow(self._cleaned1 - self._labels1, 2), 1)
        + tf.reduce_sum(tf.pow(self._cleaned2 - self._labels2, 2), 1), 1)
    cost2 = tf.reduce_mean(
        tf.reduce_sum(tf.pow(self._cleaned2 - self._labels1, 2), 1)
        + tf.reduce_sum(tf.pow(self._cleaned1 - self._labels2, 2), 1), 1)

    # idx is 1.0 where the swapped assignment (cost2) is the cheaper one.
    idx = tf.cast(cost1 > cost2, tf.float32)
    min_cost = idx * cost2 + (1 - idx) * cost1
    # The larger cost is the complementary selection (cost1 where idx is 1,
    # cost2 otherwise).
    max_cost = idx * cost1 + (1 - idx) * cost2

    # Prob PIT cost: below the `const` threshold gamma is treated as zero
    # and the plain minimum cost is used; otherwise a soft minimum is used.
    self.gamma = tf.Variable(1e-17, trainable=False)
    const = tf.constant(1e-11)

    def f1():
        return tf.reduce_sum(min_cost)

    def f2():
        return tf.reduce_sum(
            min_cost - self.gamma
            * tf.log(tf.exp((min_cost - max_cost) / self.gamma) + 1))

    self._loss = tf.cond(tf.less(self.gamma, const), f1, f2)

    # A model built in a reusing scope shares weights and is not trained.
    if tf.get_variable_scope().reuse:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Learning rate is changed from outside by feeding this placeholder.
    self._new_lr = tf.placeholder(
        tf.float32, shape=[], name='new_learning_rate')
    self._lr_update = tf.assign(self._lr, self._new_lr)
def __call__(self, inputs, seq_len, keep_prob=None, is_train=None):
    """Run the stacked bidirectional RNN on `inputs`, then apply dropout.

    Args:
      inputs: input tensor handed to stack_bidirectional_dynamic_rnn.
      seq_len: per-example lengths, passed as `sequence_length`.
      keep_prob, is_train: forwarded unchanged to the `dropout` helper.

    Returns:
      The RNN output after `dropout`; the final states are discarded.

    NOTE(review): the source was whitespace-collapsed; `dropout` is assumed
    to be applied inside the variable scope -- confirm.
    """
    with tf.variable_scope(self.scope):
        rnn_results = stack_bidirectional_dynamic_rnn(
            self.cells_fw,
            self.cells_bw,
            inputs,
            sequence_length=seq_len,
            dtype=tf.float32)
        # Element 0 is the output tensor; the rest are final states.
        rnn_output, *_ = rnn_results
        dropped = dropout(rnn_output, keep_prob, is_train)
        return dropped
def build_net(self):
    """Build the speaker-extraction network and the saver.

    Pipeline: auxiliary speaker-embedding network -> BLSTM ('blstm') ->
    concatenation with the tiled speaker embedding -> ReLU layer ('fc1') ->
    BLSTM ('blstm2') -> ReLU layer ('fc2') -> mask layer ('mask').
    Sets self._mask, self._sep (mask applied to self._mixed) and self.saver.

    NOTE(review): the nesting of the `with` blocks was reconstructed from a
    whitespace-collapsed source; confirm scope names against checkpoints.
    """
    cfg = self._config

    def make_cell():
        # Output dropout is only applied outside infer mode.
        if not self._infer and cfg.keep_prob < 1.0:
            return tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(cfg.rnn_size),
                output_keep_prob=cfg.keep_prob)
        return tf.contrib.rnn.BasicLSTMCell(cfg.rnn_size)

    def run_blstm(layer_inputs):
        # Must be called inside the variable scope that should own the
        # BLSTM weights.
        # tf.nn.rnn_cell.MultiRNNCell in r1.12
        fw_stack = tf.contrib.rnn.MultiRNNCell(
            [make_cell() for _ in range(cfg.rnn_num_layers)],
            state_is_tuple=True)
        bw_stack = tf.contrib.rnn.MultiRNNCell(
            [make_cell() for _ in range(cfg.rnn_num_layers)],
            state_is_tuple=True)
        fw_cells = self._unpack_cell(fw_stack)
        bw_cells = self._unpack_cell(bw_stack)
        blstm_out, _, _ = rnn.stack_bidirectional_dynamic_rnn(
            cells_fw=fw_cells,
            cells_bw=bw_cells,
            inputs=layer_inputs,
            dtype=tf.float32,
            sequence_length=self._lengths)
        return blstm_out

    # build auxiliary network to get the speaker embedding used for speaker
    # extraction network
    with tf.variable_scope('spk_embed'):
        spk_embed_aux = self.build_net_aux(self._inputs_aux,
                                           self._lengths_aux)

    outputs = tf.reshape(self._inputs,
                         [cfg.batch_size, -1, cfg.input_size])

    # BLSTM layer
    with tf.variable_scope('blstm'):
        outputs = run_blstm(outputs)

    # speaker adaptation layer by concat the output from auxiliary network
    with tf.variable_scope('adapt_concat'):
        outputs = tf.reshape(outputs,
                             [cfg.batch_size, -1, 2 * cfg.rnn_size])
        frame_num = tf.shape(outputs)[1]
        # Repeat the embedding once per frame, then bring it to
        # batch-major order so it lines up with `outputs`.
        flat_embed = tf.reshape(spk_embed_aux, (-1, 1))
        tiled_embed = tf.tile(flat_embed, (frame_num, 1))
        time_major_embed = tf.reshape(
            tiled_embed,
            (frame_num, cfg.batch_size, cfg.aux_output_size))
        spk_embed = tf.transpose(time_major_embed, perm=[1, 0, 2])
        outputs = tf.concat([outputs, spk_embed], 2)
        # remove the part out of the lengths when concatenating speaker
        # embeddings
        length_mask = tf.expand_dims(
            tf.sequence_mask(self._lengths, dtype=tf.float32), -1)
        outputs = tf.multiply(length_mask, outputs)
        concat_dim = 2 * cfg.rnn_size + cfg.aux_output_size
        outputs = tf.reshape(outputs, [-1, concat_dim])

    # one more fully connected layer
    with tf.variable_scope('fc1'):
        weights1, biases1 = self._weight_and_bias(concat_dim, cfg.rnn_size)
        outputs = tf.nn.relu(tf.matmul(outputs, weights1) + biases1)
        outputs = tf.reshape(outputs, [cfg.batch_size, -1, cfg.rnn_size])

    # BLSTM layer
    with tf.variable_scope('blstm2'):
        outputs = run_blstm(outputs)
        outputs = tf.reshape(outputs, [-1, 2 * cfg.rnn_size])

    # one more fully connected layer
    with tf.variable_scope('fc2'):
        weights2, biases2 = self._weight_and_bias(2 * cfg.rnn_size,
                                                  cfg.rnn_size)
        outputs = tf.nn.relu(tf.matmul(outputs, weights2) + biases2)

    # Mask estimation layer
    with tf.variable_scope('mask'):
        weights_m, biases_m = self._weight_and_bias(cfg.rnn_size,
                                                    cfg.output_size)
        mask_logits = tf.matmul(outputs, weights_m) + biases_m
        if cfg.mask_type.lower() == 'relu':
            mask = tf.nn.relu(mask_logits)
        else:
            mask = tf.nn.sigmoid(mask_logits)
        self._mask = tf.reshape(mask,
                                [cfg.batch_size, -1, cfg.output_size])

    self._sep = self._mask * self._mixed

    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=50)
def __init__(self,
             x_mag_spec_batch,
             lengths_batch,
             y_mag_spec_batch=None,
             theta_x_batch=None,
             theta_y_batch=None,
             behavior='train'):
    '''
    Build a bidirectional RNN that estimates one mask for the normalized
    magnitude spectrum and one for the phase, plus (depending on
    `behavior`) the combined loss and the training op.

    behavior = 'train/validation/infer'
      - infer:      graph + saver only; clean labels/phase may be None.
      - validation: additionally builds self._loss (= mag loss + phase loss).
      - train:      additionally builds the learning-rate variable, the
                    clipped-gradient Adam train op and the lr-update op.

    Args:
      x_mag_spec_batch: noisy magnitude spectrum batch.
      lengths_batch: per-utterance lengths, passed as `sequence_length`.
      y_mag_spec_batch: clean magnitude spectrum batch; required unless
        behavior is infer.
      theta_x_batch: noisy phase; always required (it is part of the
        network input).
      theta_y_batch: clean phase; required unless behavior is infer.

    NOTE(review): the nesting of the `with` blocks was reconstructed from a
    whitespace-collapsed source; the RNN call is assumed to live inside the
    "BiRNN" scope -- confirm against existing checkpoints.
    '''
    has_labels = y_mag_spec_batch is not None
    assert (theta_x_batch is not None)
    if behavior != self.infer:
        assert has_labels
        assert (theta_y_batch is not None)

    self._x_mag_spec = x_mag_spec_batch
    self._norm_x_mag_spec = norm_mag_spec(self._x_mag_spec,
                                          FLAGS.PARAM.MAG_NORM_MAX)

    # Labels are optional in infer mode, so only normalize what exists.
    self._y_mag_spec = y_mag_spec_batch
    if has_labels:
        self._norm_y_mag_spec = norm_mag_spec(self._y_mag_spec,
                                              FLAGS.PARAM.MAG_NORM_MAX)
    else:
        self._norm_y_mag_spec = None

    self._lengths = lengths_batch
    self._batch_size = tf.shape(self._lengths)[0]

    self._x_theta = theta_x_batch
    self._y_theta = theta_y_batch
    self._model_type = FLAGS.PARAM.MODEL_TYPE
    model_type = FLAGS.PARAM.MODEL_TYPE.upper()

    # Network input is magnitude and phase concatenated on the last axis.
    self.net_input = tf.concat([self._norm_x_mag_spec, self._x_theta],
                               axis=-1)
    self._y_mag_labels = self._norm_y_mag_spec
    self._y_theta_labels = self._y_theta

    outputs = self.net_input

    # Wrap cells with output dropout everywhere except in infer mode.
    use_dropout = behavior != self.infer and FLAGS.PARAM.KEEP_PROB < 1.0

    lstm_attn_cell = lstm_cell
    if use_dropout:
        def lstm_attn_cell(n_units, n_proj, act):
            return tf.contrib.rnn.DropoutWrapper(
                lstm_cell(n_units, n_proj, act),
                output_keep_prob=FLAGS.PARAM.KEEP_PROB)

    GRU_attn_cell = GRU_cell
    if use_dropout:
        def GRU_attn_cell(n_units, act):
            return tf.contrib.rnn.DropoutWrapper(
                GRU_cell(n_units, act),
                output_keep_prob=FLAGS.PARAM.KEEP_PROB)

    n_layers = FLAGS.PARAM.RNN_LAYER
    with tf.variable_scope("BiRNN"):
        if model_type == 'BLSTM':
            with tf.variable_scope('BLSTM'):
                fw_cell = [
                    lstm_attn_cell(FLAGS.PARAM.RNN_SIZE,
                                   FLAGS.PARAM.LSTM_num_proj,
                                   FLAGS.PARAM.LSTM_ACTIVATION)
                    for _ in range(n_layers)
                ]
                bw_cell = [
                    lstm_attn_cell(FLAGS.PARAM.RNN_SIZE,
                                   FLAGS.PARAM.LSTM_num_proj,
                                   FLAGS.PARAM.LSTM_ACTIVATION)
                    for _ in range(n_layers)
                ]
        elif model_type == 'BGRU':
            with tf.variable_scope('BGRU'):
                fw_cell = [
                    GRU_attn_cell(FLAGS.PARAM.RNN_SIZE,
                                  FLAGS.PARAM.LSTM_ACTIVATION)
                    for _ in range(n_layers)
                ]
                bw_cell = [
                    GRU_attn_cell(FLAGS.PARAM.RNN_SIZE,
                                  FLAGS.PARAM.LSTM_ACTIVATION)
                    for _ in range(n_layers)
                ]
        else:
            # Without this branch an unsupported type only surfaced later
            # as a NameError on fw_cell.
            tf.logging.error('Model type error.')
            exit(-1)

        outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
            cells_fw=fw_cell,
            cells_bw=bw_cell,
            inputs=outputs,
            dtype=tf.float32,
            sequence_length=self._lengths)

    # region full connection get mask
    # calcu rnn output size
    in_size = FLAGS.PARAM.RNN_SIZE
    if model_type.startswith('B'):  # bidirection
        in_size = FLAGS.PARAM.RNN_SIZE * 2
        # Compare case-insensitively, exactly like the branch that built the
        # network, so a lower-case MODEL_TYPE still sees the projection size.
        if model_type == 'BLSTM' and FLAGS.PARAM.LSTM_num_proj is not None:
            in_size = 2 * FLAGS.PARAM.LSTM_num_proj

    outputs = tf.reshape(outputs, [-1, in_size])
    out_size = FLAGS.PARAM.OUTPUT_SIZE
    mask_activation = 'relu' if FLAGS.PARAM.ReLU_MASK else None

    # Two independent heads, each producing out_size // 2 values per frame.
    with tf.variable_scope('fullconnectOut1'):
        out1_dense1 = tf.layers.Dense(out_size, activation='tanh')
        out1_dense2 = tf.layers.Dense(
            out_size // 2,
            activation=mask_activation,
            bias_initializer=tf.constant_initializer(
                FLAGS.PARAM.INIT_MASK_VAL))
        self._mask1 = out1_dense2(out1_dense1(outputs))

    with tf.variable_scope('fullconnectOut2'):
        out2_dense1 = tf.layers.Dense(out_size, activation='tanh')
        out2_dense2 = tf.layers.Dense(
            out_size // 2,
            activation=mask_activation,
            bias_initializer=tf.constant_initializer(
                FLAGS.PARAM.INIT_MASK_VAL))
        self._mask2 = out2_dense2(out2_dense1(outputs))

    self._mask1 = tf.reshape(
        self._mask1, [self._batch_size, -1, FLAGS.PARAM.OUTPUT_SIZE // 2])
    self._mask2 = tf.reshape(
        self._mask2, [self._batch_size, -1, FLAGS.PARAM.OUTPUT_SIZE // 2])
    self._mask = tf.concat([self._mask1, self._mask2], axis=-1)
    # endregion

    # mask type: optionally weight the magnitude labels by the phase
    # difference. Skipped when infer mode was given no labels / clean phase.
    can_weight_labels = has_labels and theta_y_batch is not None
    if FLAGS.PARAM.MASK_TYPE == 'PSM':
        if can_weight_labels:
            self._y_mag_labels *= tf.cos(self._x_theta - self._y_theta)
    elif FLAGS.PARAM.MASK_TYPE == 'fixPSM':
        if can_weight_labels:
            self._y_mag_labels *= (
                1.0 + tf.cos(self._x_theta - self._y_theta)) * 0.5
    elif FLAGS.PARAM.MASK_TYPE == 'AcutePM':
        if can_weight_labels:
            self._y_mag_labels *= tf.nn.relu(
                tf.cos(self._x_theta - self._y_theta))
    elif FLAGS.PARAM.MASK_TYPE == 'IRM':
        pass
    else:
        tf.logging.error('Mask type error.')
        exit(-1)

    # region get infer spec
    # mask1 acts on the normalized magnitude, mask2 on the noisy phase.
    self._norm_y_mag_est = self._mask1 * self._norm_x_mag_spec
    self._norm_y_theta_est = self._mask2 * self._x_theta
    self._y_mag_est = rm_norm_mag_spec(self._norm_y_mag_est,
                                       FLAGS.PARAM.MAG_NORM_MAX)
    self._y_theta_est = self._norm_y_theta_est
    # endregion

    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=30)
    if behavior == self.infer:
        return

    # region get LOSS
    mag_loss_type = FLAGS.PARAM.LOSS_FUNC_FOR_MAG_SPEC
    if mag_loss_type == 'SPEC_MSE':  # log_mag and mag MSE
        self._mag_loss = loss.reduce_sum_frame_batchsize_MSE(
            self._norm_y_mag_est, self._y_mag_labels)
    elif mag_loss_type == "RELATED_MSE":
        self._mag_loss = loss.relative_reduce_sum_frame_batchsize_MSE(
            self._norm_y_mag_est, self._y_mag_labels,
            FLAGS.PARAM.RELATED_MSE_IGNORE_TH)
    elif mag_loss_type == "AUTO_RELATED_MSE":
        self._mag_loss = loss.auto_ingore_relative_reduce_sum_frame_batchsize_MSE(
            self._norm_y_mag_est, self._y_mag_labels,
            FLAGS.PARAM.AUTO_RELATED_MSE_AXIS_FIT_DEG)
    elif mag_loss_type == "AUTO_RELATED_MSE_USE_COS":
        self._mag_loss = loss.cos_auto_ingore_relative_reduce_sum_frame_batchsize_MSE(
            self._norm_y_mag_est, self._y_mag_labels,
            FLAGS.PARAM.COS_AUTO_RELATED_MSE_W)
    else:
        tf.logging.error('Magnitude_Loss type error.')
        exit(-1)

    phase_loss_type = FLAGS.PARAM.LOSS_FUNC_FOR_PHASE_SPEC
    theta_diff = self._y_theta_est - self._y_theta_labels
    if phase_loss_type == 'COS':
        self._phase_loss = tf.reduce_sum(
            tf.reduce_mean(
                tf.pow(tf.abs(1.0 - tf.cos(theta_diff)),
                       FLAGS.PARAM.PHASE_LOSS_INDEX), 1))
    elif phase_loss_type == 'MAG_WEIGHTED_COS':
        self._phase_loss = loss.magnitude_weighted_cos_deltaTheta(
            self._y_theta_est,
            self._y_theta_labels,
            self._norm_y_mag_spec,
            index_=FLAGS.PARAM.PHASE_LOSS_INDEX)
    elif phase_loss_type == 'MIXMAG_WEIGHTED_COS':
        self._phase_loss = loss.magnitude_weighted_cos_deltaTheta(
            self._y_theta_est,
            self._y_theta_labels,
            self._norm_x_mag_spec,
            index_=FLAGS.PARAM.PHASE_LOSS_INDEX)
    elif phase_loss_type == 'ABSOLUTE':
        self._phase_loss = tf.reduce_sum(
            tf.reduce_mean(
                tf.pow(tf.abs(theta_diff),
                       FLAGS.PARAM.PHASE_LOSS_INDEX), 1))
    elif phase_loss_type == 'MAG_WEIGHTED_ABSOLUTE':
        self._phase_loss = tf.reduce_sum(
            tf.reduce_mean(
                tf.pow(
                    tf.abs(theta_diff) * self._norm_y_mag_spec * 10.0,
                    FLAGS.PARAM.PHASE_LOSS_INDEX), 1))
    elif phase_loss_type == 'MIXMAG_WEIGHTED_ABSOLUTE':
        self._phase_loss = tf.reduce_sum(
            tf.reduce_mean(
                tf.pow(
                    tf.abs(theta_diff) * self._norm_x_mag_spec * 10.0,
                    FLAGS.PARAM.PHASE_LOSS_INDEX), 1))
    else:
        tf.logging.error('Phase_Loss type error.')
        exit(-1)

    self._loss = self._mag_loss + self._phase_loss
    # endregion

    if behavior == self.validation:
        # val model cannot train.
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      FLAGS.PARAM.CLIP_NORM)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Learning rate is changed from outside by feeding this placeholder.
    self._new_lr = tf.placeholder(tf.float32, shape=[],
                                  name='new_learning_rate')
    self._lr_update = tf.assign(self._lr, self._new_lr)
def build(self):
    '''
    Build the enhancement graph: placeholders, a dense layer followed by a
    stacked bidirectional LSTM and a linear output layer, the MSE loss, the
    exponentially decayed learning rate, the clipped-gradient Adam update
    and the saver.

    Reads self.rnn_size, self.rnn_num_layers, self.training, self.dropouts
    (used as the cells' output keep probability), self.dim_in, self.dim_out
    and self.lr. Sets self.x_noisy, self.y_clean, self.enhanced_outputs,
    self.loss, self.global_step, self.exp_learning_rate, self.optimizer
    (the train op) and self.saver.

    NOTE(review): the nesting of the `with` blocks was reconstructed from a
    whitespace-collapsed source; everything is assumed to live inside the
    'Neural_Network' scope -- confirm against existing checkpoints.
    '''
    with tf.variable_scope('Neural_Network'):

        def lstm_cell():
            return tf.contrib.rnn.LSTMCell(
                self.rnn_size,
                forget_bias=1.0,
                use_peepholes=True,
                initializer=tf.contrib.layers.xavier_initializer(),
                state_is_tuple=True,
                activation=tf.tanh)

        attn_cell = lstm_cell
        if self.training and self.dropouts < 1.0:
            def attn_cell():
                return tf.contrib.rnn.DropoutWrapper(
                    lstm_cell(), output_keep_prob=self.dropouts)

        with tf.variable_scope('Intputs'):
            self.x_noisy = tf.placeholder(
                tf.float32,
                shape=[None, self.dim_in[0], self.dim_in[1]],
                name='x')

        with tf.variable_scope('Outputs'):
            self.y_clean = tf.placeholder(tf.float32,
                                          shape=[None, self.dim_out],
                                          name='y_clean')

        with tf.variable_scope('DNN'):
            # Flatten each example before the dense layer.
            inputs = tf.reshape(self.x_noisy,
                                (-1, self.dim_in[0] * self.dim_in[1]))
            layer1 = tf.layers.dense(inputs=inputs,
                                     units=1024,
                                     activation=tf.nn.relu)
            # The RNN sees every example as a sequence of length 1.
            layer1 = tf.reshape(layer1, [-1, 1, 1024])

            lstm_fw_cell = tf.contrib.rnn.MultiRNNCell(
                [attn_cell() for _ in range(self.rnn_num_layers)],
                state_is_tuple=True)
            lstm_bw_cell = tf.contrib.rnn.MultiRNNCell(
                [attn_cell() for _ in range(self.rnn_num_layers)],
                state_is_tuple=True)
            lstm_fw_cell = _unpack_cell(lstm_fw_cell)
            lstm_bw_cell = _unpack_cell(lstm_bw_cell)
            layer2, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=lstm_fw_cell,
                cells_bw=lstm_bw_cell,
                inputs=layer1,
                dtype=tf.float32)
            # Forward and backward outputs are concatenated.
            layer2 = tf.reshape(layer2, [-1, 2 * self.rnn_size])

            self.enhanced_outputs = tf.layers.dense(inputs=layer2,
                                                    units=self.dim_out,
                                                    activation=None)

        with tf.name_scope('loss'):
            self.loss = tf.losses.mean_squared_error(
                self.y_clean, self.enhanced_outputs)
            tf.summary.scalar('Loss', self.loss)

        with tf.name_scope("exp_learning_rate"):
            self.global_step = tf.Variable(0, trainable=False)
            self.exp_learning_rate = tf.train.exponential_decay(
                self.lr,
                global_step=self.global_step,
                decay_steps=50000,
                decay_rate=0.8,
                staircase=False)
            tf.summary.scalar('Learning rate', self.exp_learning_rate)

        # Drive Adam with the decayed rate. Passing the constant self.lr
        # left the schedule above (and its summary) without any effect on
        # training.
        optimizer = tf.train.AdamOptimizer(self.exp_learning_rate)
        gradients, v = zip(*optimizer.compute_gradients(self.loss))
        gradients, _ = tf.clip_by_global_norm(gradients, 0.5)
        # apply_gradients increments global_step, which advances the decay.
        self.optimizer = optimizer.apply_gradients(
            zip(gradients, v), global_step=self.global_step)
        self.saver = tf.train.Saver(tf.trainable_variables(),
                                    max_to_keep=30)
def __init__(self,
             inputs_batch,
             label_batch,
             lengths_batch,
             theta_x_batch=None,
             theta_y_batch=None,
             infer=False):
    '''
    Build a BLSTM/BGRU mask-estimation graph and, unless `infer` is true or
    the current variable scope is reusing, the loss and the Adam train op.

    Args:
      inputs_batch: network input features; the mask is applied to them.
      label_batch: training targets.
      lengths_batch: per-utterance lengths, passed as `sequence_length`.
      theta_x_batch / theta_y_batch: phases, used only when
        NNET_PARAM.MASK_TYPE is 'PSIRM'.
      infer: if true, no dropout is used and nothing after the saver is
        built.

    NOTE(review): the nesting of the `with` blocks was reconstructed from a
    whitespace-collapsed source; confirm scope names against checkpoints.
    '''
    self._inputs = inputs_batch
    self._mixed = self._inputs
    self._labels = label_batch
    self._lengths = lengths_batch

    self.batch_size = tf.shape(self._lengths)[0]
    self._model_type = NNET_PARAM.MODEL_TYPE
    model_type = NNET_PARAM.MODEL_TYPE.upper()

    outputs = self._inputs

    # Wrap cells with output dropout only while training.
    use_dropout = not infer and NNET_PARAM.KEEP_PROB < 1.0

    def lstm_cell():
        return tf.contrib.rnn.LSTMCell(
            NNET_PARAM.RNN_SIZE,
            forget_bias=1.0,
            use_peepholes=True,
            num_proj=NNET_PARAM.LSTM_num_proj,
            initializer=tf.contrib.layers.xavier_initializer(),
            state_is_tuple=True,
            activation=NNET_PARAM.LSTM_ACTIVATION)

    lstm_attn_cell = lstm_cell
    if use_dropout:
        def lstm_attn_cell():
            return tf.contrib.rnn.DropoutWrapper(
                lstm_cell(), output_keep_prob=NNET_PARAM.KEEP_PROB)

    def GRU_cell():
        return tf.contrib.rnn.GRUCell(
            NNET_PARAM.RNN_SIZE,
            activation=NNET_PARAM.LSTM_ACTIVATION)

    # Default to the plain GRU cell. Defaulting to lstm_cell made a BGRU
    # model silently build LSTM cells whenever dropout was disabled or
    # `infer` was true.
    GRU_attn_cell = GRU_cell
    if use_dropout:
        def GRU_attn_cell():
            return tf.contrib.rnn.DropoutWrapper(
                GRU_cell(), output_keep_prob=NNET_PARAM.KEEP_PROB)

    if model_type == 'BLSTM':
        with tf.variable_scope('BLSTM'):
            lstm_fw_cell = [lstm_attn_cell()
                            for _ in range(NNET_PARAM.RNN_LAYER)]
            lstm_bw_cell = [lstm_attn_cell()
                            for _ in range(NNET_PARAM.RNN_LAYER)]
            outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=lstm_fw_cell,
                cells_bw=lstm_bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=self._lengths)

    if model_type == 'BGRU':
        with tf.variable_scope('BGRU'):
            gru_fw_cell = [GRU_attn_cell()
                           for _ in range(NNET_PARAM.RNN_LAYER)]
            gru_bw_cell = [GRU_attn_cell()
                           for _ in range(NNET_PARAM.RNN_LAYER)]
            outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=gru_fw_cell,
                cells_bw=gru_bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=self._lengths)

    # Width of one direction's output: the projection size only applies to
    # LSTM cells that actually project; GRU cells (and LSTM cells without
    # projection) emit RNN_SIZE values.
    rnn_out_dim = NNET_PARAM.RNN_SIZE
    if model_type == 'BLSTM' and NNET_PARAM.LSTM_num_proj is not None:
        rnn_out_dim = NNET_PARAM.LSTM_num_proj
    if model_type.startswith('B'):  # bidirection
        in_size = 2 * rnn_out_dim
    else:
        in_size = rnn_out_dim

    with tf.variable_scope('fullconnectOut'):
        outputs = tf.reshape(outputs, [-1, in_size])
        out_size = NNET_PARAM.OUTPUT_SIZE
        weights = tf.get_variable(
            'weights1', [in_size, out_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        biases = tf.get_variable(
            'biases1', [out_size],
            initializer=tf.constant_initializer(0.0))
        mask = tf.nn.relu(tf.matmul(outputs, weights) + biases)

    self._activations_t = tf.reshape(
        mask, [self.batch_size, -1, NNET_PARAM.OUTPUT_SIZE])

    # mask clip (currently a no-op: the mask is used unclipped)
    self._activations = self._activations_t

    mask_on_mag = (DATA_PARAM.FEATURE_TYPE == 'LOG_MAG'
                   and DATA_PARAM.MASK_ON_MAG_EVEN_LOGMAG)
    masked_mag = None
    if mask_on_mag:
        mag = data_tool.rmNormalization(self._mixed, eager=False)

        # norm to (0,1): multiplying a large number by a small one loses
        # precision and the mask is small, so scale the magnitude down.
        mag = tf.clip_by_value(mag, DATA_PARAM.MAG_NORM_MIN,
                               DATA_PARAM.MAG_NORM_MAX)
        mag -= DATA_PARAM.MAG_NORM_MIN
        mag /= (DATA_PARAM.MAG_NORM_MAX - DATA_PARAM.MAG_NORM_MIN)

        # add mask on magnitude spectrum
        masked_mag = self._activations * mag

        # rm mag norm
        masked_mag = masked_mag * (
            DATA_PARAM.MAG_NORM_MAX
            - DATA_PARAM.MAG_NORM_MIN) + DATA_PARAM.MAG_NORM_MIN

        # change to log_mag feature (log base 10, then min-max normalized)
        log_masked_mag = tf.log(masked_mag
                                + DATA_PARAM.LOG_BIAS) / tf.log(10.0)
        log_masked_mag = tf.clip_by_value(log_masked_mag,
                                          DATA_PARAM.LOG_NORM_MIN,
                                          DATA_PARAM.LOG_NORM_MAX)
        log_masked_mag -= DATA_PARAM.LOG_NORM_MIN
        log_masked_mag /= (DATA_PARAM.LOG_NORM_MAX
                           - DATA_PARAM.LOG_NORM_MIN)
        self._cleaned = log_masked_mag
    else:
        self._cleaned = self._activations * self._mixed

    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=30)
    if infer:
        # At inference the un-normalized masked magnitude is exposed instead
        # of the normalized log-magnitude feature.
        if mask_on_mag:
            self._cleaned = masked_mag
        return

    if NNET_PARAM.MASK_TYPE == 'PSIRM':
        self._labels *= tf.cos(theta_x_batch - theta_y_batch)
    self._loss = NNET_PARAM.LOSS_FUNC(self.cleaned, self.labels)

    # A model built in a reusing scope shares weights and is not trained.
    if tf.get_variable_scope().reuse:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      NNET_PARAM.CLIP_NORM)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Learning rate is changed from outside by feeding this placeholder.
    self._new_lr = tf.placeholder(tf.float32, shape=[],
                                  name='new_learning_rate')
    self._lr_update = tf.assign(self._lr, self._new_lr)
def __init__(self, config, infer=False):
    """Build a two-output mask-estimation graph: dense -> (B)LSTM -> two masks.

    Args:
        config: Hyper-parameter object. This constructor reads
            ``input_size``, ``output_size``, ``rnn_size``,
            ``rnn_num_layers``, ``keep_prob``, ``model_type`` and, when
            training ops are built, ``max_grad_norm``.
        infer: When true, dropout is never applied and the constructor
            returns right after the saver is created (no loss, no
            optimizer ops).

    The constructor also returns before creating optimizer ops when the
    current variable scope is in reuse mode, leaving only ``self._loss``.
    """
    model_type = config.model_type.lower()
    out_dim = config.output_size

    with tf.name_scope('placeholder'):
        self._inputs = tf.placeholder(
            tf.float32, [None, None, config.input_size], name='inputs')
        self._mixed = self._inputs
        # The label placeholder carries both targets concatenated on the
        # last axis; the two slices below split it back in half.
        self._labels = tf.placeholder(
            tf.float32, [None, None, out_dim * 2], name='labels')
        self._labels1 = tf.slice(self._labels, [0, 0, 0], [-1, -1, out_dim])
        self._labels2 = tf.slice(self._labels, [0, 0, out_dim], [-1, -1, -1])
        # Kept as float32 so existing feed_dicts keep working; it is cast
        # to int32 where it is handed to the recurrent ops.
        self._lengths = tf.placeholder(tf.float32, [None], name='lengths')

    self.batch_size = tf.shape(self._inputs)[0]
    self._model_type = config.model_type

    # First layer: feed-forward projection of every frame to rnn_size
    # before the frames are fed to the recurrent stack.
    outputs = self._inputs
    with tf.variable_scope('forward1'):
        outputs = tf.reshape(outputs, [-1, config.input_size])
        outputs = tf.layers.dense(outputs,
                                  units=config.rnn_size,
                                  activation=tf.nn.tanh,
                                  reuse=tf.get_variable_scope().reuse)
        outputs = tf.reshape(outputs,
                             [self.batch_size, -1, config.rnn_size])

    use_dropout = not infer and config.keep_prob < 1.0

    def lstm_cell():
        return tf.contrib.rnn.LSTMCell(
            config.rnn_size,
            forget_bias=1.0,
            use_peepholes=True,
            initializer=tf.contrib.layers.xavier_initializer(),
            state_is_tuple=True,
            activation=tf.tanh)

    def attn_cell():
        # Output dropout is only wrapped around the cell while training.
        cell = lstm_cell()
        if use_dropout:
            cell = tf.contrib.rnn.DropoutWrapper(
                cell, output_keep_prob=config.keep_prob)
        return cell

    def stacked_cell():
        return tf.contrib.rnn.MultiRNNCell(
            [attn_cell() for _ in range(config.rnn_num_layers)],
            state_is_tuple=True)

    if model_type == 'blstm':
        with tf.variable_scope('blstm'):
            lstm_fw_cell = _unpack_cell(stacked_cell())
            lstm_bw_cell = _unpack_cell(stacked_cell())
            outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=lstm_fw_cell,
                cells_bw=lstm_bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=tf.cast(self._lengths, tf.int32))
    elif model_type == 'lstm':
        with tf.variable_scope('lstm'):
            cell = stacked_cell()
            self._initial_state = cell.zero_state(self.batch_size,
                                                  tf.float32)
            # Same explicit int32 cast as the BLSTM branch.
            outputs, state = tf.nn.dynamic_rnn(
                cell,
                outputs,
                dtype=tf.float32,
                sequence_length=tf.cast(self._lengths, tf.int32),
                initial_state=self.initial_state)
            self._final_state = state
    else:
        # Previously this case was silent: the dense output went straight
        # to the output layer. Behaviour is unchanged, but it is now logged.
        tf.logging.warning(
            "model_type %r is neither 'blstm' nor 'lstm'; "
            "no recurrent layer is built.", config.model_type)

    # Output layer: one linear+ReLU mask head per target.
    with tf.variable_scope('forward2'):
        # The bidirectional stack concatenates forward and backward outputs.
        if model_type == 'blstm':
            in_size = 2 * config.rnn_size
        else:
            in_size = config.rnn_size
        outputs = tf.reshape(outputs, [-1, in_size])

        weights1 = tf.get_variable(
            'weights1', [in_size, out_dim],
            initializer=tf.random_normal_initializer(stddev=0.01))
        biases1 = tf.get_variable(
            'biases1', [out_dim],
            initializer=tf.constant_initializer(0.0))
        weights2 = tf.get_variable(
            'weights2', [in_size, out_dim],
            initializer=tf.random_normal_initializer(stddev=0.01))
        biases2 = tf.get_variable(
            'biases2', [out_dim],
            initializer=tf.constant_initializer(0.0))

        mask1 = tf.nn.relu(tf.matmul(outputs, weights1) + biases1)
        mask2 = tf.nn.relu(tf.matmul(outputs, weights2) + biases2)
        self._activations1 = tf.reshape(mask1,
                                        [self.batch_size, -1, out_dim])
        self._activations2 = tf.reshape(mask2,
                                        [self.batch_size, -1, out_dim])

        # The masks are applied element-wise to the raw input features.
        # NOTE(review): this presumably relies on input_size == output_size
        # (or broadcastable shapes) -- confirm against the config.
        self._cleaned1 = self._activations1 * self._mixed
        self._cleaned2 = self._activations2 * self._mixed

    # Ability to save the model.
    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=30)
    if infer:
        return

    self._loss = utt_PIT_MSE_for_LSTM(self._cleaned1, self._cleaned2,
                                      self._labels1, self._labels2)
    if tf.get_variable_scope().reuse:
        return

    # Learning rate lives in a non-trainable variable that is updated by
    # running self._lr_update with self._new_lr fed.
    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    self._new_lr = tf.placeholder(tf.float32, shape=[],
                                  name='new_learning_rate')
    self._lr_update = tf.assign(self._lr, self._new_lr)
def __init__(self, x_mag_spec_batch, lengths_batch, y_mag_spec_batch=None,
             theta_x_batch=None, theta_y_batch=None, behavior='train'):
    """Build the mask-estimation graph (BLSTM or BGRU -> dense ReLU mask).

    Args:
        x_mag_spec_batch: Input magnitude-spectrum batch.
        lengths_batch: Per-utterance lengths; its first dimension is used
            as the batch size.
        y_mag_spec_batch: Label magnitude-spectrum batch. Required unless
            ``behavior`` equals ``self.infer``.
        theta_x_batch: Input phase batch. Required unless ``behavior``
            equals ``self.infer``.
        theta_y_batch: Label phase batch. Required unless ``behavior``
            equals ``self.infer``.
        behavior: Compared against ``self.infer`` and ``self.validation``
            (the original docstring lists 'train/validation/infer').
            The infer value stops after the saver is created; the
            validation value stops after the loss; anything else also
            builds the optimizer ops.

    All hyper-parameters are read from ``FLAGS.PARAM``. An unsupported
    mask type, threshold position or loss name terminates the process via
    ``exit(-1)``, as before.
    """
    param = FLAGS.PARAM
    is_infer = behavior == self.infer

    if not is_infer:
        assert y_mag_spec_batch is not None
        assert theta_x_batch is not None
        assert theta_y_batch is not None

    self._log_bias = tf.get_variable(
        'logbias', [1],
        trainable=param.LOG_BIAS_TRAINABLE,
        initializer=tf.constant_initializer(param.INIT_LOG_BIAS))
    self._real_logbias = self._log_bias + param.MIN_LOG_BIAS

    # Input features, normalised in both the mag and the log-mag domain.
    self._x_mag_spec = x_mag_spec_batch
    self._norm_x_mag_spec = norm_mag_spec(self._x_mag_spec,
                                          param.MAG_NORM_MAX)
    self._norm_x_logmag_spec = norm_logmag_spec(self._x_mag_spec,
                                                param.MAG_NORM_MAX,
                                                self._log_bias,
                                                param.MIN_LOG_BIAS)

    # Labels are optional at inference time; the normalisation helpers are
    # only called when a label batch was actually supplied, so they never
    # receive None.
    self._y_mag_spec = y_mag_spec_batch
    if y_mag_spec_batch is not None:
        self._norm_y_mag_spec = norm_mag_spec(self._y_mag_spec,
                                              param.MAG_NORM_MAX)
        self._norm_y_logmag_spec = norm_logmag_spec(self._y_mag_spec,
                                                    param.MAG_NORM_MAX,
                                                    self._log_bias,
                                                    param.MIN_LOG_BIAS)
    else:
        self._norm_y_mag_spec = None
        self._norm_y_logmag_spec = None

    self._lengths = lengths_batch
    self._batch_size = tf.shape(self._lengths)[0]

    self._x_theta = theta_x_batch
    self._y_theta = theta_y_batch
    self._model_type = param.MODEL_TYPE

    if param.INPUT_TYPE == 'mag':
        self.net_input = self._norm_x_mag_spec
    elif param.INPUT_TYPE == 'logmag':
        self.net_input = self._norm_x_logmag_spec

    if param.LABEL_TYPE == 'mag':
        self._y_labels = self._norm_y_mag_spec
    elif param.LABEL_TYPE == 'logmag':
        self._y_labels = self._norm_y_logmag_spec

    outputs = self.net_input

    # Output dropout is only applied outside inference and when
    # KEEP_PROB < 1.
    use_dropout = not is_infer and param.KEEP_PROB < 1.0

    def _maybe_dropout(cell):
        if use_dropout:
            return tf.contrib.rnn.DropoutWrapper(
                cell, output_keep_prob=param.KEEP_PROB)
        return cell

    def lstm_attn_cell(n_units, n_proj, act):
        return _maybe_dropout(lstm_cell(n_units, n_proj, act))

    def GRU_attn_cell(n_units, act):
        return _maybe_dropout(GRU_cell(n_units, act))

    model_type = param.MODEL_TYPE.upper()

    # stack_bidirectional_dynamic_rnn takes plain per-layer cell lists, so
    # the lists are built directly instead of wrapping them in a
    # MultiRNNCell and reading its private ``_cells`` attribute back.
    if model_type == 'BLSTM':
        with tf.variable_scope('BLSTM'):
            fw_cell = [lstm_attn_cell(param.RNN_SIZE,
                                      param.LSTM_num_proj,
                                      param.LSTM_ACTIVATION)
                       for _ in range(param.RNN_LAYER)]
            bw_cell = [lstm_attn_cell(param.RNN_SIZE,
                                      param.LSTM_num_proj,
                                      param.LSTM_ACTIVATION)
                       for _ in range(param.RNN_LAYER)]
            outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=fw_cell,
                cells_bw=bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=self._lengths)
    elif model_type == 'BGRU':
        with tf.variable_scope('BGRU'):
            fw_cell = [GRU_attn_cell(param.RNN_SIZE, param.LSTM_ACTIVATION)
                       for _ in range(param.RNN_LAYER)]
            bw_cell = [GRU_attn_cell(param.RNN_SIZE, param.LSTM_ACTIVATION)
                       for _ in range(param.RNN_LAYER)]
            outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=fw_cell,
                cells_bw=bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=self._lengths)

    # region full connection get mask
    # Work out the width of the recurrent output. The comparison uses the
    # upper-cased model type so it agrees with the branch selection above
    # (a lower-case 'blstm' previously skipped the projection size).
    in_size = param.RNN_SIZE
    if model_type[0] == 'B':  # bidirectional: fw and bw are concatenated
        in_size = 2 * param.RNN_SIZE
        if model_type == 'BLSTM' and param.LSTM_num_proj is not None:
            in_size = 2 * param.LSTM_num_proj

    outputs = tf.reshape(outputs, [-1, in_size])
    out_size = param.OUTPUT_SIZE
    with tf.variable_scope('fullconnectOut'):
        weights = tf.get_variable(
            'weights1', [in_size, out_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        biases = tf.get_variable(
            'biases1', [out_size],
            initializer=tf.constant_initializer(0.0))
        mask = tf.nn.relu(tf.matmul(outputs, weights) + biases)
    self._mask = tf.reshape(mask, [self._batch_size, -1, param.OUTPUT_SIZE])
    # endregion

    # Restore the batch/time layout for the threshold function below.
    outputs = tf.reshape(outputs, [self._batch_size, -1, in_size])

    # region Apply Noise Threshold Function on Mask
    if param.THRESHOLD_FUNC is not None:
        if param.THRESHOLD_POS == param.THRESHOLD_ON_MASK:
            self._mask, self._threshold = threshold_feature(
                self._mask, outputs, self._batch_size, in_size)
        elif param.THRESHOLD_POS == param.THRESHOLD_ON_SPEC:
            pass  # handled after the estimate is formed, see below
        else:
            print('Threshold position error!')
            exit(-1)
    # endregion

    # region prepare y_estimation and y_labels
    if param.TRAINING_MASK_POSITION == 'mag':
        self._y_estimation = self._mask * self._norm_x_mag_spec
    elif param.TRAINING_MASK_POSITION == 'logmag':
        self._y_estimation = self._mask * self._norm_x_logmag_spec

    if param.MASK_TYPE == 'PSM':
        # The phase term needs labels and both phases. At inference they
        # may all be None, and ``None - None`` would raise before the
        # infer early-return below, so the scaling is skipped then.
        if (y_mag_spec_batch is not None
                and theta_x_batch is not None
                and theta_y_batch is not None):
            self._y_labels *= tf.cos(self._x_theta - self._y_theta)
    elif param.MASK_TYPE == 'IRM':
        pass
    else:
        tf.logging.error('Mask type error.')
        exit(-1)

    # region Apply Noise Threshold Function on Spec(log or mag)
    if param.THRESHOLD_FUNC is not None:
        if param.THRESHOLD_POS == param.THRESHOLD_ON_SPEC:
            self._y_estimation, self._threshold = threshold_feature(
                self._y_estimation, outputs, self._batch_size, in_size)
    # endregion

    # region get infer spec
    # A mismatch is only reported, not treated as fatal (as before).
    if param.DECODING_MASK_POSITION != param.TRAINING_MASK_POSITION:
        print('Error, DECODING_MASK_POSITION should be equal to TRAINING_MASK_POSITION when use thresohold model.')
    if param.DECODING_MASK_POSITION == 'mag':
        self._y_mag_estimation = rm_norm_mag_spec(self._y_estimation,
                                                  param.MAG_NORM_MAX)
    elif param.DECODING_MASK_POSITION == 'logmag':
        self._y_mag_estimation = rm_norm_logmag_spec(self._y_estimation,
                                                     param.MAG_NORM_MAX,
                                                     self._log_bias,
                                                     param.MIN_LOG_BIAS)
    # _y_mag_estimation is the estimated (de-normalised) mag spec;
    # _y_estimation is the loss target, normalised mag or log-mag spec.
    # endregion

    # Bring the estimate into the same domain as the labels.
    if param.TRAINING_MASK_POSITION != param.LABEL_TYPE:
        if param.LABEL_TYPE == 'mag':
            self._y_estimation = normedLogmag2normedMag(
                self._y_estimation, param.MAG_NORM_MAX,
                self._log_bias, param.MIN_LOG_BIAS)
        elif param.LABEL_TYPE == 'logmag':
            self._y_estimation = normedMag2normedLogmag(
                self._y_estimation, param.MAG_NORM_MAX,
                self._log_bias, param.MIN_LOG_BIAS)
    # endregion

    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=30)
    if is_infer:
        return

    # region get LOSS
    loss_name = param.LOSS_FUNC_FOR_MAG_SPEC
    if loss_name == 'SPEC_MSE':  # log_mag and mag MSE
        self._loss = loss.reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels)
    elif loss_name == 'MFCC_SPEC_MSE':
        self._loss1, self._loss2 = loss.balanced_MFCC_AND_SPEC_MSE(
            self._y_estimation, self._y_labels,
            self._y_mag_estimation, self._y_mag_spec)
        self._loss = (param.SPEC_LOSS_COEF * self._loss1
                      + param.MFCC_LOSS_COEF * self._loss2)
    elif loss_name == 'MEL_MAG_MSE':
        self._loss1, self._loss2 = loss.balanced_MEL_AND_SPEC_MSE(
            self._y_estimation, self._y_labels,
            self._y_mag_estimation, self._y_mag_spec)
        self._loss = (param.SPEC_LOSS_COEF * self._loss1
                      + param.MEL_LOSS_COEF * self._loss2)
    elif loss_name == "SPEC_MSE_LOWF_EN":
        # NOTE(review): uses the same loss function as 'SPEC_MSE' --
        # confirm this is intended.
        self._loss = loss.reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels)
    elif loss_name == "FAIR_SPEC_MSE":
        self._loss = loss.fair_reduce_sum_frame_batchsize_MSE(
            self._y_estimation, self._y_labels)
    elif loss_name == "SPEC_MSE_FLEXIBLE_POW_C":
        self._loss = loss.reduce_sum_frame_batchsize_MSE_EmphasizeLowerValue(
            self._y_estimation, self._y_labels, param.POW_COEF)
    else:
        print('Loss type error.')
        exit(-1)
    # endregion

    if behavior == self.validation:
        # The validation model cannot train: stop before optimizer ops.
        return

    # Learning rate lives in a non-trainable variable that is updated by
    # running self._lr_update with self._new_lr fed.
    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      param.CLIP_NORM)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    self._new_lr = tf.placeholder(tf.float32, shape=[],
                                  name='new_learning_rate')
    self._lr_update = tf.assign(self._lr, self._new_lr)
def __init__(self, x_mag_spec_batch, lengths_batch, y_mag_spec_batch=None,
             theta_x_batch=None, theta_y_batch=None, infer=False):
    """Build the mask-estimation graph (BLSTM or BGRU -> dense ReLU mask).

    Args:
        x_mag_spec_batch: Input magnitude-spectrum batch.
        lengths_batch: Per-utterance lengths; its first dimension is used
            as the batch size.
        y_mag_spec_batch: Label magnitude-spectrum batch; only read when
            ``infer`` is false.
        theta_x_batch: Input phase batch; only read when training with
            ``PARAM.MASK_TYPE == 'PSM'``.
        theta_y_batch: Label phase batch; only read when training with
            ``PARAM.MASK_TYPE == 'PSM'``.
        infer: When true, dropout is never applied, ``self._cleaned`` is
            set to the de-normalised masked spectrum and the constructor
            returns before the loss and optimizer ops are created.

    All hyper-parameters are read from ``PARAM``. An unsupported
    ``PARAM.MASK_TYPE`` logs an error and terminates the process via
    ``exit(-1)``, as before. The constructor also returns before creating
    optimizer ops when the current variable scope is in reuse mode.
    """
    self._log_bias = tf.get_variable(
        'logbias', [1],
        trainable=PARAM.LOG_BIAS_TRAINABEL,
        initializer=tf.constant_initializer(PARAM.INIT_LOG_BIAS))
    self._real_logbias = self._log_bias + DEFAULT_LOG_BIAS

    # Input features, normalised in both the mag and the log-mag domain.
    self._inputs = x_mag_spec_batch
    self._x_mag_spec = self.inputs
    self._norm_x_mag_spec = norm_mag_spec(self._x_mag_spec)
    self._norm_x_logmag_spec = norm_logmag_spec(self._x_mag_spec,
                                                self._log_bias)

    if not infer:
        self._y_mag_spec = y_mag_spec_batch
        self._norm_y_mag_spec = norm_mag_spec(self._y_mag_spec)
        self._norm_y_logmag_spec = norm_logmag_spec(self._y_mag_spec,
                                                    self._log_bias)

    self._lengths = lengths_batch
    self.batch_size = tf.shape(self._lengths)[0]
    self._model_type = PARAM.MODEL_TYPE

    if PARAM.INPUT_TYPE == 'mag':
        self.net_input = self._norm_x_mag_spec
    elif PARAM.INPUT_TYPE == 'logmag':
        self.net_input = self._norm_x_logmag_spec

    if not infer:
        if PARAM.LABEL_TYPE == 'mag':
            self._labels = self._norm_y_mag_spec
        elif PARAM.LABEL_TYPE == 'logmag':
            self._labels = self._norm_y_logmag_spec

    outputs = self.net_input

    # Output dropout is only applied while training and when KEEP_PROB < 1.
    use_dropout = not infer and PARAM.KEEP_PROB < 1.0

    def _maybe_dropout(cell):
        if use_dropout:
            return tf.contrib.rnn.DropoutWrapper(
                cell, output_keep_prob=PARAM.KEEP_PROB)
        return cell

    def lstm_cell():
        return tf.contrib.rnn.LSTMCell(
            PARAM.RNN_SIZE,
            forget_bias=1.0,
            use_peepholes=True,
            num_proj=PARAM.LSTM_num_proj,
            initializer=tf.contrib.layers.xavier_initializer(),
            state_is_tuple=True,
            activation=PARAM.LSTM_ACTIVATION)

    def lstm_attn_cell():
        return _maybe_dropout(lstm_cell())

    def GRU_cell():
        return tf.contrib.rnn.GRUCell(
            PARAM.RNN_SIZE,
            activation=PARAM.LSTM_ACTIVATION)

    def GRU_attn_cell():
        # Always a GRU cell. The no-dropout path used to alias lstm_cell,
        # so inference (or KEEP_PROB >= 1) built LSTM cells for a BGRU
        # model.
        return _maybe_dropout(GRU_cell())

    model_type = PARAM.MODEL_TYPE.upper()

    # stack_bidirectional_dynamic_rnn takes plain per-layer cell lists, so
    # the lists are built directly instead of wrapping them in a
    # MultiRNNCell and reading its private ``_cells`` attribute back.
    if model_type == 'BLSTM':
        with tf.variable_scope('BLSTM'):
            lstm_fw_cell = [lstm_attn_cell()
                            for _ in range(PARAM.RNN_LAYER)]
            lstm_bw_cell = [lstm_attn_cell()
                            for _ in range(PARAM.RNN_LAYER)]
            outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=lstm_fw_cell,
                cells_bw=lstm_bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=self._lengths)
    elif model_type == 'BGRU':
        with tf.variable_scope('BGRU'):
            gru_fw_cell = [GRU_attn_cell()
                           for _ in range(PARAM.RNN_LAYER)]
            gru_bw_cell = [GRU_attn_cell()
                           for _ in range(PARAM.RNN_LAYER)]
            outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(
                cells_fw=gru_fw_cell,
                cells_bw=gru_bw_cell,
                inputs=outputs,
                dtype=tf.float32,
                sequence_length=self._lengths)

    # Width of the recurrent output. The LSTM projection size only applies
    # to BLSTM with a projection configured; GRU cells (and un-projected
    # LSTM cells) emit RNN_SIZE units per direction.
    in_size = PARAM.RNN_SIZE
    if model_type[0] == 'B':  # bidirectional: fw and bw are concatenated
        in_size = 2 * PARAM.RNN_SIZE
        if model_type == 'BLSTM' and PARAM.LSTM_num_proj is not None:
            in_size = 2 * PARAM.LSTM_num_proj

    with tf.variable_scope('fullconnectOut'):
        outputs = tf.reshape(outputs, [-1, in_size])
        out_size = PARAM.OUTPUT_SIZE
        weights = tf.get_variable(
            'weights1', [in_size, out_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        biases = tf.get_variable(
            'biases1', [out_size],
            initializer=tf.constant_initializer(0.0))
        mask = tf.nn.relu(tf.matmul(outputs, weights) + biases)
        self._mask = tf.reshape(mask,
                                [self.batch_size, -1, PARAM.OUTPUT_SIZE])

    self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=30)

    if infer:
        # Decoding: apply the mask in the configured domain and undo the
        # normalisation.
        if PARAM.DECODING_MASK_POSITION == 'mag':
            self._cleaned = rm_norm_mag_spec(
                self._mask * self._norm_x_mag_spec)
        elif PARAM.DECODING_MASK_POSITION == 'logmag':
            self._cleaned = rm_norm_logmag_spec(
                self._mask * self._norm_x_logmag_spec, self._log_bias)
        return

    if PARAM.TRAINING_MASK_POSITION == 'mag':
        self._cleaned = self._mask * self._norm_x_mag_spec
    elif PARAM.TRAINING_MASK_POSITION == 'logmag':
        self._cleaned = self._mask * self._norm_x_logmag_spec

    if PARAM.MASK_TYPE == 'PSM':
        # Scale the labels by the cosine of the phase difference.
        self._labels *= tf.cos(theta_x_batch - theta_y_batch)
    elif PARAM.MASK_TYPE == 'IRM':
        pass
    else:
        tf.logging.error('Mask type error.')
        exit(-1)

    # Bring the estimate into the same domain as the labels.
    if PARAM.TRAINING_MASK_POSITION != PARAM.LABEL_TYPE:
        if PARAM.LABEL_TYPE == 'mag':
            self._cleaned = normedLogmag2normedMag(self._cleaned,
                                                   self._log_bias)
        elif PARAM.LABEL_TYPE == 'logmag':
            self._cleaned = normedMag2normedLogmag(self._cleaned,
                                                   self._log_bias)

    self._loss = PARAM.LOSS_FUNC(self._cleaned, self._labels)
    if tf.get_variable_scope().reuse:
        return

    # Learning rate lives in a non-trainable variable that is updated by
    # running self._lr_update with self._new_lr fed.
    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      PARAM.CLIP_NORM)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    self._new_lr = tf.placeholder(tf.float32, shape=[],
                                  name='new_learning_rate')
    self._lr_update = tf.assign(self._lr, self._new_lr)