def __init__(self, state, state_dims, action_dims, action_bound_low, action_bound_high,
             dense1_size, dense2_size, final_layer_init, scope='actor'):
    """Build the actor graph: two ReLU dense layers and a rescaled tanh output.

    Args:
        state: State input to pass through the network.
        state_dims: Dimensions of the state; their product is used as the
            fan-in of the first layer's initialiser (e.g. (3, 2) -> 6).
        action_dims: Dimensions of the action; their product is the number
            of output units.
        action_bound_low: Lower bound the tanh output is rescaled to.
        action_bound_high: Upper bound the tanh output is rescaled to.
        dense1_size: Number of units in the first dense layer.
        dense2_size: Number of units in the second dense layer.
        final_layer_init: Half-width of the uniform range used to initialise
            the output layer's weights and biases.
        scope: Variable scope the network is built in.
    """

    def fan_in_uniform(fan_in):
        # Uniform initialiser over [-1/sqrt(fan_in), 1/sqrt(fan_in)].
        return tf.random_uniform_initializer(
            -1 / tf.sqrt(tf.to_float(fan_in)),
            1 / tf.sqrt(tf.to_float(fan_in)))

    def final_uniform():
        # Uniform initialiser over [-final_layer_init, final_layer_init].
        return tf.random_uniform_initializer(-1 * final_layer_init, final_layer_init)

    self.state = state
    self.state_dims = np.prod(state_dims)
    self.action_dims = np.prod(action_dims)
    self.action_bound_low = action_bound_low
    self.action_bound_high = action_bound_high
    self.scope = scope

    with tf.variable_scope(self.scope):
        self.dense1_mul = dense(self.state, dense1_size,
                                weight_init=fan_in_uniform(self.state_dims),
                                bias_init=fan_in_uniform(self.state_dims),
                                scope='dense1')
        self.dense1 = relu(self.dense1_mul, scope='dense1')

        self.dense2_mul = dense(self.dense1, dense2_size,
                                weight_init=fan_in_uniform(dense1_size),
                                bias_init=fan_in_uniform(dense1_size),
                                scope='dense2')
        self.dense2 = relu(self.dense2_mul, scope='dense2')

        self.output_mul = dense(self.dense2, self.action_dims,
                                weight_init=final_uniform(),
                                bias_init=final_uniform(),
                                scope='output')
        self.output_tanh = tanh(self.output_mul, scope='output')

        # Map the tanh output onto [action_bound_low, action_bound_high]:
        # 0.5 * (tanh * (high - low) + (high + low)).
        stretched = tf.multiply(self.output_tanh,
                                (self.action_bound_high - self.action_bound_low))
        shifted = stretched + (self.action_bound_high + self.action_bound_low)
        self.output = tf.multiply(0.5, shifted)

        self.network_params = tf.trainable_variables(scope=self.scope)
        # This network contains no batch-norm layers, so there is nothing to list.
        self.bn_params = []
def build_bottom_block(self, inputs, name):
    """Flatten `inputs` and classify them with a small dense head.

    The head is two 512-unit ReLU layers, each followed by `ops.dropout`
    called with 0.5, then a softmax layer with `self.conf.class_num` units.

    Args:
        inputs: Tensor to flatten and classify.
        name: Prefix for the scope/name of every layer in the block.

    Returns:
        The output of the final softmax layer.
    """
    net = tf.contrib.layers.flatten(inputs, scope=name + '/flat')

    # NOTE(review): whether 0.5 is a keep probability or a drop rate depends
    # on ops.dropout, which is not visible here.
    hidden_layers = (('/dense1', '/dropout1'), ('/dense2', '/dropout2'))
    for dense_suffix, dropout_suffix in hidden_layers:
        net = ops.dense(net, 512, name + dense_suffix, activation_fn=tf.nn.relu)
        net = ops.dropout(net, 0.5, name + dropout_suffix)

    return ops.dense(net, self.conf.class_num, name + '/dense_output',
                     activation_fn=tf.nn.softmax)
def __init__(self, num_actions, state, action=None, target=None, learning_rate=None, scope='DQN'):
    """Build a convolutional Q-network with one output per action.

    Args:
        num_actions: Number of units in the output layer.
        state: Input state to pass through the network.
        action: Action for which the Q value should be predicted (only
            required for training).
        target: Target Q value (only required for training).
        learning_rate: If given, an RMSProp optimizer is created and stored
            on `self.optimizer`; otherwise no optimizer attribute is set.
        scope: Variable scope the network is built in.
    """
    self.scope = scope
    self.num_actions = num_actions
    self.input = state
    self.action = action
    self.target = target

    # The optimizer is only needed for training, so it is created on demand.
    if learning_rate is not None:
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate, momentum=0.95, epsilon=0.01)

    with tf.variable_scope(self.scope):
        # NOTE(review): the 'input_layers' scope is assumed to cover only the
        # cast and the rescaling below - confirm against the original layout.
        with tf.variable_scope('input_layers'):
            self.input_float = tf.to_float(self.input)
            # Divide by 255.0 (presumably 8-bit pixel values -> [0, 1]).
            self.input_norm = tf.divide(self.input_float, 255.0)

        # Three ReLU conv layers; the meaning of the positional arguments is
        # defined by the conv2d helper, which is not visible here.
        net = conv2d(self.input_norm, 8, 32, 4, tf.nn.relu, scope='conv1')
        self.conv1 = net
        net = conv2d(net, 4, 64, 2, tf.nn.relu, scope='conv2')
        self.conv2 = net
        net = conv2d(net, 3, 64, 1, tf.nn.relu, scope='conv3')
        self.conv3 = net

        self.flatten = flatten(self.conv3, scope='flatten')
        self.dense = dense(self.flatten, 512, tf.nn.relu, scope='dense')
        # No activation is passed for the output layer.
        self.output = dense(self.dense, self.num_actions, scope='output')

        self.network_params = tf.trainable_variables(scope=self.scope)
def __init__(self, state, action, state_dims, action_dims, dense1_size, dense2_size,
             final_layer_init, num_atoms, v_min, v_max, is_training=False, scope='critic'):
    """Build a critic that outputs a categorical distribution over `num_atoms` atoms.

    Args:
        state: State input to pass through the network.
        action: Action input for which the Z distribution should be predicted.
        state_dims: Dimensions of the state; their product is used as the
            fan-in of the first layer's initialiser (e.g. (3, 2) -> 6).
        action_dims: Dimensions of the action; their product contributes to
            the fan-in used for the second layer's initialisers.
        dense1_size: Number of units in the first dense layer.
        dense2_size: Number of units in the second (merged) dense layer.
        final_layer_init: Half-width of the uniform range used to initialise
            the logits layer.
        num_atoms: Number of atoms in the output distribution.
        v_min: Value of the first atom.
        v_max: Value of the last atom.
        is_training: Passed to every `batchnorm` call.
        scope: Variable scope the network is built in.
    """

    def fan_in_uniform(fan_in):
        # Uniform initialiser over [-1/sqrt(fan_in), 1/sqrt(fan_in)].
        return tf.random_uniform_initializer(
            -1 / tf.sqrt(tf.to_float(fan_in)),
            1 / tf.sqrt(tf.to_float(fan_in)))

    def final_uniform():
        # Uniform initialiser over [-final_layer_init, final_layer_init].
        return tf.random_uniform_initializer(-1 * final_layer_init, final_layer_init)

    self.state = state
    self.action = action
    self.state_dims = np.prod(state_dims)
    self.action_dims = np.prod(action_dims)
    self.is_training = is_training
    self.scope = scope

    # Both branches of the second layer are initialised with the same fan-in.
    merged_fan_in = dense1_size + self.action_dims

    with tf.variable_scope(self.scope):
        self.input_norm = batchnorm(self.state, self.is_training, scope='input_norm')

        self.dense1_mul = dense(self.input_norm, dense1_size,
                                weight_init=fan_in_uniform(self.state_dims),
                                bias_init=fan_in_uniform(self.state_dims),
                                scope='dense1')
        self.dense1_bn = batchnorm(self.dense1_mul, self.is_training, scope='dense1')
        self.dense1 = relu(self.dense1_bn, scope='dense1')

        # The action enters at the second layer: the state branch and the
        # action branch are projected separately and summed before the ReLU.
        self.dense2a = dense(self.dense1, dense2_size,
                             weight_init=fan_in_uniform(merged_fan_in),
                             bias_init=fan_in_uniform(merged_fan_in),
                             scope='dense2a')
        self.dense2b = dense(self.action, dense2_size,
                             weight_init=fan_in_uniform(merged_fan_in),
                             bias_init=fan_in_uniform(merged_fan_in),
                             scope='dense2b')
        self.dense2 = relu(self.dense2a + self.dense2b, scope='dense2')

        self.output_logits = dense(self.dense2, num_atoms,
                                   weight_init=final_uniform(),
                                   bias_init=final_uniform(),
                                   scope='output_logits')
        self.output_probs = softmax(self.output_logits, scope='output_probs')

        self.network_params = tf.trainable_variables(scope=self.scope)
        # Batch-norm moving statistics are picked out by name.
        self.bn_params = [var for var in tf.global_variables(scope=self.scope)
                          if 'batch_normalization/moving' in var.name]

        # Atom values, evenly spaced from v_min to v_max.
        self.z_atoms = tf.lin_space(v_min, v_max, num_atoms)

        # Probability-weighted sum of the atom values (the mean of the
        # categorical distribution).
        # NOTE(review): reduce_sum has no axis, so this also sums over any
        # batch dimension and yields a scalar - confirm this is intended.
        self.Q_val = tf.reduce_sum(self.z_atoms * self.output_probs)

        # Gradient of the output probabilities wrt the action input, with the
        # atom values as the initial gradients (grad_ys); used to train the
        # actor network.
        self.action_grads = tf.gradients(self.output_probs, self.action, self.z_atoms)
def inference(self, matrix, outs):
    """Run the graph-conv/pool stack and classify the pooled features.

    One `ops.simple_conv` is applied first, then `self.conf.l_num` rounds of
    `ops.simple_conv` followed by `ops.graph_pool`. The output of each
    round's conv is max-reduced over axis 1, the results are concatenated
    along axis 1 and passed through two `ops.dense` layers.

    Args:
        matrix: First argument to `ops.simple_conv` / `ops.graph_pool`
            (presumably the graph adjacency matrix - confirm).
        outs: Input features.

    Returns:
        Output of the final `ops.dense` layer (`self.conf.class_num` units).
    """
    conf = self.conf
    outs = ops.simple_conv(matrix, outs, 4 * conf.ch_num, conf.rate, 'conv0', 3)

    level_features = []
    for level in range(conf.l_num):
        # Channel multiplier halves at each level: 4, 2, 1, then 0 from level 3 on.
        width = (4 // 2**level) * conf.ch_num
        outs = ops.simple_conv(matrix, outs, width, conf.rate, 'conv1_%s' % level, 3)
        level_features.append(outs)
        matrix, outs = ops.graph_pool(matrix, outs, 2, 'pool_%s' % level)

    # Reduce every level's pre-pool features over axis 1.
    pooled = [
        tf.reduce_max(features, axis=1, name='max_pool_%s' % index)
        for index, features in enumerate(level_features)
    ]

    merged = tf.concat(pooled, axis=1)
    hidden = ops.dense(merged, 1024, conf.rate, 'dense1')
    return ops.dense(hidden, conf.class_num, conf.rate, 'dense2')
def forward_pass(self, state_in, reshape=True, sigmoid_out=False, reuse=None):
    """Pass `state_in` through a stack of dense layers and return the output.

    Args:
        state_in: Input tensor; its static shape is read at graph-build time.
        reshape: If True, the input and the third layer's output are reshaped
            to [-1, d1, d3, d2] (d = static dims of the tensor being
            reshaped), and the final output is reshaped to [-1, d1] using the
            third layer's pre-reshape dims.
        sigmoid_out: If True, a sigmoid is applied to the output.
        reuse: Forwarded to `tf.variable_scope`.

    Returns:
        The output tensor (also stored on `self.output`).
    """

    def swapped_reshape(tensor, dims):
        # Collapse the leading dimension and swap static dims 2 and 3.
        # NOTE(review): an earlier comment described this as flattening
        # [batch, traj_len, H, W, C] into [batch*traj_len, H, W, C], which
        # does not match the indices used - confirm the intended layout.
        return tf.reshape(tensor, [-1, dims[1], dims[3], dims[2]])

    self.state_in = state_in
    input_shape = self.state_in.get_shape().as_list()
    # Number of input channels; not used further in this method.
    channels_in = input_shape[-1]

    with tf.variable_scope(self.scope, reuse=reuse):
        if reshape:
            self.state_in = swapped_reshape(self.state_in, input_shape)

        self.layer1 = dense(self.state_in, 16, scope='layer1')
        self.layer2 = dense(self.layer1, 8, scope='layer2')
        self.layer3 = dense(self.layer2, 1, scope='layer3')

        if reshape:
            # Static shape of layer3 *before* it is reshaped; it also drives
            # the final output reshape below.
            layer3_shape = self.layer3.get_shape().as_list()
            self.layer3 = swapped_reshape(self.layer3, layer3_shape)

        result = dense(self.layer3, 1, scope='output')
        if sigmoid_out:
            result = tf.nn.sigmoid(result)
        if reshape:
            # Regroup the output into rows of length layer3_shape[1].
            result = tf.reshape(result, [-1, layer3_shape[1]])
        self.output = result

        self.network_params = tf.trainable_variables(scope=self.scope)

    return self.output
def build_discriminator(self, input, channels=3, ndf=64, norm_type='batch', init_type='normal',
                        init_gain=0.02, is_training=True):
    """SRGAN Discriminator.

    Eight 3x3 LeakyReLU conv blocks (strides alternating 1 and 2, widths
    growing from `ndf` to `8 * ndf`), followed by a 1024-unit LeakyReLU dense
    layer and a single-unit sigmoid output.

    Args:
        input: Input tensor to discriminate.
        channels: Number of input channels given to the first conv block.
        ndf: Base number of filters.
        norm_type: Normalisation for conv blocks 2-8 (the first conv block
            and both dense layers use none).
        init_type: Weight initialisation type forwarded to `ops`.
        init_gain: Weight initialisation gain forwarded to `ops`.
        is_training: Forwarded to every layer.

    Returns:
        Output of the final sigmoid dense layer.
    """
    # (scope, in_channels, out_channels, stride, norm_type) per conv block.
    conv_specs = (
        ('conv1', channels, ndf, 1, None),
        ('conv2', ndf, ndf, 2, norm_type),
        ('conv3', ndf, 2 * ndf, 1, norm_type),
        ('conv4', 2 * ndf, 2 * ndf, 2, norm_type),
        ('conv5', 2 * ndf, 4 * ndf, 1, norm_type),
        ('conv6', 4 * ndf, 4 * ndf, 2, norm_type),
        ('conv7', 4 * ndf, 8 * ndf, 1, norm_type),
        ('conv8', 8 * ndf, 8 * ndf, 2, norm_type),
    )

    features = input
    for block_scope, n_in, n_out, block_stride, block_norm in conv_specs:
        features = ops.conv(features, in_channels=n_in, out_channels=n_out, filter_size=3,
                            stride=block_stride, weight_init_type=init_type,
                            weight_init_gain=init_gain, norm_type=block_norm,
                            activation_type='LeakyReLU', is_training=is_training,
                            scope=block_scope, reuse=self.reuse)

    flat = ops.flatten(features)
    # The dense layer's input width is read from the flattened static shape.
    hidden = ops.dense(flat, in_size=flat.get_shape().as_list()[1], out_size=1024,
                       weight_init_type=init_type, weight_init_gain=init_gain, norm_type=None,
                       activation_type='LeakyReLU', is_training=is_training, scope='dense',
                       reuse=self.reuse)
    return ops.dense(hidden, in_size=1024, out_size=1, weight_init_type=init_type,
                     weight_init_gain=init_gain, norm_type=None, activation_type='sigmoid',
                     is_training=is_training, scope='output', reuse=self.reuse)
def __init__(self, state, action, noise, state_dims, action_dims, noise_dims, dense1_size,
             dense2_size, final_layer_init, num_atoms, v_min, v_max, is_training=False,
             scope='critic', batch_size=256):
    """Build a sample-based critic: `num_atoms` return samples per (state, action).

    The state and action branches are projected once per batch row, the
    noise branch once per sample, and the three are summed (the first two
    broadcasting over the sample axis) before a ReLU and a single-unit
    output layer. The output is not squashed to [v_min, v_max].

    Args:
        state: State input to pass through the network.
        action: Action input for which the Z distribution should be predicted.
        noise: Noise input; reshaped to [batch * num_atoms, prod(noise_dims)].
        state_dims: Dimensions of the state; their product is used as the
            fan-in of the first layer's initialiser (e.g. (3, 2) -> 6).
        action_dims: Dimensions of the action; their product contributes to
            the fan-in of the state/action branch initialisers.
        noise_dims: Dimensions of the noise (int or sequence); their product
            is the width of each noise row.
        dense1_size: Number of units in the first dense layer.
        dense2_size: Number of units in the second (merged) dense layer.
        final_layer_init: Half-width of the uniform range used to initialise
            the output layer.
        num_atoms: Number of samples generated per batch row.
        v_min: Stored on `self.v_min`; not otherwise used here.
        v_max: Stored on `self.v_max`; not otherwise used here.
        is_training: Stored on `self.is_training`; not otherwise used here.
        scope: Variable scope the network is built in.
        batch_size: Number of rows per batch used in the reshapes. Defaults
            to 256 (the value previously hard-coded). Pass None to let
            `tf.reshape` infer it, so the same graph accepts any batch size.
    """

    def fan_in_uniform(fan_in):
        # Uniform initialiser over [-1/sqrt(fan_in), 1/sqrt(fan_in)].
        return tf.random_uniform_initializer(
            -1 / tf.sqrt(tf.to_float(fan_in)),
            1 / tf.sqrt(tf.to_float(fan_in)))

    def final_uniform():
        # Uniform initialiser over [-final_layer_init, final_layer_init].
        return tf.random_uniform_initializer(-1 * final_layer_init, final_layer_init)

    self.state = state
    self.action = action
    self.noise = noise
    self.noise_dims = np.prod(noise_dims)
    self.state_dims = np.prod(state_dims)
    self.action_dims = np.prod(action_dims)
    self.v_min = v_min
    self.v_max = v_max
    self.num_atoms = num_atoms
    self.is_training = is_training
    self.scope = scope

    # Leading dimensions for the reshapes; -1 lets TensorFlow infer them.
    if batch_size is None:
        batch_rows = -1
        sample_rows = -1
    else:
        batch_rows = batch_size
        sample_rows = batch_size * num_atoms

    # Use the flattened noise width so a tuple-valued noise_dims also works.
    noise_width = int(self.noise_dims)

    action_fan_in = dense1_size + self.action_dims
    noise_fan_in = dense1_size + self.noise_dims

    with tf.variable_scope(self.scope):
        self.dense1_mul = dense(self.state, dense1_size,
                                weight_init=fan_in_uniform(self.state_dims),
                                bias_init=fan_in_uniform(self.state_dims),
                                scope='dense1')
        self.dense1 = relu(self.dense1_mul, scope='dense1')

        # Merge the first dense layer with the action and noise inputs to get
        # the second dense layer. The state/action branches get a singleton
        # sample axis so they broadcast against the per-sample noise branch.
        self.dense2a = dense(self.dense1, dense2_size,
                             weight_init=fan_in_uniform(action_fan_in),
                             bias_init=fan_in_uniform(action_fan_in),
                             scope='dense2a')
        self.dense2a = tf.reshape(self.dense2a, [batch_rows, 1, dense2_size])

        self.dense2b = dense(self.action, dense2_size,
                             weight_init=fan_in_uniform(action_fan_in),
                             bias_init=fan_in_uniform(action_fan_in),
                             scope='dense2b')
        self.dense2b = tf.reshape(self.dense2b, [batch_rows, 1, dense2_size])

        # TODO (carried over from the original): decide whether this noise
        # reshape needs to be modified.
        self.noise = tf.reshape(self.noise, [sample_rows, noise_width])
        self.dense2c = dense(self.noise, dense2_size,
                             weight_init=fan_in_uniform(noise_fan_in),
                             bias_init=fan_in_uniform(noise_fan_in),
                             scope='dense2c')
        self.dense2c = tf.reshape(self.dense2c, [batch_rows, num_atoms, dense2_size])

        self.dense2 = relu(self.dense2a + self.dense2b + self.dense2c, scope='dense2')
        # One row per generated sample for the output layer.
        self.dense2 = tf.reshape(self.dense2, [sample_rows, dense2_size])

        self.output_mul = dense(self.dense2, 1,
                                weight_init=final_uniform(),
                                bias_init=final_uniform(),
                                scope='output')

        # The raw linear output is used as the sample value (no tanh squashing).
        self.output_samples = tf.reshape(self.output_mul, [batch_rows, num_atoms])

        self.network_params = tf.trainable_variables(scope=self.scope)
        self.bn_params = [var for var in tf.global_variables(scope=self.scope)
                          if 'batch_normalization/moving' in var.name]

        self.CVaR_value = CVaR_sample(self.output_samples, train_params.CVaR_alpha,
                                      train_params.CVaR_optimizing)
        # The Q value is the mean of the generated samples.
        self.Q_val = tf.reduce_mean(self.output_samples, axis=1)

        # TODO add utility function HERE!
        # Gradient of the CVaR value wrt the action input - used to train the
        # actor network.
        self.action_grads = tf.gradients(self.CVaR_value, self.action)
def __init__(self, state, action, noise, state_dims, action_dims, noise_dims, dense1_size,
             dense2_size, final_layer_init, num_atoms, v_min, v_max, scope='critic',
             batch_size=256):
    """Build a sample-based critic whose samples are squashed into [v_min, v_max].

    The state and action branches are projected once per batch row, the
    noise branch once per sample, and the three are summed (the first two
    broadcasting over the sample axis) before a ReLU and a single-unit
    output layer. The output goes through tanh and is rescaled to
    [v_min, v_max].

    Args:
        state: State input to pass through the network.
        action: Action input for which the Z distribution should be predicted.
        noise: Noise input; reshaped to [batch * num_atoms, prod(noise_dims)].
        state_dims: Dimensions of the state; their product is used as the
            fan-in of the first layer's initialiser (e.g. (3, 2) -> 6).
        action_dims: Dimensions of the action; their product contributes to
            the fan-in of the state/action branch initialisers.
        noise_dims: Dimensions of the noise (int or sequence); their product
            is the width of each noise row.
        dense1_size: Number of units in the first dense layer.
        dense2_size: Number of units in the second (merged) dense layer.
        final_layer_init: Half-width of the uniform range used to initialise
            the output layer.
        num_atoms: Number of samples generated per batch row.
        v_min: Lower bound the tanh output is rescaled to.
        v_max: Upper bound the tanh output is rescaled to.
        scope: Variable scope the network is built in.
        batch_size: Number of rows per batch used in the reshapes. Defaults
            to 256 (the value previously hard-coded). Pass None to let
            `tf.reshape` infer it, so the same graph accepts any batch size.
    """

    def fan_in_uniform(fan_in):
        # Uniform initialiser over [-1/sqrt(fan_in), 1/sqrt(fan_in)].
        return tf.random_uniform_initializer(
            -1 / tf.sqrt(tf.to_float(fan_in)),
            1 / tf.sqrt(tf.to_float(fan_in)))

    def final_uniform():
        # Uniform initialiser over [-final_layer_init, final_layer_init].
        return tf.random_uniform_initializer(-1 * final_layer_init, final_layer_init)

    self.state = state
    self.action = action
    self.noise = noise
    self.noise_dims = np.prod(noise_dims)
    self.state_dims = np.prod(state_dims)
    self.action_dims = np.prod(action_dims)
    self.v_min = v_min
    self.v_max = v_max
    self.scope = scope

    # Leading dimensions for the reshapes; -1 lets TensorFlow infer them.
    if batch_size is None:
        batch_rows = -1
        sample_rows = -1
    else:
        batch_rows = batch_size
        sample_rows = batch_size * num_atoms

    # Use the flattened noise width so a tuple-valued noise_dims also works.
    noise_width = int(self.noise_dims)

    action_fan_in = dense1_size + self.action_dims
    noise_fan_in = dense1_size + self.noise_dims

    with tf.variable_scope(self.scope):
        self.dense1_mul = dense(self.state, dense1_size,
                                weight_init=fan_in_uniform(self.state_dims),
                                bias_init=fan_in_uniform(self.state_dims),
                                scope='dense1')
        self.dense1 = relu(self.dense1_mul, scope='dense1')

        # Merge the first dense layer with the action and noise inputs to get
        # the second dense layer. The state/action branches get a singleton
        # sample axis so they broadcast against the per-sample noise branch.
        self.dense2a = dense(self.dense1, dense2_size,
                             weight_init=fan_in_uniform(action_fan_in),
                             bias_init=fan_in_uniform(action_fan_in),
                             scope='dense2a')
        self.dense2a = tf.reshape(self.dense2a, [batch_rows, 1, dense2_size])

        self.dense2b = dense(self.action, dense2_size,
                             weight_init=fan_in_uniform(action_fan_in),
                             bias_init=fan_in_uniform(action_fan_in),
                             scope='dense2b')
        self.dense2b = tf.reshape(self.dense2b, [batch_rows, 1, dense2_size])

        self.noise = tf.reshape(self.noise, [sample_rows, noise_width])
        self.dense2c = dense(self.noise, dense2_size,
                             weight_init=fan_in_uniform(noise_fan_in),
                             bias_init=fan_in_uniform(noise_fan_in),
                             scope='dense2c')
        self.dense2c = tf.reshape(self.dense2c, [batch_rows, num_atoms, dense2_size])

        self.dense2 = relu(self.dense2a + self.dense2b + self.dense2c, scope='dense2')
        # One row per generated sample for the output layer.
        self.dense2 = tf.reshape(self.dense2, [sample_rows, dense2_size])

        self.output_mul = dense(self.dense2, 1,
                                weight_init=final_uniform(),
                                bias_init=final_uniform(),
                                scope='output')
        # The pre-activation is scaled by 0.2 before the tanh.
        self.output_tanh = tanh(0.2 * self.output_mul, scope='output')

        # Map the tanh output onto [v_min, v_max]:
        # 0.5 * (tanh * (v_max - v_min) + (v_max + v_min)).
        self.output_samples = tf.multiply(
            0.5,
            tf.multiply(self.output_tanh, (self.v_max - self.v_min)) + (self.v_max + self.v_min))
        self.output_samples = tf.reshape(self.output_samples, [batch_rows, num_atoms])

        self.network_params = tf.trainable_variables(scope=self.scope)
        # This network contains no batch-norm layers, so there is nothing to list.
        self.bn_params = []

        # The Q value is the mean of the generated samples.
        self.Q_val = tf.reduce_mean(self.output_samples, axis=1)

        # Gradient of the samples (each weighted by 1/num_atoms) wrt the
        # action input - used to train the actor network.
        self.action_grads = tf.gradients(self.output_samples / num_atoms, self.action)
def __init__(self, state, action, state_dims, action_dims, args, is_training=False, scope='critic'):
    """Build a critic that predicts a single Q value for a (state, action) pair.

    Args:
        state: State input to pass through the network.
        action: Action input for which the Q value should be predicted.
        state_dims: Dimensions of the state; their product is used as the
            fan-in of the first layer's initialiser (e.g. (3, 2) -> 6).
        action_dims: Dimensions of the action; their product contributes to
            the fan-in used for the second layer's initialisers.
        args: Settings object; `dense1_size`, `dense2_size` and
            `final_layer_init` are read from it.
        is_training: Passed to every `batchnorm` call.
        scope: Variable scope the network is built in.
    """

    def fan_in_uniform(fan_in):
        # Uniform initialiser over [-1/sqrt(fan_in), 1/sqrt(fan_in)].
        return tf.random_uniform_initializer(
            -1 / tf.sqrt(tf.to_float(fan_in)),
            1 / tf.sqrt(tf.to_float(fan_in)))

    self.state = state
    self.action = action
    self.state_dims = np.prod(state_dims)
    self.action_dims = np.prod(action_dims)
    self.args = args
    self.is_training = is_training
    self.scope = scope

    # Network params
    dense1_size = self.args.dense1_size
    dense2_size = self.args.dense2_size
    final_layer_init = self.args.final_layer_init

    def final_uniform():
        # Uniform initialiser over [-final_layer_init, final_layer_init].
        return tf.random_uniform_initializer(-1 * final_layer_init, final_layer_init)

    # Both branches of the second layer are initialised with the same fan-in.
    merged_fan_in = dense1_size + self.action_dims

    with tf.variable_scope(self.scope):
        self.input_norm = batchnorm(self.state, self.is_training, scope='input_norm')

        self.dense1_mul = dense(self.input_norm, dense1_size,
                                weight_init=fan_in_uniform(self.state_dims),
                                bias_init=fan_in_uniform(self.state_dims),
                                scope='dense1')
        self.dense1_bn = batchnorm(self.dense1_mul, self.is_training, scope='dense1')
        self.dense1 = relu(self.dense1_bn, scope='dense1')

        # The action enters at the second layer: the state branch and the
        # action branch are projected separately and summed before the ReLU.
        self.dense2a = dense(self.dense1, dense2_size,
                             weight_init=fan_in_uniform(merged_fan_in),
                             bias_init=fan_in_uniform(merged_fan_in),
                             scope='dense2a')
        self.dense2b = dense(self.action, dense2_size,
                             weight_init=fan_in_uniform(merged_fan_in),
                             bias_init=fan_in_uniform(merged_fan_in),
                             scope='dense2b')
        self.dense2 = relu(self.dense2a + self.dense2b, scope='dense2')

        self.output = dense(self.dense2, 1,
                            weight_init=final_uniform(),
                            bias_init=final_uniform(),
                            scope='output')

        self.network_params = tf.trainable_variables(scope=self.scope)

        # Gradient of the value output wrt the action input - used to train
        # the actor network.
        self.action_grads = tf.gradients(self.output, self.action)
def build_bottom_block(self, inputs, name):
    """Flatten `inputs` and apply two `ops.dense` layers.

    The first layer has 4096 units and the second `self.conf.class_num`
    units; any activation is whatever `ops.dense` applies by default.

    Args:
        inputs: Tensor to flatten and classify.
        name: Prefix for the scope/name of every layer in the block.

    Returns:
        The output of the second dense layer.
    """
    flat = tf.contrib.layers.flatten(inputs, scope=name + '/flat')
    hidden = ops.dense(flat, 4096, name + '/dense1')
    return ops.dense(hidden, self.conf.class_num, name + '/dense2')
def forward_pass(self, state_in, reshape=True, sigmoid_out=False, reuse=None):
    """Pass `state_in` through four conv layers and two dense layers.

    Args:
        state_in: Input tensor; its static shape is read at graph-build time.
        reshape: If True, the input is reshaped to [-1, d2, d3, d4] before
            the convolutions (d = static dims of the input) and the output is
            reshaped to [-1, d1] afterwards.
        sigmoid_out: If True, a sigmoid is applied to the output.
        reuse: Forwarded to `tf.variable_scope`.

    Returns:
        The output tensor (also stored on `self.output`).
    """

    def uniform_init(fan_in):
        # Uniform initialiser over [-1/sqrt(fan_in), 1/sqrt(fan_in)].
        return tf.random_uniform_initializer(
            (-1.0 / tf.sqrt(float(fan_in))),
            (1.0 / tf.sqrt(float(fan_in))))

    def conv_block(inputs, index, in_channels, block_scope):
        # One conv2d + leaky-ReLU layer; the initialiser fan-in is
        # in_channels * kernel * kernel.
        kernel = self.kernels[index]
        fan_in = in_channels * kernel * kernel
        conv = conv2d(inputs, self.num_filters, kernel, self.strides[index],
                      kernel_init=uniform_init(fan_in),
                      bias_init=uniform_init(fan_in),
                      scope=block_scope)
        return lrelu(conv, self.lrelu_alpha, scope=block_scope)

    self.state_in = state_in
    shape_in = self.state_in.get_shape().as_list()
    # Number of input channels, used for the first layer's weight/bias init.
    channels_in = shape_in[-1]

    with tf.variable_scope(self.scope, reuse=reuse):
        if reshape:
            # Fold the first two dims together, e.g.
            # [batch_size, traj_len, H, W, C] -> [batch_size*traj_len, H, W, C].
            self.state_in = tf.reshape(self.state_in,
                                       [-1, shape_in[2], shape_in[3], shape_in[4]])

        self.conv1 = conv_block(self.state_in, 0, channels_in, 'conv1')
        self.conv2 = conv_block(self.conv1, 1, self.num_filters, 'conv2')
        self.conv3 = conv_block(self.conv2, 2, self.num_filters, 'conv3')
        self.conv4 = conv_block(self.conv3, 3, self.num_filters, 'conv4')

        self.flatten = flatten(self.conv4)

        # This dense layer is created without an explicit scope and its
        # initialiser uses num_filters as the fan-in.
        self.dense = dense(self.flatten, self.dense_size,
                           kernel_init=uniform_init(self.num_filters),
                           bias_init=uniform_init(self.num_filters))

        result = dense(self.dense, 1,
                       kernel_init=uniform_init(self.dense_size),
                       bias_init=uniform_init(self.dense_size),
                       scope='output')
        if sigmoid_out:
            result = tf.nn.sigmoid(result)
        if reshape:
            # Regroup the flat output into rows of length shape_in[1], e.g.
            # [batch_size*traj_len] -> [batch_size, traj_len].
            result = tf.reshape(result, [-1, shape_in[1]])
        self.output = result

        self.network_params = tf.trainable_variables(scope=self.scope)

    return self.output
def __init__(self, state, audio, state_dims, action_dims, action_bound_low, action_bound_high,
             args, is_training=False, scope='actor'):
    """Build an actor that combines CNN features of the state with audio features.

    The audio input goes through two 50-unit ReLU layers, is concatenated
    (axis 1) with the output of `createNetwork_cnn(state)`, batch-normalised
    and passed through two batch-normalised ReLU dense layers and a tanh
    output rescaled to the action bounds.

    Args:
        state: State input, passed to `createNetwork_cnn`.
        audio: Audio input, passed through two `tf.layers.dense` layers.
        state_dims: Dimensions of the state; their product is used as the
            fan-in of the first dense layer's initialiser.
        action_dims: Dimensions of the action; their product is the number
            of output units.
        action_bound_low: Lower bound the tanh output is rescaled to.
        action_bound_high: Upper bound the tanh output is rescaled to.
        args: Settings object; `dense1_size`, `dense2_size` and
            `final_layer_init` are read from it.
        is_training: Passed to every `batchnorm` call.
        scope: Variable scope the network is built in.
    """

    def fan_in_uniform(fan_in):
        # Uniform initialiser over [-1/sqrt(fan_in), 1/sqrt(fan_in)].
        return tf.random_uniform_initializer(
            -1 / tf.sqrt(tf.to_float(fan_in)),
            1 / tf.sqrt(tf.to_float(fan_in)))

    self.state = state
    self.audio = audio
    self.state_dims = np.prod(state_dims)
    self.action_dims = np.prod(action_dims)
    self.action_bound_low = action_bound_low
    self.action_bound_high = action_bound_high
    self.args = args
    self.is_training = is_training
    self.scope = scope

    # Network params
    dense1_size = self.args.dense1_size
    dense2_size = self.args.dense2_size
    final_layer_init = self.args.final_layer_init

    def final_uniform():
        # Uniform initialiser over [-final_layer_init, final_layer_init].
        return tf.random_uniform_initializer(-1 * final_layer_init, final_layer_init)

    with tf.variable_scope(self.scope):
        # Audio branch: two 50-unit ReLU layers.
        self.fc1_a = tf.layers.dense(self.audio, 50, tf.nn.relu)
        self.fc2_a = tf.layers.dense(self.fc1_a, 50, tf.nn.relu)

        # Join the CNN features of the state with the audio features.
        visual_features = createNetwork_cnn(self.state)
        self.final_flat = tf.concat([visual_features, self.fc2_a], 1)

        self.input_norm = batchnorm(self.final_flat, self.is_training, scope='input_norm')

        # NOTE(review): the fan-in below is prod(state_dims), while the layer's
        # actual input is the concatenated feature vector - confirm that this
        # initialiser range is intended.
        self.dense1_mul = dense(self.input_norm, dense1_size,
                                weight_init=fan_in_uniform(self.state_dims),
                                bias_init=fan_in_uniform(self.state_dims),
                                scope='dense1')
        self.dense1_bn = batchnorm(self.dense1_mul, self.is_training, scope='dense1')
        self.dense1 = relu(self.dense1_bn, scope='dense1')

        self.dense2_mul = dense(self.dense1, dense2_size,
                                weight_init=fan_in_uniform(dense1_size),
                                bias_init=fan_in_uniform(dense1_size),
                                scope='dense2')
        self.dense2_bn = batchnorm(self.dense2_mul, self.is_training, scope='dense2')
        self.dense2 = relu(self.dense2_bn, scope='dense2')

        self.output_mul = dense(self.dense2, self.action_dims,
                                weight_init=final_uniform(),
                                bias_init=final_uniform(),
                                scope='output')
        self.output_tanh = tanh(self.output_mul, scope='output')

        # Map the tanh output onto [action_bound_low, action_bound_high]:
        # 0.5 * (tanh * (high - low) + (high + low)).
        stretched = tf.multiply(self.output_tanh,
                                (self.action_bound_high - self.action_bound_low))
        shifted = stretched + (self.action_bound_high + self.action_bound_low)
        self.output = tf.multiply(0.5, shifted)

        self.network_params = tf.trainable_variables(scope=self.scope)