def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **kwargs):
    """Actor-critic policy: shared 128-unit trunk, then separate pi/vf branches,
    with a sigmoid-activated policy head over the discrete action logits.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) the observation space of the environment
    :param ac_space: (Gym Space) the (discrete) action space; `ac_space.n` heads
    :param n_env: (int) number of environments run in parallel
    :param n_steps: (int) number of steps per environment per update
    :param n_batch: (int) batch size (n_env * n_steps)
    :param reuse: (bool) whether to reuse the "model" variable scope
    """
    super(CustomPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                       reuse=reuse, scale=True)
    with tf.compat.v1.variable_scope("model", reuse=reuse):
        # Shared 128-unit layer, then vf: [156, 156] and pi: [128] branches.
        pi_latent2, vf_latent2 = mlp_extractor(
            self.processed_obs,
            net_arch=[128, dict(vf=[156, 156], pi=[128])],
            act_fun=tf.nn.relu, **kwargs)
        # BUG FIX: tf.compat.v1.layers.dense requires a callable activation;
        # the Keras string shorthand 'sigmoid' raises a TypeError here.
        actionSpace = tf.compat.v1.layers.dense(pi_latent2, ac_space.n,
                                                activation=tf.nn.sigmoid, name='pf')
        value_fn = tf.compat.v1.layers.dense(vf_latent2, 1, name='vf')
        vf_latent = vf_latent2
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(actionSpace, vf_latent,
                                                       init_scale=0.01)
    self._value_fn = value_fn
    self._setup_init()
def __init__(self, tf_session, ob_space, ac_space, num_env, num_steps, num_batch,
             activation_func=tf.nn.tanh, reuse=False, **kwargs):
    """MLP actor-critic policy with three 256-unit hidden layers in both the
    policy and the value branch.

    :param tf_session: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) the observation space of the environment
    :param ac_space: (Gym Space) the action space of the environment
    :param num_env: (int) number of parallel environments
    :param num_steps: (int) number of steps per environment per update
    :param num_batch: (int) batch size
    :param activation_func: (callable) activation used in the hidden layers
    :param reuse: (bool) whether to reuse the "model" variable scope
    """
    super(SafePolicy, self).__init__(tf_session, ob_space, ac_space, num_env,
                                     num_steps, num_batch, reuse=reuse)
    hidden_sizes = [256, 256, 256]
    architecture = [dict(vf=hidden_sizes, pi=hidden_sizes)]
    with tf.variable_scope("model", reuse=reuse):
        flat_obs = tf.layers.flatten(self.processed_obs)
        latent_pi, latent_vf = mlp_extractor(flat_obs, architecture, activation_func)
        self._value_fn = linear(latent_vf, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(latent_pi, latent_vf,
                                                       init_scale=0.01)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
             act_norm_init=None, obs_norm_init=None, net_arch=None, reuse=False,
             act_fun=tf.tanh):
    """Gaussian MLP policy with optional running observation/action normalizers.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) observation space; `ob_space.shape[0]` sizes the obs normalizer
    :param ac_space: (Gym Space) action space; `ac_space.shape[0]` sizes the act normalizer
    :param n_env: (int) number of parallel environments
    :param n_steps: (int) steps per environment per update
    :param n_batch: (int) batch size
    :param act_norm_init: (dict or None) kwargs for the action TFNormalizer; None disables it
    :param obs_norm_init: (dict or None) kwargs for the observation TFNormalizer; None disables it
    :param net_arch: (list or None) network architecture; defaults to two 64-unit layers
        shared shape for both pi and vf branches
    :param reuse: (bool) whether to reuse the "model" variable scope
    :param act_fun: (callable) activation for the hidden layers
    """
    super(NormalMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps,
                                          n_batch, reuse=reuse)
    # Optional running normalizers; built outside the "model" scope so they keep
    # their own variable names ('obs_norm' / 'act_norm').
    if obs_norm_init is not None:
        self.obs_norm = TFNormalizer(sess, 'obs_norm', ob_space.shape[0], reuse=reuse,
                                     **obs_norm_init)
    else:
        self.obs_norm = None
    if act_norm_init is not None:
        self.act_norm = TFNormalizer(sess, 'act_norm', ac_space.shape[0], reuse=reuse,
                                     **act_norm_init)
    else:
        self.act_norm = None
    # Replace the pdtype installed by the base class with a Gaussian distribution
    # type that is aware of the action normalizer (may be None).
    del self._pdtype
    self._pdtype = ActNormGaussProbDistType(ac_space.shape[0], self.act_norm)
    if net_arch is None:
        net_arch = [dict(vf=[64, 64], pi=[64, 64])]
    with tf.variable_scope("model", reuse=reuse):
        # normalization and clipping
        if self.obs_norm is not None:
            extractor_in = self.obs_norm.clip_normalize(
                tf.layers.flatten(self.processed_obs))
        else:
            extractor_in = tf.layers.flatten(self.processed_obs)
        pi_latent, vf_latent = mlp_extractor(extractor_in, net_arch, act_fun)
        self._value_fn = linear(vf_latent, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent,
                                                       init_scale=0.01)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False,
             layers=None, net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn,
             feature_extraction="cnn", **kwargs):
    """Feed-forward policy that additionally exposes its pi/vf latent features
    (`pi_feature_m` / `vf_feature_m`) for downstream regularization.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) observation space
    :param ac_space: (Gym Space) action space
    :param n_env: (int) number of parallel environments
    :param n_steps: (int) steps per environment per update
    :param n_batch: (int) batch size
    :param reuse: (bool) whether to reuse the "model" variable scope
    :param layers: (list or None) deprecated layer sizes; use net_arch instead
    :param net_arch: (list or None) network architecture specification
    :param act_fun: (callable) activation for the MLP extractor
    :param cnn_extractor: (callable) CNN feature extractor (used when
        feature_extraction == "cnn")
    :param feature_extraction: (str) "cnn" or "mlp"
    """
    super(BSSPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                    reuse=reuse, scale=(feature_extraction == "cnn"))
    self._kwargs_check(feature_extraction, kwargs)
    # Deprecation handling for the legacy `layers` parameter.
    if layers is not None:
        warnings.warn("Usage of the `layers` parameter is deprecated! Use net_arch instead "
                      "(it has a different semantics though).", DeprecationWarning)
        if net_arch is not None:
            warnings.warn("The new `net_arch` parameter overrides the deprecated "
                          "`layers` parameter!", DeprecationWarning)
    if net_arch is None:
        net_arch = [dict(vf=layers if layers is not None else [64, 64],
                         pi=layers if layers is not None else [64, 64])]
    with tf.variable_scope("model", reuse=reuse):
        if feature_extraction == "cnn":
            latent_pi = latent_vf = cnn_extractor(self.processed_obs, **kwargs)
        else:
            latent_pi, latent_vf = mlp_extractor(
                tf.layers.flatten(self.processed_obs), net_arch, act_fun)
        # Keep handles to the latent features for external (e.g. BSS) losses.
        self.pi_feature_m = latent_pi
        self.vf_feature_m = latent_vf
        self._value_fn = linear(latent_vf, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(latent_pi, latent_vf,
                                                       init_scale=0.01)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=True,
             layers=None, net_arch=None, act_fun=tf.nn.relu, feature_extraction="mlp",
             **kwargs):
    """Policy that encodes the 'backbone' observation with a shared transformer
    network, concatenates the 'step_to_end' signal, and feeds the result to an
    MLP actor-critic head.

    NOTE(review): `reuse` defaults to True here, unlike the other policies —
    presumably because the transformer encoder is shared; confirm with callers.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) observation space (dict with 'backbone' and
        'step_to_end' entries, per the accesses below)
    :param ac_space: (Gym Space) action space
    :param n_env: (int) number of parallel environments
    :param n_steps: (int) steps per environment per update
    :param n_batch: (int) batch size
    :param reuse: (bool) whether to reuse the "model" variable scope
    :param layers: (list or None) hidden layer sizes; defaults to [128]
    :param net_arch: (list or None) network architecture specification
    :param act_fun: (callable) activation for the MLP extractor
    :param feature_extraction: (str) passed to the kwargs check only
    """
    super(TransformerPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps,
                                            n_batch, reuse=reuse)
    shared_encoder = TransformerPolicy.get_common_police_network()
    self._kwargs_check(feature_extraction, kwargs)
    if net_arch is None:
        if layers is None:
            layers = [128]
        net_arch = [dict(vf=layers, pi=layers)]
    with tf.variable_scope("model", reuse=reuse):
        encoded = shared_encoder(self.processed_obs['backbone'], None)
        features = tf.layers.flatten(encoded)
        features = tf.keras.layers.Concatenate()(
            [features, self.processed_obs['step_to_end']])
        latent_pi, latent_vf = mlp_extractor(features, net_arch, act_fun)
        self._value_fn = linear(latent_vf, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(latent_pi, latent_vf,
                                                       init_scale=0.01)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **kwargs):
    """Actor-critic policy: shared 256-unit extractor, then a [128, 64] policy
    branch (sigmoid on the last layer) and a [64, 64] value branch.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) the observation space of the environment
    :param ac_space: (Gym Space) the action space of the environment
    :param n_env: (int) number of environments run in parallel
    :param n_steps: (int) number of steps per environment per update
    :param n_batch: (int) batch size (n_env * n_steps)
    :param reuse: (bool) whether to reuse the "model" variable scope
    """
    super(CustomPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                       reuse=reuse, scale=True)
    self._is_graph_network = True
    with tf.compat.v1.variable_scope("model", reuse=reuse):
        activ = tf.nn.relu
        # Shared trunk: a single 256-unit layer feeding both branches.
        shapesShared = [256]
        extracted_features = mlp_extractor(self.processed_obs, shapesShared, activ)
        # Policy branch: ReLU on hidden layers, sigmoid on the final layer.
        pi_h = extracted_features[0]
        shapesp = [128, 64]
        for i, layer_size in enumerate(shapesp):
            # CONSISTENCY FIX: use tf.compat.v1.layers.dense throughout (the
            # original mixed the deprecated tf.layers alias with tf.compat.v1).
            dense_out = tf.compat.v1.layers.dense(pi_h, layer_size, name='pi_fc' + str(i))
            pi_h = tf.nn.sigmoid(dense_out) if i == len(shapesp) - 1 else activ(dense_out)
        pi_latent = pi_h
        # Value branch: two 64-unit ReLU layers plus a scalar head.
        vf_h = extracted_features[1]
        for i, layer_size in enumerate([64, 64]):
            vf_h = activ(tf.compat.v1.layers.dense(vf_h, layer_size, name='vf_fc' + str(i)))
        value_fn = tf.compat.v1.layers.dense(vf_h, 1, name='vf')
        vf_latent = vf_h
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent,
                                                       init_scale=0.01)
    self._value_fn = value_fn
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False,
             layers=None, net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn,
             feature_extraction="gnn", layer_size=64, layer_count=2, network_graphs=None,
             dm_memory_length=None, iterations=10, vf_arch="mlp", **kwargs):
    """Feed-forward policy with graph-network feature extraction options.

    BUG FIX: `net_arch` previously defaulted to a mutable list literal
    (`[dict(...)]`), the classic shared-mutable-default pitfall; it now
    defaults to None and the same value is constructed per call.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) observation space
    :param ac_space: (Gym Space) action space
    :param n_env: (int) number of parallel environments
    :param n_steps: (int) steps per environment per update
    :param n_batch: (int) batch size
    :param reuse: (bool) whether to reuse the "model" variable scope
    :param layers: (list or None) unused here; kept for interface compatibility
    :param net_arch: (list or None) MLP architecture; defaults to three 128-unit
        layers for both pi and vf (only used for the "mlp" fallback)
    :param act_fun: (callable) activation for the extractors
    :param cnn_extractor: (callable) CNN extractor for feature_extraction == "cnn"
    :param feature_extraction: (str) one of "cnn", "gnn", "gnn_iter", else mlp
    :param layer_size: (int) GNN layer width
    :param layer_count: (int) number of GNN layers
    :param network_graphs: (list) graphs; graph 0 sizes the final pi projection
    :param dm_memory_length: (int or None) memory length for the GNN extractors
    :param iterations: (int) GNN message-passing iterations
    :param vf_arch: (str) value-function architecture for the GNN extractors
    """
    super(FeedForwardPolicyWithGnn, self).__init__(sess, ob_space, ac_space, n_env,
                                                   n_steps, n_batch, reuse=reuse,
                                                   scale=(feature_extraction == "cnn"))
    self._kwargs_check(feature_extraction, kwargs)
    if net_arch is None:
        net_arch = [dict(vf=[128, 128, 128], pi=[128, 128, 128])]
    with tf.variable_scope("model", reuse=reuse):
        if feature_extraction == "cnn":
            pi_latent = vf_latent = cnn_extractor(self.processed_obs, **kwargs)
        elif feature_extraction == "gnn":
            pi_latent, vf_latent = gnn_extractor(
                tf.layers.flatten(self.processed_obs), act_fun, network_graphs,
                dm_memory_length, layer_size=layer_size, layer_count=layer_count,
                iterations=iterations, vf_arch=vf_arch)
        elif feature_extraction == "gnn_iter":
            pi_latent, vf_latent = gnn_iter_extractor(
                tf.layers.flatten(self.processed_obs), act_fun, network_graphs,
                dm_memory_length, layer_size=layer_size, layer_count=layer_count,
                iterations=iterations, vf_arch=vf_arch)
        else:  # Assume mlp feature extraction
            pi_latent, vf_latent = mlp_extractor(
                tf.layers.flatten(self.processed_obs), net_arch, act_fun)
        # Need this here as removed from proba_distribution.
        # OK to choose the first graph as mlp can only run on one graph anyway.
        pi_latent = linear(pi_latent, 'pi', network_graphs[0].number_of_edges() + 1,
                           init_scale=0.01, init_bias=0.0)
        self._value_fn = linear(vf_latent, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.proba_distribution_no_pi_linear(pi_latent, vf_latent, init_scale=0.01)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False,
             layers=None, net_arch=None, act_fun=tf.tanh,
             feature_extraction="attention_mlp", n_object=2, **kwargs):
    """Actor-critic policy with an attention-based feature extractor over a set
    of objects, followed by an MLP head.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) observation space
    :param ac_space: (Gym Space) action space
    :param n_env: (int) number of parallel environments
    :param n_steps: (int) steps per environment per update
    :param n_batch: (int) batch size
    :param reuse: (bool) whether to reuse the "model" variable scope
    :param layers: (list or None) deprecated layer sizes; use net_arch instead
    :param net_arch: (list or None) MLP head architecture; defaults to [256, 256]
    :param act_fun: (callable) activation for the MLP head
    :param feature_extraction: (str) one of "attention_mlp",
        "attention_mlp_particle", "self_attention_mlp"
    :param n_object: (int) number of objects attended over
    :raises NotImplementedError: for any other feature_extraction value
    """
    super(AttentionPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps,
                                          n_batch, reuse=reuse,
                                          scale=(feature_extraction == "cnn"))
    self._kwargs_check(feature_extraction, kwargs)
    if layers is not None:
        warnings.warn("Usage of the `layers` parameter is deprecated! Use net_arch instead "
                      "(it has a different semantics though).", DeprecationWarning)
        if net_arch is not None:
            warnings.warn("The new `net_arch` parameter overrides the deprecated "
                          "`layers` parameter!", DeprecationWarning)
    if net_arch is None:
        if layers is None:
            layers = [256, 256]
        net_arch = [dict(vf=layers, pi=layers)]
    with tf.variable_scope("model", reuse=reuse):
        flat_obs = tf.layers.flatten(self.processed_obs)
        if feature_extraction == 'attention_mlp':
            embedded = attention_mlp_extractor2(flat_obs, n_object=n_object, n_units=128)
            latent_pi, latent_vf = mlp_extractor(embedded, net_arch, act_fun)
        elif feature_extraction == 'attention_mlp_particle':
            embedded = attention_mlp_extractor_particle(flat_obs, n_object=n_object,
                                                        n_units=128)
            latent_pi, latent_vf = mlp_extractor(embedded, net_arch, act_fun)
        elif feature_extraction == 'self_attention_mlp':
            latent_pi, latent_vf = self_attention_mlp_extractor(flat_obs,
                                                                n_object=n_object)
        else:
            raise NotImplementedError
        self._value_fn = linear(latent_vf, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(latent_pi, latent_vf,
                                                       init_scale=0.01)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False,
             layers=None, net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn,
             feature_extraction="cnn", **kwargs):
    """Feed-forward policy fusing CNN visual features with low-dimensional
    measurement features in both the policy and value branches.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) observation space
    :param ac_space: (Gym Space) action space
    :param n_env: (int) number of parallel environments
    :param n_steps: (int) steps per environment per update
    :param n_batch: (int) batch size
    :param reuse: (bool) whether to reuse the "model" variable scope
    :param layers: (list or None) deprecated layer sizes; use net_arch instead
    :param net_arch: (list or None) MLP architecture (non-CNN path only)
    :param act_fun: (callable) activation for the MLP extractor
    :param cnn_extractor: (callable) CNN feature extractor
    :param feature_extraction: (str) "cnn" for the fusion path, else plain MLP
    """
    super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps,
                                            n_batch, reuse=reuse,
                                            scale=(feature_extraction == "cnn"))
    self._kwargs_check(feature_extraction, kwargs)
    if layers is not None:
        warnings.warn(
            "Usage of the `layers` parameter is deprecated! Use net_arch instead "
            "(it has a different semantics though).", DeprecationWarning)
        if net_arch is not None:
            warnings.warn(
                "The new `net_arch` parameter overrides the deprecated `layers` parameter!",
                DeprecationWarning)
    if net_arch is None:
        if layers is None:
            layers = [64, 64]
        net_arch = [dict(vf=layers, pi=layers)]
    with tf.variable_scope("model", reuse=reuse):
        if feature_extraction == "cnn":
            activ = tf.nn.tanh
            # The last 8 entries along the trailing axis are treated as
            # measurement features, the rest as image data.
            # NOTE(review): assumes a fixed observation layout — confirm with the env.
            observation_features = self.processed_obs[:, :, -8:]
            observation_features_flat = tf.layers.flatten(
                observation_features)
            visual_features = self.processed_obs[:, :, :-8]
            # Assumes the visual slice flattens to 128*128*15 — TODO confirm.
            visual_features = tf.reshape(visual_features, [-1, 128, 128, 15])
            # Same CNN output feeds both branches before fusion.
            vis_pi_latent = vis_vf_latent = cnn_extractor(
                visual_features, **kwargs)
            vis_pi_latent = tf.reshape(vis_pi_latent, [-1, 1, 512])
            vis_vf_latent = tf.reshape(vis_vf_latent, [-1, 1, 512])
            # Policy branch: embed measurements to 512, concat with visual
            # features, project to a 128-dim latent.
            meas_pi_h = activ(
                linear(observation_features_flat, "pi_meas_fc", 512,
                       init_scale=np.sqrt(2)))
            meas_pi_latent = tf.reshape(meas_pi_h, [-1, 1, 512])
            features = tf.layers.flatten(
                tf.concat([vis_pi_latent, meas_pi_latent], axis=2))
            pi_latent = activ(
                linear(features, "pi_fc", 128, init_scale=np.sqrt(2)))
            # Value branch: mirrors the policy branch with its own weights.
            meas_vf_h = activ(
                linear(observation_features_flat, "vf_meas_fc", 512,
                       init_scale=np.sqrt(2)))
            meas_vf_latent = tf.reshape(meas_vf_h, [-1, 1, 512])
            features = tf.layers.flatten(
                tf.concat([vis_vf_latent, meas_vf_latent], axis=2))
            vf_latent = activ(
                linear(features, "vf_fc", 128, init_scale=np.sqrt(2)))
        else:
            pi_latent, vf_latent = mlp_extractor(
                tf.layers.flatten(self.processed_obs),
                net_arch, act_fun)
        self._value_fn = linear(vf_latent, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent,
                                                       init_scale=0.01)
    self._setup_init()
def observation_input(ob_space, batch_size=None, name='Ob', scale=False, reuse=False):
    """
    Build observation input with encoding depending on the observation space type

    When using Box ob_space, the input will be normalized between [0, 1] on the bounds
    ob_space.low and ob_space.high.

    :param ob_space: (Gym Space) The observation space
    :param batch_size: (int) batch size for input
        (default is None, so that resulting input placeholder can take tensors with any batch size)
    :param name: (str) tensorflow variable name for input placeholder
    :param scale: (bool) whether or not to scale the input
    :param reuse: (bool) whether to reuse the variables of the embedding sub-network (Dict case)
    :return: (TensorFlow Tensor, TensorFlow Tensor) input_placeholder, processed_input_tensor
    """
    if isinstance(ob_space, Discrete):
        # Discrete observations become one-hot float vectors.
        observation_ph = tf.placeholder(shape=(batch_size, ), dtype=tf.int32, name=name)
        processed_observations = tf.cast(
            tf.one_hot(observation_ph, ob_space.n), tf.float32)
        return observation_ph, processed_observations
    elif isinstance(ob_space, Box):
        observation_ph = tf.placeholder(shape=(batch_size, ) + ob_space.shape,
                                        dtype=ob_space.dtype, name=name)
        processed_observations = tf.cast(observation_ph, tf.float32)
        # rescale to [0, 1] if the bounds are defined (finite and non-degenerate)
        if (scale and
                not np.any(np.isinf(ob_space.low)) and
                not np.any(np.isinf(ob_space.high)) and
                np.any((ob_space.high - ob_space.low) != 0)):
            # equivalent to processed_observations / 255.0 when bounds are set to [0, 255]
            processed_observations = ((processed_observations - ob_space.low) /
                                      (ob_space.high - ob_space.low))
        return observation_ph, processed_observations
    elif isinstance(ob_space, MultiBinary):
        observation_ph = tf.placeholder(shape=(batch_size, ob_space.n),
                                        dtype=tf.int32, name=name)
        processed_observations = tf.cast(observation_ph, tf.float32)
        return observation_ph, processed_observations
    elif isinstance(ob_space, MultiDiscrete):
        # One-hot encode each sub-space and concatenate along the last axis.
        observation_ph = tf.placeholder(shape=(batch_size, len(ob_space.nvec)),
                                        dtype=tf.int32, name=name)
        processed_observations = tf.concat([
            tf.cast(tf.one_hot(input_split, ob_space.nvec[i]), tf.float32)
            for i, input_split in enumerate(
                tf.split(observation_ph, len(ob_space.nvec), axis=-1))
        ], axis=-1)
        return observation_ph, processed_observations
    elif isinstance(ob_space, Dict):
        # Flattened-dict path: a single float placeholder holds every sub-space
        # concatenated. NOTE(review): the slicing below hard-codes the key order
        # (index 1 is treated as the "day" part, the last as "prev_sales") —
        # confirm against the environment's Dict ordering.
        ob_space_dict = list(OrderedDict(ob_space.spaces))
        ob_space_length = np.array(
            [np.prod(np.array(ob_space[key].shape)) for key in ob_space_dict])
        observation_ph = tf.placeholder(shape=(batch_size, np.sum(ob_space_length)),
                                        dtype=tf.float32, name=name)
        observation_day_ph = observation_ph[:, :ob_space_length[1]]
        processed_observation_day = tf.cast(observation_day_ph, tf.float32)
        # observation_board_ph = observation_ph[:, (ob_space_length[1]+1):(ob_space_length[1]+ob_space_length[0])]
        # processed_observation_board = tf.cast(observation_board_ph, tf.float32)
        # # rescale to [0, 1] if the bounds are defined
        # if (scale and
        # not np.any(np.isinf(ob_space["board_config"].low)) and
        # not np.any(np.isinf(ob_space["board_config"].high)) and
        # np.any((ob_space["board_config"].high - ob_space["board_config"].low) != 0)):
        # # equivalent to processed_observations / 255.0 when bounds are set to [0, 255]
        # processed_observation_board = ((processed_observation_board - ob_space["board_config"].low) /
        # (ob_space["board_config"].high - ob_space["board_config"].low))
        observation_prevsales_ph = observation_ph[:, -ob_space_length[-1]:]
        processed_observation_prevsales = tf.cast(observation_prevsales_ph, tf.float32)
        # rescale to [0, 1] if the bounds are defined
        if (scale and
                not np.any(np.isinf(ob_space["prev_sales"].low)) and
                not np.any(np.isinf(ob_space["prev_sales"].high)) and
                np.any((ob_space["prev_sales"].high - ob_space["prev_sales"].low) != 0)):
            # equivalent to processed_observations / 255.0 when bounds are set to [0, 255]
            processed_observation_prevsales = (
                (processed_observation_prevsales - ob_space["prev_sales"].low) /
                (ob_space["prev_sales"].high - ob_space["prev_sales"].low))
        # TODO: these should be in params
        net_arch = None
        act_fun = tf.tanh
        # net_arch was just set to None above, so this always applies the default.
        if net_arch is None:
            net_arch = [32, 16]
        with tf.variable_scope("input_embedding", reuse=reuse):
            # with tf.variable_scope("board_embed", reuse=reuse):
            # board_latent, _ = mlp_extractor(tf.layers.flatten(processed_observation_board), net_arch, act_fun)
            with tf.variable_scope("prevsales_embed", reuse=reuse):
                # Embed the prev_sales slice with a small MLP; only the policy
                # latent is used, the value latent is discarded.
                prevsales, _ = mlp_extractor(
                    tf.layers.flatten(processed_observation_prevsales),
                    net_arch, act_fun)
            processed_observations = tf.concat(
                [
                    processed_observation_day,
                    # board_latent,
                    prevsales
                ], axis=-1, name="final_obs")
        # TODO: watch out! the processed observation is passed as observation_ph
        return observation_ph, processed_observations
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False,
             layers=None, net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn,
             feature_extraction="mlp", **kwargs):
    """Feed-forward policy with custom initialization scales/biases for the
    probability distribution heads.

    CLEANUP: removed nine leftover debug `print` statements and a redundant
    bare `return` at the end of `__init__`.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) observation space
    :param ac_space: (Gym Space) action space
    :param n_env: (int) number of parallel environments
    :param n_steps: (int) steps per environment per update
    :param n_batch: (int) batch size
    :param reuse: (bool) whether to reuse the "model" variable scope
    :param layers: (list or None) deprecated layer sizes; use net_arch instead
    :param net_arch: (list or None) network architecture; defaults to [64, 64]
    :param act_fun: (callable) activation for the MLP extractor
    :param cnn_extractor: (callable) CNN extractor for feature_extraction == "cnn"
    :param feature_extraction: (str) "cnn" or "mlp"
    """
    super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps,
                                            n_batch, reuse=reuse,
                                            scale=(feature_extraction == "cnn"))
    # Rebuild the pdtype so the extended proba_distribution_from_latent
    # keyword arguments used below are available.
    self._pdtype = make_proba_dist_type(ac_space)
    self._kwargs_check(feature_extraction, kwargs)
    if layers is not None:
        warnings.warn("Usage of the `layers` parameter is deprecated! Use net_arch instead "
                      "(it has a different semantics though).", DeprecationWarning)
        if net_arch is not None:
            warnings.warn("The new `net_arch` parameter overrides the deprecated "
                          "`layers` parameter!", DeprecationWarning)
    if net_arch is None:
        if layers is None:
            layers = [64, 64]
        net_arch = [dict(vf=layers, pi=layers)]
    with tf.variable_scope("model", reuse=reuse):
        if feature_extraction == "cnn":
            pi_latent = vf_latent = cnn_extractor(self.processed_obs, **kwargs)
        else:
            pi_latent, vf_latent = mlp_extractor(
                tf.layers.flatten(self.processed_obs), net_arch, act_fun)
        self._value_fn = linear(vf_latent, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent,
                                                       pi_init_scale=1.0,
                                                       pi_init_bias=0.0,
                                                       pi_init_std=0.125,
                                                       vf_init_scale=1.0,
                                                       vf_init_bias=0.0)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False,
             layers=None, goal_num=1, goal_net_arch=None, net_arch=None, act_fun=tf.tanh,
             cnn_extractor=nature_cnn, goal_encoder='mlp', feature_extraction="mlp",
             **kwargs):
    """Goal-conditioned MLP policy: encodes goal observations to a latent z
    (optionally VAE-style sampled) and concatenates it with the processed
    observation before the actor-critic head.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) observation space; also sizes the goal placeholder
    :param ac_space: (Gym Space) action space
    :param n_env: (int) number of parallel environments
    :param n_steps: (int) steps per environment per update
    :param n_batch: (int) batch size
    :param reuse: (bool) unused below — the model scope uses tf.AUTO_REUSE
    :param layers: (list or None) deprecated layer sizes; use net_arch instead
    :param goal_num: (int) number of goals (currently unused; see the disabled reshape)
    :param goal_net_arch: (list or None) goal-encoder architecture; defaults to [[64, 32], 2]
    :param net_arch: (list or None) actor-critic MLP architecture; defaults to [64, 64]
    :param act_fun: (callable) activation function
    :param cnn_extractor: (callable) CNN extractor for feature_extraction == "cnn"
    :param goal_encoder: (str) 'mlp_sample' (VAE sampling), 'mlp' (mean only),
        'no_encoder', or 'no_goal_proposing'
    :param feature_extraction: (str) "cnn" or "mlp"
    """
    # NOTE(review): scale is enabled for "mlp" here (other policies use "cnn") —
    # looks intentional for this class but worth confirming.
    super(GoalsConditionedMLPPolicy, self).__init__(sess, ob_space, ac_space, n_env,
                                                    n_steps, n_batch, reuse=reuse,
                                                    scale=(feature_extraction == "mlp"))
    self.goal_encoder = goal_encoder
    # self._kwargs_check(feature_extraction, kwargs)
    self.name = "mlp_policy_" + goal_encoder
    if layers is not None:
        warnings.warn("Usage of the `layers` parameter is deprecated! Use net_arch instead "
                      "(it has a different semantics though).", DeprecationWarning)
        if net_arch is not None:
            warnings.warn("The new `net_arch` parameter overrides the deprecated "
                          "`layers` parameter!", DeprecationWarning)
    if net_arch is None:
        if layers is None:
            layers = [64, 64]
        net_arch = [dict(vf=layers, pi=layers)]
    if goal_net_arch is None:
        goal_net_arch = [[64, 32], 2]
    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        # Placeholder for the goal observations fed from outside.
        self.obs_goals = tf.placeholder(dtype=ob_space.dtype,
                                        shape=(None, ob_space.shape[0]),
                                        name='goal_states')
        obs_goals_reshape = self.obs_goals  # tf.reshape(tensor=self.obs_goals, shape=(-1, self.goal_num * ob_space.shape[0]))
        if goal_encoder == "mlp_sample":
            # VAE-style encoder: sample z via the reparameterization trick.
            logging.info('mlp encoder with z sampling')
            self.z_mu, self.z_log_sigma_sq = mlp_goal_encoder(
                obs_goals_reshape, goal_net_arch, act_fun)
            eps = tf.random_normal(shape=tf.shape(self.z_log_sigma_sq),
                                   mean=0, stddev=1, dtype=tf.float32)
            self.z_goal_sample = self.z_mu + tf.sqrt(
                tf.exp(self.z_log_sigma_sq)) * eps
        if goal_encoder == "mlp":
            # Deterministic encoder: use the mean as the goal latent.
            logging.info('mlp encoder with z mu')
            self.z_mu, self.z_log_sigma_sq = mlp_goal_encoder(
                obs_goals_reshape, goal_net_arch, act_fun)
            self.z_goal_sample = self.z_mu
        if goal_encoder == "no_encoder" or goal_encoder == 'no_goal_proposing':
            # Pass raw goal observations straight through (no gradients).
            logging.info('no encoder for goal obs')
            self.z_goal_sample = tf.stop_gradient(self.obs_goals)
        # self.z_goal_input = tf.placeholder(dtype=ob_space.dtype, shape=self.z_mu.shape, name='input_z_goal')
        # Externally supplied goal latent, selected via the use_input_z switch.
        self.z_goal_input = tf.placeholder(dtype=ob_space.dtype,
                                           shape=self.z_goal_sample.shape,
                                           name='input_z_goal')
        self.use_input_z = tf.placeholder_with_default(False, shape=(),
                                                       name='use_input_z')

        def use_sample():
            # Branch taken when use_input_z is False.
            return self.z_goal_sample

        def use_input():
            # Branch taken when use_input_z is True.
            return self.z_goal_input

        self.z_goal = tf.cond(self.use_input_z, use_input, use_sample)
        # Condition the policy input on the goal latent (unless disabled).
        if goal_encoder == 'no_goal_proposing':
            latent = tf.layers.flatten(self.processed_obs)
        else:
            latent = tf.concat(
                [tf.layers.flatten(self.processed_obs), self.z_goal], 1)
        logging_info = 'latent shape' + str(latent.shape)
        logging.info(logging_info)
        if feature_extraction == "cnn":
            pi_latent = vf_latent = cnn_extractor(self.processed_obs, **kwargs)
        else:
            pi_latent, vf_latent = mlp_extractor(latent, net_arch, act_fun)
        self._value_fn = linear(vf_latent, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(pi_latent, vf_latent,
                                                       init_scale=0.01)
        # KL regularizer for the sampling encoder; zero otherwise.
        if goal_encoder == "mlp_sample":
            kl_coef = 0.01
            latent_loss = -0.5 * tf.reduce_sum(
                1 + self.z_log_sigma_sq - tf.square(self.z_mu) -
                tf.exp(self.z_log_sigma_sq), axis=1)
            self.latent_loss = tf.reduce_mean(latent_loss) * kl_coef
        else:
            self.latent_loss = 0
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False,
             layers=None, net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn,
             feature_extraction="cnn", **kwargs):
    """MULTIPOLAR-style policy that aggregates the actions of pre-trained
    source policies through a learned distribution type.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) observation space
    :param ac_space: (Gym Space) action space (Box or Discrete only)
    :param n_env: (int) number of parallel environments
    :param n_steps: (int) steps per environment per update
    :param n_batch: (int) batch size
    :param reuse: (bool) whether to reuse the "model" variable scope
    :param layers: (list or None) deprecated layer sizes; use net_arch instead
    :param net_arch: (list or None) network architecture; defaults to [64, 64]
    :param act_fun: (callable) activation for the MLP extractor
    :param cnn_extractor: (callable) CNN extractor for feature_extraction == "cnn"
    :param feature_extraction: (str) "cnn" or "mlp"
    :raises NotImplementedError: for unsupported action spaces
    """
    # Master config (source policy paths, SDW flag, bias flag) is pulled out of
    # kwargs before the base-class kwargs check runs.
    source_policy_paths, SDW, no_bias = get_master_config(kwargs)
    super(AggregatePolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps,
                                          n_batch, reuse=reuse,
                                          scale=(feature_extraction == "cnn"))
    if isinstance(ac_space, spaces.Box):
        num_actions = self.ac_space.shape[0]
        act_dtype = tf.float32
    elif isinstance(ac_space, spaces.Discrete):
        num_actions = ac_space.n
        act_dtype = tf.int64
    else:
        raise NotImplementedError(
            "Multipolar is not implemented for the required action space")
    # Query every source policy for its action on the current observation and
    # build an aggregation distribution over those actions.
    source_acts = get_sources_actions(self.obs_ph, source_policy_paths, n_batch,
                                      num_actions, ac_space, act_dtype)
    self.pdtype = make_multipolar_proba_dist_type(ac_space, source_acts, no_bias,
                                                  SDW, summary=reuse)
    self._kwargs_check(feature_extraction, kwargs)
    if layers is not None:
        warnings.warn("Usage of the `layers` parameter is deprecated! Use net_arch instead "
                      "(it has a different semantics though).", DeprecationWarning)
        if net_arch is not None:
            warnings.warn("The new `net_arch` parameter overrides the deprecated "
                          "`layers` parameter!", DeprecationWarning)
    if net_arch is None:
        if layers is None:
            layers = [64, 64]
        net_arch = [dict(vf=layers, pi=layers)]
    with tf.variable_scope("model", reuse=reuse):
        if feature_extraction == "cnn":
            latent_pi = latent_vf = cnn_extractor(self.processed_obs, **kwargs)
        else:
            latent_pi, latent_vf = mlp_extractor(
                tf.layers.flatten(self.processed_obs), net_arch, act_fun)
        self.value_fn = linear(latent_vf, 'vf', 1)
        self.proba_distribution, self.policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(latent_pi, latent_vf,
                                                       init_scale=0.01)
    self.initial_state = None
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False,
             layers=None, net_arch=None, act_fun=tf.nn.relu, feature_extraction="mlp",
             **kwargs):
    """Policy that flattens the 'backbone' observation, concatenates the
    protein/residue/step context inputs, and feeds the result to an MLP
    actor-critic head.

    :param sess: (tf.Session) the current TensorFlow session
    :param ob_space: (Gym Space) observation space (dict with 'backbone',
        'protein_name', 'residue_number' and 'step_to_end' entries, per the
        accesses below)
    :param ac_space: (Gym Space) action space
    :param n_env: (int) number of parallel environments
    :param n_steps: (int) steps per environment per update
    :param n_batch: (int) batch size
    :param reuse: (bool) whether to reuse the "model" variable scope
    :param layers: (list or None) hidden layer sizes; defaults to [128, 64]
    :param net_arch: (list or None) network architecture specification
    :param act_fun: (callable) activation for the MLP extractor
    :param feature_extraction: (str) passed to the kwargs check only
    """
    super(LstmCustomPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps,
                                           n_batch, reuse=reuse)
    self._kwargs_check(feature_extraction, kwargs)
    if net_arch is None:
        if layers is None:
            layers = [128, 64]
        net_arch = [dict(vf=layers, pi=layers)]
    with tf.variable_scope("model", reuse=reuse):
        # Flatten the backbone observation and append the scalar context inputs.
        backbone_flat = tf.layers.flatten(self.processed_obs['backbone'])
        combined = tf.keras.layers.Concatenate()([
            backbone_flat,
            self.processed_obs['protein_name'],
            self.processed_obs['residue_number'],
            self.processed_obs['step_to_end'],
        ])
        latent_pi, latent_vf = mlp_extractor(combined, net_arch, act_fun)
        self._value_fn = linear(latent_vf, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(latent_pi, latent_vf,
                                                       init_scale=0.01)
    self._setup_init()