# SAC-v2: twin Q-critics with Polyak-averaged targets and a learned temperature (state-vector inputs, keyword defaults).
def __init__(self, state_dim, action_dim, hidden_dim=256, training_step=1, alpha=0.1, train_alpha=True,
             batch_size=128, buffer_size=1e6, tau=0.005, learning_rate=0.0003, gamma=0.99, reward_scale=1,
             training_start=500):
    self.buffer = Buffer(int(buffer_size))  # default 1e6 is a float; cast in case Buffer expects an int capacity

    self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

    self.state_dim = state_dim
    self.action_dim = action_dim

    self.batch_size = batch_size
    self.tau = tau
    self.gamma = gamma
    self.reward_scale = reward_scale
    self.training_start = training_start
    self.training_step = training_step

    # Temperature is optimized in log space toward target_entropy = -|A| when train_alpha is True.
    self.log_alpha = tf.Variable(np.log(alpha), dtype=tf.float32, trainable=True)
    self.target_entropy = -action_dim
    self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.train_alpha = train_alpha

    self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
    self.critic1 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
    self.target_critic1 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
    self.critic2 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
    self.target_critic2 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))

    copy_weight(self.critic1, self.target_critic1)
    copy_weight(self.critic2, self.target_critic2)

    self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                         'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2}
    self.name = 'SAC_v2'
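# The constructors in this section all keep Polyak (soft) target copies controlled by `tau`.
# A minimal sketch of that update, assuming `tensorflow as tf` is imported at module level as in
# the surrounding code and the networks are tf.keras models (hypothetical helper name; the repo's
# own update routine may differ):
def soft_update_sketch(network, target_network, tau):
    # target <- tau * online + (1 - tau) * target, variable by variable
    for param, target_param in zip(network.variables, target_network.variables):
        target_param.assign(tau * param + (1.0 - tau) * target_param)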
# SAC-v1: twin Q-critics plus a separate state-value network with a target copy; fixed temperature alpha.
def __init__(self, state_dim, action_dim, hidden_dim=256, training_step=1, batch_size=128, buffer_size=1e6,
             tau=0.005, learning_rate=0.0003, gamma=0.99, alpha=0.2, reward_scale=1, training_start=500):
    self.buffer = Buffer(int(buffer_size))  # default 1e6 is a float

    self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.v_network_optimizer = tf.keras.optimizers.Adam(learning_rate)

    self.state_dim = state_dim
    self.action_dim = action_dim

    self.batch_size = batch_size
    self.tau = tau
    self.gamma = gamma
    self.alpha = alpha
    self.reward_scale = reward_scale
    self.training_start = training_start
    self.training_step = training_step

    self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
    self.critic1 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
    self.critic2 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
    self.v_network = V_network(self.state_dim, (hidden_dim, hidden_dim))
    self.target_v_network = V_network(self.state_dim, (hidden_dim, hidden_dim))

    copy_weight(self.v_network, self.target_v_network)

    self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                         'V_network': self.v_network, 'Target_V_network': self.target_v_network}
    self.name = 'SAC_v1'
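# In SAC-v1 the state-value network is regressed toward min(Q1, Q2) - alpha * log_pi.
# A minimal sketch of that regression target, assuming the actor returns (action, log_prob) and the
# critics take (state, action) pairs (hypothetical signatures; the repo's APIs may differ):
def v_target_sketch(actor, critic1, critic2, states, alpha):
    actions, log_probs = actor(states)
    min_q = tf.minimum(critic1(states, actions), critic2(states, actions))
    return tf.stop_gradient(min_q - alpha * log_probs)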
# SAC-v2, argparse-configured variant.
def __init__(self, state_dim, action_dim, args):
    self.buffer = Buffer(args.buffer_size)

    self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
    self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

    self.state_dim = state_dim
    self.action_dim = action_dim

    self.batch_size = args.batch_size
    self.tau = args.tau
    self.gamma = args.gamma
    self.training_start = args.training_start
    self.training_step = args.training_step
    self.current_step = 0
    self.critic_update = args.critic_update

    # Learned temperature (optimized only when args.train_alpha is True).
    self.log_alpha = tf.Variable(np.log(args.alpha), dtype=tf.float32, trainable=True)
    self.target_entropy = -action_dim
    self.alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr)
    self.train_alpha = args.train_alpha

    self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim,
                                         args.log_std_min, args.log_std_max)
    self.critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.target_critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.target_critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)

    copy_weight(self.critic1, self.target_critic1)
    copy_weight(self.critic2, self.target_critic2)

    self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                         'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2}
    self.name = 'SAC_v2'
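# When train_alpha is True, log_alpha is optimized so the policy's entropy tracks
# target_entropy = -action_dim. A minimal sketch of the standard SAC temperature loss
# (the repo's exact objective may differ):
def alpha_loss_sketch(log_alpha, log_probs, target_entropy):
    # log_probs: log-probabilities of actions freshly sampled from the current policy
    return -tf.reduce_mean(tf.exp(log_alpha) * tf.stop_gradient(log_probs + target_entropy))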
# TD3: twin Q-critics, delayed policy updates (policy_delay), and target-policy smoothing.
def __init__(self, state_dim, action_dim, args):
    self.buffer = Buffer(args.buffer_size)

    self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
    self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

    self.state_dim = state_dim
    self.action_dim = action_dim

    self.batch_size = args.batch_size
    self.gamma = args.gamma
    self.tau = args.tau
    self.actor_lr = args.actor_lr
    self.critic_lr = args.critic_lr
    self.policy_delay = args.policy_delay
    self.actor_noise = args.actor_noise
    self.target_noise = args.target_noise
    self.noise_clip = args.noise_clip
    self.training_start = args.training_start
    self.training_step = args.training_step
    self.current_step = 0

    self.actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.target_actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.target_critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.target_critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)

    copy_weight(self.actor, self.target_actor)
    copy_weight(self.critic1, self.target_critic1)
    copy_weight(self.critic2, self.target_critic2)

    self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                         'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2}
    self.name = 'TD3'
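# target_noise and noise_clip implement TD3's target-policy smoothing.
# A minimal sketch, assuming actions are bounded to [-1, 1] (hypothetical helper; the repo's
# training step may differ):
def smoothed_target_action_sketch(target_actor, next_states, target_noise, noise_clip):
    target_actions = target_actor(next_states)
    noise = tf.clip_by_value(tf.random.normal(tf.shape(target_actions), stddev=target_noise),
                             -noise_clip, noise_clip)
    return tf.clip_by_value(target_actions + noise, -1.0, 1.0)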
# SAC-v1, argparse-configured variant.
def __init__(self, state_dim, action_dim, args):
    self.buffer = Buffer(args.buffer_size)

    self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
    self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.v_network_optimizer = tf.keras.optimizers.Adam(args.v_lr)

    self.state_dim = state_dim
    self.action_dim = action_dim

    self.batch_size = args.batch_size
    self.tau = args.tau
    self.gamma = args.gamma
    self.alpha = args.alpha
    self.training_start = args.training_start
    self.training_step = args.training_step
    self.current_step = 0

    self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim,
                                         args.log_std_min, args.log_std_max)
    self.critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.v_network = V_network(self.state_dim, args.hidden_dim)
    self.target_v_network = V_network(self.state_dim, args.hidden_dim)

    copy_weight(self.v_network, self.target_v_network)

    self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                         'V_network': self.v_network, 'Target_V_network': self.target_v_network}
    self.name = 'SAC_v1'
# DDPG: single Q-critic with target actor/critic and Gaussian exploration noise (noise_scale).
def __init__(self, state_dim, action_dim, args):
    self.buffer = Buffer(args.buffer_size)

    self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
    self.critic_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

    self.state_dim = state_dim
    self.action_dim = action_dim

    self.batch_size = args.batch_size
    self.gamma = args.gamma
    self.tau = args.tau
    self.noise_scale = args.noise_scale
    self.training_start = args.training_start
    self.training_step = args.training_step
    self.current_step = 0

    self.actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.target_actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.critic = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
    self.target_critic = Q_network(self.state_dim, self.action_dim, args.hidden_dim)

    copy_weight(self.actor, self.target_actor)
    copy_weight(self.critic, self.target_critic)

    self.network_list = {'Actor': self.actor, 'Target_Actor': self.target_actor,
                         'Critic': self.critic, 'Target_Critic': self.target_critic}
    self.name = 'DDPG'
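# noise_scale above is the stddev of the Gaussian noise added to actions during exploration.
# A minimal sketch, assuming actions are bounded to [-1, 1] (hypothetical helper):
def ddpg_exploration_action_sketch(actor, state, noise_scale):
    action = actor(tf.expand_dims(state, axis=0))[0]
    noise = tf.random.normal(tf.shape(action), stddev=noise_scale)
    return tf.clip_by_value(action + noise, -1.0, 1.0)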
# RAD-SACv2: SAC-v2 on pixel observations with a shared convolutional encoder and random data augmentations.
def __init__(self, obs_dim, action_dim, args):
    self.buffer = Buffer(args.buffer_size)

    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.image_size = args.image_size
    self.pre_image_size = args.pre_image_size
    self.current_step = 0

    self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha), trainable=True)
    self.target_entropy = -action_dim
    self.gamma = args.gamma

    self.batch_size = args.batch_size
    self.feature_dim = args.feature_dim
    self.layer_num = args.layer_num
    self.filter_num = args.filter_num
    self.tau = args.tau
    self.encoder_tau = args.encoder_tau
    self.critic_update = args.critic_update

    self.training_start = args.training_start
    self.training_step = args.training_step
    self.train_alpha = args.train_alpha

    self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim,
                                         args.log_std_min, args.log_std_max)
    self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)

    self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)
    self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)

    copy_weight(self.critic1, self.target_critic1)
    copy_weight(self.critic2, self.target_critic2)
    copy_weight(self.encoder, self.target_encoder)

    self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
    self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr, beta_1=0.5)

    self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                         'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2,
                         'Encoder': self.encoder, 'Target_Encoder': self.target_encoder}

    # Augmentations are stored as callables and applied to observation batches at training time.
    # The original mixed bare references with immediate calls (e.g. rad.random_grayscale());
    # they are kept as references here on the assumption that each entry is invoked later.
    self.aug_funcs = {}
    self.aug_list = {
        'crop': rad.crop,
        'grayscale': rad.random_grayscale,
        'cutout': rad.random_cutout,
        'cutout_color': rad.random_cutout_color,
        'flip': rad.random_flip,
        'rotate': rad.random_rotation,
        'rand_conv': rad.random_convolution,
        'color_jitter': rad.random_color_jitter,
        'no_aug': rad.no_aug,
    }
    for aug_name in args.data_augs.split('-'):
        assert aug_name in self.aug_list
        self.aug_funcs[aug_name] = self.aug_list[aug_name]

    self.name = 'RAD_SACv2'
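# aug_funcs holds the subset of aug_list selected by args.data_augs (e.g. "crop-grayscale").
# A minimal sketch of applying the chosen augmentations to a sampled observation batch, assuming
# each entry is a callable over a batch of images (the actual train loop may also handle resizing
# to image_size / pre_image_size separately):
def augment_batch_sketch(obs_batch, aug_funcs):
    for _, aug in aug_funcs.items():
        obs_batch = aug(obs_batch)
    return obs_batch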
# CURL-TD3: TD3 on encoder features with a contrastive (CURL) auxiliary objective.
def __init__(self, obs_dim, action_dim, args):
    self.buffer = Buffer(args.buffer_size)

    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.image_size = obs_dim[-1]
    self.current_step = 0

    self.gamma = args.gamma
    self.batch_size = args.batch_size
    self.feature_dim = args.feature_dim
    self.curl_latent_dim = args.curl_latent_dim
    self.layer_num = args.layer_num
    self.filter_num = args.filter_num
    self.tau = args.tau
    self.encoder_tau = args.encoder_tau

    self.policy_delay = args.policy_delay
    self.actor_noise = args.actor_noise
    self.target_noise = args.target_noise
    self.noise_clip = args.noise_clip

    self.training_start = args.training_start
    self.training_step = args.training_step

    self.actor = Policy_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.target_actor = Policy_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)

    self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)
    self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)

    copy_weight(self.actor, self.target_actor)
    copy_weight(self.critic1, self.target_critic1)
    copy_weight(self.critic2, self.target_critic2)
    copy_weight(self.encoder, self.target_encoder)

    self.curl = CURL(self.feature_dim, self.curl_latent_dim)

    self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
    self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
    self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)

    self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                         'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2,
                         'Curl': self.curl, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder}
    self.name = 'CURL_TD3'
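# The CURL head scores anchor/positive encoder features with a learned bilinear product and trains
# them with an InfoNCE loss whose positives lie on the diagonal. A minimal sketch, assuming W is the
# bilinear weight matrix held by the CURL module (hypothetical access; the repo's CURL class may
# compute its logits differently):
def curl_infonce_loss_sketch(z_anchor, z_positive, W):
    logits = tf.matmul(z_anchor, tf.matmul(W, z_positive, transpose_b=True))
    logits = logits - tf.reduce_max(logits, axis=1, keepdims=True)  # numerical stability
    labels = tf.range(tf.shape(logits)[0])
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))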
# CURL-SACv2: SAC-v2 on encoder features with a contrastive (CURL) auxiliary objective.
def __init__(self, obs_dim, action_dim, args):
    self.buffer = Buffer(args.buffer_size)

    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.image_size = obs_dim[-1]
    self.current_step = 0

    self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha), trainable=True)
    self.target_entropy = -action_dim
    self.gamma = args.gamma

    self.batch_size = args.batch_size
    self.feature_dim = args.feature_dim
    self.curl_latent_dim = args.curl_latent_dim
    self.layer_num = args.layer_num
    self.filter_num = args.filter_num
    self.tau = args.tau
    self.encoder_tau = args.encoder_tau
    self.critic_update = args.critic_update

    self.training_start = args.training_start
    self.training_step = args.training_step
    self.train_alpha = args.train_alpha

    self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim,
                                         args.log_std_min, args.log_std_max)
    self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)

    self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)
    self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)

    copy_weight(self.critic1, self.target_critic1)
    copy_weight(self.critic2, self.target_critic2)
    copy_weight(self.encoder, self.target_encoder)

    self.curl = CURL(self.feature_dim, self.curl_latent_dim)

    self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
    self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
    self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)
    self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr, beta_1=0.5)

    self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                         'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2,
                         'Curl': self.curl, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder}
    self.name = 'CURL_SACv2'
# CURL-SACv1: SAC-v1 on encoder features with a contrastive (CURL) auxiliary objective;
# actor, critics, and value networks use orthogonal kernel initialization.
def __init__(self, obs_dim, action_dim, args):
    self.buffer = Buffer(args.buffer_size)

    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.image_size = obs_dim[-1]

    self.gamma = args.gamma
    self.alpha = args.alpha
    self.batch_size = args.batch_size
    self.feature_dim = args.feature_dim
    self.curl_latent_dim = args.curl_latent_dim
    self.layer_num = args.layer_num
    self.filter_num = args.filter_num
    self.tau = args.tau
    self.encoder_tau = args.encoder_tau
    self.training_start = args.training_start
    self.training_step = args.training_step

    self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)
    self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)

    self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim,
                                         args.log_std_min, args.log_std_max,
                                         kernel_initializer=tf.keras.initializers.orthogonal())
    self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim,
                             kernel_initializer=tf.keras.initializers.orthogonal())
    self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim,
                             kernel_initializer=tf.keras.initializers.orthogonal())
    self.v_network = V_network(self.feature_dim, args.hidden_dim,
                               kernel_initializer=tf.keras.initializers.orthogonal())
    self.target_v_network = V_network(self.feature_dim, args.hidden_dim,
                                      kernel_initializer=tf.keras.initializers.orthogonal())

    self.curl = CURL(self.feature_dim, self.curl_latent_dim)

    copy_weight(self.v_network, self.target_v_network)
    copy_weight(self.encoder, self.target_encoder)

    self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
    self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.v_network_optimizer = tf.keras.optimizers.Adam(args.v_lr)
    self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
    self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr)

    self.current_step = 0

    self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                         'V_network': self.v_network, 'Target_V_network': self.target_v_network,
                         'Curl': self.curl, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder}
    self.name = 'CURL_SACv1'
# DBC-TD3: TD3 on encoder features with latent transition and reward models (keyword defaults).
def __init__(self, obs_dim, action_dim, hidden_dim=512, gamma=0.99, learning_rate=0.001, batch_size=512,
             policy_delay=2, actor_noise=0.1, target_noise=0.2, noise_clip=0.5, buffer_size=1e6,
             feature_dim=50, layer_num=4, filter_num=32, tau=0.005, encoder_tau=0.005, bisim_coef=0.5,
             training_start=1000):
    self.buffer = Buffer(int(buffer_size))  # default 1e6 is a float

    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.hidden_dim = hidden_dim

    self.gamma = gamma
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.feature_dim = feature_dim
    self.layer_num = layer_num
    self.filter_num = filter_num
    self.tau = tau
    self.encoder_tau = encoder_tau
    self.bisim_coef = bisim_coef

    self.policy_delay = policy_delay
    self.actor_noise = actor_noise
    self.target_noise = target_noise
    self.noise_clip = noise_clip
    self.training_start = training_start

    self.actor = Policy_network(feature_dim, action_dim, (hidden_dim, hidden_dim))
    self.target_actor = Policy_network(feature_dim, action_dim, (hidden_dim, hidden_dim))
    self.critic1 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim))
    self.critic2 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim))
    self.target_critic1 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim))
    self.target_critic2 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim))

    self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num, filter_num)
    self.target_encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num, filter_num)

    self.dynamics_model = Transition_Network(feature_dim, action_dim)
    self.reward_model = Reward_Network(feature_dim)

    copy_weight(self.actor, self.target_actor)
    copy_weight(self.critic1, self.target_critic1)
    copy_weight(self.critic2, self.target_critic2)
    copy_weight(self.encoder, self.target_encoder)

    self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate)

    self.name = 'DBC_TD3'
# DBC-SACv2: SAC-v2 on encoder features with latent transition and reward models, argparse-configured.
def __init__(self, obs_dim, action_dim, args):
    self.buffer = Buffer(args.buffer_size)

    self.obs_dim = obs_dim
    self.action_dim = action_dim

    self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha), trainable=True)
    self.target_entropy = -action_dim
    self.gamma = args.gamma

    self.batch_size = args.batch_size
    self.feature_dim = args.feature_dim
    self.layer_num = args.layer_num
    self.filter_num = args.filter_num
    self.tau = args.tau
    self.encoder_tau = args.encoder_tau
    self.actor_update = args.actor_update
    self.critic_update = args.critic_update

    self.training_start = args.training_start
    self.training_step = args.training_step
    self.train_alpha = args.train_alpha

    self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim,
                                         args.log_std_min, args.log_std_max)
    self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
    self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)

    self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)
    self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)

    # Probabilistic latent dynamics plus a reward predictor, used for the bisimulation objective.
    self.dynamics_model = Transition_Network(self.feature_dim, action_dim, deterministic=False)
    self.reward_model = Reward_Network(self.feature_dim)

    copy_weight(self.critic1, self.target_critic1)
    copy_weight(self.critic2, self.target_critic2)
    copy_weight(self.encoder, self.target_encoder)

    self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
    self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
    self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
    self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr)
    self.dynamics_optimizer = tf.keras.optimizers.Adam(args.decoder_lr)
    self.reward_optimizer = tf.keras.optimizers.Adam(args.decoder_lr)

    self.current_step = 0

    self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                         'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2,
                         'Encoder': self.encoder, 'Target_Encoder': self.target_encoder,
                         'Dynamics': self.dynamics_model, 'Reward': self.reward_model}
    self.name = 'DBC_SACv2'
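# dynamics_optimizer and reward_optimizer above drive the latent transition and reward models used by
# DBC. A minimal sketch of those model losses, assuming the transition model returns a (mean, sigma)
# pair over the next latent state and the reward model maps latents to scalars (hypothetical
# signatures; the repo's models and losses may differ):
def dbc_model_losses_sketch(dynamics_model, reward_model, z, actions, z_next, rewards):
    mu, sigma = dynamics_model(tf.concat([z, actions], axis=1))
    diff = (mu - tf.stop_gradient(z_next)) / sigma
    transition_loss = tf.reduce_mean(0.5 * tf.square(diff) + tf.math.log(sigma))  # Gaussian NLL up to a constant
    reward_loss = tf.reduce_mean(tf.square(reward_model(mu) - rewards))
    return transition_loss, reward_loss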
# DBC-SACv2, keyword-default variant.
def __init__(self, obs_dim, action_dim, hidden_dim=256, gamma=0.99, learning_rate=1e-5, batch_size=128,
             buffer_size=1e6, feature_dim=50, layer_num=4, filter_num=32, tau=0.005, encoder_tau=0.005,
             bisim_coef=0.5, training_start=1000, train_alpha=True, alpha=0.1):
    self.buffer = Buffer(int(buffer_size))  # default 1e6 is a float

    self.obs_dim = obs_dim
    self.action_dim = action_dim

    self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha), trainable=True)
    self.target_entropy = -action_dim

    self.hidden_dim = hidden_dim
    self.gamma = gamma
    self.learning_rate = learning_rate
    self.bisim_coef = bisim_coef
    self.batch_size = batch_size
    self.feature_dim = feature_dim
    self.layer_num = layer_num
    self.filter_num = filter_num
    self.tau = tau
    self.encoder_tau = encoder_tau
    self.training_start = training_start
    self.train_alpha = train_alpha

    self.actor = Squashed_Gaussian_Actor(feature_dim, action_dim, (hidden_dim, hidden_dim))
    self.critic1 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim))
    self.critic2 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim))
    self.target_critic1 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim))
    self.target_critic2 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim))

    self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num, filter_num)
    self.target_encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num, filter_num)

    self.dynamics_model = Transition_Network(feature_dim, action_dim, deterministic=False)
    self.reward_model = Reward_Network(feature_dim)

    copy_weight(self.critic1, self.target_critic1)
    copy_weight(self.critic2, self.target_critic2)
    copy_weight(self.encoder, self.target_encoder)

    self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.log_alpha_optimizer = tf.keras.optimizers.Adam(10 * learning_rate)  # temperature uses a 10x higher learning rate
    self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate)
    self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate)

    self.name = 'DBC_SACv2'