def __init__(self, q_function, optimizer, t_max, gamma, i_target,
             explorer, phi=lambda x: x, average_q_decay=0.999,
             logger=getLogger(__name__), batch_states=batch_states):

    # Globally shared Q-function
    self.shared_q_function = q_function
    # Target Q-function used to compute stable update targets
    self.target_q_function = copy.deepcopy(q_function)
    # Thread specific Q-function
    self.q_function = copy.deepcopy(self.shared_q_function)

    async_.assert_params_not_shared(
        self.shared_q_function, self.q_function)

    self.optimizer = optimizer

    self.t_max = t_max
    self.gamma = gamma
    self.explorer = explorer
    self.i_target = i_target
    self.phi = phi
    self.logger = logger
    self.average_q_decay = average_q_decay
    self.batch_states = batch_states

    # Global step counter shared across processes
    self.t_global = mp.Value('l', 0)
    self.t = 0
    self.t_start = 0
    self.past_action_values = {}
    self.past_states = {}
    self.past_rewards = {}
    self.average_q = 0
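# A minimal construction sketch (hypothetical: `NStepQAgent` stands in
# for the enclosing class, and `q_func`, `shared_opt`, and `explorer`
# are assumed to be built by the surrounding codebase):
#
#   agent = NStepQAgent(q_func, shared_opt, t_max=5, gamma=0.99,
#                       i_target=40000, explorer=explorer)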
def __init__(self, model, optimizer, t_max, gamma, beta=1e-2,
             process_idx=0, phi=lambda x: x,
             pi_loss_coef=1.0, v_loss_coef=0.5,
             keep_loss_scale_same=False,
             normalize_grad_by_t_max=False,
             use_average_reward=False, average_reward_tau=1e-2,
             act_deterministically=False,
             average_entropy_decay=0.999,
             average_value_decay=0.999,
             batch_states=batch_states):

    assert isinstance(model, A3CModel)

    # Globally shared model
    self.shared_model = model

    # Thread specific model
    self.model = copy.deepcopy(self.shared_model)
    async_.assert_params_not_shared(self.shared_model, self.model)

    self.optimizer = optimizer

    self.t_max = t_max
    self.gamma = gamma
    self.beta = beta
    self.phi = phi
    self.pi_loss_coef = pi_loss_coef
    self.v_loss_coef = v_loss_coef
    self.keep_loss_scale_same = keep_loss_scale_same
    self.normalize_grad_by_t_max = normalize_grad_by_t_max
    self.use_average_reward = use_average_reward
    self.average_reward_tau = average_reward_tau
    self.act_deterministically = act_deterministically
    self.average_value_decay = average_value_decay
    self.average_entropy_decay = average_entropy_decay
    self.batch_states = batch_states

    self.t = 0
    self.t_start = 0
    self.past_action_log_prob = {}
    self.past_action_entropy = {}
    self.past_states = {}
    self.past_rewards = {}
    self.past_values = {}
    self.average_reward = 0

    # A3C won't use an explorer, but this attribute is referenced by
    # run_dqn
    self.explorer = None

    # Stats
    self.average_value = 0
    self.average_entropy = 0
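# A minimal construction sketch (hypothetical: `A3C` is assumed to be
# the enclosing class; `model` must satisfy the `A3CModel` interface
# asserted above, and `shared_opt` is an optimizer already set up on
# the shared model):
#
#   agent = A3C(model, shared_opt, t_max=5, gamma=0.99, beta=1e-2)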
def __init__(self, model, optimizer, replay_buffer=None,
             t_max=None, gamma=0.99, tau=1e-2, phi=lambda x: x,
             pi_loss_coef=1.0, v_loss_coef=0.5, rollout_len=10,
             batchsize=1, disable_online_update=False,
             n_times_replay=1, replay_start_size=10**2,
             normalize_loss_by_steps=True,
             act_deterministically=False,
             average_loss_decay=0.999,
             average_entropy_decay=0.999,
             average_value_decay=0.999,
             explorer=None, logger=None,
             batch_states=batch_states,
             backprop_future_values=True,
             train_async=False):

    if train_async:
        # Globally shared model
        self.shared_model = model

        # Thread specific model
        self.model = copy.deepcopy(self.shared_model)
        async_.assert_params_not_shared(self.shared_model, self.model)
    else:
        self.model = model
    self.xp = self.model.xp

    self.optimizer = optimizer
    self.replay_buffer = replay_buffer

    self.t_max = t_max
    self.gamma = gamma
    self.tau = tau
    self.phi = phi
    self.pi_loss_coef = pi_loss_coef
    self.v_loss_coef = v_loss_coef
    self.rollout_len = rollout_len
    if not self.xp.isscalar(batchsize):
        # Fix for Chainer Issue #2807: batchsize must look like a
        # scalar, so cast array-like values to xp.int32.
        batchsize = self.xp.int32(batchsize)
    self.batchsize = batchsize
    self.normalize_loss_by_steps = normalize_loss_by_steps
    self.act_deterministically = act_deterministically
    self.disable_online_update = disable_online_update
    self.n_times_replay = n_times_replay
    self.replay_start_size = replay_start_size
    self.average_loss_decay = average_loss_decay
    self.average_value_decay = average_value_decay
    self.average_entropy_decay = average_entropy_decay
    self.logger = logger if logger else getLogger(__name__)
    self.batch_states = batch_states
    self.backprop_future_values = backprop_future_values
    self.train_async = train_async

    self.t = 0
    self.last_state = None
    self.last_action = None
    self.explorer = explorer
    self.online_batch_losses = []

    # Stats
    self.average_loss = 0
    self.average_value = 0
    self.average_entropy = 0

    self.init_history_data_for_online_update()
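# A minimal construction sketch (hypothetical: `PCL` stands in for the
# enclosing class; synchronous single-process training, so no replay
# buffer or async machinery is passed in):
#
#   agent = PCL(model, opt, replay_buffer=None, t_max=None,
#               gamma=0.99, tau=1e-2, train_async=False)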
def __init__(self, model, optimizer, t_max, gamma, replay_buffer,
             beta=1e-2, phi=lambda x: x,
             pi_loss_coef=1.0, Q_loss_coef=0.5,
             use_trust_region=True,
             trust_region_alpha=0.99,
             trust_region_delta=1,
             truncation_threshold=10,
             disable_online_update=False,
             n_times_replay=8,
             replay_start_size=10**4,
             normalize_loss_by_steps=True,
             act_deterministically=False,
             use_Q_opc=False,
             average_entropy_decay=0.999,
             average_value_decay=0.999,
             average_kl_decay=0.999,
             logger=None):

    # Globally shared model
    self.shared_model = model

    # Globally shared average model used to compute trust regions
    self.shared_average_model = copy.deepcopy(self.shared_model)

    # Thread specific model
    self.model = copy.deepcopy(self.shared_model)
    async_.assert_params_not_shared(self.shared_model, self.model)

    self.optimizer = optimizer
    self.replay_buffer = replay_buffer

    self.t_max = t_max
    self.gamma = gamma
    self.beta = beta
    self.phi = phi
    self.pi_loss_coef = pi_loss_coef
    self.Q_loss_coef = Q_loss_coef
    self.normalize_loss_by_steps = normalize_loss_by_steps
    self.act_deterministically = act_deterministically
    self.use_trust_region = use_trust_region
    self.trust_region_alpha = trust_region_alpha
    self.truncation_threshold = truncation_threshold
    self.trust_region_delta = trust_region_delta
    self.disable_online_update = disable_online_update
    self.n_times_replay = n_times_replay
    self.use_Q_opc = use_Q_opc
    self.replay_start_size = replay_start_size
    self.average_value_decay = average_value_decay
    self.average_entropy_decay = average_entropy_decay
    self.average_kl_decay = average_kl_decay
    self.logger = logger if logger else getLogger(__name__)

    self.t = 0
    self.last_state = None
    self.last_action = None
    # ACER won't use an explorer, but this attribute is referenced by
    # run_dqn
    self.explorer = None

    # Stats
    self.average_value = 0
    self.average_entropy = 0
    self.average_kl = 0

    self.init_history_data_for_online_update()
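# A minimal construction sketch (hypothetical: `ACER` is assumed to be
# the enclosing class, and `EpisodicReplayBuffer` an available helper;
# the replay buffer is a required argument because ACER mixes on-policy
# updates with off-policy replay):
#
#   replay_buffer = EpisodicReplayBuffer(10**4)
#   agent = ACER(model, shared_opt, t_max=5, gamma=0.99,
#                replay_buffer=replay_buffer)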
def __init__(self, generator, discriminator, gen_optimizer,
             dis_optimizer, dataset, conditional, reward_mode, imsize,
             max_episode_steps, rollout_n, gamma, beta, gp_lambda,
             lambda_R, staying_penalty, empty_drawing_penalty,
             n_save_final_obs_interval, outdir,
             act_deterministically=False,
             average_entropy_decay=0.999,
             average_value_decay=0.999,
             process_idx=0,
             pi_loss_coef=1.0,
             v_loss_coef=1.0):

    # Globally shared models
    self.shared_generator = generator
    self.shared_discriminator = discriminator

    # Process specific models
    self.generator = copy.deepcopy(self.shared_generator)
    async_.assert_params_not_shared(self.shared_generator,
                                    self.generator)
    self.discriminator = copy.deepcopy(self.shared_discriminator)
    async_.assert_params_not_shared(self.shared_discriminator,
                                    self.discriminator)

    self.gen_optimizer = gen_optimizer
    self.dis_optimizer = dis_optimizer
    self.dataset = dataset
    self.conditional = conditional
    assert reward_mode in ('l2', 'dcgan', 'wgangp')
    self.reward_mode = reward_mode
    self.imsize = imsize
    self.max_episode_steps = max_episode_steps
    self.rollout_n = rollout_n
    self.gamma = gamma
    self.beta = beta
    self.gp_lambda = gp_lambda
    self.lambda_R = lambda_R
    self.staying_penalty = staying_penalty
    self.empty_drawing_penalty = empty_drawing_penalty
    self.n_save_final_obs_interval = n_save_final_obs_interval
    self.outdir = outdir
    self.act_deterministically = act_deterministically
    self.average_entropy_decay = average_entropy_decay
    self.average_value_decay = average_value_decay
    self.pi_loss_coef = pi_loss_coef
    self.v_loss_coef = v_loss_coef

    self.observation_saver = ObservationSaver(self.outdir,
                                              self.rollout_n,
                                              self.imsize)

    # Initialize stats
    self.stat_average_value = 0.0
    self.stat_average_entropy = 0.0
    self.update_n = 0  # number of updates

    self.__reset_flags()
    self.__reset_buffers()
    self.__reset_stats()
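# A minimal construction sketch (hypothetical: `SpiralAgent` stands in
# for the enclosing class; the generator/discriminator networks, their
# optimizers, and the dataset are assumed to be built elsewhere):
#
#   agent = SpiralAgent(gen, dis, gen_opt, dis_opt, dataset,
#                       conditional=False, reward_mode='wgangp',
#                       imsize=64, max_episode_steps=10, rollout_n=16,
#                       gamma=0.99, beta=1e-2, gp_lambda=10.0,
#                       lambda_R=1.0, staying_penalty=0.0,
#                       empty_drawing_penalty=1.0,
#                       n_save_final_obs_interval=1000,
#                       outdir='results')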