def __init__(self, actor_optimizer_spec, critic_optimizer_spec, num_feature,
             num_action, replay_memory_size=1000000, batch_size=64, tau=0.001):
    ###############
    # BUILD MODEL #
    ###############
    self.num_feature = num_feature
    self.num_action = num_action
    self.batch_size = batch_size
    self.tau = tau
    # Construct actor and critic
    self.actor = Actor(num_feature, num_action).type(dtype)
    self.target_actor = Actor(num_feature, num_action).type(dtype)
    self.critic = Critic(num_feature, num_action).type(dtype)
    self.target_critic = Critic(num_feature, num_action).type(dtype)
    # Construct the optimizers for actor and critic
    self.actor_optimizer = actor_optimizer_spec.constructor(
        self.actor.parameters(), **actor_optimizer_spec.kwargs)
    self.critic_optimizer = critic_optimizer_spec.constructor(
        self.critic.parameters(), **critic_optimizer_spec.kwargs)
    # Construct the replay memory
    self.replay_memory = ReplayMemory(replay_memory_size)
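# The DDPG constructor above expects optimizer "spec" objects exposing a
# `constructor` and `kwargs`; the hDQN docstring later in this collection
# describes the same OptimizerSpec pattern. A minimal sketch of how such a
# spec might be built and passed in -- the namedtuple fields and the Adam
# hyperparameters are illustrative assumptions, not taken from the original.
from collections import namedtuple

import torch.optim as optim

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

actor_spec = OptimizerSpec(constructor=optim.Adam, kwargs=dict(lr=1e-4))
critic_spec = OptimizerSpec(constructor=optim.Adam,
                            kwargs=dict(lr=1e-3, weight_decay=1e-2))

# agent = DDPG(actor_spec, critic_spec, num_feature=..., num_action=...)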
def __init__(self, args):
    self.args = args
    super(AElearner, self).__init__(args)
    self.cts_eta = args.cts_eta
    self.cts_beta = args.cts_beta
    self.ae_delta = args.ae_delta
    self.batch_size = args.batch_update_size
    self.replay_memory = ReplayMemory(
        args.replay_size,
        self.local_network_upper.get_input_shape(),
        # self.local_network.get_input_shape(),
        self.num_actions)
    # Initialize the density model (also chooses how many steps between
    # updates: 20 * the Q target update steps).
    self._init_density_model(args)
    # Builds the loss op.
    self._double_dqn_op()
    self.which_net_to_update_counter = 0
    self.ae_counter = 0
    self.epsilon_greedy_counter = 0
    self.total_ae_counter = 0
    self.total_epsilon_greedy_counter = 0
    self.q_values_upper_max = []
    self.q_values_lower_max = []
    self.ae_valid_actions = True
    self.action_meanings = self.emulator.env.unwrapped.get_action_meanings()
    self.minimized_actions_counter = {
        value: 0 for value in self.action_meanings
    }
    print(self.minimized_actions_counter)
def __init__(self, optimizer_spec, num_goal=81, num_action=81,
             replay_memory_size=10000, subgoals=81, screen_size=(500, 500),
             batch_size=128):
    ###############
    # BUILD MODEL #
    ###############
    self.num_goal = num_goal
    self.num_action = num_action
    self.batch_size = batch_size
    # Construct meta-controller and controller
    self.meta_controller = MetaController().type(dtype)
    self.target_meta_controller = MetaController().type(dtype)
    self.controller = Controller().type(dtype)
    self.target_controller = Controller().type(dtype)
    # Construct the optimizers for meta-controller and controller
    self.meta_optimizer = optimizer_spec.constructor(
        self.meta_controller.parameters(), **optimizer_spec.kwargs)
    self.ctrl_optimizer = optimizer_spec.constructor(
        self.controller.parameters(), **optimizer_spec.kwargs)
    # Construct the replay memory for meta-controller and controller
    self.meta_replay_memory = ReplayMemory(replay_memory_size)
    self.ctrl_replay_memory = ReplayMemory(replay_memory_size)
    self.subgoals = subgoals
    self.screen_size = screen_size
    self.idx_2_action = self.action_dict()
def __init__(self, args):
    self.final_epsilon = args.final_epsilon
    super(PseudoCountQLearner, self).__init__(args)
    self.cts_eta = args.cts_eta
    self.cts_beta = args.cts_beta
    self.batch_size = args.batch_update_size
    self.replay_memory = ReplayMemory(args.replay_size)
    self._init_density_model(args)
    self._double_dqn_op()
def test_zero_step(self):
    self.memory = ReplayMemory(capacity=10, multi_step_n=0)
    for i in range(5):
        a = Transition([0, 1, 2, i], 0, [4, 5, 6, i * i], 1, False)
        self.memory.push(a)
    final = Transition([0, 1, 2, 10], 0, [4, 5, 6, 100], 10, True)
    self.memory.push(final)
    self.assertEqual(self.memory.memory[0].r, 1)
    self.assertEqual(self.memory.memory[3].r, 1)
    self.assertEqual(self.memory.memory[4].r, 1)
    self.assertEqual(self.memory.memory[5].r, 10)
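# The test above assumes a ReplayMemory that can fold upcoming rewards into
# already-stored transitions (`multi_step_n`), with multi_step_n=0 meaning
# rewards are stored unchanged. A minimal sketch that would satisfy these
# assertions, assuming Transition has positional fields (s, a, s2, r, done);
# the n-step folding rule below is an assumption, not the original code.
import random
from collections import namedtuple

Transition = namedtuple("Transition", ["s", "a", "s2", "r", "done"])


class ReplayMemory:
    """Ring buffer that optionally folds the next n rewards into stored transitions."""

    def __init__(self, capacity, multi_step_n=0, gamma=0.99):
        self.capacity = capacity
        self.multi_step_n = multi_step_n
        self.gamma = gamma
        self.memory = []

    def push(self, transition):
        if self.multi_step_n > 0:
            # Discount this reward back into the previous n stored transitions.
            for k in range(1, min(self.multi_step_n, len(self.memory)) + 1):
                prev = self.memory[-k]
                self.memory[-k] = prev._replace(
                    r=prev.r + (self.gamma ** k) * transition.r)
        self.memory.append(transition)
        if len(self.memory) > self.capacity:
            self.memory.pop(0)

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)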
def __init__(self, args):
    self.args = args
    super(PseudoCountQLearner, self).__init__(args)
    self.cts_eta = args.cts_eta
    self.cts_beta = args.cts_beta
    self.batch_size = args.batch_update_size
    self.replay_memory = ReplayMemory(args.replay_size,
                                      self.local_network.get_input_shape(),
                                      self.num_actions)
    # Initialize the density model (also chooses how many steps between
    # updates: 20 * the Q target update steps).
    self._init_density_model(args)
    # Builds the loss op.
    self._double_dqn_op()
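# `_init_density_model` comes from DensityModelMixin and is not shown in this
# collection; an earlier variant of this learner constructs the CTS model
# directly (see the CTSDensityModel(...) call further down). A rough sketch of
# what the helper might look like under that assumption -- the update-interval
# formula follows the "20 * the Q target update steps" comment above, and the
# beta choice is an assumption.
def _init_density_model(self, args):
    # Pixel-level CTS density model used for pseudo-count novelty bonuses.
    self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                         width=args.cts_rescale_dim,
                                         num_bins=args.cts_bins,
                                         beta=args.cts_beta)
    # Share the trained model with the other actor-learners roughly every
    # 20 target-network updates (assumed from the comment above).
    self.density_model_update_steps = 20 * self.q_target_update_steps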
def _build_q_ops(self): # pgq specific initialization self.pgq_fraction = self.pgq_fraction self.batch_size = self.batch_update_size self.replay_memory = ReplayMemory(self.replay_size) self.q_tilde = self.batch_network.beta * ( self.batch_network.log_output_layer_pi + tf.expand_dims(self.batch_network.output_layer_entropy, 1)) + self.batch_network.output_layer_v self.Qi, self.Qi_plus_1 = tf.split(axis=0, num_or_size_splits=2, value=self.q_tilde) self.V, _ = tf.split(axis=0, num_or_size_splits=2, value=self.batch_network.output_layer_v) self.log_pi, _ = tf.split( axis=0, num_or_size_splits=2, value=tf.expand_dims(self.batch_network.log_output_selected_action, 1)) self.R = tf.placeholder('float32', [None], name='1-step_reward') self.terminal_indicator = tf.placeholder(tf.float32, [None], name='terminal_indicator') self.max_TQ = self.gamma * tf.reduce_max( self.Qi_plus_1, 1) * (1 - self.terminal_indicator) self.Q_a = tf.reduce_sum( self.Qi * tf.split(axis=0, num_or_size_splits=2, value=self.batch_network.selected_action_ph)[0], 1) self.q_objective = -self.pgq_fraction * tf.reduce_mean( tf.stop_gradient(self.R + self.max_TQ - self.Q_a) * (self.V[:, 0] + self.log_pi[:, 0])) self.V_params = self.batch_network.params self.q_gradients = tf.gradients(self.q_objective, self.V_params) if self.batch_network.clip_norm_type == 'global': self.q_gradients = tf.clip_by_global_norm( self.q_gradients, self.batch_network.clip_norm)[0] elif self.batch_network.clip_norm_type == 'local': self.q_gradients = [ tf.clip_by_norm(g, self.batch_network.clip_norm) for g in self.q_gradients ]
def __init__(self, env, args, device='cpu'):
    """
    Instantiate an NEC Agent
    ----------
    env: gym.Env
        gym environment to train on
    args: args class from argparser
        args are from train.py: see train.py for help with each arg
    device: string
        'cpu' or 'cuda:0' depending on use_cuda flag from train.py
    """
    self.environment_type = args.environment_type
    self.env = env
    self.device = device
    # Hyperparameters
    self.epsilon = args.initial_epsilon
    self.final_epsilon = args.final_epsilon
    self.epsilon_decay = args.epsilon_decay
    self.gamma = args.gamma
    self.N = args.N
    # Transition queue and replay memory
    self.transition_queue = []
    self.replay_every = args.replay_every
    self.replay_buffer_size = args.replay_buffer_size
    self.replay_memory = ReplayMemory(self.replay_buffer_size)
    # CNN for state embedding network
    self.frames_to_stack = args.frames_to_stack
    self.embedding_size = args.embedding_size
    self.in_height = args.in_height
    self.in_width = args.in_width
    self.cnn = CNN(self.frames_to_stack, self.embedding_size,
                   self.in_height, self.in_width).to(self.device)
    # Differentiable Neural Dictionary (DND): one for each action
    self.kernel = inverse_distance
    self.num_neighbors = args.num_neighbors
    self.max_memory = args.max_memory
    self.lr = args.lr
    self.dnd_list = []
    for i in range(env.action_space.n):
        self.dnd_list.append(
            DND(self.kernel, self.num_neighbors, self.max_memory,
                args.optimizer, self.lr))
    # Optimizer for state embedding CNN
    self.q_lr = args.q_lr
    self.batch_size = args.batch_size
    self.optimizer = get_optimizer(args.optimizer, self.cnn.parameters(),
                                   self.lr)
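# All the NEC-style agents in this collection pass an `inverse_distance`
# kernel into their DNDs; the function itself is imported from elsewhere. A
# plausible PyTorch sketch following the kernel used in the Neural Episodic
# Control paper, k(h, h_i) = 1 / (||h - h_i||^2 + delta) -- the exact
# signature and the delta value are assumptions.
import torch


def inverse_distance(h, h_i, delta=1e-3):
    # Similarity between a query embedding h and a stored DND key h_i:
    # larger when the squared Euclidean distance between them is smaller.
    return 1.0 / (torch.dist(h, h_i) ** 2 + delta)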
def __init__(self, environment_name="CartPole-v1", replay_memory_size=10000, action_threshold=0.7, batch_size=64, gamma=0.9): self.environment = gym.make(environment_name) state = self.environment.reset() self.state_shape = state.shape self.action_space = self.environment.action_space.n self.replay_memory = ReplayMemory(self.state_shape, capacity=replay_memory_size) self.model = self.build_network() self.target_model = self.build_network() self.action_threshold = action_threshold self.batch_size = batch_size self.gamma = gamma
def __init__(self, state_size, n_actions, args,
             device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    self.device = device
    # Exploration / Exploitation params.
    self.steps_done = 0
    self.eps_threshold = 1
    self.eps_start = args.eps_start
    self.eps_end = args.eps_end
    self.eps_decay = args.eps_decay
    # RL params
    self.target_update = args.target_update
    self.discount = args.discount
    # Env params
    self.n_actions = n_actions
    self.state_size = state_size
    # Deep q networks params
    self.layers = args.layers
    self.batch_size = args.batch_size
    self.policy_net = DQN(state_size, n_actions,
                          layers=self.layers).to(self.device).float()
    self.target_net = None
    self.grad_clip = args.grad_clip
    # Use elif so that a valid 'adam' choice does not fall through to the
    # NotImplementedError raised for unknown optimizers.
    if str(args.optimizer).lower() == 'adam':
        self.optimizer = optim.Adam(self.policy_net.parameters())
    elif str(args.optimizer).lower() == 'rmsprop':
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
    else:
        raise NotImplementedError
    self.memory = ReplayMemory(args.replay_size)
    # Performance buffers.
    self.rewards_list = []
def __init__(self, environment_name="Acrobot-v1", replay_memory_size=10000, action_threshold=0.7, batch_size=64, gamma=0.9): super(MotionAthlete, self).__init__(environment_name, replay_memory_size, action_threshold, batch_size, gamma) self.environment.close() del self.environment self.environment = EnvironmentWrapper(environment_name) frame = self.environment.reset() frmae_shape = frame.shape self.motion_tracer = MotionTracer(frame_shape=frmae_shape) self.state_shape = self.motion_tracer.state_shape self.replay_memory = ReplayMemory(self.state_shape, capacity=replay_memory_size) del self.model del self.target_model self.model = self.build_network() self.target_model = self.build_network()
class BasePGQLearner(BaseA3CLearner): def __init__(self, args): super(BasePGQLearner, self).__init__(args) # args.entropy_regularisation_strength = 0.0 conf_learning = { 'name': 'local_learning_{}'.format(self.actor_id), 'input_shape': self.input_shape, 'num_act': self.num_actions, 'args': args } self.local_network = PolicyValueNetwork(conf_learning) self.reset_hidden_state() if self.is_master(): var_list = self.local_network.params self.saver = tf.train.Saver(var_list=var_list, max_to_keep=3, keep_checkpoint_every_n_hours=2) # pgq specific initialization self.batch_size = 32 self.pgq_fraction = args.pgq_fraction self.replay_memory = ReplayMemory(args.replay_size) self.q_tilde = self.local_network.beta * ( self.local_network.log_output_layer_pi + tf.expand_dims(self.local_network.output_layer_entropy, 1)) + self.local_network.output_layer_v self.Qi, self.Qi_plus_1 = tf.split(axis=0, num_or_size_splits=2, value=self.q_tilde) self.V, _ = tf.split(axis=0, num_or_size_splits=2, value=self.local_network.output_layer_v) self.log_pi, _ = tf.split( axis=0, num_or_size_splits=2, value=tf.expand_dims(self.local_network.log_output_selected_action, 1)) self.R = tf.placeholder('float32', [None], name='1-step_reward') self.terminal_indicator = tf.placeholder(tf.float32, [None], name='terminal_indicator') self.max_TQ = self.gamma * tf.reduce_max( self.Qi_plus_1, 1) * (1 - self.terminal_indicator) self.Q_a = tf.reduce_sum( self.Qi * tf.split(axis=0, num_or_size_splits=2, value=self.local_network.selected_action_ph)[0], 1) self.q_objective = -self.pgq_fraction * tf.reduce_mean( tf.stop_gradient(self.R + self.max_TQ - self.Q_a) * (self.V[:, 0] + self.log_pi[:, 0])) self.V_params = self.local_network.params self.q_gradients = tf.gradients(self.q_objective, self.V_params) if self.local_network.clip_norm_type == 'global': self.q_gradients = tf.clip_by_global_norm( self.q_gradients, self.local_network.clip_norm)[0] elif self.local_network.clip_norm_type == 'local': self.q_gradients = [ tf.clip_by_norm(g, self.local_network.clip_norm) for g in self.q_gradients ] if (self.optimizer_mode == "local"): if (self.optimizer_type == "rmsprop"): self.batch_opt_st = np.ones(size, dtype=ctypes.c_float) else: self.batch_opt_st = np.zeros(size, dtype=ctypes.c_float) elif (self.optimizer_mode == "shared"): self.batch_opt_st = args.batch_opt_state def apply_batch_q_update(self): s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch( self.batch_size) batch_grads, max_TQ, Q_a = self.session.run( [self.q_gradients, self.max_TQ, self.Q_a], feed_dict={ self.R: r_i, self.local_network.selected_action_ph: np.vstack([a_i, a_i]), self.local_network.input_ph: np.vstack([s_i, s_f]), self.terminal_indicator: is_terminal.astype(np.int), }) # print 'max_TQ={}, Q_a={}'.format(max_TQ[:5], Q_a[:5]) self._apply_gradients_to_shared_memory_vars(batch_grads, opt_st=self.batch_opt_st) def softmax(self, x, temperature): x /= temperature exp_x = np.exp(x - np.max(x)) return exp_x / exp_x.sum()
class BasePGQLearner(BaseA3CLearner): def __init__(self, args): super(BasePGQLearner, self).__init__(args) self.q_update_counter = 0 self.replay_size = args.replay_size self.pgq_fraction = args.pgq_fraction self.batch_update_size = args.batch_update_size scope_name = 'local_learning_{}'.format(self.actor_id) conf_learning = {'name': scope_name, 'input_shape': self.input_shape, 'num_act': self.num_actions, 'args': args} with tf.device('/cpu:0'): self.local_network = PolicyValueNetwork(conf_learning) with tf.device('/gpu:0'), tf.variable_scope('', reuse=True): self.batch_network = PolicyValueNetwork(conf_learning) self._build_q_ops() self.reset_hidden_state() self.replay_memory = ReplayMemory( self.replay_size, self.local_network.get_input_shape(), self.num_actions) if self.is_master(): var_list = self.local_network.params self.saver = tf.train.Saver(var_list=var_list, max_to_keep=3, keep_checkpoint_every_n_hours=2) def _build_q_ops(self): # pgq specific initialization self.pgq_fraction = self.pgq_fraction self.batch_size = self.batch_update_size self.q_tilde = self.batch_network.beta * ( self.batch_network.log_output_layer_pi + tf.expand_dims(self.batch_network.output_layer_entropy, 1) ) + self.batch_network.output_layer_v self.Qi, self.Qi_plus_1 = tf.split(axis=0, num_or_size_splits=2, value=self.q_tilde) self.V, _ = tf.split(axis=0, num_or_size_splits=2, value=self.batch_network.output_layer_v) self.log_pi, _ = tf.split(axis=0, num_or_size_splits=2, value=tf.expand_dims(self.batch_network.log_output_selected_action, 1)) self.R = tf.placeholder('float32', [None], name='1-step_reward') self.terminal_indicator = tf.placeholder(tf.float32, [None], name='terminal_indicator') self.max_TQ = self.gamma*tf.reduce_max(self.Qi_plus_1, 1) * (1 - self.terminal_indicator) self.Q_a = tf.reduce_sum(self.Qi * tf.split(axis=0, num_or_size_splits=2, value=self.batch_network.selected_action_ph)[0], 1) self.q_objective = - self.pgq_fraction * tf.reduce_mean(tf.stop_gradient(self.R + self.max_TQ - self.Q_a) * (0.5 * self.V[:, 0] + self.log_pi[:, 0])) self.V_params = self.batch_network.params self.q_gradients = tf.gradients(self.q_objective, self.V_params) self.q_gradients = self.batch_network._clip_grads(self.q_gradients) def batch_q_update(self): if len(self.replay_memory) < self.replay_memory.maxlen//10: return s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(self.batch_size) batch_grads = self.session.run( self.q_gradients, feed_dict={ self.R: r_i, self.batch_network.selected_action_ph: np.vstack([a_i, a_i]), self.batch_network.input_ph: np.vstack([s_i, s_f]), self.terminal_indicator: is_terminal.astype(np.int), } ) self.apply_gradients_to_shared_memory_vars(batch_grads)
class PseudoCountQLearner(ValueBasedLearner, DensityModelMixin): """ Based on DQN+CTS model from the paper 'Unifying Count-Based Exploration and Intrinsic Motivation' (https://arxiv.org/abs/1606.01868) Presently the implementation differs from the paper in that the novelty bonuses are computed online rather than by computing the prediction gains after the model has been updated with all frames from the episode. Async training with different final epsilon values tends to produce better results than just using a single actor-learner. """ def __init__(self, args): self.args = args super(PseudoCountQLearner, self).__init__(args) self.cts_eta = args.cts_eta self.cts_beta = args.cts_beta self.batch_size = args.batch_update_size self.replay_memory = ReplayMemory(args.replay_size, self.local_network.get_input_shape(), self.num_actions) self._init_density_model(args) self._double_dqn_op() def generate_final_epsilon(self): if self.num_actor_learners == 1: return self.args.final_epsilon else: return super(PseudoCountQLearner, self).generate_final_epsilon() def _get_summary_vars(self): q_vars = super(PseudoCountQLearner, self)._get_summary_vars() bonus_q05 = tf.Variable(0., name='novelty_bonus_q05') s1 = tf.summary.scalar('Novelty_Bonus_q05_{}'.format(self.actor_id), bonus_q05) bonus_q50 = tf.Variable(0., name='novelty_bonus_q50') s2 = tf.summary.scalar('Novelty_Bonus_q50_{}'.format(self.actor_id), bonus_q50) bonus_q95 = tf.Variable(0., name='novelty_bonus_q95') s3 = tf.summary.scalar('Novelty_Bonus_q95_{}'.format(self.actor_id), bonus_q95) augmented_reward = tf.Variable(0., name='augmented_episode_reward') s4 = tf.summary.scalar( 'Augmented_Episode_Reward_{}'.format(self.actor_id), augmented_reward) return q_vars + [bonus_q05, bonus_q50, bonus_q95, augmented_reward] #TODO: refactor to make this cleaner def prepare_state(self, state, total_episode_reward, steps_at_last_reward, ep_t, episode_ave_max_q, episode_over, bonuses, total_augmented_reward): # Start a new game on reaching terminal state if episode_over: T = self.global_step.value() * self.max_local_steps t = self.local_step e_prog = float(t) / self.epsilon_annealing_steps episode_ave_max_q = episode_ave_max_q / float(ep_t) s1 = "Q_MAX {0:.4f}".format(episode_ave_max_q) s2 = "EPS {0:.4f}".format(self.epsilon) self.scores.insert(0, total_episode_reward) if len(self.scores) > 100: self.scores.pop() logger.info('T{0} / STEP {1} / REWARD {2} / {3} / {4}'.format( self.actor_id, T, total_episode_reward, s1, s2)) logger.info( 'ID: {0} -- RUNNING AVG: {1:.0f} ± {2:.0f} -- BEST: {3:.0f}'. 
format( self.actor_id, np.array(self.scores).mean(), 2 * np.array(self.scores).std(), max(self.scores), )) self.log_summary( total_episode_reward, episode_ave_max_q, self.epsilon, np.percentile(bonuses, 5), np.percentile(bonuses, 50), np.percentile(bonuses, 95), total_augmented_reward, ) state = self.emulator.get_initial_state() ep_t = 0 total_episode_reward = 0 episode_ave_max_q = 0 episode_over = False return (state, total_episode_reward, steps_at_last_reward, ep_t, episode_ave_max_q, episode_over) def _double_dqn_op(self): q_local_action = tf.cast( tf.argmax(self.local_network.output_layer, axis=1), tf.int32) q_target_max = utils.ops.slice_2d( self.target_network.output_layer, tf.range(0, self.batch_size), q_local_action, ) self.one_step_reward = tf.placeholder(tf.float32, self.batch_size, name='one_step_reward') self.is_terminal = tf.placeholder(tf.bool, self.batch_size, name='is_terminal') self.y_target = self.one_step_reward + self.cts_eta*self.gamma*q_target_max \ * (1 - tf.cast(self.is_terminal, tf.float32)) self.double_dqn_loss = self.local_network._value_function_loss( self.local_network.q_selected_action - tf.stop_gradient(self.y_target)) self.double_dqn_grads = tf.gradients(self.double_dqn_loss, self.local_network.params) # def batch_update(self): # if len(self.replay_memory) < self.replay_memory.maxlen//10: # return # s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(self.batch_size) # feed_dict={ # self.one_step_reward: r_i, # self.target_network.input_ph: s_f, # self.local_network.input_ph: np.vstack([s_i, s_f]), # self.local_network.selected_action_ph: np.vstack([a_i, a_i]), # self.is_terminal: is_terminal # } # grads = self.session.run(self.double_dqn_grads, feed_dict=feed_dict) # self.apply_gradients_to_shared_memory_vars(grads) def batch_update(self): if len(self.replay_memory) < self.replay_memory.maxlen // 10: return s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch( self.batch_size) feed_dict = { self.local_network.input_ph: s_f, self.target_network.input_ph: s_f, self.is_terminal: is_terminal, self.one_step_reward: r_i, } y_target = self.session.run(self.y_target, feed_dict=feed_dict) feed_dict = { self.local_network.input_ph: s_i, self.local_network.target_ph: y_target, self.local_network.selected_action_ph: a_i } grads = self.session.run(self.local_network.get_gradients, feed_dict=feed_dict) self.apply_gradients_to_shared_memory_vars(grads) def train(self): """ Main actor learner loop for n-step Q learning. 
""" logger.debug("Actor {} resuming at Step {}, {}".format( self.actor_id, self.global_step.value(), time.ctime())) s = self.emulator.get_initial_state() s_batch = list() a_batch = list() y_batch = list() bonuses = deque(maxlen=1000) episode_over = False t0 = time.time() global_steps_at_last_record = self.global_step.value() while (self.global_step.value() < self.max_global_steps): # # Sync local learning net with shared mem # self.sync_net_with_shared_memory(self.local_network, self.learning_vars) # self.save_vars() rewards = list() states = list() actions = list() max_q_values = list() local_step_start = self.local_step total_episode_reward = 0 total_augmented_reward = 0 episode_ave_max_q = 0 ep_t = 0 while not episode_over: # Sync local learning net with shared mem self.sync_net_with_shared_memory(self.local_network, self.learning_vars) self.save_vars() # Choose next action and execute it a, q_values = self.choose_next_action(s) new_s, reward, episode_over = self.emulator.next(a) total_episode_reward += reward max_q = np.max(q_values) current_frame = new_s[..., -1] bonus = self.density_model.update(current_frame) bonuses.append(bonus) # Rescale or clip immediate reward reward = self.rescale_reward( self.rescale_reward(reward) + bonus) total_augmented_reward += reward ep_t += 1 rewards.append(reward) states.append(s) actions.append(a) max_q_values.append(max_q) s = new_s self.local_step += 1 episode_ave_max_q += max_q global_step, _ = self.global_step.increment() if global_step % self.q_target_update_steps == 0: self.update_target() if global_step % self.density_model_update_steps == 0: self.write_density_model() # Sync local tensorflow target network params with shared target network params if self.target_update_flags.updated[self.actor_id] == 1: self.sync_net_with_shared_memory(self.target_network, self.target_vars) self.target_update_flags.updated[self.actor_id] = 0 if self.density_model_update_flags.updated[self.actor_id] == 1: self.read_density_model() self.density_model_update_flags.updated[self.actor_id] = 0 if self.local_step % self.q_update_interval == 0: self.batch_update() if self.is_master() and (self.local_step % 500 == 0): bonus_array = np.array(bonuses) steps = global_step - global_steps_at_last_record global_steps_at_last_record = global_step logger.debug( 'Mean Bonus={:.4f} / Max Bonus={:.4f} / STEPS/s={}'. format(bonus_array.mean(), bonus_array.max(), steps / float(time.time() - t0))) t0 = time.time() else: #compute monte carlo return mc_returns = np.zeros((len(rewards), ), dtype=np.float32) running_total = 0.0 for i, r in enumerate(reversed(rewards)): running_total = r + self.gamma * running_total mc_returns[len(rewards) - i - 1] = running_total mixed_returns = self.cts_eta * np.asarray(rewards) + ( 1 - self.cts_eta) * mc_returns #update replay memory states.append(new_s) episode_length = len(rewards) for i in range(episode_length): self.replay_memory.append(states[i], actions[i], mixed_returns[i], i + 1 == episode_length) s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \ self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over, bonuses, total_augmented_reward)
torch.cuda.manual_seed(args.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Define and build DDPG agent
hidden_size = tuple(args.hidden_size)
agent = DDPG(args.gamma,
             args.tau,
             hidden_size,
             env.observation_space.shape[0],
             env.action_space,
             checkpoint_dir=checkpoint_dir)

# Initialize replay memory
memory = ReplayMemory(int(args.replay_size))

# Initialize OU-Noise
nb_actions = env.action_space.shape[-1]
ou_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                        sigma=float(args.noise_stddev) * np.ones(nb_actions))

# Define counters and other variables
start_step = 0
# timestep = start_step
if args.load_model:
    # Load agent if necessary
    start_step, memory = agent.load_checkpoint()
timestep = start_step // 10000 + 1
rewards, policy_losses, value_losses, mean_test_rewards = [], [], [], []
epoch = 0
class hDQN(): """ The Hierarchical-DQN Agent Parameters ---------- optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer num_goal: int The number of goal that agent can choose from num_action: int The number of action that agent can choose from replay_memory_size: int How many memories to store in the replay memory. batch_size: int How many transitions to sample each time experience is replayed. """ def __init__(self, optimizer_spec, num_goal=6, num_action=2, replay_memory_size=10000, batch_size=128): ############### # BUILD MODEL # ############### self.num_goal = num_goal self.num_action = num_action self.batch_size = batch_size # Construct meta-controller and controller self.meta_controller = MetaController().type(dtype) self.target_meta_controller = MetaController().type(dtype) self.controller = Controller().type(dtype) self.target_controller = Controller().type(dtype) # Construct the optimizers for meta-controller and controller self.meta_optimizer = optimizer_spec.constructor( self.meta_controller.parameters(), **optimizer_spec.kwargs) self.ctrl_optimizer = optimizer_spec.constructor( self.controller.parameters(), **optimizer_spec.kwargs) # Construct the replay memory for meta-controller and controller self.meta_replay_memory = ReplayMemory(replay_memory_size) self.ctrl_replay_memory = ReplayMemory(replay_memory_size) def get_intrinsic_reward(self, goal, state): return 1.0 if goal == state else 0.0 def select_goal(self, state, epilson): sample = random.random() if sample > epilson: state = torch.from_numpy(state).type(dtype) # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history with torch.no_grad(): return self.meta_controller(Variable( state, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([random.randrange(self.num_goal)]) def select_action(self, joint_state_goal, epilson): sample = random.random() if sample > epilson: joint_state_goal = torch.from_numpy(joint_state_goal).type(dtype) # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history with torch.no_grad(): return self.controller( Variable(joint_state_goal, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([random.randrange(self.num_action)]) def update_meta_controller(self, gamma=1.0): if len(self.meta_replay_memory) < self.batch_size: return state_batch, goal_batch, next_state_batch, ex_reward_batch, done_mask = \ self.meta_replay_memory.sample(self.batch_size) state_batch = Variable(torch.from_numpy(state_batch).type(dtype)) goal_batch = Variable(torch.from_numpy(goal_batch).long()) next_state_batch = Variable( torch.from_numpy(next_state_batch).type(dtype)) ex_reward_batch = Variable( torch.from_numpy(ex_reward_batch).type(dtype)) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: goal_batch = goal_batch.cuda() # Compute current Q value, meta_controller takes only state and output value for every state-goal pair # We choose Q based on goal chosen. 
current_Q_values = self.meta_controller(state_batch).gather( 1, goal_batch.unsqueeze(1)) # Compute next Q value based on which goal gives max Q values # Detach variable from the current graph since we don't want gradients for next Q to propagated next_max_q = self.target_meta_controller( next_state_batch).detach().max(1)[0] next_Q_values = not_done_mask * next_max_q # Compute the target of the current Q values target_Q_values = ex_reward_batch + (gamma * next_Q_values) # Compute Bellman error (using Huber loss) loss = F.smooth_l1_loss(current_Q_values.view(-1), target_Q_values) # Copy Q to target Q before updating parameters of Q self.target_meta_controller.load_state_dict( self.meta_controller.state_dict()) # Optimize the model self.meta_optimizer.zero_grad() loss.backward() for param in self.meta_controller.parameters(): param.grad.data.clamp_(-1, 1) self.meta_optimizer.step() def update_controller(self, gamma=1.0): if len(self.ctrl_replay_memory) < self.batch_size: return state_goal_batch, action_batch, next_state_goal_batch, in_reward_batch, done_mask = \ self.ctrl_replay_memory.sample(self.batch_size) state_goal_batch = Variable( torch.from_numpy(state_goal_batch).type(dtype)) action_batch = Variable(torch.from_numpy(action_batch).long()) next_state_goal_batch = Variable( torch.from_numpy(next_state_goal_batch).type(dtype)) in_reward_batch = Variable( torch.from_numpy(in_reward_batch).type(dtype)) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: action_batch = action_batch.cuda() # Compute current Q value, controller takes only (state, goal) and output value for every (state, goal)-action pair # We choose Q based on action taken. current_Q_values = self.controller(state_goal_batch).gather( 1, action_batch.unsqueeze(1)) # Compute next Q value based on which goal gives max Q values # Detach variable from the current graph since we don't want gradients for next Q to propagated next_max_q = self.target_controller( next_state_goal_batch).detach().max(1)[0] next_Q_values = not_done_mask * next_max_q # Compute the target of the current Q values target_Q_values = in_reward_batch + (gamma * next_Q_values) # Compute Bellman error (using Huber loss) loss = F.smooth_l1_loss(current_Q_values.view(-1), target_Q_values) # Copy Q to target Q before updating parameters of Q self.target_controller.load_state_dict(self.controller.state_dict()) # Optimize the model self.ctrl_optimizer.zero_grad() loss.backward() for param in self.controller.parameters(): param.grad.data.clamp_(-1, 1) self.ctrl_optimizer.step()
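# The hDQN class above exposes goal selection, action selection, and separate
# update paths for the meta-controller and the controller. A schematic outer
# loop that could drive it is sketched below; `env`, `agent`, and the two
# epsilon values are assumed to exist, the goal is appended to the state as a
# plain scalar, and the `push(...)` signatures simply mirror the tuples that
# `sample(...)` unpacks above -- all of these are assumptions.
import numpy as np

state = env.reset()
done = False
while not done:
    goal = agent.select_goal(state, meta_epsilon).item()
    goal_start_state, extrinsic_reward = state, 0.0
    reached_goal = False
    while not (done or reached_goal):
        state_goal = np.concatenate([state, [goal]]).astype(np.float32)
        action = agent.select_action(state_goal, epsilon).item()
        next_state, reward, done, _ = env.step(action)
        intrinsic = agent.get_intrinsic_reward(goal, next_state)
        reached_goal = intrinsic > 0
        next_state_goal = np.concatenate([next_state, [goal]]).astype(np.float32)
        agent.ctrl_replay_memory.push(state_goal, action, next_state_goal,
                                      intrinsic, done)
        agent.update_controller()
        agent.update_meta_controller()
        extrinsic_reward += reward
        state = next_state
    agent.meta_replay_memory.push(goal_start_state, goal, state,
                                  extrinsic_reward, done)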
def __init__(self, env, embedding_network, replay_memory=ReplayMemory(100000), initial_epsilon=1.0, final_epsilon=0.01, epsilon_decay=0.99, batch_size=8, sgd_lr=1e-6, q_lr=0.01, gamma=0.99, lookahead_horizon=100, update_period=4, kernel=inverse_distance, num_neighbors=50, max_memory=500000): ''' Instantiate an NEC Agent Parameters ---------- env: gym.Env gym environment to train on embedding_network: torch.nn.Module Model to extract the embedding from a state replay_memory: ReplayMemory Replay memory to sample from for embedding network updates initial_epsilon: float Initial epsilon for epsilon greedy search epsilon_decay: float Exponential decay factor for epsilon batch_size: int Batch size to sample from the replay memory sgd_lr: float Learning rate to use for RMSProp updates to the embedding network and DND q_lr: float Learning rate to use for Q-updates on DND updates gamma: float Discount factor lookahead_horizon: int Lookahead horizon to use for N-step Q-value estimates update_period: int Inverse of rate at which embedding network gets updated i.e. if 1 then update after every timestep, if 16 then update every 16 timesteps, etc. kernel: (torch.autograd.Variable, torch.autograd.Variable) => (torch.autograd.Variable) Kernel function to use for DND lookups num_neighbors: int Number of neighbors to return in K-NN lookups in DND max_memory: int Maximum number of key-value pairs to store in each DND ''' self.env = env self.embedding_network = embedding_network self.replay_memory = replay_memory self.epsilon = initial_epsilon self.final_epsilon = final_epsilon self.epsilon_decay = epsilon_decay self.batch_size = batch_size self.q_lr = q_lr self.gamma = gamma self.lookahead_horizon = lookahead_horizon self.update_period = update_period self.transition_queue = [] self.optimizer = optim.RMSprop( self.embedding_network.parameters(), lr=sgd_lr) self.dnd_list = [DND(kernel, num_neighbors, max_memory, sgd_lr) for _ in range(env.action_space_n)]
class Athlete(object):
    def __init__(self, environment_name="CartPole-v1", replay_memory_size=10000,
                 action_threshold=0.7, batch_size=64, gamma=0.9):
        self.environment = gym.make(environment_name)
        state = self.environment.reset()
        self.state_shape = state.shape
        self.action_space = self.environment.action_space.n
        self.replay_memory = ReplayMemory(self.state_shape,
                                          capacity=replay_memory_size)
        self.model = self.build_network()
        self.target_model = self.build_network()
        self.action_threshold = action_threshold
        self.batch_size = batch_size
        self.gamma = gamma

    def build_network(self) -> tf.keras.Model:
        raise NotImplementedError

    def choose_action(self, state: np.ndarray, threshold: float):
        if random.random() > threshold:
            # Pick an action at random
            action = random.randint(0, self.action_space - 1)
        else:
            # Pick the action suggested by the model
            results = self.model.predict(state.reshape([1] + list(state.shape)))
            action = np.argmax(results, 1)[0]
        return action

    def simulate(self, action_threshold: float):
        state = self.environment.reset()
        while not self.replay_memory.is_full:
            action = self.choose_action(state, action_threshold)
            state_after, reward, done, _ = self.environment.step(action)
            self.replay_memory.add(state, action, reward, done, state_after)
            state = state_after
            if done:
                state = self.environment.reset()
        return True

    def train(self, epoch=100, model_prefix="saved_models/model"):
        model_prefix = model_prefix + ".epoch_{}.score_{}.h5"
        self.model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.01),
                           loss=tf.losses.mean_squared_error)
        for i in range(epoch):
            print("Epoch {} running:...".format(i))
            self.target_model.set_weights(self.model.get_weights())
            self.replay_memory.reset()
            self.simulate(self.action_threshold)
            self.replay_memory.compute_estimated_q(self.target_model, self.gamma)
            num_batches = self.replay_memory.length / self.batch_size
            for j in range(int(num_batches)):
                states, actions, rewards, dones, next_states, estimated_q = \
                    self.replay_memory.random_batch(self.batch_size)
                self.model.fit(states, estimated_q, epochs=1, verbose=0)
            if i % 5 == 0:
                score = self.estimate_model(self.model, render=False)
                model_path = model_prefix.format(i, score)
                print("Saving model: {} ...".format(model_path))
                self.model.save(model_prefix.format(i, score))

    def estimate_model(self, model=None, model_path="", render=True):
        if not model:
            model: tf.keras.Model = tf.keras.models.load_model(model_path)
        state = self.environment.reset()
        reward_count = 0
        while True:
            action = model.predict(state.reshape([1] + list(self.state_shape)))
            print(state)
            print(action)
            action = np.argmax(action, 1)[0]
            print(action)
            if render:
                time.sleep(0.05)
                self.environment.render()
            state_after, reward, done, _ = self.environment.step(action)
            reward_count += reward
            if done:
                break
            state = state_after
        print("Steps taken: ", reward_count)
        return reward_count

    def score_model(self, model=None, model_path="", num_iteration=10):
        if not model:
            model: tf.keras.Model = tf.keras.models.load_model(model_path)
        scores = []
        for i in range(num_iteration):
            score = self.estimate_model(model)
            scores.append(score)
        avg_score = sum(scores) / num_iteration
        return avg_score
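# `build_network` is left abstract in the Athlete base class; the training
# loop only needs a Keras model with one Q-value output per action, trained
# with mean-squared error against `estimated_q`. A minimal sketch of a
# concrete subclass under those assumptions (the layer sizes are arbitrary):
class CartPoleAthlete(Athlete):
    """Illustrative subclass: a small fully connected Q-network for CartPole."""

    def build_network(self) -> tf.keras.Model:
        return tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation="relu",
                                  input_shape=self.state_shape),
            tf.keras.layers.Dense(64, activation="relu"),
            # One output per discrete action, interpreted as Q-values.
            tf.keras.layers.Dense(self.action_space, activation="linear"),
        ])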
class DQNAgent: def __init__(self, config): self.config = config self.logger = logging.getLogger("DQNAgent") # define models (policy and target) self.policy_model = DQN(self.config) self.target_model = DQN(self.config) # define memory self.memory = ReplayMemory(self.config) # define loss self.loss = HuberLoss() # define optimizer self.optim = torch.optim.RMSprop(self.policy_model.parameters()) # define environment self.env = gym.make('CartPole-v0').unwrapped self.cartpole = CartPoleEnv(self.config.screen_width) # initialize counter self.current_episode = 0 self.current_iteration = 0 self.episode_durations = [] self.batch_size = self.config.batch_size # set cuda flag self.is_cuda = torch.cuda.is_available() if self.is_cuda and not self.config.cuda: self.logger.info( "WARNING: You have a CUDA device, so you should probably enable CUDA" ) self.cuda = self.is_cuda & self.config.cuda if self.cuda: self.logger.info("Program will run on *****GPU-CUDA***** ") print_cuda_statistics() self.device = torch.device("cuda") torch.cuda.set_device(self.config.gpu_device) else: self.logger.info("Program will run on *****CPU***** ") self.device = torch.device("cpu") self.policy_model = self.policy_model.to(self.device) self.target_model = self.target_model.to(self.device) self.loss = self.loss.to(self.device) # Initialize Target model with policy model state dict self.target_model.load_state_dict(self.policy_model.state_dict()) self.target_model.eval() # Summary Writer self.summary_writer = SummaryWriter(log_dir=self.config.summary_dir, comment='DQN') def load_checkpoint(self, file_name): filename = self.config.checkpoint_dir + file_name try: self.logger.info("Loading checkpoint '{}'".format(filename)) checkpoint = torch.load(filename) self.current_episode = checkpoint['episode'] self.current_iteration = checkpoint['iteration'] self.policy_model.load_state_dict(checkpoint['state_dict']) self.optim.load_state_dict(checkpoint['optimizer']) self.logger.info( "Checkpoint loaded successfully from '{}' at (epoch {}) at (iteration {})\n" .format(self.config.checkpoint_dir, checkpoint['episode'], checkpoint['iteration'])) except OSError as e: self.logger.info( "No checkpoint exists from '{}'. Skipping...".format( self.config.checkpoint_dir)) self.logger.info("**First time to train**") def save_checkpoint(self, file_name="checkpoint.pth.tar", is_best=0): state = { 'episode': self.current_episode, 'iteration': self.current_iteration, 'state_dict': self.policy_model.state_dict(), 'optimizer': self.optim.state_dict(), } # Save the state torch.save(state, self.config.checkpoint_dir + file_name) # If it is the best copy it to another file 'model_best.pth.tar' if is_best: shutil.copyfile(self.config.checkpoint_dir + file_name, self.config.checkpoint_dir + 'model_best.pth.tar') def run(self): """ This function will the operator :return: """ try: self.train() except KeyboardInterrupt: self.logger.info("You have entered CTRL+C.. Wait to finalize") def select_action(self, state): """ The action selection function, it either uses the model to choose an action or samples one uniformly. :param state: current state of the model :return: """ if self.cuda: state = state.cuda() sample = random.random() eps_threshold = self.config.eps_start + ( self.config.eps_start - self.config.eps_end) * math.exp( -1. 
* self.current_iteration / self.config.eps_decay) self.current_iteration += 1 if sample > eps_threshold: with torch.no_grad(): return self.policy_model(state).max(1)[1].view(1, 1) # size (1,1) else: return torch.tensor([[random.randrange(2)]], device=self.device, dtype=torch.long) def optimize_policy_model(self): """ performs a single step of optimization for the policy model :return: """ if self.memory.length() < self.batch_size: return # sample a batch transitions = self.memory.sample_batch(self.batch_size) one_batch = Transition(*zip(*transitions)) # create a mask of non-final states non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, one_batch.next_state)), device=self.device, dtype=torch.uint8) # [128] non_final_next_states = torch.cat([ s for s in one_batch.next_state if s is not None ]) # [< 128, 3, 40, 80] # concatenate all batch elements into one state_batch = torch.cat(one_batch.state) # [128, 3, 40, 80] action_batch = torch.cat(one_batch.action) # [128, 1] reward_batch = torch.cat(one_batch.reward) # [128] state_batch = state_batch.to(self.device) non_final_next_states = non_final_next_states.to(self.device) curr_state_values = self.policy_model(state_batch) # [128, 2] curr_state_action_values = curr_state_values.gather( 1, action_batch) # [128, 1] # Get V(s_{t+1}) for all next states. By definition we set V(s)=0 if s is a terminal state. next_state_values = torch.zeros(self.batch_size, device=self.device) # [128] next_state_values[non_final_mask] = self.target_model( non_final_next_states).max(1)[0].detach() # [< 128] # Get the expected Q values expected_state_action_values = ( next_state_values * self.config.gamma) + reward_batch # [128] # compute loss: temporal difference error loss = self.loss(curr_state_action_values, expected_state_action_values.unsqueeze(1)) # optimizer step self.optim.zero_grad() loss.backward() for param in self.policy_model.parameters(): param.grad.data.clamp_(-1, 1) self.optim.step() return loss def train(self): """ Training loop based on the number of episodes :return: """ for episode in tqdm( range(self.current_episode, self.config.num_episodes)): self.current_episode = episode # reset environment self.env.reset() self.train_one_epoch() # The target network has its weights kept frozen most of the time if self.current_episode % self.config.target_update == 0: self.target_model.load_state_dict( self.policy_model.state_dict()) self.env.render() self.env.close() def train_one_epoch(self): """ One episode of training; it samples an action, observe next screen and optimize the model once :return: """ episode_duration = 0 prev_frame = self.cartpole.get_screen(self.env) curr_frame = self.cartpole.get_screen(self.env) # get state curr_state = curr_frame - prev_frame while (1): episode_duration += 1 # select action action = self.select_action(curr_state) # perform action and get reward _, reward, done, _ = self.env.step(action.item()) if self.cuda: reward = torch.Tensor([reward]).to(self.device) else: reward = torch.Tensor([reward]).to(self.device) prev_frame = curr_frame curr_frame = self.cartpole.get_screen(self.env) # assign next state if done: next_state = None else: next_state = curr_frame - prev_frame # add this transition into memory self.memory.push_transition(curr_state, action, next_state, reward) curr_state = next_state # Policy model optimization step curr_loss = self.optimize_policy_model() if curr_loss is not None: if self.cuda: curr_loss = curr_loss.cpu() self.summary_writer.add_scalar("Temporal Difference Loss", 
curr_loss.detach().numpy(), self.current_iteration) # check if done if done: break self.summary_writer.add_scalar("Training Episode Duration", episode_duration, self.current_episode) def validate(self): pass def finalize(self): """ Finalize all the operations of the 2 Main classes of the process the operator and the data loader :return: """ self.logger.info( "Please wait while finalizing the operation.. Thank you") self.save_checkpoint() self.summary_writer.export_scalars_to_json("{}all_scalars.json".format( self.config.summary_dir)) self.summary_writer.close()
class MotionAthlete(Athlete):
    def __init__(self, environment_name="Acrobot-v1", replay_memory_size=10000,
                 action_threshold=0.7, batch_size=64, gamma=0.9):
        super(MotionAthlete, self).__init__(environment_name,
                                            replay_memory_size,
                                            action_threshold, batch_size,
                                            gamma)
        self.environment.close()
        del self.environment
        self.environment = EnvironmentWrapper(environment_name)
        frame = self.environment.reset()
        frame_shape = frame.shape
        self.motion_tracer = MotionTracer(frame_shape=frame_shape)
        self.state_shape = self.motion_tracer.state_shape
        self.replay_memory = ReplayMemory(self.state_shape,
                                          capacity=replay_memory_size)
        del self.model
        del self.target_model
        self.model = self.build_network()
        self.target_model = self.build_network()

    def simulate(self, action_threshold: float):
        print("Simulating...")
        frame = self.environment.reset()
        self.motion_tracer.reset()
        self.motion_tracer.add_frame(frame)
        while not self.replay_memory.is_full:
            state = self.motion_tracer.get_state()
            action = self.choose_action(state, action_threshold)
            frame_after, reward, done, _ = self.environment.step(action)
            self.motion_tracer.add_frame(frame_after)
            state_next = self.motion_tracer.get_state()
            self.replay_memory.add(state, action, reward, done, state_next)
            if done:
                frame = self.environment.reset()
                self.motion_tracer.reset()
                self.motion_tracer.add_frame(frame)
        print("Simulation finished")
        return True

    def estimate_model(self, model=None, model_path="", render=True):
        if not model:
            model: tf.keras.Model = tf.keras.models.load_model(model_path)
        frame = self.environment.reset()
        self.motion_tracer.reset()
        self.motion_tracer.add_frame(frame)
        state = self.motion_tracer.get_state()
        reward_count = 0
        step_count = 0
        while True:
            step_count += 1
            action = model.predict(state.reshape([1] + list(self.state_shape)))
            print(frame)
            print(action)
            action = np.argmax(action, 1)[0]
            print(action)
            if render:
                time.sleep(0.05)
                self.environment.render()
            frame_after, reward, done, _ = self.environment.step(action)
            reward_count += reward
            if done:
                break
            self.motion_tracer.add_frame(frame_after)
            state = self.motion_tracer.get_state()
        print("Total reward: ", reward_count)
        print("Total step: ", step_count)
        return reward_count
class PseudoCountQLearner(ValueBasedLearner): def __init__(self, args): super(PseudoCountQLearner, self).__init__(args) self.cts_eta = .9 self.batch_size = 32 self.replay_memory = ReplayMemory(args.replay_size) #more cython tuning could useful here self.density_model = CTSDensityModel(height=args.cts_rescale_dim, width=args.cts_rescale_dim, num_bins=args.cts_bins, beta=0.05) def generate_final_epsilon(self): return 0.1 def _get_summary_vars(self): q_vars = super(PseudoCountQLearner, self)._get_summary_vars() bonus_q25 = tf.Variable(0., name='novelty_bonus_q25') s1 = tf.summary.scalar('Novelty_Bonus_q25_{}'.format(self.actor_id), bonus_q25) bonus_q50 = tf.Variable(0., name='novelty_bonus_q50') s2 = tf.summary.scalar('Novelty_Bonus_q50_{}'.format(self.actor_id), bonus_q50) bonus_q75 = tf.Variable(0., name='novelty_bonus_q75') s3 = tf.summary.scalar('Novelty_Bonus_q75_{}'.format(self.actor_id), bonus_q75) return q_vars + [bonus_q25, bonus_q50, bonus_q75] def prepare_state(self, state, total_episode_reward, steps_at_last_reward, ep_t, episode_ave_max_q, episode_over, bonuses): # prevent the agent from getting stuck reset_game = False if (self.local_step - steps_at_last_reward > 5000 or (self.emulator.get_lives() == 0 and self.emulator.game not in ONE_LIFE_GAMES)): steps_at_last_reward = self.local_step episode_over = True reset_game = True # Start a new game on reaching terminal state if episode_over: T = self.global_step.value() t = self.local_step e_prog = float(t) / self.epsilon_annealing_steps episode_ave_max_q = episode_ave_max_q / float(ep_t) s1 = "Q_MAX {0:.4f}".format(episode_ave_max_q) s2 = "EPS {0:.4f}".format(self.epsilon) self.scores.insert(0, total_episode_reward) if len(self.scores) > 100: self.scores.pop() logger.info('T{0} / STEP {1} / REWARD {2} / {3} / {4}'.format( self.actor_id, T, total_episode_reward, s1, s2)) logger.info( 'ID: {0} -- RUNNING AVG: {1:.0f} ± {2:.0f} -- BEST: {3:.0f}'. format( self.actor_id, np.array(self.scores).mean(), 2 * np.array(self.scores).std(), max(self.scores), )) if self.is_master() and self.is_train: stats = [ total_episode_reward, episode_ave_max_q, self.epsilon, np.percentile(bonuses, 25), np.percentile(bonuses, 50), np.percentile(bonuses, 75), ] feed_dict = { self.summary_ph[i]: stats[i] for i in range(len(stats)) } res = self.session.run(self.update_ops + [self.summary_op], feed_dict=feed_dict) self.summary_writer.add_summary(res[-1], self.global_step.value()) if reset_game or self.emulator.game in ONE_LIFE_GAMES: state = self.emulator.get_initial_state() ep_t = 0 total_episode_reward = 0 episode_ave_max_q = 0 episode_over = False return state, total_episode_reward, steps_at_last_reward, ep_t, episode_ave_max_q, episode_over def batch_update(self): if len(self.replay_memory) < self.batch_size: return s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch( self.batch_size) q_target_values = self.session.run( self.target_network.output_layer, feed_dict={self.target_network.input_ph: s_f}) y_target = r_i + self.cts_eta * self.gamma * q_target_values.max( axis=1) * (1 - is_terminal.astype(np.int)) feed_dict = { self.local_network.input_ph: s_i, self.local_network.target_ph: y_target, self.local_network.selected_action_ph: a_i } grads = self.session.run(self.local_network.get_gradients, feed_dict=feed_dict) self.apply_gradients_to_shared_memory_vars(grads) def _run(self): """ Main actor learner loop for n-step Q learning. 
""" if not self.is_train: return self.test() logger.debug("Actor {} resuming at Step {}, {}".format( self.actor_id, self.global_step.value(), time.ctime())) s = self.emulator.get_initial_state() s_batch = [] a_batch = [] y_batch = [] bonuses = deque(maxlen=100) exec_update_target = False total_episode_reward = 0 episode_ave_max_q = 0 episode_over = False qmax_down = 0 qmax_up = 0 prev_qmax = -10 * 6 low_qmax = 0 ep_t = 0 t0 = time.time() while (self.global_step.value() < self.max_global_steps): # Sync local learning net with shared mem self.sync_net_with_shared_memory(self.local_network, self.learning_vars) self.save_vars() rewards = [] states = [] actions = [] local_step_start = self.local_step while not episode_over: # Choose next action and execute it a, readout_t = self.choose_next_action(s) new_s, reward, episode_over = self.emulator.next(a) total_episode_reward += reward current_frame = new_s[..., -1] bonus = self.density_model.update(current_frame) bonuses.append(bonus) if self.is_master() and (self.local_step % 200 == 0): bonus_array = np.array(bonuses) logger.debug( 'Mean Bonus={:.4f} / Max Bonus={:.4f} / STEPS/s={}'. format(bonus_array.mean(), bonus_array.max(), 100. / (time.time() - t0))) t0 = time.time() # Rescale or clip immediate reward reward = self.rescale_reward( self.rescale_reward(reward) + bonus) ep_t += 1 rewards.append(reward) states.append(s) actions.append(a) s = new_s self.local_step += 1 episode_ave_max_q += np.max(readout_t) global_step, update_target = self.global_step.increment( self.q_target_update_steps) if update_target: update_target = False exec_update_target = True if self.local_step % 4 == 0: self.batch_update() self.local_network.global_step = global_step else: mc_returns = list() running_total = 0.0 for r in reversed(rewards): running_total = r + self.gamma * running_total mc_returns.insert(0, running_total) mixed_returns = self.cts_eta * np.array(rewards) + ( 1 - self.cts_eta) * np.array(mc_returns) states.append(new_s) episode_length = len(rewards) for i in range(episode_length): self.replay_memory.append( (states[i], actions[i], mixed_returns[i], states[i + 1], i + 1 == episode_length)) if exec_update_target: self.update_target() exec_update_target = False # Sync local tensorflow target network params with shared target network params if self.target_update_flags.updated[self.actor_id] == 1: self.sync_net_with_shared_memory(self.target_network, self.target_vars) self.target_update_flags.updated[self.actor_id] = 0 s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \ self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over, bonuses)
    def __init__(self,
                 env,
                 embedding_network,
                 replay_memory=ReplayMemory(500000),
                 epsilon_schedule=epsilon_schedule,
                 batch_size=8,
                 sgd_learning_rate=1e-2,
                 q_learning_rate=0.5,
                 gamma=0.99,
                 lookahead_horizon=100,
                 update_period=4,
                 kernel=inverse_distance,
                 num_neighbors=50,
                 max_memory=125000,
                 warmup_period=1000,
                 test_period=10):
        """
        Instantiate an NEC Agent

        Parameters
        ----------
        env: gym.Env
            gym environment to train on
        embedding_network: torch.nn.Module
            Model to extract the embedding from a state
        replay_memory: ReplayMemory
            Replay memory to sample from for embedding network updates
        epsilon_schedule: (int) => (float)
            Function that determines the epsilon for epsilon-greedy exploration
            from the timestep t
        batch_size: int
            Batch size to sample from the replay memory
        sgd_learning_rate: float
            Learning rate to use for RMSProp updates to the embedding network
        q_learning_rate: float
            Learning rate to use for Q-updates on DND updates
        gamma: float
            Discount factor
        lookahead_horizon: int
            Lookahead horizon to use for N-step Q-value estimates
        update_period: int
            Inverse of the rate at which the embedding network gets updated,
            i.e. if 1 then update after every timestep, if 16 then update
            every 16 timesteps, etc.
        kernel: (torch.autograd.Variable, torch.autograd.Variable) => (torch.autograd.Variable)
            Kernel function to use for DND lookups
        num_neighbors: int
            Number of neighbors to return in K-NN lookups in the DND
        max_memory: int
            Maximum number of key-value pairs to store in each DND
        warmup_period: int
            Number of timesteps to act randomly before learning
        test_period: int
            Number of episodes between each test iteration
        """
        self.env = env
        self.embedding_network = embedding_network
        if use_cuda:
            self.embedding_network.cuda()

        self.replay_memory = replay_memory
        self.epsilon_schedule = epsilon_schedule
        self.batch_size = batch_size
        self.q_learning_rate = q_learning_rate
        self.gamma = gamma
        self.lookahead_horizon = lookahead_horizon
        self.update_period = update_period
        self.warmup_period = warmup_period
        self.test_period = test_period
        self.transition_queue = []
        self.optimizer = optim.RMSprop(self.embedding_network.parameters(),
                                       lr=sgd_learning_rate)

        # One DND per action; the embedding size is inferred from the last
        # entry of the embedding network's state dict
        state_dict = self.embedding_network.state_dict()
        self.dnd_list = [
            DND(kernel, num_neighbors, max_memory,
                state_dict[next(reversed(state_dict))].size()[0])
            for _ in range(env.action_space.n)
        ]
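# `inverse_distance` and DND are defined elsewhere in the NEC code. The lookup is
# assumed to follow the NEC paper: weight the values of the k nearest stored keys
# by k(h, h_i) = 1 / (||h - h_i||^2 + delta) and return the weighted average as the
# Q estimate. A numpy sketch under that assumption (helper names and delta=1e-3
# are illustrative):
import numpy as np

def inverse_distance_kernel(h, keys, delta=1e-3):
    return 1.0 / (np.sum((keys - h) ** 2, axis=1) + delta)

def dnd_lookup(h, keys, values, num_neighbors=50):
    # Pick the nearest stored embeddings, then kernel-weight their values
    dists = np.sum((keys - h) ** 2, axis=1)
    idx = np.argsort(dists)[:num_neighbors]
    k = inverse_distance_kernel(h, keys[idx])
    w = k / k.sum()
    return np.dot(w, values[idx])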
class DQNDoubleQAgent(BaseAgent):
    def __init__(self):
        super(DQNDoubleQAgent, self).__init__()
        self.training = False
        self.max_frames = 2000000
        self._epsilon = Epsilon(start=1.0, end=0.1, update_increment=0.0001)
        self.gamma = 0.99
        self.train_q_per_step = 4
        self.train_q_batch_size = 256
        self.steps_before_training = 10000
        self.target_q_update_frequency = 50000

        self._Q_weights_path = "./data/SC2DoubleQAgent"
        self._Q = DQNCNN()
        if os.path.isfile(self._Q_weights_path):
            self._Q.load_state_dict(torch.load(self._Q_weights_path))
            print("Loading weights:", self._Q_weights_path)
        self._Qt = copy.deepcopy(self._Q)
        self._Q.cuda()
        self._Qt.cuda()
        self._optimizer = optim.Adam(self._Q.parameters(), lr=1e-8)
        self._criterion = nn.MSELoss()
        self._memory = ReplayMemory(100000)

        self._loss = deque(maxlen=1000)
        self._max_q = deque(maxlen=1000)
        self._action = None
        self._screen = None
        self._fig = plt.figure()
        self._plot = [plt.subplot(2, 2, i + 1) for i in range(4)]

        self._screen_size = 28

    def get_env_action(self, action, obs):
        action = np.unravel_index(action, [1, self._screen_size, self._screen_size])
        target = [action[2], action[1]]
        command = _MOVE_SCREEN  # action[0]
        # Removing unit selection from the equation:
        # if command == 0:
        #     command = _SELECT_POINT
        # else:
        #     command = _MOVE_SCREEN

        if command in obs.observation["available_actions"]:
            return actions.FunctionCall(command, [[0], target])
        else:
            return actions.FunctionCall(_NO_OP, [])

    def get_action(self, s):
        """
        :param s: obs.observation["screen"]
        :returns: argmax action
        """
        # greedy
        if np.random.rand() > self._epsilon.value():
            # print("greedy action")
            s = Variable(torch.from_numpy(s).cuda())
            s = s.unsqueeze(0).float()
            self._action = self._Q(s).squeeze().cpu().data.numpy()
            return self._action.argmax()
        # explore
        else:
            # print("random choice")
            # action = np.random.choice([0, 1])
            action = 0
            target = np.random.randint(0, self._screen_size, size=2)
            return (action * self._screen_size * self._screen_size
                    + target[0] * self._screen_size + target[1])

    def select_friendly_action(self, obs):
        player_relative = obs.observation["screen"][_PLAYER_RELATIVE]
        friendly_y, friendly_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
        target = [int(friendly_x.mean()), int(friendly_y.mean())]
        return actions.FunctionCall(_SELECT_POINT, [[0], target])

    def train(self, env, training=True):
        self._epsilon.isTraining = training
        self.run_loop(env, self.max_frames)
        if self._epsilon.isTraining:
            torch.save(self._Q.state_dict(), self._Q_weights_path)

    def run_loop(self, env, max_frames=0):
        """A run loop to have agents and an environment interact."""
        total_frames = 0
        start_time = time.time()

        action_spec = env.action_spec()
        observation_spec = env.observation_spec()
        self.setup(observation_spec, action_spec)

        try:
            while True:
                obs = env.reset()[0]
                # Remove unit selection from the equation by selecting the
                # friendly unit at the start of every new game.
                select_friendly = self.select_friendly_action(obs)
                obs = env.step([select_friendly])[0]
                # distance = self.get_reward(obs.observation["screen"])
                self.reset()

                while True:
                    total_frames += 1
                    self._screen = obs.observation["screen"][5]
                    s = np.expand_dims(obs.observation["screen"][5], 0)
                    # plt.imshow(s[5])
                    # plt.pause(0.00001)

                    if max_frames and total_frames >= max_frames:
                        print("max frames reached")
                        return
                    if obs.last():
                        print("total frames:", total_frames,
                              "Epsilon:", self._epsilon.value())
                        self._epsilon.increment()
                        break

                    action = self.get_action(s)
                    env_actions = self.get_env_action(action, obs)
                    obs = env.step([env_actions])[0]

                    r = obs.reward
                    s1 = np.expand_dims(obs.observation["screen"][5], 0)
                    done = r > 0
                    if self._epsilon.isTraining:
                        transition = Transition(s, action, s1, r, done)
                        self._memory.push(transition)

                    if (total_frames % self.train_q_per_step == 0
                            and total_frames > self.steps_before_training
                            and self._epsilon.isTraining):
                        self.train_q()

                    if (total_frames % self.target_q_update_frequency == 0
                            and total_frames > self.steps_before_training
                            and self._epsilon.isTraining):
                        self._Qt = copy.deepcopy(self._Q)
                        self.show_chart()

                    if (total_frames % 1000 == 0
                            and total_frames > self.steps_before_training
                            and self._epsilon.isTraining):
                        self.show_chart()

                    if not self._epsilon.isTraining and total_frames % 3 == 0:
                        self.show_chart()
        except KeyboardInterrupt:
            pass
        finally:
            print("finished")
            elapsed_time = time.time() - start_time
            print("Took %.3f seconds for %s steps: %.3f fps" % (
                elapsed_time, total_frames, total_frames / elapsed_time))

    def get_reward(self, s):
        player_relative = s[_PLAYER_RELATIVE]
        neutral_y, neutral_x = (player_relative == _PLAYER_NEUTRAL).nonzero()
        neutral_target = [int(neutral_x.mean()), int(neutral_y.mean())]

        friendly_y, friendly_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
        if len(friendly_y) == 0 or len(friendly_x) == 0:
            # No friendly units visible; fall back to a neutral reward
            return 0
        friendly_target = [int(friendly_x.mean()), int(friendly_y.mean())]

        distance_2 = ((neutral_target[0] - friendly_target[0]) ** 2
                      + (neutral_target[1] - friendly_target[1]) ** 2)
        distance = math.sqrt(distance_2)
        return -distance

    def show_chart(self):
        self._plot[0].clear()
        self._plot[0].set_xlabel('Last 1000 Training Cycles')
        self._plot[0].set_ylabel('Loss')
        self._plot[0].plot(list(self._loss))

        self._plot[1].clear()
        self._plot[1].set_xlabel('Last 1000 Training Cycles')
        self._plot[1].set_ylabel('Max Q')
        self._plot[1].plot(list(self._max_q))

        self._plot[2].clear()
        self._plot[2].set_title("screen")
        self._plot[2].imshow(self._screen)

        self._plot[3].clear()
        self._plot[3].set_title("action")
        self._plot[3].imshow(self._action)

        plt.pause(0.00001)

    def train_q(self):
        if self.train_q_batch_size >= len(self._memory):
            return

        s, a, s_1, r, done = self._memory.sample(self.train_q_batch_size)
        s = Variable(torch.from_numpy(s).cuda()).float()
        a = Variable(torch.from_numpy(a).cuda()).long()
        s_1 = Variable(torch.from_numpy(s_1).cuda(), volatile=True).float()
        r = Variable(torch.from_numpy(r).cuda()).float()
        # `done` now holds the not-done mask (1 - done)
        done = Variable(torch.from_numpy(1 - done).cuda()).float()

        # Q_sa = r + gamma * max(Q_s'a')
        Q = self._Q(s)
        Q = Q.view(self.train_q_batch_size, -1)
        Q = Q.gather(1, a)

        Qt = self._Qt(s_1).view(self.train_q_batch_size, -1)
        # Double Q: select the best next action with the online network,
        # evaluate it with the target network
        best_action = self._Q(s_1).view(self.train_q_batch_size, -1).max(dim=1, keepdim=True)[1]
        y = r + done * self.gamma * Qt.gather(1, best_action)
        # Vanilla Q alternative:
        # y = r + done * self.gamma * Qt.max(dim=1)[0].unsqueeze(1)
        y.volatile = False

        loss = self._criterion(Q, y)
        self._loss.append(loss.sum().cpu().data.numpy())
        self._max_q.append(Q.max().cpu().data.numpy()[0])

        self._optimizer.zero_grad()  # zero the gradient buffers
        loss.backward()
        self._optimizer.step()
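# get_action returns one flat index over a (1, screen, screen) action space and
# get_env_action recovers the (command, y, x) triple with np.unravel_index. A quick
# round-trip check of that encoding (screen size 28, as configured in the agent;
# the sample coordinates are illustrative):
import numpy as np

screen_size = 28
command, y, x = 0, 5, 20
flat = command * screen_size * screen_size + y * screen_size + x
assert flat == np.ravel_multi_index((command, y, x), (1, screen_size, screen_size))
assert np.unravel_index(flat, (1, screen_size, screen_size)) == (command, y, x)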
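# train_q above uses the pre-0.4 Variable/volatile API. The same double-Q target
# expressed with current PyTorch, as a sketch rather than the agent's exact code:
# q_net and target_net are assumed to map a state batch to flat per-action values,
# and r / not_done are column tensors of shape (batch, 1).
import torch

def double_q_target(q_net, target_net, r, s_1, not_done, gamma=0.99):
    with torch.no_grad():
        # Select the argmax action with the online network...
        best_action = q_net(s_1).argmax(dim=1, keepdim=True)
        # ...but evaluate it with the target network
        next_q = target_net(s_1).gather(1, best_action)
        return r + not_done * gamma * next_q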