def __init__(self, optimizer_spec, num_goal=81, num_action=81, replay_memory_size=10000,
             subgoals=81, screen_size=(500, 500), batch_size=128):
    ###############
    # BUILD MODEL #
    ###############
    self.num_goal = num_goal
    self.num_action = num_action
    self.batch_size = batch_size
    # Construct meta-controller and controller
    self.meta_controller = MetaController().type(dtype)
    self.target_meta_controller = MetaController().type(dtype)
    self.controller = Controller().type(dtype)
    self.target_controller = Controller().type(dtype)
    # Construct the optimizers for meta-controller and controller
    self.meta_optimizer = optimizer_spec.constructor(self.meta_controller.parameters(),
                                                     **optimizer_spec.kwargs)
    self.ctrl_optimizer = optimizer_spec.constructor(self.controller.parameters(),
                                                     **optimizer_spec.kwargs)
    # Construct the replay memory for meta-controller and controller
    self.meta_replay_memory = ReplayMemory(replay_memory_size)
    self.ctrl_replay_memory = ReplayMemory(replay_memory_size)
    self.subgoals = subgoals
    self.screen_size = screen_size
    self.idx_2_action = self.action_dict()
def __init__(self, optimizer_spec, num_goal=6, num_action=2, replay_memory_size=10000,
             batch_size=128):
    ###############
    # BUILD MODEL #
    ###############
    self.num_goal = num_goal
    self.num_action = num_action
    self.batch_size = batch_size
    # Construct meta-controller and controller
    self.meta_controller = MetaController().type(dtype)
    self.target_meta_controller = MetaController().type(dtype)
    self.controller = Controller().type(dtype)
    self.target_controller = Controller().type(dtype)
    # Construct the optimizers for meta-controller and controller
    self.meta_optimizer = optimizer_spec.constructor(
        self.meta_controller.parameters(), **optimizer_spec.kwargs)
    self.ctrl_optimizer = optimizer_spec.constructor(
        self.controller.parameters(), **optimizer_spec.kwargs)
    # Construct the replay memory for meta-controller and controller
    self.meta_replay_memory = ReplayMemory(replay_memory_size)
    self.ctrl_replay_memory = ReplayMemory(replay_memory_size)
def __init__(self, args):
    super(BasePGQLearner, self).__init__(args)

    self.q_update_counter = 0
    self.replay_size = args.replay_size
    self.pgq_fraction = args.pgq_fraction
    self.batch_update_size = args.batch_update_size

    scope_name = 'local_learning_{}'.format(self.actor_id)
    conf_learning = {'name': scope_name,
                     'input_shape': self.input_shape,
                     'num_act': self.num_actions,
                     'args': args}

    with tf.device('/cpu:0'):
        self.local_network = PolicyValueNetwork(conf_learning)
    with tf.device('/gpu:0'), tf.variable_scope('', reuse=True):
        self.batch_network = PolicyValueNetwork(conf_learning)

    self._build_q_ops()
    self.reset_hidden_state()

    self.replay_memory = ReplayMemory(
        self.replay_size,
        self.local_network.get_input_shape(),
        self.num_actions)

    if self.is_master():
        var_list = self.local_network.params
        self.saver = tf.train.Saver(var_list=var_list,
                                    max_to_keep=3,
                                    keep_checkpoint_every_n_hours=2)
def __init__(self):
    super(DQNDoubleQAgent, self).__init__()
    self.training = False
    self.max_frames = 2000000
    self._epsilon = Epsilon(start=1.0, end=0.1, update_increment=0.0001)
    self.gamma = 0.99
    self.train_q_per_step = 4
    self.train_q_batch_size = 256
    self.steps_before_training = 10000
    self.target_q_update_frequency = 50000

    self._Q_weights_path = "./data/SC2DoubleQAgent"
    self._Q = DQNCNN()
    if os.path.isfile(self._Q_weights_path):
        self._Q.load_state_dict(torch.load(self._Q_weights_path))
        print("Loading weights:", self._Q_weights_path)
    self._Qt = copy.deepcopy(self._Q)
    self._Q.cuda()
    self._Qt.cuda()
    self._optimizer = optim.Adam(self._Q.parameters(), lr=1e-8)
    self._criterion = nn.MSELoss()
    self._memory = ReplayMemory(100000)

    self._loss = deque(maxlen=1000)
    self._max_q = deque(maxlen=1000)
    self._action = None
    self._screen = None
    self._fig = plt.figure()
    self._plot = [plt.subplot(2, 2, i + 1) for i in range(4)]
    self._screen_size = 28
def __init__(self, args):
    self.args = args
    super(AElearner, self).__init__(args)
    self.cts_eta = args.cts_eta
    self.cts_beta = args.cts_beta
    self.ae_delta = args.ae_delta
    self.batch_size = args.batch_update_size
    self.replay_memory = ReplayMemory(
        args.replay_size,
        self.local_network_upper.get_input_shape(),
        # self.local_network.get_input_shape(),
        self.num_actions)

    # Initialize the density model (also chooses how many steps between updates:
    # 20 * the Q-target update steps).
    self._init_density_model(args)
    # Build the loss op.
    self._double_dqn_op()

    self.which_net_to_update_counter = 0
    self.ae_counter = 0
    self.epsilon_greedy_counter = 0
    self.total_ae_counter = 0
    self.total_epsilon_greedy_counter = 0
    self.q_values_upper_max = []
    self.q_values_lower_max = []
    self.ae_valid_actions = True
    self.action_meanings = self.emulator.env.unwrapped.get_action_meanings()
    self.minimized_actions_counter = {value: 0 for value in self.action_meanings}
    print(self.minimized_actions_counter)
def __init__(self, actor_optimizer_spec, critic_optimizer_spec, num_feature, num_action,
             replay_memory_size=1000000, batch_size=64, tau=0.001):
    ###############
    # BUILD MODEL #
    ###############
    self.num_feature = num_feature
    self.num_action = num_action
    self.batch_size = batch_size
    self.tau = tau
    # Construct actor and critic
    self.actor = Actor(num_feature, num_action).type(dtype)
    self.target_actor = Actor(num_feature, num_action).type(dtype)
    self.critic = Critic(num_feature, num_action).type(dtype)
    self.target_critic = Critic(num_feature, num_action).type(dtype)
    # Construct the optimizers for actor and critic
    self.actor_optimizer = actor_optimizer_spec.constructor(
        self.actor.parameters(), **actor_optimizer_spec.kwargs)
    self.critic_optimizer = critic_optimizer_spec.constructor(
        self.critic.parameters(), **critic_optimizer_spec.kwargs)
    # Construct the replay memory
    self.replay_memory = ReplayMemory(replay_memory_size)
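# The constructor above stores `tau` but the target update itself is not part of this
# excerpt. The sketch below is only a minimal illustration of the soft (Polyak) target
# update `tau` is typically used for in DDPG; `soft_update` is a hypothetical helper,
# not a function from the original code.
def soft_update(target_net, source_net, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)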
def __init__(self, config):
    self.config = config
    self.logger = logging.getLogger("DQNAgent")

    # define models (policy and target)
    self.policy_model = DQN(self.config)
    self.target_model = DQN(self.config)

    # define memory
    self.memory = ReplayMemory(self.config)

    # define loss
    self.loss = HuberLoss()

    # define optimizer
    self.optim = torch.optim.RMSprop(self.policy_model.parameters())

    # define environment
    self.env = gym.make('CartPole-v0').unwrapped
    self.cartpole = CartPoleEnv(self.config.screen_width)

    # initialize counters
    self.current_episode = 0
    self.current_iteration = 0
    self.episode_durations = []

    self.batch_size = self.config.batch_size

    # set cuda flag
    self.is_cuda = torch.cuda.is_available()
    if self.is_cuda and not self.config.cuda:
        self.logger.info("WARNING: You have a CUDA device, so you should probably enable CUDA")

    self.cuda = self.is_cuda & self.config.cuda

    if self.cuda:
        self.logger.info("Program will run on *****GPU-CUDA***** ")
        print_cuda_statistics()
        self.device = torch.device("cuda")
        torch.cuda.set_device(self.config.gpu_device)
    else:
        self.logger.info("Program will run on *****CPU***** ")
        self.device = torch.device("cpu")

    self.policy_model = self.policy_model.to(self.device)
    self.target_model = self.target_model.to(self.device)
    self.loss = self.loss.to(self.device)

    # Initialize target model with the policy model's state dict
    self.target_model.load_state_dict(self.policy_model.state_dict())
    self.target_model.eval()

    # Summary writer
    self.summary_writer = SummaryWriter(log_dir=self.config.summary_dir, comment='DQN')
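# HuberLoss is not defined in this excerpt. A minimal sketch, assuming it simply wraps
# PyTorch's SmoothL1Loss (the Huber loss with delta = 1), which would make the
# `.to(self.device)` call above work since it is an nn.Module.
import torch.nn as nn


class HuberLoss(nn.Module):
    # Huber loss: quadratic for small errors, linear for large ones.
    def __init__(self):
        super().__init__()
        self.loss = nn.SmoothL1Loss()

    def forward(self, predicted, target):
        return self.loss(predicted, target)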
def __init__(self, args):
    self.final_epsilon = args.final_epsilon
    super(PseudoCountQLearner, self).__init__(args)
    self.cts_eta = args.cts_eta
    self.cts_beta = args.cts_beta
    self.batch_size = args.batch_update_size
    self.replay_memory = ReplayMemory(args.replay_size)
    self._init_density_model(args)
    self._double_dqn_op()
def test_zero_step(self):
    self.memory = ReplayMemory(capacity=10, multi_step_n=0)
    for i in range(5):
        a = Transition([0, 1, 2, i], 0, [4, 5, 6, i * i], 1, False)
        self.memory.push(a)
    final = Transition([0, 1, 2, 10], 0, [4, 5, 6, 100], 10, True)
    self.memory.push(final)
    self.assertEqual(self.memory.memory[0].r, 1)
    self.assertEqual(self.memory.memory[3].r, 1)
    self.assertEqual(self.memory.memory[4].r, 1)
    self.assertEqual(self.memory.memory[5].r, 10)
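# The test above checks the n-step handling of the project's ReplayMemory: with
# multi_step_n=0, rewards must be stored unchanged. The real Transition/ReplayMemory
# are not shown here; the following is only a hypothetical minimal version (with
# assumed field names s, a, s1, r, done) that this particular test would accept.
from collections import namedtuple

Transition = namedtuple('Transition', ('s', 'a', 's1', 'r', 'done'))


class ReplayMemory:
    def __init__(self, capacity, multi_step_n=0):
        self.capacity = capacity
        self.multi_step_n = multi_step_n
        self.memory = []

    def push(self, transition):
        # With multi_step_n == 0 the one-step reward is kept as-is; a positive n
        # would also fold the (discounted) rewards of the next n transitions into
        # earlier entries before they are sampled.
        self.memory.append(transition)
        if len(self.memory) > self.capacity:
            self.memory.pop(0)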
def __init__(self, args):
    super(PseudoCountQLearner, self).__init__(args)
    self.cts_eta = .9
    self.batch_size = 32
    self.replay_memory = ReplayMemory(args.replay_size)
    # More Cython tuning could be useful here.
    self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                         width=args.cts_rescale_dim,
                                         num_bins=args.cts_bins,
                                         beta=0.05)
def __init__(self, args):
    self.args = args
    super(PseudoCountQLearner, self).__init__(args)
    self.cts_eta = args.cts_eta
    self.cts_beta = args.cts_beta
    self.batch_size = args.batch_update_size
    self.replay_memory = ReplayMemory(args.replay_size,
                                      self.local_network.get_input_shape(),
                                      self.num_actions)
    self._init_density_model(args)
    self._double_dqn_op()
def __init__(self, args):
    self.args = args
    super(PseudoCountQLearner, self).__init__(args)
    self.cts_eta = args.cts_eta
    self.cts_beta = args.cts_beta
    self.batch_size = args.batch_update_size
    self.replay_memory = ReplayMemory(args.replay_size,
                                      self.local_network.get_input_shape(),
                                      self.num_actions)
    # Initialize the density model (also chooses how many steps between updates:
    # 20 * the Q-target update steps).
    self._init_density_model(args)
    # Build the loss op.
    self._double_dqn_op()
def _build_q_ops(self):
    # PGQ-specific initialization
    self.pgq_fraction = self.pgq_fraction
    self.batch_size = self.batch_update_size
    self.replay_memory = ReplayMemory(self.replay_size)

    self.q_tilde = self.batch_network.beta * (
        self.batch_network.log_output_layer_pi
        + tf.expand_dims(self.batch_network.output_layer_entropy, 1)
    ) + self.batch_network.output_layer_v

    self.Qi, self.Qi_plus_1 = tf.split(axis=0, num_or_size_splits=2, value=self.q_tilde)
    self.V, _ = tf.split(axis=0, num_or_size_splits=2,
                         value=self.batch_network.output_layer_v)
    self.log_pi, _ = tf.split(
        axis=0, num_or_size_splits=2,
        value=tf.expand_dims(self.batch_network.log_output_selected_action, 1))

    self.R = tf.placeholder('float32', [None], name='1-step_reward')
    self.terminal_indicator = tf.placeholder(tf.float32, [None], name='terminal_indicator')
    self.max_TQ = self.gamma * tf.reduce_max(self.Qi_plus_1, 1) * (1 - self.terminal_indicator)
    self.Q_a = tf.reduce_sum(
        self.Qi * tf.split(axis=0, num_or_size_splits=2,
                           value=self.batch_network.selected_action_ph)[0], 1)
    self.q_objective = -self.pgq_fraction * tf.reduce_mean(
        tf.stop_gradient(self.R + self.max_TQ - self.Q_a)
        * (self.V[:, 0] + self.log_pi[:, 0]))

    self.V_params = self.batch_network.params
    self.q_gradients = tf.gradients(self.q_objective, self.V_params)

    if self.batch_network.clip_norm_type == 'global':
        self.q_gradients = tf.clip_by_global_norm(self.q_gradients,
                                                  self.batch_network.clip_norm)[0]
    elif self.batch_network.clip_norm_type == 'local':
        self.q_gradients = [tf.clip_by_norm(g, self.batch_network.clip_norm)
                            for g in self.q_gradients]
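# For reference (a reading of the ops above, assuming the standard PGQ(L)
# formulation rather than anything stated in this excerpt): q_tilde corresponds to
#     Q~(s, a) = V(s) + beta * (log pi(a|s) + H(pi(.|s))),
# the doubled batch is split so that Qi holds Q~ for the current states and
# Qi_plus_1 for the successor states, max_TQ and Q_a form a one-step Q-learning
# target/estimate, and the resulting TD error (under stop_gradient) is pushed
# through V and log pi, scaled by pgq_fraction.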
def __init__(self, env, args, device='cpu'):
    """
    Instantiate an NEC Agent

    Parameters
    ----------
    env: gym.Env
        gym environment to train on
    args: args class from argparser
        args are from train.py: see train.py for help with each arg
    device: string
        'cpu' or 'cuda:0' depending on the use_cuda flag from train.py
    """
    self.environment_type = args.environment_type
    self.env = env
    self.device = device
    # Hyperparameters
    self.epsilon = args.initial_epsilon
    self.final_epsilon = args.final_epsilon
    self.epsilon_decay = args.epsilon_decay
    self.gamma = args.gamma
    self.N = args.N
    # Transition queue and replay memory
    self.transition_queue = []
    self.replay_every = args.replay_every
    self.replay_buffer_size = args.replay_buffer_size
    self.replay_memory = ReplayMemory(self.replay_buffer_size)
    # CNN for state embedding network
    self.frames_to_stack = args.frames_to_stack
    self.embedding_size = args.embedding_size
    self.in_height = args.in_height
    self.in_width = args.in_width
    self.cnn = CNN(self.frames_to_stack, self.embedding_size,
                   self.in_height, self.in_width).to(self.device)
    # Differentiable Neural Dictionary (DND): one for each action
    self.kernel = inverse_distance
    self.num_neighbors = args.num_neighbors
    self.max_memory = args.max_memory
    self.lr = args.lr
    self.dnd_list = []
    for i in range(env.action_space.n):
        self.dnd_list.append(
            DND(self.kernel, self.num_neighbors, self.max_memory,
                args.optimizer, self.lr))
    # Optimizer for state embedding CNN
    self.q_lr = args.q_lr
    self.batch_size = args.batch_size
    self.optimizer = get_optimizer(args.optimizer, self.cnn.parameters(), self.lr)
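# The `inverse_distance` kernel handed to the DNDs is not defined in this excerpt.
# A minimal sketch consistent with the kernel used in the NEC paper,
# k(h, h_i) = 1 / (||h - h_i||^2 + eps), where eps is a small constant for
# numerical stability; the exact signature in the original code may differ.
import torch


def inverse_distance(h, h_i, epsilon=1e-3):
    # Squared Euclidean distance between the query embedding and a stored key.
    return 1.0 / (torch.dist(h, h_i) ** 2 + epsilon)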
def __init__(self, environment_name="CartPole-v1", replay_memory_size=10000,
             action_threshold=0.7, batch_size=64, gamma=0.9):
    self.environment = gym.make(environment_name)
    state = self.environment.reset()
    self.state_shape = state.shape
    self.action_space = self.environment.action_space.n
    self.replay_memory = ReplayMemory(self.state_shape, capacity=replay_memory_size)
    self.model = self.build_network()
    self.target_model = self.build_network()
    self.action_threshold = action_threshold
    self.batch_size = batch_size
    self.gamma = gamma
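# The online/target pair above needs to be synchronized periodically, but that step
# is not part of this excerpt. A hypothetical sketch, assuming build_network()
# returns a tf.keras.Model (so get_weights/set_weights are available); the original
# project may do this differently.
def update_target_network(self):
    # Copy the online network's weights into the target network.
    self.target_model.set_weights(self.model.get_weights())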
def __init__(self, state_size, n_actions, args,
             device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    self.device = device

    # Exploration / exploitation params.
    self.steps_done = 0
    self.eps_threshold = 1
    self.eps_start = args.eps_start
    self.eps_end = args.eps_end
    self.eps_decay = args.eps_decay

    # RL params
    self.target_update = args.target_update
    self.discount = args.discount

    # Env params
    self.n_actions = n_actions
    self.state_size = state_size

    # Deep Q-network params
    self.layers = args.layers
    self.batch_size = args.batch_size
    self.policy_net = DQN(state_size, n_actions, layers=self.layers).to(self.device).float()
    self.target_net = None
    self.grad_clip = args.grad_clip

    if str(args.optimizer).lower() == 'adam':
        self.optimizer = optim.Adam(self.policy_net.parameters())
    elif str(args.optimizer).lower() == 'rmsprop':
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
    else:
        raise NotImplementedError

    self.memory = ReplayMemory(args.replay_size)

    # Performance buffers.
    self.rewards_list = []
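# The agent above stores eps_start/eps_end/eps_decay/steps_done but does not show how
# actions are chosen. A hypothetical sketch of an epsilon-greedy action selection using
# the standard exponentially annealed threshold,
#     eps = eps_end + (eps_start - eps_end) * exp(-steps_done / eps_decay);
# `select_action` is an assumed method name, not taken from the original code.
import math
import random

import torch


def select_action(self, state):
    self.eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
        math.exp(-1.0 * self.steps_done / self.eps_decay)
    self.steps_done += 1
    if random.random() > self.eps_threshold:
        # Exploit: greedy action from the policy network.
        with torch.no_grad():
            return self.policy_net(state).argmax(dim=1).view(1, 1)
    # Explore: uniformly random action.
    return torch.tensor([[random.randrange(self.n_actions)]],
                        device=self.device, dtype=torch.long)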
def __init__(self, environment_name="Acrobot-v1", replay_memory_size=10000,
             action_threshold=0.7, batch_size=64, gamma=0.9):
    super(MotionAthlete, self).__init__(environment_name, replay_memory_size,
                                        action_threshold, batch_size, gamma)
    self.environment.close()
    del self.environment
    self.environment = EnvironmentWrapper(environment_name)
    frame = self.environment.reset()
    frame_shape = frame.shape
    self.motion_tracer = MotionTracer(frame_shape=frame_shape)
    self.state_shape = self.motion_tracer.state_shape
    self.replay_memory = ReplayMemory(self.state_shape, capacity=replay_memory_size)
    del self.model
    del self.target_model
    self.model = self.build_network()
    self.target_model = self.build_network()
def train_model(env,
                conv_layers,
                learning_rate=5e-4,
                total_timesteps=100000,
                buffer_size=50000,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                train_freq=1,
                batch_size=32,
                print_freq=1,
                checkpoint_freq=100000,
                checkpoint_path=None,
                learning_starts=1000,
                gamma=1.0,
                target_network_update_freq=500,
                double_dqn=False,
                **network_kwargs) -> tf.keras.Model:
    """Train a DQN model.

    Parameters
    ----------
    env: gym.Env
        openai gym environment
    conv_layers: list
        a list of triples that defines the conv network
    learning_rate: float
        learning rate for the Adam optimizer
    total_timesteps: int
        number of env steps to run the environment
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of the entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of the random-action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print training progress; set to None to disable printing
    checkpoint_freq: int
        how often to store a checkpoint during training
    checkpoint_path: str
        the filesystem path for storing the checkpoints
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    double_dqn: bool
        specifies whether double Q-learning is used during training

    Returns
    -------
    dqn: an instance of tf.Module that contains the trained model
    """
    q_func = build_dueling_q_func(conv_layers, **network_kwargs)

    dqn = DeepQ(model_builder=q_func,
                observation_shape=env.observation_space.shape,
                num_actions=env.action_space.n,
                learning_rate=learning_rate,
                gamma=gamma,
                double_dqn=double_dqn)

    manager = None
    if checkpoint_path is not None:
        load_path = osp.expanduser(checkpoint_path)
        ckpt = tf.train.Checkpoint(model=dqn.q_network)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=5)
        ckpt.restore(manager.latest_checkpoint)
        print("Restoring from {}".format(manager.latest_checkpoint))

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Create the replay buffer
    replay_buffer = ReplayMemory(buffer_size)

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(total_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_prob=1.0,
                                 final_prob=exploration_final_eps)

    dqn.update_target()

    episode_rewards = [0.0]
    obs = env.reset()
    obs = np.expand_dims(np.array(obs), axis=0)

    for t in range(total_timesteps):
        update_eps = exploration.step_to(t)
        action, _, _, _ = dqn.step(tf.constant(obs), update_eps=update_eps)
        action = action[0].numpy()
        new_obs, reward, done, _ = env.step(action)

        # Store transition in the replay buffer.
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        replay_buffer.add(obs[0], action, reward, new_obs[0], float(done))
        obs = new_obs

        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            weights, _ = tf.ones_like(rewards), None
            td_loss = dqn.train(obses_t, actions, rewards, obses_tp1, dones, weights)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update the target network every target_network_update_freq steps.
            dqn.update_target()

        reward_100_mean = np.round(np.mean(episode_rewards[-101:-1]), 1)
        number_episodes = len(episode_rewards) - 1
        if done and print_freq is not None and number_episodes % print_freq == 0:
            format_str = ("Steps: {}, Episodes: {}, 100 ep reward average: {}, "
                          "Reward: {}, Epsilon-greedy %explore: {}")
            print(format_str.format(t, number_episodes, reward_100_mean,
                                    episode_rewards[-2],
                                    int(100 * exploration.value(t))))
            with train_summary_writer.as_default():
                tf.summary.scalar('loss', dqn.train_loss_metrics.result(), step=t)
                tf.summary.scalar('reward', episode_rewards[-2], step=t)

        if checkpoint_path is not None and t % checkpoint_freq == 0:
            manager.save()

        # Every training step, reset the loss metric.
        dqn.train_loss_metrics.reset_states()

    return dqn.q_network
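# LinearSchedule is not defined in this excerpt. A minimal sketch consistent with how it
# is called above (constructor kwargs total_timesteps / initial_prob / final_prob, plus
# value(t) and step_to(t)); the real class may track internal state differently.
class LinearSchedule:
    def __init__(self, total_timesteps, initial_prob=1.0, final_prob=0.02):
        self.total_timesteps = total_timesteps
        self.initial_prob = initial_prob
        self.final_prob = final_prob

    def value(self, t):
        # Linearly anneal from initial_prob to final_prob, then hold at final_prob.
        fraction = min(float(t) / self.total_timesteps, 1.0)
        return self.initial_prob + fraction * (self.final_prob - self.initial_prob)

    # The training loop also calls step_to(t); here it is simply an alias of value.
    step_to = value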
torch.cuda.manual_seed(args.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Define and build DDPG agent
hidden_size = tuple(args.hidden_size)
agent = DDPG(args.gamma,
             args.tau,
             hidden_size,
             env.observation_space.shape[0],
             env.action_space,
             checkpoint_dir=checkpoint_dir)

# Initialize replay memory
memory = ReplayMemory(int(args.replay_size))

# Initialize OU-Noise
nb_actions = env.action_space.shape[-1]
ou_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                        sigma=float(args.noise_stddev) * np.ones(nb_actions))

# Define counters and other variables
start_step = 0
# timestep = start_step
if args.load_model:
    # Load agent if necessary
    start_step, memory = agent.load_checkpoint()
timestep = start_step // 10000 + 1
rewards, policy_losses, value_losses, mean_test_rewards = [], [], [], []
epoch = 0
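# OrnsteinUhlenbeckActionNoise is constructed above but not defined in this excerpt.
# A minimal sketch of the usual OU process matching that constructor call
# (mu and sigma as arrays); theta/dt defaults are assumptions, not values from the
# original code.
import numpy as np


class OrnsteinUhlenbeckActionNoise:
    # Temporally correlated exploration noise:
    #   dx_t = theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)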
def __init__(self,
             env,
             embedding_network,
             replay_memory=ReplayMemory(100000),
             initial_epsilon=1.0,
             final_epsilon=0.01,
             epsilon_decay=0.99,
             batch_size=8,
             sgd_lr=1e-6,
             q_lr=0.01,
             gamma=0.99,
             lookahead_horizon=100,
             update_period=4,
             kernel=inverse_distance,
             num_neighbors=50,
             max_memory=500000):
    '''
    Instantiate an NEC Agent

    Parameters
    ----------
    env: gym.Env
        gym environment to train on
    embedding_network: torch.nn.Module
        Model to extract the embedding from a state
    replay_memory: ReplayMemory
        Replay memory to sample from for embedding network updates
    initial_epsilon: float
        Initial epsilon for epsilon-greedy exploration
    final_epsilon: float
        Final epsilon for epsilon-greedy exploration
    epsilon_decay: float
        Exponential decay factor for epsilon
    batch_size: int
        Batch size to sample from the replay memory
    sgd_lr: float
        Learning rate to use for RMSprop updates to the embedding network and DND
    q_lr: float
        Learning rate to use for Q-updates on DND updates
    gamma: float
        Discount factor
    lookahead_horizon: int
        Lookahead horizon to use for N-step Q-value estimates
    update_period: int
        Inverse of the rate at which the embedding network gets updated,
        i.e. if 1 then update after every timestep, if 16 then update every 16 timesteps, etc.
    kernel: (torch.autograd.Variable, torch.autograd.Variable) => (torch.autograd.Variable)
        Kernel function to use for DND lookups
    num_neighbors: int
        Number of neighbors to return in k-NN lookups in the DND
    max_memory: int
        Maximum number of key-value pairs to store in each DND
    '''
    self.env = env
    self.embedding_network = embedding_network
    self.replay_memory = replay_memory
    self.epsilon = initial_epsilon
    self.final_epsilon = final_epsilon
    self.epsilon_decay = epsilon_decay
    self.batch_size = batch_size
    self.q_lr = q_lr
    self.gamma = gamma
    self.lookahead_horizon = lookahead_horizon
    self.update_period = update_period
    self.transition_queue = []
    self.optimizer = optim.RMSprop(self.embedding_network.parameters(), lr=sgd_lr)
    self.dnd_list = [DND(kernel, num_neighbors, max_memory, sgd_lr)
                     for _ in range(env.action_space_n)]
def setUp(self):
    self.memory = ReplayMemory(capacity=10)
def __init__(self, actor_optimizer_spec, critic_optimizer_spec, num_feature, num_action,
             net_type, replay_memory_size=1000000, batch_size=64, tau=0.001):
    ###############
    # BUILD MODEL #
    ###############
    self.num_feature = num_feature
    self.num_action = num_action
    self.batch_size = batch_size
    self.tau = tau
    # Construct actor and critic
    if net_type == 0:
        self.actor = MLPA(input_size=num_feature, output_size=num_action,
                          hidden_size=(400, 300), n_layers=2, tanh_flag=1).type(dtype)
        self.target_actor = MLPA(input_size=num_feature, output_size=num_action,
                                 hidden_size=(400, 300), n_layers=2, tanh_flag=1).type(dtype)
        self.critic = MLPC(input_size_state=num_feature, input_size_action=num_action,
                           output_size=1, hidden_size=(400, 300), n_layers=2).type(dtype)
        self.target_critic = MLPC(input_size_state=num_feature, input_size_action=num_action,
                                  output_size=1, hidden_size=(400, 300), n_layers=2).type(dtype)
    elif net_type == 1:
        self.actor = MLPA(input_size=num_feature + 1, output_size=num_action,
                          hidden_size=(400, 300), n_layers=2, tanh_flag=1).type(dtype)
        self.target_actor = MLPA(input_size=num_feature + 1, output_size=num_action,
                                 hidden_size=(400, 300), n_layers=2, tanh_flag=1).type(dtype)
        self.critic = MLPC(input_size_state=num_feature + 1, input_size_action=num_action,
                           output_size=1, hidden_size=(400, 300), n_layers=2).type(dtype)
        self.target_critic = MLPC(input_size_state=num_feature + 1, input_size_action=num_action,
                                  output_size=1, hidden_size=(400, 300), n_layers=2).type(dtype)
    elif net_type == 2:
        self.actor = PMLPA(input_size=num_feature, output_size=num_action,
                           hidden_size=(400, 300), dtype=dtype, n_layers=2,
                           tanh_flag=1).type(dtype)
        self.target_actor = PMLPA(input_size=num_feature, output_size=num_action,
                                  hidden_size=(400, 300), dtype=dtype, n_layers=2,
                                  tanh_flag=1).type(dtype)
        self.critic = PMLPC(input_size_state=num_feature, input_size_action=num_action,
                            output_size=1, hidden_size=(400, 300), dtype=dtype,
                            n_layers=2).type(dtype)
        self.target_critic = PMLPC(input_size_state=num_feature, input_size_action=num_action,
                                   output_size=1, hidden_size=(400, 300), dtype=dtype,
                                   n_layers=2).type(dtype)
    # Construct the optimizers for actor and critic
    self.actor_optimizer = actor_optimizer_spec.constructor(
        self.actor.parameters(), **actor_optimizer_spec.kwargs)
    self.critic_optimizer = critic_optimizer_spec.constructor(
        self.critic.parameters(), **critic_optimizer_spec.kwargs)
    # Construct the replay memory
    self.replay_memory = ReplayMemory(replay_memory_size)
def __init__(self,
             env,
             embedding_network,
             replay_memory=ReplayMemory(500000),
             epsilon_schedule=epsilon_schedule,
             batch_size=8,
             sgd_learning_rate=1e-2,
             q_learning_rate=0.5,
             gamma=0.99,
             lookahead_horizon=100,
             update_period=4,
             kernel=inverse_distance,
             num_neighbors=50,
             max_memory=125000,
             warmup_period=1000,
             test_period=10):
    """
    Instantiate an NEC Agent

    Parameters
    ----------
    env: gym.Env
        gym environment to train on
    embedding_network: torch.nn.Module
        Model to extract the embedding from a state
    replay_memory: ReplayMemory
        Replay memory to sample from for embedding network updates
    epsilon_schedule: (int) => (float)
        Function that determines the epsilon for epsilon-greedy exploration
        from the timestep t
    batch_size: int
        Batch size to sample from the replay memory
    sgd_learning_rate: float
        Learning rate to use for RMSprop updates to the embedding network
    q_learning_rate: float
        Learning rate to use for Q-updates on DND updates
    gamma: float
        Discount factor
    lookahead_horizon: int
        Lookahead horizon to use for N-step Q-value estimates
    update_period: int
        Inverse of the rate at which the embedding network gets updated,
        i.e. if 1 then update after every timestep, if 16 then update every 16 timesteps, etc.
    kernel: (torch.autograd.Variable, torch.autograd.Variable) => (torch.autograd.Variable)
        Kernel function to use for DND lookups
    num_neighbors: int
        Number of neighbors to return in k-NN lookups in the DND
    max_memory: int
        Maximum number of key-value pairs to store in the DND
    warmup_period: int
        Number of timesteps to act randomly before learning
    test_period: int
        Number of episodes between each test iteration
    """
    self.env = env
    self.embedding_network = embedding_network
    if use_cuda:
        self.embedding_network.cuda()
    self.replay_memory = replay_memory
    self.epsilon_schedule = epsilon_schedule
    self.batch_size = batch_size
    self.q_learning_rate = q_learning_rate
    self.gamma = gamma
    self.lookahead_horizon = lookahead_horizon
    self.update_period = update_period
    self.warmup_period = warmup_period
    self.test_period = test_period
    self.transition_queue = []
    self.optimizer = optim.RMSprop(self.embedding_network.parameters(),
                                   lr=sgd_learning_rate)
    state_dict = self.embedding_network.state_dict()
    self.dnd_list = [
        DND(kernel, num_neighbors, max_memory,
            state_dict[next(reversed(state_dict))].size()[0])
        for _ in range(env.action_space.n)
    ]
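# The module-level epsilon_schedule used as a default argument above is not shown.
# One plausible sketch matching the documented signature ((int) => (float)); the
# keyword defaults here are illustrative assumptions, not values from the original.
def epsilon_schedule(t, initial_epsilon=1.0, final_epsilon=0.01, decay_steps=100000):
    # Linearly anneal from initial_epsilon to final_epsilon over decay_steps timesteps,
    # then hold at final_epsilon.
    fraction = min(float(t) / decay_steps, 1.0)
    return initial_epsilon + fraction * (final_epsilon - initial_epsilon)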
def __init__(self, args):
    super(BasePGQLearner, self).__init__(args)

    # args.entropy_regularisation_strength = 0.0
    conf_learning = {
        'name': 'local_learning_{}'.format(self.actor_id),
        'input_shape': self.input_shape,
        'num_act': self.num_actions,
        'args': args
    }

    self.local_network = PolicyValueNetwork(conf_learning)
    self.reset_hidden_state()

    if self.is_master():
        var_list = self.local_network.params
        self.saver = tf.train.Saver(var_list=var_list,
                                    max_to_keep=3,
                                    keep_checkpoint_every_n_hours=2)

    # PGQ-specific initialization
    self.batch_size = 32
    self.pgq_fraction = args.pgq_fraction
    self.replay_memory = ReplayMemory(args.replay_size)
    self.q_tilde = self.local_network.beta * (
        self.local_network.log_output_layer_pi
        + tf.expand_dims(self.local_network.output_layer_entropy, 1)
    ) + self.local_network.output_layer_v

    self.Qi, self.Qi_plus_1 = tf.split(axis=0, num_or_size_splits=2, value=self.q_tilde)
    self.V, _ = tf.split(axis=0, num_or_size_splits=2,
                         value=self.local_network.output_layer_v)
    self.log_pi, _ = tf.split(
        axis=0, num_or_size_splits=2,
        value=tf.expand_dims(self.local_network.log_output_selected_action, 1))

    self.R = tf.placeholder('float32', [None], name='1-step_reward')
    self.terminal_indicator = tf.placeholder(tf.float32, [None], name='terminal_indicator')
    self.max_TQ = self.gamma * tf.reduce_max(self.Qi_plus_1, 1) * (1 - self.terminal_indicator)
    self.Q_a = tf.reduce_sum(
        self.Qi * tf.split(axis=0, num_or_size_splits=2,
                           value=self.local_network.selected_action_ph)[0], 1)
    self.q_objective = -self.pgq_fraction * tf.reduce_mean(
        tf.stop_gradient(self.R + self.max_TQ - self.Q_a)
        * (self.V[:, 0] + self.log_pi[:, 0]))

    self.V_params = self.local_network.params
    self.q_gradients = tf.gradients(self.q_objective, self.V_params)

    if self.local_network.clip_norm_type == 'global':
        self.q_gradients = tf.clip_by_global_norm(self.q_gradients,
                                                  self.local_network.clip_norm)[0]
    elif self.local_network.clip_norm_type == 'local':
        self.q_gradients = [tf.clip_by_norm(g, self.local_network.clip_norm)
                            for g in self.q_gradients]

    if self.optimizer_mode == "local":
        if self.optimizer_type == "rmsprop":
            self.batch_opt_st = np.ones(size, dtype=ctypes.c_float)
        else:
            self.batch_opt_st = np.zeros(size, dtype=ctypes.c_float)
    elif self.optimizer_mode == "shared":
        self.batch_opt_st = args.batch_opt_state