def test_training_flag():
    obs_size = (3, 4)

    obs0 = np.random.random(obs_size)
    terminal0 = False

    obs1 = np.random.random(obs_size)
    terminal1 = True

    obs2 = np.random.random(obs_size)
    terminal2 = False

    for training in (True, False):
        memory = SequentialMemory(3, window_length=2)

        state = memory.get_recent_state(obs0)
        assert state.shape == (2,) + obs_size
        assert np.allclose(state[0], 0.)
        assert np.all(state[1] == obs0)
        assert memory.nb_entries == 0

        memory.append(obs0, 0, 0., terminal1, training=training)
        state = memory.get_recent_state(obs1)
        assert state.shape == (2,) + obs_size
        assert np.all(state[0] == obs0)
        assert np.all(state[1] == obs1)
        if training:
            assert memory.nb_entries == 1
        else:
            assert memory.nb_entries == 0

        memory.append(obs1, 0, 0., terminal2, training=training)
        state = memory.get_recent_state(obs2)
        assert state.shape == (2,) + obs_size
        assert np.allclose(state[0], 0.)
        assert np.all(state[1] == obs2)
        if training:
            assert memory.nb_entries == 2
        else:
            assert memory.nb_entries == 0
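# Note: with training=False, append() only updates the short-term window that backs
# get_recent_state(); nothing is written to the replay buffer, which is why nb_entries
# stays at 0 in that branch above.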
               enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2,
               policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Load memory if possible.
if args.load_memory == "agent":
    if not os.path.isfile(MEMORY_FILE):
        env.close()
        del env
        exit(0)
    with open(MEMORY_FILE, 'rb') as handle:
        memory_list = pickle.load(handle)
    for obs, a, r, done, training in memory_list:
        memory.append(obs, a, r, done, training)

# Okay, now it's time to learn something! Visualizing the training slows it down quite a
# lot, so it is disabled here. You can always safely abort training prematurely with
# Ctrl + C.
dqn.fit(env, nb_steps=args.fit_step, visualize=False, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights(args.out_dir + '/duel_dqn_{}_weights.h5f'.format(CONFIG_FILE[7:-5]),
                 overwrite=True)

env.close()
del env

df = pd.DataFrame()
def test_get_recent_state_with_episode_boundaries():
    memory = SequentialMemory(3, window_length=2, ignore_episode_boundaries=False)
    obs_size = (3, 4)

    obs0 = np.random.random(obs_size)
    terminal0 = False

    obs1 = np.random.random(obs_size)
    terminal1 = False

    obs2 = np.random.random(obs_size)
    terminal2 = False

    obs3 = np.random.random(obs_size)
    terminal3 = True

    obs4 = np.random.random(obs_size)
    terminal4 = False

    obs5 = np.random.random(obs_size)
    terminal5 = True

    obs6 = np.random.random(obs_size)
    terminal6 = False

    state = memory.get_recent_state(obs0)
    assert state.shape == (2,) + obs_size
    assert np.allclose(state[0], 0.)
    assert np.all(state[1] == obs0)

    # memory.append takes the current observation, the reward after taking an action, and
    # whether the *new* observation is terminal; thus `obs0` and `terminal1` are correct.
    memory.append(obs0, 0, 0., terminal1)
    state = memory.get_recent_state(obs1)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == obs0)
    assert np.all(state[1] == obs1)

    memory.append(obs1, 0, 0., terminal2)
    state = memory.get_recent_state(obs2)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == obs1)
    assert np.all(state[1] == obs2)

    memory.append(obs2, 0, 0., terminal3)
    state = memory.get_recent_state(obs3)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == obs2)
    assert np.all(state[1] == obs3)

    memory.append(obs3, 0, 0., terminal4)
    state = memory.get_recent_state(obs4)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == np.zeros(obs_size))
    assert np.all(state[1] == obs4)

    memory.append(obs4, 0, 0., terminal5)
    state = memory.get_recent_state(obs5)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == obs4)
    assert np.all(state[1] == obs5)

    memory.append(obs5, 0, 0., terminal6)
    state = memory.get_recent_state(obs6)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == np.zeros(obs_size))
    assert np.all(state[1] == obs6)
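# Note: with ignore_episode_boundaries=False (the default), get_recent_state() zero-pads
# instead of letting a window span two episodes, which is why state[0] is all zeros right
# after each terminal transition above.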
def test_sampling():
    memory = SequentialMemory(100, window_length=2, ignore_episode_boundaries=False)
    obs_size = (3, 4)
    actions = range(5)

    obs0 = np.random.random(obs_size)
    terminal0 = False
    action0 = np.random.choice(actions)
    reward0 = np.random.random()

    obs1 = np.random.random(obs_size)
    terminal1 = False
    action1 = np.random.choice(actions)
    reward1 = np.random.random()

    obs2 = np.random.random(obs_size)
    terminal2 = False
    action2 = np.random.choice(actions)
    reward2 = np.random.random()

    obs3 = np.random.random(obs_size)
    terminal3 = True
    action3 = np.random.choice(actions)
    reward3 = np.random.random()

    obs4 = np.random.random(obs_size)
    terminal4 = False
    action4 = np.random.choice(actions)
    reward4 = np.random.random()

    obs5 = np.random.random(obs_size)
    terminal5 = False
    action5 = np.random.choice(actions)
    reward5 = np.random.random()

    obs6 = np.random.random(obs_size)
    terminal6 = False
    action6 = np.random.choice(actions)
    reward6 = np.random.random()

    # memory.append takes the current observation, the reward after taking an action, and
    # whether the *new* observation is terminal; thus `obs0` and `terminal1` are correct.
    memory.append(obs0, action0, reward0, terminal1)
    memory.append(obs1, action1, reward1, terminal2)
    memory.append(obs2, action2, reward2, terminal3)
    memory.append(obs3, action3, reward3, terminal4)
    memory.append(obs4, action4, reward4, terminal5)
    memory.append(obs5, action5, reward5, terminal6)
    assert memory.nb_entries == 6

    experiences = memory.sample(batch_size=5, batch_idxs=[0, 1, 2, 3, 4])
    assert len(experiences) == 5

    assert_allclose(experiences[0].state0, np.array([np.zeros(obs_size), obs0]))
    assert_allclose(experiences[0].state1, np.array([obs0, obs1]))
    assert experiences[0].action == action0
    assert experiences[0].reward == reward0
    assert experiences[0].terminal1 is False

    assert_allclose(experiences[1].state0, np.array([obs0, obs1]))
    assert_allclose(experiences[1].state1, np.array([obs1, obs2]))
    assert experiences[1].action == action1
    assert experiences[1].reward == reward1
    assert experiences[1].terminal1 is False

    assert_allclose(experiences[2].state0, np.array([obs1, obs2]))
    assert_allclose(experiences[2].state1, np.array([obs2, obs3]))
    assert experiences[2].action == action2
    assert experiences[2].reward == reward2
    assert experiences[2].terminal1 is True

    # The next experience has been re-sampled, since its state0 would otherwise be terminal,
    # in which case there is no meaningful transition because the environment gets reset. We
    # thus just ensure that state0 is not terminal.
    assert not np.all(experiences[3].state0 == np.array([obs2, obs3]))

    assert_allclose(experiences[4].state0, np.array([np.zeros(obs_size), obs4]))
    assert_allclose(experiences[4].state1, np.array([obs4, obs5]))
    assert experiences[4].action == action4
    assert experiences[4].reward == reward4
    assert experiences[4].terminal1 is False
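# The append convention exercised above can be summarized with a small fill loop. This is
# a hypothetical sketch, not part of the test suite: `env` is assumed to be a gym-style
# environment returning (obs, reward, done, info) from step(), and `policy_fn` any function
# mapping a recent-state window to an action.
def fill_memory_sketch(env, policy_fn, nb_steps=100):
    memory = SequentialMemory(nb_steps, window_length=2)
    obs = env.reset()
    for _ in range(nb_steps):
        action = policy_fn(memory.get_recent_state(obs))
        next_obs, reward, done, _ = env.step(action)
        # `done` describes next_obs, matching the append() convention tested above.
        memory.append(obs, action, reward, done)
        obs = env.reset() if done else next_obs
    return memory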
# Warm up: run the policy-iteration policy for a few episodes to prime the memory.
pi = None
for p in policy_iteration_iterator(10, 0.5, file_path="/tmp/state_table.csv",
                                   save_path="/tmp/OSCAR/"):
    pi = p

for i in range(20):
    obs = env.reset()
    while True:
        s = state_from_obs(obs)
        a = pi[s.id()]
        old_obs = obs
        obs, r, done, debug_dict = env.step(a)
        # Note: the fifth positional argument of keras-rl's Memory.append is `training`;
        # passing False means these transitions only update the recent-observation window,
        # not the replay buffer.
        memory.append(old_obs, a, r, done, False)
        if done:
            break

env.close()
env = GeneralLearningEnv(CONFIG_FILE, False, log_file_path=LOG_FILE, publish_stats=False)

# Okay, now it's time to learn something! Visualizing the training slows it down quite a
# lot, so it is disabled here. You can always safely abort training prematurely with
# Ctrl + C.
dqn.fit(env, nb_steps=500000, visualize=False, verbose=2)

# After training is done, we save the final weights.
class MACE(Agent):
    def __init__(self, env: gym.Env, **kwargs):
        super(MACE, self).__init__(**kwargs)
        self.nb_actions = env.action_space.shape[0]

        obs_input_actor = Input(shape=(1,) + env.observation_space.shape,
                                name='observation_input')
        x_ac = Flatten()(obs_input_actor)
        x_ac = Dense(units=256, activation='relu')(x_ac)

        obs_input_critic = Input(shape=(1,) + env.observation_space.shape,
                                 name='observation_input')
        x_cr = Flatten()(obs_input_critic)
        x_cr = Dense(units=256, activation='relu')(x_cr)

        x_critic = Dense(units=128, activation='relu')(x_cr)
        value = Dense(units=1)(x_critic)

        x_actor = Dense(units=128, activation='relu')(x_ac)
        action = Dense(units=self.nb_actions, activation='tanh')(x_actor)

        actor = Model(inputs=obs_input_actor, outputs=action)
        critic = Model(inputs=obs_input_critic, outputs=value)

        metrics = []
        metrics += [mean_q]
        critic_metrics = metrics

        critic_optimizer = Adam(lr=1e-3)
        actor_optimizer = Adam(lr=1e-3)
        # critic_optimizer = SGD(lr=1e-4, momentum=0.9)
        # actor_optimizer = SGD(lr=1e-3, momentum=0.9)

        self.actor = actor
        self.critic = critic
        self.target_actor = clone_model(self.actor)
        self.target_actor.compile(optimizer='sgd', loss='mse')
        self.target_critic = clone_model(self.critic)
        self.target_critic.compile(optimizer='sgd', loss='mse')

        self.target_model_update = 1e-3
        # self.target_model_update = 500
        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            critic_updates = get_soft_target_model_updates(
                self.target_critic, self.critic, self.target_model_update)
            critic_optimizer = AdditionalUpdatesOptimizer(critic_optimizer, critic_updates)
            actor_updates = get_soft_target_model_updates(
                self.target_actor, self.actor, self.target_model_update)
            actor_optimizer = AdditionalUpdatesOptimizer(actor_optimizer, actor_updates)

        self.delta_clip = np.inf

        def clipped_error(y_true, y_pred):
            return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

        actor.compile(actor_optimizer, loss='mse')
        critic.compile(critic_optimizer, loss='mse', metrics=critic_metrics)
        self.compiled = True

        self.memory = SequentialMemory(limit=100000, window_length=1)
        self.memory_interval = 1
        self.memory_actor = SequentialMemory(limit=100000, window_length=1)
        self.memory_critic = SequentialMemory(limit=100000, window_length=1)
        self.nb_steps_warmup = 50000
        self.train_interval = 4
        self.batch_size = 64
        self.gamma = 0.99
        self.processor = None
        self.random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3,
                                                       size=self.nb_actions)
        self.eps = 0.9

    def process_state_batch(self, batch):
        batch = np.array(batch)
        if self.processor is None:
            return batch
        return self.processor.process_state_batch(batch)

    def select_action(self, state):
        batch = [state]
        action = self.actor.predict_on_batch(np.asarray(batch)).flatten()

        # Apply noise, if a random process is set.
        if self.training and self.random_process is not None:
            # Actor exploration (Bernoulli variant, disabled):
            # rd = np.random.rand()
            # if rd < self.eps:
            noise = self.random_process.sample()
            assert noise.shape == action.shape
            action += noise
            #     self.action_exploration = True
            # else:
            #     self.action_exploration = False
        return action

    def forward(self, observation):
        # Select an action.
        state = self.memory.get_recent_state(observation)
        action = self.select_action(state)  # TODO: move this into policy

        # Book-keeping.
        self.recent_observation = observation
        self.recent_action = action

        return action

    def backward(self, reward, terminal=False):
        # Store the most recent experience in memory.
        if self.step % self.memory_interval == 0:
            self.memory.append(self.recent_observation, self.recent_action, reward, terminal,
                               training=self.training)

        metrics = [np.nan for _ in self.metrics_names]
        if not self.training:
            # We're done here. No need to update the experience memory since we only use the
            # working memory to obtain the state over the most recent observations.
            return metrics

        # Train the networks on a single stochastic batch.
        can_train_either = self.step > self.nb_steps_warmup
        if can_train_either and self.step % self.train_interval == 0:
            experiences = self.memory.sample(self.batch_size)
            assert len(experiences) == self.batch_size

            # Start by extracting the necessary parameters (we use a vectorized implementation).
            state0_batch = []
            reward_batch = []
            action_batch = []
            terminal1_batch = []
            state1_batch = []
            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                terminal1_batch.append(0. if e.terminal1 else 1.)

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            action_batch = np.array(action_batch)
            assert reward_batch.shape == (self.batch_size,)
            assert terminal1_batch.shape == reward_batch.shape
            assert action_batch.shape == (self.batch_size, self.nb_actions)

            # Update actor and critic, if warm-up is over.
            if self.step > self.nb_steps_warmup:
                if len(self.critic.inputs) >= 3:
                    state1_batch_with_action = state1_batch[:]
                else:
                    state1_batch_with_action = [state1_batch]
                target_q_values = self.target_critic.predict_on_batch(
                    state1_batch_with_action).flatten()
                assert target_q_values.shape == (self.batch_size,)

                # Compute r_t + gamma * V(s_t+1) as the critic target. (The critic here is a
                # state-value network with a single output, so there is no max over actions.)
                discounted_reward_batch = self.gamma * target_q_values
                discounted_reward_batch *= terminal1_batch
                assert discounted_reward_batch.shape == reward_batch.shape
                targets = (reward_batch + discounted_reward_batch).reshape(self.batch_size, 1)

                # Perform a single batch update on the critic network.
                if len(self.critic.inputs) >= 3:
                    state0_batch_with_action = state0_batch[:]
                else:
                    state0_batch_with_action = [state0_batch]
                # state0_batch_with_action.insert(self.critic_action_input_idx, action_batch)
                metrics = self.critic.train_on_batch(state0_batch_with_action, targets)
                if self.processor is not None:
                    metrics += self.processor.metrics

            # Actor: sample a fresh batch.
            experiences = self.memory.sample(self.batch_size)
            assert len(experiences) == self.batch_size

            # Start by extracting the necessary parameters (we use a vectorized implementation).
            state0_batch = []
            reward_batch = []
            action_batch = []
            terminal1_batch = []
            state1_batch = []
            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                terminal1_batch.append(0. if e.terminal1 else 1.)

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            action_batch = np.array(action_batch)
            assert reward_batch.shape == (self.batch_size,)
            assert terminal1_batch.shape == reward_batch.shape
            assert action_batch.shape == (self.batch_size, self.nb_actions)

            if self.step > self.nb_steps_warmup:
                # Actor: train towards the taken action only where the temporal-difference
                # error is positive. Rewrap the inputs for the freshly sampled batch (the
                # original reused the wrapped variables from the critic batch above, which
                # is presumably an oversight).
                if len(self.critic.inputs) >= 3:
                    state1_batch_with_action = state1_batch[:]
                    state0_batch_with_action = state0_batch[:]
                else:
                    state1_batch_with_action = [state1_batch]
                    state0_batch_with_action = [state0_batch]

                target_q_values1 = self.target_critic.predict_on_batch(
                    state1_batch_with_action).flatten()
                discounted_reward_batch = self.gamma * target_q_values1
                discounted_reward_batch *= terminal1_batch
                targets = reward_batch + discounted_reward_batch

                target_q_values0 = self.target_critic.predict_on_batch(
                    state0_batch_with_action).flatten()
                delta = targets - target_q_values0

                if len(self.actor.inputs) >= 2:
                    inputs = state0_batch[:]
                else:
                    # inputs = [state0_batch]
                    inputs = state0_batch

                pos_dif = delta > 0
                # if self.step % 1000 == 0:
                #     print(np.sum(pos_dif))
                inputs = np.asarray(inputs)[pos_dif]
                actions_target = action_batch[pos_dif]
                # state0_batch_with_action.insert(self.critic_action_input_idx, action_batch)
                self.actor.train_on_batch(inputs, actions_target)

        if self.target_model_update >= 1 and self.step % self.target_model_update == 0:
            self.update_target_models_hard()

        return metrics

    def reset_states(self):
        if self.random_process is not None:
            self.random_process.reset_states()
        self.recent_action = None
        self.recent_observation = None
        if self.compiled:
            self.actor.reset_states()
            self.critic.reset_states()
            self.target_actor.reset_states()
            self.target_critic.reset_states()

    def update_target_models_hard(self):
        self.target_critic.set_weights(self.critic.get_weights())
        self.target_actor.set_weights(self.actor.get_weights())

    @property
    def metrics_names(self):
        names = self.critic.metrics_names[:]
        if self.processor is not None:
            names += self.processor.metrics_names[:]
        return names
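# A minimal driver sketch for the agent above (an assumption, not part of the original:
# any continuous-control gym environment should work, e.g. 'Pendulum-v0'). keras-rl's
# Agent.fit() drives the forward()/backward() methods defined above.
if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    agent = MACE(env)
    agent.fit(env, nb_steps=100000, visualize=False, verbose=2)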
class DDPG(object):
    def __init__(self, n_state, log_writer, args):
        self.n_state = n_state
        self.log_writer = log_writer
        self.output = args.output

        self.action_start = args.action_start
        self.action_end = args.action_end
        self.n_action = self.action_end - self.action_start + 1

        # Create actor and critic networks.
        net_config = {
            'n_state': self.n_state,
            'n_action': self.n_action,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2
        }
        self.actor = Actor(**net_config)
        self.actor_target = Actor(**net_config)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.lr_a)

        self.critic = Critic(**net_config)
        self.critic_target = Critic(**net_config)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr_c)

        # Make sure the targets start with the same weights.
        self.hard_update(self.actor_target, self.actor)
        self.hard_update(self.critic_target, self.critic)

        # Create the replay buffer.
        self.memory = SequentialMemory(size=args.rmsize)

        # Hyper-parameters.
        self.batch_size = args.bsize
        self.discount = args.discount
        self.tau = args.tau

        # TODO: exploration noise?

        if torch.cuda.is_available():
            self.cuda()

        # Moving-average baseline.
        self.moving_average = None
        self.moving_alpha = args.moving_alpha

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def random_action(self):
        return random.randint(self.action_start, self.action_end)

    def select_action(self, state):
        action_prob = to_numpy(self.actor(to_tensor(state.reshape(1, -1)))).squeeze(0)
        dice = stats.rv_discrete(
            values=(range(self.action_start, self.action_end + 1), action_prob))
        action = dice.rvs(size=1)
        return action[0]

    def get_exact_action(self, state_batch, kind):
        if kind == 0:
            action_prob = self.actor_target(state_batch)
        else:
            action_prob = self.actor(state_batch)
        max_val, prediction = torch.max(action_prob, 1)
        prediction = prediction.reshape(self.batch_size, -1).float()
        return prediction / self.n_action

    def update_policy(self):
        # Sample a batch.
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)
        action_batch = (action_batch - self.action_start) / self.n_action

        # Normalize the reward with a moving-average baseline.
        batch_mean_reward = reward_batch.mean().item()
        if self.moving_average is None:
            self.moving_average = batch_mean_reward
        else:
            self.moving_average += self.moving_alpha * (batch_mean_reward -
                                                        self.moving_average)
        reward_batch -= self.moving_average

        # Update the critic. (`criterion` is assumed to be defined at module level,
        # e.g. criterion = nn.MSELoss().)
        self.critic.zero_grad()
        q_batch = self.critic([state_batch, action_batch])
        with torch.no_grad():
            # Prepare the target Q batch.
            next_q_values = self.critic_target(
                [next_state_batch, self.get_exact_action(next_state_batch, 0)])
            target_q_batch = reward_batch + self.discount * terminal_batch * next_q_values
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Update the actor.
        self.actor.zero_grad()
        policy_loss = -self.critic([state_batch, self.get_exact_action(state_batch, 1)])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target network updates.
        self.soft_update(self.actor_target, self.actor)
        self.soft_update(self.critic_target, self.critic)

    def hard_update(self, target, source):
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(source_param.data)

    def soft_update(self, target, source):
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    source_param.data * self.tau)

    def append_replay(self, s_t, a_t, r_t, done):
        self.memory.append(s_t, a_t, r_t, done)

    def save_model(self, num):
        torch.save(self.actor.state_dict(), '{}/actor-{}.pkl'.format(self.output, num))
        torch.save(self.critic.state_dict(), '{}/critic-{}.pkl'.format(self.output, num))

    def load_weights(self, state_dir, num):
        self.actor.load_state_dict(
            torch.load('{}/actor-{}.pkl'.format(state_dir, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic-{}.pkl'.format(state_dir, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor-{}.pkl'.format(state_dir, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic-{}.pkl'.format(state_dir, num)))
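# Sketch of the intended interaction loop (hypothetical, not part of the original file;
# `env` is assumed to be a gym-style environment with discrete actions in
# [action_start, action_end]).
def train_sketch(agent, env, nb_episodes=10, warmup=100):
    step = 0
    for _ in range(nb_episodes):
        state, done = env.reset(), False
        while not done:
            # Act randomly until the replay buffer holds enough transitions.
            if step < warmup:
                action = agent.random_action()
            else:
                action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.append_replay(state, action, reward, done)
            if step >= warmup:
                agent.update_policy()
            state = next_state
            step += 1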
class deepAMDP():
    def __init__(self, inputDim=16, alpha=1e-4, gamma=0.99, epsilon=0.1, numberOfActions=0,
                 tau=1e-1):
        self.predictionModel = Sequential()
        self.predictionModel.add(Dense(16, input_dim=inputDim, activation='relu'))
        self.predictionModel.add(Dense(16, activation='relu'))
        self.predictionModel.add(Dense(numberOfActions, activation='linear'))
        self.predictionModel.compile(loss="mse", optimizer=Adam(lr=alpha))

        # The target model mirrors the prediction model. (The original first layer was
        # missing its 'relu' activation, which would make the two networks diverge.)
        self.targetModel = Sequential()
        self.targetModel.add(Dense(16, input_dim=inputDim, activation='relu'))
        self.targetModel.add(Dense(16, activation='relu'))
        self.targetModel.add(Dense(numberOfActions, activation="linear"))
        self.targetModel.compile(loss="mse", optimizer=Adam(lr=alpha))

        self.memory = SequentialMemory(limit=100000, window_length=1)
        self.otherMemory = deque(maxlen=2000)

        self.numberOfActions = numberOfActions
        self.alpha = alpha
        self.gamma = gamma
        self.tau = tau
        self.epsilon = epsilon

        self.initialEpsilon = 0.1
        self.finalEpsilon = 0.01
        self.currentEpsilon = self.initialEpsilon
        self.episodesToDecay = 500

    def addExperience(self, latentState, action, reward, done):
        self.memory.append(latentState, action, reward, done)

    def memorize(self, state, action, reward, next_state, done):
        self.otherMemory.append((state, action, reward, next_state, done))

    def action(self, state):
        # Epsilon-greedy action selection.
        if np.random.random() < self.currentEpsilon:
            return np.random.randint(self.numberOfActions)
        state = state.reshape(1, -1)
        qValues = self.predictionModel.predict(state)
        return np.argmax(qValues[0])

    def replay(self, batchSize=8):
        # if len(self.memory) < batchSize:
        #     return
        experiences = self.memory.sample(batchSize)

        # Start by extracting the necessary parameters (we use a vectorized implementation).
        state0Batch = []
        rewardBatch = []
        actionBatch = []
        terminal1Batch = []
        state1Batch = []
        for e in experiences:
            state0Batch.append(e.state0[0])
            state1Batch.append(e.state1[0])
            rewardBatch.append(e.reward)
            actionBatch.append(e.action)
            terminal1Batch.append(0. if e.terminal1 else 1.)
        state0Batch = np.array(state0Batch)
        rewardBatch = np.array(rewardBatch)
        actionBatch = np.array(actionBatch)
        terminal1Batch = np.array(terminal1Batch)
        state1Batch = np.array(state1Batch)
        # state0Batch = normalize(state0Batch, axis=-1)
        # state1Batch = normalize(state1Batch, axis=-1)

        targetQValues = self.targetModel.predict_on_batch(state1Batch)
        qBatch = np.max(targetQValues, axis=1).flatten()

        # Base the targets on the prediction network's current estimates for state0 so that
        # only the entry of the taken action produces a learning signal; that entry is then
        # overwritten with r + gamma * max_a' Q_target(s', a'). (The original used the
        # next-state predictions `targetQValues` as the base, which would also drag the
        # Q-values of non-taken actions toward Q(s').)
        targets = self.predictionModel.predict_on_batch(state0Batch)
        discountedRewardBatch = self.gamma * qBatch
        discountedRewardBatch *= terminal1Batch
        Rs = rewardBatch + discountedRewardBatch
        for (target, r, action) in zip(targets, Rs, actionBatch):
            target[action] = r

        self.predictionModel.fit(state0Batch, targets, verbose=0)
        # self.updateTargetModel()

    def otherReplay(self, batch_size=8):
        minibatch = random.sample(self.otherMemory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            state = state.reshape(1, -1)
            next_state = next_state.reshape(1, -1)
            if not done:
                target = (reward +
                          self.gamma * np.amax(self.predictionModel.predict(next_state)[0]))
            target_f = self.predictionModel.predict(state)
            target_f[0][action] = target
            self.predictionModel.fit(state, target_f, epochs=1, verbose=0)
        # self.updateTargetModel()

    def updateTargetModel(self):
        # Polyak averaging: target <- tau * prediction + (1 - tau) * target.
        predictionWeights = self.predictionModel.get_weights()
        targetWeights = self.targetModel.get_weights()
        for i in range(len(targetWeights)):
            targetWeights[i] = (predictionWeights[i] * self.tau +
                                targetWeights[i] * (1 - self.tau))
        self.targetModel.set_weights(targetWeights)
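# Minimal driver sketch (hypothetical names, not part of the original: `env` is a
# gym-style environment and `encode` maps raw observations to the latent states the
# AMDP operates on).
def run_episode_sketch(agent, env, encode):
    obs, done = env.reset(), False
    while not done:
        z = encode(obs)
        a = agent.action(z)
        obs, r, done, _ = env.step(a)
        agent.addExperience(z, a, r, done)
    agent.replay(batchSize=8)       # one gradient step per episode
    agent.updateTargetModel()       # Polyak-average the target network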