class Agent:
    def __init__(self, args, state_dim, action_dim):
        self.h = args.hyper
        self.mode = 'observe'
        self.args = args
        self.metrics = Metrics()
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.memory = Memory(self.h.memory_size)
        self.run_count = -1
        self.replay_count = -1
        self.save_iterator = -1
        self.update_iterator = -1

        if self.args.directory == 'default':
            self.args.directory = G.CUR_FOLDER

        results_location = G.RESULT_FOLDER_FULL + '/' + self.args.directory
        data_location = G.DATA_FOLDER_FULL + '/' + self.args.directory
        os.makedirs(results_location, exist_ok=True)  # Generates results folder
        os.makedirs(data_location, exist_ok=True)     # Generates data folder
        self.results_location = results_location + '/'
        self.data_location = data_location + '/'

    def act(self, s):
        return random.randrange(0, self.action_dim)

    def observe(self, sample):
        self.memory.add(sample)

    def replay(self, debug=False):
        pass
def test_sampling(self):
    mem = Memory(10)
    self.assertEqual(len(mem), 0)
    for i in range(10):
        mem.add(random())
    self.assertEqual(len(mem), 10)
    sample = mem.sample(5)
    self.assertEqual(len(sample), 5)
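# A minimal sketch of the kind of Memory the test above assumes: a
# capacity-bounded buffer with add(), sample(), and __len__(). The deque-based
# implementation below is illustrative only, not the project's actual class.
import random
from collections import deque


class Memory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest samples are evicted first

    def add(self, sample):
        self.buffer.append(sample)

    def sample(self, n):
        # uniform sampling without replacement
        return random.sample(list(self.buffer), min(n, len(self.buffer)))

    def __len__(self):
        return len(self.buffer)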
def main():
    clock = pygame.time.Clock()
    N_EPOCHS = 1000
    GAMMA = 0.99
    N_BIRD = 64
    S_BATCH = 256

    env = FlappyBird(N_BIRD)
    main_model = Model()
    target_model = Model()
    memory = Memory()
    agent = Agent()

    for epoch in range(1, N_EPOCHS + 1):
        print('Epoch: {}'.format(epoch))
        env.reset()
        states, rewards, finished = env.random_step()
        target_model.model.set_weights(main_model.model.get_weights())
        running = True
        while running:
            clock.tick(60)
            actions = []
            for state in states:
                actions.append(agent.get_action(state, epoch, main_model))
            next_states, rewards, finished = env.step(actions)
            for state, reward, action, next_state in zip(
                    states, rewards, actions, next_states):
                memory.add((state, action, reward, next_state))
            states = next_states
            if len(memory.buffer) % S_BATCH == 0:
                main_model.replay(memory, env.n_bird, GAMMA, target_model)
                target_model.model.set_weights(main_model.model.get_weights())
            if not len(env.birds):
                running = False
                break
            env.draw()
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    running = False
        print('\tScore: {}'.format(env.score))
    pygame.quit()
    env = FlappyBird()
class RandomAgent:
    def __init__(self, actionsCount, memory_capacity):
        self.memory = Memory(memory_capacity)
        self.actionsCount = actionsCount

    def act(self, state):
        return random.randint(0, self.actionsCount - 1)

    def observe(self, sample):  # Sample = (s, a, r, s')
        self.memory.add(sample)

    def replay(self):
        pass
class Agent:
    def __init__(self, num_states, num_actions, eps_min=0.05, eps_max=1, lam=1e-3):
        self.num_states = num_states
        self.num_actions = num_actions
        self.eps_min = eps_min
        self.eps_max = eps_max
        self.lam = lam
        self.brain = Brain(num_states, num_actions)
        self.memory = Memory(MEMORY_CAPACITY)
        self.step = 0

    def act(self, s):
        if random.random() < self.eps:
            return random.randint(0, self.num_actions - 1)
        else:
            return np.argmax(self.brain.predict_one(s))

    def observe(self, sars_):
        '''Takes in a sample of the environment, (s, a, r, s_), and adds it to
        the replay memory.'''
        self.step += 1
        self.memory.add(sars_)

    def replay(self):
        batch = self.memory.sample(BATCH_SIZE)
        states = batch[0]
        actions = batch[1]
        rewards = batch[2]
        states_ = batch[3]

        p = self.brain.predict(states)
        p_ = self.brain.predict(np.nan_to_num(states_))

        t = np.copy(p)
        t[:, actions] = rewards
        # Bootstrap only from non-terminal next states (terminal states are
        # stored as all-NaN rows).
        real_state = ~np.isnan(states_).any(axis=1)
        t[real_state, actions] += GAMMA * np.amax(p_, axis=1)[real_state]
        self.brain.train(states, t)

    @property
    def eps(self):
        # Exponential decay from eps_max down to eps_min
        return self.eps_min + (self.eps_max - self.eps_min) * np.exp(-self.lam * self.step)
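# With the schedule above, epsilon decays exponentially from eps_max toward
# eps_min as steps accumulate. A standalone check using the constructor
# defaults (eps_min=0.05, eps_max=1, lam=1e-3); values are illustrative only.
import numpy as np

eps_min, eps_max, lam = 0.05, 1.0, 1e-3


def eps(step):
    # same formula as the Agent.eps property above
    return eps_min + (eps_max - eps_min) * np.exp(-lam * step)


for step in (0, 1000, 5000):
    print(step, round(eps(step), 3))  # 1.0, 0.399, 0.056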
def main():
    p = psutil.Process(os.getpid())
    mem_size = 10000000
    memory = Memory(mem_size)
    agent = train.NNAgent((10, 24), 6)
    info = {
        "board": np.zeros((24, 10), dtype=np.int8),
    }
    for _ in range(mem_size):
        memory.add(info, info, 0)

    for i in range(10000000000):
        start = time.time()
        agent.train(memory, 1 << 14)
        end = time.time()
        rss = p.memory_info().rss / 1024 / 1024  # resident set size in MB
        duration = end - start
        print(f"{i}: Memory: {rss:.1f}MB, Duration (sec): {duration:.1f}")
class Agent:
    def __init__(self, device, key, actor_model, n_step):
        self.DEVICE = device
        self.KEY = key

        # NEURAL MODEL
        self.actor_model = actor_model

        # MEMORY
        self.memory = Memory()

        # HYPERPARAMETERS
        self.N_STEP = n_step

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.DEVICE)

        self.actor_model.eval()
        with torch.no_grad():
            action, log_prob, _ = self.actor_model(state)
        self.actor_model.train()

        action = action.cpu().detach().numpy().item()
        log_prob = log_prob.cpu().detach().numpy().item()

        return action, log_prob

    def step(self, actor_state, critic_state, action, log_prob, reward):
        self.memory.add(actor_state, critic_state, action, log_prob, reward)
# field()/__post_init__ below require the dataclasses decorator
@dataclass
class DoomDDdqN:
    """
    Deep Q Network model for doom.

    Parameters
    ----------
    lr: float
        Learning rate
    gamma: float
        Discounting factor for future rewards
    eps: float
        Explore-exploit tradeoff for agent actions
    min_eps: float
        Minimum value for epsilon
    max_eps: float
        Maximum value for epsilon
    name: str, default = 'DoomDqNet'
        Variable for tf namescope
    state_size: list, default = [100, 120, 4]
        Shape of input stack
    max_tau: int
        Max C step in updating the target network
    """
    lr: float = 0.0002
    gamma: float = 0.99
    eps: float = 0.00005
    min_eps: float = 0.01
    max_eps: float = 1.0
    memory_size: int = 100000
    name: str = 'DoomDDQN'
    state_size: list = field(default_factory=get_state_size)
    action_size = 7
    max_tau: int = 10000

    def __post_init__(self):
        self.build_model()
        self.memory = Memory(self.memory_size)
        self.setup_writer()

    def build_model(self):
        """ Builds the Networks to use in training """
        with tf.compat.v1.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            self.inputs = tf.compat.v1.placeholder(
                tf.float32, (None, *self.state_size), name='inputs')
            self.ISweights = tf.compat.v1.placeholder(
                tf.float32, (None, 1), name='ISweights')
            self.actions = tf.compat.v1.placeholder(
                tf.float32, (None, self.action_size), name='actions')
            self.target_Q = tf.compat.v1.placeholder(
                tf.float32, (None), name='target')
            self.build_conv_net()

    def build_conv_net(self):
        """ Creates the model's layers and variables """
        conv_one = tf.layers.conv2d(
            inputs=self.inputs,
            filters=32,
            strides=[4, 4],
            kernel_size=(8, 8),
            padding='valid',
            kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
            name='conv_one'
        )
        conv_one_out = tf.nn.relu(features=conv_one, name='conv_one_out')
        conv_two = tf.layers.conv2d(
            inputs=conv_one_out,
            filters=64,
            kernel_size=(4, 4),
            strides=(2, 2),
            padding='valid',
            kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
            name='conv_two'
        )
        conv_two_out = tf.nn.relu(
            features=conv_two,
            name='conv_two'
        )
        conv_three = tf.layers.conv2d(
            inputs=conv_two_out,
            filters=128,
            kernel_size=(4, 4),
            strides=(2, 2),
            padding='valid',
            kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
            name='conv_three'
        )
        conv_three_out = tf.nn.relu(features=conv_three, name='conv_three_out')
        flatten = tf.layers.flatten(conv_three_out)
        self.separate_to_streams(flatten)
        self.aggregate()

    def separate_to_streams(self, flatten):
        """ Creates the Value(s) and Advantage(s, a) layers """
        value_fc = tf.layers.dense(
            inputs=flatten,
            activation=tf.nn.relu,
            units=512,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name='value_fc'
        )
        self.value = tf.layers.dense(
            inputs=value_fc,
            units=1,
            activation=None,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name='value'
        )
        advantg_fc = tf.layers.dense(
            inputs=flatten,
            activation=tf.nn.relu,
            units=512,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name='advantg_fc')
        self.advantg = tf.layers.dense(
            inputs=advantg_fc,
            activation=None,
            units=self.action_size,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name='advantage')

    def _dense(self, inputs, units, activation=None, name='', **kwargs):
        """ Returns a tf dense layer of specified args """
        return tf.layers.dense(
            inputs=inputs,
            units=units,
            activation=activation,
            kernel_initializer=kwargs.get('initializer') or tf.contrib.layers.xavier_initializer(),
            name=name
        )

    def aggregate(self):
        """ Defines output and loss """
        # Q(s, a) := V(s) + A(s, a) - 1/|A| * sum[A(s, a')]
        self.output = self.value + tf.subtract(
            self.advantg,
            tf.reduce_mean(self.advantg, axis=1, keepdims=True))
        # Predicted Q
        self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions))
        self.abs_errors = tf.abs(self.target_Q - self.Q)
        self.loss = tf.reduce_mean(
            self.ISweights * tf.squared_difference(self.target_Q, self.Q))
        self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    def prepopulate(self, episodes=100000):
        """ Creates random experiences to hold in memory """
        self.memory = Memory(self.memory_size)
        self.game, self.actions_choice = create_env()
        self.game.new_episode()
        state = self.game.get_state().screen_buffer
        state, stacked_frames = stack_frames(state, new_episode=True)
        for episode in range(episodes):
            action = np.random.choice(self.actions_choice.shape[0], size=1)[0]
            action = list(self.actions_choice[action])
            reward = self.game.make_action(action)
            done = self.game.is_episode_finished()
            print(f'Episode {episode}: {done}')
            if done:
                next_state = np.zeros(state.shape, dtype=np.int)
                self.memory.add((state, action, reward, next_state, done))
                self.game.new_episode()
                state = self.game.get_state().screen_buffer
                state, stacked_frames = stack_frames(state, new_episode=True)
            else:
                next_state = self.game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(
                    next_state, stacked_frames)
                self.memory.add((state, action, reward, next_state, done))
                state = next_state

    def setup_writer(self):
        """ Sets up the tensorboard writer """
        self.writer = tf.compat.v1.summary.FileWriter(
            '/root/tensorboard/dddqn/1')
        tf.compat.v1.summary.scalar('Loss', self.loss)
        self.writer_op = tf.compat.v1.summary.merge_all()
        self.saver = tf.train.Saver()

    def predict_action(self, sess, state, decay_step):
        """
        Predicts the next action for the agent.
        Uses the value of epsilon to select a random action or
        the action at argmax(Q[s, a]).
        """
        explore_exploit_tradeoff = np.random.uniform()
        explore_prob = self.min_eps + \
            (self.max_eps - self.min_eps) * np.exp(-self.eps * decay_step)
        if explore_prob > explore_exploit_tradeoff:
            # Explore
            action = self.actions_choice[np.random.choice(
                self.actions_choice.shape[0], size=1)][0]
        else:
            # Exploit -> Estimate Q values for this state
            Qs = sess.run(
                self.output,
                feed_dict={
                    self.inputs: state.reshape((1, *state.shape))})
            # Best action
            choice = np.argmax(Qs)
            action = self.actions_choice[int(choice)]
        return list(action), explore_prob

    def update_target_graph(self):
        """ Copies parameters of the DQN to the target network """
        from_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'DQNet')
        to_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'TargetNet')
        up_holder = [to_vars.assign(from_vars)
                     for from_vars, to_vars in zip(from_vars, to_vars)]
        return up_holder

    def train(self, episodes=5000, batch_size=64, max_steps=3000, training=True):
        """ Trains the model """
        if training:
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                decay_step = 0
                tau = 0
                loss, acc = '', ''
                self.game.init()
                sess.run(self.update_target_graph())
                for episode in range(episodes):
                    step = 0
                    episode_rewards = []
                    self.game.new_episode()
                    state = self.game.get_state().screen_buffer
                    state, stacked_frames = stack_frames(
                        state, new_episode=True)
                    while step <= max_steps:
                        step += 1
                        tau += 1
                        decay_step += 1
                        action, explore_prob = self.predict_action(
                            sess, state, decay_step)
                        reward = self.game.make_action(action)
                        episode_rewards += [reward]
                        done = self.game.is_episode_finished()
                        if done:
                            next_state = np.zeros(
                                # (120, 140),
                                resolution, dtype=np.int)
                            next_state, stacked_frames = stack_frames(
                                next_state, stacked_frames)
                            step = max_steps
                            total_reward = np.sum(episode_rewards)
                            print(f'Episode {episode}' +
                                  f'Total reward: {total_reward}' +
                                  f'loss: {loss}' +
                                  f'acc: {acc}' +
                                  f'Explore prob: {explore_prob}')
                            exp = state, action, reward, next_state, done
                            self.memory.add(exp)
                        elif not done:
                            next_state = self.game.get_state().screen_buffer
                            next_state, stacked_frames = stack_frames(
                                next_state, stacked_frames)
                            self.memory.add((state, action, reward, next_state, done))
                            state = next_state
                        loss, abs_err = self._learn(
                            sess, episode, batch_size)
                        print(f'Episode: {episode}, loss {loss}')
                        if tau > self.max_tau:
                            sess.run(self.update_target_graph())
                            tau = 0
                    self.save(sess, episode, interval=5)

    def _learn(self, sess, episode, batch_size):
        """ Uses experiences stored in memory to get target Q values """
        mini_batches, tree_index = self.sample_experiences(batch_size)
        targets = self.get_target_Qs(sess, mini_batches)
        loss, abs_errs = self.find_loss(
            sess, targets, mini_batches)
        self.memory.update_priorities(tree_index, abs_errs)
        mini_batches.update({'targets': targets})
        # self.summarize(sess, episode, mini_batches)
        return loss, abs_errs

    def get_target_Qs(self, sess, mini_batch):
        """
        Sets the target Q as r for episodes ending at s + 1;
        else, as r + gamma * max[Q(s', a')].
        """
        q_next_state = sess.run(self.output, feed_dict={
            self.inputs: mini_batch.get('next_states')
        })
        q_target_ns = sess.run(
            target_net.output,
            feed_dict={
                target_net.inputs: mini_batch.get('next_states')})
        target_Qs = []
        for i in range(mini_batch.get('batch_len')):
            terminal = mini_batch.get('dones')[i]
            action = np.argmax(q_next_state[i])
            rewards = mini_batch.get('rewards')[i]
            if terminal:
                target_Qs.append(rewards)
            else:
                target_Qs.append(rewards + self.gamma * q_target_ns[i][action])
        targets_mb = [m_b for m_b in target_Qs]
        return targets_mb

    def find_loss(self, sess, targets, mini_batches):
        """ Finds difference between Q and targets """
        _, loss, err = sess.run(
            [self.optimizer, self.loss, self.abs_errors],
            feed_dict={self.inputs: mini_batches.get('states'),
                       self.target_Q: targets,
                       self.actions: mini_batches.get('actions'),
                       self.ISweights: mini_batches.get('ISweights')
                       })
        return loss, err

    def sample_experiences(self, batch_size):
        """ Samples experience mini batches from memory """
        tree_index, batch, IS_weights = self.memory.sample(batch_size)
        states = self.__from_memory(batch, key=0, min_dims=3)
        actions = self.__from_memory(batch, 1)
        rewards = self.__from_memory(batch, 2)
        next_states = self.__from_memory(batch, 3, 3)
        dones = self.__from_memory(batch, 4)
        return {
            'states': states,
            'actions': actions,
            'rewards': rewards,
            'next_states': next_states,
            'dones': dones,
            'batch_len': len(batch),
            'ISweights': IS_weights
        }, tree_index

    def __from_memory(self, batch, key, min_dims=0):
        """
        Gives states, actions, rewards, as mini batches
        from a memory sample
        """
        f_key = 0
        m_b = np.array([m_bch[f_key][key] for m_bch in batch], ndmin=min_dims)
        return m_b

    def summarize(self, sess, episode, batches):
        """ Writes tf summaries """
        summary = sess.run(
            self.writer_op,
            feed_dict={self.inputs: batches.get('states'),
                       self.target_Q: batches.get('targets'),
                       self.actions: batches.get('actions'),
                       self.ISweights: batches.get('ISweights')
                       })
        self.writer.add_summary(summary, episode)
        self.writer.flush()

    def save(self, sess, episode, interval):
        """ Updates and saves the model """
        if not episode % interval:
            self.saver.save(sess, './models/dddqn.ckpt')

    def play(self, episodes=25):
        """ Plays the trained agent """
        path = '/usr/local/lib/python3.7/dist-packages/vizdoom/scenarios/'
        with tf.compat.v1.Session() as sess:
            game, actions_choice = create_env(visible=True)
            game.load_config(os.path.join(path, 'deadly_corridor.cfg'))
            game.set_doom_scenario_path(
                os.path.join(path, 'deadly_corridor.wad'))
            eps = .01
            self.saver.restore(sess, './models/dddqn.ckpt')
            game.init()
            total_score = []
            for i in range(episodes):
                game.new_episode()
                state = game.get_state().screen_buffer
                state, stacked_frames = stack_frames(state, new_episode=True)
                while not game.is_episode_finished():
                    tradeoff = np.random.randn()
                    if tradeoff > eps:
                        action = actions_choice[np.random.choice(
                            actions_choice.shape[0], size=1)][0]
                    else:
                        # Exploit -> Estimate Q values for this state
                        Qs = sess.run(
                            self.output,
                            feed_dict={
                                self.inputs: state.reshape((1, *state.shape))})
                        # Best action
                        choice = np.argmax(Qs)
                        action = self.actions_choice[int(choice)]
                    game.make_action(list(action))
                    done = game.is_episode_finished()
                    if not done:
                        next_state = game.get_state().screen_buffer
                        next_state, stacked_frames = stack_frames(
                            next_state, stacked_frames)
                        state = next_state
                    else:
                        break
                reward = game.get_total_reward()
                print(f'reward: {reward}')
                total_score.append(reward)
            print(f'\nScore: {np.sum(total_score) / episodes}')
            game.close()
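# The aggregate() step above uses the dueling-network identity
# Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a')). A small NumPy check of that
# identity, independent of the TF graph; the numbers below are made up.
import numpy as np

V = np.array([[1.0], [0.5]])              # state values, shape (2, 1)
A = np.array([[0.2, -0.1, 0.5],
              [0.0, 0.3, -0.3]])          # advantages, shape (2, 3)

Q = V + (A - A.mean(axis=1, keepdims=True))
print(Q)  # [[1.0, 0.7, 1.3], [0.5, 0.8, 0.2]]
# Subtracting the mean advantage keeps V and A identifiable: shifting every
# A(s, a) by a constant and absorbing it into V(s) leaves Q unchanged.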
action = possible_actions[randint(0, 3)]

# Get rewards
terminal, reward = game.perform_action(action)

# Look if the episode is finished
# done = game.is_episode_finished()

# If episode ends
if terminal:
    # episode finishes
    next_state = np.zeros(state.shape)

    # Add experience to memory
    memory.add((state, action, reward, next_state, terminal))

    # Start a new episode
    game.reset()

    # get a state
    state, color_frame = game.createImage()

    # Stack the frames
    state, stacked_frames = stack_frames(stacked_frames, state, True)
else:
    # Get next state
    next_state, color_frame = game.createImage()
    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
class Agent(object):
    def __init__(self, sess):
        self.sess = sess

        # some config
        self.state_size = 4
        self.n_action = 2
        self.epsilon = 0.1
        self.max_epsilon = 1
        self.min_epsilon = 0.01
        self.epsilon_decay_rate = 0.001
        self.discount = 0.99
        self.steps = 0
        self.batch_size = 64
        self.lr = 0.00025
        self.memory = Memory()

        # build network
        self._build_network()
        self.loss_summary = tf.summary.scalar('loss', self.loss)
        self.writer = tf.summary.FileWriter('logs/', sess.graph)
        sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def _build_network(self):
        self.w = {}
        initializer = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu

        with tf.variable_scope('prediction'):
            # input
            self.s = tf.placeholder(tf.float32, [None, self.state_size], name='s')
            # l1
            self.l1, self.w['l1_w'], self.w['l1_b'] = linear(self.s, 64, initializer, activation_fn, name='l1')
            # q
            self.q, self.w['q_w'], self.w['q_b'] = linear(self.l1, self.n_action, initializer, activation_fn=None, name='q')

        with tf.variable_scope('loss'):
            self.target_q = tf.placeholder(tf.float32, [None, 1], name='target_q')
            self.action = tf.placeholder('int64', [None, 1], name='action')
            action_one_hot = tf.one_hot(self.action, 2)[:, 0, :]
            q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name='q_acted')
            q_acted = tf.reshape(q_acted, [-1, 1])
            self.loss = tf.losses.mean_squared_error(self.target_q, q_acted)
            self.optimizer = tf.train.RMSPropOptimizer(self.lr)
            self.train_op = self.optimizer.minimize(self.loss)

    def act(self, s):
        s = np.array(s)
        s = s[np.newaxis, ...]
        if np.random.random() < self.epsilon:
            return np.random.randint(self.n_action)
        else:
            return np.argmax(self.q.eval({self.s: s}), axis=1)[0]

    def observe(self, s, a, r, s_, terminal):
        self.memory.add(s, a, r, s_, terminal)
        self.steps += 1
        self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.epsilon_decay_rate * self.steps)

    def replay(self):
        s, a, r, s_, terminal = self.memory.sample(self.batch_size)
        q_ = self.q.eval({self.s: s_})
        max_q_ = np.max(q_, axis=1).reshape([-1, 1])
        # DQN target: r + gamma * max_a' Q(s', a') for non-terminal next states
        target_q = r + (1 - terminal) * self.discount * max_q_
        self.write_loss, _ = self.sess.run([self.loss_summary, self.train_op],
                                           {self.s: s, self.target_q: target_q, self.action: a})

    def save(self, episodes):
        self.saver.save(self.sess, 'save/cart_pole', global_step=episodes)
class DDPG:
    def __init__(self, sess, params):
        self.sess = sess
        self.__dict__.update(params)
        # create placeholders
        self.create_input_placeholders()
        # create actor/critic models
        self.actor = Actor(self.sess, self.inputs, **self.actor_params)
        self.critic = Critic(self.sess, self.inputs, **self.critic_params)
        self.noise_params = {k: np.array(list(map(float, v.split(","))))
                             for k, v in self.noise_params.items()}
        self.noise = Noise(**self.noise_params)
        self.ou_level = np.zeros(self.dimensions["u"])
        self.memory = Memory(self.n_mem_objects, self.memory_size)

    def create_input_placeholders(self):
        self.inputs = {}
        with tf.name_scope("inputs"):
            for ip_name, dim in self.dimensions.items():
                self.inputs[ip_name] = tf.placeholder(tf.float32,
                                                      shape=(None, dim),
                                                      name=ip_name)
            self.inputs["g"] = tf.placeholder(tf.float32,
                                              shape=self.inputs["u"].shape,
                                              name="a_grad")
            self.inputs["p"] = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name="pred_q")

    def step(self, x, is_u_discrete, explore=True):
        x = x.reshape(-1, self.dimensions["x"])
        u = self.actor.predict(x)
        if explore:
            self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
            u = u + self.ou_level
        q = self.critic.predict(x, u)
        if is_u_discrete:
            return [np.argmax(u), u[0], q[0]]
        return [u[0], u, q[0]]

    def remember(self, experience):
        self.memory.add(experience)

    def train(self):
        # check if the memory contains enough experiences
        if self.memory.size < 3 * self.b_size:
            return
        x, g, ag, u, r, nx, ng, t = self.get_batch()
        # for her transitions
        her_idxs = np.where(np.random.random(self.b_size) < 0.80)[0]
        # print("{} of {} selected for HER transitions".
        #       format(len(her_idxs), self.b_size))
        g[her_idxs] = ag[her_idxs]
        r[her_idxs] = 1
        t[her_idxs] = 1
        x = np.hstack([x, g])
        nx = np.hstack([nx, ng])
        nu = self.actor.predict_target(nx)
        tq = r + self.gamma * self.critic.predict_target(nx, nu) * (1 - t)
        self.critic.train(x, u, tq)
        grad = self.critic.get_action_grads(x, u)
        # print("Grads:\n", g)
        self.actor.train(x, grad)
        self.update_targets()

    def get_batch(self):
        return self.memory.sample(self.b_size)

    def update_targets(self):
        self.critic.update_target()
        self.actor.update_target()
class Agent:
    def __init__(self, input_shape, action_count, steps=0, model_path=None, learning_rate=None):
        if learning_rate is not None:
            SET_LEARNING_RATE(learning_rate)
        self.steps = steps
        self.epsilon = MAX_EPSILON if steps == 0 else self.__calc_epsilon(steps)
        self.brain = Brain(action_count, input_shape=input_shape, model_path=model_path)
        self.memory = Memory(MEMORY_CAPACITY)
        self.input_shape = input_shape
        self.action_count = action_count

    def act(self, s):
        action = -1
        if random.random() < self.epsilon:
            action = random.randint(0, self.action_count - 1)
        else:
            predictions = np.squeeze(self.brain.predict(s.astype(np.float32)))
            action = round(np.argmax(predictions))
            weight_sqrsum = 0
            for i in range(self.action_count):
                if predictions[i] < 0 or predictions[i] * 2 < predictions[action]:
                    predictions[i] = 0
                else:
                    weight_sqrsum += math.pow(predictions[i], 2)
            if weight_sqrsum != 0:
                dice = random.random() * weight_sqrsum
                weight_begin = 0
                for i in range(self.action_count):
                    if weight_begin < dice and dice < weight_begin + math.pow(predictions[i], 2):
                        action = i
                        break
                    else:
                        # accumulate the prefix sum of squared weights
                        weight_begin += math.pow(predictions[i], 2)
        return action

    def observe(self, sample):  # in (s, a, r, s_) format
        self.memory.add(sample)

    def __calc_epsilon(self, steps):
        return MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * steps)

    def replay(self, batch_size=BATCH_SIZE):
        # slowly decrease Epsilon based on our experience
        self.steps += 1
        self.epsilon = self.__calc_epsilon(self.steps)

        batch = self.memory.sample(batch_size)
        batch_len = len(batch)

        no_state = np.zeros(self.input_shape)

        # CNTK: explicitly setting to float32
        states = np.array([o[0] for o in batch], dtype=np.float32)
        states_ = np.array([(no_state if o[3] is None else o[3]) for o in batch],
                           dtype=np.float32)

        p = self.brain.predict(states)
        p_ = self.brain.predict(states_)

        # CNTK: explicitly setting to float32
        x = np.zeros((batch_len, *self.input_shape)).astype(np.float32)
        y = np.zeros((batch_len, self.action_count)).astype(np.float32)

        for i in range(batch_len):
            s, a, r, s_ = batch[i]

            # CNTK: [0] because of sequence dimension
            t = p[0][i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * np.amax(p_[0][i])

            x[i] = s
            y[i] = t

        self.brain.train(x, y)
class Brain:
    train_queue = [[], [], [], [], []]  # s, a, r, s', s' terminal mask
    lock_queue = threading.Lock()

    def __init__(self, agent, modelFunc=None):
        self.initialized = False
        self.finalized = False
        self.c = 0
        self.agent = agent
        self.state_dim = self.agent.state_dim
        self.action_dim = self.agent.action_dim
        self.gamma = self.agent.h.gamma
        self.n_step_return = self.agent.h.memory_size
        self.gamma_n = self.gamma ** self.n_step_return
        self.loss_v = self.agent.h.extra.loss_v
        self.loss_entropy = self.agent.h.extra.loss_entropy
        self.batch = self.agent.h.batch
        self.learning_rate = self.agent.h.learning_rate
        self.brain_memory_size = self.agent.args.hyper.extra.brain_memory_size
        self.env = self.agent.args.env
        self.metrics = self.agent.metrics

        self.brain_memory = Memory(self.brain_memory_size, self.state_dim, self.action_dim)

        if self.agent.args.data:  # Load memory
            s, a, r, s_, t = loadMemory_direct('../data/' + self.agent.args.data + '/')
            self.brain_memory.add(s, a, r, s_, t)

        self.NONE_STATE = np.zeros(self.state_dim)
        self.visualization = agent.visualization
        self.model = self.create_model(modelFunc)

    def init_model(self):
        if self.initialized == True:
            return
        if self.visualization == False:
            #######################################
            self.session = tf.Session()
            K.set_session(self.session)
            K.manual_variable_initialization(True)
            self.graph = self.create_graph(self.model)
            self.session.run(tf.global_variables_initializer())
            self.default_graph = tf.get_default_graph()
            self.initialized = True
            # avoid modifications
            #######################################

    def init_vars(self):
        init_op = tf.global_variables_initializer()
        self.session.run(init_op)

    def finalize_model(self):
        if self.finalized == True:
            return
        self.default_graph.finalize()
        self.finalized = True
        # for layer in self.model.layers:
        #     weights = layer.get_weights()
        #     print(np.sum(np.sum(weights)))
        #     c += 1
        #     print(c)
        #     print(np.sum(layer.get_weights()))

    def create_model(self, modelFunc=None):
        print(self.state_dim)
        print(self.action_dim)
        if not modelFunc:
            modelFunc = models.model_mid_default
        model = models.model_start(self.state_dim, self.action_dim,
                                   models.model_top_a3c, modelFunc,
                                   self.visualization)
        model._make_predict_function()  # have to initialize before threading
        print("Finished building the model")
        print(model.summary())
        return model

    def create_graph(self, model):
        batch_size = None
        state_dim = [batch_size] + self.state_dim
        print(state_dim)
        s_t = tf.placeholder(tf.float32, shape=(state_dim))
        a_t = tf.placeholder(tf.float32, shape=(batch_size, self.action_dim))
        r_t = tf.placeholder(tf.float32, shape=(batch_size, 1))  # Discounted Reward

        p, v = model(s_t)

        log_prob = tf.log(
            tf.reduce_sum(p * a_t, axis=1, keep_dims=True) + 1e-6)  # Negative, larger when action is less likely
        advantage = r_t - v

        loss_policy = -log_prob * tf.stop_gradient(advantage)  # Pos if better than expected, Neg if bad
        loss_value = self.loss_v * tf.square(advantage)  # Positive; minimize value error
        entropy = self.loss_entropy * tf.reduce_sum(p * tf.log(p + 1e-6), axis=1, keep_dims=True)  # Negative value

        loss_total = tf.reduce_mean(loss_policy + loss_value + entropy)

        optimizer = tf.train.AdamOptimizer(self.learning_rate, epsilon=1e-3)
        minimize = optimizer.minimize(loss_total)
        return s_t, a_t, r_t, minimize, loss_total, log_prob, loss_policy, loss_value, entropy

    def optimize_batch_full(self, reset=1, suppress=1):  # Use for online learning
        if self.brain_memory.isFull != True:
            return
        idx = np.arange(0, self.brain_memory.max_size)
        self.optimize_batch_index(idx, 1, reset, suppress)

    def optimize_batch_full_multithread(self, reset=1, suppress=1):  # Use for online learning
        if self.brain_memory.isFull != True:
            time.sleep(0)  # yield
            return
        idx = np.arange(0, self.brain_memory.max_size)
        self.optimize_batch_index_multithread(idx, 1, reset, suppress)

    def optimize_batch(self, batch_count=1, suppress=0):  # Use for offline learning
        if self.brain_memory.isFull != True:
            time.sleep(0)  # yield
            return
        idx = self.brain_memory.sample(self.batch * batch_count)
        self.optimize_batch_index(idx, batch_count, suppress)

    def optimize_batch_index(self, idx, batch_count=1, reset=0, suppress=0):
        s = self.brain_memory.s[idx, :]
        a = self.brain_memory.a[idx, :]
        r = np.copy(self.brain_memory.r[idx, :])
        s_ = self.brain_memory.s_[idx, :]
        t = self.brain_memory.t[idx, :]
        if reset == 1:
            self.brain_memory.isFull = False
            self.brain_memory.size = 0
        self.optimize_batch_child(s, a, r, s_, t, batch_count, suppress)

    def optimize_batch_index_multithread(self, idx, batch_count=1, reset=1, suppress=0):
        with self.lock_queue:
            if self.brain_memory.isFull != True:
                return
            s = np.copy(self.brain_memory.s[idx, :])
            a = np.copy(self.brain_memory.a[idx, :])
            r = np.copy(self.brain_memory.r[idx, :])
            s_ = np.copy(self.brain_memory.s_[idx, :])
            t = np.copy(self.brain_memory.t[idx, :])
            if reset == 1:
                self.brain_memory.isFull = False
                self.brain_memory.size = 0
            self.c += 1
        self.optimize_batch_child(s, a, r, s_, t, batch_count, suppress)

    def optimize_batch_child(self, s, a, r, s_, t, batch_count=1, suppress=0):
        s_t, a_t, r_t, minimize, loss_total, log_prob, loss_policy, loss_value, entropy = self.graph
        for i in range(batch_count):
            start = i * self.batch
            end = (i + 1) * self.batch
            r[start:end] = r[start:end] + self.gamma_n * self.predict_v(s_[start:end]) * t[start:end]  # set v to 0 where s_ is terminal state
            _, loss_current, log_current, loss_p_current, loss_v_current, entropy_current = self.session.run(
                [minimize, loss_total, log_prob, loss_policy, loss_value, entropy],
                feed_dict={s_t: s[start:end], a_t: a[start:end], r_t: r[start:end]})
            # self.metrics.a3c.update(loss_current, log_current, loss_p_current, loss_v_current, entropy_current)
            if i % 10 == 0 and suppress == 0:
                print('\r', 'Learning', '(', i, '/', batch_count, ')', end="")
        if suppress == 0:
            print('\r', 'Learning', '(', batch_count, '/', batch_count, ')')

    def train_augmented(self, s, a, r, s_):
        if self.env.problem == 'Hexagon':
            if s_ is None:
                self.train_push_all_augmented(data_aug.full_augment([[s, a, r, self.NONE_STATE, 0.]]))
            else:
                self.train_push_all_augmented(data_aug.full_augment([[s, a, r, s_, 1.]]))
        else:
            if s_ is None:
                self.train_push_augmented([s, a, r, self.NONE_STATE, 0.])
            else:
                self.train_push_augmented([s, a, r, s_, 1.])

    def train_push_all_augmented(self, frames):
        for frame in frames:
            self.train_push_augmented(frame)

    # TODO: t value is flipped for brain memory and agent memory... should be consistent. Not a bug however.
    def train_push_augmented(self, frame):
        a_cat = np.zeros(self.action_dim)
        a_cat[frame[1]] = 1
        with self.lock_queue:
            if self.brain_memory.isFull == True:
                time.sleep(0)
                return
            self.brain_memory.add_single(frame[0], a_cat, frame[2], frame[3], frame[4])
            # self.train_queue.append([frame[0], a_cat, frame[2], frame[3], frame[4]])

    def predict(self, s):
        with self.default_graph.as_default():
            p, v = self.model.predict(s)
            return p, v

    def predict_p(self, s):
        with self.default_graph.as_default():
            p, _ = self.model.predict(s)
            return p

    def predict_v(self, s):
        with self.default_graph.as_default():
            _, v = self.model.predict(s)
            return v
class MADDPG(object):
    def __init__(self, n, state_global, action_global, gamma, memory_size):
        self.n = n
        self.gamma = gamma
        self.memory = Memory(memory_size)
        self.agents = [
            DDPGAgent(index, 1600, 400, 0.5, state_global, action_global)
            for index in range(0, n)
        ]

    def update_agent(self, sample, index):
        observations, actions, rewards, next_obs, dones = sample
        curr_agent = self.agents[index]
        curr_agent.critic_train.zero_grad()
        all_target_actions = []
        # Actions from the target actor network, computed from each local observation
        for i in range(0, self.n):
            action = curr_agent.Actor_target(next_obs[:, i])
            all_target_actions.append(action)
        action_target_all = torch.cat(all_target_actions, dim=0).to(device).reshape(
            actions.size()[0], actions.size()[1], actions.size()[2])
        target_vf_in = torch.cat((next_obs, action_target_all), dim=2)
        # Bellman-equation target value for the current transition, under the target networks
        target_value = rewards[:, index] + self.gamma * curr_agent.Critic_target(
            target_vf_in).squeeze(dim=1)
        vf_in = torch.cat((observations, actions), dim=2)
        actual_value = curr_agent.Critic(vf_in).squeeze(dim=1)
        # Loss function for the Critic (TD error)
        vf_loss = curr_agent.loss_td(actual_value, target_value.detach())

        vf_loss.backward()
        curr_agent.critic_train.step()

        curr_agent.actor_train.zero_grad()
        curr_pol_out = curr_agent.Actor(observations[:, index])
        curr_pol_vf_in = curr_pol_out
        all_pol_acs = []
        for i in range(0, self.n):
            if i == index:
                all_pol_acs.append(curr_pol_vf_in)
            else:
                all_pol_acs.append(self.agents[i].Actor(
                    observations[:, i]).detach())
        vf_in = torch.cat(
            (observations,
             torch.cat(all_pol_acs, dim=0).to(device).reshape(
                 actions.size()[0], actions.size()[1], actions.size()[2])),
            dim=2)
        # Actor loss function, as in DDPG
        pol_loss = -torch.mean(curr_agent.Critic(vf_in))
        pol_loss.backward()
        curr_agent.actor_train.step()

    def update(self, sample):
        for index in range(0, self.n):
            self.update_agent(sample, index)

    def update_all_agents(self):
        for agent in self.agents:
            soft_update(agent.Critic_target, agent.Critic, agent.tau)
            soft_update(agent.Actor_target, agent.Actor, agent.tau)

    def add_data(self, s, a, r, s_, done):
        self.memory.add(s, a, r, s_, done)

    def save_model(self, episode):
        for i in range(0, self.n):
            model_name_c = "Critic_Agent" + str(i) + "_" + str(episode) + ".pt"
            model_name_a = "Actor_Agent" + str(i) + "_" + str(episode) + ".pt"
            torch.save(self.agents[i].Critic_target, 'model_tag/' + model_name_c)
            torch.save(self.agents[i].Actor_target, 'model_tag/' + model_name_a)

    def load_model(self, episode):
        for i in range(0, self.n):
            model_name_c = "Critic_Agent" + str(i) + "_" + str(episode) + ".pt"
            model_name_a = "Actor_Agent" + str(i) + "_" + str(episode) + ".pt"
            self.agents[i].Critic_target = torch.load("model_tag/" + model_name_c)
            self.agents[i].Critic = torch.load("model_tag/" + model_name_c)
            self.agents[i].Actor_target = torch.load("model_tag/" + model_name_a)
            self.agents[i].Actor = torch.load("model_tag/" + model_name_a)
action, q = sess.run([train_actor_output, train_critic_current_action],
                     feed_dict={k: [[v]] for k, v in zip(state_placeholders, env_state)})
action = action[0]
action = action if testing else eta_noise.reflected_ou(
    action * np.array([1, 1, 0, 1]),
    theta=[.15, .15, .75, .15],
    sigma=[.10, .10, .10, .10],
    min=-1, max=1)
assert action.shape == env.action_space.sample().shape, (action.shape, env.action_space.sample().shape)

max_xvel = 20
max_yvel = 8
max_yawrate = 0.2
max_altitude = 15
action = np.clip(action, -1, 1) * np.array([max_xvel, max_yvel, max_yawrate, max_altitude / 4.0]) - np.array([0, 0, 0, max_altitude])

env_next_state, env_reward, env_done, env_info = env.step(action)
replay_buffer.add(env_state, env_reward, action, env_done, priority=300)
env_state = env_next_state
total_reward += env_reward

if training:
    states_batch, action_batch, reward_batch, next_states_batch, done_batch, indexes = replay_buffer.sample(BATCH_SIZE, prioritized=True)
    feed = {
        action_placeholder: action_batch,
        reward_placeholder: reward_batch,
        done_placeholder: done_batch
    }
    feed.update({k: v for k, v in zip(state_placeholders, states_batch)})
class Agent(): """Interacts with and learns from the environment (env).""" def __init__(self, s_size, a_size, random_seed): """Initialize an Agent object. Params ====== s_size (int): dimension of each state (s) a_size (int): dimension of each action (a) random_seed (int): random seed """ self.s_size = s_size self.a_size = a_size self.random_seed = random.seed(random_seed) # Q-Network self.q = Q(s_size, a_size, random_seed).to(device) self.q_target = Q(s_size, a_size, random_seed).to(device) self.optimizer = optim.Adam(self.q.parameters(), lr=LR) # Replay memory self.memory = Memory(a_size, BUFFER_SIZE, BATCH_SIZE, random_seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, s, a, r, s2, done): # Save/add experience in/to replay memory/buffer self.memory.add(s, a, r, s2, done) # Exploration vs exploitation # # Learn every UPDATE_EVERY time steps. # self.t_step = (self.t_step + 1) % UPDATE_EVERY # if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: E = self.memory.sample() # E: expriences, e: exprience self.learn(E, GAMMA) def act(self, s, eps=0.): """Returns an action (a) for a given state (s) as the current policy (a). Params ====== state (array_like): current state (s) eps (float): epsilon, for epsilon-greedy action (a) selection """ s = torch.from_numpy(s).float().unsqueeze(0).to(device) self.q.eval() with torch.no_grad(): a_values = self.q(s) # a_values: action_values self.q.train() # # Epsilon-greedy (eps) action (a) selection # if random.random() > eps: return np.argmax(a_values.cpu().data.numpy()) # else: # return random.choice(np.arange(self.a_size)) def learn(self, E, gamma): """Update value parameters using given batch of experience (e) tuples. Params ====== exprience (Tuple[torch.Tensor]): tuple of (state, action, reward, next_state, done) e (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) e (Tuple[torch.Tensor]): tuple of (s, a, r, s2, done) gamma (float): discount factor """ S, A, rewards, S2, dones = E # Get max predicted Q (values) for next states (S2) from target model Q2 = self.q_target(S2).detach().max(1)[0].unsqueeze(1) print(self.q_target(S2).detach().max(1)[0].unsqueeze(1)) print(self.q_target(S2).detach().max(1)[0]) print(self.q_target(S2).detach().max(1)) print(self.q_target(S2).detach()) print(self.q_target(S2)) # Compute Q target for current states (S) Q = rewards + (gamma * Q2 * (1 - dones)) # Get expected Q (values) from local model Q_ = self.q(S).gather(1, A) print(self.q(S).gather(1, A)) print(self.q(S)) # Compute loss #loss = F.mse_loss(Q_expected, Q_targets) loss = ((Q_ - Q)**2).mean() # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.q, self.q_target, GAMMA) def soft_update(self, local_model, target_model, gamma): """Soft update model parameters. θ_target = (1-γ)*θ_local + γ*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(((1-gamma)*local_param.data) + (gamma*target_param.data))
    es_params[i] = updated_es_params[i]

actor_steps = 0

# evaluate noisy actor(s)
outs = ray.get([
    workers[i].evaluate.remote(es_params[i],
                               n_episodes=args.n_episodes,
                               noise=a_noise) for i in range(args.n_noisy)
])
for f, steps, transitions, last_reward in outs:
    for transition in transitions:
        memory.add(transition)
    actor_steps += steps
    prCyan('Noisy actor {} fitness:{}'.format(i, f))

# evaluate all actors
outs = ray.get([
    workers[i].evaluate.remote(params, n_episodes=args.n_episodes)
    for i, params in enumerate(es_params)
])
for f, steps, transitions, last_reward in outs:
    for transition in transitions:
        memory.add(transition)
class A2Cagent(nn.Module):
    def __init__(self):
        super(A2Cagent, self).__init__()
        self.A, self.C = Actor(), Critic()
        if USE_CUDA:
            self.A.cuda()
            self.C.cuda()
        self.opt = torch.optim.Adam(self.parameters(), lr=LEARNING_RATE)
        self.exp_buffer = Memory(EXP_BUFFER_MAX)

    def forward(self, x):
        a_p = self.A(x)
        v = self.C(x)
        return a_p, v

    def remember(self, x):
        self.exp_buffer.add(x)

    def get_adv(self, s, s_new, r_new):
        # Gets A(s_t, a_t) = r + gamma * V(s') - V(s)
        adv = r_new                      # reward
        adv += GAMMA * (self.C(s_new))   # discounted value of next state
        adv -= self.C(s)                 # minus value of current state
        return adv

    def act(self, s, action=None):
        prob, v = self.forward(s)
        dist = Categorical(prob)
        if action is None:
            action = dist.sample()
        log_prob = dist.log_prob(action)
        entropy = dist.entropy()
        return action, log_prob, entropy, v.squeeze()

    # Replay S states, A actions, R Rewards, Adv advantages
    def replay(self, batch_size):
        if self.exp_buffer.size < EXP_BUFFER_MIN:
            return 0, 0, 0
        S, S_next, R, Adv, log_p_old, A, _ = self.exp_buffer.get_batch(batch_size)
        self.opt.zero_grad()
        a, log_p, ent, v = self.act(S, A)

        # Clipped surrogate loss for the actor (PPO-style ratio clipping)
        p_loss_ratio = torch.exp(log_p - log_p_old)
        p_loss_1 = p_loss_ratio * Adv
        p_loss_2 = torch.clamp(p_loss_ratio, 1 - CLIP_RANGE, 1 + CLIP_RANGE) * Adv
        p_loss = -torch.min(p_loss_1, p_loss_2).mean()

        # MSE for Critic Loss
        v_loss = 0.5 * (R - v).pow(2).mean()
        ent = ent.mean()

        (p_loss + v_loss - BETA * ent).backward(retain_graph=True)
        nn.utils.clip_grad_norm_(self.parameters(), 5)
        self.opt.step()
        return p_loss, v_loss, ent
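# get_adv() above computes a one-step advantage estimate. A quick numeric
# check of A(s, a) = r + gamma * V(s') - V(s) with made-up values:
GAMMA = 0.99
r_new, v_s, v_s_new = 1.0, 2.0, 2.5   # reward, V(s), V(s')

adv = r_new + GAMMA * v_s_new - v_s
print(adv)  # 1.475: the action did better than the critic's estimate of s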
class Agent(object): """ Agente inteligente responsável por tomar as decisões inerentes ao reinforcement learning. Durante o treinamento, utiliza o protocolo epsilon-greedy. Argumentos: action_pool (dict): Ações que podem ser tomadas pelo agente. Cada ação possui um código numérico. Atributos: _action_pool (dict): Mapa de ações que pode ser tomadas ({'id':'ação'}). _last_action (str): Última ação tomada. _config (Configuration): Arquivo de configurações globais. _memory (Memory): Memória para armazenar as ações tomadas, mudanças de estado e recompensas. Os itens guardados na memporia tem o formato {'state': '', 'action': '', 'reward': '', 'next_state': ''}. _taken_actions (list(str)): Lista com todas as ações tomadas durante a execução. _weekdays_map (dict): Mapa com sigla de dias da semana para números. Métodos: take_action(model, environment, training, network, current_step, actions_taken) : O agente executa uma ação no ambiente seguindo o protocolo epsilon-greedy. reset() : Reinicia o agente. sample_memory() : Faz uma amostragem aleatória da memória do agente. """ def __init__(self, action_pool={}): self._action_pool = action_pool self._last_action = None self._config = Configuration() self._memory = Memory(self._config.max_memory_size) self._taken_actions = [] self._weekdays_map = { 'MON': 1, 'TUE': 2, 'WED': 3, 'THU': 4, 'FRI': 5, 'SAT': 6, 'SUN': 7 } def take_action(self, model, environment, training, network, current_step, actions_taken): """ O agente executa uma ação no ambiente seguindo o protocolo epsilon-greedy. Durante a etapa de treino, o protocolo é seguido. Durante a execução normal a ação tomada é gulosa. Parâmetros: model (OpenDssEngine): Motor do OpenDSS utilizado para a simulação. environment (Environment): Ambiente onde serão executadas as ações. training (bool): Indica se está no processo de treinamento ou não. network (Network): Rede utilizada para escolher a ação. current_step (int): Passo atual da simulação (qual minuto do dia). actions_taken (int): Quantas ações foram tomadas no passo atual. 
Erros: None Retorna: None """ alpha = environment.get_base_learning_rate() gamma = environment.get_discount_factor() initial_state = deepcopy(model.get_state()) initial_state_voltages = deepcopy(model.get_voltages()) p = np.random.random() # Greedy if (training and p < environment.get_epsilon()): a = random.choice(list(self._action_pool.keys())) _a = self._action_pool[a] if _a: model.take_action(_a) new_state = deepcopy(model.get_state()) new_state_voltages = deepcopy(model.get_voltages()) reward = environment.calculate_reward(initial_state_voltages, new_state_voltages, _a, self._last_action) self._last_action = _a self._memory.add({ 'state': deepcopy( initial_state.state_space_repr( current_step, actions_taken, self._weekdays_map[model.get_weekday()])), 'action': deepcopy(a), 'reward': deepcopy(reward), 'next_state': deepcopy( new_state.state_space_repr( current_step, actions_taken, self._weekdays_map[model.get_weekday()])) }) # Optimal else: inputs = np.expand_dims( np.array(initial_state.state_space_repr( current_step, actions_taken, self._weekdays_map[model.get_weekday()]), dtype=np.float32), 0) a = np.squeeze(np.argmax(network.model(inputs), axis=-1)) _a = self._action_pool[int(a)] if _a: model.take_action(_a) self._taken_actions.append(_a) if training: new_state = deepcopy(model.get_state()) new_state_voltages = deepcopy(model.get_voltages()) reward = environment.calculate_reward(initial_state_voltages, new_state_voltages, _a, self._last_action) self._last_action = _a self._memory.add({ 'state': deepcopy( initial_state.state_space_repr( current_step, actions_taken, self._weekdays_map[model.get_weekday()])), 'action': deepcopy(a), 'reward': deepcopy(reward), 'next_state': deepcopy( new_state.state_space_repr( current_step, actions_taken, self._weekdays_map[model.get_weekday()])) }) def reset(self): """ Apaga a memória, ações tomadas e última ação tomada do agente. Parâmetros: None Erros: None Retorna: None """ self._last_action = None self._memory = Memory(self._config.max_memory_size) self._taken_actions = [] def sample_memory(self): """ Faz uma amostragem aleatória da memória do agente. Função utilizada no treinamento da rede. Parâmetros: None Erros: None Retorna: Lista com itens aleatórios da memória. """ return self._memory.sample(self._config.memory_batch_size)
class MADDPG:
    def __init__(self, state_size, action_size, num_agents, config):
        '''Constructs the multi-agent eco-system'''
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f'Using {self.device}')

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.ddpg_agents = [
            DDPGAgent(config['seed'] + idx, state_size, action_size,
                      num_agents, self.device, config)
            for idx in range(num_agents)
        ]
        self.update_every = config['update_every']
        self.update_iters = config['update_iterations']
        # Note: Could be replaced by parallel env batching
        seed = config['seed']
        self.batch_size = config['batch_size']
        self.memory = Memory(config['memory_size'], self.batch_size, seed)
        self.memory.to_device(self.device)

    def reset_noise(self):
        '''Resets the noise amplitude for each ddpg agent'''
        [agent.reset_noise() for agent in self.ddpg_agents]

    def act(self, states):
        '''For each agent idx, select a_idx = policy_idx(o_idx) + noise'''
        actions = [
            self.ddpg_agents[idx].act(states[np.newaxis, idx]).squeeze(0)
            for idx in range(self.num_agents)
        ]
        return actions

    # Note: We need to add all the observations, otherwise we break the
    # stationarity of the environment
    def remember(self, states, actions, rewards, next_states, dones):
        '''Populates the replay memory with a new batch of data; observations of all agents'''
        self.memory.add(
            Experience(states, actions, rewards, next_states, dones))

    def step(self, timestep):
        '''Steps through each ddpg agent'''
        if len(self.memory) > self.batch_size and timestep % self.update_every == 0:
            for _ in range(self.update_iters):
                for idx in range(self.num_agents):
                    states, actions, rewards, next_states, dones = self.memory.sample()

                    predicted_best_next_actions = torch.cat([
                        self.ddpg_agents[idx].target_actor(next_states[:, idx, :])
                        for idx in range(self.num_agents)
                    ], dim=1)
                    predicted_best_current_actions = torch.cat([
                        self.ddpg_agents[idx].learnt_actor(states[:, idx, :])
                        for idx in range(self.num_agents)
                    ], dim=1)
                    states = torch.cat(
                        [states[:, idx, :] for idx in range(self.num_agents)], dim=1)
                    actions = torch.cat(
                        [actions[:, idx, :] for idx in range(self.num_agents)], dim=1)
                    next_states = torch.cat(
                        [next_states[:, idx, :] for idx in range(self.num_agents)], dim=1)

                    self.ddpg_agents[idx].step(predicted_best_current_actions,
                                               predicted_best_next_actions,
                                               states, actions, rewards,
                                               next_states, dones)

    def save(self, actor_weights_path, critic_weights_path):
        [
            torch.save(self.ddpg_agents[idx].learnt_actor.state_dict(),
                       actor_weights_path + str(idx + 1) + '.pth')
            for idx in range(self.num_agents)
        ]
        [
            torch.save(self.ddpg_agents[idx].learnt_actor.state_dict(),
                       critic_weights_path + str(idx + 1) + '.pth')
            for idx in range(self.num_agents)
        ]
class DQN:
    def __init__(self, env, params):
        self.env = env
        params.actions = env.actions()
        self.num_actions = env.actions()
        self.episodes = params.episodes
        self.steps = params.steps
        self.train_steps = params.train_steps
        self.update_freq = params.update_freq
        self.save_weights = params.save_weights
        self.history_length = params.history_length
        self.discount = params.discount
        self.eps = params.init_eps
        self.eps_delta = (params.init_eps - params.final_eps) / params.final_eps_frame
        self.replay_start_size = params.replay_start_size
        self.eps_endt = params.final_eps_frame
        self.random_starts = params.random_starts
        self.batch_size = params.batch_size
        self.ckpt_file = params.ckpt_dir + '/' + params.game

        self.global_step = tf.Variable(0, trainable=False)
        if params.lr_anneal:
            self.lr = tf.train.exponential_decay(params.lr, self.global_step,
                                                 params.lr_anneal, 0.96, staircase=True)
        else:
            self.lr = params.lr

        self.buffer = Buffer(params)
        self.memory = Memory(params.size, self.batch_size)

        with tf.variable_scope("train") as self.train_scope:
            self.train_net = ConvNet(params, trainable=True)
        with tf.variable_scope("target") as self.target_scope:
            self.target_net = ConvNet(params, trainable=False)

        self.optimizer = tf.train.RMSPropOptimizer(self.lr, params.decay_rate, 0.0, self.eps)

        self.actions = tf.placeholder(tf.float32, [None, self.num_actions])
        self.q_target = tf.placeholder(tf.float32, [None])
        self.q_train = tf.reduce_max(tf.mul(self.train_net.y, self.actions), reduction_indices=1)
        self.diff = tf.sub(self.q_target, self.q_train)

        half = tf.constant(0.5)
        if params.clip_delta > 0:
            abs_diff = tf.abs(self.diff)
            clipped_diff = tf.clip_by_value(abs_diff, 0, 1)
            linear_part = abs_diff - clipped_diff
            quadratic_part = tf.square(clipped_diff)
            self.diff_square = tf.mul(half, tf.add(quadratic_part, linear_part))
        else:
            self.diff_square = tf.mul(half, tf.square(self.diff))

        if params.accumulator == 'sum':
            self.loss = tf.reduce_sum(self.diff_square)
        else:
            self.loss = tf.reduce_mean(self.diff_square)

        # backprop with RMS loss
        self.task = self.optimizer.minimize(self.loss, global_step=self.global_step)

    def randomRestart(self):
        self.env.restart()
        for _ in range(self.random_starts):
            action = rand.randrange(self.num_actions)
            reward = self.env.act(action)
            state = self.env.getScreen()
            terminal = self.env.isTerminal()
            self.buffer.add(state)
            if terminal:
                self.env.restart()

    def trainEps(self, train_step):
        if train_step < self.eps_endt:
            return self.eps - train_step * self.eps_delta
        else:
            return self.eps_endt

    def observe(self, exploration_rate):
        if rand.random() < exploration_rate:
            a = rand.randrange(self.num_actions)
        else:
            x = self.buffer.getInput()
            action_values = self.train_net.y.eval(feed_dict={self.train_net.x: x})
            a = np.argmax(action_values)

        state = self.buffer.getState()
        action = np.zeros(self.num_actions)
        action[a] = 1.0
        reward = self.env.act(a)
        screen = self.env.getScreen()
        self.buffer.add(screen)
        next_state = self.buffer.getState()
        terminal = self.env.isTerminal()
        self.memory.add(state, action, reward, next_state, terminal)
        return state, action, reward, next_state, terminal

    def doMinibatch(self, sess, successes, failures):
        batch = self.memory.getSample()
        state = np.array([batch[i][0] for i in range(self.batch_size)]).astype(np.float32)
        actions = np.array([batch[i][1] for i in range(self.batch_size)]).astype(np.float32)
        rewards = np.array([batch[i][2] for i in range(self.batch_size)]).astype(np.float32)
        successes += np.sum(rewards == 1)
        failures += np.sum(rewards == -1)
        next_state = np.array([batch[i][3] for i in range(self.batch_size)]).astype(np.float32)
        terminals = np.array([batch[i][4] for i in range(self.batch_size)]).astype(np.float32)

        rewards = np.clip(rewards, -1.0, 1.0)

        q_target = self.target_net.y.eval(feed_dict={self.target_net.x: next_state})
        # bootstrap with the maximum target-network Q value
        q_target_max = np.max(q_target, axis=1)
        q_target = rewards + ((1.0 - terminals) * (self.discount * q_target_max))

        (result, loss) = sess.run(
            [self.task, self.loss],
            feed_dict={
                self.q_target: q_target,
                self.train_net.x: state,
                self.actions: actions
            }
        )
        return successes, failures, loss

    def play(self):
        self.randomRestart()
        self.env.restart()
        for i in range(self.episodes):
            terminal = False
            while not terminal:
                action, reward, screen, terminal = self.observe(self.eps)

    def copy_weights(self, sess):
        for key in self.train_net.weights.keys():
            t_key = 'target/' + key.split('/', 1)[1]
            sess.run(self.target_net.weights[t_key].assign(self.train_net.weights[key]))

    def save(self, saver, sess, step):
        saver.save(sess, self.ckpt_file, global_step=step)

    def restore(self, saver):
        ckpt = tf.train.get_checkpoint_state(self.ckpt_file)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
class Player:
    def __init__(self, game):
        with open("config.yaml", 'r') as stream:
            try:
                config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)

        self.batch_size = config['batch_size']
        self.learning_rate = config['learning_rate']
        self.memory_size = config['memory_size']
        self.gamma = config['gamma']
        self.epsilon = config['epsilon']
        self.explore_start = config['explore_start']
        self.explore_stop = config['explore_stop']
        self.decay_rate = config['decay_rate']
        self.decay_step = config['decay_step']
        self.total_episodes = config['total_episodes']
        self.max_steps = config['max_steps']

        self.env = retro.make(game=game)
        self.memory = Memory(max_size=self.memory_size)
        self.action_size = self.env.action_space.n
        self.state_size = [38, 42, 4]
        self.possible_actions = np.array(
            np.identity(self.action_size, dtype=int).tolist())
        self.possible_actions = list(
            itertools.product((0, 1), repeat=self.action_size))
        self.action_size = len(self.possible_actions)

        tf.reset_default_graph()
        self.myNN = MyNN(self.action_size, self.state_size, self.learning_rate)

    def init_memory(self):
        state = self.env.reset()
        stacked_frames = deque(
            [np.zeros((38, 42), dtype=np.int) for i in range(4)], maxlen=4)
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        for i in range(self.batch_size):
            choice = random.randint(1, len(self.possible_actions)) - 1
            # action = possible_actions[choice]
            action = np.zeros(512, dtype=np.int)
            action[choice] = 1
            next_state, reward, done, _ = self.env.step(action)
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            self.memory.add((state, action, reward, next_state, done))
            state = next_state

    def train(self, render=False):
        self.init_memory()
        state = self.env.reset()
        stacked_frames = deque(
            [np.zeros((38, 42), dtype=np.int) for i in range(4)], maxlen=4)
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        saver = tf.train.Saver()
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            total_rewards = 0
            episode = 0
            for episode in range(self.total_episodes):
                step = 0
                state = self.env.reset()
                state, stacked_frames = stack_frames(stacked_frames, state, True)
                # episode += 1
                while step < self.max_steps:
                    a = datetime.now()
                    exp_exp_tradeoff = np.random.rand()
                    explore_probability = self.explore_stop + (
                        self.explore_start - self.explore_stop) * np.exp(
                            -self.decay_rate * self.decay_step)
                    if (explore_probability > exp_exp_tradeoff):
                        choice = random.randint(1, len(self.possible_actions)) - 1
                        action = self.possible_actions[choice]
                    else:
                        Qs = session.run(self.myNN.output, feed_dict={
                            self.myNN.input: state.reshape((1, *state.shape))
                        })
                        choice = np.argmax(Qs)
                        action = self.possible_actions[choice]

                    batch = self.memory.sample(self.batch_size)
                    target_Qs_batch = []
                    memory_states = []
                    memory_actions = []
                    memory_rewards = []
                    memory_next_states = []
                    memory_dones = []
                    for m in batch:
                        memory_states.append(m[0])
                        memory_actions.append(m[1])
                        memory_rewards.append(m[2])
                        memory_next_states.append(m[3])
                        memory_dones.append(m[4])
                    nextQs = session.run(
                        self.myNN.output,
                        feed_dict={self.myNN.input: memory_next_states})
                    for i in range(0, self.batch_size):
                        if batch[i][4]:
                            target_Qs_batch.append(batch[i][2])
                        else:
                            target_Qs_batch.append(batch[i][2] +
                                                   self.gamma * np.max(nextQs[i]))
                    target_Qs_batch = np.array([each for each in target_Qs_batch])

                    loss, _ = session.run(
                        [self.myNN.loss, self.myNN.optimizer],
                        feed_dict={
                            self.myNN.input: memory_states,
                            self.myNN.target_Q: target_Qs_batch,
                            self.myNN.actions: memory_actions
                        })

                    next_state, reward, done, _ = self.env.step(action)
                    total_rewards += reward
                    next_state, stacked_frames = stack_frames(
                        stacked_frames, next_state, False)
                    if (render):
                        self.env.render()
                    current_action = action
                    action = np.zeros(512, dtype=np.int)
                    action[choice] = 1
                    self.memory.add((state, action, reward, next_state, done))
                    if done:
                        next_state = np.zeros((38, 42), dtype=np.int)
                        next_state, stacked_frames = stack_frames(
                            stacked_frames, next_state, False)
                        self.memory.add(
                            (state, action, reward, next_state, done))
                        break
                    self.decay_step += 1
                    step += 1
                    state = next_state
                    b = datetime.now()
                    os.system('clear')
                    print("episode: ")
                    print(episode)
                    print("step: ")
                    print(step)
                    print("action: ")
                    print(current_action)
                    print("total_rewards: ")
                    print(total_rewards)
                    print("loss: ")
                    print(loss)
                    print("decay_step: ")
                    print(self.decay_step)
                    print("explore_probability: ")
                    print(explore_probability)
                    print("step time (seconds): ")
                    print((b - a).total_seconds())
                if episode % 5 == 0:
                    save_path = saver.save(session, "./models/model.ckpt")
                    print("Model Saved")

    def play(self, model_path=None):
        with tf.Session() as sess:
            total_test_rewards = []
            saver = tf.train.Saver()
            # Load the model
            if (model_path == None):
                saver.restore(sess, "./models/model.ckpt")
            else:
                saver.restore(sess, model_path)
            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                stacked_frames = deque(
                    [np.zeros((38, 42), dtype=np.int) for i in range(4)], maxlen=4)
                state, stacked_frames = stack_frames(stacked_frames, state, True)
                print("****************************************************")
                print("EPISODE ", episode)
                while True:
                    state = state.reshape((1, *self.state_size))
                    Qs = sess.run(self.myNN.output,
                                  feed_dict={self.myNN.input: state})
                    choice = np.argmax(Qs)
                    action = self.possible_actions[choice]
                    next_state, reward, done, _ = self.env.step(action)
                    self.env.render()
                    total_rewards += reward
                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break
                    next_state, stacked_frames = stack_frames(
                        stacked_frames, next_state, False)
                    state = next_state
            self.env.close()
class DQN(object): def __init__(self, config, sess): self.cf = config self.sess = sess self.env = Env(self.cf) self.eval_env = Env(self.cf) self.mainQnet = mainQnet(self.cf, action_n=self.env.action_n, scope='mainQnet') self.targetQnet = Qnet(self.cf, action_n=self.env.action_n, scope='targetQnet') main_vars = tf.trainable_variables('mainQnet') target_vars = tf.trainable_variables('targetQnet') self.update_targetQnet_ops = [] for v, tv in zip(main_vars, target_vars): self.update_targetQnet_ops.append(tv.assign(v)) self.model_dir = self.cf.model_dir self.saver = tf.train.Saver(max_to_keep=1) def predict_a(self, state): net = self.mainQnet a, Qout = self.sess.run([net.predict, net.Qout], {net.input: state}) # print('predict_a:', a, Qout) return a def train_mainQnet(self, step): pre_state, action, reward, done, post_state = self.memory.sample() targetQout = self.sess.run(self.targetQnet.Qout, {self.targetQnet.input: post_state}) targetQmax = np.max(targetQout, axis=1) # print('targetQout:', targetQout, targetQout.shape) # print('targetQmax:', targetQmax, targetQmax.shape) # print('done: ', 1. - done) targetQ = (1. - done) * self.cf.discount * targetQmax + reward # print('targetQ: ', targetQ, targetQ.shape) net = self.mainQnet run_ops = [net.trainer, net.grad_norm, net.q_loss] results = self.sess.run(run_ops, { net.input: pre_state, net.action: action, net.targetQ: targetQ }) # if results[1] > self.cf.max_grad_norm: # if True: # print(*results[1:]) for i in results[1:]: assert not np.isnan(i) self.mgn_avg.append(results[1]) self.q_loss_avg.append(results[2]) def update_targetQnet(self): # print('update_targetQnet...\n') self.sess.run(self.update_targetQnet_ops) def get_action(self, step): if step < self.cf.memory_start_size: return self.env.sample_action() if self.explore > self.cf.final_explore: self.explore -= self.explore_descend else: self.explore = self.cf.final_explore if random.random() > self.explore: action = self.predict_a(self.env.recent_states)[0] # print('predict_a:', action) else: action = self.env.sample_action() # print('sample_action:', action) return action def learn(self): self.memory = Memory(self.cf) self.explore = self.cf.init_explore self.explore_descend = (self.cf.init_explore - self.cf.final_explore ) / self.cf.final_explore_step if not os.path.exists(self.model_dir): os.makedirs(self.model_dir) self.summary_dir = self.cf.summary_dir self.summary_write = tf.summary.FileWriter(self.summary_dir, self.sess.graph) self.summary_write.flush() self.episode_r_summary = tf.Variable(0., trainable=False) er_op = tf.summary.scalar('r/episode_r_avg', self.episode_r_summary) self.eval_episode_r_summary = tf.Variable(0., trainable=False) eer_op = tf.summary.scalar('r/evaluate_episode_r_avg', self.eval_episode_r_summary) self.q_loss_summary = tf.Variable(0., trainable=False) ql_op = tf.summary.scalar('loss/q_loss_avg', self.q_loss_summary) self.mgn_summary = tf.Variable(0., trainable=False) mgn_op = tf.summary.scalar('loss/mgn_avg', self.mgn_summary) self.summary_op = tf.summary.merge([er_op, eer_op, ql_op, mgn_op]) self.q_loss_avg = [] self.mgn_avg = [] print('\nLearning...\n') self.update_targetQnet() step = 0 state = self.env.reset() done = False episode = 1 episode_step = 0 episode_reward = 0 episodes_average = [] best_score = -9999. 
while step < self.cf.total_step: step += 1 episode_step += 1 action = self.get_action(step) state_1, reward, done = self.env.act(action) # print('reward: ', reward, done) self.memory.add(state, action, np.sign(reward), done) episode_reward += reward state = state_1 if done: if self.env.real_done: episodes_average.append(episode_reward) episode += 1 episode_step = 0 episode_reward = 0 # self.memory.add(state, 0, 0, False) state = self.env.reset() done = False if step > self.cf.memory_start_size: if step % self.cf.train_frequency == 0: self.train_mainQnet(step) if step % self.cf.update_frequency == 0: self.update_targetQnet() if step % self.cf.evaluate_every_step == 0: episode_r = np.array(episodes_average) q_l_a = np.array(self.q_loss_avg) mgn_a = np.array(self.mgn_avg) eval_episode_r = self.evaluate() summary_op = self.sess.run( self.summary_op, { self.episode_r_summary: episode_r.mean(), self.eval_episode_r_summary: eval_episode_r.mean(), self.mgn_summary: mgn_a.mean(), self.q_loss_summary: q_l_a.mean() }) self.summary_write.add_summary(summary_op, global_step=step) self.summary_write.flush() episodes_average = [] self.q_loss_avg = [] self.mgn_avg = [] with open(self.summary_dir + 'r.csv', 'a') as f: r_csv = str(time.time()) + ',' + str(step) + ',' + str(episode_r.mean()) + ',' + str(episode_r.std()) +\ ',' + str(eval_episode_r.mean()) + ',' + str(eval_episode_r.std()) +\ ',' + str(mgn_a.mean()) + ',' + str(mgn_a.std()) + \ ',' + str(q_l_a.mean()) + ',' + str(q_l_a.std()) + '\n' print(r_csv) f.write(r_csv) if eval_episode_r.mean() > best_score: best_score = eval_episode_r.mean() self.saver.save(self.sess, self.model_dir + str(step)) self.env.close() def get_action_for_evaluate(self): if random.random() > self.cf.evaluate_explore: action = self.predict_a(self.eval_env.recent_states) else: action = self.eval_env.sample_action() return action def evaluate(self, load_model=False): if load_model: print('\nEvaluate...') print('Loading Model...' + self.model_dir + '\n') ckpt_state = tf.train.get_checkpoint_state(self.model_dir) print('ckpt_state: ', ckpt_state.model_checkpoint_path) self.saver.restore(self.sess, ckpt_state.model_checkpoint_path) self.eval_env.reset() episode_step = 0 episode_reward = 0 episodes_average = [] while len(episodes_average) < self.cf.evaluate_episodes: episode_step += 1 action = self.get_action_for_evaluate() state, reward, done = self.eval_env.act(action, is_training=False) episode_reward += reward if done or episode_step > self.cf.evaluate_episode_step: # print('evaluate episode_step: ', episode_step) self.eval_env.real_done = True episodes_average.append(episode_reward) episode_step = 0 episode_reward = 0 self.eval_env.reset() e_a = np.array(episodes_average) # print('evaluate: ', 'episodes: ', e_a.size, 'average: ', e_a.mean(), 'std: ', e_a.std()) return e_a
class Agent:
    def __init__(self):
        self.model, self.target = DQN(), DQN()
        if USE_CUDA:
            self.model.cuda()
            self.target.cuda()
        self.exp_buffer = Memory()
        self.exp_number = 0       # size of exp buffer so far
        self.param_updates = 0    # track how many times params updated
        self.opt = torch.optim.RMSprop(self.model.parameters(), lr=LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()

    # Make an action given a state
    def act(self, state, explore=True):
        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            a = np.random.randint(NUM_ACTIONS)
        else:
            # Send state to model
            a_vec = self.model(state)
            a = int(torch.argmax(torch.squeeze(a_vec)))
        return a

    # Clear the buffer
    def clear_exp_buffer(self):
        self.exp_buffer = Memory()
        self.exp_number = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        self.exp_buffer.add(exp)
        self.exp_number += 1

    # Replay gets a batch and trains on it
    def replay(self, batch_size):
        # If the experience buffer isn't big enough yet, don't do anything
        if self.exp_number < MIN_BUFFER_SIZE:
            return
        # Get a batch from the experience buffer
        batch = self.exp_buffer.get_batch(batch_size)
        s, a, r, s_new, _ = zip(*batch)
        s_new = s_new[:-1]  # Remove last item (it is 'None')
        # Turn the batch into something we can run through the model
        s = torch.cat(s)
        a = torch.LongTensor(a).unsqueeze(1)
        r = torch.FloatTensor(r).unsqueeze(1)
        s_new = torch.cat(s_new)
        if USE_CUDA:
            a = a.cuda()
            r = r.cuda()
        # Q-values the model predicted for the actions actually taken
        # (.gather picks the Q-value of the specific action a)
        pred_q_vals = self.model(s).gather(1, a)
        # Best Q-value obtainable from s_new according to the target network;
        # append a 0 for the terminal state that was dropped above
        s_new_q_vals = self.target(s_new).max(1)[0].detach()
        zero = torch.zeros(1)  # was torch.FloatTensor(0), which creates an *empty* tensor
        if USE_CUDA:
            zero = zero.cuda()
        s_new_q_vals = torch.cat((s_new_q_vals, zero)).unsqueeze(1)  # match r's (batch, 1) shape
        exp_q_vals = r + s_new_q_vals * GAMMA
        myloss = self.loss(pred_q_vals, exp_q_vals)
        self.opt.zero_grad()
        myloss.backward()
        if WEIGHT_CLIPPING:
            # Despite the flag's name this clamps *gradients*, which guards against
            # exploding gradients; it has to happen before opt.step() to take effect
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
        self.opt.step()
        if self.param_updates % TARGET_UPDATE_INTERVAL == 0:
            self.target.load_state_dict(self.model.state_dict())
        self.param_updates += 1
        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY
        return myloss.item()
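# Minimal usage sketch (not from the original source): one episode with the
# PyTorch Agent above. `env` is a Gym-style environment and `preprocess` is a
# hypothetical helper that turns an observation into a (1, ...) float tensor,
# since act()/replay() expect batched torch tensors. Terminal transitions store
# None as the next state, matching the convention replay() strips from the end
# of a batch.
def run_episode(agent, env, preprocess, batch_size=32):
    state = preprocess(env.reset())
    done, episode_reward = False, 0.0
    while not done:
        action = agent.act(state)                   # epsilon-greedy
        obs, reward, done, _ = env.step(action)
        next_state = None if done else preprocess(obs)
        agent.add_exp((state, action, reward, next_state, done))
        agent.replay(batch_size)                    # returns early until MIN_BUFFER_SIZE is reached
        if not done:
            state = next_state
        episode_reward += reward
    return episode_reward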
class Agent: def __init__(self, n_states, n_actions, n_goals, action_bounds, capacity, env, k_future, batch_size, action_size=1, tau=0.05, actor_lr=1e-3, critic_lr=1e-3, gamma=0.98): self.device = device("cpu") self.n_states = n_states self.n_actions = n_actions self.n_goals = n_goals self.k_future = k_future self.action_bounds = action_bounds self.action_size = action_size self.env = env self.actor = Actor(self.n_states, n_actions=self.n_actions, n_goals=self.n_goals).to(self.device) self.critic = Critic(self.n_states, action_size=self.action_size, n_goals=self.n_goals).to(self.device) self.sync_networks(self.actor) self.sync_networks(self.critic) self.actor_target = Actor(self.n_states, n_actions=self.n_actions, n_goals=self.n_goals).to(self.device) self.critic_target = Critic(self.n_states, action_size=self.action_size, n_goals=self.n_goals).to(self.device) self.init_target_networks() self.tau = tau self.gamma = gamma self.capacity = capacity self.memory = Memory(self.capacity, self.k_future, self.env) self.batch_size = batch_size self.actor_lr = actor_lr self.critic_lr = critic_lr self.actor_optim = Adam(self.actor.parameters(), self.actor_lr) self.critic_optim = Adam(self.critic.parameters(), self.critic_lr) self.state_normalizer = Normalizer(self.n_states[0], default_clip_range=5) self.goal_normalizer = Normalizer(self.n_goals, default_clip_range=5) def choose_action(self, state, goal, train_mode=True): #takes state and goal, concatenates it and passes it to actor network #actor returns action, to which random weird noises are added and returned state = self.state_normalizer.normalize(state) goal = self.goal_normalizer.normalize(goal) state = np.expand_dims(state, axis=0) goal = np.expand_dims(goal, axis=0) with torch.no_grad(): x = np.concatenate([state, goal], axis=1) x = from_numpy(x).float().to(self.device) action = self.actor(x)[0].cpu().data.numpy() if train_mode: action += 0.2 * np.random.randn(self.n_actions) action = np.clip(action, self.action_bounds[0], self.action_bounds[1]) random_actions = np.random.uniform(low=self.action_bounds[0], high=self.action_bounds[1], size=self.n_actions) action += np.random.binomial(1, 0.3, 1)[0] * (random_actions - action) return action def store(self, mini_batch): for batch in mini_batch: self.memory.add(batch) self._update_normalizer(mini_batch) def init_target_networks(self): self.hard_update_networks(self.actor, self.actor_target) self.hard_update_networks(self.critic, self.critic_target) @staticmethod def hard_update_networks(local_model, target_model): target_model.load_state_dict(local_model.state_dict()) @staticmethod def soft_update_networks(local_model, target_model, tau=0.05): for t_params, e_params in zip(target_model.parameters(), local_model.parameters()): t_params.data.copy_(tau * e_params.data + (1 - tau) * t_params.data) def train(self): states, actions, rewards, next_states, goals = self.memory.sample( self.batch_size) states = self.state_normalizer.normalize(states) next_states = self.state_normalizer.normalize(next_states) goals = self.goal_normalizer.normalize(goals) inputs = np.concatenate([states, goals], axis=1) next_inputs = np.concatenate([next_states, goals], axis=1) inputs = torch.Tensor(inputs).to(self.device) rewards = torch.Tensor(rewards).to(self.device) next_inputs = torch.Tensor(next_inputs).to(self.device) actions = torch.Tensor(actions).to(self.device) with torch.no_grad(): #get Qmax target_q = self.critic_target(next_inputs, self.actor_target(next_inputs)) #apply bellman equation on Qmax to get computed Q 
for actions from above(initial state, action) target_returns = rewards + self.gamma * target_q.detach() target_returns = torch.clamp(target_returns, -1 / (1 - self.gamma), 0) #use critic to generate actual Q for (initial states and actions) q_eval = self.critic(inputs, actions) critic_loss = (target_returns - q_eval).pow(2).mean() a = self.actor(inputs) actor_loss = -self.critic(inputs, a).mean() actor_loss += a.pow(2).mean() self.actor_optim.zero_grad() actor_loss.backward() self.sync_grads(self.actor) self.actor_optim.step() self.critic_optim.zero_grad() critic_loss.backward() self.sync_grads(self.critic) self.critic_optim.step() return actor_loss.item(), critic_loss.item() def save_weights(self): torch.save( { "actor_state_dict": self.actor.state_dict(), "state_normalizer_mean": self.state_normalizer.mean, "state_normalizer_std": self.state_normalizer.std, "goal_normalizer_mean": self.goal_normalizer.mean, "goal_normalizer_std": self.goal_normalizer.std }, "NBM_FetchPickAndPlace_v2.pth") def load_weights(self): checkpoint = torch.load("NBM_FetchPickAndPlace_v2.pth") actor_state_dict = checkpoint["actor_state_dict"] self.actor.load_state_dict(actor_state_dict) state_normalizer_mean = checkpoint["state_normalizer_mean"] self.state_normalizer.mean = state_normalizer_mean state_normalizer_std = checkpoint["state_normalizer_std"] self.state_normalizer.std = state_normalizer_std goal_normalizer_mean = checkpoint["goal_normalizer_mean"] self.goal_normalizer.mean = goal_normalizer_mean goal_normalizer_std = checkpoint["goal_normalizer_std"] self.goal_normalizer.std = goal_normalizer_std def set_to_eval_mode(self): self.actor.eval() # self.critic.eval() def update_networks(self): self.soft_update_networks(self.actor, self.actor_target, self.tau) self.soft_update_networks(self.critic, self.critic_target, self.tau) def _update_normalizer(self, mini_batch): states, goals = self.memory.sample_for_normalization(mini_batch) self.state_normalizer.update(states) self.goal_normalizer.update(goals) self.state_normalizer.recompute_stats() self.goal_normalizer.recompute_stats() @staticmethod def sync_networks(network): comm = MPI.COMM_WORLD flat_params = _get_flat_params_or_grads(network, mode='params') comm.Bcast(flat_params, root=0) _set_flat_params_or_grads(network, flat_params, mode='params') @staticmethod def sync_grads(network): flat_grads = _get_flat_params_or_grads(network, mode='grads') comm = MPI.COMM_WORLD global_grads = np.zeros_like(flat_grads) comm.Allreduce(flat_grads, global_grads, op=MPI.SUM) _set_flat_params_or_grads(network, global_grads, mode='grads')
choice = random.randint(1, len(possible_actions)) - 1
action = possible_actions[choice]
next_state, reward, done, _ = env.step(action)
# env.render()

# Stack the frames
next_state, stacked_frames = stack_frames(stacked_frames, next_state, False, stack_size)

# If the episode is finished (we're dead 3x)
if done:
    # We finished the episode
    next_state = np.zeros(state.shape)
    # Add experience to memory
    memory.add((state, action, reward, next_state, done))
    # Start a new episode
    state = env.reset()
    # Stack the frames
    state, stacked_frames = stack_frames(stacked_frames, state, True, stack_size)
else:
    # Add experience to memory
    memory.add((state, action, reward, next_state, done))
    # Our new state is now the next_state
    state = next_state
class Agent: def __init__(self, input_dim, n_actions, lr=0.00025, eps=1.0, memory=150000): self.n_states = input_dim self.n_actions = n_actions self.eps = eps self.BATCH_SIZE = 32 self.GAMMA = 0.99 self.MIN_EPS = 0.1 self.model = self.build_model(input_dim, n_actions, lr) self.memory = Memory(memory) self.zeros = np.zeros(self.n_states) def build_model(self, input_dim, n_actions, lr): model = Sequential() model.add(Dense(input_dim=input_dim, units=256, activation='relu')) model.add(Dense(input_dim=input_dim, units=1024, activation='relu')) model.add(Dropout(0.3)) model.add(Dense(input_dim=input_dim, units=2048, activation='relu')) model.add(Dropout(0.4)) model.add(Dense(input_dim=input_dim, units=48, activation='relu')) model.add(Dense(units=n_actions, activation='linear')) optimizer = RMSprop(lr=lr) model.compile(loss='mse', optimizer=optimizer) return model def train(self, x, y, verbose=0): self.model.fit(x, y, verbose=verbose, batch_size=64) def predict(self, state): return self.model.predict(state) def predict_single(self, state): q_val = self.predict(state.reshape(1, self.n_states)) if random.random() > self.eps: return [np.argmax(q_val.flatten()), q_val] else: return [random.randint(0, self.n_actions - 1), q_val] def save(self, state, next_state, action, reward): self.memory.add(np.array(state, dtype=np.uint8), np.array(next_state), int(action), int(reward)) def replay(self): batch = self.memory.sample(self.BATCH_SIZE) x = np.empty(0).reshape(0, self.n_states) y = np.empty(0).reshape(0, self.n_actions) none_list = [None for itr in range(0, self.n_states)] next_state = np.array([ (self.zeros if np.array_equal(state, none_list) else state) for state in batch[:, self.n_states:2 * self.n_states] ]) state = batch[:, :self.n_states] target_Q = self.predict(next_state) Q_value = self.predict(state) for indx, element in enumerate(batch): state = element[:self.n_states] next_state = element[self.n_states:2 * self.n_states] action = int(element[2 * self.n_states]) reward = element[2 * self.n_states + 1] q_val = Q_value[indx] if np.array_equal(next_state, none_list): q_val[action] = reward else: q_val[action] = reward + self.GAMMA * np.amax( np.array(target_Q[indx])) y = np.vstack([y, q_val]) x = np.vstack([x, state]) self.train(x, y) def decay(self): if self.eps > self.MIN_EPS: self.eps = self.eps * 0.99
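# Minimal usage sketch (an assumption, not from the original source): it drives
# the Keras Agent above with a Gym-style `env` whose observations are flat
# vectors of length input_dim. Terminal transitions store a next_state made of
# None values so that the none_list check in replay() treats them as terminal.
import numpy as np

def run_episode(agent, env):
    state = env.reset()
    done, total_reward = False, 0.0
    while not done:
        action, _ = agent.predict_single(np.asarray(state))  # epsilon-greedy choice plus the raw Q-values
        next_state, reward, done, _ = env.step(action)
        stored_next = [None] * agent.n_states if done else next_state
        agent.save(state, stored_next, action, reward)
        state = next_state
        total_reward += reward
    agent.replay()  # fit one sampled batch
    agent.decay()   # anneal epsilon toward MIN_EPS
    return total_reward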
class Agent: """ エージェントクラス Attributes ---------- brain : brain memory : memory replay_size : int 経験再生時に取り出す経験データの数 last_state : ndarray last_action : int episode : int 動的に追加する. それぞれ状態,行動,エピソードの番号を保存 """ def __init__(self, state_size, action_size, replay_size=32): """ state_size : int 状態空間の次元数 action_size : int 行動空間の次元数 """ self.brain = Brain(state_size, action_size) self.memory = Memory() self.replay_size = replay_size def get_action(self, state, episode, optimal=False): """ 行動を決定する. Parameters ---------- state : list 状態ベクトル. """ # 方策により行動する if np.random.rand() < 0.001 + 0.9 / (1.0 + episode): action = np.random.randint(self.brain.action_size) else: # Q値を取得 q_values = self.brain.get_q_values(state) action = np.argmax(q_values) # 状態と行動を保存する self.last_state = state self.last_action = action self.episode = episode return action def learn(self, reward, next_state, done): """ 学習を行う. Parameters ---------- reward : たぶんint 報酬 state : array 状態ベクトル done : bool 終端状態かどうか """ # 経験をメモリーに保存する experience = (self.last_state, self.last_action, reward, next_state, done) self.memory.add(experience) # 経験再生 if self.memory.is_able_fit(): experiences = self.memory.get_sample() self.brain.replay(self.episode, experiences, self.replay_size)
class Agent: def __init__(self, net, actionSet, goalSet, metaEpsilon=defaultMetaEpsilon, epsilon=defaultEpsilon, controllerEpsilon=defaultControllerEpsilon, tau=defaultTau): self.actionSet = actionSet self.controllerEpsilon = controllerEpsilon self.goalSet = goalSet self.metaEpsilon = metaEpsilon self.nSamples = defaultNSample self.metaNSamples = defaultMetaNSamples self.gamma = defaultGamma self.targetTau = tau self.net = net self.memory = Memory(controllerMemCap) self.metaMemory = Memory(metaMemCap) def selectMove(self, state, goal): goalVec = utils.oneHot(goal) if self.controllerEpsilon[goal] < random.random(): # predict action dummyYtrue = np.zeros((1, 8)) dummyMask = np.zeros((1, 8)) return np.argmax(self.net.controllerNet.predict([np.reshape(state, (1, 84, 84, 4)), np.asarray([goalVec]), dummyYtrue, dummyMask], verbose=0)[1]) return random.choice(self.actionSet) def setControllerEpsilon(self, epsilonArr): self.controllerEpsilon = epsilonArr def selectGoal(self, state): if self.metaEpsilon < random.random(): # predict action pred = self.net.metaNet.predict([np.reshape(state, (1, 84, 84, 4)), np.zeros((1,3)), np.zeros((1,3))], verbose=0)[1] return np.argmax(pred) return random.choice(self.goalSet) def selectTrueGoal(self, goalNum): return trueSubgoalOrder[goalNum] def setMetaEpsilon(self, epsilon): self.metaEpsilon = epsilon def criticize(self, reachGoal, action, die, distanceReward, useSparseReward): reward = 0.0 if reachGoal: reward += 50.0 # if die: # reward -= 200.0 if not useSparseReward: # if action == 0: # reward -= 0.1 reward += distanceReward reward = np.minimum(reward, maxReward) reward = np.maximum(reward, minReward) return reward def store(self, experience, meta=False): if meta: self.metaMemory.add(np.abs(experience.reward), experience) else: self.memory.add(np.abs(experience.reward), experience) def _update(self, stepCount): batches = self.memory.sample(self.nSamples) stateVector = [] goalVector = [] for batch in batches: exp = batch[1] stateVector.append(exp.state) goalVector.append(utils.oneHot(exp.goal)) stateVector = np.asarray(stateVector) goalVector = np.asarray(goalVector) nextStateVector = [] for batch in batches: exp = batch[1] nextStateVector.append(exp.next_state) nextStateVector = np.asarray(nextStateVector) rewardVectors = self.net.controllerNet.predict([stateVector, goalVector, np.zeros((self.nSamples,8)), np.zeros((self.nSamples, 8 ))], verbose=0)[1] rewardVectorsCopy = np.copy(rewardVectors) rewardVectors = np.zeros((self.nSamples, 8)) nextStateRewardVectors = self.net.targetControllerNet.predict([nextStateVector, goalVector, np.zeros((self.nSamples,8)), np.zeros((self.nSamples, 8 ))], verbose=0)[1] maskVector = np.zeros((self.nSamples, 8)) for i, batch in enumerate(batches): exp = batch[1] idx = batch[0] maskVector[i, exp.action] = 1. 
rewardVectors[i][exp.action] = exp.reward if not exp.done: rewardVectors[i][exp.action] += self.gamma * max(nextStateRewardVectors[i]) self.memory.update(idx, np.abs(rewardVectors[i][exp.action] - rewardVectorsCopy[i][exp.action])) rewardVectors = np.asarray(rewardVectors) loss = self.net.controllerNet.train_on_batch([stateVector, goalVector, rewardVectors, maskVector], [np.zeros(self.nSamples), rewardVectors]) #Update target network controllerWeights = self.net.controllerNet.get_weights() controllerTargetWeights = self.net.targetControllerNet.get_weights() for i in range(len(controllerWeights)): controllerTargetWeights[i] = self.targetTau * controllerWeights[i] + (1 - self.targetTau) * controllerTargetWeights[i] self.net.targetControllerNet.set_weights(controllerTargetWeights) return loss def _update_meta(self, stepCount): batches = self.metaMemory.sample(self.metaNSamples) stateVectors = np.asarray([batch[1].state for batch in batches]) nextStateVectors = np.asarray([batch[1].next_state for batch in batches]) rewardVectors = self.net.metaNet.predict([stateVectors, np.zeros((self.nSamples,3)), np.zeros((self.nSamples, 3))], verbose=0)[1] rewardVectorsCopy = np.copy(rewardVectors) rewardVectors = np.zeros((self.metaNSamples, 3)) nextStateRewardVectors = self.net.targetMetaNet.predict([nextStateVectors, np.zeros((self.nSamples,3)), np.zeros((self.nSamples, 3))], verbose=0)[1] maskVector = np.zeros((self.metaNSamples, 3)) for i, batch in enumerate(batches): exp = batch[1] idx = batch[0] maskVector[i, exp.goal] = 1. rewardVectors[i][exp.goal] = exp.reward if not exp.done: rewardVectors[i][np.argmax(exp.goal)] += self.gamma * max(nextStateRewardVectors[i]) self.metaMemory.update(idx, np.abs(rewardVectors[i][exp.goal] - rewardVectorsCopy[i][exp.goal])) loss = self.net.metaNet.train_on_batch([stateVectors, rewardVectors, maskVector], [np.zeros(self.nSamples), rewardVectors]) #Update target network metaWeights = self.net.metaNet.get_weights() metaTargetWeights = self.net.targetMetaNet.get_weights() for i in range(len(metaWeights)): metaTargetWeights[i] = self.targetTau * metaWeights[i] + (1 - self.targetTau) * metaTargetWeights[i] self.net.targetMetaNet.set_weights(metaTargetWeights) return loss def update(self, stepCount, meta=False): if meta: loss = self._update_meta(stepCount) else: loss = self._update(stepCount) return loss def annealMetaEpsilon(self, stepCount): self.metaEpsilon = defaultEndEpsilon + max(0, (defaultMetaEpsilon - defaultEndEpsilon) * \ (defaultAnnealSteps - max(0, stepCount - defaultRandomPlaySteps)) / defaultAnnealSteps) def annealControllerEpsilon(self, stepCount, goal): self.controllerEpsilon[goal] = defaultEndEpsilon + max(0, (defaultControllerEpsilon[goal] - defaultEndEpsilon) * \ (defaultAnnealSteps - max(0, stepCount - defaultRandomPlaySteps)) / defaultAnnealSteps)
def main(): global frame_size, stack_size state_size = list(frame_size) state_size.append(stack_size) game = Doom() no_actions = len(game.actions) learning_rate = 0.002 no_episodes = 500 max_steps = 100 batch_size = 32 explore_max = 1. explore_min = 0.01 decay_rate = 0.00001 gamma = 0.95 pretrain_length = batch_size memory_size = 1000000 training = True episode_render = True tf.reset_default_graph() deep_Q_network = DeepQNetwork(state_size, no_actions, learning_rate) memory = Memory(max_size=memory_size) game.start_game() game.restart_episode() for i in range(pretrain_length): if i == 0: img, game_vars = game.get_environment_state() state = frame_stacking(img, True) action = random.choice(game.actions) reward = game.take_action(action) done = game.is_episode_finished() if done: next_state = np.zeros(state.shape) memory.add((state, action, reward, next_state, done)) game.restart_episode() img, game_vars = game.get_environment_state() state = frame_stacking(img, True) else: next_img, next_game_vars = game.get_environment_state() next_state = frame_stacking(img, False) memory.add((state, action, reward, next_state, done)) state = next_state writer = tf.summary.FileWriter("./tensorboard/dqn/1") tf.summary.scalar("Loss", deep_Q_network.loss) write_op = tf.summary.merge_all() """Prediction """ def predict_action(curr_decay_step, curr_state): exp_exp_tradeoff = np.random.rand() curr_explore_prob = explore_min + ((explore_max - explore_min) * np.exp(-decay_rate * curr_decay_step)) if curr_explore_prob > exp_exp_tradeoff: curr_action = random.choice(game.actions) else: Qs = sess.run(deep_Q_network.output, feed_dict={deep_Q_network.inputs: curr_state.reshape((1, *curr_state.shape))}) choice = np.argmax(Qs) curr_action = game.actions[choice] return curr_action, curr_explore_prob """Training Agent""" saver = tf.train.Saver() if training: with tf.Session() as sess: sess.run(tf.global_variables_initializer()) decay_step = 0 game.start_game() for episode in range(no_episodes): step = 0 episode_rewards = [] game.restart_episode() img, game_vars = game.get_environment_state() state = frame_stacking(img, True) while step < max_steps: step += 1 decay_step += 1 action, explore_prob = predict_action(decay_step, state) reward = game.take_action(action) done = game.is_episode_finished() episode_rewards.append(reward) if done: next_img = np.zeros(frame_size, dtype=np.int) next_state = frame_stacking(next_img, False) step = max_steps total_rewards = np.sum(episode_rewards) print("Episode No. 
{}".format(episode), "Total reward: {}".format(total_rewards), "Training Loss: {:.4f}".format(loss_val), "Explore Prob: {:.4f}".format(explore_prob)) memory.add((state, action, reward, next_state, done)) else: next_img, next_game_vars = game.get_environment_state() next_state = frame_stacking(next_img, False) memory.add((state, action, reward, next_state, done)) state = next_state """Learning Part """ """Get mini-batches from memory and train""" batch = memory.sample(batch_size) states_mb = [] actions_mb = [] rewards_mb = [] next_states_mb = [] dones_mb = [] for each in batch: states_mb.append(each[0]) actions_mb.append(each[1]) rewards_mb.append(each[2]) next_states_mb.append(each[3]) dones_mb.append(each[4]) states_mb = np.array(states_mb) actions_mb = np.array(actions_mb) rewards_mb = np.array(rewards_mb) next_states_mb = np.array(next_states_mb) dones_mb = np.array(dones_mb) target_Qs_batch = [] Qs_next_state = sess.run(deep_Q_network.output, feed_dict={deep_Q_network.inputs: next_states_mb}) for i in range(0, len(batch)): terminal = dones_mb[i] if terminal: target_Qs_batch.append(rewards_mb[i]) else: target = rewards_mb[i] + (gamma * np.max(Qs_next_state[i])) target_Qs_batch.append(target) targets_mb = np.array(target_Qs_batch) loss_val, _ = sess.run([deep_Q_network.loss, deep_Q_network.optimizer], feed_dict={deep_Q_network.inputs: states_mb, deep_Q_network.target_Q: targets_mb, deep_Q_network.actions: actions_mb}) summary = sess.run(write_op, feed_dict={deep_Q_network.inputs: states_mb, deep_Q_network.target_Q: targets_mb, deep_Q_network.actions: actions_mb}) writer.add_summary(summary, episode) writer.flush() if episode % 5 == 0: save_path = saver.save(sess, "./models/model.ckpt") print("Model Saved")
def ppo_train(model_name, load_model=False, actor_filename=None, critic_filename=None, optimizer_filename=None): print("PPO -- Training") env = make('hungry_geese') trainer = env.train(['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py']) agent = PPOAgent(rows=11, columns=11, num_actions=3) memory = Memory() if load_model: agent.load_model_weights(actor_filename, critic_filename) agent.load_optimizer_weights(optimizer_filename) episode = 0 start_episode = 0 end_episode = 50000 reward_threshold = None threshold_reached = False epochs = 4 batch_size = 128 current_frame = 0 training_rewards = [] evaluation_rewards = [] last_1000_ep_reward = [] for episode in range(start_episode + 1, end_episode + 1): obs_dict = trainer.reset() ep_reward, ep_steps, done = 0, 0, False prev_direction = 0 while not done: current_frame += 1 ep_steps += 1 state = preprocess_state(obs_dict, prev_direction) action = agent.select_action(state, training=True) direction = get_direction(prev_direction, action) next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction]) reward = calculate_reward(obs_dict, next_obs_dict) next_state = preprocess_state(next_obs_dict, direction) memory.add(state, action, reward, next_state, float(done)) obs_dict = next_obs_dict prev_direction = direction ep_reward += reward if current_frame % batch_size == 0: for _ in range(epochs): states, actions, rewards, next_states, dones = memory.get_all_samples() agent.fit(states, actions, rewards, next_states, dones) memory.clear() agent.update_networks() print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) + " - STEPS: " + str(ep_steps)) if len(last_1000_ep_reward) == 1000: last_1000_ep_reward = last_1000_ep_reward[1:] last_1000_ep_reward.append(ep_reward) if reward_threshold: if len(last_1000_ep_reward) == 1000: if np.mean(last_1000_ep_reward) >= reward_threshold: print("You solved the task after" + str(episode) + "episodes") agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5', 'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5') threshold_reached = True break if episode % 1000 == 0: print('Episode ' + str(episode) + '/' + str(end_episode)) last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3) training_rewards.append(last_1000_ep_reward_mean) print('Average reward in last 1000 episodes: ' + str(last_1000_ep_reward_mean)) print() if episode % 1000 == 0: eval_reward = 0 for i in range(100): obs_dict = trainer.reset() done = False prev_direction = 0 while not done: state = preprocess_state(obs_dict, prev_direction) action = agent.select_action(state) direction = get_direction(prev_direction, action) next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction]) reward = calculate_reward(obs_dict, next_obs_dict) obs_dict = next_obs_dict prev_direction = direction eval_reward += reward eval_reward /= 100 evaluation_rewards.append(eval_reward) print("Evaluation reward: " + str(eval_reward)) print() if episode % 5000 == 0: agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5', 'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5') agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(episode) + '_optimizer.npy') agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(end_episode) + '.h5', 'models/ppo_critic_' + model_name + '_' + str(end_episode) + '.h5') agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(end_episode) + '_optimizer.npy') if 
threshold_reached: plt.plot([i for i in range(start_episode + 1000, episode, 1000)], training_rewards) else: plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], training_rewards) plt.title("Reward") plt.show() plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], evaluation_rewards) plt.title('Evaluation rewards') plt.show()
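# Minimal usage sketch (assumed): start PPO training from scratch, or resume
# from previously saved weights; 'geese_v1' and the episode number 5000 are
# only illustrative, but the filenames follow the naming scheme used above.
ppo_train('geese_v1')
# ppo_train('geese_v1', load_model=True,
#           actor_filename='models/ppo_actor_geese_v1_5000.h5',
#           critic_filename='models/ppo_critic_geese_v1_5000.h5',
#           optimizer_filename='models/ppo_geese_v1_5000_optimizer.npy')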
class Agent: def __init__(self, env, model, epsilon=.9, min_epsilon=.1, epsilon_decay=1e-3): self.env = env self.model = model self.epsilon = epsilon self.min_epsilon = min_epsilon self.epsilon_decay = epsilon_decay self.episode = 0 self.positiveMemory = Memory(model=self.model, episode_max_size=20) self.negativeMemory = Memory(model=self.model, episode_max_size=10) def play(self): terminal = False observation = self.env.reset() X = np.zeros((2,) + observation.shape) X[0] = observation X[1] = observation total_reward = 0 while terminal == False and total_reward < 200: y = self.model.predict(X) action = np.argmax(y) observation, reward, terminal, info = self.env.executeAction(action) total_reward += reward X[0] = X[1] X[1] = observation return total_reward def learn(self, overfit=False, games=1, warmup=0, skip_frames=4): self.episode += 1. epsilon = max(self.min_epsilon, self.epsilon - self.episode * self.epsilon_decay) total_reward = 0 qs = [] predictions = None if warmup > 0: print "Adding %d warmup games"%(warmup) games += warmup for game in range(1, games + 1): print "Game %d/%d..."%(game, games) terminal = False observation = self.env.reset() framebuffer = np.zeros((skip_frames,) + observation.shape) framebuffer[-1] = observation frame = 0 action = np.random.randint(0, 2) episode = [] while terminal == False: frame += 1 if frame%skip_frames != 0: observation, reward, terminal, info = self.env.executeAction(action) if frame%skip_frames == 0 or reward != 0 or terminal: X = framebuffer.copy() y = self.model.predict(X) qs.append(max(y)) if predictions is None: predictions = np.zeros_like(y) predictions[np.argmax(y)] += 1 if frame%skip_frames == 0: if np.random.rand() <= epsilon: action = np.random.randint(0, len(y)) else: action = np.argmax(y) observation, reward, terminal, info = self.env.executeAction(action) total_reward += reward y[action] = 1. # encourage current action, for now episode.append((X, y, action, reward, terminal)) if reward == 1: self.positiveMemory.add(episode, positive=True) episode = [] if reward == -1: self.negativeMemory.add(episode, positive=False) episode = [] framebuffer[0:skip_frames-1] = framebuffer[1:] framebuffer[-1] = observation print "Score %.1f"%(total_reward / games) X_pos, y_pos = self.positiveMemory.sample(nbr_positive=(games-warmup)*25) X_neg, y_neg = self.negativeMemory.sample(nbr_negative=(games-warmup)*100) if not X_pos is None: print "Sample %d positive and %d negative memories"%(len(y_pos), len(y_neg)) X_t = np.concatenate((X_pos, X_neg)) y_t = np.concatenate((y_pos, y_neg)) else: print "Sample %d negative memories"%(len(y_neg)) X_t = X_neg y_t = y_neg while overfit: loss = self.model.learn(X_t, y_t) print "Loss: %f"%(loss) loss = self.model.learn(X_t, y_t) return total_reward / games, loss, np.mean(qs), epsilon, predictions