def agent_loop(dictionary, lock1, lock2):
    random.seed()
    environment = GridWorldModel()
    agent = Agent(environment)
    agent.Q = dictionary[Q_SHARED_KEY]  # initialize with shared Q
    while environment.step_count < MAX_STEPS_PER_AGENT:
        environment.reset()
        agent.state = environment.get_start_state()
        while True:
            agent.act()
            if environment.step_count % ASYNC_UPDATE_INTERVAL == 0 or environment.is_terminal_state():
                lock1.acquire()
                q = dictionary[Q_SHARED_KEY]
                # Need to write it back, otherwise the proxy won't pick up the changes.
                dictionary[Q_SHARED_KEY] = np.add(q, agent.dQ)
                lock1.release()
                agent.dQ = np.zeros((GridWorldModel.get_number_of_states(),
                                     GridWorldModel.get_number_of_actions()),
                                    dtype=float)
            if environment.is_terminal_state():
                break
    lock2.acquire()
    combined_rewards = dictionary[REWARDS_KEY]
    agents_rewards = np.array(agent.rewards)
    # ...same here
    dictionary[REWARDS_KEY] = np.add(combined_rewards, agents_rewards[:MAX_STEPS_PER_AGENT])
    lock2.release()
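# A minimal launcher sketch (not part of the original snippet) showing how the
# agent_loop workers above might be started: a multiprocessing.Manager dict holds the
# shared Q-table and reward totals behind proxies, and two Locks guard the Q and reward
# updates. The worker count of 4 and the initial array shapes are assumptions.
if __name__ == "__main__":
    import multiprocessing as mp
    import numpy as np

    manager = mp.Manager()
    shared = manager.dict()
    shared[Q_SHARED_KEY] = np.zeros((GridWorldModel.get_number_of_states(),
                                     GridWorldModel.get_number_of_actions()), dtype=float)
    shared[REWARDS_KEY] = np.zeros(MAX_STEPS_PER_AGENT, dtype=float)
    q_lock = mp.Lock()
    rewards_lock = mp.Lock()

    workers = [mp.Process(target=agent_loop, args=(shared, q_lock, rewards_lock))
               for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()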
def train():
    env = gym.make("CartPole-v1")
    input_space = env.observation_space.shape[0]
    output_space = env.action_space.n
    print(input_space, output_space)
    agent = Agent(input_space, output_space)
    run = 0
    x = []
    y = []
    while run < 100:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, -1])
        step = 0
        while True:
            step += 1  # more steps simply means the pole stayed upright longer
            env.render()
            action = agent.act(state)
            state_next, reward, done, _ = env.step(action)
            reward = reward if not done else -reward  # the pole fell, so penalize the final step
            state_next = np.reshape(state_next, [1, -1])
            agent.add_data(state, action, reward, state_next, done)
            state = state_next
            if done:
                print("Run: " + str(run) + ", exploration: " + str(agent.exploration) + ", score: " + str(step))
                x.append(run)
                y.append(step)
                break
            agent.train_from_buffer()  # train on the replay buffer at every step
    plt.plot(x, y)
    plt.show()
def run(self, agent: Agent.Agent):
    s = th.tensor(self.env.reset(), dtype=th.float)
    R = 0
    while True:
        self.env.render()
        a = agent.act(s)
        s_, r, done, info = self.env.step(a)
        s_ = th.tensor(s_, dtype=th.float)
        if done:  # terminal state
            s_ = None
        agent.observe((s, a, r, s_))
        agent.replay()
        s = s_
        R += r
        if done:
            break
    print("Total reward:", R)
def run_dqn():
    total_reward_history = []

    # start Q-Learning
    agent = Agent(shape=state_shape, num_actions=num_actions)

    # Init Memory
    state = env.reset()
    state = img_prcss.preprocess(image=state)
    print("Start adding memory")
    while not agent.memory.is_full():
        action = agent.act_randomly()
        next_state, reward, done, _ = env.step(action)
        next_state = img_prcss.preprocess(image=next_state)
        if done:
            next_state = None
        experience = (state, action, reward, next_state)
        agent.memory.add(experience)
        if done:
            state = env.reset()
        else:
            state = next_state
    print("memory is full")

    # init display
    img_states = env.render(mode='rgb_array')
    img = plt.imshow(img_states)  # only call this once

    for episode in range(NUM_EPISODES):
        # reset env
        state = env.reset()
        state = img_prcss.preprocess(image=state)
        total_reward = 0
        for time in range(NUM_STEPS):
            # 1: get action(t)
            action = agent.act(state, episode)
            # 2: action(t) -> {state(t+1)}
            next_state, reward, done, _ = env.step(action)
            next_state = img_prcss.preprocess(image=next_state)
            if done:
                next_state = None
            # 3: get reward(t)
            total_reward += reward
            # 4: memory stored as (s(t), a(t), r(t), s(t+1))
            experience = (state, action, reward, next_state)
            agent.memory.add(experience)
            # 5: update target Q-network
            agent.update_target_network()
            # 6: replay experiences and update network weights
            agent.replay()
            # 7: save state
            state = next_state
            # ex: judge whether to go to the next episode
            if done:
                total_reward_history.append(total_reward)
                # plt.plot([ep for ep in range(episode + 1)], total_reward_history)
                # plt.pause(0.001)
                # ex: log output
                print('Ep:', episode, ', Tm:', time, 'Rwd:', total_reward)
                # env.render()
                break  # go to next episode
            # ex: display
            img_states = env.render(mode='rgb_array')
            img.set_data(img_states)  # just update the data
def train_agent(env: UnityEnvironment, brain_name: str, agent: Agent,
                n_episodes: int, max_steps: int = 1500) -> []:
    """
    Trains the agent for n episodes
    :param env:
    :param brain_name:
    :param agent:
    :param n_episodes: number of episodes to train
    :param max_steps: max amount of steps
    :return: returns an array containing the score of every episode
    """
    scores: [int] = []
    # store the last 100 scores in a queue to check whether the agent reached the goal
    scores_window = deque(maxlen=100)
    for i_episode in range(1, n_episodes + 1):
        # reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()
        state = env_info.vector_observations[0]
        score = 0
        # the environment ends the episode after n steps, so no manual termination is needed
        for a in range(max_steps):
            action: int = agent.act(state, add_noise=False)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            score += reward
            agent.step((state, action, reward, next_state, done))
            state = next_state
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)         # save most recent score
        # print('\rEpisode {}\tavg Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 10 == 0:
            print(f"Episode {i_episode}: Average Score: {np.mean(scores_window):.2f}")
        if np.mean(scores_window) >= 30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(), 'checkpoint-actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint-critic.pth')
            break
    return scores
class Train:
    def __init__(self):
        self.sample_batch_size = 128
        self.episodes = 10000
        self.state_size = 150
        self.action_size = 3
        self.agent = Agent(self.state_size, self.action_size)
        self.env = Env([0, 1, 2],
                       [j for j in range(1, self.state_size)],
                       self.agent.x_train[0],
                       self.agent.y_train[0])
        self.label_data = 1.0
        self.label_past = 1.0

    def run(self):
        try:
            for index_episode in range(self.episodes):
                # time_to_begin = random.randint(1, self.state_size)
                index_random_data = random.randint(0, 49)
                seq = self.agent.x_train[index_random_data]
                seq_label = self.agent.y_train[index_random_data]
                self.env.reset(seq, seq_label)
                done = False
                index = 1  # time_to_begin
                while not done and index <= 150:
                    state = self.env.get_sequence_state()
                    action = self.agent.act(state, index)
                    next_state, reward, done = self.env.step(action)
                    next_state = self.env.get_sequence_state()
                    next_state = np.reshape(self.env.get_sequence_state(), (1, 150, 1, 1))
                    self.agent.remember(state, action, reward, next_state, done)
                    state = next_state
                    index += 1
                if index_episode % 20 == 0:
                    print("Episode {}".format(index_episode))
                if index_episode % 100 == 0 and index_episode != 0:
                    acc, res, t = self.agent.compute_acc()
                    acc_val, res_val, t_val = self.agent.compute_acc_val()
                    print("acc_train {} ======> average_time_train {} ======> update {}"
                          .format(acc, np.mean(t), self.agent.update_number))
                    print("acc_val {} ======> average_time_val {} ======> update {}"
                          .format(acc_val, np.mean(t_val), self.agent.update_number))
                    if acc > 0.9:
                        self.agent.save_weight()
                self.agent.replay(self.sample_batch_size)
                self.agent.target_train()
        finally:
            self.agent.save_model()
class Environment(threading.Thread):
    stop_signal = False

    def __init__(self, env, render=False, eps_start=EPS_START, eps_end=EPS_STOP, eps_steps=EPS_STEPS):
        threading.Thread.__init__(self)
        self.render = render
        self.env = env
        self.agent = Agent()

    def runEpisode(self):
        s = self.env.reset()
        R = 0
        while True:
            time.sleep(THREAD_DELAY)  # yield
            if self.render:
                self.env.render()
            a = self.agent.act(s)
            s_, r, done = self.env.step(a)
            if done:  # terminal state
                s_ = None
            self.agent.train(s, a, r, s_)
            s = s_
            R += r
            if done or self.stop_signal:
                break
        print("Total R:", R)

    def run(self):
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        self.stop_signal = True
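# A brief usage sketch (an assumption, not from the original file): threaded Environment
# workers like the one above are usually started in parallel, left to train for a fixed
# wall-clock budget, then stopped and joined. make_env() is a hypothetical factory that
# returns an environment matching the 3-tuple step() interface used above; RUN_TIME and
# the worker count are placeholder values.
if __name__ == "__main__":
    RUN_TIME = 30  # seconds
    envs = [Environment(make_env()) for _ in range(4)]
    for e in envs:
        e.start()
    time.sleep(RUN_TIME)
    for e in envs:
        e.stop()
    for e in envs:
        e.join()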
def train_the_agent(env, n_episodes=400, max_t=700):
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    state_size = states.shape[1]
    action_size = brain.vector_action_space_size
    print(state_size, action_size)
    agent = Agent(state_size=state_size, action_size=action_size, random_seed=10, sigma=0.05)
    scores_deque = deque(maxlen=100)
    scores = []
    max_score = -np.Inf
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        agent.reset()
        score = 0
        while True:
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
            i_episode, np.mean(scores_deque), score), end="")
        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
    print(scores)
    env.close()
def watch_agent(env: UnityEnvironment, brain_name: str, agent: Agent) -> None:
    """
    Shows agent simulation
    :param env:
    :param brain_name:
    :param agent:
    :return:
    """
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state
    score = 0                                           # initialize the score
    while True:
        action = agent.act(state)
        env_info = env.step(action)[brain_name]          # send the action to the environment
        next_state = env_info.vector_observations[0]     # get the next state
        reward = env_info.rewards[0]                     # get the reward
        done = env_info.local_done[0]                    # see if the episode has finished
        score += reward                                  # update the score
        state = next_state                               # roll over the state to the next time step
        if done:                                         # exit the loop if the episode finished
            break
    print(f"Agent achieved a score of {score}")
env = Maze_Env(maze, display_width, display_height, MAX_MOVES)

# initialising the agent
directory = 'maze2'
agent = Agent(env, alpha=0, dir=None)

# load the final policy
file = 'maze2/policy_final.pickle'
agent.set_policy(file)

# or load an intermediate Q-table
# with open('maze2/Q_table_1000.pickle', 'rb') as f:
#     Q = pickle.load(f)

# testing the agent
for i in range(10):  # running 10 times
    state = env.reset()
    total_reward = 0
    while True:
        action = agent.act(state, test=True)
        # rendering the environment
        env.render(action)
        time.sleep(0.3)
        state, reward, done = env.step(action)
        total_reward += reward
        if done:
            env.render(action)
            time.sleep(2)
            print(total_reward)
            break
episode = 0
running = True
isTrained = False
while running:
    episode += 1
    state = env.reset()  # reset the environment
    state, _, done, info = env.step(0)  # get game details
    while not done:
        beginState = state
        beginState = getState(getBoard(beginState), info["current_piece"], info["next_piece"])
        # output [column, rotation], e.g. [0,0,0,1,0,0,0,0,0,0], [0,0,1,0]
        netOut = agent.act(beginState)  # act on the last state
        print(netOut)
        curPiece = list(info["statistics"].keys()).index(info["current_piece"][0])
        actionArr = getActions(netOut, curPiece)
        # perform the actions for the rotation and column the network wants the piece in
        for action in actionArr:
            _, _, done, info = env.step(action)
        # step until the next piece appears
        nextPiece = info["next_piece"]
        while info["current_piece"] != nextPiece:
            rawstate, reward, done, info = env.step(5)  # move down
        board = getBoard(rawstate)
        state = getState(board, info["current_piece"], info["next_piece"])
from Agent import Agent
from Brain import Brain
from ConnectFourEnvironment import ConnectFourEnvironment

if __name__ == '__main__':
    env = ConnectFourEnvironment(play_with_rng=False)
    model = './yellow.h5'
    brain = Brain(model)
    agent = Agent(brain=brain)
    while True:
        s = env.reset()
        env.render()
        while not env.is_finished():
            if env.yellows_turn():
                a = agent.act(s)
            else:
                a = input("Choose action (0-6): ")
                a = int(a)
            s_prime, r, done = env.step(a)
            env.render()
            s = s_prime
# Main training loop
totalSimSteps = 0
while totalSimSteps < max_steps:
    # Run episodes until the iteration simulation budget runs out
    iterSimSteps = 0
    while iterSimSteps < N:
        # Reset the simulation
        observation = sim.reset()
        # Simulate this episode until done
        while True:
            # Query the agent for an action given the state observation
            action = agent.act(sess, observation)
            # Simulate using the action.
            # Note: this tutorial does not repeat the same action for two steps,
            # unlike the Run.py script used for the ICML paper results.
            # Repeating the action for multiple steps seems to yield better exploration
            # in most cases, possibly because it reduces high-frequency action noise.
            nextObservation, reward, done, info = sim.step(action[0, :])
            # Save the experience point
            agent.memorize(observation, action, reward, nextObservation, done)
            observation = nextObservation
            # Bookkeeping
            iterSimSteps += 1
env_test = MarketEnv(df, seq_size, foreignScaler=env.scaler)
sess = tf.Session()
agent = Agent(sess, seq_size, n_features, hidden_size=16, a_size=3)
sess.run(tf.global_variables_initializer())
i = 0
reward_history = []
print('Sequence size: ' + str(seq_size))
while True:
    running_reward = 0
    s = env.reset()
    while True:
        a = agent.act(s)
        s_, r, done = env.step(a)
        td_error = agent.critic_learn(np.array([s]), r, np.array([s_]))
        agent.actor_learn(np.array([s]), a, td_error)
        s = s_
        running_reward += r
        if done:
            reward_history.append(running_reward)
            break
    if i % 100 == 0 and i > 0:
        reward_history = reward_history[-100:]
        last_100_episodes_mean = np.mean(reward_history)
class Environment:
    '''
    This is the Environment class, it is responsible for spawning a new thread
    and using an agent to act and train
    '''

    def __init__(self, n_step, gamma, queue_sync, queue_upd, n_processors,
                 g_counter, env, isGlobal=False):  # removed network param
        from Network import Network
        # from Agent import Agent
        # import keras
        self.n_step = n_step
        self.gamma = gamma
        self.isGlobal = isGlobal
        self.queue_sync = queue_sync
        self.queue_upd = queue_upd
        self.n_processors = n_processors
        # global and local counter
        self.g_counter = g_counter
        self.counter = 0
        self.lock = Lock()
        # create a new random seed for each child process
        np.random.seed()
        if isGlobal:
            self.network = Network(self.gamma, self.n_step, self.queue_sync,
                                   self.n_processors, self.isGlobal)
            self.run_sync_agents()
        else:
            self.run(env)

    # Synchronize the agents as long as they are training. This function is mainly
    # used by the global network.
    def run_sync_agents(self):
        while True:
            # start synchronization with the agents every 'x' timesteps
            if self.g_counter.value % 500 == 0:
                while not self.queue_sync.empty():
                    _ = self.queue_sync.get()
                for _ in range(self.n_processors):
                    # send the global network's weights to the agents
                    weights = self.network.get_weights()
                    self.queue_sync.put(self.pickle_weights(weights))
                self.increment_global_counter()
                print('GLOBAL NET: Syncing weights to agents!')
                self.network.model.save('mario_model.h5')
            # update the global network's weights when there is d_w in the queue
            while not self.queue_upd.empty():
                # d_w is the change of the weights
                d_w = self.unpickle_weights(self.queue_upd.get())
                self.network.update_weights(d_w)
                print('GLOBAL NET: Updated weights from an agent!')
                # self.increment_global_counter()

    # initialize the network, environment and the agent and
    # then run the training.
    def run(self, env):
        self.network = Network(self.gamma, self.n_step, self.queue_sync,
                               self.n_processors, self.isGlobal)
        self.env = env
        self.agent = Agent(self.env, self.n_step, self.gamma, self.network)
        self.env.reset()
        self.run_episode()

    def run_episode(self):
        # Reset the env
        s = self.env.reset()
        self.init_env()
        # Reset the agent
        self.agent.init_frames()
        # Act and observe until we're done
        while True:
            action_idx, action = self.agent.act()
            s_prim, reward, done, info = self.env.step(action)
            # process the rewards
            done, reward_processed = self.process_reward(reward, info, done)
            if done:
                s_prim = None
            onehot_action = np.zeros(len(self.agent.actions))
            onehot_action[action_idx] = 1
            has_updated = self.agent.train(s, onehot_action, reward_processed, s_prim)
            # s = s_prim
            if done:  # or self.stop_signal
                self.init_env()
                self.agent.init_frames()
                self.env.change_level(0)
                continue
            else:
                s = s_prim
                self.agent.next_frame(s)
            # increment the local and global counter after each episode
            self.increment_local_counter()
            self.increment_global_counter()
            if has_updated:
                w = self.network.get_weights()
                delta_w = w - self.network.get_global_weights()
                self.queue_upd.put(self.pickle_weights(delta_w))
            if self.counter % 100 == 0:
                if not self.queue_sync.empty():
                    new_w = self.unpickle_weights(self.queue_sync.get())
                    self.network.set_weights(new_w)
                    self.network.set_global_weights(np.array(new_w))
                self.counter = 0  # prevent unnecessarily large numbers

    def init_env(self):
        self.gameInfo = {'max_distance': 0, 'time': 400, 'score': 0, 'staleness': 0}
        # s = self.env.reset()

    def get_state_dim(self):
        return (self.env.observation_space.shape[0],
                self.env.observation_space.shape[1])

    # increments the local counter
    def increment_local_counter(self):
        self.counter += 1

    # increments the global counter
    def increment_global_counter(self):
        with self.g_counter.get_lock():
            self.g_counter.value = self.g_counter.value + 1

    # pickle the weights so they can be passed through a queue
    def pickle_weights(self, w):
        # use protocol=-1 to use the latest protocol
        return pickle.dumps(w, protocol=-1)

    def unpickle_weights(self, w):
        return pickle.loads(np.array(w))

    def process_reward(self, reward, info, done):
        # TODO: FIX REWARD WHEN RESPAWNING
        r = 0
        if 'distance' in info:
            if info['distance'] > self.gameInfo['max_distance']:
                self.gameInfo['max_distance'] = info['distance']
                self.gameInfo['staleness'] = 0
            else:
                self.gameInfo['staleness'] += 1
            r += reward * 0.5
            # Check time
            if info['time'] < self.gameInfo['time']:
                r -= 0.01
            # Check score
            r += (info['score'] - self.gameInfo['score']) * 0.0001  # tune
            if info['life'] == 0 or self.gameInfo['staleness'] > 200:  # tune staleness
                r -= 1
                done = True
            if done and info['distance'] > 0.97 * 3266:  # 3266 is max_distance @ level 1
                r += 1
            self.gameInfo['time'] = info['time']
            self.gameInfo['score'] = info['score']
            return done, r
        return done, r
    mode=modes[mode],
    gamma=0,
    initialMean=-1 * np.ones(actionDim),
    initialSd=0.25 * np.ones(actionDim),
    H=5)  # in this simple problem, we may use a smaller H

tf.global_variables_initializer().run(session=sess)
agent.init(sess)  # must be called after TensorFlow global variables init

# always use the same random seed so that all algorithms start from the same initial action distribution
np.random.seed(0)

# loop over training iterations
for iter in range(nIter):
    print("Iter {}".format(iter))
    # query actions
    actions = agent.act(sess, dummyState)
    # compute rewards
    rewards = -np.sum(np.square(actions), axis=1)
    # make the agent memorize the episodes, each episode with just one action
    for idx in range(actions.shape[0]):
        agent.memorize(dummyState[idx], actions[idx, :], rewards[idx], dummyState[idx], True)
    # update agent (trains the value function predictor and policy networks)
    agent.updateWithMemorized(sess)
    # visualize
    if (iter + 1) in plotIters:
        nCols = len(plotIters)
        plotIdx = plotIters.index(iter + 1)
        pp.subplot(nModes, nCols, plotIdx + 1 + mode * nCols)
        pp.cla()
        obj = self.object_grid[pos[0]][pos[1]][0]
        self.object_grid[pos[0]][pos[1]][0] = 0
        return obj

    def let_object(self, pos, obj):
        if self.object_grid[pos[0]][pos[1]][0] == 0:
            self.object_grid[pos[0]][pos[1]][0] = copy.deepcopy(obj)
            return True
        else:
            return False

    def show(self):
        for i in self.object_grid:
            print([k[0] for k in i])


if __name__ == "__main__":
    g = Grid(50, 200, 200)
    agents = list()
    for i in range(20):
        a = Agent(10, 0.1, 0.3, 1, g)
        g.place_agent(a)
        agents.append(a)
    print(g.object_grid)
    for step in range(200000):
        for a in agents:
            a.act()
        if step % 1000 == 0:
            print("step {}".format(step))
            g.show()
    seed=seed,
    lr=LR,
    memory=memory,
    update_every=UPDATE_EVERY,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    TAU=TAU,
    device=device)

for i_episode in range(1, n_episodes + 1):
    # state = env.reset()
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    score = 0
    for t in range(max_t):
        action = agent.act(state, eps)
        # next_state, reward, done, _ = env.step(action)
        env_info = env.step(action.astype(int))[brain_name]
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]                  # get the reward
        done = env_info.local_done[0]
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
    scores_window.append(score)  # save most recent score
    scores.append(score)         # save most recent score
    eps = max(eps_end, eps_decay * eps)  # decrease epsilon
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
    batch_size=args.replay_size,
    input_dimension=INPUT_SIZE_90X,
    number_of_actions=NUMBER_OF_ACTIONS,
    alpha=args.alpha,
    load_weights=args.load)

EPSILON = args.epsilon
for episode in tqdm(range(args.ep)):
    state = Env.reset_scene()
    episode_rw = 0.0
    done = 0
    Agent.min_rw = 1000
    Agent.last_rw = 0
    for step in range(args.steps):
        action = Agent.act(state[3], EPSILON)
        vell = Agent.action_to_vel(action)
        reward, next_state = Env.do_step(vell, args.model)
        ##############################
        # print(reward)
        Agent.last_rw = reward
        Agent.min_rw = reward if (reward < Agent.min_rw) else Agent.min_rw
        ##############################
        episode_rw += reward
        done = Env.done()
        if done:
            break
        Agent.write_memory(state[3], action, reward, done, next_state[3])
        state = next_state
    if len(Agent.memory) >= int(Agent.BATCH_SIZE):
        evall = Agent.replay(args.gamma, args.epochs)
# scores = ddpg()
# assert False

agent.actor_local.load_state_dict(torch.load('actor4850_1.pth'))
# agent.critic_local.load_state_dict(torch.load('critic1.pth'))
state_list = np.load('init_state.npy')
fuel_list = []
for ep in range(500):
    total_reward = 0
    fuel = 0
    # state = state_list[ep]
    # state = env.reset(state=state, set_state=True)
    state = env.reset()
    for t in range(200):
        action = agent.act(state, add_noise=False)
        print(action, type(action))
        assert False
        fuel += abs(action)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    print(t, total_reward)
    if t == 199:
        fuel_list.append(fuel)
# np.save('init_state.npy', np.array(state_list))
print(len(fuel_list) / 500, np.mean(fuel_list))
env.close()
batch_size = 32
agent = Agent(window_size, batch_size)
data = getStockData("LT.NS")
l = len(data) - 1

episode_count = 200
Buy, Sell, Rewards, Total_Profit = [], [], [], []
Buy_t, Sell_t, Rewards_t, Total_Profit_t = [], [], [], []

for e in tqdm.tqdm(range(episode_count)):
    # print("Episode " + str(e) + "/" + str(episode_count))
    state = getState(data, 0, window_size + 1)
    agent.inventory = []
    total_profit = 0
    done = False
    for t in range(l):
        action = agent.act(state)
        action_prob = agent.actor_local.model.predict(state)
        next_state = getState(data, t + 1, window_size + 1)
        reward = 0
        if action == 1:
            agent.inventory.append(data[t])
            # print("Buy:" + formatPrice(data[t]))
        elif action == 2 and len(agent.inventory) > 0:  # sell
            bought_price = agent.inventory.pop(0)
            reward = max(data[t] - bought_price, 0)
            total_profit += data[t] - bought_price
            # print("sell: " + formatPrice(data[t]) + "| profit: " +
            #       formatPrice(data[t] - bought_price))
        if t == l - 1:
def main():
    logger.configure('./{}_logs'.format(C['env_id']))
    for k, v in C.items():
        logger.record_tabular(k, v)
    logger.dump_tabular()

    train_tracker = [0.0]
    eval_tracker = []
    best_reward = 0

    sess = tf.InteractiveSession()
    train_reward = tf.placeholder(tf.float32, name='train_reward')
    eval_reward = tf.placeholder(tf.float32, name='eval_reward')
    train_env = make_env(C['env_id'], C['noop_max'])
    eval_env = make_env(C['env_id'], C['noop_max'])
    agent = Agent(train_env, C)
    sess.run(tf.global_variables_initializer())
    agent.nn.update_target()

    train_summary = tf.summary.scalar('train_rew', train_reward)
    eval_summary = tf.summary.scalar('eval_reward', eval_reward)
    writer = tf.summary.FileWriter('{}{}_summary'.format('./', C['env_id']), sess.graph)

    train_fs = reset_fs()
    train_s = train_env.reset()
    for it in range(C['iterations']):
        # Training
        train_fs.append(train_s)
        train_a = agent.act(np.transpose(train_fs, (1, 2, 0)))
        ns, train_r, train_d, _ = train_env.step(train_a)
        train_tracker[-1] += train_r
        agent.perceive(train_s, train_a, train_r, float(train_d), it)
        train_s = ns
        if train_d:
            if train_env.env.env.was_real_done:
                if len(train_tracker) % 100 == 0:
                    summary = sess.run(train_summary,
                                       feed_dict={train_reward: np.mean(train_tracker[-100:])})
                    writer.add_summary(summary, it)
                    logger.record_tabular('steps', it)
                    logger.record_tabular('episode', len(train_tracker))
                    logger.record_tabular('epsilon', 100 * agent.epsilon)
                    logger.record_tabular('learning rate', agent.lr)
                    logger.record_tabular('mean 100 episodes', np.mean(train_tracker[-100:]))
                    logger.dump_tabular()
                train_tracker.append(0.0)
            train_fs = reset_fs()
            train_s = train_env.reset()

        # Evaluation
        if it % C['eval_freq'] == 0:
            for _ in range(C['eval_episodes']):
                temp_video = []
                temp_reward = 0
                eval_tracker.append(0.0)
                eval_fs = reset_fs()
                eval_s = eval_env.reset()
                while True:
                    temp_video.append(eval_s)
                    eval_fs.append(eval_s)
                    eval_a = agent.greedy_act(np.transpose(eval_fs, (1, 2, 0)))
                    eval_s, eval_r, eval_d, _ = eval_env.step(eval_a)
                    eval_tracker[-1] += eval_r
                    if eval_env.env.env.was_real_done:
                        break
                    if eval_d:
                        eval_fs = reset_fs()
                        eval_s = eval_env.reset()
                if eval_tracker[-1] > best_reward:
                    # Save best video
                    best_reward = eval_tracker[-1]
                    logger.log('Dump best video reward: {}'.format(best_reward))
                    best_video = temp_video
                    with open('video.pkl', 'wb') as f:
                        pickle.dump(best_video, f, protocol=pickle.HIGHEST_PROTOCOL)
            logger.log('Evaluate mean reward: {:.2f}, max reward: {:.2f}, std: {:.2f}'
                       .format(np.mean(eval_tracker[-C['eval_episodes']:]),
                               np.max(eval_tracker[-C['eval_episodes']:]),
                               np.std(eval_tracker[-C['eval_episodes']:])))
            summary = sess.run(eval_summary,
                               feed_dict={eval_reward: np.mean(eval_tracker[-C['eval_episodes']:])})
            writer.add_summary(summary, it)
            agent.nn.save('./{}_model'.format(C['env_id']))
def main(env_name, mode, learning_rate, ppo_epsilon, ppo_ent_l_w, max_steps,
         iter_steps, render, batch_size, history_buffer_size, n_updates,
         verbose, run_suffix):
    suffix = '%s-%s-batch_size=%d,iter_steps=%d' % (mode, env_name, batch_size, iter_steps)
    if mode == "PPO":
        suffix += '-epsilon=%.3f-ppo_ent_l_w=%.2f' % (ppo_epsilon, ppo_ent_l_w)
    else:
        suffix += '-H=%d' % history_buffer_size
    print('Starting run for the settings %s' % suffix)
    logger.configure(dir='%s-%s' % (suffix, run_suffix))

    # Init TensorFlow
    sess = tf.InteractiveSession()

    # Create environment
    sim = gym.make(env_name)

    # Create the agent
    agent = Agent(
        mode=mode,
        stateDim=sim.observation_space.low.shape[0],
        actionDim=sim.action_space.low.shape[0],
        actionMin=sim.action_space.low,
        actionMax=sim.action_space.high,
        learningRate=learning_rate,
        PPOepsilon=ppo_epsilon,
        PPOentropyLossWeight=ppo_ent_l_w,
        H=history_buffer_size,
        useScaler=True  # makes the agent try to normalize the scale of state observations
    )

    # Finalize initialization
    tf.global_variables_initializer().run(session=sess)
    # print("Initializing agent")
    agent.init(sess)  # should only be called after the global variables initializer above

    # How many simulation steps to use the same action
    # (values larger than 1 seem to help agent exploration in MuJoCo)
    actionRepeat = 2

    # Main training loop
    totalSimSteps = 0
    nextObservation = None
    iteration = 0
    while totalSimSteps < max_steps:
        # Counter for total simulation steps taken in this iteration
        nSimSteps = 0
        # A list to hold the experience trajectories
        trajectories = []
        # Run episodes until the budget runs out, computing the average episode reward
        nEpisodes = 0
        averageEpisodeReward = 0
        # print("Collecting experience...")
        while nSimSteps < iter_steps:
            # Reset the episode
            observation = sim.reset()
            done = False
            episodeReward = 0
            # List to hold the experience of this episode
            trajectory = []
            # Simulate this episode until done
            while not done:
                # Query the agent for an action
                action = agent.act(sess, observation)
                # Simulate using the action, repeating the same action for actionRepeat steps.
                # Also, compute the total reward received.
                reward = 0
                for _ in range(actionRepeat):
                    nextObservation, stepReward, done, info = sim.step(action[0, :])
                    if render and nEpisodes < 5:  # only render the first few episodes of each iteration
                        sim.render()
                    nSimSteps += 1
                    totalSimSteps += 1
                    reward += stepReward
                    episodeReward += stepReward
                    if done:
                        break
                # Save the experience point
                e = Experience(observation, action, reward, nextObservation, done)
                trajectory.append(e)
                observation = nextObservation
            # Episode done, bookkeeping
            trajectories.append(trajectory)
            averageEpisodeReward += episodeReward
            nEpisodes += 1
        # All episodes of this iteration done; print results and update the agent
        averageEpisodeReward /= nEpisodes
        iteration += 1
        print('================ Iteration %d ================' % iteration)
        logger.record_tabular("Total iterations", iteration)
        logger.record_tabular("Total timesteps", totalSimSteps)
        logger.record_tabular("Episode reward mean", averageEpisodeReward)
        logger.record_tabular("Average policy std", agent.getAverageActionStdev())
        logger.dump_tabular()
        agent.update(sess, trajectories, batchSize=batch_size, nBatches=n_updates, verbose=verbose)
    sess.close()
    print('Finished run for the settings %s' % suffix)
class Learning(object):
    """ docstring for Learning """

    def __init__(self, number_of_actions, input_dimension, load, batch_size=25,
                 episodes=10, max_steps=100, epsilon=0, gamma=0.0, alpha=0.0,
                 epsilon_decay=1.0, episodes_decay=30, epochs=1):
        self.episodes = episodes
        self.max_steps = max_steps
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon_decay = epsilon_decay
        self.episodes_decay = episodes_decay
        self.epochs = epochs
        self.agent = Agent(number_of_actions, input_dimension, batch_size,
                           self.alpha, load, 'model_weights.h5')
        self.analyzer = Results()

    """ append a new action to the memory, as a tuple, for later batch replay """

    def write_memory(self, memory, state_list, action, reward, next_state_list, is_done):
        memory.append((state_list, action, reward, next_state_list, is_done))

    """ replays the memory in a batch, learning from past actions to maximize reward """

    def replay(self):
        mini_batch = random.sample(self.agent.memory, int(self.agent.batch_size))
        fit = None
        for state_list, action, reward, next_state_list, done in mini_batch:
            target = reward
            if not done:
                target = (reward + self.gamma * (np.amax(self.agent.model.predict([
                    next_state_list[0][1].reshape(1, self.agent.input_dimension,
                                                  self.agent.input_dimension, 1),
                    next_state_list[1][1].reshape(1, self.agent.input_dimension,
                                                  self.agent.input_dimension, 1),
                    next_state_list[2][1].reshape(1, self.agent.input_dimension,
                                                  self.agent.input_dimension, 1)
                ])[0])))
            target_f = self.agent.model.predict([
                state_list[0][1].reshape(1, self.agent.input_dimension,
                                         self.agent.input_dimension, 1),
                state_list[1][1].reshape(1, self.agent.input_dimension,
                                         self.agent.input_dimension, 1),
                state_list[2][1].reshape(1, self.agent.input_dimension,
                                         self.agent.input_dimension, 1)
            ])
            target_f[0][action] = target
            fit = self.agent.model.fit([
                state_list[0][1].reshape(1, self.agent.input_dimension,
                                         self.agent.input_dimension, 1),
                state_list[1][1].reshape(1, self.agent.input_dimension,
                                         self.agent.input_dimension, 1),
                state_list[2][1].reshape(1, self.agent.input_dimension,
                                         self.agent.input_dimension, 1)
            ], target_f, self.epochs, verbose=0)
        if fit is None:
            return 0
        else:
            return fit

    """ main loop for the learning itself """

    def run(self):
        print("starting everything")
        for episode in range(self.episodes):
            self.agent.controller.start_sim()
            sleep(0.5)
            now = datetime.now()
            # print(str(now) + " starting ep " + str(episode + 1) + "\n")
            init = time.time()
            state_list = []
            self.agent.instant_reward = 0.0
            state_list.append(self.agent.vision.get_image(1))  # state = (resolution, grayscale, colored RGB)
            state_list.append(self.agent.vision.get_image(2))  # state = (resolution, grayscale, colored RGB)
            state_list.append(self.agent.vision.get_image(3))  # state = (resolution, grayscale, colored RGB)
            steps_done = None
            for step in range(self.max_steps):
                steps_done = step
                action_taken = self.agent.act(state_list[0], state_list[1],
                                              state_list[2], self.epsilon)
                # the image is extracted inside do_step
                next_state1, next_state2, next_state3, reward, done = self.agent.do_step(action_taken)
                self.agent.instant_reward += reward
                self.write_memory(self.agent.memory, state_list, action_taken, reward,
                                  [next_state1, next_state2, next_state3], done)
                state_list[0] = next_state1
                state_list[1] = next_state2
                state_list[2] = next_state3
                if done:
                    break
            self.analyzer.steps_list.append(step + 1)
            end = time.time()
            self.agent.controller.stop_sim()
            sleep(0.5)
            evall = None
            if len(self.agent.memory) > int(self.agent.batch_size):
                rep_init = time.time()
                evall = self.replay()
                rep_end = time.time()
                now = datetime.now()
                self.analyzer.mse_values.append(evall.history['mean_squared_error'])
                print(str(now) + " mse value: " +
                      str(round(evall.history['mean_squared_error'][0], 2)) +
                      " loss: " + str(round(evall.history['loss'][0], 4)) +
                      " replay " + str(round((rep_end - rep_init) / 60.0, 2)) + " minutes")
            self.analyzer.rewards_list.append(self.agent.instant_reward)
            self.agent.cummulative_reward += self.agent.instant_reward
            if episode > 0 and (episode % self.episodes_decay == 0):
                self.epsilon *= self.epsilon_decay
                now = datetime.now()
                # print(str(now) + " epsilon decay")
            if episode > 0 and episode % 10 == 0:
                now = datetime.now()
                # print(str(now) + " weights backup...")
                self.agent.model.save_weights('model_weights.h5')
            now = datetime.now()
            print(str(now) + " duration " + str(round((end - init) / 60.0, 2)) +
                  " min // ep " + str(episode + 1) + "/" + str(self.episodes) +
                  " // steps " + str(step) + " // reward " +
                  str(round(self.agent.instant_reward, 2)))
            self.agent.step_lost_counter = 0
        self.agent.controller.stop_sim()
        self.agent.controller.close_connection()
        cv.destroyAllWindows()
        now = datetime.now()
        self.agent.model.save_weights('model_weights.h5')
        now = datetime.now()
        os.chdir("logs")
        dirr = str(now)
        os.mkdir(dirr)
        file = open(os.path.join(os.getcwd(), dirr, "data.txt"), 'w')
        file.write(str(self.analyzer.rewards_list))
        file.write(str(self.analyzer.steps_list))
        file.write(str(self.analyzer.mse_values))
        file.close()
        self.analyzer.plot_media_n(self.analyzer.rewards_list, self.analyzer.reward_fig,
                                   dirr, 10, "REWARDxEP(media)",
                                   "Reward Media x 10 Episodio", "Reward Media")
        self.analyzer.plot_raw(self.analyzer.rewards_list, self.analyzer.reward_fig,
                               dirr, "REWARDxEP", "Reward x Episodio", "Reward")
        self.analyzer.plot_raw(self.analyzer.steps_list, self.analyzer.steps_fig,
                               dirr, "STEPS", "Steps Gastos x Episodio", "Steps")
        self.analyzer.plot_raw(self.analyzer.mse_values, self.analyzer.mse_fig,
                               dirr, "MSE", "Mean Squared Error x Episodio",
                               "Valor MSE", normalize=True)
class Simulator(object):
    def __init__(self, env, display=True, log_metrics=False, filename="sim"):
        self.env = env
        self.agent = Agent(env.observation_space.shape[0], env.action_space.n)
        self.testing = False
        self.log_metrics = log_metrics
        self.display = display
        if self.log_metrics:
            self.log_filename = os.path.join("logs", filename + "_cartpole.csv")
            self.log_fields = ['episode', 'testing', 'net_reward', 'epsilon', 'gamma', 'alpha']
            self.log_file = open(self.log_filename, 'w', newline='')
            self.log_writer = csv.DictWriter(self.log_file, fieldnames=self.log_fields)
            self.log_writer.writeheader()

    def log_trial(self, episode, net_reward):
        if self.log_metrics:
            self.log_writer.writerow({
                'episode': episode,
                'testing': self.testing,
                'net_reward': net_reward,
                'alpha': self.agent.learn_rate,
                'epsilon': self.agent.epsilon,
                'gamma': self.agent.gamma
            })

    def run(self, episodes=5000, n_test=0):
        state = self.env.reset()
        for e in range(episodes):
            state = self.env.reset()
            state = np.reshape(state, [1, 4])
            net_reward = 0.0
            if (e % 100) == 0:
                display = True
            else:
                display = False
            for time_t in range(5000):
                if display:
                    self.env.render()
                action = self.agent.act(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, 4])
                self.agent.remember(state, action, reward, next_state, done)
                state = next_state
                net_reward += reward
                if done:
                    print("episode: {}/{}, e = {}, score = {}".format(
                        e, episodes, self.agent.epsilon, time_t))
                    break
            if e > 32 and self.agent.epsilon > 0.0:
                self.agent.learn(batch_size=32)
            elif self.agent.epsilon == 0.0 and not self.testing:
                self.testing = True
            self.log_trial(e, net_reward)
        if self.log_metrics:
            self.log_file.close()
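# A short usage sketch (assumed, not part of the original class): wrap a classic-API gym
# CartPole environment in the Simulator and run it with metric logging. It assumes the
# pre-0.26 gym step/reset interface used above and that a "logs" directory exists,
# since the constructor opens a CSV file there.
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v1")
    sim = Simulator(env, display=False, log_metrics=True, filename="dqn_run")
    sim.run(episodes=1000)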
state = preprocess(game.get_state().screen_buffer)
for epoch in range(EPOCHS):
    print("\n\nEpoch %d\n-------" % (epoch + 1))
    train_episodes_finished = 0
    train_scores = []

    print("Training...")
    game.new_episode()
    episode_buffer = []
    agent.reset_cell_state()
    state = preprocess(game.get_state().screen_buffer)

    for learning_step in trange(STEPS_PER_EPOCH, leave=False):
        action = agent.act(state)
        reward = game.make_action(actions[action], FRAME_REPEAT)
        done = game.is_episode_finished()
        if not done:
            state_new = preprocess(game.get_state().screen_buffer)
        else:
            state_new = None
        agent.add_transition(state, action, reward, state_new, done)
        state = state_new
        if learning_step % UPDATE_FREQUENCY == 0:
            agent.learn_from_memory()
            updateTarget(targetOps, SESSION)
        if done:
from World import World
from Agent import Agent

world = World()
agent = Agent(world)

for e in range(5000):
    done = False
    result = 0
    while not done:
        done, result = agent.act()
    agent.update_policy(result)
    agent.reset()
    # if result > 0:
    #     print("Agent has won :)")
    # else:
    #     print("Agent has lost :(")

agent.display_policy()
SESSION.run(init)

##########################################

if not SKIP_LEARNING:
    time_start = time()
    print("\nFilling out replay memory")
    updateTarget(targetOps, SESSION)
    agent.reset_cell_state()
    state = game.get_state()
    for _ in range(RANDOM_WANDER_STEPS):
        if not LOAD_MODEL:
            action = agent.random_action()
        else:
            action = agent.act(game.get_last_action(), state)
        img_state, reward, done = game.make_action(action)
        if not done:
            state_new = img_state
        else:
            state_new = None
        agent.add_transition(state, action, reward, state_new, done)
        state = state_new
        if done:
            game.reset()
            agent.reset_cell_state()
            state = game.get_state()

    max_avgR = -10000.0
MAX_EPISODES = 200


def plot_results():
    plt.subplot(2, 1, 1)
    plt.plot(np.cumsum(np.array(agent.rewards)))
    plt.ylabel('Cumulative Rewards')
    plt.xlabel('Steps')
    plt.subplot(2, 1, 2)
    plt.plot(np.array(agent.number_of_steps_til_reward))
    plt.ylabel('# Steps to Reward')
    plt.xlabel('Episodes')
    plt.show()


# Iterate through a number of episodes (set by MAX_EPISODES) and plot the results.
if __name__ == '__main__':
    environment = GridWorldModel()
    agent = Agent(environment)
    for _ in itertools.repeat(None, MAX_EPISODES):
        environment.reset()
        agent.state = environment.get_start_state()
        while True:
            agent.act()
            if environment.is_terminal_state():
                break
    plot_results()
hyp = np.log(np.array([1, 1, 10]))
cov = NormalARD()
gp = GaussianProcess(lik, hyp, cov)
gp2 = GaussianProcess(lik, hyp, cov)
sig = np.ones((3,)) * 0.001
sig2 = np.ones((3,)) * 0.1
start_z = np.array([[0., 0., 0.]])
agent = Agent(gp, reward, sig, start_z)
agent2 = Agent(gp2, reward, sig2, start_z)

fig = plt.figure(figsize=(20, 7), dpi=300)
zlim = (-10, 10, -10, 10)

for i in xrange(0, 1000):
    agent.observe()
    agent.decide()
    agent.act()
    agent2.observe()
    agent2.decide()
    agent2.act()
    t = agent.gp.Z[-1].flatten()[-1]
    a = [0] * 4
    a[0] = agent.gp.Z[-1].flatten()[0]
    a[1] = agent.gp.Z[-1].flatten()[1]
    a[2] = agent.gp.Z[-1].flatten()[0]
    a[3] = agent.gp.Z[-1].flatten()[1]
    extent = np.max(np.abs(a))
    lim = extent + 3 if extent > 10 else 10
    zlim = (-lim, lim, -lim, lim)
    fig.clf()
    ax1 = fig.add_subplot(1, 3, 1)
print("Generating " + str(number_of_samples) + " samples for training") for sample in tqdm(range(number_of_samples)): initial_pos = [np.random.randint(0, 360) for i in range(6)] Agent.controller.set_positions(Agent.handlers, initial_pos) sleep(0.08) initial_states = [] initial_states.append(Agent.vision.get_image(sensor_number=1)) initial_states.append(Agent.vision.get_image(sensor_number=2)) initial_states.append(Agent.vision.get_image(sensor_number=3)) action = Agent.act(initial_states[0], initial_states[1], initial_states[2], epsilon=11) new_state1, new_state2, new_state3, reward, done = Agent.do_step(action) new_states = [new_state1, new_state2, new_state3] writer.writerow([action, reward, done]) for image in range(len(new_states)): cv.imwrite( "/media/leonardo/Seagate Expansion Driver/DTASET_IC/init/" + str(sample + 1) + "_" + str(image + 1) + ".png", initial_states[image][1]) cv.imwrite( "/media/leonardo/Seagate Expansion Driver/DTASET_IC/end/" + str(sample + 1) + "_" + str(image + 1) + ".png", new_states[image][1])