def local_test(index, opt, global_model):
    """Continuously play greedy episodes with a local copy of the global model.

    The function never returns: whenever an episode ends it reloads the
    weights from ``global_model``, resets the LSTM state and starts again.

    Args:
        index: Integer added to 123 to seed torch's RNG.
        opt: Options object. The attributes ``world``, ``stage``,
            ``action_type``, ``max_actions`` and ``num_global_steps`` are read.
            (The previous version ignored this parameter and read a global
            ``args`` instead, which fails when no such global exists in the
            process running this function.)
        global_model: Model whose ``state_dict()`` is copied into the local
            ``ActorCritic`` at the start of every episode.
    """
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    # Sliding window of the most recent actions, used to detect a stuck agent.
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            # New episode: pick up the latest shared weights.
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)
        # Force an episode end when the step budget is exceeded or when the
        # whole window is filled with one and the same action.
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
def test(game_size, norm):
    """Benchmark uniformly random play on the ``game2048-v0`` environment.

    Every time an episode finishes, the final board is rendered and the step
    count, accumulated reward, ``info`` and throughput are printed; the
    counters and the timer are then restarted and the environment is reset.

    NOTE(review): the inner loop has no exit condition, so this function
    only stops when the process is interrupted.

    Args:
        game_size: Forwarded to ``gym.make`` as ``size``.
        norm: Forwarded to ``gym.make`` as ``norm``.
    """
    # start_pprof_server(port=8081)
    env = gym.make('game2048-v0', size=game_size, norm=norm)
    obs = env.reset()
    total_reward = 0
    n_steps = 0
    for _ in range(1):
        started_ms = time.time() * 1000
        while True:
            # Uncomment to render every single step:
            # env.render()
            obs, reward, done, info = env.step(env.action_space.sample())
            total_reward += reward
            n_steps += 1
            if not done:
                continue

            # Episode finished: report, pause briefly, then start over.
            elapsed_ms = time.time() * 1000 - started_ms
            env.render()
            print(f'obs: {obs}')
            summary = f'play games steps: {n_steps} reward: {total_reward} info: {info}'
            timing = f' use {elapsed_ms:.3f}ms speed: {(n_steps * 1000 / elapsed_ms):.3f}ops/s'
            print(summary + timing)
            time.sleep(0.5)
            n_steps = 0
            total_reward = 0
            started_ms = time.time() * 1000
            env.reset()
def run_env(env, n_runs=100):
    """Play and display ``n_runs`` games driven by the environment's first agent.

    Each step asks ``env.agents[0]`` for a state, a greedy action (epsilon
    0.0) and the matching move, applies it with ``env.step`` and calls
    ``env.show()``. Prints "Won!" or "Lost" when a game ends; a game is also
    declared lost once ``env.steps`` exceeds 20.

    :param env: environment to be run
    :param n_runs: how many episodes should be run
    :return: None; the games are only displayed through ``env.show()``
    """
    for _ in range(n_runs):
        env.reset()
        env.show()
        done = False
        while not done:
            # Convert the board into the state input of the learning agent.
            state = env.agents[0].board_to_state()
            chosen = env.agents[0].select_action(state, 0.00)
            # Unwrap the single action index from the 2-D result.
            move = env.agents[0].action_to_move(chosen[0, 0])
            _, done, won = env.step(move)
            env.show()

            if done:
                if won:
                    print("Won!")
                else:
                    print("Lost")
                    break
            elif env.steps > 20:
                print("Lost")
                break
def monte_carlo_control():
    """Learn an action-value table with epsilon-greedy Monte-Carlo control.

    Runs 1e8 episodes (Python 2 ``xrange``). For every visited
    (dealer, player) pair epsilon is ``n_zero / (n_zero + visits)`` and the
    step size for a (dealer, player, action) triple is ``1 / visits``.
    After training, the derived value function is plotted.

    Returns:
        The ``defaultdict`` mapping (dealer, player, action) to its value.
    """
    q_values = defaultdict(float)
    state_visits = defaultdict(int)
    pair_visits = defaultdict(int)
    n_zero = 1E5

    episodes = xrange(int(1E8))
    pbar = ProgressBar(maxval=len(episodes)).start()
    for episode in episodes:
        state = State()
        while not state.terminal:
            # Snapshot the keys before `step` mutates the state.
            player = state.player
            dealer = state.dealer
            state_key = (dealer, player)

            epsilon = float(n_zero) / (n_zero + state_visits[state_key])
            action = epsilon_greedy_policy(q_values, state, epsilon)
            pair_key = (dealer, player, action)

            state_visits[state_key] += 1
            pair_visits[pair_key] += 1

            reward = step(state, action)

            # Incremental update towards the observed reward.
            alpha = 1.0 / pair_visits[pair_key]
            current_estimate = q_values[pair_key]
            q_values[pair_key] += alpha * (reward - current_estimate)
        pbar.update(episode)
    pbar.finish()

    value_function = action_value_to_value_function(q_values)
    plot_value_function(value_function, "Optimal Value Function: Question 2")
    return q_values
def train_sl(size, lr, rd):
    """Train a ``model.SarsaLambda`` agent on ``game2048-v0``.

    Runs ``10000 * size ** 2`` episodes. Boards are turned into string keys
    (flattened list repr) before being handed to the agent, and the terminal
    successor is replaced by the literal key ``'terminal'``. After every
    episode the board is rendered and a summary line is printed; the size of
    the agent's Q-table is printed at the end.

    Args:
        size: Board edge length, forwarded to ``gym.make``.
        lr: Unused by this function.
        rd: Unused by this function.
    """
    env = gym.make('game2048-v0', size=size)
    agent = model.SarsaLambda(env.action_space)
    n_cells = size ** 2
    trials = 1 * 10000 * n_cells

    def encode(board):
        # Flatten the board and use its list repr as the table key.
        return str(board.reshape(n_cells).tolist())

    for trial in range(trials):
        obs = encode(env.reset())
        action = agent.choose_action(obs)
        stepno = 0
        rewards = 0
        done = False
        while not done:
            stepno += 1
            next_board, reward, done, _ = env.step(action)
            obs_ = encode(next_board)
            # The follow-up action is chosen from the real successor state,
            # before that state is relabelled as terminal.
            action_ = agent.choose_action(obs_)
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_, action_)
            obs, action = obs_, action_
            rewards += reward

        env.render()
        print(f'Completed in {trial} use {stepno} steps highest: \
            {env.highest()} rewards: {rewards}')

    print(len(agent.q_table))
def behaviour(self, candidate):
    """Play one episode on the module-level ``env`` and return the last observation.

    Actions come from ``get_action(ns, obs)``. ``candidate`` is not used
    by this method.
    """
    observation = env.reset()
    finished = False
    while not finished:
        chosen = get_action(ns, observation)
        observation, _reward, finished, _info = env.step(chosen)
    return observation
def self_play(env, agent, return_trajectory=False, verbose=False):
    """Let ``agent`` play one game against itself on ``env``.

    Args:
        env: Environment whose observations are 5-tuples with the board at
            index 0 and the current player at index 2.
        agent: Object with ``decide(observation, return_prob=True)`` that
            returns ``(action, prob)``; ``action[0]`` is passed to ``env.step``.
        return_trajectory: When true, return a DataFrame with one row per
            move (``player``, flattened ``board``, ``prob``) plus a
            ``winner`` column holding the final winner.
        verbose: When true, print the board and log each move.

    Returns:
        The winner reported by the last ``env.step`` call, or the trajectory
        DataFrame when ``return_trajectory`` is set.
    """
    records = [] if return_trajectory else None
    observation = env.reset()
    step = 0
    while True:
        board, _, player, _, _ = observation
        action, prob = agent.decide(observation, return_prob=True)
        if verbose:
            print(strfboard(observation))
            logging.info('The {} step:palyer {}, action {}'.format(step, player, action))

        observation, winner, done, _ = env.step(action[0])
        if return_trajectory:
            # Store the board as it was before the move, flattened to 1-D.
            rows, cols = board.shape
            records.append((player, np.reshape(board, rows * cols), prob))
        if done:
            break
        step += 1

    if verbose:
        print(strfboard(observation))
        logging.info('Winner {}'.format(winner))

    if not return_trajectory:
        return winner
    df_trajectory = pd.DataFrame(records, columns=['player', 'board', 'prob'])
    df_trajectory['winner'] = winner
    return df_trajectory
def some_random_games_first():
    """Play ten games on the module-level ``env`` with actions from ``env.action_space()``.

    Each game lasts at most ``goal_steps`` steps and stops early once the
    environment reports ``done``. Nothing is returned or recorded.
    """
    n_games = 10
    for _game in range(n_games):
        env.reset()
        for _step in range(goal_steps):
            _observation, _reward, finished, _info = env.step(env.action_space())
            if finished:
                break
def respond(self, env):
    """Play the first action that the environment's mask allows.

    Walks the module-level ``action_space`` in order and steps the
    environment with the first entry whose mask slot is truthy.

    Args:
        env: Environment providing ``get_mask()`` and ``step(action)``.

    Returns:
        Whatever ``env.step`` returns for the chosen action.

    Raises:
        Exception: If the mask enables none of the actions.
    """
    mask = env.get_mask()
    for i, action in enumerate(action_space):
        if mask[i]:
            # print('taking action, ', action)
            return env.step(action)
    # The unreachable `return None, None` that used to follow was removed.
    raise Exception("should not be here")
def respond(self, env):
    """Play a uniformly random action among those the mask allows.

    The indices of the non-zero mask entries are collected, one is drawn
    with ``np.random.choice`` and the matching entry of the module-level
    ``action_space`` is passed to ``env.step``.

    Returns:
        Whatever ``env.step`` returns for the chosen action.
    """
    mask = env.get_mask()
    all_indices = np.arange(len(action_space))
    candidates = np.take(all_indices, mask.nonzero()).reshape(-1)
    chosen = np.random.choice(candidates)
    # print('taking action, ', action_space[chosen])
    return env.step(action_space[chosen])
def initial_population():
    """Collect training samples from random games that scored well enough.

    Plays ``initial_games`` games of ``goal_steps`` steps each on the
    module-level ``env`` using ``env.action_space()``. For every game whose
    total reward reaches ``score_requirement``, each (previous observation,
    action) pair is stored with the action one-hot encoded over the nine
    actions 1..9. The samples are saved to ``saved2.npy`` and summary
    statistics of the accepted scores are printed.

    Returns:
        List of ``[observation, one_hot_action]`` pairs.

    Raises:
        ValueError: If a recorded action is not one of 1..9. The previous
            ``elif`` chain silently reused the label of the preceding sample
            (or hit a ``NameError``) in that case.
    """
    num_actions = 9
    training_data = []
    scores = []
    accepted_scores = []
    for game_index in range(initial_games):
        env.reset()
        if game_index % 100 == 0:
            print(game_index)
        score = 0
        game_memory = []
        prev_observation = [0] * num_actions
        # A separate loop variable: the old code reused `_` for both loops.
        for _step in range(goal_steps):
            action = env.action_space()
            observation, reward, done, info = env.step(action)
            if len(prev_observation) > 0:
                game_memory.append([prev_observation, action])
            prev_observation = observation
            score += reward
            # Games are intentionally not cut short on `done`.

        if score >= score_requirement:
            accepted_scores.append(score)
            for observation, action in game_memory:
                # Action k (1-based) lights up slot k - 1.
                output = [1 if action == slot else 0
                          for slot in range(1, num_actions + 1)]
                if 1 not in output:
                    raise ValueError(
                        'action %r is outside the expected range 1..%d'
                        % (action, num_actions))
                training_data.append([observation, output])
        scores.append(score)

    training_data_save = np.array(training_data)
    np.save('saved2.npy', training_data_save)

    if accepted_scores:
        print('Average accepted score:', mean(accepted_scores))
        print('Median accepted score: ', median(accepted_scores))
    else:
        # Avoid computing statistics over an empty list.
        print('No game reached the score requirement.')
    print(Counter(accepted_scores))
    return training_data
def main(args):
    """Run CEM planning episodes and report (optionally save) the results.

    Builds the environment wrapped in ``ActionRepeat``, starts a pool of 32
    workers initialised with that environment, and for ``args.episodes``
    episodes plans every action with the selected CEM variant from the
    current simulator state. Per-episode scores are logged and printed; the
    mean and standard deviation are printed at the end. With ``args.save``
    the observations and actions are written under
    ``args.savedir/args.env``.

    Args:
        args: Namespace providing ``env``, ``algo``, ``repeat``, ``horizon``,
            ``proposals``, ``topk``, ``iterations``, ``sigma``, ``episodes``,
            ``logdir``, ``save`` and ``savedir``.

    Raises:
        ValueError: If ``args.algo`` is neither ``'gaussian'`` nor
            ``'nonparametric'`` (previously this surfaced later as a
            ``NameError`` on ``planner``).
    """
    if args.algo not in ('gaussian', 'nonparametric'):
        raise ValueError(
            f"unknown algo {args.algo!r}; expected 'gaussian' or 'nonparametric'")

    param_str = (
        f'{args.env}_{args.algo}_rep={args.repeat}_hor={args.horizon}_prop={args.proposals}'
        f'_iter={args.iterations}_sigma={args.sigma}')

    env = gym.make(args.env)
    env = ActionRepeat(env, args.repeat)

    # Pool of workers, each has its own copy of global environment variable
    pool = Pool(32, initializer, [env])
    try:
        planner_kwargs = dict(
            pool=pool,
            action_space=env.action_space,
            horizon=args.horizon,
            proposals=args.proposals,
            topk=args.topk,
            iterations=args.iterations,
        )
        if args.algo == 'gaussian':
            planner = partial(gaussian_cem, **planner_kwargs)
        else:
            planner = partial(nonparametric_cem, sigma=args.sigma, **planner_kwargs)

        scores = np.zeros(args.episodes)
        observations = np.zeros((args.episodes, env.num_steps + 1) + env.observation_space.shape)
        actions = np.zeros((args.episodes, env.num_steps) + env.action_space.shape)

        for i in range(args.episodes):
            logger = Logger(os.path.join(args.logdir, f'{param_str}_run{i}'))
            observations[i, 0] = env.reset()
            for t in range(env.num_steps):
                state = env.sim.get_state()
                actions[i, t] = planner(state)
                observations[i, t + 1], reward, _, _ = env.step(actions[i, t])
                scores[i] += reward
                logger.log_scalar('reward', scores[i], t)
            print(scores[i])
    finally:
        # Release the worker processes even when planning fails.
        pool.close()
        pool.join()

    print(param_str)
    print('Mean score:         ', scores.mean())
    print('Standard deviation: ', scores.std())

    if args.save:
        path = os.path.join(args.savedir, args.env)
        # exist_ok avoids the check-then-create race of the old code.
        os.makedirs(path, exist_ok=True)
        np.save(os.path.join(path, 'obs'), observations)
        np.save(os.path.join(path, 'act'), actions)
def test(env):
    """Take one random step in ``env``, render it and print the outcome.

    Prints the sampled action, the reward, the done flag, the info value
    and the size of the action space (``env.action_space.n``).
    """
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    env.render()
    report = (
        ('action:', action),
        ('reward:', reward),
        ('done:', done),
        ('info:', info),
    )
    for label, value in report:
        print(label, value)
    print('nb_actions', env.action_space.n)
def evaluate(self):
    """Play one episode on the module-level ``env`` and return the summed reward.

    Actions come from ``get_action(ns, obs)``.
    """
    observation = env.reset()
    episode_return = 0
    finished = False
    while not finished:
        chosen = get_action(ns, observation)
        observation, reward, finished, _info = env.step(chosen)
        episode_return += reward
    return episode_return
def dqn(n_runs, n_episodes, max_t=300, eps_start=0.05, eps_end=1e-4, eps_decay=0.996):
    """Train the module-level ``agent`` on ``env`` and count steps per episode.

    For each of ``n_runs`` runs, ``n_episodes`` episodes of at most
    ``max_t`` steps are played. ``steps[i]`` accumulates, over all runs, the
    index of the step at which episode ``i`` finished, or ``max_t`` when it
    did not finish. Epsilon is decayed after every step and is shared across
    episodes and runs.

    Args:
        n_runs: Number of training runs.
        n_episodes: Episodes per run.
        max_t: Step limit per episode.
        eps_start: Initial epsilon.
        eps_end: Lower bound for epsilon.
        eps_decay: Multiplicative decay applied after every step.

    Returns:
        Tuple ``(scores, steps, agent.probMap)``; ``scores`` is always an
        empty list because score tracking is disabled.
    """
    steps = np.zeros(n_episodes)
    scores = []
    eps = eps_start

    # Map derived from the environment; it is only consumed by the disabled
    # `agent.probMap = probMap` assignment but the calls are kept as-is.
    map_vec = env.init_map_vec()
    probMap = np.full((8, 8), 0)
    for num in map_vec:
        loc = util.num_to_loc(num, 8)
        probMap[loc[0]][loc[1]] = 1
    print(agent.probMap)

    for i_run in range(0, n_runs):  # train
        print("run: ", i_run)
        for i_episode in range(0, n_episodes):
            if i_episode % 500 == 0:
                print(i_episode)
            state = env.reset()
            for t in range(max_t):
                action = agent.act(state, eps)
                next_state, reward, done = env.step(action)
                # Trailing flags: do not update the map.
                agent.step(state, action, reward, next_state, done, False, True)
                state = next_state
                eps = max(eps * eps_decay, eps_end)
                if done:
                    steps[i_episode] = steps[i_episode] + t
                    break
            else:
                # Episode hit the step limit. for/else replaces a `success`
                # flag that was undefined (or stale) when max_t == 0.
                steps[i_episode] = steps[i_episode] + max_t
    return scores, steps, agent.probMap
def rollout(sentence_generator, vae, sentences, inst_to_one_hot, dict_goals, valid_goals, env, policy, env_params, inits, goals, self_eval, true_eval, biased_init=False, \
            animated=False):
    """Try to realise every language expression within at most five attempts.

    For each expression returned by ``get_list_of_expressions()`` (shuffled),
    the environment is reset and up to five goals are sampled with
    ``sample_vae_logic``. After running the policy for
    ``env_params['max_timesteps']`` steps, the sentences describing the
    change from the initial to the final configuration are produced by
    ``sentence_generator`` and checked against the expression.

    Returns:
        A copy of the score list: per expression, the 1-based attempt number
        that succeeded, or 0 when all five attempts failed.

    NOTE(review): ``goals[i]`` uses a name ``i`` that is not defined in this
    function; unless a module-level ``i`` exists this raises ``NameError``.
    NOTE(review): the nesting below was reconstructed from a flattened
    source line -- confirm it against the original file.
    """
    expressions = get_list_of_expressions()
    scores = []
    np.random.shuffle(expressions)
    for expression in expressions:
        print('\nAttempting expression: ', expression)
        observation = env.unwrapped.reset_goal(np.array(goals[i]), biased_init=biased_init)
        # Configuration at the start of the episode, used as the "before" state.
        config_inital = observation['achieved_goal'].copy()
        trial_counter = 0
        success = False
        while trial_counter < 5:
            trial_counter += 1
            goals_str = sample_vae_logic(vae, inst_to_one_hot, observation['achieved_goal'], expression, valid_goals)
            # An attempt without any sampled goal still consumes one trial.
            if len(goals_str) > 0:
                goal = dict_goals[np.random.choice(list(goals_str))]
                # goal = dict_goals[np.random.choice(list(goals_str))]
                env.unwrapped.target_goal = goal.copy()
                observation = env.unwrapped._get_obs()
                obs = observation['observation']
                ag = observation['achieved_goal']
                g = observation['desired_goal']

                # start to collect samples
                for t in range(env_params['max_timesteps']):
                    # run policy
                    no_noise = self_eval or true_eval
                    action = policy.act(obs.copy(), ag.copy(), g.copy(), no_noise)
                    # feed the actions into the environment
                    if animated:
                        env.render()
                    observation_new, _, _, info = env.step(action)
                    obs = observation_new['observation']
                    ag = observation_new['achieved_goal']
                config_final = ag.copy()
                true_sentences = sentence_generator(config_inital, config_final)
                if check_sentence(true_sentences, expression):
                    scores.append(trial_counter)
                    success = True
                    print('Success!')
                    break
                else:
                    print('\tFailed. Trying again.')
        if not success:
            scores.append(0)
            print('\tFailed 5 times, Moving On.')
    return scores.copy()
def q_learning(size, num_episodes, alpha, gamma=1.0, plot_every=100):
    """Q-Learning - TD Control on the ``game2048-v0`` environment.

    States are keyed by the string repr of the flattened board. Epsilon for
    episode ``i`` is ``1 / i``. After training, the averages collected every
    ``plot_every`` episodes are plotted and the best one is printed.

    Params
    ======
        size (int): board edge length, forwarded to ``gym.make``
        num_episodes (int): number of episodes to run the algorithm
        alpha (float): learning rate
        gamma (float): discount factor
        plot_every (int): number of episodes to use when calculating average score

    Returns
    =======
        Q (defaultdict): state key -> array of action values
    """
    env = gym.make('game2048-v0', size=size)
    nA = env.action_space.n                # number of actions
    Q = defaultdict(lambda: np.zeros(nA))  # initialize empty dictionary of arrays

    # monitor performance
    tmp_scores = deque(maxlen=plot_every)     # deque for keeping track of scores
    avg_scores = deque(maxlen=num_episodes)   # average scores over every plot_every episodes

    for i_episode in range(1, num_episodes+1):
        score = 0                                         # initialize score
        state = env.reset()                               # start episode
        state = str(state.reshape(size ** 2).tolist())
        eps = 1.0 / i_episode                             # set value of epsilon

        while True:
            action = epsilon_greedy(env, Q, state, nA, eps)    # epsilon-greedy action selection
            next_state, reward, done, info = env.step(action)  # take action A, observe R, S'
            next_state = str(next_state.reshape(size ** 2).tolist())
            score += reward                                    # add reward to agent's score
            Q[state][action] = update_Q_sarsamax(alpha, gamma, Q,
                                                 state, action, reward, next_state)
            state = next_state                                 # S <- S'
            if done:
                tmp_scores.append(score)                       # append score
                break

        # monitor progress
        print("\rEpisode {}/{}\t Average Score: {:.2f}".format(i_episode, num_episodes, np.mean(tmp_scores)), end="")
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes))
        sys.stdout.flush()

        if (i_episode % plot_every == 0):
            avg_scores.append(np.mean(tmp_scores))

    # plot performance
    plt.plot(np.linspace(0,num_episodes,len(avg_scores),endpoint=False), np.asarray(avg_scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()

    # print best average; np.max raises on an empty sequence, which happens
    # whenever fewer than plot_every episodes were run
    if avg_scores:
        print(('Best Average Reward over %d Q length: %d Episodes: ' % (plot_every, len(Q))), np.max(avg_scores))
    return Q
def test_env(model, vis=False):
    """Play one episode on the module-level ``env`` by sampling from ``model``.

    Args:
        model: Callable returning ``(dist, value)`` for a batched state
            tensor; actions are drawn with ``dist.sample()``.
        vis: When true, render the environment after reset and every step.

    Returns:
        Tuple ``(total_reward, env.get_score())``.
    """
    state = env.reset()
    if vis:
        env.render()

    episode_return = 0
    finished = False
    while not finished:
        # Add a batch dimension and move the state to the model's device.
        batch = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _value = model(batch)
        action = dist.sample().cpu().numpy()[0]
        state, reward, finished, _info = env.step(action)
        if vis:
            env.render()
        episode_return += reward
    return episode_return, env.get_score()
def test():
    """Play 1000 games with the trained network and print summary statistics.

    Loads ``saved2.npy`` only to derive the network's input size, restores
    the weights from ``model2.model`` and plays on the module-level ``env``.
    Prints every observation, the average score, and the fraction of moves
    that used each of the actions 1..9.

    The network's output slot ``k`` encodes action ``k + 1`` (training data
    one-hot encodes action 1 in slot 0), so the argmax is shifted by one
    before it is played. The previous version passed the raw slot index to
    the environment, so action 9 was never played and index 0 was sent.
    """
    training_data = np.load('saved2.npy')
    X = np.array([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]), 1)
    model = neural_network_model(input_size=len(X[0]))
    model.load("model2.model")

    scores = []
    choices = []
    for each_game in range(1000):
        score = 0
        prev_obs = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        env.reset()
        for _ in range(goal_steps):
            prediction = model.predict(
                np.array(prev_obs).reshape(-1, len(prev_obs), 1))[0]
            # Convert the 0-based output slot to the 1-based action id.
            action = np.argmax(prediction) + 1
            choices.append(action)
            new_observation, reward, done, info = env.step(action)
            print(new_observation)
            prev_obs = new_observation
            score += reward
            if done:
                break
        scores.append(score)

    print('Average Score', sum(scores) / len(scores))
    print(
        'Choice 1: {}, Choice 2: {}, Choice 3: {}, Choice 4: {}, Choice 5: {}, Choice 6: {}, Choice 7: {}, Choice 8: {}, Choice 9: {}'
        .format(*(choices.count(k) / len(choices) for k in range(1, 10))))
def train_ql(size, lr, rd, eps_start=1.0, eps_end=0.05, eps_decay=0.999):
    """Train a ``model.QLearning`` agent on ``game2048-v0`` for 400000 episodes.

    Boards are keyed by the string repr of the flattened board; the terminal
    successor is replaced by the literal key ``'terminal'``. Epsilon is
    decayed once per episode. A progress line with 100-episode moving
    averages is rewritten after every episode and kept every 100 episodes.
    Afterwards the agent is evaluated and summary lines are printed.

    NOTE(review): the placement of the final ``eval`` call and summary
    prints (after the training loop) was reconstructed from a flattened
    source line -- confirm against the original file.

    Args:
        size: Board edge length.
        lr: Learning rate passed to the agent.
        rd: Reward decay passed to the agent.
        eps_start: Initial epsilon.
        eps_end: Lower bound for epsilon.
        eps_decay: Multiplicative per-episode decay.
    """
    env = gym.make('game2048-v0', size=size)
    agent = model.QLearning(env.action_space, learning_rate=lr, reward_decay=rd)
    n_cells = size ** 2

    def encode(board):
        # Flatten the board and use its list repr as the table key.
        return str(board.reshape(n_cells).tolist())

    total_steps = 0
    total_scores = 0
    highest_score = 0
    # trials = 1 * 100000 * (size ** 2)
    trials = 400000
    rewards_window = deque(maxlen=100)
    scores_window = deque(maxlen=100)
    eps = eps_start
    progress_format = '\rEpisode {}\t total_steps: {}\t Average Rewards: {:.2f}\t Average Scores: {:.2f} {}'

    for trial in range(1, trials+1):
        obs = encode(env.reset())
        stepno = 0
        rewards = 0
        done = False
        while not done:
            stepno += 1
            total_steps += 1
            action = agent.choose_action(str(obs), eps)
            next_board, reward, done, _ = env.step(action)
            obs_ = 'terminal' if done else encode(next_board)
            agent.learn(obs, action, reward, obs_)
            obs = obs_
            rewards += reward
        #env.render()

        eps = max(eps_end, eps_decay * eps)
        rewards_window.append(rewards)
        scores_window.append(env.get_score())
        highest_score = max(highest_score, env.get_score())
        total_scores += env.get_score()

        progress = progress_format.format(
            trial, total_steps, np.mean(rewards_window), np.mean(scores_window), eps)
        print(progress, end="")
        if trial % 100 == 0:
            # Same line again, this time terminated so it stays on screen.
            print(progress)

    eval(env, agent, 1000, render=False)
    print(f'table_len: {len(agent.q_table)} steps: {total_steps} avg_score: {total_scores / trials} \
            highest_score: {highest_score} at size: {size} lr: {lr} reward_decay: {rd}')
    print(f'table_len: {len(agent.q_table)} steps: {total_steps}')
def train_sarsa(size, lr, rd):
    """Train a ``model.Sarsa`` agent on ``game2048-v0`` for ``1000 * size ** 2`` episodes.

    Boards are keyed by the string repr of the flattened board; the terminal
    successor is replaced by the literal key ``'terminal'``. Episodes whose
    highest tile reaches ``2 ** (size ** 2 - 1)`` or ``2 ** (size ** 2)``
    are recorded in ``highest`` and ``targets`` respectively. Afterwards the
    agent is evaluated and summary lines are printed.

    NOTE(review): ``highest`` and ``targets`` are not defined in this
    function; presumably they are module-level dicts -- verify.
    NOTE(review): the placement of the final ``eval`` call and summary
    prints (after the training loop) was reconstructed from a flattened
    source line -- confirm against the original file.

    Args:
        size: Board edge length.
        lr: Learning rate passed to the agent.
        rd: Reward decay passed to the agent.
    """
    env = gym.make('game2048-v0', size=size)
    agent = model.Sarsa(env.action_space, learning_rate=lr, reward_decay=rd)
    n_cells = size ** 2

    def encode(board):
        # Flatten the board and use its list repr as the table key.
        return str(board.reshape(n_cells).tolist())

    total_steps = 0
    total_scores = 0
    highest_score = 0
    trials = 1 * 1000 * n_cells
    for trial in range(trials):
        obs = encode(env.reset())
        action = agent.choose_action(obs)
        stepno = 0
        rewards = 0
        done = False
        while not done:
            stepno += 1
            total_steps += 1
            next_board, reward, done, _ = env.step(action)
            obs_ = encode(next_board)
            # The follow-up action is chosen from the real successor state,
            # before that state is relabelled as terminal.
            action_ = agent.choose_action(obs_, True)
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_, action_)
            obs, action = obs_, action_
            rewards += reward
        #env.render()

        print(f'Completed in {trial} use {stepno} steps highest: \
            {env.highest()} rewards: {rewards}', end="")
        if env.highest() >= 2 ** (n_cells - 1):
            highest[trial] = env.highest()
        if env.highest() >= 2 ** n_cells:
            targets[trial] = env.highest()
        highest_score = max(highest_score, env.get_score())
        total_scores += env.get_score()

    eval(env, agent, render=False)
    print(f'table_len: {len(agent.q_table)} steps: {total_steps} avg_score: {total_scores / trials} \
            highest_score: {highest_score} at size: {size} lr: {lr} reward_decay: {rd}')
    print(f'highest len: {len(highest)} prob: {len(highest) * 1.0 / trials} \
            target len: {len(targets)} prob: {len(targets) * 1.0 / trials}')
def respond(self, env):
    """Play the valid action with the highest policy output of the first agent's network.

    The mask is built from ``self.env``'s current and last cards, the state
    is read from the ``env`` argument, and the network's ``valid_policy``
    and ``val_pred`` tensors are evaluated in ``self.sess``. The argmax of
    the returned policy indexes into the list of mask-enabled actions.

    NOTE(review): the mask comes from ``self.env`` while state and step use
    the ``env`` argument -- presumably both refer to the same environment;
    verify against the caller.

    Returns:
        Whatever ``env.step`` returns for the chosen action.
    """
    current_cards = to_char(self.env.get_curr_cards())
    mask = get_mask(current_cards, self.action_space, to_char(self.env.get_last_cards()))

    # Flatten the state into a single-row batch.
    state_row = np.reshape(env.get_state(), [1, -1])

    network = self.agents[0].network
    fetches = [network.valid_policy, network.val_pred]
    feeds = {
        network.input: state_row,
        network.mask: np.reshape(mask, [1, -1]),
    }
    policy_batch, _val = self.sess.run(fetches, feed_dict=feeds)
    policy = policy_batch[0]

    valid_actions = np.take(np.arange(self.a_dim), mask.nonzero()).reshape(-1)
    # a = np.random.choice(valid_actions, p=policy)
    best = valid_actions[np.argmax(policy)]
    # print("taking action: ", self.action_space[best])
    return env.step(self.action_space[best])
def train(RL):
    """Train ``RL`` on the module-level ``env`` for 2001 episodes.

    Each episode lasts at most 200 steps. Observations are the list part of
    the environment state concatenated with the flattened array part.
    Learning starts once more than ``MEMORY_SIZE`` transitions were
    collected. Every 100 episodes the average episode reward is written to
    ``logger`` and a checkpoint is saved.

    Args:
        RL: Agent providing ``choose_action``, ``store_transition``,
            ``learn`` and ``cost_his``.

    Returns:
        Tuple ``(RL.cost_his, acc_r)`` where ``acc_r`` is the running sum of
        all rewards, starting with 0.
    """
    acc_r = [0]
    total_steps = 0
    episode = 0
    all_reward = 0
    # observation = env.reset()
    while True:
        # if total_steps-MEMORY_SIZE > 9000: env.render()
        s, t = env.reset()
        observation = s + list(t.reshape(-1, ))
        for i in range(200):
            action = RL.choose_action(observation)
            # f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # [-2 ~ 2] float actions
            (s_, t), reward, done, info = env.step(actions[action])
            observation_ = s_ + list(t.reshape(-1, ))
            acc_r.append(reward + acc_r[-1])  # accumulated reward
            RL.store_transition(observation, action, reward, observation_)
            observation = observation_
            total_steps += 1
            all_reward += reward
            if total_steps > MEMORY_SIZE:
                RL.learn()
            if done:
                break
        # if total_steps-MEMORY_SIZE > 15000:
        #     break
        episode += 1
        if (episode % 100 == 0):
            summary = {'averageTotalReward': all_reward / 100}
            all_reward = 0
            for tag, value in summary.items():
                # Log against the episode counter. The old code used the
                # inner step index `i`, which never exceeds 199, so later
                # summaries landed on arbitrary, repeating steps.
                logger.scalar_summary(tag, value, episode)
            saver.save(sess, './ddpg.ckpt', global_step=episode + 1)
        if (episode > 2000):
            break
    return RL.cost_his, acc_r
def main():
    """Run the agent on ``dobro-CartPole-v0`` and print every episode's score.

    For each of 1e5 epochs, episodes are played until at least 1000 steps
    were taken in that epoch. An episode ends when the environment reports
    ``done`` or after ``min(500, env.spec.max_episode_steps)`` steps. Every
    step is rendered.
    """
    env_name = "dobro-CartPole-v0"
    env = gym.make(env_name)

    time_horizon = 20
    agent_args = dict(
        discount_factor=0.99,
        time_horizon=time_horizon,
        time_step=0.02,
    )
    agent = Agent(env, agent_args)

    max_steps = 1000
    max_ep_len = min(500, env.spec.max_episode_steps)
    episodes = int(max_steps / max_ep_len)  # computed but not used below
    epochs = int(1e5)

    for epoch in range(epochs):
        steps_this_epoch = 0
        while steps_this_epoch < max_steps:
            state = env.reset()
            score = 0
            steps_this_episode = 0
            finished = False
            while not finished:
                steps_this_episode += 1
                steps_this_epoch += 1
                action = agent.get_action(state)
                state, reward, done, info = env.step(action)
                env.render()
                #time.sleep(0.01)
                score += reward
                finished = bool(done or steps_this_episode >= max_ep_len)
            print(score)
def objective(space):
    """Evaluate one hyper-parameter setting by running a single planned episode.

    Builds ``ENV`` wrapped in ``ActionRepeat``, plans every action with
    ``cem_planner`` on a pool of 32 workers and sums the negative rewards.

    Args:
        space: Mapping with the entries ``'repeat'``, ``'horizon'`` and
            ``'topk'``; each is converted with ``int``.

    Returns:
        Dict with ``'loss'`` (negative episode return) and ``'status'``
        (``STATUS_OK``).
    """
    env = gym.make(ENV)
    env = ActionRepeat(env, int(space['repeat']))
    proposals = 1000
    iterations = 10
    horizon = int(space['horizon'])
    topk = int(space['topk'])

    # Pool of workers, each has its own copy of global environment variable
    pool = Pool(32, initializer, [env])
    try:
        cost = 0
        env.reset()
        for _ in range(env.num_steps):
            state = env.sim.get_state()
            action = cem_planner(pool, env.action_space, state, horizon,
                                 proposals, topk, iterations)
            _, reward, _, _ = env.step(action)
            cost -= reward
    finally:
        # This function is called once per trial; without closing the pool
        # every call would leave 32 worker processes behind.
        pool.close()
        pool.join()
    return {'loss': cost, 'status': STATUS_OK}
def eval(env, agent, times=1000, render=False):
    """Play ``times`` episodes with a tabular agent and report the scores.

    Boards are keyed by the string repr of the flattened board. When a move
    leaves the board key unchanged, ``agent.learn`` is called for that
    transition. The board is rendered after every episode; with ``render``
    it is also printed and rendered after every step. Unless ``times`` is 0,
    the scores and highest tiles are plotted and the average and best score
    are printed.

    Args:
        env: Environment providing ``get_size``, ``reset``, ``step``,
            ``render``, ``get_score`` and ``highest``.
        agent: Agent providing ``choose_action(obs)`` and ``learn``.
        times: Number of episodes to play.
        render: Print and render every step when true.
    """
    # The unreachable `if False: write_explore(...)` debug branches that used
    # to surround this function body were removed.
    highest_score = 0
    total_scores = 0
    n_cells = env.get_size() ** 2
    scores = []
    max_tiles = []

    for _ in range(times):
        obs = env.reset()
        obs = str(obs.reshape(n_cells).tolist())
        while True:
            action = agent.choose_action(obs)
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(n_cells).tolist())
            if render:
                print(f'action is: {action} {obs} {obs_}')
                env.render()
            if obs_ == obs:
                # The move did not change the board.
                agent.learn(obs, action, reward, obs_)
            obs = obs_
            if done:
                break

        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()

    if times > 0:
        plot_score(scores, max_tiles)
        print(f'eval avg_score: {total_scores / times} highest_score: {highest_score}')
def evaluate(time, env, agent, render=False):
    """Run ``time`` evaluation episodes and return the mean episode reward.

    Each episode lasts at most 200 steps. Actions predicted by the agent
    are clipped to [-1, 1] before being applied. The result is both printed
    and logged at warning level.

    Args:
        time: Number of episodes to run.
        env: Environment providing ``reset``, ``step`` and ``render``.
        agent: Agent providing ``predict(obs)``.
        render: Render every step when true.

    Returns:
        Mean of the per-episode reward sums.
    """
    max_steps = 200
    eval_reward = []
    for _episode in range(time):
        obs = env.reset()
        episode_reward = 0
        for _step in range(max_steps):
            # Pick the best action and keep it inside the valid range.
            action = np.clip(agent.predict(obs), -1, 1)
            obs, reward, is_over, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if is_over:
                break
        eval_reward.append(episode_reward)

    mean_reward = np.mean(eval_reward)
    message = "evaluating on {} episodes with mean reward {}.".format(time, mean_reward)
    print(message)
    logging.warning(message)
    return mean_reward
def run_episode(env, agent, rpm):
    """Play one training episode of at most 200 steps and fill the replay memory.

    The agent's prediction is perturbed with Gaussian noise
    (``opt["NOISE"]``) and clipped to [-1, 1]. Every transition is stored
    with the reward scaled by ``opt["REWARD_SCALE"]``. Once the memory holds
    more than ``opt["MEMORY_WARMUP_SIZE"]`` entries, the agent learns from a
    sampled batch every ``opt["LEARN_FREQ"]`` steps.

    Args:
        env: Environment providing ``reset`` and ``step``.
        agent: Agent providing ``predict`` and ``learn``.
        rpm: Replay memory providing ``append``, ``sample`` and ``len``.

    Returns:
        Tuple ``(step, total_reward)`` with the unscaled reward sum.
    """
    obs = env.reset()
    step = 0
    total_reward = 0
    finished = False
    while not finished:
        # Sample an action around the agent's prediction.
        predicted = agent.predict(obs)
        action = np.clip(np.random.normal(predicted, opt["NOISE"]), -1.0, 1.0)

        next_obs, reward, done, info = env.step(action)
        scaled_reward = opt["REWARD_SCALE"] * reward
        rpm.append((obs, action, scaled_reward, next_obs, done))

        warmed_up = len(rpm) > opt["MEMORY_WARMUP_SIZE"]
        if warmed_up and (step % opt["LEARN_FREQ"]) == 0:
            (b_obs, b_action, b_reward, b_next_obs,
             b_done) = rpm.sample(opt["BATCH_SIZE"])
            agent.learn(b_obs, b_action, b_reward, b_next_obs, b_done)

        obs = next_obs
        total_reward += reward
        step += 1
        finished = bool(done or step >= 200)
    return step, total_reward
def test_agent(fname, agent, avg=100, seed=43):
    """Run an agent for ``avg`` episodes and report the travelled cost.

    Args:
        fname: Map file to use. When not ``None`` it is stored in the
            environment arguments and random maps are switched off.
        agent: Callable that builds the agent from the environment; the
            result must provide ``next_node(obs)``.
        avg: Number of episodes.
        seed: Seed passed to ``env.seed``.

    Returns:
        Tuple ``(mean, std)`` of the per-episode sums of ``info["cost"]``.
    """
    _, env_args = load_args(CONFIG_PATH)
    use_fixed_map = fname is not None
    if use_fixed_map:
        # A map was specified: use it instead of a random one.
        env_args["fname"] = fname
        env_args["random_map"] = False

    env = gym.make("ScavengerHuntMap-v0", **env_args)
    env.seed(seed)
    player = agent(env)

    episode_costs = []
    for episode in range(avg):
        print("Running %d/%d" % ((episode + 1), avg), end="\r")
        obs = env.reset()
        travelled = 0
        done = False
        while not done:
            target = player.next_node(obs)
            # The location is queried but not used; the call is kept as-is.
            env.env.map.get_current_loc()
            obs, _, done, info = env.step(target)
            travelled += info["cost"]
        episode_costs.append(travelled)

    return sum(episode_costs) / avg, np.std(episode_costs)
def eval(env, agent, times=1000, render=False):
    """Play ``times`` greedy episodes (epsilon 0) and report the scores.

    When a move leaves the observation unchanged (compared via ``str``), the
    transition is printed and the next action is requested with
    ``rand=True``; otherwise ``rand=False``. The flag carries over from one
    episode to the next. The board is rendered after every episode. Unless
    ``times`` is 0, the scores and highest tiles are plotted and the average
    and best score are printed.

    NOTE(review): whether the final print sits inside the ``times > 0``
    guard was reconstructed from a flattened source line -- confirm.

    Args:
        env: Environment providing ``reset``, ``step``, ``render``,
            ``get_score`` and ``highest``.
        agent: Agent whose ``choose_action(obs, eps, rand=...)`` returns
            ``(action, action_values)``.
        times: Number of episodes to play.
        render: Render every step when true.
    """
    eps = 0.0
    best_score = 0
    scores = []
    max_tiles = []
    force_random = False

    for _episode in range(times):
        obs = env.reset()
        done = False
        while not done:
            action, action_values = agent.choose_action(obs, eps, rand=force_random)
            obs_, reward, done, _ = env.step(action)
            if render:
                env.render()

            # An unchanged board means the move had no effect.
            force_random = str(obs_) == str(obs)
            if force_random:
                print(
                    f'action is: {action} {reward} {action_values} {obs} {obs_}'
                )
            obs = obs_

        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        best_score = max(best_score, env.get_score())

    if times > 0:
        plot_score(scores, max_tiles)
        print(
            f'eval avg_score: {np.mean(scores)} highest_score: {best_score}'
        )
def sarsa(lambd):
    """Run 1000 episodes of tabular Sarsa(lambda) and return the final MSE.

    Epsilon for a (dealer, player) state is ``n_zero / (n_zero + visits)``
    and the step size for a (dealer, player, action) triple is
    ``1 / visits``. No discount factor is applied to the successor value.
    For ``lambd`` equal to 0.0 or 1.0 the MSE is additionally sampled every
    ``epi_batch`` episodes and the learning curve is saved as a PNG whose
    name contains ``lambd`` and the current timestamp.

    Written for Python 2 (``xrange``, ``print`` statement).

    Args:
        lambd: Decay factor applied to the eligibility traces.

    Returns:
        ``compute_mse(action_value_function)`` after the last episode.
    """
    n_episodes = 1000
    epi_batch = 100
    episodes = xrange(n_episodes)
    action_value_function = defaultdict(float)
    n_zero = 100
    n_s = defaultdict(int)    # visit counts per (dealer, player)
    n_s_a = defaultdict(int)  # visit counts per (dealer, player, action)

    # The learning curve is only recorded for the two extreme lambdas.
    if lambd == 0.0 or lambd == 1.0:
        mses = []

    for episode in episodes:
        if episode%epi_batch == 0:
            if lambd == 0.0 or lambd == 1.0:
                mses.append(compute_mse(action_value_function))

        # initialize state, action, epsilon, and eligibility-trace
        state = State()
        current_dealer = state.dealer
        current_player = state.player

        epsilon = float(n_zero) / (n_zero + n_s[(current_dealer, current_player)])
        current_action = epsilon_greedy_policy(action_value_function, state, epsilon)
        eligibility_trace = defaultdict(int)

        while not state.terminal:
            n_s[(current_dealer, current_player)] += 1
            n_s_a[(current_dealer, current_player, current_action)] += 1

            # `step` advances `state`; the new dealer/player are read from it afterwards.
            reward = step(state, current_action)
            new_dealer = state.dealer
            new_player = state.player

            epsilon = float(n_zero) / (n_zero + n_s[(new_dealer, new_player)])
            new_action = epsilon_greedy_policy(action_value_function, state, epsilon)

            alpha = 1.0 / n_s_a[(current_dealer, current_player, current_action)]
            prev_action_value = action_value_function[(current_dealer, current_player, current_action)]
            new_action_value = action_value_function[(new_dealer, new_player, new_action)]

            # TD error without discounting.
            delta = reward + new_action_value - prev_action_value
            eligibility_trace[(current_dealer, current_player, current_action)] += 1

            # Every key currently in the table is updated in proportion to its trace.
            for key in action_value_function.keys():
                dealer, player, action = key

                # update the action value function
                action_value_function[(dealer, player, action)] \
                    += alpha * delta * eligibility_trace[(dealer, player, action)]

                # update eligibility-trace
                eligibility_trace[(dealer, player, action)] *= lambd

            # update state and action
            current_dealer = new_dealer
            current_player = new_player
            current_action = new_action

    if lambd == 0.0 or lambd == 1.0:
        mses.append(compute_mse(action_value_function))

    # plot mses curve
    if lambd == 0.0 or lambd == 1.0:
        print "Plotting learning curve for $\lambda$=",lambd
        x = range(0, n_episodes + 1, epi_batch)
        fig = plt.figure()
        plt.title('Learning curve of MSE against episode number: $\lambda$ = ' + str(lambd))
        plt.xlabel("episode number")
        plt.xlim([0, n_episodes])
        plt.xticks(range(0, n_episodes + 1, epi_batch))
        plt.ylabel("Mean-Squared Error (MSE)")
        plt.plot(x, mses)
        fname = "mse_lambda%f_%s.png" % (lambd, str(datetime.now()))
        plt.savefig(fname)
        # plt.show()

    mse = compute_mse(action_value_function)

    return mse
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Id of the environment used throughout this script.
ENV_NAME = 'timetable-case0001-v0001'

# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
print('observation space:', env.observation_space)
print('action space:', env.action_space)

# Smoke test: render once and take a single random step, printing the result.
# NOTE(review): render() and step() are called here without a prior reset();
# confirm that this environment supports that.
env.render()
action = env.action_space.sample()
print(action)
obs, r, done, info = env.step(action)
print('next observation:', obs)
print('reward:', r)
print('done:', done)
print('info:', info)
print('nb_actions', env.action_space.n)

# A second environment instance is created for training, replacing the one
# used for the smoke test, and the RNGs are seeded.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = Sequential()
# The input shape is prefixed with a length-1 axis.
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(300))
def sarsa(lambd): n_episodes = 1000 epi_batch = 100 episodes = xrange(n_episodes) action_value_function = defaultdict(float) linear_function = LinearFunction() params_hit = np.array([0 for i in range(18)]) params_stick = np.array([0 for i in range(18)]) n_zero = 10 epsilon = 0.05 alpha = 0.01 if lambd == 0.0 or lambd == 1.0: mses = [] for episode in episodes: if episode%epi_batch == 0: if lambd == 0.0 or lambd == 1.0: mses.append(calculate_mse(action_value_function)) # initialize state, action, epsilon, and eligibility-trace state = State() linear_function.update(state) current_feats = linear_function.get_features() action = epsilon_greedy_policy(action_value_function, state, epsilon, current_feats) eligibility_hit = np.array([0 for i in range(18)]) eligibility_stick = np.array([0 for i in range(18)]) while not state.terminal: np_feats = np.array(current_feats) if action is HIT: eligibility_hit = np.add(eligibility_hit, np_feats) else: eligibility_stick = np.add(eligibility_stick, np_feats) reward = step(state, action) linear_function.update(state) new_features = linear_function.get_features() # update delta delta_hit = reward - np.array(tuple(new_features)).dot(params_hit) delta_stick = reward - np.array(tuple(new_features)).dot(params_stick) # update Action Value Function if action == HIT: update_action_value_function(action_value_function, (new_features, action), params_hit) else: update_action_value_function(action_value_function, (new_features, action), params_stick) # update delta, parameters, and eligibility-trace if action == HIT: delta_hit += action_value_function[(tuple(new_features), HIT)] else: delta_stick += action_value_function[(tuple(new_features), STICK)] params_hit = np.add(params_hit, alpha * delta_hit * eligibility_hit) params_stick = np.add(params_stick, alpha * delta_stick * eligibility_stick) eligibility_hit = eligibility_hit * lambd eligibility_stick = eligibility_stick * lambd # decide an action action = 
epsilon_greedy_policy(action_value_function, state, epsilon, new_features) # update state and action current_features = new_features if lambd == 0.0 or lambd == 1.0: mses.append(calculate_mse(action_value_function)) # plot mses curve if lambd == 0.0 or lambd == 1.0: print "Plotting learning curve for $\lambda$=",lambd x = range(0, n_episodes + 1, epi_batch) fig = plt.figure() plt.title('Learning curve of MSE against Episodes @ $\lambda$ = ' + str(lambd)) plt.xlabel("episode number") plt.xlim([0, n_episodes]) plt.xticks(range(0, n_episodes + 1, epi_batch)) plt.ylabel("Mean-Squared Error (MSE)") plt.plot(x, mses) fname = "lapprox_mse_lambda%f_%s.png" % (lambd, str(datetime.now())) plt.savefig(fname) # plt.show() mse = calculate_mse(action_value_function) return mse