Example #1
def main():
    env = gym.make('Taxi-v2')
    num_episodes = 20000

    ## Sarsa
    agent = Agent(method='Sarsa')
    sarsa_avg_rewards, sarsa_best_avg_reward = interact(
        env, agent, num_episodes=num_episodes)
    plot_performance(num_episodes, sarsa_avg_rewards, "Sarsa", disp_plot=True)

    # ## Expected Sarsa
    agent = Agent(method='Expected Sarsa')
    exp_sarsa_avg_rewards, exp_sarsa_best_avg_reward = interact(
        env, agent, num_episodes=num_episodes)
    plot_performance(num_episodes,
                     exp_sarsa_avg_rewards,
                     "Expected Sarsa",
                     disp_plot=True)

    ## Q-Learning
    agent = Agent(method='Q-Learning')
    sarsamax_avg_rewards, sarsamax_best_avg_reward = interact(
        env, agent, num_episodes=num_episodes)
    plot_performance(num_episodes,
                     sarsamax_avg_rewards,
                     "Sarsamax (Q-Learning)",
                     disp_plot=True)

    ## All performances
    plot_all_performances(
        num_episodes,
        [sarsa_avg_rewards, exp_sarsa_avg_rewards, sarsamax_avg_rewards],
        title="Comparison of Temporal Difference control methods")
Example #2
def objective(args):
    env = gym.make(f'Taxi-{c_args.taxi_version}')

    best_scores = []
    for i in range(c_args.n_iters):
        agent = Agent(algorithm=c_args.algo,
                      alpha=args[0],
                      start_epsilon=args[1],
                      epsilon_decay=args[2],
                      epsilon_cut=None if args[3][0] is None else args[3][1],
                      gamma=args[4])

        avg_rewards, best_avg_reward = interact(env, agent, print_logs=False)
        best_scores.append(best_avg_reward)

    return -sum(best_scores) / len(best_scores)
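
objective() returns the negated mean of the best scores so that a minimizer can drive it; a hypothetical driver using hyperopt's fmin is sketched below (the search-space bounds, and the choice of hyperopt itself, are assumptions not present in the source).

from hyperopt import fmin, hp, tpe

space = [
    hp.uniform('alpha', 0.01, 1.0),            # args[0]
    hp.uniform('start_epsilon', 0.1, 1.0),     # args[1]
    hp.uniform('epsilon_decay', 0.9, 0.9999),  # args[2]
    hp.choice('epsilon_cut', [(None,), ('cut', 0.01)]),  # args[3]
    hp.uniform('gamma', 0.8, 1.0),             # args[4]
]

best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50)
print(best)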
Example #3
import gym
from ddpg_agent import Agent
import numpy as np
from unityagents import UnityEnvironment
import matplotlib.pyplot as plt
from monitor import interact

env = UnityEnvironment(file_name="./Reacher_Linux_NoVis/Reacher.x86_64")
# reset env and extract state_dim and action_dim
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
state_dim = env_info.vector_observations.shape[1]
action_dim = brain.vector_action_space_size
# number of agents
num_agents = len(env_info.agents)

agent = Agent(state_dim=state_dim,
              action_dim=action_dim,
              num_agents=num_agents,
              seed=np.random.randint(100))
scores = interact(env, brain_name, agent, num_agents)

# # plot the scores
# fig = plt.figure()
# ax = fig.add_subplot(111)
# plt.plot(np.arange(len(scores)), scores)
# plt.ylabel('Score')
# plt.xlabel('Episode #')
# plt.show()
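
monitor.interact is not shown in this example; the sketch below outlines what one training episode against a UnityEnvironment brain typically looks like (the agent.act and agent.step method names are assumptions about the DDPG Agent, not taken from the source).

env_info = env.reset(train_mode=True)[brain_name]
states = env_info.vector_observations              # shape: (num_agents, state_dim)
episode_scores = np.zeros(num_agents)
while True:
    actions = agent.act(states)                    # assumed Agent API
    env_info = env.step(actions)[brain_name]       # advance all parallel agents
    next_states = env_info.vector_observations
    rewards = env_info.rewards
    dones = env_info.local_done
    agent.step(states, actions, rewards, next_states, dones)  # assumed Agent API
    states = next_states
    episode_scores += rewards
    if np.any(dones):
        break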
Example #4
# -1: lose one point for every step taken
# -10: dropping the passenger off at the wrong location

env = gym.make('Taxi-v2')

## Inspect the environment first
action_size = env.action_space.n
state_size = env.observation_space.n
print('Number of states: ', state_size)
print('Available actions: ', action_size)

agent = Agent()

# Set the hyperparameters
total_episodes = 20000
avg_rewards, best_avg_reward = interact(env, agent, total_episodes)

#Q = agent.Q
#
## Test
## Set a specific state
#env.reset()
#state = 122
## Visualize
#env.render()
#print('==== Start position ======')
#
#while True:
#    # Pick an action based on Q -- (A0)
#    action = np.argmax(Q[state])
#    # Take A0 to obtain R1 and S1
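
The commented-out test above is cut off mid-loop; a hypothetical completion of that greedy rollout is sketched below (forcing the start state through env.unwrapped.s is a Taxi-specific assumption, and the classic gym step signature is assumed).

Q = agent.Q
env.reset()
env.unwrapped.s = 122              # force a specific start state (Taxi-specific)
state = env.unwrapped.s
env.render()
while True:
    action = np.argmax(Q[state])   # greedy action from the learned Q-table
    state, reward, done, info = env.step(action)
    env.render()
    if done:
        break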
Example #5
from itertools import product
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v3')
epss = (0.0, 0.1, 0.2)
alphas = (0.05, 0.1, 0.2)
rewards = dict()
rewards_over_time = dict()

products = list(product(epss, alphas))
for i, (eps, alpha) in enumerate(products):
    print(f'{i}/{len(products)}: eps: {eps}, alpha: {alpha}')
    agent = Agent(eps=eps, alpha=alpha)
    avg_rewards, best_avg_reward = interact(env, agent, num_episodes=20_000)
    rewards[(eps, alpha)] = best_avg_reward
    rewards_over_time[(eps, alpha)] = avg_rewards

Example #6
import gym
from nav_agent import Agent
import numpy as np
from unityagents import UnityEnvironment
import matplotlib.pyplot as plt
from monitor import interact

env = UnityEnvironment(file_name="./Banana_Linux_NoVis/Banana.x86_64")
# reset env and extract state_dim and action_dim
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
state_dim = len(env_info.vector_observations[0])
action_dim = brain.vector_action_space_size

agent = Agent(state_dim=state_dim, action_dim=action_dim, seed=0)
scores = interact(env, brain_name, agent)

# # plot the scores
# fig = plt.figure()
# ax = fig.add_subplot(111)
# plt.plot(np.arange(len(scores)), scores)
# plt.ylabel('Score')
# plt.xlabel('Episode #')
# plt.show()
Example #7
def fitness_function(individual, parameter):
    agent = Agent(epsilon=individual['epsilon'],
                  gamma=individual['gamma'],
                  epsilonreducer=individual['epsilonreducer'])
    avg_rewards, best_avg_reward = interact(env, agent)
    return best_avg_reward
Example #8
def objective(args):
    env = gym.make('Taxi-v2')
    agent = Agent(alpha=args['alpha'])
    avg_rewards, best_avg_reward = interact(env, agent)
    return -1*best_avg_reward
Example #9
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v2')
agent = Agent()
avg_rewards, best_avg_reward = interact(env, agent)
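
monitor.interact is imported throughout these examples but never shown; the sketch below illustrates what such a loop usually does, tracking the best average reward over a sliding window (the agent.select_action and agent.step method names are assumptions).

from collections import deque

def interact_sketch(env, agent, num_episodes=20000, window=100):
    """Run episodes and track the best average reward over the last `window` episodes."""
    avg_rewards = deque(maxlen=num_episodes)
    samp_rewards = deque(maxlen=window)
    best_avg_reward = -np.inf
    for i_episode in range(1, num_episodes + 1):
        state = env.reset()
        episode_reward = 0
        while True:
            action = agent.select_action(state)                  # assumed Agent API
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)  # assumed Agent API
            episode_reward += reward
            state = next_state
            if done:
                break
        samp_rewards.append(episode_reward)
        if i_episode >= window:
            avg_reward = np.mean(samp_rewards)
            avg_rewards.append(avg_reward)
            best_avg_reward = max(best_avg_reward, avg_reward)
    return avg_rewards, best_avg_reward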
Example #10
from agent import Agent
from monitor import interact
from tqdm.auto import tqdm
import gym
import numpy as np

env = gym.make('Taxi-v3')

## Q-Learning
agent = Agent(alpha=0.1, gamma=0.99)
avg_rewards, best_avg_reward = interact(env, agent, is_qlearning=True)

## Expected Sarsa
# agent = Agent(epsilon=0.001, alpha=0.1, gamma=0.99)
# avg_rewards, best_avg_reward = interact(env, agent, is_qlearning=False)
Example #11
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v2')
agent = Agent()
avg_rewards, best_avg_reward, samp_rewards = interact(env, agent)
Example #12
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v2')
agent = Agent()
avg_rewards, best_avg_reward = interact(env,
                                        agent,
                                        num_episodes=100000,
                                        window=100)
Example #13
from agent import Agent
from monitor import interact
import gym
import numpy as np
import time
import datetime

env = gym.make('Taxi-v2')
agent = Agent()

# set timer
tick = time.time()
avg_rewards, best_avg_reward, scores = interact(env, agent, 100000)
tock = time.time()
elapsed = tock - tick
print(str(datetime.timedelta(seconds=elapsed)))

for score in scores:
    print(score)
Example #14
def interact_wrapper(epsilon, alpha, gamma):
    agent = Agent(epsilon=epsilon, alpha=alpha, gamma=gamma)
    avg_rewards, best_avg_reward = interact(env, agent)
    return best_avg_reward
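
A wrapper with this signature maps directly onto a keyword-argument optimizer; a hypothetical driver using the bayesian-optimization package is sketched below (the library choice and the parameter bounds are assumptions, not from the source).

from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=interact_wrapper,
    pbounds={'epsilon': (0.001, 0.5), 'alpha': (0.01, 1.0), 'gamma': (0.8, 1.0)},
    random_state=1,
)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)   # best (epsilon, alpha, gamma) found and its reward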
Example #15
File: main.py Project: Alucarrd/DLND
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v2')
agent = Agent()
avg_rewards, best_avg_reward = interact(env, agent, num_episodes=50000)
Example #16
from agent import Agent
from monitor import interact
import gym
import numpy as np

alpha_values = [0.5, 0.1, 0.05, 0.01]
num_episode_values = [20000]
gamma_values = [1, 0.8, 0.5]
epsilon_divisor_values = [4, 40, 400]

env = gym.make('Taxi-v2')
agent = Agent()

for alpha in alpha_values:
    for gamma in gamma_values:
        for epsilon_divisor in epsilon_divisor_values:
            for num_episode in num_episode_values:
                avg_rewards, best_avg_reward = interact(
                    env, agent, num_episode, 100, alpha, gamma,
                    epsilon_divisor)
                print(
                    'alpha: {}, gamma: {}, epsilon_divisor: {}, num_episode: {}, reward: {}'
                    .format(alpha, gamma, epsilon_divisor, num_episode,
                            best_avg_reward))
Example #17
only_KG = True

data = pd.DataFrame([], columns=['Agent', 'episode', 'reward'])
if only_KG:
    data_agent = pd.DataFrame(
        [], columns=['Agent', 'episode', 'greedy', 'mu', 'nu'])

n_episodes = 100000

for i in range(len(agent_classes)):
    agent = agent_classes[i](env)
    agent_name = agent_names[i]

    avg_rewards, best_avg_reward = interact(env,
                                            agent,
                                            num_episodes=n_episodes,
                                            window=n_episodes)

    data_new = pd.DataFrame(list(avg_rewards), columns=['reward'])
    data_new.loc[:, 'episode'] = range(0, len(list(avg_rewards)))
    data_new.loc[:, 'Agent'] = agent_name
    data = data.append(data_new)

    if only_KG:
        data_agent_new = pd.DataFrame(agent.greedy_choice, columns=['greedy'])
        data_agent_new.loc[:, 'episode'] = range(0, len(agent.greedy_choice))
        data_agent_new.loc[:, 'Agent'] = agent_name
        mu = [x[0] for x in agent.mu_vs_nu]
        nu = [x[1] for x in agent.mu_vs_nu]
        data_agent_new.loc[:, 'mu'] = mu
        data_agent_new.loc[:, 'nu'] = nu
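
The frames assembled above are presumably visualized afterwards; a hypothetical plotting step with seaborn is sketched below (not part of the source).

import seaborn as sns
import matplotlib.pyplot as plt

sns.lineplot(data=data, x='episode', y='reward', hue='Agent')
plt.xlabel('Episode')
plt.ylabel('Average reward')
plt.show()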
Example #18
def interact_function(epsilon, gamma, epsilonreducer):
    agent = Agent(epsilon=epsilon, gamma=gamma, epsilonreducer=epsilonreducer)
    avg_rewards, best_avg_reward = interact(env, agent)
    return -best_avg_reward  # negate because the optimizer minimizes, and we want to maximize the reward
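
The negation above makes the function compatible with a minimizer; a hypothetical driver using scikit-optimize's gp_minimize is sketched below (the library, the bounds, and the lambda wrapper are assumptions, not from the source).

from skopt import gp_minimize

result = gp_minimize(
    func=lambda p: interact_function(epsilon=p[0], gamma=p[1], epsilonreducer=p[2]),
    dimensions=[(0.001, 0.5), (0.8, 1.0), (0.9, 0.99999)],
    n_calls=30,
    random_state=0,
)
print(result.x, -result.fun)   # best parameters and the (un-negated) best average reward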
Example #19
def interact_wrapper(decay_rate, alpha, gamma):
    agent = Agent(decay_rate, alpha, gamma)
    avg_rewards, best_avg_reward = interact(env, agent, 15000)
    return best_avg_reward
Example #20
import gym
from ma_ddgp_agent import maddpgagent
import numpy as np
from unityagents import UnityEnvironment
import matplotlib.pyplot as plt
from monitor import interact

env = UnityEnvironment(file_name="Tennis_Linux_NoVis/Tennis.x86_64", seed=0)
# reset env and extract state_dim and action_dim
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
state_dim = env_info.vector_observations.shape[1]
action_dim = brain.vector_action_space_size
# number of agents
num_agents = len(env_info.agents)
agent = maddpgagent(state_dim=state_dim,
                    action_dim=action_dim,
                    num_agents=num_agents,
                    seed=0)
scores = interact(env, state_dim, brain_name, agent, num_agents)
Example #21
def main():
    env = gym.make('Taxi-v3')
    agent = Agent()
    avg_rewards, best_avg_reward = interact(env, agent)
Example #22
    # Set seeds based on local seed and run sequence number
    random.seed(i + local_seed)
    np.random.seed(100 * i + local_seed)
    env.seed(10000 * i + local_seed)
    env.action_space.seed(1000000 * i + local_seed)

    # Run the learning problem
    agent = Agent(alpha=alpha,
                  gamma=gamma,
                  get_epsilon=epfunc,
                  c1=c1,
                  c2=c2,
                  beta=beta)
    avg_rewards, best_avg_reward = interact(env,
                                            agent,
                                            n_episodes,
                                            show_progress=10000,
                                            endline='\n')
    best_avg_rewards.append(best_avg_reward)

    # Monitor results after each run
    print("\rRun {}/{}, average so far={}".format(
        i, nruns,
        sum(best_avg_rewards) / len(best_avg_rewards)))

print('\nLocal seed: ', local_seed)
print('Average: ', sum(best_avg_rewards) / len(best_avg_rewards))
print('Median: ', sorted(best_avg_rewards)[medsub])
np.array(sorted(best_avg_rewards))
Example #23
# -*- coding: utf-8 -*-
"""
Created on Sun May  3 13:23:33 2020

@author: Srimanth Tenneti
"""

from agent import Agent
from monitor import interact
import gym
import numpy as np
import matplotlib.pyplot as plt

env = gym.make('Taxi-v3')  # Loading the environment
agent = Agent()  # Creating an agent instance
avg_rewards, best_avg_reward = interact(env, agent)  # Training the agent
Example #24
def run(params):
    alpha, epsilon, gamma = params
    env = gym.make('Taxi-v2')
    agent = Agent(epsilon=epsilon, alpha=alpha, gamma=gamma)
    avg_rewards, best_avg_reward = interact(env, agent, 10000)
    return best_avg_reward
Example #25
def save_obj(obj, name):
    with open('obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)


env = gym.make('Taxi-v2')

num_episodes = 10000
alphas = np.arange(0.1, 1, 0.1)
epsilons = np.arange(0.1, 1, 0.1)
algo_types = [AlgorithmType.QLearning, AlgorithmType.ExpectedSarsa]
# experiment_tag = 'alpha-.1-1-.1-eps-.1-1-.1'

# avg_rewards = defaultdict(dict)
# best_avg_rewards = defaultdict(dict)

# for t in algo_types:
#     for i, a in enumerate(alphas):
#         for e in epsilons:
#             avg_rewards[a][e], best_avg_rewards[a][e] = interact(env, Agent(algorithm_type=t, epsilon=e, alpha=a), num_episodes=num_episodes)
#             if i == len(alphas)-1:
#                 save_obj(avg_rewards, 'avg_rewards-'+ experiment_tag + '-' + t.name)
#                 save_obj(best_avg_rewards, 'best_avg_rewards-' + experiment_tag + '-'  + t.name)

avg_rewards, best_avg_rewards = interact(env,
                                         Agent(),
                                         num_episodes=num_episodes)
Example #26
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v3')
agent = Agent()
avg_rewards, best_avg_reward = interact(env, agent)
#avg_rewards, best_avg_reward = interact(env, agent, 'SARSA_MAX')
#avg_rewards, best_avg_reward = interact(env, agent, 'EXPECTED_SARSA')
Example #27
parser.add_argument("--port", default=52162)

# Pass args
args = parser.parse_args()

if __name__ == '__main__':

    # Create environment
    env = gym.make('LunarLander-v2')
    env.seed(0)

    # Instantiate agent
    agent = Agent(
        state_size=8,  # Box(-inf, inf, (8,), float32)
        action_size=4,  # Discrete(4)
        buffer_size=args.buffer_size,
        batch_size=args.batch_size,
        gamma=args.gamma,
        tau=args.tau,
        lr=args.lr,
        update_every=args.update_every)

    # Interact with environment
    scores = interact(env,
                      agent,
                      n_episodes=args.n_episodes,
                      max_t=args.max_t,
                      eps_start=args.eps_start,
                      eps_end=args.eps_end,
                      eps_decay=args.eps_decay)