Example #1
def main(stdscr):
    model = Model()
    agent = A2CAgent(model)
    learning_environment = LearningEnvironment()

    agent.initialize_model(learning_environment)
    agent.load_model_if_previously_saved()

    game_controller = GameController(stdscr)
    game_controller.play(agent, learning_environment)
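
The main(stdscr) signature is the one Python's curses.wrapper expects: the wrapper initializes the terminal, passes the screen object in as stdscr, and restores the terminal on exit. A likely entry point, not shown in the excerpt:

import curses

if __name__ == "__main__":
    # curses.wrapper sets up the screen, calls main(stdscr), and cleans the terminal up even on error
    curses.wrapper(main)
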
Example #2
    def play(self, stdscr):
        self._initialize_screen(stdscr)
        win = self._create_window()

        model = Model()
        agent = A2CAgent(model)
        learning_environment = LearningEnvironment()
        agent.load_pretrained_model(learning_environment)

        self._play_test_game(learning_environment, agent, win)
        self._draw_game_over_text_and_wait_for_input(win)
Example #3
    def run(self, epochs, batch_size):
        model = Model()
        agent = A2CAgent(model)
        learning_environment = LearningEnvironment()

        agent.initialize_model(learning_environment)
        variables = model.get_variables()

        for _ in range(epochs):
            variables = self._receive_variables_from_master(variables)
            model.set_variables(variables)

            observations, acts_and_advs, returns = agent.generate_experience_batch(
                learning_environment, batch_size)
            self._send_experience_to_master(observations, acts_and_advs,
                                            returns)
Example #4
    def learn(self, stdscr):
        self._initialize_screen(stdscr)
        win = self._create_window()

        model = Model()
        agent = A2CAgent(model)
        learning_environment = LearningEnvironment()

        agent.load_model_if_previously_saved(learning_environment)

        for iteration in range(200):
            if iteration % 10 == 0:
                self._play_test_game(learning_environment, agent, win)

            agent.train(learning_environment)

        agent.save_model()
Example #5
    def run(self, max_worker_id, epochs, worker_batch_size):
        model = Model()
        agent = A2CAgent(model)
        learning_environment = LearningEnvironment()

        agent.initialize_model(learning_environment)
        agent.load_model_if_previously_saved()

        saved_model_score = self._get_average_score(learning_environment,
                                                    agent, 100)

        for ep in range(1, epochs + 1):
            print("Epoch {}/{}".format(ep, epochs))

            if ep % 1000 == 0:
                current_score = self._get_average_score(
                    learning_environment, agent, 100)
                if current_score > saved_model_score:
                    agent.save_model()
                    saved_model_score = current_score

            variables = model.get_variables()
            self._send_variables_to_workers(variables)

            all_observations, all_acts_and_advs, all_returns = self._receive_experience_from_worker(
                1, worker_batch_size)
            for i in range(2, max_worker_id + 1):
                observations, acts_and_advs, returns = self._receive_experience_from_worker(
                    i, worker_batch_size)

                all_observations = np.concatenate(
                    (all_observations, observations))
                all_acts_and_advs = np.concatenate(
                    (all_acts_and_advs, acts_and_advs))
                all_returns = np.concatenate((all_returns, returns))

            model.train_on_batch(all_observations,
                                 [all_acts_and_advs, all_returns])

        current_score = self._get_average_score(learning_environment, agent,
                                                100)
        if current_score > saved_model_score:
            agent.save_model()
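
Examples #3 and #5 form a worker/master pair: the master broadcasts the current model variables, each worker generates an experience batch with those variables and sends it back, and the master trains on the concatenated batch. The excerpts never show how _send_variables_to_workers and _receive_experience_from_worker move data between processes; a minimal sketch of the idea using mpi4py, purely as an assumption about the transport:

from mpi4py import MPI  # assumption: the excerpts do not name the actual transport

comm = MPI.COMM_WORLD  # rank 0 acts as the master, ranks 1..N as workers

def send_variables_to_workers(variables):
    # master pushes the latest model variables to every worker rank
    for worker_rank in range(1, comm.Get_size()):
        comm.send(variables, dest=worker_rank, tag=0)

def receive_experience_from_worker(worker_rank):
    # blocks until the given worker returns its (observations, acts_and_advs, returns) tuple
    return comm.recv(source=worker_rank, tag=1)
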
Example #6
import gym
import numpy as np

from a2c import A2CAgent

env = gym.make("CartPole-v0")
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
MAX_EPISODE = 1000
MAX_STEPS = 500

lr = 7e-3
gamma = 0.99
value_coeff = 0.5
entropy_coeff = 1e-4

agent = A2CAgent(env, gamma, lr, value_coeff, entropy_coeff)

ep_rewards = []
for episode in range(MAX_EPISODE):
    state = env.reset()
    trajectory = []  # [[s, a, r, s', done], [], ...]
    episode_reward = 0
    for steps in range(MAX_STEPS):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append([state, action, reward, next_state, done])
        episode_reward += reward

        if done:
            break

        state = next_state  # advance to the next state before the following step
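
The excerpt stops once the episode terminates. In a typical A2C script the collected trajectory would then be handed to the agent and the episode reward recorded; the continuation below is only a sketch, assuming an update(trajectory) method on A2CAgent that the excerpt does not show:

    # hypothetical continuation of the episode loop; A2CAgent.update() is an assumption
    agent.update(trajectory)
    ep_rewards.append(episode_reward)
    if episode % 10 == 0:
        print("Episode {}: reward {:.1f}".format(episode, episode_reward))
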
Example #7
import gym
import matplotlib.pyplot as plt
import numpy as np
import multiprocessing
from a2c import A2CAgent

seed = 112

env = gym.make('LunarLander-v2')
env.seed(seed)

# get the dimensions of the space
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = A2CAgent(theta=.0001,
                 learning_rate=.002,
                 discount=.99,
                 actions=action_size,
                 space=state_size)

episodes = 100
scores = {}

# play the game, each iteration is an episode
for i in range(episodes):
    done = False
    # score keeping
    total_rewards = 0

    # reset the environment
    observation = env.reset()
    observation = np.reshape(observation, [1, state_size])
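
The excerpt ends right after the reset. A sketch of how the episode loop usually continues, reusing the get_action-style interface seen in the CartPole examples above; the actual method name on this A2CAgent is not shown, so treat it as an assumption:

    # hypothetical continuation; the agent's action-selection method is assumed
    while not done:
        action = agent.get_action(observation)
        next_observation, reward, done, _ = env.step(action)
        next_observation = np.reshape(next_observation, [1, state_size])
        total_rewards += reward
        observation = next_observation

    scores[i] = total_rewards
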
Example #8
import gym
import gym_sumo
from a2c import A2CAgent
from dqn import DQNAgent

env = gym.make('gym_sumo-v0')
agent = A2CAgent()
#agent = DQNAgent()

#agent.train(env)
agent.test(env)
Example #9
def main():
    args = parse_args()

    field_config = load_field_config(args.env_config)
    make_env = EnvFactory(field_config)
    env = make_env()

    # test env for meta setup
    make_test_env = None
    if args.test_env_config is not None:
        test_field_config = load_field_config(args.test_env_config)
        make_test_env = EnvFactory(test_field_config)

    experiment_config = get_config(args.experiment_config, env)

    device = torch.device(experiment_config['train']['device'])
    train_params = experiment_config['train']
    experiment_name = get_experiment_name(args)
    save_path = experiment_config['train'][
        'checkpoints_dir'] + experiment_name + '.pt'
    logdir = f'a2c/logs/{experiment_name}'
    if args.logdir is not None:
        logdir = args.logdir.rstrip('/') + '/' + experiment_name

    os.makedirs(logdir, exist_ok=True)

    reward_logs = []
    test_reward_logs = []
    for _ in tqdm(range(args.runs)):
        if args.hint_type is None:
            agent = A2CAgent(
                experiment_config['state'],
                receptive_field=env.receptive_field_size).to(device)
        else:
            agent = A2CAgent(
                experiment_config['state'],
                hint_type=args.hint_type,
                hint_config=experiment_config['hint'],
                receptive_field=env.receptive_field_size).to(device)

        optimizer = torch.optim.Adam(agent.parameters(),
                                     lr=experiment_config['train']['lr'])
        log = train(train_params['epochs'],
                    train_params['n_agents'],
                    make_env,
                    agent,
                    optimizer,
                    max_steps=train_params['max_steps'],
                    hint_type=args.hint_type,
                    make_test_env=make_test_env,
                    device=device,
                    experiment_name=experiment_name,
                    save_path=save_path,
                    log_dir=logdir,
                    max_reward_limit=train_params['max_reward_limit'],
                    reward_log_freq=train_params['reward_log_freq'],
                    plot_every=1)
        if make_test_env is not None:
            train_log, test_log = log
            reward_logs.append(train_log)
            test_reward_logs.append(test_log)
        else:
            reward_logs.append(log)

    logs_avg = average_logs(reward_logs) if args.runs > 1 else reward_logs[0]
    if make_test_env is not None:
        test_logs_avg = average_logs(
            test_reward_logs) if args.runs > 1 else test_reward_logs[0]

    writer = SummaryWriter(log_dir=logdir, filename_suffix=experiment_name)
    log_name = 'Reward' if make_test_env is None else 'META/Train reward'
    for i, reward in enumerate(logs_avg):
        writer.add_scalar(log_name, reward, i)
    if make_test_env is not None:
        for i, reward in enumerate(test_logs_avg):
            writer.add_scalar('META/Test reward', reward, i)
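
main() pulls everything it needs from a nested experiment_config mapping. The layout below is reconstructed only from the keys accessed above, with placeholder values, so it is a sketch of the expected shape rather than a real config:

# hypothetical config shape, inferred from the keys read in main(); values are placeholders
experiment_config = {
    'state': {},   # agent/state settings, passed to A2CAgent as its first argument
    'hint': {},    # hint settings, read only when --hint_type is given
    'train': {
        'device': 'cpu',
        'checkpoints_dir': 'checkpoints/',
        'lr': 1e-3,
        'epochs': 1000,
        'n_agents': 8,
        'max_steps': 100,
        'max_reward_limit': 1.0,
        'reward_log_freq': 10,
    },
}
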
Example #10
                    default=100000)

# GAME options
parser.add_argument("--n_actions",
                    type=int,
                    help="number of game output actions",
                    default=2)
parser.add_argument("--frame_size",
                    type=str,
                    help="size of game frame in pixels",
                    default=84)

if __name__ == '__main__':
    options = parser.parse_args()

    # Select agent
    if options.algo == 'dqn':
        agent = DQNAgent(options)
    elif options.algo == 'a2c':
        agent = A2CAgent(options)
    elif options.algo == 'ppo':
        agent = PPOAgent(options)
    else:
        print("ERROR. This algorithm has not been implemented yet.")
        raise SystemExit(1)  # stop here so `agent` is never used undefined below

    # Train or evaluate agent
    if options.mode == 'train':
        agent.train()
    elif options.mode == 'eval':
        agent.play_game()
Example #11
import gym

from a2c import A2CAgent

env = gym.make("CartPole-v0")
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
MAX_EPISODE = 1500
MAX_STEPS = 500

lr = 1e-4
gamma = 0.99

agent = A2CAgent(env, gamma, lr)


def run():
    for episode in range(MAX_EPISODE):
        state = env.reset()
        trajectory = []  # [[s, a, r, s', done], [], ...]
        episode_reward = 0
        for steps in range(MAX_STEPS):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append([state, action, reward, next_state, done])
            episode_reward += reward

            if done:
                break

            state = next_state
Example #12
            use_feature_units=False),
        step_mul=4,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=False)
    #with tf.Session() as sess:
    #A2C = a2c(sess, 0.00001)

    #sess.run(tf.global_variables_initializer())
    #saver = tf.train.Saver()
    # saver.restore(sess, "4wayBeacon_a2c/tmp/model.ckpt")

    state_size = 2
    action_size = 4

    agent = A2CAgent(state_size, action_size)

    for episodes in range(62626):

        obs = env.reset()  #####

        action = actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        obs = env.step(actions=[action])

        done = False  #####
        sub_done = False
        global_step = 0

        ##### states = np.empty(shape=[0, 2])
        ##### actions_list = np.empty(shape=[0, 4])
        ##### next_states = np.empty(shape=[0, 2])