Example No. 1
def main(args):
    env = gym.make('CartPole-v0')
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.n

    actor = Actor(dim_state, args.dim_hidden, dim_action)
    critic = Critic(dim_state, args.dim_hidden)
    agent = ActorCriticAgent(env=env,
                             actor=actor,
                             critic=critic,
                             lr=args.lr,
                             gamma=args.gamma,
                             render=args.render)

    scores = 0
    history = []
    for i in range(args.n_episodes):
        scores += agent.run_episode()
        if (i + 1) % args.print_interval == 0:
            print(
                f"[Episode {i+1}] Avg Score: {scores / args.print_interval:.3f}"
            )
            history.append(scores / args.print_interval)
            scores = 0.0

    plot_result(history, args.print_interval)
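
Actor, Critic, ActorCriticAgent and plot_result are imported from the surrounding repository and are not shown here. A minimal PyTorch sketch of policy and value networks consistent with the constructor calls above (the single hidden layer and the softmax head are assumptions, not the author's architecture) could look like:

import torch.nn as nn

class Actor(nn.Module):
    """Policy network: state -> action probabilities (assumed architecture)."""
    def __init__(self, dim_state, dim_hidden, dim_action):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim_state, dim_hidden), nn.ReLU(),
            nn.Linear(dim_hidden, dim_action), nn.Softmax(dim=-1))

    def forward(self, state):
        return self.net(state)

class Critic(nn.Module):
    """Value network: state -> scalar state value (assumed architecture)."""
    def __init__(self, dim_state, dim_hidden):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim_state, dim_hidden), nn.ReLU(),
            nn.Linear(dim_hidden, 1))

    def forward(self, state):
        return self.net(state)
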
Example No. 2
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))

# Create the agent to train with the parameters to use
agent = ActorCriticAgent(state_size=state_size,
                         action_size=action_size,
                         seed=0)

# Run the training
scores_mean_agent, score_mean_last100 = ddpg(env,
                                             agent,
                                             num_agents,
                                             brain_name,
                                             n_episodes=200,
                                             save_checkpoint=True,
                                             simu_name='single_train')

# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores_mean_agent)), scores_mean_agent)
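
The snippet stops right after plt.plot; to actually display the figure one would normally label the axes and call plt.show(), for example (an assumed continuation, not part of the original):

ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()
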
Example No. 3
def train(shared_model: torch.nn.Module, directory: str,
          hyperparams: HyperParams, frame_counter: torch.multiprocessing.Value,
          optimizer: torch.optim.Optimizer, monitor_queue: Queue,
          process_number: int):
    """
    trains an a3c agent on an openai gym environment.
    """
    torch.manual_seed(process_number)

    # make environment
    atari = hyperparams.feature_type == 'cnn'
    monitor = process_number == 0
    env = create_environment(hyperparams.env_name,
                             directory,
                             atari=atari,
                             monitor=monitor)
    env.seed(process_number)
    state = env.reset()
    state = torch.from_numpy(state)
    done = False
    episode_reward = 0
    episode_length = 0
    episode_values = []
    episode_start_time = time.time()
    hidden_state = (torch.zeros(1, 256), torch.zeros(1, 256))

    # make agent
    model = ActorCritic(env.observation_space.shape, env.action_space.n,
                        hyperparams.feature_type)
    agent = ActorCriticAgent(model, shared_model)

    # training loop
    while frame_counter.value < hyperparams.max_timesteps:
        # load weights from shared model
        model.load_state_dict(shared_model.state_dict())

        # reset batch
        batch = []

        # run environment to get batch
        for _ in range(hyperparams.batch_size):
            action, value, log_prob, entropy, hidden_state = agent.act(
                state, hidden_state)

            state, reward, done, _ = env.step(action)

            episode_reward += reward
            episode_length += 1
            episode_values.append(value.item())

            batch.append(
                TimestepInfo(value=value,
                             log_prob=log_prob,
                             reward=reward,
                             entropy=entropy))

            if done:
                state = env.reset()
                hidden_state = (torch.zeros(1, 256), torch.zeros(1, 256))

            state = torch.from_numpy(state)

            if done:
                now = time.time()
                episode_data = EpisodeData(
                    score=episode_reward,
                    length=episode_length,
                    average_value=np.mean(episode_values),
                    time_taken=now - episode_start_time)
                monitor_queue.put(episode_data)
                with frame_counter.get_lock():
                    frame_counter.value += episode_length
                episode_reward = 0
                episode_length = 0
                episode_values = []
                episode_start_time = now
                break

        # Get value of final timestep
        values = [x.value for x in batch]
        if done:
            values.append(torch.Tensor([0.]))
        else:
            _, value, _ = model(state, hidden_state)
            values.append(value)

        # reflect on batch
        critic_loss = 0
        actor_loss = 0
        gae = torch.Tensor([0])
        real_value = values[-1]

        # if -1 in [x.reward for x in batch]:
        #     import ipdb; ipdb.set_trace()

        for i in reversed(range(len(batch))):
            real_value = (hyperparams.discount_factor * real_value +
                          batch[i].reward)
            advantage = real_value - values[i]
            critic_loss = critic_loss + 0.5 * advantage.pow(2)

            value_delta = (batch[i].reward +
                           hyperparams.discount_factor * values[i + 1].data -
                           values[i].data)
            gae = (gae * hyperparams.discount_factor * hyperparams.gae +
                   value_delta)

            actor_loss = (actor_loss -
                          batch[i].log_prob * torch.Tensor([gae]) -
                          hyperparams.entropy_coef * batch[i].entropy)

        optimizer.zero_grad()

        loss = (critic_loss * hyperparams.critic_coef +
                actor_loss * hyperparams.actor_coef)
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 50)

        # Share gradients
        for param, shared_param in zip(model.parameters(),
                                       shared_model.parameters()):
            if shared_param.grad is not None:
                break
            shared_param._grad = param.grad

        optimizer.step()

        hidden_state = (hidden_state[0].data, hidden_state[1].data)
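
The backward pass over the batch above computes generalized advantage estimation (GAE), with hyperparams.gae playing the usual role of the lambda parameter. Restated as a standalone helper over plain Python floats, purely as a reference sketch (not code from the example):

import torch

def compute_gae(rewards, values, gamma=0.99, lam=1.0):
    """GAE over one rollout. values must hold len(rewards) + 1 entries:
    one estimate per visited state plus a bootstrap value for the final state."""
    advantages = []
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = gamma * lam * gae + delta
        advantages.append(gae)
    advantages.reverse()
    return torch.tensor(advantages)
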
Example No. 4
def run(train, n_episodes, log_dir, render=False):
    ## init
    env = AtariPong(gamma=0.999, seed=1)

    obs = env.initial_observation()
    agent = ActorCriticAgent(env.n_actions(), initial_observation=obs)

    step_idx = 0  # an episode consists of n>=1 steps
    episode_idx = 0  # an "episode" refers to a "rally" in Pong
    game_idx = 0  # a game consists of n>=1 episodes
    discounted_returns = [0] * n_episodes  # from the start state of every episode

    ## bookkeeper per game because training is done at the end of a game
    if train == True:
        training_data = {'obss': [], 'rewards': [], 'labels': []}

    ## main loop
    while (episode_idx < n_episodes):
        ## msg
        print('episode_idx= '+str(episode_idx)+ \
              ' @step_idx= '+str(step_idx)+ \
              ' @game_idx= '+str(game_idx))
        if render:
            env.render()
            time.sleep(1 / 60.0)

        ## step!
        action, label = agent.act(obs)
        obs, reward, info = env.step(action)

        discounted_returns[episode_idx] += ((env.gamma**step_idx) * reward)

        ## collect data for training
        if train == True:
            training_data['obss'].append(obs)
            training_data['rewards'].append(reward)
            training_data['labels'].append(label)

        ## close an episode(== a rally)
        if info['end_of_episode']:
            print('episode_idx= '+str(episode_idx)+ \
                  ': ended with G= '+str('%.3f'%discounted_returns[episode_idx]))

            episode_idx += 1
            step_idx = 0

            if info['end_of_game'] or (episode_idx == n_episodes):
                ## train
                if train == True:
                    print('training...')

                    ## finalize training data
                    for k in training_data.keys():
                        training_data[k] = np.vstack(training_data[k])
                    training_data['returns'] = env.compute_returns(
                        training_data['rewards'])

                    ## train!
                    agent.train(training_data)

                    ## reset training data
                    training_data = {'obss': [], 'rewards': [], 'labels': []}

                ## set for the next game
                obs = env.initial_observation()
                game_idx += 1
        else:
            step_idx += 1

    ## closure
    env.close()

    if train == True:
        print('discounted_returns for the last 10 training episodes:')
        print(str(discounted_returns[-10:]))
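
The call to env.compute_returns is not shown in this example. For Pong, where every nonzero reward marks the end of a rally, a common choice is discounted reward-to-go with a reset at each rally boundary; the sketch below illustrates that standard approach and is only an assumption about what the method does:

import numpy as np

def compute_returns(rewards, gamma=0.999):
    """Discounted reward-to-go; the running sum restarts whenever a nonzero
    reward is seen, i.e. at the boundary between Pong rallies."""
    rewards = np.asarray(rewards, dtype=np.float64).ravel()
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        if rewards[t] != 0:
            running = 0.0  # rally boundary: restart the running return
        running = gamma * running + rewards[t]
        returns[t] = running
    return returns.reshape(-1, 1)
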
Example No. 5
def train(train_env, vocab_size, n_iters, log_every=1000, val_envs={}):
    ''' Train on training set, validating on both seen and unseen. '''

    agent = ActorCriticAgent(train_env, vocab_size, "", batch_size,
                             max_episode_len)

    data_log = defaultdict(list)
    start = time.time()
    guide_prob = 0.7
    for idx in range(0, n_iters, log_every):
        interval = min(log_every, n_iters - idx)
        iter = idx + interval

        agent.train(interval, guide_prob)

        train_losses = np.array(agent.losses)
        train_loss_avg = np.average(train_losses)
        data_log['train loss'].append(train_loss_avg)
        loss_str = ''  #'guide prob: %.2f' % guide_prob
        #loss_str += ', train loss: %.4f' % train_loss_avg
        # Run validation
        for env_name, (env, evaluator) in val_envs.items():
            agent.env = env
            agent.results_path = '%s%s_%s_iter_%d.json' % (
                RESULT_DIR, model_prefix, env_name, iter)
            agent.test(0.0)  #guide_prob)

            #val_losses = np.array(agent.losses)
            #val_loss_avg = np.average(val_losses)
            #data_log['%s loss' % env_name].append(val_loss_avg)

            agent.write_results()

            score_summary, _ = evaluator.score(agent.results_path)
            #loss_str += ', %s loss: %.4f' % (env_name, val_loss_avg)
            loss_str += ', %s' % (env_name)
            for metric, val in score_summary.items():
                data_log['%s %s' % (env_name, metric)].append(val)
                if metric in ['success_rate']:
                    loss_str += ' success: %.2f' % (val)

        agent.env = train_env

        print('%s (%d %d%%) %s' % (timeSince(start,
                                             float(iter) / n_iters), iter,
                                   float(iter) / n_iters * 100, loss_str))
        guide_prob -= 0.01
        guide_prob = max(guide_prob, 0.0)
Example No. 6
                gamma=args.gamma,
                batch_size=args.batch_size,
                replay_memory_size=args.replay_memory_size,
                hidden_size=args.hidden_size,
                model_input_size=env.observation_space.shape[0],
                use_PER=args.use_PER,
                use_ICM=args.use_ICM)
     trainQ(a, env, args.MAX_NUMBER_OF_STEPS, args.EPISODES_TO_TRAIN,
            args.START_RENDERING, args.update_frequency)
 else:
     if not args.use_ICM:
         a = ActorCriticAgent(
             continuous=False,
             nb_actions=env.action_space.n,
             learning_rate=args.learning_rate,
             gamma=args.gamma,
             hidden_size=args.hidden_size,
             model_input_size=env.observation_space.shape[0],
             entropy_coeff_start=args.entropy_coefficient_start,
             entropy_coeff_end=args.entropy_coefficient_end,
             entropy_coeff_anneal=args.entropy_anneal)
         trainActor(a, env, args.MAX_NUMBER_OF_STEPS,
                    args.EPISODES_TO_TRAIN, args.START_RENDERING)
     else:
         a = ActorCriticAgentUsingICM(
             continuous=False,
             nb_actions=env.action_space.n,
             learning_rate=args.learning_rate,
             gamma=args.gamma,
             hidden_size=args.hidden_size,
             model_input_size=env.observation_space.shape[0],
             entropy_coeff_start=args.entropy_coefficient_start,
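
The snippet above is truncated at both ends, but it reads a number of args attributes. A minimal argparse setup covering the referenced flags might look like the following sketch (every default value is an illustrative assumption):

import argparse

def build_parser():
    # Flags referenced by the truncated snippet above; defaults are guesses.
    p = argparse.ArgumentParser()
    p.add_argument('--learning_rate', type=float, default=1e-3)
    p.add_argument('--gamma', type=float, default=0.99)
    p.add_argument('--batch_size', type=int, default=64)
    p.add_argument('--replay_memory_size', type=int, default=100000)
    p.add_argument('--hidden_size', type=int, default=128)
    p.add_argument('--use_PER', action='store_true')
    p.add_argument('--use_ICM', action='store_true')
    p.add_argument('--entropy_coefficient_start', type=float, default=0.01)
    p.add_argument('--entropy_coefficient_end', type=float, default=0.001)
    p.add_argument('--entropy_anneal', type=int, default=10000)
    p.add_argument('--MAX_NUMBER_OF_STEPS', type=int, default=1000)
    p.add_argument('--EPISODES_TO_TRAIN', type=int, default=500)
    p.add_argument('--START_RENDERING', type=int, default=0)
    p.add_argument('--update_frequency', type=int, default=4)
    return p
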
Example No. 7
def main():

    #env = gym.make('InvertedPendulum-v1')
    env = gym.make('Pendulum-v0')
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]
    agent = ActorCriticAgent(state_dim, action_dim)
    state = env.reset()
    timestep_limit = env.spec.timestep_limit
    start_time = t.time()
    #timestep_limit = min(env.spec.timestep_limit, 20)   # For checking purposes; make it proper for run
    # Initial data build up
    done_flag = 0
    for i in range(REPLAY_MEMORY):

        if (done_flag == True):
            state = env.reset()

        action = env.action_space.sample()
        next_state, reward, done_flag, info = env.step(action)
        agent.append_memory(state, action, reward, next_state, done_flag)
        state = next_state

    print "Initial memory built!!"

    # Initial Training for a few steps
    for _ in range(5):
        agent.update_network()

    print "Initial network performance = ", policy_evaluation(agent, env, 5)
    # =================================================================================

    print "******** Starting learning process *************"
    num_episodes = 2
    update_freq = 1  # update after how many steps (within each episode)
    print_freq = 1  # how often to print (episodes)

    performance = np.zeros(num_episodes)
    best_ep = 0
    best_agent = copy.deepcopy(agent)

    for ep in range(num_episodes):
        done_flag = 0
        state = env.reset()
        time = 0

        while (done_flag != True and time <= timestep_limit):
            action_pred = np.array(
                agent.actor_net.predict(state.reshape(1, -1)))
            action_pred = action_pred[0]
            next_state, reward, done_flag, _ = env.step(action_pred)
            agent.append_memory(state, action_pred, reward, next_state,
                                done_flag)
            state = next_state

            #print time, timestep_limit

            if (time % update_freq == 0):
                agent.update_network()

            time += 1

            performance[ep] = policy_evaluation(agent, env, 2)

        if (ep % print_freq == 0):
            print "Now in episode: ", ep, " of ", num_episodes
            print "Agent performance = ", performance[ep]

        if (performance[ep] > performance[best_ep]):
            best_agent = copy.deepcopy(agent)
            best_ep = ep

    end_time = t.time()
    print "Total time", (end_time - start_time)
    plt.plot(performance[-100:])
    plt.show()

    inspect_performance(agent, env)

    # Save agent to file
    with open('objs.pickle', 'wb') as f:
        pickle.dump([best_agent, performance], f)
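
The helper policy_evaluation(agent, env, n) is not defined in the snippet. A plausible minimal implementation, assuming it rolls the current deterministic policy out for n episodes and averages the undiscounted return, is:

import numpy as np

def policy_evaluation(agent, env, n_episodes):
    """Average undiscounted return of the current policy over n_episodes."""
    returns = []
    for _ in range(n_episodes):
        state = env.reset()
        done = False
        total = 0.0
        while not done:
            action = np.array(agent.actor_net.predict(state.reshape(1, -1)))[0]
            state, reward, done, _ = env.step(action)
            total += reward
        returns.append(total)
    return np.mean(returns)
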
Example No. 8
def main():
	agent = ActorCriticAgent('agent')
	env = PaintEnv('env')
Example No. 10
import gym
import math
from matplotlib import pyplot as plt
'''
-tensorboard
-checkpointing
-worker frame collecting
-batch ppo
'''

if __name__ == '__main__':

    #   make agent
    # inputShape = (10, 240, 256)
    agent = ActorCriticAgent(alpha=0.001,
                             inputChannels=1,
                             gamma=0.99,
                             numActions=7)

    #   make env
    from nes_py.wrappers import JoypadSpace
    import gym_super_mario_bros
    from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    # env = gym_super_mario_bros.make('SuperMarioBros-2-1-v1')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    scoreHistory = []
    numHiddenEpisodes = -1
    highScore = -math.inf
    recordTimeSteps = math.inf
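
The example stops after initializing the bookkeeping variables. A typical continuation of the if __name__ == '__main__': block would run an episode loop like the sketch below; the agent methods chooseAction and learn do not appear in the snippet, so those names are assumptions:

    # Assumed continuation: chooseAction/learn are illustrative method names.
    for episode in range(1000):
        observation = env.reset()
        done = False
        score = 0
        steps = 0
        while not done:
            if episode > numHiddenEpisodes:
                env.render()
            action = agent.chooseAction(observation)
            nextObservation, reward, done, info = env.step(action)
            agent.learn(observation, reward, nextObservation, done)
            observation = nextObservation
            score += reward
            steps += 1
        scoreHistory.append(score)
        highScore = max(highScore, score)
        recordTimeSteps = min(recordTimeSteps, steps)
        print('episode', episode, 'score', score, 'best', highScore)
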
Example No. 11
def main():
    config = read_config("config.yaml")
    agent_config = config['Agent']
    network_config = agent_config['Network']
    training_config = config['Training']
    files_config = config['Files']
    eval_config = config['Evaluation']

    print('\t\t --------------------------------------------')
    print('\t\t ------  Parameters of the experiment  ------')
    print('\t\t --------------------------------------------\n')

    print('## Agent params')
    print('Agent : ' + agent_config['name'])
    print('Gamma : ', agent_config['gamma'])
    print('')

    print('## Network Params')
    print('Network used : ' + network_config['name'])
    print('Number of filters : ', network_config['n_filters'])
    print('activation function : ' + network_config['activation'])
    print('state embedding size : ', network_config['state_embedding_size'])
    print('')

    print('## Training params')
    print('Number of iterations : ', training_config['n_iter'])
    print('Learning rate : ', network_config['lr'])
    print('Number of games per iteration : ', training_config['n_games'])
    print('Number of workers : ', training_config['n_workers'])
    print('Batch size : ', training_config['batch_size'])
    print('Buffer size : ', training_config['buffer_size'])
    print('')

    print('## Evaluation params')
    print('Number of games per iteration : ', eval_config['n_games'])
    print('Number of workers : ', eval_config['n_workers'])
    print('')

    sleep(2.0)

    # Init files and tensorboard
    model_name = agent_config['name']
    checkpoints_dir = os.path.join(model_name, files_config['checkpoints_dir'])
    tensorboard_log_dir = os.path.join(model_name,
                                       files_config['tensorboard_log_dir'])
    results_log_path = os.path.join(model_name,
                                    files_config['results_log_path'])

    # fix random seed
    if config['Seed'] is None:
        np.random.seed(seed=42)
    else:
        np.random.seed(int(config['Seed']))

    print('\n\n')
    env = Env()

    # if train from scratch
    if training_config["init_checkpoint"] == 0:
        # initialize dir for tensorboard
        flush_or_create(tensorboard_log_dir)
        # initialize dir for checkpoints
        flush_or_create(checkpoints_dir)
        # init agent and network from scratch
        agent = ActorCriticAgent(agent_config, network_config, checkpoints_dir,
                                 tensorboard_log_dir)
        # initialize iteration number
        start = 0

    # else restart training from last checkpoint
    else:
        # checkpoint number to resume from (assumed to be given by init_checkpoint)
        latest_checkpoint = training_config["init_checkpoint"]
        agent = ActorCriticAgent(agent_config,
                                 network_config,
                                 checkpoints_dir,
                                 tensorboard_log_dir,
                                 restore=True)
        print('\nnetwork restored from checkpoint # ', latest_checkpoint)
        print('')
        start = latest_checkpoint

    # initialize the summary writer and results log file
    log_file = open(results_log_path,
                    "wb+")  # open log file to write in during evaluation

    display_every = training_config["display_every"]
    n_games_train = training_config["n_games"]
    n_workers_train = training_config["n_workers"]
    T_update_net = training_config["T_update_net"]
    T_update_target_net = training_config["T_update_target_net"]
    n_games_eval = eval_config["n_games"]
    n_workers_eval = eval_config["n_workers"]
    prefill_buffer = training_config["prefill_buffer"]
    # gamma = agent_config['gamma']

    summary_dict = dict({})
    data_buffer = Buffer(capacity=training_config['buffer_size'])

    logger = logging.getLogger(__name__)

    if prefill_buffer:
        # populate buffer with initial data from random games
        print('\nPopulating Buffer ... \n')
        populate_buffer(agent, n_workers_train, data_buffer)

    print('\n\n')
    print('Starting training\n\n')
    batch_size = training_config['batch_size']
    for it in tqdm(np.arange(start, training_config["n_iter"]),
                   desc="parallel gameplay iterations"):
        # play games to generate data and train the network
        env.reset()
        try:
            agent.train(env, n_games_train, data_buffer, batch_size,
                        n_workers_train, display_every, T_update_net)
        except Exception as error:
            print('\n\n#### AN ERROR OCCURRED WHILE TRAINING ####\n\n')
            agent.net.summary_writer.close()
            agent.net.sess.close()
            log_file.close()
            logger.error(error)
            raise
        agent.net.save_checkpoint(checkpoints_dir, it=it + 1)

        # play games with latest checkpoint and track average final reward
        results = agent.evaluate(env, n_games_eval, n_workers_eval)
        # save results
        pickle.dump(results, log_file)
        print('')

    agent.net.summary_writer.close()
    agent.net.sess.close()
    log_file.close()
    print('End of training')
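
read_config("config.yaml") must return a nested mapping containing at least the keys accessed above. Written out as the equivalent Python dict, with every value an illustrative placeholder rather than the author's setting:

config = {
    'Seed': None,                    # or an integer seed
    'Agent': {
        'name': 'ActorCriticAgent',
        'gamma': 0.99,
        'Network': {
            'name': 'conv_net',
            'n_filters': 64,
            'activation': 'relu',
            'state_embedding_size': 128,
            'lr': 1e-3,
        },
    },
    'Training': {
        'n_iter': 100,
        'n_games': 64,
        'n_workers': 8,
        'batch_size': 256,
        'buffer_size': 100000,
        'init_checkpoint': 0,        # 0 = train from scratch
        'display_every': 10,
        'T_update_net': 1,
        'T_update_target_net': 100,
        'prefill_buffer': True,
    },
    'Evaluation': {
        'n_games': 20,
        'n_workers': 8,
    },
    'Files': {
        'checkpoints_dir': 'checkpoints',
        'tensorboard_log_dir': 'tensorboard',
        'results_log_path': 'results.pkl',
    },
}
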
Example No. 12
import numpy as np

from agent import ActorCriticAgent
from gridworld import GridWorld

env = GridWorld()
agent = ActorCriticAgent(savepath='saved_models', load=False)

for i in np.arange(1, 10001):
    obs, end, reward = env.reset()
    while end == 0:
        # obs = np.zeros((5,5,1))
        # obs = np.concatenate((obs, np.ones_like(obs)*0/200), axis=-1)
        # obs[0,0,0] = 1
        action, predicted_value = agent.act(obs, reward)
        # print(action, predicted_value)
        # quit()
        obs, end, reward = env.step(action)
    obs = np.zeros((5, 5, 2))
    # obs = np.concatenate((obs, np.zeros_like(obs)), axis=-1)
    # obs = np.concatenate((obs, np.ones_like(obs)*0/200), axis=-1)
    obs[1, 0, 0] = 1
    obs[1, 1, 1] = 1
    obs[2, 2, 1] = 1
    action, predicted_value = agent.act(obs, reward=None)
    print(predicted_value)
    # quit()
    agent.episode_end(end, reward)
    print('Total Reward: {}'.format(env.total_reward))