def compute(self, config, budget, working_directory, *args, **kwargs):
            """
            Simple example for a compute function using a feed forward network.
            It is trained on the MNIST dataset.
            The input parameter "config" (dictionary) contains the sampled
            configurations passed by the bohb optimizer
            """
            env = ContinuousCartPoleEnv(reward_function=smooth_reward)
            state_dim = env.observation_space.shape[0]
            # Try to ensure determinism
            ############################
            torch.manual_seed(config['seed'])
            env.seed(config['seed'])
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            ############################
            # conf dictionary to control training
            conf = {'lr':config['lr'], 'bs':64, 'loss':nn.MSELoss(),
                    'hidden_dim':config['hidden_dim'],
                    'mem_size':50000, 'activation':config['activation'],
                    'epsilon':config['epsilon'],
                    'eps_scheduler':'exp', 'n_episodes':budget,
                    'dropout_rate': config['dropout_rate'], 'n_cycles': 1,
                    'decay_rate': config['decay_rate']
                  }
            ############################
            # create dqn object and train it
            dqn = DQN(state_dim, config['action_dim'],
                      gamma=config['gamma'], conf=conf)
            time_steps = 1000
            stats = dqn.train(int(budget), time_steps, env, conf)
            # plot_episode_stats(stats, noshow=True)
            # evaluate the trained policy greedily (epsilon = 0) over 5 episodes
            final_reward = 0
            for _ in range(5):
                s = env.reset()
                for _ in range(time_steps):
                    # env.render()
                    action = dqn.get_action(s, 0.)
                    s, r, d, _ = env.step(dqn.action.act(action))  # map discrete action index to continuous action
                    final_reward += r
                    if d:
                        break
            env.close()
            ############################
            return ({
                     # remember: HpBandSter always minimizes!
                    'loss': - (final_reward / 5),
                    'info': {'max_len_train': max(stats.episode_lengths),
                             'max_reward_train': max(stats.episode_rewards),
                             'avg_final': (final_reward / 5) }
            })
            state_, reward, done, info = env.step(action)
            state = state_
            score += reward

    stats.update_test_stats(num_test_episodes_inc=1, latest_test_score=score)
    stats.print_test_run_stats()

def make_env():
    def _thunk():
        env = ContinuousCartPoleEnv()
        return env
    return _thunk

if __name__ == "__main__":
    STATE_SHAPE = (4,)
    ACTION_SHAPE = (1,)
    
    stats = Stats()
    agent = Agent(STATE_SHAPE, ACTION_SHAPE, stats)
    rollout_collector = RolloutCollector(num_env_workers=8,
                                         make_env_func=make_env,
                                         agent=agent,
                                         batch_size=32,
                                         rollout_length=24,
                                         num_recurrence_steps=4,
                                         state_shape=STATE_SHAPE,
                                         action_shape=ACTION_SHAPE,
                                         stats=stats)

    test_env = ContinuousCartPoleEnv()
    while True:
        rollout_collector.collect_samples()
        rollout_collector.compute_gae()
        agent.learn(rollout_collector)
        rollout_collector.reset()

        play_test_episode(agent, test_env, stats)
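
# For context only: compute_gae() above presumably produces Generalized Advantage
# Estimates inside the RolloutCollector. The helper below is a generic, self-contained
# sketch of that computation, not this project's implementation; names, shapes and the
# default gamma/lambda values are assumptions.
import numpy as np

def generalized_advantage_estimates(rewards, values, dones, last_value,
                                    gamma=0.99, lam=0.95):
    """GAE(lambda) over one rollout: rewards/dones have length T, values are
    V(s_0..s_{T-1}) and last_value is the bootstrap value V(s_T)."""
    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float32)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(T)):
        nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        gae = delta + gamma * lam * nonterminal * gae
        advantages[t] = gae
        next_value = values[t]
    returns = advantages + np.asarray(values, dtype=np.float32)
    return advantages, returns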
import contextlib
from arg_parser import parse
from pathlib import Path
from continuous_cartpole import ContinuousCartPoleEnv
from reinforce_discrete import REINFORCE
from utils import D2C, Visualizer
from utils import reward_laplacian, reward_carrot_stick, reward_no_fast_rotation
from utils import reward_func_map

if __name__ == '__main__':
    print('--- running main ---')
    args = parse()

    # ============ Parameters ============
    reward_func = reward_func_map[args.reward_function]
    env = ContinuousCartPoleEnv(reward_function=reward_func)
    state_dim = env.observation_space.shape[0]
    action_dim = args.action_dim
    episodes = args.episode
    timesteps = args.steps
    hidden_dim = args.hidden_dim
    policy_lr = args.actor_lr
    baseline_lr = args.critic_lr
    exp_count = args.exp_count
    render_flag = args.render
    load_flag = args.load
    # ====================================

    # --- choose algorithm and hyperparameters ---
    d2c_converter = D2C(action_dim, env.action_space.low,
                        env.action_space.high)
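
# D2C above comes from this project's utils module and its implementation is not shown
# here. Purely as an illustration of the idea (an assumption, not the project's code),
# a discrete-to-continuous action mapper for a 1-D action space could look like this:
import numpy as np

class DiscreteToContinuous:
    """Maps a discrete action index in [0, n_actions) onto an evenly spaced grid
    over the continuous action range [low, high]."""

    def __init__(self, n_actions, low, high):
        # low/high may be scalars or length-1 arrays (e.g. env.action_space.low)
        lo = float(np.asarray(low).reshape(-1)[0])
        hi = float(np.asarray(high).reshape(-1)[0])
        self.grid = np.linspace(lo, hi, n_actions)

    def act(self, index):
        # return a 1-D array, as gym-style continuous envs expect
        return np.array([self.grid[index]], dtype=np.float32)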
Example #5
import gym
import sys

import numpy as np
import matplotlib.pyplot as plt

from continuous_cartpole import ContinuousCartPoleEnv

# Create the Cart-Pole game environment
env = ContinuousCartPoleEnv()

rewards_list = []
steps_list = []
num_episodes = 5
episodes_list = np.arange(1, num_episodes + 1)

# Number of episodes
for i_episode in range(num_episodes):
    print("")
    print("========= EPISODE %d =========" % (i_episode + 1))
    observation = env.reset()
    total_reward = 0

    # Number of time-steps
    for t in range(100):
        env.render()
        action = env.action_space.sample()  # Take random action
        observation, reward, done, info = env.step(action)
        total_reward += reward
        '''
        print("----------- Begin time-step %d ----------" % (t))
Example #6
def _thunk():
    # env = gym.make(ENV_NAME)
    env = ContinuousCartPoleEnv()
    return env
Example #7
def train_agent(agent,
                desc='Agent1',
                file_name='agent1',
                runs=5,
                episodes=5000,
                time_steps=300,
                test_episodes=10,
                init_state=None,
                init_noise=None,
                model_dir='../save/models',
                data_dir='../save/stats',
                plt_dir='../save/plots',
                show=False):

    print_header(1, desc)

    run_train_stats = []
    run_test_stats = []

    for run in range(runs):
        print_header(2, 'RUN {}'.format(run + 1))
        print_header(3, 'Training')

        # Training
        env = ContinuousCartPoleEnv(reward_function=agent.reward_fun)

        # Clear weights
        agent.reset_parameters()

        # Train agent...
        stats = agent.train(env,
                            episodes,
                            time_steps,
                            initial_state=init_state,
                            initial_noise=init_noise)
        # ... and append statistics to list
        run_train_stats.append(stats)

        # Save agent checkpoint
        exp_model_dir = model_dir + '/' + file_name
        mkdir(exp_model_dir)
        with open(
                '{}/model_{}_run_{}_{}.pkl'.format(exp_model_dir,
                                                   file_name, run + 1,
                                                   timestamp()), 'wb') as f:
            pickle.dump(agent, f)

        # Run (deterministic) tests on the trained agent and save the statistics
        test_stats = test_agent(env,
                                agent,
                                run=run + 1,
                                episodes=test_episodes,
                                time_steps=time_steps,
                                initial_state=init_state,
                                initial_noise=init_noise,
                                render=show)
        run_test_stats.append(test_stats)

    # Concatenate stats for all runs ...
    train_rewards = []
    train_lengths = []
    train_losses = []
    test_rewards = []
    test_lengths = []

    for r in range(runs):
        train_rewards.append(run_train_stats[r].episode_rewards)
        train_lengths.append(run_train_stats[r].episode_lengths)
        train_losses.append(run_train_stats[r].episode_loss)
        test_rewards.append(run_test_stats[r].episode_rewards)
        test_lengths.append(run_test_stats[r].episode_lengths)

    train_rewards = np.array(train_rewards)
    train_lengths = np.array(train_lengths)
    train_losses = np.array(train_losses)
    test_rewards = np.array(test_rewards)
    test_lengths = np.array(test_lengths)

    # ... and store them in a dictionary
    plot_stats = [{
        'run': 'train',
        'stats': {
            'rewards': train_rewards,
            'lengths': train_lengths,
            'losses': train_losses
        }
    }, {
        'run': 'test',
        'stats': {
            'rewards': test_rewards,
            'lengths': test_lengths
        }
    }]

    # ... and print their aggregate values
    print_header(1, 'Aggregate Stats')
    print_agg_stats(plot_stats)

    # Save Statistics
    exp_stats_dir = data_dir + '/' + file_name
    mkdir(exp_stats_dir)
    with open(
            '{}/stats_{}_{}.pkl'.format(exp_stats_dir, file_name, timestamp()),
            'wb') as f:
        pickle.dump(plot_stats, f)

    # Plot Statistics
    plot_run_stats(plot_stats, path=plt_dir, experiment=file_name, show=show)
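
# A hypothetical invocation of train_agent; the argument values below are illustrative
# only, and `agent` is assumed to be one of this project's agent objects (it must expose
# reward_fun, reset_parameters() and train(), and be accepted by test_agent()):
#
#     train_agent(agent,
#                 desc='DDPG, smooth reward',
#                 file_name='ddpg_smooth',
#                 runs=3,
#                 episodes=2000,
#                 time_steps=300,
#                 test_episodes=10,
#                 show=False)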
Example #8
parser.add_argument('--smw',
                    action='store',
                    default=10,
                    help='Smoothing window.',
                    type=int)

args = parser.parse_args()

initial_state = initial_states[args.inist]
initial_noise = initial_noises[args.inirnd]

with open(args.file, 'rb') as f:
    agent = pickle.load(f)

reward_function = agent.reward_fun
env = ContinuousCartPoleEnv(reward_function=reward_function)

stats = test_agent(env,
                   agent,
                   episodes=args.ep,
                   time_steps=args.ts,
                   initial_state=initial_state,
                   initial_noise=initial_noise,
                   render=True,
                   deterministic=not args.stoc)

plt_stats = [{
    'run': 'test',
    'stats': {
        'rewards': stats.episode_rewards.reshape([1, args.ep]),
        'lengths': stats.episode_lengths.reshape([1, args.ep])
Example #9
        ''' update based on new policy of old states '''
        # re-evaluate the stored states with the current (non-target) actor
        self.critic.eval()
        retrospective_actions = self.choose_action(states, target=False)
        self.actor.train()
        retrospective_values = self.critic(states, retrospective_actions)
        # actor loss: maximize the critic's value of the actor's actions
        # (gradient ascent via the negated mean Q-value)
        actor_loss = torch.mean(-retrospective_values)

        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        # update the target network parameters
        self.update_params()


if __name__ == '__main__':
    env = ContinuousCartPoleEnv()
    agent = Agent(learn_rate=0.001,
                  state_shape=(4, ),
                  num_actions=1,
                  batch_size=64,
                  layers=(256, 128))

    high_score = -math.inf
    episode = 0
    num_samples = 0
    while True:
        done = False
        state = env.reset()

        score, frame = 0, 1
        while not done:
Example #10
######## Hyperparameters #########
max_nb_episodes = 1000
T = 1024  # time steps collected before each policy update
N = 1
update_time = N * T
K_epochs = 25
batch_size = 32
eps_clip = 0.1  # PPO clipping range: limits how far the policy can move per update
gamma = 0.99
lr = 0.00025
betas = (0.9, 0.99)
action_std = 0.25
max_length_episode = 650
render = False
######## environment #########
env = ContinuousCartPoleEnv(reward_function=reward)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# torch.seed()
# env.seed()
# np.random.seed()

######## Cuda ##########
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

####### initialization ########

running_reward = 0
avg_length = 0
avg_running_reward = 0
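
# The excerpt ends before the PPO update that consumes eps_clip, gamma and K_epochs.
# As a generic reminder of what eps_clip bounds (a standard PPO-Clip sketch, not this
# example's actual update code):
import torch

def ppo_clipped_policy_loss(new_logprobs, old_logprobs, advantages, eps_clip=0.1):
    """Clipped surrogate objective: probability ratios outside [1-eps, 1+eps] earn no
    extra credit, which limits how far each update can move the policy."""
    ratios = torch.exp(new_logprobs - old_logprobs.detach())
    surr1 = ratios * advantages
    surr2 = torch.clamp(ratios, 1.0 - eps_clip, 1.0 + eps_clip) * advantages
    return -torch.min(surr1, surr2).mean()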
Example #11
def train(args):
    env = ContinuousCartPoleEnv()
    STATE_SIZE = 4
    ACTION_SPACE_SIZE = 1

    actor = LunarLanderActor(state_size=STATE_SIZE,
                             num_actions=ACTION_SPACE_SIZE)
    critic = Critic(state_size=STATE_SIZE)
    agent = Agent(env,
                  actor_lr=args["ACTOR_LEARNING_RATE"],
                  critic_lr=args["CRITIC_LEARNING_RATE"],
                  actor_model=actor,
                  critic_model=critic,
                  device=args["DEVICE"],
                  gamma=args["GAMMA"])

    stats = {"episode_reward": deque([]), "del_ts": deque([])}

    if args["LOAD_PREVIOUS"]:
        print("Loading previously trained model")
        agent.load()

    for i in range(args["NUM_EPISODES"]):
        print("Starting episode", i)
        total = 0

        agent.start_episode()
        state = env.reset()

        num_step = 0
        done = False
        oup_noise = np.zeros(ACTION_SPACE_SIZE)
        while not done:
            action = agent.get_action(state)

            # Exploration strategy
            gauss_noise = np.random.normal(0,
                                           args["exploration_stddev"],
                                           size=ACTION_SPACE_SIZE)
            oup_noise = gauss_noise + args["KAPPA"] * oup_noise
            target_action = torch.clamp(action + torch.Tensor(oup_noise),
                                        min=-1,
                                        max=1)

            new_state, reward, done, info = env.step(
                target_action.detach().numpy())
            transition = Transition(reward=reward,
                                    state=state,
                                    action=action,
                                    target_action=target_action,
                                    next_state=new_state)
            agent.step(transition)

            if (num_step % args["PRINT_EVERY"] == 0):
                print("\tStep", num_step, "for episode", i)
                print("\t", action, target_action)
                print("\tReward accumulated:", total)

            assert (type(target_action) == torch.Tensor)
            assert (target_action.requires_grad)
            assert (action.requires_grad)

            total += reward
            state = new_state
            num_step += 1

        # Learn from this episode
        agent.learn()

        if args["RENDER_ENV"]:
            env.render()

        if i % 1 == 0:  # checkpoint and record stats every episode
            agent.save()
            stats["episode_reward"].append(total)

            transitions, del_ts = agent.get_episode_stats()
            stats["del_ts"].extend(del_ts)

            print("Reward is ", total, "and average reward is",
                  total / num_step)

    return stats