def compute(self, config, budget, working_directory, *args, **kwargs):
    """
    Compute function for the BOHB worker.

    Trains a DQN agent on the ContinuousCartPole environment.
    The input parameter "config" (dictionary) contains the sampled
    configuration passed by the BOHB optimizer.
    """
    env = ContinuousCartPoleEnv(reward_function=smooth_reward)
    state_dim = env.observation_space.shape[0]

    # Try to ensure determinism
    ############################
    torch.manual_seed(config['seed'])
    env.seed(config['seed'])
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    ############################

    # conf dictionary to control training
    conf = {'lr': config['lr'],
            'bs': 64,
            'loss': nn.MSELoss(),
            'hidden_dim': config['hidden_dim'],
            'mem_size': 50000,
            'activation': config['activation'],
            'epsilon': config['epsilon'],
            'eps_scheduler': 'exp',
            'n_episodes': budget,
            'dropout_rate': config['dropout_rate'],
            'n_cycles': 1,
            'decay_rate': config['decay_rate']}
    ############################

    # create DQN object and train it
    dqn = DQN(state_dim, config['action_dim'], gamma=config['gamma'], conf=conf)
    time_steps = 1000
    stats = dqn.train(int(budget), time_steps, env, conf)
    # plot_episode_stats(stats, noshow=True)

    # evaluate the greedy policy over five episodes
    final_reward = 0
    for _ in range(5):
        s = env.reset()
        for _ in range(time_steps):
            # env.render()
            action = dqn.get_action(s, 0.)
            s, r, d, _ = env.step(dqn.action.act(action))
            final_reward += r
            if d:
                break
    env.close()
    ############################

    return ({
        # remember: HpBandSter always minimizes!
        'loss': -(final_reward / 5),
        'info': {'max_len_train': max(stats.episode_lengths),
                 'max_reward_train': max(stats.episode_rewards),
                 'avg_final': (final_reward / 5)}
    })
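# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): how a compute() with the
# signature above is typically driven by hpbandster's BOHB. "DQNWorker" is an
# assumed Worker subclass holding the compute() method, and the hyperparameter
# ranges below are placeholders; the real search space would also need the
# remaining keys read in compute() (seed, activation, epsilon, dropout_rate,
# decay_rate, action_dim, ...).
# ---------------------------------------------------------------------------
import ConfigSpace as CS
import hpbandster.core.nameserver as hpns
from hpbandster.optimizers import BOHB

# Hypothetical search space covering a few of the keys read in compute()
cs = CS.ConfigurationSpace()
cs.add_hyperparameter(CS.UniformFloatHyperparameter('lr', 1e-5, 1e-2, log=True))
cs.add_hyperparameter(CS.UniformIntegerHyperparameter('hidden_dim', 32, 256))
cs.add_hyperparameter(CS.UniformFloatHyperparameter('gamma', 0.9, 0.999))

ns = hpns.NameServer(run_id='dqn_bohb', host='127.0.0.1', port=None)
ns.start()

worker = DQNWorker(nameserver='127.0.0.1', run_id='dqn_bohb')  # assumed subclass
worker.run(background=True)

# budgets correspond to the number of training episodes passed to compute()
bohb = BOHB(configspace=cs, run_id='dqn_bohb', nameserver='127.0.0.1',
            min_budget=50, max_budget=500)
result = bohb.run(n_iterations=10)

bohb.shutdown(shutdown_workers=True)
ns.shutdown()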
def _thunk():
    env = ContinuousCartPoleEnv()
    return env
        state_, reward, done, info = env.step(action)
        state = state_
        score += reward

    stats.update_test_stats(num_test_episodes_inc=1, latest_test_score=score)
    stats.print_test_run_stats()


def make_env():
    def _thunk():
        env = ContinuousCartPoleEnv()
        return env
    return _thunk


if __name__ == "__main__":
    STATE_SHAPE = (4,)
    ACTION_SHAPE = (1,)

    stats = Stats()
    agent = Agent(STATE_SHAPE, ACTION_SHAPE, stats)
    rollout_collector = RolloutCollector(num_env_workers=8,
                                         make_env_func=make_env,
                                         agent=agent,
                                         batch_size=32,
                                         rollout_length=24,
                                         num_recurrence_steps=4,
                                         state_shape=STATE_SHAPE,
                                         action_shape=ACTION_SHAPE,
                                         stats=stats)
    test_env = ContinuousCartPoleEnv()

    while True:
        rollout_collector.collect_samples()
        rollout_collector.compute_gae()
        agent.learn(rollout_collector)
        rollout_collector.reset()
        play_test_episode(agent, test_env, stats)
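# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the training loop above
# alternates rollout collection, GAE computation, and learning. For reference,
# a minimal generalized advantage estimation over a single rollout could look
# like the function below; gamma/lam values and array names are assumptions,
# not taken from RolloutCollector.compute_gae().
# ---------------------------------------------------------------------------
import numpy as np


def compute_gae(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
    """Generalized advantage estimation over one rollout (illustrative only)."""
    advantages = np.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        next_value = last_value if t == len(rewards) - 1 else values[t + 1]
        next_non_terminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * next_non_terminal - values[t]
        gae = delta + gamma * lam * next_non_terminal * gae
        advantages[t] = gae
    returns = advantages + values
    return advantages, returns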
import contextlib
from arg_parser import parse
from pathlib import Path
from continuous_cartpole import ContinuousCartPoleEnv
from reinforce_discrete import REINFORCE
from utils import D2C, Visualizer
from utils import reward_laplacian, reward_carrot_stick, reward_no_fast_rotation
from utils import reward_func_map


if __name__ == '__main__':
    print('--- running main ---')
    args = parse()

    # ============ Parameters ============
    reward_func = reward_func_map[args.reward_function]
    env = ContinuousCartPoleEnv(reward_function=reward_func)
    state_dim = env.observation_space.shape[0]
    action_dim = args.action_dim
    episodes = args.episode
    timesteps = args.steps
    hidden_dim = args.hidden_dim
    policy_lr = args.actor_lr
    baseline_lr = args.critic_lr
    exp_count = args.exp_count
    render_flag = args.render
    load_flag = args.load
    # ====================================

    # --- choose algorithm and hyperparameters ---
    d2c_converter = D2C(action_dim, env.action_space.low, env.action_space.high)
import gym
import sys
import numpy as np
import matplotlib.pyplot as plt

from continuous_cartpole import ContinuousCartPoleEnv

# Create the Cart-Pole game environment
env = ContinuousCartPoleEnv()

rewards_list = []
steps_list = []
num_episodes = 5
episodes_list = np.arange(1, num_episodes + 1)

# Number of episodes
for i_episode in range(num_episodes):
    print("")
    print("========= EPISODE %d =========" % (i_episode + 1))
    observation = env.reset()
    total_reward = 0

    # Number of time-steps
    for t in range(100):
        env.render()
        action = env.action_space.sample()  # Take random action
        observation, reward, done, info = env.step(action)
        total_reward += reward
        '''
        print("----------- Begin time-step %d ----------" % (t))
def _thunk():
    # env = gym.make(ENV_NAME)
    env = ContinuousCartPoleEnv()
    return env
def train_agent(agent, desc='Agent1', file_name='agent1', runs=5, episodes=5000,
                time_steps=300, test_episodes=10, init_state=None, init_noise=None,
                model_dir='../save/models', data_dir='../save/stats',
                plt_dir='../save/plots', show=False):
    print_header(1, desc)

    run_train_stats = []
    run_test_stats = []

    for run in range(runs):
        print_header(2, 'RUN {}'.format(run + 1))
        print_header(3, 'Training')

        # Training
        env = ContinuousCartPoleEnv(reward_function=agent.reward_fun)

        # Clear weights
        agent.reset_parameters()

        # Train agent...
        stats = agent.train(env, episodes, time_steps,
                            initial_state=init_state, initial_noise=init_noise)

        # ... and append statistics to list
        run_train_stats.append(stats)

        # Save agent checkpoint
        exp_model_dir = model_dir + '/' + file_name
        mkdir(exp_model_dir)
        with open('{}/model_{}_run_{}_{}.pkl'.format(exp_model_dir, file_name,
                                                     run + 1, timestamp()), 'wb') as f:
            pickle.dump(agent, f)

        # Run (deterministic) tests on the trained agent and save the statistics
        test_stats = test_agent(env, agent, run=run + 1, episodes=test_episodes,
                                time_steps=time_steps, initial_state=init_state,
                                initial_noise=init_noise, render=show)
        run_test_stats.append(test_stats)

    # Concatenate stats for all runs ...
    train_rewards = []
    train_lengths = []
    train_losses = []
    test_rewards = []
    test_lengths = []

    for r in range(runs):
        train_rewards.append(run_train_stats[r].episode_rewards)
        train_lengths.append(run_train_stats[r].episode_lengths)
        train_losses.append(run_train_stats[r].episode_loss)
        test_rewards.append(run_test_stats[r].episode_rewards)
        test_lengths.append(run_test_stats[r].episode_lengths)

    train_rewards = np.array(train_rewards)
    train_lengths = np.array(train_lengths)
    train_losses = np.array(train_losses)
    test_rewards = np.array(test_rewards)
    test_lengths = np.array(test_lengths)

    # ... and store them in a dictionary
    plot_stats = [{'run': 'train',
                   'stats': {'rewards': train_rewards,
                             'lengths': train_lengths,
                             'losses': train_losses}},
                  {'run': 'test',
                   'stats': {'rewards': test_rewards,
                             'lengths': test_lengths}}]

    # ... and print their aggregate values
    print_header(1, 'Aggregate Stats')
    print_agg_stats(plot_stats)

    # Save Statistics
    exp_stats_dir = data_dir + '/' + file_name
    mkdir(exp_stats_dir)
    with open('{}/stats_{}_{}.pkl'.format(exp_stats_dir, file_name, timestamp()), 'wb') as f:
        pickle.dump(plot_stats, f)

    # Plot Statistics
    plot_run_stats(plot_stats, path=plt_dir, experiment=file_name, show=show)
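# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original file): how a caller might
# invoke train_agent(). The Agent constructor signature and the hyperparameter
# values are assumptions; reward_laplacian is one of the reward functions the
# repository imports elsewhere.
# ---------------------------------------------------------------------------
agent = Agent(state_dim=4, action_dim=1, reward_fun=reward_laplacian)  # assumed signature

train_agent(agent,
            desc='Laplacian reward agent',
            file_name='agent_laplacian',
            runs=3,
            episodes=2000,
            time_steps=300,
            test_episodes=10,
            show=False)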
parser.add_argument('--smw', action='store', default=10,
                    help='Smoothing window.', type=int)

args = parser.parse_args()

initial_state = initial_states[args.inist]
initial_noise = initial_noises[args.inirnd]

with open(args.file, 'rb') as f:
    agent = pickle.load(f)

reward_function = agent.reward_fun
env = ContinuousCartPoleEnv(reward_function=reward_function)

stats = test_agent(env, agent, episodes=args.ep, time_steps=args.ts,
                   initial_state=initial_state, initial_noise=initial_noise,
                   render=True, deterministic=not args.stoc)

plt_stats = [{'run': 'test',
              'stats': {'rewards': stats.episode_rewards.reshape([1, args.ep]),
                        'lengths': stats.episode_lengths.reshape([1, args.ep])
        ''' update based on new policy of old states '''
        self.critic.eval()
        retrospective_actions = self.choose_action(states, target=False)
        self.actor.train()
        retrospective_values = self.critic(states, retrospective_actions)

        actor_loss = torch.mean(-retrospective_values)
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_params()


if __name__ == '__main__':
    env = ContinuousCartPoleEnv()
    agent = Agent(learn_rate=0.001, state_shape=(4,), num_actions=1,
                  batch_size=64, layers=(256, 128))

    high_score = -math.inf
    episode = 0
    num_samples = 0

    while True:
        done = False
        state = env.reset()
        score, frame = 0, 1

        while not done:
######## Hyperparameters #########
max_nb_episodes = 1000
T = 1024
N = 1
update_time = N * T
K_epochs = 25
batch_size = 32
eps_clip = 0.1  # to encourage policy change
gamma = 0.99
lr = 0.00025
betas = (0.9, 0.99)
action_std = 0.25
max_length_episode = 650
render = False

######## environment #########
env = ContinuousCartPoleEnv(reward)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# torch.seed()
# env.seed()
# np.random.seed()

######## Cuda ##########
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

######## initialization ########
running_reward = 0
avg_length = 0
avg_running_reward = 0
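# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): eps_clip and K_epochs
# enter PPO's clipped surrogate objective, which is optimized for K_epochs
# passes over each batch of rollouts. A minimal PyTorch version of that loss,
# assuming precomputed log-probabilities and advantages (the tensor names are
# placeholders, not taken from the repository):
# ---------------------------------------------------------------------------
import torch


def ppo_clipped_loss(new_log_probs, old_log_probs, advantages, eps_clip=0.1):
    """Clipped surrogate objective; all tensors are assumed to share one shape."""
    ratios = torch.exp(new_log_probs - old_log_probs.detach())
    surr1 = ratios * advantages
    surr2 = torch.clamp(ratios, 1.0 - eps_clip, 1.0 + eps_clip) * advantages
    # PPO maximizes the surrogate, so the loss is its negation
    return -torch.min(surr1, surr2).mean()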
def train(args):
    env = ContinuousCartPoleEnv()
    STATE_SIZE = 4
    ACTION_SPACE_SIZE = 1

    actor = LunarLanderActor(state_size=STATE_SIZE, num_actions=ACTION_SPACE_SIZE)
    critic = Critic(state_size=STATE_SIZE)
    agent = Agent(env,
                  actor_lr=args["ACTOR_LEARNING_RATE"],
                  critic_lr=args["CRITIC_LEARNING_RATE"],
                  actor_model=actor,
                  critic_model=critic,
                  device=args["DEVICE"],
                  gamma=args["GAMMA"])

    stats = {"episode_reward": deque([]), "del_ts": deque([])}

    if args["LOAD_PREVIOUS"]:
        print("Loading previously trained model")
        agent.load()

    for i in range(args["NUM_EPISODES"]):
        print("Starting episode", i)
        total = 0
        agent.start_episode()
        state = env.reset()
        num_step = 0
        done = False
        oup_noise = np.zeros(ACTION_SPACE_SIZE)

        while not done:
            action = agent.get_action(state)

            # Exploration strategy
            gauss_noise = np.random.normal(0, args["exploration_stddev"],
                                           size=ACTION_SPACE_SIZE)
            oup_noise = gauss_noise + args["KAPPA"] * oup_noise
            target_action = torch.clamp(action + torch.Tensor(oup_noise),
                                        min=-1, max=1)

            new_state, reward, done, info = env.step(target_action.detach().numpy())
            transition = Transition(reward=reward,
                                    state=state,
                                    action=action,
                                    target_action=target_action,
                                    next_state=new_state)
            agent.step(transition)

            if num_step % args["PRINT_EVERY"] == 0:
                print("\tStep", num_step, "for episode", i)
                print("\t", action, target_action)
                print("\tReward accumulated:", total)

            assert type(target_action) == torch.Tensor
            assert target_action.requires_grad
            assert action.requires_grad

            total += reward
            state = new_state
            num_step += 1

        # Learn from this episode
        agent.learn()

        if args["RENDER_ENV"]:
            env.render()
        if i % 1 == 0:
            agent.save()

        stats["episode_reward"].append(total)
        transitions, del_ts = agent.get_episode_stats()
        stats["del_ts"].extend(del_ts)
        print("Reward is ", total, "and average reward is", total / num_step)

    return stats
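# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the exploration noise
# above is an AR(1) process (a Gaussian increment plus KAPPA times the previous
# value), a simplified form of the Ornstein-Uhlenbeck noise often paired with
# deterministic-policy agents. For comparison, a standard discretized OU
# process; mu, theta, sigma, and dt are illustrative values, not repository
# parameters.
# ---------------------------------------------------------------------------
import numpy as np


class OUNoise:
    """Discretized Ornstein-Uhlenbeck process for action exploration (sketch)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.x = np.full(size, mu)

    def sample(self):
        # mean-reverting drift toward mu plus scaled Gaussian diffusion
        dx = self.theta * (self.mu - self.x) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.x.shape)
        self.x = self.x + dx
        return self.x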