def init_agent_env(self, proc_id, role, role_id):
    env = cartPole.CartPoleEnv()
    # env = flappyBird.FlappyBirdEnv()
    NUM_STATE_FEATURES = env.get_num_state_features()
    NUM_ACTIONS = env.get_num_actions()
    PRINT_EVERY_EPISODE = 20
    LEARNING_RATE = 0.003
    REWARD_DISCOUNT = 0.99
    COEF_VALUE = 1
    COEF_ENTROPY = 0
    agent = A2C.Agent((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT,
                      LEARNING_RATE, COEF_VALUE, COEF_ENTROPY)
    return agent, env
def test_ddpg(args):
    if not args.actor_model:
        print('ERROR: Need trained model folder.')
        return

    env = gym.make(args.env)
    env.reset(args)
    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape
    action_lim = env.action_space.high
    other_cars = args.cars > 1
    agent = A2C(state_dim, action_dim, action_lim,
                update_type=args.update, batch_size=args.batch_size,
                other_cars=other_cars, ego_dim=args.ego_dim)
    agent.load_actor(args.actor_model)
    evaluate(agent, env, args, None, render_episode=True, log=False)
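# The evaluate() helper called by test_ddpg() and train_ddpg() is defined
# elsewhere in the repo. For reference, here is a minimal sketch of a
# compatible implementation. Everything in it is an assumption, not the
# repo's actual API: agent.eval() mode switching, the args.eval_episodes
# flag, and the greedy-rollout structure are guesses.
import numpy as np

def evaluate_sketch(agent, env, args, logfile, render_episode=False, log=True):
    agent.eval()  # assumed: put the networks into evaluation mode
    total_reward = 0.0
    for _ in range(args.eval_episodes):  # assumed flag
        state = env.reset()
        for _ in range(args.max_steps):
            # Act greedily: no exploration noise during evaluation.
            action = agent.select_action(np.array(state))
            action = np.clip(action, env.action_space.low, env.action_space.high)
            state, reward, terminal, _ = env.step(action)
            total_reward += reward
            if render_episode:
                env.render()
            if terminal:
                break
    avg_reward = total_reward / args.eval_episodes
    print('EVAL | average reward = %f' % avg_reward)
    if log and logfile is not None:
        logfile.write('EVAL: %f\n' % avg_reward)
        logfile.flush()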
def train_ddpg(args):
    timestr = time.strftime("%Y%m%d-%H%M%S")
    # Tag the log directory with the scenario type (lane-change vs. car-follow).
    change = args.change * "-change" + (not args.change) * "-follow"
    savedir = args.logdir + 'julia-sim/' + args.env.lower() + change + '/' + timestr + '/'
    if not os.path.exists(savedir):
        os.makedirs(savedir)

    env = gym.make(args.env)
    env.reset(args)
    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape
    action_lim = env.action_space.high
    other_cars = args.cars > 1
    agent = A2C(state_dim, action_dim, action_lim,
                update_type=args.update, batch_size=args.batch_size,
                other_cars=other_cars, ego_dim=args.ego_dim)

    ep_start = 1
    if args.resume_train:
        if not args.actor_model:
            print('ERROR: Need trained model folder.')
            return
        agent.load_all(args.actor_model)
        # Recover the episode counter from the checkpoint filename.
        ep_start = int(args.actor_model.split('_')[-1].split('.')[0][2:])
    agent.train()

    if args.seed:
        print("Random Seed: {}".format(args.random_seed))
        env.seed(args.random_seed)
        torch.manual_seed(args.random_seed)
        np.random.seed(args.random_seed)

    avg_reward = 0.0
    logfile = open(savedir + 'log.txt', 'w+')

    def run_updates():
        # Seed the loss accumulators with a random sentinel so that "no update
        # returned a loss" can be told apart from a genuine zero loss.
        junk = np.random.normal(np.random.randint(-10, 10),
                                np.random.random() + 5.0)
        tot_actor_loss = junk
        tot_critic_loss = junk
        for _ in range(args.update_batches):
            actor_loss, critic_loss = agent.update(target_noise=False)
            if (actor_loss is not None) and (critic_loss is not None):
                tot_actor_loss += actor_loss
                tot_critic_loss += critic_loss
        if (tot_actor_loss != junk) and (tot_critic_loss != junk):
            tot_actor_loss = (tot_actor_loss - junk) / args.update_batches
            tot_critic_loss = (tot_critic_loss - junk) / args.update_batches
            logfile.write('LOSS: %d,%f,%f\n' %
                          (episode, tot_actor_loss, tot_critic_loss))
            logfile.flush()

    for episode in range(ep_start, args.episodes + 1):
        ep_reward = 0.0
        state = env.reset()
        agent.reset_noise()
        for t in range(1, args.max_steps + 1):
            action = agent.select_action(np.array(state))
            action = np.clip(action, env.action_space.low, env.action_space.high)
            next_state, reward, terminal, debug = env.step(action)
            if args.debug:
                s_f, t_f, phi_f, v_ego = debug[:4]
                print("(s, t, phi, v) = (%3.2f, %3.2f, %3.2f, %3.2f)" %
                      (s_f, t_f, phi_f, v_ego))
                logfile.write(
                    "(Episode, Step): (%d, %d) | (s, t, phi, v) = (%3.2f, %3.2f, %3.2f, %3.2f)\n"
                    % (episode, t, s_f, t_f, phi_f, v_ego))
                logfile.flush()
            agent.append(state, action, reward, next_state, float(terminal))
            state = next_state
            ep_reward += reward
            avg_reward += reward

            if args.update_always:
                run_updates()
            if terminal or t == args.max_steps:
                run_updates()
                break

        logfile.write('%d,%f\n' % (episode, ep_reward))
        logfile.flush()
        if (episode % args.save_every) == 0:
            agent.save(savedir, episode, previous=episode - args.save_every)
        if (episode % args.eval_every) == 0:
            avg_reward /= args.eval_every
            ep_start = episode - args.eval_every + 1
            print('Episodes %d - %d | Average reward = %f' %
                  (ep_start, episode, avg_reward))
            avg_reward = 0.0
            # print("Evaluating!")
            evaluate(agent, env, args, logfile)
            agent.train()

    print("Testing!")
    evaluate(agent, env, args, logfile)
    logfile.close()
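# For reference, a hypothetical argument parser covering every args.* field
# that test_ddpg() and train_ddpg() read above. The flag names mirror those
# attribute accesses; the defaults and inline notes are guesses, not the
# repo's actual CLI.
import argparse

def build_parser_sketch():
    p = argparse.ArgumentParser()
    p.add_argument('--env', default='')                        # gym environment id
    p.add_argument('--logdir', default='logs/')                # root of savedir
    p.add_argument('--actor-model', dest='actor_model', default='')
    p.add_argument('--resume-train', dest='resume_train', action='store_true')
    p.add_argument('--change', action='store_true')            # lane-change vs. car-follow
    p.add_argument('--cars', type=int, default=1)              # >1 enables other_cars
    p.add_argument('--ego-dim', dest='ego_dim', type=int, default=4)
    p.add_argument('--update', default='soft')                 # update_type for A2C
    p.add_argument('--batch-size', dest='batch_size', type=int, default=64)
    p.add_argument('--episodes', type=int, default=1000)
    p.add_argument('--max-steps', dest='max_steps', type=int, default=500)
    p.add_argument('--update-always', dest='update_always', action='store_true')
    p.add_argument('--update-batches', dest='update_batches', type=int, default=10)
    p.add_argument('--save-every', dest='save_every', type=int, default=50)
    p.add_argument('--eval-every', dest='eval_every', type=int, default=20)
    p.add_argument('--seed', action='store_true')              # enable seeding
    p.add_argument('--random-seed', dest='random_seed', type=int, default=0)
    p.add_argument('--debug', action='store_true')             # per-step (s, t, phi, v) logging
    return p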
env = cartPole.CartPoleEnv()
NUM_STATE_FEATURES = env.get_num_state_features()
NUM_ACTIONS = env.get_num_actions()
EPISODE_NUM = 2000
PRINT_EVERY_EPISODE = 20
LEARNING_RATE = 0.003
REWARD_DISCOUNT = 0.99
exp_stg = EPSG.EpsilonGreedy(0.2, NUM_ACTIONS)

# agent = Agent((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT, LEARNING_RATE, exp_stg)
agent_params = ((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT,
                LEARNING_RATE, exp_stg)
init_local_agent_funct = lambda: A2C.Agent(
    (NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT, LEARNING_RATE, exp_stg)
init_local_env_funct = lambda: cartPole.CartPoleEnv()
master = Parallel.Master(EPISODE_NUM, init_local_agent_funct,
                         init_local_env_funct, -1)
master.start_workers()

state = env.reset()
accum_reward = 0
accum_loss = 0
# tqdm progress bar
bar = []
# Reward & loss history
r_his = []
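# EPSG.EpsilonGreedy is the exploration strategy passed into the agents above
# and below. A minimal sketch of the behavior its constructor arguments
# (epsilon=0.2, NUM_ACTIONS) suggest; the select_action() name and signature
# are assumptions, not the repo's actual API.
import numpy as np

class EpsilonGreedySketch:
    def __init__(self, epsilon, num_actions):
        self.epsilon = epsilon
        self.num_actions = num_actions

    def select_action(self, action_values):
        # With probability epsilon, explore with a uniformly random action;
        # otherwise exploit the current action-value (or probability) estimates.
        if np.random.random() < self.epsilon:
            return np.random.randint(self.num_actions)
        return int(np.argmax(action_values))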
n_step = 10
use_cuda = False
is_render = True
save_model = False
###########################################

# Per-worker n-step transition buffers.
buffer_state = [[] for _ in range(num_worker)]
buffer_action = [[] for _ in range(num_worker)]
buffer_reward = [[] for _ in range(num_worker)]
buffer_next_state = [[] for _ in range(num_worker)]

model = A2C(s_dim, a_dim, num_worker, gamma=0.95,
            epsilon_start=1.0, epsilon_end=0.1, epsilon_length=100000,
            use_cuda=use_cuda, n_step=n_step, lr=0.001)
model.load('0000800.pt')

# Spawn one environment process per worker; each exchanges messages with the
# trainer over its own Pipe.
for idx in range(num_worker):
    parent_conn, child_conn = Pipe()
    worker = MarioEnv(env_id, idx, child_conn, queue, n_step, is_render)
    worker.start()
    workers.append(worker)
    parent_conns.append(parent_conn)

while model.g_episode < max_episode:
    ...  # training loop body not shown in this excerpt
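# A hypothetical sketch of one step of the collection loop above, showing how
# the per-worker buffers and n_step flushing could fit together. The message
# format on parent_conn (send an action, receive next_state/reward/done) and
# the model.get_action()/model.train_model() calls are assumptions, not this
# repo's API.
def rollout_step_sketch(model, parent_conns, states, buffers, n_step):
    buffer_state, buffer_action, buffer_reward, buffer_next_state = buffers
    actions = [model.get_action(s) for s in states]      # assumed API
    for conn, action in zip(parent_conns, actions):
        conn.send(action)
    next_states = []
    for i, conn in enumerate(parent_conns):
        next_state, reward, done = conn.recv()           # assumed message format
        buffer_state[i].append(states[i])
        buffer_action[i].append(actions[i])
        buffer_reward[i].append(reward)
        buffer_next_state[i].append(next_state)
        next_states.append(next_state)
        if len(buffer_state[i]) >= n_step or done:
            # Flush an n-step segment to the learner, then clear the buffers.
            model.train_model(buffer_state[i], buffer_action[i],
                              buffer_reward[i], buffer_next_state[i])  # assumed API
            for buf in (buffer_state, buffer_action,
                        buffer_reward, buffer_next_state):
                buf[i].clear()
    return next_states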
# Block any pop-up windows
os.environ['SDL_VIDEODRIVER'] = 'dummy'

# Test GPU and show the available logical & physical GPUs
Util.test_gpu()

env = FlappyBird.FlappyBirdEnv()
NUM_STATE_FEATURES = env.get_num_state_features()
NUM_ACTIONS = env.get_num_actions()
EPISODE_NUM = 10000
PRINT_EVERY_EPISODE = 50
LEARNING_RATE = 0.003
REWARD_DISCOUNT = 0.99

exp_stg = EPSG.EpsilonGreedy(0.2, NUM_ACTIONS)
agent = A2C.Agent((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT,
                  LEARNING_RATE, exp_stg)

state = env.reset()
accum_reward = 0
# tqdm progress bar
bar = []
# Reward & loss history
r_his = []
avg_r_his = [0]
loss_his = []
episode_reward = 0

print("Episode 1")
for episode in range(1, EPISODE_NUM + 1):
    if episode % PRINT_EVERY_EPISODE == 1:
        ...  # per-episode loop body not shown in this excerpt
# Test GPU and show the available logical & physical GPUs
Util.test_gpu()

env = cartPole.CartPoleEnv()
NUM_STATE_FEATURES = env.get_num_state_features()
NUM_ACTIONS = env.get_num_actions()
EPISODE_NUM = 200
PRINT_EVERY_EPISODE = 20
LEARNING_RATE = 0.03
REWARD_DISCOUNT = 0.99
COEF_VALUE = 1
COEF_ENTROPY = 0

exp_stg = EPSG.EpsilonGreedy(0.2, NUM_ACTIONS)
agent = A2C.Agent((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT,
                  LEARNING_RATE, COEF_VALUE, COEF_ENTROPY)

# agent_params = ((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT, LEARNING_RATE, exp_stg)
# init_local_agent_funct = lambda: Agent((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT, LEARNING_RATE, exp_stg)
# init_local_env_funct = lambda: CartPoleEnv()
# master = Master(EPISODE_NUM, init_local_agent_funct, init_local_env_funct, 2)
# master.start_workers()

state = env.reset()
accum_reward = 0
accum_loss = 0
# tqdm progress bar
bar = []
# Reward & loss history
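# COEF_VALUE and COEF_ENTROPY weight the critic and entropy terms of the A2C
# objective. Below is a minimal sketch of where they typically enter; this
# illustrates the standard formula
#   total = policy_loss + COEF_VALUE * value_loss - COEF_ENTROPY * entropy
# and is not A2C.Agent's actual code; the TensorFlow choice is itself an
# assumption.
import tensorflow as tf

def a2c_loss_sketch(logits, values, actions, returns,
                    coef_value=1.0, coef_entropy=0.0):
    # Stop the gradient through values so only the critic term trains them.
    # Shapes: logits [B, A]; values, returns [B]; actions [B] (int indices).
    advantages = returns - tf.stop_gradient(values)
    neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=actions, logits=logits)
    policy_loss = tf.reduce_mean(neg_log_prob * advantages)
    value_loss = tf.reduce_mean(tf.square(returns - values))
    probs = tf.nn.softmax(logits)
    entropy = -tf.reduce_mean(
        tf.reduce_sum(probs * tf.math.log(probs + 1e-8), axis=1))
    return policy_loss + coef_value * value_loss - coef_entropy * entropy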