def main(env_name, seed, individual, args, eval_episodes=10):
    env = EnvWrapper(env_name)
    state_dim = sum(list(env.unwrapped().observation_space.shape))
    action_dim = sum(list(env.unwrapped().action_space.shape))
    hidden_sizes = args['hidden_sizes']
    activation = args['activation']
    layernorm = args['layernorm']

    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    policy = DeterministicPolicy(state_dim, action_dim, hidden_sizes, -1,
                                 activation, layernorm).eval()

    file_dir = os.path.abspath(os.path.dirname(__file__))
    save_dir = os.path.join(
        file_dir,
        'results',
        env_name,
        args['activation'] + ('_LayerNorm' if args['layernorm'] else ''),
        'seed' + str(seed),
    )
    model_path = os.path.join(save_dir, 'learned_model',
                              'individual' + str(individual) + '.pth')
    model_state_dict = torch.load(model_path)
    policy.load_state_dict(model_state_dict)

    env.seed(seed + 100)

    episode_rewards = []
    for _ in range(eval_episodes):
        state = env.reset()
        done = False
        sum_rewards = 0
        while not done:
            # env.render()
            action = policy.deterministic_action(
                torch.tensor(state.reshape(1, -1), dtype=torch.float))
            next_state, reward, done, _ = env.step(action)
            sum_rewards += reward
            state = next_state
        episode_rewards.append(sum_rewards)
        print(f'Episode: {len(episode_rewards)} Sum Rewards: {sum_rewards:.3f}')

    avg_reward = np.mean(episode_rewards)
    print('\n---------------------------------------')
    print(f'Evaluation over {eval_episodes} episodes: {avg_reward:.3f}')
    print('---------------------------------------')
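# Example invocation sketch (illustrative values only, not taken from the source):
# the args dict must carry the keys that main() reads above.
# main('Hopper-v2', seed=0, individual=0,
#      args={'hidden_sizes': [256, 256], 'activation': 'relu', 'layernorm': False},
#      eval_episodes=10)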
def main(arglist):
    ACTORS = 1
    env = EnvWrapper(arglist.scenario, ACTORS, arglist.saved_episode)

    if arglist.eval:
        current_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
        writer = SummaryWriter(log_dir='./logs/' + current_time + '-' + arglist.scenario)

    maddpg_wrapper = MADDPG(ACTORS)
    maddpg_wrapper.create_agents(env, arglist)

    j = 0
    for episode in range(arglist.max_episode):
        obs = env.reset()
        terminal = False
        maddpg_wrapper.reset()
        total_reward = [0 for i in maddpg_wrapper.workers]
        step = 0

        while not terminal and step < 25:
            if not arglist.eval:
                env.render(0)
                time.sleep(0.03)

            actions = maddpg_wrapper.take_actions(obs)
            obs2, reward, done = env.step(actions)

            for actor in range(ACTORS):
                for i, rew in enumerate(reward[actor]):
                    total_reward[i] += rew

            j += ACTORS
            # terminal = all(done)
            if arglist.eval:
                maddpg_wrapper.update(j, ACTORS, actions, reward, obs, obs2, done)

            obs = obs2
            step += 1

        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            maddpg_wrapper.save(episode)

        if arglist.eval:
            for worker, ep_ave_max in zip(maddpg_wrapper.workers,
                                          maddpg_wrapper.ep_ave_max_q_value):
                print(worker.pos, ' => average_max_q: ', ep_ave_max / float(step),
                      ' Reward: ', total_reward[worker.pos], ' Episode: ', episode)
                writer.add_scalar(str(worker.pos) + '/Average_max_q',
                                  ep_ave_max / float(step), episode)
                writer.add_scalar(str(worker.pos) + '/Reward Agent',
                                  total_reward[worker.pos], episode)

    env.close()
def rollout_worker(index, task_pipe, result_pipe, model_bucket, env_name):
    env = EnvWrapper(env_name)
    env.seed(index)

    while True:
        identifier = task_pipe.recv()
        if identifier == 'TERMINATE':
            exit(0)

        policy = model_bucket[identifier]

        fitness = 0.0
        num_frames = 0
        state = env.reset()
        done = False
        rollout_transition = []

        while not done:
            action = policy.deterministic_action(
                torch.tensor(state.reshape(1, -1), dtype=torch.float))
            next_state, reward, done, info = env.step(action)
            fitness += reward
            num_frames += 1

            done_buffer = done if num_frames < env.unwrapped()._max_episode_steps else False
            rollout_transition.append({
                'state': state,
                'next_state': next_state,
                'action': action,
                'reward': reward,
                'mask': float(not done_buffer)
            })
            state = next_state

        result_pipe.send([identifier, fitness, rollout_transition])
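# Hypothetical launcher sketch (not part of the source): one way to wire rollout_worker
# to a pair of multiprocessing pipes. Assumes a 'fork' start method so the child process
# inherits a populated model_bucket, and that 'Hopper-v2' is the environment in use.
#
# import multiprocessing as mp
# task_send, task_recv = mp.Pipe()
# result_send, result_recv = mp.Pipe()
# worker = mp.Process(target=rollout_worker,
#                     args=(0, task_recv, result_send, model_bucket, 'Hopper-v2'))
# worker.start()
# task_send.send(0)                     # request a rollout of the policy at index 0
# identifier, fitness, transitions = result_recv.recv()
# task_send.send('TERMINATE')           # tell the worker to exit
# worker.join()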
def __init__(self, name, num_episodes=500):
    self.name = name
    self.num_episodes = num_episodes
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.env = gym.make(name).unwrapped
    self.env.reset()
    self.env_w = EnvWrapper(self.env, self.device)

    self.cfg = Config()
    self.cfg.n_actions = self.env.action_space.n
    self.cfg.policy_net = DQN(self.env_w.screen_height, self.env_w.screen_width,
                              self.cfg.n_actions).to(self.device)
    self.cfg.target_net = DQN(self.env_w.screen_height, self.env_w.screen_width,
                              self.cfg.n_actions).to(self.device)

    self.agent = Agent(self.env, self.env_w, self.device, self.cfg)
def train_agent(episodes=100, model='DDPG', print_every=10):
    if model.lower() == 'd4pg':
        agent = D4PGAgent()
        print('Use D4PG agent......\n')
    else:
        agent = DDPGAgent()
        print('Use default DDPG agent......\n')

    print('Batch size: ', BATCH_SIZE)
    print('Actor learning rate: ', LR_ACTOR)
    print('Critic learning rate: ', LR_CRITIC)
    print('\n')

    # Use a raw string so the backslash in the Windows path is not treated as an escape.
    env = EnvWrapper(file_name=r'Reacher_Windows_x86_64\Reacher.exe',
                     train_mode=True)

    scores = []
    scores_window = deque(maxlen=100)

    for ep in range(1, episodes + 1):
        agent.reset()
        agent.states = env.reset()
        for s in range(agent.max_steps):
            agent.actions = agent.act(add_noise=True)
            agent.rewards, agent.next_states, agent.dones = env.step(agent.actions)
            agent.step()
            agent.states = agent.next_states

        scores.append(agent.scores.mean())
        scores_window.append(agent.scores.mean())

        if ep % print_every == 0:
            print('Episode %d, avg score: %.2f' % (ep, agent.scores.mean()))

        if np.mean(scores_window) >= 30:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(ep - 100, np.mean(scores_window)))
            torch.save(agent.actor.state_dict(),
                       'checkpoints/reacher_%s_actor_checkpoint.pth' % model)
            torch.save(agent.critic.state_dict(),
                       'checkpoints/reacher_%s_critic_checkpoint.pth' % model)

    env.close()
    return scores, agent
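# Example call sketch (episode count is illustrative, not from the source):
# scores, agent = train_agent(episodes=150, model='D4PG', print_every=10)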
def test(args):
    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops)
    sess = tf.Session(config=config)

    env = EnvWrapper(args)

    qn = DeepQN(state_shape=env.state_shape,
                num_actions=env.num_actions,
                gamma=args.gamma,
                type=args.qn_version)
    qn.reset_sess(sess)
    qn.load(args.model_path)

    testor = Tester(qn, env,
                    report_interval=args.tester_report_interval,
                    episodes=args.tester_episodes)
    _, rs = testor.run(qn, sess, render=args.render)

    with open(args.model_path + '_test.log', 'w') as f:
        f.write(str(rs))
    return
FRAMES_IN_STATE_COUNT = 4
EPSILON = 0.05
GAME_ENV_NAME = 'BreakoutDeterministic-v4'
RENDER = False
PRINT_LATEX = True
MODEL_PATH_PREFIX = './drive/app/models/'
# list of models with iteration count as file names
STARTING_MODELS = [
    0, 200, 600, 800, 1000, 1200, 1400, 1550, 1800, 2000, 2200, 2400, 2500,
    2700, 3000, 3200, 3500, 3800, 4150, 4400, 4600, 4800, 5000, 5250, 5400,
    5600, 5800, 6000, 6200, 6400, 6800, 7000, 7200, 7400, 7600, 7750, 8000
]
GAMES_PER_MODEL = 5

env = EnvWrapper(GAME_ENV_NAME, IMG_SIZE, FRAMES_IN_STATE_COUNT, 1)
action_count = env.action_count

results = np.zeros((len(STARTING_MODELS), 2))
program_start_time = time.time()

for i in range(len(STARTING_MODELS)):
    model_name = STARTING_MODELS[i]
    model_path = MODEL_PATH_PREFIX + str(model_name)
    model = helpers.load_model(model_path)

    total_games_reward = 0
    start_time = time.time()

    for i_game in range(1, GAMES_PER_MODEL + 1):
        env.reset()
        if RENDER:
            env.render()
def train():
    print(tf.__version__)
    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops)
    sess = tf.Session(config=config)

    env1 = EnvWrapper('MountainCar-v0', mod_r=False)
    env2 = EnvWrapper('MountainCar-v0', mod_r=True)

    mr = MemoryReplayer(env1.state_shape, capacity=50000)

    qn = DeepQN(state_shape=env1.state_shape,
                num_actions=env1.num_actions,
                gamma=0.99)
    qn.reset_sess(sess)
    qn.set_train(0.008)

    init = tf.global_variables_initializer()
    sess.run(init)

    testor = Tester(qn, env1, report_interval=100, episodes=100)

    score = []
    for epi in range(1000000):
        s = env2.reset()
        done = False
        rc = 0
        while not done:
            a = qn.select_action_eps_greedy(get_eps(epi), s)
            a_ = a[0]
            s_, r, done, _ = env2.step(a_)
            mr.remember(s, s_, r, a_, done)
            s = s_
            rc += r

        score.append(rc)

        # replay
        s, s_, r, a, done = mr.replay(batch_size=64)
        qn.train(s, s_, r, a, done)

        if (epi + 1) % 200 == 0:
            avg_score = np.mean(score)
            print('avg score last 200 episodes ', avg_score)
            score = []
            testor.run(qn, sess, render=False)

    return
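# The training loops above and below call get_eps(epi) for the exploration schedule,
# but its definition is not included in these snippets. A minimal linear-decay sketch
# follows as an assumption; the start/end/decay values are illustrative, not the
# original schedule.
def get_eps(episode, start=1.0, end=0.05, decay_episodes=10000):
    # Anneal epsilon linearly from start to end over decay_episodes, then hold at end.
    frac = min(episode / decay_episodes, 1.0)
    return start + frac * (end - start)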
ENV_NAME = 'Seaquest-v0'
TOTAL_FRAMES = 20000000
MAX_TRAINING_STEPS = 20 * 60 * 60 / 3
TESTING_GAMES = 30
MAX_TESTING_STEPS = 5 * 60 * 60 / 3
TRAIN_AFTER_FRAMES = 50000
epoch_size = 50000
MAX_NOOP_START = 30
LOG_DIR = 'logs'
outdir = 'results'
test_mode = True

logger = tf.summary.FileWriter(LOG_DIR)

# Initialize TensorFlow session
session = tf.InteractiveSession()

env = EnvWrapper(ENV_NAME, test_mode)
# print(dir(env.action_space))

agent = DQN(
    state_size=env.observation_space.shape,
    action_size=env.action_space.n,
    session=session,
    summary_writer=logger,
    exploration_period=1000000,
    minibatch_size=32,
    discount_factor=0.99,
    experience_replay_buffer=1000000,
    target_qnet_update_frequency=20000,
    initial_exploration_epsilon=1.0,
    final_exploration_epsilon=0.1,
    reward_clipping=1.0,
)
with open(file_name, "a", newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    for episode in range(len(goal_agents_list)):
        goal_agents = goal_agents_list[episode]
        episode_reward = episode_rewards[episode]
        row = [episode, episode_reward]
        row.extend([goal_agents.count(f"agent {agent}")
                    for agent in range(num_good_agents)])
        writer.writerow(row)


if __name__ == '__main__':
    arglist = parse_args()
    # Create environment
    env = EnvWrapper(arglist.scenario, arglist.benchmark,
                     agent_speeds=arglist.agent_speeds)

    with U.single_threaded_session():
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if (arglist.rollout or arglist.shapley_M or arglist.true_shapley
                or arglist.restore_episode != 0 or arglist.benchmark):
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        if arglist.true_shapley:
def main():
    print(tf.__version__)
    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops, log_device_placement=False)
    sess = tf.Session(config=config)

    env = EnvWrapper('CartPole-v0')

    mr = MemoryReplayer(env.state_shape, capacity=100000, enabled=True)

    # set type='v1' for linear model, 'v3' for three layer model (two tanh activations)
    # type='v5' use dual
    qn = DeepQN(state_shape=env.state_shape,
                num_actions=env.num_actions,
                gamma=0.99,
                type='v1')
    qn.reset_sess(sess)
    qn.set_train(0.001)

    init = tf.global_variables_initializer()
    sess.run(init)

    plotter = Plotter()
    testor = Tester(qn, env, report_interval=100)

    print('Pretrain test:')
    testor.run(qn, sess)

    score = []
    reward_record = []
    cnt_iter = 0

    for epi in range(1000000):
        s = env.reset()
        done = False
        rc = 0
        while not done:
            a = qn.select_action_eps_greedy(get_eps(epi), s)
            a_ = a[0]
            s_, r, done, _ = env.step(a_)
            mr.remember(s, s_, r, a_, done)
            s = s_
            rc += r
            cnt_iter += 1
            if (cnt_iter + 1) % 10000 == 0:
                r_test = record(qn, sess, env)
                print("Iteration {}, avg reward is {}".format(cnt_iter, r_test))
                reward_record.append(r_test)

        score.append(rc)

        # replay
        s, s_, r, a, done = mr.replay(batch_size=64)
        qn.train(s, s_, r, a, done)

        if cnt_iter > 1000000:
            break

        # if (epi + 1) % 200 == 0:
        #     avg_score = np.mean(score)
        #     plotter.plot(avg_score)
        #     print('avg score last 200 episodes ', avg_score)
        #     score = []
        #     if avg_score > 195:
        #         qn.save(path='./trained_model_linear_CartPole_w_mr.ckpt')
        #         break

    with open('CartPole-v0_q2_data.log', 'w') as f:
        f.write(str(reward_record))
    return
import multiprocessing

import tensorflow as tf

from env_wrapper import EnvWrapper
from a2c_agent import A2CAgent
from state_generator import StateGenerator
from utils import preprocess_experiences
from training_parameters import (clip_range, sample_size, epoch, n_env, n_steps,
                                 skip_frames, ent_coef, vf_coef, max_grad_norm,
                                 episodes_before_training, render, input_shape,
                                 lr, GAMMA, LAMBDA, load_model, frame_size,
                                 stack_size, max_steps)


def run_env(env):
    env.step(n_steps)


if __name__ == "__main__":
    n_env = multiprocessing.cpu_count()
    envs = [EnvWrapper(frame_size, skip_frames, stack_size) for i in range(n_env)]
    action_size = envs[0].get_action_size()

    tf.reset_default_graph()
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    train_model = A2CAgent("train_model", True, sess, input_shape, action_size,
                           lr, GAMMA, LAMBDA, max_grad_norm, ent_coef, vf_coef,
                           clip_range, load_model)
    old_model = A2CAgent("old_model", False, sess, input_shape, action_size,
                         lr, GAMMA, LAMBDA, max_grad_norm, ent_coef, vf_coef,
                         clip_range, False)
    sync_ops = old_model.create_sync_ops(train_model)
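    # Dispatch sketch (an assumption; the original per-environment dispatch is not shown here):
    # run_env could be mapped over the wrapped environments in parallel, e.g. with a thread pool.
    # from multiprocessing.pool import ThreadPool
    # with ThreadPool(n_env) as pool:
    #     pool.map(run_env, envs)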
def train(args=None):
    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops, log_device_placement=False)
    sess = tf.Session(config=config)

    args_test = copy.copy(args)
    args_test.use_monitor = False

    env = EnvWrapper(args.env, mod_r=True)
    env_test = EnvWrapper(args.env, mod_r=False)

    if args.use_mr:
        print('Set experience replay ON')
    else:
        print('Set experience replay OFF')

    path = './tmp/burn_in_' + args.env + '-' + str(args.mr_capacity) + '.pickle'
    if os.path.exists(path):
        print('Found existing burn_in memory replayer, load...')
        with open(path, 'rb') as f:
            mr = pickle.load(file=f)
    else:
        mr = MemoryReplayer(env.state_shape,
                            capacity=args.mr_capacity,
                            enabled=args.use_mr)
        # burn_in
        mr = utils.burn_in(env, mr)

    # set type='v1' for linear model, 'v3' for three layer model (two tanh activations)
    # type='v5' use dual
    print('Set Q-network version: ', args.qn_version)
    qn = DeepQN(state_shape=env.state_shape,
                num_actions=env.num_actions,
                gamma=args.gamma,
                type=args.qn_version)
    qn.reset_sess(sess)
    qn.set_train(args.lr)

    if not args.reuse_model:
        print('Set reuse model OFF')
        init = tf.global_variables_initializer()
        sess.run(init)
    else:
        print('Set reuse model ON')
        try:
            qn.load('./tmp/qn-' + args.qn_version + '-' + args.env + '-keyinterrupt' + '.ckpt')
            optimizer_scope = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "optimizer")
            init = tf.variables_initializer(optimizer_scope)
            sess.run(init)
            print('Found previous model')
        except tf.errors.NotFoundError:
            print('No previous model found, init new model')
            init = tf.global_variables_initializer()
            sess.run(init)

    # plotter = Plotter(save_path=args.performance_plot_path, interval=args.performance_plot_interval,
    #                   episodes=args.performance_plot_episodes)

    pretrain_test = Tester(qn, env_test, report_interval=100)
    print('Pretrain test:')
    pretrain_test.run(qn, sess)
    print('Pretrain test done.')

    tester_1 = Tester(qn, env,
                      episodes=args.performance_plot_episodes,
                      report_interval=args.performance_plot_episodes,
                      title='test-r-mod')
    tester_2 = Tester(qn, env_test,
                      episodes=args.performance_plot_episodes,
                      report_interval=args.performance_plot_episodes,
                      title='test-r-real')

    score = deque([], maxlen=args.performance_plot_episodes)
    reward_record = []

    try:
        for epi in range(args.max_episodes):
            s = env.reset()
            done = False
            rc = 0
            while not done:
                a = qn.select_action_eps_greedy(get_eps(epi), s)
                a_ = a[0]
                s_, r, done, _ = env.step(a_)
                mr.remember(s, s_, r, a_, done)
                s = s_
                rc += r

            score.append(rc)

            # replay
            s, s_, r, a, done = mr.replay(batch_size=args.batch_size)
            qn.train(s, s_, r, a, done)

            if (epi + 1) % args.performance_plot_interval == 0:
                print('train-r-mod reward avg: ', np.mean(score))
                tester_2.run(qn, sess)
                # r_avg, _ = tester_2.run(qn, sess)
                # reward_record.append(r_avg)
    except KeyboardInterrupt:
        qn.save('./tmp/qn-' + args.qn_version + '-' + args.env + '-keyinterrupt' + '.ckpt')
        # save mr
        with open(path, 'wb+') as f:
            pickle.dump(mr, f)
        exit(-1)

    qn.save(args.model_path)

    with open(args.log_name, 'w') as f:
        f.write(str(reward_record))
    return
FRAMES_IN_STATE_COUNT = 4
BATCH_SIZE = 32
MEMORY_SIZE = 1000000
FREEZE_ITERATIONS = 10000
REPLAY_START_SIZE = 50000
LAST_EPSILON_DECREASE_ITERATION = 1000000
START_EPSILON = 1.0
END_EPSILON = 0.1

# -------- REPORT CONSTS --------
REPORT_ITERATIONS = 10000
SAVE_MODEL_ITERATIONS = 50000

print(device_lib.list_local_devices())

env = EnvWrapper(GAME_ENV_NAME, IMG_SIZE, FRAMES_IN_STATE_COUNT, MEMORY_SIZE)
action_count = env.action_count

if STARTING_MODEL is None:
    model = atari_model.model(action_count, IMG_SIZE, FRAMES_IN_STATE_COUNT)
else:
    model = helpers.load_model(STARTING_MODEL)
    print('Loaded model: ', STARTING_MODEL)

if LEARN:
    frozen_target_model = helpers.copy_model(model)

process = psutil.Process(os.getpid())
print('RAM :', helpers.convert_size(process.memory_info().rss))