def main(stdscr):
    model = Model()
    agent = A2CAgent(model)
    learning_environment = LearningEnvironment()
    agent.initialize_model(learning_environment)
    agent.load_model_if_previously_saved()
    game_controller = GameController(stdscr)
    game_controller.play(agent, learning_environment)
def play(self, stdscr):
    self._initialize_screen(stdscr)
    win = self._create_window()
    model = Model()
    agent = A2CAgent(model)
    learning_environment = LearningEnvironment()
    agent.load_pretrained_model(learning_environment)
    self._play_test_game(learning_environment, agent, win)
    self._draw_game_over_text_and_wait_for_input(win)
def run(self, epochs, batch_size):
    model = Model()
    agent = A2CAgent(model)
    learning_environment = LearningEnvironment()
    agent.initialize_model(learning_environment)
    variables = model.get_variables()
    for _ in range(epochs):
        variables = self._receive_variables_from_master(variables)
        model.set_variables(variables)
        observations, acts_and_advs, returns = agent.generate_experience_batch(
            learning_environment, batch_size)
        self._send_experience_to_master(observations, acts_and_advs, returns)
def learn(self, stdscr):
    self._initialize_screen(stdscr)
    win = self._create_window()
    model = Model()
    agent = A2CAgent(model)
    learning_environment = LearningEnvironment()
    agent.load_model_if_previously_saved(learning_environment)
    for iteration in range(200):
        if iteration % 10 == 0:
            self._play_test_game(learning_environment, agent, win)
        agent.train(learning_environment)
        agent.save_model()
def run(self, max_worker_id, epochs, worker_batch_size):
    model = Model()
    agent = A2CAgent(model)
    learning_environment = LearningEnvironment()
    agent.initialize_model(learning_environment)
    agent.load_model_if_previously_saved()
    saved_model_score = self._get_average_score(learning_environment, agent, 100)
    for ep in range(1, epochs + 1):
        print("Epoch {}/{}".format(ep, epochs))
        if ep % 1000 == 0:
            current_score = self._get_average_score(
                learning_environment, agent, 100)
            if current_score > saved_model_score:
                agent.save_model()
                saved_model_score = current_score
        variables = model.get_variables()
        self._send_variables_to_workers(variables)
        all_observations, all_acts_and_advs, all_returns = self._receive_experience_from_worker(
            1, worker_batch_size)
        for i in range(2, max_worker_id + 1):
            observations, acts_and_advs, returns = self._receive_experience_from_worker(
                i, worker_batch_size)
            all_observations = np.concatenate((all_observations, observations))
            all_acts_and_advs = np.concatenate((all_acts_and_advs, acts_and_advs))
            all_returns = np.concatenate((all_returns, returns))
        model.train_on_batch(all_observations, [all_acts_and_advs, all_returns])
    current_score = self._get_average_score(learning_environment, agent, 100)
    if current_score > saved_model_score:
        agent.save_model()
import gym
import numpy as np
from a2c import A2CAgent

env = gym.make("CartPole-v0")
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

MAX_EPISODE = 1000
MAX_STEPS = 500

lr = 7e-3
gamma = 0.99
value_coeff = 0.5
entropy_coeff = 1e-4

agent = A2CAgent(env, gamma, lr, value_coeff, entropy_coeff)

ep_rewards = []
for episode in range(MAX_EPISODE):
    state = env.reset()
    trajectory = []  # [[s, a, r, s', done], [], ...]
    episode_reward = 0
    for steps in range(MAX_STEPS):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append([state, action, reward, next_state, done])
        episode_reward += reward
        if done:
            break
        state = next_state
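# Hedged addition: the loop above collects a trajectory but the snippet stops
# before any learning update. The helper below is a self-contained sketch of
# one common ingredient of that update, the discounted return G_t for each
# step; the name compute_returns and its use here are illustrative and not
# part of the original code.
def compute_returns(trajectory, discount=0.99):
    """Walk the trajectory backwards and accumulate discounted returns."""
    returns = []
    g = 0.0
    for _, _, reward, _, done in reversed(trajectory):
        # a terminal step does not bootstrap from the future
        g = reward + discount * g * (1.0 - float(done))
        returns.append(g)
    returns.reverse()
    return returns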
import gym
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
from a2c import A2CAgent

seed = 112
env = gym.make('LunarLander-v2')
env.seed(seed)

# get the dimensions of the space
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = A2CAgent(theta=.0001, learning_rate=.002, discount=.99,
                 actions=action_size, space=state_size)

episodes = 100
scores = {}

# play the game, each iteration is an episode
for i in range(episodes):
    done = False
    # score keeping
    total_rewards = 0
    # reset the environment
    observation = env.reset()
    observation = np.reshape(observation, [1, state_size])
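    # Hedged continuation: the episode loop above ends right after reshaping
    # the initial observation. A typical inner loop is sketched below; act()
    # and learn() are hypothetical method names chosen for illustration, since
    # the original snippet does not show this A2CAgent's interface.
    while not done:
        action = agent.act(observation)                                    # hypothetical
        next_observation, reward, done, _ = env.step(action)
        next_observation = np.reshape(next_observation, [1, state_size])
        agent.learn(observation, action, reward, next_observation, done)  # hypothetical
        observation = next_observation
        total_rewards += reward
    scores[i] = total_rewards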
import gym
import gym_sumo
from a2c import A2CAgent
from dqn import DQNAgent

env = gym.make('gym_sumo-v0')

agent = A2CAgent()
# agent = DQNAgent()

# agent.train(env)
agent.test(env)
def main():
    args = parse_args()
    field_config = load_field_config(args.env_config)
    make_env = EnvFactory(field_config)
    env = make_env()

    # test env for meta setup
    make_test_env = None
    if args.test_env_config is not None:
        test_field_config = load_field_config(args.test_env_config)
        make_test_env = EnvFactory(test_field_config)

    experiment_config = get_config(args.experiment_config, env)
    device = torch.device(experiment_config['train']['device'])
    train_params = experiment_config['train']

    experiment_name = get_experiment_name(args)
    save_path = experiment_config['train']['checkpoints_dir'] + experiment_name + '.pt'

    logdir = f'a2c/logs/{experiment_name}'
    if args.logdir is not None:
        logdir = args.logdir.rstrip('/') + '/' + experiment_name
    if not os.path.exists(logdir):
        os.mkdir(logdir)

    reward_logs = []
    test_reward_logs = []
    for _ in tqdm(range(args.runs)):
        if args.hint_type is None:
            agent = A2CAgent(
                experiment_config['state'],
                receptive_field=env.receptive_field_size).to(device)
        else:
            agent = A2CAgent(
                experiment_config['state'],
                hint_type=args.hint_type,
                hint_config=experiment_config['hint'],
                receptive_field=env.receptive_field_size).to(device)

        optimizer = torch.optim.Adam(agent.parameters(),
                                     lr=experiment_config['train']['lr'])
        log = train(train_params['epochs'],
                    train_params['n_agents'],
                    make_env,
                    agent,
                    optimizer,
                    max_steps=train_params['max_steps'],
                    hint_type=args.hint_type,
                    make_test_env=make_test_env,
                    device=device,
                    experiment_name=experiment_name,
                    save_path=save_path,
                    log_dir=logdir,
                    max_reward_limit=train_params['max_reward_limit'],
                    reward_log_freq=train_params['reward_log_freq'],
                    plot_every=1)

        if make_test_env is not None:
            train_log, test_log = log
            reward_logs.append(train_log)
            test_reward_logs.append(test_log)
        else:
            reward_logs.append(log)

    logs_avg = average_logs(reward_logs) if args.runs > 1 else reward_logs
    if make_test_env is not None:
        test_logs_avg = average_logs(test_reward_logs) if args.runs > 1 else test_reward_logs

    writer = SummaryWriter(log_dir=logdir, filename_suffix=experiment_name)
    log_name = 'Reward' if make_test_env is None else 'META/Train reward'
    for i, reward in enumerate(logs_avg):
        writer.add_scalar(log_name, reward, i)
    if make_test_env is not None:
        for i, reward in enumerate(test_logs_avg):
            writer.add_scalar('META/Test reward', reward, i)
                    default=100000)

# GAME options
parser.add_argument("--n_actions", type=int,
                    help="number of game output actions", default=2)
parser.add_argument("--frame_size", type=int,
                    help="size of game frame in pixels", default=84)

if __name__ == '__main__':
    options = parser.parse_args()

    # Select agent
    if options.algo == 'dqn':
        agent = DQNAgent(options)
    elif options.algo == 'a2c':
        agent = A2CAgent(options)
    elif options.algo == 'ppo':
        agent = PPOAgent(options)
    else:
        print("ERROR. This algorithm has not been implemented yet.")
        raise SystemExit(1)

    # Train or evaluate agent
    if options.mode == 'train':
        agent.train()
    elif options.mode == 'eval':
        agent.play_game()
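# Hedged usage example: the flags defined above the visible excerpt are not
# shown; --algo and --mode are inferred from the dispatch code, and the script
# name main.py is assumed.
#   python main.py --algo a2c --mode train
#   python main.py --algo dqn --mode eval --n_actions 2 --frame_size 84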
import gym
from a2c import A2CAgent

env = gym.make("CartPole-v0")
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

MAX_EPISODE = 1500
MAX_STEPS = 500

lr = 1e-4
gamma = 0.99

agent = A2CAgent(env, gamma, lr)

def run():
    for episode in range(MAX_EPISODE):
        state = env.reset()
        trajectory = []  # [[s, a, r, s', done], [], ...]
        episode_reward = 0
        for steps in range(MAX_STEPS):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append([state, action, reward, next_state, done])
            episode_reward += reward
            if done:
                break
            state = next_state
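# Hedged addition: run() above is defined but never invoked; a minimal entry
# point is sketched below. Note the snippet stops before any learning update,
# which would normally follow the inner step loop (for example an
# agent.update(trajectory) call, a hypothetical name not shown in the
# original).
if __name__ == "__main__":
    run()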
        use_feature_units=False),
    step_mul=4,
    game_steps_per_episode=None,
    disable_fog=False,
    visualize=False)

# with tf.Session() as sess:
#     A2C = a2c(sess, 0.00001)
#     sess.run(tf.global_variables_initializer())
#     saver = tf.train.Saver()
#     saver.restore(sess, "4wayBeacon_a2c/tmp/model.ckpt")

state_size = 2
action_size = 4
agent = A2CAgent(state_size, action_size)

for episodes in range(62626):
    obs = env.reset()  #####
    action = actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
    obs = env.step(actions=[action])
    done = False  #####
    sub_done = False
    global_step = 0  #####
    states = np.empty(shape=[0, 2])  #####
    actions_list = np.empty(shape=[0, 4])  #####
    next_states = np.empty(shape=[0, 2])