def __init__(self, env_name):
    # assumes a CUDA-capable GPU is available
    self.device = torch.device('cuda')
    self.env_name = env_name
    self.env = wrappers.make_env(env_name)

    # hyperparameters
    self.gamma = 0.99
    self.batch_size = 32
    self.replay_buffer_size = 10000
    self.replay_start_size = 10000
    self.learning_rate = 1e-4
    self.update_target_interval = 1000
    self.epsilon_start = 1.0
    self.epsilon_end = 0.02
    self.epsilon_period = 100000
    self.reward_bound = 19.5

    self.replay_buffer = replay_buffer.ReplayBuffer(self.replay_buffer_size)
    self.network = dqn_model.DQNModel(self.env.observation_space.shape,
                                      self.env.action_space.n).to(self.device)
    self.target_network = dqn_model.DQNModel(self.env.observation_space.shape,
                                             self.env.action_space.n).to(self.device)
    self.optimizer = optim.Adam(self.network.parameters(), lr=self.learning_rate)
    print(self.network)

    self.writer = SummaryWriter(comment='dqn' + self.env_name)
    self.total_rewards = []
    self.frame_index = 0
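# A minimal sketch (the method name is an assumption, not part of this class)
# of the linear epsilon decay implied by epsilon_start / epsilon_end /
# epsilon_period above:
def epsilon_by_frame(self, frame_index):
    frac = min(1.0, frame_index / self.epsilon_period)
    return self.epsilon_start + frac * (self.epsilon_end - self.epsilon_start)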
def main(to_train, save_path):
    torch.manual_seed(1234)

    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda computation")
    parser.add_argument("--env", default=DEFAULT_ENV_NAME,
                        help="default env name")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda and torch.cuda.is_available()
                          else "cpu")

    os.makedirs(save_path, exist_ok=True)

    env = wrappers.make_env(args.env)
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    target_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    print(net)

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START
    # only need one optimizer: the target net is synced, never trained
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    if to_train:
        train(env, net, target_net, buffer, agent, optimizer, device, save_path)
def __init__(self, env_name):
    self.env = wrappers.make_env(env_name)
    self.env_name = env_name
    # assumes a CUDA-capable GPU is available
    self.device = torch.device('cuda')

    # Nature-DQN-style hyperparameters
    self.learning_rate = 2.5e-4
    self.stabilizer = 0.01           # RMSprop eps
    self.gradient_momentum = 0.95    # RMSprop alpha (squared-gradient smoothing)
    self.gamma = 0.99
    self.batch_size = 32
    self.replay_start_size = 50000
    self.replay_buffer_size = 1000000
    self.update_target_interval = 10000
    self.training_frequency = 4
    self.epsilon_start = 1.0
    self.epsilon_end = 0.05
    self.epsilon_period = 1000000

    self.network = dqn_model.DQNModel(self.env.observation_space.shape,
                                      self.env.action_space.n).to(self.device)
    self.target_network = dqn_model.DQNModel(self.env.observation_space.shape,
                                             self.env.action_space.n).to(self.device)
    self.replay_buffer = replay_buffer.ReplayBuffer(self.replay_buffer_size)
    self.optimizer = optim.RMSprop(self.network.parameters(),
                                   lr=self.learning_rate,
                                   alpha=self.gradient_momentum,
                                   eps=self.stabilizer)
    self.loss_criterion = nn.SmoothL1Loss()  # Huber loss
    print(self.network)

    self.writer = SummaryWriter(comment='dqn' + self.env_name)
    self.total_rewards = []
    self.best_mean_reward = None
    self.steps = 0
def __init__(self, env_name):
    self.env = wrappers.make_env(env_name)
    self.env_name = env_name
    # assumes a CUDA-capable GPU is available
    self.device = torch.device('cuda')
    self.rollout_length = 2  # n-step returns

    # seed everything for reproducibility
    torch.manual_seed(2)
    np.random.seed(2)
    self.env.seed(2)
    self.env.action_space.seed(2)
    self.env.observation_space.seed(2)

    # hyperparameters
    self.learning_rate = 1e-4
    self.gamma = 0.99
    self.batch_size = 32
    self.replay_start_size = 10000
    self.replay_buffer_size = 10000
    self.update_target_interval = 1000
    self.epsilon_start = 1.0
    self.epsilon_end = 0.02
    self.epsilon_period = 100000

    # prioritized replay parameters
    self.alpha = 0.6
    self.beta_start = 0.4
    self.beta_period = 100000
    self.beta = self.beta_start

    self.network = dqn_model.DuelingDQNModel(self.env.observation_space.shape,
                                             self.env.action_space.n).to(self.device)
    self.target_network = dqn_model.DuelingDQNModel(self.env.observation_space.shape,
                                                    self.env.action_space.n).to(self.device)
    self.replay_buffer = replay_buffer.NStepPriorityBuffer(
        self.replay_buffer_size, self.rollout_length, self.gamma, self.alpha)
    self.optimizer = optim.Adam(self.network.parameters(), lr=self.learning_rate)
    print(self.network)

    self.writer = SummaryWriter(comment='combineddqn' + self.env_name)
    self.total_rewards = []
    self.best_mean_reward = None
    self.steps = 0
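# A minimal sketch (the method name is an assumption, not from this file) of
# how beta is typically annealed toward 1.0 over beta_period steps so that
# the importance-sampling correction of prioritized replay becomes exact
# late in training:
def update_beta(self):
    frac = min(1.0, self.steps / self.beta_period)
    self.beta = self.beta_start + frac * (1.0 - self.beta_start)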
def __init__(self, env_name):
    self.env = wrappers.make_env(env_name)
    # assumes a CUDA-capable GPU is available
    self.device = torch.device('cuda')
    self.env_name = env_name

    # seed everything for reproducibility
    torch.manual_seed(2)
    np.random.seed(2)
    self.env.seed(2)
    self.env.action_space.seed(2)
    self.env.observation_space.seed(2)

    # hyperparameters
    self.replay_buffer_size = 10000
    self.replay_start_size = 10000
    self.update_interval = 1000
    self.learning_rate = 1e-4
    self.gamma = 0.99
    self.batch_size = 32

    n_actions = self.env.action_space.n
    input_shape = self.env.observation_space.shape
    self.network = dqn_model.DQNNoisyModel(input_shape, n_actions).to(self.device)
    self.target_network = dqn_model.DQNNoisyModel(input_shape, n_actions).to(self.device)
    self.replay_buffer = replay_buffer.ReplayBuffer(self.replay_buffer_size)
    self.optimizer = optim.Adam(self.network.parameters(), lr=self.learning_rate)
    print(self.network)

    self.writer = SummaryWriter(comment='noisy_dqn' + env_name)
    self.total_rewards = []
    self.best_mean_reward = None
    self.steps = 0
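# DQNNoisyModel's layers are not shown in this section. A minimal sketch of
# the kind of layer it presumably uses (independent-Gaussian NoisyLinear,
# after Fortunato et al. 2017); an illustration, not this repo's code:
import torch
import torch.nn as nn

class NoisyLinear(nn.Linear):
    def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
        super().__init__(in_features, out_features, bias=bias)
        # learnable noise scales, plus non-learnable noise-sample buffers
        self.sigma_weight = nn.Parameter(
            torch.full((out_features, in_features), sigma_init))
        self.register_buffer("epsilon_weight",
                             torch.zeros(out_features, in_features))
        if bias:
            self.sigma_bias = nn.Parameter(
                torch.full((out_features,), sigma_init))
            self.register_buffer("epsilon_bias", torch.zeros(out_features))

    def forward(self, x):
        # resample the noise on every forward pass; exploration comes from
        # these perturbations instead of epsilon-greedy action selection
        self.epsilon_weight.normal_()
        bias = self.bias
        if bias is not None:
            self.epsilon_bias.normal_()
            bias = bias + self.sigma_bias * self.epsilon_bias
        weight = self.weight + self.sigma_weight * self.epsilon_weight
        return nn.functional.linear(x, weight, bias)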
    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)


if __name__ == "__main__":
    mkdir('.', 'checkpoints')
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda")
    parser.add_argument("--env", default=DEFAULT_ENV_NAME,
                        help="Name of the environment, default=" + DEFAULT_ENV_NAME)
    parser.add_argument("--reward", type=float, default=MEAN_REWARD_GOAL,
                        help="Mean reward goal to stop training, default=%.2f" % MEAN_REWARD_GOAL)
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = wrappers.make_env(args.env)
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    writer = SummaryWriter(comment="-" + args.env)
    print(net)

    buffer = ExperienceBuffer(REPLAY_BUFFER_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
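# The first two lines of the block above are the tail of a calc_loss function
# whose head is not shown in this section. A minimal sketch of the head they
# presumably follow, assuming the conventional DQN batch layout; the variable
# names are assumptions inferred from the tail:
def calc_loss(batch, net, tgt_net, device="cpu"):
    states, actions, rewards, dones, next_states = batch
    states_v = torch.tensor(np.array(states, copy=False)).to(device)
    next_states_v = torch.tensor(np.array(next_states, copy=False)).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)

    # Q(s, a) for the actions actually taken
    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    # max_a' Q_tgt(s', a'), zeroed for terminal states and kept out of the
    # graph so gradients flow only through the online network
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0
    # ...followed by the two Bellman-target lines shown above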
        print(s)
        return True
    return False


# **********************************************************************************************************************
# *                                                  1000 episodes                                                    *
# **********************************************************************************************************************
if __name__ == "__main__":
    print('\n\n*******************************************************')
    print("* Random model's playing 1000 episodes of MsPacman... *")
    print('*******************************************************\n')

    DEFAULT_ENV_NAME = "MsPacmanNoFrameskip-v4"
    env = wrappers.make_env(DEFAULT_ENV_NAME)

    # create / empty the bug log
    f = open('bug_log_Random.txt', 'w+')
    f.close()

    for game in range(1000):  # 1000 episodes, matching the banner above
        state = env.reset()
        total_reward = 0.0
        # wait for the game to start
        for i in range(65):
            state, reward, is_done, _ = env.step(0)
        bug_flags = [False, False, False, False]
        count_actions = 0
        while True:
def main(cuda: bool, env_name: str, reward_stop: float, render: bool,
         weights_fn: str, fps: float, epsilon_fixed: float, no_learn: bool):
    device = torch.device("cuda" if cuda else "cpu")

    # create environment
    env: gym.Env = wrappers.make_env(env_name)

    # create both neural networks
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    if weights_fn:
        assert os.path.isfile(weights_fn), "File {0} does not exist.".format(weights_fn)
        state_dict = torch.load(weights_fn, map_location=device)
        net.load_state_dict(state_dict)
        tgt_net.load_state_dict(state_dict)

    # create summary writer for tensorboard
    writer = SummaryWriter(comment="-" + env_name)

    # create buffer and agent and init epsilon
    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer, render=render)
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards: List[float] = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward: Optional[float] = None

    while True:
        frame_idx += 1

        # update epsilon
        if epsilon_fixed:
            epsilon = epsilon_fixed
        else:
            epsilon = max(EPSILON_FINAL,
                          EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

        # play one step
        t_step_0 = time.time()
        reward = agent.play_step(net, epsilon, device)
        if fps:
            while 1 / (time.time() - t_step_0) > fps:
                time.sleep(0.01)

        if reward is not None:
            # add reward to total and calculate mean
            total_rewards.append(reward)
            mean_reward = np.mean(total_rewards[-100:])

            # meter speed
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()

            # print and write information
            print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s"
                  % (frame_idx, len(total_rewards), float(mean_reward),
                     epsilon, speed))
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", mean_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)

            if best_mean_reward is None or best_mean_reward < mean_reward:
                torch.save(net.state_dict(), env_name + "-best.dat")
                if best_mean_reward is not None:
                    print("Best mean reward updated %.3f -> %.3f, model saved"
                          % (best_mean_reward, float(mean_reward)))
                best_mean_reward = float(mean_reward)
            if mean_reward > reward_stop:
                print("Solved in {0} frames!".format(frame_idx))
                break

        if len(buffer) < REPLAY_START_SIZE or no_learn:
            continue

        # sync target net with training net
        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()

    writer.close()
"--model", required=False, help="Model file to load") parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME, help="Environment name to use, default=" + DEFAULT_ENV_NAME) args = parser.parse_args() env_name = DEFAULT_ENV_NAME if args.env is None else args.env device = torch.device("cpu" if (args.mode is None) or ( args.mode == "c") else "cuda") env = wrappers.make_env(env_name, lives=True, fire=True) print("{} environment".format(env_name)) print(env.unwrapped.get_action_meanings()) model = DEFAULT_MODEL_NAME if (args.model is None) else args.model net = VanillaDQN(env.observation_space.shape, env.action_space.n).to(device) net.load_state_dict(torch.load(model, map_location=torch.device(device))) total_reward = 0.0 total_steps = 0 obs = env.reset() net.eval() with torch.no_grad(): while True: start_ts = time.time()
DEFAULT_ENV_NAME = "PongNoFrameskip-v4" FPS = 25 if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", required=True, help="Model file to load") parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME, help="Environment name to use, default=" + DEFAULT_ENV_NAME) parser.add_argument("-r", "--record", help="Directory to store video recording") parser.add_argument("--no-visualize", default=True, action='store_false', dest='visualize', help="Disable visualization of the game play") args = parser.parse_args() env = wrappers.make_env(args.env) if args.record: env = gym.wrappers.Monitor(env, args.record) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n) net.load_state_dict(torch.load(args.model, map_location=lambda storage, loc: storage)) state = env.reset() total_reward = 0.0 c = collections.Counter() while True: start_ts = time.time() if args.visualize: env.render() state_v = torch.tensor(np.array([state], copy=False)) q_vals = net(state_v).data.numpy()[0]
def make_env():
    return wrappers.make_env(DEFAULT_ENV_NAME)
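# wrappers.make_env itself is not shown anywhere in this section. A minimal
# sketch of an equivalent Atari preprocessing stack, using standard gym
# wrappers (assuming a recent gym) rather than this repo's own, so purely
# an illustration; the function name and default env are assumptions:
import gym

def make_env_sketch(env_name="PongNoFrameskip-v4"):
    env = gym.make(env_name)
    # frame-skip 4, 84x84 grayscale observations, as in the DQN papers;
    # requires a NoFrameskip base environment
    env = gym.wrappers.AtariPreprocessing(env)
    # stack the last 4 frames so a single state encodes motion
    env = gym.wrappers.FrameStack(env, 4)
    return env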
if __name__ == "__main__": params = common.HYPERPARAMS['pong'] parser = argparse.ArgumentParser() parser.add_argument("--cuda", default=True, action="store_true", help="Enable cuda") parser.add_argument("--n", default=1, help="how many steps to unroll from the bellman equation") parser.add_argument("--double", default=False, action="store_true", help="Enable double DQN") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") unrolling_steps = int(args.n) if args.n else 1 print('bellman unrolling steps: '+ str(unrolling_steps)) double = args.double if args.double else False #env = gym.make(params['env_name']) #ptan.common.wrappers.wrap_dqn(env) env = wrappers.make_env(params['env_name']) writer = SummaryWriter(comment="-" + params['run_name'] + "-basic") net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = ptan.agent.TargetNet(net) selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) epsilon_tracker = common.EpsilonTracker(selector, params) agent = ptan.agent.DQNAgent(net, selector, device=device) exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=unrolling_steps) buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) frame_idx = 0 eval_states = None with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: while True: frame_idx += 1 buffer.populate(1)#where all the magic happens!
default=is_cuda, action="store_true", help="Enable cuda") parser.add_argument("--env", default=DEFAULT_ENV_NAME, help="Name of the environment, default=" + DEFAULT_ENV_NAME) args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") make_dir(out_dir) # save models make_dir(data_dir) # save datas print('device:', device, 'BETA: ', BETA, 'NAME: ', NAME) print('clip_reward:', IS_CLIP_REWARD, 'clip_loss:', IS_CLIP_LOSS, 'is_huber:', is_huber) env = wrappers.make_env(args.env, SKIP_FRAME, STACK_FRAME) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) buffer = ExperienceBuffer(REPLAY_SIZE) agent = Agent(env, buffer) epsilon = EPSILON_START optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) scheduler = StepLR(optimizer, step_size=10000, gamma=LR_GAMMA) optimizer_tgt = optim.Adam(tgt_net.parameters(), lr=LEARNING_RATE) total_rewards, mean_return, train_times, total_losses = [], [], [], [] frame_idx, ts_frame, episode, ep_time, ep_loss, best_mean_reward = 0, 0, 0, 0, 0, -21 ts = time.time()
parser.add_argument("-r", "--reward", help="set the reward bound for specific env:", type=int) args = parser.parse_args() # ------------------------------ # using the args # Make our device the GPU device = torch.device("cpu" if (args.mode is None) or ( args.mode == "c") else "cuda") # Create the environment with wrappers env = wrappers.make_env(DEFAULT_ENV_NAME, lives=False, fire=True) print(env.unwrapped.get_action_meanings()) if args.reward is not None: REWARD_BOUND = args.reward # Initialise the neural network, which will try to learn the Q values # Initialise the target network, which provides a copy of the network weights # from previous training iterations if args.model == 0 or args.model is None: print("Using vanilla network") net = nnmodels.VanillaDQN(env.observation_space.shape, env.action_space.n).to(device) tgtNet = nnmodels.VanillaDQN(env.observation_space.shape, env.action_space.n).to(device) # -------------------------
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            return True
    return False


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-m1", "--model1", required=True,
                        help="Model player 1 file to load")
    parser.add_argument("-m2", "--model2", required=True,
                        help="Model player 2 file to load")
    parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME,
                        help="Environment name to use, default=" + DEFAULT_ENV_NAME)
    args = parser.parse_args()

    env = wrappers.make_env(args.env, gui=True, scenario="multi_side_ai")
    net1 = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    net2 = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    net1.load_state_dict(torch.load(args.model1,
                                    map_location=lambda storage, loc: storage))
    net2.load_state_dict(torch.load(args.model2,
                                    map_location=lambda storage, loc: storage))

    state1 = env.reset()
    state2 = state1
    total_reward1 = 0.0
    total_reward2 = 0.0
    counter1 = collections.Counter()
    counter2 = collections.Counter()
    epsilon = 0.2
    frame = 0

    while True:
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", required=True, help="Model file to load") parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME, help="Environment name to use, default=" + DEFAULT_ENV_NAME) args = parser.parse_args() env = wrappers.make_env(args.env, gui=True, scenario="side1_pass", variations=4) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n) net.load_state_dict( torch.load(args.model, map_location=lambda storage, loc: storage)) state = env.reset() total_reward = 0.0 c = collections.Counter() while True: start_ts = time.time() state_v = torch.tensor(np.array([state], copy=False)) q_vals = net(state_v).data.numpy()[0] action = np.argmax(q_vals) c[action] += 1