Example #1
    def __init__(self, env_name, params):
        self.env = envs.make(env_name)
        self.params = params
        self.action_bound = self.env.action_bound[1]
        
        self.iterations = params["iterations"]
        self.mem_len = params["mem_len"]
        self.seed = params["seed"]
        self.render = params["render"]
        self.log_interval = params["log_interval"]
        self.warmup = params["warmup"]
        self.batch_size = params["batch_size"]
        self.save = params["save"]

        hidden_dim = params["hidden_dim"]
        state_dim = self.env.observation_space
        action_dim = self.env.action_space
        cuda = params["cuda"]
        network_settings = params["network_settings"]

        actor = utils.Actor(state_dim, hidden_dim, action_dim)
        target_actor = utils.Actor(state_dim, hidden_dim, action_dim)
        critic = utils.Critic(state_dim+action_dim, hidden_dim, 1)
        target_critic = utils.Critic(state_dim+action_dim, hidden_dim, 1)
        self.agent = sw.Sleepwalk(actor, 
                                critic,
                                target_actor, 
                                target_critic,
                                network_settings,
                                GPU=cuda)

        self.noise = utils.OUNoise(action_dim)
        self.noise.set_seed(self.seed)
        self.memory = utils.ReplayMemory(self.mem_len)

        self.pol_opt = torch.optim.Adam(actor.parameters())
        self.crit_opt = torch.optim.Adam(critic.parameters())

        if cuda:
            self.Tensor = torch.cuda.FloatTensor
        else:
            self.Tensor = torch.Tensor
        
        if self.render:
            self.env.init_rendering()
        
        self.best = None

        # initialize experiment logging
        self.logging = params["logging"]
        if self.logging:
            self.directory = os.getcwd()
            filename = self.directory + "/data/qprop.csv"
            with open(filename, "w") as csvfile:
                self.writer = csv.writer(csvfile)
                self.writer.writerow(["episode", "reward"])
                self.train()
        else:
            self.train()
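
The examples in this collection construct utils.ReplayMemory with a capacity argument but do not show the class itself. Purely as a point of reference, a minimal sketch of such a buffer, assuming a simple push/sample interface over transition tuples (the method names are an assumption, not taken from the projects above), could look like this:

import random
from collections import deque


class ReplayMemory:
    """Fixed-capacity transition buffer (a minimal sketch, not the original class)."""

    def __init__(self, capacity):
        # deque drops the oldest transitions automatically once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniform random mini-batch, returned as per-field tuples
        batch = random.sample(self.buffer, batch_size)
        return tuple(zip(*batch))

    def __len__(self):
        return len(self.buffer)
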
Example #2
def play(save_path):
    ''' Loads the network from save_path and plays a game of Pong. '''

    # Initialize the Pong gym environment, set seeds
    env = gym.make('Pong-v0')
    replay_memory = u.ReplayMemory()
    G = tf.Graph()
    with G.as_default():
        # Import TF graph
        saver = tf.train.import_meta_graph(save_path + '.meta',
                                           clear_devices=True)
        # Get input/output tensors
        X = G.get_tensor_by_name('X:0')
        Y = G.get_tensor_by_name('Y:0')
        # Initialize a CPU-only TF session so play can be done without taking GPU resources
        sess_config = tf.ConfigProto(device_count={'CPU': 1, 'GPU': 0})
        with tf.Session(config=sess_config) as sess:
            print('Reloading parameters...')
            saver.restore(sess, save_path)
            # Iterate over episodes
            while True:
                obs = u.preprocess_image(env.reset())
                for i in range(3):
                    replay_memory.add_frame(
                        np.zeros((160 // DOWNSAMPLE, 160 // DOWNSAMPLE)))
                replay_memory.add_frame(obs)

                # Iterate over frames
                done = False
                while not done:
                    # Feed state into DQN
                    s = np.stack(
                        [replay_memory.frames[i] for i in range(-4, 0)],
                        axis=-1).reshape(1, 160 // DOWNSAMPLE,
                                         160 // DOWNSAMPLE, 4)
                    y = sess.run(Y, feed_dict={X: s})

                    # Decide on action greedily
                    a = np.argmax(y) + 1

                    # Take action, observe environment, reward
                    obs, r, done, _ = env.step(a)
                    for i in range(STEPS_TO_SKIP):
                        obs, r, done_temp, _ = env.step(1)
                        if done_temp:
                            done = True
                    env.render()

                    # Add new frame to replay memory
                    replay_memory.add_frame(u.preprocess_image(obs))

                q = input('Play again? ')
                if q in ['', 'y', 'Y']:
                    pass
                else:
                    env.render(close=True)
                    break
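
Example #2 builds the DQN input by stacking the four most recent preprocessed frames held in replay_memory.frames. As a standalone illustration of that state construction (the helper name and the DOWNSAMPLE value below are assumptions, not part of the original project):

import numpy as np

DOWNSAMPLE = 2  # assumed value; the original project defines this constant elsewhere


def stack_last_four(frames, downsample=DOWNSAMPLE):
    """Stack the four most recent frames into a (1, H, W, 4) network input."""
    side = 160 // downsample
    state = np.stack([frames[i] for i in range(-4, 0)], axis=-1)
    return state.reshape(1, side, side, 4)


# Usage sketch: pad with three blank frames, then append the first real observation.
frames = [np.zeros((160 // DOWNSAMPLE, 160 // DOWNSAMPLE)) for _ in range(3)]
frames.append(np.random.rand(160 // DOWNSAMPLE, 160 // DOWNSAMPLE))
print(stack_last_four(frames).shape)  # (1, 80, 80, 4) with DOWNSAMPLE = 2
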
Example #3
    def __init__(self,
                 num_inputs,
                 action_space,
                 args,
                 writer=None,
                 outdir=None,
                 device=torch.device("cpu")):
        self.index = 0
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.writer = writer
        self.outdir = outdir
        self.batch_size = args.batch_size
        self.save_freq = args.save_freq

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = device

        self.replay_buffer = utils.ReplayMemory(capacity=args.buffer_max_size,
                                                seed=args.seed)

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2), as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = torch.optim.Adam([self.log_alpha],
                                                    lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = torch.optim.Adam(self.policy.parameters(),
                                                 lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = torch.optim.Adam(self.policy.parameters(),
                                                 lr=args.lr)
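
Example #3 synchronises the target critic with hard_update and keeps a tau coefficient, but the update helpers themselves are not shown. A common sketch of the two helpers, assuming the usual Polyak-averaging convention (this is an illustration, not the project's own code):

import torch


def hard_update(target, source):
    """Copy every parameter of the source network into the target network."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)


def soft_update(target, source, tau):
    """Polyak averaging: target <- (1 - tau) * target + tau * source."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)


# Usage sketch with two small networks
source_net = torch.nn.Linear(4, 2)
target_net = torch.nn.Linear(4, 2)
hard_update(target_net, source_net)
soft_update(target_net, source_net, tau=0.005)
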
Example #4
    def __init__(self, model_name, env):
        super(DQNAgent, self).__init__(model_name, env)
        self.episode = self.configs.episode
        self.batch_size = self.configs.batch_size
        self.gamma = self.configs.gamma
        self.eps_start = self.configs.eps_start
        self.eps_end = self.configs.eps_end
        self.eps_decay = self.configs.eps_decay
        self.target_update_episode = self.configs.target_update_episode

        self.model_path = self.configs.save_path
        self.save_episode = self.configs.save_episode
        self.plot_episode = self.configs.plot_episode

        self.policy_net = models.DQN(self.configs, env).to(self.device)
        self.target_net = models.DQN(self.configs, env).to(self.device)
        self.load_model(self.model_path)
        self.optimizer = optim.Adam(
            self.policy_net.parameters(),
            lr=self.configs.optimizer_lr,
            betas=(self.configs.optimizer_beta1, self.configs.optimizer_beta2),
            eps=self.configs.optimizer_eps,
            weight_decay=self.configs.optimizer_weight_decay)
        self.memory = utils.ReplayMemory(10000)
        self.num_random_choose = 0

        self.num_choice_per_dim = self.configs.num_choice_per_dim
        self.action_dim = env.action_spec().shape
        self.action_min = env.action_spec().minimum
        self.action_max = env.action_spec().maximum

        self.action_space = utils.enumerate(self.num_choice_per_dim,
                                            self.action_min, self.action_max)
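
Example #4 discretises a continuous action space with a utils.enumerate helper that is not shown here. One plausible sketch, assuming it returns every grid point with num_choice_per_dim evenly spaced values per action dimension (the function below is an assumption for illustration only):

import itertools
import numpy as np


def enumerate_actions(num_choice_per_dim, action_min, action_max):
    """Return every grid action with num_choice_per_dim values per dimension."""
    lows = np.asarray(action_min, dtype=float)
    highs = np.asarray(action_max, dtype=float)
    per_dim = [np.linspace(lo, hi, num_choice_per_dim) for lo, hi in zip(lows, highs)]
    return np.array(list(itertools.product(*per_dim)))


# Usage sketch: a 2-D action space in [-1, 1] x [0, 2] with 3 choices per dimension -> 9 actions
print(enumerate_actions(3, [-1.0, 0.0], [1.0, 2.0]).shape)  # (9, 2)
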
Example #5
def play(G, save_path):

    # Initialize the Pong gym environment, set seeds
    env = gym.make('Pong-v0')
    replay_memory = u.ReplayMemory()
    with G.as_default():
        # Get input/output tensors
        X = G.get_tensor_by_name('X:0')
        Y = G.get_tensor_by_name('Y:0')
        saver = tf.train.Saver(var_list=None, max_to_keep=5)
        # Initialize TF session
        with tf.Session() as sess:
            print('Reloading parameters...')
            saver.restore(sess, save_path)
            # Iterate over episodes
            while True:
                obs = u.preprocess_image(env.reset())
                for i in range(3):
                    replay_memory.add_frame(
                        np.zeros((160 // DOWNSAMPLE, 160 // DOWNSAMPLE)))
                replay_memory.add_frame(obs)

                # Iterate over frames
                done = False
                while not done:
                    # Feed state into DQN
                    s = np.stack(
                        [replay_memory.frames[i] for i in range(-4, 0)],
                        axis=-1).reshape(1, 160 // DOWNSAMPLE,
                                         160 // DOWNSAMPLE, 4)
                    y = sess.run(Y, feed_dict={X: s})

                    # Decide on action
                    a = np.argmax(y) + 1

                    # Take action, observe environment, reward
                    obs, r, done, _ = env.step(a)
                    for i in range(STEPS_TO_SKIP):
                        obs, r, done_temp, _ = env.step(1)
                        if done_temp:
                            done = True
                    env.render()

                    # Add new state/reward to replay memory
                    replay_memory.add_frame(u.preprocess_image(obs))

                q = input('play again?')
                if q in ['', 'y', 'Y']:
                    pass
                else:
                    env.render(close=True)
                    break
Example #6
    def __init__(self, agent_id, num_countries, replay_capacity, num_node_actions, num_global_actions, gamma, device):
        # more node features because we will add indicator of self country and ally countries
        num_node_features, num_edge_features = 4, 7

        # create two DQNs for stable learning
        self.policy_net = net.RecurGraphAgent(num_node_features, num_edge_features, num_node_actions, num_global_actions).to(device)
        self.target_net = net.RecurGraphAgent(num_node_features, num_edge_features, num_node_actions, num_global_actions).to(device)
        self.optimizer = torch.optim.RMSprop(self.policy_net.parameters())

        self.memory = utils.ReplayMemory(replay_capacity)

        # ensure they match
        self.target_net.load_state_dict(self.policy_net.state_dict())

        self.agent_id = agent_id
        self.num_countries = num_countries
        self.num_node_actions = num_node_actions
        self.num_global_actions = num_global_actions
        self.gamma = gamma
        self.device = device
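
Example #6 wires together a policy network, a target network, an RMSprop optimizer and a replay memory, but the Q-learning update itself is not shown. Purely as an illustration, here is a generic DQN update step that assumes flat tensor states and a batch of (state, action, reward, next_state, done) tensors, rather than the graph-structured inputs this agent actually feeds its networks:

import torch
import torch.nn.functional as F


def dqn_update(policy_net, target_net, optimizer, batch, gamma):
    """One Q-learning step on a batch of (state, action, reward, next_state, done) tensors."""
    state, action, reward, next_state, done = batch
    # Q(s, a) for the actions that were actually taken
    q_sa = policy_net(state).gather(1, action.unsqueeze(1)).squeeze(1)
    # Bootstrapped target from the frozen target network
    with torch.no_grad():
        next_q = target_net(next_state).max(1).values
        target = reward + gamma * (1.0 - done) * next_q
    loss = F.smooth_l1_loss(q_sa, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()


# Usage sketch with toy networks and a toy batch
policy = torch.nn.Linear(4, 3)
target = torch.nn.Linear(4, 3)
opt = torch.optim.RMSprop(policy.parameters())
toy_batch = (torch.randn(8, 4), torch.randint(0, 3, (8,)),
             torch.randn(8), torch.randn(8, 4), torch.zeros(8))
dqn_update(policy, target, opt, toy_batch, gamma=0.99)
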
Example #7
def play(env, args, transpose=True, fps=30, zoom=None, callback=None, keys_to_action=None):
    """Allows one to play the game using keyboard.
    To simply play the game use:
        play(gym.make("Pong-v4"))
    The above code also works if env is wrapped, so it is particularly useful
    for verifying that the frame-level preprocessing does not render the game
    unplayable.
    If you wish to plot real-time statistics as you play, you can use
    gym.utils.play.PlayPlot. Here is sample code for plotting the reward
    over the last 5 seconds of gameplay.
        def callback(obs_t, obs_tp1, action, rew, done, info):
            return [rew,]
        plotter = PlayPlot(callback, 30 * 5, ["reward"])
        env = gym.make("Pong-v4")
        play(env, callback=plotter.callback)
    Arguments
    ---------
    env: gym.Env
        Environment to use for playing.
    transpose: bool
        If True, the rendered observation is transposed before display.
        Defaults to True.
    fps: int
        Maximum number of steps of the environment to execute every second.
        Defaults to 30.
    zoom: float
        Scale each screen dimension by this factor.
    callback: callable or None
        If a callback is provided, it will be executed after
        every step. It takes the following input:
            obs_t: observation before performing action
            obs_tp1: observation after performing action
            action: action that was executed
            rew: reward that was received
            done: whether the environment is done or not
            info: debug info
    keys_to_action: dict: tuple(int) -> int or None
        Mapping from keys pressed to action performed.
        For example, if pressing 'w' and the space bar at the same time is
        supposed to trigger action number 2, then the keys_to_action dict
        would look like this:
            {
                # ...
                tuple(sorted((ord('w'), ord(' ')))): 2,
                # ...
            }
        If None, the default keys_to_action mapping for that env is used, if provided.
    """
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    data_list = []
    obs = env.reset(sess)
    rendered = env.env.render(mode='rgb_array')

    if keys_to_action is None:
        if hasattr(env.env, 'get_keys_to_action'):
            keys_to_action = env.env.get_keys_to_action()
        elif hasattr(env.env.unwrapped, 'get_keys_to_action'):
            keys_to_action = env.env.unwrapped.get_keys_to_action()
        else:
            assert False, env.env.spec.id + " does not have explicit key to action mapping, " + \
                          "please specify one manually"
    relevant_keys = set(sum(map(list, keys_to_action.keys()), []))

    video_size = [rendered.shape[1], rendered.shape[0]]
    if zoom is not None:
        video_size = int(video_size[0] * zoom), int(video_size[1] * zoom)

    pressed_keys = []
    running = True
    env_done = True

    screen = pygame.display.set_mode(video_size)
    clock = pygame.time.Clock()
    count = 0
    num_traj = 0
    while running:
        if env_done and count > 0:
            env_done = False
            num_traj += 1
            obs = env.reset(sess)
            print(num_traj, count)
            replay_mem = utils.ReplayMemory(len(data_list))
            for i in range(len(data_list)):
                action = data_list[i][0]
                obs = data_list[i][1]
                rew = data_list[i][2]
                terminal = data_list[i][3]
                replay_mem.add_experience(action=action,
                                          frame=obs[:, :, 0],
                                          reward=rew,
                                          terminal=terminal)
            pickle.dump(replay_mem, open("human_" + args.env +  "_" + str(num_traj) + ".pkl", "wb"), protocol=4)
        else:
            action = keys_to_action.get(tuple(sorted(pressed_keys)), 0)
            obs, rew, env_done, terminal, frame = env.step(sess, action)
            data_list.append([action, obs, rew, terminal])

            count += 1
        if obs is not None:
            rendered = env.env.render(mode='rgb_array')
            display_arr(screen, rendered, transpose=transpose, video_size=video_size)

        # process pygame events
        for event in pygame.event.get():
            # test events, set key states
            if event.type == pygame.KEYDOWN:
                if event.key in relevant_keys:
                    pressed_keys.append(event.key)
                elif event.key == 27:
                    running = False
            elif event.type == pygame.KEYUP:
                if event.key in relevant_keys:
                    pressed_keys.remove(event.key)
            elif event.type == pygame.QUIT:
                running = False
            elif event.type == VIDEORESIZE:
                video_size = event.size
                screen = pygame.display.set_mode(video_size)
                print(video_size)

        pygame.display.flip()
        clock.tick(fps)
    pygame.quit()
Example #8
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

logs = open(log_path, 'w')


transform = T.Compose([T.ToPILImage(),
                       T.ToTensor()])

policy_net = dqn.DQN(n_angle, n_actions, hidden_layer1_size, hidden_layer2_size).to(device)
target_net = dqn.DQN(n_angle, n_actions, hidden_layer1_size, hidden_layer2_size).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = utils.ReplayMemory(100000)


steps_done = 0

def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)
Example #9
    def __init__(self, env_name, params):
        # initialize environment
        self.env = envs.make(env_name)
        self.env_name = env_name

        # save important experiment parameters for the training loop
        self.iterations = params["iterations"]
        self.mem_len = params["mem_len"]
        self.seed = params["seed"]
        self.render = params["render"]
        self.log_interval = params["log_interval"]
        self.warmup = params["warmup"]
        self.batch_size = params["batch_size"]
        self.save = params["save"]

        # initialize DDPG agent using experiment parameters from config file
        self.action_bound = self.env.action_bound[1]
        state_dim = self.env.observation_space
        action_dim = self.env.action_space
        hidden_dim = params["hidden_dim"]
        cuda = params["cuda"]
        network_settings = params["network_settings"]
        actor = ddpg.Actor(state_dim, hidden_dim, action_dim)
        target_actor = ddpg.Actor(state_dim, hidden_dim, action_dim)
        critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
        target_critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
        self.agent = ddpg.DDPG(actor,
                               target_actor,
                               critic,
                               target_critic,
                               network_settings,
                               GPU=cuda)

        # initialize Ornstein-Uhlenbeck noise for random action exploration
        ou_scale = params["ou_scale"]
        ou_mu = params["ou_mu"]
        ou_sigma = params["ou_sigma"]
        self.noise = utils.OUNoise(action_dim,
                                   scale=ou_scale,
                                   mu=ou_mu,
                                   sigma=ou_sigma)
        self.noise.set_seed(self.seed)
        self.memory = utils.ReplayMemory(self.mem_len)

        self.pol_opt = torch.optim.Adam(actor.parameters())
        self.crit_opt = torch.optim.Adam(critic.parameters())

        # want to save the best policy
        self.best = None

        # send to GPU if flagged in experiment config file
        if cuda:
            self.Tensor = torch.cuda.FloatTensor
            self.agent = self.agent.cuda()
        else:
            self.Tensor = torch.Tensor

        if self.render:
            self.env.init_rendering()

        # initialize experiment logging. This wipes any previous file with the same name
        self.logging = params["logging"]
        if self.logging:
            self.directory = os.getcwd()
            filename = self.directory + "/data/ddpg.csv"
            with open(filename, "w") as csvfile:
                self.writer = csv.writer(csvfile)
                self.writer.writerow(["episode", "reward"])
                self.train()
        else:
            self.train()
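
Example #9 draws exploration noise from utils.OUNoise(action_dim, scale=..., mu=..., sigma=...), whose implementation is not included above. A sketch of one common Ornstein-Uhlenbeck noise class with that kind of constructor (the theta parameter and the exact update rule below are assumptions, not the project's code):

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise (a sketch)."""

    def __init__(self, action_dim, scale=0.1, mu=0.0, sigma=0.2, theta=0.15):
        self.action_dim = action_dim
        self.scale = scale
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.state = np.ones(action_dim) * mu

    def set_seed(self, seed):
        np.random.seed(seed)

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # Mean-reverting random walk: drift toward mu plus a Gaussian perturbation
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state * self.scale


# Usage sketch: add noise to a deterministic action during exploration
noise = OUNoise(action_dim=2, scale=0.1)
noise.set_seed(0)
noisy_action = np.tanh(np.zeros(2)) + noise.noise()
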
Example #10
def train():
    # Graph Part
    print("Graph initialization...")
    xdim = xtrim[1] - xtrim[0]
    ydim = ytrim[1] - ytrim[0]
    channel = 3
    num_action = env.action_space.n
    policy_net = NETWORK(ydim=ydim, xdim=xdim, channel=channel,
                        num_action=num_action,
                        learning_rate=learning_rate,
                        batch_size=batch_size)

    target_net = NETWORK(ydim=ydim, xdim=xdim, channel=channel,
                        num_action=num_action,
                        learning_rate=learning_rate,
                        batch_size=batch_size)
    policy_net.to(DEVICE)
    target_net.to(DEVICE)

    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Memory
    memory = utils.ReplayMemory(10000)

    # ETCs
    steps_done = 0
    episode_durations = []

    policy_net.float()
    target_net.float()

    print("Training Start.....")
    for episode in range(num_episodes):
        REWARD = 0
        previous_screenshot = utils.dimension_manipulation(env.reset()[xtrim[0]:xtrim[1], ytrim[0]:ytrim[1]])
        current_screenshot = previous_screenshot
        state = torch.from_numpy(current_screenshot - previous_screenshot).float().to(DEVICE)
        for t in count():
            #env.render()
            action = utils.select_action(state, steps_done, policy_net)
            observation, reward, done, _ = env.step(action.item())
            previous_screenshot = current_screenshot
            current_screenshot = utils.dimension_manipulation(observation[xtrim[0]:xtrim[1], ytrim[0]:ytrim[1]])

            if not done:
                next_status = torch.from_numpy(current_screenshot - previous_screenshot).float().to(DEVICE)
                REWARD += reward
            else:
                next_status = None
            memory.push(state,
                        action,
                        next_status,
                        torch.tensor(float(t + 1)).to(DEVICE)[None])
            state = next_status
            utils.optimize_model(policy_net, target_net, memory, batch_size)

            if done:
                utils.optimize_model(policy_net, target_net, memory, batch_size)
                episode_durations.append(t + 1)
                utils.plot_durations(episode_durations)
                if REWARD != 0:
                    print("\n########  Episode " + str(episode))
                    print("Duration : " + str(t + 1))
                    print("REWARD : " + str(REWARD))
                    print("loss : " + str(policy_net.loss.item()))
                break
        if episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
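
Example #8 and Example #10 both delegate the Bellman update to a utils.optimize_model(policy_net, target_net, memory, batch_size) helper that is not shown. A minimal sketch in the style of the PyTorch DQN tutorial, assuming the memory stores (state, action, next_state, reward) tuples with next_state set to None at episode end; the optimizer is passed explicitly here only to keep the sketch self-contained:

import torch
import torch.nn.functional as F

GAMMA = 0.99  # assumed discount factor


def optimize_model(policy_net, target_net, optimizer, memory, batch_size):
    """One DQN update from a uniform sample; does nothing until enough transitions exist."""
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)  # assumed: list of (state, action, next_state, reward)
    states, actions, next_states, rewards = zip(*transitions)

    state_batch = torch.cat(states)
    action_batch = torch.cat(actions)
    reward_batch = torch.cat(rewards)

    # Terminal transitions have next_state None and contribute no bootstrap term
    non_final_mask = torch.tensor([s is not None for s in next_states], dtype=torch.bool)
    non_final_next = torch.cat([s for s in next_states if s is not None])

    # Q(s, a) for the actions that were taken
    q_sa = policy_net(state_batch).gather(1, action_batch).squeeze(1)

    # max_a' Q_target(s', a'), zero for terminal states
    next_values = torch.zeros(batch_size)
    with torch.no_grad():
        next_values[non_final_mask] = target_net(non_final_next).max(1).values

    loss = F.smooth_l1_loss(q_sa, reward_batch + GAMMA * next_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
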
Example #11
def save_gif(gif_save_path, save_path):
    ''' Loads the network from save_path, plays a game of Pong, and saves it as a GIF. '''

    # Initialize the Pong gym environment, set seeds
    env = gym.make('Pong-v0')
    replay_memory = u.ReplayMemory()
    G = tf.Graph()
    gifwriter = matplotlib.animation.ImageMagickFileWriter(fps=20)
    plt.ioff()
    fig = plt.figure('Pong')
    gifwriter.setup(fig, gif_save_path, dpi=100)
    with G.as_default():
        # Import TF graph
        saver = tf.train.import_meta_graph(save_path + '.meta',
                                           clear_devices=False)
        G.device('/gpu:0')
        # Get input/output tensors
        X = G.get_tensor_by_name('X:0')
        Y = G.get_tensor_by_name('Y:0')
        # Initialize TF session
        sess_config = tf.ConfigProto(device_count={'CPU': 1, 'GPU': 1})
        with tf.Session(config=sess_config) as sess:
            print('Reloading parameters...')
            saver.restore(sess, save_path)
            # Play a single episode
            obs = env.reset()
            plt.clf()
            fig.clf()
            plt.imshow(obs)
            gifwriter.grab_frame()
            obs = u.preprocess_image(obs)
            for i in range(3):
                replay_memory.add_frame(
                    np.zeros((160 // DOWNSAMPLE, 160 // DOWNSAMPLE)))
            replay_memory.add_frame(obs)

            # Iterate over frames
            done = False
            f = 0
            while not done:
                f += 1
                print('Frame {}'.format(f))
                # Feed state into DQN
                s = np.stack([replay_memory.frames[i] for i in range(-4, 0)],
                             axis=-1).reshape(1, 160 // DOWNSAMPLE,
                                              160 // DOWNSAMPLE, 4)
                y = sess.run(Y, feed_dict={X: s})

                # Decide on action greedily
                a = np.argmax(y) + 1

                # Take action, observe environment, reward
                obs, r, done, _ = env.step(a)
                plt.clf()
                fig.clf()
                plt.imshow(obs)
                gifwriter.grab_frame()
                for i in range(STEPS_TO_SKIP):
                    obs, r, done_temp, _ = env.step(1)
                    plt.clf()
                    fig.clf()
                    plt.imshow(obs)
                    gifwriter.grab_frame()
                    if done_temp:
                        done = True


                # env.render()

                # Add new frame to replay memory
                replay_memory.add_frame(u.preprocess_image(obs))
            # Save gif
            gifwriter.finish()
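
Example #11 writes the GIF through matplotlib's ImageMagickFileWriter, which requires the external ImageMagick binary. If that dependency is unavailable, PillowWriter exposes the same setup/grab_frame/finish interface; a small standalone sketch (the random frames and the output filename below are arbitrary stand-ins for the real observations):

import matplotlib
matplotlib.use('Agg')  # render off-screen so no display is needed
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.animation import PillowWriter

fig = plt.figure('Pong')
writer = PillowWriter(fps=20)
writer.setup(fig, 'pong_demo.gif', dpi=100)  # arbitrary output path
for _ in range(5):
    fig.clf()
    plt.imshow(np.random.rand(210, 160, 3))  # toy frame in place of env output
    writer.grab_frame()
writer.finish()
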
Example #12
def train(G, max_episodes, save_path):
    '''
    Trains a DQN to play Pong. Periodically saves progress to a checkpoint file and saves plots of several metrics to monitor training.
        Input:
            G: computational graph by which the action-value function Q is calculated.
            max_episodes: the maximum number of episodes to run for before terminating training
            save_path: a file path to the location of the checkpoint files
        Output: none
    '''

    # Define some constants, lists, metrics, etc
    action_map = {1: 'x', 2: '^', 3: 'v'}  # Stay, up, down
    replay_memory = u.ReplayMemory(max_exp_len=REPLAY_MEM_LEN)
    step_list = []
    reward_list = []
    avg_reward = None
    val_Q_list = []
    episode_length_list = []
    episode_time_list = []
    avg_episode_length_list = []
    avg_episode_length = None
    episode_score_list = {'player': [], 'computer': []}
    X_val = u.load_validation_screens()

    # Initialize the Pong gym environment, set seeds
    env = gym.make('Pong-v0')
    np.random.seed(SEED)
    tf.set_random_seed(SEED)
    plt.ioff()

    # Gather screens

    # Initialize computational graph
    with G.as_default():
        # Get input/output tensors
        X = G.get_tensor_by_name('X:0')
        Y = G.get_tensor_by_name('Y:0')
        Q = G.get_tensor_by_name('Q:0')
        A = G.get_tensor_by_name('A:0')
        L = G.get_tensor_by_name('L:0')
        LR = G.get_tensor_by_name('LR:0')
        train_op = G.get_operation_by_name('TrainOp')

        saver = tf.train.Saver()

        # Initialize TF session
        with tf.Session() as sess:
            # Reload/initialize variables
            if RELOAD_PARAMETERS:
                print('Reloading from last checkpoint...')
                saver.restore(sess, save_path)
            else:
                print('Initializing variables...')
                sess.run(tf.global_variables_initializer())
            # Iterate over episodes
            global_steps = 0
            for episode in range(max_episodes):
                tic = time.time()
                obs = u.preprocess_image(env.reset())
                for i in range(3):
                    replay_memory.add_frame(
                        np.zeros((160 // DOWNSAMPLE, 160 // DOWNSAMPLE),
                                 dtype=bool))
                replay_memory.add_frame(obs)

                # Iterate over frames
                done = False
                frame = 0
                episode_score = [0, 0]
                while not done:
                    if (global_steps >= OBSERVE_STEPS):
                        # Feed state into DQN
                        s = np.stack(
                            [replay_memory.frames[i] for i in range(-4, 0)],
                            axis=-1).reshape(1, 160 // DOWNSAMPLE,
                                             160 // DOWNSAMPLE, 4)
                        y = sess.run(Y, feed_dict={X: s})

                        # Decide on action
                        epsilon = max(
                            MAX_EPSILON *
                            (1 - global_steps / EPSILON_ANNEALING_STEPS),
                            MIN_EPSILON)
                        if (np.random.rand() < epsilon):
                            a = np.random.choice([1, 2, 3])
                        else:
                            a = np.argmax(y) + 1
                    else:
                        a = np.random.choice([1, 2, 3])

                    # Take action, observe environment, reward
                    obs, r, done, _ = env.step(a)
                    r_sum = r
                    for i in range(STEPS_TO_SKIP):
                        obs, r, done_temp, _ = env.step(1)
                        r_sum += r
                        if done_temp:
                            done = True
                    if r_sum > 0:
                        episode_score[0] += int(r_sum)
                    elif r_sum < 0:
                        episode_score[1] -= int(r_sum)

                    # Add new state/reward to replay memory
                    replay_memory.add_frame(u.preprocess_image(obs))
                    experience = (np.stack(list(replay_memory.frames),
                                           axis=-1).astype(bool), a, r_sum,
                                  done)
                    replay_memory.add_exp(experience)

                    # Do training batch update
                    if (global_steps >= OBSERVE_STEPS):
                        S, A_, R, D = replay_memory.sample(BATCH_SIZE)
                        y2 = sess.run(Y, feed_dict={X: S[:, :, :, -4:]})
                        q = R + (1 - D) * GAMMA * np.max(y2, axis=1)
                        _, batch_loss = sess.run(
                            [train_op, L],
                            feed_dict={
                                X: S[:, :, :, -5:-1],
                                Q: q,
                                A: (A_ - 1),
                                LR: LEARNING_RATE
                            })
                        if np.isnan(batch_loss):
                            print('nan error, exiting training')
                            exit()
                        elif (np.mean(np.max(y2, axis=-1)) > 1e2):
                            print('unstable Q value, exiting training')
                            exit()

                        # Print updates
                        print(
                            'Episode: {}/{},\tframe: {},\tscore: {},\t<max(Q)>: {:.3e},\nmax(Q): {:.3e},\taction: {},\tcurrent std(Q)/mean(Q): {:.3e}'
                            .format(episode + 1, max_episodes,
                                    (frame + 1) * (STEPS_TO_SKIP + 1),
                                    episode_score, np.mean(np.max(y2,
                                                                  axis=-1)),
                                    np.max(y), action_map[a],
                                    np.std(y) / np.mean(y)))

                        # Plot frame-by-frame metrics
                        if avg_reward is None:
                            avg_reward = r_sum
                        else:
                            avg_reward = (1 -
                                          np.exp(-1 / 500)) * r_sum + np.exp(
                                              -1 / 500) * avg_reward
                        if (global_steps % PLOT_EVERY_N_STEPS == 0):
                            step_list.append(global_steps)
                            reward_list.append(10 * avg_reward)
                            y_val = sess.run(Y, feed_dict={X: X_val})
                            val_Q_list.append(np.mean(np.max(y_val, axis=-1)))
                            u.plot_metrics(step_list, 'PongMetrics',
                                           'Pong Metrics', 'Global step', '',
                                           (val_Q_list, 'Validation <max(Q)>'),
                                           (reward_list, '10*<R>'))
                    else:
                        print('Observation step {}/{}'.format(
                            global_steps, OBSERVE_STEPS))

                    # Update state variables
                    global_steps += 1
                    frame += 1

                # Save parameters at end of episode, plot episode metrics
                print('Saving parameters...')
                saver.save(sess, SAVE_PATH)
                episode_length_list.append(frame * (STEPS_TO_SKIP + 1) / 1000)
                if avg_episode_length is None:
                    avg_episode_length = frame * (STEPS_TO_SKIP + 1)
                else:
                    avg_episode_length = (1 - np.exp(-1 / 10)) * frame * (
                        STEPS_TO_SKIP + 1) + np.exp(
                            -1 / 10) * avg_episode_length
                avg_episode_length_list.append(avg_episode_length / 1000)
                toc = time.time()
                episode_time_list.append((toc - tic) / 60)
                episode_score_list['player'].append(episode_score[0])
                episode_score_list['computer'].append(episode_score[1])
                u.plot_metrics(range(episode + 1), 'EpisodeLength',
                               'Episode Length', 'Episode', 'Steps/1000',
                               (episode_length_list, 'Steps/episode'),
                               (avg_episode_length_list, 'Average'))
                u.plot_metrics(range(episode + 1), 'EpisodeScore',
                               'Episode Score', 'Episode', 'Score',
                               (episode_score_list['player'], 'Player'),
                               (episode_score_list['computer'], 'Computer'))
                u.plot_metrics(range(episode + 1), 'EpisodeTime',
                               'Episode time', 'Episode', 'Time (min)',
                               (episode_time_list, 'Episode time'))
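
In the batch update above, S[:, :, :, -5:-1] is fed as the state and S[:, :, :, -4:] as the next state, which implies that replay_memory.sample returns stacks of five consecutive frames per experience. A hedged sketch of a sampler consistent with the add_frame/add_exp calls in this example (the class below is an illustration, not the project's u.ReplayMemory):

import random
from collections import deque
import numpy as np


class FrameReplayMemory:
    """Keeps the last five frames plus a bounded list of (frame_stack, action, reward, done) tuples."""

    def __init__(self, max_exp_len=100000):
        self.frames = deque(maxlen=5)            # frames t-4 .. t, stacked into each experience
        self.experiences = deque(maxlen=max_exp_len)

    def add_frame(self, frame):
        self.frames.append(frame)

    def add_exp(self, experience):
        self.experiences.append(experience)

    def sample(self, batch_size):
        batch = random.sample(self.experiences, batch_size)
        S = np.stack([exp[0] for exp in batch])                  # (batch, H, W, 5)
        A = np.array([exp[1] for exp in batch])                  # actions
        R = np.array([exp[2] for exp in batch], dtype=np.float32)  # summed rewards
        D = np.array([exp[3] for exp in batch], dtype=np.float32)  # done flags
        return S, A, R, D
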
Example #13
    for agent_start in agent_start_list:
        reward_mean_dict[agent_start] = 0
        reward_std_dict[agent_start] = 0
        step_counts_mean_dict[agent_start] = 0
        reward_max_dict[agent_start] = 0
        reward_accum_dict[agent_start] = 0
        step_counts_min_dict[agent_start] = 0

        test_reward_mean_dict[agent_start] = 0
        test_reward_std_dict[agent_start] = 0
        test_step_counts_mean_dict[agent_start] = 0
        test_reward_max_dict[agent_start] = 0
        test_reward_accum_dict[agent_start] = 0
        test_step_counts_min_dict[agent_start] = 0

    memory = utils.ReplayMemory(10000)
    test_memory = utils.ReplayMemory(1)  # dummy replay memory for test (to reuse code)
    optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
    steps_done = 0
    steps_done_test = 0

    min_reward = round(-env.steps_remaining * 0.1, 1)
    global_reward_max = min_reward
    test_global_reward_max = min_reward

    total_train_reward_accum = 0
    total_test_reward_accum = 0

    eval_ep_batch = 10

    for epoch in range(EPOCHS):
Example #14
def agent_training(agent_file_path, agent_file_name, fig_path, num_steps_train_total = 5000):
    # training parameters
    num_epochs = 5
    num_steps_train_epoch = num_steps_train_total // num_epochs  # steps per epoch of training
    num_steps_test = 100
    update_frequency = 10  # step frequency of model training/updates

    epsilon = 0.15  # fraction of the time we take a random action, to encourage exploration
    epsilon_steps = 1000  # decay steps
    epsilon_min = 0.1
    epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

    # memory settings
    max_memory_size = 10000
    min_memory_size = 60  # number needed before model training starts

    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True, force_fps=True, state_preprocessor=process_state)
    my_agent = init_agent(env)

    memory = utils.ReplayMemory(max_memory_size, min_memory_size)
    env.init()

    # Logging configuration and figure plotting
    logging.basicConfig(filename='../learning.log', filemode='w',
                        level=logging.DEBUG, format='%(levelname)s:%(message)s')
    logging.info('========================================================')
    logging.info('Training started for total training steps: '+str(num_steps_train_total)+'.\n')
    learning_rewards = [0]
    testing_rewards = [0]

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train_epoch:
            episode_reward = 0.0
            my_agent.start_episode()

            while not env.game_over() and steps < num_steps_train_epoch:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=epsilon)
                memory.add([state, action, reward, env.game_over()])

                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(my_agent)

                    if loss is not None:
                        losses.append(loss)
                        epsilon = max(epsilon_min, epsilon - epsilon_rate)

                episode_reward += reward
                steps += 1

            if steps < num_steps_train_epoch:
                learning_rewards.append(episode_reward)

            if num_episodes % 5 == 0:
                # print "Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward)
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        # print "Train Epoch {:02d}: Epsilon {:0.4f} | Avg. Loss {:0.3f} | Avg. Reward {:0.3f}\n"\
        #     .format(epoch, epsilon, np.mean(losses), np.sum(rewards) / num_episodes)
        logging.info("Train Epoch {:02d}: Epsilon {:0.4f} | Avg. Loss {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, epsilon, np.mean(losses), np.sum(rewards) / num_episodes))

        steps, num_episodes = 0, 0
        losses, rewards = [], []

        # display the screen
        # env.display_screen = True

        # slow it down so we can watch it fail!
        # env.force_fps = True

        # testing loop
        while steps < num_steps_test:
            episode_reward = 0.0
            my_agent.start_episode()

            while not env.game_over() and steps < num_steps_test:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=0.05)

                episode_reward += reward
                testing_rewards.append(testing_rewards[-1]+reward)
                steps += 1

                # done watching after 500 steps.
                if steps > 500:
                    env.display_screen = False

            if num_episodes % 5 == 0:
                # print "Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward)
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            if steps < num_steps_test:
                testing_rewards.append(episode_reward)

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        # print "Test Epoch {:02d}: Best Reward {:0.3f} | Avg. Reward {:0.3f}\n"\
        #     .format(epoch, np.max(rewards), np.sum(rewards) / num_episodes)
        logging.info("Test Epoch {:02d}: Best Reward {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, np.max(rewards), np.sum(rewards) / num_episodes))

    logging.info("Training complete.\n\n")
    plot_figure(fig_path, learning_rewards, 'reward', 'reward_in_training', num_steps_train_total)
    plot_figure(fig_path, testing_rewards, 'reward', 'reward_in_testing', num_steps_train_total)

    save_agent(my_agent, agent_file_path, agent_file_name)
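
Example #14 relies on a two-argument utils.ReplayMemory(max_memory_size, min_memory_size) whose train_agent_batch only starts training once enough samples have accumulated. A sketch of that gating pattern (the agent.train_on_batch call and the batch size below are assumptions for illustration):

import random
from collections import deque


class GatedReplayMemory:
    """Replay memory that refuses to train until a minimum number of samples exists (a sketch)."""

    def __init__(self, max_size, min_size):
        self.min_size = min_size
        self.buffer = deque(maxlen=max_size)

    def add(self, transition):
        self.buffer.append(transition)

    def train_agent_batch(self, agent, batch_size=32):
        # Return None (no loss) until the warm-up threshold is met, mirroring the
        # "if loss is not None" check in the training loop above
        if len(self.buffer) < self.min_size:
            return None
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        return agent.train_on_batch(batch)  # hypothetical agent method, not from the original code
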
Example #15
import sys
sys.path.append('.')

import gym
import torch
from torch import optim

import agent, train, utils

# hyperparameters
replay_mem_size = int(1e6)
mini_batch_size = 32
num_episodes = int(2e3)

agt = agent.DQNAgent()
replay_memory = utils.ReplayMemory(replay_mem_size, mini_batch_size)
obs_history = utils.ObsHistory()
optimizer = optim.RMSprop(agt.qnet.parameters())

env = gym.envs.make('PongNoFrameskip-v4')

for episode in range(num_episodes):  # loop over episodes
    obs_init = env.reset()  # reset environment to start new episode
    obs_history.reset(obs_init)  # reset observations for new episode
    done = False

    print('Episode #{}'.format(episode))
    if episode % 10 == 9:
        torch.save(agt.qnet.state_dict(), 'dqn_agt.pt')

    cumulative_loss = 0
Example #16
def train(G, max_episodes, save_path):
    '''
    Trains a DQN to play Pong.
    '''

    # Define some constants, lists, metrics, etc
    action_map = {1: 'x', 2: '^', 3: 'v'}  # Stay, up, down
    replay_memory = u.ReplayMemory(max_exp_len=REPLAY_MEM_LEN)
    step_list = []
    reward_list = []
    val_Q_list = []
    episode_length_list = []
    avg_episode_length_list = []
    episode_score_list = {'player': [], 'computer': []}
    X_val = u.load_validation_screens()

    # Initialize the Pong gym environment, set seeds
    env = gym.make('Pong-v0')
    np.random.seed(SEED)
    tf.set_random_seed(SEED)
    plt.ioff()

    # Gather screens

    # Initialize computational graph
    with G.as_default():
        # Get input/output tensors
        X = G.get_tensor_by_name('X:0')
        Y = G.get_tensor_by_name('Y:0')

        # Append loss function to graph
        Q = tf.placeholder(dtype=tf.float32, shape=[None], name='Q')
        A = tf.placeholder(dtype=tf.int32, shape=[None], name='A')
        mask = tf.one_hot(A, depth=3, dtype=tf.float32, axis=-1)
        L = tf.reduce_mean(tf.square(tf.reduce_sum(mask * Y, axis=-1) - Q),
                           name='L')

        # Define optimizer, training op, gradient clipping, etc.
        if not RELOAD_PARAMETERS:
            optimizer = tf.train.AdamOptimizer(LEARNING_RATE, name='Adam')
        else:
            optimizer = G.get_operation_by_name('Adam')
        saver = tf.train.Saver()
        # Initialize TF session
        with tf.Session() as sess:
            # Reload/initialize variables
            if RELOAD_PARAMETERS:
                print('Reloading from last checkpoint...')
                saver.restore(sess, save_path)
            else:
                print('Initializing variables...')
            gradients, variables = zip(*optimizer.compute_gradients(L))
            train_op = optimizer.apply_gradients(zip(gradients, variables))
            sess.run(tf.global_variables_initializer())
            # Iterate over episodes
            global_steps = 0
            for episode in range(max_episodes):
                obs = u.preprocess_image(env.reset())
                for i in range(3):
                    replay_memory.add_frame(
                        np.zeros((160 // DOWNSAMPLE, 160 // DOWNSAMPLE)))
                replay_memory.add_frame(obs)

                # Iterate over frames
                done = False
                frame = 0
                episode_score = [0, 0]
                while not done:
                    if (global_steps >= OBSERVE_STEPS):
                        # Feed state into DQN
                        s = np.stack(
                            [replay_memory.frames[i] for i in range(-4, 0)],
                            axis=-1).reshape(1, 160 // DOWNSAMPLE,
                                             160 // DOWNSAMPLE, 4)
                        y = sess.run(Y, feed_dict={X: s})

                        # Decide on action
                        epsilon = max(
                            MAX_EPSILON *
                            (1 - global_steps / EPSILON_ANNEALING_STEPS),
                            MIN_EPSILON)
                        if (np.random.rand() < epsilon):
                            a = np.random.choice([1, 2, 3])
                        else:
                            a = np.argmax(y) + 1
                    else:
                        a = np.random.choice([1, 2, 3])

                    # Take action, observe environment, reward
                    obs, r, done, _ = env.step(a)
                    r_sum = r
                    for i in range(STEPS_TO_SKIP):
                        obs, r, done_temp, _ = env.step(1)
                        r_sum += r
                        if done_temp:
                            done = True
                    if r_sum > 0:
                        episode_score[0] += r_sum
                    elif r_sum < 0:
                        episode_score[1] -= r_sum

                    # Add new state/reward to replay memory
                    replay_memory.add_frame(u.preprocess_image(obs))
                    experience = (np.stack(list(replay_memory.frames),
                                           axis=-1), a, r_sum, done)
                    replay_memory.add_exp(experience)

                    # Do training batch update
                    if (global_steps >= OBSERVE_STEPS):
                        S, A_, R, D = replay_memory.sample(BATCH_SIZE)
                        y2 = sess.run(Y, feed_dict={X: S[:, :, :, -4:]})
                        q = R + (1 - D) * GAMMA * np.max(y2, axis=1)
                        _, batch_loss = sess.run([train_op, L],
                                                 feed_dict={
                                                     X: S[:, :, :, -5:-1],
                                                     Q: q,
                                                     A: (A_ - 1)
                                                 })
                        if np.isnan(batch_loss):
                            print('nan error, exiting training')
                            exit()
                        elif (np.mean(np.max(y2, axis=-1)) > 1e2):
                            print('unstable Q value, exiting training')
                            exit()

                        # Print updates
                        print(
                            'Episode: {}/{},\tframe: {},\treward: {},\t<max(Q)>: {:.3e},\nmax(Q): {:.3e},\taction: {},\tcurrent std(Q)/mean(Q): {:.3e}'
                            .format(episode + 1, max_episodes, frame + 1,
                                    int(r_sum), np.mean(np.max(y2, axis=-1)),
                                    np.max(y), action_map[a],
                                    np.std(y) / np.mean(y)))

                        # Plot frame-by-frame metrics
                        if global_steps == OBSERVE_STEPS:  # first training step after warm-up
                            avg_reward = r_sum
                        else:
                            avg_reward = (1 -
                                          np.exp(-1 / 500)) * r_sum + np.exp(
                                              -1 / 500) * avg_reward
                        if (global_steps % PLOT_EVERY_N_STEPS == 0):
                            step_list.append(global_steps)
                            reward_list.append(10 * avg_reward)
                            y_val = sess.run(Y, feed_dict={X: X_val})
                            val_Q_list.append(np.mean(np.max(y_val, axis=-1)))
                            u.plot_metrics(step_list, 'PongMetrics',
                                           'Pong Metrics', 'Global step', '',
                                           (val_Q_list, 'Validation <max(Q)>'),
                                           (reward_list, '10*<R>'))
                    else:
                        print('Observation step {}/{}'.format(
                            global_steps, OBSERVE_STEPS))

                    # Update state variables
                    global_steps += 1
                    frame += 1

                # Save parameters at end of episode, plot episode metrics
                saver.save(sess, SAVE_PATH)
                episode_length_list.append(frame * (STEPS_TO_SKIP + 1) / 1000)
                if episode == 0:
                    avg_episode_length = frame * (STEPS_TO_SKIP + 1)
                else:
                    avg_episode_length = (1 - np.exp(-1 / 10)) * frame * (
                        STEPS_TO_SKIP + 1) + np.exp(
                            -1 / 10) * avg_episode_length
                avg_episode_length_list.append(avg_episode_length / 1000)
                episode_score_list['player'].append(episode_score[0])
                episode_score_list['computer'].append(episode_score[1])
                u.plot_metrics(range(episode + 1), 'EpisodeLength',
                               'Episode Length', 'Episode', 'Length/1000',
                               (episode_length_list, 'Episode length'),
                               (avg_episode_length_list, 'Average'))
                u.plot_metrics(range(episode + 1), 'EpisodeScore',
                               'Episode Score', 'Episode', 'Score',
                               (episode_score_list['player'], 'Player'),
                               (episode_score_list['computer'], 'Computer'))