Example #1
def do_q_learning(env, reward_function, train_episodes, figure=False):
    alpha = 0.01
    gamma = 0.9
    epsilon = 0.1
    policy = DQNPolicy(env, lr=alpha, gamma=gamma, input=2,
                       output=4)  # 4 output actions: up, right, down, left
    replay_buffer = ReplayBuffer()
    # Play with a random policy and see
    # run_current_policy(env.env, policy)
    agg_interval = 100
    avg_history = {'episodes': [], 'timesteps': [], 'reward': []}
    # Train the network to predict actions for each of the states
    for episode_i in range(train_episodes):
        episode_timestep = 0
        episode_reward = 0.0
        env.__init__()
        # todo : the first current state should be 0
        cur_state = env.cur_state
        counter = 0
        done = False
        while not done:
            # Limit each episode to 30 steps
            counter += 1
            done = counter >= 30

            # todo : check if this line is working
            action = policy.select_action(cur_state.reshape(1, -1), epsilon)

            # take action in the environment
            next_state = env.step(action)
            reward = reward_function(next_state)

            # add the transition to replay buffer
            replay_buffer.add(cur_state, action, next_state, reward, done)

            # sample minibatch of transitions from the replay buffer
            # the sampling is done every timestep and not every episode
            sample_transitions = replay_buffer.sample()

            # update the policy using the sampled transitions
            policy.update_policy(**sample_transitions)

            episode_reward += reward
            episode_timestep += 1

            cur_state = next_state

        avg_history['episodes'].append(episode_i + 1)
        avg_history['timesteps'].append(episode_timestep)
        avg_history['reward'].append(episode_reward)

        learning_policy_progress.update()

    if figure:
        plt.plot(avg_history['episodes'], avg_history['reward'])
        plt.title('Reward')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.show()
    return policy.q_model
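
Example #1 only shows how the buffer is used: add() takes (cur_state, action, next_state, reward, done) tuples and sample() returns something that can be splatted into policy.update_policy(). A minimal sketch consistent with that usage follows; the capacity, the batch size, and the exact dictionary keys (taken from the update_policy signature in Example #7) are assumptions, not part of the original code.

import random

import numpy as np


class ReplayBuffer:
    """Minimal buffer sketch matching the calls made in Example #1 (assumed)."""

    def __init__(self, capacity=10000, batch_size=32):
        self.capacity = capacity
        self.batch_size = batch_size
        self.buffer = []

    def add(self, cur_state, action, next_state, reward, done):
        # evict the oldest transition once the buffer is full
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append((cur_state, action, next_state, reward, done))

    def sample(self):
        # sample with replacement so this also works while the buffer is still small
        batch = random.choices(self.buffer, k=self.batch_size)
        cur_states, actions, next_states, rewards, dones = map(np.array, zip(*batch))
        # keys mirror update_policy(cur_states, actions, next_states, rewards, dones)
        return {'cur_states': cur_states, 'actions': actions,
                'next_states': next_states, 'rewards': rewards, 'dones': dones}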
Example #2
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 learning_rate=5e-4,
                 update_every=4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int, optional): number of nodes in each hidden layer
            buffer_size (int, optional): replay buffer size
            batch_size (int, optional): minibatch size
            gamma (float, optional): discount factor
            tau (float, optional): for soft update of target parameters
            learning_rate (float, optional): learning rate
            update_every (int, optional): how often to update the network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = learning_rate
        self.update_every = update_every

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        model_params = [state_size, action_size, seed, hidden_layers]
        self.qnetwork_local = QNetwork(*model_params).to(self.device)
        self.qnetwork_target = QNetwork(*model_params).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)
        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 checkpoint_file='checkpoint.pth'):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            buffer_size(int):  replay buffer size
            batch_size(int):   minibatch size
            gamma:             discount factor
            tau:               for soft update of target parameters
            lr:                learning rate 
            update_every:      how often to update the network
            
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.checkpoint_file = checkpoint_file
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed,
                                   self.device)
        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0
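
Both constructors above keep a local and a target QNetwork together with a tau value described as being "for soft update of target parameters". The update step itself is not part of the excerpt; a sketch of the usual Polyak-style soft update this setup implies (method name and placement on the agent class are assumptions) would be:

    def soft_update(self, local_model, target_model, tau):
        """Polyak-average the local network into the target network."""
        # theta_target <- tau * theta_local + (1 - tau) * theta_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)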
Example #4
def test_buffer_replace():
    shape = (2, 2)
    capacity = 2
    buffer = ReplayBuffer(capacity)
    for i in range(10):
        x = onp.ones(shape) * i
        a, r = i, i
        discount = 1.0
        timestep = dm_env.TimeStep(dm_env.StepType.FIRST, r, discount, x)
        buffer.add(timestep, a, timestep)
        logging.debug("i: {},  r: {}, len(buffer): {}".format(
            i, capacity, len(buffer)))
    # make sure the buffer recycles if adding more elements than its capacity
    assert len(buffer) == capacity
    # make sure the oldest elements are recycled
    assert onp.array_equal(
        onp.array([buffer[i].s for i in range(len(buffer))]),
        onp.array([[[8.0, 8.0], [8.0, 8.0]], [[9.0, 9], [9.0, 9.0]]],
                  dtype=onp.float32),
    )
    assert onp.array_equal(
        onp.array([buffer[i].r for i in range(len(buffer))]),
        onp.array([8.0, 9.0], dtype=onp.float32),
    )
    assert onp.array_equal(
        onp.array([buffer[i].a for i in range(len(buffer))]),
        onp.array([8.0, 9.0], dtype=onp.float32),
    )
    # try sampling with n < len(buffer)
    batch = buffer.sample(1)
    assert len(batch[0]) == 1
    logging.debug(batch)
    # try sampling with n == len(buffer)
    batch = buffer.sample(2)
    assert len(batch[0]) == len(buffer)
    logging.debug(batch)
    # try sampling with n > len(buffer)
    batch = buffer.sample(3)
    assert len(batch[0]) == len(buffer)
    logging.debug(batch)
    return
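
The test only pins down the buffer's contract: it is bounded by capacity, indexable, evicts the oldest transitions first, and sample(n) clamps n to the current size. A sketch of a circular buffer that would satisfy these assertions follows; the Transition fields beyond .s, .a, and .r (and the choice to store the next observation) are assumptions.

import random
from collections import namedtuple

import numpy as onp

# field names match the attributes the test reads: .s, .a and .r
Transition = namedtuple("Transition", ["s", "a", "r", "s_next"])


class ReplayBuffer:
    """Circular buffer sketch consistent with test_buffer_replace() (assumed)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self._storage = []

    def __len__(self):
        return len(self._storage)

    def __getitem__(self, idx):
        return self._storage[idx]

    def add(self, timestep, action, next_timestep):
        # evict the oldest transition once the buffer is at capacity
        if len(self._storage) >= self.capacity:
            self._storage.pop(0)
        self._storage.append(
            Transition(timestep.observation, action,
                       next_timestep.reward, next_timestep.observation))

    def sample(self, n):
        # clamp n so that oversampling simply returns the whole buffer
        batch = random.sample(self._storage, min(n, len(self._storage)))
        return tuple(onp.array(field) for field in zip(*batch))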
Example #5
import torch
import torch.optim as optim
from numpy import savetxt

from dqn import QLearner, compute_td_loss, ReplayBuffer

USE_CUDA = torch.cuda.is_available()

env_id = "PongNoFrameskip-v4"  # established environment that will be played
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

num_frames = 1000000  # total number of frames to learn from
batch_size = 32  # number of samples fed to the model per update
gamma = 0.99  # discount factor for future rewards
record_idx = 10000

replay_initial = 10000  # number of frames collected before learning starts
replay_buffer = ReplayBuffer(100000)
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(
    torch.load("model_pretrained.pth",
               map_location='cpu'))  # load the pretrained model weights

target_model = QLearner(env, num_frames, batch_size, gamma,
                        replay_buffer)  # target network
target_model.copy_from(model)

optimizer = optim.Adam(model.parameters(),
                       lr=0.0001)  # Adam optimizer with the chosen learning rate
if USE_CUDA:
    model = model.cuda()  # move the models to the GPU
    target_model = target_model.cuda()
    print("Using cuda")
Example #6
        total_step, total_reward))


# In[]:

cp_alpha = 0.001
cp_gamma = 0.95
cp_epsilon = 0.05
cp_avg_history = {'episodes': [], 'timesteps': [], 'reward': []}
agg_interval = 1
avg_reward = 0.0
avg_timestep = 0

# initialize policy and replay buffer
cp_policy = DQNPolicy(cp_env, lr=cp_alpha, gamma=cp_gamma)
replay_buffer = ReplayBuffer()
cp_start_episode = 0

# Play with a random policy and see
# run_current_policy(cp_env.env, cp_policy)

cp_train_episodes = 200
pbar_cp = tqdm(total=cp_train_episodes)

# In[]:

# Train the network to predict actions for each of the states
for episode_i in range(cp_start_episode, cp_start_episode + cp_train_episodes):
    episode_timestep = 0
    episode_reward = 0.0
Example #7
update_episode = 5
weight_decay = 10

avg_history = {'episodes': [], 'timesteps_unweighted': [], 'timesteps_weighted': [],
               'unweighted_reward': [], 'weighted_reward': [],
               'loss_unweighted': [], 'loss_weighted': []}
agg_interval = 10

# initialize policy and replay buffer
policy = DQN(input_size=env.observation_space.shape[0], output_size=env.action_space.n, hidden_size=24)

# TODO: check whether this performs better with a smaller hidden size
policy_weighted = DQN(input_size=env.observation_space.shape[0], output_size=env.action_space.n, hidden_size=24)
optimizer_weighted = Adam(policy_weighted.parameters(), lr=learning_rate_weighted)

replay_buffer = ReplayBuffer()
replay_buffer_weighted = ReplayBuffer()

optimizer = Adam(policy.parameters(), lr=learning_rate_unweighted)
start_episode = 0

# Play with a random policy and see
# run_current_policy(env.env, policy)

train_episodes = 2000

# In[]:


# Update the policy using a TD(0) target
def update_policy(cur_states, actions, next_states, rewards, dones):
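
Example #7 is cut off before the body of update_policy. Purely as an illustration of what a TD(0)-style DQN update with that signature could look like (everything below, including the helper name and the extra policy/optimizer/gamma arguments, is an assumption and not the original body):

import torch
import torch.nn.functional as F


def update_policy_sketch(cur_states, actions, next_states, rewards, dones,
                         policy, optimizer, gamma=0.99):
    # TD(0) target: r + gamma * max_a' Q(s', a'), with bootstrapping masked
    # out on terminal transitions
    cur_states = torch.as_tensor(cur_states, dtype=torch.float32)
    next_states = torch.as_tensor(next_states, dtype=torch.float32)
    actions = torch.as_tensor(actions, dtype=torch.int64)
    rewards = torch.as_tensor(rewards, dtype=torch.float32)
    dones = torch.as_tensor(dones, dtype=torch.float32)

    q_values = policy(cur_states).gather(1, actions.unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        next_q = policy(next_states).max(dim=1)[0]
    target = rewards + gamma * next_q * (1.0 - dones)

    loss = F.mse_loss(q_values, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()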
Example #8
env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env, frame_stack=USE_FRAME_STACK)
env = wrap_pytorch(env)

train_num_frames = 5000000
sample_num_frames = 50000
batch_size = 32
gamma = 0.99
target_update = 50000
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 1000000
replay_initial = 10000
learning_rate = 1e-5
train_replay_buffer = ReplayBuffer(100000)
analysis_replay_buffer = ReplayBuffer(100000)

policy_model = QLearner(env, train_num_frames, batch_size, gamma,
                        train_replay_buffer)
target_model = QLearner(env, train_num_frames, batch_size, gamma,
                        train_replay_buffer)
target_model.load_state_dict(policy_model.state_dict())
target_model.eval()

optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if USE_CUDA:
    policy_model = policy_model.to(device)
    target_model = target_model.to(device)
Example #9
    plt.figure(2)
    plt.plot([reward[0] for reward in all_rewards], [reward[1] for reward in all_rewards])
    plt.xlabel('Frame #')
    plt.ylabel('Episode Reward')
    plt.savefig(f'rewards_lr={lr}.pdf')

USE_CUDA = torch.cuda.is_available()

# Set up game
env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

replay_buffer = ReplayBuffer(replay_buff_size)                                  # Buffer size
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)             # Create model
model.load_state_dict(torch.load("model_pretrained.pth", map_location='cpu'))
model.eval()

target_model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)      # Create target model
target_model.copy_from(model)

# Optimize model's parameters
optimizer = optim.Adam(model.parameters(), lr=lr)
if USE_CUDA:
    model = model.cuda()
    target_model = target_model.cuda()
    print("Using cuda")

# Negative exponential epsilon schedule: start by exploring, then shift to exploiting as frame_indx grows
Example #10
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    #device = torch.device('cpu')
    print(device)
    env = make_env(seed)
    state_shape = env.observation_space.shape
    n_actions = env.action_space.n
    state = env.reset()

    agent = DQNAgent(state_shape, n_actions, epsilon=0.9).to(device)
    #agent.load_state_dict(torch.load('dqn.weights'))
    target_network = DQNAgent(state_shape, n_actions).to(device)
    target_network.load_state_dict(agent.state_dict())
    opt = torch.optim.Adam(agent.parameters(), lr=1e-4)
    exp_replay = ReplayBuffer(buffer_size)

    print('test_buffer')
    for i in range(100):
        play_and_record(state, agent, env, exp_replay, n_steps=10**2)
        if len(exp_replay) == buffer_size:
            break
    print(len(exp_replay))

    state = env.reset()
    for step in trange(step, total_steps + 1):

        agent.epsilon = linear_decay(init_epsilon, final_epsilon, step,
                                     decay_steps)

        # play
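
The loop above calls a linear_decay helper that is not shown in the excerpt. A sketch consistent with how it is invoked, linear_decay(init_epsilon, final_epsilon, step, decay_steps), is given below; the exact semantics are an assumption:

def linear_decay(init_val, final_val, step, total_steps):
    # anneal linearly from init_val to final_val over total_steps, then hold final_val
    frac = min(step / total_steps, 1.0)
    return init_val + frac * (final_val - init_val)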
Example #11
import math
import pickle as pkl

import torch
import torch.optim as optim

from dqn import QLearner, compute_td_loss, ReplayBuffer

USE_CUDA = torch.cuda.is_available()

env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

num_frames = 2000000
batch_size = 32
gamma = 0.99

replay_initial = 20000
replay_buffer = ReplayBuffer(200000)
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
optimizer = optim.Adam(model.parameters(), lr=0.00001)
if USE_CUDA:
    model = model.cuda()

epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000
epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

losses = []
all_rewards = []
episode_reward = 0
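
For reference, the lambda above anneals epsilon exponentially from epsilon_start toward epsilon_final with a time constant of epsilon_decay frames. A quick standalone check of a few values (not part of the original script):

import math

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 30000
epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

print(epsilon_by_frame(0))       # 1.0 at the start
print(epsilon_by_frame(30000))   # ~0.37 after one time constant
print(epsilon_by_frame(150000))  # ~0.017, close to epsilon_final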
Example #12
def train(env_id,
          lr=1e-4,
          gamma=0.99,
          memory_size=1000,
          batch_size=32,
          train_timesteps=10000,
          train_start_time=1000,
          target_update_frequency=1000,
          init_epsilon=1,
          final_epsilon=0.1,
          epsilon_decay=300,
          model_path=None):

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    LOG_PATH = f'logs/dqn_log_{env_id}.txt'

    if get_env_type(env_id) == 'atari':
        env = make_atari(env_id)
        env = wrap_deepmind(env)
        env = wrap_pytorch(env)

        model_type = 'conv'
    else:
        env = gym.make(env_id)

        model_type = 'linear'

    obs_shape = env.observation_space.shape
    num_actions = env.action_space.n

    memory = ReplayBuffer(memory_size)

    agent = DQN(obs_shape, num_actions, lr, gamma, device, model_type)
    policy = EpsilonGreedy(agent, num_actions, init_epsilon, final_epsilon,
                           epsilon_decay)

    # populate replay memory
    obs = env.reset()
    for t in range(train_start_time):

        # uniform random policy
        action = random.randrange(num_actions)
        next_obs, reward, done, _ = env.step(action)
        memory.add(obs, action, reward, next_obs, done)

        obs = next_obs

        if done:
            # start a new episode
            obs = env.reset()

    # for monitoring
    ep_num = 1
    ep_start_time = 1
    episode_reward = 0
    reward_list = []

    # train start
    obs = env.reset()
    for t in tqdm.tqdm(range(1, train_timesteps + 1)):

        # choose action
        action = policy.act(obs, t)
        next_obs, reward, done, _ = env.step(action)
        memory.add(obs, action, reward, next_obs, done)

        obs = next_obs

        # sample batch transitions from memory
        transitions = memory.sample(batch_size)
        # train
        loss = agent.train(transitions)

        # record reward
        episode_reward += reward

        # update target network at every C timesteps
        if t % target_update_frequency == 0:
            agent.update_target()

        if done:
            # start a new episode
            obs = env.reset()

            # write log
            with open(LOG_PATH, 'a') as f:
                f.write(f'{ep_num}\t{episode_reward}\t{ep_start_time}\t{t}\n')

            if model_path is not None:
                # save model
                info = {
                    'epoch': ep_num,
                    'timesteps': t,
                }
                agent.save(model_path, info)

            ep_num += 1
            ep_start_time = t + 1
            reward_list.append(episode_reward)
            episode_reward = 0
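
Since train() above handles environment construction, buffer pre-filling, logging, and checkpointing, a hypothetical invocation would look like the following; the checkpoint filename is a placeholder, and the logs/ directory used by LOG_PATH is assumed to exist:

if __name__ == '__main__':
    # train a DQN on Pong and save a checkpoint after every finished episode
    train('PongNoFrameskip-v4',
          lr=1e-4,
          train_timesteps=10000,
          model_path='dqn_pong_checkpoint.pth')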