Python DQNPolicy примеры использования

Язык программирования: Python

Пространство имен/Пакет: dqn

Класс/Тип: DQNPolicy

Примеров на hotexamples.com: 5

Python DQNPolicy - 5 примеров найдено. Это лучшие примеры Python кода для dqn.DQNPolicy, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

DQNPolicy(5)

select_action(3)

update_policy(3)

Пример #1

Показать файл

def do_q_learning(env, reward_function, train_episodes, figure=False,
                  max_steps=30):
    """Train a DQN policy on ``env`` and return its Q-model.

    For every episode the environment is re-initialised, the policy acts
    epsilon-greedily for at most ``max_steps`` steps, each transition is
    pushed to a replay buffer, and the policy is updated once per step from
    a freshly sampled minibatch.

    Args:
        env: Environment exposing ``cur_state`` and ``step(action)``;
            ``step`` must return the next state.
        reward_function: Callable mapping a state to a scalar reward.
        train_episodes: Number of episodes to train for.
        figure: When true, plot the per-episode reward with matplotlib
            after training.
        max_steps: Number of steps per episode (defaults to 30, the value
            that used to be hard-coded).

    Returns:
        The ``q_model`` attribute of the trained ``DQNPolicy``.
    """
    alpha = 0.01
    gamma = 0.9
    epsilon = 0.1
    policy = DQNPolicy(env, lr=alpha, gamma=gamma, input=2,
                       output=4)  # 4 actions output, up, right, down, left
    replay_buffer = ReplayBuffer()
    # Play with a random policy and see
    # run_current_policy(env.env, policy)
    avg_history = {'episodes': [], 'timesteps': [], 'reward': []}

    # Train the network to predict actions for each of the states
    for episode_i in range(train_episodes):
        episode_timestep = 0
        episode_reward = 0.0
        # NOTE(review): the constructor is re-run to reset the environment;
        # this only works if ``env.__init__`` takes no required arguments.
        env.__init__()
        # todo : the first current state should be 0
        cur_state = env.cur_state

        # The loop index doubles as the per-episode timestep counter.
        for episode_timestep in range(1, max_steps + 1):
            # The transition taken on the last allowed step is terminal.
            done = episode_timestep >= max_steps

            # todo : check if this line is working
            action = policy.select_action(cur_state.reshape(1, -1), epsilon)

            # take action in the environment
            next_state = env.step(action)
            reward = reward_function(next_state)

            # add the transition to replay buffer
            replay_buffer.add(cur_state, action, next_state, reward, done)

            # sample minibatch of transitions from the replay buffer
            # the sampling is done every timestep and not every episode
            sample_transitions = replay_buffer.sample()

            # update the policy using the sampled transitions
            policy.update_policy(**sample_transitions)

            episode_reward += reward
            cur_state = next_state

        avg_history['episodes'].append(episode_i + 1)
        avg_history['timesteps'].append(episode_timestep)
        avg_history['reward'].append(episode_reward)

        # Module-level progress object (presumably a tqdm bar - confirm);
        # advanced once per finished episode.
        learning_policy_progress.update()

    if figure:
        plt.plot(avg_history['episodes'], avg_history['reward'])
        plt.title('Reward')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.show()
    return policy.q_model

Пример #2

Показать файл

Файл: utils.py Проект: JarryLc/ECE448-AI

def loadmodel(modelfile: str, env: gym.Env, statesize, actionsize):
    """Load a saved model from ``modelfile`` and wrap it in a policy object.

    The policy type is chosen from the file name: a name containing
    ``.model`` yields a ``DQNPolicy``; a name containing ``.npy`` yields a
    ``TabQPolicy``.

    Args:
        modelfile: Path of the serialized model.
        env: Environment handed to ``TabQPolicy`` (unused for ``DQNPolicy``).
        statesize: State size forwarded to ``DQNPolicy``.
        actionsize: Action count forwarded to either policy.

    Returns:
        The constructed ``DQNPolicy`` or ``TabQPolicy``.

    Raises:
        ValueError: If the file name matches neither known extension.
    """
    # NOTE(security): torch.load unpickles the file and can execute
    # arbitrary code; only call this on model files from a trusted source.
    if '.model' in modelfile:
        # PyTorch
        pt_model = torch.load(modelfile)
        return DQNPolicy(pt_model, statesize, actionsize, 0, None)

    if '.npy' in modelfile:
        # Numpy
        # NOTE(review): read with torch.load despite the .npy name -
        # presumably the table was written with torch.save; confirm before
        # switching to numpy.load.
        table = torch.load(modelfile)
        return TabQPolicy(env, table.shape[:-1], actionsize, 0, None,
                          model=table)

    raise ValueError(
        "Unknown model file extension: {!r}".format(modelfile))

Пример #3

Показать файл

    print("Total timesteps = {}, total reward = {}".format(
        total_step, total_reward))


# In[]:

# Hyperparameters: cp_alpha and cp_gamma are handed to DQNPolicy below as
# ``lr`` and ``gamma``.
cp_alpha, cp_gamma, cp_epsilon = 0.001, 0.95, 0.05

# Episode range covered by the training loop.
cp_start_episode = 0
cp_train_episodes = 200

# Bookkeeping for the reward / timestep history.
agg_interval = 1
avg_reward, avg_timestep = 0.0, 0
cp_avg_history = dict(episodes=[], timesteps=[], reward=[])

# initialize policy and replay buffer
cp_policy = DQNPolicy(cp_env, lr=cp_alpha, gamma=cp_gamma)
replay_buffer = ReplayBuffer()

# Play with a random policy and see
# run_current_policy(cp_env.env, cp_policy)

# Progress bar sized to the number of training episodes.
pbar_cp = tqdm(total=cp_train_episodes)

# In[]:

# Train the network to predict actions for each of the states
for episode_i in range(cp_start_episode, cp_start_episode + cp_train_episodes):
    episode_timestep = 0
    episode_reward = 0.0

Пример #4

Показать файл

Файл: dqn_mountain_car_v0.py Проект: shaktikshri/adaptiveSystems

# In[]:

env = gym.make('MountainCar-v0')
# env = gym.make('CartPole-v0')

# TODO : Can change these parameters
lr, gamma, hidden_dim = 0.001, 0.99, 24
# TODO : Need to do the epsilon decay
epsilon, epsilon_decay, epsilon_min = 1, 0.05, 0.01
mod_episode = 10
total_train_episodes = 500

# Policy and replay buffer used by the training loop.
env_policy = DQNPolicy(env, lr, gamma, hidden_dim)
replay_buffer = ReplayBuffer()

# play with a random policy
# run_current_policy(env_policy, env, env.reset())

# In[]:
# Per-episode training history, one list per tracked quantity.
history = {'reward': [], 'timesteps': [], 'episodes': []}

for episode in range(total_train_episodes):
    done = False
    # print('Epoch :', episode + 1)
    ep_reward = 0
    ep_timesteps = 0
    cur_state = env.reset()

Пример #5

Показать файл

                np.dot(gamma_matrix.reshape(1, -1),
                       basis.pdf(trajectory).reshape(-1, 1))[0][0])
        values_all_trajectories.append(values)
        trajectory_progress.update()
    values_all_trajectories = np.array(values_all_trajectories)
    # values_all_trajectories is a 5000*225 array
    values_per_basis = values_all_trajectories.mean(axis=0)
    return values_per_basis


# In[]:

# Value of state (0, 0) as per the best policy; a (225,) vector.
true_values_per_basis = run_trajectories(true_policy)

# First candidate policy: the q_model of a newly constructed DQNPolicy.
policy = DQNPolicy(
    env, 0.01, 0.9, input=2, output=4
).q_model

# In[]:

# Do the inductive step again and again
for iterations in range(1):
    # print('Running Trajectory for the policy')
    trajectory_progress = tqdm(total=5000)
    list_of_values_per_basis = np.append(list_of_values_per_basis,
                                         run_trajectories(policy).reshape(
                                             1, -1),
                                         axis=0)
    # it is the value of state(0,0) as per the candidate policies
    # list_of_values_per_basis is a K*225 dimensional matrix where K is the number of candidate policies

    # Now need to do Linear Program