Example #1
 def showExamples(self, env):
     init_state = np.array([0,29,82])
     gridworld = env.genGridWorld()
     count = 0
     frames = []
     for i in range(3):
         step = 0
         pre_states = []
         self.state = init_state[i]
         converge = False
         while True:
             pre_states.append(self.state)
             action = np.argmax(self.policy[self.state])
             _, next_state, _, _ = env.P[self.state][action][0]
             ax, fig = env.showWorld(gridworld, tlt="Round {0}, Step {1}".format(i+1, step))
             env.movingAgent(gridworld, ax, self.state, pre_states)
             image = fig_to_image(fig)
             frames.append(image)
             plt.close()
             self.state = next_state
             if converge:
                 count += 1
                 break
             if self.state == self.terminate_state:
                 converge = True
             step += 1
             count += 1
     file_dir = get_dirs(os.path.join(RESULT_PATH, "DP"))
     imageio.mimsave(os.path.join(file_dir, "pi_test.gif"), frames, fps=5)
Example #2
 def run_policy(self, env):
     frames = []
     state = env.reset()
     policy = self.epsilon_greedy_policy(env.action_space.n)
     for _ in itertools.count():
         frames.append(env.render(mode='rgb_array'))
         action_prob = policy(state)
         action = np.random.choice(np.arange(len(action_prob)), p=action_prob)
         next_state, _, done, _ = env.step(action)
         if done:
             env.close()
             break
         state = next_state
     save_path = get_dirs(os.path.join(RESULT_PATH, "Qlearning"))
     imageio.mimsave(os.path.join(save_path, "vfa_car_qlearning.gif"), frames, fps=30)
Example #3
def main():
    # create a Mountain Car environment
    env = MountainCarEnv()
    # create an approximator for the agent
    approximator = Approximator(env)
    # create an agent that performs Q-learning with value function approximation
    agent_qlearning_vfa = Agent_QLearning_VFA(approximator)
    # run Q-learning with function approximation and collect episode statistics
    Q_stats = agent_qlearning_vfa.q_learning_fa(env=env, num_episodes=200)
    #
    plot_episode_stats(stats=Q_stats)
    save_path = get_dirs(os.path.join(RESULT_PATH, "Qlearning"))
    plt.savefig(os.path.join(save_path, "vfa_rewards.png"))
    # plt.show()
    #
    agent_qlearning_vfa.run_policy(env)
Example #4
 def q_learning_fa(self, env, num_episodes, discount_factor=1.0, epsilon=0.1):
     frames = []
     episode_stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))
     for i_episode in range(num_episodes):
         print("Episode {0}".format(i_episode))
         # The policy we are following
         policy = self.epsilon_greedy_policy(nA=env.action_space.n, epsilon=epsilon)
         # Reset the environment to get the initial state
         state = env.reset()
         if i_episode % 5 == 0:
             fig = plot_cost_mountain_car(env, self.approximator, step=i_episode)
             image = fig_to_image(fig)
             frames.append(image)
             plt.close()
         for t in itertools.count():
             # Choose an action
             action_probs = policy(state)
             action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
             # Take a step.
             next_state, reward, done, _ = env.step(action)
             # store episode statistics
             episode_stats.episode_lengths[i_episode] = t
             episode_stats.episode_rewards[i_episode] += reward
             # TD Update.
             q_values_next = self.approximator.predict(next_state)
             # compute TD target.
             td_target = reward + discount_factor * np.max(q_values_next)
             # update the parameters in function approximator.
             self.approximator.update(state, action, td_target)
             if done:
                 break
             state = next_state
     save_path = get_dirs(os.path.join(RESULT_PATH, "Qlearning"))
     imageio.mimsave(os.path.join(save_path, "vfa_values_qlearning.gif"), frames, fps=20)
     return episode_stats
Example #5
#
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import os
import imageio
# from gym.envs.toy_text.cliffwalking import CliffWalkingEnv
from rl.env.gridWorld import GridWorldEnv
from rl.misc.utilies import ROOT_PATH, get_dirs
from rl.misc.utilies import fig_to_image


RESULT_PATH = get_dirs(os.path.join(ROOT_PATH, "results"))


class Agent:

    def __init__(self, env):
        self.V = np.zeros(env.nS)
        self.policy = np.ones([env.nS, env.nA]) / env.nA
        self.init_state = 0
        self.state = None
        self.terminate_state = 46

    def policy_evaluation(self, env, policy, theta=0.00001, discount_factor=0.9):
        V = np.zeros(env.nS)
        while True:
            Delta = 0.0
            for s in range(env.nS):
                v = 0
                for a, action_prob in enumerate(policy[s]):
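
Example #5 is cut off inside the evaluation loop. For reference, here is a minimal, self-contained sketch of how the Bellman expectation backup typically completes iterative policy evaluation; it assumes env.P[s][a] yields (prob, next_state, reward, done) tuples, as in Example #1, and is not the original implementation:

import numpy as np

def policy_evaluation_sketch(env, policy, theta=1e-5, discount_factor=0.9):
    # Hypothetical completion, for illustration only.
    V = np.zeros(env.nS)
    while True:
        delta = 0.0
        for s in range(env.nS):
            v = 0.0
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in env.P[s][a]:
                    # Expected one-step return, weighted by the policy and
                    # transition probabilities (Bellman expectation backup).
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            delta = max(delta, abs(v - V[s]))
            V[s] = v
        if delta < theta:
            # Stop once the largest state-value change falls below the threshold.
            break
    return V
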
Example #6
from rl.misc.utilies import get_dirs
import os

results_path = os.path.join(os.path.realpath("../../../"), 'results')
freps_path = get_dirs(os.path.join(results_path, 'freps'))
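
get_dirs itself is not shown in any of these snippets; from its usage (the returned path is joined and written to immediately) it presumably creates the directory if needed and returns it. A minimal sketch under that assumption:

import os

def get_dirs(path):
    # Assumed behaviour, inferred from usage: make sure the directory exists,
    # then hand the path back so calls like imageio.mimsave can use it directly.
    os.makedirs(path, exist_ok=True)
    return path
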
Example #7
import numpy as np
import matplotlib.pyplot as plt
import itertools
from collections import defaultdict
import os
import imageio
#
from rl.env.cliff_walking import CliffWalkingEnv
from rl.misc.utilies import get_dirs, fig_to_image
from rl.algo.td.sarsa.SARSA import Agent_SARSA
#
ROOT_PATH = os.path.realpath("../../../../")
#
RESULT_PATH = os.path.join(ROOT_PATH, "results")
#
Q_SAVE_PATH = get_dirs(os.path.join(RESULT_PATH, "Qlearning"))
S_SAVE_PATH = get_dirs(os.path.join(RESULT_PATH, "SARSA"))


class Agent_QLearning:

    def __init__(self, env):
        self.Q = defaultdict(lambda: np.zeros(env.nA))
        self.policy = np.ones([env.nS, env.nA]) / env.nA

    def epsilon_greedy_policy(self, env, Q, epsilon=0.1):
        def policy_fn(observation):
            A = np.ones(env.nA, dtype=float) * epsilon / env.nA
            best_action = np.argmax(Q[observation])
            A[best_action] += (1.0 - epsilon)
            return A
        return policy_fn
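
Example #7 stops before the learning loop. A minimal sketch of a tabular Q-learning episode loop that would fit this class, using the module's numpy/itertools imports; the method name and hyperparameters here are assumptions, not the original code:

    def q_learning(self, env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
        # Hypothetical sketch, not the original implementation.
        policy = self.epsilon_greedy_policy(env, self.Q, epsilon=epsilon)
        for i_episode in range(num_episodes):
            state = env.reset()
            for t in itertools.count():
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _ = env.step(action)
                # Off-policy TD update: bootstrap with the greedy value of the next state.
                td_target = reward + discount_factor * np.max(self.Q[next_state])
                self.Q[state][action] += alpha * (td_target - self.Q[state][action])
                if done:
                    break
                state = next_state
        return self.Q
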
Example #8
import tensorflow as tf
import os
import ot
import numpy as np
import pandas as pd
#
from rl.misc.utilies import get_dirs
#
env_ID = "Pendulum-v0"
#
path_all_results = os.path.join(os.path.realpath("../../../"), 'results')
path_ppo_results = get_dirs(os.path.join(path_all_results, 'ppo'))
path_env_result = get_dirs(os.path.join(path_ppo_results, env_ID))
path_csv = os.path.join(path_env_result, 'data.csv')
#
columns = ['methods', 'alphas', 'trials', 'episodes', 'rewards', 'losses_c', 'losses_a', 'divergences', 'entropies', "beta"]
#
data = pd.DataFrame(columns=columns)
seed = 12345
#
params = {'methods': {'clip': [None],
                      # 'f': [1.0, 2.0, 'GAN'],
                      # 'w2': [None]
                      },
          'num_trials': 5,
          'num_episodes': 100,
          'num_sample_trans': 3200,
          'epochs': 10,
          'batch_size': 32,
          'gamma': 0.99,
          'lam': 0.95,
Example #9
import numpy as np
import itertools
import os
import imageio
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from collections import namedtuple
#
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_approximation import RBFSampler
#
from rl.misc.utilies import get_dirs, fig_to_image
from rl.algo.td.util import plot_episode_stats, plot_cost_mountain_car
from gym.envs.classic_control.mountain_car import MountainCarEnv
#
# Global variables
ROOT_PATH = os.path.realpath("../../../../")
RESULT_PATH = get_dirs(os.path.join(ROOT_PATH, 'results'))
EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])


#
class Approximator:
    """
    Value Function Approximator
    """
    def __init__(self, env):
        self.observation_examples = np.array(
            [env.observation_space.sample() for x in range(10000)])
        self.scaler = self.standard_scaler(env)
        self.featurizer = self.sklearn_featurizer()
        # I don't quite understand this part.
        self.models = []
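
Example #9 stops inside Approximator.__init__, so standard_scaler and sklearn_featurizer are not shown. A minimal sketch of how these helpers are commonly written with the sklearn classes imported above (a StandardScaler fitted on the sampled observations and a FeatureUnion of RBFSampler kernels); the gamma and n_components values are assumptions:

    def standard_scaler(self, env):
        # Fit a scaler on sampled observations so states can be normalised
        # to zero mean and unit variance before featurisation.
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(self.observation_examples)
        return scaler

    def sklearn_featurizer(self):
        # Concatenate RBF kernels of different bandwidths to obtain a richer
        # feature representation of the 2-D mountain-car state.
        featurizer = sklearn.pipeline.FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=100)),
        ])
        featurizer.fit(self.scaler.transform(self.observation_examples))
        return featurizer
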
Example #10
import numpy as np
import matplotlib.pyplot as plt
import itertools
import os
import imageio
from collections import defaultdict
from rl.misc.utilies import get_dirs, fig_to_image
from rl.env.windy_gridWorld import WindyGridworldEnv
# import constants_TD as C
#
ROOT_PATH = os.path.realpath("../../../../")
#
RESULT_PATH = get_dirs(os.path.join(ROOT_PATH, "results"))
#
SAVE_PATH = get_dirs(os.path.join(RESULT_PATH, "SARSA"))


class Agent_SARSA:
    def __init__(self, env):
        self.Q = defaultdict(lambda: np.zeros(env.nA))
        self.policy = np.ones([env.nS, env.nA]) / env.nA

    def epsilon_greedy_policy(self, Q, epsilon, nA):
        def policy_fn(observation):
            A = np.ones(nA, dtype=float) * epsilon / nA
            best_action = np.argmax(Q[observation])
            A[best_action] += (1.0 - epsilon)
            return A

        return policy_fn
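
As with Example #7, the update loop of Agent_SARSA is not shown. The on-policy counterpart differs from Q-learning only in the bootstrap term: the TD target uses the value of the action actually chosen next, not the greedy maximum. A minimal sketch under the same assumptions as the Q-learning sketch above:

    def sarsa(self, env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
        # Hypothetical sketch, not the original implementation.
        policy = self.epsilon_greedy_policy(self.Q, epsilon, env.nA)
        for i_episode in range(num_episodes):
            state = env.reset()
            action = np.random.choice(env.nA, p=policy(state))
            for t in itertools.count():
                next_state, reward, done, _ = env.step(action)
                next_action = np.random.choice(env.nA, p=policy(next_state))
                # On-policy TD target: bootstrap with the action actually selected next.
                td_target = reward + discount_factor * self.Q[next_state][next_action]
                self.Q[state][action] += alpha * (td_target - self.Q[state][action])
                if done:
                    break
                state, action = next_state, next_action
        return self.Q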