Example #1
    def __init__(self, max_angle=12, max_num_steps=1000):
        self.env = CartPoleEnv()
        # self.env.theta_threshold_radians = max_angle * 2 * math.pi / 360
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.step_counter = 0
        self.max_num_steps = max_num_steps
Example #2
    def __init__(self, noise_type='uniform', noise_scale=0.0, init_scale=0.0):
        self.noise_type = noise_type
        assert self.noise_type in ['normal', 'uniform']
        self.noise_scale = noise_scale
        self.init_scale = init_scale

        CartPoleEnv.__init__(self)
Example #3
def run_cartpole_reinforce(args, log_dir="./logs/reinforce"):
    os.makedirs(log_dir, exist_ok=True)
    env = CartPoleEnv()
    agent: PolicyAgent = PolicyAgent(env.observation_space.shape[0],
                                     env.action_space.n)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    env = BenchMonitor(env, log_dir, allow_early_resets=True)
    train(env, agent, args)
    return agent, env
Example #4
def test_random_agent():
    from agentos.agents import RandomAgent
    from gym.envs.classic_control import CartPoleEnv

    environment = CartPoleEnv()
    environment.reset()
    agent = RandomAgent(environment=environment)
    done = agent.advance()
    assert not done, "CartPole never finishes after one random step."
    run_agent(agent)
Example #5
def run_cartpole_dqn(num_batches=1000,
                     batch_size=32,
                     log_dir="./logs/dqn",
                     seed=0):
    os.makedirs(log_dir, exist_ok=True)
    env = CartPoleEnv()
    env.seed(seed)
    torch.manual_seed(seed)
    agent = CartPoleAgent(env.observation_space, env.action_space)
    from baselines.bench import Monitor as BenchMonitor

    env = BenchMonitor(env, log_dir, allow_early_resets=True)
    train(agent, env, num_batches=num_batches, batch_size=batch_size)
    return agent, env
Example #6
def get():
    env = CartPoleEnv()

    for i_episode in range(10000):
        observation = env.reset()
        # chose_action and model come from elsewhere in the original module
        action = chose_action(model=model)
        while True:
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            # shaped reward: higher when the cart is near the center and the pole is near upright
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action, reward], observation_))
            print(transition)
            if done:
                break
            observation = observation_
Example #7
File: my_brain.py  Project: bigcong/io
def save():
    env = CartPoleEnv()

    total_steps = 0
    memory = []

    memory_counter = 0
    for i_episode in range(100):

        observation = env.reset()
        while True:
            env.render()
            action = env.action_space.sample()

            observation_, reward, done, info = env.step(action)

            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2

            transition = np.hstack((observation, [action,
                                                  reward], observation_))
            memory.append(transition)

            if done:
                break

            observation = observation_
            total_steps += 1
    memory = np.array(memory)
    np.save("memory.npy", memory)

    env.close()
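
Examples #6 and #7 (and #8, #12, and #21 below) all recompute the same shaped reward inline: the reward is highest when the cart stays near the center of the track and the pole stays near upright. A minimal sketch that factors this into a reusable helper; the name shaped_reward is my own and does not appear in the original projects, and the snippet assumes the pre-0.26 gym step/reset API used throughout these examples.

from gym.envs.classic_control import CartPoleEnv


def shaped_reward(env, observation):
    # Higher reward the closer the cart is to the center (r1) and the
    # closer the pole is to upright (r2); both terms go negative near the limits.
    x, x_dot, theta, theta_dot = observation
    r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
    r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
    return r1 + r2


env = CartPoleEnv()
env.reset()
observation_, _, done, _ = env.step(env.action_space.sample())
print(shaped_reward(env, observation_))
env.close()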
Example #8
def go2():
    env = CartPoleEnv()
    episode_step_counter = 0
    for i_episode in range(10000):
        env.reset()

        step_counter = 0
        while True:
            env.render()
            # randomly choose an action
            action = env.action_space.sample()
            # take the action and receive the reward given by the environment
            observation_, reward, done, info = env.step(action)

            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            print(reward)

            step_counter = step_counter + 1

            if done:
                episode_step_counter += step_counter
                # print("第{}回合,坚持了{}步".format(i_episode, step_counter))
                print("平均步数:{}".format(episode_step_counter / (i_episode + 1)))

                break

    env.close()
Example #9
def run_cartpole_a2c(args: A2CParams, log_dir="./logs/a2c"):
    os.makedirs(log_dir, exist_ok=True)
    env = CartPoleEnv()
    env = BenchMonitor(env, log_dir, allow_early_resets=True)
    env = CartPoleEnvSelfReset(env)
    # env.seed(args.seed)
    # torch.manual_seed(args.seed)
    agent: CartPoleA2CAgent = CartPoleA2CAgent(env.observation_space,
                                               env.action_space, args)
    exp_mem = build_experience_memory(agent, env, args.num_rollout_steps)

    w = World(env, agent, exp_mem)
    with torch.no_grad():
        w.agent.eval()
        gather_exp_via_rollout(w.env, w.agent, w.exp_mem,
                               args.num_rollout_steps)
    optimizer = torch.optim.Adam(agent.parameters(), args.lr)

    for k in tqdm(range(args.num_batches)):
        with torch.no_grad():
            w.agent.eval()
            rollout = collect_experiences_calc_advantage(w, args)

        train_batch(w.agent, rollout, optimizer)

    return agent, env
Example #10
class TFPolicy(agentos.Policy):
    def __init__(self, tf_model):
        self.tf_model = tf_model
        self.observation_space = CartPoleEnv().observation_space
        self.action_space = CartPoleEnv().action_space

    def compute_action(self, obs):
        assert self.observation_space.contains(obs), obs
        action = self.tf_model(np.array(obs)[np.newaxis])
        env_compatible_action = int(round(action.numpy()[0][0]))
        assert self.action_space.contains(
            env_compatible_action), env_compatible_action
        return env_compatible_action

    def __deepcopy__(self, memo):
        return TFPolicy(keras.models.clone_model(self.tf_model))
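
For context, TFPolicy only needs a Keras model that maps a 4-value CartPole observation to a single scalar, which compute_action then rounds into the discrete action 0 or 1. A minimal sketch of wiring one up, assuming TensorFlow 2 / Keras and the pre-0.26 gym reset API; the one-layer model below is my own stand-in, not the model used in the original project.

from tensorflow import keras
from gym.envs.classic_control import CartPoleEnv

# Stand-in model: 4 observation values in, one sigmoid score out.
tf_model = keras.Sequential([
    keras.layers.Input(shape=(4,)),
    keras.layers.Dense(1, activation="sigmoid"),
])

policy = TFPolicy(tf_model)
obs = CartPoleEnv().reset()
print(policy.compute_action(obs))  # prints 0 or 1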
Example #11
def test_order_enforcing():
    """Checks that the order enforcing works as expected, raising an error before reset is called and not after."""
    # The reason for not using gym.make is that all environments are by default wrapped in the order enforcing wrapper
    env = CartPoleEnv()
    assert not has_wrapper(env, OrderEnforcing)

    # Assert that the order enforcing works for step and render before reset
    order_enforced_env = OrderEnforcing(env)
    assert order_enforced_env._has_reset is False
    with pytest.raises(ResetNeeded):
        order_enforced_env.step(0)
    with pytest.raises(ResetNeeded):
        order_enforced_env.render(mode="rgb_array")
    assert order_enforced_env._has_reset is False

    # Assert that the Assertion errors are not raised after reset
    order_enforced_env.reset()
    assert order_enforced_env._has_reset is True
    order_enforced_env.step(0)
    order_enforced_env.render(mode="rgb_array")

    # Assert that render works before reset when disable_render_order_enforcing is set
    env = CartPoleEnv()
    env = OrderEnforcing(env, disable_render_order_enforcing=True)
    env.render(mode="rgb_array")  # no assertion error
Example #12
File: my_brain.py  Project: bigcong/io
def go():
    env = CartPoleEnv()

    total_steps = 0
    memory = []
    model = create_model()

    epsilon = 0.9
    memory_counter = 1000
    for i_episode in range(1000):

        observation = env.reset()
        ep_r = 0

        while True:
            env.render()

            if np.random.uniform() < epsilon:
                actions_value = model.predict(np.array([observation]))
                action = np.argmax(actions_value)
            else:
                action = env.action_space.sample()

            observation_, reward, done, info = env.step(action)

            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action,
                                                  reward], observation_))
            memory.append(transition)
            if len(memory) > memory_counter:
                xx, yy = get_data(np.array(memory), model)
                print(xx.shape)
                model.fit(xx, yy, epochs=10)
                epsilon = epsilon + 0.00001
                memory = []
                # memory_counter = memory_counter + 5
            ep_r = ep_r + reward

            if done:
                # print(ep_r)

                break

            observation = observation_
            total_steps += 1

    model.save("logs/cp.h5")
    model.summary()
    env.close()
Example #13
    def test_openai_gym(self):
        self.start_tests(name='openai-gym')
        self.unittest(environment=dict(environment='gym', level='CartPole-v1'),
                      num_episodes=2)

        from gym.envs.classic_control import CartPoleEnv

        self.unittest(environment=dict(environment='gym',
                                       level=CartPoleEnv(),
                                       max_episode_timesteps=100),
                      num_episodes=2)
Example #14
def run_cartpole_reinforce(args, log_dir="./logs/reinforce"):
    os.makedirs(log_dir, exist_ok=True)
    env = CartPoleEnv()
    agent = CartPoleReinforceAgent(env.observation_space.shape[0], env.action_space.n)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    env = BenchMonitor(env, log_dir, allow_early_resets=True)
    env = CartPoleEnvSelfReset(env)

    exp_mem = build_experience_memory(agent, env, args.num_rollout_steps)
    w = World(env, agent, exp_mem)

    with torch.no_grad():
        w.agent.eval()
        gather_exp_via_rollout(w.env, w.agent, w.exp_mem, args.num_rollout_steps)

    optimizer = torch.optim.Adam(agent.parameters(), args.lr)

    for k in tqdm(range(args.num_batches)):
        with torch.no_grad():
            agent.eval()
            batch = do_rollout(w, args)
        train_batch(agent, batch, optimizer)

    return agent, env
Example #15
    def test_discrete_vectorized_original_equality(self):
        venv = DiscreteVectorizedCartPoleEnv()
        state, action = self.state_action
        action = (action > 0).astype(int)

        dim1, dim2 = self.dims

        venv.state = state
        vobs, vreward, vdone, _ = venv.step(action)

        env = CartPoleEnv()
        for i in range(dim1):
            for j in range(dim2):
                env.reset()
                env.state = state[i, j]
                obs, reward, done, _ = env.step(int(action[i, j, 0]))

                np.testing.assert_allclose(obs, vobs[i, j])
                np.testing.assert_allclose(reward, vreward[i, j])
                np.testing.assert_allclose(done, vdone[i, j])
Example #16
class CartPoleDictEnvWrapper(gym.Env):
    def __init__(self, max_angle=12, max_num_steps=1000):
        self.env = CartPoleEnv()
        # self.env.theta_threshold_radians = max_angle * 2 * math.pi / 360
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.step_counter = 0
        self.max_num_steps = max_num_steps

    def step(self, action):
        if isinstance(action, numpy.ndarray):
            action = action[0]
        assert isinstance(action, numpy.int64)
        obs, _, done, _ = self.env.step(action)
        self.step_counter += 1
        if self.step_counter % self.max_num_steps == 0:
            done = True
        if done:
            reward = -10.0
            obs = self.env.reset()
        else:
            reward = 0.0
        return {"observation": obs, "reward": reward, "done": int(done)}

    def reset(self):
        obs = self.env.reset()
        return {"observation": obs, "reward": 0.0, "done": int(False)}

    def render(self, mode="human"):
        return self.env.render(mode)

    def close(self):
        self.env.close()

    def seed(self, seed=None):
        return self.env.seed(seed)
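
A minimal usage sketch for the wrapper above, assuming the pre-0.26 gym API used throughout these examples; the short loop is my own illustration, not part of the original project.

import numpy

env = CartPoleDictEnvWrapper(max_num_steps=200)
out = env.reset()
for _ in range(5):
    # step() asserts numpy.int64, so cast the sampled action explicitly
    action = numpy.int64(env.action_space.sample())
    out = env.step(action)
    print(out["observation"], out["reward"], out["done"])
env.close()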
Example #17
from tensorforce.agents import Agent  # needed for Agent.create below
from tensorforce.environments.openai_gym import OpenAIGym
from env_gym import SimplePendulumEnv
from gym.envs.classic_control import CartPoleEnv
from tensorforce.execution import Runner
import os

batch_size = 10
n_step = 2000

# Instantiate the environment
n_env = 12

list_envs = []

# env = OpenAIGym(SimplePendulumEnv())
env = OpenAIGym(CartPoleEnv())

actor_network = [
    dict(type='dense', size=128, activation='relu'),
    dict(type='dense', size=64, activation='relu'),
    dict(type='dense', size=64, activation='relu')
]

critic_network = [
    dict(type='dense', size=128, activation='relu'),
    dict(type='dense', size=64, activation='relu'),
    dict(type='dense', size=64, activation='relu')
]

agent = Agent.create(agent='ppo',
                     batch_size=batch_size,
Example #18
File: learn.py  Project: bigcong/io
import gym
from gym.envs.classic_control import CartPoleEnv

env = CartPoleEnv()
env = env.unwrapped  # without this there are many restrictions

print(env.action_space)  # how many actions are available in this environment
print(env.observation_space)  # what the state observation of this environment looks like
print(env.observation_space.high)  # maximum value of each observation component
print(env.observation_space.low)  # minimum value of each observation component
Example #19
    def __init__(self, tf_model):
        self.tf_model = tf_model
        self.observation_space = CartPoleEnv().observation_space
        self.action_space = CartPoleEnv().action_space
Example #20
        return int(
            max(0, round(self.nn(np.array(obs)[np.newaxis]).numpy()[0][0])))


class RandomTFAgent(agentos.Agent):
    def __init__(self, environment, policy):
        super().__init__(environment=environment, policy=policy)
        self.ret_vals = []

    def advance(self):
        trajs = agentos.rollout(self.policy, self.environment, max_steps=2000)
        self.ret_vals.append(sum(trajs.rewards))


if __name__ == "__main__":
    from gym.envs.classic_control import CartPoleEnv

    random_nn_agent = RandomTFAgent(
        environment=CartPoleEnv,
        policy=SingleLayerTFPolicy(
            CartPoleEnv().action_space,
            CartPoleEnv().observation_space,
        ),
    )
    agentos.run_agent(random_nn_agent, max_iters=10)
    print(f"Agent done!\n"
          f"Num rollouts: {len(random_nn_agent.ret_vals)}\n"
          f"Avg return: {np.mean(random_nn_agent.ret_vals)}\n"
          f"Max return: {max(random_nn_agent.ret_vals)}\n"
          f"Median return: {np.median(random_nn_agent.ret_vals)}\n")
Example #21
    for i_episode in range(10000):
        observation = env.reset()
        action = chose_action(model=model)
        while True:
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action, reward], observation_))
            print(transition)


if __name__ == '__main__':

    env = CartPoleEnv()
    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            print(reward)
            transition = np.hstack((observation, [action, reward], observation_))
            print(transition)
            if done:
                break
            observation = observation_

Example #22
import numpy as np
import scipy as sp
from gym.envs.classic_control import CartPoleEnv
from util.policy_nn_boltzmann import *
from util.learner_nn import *
from util.util_cartpole import *
import sys

np.set_printoptions(precision=6)
np.set_printoptions(suppress=True)

mdp = CartPoleEnv()
mdp.horizon = 200
agent_policy = nnBoltzmannPolicy(nStateFeatures=4,
                                 nActions=2,
                                 nHiddenNeurons=64,
                                 paramInitMaxVal=0.025)
agent_learner = nnGpomdpLearner(mdp, agent_policy, gamma=0.995)

#eps = collect_pendulum_episodes(mdp,agent_policy,10,mdp.horizon)
#agent_policy.optimize_gradient(eps,0.003)

ctlearn(agent_learner,
        steps=10000,
        nEpisodes=25,
        learningRate=0.003,
        plotGradient=True,
        printInfo=True)
Example #23
import datetime
import os
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
from gym.envs.classic_control import CartPoleEnv
from tensorboardX import SummaryWriter

from training.dqn.dqn_agent import Agent
from utility.Scheduler import Scheduler

currentDT = datetime.datetime.now()
print(f'Start at {currentDT.strftime("%Y-%m-%d %H:%M:%S")}')
seed = 5
# np.random.seed(seed)
env = CartPoleEnv()  # gym.make("CartPole-v0")
env.seed(seed)
np.random.seed(seed)
state_size = 4
action_size = 2
STARTING_BETA = 0.6  # the higher the more it decreases the influence of high TD transitions
ALPHA = 0.6  # the higher the more aggressive the sampling towards high TD transitions
EPS_DECAY = 0.2
MIN_EPS = 0.01

current_time = currentDT.strftime('%b%d_%H-%M-%S')
comment = f"alpha={ALPHA}, min_eps={MIN_EPS}, eps_decay={EPS_DECAY}"
log_dir = os.path.join('../runs', current_time + '_' + comment)
os.mkdir(log_dir)
print(f"logging to {log_dir}")
writer = SummaryWriter(log_dir=log_dir)
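
The ALPHA and STARTING_BETA comments above refer to prioritized experience replay. A minimal sketch of how these two exponents enter the sampling math; this is the standard PER formulation, not code taken from training.dqn.dqn_agent.

import numpy as np


def per_probabilities(td_errors, alpha=0.6, eps=1e-6):
    # alpha (ALPHA above): higher values sample more aggressively towards
    # transitions with large TD error.
    priorities = (np.abs(td_errors) + eps) ** alpha
    return priorities / priorities.sum()


def per_is_weights(probs, n_transitions, beta=0.6):
    # beta (STARTING_BETA above): higher values apply a stronger importance-sampling
    # correction, decreasing the influence of frequently sampled high-TD transitions.
    weights = (n_transitions * probs) ** (-beta)
    return weights / weights.max()


probs = per_probabilities(np.array([0.1, 0.5, 2.0]))
print(probs, per_is_weights(probs, len(probs)))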
Example #24
File: MBPG_HA_test.py  Project: gaosh/MBPG
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.
    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters
    """

    th = 1.8
    g_max = 0.1
    #delta = 1e-7
    if args.env == 'CartPole':
        #CartPole

        env = TfEnv(normalize(CartPoleEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 5000
        max_length = 100
        n_timestep = 5e5
        n_counts = 5
        name = 'CartPole'
        grad_factor = 5
        th = 1.2
        #batchsize: 1
        # lr = 0.1
        # w = 2
        # c = 50

        #batchsize: 50
        lr = 0.75
        c = 3
        w = 2

        discount = 0.995
        path = './init/CartPole_policy.pth'

    if args.env == 'Walker':
        #Walker_2d
        env = TfEnv(normalize(Walker2dEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500

        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 2
        c = 12
        grad_factor = 6

        discount = 0.999

        name = 'Walk'
        path = './init/Walk_policy.pth'

    if args.env == 'HalfCheetah':
        env = TfEnv(normalize(HalfCheetahEnv()))
        runner = LocalRunner(snapshot_config)

        batch_size = 50000
        max_length = 500

        n_timestep = 1e7
        n_counts = 5
        lr = 0.6
        w = 1
        c = 4
        grad_factor = 5
        th = 1.2
        g_max = 0.06

        discount = 0.999

        name = 'HalfCheetah'
        path = './init/HalfCheetah_policy.pth'

    if args.env == 'Hopper':
        #Hopper
        env = TfEnv(normalize(HopperEnv()))
        runner = LocalRunner(snapshot_config)

        batch_size = 50000
        max_length = 1000
        th = 1.5
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 1
        c = 3
        grad_factor = 6
        g_max = 0.15
        discount = 0.999

        name = 'Hopper'
        path = './init/Hopper_policy.pth'

    for i in range(n_counts):
        # print(env.spec)
        if args.env == 'CartPole':
            policy = CategoricalMLPPolicy(env.spec,
                                          hidden_sizes=[8, 8],
                                          hidden_nonlinearity=torch.tanh,
                                          output_nonlinearity=None)
        else:
            policy = GaussianMLPPolicy(env.spec,
                                       hidden_sizes=[64, 64],
                                       hidden_nonlinearity=torch.tanh,
                                       output_nonlinearity=None)


        policy.load_state_dict(torch.load(path))
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = MBPG_HA(env_spec=env.spec,
                       env=env,
                       env_name=name,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_length,
                       discount=discount,
                       grad_factor=grad_factor,
                       policy_lr=lr,
                       c=c,
                       w=w,
                       th=th,
                       g_max=g_max,
                       n_timestep=n_timestep,
                       batch_size=batch_size,
                       center_adv=True,
                       # delta=delta
                       # decay_learning_rate=d_lr,
                       )

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
Example #25
 parser = argparse.ArgumentParser(
     description="Run reinforce with a simple TF policy on gym CartPole. "
     "One rollout per call to agent.advance(), "
     "200 steps per rollout.", )
 parser.add_argument(
     "max_iters",
     type=int,
     metavar="MAX_ITERS",
     help="How many times to call advance() on agent.",
 )
 parser.add_argument("--rollouts_per_iter", type=int, default=1)
 parser.add_argument("--max_steps_per_rollout", type=int, default=200)
 parser.add_argument("--discount_rate", type=float, default=0.9)
 args = parser.parse_args()
 reinforce_agent = ReinforceAgent(
     CartPoleEnv(),
     TwoLayerTFPolicy(),
     rollouts_per_iter=args.rollouts_per_iter,
     max_steps_per_rollout=args.max_steps_per_rollout,
     discount_rate=args.discount_rate,
 )
 agentos.run_agent(
     reinforce_agent,
     max_iters=args.max_iters,
 )
 print("Agent done!")
 if reinforce_agent.ret_vals:
     print(f"Num rollouts: {len(reinforce_agent.ret_vals)}\n"
           f"Avg return: {np.mean(reinforce_agent.ret_vals)}\n"
           f"Max return: {max(reinforce_agent.ret_vals)}\n"
           f"Median return: {np.median(reinforce_agent.ret_vals)}\n")