from baselines.ddpg.ddpg import learn


def main():
    # `network`, `env` and `network_kwargs` are assumed to be defined by the
    # surrounding script; the keyword arguments below spell out the defaults
    # of baselines.ddpg.ddpg.learn.
    learn(
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker
        nb_eval_steps=100,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        **network_kwargs)
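The call above leaves `network`, `env` and `network_kwargs` to the caller. A minimal sketch of plausible definitions, mirroring the Pendulum-v0 / DummyVecEnv setup used in the later examples (the layer sizes and step budget are placeholders, not values from the original snippet):

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ddpg.ddpg import learn

env = DummyVecEnv([lambda: gym.make('Pendulum-v0')])   # single continuous-control worker
network = 'mlp'                                        # built-in fully connected actor/critic network
network_kwargs = {'num_layers': 2, 'num_hidden': 64}   # forwarded to the 'mlp' network builder

model = learn(network, env, total_timesteps=10000, **network_kwargs)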
Example #2
import os.path as osp

from baselines import bench
from baselines import logger as baselines_logger
from baselines.common.misc_util import set_global_seeds
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ddpg import ddpg
from baselines.logger import configure


def run_baselines(env, seed, log_dir):
    """Create baselines model and training.

    Replace the ppo and its training with the algorithm you want to run.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: The log file path.

    """
    seed = seed + 1000000
    set_global_seeds(seed)
    env.seed(seed)

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('seed={}, logdir={}'.format(
        seed, baselines_logger.get_dir()))

    env = DummyVecEnv([
        lambda: bench.Monitor(
            env, baselines_logger.get_dir(), allow_early_resets=True)
    ])

    ddpg.learn(network='mlp',
               env=env,
               nb_epochs=params['n_epochs'],
               nb_epoch_cycles=params['steps_per_epoch'],
               normalize_observations=False,
               critic_l2_reg=0,
               actor_lr=params['policy_lr'],
               critic_lr=params['qf_lr'],
               gamma=params['discount'],
               nb_train_steps=params['n_train_steps'],
               nb_rollout_steps=params['n_rollout_steps'],
               nb_eval_steps=100)

    return osp.join(log_dir, 'progress.csv')
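run_baselines reads a module-level `params` dict that is not shown above. A hedged sketch of how it might be populated and the function called (the key names come from the function body; the values and the Pendulum-v0 task are placeholders):

import gym

params = {
    'n_epochs': 500,
    'steps_per_epoch': 20,
    'policy_lr': 1e-4,
    'qf_lr': 1e-3,
    'discount': 0.99,
    'n_train_steps': 50,
    'n_rollout_steps': 100,
}

progress_csv = run_baselines(gym.make('Pendulum-v0'), seed=1, log_dir='/tmp/ddpg_trial')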
Example #3
def train(arglist):
    # Create environment
    env = make_env(arglist.scenario, arglist, arglist.benchmark)
    ddpg_env = DdpgEnv(env)
    from baselines.ddpg.ddpg import learn

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(arglist.log_path)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(arglist.log_path, format_strs=[])

    learn(
        network="mlp",
        env=ddpg_env,
        # total_timesteps=400,
        nb_rollout_steps=400,
        nb_epochs=3000,
        render=arglist.display)
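A note on the step budget: in the stock baselines DDPG, total_timesteps and nb_epochs are mutually exclusive; when total_timesteps is given the epoch count is derived from it, otherwise nb_epochs is used directly. With the call above (nb_epochs=3000, nb_rollout_steps=400 and the default nb_epoch_cycles=20) that is roughly 24M environment steps per worker. A rough sketch of the accounting, paraphrased rather than quoted from the library:

def approx_total_steps(nb_epochs, nb_epoch_cycles=20, nb_rollout_steps=100):
    # each epoch runs nb_epoch_cycles rollouts of nb_rollout_steps environment steps
    return nb_epochs * nb_epoch_cycles * nb_rollout_steps

print(approx_total_steps(3000, nb_rollout_steps=400))  # 24,000,000 for the call above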
Example #4
    def train(self):
        hostname = socket.gethostname()
        time_str = datetime.datetime.now().strftime('%y.%m.%d-%H:%M:%S')
        rand_str = str(int(random.random() * 100000))
        model_fname = 'runs/' + cfg_id + '-' + hostname + '-' + time_str + '-' + rand_str + '-model'

        self.tb_logger = Logger(self.cfg, rand_str)
        logger.configure()

        total_timesteps = self.cfg['ppo']['total_timesteps']
        max_steps = self.cfg['aquarium']['max_steps']

        model = ddpg.learn(
            env=self.env,
            network=self.cfg['ddpg']['network'],
            total_timesteps=self.cfg['ddpg']['total_timesteps'],
            nb_epochs=None,  # This stays None.
            nb_epoch_cycles=10,
            nb_rollout_steps=max_steps,
            reward_scale=1.0,
            render=False,
            render_eval=False,
            noise_type='adaptive-param_0.2',
            normalize_returns=False,
            normalize_observations=True,
            critic_l2_reg=1e-2,
            actor_lr=1e-4,
            critic_lr=1e-3,
            popart=False,
            gamma=0.99,
            clip_norm=None,
            nb_train_steps=50,  # per epoch cycle and MPI worker
            nb_eval_steps=100,
            batch_size=64, # per MPI worker
            tau=0.01,
            eval_env=None,
            param_noise_adaption_interval=50,
            load_path=None,
            num_layers=self.cfg['ppo']['num_layers'],  # TODO: this should read from the 'ddpg' config section, not 'ppo'
            num_hidden=self.cfg['ppo']['num_hidden'],
            tb_logger=self.tb_logger,
            evaluator=self.evaluate_and_log,
            model_fname=model_fname
        )

        # model.save(model_fname + '-F')  # F stands for final.

        # import pdb; pdb.set_trace()  # noqa
        self.evaluate_and_log(model, int(total_timesteps / max_steps))
Example #5
def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = make_vec_env(env_id,
                       env_type,
                       num_env,
                       seed,
                       wrapper_kwargs=None,
                       start_index=0,
                       reward_scale=1.0,
                       flatten_dict_observations=True,
                       gamestate=None)

    act = ddpg.learn(env=env, network='mlp', total_timesteps=10000)
Example #6
from baselines.common.cmd_util import make_vec_env
from baselines.ddpg import ddpg


def main():
    num_env = 5
    env_id = "Pendulum-v0"
    env_type = "classic_control"
    seed = None

    env = make_vec_env(env_id,
                       env_type,
                       num_env,
                       seed,
                       wrapper_kwargs=None,
                       start_index=0,
                       reward_scale=1.0,
                       flatten_dict_observations=True,
                       gamestate=None)

    act = ddpg.learn(env=env,
                     network='mlp',
                     total_timesteps=500000,
                     render=False)
Example #7
import baselines.logger
from baselines.common.cmd_util import make_vec_env
from baselines.ddpg import ddpg


def main():
    baselines.logger.configure(
        dir='/tmp/pendulum_ddpg',
        format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    #env = gym.make("pendulum-legacy-v0")
    #env = gym.make("Pendulum-v0")
    env_id = "Pendulum-v0"
    env_type = "classic_control"
    num_env = 1
    seed = 1234
    reward_scale = 1.
    flatten_dict_observations = False
    env = make_vec_env(env_id, env_type, num_env, seed,
                       reward_scale=reward_scale,
                       flatten_dict_observations=flatten_dict_observations)

    act = ddpg.learn(
        env=env,
        network='mlp',
        seed=seed,
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,  # set True to render during training
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker
        nb_eval_steps=100,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50)
    print("Saving model to pendulum_model_ddpg.pkl")
    act.save("pendulum_model_ddpg.pkl")
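The returned act can also be rolled out directly, before or after saving; a minimal sketch mirroring the step loop in Example #10 (the step count is arbitrary):

obs = env.reset()
for _ in range(200):
    actions, _, _, _ = act.step(obs)            # same four-value unpacking as in Example #10
    obs, rewards, dones, _ = env.step(actions)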
Example #8
# -*- coding: utf-8 -*-
import numpy as np
import torch.optim
from baselines.ddpg import ddpg, models
import util
import env
import agent

instrument = env.Instrument(2, (50, 70))

D = dict(s=[], a=[], r=[], s_=[], t=[])
epsilon = 0.1
MAX_LEN_D = 10000
batch_size = 100
gamma = 0.5

ddpg.learn('Player', instrument, instrument_range=instrument.region_length)
# print(instrument.observation_space.shape)
Example #9
        reward_scale=1.0,
    )


# Train
seed = 321
train = True
logger_dir = '/home/lihepeng/Documents/Github/tmp/ev/ddpg/train'
env = DummyVecEnv(
    [make_thunk(i, seed, train, logger_dir, mpi_rank) for i in range(nenv)])

model = learn(
    network='mlp',
    num_hidden=64,
    num_layers=3,
    env=env,
    seed=seed,
    total_timesteps=800000,
    nb_eval_steps=2000,
)

env.close()
df_train = load_results(logger_dir)

# Test
seed = 1314
train = False
logger.log("Running trained model")
logger_dir = '/home/lihepeng/Documents/Github/tmp/ev/ddpg/test'
env = DummyVecEnv(
    [make_thunk(i, seed, train, logger_dir, mpi_rank) for i in range(nenv)])
Example #10
    observation = env.reset()

    print('Training...')

    if False:
        ppo_model = ppo2.learn(network='lstm',
                               env=env.unwrapped,
                               total_timesteps=1000)

    if False:
        trpo_model = trpo_mpi.learn(network='lstm',
                                    env=env.unwrapped,
                                    total_timesteps=1000)

    if True:
        ddpg_model = ddpg.learn(network='mlp', env=env, total_timesteps=1000)

        obs = env.reset()

        while True:
            actions, _, state, _ = ddpg_model.step(obs)
            obs, _, done, _ = env.step(actions)
            env.render()
            done = done.any() if isinstance(done, np.ndarray) else done

            if done:
                obs = env.reset()

    print('Done.')
Example #11
from baselines.ddpg import ddpg
#  baselines.common.vec_env.dummy_vec_env.DummyVecEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.cmd_util import make_env
import gym
import sys
sys.path.append(r"./../")

from EnvNav.Env import RobotWorld

if __name__ == "__main__":
    env = RobotWorld(index=0)
    # env =  gym.make("Pendulum-v0")
    # env_id  = "Pendulum-v0"
    # env = make_env(env_id = env_id, env_type=None)
    env = DummyVecEnv([lambda: env])
    print(env.action_space)
    act = ddpg.learn(env=env, network="mlp", total_timesteps=10000)

    print("Finish!")