def main():
    # `network`, `env`, and `network_kwargs` are assumed to be defined in the
    # enclosing scope; the keyword values below mirror the defaults of
    # baselines.ddpg.ddpg.learn.
    learn(
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker
        nb_eval_steps=100,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        **network_kwargs)
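The call above only spells out the default keyword arguments. As a point of reference, a minimal self-contained sketch of a typical invocation follows; the Pendulum-v0 environment and the small step budget are illustrative assumptions, not part of the snippet above.

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ddpg import ddpg


def main():
    # baselines expects a vectorized environment, so wrap a single Gym env.
    env = DummyVecEnv([lambda: gym.make("Pendulum-v0")])

    # Train a small MLP actor/critic; all other arguments keep the defaults listed above.
    model = ddpg.learn(network='mlp', env=env, total_timesteps=10000)

    # Roll out the trained agent for a short while.
    obs = env.reset()
    for _ in range(200):
        actions, _, _, _ = model.step(obs)
        obs, rewards, dones, _ = env.step(actions)


if __name__ == "__main__":
    main()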
def run_baselines(env, seed, log_dir):
    """Create baselines model and training.

    Replace the ddpg and its training with the algorithm you want to run.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: The log file path.
    """
    seed = seed + 1000000
    set_global_seeds(seed)
    env.seed(seed)

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('seed={}, logdir={}'.format(
        seed, baselines_logger.get_dir()))

    env = DummyVecEnv([
        lambda: bench.Monitor(
            env, baselines_logger.get_dir(), allow_early_resets=True)
    ])

    ddpg.learn(network='mlp',
               env=env,
               nb_epochs=params['n_epochs'],
               nb_epoch_cycles=params['steps_per_epoch'],
               normalize_observations=False,
               critic_l2_reg=0,
               actor_lr=params['policy_lr'],
               critic_lr=params['qf_lr'],
               gamma=params['discount'],
               nb_train_steps=params['n_train_steps'],
               nb_rollout_steps=params['n_rollout_steps'],
               nb_eval_steps=100)

    return osp.join(log_dir, 'progress.csv')
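run_baselines reads its hyperparameters from a module-level params dict that is not shown in this excerpt. A hypothetical example containing only the keys the function looks up might be:

# Hypothetical hyperparameter dict consumed by run_baselines above.
# The keys match the lookups in the snippet; the values are only examples.
params = {
    'n_epochs': 500,
    'steps_per_epoch': 20,
    'policy_lr': 1e-4,
    'qf_lr': 1e-3,
    'discount': 0.99,
    'n_train_steps': 50,
    'n_rollout_steps': 100,
}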
def train(arglist):
    # Create environment
    env = make_env(arglist.scenario, arglist, arglist.benchmark)
    ddpg_env = DdpgEnv(env)

    from baselines.ddpg.ddpg import learn

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(arglist.log_path)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(arglist.log_path, format_strs=[])

    learn(
        network="mlp",
        env=ddpg_env,
        # total_timesteps=400,
        nb_rollout_steps=400,
        nb_epochs=3000,
        render=arglist.display)
def train(self):
    hostname = socket.gethostname()
    time_str = datetime.datetime.now().strftime('%y.%m.%d-%H:%M:%S')
    rand_str = str(int(random.random() * 100000))
    model_fname = ('runs/' + cfg_id + '-' + hostname + '-' + time_str
                   + '-' + rand_str + '-model')

    self.tb_logger = Logger(self.cfg, rand_str)
    logger.configure()

    total_timesteps = self.cfg['ppo']['total_timesteps']
    max_steps = self.cfg['aquarium']['max_steps']

    model = ddpg.learn(
        env=self.env,
        network=self.cfg['ddpg']['network'],
        total_timesteps=self.cfg['ddpg']['total_timesteps'],
        nb_epochs=None,  # This stays None.
        nb_epoch_cycles=10,
        nb_rollout_steps=max_steps,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker
        nb_eval_steps=100,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        load_path=None,
        # TODO: these should come from the 'ddpg' config section, not 'ppo'.
        num_layers=self.cfg['ppo']['num_layers'],
        num_hidden=self.cfg['ppo']['num_hidden'],
        tb_logger=self.tb_logger,
        evaluator=self.evaluate_and_log,
        model_fname=model_fname)

    # model.save(model_fname + '-F')  # F stands for final.
    # import pdb; pdb.set_trace()  # noqa
    self.evaluate_and_log(model, int(total_timesteps / max_steps))
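The method above pulls every setting from a nested self.cfg dictionary defined elsewhere. A hypothetical layout covering just the keys it reads, with placeholder values, could look like this:

# Hypothetical structure of self.cfg, limited to the keys used by train() above.
# The values are placeholders, not taken from the original project.
cfg = {
    'aquarium': {'max_steps': 500},
    'ddpg': {'network': 'mlp', 'total_timesteps': 1_000_000},
    'ppo': {'total_timesteps': 1_000_000, 'num_layers': 2, 'num_hidden': 64},
}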
def main():
    # unpause Simulation so that robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()

    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = make_vec_env(env_id, env_type, num_env, seed,
                       wrapper_kwargs=None,
                       start_index=0,
                       reward_scale=1.0,
                       flatten_dict_observations=True,
                       gamestate=None)

    act = ddpg.learn(env=env,
                     network='mlp',
                     total_timesteps=10000)
def main():
    num_env = 5
    env_id = "Pendulum-v0"
    env_type = "classic_control"
    seed = None

    env = make_vec_env(env_id, env_type, num_env, seed,
                       wrapper_kwargs=None,
                       start_index=0,
                       reward_scale=1.0,
                       flatten_dict_observations=True,
                       gamestate=None)

    act = ddpg.learn(env=env,
                     network='mlp',
                     total_timesteps=500000,
                     render=False)
def main():
    baselines.logger.configure(
        dir='/tmp/pendulum_ddpg',
        format_strs=['stdout', 'log', 'csv', 'tensorboard'])

    # env = gym.make("pendulum-legacy-v0")
    # env = gym.make("Pendulum-v0")
    env_id = "Pendulum-v0"
    env_type = "classic_control"
    num_env = 1
    seed = 1234
    reward_scale = 1.
    flatten_dict_observations = False

    # Pass these as keyword arguments so they don't land in the wrong
    # positional slots of make_vec_env.
    env = make_vec_env(env_id, env_type, num_env, seed,
                       reward_scale=reward_scale,
                       flatten_dict_observations=flatten_dict_observations)

    act = ddpg.learn(
        env=env,
        network='mlp',
        seed=seed,
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,  # True,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker
        nb_eval_steps=100,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50)

    print("Saving model to pendulum_model_ddpg.pkl")
    act.save("pendulum_model_ddpg.pkl")
# -*- coding: utf-8 -*-
import numpy as np
import torch.optim

from baselines.ddpg import ddpg, models

import util
import env
import agent

instrument = env.Instrument(2, (50, 70))

D = dict(s=[], a=[], r=[], s_=[], t=[])
epsilon = 0.1
MAX_LEN_D = 10000
batch_size = 100
gamma = 0.5

ddpg.learn('Player', instrument, instrument_range=instrument.region_length)
# print(instrument.observation_space.shape)
        reward_scale=1.0,
    )  # tail of a preceding call that is truncated in this excerpt

# Train
seed = 321
train = True
logger_dir = '/home/lihepeng/Documents/Github/tmp/ev/ddpg/train'
env = DummyVecEnv(
    [make_thunk(i, seed, train, logger_dir, mpi_rank) for i in range(nenv)])

model = learn(
    network='mlp',
    num_hidden=64,
    num_layers=3,
    env=env,
    seed=seed,
    total_timesteps=800000,
    nb_eval_steps=2000,
)
env.close()
df_train = load_results(logger_dir)

# Test
seed = 1314
train = False
logger.log("Running trained model")
logger_dir = '/home/lihepeng/Documents/Github/tmp/ev/ddpg/test'
env = DummyVecEnv(
    [make_thunk(i, seed, train, logger_dir, mpi_rank) for i in range(nenv)])
observation = env.reset()

print('Training...')
if False:
    ppo_model = ppo2.learn(network='lstm', env=env.unwrapped, total_timesteps=1000)
if False:
    trpo_model = trpo_mpi.learn(network='lstm', env=env.unwrapped, total_timesteps=1000)
if True:
    ddpg_model = ddpg.learn(network='mlp', env=env, total_timesteps=1000)

obs = env.reset()
while True:
    actions, _, state, _ = ddpg_model.step(obs)
    obs, _, done, _ = env.step(actions)
    env.render()
    done = done.any() if isinstance(done, np.ndarray) else done
    if done:
        obs = env.reset()

print('Done.')
from baselines.ddpg import ddpg
# baselines.common.vec_env.dummy_vec_env.DummyVecEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.cmd_util import make_env
import gym

import sys
sys.path.append(r"./../")
from EnvNav.Env import RobotWorld

if __name__ == "__main__":
    env = RobotWorld(index=0)
    # env = gym.make("Pendulum-v0")
    # env_id = "Pendulum-v0"
    # env = make_env(env_id=env_id, env_type=None)
    env = DummyVecEnv([lambda: env])
    print(env.action_space)

    act = ddpg.learn(env=env,
                     network="mlp",
                     total_timesteps=10000)

    print("Finish!")