def main(params):
    # `parser`, `seed`, and `cfg` are assumed to be defined at module level
    # (argument parser, global seed, and agent config); they are not part
    # of this snippet.
    config = vars(parser.parse_args())

    # env = gym.make(config['env'])
    env = make_env(config['env'])
    env.seed(seed)
    agent = PPO(env, cfg['agent'])
    tag = params['tag']

    # Initiate the tracker for stats
    tracker = Tracker(
        config['env'],  # env.unwrapped.spec.id
        tag,
        seed,
        cfg['agent'],
        ['Epoch', 'Ep_Reward', 'Cost'])

    # Train the agent
    agent.train(tracker,
                n_episodes=config['epochs'],
                n_step=config['stepmax'],
                verbose=config['verbose'],
                params=cfg['agent'],
                hyperp=config)
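# --- Hypothetical module-level setup assumed by main() above. ---
# The names mirror what main() reads; the argument names, defaults, and
# the contents of `cfg` are illustrative, not taken from the original
# project.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='CartPole-v1')
parser.add_argument('--epochs', type=int, default=1000)
parser.add_argument('--stepmax', type=int, default=1000)
parser.add_argument('--verbose', action='store_true')

seed = 0                                      # global seed used by main()
cfg = {'agent': {'lr': 3e-4, 'gamma': 0.99}}  # placeholder agent config

if __name__ == '__main__':
    main({'tag': 'ppo-baseline'})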
def _thunk():
    env = make_env(
        cube_goal_pose=goal_dict,
        goal_difficulty=goal_difficulty,
        action_space=action_space,
        frameskip=frameskip,
        sim=sim,
        visualization=visualization,
        reward_fn=reward_fn,
        termination_fn=termination_fn,
        initializer=initializer,
        # make this long enough to ensure that we have "episode_length"
        # steps in the residual_state
        episode_length=10 * episode_length,
        rank=rank,
        monitor=monitor,
    )
    if domain_randomization:
        env = RandomizedEnvWrapper(env)
    env = ResidualWrapper(env, state_machine, frameskip, max_torque,
                          residual_state, max_length=episode_length)
    env = EpisodeInfo(env)
    env.seed(seed + rank)
    return env
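# --- Hypothetical usage of the _thunk factory with a vectorized env. ---
# Thunks like the one above are typically built in a closure, one per
# worker, and handed to a vector-env constructor. SubprocVecEnv here is
# the stable-baselines3 implementation; the original project may use a
# different one. make_thunk(rank) is an assumed factory returning a
# _thunk that closes over `rank`, so each worker seeds with seed + rank.
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_vec_envs(num_envs, make_thunk):
    return SubprocVecEnv([make_thunk(rank) for rank in range(num_envs)])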
def init(env_name, args, final_init=True):
    if env_name == 'levers':
        env = gym.make('Levers-v0')
        env.multi_agent_init(args.total_agents, args.nagents)
        env = GymWrapper(env)
    elif env_name == 'number_pairs':
        env = gym.make('NumberPairs-v0')
        m = args.max_message
        env.multi_agent_init(args.nagents, m)
        env = GymWrapper(env)
    elif env_name == 'predator_prey':
        env = gym.make('PredatorPrey-v0')
        if args.display:
            env.init_curses()
        env.multi_agent_init(args)
        env = GymWrapper(env)
    elif env_name == 'traffic_junction':
        from ic3net_envs.ic3net_envs.traffic_junction_env import TrafficJunctionEnv
        env = TrafficJunctionEnv()
        # env = gym.make('TrafficJunction-v0')
        if args.display:
            env.init_curses()
        env.multi_agent_init(args)
        env = GymWrapper(env)
    elif env_name == 'starcraft':
        env = gym.make('StarCraftWrapper-v0')
        env.multi_agent_init(args, final_init)
        env = GymWrapper(env.env)
    elif env_name in ('simple_tag', 'simple_spread'):
        env = make_env(env_name, args)
        env = EnvWrapper(env)
    else:
        raise RuntimeError("wrong env name")
    return env
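# --- Illustrative call to init(); not from the original project. ---
# Only a couple of the attributes the wrapped envs read from `args` are
# shown (the env's own multi_agent_init(args) will likely read more),
# and the values are placeholders.
import argparse

args = argparse.Namespace(nagents=5, display=False)
env = init('predator_prey', args)
obs = env.reset()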
def _init_env(goal_pose_dict, difficulty):
    eval_config = {
        'action_space': 'torque_and_position',
        'frameskip': 3,
        'reward_fn': 'competition_reward',
        'termination_fn': 'no_termination',
        'initializer': 'random_init',
        'monitor': False,
        'visualization': True,
        'sim': True,
        'rank': 0
    }

    set_seed(0)
    env = make_env(goal_pose_dict, difficulty, **eval_config)
    return env
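# --- Illustrative usage, reusing the goal-sampling pattern from the
# __main__ block later in this section; difficulty value is a placeholder. ---
from trifinger_simulation.tasks import move_cube

goal = move_cube.sample_goal(1).to_dict()
env = _init_env(goal, difficulty=1)
obs = env.reset()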
def _init_env(goal_pose_json, difficulty, path=None):
    eval_config = {
        'action_space': 'torque_and_position',
        'frameskip': 3,
        'reward_fn': 'competition_reward',
        'termination_fn': 'no_termination',
        'initializer': 'bo_init',
        'monitor': False,
        'visualization': False,
        'sim': True,
        'rank': 0,
        'episode_length': EPISODE_LEN_SIM
    }

    set_seed(0)
    goal_pose_dict = json.loads(goal_pose_json)
    env = make_env(goal_pose_dict, difficulty, path=path, **eval_config)
    return env
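# --- Illustrative usage of the JSON variant, convenient when the goal
# arrives via a CLI flag or a file; values are placeholders. ---
import json
from trifinger_simulation.tasks import move_cube

goal_json = json.dumps(move_cube.sample_goal(3).to_dict())
env = _init_env(goal_json, difficulty=3)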
def main(args):
    import time  # hoisted out of the eval step loop

    env = make_env(args.scenario)
    n_agents = env.n
    n_actions = env.world.dim_p
    # env = ActionNormalizedEnv(env)
    # env = ObsEnv(env)
    n_states = env.observation_space[0].shape[0]

    torch.manual_seed(args.seed)

    if args.tensorboard and args.mode == "train":
        writer = SummaryWriter(log_dir='runs/' + args.algo + "/" + args.log_dir)

    if args.algo == "bicnet":
        model = BiCNet(n_states, n_actions, n_agents, args)
    if args.algo == "commnet":
        model = CommNet(n_states, n_actions, n_agents, args)
    if args.algo == "maddpg":
        model = MADDPG(n_states, n_actions, n_agents, args)
    print(model)
    model.load_model()

    episode = 0
    total_step = 0

    while episode < args.max_episodes:
        state = env.reset()
        episode += 1
        step = 0
        accum_reward = 0
        rewardA = 0
        rewardB = 0
        rewardC = 0

        while True:
            if args.mode == "train":
                action = model.choose_action(state, noisy=True)
                next_state, reward, done, info = env.step(action)
                step += 1
                total_step += 1
                reward = np.array(reward)

                # Reward shaping: add a distance-based term to the scaled
                # environment reward.
                rew1 = reward_from_state(next_state)
                reward = rew1 + (np.array(reward, dtype=np.float32) / 100.)
                accum_reward += sum(reward)
                rewardA += reward[0]
                rewardB += reward[1]
                rewardC += reward[2]

                if args.algo == "maddpg" or args.algo == "commnet":
                    obs = torch.from_numpy(np.stack(state)).float().to(device)
                    obs_ = torch.from_numpy(np.stack(next_state)).float().to(device)
                    if step != args.episode_length - 1:
                        next_obs = obs_
                    else:
                        next_obs = None
                    rw_tensor = torch.FloatTensor(reward).to(device)
                    ac_tensor = torch.FloatTensor(action).to(device)
                    if args.algo == "commnet" and next_obs is not None:
                        model.memory.push(obs.data, ac_tensor, next_obs, rw_tensor)
                    if args.algo == "maddpg":
                        model.memory.push(obs.data, ac_tensor, next_obs, rw_tensor)
                    obs = next_obs
                else:
                    model.memory(state, action, reward, next_state, done)

                state = next_state

                if args.episode_length < step or (True in done):
                    c_loss, a_loss = model.update(episode)
                    print("[Episode %05d] reward %6.4f" % (episode, accum_reward))
                    if args.tensorboard:
                        writer.add_scalar(tag='agent/reward', global_step=episode,
                                          scalar_value=accum_reward.item())
                        writer.add_scalar(tag='agent/reward_0', global_step=episode,
                                          scalar_value=rewardA.item())
                        writer.add_scalar(tag='agent/reward_1', global_step=episode,
                                          scalar_value=rewardB.item())
                        writer.add_scalar(tag='agent/reward_2', global_step=episode,
                                          scalar_value=rewardC.item())
                        if c_loss and a_loss:
                            writer.add_scalars('agent/loss', global_step=episode,
                                               tag_scalar_dict={'actor': a_loss,
                                                                'critic': c_loss})
                    if c_loss and a_loss:
                        print(" a_loss %3.2f c_loss %3.2f" % (a_loss, c_loss), end='')
                    if episode % args.save_interval == 0 and args.mode == "train":
                        model.save_model(episode)
                    env.reset()
                    # model.reset()
                    break

            elif args.mode == "eval":
                action = model.choose_action(state, noisy=False)
                next_state, reward, done, info = env.step(action)
                step += 1
                total_step += 1
                state = next_state
                reward = np.array(reward)
                time.sleep(0.02)  # slow the rollout down for visualization
                env.render()

                rew1 = reward_from_state(next_state)
                reward = rew1 + (np.array(reward, dtype=np.float32) / 100.)
                accum_reward += sum(reward)
                rewardA += reward[0]
                rewardB += reward[1]
                rewardC += reward[2]

                if args.episode_length < step or (True in done):
                    print("[Episode %05d] reward %6.4f " % (episode, accum_reward))
                    env.reset()
                    break

    if args.tensorboard:
        writer.close()
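# --- Hypothetical sketch of reward_from_state(), which both training
# loops use for shaping but which is not shown in this section. It
# assumes an MPE-style per-agent observation whose slice [4:10] holds
# (x, y) offsets to three landmarks; the slice indices and the
# nearest-landmark rule are assumptions, not the original code. ---
import numpy as np

def reward_from_state(next_state):
    rewards = np.zeros(len(next_state), dtype=np.float32)
    for i, obs in enumerate(next_state):
        # Assumed layout: obs[4:10] = relative positions of 3 landmarks.
        rel_landmarks = np.asarray(obs[4:10], dtype=np.float32).reshape(-1, 2)
        dists = np.linalg.norm(rel_landmarks, axis=1)
        rewards[i] = -dists.min()  # closer to the nearest landmark = better
    return rewards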
        path = Path(cube_path, joint_conf_path, tip_path, grasp)
        path.set_min_height(self.env, path_min_height)
        return path


if __name__ == '__main__':
    from trifinger_simulation.tasks import move_cube
    from env.make_env import make_env

    reward_fn = 'competition_reward'
    termination_fn = 'position_close_to_goal'
    initializer = 'small_rot_init'
    env = make_env(move_cube.sample_goal(-1).to_dict(), 4,
                   reward_fn=reward_fn,
                   termination_fn=termination_fn,
                   initializer=initializer,
                   action_space='position',
                   sim=True,
                   visualization=True)
    for i in range(1):
        obs = env.reset()
        pos = obs["object_position"]
        quat = obs["object_orientation"]
        goal_pos = obs["goal_object_position"]
        goal_quat = obs["goal_object_orientation"]
        planner = WholeBodyPlanner(env)
        # The original call was truncated here; goal_quat is the natural
        # remaining argument given the values unpacked above.
        path = planner.plan(pos, quat, goal_pos, goal_quat)
def main(args):
    env = make_env('simple_spread')
    # env = make_env('simple')
    # env = gym.make('Pendulum-v0')
    env = ActionNormalizedEnv(env)
    env = ObsEnv(env)

    kwargs = dict()
    kwargs['config'] = args

    torch.manual_seed(args.seed)

    if args.tensorboard:
        writer = SummaryWriter(log_dir='runs/' + args.log_dir)

    model = BiCNet(14, 2, 3, **kwargs)
    # model = BiCNet(4, 2, 1, **kwargs)

    episode = 0
    total_step = 0

    while episode < args.max_episodes:
        state = env.reset()
        episode += 1
        step = 0
        accum_reward = 0
        rewardA = 0
        rewardB = 0
        rewardC = 0
        prev_reward = np.zeros((3), dtype=np.float32)

        while True:
            # action = agent.random_action()
            if episode > args.warmup:
                action = model.choose_action(state, noisy=True)
            else:
                action = model.random_action()

            next_state, reward, done, info = env.step(action)
            step += 1
            total_step += 1
            reward = np.array(reward)

            # Reward shaping: distance to landmarks
            rew1 = reward_from_state(next_state)
            # if step % 5 == 0:
            #     rew1 -= 0.1
            reward = rew1 + (np.array(reward, dtype=np.float32) / 100.)
            accum_reward += sum(reward)
            rewardA += reward[0]
            rewardB += reward[1]
            rewardC += reward[2]

            if args.render and episode % 100 == 0:
                env.render(mode='rgb_array')

            model.memory(state, action, reward, next_state, done)
            state = next_state

            if len(model.replay_buffer) >= args.batch_size and total_step % args.steps_per_update == 0:
                model.prep_train()
                model.train()
                model.prep_eval()

            if args.episode_length < step or (True in done):
                c_loss, a_loss = model.get_loss()
                action_std = model.get_action_std()
                print("[Episode %05d] reward %6.4f eps %.4f"
                      % (episode, accum_reward, model.epsilon), end='')
                if args.tensorboard:
                    writer.add_scalar(tag='agent/reward', global_step=episode,
                                      scalar_value=accum_reward.item())
                    writer.add_scalar(tag='agent/reward_0', global_step=episode,
                                      scalar_value=rewardA.item())
                    writer.add_scalar(tag='agent/reward_1', global_step=episode,
                                      scalar_value=rewardB.item())
                    writer.add_scalar(tag='agent/reward_2', global_step=episode,
                                      scalar_value=rewardC.item())
                    writer.add_scalar(tag='agent/epsilon', global_step=episode,
                                      scalar_value=model.epsilon)
                    if c_loss and a_loss:
                        writer.add_scalars('agent/loss', global_step=episode,
                                           tag_scalar_dict={'actor': a_loss,
                                                            'critic': c_loss})
                    if action_std:
                        writer.add_scalar(tag='agent/action_std', global_step=episode,
                                          scalar_value=action_std)
                if c_loss and a_loss:
                    print(" a_loss %3.2f c_loss %3.2f" % (a_loss, c_loss), end='')
                if action_std:
                    print(" action_std %3.2f" % (action_std), end='')
                print()
                env.reset()
                model.reset()
                break

    if args.tensorboard:
        writer.close()
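# --- Illustrative CLI for the main() above, covering every attribute it
# reads from `args`; the defaults are placeholders, not the original
# project's values. ---
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=777)
parser.add_argument('--max_episodes', type=int, default=20000)
parser.add_argument('--episode_length', type=int, default=50)
parser.add_argument('--warmup', type=int, default=100)
parser.add_argument('--batch_size', type=int, default=256)
parser.add_argument('--steps_per_update', type=int, default=100)
parser.add_argument('--render', action='store_true')
parser.add_argument('--tensorboard', action='store_true')
parser.add_argument('--log_dir', type=str, default='bicnet')

if __name__ == '__main__':
    main(parser.parse_args())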