def get_time_to_first_contact(env, policy, is_random=False, num_trajs=100):
    import itertools
    import numpy as np

    time_contact = []
    if is_random:
        from rllab.policies.uniform_control_policy import UniformControlPolicy
        policy = UniformControlPolicy(env.spec)
    print("Using {}".format(policy))
    for traj_i in range(num_trajs):
        obs = env.reset()
        print("Start traj {}".format(traj_i))
        for t in itertools.count():
            action, _ = policy.get_action(obs)
            obs, reward, done, env_info = env.step(action)
            if env_info['contact_reward'] > 0 or done:
                time_contact.append(t)
                break
    # plt.hist(time_contact)
    # plt.title("Time to first contact over {} trajectories".format(num_trajs))
    # plt.show()
    data_path = input("Where do you want to save it? \n")
    np.save(data_path, time_contact)
    print("Data saved")
    print(
        "Mean time to first contact: {}, median:{}, std:{} for {}, ({} trajectories)"
        .format(np.mean(time_contact), np.median(time_contact),
                np.std(time_contact), policy, num_trajs))
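A minimal usage sketch for the function above, assuming a snapshot pickle produced by rllab's joblib-based logger (as in Example #8) and an environment whose step() reports a 'contact_reward' entry in env_info; the snapshot path is hypothetical.

import joblib
import tensorflow as tf

with tf.Session() as sess:
    data = joblib.load("params.pkl")  # hypothetical snapshot path
    env, policy = data["env"], data["policy"]
    # Evaluate the trained policy, then the uniform-random baseline for comparison.
    get_time_to_first_contact(env, policy, is_random=False, num_trajs=100)
    get_time_to_first_contact(env, policy, is_random=True, num_trajs=100)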

    # Note: the constructor below is a fragment from a different class (an
    # algorithm wrapper that pairs a UniformControlPolicy with a RandomStrategy
    # and logs to TensorBoard); it is not part of get_time_to_first_contact above.
    def __init__(self, env, tensorboard_path, **kwargs):
        exploration_strategy = RandomStrategy(env.spec)
        policy = UniformControlPolicy(env.spec)
        super().__init__(env, policy, exploration_strategy)

        self.summary_writer = tf.summary.FileWriter(
            tensorboard_path, graph=tf.get_default_graph())
        self.summary = None
Example #3
 def __init__(self,
              sess,
              env,
              cost_approximator,
              cost_trainer,
              novice_policy,
              novice_policy_optimizer,
              num_frames=4,
              concat_timesteps=True,
              train_disc=True):
     """
     sess : tensorflow session
     cost_approximator : the NN or whatever cost function that can take in your observations/states and then give you your reward
     cost_trainer : this is the trainer for optimizing the cost (i.e. runs tensorflow training ops, etc.)
     novice_policy : the policy of your novice agent
     novice_policy_optimizer : the optimizer which runs a policy optimization step (or constrained number of iterations)
     much of this can be found in https://github.com/bstadie/third_person_im/blob/master/sandbox/bradly/third_person/algos/cyberpunk_trainer.py#L164
     """
     self.sess = sess
     self.env = env
     self.cost_approximator = cost_approximator
     self.cost_trainer = cost_trainer
     self.iteration = 0
     self.novice_policy = novice_policy
     self.novice_policy_optimizer = novice_policy_optimizer
     # self.sampler = BaseSampler(self.novice_policy_optimizer)
     self.concat_timesteps = concat_timesteps
     self.num_frames = num_frames
     self.replay_buffer = {}
     self.max_replays = 3
     self.replay_index = 0
     self.replay_times = 40
     self.should_train_cost = True
     self.prev_reward_dist = None
     self.is_first_disc_update = True
     self.gc_time = time.time()
     self.gc_time_threshold = 60  # seconds between garbage collection
     # as in traditional GANs, we add failure noise
     self.noise_fail_policy = UniformControlPolicy(env.spec)
     self.train_disc = train_disc
     self.zero_baseline = ZeroBaseline(env_spec=env.spec)
     self.rand_algo = NOP(
         env=env,
         policy=self.noise_fail_policy,
         baseline=self.zero_baseline,
         batch_size=1 * self.env.horizon,
         max_path_length=self.env.horizon,
         n_itr=1,
         discount=0.995,
         step_size=0.01,
     )
     self.rand_algo.start_worker()  # TODO: Call this in constructor instead?
     self.rand_algo.init_opt()
     self.should_do_policy_step = True
     self.should_do_exploration = True
     self.num_steps_since_last_trpo = 0
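A hedged setup sketch for the constructor above. The environment, novice policy, and TRPO optimizer are built from plain rllab components (the project may use TF-based equivalents instead); cost_approximator and cost_trainer are project-specific objects that do not appear in this snippet, so the final construction call is only indicated in a comment with a placeholder class name.

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(CartpoleEnv())
novice_policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
novice_policy_optimizer = TRPO(
    env=env,
    policy=novice_policy,
    baseline=LinearFeatureBaseline(env_spec=env.spec),
    batch_size=4000,
    max_path_length=100,
    n_itr=1,  # the trainer drives the outer loop, so one optimizer iteration per call
    discount=0.995,
)

# with tf.Session() as sess:
#     trainer = <YourTrainerClass>(sess, env, cost_approximator, cost_trainer,
#                                  novice_policy, novice_policy_optimizer)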
Example #4
def episode_reward(env, policy, is_random=False, num_trajs=100):
	import itertools
	import numpy as np
	import matplotlib.pyplot as plt

	mean_reward = []
	if is_random:
		from rllab.policies.uniform_control_policy import UniformControlPolicy
		policy = UniformControlPolicy(env.spec)
	print("Using {}".format(policy))
	for traj_i in range(num_trajs):
		obs = env.reset()
		print("Start traj {}".format(traj_i))
		rewards = []
		for t in itertools.count():
			action, _ = policy.get_action(obs)
			obs, reward, done, env_info = env.step(action)
			rewards.append(reward)
			if done:
				break
		mean_reward.append(np.mean(rewards))
	plt.hist(mean_reward)
	plt.title("Mean episode reward over {} trajectories".format(num_trajs))
	plt.show()
	print("Mean episode reward: {} for {}, ({} trajectories)".format(np.mean(mean_reward), policy, num_trajs))
Example #5
def test_state_hist(env):
	policy = UniformControlPolicy(env.spec)
	_states = []
	o = env.reset()
	try:
		while True:
			_states.append(o)
			a, _ = policy.get_action(o)
			next_o, r, d, env_info = env.step(a)
			if d:
				o = env.reset()
			else:
				o = next_o
	except KeyboardInterrupt:
		states = np.asarray(_states)
		save_path = '/Users/dianchen/state.npy'
		np.save(save_path, states)
		# pickle.dump(states, save_path)
		print ("State samples saved to {}".format(save_path))
Example #6
def random_action_launcher(variant):
	from railrl.algos.noop_algo import NoOpAlgo
	from rllab.exploration_strategies.ou_strategy import OUStrategy
	from rllab.policies.uniform_control_policy import UniformControlPolicy
	from railrl.launchers.launcher_util import get_env_settings
	env_settings = get_env_settings(**variant['env_params'])
	env = env_settings['env']
	es = OUStrategy(env)
	policy = UniformControlPolicy(env_spec=env.spec)
	algorithm = NoOpAlgo(
		env,
		policy,
		es,
		**variant['algo_params']
	)
	algorithm.train()
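A hedged call sketch for random_action_launcher: the keys accepted inside 'env_params' and 'algo_params' depend on get_env_settings and NoOpAlgo, which are not shown here, so the dictionaries below are placeholders.

variant = {
    'env_params': {},   # placeholder: whatever get_env_settings expects
    'algo_params': {},  # placeholder: keyword arguments forwarded to NoOpAlgo
}
random_action_launcher(variant)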
Example #7
                    policy, sess)
 elif args.test_inverse_loss:
     investigate_inverse_loss(encoder,
                              inverse_model,
                              forward_model,
                              env,
                              policy,
                              sess,
                              img_path=args.data_path,
                              num_trajs=100,
                              animate=args.render)
 elif args.test_forward_loss:
     if policy is None:
         # TODO: Remove this hack after CoRL deadline
         from rllab.policies.uniform_control_policy import UniformControlPolicy
         policy = UniformControlPolicy(env.spec)
     investigate_forward_loss(encoder,
                              inverse_model,
                              forward_model,
                              env,
                              policy,
                              sess,
                              data_path=args.data_path,
                              num_trajs=200,
                              animate=args.render,
                              num_top=50)
 elif args.plot_forward:
     plot_forward(encoder, inverse_model, forward_model, env,
                  policy, sess)
 elif args.time_contact:
     get_time_to_first_contact(env,
Example #8
    if args.seed >= 0:
        set_seed(args.seed)
    if args.collection_file:
        all_feasible_starts = pickle.load(open(args.collection_file, 'rb'))

    with tf.Session() as sess:
        data = joblib.load(args.file)
        if "algo" in data:
            policy = data["algo"].policy
            env = data["algo"].env
        else:
            policy = data['policy']
            env = data['env']

        if args.random_policy:
            policy = UniformControlPolicy(env_spec=env.spec)

        while True:
            if args.init_state:
                from sandbox.envs.base import FixedStateGenerator
                env.update_start_generator(FixedStateGenerator(
                    args.init_state))
            elif args.collection_file:
                from sandbox.envs.base import UniformListStateGenerator
                init_states = all_feasible_starts.sample(1000)
                env.update_start_generator(
                    UniformListStateGenerator(init_states))
            if args.deterministic:
                with policy.set_std_to_0():
                    path = rollout(env,
                                   policy,
Example #9
from rllab.algos.nop import NOP
from rllab.baselines.zero_baseline import ZeroBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.uniform_control_policy import UniformControlPolicy

env = normalize(CartpoleEnv())

policy = UniformControlPolicy(
    env_spec=env.spec,
    # UniformControlPolicy samples actions uniformly from the action space,
    # so there is no network architecture to configure.
)

baseline = ZeroBaseline(env_spec=env.spec)

algo = NOP(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
)
algo.train()