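# Imports used by train() below (a sketch: the rllab module paths are the
# standard ones, but the project-local paths for the discriminator, the
# CyberPunk trainer, and the random policy are assumptions about this
# repo's layout and may need adjusting).
import pickle

import tensorflow as tf

import rllab.misc.logger as logger
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from sandbox.rocky.tf.algos.trpo import TRPO
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
    ConjugateGradientOptimizer, FiniteDifferenceHvp)
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy

# Project-local modules (paths assumed, adjust to this repo's layout):
from sandbox.bradly.third_person.algos.cyberpunk_trainer import CyberPunkTrainer
from sandbox.bradly.third_person.discriminators.discriminator import \
    DomainConfusionVelocityDiscriminator
from sandbox.bradly.third_person.policy.random_policy import RandomPolicy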
def train(self):
    # Wrap the raw expert/novice environments (e.g. GymEnv("Pusher3DOF-v1")
    # and GymEnv("Pusher3DOFNoChange-v1")) for the TF-based algorithms.
    expert_env = TfEnv(self.expert_env)
    novice_env = TfEnv(self.novice_env)
    # A random policy in the expert environment supplies failure examples
    # for the discriminator.
    expert_fail_pol = RandomPolicy(expert_env.spec)

    policy = GaussianMLPPolicy(
        name="novice_policy",
        env_spec=novice_env.spec,
        init_std=10,
        # Two hidden layers with 32 units each.
        hidden_sizes=(32, 32))

    # Value baseline fitted on novice rollouts, so it uses the novice spec.
    baseline = LinearFeatureBaseline(env_spec=novice_env.spec)

    algo = TRPO(
        env=novice_env,
        policy=policy,
        baseline=baseline,
        batch_size=50 * 500,
        max_path_length=self.horizon,
        n_itr=self.itrs,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # With n_itr = start_itr = 0, algo.train() builds the computation
        # graph and initializes the TF variables but runs zero TRPO
        # iterations; the real updates are driven by the trainer loop below.
        algo.n_itr = 0
        algo.start_itr = 0
        algo.train(sess=sess)

        # Discriminator over raw image observations: two-way class output
        # (success vs. failure) and two-way domain output (expert vs.
        # novice viewpoint) for the domain-confusion loss.
        im_height = self.imsize[0]
        im_width = self.imsize[1]
        im_channels = 3
        dim_input = [im_height, im_width, im_channels]
        disc = DomainConfusionVelocityDiscriminator(input_dim=dim_input,
                                                    output_dim_class=2,
                                                    output_dim_dom=2,
                                                    tf_sess=sess)

        # Load the pre-trained expert policy from disk.
        with open(self.expert_pkl, 'rb') as pfile:
            expert_policy = pickle.load(pfile)

        # Restore the real iteration budget before handing TRPO to the trainer.
        algo.n_itr = self.itrs

        trainer = CyberPunkTrainer(disc=disc,
                                   novice_policy_env=novice_env,
                                   expert_fail_pol=expert_fail_pol,
                                   expert_env=expert_env,
                                   novice_policy=policy,
                                   novice_policy_opt_algo=algo,
                                   expert_success_pol=expert_policy,
                                   im_width=im_width,
                                   im_height=im_height,
                                   im_channels=im_channels,
                                   tf_sess=sess,
                                   horizon=self.horizon)

        # Alternate discriminator and policy updates for self.itrs iterations.
        for iter_step in range(self.itrs):
            logger.record_tabular('Iteration', iter_step)
            trainer.take_iteration(n_trajs_cost=self.trajs,
                                   n_trajs_policy=self.trajs)
            logger.dump_tabular(with_prefix=False)

        trainer.log_and_finish()
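# Usage sketch (an illustration, not part of the original file): train()
# reads expert_env, novice_env, horizon, itrs, imsize, expert_pkl and
# trajs off `self`, so any object exposing those attributes works. The
# Pusher3DOF environment IDs come from the hints in the code above; the
# attribute values below are hypothetical placeholders. If train() is
# actually a method of a class, instantiate that class instead.
if __name__ == '__main__':
    from types import SimpleNamespace
    from rllab.envs.gym_env import GymEnv

    runner = SimpleNamespace(
        expert_env=GymEnv("Pusher3DOF-v1", force_reset=True,
                          record_video=False),
        novice_env=GymEnv("Pusher3DOFNoChange-v1", force_reset=True,
                          record_video=False),
        horizon=50,                      # max rollout length
        itrs=100,                        # number of trainer iterations
        imsize=(48, 48),                 # (height, width) of image observations
        expert_pkl='expert_policy.pkl',  # pickled pre-trained expert policy
        trajs=10)                        # trajectories per cost/policy update
    train(runner)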