import traceback
from collections import deque

import numpy as np

# Repository-specific helpers (policies, dqn, MCTS, ObservableModel,
# FiniteHistoryModel, get_policy, load_environment, load_params, save_params,
# load_observable_policy, load_finite_mem_policy, load_rpsp_policy,
# learn_policy) are assumed to be imported elsewhere in this module.


def get_exploration_trajs(args, model_exp, env, output_dim, min_traj_length):
    """
    Get exploration data for initialization.
    @param args: command line arguments
    @param model_exp: exploration model
    @param env: environment
    @param output_dim: action dimension
    @param min_traj_length: minimum trajectory length
    @return: list of observations (do x L) and list of actions (da x L)
    """
    X_obs_good = []
    X_act_good = []
    pi_exp = policies.RandomGaussianPolicy(output_dim, rng=args.rng)
    if args.load_reactive != '':
        # load a previously trained reactive policy
        re_params = load_params(args.load_reactive)
        if len(re_params) > 0:
            x_dim = env.dimensions[0]
            model_exp = FiniteHistoryModel(obs_dim=x_dim, past_window=args.filter_w)
            state_dim = model_exp.state_dimension
            # keep only the weights acting on the finite-history state
            re_params['layer_id_0_W'] = re_params['layer_id_0_W'][:, -state_dim:]
            pi_exp = get_policy[args.init_policy](x_dim=state_dim, output_dim=output_dim,
                                                  num_layers=args.nL, nh=args.nh,
                                                  activation=args.nn_act, rng=args.rng,
                                                  min_std=args.min_std)
            pi_exp._load(re_params)
    leni = args.len if args.leni is None else args.leni
    exp_trajs = env.run(model_exp, pi_exp, leni, render=False,
                        min_traj_length=min_traj_length,
                        num_trajs=args.initN, num_samples=args.initS)
    # print('Using %d exp trajectories.' % len(exp_trajs))
    col_trajs = [(t.obs, t.act) for t in exp_trajs]
    X_obs_rnd = [c[0] for c in col_trajs]
    X_act_rnd = [c[1] for c in col_trajs]
    X_obs = X_obs_rnd + X_obs_good
    X_act = X_act_rnd + X_act_good
    return X_obs, X_act
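
# --- Illustrative sketch (not from the original file): the lists returned by
# get_exploration_trajs hold one obs_dim x L_i and one act_dim x L_i matrix
# per trajectory. A hypothetical consumer that needs a single flat dataset
# could concatenate them along the time axis like this. ---
def _stack_exploration_data(X_obs, X_act):
    """Concatenate per-trajectory matrices into obs_dim x sum(L_i) arrays."""
    obs_mat = np.concatenate(X_obs, axis=1)
    act_mat = np.concatenate(X_act, axis=1)
    return obs_mat, act_mat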
def run_policy_continuous_dqn(args, flname):
    """
    Train a DQN agent with a discretized action grid on top of the filtering
    model's state (variant of the continuous RPSPnet training loop).
    @param args: command line args
    @param flname: filename to store results (running mean rewards are logged)
    @return: None
    """
    args.flname = flname
    env = load_environment(args)
    (x_dim, a_dim) = env.dimensions
    args.a_dim = a_dim
    args.x_dim = x_dim
    print('obs dim:', x_dim)
    model_exp = ObservableModel(x_dim)
    pi_exp = policies.RandomGaussianPolicy(x_dim, rng=args.rng)
    baseline = args.b
    min_traj_length = getattr(args, 'mintrajlen', args.past + args.fut + 2)
    PiUpdater = None
    fkwargs = {'baseline': baseline, 'lr': args.lr, 'beta_reinf': args.wrwd,
               'beta_pred': args.wpred, 'beta_pred_decay': args.wdecay,
               'beta_only_reinf': args.wrwd_only, 'gamma': args.gamma,
               'grad_step': args.grad_step, 'trpo_step': args.trpo_step,
               'past': args.past, 'fut': args.fut, 'cg_opt': args.cg_opt,
               'max_traj_length': args.len, 'num_trajs': args.numtrajs,
               'normalize_grad': args.norm_g, 'hvec': args.hvec,
               'env': env, 'min_traj_len': min_traj_length}
    print('build updater ...', args.method)
    if args.method == 'obsVR':
        # observable model with reactive policy
        model, PiUpdater, pp = load_observable_policy(args, model_exp, **fkwargs)
    elif args.method == 'arVR':
        # finite-memory reactive policy
        model, PiUpdater, pp = load_finite_mem_policy(args, model_exp, **fkwargs)
    else:
        # PSR network with observable model or PSR model
        model, PiUpdater, pp = load_rpsp_policy(args, model_exp, **fkwargs)
    print('done building updater')
    print('len:', args.len, 'num trajs:', args.numtrajs, 'iter:', args.iter)

    state_shape = (1, model._state_dim)
    num_actions = 64          # 6-bit action index: two 3-bit codes, one per action dim
    batch_size = 8
    q_learner = dqn.Agent(state_shape, num_actions, batch_size=batch_size)
    best_mean_rewards = -100
    best_rewards = -100
    MAX_EPISODES = 8000
    MAX_STEPS = 50
    mct = MCTS(model)
    episode_history = deque(maxlen=25)
    # results log; the original referenced an undefined `file` handle, so the
    # results file passed in via flname is used here (assumption).
    log_file = open(flname, 'w')
    for i in range(MAX_EPISODES):
        # initialize the episode
        action = np.zeros(2)
        _act = np.zeros(2)
        o = env.reset()
        obs = model._process_obs(o)
        init_q = model.initial_state
        a = np.zeros(2)
        act = model._process_act(a)
        state = mct.update_state(init_q, obs, act)
        total_rewards = 0
        for t in range(MAX_STEPS):
            # env.render()
            a = q_learner.choose_action(state)
            # split the 6-bit action index into two 3-bit codes (0..7 each)
            _act[0] = int('{:0>6b}'.format(a)[0:3], 2)
            _act[1] = int('{:0>6b}'.format(a)[3:6], 2)
            # map each code onto the continuous grid 1.4 - 0.2 * code
            for n in range(2):
                action[n] = 1.4 - 0.2 * _act[n]
            action = np.array([action[0], action[1]])
            next_obs, reward, done = env.step(action)
            if t == 48:
                # truncate the episode one step before MAX_STEPS
                done = True
            t_next_obs = mct.model._process_obs(next_obs)
            t_act = mct.model._process_act(action)
            total_rewards += reward
            next_state = mct.update_state(state, t_next_obs, t_act)
            q_learner.update_buffer(state, a, reward, next_state, done)
            # only start learning after the buffer has some experience in it
            if i > 50:
                q_learner.update_policy()
            state = next_state
            if done:
                break
        episode_history.append(total_rewards)
        mean_rewards = np.mean(episode_history)
        print("Episode {}".format(i))
        print("Finished after {} timesteps".format(t + 1))
        print("Reward for this episode: {}".format(total_rewards))
        print("Average reward for last {} episodes: {:.2f}".format(len(episode_history), mean_rewards))
        if mean_rewards >= best_mean_rewards:
            best_mean_rewards = mean_rewards
        if total_rewards >= best_rewards:
            best_rewards = total_rewards
        print(mean_rewards, file=log_file)
        print("best reward", best_rewards)
        print("best_mean_reward", best_mean_rewards)
    log_file.close()
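
# --- Illustrative sketch (not from the original file): standalone version of
# the action discretization used in the DQN loop above. A flat index in
# [0, 63] is split into two 3-bit codes, each mapped onto the continuous
# grid 1.4 - 0.2 * code, i.e. 8 evenly spaced values in [0.0, 1.4]. ---
def _decode_discrete_action(a):
    """Map a flat DQN action index (0..63) to a 2D continuous action."""
    bits = '{:0>6b}'.format(a)
    codes = np.array([int(bits[0:3], 2), int(bits[3:6], 2)], dtype=float)
    return 1.4 - 0.2 * codes

# Example: _decode_discrete_action(0) -> [1.4, 1.4],
#          _decode_discrete_action(63) -> [0.0, 0.0]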
def run_policy_continuous(args, flname):
    """
    Train a continuous RPSPnet from command line arguments.
    @param args: command line args
    @param flname: filename to store results
    @return: logger results to save
    """
    args.flname = flname
    env = load_environment(args)
    (x_dim, a_dim) = env.dimensions
    args.a_dim = a_dim
    args.x_dim = x_dim
    model_exp = ObservableModel(x_dim)
    pi_exp = policies.RandomGaussianPolicy(x_dim, rng=args.rng)
    baseline = args.b
    min_traj_length = getattr(args, 'mintrajlen', args.past + args.fut + 2)
    PiUpdater = None
    fkwargs = {'baseline': baseline, 'lr': args.lr, 'beta_reinf': args.wrwd,
               'beta_pred': args.wpred, 'beta_pred_decay': args.wdecay,
               'beta_only_reinf': args.wrwd_only, 'gamma': args.gamma,
               'grad_step': args.grad_step, 'trpo_step': args.trpo_step,
               'past': args.past, 'fut': args.fut, 'cg_opt': args.cg_opt,
               'max_traj_length': args.len, 'num_trajs': args.numtrajs,
               'normalize_grad': args.norm_g, 'hvec': args.hvec,
               'env': env, 'min_traj_len': min_traj_length}
    print('build updater ...', args.method)
    if args.method == 'obsVR':
        # observable model with reactive policy
        model, PiUpdater, pp = load_observable_policy(args, model_exp, **fkwargs)
    elif args.method == 'arVR':
        # finite-memory reactive policy
        model, PiUpdater, pp = load_finite_mem_policy(args, model_exp, **fkwargs)
    else:
        # PSR network with observable model or PSR model
        model, PiUpdater, pp = load_rpsp_policy(args, model_exp, **fkwargs)
    print('done building updater')
    print('len:', args.len, 'num trajs:', args.numtrajs, 'iter:', args.iter)

    def run_experiment():
        if args.loadfile != '':
            PiUpdater._load(args.params)
        elif args.load_reactive != '':
            re_params = load_params(args.load_reactive)
            try:
                PiUpdater._policy._policy._load(re_params)
            except AttributeError:
                pass
        learn_policy(PiUpdater, model, env, min_traj_length=0,
                     max_traj_len=args.len, num_trajs=args.numtrajs,
                     num_samples=args.numsamples, num_iter=args.iter,
                     logger=pp.logger)

    try:
        run_experiment()
    except AssertionError as exc:
        print('WARNING: Got AssertionError !')
        print('Message: %s' % exc)
        print('Stacktrace:')
        traceback.print_exc()
        return None

    pp._results['params'] = PiUpdater._save()
    if args.addobs or args.method == 'arVR':
        try:
            re_params = PiUpdater._policy._policy._save()
        except AttributeError:
            re_params = PiUpdater._policy._save()
        save_params(re_params, 're_pi_{}.pkl'.format(args.seed), args.tfile)
    env.close()
    return pp._results
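
# --- Illustrative usage sketch (not from the original file): one way the
# results dictionary returned above might be persisted. `build_argparser` is
# a hypothetical stand-in for the repository's actual command-line parser,
# and the output path is an assumption. ---
if __name__ == '__main__':
    import pickle
    cli_args = build_argparser().parse_args()  # hypothetical parser for the fields used above
    results = run_policy_continuous(cli_args, flname='results_{}'.format(cli_args.seed))
    if results is not None:
        with open('results_{}.pkl'.format(cli_args.seed), 'wb') as f:  # assumed output path
            pickle.dump(results, f)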