def obs_model(args, data=[], **kwargs):
    """
    Build an observable model and train an RNN filter on exploration data.
    @param args: command line arguments
    @param data: list [X_obs, X_act] of observation and action trajectories
    @return: observable model and trained RNN filter
    """
    X_obs, X_act = data
    x_dim = X_obs[0].shape[1]
    model = ObservableModel(obs_dim=x_dim)
    filter = rnn_filter.ObservableRNNFilter(model)
    filter.train(X_obs, X_act, on_unused_input='ignore')
    return model, filter
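# A minimal usage sketch for obs_model (illustrative only; it assumes X_obs and X_act
# are lists of per-trajectory observation/action arrays, e.g. collected by
# get_exploration_trajs as in load_rpsp_policy below, and that `args` carries the
# command line arguments). The helper name is hypothetical.
def _obs_model_usage_example(args, X_obs, X_act):
    model, filt = obs_model(args, data=[X_obs, X_act])
    state_dim = filt.state_dimension  # filter state size, used to size the reactive policy
    return model, filt, state_dim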
def mctsrun(self, env_exp, model, numsteps=100, render=False):
    rngs = []
    if model is None:
        model = ObservableModel(self.dimensions[0])
    print(env_exp)
    mct = MCTS(model)
    rngs.append(self.rng().get_state())
    # Reset the exploration environment and initialize the filter state
    o = env_exp.reset()
    obs = model._process_obs(o)
    print("obs", obs)
    init_q = model.initial_state
    a = np.zeros(2)
    act = model._process_act(a)
    q = mct.update_state(init_q, obs, act)
    o0 = np.copy(o)
    q0 = np.copy(q)
    undiscountedReturn = 0
    discountedReturn = 0
    discount = 1
    discountconstant = 0.9
    print(model)
    done = False
    while not done:
        if render:
            self.render()
        # MCTS picks a discrete action, mapped to a force along the first dimension
        a = mct.SelectAction(q)
        if a == 1:
            action = np.array([10, 0])
        else:
            action = np.array([-10, 0])
        o, r, done = self.step(action)
        action = np.array([0, a])
        print(o)
        #print(action)
        q = mct.update_state(q, o, action)
        undiscountedReturn += r
        discountedReturn += r * discount
        discount *= discountconstant
        print('return:', undiscountedReturn)
        if done:
            break
    print('discountedreturn:', discountedReturn)
    print('undiscountedreturn:', undiscountedReturn)
def load_rpsp_policy(args, model_exp, **kwargs):
    """
    Load an RPSP policy and policy updater
    @param args: command line arguments
    @param model_exp: observable model
    @param kwargs: policy updater keyword args
    @return: observable model, policy updater, and logger
    """
    model = ObservableModel(obs_dim=args.x_dim)
    X_obs, X_act = get_exploration_trajs(args, model_exp, kwargs.get('env'),
                                         args.a_dim, kwargs.get('min_traj_length'))
    tic = time()
    psr, filter = model_call(args, data=[X_obs, X_act], x_dim=args.x_dim)
    print('INIT RPSP without refinement takes:', time() - tic)
    state_dim = filter.state_dimension
    pi_react = get_policy[args.pi_exp](x_dim=state_dim, output_dim=args.a_dim,
                                       num_layers=args.nL, nh=args.nh,
                                       activation=args.nn_act, rng=args.rng,
                                       min_std=args.min_std)
    if isinstance(filter, rffpsr_rnn.RFFPSR_RNN):
        pi = psrlite_policy.RFFPSRNetworkPolicy(filter, pi_react, np.zeros((args.a_dim)))
    else:
        pi = psrlite_policy.PSRLitePolicy(filter, pi_react, np.zeros((args.a_dim)))
    pp = Log(args, args.flname, pred_model=filter)
    print('Building policy psr graph')
    tic = time()
    PiUpdater = policy_updater[args.vr][args.method](pi, **kwargs)
    print('took ', time() - tic)
    return model, PiUpdater, pp
def load_rpsp_policy(args, model_exp, **kwargs):
    """
    Load an RPSP policy and policy updater
    @param args: command line arguments
    @param model_exp: observable model
    @param kwargs: policy updater keyword args
    @return: trained PSR filter and logger
    """
    model = ObservableModel(obs_dim=args.x_dim)
    X_obs, X_act = get_exploration_trajs(args, model_exp, kwargs.get('env'),
                                         args.a_dim, kwargs.get('min_traj_length'))
    tic = time()
    psr, filter = model_call(args, data=[X_obs, X_act], x_dim=args.x_dim)
    print('INIT RPSP without refinement takes:', time() - tic)
    state_dim = filter.state_dimension
    pp = Log(args, args.flname, pred_model=filter)
    print('took ', time() - tic)
    return filter, pp
def run(self, model, policy, max_traj_length, min_traj_length=0,
        num_trajs=0, num_samples=0, render=False):
    '''
    Generate trajectories of length up to max_traj_length each.

    Returns: A list of trajectories (see models.Trajectory).

    Additional parameters:
    - model: An object that implements the models.FilteringModel interface.
        Used to track the state. If None, an ObservableModel is used,
        which returns the current observation.
    - policy: An object that implements the policies.Policy interface.
        Used to provide actions.
    - render: Whether to render generated trajectories in real-time.
        This calls the 'render' method, which needs to be implemented.
    - num_trajs: Number of trajectories to return.
    - num_samples: Total number of samples in generated trajectories.
    Must set num_trajs or num_samples (but not both) to a positive number.
    '''
    trajs = []
    rngs = []
    if model is None:
        model = ObservableModel(self.dimensions[0])
    if (num_samples > 0) == (num_trajs > 0):
        raise ValueError(
            'Must specify exactly one of num_trajs and num_samples')
    done_all = False
    d_o, d_a = self.dimensions
    i_sample = 0
    tic = time.time()
    best_traj = [0, [0.0]]

    while not done_all:
        obs = np.empty((max_traj_length, d_o))
        act = np.empty((max_traj_length, d_a))
        rwd = np.empty((max_traj_length, 1))
        vel = np.empty((max_traj_length, 1))
        act_probs = np.empty((max_traj_length, 1))
        env_states = []
        states = np.empty((max_traj_length, model.state_dimension))
        dbg_info = {}
        rngs.append(self.rng().get_state())

        # Make a reset for each trajectory
        policy.reset()
        o = self.reset()
        q = model.reset(o)
        o0 = np.copy(o)
        q0 = np.copy(q)
        env_states.append(self.env_state)
        forward_pos = self.env_state[0][0]

        for j in range(max_traj_length):
            if render:
                self.render()
            a, p, info = policy.sample_action(q)
            o, r, done = self.step(a)
            env_states.append(self.env_state)
            q = model.update_state(o, a)
            act[j, :] = a
            obs[j, :] = o
            rwd[j] = r
            states[j, :] = q
            act_probs[j, :] = p
            vel[j] = (self.env_state[0][0] - forward_pos) / float(self.dt)
            forward_pos = self.env_state[0][0]
            for (k, v) in info.items():
                if j == 0:
                    # Build arrays for diagnostic info
                    if type(v) is np.ndarray:
                        dbg_info[k] = np.empty((max_traj_length, v.size))
                    else:
                        dbg_info[k] = np.empty((max_traj_length, 1))
                dbg_info[k][j, :] = v  # act variance
            if done:
                break
        j += 1

        drop_traj = False
        if j >= min_traj_length:
            # Check if we need to truncate trajectory to maintain num_samples
            if num_samples > 0 and i_sample + j >= num_samples:
                j -= (i_sample + j - num_samples)
                done_all = True
        # TODO: remove? this will never happen because of the outer if
        if j < min_traj_length:
            # Last trajectory is too short. Ignore it.
            drop_traj = True
        if not drop_traj:
            i_sample += j
            new_traj = Trajectory(obs=obs[:j, :], states=states[:j, :],
                                  act=act[:j, :], rewards=rwd[:j, :],
                                  act_probs=act_probs[:j, :], obs0=o0,
                                  state0=q0, rng=rngs[-1], vel=vel[:j, :])
            for (k, v) in dbg_info.items():
                dbg_info[k] = v[:j, :]
            new_traj.dbg_info = dbg_info
            trajs.append(new_traj)
            if np.sum(rwd[:j, :]) >= np.sum(trajs[best_traj[0]].rewards):
                best_traj[0] = len(trajs) - 1
                best_traj[1] = env_states
            if num_trajs > 0 and len(trajs) == num_trajs:
                done_all = True

    print('Gathering trajectories took:', time.time() - tic)
    # add best trajectory
    trajs[best_traj[0]].env_states = best_traj[1]  # save env states for best trajectory
    trajs[-1].bib = best_traj[0]  # save best in batch on last trajectory
    return trajs
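# A minimal usage sketch for run() (illustrative only; it assumes `env` is the
# environment object exposing this method, `model`/`policy` follow the
# FilteringModel / Policy interfaces described in the docstring, and `args`
# carries the command line arguments used elsewhere). The helper name is hypothetical.
def _run_usage_example(env, model, policy, args):
    # Collect a fixed number of trajectories of bounded length.
    trajs = env.run(model, policy, max_traj_length=args.len,
                    num_trajs=args.numtrajs)
    total_reward = sum(np.sum(t.rewards) for t in trajs)
    return trajs, total_reward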
def run_policy_continuous(args, flname):
    """
    Train a continuous RPSPnet from commandline arguments
    @param args: command line args
    @param flname: filename to store results
    @return: logger results to save
    """
    args.flname = flname
    env = load_environment(args)
    (x_dim, a_dim) = env.dimensions
    args.a_dim = a_dim
    args.x_dim = x_dim
    print('x_dim:', x_dim)
    model_exp = ObservableModel(x_dim)
    pi_exp = policies.RandomGaussianPolicy(x_dim, rng=args.rng)
    baseline = args.b
    min_traj_length = getattr(args, 'mintrajlen', args.past + args.fut + 2)
    PiUpdater = None
    fkwargs = {'baseline': baseline, 'lr': args.lr, 'beta_reinf': args.wrwd,
               'beta_pred': args.wpred, 'beta_pred_decay': args.wdecay,
               'beta_only_reinf': args.wrwd_only, 'gamma': args.gamma,
               'grad_step': args.grad_step, 'trpo_step': args.trpo_step,
               'past': args.past, 'fut': args.fut, 'cg_opt': args.cg_opt,
               'max_traj_length': args.len, 'num_trajs': args.numtrajs,
               'normalize_grad': args.norm_g, 'hvec': args.hvec,
               'env': env, 'min_traj_len': min_traj_length}
    print('build updater ... ', args.method)
    # run the observable model with reactive policy
    if args.method == 'obsVR':
        model, PiUpdater, pp = load_observable_policy(args, model_exp, **fkwargs)
    elif args.method == 'arVR':
        model, PiUpdater, pp = load_finite_mem_policy(args, model_exp, **fkwargs)
    else:
        # run the psr network with obs model or psr model
        model, pp = load_rpsp_policy(args, model_exp, **fkwargs)
    print('done building updater')
    print('len:', args.len, 'num trajs:', args.numtrajs, 'iter:', args.iter)

    # DQN agent over the filter state: 64 discrete actions (two 3-bit components)
    state_shape = (1, model._state_dim)
    num_actions = 64
    batch_size = 8
    q_learner = dqn.Agent(state_shape, num_actions, batch_size=batch_size)
    best_mean_rewards = -100
    best_rewards = -100
    MAX_EPISODES = 8000
    MAX_STEPS = 50
    mct = MCTS(model)
    episode_history = deque(maxlen=25)
    for i in range(MAX_EPISODES):
        # initialize the episode: reset the environment and the filter state
        action = np.zeros(2)
        _act = np.zeros(2)
        o = env.reset()
        obs = model._process_obs(o)
        init_q = model.initial_state
        a = np.zeros(2)
        act = model._process_act(a)
        state = mct.update_state(init_q, obs, act)
        total_rewards = 0
        for t in range(MAX_STEPS):
            #env.render()
            a = q_learner.choose_action(state)
            # Split the 6-bit action index into two 3-bit components,
            # each mapped to a continuous value in {1.4, 1.2, ..., 0.0}
            _act[0] = int('{:0>6b}'.format(a)[0:3], 2)
            _act[1] = int('{:0>6b}'.format(a)[3:6], 2)
            for n in range(2):
                action[n] = 1.4 - 0.2 * _act[n]
            action = np.array([action[0], action[1]])
            next_obs, reward, done = env.step(action)
            if t == 48:
                done = True
            t_next_obs = mct.model._process_obs(next_obs)
            t_act = mct.model._process_act(action)
            total_rewards += reward
            next_state = mct.update_state(state, t_next_obs, t_act)
            q_learner.update_buffer(state, a, reward, next_state, done)
            # Only start learning after buffer has some experience in it
            if i > 50:
                q_learner.update_policy()
            state = next_state
            if done:
                break
        episode_history.append(total_rewards)
        mean_rewards = np.mean(episode_history)
        print("Episode {}".format(i))
        print("Finished after {} timesteps".format(t + 1))
        print("Reward for this episode: {}".format(total_rewards))
        print("Average reward for last {} episodes: {:.2f}".format(len(episode_history), mean_rewards))
        if mean_rewards >= best_mean_rewards:
            best_mean_rewards = mean_rewards
        if total_rewards >= best_rewards:
            best_rewards = total_rewards
        print(mean_rewards, file=file)  # NOTE: assumes an open log handle `file` defined at module scope
        print("best reward", best_rewards)
        print("best_mean_reward", best_mean_rewards)
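# Worked example of the discrete-to-continuous action decoding used in the DQN loop
# above (illustrative only; the constants 1.4 and 0.2 come from the loop body, and
# the helper name is hypothetical):
#   a = 37 -> '100101' -> high bits '100' = 4, low bits '101' = 5
#   action = [1.4 - 0.2*4, 1.4 - 0.2*5] ~= [0.6, 0.4]
def _decode_action_example(a):
    bits = '{:0>6b}'.format(a)
    hi, lo = int(bits[0:3], 2), int(bits[3:6], 2)
    return np.array([1.4 - 0.2 * hi, 1.4 - 0.2 * lo])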
def run_policy_continuous(args, flname):
    """
    Train a continuous RPSPnet from commandline arguments
    @param args: command line args
    @param flname: filename to store results
    @return: logger results to save
    """
    args.flname = flname
    env = load_environment(args)
    (x_dim, a_dim) = env.dimensions
    args.a_dim = a_dim
    args.x_dim = x_dim
    model_exp = ObservableModel(x_dim)
    pi_exp = policies.RandomGaussianPolicy(x_dim, rng=args.rng)
    baseline = args.b
    min_traj_length = getattr(args, 'mintrajlen', args.past + args.fut + 2)
    PiUpdater = None
    fkwargs = {'baseline': baseline, 'lr': args.lr, 'beta_reinf': args.wrwd,
               'beta_pred': args.wpred, 'beta_pred_decay': args.wdecay,
               'beta_only_reinf': args.wrwd_only, 'gamma': args.gamma,
               'grad_step': args.grad_step, 'trpo_step': args.trpo_step,
               'past': args.past, 'fut': args.fut, 'cg_opt': args.cg_opt,
               'max_traj_length': args.len, 'num_trajs': args.numtrajs,
               'normalize_grad': args.norm_g, 'hvec': args.hvec,
               'env': env, 'min_traj_len': min_traj_length}
    print('build updater ... ', args.method)
    # run the observable model with reactive policy
    if args.method == 'obsVR':
        model, PiUpdater, pp = load_observable_policy(args, model_exp, **fkwargs)
    elif args.method == 'arVR':
        model, PiUpdater, pp = load_finite_mem_policy(args, model_exp, **fkwargs)
    else:
        # run the psr network with obs model or psr model
        model, PiUpdater, pp = load_rpsp_policy(args, model_exp, **fkwargs)
    print('done building updater')
    print('len:', args.len, 'num trajs:', args.numtrajs, 'iter:', args.iter)

    def run_experiment():
        # Optionally warm-start from saved parameters before training
        if args.loadfile != '':
            PiUpdater._load(args.params)
        elif args.load_reactive != '':
            re_params = load_params(args.load_reactive)
            try:
                PiUpdater._policy._policy._load(re_params)
            except AttributeError:
                pass
        learn_policy(PiUpdater, model, env, min_traj_length=0,
                     max_traj_len=args.len, num_trajs=args.numtrajs,
                     num_samples=args.numsamples, num_iter=args.iter,
                     logger=pp.logger)

    try:
        run_experiment()
    except AssertionError as exc:
        print('WARNING: Got AssertionError !')
        print('Message: %s' % exc)
        print('Stacktrace:')
        traceback.print_exc()
        return None

    pp._results['params'] = PiUpdater._save()
    if args.addobs or args.method == 'arVR':
        try:
            re_params = PiUpdater._policy._policy._save()
        except AttributeError:
            re_params = PiUpdater._policy._save()
        save_params(re_params, 're_pi_{}.pkl'.format(args.seed), args.tfile)
    env.close()
    return pp._results