Example #1
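# Assumed imports for the examples below. Project-specific helpers such as
# policies, ObservableModel, FiniteHistoryModel, load_environment, load_params,
# save_params, get_policy, load_observable_policy, load_finite_mem_policy,
# load_rpsp_policy, learn_policy, dqn and MCTS are expected to be provided by
# the surrounding codebase and are not reproduced here.
import traceback
from collections import deque

import numpy as np
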
def get_exploration_trajs(args, model_exp, env, output_dim, min_traj_length):
    """
    Get exploration data for initialization
    @param args: command line arguments
    @param model_exp: exploration model
    @param env: environment
    @param output_dim: action dimension
    @param min_traj_length: minimum trajectory length
    @return: list of observation matrices (d_obs x L) and list of action matrices (d_act x L)
    """
    X_obs_good = []
    X_act_good = []
    pi_exp = policies.RandomGaussianPolicy(output_dim, rng=args.rng)
    if args.load_reactive != '':  # load a previously trained reactive policy
        re_params = load_params(args.load_reactive)
        if len(re_params) > 0:
            x_dim = env.dimensions[0]
            model_exp = FiniteHistoryModel(obs_dim=x_dim,
                                           past_window=args.filter_w)
            state_dim = model_exp.state_dimension
            re_params['layer_id_0_W'] = re_params['layer_id_0_W'][:,
                                                                  -state_dim:]
            pi_exp = get_policy[args.init_policy](x_dim=state_dim,
                                                  output_dim=output_dim,
                                                  num_layers=args.nL,
                                                  nh=args.nh,
                                                  activation=args.nn_act,
                                                  rng=args.rng,
                                                  min_std=args.min_std)
            pi_exp._load(re_params)

    leni = args.len if args.leni is None else args.leni
    exp_trajs = env.run(model_exp,
                        pi_exp,
                        leni,
                        render=False,
                        min_traj_length=min_traj_length,
                        num_trajs=args.initN,
                        num_samples=args.initS)
    #print ('Using %d exp trajectories.' % len(exp_trajs))
    col_trajs = [(t.obs, t.act) for t in exp_trajs]
    X_obs_rnd = [c[0] for c in col_trajs]
    X_act_rnd = [c[1] for c in col_trajs]
    X_obs = X_obs_rnd + X_obs_good
    X_act = X_act_rnd + X_act_good
    return X_obs, X_act
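
# A minimal sketch of the weight-truncation trick used above when reloading a
# reactive policy on top of a FiniteHistoryModel: the first-layer weight matrix
# keeps only its trailing `state_dim` input columns so that it matches the new
# (smaller) state dimension. The helper name and shapes are illustrative only.
def _truncate_first_layer(params, state_dim):
    """Return a copy of `params` whose 'layer_id_0_W' entry keeps only the
    last `state_dim` input columns."""
    trimmed = dict(params)
    trimmed['layer_id_0_W'] = trimmed['layer_id_0_W'][:, -state_dim:]
    return trimmed

# e.g. a (nh x 30) first-layer matrix trimmed down to (nh x 12):
#   _truncate_first_layer({'layer_id_0_W': np.zeros((16, 30))}, 12)
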
def run_policy_continuous(args, flname):
    """
    Train a continuous RPSPnet from commandline arguments
    @param args: command line args
    @param flname: filename to store results
    @return: logger results to save
    """
    args.flname = flname
    env = load_environment(args)
    (x_dim, a_dim) = env.dimensions
    args.a_dim = a_dim
    args.x_dim = x_dim
    print('obs dim:', x_dim, 'act dim:', a_dim)
    model_exp = ObservableModel(x_dim)
    pi_exp = policies.RandomGaussianPolicy(x_dim, rng=args.rng)
    baseline = args.b
    min_traj_length = getattr(args, 'mintrajlen', args.past + args.fut + 2)
    PiUpdater = None
    fkwargs = {'baseline': baseline, 'lr': args.lr, 'beta_reinf': args.wrwd,
               'beta_pred': args.wpred, 'beta_pred_decay': args.wdecay,
               'beta_only_reinf': args.wrwd_only, 'gamma': args.gamma,
               'grad_step': args.grad_step, 'trpo_step': args.trpo_step,
               'past': args.past, 'fut': args.fut, 'cg_opt': args.cg_opt,
               'max_traj_length': args.len, 'num_trajs': args.numtrajs,
               'normalize_grad': args.norm_g, 'hvec': args.hvec,
               'env': env, 'min_traj_len': min_traj_length}
    print('build updater ...', args.method)

    #run the observable model with reactive policy
    if args.method == 'obsVR':
        model, PiUpdater, pp = load_observable_policy(args, model_exp, **fkwargs)
    elif args.method == 'arVR':
        model, PiUpdater, pp = load_finite_mem_policy(args, model_exp, **fkwargs)
    else:
        #run the psr network with obs model or psr model
        model, PiUpdater, pp = load_rpsp_policy(args, model_exp, **fkwargs)
    print('done building updater')
    print('len:', args.len, 'num trajs:', args.numtrajs, 'iter:', args.iter)
    state_shape = (1, model._state_dim)
    num_actions = 64
    batch_size = 8
    q_learner = dqn.Agent(state_shape, num_actions, batch_size=batch_size)
    best_mean_rewards = -100
    best_rewards = -100
    MAX_EPISODES = 8000
    MAX_STEPS = 50
    mct = MCTS(model)
    episode_history = deque(maxlen=25)
    # per-episode mean rewards are written to the results file (flname)
    log_file = open(flname, 'w')
    for i in range(MAX_EPISODES):

        # initialize
        action = np.zeros(2)
        _act = np.zeros(2)
        o = env.reset()
        obs = model._process_obs(o)
        init_q = model.initial_state
        a = np.zeros(2)
        act = model._process_act(a)
        state = mct.update_state(init_q, obs, act)
        total_rewards = 0

        for t in range(MAX_STEPS):
            #env.render()
            a = q_learner.choose_action(state)
            # decode the discrete action index (0..63) into two 3-bit fields,
            # each mapped to a torque level in {0.0, 0.2, ..., 1.4}
            bits = '{:06b}'.format(a)
            _act[0] = int(bits[0:3], 2)
            _act[1] = int(bits[3:6], 2)
            action = 1.4 - 0.2 * _act
            next_obs, reward, done = env.step(action)
            if t == 48:  # force termination near the step limit
                done = True
            t_next_obs = mct.model._process_obs(next_obs)
            t_act = mct.model._process_act(action)
            total_rewards += reward
            next_state = mct.update_state(state, t_next_obs, t_act)
            q_learner.update_buffer(state, a, reward, next_state, done)

            # only start learning after the replay buffer has some experience in it
            if i > 50:
                q_learner.update_policy()

            state = next_state
            if done:
                break
        episode_history.append(total_rewards)
        mean_rewards = np.mean(episode_history)
        print("Episode {}".format(i))
        print("Finished after {} timesteps".format(t+1))
        print("Reward for this episode: {}".format(total_rewards))
        print("Average reward for last 100 episodes: {:.2f}".format(mean_rewards))

        if mean_rewards >= best_mean_rewards:
            best_mean_rewards = mean_rewards
        if total_rewards >= best_rewards:
            best_rewards = total_rewards
        print(mean_rewards, file=log_file)
    log_file.close()
    print("best reward:", best_rewards)
    print("best mean reward:", best_mean_rewards)
Example #3
def run_policy_continuous(args, flname):
    """
    Train a continuous RPSPnet from command-line arguments
    @param args: command line args
    @param flname: filename to store results
    @return: logger results to save
    """
    args.flname = flname
    env = load_environment(args)
    (x_dim, a_dim) = env.dimensions
    args.a_dim = a_dim
    args.x_dim = x_dim
    model_exp = ObservableModel(x_dim)
    pi_exp = policies.RandomGaussianPolicy(x_dim, rng=args.rng)
    baseline = args.b
    min_traj_length = getattr(args, 'mintrajlen', args.past + args.fut + 2)
    PiUpdater = None
    fkwargs = {
        'baseline': baseline,
        'lr': args.lr,
        'beta_reinf': args.wrwd,
        'beta_pred': args.wpred,
        'beta_pred_decay': args.wdecay,
        'beta_only_reinf': args.wrwd_only,
        'gamma': args.gamma,
        'grad_step': args.grad_step,
        'trpo_step': args.trpo_step,
        'past': args.past,
        'fut': args.fut,
        'cg_opt': args.cg_opt,
        'max_traj_length': args.len,
        'num_trajs': args.numtrajs,
        'normalize_grad': args.norm_g,
        'hvec': args.hvec,
        'env': env,
        'min_traj_len': min_traj_length
    }
    print('build updater ... ', args.method)

    #run the observable model with reactive policy
    if args.method == 'obsVR':
        model, PiUpdater, pp = load_observable_policy(args, model_exp,
                                                      **fkwargs)
    elif args.method == 'arVR':
        model, PiUpdater, pp = load_finite_mem_policy(args, model_exp,
                                                      **fkwargs)
    else:
        #run the psr network with obs model or psr model
        model, PiUpdater, pp = load_rpsp_policy(args, model_exp, **fkwargs)
    print('done building updater')
    print('len:', args.len, 'num trajs:', args.numtrajs, 'iter:', args.iter)

    def run_experiment():
        if args.loadfile != '':
            PiUpdater._load(args.params)
        elif args.load_reactive != '':
            re_params = load_params(args.load_reactive)
            try:
                PiUpdater._policy._policy._load(re_params)
            except AttributeError:
                pass

        learn_policy(PiUpdater,
                     model,
                     env,
                     min_traj_length=0,
                     max_traj_len=args.len,
                     num_trajs=args.numtrajs,
                     num_samples=args.numsamples,
                     num_iter=args.iter,
                     logger=pp.logger)

    try:
        run_experiment()
    except AssertionError as exc:
        print('WARNING: Got AssertionError!')
        print('Message: %s' % exc)
        print('Stacktrace:')
        traceback.print_exc()
        return None
    pp._results['params'] = PiUpdater._save()
    if args.addobs or args.method == 'arVR':
        try:
            re_params = PiUpdater._policy._policy._save()
        except AttributeError:
            re_params = PiUpdater._policy._save()
        save_params(re_params, 're_pi_{}.pkl'.format(args.seed), args.tfile)
    env.close()
    return pp._results
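
# Usage sketch (hypothetical glue code; it assumes save_params() writes
# 're_pi_<seed>.pkl' under the directory args.tfile, as the call above suggests):
# a finished run saves the reactive-policy weights, and a later run can
# warm-start exploration from them via get_exploration_trajs() from Example #1.
import os

def _warm_start_exploration(args, model_exp, env, min_traj_length):
    """Point args.load_reactive at the previously saved reactive-policy params
    and collect exploration trajectories with the reloaded policy."""
    args.load_reactive = os.path.join(args.tfile,
                                      're_pi_{}.pkl'.format(args.seed))
    return get_exploration_trajs(args, model_exp, env,
                                 output_dim=args.a_dim,
                                 min_traj_length=min_traj_length)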