Example #1
def load_and_run_model(env, name):
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    print('Running {:s}'.format(name))
    rollout(env, pilco, timesteps=T_sim, verbose=False, SUBS=SUBS)
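
Most of the PILCO examples on this page assume a `rollout` helper from the project's `utils` module. Its exact signature varies between snippets (some versions return `(X, Y)`, others `(X, Y, rewards, ...)`), but a minimal sketch of the common pattern is given below; the name `rollout_sketch`, the `pilco.compute_action` call, and the return layout are assumptions for illustration rather than any project's actual code.

import numpy as np

def rollout_sketch(env, pilco, timesteps, random=False, SUBS=1, render=False, verbose=False):
    # Collect training data in the PILCO convention:
    #   X[t] = [x_t, u_t]   (state concatenated with the applied action)
    #   Y[t] = x_{t+1} - x_t  (observed state difference)
    X, Y, ep_return = [], [], 0.0
    x = env.reset()
    for t in range(timesteps):
        if random:
            u = env.action_space.sample()
        else:
            u = pilco.compute_action(x[None, :])[0, :]   # assumed controller interface
        for _ in range(SUBS):                            # apply each action SUBS times (frame skip)
            x_new, r, done, _ = env.step(u)
            ep_return += r
            if render:
                env.render()
            if done:
                break
        if verbose:
            print("step", t, "state", x_new, "reward", r)
        X.append(np.hstack((x, u)))
        Y.append(x_new - x)
        x = x_new
        if done:
            break
    return np.stack(X), np.stack(Y), ep_return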
Example #2
    def work_acer(self):
        b_states=[None]
        done = True
        step = 0
        print(self.name, " using ", self.offline_steps, "offline steps, per online step")

        while step < self.MAX_STEPS:
            """
            """
            self.agent.update_target()
            # n-step rollout from the environment, with n = RETURN_STEPS or until done.
            b_states, b_actions, b_rewards, b_mus, done = rollout(self.agent, self.env, [b_states[-1]], done, self.RETURN_STEPS)
            pi, q_a, val = self.agent.get_retrace_values(b_states[:-1], b_actions)

            importance_weights = np.divide(pi, np.add(b_mus, 1e-14))
            importance_weights_a = np.take(np.reshape(importance_weights, [-1]), (
                    np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + b_actions))
            # Calculate Retrace targets.
            retrace_targets = q_retrace(b_rewards, done, q_a, val, importance_weights_a, self.DISCOUNT)
            # Update step; returns the current global step and a summary (not used here).
            _, step = self.agent.update_step(b_states[:-1], b_actions, retrace_targets, importance_weights)
            # Append the trajectory to the replay buffer.
            self.memory.remember((b_states, b_actions, b_rewards, b_mus, done))
            # Off-policy (offline) updates: instead of a fresh rollout, trajectories are sampled from memory.
            if self.offline_steps>0 and self.memory.can_sample():
                for _ in range(self.offline_steps):
                    mem_states, mem_actions, mem_rewards, mem_mus, mem_done = self.memory.sample_from_memory()
                    pi, q_a, val = self.agent.get_retrace_values(mem_states[:-1], mem_actions)

                    importance_weights = np.divide(pi, np.add(mem_mus, 1e-14))
                    importance_weights_a = np.take(np.reshape(importance_weights, [-1]), (
                            np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + mem_actions))
                    retrace_targets = q_retrace(mem_rewards, mem_done, q_a, val, importance_weights_a, self.DISCOUNT)
                    _, step = self.agent.update_step(mem_states[:-1], mem_actions, retrace_targets, importance_weights)
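
The Retrace targets produced by `q_retrace` above are not shown in this snippet. For orientation, a hedged numpy sketch of the usual backward recursion follows; the argument layout (an explicit `bootstrap_value`, a single terminal flag) is an assumption and need not match this project's `q_retrace(b_rewards, done, q_a, val, importance_weights_a, DISCOUNT)` exactly.

import numpy as np

def q_retrace_sketch(rewards, q_a, values, rho_a, bootstrap_value, done, gamma):
    # Backward Retrace recursion over an n-step segment:
    #   Q_ret_t = r_t + gamma * [ c_{t+1} * (Q_ret_{t+1} - Q(s_{t+1}, a_{t+1})) + V(s_{t+1}) ]
    # with truncated importance weights c_t = min(1, rho_t).
    c = np.minimum(1.0, np.asarray(rho_a, dtype=np.float64))
    n = len(rewards)
    targets = np.zeros(n)
    q_ret = 0.0 if done else bootstrap_value          # no bootstrap past a terminal state
    for t in reversed(range(n)):
        q_ret = rewards[t] + gamma * q_ret
        targets[t] = q_ret
        q_ret = c[t] * (q_ret - q_a[t]) + values[t]   # correction carried to step t-1
    return targets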
Example #3
    def eval(self):
        """Deterministically evaluate the Gaussian policy.

        Returns:
            float: Mean accumulated return over the evaluation rollouts.

        """
        # Put models in evaluation mode
        for model in self.trainable_models:
            model.eval()

        for rr in range(self.eval_rollouts):
            rollout_info = rollout(
                self.env,
                self.policy,
                max_horizon=self.max_horizon,
                fixed_horizon=self.fixed_horizon,
                render=self.render,
                return_info_dict=True,
                device=self.torch_device,
                deterministic=True,
            )

            self.logging_eval_rewards[rr] = torch.tensor(
                rollout_info['reward']).mean()
            self.logging_eval_returns[rr] = torch.tensor(
                rollout_info['reward']).sum()

            self.num_eval_interactions += 1

        gt.stamp('eval')

        return self.logging_eval_returns.mean().item()
Example #4
def run():
    env = gym.make('CorridorSmall-v10')
    action_space = list(range(env.action_space.n))

    q = Approximator_ResidualBoosting(action_space)
    initial_learning_rate = 0.15
    learning_rate = initial_learning_rate
    initial_epsilon = 0.15
    epsilon = initial_epsilon
    batch_size = 10

    for learning_iteration in range(1000):
        policy = Policy_EpsilonGreedy(q, epsilon)
        episodes = [rollout(policy, env) for _ in range(batch_size)]
        targets = TD0_targets(episodes, q)
        X, Y_target = zip(*targets)
        Y_target = np.reshape(Y_target, (-1, 1))

        learning_rate = decay(initial_learning_rate, learning_iteration)
        epsilon = decay(initial_epsilon, learning_iteration)
        q.learn(learning_rate, X, Y_target)

        if learning_iteration % 1 == 0:
            greedy_policy = Policy_Greedy(q)
            reward_sum = avg(
                test_policy(greedy_policy, env) for _ in range(10))
            print(
                f"Episode {learning_iteration*batch_size} Reward {reward_sum} lr {learning_rate} epsilon {epsilon}"
            )
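
`TD0_targets` is not shown in this snippet; as a point of reference, here is a hedged sketch of the standard TD(0) / Q-learning target construction. The transition-tuple format, the callable `q(state, action)` interface, and the `q.action_space` attribute are hypothetical and only illustrate the formula target = r + discount * max_a' Q(s', a').

def td0_targets_sketch(episodes, q, discount=0.95):
    # Hypothetical episode format: a list of (state, action, reward, next_state, terminal)
    # tuples, with q(state, action) returning a scalar value estimate.
    for episode in episodes:
        for state, action, reward, next_state, terminal in episode:
            if terminal:
                target = reward                      # no bootstrap at episode end
            else:
                target = reward + discount * max(
                    q(next_state, a) for a in q.action_space)
            yield (state, action), target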
Example #5
def rollout_plans(env: LegacyEnv, plans: np.ndarray, states: np.ndarray):
    returns = np.empty((plans.shape[0], plans.shape[1]))
    assert len(returns.shape) == 2

    assert len(plans.shape) == 4
    for i in range(plans.shape[0]):
        for j in range(plans.shape[1]):
            returns[i, j] = rollout(plans[i, j], env, states[j])
    return returns
Example #6
    def work_and_eval_acer(self, net_saver, TB_DIR, evalrewards=[]):
        b_states = [None]
        done = True
        step = 0
        runningreward = 1
        bestreward = 0
        rewardlist=[]
        if evalrewards !=[]:
            runningreward = evalrewards[-1]
            print(runningreward)
        next_verbose = 0
        summary_writer = tf.summary.FileWriter(TB_DIR + "/tb", self.sess.graph, flush_secs=30)
        print(self.name, " using ", self.offline_steps, "offline steps, per online step")

        while step < self.MAX_STEPS:
            self.agent.update_target()
            b_states, b_actions, b_rewards, b_mus, done = rollout(self.agent, self.env, [b_states[-1]], done,
                                                                  self.RETURN_STEPS)
            pi, q_a, val = self.agent.get_retrace_values(b_states[:-1], b_actions)
            rewardlist.append(np.sum(b_rewards))
            importance_weights = np.divide(pi, np.add(b_mus, 1e-14))
            importance_weights_a = np.take(np.reshape(importance_weights, [-1]), (
                    np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + b_actions))
            retrace_targets = q_retrace(b_rewards, done, q_a, val, importance_weights_a, self.DISCOUNT)
            summary, step = self.agent.update_step(b_states[:-1], b_actions, retrace_targets, importance_weights)
            self.memory.remember((b_states, b_actions, b_rewards, b_mus, done))
            if done:
                bestreward = np.maximum(bestreward,np.sum(rewardlist))
                runningreward = 0.9*runningreward+0.1*np.sum(rewardlist)
                evalrewards.append(runningreward)
                np.savetxt(TB_DIR + "reward.out",evalrewards)
                rewardlist=[]
                if step > next_verbose:
                    print("Worker ", self.name, "At ", step, " Running/Max: ", runningreward, bestreward, " Frames:", self.memory.counter)
                    print("pi:", self.agent.get_pi(b_states[-1]))
                    print("Saving Model")
                    next_verbose +=(self.MAX_STEPS/100)
                    net_saver.save(self.sess, TB_DIR + "checkpoints/model" + str(step) + ".cptk")
                if summary is not None:
                    summary_writer.add_summary(summary, step)

            if self.offline_steps>0 and self.memory.can_sample():
                for _ in range(self.offline_steps):

                    mem_states, mem_actions, mem_rewards, mem_mus, mem_done = self.memory.sample_from_memory()
                    pi, q_a, val = self.agent.get_retrace_values(mem_states[:-1], mem_actions)

                    importance_weights = np.divide(pi, np.add(mem_mus, 1e-14))
                    importance_weights_a = np.take(np.reshape(importance_weights, [-1]), (
                            np.arange(importance_weights.shape[0]) * importance_weights.shape[1] + mem_actions))
                    retrace_targets = q_retrace(mem_rewards, mem_done, q_a, val, importance_weights_a, self.DISCOUNT)
                    summary, step = self.agent.update_step(mem_states[:-1], mem_actions, retrace_targets, importance_weights)
Example #7
def run(env, config):
    action_space = list(range(env.action_space.n))
    replay_buffer = Replay_buffer()

    q = Approximator_ResidualBoosting(action_space)
    learning_rate = config.initial_learning_rate
    epsilon = config.initial_epsilon
    interaction_count = 0

    for learning_iteration in range(config.learning_iterations):
        if learning_iteration % 1 == 0:
            greedy_policy = Policy_Greedy(q)

            reward_sum = avg(
                test_rollout(greedy_policy, env)
                for _ in range(config.test_rollouts))
            print(
                f"Episode {learning_iteration*config.rollout_batch_size:05d} Reward {reward_sum:05f} lr {learning_rate:05f} epsilon {epsilon:05f}"
            )
            yield interaction_count, reward_sum

        policy = Policy_EpsilonGreedy(q, epsilon=epsilon)
        episodes = [
            list(rollout(policy, env))
            for _ in range(config.rollout_batch_size)
        ]
        interaction_count += sum(map(len, episodes))
        replay_buffer += episodes
        sampled_episodes = replay_buffer.sample(config.replay_batch_size)

        targets = TD0_targets(sampled_episodes, q, config.discount)
        X, Y_target = zip(*targets)
        Y_target = np.reshape(Y_target, (-1, 1))

        learning_rate = decay(config.initial_learning_rate,
                              learning_iteration * config.rollout_batch_size)
        epsilon = decay(config.initial_epsilon,
                        learning_iteration * config.rollout_batch_size)
        q.learn(learning_rate, X, Y_target)
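
The `Replay_buffer` used above only needs to support `+=` with a list of episodes and `.sample(n)`. A minimal sketch with that interface might look like the following; the class name, the `capacity` default, and the eviction policy are assumptions, not the project's actual implementation.

import random

class ReplayBufferSketch:
    def __init__(self, capacity=10000):
        self.capacity = capacity
        self.episodes = []

    def __iadd__(self, new_episodes):
        # Append whole episodes, keeping only the most recent `capacity` of them.
        self.episodes.extend(new_episodes)
        self.episodes = self.episodes[-self.capacity:]
        return self

    def sample(self, batch_size):
        # Uniformly sample episodes without replacement (fewer if the buffer is small).
        return random.sample(self.episodes, min(batch_size, len(self.episodes)))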
Example #8
    parser.add_argument('--max_length', type=int, default=1000,
                        help='Max length of rollout')
    parser.add_argument('--speedup', type=int, default=1,
                        help='Speedup')
    parser.add_argument('--loop', type=int, default=1,
                        help='# of loops')
    args = parser.parse_args()

    policy = None
    env = None
    while True:
        if ':' in args.file:
            # fetch file using ssh
            os.system("rsync -avrz %s /tmp/%s.pkl" % (args.file, filename))
            data = joblib.load("/tmp/%s.pkl" % filename)
            if policy:
                new_policy = data['policy']
                policy.set_param_values(new_policy.get_param_values())
                path = rollout(env, policy, max_path_length=args.max_length,
                               animated=True, speedup=args.speedup)
            else:
                policy = data['policy']
                env = data['env']
                path = rollout(env, policy, max_path_length=args.max_length,
                               animated=True, speedup=args.speedup)
        else:
            data = joblib.load(args.file)
            policy = data['policy']
            env = data['env']
            path = rollout(env, policy)
        break
Example #9
def safe_swimmer_run(seed=0, logging=False):
    env = SwimmerWrapper()
    state_dim = 9
    control_dim = 2
    SUBS = 2
    maxiter = 60
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim),
                        (1, state_dim))  # initial state mean
    S_init = 0.05 * np.eye(state_dim)
    J = 1
    N = 12
    T = 25
    bf = 30
    T_sim = 100

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[0] = 0.5
    max_ang = (100 / 180 * np.pi) * 0.95
    R1 = LinearReward(state_dim, weights_l)

    C1 = SingleConstraint(1, low=-max_ang, high=max_ang, inside=False)
    C2 = SingleConstraint(2, low=-max_ang, high=max_ang, inside=False)
    C3 = SingleConstraint(3, low=-max_ang, high=max_ang, inside=False)
    R = CombinedRewards(state_dim, [R1, C1, C2, C3],
                        coefs=[1.0, -10.0, -10.0, -10.0])

    th = 0.2
    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env,
                         None,
                         timesteps=T,
                         random=True,
                         SUBS=SUBS,
                         verbose=True)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env,
                               None,
                               timesteps=T,
                               random=True,
                               SUBS=SUBS,
                               verbose=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)

    pilco = PILCO((X, Y),
                  controller=controller,
                  horizon=T,
                  reward=R,
                  m_init=m_init,
                  S_init=S_init)
    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    new_data = True
    eval_runs = T_sim
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=1, restarts=2)

        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risk1 = np.zeros(T)
        predicted_risk2 = np.zeros(T)
        predicted_risk3 = np.zeros(T)
        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risk1[h], _ = C1.compute_reward(m_h, S_h)
            predicted_risk2[h], _ = C2.compute_reward(m_h, S_h)
            predicted_risk3[h], _ = C3.compute_reward(m_h, S_h)
        estimate_risk1 = 1 - np.prod(1.0 - predicted_risk1)
        estimate_risk2 = 1 - np.prod(1.0 - predicted_risk2)
        estimate_risk3 = 1 - np.prod(1.0 - predicted_risk3)
        overall_risk = 1 - (1 - estimate_risk1) * (1 - estimate_risk2) * (
            1 - estimate_risk3)
        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env,
                                         pilco,
                                         timesteps=T_sim,
                                         verbose=True,
                                         SUBS=SUBS)
            new_data = True
            # Update dataset
            X = np.vstack((X, X_new[:T, :]))
            Y = np.vstack((Y, Y_new[:T, :]))
            pilco.mgpr.set_data((X, Y))
            if estimate_risk1 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 0.75, 1.0, 1.0])
            if estimate_risk2 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 0.75, 1.0])
            if estimate_risk3 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 0.75])
        else:
            print("*********CHANGING***********")
            if estimate_risk1 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.5, 1.0, 1.0])
            if estimate_risk2 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.5, 1.0])
            if estimate_risk3 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 1.5])
            _, _, r = pilco.predict(m_init, S_init, T)
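
The `estimate_risk*` and `overall_risk` quantities above treat each per-step violation probability as (approximately) independent, so the episode-level risk is one minus the product of the per-step survival probabilities; the same rule combines the three constraints into `overall_risk`. A small helper capturing just that step (name and clipping are assumptions) would be:

import numpy as np

def episode_risk_sketch(per_step_risks):
    # P(at least one violation over the horizon), assuming independent steps:
    #   risk = 1 - prod_t (1 - p_t)
    p = np.clip(np.asarray(per_step_risks), 0.0, 1.0)
    return 1.0 - np.prod(1.0 - p)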
Example #10
def safe_cars(seed=0):
    T = 25
    th = 0.10
    np.random.seed(seed)
    J = 5
    N = 5
    eval_runs = 5
    env = LinearCars()
    # Initial random rollouts to generate a dataset
    X1, Y1, _, _ = rollout(env,
                           pilco=None,
                           timesteps=T,
                           verbose=True,
                           random=True,
                           render=False)
    for i in range(1, 5):
        X1_, Y1_, _, _ = rollout(env,
                                 pilco=None,
                                 timesteps=T,
                                 verbose=True,
                                 random=True,
                                 render=False)
        X1 = np.vstack((X1, X1_))
        Y1 = np.vstack((Y1, Y1_))

    env = Normalised_Env(np.mean(X1[:, :4], 0), np.std(X1[:, :4], 0))
    X, Y, _, _ = rollout(env,
                         pilco=None,
                         timesteps=T,
                         verbose=True,
                         random=True,
                         render=False)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env,
                               pilco=None,
                               timesteps=T,
                               verbose=True,
                               random=True,
                               render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    m_init = np.transpose(X[0, :-1, None])
    S_init = 0.1 * np.eye(state_dim)

    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=40,
                               max_action=0.2)

    #w1 = np.diag([1.5, 0.001, 0.001, 0.001])
    #t1 = np.divide(np.array([3.0, 1.0, 3.0, 1.0]) - env.m, env.std)
    #R1 = ExponentialReward(state_dim=state_dim, t=t1, W=w1)
    # R1 = LinearReward(state_dim=state_dim, W=np.array([0.1, 0.0, 0.0, 0.0]))
    R1 = LinearReward(state_dim=state_dim,
                      W=np.array([
                          1.0 * env.std[0],
                          0.,
                          0.,
                          0,
                      ]))

    bound_x1 = 1 / env.std[0]
    bound_x2 = 1 / env.std[2]
    B = RiskOfCollision(
        2,
        [-bound_x1 - env.m[0] / env.std[0], -bound_x2 - env.m[2] / env.std[2]],
        [bound_x1 - env.m[0] / env.std[0], bound_x2 - env.m[2] / env.std[2]])

    pilco = SafePILCO((X, Y),
                      controller=controller,
                      mu=-300.0,
                      reward_add=R1,
                      reward_mult=B,
                      horizon=T,
                      m_init=m_init,
                      S_init=S_init)

    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    # define tolerance
    new_data = True
    # init = tf.global_variables_initializer()
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("***ITERATION**** ", rollouts)
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=20, restarts=2)
        # check safety
        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risks = np.zeros(T)
        predicted_rewards = np.zeros(T)

        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risks[h], _ = B.compute_reward(m_h, S_h)
            predicted_rewards[h], _ = R1.compute_reward(m_h, S_h)
        overall_risk = 1 - np.prod(1.0 - predicted_risks)

        print("Predicted episode's return: ", sum(predicted_rewards))
        print("Overall risk ", overall_risk)
        print("Mu is ", pilco.mu.numpy())
        print("bound1 ", bound_x1, " bound1 ", bound_x2)

        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env,
                                         pilco=pilco,
                                         timesteps=T,
                                         verbose=True,
                                         render=False)
            new_data = True
            X = np.vstack((X, X_new))
            Y = np.vstack((Y, Y_new))
            pilco.mgpr.set_data((X, Y))
            if overall_risk < (th / 4):
                pilco.mu.assign(0.75 * pilco.mu.numpy())

        else:
            X_new, Y_new, _, _ = rollout(env,
                                         pilco=pilco,
                                         timesteps=T,
                                         verbose=True,
                                         render=False)
            print(m_p[:, 0] - X_new[:, 0])
            print(m_p[:, 2] - X_new[:, 2])
            print("*********CHANGING***********")
            _, _, r = pilco.predict(m_init, S_init, T)
            print(r)
            # to verify this actually changes, run the reward wrapper before and after on the same trajectory
            pilco.mu.assign(1.5 * pilco.mu.numpy())
            _, _, r = pilco.predict(m_init, S_init, T)
            print(r)
Example #11
    args.epsilon_decay_factor = 0.99
    args.lr = 0.001
    args.gamma = 0.90

    policy = DQNPolicy(make_dqn(statesize, actionsize),
                       statesize,
                       actionsize,
                       lr=args.lr,
                       gamma=args.gamma)
    utils.qlearn(env, policy, args)
    torch.save(policy.model, args.model)

    # From here, take from mp7.py
    # Environment (a Markov Decision Process model)
    # Q Model

    model = utils.loadmodel(args.model, env, statesize, actionsize)
    print("Model: {}".format(model))

    # Rollout
    _, rewards = utils.rollout(env,
                               model,
                               args.episodes,
                               args.epsilon,
                               render=True)

    # Report
    #Evaluate total rewards for MountainCar environment
    score = np.array([np.array(rewards) > -200.0]).sum()
    print('Score: ' + str(score) + '/' + str(args.episodes))
Example #12
File: main.py  Project: Sha-Lab/qmc
def compare_grad(args):
    set_seed(args.seed)
    env = LQR(
        N=args.xu_dim[0],
        M=args.xu_dim[1],
        lims=100,
        init_scale=1.0,
        max_steps=args.H,
        Sigma_s_kappa=1.0,
        Q_kappa=1.0,
        P_kappa=1.0,
        A_norm=1.0,
        B_norm=1.0,
        Sigma_s_scale=args.noise,
    )
    #K = env.optimal_controller()
    K = np.random.randn(env.M, env.N)
    mean_network = nn.Linear(*K.shape[::-1], bias=False)
    mean_network.weight.data = tensor(K)
    policy = GaussianPolicy(*K.shape[::-1],
                            mean_network,
                            learn_std=False,
                            gate_output=False)
    out_set = set()  # estimators whose rollouts terminated early (their errors are reported as NaN)

    Sigma_a = np.diag(np.ones(env.M))
    mc_grads = []
    for i in tqdm(range(args.n_trajs), 'mc'):
        noises = np.random.randn(env.max_steps, env.M)
        states, actions, rewards, _, _ = rollout(env, policy, noises)
        if len(states) < args.H:
            out_set.add('mc')
            break
        mc_grads.append(
            get_gaussian_policy_gradient(states, actions, rewards, policy,
                                         variance_reduced_loss))
    mc_grads = np.asarray(mc_grads)
    mc_means = np.cumsum(mc_grads, axis=0) / np.arange(
        1,
        len(mc_grads) + 1)[:, np.newaxis, np.newaxis]

    rqmc_grads = []
    #loc = torch.zeros(env.max_steps * env.M)
    #scale = torch.ones(env.max_steps * env.M)
    #rqmc_noises = Normal_RQMC(loc, scale).sample(torch.Size([args.n_trajs])).data.numpy()
    rqmc_noises = uniform2normal(
        random_shift(
            ssj_uniform(
                args.n_trajs,
                args.H * env.M,
            ).reshape(args.n_trajs, args.H, env.M),
            0,
        ))
    for i in tqdm(range(args.n_trajs), 'rqmc'):
        states, actions, rewards, _, _ = rollout(
            env, policy, rqmc_noises[i].reshape(env.max_steps, env.M))
        if len(states) < args.H:
            out_set.add('rqmc')
            break
        rqmc_grads.append(
            get_gaussian_policy_gradient(states, actions, rewards, policy,
                                         variance_reduced_loss))
    rqmc_grads = np.asarray(rqmc_grads)
    rqmc_means = np.cumsum(rqmc_grads, axis=0) / np.arange(
        1,
        len(rqmc_grads) + 1)[:, np.newaxis, np.newaxis]

    arqmc_means_dict = {}
    #arqmc_noises = get_rqmc_noises(args.n_trajs, args.H, env.M, 'array')
    uniform_noises = ssj_uniform(args.n_trajs, env.M)  # n_trajs , action_dim
    arqmc_noises = uniform2normal(
        random_shift(np.expand_dims(uniform_noises, 1).repeat(args.H, 1),
                     0))  # n_trajs, horizon, action_dim
    for sorter in args.sorter:
        arqmc_grads = []
        sort_f = get_sorter(sorter, env, K)
        data = ArrayRQMCSampler(env, args.n_trajs,
                                sort_f=sort_f).sample(policy, arqmc_noises)
        for traj in data:
            states, actions, rewards = np.asarray(traj['states']), np.asarray(
                traj['actions']), np.asarray(traj['rewards'])
            if len(states) < args.H:
                out_set.add('arqmc_{}'.format(sorter))
                break
            arqmc_grads.append(
                get_gaussian_policy_gradient(states, actions, rewards, policy,
                                             variance_reduced_loss))
        arqmc_grads = np.asarray(arqmc_grads)
        arqmc_means = np.cumsum(arqmc_grads, axis=0) / np.arange(
            1,
            len(arqmc_grads) + 1)[:, np.newaxis, np.newaxis]
        arqmc_means_dict[sorter] = arqmc_means

    expected_grad = env.expected_policy_gradient(K, Sigma_a)

    mc_errors = [np.nan] if 'mc' in out_set else ((
        mc_means - expected_grad)**2).reshape(mc_means.shape[0], -1).mean(
            1)  # why the sign is reversed?
    rqmc_errors = [np.nan] if 'rqmc' in out_set else (
        (rqmc_means -
         expected_grad)**2).reshape(rqmc_means.shape[0], -1).mean(1)
    arqmc_errors_dict = {
        sorter: [np.nan] if 'arqmc_{}'.format(sorter) in out_set else
        ((arqmc_means -
          expected_grad)**2).reshape(arqmc_means.shape[0], -1).mean(1)
        for sorter, arqmc_means in arqmc_means_dict.items()
    }
    info = {
        **vars(args),
        'out': out_set,
        'expected_grad': expected_grad,
        'means': {
            'mc': mc_means,
            'rqmc': rqmc_means,
            **arqmc_means_dict,
        },
    }
    if args.save_fn is not None:
        with open(args.save_fn, 'wb') as f:
            dill.dump(
                dict(mc_errors=mc_errors,
                     rqmc_errors=rqmc_errors,
                     arqmc_errors_dict=arqmc_errors_dict,
                     info=info), f)
    if args.show_fig:
        mc_data = pd.DataFrame({
            'name': 'mc',
            'x': np.arange(len(mc_errors)),
            'error': mc_errors,
        })
        rqmc_data = pd.DataFrame({
            'name': 'rqmc',
            'x': np.arange(len(rqmc_errors)),
            'error': rqmc_errors,
        })
        arqmc_data = pd.concat([
            pd.DataFrame({
                'name': 'arqmc_{}'.format(sorter),
                'x': np.arange(len(arqmc_errors)),
                'error': arqmc_errors,
            }) for sorter, arqmc_errors in arqmc_errors_dict.items()
        ])
        plot = sns.lineplot(x='x',
                            y='error',
                            hue='name',
                            data=pd.concat([mc_data, rqmc_data, arqmc_data]))
        plot.set(yscale='log')
        plt.show()
    return mc_errors, rqmc_errors, arqmc_errors_dict, info
Example #13
File: main.py  Project: Sha-Lab/qmc
def compare_cost(args):
    set_seed(args.seed)
    env = LQR(
        #N=20,
        #M=12,
        init_scale=1.0,
        max_steps=args.H,  # 10, 20
        Sigma_s_kappa=1.0,
        Q_kappa=1.0,
        P_kappa=1.0,
        A_norm=1.0,
        B_norm=1.0,
        Sigma_s_scale=0.0,
    )
    K = env.optimal_controller()
    mean_network = nn.Linear(*K.shape[::-1], bias=False)
    mean_network.weight.data = tensor(K)
    policy = GaussianPolicy(*K.shape[::-1],
                            mean_network,
                            learn_std=False,
                            gate_output=False)

    # mc
    mc_costs = []  # individual
    mc_means = []  # cumulative
    for i in tqdm(range(args.n_trajs), 'mc'):
        noises = np.random.randn(env.max_steps, env.M)
        _, _, rewards, _, _ = rollout(env, policy, noises)
        mc_costs.append(-rewards.sum())
        mc_means.append(np.mean(mc_costs))

    # rqmc
    rqmc_costs = []
    rqmc_means = []
    rqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M,
                                  'trajwise')
    for i in tqdm(range(args.n_trajs), 'rqmc'):
        _, _, rewards, _, _ = rollout(env, policy, rqmc_noises[i])
        rqmc_costs.append(-rewards.sum())
        rqmc_means.append(np.mean(rqmc_costs))

    # array rqmc
    arqmc_costs_dict = {}
    arqmc_means_dict = {}
    arqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'ssj')
    #arqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'array')

    for sorter in args.sorter:
        arqmc_costs = []
        arqmc_means = []
        sort_f = get_sorter(sorter, env)

        data = ArrayRQMCSampler(env, args.n_trajs,
                                sort_f=sort_f).sample(policy, arqmc_noises)
        for traj in data:
            rewards = np.asarray(traj['rewards'])
            arqmc_costs.append(-rewards.sum())
            arqmc_means.append(np.mean(arqmc_costs))
        arqmc_costs_dict[sorter] = arqmc_costs
        arqmc_means_dict[sorter] = arqmc_means

    expected_cost = env.expected_cost(K, np.diag(np.ones(env.M)))

    mc_errors = np.abs(mc_means - expected_cost)
    rqmc_errors = np.abs(rqmc_means - expected_cost)
    arqmc_errors_dict = {
        sorter: np.abs(arqmc_means - expected_cost)
        for sorter, arqmc_means in arqmc_means_dict.items()
    }
    logger.info('mc: {}, rqmc: {} '.format(mc_errors[-1], rqmc_errors[-1]) + \
        ' '.join(['arqmc ({}): {}'.format(sorter, arqmc_errors[-1]) for sorter, arqmc_errors in arqmc_errors_dict.items()]))
    info = {
        **vars(args), 'mc_costs': mc_costs,
        'rqmc_costs': rqmc_costs,
        'arqmc_costs': arqmc_costs
    }
    if args.save_fn is not None:
        with open(args.save_fn, 'wb') as f:
            dill.dump(
                dict(mc_errors=mc_errors,
                     rqmc_errors=rqmc_errors,
                     arqmc_errors_dict=arqmc_errors_dict,
                     info=info), f)
    if args.show_fig:
        data = pd.concat([
            pd.DataFrame({
                'name': 'mc',
                'x': np.arange(len(mc_errors)),
                'error': mc_errors,
            }),
            pd.DataFrame({
                'name': 'rqmc',
                'x': np.arange(len(rqmc_errors)),
                'error': rqmc_errors,
            }),
            pd.concat([
                pd.DataFrame({
                    'name': 'arqmc_{}'.format(sorter),
                    'x': np.arange(len(arqmc_errors)),
                    'error': arqmc_errors,
                }) for sorter, arqmc_errors in arqmc_errors_dict.items()
            ]),
        ])
        plot = sns.lineplot(x='x', y='error', hue='name', data=data)
        plot.set(yscale='log')
        plt.show()
    return mc_errors, rqmc_errors, arqmc_errors_dict, info
Example #14
config.gpu_options.allow_growth = True

# Reward function parameters: lin_pos[3] + ang_pos[3] + lin_vel[3] + ang_vel[3]
#                   x    y    z       r     p    q    x.   y.   z.   r.  p.    q.
target = np.array([0.0, 0.0, 0.4075, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
weights = np.diag([0.3, 0.3, 2.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0, 1.0, 1.0, 0.2])

subs=2

m_init = np.random.randn(12)*0.01
S_init = m_init*0.0 + 0.02

with tf.Session(config=config, graph=tf.Graph()) as sess:
    env = gym.make('VrepBalanceBot2-v0')
    # Initial random rollouts to generate a dataset
    X,Y = rollout(env=env, pilco=None, random=True, timesteps=80, SUBS=subs, render=False)
    for i in range(1,12): #uniform action sampling
        X_, Y_ = rollout(env=env, pilco=None, random=True,  timesteps=80, SUBS=subs, render=False)
        X = np.vstack((X, X_)).astype(np.float64)
        Y = np.vstack((Y, Y_)).astype(np.float64)
    for i in range(1,24): #Gaussian/Normal distribution action sampling
        X_, Y_ = rollout(env=env, pilco=None, random="Normal",  timesteps=80, SUBS=subs, render=False)
        X = np.vstack((X, X_)).astype(np.float64)
        Y = np.vstack((Y, Y_)).astype(np.float64)
    for i in range(1,4): #No action sampling; u := 0
        X_, Y_ = rollout(env=env, pilco=None, random=None,  timesteps=80, SUBS=subs, render=False)
        X = np.vstack((X, X_)).astype(np.float64)
        Y = np.vstack((Y, Y_)).astype(np.float64)        


    state_dim = Y.shape[1]
Example #15
    # Load data into arrays
    all_obs = np.zeros((args.num_rollouts, max_path_length, flat_obs))
    all_rewards = np.zeros((args.num_rollouts, max_path_length))
    rew = []

    ### changes start
    # import ipdb; ipdb.set_trace()  # debugging breakpoint left disabled so the rollout loop below runs
    if args.weight:
        func = args.weight
    controller = control.StraightController(func)

    ### changes end
    for j in range(args.num_rollouts):
        # run a single rollout of the experiment
        path = rollout(env=env, agent=policy, controller=controller)

        # collect the observations and rewards from the rollout
        new_obs = path['observations']
        all_obs[j, :new_obs.shape[0], :new_obs.shape[1]] = new_obs
        new_rewards = path['rewards']
        all_rewards[j, :len(new_rewards)] = new_rewards

        # print the cumulative reward of the most recent rollout
        print("Round {}, return: {}".format(j, sum(new_rewards)))
        rew.append(sum(new_rewards))

    # print the average cumulative reward across rollouts
    print("Average, std return: {}, {}".format(np.mean(rew), np.std(rew)))

    # ensure that a reward_plots folder exists in the directory, and if not,
Example #16
        np.random.seed(seed)
        torch.cuda.manual_seed(seed)
        torch.manual_seed(seed)
        env.seed(seed)

        # Get models from file
        itr_dir = 'itr_%03d' % args.iteration if args.iteration > -1 else 'last_itr'
        models_dir = osp.join(args.log_dir, 'models', itr_dir)
        policy_file = osp.join(models_dir, 'policy.pt')
        policy = torch.load(policy_file,
                            map_location=lambda storage, loc: storage)

        print('\n' * 5)
        print('--->horizon', horizon)
        rollout(env,
                policy,
                max_horizon=horizon,
                fixed_horizon=True,
                render=True,
                return_info_dict=False,
                scale_pol_output=True,
                device='cpu',
                record_video_name=None,
                deterministic=not args.stochastic)

        if not args.record:
            input("Press a key to close the script")

        env.close()
Example #17
    hidden_sizes=(32,32,),
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=3000,
    max_path_length=env.horizon,
    n_itr=100,
    discount=0.995,
    step_size=0.01,
    plot=False,
)
algo.train()
rollout(env, policy)

with open("models/rc_gradient/agentturn" + "policy.pkl", "w") as f:
    f.dump(policy)

# run_experiment_lite(
#     algo.train(),
#     # Number of parallel workers for sampling
#     n_parallel=4,
#     # Only keep the snapshot parameters for the last iteration
#     snapshot_mode="last",
#     script="scripts/run_experiment_lite_rl.py",
#     # script="scripts/run_experiment_lite.py",
#     log_dir="Results/Tmp",
#     # Specifies the seed for the experiment. If this is not provided, a random seed
#     # will be used
Example #18
import numpy as np
import gym

from pilco.models import PILCO
from pilco.controllers import RbfController, LinearController
from pilco.rewards import ExponentialReward
import tensorflow as tf
from tensorflow import logging
np.random.seed(0)

from utils import rollout, policy

with tf.Session(graph=tf.Graph()) as sess:
    env = gym.make('Pendulum-v0')
    # Initial random rollouts to generate a dataset
    X,Y = rollout(env=env, pilco=None, random=True, timesteps=40)
    for i in range(1,3):
        X_, Y_ = rollout(env=env, pilco=None, random=True,  timesteps=40)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))


    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=5)
    #controller = LinearController(state_dim=state_dim, control_dim=control_dim)

    pilco = PILCO(X, Y, controller=controller, horizon=40)
    # Example of user provided reward function, setting a custom target state
    # R = ExponentialReward(state_dim=state_dim, t=np.array([0.1,0,0,0]))
    # pilco = PILCO(X, Y, controller=controller, horizon=40, reward=R)
Example #19
File: test.py  Project: alxhrzg/PILCO
    weights[0, 0] = 1.0
    weights[3, 3] = 1.0
    m_init = np.zeros(state_dim)[None, :]
    S_init = 0.005 * np.eye(state_dim)
    T = 40
    J = 5
    N = 12
    T_sim = 130
    restarts = True
    lens = []

    env = DoublePendWrapper()
    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env,
                         None,
                         timesteps=T,
                         random=True,
                         SUBS=SUBS,
                         render=True)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env,
                               None,
                               timesteps=T,
                               random=True,
                               SUBS=SUBS,
                               verbose=True,
                               render=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
Example #20
import gym
import numpy as np
import tensorflow as tf
from pilco.controllers import RbfController
from pilco.models import PILCO
from utils import rollout

np.random.seed(0)

with tf.Session(graph=tf.Graph()) as sess:
    env = gym.make('InvertedPendulum-v2')

    # Evaluate random actions so we know how bad random is
    random_rewards = []
    for i in range(1, 100):
        _, Y_, rewards = rollout(env=env,
                                 pilco=None,
                                 random=True,
                                 timesteps=40)
        random_rewards.append(sum(rewards))

    # Initial random rollouts to generate a dataset
    X, Y, _ = rollout(env=env, pilco=None, random=True, timesteps=40)
    random_rewards = []
    for i in range(1, 3):
        X_, Y_, rewards = rollout(env=env,
                                  pilco=None,
                                  random=True,
                                  timesteps=40)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
        random_rewards.append(sum(rewards))
Example #21
max_action = 2.0  # used by the controller, but really defined by the environment

# Reward function parameters
target = np.array([1.0, 0.0, 0.0])
weights = np.diag([2.0, 2.0, 0.3])

# Environment defined
m_init = np.reshape([-1.0, 0.0, 0.0], (1, 3))
S_init = np.diag([0.01, 0.01, 0.01])

# Random rollouts
X, Y = rollout(env,
               None,
               timesteps=T,
               verbose=False,
               random=True,
               SUBS=SUBS,
               render=True)

for i in range(1, J):
    X_, Y_ = rollout(env,
                     None,
                     timesteps=T,
                     verbose=False,
                     random=True,
                     SUBS=SUBS,
                     render=True)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))
print(X)
Example #22
import numpy as np
import gym
from pilco.models import PILCO
from pilco.controllers import RbfController, LinearController
from pilco.rewards import ExponentialReward
import tensorflow as tf
np.random.seed(0)
from utils import policy, rollout, Normalised_Env

SUBS = 5
T = 25
env = gym.make('MountainCarContinuous-v0')
# Initial random rollouts to generate a dataset
X1, Y1, _, _ = rollout(env=env,
                       pilco=None,
                       random=True,
                       timesteps=T,
                       SUBS=SUBS,
                       render=True)
for i in range(1, 5):
    X1_, Y1_, _, _ = rollout(env=env,
                             pilco=None,
                             random=True,
                             timesteps=T,
                             SUBS=SUBS,
                             render=True)
    X1 = np.vstack((X1, X1_))
    Y1 = np.vstack((Y1, Y1_))
env.close()

env = Normalised_Env('MountainCarContinuous-v0', np.mean(X1[:, :2], 0),
                     np.std(X1[:, :2], 0))
Example #23
# weights[0,0] = 0.5
# weights[3,3] = 0.5
m_init = np.zeros(state_dim)[None, :]
S_init = 0.01 * np.eye(state_dim)
T = 100
J = 7
N = 15
T_sim = 100
restarts=True
lens = []

with tf.Session() as sess:
    env = DriftCarWrapper()

    # Initial random rollouts to generate a dataset
    X,Y = rollout(env, None, timesteps=T, random=True, SUBS=SUBS)
    for i in range(1,J):
        X_, Y_ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)

    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)

    pilco = PILCO(X, Y, controller=controller, horizon=T, reward=R, m_init=m_init, S_init=S_init)

    # for numerical stability
Example #24
    env = gym.make('Pendulum-v0')
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]

    ddpg = DDPG(state_dim, action_dim,
                [env.action_space.low, env.action_space.high])

    if args.type == 'train':

        get_state = lambda x: x
        for i in range(10000):
            total_reward = ddpg.update(env, get_state)
            print("Iteration " + str(i) + " reward: " + str(total_reward))
            if i % 20 == 0:
                [_, _, rewards] = rollout(env,
                                          ddpg.curr_policy(),
                                          get_state,
                                          render=True)
                total_reward = np.sum(np.array(rewards))
                print("Test reward: " + str(total_reward))

            if i % 100 == 0:
                ddpg.save_model(args.file)

        policy = ddpg.curr_policy()
        rollout(env, policy, get_state, render=True)
        ddpg.save_model(args.file)
    elif args.type == 'test':
        ddpg.load_model(args.file)
        get_state = lambda x: x
        for i in range(20):
            [_, _, rewards] = rollout(env,
                                      ddpg.curr_policy(),
                                      get_state,
                                      render=True)
Example #25
        self.env.render()


if __name__ == '__main__':
    env = TendonGymEnv()
    e = np.array(
        [[1]])  # Max control input. Setting it too low can lead to Cholesky failures.
    T = 10
    maxiter = 10
    T_sim = 300
    buffer_size = 600
    verbose = True

    X, Y, _, _, _ = rollout(env=env,
                            pilco=None,
                            random=True,
                            timesteps=T_sim,
                            render=False,
                            verbose=verbose)
    for i in range(1, 1):
        X_, Y_, _, _, _ = rollout(env=env,
                                  pilco=None,
                                  random=True,
                                  timesteps=T_sim,
                                  render=False,
                                  verbose=verbose)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    m_init = np.reshape(np.zeros(state_dim), (1, state_dim))  # initial state mean
Example #26
def evaluate_prob_success(env, policy):
    rolls = [rollout(env, policy, show=False) for i in range(100)]
    reward, successes = zip(*rolls)
    print(np.mean(reward))
    return sum(successes) * 1. / 100