def nonbatch(task, method, N, M):
    simulation_object = create_env(task)
    d = simulation_object.num_of_features
    lower_input_bound = [x[0] for x in simulation_object.feed_bounds]
    upper_input_bound = [x[1] for x in simulation_object.feed_bounds]

    w_sampler = Sampler(d)
    psi_set = []
    s_set = []
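    # note: multiplying these Python lists by 2 tiles them, so low/high line up with the 2 * feed_size values sampled below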
    input_A = np.random.uniform(low=2 * lower_input_bound,
                                high=2 * upper_input_bound,
                                size=(2 * simulation_object.feed_size))
    input_B = np.random.uniform(low=2 * lower_input_bound,
                                high=2 * upper_input_bound,
                                size=(2 * simulation_object.feed_size))
    psi, s = get_feedback(simulation_object, input_A, input_B)
    psi_set.append(psi)
    s_set.append(s)
    for i in range(1, N):
        w_sampler.A = psi_set
        w_sampler.y = np.array(s_set).reshape(-1, 1)
        w_samples = w_sampler.sample(M)
        mean_w_samples = np.mean(w_samples, axis=0)
        print('w-estimate = {}'.format(mean_w_samples /
                                       np.linalg.norm(mean_w_samples)))
        input_A, input_B = run_algo(method, simulation_object, w_samples)
        psi, s = get_feedback(simulation_object, input_A, input_B)
        psi_set.append(psi)
        s_set.append(s)
    w_sampler.A = psi_set
    w_sampler.y = np.array(s_set).reshape(-1, 1)
    w_samples = w_sampler.sample(M)
    mean_w_samples = np.mean(w_samples, axis=0)  # recompute the mean from the final sample set
    print('w-estimate = {}'.format(mean_w_samples /
                                   np.linalg.norm(mean_w_samples)))
Example #2
def nonbatch(task, method, N, M):
    simulation_object = create_env(task)
    d = simulation_object.num_of_features
	
    w_true = 2*np.random.rand(d)-1
    w_true = w_true / np.linalg.norm(w_true)
    print('If in automated mode: true w = {}'.format(w_true))
	
    lower_input_bound = [x[0] for x in simulation_object.feed_bounds]
    upper_input_bound = [x[1] for x in simulation_object.feed_bounds]

    w_sampler = Sampler(d)
    psi_set = []
    s_set = []
    for i in range(N):
        w_sampler.A = psi_set
        w_sampler.y = np.array(s_set).reshape(-1,1)
        w_samples = w_sampler.sample(M)
        mean_w_samples = np.mean(w_samples,axis=0)
        print('Samples so far: ' + str(i))
        print('w estimate = {}'.format(mean_w_samples/np.linalg.norm(mean_w_samples)))
        print('Alignment = {}'.format(mean_w_samples.dot(w_true)/np.linalg.norm(mean_w_samples)))
        input_A, input_B = run_algo(method, simulation_object, w_samples)
        psi, s = get_feedback(simulation_object, input_A, input_B, w_true)
        psi_set.append(psi)
        s_set.append(s)
    w_sampler.A = psi_set
    w_sampler.y = np.array(s_set).reshape(-1,1)
    w_samples = w_sampler.sample(M)
    mean_w_samples = np.mean(w_samples, axis=0)  # recompute the mean from the final sample set
    print('Samples so far: ' + str(N))
    print('w estimate = {}'.format(mean_w_samples/np.linalg.norm(mean_w_samples)))
    print('Alignment = {}'.format(mean_w_samples.dot(w_true)/np.linalg.norm(mean_w_samples)))
Example #3
def nonbatch(task, criterion, query_type, epsilon, M):
	simulation_object = create_env(task)		
	d = simulation_object.num_of_features

	true_delta = 1 # make this None if you will also learn delta, and change the samplers below from sample_given_delta to sample (and of course remove the true_delta argument)

	lower_input_bound = [x[0] for x in simulation_object.feed_bounds]
	upper_input_bound = [x[1] for x in simulation_object.feed_bounds]

	w_sampler = Sampler(d)
	i = 0
	score = np.inf
	while score >= epsilon:
		w_samples, delta_samples = w_sampler.sample_given_delta(M, query_type, true_delta)
		mean_w_samples = np.mean(w_samples,axis=0)
		print('w-estimate = {}'.format(mean_w_samples/np.linalg.norm(mean_w_samples)))
		input_A, input_B, score = run_algo(criterion, simulation_object, w_samples, delta_samples)
		if criterion == 'information':
			print('Expected info gain = {}'.format(score))
		elif criterion == 'volume':
			print('Expected volume removal (meaningless scale) = {}'.format(score/M))
		if score > epsilon:
			phi_A, phi_B, s = get_feedback(simulation_object, input_A, input_B, query_type)
			w_sampler.feed(phi_A, phi_B, [s])
			i += 1
	w_samples, delta_samples = w_sampler.sample_given_delta(M, query_type, true_delta)
	mean_w_samples = np.mean(w_samples,axis=0)
	print('w-estimate = {}'.format(mean_w_samples/np.linalg.norm(mean_w_samples)))
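As a purely illustrative sketch (not the repo's run_algo, which is not reproduced in these snippets), the 'information' criterion printed above can be estimated from the w samples as the entropy of the mean preference probability minus the mean per-sample entropy, assuming a hypothetical logistic preference model:

import numpy as np

def expected_info_gain(w_samples, psi):
    # p[k] = P(user prefers A | w_k) under a logistic preference model (an assumed, simplified choice)
    p = 1.0 / (1.0 + np.exp(-w_samples @ psi))
    def entropy(q):
        q = np.clip(q, 1e-12, 1 - 1e-12)
        return -(q * np.log2(q) + (1 - q) * np.log2(1 - q))
    # I(answer; w) ~= H(E_w[p]) - E_w[H(p)], in bits
    return entropy(p.mean()) - entropy(p).mean()

w_samples = np.random.randn(100, 4)
print(expected_info_gain(w_samples, np.array([1.0, 0.0, 0.0, 0.0])))  # samples disagree on the answer: positive gain
print(expected_info_gain(w_samples, np.zeros(4)))                     # uninformative query: ~0 bits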
def run_comparison_plots(num_preference_queries,
                         num_membership_queries,
                         ground_truth_reward,
                         ground_truth_boundary,
                         M,
                         task='driver',
                         method='nonbatch'):
    # a binary search over a sorted list of 2**k trajectories needs at most k membership queries
    bsearch_num_samples = 2**num_membership_queries
    preference_learned_rewards = nonbatch(task,
                                          method,
                                          num_preference_queries,
                                          M,
                                          checkpoints=[num_preference_queries
                                                       ])[-1]
    simulation_object = create_env(task)
    # collect trajectories
    reward_samples_full_set = collect_trajectories(simulation_object, method,
                                                   bsearch_num_samples,
                                                   preference_learned_rewards)
    random_samples_full_set = collect_trajectories(simulation_object, "random",
                                                   num_membership_queries,
                                                   preference_learned_rewards)

    # get the boundary and SVM values from the query methods
    preference_bsearch_boundary = membership_threshold(
        lattice.sort_on_rewards(reward_samples_full_set),
        simulation_object,
        get_labels=False)
    preference_svm_coeff, preference_svm_boundary, preference_svm = svm_threshold(
        reward_samples_full_set[:num_membership_queries], simulation_object)
    random_svm_coeff, random_svm_boundary, random_svm = svm_threshold(
        random_samples_full_set, simulation_object)
    # normalize the preference coefficients
    preference_svm_boundary = preference_svm_boundary / np.linalg.norm(
        preference_svm_coeff)
    preference_svm_coeff = preference_svm_coeff / np.linalg.norm(
        preference_svm_coeff)
    random_svm_boundary = random_svm_boundary / np.linalg.norm(
        random_svm_coeff)
    random_svm_coeff = random_svm_coeff / np.linalg.norm(random_svm_coeff)

    #now, use them for evaluation
    def compute_angle(vec1, vec2):
        # assumes both vectors are already unit-normalized
        return np.arccos(np.clip(np.dot(vec1, vec2), -1.0, 1.0))

    difference_values = {}
    difference_values["svm w/ reward"] = compute_angle(preference_svm_coeff, ground_truth_reward) \
                                         + abs(preference_svm_boundary - ground_truth_boundary)

    difference_values["svm w/ random"] = compute_angle(random_svm_coeff, ground_truth_reward) \
                                         + abs(random_svm_boundary - ground_truth_boundary)
    difference_values["bsearch w/ reward"] = compute_angle(preference_learned_rewards, ground_truth_reward) \
                                         + abs(preference_bsearch_boundary - ground_truth_boundary)
    print("Differences computed are: {}".format(difference_values))
    return difference_values
def batch(task, method, N, M, b):
    if N % b != 0:
        print('N must be divisible by b')
        exit(0)
    B = 20 * b

    simulation_object = create_env(task)
    d = simulation_object.num_of_features
    lower_input_bound = [x[0] for x in simulation_object.feed_bounds]
    upper_input_bound = [x[1] for x in simulation_object.feed_bounds]

    w_sampler = Sampler(d)
    psi_set = []
    s_set = []
    inputA_set = np.random.uniform(low=2 * lower_input_bound,
                                   high=2 * upper_input_bound,
                                   size=(b, 2 * simulation_object.feed_size))
    inputB_set = np.random.uniform(low=2 * lower_input_bound,
                                   high=2 * upper_input_bound,
                                   size=(b, 2 * simulation_object.feed_size))
    for j in range(b):
        input_A = inputA_set[j]
        input_B = inputB_set[j]
        psi, s = get_feedback(simulation_object, input_A, input_B)
        psi_set.append(psi)
        s_set.append(s)
    i = b
    while i < N:
        w_sampler.A = psi_set
        w_sampler.y = np.array(s_set).reshape(-1, 1)
        w_samples = w_sampler.sample(M)
        mean_w_samples = np.mean(w_samples, axis=0)
        print('w-estimate = {}'.format(mean_w_samples /
                                       np.linalg.norm(mean_w_samples)))
        print('Samples so far: ' + str(i))
        inputA_set, inputB_set = run_algo(method, simulation_object, w_samples,
                                          b, B)
        for j in range(b):
            input_A = inputA_set[j]
            input_B = inputB_set[j]
            psi, s = get_feedback(simulation_object, input_B, input_A)
            psi_set.append(psi)
            s_set.append(s)
        i += b
    w_sampler.A = psi_set
    w_sampler.y = np.array(s_set).reshape(-1, 1)
    w_samples = w_sampler.sample(M)
    mean_w_samples = np.mean(w_samples, axis=0)
    print('w-estimate = {}'.format(mean_w_samples /
                                   np.linalg.norm(mean_w_samples)))
def nonbatch(task, method, N, M, checkpoints=None):
    if checkpoints is None:
        checkpoints = []
    checkpointed_weights = []
    simulation_object = create_env(task)
    d = simulation_object.num_of_features
    lower_input_bound = [x[0] for x in simulation_object.feed_bounds]
    upper_input_bound = [x[1] for x in simulation_object.feed_bounds]

    w_sampler = Sampler(d)
    psi_set = []
    s_set = []
    input_A = np.random.uniform(low=2 * lower_input_bound,
                                high=2 * upper_input_bound,
                                size=(2 * simulation_object.feed_size))
    input_B = np.random.uniform(low=2 * lower_input_bound,
                                high=2 * upper_input_bound,
                                size=(2 * simulation_object.feed_size))
    # psi is the feature difference between the two trajectories; s is the +1/-1
    # preference signal (a toy likelihood sketch follows this function)
    psi, s = get_feedback_auto(simulation_object, input_A, input_B)
    psi_set.append(psi)
    s_set.append(s)
    for i in range(1, N):
        w_sampler.A = psi_set
        w_sampler.y = np.array(s_set).reshape(-1, 1)
        w_samples = w_sampler.sample(M)
        mean_w_samples = np.mean(w_samples, axis=0)
        print('w-estimate = {}'.format(mean_w_samples /
                                       np.linalg.norm(mean_w_samples)))
        if i in checkpoints:
            checkpointed_weights.append(mean_w_samples /
                                        np.linalg.norm(mean_w_samples))
            print("Weights saved at iteration {}".format(i))
        input_A, input_B = run_algo(method, simulation_object, w_samples)
        psi, s = get_feedback_auto(simulation_object, input_A, input_B)
        psi_set.append(psi)
        s_set.append(s)
    w_sampler.A = psi_set
    w_sampler.y = np.array(s_set).reshape(-1, 1)
    w_samples = w_sampler.sample(M)
    mean_w_samples = np.mean(w_samples, axis=0)  # recompute the mean from the final sample set
    checkpointed_weights.append(mean_w_samples /
                                np.linalg.norm(mean_w_samples))
    print('w-estimate = {}'.format(mean_w_samples /
                                   np.linalg.norm(mean_w_samples)))
    return checkpointed_weights
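The Sampler used above is not reproduced in these snippets. As a purely illustrative sketch (not the repo's implementation), one common way to score a weight vector w against a recorded pair (psi, s) is a logistic preference likelihood:

import numpy as np

def preference_loglik(w, psi, s):
    # hypothetical model: P(s | w, psi) = 1 / (1 + exp(-s * w.psi)), with s = +1 if A was preferred, -1 otherwise
    return -np.log1p(np.exp(-s * np.dot(w, psi)))

w = np.array([0.5, -0.5, 0.5, 0.5])
psi = np.array([0.2, -0.1, 0.3, 0.0])
print(preference_loglik(w, psi, s=1))   # less negative: w agrees with the preference
print(preference_loglik(w, psi, s=-1))  # more negative: w disagrees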
Example #7
def batch(task, method, N, M, b):
    if N % b != 0:
        print('N must be divisible by b')
        exit(0)
    B = 20*b

    simulation_object = create_env(task)
    d = simulation_object.num_of_features
	
    w_true = 2*np.random.rand(d)-1
    w_true = w_true / np.linalg.norm(w_true)
    print('If in automated mode: true w = {}'.format(w_true))
	
    lower_input_bound = [x[0] for x in simulation_object.feed_bounds]
    upper_input_bound = [x[1] for x in simulation_object.feed_bounds]

    w_sampler = Sampler(d)
    psi_set = []
    s_set = []
    i = 0
    while i < N:
        w_sampler.A = psi_set
        w_sampler.y = np.array(s_set).reshape(-1,1)
        w_samples = w_sampler.sample(M)
        mean_w_samples = np.mean(w_samples,axis=0)
        print('Samples so far: ' + str(i))
        print('w estimate = {}'.format(mean_w_samples/np.linalg.norm(mean_w_samples)))
        print('Alignment = {}'.format(mean_w_samples.dot(w_true)/np.linalg.norm(mean_w_samples)))
        inputA_set, inputB_set = run_algo(method, simulation_object, w_samples, b, B)
        for j in range(b):
            input_A = inputA_set[j]
            input_B = inputB_set[j]
            psi, s = get_feedback(simulation_object, input_B, input_A, w_true)
            psi_set.append(psi)
            s_set.append(s)
        i += b
    w_sampler.A = psi_set
    w_sampler.y = np.array(s_set).reshape(-1,1)
    w_samples = w_sampler.sample(M)
    mean_w_samples = np.mean(w_samples, axis=0)
    print('Samples so far: ' + str(N))
    print('w estimate = {}'.format(mean_w_samples/np.linalg.norm(mean_w_samples)))
    print('Alignment = {}'.format(mean_w_samples.dot(w_true)/np.linalg.norm(mean_w_samples)))
Example #8
from algos import generate_psi
from simulation_utils import create_env
import numpy as np
import sys
import os

task = sys.argv[1].lower()
K = int(sys.argv[2])

simulation_object = create_env(task)
z = simulation_object.feed_size
lower_input_bound = [x[0] for x in simulation_object.feed_bounds]
upper_input_bound = [x[1] for x in simulation_object.feed_bounds]
inputs_set = np.random.uniform(low=2*lower_input_bound, high=2*upper_input_bound, size=(K, 2*z))
psi_set = generate_psi(simulation_object, inputs_set)

if not os.path.isdir('ctrl_samples'):
    os.mkdir('ctrl_samples')
np.savez('ctrl_samples/' + simulation_object.name + '.npz', inputs_set=inputs_set, psi_set=psi_set)
print('Done!')
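A hypothetical invocation, assuming the script above is saved as input_sampler.py: python input_sampler.py driver 1000 draws 1000 random control pairs for the driver task, computes their psi features with generate_psi, and caches both arrays under ctrl_samples/<env name>.npz.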
def find_threshold(num_weighted_samples,
                   num_random_samples,
                   reward_values,
                   num_membership_queries=0,
                   task='driver',
                   method="nonbatch"):
    # first, sample the trajectories from the distribution
    simulation_object = create_env(task)
    d = simulation_object.num_of_features
    lower_input_bound = [x[0] for x in simulation_object.feed_bounds]
    upper_input_bound = [x[1] for x in simulation_object.feed_bounds]

    w_sampler = Sampler(d)
    # set the reward weights of the sampler
    # set the number of membership queries as a log function of the total # of samples
    num_membership_queries = max(
        num_membership_queries,
        int(math.ceil(math.log(num_weighted_samples + num_random_samples))))
    reward_traj_set = collect_trajectories(simulation_object, method,
                                           num_weighted_samples, reward_values)
    random_traj_set = collect_trajectories(simulation_object, "random",
                                           num_random_samples, reward_values)
    # fixed ground-truth reward weights, used below to label the held-out test set
    w_true = np.array([0.56687795, -0.51010378, 0.5178173, 0.38769675])

    # add n more samples to each svm dataset --> n + log(n) samples total
    svm_reward_set = reward_traj_set[:num_membership_queries] + collect_trajectories(
        simulation_object, method, num_weighted_samples, reward_values)
    svm_random_set = random_traj_set[:num_membership_queries] + collect_trajectories(
        simulation_object, "random", num_weighted_samples, reward_values)

    full_traj_set = reward_traj_set + random_traj_set
    # sort the trajectories by reward
    sorted_lattice = lattice.sort_on_rewards(full_traj_set)

    # test set trajectories sampled from w_true
    with open('reward_test_set.obj', 'rb') as f_reward:
        reward_traj_set_test = pickle.load(f_reward)
    # test set trajectories sampled randomly
    with open('random_test_set.obj', 'rb') as f_random:
        random_traj_set_test = pickle.load(f_random)

    #get data and labels for the test set
    x = []
    y = []
    r = []
    reward_traj_set_test = reward_traj_set_test + random_traj_set_test
    for node in reward_traj_set_test:
        #print(node.reward_value)
        x.append(node.features)
        reward = np.sum(np.dot(w_true, node.features))
        r.append(reward)
        if reward < 0.74:
            y.append(0)
        else:
            y.append(1)
    print(y)

    #now, begin getting membership query feedback on things
    bsearch_reward_bound, labeled_data = membership_threshold(
        sorted_lattice, simulation_object, get_labels=True)
    svm_bsearch_coeff, svm_bsearch_inter, clssfr_bsearch = svm_threshold(
        svm_reward_set, simulation_object, labeled_samples=labeled_data)
    svm_reward_coeff, svm_reward_inter, clssfr_reward = svm_threshold(
        svm_reward_set, simulation_object)
    svm_random_coeff, svm_random_inter, clssfr_random = svm_threshold(
        svm_random_set, simulation_object)
    # finished process
    print("Reward boundary retrieved from binary search method is {}".format(
        bsearch_reward_bound))
    print(
        "SVM coefficient and intercept for same queries as binary search are: {} and {}"
        .format(svm_bsearch_coeff, svm_bsearch_inter))
    print(
        "SVM coefficient and intercept for reward-sampled queries are: {} and {}"
        .format(svm_reward_coeff, svm_reward_inter))
    print(
        "SVM coefficient and intercept for random-sampled queries are: {} and {}"
        .format(svm_random_coeff, svm_random_inter))
    print("Reward weights for task are {}".format(reward_values))

    acc_bsearch = get_accuracy(r,
                               y,
                               reward_bound=bsearch_reward_bound,
                               clssfr=None)
    acc_svm_learnt = get_accuracy(x,
                                  y,
                                  reward_bound=None,
                                  clssfr=clssfr_reward)
    acc_svm_random = get_accuracy(x,
                                  y,
                                  reward_bound=None,
                                  clssfr=clssfr_random)

    print("Accuracy for binary search is ", acc_bsearch)
    print("Accuracy for svm with reward-sampled queries is ", acc_svm_learnt)
    print("Accuracy for svm with randomly-sampled queries is ", acc_svm_random)
Example #10
from simulation_utils import create_env, perform_best

task = 'Tosser'
w = [0.29754784, 0.03725074, 0.00664673, 0.80602143]
iter_count = 5  # the optimization is nonconvex, so you can specify the number of random starting points (see the multi-start sketch after this script)

##### YOU DO NOT NEED TO MODIFY THE CODE BELOW THIS LINE #####

D = create_env(task.lower())
perform_best(D, w, iter_count)
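perform_best comes from simulation_utils and is not reproduced in these snippets. As a rough sketch of why iter_count matters, this is roughly what restarting a local optimizer from several random initial points looks like (hypothetical objective and bounds, shown with scipy purely for illustration):

import numpy as np
from scipy.optimize import minimize

def multi_start_minimize(objective, bounds, n_starts, seed=0):
    # the objective is nonconvex, so run a local optimizer from several random starts and keep the best local optimum
    rng = np.random.default_rng(seed)
    lower = np.array([b[0] for b in bounds])
    upper = np.array([b[1] for b in bounds])
    best = None
    for _ in range(n_starts):
        x0 = rng.uniform(lower, upper)
        res = minimize(objective, x0, bounds=bounds)
        if best is None or res.fun < best.fun:
            best = res
    return best

# toy nonconvex objective with several local minima
result = multi_start_minimize(lambda x: np.sin(3 * x[0]) + 0.1 * x[0] ** 2, bounds=[(-5.0, 5.0)], n_starts=5)
print(result.x, result.fun)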
Example #11
from simulation_utils import create_env, compute_best, play
import sys

task = sys.argv[1].lower()
iter_count = int(
    sys.argv[2]
)  # the optimization is nonconvex, so you can specify the number of random starting points
w = [float(x) for x in sys.argv[3:]]

##### YOU DO NOT NEED TO MODIFY THE CODE BELOW THIS LINE #####

simulation_object = create_env(task.lower())
optimal_ctrl = compute_best(simulation_object, w, iter_count)
play(simulation_object, optimal_ctrl)
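A hypothetical invocation, assuming the script above is saved as run_optimizer.py and reusing the Tosser weights from the earlier example: python run_optimizer.py tosser 5 0.29754784 0.03725074 0.00664673 0.80602143 computes a locally optimal control under those weights with compute_best and plays it back with play.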