Example #1
    acc = np.zeros(ITER)
    for _ in range(ITER):
        print "     Iteration: " + str(_)
        print "     Retraining with " + str(len(dagger.net.data)) + " examples"
        dagger.retrain()
        acc[_] = dagger.svm.acc()
        iteration_states = []
        # Record states only for the first LIMIT_DATA rollouts of each iteration
        dagger.record = True
        for i in range(SAMP):
            if i >= LIMIT_DATA:
                dagger.record = False
            dagger.rollout()
            iteration_states += dagger.get_recent_rollout_states().tolist()
            r[_] = r[_] + dagger.get_reward() / SAMP
        if _ == ITER - 1 and t == 0:
            dagger_analysis.count_states(np.array(iteration_states))
            dagger_analysis.save_states("comparisons/boost_dt_comparisons/boost_dt_dagger_final.png")            
            dagger_analysis.show_states()
    if t == 0:
        dagger_analysis.reset_density()        
        dagger_analysis.count_states(dagger.get_states())
        dagger_analysis.save_states("comparisons/boost_dt_comparisons/boost_dt_dagger.png")
        dagger_analysis.show_states()
        plotter.plot_state_actions(mdp.pi, rewards=rewards, sinks=sinks,
                filename='comparisons/boost_dt_comparisons/boost_dt_dagger_state_action.png')
    dagger_data[t,:] = r
    dagger_acc[t,:] = acc


# print value_iter_data
# print classic_il_data
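# Example #1 is a fragment of the DAgger outer loop used throughout these
# scripts. For reference, a minimal self-contained sketch of that loop, with
# hypothetical learner/expert/env objects standing in for SVMDagger and the
# grid MDP (not the actual API):
import numpy as np

def dagger_loop(learner, expert, env, iters, samples_per_iter):
    rewards = np.zeros(iters)
    states, _ = env.rollout(expert)                    # seed with one supervisor rollout, as in the scripts
    dataset = [(s, expert.action(s)) for s in states]  # aggregated (state, expert action) pairs
    for it in range(iters):
        learner.fit(dataset)                           # retrain on everything collected so far
        for _ in range(samples_per_iter):
            states, reward = env.rollout(learner)      # roll out the current learned policy
            dataset += [(s, expert.action(s)) for s in states]  # expert relabels visited states
            rewards[it] += reward / float(samples_per_iter)
    return rewards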
Example #2
def run(ne, lr):
    plotter = plot_class.Plotter()
    
    comparisons_directory, data_directory = make_name(ne, lr)
    if not os.path.exists(comparisons_directory):
        os.makedirs(comparisons_directory)
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    #ITER = 25
    #TRIALS = 10
    #SAMP = 20 
    ITER = 10
    TRIALS = 3
    SAMP = 10
    LIMIT_DATA = 1
    DEPTH = 6
    
    H = 15
    W = 15

    grid = BasicGrid(H, W)
    rewards = scenarios.scenario3['rewards']
    sinks = scenarios.scenario3['sinks']
    grid.reward_states = rewards
    grid.sink_states = sinks

    mdp = ClassicMDP(ClassicPolicy(grid), grid)
    #mdp.value_iteration()
    #mdp.save_policy('scen4.p')
    mdp.load_policy('scen4.p')

    value_iter_pi = mdp.pi
    plotter.plot_state_actions(value_iter_pi, rewards = grid.reward_states, sinks = grid.sink_states,
            filename=comparisons_directory + 'value_iter_state_action.png')

    value_iter_data = np.zeros([TRIALS, ITER])
    classic_il_data = np.zeros([TRIALS, ITER])
    classic_il_acc = np.zeros([TRIALS, ITER])
    classic_il_loss = np.zeros([TRIALS, ITER])

    for t in range(TRIALS):
        print "\nIL Trial: " + str(t)
        mdp.load_policy('scen4.p')

        boost = SVC(kernel='linear')
        boost = AdaBoostClassifier(base_estimator=boost, algorithm='SAMME', n_estimators=ne, learning_rate=lr)
        sup = ScikitSupervise(grid, mdp, Classifier=boost)
        sup.sample_policy()

        value_iter_analysis = Analysis(W, H, ITER, rewards=rewards, sinks=sinks,
                desc='Value iter policy')
        value_iter_r = np.zeros(ITER)
        classic_il_r = np.zeros(ITER)
        acc = np.zeros(ITER)
        loss = np.zeros(ITER)

        sup.record = True
        #for _ in range(4):
        #    sup.rollout()

        for i in range(ITER):
            print "     Iteration: " + str(i)
            # Roll out the value-iteration (supervisor) policy to collect training data
            mdp.pi = value_iter_pi
            sup.record = True
            for _ in range(SAMP):
                if _ >= LIMIT_DATA:
                    sup.record = False
                sup.rollout()
                value_iter_r[i] += sup.get_reward() / SAMP

            sup.record = False
            print "     Training on " + str(len(sup.net.data)) + " examples"
            sup.train()

            acc[i] = sup.svm.acc()
            # Evaluate the newly trained policy without recording more data
            for _ in range(SAMP):
                sup.record = False
                sup.rollout()
                loss[i] += sup.get_loss() / float(SAMP)
                classic_il_r[i] += sup.get_reward() / SAMP
            #print acc        
        if t == 0:
            plotter.plot_state_actions(mdp.pi, rewards=rewards, sinks=sinks,
                    filename=comparisons_directory + 'svm_classic_il_state_action.png')        
        classic_il_data[t,:] = classic_il_r
        value_iter_data[t,:] = value_iter_r
        classic_il_acc[t,:] = acc
        classic_il_loss[t,:] = loss




    #DAGGER
    dagger_data = np.zeros((TRIALS, ITER))
    dagger_analysis = Analysis(H, W, ITER, rewards = grid.reward_states, sinks=grid.sink_states, desc="Dagger's policy progression")
    dagger_acc = np.zeros((TRIALS, ITER))
    dagger_loss = np.zeros((TRIALS, ITER))
    for t in range(TRIALS):
        print "DAgger Trial: " + str(t)
        mdp.load_policy('scen4.p')
        dagger = SVMDagger(grid, mdp, depth=DEPTH)
        dagger.svm.nonlinear = False
        dagger.record = True
        dagger.rollout()
        #for _ in range(5):     
        #    dagger.rollout()
        r = np.zeros(ITER)
        acc = np.zeros(ITER)
        loss = np.zeros(ITER)
        for _ in range(ITER):
            print "     Iteration: " + str(_)
            print "     Retraining with " + str(len(dagger.net.data)) + " examples"
            dagger.retrain()
            acc[_] = dagger.svm.acc()
            iteration_states = []
            dagger.record = True
            for i in range(SAMP):
                if i >= LIMIT_DATA:
                    dagger.record = False
                dagger.rollout()
                loss[_] += dagger.get_loss() / float(SAMP)
                iteration_states += dagger.get_recent_rollout_states().tolist()            
                r[_] = r[_] + dagger.get_reward() / SAMP
            #if _ == ITER - 1 and t == 0:
            if _ == 0 and t == 0:
                dagger_analysis.count_states(np.array(iteration_states))
                dagger_analysis.save_states(comparisons_directory + "svm_dagger_final.png")            
                dagger_analysis.show_states()
        if t == 0:
            dagger_analysis.reset_density()        
            dagger_analysis.count_states(dagger.get_states())
            dagger_analysis.save_states(comparisons_directory + "svm_dagger.png")
            dagger_analysis.show_states()
            plotter.plot_state_actions(mdp.pi, rewards=rewards, sinks=sinks,
                    filename=comparisons_directory + 'svm_dagger_state_action.png')
        dagger_data[t,:] = r
        dagger_acc[t,:] = acc
        dagger_loss[t,:] = loss


    # print value_iter_data
    # print classic_il_data
    # print dagger_data
    print classic_il_loss
    print dagger_loss

    np.save(data_directory + 'svm_sup_data.npy', value_iter_data)
    np.save(data_directory + 'svm_classic_il_data.npy', classic_il_data)
    np.save(data_directory + 'svm_dagger_data.npy', dagger_data)

    np.save(data_directory + 'svm_dagger_acc.npy', dagger_acc)
    np.save(data_directory + 'svm_classic_il_acc.npy', classic_il_acc)

    analysis = Analysis(H, W, ITER, rewards=rewards, sinks=sinks, desc="General comparison")
    analysis.get_perf(value_iter_data)
    analysis.get_perf(classic_il_data)
    analysis.get_perf(dagger_data)

    #analysis.plot(names = ['Value iteration', 'Adaboost IL'], filename=comparisons_directory + 'svm_reward_comparison.png', ylims=[-60, 100])
    analysis.plot(names = ['Value iteration', 'LSVM Boosted IL', 'LSVM DAgger'], filename=comparisons_directory + 'svm_reward_comparison.png', ylims=[-60, 100])
    print "Saving analysis to: " + comparisons_directory + 'svm_reward_comparison.png'

    acc_analysis = Analysis(H, W, ITER, rewards = grid.reward_states, sinks=grid.sink_states, desc="Accuracy comparison")
    acc_analysis.get_perf(classic_il_acc)
    acc_analysis.get_perf(dagger_acc)

    acc_analysis.plot(names = ['LSVM Boosted Acc.', 'LSVM DAgger Acc.'], label='Accuracy', filename=comparisons_directory + 'svm_acc_comparison.png', ylims=[0,1])
    #acc_analysis.plot(names = ['Adaboost IL Acc.'], label='Accuracy', filename=comparisons_directory + 'svm_acc_comparison.png', ylims=[0,1])

    loss_analysis = Analysis(H, W, ITER, rewards=rewards, sinks=sinks, desc="Loss plot")
    loss_analysis.get_perf(classic_il_loss)
    loss_analysis.get_perf(dagger_loss)
    loss_analysis.plot(names = ['LSVM Boosted IL loss', 'LSVM DAgger loss'], filename=comparisons_directory + 'loss_plot.png', ylims=[0, 1])
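# Example #2 wraps the whole comparison in run(ne, lr), parameterized by the
# AdaBoost ensemble size and learning rate. A hypothetical driver (the sweep
# values below are illustrative, not from the original script):
if __name__ == '__main__':
    for ne in [10, 25, 50]:      # n_estimators for AdaBoostClassifier
        for lr in [0.5, 1.0]:    # learning_rate for AdaBoostClassifier
            print "Running comparison with ne=" + str(ne) + ", lr=" + str(lr)
            run(ne, lr)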
Example #3
classic_il_acc = np.zeros([TRIALS, ITER])
for t in range(TRIALS):
    mdp.load_policy()
    sup = SVMSupervise(grid, mdp)
    sup.sample_policy()

    value_iter_analysis = Analysis(W, H, ITER, rewards=rewards, sinks=sinks,
            desc='Value iter policy')

    r = 0.0
    # Average reward of the value-iteration supervisor over ITER * SAMP rollouts
    for _ in range(ITER * SAMP):
        sup.rollout()
        r = r + sup.get_reward() / (ITER * SAMP)
    print "Value iter reward: " + str(r)
    if t == 0:
        value_iter_analysis.count_states(sup.get_states())
        value_iter_analysis.save_states("comparisons/svm_comparisons/value_iter.png")
        value_iter_analysis.show_states()
    sup.train()

    classic_il_acc[t,:] = np.zeros(ITER) + sup.svm.acc()            

    value_iter_data[t,:] = np.zeros(ITER) + r

    r = 0.0
    sup.net.clear_data()
    sup.sample_policy()
    il_analysis = Analysis(H, W, ITER, rewards=rewards, sinks=sinks, desc="IL's policy")    
    for _ in range(SAMP * ITER):
        sup.animate = False
        sup.rollout()
Example #4
sup_data = np.zeros([TRIALS,ITER])
classic_il_data = np.zeros([TRIALS, ITER])
for t in range(TRIALS):
    mdp.load_policy()
    sup = Supervise(grid, mdp)
    sup.sample_policy()

    supervisor_analysis = Analysis(H, W, ITER, rewards = grid.reward_states, sinks=grid.sink_states, desc="Supervisor's policy")
    
    r = 0.0
    for _ in range(ITER * SAMP):
        sup.rollout()
        r = r + sup.get_reward() / (ITER * SAMP)
    print "Value iter reward: " + str(r)
    if t == 0:
        supervisor_analysis.count_states(sup.get_states())
        supervisor_analysis.save_states("comparisons/comparisons/value_iter.png") 
        supervisor_analysis.show_states()


    sup.train()
    classic_train, classic_test = sup.net.return_stats()
    classic_train = np.zeros((TRIALS, ITER)) + classic_train
    classic_test = np.zeros((TRIALS, ITER)) + classic_test
    sup_data[t,:] = np.zeros(ITER) + r
    
    r = 0.0
    sup.net.clear_data()
    sup.sample_policy()
    il_analysis = Analysis(H, W, ITER, rewards = grid.reward_states, sinks = grid.sink_states, desc="IL's policy")
    print sup.get_states()
Example #5
    mdp.load_policy()
    nsupervise = Supervise(grid, mdp)
    # Collect noisy Supervise samples
    for t in range(ITER * SAMP):
        nsupervise.rollout()
    nsupervise.train()
    # Evaluate policy
    r = 0.0
    for t in range(SAMP):
        nsupervise.rollout()
        r = r + nsupervise.get_reward() / SAMP
    r_SN = np.zeros(ITER) + r
    data[k,:] = r_SN

    analysis.count_states(nsupervise.get_states())
    test_loss_n[k] = nsupervise.get_test_loss()
    train_loss_n[k] = nsupervise.get_train_loss()

analysis.show_states()
analysis.get_perf(data)




# #####NOISY SUPERVISOR LOGISTIC#####
# data = np.zeros([TRIALS,ITER])
# test_loss = np.zeros([TRIALS])
# train_loss = np.zeros([TRIALS])
# for k in range(TRIALS):
# 	mdp.load_policy()
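# The Supervise baselines in Examples #4 and #5 are plain behavior cloning:
# train once on supervisor rollouts, then evaluate the learned policy, with no
# data aggregation across iterations. A minimal sketch of that contrast, using
# the same hypothetical learner/expert/env objects as the DAgger sketch above
# (not the actual Supervise API):
import numpy as np

def behavior_cloning(learner, expert, env, train_rollouts, eval_rollouts):
    dataset = []
    for _ in range(train_rollouts):
        states, _reward = env.rollout(expert)              # roll out the supervisor only
        dataset += [(s, expert.action(s)) for s in states]
    learner.fit(dataset)                                   # single training pass
    reward = 0.0
    for _ in range(eval_rollouts):
        _states, r = env.rollout(learner)                  # evaluate the cloned policy
        reward += r / float(eval_rollouts)
    return reward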
Example #6
plotter.plot_state_actions(mdp.pi, rewards = grid.reward_states, sinks = grid.sink_states)
q.rollout()
a = mdp.pi.get_next(State(0, 0))
print "action: " + str(a)
# Compare the empirical mean stored in the Q dataset with the learned estimate
tup = q.Q.preprocess(0, 0, a)
print q.Q.dataset[tup]
print "Actual: " + str(np.mean(q.Q.dataset[tup]))
print "predicted: " + str(q.Q.get(State(0, 0), a))

for ac in mdp.pi.available_actions:
    if ac != a:
        print "Seeing for action: " + str(ac)
        tup = q.Q.preprocess(0, 0, ac)
        if tup in q.Q.dataset:
            print "Actual: " + str(np.mean(q.Q.dataset[tup]))
            #print np.mean(q.Q.dataset[tup])            
        else:
            print "No actual"
        print "predicted: " + str(q.Q.get(State(0, 0), ac))
    
#q.animate=True
#q.rollout()

an.count_states(q.get_states())
an.show_states()
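# Example #6 prints the empirical mean return and the learned estimate action
# by action. Using only the calls shown above, the greedy action at a state
# can be read off directly (assumes q.Q.get(state, action) returns a scalar):
s = State(0, 0)
greedy = max(mdp.pi.available_actions, key=lambda ac: q.Q.get(s, ac))
print "greedy action at (0, 0): " + str(greedy)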




Example #7
mdp.load_policy()

####DAgger##########
data = np.zeros([TRIALS,ITER])
for k in range(TRIALS):
	mdp.load_policy()
	dagger = SVMDagger(grid, mdp)
	dagger.rollout()            # rollout with supervisor policy
	r_D = np.zeros(ITER)
	for t in range(ITER):
		dagger.retrain()
		for i in range(SAMP):
			dagger.rollout()
			r_D[t] = r_D[t]+dagger.get_reward()/SAMP

	analysis.count_states(dagger.get_states())
	data[k,:] = r_D

analysis.show_states()
analysis.get_perf(data)
analysis.save("test.p")

####SUPERVISE########
# data = np.zeros([TRIALS,ITER])
# for k in range(TRIALS):
# 	mdp.load_policy()
# 	supervise = Supervise(grid,mdp)

# 	#Collect Supervise Samples
# 	for t in range(ITER*SAMP):
# 	   	supervise.rollout()
Example #8
        q.guide()
        r = r + q.get_reward() / (ITER)
    print "Value iter reward: " + str(r)
    value_iter_data[t,:] = np.zeros(ITER) + r

    r = 0.0

    q.clear_states()
    # Switch to the learned Q policy and evaluate its average reward
    mdp.pi = QPolicy(q)
    a = Analysis(W, H, ITER, rewards=rewards, sinks=sinks, desc='Q policy')
    for i in range(ITER * SAMP):
        q.rollout()
        r = r + q.get_reward() / (ITER * SAMP)
    print "Q learn reward: " + str(r)
    if t == 0:
        a.count_states(q.get_states())
        a.show_states()
        plotter.plot_state_actions(mdp.pi, rewards = grid.reward_states, sinks = grid.sink_states)    
    classic_q_data[t,:] = np.zeros(ITER) + r



# DAGGER

dagger_data = np.zeros((TRIALS, ITER))
dagger_analysis = Analysis(H, W, ITER, rewards = grid.reward_states, sinks=grid.sink_states, desc="Dagger's policy progression")
for t in range(TRIALS):
    print "Trial: " + str(t)
    mdp.load_policy(filename='scen1.p')
    dagger = SVMDagger(grid, mdp)
    dagger.rollout()