def average_and_std_type(all_results, index):
    """Stack the per-run results for experiment `index` and return their
    mean and standard deviation across runs."""
    avg = EmptyObject()
    for n, run in enumerate(all_results):
        if n == 0:
            avg.a_o_e = np.array(run[index].a_o_e)
            avg.a_o_t = np.array(run[index].a_o_t)
            avg.policy_diff1 = run[index].policy_diff1
            avg.policy_diff2 = run[index].policy_diff2
            avg.e_on_e = [run[index].e_on_e]
            avg.e_o_t = [run[index].e_o_t]
        else:
            avg.a_o_e = np.vstack((avg.a_o_e, np.array(run[index].a_o_e)))
            avg.a_o_t = np.vstack((avg.a_o_t, np.array(run[index].a_o_t)))
            avg.policy_diff1 = np.vstack(
                (avg.policy_diff1, np.array(run[index].policy_diff1)))
            avg.policy_diff2 = np.vstack(
                (avg.policy_diff2, np.array(run[index].policy_diff2)))
            avg.e_on_e.append(run[index].e_on_e)
            avg.e_o_t.append(run[index].e_o_t)
    # means and standard deviations across runs (rows)
    avg.mean_a_o_e = np.mean(avg.a_o_e, axis=0)
    avg.std_a_o_e = np.std(avg.a_o_e, axis=0)
    avg.mean_a_o_t = np.mean(avg.a_o_t, axis=0)
    avg.std_a_o_t = np.std(avg.a_o_t, axis=0)
    avg.mean_policy_diff1 = np.mean(avg.policy_diff1, axis=0)
    avg.std_policy_diff1 = np.std(avg.policy_diff1, axis=0)
    avg.mean_policy_diff2 = np.mean(avg.policy_diff2, axis=0)
    avg.std_policy_diff2 = np.std(avg.policy_diff2, axis=0)
    avg.e_on_e = np.mean(avg.e_on_e)
    avg.e_o_t = np.mean(avg.e_o_t)
    return avg
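# --- Usage sketch (illustrative only, not part of the original experiments) ---
# A minimal, hypothetical example of how average_and_std_type() aggregates
# several runs.  It assumes numpy is imported as np and that EmptyObject is
# the plain attribute container used above; the numbers are made up.
def _demo_average_and_std_type():
    all_results = []
    for seed in range(3):
        run_result = EmptyObject()
        run_result.a_o_e = [1.0 + seed, 2.0 + seed]    # per-iteration apprentice-on-expert values
        run_result.a_o_t = [0.5 * seed, 0.25 * seed]   # per-iteration apprentice-on-taboo values
        run_result.policy_diff1 = [10.0 - seed, 5.0 - seed]
        run_result.policy_diff2 = [20.0 - seed, 15.0 - seed]
        run_result.e_on_e = 3.0 + seed
        run_result.e_o_t = -1.0 - seed
        all_results.append([run_result])               # one experiment type per run -> index 0
    avg = average_and_std_type(all_results, 0)
    print "mean a_o_e:", avg.mean_a_o_e, "std:", avg.std_a_o_e
    print "mean policy diff 1:", avg.mean_policy_diff1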
def learn_from_failure(expert1, expert2, apprentice, iterations, steps,
                       initial_states, test_states, failure="false",
                       initial_bad_states=None):
    """Feature-expectation matching against expert1, with a penalty vector
    zeta built from expert2's (taboo) demonstrations.  `failure` selects the
    update rule: "L2", "L1", "false" (ignore failure data) or "slow"."""
    # initialise the Lagrange multipliers to 1
    print "INITIALISED LEARNING. LEARNING FROM FAILURE = ", failure
    direc = "results/"
    fn.make_dir(direc)
    C = 5.0
    D = .7
    delta_c = .96
    disc = expert1.disc
    a, s, f = expert1.feature_f.shape
    # experts
    exp1_policy, ignore, exp1_state_exp, exp1_all = inference(
        expert1, steps, initial_states, discount=0.9)
    if initial_bad_states is None:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(
            expert2, steps, initial_states, discount=0.9)
    else:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(
            expert2, steps, initial_bad_states, discount=0.9)
    # print "POLICYY", exp1_policy.shape
    exp1_feature_avg = np.dot(exp1_state_exp.reshape(s * a, order="F"),
                              expert1.feature_f.reshape(s * a, f, order="F"))
    exp2_feature_avg = np.dot(exp2_state_exp.reshape(s * a, order="F"),
                              expert2.feature_f.reshape(s * a, f, order="F"))
    e_on_e = eval_value(expert1.w, exp1_policy, expert1, test_states, steps)
    t_o_t = eval_value(expert2.w, exp2_policy, expert2, test_states, steps)
    expert_on_taboo = eval_value(expert2.w, exp1_policy, expert2, test_states,
                                 steps)
    z_stat = None
    # initialise the results structure
    results = EmptyObject()
    results.a_o_e = []
    results.a_o_t = []
    results.policy_diff1 = []
    results.policy_diff2 = []
    results.e_on_e = e_on_e
    results.t_o_t = t_o_t
    results.e_o_t = expert_on_taboo
    # learning rates
    rate = 0.08
    rate2 = 0.08
    # delay before failure data is included; large values avoid oscillations
    delay = 0
    for i in range(iterations):
        apprentice_policy, z_stat, a_state_exp, a_all = inference(
            apprentice, steps, initial_states, z_states=None, discount=0.9)
        apprentice_feature_avg = np.dot(
            a_state_exp.reshape(s * a, order="F"),
            apprentice.feature_f.reshape(s * a, f, order="F"))
        difference_exp1 = exp1_feature_avg - apprentice_feature_avg
        if initial_bad_states is None:
            difference_exp2 = exp2_feature_avg - apprentice_feature_avg
        else:
            apprentice_policy, z_stat, a_state_exp_bad, a_all = inference(
                apprentice, steps, initial_bad_states, z_states=None,
                discount=0.9)
            apprentice_feature_avg_bad = np.dot(
                a_state_exp_bad.reshape(s * a, order="F"),
                apprentice.feature_f.reshape(s * a, f, order="F"))
            difference_exp2 = apprentice_feature_avg_bad - exp2_feature_avg
        if i == 0:
            difference_random = np.copy(difference_exp2)
            apprentice_feature_avg_bad_prev = apprentice_feature_avg * 0
        if failure == "L2":
            # first update the alphas according to their gradient
            apprentice.w = fn.pin_to_threshold(
                apprentice.w + rate * difference_exp1, C, -C)
            if i > delay:
                apprentice.zeta = -difference_exp2
            # print "ZETAAA", apprentice.zeta
            # print "-------------------------------------------"
        elif failure == "L1":
            apprentice.w = apprentice.w + rate * difference_exp1
            # apprentice.zeta = apprentice.zeta + rate2*(difference_exp2 + D*apprentice.zeta)
            apprentice.zeta = -0.9 * difference_exp2
        elif failure == "false":
            apprentice.w = apprentice.w + rate * difference_exp1
        elif failure == "slow":
            apprentice.w = apprentice.w + rate * difference_exp1
            C = C * delta_c
            if 1. / C > D:
                C = 1 / D
            if i > delay:
                apprentice.zeta = -difference_exp2 / C
            apprentice_feature_avg_bad_prev = apprentice_feature_avg_bad
        apprentice.reward_f = apprentice.buildRewardFunction()
        # evaluation
        a_on_e = eval_value(expert1.w, apprentice_policy, apprentice,
                            test_states, steps)
        a_o_t = eval_value(expert2.w, apprentice_policy, apprentice,
                           test_states, steps)
        # if i == iterations - 1:
        if i < iterations:
            print "failure", failure
            print "Iteration", i
            print "Apprentice on Expert", a_on_e
            print "Expert on expert", e_on_e
            print "Apprentice on Taboo", a_o_t
            print "Taboo on Taboo", t_o_t
            print "Expert on Taboo", expert_on_taboo
            print "______________________________________"
        results.a_o_e.append(a_on_e)
        results.a_o_t.append(a_o_t)
        results.policy_diff1.append(
            np.sum(np.sum(np.absolute(apprentice_policy - exp1_policy))) /
            (2 * disc.tot_states) * 100)
        results.policy_diff2.append(
            np.sum(np.sum(np.absolute(apprentice_policy - exp2_policy))) /
            (2 * disc.tot_states) * 100)
        if i == iterations - 1:
            print "Policy Difference", results.policy_diff1[-1]
            print "Policy Difference", results.policy_diff2[-1]
    return results
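# The "L2" mode above relies on fn.pin_to_threshold(values, upper, lower)
# from the project's fn module.  The stand-in below is only an assumption
# about its behaviour (element-wise clipping of the weights into
# [lower, upper]), included to make the update rule easier to read; it is
# not the original implementation.
def _pin_to_threshold_sketch(values, upper, lower):
    return np.clip(values, lower, upper)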
def learn_from_failure(expert1, expert2, apprentice, iterations, steps,
                       initial_states, test_states, failure="false",
                       initial_bad_states=None):
    """Variant of learn_from_failure supporting the update modes "L1",
    "false", "slow", "cvx", "sign" and "only" (no "L2" mode)."""
    # initialise the Lagrange multipliers to 1
    print "INITIALISED LEARNING. LEARNING FROM FAILURE = ", failure
    direc = "results/"
    fn.make_dir(direc)
    # learning rates
    rate = 0.08
    rate2 = 0.08
    C = 5.0
    D = .7
    delta_c = .96
    delay = 0
    disc = expert1.disc
    a, s, f = expert1.feature_f.shape
    # experts
    exp1_policy, ignore, exp1_state_exp, exp1_all = inference(
        expert1, steps, initial_states, discount=0.9)
    if initial_bad_states is None:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(
            expert2, steps, initial_states, discount=0.9)
    else:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(
            expert2, steps, initial_bad_states, discount=0.9)
    # print "POLICYY", exp1_policy.shape
    exp1_feature_avg = np.dot(exp1_state_exp.reshape(s * a, order="F"),
                              expert1.feature_f.reshape(s * a, f, order="F"))
    exp2_feature_avg = np.dot(exp2_state_exp.reshape(s * a, order="F"),
                              expert2.feature_f.reshape(s * a, f, order="F"))
    e_on_e = eval_value(expert1.w, exp1_policy, expert1, test_states, steps)
    t_o_t = eval_value(expert2.w, exp2_policy, expert2, test_states, steps)
    expert_on_taboo = eval_value(expert2.w, exp1_policy, expert2, test_states,
                                 steps)
    z_stat = None
    # initialise the results structure
    results = EmptyObject()
    results.a_o_e = []
    results.a_o_t = []
    results.policy_diff1 = []
    results.policy_diff2 = []
    results.e_on_e = e_on_e
    results.t_o_t = t_o_t
    results.e_o_t = expert_on_taboo
    for i in range(iterations):
        apprentice_policy, z_stat, a_state_exp, a_all = inference(
            apprentice, steps, initial_states, z_states=None, discount=0.9)
        apprentice_feature_avg = np.dot(
            a_state_exp.reshape(s * a, order="F"),
            apprentice.feature_f.reshape(s * a, f, order="F"))
        difference_exp1 = exp1_feature_avg - apprentice_feature_avg
        if initial_bad_states is None:
            difference_exp2 = exp2_feature_avg - apprentice_feature_avg
        else:
            apprentice_policy, z_stat, a_state_exp_bad, a_all = inference(
                apprentice, steps, initial_bad_states, z_states=None,
                discount=0.9)
            apprentice_feature_avg_bad = np.dot(
                a_state_exp_bad.reshape(s * a, order="F"),
                apprentice.feature_f.reshape(s * a, f, order="F"))
            difference_exp2 = apprentice_feature_avg_bad - exp2_feature_avg
        # updates: the branches below chain off the i == 0 initialisation,
        # so no weight update is applied on the first iteration
        if i == 0:
            difference_random = np.copy(difference_exp2)
            apprentice_feature_avg_bad_prev = apprentice_feature_avg * 0
        elif failure == "L1":
            apprentice.w = apprentice.w + rate * difference_exp1
            # apprentice.zeta = apprentice.zeta + rate2*(difference_exp2 + D*apprentice.zeta)
            apprentice.zeta = 0.9 * difference_exp2
        elif failure == "false":
            apprentice.w = apprentice.w + rate * difference_exp1
        elif failure == "slow":
            apprentice.w = apprentice.w + rate * difference_exp1
            C = C * delta_c
            if 1. / C > D:
                C = 1 / D
            if i > delay:
                apprentice.zeta = -difference_exp2 / C
            # print "ZETAAA", apprentice.zeta
            # print "-------------------------------------------"
        elif failure == "cvx":
            delay = 0
            apprentice.w = apprentice.w + rate * difference_exp1
            # sings = difference_random*difference_exp2
            # print sings
            # idx = np.where(sings < 0)
            # difference_exp2[idx] = 0
            rho = 0.01
            # if rho > 0.8:
            #     rho = 0.8
            # apprentice.zeta = apprentice.zeta + rate2*(difference_exp2 + D*apprentice.zeta)
            if i > delay:
                apprentice.zeta = 0.9 * (
                    apprentice_feature_avg_bad_prev -
                    rho * apprentice_feature_avg_bad +
                    (rho - 1) * exp2_feature_avg)
            apprentice_feature_avg_bad_prev = apprentice_feature_avg_bad
            # apprentice.zeta = difference_random - 0.2*difference_exp2
        elif failure == "sign":
            apprentice.w = apprentice.w + rate * difference_exp1
            rho = 0.01
            apprentice.zeta = np.sign(difference_random)
        elif failure == "only":
            apprentice.zeta = apprentice.zeta - rate2 * (difference_exp2 +
                                                         D * apprentice.zeta)
            apprentice.zeta = -2 * difference_exp2
            # print "ZETAAA", apprentice.zeta
            # print "-------------------------------------------"
        apprentice.reward_f = apprentice.buildRewardFunction()
        # evaluation
        a_on_e = eval_value(expert1.w, apprentice_policy, apprentice,
                            test_states, steps)
        a_o_t = eval_value(expert2.w, apprentice_policy, apprentice,
                           test_states, steps)
        # if i == iterations - 1:
        if i < iterations:
            print "failure", failure
            print "Iteration", i
            print "Apprentice on Expert", a_on_e
            print "Expert on expert", e_on_e
            print "Apprentice on Taboo", a_o_t
            print "Taboo on Taboo", t_o_t
            print "Expert on Taboo", expert_on_taboo
            print "______________________________________"
        results.a_o_e.append(a_on_e)
        results.a_o_t.append(a_o_t)
        results.policy_diff1.append(
            np.sum(np.sum(np.absolute(apprentice_policy - exp1_policy))) /
            (2 * disc.tot_states) * 100)
        results.policy_diff2.append(
            np.sum(np.sum(np.absolute(apprentice_policy - exp2_policy))) /
            (2 * disc.tot_states) * 100)
        if i == iterations - 1:
            print "Policy Difference", results.policy_diff1[-1]
            print "Policy Difference", results.policy_diff2[-1]
    return results
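# All of the failure modes above share the same apprenticeship-learning core:
# a gradient step that moves the reward weights towards the good expert's
# feature expectations, optionally combined with a penalty vector zeta built
# from the failed (taboo) feature expectations.  A stripped-down sketch of
# that shared step (illustrative only, using the same 0.08 rate as above):
def _feature_matching_step(w, expert_feature_avg, apprentice_feature_avg, rate=0.08):
    # move w so that the apprentice's expected features approach the expert's
    return w + rate * (expert_feature_avg - apprentice_feature_avg)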
def learn_from_failure(expert1, expert2, apprentice, iterations, steps,
                       initial_states, test_states, failure="false",
                       initial_bad_states=None):
    """Variant of learn_from_failure with the update modes "L2", "L1",
    "false" and "slow"; uses a discount of 0.95 for the apprentice
    inference."""
    # initialise the Lagrange multipliers to 1
    print "INITIALISED LEARNING. LEARNING FROM FAILURE = ", failure
    direc = "results/"
    fn.make_dir(direc)
    C = 5.0
    D = .7
    delta_c = .96
    disc = expert1.disc
    a, s, f = expert1.feature_f.shape
    # experts
    exp1_policy, ignore, exp1_state_exp, exp1_all = inference(
        expert1, steps, initial_states, discount=0.90)
    if initial_bad_states is None:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(
            expert2, steps, initial_states, discount=0.90)
    else:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(
            expert2, steps, initial_bad_states, discount=0.90)
    # print "POLICYY", exp1_policy.shape
    exp1_feature_avg = np.dot(exp1_state_exp.reshape(s * a, order="F"),
                              expert1.feature_f.reshape(s * a, f, order="F"))
    exp2_feature_avg = np.dot(exp2_state_exp.reshape(s * a, order="F"),
                              expert2.feature_f.reshape(s * a, f, order="F"))
    e_on_e = eval_value(expert1.w, exp1_policy, expert1, test_states, steps)
    t_o_t = eval_value(expert2.w, exp2_policy, expert2, test_states, steps)
    expert_on_taboo = eval_value(expert2.w, exp1_policy, expert2, test_states,
                                 steps)
    z_stat = None
    # initialise the results structure
    results = EmptyObject()
    results.a_o_e = []
    results.a_o_t = []
    results.policy_diff1 = []
    results.policy_diff2 = []
    results.e_on_e = e_on_e
    results.t_o_t = t_o_t
    results.e_o_t = expert_on_taboo
    # learning rates
    rate = 0.08
    rate2 = 0.08
    # delay before failure data is included; large values avoid oscillations
    delay = 0
    for i in range(iterations):
        apprentice_policy, z_stat, a_state_exp, a_all = inference(
            apprentice, steps, initial_states, z_states=None, discount=0.95)
        apprentice_feature_avg = np.dot(
            a_state_exp.reshape(s * a, order="F"),
            apprentice.feature_f.reshape(s * a, f, order="F"))
        difference_exp1 = exp1_feature_avg - apprentice_feature_avg
        if initial_bad_states is None:
            difference_exp2 = exp2_feature_avg - apprentice_feature_avg
        else:
            apprentice_policy, z_stat, a_state_exp_bad, a_all = inference(
                apprentice, steps, initial_bad_states, z_states=None,
                discount=0.95)
            apprentice_feature_avg_bad = np.dot(
                a_state_exp_bad.reshape(s * a, order="F"),
                apprentice.feature_f.reshape(s * a, f, order="F"))
            difference_exp2 = apprentice_feature_avg_bad - exp2_feature_avg
        if i == 0:
            difference_random = np.copy(difference_exp2)
            apprentice_feature_avg_bad_prev = apprentice_feature_avg * 0
        if failure == "L2":
            # first update the alphas according to their gradient
            apprentice.w = fn.pin_to_threshold(
                apprentice.w + rate * difference_exp1, C, -C)
            if i > delay:
                apprentice.zeta = -difference_exp2
            # print "ZETAAA", apprentice.zeta
            # print "-------------------------------------------"
        elif failure == "L1":
            apprentice.w = apprentice.w + rate * difference_exp1
            # apprentice.zeta = apprentice.zeta + rate2*(difference_exp2 + D*apprentice.zeta)
            apprentice.zeta = 0.9 * difference_exp2
        elif failure == "false":
            apprentice.w = apprentice.w + rate * difference_exp1
        elif failure == "slow":
            apprentice.w = apprentice.w + rate * difference_exp1
            C = C * delta_c
            if 1. / C > D:
                C = 1 / D
            if i > delay:
                apprentice.zeta = -difference_exp2 / C
            apprentice_feature_avg_bad_prev = apprentice_feature_avg_bad
        apprentice.reward_f = apprentice.buildRewardFunction()
        # evaluation
        a_on_e = eval_value(expert1.w, apprentice_policy, apprentice,
                            test_states, steps)
        a_o_t = eval_value(expert2.w, apprentice_policy, apprentice,
                           test_states, steps)
        # if i == iterations - 1:
        if i < iterations:
            print "failure", failure
            print "Iteration", i
            print "Apprentice on Expert", a_on_e
            print "Expert on expert", e_on_e
            print "Apprentice on Taboo", a_o_t
            print "Taboo on Taboo", t_o_t
            print "Expert on Taboo", expert_on_taboo
            print "______________________________________"
        results.a_o_e.append(a_on_e)
        results.a_o_t.append(a_o_t)
        results.policy_diff1.append(
            np.sum(np.sum(np.absolute(apprentice_policy - exp1_policy))) /
            (2 * disc.tot_states) * 100)
        results.policy_diff2.append(
            np.sum(np.sum(np.absolute(apprentice_policy - exp2_policy))) /
            (2 * disc.tot_states) * 100)
        if i == iterations - 1:
            print "Policy Difference", results.policy_diff1[-1]
            print "Policy Difference", results.policy_diff2[-1]
    return results
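# --- Putting the pieces together (illustrative only) ---
# A hypothetical driver showing how learn_from_failure() and
# average_and_std_type() combine: run several random restarts, store each
# run's results, and summarise them across runs.  build_experiment is a
# placeholder factory (not a function in this module) assumed to return the
# experts, apprentice and state sets.
def _demo_learning_curve_summary(build_experiment, iterations=50, steps=20, n_runs=5):
    all_results = []
    for seed in range(n_runs):
        expert1, expert2, apprentice, initial_states, test_states = build_experiment(seed)
        res = learn_from_failure(expert1, expert2, apprentice, iterations,
                                 steps, initial_states, test_states,
                                 failure="slow")
        all_results.append([res])      # a single experiment type per run
    summary = average_and_std_type(all_results, 0)
    print "mean apprentice-on-expert curve:", summary.mean_a_o_e
    print "mean apprentice-on-taboo curve:", summary.mean_a_o_t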