def caus_ent_backward(transition, reward_f, conv=5, discount=0.9, z_states=None):
    """Fixed-sweep (50 iterations) soft/causal-entropy backward pass.

    Computes soft state/action log-partition values by repeated backward
    sweeps over the transition structure, then returns the softmax policy.

    Args:
        transition: object exposing tot_actions, tot_states, and per-state
            dense_backward / chunks_backward arrays (project type).
        reward_f: (num_actions, num_states) reward array.
        conv: unused in this variant — the loop always runs 50 sweeps.
            Kept for signature compatibility with `timed_backward`.
        discount: per-step discount factor applied to successor values.
        z_states: optional warm-start state value vector; zeros if None.

    Returns:
        (policy, log_policy, z_states) — policy is (num_actions, num_states)
        with columns summing to 1.
    """
    num_actions = transition.tot_actions
    num_states = transition.tot_states
    # NOTE(review): state_action is computed but never used downstream.
    if reward_f.shape[0] == num_actions:
        state_action = True
    else:
        state_action = False
    z_actions = np.zeros([num_actions, num_states])
    # BUG FIX: `z_states == None` performs an element-wise comparison when a
    # warm-start numpy array is passed, which is ambiguous in boolean context.
    # Identity test is the correct (and intended) check.
    if z_states is None:
        z_states = np.zeros(num_states)
    count = 0
    while True:
        prev = z_states.copy()
        for i in range(num_states):
            tr = transition.dense_backward[i]
            ch = transition.chunks_backward[i]
            # tr[1,:] holds successor state indices, tr[2,:] their
            # probabilities; sum_chunks aggregates per action.
            idx = tr[1, :].astype(int)
            out = discount * np.array(sum_chunks(tr[2, :] * z_states[idx], ch))
            z_actions[:, i] = out + reward_f[:, i]
        # Log-sum-exp over actions with a per-state max for numerical stability.
        m = np.amax(z_actions, axis=0)
        z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
        count += 1
        # delta tracked for inspection only; termination is sweep-count based.
        delta = np.amax(np.absolute(prev - z_states))
        if count == 50:
            m = np.amax(z_actions, axis=0)
            z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
            policy = np.exp(z_actions - z_states)
            break
    return policy, np.log(policy), z_states
def caus_ent_backward_nodisount(transition, reward_f, steps):
    """Undiscounted, finite-horizon causal-entropy backward pass.

    Runs exactly `steps` backward sweeps (no discounting) and returns the
    softmax policy from the final action values.

    Args:
        transition: object exposing tot_actions, tot_states, and per-state
            dense_backward / chunks_backward arrays (project type).
        reward_f: (num_actions, num_states) reward array.
        steps: number of backward sweeps; must be >= 1 (otherwise the
            policy is never computed and a NameError is raised at return).

    Returns:
        (policy, log_policy, z_states).
    """
    num_actions = transition.tot_actions
    num_states = transition.tot_states
    # NOTE(review): state_action is computed but never used downstream.
    if reward_f.shape[0] == num_actions:
        state_action = True
    else:
        state_action = False
    # BUG FIX: the original read `gamma = discount`, but `discount` is not a
    # parameter of this function and was unbound (NameError at runtime).
    # This is the "no discount" variant, so gamma is identically 1.
    gamma = 1.0
    z_actions = np.zeros([num_actions, num_states])
    z_states = np.zeros(num_states)
    print("Caus Ent Backward")
    for j in range(steps):
        prev = z_states.copy()
        for i in range(num_states):
            tr = transition.dense_backward[i]
            ch = transition.chunks_backward[i]
            # tr[1,:] holds successor state indices, tr[2,:] their
            # probabilities; sum_chunks aggregates per action.
            idx = tr[1, :].astype(int)
            out = gamma * np.array(sum_chunks(tr[2, :] * z_states[idx], ch))
            z_actions[:, i] = out + reward_f[:, i]
        # Log-sum-exp over actions; a single global max keeps exp() stable
        # (this variant intentionally uses a scalar m, unlike the per-state
        # max used in caus_ent_backward).
        m = np.amax(z_actions)
        z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
        # delta tracked for inspection only; horizon is fixed by `steps`.
        delta = np.sum(np.absolute(prev - z_states))
        if j == steps - 1:
            # z_states was just recomputed from this sweep's z_actions, so
            # only the policy needs to be formed here.
            policy = np.exp(z_actions - z_states)
    return policy, np.log(policy), z_states
def timed_backward(transition, reward_f, conv=5, discount=0.9, z_states=None):
    """Convergence-based causal-entropy backward pass with timing.

    Like `caus_ent_backward`, but iterates until the max state-value change
    drops below `conv` (after at least 3 sweeps) and records wall-clock
    timings of each sweep and of the whole run.

    Args:
        transition: object exposing tot_actions, tot_states, and per-state
            dense_backward / chunks_backward arrays (project type).
        reward_f: (num_actions, num_states) reward array.
        conv: convergence threshold on max |delta z_states|.
        discount: per-step discount factor applied to successor values.
        z_states: optional warm-start state value vector; zeros if None.

    Returns:
        (policy, log_policy, z_states, times) where times is
        [mean total-convergence time, mean per-sweep time].
    """
    sweep_timer = timer()
    convergence_timer = timer()
    num_actions = transition.tot_actions
    num_states = transition.tot_states
    # NOTE(review): state_action is computed but never used downstream.
    if reward_f.shape[0] == num_actions:
        state_action = True
    else:
        state_action = False
    gamma = discount
    z_actions = np.zeros([num_actions, num_states])
    # BUG FIX: `z_states == None` performs an element-wise comparison when a
    # warm-start numpy array is passed, which is ambiguous in boolean context.
    if z_states is None:
        z_states = np.zeros(num_states)
    print("Caus Ent Backward")
    count = 0
    convergence_timer.start()
    while True:
        prev = z_states.copy()
        sweep_timer.start()
        for i in range(num_states):
            tr = transition.dense_backward[i]
            ch = transition.chunks_backward[i]
            # tr[1,:] holds successor state indices, tr[2,:] their
            # probabilities; sum_chunks aggregates per action.
            idx = tr[1, :].astype(int)
            out = gamma * np.array(sum_chunks(tr[2, :] * z_states[idx], ch))
            z_actions[:, i] = out + reward_f[:, i]
        sweep_timer.stop()
        # Log-sum-exp over actions with a global max for numerical stability.
        m = np.amax(z_actions)
        z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
        count += 1
        delta = np.amax(np.absolute(prev - z_states))
        print(delta)
        # Require a minimum of 3 sweeps before testing convergence.
        if count > 2 and delta < conv:
            print("Count and delta", count, delta)
            m = np.amax(z_actions)
            z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
            policy = np.exp(z_actions - z_states)
            break
    convergence_timer.stop()
    times = [
        sum(convergence_timer.time_taken) / len(convergence_timer.time_taken),
        sum(sweep_timer.time_taken) / len(sweep_timer.time_taken),
    ]
    return policy, np.log(policy), z_states, times
def timed_backward(transition, reward_f, conv=5, discount=0.9, z_states=None):
    """Convergence-based causal-entropy backward pass with timing.

    NOTE(review): this is a duplicate definition of `timed_backward`; being
    defined later in the file, it shadows the earlier copy. Consider
    deleting one of the two.

    Iterates backward sweeps until the max state-value change drops below
    `conv` (after at least 3 sweeps), timing each sweep and the whole run.

    Args:
        transition: object exposing tot_actions, tot_states, and per-state
            dense_backward / chunks_backward arrays (project type).
        reward_f: (num_actions, num_states) reward array.
        conv: convergence threshold on max |delta z_states|.
        discount: per-step discount factor applied to successor values.
        z_states: optional warm-start state value vector; zeros if None.

    Returns:
        (policy, log_policy, z_states, times) where times is
        [mean total-convergence time, mean per-sweep time].
    """
    sweep_timer = timer()
    convergence_timer = timer()
    num_actions = transition.tot_actions
    num_states = transition.tot_states
    # NOTE(review): state_action is computed but never used downstream.
    if reward_f.shape[0] == num_actions:
        state_action = True
    else:
        state_action = False
    gamma = discount
    z_actions = np.zeros([num_actions, num_states])
    # BUG FIX: `z_states == None` performs an element-wise comparison when a
    # warm-start numpy array is passed, which is ambiguous in boolean context.
    if z_states is None:
        z_states = np.zeros(num_states)
    print("Caus Ent Backward")
    count = 0
    convergence_timer.start()
    while True:
        prev = z_states.copy()
        sweep_timer.start()
        for i in range(num_states):
            tr = transition.dense_backward[i]
            ch = transition.chunks_backward[i]
            # tr[1,:] holds successor state indices, tr[2,:] their
            # probabilities; sum_chunks aggregates per action.
            idx = tr[1, :].astype(int)
            out = gamma * np.array(sum_chunks(tr[2, :] * z_states[idx], ch))
            z_actions[:, i] = out + reward_f[:, i]
        sweep_timer.stop()
        # Log-sum-exp over actions with a global max for numerical stability.
        m = np.amax(z_actions)
        z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
        count += 1
        delta = np.amax(np.absolute(prev - z_states))
        print(delta)
        # Require a minimum of 3 sweeps before testing convergence.
        if count > 2 and delta < conv:
            print("Count and delta", count, delta)
            m = np.amax(z_actions)
            z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
            policy = np.exp(z_actions - z_states)
            break
    convergence_timer.stop()
    times = [
        sum(convergence_timer.time_taken) / len(convergence_timer.time_taken),
        sum(sweep_timer.time_taken) / len(sweep_timer.time_taken),
    ]
    return policy, np.log(policy), z_states, times
def caus_ent_backward(transition, reward_f, conv=5, discount=0.9, z_states=None):
    """Fixed-sweep (50 iterations) soft/causal-entropy backward pass.

    NOTE(review): this is a duplicate definition of `caus_ent_backward`;
    being defined later in the file, it shadows the earlier copy. Consider
    deleting one of the two.

    Args:
        transition: object exposing tot_actions, tot_states, and per-state
            dense_backward / chunks_backward arrays (project type).
        reward_f: (num_actions, num_states) reward array.
        conv: unused in this variant — the loop always runs 50 sweeps.
            Kept for signature compatibility with `timed_backward`.
        discount: per-step discount factor applied to successor values.
        z_states: optional warm-start state value vector; zeros if None.

    Returns:
        (policy, log_policy, z_states) — policy is (num_actions, num_states)
        with columns summing to 1.
    """
    num_actions = transition.tot_actions
    num_states = transition.tot_states
    # NOTE(review): state_action is computed but never used downstream.
    if reward_f.shape[0] == num_actions:
        state_action = True
    else:
        state_action = False
    z_actions = np.zeros([num_actions, num_states])
    # BUG FIX: `z_states == None` performs an element-wise comparison when a
    # warm-start numpy array is passed, which is ambiguous in boolean context.
    if z_states is None:
        z_states = np.zeros(num_states)
    count = 0
    while True:
        prev = z_states.copy()
        for i in range(num_states):
            tr = transition.dense_backward[i]
            ch = transition.chunks_backward[i]
            # tr[1,:] holds successor state indices, tr[2,:] their
            # probabilities; sum_chunks aggregates per action.
            idx = tr[1, :].astype(int)
            out = discount * np.array(sum_chunks(tr[2, :] * z_states[idx], ch))
            z_actions[:, i] = out + reward_f[:, i]
        # Log-sum-exp over actions with a per-state max for numerical stability.
        m = np.amax(z_actions, axis=0)
        z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
        count += 1
        # delta tracked for inspection only; termination is sweep-count based.
        delta = np.amax(np.absolute(prev - z_states))
        if count == 50:
            m = np.amax(z_actions, axis=0)
            z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
            policy = np.exp(z_actions - z_states)
            break
    return policy, np.log(policy), z_states
def caus_ent_backward_nodisount(transition, reward_f, steps):
    """Undiscounted, finite-horizon causal-entropy backward pass.

    NOTE(review): this is a duplicate definition of
    `caus_ent_backward_nodisount`; being defined later in the file, it
    shadows the earlier copy. Consider deleting one of the two.

    Args:
        transition: object exposing tot_actions, tot_states, and per-state
            dense_backward / chunks_backward arrays (project type).
        reward_f: (num_actions, num_states) reward array.
        steps: number of backward sweeps; must be >= 1 (otherwise the
            policy is never computed and a NameError is raised at return).

    Returns:
        (policy, log_policy, z_states).
    """
    num_actions = transition.tot_actions
    num_states = transition.tot_states
    # NOTE(review): state_action is computed but never used downstream.
    if reward_f.shape[0] == num_actions:
        state_action = True
    else:
        state_action = False
    # BUG FIX: the original read `gamma = discount`, but `discount` is not a
    # parameter of this function and was unbound (NameError at runtime).
    # This is the "no discount" variant, so gamma is identically 1.
    gamma = 1.0
    z_actions = np.zeros([num_actions, num_states])
    z_states = np.zeros(num_states)
    print("Caus Ent Backward")
    for j in range(steps):
        prev = z_states.copy()
        for i in range(num_states):
            tr = transition.dense_backward[i]
            ch = transition.chunks_backward[i]
            # tr[1,:] holds successor state indices, tr[2,:] their
            # probabilities; sum_chunks aggregates per action.
            idx = tr[1, :].astype(int)
            out = gamma * np.array(sum_chunks(tr[2, :] * z_states[idx], ch))
            z_actions[:, i] = out + reward_f[:, i]
        # Log-sum-exp over actions; a single global max keeps exp() stable
        # (this variant intentionally uses a scalar m, unlike the per-state
        # max used in caus_ent_backward).
        m = np.amax(z_actions)
        z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
        # delta tracked for inspection only; horizon is fixed by `steps`.
        delta = np.sum(np.absolute(prev - z_states))
        if j == steps - 1:
            # z_states was just recomputed from this sweep's z_actions, so
            # only the policy needs to be formed here.
            policy = np.exp(z_actions - z_states)
    return policy, np.log(policy), z_states