import numpy as np

def caus_ent_backward(transition, reward_f, conv=5, discount=0.9, z_states=None):
    num_actions = transition.tot_actions
    num_states = transition.tot_states
    # Flags whether the reward is (action, state) shaped; not used below.
    state_action = reward_f.shape[0] == num_actions
    z_actions = np.zeros([num_actions, num_states])
    if z_states is None:
        z_states = np.zeros(num_states)
    # Backward pass - - - - - - - - - - - - - - - - - - - - - - - -
    count = 0
    while True:
        prev = z_states.copy()
        for i in range(num_states):
            tr = transition.dense_backward[i]
            ch = transition.chunks_backward[i]
            # Discounted expected next-state value, one entry per action.
            out = discount * np.array(
                sum_chunks(tr[2, :] * z_states[tr[1, :].astype(int)], ch))
            z_actions[:, i] = out + reward_f[:, i]
        # Numerically stable log-sum-exp over actions.
        m = np.amax(z_actions, axis=0)
        z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
        count += 1
        delta = np.amax(np.absolute(prev - z_states))
        # This variant always runs a fixed 50 sweeps; `conv` and `delta`
        # are computed but not used as a stopping criterion.
        if count == 50:
            policy = np.exp(z_actions - z_states)
            break
    return policy, np.log(policy), z_states
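The helper sum_chunks used above is not part of this snippet. A minimal sketch, assuming chunks holds the start offset of each action's segment in the flattened transition arrays, so that each segment collapses to one partial sum:

def sum_chunks(values, chunks):
    # Sum consecutive segments of `values`; chunks[k] is assumed to be
    # the start offset of segment k, so the result has one entry per
    # action. np.add.reduceat sums values[chunks[k]:chunks[k+1]].
    return np.add.reduceat(values, chunks)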
def caus_ent_backward_nodisount(transition, reward_f, steps):
    num_actions = transition.tot_actions
    num_states = transition.tot_states
    # Flags whether the reward is (action, state) shaped; not used below.
    state_action = reward_f.shape[0] == num_actions
    # Undiscounted variant, per the function name (the original read
    # `gamma = discount`, but `discount` is undefined in this scope).
    gamma = 1.0
    z_actions = np.zeros([num_actions, num_states])
    z_states = np.zeros(num_states)
    # Backward pass - - - - - - - - - - - - - - - - - - - - - - - -
    print("Caus Ent Backward")
    for j in range(steps):
        prev = z_states.copy()
        for i in range(num_states):
            tr = transition.dense_backward[i]
            ch = transition.chunks_backward[i]
            out = gamma * np.array(
                sum_chunks(tr[2, :] * z_states[tr[1, :].astype(int)], ch))
            z_actions[:, i] = out + reward_f[:, i]
        # Numerically stable log-sum-exp over actions (scalar max here).
        m = np.amax(z_actions)
        z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
        delta = np.sum(np.absolute(prev - z_states))  # tracked for inspection
        if j == steps - 1:
            policy = np.exp(z_actions - z_states)
    return policy, np.log(policy), z_states
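For reference, all of these routines iterate the soft (maximum causal entropy) Bellman backup, with z_actions in the role of Q and z_states in the role of the log-partition V:

\begin{aligned}
Q(a, s) &= r(a, s) + \gamma \sum_{s'} P(s' \mid a, s)\, V(s'), \\
V(s) &= \log \sum_{a} \exp Q(a, s), \\
\pi(a \mid s) &= \exp\bigl(Q(a, s) - V(s)\bigr).
\end{aligned}

The max-subtraction in the code is the standard log-sum-exp stabilization and leaves V(s) unchanged.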
Example #3
def timed_backward(transition, reward_f, conv=5, discount=0.9, z_states=None):
    sweep_timer = timer()
    convergence_timer = timer()
    num_actions = transition.tot_actions
    num_states = transition.tot_states
    # Flags whether the reward is (action, state) shaped; not used below.
    state_action = reward_f.shape[0] == num_actions
    gamma = discount
    z_actions = np.zeros([num_actions, num_states])
    if z_states is None:
        z_states = np.zeros(num_states)
    # Backward pass - - - - - - - - - - - - - - - - - - - - - - - -
    print("Caus Ent Backward")
    count = 0
    convergence_timer.start()
    while True:
        prev = z_states.copy()
        sweep_timer.start()
        for i in range(num_states):
            tr = transition.dense_backward[i]
            ch = transition.chunks_backward[i]
            out = gamma * np.array(
                sum_chunks(tr[2, :] * z_states[tr[1, :].astype(int)], ch))
            z_actions[:, i] = out + reward_f[:, i]
        sweep_timer.stop()
        # Numerically stable log-sum-exp over actions.
        m = np.amax(z_actions)
        z_states = m + np.log(np.sum(np.exp(z_actions - m), axis=0))
        count += 1
        delta = np.amax(np.absolute(prev - z_states))
        print(delta)
        # Unlike caus_ent_backward, this variant does use `conv`:
        # stop once successive state values change by less than conv.
        if count > 2 and delta < conv:
            print("Count and delta", count, delta)
            policy = np.exp(z_actions - z_states)
            break
    convergence_timer.stop()
    # Average wall-clock time to convergence and per sweep.
    times = [
        sum(convergence_timer.time_taken) / len(convergence_timer.time_taken),
        sum(sweep_timer.time_taken) / len(sweep_timer.time_taken),
    ]
    return policy, np.log(policy), z_states, times
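timed_backward additionally depends on a timer class that is not shown. A minimal stopwatch matching the start()/stop()/time_taken interface used above (an assumption, not the original implementation):

import time

class timer:
    # Each start()/stop() pair appends the elapsed wall-clock seconds
    # to time_taken, which timed_backward averages at the end.
    def __init__(self):
        self.time_taken = []

    def start(self):
        self._t0 = time.time()

    def stop(self):
        self.time_taken.append(time.time() - self._t0)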
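To see the pieces fit together, here is a hypothetical end-to-end call on a two-state, two-action deterministic MDP. The Transition container and the layout of dense_backward (row 0: action id, row 1: successor state, row 2: probability) and chunks_backward (per-action segment offsets) are assumptions inferred from how the functions index them:

class Transition:
    # Toy container exposing the attributes the functions above expect.
    def __init__(self):
        self.tot_actions = 2
        self.tot_states = 2
        # From either state: action 0 moves to state 0, action 1 to state 1.
        self.dense_backward = [
            np.array([[0., 1.], [0., 1.], [1., 1.]]),
            np.array([[0., 1.], [0., 1.], [1., 1.]]),
        ]
        # Start offset of each action's segment in the arrays above.
        self.chunks_backward = [np.array([0, 1]), np.array([0, 1])]

reward_f = np.array([[0.0, 1.0],
                     [1.0, 0.0]])  # (action, state) reward table
policy, log_policy, z_states = caus_ent_backward(Transition(), reward_f)
print(policy.sum(axis=0))  # each column of policy is a distribution over actions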