Exemplo n.º 1
0
def forward(policy, transition, start, time_steps, discount=None):
    """Propagate state-visitation frequencies forward through time.

    Parameters
    ----------
    policy : ndarray, shape (num_actions, num_states)
        policy[a, s] is the probability of taking action a in state s.
    transition : object
        Exposes ``tot_actions``, ``tot_states`` and ``dense_forward``,
        where ``dense_forward[j]`` is a 3xK array whose columns are
        (action, source-state, probability) triples for transitions
        leading into state j.
        NOTE(review): row layout inferred from the indexing below — confirm.
    start : iterable of int
        Initial states; the start distribution is uniform over them.
    time_steps : int
        Number of propagation steps.
    discount : float or None, optional
        If given, frequencies are discounted sums over time (via
        ``discounted_sum``) instead of plain sums.

    Returns
    -------
    state_freq : ndarray, shape (num_states,)
    state_action_freq : ndarray, shape (num_actions, num_states)
    dt_states : ndarray, shape (num_states, time_steps)
        Per-time-step state distribution.
    """
    num_actions = transition.tot_actions
    num_states = transition.tot_states
    # Fix: the original multiplied np.zeros by 0.0625 — a no-op; dropped.
    dt_states = np.zeros([num_states, time_steps])
    dt_states_actions = np.zeros([num_actions, num_states, time_steps])
    for i in start:
        dt_states[i, 0] += 1
    dt_states[:, 0] /= len(start)
    for i in range(time_steps):
        for j in range(num_states):
            tr = transition.dense_forward[j]
            if i != time_steps - 1:
                # Fix: on Python 3, map() returns an iterator which NumPy
                # cannot use for fancy indexing — materialize int arrays.
                actions = np.asarray(tr[0, :], dtype=int)
                sources = np.asarray(tr[1, :], dtype=int)
                dt_states[j, i + 1] = np.sum(
                    dt_states[sources, i] *
                    policy[actions, sources] * tr[2, :])
            dt_states_actions[:, j, i] = dt_states[j, i] * policy[:, j]
    if discount is None:  # identity test instead of `== None`
        state_action_freq = np.sum(dt_states_actions, axis=2)
        state_freq = np.sum(dt_states, axis=1)
    else:
        state_action_freq = discounted_sum(dt_states_actions, discount, ax=2)
        state_freq = discounted_sum(dt_states, discount, ax=1)

    return state_freq, state_action_freq, dt_states
Exemplo n.º 2
0
def forward_sparse(policy,
                   transition_forward,
                   start,
                   time_steps,
                   discount=None):
    """Forward pass using a dense (action*state, state) transition matrix.

    Parameters
    ----------
    policy : ndarray, shape (num_actions, num_states)
        policy[a, s] is the probability of taking action a in state s.
    transition_forward : ndarray, shape (num_actions * num_states, num_states)
        transition_forward[k, s'] is the probability of landing in s'
        from flattened state-action index k (Fortran order, grouped by
        state — see the reshape below).
    start : iterable of int
        Initial states; the start distribution is uniform over them.
    time_steps : int
        Number of propagation steps.
    discount : float or None, optional
        If given, frequencies are discounted sums over time (via
        ``discounted_sum``) instead of plain sums.

    Returns
    -------
    state_freq : ndarray, shape (num_states,)
    state_action_freq : ndarray, shape (num_actions, num_states)
    dt_states : ndarray, shape (num_states, time_steps)
        Per-time-step state distribution.
    """
    num_states = transition_forward.shape[1]
    # Fix: floor division — on Python 3 `/` yields a float, which breaks
    # the np.zeros shape below.
    num_actions = transition_forward.shape[0] // num_states
    # Mixing weight toward a uniform restart mass; currently disabled.
    alpha = 0
    dt_states = np.zeros((num_states, time_steps))
    dt_states_actions = np.zeros((num_actions * num_states, time_steps))
    for i in start:
        dt_states[i, 0] += 1
    dt_states[:, 0] /= len(start)
    for i in range(time_steps):
        # Column-major (Fortran-order) flattening groups entries by state.
        dt_states_actions[:, i] = (dt_states[:, i] * policy).reshape(
            num_actions * num_states, order="F").T
        if i != time_steps - 1:
            dt_states[:, i + 1] = np.dot(
                (1 - alpha) * transition_forward.T,
                dt_states_actions[:, i]) + alpha * np.sum(
                    dt_states_actions[:, i])
    if discount is None:  # identity test instead of `== None`
        state_action_freq = np.sum(dt_states_actions, axis=1)
        state_freq = np.sum(dt_states, axis=1)
    else:
        state_action_freq = discounted_sum(dt_states_actions, discount, ax=1)
        state_freq = discounted_sum(dt_states, discount, ax=1)
    state_action_freq = state_action_freq.reshape(num_actions,
                                                  num_states,
                                                  order="F")
    return state_freq, state_action_freq, dt_states
def forward(policy, transition, start, time_steps, discount=None):
    """Propagate state-visitation frequencies forward through time.

    ``transition`` exposes ``tot_actions``, ``tot_states`` and
    ``dense_forward``, where ``dense_forward[j]`` is a 3xK array of
    (action, source-state, probability) columns for transitions into
    state j (layout inferred from the indexing below — confirm).
    Returns (state_freq, state_action_freq, dt_states): summed (or
    discounted, when ``discount`` is given) frequencies plus the raw
    per-time-step state distribution.
    """
    num_actions = transition.tot_actions
    num_states = transition.tot_states
    # Fix: the original multiplied np.zeros by 0.0625 — a no-op; dropped.
    dt_states = np.zeros([num_states, time_steps])
    dt_states_actions = np.zeros([num_actions, num_states, time_steps])
    for i in start:
        dt_states[i, 0] += 1
    dt_states[:, 0] /= len(start)
    for i in range(time_steps):
        for j in range(num_states):
            tr = transition.dense_forward[j]
            if i != time_steps - 1:
                # Fix: on Python 3, map() returns an iterator which NumPy
                # cannot use for fancy indexing — materialize int arrays.
                acts = np.asarray(tr[0, :], dtype=int)
                srcs = np.asarray(tr[1, :], dtype=int)
                dt_states[j, i + 1] = np.sum(
                    dt_states[srcs, i] * policy[acts, srcs] * tr[2, :])
            dt_states_actions[:, j, i] = dt_states[j, i] * policy[:, j]
    if discount is None:  # identity test instead of `== None`
        state_action_freq = np.sum(dt_states_actions, axis=2)
        state_freq = np.sum(dt_states, axis=1)
    else:
        state_action_freq = discounted_sum(dt_states_actions, discount, ax=2)
        state_freq = discounted_sum(dt_states, discount, ax=1)

    return state_freq, state_action_freq, dt_states
def forward_sparse(policy, transition_forward, start, time_steps, discount=None):
    """Forward pass using a dense (action*state, state) transition matrix.

    ``transition_forward[k, s']`` is the probability of landing in s'
    from flattened state-action index k (Fortran order, grouped by state
    — see the reshape below). Returns (state_freq, state_action_freq,
    dt_states): summed (or discounted, when ``discount`` is given)
    frequencies plus the raw per-time-step state distribution.
    """
    num_states = transition_forward.shape[1]
    # Fix: floor division — on Python 3 `/` yields a float, which breaks
    # the np.zeros shape below.
    num_actions = transition_forward.shape[0] // num_states
    # Mixing weight toward a uniform restart mass; currently disabled.
    alpha = 0
    dt_states = np.zeros((num_states, time_steps))
    dt_states_actions = np.zeros((num_actions * num_states, time_steps))
    for i in start:
        dt_states[i, 0] += 1
    dt_states[:, 0] /= len(start)
    for i in range(time_steps):
        # Column-major (Fortran-order) flattening groups entries by state.
        dt_states_actions[:, i] = (dt_states[:, i] * policy).reshape(
            num_actions * num_states, order="F").T
        if i != time_steps - 1:
            dt_states[:, i + 1] = np.dot(
                (1 - alpha) * transition_forward.T,
                dt_states_actions[:, i]) + alpha * np.sum(dt_states_actions[:, i])
    if discount is None:  # identity test instead of `== None`
        state_action_freq = np.sum(dt_states_actions, axis=1)
        state_freq = np.sum(dt_states, axis=1)
    else:
        state_action_freq = discounted_sum(dt_states_actions, discount, ax=1)
        state_freq = discounted_sum(dt_states, discount, ax=1)
    state_action_freq = state_action_freq.reshape(num_actions, num_states, order="F")
    return state_freq, state_action_freq, dt_states