transitions.add((7, 'W', 7, 1))

mdp = MDP(states, acts, transitions)
# Define per-state rewards Rs[s]
Rs = dict((s, 0) for s in states)
Rs[1] = 0
Rs[2] = -10
Rs[3] = 1
Rs[4] = 0
Rs[5] = 1
Rs[6] = 1
Rs[7] = 2

# Reward depends only on the source state: R[(s, a)] = Rs[s] for every transition (s, a, s', p).
R = dict(((s, a), Rs[s]) for (s, a, t, p) in transitions)

V, policy = mdp.T_step_value_iteration(R)
print(V, policy)
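
# For reference, a minimal sketch of the finite-horizon Bellman backup that
# T_step_value_iteration is assumed to perform here (the real MDP class may key
# rewards by (s, a) or (s, a, s') and may break ties differently). The helper
# name t_step_vi_sketch is illustrative only, not part of the MDP class.
def t_step_vi_sketch(states, transitions, R, T=10):
    # transitions: iterable of (s, a, t, p); R: dict keyed by (s, a, t).
    V = dict((s, 0.0) for s in states)
    policy = dict()
    for _ in range(T):
        V_new = dict()
        for s in states:
            actions = set(a for (s0, a, t, p) in transitions if s0 == s)
            best_a, best_q = None, float('-inf')
            for a in actions:
                q = sum(p * (R.get((s, a, t), 0.0) + V[t])
                        for (s0, a0, t, p) in transitions
                        if s0 == s and a0 == a)
                if q > best_q:
                    best_a, best_q = a, q
            V_new[s] = best_q if actions else V[s]
            policy[s] = best_a
        V = V_new
    return V, policy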

# ## Abstracted MDP
# states = {1,7}
# acts = {'N','S','E','W'}
# transitions = set() #transitions are: (s,a,s',p)
# transitions.add((1,'N',1,1))
# transitions.add((1,'S',1,0.8))
# transitions.add((1,'S',7,0.2))
# transitions.add((1,'E',1,1))
# transitions.add((1,'W',1,1))
# transitions.add((7,'N',7,1))
# transitions.add((7,'S',7,1))
# transitions.add((7,'E',7,1))
# transitions.add((7,'W',7,1))
alphabet = [0,1,2,3] # North, south, west, east
# Rebuild the transition list as (s, a, s', p) tuples from the gridworld's
# per-action transition matrices, keeping only successors with nonzero probability.
transitions = []
for s in states:
    for a in alphabet:
        for t in np.nonzero(gwg.prob[gwg.actlist[a]][s])[0]:
            p = gwg.prob[gwg.actlist[a]][s][t]
            transitions.append((s, a, t, p))

mdp = MDP(states, set(alphabet), transitions)

# V, goodpolicy = mdp.max_reach_prob(goodtargets, epsilon=0.0001)
# V, badpolicy = mdp.max_reach_prob(badtargets, epsilon=0.0001)
randomness = 0
# Reward 1 for transitions that stay inside the good target set, 0 elsewhere.
R = dict([(s, a, next_s), 0.0] for s in mdp.states for a in mdp.available(s)
         for next_s in mdp.post(s, a))
R.update([(s, a, next_s), 1.0] for s in mdp.states for a in mdp.available(s)
         for next_s in mdp.post(s, a)
         if next_s in goodtargets and s in goodtargets)
V, goodpolicy = mdp.T_step_value_iteration(R, 10)

# Reward 1 for transitions that stay inside the bad target set, 0 elsewhere.
R = dict([(s, a, next_s), 0.0] for s in mdp.states for a in mdp.available(s)
         for next_s in mdp.post(s, a))
R.update([(s, a, next_s), 1.0] for s in mdp.states for a in mdp.available(s)
         for next_s in mdp.post(s, a)
         if next_s in badtargets and s in badtargets)
V, badpolicy = mdp.T_step_value_iteration(R, 10)
good_MC = mdp.construct_MC(goodpolicy, 'Examples/7x5_good.txt')
bad_MC = mdp.construct_MC(badpolicy, 'Examples/7x5_bad.txt')
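
# For reference, a hedged sketch of the Markov chain a policy induces on the
# gridworld; construct_MC presumably computes something similar (plus writing
# the chain to the given file, which is skipped here). The helper name
# induced_mc_sketch is illustrative only. The returned dict is keyed by
# (state, successor) pairs, which matches how bad_MC is indexed below.
def induced_mc_sketch(states, transitions, policy):
    MC = dict(((s, t), 0.0) for s in states for t in states)
    for s in states:
        allowed = policy[s] if isinstance(policy[s], (set, list)) else [policy[s]]
        for (s0, a, t, p) in transitions:
            if s0 == s and a in allowed:
                MC[(s, t)] += p * 1.0 / len(allowed)
    return MC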

# Construct the product MDP: the controlled agent (moving in gwg under action a)
# paired with a second agent that follows the Markov chain induced by badpolicy.
product_states = [(s1, s2) for s1 in gwg.states for s2 in gwg.states]
product_trans = []
for s1 in product_states:
    for s2 in product_states:
        for a in alphabet:
            # The two components move independently, so the joint probability factors.
            p1 = gwg.prob[gwg.actlist[a]][s1[0]][s2[0]]
            p2 = bad_MC[(s1[1], s2[1])]
            if p1 * p2 > 0:
                product_trans.append((s1, a, s2, p1 * p2))
mdp1 = MDP(product_states, set(alphabet), product_trans)
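
# Optional sanity check (assumes gwg.prob rows and the policy-induced chain are
# both stochastic): the outgoing probability mass from every
# (product state, action) pair should sum to roughly 1.
from collections import defaultdict
_outgoing = defaultdict(float)
for (s12, a, t12, p) in product_trans:
    _outgoing[(s12, a)] += p
assert all(abs(mass - 1.0) < 1e-6 for mass in _outgoing.values())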

# Reward 1 for transitions that enter the first target set from outside it.
R = dict([(s, a, next_s), 0.0] for s in mdp.states for a in mdp.available(s)
         for next_s in mdp.post(s, a))
R.update([(s, a, next_s), 1.0] for s in mdp.states for a in mdp.available(s)
         for next_s in mdp.post(s, a)
         if next_s in targets[0] and s not in targets[0])
V, policyT = mdp.T_step_value_iteration(R, T=20)
# Behaviour 0: fully permissive, every gridworld action allowed in every state.
policyT1 = dict((s, set(range(gwg.nactions))) for s in mdp.states
                if mdp.available(s))
# Behaviour 1: the T-step policy computed above for targets[0].
agentbehaviours = [policyT1, policyT]
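
# The loop below treats each behaviour index as an action of a new MDP: picking
# behaviour i in state s is assumed to mean executing one of ab[s]'s actions
# uniformly at random, so
#   P(s, i, t) = (1 / |ab[s]|) * sum_{a in ab[s]} gwg.prob[gwg.actlist[a]][s][t].
# The randomness parameter (set to 0 above) is presumably meant to additionally
# mix in random deviations, weighting each non-policy successor by
# (1.0 / (gwg.nstates - len(ab[s]))) * randomness.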
transitions = []
for ab in agentbehaviours:
    for s in states:
        transdict = dict([(s, agentbehaviours.index(ab), t), 0.0]
                         for t in states)
        for a in ab[s]:
            # Spread this behaviour's probability uniformly over its allowed actions.
            for t in np.nonzero(gwg.prob[gwg.actlist[a]][s])[0]:
                p = gwg.prob[gwg.actlist[a]][s][t]
                transdict[(s, agentbehaviours.index(ab), t)] += p * 1.0 / len(ab[s])
        for t in states: