# Build an MDP from the gridworld `gwg` and compute reachability policies for the
# good and bad target sets. `gwg`, the `MDP` class, and the target sets
# `goodtargets`, `badtargets`, and `targets` are assumed to be defined by the
# surrounding script.
import numpy as np
states = range(gwg.nstates)
alphabet = [0, 1, 2, 3]  # North, South, West, East
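# Enumerate every nonzero-probability gridworld transition as a
# (state, action, next_state, probability) tuple.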
transitions = []
for s in states:
    for a in alphabet:
        for t in np.nonzero(gwg.prob[gwg.actlist[a]][s])[0]:
            p = gwg.prob[gwg.actlist[a]][s][t]
            transitions.append((s, alphabet.index(a), t, p))

mdp = MDP(states, set(alphabet), transitions)
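# The MDP class is assumed to expose available(), post(), T_step_value_iteration()
# and construct_MC(), all of which are used below.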

# V, goodpolicy = mdp.max_reach_prob(goodtargets, epsilon=0.0001)
# V, badpolicy = mdp.max_reach_prob(badtargets, epsilon=0.0001)
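# Probability mass a behaviour spreads over actions it does not prescribe
# (0 = each behaviour is followed exactly); used when building the behaviour MDP below.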
randomness = 0
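# Reward 1.0 on transitions that stay inside the good target set and 0.0 elsewhere;
# a 10-step value iteration then yields a policy that steers the agent into those states.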
R = {(s, a, next_s): 0.0 for s in mdp.states for a in mdp.available(s)
     for next_s in mdp.post(s, a)}
R.update({(s, a, next_s): 1.0 for s in mdp.states for a in mdp.available(s)
          for next_s in mdp.post(s, a)
          if next_s in goodtargets and s in goodtargets})
V, goodpolicy = mdp.T_step_value_iteration(R, 10)
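# Same reward construction for the bad target set.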
R = {(s, a, next_s): 0.0 for s in mdp.states for a in mdp.available(s)
     for next_s in mdp.post(s, a)}
R.update({(s, a, next_s): 1.0 for s in mdp.states for a in mdp.available(s)
          for next_s in mdp.post(s, a)
          if next_s in badtargets and s in badtargets})
V, badpolicy = mdp.T_step_value_iteration(R, 10)
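# Markov chains induced by following each policy; the second argument of
# construct_MC is assumed to be an output file path.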
good_MC = mdp.construct_MC(goodpolicy, 'Examples/7x5_good.txt')
bad_MC = mdp.construct_MC(badpolicy, 'Examples/7x5_bad.txt')

# Build an MDP whose two "actions" are candidate agent behaviours: behaviour 0 is a
# fully random agent, behaviour 1 follows the policy from a 20-step value iteration
# that rewards reaching targets[0].
R = {(s, a, next_s): 0.0 for s in mdp.states for a in mdp.available(s)
     for next_s in mdp.post(s, a)}
R.update({(s, a, next_s): 1.0 for s in mdp.states for a in mdp.available(s)
          for next_s in mdp.post(s, a)
          if next_s in targets[0] and s not in targets[0]})
V, policyT = mdp.T_step_value_iteration(R, T=20)
policyT1 = {s: set(range(gwg.nactions)) for s in mdp.states}
agentbehaviours = [policyT1, policyT]

transitions = []
for b, ab in enumerate(agentbehaviours):
    for s in states:
        transdict = {(s, b, t): 0.0 for t in states}
        for a in range(gwg.nactions):
            # Probability that behaviour `ab` picks gridworld action `a`: the mass
            # `randomness` is spread uniformly over actions the behaviour does not
            # prescribe (with randomness = 0 the behaviour is followed exactly).
            if a in ab[s]:
                w = (1.0 - randomness) / len(ab[s])
            else:
                w = randomness / (gwg.nactions - len(ab[s]))
            for t in np.nonzero(gwg.prob[gwg.actlist[a]][s])[0]:
                p = gwg.prob[gwg.actlist[a]][s][t]
                transdict[(s, b, t)] += p * w
        for t in states:
            transitions.append((s, b, t, transdict[(s, b, t)]))
mdp1 = MDP(states, alphabet=range(2), transitions=transitions)

# Construct product mdp over pairs of gridworld states; only the state space is
# enumerated here.
states = [(s1, s2) for s1 in gwg.states for s2 in gwg.states]
product_trans = []
# TODO: fill in the product transitions:
# for s1 in states:
#     for s2 in states:
#         for a in alphabet:
#             ...