示例#1
0
文件: main.py 项目: yyc/birl
        demo.append((s0, a0))
        reward_cum += mdp.rewards[s0]
        if s0 == 35 or len(demo) > 30:
            go_on = False
        s0 = s1
    confidence = 400.
    demos.append((reward_cum, demo, confidence))
    return demos


if __name__ == '__main__':
    # Script entry point: build a gridworld MDP, generate demonstrations
    # from its optimal policy, run BIRL on them, and replay the result.

    # 6 x 6 gridworld; the reward vector below has one entry per state (36).
    transitions = initialize_gridworld(6, 6)

    # Small negative reward everywhere, a larger penalty on a few random
    # states, and a positive reward on state 35.
    rewards = -0.01 * np.ones(36)
    # NOTE: np.random.randint(1, 35, 3) samples with replacement from
    # [1, 35), so the three indices may repeat (fewer than three penalised
    # states) and state 34 is never picked.
    rewards[np.random.randint(1, 35, 3)] = -0.1
    rewards[35] = 1.

    # Alternative reward initialisation, kept for reference:
    #mdp = MDP(transitions, initialize_rewards(5, 36), 0.99)
    # Third argument is presumably the discount factor -- confirm in MDP.
    mdp = MDP(transitions, rewards, 0.99)
    thing = GridWorld(mdp)

    # policy_iteration() returns a sequence; only its first item is used.
    optimal_policy = mdp.policy_iteration()[0]
    demos = get_trajectories(optimal_policy, mdp)
    # Alternative demonstration source, kept for reference:
    #demos = thing.record(1)
    print(demos)

    # Play the optimal policy once; its result is not used afterwards.
    _, _ = thing.play(optimal_policy)

    # NOTE(review): the meaning of the positional numeric arguments is
    # defined by birl() elsewhere -- confirm against its signature.
    policy = birl(mdp, 0.02, 1000, 1.0, demos, 50, 5,
                  PriorDistribution.UNIFORM)
    print("Finished BIRL")
    print("Agent Playing")
    reward, playout = thing.play(policy)
    print("Reward is " + str(reward))
    print("Playout is " + str(playout))