demo.append((s0, a0)) reward_cum += mdp.rewards[s0] if s0 == 35 or len(demo) > 30: go_on = False s0 = s1 confidence = 400. demos.append((reward_cum, demo, confidence)) return demos if __name__ == '__main__': transitions = initialize_gridworld(6, 6) rewards = -0.01 * np.ones(36) rewards[np.random.randint(1, 35, 3)] = -0.1 rewards[35] = 1. #mdp = MDP(transitions, initialize_rewards(5, 36), 0.99) mdp = MDP(transitions, rewards, 0.99) thing = GridWorld(mdp) optimal_policy = mdp.policy_iteration()[0] demos = get_trajectories(optimal_policy, mdp) #demos = thing.record(1) print demos _, playout = thing.play(optimal_policy) policy = birl(mdp, 0.02, 1000, 1.0, demos, 50, 5, PriorDistribution.UNIFORM) print "Finished BIRL" print "Agent Playing" reward, playout = thing.play(policy) print "Reward is " + str(reward) print "Playout is " + str(playout)