def run_experiment(intrinsic_motivation, gamma, c, mc_n, runs, steps):
    """Run MCTS episodes in the ToyWorld and pickle the visited trajectories.

    Parameters
    ----------
    intrinsic_motivation : passed through to ``state.ToyWorld``.
    gamma : discount factor handed to ``mcts_search``.
    c : exploration constant handed to ``mcts_search``.
    mc_n : number of Monte-Carlo iterations per search call.
    runs : number of independent episodes.
    steps : maximum number of environment steps per episode.

    Side effects: prints progress to stdout and pickles the accumulated
    trajectory list to a file named by ``gen_name("trajectories", "pkl")``.
    """
    trajectories = []
    start = np.array([50, 50])
    true_belief = True  # start each run with uniform pseudo-count beliefs
    for _ in range(runs):
        goal = draw_goal(start, 6)
        manual = draw_goal(start, 3)
        print("Goal: {}".format(goal))
        print("Manual: {}".format(manual))
        world = state.ToyWorld([100, 100], intrinsic_motivation, goal, manual)
        belief = None
        if true_belief:
            # Uniform pseudo-count (10 each) belief over the four cardinal
            # actions; each action gets its own fresh count list.
            actions = [
                state.ToyWorldAction(np.array([0, 1])),
                state.ToyWorldAction(np.array([0, -1])),
                state.ToyWorldAction(np.array([1, 0])),
                state.ToyWorldAction(np.array([-1, 0])),
            ]
            belief = {a: [10, 10, 10, 10] for a in actions}
        root_state = state.ToyWorldState(start, world, belief=belief)
        print(root_state.pos)
        next_state = StateNode(None, root_state, 0)
        trajectory = []
        for _ in range(steps):
            try:
                ba = mcts_search(next_state, gamma, c=c, n=mc_n)
                print("")
                print("=" * 80)
                print("State: {}".format(next_state.state))
                print("Belief: {}".format(next_state.state.belief))
                print("Reward: {}".format(next_state.reward))
                print("N: {}".format(next_state.n))
                print("Q: {}".format(next_state.q))
                print("Action: {}".format(ba.action))
                trajectory.append(next_state.state.pos)
                if (next_state.state.pos == np.array(goal)).all():
                    break  # reached the goal position
                # Advance the "real" world by sampling from the chosen action.
                next_s = next_state.children[ba].sample_state(real_world=True)
                next_state = next_s
                next_state.parent = None  # detach so the old tree can be GC'd
            except KeyboardInterrupt:
                break  # allow the user to cut an episode short
        trajectories.append(trajectory)
        # BUG FIX: pickle writes bytes, so the file must be opened in binary
        # mode ("wb"); text mode ("w") raises TypeError on Python 3.
        with open(gen_name("trajectories", "pkl"), "wb") as f:
            pickle.dump(trajectories, f)
        print("=" * 80)
def run_experiment(intrinsic_motivation, gamma, c, mc_n, runs, steps):
    """Run MCTS episodes in the ToyWorld, dumping trajectories after each run.

    NOTE(review): this is a byte-identical duplicate of the earlier
    ``run_experiment`` definition — only the last definition in the module
    survives at import time; consider deleting one copy.

    Parameters: see the ToyWorld constructors and ``mcts_search`` —
    ``intrinsic_motivation`` feeds the world, ``gamma``/``c``/``mc_n``
    configure the search, ``runs``/``steps`` bound the experiment.
    """
    trajectories = []
    start = np.array([50, 50])
    true_belief = True
    for _ in range(runs):
        goal = draw_goal(start, 6)
        manual = draw_goal(start, 3)
        print("Goal: {}".format(goal))
        print("Manual: {}".format(manual))
        world = state.ToyWorld([100, 100], intrinsic_motivation, goal, manual)
        belief = None
        if true_belief:
            # Pair each cardinal action with its own uniform count vector.
            cardinal_actions = [
                state.ToyWorldAction(np.array([0, 1])),
                state.ToyWorldAction(np.array([0, -1])),
                state.ToyWorldAction(np.array([1, 0])),
                state.ToyWorldAction(np.array([-1, 0])),
            ]
            uniform_counts = [[10, 10, 10, 10] for _ in cardinal_actions]
            belief = dict(zip(cardinal_actions, uniform_counts))
        root_state = state.ToyWorldState(start, world, belief=belief)
        print(root_state.pos)
        next_state = StateNode(None, root_state, 0)
        trajectory = []
        for _ in range(steps):
            try:
                ba = mcts_search(next_state, gamma, c=c, n=mc_n)
                print("")
                print("=" * 80)
                print("State: {}".format(next_state.state))
                print("Belief: {}".format(next_state.state.belief))
                print("Reward: {}".format(next_state.reward))
                print("N: {}".format(next_state.n))
                print("Q: {}".format(next_state.q))
                print("Action: {}".format(ba.action))
                trajectory.append(next_state.state.pos)
                if (next_state.state.pos == np.array(goal)).all():
                    break  # goal reached — stop this episode
                # Sample the successor from the real (non-simulated) world.
                next_state = next_state.children[ba].sample_state(
                    real_world=True)
                next_state.parent = None  # drop the searched tree above us
            except KeyboardInterrupt:
                break
        trajectories.append(trajectory)
        # BUG FIX: pickle requires a binary-mode file handle ("wb");
        # the original text mode ("w") fails under Python 3.
        with open(gen_name("trajectories", "pkl"), "wb") as f:
            pickle.dump(trajectories, f)
        print("=" * 80)
def run_experiment(intrinsic_motivation, gamma, c, mc_n, runs, steps, problem):
    """Run MCTS episodes on a PaintingWorld problem; return the best reward.

    Parameters
    ----------
    intrinsic_motivation, gamma, c : accepted for signature compatibility
        with the ToyWorld variant but NOT used here — the tree policy is
        fixed to ``UCB1(c=1.41)``.
    mc_n : number of Monte-Carlo iterations per search call.
    runs : number of independent episodes.
    steps : maximum number of environment steps per episode.
    problem : problem specification forwarded to ``PaintingWorld``.

    Returns
    -------
    The maximum per-run final reward observed (0 if ``runs == 0``).
    """
    problem_start = time.time()
    start = np.array([50, 50])
    true_belief = True
    # One search object is reused across all runs and steps.
    mcts_search = MCTS(tree_policy=UCB1(c=1.41),
                       default_policy=immediate_reward,
                       backup=monte_carlo)
    rewards = []
    for r in range(runs):
        run_start = time.time()
        print("RUN number", r)
        goal = draw_goal(start, 6)
        world = PaintingWorld((100, 100), False, (100, 100), problem)
        root_state = PaintingWorldState((0, 0), (1, 1, 1), world)
        if true_belief:
            # Uniform pseudo-count (1 each) belief over all available actions.
            root_state.belief = {
                action: [1] * len(root_state.actions)
                for action in root_state.actions
            }
        next_state = StateNode(None, root_state)
        rew = 0
        for step in range(steps):
            step_start = time.time()
            ba = mcts_search(next_state, n=mc_n)
            rew = next_state.reward
            if (next_state.state.pos == np.array(goal)).all():
                break  # goal position reached
            # Advance the real world by sampling the chosen action's outcome.
            next_state = next_state.children[ba].sample_state(real_world=True)
            next_state.parent = None  # detach so the old tree can be GC'd
            print("step", step, "time elapsed", time.time() - step_start)
            # Early exit once a sufficiently good reward has been sustained.
            if step >= 5 and rew > 0.5:
                break
        rewards.append(rew)
        print("run", r, "time elapsed", time.time() - run_start)
    # ROBUSTNESS: max() on an empty list raises ValueError; with runs == 0
    # we report 0 instead (matching the per-run initial reward).
    w = max(rewards, default=0)
    print("REWARD", w)
    print("problem time elapsed", time.time() - problem_start)
    return w