def find_failure_case(num_bases, num_blocks, max_levels):
    """Rejection-sample random problem instances until the abstract machine
    fails on one (symbolic reward <= -2), then return that failing case.

    Returns:
        (thing_below, goal_thing_below, sym_reward) for the failing instance.
    """
    while True:
        env = BlocksWorldEnv(show=False)
        thing_below, goal_thing_below = random_problem_instance(
            env, num_blocks, max_levels, num_bases)
        machine = make_abstract_machine(env, num_bases, max_levels)
        # only the symbolic reward matters for the failure criterion
        _ticks, _running_time, sym_reward, _spa_reward = run_machine(
            machine, goal_thing_below, {"jnt": "rest"})
        env.close()
        if sym_reward <= -2:
            return thing_below, goal_thing_below, sym_reward
# --- Example no. 2 (scraped sample separator; score: 0) ---
def run_trial(num_bases, num_blocks, max_levels):
    """Run one trial comparing the abstract machine against its virtualized
    NVM counterpart on the same random problem instance.

    Returns:
        (am_results, nvm_results, nvm_size, thing_below, goal_thing_below)
    """
    env = BlocksWorldEnv(show=False)

    # sample the problem instance for this trial
    # (non-triviality is presumably ensured inside the helper — verify)
    thing_below, goal_thing_below = random_problem_instance(
        env, num_blocks, max_levels, num_bases)

    machine = make_abstract_machine(env, num_bases, max_levels)
    nvm = virtualize(machine)

    # run the symbolic abstract machine first
    am_results = run_machine(machine, goal_thing_below, {"jnt": "rest"})

    # restore the identical initial block layout before the NVM run
    env.reset()
    env.load_blocks(thing_below, num_bases)

    rest_joints = tr.tensor(machine.ik["rest"]).float()
    nvm_results = run_machine(nvm, goal_thing_below, {"jnt": rest_joints})

    env.close()

    return am_results, nvm_results, nvm.size(), thing_below, goal_thing_below
# --- Example no. 3 (scraped sample separator; score: 0) ---
                # NOTE(review): fragment of a REINFORCE-style training loop;
                # the enclosing function and the tail of the inner loop are
                # outside this view, so the code below is left unchanged.
                # init baseline
                baseline = 0

                for epoch in range(num_epochs):
                    start_epoch = time.perf_counter()
                    # per-epoch accumulators for rewards / baselines / rewards-to-go
                    epoch_rewards = []
                    epoch_baselines = []
                    epoch_rtgs = []

                    for episode in range(num_episodes):
                        start_episode = time.perf_counter()

                        # random problem instance
                        # fresh environment per episode; penalty_tracker hooks
                        # into every simulation step
                        env = BlocksWorldEnv(
                            show=showenv, step_hook=penalty_tracker.step_hook)
                        thing_below, goal_thing_below = random_problem_instance(
                            env, num_blocks, max_levels, num_bases)
                        nvm.env = env

                        reward, log_prob, rewards, log_probs = run_episode(
                            env, thing_below, goal_thing_below, nvm, init_regs,
                            init_conns, penalty_tracker, sigma)

                        env.close()

                        if use_penalties:
                            # rewards-to-go: rtg[t] = sum(rewards[t:]), computed
                            # from the cumulative sum as total - cumsum + rewards
                            rewards = np.array(rewards)
                            rewards_to_go = np.cumsum(rewards)
                            rewards_to_go = rewards_to_go[
                                -1] - rewards_to_go + rewards
                            # `reject_above` gates episodes by total reward —
                            # TODO confirm threshold semantics (name suggests
                            # rejection above, condition tests <=)
                            if reward <= reject_above:
                                for t in range(len(rewards)):