Example #1
import torch as tr  # the tr.* distribution calls below refer to PyTorch

# invert, memorize_env, compute_symbolic_reward, compute_spatial_reward,
# calc_reward, PenaltyTracker, and bp are project-local helpers assumed
# to be in scope for this excerpt.

def run_episode(env,
                thing_below,
                goal_thing_below,
                nvm,
                init_regs,
                init_conns,
                penalty_tracker,
                sigma=0):
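    """Run one NVM-controlled block-restacking episode and collect the
    per-step statistics needed for a policy-gradient update.

    Argument descriptions are inferred from how they are used below:
        env: block-stacking environment (reset, load_blocks, goto_position).
        thing_below: initial configuration, mapping each block to its support.
        goal_thing_below: goal configuration in the same format.
        nvm: neural virtual machine that runs the "main" stacking program.
        init_regs, init_conns: initial NVM register contents and connection weights.
        penalty_tracker: accumulates motion penalties between joint targets.
        sigma: stdev of Gaussian noise on commanded joint angles (0 = deterministic).

    Returns (end_reward, log_prob, rewards, log_probs).
    """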

    # reload blocks
    env.reset()
    env.load_blocks(thing_below)

    # invert goals for nvm
    goal_thing_above = invert(goal_thing_below,
                              num_blocks=len(thing_below),
                              num_bases=len(env.bases))
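    # the NVM's symbol set apparently uses "nil" rather than "none" for an empty slot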
    for key, val in goal_thing_above.items():
        if val == "none": goal_thing_above[key] = "nil"

    # reset nvm, input new env, mount main program
    nvm.reset_state(init_regs, init_conns)
    memorize_env(nvm, goal_thing_above)
    nvm.mount("main")

    log_prob = 0.0  # accumulate over episode
    log_probs, rewards = [], []

    dbg = False
    if dbg: nvm.dbg()
    target_changed = False
    while True:
        done = nvm.tick()  # reliable if core is not trained
        if dbg: nvm.dbg()
        # if nvm.tick_counter % 100 == 0: print("     tick %d" % nvm.tick_counter)
        if target_changed:
            mu = nvm.registers["jnt"].content
            if sigma > 0:
                dist = tr.distributions.normal.Normal(mu, sigma)
                position = dist.sample()
                log_probs.append(dist.log_prob(position).sum())  # multivariate white noise
                log_prob += log_probs[-1]
            else:
                position = mu

            penalty_tracker.reset()
            # nvm.dbg()
            # print("       pos:", position.detach().numpy())
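            # move to the (possibly noise-perturbed) joint target; the step reward
            # is the negative penalty accrued during that motion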
            nvm.env.goto_position(position.detach().numpy())
            rewards.append(-penalty_tracker.penalty)
            # print("net penalty: %.5f" % penalty_tracker.penalty)
            # input('...')

        tar = nvm.registers["tar"]
        # decode has some robustness to noise even if tar connections are trained
        target_changed = (tar.decode(tar.content) != tar.decode(tar.old_content))
        if done: break

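    # fallback when the target register never changed: no motion was commanded, so
    # substitute a near-deterministic placeholder log-prob and a flat -10 reward to
    # keep the policy-gradient quantities well-defined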
    if len(rewards) == 0:  # target never changed
        mu = nvm.registers["jnt"].content
        dist = tr.distributions.normal.Normal(mu, 0.001)
        log_probs.append(dist.log_prob(mu).sum())  # multivariate white noise
        rewards = [-10]

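    # terminal reward: how closely the final arrangement matches the goal, scored
    # symbolically and spatially, then folded into the last step's reward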
    sym_reward = compute_symbolic_reward(nvm.env, goal_thing_below)
    spa_reward = compute_spatial_reward(nvm.env, goal_thing_below)
    end_reward = calc_reward(sym_reward, spa_reward)
    rewards[-1] += end_reward

    return end_reward, log_prob, rewards, log_probs
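
# Hypothetical usage sketch (not part of the original file): run_episode returns
# per-step log-probabilities and rewards, which fit a REINFORCE-style update.
# The helper below shows one common baseline-free form using undiscounted
# returns-to-go; the actual training loop in the original script may differ.
def reinforce_loss_sketch(log_probs, rewards):
    returns, running = [], 0.0
    for r in reversed(rewards):  # return-to-go from each step onward
        running += r
        returns.insert(0, running)
    # policy-gradient surrogate: minimize -sum_t log pi(a_t) * G_t
    return -sum(lp * g for lp, g in zip(log_probs, returns))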

# Experiment driver; the enclosing block is not shown in this excerpt, so a
# __main__ guard is assumed here.
if __name__ == "__main__":

    sigma = 0.001  # stdev in random angular sampling (radians)

    # one failure case:
    max_levels = 3
    num_blocks = 5
    num_bases = 5
    thing_below = {'b0': 't1', 'b2': 'b0', 'b4': 'b2', 'b1': 't4', 'b3': 't2'}
    goal_thing_below = {'b1': 't1', 'b2': 't3', 'b3': 'b2', 'b0': 't0', 'b4': 'b0'}
    goal_thing_above = invert(goal_thing_below, num_blocks, num_bases)
    for key, val in goal_thing_above.items():
        if val == "none": goal_thing_above[key] = "nil"
    domain = bp.BlockStackingDomain(num_blocks, num_bases, max_levels)
    problem = bp.BlockStackingProblem(domain, thing_below, goal_thing_below,
                                      goal_thing_above)

    penalty_tracker = PenaltyTracker(period=5)

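    # run_exp, learning_rates, and num_repetitions are configuration values
    # defined earlier in the original script (not shown in this excerpt)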
    if run_exp:

        lr_results = {lr: list() for lr in learning_rates}
        for rep in range(num_repetitions):
            for learning_rate in learning_rates:
                print("Starting lr=%f" % learning_rate)