def run_episode(env, thing_below, goal_thing_below, nvm, init_regs, init_conns, penalty_tracker, sigma=0):
    """
    Run one block-stacking episode in env from thing_below toward
    goal_thing_below, driving the arm with the nvm's "jnt" register.
    When sigma > 0, joint targets are sampled from a Gaussian around the
    register contents and per-step log-probabilities are recorded for
    policy-gradient training.
    Returns (end_reward, log_prob, rewards, log_probs).
    """

    # reload blocks
    env.reset()
    env.load_blocks(thing_below)

    # invert goals for nvm
    goal_thing_above = invert(goal_thing_below, num_blocks=len(thing_below), num_bases=len(env.bases))
    for key, val in goal_thing_above.items():
        if val == "none":
            goal_thing_above[key] = "nil"

    # reset nvm, input new env, mount main program
    nvm.reset_state(init_regs, init_conns)
    memorize_env(nvm, goal_thing_above)
    nvm.mount("main")

    log_prob = 0.0  # accumulate over episode
    log_probs, rewards = [], []

    dbg = False
    if dbg: nvm.dbg()
    target_changed = False
    while True:
        done = nvm.tick()  # reliable if core is not trained
        if dbg: nvm.dbg()
        # if nvm.tick_counter % 100 == 0: print("  tick %d" % nvm.tick_counter)

        if target_changed:
            mu = nvm.registers["jnt"].content
            if sigma > 0:
                dist = tr.distributions.normal.Normal(mu, sigma)
                position = dist.sample()
                log_probs.append(dist.log_prob(position).sum())  # multivariate white noise
                log_prob += log_probs[-1]
            else:
                position = mu

            penalty_tracker.reset()
            # nvm.dbg()
            # print("  pos:", position.detach().numpy())
            nvm.env.goto_position(position.detach().numpy())
            rewards.append(-penalty_tracker.penalty)
            # print("net penalty: %.5f" % penalty_tracker.penalty)
            # input('...')

        tar = nvm.registers["tar"]
        # decode has some robustness to noise even if tar connections are trained
        target_changed = (tar.decode(tar.content) != tar.decode(tar.old_content))
        if done: break

    if len(rewards) == 0:  # target never changed
        mu = nvm.registers["jnt"].content
        dist = tr.distributions.normal.Normal(mu, 0.001)
        log_probs.append(dist.log_prob(mu).sum())  # multivariate white noise
        rewards = [-10]

    sym_reward = compute_symbolic_reward(nvm.env, goal_thing_below)
    spa_reward = compute_spatial_reward(nvm.env, goal_thing_below)
    end_reward = calc_reward(sym_reward, spa_reward)
    rewards[-1] += end_reward

    return end_reward, log_prob, rewards, log_probs
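# A minimal sketch (hypothetical helper, not in the original code) of how
# run_episode's outputs could drive a REINFORCE-style policy-gradient update.
# It assumes sigma > 0 (so log_probs is non-empty) and that `optimizer` is a
# torch optimizer over the trainable nvm connection weights; no baseline or
# advantage normalization is used.
def reinforce_update(optimizer, log_probs, rewards):
    # reward-to-go: the return at each step is the sum of all later rewards
    returns, running = [], 0.0
    for r in reversed(rewards):
        running += r
        returns.insert(0, running)
    # REINFORCE loss: descend on -log_prob * return to maximize expected return
    loss = -sum(lp * G for lp, G in zip(log_probs, returns))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# Usage (sketch):
#   end_reward, log_prob, rewards, log_probs = run_episode(
#       env, thing_below, goal_thing_below, nvm, init_regs, init_conns,
#       penalty_tracker, sigma=sigma)
#   loss = reinforce_update(optimizer, log_probs, rewards)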
sigma = 0.001  # stdev of random angular sampling (radians)

# one failure case:
max_levels = 3
num_blocks = 5
num_bases = 5
thing_below = {'b0': 't1', 'b2': 'b0', 'b4': 'b2', 'b1': 't4', 'b3': 't2'}
goal_thing_below = {'b1': 't1', 'b2': 't3', 'b3': 'b2', 'b0': 't0', 'b4': 'b0'}

goal_thing_above = invert(goal_thing_below, num_blocks, num_bases)
for key, val in goal_thing_above.items():
    if val == "none":
        goal_thing_above[key] = "nil"

domain = bp.BlockStackingDomain(num_blocks, num_bases, max_levels)
problem = bp.BlockStackingProblem(domain, thing_below, goal_thing_below, goal_thing_above)

penalty_tracker = PenaltyTracker(period=5)

if run_exp:
    lr_results = {lr: list() for lr in learning_rates}
    for rep in range(num_repetitions):
        for learning_rate in learning_rates:
            print("Starting lr=%f" % learning_rate)