def run_machine(machine, goal_thing_below, reset_dict):

    # invert the goal and rename empty slots for the machine's memory format
    goal_thing_above = machine.env.invert(goal_thing_below)
    for key, val in goal_thing_above.items():
        if val == "none": goal_thing_above[key] = "nil"

    start = time.perf_counter()
    memorize_env(machine, goal_thing_above)
    machine.reset(reset_dict)
    ticks = machine.run()
    running_time = time.perf_counter() - start

    sym_reward = compute_symbolic_reward(machine.env, goal_thing_below)
    spa_reward = compute_spatial_reward(machine.env, goal_thing_below)

    return ticks, running_time, sym_reward, spa_reward

def run_episode(env, thing_below, goal_thing_below, nvm, init_regs, init_conns, sigma=0):

    # reload blocks
    env.reset()
    env.load_blocks(thing_below)

    # invert goals for nvm
    goal_thing_above = invert(goal_thing_below,
        num_blocks=len(thing_below), num_bases=len(env.bases))
    for key, val in goal_thing_above.items():
        if val == "none": goal_thing_above[key] = "nil"

    # reset nvm, input new env, mount main program
    nvm.reset_state(init_regs, init_conns)
    memorize_env(nvm, goal_thing_above)
    nvm.mount("main")

    log_prob = 0.0 # accumulate over episode

    dbg = False
    if dbg: nvm.dbg()
    target_changed = True
    while True:
        done = nvm.tick()
        if dbg: nvm.dbg()
        # if nvm.tick_counter % 100 == 0: print("   tick %d" % nvm.tick_counter)
        if target_changed:
            mu = nvm.registers["jnt"].content
            if sigma > 0:
                dist = tr.distributions.normal.Normal(mu, sigma)
                position = dist.sample()
                log_probs = dist.log_prob(position)
                log_prob += log_probs.sum() # multivariate white noise
            else:
                position = mu
            nvm.env.goto_position(position.detach().numpy())
        tar = nvm.registers["tar"]
        target_changed = (tar.decode(tar.content) != tar.decode(tar.old_content))
        if done: break

    sym_reward = compute_symbolic_reward(nvm.env, goal_thing_below)
    spa_reward = compute_spatial_reward(nvm.env, goal_thing_below)
    reward = calc_reward(sym_reward, spa_reward)

    return reward, log_prob

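# The (reward, log_prob) pair above is the minimum needed for a REINFORCE-style
# update on the joint-target policy. The sketch below is illustrative only: it
# assumes `params` is a list of the trainable connection-weight tensors (with
# requires_grad=True) that the "jnt" outputs depend on, and the optimizer
# choice, learning rate, and update count are hypothetical, not the settings
# used in the experiments.
def reinforce_sketch(env, thing_below, goal_thing_below, nvm, init_regs, init_conns,
        params, num_updates=10, sigma=0.001, lr=0.001):
    opt = tr.optim.Adam(params, lr=lr)
    for update in range(num_updates):
        reward, log_prob = run_episode(env, thing_below, goal_thing_below,
            nvm, init_regs, init_conns, sigma=sigma)
        loss = -reward * log_prob # surrogate whose gradient is the policy gradient
        opt.zero_grad()
        loss.backward()
        opt.step()
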
def rvm_baseline(env, thing_below, goal_thing_below, rvm):

    start = time.perf_counter()

    # reload blocks
    env.reset()
    env.load_blocks(thing_below)

    # invert goals for rvm
    goal_thing_above = invert(goal_thing_below,
        num_blocks=len(thing_below), num_bases=len(env.bases))
    for key, val in goal_thing_above.items():
        if val == "none": goal_thing_above[key] = "nil"

    # reset rvm, input new env, mount main program
    rvm.env = env
    memorize_env(rvm, goal_thing_above)
    rvm.reset({"jnt": "rest"})
    rvm.mount("main")

    # run
    ticks = rvm.run()
    running_time = time.perf_counter() - start

    sym_reward = compute_symbolic_reward(env, goal_thing_below)
    spa_reward = compute_spatial_reward(env, goal_thing_below)
    reward = calc_reward(sym_reward, spa_reward)

    return running_time, reward

def run_episode(env, thing_below, goal_thing_below, nvm, init_regs, init_conns, penalty_tracker, sigma=0):

    # reload blocks
    env.reset()
    env.load_blocks(thing_below)

    # invert goals for nvm
    goal_thing_above = invert(goal_thing_below,
        num_blocks=len(thing_below), num_bases=len(env.bases))
    for key, val in goal_thing_above.items():
        if val == "none": goal_thing_above[key] = "nil"

    # reset nvm, input new env, mount main program
    nvm.reset_state(init_regs, init_conns)
    memorize_env(nvm, goal_thing_above)
    nvm.mount("main")

    log_prob = 0.0 # accumulate over episode
    log_probs, rewards = [], []

    dbg = False
    if dbg: nvm.dbg()
    target_changed = False
    while True:
        done = nvm.tick() # reliable if core is not trained
        if dbg: nvm.dbg()
        # if nvm.tick_counter % 100 == 0: print("   tick %d" % nvm.tick_counter)
        if target_changed:
            mu = nvm.registers["jnt"].content
            if sigma > 0:
                dist = tr.distributions.normal.Normal(mu, sigma)
                position = dist.sample()
                log_probs.append(dist.log_prob(position).sum()) # multivariate white noise
                log_prob += log_probs[-1]
            else:
                position = mu

            penalty_tracker.reset()
            # nvm.dbg()
            # print("       pos:", position.detach().numpy())
            nvm.env.goto_position(position.detach().numpy())
            rewards.append(-penalty_tracker.penalty)
            # print("net penalty: %.5f" % penalty_tracker.penalty)
            # input('...')

        tar = nvm.registers["tar"]
        # decode has some robustness to noise even if tar connections are trained
        target_changed = (tar.decode(tar.content) != tar.decode(tar.old_content))
        if done: break

    if len(rewards) == 0: # target never changed
        mu = nvm.registers["jnt"].content
        dist = tr.distributions.normal.Normal(mu, 0.001)
        log_probs.append(dist.log_prob(mu).sum()) # multivariate white noise
        rewards = [-10]

    sym_reward = compute_symbolic_reward(nvm.env, goal_thing_below)
    spa_reward = compute_spatial_reward(nvm.env, goal_thing_below)
    end_reward = calc_reward(sym_reward, spa_reward)
    rewards[-1] += end_reward

    return end_reward, log_prob, rewards, log_probs

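# This variant also returns per-transition penalties (rewards) and
# log-probabilities (log_probs), which permits a rewards-to-go estimator
# instead of scaling every action by the single end-of-episode return.
# A minimal sketch of that surrogate loss, assuming sigma > 0 so the two
# lists are aligned one-to-one as produced above; the estimator itself is
# illustrative and baseline-free.
def rewards_to_go_loss(rewards, log_probs):
    # suffix sums: each action is credited with the reward that follows it
    returns, total = [], 0.0
    for r in reversed(rewards):
        total += r
        returns.insert(0, total)
    # negative surrogate objective; calling .backward() on this yields the
    # REINFORCE gradient estimate for the sampled episode
    return -sum(R * lp for R, lp in zip(returns, log_probs))
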
def run_trial(domain):

    env = BlocksWorldEnv(show=False)

    # rejection sample non-trivial instance
    problem = domain.random_problem_instance()
    env.reset()
    env.load_blocks(problem.thing_below, num_bases=domain.num_bases)

    # set up rvm and virtualize
    rvm = make_abstract_machine(env, domain)
    memorize_problem(rvm, problem)
    rvm.reset({"jnt": "rest"})
    rvm.mount("main")

    nvm = virtualize(rvm, σ=nv.default_activator, detach_gates=True)
    nvm.mount("main")
    W_init = {name: {0: nvm.net.batchify_weights(conn.W)}
        for name, conn in nvm.connections.items()}
    v_init = {name: {0: nvm.net.batchify_activities(reg.content)}
        for name, reg in nvm.registers.items()}
    v_init["jnt"][0] = nvm.net.batchify_activities(tr.tensor(rvm.ik["rest"]).float())

    # rvm_results = run_machine(rvm, problem.goal_thing_below, {"jnt": "rest"})
    start = time.perf_counter()
    tar_changed = False
    while True:
        done = rvm.tick()
        if tar_changed:
            position = rvm.ik[rvm.registers["jnt"].content]
            env.goto_position(position, speed=1.5)
        if done: break
        tar_changed = (rvm.registers["tar"].content != rvm.registers["tar"].old_content)
    rvm_ticks = rvm.tick_counter
    rvm_runtime = time.perf_counter() - start
    rvm_sym = compute_symbolic_reward(env, problem.goal_thing_below)
    rvm_spa = compute_spatial_reward(env, problem.goal_thing_below)
    rvm_results = rvm_ticks, rvm_runtime, rvm_sym, rvm_spa

    # nvm_results = run_machine(nvm, problem.goal_thing_below, {"jnt": tr.tensor(rvm.ik["rest"]).float()})
    env.reset()
    env.load_blocks(problem.thing_below, num_bases=domain.num_bases)
    start = time.perf_counter()
    while True:
        t = nvm.net.tick_counter
        if t > 0 and nvm.decode("ipt", t, 0) == nvm.decode("ipt", t-1, 0): break
        nvm.net.tick(W_init, v_init)
        nvm.pullback(t)
        if t > 1 and nvm.decode("tar", t-2, 0) != nvm.decode("tar", t-1, 0):
            position = nvm.net.activities["jnt"][t][0,:,0].detach().numpy()
            env.goto_position(position, speed=1.5)
    nvm_ticks = nvm.net.tick_counter
    nvm_runtime = time.perf_counter() - start
    nvm_sym = compute_symbolic_reward(env, problem.goal_thing_below)
    nvm_spa = compute_spatial_reward(env, problem.goal_thing_below)
    nvm_results = nvm_ticks, nvm_runtime, nvm_sym, nvm_spa

    env.close()

    return rvm_results, nvm_results, nvm.size(), problem

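# A small driver that could aggregate run_trial over several random problem
# instances to compare the rvm and nvm profiles; the trial count is arbitrary
# and the `domain` object is assumed to be constructed elsewhere (it must
# provide num_bases and random_problem_instance as used above).
def compare_machines(domain, num_trials=5):
    results = []
    for trial in range(num_trials):
        rvm_results, nvm_results, nvm_size, problem = run_trial(domain)
        results.append((rvm_results, nvm_results, nvm_size))
        print("trial %d: rvm %d ticks, sym %.3f | nvm %d ticks, sym %.3f" % (
            trial, rvm_results[0], rvm_results[2], nvm_results[0], nvm_results[2]))
    return results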