def dump_trajectory_state(obs_text, frame_idx):
    if (episode_logger['num'] % config.training_dump_freq ==
            config.training_dump_freq - 1):
        st = obs_text.split('|')
        logging('[Episode {} step {}] Obs: \nl={}\ni={}\no={}\n'.format(
            episode_logger['num'], frame_idx - episode_logger['init_time'],
            clean(st[0]), clean(st[1]), clean(st[2])))
    return
def evaluate_episode(agent, env):
    step = 0
    done = False
    ob, info = env.reset()
    state = agent.build_state([ob], [info])[0]
    log('Obs{}: {} Inv: {} Desc: {}'.format(
        step, clean(ob), clean(info['inv']), clean(info['look'])))
    while not done:
        valid_acts = info['valid']
        valid_ids = agent.encode(valid_acts)
        _, action_idx, action_values = agent.act([state], [valid_ids], sample=False)
        action_idx = action_idx[0]
        action_values = action_values[0]
        action_str = valid_acts[action_idx]
        log('Action{}: {}, Q-Value {:.2f}'.format(
            step, action_str, action_values[action_idx].item()))
        s = ''
        for idx, (act, val) in enumerate(
                sorted(zip(valid_acts, action_values),
                       key=lambda x: x[1], reverse=True), 1):
            s += "{}){:.2f} {} ".format(idx, val.item(), act)
        log('Q-Values: {}'.format(s))
        ob, rew, done, info = env.step(action_str)
        log("Reward{}: {}, Score {}, Done {}".format(step, rew, info['score'], done))
        step += 1
        log('Obs{}: {} Inv: {} Desc: {}'.format(
            step, clean(ob), clean(info['inv']), clean(info['look'])))
        state = agent.build_state([ob], [info])[0]
    return info['score']
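# Minimal sketch of how evaluate_episode() above could be driven for several
# episodes and averaged. `agent` and `env` are assumed to be the same objects
# the function already expects (a DRRN-style agent and a Jericho-style env);
# `n_episodes` is an illustrative parameter, not part of the original code.
def evaluate(agent, env, n_episodes=5):
    scores = []
    for _ in range(n_episodes):
        scores.append(evaluate_episode(agent, env))
    return sum(scores) / len(scores)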
def train(agent, lm, envs, max_steps, update_freq, eval_freq, checkpoint_freq,
          log_freq, args):
    start = time.time()
    obs, rewards, dones, infos, transitions = [], [], [], [], []
    env_steps, max_score, d_in, d_out = 0, 0, defaultdict(list), defaultdict(list)
    if args.lm_top_k:
        # small pre-trained classifier to filter invalid actions that CALM generates
        import fasttext
        detector = fasttext.load_model('valid_model.bin')
        reject = lambda ob: detector.predict(clean(ob))[0][0] == '__label__invalid'
    if args.lm_dict:
        d_in = defaultdict(list, json.load(open('%s/d_in.json' % args.lm_dict, 'r')))
        d_out = defaultdict(list, json.load(open('%s/d_out.json' % args.lm_dict, 'r')))
        lm.generate_dict = json.load(open('%s/lm.json' % args.lm_dict, 'r'))

    for env in envs:
        ob, info = env.reset()
        obs, rewards, dones, infos, transitions = \
            obs + [ob], rewards + [0], dones + [False], infos + [info], transitions + [[]]
    states = build_state(lm, obs, infos)
    valid_ids = [[lm.act2ids(a) for a in info['valid']] for info in infos]

    for step in range(1, max_steps + 1):
        # act
        action_ids, action_idxs, action_values = agent.act(
            states, valid_ids, lm=lm, eps=args.eps, alpha=args.alpha, k=args.eps_top_k)
        action_strs = [info['valid'][idx] for info, idx in zip(infos, action_idxs)]

        # log envs[0]
        examples = [(action, value) for action, value in
                    zip(infos[0]['valid'], action_values[0].tolist())]
        examples = sorted(examples, key=lambda x: -x[1])
        log('State {}: {}'.format(step, lm.tokenizer.decode(states[0].state)))
        log('Actions{}: {}'.format(step, [action for action, _ in examples]))
        log('Qvalues{}: {}'.format(step, [round(value, 2) for _, value in examples]))

        # step with rejection
        next_obs, next_rewards, next_dones, next_infos = [], [], [], []
        for i, (env, action) in enumerate(zip(envs, action_strs)):
            if dones[i]:
                if env.max_score >= max_score:  # put in alpha queue
                    for transition in transitions[i]:
                        agent.observe(transition, is_prior=True)
                env_steps += infos[i]['moves']
                ob, info = env.reset()
                action_strs[i], action_ids[i], transitions[i] = 'reset', [], []
                next_obs, next_rewards, next_dones, next_infos = \
                    next_obs + [ob], next_rewards + [0], next_dones + [False], next_infos + [info]
                continue
            prev_inv, prev_look = infos[i]['inv'], infos[i]['look']
            ob, reward, done, info = env.step(action)
            if args.lm_top_k:  # deal with rejection
                key = hash(tuple(states[i][0] + states[i][1] + states[i][2]))
                l_in, l_out = d_in[key], d_out[key]
                actions = infos[i]['valid']
                rej = reject(ob) and prev_inv == info['inv'] and prev_look == info['look']

                # while action is invalid, pull another action from CALM generated candidates
                while not done and rej and len(actions) > 1:
                    if action not in l_out:
                        l_out.append(action)
                    if args.lm_type == 'ngram' and action in l_in:
                        l_in.remove(action)
                    actions.remove(action)
                    action = choice(actions)
                    ob, reward, done, info = env.step(action)
                    rej = reject(ob) and prev_inv == info['inv'] and prev_look == info['look']
                action_strs[i] = action

                if not rej and action not in l_in:
                    l_in.append(action)
                if reward < 0 and action not in l_out:
                    l_out.append(action)  # screen negative-reward actions
            next_obs, next_rewards, next_dones, next_infos = \
                next_obs + [ob], next_rewards + [reward], next_dones + [done], next_infos + [info]
            if info['score'] > max_score:  # new high score experienced
                max_score = info['score']
                agent.memory.clear_alpha()
            if done:
                tb.logkv_mean('EpisodeScore', info['score'])
        rewards, dones, infos = next_rewards, next_dones, next_infos

        # continue to log envs[0]
        log('>> Action{}: {}'.format(step, action_strs[0]))
        log("Reward{}: {}, Score {}, Done {}\n".format(
            step, rewards[0], infos[0]['score'], dones[0]))

        # generate valid actions
        next_states = build_state(lm, next_obs, infos, prev_obs=obs, prev_acts=action_strs)
        if args.lm_top_k:
            for env, info, state, done in zip(envs, infos, next_states, dones):
                if not done:
                    key = hash(tuple(state[0] + state[1] + state[2]))
                    if args.lm_type == 'ngram':
                        objs = env.get_objects()
                        actions = lm.generate(objs, k=args.lm_top_k).copy()
                    else:
                        actions = lm.generate(state.state, k=args.lm_top_k)
                    l_in, l_out = d_in[key], d_out[key]
                    actions += [action for action in l_in if action not in actions]  # add extra valid
                    actions = [action for action in actions
                               if action and action not in l_out]  # remove invalid
                    if not actions:
                        actions = ['wait', 'yes', 'no']
                    info['valid'] = actions
        next_valids = [[lm.act2ids(a) for a in info['valid']] for info in infos]
        for state, act, rew, next_state, valids, done, transition in zip(
                states, action_ids, rewards, next_states, next_valids, dones, transitions):
            if act:  # not [] (i.e. reset)
                transition.append(Transition(state, act, rew, next_state, valids, done))
                agent.observe(transition[-1])  # , is_prior=(rew != 0))
        obs, states, valid_ids = next_obs, next_states, next_valids

        if step % log_freq == 0:
            tb.logkv('Step', env_steps)
            tb.logkv("FPS", round((step * args.num_envs) / (time.time() - start), 2))
            tb.logkv("Max score seen", max_score)
            tb.logkv("#dict", len(lm.generate_dict))
            # tb.logkv("#dict_d", sum(len(env.d) for env in envs) / len(envs))
            # tb.logkv("#dict_in", len(d_in))
            tb.logkv("Last100EpisodeScores",
                     sum(env.get_end_scores(last=100) for env in envs) / len(envs))
            tb.dumpkvs()
        if step % update_freq == 0:
            loss = agent.update()
            if loss is not None:
                tb.logkv_mean('Loss', loss)
        if step % checkpoint_freq == 0:
            json.dump(d_in, open('%s/d_in.json' % args.output_dir, 'w'), indent=4)
            json.dump(d_out, open('%s/d_out.json' % args.output_dir, 'w'), indent=4)
            json.dump(lm.generate_dict, open('%s/lm.json' % args.output_dir, 'w'), indent=4)
def update_state(self, visible_state, inventory_state, objs,
                 prev_action=None, cache=None, cs_graph=None):
    prev_room = self.room
    if cs_graph is not None:
        # for key in cs_graph:
        #     for edge in cs_graph["key"]:
        #         edge = edge[self.relation]
        #         subject = edge["e1"]
        #         # relation = edge["relation"]
        #         relation = "Capable Of"
        #         predicate = edge["beams"]
        #         for pred in predicate:
        #             self.graph_state.add_edge(subject, pred, rel=relation)
        self.graph_state = nx.compose(cs_graph, self.graph_state)
        # print(self.graph_state.edges)
    # print(self.graph_state.edges)

    graph_copy = self.graph_state.copy()
    con_cs = [graph_copy.subgraph(c)
              for c in nx.weakly_connected_components(graph_copy)]
    prev_room_subgraph = None
    prev_you_subgraph = None

    for con_c in con_cs:
        for node in con_c.nodes:
            node = set(str(node).split())
            if set(prev_room.split()).issubset(node):
                prev_room_subgraph = nx.induced_subgraph(graph_copy, con_c.nodes)

    for edge in self.graph_state.edges:
        if 'you' in edge[0]:
            graph_copy.remove_edge(*edge)

    self.graph_state = graph_copy

    visible_state = visible_state.split('\n')
    room = visible_state[0]
    visible_state = clean(' '.join(visible_state[1:]))
    dirs = ['north', 'south', 'east', 'west', 'southeast', 'southwest',
            'northeast', 'northwest', 'up', 'down']
    self.visible_state = str(visible_state)
    rules = []
    if cache is None:
        sents = openie.call_stanford_openie(self.visible_state)['sentences']
    else:
        sents = cache
    if sents == "":
        return []
    in_aliases = ['are in', 'are facing', 'are standing', 'are behind',
                  'are above', 'are below', 'are in front']

    in_rl = []
    in_flag = False
    for i, ov in enumerate(sents):
        sent = ' '.join([a['word'] for a in ov['tokens']])
        triple = ov['openie']
        for d in dirs:
            if d in sent and i != 0:
                rules.append((room, 'has', 'exit to ' + d))

        for tr in triple:
            h, r, t = tr['subject'].lower(), tr['relation'].lower(), tr['object'].lower()
            if h == 'you':
                for rp in in_aliases:
                    if fuzz.token_set_ratio(r, rp) > 80:
                        r = "in"
                        in_rl.append((h, r, t))
                        in_flag = True
                        break
            if h == 'it':
                break
            if not in_flag:
                rules.append((h, r, t))

    if in_flag:
        cur_t = in_rl[0]
        for h, r, t in in_rl:
            if set(cur_t[2].split()).issubset(set(t.split())):
                cur_t = h, r, t
        rules.append(cur_t)
        room = cur_t[2]

    try:
        items = inventory_state.split(':')[1].split('\n')[1:]
        for item in items:
            rules.append(('you', 'have', str(' '.join(item.split()[1:]))))
    except:
        pass

    if prev_action is not None:
        for d in dirs:
            if d in prev_action and self.room != "":
                rules.append((prev_room, d + ' of', room))
                if prev_room_subgraph is not None:
                    for ed in prev_room_subgraph.edges:
                        rules.append((ed[0], prev_room_subgraph[ed]['rel'], ed[1]))
                break

    for o in objs:
        # if o != 'all':
        rules.append((str(o), 'in', room))

    add_rules = rules

    ### ADDING NEW NODES DONE
    for rule in add_rules:
        u = '_'.join(str(rule[0]).split())
        v = '_'.join(str(rule[2]).split())
        if u in self.vocab_kge['entity'].keys() and v in self.vocab_kge['entity'].keys():
            if u != 'it' and v != 'it':
                # print(rule[0], "space", rule[2], "space", rule[1])
                self.graph_state.add_edge(rule[0], rule[2], rel=rule[1])

    # print(self.graph_state.edges)
    # if cs_graph is not None:
    #     self.visualize()
    #     print("---------------")
    return add_rules, sents
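# Minimal, self-contained sketch of the two networkx operations update_state()
# relies on: nx.compose() to merge a commonsense graph into the running KG, and
# nx.weakly_connected_components() to recover connected room subgraphs. The toy
# nodes and relations below are illustrative only.
import networkx as nx

def _compose_demo():
    kg = nx.DiGraph()
    kg.add_edge('you', 'kitchen', rel='in')
    kg.add_edge('kitchen', 'exit to north', rel='has')

    cs = nx.DiGraph()
    cs.add_edge('knife', 'cut', rel='CapableOf')

    merged = nx.compose(cs, kg)  # union of nodes/edges; kg's attributes win on clashes
    components = [set(c) for c in nx.weakly_connected_components(merged)]
    return merged.number_of_edges(), components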
def train(self, max_steps):
    start = time.time()
    transitions = []
    obs, infos, graph_infos = self.vec_env.reset()
    for step in range(1, max_steps + 1):
        tb.logkv('Step', step)
        obs_reps = np.array([g.ob_rep for g in graph_infos])
        graph_mask_tt = self.generate_graph_mask(graph_infos)
        graph_state_reps = [g.graph_state_rep for g in graph_infos]
        scores = [info['score'] for info in infos]
        tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
            obs_reps, scores, graph_state_reps, graph_mask_tt)
        tb.logkv_mean('Value', value.mean().item())

        # Log the predictions and ground truth values
        topk_tmpl_probs, topk_tmpl_idxs = F.softmax(tmpl_pred_tt[0]).topk(5)
        topk_tmpls = [self.template_generator.templates[t]
                      for t in topk_tmpl_idxs.tolist()]
        tmpl_pred_str = ', '.join(['{} {:.3f}'.format(tmpl, prob)
                                   for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())])

        # Generate the ground truth and object mask
        admissible = [g.admissible_actions for g in graph_infos]
        objs = [g.objs for g in graph_infos]
        tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(admissible, objs)

        # Log template/object predictions/ground_truth
        gt_tmpls = [self.template_generator.templates[i]
                    for i in tmpl_gt_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        gt_objs = [self.vocab_act[i]
                   for i in obj_mask_gt_tt[0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        log('TmplPred: {} GT: {}'.format(tmpl_pred_str, ', '.join(gt_tmpls)))

        topk_o1_probs, topk_o1_idxs = F.softmax(obj_pred_tt[0, 0]).topk(5)
        topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
        o1_pred_str = ', '.join(['{} {:.3f}'.format(o, prob)
                                 for o, prob in zip(topk_o1, topk_o1_probs.tolist())])
        # graph_mask_str = [self.vocab_act[i] for i in graph_mask_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        log('ObjtPred: {} GT: {}'.format(o1_pred_str, ', '.join(gt_objs)))  # , ', '.join(graph_mask_str)))

        chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)

        obs, rewards, dones, infos, graph_infos = self.vec_env.step(chosen_actions)

        tb.logkv_mean('TotalStepsPerEpisode',
                      sum([i['steps'] for i in infos]) / float(len(graph_infos)))
        tb.logkv_mean('Valid', infos[0]['valid'])
        log('Act: {}, Rew {}, Score {}, Done {}, Value {:.3f}'.format(
            chosen_actions[0], rewards[0], infos[0]['score'], dones[0], value[0].item()))
        log('Obs: {}'.format(clean(obs[0])))
        if dones[0]:
            log('Step {} EpisodeScore {}\n'.format(step, infos[0]['score']))
        for done, info in zip(dones, infos):
            if done:
                tb.logkv_mean('EpisodeScore', info['score'])

        rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)
        done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
        self.model.reset_hidden(done_mask_tt)
        transitions.append((tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                            tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                            graph_mask_tt, dec_steps))

        if len(transitions) >= self.params['bptt']:
            tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
            self.model.clone_hidden()
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]
            scores = [info['score'] for info in infos]
            _, _, _, _, next_value, _ = self.model(obs_reps, scores,
                                                   graph_state_reps, graph_mask_tt)
            returns, advantages = self.discount_reward(transitions, next_value)
            log('Returns: ', ', '.join(['{:.3f}'.format(a[0].item()) for a in returns]))
            log('Advants: ', ', '.join(['{:.3f}'.format(a[0].item()) for a in advantages]))
            tb.logkv_mean('Advantage', advantages[-1].median().item())
            loss = self.update(transitions, returns, advantages)
            del transitions[:]
            self.model.restore_hidden()

        if step % self.params['checkpoint_interval'] == 0:
            parameters = {'model': self.model}
            torch.save(parameters, os.path.join(self.params['output_dir'], 'kga2c.pt'))

    self.vec_env.close_extras()
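# Minimal sketch of reloading the checkpoint written by train() above. Since
# torch.save() stored {'model': self.model} (the whole module rather than a
# state_dict), torch.load() needs the model classes to be importable in the
# same codebase; the path mirrors the one used at save time.
import os
import torch

def load_checkpoint(output_dir, name='kga2c.pt'):
    parameters = torch.load(os.path.join(output_dir, name))
    model = parameters['model']
    model.eval()
    return model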
def sent2ids(self, sent, maxlen=512):
    ret = self.tokenizer.encode(clean(sent))
    if len(ret) > maxlen:
        ret = ret[-maxlen:]
    if not ret:
        ret = [0]
    return ret
def act2ids(self, act):
    ret = self.tokenizer.encode(clean(act), add_prefix_space=True)
    if not ret:
        ret = [0]
    return ret
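# Minimal sketch of how act2ids() might be used to turn a list of valid action
# strings into a padded id tensor for batched scoring. `lm` stands for the
# language-model wrapper these methods belong to (an assumption about the
# surrounding code); the padding value 0 matches the [0] fallback above.
import torch
from torch.nn.utils.rnn import pad_sequence

def encode_valid_actions(lm, valid_acts):
    ids = [torch.tensor(lm.act2ids(a), dtype=torch.long) for a in valid_acts]
    return pad_sequence(ids, batch_first=True, padding_value=0)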
def interactive_run(env):
    ob, info = env.reset()
    # initialize reward/done so the first print does not reference unbound names
    reward, done = 0, False
    while True:
        print(clean(ob), 'Reward', reward, 'Done', done, 'Valid', info)
        ob, reward, done, info = env.step(input())
def train(self, max_steps):
    start = time.time()
    transitions = []
    self.back_step = -1

    previous_best_seen_score = float("-inf")
    previous_best_step = 0
    previous_best_state = None
    previous_best_snapshot = None
    self.cur_reload_step = 0
    force_reload = [False] * self.params['batch_size']
    last_edges = None

    obs, infos, graph_infos, env_str = self.vec_env.reset()
    # print(obs)
    # print(infos)
    # print(graph_infos)
    for step in range(1, max_steps + 1):
        if any(force_reload):
            print("FORCING RELOAD")
            # obs, infos, graph_infos, env_str = self.vec_env.reset()
            print(force_reload)
            self.vec_env.load_from(self.cur_reload_state, force_reload)
            force_reload = [False] * self.params['batch_size']
            # do i need to extract obs, infos, graph_infos from the refreshed state?

        tb.logkv('Step', step)
        obs_reps = np.array([g.ob_rep for g in graph_infos])
        graph_mask_tt = self.generate_graph_mask(graph_infos)
        graph_state_reps = [g.graph_state_rep for g in graph_infos]
        scores = [info['score'] for info in infos]
        tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
            obs_reps, scores, graph_state_reps, graph_mask_tt)
        tb.logkv_mean('Value', value.mean().item())

        # Log the predictions and ground truth values
        topk_tmpl_probs, topk_tmpl_idxs = F.softmax(tmpl_pred_tt[0]).topk(5)
        topk_tmpls = [self.template_generator.templates[t]
                      for t in topk_tmpl_idxs.tolist()]
        tmpl_pred_str = ', '.join(['{} {:.3f}'.format(tmpl, prob)
                                   for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())])

        # Generate the ground truth and object mask
        admissible = [g.admissible_actions for g in graph_infos]
        objs = [g.objs for g in graph_infos]
        tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(admissible, objs)

        # Log template/object predictions/ground_truth
        gt_tmpls = [self.template_generator.templates[i]
                    for i in tmpl_gt_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        gt_objs = [self.vocab_act[i]
                   for i in obj_mask_gt_tt[0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        log('TmplPred: {} GT: {}'.format(tmpl_pred_str, ', '.join(gt_tmpls)))

        topk_o1_probs, topk_o1_idxs = F.softmax(obj_pred_tt[0, 0]).topk(5)
        topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
        o1_pred_str = ', '.join(['{} {:.3f}'.format(o, prob)
                                 for o, prob in zip(topk_o1, topk_o1_probs.tolist())])
        # graph_mask_str = [self.vocab_act[i] for i in graph_mask_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        log('ObjtPred: {} GT: {}'.format(o1_pred_str, ', '.join(gt_objs)))  # , ', '.join(graph_mask_str)))

        chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)

        obs, rewards, dones, infos, graph_infos, env_str = self.vec_env.step(chosen_actions)
        force_reload = dones

        edges = [set(graph_info.graph_state.edges) for graph_info in graph_infos]
        if last_edges:
            stayed_same = [1 if (len(edges[i] - last_edges[i]) <= self.params['kg_diff_threshold'])
                           else 0 for i in range(self.params['batch_size'])]
            # print("stayed_same: {}".format(stayed_same))
            valid_kg_update = last_edges and \
                sum(stayed_same) / self.params['batch_size'] > self.params['kg_diff_batch_percentage']
        last_edges = edges

        snapshot = self.vec_env.get_snapshot()
        scores = np.array([infos[i]['score'] for i in range(len(rewards))])
        cur_max_score_idx = np.argmax(scores)
        if scores[cur_max_score_idx] > previous_best_seen_score:  # or valid_kg_update:
            print("New Reward Found OR KG updated")
            previous_best_step = step
            previous_best_state = env_str[cur_max_score_idx]
            previous_best_seen_score = scores[cur_max_score_idx]
            previous_best_snapshot = snapshot[cur_max_score_idx]
            print("\tepoch: {}".format(previous_best_step))
            print("\tnew score: {}".format(previous_best_seen_score))
            # print("\tnew state: {}".format(previous_best_state[0]))
        # print("rewards: {}".format(rewards))
        print("step {}: scores: {}, max_score: {}".format(
            step, scores, previous_best_seen_score))

        tb.logkv_mean('TotalStepsPerEpisode',
                      sum([i['steps'] for i in infos]) / float(len(graph_infos)))
        tb.logkv_mean('Valid', infos[0]['valid'])
        log('Act: {}, Rew {}, Score {}, Done {}, Value {:.3f}'.format(
            chosen_actions[0], rewards[0], infos[0]['score'], dones[0], value[0].item()))
        log('Obs: {}'.format(clean(obs[0])))
        if dones[0]:
            log('Step {} EpisodeScore {}\n'.format(step, infos[0]['score']))
        for done, info in zip(dones, infos):
            if done:
                tb.logkv_mean('EpisodeScore', info['score'])

        rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)
        done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
        self.model.reset_hidden(done_mask_tt)
        transitions.append((tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                            tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                            graph_mask_tt, dec_steps))

        if len(transitions) >= self.params['bptt']:
            tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
            self.model.clone_hidden()
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]
            scores = [info['score'] for info in infos]
            _, _, _, _, next_value, _ = self.model(obs_reps, scores,
                                                   graph_state_reps, graph_mask_tt)
            returns, advantages = self.discount_reward(transitions, next_value)
            log('Returns: ', ', '.join(['{:.3f}'.format(a[0].item()) for a in returns]))
            log('Advants: ', ', '.join(['{:.3f}'.format(a[0].item()) for a in advantages]))
            tb.logkv_mean('Advantage', advantages[-1].median().item())
            loss = self.update(transitions, returns, advantages)
            del transitions[:]
            self.model.restore_hidden()

        if step % self.params['checkpoint_interval'] == 0:
            parameters = {'model': self.model}
            torch.save(parameters, os.path.join(self.params['output_dir'], 'qbert.pt'))

        if step - previous_best_step >= self.params['patience']:
            new_back_step = (step - previous_best_step -
                             self.params['patience']) // self.params['patience']
            if new_back_step == 0:
                self.vec_env.import_snapshot(previous_best_snapshot)
            self.cur_reload_state = previous_best_snapshot[-1 - new_back_step]
            self.cur_reload_step = previous_best_step
            if new_back_step != self.back_step:
                force_reload = [True] * self.params['batch_size']
                self.back_step = new_back_step
            print("Bottleneck detected at step: {}".format(step))
            print("previous_best_step: {}".format(previous_best_step))
            print("Stepping back num: {}".format(self.back_step))
            print("Reloading with env_str: {}".format(self.cur_reload_state[0]))

    self.vec_env.close_extras()
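# Minimal sketch of the step-back schedule used in the patience branch above:
# once `patience` steps pass without a new best score, the agent reloads from a
# snapshot, stepping one snapshot further back after each additional `patience`
# window. Pure arithmetic, shown with illustrative numbers.
def back_step_index(step, previous_best_step, patience):
    if step - previous_best_step < patience:
        return None  # no bottleneck yet
    return (step - previous_best_step - patience) // patience

# e.g. with patience=100 and previous_best_step=1000:
#   back_step_index(1100, 1000, 100) -> 0  (reload the most recent snapshot)
#   back_step_index(1250, 1000, 100) -> 1  (step one snapshot further back)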
def train(self, max_steps):
    start = time.time()
    transitions = []
    obs, infos, graph_infos = self.vec_env.reset()
    obs_memory = ""
    act_mem = ""
    cs_graph = None
    # chosen_actions = ["Bedroom (in bed)"] * self.batch_size
    complete = np.zeros(self.params['batch_size']).astype(int)
    for step in progressbar.progressbar(range(1, max_steps + 1), redirect_stdout=True):
        # tb.logkv('Step', step)
        wandb.log({'Step': step}, step=step)
        descs = [g.description for g in graph_infos]  # get desc #SJF
        # if chosen_actions == None:
        #     chosen_actions = [g.description for g in graph_infos]
        obs_reps = np.array([g.ob_rep for g in graph_infos])
        graph_mask_tt = self.generate_graph_mask(graph_infos)
        graph_state_reps = [g.graph_state_rep for g in graph_infos]
        scores = [info['score'] for info in infos]
        # tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
        #     obs_reps, scores, graph_state_reps, graph_mask_tt, descs)
        tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
            obs_reps, scores, graph_state_reps, graph_mask_tt)
        wandb.log({'Value': value.mean().item()}, step=step)
        # tb.logkv_mean('Value', value.mean().item())

        # Log the predictions and ground truth values
        topk_tmpl_probs, topk_tmpl_idxs = F.softmax(tmpl_pred_tt[0]).topk(5)
        topk_tmpls = [self.template_generator.templates[t]
                      for t in topk_tmpl_idxs.tolist()]
        tmpl_pred_str = ', '.join(['{} {:.3f}'.format(tmpl, prob)
                                   for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())])

        # Generate the ground truth and object mask
        admissible = [g.admissible_actions for g in graph_infos]
        objs = [g.objs for g in graph_infos]
        tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(admissible, objs)

        # Log template/object predictions/ground_truth
        gt_tmpls = [self.template_generator.templates[i]
                    for i in tmpl_gt_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        gt_objs = [self.vocab_act[i]
                   for i in obj_mask_gt_tt[0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        log('TmplPred: {} GT: {}'.format(tmpl_pred_str, ', '.join(gt_tmpls)))

        topk_o1_probs, topk_o1_idxs = F.softmax(obj_pred_tt[0, 0]).topk(5)
        topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
        o1_pred_str = ', '.join(['{} {:.3f}'.format(o, prob)
                                 for o, prob in zip(topk_o1, topk_o1_probs.tolist())])
        # graph_mask_str = [self.vocab_act[i] for i in graph_mask_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        log('ObjtPred: {} GT: {}'.format(o1_pred_str, ', '.join(gt_objs)))  # , ', '.join(graph_mask_str)))

        chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)

        #####
        # GENERATING THE COMMONSENSE KNOWLEDGE GRAPH BASED ON OBSERVED TRIPLES
        obs, rewards, dones, infos = self.vec_env.step(chosen_actions)
        obs = list(obs)

        ### Making entire walkthrough
        for ind, ob in enumerate(obs):
            # Deleting observations
            # if ob.find('Bathroom') != -1:
            #     obs[ind] = ob.replace(", with a sink, toilet and shower", "")
            if ob.find('Bedroom') != -1:
                # obs[ind] = ob.replace("Cleaner clothing can be found in the", "There is a")
                complete[ind] = 1
            if ob.find('Bathroom') != -1 and complete[ind] == 1:
                complete[ind] = 2
            if ob.find('You take off the gold watch.') != -1 and complete[ind] == 2:
                # ob = ob.replace(", with a sink, toilet and shower", "")
                complete[ind] = 3
            if ob.find('You take off the soiled clothing') != -1 and complete[ind] == 3:
                complete[ind] = 4
            if ((ob.find('Dropped') != -1 or ob.find('Removed') != -1)
                    and ob.find('soiled clothing') != -1 and complete[ind] == 4):
                complete[ind] = 5
            if ob.find('You step into the shower, turn on the water, and within a few moments you feel like a new man.') != -1:
                complete[ind] = 6
            if ob.find('You put on the gold watch.') != -1 and complete[ind] == 6:
                complete[ind] = 7
            # if (((ob.find('keys:Taken') != -1 or ob.find('keys:Removed') != -1) and (ob.find('wallet:Taken') != -1 or ob.find('wallet:Removed') != -1)) and complete[ind] == 7):
            #     complete[ind] = 8
            # if ob.find('You open the dresser, revealing some clean clothing.') != -1 and complete[ind] == 8:
            #     complete[ind] = 9
            # if ob.find('You put on the clean clothing.') != -1 and complete[ind] >= 8 and complete[ind] <= 9:
            #     complete[ind] = 10
            # if ob.find('Living room') != -1 and complete[ind] == 10:
            #     complete[ind] = 11
            # if ob.find('You open the front door.') != -1 and complete[ind] == 11:
            #     complete[ind] = 12
            # if ob.find('Driveway') != -1 and complete[ind] == 12:
            #     complete[ind] = 13
            # if ob.find('You climb inside and start up the engine.') != -1 and complete[ind] == 13:
            #     complete[ind] = 14
            # if ob.find('Driving.') != -1 and complete[ind] == 14:
            #     complete[ind] = 15
            # obs[ind] = "This is a far from luxurious but still quite functional bathroom. The bedroom lies to the north."
        obs = tuple(obs)

        if self.use_cs == True:
            cs_graph = [None] * len(obs)
            for idx, ob in enumerate(obs):
                pos_tags = nltk.pos_tag(nltk.word_tokenize(str(obs[idx])))
                comet_input = []
                for tag in pos_tags:
                    if tag[1] == 'NN' or tag[1] == 'NNS':
                        comet_input.append(tag[0])
                nouns = []
                [nouns.append(x) for x in comet_input if x not in nouns]
                cs_graph[idx] = self.kg_extract.make_graph(nouns)
            graph_infos = self.vec_env.step(chosen_actions, obs=obs, done=dones,
                                            make_graph=1, use_cs=True, cs_graph=cs_graph)
        ######
        else:
            graph_infos = self.vec_env.step(chosen_actions, obs=obs, done=dones,
                                            make_graph=1, use_cs=False, cs_graph=cs_graph)

        # tb.logkv_mean('TotalStepsPerEpisode', sum([i['steps'] for i in infos]) / float(len(graph_infos)))
        wandb.log({'TotalStepsPerEpisode':
                   sum([i['steps'] for i in infos]) / float(len(graph_infos))}, step=step)
        # tb.logkv_mean('Valid', infos[0]['valid'])
        wandb.log({'Valid': infos[0]['valid']}, step=step)
        log('Act: {}, Rew {}, Score {}, Done {}, Value {:.3f}'.format(
            chosen_actions[0], rewards[0], infos[0]['score'], dones[0], value[0].item()))
        log('Obs: {}'.format(clean(obs[0])))
        if dones[0]:
            log('Step {} EpisodeScore {}\n'.format(step, infos[0]['score']))

        complete_mean = 0
        run_cmp = 0
        score_comp = 0
        for ind, (done, info) in enumerate(zip(dones, infos)):
            if done:
                # tb.logkv_mean('EpisodeScore', info['score'])
                if complete[ind] == 15:
                    score_comp = 1
                    # tb.logkv('EpisodeScore', 1)
                complete_mean += complete[ind]
                # tb.logkv('EpisodeReward', complete[ind])
                complete[ind] = 0
                run_cmp += 1
        if run_cmp != 0:
            wandb.log({'EpisodeReward': float(complete_mean) / run_cmp}, step=step)
        # else:
        #     wandb.log({'EpisodeReward': 0}, step=step)
        if score_comp == 1:
            wandb.log({'EpisodeScore': 1}, step=step)
        # else:
        #     wandb.log({'EpisodeScore': 0}, step=step)

        ## Replacing rewards with complete variable
        rew_tt = torch.FloatTensor(tuple(complete)).cuda().unsqueeze(1)
        # rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)
        done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
        self.model.reset_hidden(done_mask_tt)
        transitions.append((tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                            tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                            graph_mask_tt, dec_steps))

        if len(transitions) >= self.params['bptt']:
            # tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
            wandb.log({'StepsPerSecond': float(step) / (time.time() - start)}, step=step)
            self.model.clone_hidden()
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]
            scores = [info['score'] for info in infos]
            descs = [g.description for g in graph_infos]  # get desc #SJF
            # _, _, _, _, next_value, _ = self.model(obs_reps, scores, graph_state_reps, graph_mask_tt, descs)
            _, _, _, _, next_value, _ = self.model(obs_reps, scores,
                                                   graph_state_reps, graph_mask_tt)
            returns, advantages = self.discount_reward(transitions, next_value)
            log('Returns: ', ', '.join(['{:.3f}'.format(a[0].item()) for a in returns]))
            log('Advants: ', ', '.join(['{:.3f}'.format(a[0].item()) for a in advantages]))
            # tb.logkv_mean('Advantage', advantages[-1].median().item())
            wandb.log({'Advantage': advantages[-1].median().item()}, step=step)
            loss = self.update(transitions, returns, advantages, step)
            del transitions[:]
            self.model.restore_hidden()

        if step % self.params['checkpoint_interval'] == 0:
            parameters = {'model': self.model}
            torch.save(parameters,
                       os.path.join(self.params['output_dir'], 'kga2c_zork_cs.pt'))

    self.vec_env.close_extras()
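# Minimal, self-contained sketch of the noun-extraction step above that feeds
# the commonsense graph builder: POS-tag the observation with NLTK and keep
# unique NN/NNS tokens in order of first appearance. Requires the 'punkt' and
# 'averaged_perceptron_tagger' NLTK data packages to be downloaded.
import nltk

def extract_nouns(observation):
    pos_tags = nltk.pos_tag(nltk.word_tokenize(str(observation)))
    nouns = []
    for word, tag in pos_tags:
        if tag in ('NN', 'NNS') and word not in nouns:
            nouns.append(word)
    return nouns

# e.g. extract_nouns("You take off the gold watch.") -> ['watch'] (roughly,
# depending on the tagger version)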
def update_state(self, visible_state, inventory_state, objs,
                 prev_action=None, cache=None):
    # Step 1: Build a copy of the past KG (full)
    graph_copy = self.graph_state.copy()
    prev_room = self.room
    prev_room_subgraph = None
    con_cs = [graph_copy.subgraph(c)
              for c in nx.weakly_connected_components(graph_copy)]
    for con_c in con_cs:
        for node in con_c.nodes:
            node = set(str(node).split())
            if set(prev_room.split()).issubset(node):
                prev_room_subgraph = nx.induced_subgraph(graph_copy, con_c.nodes)

    # Step 2: Remove old edges with "you" --> past KG without "you"
    for edge in self.graph_state.edges:
        if 'you' in edge[0]:
            graph_copy.remove_edge(*edge)
    self.graph_state = graph_copy

    # Keep room connectivity only, remove "you"
    # <you, in, room>, <room, connect, room> --> <room, connect, room>
    graph_copy_1_connectivity = self.graph_state_1_connectivity.copy()
    for edge in self.graph_state_1_connectivity.edges:
        if 'you' in edge[0]:
            graph_copy_1_connectivity.remove_edge(*edge)
    self.graph_state_1_connectivity = graph_copy_1_connectivity

    # Step 3: Reinitialize sub-KGs
    self.graph_state_2_roomitem = nx.DiGraph()  # re-init
    self.graph_state_3_youritem = nx.DiGraph()  # re-init
    self.graph_state_4_otherroom = graph_copy.copy()  # just past information

    # Preprocess visible state --> get sents
    visible_state = visible_state.split('\n')
    room = visible_state[0]
    visible_state = clean(' '.join(visible_state[1:]))
    self.visible_state = str(visible_state)
    if cache is None:
        sents = openie.call_stanford_openie(self.visible_state)['sentences']
    else:
        sents = cache
    if sents == "":
        return []

    dirs = ['north', 'south', 'east', 'west', 'southeast', 'southwest',
            'northeast', 'northwest', 'up', 'down']
    in_aliases = ['are in', 'are facing', 'are standing', 'are behind',
                  'are above', 'are below', 'are in front']

    # Update graph; "rules" are new triples to be added.
    # Keep separate rule lists for the "you" and non-"you" cases.
    rules_1_connectivity = []  # <you, in>, <room, connect>
    rules_2_roomitem = []      # <you, in>, <room, have>
    rules_3_youritem = []      # <you, have>
    rules = []

    in_rl = []
    in_flag = False
    for i, ov in enumerate(sents):
        sent = ' '.join([a['word'] for a in ov['tokens']])
        triple = ov['openie']
        # 1.1 -> check directions
        # direction rules: <room, has, exit to direction>
        for d in dirs:
            if d in sent and i != 0:
                rules.append((room, 'has', 'exit to ' + d))
                rules_1_connectivity.append((room, 'has', 'exit to ' + d))

        # 1.2 -> check OpenIE triples
        for tr in triple:
            h, r, t = tr['subject'].lower(), tr['relation'].lower(), tr['object'].lower()
            # case 1: "you", "in"
            if h == 'you':
                for rp in in_aliases:
                    if fuzz.token_set_ratio(r, rp) > 80:
                        r = "in"
                        in_rl.append((h, r, t))  # <you, in, >
                        in_flag = True
                        break
            # case 2: should not be "it"
            if h == 'it':
                break
            # case 3: other triples
            if not in_flag:
                rules.append((h, r, t))
                rules_2_roomitem.append((h, r, t))

    # 1.3 "you are in" cases
    if in_flag:
        cur_t = in_rl[0]
        for h, r, t in in_rl:
            if set(cur_t[2].split()).issubset(set(t.split())):
                cur_t = h, r, t
        rules.append(cur_t)
        rules_1_connectivity.append(cur_t)
        rules_2_roomitem.append(cur_t)
        room = cur_t[2]
        self.room = room

    # 1.4 inventory: <you, have, ...>
    try:
        items = inventory_state.split(':')[1].split('\n')[1:]
        for item in items:
            rules.append(('you', 'have', str(' '.join(item.split()[1:]))))
            rules_3_youritem.append(('you', 'have', str(' '.join(item.split()[1:]))))  # [20200420] 3
    except:
        pass

    # 1.5 room connectivity: <room, dir, room>
    if prev_action is not None:
        for d in dirs:
            if d in prev_action and self.room != "":
                rules.append((prev_room, d + ' of', room))
                rules_1_connectivity.append((prev_room, d + ' of', room))
                if prev_room_subgraph is not None:
                    for ed in prev_room_subgraph.edges:
                        rules.append((ed[0], "prev_graph_relations", ed[1]))
                break

    # 1.6 room item: <item, in, room>
    # If the action is "drop" --> something will be in this room.
    # Therefore the binary exploration bonus should not be used!
    for o in objs:
        rules.append((str(o), 'in', room))
        rules_2_roomitem.append((str(o), 'in', room))

    # add edges: if an edge already exists, adding it again makes no difference
    add_rules = rules
    for rule in add_rules:
        u = '_'.join(str(rule[0]).split())
        v = '_'.join(str(rule[2]).split())
        if u in self.vocab_kge['entity'].keys() and v in self.vocab_kge['entity'].keys():
            if u != 'it' and v != 'it':
                self.graph_state.add_edge(rule[0], rule[2], rel=rule[1])

    # build graph_state_1_connectivity
    for rule in rules_1_connectivity:
        u = '_'.join(str(rule[0]).split())
        v = '_'.join(str(rule[2]).split())
        if u in self.vocab_kge['entity'].keys() and v in self.vocab_kge['entity'].keys():
            if u != 'it' and v != 'it':
                self.graph_state_1_connectivity.add_edge(rule[0], rule[2], rel=rule[1])

    # build graph_state_5_mask
    self.graph_state_5_mask = self.graph_state_1_connectivity.copy()

    # build graph_state_2_roomitem (and graph_state_5_mask)
    for rule in rules_2_roomitem:
        u = '_'.join(str(rule[0]).split())
        v = '_'.join(str(rule[2]).split())
        if u in self.vocab_kge['entity'].keys() and v in self.vocab_kge['entity'].keys():
            if u != 'it' and v != 'it':
                self.graph_state_2_roomitem.add_edge(rule[0], rule[2], rel=rule[1])
                self.graph_state_5_mask.add_edge(rule[0], rule[2], rel=rule[1])

    # build graph_state_3_youritem (and graph_state_5_mask)
    for rule in rules_3_youritem:
        u = '_'.join(str(rule[0]).split())
        v = '_'.join(str(rule[2]).split())
        if u in self.vocab_kge['entity'].keys() and v in self.vocab_kge['entity'].keys():
            if u != 'it' and v != 'it':
                self.graph_state_3_youritem.add_edge(rule[0], rule[2], rel=rule[1])
                self.graph_state_5_mask.add_edge(rule[0], rule[2], rel=rule[1])

    return add_rules, sents
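# Minimal sketch of the fuzzy relation-normalisation rule used in update_state()
# above: any OpenIE relation that token-set-matches one of the "you are in /
# standing / facing ..." aliases with a score above 80 is collapsed to the
# canonical relation "in".
from fuzzywuzzy import fuzz

IN_ALIASES = ['are in', 'are facing', 'are standing', 'are behind',
              'are above', 'are below', 'are in front']

def normalise_relation(relation, threshold=80):
    for alias in IN_ALIASES:
        if fuzz.token_set_ratio(relation, alias) > threshold:
            return 'in'
    return relation

# e.g. normalise_relation('are standing in') -> 'in'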
def train(self, max_steps):
    start = time.time()
    if self.params['training_type'] == 'chained':
        self.log_file("BEGINNING OF TRAINING: patience={}, max_n_steps_back={}\n".format(
            self.params['patience'], self.params['buffer_size']))
    frozen_policies = []
    transitions = []
    self.back_step = -1

    previous_best_seen_score = float("-inf")
    previous_best_step = 0
    previous_best_state = None
    previous_best_snapshot = None
    previous_best_ACTUAL_score = 0
    self.cur_reload_step = 0
    force_reload = [False] * self.params['batch_size']
    last_edges = None
    self.valid_track = np.zeros(self.params['batch_size'])
    self.stagnant_steps = 0

    INTRINSIC_MOTIVTATION = [set() for i in range(self.params['batch_size'])]

    obs, infos, graph_infos, env_str = self.vec_env.reset()
    snap_obs = obs[0]
    snap_info = infos[0]
    snap_graph_reps = None
    # print(obs)
    # print(infos)
    # print(graph_infos)
    for step in range(1, max_steps + 1):
        wallclock = time.time()

        if any(force_reload) and self.params['training_type'] == 'chained':
            num_reload = force_reload.count(True)
            t_obs = np.array(obs)
            t_obs[force_reload] = [snap_obs] * num_reload
            obs = tuple(t_obs)

            t_infos = np.array(infos)
            t_infos[force_reload] = [snap_info] * num_reload
            infos = tuple(t_infos)

            t_graphs = list(graph_infos)  # namedtuple gets lost in np.array
            t_updates = self.vec_env.load_from(self.cur_reload_state, force_reload,
                                               snap_graph_reps, snap_obs)
            for i in range(self.params['batch_size']):
                if force_reload[i]:
                    t_graphs[i] = t_updates[i]
            graph_infos = tuple(t_graphs)

            force_reload = [False] * self.params['batch_size']

        tb.logkv('Step', step)
        obs_reps = np.array([g.ob_rep for g in graph_infos])
        graph_mask_tt = self.generate_graph_mask(graph_infos)
        graph_state_reps = [g.graph_state_rep for g in graph_infos]

        if self.params['reward_type'] == 'game_only':
            scores = [info['score'] for info in infos]
        elif self.params['reward_type'] == 'IM_only':
            scores = np.array([
                int(len(INTRINSIC_MOTIVTATION[i]) * self.params['intrinsic_motivation_factor'])
                for i in range(self.params['batch_size'])
            ])
        elif self.params['reward_type'] == 'game_and_IM':
            scores = np.array([
                infos[i]['score'] + (len(INTRINSIC_MOTIVTATION[i]) *
                                     ((infos[i]['score'] + self.params['epsilon']) / self.max_game_score))
                for i in range(self.params['batch_size'])
            ])

        tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
            obs_reps, scores, graph_state_reps, graph_mask_tt)
        tb.logkv_mean('Value', value.mean().item())

        # Log the predictions and ground truth values
        topk_tmpl_probs, topk_tmpl_idxs = F.softmax(tmpl_pred_tt[0]).topk(5)
        topk_tmpls = [self.template_generator.templates[t]
                      for t in topk_tmpl_idxs.tolist()]
        tmpl_pred_str = ', '.join(['{} {:.3f}'.format(tmpl, prob)
                                   for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())])

        # Generate the ground truth and object mask
        admissible = [g.admissible_actions for g in graph_infos]
        objs = [g.objs for g in graph_infos]
        tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(admissible, objs)

        # Log template/object predictions/ground_truth
        gt_tmpls = [self.template_generator.templates[i]
                    for i in tmpl_gt_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        gt_objs = [self.vocab_act[i]
                   for i in obj_mask_gt_tt[0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        log('TmplPred: {} GT: {}'.format(tmpl_pred_str, ', '.join(gt_tmpls)))

        topk_o1_probs, topk_o1_idxs = F.softmax(obj_pred_tt[0, 0]).topk(5)
        topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
        o1_pred_str = ', '.join(['{} {:.3f}'.format(o, prob)
                                 for o, prob in zip(topk_o1, topk_o1_probs.tolist())])
        # graph_mask_str = [self.vocab_act[i] for i in graph_mask_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        log('ObjtPred: {} GT: {}'.format(o1_pred_str, ', '.join(gt_objs)))  # , ', '.join(graph_mask_str)))

        chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)

        # stepclock = time.time()
        obs, rewards, dones, infos, graph_infos, env_str = self.vec_env.step(chosen_actions)
        # print('stepclock', time.time() - stepclock)

        self.valid_track += [info['valid'] for info in infos]
        self.stagnant_steps += 1
        force_reload = list(dones)

        edges = [set(graph_info.graph_state.edges) for graph_info in graph_infos]
        size_updates = [0] * self.params['batch_size']
        for i, s in enumerate(INTRINSIC_MOTIVTATION):
            orig_size = len(s)
            s.update(edges[i])
            size_updates[i] = len(s) - orig_size

        rewards = list(rewards)
        for i in range(self.params['batch_size']):
            if self.params['reward_type'] == 'IM_only':
                rewards[i] = size_updates[i] * self.params['intrinsic_motivation_factor']
            elif self.params['reward_type'] == 'game_and_IM':
                rewards[i] += size_updates[i] * self.params['intrinsic_motivation_factor']
        rewards = tuple(rewards)

        if last_edges:
            stayed_same = [1 if (len(edges[i] - last_edges[i]) <= self.params['kg_diff_threshold'])
                           else 0 for i in range(self.params['batch_size'])]
            # print("stayed_same: {}".format(stayed_same))
            valid_kg_update = last_edges and \
                sum(stayed_same) / self.params['batch_size'] > self.params['kg_diff_batch_percentage']
        last_edges = edges

        snapshot = self.vec_env.get_snapshot()
        real_scores = np.array([infos[i]['score'] for i in range(len(rewards))])

        if self.params['reward_type'] == 'game_only':
            scores = [info['score'] for info in infos]
        elif self.params['reward_type'] == 'IM_only':
            scores = np.array([
                int(len(INTRINSIC_MOTIVTATION[i]) * self.params['intrinsic_motivation_factor'])
                for i in range(self.params['batch_size'])
            ])
        elif self.params['reward_type'] == 'game_and_IM':
            scores = np.array([
                infos[i]['score'] + (len(INTRINSIC_MOTIVTATION[i]) *
                                     ((infos[i]['score'] + self.params['epsilon']) / self.max_game_score))
                for i in range(self.params['batch_size'])
            ])

        cur_max_score_idx = np.argmax(scores)
        if scores[cur_max_score_idx] > previous_best_seen_score and \
                self.params['training_type'] == 'chained':  # or valid_kg_update:
            print("New Reward Found OR KG updated")
            previous_best_step = step
            previous_best_state = env_str[cur_max_score_idx]
            previous_best_seen_score = scores[cur_max_score_idx]
            previous_best_snapshot = snapshot[cur_max_score_idx]
            self.back_step = -1
            self.valid_track = np.zeros(self.params['batch_size'])
            self.stagnant_steps = 0
            print("\tepoch: {}".format(previous_best_step))
            print("\tnew score: {}".format(previous_best_seen_score))
            print("\tthis info: {}".format(infos[cur_max_score_idx]))
            self.log_file("New High Score Found: step:{}, new_score:{}, infos:{}\n".format(
                previous_best_step, previous_best_seen_score, infos[cur_max_score_idx]))
        previous_best_ACTUAL_score = max(np.max(real_scores), previous_best_ACTUAL_score)
        print("step {}: scores: {}, best_real_score: {}".format(
            step, scores, previous_best_ACTUAL_score))

        tb.logkv_mean('TotalStepsPerEpisode',
                      sum([i['steps'] for i in infos]) / float(len(graph_infos)))
        tb.logkv_mean('Valid', infos[0]['valid'])
        log('Act: {}, Rew {}, Score {}, Done {}, Value {:.3f}'.format(
            chosen_actions[0], rewards[0], infos[0]['score'], dones[0], value[0].item()))
        log('Obs: {}'.format(clean(obs[0])))
        if dones[0]:
            log('Step {} EpisodeScore {}\n'.format(step, infos[0]['score']))
        for done, info in zip(dones, infos):
            if done:
                tb.logkv_mean('EpisodeScore', info['score'])

        rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)
        done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
        self.model.reset_hidden(done_mask_tt)
        transitions.append((tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                            tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                            graph_mask_tt, dec_steps))

        if len(transitions) >= self.params['bptt']:
            tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
            self.model.clone_hidden()
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]
            if self.params['reward_type'] == 'game_only':
                scores = [info['score'] for info in infos]
            elif self.params['reward_type'] == 'IM_only':
                scores = np.array([
                    int(len(INTRINSIC_MOTIVTATION[i]) * self.params['intrinsic_motivation_factor'])
                    for i in range(self.params['batch_size'])
                ])
            elif self.params['reward_type'] == 'game_and_IM':
                scores = np.array([
                    infos[i]['score'] + (len(INTRINSIC_MOTIVTATION[i]) *
                                         ((infos[i]['score'] + self.params['epsilon']) / self.max_game_score))
                    for i in range(self.params['batch_size'])
                ])
            _, _, _, _, next_value, _ = self.model(obs_reps, scores,
                                                   graph_state_reps, graph_mask_tt)
            returns, advantages = self.discount_reward(transitions, next_value)
            log('Returns: ', ', '.join(['{:.3f}'.format(a[0].item()) for a in returns]))
            log('Advants: ', ', '.join(['{:.3f}'.format(a[0].item()) for a in advantages]))
            tb.logkv_mean('Advantage', advantages[-1].median().item())
            loss = self.update(transitions, returns, advantages)
            del transitions[:]
            self.model.restore_hidden()

        if step % self.params['checkpoint_interval'] == 0:
            parameters = {'model': self.model}
            torch.save(parameters, os.path.join(self.params['output_dir'], 'qbert.pt'))

        bottleneck = self.params['training_type'] == 'chained' and \
            ((self.stagnant_steps >= self.params['patience'] and
              not self.params['patience_valid_only']) or
             (self.params['patience_valid_only'] and
              sum(self.valid_track >= self.params['patience']) >=
              self.params['batch_size'] * self.params['patience_batch_factor']))
        if bottleneck:
            print("Bottleneck detected at step: {}".format(step))
            # new_backstep += 1
            # new_back_step = (step - previous_best_step - self.params['patience']) // self.params['patience']
            self.back_step += 1
            if self.back_step == 0:
                self.vec_env.import_snapshot(previous_best_snapshot)
                cur_time = time.strftime("%Y%m%d-%H%M%S")
                torch.save(self.model.state_dict(),
                           os.path.join(self.chkpt_path, '{}.pt'.format(cur_time)))
                frozen_policies.append((cur_time, previous_best_state))
                # INTRINSIC_MOTIVTATION = [set() for i in range(self.params['batch_size'])]
                self.log_file("Current model saved at: model/checkpoints/{}.pt\n".format(cur_time))
                self.model = QBERT(self.params,
                                   self.template_generator.templates,
                                   self.max_word_length,
                                   self.vocab_act,
                                   self.vocab_act_rev,
                                   len(self.sp),
                                   gat=self.params['gat']).cuda()
            if self.back_step >= self.params['buffer_size']:
                print("Buffer exhausted. Finishing training")
                self.vec_env.close_extras()
                return
            print(previous_best_snapshot[-1 - self.back_step])
            snap_obs, snap_info, snap_graph_reps, self.cur_reload_state = \
                previous_best_snapshot[-1 - self.back_step]
            print("Loading snapshot, infos: {}".format(snap_info))
            self.log_file("Loading snapshot, infos: {}\n".format(snap_info))
            self.cur_reload_step = previous_best_step
            force_reload = [True] * self.params['batch_size']
            self.valid_track = np.zeros(self.params['batch_size'])
            self.stagnant_steps = 0
            # print out observations here
            print("Current observations: {}".format([info['look'] for info in infos]))
            print("Previous_best_step: {}, step_back: {}".format(
                previous_best_step, self.back_step))
            self.log_file("Bottleneck Detected: step:{}, previous_best_step:{}, cur_step_back:{}\n".format(
                step, previous_best_step, self.back_step))
            self.log_file("Current observations: {}\n".format([info['look'] for info in infos]))
            # exit()

    self.vec_env.close_extras()
def goexplore_train(self, obs, infos, graph_infos, max_steps, INTRINSIC_MOTIVTATION):
    start = time.time()
    transitions = []
    if obs is None:
        obs, infos, graph_infos = self.vec_env.go_reset()
    for step in range(1, max_steps + 1):
        self.total_steps += 1
        tb.logkv('Step', self.total_steps)
        obs_reps = np.array([g.ob_rep for g in graph_infos])
        graph_mask_tt = self.generate_graph_mask(graph_infos)
        graph_state_reps = [g.graph_state_rep for g in graph_infos]
        # scores = [info['score'] for info in infos]
        if self.params['reward_type'] == 'game_only':
            scores = [info['score'] for info in infos]
        elif self.params['reward_type'] == 'IM_only':
            scores = np.array([
                int(len(INTRINSIC_MOTIVTATION[i]) * self.params['intrinsic_motivation_factor'])
                for i in range(self.params['batch_size'])
            ])
        elif self.params['reward_type'] == 'game_and_IM':
            scores = np.array([
                infos[i]['score'] + (len(INTRINSIC_MOTIVTATION[i]) *
                                     ((infos[i]['score'] + self.params['epsilon']) / self.max_game_score))
                for i in range(self.params['batch_size'])
            ])
        tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
            obs_reps, scores, graph_state_reps, graph_mask_tt)
        tb.logkv_mean('Value', value.mean().item())

        # Log some of the predictions and ground truth values
        topk_tmpl_probs, topk_tmpl_idxs = F.softmax(tmpl_pred_tt[0]).topk(5)
        topk_tmpls = [self.template_generator.templates[t]
                      for t in topk_tmpl_idxs.tolist()]
        tmpl_pred_str = ', '.join(['{} {:.3f}'.format(tmpl, prob)
                                   for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())])

        admissible = [g.admissible_actions for g in graph_infos]
        objs = [g.objs for g in graph_infos]
        tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(admissible, objs)
        gt_tmpls = [self.template_generator.templates[i]
                    for i in tmpl_gt_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        gt_objs = [self.vocab_act[i]
                   for i in obj_mask_gt_tt[0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        log('TmplPred: {} GT: {}'.format(tmpl_pred_str, ', '.join(gt_tmpls)))

        topk_o1_probs, topk_o1_idxs = F.softmax(obj_pred_tt[0, 0]).topk(5)
        topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
        o1_pred_str = ', '.join(['{} {:.3f}'.format(o, prob)
                                 for o, prob in zip(topk_o1, topk_o1_probs.tolist())])
        graph_mask_str = [self.vocab_act[i]
                          for i in graph_mask_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
        log('ObjtPred: {} GT: {} Mask: {}'.format(
            o1_pred_str, ', '.join(gt_objs), ', '.join(graph_mask_str)))

        chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)

        # Chooses random valid-actions to execute
        obs, rewards, dones, infos, graph_infos = self.vec_env.go_step(chosen_actions)

        edges = [set(graph_info.graph_state.edges) for graph_info in graph_infos]
        size_updates = [0] * self.params['batch_size']
        for i, s in enumerate(INTRINSIC_MOTIVTATION):
            orig_size = len(s)
            s.update(edges[i])
            size_updates[i] = len(s) - orig_size

        rewards = list(rewards)
        for i in range(self.params['batch_size']):
            if self.params['reward_type'] == 'IM_only':
                rewards[i] = size_updates[i] * self.params['intrinsic_motivation_factor']
            elif self.params['reward_type'] == 'game_and_IM':
                rewards[i] += size_updates[i] * self.params['intrinsic_motivation_factor']
        rewards = tuple(rewards)

        tb.logkv_mean('TotalStepsPerEpisode',
                      sum([i['steps'] for i in infos]) / float(len(graph_infos)))
        tb.logkv_mean('Valid', infos[0]['valid'])
        log('Act: {}, Rew {}, Score {}, Done {}, Value {:.3f}'.format(
            chosen_actions[0], rewards[0], infos[0]['score'], dones[0], value[0].item()))
        log('Obs: {}'.format(clean(obs[0])))
        if dones[0]:
            log('Step {} EpisodeScore {}\n'.format(step, infos[0]['score']))
        for done, info in zip(dones, infos):
            if done:
                tb.logkv_mean('EpisodeScore', info['score'])

        rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)
        done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
        self.model.reset_hidden(done_mask_tt)
        transitions.append((tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                            tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                            graph_mask_tt, dec_steps))

        if len(transitions) >= self.params['bptt']:
            tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
            self.model.clone_hidden()
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]
            # scores = [info['score'] for info in infos]
            if self.params['reward_type'] == 'game_only':
                scores = [info['score'] for info in infos]
            elif self.params['reward_type'] == 'IM_only':
                scores = np.array([
                    int(len(INTRINSIC_MOTIVTATION[i]) * self.params['intrinsic_motivation_factor'])
                    for i in range(self.params['batch_size'])
                ])
            elif self.params['reward_type'] == 'game_and_IM':
                scores = np.array([
                    infos[i]['score'] + (len(INTRINSIC_MOTIVTATION[i]) *
                                         ((infos[i]['score'] + self.params['epsilon']) / self.max_game_score))
                    for i in range(self.params['batch_size'])
                ])
            _, _, _, _, next_value, _ = self.model(obs_reps, scores,
                                                   graph_state_reps, graph_mask_tt)
            returns, advantages = self.discount_reward(transitions, next_value)
            log('Returns: ', ', '.join(['{:.3f}'.format(a[0].item()) for a in returns]))
            log('Advants: ', ', '.join(['{:.3f}'.format(a[0].item()) for a in advantages]))
            tb.logkv_mean('Advantage', advantages[-1].median().item())
            loss = self.update(transitions, returns, advantages)
            del transitions[:]
            self.model.restore_hidden()

        if step % self.params['checkpoint_interval'] == 0:
            parameters = {'model': self.model}
            torch.save(parameters, os.path.join(self.params['output_dir'], 'qbert.pt'))

    # self.vec_env.close_extras()
    return obs, rewards, dones, infos, graph_infos, scores, chosen_actions, INTRINSIC_MOTIVTATION
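# Minimal sketch of the knowledge-graph intrinsic-motivation shaping used in
# goexplore_train()/train() above: each environment's reward is augmented by
# the number of never-before-seen KG edges it just produced, scaled by
# `intrinsic_motivation_factor`. `seen_edges` plays the role of one entry of
# INTRINSIC_MOTIVTATION; the argument names are illustrative.
def shaped_reward(game_reward, new_edges, seen_edges, im_factor, reward_type='game_and_IM'):
    before = len(seen_edges)
    seen_edges.update(new_edges)
    novel = len(seen_edges) - before
    if reward_type == 'IM_only':
        return novel * im_factor
    if reward_type == 'game_and_IM':
        return game_reward + novel * im_factor
    return game_reward  # 'game_only'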