Example #1
def dump_trajectory_state(obs_text, frame_idx):
    if (episode_logger['num'] %
            config.training_dump_freq == config.training_dump_freq - 1):
        st = obs_text.split('|')
        logging('[Episode {} step {}] Obs: \nl={}\ni={}\no={}\n'.format(
            episode_logger['num'], frame_idx - episode_logger['init_time'],
            clean(st[0]), clean(st[1]), clean(st[2])))
    return
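A minimal, self-contained sketch of the same periodic-dump pattern; here clean, Config, and episode_logger are toy stand-ins for the project's helpers (and print stands in for its logger), not the real ones.

def clean(s):
    return ' '.join(s.split())

class Config:
    training_dump_freq = 100      # dump every 100th episode

config = Config()
episode_logger = {'num': 99, 'init_time': 0}

def dump_trajectory_state(obs_text, frame_idx):
    # Only log on every training_dump_freq-th episode.
    if episode_logger['num'] % config.training_dump_freq == config.training_dump_freq - 1:
        look, inv, obs = obs_text.split('|')
        print('[Episode {} step {}] Obs:\nl={}\ni={}\no={}'.format(
            episode_logger['num'], frame_idx - episode_logger['init_time'],
            clean(look), clean(inv), clean(obs)))

dump_trajectory_state('West of House |a leather sack| You see a small mailbox.', 5)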
Example #2
File: train.py Project: qiming-zou/tdqn
def evaluate_episode(agent, env):
    step = 0
    done = False
    ob, info = env.reset()
    state = agent.build_state([ob], [info])[0]
    log('Obs{}: {} Inv: {} Desc: {}'.format(step, clean(ob),
                                            clean(info['inv']),
                                            clean(info['look'])))
    while not done:
        valid_acts = info['valid']
        valid_ids = agent.encode(valid_acts)
        _, action_idx, action_values = agent.act([state], [valid_ids],
                                                 sample=False)
        action_idx = action_idx[0]
        action_values = action_values[0]
        action_str = valid_acts[action_idx]
        log('Action{}: {}, Q-Value {:.2f}'.format(
            step, action_str, action_values[action_idx].item()))
        s = ''
        for idx, (act, val) in enumerate(
                sorted(zip(valid_acts, action_values),
                       key=lambda x: x[1],
                       reverse=True), 1):
            s += "{}){:.2f} {} ".format(idx, val.item(), act)
        log('Q-Values: {}'.format(s))
        ob, rew, done, info = env.step(action_str)
        log("Reward{}: {}, Score {}, Done {}".format(step, rew, info['score'],
                                                     done))
        step += 1
        log('Obs{}: {} Inv: {} Desc: {}'.format(step, clean(ob),
                                                clean(info['inv']),
                                                clean(info['look'])))
        state = agent.build_state([ob], [info])[0]
    return info['score']
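evaluate_episode is a greedy rollout: rebuild the state each step, take the argmax-Q valid action (sample=False), and log the ranked Q-values. A compact, self-contained sketch of that loop with toy stand-ins for the agent's value function and the environment:

def toy_q(state, actions):
    # Toy stand-in for the agent's Q-network scoring each valid action.
    return [float(len(a)) for a in actions]

def toy_env_step(state, action):
    done = (action == 'open mailbox')
    return state + ' | ' + action, int(done), done   # obs, reward, done

def greedy_rollout(state='West of House', actions=('look', 'open mailbox')):
    score, done = 0, False
    while not done:
        ranked = sorted(zip(actions, toy_q(state, actions)),
                        key=lambda x: x[1], reverse=True)
        action = ranked[0][0]                        # greedy (sample=False) choice
        s = ''
        for idx, (act, val) in enumerate(ranked, 1):
            s += '{}){:.2f} {} '.format(idx, val, act)
        print('Q-Values:', s)
        state, reward, done = toy_env_step(state, action)
        score += reward
    return score

print('Score:', greedy_rollout())   # Score: 1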
Example #3
def train(agent, lm, envs, max_steps, update_freq, eval_freq, checkpoint_freq,
          log_freq, args):
    start = time.time()
    obs, rewards, dones, infos, transitions = [], [], [], [], []
    env_steps, max_score, d_in, d_out = 0, 0, defaultdict(list), defaultdict(
        list)
    if args.lm_top_k:  # small pre-trained classifier to filter invalid actions that CALM generates
        import fasttext
        detector = fasttext.load_model('valid_model.bin')
        reject = lambda ob: detector.predict(clean(ob))[0][
            0] == '__label__invalid'
    if args.lm_dict:
        d_in = defaultdict(list,
                           json.load(open('%s/d_in.json' % args.lm_dict, 'r')))
        d_out = defaultdict(
            list, json.load(open('%s/d_out.json' % args.lm_dict, 'r')))
        lm.generate_dict = json.load(open('%s/lm.json' % args.lm_dict, 'r'))
    for env in envs:
        ob, info = env.reset()
        obs, rewards, dones, infos, transitions = \
            obs + [ob], rewards + [0], dones + [False], infos + [info], transitions + [[]]
    states = build_state(lm, obs, infos)
    valid_ids = [[lm.act2ids(a) for a in info['valid']] for info in infos]

    for step in range(1, max_steps + 1):
        # act
        action_ids, action_idxs, action_values = agent.act(states,
                                                           valid_ids,
                                                           lm=lm,
                                                           eps=args.eps,
                                                           alpha=args.alpha,
                                                           k=args.eps_top_k)
        action_strs = [
            info['valid'][idx] for info, idx in zip(infos, action_idxs)
        ]

        # log envs[0]
        examples = [(action, value) for action, value in zip(
            infos[0]['valid'], action_values[0].tolist())]
        examples = sorted(examples, key=lambda x: -x[1])
        log('State  {}: {}'.format(step, lm.tokenizer.decode(states[0].state)))
        log('Actions{}: {}'.format(step, [action for action, _ in examples]))
        log('Qvalues{}: {}'.format(step,
                                   [round(value, 2) for _, value in examples]))

        # step with rejection
        next_obs, next_rewards, next_dones, next_infos = [], [], [], []
        for i, (env, action) in enumerate(zip(envs, action_strs)):
            if dones[i]:
                if env.max_score >= max_score:  # put in alpha queue
                    for transition in transitions[i]:
                        agent.observe(transition, is_prior=True)
                env_steps += infos[i]['moves']
                ob, info = env.reset()
                action_strs[i], action_ids[i], transitions[i] = 'reset', [], []
                next_obs, next_rewards, next_dones, next_infos = \
                    next_obs + [ob], next_rewards + [0], next_dones + [False], next_infos + [info]
                continue
            prev_inv, prev_look = infos[i]['inv'], infos[i]['look']
            ob, reward, done, info = env.step(action)
            if args.lm_top_k:  # deal with rejection
                key = hash(tuple(states[i][0] + states[i][1] + states[i][2]))
                l_in, l_out = d_in[key], d_out[key]
                actions = infos[i]['valid']
                rej = reject(ob) and prev_inv == info[
                    'inv'] and prev_look == info['look']

                # while action is invalid, pull another action from CALM generated candidates
                while not done and rej and len(actions) > 1:
                    if action not in l_out: l_out.append(action)
                    if args.lm_type == 'ngram' and action in l_in:
                        l_in.remove(action)
                    actions.remove(action)
                    action = choice(actions)
                    ob, reward, done, info = env.step(action)
                    rej = reject(ob) and prev_inv == info[
                        'inv'] and prev_look == info['look']
                action_strs[i] = action

                if not rej and action not in l_in: l_in.append(action)
                if reward < 0 and action not in l_out:
                    l_out.append(action)  # screen negative-reward actions
            next_obs, next_rewards, next_dones, next_infos = \
                next_obs + [ob], next_rewards + [reward], next_dones + [done], next_infos + [info]
            if info['score'] > max_score:  # new high score experienced
                max_score = info['score']
                agent.memory.clear_alpha()
            if done: tb.logkv_mean('EpisodeScore', info['score'])
        rewards, dones, infos = next_rewards, next_dones, next_infos

        # continue to log envs[0]
        log('>> Action{}: {}'.format(step, action_strs[0]))
        log("Reward{}: {}, Score {}, Done {}\n".format(step, rewards[0],
                                                       infos[0]['score'],
                                                       dones[0]))

        # generate valid actions
        next_states = build_state(lm,
                                  next_obs,
                                  infos,
                                  prev_obs=obs,
                                  prev_acts=action_strs)
        if args.lm_top_k:
            for env, info, state, done in zip(envs, infos, next_states, dones):
                if not done:
                    key = hash(tuple(state[0] + state[1] + state[2]))
                    if args.lm_type == 'ngram':
                        objs = env.get_objects()
                        actions = lm.generate(objs, k=args.lm_top_k).copy()
                    else:
                        actions = lm.generate(state.state, k=args.lm_top_k)
                        l_in, l_out = d_in[key], d_out[key]
                        actions += [
                            action for action in l_in if action not in actions
                        ]  # add extra valid
                        actions = [
                            action for action in actions
                            if action and action not in l_out
                        ]  # remove invalid
                    if not actions: actions = ['wait', 'yes', 'no']
                    info['valid'] = actions
        next_valids = [[lm.act2ids(a) for a in info['valid']]
                       for info in infos]
        for state, act, rew, next_state, valids, done, transition in zip(
                states, action_ids, rewards, next_states, next_valids, dones,
                transitions):
            if act:  # not [] (i.e. reset)
                transition.append(
                    Transition(state, act, rew, next_state, valids, done))
                agent.observe(transition[-1])  # , is_prior=(rew != 0))
        obs, states, valid_ids = next_obs, next_states, next_valids

        if step % log_freq == 0:
            tb.logkv('Step', env_steps)
            tb.logkv("FPS",
                     round((step * args.num_envs) / (time.time() - start), 2))
            tb.logkv("Max score seen", max_score)
            tb.logkv("#dict", len(lm.generate_dict))
            # tb.logkv("#dict_d", sum(len(env.d) for env in envs) / len(envs))
            # tb.logkv("#dict_in", len(d_in))
            tb.logkv(
                "Last100EpisodeScores",
                sum(env.get_end_scores(last=100) for env in envs) / len(envs))
            tb.dumpkvs()
        if step % update_freq == 0:
            loss = agent.update()
            if loss is not None:
                tb.logkv_mean('Loss', loss)
        if step % checkpoint_freq == 0:
            json.dump(d_in,
                      open('%s/d_in.json' % args.output_dir, 'w'),
                      indent=4)
            json.dump(d_out,
                      open('%s/d_out.json' % args.output_dir, 'w'),
                      indent=4)
            json.dump(lm.generate_dict,
                      open('%s/lm.json' % args.output_dir, 'w'),
                      indent=4)
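The d_in/d_out bookkeeping above is a per-state action cache: actions confirmed valid for a state are appended to d_in, rejected or negative-reward actions to d_out, keyed by a hash of the state's token ids, and both dictionaries are periodically dumped to JSON. A self-contained sketch of that idea (function and file names here are illustrative, not the project's):

import json
from collections import defaultdict

d_in, d_out = defaultdict(list), defaultdict(list)    # per-state valid / invalid actions

def state_key(state_ids):
    # Same trick as above: hash the concatenated token ids of the state.
    return hash(tuple(state_ids))

def record(state_ids, action, rejected, reward=0):
    key = state_key(state_ids)
    if rejected or reward < 0:
        if action not in d_out[key]:
            d_out[key].append(action)
    elif action not in d_in[key]:
        d_in[key].append(action)

def filter_candidates(state_ids, candidates):
    key = state_key(state_ids)
    kept = [a for a in candidates if a and a not in d_out[key]]
    kept += [a for a in d_in[key] if a not in kept]    # add extra known-valid actions
    return kept or ['wait', 'yes', 'no']               # same fallback as the code above

record([1, 2, 3], 'eat mailbox', rejected=True)
record([1, 2, 3], 'open mailbox', rejected=False)
print(filter_candidates([1, 2, 3], ['north', 'eat mailbox']))   # ['north', 'open mailbox']
json.dump({str(k): v for k, v in d_out.items()}, open('d_out.json', 'w'), indent=4)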
Example #4
    def update_state(self,
                     visible_state,
                     inventory_state,
                     objs,
                     prev_action=None,
                     cache=None,
                     cs_graph=None):

        prev_room = self.room
        if (cs_graph != None):
            # for key in cs_graph:
            #     for edge in cs_graph["key"]:
            #         edge = edge[self.relation]
            #         subject = edge["e1"]
            #         # relation = edge["relation"]
            #         relation = "Capable Of"
            #         predicate = edge["beams"]
            #         for pred in (predicate):
            #             self.graph_state.add_edge(subject, pred, rel=relation)
            self.graph_state = nx.compose(cs_graph, self.graph_state)
            # print(self.graph_state.edges)

        # print(self.graph_state.edges)

        graph_copy = self.graph_state.copy()
        con_cs = [
            graph_copy.subgraph(c)
            for c in nx.weakly_connected_components(graph_copy)
        ]
        prev_room_subgraph = None
        prev_you_subgraph = None

        for con_c in con_cs:
            for node in con_c.nodes:
                node = set(str(node).split())
                if set(prev_room.split()).issubset(node):
                    prev_room_subgraph = nx.induced_subgraph(
                        graph_copy, con_c.nodes)

        for edge in self.graph_state.edges:
            if 'you' in edge[0]:
                graph_copy.remove_edge(*edge)

        self.graph_state = graph_copy
        visible_state = visible_state.split('\n')
        room = visible_state[0]
        visible_state = clean(' '.join(visible_state[1:]))

        dirs = [
            'north', 'south', 'east', 'west', 'southeast', 'southwest',
            'northeast', 'northwest', 'up', 'down'
        ]

        self.visible_state = str(visible_state)
        rules = []

        if cache is None:
            sents = openie.call_stanford_openie(
                self.visible_state)['sentences']
        else:
            sents = cache

        if sents == "":
            return []

        in_aliases = [
            'are in', 'are facing', 'are standing', 'are behind', 'are above',
            'are below', 'are in front'
        ]

        in_rl = []
        in_flag = False
        for i, ov in enumerate(sents):
            sent = ' '.join([a['word'] for a in ov['tokens']])
            triple = ov['openie']
            for d in dirs:
                if d in sent and i != 0:
                    rules.append((room, 'has', 'exit to ' + d))

            for tr in triple:
                h, r, t = tr['subject'].lower(), tr['relation'].lower(
                ), tr['object'].lower()

                if h == 'you':
                    for rp in in_aliases:
                        if fuzz.token_set_ratio(r, rp) > 80:
                            r = "in"
                            in_rl.append((h, r, t))
                            in_flag = True
                            break

                if h == 'it':
                    break
                if not in_flag:
                    rules.append((h, r, t))

        if in_flag:
            cur_t = in_rl[0]
            for h, r, t in in_rl:
                if set(cur_t[2].split()).issubset(set(t.split())):
                    cur_t = h, r, t
            rules.append(cur_t)
            room = cur_t[2]

        try:
            items = inventory_state.split(':')[1].split('\n')[1:]
            for item in items:
                rules.append(('you', 'have', str(' '.join(item.split()[1:]))))
        except:
            pass

        if prev_action is not None:
            for d in dirs:
                if d in prev_action and self.room != "":
                    rules.append((prev_room, d + ' of', room))
                    if prev_room_subgraph is not None:
                        for ed in prev_room_subgraph.edges:
                            rules.append(
                                (ed[0], prev_room_subgraph.edges[ed]['rel'], ed[1]))
                    break

        for o in objs:
            #if o != 'all':
            rules.append((str(o), 'in', room))

        add_rules = rules
        ### ADDING NEW NODES DONE
        for rule in add_rules:
            u = '_'.join(str(rule[0]).split())
            v = '_'.join(str(rule[2]).split())
            if u in self.vocab_kge['entity'].keys(
            ) and v in self.vocab_kge['entity'].keys():
                if u != 'it' and v != 'it':
                    # print(rule[0],"space", rule[2],"space",  rule[1])
                    self.graph_state.add_edge(rule[0], rule[2], rel=rule[1])

        # print((self.graph_state.edges))
        # if(cs_graph != None):
        # self.visualize()

        # print("---------------")
        return add_rules, sents
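The closing loop above is what actually grows the knowledge graph: every (subject, relation, object) rule is kept only if both endpoints (with spaces replaced by underscores) appear in the entity vocabulary, and the relation is stored as the edge's rel attribute. A minimal networkx sketch of that step, with a made-up vocabulary and triples:

import networkx as nx

# Toy entity vocabulary and OpenIE-style triples; the real vocab_kge is loaded by the project.
vocab_entity = {'you', 'kitchen', 'table', 'exit_to_north'}
triples = [('you', 'in', 'kitchen'),
           ('table', 'in', 'kitchen'),
           ('kitchen', 'has', 'exit to north'),
           ('it', 'is', 'dark')]                      # filtered out: 'it' is never added

graph = nx.DiGraph()
for h, r, t in triples:
    u, v = '_'.join(h.split()), '_'.join(t.split())
    if u in vocab_entity and v in vocab_entity and u != 'it' and v != 'it':
        graph.add_edge(h, t, rel=r)                   # original strings as nodes, relation on the edge

print(list(graph.edges(data='rel')))
# [('you', 'kitchen', 'in'), ('table', 'kitchen', 'in'), ('kitchen', 'exit to north', 'has')]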
Example #5
    def train(self, max_steps):
        start = time.time()
        transitions = []
        obs, infos, graph_infos = self.vec_env.reset()
        for step in range(1, max_steps + 1):
            tb.logkv('Step', step)
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]
            scores = [info['score'] for info in infos]
            tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
                obs_reps, scores, graph_state_reps, graph_mask_tt)
            tb.logkv_mean('Value', value.mean().item())

            # Log the predictions and ground truth values
            topk_tmpl_probs, topk_tmpl_idxs = F.softmax(
                tmpl_pred_tt[0]).topk(5)
            topk_tmpls = [
                self.template_generator.templates[t]
                for t in topk_tmpl_idxs.tolist()
            ]
            tmpl_pred_str = ', '.join([
                '{} {:.3f}'.format(tmpl, prob)
                for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())
            ])

            # Generate the ground truth and object mask
            admissible = [g.admissible_actions for g in graph_infos]
            objs = [g.objs for g in graph_infos]
            tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(
                admissible, objs)

            # Log template/object predictions/ground_truth
            gt_tmpls = [
                self.template_generator.templates[i] for i in tmpl_gt_tt[0].
                nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            gt_objs = [
                self.vocab_act[i] for i in obj_mask_gt_tt[
                    0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            log('TmplPred: {} GT: {}'.format(tmpl_pred_str,
                                             ', '.join(gt_tmpls)))
            topk_o1_probs, topk_o1_idxs = F.softmax(obj_pred_tt[0, 0]).topk(5)
            topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
            o1_pred_str = ', '.join([
                '{} {:.3f}'.format(o, prob)
                for o, prob in zip(topk_o1, topk_o1_probs.tolist())
            ])
            # graph_mask_str = [self.vocab_act[i] for i in graph_mask_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
            log('ObjtPred: {} GT: {}'.format(
                o1_pred_str,
                ', '.join(gt_objs)))  # , ', '.join(graph_mask_str)))

            chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)

            obs, rewards, dones, infos, graph_infos = self.vec_env.step(
                chosen_actions)
            tb.logkv_mean(
                'TotalStepsPerEpisode',
                sum([i['steps'] for i in infos]) / float(len(graph_infos)))
            tb.logkv_mean('Valid', infos[0]['valid'])
            log('Act: {}, Rew {}, Score {}, Done {}, Value {:.3f}'.format(
                chosen_actions[0], rewards[0], infos[0]['score'], dones[0],
                value[0].item()))
            log('Obs: {}'.format(clean(obs[0])))
            if dones[0]:
                log('Step {} EpisodeScore {}\n'.format(step,
                                                       infos[0]['score']))
            for done, info in zip(dones, infos):
                if done:
                    tb.logkv_mean('EpisodeScore', info['score'])
            rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)
            done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
            self.model.reset_hidden(done_mask_tt)
            transitions.append(
                (tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                 tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                 graph_mask_tt, dec_steps))

            if len(transitions) >= self.params['bptt']:
                tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
                self.model.clone_hidden()
                obs_reps = np.array([g.ob_rep for g in graph_infos])
                graph_mask_tt = self.generate_graph_mask(graph_infos)
                graph_state_reps = [g.graph_state_rep for g in graph_infos]
                scores = [info['score'] for info in infos]
                _, _, _, _, next_value, _ = self.model(obs_reps, scores,
                                                       graph_state_reps,
                                                       graph_mask_tt)
                returns, advantages = self.discount_reward(
                    transitions, next_value)
                log('Returns: ',
                    ', '.join(['{:.3f}'.format(a[0].item()) for a in returns]))
                log(
                    'Advants: ', ', '.join(
                        ['{:.3f}'.format(a[0].item()) for a in advantages]))
                tb.logkv_mean('Advantage', advantages[-1].median().item())
                loss = self.update(transitions, returns, advantages)
                del transitions[:]
                self.model.restore_hidden()

            if step % self.params['checkpoint_interval'] == 0:
                parameters = {'model': self.model}
                torch.save(parameters,
                           os.path.join(self.params['output_dir'], 'kga2c.pt'))

        self.vec_env.close_extras()
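Every params['bptt'] steps the loop bootstraps a value estimate for the state after the rollout window and turns the buffered rewards into returns and advantages before the A2C update. discount_reward itself is not shown here; a common bootstrapped-return implementation looks roughly like the sketch below (an assumption about its shape, not the repository's exact code):

import torch

def discount_reward(rewards, done_masks, values, next_value, gamma=0.99):
    # rewards, done_masks, values: lists of [batch, 1] tensors, oldest first.
    # next_value: value estimate for the state right after the rollout window.
    returns, advantages = [], []
    R = next_value
    for rew, mask, val in zip(reversed(rewards), reversed(done_masks), reversed(values)):
        R = rew + gamma * R * mask        # mask is 0 where the episode ended
        returns.insert(0, R)
        advantages.insert(0, R - val)
    return returns, advantages

rews = [torch.tensor([[0.0]]), torch.tensor([[1.0]])]
masks = [torch.tensor([[1.0]]), torch.tensor([[1.0]])]
vals = [torch.tensor([[0.2]]), torch.tensor([[0.5]])]
rets, advs = discount_reward(rews, masks, vals, torch.tensor([[0.4]]))
print([round(r.item(), 3) for r in rets])   # [1.382, 1.396]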
Example #6
    def sent2ids(self, sent, maxlen=512):
        ret = self.tokenizer.encode(clean(sent))
        if len(ret) > maxlen:
            ret = ret[-maxlen:]
        if not ret: ret = [0]
        return ret
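sent2ids keeps only the last maxlen token ids, i.e. the most recent part of a growing game context, and never returns an empty sequence. A toy illustration of that truncation rule, with a whitespace "tokenizer" standing in for self.tokenizer:

def toy_encode(sent):
    # Stand-in for self.tokenizer.encode(): one integer id per whitespace token.
    return [abs(hash(w)) % 50257 for w in sent.split()]

def sent2ids(sent, maxlen=8):
    ids = toy_encode(sent)
    if len(ids) > maxlen:
        ids = ids[-maxlen:]   # keep the newest tokens, drop the oldest
    return ids or [0]         # empty input still yields a usable sequence

print(len(sent2ids('you are standing in an open field west of a white house')))   # 8
print(sent2ids(''))                                                               # [0]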
Example #7
    def act2ids(self, act):
        ret = self.tokenizer.encode(clean(act), add_prefix_space=True)
        if not ret: ret = [0]
        return ret
Example #8
def interactive_run(env):
    ob, info = env.reset()
    reward, done = 0, False  # initialize so the first print does not fail
    while True:
        print(clean(ob), 'Reward', reward, 'Done', done, 'Valid', info)
        ob, reward, done, info = env.step(input())
Example #9
    def train(self, max_steps):
        start = time.time()
        transitions = []
        self.back_step = -1

        previous_best_seen_score = float("-inf")
        previous_best_step = 0
        previous_best_state = None
        previous_best_snapshot = None
        self.cur_reload_step = 0
        force_reload = [False] * self.params['batch_size']
        last_edges = None

        obs, infos, graph_infos, env_str = self.vec_env.reset()
        # print (obs)
        # print (infos)
        # print (graph_infos)
        for step in range(1, max_steps + 1):
            if any(force_reload):
                print("FORCING RELOAD")
                # obs, infos, graph_infos, env_str = self.vec_env.reset()
                print(force_reload)
                self.vec_env.load_from(self.cur_reload_state, force_reload)
                force_reload = [False] * self.params['batch_size']

                # do i need to extract obs, infos, graph_infos from the refreshed state?
            tb.logkv('Step', step)
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]
            scores = [info['score'] for info in infos]
            tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
                obs_reps, scores, graph_state_reps, graph_mask_tt)
            tb.logkv_mean('Value', value.mean().item())

            # Log the predictions and ground truth values
            topk_tmpl_probs, topk_tmpl_idxs = F.softmax(
                tmpl_pred_tt[0]).topk(5)
            topk_tmpls = [
                self.template_generator.templates[t]
                for t in topk_tmpl_idxs.tolist()
            ]
            tmpl_pred_str = ', '.join([
                '{} {:.3f}'.format(tmpl, prob)
                for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())
            ])

            # Generate the ground truth and object mask
            admissible = [g.admissible_actions for g in graph_infos]
            objs = [g.objs for g in graph_infos]
            tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(
                admissible, objs)

            # Log template/object predictions/ground_truth
            gt_tmpls = [
                self.template_generator.templates[i] for i in tmpl_gt_tt[0].
                nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            gt_objs = [
                self.vocab_act[i] for i in obj_mask_gt_tt[
                    0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            log('TmplPred: {} GT: {}'.format(tmpl_pred_str,
                                             ', '.join(gt_tmpls)))
            topk_o1_probs, topk_o1_idxs = F.softmax(obj_pred_tt[0, 0]).topk(5)
            topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
            o1_pred_str = ', '.join([
                '{} {:.3f}'.format(o, prob)
                for o, prob in zip(topk_o1, topk_o1_probs.tolist())
            ])
            # graph_mask_str = [self.vocab_act[i] for i in graph_mask_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
            log('ObjtPred: {} GT: {}'.format(
                o1_pred_str,
                ', '.join(gt_objs)))  # , ', '.join(graph_mask_str)))

            chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)

            obs, rewards, dones, infos, graph_infos, env_str = self.vec_env.step(
                chosen_actions)
            force_reload = dones

            edges = [
                set(graph_info.graph_state.edges) for graph_info in graph_infos
            ]
            if last_edges:
                stayed_same = [
                    1 if (len(edges[i] - last_edges[i]) <=
                          self.params['kg_diff_threshold']) else 0
                    for i in range(self.params['batch_size'])
                ]
                # print ("stayed_same: {}".format(stayed_same))
            valid_kg_update = last_edges and sum(stayed_same) / self.params[
                'batch_size'] > self.params['kg_diff_batch_percentage']
            last_edges = edges

            snapshot = self.vec_env.get_snapshot()
            scores = np.array([infos[i]['score'] for i in range(len(rewards))])
            cur_max_score_idx = np.argmax(scores)
            if scores[
                    cur_max_score_idx] > previous_best_seen_score:  # or valid_kg_update:
                print("New Reward Found OR KG updated")
                previous_best_step = step
                previous_best_state = env_str[cur_max_score_idx]
                previous_best_seen_score = scores[cur_max_score_idx]
                previous_best_snapshot = snapshot[cur_max_score_idx]
                print("\tepoch: {}".format(previous_best_step))
                print("\tnew score: {}".format(previous_best_seen_score))
                # print ("\tnew state: {}".format(previous_best_state[0]))
            # print ("rewards: {}".format(rewards))
            print("step {}: scores: {}, max_score: {}".format(
                step, scores, previous_best_seen_score))
            tb.logkv_mean(
                'TotalStepsPerEpisode',
                sum([i['steps'] for i in infos]) / float(len(graph_infos)))
            tb.logkv_mean('Valid', infos[0]['valid'])
            log('Act: {}, Rew {}, Score {}, Done {}, Value {:.3f}'.format(
                chosen_actions[0], rewards[0], infos[0]['score'], dones[0],
                value[0].item()))
            log('Obs: {}'.format(clean(obs[0])))
            if dones[0]:
                log('Step {} EpisodeScore {}\n'.format(step,
                                                       infos[0]['score']))
            for done, info in zip(dones, infos):
                if done:
                    tb.logkv_mean('EpisodeScore', info['score'])
            rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)
            done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
            self.model.reset_hidden(done_mask_tt)
            transitions.append(
                (tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                 tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                 graph_mask_tt, dec_steps))

            if len(transitions) >= self.params['bptt']:
                tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
                self.model.clone_hidden()
                obs_reps = np.array([g.ob_rep for g in graph_infos])
                graph_mask_tt = self.generate_graph_mask(graph_infos)
                graph_state_reps = [g.graph_state_rep for g in graph_infos]
                scores = [info['score'] for info in infos]
                _, _, _, _, next_value, _ = self.model(obs_reps, scores,
                                                       graph_state_reps,
                                                       graph_mask_tt)
                returns, advantages = self.discount_reward(
                    transitions, next_value)
                log('Returns: ',
                    ', '.join(['{:.3f}'.format(a[0].item()) for a in returns]))
                log(
                    'Advants: ', ', '.join(
                        ['{:.3f}'.format(a[0].item()) for a in advantages]))
                tb.logkv_mean('Advantage', advantages[-1].median().item())
                loss = self.update(transitions, returns, advantages)
                del transitions[:]
                self.model.restore_hidden()

            if step % self.params['checkpoint_interval'] == 0:
                parameters = {'model': self.model}
                torch.save(parameters,
                           os.path.join(self.params['output_dir'], 'qbert.pt'))

            if step - previous_best_step >= self.params['patience']:
                new_back_step = (
                    step - previous_best_step -
                    self.params['patience']) // self.params['patience']
                if new_back_step == 0:
                    self.vec_env.import_snapshot(previous_best_snapshot)
                self.cur_reload_state = previous_best_snapshot[-1 -
                                                               new_back_step]
                self.cur_reload_step = previous_best_step
                if new_back_step != self.back_step:
                    force_reload = [True] * self.params['batch_size']
                self.back_step = new_back_step

                print("Bottleneck detected at step: {}".format(step))
                print("previous_best_step: {}".format(previous_best_step))
                print("Stepping back num: {}".format(self.back_step))
                print("Reloading with env_str: {}".format(
                    self.cur_reload_state[0]))

        self.vec_env.close_extras()
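The patience block at the end implements the bottleneck backtracking: once params['patience'] steps pass without a new best score, the agent reloads a snapshot from its best trajectory, stepping one snapshot further back for every additional patience-sized window with no progress. A small sketch of just that index arithmetic (toy numbers, not the trainer's state):

def back_step_index(step, best_step, patience):
    # None while still inside the patience window; afterwards 0, 1, 2, ...
    # i.e. how far back in previous_best_snapshot to reload ([-1 - index]).
    if step - best_step < patience:
        return None
    return (step - best_step - patience) // patience

best_step, patience = 100, 50
for step in (120, 150, 199, 200, 250):
    print(step, back_step_index(step, best_step, patience))
# 120 None / 150 0 / 199 0 / 200 1 / 250 2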
Example #10
    def train(self, max_steps):
        start = time.time()
        transitions = []
        obs, infos, graph_infos = self.vec_env.reset()
        obs_memory = ""
        act_mem = ""
        cs_graph = None
        # chosen_actions = ["Bedroom (in bed)"] * self.batch_size
        complete = np.zeros(self.params['batch_size']).astype(int)
        for step in progressbar.progressbar(range(1, max_steps + 1),
                                            redirect_stdout=True):
            # tb.logkv('Step', step)
            wandb.log({'Step': step}, step=step)

            descs = [g.description for g in graph_infos]  # get desc #SJF
            # if(chosen_actions == None):
            #     chosen_actions = [g.description for g in graph_infos]
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]
            scores = [info['score'] for info in infos]

            # tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
            #     obs_reps, scores, graph_state_reps, graph_mask_tt, descs)
            tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
                obs_reps, scores, graph_state_reps, graph_mask_tt)

            wandb.log({'Value': value.mean().item()}, step=step)
            # tb.logkv_mean('Value', value.mean().item())

            # Log the predictions and ground truth values
            topk_tmpl_probs, topk_tmpl_idxs = F.softmax(
                tmpl_pred_tt[0]).topk(5)
            topk_tmpls = [
                self.template_generator.templates[t]
                for t in topk_tmpl_idxs.tolist()
            ]
            tmpl_pred_str = ', '.join([
                '{} {:.3f}'.format(tmpl, prob)
                for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())
            ])

            # Generate the ground truth and object mask
            admissible = [g.admissible_actions for g in graph_infos]
            objs = [g.objs for g in graph_infos]
            tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(
                admissible, objs)

            # Log template/object predictions/ground_truth
            gt_tmpls = [
                self.template_generator.templates[i] for i in tmpl_gt_tt[0].
                nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            gt_objs = [
                self.vocab_act[i] for i in obj_mask_gt_tt[
                    0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            log('TmplPred: {} GT: {}'.format(tmpl_pred_str,
                                             ', '.join(gt_tmpls)))
            topk_o1_probs, topk_o1_idxs = F.softmax(obj_pred_tt[0, 0]).topk(5)
            topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
            o1_pred_str = ', '.join([
                '{} {:.3f}'.format(o, prob)
                for o, prob in zip(topk_o1, topk_o1_probs.tolist())
            ])
            # graph_mask_str = [self.vocab_act[i] for i in graph_mask_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
            log('ObjtPred: {} GT: {}'.format(
                o1_pred_str,
                ', '.join(gt_objs)))  # , ', '.join(graph_mask_str)))

            chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)

            #####
            ## GENERATING THE COMMONSENSE KNOWLEDGE GRAPH BASED ON OBSERVED TRIPLES
            obs, rewards, dones, infos = self.vec_env.step(chosen_actions)
            obs = list(obs)

            ### Making entire walkthrough
            for ind, ob in enumerate(obs):

                # Deleting observations
                # if(ob.find('Bathroom') != -1 ):
                # obs[ind] = ob.replace(", with a sink, toilet and shower", "")

                if (ob.find('Bedroom') != -1):
                    # obs[ind] = ob.replace("Cleaner clothing can be found in the", "There is a")
                    complete[ind] = 1
                if (ob.find('Bathroom') != -1 and complete[ind] == 1):
                    complete[ind] = 2
                if (ob.find('You take off the gold watch.') != -1
                        and complete[ind] == 2):
                    # ob = ob.replace(", with a sink, toilet and shower", "")
                    complete[ind] = 3
                if (ob.find('You take off the soiled clothing') != -1
                        and complete[ind] == 3):
                    complete[ind] = 4
                if ((ob.find('Dropped') != -1 or ob.find('Removed') != -1)
                        and ob.find('soiled clothing') != -1
                        and complete[ind] == 4):
                    complete[ind] = 5
                if (ob.find(
                        'You step into the shower, turn on the water, and within a few moments you feel like a new man.'
                ) != -1):
                    complete[ind] = 6
                if (ob.find('You put on the gold watch.') != -1
                        and complete[ind] == 6):
                    complete[ind] = 7
                # if(((ob.find('keys:Taken') != -1 or ob.find('keys:Removed') != -1) and (ob.find('wallet:Taken') != -1 or ob.find('wallet:Removed') != -1)) and complete[ind] == 7):
                #     complete[ind] = 8
                # if(ob.find('You open the dresser, revealing some clean clothing.') != -1 and complete[ind] == 8):
                #     complete[ind] = 9
                # if(ob.find('You put on the clean clothing.') != -1 and complete[ind] >= 8 and complete[ind] <= 9):
                #     complete[ind] = 10
                # if(ob.find('Living room') != -1 and complete[ind] == 10):
                #     complete[ind] = 11
                # if(ob.find('You open the front door.') != -1 and complete[ind] == 11):
                #     complete[ind] = 12
                # if(ob.find('Driveway') != -1 and complete[ind] == 12):
                #     complete[ind] = 13
                # if(ob.find('You climb inside and start up the engine.') != -1 and complete[ind] == 13):
                #     complete[ind] = 14
                # if(ob.find('Driving.') != -1 and complete[ind] == 14):
                #     complete[ind] = 15
                # obs[ind] = "This is a far from luxurious but still quite functional bathroom. The bedroom lies to the north."
            obs = tuple(obs)

            if (self.use_cs == True):

                cs_graph = [None] * len(obs)
                for idx, ob in enumerate(obs):
                    pos_tags = (nltk.pos_tag(nltk.word_tokenize(str(
                        obs[idx]))))
                    comet_input = []
                    for tag in pos_tags:
                        if (tag[1] == 'NN' or tag[1] == 'NNS'):
                            comet_input.append(tag[0])
                    nouns = []

                    [nouns.append(x) for x in comet_input if x not in nouns]
                    cs_graph[idx] = self.kg_extract.make_graph(nouns)

                graph_infos = self.vec_env.step(chosen_actions,
                                                obs=obs,
                                                done=dones,
                                                make_graph=1,
                                                use_cs=True,
                                                cs_graph=cs_graph)

            ######
            else:

                graph_infos = self.vec_env.step(chosen_actions,
                                                obs=obs,
                                                done=dones,
                                                make_graph=1,
                                                use_cs=False,
                                                cs_graph=cs_graph)

            # tb.logkv_mean('TotalStepsPerEpisode', sum([i['steps'] for i in infos]) / float(len(graph_infos)))
            wandb.log(
                {
                    'TotalStepsPerEpisode':
                    sum([i['steps'] for i in infos]) / float(len(graph_infos))
                },
                step=step)
            # tb.logkv_mean('Valid', infos[0]['valid'])
            wandb.log({'Valid': infos[0]['valid']}, step=step)
            log('Act: {}, Rew {}, Score {}, Done {}, Value {:.3f}'.format(
                chosen_actions[0], rewards[0], infos[0]['score'], dones[0],
                value[0].item()))
            log('Obs: {}'.format(clean(obs[0])))
            if dones[0]:
                log('Step {} EpisodeScore {}\n'.format(step,
                                                       infos[0]['score']))
            complete_mean = 0
            run_cmp = 0
            score_comp = 0
            for ind, (done, info) in enumerate(zip(dones, infos)):
                if done:
                    # # tb.logkv_mean('EpisodeScore', info['score'])

                    if (complete[ind] == 15):
                        score_comp = 1
                        # tb.logkv('EpisodeScore', 1)

                    complete_mean += complete[ind]
                    # tb.logkv('EpisodeReward', complete[ind])
                    complete[ind] = 0
                    run_cmp += 1
            if (run_cmp != 0):
                wandb.log({'EpisodeReward': float(complete_mean) / run_cmp},
                          step=step)
            # else:
            #     wandb.log({'EpisodeReward': 0}, step = step)

            if (score_comp == 1):
                wandb.log({'EpisodeScore': 1}, step=step)
            # else:
            #     wandb.log({'EpisodeScore': 0}, step = step)

            ## Replacing rewards with complete variable
            rew_tt = torch.FloatTensor(tuple(complete)).cuda().unsqueeze(1)
            # rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)
            done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
            self.model.reset_hidden(done_mask_tt)
            transitions.append(
                (tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                 tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                 graph_mask_tt, dec_steps))

            if len(transitions) >= self.params['bptt']:
                # tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
                wandb.log(
                    {'StepsPerSecond': float(step) / (time.time() - start)},
                    step=step)
                self.model.clone_hidden()
                obs_reps = np.array([g.ob_rep for g in graph_infos])
                graph_mask_tt = self.generate_graph_mask(graph_infos)
                graph_state_reps = [g.graph_state_rep for g in graph_infos]
                scores = [info['score'] for info in infos]
                descs = [g.description for g in graph_infos]  # get desc #SJF
                # _, _, _, _, next_value, _ = self.model(obs_reps, scores, graph_state_reps, graph_mask_tt, descs)
                _, _, _, _, next_value, _ = self.model(obs_reps, scores,
                                                       graph_state_reps,
                                                       graph_mask_tt)

                returns, advantages = self.discount_reward(
                    transitions, next_value)
                log('Returns: ',
                    ', '.join(['{:.3f}'.format(a[0].item()) for a in returns]))
                log(
                    'Advants: ', ', '.join(
                        ['{:.3f}'.format(a[0].item()) for a in advantages]))
                # tb.logkv_mean('Advantage', advantages[-1].median().item())
                wandb.log({'Advantage': advantages[-1].median().item()},
                          step=step)
                loss = self.update(transitions, returns, advantages, step)
                del transitions[:]
                self.model.restore_hidden()

            if step % self.params['checkpoint_interval'] == 0:
                parameters = {'model': self.model}
                torch.save(
                    parameters,
                    os.path.join(self.params['output_dir'],
                                 'kga2c_zork_cs.pt'))

        self.vec_env.close_extras()
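The commonsense branch above (use_cs) turns each observation into graph-builder input by POS-tagging it, keeping NN/NNS tokens, and de-duplicating them in order of first appearance. A self-contained version of that noun-extraction step (requires NLTK's tokenizer and tagger data to be downloaded):

import nltk

def extract_nouns(text):
    # Mirrors the comet_input / nouns loop above: keep NN/NNS tokens, first occurrence only.
    tags = nltk.pos_tag(nltk.word_tokenize(text))
    nouns = []
    for word, tag in tags:
        if tag in ('NN', 'NNS') and word not in nouns:
            nouns.append(word)
    return nouns

print(extract_nouns('This is a far from luxurious but still quite functional bathroom, '
                    'with a sink, toilet and shower.'))
# e.g. ['bathroom', 'sink', 'toilet', 'shower']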
Example #11
    def update_state(self,
                     visible_state,
                     inventory_state,
                     objs,
                     prev_action=None,
                     cache=None):
        # Step 1: Build a copy of past KG (full)
        graph_copy = self.graph_state.copy()
        prev_room = self.room
        prev_room_subgraph = None
        con_cs = [
            graph_copy.subgraph(c)
            for c in nx.weakly_connected_components(graph_copy)
        ]
        for con_c in con_cs:
            for node in con_c.nodes:
                node = set(str(node).split())
                if set(prev_room.split()).issubset(node):
                    prev_room_subgraph = nx.induced_subgraph(
                        graph_copy, con_c.nodes)
        # Step 2: Remove old ones with "you" --> past KG without "you"
        for edge in self.graph_state.edges:
            if 'you' in edge[0]:
                graph_copy.remove_edge(*edge)
        self.graph_state = graph_copy
        # Keep room connectivity only, remove "you"
        # <you, in, room>, <room, connect, room> --> <room, connect, room>
        graph_copy_1_connectivity = self.graph_state_1_connectivity.copy()
        for edge in self.graph_state_1_connectivity.edges:
            if 'you' in edge[0]:
                graph_copy_1_connectivity.remove_edge(*edge)
        self.graph_state_1_connectivity = graph_copy_1_connectivity
        # Step 3: Reinitialize sub-KG
        self.graph_state_2_roomitem = nx.DiGraph()  # re-init
        self.graph_state_3_youritem = nx.DiGraph()  # re-init
        self.graph_state_4_otherroom = graph_copy.copy(
        )  # Just past information
        # Preprocess visible state --> get sents
        visible_state = visible_state.split('\n')
        room = visible_state[0]
        visible_state = clean(' '.join(visible_state[1:]))
        self.visible_state = str(visible_state)
        if cache is None:
            sents = openie.call_stanford_openie(
                self.visible_state)['sentences']
        else:
            sents = cache
        if sents == "":
            return []
        dirs = [
            'north', 'south', 'east', 'west', 'southeast', 'southwest',
            'northeast', 'northwest', 'up', 'down'
        ]
        in_aliases = [
            'are in', 'are facing', 'are standing', 'are behind', 'are above',
            'are below', 'are in front'
        ]
        # Update graph, "rules" are new triples to be added
        # Add two rule lists for "you" and "woyou"
        rules_1_connectivity = []  # <you,in>, <room,connect>
        rules_2_roomitem = []  # <you,in>, <room,have>
        rules_3_youritem = []  # <you,have>
        rules = []
        in_rl = []
        in_flag = False
        for i, ov in enumerate(sents):
            sent = ' '.join([a['word'] for a in ov['tokens']])
            triple = ov['openie']
            # 1.1 -> check directions
            # direction rules: <room, has, exit to direction>
            for d in dirs:
                if d in sent and i != 0:
                    rules.append((room, 'has', 'exit to ' + d))
                    rules_1_connectivity.append((room, 'has', 'exit to ' + d))
            # 1.2 -> check OpenIE triples
            for tr in triple:
                h, r, t = tr['subject'].lower(), tr['relation'].lower(
                ), tr['object'].lower()
                # case 1: "you", "in"
                if h == 'you':
                    for rp in in_aliases:
                        if fuzz.token_set_ratio(r, rp) > 80:
                            r = "in"
                            in_rl.append((h, r, t))  # <you, in, >
                            in_flag = True
                            break
                # case 2: should not be "it"
                if h == 'it':
                    break
                # case 3: other triples
                if not in_flag:
                    rules.append((h, r, t))
                    rules_2_roomitem.append((h, r, t))
        # 1.3 "you are in" cases
        if in_flag:
            cur_t = in_rl[0]
            for h, r, t in in_rl:
                if set(cur_t[2].split()).issubset(set(t.split())):
                    cur_t = h, r, t
            rules.append(cur_t)
            rules_1_connectivity.append(cur_t)
            rules_2_roomitem.append(cur_t)
            room = cur_t[2]
            self.room = room
        # 1.4 inventory: <you, have, ...>
        try:
            items = inventory_state.split(':')[1].split('\n')[1:]
            for item in items:
                rules.append(('you', 'have', str(' '.join(item.split()[1:]))))
                rules_3_youritem.append(
                    ('you', 'have',
                     str(' '.join(item.split()[1:]))))  # [20200420] 3
        except:
            pass
        # 1.5 room connectivity: <room, dir, room>
        if prev_action is not None:
            for d in dirs:
                if d in prev_action and self.room != "":
                    rules.append((prev_room, d + ' of', room))
                    rules_1_connectivity.append((prev_room, d + ' of', room))
                    if prev_room_subgraph is not None:
                        for ed in prev_room_subgraph.edges:
                            rules.append(
                                (ed[0], "prev_graph_relations", ed[1]))
                    break
        # 1.6 room item: <item,in,room>
        # If the action is "drop" --> something will be in this room
        # Therefore binary exploration bonus should not be used!
        for o in objs:
            rules.append((str(o), 'in', room))
            rules_2_roomitem.append((str(o), 'in', room))

        # add edges: if this edge already exists, adding will not show difference
        add_rules = rules
        for rule in add_rules:
            u = '_'.join(str(rule[0]).split())
            v = '_'.join(str(rule[2]).split())
            if u in self.vocab_kge['entity'].keys(
            ) and v in self.vocab_kge['entity'].keys():
                if u != 'it' and v != 'it':
                    self.graph_state.add_edge(rule[0], rule[2], rel=rule[1])

        # build graph_state_1_connectivity
        for rule in rules_1_connectivity:
            u = '_'.join(str(rule[0]).split())
            v = '_'.join(str(rule[2]).split())
            if u in self.vocab_kge['entity'].keys(
            ) and v in self.vocab_kge['entity'].keys():
                if u != 'it' and v != 'it':
                    self.graph_state_1_connectivity.add_edge(rule[0],
                                                             rule[2],
                                                             rel=rule[1])
        # build graph_state_5_mask
        self.graph_state_5_mask = self.graph_state_1_connectivity.copy()
        # build graph_state_2_roomitem (and graph_state_5_mask)
        for rule in rules_2_roomitem:
            u = '_'.join(str(rule[0]).split())
            v = '_'.join(str(rule[2]).split())
            if u in self.vocab_kge['entity'].keys(
            ) and v in self.vocab_kge['entity'].keys():
                if u != 'it' and v != 'it':
                    self.graph_state_2_roomitem.add_edge(rule[0],
                                                         rule[2],
                                                         rel=rule[1])
                    self.graph_state_5_mask.add_edge(rule[0],
                                                     rule[2],
                                                     rel=rule[1])
        # build graph_state_3_youritem (and graph_state_5_mask)
        for rule in rules_3_youritem:
            u = '_'.join(str(rule[0]).split())
            v = '_'.join(str(rule[2]).split())
            if u in self.vocab_kge['entity'].keys(
            ) and v in self.vocab_kge['entity'].keys():
                if u != 'it' and v != 'it':
                    self.graph_state_3_youritem.add_edge(rule[0],
                                                         rule[2],
                                                         rel=rule[1])
                    self.graph_state_5_mask.add_edge(rule[0],
                                                     rule[2],
                                                     rel=rule[1])
        return add_rules, sents
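Both update_state variants begin each step the same way: copy the graph, drop every edge whose source node mentions "you" (the agent's location and possessions are re-derived from the new observation), and keep everything else as persistent world knowledge. A compact sketch of that pruning step:

import networkx as nx

g = nx.DiGraph()
g.add_edge('you', 'kitchen', rel='in')
g.add_edge('you', 'brass lamp', rel='have')
g.add_edge('kitchen', 'exit to north', rel='has')

pruned = g.copy()
for u, v in list(g.edges):
    if 'you' in u:                 # same membership test as the code above
        pruned.remove_edge(u, v)

print(list(pruned.edges))          # [('kitchen', 'exit to north')]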
Example #12
    def train(self, max_steps):
        start = time.time()
        if self.params['training_type'] == 'chained':
            self.log_file(
                "BEGINNING OF TRAINING: patience={}, max_n_steps_back={}\n".
                format(self.params['patience'], self.params['buffer_size']))
        frozen_policies = []
        transitions = []
        self.back_step = -1

        previous_best_seen_score = float("-inf")
        previous_best_step = 0
        previous_best_state = None
        previous_best_snapshot = None
        previous_best_ACTUAL_score = 0
        self.cur_reload_step = 0
        force_reload = [False] * self.params['batch_size']
        last_edges = None

        self.valid_track = np.zeros(self.params['batch_size'])
        self.stagnant_steps = 0

        INTRINSIC_MOTIVTATION = [
            set() for i in range(self.params['batch_size'])
        ]

        obs, infos, graph_infos, env_str = self.vec_env.reset()
        snap_obs = obs[0]
        snap_info = infos[0]
        snap_graph_reps = None
        # print (obs)
        # print (infos)
        # print (graph_infos)
        for step in range(1, max_steps + 1):
            wallclock = time.time()

            if any(force_reload) and self.params['training_type'] == 'chained':
                num_reload = force_reload.count(True)
                t_obs = np.array(obs)
                t_obs[force_reload] = [snap_obs] * num_reload
                obs = tuple(t_obs)

                t_infos = np.array(infos)
                t_infos[force_reload] = [snap_info] * num_reload
                infos = tuple(t_infos)

                t_graphs = list(graph_infos)
                # namedtuple gets lost in np.array
                t_updates = self.vec_env.load_from(self.cur_reload_state,
                                                   force_reload,
                                                   snap_graph_reps, snap_obs)
                for i in range(self.params['batch_size']):
                    if force_reload[i]:
                        t_graphs[i] = t_updates[i]
                graph_infos = tuple(t_graphs)

                force_reload = [False] * self.params['batch_size']

            tb.logkv('Step', step)
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]

            if self.params['reward_type'] == 'game_only':
                scores = [info['score'] for info in infos]
            elif self.params['reward_type'] == 'IM_only':
                scores = np.array([
                    int(
                        len(INTRINSIC_MOTIVTATION[i]) *
                        self.params['intrinsic_motivation_factor'])
                    for i in range(self.params['batch_size'])
                ])
            elif self.params['reward_type'] == 'game_and_IM':
                scores = np.array([
                    infos[i]['score'] +
                    (len(INTRINSIC_MOTIVTATION[i]) *
                     ((infos[i]['score'] + self.params['epsilon']) /
                      self.max_game_score))
                    for i in range(self.params['batch_size'])
                ])
            tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
                obs_reps, scores, graph_state_reps, graph_mask_tt)
            tb.logkv_mean('Value', value.mean().item())

            # Log the predictions and ground truth values
            topk_tmpl_probs, topk_tmpl_idxs = F.softmax(
                tmpl_pred_tt[0]).topk(5)
            topk_tmpls = [
                self.template_generator.templates[t]
                for t in topk_tmpl_idxs.tolist()
            ]
            tmpl_pred_str = ', '.join([
                '{} {:.3f}'.format(tmpl, prob)
                for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())
            ])

            # Generate the ground truth and object mask
            admissible = [g.admissible_actions for g in graph_infos]
            objs = [g.objs for g in graph_infos]
            tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(
                admissible, objs)

            # Log template/object predictions/ground_truth
            gt_tmpls = [
                self.template_generator.templates[i] for i in tmpl_gt_tt[0].
                nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            gt_objs = [
                self.vocab_act[i] for i in obj_mask_gt_tt[
                    0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            log('TmplPred: {} GT: {}'.format(tmpl_pred_str,
                                             ', '.join(gt_tmpls)))
            topk_o1_probs, topk_o1_idxs = F.softmax(obj_pred_tt[0, 0]).topk(5)
            topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
            o1_pred_str = ', '.join([
                '{} {:.3f}'.format(o, prob)
                for o, prob in zip(topk_o1, topk_o1_probs.tolist())
            ])
            # graph_mask_str = [self.vocab_act[i] for i in graph_mask_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()]
            log('ObjtPred: {} GT: {}'.format(
                o1_pred_str,
                ', '.join(gt_objs)))  # , ', '.join(graph_mask_str)))

            chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)

            #stepclock = time.time()

            obs, rewards, dones, infos, graph_infos, env_str = self.vec_env.step(
                chosen_actions)

            #print('stepclock', time.time() - stepclock)
            self.valid_track += [info['valid'] for info in infos]
            self.stagnant_steps += 1
            force_reload = list(dones)

            edges = [
                set(graph_info.graph_state.edges) for graph_info in graph_infos
            ]
            size_updates = [0] * self.params['batch_size']
            for i, s in enumerate(INTRINSIC_MOTIVTATION):
                orig_size = len(s)
                s.update(edges[i])
                size_updates[i] = len(s) - orig_size
            rewards = list(rewards)
            for i in range(self.params['batch_size']):
                if self.params['reward_type'] == 'IM_only':
                    rewards[i] = size_updates[i] * self.params[
                        'intrinsic_motivation_factor']
                elif self.params['reward_type'] == 'game_and_IM':
                    rewards[i] += size_updates[i] * self.params[
                        'intrinsic_motivation_factor']
            rewards = tuple(rewards)

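            # Compare current KG edges with the previous step's to detect whether
            # most environments' graphs have effectively stopped changing.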
            if last_edges:
                stayed_same = [
                    1 if (len(edges[i] - last_edges[i]) <=
                          self.params['kg_diff_threshold']) else 0
                    for i in range(self.params['batch_size'])
                ]
                # print ("stayed_same: {}".format(stayed_same))
            valid_kg_update = last_edges and sum(stayed_same) / self.params[
                'batch_size'] > self.params['kg_diff_batch_percentage']
            last_edges = edges

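            # Snapshot the environments and recompute the (possibly IM-shaped)
            # scores used to decide whether a new best state has been reached.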
            snapshot = self.vec_env.get_snapshot()
            real_scores = np.array(
                [infos[i]['score'] for i in range(len(rewards))])

            if self.params['reward_type'] == 'game_only':
                scores = [info['score'] for info in infos]
            elif self.params['reward_type'] == 'IM_only':
                scores = np.array([
                    int(
                        len(INTRINSIC_MOTIVTATION[i]) *
                        self.params['intrinsic_motivation_factor'])
                    for i in range(self.params['batch_size'])
                ])
            elif self.params['reward_type'] == 'game_and_IM':
                scores = np.array([
                    infos[i]['score'] +
                    (len(INTRINSIC_MOTIVTATION[i]) *
                     ((infos[i]['score'] + self.params['epsilon']) /
                      self.max_game_score))
                    for i in range(self.params['batch_size'])
                ])
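            # Chained training: when a new best score appears, remember the step,
            # environment state, and snapshot so training can later back up to it,
            # and reset the patience counters.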
            cur_max_score_idx = np.argmax(scores)
            if (scores[cur_max_score_idx] > previous_best_seen_score
                    and self.params['training_type'] == 'chained'):  # or valid_kg_update:
                print("New Reward Founded OR KG updated")
                previous_best_step = step
                previous_best_state = env_str[cur_max_score_idx]
                previous_best_seen_score = scores[cur_max_score_idx]
                previous_best_snapshot = snapshot[cur_max_score_idx]
                self.back_step = -1
                self.valid_track = np.zeros(self.params['batch_size'])
                self.stagnant_steps = 0
                print("\tepoch: {}".format(previous_best_step))
                print("\tnew score: {}".format(previous_best_seen_score))
                print("\tthis info: {}".format(infos[cur_max_score_idx]))
                self.log_file(
                    "New High Score Founded: step:{}, new_score:{}, infos:{}\n"
                    .format(previous_best_step, previous_best_seen_score,
                            infos[cur_max_score_idx]))

            previous_best_ACTUAL_score = max(np.max(real_scores),
                                             previous_best_ACTUAL_score)
            print("step {}: scores: {}, best_real_score: {}".format(
                step, scores, previous_best_ACTUAL_score))

            tb.logkv_mean(
                'TotalStepsPerEpisode',
                sum([i['steps'] for i in infos]) / float(len(graph_infos)))
            tb.logkv_mean('Valid', infos[0]['valid'])
            log('Act: {}, Rew {}, Score {}, Done {}, Value {:.3f}'.format(
                chosen_actions[0], rewards[0], infos[0]['score'], dones[0],
                value[0].item()))
            log('Obs: {}'.format(clean(obs[0])))
            if dones[0]:
                log('Step {} EpisodeScore {}\n'.format(step,
                                                       infos[0]['score']))
            for done, info in zip(dones, infos):
                if done:
                    tb.logkv_mean('EpisodeScore', info['score'])
            rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)
            done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
            self.model.reset_hidden(done_mask_tt)
            transitions.append(
                (tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                 tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                 graph_mask_tt, dec_steps))

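            # Once a full BPTT window of transitions is collected, bootstrap with
            # the value of the next state, compute discounted returns and
            # advantages, and run one actor-critic update before clearing the buffer.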
            if len(transitions) >= self.params['bptt']:
                tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
                self.model.clone_hidden()
                obs_reps = np.array([g.ob_rep for g in graph_infos])
                graph_mask_tt = self.generate_graph_mask(graph_infos)
                graph_state_reps = [g.graph_state_rep for g in graph_infos]
                if self.params['reward_type'] == 'game_only':
                    scores = [info['score'] for info in infos]
                elif self.params['reward_type'] == 'IM_only':
                    scores = np.array([
                        int(
                            len(INTRINSIC_MOTIVTATION[i]) *
                            self.params['intrinsic_motivation_factor'])
                        for i in range(self.params['batch_size'])
                    ])
                elif self.params['reward_type'] == 'game_and_IM':
                    scores = np.array([
                        infos[i]['score'] +
                        (len(INTRINSIC_MOTIVTATION[i]) *
                         ((infos[i]['score'] + self.params['epsilon']) /
                          self.max_game_score))
                        for i in range(self.params['batch_size'])
                    ])
                _, _, _, _, next_value, _ = self.model(obs_reps, scores,
                                                       graph_state_reps,
                                                       graph_mask_tt)
                returns, advantages = self.discount_reward(
                    transitions, next_value)
                log('Returns: ',
                    ', '.join(['{:.3f}'.format(a[0].item()) for a in returns]))
                log(
                    'Advants: ', ', '.join(
                        ['{:.3f}'.format(a[0].item()) for a in advantages]))
                tb.logkv_mean('Advantage', advantages[-1].median().item())
                loss = self.update(transitions, returns, advantages)
                del transitions[:]
                self.model.restore_hidden()

            if step % self.params['checkpoint_interval'] == 0:
                parameters = {'model': self.model}
                torch.save(parameters,
                           os.path.join(self.params['output_dir'], 'qbert.pt'))

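            # Bottleneck detection (chained training only): triggered when the agent
            # goes 'patience' steps without a new best score, or when enough
            # environments have accumulated 'patience' valid actions without progress.
            # The current policy is then frozen, the model re-initialized, and a
            # progressively earlier snapshot of the best state is reloaded.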
            bottleneck = self.params['training_type'] == 'chained' and (
                (self.stagnant_steps >= self.params['patience']
                 and not self.params['patience_valid_only'])
                or (self.params['patience_valid_only']
                    and sum(self.valid_track >= self.params['patience']) >=
                    self.params['batch_size'] * self.params['patience_batch_factor']))
            if bottleneck:
                print("Bottleneck detected at step: {}".format(step))
                # new_backstep += 1
                # new_back_step = (step - previous_best_step - self.params['patience']) // self.params['patience']
                self.back_step += 1
                if self.back_step == 0:
                    self.vec_env.import_snapshot(previous_best_snapshot)
                    cur_time = time.strftime("%Y%m%d-%H%M%S")
                    torch.save(
                        self.model.state_dict(),
                        os.path.join(self.chkpt_path,
                                     '{}.pt'.format(cur_time)))
                    frozen_policies.append((cur_time, previous_best_state))
                    # INTRINSIC_MOTIVTATION= [set() for i in range(self.params['batch_size'])]
                    self.log_file(
                        "Current model saved at: model/checkpoints/{}.pt\n".
                        format(cur_time))
                self.model = QBERT(self.params,
                                   self.template_generator.templates,
                                   self.max_word_length,
                                   self.vocab_act,
                                   self.vocab_act_rev,
                                   len(self.sp),
                                   gat=self.params['gat']).cuda()

                if self.back_step >= self.params['buffer_size']:
                    print("Buffer exhausted. Finishing training")
                    self.vec_env.close_extras()
                    return
                print(previous_best_snapshot[-1 - self.back_step])
                snap_obs, snap_info, snap_graph_reps, self.cur_reload_state = previous_best_snapshot[
                    -1 - self.back_step]
                print("Loading snapshot, infos: {}".format(snap_info))
                self.log_file(
                    "Loading snapshot, infos: {}\n".format(snap_info))
                self.cur_reload_step = previous_best_step
                force_reload = [True] * self.params['batch_size']
                self.valid_track = np.zeros(self.params['batch_size'])
                self.stagnant_steps = 0

                # print out observations here
                print("Current observations: {}".format(
                    [info['look'] for info in infos]))
                print("Previous_best_step: {}, step_back: {}".format(
                    previous_best_step, self.back_step))
                self.log_file(
                    "Bottleneck Detected: step:{}, previous_best_step:{}, cur_step_back:{}\n"
                    .format(step, previous_best_step, self.back_step))
                self.log_file("Current observations: {}\n".format(
                    [info['look'] for info in infos]))
            #exit()

        self.vec_env.close_extras()
Example #13
0
    def goexplore_train(self, obs, infos, graph_infos, max_steps,
                        INTRINSIC_MOTIVTATION):
        start = time.time()
        transitions = []
        if obs is None:
            obs, infos, graph_infos = self.vec_env.go_reset()
        for step in range(1, max_steps + 1):
            self.total_steps += 1
            tb.logkv('Step', self.total_steps)
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]
            # scores = [info['score'] for info in infos]
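            # The score feature fed to the model depends on the reward type:
            # raw game score, scaled size of the per-env KG edge set (IM only),
            # or game score plus a KG-size bonus scaled by relative game progress.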
            if self.params['reward_type'] == 'game_only':
                scores = [info['score'] for info in infos]
            elif self.params['reward_type'] == 'IM_only':
                scores = np.array([
                    int(
                        len(INTRINSIC_MOTIVTATION[i]) *
                        self.params['intrinsic_motivation_factor'])
                    for i in range(self.params['batch_size'])
                ])
            elif self.params['reward_type'] == 'game_and_IM':
                scores = np.array([
                    infos[i]['score'] +
                    (len(INTRINSIC_MOTIVTATION[i]) *
                     ((infos[i]['score'] + self.params['epsilon']) /
                      self.max_game_score))
                    for i in range(self.params['batch_size'])
                ])

            tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
                obs_reps, scores, graph_state_reps, graph_mask_tt)
            tb.logkv_mean('Value', value.mean().item())

            # Log some of the predictions and ground truth values
            topk_tmpl_probs, topk_tmpl_idxs = F.softmax(
                tmpl_pred_tt[0], dim=-1).topk(5)
            topk_tmpls = [
                self.template_generator.templates[t]
                for t in topk_tmpl_idxs.tolist()
            ]
            tmpl_pred_str = ', '.join([
                '{} {:.3f}'.format(tmpl, prob)
                for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())
            ])

            admissible = [g.admissible_actions for g in graph_infos]
            objs = [g.objs for g in graph_infos]
            tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(
                admissible, objs)

            gt_tmpls = [
                self.template_generator.templates[i] for i in tmpl_gt_tt[0].
                nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            gt_objs = [
                self.vocab_act[i] for i in obj_mask_gt_tt[
                    0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            log('TmplPred: {} GT: {}'.format(tmpl_pred_str,
                                             ', '.join(gt_tmpls)))
            topk_o1_probs, topk_o1_idxs = F.softmax(
                obj_pred_tt[0, 0], dim=-1).topk(5)
            topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
            o1_pred_str = ', '.join([
                '{} {:.3f}'.format(o, prob)
                for o, prob in zip(topk_o1, topk_o1_probs.tolist())
            ])
            graph_mask_str = [
                self.vocab_act[i] for i in graph_mask_tt[0].nonzero().squeeze(
                ).cpu().numpy().flatten().tolist()
            ]
            log('ObjtPred: {} GT: {} Mask: {}'.format(
                o1_pred_str, ', '.join(gt_objs), ', '.join(graph_mask_str)))

            chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)

            # Step the environments with the actions decoded from the model

            obs, rewards, dones, infos, graph_infos = self.vec_env.go_step(
                chosen_actions)

            edges = [
                set(graph_info.graph_state.edges) for graph_info in graph_infos
            ]
            size_updates = [0] * self.params['batch_size']
            for i, s in enumerate(INTRINSIC_MOTIVTATION):
                orig_size = len(s)
                s.update(edges[i])
                size_updates[i] = len(s) - orig_size
            rewards = list(rewards)
            for i in range(self.params['batch_size']):
                if self.params['reward_type'] == 'IM_only':
                    rewards[i] = size_updates[i] * self.params[
                        'intrinsic_motivation_factor']
                elif self.params['reward_type'] == 'game_and_IM':
                    rewards[i] += size_updates[i] * self.params[
                        'intrinsic_motivation_factor']
            rewards = tuple(rewards)

            tb.logkv_mean(
                'TotalStepsPerEpisode',
                sum([i['steps'] for i in infos]) / float(len(graph_infos)))
            tb.logkv_mean('Valid', infos[0]['valid'])
            log('Act: {}, Rew {}, Score {}, Done {}, Value {:.3f}'.format(
                chosen_actions[0], rewards[0], infos[0]['score'], dones[0],
                value[0].item()))
            log('Obs: {}'.format(clean(obs[0])))
            if dones[0]:
                log('Step {} EpisodeScore {}\n'.format(step,
                                                       infos[0]['score']))
            for done, info in zip(dones, infos):
                if done:
                    tb.logkv_mean('EpisodeScore', info['score'])
            rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)

            done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
            self.model.reset_hidden(done_mask_tt)
            transitions.append(
                (tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                 tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                 graph_mask_tt, dec_steps))

            if len(transitions) >= self.params['bptt']:
                tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
                self.model.clone_hidden()
                obs_reps = np.array([g.ob_rep for g in graph_infos])
                graph_mask_tt = self.generate_graph_mask(graph_infos)
                graph_state_reps = [g.graph_state_rep for g in graph_infos]
                # scores = [info['score'] for info in infos]
                if self.params['reward_type'] == 'game_only':
                    scores = [info['score'] for info in infos]
                elif self.params['reward_type'] == 'IM_only':
                    scores = np.array([
                        int(
                            len(INTRINSIC_MOTIVTATION[i]) *
                            self.params['intrinsic_motivation_factor'])
                        for i in range(self.params['batch_size'])
                    ])
                elif self.params['reward_type'] == 'game_and_IM':
                    scores = np.array([
                        infos[i]['score'] +
                        (len(INTRINSIC_MOTIVTATION[i]) *
                         ((infos[i]['score'] + self.params['epsilon']) /
                          self.max_game_score))
                        for i in range(self.params['batch_size'])
                    ])
                _, _, _, _, next_value, _ = self.model(obs_reps, scores,
                                                       graph_state_reps,
                                                       graph_mask_tt)
                returns, advantages = self.discount_reward(
                    transitions, next_value)
                log('Returns: ',
                    ', '.join(['{:.3f}'.format(a[0].item()) for a in returns]))
                log(
                    'Advants: ', ', '.join(
                        ['{:.3f}'.format(a[0].item()) for a in advantages]))
                tb.logkv_mean('Advantage', advantages[-1].median().item())
                loss = self.update(transitions, returns, advantages)
                del transitions[:]
                self.model.restore_hidden()

            if step % self.params['checkpoint_interval'] == 0:
                parameters = {'model': self.model}
                torch.save(parameters,
                           os.path.join(self.params['output_dir'], 'qbert.pt'))

        # self.vec_env.close_extras()
        return obs, rewards, dones, infos, graph_infos, scores, chosen_actions, INTRINSIC_MOTIVTATION