Example #1
    def compute_metric(self, preds, data):
        '''
        compute F1 and exact-match scores for the low-level action
        sequences and high-level controller outputs
        '''

        m = collections.defaultdict(list)
        for ex, feat in tqdm.tqdm(data, ncols=80, desc='compute_metric'):
            # if 'repeat_idx' in ex: ex = self.load_task_json(self.args, ex, None)[0]
            key = (ex['task_id'], ex['repeat_idx'])
            # feat should already contain the features below, since all
            # AlfredDataset instances fed into this function have test_mode=False:
            # feat = self.featurize(ex, self.args, False, load_mask=True, load_frames=True)

            # Evaluate low-level actions: decode gold and predicted token
            # ids into action strings.
            label = ' '.join(self.vocab['action_low'].index2word(
                feat['action_low'].tolist()))
            pred = ' '.join(self.vocab['action_low'].index2word(
                preds[key]['action_low']))

            label_lower = label.lower()
            pred_lower = pred.lower()

            m['action_low_f1'].append(compute_f1(label_lower, pred_lower))
            m['action_low_em'].append(compute_exact(label_lower, pred_lower))
            m['action_low_gold_length'].append(len(label.split()))
            m['action_low_pred_length'].append(len(pred.split()))
            m['action_low_edit_distance'].append(
                compute_edit_distance(label_lower, pred_lower))

            # Evaluate high-level controller.
            # Get indexes of predicted transitions: token id 2 is assumed to be
            # the stop action ending each subgoal segment (the final token is
            # excluded since it only terminates the episode).
            stop_idxs = np.argwhere(
                np.array(preds[key]['action_low'])[:-1] == 2).flatten()
            high_idxs = np.append([0], stop_idxs + 1).astype(np.int32)

            # Get predicted and gold submodule indices at each transition.
            pred_high_idx = np.array(preds[key]['controller_attn'])[high_idxs]
            label_high_idx = feat['module_idxs'][np.nonzero(
                feat['transition_mask'])]

            pred = ' '.join(self.vocab['high_level'].index2word(
                pred_high_idx.tolist()))
            label = ' '.join(self.vocab['high_level'].index2word(
                label_high_idx.tolist()))

            label_lower = label.lower()
            pred_lower = pred.lower()

            m['action_high_f1'].append(compute_f1(label_lower, pred_lower))
            m['action_high_em'].append(compute_exact(label_lower, pred_lower))

            m['action_high_gold_length'].append(len(label.split()))
            m['action_high_pred_length'].append(len(pred.split()))
            m['action_high_edit_distance'].append(
                compute_edit_distance(label_lower, pred_lower))

        return {k: sum(v) / len(v) for k, v in m.items()}
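
None of the examples show the metric helpers or the module-level imports they rely on (collections, numpy as np, tqdm). A minimal sketch of what compute_f1, compute_exact, and compute_edit_distance might look like, assuming SQuAD-style token-level F1/exact match and a token-level Levenshtein distance (the callers above already lowercase both strings, so no extra normalization is applied here):

    import collections

    def compute_exact(a_gold, a_pred):
        # 1 if the two action strings match exactly, else 0.
        return int(a_gold == a_pred)

    def compute_f1(a_gold, a_pred):
        # Token-level F1 between gold and predicted action sequences,
        # following the SQuAD evaluation script.
        gold_toks, pred_toks = a_gold.split(), a_pred.split()
        common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
        num_same = sum(common.values())
        if len(gold_toks) == 0 or len(pred_toks) == 0:
            # F1 is 1 when both sequences are empty, 0 when only one is.
            return int(gold_toks == pred_toks)
        if num_same == 0:
            return 0
        precision = num_same / len(pred_toks)
        recall = num_same / len(gold_toks)
        return 2 * precision * recall / (precision + recall)

    def compute_edit_distance(a_gold, a_pred):
        # Levenshtein distance over action tokens, single-row DP:
        # dp holds the previous row; prev tracks the diagonal cell.
        g, p = a_gold.split(), a_pred.split()
        dp = list(range(len(p) + 1))
        for i in range(1, len(g) + 1):
            prev, dp[0] = dp[0], i
            for j in range(1, len(p) + 1):
                cur = dp[j]
                if g[i - 1] == p[j - 1]:
                    dp[j] = prev
                else:
                    dp[j] = 1 + min(prev, dp[j], dp[j - 1])
                prev = cur
        return dp[-1]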
Example #2
    def compute_metric(self, preds, data):
        '''
        compute F1 and exact-match scores for the low-level action sequences
        '''
        m = collections.defaultdict(list)
        for task in data:
            ex = self.load_task_json(task)
            i = self.get_task_and_ann_id(ex)
            # Gold label: the discrete low-level actions joined into a string.
            label = ' '.join([
                a['discrete_action']['action']
                for a in ex['plan']['low_actions']
            ])
            m['action_low_f1'].append(
                compute_f1(label.lower(), preds[i]['action_low'].lower()))
            m['action_low_em'].append(
                compute_exact(label.lower(), preds[i]['action_low'].lower()))
        return {k: sum(v) / len(v) for k, v in m.items()}
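
In this variant, preds is presumably keyed by the annotation-id string returned by self.get_task_and_ann_id(ex), and 'action_low' holds a space-joined string of discrete action names rather than token ids. A hypothetical call (the key, action names, and the model/data variables are invented for illustration):

    # Hypothetical prediction dict; keys mirror self.get_task_and_ann_id(ex).
    preds = {
        'trial_T20190907_123456_000000_anno_0': {
            'action_low': 'LookDown_15 MoveAhead_25 PickupObject',
        },
    }
    metrics = model.compute_metric(preds, data)  # data: list of task descriptors
    print(metrics['action_low_f1'], metrics['action_low_em'])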
Example #3
    def compute_metric(self, preds, data):
        '''
        compute F1 and exact-match scores for the low-level action sequences
        '''
        m = collections.defaultdict(list)
        for ex in data:
            # if 'repeat_idx' in ex: ex = self.load_task_json(ex, None)[0]
            key = (ex['task_id'], ex['repeat_idx'])
            # Gold label: the discrete low-level actions joined into a string.
            label = ' '.join([
                a['discrete_action']['action']
                for a in ex['plan']['low_actions']
            ])
            m['action_low_f1'].append(
                compute_f1(label.lower(), preds[key]['action_low'].lower()))
            m['action_low_em'].append(
                compute_exact(label.lower(), preds[key]['action_low'].lower()))
        return {k: sum(v) / len(v) for k, v in m.items()}
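
Example #3 is otherwise identical to Example #2; the difference is the keying scheme. Here data already holds loaded examples, and preds is indexed by the (task_id, repeat_idx) tuple taken from each example rather than by a combined annotation-id string. The corresponding layout, again with invented ids:

    # Hypothetical preds keyed by (ex['task_id'], ex['repeat_idx']).
    preds = {
        ('trial_T20190907_123456', 0): {
            'action_low': 'LookDown_15 MoveAhead_25 PickupObject',
        },
    }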