def main():
    """Score aggregation-operation predictions: overall and per-class accuracy."""
    opts = Options()
    fill_from_args(opts)
    id2gt = dict()
    for line in jsonl_lines(opts.gt):
        jobj = json.loads(line)
        qid = jobj['id']
        id2gt[qid] = jobj['agg_index']
    sums = defaultdict(float)
    counts = defaultdict(float)
    for line in jsonl_lines(opts.input):
        jobj = json.loads(line)
        qid = jobj['id']
        gt = id2gt[qid]
        preds = np.array(jobj['predictions'], dtype=np.float32)
        correct = 1 if np.argmax(preds) == gt else 0
        counts[f'accuracy_{gt}'] += 1
        sums[f'accuracy_{gt}'] += correct
        counts['accuracy'] += 1
        sums['accuracy'] += correct
    for n in sorted(sums.keys()):
        print(f'{n} = {sums[n]/counts[n]} over {counts[n]}')
def qid2predictions(pred_file):
    """Map each question id to its predictions, each pair joined as a hyphenated string."""
    qid2preds = dict()
    for line in jsonl_lines(pred_file):
        jobj = json.loads(line)
        preds = [f'{p[0]}-{p[1]}' for p in jobj['predictions']]
        qid2preds[jobj['id']] = preds
    return qid2preds
def gather_predictions(input_file, *, softmax=False):
    """Group per-instance positive-class scores by question id.

    Instance ids have the form '<qid>:<index>'; returns qid -> list of
    (index, score) pairs.
    """
    predictions = defaultdict(list)
    for line in jsonl_lines(input_file):
        jobj = json.loads(line)
        if softmax:
            pred = log_softmax(np.array(jobj['predictions'], dtype=np.float32))[1]
        else:
            pred = jobj['predictions'][1]
        qid, ndx_str = jobj['id'].split(':')
        predictions[qid].append((int(ndx_str), pred))
    return predictions
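# `log_softmax` above is assumed to behave like scipy.special.log_softmax over
# a 1-d logits array; a minimal numerically stable sketch of that computation,
# for reference (the helper name is ours, not the repo's):
def _log_softmax_sketch(x: np.ndarray) -> np.ndarray:
    # shift by the max before exponentiating to avoid overflow
    shifted = x - np.max(x)
    return shifted - np.log(np.sum(np.exp(shifted)))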
def __init__(self, hypers, per_gpu_batch_size: int, tokenizer, data_dir, *,
             files_per_dataloader=1, checkpoint_info=None,
             is_separate=False, is_single=False,
             json_mapper=standard_json_mapper, teacher_labels=None):
    super().__init__(hypers, per_gpu_batch_size, data_dir,
                     checkpoint_info=checkpoint_info,
                     files_per_dataloader=files_per_dataloader)
    self.tokenizer = tokenizer
    # NOTE: maybe should use tokenizer.cls_token_id, tokenizer.sep_token_id
    self.cls_id, self.sep_id = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])
    self.is_separate = is_separate
    self.is_single = is_single
    self.json_mapper = json_mapper
    # just load the entire teacher predictions
    if teacher_labels:
        logger.info(f'loading teacher labels from {teacher_labels}')
        self.id2teacher_labels = dict()
        for line in jsonl_lines(teacher_labels):
            jobj = json.loads(line)
            inst_id = jobj['id']
            preds = jobj['predictions']
            self.id2teacher_labels[inst_id] = np.array(preds, dtype=np.float32)
    else:
        self.id2teacher_labels = None
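# The id2teacher_labels arrays loaded above are presumably consumed as soft
# targets. A minimal sketch of the usual temperature-scaled KL distillation
# loss in PyTorch -- an assumption about how such labels are typically used,
# not necessarily this repo's actual loss; `student_logits` and `temperature`
# are illustrative names:
import torch
import torch.nn.functional as F

def _distillation_loss_sketch(student_logits: torch.Tensor,
                              teacher_logits: torch.Tensor,
                              temperature: float = 1.0) -> torch.Tensor:
    # KL(teacher || student) on temperature-softened distributions,
    # rescaled by T^2 as in Hinton et al.'s distillation formulation
    student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
    teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
    return F.kl_div(student_log_probs, teacher_probs, reduction='batchmean') * temperature ** 2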
def get_dataloader(self):
    input_files, files_are_shared = self._get_input_files()
    if input_files is None:
        return None
    lines = jsonl_lines(input_files)
    # if input_files are supposed to be shared then get only the lines for our global_rank
    if files_are_shared:
        lines = itertools.islice(lines, self.hypers.global_rank, None, self.hypers.world_size)
    logger.warning(
        f'on {self.hypers.global_rank} rank, using files: {input_files}, shared: {files_are_shared}'
    )
    batches = self._one_load(lines)
    displayer = None
    if not self.first_batches_loaded:
        self.first_batches_loaded = True
        displayer = self.display_batch
    batches.post_init(batch_size=self.per_gpu_batch_size * self.hypers.n_gpu,
                      displayer=displayer,
                      uneven_batches=self.uneven_batches,
                      random=random.Random(123 * self.on_epoch))
    return batches
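# The islice call above deals lines round-robin across ranks: rank r reads
# lines r, r + world_size, r + 2 * world_size, ... A self-contained
# illustration with made-up values:
#
#   import itertools
#   lines = ['l0', 'l1', 'l2', 'l3', 'l4', 'l5', 'l6']
#   world_size = 3
#   for rank in range(world_size):
#       print(rank, list(itertools.islice(iter(lines), rank, None, world_size)))
#   # 0 ['l0', 'l3', 'l6']
#   # 1 ['l1', 'l4']
#   # 2 ['l2', 'l5']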
def write_agg_classify(data_dir, split, *, exclude_header=False, cell_sep_token='*'):
    """Write the {split}_agg data as aggregation-classification instances."""
    with write_open(os.path.join(data_dir, f'{split}_agg_classify.jsonl.gz')) as out:
        for line in jsonl_lines(os.path.join(data_dir, f'{split}_agg.jsonl.gz')):
            jobj = json.loads(line)
            if not exclude_header:
                agg_inst = {
                    'id': jobj['id'],
                    'text_a': jobj['question'],
                    'text_b': f' {cell_sep_token} '.join(jobj['header']),
                    'label': jobj['agg_index'],
                }
            else:
                agg_inst = {
                    'id': jobj['id'],
                    'text': jobj['question'],
                    'label': jobj['agg_index'],
                }
            out.write(json.dumps(agg_inst) + '\n')
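# For reference, a hypothetical input line and the classification instance it
# becomes (field values are made up for illustration; agg_index 3 is COUNT in
# the WikiSQL aggregation ordering):
#
#   input:  {"id": "q1", "question": "how many wins?",
#            "header": ["Year", "Wins"], "agg_index": 3, ...}
#   output: {"id": "q1", "text_a": "how many wins?",
#            "text_b": "Year * Wins", "label": 3}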
def main():
    opts = Options()
    fill_from_args(opts)
    id2qinfo = defaultdict(QInfo)
    for line in jsonl_lines(opts.gt):
        jobj = json.loads(line)
        id2qinfo[jobj['id']].fill_from_gt(jobj, blind_gt=opts.blind_gt)
    sums = defaultdict(float)
    counts = defaultdict(float)
    for line in jsonl_lines(opts.agg_preds):
        jobj = json.loads(line)
        qid = jobj['id']
        qinfo = id2qinfo[qid]
        preds = np.array(jobj['predictions'], dtype=np.float32)
        predicted = np.argmax(preds)
        gt = qinfo.gt_agg_index
        qinfo.agg_pred = predicted
        qinfo.agg_confs = preds
        correct = 1 if predicted == gt else 0
        counts[f'accuracy_{gt}'] += 1
        sums[f'accuracy_{gt}'] += correct
        counts['accuracy'] += 1
        sums['accuracy'] += correct
    if not opts.blind_gt:
        for n in sorted(sums.keys()):
            print(f'{n} = {sums[n]/counts[n]} over {counts[n]}')
    for line in jsonl_lines(opts.cell_preds):
        jobj = json.loads(line)
        qid = jobj['qid']
        cell_preds = np.array(jobj['cells'], dtype=np.float32)
        qinfo = id2qinfo[qid]
        qinfo.cell_confs = cell_preds
    if opts.lookup_preds:
        for line in jsonl_lines(opts.lookup_preds):
            jobj = json.loads(line)
            qid = jobj['qid']
            qinfo = id2qinfo[qid]
            if qinfo.compute_agg_pred() == 0:
                cell_preds = np.array(jobj['cells'], dtype=np.float32)
                qinfo.cell_confs = cell_preds
    err_analysis_count = 0  # make non-zero to show cases where no threshold is possible
    agg_ops = ['', 'MAX', 'MIN', 'COUNT', 'SUM', 'AVG']
    per_agg_thresholds = np.zeros(len(agg_ops), dtype=np.float32)
    if opts.use_threshold <= -1000:
        for qinfo in id2qinfo.values():
            qinfo.compute_threshold_range()
            if qinfo.threshold_range is None and qinfo.agg_pred != 0 and err_analysis_count > 0:
                err_analysis_count -= 1
                print(f'No threshold possible: {qinfo.question}\n'
                      f'agg {agg_ops[qinfo.gt_agg_index]} over {qinfo.col_gt},{qinfo.row_gt} '
                      f'yielding {qinfo.answers_gt}')
                print(f'Predicted agg {agg_ops[qinfo.agg_pred]} over {np.argmax(qinfo.cell_confs[0])} '
                      f'yielding {qinfo.agg_answers}')
                print([f'{h}:{qinfo.col_vals[hi] is not None}' for hi, h in enumerate(qinfo.header)])
                for ri, row in enumerate(qinfo.rows):
                    print([f'{cell}:{qinfo.cell_confs[ri, ci]}' for ci, cell in enumerate(row)])
        max_accuracy, best_threshold = find_best_threshold(id2qinfo.values())
        print(f'can get {max_accuracy} with threshold {best_threshold}')
        print(f'  {accuracy_at_threshold(id2qinfo.values(), best_threshold - 0.1)} with threshold {best_threshold - 0.1}')
        print(f'  {accuracy_at_threshold(id2qinfo.values(), best_threshold + 0.1)} with threshold {best_threshold + 0.1}')
        for ai in range(per_agg_thresholds.shape[0]):
            acc, bt = find_best_threshold(id2qinfo.values(), for_agg_index=ai)
            print(f'for {agg_ops[ai]} can get {acc} with threshold {bt}')
            per_agg_thresholds[ai] = bt
    else:
        best_threshold = opts.use_threshold
        per_agg_thresholds[:] = opts.use_threshold
    missed_lookup = 0
    lookup = 0
    non_lookup = 0
    lookup_by_agg = 0
    pred_out = write_open(opts.prediction_file) if opts.prediction_file else None
    for qinfo in id2qinfo.values():
        if qinfo.gt_agg_index == 0:
            lookup += 1
            if qinfo.agg_pred != 0 and qinfo.threshold_range is not None:
                # print(f'Aggregation gets right answer anyway? {qinfo.question}\n'
                #       f'agg {agg_ops[qinfo.gt_agg_index]} over {qinfo.col_gt},{qinfo.row_gt} '
                #       f'yielding {qinfo.answers_gt}')
                # print(f'Predicted agg {agg_ops[qinfo.agg_pred]} over {np.argmax(qinfo.cell_confs[0])} '
                #       f'yielding {qinfo.agg_answers}')
                if qinfo.threshold_range[0] <= best_threshold <= qinfo.threshold_range[1]:
                    lookup_by_agg += 1
        else:
            non_lookup += 1
        if qinfo.gt_agg_index == 0 and qinfo.agg_pred != 0:
            missed_lookup += 1
        if pred_out is not None:
            this_threshold = per_agg_thresholds[qinfo.agg_pred] if opts.threshold_per_agg else best_threshold
            pred_out.write(json.dumps({
                'id': qinfo.qid,
                'predictions': qinfo.answer_at_threshold(this_threshold),
            }) + '\n')
    if pred_out is not None:
        pred_out.close()
    if not opts.blind_gt:
        print(f'Lookup count = {lookup}, Non-lookup = {non_lookup}, '
              f'Lookup mispredicted as non-lookup = {missed_lookup}, but correct anyway = {lookup_by_agg}')
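# `find_best_threshold` and `accuracy_at_threshold` are defined elsewhere.
# Sketches of how they could work, assuming each QInfo exposes
# threshold_range = (lo, hi), the interval of thresholds at which its final
# answer is correct (None when no threshold recovers the answer). This is
# inferred from the usage above, not the repo's actual implementation:
def _accuracy_at_threshold_sketch(qinfos, threshold):
    qinfos = list(qinfos)
    if not qinfos:
        return 0.0
    correct = sum(1 for q in qinfos
                  if q.threshold_range is not None
                  and q.threshold_range[0] <= threshold <= q.threshold_range[1])
    return correct / len(qinfos)

def _find_best_threshold_sketch(qinfos, for_agg_index=None):
    qinfos = [q for q in qinfos
              if for_agg_index is None or q.agg_pred == for_agg_index]
    # accuracy only changes at the endpoints of the per-question ranges,
    # so those are the only candidate thresholds worth evaluating
    candidates = sorted({t for q in qinfos if q.threshold_range is not None
                         for t in q.threshold_range})
    if not candidates:
        return 0.0, 0.0
    return max((_accuracy_at_threshold_sketch(qinfos, t), t) for t in candidates)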
def read_rc_examples(input_file, tokenizer: BertTokenizer, first_answer_only=False, include_source_info=False):
    """Read an RC jsonl file into a list of RCExample."""
    # filter_pattern = re.compile("[\d{}]+$".format(re.escape(string.punctuation)))
    filter_pattern = re.compile("[{}]+$".format(re.escape(string.punctuation)))
    examples = []
    impossible_count = 0
    qid2answers = dict() if include_source_info else None
    answer_type_stats = np.zeros(len(AnswerType), dtype=np.int32)
    for line in jsonl_lines(input_file):
        jobj = json.loads(line)
        qid = jobj["qid"]
        passage_orig_text = jobj['passage']
        passage_toks, passage_tok_offsets, passage_text = tokenizer.tokenize_offsets(passage_orig_text)
        if len(passage_toks) == 0:
            logger.info(f'bad passage: {passage_orig_text}')
            continue
        passage_tok_offsets = np.array(passage_tok_offsets, dtype=np.int32)
        # TODO: we also need the passage normalized without filter_pattern,
        #  for use when the filter_pattern leaves the answer empty
        norm_passage, norm_to_orig = normalize(passage_toks, filter_pattern)
        question_toks = tokenizer.tokenize(jobj["question"])
        if qid2answers is not None:
            if qid in qid2answers and qid2answers[qid] != jobj['answers']:
                raise ValueError('answers not consistent!')
            qid2answers[qid] = jobj['answers']
        # answer_type (span, yes, no)
        if 'answer_type' in jobj:
            answer_type = AnswerType[jobj['answer_type']]
            answer_type_stats[answer_type.value] += 1
        else:
            answer_type = None
        # if the answer_type is anything other than span, the 'answers' should be empty
        if answer_type is None or answer_type == AnswerType.span:
            answers = jobj["answers"]
        else:
            answers = []
        ans_starts = []
        ans_ends = []
        for ans in answers:
            ans_toks = tokenizer.tokenize(ans)
            if len(ans_toks) == 0 or sum(len(tok) for tok in ans_toks) == 0:
                logger.info(f'bad answer for {qid}: "{ans}"')
                continue
            # TODO: we need to know if we applied the filter or not so we can
            #  decide to use norm_passage with or without the filter
            norm_ans, _ = normalize(ans_toks, filter_pattern)
            nstarts = find_answer_starts(norm_passage, norm_ans)
            starts = [norm_to_orig[s] for s in nstarts]
            ends = [norm_to_orig[s + len(norm_ans) - 1] for s in nstarts]
            ans_starts.extend(starts)
            ans_ends.extend(ends)
        if (answer_type is None or answer_type == AnswerType.span) and len(ans_starts) == 0:
            # sample the impossible ones
            # if impossible_count < 10:
            #     logger.info(f'Impossible:\n  Question "{jobj["question"]}"\n'
            #                 f'  Passage "{passage_text}"\n  Answers {str(answers)}\n'
            #                 f'  Passage Tokens {str(norm_passage)}\n'
            #                 f'  Answer Tokens {[normalize(tokenizer.tokenize(ans), filter_pattern)[0] for ans in answers]}')
            impossible_count += 1
        # discard source information for training data to save some memory
        if not include_source_info:
            qid = None
            passage_text = None
            passage_tok_offsets = None
        if first_answer_only:
            ans_starts = ans_starts[:1]
            ans_ends = ans_ends[:1]
        example = RCExample(qid=qid, question=question_toks, passage=passage_toks,
                            start_positions=ans_starts, end_positions=ans_ends,
                            answer_type=answer_type,
                            passage_text=passage_text,
                            passage_token_offsets=passage_tok_offsets)
        examples.append(example)
    logger.info(f'from {input_file} loaded {impossible_count} impossible, {len(examples)} total')
    if answer_type_stats.sum() > 0:
        logger.info('Answer type statistics:')
        for at in AnswerType:
            logger.info(f'  {at.name} = {answer_type_stats[at.value]}')
    return examples, qid2answers
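# `find_answer_starts` is defined elsewhere; a sketch of the natural
# implementation -- return every index at which the normalized answer token
# sequence occurs as a contiguous subsequence of the normalized passage:
def _find_answer_starts_sketch(passage_toks, answer_toks):
    if not answer_toks:
        return []
    return [i for i in range(len(passage_toks) - len(answer_toks) + 1)
            if passage_toks[i:i + len(answer_toks)] == answer_toks]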
ans_file = os.path.join(opts.data_dir, f"{split}_ans.jsonl.gz")
tbl_file = os.path.join(opts.data_dir, f"{split}.tables.jsonl")
engine = DBEngine(db_file)
exact_match = []
with open(orig) as fs, write_open(ans_file) as fo:
    grades = []
    for ls in tqdm(fs, total=count_lines(orig)):
        eg = json.loads(ls)
        sql = eg['sql']
        qg = Query.from_dict(sql, ordered=False)
        gold = engine.execute_query(eg['table_id'], qg, lower=True)
        assert isinstance(gold, list)
        # if len(gold) != 1:
        #     print(f'for {sql} : {gold}')
        eg['answer'] = gold
        eg['rowids'] = engine.execute_query_rowid(eg['table_id'], qg, lower=True)
        # CONSIDER: if it is not an agg query, somehow identify the particular cell
        fo.write(json.dumps(eg) + '\n')
convert(jsonl_lines(ans_file), jsonl_lines(tbl_file),
        os.path.join(opts.data_dir, f"{split}_agg.jsonl.gz"), skip_aggregation=False)
convert(jsonl_lines(ans_file), jsonl_lines(tbl_file),
        os.path.join(opts.data_dir, f"{split}_lookup.jsonl.gz"), skip_aggregation=True)
write_agg_classify(opts.data_dir, split)
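# For context, the WikiSQL `sql` field decoded by Query.from_dict above has
# the shape
#   {"sel": <selected column index>, "agg": <aggregation index>,
#    "conds": [[<column index>, <operator index>, <value>], ...]}
# where agg indexes into ('', 'MAX', 'MIN', 'COUNT', 'SUM', 'AVG'), so
# agg == 0 means a plain lookup query with no aggregation.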
def main():
    opts = Options()
    fill_from_args(opts)
    if opts.gt:
        id2gt = dict()
        lookup_subset = set()
        for line in jsonl_lines(opts.gt):
            jobj = json.loads(line)
            qid = jobj['id']
            tbl = jobj['rows']
            correct_cells = np.zeros((len(tbl), len(tbl[0])), dtype=bool)
            target_rows = jobj['target_rows'] if 'target_rows' in jobj else [jobj['target_row']]
            target_cols = jobj['target_columns'] if 'target_columns' in jobj else [jobj['target_column']]
            # TODO: also support getting correct cells from answers list
            for r in target_rows:
                for c in target_cols:
                    correct_cells[r, c] = True
            # if correct_cells.sum() == 0:
            #     print(f'No answer! {target_rows}, {target_cols}, {jobj["agg_index"]}')
            id2gt[qid] = correct_cells
            if 'agg_index' not in jobj or jobj['agg_index'] == 0:
                lookup_subset.add(qid)
    else:
        id2gt = None
        lookup_subset = None
    sums = defaultdict(float)
    counts = defaultdict(float)
    table_count = 0
    no_answer_count = 0
    col_predictions = gather_predictions(opts.col, softmax=opts.softmax)
    row_predictions = gather_predictions(opts.row, softmax=False)
    if opts.cell_prediction_output:
        cell_prediction_output = write_open(opts.cell_prediction_output)
    else:
        cell_prediction_output = None
    with write_open(opts.output) as out:
        for qid, col_preds in col_predictions.items():
            col_preds = to_ndarray(col_preds)
            row_preds = to_ndarray(row_predictions[qid])
            # cell score = row score + column score, broadcast to a rows x cols matrix
            cell_preds = row_preds.reshape((-1, 1)) + col_preds.reshape((1, -1))
            if id2gt is not None:
                correct_cells = id2gt[qid]
                if correct_cells.sum() > 0:
                    avg_p = average_precision_score(y_true=correct_cells.reshape(-1),
                                                    y_score=cell_preds.reshape(-1))
                    sums['auc'] += avg_p
                    counts['auc'] += 1
                    if qid in lookup_subset:
                        sums['auc (lookup)'] += avg_p
                        counts['auc (lookup)'] += 1
                    else:
                        sums['auc (aggregation)'] += avg_p
                        counts['auc (aggregation)'] += 1
                else:
                    no_answer_count += 1
            table_count += 1
            out.write(json.dumps({
                'qid': qid,
                'cells': cell_preds.tolist(),
                'rows': row_preds.tolist(),
                'cols': col_preds.tolist(),
            }) + '\n')
            if cell_prediction_output is not None:
                cell_prediction_output.write(json.dumps({
                    'id': qid,
                    'cell_predictions': to_cell_predictions(cell_preds, top_k=20),
                }) + '\n')
    if cell_prediction_output is not None:
        cell_prediction_output.close()
    for n, v in sums.items():
        print(f'{n} = {v/counts[n]}')
    print(f'Over {table_count} tables')
    if id2gt is not None and no_answer_count > 0:
        print(f'{no_answer_count} tables with no correct answer')
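# `to_ndarray` and `to_cell_predictions` are defined elsewhere. Sketches of
# the behavior implied by the usage above (assumptions, not the repo's code):
# to_ndarray orders the gathered (index, score) pairs by index and keeps just
# the scores; to_cell_predictions returns the top_k highest-scoring cells as
# (row, col, score) triples.
def _to_ndarray_sketch(index_score_pairs):
    return np.array([score for _, score in sorted(index_score_pairs)], dtype=np.float32)

def _to_cell_predictions_sketch(cell_preds, *, top_k=20):
    flat = np.argsort(cell_preds, axis=None)[::-1][:top_k]
    rows, cols = np.unravel_index(flat, cell_preds.shape)
    return [[int(r), int(c), float(cell_preds[r, c])] for r, c in zip(rows, cols)]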
        else:
            self.neg_count += 1
        if not is_pos:
            self.all_neg_count += 1
        return insts


class Options(Config):
    def __init__(self):
        super().__init__()
        self.input_dir = ''
        self.style = 'lookup'
        self.output_dir = ''


if __name__ == "__main__":
    opts = Options()
    fill_from_args(opts)
    for split in ['train', 'dev', 'test']:
        cols = ColumnConvert(opts)
        rows = RowConvert(opts)
        with write_open(os.path.join(opts.output_dir, split, 'row.jsonl.gz')) as rout, \
                write_open(os.path.join(opts.output_dir, split, 'col.jsonl.gz')) as cout:
            for line in jsonl_lines(os.path.join(opts.input_dir, f'{split}_{opts.style}.jsonl.gz')):
                for r in rows.convert(line):
                    rout.write(json.dumps(r.to_dict()) + '\n')
                for c in cols.convert(line):
                    cout.write(json.dumps(c.to_dict()) + '\n')