def test(config):
    """Join TPMS scores with the test labels and save ranking metrics.

    Reads pairwise scores from ``config.tpms_scores_file``, writes one JSON
    line per labelled pair whose ``source_id`` has scores to
    ``<config.test_dir>/test_scores.jsonl``, reloads that file with
    ``utils.load_labels`` and passes the result to ``eval_map`` and
    ``eval_hits_at_k`` (k = 1, 3, 5, 10). The metric rows are handed to
    ``config.test_save`` under the name ``test.scores.tsv``.

    Args:
        config: object providing ``test_dir``, ``setup_dir``,
            ``tpms_scores_file``, ``name`` and ``test_save``.
    """
    scored_pairs_path = os.path.join(config.test_dir, 'test_scores.jsonl')
    test_labels_path = os.path.join(config.setup_dir, 'test_labels.jsonl')

    # source_id -> {target_id: score}; the first score seen for a pair wins.
    score_table = {}
    for entry in utils.jsonl_reader(config.tpms_scores_file):
        source_id = entry['source_id']
        target_id = entry['target_id']
        pair_score = entry['score']
        score_table.setdefault(source_id, {}).setdefault(target_id, pair_score)

    lowest = -float('inf')
    with open(scored_pairs_path, 'w') as out:
        for entry in utils.jsonl_reader(test_labels_path):
            paper_id = entry['source_id']
            user_id = entry['target_id']
            label = entry['label']

            # Papers without any score are left out of the output entirely.
            if paper_id not in score_table:
                continue

            # A pair missing from a scored paper falls back to 0.0.
            value = float(score_table[paper_id].get(user_id, 0.0))

            # Skips -inf, and NaN as well since NaN fails every comparison.
            if not value > lowest:
                continue

            record = {
                'source_id': paper_id,
                'target_id': user_id,
                'score': value,
                'label': int(label),
            }
            out.write(json.dumps(record) + '\n')

    labels_per_query, scores_per_query = utils.load_labels(scored_pairs_path)

    metrics = [('MAP', float(eval_map(labels_per_query, scores_per_query)))]
    for metric_name, cutoff in (('Hits@1', 1), ('Hits@3', 3),
                                ('Hits@5', 5), ('Hits@10', 10)):
        hits = float(
            eval_hits_at_k(labels_per_query, scores_per_query, k=cutoff))
        metrics.append((metric_name, hits))

    # One row per metric: [model name, metric name, value].
    score_lines = [[config.name, metric_name, value]
                   for metric_name, value in metrics]
    config.test_save(score_lines, 'test.scores.tsv')
def load_jsonl(filename):
    """Group labels and scores from a JSONL file by ``source_id``.

    Every record read through ``utils.jsonl_reader`` must carry
    ``source_id``, ``target_id``, ``label`` and ``score``. If the same
    (source_id, target_id) pair appears more than once, the last record wins.

    Args:
        filename: path passed to ``utils.jsonl_reader``.

    Returns:
        A ``(labels, scores)`` tuple of parallel lists. Each has one inner
        list per ``source_id`` (in first-seen order), and each inner list is
        ordered by sorted ``target_id``, so ``labels[i][j]`` and
        ``scores[i][j]`` refer to the same pair.
    """
    forum_labels = defaultdict(dict)
    forum_scores = defaultdict(dict)

    for entry in utils.jsonl_reader(filename):
        forum_id = entry['source_id']
        reviewer_id = entry['target_id']
        label = entry['label']
        score = entry['score']
        forum_labels[forum_id][reviewer_id] = label
        forum_scores[forum_id][reviewer_id] = score

    all_labels = []
    all_scores = []
    for forum_id, label_of in forum_labels.items():
        score_of = forum_scores[forum_id]
        # Both mappings are always filled together, so they share the same
        # reviewer ids and one sorted pass aligns labels with scores.
        reviewers = sorted(label_of)
        all_labels.append([label_of[reviewer] for reviewer in reviewers])
        all_scores.append([score_of[reviewer] for reviewer in reviewers])

    return all_labels, all_scores
def test(config):
    """Label the inferred scores with bid data and save ranking metrics.

    Builds a label for every bid in ``Dataset(**config.dataset)`` (1 when
    the bid tag is in ``dataset.positive_bid_values``, otherwise 0), attaches
    those labels to the records of
    ``<config.infer_dir>/<config.name>-scores.jsonl`` and saves the labelled
    records as ``score_labels.jsonl``. That file is reloaded with
    ``utils.load_labels`` and passed to ``eval_map`` and ``eval_hits_at_k``
    (k = 1, 3, 5, 10); the metric rows are saved as ``test.scores.tsv``.

    Args:
        config: object providing ``dataset``, ``infer_dir``, ``name``,
            ``test_save`` and ``test_path``.
    """
    dataset = Dataset(**config.dataset)

    # forum -> {first signature of the bid: 0/1 label}
    bid_labels = defaultdict(dict)
    for bid in dataset.bids():
        is_positive = bid.tag in dataset.positive_bid_values
        bid_labels[bid.forum][bid.signatures[0]] = 1 if is_positive else 0

    scores_path = os.path.join(
        config.infer_dir, config.name + '-scores.jsonl')

    labelled_rows = []
    for entry in utils.jsonl_reader(scores_path):
        forum_id = entry['source_id']
        reviewer_id = entry['target_id']
        raw_score = float(entry['score'])
        # Anything that is not >= 0.0 (negative values and NaN) becomes 0.0.
        score = raw_score if raw_score >= 0.0 else 0.0

        labels_for_forum = bid_labels[forum_id]
        if reviewer_id not in labels_for_forum:
            # No bid for this pair, so it cannot be evaluated.
            continue

        # Copy so the record produced by the reader is not mutated.
        row = dict(entry)
        row['label'] = labels_for_forum[reviewer_id]
        row['score'] = score
        labelled_rows.append(row)

    config.test_save(labelled_rows, 'score_labels.jsonl')
    labels_file = config.test_path('score_labels.jsonl')

    labels_per_query, scores_per_query = utils.load_labels(labels_file)

    metrics = [('MAP', float(eval_map(labels_per_query, scores_per_query)))]
    for metric_name, cutoff in (('Hits@1', 1), ('Hits@3', 3),
                                ('Hits@5', 5), ('Hits@10', 10)):
        hits = float(
            eval_hits_at_k(labels_per_query, scores_per_query, k=cutoff))
        metrics.append((metric_name, hits))

    # One row per metric: [model name, metric name, value].
    score_lines = [[config.name, metric_name, value]
                   for metric_name, value in metrics]
    config.test_save(score_lines, 'test.scores.tsv')
def read_bid_records(data_dir, return_batches):
    """Iterate over the records of every file in ``data_dir``.

    Args:
        data_dir: directory whose entries are each read with
            ``utils.jsonl_reader``.
        return_batches: when truthy, yield one ``(file_id, records)`` pair
            per file, where ``records`` is the list of all records in that
            file; otherwise yield one ``(file_id, record)`` pair per record.

    Yields:
        Tuples whose first item is the file name with ``'.jsonl'`` removed.
    """
    for entry_name in os.listdir(data_dir):
        entry_path = os.path.join(data_dir, entry_name)
        # NOTE: str.replace strips every '.jsonl' occurrence in the name,
        # not only a trailing extension.
        file_id = entry_name.replace('.jsonl', '')
        records = utils.jsonl_reader(entry_path)

        if return_batches:
            # A file without records still yields an empty batch.
            yield file_id, list(records)
        else:
            for record in records:
                yield file_id, record
def _read_bids(self):
    """Yield a ``Tag`` for every JSON line of every file in ``self.bids_dir``.

    Each entry returned by ``os.listdir(self.bids_dir)`` is read with
    ``utils.jsonl_reader`` and every record is converted with
    ``Tag.from_json``.

    Yields:
        The object returned by ``Tag.from_json`` for each record.
    """
    for filename in os.listdir(self.bids_dir):
        filepath = os.path.join(self.bids_dir, filename)
        for json_line in utils.jsonl_reader(filepath):
            yield Tag.from_json(json_line)
def fold_reader(id):
    """Open the fold file ``<config.kp_setup_dir>/folds/<id>.jsonl``.

    ``config`` is not a parameter; it is resolved from the enclosing scope.

    Args:
        id: fold identifier, formatted into the file name. The name shadows
            the builtin ``id`` but is kept because it is part of the
            signature.

    Returns:
        Whatever ``utils.jsonl_reader`` returns for the fold path.
    """
    fold_name = f'{id}.jsonl'
    return utils.jsonl_reader(
        os.path.join(config.kp_setup_dir, 'folds', fold_name))
def fold_reader(id):
    """Open the fold file ``<config.bpr_samples>/<id>.jsonl``.

    ``config`` is not a parameter; it is resolved from the enclosing scope.

    Args:
        id: fold identifier, formatted into the file name. The name shadows
            the builtin ``id`` but is kept because it is part of the
            signature.

    Returns:
        Whatever ``utils.jsonl_reader`` returns for the fold path.
    """
    fold_name = f'{id}.jsonl'
    return utils.jsonl_reader(os.path.join(config.bpr_samples, fold_name))