Example #1
0
def get_papers(names):
    inp = [{"ids": [""], "names": [n]} for n in names]
    out = calc_reviewer_db_mapping(inp,
                                   db,
                                   author_col="name",
                                   author_field='authors')
    print(out.shape)
    data = {}
    for j, n in enumerate(names):
        # Indices of db papers written by reviewer n
        ind, = out[:, j].nonzero()
        # Top 25 most similar submissions, summing similarity over those papers
        _, papers = torch.topk(torch.tensor(mat[:, ind].sum(-1)), 25)
        # print("Author:", n)
        # for i in papers:
        #     print("\t", accepted_submissions[i].content["title"])
        #     print()
        #     print()
        data[n] = [abstract_keys[p] for p in papers.tolist()]
    return data
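
# A minimal usage sketch (hypothetical reviewer names; assumes the globals
# `db`, `mat`, and `abstract_keys` used above are already in scope):
#
#     top_papers = get_papers(["Jane Doe", "John Smith"])
#     for name, keys in top_papers.items():
#         print(name, "->", keys[:3])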
Example #2
0
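        # Keep only area chairs; reviewer_remapping maps each original index
        # to its new position, or -1 if the reviewer was dropped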
        for i, data in enumerate(reviewer_data_orig):
            if data['areaChair']:
                reviewer_remapping[i] = len(reviewer_data)
                reviewer_data.append(data)
            else:
                reviewer_remapping[i] = -1
        for data in reviewer_data:
            if 'name' in data:
                data['names'] = [data['name']]
                del data['name']
        reviewer_names = [x['names'][0] for x in reviewer_data]
        print(f'Have {len(reviewer_data)} reviewers', file=sys.stderr)
    with open(args.db_file, "r") as f:
        db = [json.loads(x) for x in f]  # for debug
        db_abs = [x['paperAbstract'] for x in db]
    rdb = calc_reviewer_db_mapping(reviewer_data, db, author_field='authors')

    # At least half of the papers above are not actually authored by
    # reviewers; as a quick hack, filter them out
    includes_reviewer = rdb.sum(axis=1)
    new_db = []
    for i, paper in enumerate(db):
        if includes_reviewer[i] >= 1:
            new_db.append(paper)
    db = new_db
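    # (equivalent one-liner: db = [p for p, hit in zip(db, includes_reviewer) if hit >= 1])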
    db_abs = [x['paperAbstract'] for x in db]
    rdb = calc_reviewer_db_mapping(reviewer_data, db, author_field='authors')

    # Calculate or load paper similarity matrix
    if args.paper_matrix and os.path.exists(args.paper_matrix):
        mat = np.load(args.paper_matrix)
        assert (mat.shape[0] == len(submission_abs)
                and mat.shape[1] == len(db_abs))
Example #3
0
    parser.add_argument(
        "--filter_field",
        type=str,
        default="name",
        help="Which field to filter on (name,id)")
    parser.add_argument(
        "--bid_file",
        type=str,
        required=True,
        help="A file containing a numpy array of bids "
        "(0 = COI, 1 = no, 2 = maybe, 3 = yes)")

    args = parser.parse_args()

    # Load the data
    with open(args.suggestion_file, "r") as f:
        submissions = [json.loads(x) for x in f]
    with open(args.reviewer_file, "r") as f:
        reviewer_data = [json.loads(x) for x in f]
        reviewer_names = [x['names'][0] for x in reviewer_data]
    bids = np.load(args.bid_file)
    mapping = suggest_utils.calc_reviewer_db_mapping(
        reviewer_data,
        submissions,
        author_col=args.filter_field,
        author_field='assignedReviewers')

    all_assignments = np.sum(mapping)

    # For each bid value, report the number of assigned reviewer-paper pairs
    # with that bid as a fraction of all assignments
    for bid in range(4):
        bid_count = np.sum(np.where((mapping == 1) & (bids == bid), 1, 0))
        print(f'Ratio of {bid}: {bid_count/all_assignments}')
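
    # Worked example with hypothetical data: if mapping = [[1, 0], [1, 1]]
    # and bids = [[3, 0], [1, 3]], then all_assignments is 3 and the ratio
    # of bid 3 is 2/3 -- two of the three assigned pairs carry a "yes" bid.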
Example #4
0
def main():

    # --------------------------------------------------------------------------
    # Part 1: Read in the arguments
    # --------------------------------------------------------------------------

    args = parse_args()

    # --------------------------------------------------------------------------
    # Part 2: Load the data and calculate similarity between submissions and
    # reviewers
    # --------------------------------------------------------------------------

    data_load_start_time = time.time()

    with open(args.submission_file, "r") as f:
        submissions = [json.loads(x) for x in f]
        submission_abs = [x['paperAbstract'] for x in submissions]
    with open(args.reviewer_file, "r") as f:
        reviewer_data = [json.loads(x) for x in f]
        for data in reviewer_data:
            if 'name' in data:
                data['names'] = [data['name']]
                del data['name']
        reviewer_names = [x['names'][0] for x in reviewer_data]
    with open(args.db_file, "r") as f:
        db = [json.loads(x) for x in f]  # for debug
        db_abs = [x['paperAbstract'] for x in db]
    rdb = calc_reviewer_db_mapping(reviewer_data, db, author_field='authors')

    # FIXME: about half of the papers above are spurious matches -- quick
    # hack to filter to those papers actually authored by reviewers
    includes_reviewer = rdb.sum(axis=1)
    new_db = []
    for i, paper in enumerate(db):
        if includes_reviewer[i] >= 1:
            new_db.append(paper)
    db = new_db
    db_abs = [x['paperAbstract'] for x in db]
    rdb = calc_reviewer_db_mapping(reviewer_data, db, author_field='authors')

    data_load_end_time = time.time()
    data_load_time = round((data_load_end_time - data_load_start_time) / 60, 2)
    print(f"Time loading and preprocessing data: {data_load_time} minutes",
          file=sys.stderr)
    similarity_matrix_start_time = time.time()

    # Calculate or load paper similarity matrix
    if args.load_paper_matrix:
        mat = np.load(args.load_paper_matrix)
        assert (mat.shape[0] == len(submission_abs)
                and mat.shape[1] == len(db_abs))
    else:
        print('Loading model', file=sys.stderr)
        model, epoch = load_model(None, args.model_file, force_cpu=True)
        model.eval()
        assert not model.training
        mat = calc_similarity_matrix(model, db_abs, submission_abs)
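        # Per the assert in the load branch above, mat[i][j] is the
        # similarity between submission i and db paper j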
        if args.save_paper_matrix:
            np.save(args.save_paper_matrix, mat)

    similarity_matrix_end_time = time.time()
    similarity_matrix_time = round(
        (similarity_matrix_end_time - similarity_matrix_start_time) / 60, 2)
    print(
        "Time calculating paper similarity matrix:"
        f" {similarity_matrix_time} minutes",
        file=sys.stderr)
    aggregation_start_time = time.time()

    # Calculate reviewer scores based on paper similarity scores
    if args.load_aggregate_matrix:
        reviewer_scores = np.load(args.load_aggregate_matrix)
        assert (reviewer_scores.shape[0] == len(submission_abs)
                and reviewer_scores.shape[1] == len(reviewer_names))
    else:
        print('Calculating aggregate reviewer scores', file=sys.stderr)
        reviewer_scores = calc_aggregate_reviewer_score(
            rdb, mat, args.aggregator)
        if args.save_aggregate_matrix:
            np.save(args.save_aggregate_matrix, reviewer_scores)

    aggregation_end_time = time.time()
    aggregation_time = round(
        (aggregation_end_time - aggregation_start_time) / 60, 2)
    print(
        "Time calculating aggregated similarity matrix:"
        f" {aggregation_time} minutes",
        file=sys.stderr)
    formulization_start_time = time.time()

    # --------------------------------------------------------------------------
    # Part 3: Adjust reviewer_scores based on COI, AC role; add quota
    # constraints; optionally split into subproblems by track
    # --------------------------------------------------------------------------

    # --------------------------------------------------------------------------
    # Part 3(a): Adjust reviewer_scores based on COIs
    # --------------------------------------------------------------------------

    cois = (np.where(np.load(args.bid_file) == 0, 1, 0)
            if args.bid_file else None)
    if cois is not None:
        num_cois = np.sum(cois)
        print(f"Applying {num_cois} COIs", file=sys.stderr)
        reviewer_scores = np.where(cois == 0, reviewer_scores,
                                   reviewer_scores - 110)
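
    # Hedged note: the fixed -110 offset presumably pushes any COI pair far
    # below every attainable similarity score (e.g. 0.9 becomes -109.1), so
    # the optimizer can never select it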

    # --------------------------------------------------------------------------
    # Part 3(b): Load reviewer specific quotas
    # --------------------------------------------------------------------------

    quotas = {}
    if args.quota_file:
        quotas = assign_quotas(reviewer_data,
                               args.quota_file,
                               args.max_papers_per_reviewer,
                               area_chairs=args.area_chairs)
    print(f"Set {len(quotas)} reviewer quotas", file=sys.stderr)

    # --------------------------------------------------------------------------
    # Part 3(c): Adjust reviewer_scores based on ACs
    # If --area_chairs is specified, only ACs get papers. If it is not
    # specified, SACs and ACs should not get papers or be shown as similar
    # reviewers (i.e., the reviewer_score is set to -150 for those positions)
    # --------------------------------------------------------------------------

    reviewer_scores, num_included, num_excluded = exclude_positions(
        reviewer_data, reviewer_scores, area_chairs=args.area_chairs)
    print(f"Excluded {num_excluded} reviewers/chairs, leaving {num_included}",
          file=sys.stderr)

    # --------------------------------------------------------------------------
    # Part 4: Break the optimization into subproblems
    # If --track is not specified, there will be a single subproblem called
    # ``all_tracks``, although ACs or reviewers will be excluded as necessary.
    # If --track is specified, the matrix will be broken into one optimization
    # subproblem per track
    # --------------------------------------------------------------------------

    optimization_problems, problem_papers, problem_reviewers, problem_quotas = (
        split_by_subproblem(reviewer_data,
                            submissions,
                            reviewer_scores,
                            quotas,
                            by_track=args.track,
                            area_chairs=args.area_chairs))
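
    # Each returned mapping is keyed by subproblem name: a track name when
    # --track is given, otherwise the single key ``all_tracks``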

    formulization_end_time = time.time()
    formulization_time = round(
        (formulization_end_time - formulization_start_time) / 60, 2)
    print(
        f"Time formulating optimization problem: {formulization_time} minutes",
        file=sys.stderr)
    optimization_start_time = time.time()

    # --------------------------------------------------------------------------
    # Part 5: Calculate a reviewer assignment based on the constraints
    # --------------------------------------------------------------------------

    problem_assignments = {}
    problem_scores = {}
    for problem in optimization_problems.keys():
        final_scores = optimization_problems[problem]

        if args.anonymity_multiplier != 1.0:
            print(
                "Calculating initial assignment of reviewers for category"
                f" {problem}",
                file=sys.stderr)
            final_scores, assignment_score = create_suggested_assignment(
                final_scores,
                min_papers_per_reviewer=args.min_papers_per_reviewer,
                max_papers_per_reviewer=args.max_papers_per_reviewer,
                reviews_per_paper=args.reviews_per_paper,
                quotas=problem_quotas[problem],
                anonymity_multiplier=args.anonymity_multiplier)
            print(
                "Done calculating initial assignment,"
                f" total score: {assignment_score}",
                file=sys.stderr)
            final_scores += np.random.random(final_scores.shape) * 1e-4
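            # (presumably the tiny uniform noise just breaks ties between
            # otherwise identical scores before the final assignment below)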

        print(f"Calculating assignment of reviewers for category {problem}",
              file=sys.stderr)

        # final_scores includes the penalties for COI. The constraints for CP
        # itself are only the quota constraints (max/min # of papers a reviewer
        # wants to review)
        assignment, assignment_score = create_suggested_assignment(
            final_scores,
            min_papers_per_reviewer=args.min_papers_per_reviewer,
            max_papers_per_reviewer=args.max_papers_per_reviewer,
            reviews_per_paper=args.reviews_per_paper,
            quotas=problem_quotas[problem])

        problem_assignments[problem] = assignment
        problem_scores[problem] = assignment_score

        print(f"Done calculating assignment. Total score: {assignment_score}",
              file=sys.stderr)
        if assignment is None:
            warnings.warn(f"No solution found for category {problem}",
                          RuntimeWarning)

    optimization_end_time = time.time()
    optimization_time = round(
        (optimization_end_time - optimization_start_time) / 60, 2)
    print(
        "Time calculating optimal assignment of papers:"
        f" {optimization_time} minutes",
        file=sys.stderr)

    # --------------------------------------------------------------------------
    # Part 6: Parse the assignments into a dictionary of reviewer IDs and other
    # info for each submission
    # --------------------------------------------------------------------------

    global_assignments = parse_assignments(
        submissions=submissions,
        paper_similarity_matrix=mat,
        optimization_matrices=optimization_problems,
        problem_papers=problem_papers,
        problem_reviewers=problem_reviewers,
        problem_assignments=problem_assignments,
        by_track=args.track,
        num_assigned=args.reviews_per_paper,
        num_similar=args.num_similar_to_list)

    # --------------------------------------------------------------------------
    # Part 7: Print out the results in jsonl format
    # --------------------------------------------------------------------------

    jsonl_data = get_jsonl_rows(assignments=global_assignments,
                                submissions=submissions,
                                reviewers=reviewer_data,
                                db_papers=db)

    with open(args.suggestion_file, 'w') as outf:
        for entry in jsonl_data:
            if args.output_type == 'json':
                print(json.dumps(entry), file=outf)
            elif args.output_type == 'text':
                print_text_report(entry, file=outf)
            else:
                raise ValueError(f'Illegal output_type {args.output_type}')

    print(f"Done creating suggestions, written to {args.suggestion_file}\n",
          file=sys.stderr)

    # --------------------------------------------------------------------------
    # Part 8 (optional): Print out the results in more human-readable
    # spreadsheets
    # --------------------------------------------------------------------------

    # ACL-2021: We are outputting an alternative data file to easily create
    # a per-track spreadsheet of assigned reviewers, as well as a global file
    # with the minimum assignment information
    if args.assignment_spreadsheet:

        global_header_info = get_csv_header(
            reviews_per_paper=args.reviews_per_paper,
            num_similar=args.num_similar_to_list,
            area_chairs=args.area_chairs,
            is_global=True)
        track_header_info = get_csv_header(
            reviews_per_paper=args.reviews_per_paper,
            num_similar=args.num_similar_to_list,
            area_chairs=args.area_chairs,
            is_global=False)
        coi_header_info = track_header_info + ['Original track']

        global_data, track_data = get_csv_rows(
            assignments=global_assignments,
            reviewers=reviewer_data,
            cois=cois,
            reviews_per_paper=args.reviews_per_paper,
            area_chairs=args.area_chairs)

        global_rows, global_softconf_uploadable = global_data
        track_rows, track_softconf_uploadables = track_data

        # Separate the input file base from its extension so we can print
        # multiple files with the same general schema
        file_base, file_extension = os.path.splitext(
            args.assignment_spreadsheet)
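        # e.g. "assignments.csv" -> ("assignments", ".csv")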

        # Open the file path given in the arguments as the global assignment
        # spreadsheet, writing each row
        with open(args.assignment_spreadsheet, 'w+') as f:
            writer = csv.writer(f,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow(global_header_info)
            for entry in global_rows:
                writer.writerow(entry)
        with open(file_base + '.txt', 'w+') as f:
            for line in global_softconf_uploadable:
                print(line, file=f)

        # For each track, create a file as a csv spreadsheet for all the track
        # submissions and their reviewer assignments
        for track in track_rows.keys():
            alphanum_track = '-'.join(re.split(r'[\W,:]+', track))
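            # e.g. "Information Extraction:NLP Applications" becomes
            # "Information-Extraction-NLP-Applications"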
            filename = f'{file_base}_{alphanum_track}{file_extension}'
            with open(filename, 'w+') as f:
                writer = csv.writer(f,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                if track == 'COI':
                    writer.writerow(coi_header_info)
                else:
                    writer.writerow(track_header_info)
                for entry in track_rows[track]:
                    writer.writerow(entry)
            filename = f'{file_base}_{alphanum_track}.txt'
            with open(filename, 'w+') as f:
                for line in track_softconf_uploadables[track]:
                    print(line, file=f)
Example #5
0
            reviewers_by_track[data['track']].extend(data['names'])
        reviewer_names = [x['names'][0] for x in reviewer_data]

        num_tracks = len(acs_by_track)
        assert set(sacs_by_track.keys()) == set(acs_by_track.keys())
        #assert set(sacs_by_track.keys()) == sub_tracks # there's a COI track, with no papers (yet)

        # FIXME: someone has AC roles 'Information Extraction:NLP Applications'

    with open(args.db_file, "r") as f:
        db = [json.loads(x) for x in f]  # for debug
        db_abs = [x['paperAbstract'] for x in db]

    # create a binary paper x reviewer matrix marking authorship
    rdb = calc_reviewer_db_mapping(reviewer_data,
                                   db,
                                   author_col=args.filter_field,
                                   author_field='authors')

    # Calculate or load paper similarity matrix
    if args.load_paper_matrix:
        mat = np.load(args.load_paper_matrix)
        assert (mat.shape[0] == len(submission_abs)
                and mat.shape[1] == len(db_abs))
    else:
        print('Loading model', file=sys.stderr)
        model, epoch = load_model(None, args.model_file, force_cpu=True)
        model.eval()
        assert not model.training
        mat = calc_similarity_matrix(model, db_abs, submission_abs)
        if args.save_paper_matrix:
            np.save(args.save_paper_matrix, mat)