Example #1
def run_textrank(config):
    '''
    Extract TextRank keyphrases for every submission and archive in the
    dataset, build the keyphrase vocabularies, and pickle the results.
    '''

    # The keyphrase directory lives one level above the experiment directory.
    experiment_path = os.path.dirname(config.experiment_dir)

    kps_dir = os.path.join(experiment_path, 'keyphrases')
    if not os.path.isdir(kps_dir):
        os.makedirs(kps_dir)
    config.update(kp_setup_dir=kps_dir)

    print('starting setup')
    dataset = Dataset(directory=config.dataset['directory'])
    textrank_vocab = Vocab() # vocab used for textrank-based keyphrases
    full_vocab = Vocab() # vocab used on the full text

    print('keyphrase extraction')
    textrank_kps_by_id = {}
    full_kps_by_id = {}

    all_archives = itertools.chain(
        dataset.submissions(return_batches=True),
        dataset.archives(return_batches=True))

    for archive_id, content_list in tqdm(
            all_archives, total=dataset.total_archive_count + dataset.submission_count):

        scored_kps = []
        full_kps = []
        for content in content_list:
            text = utils.content_to_text(content)
            top_tokens, full_tokens = keyphrases(text, include_scores=True, include_tokenlist=True)
            scored_kps.extend(top_tokens)
            full_kps.append(full_tokens)
        # Rank all scored keyphrases by descending TextRank score.
        sorted_kps = [kp for kp, _ in sorted(scored_kps, key=lambda x: x[1], reverse=True)]

        # Keep the top max_num_keyphrases unique keyphrases.
        top_kps = []
        for kp in sorted_kps:
            if kp not in top_kps:
                top_kps.append(kp)
            if len(top_kps) >= config.max_num_keyphrases:
                break

        textrank_vocab.load_items(top_kps)
        full_vocab.load_items([kp for archive in full_kps for kp in archive])
        assert archive_id not in textrank_kps_by_id
        textrank_kps_by_id[archive_id] = top_kps
        full_kps_by_id[archive_id] = full_kps

    utils.dump_pkl(os.path.join(config.kp_setup_dir, 'textrank_kps_by_id.pkl'), textrank_kps_by_id)
    utils.dump_pkl(os.path.join(config.kp_setup_dir, 'full_kps_by_id.pkl'), full_kps_by_id)
    utils.dump_pkl(os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl'), textrank_vocab)
    utils.dump_pkl(os.path.join(config.kp_setup_dir, 'full_vocab.pkl'), full_vocab)

    return config
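The keyphrases helper called above is not shown in these examples. A minimal sketch of the TextRank idea it presumably implements, assuming a sliding-window co-occurrence graph scored with networkx's PageRank (the whitespace tokenization and window parameter are illustrative simplifications, not the project's actual extractor):

import networkx as nx

def keyphrases(text, include_scores=False, include_tokenlist=False, window=2):
    # Naive tokenization; a real extractor would filter by part of speech.
    tokens = [t.lower() for t in text.split() if t.isalpha()]

    # Build an undirected co-occurrence graph over a sliding window.
    graph = nx.Graph()
    for i, token in enumerate(tokens):
        for neighbor in tokens[i + 1:i + window + 1]:
            if neighbor != token:
                graph.add_edge(token, neighbor)

    # PageRank over the co-occurrence graph is the core of TextRank.
    ranks = nx.pagerank(graph)
    scored = sorted(ranks.items(), key=lambda kv: kv[1], reverse=True)

    if include_scores and include_tokenlist:
        return scored, tokens
    if include_scores:
        return scored
    return [kp for kp, _ in scored]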
Example #2
def test(config):

    dataset = Dataset(**config.dataset)

    labels_by_reviewer_by_forum = defaultdict(dict)
    for bid in dataset.bids():
        label = 1 if bid.tag in dataset.positive_bid_values else 0
        labels_by_reviewer_by_forum[bid.forum][bid.signatures[0]] = label

    inferred_scores_path = os.path.join(config.infer_dir,
                                        config.name + '-scores.jsonl')

    labeled_data_list = []
    for data in utils.jsonl_reader(inferred_scores_path):
        forum = data['source_id']
        reviewer = data['target_id']
        score = float(data['score'])
        # Clamp negative (or NaN) scores to zero.
        if not score >= 0.0:
            score = 0.0

        if reviewer in labels_by_reviewer_by_forum[forum]:
            label = labels_by_reviewer_by_forum[forum][reviewer]

            labeled_data = dict(data)
            labeled_data.update({'label': label, 'score': score})
            labeled_data_list.append(labeled_data)

    config.test_save(labeled_data_list, 'score_labels.jsonl')

    labels_file = config.test_path('score_labels.jsonl')

    list_of_list_of_labels, list_of_list_of_scores = utils.load_labels(
        labels_file)

    map_score = float(eval_map(list_of_list_of_labels, list_of_list_of_scores))
    hits_at_1 = float(
        eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=1))
    hits_at_3 = float(
        eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=3))
    hits_at_5 = float(
        eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=5))
    hits_at_10 = float(
        eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=10))

    score_lines = [
        [config.name, metric, value] for metric, value in [
            ('MAP', map_score),
            ('Hits@1', hits_at_1),
            ('Hits@3', hits_at_3),
            ('Hits@5', hits_at_5),
            ('Hits@10', hits_at_10)
        ]
    ]
    config.test_save(score_lines, 'test.scores.tsv')
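The eval_map and eval_hits_at_k metrics are imported elsewhere; reference sketches of one common definition of each, assuming every inner list holds the labels and scores for a single forum (the project's exact normalization may differ):

import numpy as np

def eval_map(list_of_list_of_labels, list_of_list_of_scores):
    # Mean average precision: precision at each relevant rank, averaged
    # per forum, then averaged across forums.
    average_precisions = []
    for labels, scores in zip(list_of_list_of_labels, list_of_list_of_scores):
        order = np.argsort(scores)[::-1]
        ranked = np.asarray(labels)[order]
        hits, precisions = 0, []
        for rank, label in enumerate(ranked, start=1):
            if label:
                hits += 1
                precisions.append(hits / rank)
        average_precisions.append(np.mean(precisions) if precisions else 0.0)
    return float(np.mean(average_precisions))

def eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=1):
    # Fraction of the achievable relevant items found in the top k,
    # averaged over forums that have at least one relevant item.
    per_forum = []
    for labels, scores in zip(list_of_list_of_labels, list_of_list_of_scores):
        order = np.argsort(scores)[::-1][:k]
        top_k = np.asarray(labels)[order]
        achievable = min(int(np.sum(labels)), k)
        if achievable > 0:
            per_forum.append(float(np.sum(top_k)) / achievable)
    return float(np.mean(per_forum)) if per_forum else 0.0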
Example #3
def infer(config):
    experiment_dir = Path(config['experiment_dir']).resolve()

    model = utils.load_pkl(config['tfidf_model'])

    dataset = Dataset(**config['dataset'])

    paperids = list(model.bow_archives_by_paperid.keys())
    paperidx_by_id = {paperid: index for index, paperid in enumerate(paperids)}

    score_file_path = experiment_dir.joinpath(config['name'] + '-scores.csv')

    bids_by_forum = expertise.utils.get_bids_by_forum(dataset)
    submission_ids = list(dataset.submission_ids)
    reviewer_ids = list(dataset.reviewer_ids)
    # samples = expertise.utils.format_bid_labels(submission_ids, bids_by_forum)

    scores = {}
    max_score = 0.0
    for paperid, userid in itertools.product(submission_ids, reviewer_ids):
        # Each reviewer's archive is scored once against the whole index.

        if userid not in scores:
            # bow_archive is a list of BOWs.
            if userid in model.bow_archives_by_userid and len(
                    model.bow_archives_by_userid[userid]) > 0:
                bow_archive = model.bow_archives_by_userid[userid]
            else:
                bow_archive = [[]]

            best_scores = np.amax(model.index[bow_archive], axis=0)
            scores[userid] = best_scores

            user_max_score = max(best_scores)
            if user_max_score > max_score:
                max_score = user_max_score

    print('max score', max_score)

    with open(score_file_path, 'w') as w:
        for userid, user_scores in scores.items():
            for paperidx, paper_score in enumerate(user_scores):
                paperid = paperids[paperidx]
                # Normalize to [0, 1]; guard against an all-zero index.
                score = paper_score / max_score if max_score > 0 else 0.0
                w.write('{0},{1},{2:.3f}\n'.format(paperid, userid, score))

    return config
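The structure of model is opaque in infer(); a hedged sketch of how a TF-IDF index with the same index[bow_archive] behavior could be built with gensim (build_tfidf_index and its argument name are assumptions, not the project's actual model class):

from gensim import corpora, models, similarities

def build_tfidf_index(token_archives_by_paperid):
    # One token list per paper.
    docs = list(token_archives_by_paperid.values())
    dictionary = corpora.Dictionary(docs)
    bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
    tfidf = models.TfidfModel(bow_corpus)
    # index[list_of_bows] returns one similarity row per query BOW;
    # np.amax(..., axis=0) in infer() keeps the best row per paper.
    index = similarities.MatrixSimilarity(tfidf[bow_corpus],
                                          num_features=len(dictionary))
    return dictionary, tfidf, index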
Example #4
def setup(config):

    print('starting setup')
    dataset = Dataset(**config.dataset)
    bids_by_forum = utils.get_bids_by_forum(dataset)
    vocab = utils.load_pkl(
        os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl'))

    (train_set_ids, dev_set_ids,
     test_set_ids) = utils.split_ids(list(dataset.submission_ids),
                                     seed=config.random_seed)

    def fold_reader(fold_id):
        fold_file = f'{fold_id}.jsonl'
        fold_path = os.path.join(config.kp_setup_dir, 'folds', fold_file)
        return utils.jsonl_reader(fold_path)

    train_folds = [fold_reader(i) for i in train_set_ids]
    dev_folds = [fold_reader(i) for i in dev_set_ids]
    test_folds = [fold_reader(i) for i in test_set_ids]

    train_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                     for data in itertools.chain(*train_folds))

    train_samples_path = os.path.join(config.setup_dir, 'train_samples.jsonl')

    utils.dump_jsonl(train_samples_path, train_samples)

    dev_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                   for data in itertools.chain(*dev_folds))

    dev_samples_path = os.path.join(config.setup_dir, 'dev_samples.jsonl')

    utils.dump_jsonl(dev_samples_path, dev_samples)

    test_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                    for data in itertools.chain(*test_folds))

    test_samples_path = os.path.join(config.setup_dir, 'test_samples.jsonl')

    utils.dump_jsonl(test_samples_path, test_samples)

    # features_dir = './scibert_features/akbc19/setup/archives-features/'
    features_dir = config.bert_features_dir
    archive_features_dir = os.path.join(features_dir, 'archives-features')
    submission_features_dir = os.path.join(features_dir,
                                           'submissions-features')
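utils.split_ids is shared by all of the setup() variants here; one plausible implementation is a seeded, deterministic three-way split (the 80/10/10 ratios are assumptions):

import random

def split_ids(ids, seed=None, ratios=(0.8, 0.1, 0.1)):
    # Sort before shuffling so the split depends only on the seed,
    # not on the incoming iteration order.
    shuffled = sorted(ids)
    random.Random(seed).shuffle(shuffled)
    train_end = int(len(shuffled) * ratios[0])
    dev_end = train_end + int(len(shuffled) * ratios[1])
    return shuffled[:train_end], shuffled[train_end:dev_end], shuffled[dev_end:]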
Example #5
def setup(config):
    print('starting setup')
    setup_dir = os.path.join(config.experiment_dir, 'setup')
    if not os.path.exists(setup_dir):
        os.mkdir(setup_dir)

    dataset = Dataset(**config.dataset)
    vocab = utils.load_pkl(
        os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl'))

    (train_set_ids, dev_set_ids,
     test_set_ids) = utils.split_ids(list(dataset.submission_ids),
                                     seed=config.random_seed)

    def fold_reader(fold_id):
        fold_file = f'{fold_id}.jsonl'
        fold_path = os.path.join(config.bpr_samples, fold_file)
        return utils.jsonl_reader(fold_path)

    train_folds = [fold_reader(i) for i in train_set_ids]
    dev_folds = [fold_reader(i) for i in dev_set_ids]
    test_folds = [fold_reader(i) for i in test_set_ids]

    train_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                     for data in itertools.chain(*train_folds))

    train_samples_path = os.path.join(setup_dir, 'train_samples.jsonl')

    utils.dump_jsonl(train_samples_path, train_samples)

    dev_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                   for data in itertools.chain(*dev_folds))

    dev_samples_path = os.path.join(setup_dir, 'dev_samples.jsonl')

    utils.dump_jsonl(dev_samples_path, dev_samples)

    test_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                    for data in itertools.chain(*test_folds))

    test_samples_path = os.path.join(setup_dir, 'test_samples.jsonl')

    utils.dump_jsonl(test_samples_path, test_samples)

    return config
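Note that the *_samples passed to utils.dump_jsonl above are generator expressions, so nothing is materialized in memory; a minimal sketch of a dump_jsonl that supports this, assuming one JSON object per line:

import json

def dump_jsonl(path, records):
    # Consumes any iterable lazily, one record per line.
    with open(path, 'w') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')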
Example #6
def setup(config):
    assert os.path.exists(
        config.tpms_scores_file
    ), 'This model requires a pre-computed tpms score file.'

    dataset = Dataset(**config.dataset)
    experiment_dir = os.path.abspath(config.experiment_dir)

    setup_dir = os.path.join(experiment_dir, 'setup')
    if not os.path.exists(setup_dir):
        os.mkdir(setup_dir)
    # Register the directory so config.setup_dir below resolves.
    config.update(setup_dir=setup_dir)

    (train_set_ids, dev_set_ids,
     test_set_ids) = utils.split_ids(list(dataset.submission_ids),
                                     seed=config.random_seed)

    bids_by_forum = utils.get_bids_by_forum(dataset)

    test_labels = utils.format_bid_labels(test_set_ids, bids_by_forum)

    utils.dump_jsonl(os.path.join(config.setup_dir, 'test_labels.jsonl'),
                     test_labels)

    return config
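utils.format_bid_labels is not shown; a plausible sketch, inferred from the source_id/target_id/label keys consumed in Example #7 and the positive_bid_values check in Example #2 (the positive_values default here is a placeholder):

def format_bid_labels(forum_ids, bids_by_forum, positive_values=('High',)):
    # Emit one labeled (submission, reviewer) pair per bid.
    for forum in forum_ids:
        for bid in bids_by_forum.get(forum, []):
            yield {
                'source_id': forum,
                'target_id': bid.signatures[0],
                'label': 1 if bid.tag in positive_values else 0,
            }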
Example #7
def test(config):

    dataset = Dataset(**config.dataset)

    model = expertise.utils.load_pkl(os.path.join(config.train_dir, 'model.pkl'))

    paperidx_by_id = {
        paperid: index
        for index, paperid
        in enumerate(model.bow_archives_by_paperid.keys())
    }

    test_dir = os.path.join(config.experiment_dir, 'test')
    if not os.path.isdir(test_dir):
        os.mkdir(test_dir)

    config.update(test_dir=test_dir)

    score_file_path = os.path.join(config.test_dir, 'test_scores.jsonl')
    labels_file_path = os.path.join(config.setup_dir, 'test_labels.jsonl')

    scores = {}

    with open(score_file_path, 'w') as w:
        for data in expertise.utils.jsonl_reader(labels_file_path):
            paperid = data['source_id']
            userid = data['target_id']
            label = data['label']

            if userid not in scores:
                # bow_archive is a list of BOWs.
                if userid in model.bow_archives_by_userid and len(model.bow_archives_by_userid[userid]) > 0:
                    bow_archive = model.bow_archives_by_userid[userid]
                else:
                    bow_archive = [[]]

                best_scores = np.amax(model.index[bow_archive], axis=0)
                scores[userid] = best_scores

            if paperid in paperidx_by_id:
                paper_index = paperidx_by_id[paperid]
                score = scores[userid][paper_index]

                result = {
                    'source_id': paperid,
                    'target_id': userid,
                    'score': float(score),
                    'label': int(label)
                }

                w.write(json.dumps(result) + '\n')

    (list_of_list_of_labels,
     list_of_list_of_scores) = expertise.utils.load_labels(score_file_path)

    map_score = float(eval_map(list_of_list_of_labels, list_of_list_of_scores))
    hits_at_1 = float(eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=1))
    hits_at_3 = float(eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=3))
    hits_at_5 = float(eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=5))
    hits_at_10 = float(eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=10))

    score_lines = [
        [config.name, text, data] for text, data in [
            ('MAP', map_score),
            ('Hits@1', hits_at_1),
            ('Hits@3', hits_at_3),
            ('Hits@5', hits_at_5),
            ('Hits@10', hits_at_10)
        ]
    ]
    expertise.utils.dump_csv(
        os.path.join(config.test_dir, 'test.scores.tsv'), score_lines)
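expertise.utils.load_labels is the glue between the jsonl scores and the ranking metrics; a plausible sketch, assuming it groups records by source_id into parallel label and score lists:

from collections import defaultdict
import json

def load_labels(path):
    # One list of labels and one parallel list of scores per forum.
    labels_by_forum = defaultdict(list)
    scores_by_forum = defaultdict(list)
    with open(path) as f:
        for line in f:
            record = json.loads(line)
            labels_by_forum[record['source_id']].append(record['label'])
            scores_by_forum[record['source_id']].append(record['score'])
    forums = list(labels_by_forum)
    return ([labels_by_forum[f] for f in forums],
            [scores_by_forum[f] for f in forums])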