Пример #1
0
def one_fold():
    """Score single-feature top-k prediction files and export to Excel.

    For every ``(paper field, feature type)`` pair the matching
    ``train.cos.<field>.<feature>.<sample>.top<n>`` CSV is read. Its
    ``paper_id`` column supplies the ground truth (one id per row) and its
    comma-separated ``topk`` column supplies the ranked predictions.
    ``mean_ap_k`` and ``mean_suc_k`` (defined elsewhere; presumably mean
    average precision@k and success@k -- confirm) are evaluated for each
    cut-off in ``k_list``. The results are printed and stored in the
    ``avg_map_k`` / ``avg_suc_k`` sheets, one row per pair and one column
    per cut-off.

    The ``var_map_k`` / ``var_suc_k`` sheets only receive their header row
    and row labels; no variance values are written by this function.

    Takes no arguments and returns ``None``; the workbook is its only
    output besides stdout.
    """
    # NOTE(review): ``path`` already ends in 'topk/random/' and the file
    # names below prepend 'topk/random/' again -- confirm that the doubled
    # directory is intended (the commented-out root ends in 'output/').
    path = 'D:/output/topk/random/'
    # path = '/mnt/sata_disk/hossein/wsdmcup2020/output/'
    k_list = [3, 5, 10, 100]
    random_sample_size = 1000
    top_relevant_paper_size = 100

    feature_types = ['bm.1_1_gram', 'bm.1_2_gram']
    feature_types += ['tf.1_1_gram', 'tf.1_2_gram']
    feature_types += ['tfidf.1_1_gram', 'tfidf.1_2_gram']
    feature_types += ['emd.50', 'emd.100', 'emd.200', 'emd.300']
    feature_types += [
        'glv.emd.50', 'glv.emd.100', 'glv.emd.200', 'glv.emd.300'
    ]
    feature_types += ['d2v.50', 'd2v.100', 'd2v.200', 'd2v.300']

    paper_info = ['title', 'abstract', 'journal']

    workbook = xlsxwriter.Workbook(
        path + 'topk/random/eval_one_fold.{}.{}.xlsx'.format(
            random_sample_size, top_relevant_paper_size))
    ws_avg_map_k = workbook.add_worksheet('avg_map_k')
    ws_var_map_k = workbook.add_worksheet('var_map_k')
    ws_avg_suc_k = workbook.add_worksheet('avg_suc_k')
    ws_var_suc_k = workbook.add_worksheet('var_suc_k')

    # Each sheet paired with the header template for its metric columns.
    # The var_suc_k sheet used to be mislabelled 'avg_suc@k'.
    sheets = (
        (ws_avg_map_k, 'avg_map@{}'),
        (ws_var_map_k, 'var_map@{}'),
        (ws_avg_suc_k, 'avg_suc@{}'),
        (ws_var_suc_k, 'var_suc@{}'),
    )

    # The header row is identical for every data row, so write it once.
    for sheet, header in sheets:
        for col, k in enumerate(k_list, start=1):
            sheet.write(0, col, header.format(k))

    row = 0
    for info in paper_info:
        for feature_type in feature_types:
            row += 1
            label = '{}.{}'.format(info, feature_type)
            for sheet, _ in sheets:
                sheet.write(row, 0, label)

            # The prediction file does not depend on k: load and parse it
            # once per (info, feature_type) instead of once per cut-off.
            predictions = pd.read_csv(
                path + 'topk/random/train.cos.{}.{}.{}.top{}'.format(
                    info, feature_type, random_sample_size,
                    top_relevant_paper_size),
                header=0)
            trues = [[t] for t in predictions['paper_id'].tolist()]
            preds = [str(p).split(',') for p in predictions['topk']]

            for col, k in enumerate(k_list, start=1):
                # Evaluate each metric once and reuse the value for both
                # the console report and the worksheet cell.
                map_k = mean_ap_k(true=trues, pred=preds, k=k)
                suc_k = mean_suc_k(true=trues, pred=preds, k=k)

                print('map@{}:{}.{}:{}'.format(
                    k, info, feature_type, map_k))
                print('suc@{}:{}.{}:{}'.format(
                    k, info, feature_type, suc_k))
                ws_avg_map_k.write(row, col, map_k)
                ws_avg_suc_k.write(row, col, suc_k)

    workbook.close()
Пример #2
0
def one_fold_mixed():
    """Score mixed (pairwise-combined) top-k prediction files into Excel.

    Every ordered pair of distinct ``(paper field, feature type)``
    combinations is visited. For each pair the matching
    ``train.cos.<f1>.<t1>.mix.<f2>.<t2>.<sample>.top<n>`` CSV is read. Its
    ``paper_id`` column supplies the ground truth (one id per row) and its
    comma-separated ``topk`` column supplies the ranked predictions.
    ``mean_ap_k`` and ``mean_suc_k`` (defined elsewhere; presumably mean
    average precision@k and success@k -- confirm) are evaluated for each
    cut-off in ``k_list`` and stored in the ``avg_map_k`` / ``avg_suc_k``
    sheets, one row per pair and one column per cut-off.

    The ``var_map_k`` / ``var_suc_k`` sheets only receive their header row
    and row labels; no variance values are written by this function.

    Takes no arguments and returns ``None``; the workbook is its only
    output besides the progress log lines.
    """
    path = 'D:/output/topk/random/'
    # path = '/mnt/sata_disk/hossein/wsdmcup2020/output/'
    k_list = [3, 5, 10, 100]
    random_sample_size = 1000
    top_relevant_paper_size = 100

    feature_types = [
        'bm.1_1_gram', 'bm.1_2_gram', 'tfidf.1_1_gram', 'tfidf.1_2_gram'
    ]
    paper_info = ['title', 'abstract']

    workbook = xlsxwriter.Workbook(
        path + 'eval_one_fold.mix.{}.{}.xlsx'.format(random_sample_size,
                                                     top_relevant_paper_size))
    ws_avg_map_k = workbook.add_worksheet('avg_map_k')
    ws_var_map_k = workbook.add_worksheet('var_map_k')
    ws_avg_suc_k = workbook.add_worksheet('avg_suc_k')
    ws_var_suc_k = workbook.add_worksheet('var_suc_k')

    # Each sheet paired with the header template for its metric columns.
    # The var_suc_k sheet used to be mislabelled 'avg_suc@k'.
    sheets = (
        (ws_avg_map_k, 'avg_map@{}'),
        (ws_var_map_k, 'var_map@{}'),
        (ws_avg_suc_k, 'avg_suc@{}'),
        (ws_var_suc_k, 'var_suc@{}'),
    )

    # The header row is identical for every data row, so write it once.
    for sheet, header in sheets:
        for col, k in enumerate(k_list, start=1):
            sheet.write(0, col, header.format(k))

    row = 0
    for info1 in paper_info:
        for feature_type1 in feature_types:
            for info2 in paper_info:
                for feature_type2 in feature_types:
                    # A combination mixed with itself is skipped.
                    if info1 == info2 and feature_type1 == feature_type2:
                        continue
                    logger.info(
                        'train.cos.{}.{} mixed with train.cos.{}.{} ...'.
                        format(info1, feature_type1, info2, feature_type2))
                    row += 1
                    label = '{}.{}.mix.{}.{}'.format(info1, feature_type1,
                                                     info2, feature_type2)
                    for sheet, _ in sheets:
                        sheet.write(row, 0, label)

                    # The prediction file does not depend on k: load and
                    # parse it once per pair instead of once per cut-off.
                    predictions = pd.read_csv(
                        path + 'train.cos.{}.{}.mix.{}.{}.{}.top{}'.format(
                            info1, feature_type1, info2, feature_type2,
                            random_sample_size, top_relevant_paper_size))

                    trues = [[t] for t in predictions['paper_id'].tolist()]
                    preds = [
                        str(p).split(',') for p in predictions['topk']
                    ]

                    for col, k in enumerate(k_list, start=1):
                        # Evaluate each metric exactly once per cut-off.
                        ws_avg_map_k.write(
                            row, col, mean_ap_k(true=trues, pred=preds, k=k))
                        ws_avg_suc_k.write(
                            row, col, mean_suc_k(true=trues, pred=preds,
                                                 k=k))

    workbook.close()