def load_testset_from_json_and_add_pos_tag():
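    '''
    For each test dataset, load <dataset>_testing.json, POS-tag the title and
    abstract (question for stackexchange) with copyseq_tokenize and the loaded
    tagger, and dump the results to <dataset>_testing_postag.json.
    Assumes module-level basedir, test_dataset_names, load_pos_tagger and
    copyseq_tokenize are defined elsewhere.
    '''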
    pos_tagger = load_pos_tagger()

    for dataset_name in test_dataset_names:
        abstract_key = 'abstract'
        if dataset_name == 'stackexchange':
            abstract_key = 'question'

        print('-' * 50)
        print('Loading %s' % dataset_name)
        json_path = os.path.join(basedir, dataset_name,
                                 dataset_name + '_testing.json')

        dataset_dict_list = []
        # load from json file
        with open(json_path, 'r') as json_file:
            for line in json_file:
                dataset_dict_list.append(json.loads(line))

        # dump back to json
        json_path = os.path.join(basedir, dataset_name,
                                 dataset_name + '_testing_postag.json')
        print('Processing %s and dumping to %s' % (dataset_name, json_path))
        with open(json_path, 'w') as json_file:
            # postag title/abstract and insert into data example
            for e_id, example_dict in enumerate(dataset_dict_list):
                print('=' * 50)
                print(e_id)
                print(example_dict['title'])
                print('len(title)=%d' % len(example_dict['title']))
                print('len(abstract)=%d' % len(example_dict[abstract_key]))

                if len(example_dict[abstract_key]) > 1000:
                    print('truncate abstract to 1000 characters')
                    example_dict[abstract_key] = example_dict[
                        abstract_key][:1000]

                if e_id % 10 == 0:
                    print('Processing %d/%d' % (e_id, len(dataset_dict_list)))

                title_postag_tokens = pos_tagger.tag(
                    copyseq_tokenize(example_dict['title']))
                # print('#(title token)=%d : %s' % (len(title_postag_tokens), str(title_postag_tokens)))
                abstract_postag_tokens = pos_tagger.tag(
                    copyseq_tokenize(example_dict[abstract_key]))
                # print('#(abstract token)=%d : %s' % (len(abstract_postag_tokens), str(abstract_postag_tokens)))
                example_dict['title_postag'] = ' '.join(
                    [str(t[0]) + '_' + str(t[1]) for t in title_postag_tokens])
                example_dict['abstract_postag'] = ' '.join([
                    str(t[0]) + '_' + str(t[1]) for t in abstract_postag_tokens
                ])
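                # e.g. 'recurrent_JJ neural_JJ networks_NNS': each token joined
                # to its POS tag by '_' (tags shown are illustrative)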

                json_file.write(json.dumps(example_dict) + '\n')
def evaluate_(source_str_list,
              targets_str_list,
              prediction_str_list,
              model_name,
              dataset_name,
              filter_criteria='present',
              matching_after_stemming=True,
              output_path=None):
    '''
    Evaluate keyphrase predictions against the ground-truth targets for one
    model on one dataset. filter_criteria selects which targets/predictions
    are scored ('present', 'absent' or 'all'). Computes P/R/F1, Bpref and MRR
    at 5 and 10, writes per-document results to
    <output_path>/<model_name>/<dataset_name>.json and appends corpus-level
    scores to <output_path>/all_scores.csv.
    '''
    assert filter_criteria in ['absent', 'present', 'all']
    stemmer = PorterStemmer()

    if output_path is not None:
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        if not os.path.exists(os.path.join(output_path, model_name)):
            os.makedirs(os.path.join(output_path, model_name))

    json_writer = open(
        os.path.join(output_path, model_name, '%s.json' % dataset_name), 'w+')
    score_csv_path = os.path.join(output_path, 'all_scores.csv')
    csv_writer = open(score_csv_path, 'a')

    print('Evaluating on %s@%s' % (model_name, dataset_name))
    # Evaluation part
    macro_metrics = []
    macro_matches = []

    total_source_length = 0
    length_groundtruth = []
    length_groundtruth_for_evaluate = []
    number_groundtruth = []
    number_groundtruth_for_evaluate = []
    total_number_groundtruth = 0
    total_number_groundtruth_for_evaluate = 0
    total_groundtruth_set = set()
    total_groundtruth_set_for_evaluate = set()

    # remove the empty targets first
    new_targets_str_list = []
    for targets_str in targets_str_list:
        new_targets_str = []
        for target_str in targets_str:
            if len(target_str.strip()) > 0:
                new_targets_str.append(target_str.strip())
        new_targets_str_list.append(new_targets_str)

    targets_str_list = new_targets_str_list

    real_test_size = 0
    """
    Iterate each document
    """
    for doc_id, (source_text, targets, predictions)\
            in enumerate(zip(source_str_list, targets_str_list, prediction_str_list)):
        # print(targets)
        # print(predictions)
        # print('*' * 100)

        # if doc_id > 5:
        #     break
        if (doc_id + 1) % 1000 == 0:
            print(doc_id)
        '''
        stem all texts/targets/predictions
        '''
        stemmed_source_text_tokens = [
            stemmer.stem(t).strip().lower()
            for t in io.copyseq_tokenize(source_text)
        ]
        stemmed_targets_tokens = [[
            stemmer.stem(w).strip().lower()
            for w in io.copyseq_tokenize(target)
        ] for target in targets]
        stemmed_predictions_tokens = [[
            stemmer.stem(w).strip().lower()
            for w in io.copyseq_tokenize(prediction)
        ] for prediction in predictions]
        '''
        check and filter targets/predictions by whether they appear in the source text
        '''
        if filter_criteria != 'all':
            if matching_after_stemming:
                source_tokens_to_match = stemmed_source_text_tokens
                targets_tokens_to_match = stemmed_targets_tokens
                predictions_tokens_to_match = stemmed_predictions_tokens
            else:
                source_tokens_to_match = io.copyseq_tokenize(
                    source_text.strip().lower())
                targets_tokens_to_match = [
                    io.copyseq_tokenize(target.strip().lower())
                    for target in targets
                ]
                predictions_tokens_to_match = [
                    io.copyseq_tokenize(prediction.strip().lower())
                    for prediction in predictions
                ]

            target_present_flags = check_if_present(source_tokens_to_match,
                                                    targets_tokens_to_match)
            prediction_present_flags = check_if_present(
                source_tokens_to_match, predictions_tokens_to_match)

            if filter_criteria == 'present':
                targets_valid_flags = target_present_flags
                prediction_valid_flags = prediction_present_flags
            elif filter_criteria == 'absent':
                targets_valid_flags = [not f for f in target_present_flags]
                prediction_valid_flags = [
                    not f for f in prediction_present_flags
                ]

            targets_for_evaluate = np.asarray(
                targets)[targets_valid_flags].tolist()
            stemmed_targets_for_evaluate = np.asarray(
                stemmed_targets_tokens)[targets_valid_flags].tolist()
            predictions_for_evaluate = np.asarray(
                predictions)[prediction_valid_flags].tolist()
            stemmed_predictions_for_evaluate = np.asarray(
                stemmed_predictions_tokens)[prediction_valid_flags].tolist()

        else:
            # filter_criteria == 'all': keep everything, but still compute the
            # presence flags used for logging and the JSON dump below
            source_tokens_to_match = stemmed_source_text_tokens
            target_present_flags = check_if_present(source_tokens_to_match,
                                                    stemmed_targets_tokens)
            prediction_present_flags = check_if_present(
                source_tokens_to_match, stemmed_predictions_tokens)
            targets_valid_flags = [True] * len(targets)
            targets_for_evaluate = targets
            stemmed_targets_for_evaluate = stemmed_targets_tokens
            predictions_for_evaluate = predictions
            stemmed_predictions_for_evaluate = stemmed_predictions_tokens

        total_source_length += len(source_tokens_to_match)
        total_number_groundtruth += len(targets)
        total_number_groundtruth_for_evaluate += len(targets_for_evaluate)

        number_groundtruth.append(len(targets))
        number_groundtruth_for_evaluate.append(len(targets_for_evaluate))

        for target in targets:
            total_groundtruth_set.add(' '.join(target))
            length_groundtruth.append(len(target))
        for target in targets_for_evaluate:
            total_groundtruth_set_for_evaluate.add(' '.join(target))
            length_groundtruth_for_evaluate.append(len(target))

        if len(targets_for_evaluate) > 0:
            real_test_size += 1

        # """
        '''
        check each prediction if it can match any ground-truth target
        '''
        valid_predictions_match_flags = get_match_flags(
            stemmed_targets_for_evaluate, stemmed_predictions_for_evaluate)
        predictions_match_flags = get_match_flags(stemmed_targets_for_evaluate,
                                                  stemmed_predictions_tokens)
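        # valid_predictions_match_flags is computed over the filtered predictions
        # (used for P/R/F1/Bpref/MRR below); predictions_match_flags covers all
        # predictions and is only used for logging and the JSON dump.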
        '''
        Compute metrics
        '''
        metric_dict = {}
        for number_to_predict in [5, 10]:
            metric_dict['target_number'] = len(targets_for_evaluate)
            metric_dict['prediction_number'] = len(predictions_for_evaluate)
            metric_dict['correct_number@%d' % number_to_predict] = sum(
                valid_predictions_match_flags[:number_to_predict])

            # Precision
            metric_dict['p@%d' % number_to_predict] = float(
                sum(valid_predictions_match_flags[:number_to_predict])
            ) / float(number_to_predict)

            # Recall
            if len(targets_for_evaluate) != 0:
                metric_dict['r@%d' % number_to_predict] = float(sum(valid_predictions_match_flags[:number_to_predict])) \
                                                          / float(len(targets_for_evaluate))
            else:
                metric_dict['r@%d' % number_to_predict] = 0

            # F-score
            if metric_dict['p@%d' % number_to_predict] + metric_dict[
                    'r@%d' % number_to_predict] != 0:
                metric_dict['f1@%d' % number_to_predict] = 2 * metric_dict[
                    'p@%d' % number_to_predict] * metric_dict[
                        'r@%d' % number_to_predict] / float(
                            metric_dict['p@%d' % number_to_predict] +
                            metric_dict['r@%d' % number_to_predict])
            else:
                metric_dict['f1@%d' % number_to_predict] = 0

            # Bpref: binary preference measure
            bpref = 0.
            # get the first k predictions to evaluate
            trunked_match = valid_predictions_match_flags[:number_to_predict].tolist()
            match_indexes = np.nonzero(trunked_match)[0]

            if len(match_indexes) > 0:
                for mid, mindex in enumerate(match_indexes):
                    # mindex predictions are ranked above this match and mid of
                    # them are correct, so (mindex - mid) incorrect predictions
                    # precede it; penalize by that count normalized by k
                    bpref += 1. - float(mindex - mid) / float(number_to_predict)
                metric_dict['bpref@%d' %
                            number_to_predict] = float(bpref) / float(
                                len(match_indexes))
            else:
                metric_dict['bpref@%d' % number_to_predict] = 0

            # MRR: mean reciprocal rank
            rank_first = 0
            try:
                rank_first = trunked_match.index(1) + 1
            except ValueError:
                pass

            if rank_first > 0:
                metric_dict['mrr@%d' %
                            number_to_predict] = float(1) / float(rank_first)
            else:
                metric_dict['mrr@%d' % number_to_predict] = 0

        macro_metrics.append(metric_dict)
        macro_matches.append(valid_predictions_match_flags)
        '''
        Print information on each prediction
        '''
        print_out = '[DOC_ID] %d\n' % doc_id
        print_out += '[SOURCE][{0}]: {1}\n'.format(len(source_text),
                                                   source_text)
        print_out += '[STEMMED SOURCE][{0}]: {1}'.format(
            len(stemmed_source_text_tokens),
            ' '.join(stemmed_source_text_tokens))
        print_out += '\n'

        print_out += '[TARGET]: %d/%d valid/all targets\n' % (
            len(targets_for_evaluate), len(targets))
        for target, stemmed_target, targets_valid_flag in zip(
                targets, stemmed_targets_tokens, targets_valid_flags):
            if targets_valid_flag:
                print_out += '\t\t%s (%s)\n' % (target,
                                                ' '.join(stemmed_target))
        for target, stemmed_target, targets_valid_flag in zip(
                targets, stemmed_targets_tokens, targets_valid_flags):
            if not targets_valid_flag:
                print_out += '\t\t[ABSENT]%s (%s)\n' % (
                    target, ' '.join(stemmed_target))

        print_out += '\n'

        num_correct_5 = sum(predictions_match_flags[:5]) if len(
            predictions_match_flags) >= 5 else sum(predictions_match_flags)
        num_correct_10 = sum(predictions_match_flags[:10]) if len(
            predictions_match_flags) >= 10 else sum(predictions_match_flags)
        print_out += '[DECODE]: %d/%d valid/all predictions, #(correct@5)=%d, #(correct@10)=%d' \
            % (len(predictions_for_evaluate), len(predictions), num_correct_5, num_correct_10)
        for prediction, stemmed_prediction, prediction_present_flag, predictions_match_flag \
                in zip(predictions, stemmed_predictions_tokens, prediction_present_flags, predictions_match_flags):
            if prediction_present_flag:
                print_out += ('\n\t\t%s (%s)' %
                              (prediction, ' '.join(stemmed_prediction)))
            else:
                print_out += ('\n\t\t[ABSENT]%s (%s)' %
                              (prediction, ' '.join(stemmed_prediction)))
            if predictions_match_flag == 1:
                print_out += ' [correct!]'
        # c += '\n'
        # for prediction, stemmed_prediction, prediction_present_flag, predictions_match_flag \
        #         in zip(predictions, stemmed_predictions_tokens, prediction_present_flags, predictions_match_flags):
        #     if not prediction_present_flag:
        #         c += ('\n\t\t[ABSENT]%s (%s)' % (prediction, ' '.join(stemmed_prediction)))
        #         if predictions_match_flag == 1:
        #             c += ' [correct!]'

        # c = '[DECODE]: {}'.format(' '.join(cut_zero(phrase, idx2word)))
        # if inputs_unk is not None:
        #     k = '[_INPUT]: {}\n'.format(' '.join(cut_zero(inputs_unk.tolist(),  idx2word, Lmax=len(idx2word))))
        #     logger.info(k)
        # a += k

        for number_to_predict in [5, 10]:
            print_out += '@%d - Precision=%.4f, Recall=%.4f, F1=%.4f, Bpref=%.4f, MRR=%.4f' % (
                number_to_predict, metric_dict['p@%d' % number_to_predict],
                metric_dict['r@%d' % number_to_predict],
                metric_dict['f1@%d' % number_to_predict],
                metric_dict['bpref@%d' % number_to_predict],
                metric_dict['mrr@%d' % number_to_predict])

        # logger.info(print_out)
        # logger.info('*' * 100)

        out_dict = {}
        out_dict['src_str'] = source_text
        out_dict['trg_str'] = targets
        out_dict['trg_present_flag'] = target_present_flags
        out_dict['pred_str'] = predictions
        out_dict['pred_score'] = [0.0] * len(predictions)
        out_dict['present_flag'] = prediction_present_flags
        out_dict['valid_flag'] = [True] * len(predictions)
        out_dict['match_flag'] = [float(m) for m in predictions_match_flags]

        # print(out_dict)

        json_writer.write(json.dumps(out_dict) + '\n')

        assert len(out_dict['trg_str']) == len(out_dict['trg_present_flag'])
        assert len(out_dict['pred_str']) == len(out_dict['present_flag']) \
               == len(out_dict['valid_flag']) == len(out_dict['match_flag']) == len(out_dict['pred_score'])
        # """

    logger.info('Avg(Source Text Length)=%.4f' %
                (float(total_source_length) / len(source_str_list)))
    logger.info('#(Target)=%d' % (len(length_groundtruth)))
    logger.info('Avg(Target Length)=%.4f' % (np.mean(length_groundtruth)))
    logger.info(
        '#(%s Target)=%d' %
        (filter_criteria.upper(), len(length_groundtruth_for_evaluate)))
    logger.info(
        'Avg(%s Target Length)=%.4f' %
        (filter_criteria.upper(), np.mean(length_groundtruth_for_evaluate)))

    logger.info('#(Ground-truth Keyphrase)=%d' % total_number_groundtruth)
    logger.info(
        '#(%s Ground-truth Keyphrase)=%d' %
        (filter_criteria.upper(), total_number_groundtruth_for_evaluate))
    logger.info('Avg(Ground-truth Keyphrase)=%.4f' %
                (float(total_number_groundtruth) / len(source_str_list)))
    logger.info(
        'Avg(%s Ground-truth Keyphrase)=%.4f' %
        (filter_criteria.upper(),
         float(total_number_groundtruth_for_evaluate) / len(source_str_list)))

    logger.info('#(Unique Ground-truth Keyphrase)=%d' %
                (len(total_groundtruth_set)))
    logger.info(
        '#(Unique %s Ground-truth Keyphrase)=%d' %
        (filter_criteria.upper(), len(total_groundtruth_set_for_evaluate)))

    logger.info('Avg(Ground-truth Keyphrase)=%.4f' %
                (np.mean(number_groundtruth)))
    logger.info('Var(Ground-truth Keyphrase)=%.4f' %
                (np.var(number_groundtruth)))
    logger.info('Std(Ground-truth Keyphrase)=%.4f' %
                (np.std(number_groundtruth)))

    logger.info(
        'Avg(%s Ground-truth Keyphrase)=%.4f' %
        (filter_criteria.upper(), np.mean(number_groundtruth_for_evaluate)))
    logger.info(
        'Var(%s Ground-truth Keyphrase)=%.4f' %
        (filter_criteria.upper(), np.var(number_groundtruth_for_evaluate)))
    logger.info(
        'Std(%s Ground-truth Keyphrase)=%.4f' %
        (filter_criteria.upper(), np.std(number_groundtruth_for_evaluate)))
    '''
    Export the f@5 and f@10 for significance test
    '''
    # for k in [5, 10]:
    #     with open(config['predict_path'] + '/macro-f@%d-' % (k) + model_name+'-'+dataset_name+'.txt', 'w') as writer:
    #         writer.write('\n'.join([str(m['f1@%d' % k]) for m in macro_metrics]))

    # """
    '''
    Compute the corpus evaluation
    '''
    overall_score = {}

    for k in [5, 10]:
        correct_number = sum(
            [m['correct_number@%d' % k] for m in macro_metrics])
        overall_target_number = sum(
            [m['target_number'] for m in macro_metrics])
        overall_prediction_number = sum(
            [m['prediction_number'] for m in macro_metrics])

        if real_test_size * k < overall_prediction_number:
            overall_prediction_number = real_test_size * k
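        # At cutoff k each valid document contributes at most k scored predictions,
        # so the reported prediction count is capped at real_test_size * k.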

        overall_score['target_number'] = sum(
            [m['target_number'] for m in macro_metrics])
        overall_score['correct_number@%d' % k] = sum(
            [m['correct_number@%d' % k] for m in macro_metrics])
        overall_score['prediction_number@%d' % k] = overall_prediction_number

        # Compute the macro Measures, by averaging the macro-score of each prediction
        overall_score['p@%d' % k] = float(
            sum([m['p@%d' % k]
                 for m in macro_metrics])) / float(real_test_size)
        overall_score['r@%d' % k] = float(
            sum([m['r@%d' % k]
                 for m in macro_metrics])) / float(real_test_size)
        overall_score['f1@%d' % k] = float(
            sum([m['f1@%d' % k]
                 for m in macro_metrics])) / float(real_test_size)

        # Print basic statistics
        logger.info('%s@%s' % (model_name, dataset_name))
        output_str = 'Overall - valid testing data=%d, Number of Target=%d/%d, ' \
                     'Number of Prediction=%d, Number of Correct=%d' % (
                         real_test_size,
                         overall_target_number, total_number_groundtruth,
                         overall_prediction_number, correct_number)
        logger.info(output_str)

        # Print macro-average performance
        output_str = 'macro:\t\tP@%d=%f, R@%d=%f, F1@%d=%f' % (
            k, overall_score['p@%d' % k], k, overall_score['r@%d' % k], k,
            overall_score['f1@%d' % k])
        logger.info(output_str)

        # Compute the binary preference measure (Bpref)
        overall_score['bpref@%d' % k] = float(
            sum([m['bpref@%d' % k]
                 for m in macro_metrics])) / float(real_test_size)

        # Compute the mean reciprocal rank (MRR)
        overall_score['mrr@%d' % k] = float(
            sum([m['mrr@%d' % k]
                 for m in macro_metrics])) / float(real_test_size)

        output_str = '\t\t\tBpref@%d=%f, MRR@%d=%f' % (
            k, overall_score['bpref@%d' % k], k, overall_score['mrr@%d' % k])
        logger.info(output_str)

    csv_writer.write(
        '%s, %s, '
        '%d, %d, %d, %d, %d, %d, '
        '%f, %f, %f, %f, %f, '
        '%f, %f, %f, %f, %f\n' %
        (model_name, dataset_name, len(source_str_list), real_test_size,
         total_number_groundtruth, total_number_groundtruth_for_evaluate,
         overall_score['correct_number@%d' % 5],
         overall_score['correct_number@%d' % 10], overall_score['p@%d' % 5],
         overall_score['r@%d' % 5], overall_score['f1@%d' % 5],
         overall_score['bpref@%d' % 5], overall_score['mrr@%d' % 5],
         overall_score['p@%d' % 10], overall_score['r@%d' % 10],
         overall_score['f1@%d' % 10], overall_score['bpref@%d' % 10],
         overall_score['mrr@%d' % 10]))

    json_writer.close()
    csv_writer.close()
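
# A minimal usage sketch (hypothetical list/model/dataset names and output path;
# evaluate_ only needs three parallel lists: source strings, gold keyphrase
# lists, and ranked prediction lists):
#
#   evaluate_(src_str_list, trg_str_lists, pred_str_lists,
#             model_name='copy_rnn', dataset_name='kp20k',
#             filter_criteria='present', matching_after_stemming=True,
#             output_path='output/eval')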
            if os.path.exists(source_file_path):
                print('=' * 50)
                print('Processing %s' % source_file_path)
            else:
                continue

            with open(source_file_path, 'r') as paper_file:
                for line in paper_file:
                    papers.append(json.loads(line))

            for paper in papers:
                # print(paper['keyword'])
                for kw in paper[kw_key_name].split(';'):
                    target_total_count += 1
                    trg_tokens = copyseq_tokenize(kw)
                    kw_freq = keyword_count_dict.get(kw, 0)
                    keyword_count_dict[kw] = kw_freq + 1

                    length_keyword_set = length_keyword_dict.get(
                        len(trg_tokens), set())
                    length_keyword_set.add(kw)
                    length_keyword_dict[len(trg_tokens)] = length_keyword_set

            print('#(papers) = %d' % len(papers))
            print('#(total targets) = %d' % target_total_count)
            print("export the keyword list")
            keyword_list = sorted(keyword_count_dict.keys())
            if not os.path.exists(os.path.join(source_dir, 'keyword_stats')):
                os.makedirs(os.path.join(source_dir, 'keyword_stats'))
            output_file_path = os.path.join(