def load_testset_from_json_and_add_pos_tag():
    pos_tagger = load_pos_tagger()

    for dataset_name in test_dataset_names:
        abstract_key = 'abstract'
        if dataset_name == 'stackexchange':
            abstract_key = 'question'

        print('-' * 50)
        print('Loading %s' % dataset_name)
        json_path = os.path.join(basedir, dataset_name, dataset_name + '_testing.json')
        dataset_dict_list = []

        # load examples from the json file (one json object per line)
        with open(json_path, 'r') as json_file:
            for line in json_file:
                dataset_dict_list.append(json.loads(line))

        print('Processing and dumping to %s' % dataset_name)

        # POS-tag title/abstract, insert the tags into each example and dump back to json
        json_path = os.path.join(basedir, dataset_name, dataset_name + '_testing_postag.json')
        with open(json_path, 'w') as json_file:
            for e_id, example_dict in enumerate(dataset_dict_list):
                print('=' * 50)
                print(e_id)
                print(example_dict['title'])
                print('len(title)=%d' % len(example_dict['title']))
                print('len(abstract)=%d' % len(example_dict[abstract_key]))

                # truncate overly long abstracts (character-level slice)
                if len(example_dict[abstract_key]) > 1000:
                    print('truncate to 1000 characters')
                    example_dict[abstract_key] = example_dict[abstract_key][:1000]

                if e_id % 10 == 0:
                    print('Processing %d/%d' % (e_id, len(dataset_dict_list)))

                title_postag_tokens = pos_tagger.tag(
                    copyseq_tokenize(example_dict['title']))
                abstract_postag_tokens = pos_tagger.tag(
                    copyseq_tokenize(example_dict[abstract_key]))

                # store the tagged text as 'token_TAG token_TAG ...' strings
                example_dict['title_postag'] = ' '.join(
                    [str(t[0]) + '_' + str(t[1]) for t in title_postag_tokens])
                example_dict['abstract_postag'] = ' '.join(
                    [str(t[0]) + '_' + str(t[1]) for t in abstract_postag_tokens])

                json_file.write(json.dumps(example_dict) + '\n')
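
# A minimal, self-contained sketch of the 'token_TAG' format written above.
# It uses NLTK's pos_tag and a plain whitespace split as stand-ins for the
# project's load_pos_tagger() and copyseq_tokenize(); the real pipeline may
# tokenize and tag differently, so treat this only as an illustration.
def _postag_format_example():
    import nltk  # requires the 'averaged_perceptron_tagger' resource
    title = 'Deep keyphrase generation with copy mechanism'
    tokens = title.split()               # stand-in for copyseq_tokenize()
    tagged = nltk.pos_tag(tokens)        # list of (token, POS) pairs
    # same join as above: a single 'token_TAG token_TAG ...' string
    return ' '.join('%s_%s' % (tok, tag) for tok, tag in tagged)
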
def evaluate_(source_str_list,
              targets_str_list,
              prediction_str_list,
              model_name,
              dataset_name,
              filter_criteria='present',
              matching_after_stemming=True,
              output_path=None):
    '''
    Evaluate keyphrase predictions against ground-truth targets.

    Computes Precision/Recall/F1, Bpref and MRR at cutoffs 5 and 10, optionally
    restricted to keyphrases that are present in (or absent from) the source
    text. If output_path is given, per-document results are dumped to a JSON
    file and corpus-level scores are appended to all_scores.csv.
    '''
    assert filter_criteria in ['absent', 'present', 'all']
    stemmer = PorterStemmer()

    if output_path is not None:
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        if not os.path.exists(os.path.join(output_path, model_name)):
            os.makedirs(os.path.join(output_path, model_name))
        json_writer = open(
            os.path.join(output_path, model_name, '%s.json' % dataset_name), 'w+')
        score_csv_path = os.path.join(output_path, 'all_scores.csv')
        csv_writer = open(score_csv_path, 'a')

    print('Evaluating on %s@%s' % (model_name, dataset_name))

    macro_metrics = []
    macro_matches = []

    total_source_length = 0
    length_groundtruth = []
    length_groundtruth_for_evaluate = []
    number_groundtruth = []
    number_groundtruth_for_evaluate = []
    total_number_groundtruth = 0
    total_number_groundtruth_for_evaluate = 0
    total_groundtruth_set = set()
    total_groundtruth_set_for_evaluate = set()

    # remove empty targets first
    new_targets_str_list = []
    for targets_str in targets_str_list:
        new_targets_str = []
        for target_str in targets_str:
            if len(target_str.strip()) > 0:
                new_targets_str.append(target_str.strip())
        new_targets_str_list.append(new_targets_str)
    targets_str_list = new_targets_str_list

    real_test_size = 0

    # iterate over each document
    for doc_id, (source_text, targets, predictions) \
            in enumerate(zip(source_str_list, targets_str_list, prediction_str_list)):
        if (doc_id + 1) % 1000 == 0:
            print(doc_id)

        # stem all texts/targets/predictions
        stemmed_source_text_tokens = [
            stemmer.stem(t).strip().lower()
            for t in io.copyseq_tokenize(source_text)
        ]
        stemmed_targets_tokens = [
            [stemmer.stem(w).strip().lower() for w in io.copyseq_tokenize(target)]
            for target in targets
        ]
        stemmed_predictions_tokens = [
            [stemmer.stem(w).strip().lower() for w in io.copyseq_tokenize(prediction)]
            for prediction in predictions
        ]

        # determine which tokens are used for present/absent matching
        if matching_after_stemming:
            source_tokens_to_match = stemmed_source_text_tokens
            targets_tokens_to_match = stemmed_targets_tokens
            predictions_tokens_to_match = stemmed_predictions_tokens
        else:
            source_tokens_to_match = io.copyseq_tokenize(source_text.strip().lower())
            targets_tokens_to_match = [
                io.copyseq_tokenize(target.strip().lower()) for target in targets
            ]
            predictions_tokens_to_match = [
                io.copyseq_tokenize(prediction.strip().lower())
                for prediction in predictions
            ]

        target_present_flags = check_if_present(source_tokens_to_match,
                                                targets_tokens_to_match)
        prediction_present_flags = check_if_present(source_tokens_to_match,
                                                    predictions_tokens_to_match)

        # filter targets/predictions by whether they appear in the source text
        if filter_criteria == 'present':
            targets_valid_flags = target_present_flags
            prediction_valid_flags = prediction_present_flags
        elif filter_criteria == 'absent':
            targets_valid_flags = [not f for f in target_present_flags]
            prediction_valid_flags = [not f for f in prediction_present_flags]
        else:  # 'all': keep every target and prediction
            targets_valid_flags = [True] * len(targets)
            prediction_valid_flags = [True] * len(predictions)

        targets_for_evaluate = [
            t for t, f in zip(targets, targets_valid_flags) if f
        ]
        stemmed_targets_for_evaluate = [
            t for t, f in zip(stemmed_targets_tokens, targets_valid_flags) if f
        ]
        predictions_for_evaluate = [
            p for p, f in zip(predictions, prediction_valid_flags) if f
        ]
        stemmed_predictions_for_evaluate = [
            p for p, f in zip(stemmed_predictions_tokens, prediction_valid_flags) if f
        ]

        total_source_length += len(source_tokens_to_match)
        total_number_groundtruth += len(targets)
        total_number_groundtruth_for_evaluate += len(targets_for_evaluate)
        number_groundtruth.append(len(targets))
        number_groundtruth_for_evaluate.append(len(targets_for_evaluate))

        # record unique keyphrases and keyphrase lengths (in tokens)
        for target, stemmed_target in zip(targets, stemmed_targets_tokens):
            total_groundtruth_set.add(target)
            length_groundtruth.append(len(stemmed_target))
        for target, stemmed_target in zip(targets_for_evaluate,
                                          stemmed_targets_for_evaluate):
            total_groundtruth_set_for_evaluate.add(target)
            length_groundtruth_for_evaluate.append(len(stemmed_target))

        if len(targets_for_evaluate) > 0:
            real_test_size += 1

        # check whether each prediction matches any ground-truth target
        valid_predictions_match_flags = get_match_flags(
            stemmed_targets_for_evaluate, stemmed_predictions_for_evaluate)
        predictions_match_flags = get_match_flags(
            stemmed_targets_for_evaluate, stemmed_predictions_tokens)

        # compute per-document metrics
        metric_dict = {}
        for number_to_predict in [5, 10]:
            metric_dict['target_number'] = len(targets_for_evaluate)
            metric_dict['prediction_number'] = len(predictions_for_evaluate)
            metric_dict['correct_number@%d' % number_to_predict] = sum(
                valid_predictions_match_flags[:number_to_predict])

            # Precision
            metric_dict['p@%d' % number_to_predict] = float(
                sum(valid_predictions_match_flags[:number_to_predict])) \
                / float(number_to_predict)

            # Recall
            if len(targets_for_evaluate) != 0:
                metric_dict['r@%d' % number_to_predict] = float(
                    sum(valid_predictions_match_flags[:number_to_predict])) \
                    / float(len(targets_for_evaluate))
            else:
                metric_dict['r@%d' % number_to_predict] = 0

            # F1-score
            if metric_dict['p@%d' % number_to_predict] \
                    + metric_dict['r@%d' % number_to_predict] != 0:
                metric_dict['f1@%d' % number_to_predict] = \
                    2 * metric_dict['p@%d' % number_to_predict] \
                    * metric_dict['r@%d' % number_to_predict] \
                    / float(metric_dict['p@%d' % number_to_predict]
                            + metric_dict['r@%d' % number_to_predict])
            else:
                metric_dict['f1@%d' % number_to_predict] = 0

            # Bpref: binary preference measure
            bpref = 0.
            # keep only the top-K predictions for evaluation
            truncated_match = valid_predictions_match_flags[:number_to_predict].tolist()
            match_indexes = np.nonzero(truncated_match)[0]
            if len(match_indexes) > 0:
                for mid, mindex in enumerate(match_indexes):
                    # mindex items precede the (mindex+1)-th item, mid of them are correct
                    bpref += 1. - float(mindex - mid) / float(number_to_predict)
                metric_dict['bpref@%d' % number_to_predict] = \
                    float(bpref) / float(len(match_indexes))
            else:
                metric_dict['bpref@%d' % number_to_predict] = 0

            # MRR: mean reciprocal rank
            rank_first = 0
            try:
                rank_first = truncated_match.index(1) + 1
            except ValueError:
                pass
            if rank_first > 0:
                metric_dict['mrr@%d' % number_to_predict] = float(1) / float(rank_first)
            else:
                metric_dict['mrr@%d' % number_to_predict] = 0

        macro_metrics.append(metric_dict)
        macro_matches.append(valid_predictions_match_flags)

        # print information on each prediction
        print_out = '[DOC_ID] %d\n' % doc_id
        print_out += '[SOURCE][{0}]: {1}\n'.format(len(source_text), source_text)
        print_out += '[STEMMED SOURCE][{0}]: {1}\n'.format(
            len(stemmed_source_text_tokens), ' '.join(stemmed_source_text_tokens))
        print_out += '[TARGET]: %d/%d valid/all targets\n' % (
            len(targets_for_evaluate), len(targets))
        for target, stemmed_target, targets_valid_flag in zip(
                targets, stemmed_targets_tokens, targets_valid_flags):
            if targets_valid_flag:
                print_out += '\t\t%s (%s)\n' % (target, ' '.join(stemmed_target))
        for target, stemmed_target, targets_valid_flag in zip(
                targets, stemmed_targets_tokens, targets_valid_flags):
            if not targets_valid_flag:
                print_out += '\t\t[ABSENT]%s (%s)\n' % (target, ' '.join(stemmed_target))
        print_out += '\n'

        num_correct_5 = sum(predictions_match_flags[:5])
        num_correct_10 = sum(predictions_match_flags[:10])
        print_out += '[DECODE]: %d/%d valid/all predictions, #(correct@5)=%d, #(correct@10)=%d' \
            % (len(predictions_for_evaluate), len(predictions),
               num_correct_5, num_correct_10)
        for prediction, stemmed_prediction, prediction_present_flag, predictions_match_flag \
                in zip(predictions, stemmed_predictions_tokens,
                       prediction_present_flags, predictions_match_flags):
            if prediction_present_flag:
                print_out += '\n\t\t%s (%s)' % (prediction, ' '.join(stemmed_prediction))
            else:
                print_out += '\n\t\t[ABSENT]%s (%s)' % (prediction,
                                                        ' '.join(stemmed_prediction))
            if predictions_match_flag == 1:
                print_out += ' [correct!]'

        for number_to_predict in [5, 10]:
            print_out += '\n@%d - Precision=%.4f, Recall=%.4f, F1=%.4f, Bpref=%.4f, MRR=%.4f' % (
                number_to_predict,
                metric_dict['p@%d' % number_to_predict],
                metric_dict['r@%d' % number_to_predict],
                metric_dict['f1@%d' % number_to_predict],
                metric_dict['bpref@%d' % number_to_predict],
                metric_dict['mrr@%d' % number_to_predict])
        # logger.info(print_out)

        out_dict = {}
        out_dict['src_str'] = source_text
        out_dict['trg_str'] = targets
        out_dict['trg_present_flag'] = target_present_flags
        out_dict['pred_str'] = predictions
        out_dict['pred_score'] = [0.0] * len(predictions)
        out_dict['present_flag'] = prediction_present_flags
        out_dict['valid_flag'] = [True] * len(predictions)
        out_dict['match_flag'] = [float(m) for m in predictions_match_flags]

        assert len(out_dict['trg_str']) == len(out_dict['trg_present_flag'])
        assert len(out_dict['pred_str']) == len(out_dict['present_flag']) \
            == len(out_dict['valid_flag']) == len(out_dict['match_flag']) \
            == len(out_dict['pred_score'])

        if output_path is not None:
            json_writer.write(json.dumps(out_dict) + '\n')

    logger.info('Avg(Source Text Length)=%.4f' %
                (float(total_source_length) / len(source_str_list)))
    logger.info('#(Target)=%d' % len(length_groundtruth))
    logger.info('Avg(Target Length)=%.4f' % np.mean(length_groundtruth))
    logger.info('#(%s Target)=%d' %
                (filter_criteria.upper(), len(length_groundtruth_for_evaluate)))
    logger.info('Avg(%s Target Length)=%.4f' %
                (filter_criteria.upper(), np.mean(length_groundtruth_for_evaluate)))

    logger.info('#(Ground-truth Keyphrase)=%d' % total_number_groundtruth)
    logger.info('#(%s Ground-truth Keyphrase)=%d' %
                (filter_criteria.upper(), total_number_groundtruth_for_evaluate))
    logger.info('Avg(Ground-truth Keyphrase)=%.4f' %
                (float(total_number_groundtruth) / len(source_str_list)))
    logger.info('Avg(%s Ground-truth Keyphrase)=%.4f' %
                (filter_criteria.upper(),
                 float(total_number_groundtruth_for_evaluate) / len(source_str_list)))
    logger.info('#(Unique Ground-truth Keyphrase)=%d' % len(total_groundtruth_set))
    logger.info('#(Unique %s Ground-truth Keyphrase)=%d' %
                (filter_criteria.upper(), len(total_groundtruth_set_for_evaluate)))
    logger.info('Avg(Ground-truth Keyphrase)=%.4f' % np.mean(number_groundtruth))
    logger.info('Var(Ground-truth Keyphrase)=%.4f' % np.var(number_groundtruth))
    logger.info('Std(Ground-truth Keyphrase)=%.4f' % np.std(number_groundtruth))
    logger.info('Avg(%s Ground-truth Keyphrase)=%.4f' %
                (filter_criteria.upper(), np.mean(number_groundtruth_for_evaluate)))
    logger.info('Var(%s Ground-truth Keyphrase)=%.4f' %
                (filter_criteria.upper(), np.var(number_groundtruth_for_evaluate)))
    logger.info('Std(%s Ground-truth Keyphrase)=%.4f' %
                (filter_criteria.upper(), np.std(number_groundtruth_for_evaluate)))

    # (optional) export per-document f1@5 / f1@10 scores for significance testing
    # for k in [5, 10]:
    #     with open(config['predict_path'] + '/macro-f@%d-' % k + model_name + '-' + dataset_name + '.txt', 'w') as writer:
    #         writer.write('\n'.join([str(m['f1@%d' % k]) for m in macro_metrics]))

    # compute the corpus-level evaluation
    overall_score = {}
    for k in [5, 10]:
        correct_number = sum([m['correct_number@%d' % k] for m in macro_metrics])
        overall_target_number = sum([m['target_number'] for m in macro_metrics])
        overall_prediction_number = sum([m['prediction_number'] for m in macro_metrics])
        if real_test_size * k < overall_prediction_number:
            overall_prediction_number = real_test_size * k

        overall_score['target_number'] = sum([m['target_number'] for m in macro_metrics])
        overall_score['correct_number@%d' % k] = sum(
            [m['correct_number@%d' % k] for m in macro_metrics])
        overall_score['prediction_number@%d' % k] = overall_prediction_number

        # macro measures: average the per-document scores
        overall_score['p@%d' % k] = float(
            sum([m['p@%d' % k] for m in macro_metrics])) / float(real_test_size)
        overall_score['r@%d' % k] = float(
            sum([m['r@%d' % k] for m in macro_metrics])) / float(real_test_size)
        overall_score['f1@%d' % k] = float(
            sum([m['f1@%d' % k] for m in macro_metrics])) / float(real_test_size)

        # print basic statistics
        logger.info('%s@%s' % (model_name, dataset_name))
        output_str = 'Overall - valid testing data=%d, Number of Target=%d/%d, ' \
                     'Number of Prediction=%d, Number of Correct=%d' % (
                         real_test_size, overall_target_number, total_number_groundtruth,
                         overall_prediction_number, correct_number)
        logger.info(output_str)

        # print macro-averaged performance
        output_str = 'macro:\t\tP@%d=%f, R@%d=%f, F1@%d=%f' % (
            k, overall_score['p@%d' % k],
            k, overall_score['r@%d' % k],
            k, overall_score['f1@%d' % k])
        logger.info(output_str)

        # binary preference measure (Bpref) and mean reciprocal rank (MRR)
        overall_score['bpref@%d' % k] = float(
            sum([m['bpref@%d' % k] for m in macro_metrics])) / float(real_test_size)
        overall_score['mrr@%d' % k] = float(
            sum([m['mrr@%d' % k] for m in macro_metrics])) / float(real_test_size)
        output_str = '\t\t\tBpref@%d=%f, MRR@%d=%f' % (
            k, overall_score['bpref@%d' % k],
            k, overall_score['mrr@%d' % k])
        logger.info(output_str)

    if output_path is not None:
        csv_writer.write(
            '%s, %s, '
            '%d, %d, %d, %d, %d, %d, '
            '%f, %f, %f, %f, %f, '
            '%f, %f, %f, %f, %f\n' % (
                model_name, dataset_name,
                len(source_str_list), real_test_size,
                total_number_groundtruth, total_number_groundtruth_for_evaluate,
                overall_score['correct_number@%d' % 5],
                overall_score['correct_number@%d' % 10],
                overall_score['p@%d' % 5], overall_score['r@%d' % 5],
                overall_score['f1@%d' % 5], overall_score['bpref@%d' % 5],
                overall_score['mrr@%d' % 5],
                overall_score['p@%d' % 10], overall_score['r@%d' % 10],
                overall_score['f1@%d' % 10], overall_score['bpref@%d' % 10],
                overall_score['mrr@%d' % 10]))
        json_writer.close()
        csv_writer.close()
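
# A small worked example of the per-document metrics computed in evaluate_(),
# using a toy list of match flags (1 = prediction matches some gold keyphrase).
# It mirrors the @k logic above but is illustrative only; the names below are
# hypothetical and not part of the original code.
def _toy_metric_example():
    match_flags = np.array([0, 1, 0, 1, 0])    # 2 correct predictions in the top 5
    num_targets = 4                            # toy number of gold keyphrases
    k = 5
    topk = match_flags[:k]
    p_at_k = topk.sum() / float(k)             # 2/5 = 0.4
    r_at_k = topk.sum() / float(num_targets)   # 2/4 = 0.5
    f1_at_k = 2 * p_at_k * r_at_k / (p_at_k + r_at_k)
    # Bpref: penalize each correct hit by the number of wrong items ranked above it
    hits = np.nonzero(topk)[0]                 # 0-based ranks of correct predictions
    bpref = sum(1. - float(idx - i) / k for i, idx in enumerate(hits)) / len(hits)
    # MRR: reciprocal rank of the first correct prediction
    mrr = 1. / (hits[0] + 1) if len(hits) else 0.
    return p_at_k, r_at_k, f1_at_k, bpref, mrr
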
    if os.path.exists(source_file_path):
        print('=' * 50)
        print('Processing %s' % source_file_path)
    else:
        continue

    with open(source_file_path, 'r') as paper_file:
        for line in paper_file:
            papers.append(json.loads(line))

    for paper in papers:
        # print(paper['keyword'])
        for kw in paper[kw_key_name].split(';'):
            target_total_count += 1
            trg_tokens = copyseq_tokenize(kw)

            kw_freq = keyword_count_dict.get(kw, 0)
            keyword_count_dict[kw] = kw_freq + 1

            length_keyword_set = length_keyword_dict.get(len(trg_tokens), set())
            length_keyword_set.add(kw)
            length_keyword_dict[len(trg_tokens)] = length_keyword_set

print('len(example) = %d' % len(papers))
print('len(total targets) = %d' % target_total_count)

print("export the keyword list")
keyword_list = sorted(keyword_count_dict.keys())
if not os.path.exists(os.path.join(source_dir, 'keyword_stats')):
    os.makedirs(os.path.join(source_dir, 'keyword_stats'))
output_file_path = os.path.join(