def evaluate_genia_dataset():
    """Run keyword extraction on the GENIA corpus and evaluate it against the GS terms.

    Pre-processes the GENIA corpus with spaCy, then performs two timed
    extraction runs over the same materialised corpus — one with the
    'norm_max' weighting and one with 'gcvalue' — printing the elapsed
    time and top-50 keywords, and evaluating each result set against the
    gold-standard terms loaded from 'genia_gs_terms.txt'.

    NOTE(review): relies on a module-level `import os` and on project
    helpers (`load_genia_gs_terms`, `pre_processing_corpus_with_spacy`,
    `evaluate_results`, `WordListCorpusReader`) defined elsewhere in this
    file / its imports.
    """
    import logging.config
    logging.config.fileConfig(os.path.join('..', 'logging.conf'))
    import multiprocessing
    import time

    from jgtextrank import keywords_extraction_from_tagged_corpus

    reader = WordListCorpusReader('../resource', 'smart-stop-list.txt')
    stop_list = reader.words()

    gs_terms = load_genia_gs_terms('genia_gs_terms.txt')

    corpus_directory = os.path.join('GENIAcorpus302', 'text', 'files')
    pre_processed_corpus = pre_processing_corpus_with_spacy(
        corpus_directory, encoding="utf-8", lemma=True)
    print("term extraction from spaCy pre-processed corpus [%s] ..." % corpus_directory)
    # Materialise the (presumably lazy) pre-processed corpus once so that
    # both extraction runs below can iterate over it.
    pre_processed_corpus = list(pre_processed_corpus)

    def _extract_and_evaluate(weight_comb, label):
        # One timed extraction run followed by evaluation against the GS terms.
        start = time.time()
        keywords, _top_vertices = keywords_extraction_from_tagged_corpus(
            pre_processed_corpus, window=2, top_p=1, stop_words=stop_list,
            weight_comb=weight_comb, export=False,
            workers=multiprocessing.cpu_count())
        end = time.time()
        print("\n")
        print("Complete. Wall-clock elapsed time: ", end - start, "s")
        print("\n")
        print("top 50 keywords: ", keywords[:50])
        evaluate_results(keywords, gs_terms, label)

    _extract_and_evaluate("norm_max", "genia+pagerank")
    _extract_and_evaluate("gcvalue", "genia+pagerank+gcvalue")
def evaluate_hulth2003_testset():
    """Run keyword extraction on the Hulth2003 test set and evaluate against its GS terms.

    Pre-processes the Hulth2003/Test abstracts (files with the '.abstr'
    suffix) with spaCy, then performs two timed extraction runs — window=3
    with 'norm_max' weighting and window=2 with 'gcvalue' — printing the
    elapsed time and top-50 keywords, and evaluating each result set
    against the gold-standard terms.

    Fix over the original: the duplicate re-assignment of
    `hulth2003_corpus_directory` to the identical path has been removed.

    NOTE(review): relies on a module-level `import os` and on project
    helpers (`load_Hulth2003_gs_terms`, `pre_processing_corpus_with_spacy`,
    `evaluate_results`, `WordListCorpusReader`) defined elsewhere.
    """
    import logging.config
    logging.config.fileConfig(os.path.join('..', 'logging.conf'))
    import multiprocessing
    import time

    from jgtextrank import keywords_extraction_from_tagged_corpus

    reader = WordListCorpusReader('../resource', 'smart-stop-list.txt')
    stop_list = reader.words()

    hulth2003_corpus_directory = os.path.join('Hulth2003', 'Test')
    hulth2003_gs_terms = load_Hulth2003_gs_terms(hulth2003_corpus_directory)

    pre_processed_hulth2003_corpus = pre_processing_corpus_with_spacy(
        hulth2003_corpus_directory, encoding="utf-8", lemma=True,
        default_file_suffix=".abstr")
    print("term extraction from spaCy pre-processed hulth2003 corpus [%s] ..."
          % hulth2003_corpus_directory)
    # Materialise once so both extraction runs can reuse the same corpus.
    pre_processed_hulth2003_corpus = list(pre_processed_hulth2003_corpus)

    def _extract_and_evaluate(window, weight_comb, label):
        # One timed extraction run followed by evaluation against the GS terms.
        start = time.time()
        keywords, _top_t_vertices = keywords_extraction_from_tagged_corpus(
            pre_processed_hulth2003_corpus, window=window, top_p=1,
            stop_words=stop_list, weight_comb=weight_comb, export=False,
            workers=multiprocessing.cpu_count())
        end = time.time()
        print("\n")
        print("Complete. Wall-clock elapsed time: ", end - start, "s")
        print("\n")
        print("top 50 keywords: ", keywords[:50])
        print("\n")
        evaluate_results(keywords, hulth2003_gs_terms, label)

    _extract_and_evaluate(3, "norm_max", "hulth2003+pagerank")
    _extract_and_evaluate(2, "gcvalue", "hulth2003+pagerank+gcvalue")
def evaluate_semeval2017_testset():
    """Run keyword extraction on the SemEval-2017 (ScienceIE) test articles and evaluate.

    Loads the gold-standard terms from the '.ann' annotation files,
    pre-processes the '.txt' articles with spaCy, then performs two timed
    single-worker extraction runs (window=3) — one with 'avg' weighting and
    one with 'gcvalue' — printing elapsed time and top-50 keywords, and
    evaluating each result set against the GS terms.

    Fixes over the original: the unused `import multiprocessing` is removed
    (both runs use workers=1), and the first evaluation label is corrected —
    it said "pagerank+norm_max" although the run uses weight_comb="avg".

    NOTE(review): relies on a module-level `import os` and on project
    helpers (`load_scienceie_test_dataset`, `pre_processing_corpus_with_spacy`,
    `evaluate_results`, `WordListCorpusReader`) defined elsewhere.
    """
    import logging.config
    logging.config.fileConfig(os.path.join('..', 'logging.conf'))
    import time

    from jgtextrank import keywords_extraction_from_tagged_corpus

    gs_terms = load_scienceie_test_dataset("semeval_articles_test", file_suffix=".ann")
    print(len(gs_terms), " gs terms loaded.")

    reader = WordListCorpusReader('../resource', 'smart-stop-list.txt')
    stop_list = reader.words()

    semeval2017_pre_processed_corpus = pre_processing_corpus_with_spacy(
        "semeval_articles_test", default_file_suffix=".txt")
    # Materialise once so both extraction runs can reuse the same corpus.
    semeval2017_pre_processed_corpus = list(semeval2017_pre_processed_corpus)

    def _extract_and_evaluate(weight_comb, label):
        # One timed single-worker extraction run followed by evaluation.
        start = time.time()
        keywords, _top_t_vertices = keywords_extraction_from_tagged_corpus(
            semeval2017_pre_processed_corpus, window=3, top_p=1,
            stop_words=stop_list, weight_comb=weight_comb, export=False,
            workers=1)
        end = time.time()
        print("\n")
        print("Complete. Wall-clock elapsed time: ", end - start, "s")
        print("\n")
        print("top 50 keywords: ", keywords[:50])
        print("\n")
        evaluate_results(keywords, gs_terms, label)

    # Label corrected to match the actual weighting scheme used ("avg").
    _extract_and_evaluate("avg", "pagerank+avg")
    _extract_and_evaluate("gcvalue", "pagerank + GC-Value")
def evaluate_aclrdtec1_dataset():
    """Run keyword extraction on the ACL RD-TEC 1.0 corpus and evaluate against its GS terms.

    Loads the normalised gold-standard terms, reads the zipped corpus,
    pre-processes its documents with spaCy, then performs two timed
    extraction runs (window=2) — one with 'norm_max' weighting and one
    with 'gcvalue' — printing elapsed time and top-50 keywords, and
    evaluating each result set against the GS terms.

    Fix over the original: the "Complete. Wall-clock elapsed time:" print
    literal was broken across two physical lines (invalid as pasted); it
    is restored to a single string consistent with the sibling functions.

    NOTE(review): relies on a module-level `import os` and on project
    helpers (`load_aclrdtec1_gs_terms`, `load_all_files_from_zip_file`,
    `pre_processing_unzipped_corpus_with_spacy`, `evaluate_results`,
    `WordListCorpusReader`) defined elsewhere. Paths are hard-coded to a
    specific machine ('/home/jieg/data/...') — adjust before running.
    """
    import logging.config
    logging.config.fileConfig(os.path.join('../jgtextrank', 'logging.conf'))
    import multiprocessing
    import time

    from jgtextrank import keywords_extraction_from_tagged_corpus

    reader = WordListCorpusReader('../jgtextrank', 'smart-stop-list.txt')
    stop_list = reader.words()

    # Windows origin of the data set:
    # C:\\Data\\NLP-corpus\\ACL RD-TEC\\_all_annotated_candid_term\\_all_annotated_candid_term
    aclrdtec1_gs_terms_file_path = os.path.join(
        '/home', 'jieg', 'data', 'ACL RD-TEC-1', '_all_annotated_candid_term')
    print("loading ACLRDTEC 1.0 GS terms from [%s] ..." % aclrdtec1_gs_terms_file_path)
    aclrdtec1_gs_terms = load_aclrdtec1_gs_terms(aclrdtec1_gs_terms_file_path)
    print("total [%s] normed GS terms loaded" % len(aclrdtec1_gs_terms))

    zipped_corpus_path = os.path.join(
        '/home', 'jieg', 'data', 'ACL RD-TEC-1', 'ACLRDTEC-1.zip')
    print("term extraction from spaCy pre-processed corpus [%s] ..." % zipped_corpus_path)

    # Timed pre-processing phase: unzip, spaCy-process, then materialise
    # once so both extraction runs can reuse the same corpus.
    start = time.time()
    doc_content_list = load_all_files_from_zip_file(zipped_corpus_path)
    pre_processed_corpus = pre_processing_unzipped_corpus_with_spacy(doc_content_list)
    pre_processed_corpus = list(pre_processed_corpus)
    end = time.time()
    print("\n")
    print("Complete corpus pre-processing. Wall-clock elapsed time: ", end - start, "s")
    print("\n")

    def _extract_and_evaluate(weight_comb, label):
        # One timed extraction run followed by evaluation against the GS terms.
        start = time.time()
        keywords, _top_vertices = keywords_extraction_from_tagged_corpus(
            pre_processed_corpus, window=2, top_p=1, stop_words=stop_list,
            weight_comb=weight_comb, export=False,
            workers=multiprocessing.cpu_count())
        end = time.time()
        print("\n")
        print("Complete. Wall-clock elapsed time: ", end - start, "s")
        print("\n")
        print("top 50 keywords: ", keywords[:50])
        evaluate_results(keywords, aclrdtec1_gs_terms, label)

    _extract_and_evaluate("norm_max", "aclrdtec1 + pagerank")
    _extract_and_evaluate("gcvalue", "aclrdtec1+pagerank+gcvalue")