def test_annotated_empty(self):
    """Annotations that match no document in the corpus yield an empty corpus."""
    # NOTE(review): the ".json1" extension looks like a typo for ".jsonl"
    # (cf. news.jsonl elsewhere) — confirm against the fixture on disk.
    labels = corpus.TermLabels(
        "../../data/test/samples_with_manual_annotation.json1")
    annotated = corpus.Corpus(
        "../../data/test/samples_news_clean_random.xml",
        annotations=labels)
    self.assertEqual(len(annotated), 0)
def test_subsetting(self):
    """get_documents_by_urls returns one document per requested URL."""
    full_corpus = corpus.Corpus(
        "../../data/test/samples_news_clean_random.xml")
    urls = [
        "https://theloadstar.com/will-digitisation-kill-off-forwarder/",
        "https://theloadstar.com/alert-airlines-unsafe-hoverboards-iata-calls-stiff-penalty-shippers-mis-declare-battery-devices/",
        "https://theloadstar.com/australian-box-terminal-operators-offset-falling-volumes-by-hiking-fees/",
    ]
    subset = full_corpus.get_documents_by_urls(urls)
    self.assertEqual(len(subset), len(urls))
def process_manual_annotation():
    """Fold manual term annotations into the sampled XML corpus.

    Loads the JSONL labels, applies them while reading the LDA-sampled
    corpus, and writes the annotated result next to the source file.
    """
    log.info(
        f"Begin incorporating manual annotation to the XML, result in {RELEVANT_DIR}"
    )
    labels = corpus.TermLabels(
        os.path.join(MANUAL_DIR, "terms", "news.jsonl"))
    annotated = corpus.Corpus(
        os.path.join(PROCESSED_DIR, "lda_sampling_15p.xml"),
        annotations=labels)
    annotated.write_xml_to(
        os.path.join(PROCESSED_DIR, "lda_sampling_15p.annotated.xml"))
def preprocess_corpus(n_sample=10):
    """Combine scraped documents, drop empty ones, and write a random sample.

    Args:
        n_sample: number of documents to draw for the sample. Defaults to 10,
            the previously hard-coded value, so existing callers are
            unaffected.
    """
    log.info(f"Begin combining from {SCRAPED_DIR}")
    combined_corpus = corpus.Corpus(SCRAPED_DIR)
    log.info("Begin filtering empty documents")
    # Filtering happens before sampling so empty docs never enter the sample.
    combined_corpus.filter_empty()
    log.info(f"Begin sampling, n={n_sample}")
    sampled_corpus = combined_corpus.get_sample(n_sample)
    log.info(f"Write sample.xml to {INTERIM_DIR}")
    sampled_corpus.write_xml_to(os.path.join(
        INTERIM_DIR, "sample.xml"))  # use dummy filename for now
def evaluate_terms():
    """Evaluate every term-extraction method against the annotated dev corpus.

    Registers each method's extracted-terms CSV with the evaluator, then
    writes a date-stamped comparison visualization to PLOT_DIR.
    """
    annotated_corpus = corpus.Corpus(os.path.join(RELEVANT_DIR, "dev.xml"))
    log.info("Begin evaluation")
    evaluator = evaluation.Evaluator(annotated_corpus)
    # Method display name -> CSV of extracted terms under EXTRACTED_DIR.
    method_files = {
        "TF-IDF": "tfidf.csv",
        "KPM": "kpm.csv",
        "YAKE": "yake.csv",
        "SingleRank": "singlerank.csv",
        "TopicRank": "topicrank.csv",
        "MultipartiteRank": "mprank.csv",
        "PositionRank": "positionrank.csv",
        "EmbedRank": "embedrank_wiki_unigrams.csv",
    }
    for method_name, csv_name in method_files.items():
        predictions = terms.TermsExtractor.read_terms_from(
            os.path.join(EXTRACTED_DIR, csv_name))
        evaluator.add_prediction(method_name, predictions)
    stamp = date.today().strftime("%Y%m%d")
    evaluator.evaluate_and_visualize(
        os.path.join(PLOT_DIR, f"eval_{stamp}.html"))
def create_core_nlp_documents(core_nlp_folder):
    """Export the annotated corpus as Core NLP XML files into *core_nlp_folder*."""
    log.info(f"Begin preparing Core NLP Documents to {core_nlp_folder}")
    source = corpus.Corpus(
        os.path.join(PROCESSED_DIR, "lda_sampling_15p.annotated.xml"))
    source.write_to_core_nlp_xmls(core_nlp_folder)
def test_news(self):
    """The raw news fixture loads as a corpus of exactly two documents."""
    news_corpus = corpus.Corpus("../../data/test/samples_news_raw.xml")
    self.assertEqual(len(news_corpus), 2)
def test_combine_and_filter(self):
    """Combining the scrape folder yields 102 docs; filtering removes the 3 empties."""
    combined = corpus.Corpus(xml_input="../../data/test/scrape_samples/")
    self.assertEqual(len(combined), 102)
    combined.filter_empty()
    self.assertEqual(len(combined), 99)
def test_sampling(self):
    """get_sample(n) returns a corpus containing exactly n documents."""
    source = corpus.Corpus(
        "../../data/test/samples_news_clean_random.xml")
    sampled = source.get_sample(3)
    self.assertEqual(len(sampled), 3)
def test_wiki(self):
    """The wiki fixture loads as a corpus of exactly three documents."""
    wiki_corpus = corpus.Corpus("../../data/test/samples_wiki.xml")
    self.assertEqual(len(wiki_corpus), 3)
def test_existing_annotation_w_extra_annotation(self):
    """Adding manual labels to an already-annotated corpus keeps both documents."""
    # NOTE(review): the ".json1" extension looks like a typo for ".jsonl"
    # — confirm against the fixture on disk.
    labels = corpus.TermLabels(
        "../../data/test/samples_with_manual_annotation.json1")
    annotated = corpus.Corpus(
        "../../data/test/samples_with_terms.xml", annotations=labels)
    self.assertEqual(len(annotated), 2)
def test_existing_annotation(self):
    """A corpus whose XML already carries term annotations loads both documents."""
    preannotated = corpus.Corpus("../../data/test/samples_with_terms.xml")
    self.assertEqual(len(preannotated), 2)