from datetime import date from spacy.lang.en.stop_words import STOP_WORDS from kargo import logger, corpus, scraping, terms, evaluation from pke.utils import compute_document_frequency, load_document_frequency_file from pke.unsupervised import TfIdf, KPMiner, YAKE from pke.unsupervised import SingleRank, TopicRank, PositionRank, MultipartiteRank SCRAPED_DIR = "data/scraped" INTERIM_DIR = "data/interim" PROCESSED_DIR = "data/processed" MANUAL_DIR = "data/annotations" RESULTS_DIR = "results" RELEVANT_DIR = os.path.join(PROCESSED_DIR, "news", "relevant") CORE_NLP_DIR = os.path.join(RELEVANT_DIR, "dev") EXTRACTED_DIR = os.path.join(RESULTS_DIR, "extracted_terms", "dev") PLOT_DIR = os.path.join(RESULTS_DIR, "plots") log = logger.get_logger(__name__, logger.INFO) def scraping_news_sites(): log.info("Begin scraping processes") air_cargo_news_spider = scraping.AirCargoNewsSpider( seed_url="https://www.aircargonews.net/news-by-date/page/", output_folder=os.path.join(SCRAPED_DIR, "aircargonews.net")) log.info("Begin scraping aircargonews.net") air_cargo_news_spider.start(1, 2) air_cargo_week_spider = scraping.AirCargoWeekSpider( seed_url="https://www.aircargoweek.com/category/news-menu/page/", output_folder=os.path.join(SCRAPED_DIR, "aircargoweek.com")) log.info("Begin scraping aircargoweek.com") air_cargo_week_spider.start(1, 2) air_cargo_world_spider = scraping.AirCargoWorldSpider(
import unittest

import scraping
from kargo import logger

# Raise the scraping module's log level so test runs are not flooded with
# per-request INFO output.
scraping.log = logger.get_logger(__name__, logger.WARNING)


class TestScraping(unittest.TestCase):
    """Smoke tests for the site spiders.

    NOTE(review): each test scrapes a live website, so these require network
    access and will fail offline or if a site changes its layout. Every test
    only asserts that at least one article was returned.
    """

    def test_aircargonews(self):
        # Scrape page 1 of the aircargonews.net date-ordered index.
        air_cargo_news_spider = scraping.AirCargoNewsSpider(
            seed_url="https://www.aircargonews.net/news-by-date/page/",
            output_folder="../../data/interim/")
        result = air_cargo_news_spider.scrape(1)
        self.assertGreater(len(result), 0)

    def test_aircargoweek(self):
        # Scrape page 1 of the aircargoweek.com news-menu category.
        air_cargo_week_spider = scraping.AirCargoWeekSpider(
            seed_url="https://www.aircargoweek.com/category/news-menu/page/",
            output_folder="../../data/interim/")
        result = air_cargo_week_spider.scrape(1)
        self.assertGreater(len(result), 0)

    def test_aircargoworld(self):
        # Page 10 is used instead of page 1 deliberately (see comment below).
        air_cargo_world_spider = scraping.AirCargoWorldSpider(
            seed_url="https://aircargoworld.com/category/news/page/",
            output_folder="../../data/interim/")
        result = air_cargo_world_spider.scrape(
            10)  # articles in page 1 are behind paywall
        self.assertGreater(len(result), 0)

    def test_stattimes(self):
        stat_times_spider = scraping.StatTimesSpider(