# """ # if __name__ == "__main__": # nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD) # nbd_spyder.get_historical_news(start_page=684) # # Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run() # DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run() """ Example-2: 爬取实时新闻数据 """ if __name__ == '__main__': from Kite.database import Database from Kite import config from Leorio.tokenization import Tokenization from Killua.denull import DeNull from Killua.deduplication import Deduplication import threading # 如果没有历史数据从头爬取,如果已爬取历史数据,则从最新的时间开始爬取 # 如历史数据中最近的新闻时间是"2020-12-09 20:37:10",则从该时间开始爬取 nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD) nbd_spyder.get_historical_news() Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run() DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run() nbd_spyder.get_realtime_news()
    obj = Database()
    df = obj.get_data(config.DATABASE_NAME,
                      config.COLLECTION_NAME_CNSTOCK,
                      keys=["Date", "Category"])

    cnstock_spyder = CnStockSpyder(config.DATABASE_NAME,
                                   config.COLLECTION_NAME_CNSTOCK)

    # Backfill the historical data first: e.g. if news has been crawled up to
    # 2020-12-01 but the realtime crawler is started on 2020-12-23, the news
    # from 2020-12-02 through 2020-12-23 is fetched automatically first.
    for url_to_be_crawled, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
        # Look up the timestamp of the most recent record in this category
        latest_date_in_db = max(df[df.Category == type_chn]["Date"].to_list())
        cnstock_spyder.get_historical_news(url_to_be_crawled,
                                           category_chn=type_chn,
                                           start_date=latest_date_in_db)

    Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
    DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()

    # Start one thread per channel for parallel realtime crawling
    thread_list = []
    for url, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
        thread = threading.Thread(target=cnstock_spyder.get_realtime_news,
                                  args=(url, type_chn, 60))
        thread_list.append(thread)
    for thread in thread_list:
        thread.start()
    for thread in thread_list:
        thread.join()
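
# A minimal defensive sketch, not part of the original project: the resume-
# point lookup above assumes every category already has at least one row in
# MongoDB; max() on an empty list raises ValueError. "safe_latest_date" is a
# hypothetical helper, and whether get_historical_news() accepts
# start_date=None to mean "crawl from scratch" is an assumption.
def safe_latest_date(frame, category):
    dates = frame[frame.Category == category]["Date"].to_list()
    return max(dates) if dates else None  # None -> category not crawled yet

# Hypothetical usage inside the backfill loop above:
#     latest_date_in_db = safe_latest_date(df, type_chn)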