import functools
import time
from datetime import datetime

# Project-level names used below (logs, api, Session, database, website,
# by_keyword, Similar, SpecialHiBor, scheduler, hour, second,
# schedule_special, schedule_special_search_api) are assumed to be imported
# or defined elsewhere in this module.


def deduplicate(inner):
    # Reconstructed decorator: `func` appeared without its enclosing `def`,
    # so the outer name `deduplicate` is an assumption. It wraps a crawler
    # function and drops news items whose titles are duplicates.
    @functools.wraps(inner)
    def func(*args, **kwargs):
        ret = inner(*args, **kwargs)
        # Assumption from context: Similar.check(title) returns the title
        # when it is not a duplicate, so `different` holds the titles to keep.
        different = [Similar.check(x.title) for x in ret]
        logs.info(
            f"The following news items are duplicates and will not be "
            f"written to the database for now: "
            f"{[x for x in ret if x.title not in different]}"
        )
        ret = [x for x in ret if x.title in different]
        return ret

    return func
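# A minimal usage sketch, assuming the decorator above is applied to crawler
# functions that return objects with a `.title` attribute (the crawler name
# and api call here are hypothetical; real call sites live elsewhere):
#
#     @deduplicate
#     def some_crawler(**section):
#         return api.some_site(**section)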
def schedule_special_hibor():
    session = Session(**database)
    news = api.special_hibor()
    for n in news:
        session.insert_one(n)
    logs.info("HiBor news imported into the database")
    print("HiBor news imported into the database")
    session.close()
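# `Session` is a project class; given Session(**database) plus insert_one()
# and close(), a minimal MongoDB-backed sketch could look like the block
# below (an assumption only; the field names are illustrative and the real
# class lives elsewhere in this repo):
#
#     from pymongo import MongoClient
#
#     class Session:
#         def __init__(self, host, port, db, collection, **_):
#             self._client = MongoClient(host, port)
#             self._coll = self._client[db][collection]
#
#         def insert_one(self, doc):
#             self._coll.insert_one(dict(doc))
#
#         def close(self):
#             self._client.close()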
def special_hibor():
    ret = []
    keywords = by_keyword["hibor"]
    for k in keywords:
        spider = SpecialHiBor(k, 1)
        news = spider.collect()
        if news:
            ret.extend(news)
    logs.info("HiBor news crawl finished")
    return ret
def start_schedule():
    logs.info(f"Starting crawler tasks; the task interval is every {hour} hour(s)")
    print(f"Starting crawler tasks; the task interval is every {hour} hour(s)")
    for name in website.keys():
        # Run each site once immediately, then re-run it on an interval.
        schedule(name)
        scheduler.add_job(schedule, 'interval', hours=hour, seconds=second,
                          args=(name, ))
    scheduler.add_job(schedule_special, 'interval', hours=hour, seconds=second)
    scheduler.add_job(schedule_special_search_api, 'interval', hours=hour,
                      seconds=second)
    # HiBor runs on its own fixed 6-hour cycle.
    scheduler.add_job(schedule_special_hibor, 'interval', hours=6,
                      seconds=second)
    try:
        schedule_special_hibor()
    except Exception as e:
        print(e)
    schedule_special()
    scheduler.start()
    # Keep the main thread alive so the background scheduler keeps running.
    while True:
        time.sleep(1 * 60 * 60)
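# The scheduler and its settings are assumed to be defined at module level,
# roughly like the sketch below (`hour` and `second` are the names used in
# start_schedule; BackgroundScheduler is the standard APScheduler choice
# given the keep-alive loop above, but the real config may differ):
#
#     from apscheduler.schedulers.background import BackgroundScheduler
#
#     scheduler = BackgroundScheduler()
#     hour = 1      # interval between crawl runs, in hours
#     second = 0    # extra offset in seconds for the interval trigger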
def schedule(website_name):
    session = Session(**database)
    web = website[website_name]
    for k, v in web.items():
        for section in v:
            logs.info(
                f"{datetime.now().strftime('%Y-%m-%d %H:%M')} "
                f"running task <{website_name} {section['section']}>"
            )
            # Crawl the first two pages of each section.
            for i in range(1, 3):
                section["page"] = i
                try:
                    news = getattr(api, website_name)(**section)
                except Exception as e:
                    logs.error(e)
                    break
                for n in news:
                    n = api.revise(n)
                    if n:
                        session.insert_one(n)
    session.close()
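if __name__ == "__main__":
    # Entry point (an assumption; the original module may be launched some
    # other way, e.g. imported by a service wrapper).
    start_schedule()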