def main():
    """Crawl once immediately, then re-crawl every three hours.

    A single ``CrawlerProcess`` is shared by the initial crawl and by the
    recurring ``TwistedScheduler`` job.  ``my_listener`` is registered for
    job-executed and job-error events.  The call blocks in
    ``process.start`` until the Twisted reactor is stopped.
    """
    process = CrawlerProcess(get_project_settings())
    # Queue the first crawl so it runs as soon as the reactor starts.
    process.crawl(PlaysportCrawler)

    scheduler = TwistedScheduler()
    scheduler.add_job(
        process.crawl,
        'interval',
        hours=3,
        args=[PlaysportCrawler],
    )
    scheduler.add_listener(my_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
    scheduler.start()

    # Send the start-up notification *before* entering the reactor:
    # process.start() blocks until the reactor stops, so a notification
    # placed after it would only be delivered at shutdown.
    _notifier(msg='\n'.join([
        "Scheduler Start",
    ]))

    # Keep the reactor alive after each crawl so the scheduled jobs can run.
    process.start(stop_after_crawl=False)
class Scheduler:
    """Run every configured spider on a repeating interval.

    Jobs are registered on a ``TwistedScheduler``; their first runs are
    spread evenly across one interval so the spiders do not all start at
    the same moment.
    """

    def __init__(self):
        """Set up the spider list, the interval and the scheduler."""
        self.scrapers = [
            HistorySpider,
            WpbccSpider,
            LWVChicago,
            LibraryEvents,
            GreatLakesReader,
        ]
        # config.schedule_interval is multiplied by 60 to obtain seconds.
        self.interval_seconds = 60 * config.schedule_interval
        self.scheduler = TwistedScheduler()
        self.scheduler.add_listener(self.schedule_missed, EVENT_JOB_MISSED)

    def add_schedule(self, scraper, seconds_delay):
        """Register an interval job for ``scraper``.

        The job id is the spider class name and the first run is
        ``seconds_delay`` seconds from now.
        """
        first_run = datetime.now() + relativedelta(seconds=seconds_delay)
        self.scheduler.add_job(
            self.run_scraper,
            trigger='interval',
            seconds=self.interval_seconds,
            start_date=first_run,
            args=[scraper],
            id=scraper.__name__,
        )

    def schedule_missed(self, event):
        """Listener for missed jobs: print the job id and the interval."""
        job_id = event.job_id
        interval = self.interval_seconds
        print(f'{job_id} missed. Interval time: {interval}')

    def run_scraper(self, scraper):
        """Start one crawl of ``scraper`` covering today through one month ahead."""
        date_format = '%m-%d-%Y'
        window_start = datetime.now().strftime(date_format)
        window_end = (datetime.now() + relativedelta(months=+1)).strftime(date_format)
        print(f'{datetime.now()} starting {scraper.__name__}')

        crawler_runner = CrawlerRunner(get_project_settings())
        crawler_runner.crawl(scraper, window_start, window_end)
        crawler_runner.join()

    def run_schedule(self):
        """Register all spiders, start the scheduler and run the reactor."""
        configure_logging()
        # Offset between consecutive spiders' first runs.
        stagger = self.interval_seconds / len(self.scrapers)
        self.last_scheduled = datetime.now()
        for position, spider_cls in enumerate(self.scrapers):
            self.add_schedule(spider_cls, stagger * position)
        self.scheduler.start()
        reactor.run()
event.job_id ), 'jobs') else: toLog('Event {} happenend'.format( event_code_translator(event.code)), 'jobs' ) def event_code_translator(code): event_dict = { 1: 'EVENT_SCHEDULER_START', 2: 'EVENT_SCHEDULER_SHUTDOWN', 4: 'EVENT_EXECUTOR_ADDED', 8: 'EVENT_EXECUTOR_REMOVED', 16: 'EVENT_JOBSTORE_ADDED', 32: 'EVENT_JOBSTORE_REMOVED', 64: 'EVENT_ALL_JOBS_REMOVED', 128: 'EVENT_JOB_ADDED', 256: 'EVENT_JOB_REMOVED', 512: 'EVENT_JOB_MODIFIED', 1024: 'EVENT_JOB_EXECUTED', 2048: 'EVENT_JOB_ERROR', 4096: 'EVENT_JOB_MISSED' } return event_dict.get(code, None) scheduler.add_listener(job_logger, events.EVENT_ALL)
toLog( 'Event {} for job {} happenend'.format( event_code_translator(event.code), event.job_id), 'jobs') else: toLog('Event {} happenend'.format(event_code_translator(event.code)), 'jobs') def event_code_translator(code): event_dict = { 1: 'EVENT_SCHEDULER_START', 2: 'EVENT_SCHEDULER_SHUTDOWN', 4: 'EVENT_EXECUTOR_ADDED', 8: 'EVENT_EXECUTOR_REMOVED', 16: 'EVENT_JOBSTORE_ADDED', 32: 'EVENT_JOBSTORE_REMOVED', 64: 'EVENT_ALL_JOBS_REMOVED', 128: 'EVENT_JOB_ADDED', 256: 'EVENT_JOB_REMOVED', 512: 'EVENT_JOB_MODIFIED', 1024: 'EVENT_JOB_EXECUTED', 2048: 'EVENT_JOB_ERROR', 4096: 'EVENT_JOB_MISSED' } return event_dict.get(code, None) scheduler.add_listener(job_logger, events.EVENT_ALL)
# else:
#     # job not scheduled, add it and run now
#     scheduler.add_job(FilterAndInsertData, 'cron', args=[PttMoviesSpider])

if __name__ == '__main__':
    # One CrawlerProcess is shared by all jobs; each job simply calls
    # process.crawl with the spider class it should run.
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()

    # Yahoo movie spider: runs on the module-level ``trigger``.
    scheduler.add_job(
        process.crawl,
        trigger=trigger,
        name='yahoo',
        args=[YahoomovieSpider],
    )
    # scheduler.get_job(job_id ="my_job_id").modify(next_run_time=datetime.datetime.now())

    # PTT movies spider: cron job at 23:59.
    scheduler.add_job(
        process.crawl,
        trigger='cron',
        name='ptt',
        hour='23',
        minute='59',
        args=[PttMoviesSpider],
    )
    # scheduler.add_job(FilterAndInsertData, 'cron', day='last sun', name='insertData')

    # Report both successful and failed job runs to execution_listener.
    scheduler.add_listener(
        execution_listener,
        EVENT_JOB_EXECUTED | EVENT_JOB_ERROR,
    )
    scheduler.start()
    process.start(False)  # Do not stop reactor after spider closes

    # try:
    #     while True:
    #         time.sleep(1)
    # except (KeyboardInterrupt, SystemExit):
    #     scheduler.shutdown()
import logging

# Stress script: register ``n`` interval jobs, let the scheduler run for a
# few seconds, then print how many job executions were recorded in total.
# Scheduler error/missed events are written to list.log (rewritten each run).
logging.basicConfig(
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filename='list.log',
    filemode='w',
)

n = 6000      # number of jobs to register
ll = [0] * n  # ll[i] counts the executions of job i


def func(i):
    """Job body: record one execution of job ``i``."""
    ll[i] += 1


def err_lis(ev):
    """Log a scheduler event at ERROR level via the root logger."""
    logger = logging.getLogger("")
    logger.error(str(ev))


#scheduler = BackgroundScheduler()
scheduler = TwistedScheduler()
for i in range(n):
    # Spread the first run times over the next 0-9 seconds.
    start = datetime.datetime.now() + datetime.timedelta(seconds=i % 10)
    scheduler.add_job(func, 'interval', args=(i,), start_date=start, seconds=10)

# Only job errors and missed runs are reported to err_lis.
scheduler.add_listener(
    err_lis,
    apscheduler.events.EVENT_JOB_ERROR | apscheduler.events.EVENT_JOB_MISSED,
)
scheduler.start()
# NOTE(review): TwistedScheduler presumably needs a running Twisted reactor
# to fire jobs, and time.sleep() does not run one -- confirm that this
# still measures what is intended after the switch from BackgroundScheduler.
time.sleep(5)
scheduler.shutdown()

# Total executions across all jobs.  print() with a single argument behaves
# the same on Python 2 and Python 3, unlike the old ``print s`` statement.
s = sum(ll)
print(s)