def setup(cls, conf):
    if "sql" in conf:
        DatabaseMixin.sql = \
            adbapi.ConnectionPool("psycopg2",
                                  host=conf["sql"]['host'],
                                  database=conf["sql"]['database'],
                                  user=conf["sql"]['username'],
                                  password=conf["sql"]['password'],
                                  cp_min=1, cp_max=10,
                                  cp_reconnect=True,
                                  cp_noisy=conf["debug"])

    if "redis" in conf:
        DatabaseMixin.redis = \
            cyclone.redis.lazyConnectionPool(
                host=conf["redis"]['host'],
                dbid=conf["redis"]['dbid'],
                poolsize=10, reconnect=True)

        if conf["redis"].get("pubsub", False):
            pubsub = cyclone.redis.SubscriberFactory()
            pubsub.maxDelay = 20
            pubsub.continueTrying = True
            pubsub.protocol = PubSubProtocol
            reactor.connectTCP(conf["redis"]['host'], 6379, pubsub)

    DatabaseMixin.sched = TwistedScheduler()
    DatabaseMixin.build()
    DatabaseMixin.sched.start()
def trigger_spider_job(seconds=10, source_type="1", source_key="jd", document_item=YJdItem):
    scheduler = TwistedScheduler()
    trigger = CronTrigger(hour=10, minute=42, second=seconds)
    scheduler.add_job(print_time, trigger,
                      args=[source_type, source_key, document_item],
                      misfire_grace_time=120)
    scheduler.start()
def __init__(self):
    self.scrapers = [
        HistorySpider,
        WpbccSpider,
        LWVChicago,
        LibraryEvents,
        GreatLakesReader
    ]
    self.interval_seconds = 60 * config.schedule_interval
    self.scheduler = TwistedScheduler()
    self.scheduler.add_listener(self.schedule_missed, EVENT_JOB_MISSED)
def startYourEngines(self):
    sched = TwistedScheduler()
    sched.start()
    # Register the polling job only once; get_job() returns None for an unknown id.
    if not sched.get_job('host_status'):
        sched.add_job(self.hoststatus, 'interval', seconds=10, id='host_status')
def log(self, logdata, retry=True):
    logdata = self.sanitizeLog(logdata)
    jsondata = json.dumps(logdata, sort_keys=True)
    if logdata['src_host'] != '127.0.0.1' and logdata['dst_host'] != '':
        import uuid
        scheduler = TwistedScheduler()
        # With no trigger, the job runs once, as soon as the scheduler starts.
        scheduler.add_job(self.post2server, args=[self.serverip, jsondata],
                          id=str(uuid.uuid1()))
        scheduler.start()
    elif logdata['src_host'] != '127.0.0.1':
        self.logger.warning(jsondata)
def run(self, args, opts):
    settings = get_project_settings()
    crawler_process = CrawlerProcess(settings)
    scheduler = TwistedScheduler()
    for spider_name in crawler_process.spider_loader.list():
        if spider_name in self.excludes:
            continue
        spider_cls = crawler_process.spider_loader.load(spider_name)
        scheduler.add_job(crawler_process.crawl, 'interval',
                          args=[spider_cls], seconds=86400)
    scheduler.start()
    crawler_process.start(False)
def schedule():
    export_scheduler = BackgroundScheduler()  # create a background scheduler
    export_scheduler.add_job(flush_news, 'interval', minutes=60)  # run flush_news every 60 minutes
    export_scheduler.start()  # start the scheduler
    process = CrawlerProcess(get_project_settings())  # create the crawler process
    sloader = SpiderLoader(get_project_settings())  # spider loader: collects all project spiders
    crawler_scheduler = TwistedScheduler()  # use a Twisted scheduler, since Scrapy is built on Twisted
    for spidername in sloader.list():  # schedule a job for each spider found by the loader
        crawler_scheduler.add_job(process.crawl, 'interval', args=[spidername], minutes=30)  # launch the crawl every 30 minutes
    crawler_scheduler.start()  # start the crawler scheduler
    process.start(False)  # keep the process running
def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(PlaysportCrawler)
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl, 'interval', hours=3, args=[PlaysportCrawler])
    scheduler.add_listener(my_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
    scheduler.start()
    process.start(False)
    _ = _notifier(msg='\n'.join([
        "Scheduler Start",
    ]))
def create_scheduler(reactor):
    jobstores = {"default": MemoryJobStore()}
    executors = {"default": TwistedExecutor()}
    job_defaults = {
        "coalesce": False,
        "max_instances": 1,
        "misfire_grace_time": 10
    }
    return TwistedScheduler(
        jobstores=jobstores,
        executors=executors,
        job_defaults=job_defaults,
        reactor=reactor,
    )
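# A minimal usage sketch for create_scheduler() above; the heartbeat job and
# the 5-second interval are illustrative assumptions, not from the source:
from apscheduler.executors.twisted import TwistedExecutor
from apscheduler.jobstores.memory import MemoryJobStore
from apscheduler.schedulers.twisted import TwistedScheduler
from twisted.internet import reactor


def heartbeat():
    print('scheduler is alive')


scheduler = create_scheduler(reactor)
scheduler.add_job(heartbeat, 'interval', seconds=5)
scheduler.start()
reactor.run()  # jobs only fire while the Twisted reactor is running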
def __init__(self, scheduler):
    '''
    https://apscheduler.readthedocs.io/en/latest/userguide.html?highlight=add_job

    Parameters
    ----------
    scheduler: [str]
        Scheduler type; pick the one that matches your application:
        'BlockingScheduler'   blocking scheduler: for programs that run nothing but the scheduler
        'BackgroundScheduler' background scheduler: for non-blocking use; runs independently in the background
        'AsyncIOScheduler'    AsyncIO scheduler: for applications using asyncio
        'GeventScheduler'     Gevent scheduler: for applications using gevent
        'TornadoScheduler'    Tornado scheduler: for building Tornado applications
        'TwistedScheduler'    Twisted scheduler: for building Twisted applications
        'QtScheduler'         Qt scheduler: for building Qt applications
    '''
    import logging
    logging.basicConfig()
    scheduler = str(scheduler).lower()
    if 'blocking' in scheduler:
        from apscheduler.schedulers.blocking import BlockingScheduler
        self.scheduler = BlockingScheduler()
    elif 'background' in scheduler:
        from apscheduler.schedulers.background import BackgroundScheduler
        self.scheduler = BackgroundScheduler()
    elif 'asyncio' in scheduler:
        from apscheduler.schedulers.asyncio import AsyncIOScheduler
        self.scheduler = AsyncIOScheduler()
    elif 'gevent' in scheduler:
        from apscheduler.schedulers.gevent import GeventScheduler
        self.scheduler = GeventScheduler()
    elif 'tornado' in scheduler:
        from apscheduler.schedulers.tornado import TornadoScheduler
        self.scheduler = TornadoScheduler()
    elif 'twisted' in scheduler:
        from apscheduler.schedulers.twisted import TwistedScheduler
        self.scheduler = TwistedScheduler()
    elif 'qt' in scheduler:
        from apscheduler.schedulers.qt import QtScheduler
        self.scheduler = QtScheduler()
    else:
        # No match: fail loudly instead of leaving self.scheduler unset.
        raise ValueError('unknown scheduler type: %s' % scheduler)
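# A hedged usage sketch for the wrapper above. The enclosing class name is not
# shown in the source, so SchedulerFactory below is a hypothetical stand-in:
factory = SchedulerFactory('twisted')  # hypothetical name for the class above
factory.scheduler.add_job(lambda: print('tick'), 'interval', seconds=10)
factory.scheduler.start()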
def twisted_schedule():
    from twisted.internet import reactor
    from apscheduler.schedulers.twisted import TwistedScheduler

    def tick():
        print('Tick! The time is: %s' % datetime.now())

    scheduler = TwistedScheduler()
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        reactor.run()
    except (KeyboardInterrupt, SystemExit):
        pass
def schedule(self):
    scheduler = TwistedScheduler(
        {'apscheduler.timezone': self.settings.get('APP_TIMEZONE')})
    # TODO: use random interval
    switch = {
        'debug': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=3),
        'hourly': lambda: scheduler.add_job(
            self.run_crawler, 'interval', seconds=3600),
        'daily': lambda: scheduler.add_job(
            self.run_crawler, 'interval', seconds=86400),
        'weekly': lambda: scheduler.add_job(
            self.run_crawler, 'interval', seconds=86400 * 7),
        'monthly': lambda: scheduler.add_job(
            self.run_crawler, 'interval', seconds=86400 * 30),
    }
    switch[self.settings.get('APP_CRAWL_INTERVAL')]()
    scheduler.start()
""" Demonstrates how to use the Twisted compatible scheduler to schedule a job that executes on 3 second intervals. """ from datetime import datetime import os from pytz import utc from twisted.internet import reactor from apscheduler.schedulers.twisted import TwistedScheduler def tick(): print('Tick! The time is: %s' % datetime.now()) if __name__ == '__main__': scheduler = TwistedScheduler(timezone=utc) scheduler.add_job(tick, 'interval', seconds=3) scheduler.start() print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C')) # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed. try: reactor.run() except (KeyboardInterrupt, SystemExit): pass
def start(verbose, debug, proxy, min, product, brand, serie, check, delay, news, days):
    def check_db():
        from DuTracker.tsdb import influxdb
        try:
            influxdb.ping()
        except Exception as e:
            log.error('InfluxDB connection failed')
            sys.exit(1)
        else:
            log.success('InfluxDB connection OK')

    if check:
        check_db()

    # https://stackoverflow.com/questions/44228851/scrapy-on-a-schedule
    settings = get_project_settings()
    if verbose:
        log.setLevel(logging.DEBUG)
    if proxy:
        settings['DOWNLOADER_MIDDLEWARES'].update(
            {'DuTracker.middlewares.RandomProxy': 760})
        settings['PROXY_URL'] = proxy
    if debug:
        settings['LOG_ENABLED'] = True
    if delay:
        settings['DOWNLOAD_DELAY'] = delay

    process = CrawlerProcess(settings)
    sched = TwistedScheduler()

    if brand:
        sched.add_job(process.crawl, 'interval', args=[BrandSpider],
                      kwargs={'auto': True, 'Ids': brand}, days=1)
        process.crawl(BrandSpider, auto=True, Ids=brand)
    if serie:
        sched.add_job(process.crawl, 'interval', args=[SerieSpider],
                      kwargs={'auto': True, 'Ids': serie}, days=1)
        process.crawl(SerieSpider, auto=True, Ids=serie)
    if brand or serie:
        sched.add_job(process.crawl, 'interval', args=[ProductSpider],
                      kwargs={'fromDB': True}, days=1)
        process.crawl(ProductSpider, fromDB=True)

    process.crawl(TrackerSpider, soldNum_min=min, Ids=product)
    sched.add_job(process.crawl, 'interval', args=[TrackerSpider],
                  kwargs={'soldNum_min': min, 'Ids': product}, hours=6)
    if news:
        sched.add_job(process.crawl, 'interval', args=[TrackerSpider],
                      kwargs={'newItem': True, 'days': days}, hours=1)

    sched.add_job(sched.print_jobs, 'interval', hours=6)
    log.info('Starting product price tracking')
    sched.start()
    process.start(False)
import requests
import pytz
from apscheduler.schedulers.twisted import TwistedScheduler
from twisted.internet import reactor


def send_request():
    requests.post("https://murmuring-reaches-84629.herokuapp.com/schedule.json", data={
        'project': 'default',
        'spider': 'nba_scores',
        'city': 'los_angeles'
    })


if __name__ == '__main__':
    scheduler = TwistedScheduler(timezone=pytz.utc)
    scheduler.add_job(send_request, 'cron', day_of_week='mon-sun', hour='12', minute='0')
    scheduler.start()
    reactor.run()
def __init__(self):
    self.modulePath = "AutoCourseInfo_Scrapy.CourseInfoExtract.CourseInfoExtract.spiders."
    self.sched = TwistedScheduler()
    self.process = CrawlerRunner(get_project_settings())
def trigger_spider_job(spider, seconds=10):
    scheduler = TwistedScheduler()
    start_time = datetime.datetime.now() + datetime.timedelta(seconds=seconds)
    trigger = IntervalTrigger(hours=8, start_date=start_time)
    scheduler.add_job(runner.crawl, trigger, args=[spider])
    scheduler.start()
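# trigger_spider_job() above relies on a module-level CrawlerRunner named
# `runner`; a minimal sketch of that assumed surrounding setup:
import datetime

from apscheduler.schedulers.twisted import TwistedScheduler
from apscheduler.triggers.interval import IntervalTrigger
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

runner = CrawlerRunner(get_project_settings())

# trigger_spider_job(SomeSpider)  # first run after `seconds`, then every 8 hours
# reactor.run()                   # jobs only fire while the Twisted reactor runs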
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from __future__ import print_function

from logging import basicConfig, ERROR

from apscheduler.schedulers.twisted import TwistedScheduler

basicConfig(level=ERROR)

SCHEDULER = TwistedScheduler()
SCHEDULER.start()


def ms(mills):
    """ Converts milliseconds to seconds """
    return mills * 0.001
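# A brief usage sketch for ms() with the module-level SCHEDULER above; the
# job function and the 500 ms interval are illustrative assumptions:
def poll():
    print('polling...')


SCHEDULER.add_job(poll, 'interval', seconds=ms(500))  # run every 0.5 seconds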
def trigger_spider_job(spider, seconds=10):
    scheduler = TwistedScheduler()
    # Start running at midnight every day
    trigger = CronTrigger(hour=0, minute=19, second=seconds)
    scheduler.add_job(runner.crawl, trigger, args=[spider])
    scheduler.start()
from config.settings import path_apscheduler
from config.settings import processpool_executor
from config.settings import threadpool_executor
from core import toLog
from core.log_manager.log_levels import setup_logger

setup_logger('apscheduler', path_apscheduler, logging.DEBUG)

executors = {
    'default': ThreadPoolExecutor(threadpool_executor),
    'processpool': ProcessPoolExecutor(processpool_executor)
}

job_defaults = {'coalesce': coalesce, 'max_instances': max_instances}

scheduler = TwistedScheduler(timezone=local_tz)

# scheduler.add_jobstore(
#     'mongodb',
#     host=MONGO_HOST_SELF,
#     port=MONGO_PORT_SELF,
#     collection=CORE_ID
# )

scheduler.add_executor(ThreadPoolExecutor(threadpool_executor), 'default')
scheduler.add_executor(ProcessPoolExecutor(processpool_executor), 'processpool')

scheduler.start()
def run_scheduler(flask_app):
    scheduler = TwistedScheduler()
    JobsAdder(scheduler, flask_app).add_jobs()
    scheduler.start()
job_defaults = {'max_instances': 3}

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',
    'FEED_URI': 'output_file.csv',
    'FEED_FORMAT': 'csv',
})
# process = CrawlerProcess({
#     'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',
# })

sched = TwistedScheduler(job_defaults=job_defaults)

ptime = 0
ttime = 0

sched.add_job(process.crawl, 'interval', args=[ProthomAloTSpider], minutes=ttime)
sched.add_job(process.crawl, 'interval', args=[BanglaBdnewsTSpider], minutes=ttime)
sched.add_job(process.crawl, 'interval', args=[BanglatribuneTSpider], minutes=ttime)
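# The snippet above stops before anything is started. A hedged completion,
# following the start-up pattern the neighbouring snippets use (assumed, not
# from the source). Note that with minutes=ttime == 0, APScheduler 3.x's
# interval trigger falls back to a one-second interval.
sched.start()
process.start(False)  # keep the reactor alive so the interval jobs fire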
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler

from camping.spiders.choansan_spider import ChoansanSpider
from camping.spiders.gangdong_spider import GangdongSpider
from camping.spiders.joongrangsoop_spider import JoongrangsoopSpiderSpider
from camping.spiders.imjingak_spider import ImjingakSpider
from camping.spiders.pyeongtaek_spider import PyeongtaekSpider
from camping.spiders.campunak_spider import CampunakSpider

from datetime import date
from datetime import timedelta
import calendar

# if __name__ == '__main__':
try:
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()
    # scheduler.add_job(process.crawl, 'interval', args=[ChoansanSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[GangdongSpider], seconds=10)
    scheduler.add_job(process.crawl, 'interval', args=[JoongrangsoopSpiderSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[ImjingakSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[PyeongtaekSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[CampunakSpider], seconds=15)
    scheduler.start()
    process.start(False)
except (KeyboardInterrupt, SystemExit):
    print("stop process")
from datetime import datetime, timedelta

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from scrapy.utils.reactor import install_reactor

# Install the asyncio reactor before twisted.internet.reactor is imported below.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

from apscheduler.schedulers.twisted import TwistedScheduler
from twisted.internet import reactor

configure_logging()

scheduler = TwistedScheduler(reactor=reactor)
process = CrawlerProcess(get_project_settings())
scheduler.add_job(process.crawl, 'interval', args=['mieszkania'],
                  minutes=15, next_run_time=datetime.now())
# scheduler.add_job(lambda: print('activate'), 'interval', minutes=15,
#                   next_run_time=datetime.now() + timedelta(seconds=5))
scheduler.start()
reactor.run()