Example #1
    def setup(cls, conf):
        if "sql" in conf:
            DatabaseMixin.sql = \
            adbapi.ConnectionPool("psycopg2",
                                  host=conf["sql"]['host'],
                                  database=conf["sql"]['database'],
                                  user=conf["sql"]['username'],
                                  password=conf["sql"]['password'],
                                  cp_min=1,
                                  cp_max=10,
                                  cp_reconnect=True,
                                  cp_noisy=conf["debug"])

        if "redis" in conf:
            DatabaseMixin.redis = \
            cyclone.redis.lazyConnectionPool(
                          host=conf["redis"]['host'],
                          dbid=conf["redis"]['dbid'],
                          poolsize=10,
                          reconnect=True)

            if conf["redis"].get("pubsub", False):
                pubsub = cyclone.redis.SubscriberFactory()
                pubsub.maxDelay = 20
                pubsub.continueTrying = True
                pubsub.protocol = PubSubProtocol
                reactor.connectTCP(conf["redis"]['host'], 6379, pubsub)

        DatabaseMixin.sched = TwistedScheduler()
        DatabaseMixin.build()
        DatabaseMixin.sched.start()
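
A minimal sketch of how this setup classmethod might be driven, assuming it is decorated as a classmethod of DatabaseMixin and that conf carries the keys read above (all values here are illustrative):

from twisted.internet import reactor

conf = {
    "debug": False,
    "sql": {"host": "localhost", "database": "app",
            "username": "app", "password": "secret"},
    "redis": {"host": "localhost", "dbid": 0},
}

DatabaseMixin.setup(conf)  # wires the pools and starts the TwistedScheduler
reactor.run()              # scheduled jobs only fire while the reactor runs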
Example #2
def trigger_spider_job(seconds=10, source_type="1", source_key="jd", document_item=YJdItem):
    scheduler = TwistedScheduler()
    trigger = CronTrigger(hour=10, minute=42, second=seconds)
    scheduler.add_job(print_time, trigger, args=[source_type, source_key, document_item],
                      misfire_grace_time=120)

    scheduler.start()
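
Note that TwistedScheduler only fires jobs while the Twisted reactor is running; a caller of this function would still need something like this (sketch):

from twisted.internet import reactor

trigger_spider_job(seconds=10)
reactor.run()  # without this, the CronTrigger above never fires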
Example #3
    def __init__(self):
        self.scrapers = [
            HistorySpider, WpbccSpider, LWVChicago, LibraryEvents,
            GreatLakesReader
        ]
        self.interval_seconds = 60 * config.schedule_interval

        self.scheduler = TwistedScheduler()
        self.scheduler.add_listener(self.schedule_missed, EVENT_JOB_MISSED)
Example #4
    def startYourEngines(self):
        sched = TwistedScheduler()
        sched.start()
        if not sched.get_job('host_status'):
            sched.add_job(self.hoststatus,
                          'interval',
                          seconds=10,
                          id='host_status')
Example #5
    def log(self, logdata, retry=True):
        import uuid

        logdata = self.sanitizeLog(logdata)
        jsondata = json.dumps(logdata, sort_keys=True)
        if logdata['src_host'] != '127.0.0.1' and logdata['dst_host'] != '':
            scheduler = TwistedScheduler()
            scheduler.add_job(self.post2server, args=[self.serverip, jsondata],
                              id=str(uuid.uuid1()))
            scheduler.start()
        elif logdata['src_host'] != '127.0.0.1':
            self.logger.warning(jsondata)
Example #6
    def run(self, args, opts):
        settings = get_project_settings()
        crawler_process = CrawlerProcess(settings)
        scheduler = TwistedScheduler()

        for spider_name in crawler_process.spider_loader.list():
            if spider_name in self.excludes:
                continue
            spider_cls = crawler_process.spider_loader.load(spider_name)
            scheduler.add_job(crawler_process.crawl, 'interval', args=[spider_cls], seconds=86400)
        scheduler.start()
        crawler_process.start(False)
Example #7
def schedule():
    export_scheduler = BackgroundScheduler()  # declare a background scheduler
    export_scheduler.add_job(flush_news, 'interval', minutes=60)  # run flush_news every 60 minutes
    export_scheduler.start()  # start the scheduler

    process = CrawlerProcess(get_project_settings())  # declare the crawler process
    sloader = SpiderLoader(get_project_settings())  # spider loader: collects all available spiders
    crawler_scheduler = TwistedScheduler()  # Twisted scheduler, since Scrapy is built on Twisted
    for spidername in sloader.list():  # schedule a crawl job for each spider found by the loader
        crawler_scheduler.add_job(process.crawl, 'interval', args=[spidername], minutes=30)  # crawl every 30 minutes
    crawler_scheduler.start()  # start the crawler scheduler
    process.start(False)  # keep the process running
Example #8
def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(PlaysportCrawler)
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl,
                      'interval',
                      hours=3,
                      args=[PlaysportCrawler])
    scheduler.add_listener(my_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
    scheduler.start()
    process.start(False)
    _ = _notifier(msg='\n'.join([
        "Scheduler Start",
    ]))
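
`my_listener` and `_notifier` are not part of this snippet; a minimal listener compatible with the event mask subscribed above might look like this (a sketch, the message texts are assumptions):

def my_listener(event):
    # JobExecutionEvent carries the raised exception for EVENT_JOB_ERROR events
    if event.exception:
        print('Job %s failed: %r' % (event.job_id, event.exception))
    else:
        print('Job %s executed' % event.job_id)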
Example #9
def create_scheduler(reactor):
    jobstores = {"default": MemoryJobStore()}
    executors = {"default": TwistedExecutor()}
    job_defaults = {
        "coalesce": False,
        "max_instances": 1,
        "misfire_grace_time": 10
    }
    return TwistedScheduler(
        jobstores=jobstores,
        executors=executors,
        job_defaults=job_defaults,
        reactor=reactor,
    )
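
A sketch of driving this factory, assuming the imports used above plus the global Twisted reactor:

from twisted.internet import reactor

scheduler = create_scheduler(reactor)
scheduler.add_job(lambda: print('tick'), 'interval', seconds=5)
scheduler.start()
reactor.run()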
Example #10
    def __init__(self, scheduler):
        '''
        https://apscheduler.readthedocs.io/en/latest/userguide.html?highlight=add_job

        Parameters
        ----------
        scheduler:
            [str] Scheduler name; pick the one matching your application:
            'BlockingScheduler' (blocking):
                for programs whose only job is running the scheduler
            'BackgroundScheduler' (background):
                for non-blocking use; the scheduler runs independently in the background
            'AsyncIOScheduler' (AsyncIO):
                for applications built on asyncio
            'GeventScheduler' (Gevent):
                for applications built on gevent
            'TornadoScheduler' (Tornado):
                for Tornado applications
            'TwistedScheduler' (Twisted):
                for Twisted applications
            'QtScheduler' (Qt):
                for Qt applications
        '''
        import logging
        logging.basicConfig()
        scheduler = str(scheduler).lower()
        if ('blocking' in scheduler):
            from apscheduler.schedulers.blocking import BlockingScheduler
            self.scheduler = BlockingScheduler()
        elif ('background' in scheduler):
            from apscheduler.schedulers.background import BackgroundScheduler
            self.scheduler = BackgroundScheduler()
        elif ('asyncio' in scheduler):
            from apscheduler.schedulers.asyncio import AsyncIOScheduler
            self.scheduler = AsyncIOScheduler()
        elif ('gevent' in scheduler):
            from apscheduler.schedulers.gevent import GeventScheduler
            self.scheduler = GeventScheduler()
        elif ('tornado' in scheduler):
            from apscheduler.schedulers.tornado import TornadoScheduler
            self.scheduler = TornadoScheduler()
        elif ('twisted' in scheduler):
            from apscheduler.schedulers.twisted import TwistedScheduler
            self.scheduler = TwistedScheduler()
        elif ('qt' in scheduler):
            from apscheduler.schedulers.qt import QtScheduler
            self.scheduler = QtScheduler()
        else:
            # fail fast instead of leaving self.scheduler unset
            raise ValueError('unknown scheduler: %r' % scheduler)
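
Assuming this __init__ belongs to a small wrapper class (called SchedulerFactory here purely for illustration), usage might look like:

factory = SchedulerFactory('twisted')  # hypothetical wrapper class name
factory.scheduler.add_job(lambda: print('tick'), 'interval', seconds=5)
factory.scheduler.start()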
Example #11
def twisted_schedule():
    import os
    from datetime import datetime

    from twisted.internet import reactor
    from apscheduler.schedulers.twisted import TwistedScheduler

    def tick():
        print('Tick! The time is: %s' % datetime.now())

    scheduler = TwistedScheduler()
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        reactor.run()
    except (KeyboardInterrupt, SystemExit):
        pass
Example #12
    def schedule(self):
        scheduler = TwistedScheduler(
            {'apscheduler.timezone': self.settings.get('APP_TIMEZONE')})

        # TODO: use random interval
        switch = {
            'debug':
            lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=3),
            'hourly':
            lambda: scheduler.add_job(
                self.run_crawler, 'interval', seconds=3600),
            'daily':
            lambda: scheduler.add_job(
                self.run_crawler, 'interval', seconds=86400),
            'weekly':
            lambda: scheduler.add_job(
                self.run_crawler, 'interval', seconds=86400 * 7),
            'monthly':
            lambda: scheduler.add_job(
                self.run_crawler, 'interval', seconds=86400 * 30),
        }

        switch[self.settings.get('APP_CRAWL_INTERVAL')]()
        scheduler.start()
Example #13
"""
Demonstrates how to use the Twisted compatible scheduler to schedule a job that executes on 3
second intervals.
"""

from datetime import datetime
import os
from pytz import utc
from twisted.internet import reactor
from apscheduler.schedulers.twisted import TwistedScheduler


def tick():
    print('Tick! The time is: %s' % datetime.now())


if __name__ == '__main__':
    scheduler = TwistedScheduler(timezone=utc)
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        reactor.run()
    except (KeyboardInterrupt, SystemExit):
        pass
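
One detail this pattern leaves out is stopping the scheduler when the reactor shuts down; Twisted's system event triggers can hook that in. A short sketch building on the code above:

# Shut the scheduler down cleanly when the reactor is about to stop.
reactor.addSystemEventTrigger('before', 'shutdown', scheduler.shutdown)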
Example #14
def start(verbose, debug, proxy, min, product, brand, serie, check, delay,
          news, days):
    def check_db():
        from DuTracker.tsdb import influxdb
        try:
            influxdb.ping()
        except Exception:
            log.error('InfluxDB connection error')
            sys.exit(1)
        else:
            log.success('InfluxDB connection successful')

    if check: check_db()

    # https://stackoverflow.com/questions/44228851/scrapy-on-a-schedule
    settings = get_project_settings()

    if verbose: log.setLevel(logging.DEBUG)
    if proxy:
        settings['DOWNLOADER_MIDDLEWARES'].update(
            {'DuTracker.middlewares.RandomProxy': 760})
        settings['PROXY_URL'] = proxy
    if debug: settings['LOG_ENABLED'] = True
    if delay: settings['DOWNLOAD_DELAY'] = delay

    process = CrawlerProcess(settings)
    sched = TwistedScheduler()

    if brand:
        sched.add_job(process.crawl,
                      'interval',
                      args=[BrandSpider],
                      kwargs={
                          'auto': True,
                          'Ids': brand
                      },
                      days=1)
        process.crawl(BrandSpider, auto=True, Ids=brand)
    if serie:
        sched.add_job(process.crawl,
                      'interval',
                      args=[SerieSpider],
                      kwargs={
                          'auto': True,
                          'Ids': serie
                      },
                      days=1)
        process.crawl(SerieSpider, auto=True, Ids=serie)
    if brand or serie:
        sched.add_job(process.crawl,
                      'interval',
                      args=[ProductSpider],
                      kwargs={'fromDB': True},
                      days=1)
        process.crawl(ProductSpider, fromDB=True)
    process.crawl(TrackerSpider, soldNum_min=min, Ids=product)

    sched.add_job(process.crawl,
                  'interval',
                  args=[TrackerSpider],
                  kwargs={
                      'soldNum_min': min,
                      'Ids': product
                  },
                  hours=6)
    if news:
        sched.add_job(process.crawl,
                      'interval',
                      args=[TrackerSpider],
                      kwargs={
                          'newItem': True,
                          'days': days
                      },
                      hours=1)

    sched.add_job(sched.print_jobs, 'interval', hours=6)

    log.info('Starting product price tracking')
    sched.start()
    process.start(False)
Example #15
import requests, pytz
from apscheduler.schedulers.twisted import TwistedScheduler
from twisted.internet import reactor

def send_request():
    requests.post("https://murmuring-reaches-84629.herokuapp.com/schedule.json", data = {
        'project': 'default',
        'spider': 'nba_scores',
        'city': 'los_angeles'
    })

if __name__ == '__main__':
    scheduler = TwistedScheduler(timezone=pytz.utc)
    scheduler.add_job(send_request, 'cron', day_of_week='mon-sun', hour='12', minute='0')
    scheduler.start()
    reactor.run()
Example #16
    def __init__(self):
        self.modulePath = "AutoCourseInfo_Scrapy.CourseInfoExtract.CourseInfoExtract.spiders."
        self.sched = TwistedScheduler()
        self.process = CrawlerRunner(get_project_settings())
Example #17
def trigger_spider_job(spider, seconds=10):
    scheduler = TwistedScheduler()
    start_time = datetime.datetime.now() + datetime.timedelta(seconds=seconds)
    trigger = IntervalTrigger(hours=8, start_date=start_time)
    scheduler.add_job(runner.crawl, trigger, args=[spider])
    scheduler.start()
Example #18

from __future__ import print_function
from logging import basicConfig, ERROR
from apscheduler.schedulers.twisted import TwistedScheduler

basicConfig(level=ERROR)

SCHEDULER = TwistedScheduler()

SCHEDULER.start()

def ms(mills):
    """Converts milliseconds to seconds."""
    return mills * 0.001
Example #19
def trigger_spider_job(spider, seconds=10):
    scheduler = TwistedScheduler()
    # Run once a day, starting shortly after midnight
    trigger = CronTrigger(hour=0, minute=19, second=seconds)
    scheduler.add_job(runner.crawl, trigger, args=[spider])
    scheduler.start()
Example #20
from config.settings import path_apscheduler
from config.settings import processpool_executor
from config.settings import threadpool_executor
from core import toLog
from core.log_manager.log_levels import setup_logger

setup_logger('apscheduler', path_apscheduler, logging.DEBUG)

executors = {
    'default': ThreadPoolExecutor(threadpool_executor),
    'processpool': ProcessPoolExecutor(processpool_executor)
}

job_defaults = {'coalesce': coalesce, 'max_instances': max_instances}

scheduler = TwistedScheduler(timezone=local_tz)

# scheduler.add_jobstore(
#     'mongodb',
#     host=MONGO_HOST_SELF,
#     port=MONGO_PORT_SELF,
#     collection=CORE_ID
# )

scheduler.add_executor(ThreadPoolExecutor(threadpool_executor), 'default')

scheduler.add_executor(ProcessPoolExecutor(processpool_executor),
                       'processpool')

scheduler.start()
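
With both executors registered, a job can be pinned to one of them explicitly. A sketch (heartbeat and its body are illustrative, not from the original module):

def heartbeat():
    toLog('heartbeat')  # assumption: toLog accepts a plain message string

scheduler.add_job(heartbeat, 'interval', minutes=5, executor='processpool')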
Example #21
def run_scheduler(flask_app):
    scheduler = TwistedScheduler()
    JobsAdder(scheduler, flask_app).add_jobs()
    scheduler.start()
Example #22
job_defaults = {'max_instances': 3}

process = CrawlerProcess({
    'USER_AGENT':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',
    'FEED_URI': 'output_file.csv',
    'FEED_FORMAT': 'csv',
})

# process = CrawlerProcess({
#     'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',

# })

sched = TwistedScheduler(job_defaults=job_defaults)
ptime = 0
ttime = 0

sched.add_job(process.crawl,
              'interval',
              args=[ProthomAloTSpider],
              minutes=ttime)
sched.add_job(process.crawl,
              'interval',
              args=[BanglaBdnewsTSpider],
              minutes=ttime)
sched.add_job(process.crawl,
              'interval',
              args=[BanglatribuneTSpider],
              minutes=ttime)
Example #23
from camping.spiders.choansan_spider import ChoansanSpider
from camping.spiders.gangdong_spider import GangdongSpider
from camping.spiders.joongrangsoop_spider import JoongrangsoopSpiderSpider
from camping.spiders.imjingak_spider import ImjingakSpider
from camping.spiders.pyeongtaek_spider import PyeongtaekSpider
from camping.spiders.campunak_spider import CampunakSpider

from datetime import date
from datetime import timedelta
import calendar

# if __name__ == '__main__':

try:
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()
    # scheduler.add_job(process.crawl, 'interval', args=[ChoansanSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[GangdongSpider], seconds=10)
    scheduler.add_job(process.crawl,
                      'interval',
                      args=[JoongrangsoopSpiderSpider],
                      seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[ImjingakSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[PyeongtaekSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[CampunakSpider], seconds=15)
    scheduler.start()
    process.start(False)
except (KeyboardInterrupt, SystemExit):
    print("stop process")

Example #24
from datetime import datetime, timedelta
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from scrapy.utils.reactor import install_reactor
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
from apscheduler.schedulers.twisted import TwistedScheduler
from twisted.internet import reactor
configure_logging()
scheduler = TwistedScheduler(reactor=reactor)
process = CrawlerProcess(get_project_settings())
scheduler.add_job(process.crawl,
                  'interval',
                  args=['mieszkania'],
                  minutes=15,
                  next_run_time=datetime.now())
#scheduler.add_job(lambda :print('activate'), 'interval', minutes=15, next_run_time=datetime.now() + timedelta(seconds=5))
scheduler.start()
reactor.run()
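
If the project settings pin TWISTED_REACTOR, it must name the same reactor installed above; aligning the two explicitly avoids Scrapy's reactor-mismatch error. A sketch:

settings = get_project_settings()
settings['TWISTED_REACTOR'] = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
process = CrawlerProcess(settings)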