Example #1
import importlib

from apscheduler.schedulers.twisted import TwistedScheduler
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor


class scheduler():
    def __init__(self):
        self.modulePath = "AutoCourseInfo_Scrapy.CourseInfoExtract.CourseInfoExtract.spiders."
        self.sched = TwistedScheduler()
        self.process = CrawlerRunner(get_project_settings())

    def addJob(self, spiderModulePath, spiderClass, scheduleTime):
        # Create Spider Object dynamically by importing module.
        try:
            module = self.modulePath + spiderModulePath
            module = importlib.import_module(module)
            class_ = getattr(module, spiderClass)
            instance = class_()
            self.sched.add_job(self.process.crawl,
                               'date',
                               args=[instance],
                               run_date=scheduleTime)

        except Exception as error:
            print(error)

    def runJob(self):
        try:
            self.sched.start()
            d = self.process.join()
            d.addBoth(lambda _: reactor.stop())
            reactor.run()

        except Exception as error:
            print(error)
Example #2
    def __init__(self):
        self.scrapers = [
            HistorySpider, WpbccSpider, LWVChicago, LibraryEvents,
            GreatLakesReader
        ]
        self.interval_seconds = 60 * config.schedule_interval

        self.scheduler = TwistedScheduler()
        self.scheduler.add_listener(self.schedule_missed, EVENT_JOB_MISSED)
Example #3
    def setup(cls, conf):
        if "sql" in conf:
            DatabaseMixin.sql = \
            adbapi.ConnectionPool("psycopg2",
                                  host=conf["sql"]['host'],
                                  database=conf["sql"]['database'],
                                  user=conf["sql"]['username'],
                                  password=conf["sql"]['password'],
                                  cp_min=1,
                                  cp_max=10,
                                  cp_reconnect=True,
                                  cp_noisy=conf["debug"])

        if "redis" in conf:
            DatabaseMixin.redis = \
            cyclone.redis.lazyConnectionPool(
                          host=conf["redis"]['host'],
                          dbid=conf["redis"]['dbid'],
                          poolsize=10,
                          reconnect=True)

            if conf["redis"].get("pubsub", False):
                pubsub = cyclone.redis.SubscriberFactory()
                pubsub.maxDelay = 20
                pubsub.continueTrying = True
                pubsub.protocol = PubSubProtocol
                reactor.connectTCP(conf["redis"]['host'], 6379, pubsub)

        DatabaseMixin.sched = TwistedScheduler()
        DatabaseMixin.build()
        DatabaseMixin.sched.start()
Example #4
import os
from datetime import datetime


def twisted_schedule():
    from twisted.internet import reactor
    from apscheduler.schedulers.twisted import TwistedScheduler

    def tick():
        print('Tick! The time is: %s' % datetime.now())

    scheduler = TwistedScheduler()
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        reactor.run()
    except (KeyboardInterrupt, SystemExit):
        pass
Example #5
    def __init__(self, root, local_items):
        resource.Resource.__init__(self)
        self.root = root
        self.local_items = local_items
        self.spider_status_dic = {}

        logging.basicConfig()
        self.scheduler = TwistedScheduler()
        self.scheduler.start()
Example #6
    def startYourEngines(self):
        sched = TwistedScheduler()
        sched.start()
        if not sched.get_job('host_status'):
            sched.add_job(self.hoststatus,
                          'interval',
                          seconds=10,
                          id='host_status')
Example #7
def trigger_spider_job(seconds=10, source_type="1", source_key="jd", document_item=YJdItem):
    scheduler = TwistedScheduler()
    trigger = CronTrigger(hour=10, minute=42, second=seconds)
    scheduler.add_job(print_time, trigger,
                      args=[source_type, source_key, document_item],
                      misfire_grace_time=120)

    scheduler.start()
Example #8
def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(PlaysportCrawler)
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl,
                      'interval',
                      hours=3,
                      args=[PlaysportCrawler])
    scheduler.add_listener(my_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
    scheduler.start()
    process.start(False)
    _ = _notifier(msg='\n'.join([
        "Scheduler Start",
    ]))
Example #9
def create_scheduler(reactor):
    jobstores = {"default": MemoryJobStore()}
    executors = {"default": TwistedExecutor()}
    job_defaults = {
        "coalesce": False,
        "max_instances": 1,
        "misfire_grace_time": 10
    }
    return TwistedScheduler(
        jobstores=jobstores,
        executors=executors,
        job_defaults=job_defaults,
        reactor=reactor,
    )
Example #10
    def log(self, logdata, retry=True):
        logdata = self.sanitizeLog(logdata)
        jsondata = json.dumps(logdata, sort_keys=True)
        if logdata['src_host'] != '127.0.0.1' and logdata['dst_host'] != '':
            import uuid
            scheduler = TwistedScheduler()
            scheduler.add_job(self.post2server, args=[self.serverip, jsondata], id=str(uuid.uuid1()))
            scheduler.start()
        elif logdata['src_host'] != '127.0.0.1':
            self.logger.warn(jsondata)
Example #11
    def __init__(self, scheduler):
        '''
        https://apscheduler.readthedocs.io/en/latest/userguide.html?highlight=add_job

        Parameters
        ----------
        scheduler:
            [str] Scheduler type; choose the one that matches your application:
            'BlockingScheduler' (blocking scheduler):
                for programs that run nothing but the scheduler
            'BackgroundScheduler' (background scheduler):
                for non-blocking use; the scheduler runs independently in the background
            'AsyncIOScheduler' (asyncio scheduler):
                for applications that use asyncio
            'GeventScheduler' (gevent scheduler):
                for applications that use gevent
            'TornadoScheduler' (Tornado scheduler):
                for Tornado applications
            'TwistedScheduler' (Twisted scheduler):
                for Twisted applications
            'QtScheduler' (Qt scheduler):
                for Qt applications
        '''
        import logging
        logging.basicConfig()
        scheduler = str(scheduler).lower()
        if ('blocking' in scheduler):
            from apscheduler.schedulers.blocking import BlockingScheduler
            self.scheduler = BlockingScheduler()
        elif ('background' in scheduler):
            from apscheduler.schedulers.background import BackgroundScheduler
            self.scheduler = BackgroundScheduler()
        elif ('asyncio' in scheduler):
            from apscheduler.schedulers.asyncio import AsyncIOScheduler
            self.scheduler = AsyncIOScheduler()
        elif ('gevent' in scheduler):
            from apscheduler.schedulers.gevent import GeventScheduler
            self.scheduler = GeventScheduler()
        elif ('tornado' in scheduler):
            from apscheduler.schedulers.tornado import TornadoScheduler
            self.scheduler = TornadoScheduler()
        elif ('twisted' in scheduler):
            from apscheduler.schedulers.twisted import TwistedScheduler
            self.scheduler = TwistedScheduler()
        elif ('qt' in scheduler):
            from apscheduler.schedulers.qt import QtScheduler
            self.scheduler = QtScheduler()
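
The docstring above maps each scheduler name to its APScheduler class. A minimal, self-contained sketch of the same selection idea, assuming only that apscheduler and twisted are installed (the SCHEDULER_CLASSES table and make_scheduler function below are illustrative names, not part of the original fragment):

# Sketch: pick an APScheduler scheduler class by name, as the docstring above describes.
# The names SCHEDULER_CLASSES and make_scheduler are assumptions for illustration only.
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.schedulers.twisted import TwistedScheduler

SCHEDULER_CLASSES = {
    'background': BackgroundScheduler,
    'twisted': TwistedScheduler,
}

def make_scheduler(name):
    # Fall back to BackgroundScheduler when the name is not recognized.
    return SCHEDULER_CLASSES.get(str(name).lower(), BackgroundScheduler)()
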
Example #12
    def run(self, args, opts):
        settings = get_project_settings()
        crawler_process = CrawlerProcess(settings)
        scheduler = TwistedScheduler()

        for spider_name in crawler_process.spider_loader.list():
            if spider_name in self.excludes:
                continue
            spider_cls = crawler_process.spider_loader.load(spider_name)
            scheduler.add_job(crawler_process.crawl, 'interval', args=[spider_cls], seconds=86400)
        scheduler.start()
        crawler_process.start(False)
Example #13
def schedule():
    export_scheduler = BackgroundScheduler()  # create a background scheduler
    export_scheduler.add_job(flush_news, 'interval', minutes=60)  # run flush_news every 60 minutes
    export_scheduler.start()  # start the scheduler

    process = CrawlerProcess(get_project_settings())  # create the crawler process
    sloader = SpiderLoader(get_project_settings())  # spider loader: collects all registered spiders
    crawler_scheduler = TwistedScheduler()  # use a TwistedScheduler, since Scrapy is built on Twisted
    for spidername in sloader.list():  # schedule every spider found by the loader
        crawler_scheduler.add_job(process.crawl, 'interval', args=[spidername], minutes=30)  # start a crawl every 30 minutes
    crawler_scheduler.start()  # start the crawler scheduler
    process.start(False)  # keep the process (reactor) running
Example #14
    def schedule(self):
        scheduler = TwistedScheduler(
            {'apscheduler.timezone': self.settings.get('APP_TIMEZONE')})

        # TODO: use random interval
        switch = {
            'debug':
            lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=3),
            'hourly':
            lambda: scheduler.add_job(
                self.run_crawler, 'interval', seconds=3600),
            'daily':
            lambda: scheduler.add_job(
                self.run_crawler, 'interval', seconds=86400),
            'weekly':
            lambda: scheduler.add_job(
                self.run_crawler, 'interval', seconds=86400 * 7),
            'monthly':
            lambda: scheduler.add_job(
                self.run_crawler, 'interval', seconds=86400 * 30),
        }

        switch[self.settings.get('APP_CRAWL_INTERVAL')]()
        scheduler.start()
Example #15
import os
from datetime import datetime


def twisted_schedule():
    from twisted.internet import reactor
    from apscheduler.schedulers.twisted import TwistedScheduler

    def tick():
        print('Tick! The time is: %s' % datetime.now())

    scheduler = TwistedScheduler()
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        reactor.run()
    except (KeyboardInterrupt, SystemExit):
        pass
Example #16
class Scheduler:
    def __init__(self):
        self.scrapers = [
            HistorySpider, WpbccSpider, LWVChicago, LibraryEvents,
            GreatLakesReader
        ]
        self.interval_seconds = 60 * config.schedule_interval

        self.scheduler = TwistedScheduler()
        self.scheduler.add_listener(self.schedule_missed, EVENT_JOB_MISSED)

    def add_schedule(self, scraper, seconds_delay):
        self.scheduler.add_job(self.run_scraper,
                               id=scraper.__name__,
                               trigger='interval',
                               args=[scraper],
                               start_date=datetime.now() +
                               relativedelta(seconds=seconds_delay),
                               seconds=self.interval_seconds)

    def schedule_missed(self, event):
        print(f'{event.job_id} missed. Interval time: {self.interval_seconds}')

    def run_scraper(self, scraper):
        start_date = datetime.now().strftime('%m-%d-%Y')
        end_date = (datetime.now() +
                    relativedelta(months=+1)).strftime('%m-%d-%Y')
        print(f'{datetime.now()} starting {scraper.__name__}')
        runner = CrawlerRunner(get_project_settings())
        runner.crawl(scraper, start_date, end_date)
        runner.join()

    def run_schedule(self):
        configure_logging()
        start_interval = self.interval_seconds / len(self.scrapers)
        now = datetime.now()
        self.last_scheduled = now
        for index, scraper in enumerate(self.scrapers):
            self.add_schedule(scraper, start_interval * index)

        self.scheduler.start()
        reactor.run()
Example #17
        'ITEM_PIPELINES': {
            'pipelines.FilterPipeline': 300,
            'pipelines.SaveReviewPipeline': 400
        },
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware':
            723,
            'scrapy_splash.SplashMiddleware':
            725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware':
            810,
        },
        'POSTGRES_HOST': 'localhost',
        'POSTGRES_PORT': '25432',
        'POSTGRES_DB': 'mob',
        'POSTGRES_USER': '******',
        'POSTGRES_PASSWORD': '******'
    })

if len(sys.argv) == 1:
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl,
                      'interval',
                      args=[ReviewSpider, lambda: start_objs()],
                      seconds=45)
    scheduler.start()
    process.start(False)
else:
    process.crawl(ReviewSpider, lambda: start_objs())
    process.start()
Example #18
from core import toLog
from core.log_manager.log_levels import setup_logger

setup_logger('apscheduler', path_apscheduler, logging.DEBUG)

executors = {
    'default': ThreadPoolExecutor(threadpool_executor),
    'processpool': ProcessPoolExecutor(processpool_executor)
}

job_defaults = {
    'coalesce': coalesce,
    'max_instances': max_instances
}

scheduler = TwistedScheduler(timezone=local_tz)
scheduler.add_jobstore(
    'mongodb',
    host=MONGO_HOST_SELF,
    port=MONGO_PORT_SELF,
    collection=CORE_ID
)

scheduler.add_executor(
    ThreadPoolExecutor(threadpool_executor), 'default'
)

scheduler.add_executor(
    ProcessPoolExecutor(processpool_executor), 'processpool'
)
Example #19
from config.settings import path_apscheduler
from config.settings import processpool_executor
from config.settings import threadpool_executor
from core import toLog
from core.log_manager.log_levels import setup_logger

setup_logger('apscheduler', path_apscheduler, logging.DEBUG)

executors = {
    'default': ThreadPoolExecutor(threadpool_executor),
    'processpool': ProcessPoolExecutor(processpool_executor)
}

job_defaults = {'coalesce': coalesce, 'max_instances': max_instances}

scheduler = TwistedScheduler(timezone=local_tz)

# scheduler.add_jobstore(
#     'mongodb',
#     host=MONGO_HOST_SELF,
#     port=MONGO_PORT_SELF,
#     collection=CORE_ID
# )

scheduler.add_executor(ThreadPoolExecutor(threadpool_executor), 'default')

scheduler.add_executor(ProcessPoolExecutor(processpool_executor),
                       'processpool')

scheduler.start()
Example #20
     # "lianjia-cj-hz",
     # "lianjia-cj-nj",
     # "lianjia-cj-cs",
     # "lianjia-cj-wh",
     # "lianjia-cj-tj",
     # "lianjia-cj-zz",
     #"lianjia-cj-xa",
     #"lianjia-cj-cd",
     #"lianjia-cj-su",
     #  "lianjia-cj-cq",
     # "lianjia-cj-xm",
     # "lianjia-cj-hf",
 ])
 process = CrawlerProcess(get_project_settings())
 sloader = SpiderLoader(get_project_settings())
 scheduler = TwistedScheduler()
 hour = 3
 for spidername in sloader.list():
     # scheduler.add_job(task, 'cron', minute="*/20")
     if spidername in allow2:
         #https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html
         # scheduler.add_job(process.crawl, 'cron', args=[spidername], hour="*/" + str(hour))
         # scheduler.add_job(func=aps_test, args=('scheduled task',), trigger='cron', second='*/5')
         # scheduler.add_job(func=aps_test, args=('one-off task',),
         #                   next_run_time=datetime.datetime.now() + datetime.timedelta(seconds=12))
         # scheduler.add_job(func=aps_test, args=('recurring task',), trigger='interval', seconds=3)
         print(spidername)
         scheduler.add_job(process.crawl,
                           'cron',
                           args=[spidername],
                           next_run_time=datetime.datetime.now() +
Example #21
                    player_5 = player_5.first()
                else:
                    check = False
                # if all players exists then create the team
                if check:
                    team_model = Team.objects.create(name=team_dict["Name"], start_date=team_dict["Start_Date"],
                                                     end_date=team_dict["End_Date"], Player_1=player_1,
                                                     Player_2=player_2, Player_3=player_3,
                                                     Player_4=player_4, Player_5=player_5,
                                                     winning_percentage=team_dict["Winning_Percentage"])
                    team_model.save()

    def get_dates_from_response(self, response):
        """this method returns the start date and end date as datetime object from a response"""
        params_string = response.url.split("?")[-1]
        params_string = params_string.split("&")
        start_date = params_string[0].split("=")[-1]
        start_date = datetime.strptime(start_date, "%Y-%m-%d")
        end_date = params_string[1].split("=")[-1]
        end_date = datetime.strptime(end_date, "%Y-%m-%d")
        return start_date, end_date


if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl, args=[StatsSpider])
    scheduler.add_job(process.crawl, 'interval', args=[StatsSpider], seconds=60 * 60 * 6)
    scheduler.start()
    process.start(False)
Example #22
    parser = argparse.ArgumentParser(description="Stock quote crawler")
    parser.add_argument("--cron",
                        type=str,
                        const=False,
                        nargs='?',
                        help="Whether to run on the configured schedule; defaults to False")

    args = parser.parse_args()

    cron = str2bool(args.cron, False)

    process = CrawlerProcess(get_project_settings())

    # Set the log level
    logging.getLogger('scrapy.core.scraper').setLevel(logging.WARNING)

    if not cron:
        sequence_run()
        process.start()
    else:
        scheduler = TwistedScheduler()

        scheduler.add_job(sequence_run,
                          'cron',
                          day_of_week='mon-fri',
                          hour='9-15',
                          minute='0/30')
        scheduler.start()
        process.start(False)
Example #23
"""
Demonstrates how to use the Twisted compatible scheduler to schedule a job that executes on 3
second intervals.
"""

from datetime import datetime
import os
from pytz import utc
from twisted.internet import reactor
from apscheduler.schedulers.twisted import TwistedScheduler


def tick():
    print('Tick! The time is: %s' % datetime.now())


if __name__ == '__main__':
    scheduler = TwistedScheduler(timezone=utc)
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        reactor.run()
    except (KeyboardInterrupt, SystemExit):
        pass
Example #24
def start(verbose, debug, proxy, min, product, brand, serie, check, delay,
          news, days):
    def check_db():
        from DuTracker.tsdb import influxdb
        try:
            influxdb.ping()
        except Exception as e:
            log.error('InfluxDB connection error')
            sys.exit(1)
        else:
            log.success('InfluxDB connection succeeded')

    if check: check_db()

    # https://stackoverflow.com/questions/44228851/scrapy-on-a-schedule
    settings = get_project_settings()

    if verbose: log.setLevel(logging.DEBUG)
    if proxy:
        settings['DOWNLOADER_MIDDLEWARES'].update(
            {'DuTracker.middlewares.RandomProxy': 760})
        settings['PROXY_URL'] = proxy
    if debug: settings['LOG_ENABLED'] = True
    if delay: settings['DOWNLOAD_DELAY'] = delay

    process = CrawlerProcess(settings)
    sched = TwistedScheduler()

    if brand:
        sched.add_job(process.crawl,
                      'interval',
                      args=[BrandSpider],
                      kwargs={
                          'auto': True,
                          'Ids': brand
                      },
                      days=1)
        process.crawl(BrandSpider, auto=True, Ids=brand)
    if serie:
        sched.add_job(process.crawl,
                      'interval',
                      args=[SerieSpider],
                      kwargs={
                          'auto': True,
                          'Ids': serie
                      },
                      days=1)
        process.crawl(SerieSpider, auto=True, Ids=serie)
    if brand or serie:
        sched.add_job(process.crawl,
                      'interval',
                      args=[ProductSpider],
                      kwargs={'fromDB': True},
                      days=1)
        process.crawl(ProductSpider, fromDB=True)
    process.crawl(TrackerSpider, soldNum_min=min, Ids=product)

    sched.add_job(process.crawl,
                  'interval',
                  args=[TrackerSpider],
                  kwargs={
                      'soldNum_min': min,
                      'Ids': product
                  },
                  hours=6)
    if news:
        sched.add_job(process.crawl,
                      'interval',
                      args=[TrackerSpider],
                      kwargs={
                          'newItem': True,
                          'days': days
                      },
                      hours=1)

    sched.add_job(sched.print_jobs, 'interval', hours=6)

    log.info('Starting product price tracking')
    sched.start()
    process.start(False)
Example #25
from scrapy.crawler import CrawlerProcess
from spiders.zhihu import ZhihuSpider
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler

process = CrawlerProcess(get_project_settings())
sched = TwistedScheduler()
sched.add_job(process.crawl, 'interval', args=[ZhihuSpider], seconds=300)
sched.start()
process.start(False)  # Do not stop reactor after spider closes
Example #26
#
# @defer.inlineCallbacks
# def crawl():
#     yield runner.crawl(AnjukeSpider)
#     yield runner.crawl(BeikeSpider)
#     yield runner.crawl(LianjiaSpider)
#     reactor.stop()
#
# crawl()
# reactor.run() # the script will block here until the last crawl call is finished

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler

from scrapy_jojozu.spiders.anjuke import AnjukeSpider
from scrapy_jojozu.spiders.beike import BeikeSpider
from scrapy_jojozu.spiders.lianjia import LianjiaSpider
from scrapy_jojozu.spiders.fangtianxia import FangSpider
from scrapy_jojozu.spiders.douban import DoubanSpider

process = CrawlerProcess(get_project_settings())
scheduler = TwistedScheduler()
scheduler.add_job(process.crawl, 'interval', args=[AnjukeSpider], minutes=30)
scheduler.add_job(process.crawl, 'interval', args=[BeikeSpider], minutes=30)
scheduler.add_job(process.crawl, 'interval', args=[FangSpider], minutes=30)
scheduler.add_job(process.crawl, 'interval', args=[LianjiaSpider], minutes=30)
scheduler.add_job(process.crawl, 'interval', args=[DoubanSpider], minutes=30)
scheduler.start()
process.start(False)
Example #27
import json

from klein import Klein, route, run
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler
import datetime

from meituan_spider.spiders.meituan_article import MeituanArticleSpider

app = Klein()

scheduler = TwistedScheduler()
# crawl_job = None

class MyCrawlerRunner(CrawlerRunner):
    """
    Crawler object that collects items and returns output after finishing crawl.
    """
    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        # keep all items scraped
        self.items = []

        # create crawler (Same as in base CrawlerProcess)
        crawler = self.create_crawler(crawler_or_spidercls)

        # handle each item scraped
        crawler.signals.connect(self.item_scraped, signals.item_scraped)

        # create Twisted.Deferred launching crawl
Example #28
def run_scheduler(flask_app):
    scheduler = TwistedScheduler()
    JobsAdder(scheduler, flask_app).add_jobs()
    scheduler.start()
Example #29
from apscheduler.schedulers.twisted import TwistedScheduler
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks
from autobahn.twisted.wamp import ApplicationSession


scheduler = TwistedScheduler()
scheduler.add_job(tick, 'interval', seconds=3)
scheduler.start()
Example #30
class Spiders(resource.Resource):
    u"""
    显示Spider的工作状态以及调度计划
    """
    config = Config()
    def __init__(self, root, local_items):
        resource.Resource.__init__(self)
        self.root = root
        self.local_items = local_items
        self.spider_status_dic = {}

        logging.basicConfig()
        self.scheduler = TwistedScheduler()
        self.scheduler.start()

    def get_spider_status(self, project):
        spider_status = self.spider_status_dic.get(project)
        if not spider_status:
            spider_status = dict((spider, {'status': 'finished', 'timestamp': None, 'job': None, 'schedule_job': None})
                                 for spider in get_spider_list(project))
            self.spider_status_dic[project] = spider_status
        self._update_spider_status(project)
        return spider_status

    def _update_spider_status(self, project):
        u"""
        先获取目前任务调度情况,然后再获取apsheduler中的任务。
        """
        spider_status = self.spider_status_dic.get(project)
        for project, queue in self.root.poller.queues.items():
            for m in queue.list():
                spider = m['name']
                job = m['_job']
                spider_status[spider]['status'] = 'pending'
                spider_status[spider]['timestamp'] = None
                spider_status[spider]['job'] = job
        for p in self.root.launcher.processes.values():
            spider = p.spider
            spider_status[spider]['status'] = 'running'
            spider_status[spider]['timestamp'] = p.start_time
            spider_status[spider]['job'] = p.job
        for p in self.root.launcher.finished:
            spider = p.spider
            spider_status[spider]['status'] = 'finished'
            spider_status[spider]['timestamp'] = p.end_time
            spider_status[spider]['job'] = p.job

        for spider in spider_status:
            status = spider_status[spider]
            sjob = self.scheduler.get_job(spider)
            status['schedule_job'] = sjob
            if sjob:
                status['next_time'] = sjob.next_run_time
            else:
                status['next_time'] = None
            # sjob._get_run_times()

    def render_GET(self, txrequest):
        args = dict((k, v[0]) for k, v in txrequest.args.items())
        project = args.pop('project', 'careerTalk')
        spider_status = self.get_spider_status(project)
        content = "<tr>"
        for th in ['spider', 'status', 'timestamp', 'next_time', 'data']:
            content += "<th>%s</th>" % th
        content += "</tr>"
        for spider in spider_status:
            status = spider_status[spider]
            content += "<tr>"
            content += "<td>%s</td><td>%s</td><td>%s</td><td>%s</td>" \
                       % (spider, status['status'], status['timestamp'], status['next_time'])
            content += "<td><a href='/data/%s/'>data</a></td>" % spider
            content += "</tr>"
        sub_form = "<form action='' method='post'><input type='submit' value='Start all jobs'></form>"
        html = "<table>"+content+"</table>"+sub_form
        return html

    def render_POST(self, txrequest):
        args = dict((k, v[0]) for k, v in txrequest.args.items())
        project = args.pop('project', 'careerTalk')
        # spiders = ['NJU', 'BIT', 'ECUST', 'RUC']
        spiders = get_spider_list(project)

        tstart = dt.datetime.utcnow()
        for spider in spiders:
            job = self.scheduler.add_job(spider_crawl, 'interval', minutes=60, replace_existing=True,
                                         id=spider, next_run_time=tstart, args=[project, spider])
            tstart = tstart + dt.timedelta(seconds=5)
        return "<span>任务全部开启</span><a href='/'>返回</a>"
Example #31
configure_logging()
session = DBSession()
config = session.query(Config).filter(Config.id == 1).one()

runner = CrawlerRunner(get_scrapy_settings(config))


@defer.inlineCallbacks
def crawl(rule):
    dot = rule.spider_clsass.rindex('.')
    module, name = rule.spider_clsass[:dot], rule.spider_clsass[dot + 1:]
    if (module == GeneralSpider.__module__ and name == GeneralSpider.__name__):
        yield runner.crawl(GeneralSpider, rule)


sched = TwistedScheduler()
# sched.add_jobstore('sqlalchemy', url='mysql+pymysql://root:djejeUJ3qj^[email protected]:3306/apscheduler')

spiderRules = session.query(SpiderRule).filter(SpiderRule.enable,
                                               SpiderRule.cron).all()

for sr in spiderRules:
    if sr.cron:
        sched.add_job(crawl,
                      CronTrigger.from_crontab(sr.cron),
                      args=[sr],
                      name=sr.name,
                      id='%s' % sr.id)
    # else:
    #     sched.add_job(crawl, 'date', args=[sr],  name=sr.name, id='%s'%sr.id)
session.close()
Example #32
from config.settings import path_apscheduler
from config.settings import processpool_executor
from config.settings import threadpool_executor
from core import toLog
from core.log_manager.log_levels import setup_logger

setup_logger('apscheduler', path_apscheduler, logging.DEBUG)

executors = {
    'default': ThreadPoolExecutor(threadpool_executor),
    'processpool': ProcessPoolExecutor(processpool_executor)
}

job_defaults = {'coalesce': coalesce, 'max_instances': max_instances}

scheduler = TwistedScheduler(timezone=local_tz)
scheduler.add_jobstore('mongodb',
                       host=MONGO_HOST_SELF,
                       port=MONGO_PORT_SELF,
                       collection=CORE_ID)

scheduler.add_executor(ThreadPoolExecutor(threadpool_executor), 'default')

scheduler.add_executor(ProcessPoolExecutor(processpool_executor),
                       'processpool')

scheduler.start()


def job_logger(event):
    if event.code > 512:
Example #33
        title = response.xpath("//article[@class='detail']/div[@class='detail__header']/h1/text()").get()
        paragraphs = response.xpath("//div[@class='detail__body itp_bodycontent_wrapper']/div[@class='detail__body-text itp_bodycontent']/p/text()").extract()
        
        if not paragraphs:  # extract() returns a list, so bail out when nothing matched
            return

        content = ""
        for paragraph in paragraphs:
            content = content + paragraph

        database = connection["scrapingdb"]
        collection = database["detik_articles"]

        collection.insert_one({
            "title":title,
            "content":content,
            "url": response.url
        })

        connection.close()
        yield { "message" : f"success: {response.url}" }


process = CrawlerProcess()
# process.crawl(MySpider)
# process.start() 
scheduler = TwistedScheduler()
scheduler.add_job(process.crawl, 'interval', args=[MySpider], seconds=5)
scheduler.start()
process.start(False)
Example #34
# "lianjia-cj-cs",
# "lianjia-cj-wh",
# "lianjia-cj-tj",
# "lianjia-cj-zz",
#"lianjia-cj-xa",
#"lianjia-cj-cd",
#"lianjia-cj-su",
#  "lianjia-cj-cq",
# "lianjia-cj-xm",
# "lianjia-cj-hf",
    ])
  process = CrawlerProcess(get_project_settings())
  runner = CrawlerRunner(get_project_settings())
  # runner.crawl()
  sloader = SpiderLoader(get_project_settings())
  scheduler = TwistedScheduler()
  hour = 1
  for spidername in sloader.list():
    # scheduler.add_job(task, 'cron', minute="*/20")
    if spidername in allow:
      #https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html
      # scheduler.add_job(process.crawl, 'cron', args=[spidername], hour="*/" + str(hour))
      # scheduler.add_job(func=aps_test, args=('scheduled task',), trigger='cron', second='*/5')
      # scheduler.add_job(func=aps_test, args=('one-off task',),
      #                   next_run_time=datetime.datetime.now() + datetime.timedelta(seconds=12))
      # scheduler.add_job(func=aps_test, args=('recurring task',), trigger='interval', seconds=3)
      print(spidername)
      scheduler.add_job(process.crawl, trigger='date', args=[spidername],
                        run_date=datetime.datetime(2018, 9, 10, hour, 0, 0))
      # scheduler.add_job(process.crawl, trigger='cron', args=[spidername],
      #                   year='*', month='*', day=9, week='*', day_of_week='*', hour=hour, minute=20, second=0)
Example #35
from camping.spiders.choansan_spider import ChoansanSpider
from camping.spiders.gangdong_spider import GangdongSpider
from camping.spiders.joongrangsoop_spider import JoongrangsoopSpiderSpider
from camping.spiders.imjingak_spider import ImjingakSpider
from camping.spiders.pyeongtaek_spider import PyeongtaekSpider
from camping.spiders.campunak_spider import CampunakSpider

from datetime import date
from datetime import timedelta
import calendar

# if __name__ == '__main__':

try:
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()
    # scheduler.add_job(process.crawl, 'interval', args=[ChoansanSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[GangdongSpider], seconds=10)
    scheduler.add_job(process.crawl,
                      'interval',
                      args=[JoongrangsoopSpiderSpider],
                      seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[ImjingakSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[PyeongtaekSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[CampunakSpider], seconds=15)
    scheduler.start()
    process.start(False)
except (KeyboardInterrupt, SystemExit):
    print("stop process")

Example #36
import datetime
import logging
import time

import apscheduler.events
from apscheduler.schedulers.twisted import TwistedScheduler

logging.basicConfig(format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S', filename='list.log', filemode='w')

n = 6000
ll = [0 for i in range(n)]

def func(i):
    ll[i] += 1

def err_lis(ev):
    logger = logging.getLogger("")
    logger.error(str(ev))

#scheduler = BackgroundScheduler()
scheduler = TwistedScheduler()
for i in range(n):
    start = datetime.datetime.now() + datetime.timedelta(seconds=i%10)
    scheduler.add_job(func, 'interval', args=(i,), start_date=start, seconds=10)

scheduler.add_listener(err_lis, apscheduler.events.EVENT_JOB_ERROR | apscheduler.events.EVENT_JOB_MISSED)
scheduler.start()
time.sleep(5)
scheduler.shutdown()
s = 0
for i in ll:
    s += i
print(s)