Example #1
class scheduler():
    def __init__(self):
        self.modulePath = "AutoCourseInfo_Scrapy.CourseInfoExtract.CourseInfoExtract.spiders."
        self.sched = TwistedScheduler()
        self.process = CrawlerRunner(get_project_settings())

    def addJob(self, spiderModulePath, spiderClass, scheduleTime):
        # Create Spider Object dynamically by importing module.
        try:
            module = self.modulePath + spiderModulePath
            module = importlib.import_module(module)
            class_ = getattr(module, spiderClass)
            instance = class_()
            self.sched.add_job(self.process.crawl,
                               'date',
                               args=[instance],
                               run_date=scheduleTime)

        except Exception as error:
            print(error)

    def runJob(self):
        try:
            self.sched.start()
            d = self.process.join()
            d.addBoth(lambda _: reactor.stop())
            reactor.run()

        except Exception as error:
            print(error)
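
A minimal driver sketch for the class above; the 'course_spider' module and 'CourseSpider' class names are hypothetical placeholders that would have to live under the package path configured in self.modulePath.

from datetime import datetime, timedelta

if __name__ == '__main__':
    s = scheduler()
    # Hypothetical spider module/class under the configured package path.
    s.addJob('course_spider', 'CourseSpider', datetime.now() + timedelta(seconds=30))
    s.runJob()  # starts the APScheduler jobs and runs the Twisted reactor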
Example #2
def trigger_spider_job(seconds=10, source_type="1", source_key="jd", document_item=YJdItem):
    scheduler = TwistedScheduler()
    trigger = CronTrigger(hour=10, minute=42, second=seconds)
    scheduler.add_job(print_time, trigger, args=[source_type, source_key, document_item],
                      misfire_grace_time=120)

    scheduler.start()
Example #3
 def startYourEngines(self):
     sched = TwistedScheduler()
     sched.start()
     if sched.get_job('host_status'):
         pass
     else:
         sched.add_job(self.hoststatus,
                       'interval',
                       seconds=10,
                       id='host_status')
Example #4
 def log(self, logdata, retry=True):
     logdata = self.sanitizeLog(logdata)
     jsondata = json.dumps(logdata, sort_keys=True)
     if logdata['src_host']!='127.0.0.1' and logdata['dst_host']!='':
         import uuid
         scheduler = TwistedScheduler()
         scheduler.add_job(self.post2server, args=[self.serverip, jsondata], id=str(uuid.uuid1()))
         scheduler.start()
     elif logdata['src_host']!='127.0.0.1':
         self.logger.warn(jsondata)
Example #5
def schedule():
    export_scheduler = BackgroundScheduler()  # create a background scheduler
    export_scheduler.add_job(flush_news, 'interval', minutes=60)  # add a job: run flush_news every 60 minutes
    export_scheduler.start()  # start the scheduler

    process = CrawlerProcess(get_project_settings())  # create the crawler process
    sloader = SpiderLoader(get_project_settings())  # spider loader: collects all spiders of the project
    crawler_scheduler = TwistedScheduler()  # Twisted-based scheduler, since Scrapy itself is built on Twisted
    for spidername in sloader.list():  # schedule a crawl for every spider found by the loader
        crawler_scheduler.add_job(process.crawl, 'interval', args=[spidername], minutes=30)  # start each crawl every 30 minutes
    crawler_scheduler.start()  # start the crawler scheduler
    process.start(False)  # keep the process running (do not stop the reactor after the crawls)
Example #6
    def run(self, args, opts):
        settings = get_project_settings()
        crawler_process = CrawlerProcess(settings)
        scheduler = TwistedScheduler()

        for spider_name in crawler_process.spider_loader.list():
            if spider_name in self.excludes:
                continue
            spider_cls = crawler_process.spider_loader.load(spider_name)
            scheduler.add_job(crawler_process.crawl, 'interval', args=[spider_cls], seconds=86400)
        scheduler.start()
        crawler_process.start(False)
Example #7
def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(PlaysportCrawler)
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl,
                      'interval',
                      hours=3,
                      args=[PlaysportCrawler])
    scheduler.add_listener(my_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
    scheduler.start()
    process.start(False)
    _ = _notifier(msg='\n'.join([
        "Scheduler Start",
    ]))
Example #8
def twisted_schedule():
    import os
    from datetime import datetime

    from twisted.internet import reactor
    from apscheduler.schedulers.twisted import TwistedScheduler

    def tick():
        print('Tick! The time is: %s' % datetime.now())

    scheduler = TwistedScheduler()
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        reactor.run()
    except (KeyboardInterrupt, SystemExit):
        pass
Example #9
class Scheduler:
    def __init__(self):
        self.scrapers = [
            HistorySpider, WpbccSpider, LWVChicago, LibraryEvents,
            GreatLakesReader
        ]
        self.interval_seconds = 60 * config.schedule_interval

        self.scheduler = TwistedScheduler()
        self.scheduler.add_listener(self.schedule_missed, EVENT_JOB_MISSED)

    def add_schedule(self, scraper, seconds_delay):
        self.scheduler.add_job(self.run_scraper,
                               id=scraper.__name__,
                               trigger='interval',
                               args=[scraper],
                               start_date=datetime.now() +
                               relativedelta(seconds=seconds_delay),
                               seconds=self.interval_seconds)

    def schedule_missed(self, event):
        print(f'{event.job_id} missed. Interval time: {self.interval_seconds}')

    def run_scraper(self, scraper):
        start_date = datetime.now().strftime('%m-%d-%Y')
        end_date = (datetime.now() +
                    relativedelta(months=+1)).strftime('%m-%d-%Y')
        print(f'{datetime.now()} starting {scraper.__name__}')
        runner = CrawlerRunner(get_project_settings())
        runner.crawl(scraper, start_date, end_date)
        runner.join()

    def run_schedule(self):
        configure_logging()
        start_interval = self.interval_seconds / len(self.scrapers)
        now = datetime.now()
        self.last_scheduled = now
        for index, scraper in enumerate(self.scrapers):
            self.add_schedule(scraper, start_interval * index)

        self.scheduler.start()
        reactor.run()
Example #10
    def schedule(self):
        scheduler = TwistedScheduler(
            {'apscheduler.timezone': self.settings.get('APP_TIMEZONE')})

        # TODO: use random interval
        switch = {
            'debug': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=3),
            'hourly': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=3600),
            'daily': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=86400),
            'weekly': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=86400 * 7),
            'monthly': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=86400 * 30),
        }

        switch[self.settings.get('APP_CRAWL_INTERVAL')]()
        scheduler.start()
Example #11
    isLeaf = True

    def render_GET(self, request):
        # self.count +=1

        sessionweb = DBSession()
        # Pass both criteria to filter() (implicit AND); Python's `and` would not build the SQL condition.
        spiderRule = sessionweb.query(SpiderRule).filter(
            SpiderRule.enable == True,
            SpiderRule.cron == None).one()
        # spiderRule.name = '%s-%s'%(spiderRule.name,self.count)
        # print('I have been here %s times!!!' % spiderRule.name)
        self.scheduler.add_job(crawl,
                               'date',
                               args=[spiderRule],
                               name=spiderRule.name,
                               id='%s' % spiderRule.id,
                               replace_existing=True)

        sessionweb.close()
        request.setHeader("Content-Type", "text/html; charset=utf-8")
        return ("<html>Hello, world!</html>").encode('utf-8')

    def render_POST(self, request):
        pass


site = server.Site(Simple(sched))
endpoint = endpoints.TCP4ServerEndpoint(reactor, 8080)
endpoint.listen(site)

sched.start()
reactor.run()
Example #12
def trigger_spider_job(spider, seconds=10):
    scheduler = TwistedScheduler()
    # Start execution at midnight every day
    trigger = CronTrigger(hour=0, minute=19, second=seconds)
    scheduler.add_job(runner.crawl, trigger, args=[spider])
    scheduler.start()
Example #13
class Spiders(resource.Resource):
    u"""
    Show each spider's working status and its scheduling plan.
    """
    config = Config()
    def __init__(self, root, local_items):
        resource.Resource.__init__(self)
        self.root = root
        self.local_items = local_items
        self.spider_status_dic = {}

        logging.basicConfig()
        self.scheduler = TwistedScheduler()
        self.scheduler.start()

    def get_spider_status(self, project):
        spider_status = self.spider_status_dic.get(project)
        if not spider_status:
            spider_status = dict((spider, {'status': 'finished', 'timestamp': None, 'job': None, 'schedule_job': None})
                                 for spider in get_spider_list(project))
            self.spider_status_dic[project] = spider_status
        self._update_spider_status(project)
        return spider_status

    def _update_spider_status(self, project):
        u"""
        First collect the current task-scheduling state, then fetch the corresponding jobs from APScheduler.
        """
        spider_status = self.spider_status_dic.get(project)
        for project, queue in self.root.poller.queues.items():
            for m in queue.list():
                spider = m['name']
                job = m['_job']
                spider_status[spider]['status'] = 'pending'
                spider_status[spider]['timestamp'] = None
                spider_status[spider]['job'] = job
        for p in self.root.launcher.processes.values():
            spider = p.spider
            spider_status[spider]['status'] = 'running'
            spider_status[spider]['timestamp'] = p.start_time
            spider_status[spider]['job'] = p.job
        for p in self.root.launcher.finished:
            spider = p.spider
            spider_status[spider]['status'] = 'finished'
            spider_status[spider]['timestamp'] = p.end_time
            spider_status[spider]['job'] = p.job

        for spider in spider_status:
            status = spider_status[spider]
            sjob = self.scheduler.get_job(spider)
            status['schedule_job'] = sjob
            if sjob:
                status['next_time'] = sjob.next_run_time
            else:
                status['next_time'] = None
            # sjob._get_run_times()

    def render_GET(self, txrequest):
        args = dict((k, v[0]) for k, v in txrequest.args.items())
        project = args.pop('project', 'careerTalk')
        spider_status = self.get_spider_status(project)
        content = "<tr>"
        for th in ['spider', 'status', 'timestamp', 'next_time', 'data']:
            content += "<th>%s</th>" % th
        content += "</tr>"
        for spider in spider_status:
            status = spider_status[spider]
            content += "<tr>"
            content += "<td>%s</td><td>%s</td><td>%s</td><td>%s</td>" \
                       % (spider, status['status'], status['timestamp'], status['next_time'])
            content += "<td><a href='/data/%s/'>data</a></td>" % spider
            content += "</tr>"
        sub_form = "<form action='' method='post'><input type='submit' value='开启所有任务'></input></form>"
        html = "<table>"+content+"</table>"+sub_form
        return html

    def render_POST(self, txrequest):
        args = dict((k, v[0]) for k, v in txrequest.args.items())
        project = args.pop('project', 'careerTalk')
        # spiders = ['NJU', 'BIT', 'ECUST', 'RUC']
        spiders = get_spider_list(project)

        tstart = dt.datetime.utcnow()
        for spider in spiders:
            job = self.scheduler.add_job(spider_crawl, 'interval', minutes=60, replace_existing=True,
                                         id=spider, next_run_time=tstart, args=[project, spider])
            tstart = tstart + dt.timedelta(seconds=5)
        return "<span>任务全部开启</span><a href='/'>返回</a>"
Example #14
def start(verbose, debug, proxy, min, product, brand, serie, check, delay,
          news, days):
    def check_db():
        from DuTracker.tsdb import influxdb
        try:
            influxdb.ping()
        except Exception as e:
            log.error(f'InfluxDB 连接错误')
            sys.exit(1)
        else:
            log.success(f'InfluxDB 连接成功')

    if check: check_db()

    # https://stackoverflow.com/questions/44228851/scrapy-on-a-schedule
    settings = get_project_settings()

    if verbose: log.setLevel(logging.DEBUG)
    if proxy:
        settings['DOWNLOADER_MIDDLEWARES'].update(
            {'DuTracker.middlewares.RandomProxy': 760})
        settings['PROXY_URL'] = proxy
    if debug: settings['LOG_ENABLED'] = True
    if delay: settings['DOWNLOAD_DELAY'] = delay

    process = CrawlerProcess(settings)
    sched = TwistedScheduler()

    if brand:
        sched.add_job(process.crawl,
                      'interval',
                      args=[BrandSpider],
                      kwargs={
                          'auto': True,
                          'Ids': brand
                      },
                      days=1)
        process.crawl(BrandSpider, auto=True, Ids=brand)
    if serie:
        sched.add_job(process.crawl,
                      'interval',
                      args=[SerieSpider],
                      kwargs={
                          'auto': True,
                          'Ids': serie
                      },
                      days=1)
        process.crawl(SerieSpider, auto=True, Ids=serie)
    if brand or serie:
        sched.add_job(process.crawl,
                      'interval',
                      args=[ProductSpider],
                      kwargs={'fromDB': True},
                      days=1)
        process.crawl(ProductSpider, fromDB=True)
    process.crawl(TrackerSpider, soldNum_min=min, Ids=product)

    sched.add_job(process.crawl,
                  'interval',
                  args=[TrackerSpider],
                  kwargs={
                      'soldNum_min': min,
                      'Ids': product
                  },
                  hours=6)
    if news:
        sched.add_job(process.crawl,
                      'interval',
                      args=[TrackerSpider],
                      kwargs={
                          'newItem': True,
                          'days': days
                      },
                      hours=1)

    sched.add_job(sched.print_jobs, 'interval', hours=6)

    log.info('开始商品价格追踪')
    sched.start()
    process.start(False)
Example #15
from apscheduler.schedulers.twisted import TwistedScheduler
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks
from autobahn.twisted.wamp import ApplicationSession


def tick():
    # Placeholder job; the original module defines the real callback elsewhere.
    print('tick')


scheduler = TwistedScheduler()
scheduler.add_job(tick, 'interval', seconds=3)
scheduler.start()
Example #16

from __future__ import print_function
from logging import basicConfig, ERROR
from apscheduler.schedulers.twisted import TwistedScheduler

basicConfig(level=ERROR)

SCHEDULER = TwistedScheduler()

SCHEDULER.start()

def ms(mills):

    """
    Converts milliseconds to seconds
    """

    return mills * 0.001
Example #17
def run_scheduler(flask_app):
    scheduler = TwistedScheduler()
    JobsAdder(scheduler, flask_app).add_jobs()
    scheduler.start()
Example #18
    mysql_client = SQLServer.from_settings(settings, cf.get("MYSQL_SERVER", "type"), db=cf.get("MYSQL_SERVER", "db"))
    sql = "select COLUMN_NAME from information_schema.COLUMNS where table_name = 'CRAWLER_SPIDER_TASK';"
    column_name_list = [x[0] for x in mysql_client.select(sql)]  # query all column names of the task table
    sql = "SELECT {} FROM `CRAWLER_SPIDER_TASK` WHERE isUse=1;".format(",".join(column_name_list))
    site_info_dict_list = []
    for site_info in mysql_client.select(sql):  # query all tasks that should currently be triggered and convert their format
        item = {}
        for i, x in enumerate(column_name_list):
            item[x] = site_info[i]
        site_info_dict_list.append(item)
    for site_info in site_info_dict_list:
        site_info["cf"] = cf
        settings.set("CONCURRENT_REQUESTS", site_info.get("CONCURRENT_REQUESTS", 16), priority="project")
        crawler_process.crawl(site_info["SpiderName"], **site_info)

if __name__ == '__main__':
    settings = get_project_settings()
    crawler_process = CrawlerProcess(settings)
    Scheduler = TwistedScheduler()
    if get_current_ip() != settings.get("MASTER_HOST", ""):
        print(get_current_ip())
        print(settings.get("MASTER_HOST", ""))
        print("消费者集群")
        runAllSpiderConsume()
    else:
        # RunCrawlerServer()
        Scheduler.add_job(func=CreateTask, trigger='interval', seconds=2, args=(settings,), id='Test')
    Scheduler._logger = logger
    Scheduler.start()
    reactor.run()
Example #19
# if __name__ == '__main__':

try:
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()
    # scheduler.add_job(process.crawl, 'interval', args=[ChoansanSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[GangdongSpider], seconds=10)
    scheduler.add_job(process.crawl,
                      'interval',
                      args=[JoongrangsoopSpiderSpider],
                      seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[ImjingakSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[PyeongtaekSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[CampunakSpider], seconds=15)
    scheduler.start()
    process.start(False)
except (KeyboardInterrupt, SystemExit):
    print("stop process")


def getSaturday():
    # today = date.today()
    # print(today)
    # print(today.weekday())
    # offset = (today.weekday() - 5)%7
    # print(offset)
    # print(timedelta(days=offset))

    # last_saturday = today - timedelta(days=offset)
    # print(last_saturday)
    pass  # the function body is entirely commented out
Example #20
def trigger_spider_job(spider, seconds=10):
    scheduler = TwistedScheduler()
    start_time = datetime.datetime.now() + datetime.timedelta(seconds=seconds)
    trigger = IntervalTrigger(hours=8, start_date=start_time)
    scheduler.add_job(runner.crawl, trigger, args=[spider])
    scheduler.start()
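
The trigger_spider_job helpers above only register jobs; a TwistedScheduler does not fire anything until the Twisted reactor is running. A minimal driver sketch, assuming `runner` is the Scrapy CrawlerRunner these excerpts refer to and MySpider is a hypothetical project spider:

from twisted.internet import reactor

trigger_spider_job(MySpider, seconds=10)  # registers the job with the TwistedScheduler
reactor.run()  # jobs only fire while the reactor loop is running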