Example No. 1
import importlib

from apscheduler.schedulers.twisted import TwistedScheduler
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor


class scheduler:
    def __init__(self):
        self.modulePath = "AutoCourseInfo_Scrapy.CourseInfoExtract.CourseInfoExtract.spiders."
        self.sched = TwistedScheduler()
        self.process = CrawlerRunner(get_project_settings())

    def addJob(self, spiderModulePath, spiderClass, scheduleTime):
        # Import the spider's module dynamically and look up the spider class.
        try:
            module = importlib.import_module(self.modulePath + spiderModulePath)
            class_ = getattr(module, spiderClass)
            # CrawlerRunner.crawl expects a spider class (or Crawler), not an instance.
            self.sched.add_job(self.process.crawl,
                               'date',
                               args=[class_],
                               run_date=scheduleTime)

        except Exception as error:
            print(error)

    def runJob(self):
        try:
            self.sched.start()
            d = self.process.join()
            d.addBoth(lambda _: reactor.stop())
            reactor.run()

        except Exception as error:
            print(error)
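A minimal sketch of how this scheduler class might be driven; the spider module and class names below are placeholders, not taken from the original project:

# Hypothetical usage of the scheduler class above; "course_list" and "CourseListSpider"
# are placeholder names, not from the original project.
from datetime import datetime, timedelta

s = scheduler()
s.addJob("course_list", "CourseListSpider", datetime.now() + timedelta(minutes=1))
s.runJob()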
Example No. 2
def trigger_spider_job(seconds=10, source_type="1", source_key="jd", document_item=YJdItem):
    scheduler = TwistedScheduler()
    trigger = CronTrigger(hour=10, minute=42, second=seconds)
    scheduler.add_job(print_time, trigger, args=[source_type, source_key, document_item],
                      misfire_grace_time=120)

    scheduler.start()
Example No. 3
    def log(self, logdata, retry=True):
        logdata = self.sanitizeLog(logdata)
        jsondata = json.dumps(logdata, sort_keys=True)
        if logdata['src_host'] != '127.0.0.1' and logdata['dst_host'] != '':
            import uuid
            scheduler = TwistedScheduler()
            scheduler.add_job(self.post2server, args=[self.serverip, jsondata], id=str(uuid.uuid1()))
            scheduler.start()
        elif logdata['src_host'] != '127.0.0.1':
            self.logger.warn(jsondata)
Example No. 4
    def startYourEngines(self):
        sched = TwistedScheduler()
        sched.start()
        if sched.get_job('host_status'):
            pass
        else:
            sched.add_job(self.hoststatus,
                          'interval',
                          seconds=10,
                          id='host_status')
Example No. 5
def schedule():
    export_scheduler = BackgroundScheduler()  # create a background scheduler
    export_scheduler.add_job(flush_news, 'interval', minutes=60)  # add a job: run flush_news every 60 minutes
    export_scheduler.start()  # start the scheduler

    process = CrawlerProcess(get_project_settings())  # create the crawler process
    sloader = SpiderLoader(get_project_settings())  # spider loader: collects all spiders into sloader
    crawler_scheduler = TwistedScheduler()  # use a Twisted scheduler, since Scrapy is built on Twisted
    for spidername in sloader.list():  # schedule a crawl job for every spider in sloader
        crawler_scheduler.add_job(process.crawl, 'interval', args=[spidername], minutes=30)  # run the crawl every 30 minutes
    crawler_scheduler.start()  # start the crawler scheduler
    process.start(False)  # keep the process (reactor) running
Example No. 6
    def run(self, args, opts):
        settings = get_project_settings()
        crawler_process = CrawlerProcess(settings)
        scheduler = TwistedScheduler()

        for spider_name in crawler_process.spider_loader.list():
            if spider_name in self.excludes:
                continue
            spider_cls = crawler_process.spider_loader.load(spider_name)
            scheduler.add_job(crawler_process.crawl, 'interval', args=[spider_cls], seconds=86400)
        scheduler.start()
        crawler_process.start(False)
Example No. 7
def twisted_schedule():
    from twisted.internet import reactor
    from apscheduler.schedulers.twisted import TwistedScheduler

    def tick():
        print('Tick! The time is: %s' % datetime.now())

    scheduler = TwistedScheduler()
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        reactor.run()
    except (KeyboardInterrupt, SystemExit):
        pass
Example No. 9
class Scheduler:
    def __init__(self):
        self.scrapers = [
            HistorySpider, WpbccSpider, LWVChicago, LibraryEvents,
            GreatLakesReader
        ]
        self.interval_seconds = 60 * config.schedule_interval

        self.scheduler = TwistedScheduler()
        self.scheduler.add_listener(self.schedule_missed, EVENT_JOB_MISSED)

    def add_schedule(self, scraper, seconds_delay):
        self.scheduler.add_job(self.run_scraper,
                               id=scraper.__name__,
                               trigger='interval',
                               args=[scraper],
                               start_date=datetime.now() +
                               relativedelta(seconds=seconds_delay),
                               seconds=self.interval_seconds)

    def schedule_missed(self, event):
        print(f'{event.job_id} missed. Interval time: {self.interval_seconds}')

    def run_scraper(self, scraper):
        start_date = datetime.now().strftime('%m-%d-%Y')
        end_date = (datetime.now() +
                    relativedelta(months=+1)).strftime('%m-%d-%Y')
        print(f'{datetime.now()} starting {scraper.__name__}')
        runner = CrawlerRunner(get_project_settings())
        runner.crawl(scraper, start_date, end_date)
        runner.join()

    def run_schedule(self):
        configure_logging()
        start_interval = self.interval_seconds / len(self.scrapers)
        now = datetime.now()
        self.last_scheduled = now
        for index, scraper in enumerate(self.scrapers):
            self.add_schedule(scraper, start_interval * index)

        self.scheduler.start()
        reactor.run()
Example No. 10
    def schedule(self):
        scheduler = TwistedScheduler(
            {'apscheduler.timezone': self.settings.get('APP_TIMEZONE')})

        # TODO: use random interval
        switch = {
            'debug':
            lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=3),
            'hourly':
            lambda: scheduler.add_job(
                self.run_crawler, 'interval', seconds=3600),
            'daily':
            lambda: scheduler.add_job(
                self.run_crawler, 'interval', seconds=86400),
            'weekly':
            lambda: scheduler.add_job(
                self.run_crawler, 'interval', seconds=86400 * 7),
            'monthly':
            lambda: scheduler.add_job(
                self.run_crawler, 'interval', seconds=86400 * 30),
        }

        switch[self.settings.get('APP_CRAWL_INTERVAL')]()
        scheduler.start()
Example No. 11
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',
    'FEED_URI': 'output_file.csv',
    'FEED_FORMAT': 'csv',
})

# process = CrawlerProcess({
#     'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',

# })

sched = TwistedScheduler(job_defaults=job_defaults)
ptime = 0
ttime = 0

sched.add_job(process.crawl,
              'interval',
              args=[ProthomAloTSpider],
              minutes=ttime)
sched.add_job(process.crawl,
              'interval',
              args=[BanglaBdnewsTSpider],
              minutes=ttime)
sched.add_job(process.crawl,
              'interval',
              args=[BanglatribuneTSpider],
              minutes=ttime)
sched.add_job(process.crawl,
              'interval',
              args=[ProthomAloPSpider],
              minutes=ptime)
sched.add_job(process.crawl,
              'interval',
Example No. 12
        title = response.xpath("//article[@class='detail']/div[@class='detail__header']/h1/text()").get()
        paragraphs = response.xpath("//div[@class='detail__body itp_bodycontent_wrapper']/div[@class='detail__body-text itp_bodycontent']/p/text()").extract()
        
        if not paragraphs:  # extract() returns a list; skip pages with no body text
            return

        content = "".join(paragraphs)

        database = connection["scrapingdb"]
        collection = database["detik_articles"]

        collection.insert_one({
            "title":title,
            "content":content,
            "url": response.url
        })

        connection.close()
        yield { "message" : f"success: {response.url}" }


process = CrawlerProcess()
# process.crawl(MySpider)
# process.start() 
scheduler = TwistedScheduler()
scheduler.add_job(process.crawl, 'interval', args=[MySpider], seconds=5)
scheduler.start()
process.start(False)
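The connection object used in the spider above is created outside this snippet; with pymongo it would typically be a module-level MongoClient, for example (the URI below is a placeholder, not from the original):

# Assumed setup for the snippet above; the connection URI is a placeholder.
import pymongo

connection = pymongo.MongoClient("mongodb://localhost:27017")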
Example No. 13
    ])
  process = CrawlerProcess(get_project_settings())
  runner = CrawlerRunner(get_project_settings())
  # runner.crawl()
  sloader = SpiderLoader(get_project_settings())
  scheduler = TwistedScheduler()
  hour = 1
  for spidername in sloader.list():
    # scheduler.add_job(task, 'cron', minute="*/20")
    if spidername in allow:
      #https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html
      # scheduler.add_job(process.crawl, 'cron', args=[spidername], hour="*/" + str(hour))
      # scheduler.add_job(func=aps_test, args=('scheduled task',), trigger='cron', second='*/5')
      # scheduler.add_job(func=aps_test, args=('one-off task',),
      #                   next_run_time=datetime.datetime.now() + datetime.timedelta(seconds=12))
      # scheduler.add_job(func=aps_test, args=('recurring task',), trigger='interval', seconds=3)
      print(spidername)
      scheduler.add_job(process.crawl, trigger='date', args=[spidername],
                        run_date=datetime.datetime(2018, 9, 10, hour, 0, 0))
      # scheduler.add_job(process.crawl, trigger='cron', args=[spidername],
      #                   year='*', month='*', day=9, week='*', day_of_week='*', hour=hour, minute=20, second=0)
      # scheduler.add_job(process.crawl, args=[spidername], next_run_time=datetime.datetime.now() + datetime.timedelta(hours=4))
      hour += 1

  scheduler.start()
  process.start(False)
  try:
    while True:
      time.sleep(2)
  except (KeyboardInterrupt, SystemExit):
    scheduler.shutdown()
Example No. 14
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler
from tutorial.spiders.kinder_spider import JobSpider

process = CrawlerProcess(get_project_settings())
scheduler = TwistedScheduler()
scheduler.add_job(process.crawl,
                  'cron',
                  args=[JobSpider],
                  hour='12',
                  minute='13')
scheduler.start()
process.start(False)
Example No. 15
            # lookup the second job (assuming it's a scheduled job)
            jobs = scheduler.get_jobs()
            second_job = next((j for j in jobs if j.name == 'ptt'), None)
            if second_job:
                # run the second job immediately
                second_job.modify(next_run_time=datetime.datetime.now())
            # else:
            #     # job not scheduled, add it and run now
            #     scheduler.add_job(FilterAndInsertData, 'cron', args=[PttMoviesSpider])


if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl,
                      trigger,
                      args=[YahoomovieSpider],
                      name='yahoo')
    # scheduler.get_job(job_id ="my_job_id").modify(next_run_time=datetime.datetime.now())
    scheduler.add_job(process.crawl,
                      'cron',
                      args=[PttMoviesSpider],
                      hour='23',
                      minute='59',
                      name='ptt')
    # scheduler.add_job(FilterAndInsertData, 'cron', day='last sun', name='insertData')
    scheduler.add_listener(execution_listener,
                           EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
    scheduler.start()
    process.start(False)  # Do not stop reactor after spider closes

    # try:
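The execution_listener and trigger referenced in this snippet are defined outside it; a minimal sketch of what such a listener might look like (this body is an assumption, not the original implementation):

# Hypothetical listener matching the add_listener call above; not the original code.
def execution_listener(event):
    # APScheduler passes a JobExecutionEvent; event.exception is set when the job failed.
    if event.exception:
        print(f'job {event.job_id} failed: {event.exception}')
    else:
        print(f'job {event.job_id} executed')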
Example No. 16
class Spiders(resource.Resource):
    u"""
    显示Spider的工作状态以及调度计划
    """
    config = Config()
    def __init__(self, root, local_items):
        resource.Resource.__init__(self)
        self.root = root
        self.local_items = local_items
        self.spider_status_dic = {}

        logging.basicConfig()
        self.scheduler = TwistedScheduler()
        self.scheduler.start()

    def get_spider_status(self, project):
        spider_status = self.spider_status_dic.get(project)
        if not spider_status:
            spider_status = dict((spider, {'status': 'finished', 'timestamp': None, 'job': None, 'schedule_job': None})
                                 for spider in get_spider_list(project))
            self.spider_status_dic[project] = spider_status
        self._update_spider_status(project)
        return spider_status

    def _update_spider_status(self, project):
        u"""
        先获取目前任务调度情况,然后再获取apsheduler中的任务。
        """
        spider_status = self.spider_status_dic.get(project)
        for project, queue in self.root.poller.queues.items():
            for m in queue.list():
                spider = m['name']
                job = m['_job']
                spider_status[spider]['status'] = 'pending'
                spider_status[spider]['timestamp'] = None
                spider_status[spider]['job'] = job
        for p in self.root.launcher.processes.values():
            spider = p.spider
            spider_status[spider]['status'] = 'running'
            spider_status[spider]['timestamp'] = p.start_time
            spider_status[spider]['job'] = p.job
        for p in self.root.launcher.finished:
            spider = p.spider
            spider_status[spider]['status'] = 'finished'
            spider_status[spider]['timestamp'] = p.end_time
            spider_status[spider]['job'] = p.job

        for spider in spider_status:
            status = spider_status[spider]
            sjob = self.scheduler.get_job(spider)
            status['schedule_job'] = sjob
            if sjob:
                status['next_time'] = sjob.next_run_time
            else:
                status['next_time'] = None
            # sjob._get_run_times()

    def render_GET(self, txrequest):
        args = dict((k, v[0]) for k, v in txrequest.args.items())
        project = args.pop('project', 'careerTalk')
        spider_status = self.get_spider_status(project)
        content = "<tr>"
        for th in ['spider', 'status', 'timestamp', 'next_time', 'data']:
            content += "<th>%s</th>" % th
        content += "</tr>"
        for spider in spider_status:
            status = spider_status[spider]
            content += "<tr>"
            content += "<td>%s</td><td>%s</td><td>%s</td><td>%s</td>" \
                       % (spider, status['status'], status['timestamp'], status['next_time'])
            content += "<td><a href='/data/%s/'>data</a></td>" % spider
            content += "</tr>"
        sub_form = "<form action='' method='post'><input type='submit' value='Start all jobs'></input></form>"
        html = "<table>"+content+"</table>"+sub_form
        return html

    def render_POST(self, txrequest):
        args = dict((k, v[0]) for k, v in txrequest.args.items())
        project = args.pop('project', 'careerTalk')
        # spiders = ['NJU', 'BIT', 'ECUST', 'RUC']
        spiders = get_spider_list(project)

        tstart = dt.datetime.utcnow()
        for spider in spiders:
            job = self.scheduler.add_job(spider_crawl, 'interval', minutes=60, replace_existing=True,
                                         id=spider, next_run_time=tstart, args=[project, spider])
            tstart = tstart + dt.timedelta(seconds=5)
        return "<span>任务全部开启</span><a href='/'>返回</a>"
Example No. 17
import os
from apscheduler.schedulers.twisted import TwistedScheduler
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from newscomments.newscomments import settings as news_comments_settings
from newscomments.newscomments.spiders.news_comments_spider import NewsCommentsSpider
from utils.logger import get_logger

logger = get_logger('data_pipeline')
try:
    interval = int(os.getenv("PROCESS_INTERVAL", "3600"))

    settings = Settings()
    settings.setmodule(news_comments_settings)
    process = CrawlerProcess(settings)
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl,
                      'interval',
                      args=[NewsCommentsSpider.name],
                      seconds=interval,
                      id="regular_job")
    logger.info("===== Start data pipeline =====")
    # The scheduler doesn't run immediately, so need to explicitly run it for the first time, then start the scheduler
    process.crawl(NewsCommentsSpider.name)
    scheduler.start()
    process.start(False)
except Exception as ex:
    logger.error("Exception occurred on data pipeline: %s", ex)
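As an alternative to calling process.crawl() by hand for the first run, APScheduler's add_job also accepts next_run_time, which requests an immediate first execution once the reactor is running; a sketch of that variant of the same job:

# Sketch: the same interval job, but with an immediate first run requested via
# next_run_time, so the manual process.crawl(...) call before scheduler.start()
# is not needed.
from datetime import datetime

scheduler.add_job(process.crawl,
                  'interval',
                  args=[NewsCommentsSpider.name],
                  seconds=interval,
                  id="regular_job",
                  next_run_time=datetime.now())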
Example No. 18
        # "lianjia-cj-xm",
        # "lianjia-cj-hf",
    ])
    process = CrawlerProcess(get_project_settings())
    sloader = SpiderLoader(get_project_settings())
    scheduler = TwistedScheduler()
    hour = 3
    for spidername in sloader.list():
        # scheduler.add_job(task, 'cron', minute="*/20")
        if spidername in allow2:
            #https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html
            # scheduler.add_job(process.crawl, 'cron', args=[spidername], hour="*/" + str(hour))
            # scheduler.add_job(func=aps_test, args=('scheduled task',), trigger='cron', second='*/5')
            # scheduler.add_job(func=aps_test, args=('one-off task',),
            #                   next_run_time=datetime.datetime.now() + datetime.timedelta(seconds=12))
            # scheduler.add_job(func=aps_test, args=('recurring task',), trigger='interval', seconds=3)
            print(spidername)
            scheduler.add_job(process.crawl,
                              'cron',
                              args=[spidername],
                              next_run_time=datetime.datetime.now() +
                              datetime.timedelta(hours=3))
            hour += 2

    scheduler.start()
    process.start(False)
    try:
        while True:
            time.sleep(2)
    except (KeyboardInterrupt, SystemExit):
        scheduler.shutdown()
Example No. 19
    parser = argparse.ArgumentParser(description="Stock quotes crawler")
    parser.add_argument("--cron",
                        type=str,
                        const=False,
                        nargs='?',
                        help="Whether to run on the configured schedule; defaults to False")

    args = parser.parse_args()

    cron = str2bool(args.cron, False)

    process = CrawlerProcess(get_project_settings())

    # Set the log level
    logging.getLogger('scrapy.core.scraper').setLevel(logging.WARNING)

    if not cron:
        sequence_run()
        process.start()
    else:
        scheduler = TwistedScheduler()

        scheduler.add_job(sequence_run,
                          'cron',
                          day_of_week='mon-fri',
                          hour='9-15',
                          minute='0/30')
        scheduler.start()
        process.start(False)
Example No. 20
def start(verbose, debug, proxy, min, product, brand, serie, check, delay,
          news, days):
    def check_db():
        from DuTracker.tsdb import influxdb
        try:
            influxdb.ping()
        except Exception as e:
            log.error('InfluxDB connection failed')
            sys.exit(1)
        else:
            log.success('InfluxDB connection OK')

    if check: check_db()

    # https://stackoverflow.com/questions/44228851/scrapy-on-a-schedule
    settings = get_project_settings()

    if verbose: log.setLevel(logging.DEBUG)
    if proxy:
        settings['DOWNLOADER_MIDDLEWARES'].update(
            {'DuTracker.middlewares.RandomProxy': 760})
        settings['PROXY_URL'] = proxy
    if debug: settings['LOG_ENABLED'] = True
    if delay: settings['DOWNLOAD_DELAY'] = delay

    process = CrawlerProcess(settings)
    sched = TwistedScheduler()

    if brand:
        sched.add_job(process.crawl,
                      'interval',
                      args=[BrandSpider],
                      kwargs={
                          'auto': True,
                          'Ids': brand
                      },
                      days=1)
        process.crawl(BrandSpider, auto=True, Ids=brand)
    if serie:
        sched.add_job(process.crawl,
                      'interval',
                      args=[SerieSpider],
                      kwargs={
                          'auto': True,
                          'Ids': serie
                      },
                      days=1)
        process.crawl(SerieSpider, auto=True, Ids=serie)
    if brand or serie:
        sched.add_job(process.crawl,
                      'interval',
                      args=[ProductSpider],
                      kwargs={'fromDB': True},
                      days=1)
        process.crawl(ProductSpider, fromDB=True)
    process.crawl(TrackerSpider, soldNum_min=min, Ids=product)

    sched.add_job(process.crawl,
                  'interval',
                  args=[TrackerSpider],
                  kwargs={
                      'soldNum_min': min,
                      'Ids': product
                  },
                  hours=6)
    if news:
        sched.add_job(process.crawl,
                      'interval',
                      args=[TrackerSpider],
                      kwargs={
                          'newItem': True,
                          'days': days
                      },
                      hours=1)

    sched.add_job(sched.print_jobs, 'interval', hours=6)

    log.info('Starting product price tracking')
    sched.start()
    process.start(False)
Example No. 21
#
# @defer.inlineCallbacks
# def crawl():
#     yield runner.crawl(AnjukeSpider)
#     yield runner.crawl(BeikeSpider)
#     yield runner.crawl(LianjiaSpider)
#     reactor.stop()
#
# crawl()
# reactor.run() # the script will block here until the last crawl call is finished

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler

from scrapy_jojozu.spiders.anjuke import AnjukeSpider
from scrapy_jojozu.spiders.beike import BeikeSpider
from scrapy_jojozu.spiders.lianjia import LianjiaSpider
from scrapy_jojozu.spiders.fangtianxia import FangSpider
from scrapy_jojozu.spiders.douban import DoubanSpider

process = CrawlerProcess(get_project_settings())
scheduler = TwistedScheduler()
scheduler.add_job(process.crawl, 'interval', args=[AnjukeSpider], minutes=30)
scheduler.add_job(process.crawl, 'interval', args=[BeikeSpider], minutes=30)
scheduler.add_job(process.crawl, 'interval', args=[FangSpider], minutes=30)
scheduler.add_job(process.crawl, 'interval', args=[LianjiaSpider], minutes=30)
scheduler.add_job(process.crawl, 'interval', args=[DoubanSpider], minutes=30)
scheduler.start()
process.start(False)
Example No. 22
from scrapy.crawler import CrawlerProcess
from zywie_pinoy_scraper.spiders.testspider1 import Testspider1Spider
from zywie_pinoy_scraper.spiders.testspider2 import Testspider2Spider
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler

process = CrawlerProcess(get_project_settings())
sched = TwistedScheduler()
sched.add_job(process.crawl, 'cron', args=[Testspider1Spider], minute="*", second="59")
sched.add_job(process.crawl, 'cron', args=[Testspider2Spider], minute="*", second="59")
sched.start()
process.start(False)
Example No. 23
from scrapy.crawler import CrawlerProcess
from spiders.zhihu import ZhihuSpider
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler

process = CrawlerProcess(get_project_settings())
sched = TwistedScheduler()
sched.add_job(process.crawl, 'interval', args=[ZhihuSpider], seconds=300)
sched.start()
process.start(False)  # Do not stop reactor after spider closes
Example No. 24
from datetime import datetime, timedelta
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from scrapy.utils.reactor import install_reactor
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
from apscheduler.schedulers.twisted import TwistedScheduler
from twisted.internet import reactor
configure_logging()
scheduler = TwistedScheduler(reactor=reactor)
process = CrawlerProcess(get_project_settings())
scheduler.add_job(process.crawl,
                  'interval',
                  args=['mieszkania'],
                  minutes=15,
                  next_run_time=datetime.now())
#scheduler.add_job(lambda :print('activate'), 'interval', minutes=15, next_run_time=datetime.now() + timedelta(seconds=5))
scheduler.start()
reactor.run()
Example No. 25
"""
Demonstrates how to use the Twisted compatible scheduler to schedule a job that executes on 3
second intervals.
"""

from datetime import datetime
import os
from pytz import utc
from twisted.internet import reactor
from apscheduler.schedulers.twisted import TwistedScheduler


def tick():
    print('Tick! The time is: %s' % datetime.now())


if __name__ == '__main__':
    scheduler = TwistedScheduler(timezone=utc)
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        reactor.run()
    except (KeyboardInterrupt, SystemExit):
        pass
Example No. 26
import requests, pytz
from apscheduler.schedulers.twisted import TwistedScheduler
from twisted.internet import reactor

def send_request():
    requests.post("https://murmuring-reaches-84629.herokuapp.com/schedule.json", data = {
        'project': 'default',
        'spider': 'nba_scores',
        'city': 'los_angeles'
    })

if __name__ == '__main__':
    scheduler = TwistedScheduler(timezone = pytz.utc)
    scheduler.add_job(send_request, 'cron', day_of_week = 'mon-sun', hour = '12', minute = '0')
    scheduler.start()
    reactor.run()
Example No. 27
                    player_5 = player_5.first()
                else:
                    check = False
                # if all players exists then create the team
                if check:
                    team_model = Team.objects.create(name=team_dict["Name"], start_date=team_dict["Start_Date"],
                                                     end_date=team_dict["End_Date"], Player_1=player_1,
                                                     Player_2=player_2, Player_3=player_3,
                                                     Player_4=player_4, Player_5=player_5,
                                                     winning_percentage=team_dict["Winning_Percentage"])
                    team_model.save()

    def get_dates_from_response(self, response):
        """this method returns the start date and end date as datetime object from a response"""
        params_string = response.url.split("?")[-1]
        params_string = params_string.split("&")
        start_date = params_string[0].split("=")[-1]
        start_date = datetime.strptime(start_date, "%Y-%m-%d")
        end_date = params_string[1].split("=")[-1]
        end_date = datetime.strptime(end_date, "%Y-%m-%d")
        return start_date, end_date


if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl, args=[StatsSpider])
    scheduler.add_job(process.crawl, 'interval', args=[StatsSpider], seconds=60 * 60 * 6)
    scheduler.start()
    process.start(False)
Example No. 28
from scrapy.crawler import CrawlerProcess
# CrawlerProcess runs multiple Scrapy crawlers simultaneously in one process.
# This is the module that runs them automatically.
from scrapy.utils.project import get_project_settings
# The spiders passed to CrawlerProcess are picked up automatically, and get_project_settings
# returns a Settings instance populated with the project settings.
from apscheduler.schedulers.twisted import TwistedScheduler
# Scheduler class: https://apscheduler.readthedocs.io/en/stable/modules/schedulers/twisted.html
from module_03.spiders.mybots import MybotsSpider
# Import the spider.




process = CrawlerProcess(get_project_settings())
# CrawlerProcess must be instantiated with a Settings object; after that the crawler process can be used.
# https://docs.scrapy.org/en/latest/topics/practices.html?highlight=get_project_settings()#run-scrapy-from-a-script
scheduler = TwistedScheduler()
# Instantiate the scheduler so its features can be used.
scheduler.add_job(process.crawl, 'interval', args=[MybotsSpider], minutes=2)
# Add the job we need: run the spider on an 'interval' trigger every 2 minutes.
scheduler.start()
# Start the scheduler.
process.start(False)
# process.start()  # would block here until all crawling jobs are finished; passing False keeps the reactor running instead.

# After the pipelines are written, set the custom_settings values in mybots.py,
# then run `scrapy crawl mybots -s JOBDRI=`;
# this creates the crawl_mybot1 folder.
# Create mybots_starter.py with the contents above
# and run it from the command line with `python mybots_starter.py`.
Example No. 29
from apscheduler.schedulers.twisted import TwistedScheduler
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks
from autobahn.twisted.wamp import ApplicationSession


scheduler = TwistedScheduler()
scheduler.add_job(tick, 'interval', seconds=3)
scheduler.start()
Example No. 30
import datetime
import logging
import time

import apscheduler.events
from apscheduler.schedulers.twisted import TwistedScheduler

logging.basicConfig(format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S', filename='list.log', filemode='w')

n = 6000
ll = [0 for i in range(n)]

def func(i):
    ll[i] += 1

def err_lis(ev):
    logger = logging.getLogger("")
    logger.error(str(ev))

#scheduler = BackgroundScheduler()
scheduler = TwistedScheduler()
for i in range(n):
    start = datetime.datetime.now() + datetime.timedelta(seconds=i%10)
    scheduler.add_job(func, 'interval', args=(i,), start_date=start, seconds=10)

scheduler.add_listener(err_lis, apscheduler.events.EVENT_JOB_ERROR | apscheduler.events.EVENT_JOB_MISSED)
scheduler.start()
time.sleep(5)
scheduler.shutdown()
s = 0
for i in ll:
    s += i
print(s)
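Note that TwistedScheduler schedules its wakeups on the Twisted reactor, and this script never calls reactor.run(), so the interval jobs do not actually fire during time.sleep(5). The commented-out BackgroundScheduler is the variant that works in a plain blocking script; a minimal sketch of that swap (the rest of the script stays the same):

# Sketch: use the BackgroundScheduler hinted at in the commented-out line above.
# It runs jobs on its own thread pool, so they can fire during time.sleep(5)
# without a Twisted reactor.
from apscheduler.schedulers.background import BackgroundScheduler

scheduler = BackgroundScheduler()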