Example #1
 def flush(self):
     """Delete all run-time data generated by this crawler."""
     Queue.flush(self)
     Tag.delete(self)
     Event.delete(self)
     CrawlerState.flush(self)
     CrawlerRun.flush(self)
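
Each of these stores is keyed by the crawler instance itself, which is why `self` is passed to the class-level flush/delete methods. A minimal sketch of a store with that calling convention, assuming a plain in-memory dict keyed by crawler name (hypothetical; the real backends are external services):

    # Hypothetical store illustrating the delete(crawler) convention used
    # above. Tag._data maps crawler names to their recorded tags.
    class Tag:
        _data = {}

        @classmethod
        def delete(cls, crawler):
            # Drop everything recorded under this crawler's name.
            cls._data.pop(crawler.name, None)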
Example #2
 def emit(self,
          rule="pass",
          stage=None,
          data=None,
          delay=None,
          optional=False):
     """Invoke the next stage, either based on a handling rule, or by
     calling the `pass` rule by default."""
     # Avoid a shared mutable default for the task payload:
     if data is None:
         data = {}
     if stage is None:
         stage = self.stage.handlers.get(rule)
     if optional and stage is None:
         return
     if stage is None or stage not in self.crawler.stages:
         self.log.info("No next stage: %s (%s)" % (stage, rule))
         return
     if settings.DEBUG:
         # sampling rate is a float between 0.0 and 1.0. If it's 0.2, we
         # aim to execute only 20% of the crawler's tasks.
         sampling_rate = self.get("sampling_rate")
         if sampling_rate and random.random() > float(sampling_rate):
             self.log.info("Skipping emit due to sampling rate")
             return
     if is_sync_mode():
         # In sync mode we use an in-memory backend for the task queue.
         # Make a copy of the data to avoid mutation in that case.
         data = deepcopy(data)
     state = self.dump_state()
     stage = self.crawler.get(stage)
     delay = delay or self.params.get("delay", 0) or self.crawler.delay
     self.sleep(delay)
     Queue.queue(stage, state, data)
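
A hedged sketch of how emit() is typically called from a stage handler; the handler name, the "store" rule, and the payload fields are assumptions for illustration:

    # Hypothetical stage handler: pass parsed data on to whichever stage
    # the crawler config maps to the (assumed) 'store' rule.
    def parse(context, data):
        result = {"url": data["url"], "title": data.get("title")}
        # With no rule argument, emit() would follow the 'pass' rule.
        context.emit(rule="store", data=result)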
Example #3
 def enforce_rate_limit(self, rate_limit):
     """
     Enforce rate limit for a resource. If rate limit is exceeded, put the
     offending stage on a timeout (don't execute tasks for that stage for
     some time)
     """
     rate_limit.update()
     if not rate_limit.check():
         Queue.timeout(self.stage, rate_limit=rate_limit)
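
The rate_limit argument only needs to expose update() and check(). One possible shape is a fixed-window counter; the sketch below is an assumption, not the project's actual class:

    import time

    # Hypothetical fixed-window rate limit with the update()/check()
    # interface expected by enforce_rate_limit() above.
    class RateLimit:
        def __init__(self, limit, interval=60.0):
            self.limit = limit        # max calls allowed per window
            self.interval = interval  # window length in seconds
            self.window = None
            self.count = 0

        def update(self):
            # Record one call, resetting the counter on a new window.
            window = int(time.time() / self.interval)
            if window != self.window:
                self.window, self.count = window, 0
            self.count += 1

        def check(self):
            # True while this window is still under the limit.
            return self.count <= self.limit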
Example #4
 def emit(self, rule='pass', stage=None, data=None, delay=None):
     """Invoke the next stage, either based on a handling rule, or by calling
     the `pass` rule by default."""
     # Avoid a shared mutable default for the task payload:
     if data is None:
         data = {}
     if stage is None:
         stage = self.stage.handlers.get(rule)
     if stage is None or stage not in self.crawler.stages:
         self.log.info("No next stage: %s (%s)" % (stage, rule))
         return
     state = self.dump_state()
     delay = delay or self.crawler.delay
     Queue.queue(stage, state, data, delay)
Example #5
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            "crawler": self.name,
            "run_id": run_id or Job.random_id(),
            "incremental": settings.INCREMENTAL,
            "continue_on_error": settings.CONTINUE_ON_ERROR,
        }
        if incremental is not None:
            state["incremental"] = incremental

        # Cancel previous runs:
        self.cancel()
        init_stage = self.get(self.init_stage)
        Queue.queue(init_stage, state, {})
Example #6
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            'crawler': self.name,
            'run_id': run_id or Job.random_id(),
            'incremental': settings.INCREMENTAL
        }
        if incremental is not None:
            state['incremental'] = incremental

        # Cancel previous runs:
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        Queue.queue(self.init_stage, state, {})
Example #7
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            'crawler': self.name,
            'run_id': run_id,
            'incremental': settings.INCREMENTAL
        }
        if incremental is not None:
            state['incremental'] = incremental

        Queue.queue(self.init_stage, state, {})

        if not settings.REDIS_HOST:
            TaskRunner.run()
Example #8
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            "crawler": self.name,
            "run_id": run_id or Job.random_id(),
            "incremental": settings.INCREMENTAL,
        }
        if incremental is not None:
            state["incremental"] = incremental

        # Cancel previous runs:
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        init_stage = self.get(self.init_stage)
        Queue.queue(init_stage, state, {})
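
Queuing a crawl is then just a run() call on a crawler object. A hedged driver sketch, reusing the iterable manager from Example #10 below; the crawler name is made up:

    # Hypothetical driver: find a crawler by name and queue a fresh,
    # incremental run of it.
    for crawler in manager:
        if crawler.name == "my_crawler":
            crawler.run(incremental=True)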
Example #9
    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            'crawler': self.name,
            'run_id': run_id,
            'incremental': settings.INCREMENTAL
        }
        if incremental is not None:
            state['incremental'] = incremental

        # Cancel previous runs:
        self.cancel()
        # Flush out previous events:
        Event.delete(self)
        Queue.queue(self.init_stage, state, {})

        if not settings.REDIS_HOST:
            TaskRunner.run()
Example #10
def index():
    """List the available crawlers."""
    crawler_list = []
    for crawler in manager:
        is_due = 'yes' if crawler.check_due() else 'no'
        if crawler.disabled:
            is_due = 'off'
        crawler_list.append([
            crawler.name, crawler.description, crawler.schedule, is_due,
            Queue.size(crawler)
        ])
    headers = ['Name', 'Description', 'Schedule', 'Due', 'Pending']
    print(tabulate(crawler_list, headers=headers))
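
tabulate renders the rows as an aligned plain-text table. A standalone sketch with invented values, showing only the shape of what index() prints:

    from tabulate import tabulate

    # Illustrative rows in the same shape index() builds; values are made up.
    rows = [["example_crawler", "A demo crawler", "daily", "yes", 0]]
    headers = ["Name", "Description", "Schedule", "Due", "Pending"]
    print(tabulate(rows, headers=headers))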
Example #11
 def emit(self, rule='pass', stage=None, data=None, delay=None,
          optional=False):
     """Invoke the next stage, either based on a handling rule, or by
     calling the `pass` rule by default."""
     # Avoid a shared mutable default for the task payload:
     if data is None:
         data = {}
     if stage is None:
         stage = self.stage.handlers.get(rule)
     if optional and stage is None:
         return
     if stage is None or stage not in self.crawler.stages:
         self.log.info("No next stage: %s (%s)" % (stage, rule))
         return
     if settings.DEBUG:
         # sampling rate is a float between 0.0 and 1.0. If it's 0.2, we
         # aim to execute only 20% of the crawler's tasks.
         sampling_rate = self.get('sampling_rate')
         if sampling_rate and random.random() > float(sampling_rate):
             self.log.info("Skipping emit due to sampling rate")
             return
     state = self.dump_state()
     stage = self.crawler.get(stage)
     delay = delay or self.params.get('delay', 0) or self.crawler.delay
     self.sleep(delay)
     Queue.queue(stage, state, data)
Example #12
 def is_running(self):
     """Is the crawler currently running?"""
     return Queue.is_running(self)
Example #13
 def cancel(self):
     """Flush this crawler's pending tasks from the queue."""
     Queue.flush(self)
Example #14
 def cancel(self):
     """Abort all active runs and flush pending tasks from the queue."""
     Crawl.abort_all(self)
     Queue.flush(self)