def check_jobs(self):
    """Distribute the URLs of every job to their assigned stager servers.

    Nothing happens unless
    ``check_time(self._last_jobs_check, JOBS_CHECK_TIME)`` is truthy.

    Each job's ``share_urls`` yields ``(urlconfig, server, backups)``
    triples. The URL is sent to its assigned stager server::

        JOB_URL :obj:`webarchiver.url.UrlConfig`

    and to the backup location(s), together with the listener of the
    assigned server::

        JOB_URL_BACKUP :obj:`webarchiver.url.UrlConfig`
            <listener assigned server>

    Note:
        A yielded server of None means the URL is assigned to this
        stager server itself. In that case no ``JOB_URL`` message is
        written; the URL is handed straight to ``_command_job_url`` and
        ``self._socket`` supplies the listener for the backup message.
    """
    if not check_time(self._last_jobs_check, JOBS_CHECK_TIME):
        return
    for running_job in self._jobs.values():
        for urlconfig, assigned, backup_servers in running_job.share_urls():
            if assigned is not None:
                self._write_socket_message(assigned, 'JOB_URL', urlconfig)
            else:
                logger.debug('Assigning URL %s to self.', urlconfig)
                # From here on this server acts as the assigned server.
                assigned = self._socket
                self._command_job_url(None, [None, urlconfig])
            self._write_socket_message(backup_servers, 'JOB_URL_BACKUP',
                                       urlconfig, assigned.listener)
    self._last_jobs_check = time.time()
def chosen(self):
    """Pick the stager server to upload to, remembering the decision.

    A server is only picked when at least one upload permission has been
    granted. Once a server has been picked it is stored in ``_chosen``
    and returned unchanged on every later call.

    Note:
        According to the original documentation, other code using this
        attribute deletes the WarcFile when False is received, since
        there is no implementation to 'reset' a :class:`WarcFile`
        object.

    Returns:
        :obj:`webarchiver.server.base.Node` or bool or NoneType: None on
        the very first call (``_last_answer`` is still 0; the call only
        records the current time) and while
        ``check_time(self._last_answer, REQUEST_UPLOAD_TIME)`` is falsy.
        False when the wait is over but no permission was granted.
        Otherwise the randomly picked entry from ``_granted``, which is
        the same value on every later call.
    """
    # A decision that was made earlier is final.
    if hasattr(self, '_chosen'):
        return self._chosen
    if self._last_answer == 0:
        # First call: start the waiting period, nothing to decide yet.
        self._last_answer = time.time()
        return None
    waited_long_enough = check_time(self._last_answer, REQUEST_UPLOAD_TIME)
    if not waited_long_enough:
        return None
    if len(self._granted) == 0:
        return False
    # NOTE(review): ``sample(..., 1)`` yields a one-element list, not the
    # bare element -- confirm callers expect the list form.
    self._chosen = sample(self._granted, 1)
    return self._chosen
def ping(self):
    """Ping every stager server.

    Only acts when ``check_time(self._last_ping, PING_TIME)`` is truthy.
    The message sent to the stager servers is::

        PING

    Before sending, the ``pong`` flag of every stager server is cleared,
    marking each of them as not having replied yet.
    """
    if not check_time(self._last_ping, PING_TIME):
        return
    for stager in self._stager:
        self._stager[stager].pong = False
    self._write_socket_message(self._stager, 'PING')
    self._last_ping = time.time()
def request_stager(self):
    """Ask a random stager server for more stager servers to connect to.

    A request is only made when ``stager_needed`` is positive and
    ``check_time(self._last_stager_request, REQUEST_STAGER_TIME)`` is
    truthy. The request names how many stager servers are needed and
    lists the listeners of the stager servers already connected::

        REQUEST_STAGER <number stagers needed>
            <listeners of connected stager servers>
    """
    if not (self.stager_needed > 0
            and check_time(self._last_stager_request,
                           REQUEST_STAGER_TIME)):
        return
    # The result of ``sample`` is passed on as-is (a one-element list).
    recipient = sample(self._stager, 1)
    needed = self.stager_needed
    connected_listeners = [stager.listener for stager in self._stager]
    self._write_socket_message(recipient, 'REQUEST_STAGER', needed,
                               *connected_listeners)
    self._last_stager_request = time.time()
def ping(self):
    """Ping the connected stager and crawler servers.

    Only acts when ``check_time(self._last_ping, PING_TIME)`` is truthy.
    Both groups of servers receive::

        PING

    The ``pong`` flag of each server is set to False before its group is
    pinged, so a pong is expected from every server.
    """
    if not check_time(self._last_ping, PING_TIME):
        return
    # Stager servers first ...
    for stager in self._stager:
        self._stager[stager].pong = False
    self._write_socket_message(self._stager, 'PING')
    # ... then the crawler servers.
    for crawler in self._crawlers:
        self._crawlers[crawler].pong = False
    self._write_socket_message(self._crawlers, 'PING')
    self._last_ping = time.time()
def finish_jobs(self):
    """Report jobs whose crawlers have finished to the other stagers.

    Only acts when
    ``check_time(self._last_finish_check, FINISH_CHECK_TIME)`` is truthy.
    For every job with a truthy ``crawlers_finished``, the stager
    servers in ``job.stagers`` are told::

        STAGER_JOB_FINISHED <job identifier>

    Note:
        A finished job means that the job currently is not active. It
        can become active again if new URLs are sent to it.
    """
    #TODO what do if a job is fully job.finished?
    if not check_time(self._last_finish_check, FINISH_CHECK_TIME):
        return
    for identifier, job in self._jobs.items():
        if not job.crawlers_finished:
            continue
        self._write_socket_message(job.stagers, 'STAGER_JOB_FINISHED',
                                   identifier)
    self._last_finish_check = time.time()
def request_url_quota(self):
    """Request a quota of URLs to crawl for one of the running jobs.

    Only acts when ``check_time(self._last_url_quota, URL_QUOTA_TIME)``
    is truthy; the timestamp is refreshed even when there are no jobs.
    The job is picked by passing a mapping of job to
    ``received_url_quota`` to ``key_lowest_value``. A random stager
    server of that job is then asked::

        REQUEST_URL_QUOTA <job identifier>
    """
    if not check_time(self._last_url_quota, URL_QUOTA_TIME):
        return
    self._last_url_quota = time.time()
    if len(self._jobs) == 0:
        return None
    quota_per_job = {
        identifier: self._jobs[identifier].received_url_quota
        for identifier in self._jobs
    }
    job = key_lowest_value(quota_per_job)
    recipient = sample(self._jobs[job].stagers, 1)[0]
    self._write_socket_message(recipient, 'REQUEST_URL_QUOTA', job)