def chosen(self):
    """Choose a stager server for the upload and remember the decision.

    Once a decision has been stored in ``_chosen`` it is returned on every
    later call. Until then each call works through these steps in order:

    * If ``_last_answer`` is still ``0``, the current time is stored in it
      and ``None`` is returned.
    * If ``check_time(self._last_answer, REQUEST_UPLOAD_TIME)`` is falsy,
      ``None`` is returned (presumably ``REQUEST_UPLOAD_TIME`` seconds have
      not passed yet).
    * If no permission for upload has been granted, ``False`` is returned.
      This outcome is not stored, so the next call checks again.
    * Otherwise one granted stager server is drawn at random and stored.

    Note:
        Other functions using this attribute delete the WarcFile when
        ``False`` is received, since there is no implementation to reset
        a :class:`WarcFile` object.

    Returns:
        ``None``, ``False`` or the stored result of
        ``sample(self._granted, 1)``.

        NOTE(review): other uses of ``sample(..., 1)`` in this code base
        index the result with ``[0]``, so what is stored here is
        presumably a one-element sequence holding the
        :class:`webarchiver.server.base.Node` rather than the bare node.
        Confirm against the callers before changing it.
    """
    # A decision that was made earlier is final.
    if hasattr(self, '_chosen'):
        return self._chosen

    # First call: only start the clock, nothing can be decided yet.
    if self._last_answer == 0:
        self._last_answer = time.time()
        return None

    waited_long_enough = check_time(self._last_answer, REQUEST_UPLOAD_TIME)
    if not waited_long_enough:
        return None

    # Nobody granted the upload; deliberately not remembered.
    if len(self._granted) == 0:
        return False

    self._chosen = sample(self._granted, 1)
    return self._chosen
def found_urls(self):
    """Report discovered URLs to the stager servers.

    URLs discovered during a crawl are collected in the found URLs set as
    :class:`webarchiver.url.UrlConfig` objects. Every URL in the set is
    looked at once:

    * URLs the job reports as already archived are dropped.
    * URLs the job does not allow are dropped.
    * Every remaining URL is sent to one randomly chosen stager server of
      the job the URL was discovered in::

        JOB_URL_DISCOVERED :obj:`webarchiver.url.UrlConfig`

    All URLs that were looked at, whether sent or dropped, are removed from
    the set afterwards. Progress is written to the debug log instead of
    being printed to standard output.

    Returns:
        NoneType: Always None.
    """
    if len(self._found_urls_set) == 0:
        return None
    finished = set()
    logger.debug('Reporting finished URLs.')
    with self._found_urls_set.lock:
        for urlconfig in self._found_urls_set:
            finished.add(urlconfig)
            job = self._jobs[urlconfig.job_identifier]
            if job.archived_url(urlconfig):
                continue
            logger.debug('Discovered URL %s (parent %s, depth %s).',
                         urlconfig.url, urlconfig.parent_url,
                         urlconfig.depth)
            if not job.allowed_url(urlconfig):
                continue
            logger.debug('URL %s passed (parent %s, depth %s).',
                         urlconfig.url, urlconfig.parent_url,
                         urlconfig.depth)
            stager = sample(job.stagers, 1)[0]
            self._write_socket_message(stager, 'JOB_URL_DISCOVERED',
                                       urlconfig)
    # Only the URLs that were iterated are removed; anything added to the
    # set in the meantime stays queued for the next call.
    self._found_urls_set.difference_update(finished)
def request_stager(self):
    """Ask a connected stager server for more stager servers.

    Nothing happens unless ``stager_needed`` is greater than zero and
    ``check_time(self._last_stager_request, REQUEST_STAGER_TIME)`` is
    truthy (presumably: the previous request was long enough ago).

    Otherwise a randomly drawn connected stager server is asked to send a
    number of stager servers this crawler server is not yet connected to::

        REQUEST_STAGER <number stagers needed>
            <listeners of connected stager servers>

    and the time of the request is stored in ``_last_stager_request``.
    """
    if not self.stager_needed > 0:
        return
    if not check_time(self._last_stager_request, REQUEST_STAGER_TIME):
        return

    target = sample(self._stager, 1)
    needed = self.stager_needed
    # Sent along so the receiver knows which stagers are already connected.
    connected = [node.listener for node in self._stager]
    self._write_socket_message(target, 'REQUEST_STAGER', needed, *connected)
    self._last_stager_request = time.time()
def request_url_quota(self):
    """Request a quota of URLs to crawl for one job.

    Nothing happens while ``check_time(self._last_url_quota,
    URL_QUOTA_TIME)`` is falsy (presumably: fewer than ``URL_QUOTA_TIME``
    seconds have passed since the last request). Otherwise the time of
    this attempt is stored, also when there are no jobs to ask for.

    With at least one job, ``key_lowest_value`` picks a job from the
    mapping of job identifier to ``received_url_quota`` (presumably the
    job that received the lowest quota so far). A randomly chosen stager
    server of that job is then asked for an URL quota::

        REQUEST_URL_QUOTA <job identifier>

    Returns:
        NoneType: Always None.
    """
    if not check_time(self._last_url_quota, URL_QUOTA_TIME):
        return None
    self._last_url_quota = time.time()

    if len(self._jobs) == 0:
        return None

    received = {}
    for identifier in self._jobs:
        received[identifier] = self._jobs[identifier].received_url_quota
    job = key_lowest_value(received)

    target = sample(self._jobs[job].stagers, 1)[0]
    self._write_socket_message(target, 'REQUEST_URL_QUOTA', job)
def _command_request_stager(self, s, message): """Processes the ``REQUEST_STAGER`` command. Send back a number of stager server to the crawler server that is requesting them. Each stager server is send with:: ADD_STAGER <listener of the stager server> Args: s (:obj:`webarchiver.server.base.Node`): The crawler server that queued the command. message (list): The command that was received:: REQUEST_STAGER <number stagers needed> <listeners of connected stager servers> """ listeners = message[2:] for s_ in sample(self._stager, message[1]): if s_.listener not in listeners: self._write_socket_message(s, 'ADD_STAGER', s_.listener)
def job_add_stager(self, identifier, listeners=None, initial=True):
    """Add stager servers to a job.

    If a list of listeners is given, the stager servers behind these
    listeners get the job assigned. Otherwise stager servers are chosen
    at random to fill the job up to ``MAX_STAGER`` stager servers.

    The job is announced to the newly assigned stager servers::

        NEW_JOB <job configuration>

    If this is the initial time the job is shared among stager servers,
    every stager server of the job is told about the other ones::

        NEW_JOB_STAGER <job identifier> <address of this stager server>
            <listeners of the other assigned stager servers>

    A counter for the URL quota of the job is then randomly selected
    among the stager servers of the job and shared with all of them::

        JOB_SET_COUNTER <job identifier> <listener of counter>

    If this is not the initial sharing of the job, the job is instead
    confirmed to the stager servers of the job::

        CONFIRMED_JOB 0 <job identifier>

    Args:
        identifier (str): The job identifier.
        listeners (list of tuples, optional): List of tuples (host, port)
            of stager servers that should be connected to the job. An
            empty list assigns no additional stager server. If None,
            stager servers are chosen at random. Default is None.
        initial (bool, optional): Whether this is the initial sharing of
            the job among stager servers. Default is True.

    Returns:
        bool: True if the job is known and was processed, False if the
        job is not known.

    Raises:
        KeyError: If one of ``listeners`` is not a known listener.
    """
    logger.debug('Adding stagers to job %s.', identifier)
    if identifier not in self._jobs:
        logger.warning('Job %s does not exist.', identifier)
        return False
    job = self._jobs[identifier]
    if listeners is not None:
        logger.debug('Assigning job %s to listeners %s.', identifier,
                     listeners)
        stager = [self._listeners[listener] for listener in listeners]
    else:
        stager = sample(self._stager, max(0, MAX_STAGER - len(job.stagers)))
    for s in stager:
        job.add_stager(s)
    # Only the newly assigned stager servers still need the job settings.
    self._write_socket_message(stager, 'NEW_JOB', job.settings)
    if initial:
        for s in job.stagers:
            self._write_socket_message(
                s, 'NEW_JOB_STAGER', identifier, self._address,
                *[d.listener for d in job.stagers if d != s])
        # NOTE(review): this indexes the sample result, so it presumably
        # fails when the job ends up without any stager server - confirm
        # whether that situation can occur.
        counter = sample(job.stagers, 1)[0]
        self._write_socket_message(job.stagers, 'JOB_SET_COUNTER',
                                   identifier, counter.listener)
        job.add_counter(counter)
    else:
        self._write_socket_message(job.stagers, 'CONFIRMED_JOB', 0,
                                   identifier)
    return True