예제 #1
0
    def check_jobs(self):
        """Distribute the URLs of every job over the stager servers.

        Nothing happens unless ``check_time(self._last_jobs_check,
        JOBS_CHECK_TIME)`` is truthy. Otherwise ``share_urls`` of each job is
        iterated; it yields a URL, the stager server the URL is assigned to
        and the backup location. The URL is sent to its assigned stager
        server::

            JOB_URL :obj:`webarchiver.url.UrlConfig`

        and to the backup location, together with the listener of the
        assigned server::

            JOB_URL_BACKUP :obj:`webarchiver.url.UrlConfig`
                <listener assigned server>

        Afterwards the time of this check is stored in
        ``_last_jobs_check``.

        Note:
            If the yielded server is None, the URL is assigned to the current
            stager server. It is then handled locally via
            ``_command_job_url`` and ``self._socket`` is used as the assigned
            server in the backup message.
        """
        if not check_time(self._last_jobs_check, JOBS_CHECK_TIME):
            return
        for job in self._jobs.values():
            for urlconfig, assigned, backups in job.share_urls():
                if assigned is not None:
                    self._write_socket_message(assigned, 'JOB_URL', urlconfig)
                else:
                    logger.debug('Assigning URL %s to self.', urlconfig)
                    # No remote server: this server takes the URL itself.
                    assigned = self._socket
                    self._command_job_url(None, [None, urlconfig])
                # The backup location always learns who holds the URL.
                self._write_socket_message(backups, 'JOB_URL_BACKUP',
                                           urlconfig, assigned.listener)
        self._last_jobs_check = time.time()
예제 #2
0
    def chosen(self):
        """Choose a stager server for upload and remember the choice.

        A stager server is only selected if at least one permission for
        upload was granted. Once a selection is stored in ``_chosen`` it is
        returned on every later call without being re-evaluated.

        Note:
            Other functions using this attribute delete the WarcFile if False
            is received, since there is no implementation to 'reset' a
            :class:`WarcFile` object.

        Returns:
            :obj:`webarchiver.server.base.Node` or bool or NoneType: None on
                the first call (which only records the current time in
                ``_last_answer``) and as long as ``REQUEST_UPLOAD_TIME``
                seconds have not passed according to ``check_time``. False
                if the time has passed but no permission was granted.
                Otherwise the selection drawn from the granted stager
                servers.

                If a decision was previously made, the same decision is
                returned.

                NOTE(review): the stored value is whatever
                ``sample(self._granted, 1)`` returns, without indexing;
                confirm whether that is a single Node or a one-element
                collection.
        """
        if hasattr(self, '_chosen'):
            return self._chosen
        if self._last_answer == 0:
            # First call: start the waiting period, nothing to decide yet.
            self._last_answer = time.time()
            return None
        if not check_time(self._last_answer, REQUEST_UPLOAD_TIME):
            return None
        if not self._granted:
            return False
        self._chosen = sample(self._granted, 1)
        return self._chosen
예제 #3
0
    def ping(self):
        """Ping all stager servers.

        When ``check_time(self._last_ping, PING_TIME)`` is truthy, a ping
        message is sent to the stager servers::

            PING

        Before the message is sent, ``pong`` is set to False on every stager
        server, marking it as not having replied yet. The time of the ping
        is then stored in ``_last_ping``.
        """
        if not check_time(self._last_ping, PING_TIME):
            return
        for key in self._stager:
            self._stager[key].pong = False
        self._write_socket_message(self._stager, 'PING')
        self._last_ping = time.time()
예제 #4
0
    def request_stager(self):
        """Request new stager servers to connect to.

        If new stager servers are needed (``stager_needed`` is positive) and
        ``check_time(self._last_stager_request, REQUEST_STAGER_TIME)`` is
        truthy, a request is made to a stager server drawn with ``sample``.
        The request asks for a number of stager servers and lists the
        listeners of the stager servers that are already connected::

            REQUEST_STAGER <number stagers needed> <listeners of connected
                stager servers>

        The time of the request is stored in ``_last_stager_request``.
        """
        if self.stager_needed <= 0:
            return
        if not check_time(self._last_stager_request, REQUEST_STAGER_TIME):
            return
        target = sample(self._stager, 1)
        needed = self.stager_needed
        listeners = [stager.listener for stager in self._stager]
        self._write_socket_message(target, 'REQUEST_STAGER', needed,
                                   *listeners)
        self._last_stager_request = time.time()
예제 #5
0
    def ping(self):
        """Ping the connected servers.

        When ``check_time(self._last_ping, PING_TIME)`` is truthy, both the
        stager servers and the crawler servers are pinged, in that order::

            PING

        Before each group is pinged, ``pong`` is set to False on every
        server of that group, so a pong is expected from it. The time of the
        ping is then stored in ``_last_ping``.
        """
        if not check_time(self._last_ping, PING_TIME):
            return
        # Stagers first, then crawlers: reset the flags, then send the ping.
        for group in (self._stager, self._crawlers):
            for key in group:
                group[key].pong = False
            self._write_socket_message(group, 'PING')
        self._last_ping = time.time()
예제 #6
0
    def finish_jobs(self):
        """Check the running jobs for being finished.

        When ``check_time(self._last_finish_check, FINISH_CHECK_TIME)`` is
        truthy, every job whose ``crawlers_finished`` is truthy is reported
        to the stager servers running that job (``job.stagers``)::

            STAGER_JOB_FINISHED <job identifier>

        The time of the check is then stored in ``_last_finish_check``.

        Note:
            A finished job means that the job currently is not active. It can
            become active again if new URLs are sent to it.
        """
        # TODO what do if a job is fully job.finished?
        if check_time(self._last_finish_check, FINISH_CHECK_TIME):
            for identifier, job in self._jobs.items():
                if not job.crawlers_finished:
                    continue
                self._write_socket_message(job.stagers, 'STAGER_JOB_FINISHED',
                                           identifier)
            self._last_finish_check = time.time()
        return None
예제 #7
0
    def request_url_quota(self):
        """Request a quota of URLs to crawl for a job.

        When ``check_time(self._last_url_quota, URL_QUOTA_TIME)`` is truthy,
        the time of the request is stored in ``_last_url_quota``. If there
        are jobs, one job is picked by passing the ``received_url_quota`` of
        every job to ``key_lowest_value`` (presumably the job that received
        the lowest quota so far; verify against that helper). A stager
        server drawn with ``sample`` from the stagers of that job is asked
        for a URL quota::

            REQUEST_URL_QUOTA <job identifier>
        """
        if not check_time(self._last_url_quota, URL_QUOTA_TIME):
            return None
        # The timestamp is refreshed even when there is no job to ask for.
        self._last_url_quota = time.time()
        if not self._jobs:
            return None
        quotas = {}
        for identifier in self._jobs:
            quotas[identifier] = self._jobs[identifier].received_url_quota
        selected = key_lowest_value(quotas)
        stager = sample(self._jobs[selected].stagers, 1)[0]
        self._write_socket_message(stager, 'REQUEST_URL_QUOTA', selected)