예제 #1
0
    def chosen(self):
        """Picks, once, the stager server that will receive the upload.

        The decision is cached in ``self._chosen`` the first time a positive
        choice is made; later calls return that cached value without doing
        any further work.

        Note:
            Other functions using this attribute delete the WarcFile if False
            is received, since there is no implementation to 'reset' a
            :class:`WarcFile` object.

        Returns:
            list or bool or NoneType: One of the following.

                * None on the very first call (``_last_answer`` is still 0;
                  the current time is stored as the start of the waiting
                  period), and on later calls for as long as
                  ``check_time(self._last_answer, REQUEST_UPLOAD_TIME)`` is
                  falsy.
                * False if the waiting period is over and ``self._granted`` is
                  empty. This outcome is not cached.
                * Otherwise the result of ``sample(self._granted, 1)``, which
                  is stored and returned on every later call.

                NOTE(review): if ``sample`` is :func:`random.sample` this is a
                one-element list holding the chosen
                :class:`webarchiver.server.base.Node`, not the bare node --
                confirm against the callers.
        """
        # A decision that was made earlier is final.
        if hasattr(self, '_chosen'):
            return self._chosen

        # First call: start the clock and report that nothing is known yet.
        if self._last_answer == 0:
            self._last_answer = time.time()
            return None

        waited_long_enough = check_time(self._last_answer,
                                        REQUEST_UPLOAD_TIME)
        if not waited_long_enough:
            return None

        # Nobody granted permission; deliberately not remembered.
        if len(self._granted) == 0:
            return False

        self._chosen = sample(self._granted, 1)
        return self._chosen
예제 #2
0
    def found_urls(self):
        """Reports discovered URLs to the stager servers.

        The URLs discovered in a crawl are collected in
        ``self._found_urls_set`` as :class:`webarchiver.url.UrlConfig`
        objects. While holding the lock of that set, every URL that is neither
        already archived for its job nor rejected by the job's
        ``allowed_url`` check is sent to a randomly chosen stager server
        connected to the job the URL was discovered in::

            JOB_URL_DISCOVERED :obj:`webarchiver.url.UrlConfig`

        Every URL that was looked at -- sent or skipped -- is removed from the
        set afterwards.

        Returns:
            NoneType: Always None; the method returns early when the set is
                empty.
        """
        if len(self._found_urls_set) == 0:
            return None
        finished = set()
        logger.debug('Reporting finished URLs.')
        with self._found_urls_set.lock:
            for urlconfig in self._found_urls_set:
                # Mark as handled up front so skipped URLs are dropped too.
                finished.add(urlconfig)
                job = self._jobs[urlconfig.job_identifier]
                if job.archived_url(urlconfig):
                    continue
                # Diagnostics go through the logger instead of stdout.
                logger.debug('Discovered URL %s (parent %s, depth %s).',
                             urlconfig.url, urlconfig.parent_url,
                             urlconfig.depth)
                if not job.allowed_url(urlconfig):
                    continue
                logger.debug('URL %s (parent %s, depth %s) is allowed.',
                             urlconfig.url, urlconfig.parent_url,
                             urlconfig.depth)
                stager = sample(job.stagers, 1)[0]
                self._write_socket_message(stager, 'JOB_URL_DISCOVERED',
                                           urlconfig)
            # Remove after the loop; the set must not change while iterated.
            self._found_urls_set.difference_update(finished)
예제 #3
0
    def request_stager(self):
        """Asks a connected stager server for additional stager servers.

        Nothing happens unless ``self.stager_needed`` is positive and
        ``check_time(self._last_stager_request, REQUEST_STAGER_TIME)`` is
        truthy (presumably: the previous request was long enough ago). When
        both hold, a request is written to a randomly sampled stager server,
        listing the listeners of all stager servers this server already
        knows so that different ones can be returned::

            REQUEST_STAGER <number stagers needed> <listeners of connected
                stager servers>

        The time of the request is then stored in
        ``self._last_stager_request``.
        """
        if not (self.stager_needed > 0):
            return
        if not check_time(self._last_stager_request, REQUEST_STAGER_TIME):
            return

        # ``sample`` result is handed over as-is (a sample of size one).
        recipient = sample(self._stager, 1)
        amount = self.stager_needed
        known_listeners = [stager.listener for stager in self._stager]
        self._write_socket_message(recipient, 'REQUEST_STAGER', amount,
                                   *known_listeners)
        self._last_stager_request = time.time()
예제 #4
0
    def request_url_quota(self):
        """Requests a quota of URLs to crawl for one job.

        The method only acts when
        ``check_time(self._last_url_quota, URL_QUOTA_TIME)`` is truthy
        (presumably once `URL_QUOTA_TIME` seconds have passed). The timestamp
        is refreshed before anything else, so an attempt without jobs also
        restarts the waiting period.

        The job is picked by passing a mapping of job identifier to
        ``received_url_quota`` to ``key_lowest_value`` (by its name, the job
        that received the least quota so far). A random stager connected to
        that job is then asked for a quota::

            REQUEST_URL_QUOTA <job identifier>

        Returns:
            NoneType: Always None.
        """
        if not check_time(self._last_url_quota, URL_QUOTA_TIME):
            return None
        self._last_url_quota = time.time()
        if len(self._jobs) == 0:
            return None

        quota_per_job = {}
        for identifier in self._jobs:
            quota_per_job[identifier] = \
                self._jobs[identifier].received_url_quota
        job = key_lowest_value(quota_per_job)

        stager = sample(self._jobs[job].stagers, 1)[0]
        self._write_socket_message(stager, 'REQUEST_URL_QUOTA', job)
        return None
예제 #5
0
    def _command_request_stager(self, s, message):
        """Processes the ``REQUEST_STAGER`` command.

        Send back a number of stager server to the crawler server that is
        requesting them. Each stager server is send with::

            ADD_STAGER <listener of the stager server>

        Args:
            s (:obj:`webarchiver.server.base.Node`): The crawler server that
                queued the command.
            message (list): The command that was received::

                    REQUEST_STAGER <number stagers needed>
                        <listeners of connected stager servers>
        """
        listeners = message[2:]
        for s_ in sample(self._stager, message[1]):
            if s_.listener not in listeners:
                self._write_socket_message(s, 'ADD_STAGER',
                                           s_.listener)
예제 #6
0
    def job_add_stager(self, identifier, listeners=None, initial=True):
        """Adds stager servers to a job.

        If a list of listeners is given, the stager servers registered under
        these listeners get the job assigned. Otherwise stager servers are
        randomly chosen to fill the number of stager servers that can still
        be connected to the job (``MAX_STAGER`` minus the ones already
        assigned, limited to the number of available stager servers). The
        job is announced to the selected stager servers::

            NEW_JOB <job configuration>

        If this is the initial time the job is shared among stager servers, a
        list of listeners of the assigned stager servers is sent to each
        assigned stager server::

            NEW_JOB_STAGER <job identifier> <listener this stager server>
            <listeners of assigned stager server>

        A counter for the URL quota for the job is randomly selected and
        shared with the assigned stager servers::

            JOB_SET_COUNTER <job identifier> <listener of counter>

        If however this is not the initial stager server sharing the job,
        this function is called with a command to add a number of stagers to
        a job. In that case the job is confirmed to the stager servers::

            CONFIRMED_JOB 0 <job identifier>

        Args:
            identifier (str): The job identifier.
            listeners (list of tuples, optional): List of tuples (host, port)
                of stager servers that should be connected to the job. Every
                entry must be a key of ``self._listeners``. If None, stager
                servers are chosen randomly. Default is None.
            initial (bool, optional): Whether this is the initial stager
                server for the job. Default is True.

        Returns:
            bool: True if the stagers are added, False if the job is not
                known.
        """
        logger.debug('Adding stagers to job %s.', identifier)
        if identifier not in self._jobs:
            logger.warning('Job %s does not exist.', identifier)
            return False
        job = self._jobs[identifier]
        if listeners is not None:
            # No indexing into ``listeners`` here: an empty list is valid.
            logger.debug('Assigning job %s to listeners %s.', identifier,
                         listeners)
            stager = [self._listeners[listener] for listener in listeners]
        else:
            # Never ask for more stagers than exist; sampling more than the
            # population raises an error.
            wanted = max(0, MAX_STAGER - len(job.stagers))
            stager = sample(self._stager, min(wanted, len(self._stager)))
        for s in stager:
            job.add_stager(s)
        self._write_socket_message(stager, 'NEW_JOB', job.settings)
        if initial:
            for s in job.stagers:
                # Each stager learns the listeners of all its peers.
                peers = [d.listener for d in job.stagers if d != s]
                self._write_socket_message(s, 'NEW_JOB_STAGER', identifier,
                                           self._address, *peers)
            counter = sample(job.stagers, 1)[0]
            self._write_socket_message(job.stagers, 'JOB_SET_COUNTER',
                                       identifier, counter.listener)
            job.add_counter(counter)
        else:
            self._write_socket_message(job.stagers, 'CONFIRMED_JOB', 0,
                                       identifier)
        return True