Example #1
    def enqueue(self, job_id):
        """
        Enqueue URLs for the spider to crawl.

        Arguments:
            job_id: integer job id.

        Returns: None
        """

        urls = data.redis.smembers('job' + str(job_id))
        self._queue.extend(urls)
        if not self._active and not data.job_is_aborted(job_id):
            self._deploy(job_id)
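enqueue() expects a Redis set keyed 'job<id>' to have been seeded with URLs before it is called. Below is a minimal sketch of that handoff with redis-py, assuming decode_responses=True so smembers() yields strings (matching the string concatenation used throughout these methods); the client setup, URLs, and the commented-out manager call are illustrative, not taken from the source project.

    import redis

    r = redis.Redis(decode_responses=True)
    job_id = 42

    # Seed the job's URL set; enqueue() reads it back with smembers('job42').
    r.sadd('job' + str(job_id), 'https://example.com', 'https://example.org')

    # With the set populated, an idle manager would pull the members and crawl:
    # manager.enqueue(job_id)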
Example #2
    def _fetch_and_parse(self, job_id, url, depth):

        """
        Fetch a webpage and parse it for links and images.

        Arguments:
            job_id: integer job id.
            url: string URL.
            depth: integer current depth.

        Returns: None.
        """

        html_parser = MyHtmlParser(url)
        request_headers = {'User-Agent': self.user_agent}
        request = urllib_Request(url, headers=request_headers)

        try:
            webpage = urlopen(request).read().decode()
        except Exception:
            data.redis.set(url, 'failed')
            return

        try:
            html_parser.feed(webpage)
        except HTMLParseError:
            data.redis.set(url, 'failed')
            return

        data.add_webpages(url, html_parser.hyperlinks, depth)
        data.redis.set(url, 'complete')
        data.complete_crawl(url)

        if depth > 0 and self._active and not data.job_is_aborted(job_id):
            if html_parser.hyperlinks:
                data.redis.sadd('job' + str(job_id), *html_parser.hyperlinks)
            data.redis.publish('deploy', pickle.dumps(job_id))
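MyHtmlParser itself is not part of this listing. A plausible minimal version, assuming it subclasses html.parser.HTMLParser, resolves links against the fetched page's URL, and exposes the hyperlinks attribute used above (images is inferred from the docstring; everything else here is an assumption, not the author's implementation):

    from html.parser import HTMLParser
    from urllib.parse import urljoin

    # Sketch of the parser assumed by _fetch_and_parse(); only `hyperlinks`
    # is referenced in the code above, `images` is inferred from the docstring.
    class MyHtmlParser(HTMLParser):

        def __init__(self, base_url):
            super().__init__()
            self.base_url = base_url
            self.hyperlinks = []
            self.images = []

        def handle_starttag(self, tag, attrs):
            attrs = dict(attrs)
            if tag == 'a' and attrs.get('href'):
                # Resolve relative hrefs against the fetched page's URL.
                self.hyperlinks.append(urljoin(self.base_url, attrs['href']))
            elif tag == 'img' and attrs.get('src'):
                self.images.append(urljoin(self.base_url, attrs['src']))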
Example #3
    def _deploy(self, job_id):
        """
        Deploy a spider to crawl the web. Use the DeploymentManager's enqueue
        method to specify which URLs to crawl. Depth should be assigned to each
        submitted URL prior to deployment.

        Arguments:
            job_id: integer job id.

        Returns: None
        """

        if data.job_is_aborted(job_id):
            self._active = False
            self._queue = []
            return

        self._active = True
        queue_copy = self._queue[:]
        for index, url in enumerate(queue_copy):

            if data.job_is_aborted(job_id):
                break

            self._queue.remove(url)
            validated_url = validate_url(url)
            url = validated_url['url']
            webpage_info = data.get_webpage_info(url)

            if not claim(url):
                continue

            if not validated_url['valid']:
                continue

            # Ignore webpages crawled less than 15 min ago.
            if self._less_than_15_min_ago(webpage_info['completion_datetime']):
                continue

            # Database latency means depth is occasionally still unavailable.
            if not webpage_info['depth']:
                # Child URLs with no job_id and no depth have been deleted.
                if data.redis.llen('reg:' + url):
                    data.redis.set(url, 'ready')
                    self._queue.append(url)
                continue

            depth = webpage_info['depth'] - 1
            self._set_job_status(job_id, depth, index, len(queue_copy))
            self._fetch_and_parse(job_id, url, depth)
            time.sleep(self.delay)

        if data.job_is_aborted(job_id):
            self._active = False
            self._queue = []
        else:
            if self._queue:
                time.sleep(self.delay)
                self._deploy(job_id)
            else:
                self._set_job_status(job_id, -1, -1, 0, 'Complete')
                self._active = False
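The freshness guard _less_than_15_min_ago() is also outside this listing. A hypothetical stand-in for that check, assuming the completion timestamp is stored either as a naive datetime or as an ISO-8601 string (both the signature and the storage format are assumptions):

    from datetime import datetime, timedelta

    # Hypothetical sketch of the manager's _less_than_15_min_ago() check.
    def less_than_15_min_ago(completion_datetime):
        if not completion_datetime:
            # Never crawled, so it cannot be "recent".
            return False
        if isinstance(completion_datetime, str):
            completion_datetime = datetime.fromisoformat(completion_datetime)
        return datetime.now() - completion_datetime < timedelta(minutes=15)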