Exemplo n.º 1
0
    def _process_scrape_info(self, scraper: BaseScraper,
                             scrape_result: ScrapeResult,
                             item_session: ItemSession):
        '''Queue the links of a scrape result as child URLs.

        Every link context is parsed and rewritten. Links that fail to
        parse, or whose child URL record is rejected by the fetch rule
        filters, are skipped.

        Args:
            scraper: The scraper that produced the result. It is not used
                by this method.
            scrape_result: Result whose ``link_contexts`` are iterated. A
                falsy value means there is nothing to process.
            item_session: Session that receives the accepted child URLs.

        Returns:
            tuple: ``(num_inline, num_linked)`` -- how many accepted links
            were inline and how many were not.
        '''
        if not scrape_result:
            return 0, 0

        # Keyed by "is the link inline?" so one statement updates the
        # right counter.
        tally = {True: 0, False: 0}

        for context in scrape_result.link_contexts:
            parsed = self.parse_url(context.link)
            if not parsed:
                continue

            rewritten = self.rewrite_url(parsed)
            candidate = item_session.child_url_record(
                rewritten.url, inline=context.inline
            )

            # The first element of the filter result is used as the
            # accept/reject verdict.
            filter_result = self._fetch_rule.consult_filters(
                item_session.request.url_info, candidate
            )
            if not filter_result[0]:
                continue

            tally[bool(context.inline)] += 1

            item_session.add_child_url(
                rewritten.url,
                inline=context.inline,
                link_type=context.link_type,
            )

        return tally[True], tally[False]
Exemplo n.º 2
0
    def my_get_urls(self, item_session: ItemSession):
        '''Add child URLs for a Valkyrie or Chihiro JSON API URL.

        The request URL is classified by its prefix. A URL starting with
        ``VALKYRIE_URL_PREFIX`` is handled as ``UrlType.VALKYRIE``; a URL
        starting with ``CHIHIRO_URL_PREFIX`` that does not end with
        ``CHIHIRO_IMAGE_URL_SUFFIX`` is handled as ``UrlType.CHIHIRO``.
        Any other URL only produces a log message.

        Args:
            item_session: Session whose request URL is inspected and which
                receives every URL returned by ``self.process_result``.
        '''
        the_url = item_session.request.url

        logger.info("get_urls() for url `%s`", the_url)

        # Classify once. The previous version repeated these prefix checks
        # inside the branch and kept an unreachable ``raise`` whose message
        # was never %-formatted.
        if the_url.startswith(VALKYRIE_URL_PREFIX):
            the_type = UrlType.VALKYRIE
        elif (the_url.startswith(CHIHIRO_URL_PREFIX)
                and not the_url.endswith(CHIHIRO_IMAGE_URL_SUFFIX)):
            the_type = UrlType.CHIHIRO
        else:
            # not one of the main JSON api urls, don't add any new urls
            logger.info("url doesn't start with JSON api prefix, or had the /image suffix, not adding any new urls")
            return

        for iter_url in self.process_result(the_type, item_session):
            item_session.add_child_url(iter_url)
Exemplo n.º 3
0
    def _process_scrape_info(self, scraper: BaseScraper,
                             scrape_result: ScrapeResult,
                             item_session: ItemSession):
        '''Add the usable links from a scrape result to the item session.

        A link is usable when it parses to a truthy URL info and the fetch
        rule filters accept the child URL record built for its rewritten
        URL.

        Args:
            scraper: Scraper associated with the result; this method does
                not read it.
            scrape_result: Object providing ``link_contexts``. When falsy,
                no links are processed.
            item_session: Session used to build child URL records and to
                register the accepted child URLs.

        Returns:
            tuple: Count of accepted inline links followed by the count of
            accepted non-inline links.
        '''
        inline_total = 0
        linked_total = 0

        if scrape_result:
            for link_context in scrape_result.link_contexts:
                parsed_info = self.parse_url(link_context.link)

                if parsed_info:
                    final_info = self.rewrite_url(parsed_info)
                    record = item_session.child_url_record(
                        final_info.url, inline=link_context.inline)

                    # Index 0 of the filter result decides whether the
                    # link is kept.
                    accepted = self._fetch_rule.consult_filters(
                        item_session.request.url_info, record)[0]

                    if accepted:
                        if link_context.inline:
                            inline_total += 1
                        else:
                            linked_total += 1

                        item_session.add_child_url(
                            final_info.url,
                            inline=link_context.inline,
                            link_type=link_context.link_type)

        return inline_total, linked_total
Exemplo n.º 4
0
    def add_extra_urls(self, item_session: ItemSession):
        '''Add the site's robots.txt and sitemap.xml as child URLs.

        Nothing is added unless the item's URL record is at level 0 and
        ``self._sitemaps`` is truthy. Both URLs are built from the scheme
        and ``hostname_with_port`` of the item's URL.

        Args:
            item_session: Session whose URL record supplies the scheme and
                host, and which receives the extra child URLs.
        '''
        if item_session.url_record.level != 0 or not self._sitemaps:
            return

        origin_info = item_session.url_record.url_info
        origin = '{0}://{1}'.format(
            origin_info.scheme, origin_info.hostname_with_port)

        # Parse both URLs before adding any, as the original did.
        extra_url_infos = [
            self.parse_url(origin + path)
            for path in ('/robots.txt', '/sitemap.xml')
        ]

        for url_info in extra_url_infos:
            # NOTE(review): parse_url is assumed to return a falsy value
            # for a URL it cannot parse -- confirm. Skipping avoids an
            # AttributeError on ``.url`` in that case.
            if not url_info:
                continue

            item_session.add_child_url(url_info.url)
Exemplo n.º 5
0
    def get_urls(self, item_session: ItemSession):
        '''Check the response body file and add child URLs for the root page.

        Prints the body filename, asserts that it names an existing file
        and that the request URL is non-empty. For a request path of
        ``/`` it adds an inline POST child URL on localhost and a
        malformed URL string.

        Args:
            item_session: Session providing the response body, the request
                URL info and ``add_child_url``.
        '''
        body_path = item_session.response.body.name
        request_info = item_session.request.url_info
        print('get_urls', body_path)
        assert body_path
        assert os.path.isfile(body_path)
        assert request_info.url

        if request_info.path != '/':
            return

        post_url = 'http://localhost:{0}/post/'.format(str(request_info.port))
        item_session.add_child_url(post_url,
                                   inline=True,
                                   post_data='text=hello',
                                   replace=True)
        # Deliberately not a valid URL.
        item_session.add_child_url('..malformed')
Exemplo n.º 6
0
    def get_urls(self, item_session: ItemSession):
        '''Validate the downloaded body and queue follow-up URLs for ``/``.

        The body filename is printed and must name an existing file, and
        the request URL must be non-empty (checked with ``assert``). When
        the request path is ``/``, two child URLs are added: an inline
        POST to ``/post/`` on localhost at the request's port, and the
        string ``'..malformed'``.

        Args:
            item_session: Session whose response body and request URL info
                are read, and which receives the child URLs.
        '''
        name_of_body = item_session.response.body.name
        info = item_session.request.url_info
        print('get_urls', name_of_body)
        assert name_of_body
        assert os.path.isfile(name_of_body)
        assert info.url

        is_root_page = info.path == '/'
        if is_root_page:
            target = ''.join(('http://localhost:', str(info.port), '/post/'))
            item_session.add_child_url(
                target, inline=True, post_data='text=hello', replace=True
            )
            item_session.add_child_url('..malformed')
Exemplo n.º 7
0
    def add_extra_urls(self, item_session: ItemSession):
        '''Add robots.txt and sitemap.xml for the item's host as child URLs.

        This only happens when the item's URL record has level 0 and
        ``self._sitemaps`` is truthy; otherwise the method does nothing.

        Args:
            item_session: Session whose URL record provides the scheme and
                ``hostname_with_port`` used to build the URLs, and to
                which the resulting child URLs are added.
        '''
        if item_session.url_record.level == 0 and self._sitemaps:
            source_info = item_session.url_record.url_info

            # Both URLs share one template; only the file name differs.
            # All parsing happens before the first add_child_url call.
            extra_url_infos = tuple(
                self.parse_url(
                    '{0}://{1}/{2}'.format(
                        source_info.scheme,
                        source_info.hostname_with_port,
                        file_name)
                )
                for file_name in ('robots.txt', 'sitemap.xml')
            )

            for url_info in extra_url_infos:
                # NOTE(review): assumes parse_url yields a falsy value when
                # the URL cannot be parsed -- confirm. Such entries are
                # skipped rather than failing on ``.url``.
                if url_info:
                    item_session.add_child_url(url_info.url)