def _process_scrape_info(self, scraper: BaseScraper, scrape_result: ScrapeResult, item_session: ItemSession):
    '''Collect the URLs from the scrape info dict.

    Walks every link context in *scrape_result*, parses and rewrites each
    link, and adds the ones that pass the fetch-rule filters as child
    URLs of *item_session*.

    Returns:
        tuple: ``(num_inline, num_linked)`` — counts of accepted inline
        and non-inline child URLs.
    '''
    if not scrape_result:
        return 0, 0

    # Counters keyed by the truthiness of the link's "inline" flag.
    counts = {True: 0, False: 0}

    for context in scrape_result.link_contexts:
        parsed = self.parse_url(context.link)

        # Unparseable links are silently skipped.
        if not parsed:
            continue

        parsed = self.rewrite_url(parsed)
        record = item_session.child_url_record(parsed.url, inline=context.inline)
        verdict = self._fetch_rule.consult_filters(
            item_session.request.url_info, record)

        # First element of the verdict tuple is the accept/reject flag.
        if not verdict[0]:
            continue

        counts[bool(context.inline)] += 1
        item_session.add_child_url(
            parsed.url, inline=context.inline, link_type=context.link_type)

    return counts[True], counts[False]
def my_get_urls(self, item_session: ItemSession):
    '''Add child URLs discovered from a JSON API response.

    Only responses whose URL is under the Valkyrie prefix, or under the
    Chihiro prefix without the image suffix, are processed; everything
    else adds no new URLs.

    Raises:
        Exception: if the URL matched the outer guard but neither prefix
            (defensive — should be unreachable).
    '''
    the_url = item_session.request.url
    logger.info("get_urls() for url `%s`", the_url)

    is_valkyrie = the_url.startswith(VALKYRIE_URL_PREFIX)
    is_chihiro_api = (
        the_url.startswith(CHIHIRO_URL_PREFIX)
        and not the_url.endswith(CHIHIRO_IMAGE_URL_SUFFIX))

    if not (is_valkyrie or is_chihiro_api):
        # not one of the main JSON api urls, don't add any new urls
        logger.info("url doesn't start with JSON api prefix, or had the /image suffix, not adding any new urls")
        return

    if is_valkyrie:
        the_type = UrlType.VALKYRIE
    elif the_url.startswith(CHIHIRO_URL_PREFIX):
        the_type = UrlType.CHIHIRO
    else:
        # BUG FIX: the original passed the URL as a second argument to
        # Exception (logger-style), so the message was never formatted.
        # Interpolate it into the message instead.
        raise Exception("unknown url prefix? `%s`" % the_url)

    for iter_url in self.process_result(the_type, item_session):
        item_session.add_child_url(iter_url)
def _process_scrape_info(self, scraper: BaseScraper, scrape_result: ScrapeResult, item_session: ItemSession):
    '''Collect the URLs from the scrape info dict.

    For each link context in *scrape_result*: parse the link, rewrite
    it, and consult the fetch-rule filters; accepted links become child
    URLs on *item_session*.

    Returns:
        tuple: ``(num_inline, num_linked)`` counts of accepted links.
    '''
    num_inline = 0
    num_linked = 0

    if scrape_result:
        for ctx in scrape_result.link_contexts:
            info = self.parse_url(ctx.link)

            if not info:
                # Skip links that fail to parse.
                continue

            info = self.rewrite_url(info)
            child_record = item_session.child_url_record(
                info.url, inline=ctx.inline)
            passed = self._fetch_rule.consult_filters(
                item_session.request.url_info, child_record)[0]

            if not passed:
                continue

            if ctx.inline:
                num_inline += 1
            else:
                num_linked += 1

            item_session.add_child_url(
                info.url, inline=ctx.inline, link_type=ctx.link_type)

    return num_inline, num_linked
def add_extra_urls(self, item_session: ItemSession):
    '''Add additional URLs such as robots.txt and sitemap.xml.

    Fires only for top-level URL records (level 0) and only when
    sitemap fetching is enabled.

    NOTE(review): the docstring previously said "favicon.ico", but the
    code adds sitemap.xml — corrected to match the implementation.
    '''
    if item_session.url_record.level == 0 and self._sitemaps:
        # Hoist the invariant scheme/host parts out of the two format calls.
        scheme = item_session.url_record.url_info.scheme
        host = item_session.url_record.url_info.hostname_with_port

        extra_url_infos = (
            self.parse_url('{0}://{1}/robots.txt'.format(scheme, host)),
            self.parse_url('{0}://{1}/sitemap.xml'.format(scheme, host)),
        )

        for url_info in extra_url_infos:
            item_session.add_child_url(url_info.url)
def get_urls(self, item_session: ItemSession):
    '''Test hook: on the root path, queue a POST child URL and a malformed URL.'''
    body_filename = item_session.response.body.name
    url_info = item_session.request.url_info
    print('get_urls', body_filename)

    # Sanity checks: the response body must exist on disk and the URL be set.
    assert body_filename
    assert os.path.isfile(body_filename)
    assert url_info.url

    if url_info.path == '/':
        post_url = 'http://localhost:' + str(url_info.port) + '/post/'
        item_session.add_child_url(
            post_url, inline=True, post_data='text=hello', replace=True)
        item_session.add_child_url('..malformed')
def get_urls(self, item_session: ItemSession):
    '''Test hook: when serving the root path, add one POST child URL
    (replacing any existing record) plus an intentionally malformed URL.'''
    response_file = item_session.response.body.name
    request_info = item_session.request.url_info
    print('get_urls', response_file)

    # Guard against a missing body file or empty URL before proceeding.
    assert response_file
    assert os.path.isfile(response_file)
    assert request_info.url

    if request_info.path != '/':
        return

    item_session.add_child_url(
        'http://localhost:' + str(request_info.port) + '/post/',
        inline=True,
        post_data='text=hello',
        replace=True,
    )
    item_session.add_child_url('..malformed')
def add_extra_urls(self, item_session: ItemSession):
    '''Add additional URLs such as robots.txt and sitemap.xml.

    Only applies to level-0 (seed) URL records, and only when the
    sitemaps option is enabled.

    NOTE(review): docstring previously mentioned favicon.ico; the code
    actually adds sitemap.xml, so the docstring now matches the code.
    '''
    if item_session.url_record.level == 0 and self._sitemaps:
        url_info = item_session.url_record.url_info
        # Build one well-known URL per path off the record's scheme/host.
        for path in ('robots.txt', 'sitemap.xml'):
            extra_info = self.parse_url('{0}://{1}/{2}'.format(
                url_info.scheme, url_info.hostname_with_port, path))
            item_session.add_child_url(extra_info.url)