def parse(self, response):
    """Parse the sitemap directory page and schedule every linked
    sitemap page for crawling via ``parse_sitemap_page``.

    NOTE(review): assumes ``self._baseUrl`` plus the extracted href forms a
    valid absolute URL for ``Resource`` — confirm against Resource impl.
    """
    selector = HtmlXPathSelector(response)
    link_xpath = "/html/body/div[@class='container']/div[@id='column']/a/@href"
    for relative_url in selector.select(link_xpath).extract():
        # Wrap the joined URL in a Resource so URL normalization
        # (get_absolute_url) is applied before the request is issued.
        page_resource = Resource(self._baseUrl + relative_url, "directory")
        yield Request(page_resource.get_absolute_url(),
                      callback=self.parse_sitemap_page)
def download(self, resource_type, resource_url):
    """Download a resource of type feed or image by its URL.

    The resource is deduplicated (by target path) against both the pending
    queue and already-downloaded resources, then queued. The queue is only
    flushed — downloading its entries in parallel threads — once it holds
    more than 1000 entries or more than 60 seconds have passed since the
    previous flush.
    """
    # Skip resources that fail the remote availability/validity check.
    if not self._rc.check_remote_resource(resource_type, resource_url):
        return
    resource = Resource(resource_url, resource_type)
    # Strip a trailing slash so equivalent URLs map to the same target path.
    if resource.get_absolute_url().endswith('/'):
        resource._set_url(resource.get_absolute_url()[:-1])
    resource_target = resource.get_path()
    base_path = resource.get_base_path()
    msg = 'DEBUG: Will download resource %s with target %s to location %s.' \
        % (resource_url, resource_target, base_path)
    ResourceDownloader._logger.info(msg)
    self._rh.ensurePathExists(base_path)
    args = [resource_type, resource_url, resource_target]
    # Deduplicate on the target path (args[2]) against both the pending
    # queue and everything already handed off for download.
    # (Replaces two manual loops plus a dead `if not duplicate_found:`
    # guard that followed `duplicate_found = False`.)
    duplicate_found = any(
        pending[2] == args[2] for pending in ResourceDownloader._resources
    ) or any(
        done[2] == args[2] for done in ResourceDownloader._downloadedResources
    )
    if not duplicate_found:
        ResourceDownloader._resources.append(args)
    time_since_last_download = time.time() - self.last_download_timestamp
    # Batch: flush only once the queue exceeds 1000 entries or 60 seconds
    # have elapsed since the last flush. (The original comment said "300
    # files" — the code's threshold is 1000; confirm intended batch size.)
    if len(ResourceDownloader._resources) <= 1000 and time_since_last_download <= 60:
        # TODO
        return
    resources_tmp = ResourceDownloader._resources
    ResourceDownloader._resources = []
    ResourceDownloader._downloadedResources = \
        ResourceDownloader._downloadedResources + resources_tmp
    self.last_download_timestamp = time.time()
    self._tdr.run_parallel_in_threads(_download, resources_tmp)
def download(self, resource_type, resource_url):
    """Download a resource of type feed or image by its URL.

    The resource is deduplicated (by target path) against both the pending
    queue and already-downloaded resources, then queued. The queue is only
    flushed — downloading its entries in parallel threads — once it holds
    more than 1000 entries or more than 60 seconds have passed since the
    previous flush.
    """
    # Skip resources that fail the remote availability/validity check.
    if not self._rc.check_remote_resource(resource_type, resource_url):
        return
    resource = Resource(resource_url, resource_type)
    # Strip a trailing slash so equivalent URLs map to the same target path.
    if resource.get_absolute_url().endswith('/'):
        resource._set_url(resource.get_absolute_url()[:-1])
    resource_target = resource.get_path()
    base_path = resource.get_base_path()
    msg = 'DEBUG: Will download resource %s with target %s to location %s.' \
        % (resource_url, resource_target, base_path)
    ResourceDownloader._logger.info(msg)
    self._rh.ensurePathExists(base_path)
    args = [resource_type, resource_url, resource_target]
    # Deduplicate on the target path (args[2]) against both the pending
    # queue and everything already handed off for download.
    # (Replaces two manual loops plus a dead `if not duplicate_found:`
    # guard that followed `duplicate_found = False`.)
    duplicate_found = any(
        pending[2] == args[2] for pending in ResourceDownloader._resources
    ) or any(
        done[2] == args[2] for done in ResourceDownloader._downloadedResources
    )
    if not duplicate_found:
        ResourceDownloader._resources.append(args)
    time_since_last_download = time.time() - self.last_download_timestamp
    # Batch: flush only once the queue exceeds 1000 entries or 60 seconds
    # have elapsed since the last flush. (The original comment said "300
    # files" — the code's threshold is 1000; confirm intended batch size.)
    if len(ResourceDownloader._resources) <= 1000 and time_since_last_download <= 60:
        # TODO
        return
    resources_tmp = ResourceDownloader._resources
    ResourceDownloader._resources = []
    ResourceDownloader._downloadedResources = \
        ResourceDownloader._downloadedResources + resources_tmp
    self.last_download_timestamp = time.time()
    self._tdr.run_parallel_in_threads(_download, resources_tmp)