Exemplo n.º 1
0
 def parse(self, response):
     """Yield one request per sitemap page linked from *response*.

     The hrefs are taken from the anchors inside the ``column`` div of the
     ``container`` div. Each href is appended to ``self._baseUrl``, wrapped
     in a ``Resource`` of type ``"directory"``, and the resulting absolute
     URL is requested with ``self.parse_sitemap_page`` as the callback.
     """
     selector = HtmlXPathSelector(response)
     hrefs = selector.select(
         "/html/body/div[@class='container']/div[@id='column']/a/@href"
     ).extract()
     for href in hrefs:
         # Resource is used here only to obtain the absolute form of the URL.
         target = Resource(self._baseUrl + href, "directory").get_absolute_url()
         yield Request(target, callback=self.parse_sitemap_page)
Exemplo n.º 2
0
 def parse(self, response):
     """Generate follow-up requests for every sitemap page link found.

     Extracts the ``href`` attributes matched by the sitemap XPath, prefixes
     each with ``self._baseUrl``, and yields a ``Request`` for the absolute
     URL reported by a ``"directory"`` ``Resource``. Responses are routed to
     ``self.parse_sitemap_page``.
     """
     link_xpath = "/html/body/div[@class='container']/div[@id='column']/a/@href"
     extracted_links = HtmlXPathSelector(response).select(link_xpath).extract()
     for link in extracted_links:
         directory = Resource(self._baseUrl + link, "directory")
         yield Request(
             directory.get_absolute_url(),
             callback=self.parse_sitemap_page,
         )
Exemplo n.º 3
0
    def download(self, resource_type, resource_url):
        """Queue a resource for batched download and flush the queue if due.

        The resource is skipped entirely when
        ``self._rc.check_remote_resource`` rejects it. Otherwise its target
        directory is created and, unless an entry with the same target path
        is already pending or was already dispatched, it is appended to the
        class-wide pending queue.

        The pending queue is handed to ``self._tdr.run_parallel_in_threads``
        once it holds more than 1000 entries or more than 60 seconds have
        passed since the last dispatch; otherwise the call returns after
        queueing.
        """
        if not self._rc.check_remote_resource(resource_type, resource_url):
            return

        resource = Resource(resource_url, resource_type)
        # Normalise away a single trailing slash before deriving paths.
        if resource.get_absolute_url().endswith('/'):
            trimmed_url = resource.get_absolute_url()[:-1]
            resource._set_url(trimmed_url)
        resource_target = resource.get_path()
        base_path = resource.get_base_path()
        ResourceDownloader._logger.info(
            'DEBUG: Will download resource %s with target %s to location %s.'
            % (resource_url, resource_target, base_path)
        )

        self._rh.ensurePathExists(base_path)

        # Entries are [type, url, target]; duplicates are detected by target.
        already_known = any(
            queued[2] == resource_target
            for queued in ResourceDownloader._resources
        ) or any(
            done[2] == resource_target
            for done in ResourceDownloader._downloadedResources
        )
        if not already_known:
            ResourceDownloader._resources.append(
                [resource_type, resource_url, resource_target]
            )

        elapsed = time.time() - self.last_download_timestamp
        # NOTE(review): the previous comment mentioned 300 files, but the
        # threshold actually enforced here is 1000 entries / 60 seconds.
        queue_is_small = len(ResourceDownloader._resources) <= 1000
        within_window = elapsed <= 60  # TODO
        if queue_is_small and within_window:
            return

        # Swap out the pending queue and record the batch as dispatched
        # before starting the worker threads.
        batch, ResourceDownloader._resources = ResourceDownloader._resources, []
        ResourceDownloader._downloadedResources = (
            ResourceDownloader._downloadedResources + batch
        )
        self.last_download_timestamp = time.time()
        self._tdr.run_parallel_in_threads(_download, batch)
Exemplo n.º 4
0
    def download(self, resource_type, resource_url):
        """Enqueue a resource and dispatch the pending batch when it is due.

        Returns immediately if ``self._rc.check_remote_resource`` does not
        accept the resource. Otherwise the resource's base path is ensured
        to exist and the entry ``[type, url, target]`` is added to the
        class-level pending list, unless the same target already appears in
        the pending or the already-dispatched list.

        A dispatch happens when the pending list exceeds 1000 entries or
        more than 60 seconds have elapsed since the previous dispatch; the
        whole pending list is then passed to
        ``self._tdr.run_parallel_in_threads``.
        """
        accepted = self._rc.check_remote_resource(resource_type, resource_url)
        if not accepted:
            return

        resource = Resource(resource_url, resource_type)
        if resource.get_absolute_url().endswith('/'):
            # Drop the trailing slash so derived paths do not end in '/'.
            resource._set_url(resource.get_absolute_url()[:-1])
        resource_target = resource.get_path()
        base_path = resource.get_base_path()
        log_line = (
            'DEBUG: Will download resource %s with target %s to location %s.'
            % (resource_url, resource_target, base_path)
        )
        ResourceDownloader._logger.info(log_line)

        self._rh.ensurePathExists(base_path)

        entry = [resource_type, resource_url, resource_target]

        # Index 2 of every entry is the target path used for de-duplication.
        is_pending = any(
            other[2] == entry[2] for other in ResourceDownloader._resources
        )
        is_duplicate = is_pending or any(
            other[2] == entry[2]
            for other in ResourceDownloader._downloadedResources
        )
        if not is_duplicate:
            ResourceDownloader._resources.append(entry)

        seconds_since_dispatch = time.time() - self.last_download_timestamp
        # NOTE(review): an older comment spoke of 300 files per minute; the
        # limits in force are 1000 pending entries or 60 seconds.
        below_size_limit = len(ResourceDownloader._resources) <= 1000
        below_time_limit = seconds_since_dispatch <= 60  # TODO
        if below_size_limit and below_time_limit:
            return

        # Detach the pending list, mark its entries as dispatched, then run.
        to_dispatch = ResourceDownloader._resources
        ResourceDownloader._resources = []
        ResourceDownloader._downloadedResources = (
            ResourceDownloader._downloadedResources + to_dispatch
        )
        self.last_download_timestamp = time.time()
        self._tdr.run_parallel_in_threads(_download, to_dispatch)