class ResourceDownloader:
    """Commonly used tool that downloads resources.

    Resources are queued class-wide and flushed in parallel batches: a
    batch is dispatched once more than 1000 resources are pending or more
    than 60 seconds have passed since the previous batch.
    """

    _logger = LoggerFactory().getLogger('RessourceDownloader')
    # Class-level (shared across instances) queues of download argument
    # triples [resource_type, resource_url, resource_target]:
    # still-pending downloads and already-dispatched ones.
    _resources = []
    _downloadedResources = []

    def __init__(self):
        self._tdr = Threader()
        self._pt = PathTool.PathTool()
        self._rc = ResourceChecker()
        self._rh = ResourceHelper()
        # Epoch timestamp of the last dispatched batch; 0 = never.
        self.last_download_timestamp = 0

    def download(self, resource_type, resource_url):
        """Downloads a resource of type feed or image by its URL.

        Invalid remote resources are ignored. The resource is queued
        (deduplicated by its target path) and the pending queue is
        dispatched to parallel worker threads when the batch threshold
        or the time threshold is exceeded.
        """
        if not self._rc.check_remote_resource(resource_type, resource_url):
            return
        resource = Resource(resource_url, resource_type)
        # Strip a trailing slash so target-path derivation is stable.
        if resource.get_absolute_url().endswith('/'):
            resource._set_url(resource.get_absolute_url()[:-1])
        resource_target = resource.get_path()
        base_path = resource.get_base_path()
        msg = 'DEBUG: Will download resource %s with target %s to location %s.' \
            % (resource_url, resource_target, base_path)
        ResourceDownloader._logger.info(msg)
        self._rh.ensurePathExists(base_path)
        args = [resource_type, resource_url, resource_target]
        # Deduplicate on the target path (args[2]) against both the
        # pending queue and everything already dispatched. (The original
        # wrapped the first scan in an always-true guard; that dead
        # branch is removed here.)
        duplicate_found = any(
            queued[2] == args[2]
            for queued in ResourceDownloader._resources
        ) or any(
            done[2] == args[2]
            for done in ResourceDownloader._downloadedResources
        )
        if not duplicate_found:
            ResourceDownloader._resources.append(args)
        time_since_last_download = time.time() - self.last_download_timestamp
        # Flush only when more than 1000 resources are pending or more
        # than a minute has passed since the last batch.
        if len(ResourceDownloader._resources) <= 1000 \
                and time_since_last_download <= 60:
            # TODO
            return
        resources_tmp = ResourceDownloader._resources
        ResourceDownloader._resources = []
        ResourceDownloader._downloadedResources = \
            ResourceDownloader._downloadedResources + resources_tmp
        self.last_download_timestamp = time.time()
        self._tdr.run_parallel_in_threads(_download, resources_tmp)
class ResourceDownloader:
    """Commonly used tool that downloads resources."""

    _logger = LoggerFactory().getLogger('RessourceDownloader')
    # Shared (class-level) queues of [type, url, target] triples.
    _resources = []
    _downloadedResources = []

    def __init__(self):
        self._tdr = Threader()
        self._pt = PathTool.PathTool()
        self._rc = ResourceChecker()
        self._rh = ResourceHelper()
        self.last_download_timestamp = 0

    def download(self, resource_type, resource_url):
        """Downloads a resource of type feed or image by its URL."""
        if not self._rc.check_remote_resource(resource_type, resource_url):
            return
        resource = Resource(resource_url, resource_type)
        # Normalise away a trailing slash before deriving paths.
        if resource.get_absolute_url().endswith('/'):
            resource._set_url(resource.get_absolute_url()[:-1])
        resource_target = resource.get_path()
        base_path = resource.get_base_path()
        msg = 'DEBUG: Will download resource %s with target %s to location %s.' \
            % (resource_url, resource_target, base_path)
        ResourceDownloader._logger.info(msg)
        self._rh.ensurePathExists(base_path)
        args = [resource_type, resource_url, resource_target]
        # A resource is a duplicate when its target path matches an entry
        # in either the pending or the already-dispatched queue.
        already_known = False
        for known in (ResourceDownloader._resources
                      + ResourceDownloader._downloadedResources):
            if known[2] == args[2]:
                already_known = True
                break
        if not already_known:
            ResourceDownloader._resources.append(args)
        elapsed = time.time() - self.last_download_timestamp
        # download 300 files in parallel or how many ever we have every minute
        if len(ResourceDownloader._resources) <= 1000 and elapsed <= 60:
            # TODO
            return
        batch = ResourceDownloader._resources
        ResourceDownloader._resources = []
        ResourceDownloader._downloadedResources = \
            ResourceDownloader._downloadedResources + batch
        self.last_download_timestamp = time.time()
        self._tdr.run_parallel_in_threads(_download, batch)
class FeedsDownloaderRunner:
    """Feeds every known feed file path to the images downloader."""

    def __init__(self):
        self._iDler = ImagesDownloader()
        self._rc = ResourceChecker()

    def run(self):
        """Process all feed files, then report completion."""
        for feed_path in self._rc.getAllFeedPaths():
            self._iDler.handleFeed(feed_path)
        print('FeedsDownloaderRunner: INFO: Done.')
# Script: index all locally stored feeds into a Solr search server.
import sunburnt
from Resource.ResourceHelper import ResourceHelper
from Resource.ResourceChecker import ResourceChecker
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

# create a connection to a solr server
try:
    solr = sunburnt.SolrInterface("http://localhost:8983/solr/")
except socket.error as e:
    # NOTE(review): `socket` is not imported in this chunk — if the
    # connection fails, this handler itself raises NameError; confirm
    # `import socket` exists elsewhere or add it.
    print(e, "Is Solr started?")

_pt = PathTool.PathTool()
_rh = ResourceHelper()
_rc = ResourceChecker()

# Walk every known feed file and index the valid ones.
feeds = _rh.getAllFeedPaths()
for feed in feeds:
    # NOTE(review): Python 2 print statement in an otherwise Python 3
    # style file — this is a SyntaxError under Python 3; should be
    # print(feed).
    print feed
    # Skip files that do not pass the local resource check.
    if not _rc.check_local_resource(feed, 'feed'):
        print("Skipping:", feed)
        continue
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        # Only index feeds that parsed into a non-empty dict.
        if feedDict != None and feedDict != {}:
            feedDict['id'] = _pt.getFeedId(feed)
            print(("Indexing", feedDict))
            # NOTE(review): chunk appears truncated here — the `try` has
            # no matching `except`, and the actual solr.add/commit call
            # is not visible in this view.
def __init__(self):
    """Create the downloader's collaborators and reset the batch timer."""
    self.last_download_timestamp = 0
    self._rh = ResourceHelper()
    self._rc = ResourceChecker()
    self._pt = PathTool.PathTool()
    self._tdr = Threader()
def __init__(self):
    """Create the resource checker and images downloader this runner uses."""
    self._rc = ResourceChecker()
    self._iDler = ImagesDownloader()