Пример #1
0
class ResourceDownloader:
    """Queue remote resources and hand them to a threader in batches.

    Download requests are collected in class-level lists, so the queue and
    the record of already dispatched requests are shared by all instances.
    The time of the last dispatch is tracked per instance.
    """

    # NOTE(review): 'RessourceDownloader' looks like a typo; the literal is
    # kept because the logger name may be referenced elsewhere -- confirm.
    _logger = LoggerFactory().getLogger('RessourceDownloader')
    # Requests waiting for the next batch. Every entry is a list of
    # [resource_type, resource_url, resource_target].
    _resources = []
    # Requests that were already handed to the threader; only consulted to
    # detect duplicates.
    _downloadedResources = []
    # A batch is dispatched once MORE than this many requests are queued ...
    _MAX_QUEUED_RESOURCES = 1000
    # ... or once MORE than this many seconds passed since the last dispatch.
    _FLUSH_INTERVAL_SECONDS = 60

    def __init__(self):
        """Create the helper objects and mark that nothing was dispatched yet."""
        self._tdr = Threader()
        self._pt = PathTool.PathTool()
        self._rc = ResourceChecker()
        self._rh = ResourceHelper()
        # 0 makes the very first download() call exceed the flush interval.
        self.last_download_timestamp = 0

    @staticmethod
    def _is_known_target(resource_target):
        """Return True if a queued or dispatched request has this target."""
        known_queues = (ResourceDownloader._resources,
                        ResourceDownloader._downloadedResources)
        for queue in known_queues:
            if any(entry[2] == resource_target for entry in queue):
                return True
        return False

    def download(self, resource_type, resource_url):
        """Downloads a resource of type feed or image by its URL.

        The request is only queued here; the queued requests are dispatched
        together once the queue holds more than ``_MAX_QUEUED_RESOURCES``
        entries or more than ``_FLUSH_INTERVAL_SECONDS`` seconds passed since
        this instance last dispatched a batch.

        Args:
            resource_type: Type of the resource, passed on to the checker,
                the ``Resource`` object and the download worker.
            resource_url: URL of the resource.

        Returns:
            None. The method returns early when the remote resource check
            fails or when no batch is due yet.
        """
        if not self._rc.check_remote_resource(resource_type, resource_url):
            return

        resource = Resource(resource_url, resource_type)
        # Strip a single trailing slash before the target path is derived.
        if resource.get_absolute_url().endswith('/'):
            resource._set_url(resource.get_absolute_url()[:-1])
        resource_target = resource.get_path()
        base_path = resource.get_base_path()
        msg = 'DEBUG: Will download resource %s with target %s to location %s.' \
              % (resource_url, resource_target, base_path)
        ResourceDownloader._logger.info(msg)

        self._rh.ensurePathExists(base_path)

        # Requests are de-duplicated by their target path only.
        if not self._is_known_target(resource_target):
            ResourceDownloader._resources.append(
                [resource_type, resource_url, resource_target])

        elapsed = time.time() - self.last_download_timestamp
        # Keep collecting while the queue is still small and the last
        # dispatch of this instance was recent.
        if (len(ResourceDownloader._resources) <= self._MAX_QUEUED_RESOURCES
                and elapsed <= self._FLUSH_INTERVAL_SECONDS):  # TODO
            return

        # Swap the queue out before dispatching so that new requests start
        # a fresh batch; the lists are rebound rather than mutated in place.
        batch = ResourceDownloader._resources
        ResourceDownloader._resources = []
        ResourceDownloader._downloadedResources = (
            ResourceDownloader._downloadedResources + batch)
        self.last_download_timestamp = time.time()
        # `_download` is not defined in this class; it is expected to be a
        # module-level worker function.
        self._tdr.run_parallel_in_threads(_download, batch)
Пример #2
0
class ResourceDownloader:
    """Queue remote resources and hand them to a threader in batches.

    Download requests are collected in class-level lists, so the queue and
    the record of already dispatched requests are shared by all instances.
    The time of the last dispatch is tracked per instance.
    """

    # NOTE(review): 'RessourceDownloader' looks like a typo; the literal is
    # kept because the logger name may be referenced elsewhere -- confirm.
    _logger = LoggerFactory().getLogger('RessourceDownloader')
    # Requests waiting for the next batch. Every entry is a list of
    # [resource_type, resource_url, resource_target].
    _resources = []
    # Requests that were already handed to the threader; only consulted to
    # detect duplicates.
    _downloadedResources = []
    # A batch is dispatched once MORE than this many requests are queued ...
    _MAX_QUEUED_RESOURCES = 1000
    # ... or once MORE than this many seconds passed since the last dispatch.
    _FLUSH_INTERVAL_SECONDS = 60

    def __init__(self):
        """Create the helper objects and mark that nothing was dispatched yet."""
        self._tdr = Threader()
        self._pt = PathTool.PathTool()
        self._rc = ResourceChecker()
        self._rh = ResourceHelper()
        # 0 makes the very first download() call exceed the flush interval.
        self.last_download_timestamp = 0

    @staticmethod
    def _is_known_target(resource_target):
        """Return True if a queued or dispatched request has this target."""
        known_queues = (ResourceDownloader._resources,
                        ResourceDownloader._downloadedResources)
        for queue in known_queues:
            if any(entry[2] == resource_target for entry in queue):
                return True
        return False

    def download(self, resource_type, resource_url):
        """Downloads a resource of type feed or image by its URL.

        The request is only queued here; the queued requests are dispatched
        together once the queue holds more than ``_MAX_QUEUED_RESOURCES``
        entries or more than ``_FLUSH_INTERVAL_SECONDS`` seconds passed since
        this instance last dispatched a batch.

        Args:
            resource_type: Type of the resource, passed on to the checker,
                the ``Resource`` object and the download worker.
            resource_url: URL of the resource.

        Returns:
            None. The method returns early when the remote resource check
            fails or when no batch is due yet.
        """
        if not self._rc.check_remote_resource(resource_type, resource_url):
            return

        resource = Resource(resource_url, resource_type)
        # Strip a single trailing slash before the target path is derived.
        if resource.get_absolute_url().endswith('/'):
            resource._set_url(resource.get_absolute_url()[:-1])
        resource_target = resource.get_path()
        base_path = resource.get_base_path()
        msg = 'DEBUG: Will download resource %s with target %s to location %s.' \
              % (resource_url, resource_target, base_path)
        ResourceDownloader._logger.info(msg)

        self._rh.ensurePathExists(base_path)

        # Requests are de-duplicated by their target path only.
        if not self._is_known_target(resource_target):
            ResourceDownloader._resources.append(
                [resource_type, resource_url, resource_target])

        elapsed = time.time() - self.last_download_timestamp
        # Keep collecting while the queue is still small and the last
        # dispatch of this instance was recent.
        if (len(ResourceDownloader._resources) <= self._MAX_QUEUED_RESOURCES
                and elapsed <= self._FLUSH_INTERVAL_SECONDS):  # TODO
            return

        # Swap the queue out before dispatching so that new requests start
        # a fresh batch; the lists are rebound rather than mutated in place.
        batch = ResourceDownloader._resources
        ResourceDownloader._resources = []
        ResourceDownloader._downloadedResources = (
            ResourceDownloader._downloadedResources + batch)
        self.last_download_timestamp = time.time()
        # `_download` is not defined in this class; it is expected to be a
        # module-level worker function.
        self._tdr.run_parallel_in_threads(_download, batch)
Пример #3
0
class FeedsDownloaderRunner:
    """Feed every feed path known to the resource checker to the downloader."""

    def __init__(self):
        """Create the images downloader and the resource checker."""
        self._iDler = ImagesDownloader()
        self._rc = ResourceChecker()

    def run(self):
        """Pass each feed path to ``handleFeed`` and print a completion line."""
        for path in self._rc.getAllFeedPaths():
            self._iDler.handleFeed(path)

        print('FeedsDownloaderRunner: INFO: Done.')
Пример #4
0
class FeedsDownloaderRunner:
    """Feed every feed path known to the resource checker to the downloader."""

    def __init__(self):
        """Create the images downloader and the resource checker."""
        self._iDler = ImagesDownloader()
        self._rc = ResourceChecker()

    def run(self):
        """Pass each feed path to ``handleFeed`` and print a completion line."""
        for path in self._rc.getAllFeedPaths():
            self._iDler.handleFeed(path)

        print('FeedsDownloaderRunner: INFO: Done.')
Пример #5
0
import sunburnt

from Resource.ResourceHelper import ResourceHelper
from Resource.ResourceChecker import ResourceChecker
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

# `socket` is referenced by the except clause below but is not among the
# imports visible above; without it a connection failure would raise
# NameError instead of printing the hint.
import socket

# Create a connection to a Solr server.
try:
    solr = sunburnt.SolrInterface("http://localhost:8983/solr/")
except socket.error as e:
    # NOTE(review): `solr` stays undefined after a failed connection and the
    # script carries on -- confirm whether it should stop here instead.
    print(e, "Is Solr started?")

# NOTE(review): `PathTool` is imported as a name from Util.PathTool and then
# accessed as `PathTool.PathTool` -- confirm this attribute exists.
_pt = PathTool.PathTool()
_rh = ResourceHelper()
_rc = ResourceChecker()
# Paths of all feeds, as reported by the resource helper.
feeds = _rh.getAllFeedPaths()
for feed in feeds:

    print feed
    
    if not _rc.check_local_resource(feed, 'feed'):
        print("Skipping:", feed)
        continue
    
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict != None and feedDict != {}:
            feedDict['id'] = _pt.getFeedId(feed)
            print(("Indexing", feedDict))
Пример #6
0
 def __init__(self):
     """Create the helper objects used by this instance and reset its timer."""
     self._tdr = Threader()
     self._pt = PathTool.PathTool()
     self._rc = ResourceChecker()
     self._rh = ResourceHelper()
     # Starts at 0; presumably updated with the time of each download run
     # by code outside this snippet -- TODO confirm.
     self.last_download_timestamp = 0
Пример #7
0
import sunburnt

from Resource.ResourceHelper import ResourceHelper
from Resource.ResourceChecker import ResourceChecker
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

# `socket` is referenced by the except clause below but is not among the
# imports visible above; without it a connection failure would raise
# NameError instead of printing the hint.
import socket

# Create a connection to a Solr server.
try:
    solr = sunburnt.SolrInterface("http://localhost:8983/solr/")
except socket.error as e:
    # NOTE(review): `solr` stays undefined after a failed connection and the
    # script carries on -- confirm whether it should stop here instead.
    print(e, "Is Solr started?")

# NOTE(review): `PathTool` is imported as a name from Util.PathTool and then
# accessed as `PathTool.PathTool` -- confirm this attribute exists.
_pt = PathTool.PathTool()
_rh = ResourceHelper()
_rc = ResourceChecker()
# Paths of all feeds, as reported by the resource helper.
feeds = _rh.getAllFeedPaths()
for feed in feeds:

    print feed

    if not _rc.check_local_resource(feed, 'feed'):
        print("Skipping:", feed)
        continue

    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict != None and feedDict != {}:
            feedDict['id'] = _pt.getFeedId(feed)
            print(("Indexing", feedDict))
Пример #8
0
 def __init__(self):
     """Create the helper objects used by this instance and reset its timer."""
     self._tdr = Threader()
     self._pt = PathTool.PathTool()
     self._rc = ResourceChecker()
     self._rh = ResourceHelper()
     # Starts at 0; presumably updated with the time of each download run
     # by code outside this snippet -- TODO confirm.
     self.last_download_timestamp = 0
Пример #9
0
 def __init__(self):
     """Create the ImagesDownloader and ResourceChecker used by this instance."""
     self._iDler = ImagesDownloader()
     self._rc = ResourceChecker()
Пример #10
0
 def __init__(self):
     """Create the ImagesDownloader and ResourceChecker used by this instance."""
     self._iDler = ImagesDownloader()
     self._rc = ResourceChecker()