def download_images(self):
    """Queue download jobs for every emote image not yet in the cache.

    Both the normal and the hover background images are considered; emotes
    sharing one image URL are grouped so each URL is fetched only once.
    """
    logger.info('Beginning download_images()')
    logger.debug("Downloading images using {} threads".format(self.workers))
    pool = WorkerPool(size=self.workers)

    def enqueue(selector):
        # Group emotes by image URL so a shared sprite is downloaded once.
        grouped = itertools.groupby(sorted(self.emotes, key=selector), selector)
        for url, _members in grouped:
            if not url:
                continue
            target = get_file_path(url, rootdir=self.reddit_cache)
            if path.isfile(target):
                continue  # already cached
            pool.put(DownloadJob(self._requests,
                                 urlparse.urljoin('https://s3.amazonaws.com/',
                                                  url),
                                 retry=5,
                                 rate_limit_lock=self.rate_limit_lock,
                                 callback=self._callback_download_image,
                                 **{'image_path': target}))

    with self.mutex:
        enqueue(lambda e: e['background-image'])
        enqueue(lambda e: e.get('hover-background-image'))
    pool.shutdown()
    pool.join()
def _fetch_css(self):
    """Fetch subreddit stylesheets, serving legacy subreddits from disk.

    Subreddits listed in ``self.legacy_subreddits`` no longer exist on
    reddit, so their css is loaded from the bundled ``legacy_css``
    directory; everything else is queued for download.
    """
    logger.debug("Fetching css using {} threads".format(self.workers))
    pool = WorkerPool(size=self.workers)
    if not os.path.exists(self.cache_dir):
        os.makedirs(self.cache_dir)
    for subreddit in self.subreddits:
        if subreddit not in self.legacy_subreddits:
            pool.put(DownloadJob(
                self._requests,
                'https://old.reddit.com/r/{}/stylesheet'.format(subreddit),
                retry=5,
                rate_limit_lock=self.rate_limit_lock,
                callback=self._callback_fetch_stylesheet,
                **{'subreddit': subreddit}))
            continue
        # Legacy subreddit: feed the bundled css through the same
        # response-processing path a live download would use.
        legacy_file = '{}/../legacy_css/{}.css'.format(
            os.path.dirname(__file__), subreddit)
        if os.path.exists(legacy_file):
            with open(legacy_file) as fh:
                css = fh.read()
            self._process_stylesheet_response(200, css, "text/css", subreddit)
        else:
            logger.error(
                "No css file found for legacy subreddit {}".format(subreddit))
    pool.shutdown()
    pool.join()
def _download_images(self):
    """Download every referenced emote image that is not already cached."""
    logger.debug("Downloading images using {} threads".format(self.workers))
    pool = WorkerPool(size=self.workers)
    by_image = lambda e: e['background-image']
    with self.mutex:
        ordered = sorted(self.emotes, key=by_image)
        for url, _members in itertools.groupby(ordered, by_image):
            if not url:
                continue
            destination = self.get_file_path(url, rootdir=self.cache_dir)
            if os.path.isfile(destination):
                continue  # already cached
            # Temp workaround for downloading apngs straight from amazon
            # instead of broken ones from cloudflare
            url = re.sub(r'^(https?:)?//', 'https://s3.amazonaws.com/', url)
            pool.put(DownloadJob(self._requests, url, retry=5,
                                 rate_limit_lock=self.rate_limit_lock,
                                 callback=self._callback_download_image,
                                 **{'image_path': destination}))
    pool.shutdown()
    pool.join()
def _fetch_css(self):
    """Fetch subreddit stylesheets, preferring the on-disk cache when allowed.

    When ``self.prefer_cache`` is set and ``<cache_dir>/<subreddit>.css``
    exists, the cached css is handed to ``_handle_css`` directly; anything
    else is queued on the worker pool for download.
    """
    import io  # local import: keeps this variant self-contained

    logger.debug("Fetching css using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)
    if not os.path.exists(self.cache_dir):
        os.makedirs(self.cache_dir)
    for subreddit in self.subreddits:
        css_path = os.path.sep.join([self.cache_dir, subreddit + ".css"])
        if self.prefer_cache and os.path.exists(css_path):
            # BUG FIX: the original did css_file.read().decode("utf8"),
            # which crashes on Python 3 where str has no .decode().
            # io.open decodes on read and yields unicode text on both
            # Python 2 and Python 3.
            with io.open(css_path, encoding="utf8") as css_file:
                css = css_file.read()
            self._handle_css(css, subreddit)
        else:
            workpool.put(
                DownloadJob(
                    self._requests,
                    "http://www.reddit.com/r/{}/stylesheet".format(subreddit),
                    retry=5,
                    rate_limit_lock=self.rate_limit_lock,
                    callback=self._callback_fetch_stylesheet,
                    **{"subreddit": subreddit}
                )
            )
    workpool.shutdown()
    workpool.join()
def _download_images(self):
    """Download each distinct emote image missing from the local cache."""
    logger.debug("Downloading images using {} threads".format(self.workers))
    pool = WorkerPool(size=self.workers)
    by_image = lambda e: e['background-image']
    with self.mutex:
        # One pass per distinct image URL; emotes sharing a sprite are
        # collapsed by groupby.
        for url, _members in itertools.groupby(
                sorted(self.emotes, key=by_image), by_image):
            if not url:
                continue
            destination = self.get_file_path(url, rootdir=self.cache_dir)
            if os.path.isfile(destination):
                continue  # already cached
            pool.put(DownloadJob(self._requests, url, retry=5,
                                 rate_limit_lock=self.rate_limit_lock,
                                 callback=self._callback_download_image,
                                 **{'image_path': destination}))
    pool.shutdown()
    pool.join()
def _download_images(self):
    """Download every uncached emote image, forcing https scheme."""
    logger.debug("Downloading images using {} threads".format(self.workers))
    pool = WorkerPool(size=self.workers)
    by_image = lambda e: e["background-image"]
    with self.mutex:
        distinct = itertools.groupby(sorted(self.emotes, key=by_image),
                                     by_image)
        for url, _members in distinct:
            if not url:
                continue
            destination = self.get_file_path(url, rootdir=self.cache_dir)
            if os.path.isfile(destination):
                continue  # already cached
            # Normalize http:// and protocol-relative // URLs to https://.
            url = re.sub(r"^(https?:)?//", "https://", url)
            pool.put(DownloadJob(self._requests,
                                 url,
                                 retry=5,
                                 rate_limit_lock=self.rate_limit_lock,
                                 callback=self._callback_download_image,
                                 **{"image_path": destination}))
    pool.shutdown()
    pool.join()
def download_images(self):
    """Download both normal and hover emote images that are not cached yet."""
    logger.info('Beginning download_images()')
    logger.debug("Downloading images using {} threads".format(self.workers))
    pool = WorkerPool(size=self.workers)

    def queue_missing(key_of):
        # Collapse emotes that share the same image URL before queuing.
        for url, _grp in itertools.groupby(sorted(self.emotes, key=key_of),
                                           key_of):
            if not url:
                continue
            cached_at = get_file_path(url, rootdir=self.reddit_cache)
            if path.isfile(cached_at):
                continue
            full_url = urlparse.urljoin('https://s3.amazonaws.com/', url)
            pool.put(DownloadJob(self._requests, full_url, retry=5,
                                 rate_limit_lock=self.rate_limit_lock,
                                 callback=self._callback_download_image,
                                 **{'image_path': cached_at}))

    with self.mutex:
        queue_missing(lambda e: e['background-image'])
        queue_missing(lambda e: e.get('hover-background-image'))
    pool.shutdown()
    pool.join()
def fetch_css(self):
    """Queue stylesheet downloads for subreddits missing from the session cache.

    A subreddit is skipped when ``<session_cache>/<name>.css`` can be
    opened; otherwise a DownloadJob for its stylesheet is queued.
    """
    logger.info('Beginning fetch_css()')
    logger.debug("Fetching css using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)
    for subreddit in self.subreddits:
        css_subreddit_path = path.join(
            self.session_cache, subreddit.lower()) + '.css'
        try:
            # Cache probe: opening succeeds iff a cached copy exists.
            with open(css_subreddit_path, 'r'):
                pass
        except (IOError, OSError):
            # BUG FIX: was a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit and hid unrelated errors.
            # Only filesystem failures should trigger a download.
            workpool.put(
                DownloadJob(
                    self._requests,
                    'https://pay.reddit.com/r/{}/stylesheet'.format(
                        subreddit),
                    retry=5,
                    rate_limit_lock=self.rate_limit_lock,
                    callback=self._callback_fetch_stylesheet,
                    **{'subreddit': subreddit}))
    workpool.shutdown()
    workpool.join()
def _fetch_css(self):
    """Queue a stylesheet download job for every configured subreddit."""
    logger.debug("Fetching css using {} threads".format(self.workers))
    pool = WorkerPool(size=self.workers)
    for name in self.subreddits:
        stylesheet_url = 'http://www.reddit.com/r/{}/stylesheet'.format(name)
        job = DownloadJob(self._requests, stylesheet_url, retry=5,
                          rate_limit_lock=self.rate_limit_lock,
                          callback=self._callback_fetch_stylesheet,
                          **{'subreddit': name})
        pool.put(job)
    pool.shutdown()
    pool.join()
def _process_emotes(self):
    """Hand each distinct emote image, with its emote group, to a processor."""
    logger.debug("Processing emotes using {} threads".format(self.workers))
    pool = WorkerPool(self.workers)
    by_image = lambda e: e['background-image']
    with self.mutex:
        for url, members in itertools.groupby(
                sorted(self.emotes, key=by_image), by_image):
            if not url:
                continue
            processor = self.processor_factory.new_processor(
                scraper=self, image_url=url, group=list(members))
            pool.put(processor)
    pool.shutdown()
    pool.join()
def _fetch_css(self):
    """Download the stylesheet of each subreddit on the worker pool."""
    logger.debug("Fetching css using {} threads".format(self.workers))
    pool = WorkerPool(size=self.workers)
    for sub in self.subreddits:
        pool.put(DownloadJob(
            self._requests,
            'http://www.reddit.com/r/{}/stylesheet'.format(sub),
            retry=5,
            rate_limit_lock=self.rate_limit_lock,
            callback=self._callback_fetch_stylesheet,
            **{'subreddit': sub}))
    pool.shutdown()
    pool.join()
def _process_emotes(self):
    """Dispatch one processor job per distinct emote image URL."""
    logger.debug("Processing emotes using {} threads".format(self.workers))
    pool = WorkerPool(self.workers)
    image_of = lambda e: e['background-image']
    with self.mutex:
        ordered = sorted(self.emotes, key=image_of)
        for url, members in itertools.groupby(ordered, image_of):
            if url:
                pool.put(self.processor_factory.new_processor(
                    scraper=self, image_url=url, group=list(members)))
    pool.shutdown()
    pool.join()
def fetch_css(self):
    """Fetch stylesheets for subreddits that have no cached copy.

    Skips any subreddit whose ``<session_cache>/<name>.css`` file opens
    successfully; all others get a DownloadJob on the worker pool.
    """
    logger.info('Beginning fetch_css()')
    logger.debug("Fetching css using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)
    for subreddit in self.subreddits:
        css_subreddit_path = path.join(
            self.session_cache, subreddit.lower()) + '.css'
        try:
            # Existence probe for the cached stylesheet.
            with open(css_subreddit_path, 'r'):
                pass
        except (IOError, OSError):
            # BUG FIX: previously a bare ``except:`` that also caught
            # KeyboardInterrupt/SystemExit and masked programming errors;
            # only a failed open should schedule a download.
            workpool.put(DownloadJob(
                self._requests,
                'https://pay.reddit.com/r/{}/stylesheet'.format(subreddit),
                retry=5,
                rate_limit_lock=self.rate_limit_lock,
                callback=self._callback_fetch_stylesheet,
                **{'subreddit': subreddit}))
    workpool.shutdown()
    workpool.join()
def _download_images(self):
    """Download every uncached emote image, rerouting fetches through S3.

    Emotes are grouped by image URL so each sprite is fetched once; URLs
    are rewritten to s3.amazonaws.com as a temporary workaround for
    broken apngs served via cloudflare.
    """
    logger.debug("Downloading images using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)
    # cache emotes
    key_func = lambda e: e['background-image']
    with self.mutex:
        for image_url, group in itertools.groupby(
                sorted(self.emotes, key=key_func), key_func):
            if not image_url:
                continue
            file_path = self.get_file_path(image_url, rootdir=self.cache_dir)
            if not os.path.isfile(file_path):
                # Temp workaround for downloading apngs straight from amazon
                # instead of broken ones from cloudflare.
                # BUG FIX: the old str.replace('http://', ...) missed
                # https:// and protocol-relative // URLs (and matched
                # anywhere in the string); use the anchored regex the
                # sibling implementation uses.
                image_url = re.sub(r'^(https?:)?//',
                                   'https://s3.amazonaws.com/', image_url)
                workpool.put(DownloadJob(self._requests, image_url, retry=5,
                                         rate_limit_lock=self.rate_limit_lock,
                                         callback=self._callback_download_image,
                                         **{'image_path': file_path}))
    workpool.shutdown()
    workpool.join()