def fill_queue(self):
    """Populate self.queue with Unsplash photo entries from a random API page.

    Picks a random page of 30 photos, skips images whose size is inadequate,
    and appends (origin_url, image_url, extra_metadata, filename) tuples.
    Also records the time when the API rate limit is nearly exhausted.
    """
    page = random.randint(1, 250)
    url = 'https://api.unsplash.com/photos/?page=%d&per_page=30&client_id=%s' % (
        page, UnsplashDownloader.CLIENT_ID)
    logger.info(lambda: "Filling Unsplash queue from " + url)

    r = Util.request(url)

    # Remember when we got close to the rate limit so later calls can back off.
    if int(r.headers.get('X-Ratelimit-Remaining', 1000000)) < 100:
        UnsplashDownloader.rate_limiting_started_time = time.time()

    for item in r.json():
        try:
            width = item['width']
            height = item['height']
            if self.parent and not self.parent.size_ok(width, height):
                continue

            image_url = item['links']['download']
            origin_url = item['links']['html']
            # Second-to-last URL path segment serves as a stable image id.
            filename = os.path.join(
                self.target_folder,
                Util.sanitize_filename(image_url.split('/')[-2] + '.jpg'))

            extra_metadata = {
                'sourceType': 'unsplash',
                'sfwRating': 100,
                'author': item['user']['name'],
                'authorURL': item['user']['links']['html'],
                'keywords': [cat['title'].lower().strip() for cat in item['categories']]
            }

            self.queue.append((origin_url, image_url, extra_metadata, filename))
        except Exception:
            # Was a bare "except:" — that would also swallow SystemExit and
            # KeyboardInterrupt before re-raising; catch Exception instead.
            logger.exception(lambda: "Could not process an item from Unsplash")
            raise

    random.shuffle(self.queue)
    logger.info(lambda: "Unsplash populated with %d URLs" % len(self.queue))
def fill_queue(self):
    """Build and return a list of Unsplash queue entries.

    Returns an empty list while the rate-limit cool-down (1 hour) is active.
    Otherwise fetches a random API page and returns shuffled
    (origin_url, image_url, extra_metadata) tuples.
    """
    # Back off for an hour once the rate limit was (nearly) reached.
    if time.time() - UnsplashDownloader.rate_limiting_started_time < 3600:
        logger.info(
            lambda: "Unsplash queue empty, but rate limit reached, will try again later"
        )
        return []

    page = random.randint(1, 250)
    url = 'https://api.unsplash.com/photos/?page=%d&per_page=30&client_id=%s' % (
        page, UnsplashDownloader.CLIENT_ID)
    logger.info(lambda: "Filling Unsplash queue from " + url)

    r = Util.request(url)
    if int(r.headers.get('X-Ratelimit-Remaining', 1000000)) < 100:
        UnsplashDownloader.rate_limiting_started_time = time.time()

    queue = []
    for item in r.json():
        try:
            width = item['width']
            height = item['height']
            if self.is_size_inadequate(width, height):
                continue

            image_url = item['urls']['full']
            origin_url = item['links']['html'] + UnsplashDownloader.UTM_PARAMS

            extra_metadata = {
                'sourceType': 'unsplash',
                'sfwRating': 100,
                'author': item['user']['name'],
                'authorURL': item['user']['links']['html'] + UnsplashDownloader.UTM_PARAMS,
                'keywords': [cat['title'].lower().strip() for cat in item['categories']],
                # Download-location must be reported back to Unsplash when the
                # image is actually used; track whether that happened yet.
                'extraData': {
                    'unsplashDownloadLocation': item['links']['download_location'],
                    'unsplashDownloadReported': False,
                }
            }

            queue.append((origin_url, image_url, extra_metadata))
        except Exception:
            # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt
            # are not intercepted before the re-raise.
            logger.exception(lambda: "Could not process an item from Unsplash")
            raise

    random.shuffle(queue)
    return queue
def fill_queue(self):
    """Build and return a list of Unsplash queue entries.

    Returns an empty list while the rate-limit cool-down (1 hour) is active.
    Otherwise fetches the API URL from get_unsplash_api_url() and returns
    shuffled (origin_url, image_url, extra_metadata) tuples.
    """
    # Back off for an hour once the rate limit was (nearly) reached.
    if time.time() - UnsplashDownloader.rate_limiting_started_time < 3600:
        logger.info(
            lambda: "Unsplash queue empty, but rate limit reached, will try again later"
        )
        return []

    url = self.get_unsplash_api_url()
    logger.info(lambda: "Filling Unsplash queue from " + url)

    r = Util.request(url)
    if int(r.headers.get("X-Ratelimit-Remaining", 1000000)) < 1000:
        UnsplashDownloader.rate_limiting_started_time = time.time()

    queue = []
    for item in r.json():
        try:
            width = item["width"]
            height = item["height"]
            if self.is_size_inadequate(width, height):
                continue

            # Ask Unsplash to serve a width at least 20% wider than the
            # primary display (and never below 1980 px).
            image_url = item["urls"]["full"] + "&w={}".format(
                max(1980, int(Util.get_primary_display_size()[0] * 1.2)))
            origin_url = item["links"]["html"] + UnsplashDownloader.UTM_PARAMS

            extra_metadata = {
                "sourceType": "unsplash",
                "sfwRating": 100,
                "author": item["user"]["name"],
                "authorURL": item["user"]["links"]["html"] + UnsplashDownloader.UTM_PARAMS,
                "keywords": [cat["title"].lower().strip() for cat in item["categories"]],
                # Download-location must be reported back to Unsplash when the
                # image is actually used; track whether that happened yet.
                "extraData": {
                    "unsplashDownloadLocation": item["links"]["download_location"],
                    "unsplashDownloadReported": False,
                },
            }

            queue.append((origin_url, image_url, extra_metadata))
        except Exception:
            # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt
            # are not intercepted before the re-raise.
            logger.exception(lambda: "Could not process an item from Unsplash")
            raise

    random.shuffle(queue)
    return queue
def save_locally(self, origin_url, image_url, source_type=None, source_location=None,
                 source_name=None, force_download=False, extra_metadata=None,
                 local_filename=None):
    """Download image_url into the target folder.

    Skips (returning None) when the origin URL is banned, the file already
    exists, or safe mode blocks the image by SFW rating or keywords.
    Raises the underlying exception if the download itself fails.
    """
    # None default instead of a mutable {} default: a shared dict default
    # would leak state between calls. Behavior for callers is unchanged.
    extra_metadata = extra_metadata or {}

    if not source_type:
        source_type = self.source_type
    if not source_name:
        source_name = self.name
    if not source_location:
        source_location = self.location

    if not force_download and self.parent and origin_url in self.parent.banned:
        logger.info(lambda: "URL " + origin_url + " is banned, skip downloading")
        return None

    # Best-effort: folder may already exist.
    try:
        os.makedirs(self.target_folder)
    except Exception:
        pass

    # Resolve protocol-relative URLs.
    if origin_url.startswith('//'):
        origin_url = 'https:' + origin_url
    if image_url.startswith('//'):
        image_url = origin_url.split('//')[0] + image_url

    if not local_filename:
        local_filename = self.get_local_filename(image_url)
    logger.info(lambda: "Origin URL: " + origin_url)
    logger.info(lambda: "Image URL: " + image_url)
    logger.info(lambda: "Local name: " + local_filename)

    if not force_download and os.path.exists(local_filename):
        logger.info(lambda: "File already exists, skip downloading")
        return None

    if self.parent and self.parent.options.safe_mode:
        sfw_rating = Smart.get_sfw_rating(origin_url)
        if sfw_rating is not None and sfw_rating < 100:
            logger.info(lambda: "Skipping non-safe download %s. Is the source %s:%s "
                                "suitable for Safe mode?" % (origin_url, source_type, self.location))
            return None

    if self.parent and self.parent.options.safe_mode and 'keywords' in extra_metadata:
        blacklisted = set(k.lower() for k in extra_metadata['keywords']) & Smart.get_safe_mode_keyword_blacklist()
        if len(blacklisted) > 0:
            logger.info(lambda: "Skipping non-safe download %s due to blacklisted keywords (%s). "
                                "Is the source %s:%s suitable for Safe mode?"
                                % (origin_url, str(blacklisted), source_type, self.location))
            return None

    try:
        r = Util.request(image_url, stream=True)
        with open(local_filename, 'wb') as f:
            Util.request_write_to(r, f)
    except Exception as e:
        # "except Exception, e" is Python-2-only syntax; "as e" works on both.
        logger.info(lambda: "Download failed from image URL: %s (source location: %s) "
                            % (image_url, self.location))
        raise e
def fill_queue(self):
    """Build and return a list of Unsplash queue entries.

    Returns an empty list while the rate-limit cool-down (1 hour) is active.
    Otherwise fetches a random API page and returns shuffled
    (origin_url, image_url, extra_metadata) tuples.
    """
    # Back off for an hour once the rate limit was (nearly) reached.
    if time.time() - UnsplashDownloader.rate_limiting_started_time < 3600:
        logger.info(
            lambda: "Unsplash queue empty, but rate limit reached, will try again later"
        )
        return []

    page = random.randint(1, 250)
    url = "https://api.unsplash.com/photos/?page=%d&per_page=30&client_id=%s" % (
        page,
        UnsplashDownloader.CLIENT_ID,
    )
    logger.info(lambda: "Filling Unsplash queue from " + url)

    r = Util.request(url)
    if int(r.headers.get("X-Ratelimit-Remaining", 1000000)) < 100:
        UnsplashDownloader.rate_limiting_started_time = time.time()

    queue = []
    for item in r.json():
        try:
            width = item["width"]
            height = item["height"]
            if self.is_size_inadequate(width, height):
                continue

            image_url = item["urls"]["full"]
            origin_url = item["links"]["html"] + UnsplashDownloader.UTM_PARAMS

            extra_metadata = {
                "sourceType": "unsplash",
                "sfwRating": 100,
                "author": item["user"]["name"],
                "authorURL": item["user"]["links"]["html"] + UnsplashDownloader.UTM_PARAMS,
                "keywords": [cat["title"].lower().strip() for cat in item["categories"]],
                # Download-location must be reported back to Unsplash when the
                # image is actually used; track whether that happened yet.
                "extraData": {
                    "unsplashDownloadLocation": item["links"]["download_location"],
                    "unsplashDownloadReported": False,
                },
            }

            queue.append((origin_url, image_url, extra_metadata))
        except Exception:
            # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt
            # are not intercepted before the re-raise.
            logger.exception(lambda: "Could not process an item from Unsplash")
            raise

    random.shuffle(queue)
    return queue
def fill_queue(self):
    """Populate self.queue with Unsplash photo entries from a random API page.

    Skips images whose size is inadequate for the parent's settings and
    appends (origin_url, image_url, extra_metadata, filename) tuples.
    Records the time when the API rate limit is nearly exhausted.
    """
    page = random.randint(1, 250)
    url = 'https://api.unsplash.com/photos/?page=%d&per_page=30&client_id=%s' % (
        page, UnsplashDownloader.CLIENT_ID)
    logger.info(lambda: "Filling Unsplash queue from " + url)

    r = Util.request(url)

    # Remember when we got close to the rate limit so later calls can back off.
    if int(r.headers.get('X-Ratelimit-Remaining', 1000000)) < 100:
        UnsplashDownloader.rate_limiting_started_time = time.time()

    for item in r.json():
        try:
            width = item['width']
            height = item['height']
            if self.parent and not self.parent.size_ok(width, height):
                continue

            image_url = item['links']['download']
            origin_url = item['links']['html']
            # Second-to-last URL path segment serves as a stable image id.
            filename = os.path.join(
                self.target_folder,
                Util.sanitize_filename(image_url.split('/')[-2] + '.jpg'))

            extra_metadata = {
                'sourceType': 'unsplash',
                'sfwRating': 100,
                'author': item['user']['name'],
                'authorURL': item['user']['links']['html'],
                'keywords': [cat['title'].lower().strip() for cat in item['categories']]
            }

            self.queue.append((origin_url, image_url, extra_metadata, filename))
        except Exception:
            # Was a bare "except:" — that would also swallow SystemExit and
            # KeyboardInterrupt before re-raising; catch Exception instead.
            logger.exception(lambda: "Could not process an item from Unsplash")
            raise

    random.shuffle(self.queue)
    logger.info(lambda: "Unsplash populated with %d URLs" % len(self.queue))
def fetch(
    url,
    to_folder,
    origin_url=None,
    source_type=None,
    source_location=None,
    source_name=None,
    extra_metadata=None,
    progress_reporter=lambda a, b: None,
    verbose=True,
):
    """Fetch an image from `url` into `to_folder` and attach metadata.

    Downloads to a ".partial" file first, validates that the content is a
    real image of at least 400x400, writes metadata, then renames into place.
    Returns the final filename, or None when the URL is not a usable image
    or the fetch fails. Progress is reported via `progress_reporter`.
    """
    reported = verbose
    try:
        logger.info(lambda: "Trying to fetch URL %s to %s " % (url, to_folder))
        if verbose:
            progress_reporter(_("Fetching"), url)

        if url.startswith("javascript:"):
            if verbose:
                progress_reporter(_("Not an image"), url)
            return None

        # Treat scheme-less input as a local file path.
        if url.find("://") < 0:
            url = "file://" + url

        r = Util.request(url, stream=True)
        # Idiom fix: was "if not 'content-type' in r.headers".
        if "content-type" not in r.headers:
            logger.info(lambda: "Unknown content-type for url " + url)
            if verbose:
                progress_reporter(_("Not an image"), url)
            return None

        ct = r.headers["content-type"]
        if not ct.startswith("image/"):
            logger.info(lambda: "Unsupported content-type for url " + url + ": " + ct)
            if verbose:
                progress_reporter(_("Not an image"), url)
            return None

        local_name = Util.get_local_name(r.url)
        if "content-disposition" in r.headers:
            cd = r.headers["content-disposition"]
            cd_name = ImageFetcher.extract_filename_from_content_disposition(cd)
            if cd_name:
                local_name = cd_name

        filename = os.path.join(to_folder, local_name)
        if os.path.exists(filename):
            m = Util.read_metadata(filename)
            if m and m.get("imageURL") == url:
                logger.info(lambda: "Local file already exists (%s)" % filename)
                return filename
            else:
                logger.info(
                    lambda: "File with same name already exists, but from different imageURL; renaming new download"
                )
                filename = Util.find_unique_name(filename)

        logger.info(lambda: "Fetching to " + filename)
        if not reported:
            reported = True
            progress_reporter(_("Fetching"), url)

        # Download to a ".partial" file so a crash never leaves a truncated
        # file under the final name.
        local_filepath_partial = filename + ".partial"
        with open(local_filepath_partial, "wb") as f:
            Util.request_write_to(r, f)

        try:
            img = Image.open(local_filepath_partial)
        except Exception:
            progress_reporter(_("Not an image"), url)
            Util.safe_unlink(local_filepath_partial)
            return None

        if img.size[0] < 400 or img.size[1] < 400:
            # too small - delete and do not use
            progress_reporter(_("Image too small, ignoring it"), url)
            Util.safe_unlink(local_filepath_partial)
            return None

        metadata = {
            "sourceType": source_type or "fetched",
            "sourceName": source_name or "Fetched",
            "sourceURL": origin_url or url,
            "imageURL": url,
        }
        if source_location:
            metadata["sourceLocation"] = source_location
        metadata.update(extra_metadata or {})
        Util.write_metadata(local_filepath_partial, metadata)

        # Rename is atomic, so the final name only ever holds complete data.
        os.rename(local_filepath_partial, filename)

        logger.info(lambda: "Fetched %s to %s." % (url, filename))

        return filename
    except Exception as e:  # pylint: disable=no-member
        logger.exception(lambda: "Fetch failed for URL " + url)
        if reported:
            if isinstance(e, HTTPError) and e.response.status_code in (403, 404):
                progress_reporter(
                    _("Sorry, got %s error...") % str(e.response.status_code),
                    _("This means the link is no longer valid"),
                )
            else:
                progress_reporter(
                    _("Fetch failed for some reason"),
                    _("To get more information, please run Variety from terminal with -v option and retry the action"
                      ),
                )
        return None
def save_locally(
    self,
    origin_url,
    image_url,
    source_type=None,
    source_location=None,
    source_name=None,
    force_download=False,
    extra_metadata=None,
    local_filename=None,
    request_headers=None,
    request_kwargs=None,
):
    """Download image_url into the target folder and write Variety metadata.

    Returns the local file path on success, or None when the download is
    skipped: banned origin URL, file already present (unless force_download),
    unsafe keywords, or downloaded data that is not an image.
    Raises the underlying exception if the HTTP download itself fails.
    """
    # Fall back to source-level defaults for anything the caller omitted.
    source_type = source_type or self.get_source_type()
    source_name = source_name or self.get_source_name()
    source_location = source_location or self.get_source_location() or self.get_description()

    if not force_download and self.is_in_banned(origin_url):
        logger.info(lambda: "URL " + origin_url + " is banned, skip downloading")
        return None

    # Best-effort folder creation; the folder may already exist.
    try:
        os.makedirs(self.target_folder)
    except Exception:
        pass

    # Resolve protocol-relative ("//host/...") URLs.
    if origin_url.startswith("//"):
        origin_url = "https:" + origin_url
    if image_url.startswith("//"):
        image_url = origin_url.split("//")[0] + image_url

    # we will download the contents to a ".partial" file, then rename it to the proper name
    if not local_filename:
        local_filename = self.get_local_filename(url=image_url)
    local_filepath = self._local_filepath(local_filename=local_filename)
    local_filepath_partial = local_filepath + ".partial"
    logger.info(lambda: "Origin URL: " + origin_url)
    logger.info(lambda: "Image URL: " + image_url)
    logger.info(lambda: "Local path: " + local_filepath)

    if not force_download and os.path.exists(local_filepath):
        logger.info(lambda: "File already exists, skip downloading")
        return None

    # Safe-mode keyword check delegated to is_unsafe().
    is_unsafe, blacklisted = self.is_unsafe(extra_metadata or {})
    if is_unsafe:
        logger.info(
            lambda: "Skipping non-safe download %s due to blacklisted keywords (%s). "
            "Is the source %s:%s suitable for Safe mode?"
            % (origin_url, str(blacklisted), source_type, source_location))
        return None

    try:
        r = Util.request(image_url, stream=True, headers=request_headers,
                         **(request_kwargs or {}))
        with open(local_filepath_partial, "wb") as f:
            Util.request_write_to(r, f)
    except Exception as e:
        logger.info(
            lambda: "Download failed from image URL: %s (source location: %s) "
            % (image_url, source_location))
        # Clean up the partial file before propagating the failure.
        Util.safe_unlink(local_filepath_partial)
        raise e

    if not Util.is_image(local_filepath_partial, check_contents=True):
        logger.info(
            lambda: "Downloaded data was not an image, image URL might be outdated"
        )
        Util.safe_unlink(local_filepath_partial)
        return None

    metadata = {
        "sourceType": source_type,
        "sourceName": source_name,
        "sourceLocation": source_location,
        "sourceURL": origin_url,
        "imageURL": image_url,
    }
    metadata.update(extra_metadata or {})
    Util.write_metadata(local_filepath_partial, metadata)

    # file rename is an atomic operation, so we should never end up with partial downloads
    os.rename(local_filepath_partial, local_filepath)

    logger.info(lambda: "Download complete")
    return local_filepath
def save_locally(self, origin_url, image_url, source_type=None, source_location=None,
                 source_name=None, force_download=False, extra_metadata=None,
                 local_filename=None):
    """Download image_url into the target folder.

    Skips (returning None) when the origin URL is banned, the file already
    exists, or safe mode blocks the image by SFW rating or keywords.
    Raises the underlying exception if the download itself fails.
    """
    # None default instead of a mutable {} default: a shared dict default
    # would leak state between calls. Behavior for callers is unchanged.
    extra_metadata = extra_metadata or {}

    if not source_type:
        source_type = self.source_type
    if not source_name:
        source_name = self.name
    if not source_location:
        source_location = self.location

    if not force_download and self.parent and origin_url in self.parent.banned:
        logger.info(
            lambda: "URL " + origin_url + " is banned, skip downloading")
        return None

    # Best-effort: folder may already exist.
    try:
        os.makedirs(self.target_folder)
    except Exception:
        pass

    # Resolve protocol-relative URLs.
    if origin_url.startswith('//'):
        origin_url = 'https:' + origin_url
    if image_url.startswith('//'):
        image_url = origin_url.split('//')[0] + image_url

    if not local_filename:
        local_filename = self.get_local_filename(image_url)
    logger.info(lambda: "Origin URL: " + origin_url)
    logger.info(lambda: "Image URL: " + image_url)
    logger.info(lambda: "Local name: " + local_filename)

    if not force_download and os.path.exists(local_filename):
        logger.info(lambda: "File already exists, skip downloading")
        return None

    if self.parent and self.parent.options.safe_mode:
        sfw_rating = Smart.get_sfw_rating(origin_url)
        if sfw_rating is not None and sfw_rating < 100:
            logger.info(
                lambda: "Skipping non-safe download %s. Is the source %s:%s "
                        "suitable for Safe mode?" % (origin_url, source_type, self.location))
            return None

    if self.parent and self.parent.options.safe_mode and 'keywords' in extra_metadata:
        blacklisted = set(k.lower() for k in extra_metadata['keywords']
                          ) & Smart.get_safe_mode_keyword_blacklist()
        if len(blacklisted) > 0:
            logger.info(
                lambda: "Skipping non-safe download %s due to blacklisted keywords (%s). "
                        "Is the source %s:%s suitable for Safe mode?"
                        % (origin_url, str(blacklisted), source_type, self.location))
            return None

    try:
        r = Util.request(image_url, stream=True)
        with open(local_filename, 'wb') as f:
            Util.request_write_to(r, f)
    except Exception as e:
        # "except Exception, e" is Python-2-only syntax; "as e" works on both.
        logger.info(
            lambda: "Download failed from image URL: %s (source location: %s) "
            % (image_url, self.location))
        raise e
def save_locally(self, origin_url, image_url, source_type=None, source_location=None,
                 source_name=None, force_download=False, extra_metadata=None,
                 local_filename=None):
    """Download image_url into the target folder and write Variety metadata.

    Returns the local filename on success, or None when the download is
    skipped: banned origin URL, file already present, blacklisted keywords,
    or downloaded data that is not an image.
    Raises the underlying exception if the HTTP download itself fails.
    """
    # None default instead of a mutable {} default: a shared dict default
    # would leak state between calls. Behavior for callers is unchanged.
    extra_metadata = extra_metadata or {}

    if not source_type:
        source_type = self.source_type
    if not source_name:
        source_name = self.name
    if not source_location:
        source_location = self.location

    if not force_download and self.parent and origin_url in self.parent.banned:
        logger.info(
            lambda: "URL " + origin_url + " is banned, skip downloading")
        return None

    # Best-effort: folder may already exist.
    try:
        os.makedirs(self.target_folder)
    except Exception:
        pass

    # Resolve protocol-relative URLs.
    if origin_url.startswith('//'):
        origin_url = 'https:' + origin_url
    if image_url.startswith('//'):
        image_url = origin_url.split('//')[0] + image_url

    if not local_filename:
        local_filename = self.get_local_filename(image_url)
    logger.info(lambda: "Origin URL: " + origin_url)
    logger.info(lambda: "Image URL: " + image_url)
    logger.info(lambda: "Local name: " + local_filename)

    if not force_download and os.path.exists(local_filename):
        logger.info(lambda: "File already exists, skip downloading")
        return None

    if self.parent and self.parent.options.safe_mode and 'keywords' in extra_metadata:
        blacklisted = set(
            k.lower() for k in extra_metadata['keywords']) & SAFE_MODE_BLACKLIST
        if len(blacklisted) > 0:
            logger.info(
                lambda: "Skipping non-safe download %s due to blacklisted keywords (%s). "
                        "Is the source %s:%s suitable for Safe mode?"
                        % (origin_url, str(blacklisted), source_type, self.location))
            return None

    try:
        r = Util.request(image_url, stream=True)
        with open(local_filename, 'wb') as f:
            Util.request_write_to(r, f)
    except Exception as e:
        logger.info(
            lambda: "Download failed from image URL: %s (source location: %s) "
            % (image_url, self.location))
        raise e

    if not Util.is_image(local_filename, check_contents=True):
        logger.info(
            lambda: "Downloaded data was not an image, image URL might be outdated"
        )
        os.unlink(local_filename)
        return None

    metadata = {
        "sourceType": source_type,
        "sourceName": source_name,
        "sourceLocation": source_location,
        "sourceURL": origin_url,
        "imageURL": image_url
    }
    metadata.update(extra_metadata)
    Util.write_metadata(local_filename, metadata)

    logger.info(lambda: "Download complete")
    return local_filename
def fetch(url, to_folder, origin_url=None, source_type=None, source_location=None,
          source_name=None, extra_metadata=None, progress_reporter=lambda a, b: None,
          verbose=True):
    """Fetch an image from `url` into `to_folder` and attach metadata.

    Validates the content-type and that the downloaded data is a real image
    of at least 400x400. Returns the final filename, or None when the URL is
    not a usable image or the fetch fails. Progress is reported via
    `progress_reporter`.
    """
    reported = verbose
    try:
        logger.info(lambda: "Trying to fetch URL %s to %s " % (url, to_folder))
        if verbose:
            progress_reporter(_("Fetching"), url)

        if url.startswith('javascript:'):
            if verbose:
                progress_reporter(_("Not an image"), url)
            return None

        # Treat scheme-less input as a local file path.
        if url.find('://') < 0:
            url = "file://" + url

        r = Util.request(url, stream=True)
        # Idiom fix: was "if not 'content-type' in r.headers".
        if "content-type" not in r.headers:
            logger.info(lambda: "Unknown content-type for url " + url)
            if verbose:
                progress_reporter(_("Not an image"), url)
            return None

        ct = r.headers["content-type"]
        if not ct.startswith("image/"):
            logger.info(lambda: "Unsupported content-type for url " + url + ": " + ct)
            if verbose:
                progress_reporter(_("Not an image"), url)
            return None

        local_name = Util.get_local_name(r.url)
        if "content-disposition" in r.headers:
            cd = r.headers["content-disposition"]
            cd_name = ImageFetcher.extract_filename_from_content_disposition(cd)
            if cd_name:
                local_name = cd_name

        filename = os.path.join(to_folder, local_name)
        if os.path.exists(filename):
            m = Util.read_metadata(filename)
            if m and m.get("imageURL") == url:
                logger.info(lambda: "Local file already exists (%s)" % filename)
                return filename
            else:
                logger.info(lambda: "File with same name already exists, but from different imageURL; renaming new download")
                filename = Util.find_unique_name(filename)
                local_name = os.path.basename(filename)

        logger.info(lambda: "Fetching to " + filename)
        if not reported:
            reported = True
            progress_reporter(_("Fetching"), url)

        with open(filename, 'wb') as f:
            Util.request_write_to(r, f)

        try:
            img = Image.open(filename)
        except Exception:
            progress_reporter(_("Not an image"), url)
            os.unlink(filename)
            return None

        if img.size[0] < 400 or img.size[1] < 400:
            # too small - delete and do not use
            progress_reporter(_("Image too small, ignoring it"), url)
            os.unlink(filename)
            return None

        metadata = {"sourceType": source_type or 'fetched',
                    "sourceName": source_name or "Fetched",
                    "sourceURL": origin_url or url,
                    "imageURL": url}
        if source_location:
            metadata["sourceLocation"] = source_location
        # None default instead of a mutable {} default: a shared dict default
        # would leak state between calls.
        metadata.update(extra_metadata or {})
        Util.write_metadata(filename, metadata)

        logger.info(lambda: "Fetched %s to %s." % (url, filename))

        return filename
    except Exception as e:
        # "except Exception, e" is Python-2-only syntax; "as e" works on both.
        logger.exception(lambda: "Fetch failed for URL " + url)
        if reported:
            if isinstance(e, HTTPError) and e.response.status_code in (403, 404):
                progress_reporter(
                    _("Sorry, got %s error...") % str(e.response.status_code),
                    _("This means the link is no longer valid"))
            else:
                progress_reporter(
                    _("Fetch failed for some reason"),
                    _("To get more information, please run Variety from terminal with -v option and retry the action"))
        return None