from typing import Optional

from mechanicalsoup import StatefulBrowser


def isbn2url(isbn: str) -> Optional[str]:
    """Return the ketab.ir book-url for the given isbn."""
    # USER_AGENT is a module-level constant defined elsewhere
    browser = StatefulBrowser(user_agent=USER_AGENT)
    browser.open('http://www.ketab.ir/Search.aspx')
    browser.select_form()
    browser['ctl00$ContentPlaceHolder1$TxtIsbn'] = isbn
    browser.submit_selected()
    first_link = browser.get_current_page().select_one('.HyperLink2')
    if first_link is None:
        return None
    return browser.absolute_url(first_link['href'])
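
# Example (a sketch: assumes network access to ketab.ir, a module-level
# USER_AGENT constant, and that the search form's field name is unchanged;
# the ISBN below is a placeholder):
#
#     url = isbn2url('978-964-00-0000-0')
#     if url is not None:
#         print(url)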

# Imports assumed by the Dagr class below, reconstructed from the aliased
# names it uses; DagrException, CacheSettings and da_make_dirs are defined
# elsewhere in the package and are not reproduced here.
import configparser
import json
import re
import traceback
from email.utils import parsedate
from glob import glob
from mimetypes import add_type as add_mimetype
from mimetypes import guess_extension
from mimetypes import init as mimetypes_init
from os import getcwd, utime
from os import remove as os_remove
from os import rename
from os.path import abspath, basename, dirname, expanduser
from os.path import exists as path_exists
from os.path import join as path_join
from random import choice
from time import mktime

import portalocker
from bs4.element import Tag
from mechanicalsoup import StatefulBrowser
from requests import adapters as req_adapters
from requests import codes as req_codes
from requests import session as req_session


class Dagr:
    """deviantArt gallery ripper class"""

    NAME = basename(__file__)
    __version__ = "0.71.3"
    MAX_DEVIATIONS = 1000000  # max deviations
    ART_PATTERN = (r"https://www\.deviantart\.com/"
                   r"[a-zA-Z0-9_-]*/art/[a-zA-Z0-9_-]*")

    def __init__(self):
        # Internals
        self.init_mimetypes()
        self.browser = None
        self.errors_count = dict()
        # Configuration
        self.directory = getcwd() + "/"
        self.mature = False
        self.overwrite = False
        self.reverse = False
        self.test_only = False
        self.verbose = False
        self.save_progress = None
        self.debug = False
        self.retry_exception_names = {}
        self.cache = CacheSettings()
        # Current status
        self.deviant = ""
        self.load_configuration()

    def init_mimetypes(self):
        mimetypes_init()
        # These MIME types may be missing from some systems
        add_mimetype('image/vnd.adobe.photoshop', '.psd')
        add_mimetype('image/photoshop', '.psd')
        add_mimetype('application/rar', '.rar')
        add_mimetype('application/x-rar-compressed', '.rar')
        add_mimetype('application/x-rar', '.rar')
        add_mimetype('image/x-canon-cr2', '.tif')
        add_mimetype('application/x-7z-compressed', '.7z')
        add_mimetype('application/x-lha', '.lzh')
        add_mimetype('application/zip', '.zip')
        add_mimetype('image/x-ms-bmp', '.bmp')
        add_mimetype('application/x-shockwave-flash', '.swf')

    def load_configuration(self):
        my_conf = configparser.ConfigParser()
        # Try to read global then local configuration
        my_conf.read([
            expanduser("~/.config/dagr/dagr_settings.ini"),
            path_join(getcwd(), "dagr_settings.ini")
        ])
        if my_conf.has_option("DeviantArt", "MatureContent"):
            self.mature = my_conf.getboolean("DeviantArt", "MatureContent")
        if my_conf.has_option("Dagr", "OutputDirectory"):
            self.directory = abspath(
                expanduser(my_conf.get("Dagr", "OutputDirectory"))) + "/"
        if my_conf.has_option("Dagr", "SaveProgress"):
            self.save_progress = my_conf.getint("Dagr", "SaveProgress")
        if my_conf.has_option("Dagr", "Verbose"):
            self.verbose = my_conf.getboolean("Dagr", "Verbose")
        if my_conf.has_option("Dagr", "Debug"):
            self.debug = my_conf.getboolean("Dagr", "Debug")
        if my_conf.has_option("Dagr.Cache", "FileNames"):
            self.cache.file_names = my_conf.get("Dagr.Cache", "FileNames")
        if my_conf.has_option("Dagr.Cache", "DownloadedPages"):
            self.cache.downloaded_pages = my_conf.get("Dagr.Cache",
                                                      "DownloadedPages")
        if my_conf.has_option("Dagr.Cache", "Artists"):
            self.cache.artists = my_conf.get("Dagr.Cache", "Artists")
        if my_conf.has_option("Dagr.Cache", "Meta"):
            self.cache.meta = my_conf.get("Dagr.Cache", "Meta")
        if my_conf.has_option("Dagr.Cache", "IndexFile"):
            self.cache.index_file = my_conf.get("Dagr.Cache", "IndexFile")
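
    # A sketch of a dagr_settings.ini that load_configuration above would
    # pick up (values and cache file names are illustrative, not defaults):
    #
    #     [DeviantArt]
    #     MatureContent = yes
    #
    #     [Dagr]
    #     OutputDirectory = ~/deviantart
    #     SaveProgress = 50
    #     Verbose = no
    #
    #     [Dagr.Cache]
    #     FileNames = .filenames.json
    #     DownloadedPages = .dagr_downloaded_pages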

    def start(self):
        if not self.browser:
            # Set up fake browser
            self.set_browser()

    def set_browser(self):
        user_agents = (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1'
            ' (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50'
            ' (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
            'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US)'
            ' AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2'
            ' (KHTML, like Gecko) Chrome/6.0',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1)'
            ' Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)')
        session = req_session()
        session.headers.update({'Referer': 'https://www.deviantart.com/'})
        if self.mature:
            session.cookies.update({'agegate_state': '1'})
        session.mount('https://', req_adapters.HTTPAdapter(max_retries=3))
        self.browser = StatefulBrowser(session=session,
                                       user_agent=choice(user_agents))

    def get_response(self, url, *args, **kwargs):
        if isinstance(url, Tag):
            if hasattr(url, 'attrs') and 'href' in url.attrs:
                url = self.browser.absolute_url(url['href'])
        return self.browser.session.get(url, timeout=150)

    def response_get_content_type(self, response):
        if "content-type" in response.headers:
            return next(
                iter(response.headers.get("content-type").split(";")), None)

    def get(self, url, file_name=None, files_list=None):
        if file_name and files_list is None:
            raise ValueError(
                'files_list cannot be None when file_name is specified')
        if file_name and not self.overwrite:
            glob_name = next(
                (fn for fn in files_list if basename(file_name) in fn), None)
            if glob_name:
                print(glob_name, "exists - skipping")
                return None
        response = None
        tries = {}
        while True:
            try:
                response = self.get_response(url)
                break
            except Exception as ex:
                if self.verbose:
                    traceback.print_exc()
                except_name = type(ex).__name__
                if except_name in self.retry_exception_names:
                    if except_name not in tries:
                        tries[except_name] = 0
                    tries[except_name] += 1
                    if tries[except_name] < 3:
                        continue
                    raise DagrException(
                        'Failed to get url: {}'.format(except_name))
                else:
                    raise ex
        if not response.status_code == req_codes.ok:
            raise DagrException("incorrect status code - " +
                                str(response.status_code))
        if not file_name:
            return response.text
        content_type = self.response_get_content_type(response)
        if self.verbose:
            print(content_type)
        if not content_type:
            raise DagrException('missing content-type')
        file_ext = guess_extension(content_type)
        if not file_ext:
            raise DagrException('unknown content-type - ' + content_type)
        file_name += file_ext
        file_name = abspath(file_name)
        file_exists = path_exists(file_name)
        if file_exists and not self.overwrite:
            files_list.append(basename(file_name))
            print(file_name, "exists - skipping")
            return None
        while True:
            try:
                with open(file_name, "wb") as local_file:
                    local_file.write(response.content)
                break
            except Exception as ex:
                if self.verbose:
                    traceback.print_exc()
                except_name = type(ex).__name__
                if except_name in self.retry_exception_names:
                    if except_name not in tries:
                        tries[except_name] = 0
                    tries[except_name] += 1
                    if tries[except_name] < 3:
                        continue
                    raise DagrException(
                        'Failed to write file: {}'.format(except_name))
                else:
                    raise ex
        if response.headers.get("last-modified"):
            # Set file dates to last modified time
            mod_time = mktime(
                parsedate(response.headers.get("last-modified")))
            utime(file_name, (mod_time, mod_time))
        files_list.append(basename(file_name))
        return file_name
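
    # The retry loops in get() above only retry exception types whose names
    # appear in self.retry_exception_names, giving up after three attempts
    # per type. A sketch of opting in to retries for transient requests
    # errors (an assumed configuration, not wired up anywhere in this file):
    #
    #     ripper = Dagr()
    #     ripper.retry_exception_names = {'ConnectionError', 'ReadTimeout'}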

    def find_link(self, link):
        filelink = None
        filename = basename(link)
        linkmeta = {}
        mature_error = False
        self.browser.open(link)
        html = self.browser.get_current_page()
        try:
            linkmeta = {
                'title': html.findAll('a', {'class': 'title'})[0].text,
                'tags': [
                    x.attrs['data-canonical-tag']
                    for x in html.findAll('a', {'class': 'discoverytag'})
                ],
                'description': str(
                    html.findAll(
                        'div', {'class': 'dev-description'})[0].findChild()),
                'copyright': html.findAll('span',
                                          {'class': 'cc-copy'})[0].text,
                'category': html.findAll(
                    'span', {'class': 'dev-about-breadcrumb'})[0].text
            }
        except Exception:
            # Metadata is best-effort; missing fields are simply skipped
            pass
        # Full image link (via download link)
        link_text = re.compile("Download( (Image|File))?")
        img_link = None
        for candidate in self.browser.links("a"):
            if link_text.search(candidate.text) and candidate.get("href"):
                img_link = candidate
                break
        if img_link and img_link.get("data-download_url"):
            return (filename, img_link, linkmeta)
        if self.verbose:
            print("Download link not found, falling back to direct image")
        current_page = self.browser.get_current_page()
        # Fallback 1: try meta (filtering blocked meta)
        filesearch = current_page.find("meta", {"property": "og:image"})
        if filesearch:
            filelink = filesearch['content']
            if basename(filelink).startswith("noentrythumb-"):
                filelink = None
                mature_error = True
        if not filelink:
            # Fallback 2: try collect_rid, full
            filesearch = current_page.find("img", {
                "collect_rid": True,
                "class": re.compile(".*full")
            })
            if not filesearch:
                # Fallback 3: try collect_rid, normal
                filesearch = current_page.find("img", {
                    "collect_rid": True,
                    "class": re.compile(".*normal")
                })
            if filesearch:
                filelink = filesearch['src']
        page_title = current_page.find("span", {"itemprop": "title"})
        if page_title and page_title.text == "Literature":
            filelink = self.browser.get_url()
            return (filename, filelink, linkmeta)
        if not filelink:
            filelink = self.find_video(current_page)
        if not filelink:
            iframe_search = current_page.find('iframe',
                                              {'class': 'flashtime'})
            if iframe_search:
                self.browser.open(iframe_search.attrs['src'])
                current_page = self.browser.get_current_page()
                embed_search = current_page.find('embed',
                                                 {'id': 'sandboxembed'})
                if embed_search:
                    filelink = embed_search.attrs['src']
        if not filelink:
            if mature_error:
                if self.mature:
                    raise DagrException("maybe not an image")
                else:
                    raise DagrException("maybe a mature deviation/"
                                        "not an image")
            else:
                raise DagrException("all attempts to find a link failed")
        return (filename, filelink, linkmeta)

    def find_video(self, current_page):
        try:
            script = self.filter_page_scripts(current_page,
                                              'deviantART.pageData=')
            best_res = self.extract_nested_assign(
                script, ['deviantART.pageData', '"film"', '"sizes"'])[-1]
            return json.loads(
                str(self.extract_nested_assign(best_res, ['"src"'])))
        except (ImportError, StopIteration):
            pass

    def filter_page_scripts(self, current_page, filt):
        return next(content for content in (
            script.get_text()
            for script in current_page.find_all('script',
                                                {'type': 'text/javascript'})
            if not script.has_attr('src')) if content and filt in content)

    def extract_nested_assign(self, node, identifiers):
        from calmjs.parse import es5 as calmjs_es5
        from calmjs.parse.asttypes import (Node as calmjs_node,
                                           Assign as calmjs_assign,
                                           Object as calmjs_obj)
        from calmjs.parse.walkers import Walker as calmjs_walker
        if not isinstance(node, calmjs_node):
            node = calmjs_es5(node)
        walker = calmjs_walker()

        def calmjs_do_extract(node, identifiers):
            identifier = identifiers.pop(0)
            sub_node = next(
                walker.filter(
                    node, lambda n: (isinstance(n, calmjs_assign) and str(
                        n.left) == identifier)))
            if identifiers:
                return self.extract_nested_assign(sub_node, identifiers)
            if isinstance(sub_node.right, calmjs_obj):
                return list(sub_node.right)
            return sub_node.right

        return calmjs_do_extract(node, identifiers)
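
    # extract_nested_assign drills into a parsed ES5 AST one assignment at a
    # time; calmjs represents object-literal properties as Assign nodes, so
    # keys are matched with their quotes included. A sketch of the lookup
    # find_video performs (assumes calmjs is installed; the script text is
    # illustrative):
    #
    #     script = 'deviantART.pageData={"film": {"sizes": [{"src": "a.mp4"}]}};'
    #     ripper.extract_nested_assign(script, ['deviantART.pageData', '"film"'])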

    def handle_download_error(self, link, link_error):
        error_string = str(link_error)
        print("Download error (" + link + ") : " + error_string)
        if error_string in self.errors_count:
            self.errors_count[error_string] += 1
        else:
            self.errors_count[error_string] = 1

    def get_pages(self, mode, base_url):
        pages = []
        for i in range(0, int(Dagr.MAX_DEVIATIONS / 24), 24):
            html = ""
            url = base_url + str(i)
            try:
                html = self.get(url)
            except DagrException:
                print("Could not find " + self.deviant + "'s " + mode)
                return pages
            prelim = re.findall(Dagr.ART_PATTERN, html,
                                re.IGNORECASE | re.DOTALL)
            for match in prelim:
                if match not in pages:
                    pages.append(match)
            done = re.findall(
                "(This section has no deviations yet!|"
                "This collection has no items yet!|"
                "Sorry, we found no relevant results.|"
                "Sorry, we don't have that many results.)", html,
                re.IGNORECASE | re.S)
            if done:
                break
            progress_msg = '{} page {} crawled...'.format(
                mode, int(i / 24) + 1)
            if mode == 'search':
                print(progress_msg)
            else:
                print("{}'s {}".format(self.deviant, progress_msg))
        if not self.reverse:
            pages.reverse()
        return pages

    def load_cache_file(self, base_dir, cache_file):
        full_path = path_join(base_dir, cache_file)
        try:
            if path_exists(full_path):
                with open(full_path, 'r') as filehandle:
                    return json.load(filehandle)
            else:
                if self.verbose:
                    print('Primary {} cache not found'.format(cache_file))
        except Exception:
            print('Unable to load primary {} cache:'.format(cache_file))
            if self.verbose:
                traceback.print_exc()
        full_path += '.bak'
        try:
            if path_exists(full_path):
                with open(full_path, 'r') as filehandle:
                    return json.load(filehandle)
            else:
                if self.verbose:
                    print('Backup {} cache not found'.format(cache_file))
        except Exception:
            print('Unable to load backup {} cache:'.format(cache_file))
            if self.verbose:
                traceback.print_exc()

    def load_cache(self, base_dir, **kwargs):
        def filenames():
            if self.verbose:
                print('Building filenames cache')
            files_list_raw = glob(path_join(base_dir, '*'))
            return [basename(fn) for fn in files_list_raw]

        def downloaded_pages():
            return []

        def artists():
            return {}

        def meta():
            return {}

        cache_defaults = {
            'filenames': filenames,
            'downloaded_pages': downloaded_pages,
            'artists': artists,
            'meta': meta
        }
        for cache_type, cache_file in kwargs.items():
            cache_contents = self.load_cache_file(base_dir, cache_file)
            if cache_contents:
                yield cache_contents
            else:
                if cache_type not in cache_defaults:
                    raise ValueError(
                        'Unknown cache type: {}'.format(cache_type))
                yield cache_defaults[cache_type]()
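
    # load_cache is a generator yielding one cache per keyword argument, in
    # order, so callers unpack positionally. A sketch (cache file names here
    # are illustrative; the real ones come from CacheSettings):
    #
    #     files_list, existing_pages, meta = ripper.load_cache(
    #         base_dir,
    #         filenames='.filenames.json',
    #         downloaded_pages='.dagr_downloaded_pages',
    #         meta='.meta.json')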

    def get_base_dir(self, mode, mode_arg=None):
        if self.deviant:
            base_dir = path_join(self.directory, self.deviant, mode)
        else:
            base_dir = path_join(self.directory, mode)
        if mode_arg:
            base_dir = path_join(base_dir, mode_arg)
        try:
            da_make_dirs(base_dir)
        except OSError as mkdir_error:
            print(str(mkdir_error))
            return
        return base_dir

    def get_images(self, mode, mode_arg, pages):
        base_dir = self.get_base_dir(mode, mode_arg)
        if base_dir:
            try:
                with portalocker.TemporaryFileLock(
                        filename=path_join(base_dir, '.lock'),
                        fail_when_locked=True):
                    # Load caches
                    fn_cache = self.cache.file_names
                    dp_cache = self.cache.downloaded_pages
                    m_cache = self.cache.meta
                    files_list, existing_pages, meta = self.load_cache(
                        base_dir,
                        filenames=fn_cache,
                        downloaded_pages=dp_cache,
                        meta=m_cache)
                    if not self.overwrite:
                        pages = [x for x in pages if x not in existing_pages]
                    print("Total deviations to download: " + str(len(pages)))
                    for count, link in enumerate(pages, start=1):
                        if (self.save_progress
                                and count % self.save_progress == 0):
                            self.update_cache(base_dir, fn_cache, files_list)
                            self.update_cache(base_dir, dp_cache,
                                              existing_pages)
                            self.update_cache(base_dir, m_cache, meta)
                        if self.verbose:
                            print("Downloading " + str(count) + " of " +
                                  str(len(pages)) + " ( " + link + " )")
                        filename = ""
                        filelink = ""
                        linkmeta = {}
                        try:
                            filename, filelink, linkmeta = \
                                self.find_link(link)
                            if self.test_only:
                                print(filelink)
                                continue
                            self.get(filelink, path_join(base_dir, filename),
                                     files_list)
                        except (KeyboardInterrupt, SystemExit):
                            if pages:
                                self.update_cache(base_dir, fn_cache,
                                                  files_list)
                                self.update_cache(base_dir, dp_cache,
                                                  existing_pages)
                                self.update_cache(base_dir, m_cache, meta)
                            raise
                        except DagrException as get_error:
                            pages.remove(link)
                            self.handle_download_error(link, get_error)
                            continue
                        else:
                            if link not in existing_pages:
                                existing_pages.append(link)
                            meta[filename] = linkmeta
                    if pages or (not path_exists(
                            path_join(base_dir, fn_cache)) and files_list):
                        self.update_cache(base_dir, fn_cache, files_list)
                    if pages:
                        self.update_cache(base_dir, dp_cache, existing_pages)
                    if pages or (not path_exists(
                            path_join(base_dir, self.cache.artists))
                                 and existing_pages):
                        self.update_artists(base_dir, existing_pages,
                                            files_list)
                        self.update_cache(base_dir, m_cache, meta)
            except (portalocker.exceptions.LockException,
                    portalocker.exceptions.AlreadyLocked):
                print('Skipping locked directory {}'.format(base_dir))
            except PermissionError:
                print('Unable to unlock {}'.format(base_dir))

    def backup_cache_file(self, file_name):
        backup_name = file_name + '.bak'
        if path_exists(file_name):
            if path_exists(backup_name):
                os_remove(backup_name)
            rename(file_name, backup_name)

    def update_cache(self, base_dir, cache_file, cache_contents):
        full_path = path_join(base_dir, cache_file)
        self.backup_cache_file(full_path)
        if self.verbose:
            print('Updating {} cache'.format(cache_file))
        with open(full_path, 'w') as filehandle:
            json.dump(cache_contents, filehandle, indent=4, sort_keys=True)

    def update_artists(self, base_dir, pages, files_list):
        artists = {}
        for page in pages:
            artist_url = dirname(dirname(page))
            artist_name = basename(artist_url)
            url_basename = basename(page)
            try:
                real_filename = next(fn for fn in files_list
                                     if url_basename in fn)
            except StopIteration as ex:
                print(page, url_basename)
                raise ex
            if artist_name not in artists:
                artists[artist_name] = {
                    'Home Page': artist_url,
                    'Artworks': {}
                }
            artists[artist_name]['Artworks'][real_filename] = page
        self.update_cache(base_dir, self.cache.artists, artists)
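
    # update_artists rebuilds the artists cache from the downloaded pages;
    # the resulting JSON looks like this (illustrative values):
    #
    #     {
    #         "someartist": {
    #             "Home Page": "https://www.deviantart.com/someartist",
    #             "Artworks": {
    #                 "piece.jpg":
    #                     "https://www.deviantart.com/someartist/art/piece-123"
    #             }
    #         }
    #     }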

    def global_search(self, query):
        base_url = 'https://www.deviantart.com/?q=' + query + '&offset='
        pages = self.get_pages('search', base_url)
        if not pages:
            print('No search results for query {}'.format(query))
            return
        print('Total search results found for {} : {}'.format(
            query, len(pages)))
        self.get_images('search', query, pages)
        print('Query successfully ripped.')

    def deviant_get(self, mode, mode_arg=None):
        print("Ripping " + self.deviant + "'s " + mode + "...")
        base_url = "https://www.deviantart.com/" + self.deviant.lower() + "/"
        if mode == "favs":
            base_url += "favourites/?catpath=/&offset="
        elif mode == "collection":
            base_url += "favourites/" + mode_arg + "?offset="
        elif mode == "scraps":
            base_url += "gallery/?catpath=scraps&offset="
        elif mode == "gallery":
            base_url += "gallery/?catpath=/&offset="
        elif mode == "album":
            base_url += "gallery/" + mode_arg + "?offset="
        elif mode == "query":
            base_url += "gallery/?q=" + mode_arg + "&offset="
        elif mode == "category":
            base_url += "gallery/?catpath=" + mode_arg + "&offset="
        pages = self.get_pages(mode, base_url)
        if not pages:
            print(self.deviant + "'s " + mode + " had no deviations.")
            return
        print("Total deviations in " + self.deviant + "'s " + mode +
              " found: " + str(len(pages)))
        self.get_images(mode, mode_arg, pages)
        print(self.deviant + "'s " + mode + " successfully ripped.")

    def group_get(self, mode):
        print("Ripping " + self.deviant + "'s " + mode + "...")
        base_url = 'https://www.deviantart.com/' + self.deviant.lower() + '/'
        if mode == "favs":
            base_url += "favourites/"
        elif mode == "gallery":
            base_url += "gallery/"
        folders = []
        i = 0
        while True:
            html = self.get(base_url + '?offset=' + str(i))
            k = re.findall(
                'class="ch-top" href="' + base_url +
                '([0-9]*/[a-zA-Z0-9_-]*)"', html, re.IGNORECASE)
            if k == []:
                break
            new_folder = False
            for match in k:
                if match not in folders:
                    folders.append(match)
                    new_folder = True
            if not new_folder:
                break
            i += 10
        # no repeats
        folders = list(set(folders))
        if not folders:
            print(self.deviant + "'s " + mode + " is empty.")
        print("Total folders in " + self.deviant + "'s " + mode +
              " found: " + str(len(folders)))
        if self.reverse:
            folders.reverse()
        pages = []
        for folder in folders:
            label = folder.split("/")[-1]
            print("Crawling folder " + label + "...")
            pages = self.get_pages(mode, base_url + folder + '?offset=')
            if not self.reverse:
                pages.reverse()
            self.get_images(mode, label, pages)
        print(self.deviant + "'s " + mode + " successfully ripped.")

    def print_errors(self):
        if self.errors_count:
            print("Download errors count:")
            for error in self.errors_count:
                print("* " + error + " : " + str(self.errors_count[error]))
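

# Minimal usage sketch (hypothetical driver code; the real CLI wiring lives
# outside this excerpt):
#
#     ripper = Dagr()
#     ripper.deviant = 'someartist'
#     ripper.start()                 # sets up the fake browser session
#     ripper.deviant_get('gallery')
#     ripper.print_errors()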