class Dagr: """deviantArt gallery ripper class""" NAME = basename(__file__) __version__ = "0.71.3" MAX_DEVIATIONS = 1000000 # max deviations ART_PATTERN = (r"https://www\.deviantart\.com/" r"[a-zA-Z0-9_-]*/art/[a-zA-Z0-9_-]*") def __init__(self): # Internals self.init_mimetypes() self.browser = None self.errors_count = dict() # Configuration self.directory = getcwd() + "/" self.mature = False self.overwrite = False self.reverse = False self.test_only = False self.verbose = False # Current status self.deviant = "" def init_mimetypes(self): mimetypes_init() # These MIME types may be missing from some systems add_mimetype('image/vnd.adobe.photoshop', '.psd') add_mimetype('image/photoshop', '.psd') add_mimetype('application/rar', '.rar') add_mimetype('application/x-rar-compressed', '.rar') add_mimetype('application/x-rar', '.rar') add_mimetype('image/x-canon-cr2', '.tif') add_mimetype('application/x-7z-compressed', '.7z') add_mimetype('application/x-lha', '.lzh') def load_configuration(self): my_conf = configparser.ConfigParser() # Try to read global then local configuration my_conf.read([ expanduser("~/.config/dagr/dagr_settings.ini"), path_join(getcwd(), "dagr_settings.ini") ]) if my_conf.has_option("DeviantArt", "MatureContent"): self.mature = my_conf.getboolean("DeviantArt", "MatureContent") if my_conf.has_option("Dagr", "OutputDirectory"): self.directory = abspath( expanduser(my_conf.get("Dagr", "OutputDirectory"))) + "/" def start(self): if not self.browser: # Set up fake browser self.set_browser() def set_browser(self): user_agents = ( 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1' ' (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50' ' (KHTML, like Gecko) Version/5.1 Safari/534.50', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)', 'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US)' ' AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2' ' (KHTML, like Gecko) Chrome/6.0', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1)' ' Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)') session = req_session() session.headers.update({'Referer': 'https://www.deviantart.com/'}) if self.mature: session.cookies.update({'agegate_state': '1'}) session.mount('https://', req_adapters.HTTPAdapter(max_retries=3)) self.browser = StatefulBrowser(session=session, user_agent=choice(user_agents)) def get(self, url, file_name=None): if (file_name and not self.overwrite and glob(file_name + ".*")): print(glob(file_name + ".*")[0] + " exists - skipping") return None if isinstance(url, Tag): # Download and save soup links get_resp = self.browser.download_link(url, file_name) else: # Direct URL get_resp = self.browser.session.get(url) if file_name: with open(file_name, "wb") as local_file: local_file.write(get_resp.content) if get_resp.status_code != req_codes.ok: raise DagrException("incorrect status code - " + str(get_resp.status_code)) if file_name is None: return get_resp.text if get_resp.headers.get("last-modified"): # Set file dates to last modified time mod_time = mktime(parsedate(get_resp.headers.get("last-modified"))) utime(file_name, (mod_time, mod_time)) if get_resp.headers.get("content-type"): content_type = get_resp.headers.get("content-type").split(";")[0] file_ext = guess_extension(content_type) if file_ext: 
rename(file_name, file_name + file_ext) else: raise DagrException('unknown content-type - ' + content_type) return file_name def find_link(self, link): filelink = None filename = basename(link) mature_error = False self.browser.open(link) # Full image link (via download link) link_text = re.compile("Download( (Image|File))?") img_link = None for candidate in self.browser.links("a"): if link_text.search(candidate.text) and candidate.get("href"): img_link = candidate break if img_link and img_link.get("data-download_url"): return (filename, img_link) if self.verbose: print("Download link not found, falling back to direct image") current_page = self.browser.get_current_page() # Fallback 1: try meta (filtering blocked meta) filesearch = current_page.find("meta", {"property": "og:image"}) if filesearch: filelink = filesearch['content'] if basename(filelink).startswith("noentrythumb-"): filelink = None mature_error = True if not filelink: # Fallback 2: try collect_rid, full filesearch = current_page.find("img", { "collect_rid": True, "class": re.compile(".*full") }) if not filesearch: # Fallback 3: try collect_rid, normal filesearch = current_page.find("img", { "collect_rid": True, "class": re.compile(".*normal") }) if filesearch: filelink = filesearch['src'] if current_page.find("span", { "itemprop": "title" }).text == "Literature": filelink = self.browser.get_url() return (filename, filelink) if not filelink: if mature_error: if self.mature: raise DagrException("maybe not an image") else: raise DagrException("maybe a mature deviation/" + "not an image") else: raise DagrException("all attemps to find a link failed") return (filename, filelink) def handle_download_error(self, link, link_error): error_string = str(link_error) print("Download error (" + link + ") : " + error_string) if error_string in self.errors_count: self.errors_count[error_string] += 1 else: self.errors_count[error_string] = 1 def get_pages(self, mode, base_url): pages = [] for i in range(0, int(Dagr.MAX_DEVIATIONS / 24), 24): html = "" url = base_url + str(i) try: html = self.get(url) except DagrException: print("Could not find " + self.deviant + "'s " + mode) return pages prelim = re.findall(Dagr.ART_PATTERN, html, re.IGNORECASE | re.DOTALL) for match in prelim: if match not in pages: pages.append(match) done = re.findall( "(This section has no deviations yet!|" "This collection has no items yet!)", html, re.IGNORECASE | re.S) if done: break print(self.deviant + "'s " + mode + " page " + str(int((i / 24) + 1)) + " crawled...") if not self.reverse: pages.reverse() return pages def get_images(self, mode, mode_arg, pages): base_dir = self.directory + self.deviant + "/" + mode if mode_arg: base_dir += "/" + mode_arg try: da_make_dirs(base_dir) except OSError as mkdir_error: print(str(mkdir_error)) return # Find previously downloaded pages existing_pages = [] try: with open(base_dir + "/.dagr_downloaded_pages", "r") as filehandle: existing_pages = json.load(filehandle) except FNF_ERROR: # May not exist (new directory, ...) 
pass if not self.overwrite: pages = [x for x in pages if x not in existing_pages] print("Total deviations to download: " + str(len(pages))) for count, link in enumerate(pages, start=1): if self.verbose: print("Downloading " + str(count) + " of " + str(len(pages)) + " ( " + link + " )") filename = "" filelink = "" try: filename, filelink = self.find_link(link) except (KeyboardInterrupt, SystemExit): raise except DagrException as link_error: self.handle_download_error(link, link_error) continue if not self.test_only: try: self.get(filelink, base_dir + "/" + filename) except DagrException as get_error: self.handle_download_error(link, get_error) continue else: if link not in existing_pages: existing_pages.append(link) else: print(filelink) # Update downloaded pages cache with open(base_dir + "/.dagr_downloaded_pages", "w") as filehandle: json.dump(existing_pages, filehandle) def deviant_get(self, mode, mode_arg=None): print("Ripping " + self.deviant + "'s " + mode + "...") base_url = "https://www.deviantart.com/" + self.deviant.lower() + "/" if mode == "favs": base_url += "favourites/?catpath=/&offset=" elif mode == "collection": base_url += "favourites/" + mode_arg + "?offset=" elif mode == "scraps": base_url += "gallery/?catpath=scraps&offset=" elif mode == "gallery": base_url += "gallery/?catpath=/&offset=" elif mode == "album": base_url += "gallery/" + mode_arg + "?offset=" elif mode == "query": base_url += "gallery/?q=" + mode_arg + "&offset=" elif mode == "category": base_url += "gallery/?catpath=" + mode_arg + "&offset=" pages = self.get_pages(mode, base_url) if not pages: print(self.deviant + "'s " + mode + " had no deviations.") return print("Total deviations in " + self.deviant + "'s " + mode + " found: " + str(len(pages))) self.get_images(mode, mode_arg, pages) print(self.deviant + "'s " + mode + " successfully ripped.") def group_get(self, mode): print("Ripping " + self.deviant + "'s " + mode + "...") base_url = 'https://www.deviantart.com/' + self.deviant.lower() + '/' if mode == "favs": base_url += "favourites/" elif mode == "gallery": base_url += "gallery/" folders = [] i = 0 while True: html = self.get(base_url + '?offset=' + str(i)) k = re.findall( 'class="ch-top" href="' + base_url + '([0-9]*/[a-zA-Z0-9_-]*)"', html, re.IGNORECASE) if k == []: break new_folder = False for match in k: if match not in folders: folders.append(match) new_folder = True if not new_folder: break i += 10 # no repeats folders = list(set(folders)) if not folders: print(self.deviant + "'s " + mode + " is empty.") print("Total folders in " + self.deviant + "'s " + mode + " found: " + str(len(folders))) if self.reverse: folders.reverse() pages = [] for folder in folders: label = folder.split("/")[-1] print("Crawling folder " + label + "...") pages = self.get_pages(mode, base_url + folder + '?offset=') if not self.reverse: pages.reverse() self.get_images(mode, label, pages) print(self.deviant + "'s " + mode + " successfully ripped.") def print_errors(self): if self.errors_count: print("Download errors count:") for error in self.errors_count: print("* " + error + " : " + str(self.errors_count[error]))
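# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal, hypothetical driver for the Dagr class above; "SomeArtist" is a
# placeholder account name, and the module-level setup Dagr depends on is
# assumed to be in place.
ripper = Dagr()
ripper.load_configuration()     # read dagr_settings.ini if present
ripper.start()                  # set up the fake-browser session
ripper.deviant = "SomeArtist"   # placeholder deviant name
ripper.deviant_get("gallery")   # crawl and download the main gallery
ripper.print_errors()           # report any download errors encountered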
class Dagr: """deviantArt gallery ripper class""" NAME = basename(__file__) __version__ = "0.71.3" MAX_DEVIATIONS = 1000000 # max deviations ART_PATTERN = (r"https://www\.deviantart\.com/" r"[a-zA-Z0-9_-]*/art/[a-zA-Z0-9_-]*") def __init__(self): # Internals self.init_mimetypes() self.browser = None self.errors_count = dict() # Configuration self.directory = getcwd() + "/" self.mature = False self.overwrite = False self.reverse = False self.test_only = False self.verbose = False # Current status self.deviant = "" def init_mimetypes(self): mimetypes_init() # These MIME types may be missing from some systems add_mimetype('image/vnd.adobe.photoshop', '.psd') add_mimetype('image/photoshop', '.psd') add_mimetype('application/rar', '.rar') add_mimetype('application/x-rar-compressed', '.rar') add_mimetype('application/x-rar', '.rar') add_mimetype('image/x-canon-cr2', '.tif') add_mimetype('application/x-7z-compressed', '.7z') add_mimetype('application/x-lha', '.lzh') def load_configuration(self): my_conf = configparser.ConfigParser() # Try to read global then local configuration my_conf.read([expanduser("~/.config/dagr/dagr_settings.ini"), path_join(getcwd(), "dagr_settings.ini")]) if my_conf.has_option("DeviantArt", "MatureContent"): self.mature = my_conf.getboolean("DeviantArt", "MatureContent") if my_conf.has_option("Dagr", "OutputDirectory"): self.directory = abspath( expanduser(my_conf.get("Dagr", "OutputDirectory")) ) + "/" def start(self): if not self.browser: # Set up fake browser self.set_browser() def set_browser(self): user_agents = ( 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1' ' (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50' ' (KHTML, like Gecko) Version/5.1 Safari/534.50', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)', 'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US)' ' AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2' ' (KHTML, like Gecko) Chrome/6.0', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1)' ' Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)' ) session = req_session() session.headers.update({'Referer': 'https://www.deviantart.com/'}) if self.mature: session.cookies.update({'agegate_state': '1'}) session.mount('https://', req_adapters.HTTPAdapter(max_retries=3)) self.browser = StatefulBrowser(session=session, user_agent=choice(user_agents)) def get(self, url, file_name=None): if (file_name and not self.overwrite and glob(file_name + ".*")): print(glob(file_name + ".*")[0] + " exists - skipping") return None if isinstance(url, Tag): # Download and save soup links get_resp = self.browser.download_link(url, file_name) else: # Direct URL get_resp = self.browser.session.get(url) if file_name: with open(file_name, "wb") as local_file: local_file.write(get_resp.content) if get_resp.status_code != req_codes.ok: raise DagrException("incorrect status code - " + str(get_resp.status_code)) if file_name is None: return get_resp.text if get_resp.headers.get("last-modified"): # Set file dates to last modified time mod_time = mktime(parsedate(get_resp.headers.get("last-modified"))) utime(file_name, (mod_time, mod_time)) if get_resp.headers.get("content-type"): content_type = get_resp.headers.get("content-type").split(";")[0] file_ext = guess_extension(content_type) if file_ext: 
rename(file_name, file_name + file_ext) else: raise DagrException('unknown content-type - ' + content_type) return file_name def find_link(self, link): filelink = None filename = basename(link) mature_error = False self.browser.open(link) # Full image link (via download link) link_text = re.compile("Download( (Image|File))?") img_link = None for candidate in self.browser.links("a"): if link_text.search(candidate.text) and candidate.get("href"): img_link = candidate break if img_link and img_link.get("data-download_url"): return (filename, img_link) if self.verbose: print("Download link not found, falling back to direct image") current_page = self.browser.get_current_page() # Fallback 1: try meta (filtering blocked meta) filesearch = current_page.find("meta", {"property": "og:image"}) if filesearch: filelink = filesearch['content'] if basename(filelink).startswith("noentrythumb-"): filelink = None mature_error = True if not filelink: # Fallback 2: try collect_rid, full filesearch = current_page.find("img", {"collect_rid": True, "class": re.compile(".*full")}) if not filesearch: # Fallback 3: try collect_rid, normal filesearch = current_page.find("img", {"collect_rid": True, "class": re.compile(".*normal")}) if filesearch: filelink = filesearch['src'] if current_page.find( "span", {"itemprop": "title"}).text == "Literature": filelink = self.browser.get_url() return (filename, filelink) if not filelink: if mature_error: if self.mature: raise DagrException("maybe not an image") else: raise DagrException("maybe a mature deviation/" + "not an image") else: raise DagrException("all attemps to find a link failed") return (filename, filelink) def handle_download_error(self, link, link_error): error_string = str(link_error) print("Download error (" + link + ") : " + error_string) if error_string in self.errors_count: self.errors_count[error_string] += 1 else: self.errors_count[error_string] = 1 def get_pages(self, mode, base_url): pages = [] for i in range(0, int(Dagr.MAX_DEVIATIONS / 24), 24): html = "" url = base_url + str(i) try: html = self.get(url) except DagrException: print("Could not find " + self.deviant + "'s " + mode) return pages prelim = re.findall(Dagr.ART_PATTERN, html, re.IGNORECASE | re.DOTALL) for match in prelim: if match not in pages: pages.append(match) done = re.findall("(This section has no deviations yet!|" "This collection has no items yet!)", html, re.IGNORECASE | re.S) if done: break print(self.deviant + "'s " + mode + " page " + str(int((i / 24) + 1)) + " crawled...") if not self.reverse: pages.reverse() return pages def get_images(self, mode, mode_arg, pages): base_dir = self.directory + self.deviant + "/" + mode if mode_arg: base_dir += "/" + mode_arg try: da_make_dirs(base_dir) except OSError as mkdir_error: print(str(mkdir_error)) return # Find previously downloaded pages existing_pages = [] try: with open(base_dir + "/.dagr_downloaded_pages", "r") as filehandle: existing_pages = json.load(filehandle) except FNF_ERROR: # May not exist (new directory, ...) 
pass if not self.overwrite: pages = [x for x in pages if x not in existing_pages] print("Total deviations to download: " + str(len(pages))) for count, link in enumerate(pages, start=1): if self.verbose: print("Downloading " + str(count) + " of " + str(len(pages)) + " ( " + link + " )") filename = "" filelink = "" try: filename, filelink = self.find_link(link) except (KeyboardInterrupt, SystemExit): raise except DagrException as link_error: self.handle_download_error(link, link_error) continue if not self.test_only: try: self.get(filelink, base_dir + "/" + filename) except DagrException as get_error: self.handle_download_error(link, get_error) continue else: if link not in existing_pages: existing_pages.append(link) else: print(filelink) # Update downloaded pages cache with open(base_dir + "/.dagr_downloaded_pages", "w") as filehandle: json.dump(existing_pages, filehandle) def deviant_get(self, mode, mode_arg=None): print("Ripping " + self.deviant + "'s " + mode + "...") base_url = "https://www.deviantart.com/" + self.deviant.lower() + "/" if mode == "favs": base_url += "favourites/?catpath=/&offset=" elif mode == "collection": base_url += "favourites/" + mode_arg + "?offset=" elif mode == "scraps": base_url += "gallery/?catpath=scraps&offset=" elif mode == "gallery": base_url += "gallery/?catpath=/&offset=" elif mode == "album": base_url += "gallery/" + mode_arg + "?offset=" elif mode == "query": base_url += "gallery/?q=" + mode_arg + "&offset=" elif mode == "category": base_url += "gallery/?catpath=" + mode_arg + "&offset=" pages = self.get_pages(mode, base_url) if not pages: print(self.deviant + "'s " + mode + " had no deviations.") return print("Total deviations in " + self.deviant + "'s " + mode + " found: " + str(len(pages))) self.get_images(mode, mode_arg, pages) print(self.deviant + "'s " + mode + " successfully ripped.") def group_get(self, mode): print("Ripping " + self.deviant + "'s " + mode + "...") base_url = 'https://www.deviantart.com/' + self.deviant.lower() + '/' if mode == "favs": base_url += "favourites/" elif mode == "gallery": base_url += "gallery/" folders = [] i = 0 while True: html = self.get(base_url + '?offset=' + str(i)) k = re.findall('class="ch-top" href="' + base_url + '([0-9]*/[a-zA-Z0-9_-]*)"', html, re.IGNORECASE) if k == []: break new_folder = False for match in k: if match not in folders: folders.append(match) new_folder = True if not new_folder: break i += 10 # no repeats folders = list(set(folders)) if not folders: print(self.deviant + "'s " + mode + " is empty.") print("Total folders in " + self.deviant + "'s " + mode + " found: " + str(len(folders))) if self.reverse: folders.reverse() pages = [] for folder in folders: label = folder.split("/")[-1] print("Crawling folder " + label + "...") pages = self.get_pages(mode, base_url + folder + '?offset=') if not self.reverse: pages.reverse() self.get_images(mode, label, pages) print(self.deviant + "'s " + mode + " successfully ripped.") def print_errors(self): if self.errors_count: print("Download errors count:") for error in self.errors_count: print("* " + error + " : " + str(self.errors_count[error]))
browser["user"] = "******" browser["pwd"] = "ThunderDude" browser.submit_selected() print(f"Znajduję się na stronie {browser.url}") print(f"Pobieram wszystkie kotwice...") anchor_tags = browser.links() links = [anchor_tag["href"] for anchor_tag in anchor_tags] profile_pictures_dir = Path(__file__).parent / "profile_pictures" profile_pictures_dir.mkdir(exist_ok=True) for link in links: print(f"\nPrzechodzę do {link}") browser.open(SITE_URL + link) page_html = browser.page profile_name = page_html.findAll("h2")[0].get_text()[6:] print("Szukam zdjęcia profilowego...") img_src = page_html.findAll("img")[0]["src"] link = page_html.new_tag(name="a") link["href"] = img_src print( f"Pobieram zdjęcie profilowe {profile_name} do folderu profile_images..." ) profile_picture_file = profile_pictures_dir / img_src.replace( "/static/", "") browser.download_link(link=link, file=profile_picture_file)