def create_referral(username, counter) -> None:
    """Register one throwaway account through *username*'s referral link.

    A random 10-letter word is used for every sign-up field; on success the
    referral-counter label of the resulting page is printed together with
    *counter*.  All expected network/scraping errors are reported, not raised.
    """
    try:
        # One random uppercase word is reused for every sign-up field.
        letters = string.ascii_uppercase
        random_word: str = ''.join(random.choice(letters) for _ in range(10))
        sign_up_data = {
            'name': f'{random_word}',
            'username': f'{random_word}',
            'email': f'{random_word}@email.com',
            'password': f'{random_word}',
            'password_confirmation': f'{random_word}',
            'check': True,
        }
        url: str = f"https://ref.moneyguru.co/{username}"
        browser = StatefulBrowser()
        browser.open(url)
        # The referral link may land on the bare home page; reopen it there.
        if browser.url == "https://moneyguru.co":
            browser.open(browser.url)
        browser.follow_link('/register')
        browser.select_form('form[action="/register"]')
        # Copy every prepared value into the registration form, in order.
        for field in ('name', 'username', 'email', 'password',
                      'password_confirmation', 'check'):
            browser[field] = sign_up_data[field]
        response = browser.submit_selected()
        # check for successful form submission
        page = browser.page
        ref_label = page.find(
            'div', class_="widget-title opacity-5 text-uppercase")
        if ref_label:
            print(f"{ref_label.text}: {counter}")
    except requests.exceptions.ConnectionError:
        print('You have network connection problem')
    except mechanicalsoup.LinkNotFoundError as e:
        print("I can't find your link", e)
    except mechanicalsoup.InvalidFormMethod as e:
        print("invalid form method", e)
# Log in to a WordPress test site with MechanicalSoup and navigate to the
# "Add New Post" screen, asserting the expected URL after every step.
from mechanicalsoup import StatefulBrowser

home_url = "http://testing.chandrashekar.info/"
username = "******"  # credentials redacted
password = "******"
login_url = "http://testing.chandrashekar.info/wp-login.php"
logged_in_url = "http://testing.chandrashekar.info/wp-admin/"
add_new_post_url = "http://testing.chandrashekar.info/wp-admin/post-new.php"

browser = StatefulBrowser()
browser.open(login_url)
assert browser.get_url() == login_url

# Fill and submit the login form; "log"/"pwd" are the standard
# wp-login.php field names.
browser.select_form()
browser["log"] = username
browser["pwd"] = password
browser.submit_selected()

# Successful login redirects to the admin dashboard.
assert browser.get_url() == logged_in_url
print(browser.get_url())

# Follow the "Add New Post" link from the dashboard.
browser.follow_link("post-new.php")
assert browser.get_url() == add_new_post_url
print(browser.get_url())
# Publish a new WordPress post via MechanicalSoup: log in, open the
# "Add New" post screen, read title/body from stdin, then publish.
# NOTE(review): `login_url` and `username` are not defined in this
# section — they are expected to come from earlier in the file; verify.
password = '******'  # credential redacted
dashboard_url = "http://testing.chandrashekar.info/wp-admin/"
add_new_posts_url = "http://testing.chandrashekar.info/wp-admin/post-new.php"

from mechanicalsoup import StatefulBrowser
import sys

browser = StatefulBrowser()
r = browser.open(login_url)
assert r.ok and browser.get_url() == login_url

# Fill the login form ("log"/"pwd" are WordPress's field names).
loginform = browser.select_form("#loginform")
browser["log"] = username
browser["pwd"] = password
r = browser.submit_selected()
assert r.ok and browser.get_url() == dashboard_url

# Navigate to the post editor.
r = browser.follow_link(text="Add New")
assert r.ok and browser.get_url() == add_new_posts_url

post_form = browser.select_form("#post")
browser["post_title"] = input("Enter blog title: ")
print("Enter blog content below. Press Ctrl-d to complete.")
print("---------------------------------------------------")
browser["content"] = sys.stdin.read()  # read the post body until EOF

# Submit via the explicit "Publish" button and confirm the success
# message WordPress shows after publishing.
publish_btn = browser.get_current_page().select("input#publish")[0]
r = browser.submit_selected(publish_btn)
assert r.ok and browser.get_current_page().select("div#message p")[0].text == "Post published"
#!/usr/bin/env python3
"""Simulate a browser with MechanicalSoup (ported from a Mechanize demo)."""
from bs4 import BeautifulSoup, SoupStrainer
from mechanicalsoup import StatefulBrowser

br = StatefulBrowser()

# home page
rsp = br.open("http://us.pycon.org/2011/home/")
print("\n***", rsp.url)
print("Confirm home page has 'Log in' link; click it")
page = rsp.text
# Bail out if the page has no "Log in" link at all.
assert "Log in" in page, "Log in not in page"
rsp = br.follow_link(br.links(link_text="Log in")[0])

# login page
print("\n***", rsp.url)
print("Confirm at least a login form; submit invalid creds")
# select_form raises mechanicalsoup.LinkNotFoundError when the page has no
# form, so a separate existence assert (and double selection) is unnecessary.
current_form = br.select_form(nr=0)  # select the first form on the page
# BUG FIX: the original assigned via `current_form.form["username"] = ...`,
# which sets an HTML *attribute* on the <form> tag (bs4 Tag.__setitem__)
# instead of filling the input field.  MechanicalSoup fills the selected
# form's fields through the browser's item assignment.
br["username"] = "******"
br["password"] = "******"
rsp = br.submit_selected()

# login page, with error
print("\n***", rsp.url)
print("Error due to invalid creds; resubmit w/valid creds")
assert rsp.url == "https://us.pycon.org/2011/account/login/", rsp.url
page = rsp.text
# locate the login error message
# MechanicalSoup demo: browse python.org, search the blogs page for
# "Raymond Hettinger", and dump the links of the results page.
from mechanicalsoup import StatefulBrowser

browser = StatefulBrowser()
browser.open("http://www.python.org/")
browser.follow_link("/blogs/")
#browser.follow_link(text="Python News")
print(browser.get_url())

# Select the page's form and show its fields for inspection.
browser.select_form()
browser.get_current_form().print_summary()
browser["q"] = "Raymond Hettinger"  # fill the search box
browser.submit_selected()

print(browser.get_url())
print("-" * 40)
print(browser.links())  # every link on the results page
browser.close()
class Dagr:
    """deviantArt gallery ripper class"""

    # Script file name this class lives in.
    NAME = basename(__file__)
    __version__ = "0.70.1"
    MAX_DEVIATIONS = 1000000  # max deviations
    # Matches canonical deviation ("art") page URLs.
    ART_PATTERN = (r"https://www\.deviantart\.com/"
                   r"[a-zA-Z0-9_-]*/art/[a-zA-Z0-9_-]*")

    def __init__(self):
        """Set up internals, default configuration, and status fields."""
        # Internals
        self.browser = None          # StatefulBrowser; created lazily in set_browser()
        self.errors_count = dict()   # error message -> occurrence count
        # Configuration
        self.directory = getcwd() + "/"
        self.mature = False
        self.overwrite = False
        self.reverse = False
        self.test_only = False
        self.verbose = False
        # Current status
        self.deviant = ""            # deviantArt user currently being ripped

    def load_configuration(self):
        """Load settings from global and local dagr_settings.ini files."""
        my_conf = configparser.ConfigParser()
        # Try to read global then local configuration
        my_conf.read([
            expanduser("~/.config/dagr/dagr_settings.ini"),
            path_join(getcwd(), "dagr_settings.ini")
        ])
        if my_conf.has_option("DeviantArt", "MatureContent"):
            self.mature = my_conf.getboolean("DeviantArt", "MatureContent")
        if my_conf.has_option("Dagr", "OutputDirectory"):
            self.directory = abspath(
                expanduser(my_conf.get("Dagr", "OutputDirectory"))) + "/"

    def start(self):
        """Ensure the browser exists before any ripping begins."""
        if not self.browser:
            # Set up fake browser
            self.set_browser()

    def set_browser(self):
        """Create the HTTP session/browser with a random desktop user agent."""
        user_agents = (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1'
            ' (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50'
            ' (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
            'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US)'
            ' AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2'
            ' (KHTML, like Gecko) Chrome/6.0',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1)'
            ' Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)')
        session = req_session()
        session.headers.update({'Referer': 'https://www.deviantart.com/'})
        if self.mature:
            # Cookie that marks the deviantArt age gate as passed.
            session.cookies.update({'agegate_state': '1'})
        # Retry transient HTTPS failures up to 3 times.
        session.mount('https://', req_adapters.HTTPAdapter(max_retries=3))
        self.browser = StatefulBrowser(session=session,
                                       user_agent=choice(user_agents))

    def get(self, url, file_name=None):
        """Fetch *url*; return its text, or save its bytes to *file_name*.

        Returns None when the target file already exists and overwriting
        is disabled.  Raises DagrException on a non-OK status code.
        """
        if (file_name and not self.overwrite and path_exists(file_name)):
            print(file_name + " exists - skipping")
            return None
        get_resp = self.browser.open(url)
        if get_resp.status_code != req_codes.ok:
            raise DagrException("incorrect status code - " +
                                str(get_resp.status_code))
        if file_name is None:
            return get_resp.text
        # Open our local file for writing
        local_file = open(file_name, "wb")
        # Write to our local file
        local_file.write(get_resp.content)
        local_file.close()
        return file_name

    def find_link(self, link):
        """Resolve a deviation page *link* to (file name, direct file URL).

        Tries the "Download" button first, then falls back to page metadata
        and full/normal-size <img> tags.  Raises DagrException when no
        usable link is found (distinguishing age-gated deviations).
        """
        filelink = None
        mature_error = False
        self.browser.open(link)
        # Full image link (via download link)
        link_text = re.compile("Download( (Image|File))?")
        img_link = None
        for candidate in self.browser.links("a"):
            if link_text.search(candidate.text) and candidate.get("href"):
                img_link = candidate
                break
        if img_link:
            self.browser.follow_link(img_link)
            filelink = self.browser.get_url()
            return (basename(filelink), filelink)
        if self.verbose:
            print("Download link not found, falling back to direct image")
        current_page = self.browser.get_current_page()
        # Fallback 1: try meta (filtering blocked meta)
        filesearch = current_page.find("meta", {"property": "og:image"})
        if filesearch:
            filelink = filesearch['content']
            # A "noentrythumb-" thumbnail means the real image is age-gated.
            if basename(filelink).startswith("noentrythumb-"):
                filelink = None
                mature_error = True
        if not filelink:
            # Fallback 2: try collect_rid, full
            filesearch = current_page.find("img", {
                "collect_rid": True,
                "class": re.compile(".*full")
            })
            if not filesearch:
                # Fallback 3: try collect_rid, normal
                filesearch = current_page.find("img", {
                    "collect_rid": True,
                    "class": re.compile(".*normal")
                })
            if filesearch:
                filelink = filesearch['src']
        if not filelink:
            if mature_error:
                if self.mature:
                    raise DagrException("maybe not an image")
                else:
                    raise DagrException("maybe a mature deviation/" +
                                        "not an image")
            else:
                raise DagrException("all attemps to find a link failed")
        filename = basename(filelink)
        return (filename, filelink)

    def handle_download_error(self, link, link_error):
        """Report a download error for *link* and tally it in errors_count."""
        error_string = str(link_error)
        print("Download error (" + link + ") : " + error_string)
        if error_string in self.errors_count:
            self.errors_count[error_string] += 1
        else:
            self.errors_count[error_string] = 1

    def get_pages(self, mode, base_url):
        """Crawl the paginated listing at *base_url*; return deviation URLs."""
        pages = []
        # Listings are paginated 24 items at a time via the offset suffix.
        for i in range(0, int(Dagr.MAX_DEVIATIONS / 24), 24):
            html = ""
            url = base_url + str(i)
            try:
                html = self.get(url)
            except DagrException:
                print("Could not find " + self.deviant + "'s " + mode)
                return pages
            prelim = re.findall(Dagr.ART_PATTERN, html,
                                re.IGNORECASE | re.DOTALL)
            for match in prelim:
                if match not in pages:
                    pages.append(match)
            # Stop when deviantArt reports an empty section/collection.
            done = re.findall(
                "(This section has no deviations yet!|"
                "This collection has no items yet!)",
                html, re.IGNORECASE | re.S)
            if done:
                break
            print(self.deviant + "'s " + mode + " page " +
                  str(int((i / 24) + 1)) + " crawled...")
        if not self.reverse:
            pages.reverse()
        return pages

    def get_images(self, mode, mode_arg, pages):
        """Download every deviation in *pages* into the mode's directory,
        skipping already-downloaded pages and updating the download cache."""
        base_dir = self.directory + self.deviant + "/" + mode
        if mode_arg:
            base_dir += "/" + mode_arg
        try:
            da_make_dirs(base_dir)
        except OSError as mkdir_error:
            print(str(mkdir_error))
            return
        # Find previously downloaded pages
        existing_pages = []
        try:
            with open(base_dir + "/.dagr_downloaded_pages", "r") as filehandle:
                existing_pages = json.load(filehandle)
        except FNF_Error as fnf_error:
            # May not exist (new directory, ...)
            pass
        if not self.overwrite:
            pages = [x for x in pages if x not in existing_pages]
        print("Total deviations to download: " + str(len(pages)))
        for count, link in enumerate(pages, start=1):
            if self.verbose:
                print("Downloading " + str(count) + " of " +
                      str(len(pages)) + " ( " + link + " )")
            filename = ""
            filelink = ""
            try:
                filename, filelink = self.find_link(link)
            except (KeyboardInterrupt, SystemExit):
                raise
            except DagrException as link_error:
                self.handle_download_error(link, link_error)
                continue
            if not self.test_only:
                try:
                    self.get(filelink, base_dir + "/" + filename)
                except DagrException as get_error:
                    self.handle_download_error(link, get_error)
                    continue
                else:
                    # Record the page only after it downloaded cleanly.
                    if link not in existing_pages:
                        existing_pages.append(link)
            else:
                print(filelink)
        # Update downloaded pages cache
        with open(base_dir + "/.dagr_downloaded_pages", "w") as filehandle:
            json.dump(existing_pages, filehandle)

    def deviant_get(self, mode, mode_arg=None):
        """Rip one listing (gallery/favs/scraps/album/...) of self.deviant."""
        print("Ripping " + self.deviant + "'s " + mode + "...")
        base_url = "https://www.deviantart.com/" + self.deviant.lower() + "/"
        # Each mode maps to a different paginated listing URL.
        if mode == "favs":
            base_url += "favourites/?catpath=/&offset="
        elif mode == "collection":
            base_url += "favourites/" + mode_arg + "?offset="
        elif mode == "scraps":
            base_url += "gallery/?catpath=scraps&offset="
        elif mode == "gallery":
            base_url += "gallery/?catpath=/&offset="
        elif mode == "album":
            base_url += "gallery/" + mode_arg + "?offset="
        elif mode == "query":
            base_url += "gallery/?q=" + mode_arg + "&offset="
        pages = self.get_pages(mode, base_url)
        if not pages:
            print(self.deviant + "'s " + mode + " had no deviations.")
            return
        print("Total deviations in " + self.deviant + "'s " + mode +
              " found: " + str(len(pages)))
        self.get_images(mode, mode_arg, pages)
        print(self.deviant + "'s " + mode + " successfully ripped.")

    def group_get(self, mode):
        """Rip a group's folders: discover them, then rip each one."""
        print("Ripping " + self.deviant + "'s " + mode + "...")
        base_url = 'https://www.deviantart.com/' + self.deviant.lower() + '/'
        if mode == "favs":
            base_url += "favourites/"
        elif mode == "gallery":
            base_url += "gallery/"
        folders = []
        i = 0
        # Page through folder listings 10 at a time until nothing new shows up.
        while True:
            html = self.get(base_url + '?offset=' + str(i))
            k = re.findall(
                'class="ch-top" href="' + base_url +
                '([0-9]*/[a-zA-Z0-9_-]*)"', html, re.IGNORECASE)
            if k == []:
                break
            new_folder = False
            for match in k:
                if match not in folders:
                    folders.append(match)
                    new_folder = True
            if not new_folder:
                break
            i += 10
        # no repeats
        folders = list(set(folders))
        if not folders:
            print(self.deviant + "'s " + mode + " is empty.")
        print("Total folders in " + self.deviant + "'s " + mode +
              " found: " + str(len(folders)))
        if self.reverse:
            folders.reverse()
        pages = []
        for folder in folders:
            label = folder.split("/")[-1]
            print("Crawling folder " + label + "...")
            pages = self.get_pages(mode, base_url + folder + '?offset=')
            if not self.reverse:
                pages.reverse()
            self.get_images(mode, label, pages)
        print(self.deviant + "'s " + mode + " successfully ripped.")

    def print_errors(self):
        """Print the per-message tally of download errors, if any."""
        if self.errors_count:
            print("Download errors count:")
            for error in self.errors_count:
                print("* " + error + " : " + str(self.errors_count[error]))