def prepare(self, **kwargs):
    self.browser = StatefulBrowser()
    self._mediatypes = kwargs.get("mediatypes")
    self._qualities = kwargs.get("qualities")
    self._templates = kwargs.get("templates")
    auth_package = kwargs.get("auth")
    if auth_package[0] == types.AuthType.COOKIES:
        jar = requests.cookies.RequestsCookieJar()
        session_values = auth_package[1]["session"]
        jar.set(
            "session",
            session_values["value"],
            domain=".cloud.blender.org",
            path="/",
        )
        self.browser.session.cookies = jar
        # The original compared the status code without checking the result;
        # assert so a failed request is actually caught
        response = self.browser.open("https://cloud.blender.org/settings/profile")
        assert response.status_code == 200
        profile_page = self.browser.get_current_page()
        try:
            assert profile_page.find(class_="py-1") is not None
        except AssertionError:
            echo.error_msg("Authentication was not successful")
            exit(1)
        echo.debug_msg("Authentication successful")

def extract_info(url):
    user_agent = UserAgent(
        fallback="Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0")
    browser = StatefulBrowser(user_agent=user_agent.random)
    browser.open(url)
    page = browser.get_current_page()
    result = re.search(r'"bitcoin:([\w\d]+)\?amount=(\d+\.\d+)&', str(page))
    return PaymentInfo(float(result.group(2)), result.group(1))

def extract_article(browser: mechanicalsoup.StatefulBrowser,
                    url: str) -> "tuple[Article, list, list]":
    browser.open(url)
    page = browser.get_current_page()
    script_data = page.select('script[type="application/ld+json"]')
    first_script_data = json.loads(script_data[1].text)
    second_script_data = json.loads(script_data[0].text)
    header = second_script_data["headline"].replace(u'\xa0', ' ')
    description = second_script_data["description"].replace(u'\xa0', ' ')
    category = find_category(first_script_data)
    published_at = datetime.strptime(second_script_data["datePublished"],
                                     '%Y-%m-%dT%H:%M:%S.%fZ')
    modified_at = datetime.strptime(second_script_data["dateModified"],
                                    '%Y-%m-%dT%H:%M:%S.%fZ')
    authors = retrieve_authors(page)
    paragraphs = retrieve_paragraphs(page)
    print(f"Currently working on article {header}")
    comments = retrieve_comments(url.replace("clanek", "diskuze"))
    article = Article(link=url, header=header, description=description,
                      category=category, published_at=published_at,
                      modified_at=modified_at, paragraphs=paragraphs)
    return article, authors, comments

class WebpageSummarizer(object):
    """
    Generates a summary of a given web page.
    """

    def __init__(self):
        self.browser = StatefulBrowser(
            user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) '
                       'Gecko/20100101 Firefox/73.0')
        self.browser.session.headers.update({'Upgrade-Insecure-Requests': '1'})

    def summarize_webpage(self, url, summarization_ratio):
        """
        Takes a web page URL and returns the title and a summary of the web page.

        :param url: Web page URL.
        :param summarization_ratio: Fraction of original text to include in the summary.
        :return: Web page title and summarized web page text.
        """
        title = summarized_text = ''
        try:
            self.browser.open(url)
            page = self.browser.get_current_page()
            # Join all paragraphs because they contain the main web page text
            page_text = ' '.join(map(lambda p: p.text, page.find_all('p')))
            title = page.title.text.strip()
            # Summarize the page text; fall back to the full text when the
            # summarizer returns nothing
            summarized_text = summarize(page_text, ratio=summarization_ratio).strip()
            if summarized_text == '':
                summarized_text = page_text
        except Exception as e:
            print(e)
        finally:
            self.browser.close()
        return title, summarized_text

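# Usage sketch for WebpageSummarizer above. The URL and ratio are
# illustrative assumptions, not values from the original source.
summarizer = WebpageSummarizer()
title, summary = summarizer.summarize_webpage('https://example.com/article', 0.2)
print(title)
print(summary)
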
def click_ref_link(username: str, counter: int):
    try:
        url: str = f"https://ref.moneyguru.co/{username}"
        browser = StatefulBrowser()
        browser.open(url)
        # A successful referral click redirects to the site root
        if browser.url == "https://moneyguru.co":
            print(f'{counter}: clicked', url)
    except requests.exceptions.ConnectionError:
        print('You have a network connection problem')

def get_network_fee():
    # with web3.py this gives 520 gwei, which is too much
    """
    Give an estimate of the network fee for a simple ether transaction,
    from http://gasprice.dopedapp.com/
    :return: network cost
    """
    br = StatefulBrowser(user_agent="Firefox")
    page = br.open("http://gasprice.dopedapp.com/")
    response = page.json()
    gwei_price = float(response["safe_price_in_gwei"])
    return gwei_price * GWEI_TO_ETHER * NB_GAS_FOR_TRANSACTION

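# get_network_fee() above references two module-level constants that are not
# defined in the snippet. A plausible sketch (names taken from the snippet;
# the values below are the standard conversions, not taken from the original
# source):
GWEI_TO_ETHER = 1e-9            # 1 gwei = 10^-9 ether
NB_GAS_FOR_TRANSACTION = 21000  # gas consumed by a plain ether transfer
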
def isbn2url(isbn: str) -> Optional[str]:
    """Return the ketab.ir book URL for the given ISBN."""
    browser = StatefulBrowser(user_agent=USER_AGENT)
    browser.open('http://www.ketab.ir/Search.aspx')
    browser.select_form()
    browser['ctl00$ContentPlaceHolder1$TxtIsbn'] = isbn
    browser.submit_selected()
    first_link = browser.get_current_page().select_one('.HyperLink2')
    if first_link is None:
        return None
    return browser.absolute_url(first_link['href'])

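# Example call for isbn2url() (the ISBN is an illustrative placeholder;
# USER_AGENT is assumed to be defined at module level, as in the snippet):
book_url = isbn2url('9786000000000')
print(book_url)  # the ketab.ir book page URL, or None if nothing matched
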
def __init__(self, target_session, target_args):
    self._target_session = target_session
    self._browser = StatefulBrowser(session=target_session)
    self._args = target_args
    self._target_url = None
    self._username_password = None
    self._form = None

def _re_login_with_form(self):
    browser = StatefulBrowser(session=self._target_session)
    self._do_login(browser, self._target_url, self._username_password, self._form)
    return browser

def extract_info(cls, url):
    """
    Extracts the amount and Bitcoin address from an UndergroundPrivate payment URL.

    :param url: the URL, like https://spectrocoin.com/en/order/view/1045356-0X6XzpZi.html
    :return: a tuple of the amount in Bitcoin along with the address
    """
    user_agent = UserAgent(
        fallback="Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"
    )
    browser = StatefulBrowser(user_agent=user_agent.random)
    browser.open(url)
    soup = browser.get_current_page()
    amount = soup.select_one('div.payAmount').text.split(" ")[0]
    address = soup.select_one('div.address').text
    return PaymentInfo(float(amount), address)

def search_postcode(searchterm: str):
    """
    Searches the Australia Post site for postcode/suburb/state data.

    :param searchterm: the search string
    :type searchterm: str
    :returns: dicts of postcode=int, suburb=str, state=str
    :rtype: dict
    """
    browser = StatefulBrowser(soup_config={'features': 'lxml'})
    # Uncomment for more verbose output:
    # browser.set_verbose(2)
    # build the URL for the search
    searchurl = "https://auspost.com.au/postcode/{}".format(
        searchterm.replace(' ', '%20'))
    # grab the page
    try:
        browser.open(searchurl)
        # get the page contents
        page = browser.get_current_page()
        # find the lis within the ol
        lis = page.find_all('ol')[0].find_all('li')
        # pull out the data
        data_lis = [li for li in lis if 'id="result' in str(li)]
        for list_element in data_lis:
            # this is the data format found in June 2017:
            # <span class="suburb-map-postcode">POSTCODE</span>
            # <h2>SUBURB, STATE</h2>
            postcode = list_element.find_all('span')
            if postcode:
                postcode = postcode[0].contents[0]
            secondfield = list_element.find_all('h2')
            if secondfield:
                suburb, state = secondfield[0].contents[0].split(",")
            if postcode and state.strip() and suburb.strip():
                yield {
                    'postcode': int(postcode),
                    'state': state.strip(),
                    'suburb': suburb.strip(),
                }
    except Exception:
        raise ConnectionError("Failed to open the url '{}'".format(searchurl))

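# search_postcode() is a generator; a minimal consumption sketch (the search
# term is an illustrative assumption):
for match in search_postcode('broome'):
    print(match['postcode'], match['suburb'], match['state'])
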
def get_browser(self):
    browser = StatefulBrowser()
    url = 'https://tdserebro.ru/login?changeCity=1526384'
    browser.open(url)
    browser.select_form('form.login_form')
    browser['_username'] = tdserebro_login
    browser['_password'] = tdserebro_password
    browser.submit_selected()
    self.browser = browser

def login(
    self,
    username,  # type: Text
    password,  # type: Text
    customer,  # type: Text
):
    # type: (...) -> Connection
    browser = StatefulBrowser()
    response = browser.open(self.url)
    response.raise_for_status()
    assert 'login' in (browser.get_url() or '').lower()
    browser.select_form('#loginForm')
    browser['UserName'] = username
    browser['Password'] = password
    browser['CustomerIdentifier'] = customer
    browser['Language'] = 'ENG'
    response = browser.submit_selected()
    response.raise_for_status()
    html_response = HtmlResponse.from_response(response)
    if html_response.soup.find(id='loginForm'):
        # Still on the login form, so login must have failed
        raise LoginFailed('Login failed')
    return Connection(browser, self)

def create_browser():
    browser = StatefulBrowser()
    browser.open(f'{main_url}/wp-login.php')
    browser.select_form('form[name="loginform"]')
    browser['log'] = login
    browser['pwd'] = password
    browser.submit_selected()
    return browser

def get_current_usd_to_cny():
    """
    Get the current China mainland bank transfer buying rate for USD to CNY.

    Casting the returned objects to str ensures they do not inadvertently
    contain a BS4 object, which presents pickling difficulties that can be
    hard to debug. If you use newt.db, scrape objects can be tested with
    newt.db's jsonpickle `dumps` function, to ensure that the object can be
    both pickled and serialized to JSON by newt.db and indexed/saved in a
    PostgreSQL jsonb field.

    :return: str: rate in CNY, str: time string
    """
    browser = StatefulBrowser()
    browser.open('http://www.boc.cn/sourcedb/whpj/enindex.html')
    trs = browser.get_current_page().find_all("tr")
    cells = _get_usd_row_cells(trs)
    rate = cells[0].text
    time = cells[5].text
    time = time.split()
    time = time[0] + ' ' + time[1]
    return str(rate), str(time)

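# get_current_usd_to_cny() relies on a helper that is not shown here. A
# minimal sketch, assuming the USD row can be recognised by the text of its
# first cell (the real implementation may differ):
def _get_usd_row_cells(trs):
    for tr in trs:
        cells = tr.find_all("td")
        if cells and cells[0].text.strip() == "USD":
            return cells
    raise ValueError("USD row not found in rate table")
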
def scrape_HTML(url):
    """Scrapes the HTML from W4MPJobs"""
    browser = StatefulBrowser()
    page = browser.open(url)
    form = Form(page.soup.form)
    # Select "all" on the number-of-results radio button
    number_results_data = {"ctl00$MainContent$RadioButtonList2": 9999}
    form.set_radio(number_results_data)
    # Select "NMW or more" on the salary radio button
    salary_data = {"ctl00$MainContent$rblSalary": "nmwormore"}
    form.set_radio(salary_data)
    # Select "outside London" on the location radio button (other options
    # commented out)
    location_data = {"ctl00$MainContent$rblJobs": "outside"}
    # location_data = {"ctl00$MainContent$rblJobs": "inlondon"}
    # location_data = {"ctl00$MainContent$rblJobs": "both"}
    form.set_radio(location_data)
    # Submit the form
    response = browser.submit(form, page.url)
    # Get the response as text
    response = response.text
    # Close the browser
    browser.close()
    return response

def test_is_logged_in(self, requests_mock):
    requests_mock.get(TEST_URL + '/Account/Login',
                      text=(HTML / 'login.html').read_text())
    requests_mock.post(
        TEST_URL + '/Account/Login',
        cookies={'.ASPXAUTH': 'XXX'},
        text=(HTML / 'home.html').read_text(),
    )
    requests_mock.get(
        TEST_URL + '/Account/LogOff',
        cookies={'.ASPXAUTH': None},
        text=(HTML / 'login.html').read_text(),
    )
    browser = StatefulBrowser()
    session = Session(TEST_URL, browser)
    assert not session.is_logged_in
    session.log_in('joe.bloggs', 'abc123')
    # The ``requests-mock`` library currently doesn't mock cookies in sessions properly.
    # In the meantime, mock the cookie by directly setting it on the ``browser`` object.
    # https://github.com/jamielennox/requests-mock/issues/17
    browser.get_cookiejar().set_cookie(
        create_cookie(name='.ASPXAUTH', value='XXX'))
    assert session.is_logged_in
    session.log_out()
    # As above.
    browser.get_cookiejar().set_cookie(
        create_cookie(name='.ASPXAUTH', value=None))
    assert not session.is_logged_in

def retreive_download_url(url, filename=None):
    '''
    Retrieve file URLs from the page body.
    If a filename is given, filter by it.
    '''
    try:
        br = StatefulBrowser()
        response = br.open(url)
        soup = response.soup
        search_tag = soup.find('ul', {'class': 'resource-list'})
        title = soup.find('h1', {'itemprop': 'name'}).text.strip()
        if filename is None:
            urls = many_files(search_tag)
        else:
            urls = single_file(search_tag, filename)
        return title, urls
    except Exception as e:
        raise Exception('Bad URL') from e

def set_browser(self):
    user_agents = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1'
        ' (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50'
        ' (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US)'
        ' AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2'
        ' (KHTML, like Gecko) Chrome/6.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1)'
        ' Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)',
    )
    session = req_session()
    session.headers.update({'Referer': 'https://www.deviantart.com/'})
    if self.mature:
        session.cookies.update({'agegate_state': '1'})
    session.mount('https://', req_adapters.HTTPAdapter(max_retries=3))
    self.browser = StatefulBrowser(session=session,
                                   user_agent=choice(user_agents))

from argparse import Namespace


def creaBrowser(config=Namespace()):
    browser = StatefulBrowser(soup_config={'features': "html.parser"},
                              raise_on_404=True,
                              user_agent="SMparser",
                              )
    if 'verbose' in config:
        browser.set_verbose(config.verbose)
    if 'debug' in config:
        browser.set_debug(config.debug)
    return browser

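# Example: building a browser from an argparse-style Namespace so that
# creaBrowser() picks up the verbosity settings (the values are illustrative):
browser = creaBrowser(Namespace(verbose=2, debug=False))
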
def test_save_logs(self, requests_mock, tmp_path: Path):
    requests_mock.get(TEST_URL + '/Account/Login',
                      text=(HTML / 'login.html').read_text())
    requests_mock.post(
        TEST_URL + '/Account/Login',
        cookies={'.ASPXAUTH': 'XXX'},
        text=(HTML / 'home.html').read_text(),
    )
    requests_mock.get(
        TEST_URL + '/Account/LogOff',
        cookies={'.ASPXAUTH': None},
        text=(HTML / 'login.html').read_text(),
    )
    browser = StatefulBrowser()
    session = Session(TEST_URL, browser)
    session.log_in('joe.bloggs', 'abc123')
    # The ``requests-mock`` library currently doesn't mock cookies in sessions properly.
    # In the meantime, mock the cookie by directly setting it on the ``browser`` object.
    # https://github.com/jamielennox/requests-mock/issues/17
    browser.get_cookiejar().set_cookie(
        create_cookie(name='.ASPXAUTH', value='XXX'))
    session.log_out()
    # As above.
    browser.get_cookiejar().set_cookie(
        create_cookie(name='.ASPXAUTH', value=None))
    session.save_logs(str(tmp_path))
    files = list(sorted(tmp_path.iterdir()))
    assert len(files) == 3
    assert files[0].name.endswith('Z-login-0.txt')
    assert files[1].name.endswith('Z-home-0.txt')
    assert files[2].name.endswith('Z-logout-0.txt')
    with files[0].open() as log:
        assert isinstance(datetime.fromisoformat(next(log).strip()), datetime)
        assert next(log).startswith('GET ' + TEST_URL)
        assert next(log).startswith('200 None')
        assert next(log) == '\n'
    assert files[0].read_text().endswith((HTML / 'login.html').read_text())

def login(self, url, username, password):
    """
    Logs in to the given URL with the given username and password.
    Returns a browser instance that has an active logged-in session.
    Raises an exception if any error occurs or the credentials are wrong.
    """
    logging.info("Logging in with the given credentials")
    try:
        # Create a browser instance
        b = Browser()
        b.open(url)
        b.select_form(nr=0)
        # Enter the username and password
        b["login_user"] = username
        b["password"] = password
        # Log in with the given credentials
        response = b.submit_selected()
        self.verifyCredentials(response.soup)
    except Exception as e:
        logging.error(e)
        raise Exception("Login Failed: Invalid Credentials") from e
    return b

def _get_network_cost(speed):
    br = StatefulBrowser(user_agent='Firefox')
    page = br.open('https://bitcoinfees.21.co/api/v1/fees/recommended')
    response = page.json()
    satoshirate = float(response[speed])
    return satoshirate

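# Usage sketch for _get_network_cost(). To my knowledge the bitcoinfees.21.co
# "recommended" endpoint exposes the keys 'fastestFee', 'halfHourFee' and
# 'hourFee' (satoshis per byte); treat the key names as an assumption:
for speed in ('fastestFee', 'halfHourFee', 'hourFee'):
    print(speed, _get_network_cost(speed))
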
def query_TRILEGAL(RA: float, Dec: float):
    """
    Begins a TRILEGAL query.

    Args:
        RA, Dec: Coordinates of the target.

    Returns:
        output_url (str): URL of the page with query results.
    """
    # fill out and submit the online TRILEGAL form
    browser = StatefulBrowser()
    browser.open("http://stev.oapd.inaf.it/cgi-bin/trilegal_1.6")
    browser.select_form(nr=0)
    browser["gal_coord"] = "2"
    browser["eq_alpha"] = str(RA)
    browser["eq_delta"] = str(Dec)
    browser["field"] = "0.1"
    browser["photsys_file"] = "tab_mag_odfnew/tab_mag_TESS_2mass.dat"
    browser["icm_lim"] = "1"
    browser["mag_lim"] = "21"
    browser["binary_kind"] = "0"
    browser.submit_selected()
    print("TRILEGAL form submitted.")
    sleep(5)
    if len(browser.get_current_page().select("a")) == 0:
        # no results page yet; retry against the older TRILEGAL 1.5 form
        browser = StatefulBrowser()
        browser.open("http://stev.oapd.inaf.it/cgi-bin/trilegal_1.5")
        browser.select_form(nr=0)
        browser["gal_coord"] = "2"
        browser["eq_alpha"] = str(RA)
        browser["eq_delta"] = str(Dec)
        browser["field"] = "0.1"
        browser["photsys_file"] = "tab_mag_odfnew/tab_mag_2mass.dat"
        browser["icm_lim"] = "1"
        browser["mag_lim"] = "21"
        browser["binary_kind"] = "0"
        browser.submit_selected()
        # print("TRILEGAL form submitted.")
        sleep(5)
        if len(browser.get_current_page().select("a")) == 0:
            print("TRILEGAL too busy, using saved stellar populations instead.")
            return None
        else:
            this_page = browser.get_current_page()
            data_link = this_page.select("a")[0].get("href")
            output_url = "http://stev.oapd.inaf.it/" + data_link[3:]
            return output_url
    else:
        this_page = browser.get_current_page()
        data_link = this_page.select("a")[0].get("href")
        output_url = "http://stev.oapd.inaf.it/" + data_link[3:]
        return output_url

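# The two nearly identical form submissions in query_TRILEGAL() could share a
# helper. A sketch (hypothetical function; field names and values copied from
# the snippet above):
def _submit_trilegal_form(version, RA, Dec, photsys_file):
    browser = StatefulBrowser()
    browser.open(f"http://stev.oapd.inaf.it/cgi-bin/trilegal_{version}")
    browser.select_form(nr=0)
    browser["gal_coord"] = "2"
    browser["eq_alpha"] = str(RA)
    browser["eq_delta"] = str(Dec)
    browser["field"] = "0.1"
    browser["photsys_file"] = photsys_file
    browser["icm_lim"] = "1"
    browser["mag_lim"] = "21"
    browser["binary_kind"] = "0"
    browser.submit_selected()
    return browser
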
def scrapeScene(filename, date, url):
    ret = []
    browser = StatefulBrowser(session=None)
    browser.open("https://ifeelmyself.com/public/main.php")
    cookie_obj = create_cookie(name='tags_popup_shown', value='true',
                               domain='ifeelmyself.com')
    browser.session.cookies.set_cookie(cookie_obj)
    if url:
        debugPrint("Url found, using that to scrape")
        browser.open(url)
        response = browser.page
        table = response.find(
            class_=["blog_wide_news_tbl entry ppss-scene", "entry ppss-scene"])
        if table:
            ret = extract_info(table)
    else:
        debugPrint("Analyzing filename...")
        artist_id_match = re.search(r"(f\d{3,5})", filename, re.I)
        if artist_id_match:
            artist_id = artist_id_match.group(0)
            video_id = re.search(r"-(\d+)", filename, re.I).group(1)
            browser.open("https://ifeelmyself.com/public/main.php?page=search")
            browser.select_form()
            browser['keyword'] = artist_id
            browser['view_by'] = "news"
            browser.submit_selected()
            response = browser.page
            debugPrint("Searching for video_id")
            debugPrint(artist_id + "-" + video_id)
            tables = response.find_all(class_=[
                "blog_wide_news_tbl entry ppss-scene", "entry ppss-scene"
            ])
            for table in tables:
                img = str(table.find("img")['src'])
                debugPrint(f"Image:{img}")
                if (f"/{video_id}/{artist_id}-" in img) and img.endswith(
                        ("vg.jpg", "hs.jpg")):
                    debugPrint("Found a single match video!")
                    # Extract data from this single result
                    ret = extract_info(table)
                    break
            else:
                sys.stderr.write("0 matches found! Checking offset")
                pages = int(
                    response.find_all("a", class_="pagging_nonsel")[-1].get_text())
                if pages:
                    for offset in range(10, pages * 10, 10):
                        browser.open(
                            "https://ifeelmyself.com/public/main.php?page=search_results&offset="
                            + str(offset))
                        response = browser.page
                        tables = response.find_all(class_=[
                            "blog_wide_news_tbl entry ppss-scene",
                            "entry ppss-scene"
                        ])
                        for table in tables:
                            # compare against the image source, as above
                            img = str(table.find("img")['src'])
                            debugPrint(f"Image:{img}")
                            if (f"/{video_id}/{artist_id}-" in img) and img.endswith(
                                    ("vg.jpg", "hs.jpg")):
                                ret = extract_info(table)
                                break
                else:
                    sys.stderr.write("0 matches found!, check your filename")
        else:
            debugPrint("Name changed after downloading")
            filename = filename.lower()
            # Python named groups use the (?P<name>...) syntax; the original
            # (?<name>...) form raises re.error
            extract_from_filename = re.match(
                r"^([0-9\.]{6,10})?(?P<title>.+)\s(?P<artist>\w+)(\.mp4)?$",
                filename)
            if extract_from_filename:
                title = extract_from_filename.group('title')
                #if date:
                #    date_dbY = datetime.strptime(date, '%d.%m.%Y').date().strftime('%d %b %Y')
                #    month = datetime.strptime(date, '%d.%m.%Y').date().strftime('%B')
                #    year = datetime.strptime(date, '%d.%m.%Y').date().strftime('%Y')
                #    debugPrint("Date: "+date_dbY)
                if title:
                    title = title.lower().replace("ifeelmyself", "")
                    title = title.replace("-", "")
                    title = title.replace("by", "")
                    debugPrint(f"Title: {title}")
                    browser.open(
                        "https://ifeelmyself.com/public/main.php?page=search")
                    browser.select_form()
                    debugPrint("Searching..")
                    browser['keyword'] = title
                    browser['view_by'] = "news"
                    browser.submit_selected()
                    response = browser.page
                    # Obtain and count the results. Ideally there is only a
                    # single result
                    matches = response.find_all(
                        "a", href='javascript:;'
                    )  # this a href javascript contains all the titles
                    if len(matches) == 1:
                        debugPrint("Found a single match!")
                        table = response.find(class_=[
                            "blog_wide_news_tbl entry ppss-scene",
                            "entry ppss-scene"
                        ])
                    else:
                        if len(matches) == 0:
                            sys.stderr.write("0 matches found! Check filename")
                            print("{}")
                            sys.exit()
                        if len(matches) > 1:
                            debugPrint(
                                "Multiple videos found, maybe refine search term?")
                            index = [
                                i for i, s in enumerate(matches) if title in str(s)
                            ]
                            tables = response.find_all(class_=[
                                "blog_wide_news_tbl entry ppss-scene",
                                "entry ppss-scene"
                            ])
                            table = tables[0]  # take the first
                    if table:
                        ret = extract_info(table)
            else:
                debugPrint("Not a supported filename")
                print("{}")
                sys.exit()
    return ret

class Dagr:
    """deviantArt gallery ripper class"""

    NAME = basename(__file__)
    __version__ = "0.71.3"
    MAX_DEVIATIONS = 1000000  # max deviations
    ART_PATTERN = (r"https://www\.deviantart\.com/"
                   r"[a-zA-Z0-9_-]*/art/[a-zA-Z0-9_-]*")

    def __init__(self):
        # Internals
        self.init_mimetypes()
        self.browser = None
        self.errors_count = dict()

        # Configuration
        self.directory = getcwd() + "/"
        self.mature = False
        self.overwrite = False
        self.reverse = False
        self.test_only = False
        self.verbose = False

        # Current status
        self.deviant = ""

    def init_mimetypes(self):
        mimetypes_init()
        # These MIME types may be missing from some systems
        add_mimetype('image/vnd.adobe.photoshop', '.psd')
        add_mimetype('image/photoshop', '.psd')
        add_mimetype('application/rar', '.rar')
        add_mimetype('application/x-rar-compressed', '.rar')
        add_mimetype('application/x-rar', '.rar')
        add_mimetype('image/x-canon-cr2', '.tif')
        add_mimetype('application/x-7z-compressed', '.7z')
        add_mimetype('application/x-lha', '.lzh')

    def load_configuration(self):
        my_conf = configparser.ConfigParser()
        # Try to read global then local configuration
        my_conf.read([expanduser("~/.config/dagr/dagr_settings.ini"),
                      path_join(getcwd(), "dagr_settings.ini")])
        if my_conf.has_option("DeviantArt", "MatureContent"):
            self.mature = my_conf.getboolean("DeviantArt", "MatureContent")
        if my_conf.has_option("Dagr", "OutputDirectory"):
            self.directory = abspath(
                expanduser(my_conf.get("Dagr", "OutputDirectory"))
            ) + "/"

    def start(self):
        if not self.browser:
            # Set up fake browser
            self.set_browser()

    def set_browser(self):
        user_agents = (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1'
            ' (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50'
            ' (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
            'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US)'
            ' AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2'
            ' (KHTML, like Gecko) Chrome/6.0',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1)'
            ' Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)',
        )
        session = req_session()
        session.headers.update({'Referer': 'https://www.deviantart.com/'})
        if self.mature:
            session.cookies.update({'agegate_state': '1'})
        session.mount('https://', req_adapters.HTTPAdapter(max_retries=3))
        self.browser = StatefulBrowser(session=session,
                                       user_agent=choice(user_agents))

    def get(self, url, file_name=None):
        if (file_name and not self.overwrite and glob(file_name + ".*")):
            print(glob(file_name + ".*")[0] + " exists - skipping")
            return None
        if isinstance(url, Tag):
            # Download and save soup links
            get_resp = self.browser.download_link(url, file_name)
        else:
            # Direct URL
            get_resp = self.browser.session.get(url)
            if file_name:
                with open(file_name, "wb") as local_file:
                    local_file.write(get_resp.content)
        if get_resp.status_code != req_codes.ok:
            raise DagrException("incorrect status code - " +
                                str(get_resp.status_code))
        if file_name is None:
            return get_resp.text
        if get_resp.headers.get("last-modified"):
            # Set file dates to last modified time
            mod_time = mktime(parsedate(get_resp.headers.get("last-modified")))
            utime(file_name, (mod_time, mod_time))
        if get_resp.headers.get("content-type"):
            content_type = get_resp.headers.get("content-type").split(";")[0]
            file_ext = guess_extension(content_type)
            if file_ext:
                rename(file_name, file_name + file_ext)
            else:
                raise DagrException('unknown content-type - ' + content_type)
        return file_name

    def find_link(self, link):
        filelink = None
        filename = basename(link)
        mature_error = False
        self.browser.open(link)
        # Full image link (via download link)
        link_text = re.compile("Download( (Image|File))?")
        img_link = None
        for candidate in self.browser.links("a"):
            if link_text.search(candidate.text) and candidate.get("href"):
                img_link = candidate
                break
        if img_link and img_link.get("data-download_url"):
            return (filename, img_link)
        if self.verbose:
            print("Download link not found, falling back to direct image")
        current_page = self.browser.get_current_page()
        # Fallback 1: try meta (filtering blocked meta)
        filesearch = current_page.find("meta", {"property": "og:image"})
        if filesearch:
            filelink = filesearch['content']
            if basename(filelink).startswith("noentrythumb-"):
                filelink = None
                mature_error = True
        if not filelink:
            # Fallback 2: try collect_rid, full
            filesearch = current_page.find("img", {"collect_rid": True,
                                                   "class": re.compile(".*full")})
            if not filesearch:
                # Fallback 3: try collect_rid, normal
                filesearch = current_page.find("img", {"collect_rid": True,
                                                       "class": re.compile(".*normal")})
            if filesearch:
                filelink = filesearch['src']
            if current_page.find("span",
                                 {"itemprop": "title"}).text == "Literature":
                filelink = self.browser.get_url()
                return (filename, filelink)
        if not filelink:
            if mature_error:
                if self.mature:
                    raise DagrException("maybe not an image")
                else:
                    raise DagrException("maybe a mature deviation/"
                                        "not an image")
            else:
                raise DagrException("all attempts to find a link failed")
        return (filename, filelink)

    def handle_download_error(self, link, link_error):
        error_string = str(link_error)
        print("Download error (" + link + ") : " + error_string)
        if error_string in self.errors_count:
            self.errors_count[error_string] += 1
        else:
            self.errors_count[error_string] = 1

    def get_pages(self, mode, base_url):
        pages = []
        for i in range(0, int(Dagr.MAX_DEVIATIONS / 24), 24):
            html = ""
            url = base_url + str(i)
            try:
                html = self.get(url)
            except DagrException:
                print("Could not find " + self.deviant + "'s " + mode)
                return pages
            prelim = re.findall(Dagr.ART_PATTERN, html,
                                re.IGNORECASE | re.DOTALL)
            for match in prelim:
                if match not in pages:
                    pages.append(match)
            done = re.findall("(This section has no deviations yet!|"
                              "This collection has no items yet!)",
                              html, re.IGNORECASE | re.S)
            if done:
                break
            print(self.deviant + "'s " + mode + " page " +
                  str(int((i / 24) + 1)) + " crawled...")
        if not self.reverse:
            pages.reverse()
        return pages

    def get_images(self, mode, mode_arg, pages):
        base_dir = self.directory + self.deviant + "/" + mode
        if mode_arg:
            base_dir += "/" + mode_arg
        try:
            da_make_dirs(base_dir)
        except OSError as mkdir_error:
            print(str(mkdir_error))
            return
        # Find previously downloaded pages
        existing_pages = []
        try:
            with open(base_dir + "/.dagr_downloaded_pages", "r") as filehandle:
                existing_pages = json.load(filehandle)
        except FNF_ERROR:
            # May not exist (new directory, ...)
            pass
        if not self.overwrite:
            pages = [x for x in pages if x not in existing_pages]
        print("Total deviations to download: " + str(len(pages)))
        for count, link in enumerate(pages, start=1):
            if self.verbose:
                print("Downloading " + str(count) + " of " +
                      str(len(pages)) + " ( " + link + " )")
            filename = ""
            filelink = ""
            try:
                filename, filelink = self.find_link(link)
            except (KeyboardInterrupt, SystemExit):
                raise
            except DagrException as link_error:
                self.handle_download_error(link, link_error)
                continue
            if not self.test_only:
                try:
                    self.get(filelink, base_dir + "/" + filename)
                except DagrException as get_error:
                    self.handle_download_error(link, get_error)
                    continue
                else:
                    if link not in existing_pages:
                        existing_pages.append(link)
            else:
                print(filelink)
        # Update downloaded pages cache
        with open(base_dir + "/.dagr_downloaded_pages", "w") as filehandle:
            json.dump(existing_pages, filehandle)

    def deviant_get(self, mode, mode_arg=None):
        print("Ripping " + self.deviant + "'s " + mode + "...")
        base_url = "https://www.deviantart.com/" + self.deviant.lower() + "/"
        if mode == "favs":
            base_url += "favourites/?catpath=/&offset="
        elif mode == "collection":
            base_url += "favourites/" + mode_arg + "?offset="
        elif mode == "scraps":
            base_url += "gallery/?catpath=scraps&offset="
        elif mode == "gallery":
            base_url += "gallery/?catpath=/&offset="
        elif mode == "album":
            base_url += "gallery/" + mode_arg + "?offset="
        elif mode == "query":
            base_url += "gallery/?q=" + mode_arg + "&offset="
        elif mode == "category":
            base_url += "gallery/?catpath=" + mode_arg + "&offset="
        pages = self.get_pages(mode, base_url)
        if not pages:
            print(self.deviant + "'s " + mode + " had no deviations.")
            return
        print("Total deviations in " + self.deviant + "'s " + mode +
              " found: " + str(len(pages)))
        self.get_images(mode, mode_arg, pages)
        print(self.deviant + "'s " + mode + " successfully ripped.")

    def group_get(self, mode):
        print("Ripping " + self.deviant + "'s " + mode + "...")
        base_url = 'https://www.deviantart.com/' + self.deviant.lower() + '/'
        if mode == "favs":
            base_url += "favourites/"
        elif mode == "gallery":
            base_url += "gallery/"
        folders = []
        i = 0
        while True:
            html = self.get(base_url + '?offset=' + str(i))
            k = re.findall('class="ch-top" href="' + base_url +
                           '([0-9]*/[a-zA-Z0-9_-]*)"', html, re.IGNORECASE)
            if k == []:
                break
            new_folder = False
            for match in k:
                if match not in folders:
                    folders.append(match)
                    new_folder = True
            if not new_folder:
                break
            i += 10
        # no repeats
        folders = list(set(folders))
        if not folders:
            print(self.deviant + "'s " + mode + " is empty.")
        print("Total folders in " + self.deviant + "'s " + mode +
              " found: " + str(len(folders)))
        if self.reverse:
            folders.reverse()
        pages = []
        for folder in folders:
            label = folder.split("/")[-1]
            print("Crawling folder " + label + "...")
            pages = self.get_pages(mode, base_url + folder + '?offset=')
            if not self.reverse:
                pages.reverse()
            self.get_images(mode, label, pages)
        print(self.deviant + "'s " + mode + " successfully ripped.")

    def print_errors(self):
        if self.errors_count:
            print("Download errors count:")
            for error in self.errors_count:
                print("* " + error + " : " + str(self.errors_count[error]))

class UserScraper(object):
    """
    Scrapes fakeaddressgenerator.com for fake user data.

    It also adds some basic additional information for server configuration.
    """

    attributes = [
        'Full Name',
        'Street',
        'City',
        'State Full',
        'Zip Code',
        'Phone Number',
        'Company',
        'Username'
    ]

    pages = {
        'NL': 'http://www.fakeaddressgenerator.com/World/Netherlands_address_generator',
        'US': 'http://www.fakeaddressgenerator.com/World/us_address_generator',
        'UK': 'http://www.fakeaddressgenerator.com/World/uk_address_generator',
        'CA': 'http://www.fakeaddressgenerator.com/World/ca_address_generator',
    }

    def __init__(self, country='NL'):
        self.country_code = country
        self.browser = StatefulBrowser()
        self.page = UserScraper.pages.get(country)

    def get_user(self):
        self.browser.open(self.page)
        attrs = {}
        for attr in self.attributes:
            attrs[attr] = self._get_attribute(attr)
        attrs['country_code'] = self.country_code
        attrs['password'] = ''.join(
            random.choice(string.ascii_letters + string.digits)
            for _ in range(12))
        attrs['email'] = 'authentic8989+' + attrs['Username'] + '@gmail.com'
        attrs['rootpw'] = attrs['password']
        attrs['ns1'] = 'ns1'
        attrs['ns2'] = 'ns2'
        attrs['hostname'] = attrs['Username'] + '.hostname.com'
        attrs['testnet'] = 'off'
        return self._map_to_config(attrs)

    @staticmethod
    def _map_to_config(attrs):
        config = {}
        # Treat the full name separately because it needs to be split
        if 'Full Name' in attrs:
            config['user'] = {}
            config['user']['firstname'] = attrs['Full Name'].split('\xa0')[0]
            config['user']['lastname'] = attrs['Full Name'].split('\xa0')[-1]
        # Map the possible user attributes to their config names and sections
        mapping = {
            'Street': ('address', 'address'),
            'City': ('address', 'city'),
            'State Full': ('address', 'state'),
            'Zip Code': ('address', 'zipcode'),
            'Phone Number': ('user', 'phonenumber'),
            'Company': ('user', 'companyname'),
            'Username': ('user', 'username'),
            'country_code': ('address', 'countrycode'),
            'password': ('user', 'password'),
            'email': ('user', 'email'),
            'rootpw': ('server', 'root_password'),
            'ns1': ('server', 'ns1'),
            'ns2': ('server', 'ns2'),
            'hostname': ('server', 'hostname'),
            'testnet': ('user', 'testnet')
        }
        for attr in attrs.keys():
            if attr in mapping.keys():
                section, key = mapping[attr]
                if section not in config:
                    config[section] = {}
                config[section][key] = attrs[attr]
        return config

    def _get_attribute(self, attribute):
        return self.browser.get_current_page() \
            .find(string=attribute) \
            .parent.parent.parent \
            .find('input') \
            .get('value')

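# Usage sketch for UserScraper: scrape one fake user and read back a few of
# the generated config values (the country code comes from the `pages`
# mapping above; the printed keys follow the `mapping` table):
scraper = UserScraper(country='US')
config = scraper.get_user()
print(config['user']['username'], config['user']['email'])
print(config['address']['city'], config['address']['zipcode'])
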
class Session:
    BASE_URL = 'https://m.facebook.com'

    def __init__(self, browser_wrapper):
        self._connected = False
        self._current_html = None
        self._browser_wrapper = browser_wrapper
        self._browser = StatefulBrowser()
        # Set default headers on the underlying requests session; assigning
        # to a non-existent ``addHeaders`` attribute (as the original did)
        # has no effect in MechanicalSoup
        self._browser.session.headers.update({
            'User-Agent': 'Firefox',
            'Accept-Language': 'en-US,en;q=0.5',
        })

    def __del__(self):
        self._dispose()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._dispose()

    @property
    def connected(self):
        return self._connected

    def log_in(self, username, password):
        try:
            # Logging in to the non-mobile site is more reliable
            self._browser_wrapper.open(self._browser, 'https://www.facebook.com')
            self._browser.select_form('form[id="login_form"]')
            self._browser['email'] = username
            self._browser['pass'] = password
            self._browser_wrapper.submit_selected(self._browser)
            # Check that we really are on the account profile page
            if self._browser.get_current_page().find('form', action='/search/top/'):
                self._connected = True
        except Exception:
            raise LogInError(f'Unable to log in as {username}')
        return self

    def log_out(self):
        if self._connected:
            self._browser.close()
            self._connected = False

    def profile_info(self, id_):
        """Retrieve information for a given profile."""
        self._ensure_connected()
        try:
            self._browser_wrapper.open(self._browser, f'{Session.BASE_URL}/{id_}')
            name = self._sanitize_title(
                self._browser.get_current_page().find('title').text)
            image = parse_image(self._browser.get_current_page(), name)
            info = parse_info(self._browser.get_current_page())
            return name, image, info
        except Exception:
            return None

    def search(self, query):
        """
        Execute a search for the given text, returning a tuple with
        ID, description and URI.
        """
        url_query = '+'.join(query.split())
        # url_path already embeds the query; the original appended url_query
        # a second time
        url_path = f'/search/top/?q={url_query}' \
            if self._connected else f'/public/{url_query}'
        try:
            self._browser_wrapper.open(
                self._browser, f'{Session.BASE_URL}{url_path}')
            return parse_search(self._browser.get_current_page(),
                                Session.BASE_URL)
        except Exception:
            return None

    def _ensure_connected(self):
        if not self._connected:
            raise NotConnectedError('No active connection or required login')

    def _sanitize_title(self, title):
        # Handle cases like 'Some One - Home'
        if '-' in title:
            return title.split('-')[0].strip()
        return title

    def _dispose(self):
        if self._connected:
            self.log_out()

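# Usage sketch for Session. The wrapper class below is an assumption: a
# minimal collaborator matching the two calls Session makes on
# ``browser_wrapper`` (open(browser, url) and submit_selected(browser));
# the credentials are placeholders.
class SimpleBrowserWrapper:
    def open(self, browser, url):
        browser.open(url)

    def submit_selected(self, browser):
        browser.submit_selected()


with Session(SimpleBrowserWrapper()) as session:
    session.log_in('user@example.com', 'secret')
    print(session.connected)
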
from mechanicalsoup import StatefulBrowser

home_url = "http://testing.chandrashekar.info/"
username = "******"
password = "******"

login_url = "http://testing.chandrashekar.info/wp-login.php"
logged_in_url = "http://testing.chandrashekar.info/wp-admin/"
add_new_post_url = "http://testing.chandrashekar.info/wp-admin/post-new.php"

browser = StatefulBrowser()
browser.open(login_url)
assert browser.get_url() == login_url

browser.select_form()
browser["log"] = username
browser["pwd"] = password
browser.submit_selected()
assert browser.get_url() == logged_in_url
print(browser.get_url())

browser.follow_link("post-new.php")
assert browser.get_url() == add_new_post_url
print(browser.get_url())