def html_parsing():
    """Fetch the Debian stable release page and print several lxml lookups.

    The function is a walkthrough of ``find``/``findall``/``xpath`` on an
    lxml element tree; it returns nothing and only writes to stdout.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # An explicit timeout keeps a stalled server from blocking forever;
    # requests applies no timeout unless one is passed.
    response = requests.get('https://www.debian.org/releases/stable', timeout=30)
    # Fail fast on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()

    root = HTML(response.content)

    # Tags of the root element's direct children.
    print([e.tag for e in root])
    # Text of <title> inside <head>.
    print(root.find('head').find('title').text)
    # Text of the first <p> child of the second <div> child of <body>.
    print(root.find('body').findall('div')[1].find('p').text)

    # XPath: paths relative to the root element.
    print(root.xpath('body'))
    print(root.xpath('body/div'))
    # XPath: every <h1> anywhere in the document.
    print(root.xpath('//h1'))
    # XPath: <h1> descendants of <head> only.
    print(root.find('head').xpath('.//h1'))
    # XPath predicates: by attribute value, by child element, by position.
    print(root.xpath('//div[@id="content"]'))
    print(root.xpath('//div[h1]'))
    print(root.xpath('body/div[2]'))
def edit_message(base_url, username, password, message_id, new_body):
    """Replace the body of an existing message via the forum's ``misc.php``.

    Logs in through ``_utils.login_and_go_to_faq``, reads the
    ``securitytoken`` input from ``faq.php`` and posts a
    ``vsacb_editmessage`` request carrying the new body.

    Args:
        base_url: Base URL that ``faq.php`` and ``misc.php`` are joined to.
        username: Passed through to the login helper.
        password: Passed through to the login helper.
        message_id: Value sent as the ``id`` field of the edit request.
        new_body: New message text; run through ``encode_outgoing_message``
            before being placed into the request.
    """
    opener = _utils.login_and_go_to_faq(base_url, username, password)

    faq_page_url = urljoin(base_url, "faq.php")
    edit_endpoint = urljoin(base_url, "misc.php")

    # The FAQ page is fetched only for the security token it embeds
    # (the original author chose it as a page with low backend complexity).
    print("fetching security token")
    faq_tree = HTML(opener.open(faq_page_url).read())
    security_token = faq_tree.find(".//input[@name='securitytoken']").attrib["value"]

    # Assemble the form body, then encode it with the module-level encoding.
    body_template = "do=vsacb_editmessage&s=&securitytoken={0}&id={1}&vsacb_editmessage={2}"
    encoded_body = encode_outgoing_message(new_body)
    payload = body_template.format(security_token, message_id, encoded_body)
    payload_bytes = payload.encode(server_encoding)

    print("updating message")
    # The response is read in full and its content discarded.
    opener.open(edit_endpoint, data=payload_bytes).read()

    print("done")
def link_tag_url(html):
    '''
    returns the href of the first <link rel="shortcut icon"> tag found in
    an HTML document, e.g. for

        <link rel="shortcut icon" href="images-template/favicon.ico" type="image/x-icon" />

    the result is "images-template/favicon.ico"; returns None when there is
    no such tag or its href is missing or empty
    '''
    from lxml.etree import HTML

    icon_link = HTML(html).find('.//link[@rel="shortcut icon"]')
    if icon_link is None:
        return None

    href = icon_link.get('href', '')
    # An empty href is treated the same as a missing tag.
    return href if href else None
def fake(base_url, username, password, game_id, time, score, game_name=None):
    """Open an arcade game page, wait, then submit a score for that game.

    Args:
        base_url: Base URL the arcade endpoints are joined to.
        username: Passed through to ``_utils.login_and_enter_arcade``.
        password: Passed through to ``_utils.login_and_enter_arcade``.
        game_id: Value of the ``gameid`` query parameter of the play page.
        time: Passed to ``sleep`` between loading the game and submitting
            the score (presumably seconds -- confirm which ``sleep`` is
            imported).
        score: Sent as ``gscore``.
        game_name: Sent as ``gname``. When None it is taken from the
            ``gamename=`` entry of the Flash embed's ``flashvars`` on the
            play page.

    Returns None in every case; prints a message and returns early when the
    game name cannot be determined.
    """
    opener = _utils.login_and_enter_arcade(base_url, username, password)

    play_url = urljoin(base_url, "arcade.php?do=play&gameid={0}".format(game_id))
    submit_url = urljoin(base_url, "index.php?act=Arcade&do=newscore")

    # The play page is always requested and parsed, even if the name is known.
    print("playing the game")
    game_page = HTML(opener.open(play_url).read())

    if game_name is None:
        embed = game_page.find(".//embed[@type='application/x-shockwave-flash']")
        if embed is None:
            print("didn't find the flash plugin on the game page :'-(")
            return

        prefix = "gamename="
        names = [
            entry[len(prefix):]
            for entry in embed.attrib['flashvars'].split("&")
            if entry.startswith(prefix)
        ]
        if not names:
            print("game name not found :'-(")
            return
        # If several entries match, the last one is used.
        game_name = names[-1]

    print("waiting")
    sleep(time)

    form_fields = {"gscore": score, "gname": game_name}
    form_data = _utils.encode_post_data(form_fields)
    print("submitting fake score")
    # The response is read in full and its content discarded.
    opener.open(submit_url, data=form_data).read()

    print("done")
# screen scraping
# Prints the codename and version of the current Debian stable release,
# both read from https://www.debian.org/releases/stable.
import re

import requests
from lxml.etree import HTML

# An explicit timeout keeps a stalled server from blocking forever;
# requests applies no timeout unless one is passed.
response = requests.get('https://www.debian.org/releases/stable', timeout=30)
# Fail fast on 4xx/5xx instead of scraping an error page.
response.raise_for_status()

root = HTML(response.content)

# The codename is whatever the page title puts between curly double quotes
# (U+201C ... U+201D).
title_text = root.find('head').find('title').text
release = re.search('\u201c(.*)\u201d', title_text).group(1)

# The version is the second whitespace-separated word of the first <p>
# inside <div id="content">.
p_text = root.xpath('//div[@id="content"]/p[1]')[0].text
version = p_text.split()[1]

print('Codename: {}\nVersion: {}\n'.format(release, version))
def fake(base_url, username, password, game_id, time, score, tourney_id, game_name=None,
         rung=None, face_off=None):
    """Walk through a tournament's pages, wait, then submit a score.

    Requests the tournament list, the tournament itself and the tournament
    game page in that order before posting the score.

    Args:
        base_url: Base URL the arcade endpoints are joined to.
        username: Passed through to ``_utils.login_and_enter_arcade``.
        password: Passed through to ``_utils.login_and_enter_arcade``.
        game_id: Value of the ``gameid`` query parameter of the play page.
        time: Passed to ``sleep`` between loading the game and submitting
            the score (presumably seconds -- confirm which ``sleep`` is
            imported).
        score: Sent as ``gscore``.
        tourney_id: Value of the ``tid`` query parameter.
        game_name: Sent as ``gname``. When None it is taken from the
            ``gamename=`` entry of the Flash embed's ``flashvars`` on the
            play page.
        rung: When not None, appended to the play URL as ``&rung=``.
        face_off: When not None, appended to the play URL as ``&faceoff=``.

    Returns None in every case; prints a message and returns early when the
    game name cannot be determined.
    """
    opener = _utils.login_and_enter_arcade(base_url, username, password)

    list_url = urljoin(base_url, "arcade.php?&do=viewtournaments")
    tourney_url = urljoin(
        base_url,
        "arcade.php?&act=Arcade&do=viewtourney&tid={0}".format(tourney_id),
    )

    # Optional query-string fragments; each is empty when its argument is None.
    rung_part = "" if rung is None else "&rung={0}".format(rung)
    face_off_part = "" if face_off is None else "&faceoff={0}".format(face_off)
    play_query = "arcade.php?&do=playtourney&gameid={0}&tid={1}{2}{3}".format(
        game_id, tourney_id, rung_part, face_off_part
    )
    play_url = urljoin(base_url, play_query)
    submit_url = urljoin(base_url, "index.php?act=Arcade&do=newscore")

    # These two pages are fetched and their content discarded.
    print("entering tourneys page")
    opener.open(list_url).read()

    print("looking at the tourney")
    opener.open(tourney_url).read()

    # The play page is always requested and parsed, even if the name is known.
    print("playing the game")
    game_page = HTML(opener.open(play_url).read())

    if game_name is None:
        embed = game_page.find(".//embed[@type='application/x-shockwave-flash']")
        if embed is None:
            print("didn't find the flash plugin on the game page :'-(")
            return

        prefix = "gamename="
        names = [
            entry[len(prefix):]
            for entry in embed.attrib['flashvars'].split("&")
            if entry.startswith(prefix)
        ]
        if not names:
            print("game name not found :'-(")
            return
        # If several entries match, the last one is used.
        game_name = names[-1]

    print("waiting")
    sleep(time)

    form_fields = {"gscore": score, "gname": game_name}
    form_data = _utils.encode_post_data(form_fields)
    print("submitting fake score")
    # The response is read in full and its content discarded.
    opener.open(submit_url, data=form_data).read()

    print("done")
#!/usr/bin/env python3
# Prints the codename and version of the current Debian stable release,
# both read from http://www.debian.org/releases/stable/.
import re

import requests
from lxml.etree import HTML

# An explicit timeout keeps a stalled server from blocking forever;
# requests applies no timeout unless one is passed.
response = requests.get('http://www.debian.org/releases/stable/', timeout=30)
# Fail fast on 4xx/5xx instead of scraping an error page.
response.raise_for_status()

root = HTML(response.content)

# The codename is whatever the page title puts between curly double quotes
# (U+201C ... U+201D).
title_text = root.find('head').find('title').text
release = re.search('\u201c(.*)\u201d', title_text).group(1)

# The version is the second whitespace-separated word of the first <p>
# inside <div id="content">.
p_text = root.xpath('//div[@id="content"]/p[1]')[0].text
version = p_text.split()[1]

print('Codename: {}\nVersion: {}'.format(release, version))
# Prints the codename and version of the current Debian stable release,
# both read from http://www.debian.org/releases/stable/.
import re

import requests
from lxml.etree import HTML

# An explicit timeout keeps a stalled server from blocking forever;
# requests applies no timeout unless one is passed.
response = requests.get("http://www.debian.org/releases/stable/", timeout=30)
# Fail fast on 4xx/5xx instead of scraping an error page.
response.raise_for_status()

root = HTML(response.content)

# The codename is whatever the page title puts between curly double quotes
# (U+201C ... U+201D).
title_text = root.find("head").find("title").text
release = re.search("\u201c(.*)\u201d", title_text).group(1)

# The version is the second whitespace-separated word of the first <p>
# inside <div id="content">.
p_text = root.xpath("//div[@id='content']/p[1]")[0].text
version = p_text.split()[1]

print("Codename:{}\nVersion: {}".format(release, version))