def grabSP(url):
    """Scrape song/artist pairs from a Spotify playlist page and convert each track."""
    spotifyPage = uReq(url).read()
    spotifySoup = soupy(spotifyPage, 'html.parser')
    # Playlist name, can be used somewhere:
    # playlistName = spotifySoup.head.title.get_text()
    trackDetailsRaw = spotifySoup.findAll('div', {'class': 'tracklist-col name'})
    for track in trackDetailsRaw:
        trackDetails = track.findAll('span', {'dir': 'auto'})
        # The first span holds the song name, the second the artist;
        # get_text() replaces the original brittle string slicing
        if len(trackDetails) < 2:
            continue
        songDetail = trackDetails[0].get_text()
        artistDetail = trackDetails[1].get_text()
        global trackFinalName
        trackFinalName = songDetail + '-' + artistDetail
        convertName(trackFinalName)
def gettweets(queryURL):
    """Scroll a Twitter search results page and yield CSV-formatted tweet rows."""
    firefox.get(queryURL)
    # Scroll down repeatedly so Twitter lazy-loads more results
    for i in range(SEARCH_DEPTH):
        firefox.execute_script("window.scrollTo(0, 1000000)")
    soup = soupy(firefox.page_source)
    alltweets = soup.findAll('p', attrs={'class': 'tweet-text'})
    print(len(alltweets))
    allcreated = soup.findAll('span', attrs={'class': '_timestamp'})
    print(len(allcreated))
    allurls = soup.findAll('span', attrs={'class': 'js-display-url'})
    print(len(allurls))
    j = 0
    for i, tweet in enumerate(alltweets):
        # Attach the next display URL only if it actually appears in this tweet
        if j < len(allurls) and allurls[j].text in tweet.text:
            url = allurls[j].text.lower()
            j += 1
        else:
            url = ""
        # Collapse commas and whitespace so each row stays valid CSV
        tweet = re.sub('[,\t\n ]+', ' ', tweet.text.lower())
        created = datetime.datetime.fromtimestamp(
            int(allcreated[i]['data-time-ms']) / 1000.0)
        tweet = tweet + "," + url + "," + created.strftime("%Y-%m-%d,%H") + "," + hashtag
        yield tweet.encode("utf-8")
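# A minimal setup sketch for gettweets(): the function relies on a module-level
# Selenium driver plus SEARCH_DEPTH and hashtag globals that are not defined in
# the snippet above. The values below are assumptions for illustration.
import re
import datetime
from bs4 import BeautifulSoup as soupy
from selenium import webdriver

firefox = webdriver.Firefox()  # driver that gettweets() scrolls and scrapes
SEARCH_DEPTH = 10              # assumed: number of scrolls to trigger lazy-loading
hashtag = "#example"           # assumed: label appended to every CSV row

for row in gettweets("https://twitter.com/search?q=%23example"):
    print(row)
firefox.quit()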
def caslogin(session, caslogin, username, password, service=None):
    """Log in to a CAS endpoint with the given requests session; return True on success."""
    if service:
        params = {'service': service}
    else:
        params = None
    # Use pyOpenSSL for TLS (needed on older Python builds for SNI support)
    urllib3.contrib.pyopenssl.inject_into_urllib3()
    cas_page = session.get(caslogin, params=params)
    # Move past any redirects
    caslogin = cas_page.url
    cas_doc = soupy(cas_page.text)
    form_inputs = cas_doc.find_all(login_elements)
    # Carry over the hidden form fields the CAS server expects back
    login_data = dict()
    for tag in form_inputs:
        login_data[tag['name']] = tag['value']
    login_data['username'] = username
    login_data['password'] = password
    signin_page = session.post(caslogin, login_data,
                               cookies=cas_page.cookies, params=params)
    if signin_page.status_code != requests.codes.ok:
        print(signin_page.headers)
        print(signin_page.cookies)
        print(signin_page.text)
    return signin_page.status_code == requests.codes.ok
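# Both caslogin() variants pass an undefined login_elements filter to find_all().
# A plausible definition, assuming the CAS form's hidden fields are <input> tags
# carrying both name and value attributes (a sketch, not the original helper):
def login_elements(tag):
    # Match <input> tags with both attributes, i.e. the hidden CAS
    # form fields (lt, execution, _eventId, ...) we must echo back
    return (tag.name == 'input'
            and tag.has_attr('name')
            and tag.has_attr('value'))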
def get_url(browser):
    """Collect the enhanced-resolution image URL for each Mars hemisphere."""
    hemis_search_list = [
        'Cerberus Hemisphere Enhanced',
        'Schiaparelli Hemisphere Enhanced',
        'Syrtis Major Hemisphere Enhanced',
        'Valles Marineris Hemisphere Enhanced'
    ]
    names_n_url = []
    Hemisphere = "Hemisphere"
    Urlid = "URL"
    for name in hemis_search_list:
        url = ('https://astrogeology.usgs.gov/search/results'
               '?q=hemisphere+enhanced&k1=target&v1=Mars')
        browser.visit(url)
        try:
            browser.is_element_present_by_text(name, wait_time=2)
            hemi_click = browser.links.find_by_partial_text(name)
            hemi_click.click()
            parse_html = browser.html
            hemi_parse_html = soupy(parse_html, 'html.parser')
            hemi_img_url = hemi_parse_html.select_one('ul li a').get("href")
            names_n_url.append({Hemisphere: name, Urlid: hemi_img_url})
        except IndexError:
            return "Search result not found"
        except AttributeError:
            return None
    # Optional: load the results into a DataFrame for display
    # df_hemi_urls = pd.DataFrame.from_dict(names_n_url, orient='columns')
    # df_hemi_urls.set_index('Hemisphere', inplace=True)
    # df_hemi_urls['URL'] = str(df_hemi_urls['URL'])
    # pd.set_option('display.max_colwidth', -1)
    return names_n_url
def getTweetPhoto(handle):
    """Take a Twitter handle and return the latest tweet photo URL from that profile."""
    url = 'https://twitter.com/' + handle
    html = urllib.request.urlopen(url).read()
    soup = soupy(html, features="html.parser")
    x = soup.findAll("img", {"alt": True, "src": True})
    # The first few <img> tags are profile/banner images; the fourth is the
    # tweet photo, so we need at least four matches (not three as before)
    if len(x) > 3:
        img_url = x[3]['src']
        return img_url
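# Imports that getTweetPhoto() assumes, plus a hypothetical call for illustration:
import urllib.request
from bs4 import BeautifulSoup as soupy

print(getTweetPhoto('nasa'))  # 'nasa' is just an example handle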
def ip_up(url):
    """Read a C&C address published as a quoted string in a Twitter description."""
    html = urllib.request.urlopen(url).read()
    soup = soupy(html, "html.parser")
    x = soup.find("meta", {"name": "description"})['content']
    # After parsing the html page, our tweet is located between double quotations;
    # a regular expression filters it out ('matches' avoids shadowing filter())
    matches = re.findall(r'"(.*?)"', x)
    tweet = matches[0]
    new_cc = tweet
    print("New C&C control from twitter! : " + new_cc)
    return new_cc
def convertName(title):
    """Turn a 'song-artist' string into a search query, find a YouTube result, and download its audio."""
    # Drop characters that break the query URL, then percent-encode spaces.
    # str.replace() avoids the original bug of deleting list items while
    # iterating over their indices.
    title = title.replace('(', '').replace(')', '').replace('-', '')
    title = title.replace(' ', '%20')
    duckurl = 'https://duckduckgo.com/html?q=' + title + '%20audio%20youtube'
    duckData = uReq(duckurl).read()
    duckSoup = soupy(duckData, 'html.parser')
    youtubeLink = str(duckSoup.findAll('a', {'class': 'result__url'}, limit=1))
    # Crude slice that keeps only the watch URL from the anchor markup
    startIndex = youtubeLink.find(r'www.youtube.com/watch?v=')
    youtubeLinkCut = youtubeLink[startIndex:-24]
    system(r"youtube-dl -f 'bestaudio[ext=m4a]' " + youtubeLinkCut)
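# grabSP() and convertName() share a few aliases that are not defined in the
# snippets above. A likely set of imports, assuming uReq is the usual urlopen
# alias (an assumption, not confirmed by the source):
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soupy
from os import system

grabSP('https://open.spotify.com/playlist/<playlist id>')  # hypothetical call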
def mars_news(browser):
    """Scrape the latest headline and teaser from the Mars NASA news site.

    The browser instance is created by the caller and passed in.
    """
    # Visit the mars nasa news site
    nasa_url = 'https://mars.nasa.gov/news/'
    browser.visit(nasa_url)
    # Optional delay so the article list has time to load
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)
    # Convert the browser html to a soup object
    parse_html = browser.html
    news_soup = soupy(parse_html, 'html.parser')
    try:
        # Error handling, especially for AttributeErrors: on failure the
        # except clause returns (None, None) instead of crashing.
        # Parent element that holds the title and teaser we filter for below.
        slide_elem = news_soup.select_one('ul.item_list li.slide')
        # Use the parent element to find the first a tag and save it as `news_title`
        news_title = slide_elem.find('div', class_='content_title').get_text()
        # news_date = slide_elem.find('div', class_='list_date').get_text()
        # latest_art_link = f"https://mars.nasa.gov{slide_elem.select_one('ul li a').get('href')}"
        # Use the parent element to find the paragraph text
        news_teaser_sum = slide_elem.find('div', class_='article_teaser_body').get_text()
    except AttributeError:
        return None, None
    # return news_title, news_teaser_sum, news_date, latest_art_link
    return news_title, news_teaser_sum
def caslogin(session, caslogin, username, password, service=None):
    """Variant of caslogin that logs failures instead of printing them."""
    if service:
        params = {'service': service}
    else:
        params = None
    cas_page = session.get(caslogin, params=params)
    # Move past any redirects
    caslogin = cas_page.url
    cas_doc = soupy(cas_page.text)
    form_inputs = cas_doc.find_all(login_elements)
    login_data = dict()
    for tag in form_inputs:
        login_data[tag['name']] = tag['value']
    login_data['username'] = username
    login_data['password'] = password
    signin_page = session.post(caslogin, login_data,
                               cookies=cas_page.cookies, params=params)
    if signin_page.status_code != requests.codes.ok:  # pylint: disable=no-member
        # logging.warning replaces the deprecated logging.warn alias
        logging.warning("ERROR on CAS signin headers %s cookies %s text %s",
                        signin_page.headers, signin_page.cookies, signin_page.text)
    return signin_page.status_code == requests.codes.ok  # pylint: disable=no-member
def featured_image(browser):
    """Return the absolute URL of JPL's current featured Mars image."""
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    # Find and click the full image button
    full_image_elem = browser.find_by_id('full_image')
    full_image_elem.click()
    # Find the more info button and click that
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.links.find_by_partial_text('more info')
    more_info_elem.click()
    # Parse the resulting html with soup
    parse_html = browser.html
    full_img_soup = soupy(parse_html, 'html.parser')
    try:
        # Find the relative image url
        latest_image_full = full_img_soup.select_one('figure.lede a img').get("src")
    except AttributeError:
        return None
    # Use the base url to create an absolute url
    latest_imgurl = f"https://www.jpl.nasa.gov{latest_image_full}"
    return latest_imgurl
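# mars_news(), featured_image() and get_url() all expect a caller-supplied
# Splinter browser. A minimal sketch of that setup (driver choice and
# headless flag are assumptions):
from splinter import Browser
from bs4 import BeautifulSoup as soupy

browser = Browser('chrome', headless=True)
title, teaser = mars_news(browser)
image_url = featured_image(browser)
hemisphere_urls = get_url(browser)
browser.quit()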
#!/usr/bin/python3
"""
print bs4 object
run executable
./printSoup.py
"""

if __name__ == "__main__":
    from bs4 import BeautifulSoup as soupy

    myfile = open('python.html')
    soup = soupy(myfile, "lxml")
    print("BeautifulSoup Object: {}\na: {}\nstrong: {}".format(
        type(soup), soup.find_all('a'), soup.find_all('strong')))
    print("id: {}\ncss/class: {}\ncssprint: {}".format(
        soup.find('div', {"id": "inventor"}),
        soup.select('#inventor'),
        soup.select('.wow')))
    """
    notice how the print statements return single-item lists
    $ ./printSoup.py
    BeautifulSoup Object: <class 'bs4.BeautifulSoup'>
    a: [<a href="https://facebook.com">here</a>]
    strong: [<strong>Friends</strong>]
    id: <div id="inventor">Mark Zuckerberg</div>
    css/class: [<div id="inventor">Mark Zuckerberg</div>]
    =========================================================
    =========================================================
    Let's get the actual content now
    """
    print("===============================================")
    print("================================================")
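# printSoup.py expects a local python.html. A hypothetical file consistent with
# the sample output above (the .wow placement is a guess); write it out so the
# script has something to parse:
sample = """<html><body>
<div id="inventor">Mark Zuckerberg</div>
<p class="wow">Click <a href="https://facebook.com">here</a> to meet
<strong>Friends</strong>.</p>
</body></html>"""
with open('python.html', 'w') as f:
    f.write(sample)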
from bs4 import BeautifulSoup as soupy
import urllib.request
import re
import sys
import os

# Pull the latest command from the account's description meta tag
html = urllib.request.urlopen("https://twitter.com/<Your Account Here>").read()
soup = soupy(html, "lxml")
x = soup.find("meta", {"name": "description"})['content']
command = re.findall('"([^"]*)"', x)


def panic():
    """Power the machine off and dismount TrueCrypt volumes."""
    print("Shutting down")
    # startswith() avoids matching "darwin", which also contains "win"
    if sys.platform.startswith("win"):
        os.popen("shutdown /p /f")
    elif "darwin" in sys.platform:
        os.popen("shutdown -s now")
    elif "linux" in sys.platform or "bsd" in sys.platform:
        os.popen("poweroff")
    # Dismount TrueCrypt volumes as well
    if sys.platform.startswith("win"):
        os.popen("truecrypt /d")
    else:
        os.popen("truecrypt -d")
from bs4 import BeautifulSoup as soupy
import urllib.request
import re

html = urllib.request.urlopen('https://twitter.com/RuriRurouni').read()
soup = soupy(html, features="html.parser")

# The full meta tag first, then just its content attribute
x = soup.find("meta", {"name": "description"})
print(x)
x = x['content']
print(x)

# The tweet text sits between double quotes inside the description
# ('matches' avoids shadowing the filter() builtin)
matches = re.findall(r'"(.*?)"', x)
data = matches[0]
print(data)
import requests
import re
from bs4 import BeautifulSoup as soupy

proxies = {"http": "http://localhost:8080"}

# Use one session throughout so the anti-CSRF token stays tied to the same
# PHPSESSID cookie that fetched it (a plain requests.get would drop it)
s = requests.session()
r = s.get("http://192.168.20.147/login.php", proxies=proxies)
soup = soupy(r.text, features='html.parser')

# The fourth <input> on the page carries the anti-CSRF token
a = soup.find_all('input')[3]
user_token = re.findall(r'value="(.*)">', str(a))[0]

s.post("http://192.168.20.147/login.php",
       data={
           'username': '******',
           'password': '******',
           'Login': '******',
           'user_token': user_token
       },
       proxies=proxies)
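# BeautifulSoup can also read the token attribute directly, which is sturdier
# than regex-matching the tag's string form (assumes the field is named
# user_token, as the POST data above suggests):
user_token = soup.find('input', {'name': 'user_token'})['value']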