def check_domain(name):
    # Phrases that whois output uses to signal an unregistered domain
    words = ['not found', 'No match', 'is free', 'AVAILABLE',
             'nothing found', 'No entries found', 'NOT FOUND']
    fulltext = ""
    myurl = 'https://www.who.is/whois/' + name
    uClient = uReq(myurl)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, 'html.parser')
    check = page_soup.findAll("pre", {"style": "border:0px;"})
    for texts in check:
        fulltext = texts.text.strip()
    if not check:
        print("Sorry mate, domain %s is already registered" % name)
    elif any(word in fulltext for word in words):
        print("Domain %s is available" % name)
    else:
        print("Sorry mate, domain %s is already registered" % name)
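# check_domain uses the uReq/soup aliases seen throughout these snippets but never
# imports them. A minimal usage sketch under that assumption; the import lines and the
# example domain below are illustrative, not part of the original function.
from urllib2 import urlopen as uReq   # urllib.request.urlopen on Python 3
from bs4 import BeautifulSoup as soup

check_domain("example.com")  # prints whether the name appears registered or available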
def checkOpen(coursename, coursenum, lecture):
    url = ('http://www.adm.uwaterloo.ca/cgi-bin/cgiwrap/infocour/salook.pl'
           '?level=under&sess=1185&subject=' + coursename + '&cournum=' + coursenum)
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    all_cells = page_soup.findAll('td')
    index = 0
    capacity = 0
    current = 0
    # The page uses no class names or IDs, so locate the desired cells by matching text
    for cell in all_cells:
        text = cell.get_text()
        # The 5th cell after 'LEC 00X' is the current enrollment
        if index == 5:
            current = text
        # The 6th cell after 'LEC 00X' is the enrollment capacity
        if index == 6:
            capacity = text
            break
        # After finding the 'LEC 00X' cell, start counting the cells that follow
        if text == ('LEC 00' + lecture + ' '):
            index += 1
            continue
        if index > 0:
            index += 1
    if int(current) < int(capacity):
        return True
    return False
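# A sketch of how checkOpen might be called; the subject code, course number, and
# lecture section here are made-up examples, not values from the original snippet.
if checkOpen("CS", "246", "1"):
    print("LEC 001 still has open seats")
else:
    print("LEC 001 is full")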
def generateListings(bedrooms, furnished):
    my_other_url = 'https://www.kijiji.ca/b-short-term-rental/gta-greater-toronto-area/sublet/k0c42l1700272'
    index = my_other_url.find('/sublet/') + len('/sublet/')
    url_end = my_other_url[index:]
    if bedrooms == '1':
        url_end = '1+bedroom/' + url_end
    elif bedrooms == '2':
        url_end = '2+bedrooms/' + url_end
    elif bedrooms == '3':
        url_end = '3+bedrooms/' + url_end
    elif bedrooms == '4':
        url_end = '4+bedrooms__5+bedrooms__6+more+bedrooms/' + url_end
    if furnished == 'Y':
        url_end = url_end + '?furnished=1'
    elif furnished == 'N':
        url_end = url_end + '?furnished=0'
    my_url = my_other_url[:index] + url_end
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll(True, {"class": ['search-item', 'regular-ad']})
    return containers
def main():
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    href_tags = page_soup.find_all(href=True)
    initial_result = len(href_tags)

    time.sleep(10)

    # Re-fetch and re-parse the page so the second count reflects any change
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    href_tags = page_soup.find_all(href=True)
    new_result = len(href_tags)

    print(new_result - initial_result)  # how many links were added or removed
    if new_result - initial_result == 0:
        print("no change")
    else:
        # A change in the link count was detected: send a text via Twilio
        client = twilio.rest.Client('AC05a19e314e2e0a36da9d8966556c359c',
                                    '8cf175a0d0d3587e9a8ceece40bfa2c6')
        client.messages.create(
            body="Google just changed something on their homepage",
            to=my_phone_number,
            from_=twilio_phone_number)
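# main() relies on several module-level names that the snippet never defines: my_url,
# my_phone_number, twilio_phone_number, and the time/twilio/parser imports. A sketch of
# that assumed setup; every value below is a placeholder.
import time
import twilio.rest
from urllib2 import urlopen as uReq   # urllib.request.urlopen on Python 3
from bs4 import BeautifulSoup as soup

my_url = 'https://www.google.com'      # page being watched (assumed from the SMS text)
my_phone_number = '+15550000000'       # placeholder
twilio_phone_number = '+15551111111'   # placeholder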
def my_function2():
    # List of URLs to scrape from
    my_url = [
        'https://magicseaweed.com/Narragansett-Beach-Surf-Report/1103/',
        'https://magicseaweed.com/2nd-Beach-Sachuest-Beach-Surf-Report/846/',
        'https://magicseaweed.com/Nahant-Surf-Report/1091/',
        'https://magicseaweed.com/Nantasket-Beach-Surf-Report/371/',
        'https://magicseaweed.com/Scituate-Surf-Report/372/',
        'https://magicseaweed.com/Cape-Cod-Surf-Report/373/',
        'https://magicseaweed.com/The-Wall-Surf-Report/369/',
        'https://magicseaweed.com/Green-Harbor-Surf-Report/864/',
        'https://magicseaweed.com/Cape-Ann-Surf-Report/370/',
        'https://magicseaweed.com/27th-Ave-North-Myrtle-Surf-Report/2152/',
        'https://magicseaweed.com/Cocoa-Beach-Surf-Report/350/'
    ]

    # Open the database once and make sure the table exists
    conn = sqlite3.connect('SurfSend.db')
    cursor = conn.cursor()
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS WindInfo(ID INTEGER PRIMARY KEY, WindMPH TEXT)'
    )

    # Iterate over the list of URLs
    for url in my_url:
        # Open the connection and grab the page
        uClient = uReq(url)
        # Offload the content into a variable
        page_html = uClient.read()
        # Close our client
        uClient.close()

        # HTML parsing with BeautifulSoup
        page_soup = soup(page_html, "html.parser")

        # Every wind cell in the forecast table
        wind = page_soup.findAll(
            'td', class_=re.compile("text-center table-forecast-wind td-nowrap"))

        # Iterate over the parsed cells and store the wind speed text
        for w in wind:
            wi = w.find('span', class_='stacked-text text-right')
            winb = wi.text.strip()
            cursor.execute("INSERT INTO WindInfo (WindMPH) VALUES (?)", (winb,))

    conn.commit()
    cursor.close()
    conn.close()
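# Once my_function2 has run, the wind readings sit in the WindInfo table it creates.
# A quick verification sketch; it uses only the table and column names defined above.
import sqlite3

conn = sqlite3.connect('SurfSend.db')
for row_id, wind_mph in conn.execute("SELECT ID, WindMPH FROM WindInfo"):
    print(row_id, wind_mph)
conn.close()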
def generateListings(bedrooms, furnished):
    # Note: the bedrooms and furnished arguments are not used in this Craigslist version
    my_other_url = 'https://toronto.craigslist.ca/search/sub'
    uClient = uReq(my_other_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll(True, {"class": ['result-row']})
    return containers
def get_src_of_rand_video():
    _url = base_url + random.choice(show_titles)
    uClient = uReq(_url)
    page_soup = soup(uClient.read(), "html.parser")
    uClient.close()
    episodes = page_soup.select("div.cat-eps a.sonra")
    link = random.choice(episodes).get('href')

    # Get the video source out of the embedded player
    driver.get(link)
    iframes = driver.find_elements_by_tag_name('iframe')
    driver.switch_to.frame(1)
    src = driver.find_element_by_id('video-js') \
        .find_element_by_tag_name('source').get_attribute('src')
    return src
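# get_src_of_rand_video depends on globals that the snippet never defines: base_url,
# show_titles, and a Selenium driver. A sketch of the kind of setup it assumes; the URL
# and show list are placeholders, and the driver just needs the legacy find_element_*
# API that the function itself calls.
import random
from selenium import webdriver

base_url = 'https://example-streaming-site.com/shows/'   # placeholder
show_titles = ['show-one', 'show-two']                   # placeholder
driver = webdriver.Firefox()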
def scrapeLoop():
    # Open connection to Twitter
    uClient = uReq(twit_url)
    # Offload content from the page into a variable.
    # Note: this only reads what is immediately loaded; it does not trigger the
    # infinite scroll that adds more tweets at the bottom of the page.
    page_html = uClient.read()
    # Close the client
    uClient.close()
    # Use the BeautifulSoup HTML parser so we can walk the elements of the page
    page_soup = soup(page_html, "html.parser")
    # Now that we have the page, find all the containers that hold tweet text
    containers = page_soup.findAll("div", {"class": "js-tweet-text-container"})

    # Iterate through each container and pull its information
    for container in containers:
        # The text within the tweet
        tweet_text = container.p.text
        # Count the characters in the extracted text, excluding any trailing
        # "pic.twitter" link: if that substring is present, only count up to it
        pic_sub = "pic.twitter"
        index = tweet_text.find(pic_sub)
        if index > -1:
            count = index
        else:
            count = len(tweet_text)
        f.write(str(count) + "\n")

    global loops
    loops = loops - 1
    # Re-arm the timer so we collect data for a set number of intervals
    if loops > 0:
        timer = Timer(600.0, scrapeLoop)
        timer.start()
    else:
        # Close the file so the collected counts are flushed to disk
        f.close()
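# scrapeLoop writes to a file handle f, decrements a global loops counter, and re-arms
# itself with Timer, none of which the snippet defines. A sketch of the assumed setup;
# the account URL, file name, and loop count are placeholders.
from threading import Timer

twit_url = 'https://twitter.com/some_account'   # placeholder account page
f = open('tweet_lengths.txt', 'w')              # placeholder output file
loops = 6                                       # number of 10-minute intervals to run

scrapeLoop()  # first pass; the Timer keeps re-running it until loops reaches 0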
def ScrapePrice(data):
    url = data
    print(url)
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    print("start flipkart")
    FinalResult = []
    p = page_soup.find("span", {"id": "best_price"})
    FinalResult.append(p.text)
    seller = page_soup.find("span", {"class": "btn-span"})
    FinalResult.append(seller.text)
    print("found result")
    return FinalResult
def main():
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    href_tags = page_soup.find_all(href=True)
    initial_result = len(href_tags)

    time.sleep(10)

    # Re-fetch and re-parse the page, then count the links again
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    href_tags = page_soup.find_all(href=True)
    new_result = len(href_tags)

    if new_result - initial_result == 0:
        print("no change")
    else:
        print("change")
def school_fun(new_url):
    filename = "schools_details.csv"
    f = open(filename, "a")
    driver = webdriver.Firefox()
    try:
        my_url = new_url
        # Opening up the connection, grabbing the page
        newClient = uReq(my_url)
        new_page_html = newClient.read()
        newClient.close()
        # HTML parsing
        new_page_soup = soup(new_page_html, "html.parser")
        # Following the link in the browser as well
        driver.get(new_url)
        # doc = driver.page_source
        new_container = new_page_soup.findAll('font', {"face": "times new roman, serif"})
        # Getting school_name and school_email_id
        school_name = new_container[1].text.strip()
        email_id = new_container[9].text.strip()
        school_email_id = re.findall(r'[\w\.-]+@[\w\.-]+', email_id)
        # You can comment out the following print statements if they are not needed
        print("--------------------------------------------------------------------------------------------------------")
        print("School_Name: " + school_name)
        print("School-Email_id: " + school_email_id[0].strip())
        print("--------------------------------------------------------------------------------------------------------")
        f.write(school_name + "," + school_email_id[0].strip() + "\n")
    except:
        print("--------------------------------------------------------------------------------------------------------")
        print("DATA COULD NOT BE RETRIEVED")
        print("--------------------------------------------------------------------------------------------------------")
    f.close()
    driver.quit()
def main():
    time = str(datetime.now().time().hour)
    day = datetime.today().weekday()
    dining_url = 'http://housing.utexas.edu/dining/hours'
    uClient = uReq(dining_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("table", {"class": "tablesaw tablesaw-stack"})
    openPlaces = []
    times = []
    places = []
    data = []
    for container in containers:
        day_values = container.tbody.findAll("tr")
        place = ""
        for val in day_values:
            if val.th is not None:
                # Ex. J2 Dining
                place = val.th
                places.append(place.text.strip())
            day_info = val.findAll("td")
            days = []
            isTime = 0
            timeLeft = 0
            timesRange = ""
            dayRange = ""
            for temp in day_info:
                text = temp.text.strip()
                if len(text) != 0:  # skip the blank cells under the day columns
                    if text[0].isdigit() or text == "Closed" or text[0] == "N":
                        # time ranges
                        timesRange = text
                        isTime = checkTime(text, time)
                    else:
                        dayRange = text
                        days = checkDay(text)
                    if len(days) > 0 and -1 not in days:
                        if day in days and isTime == 1:
                            data.append({"name": place.text.strip()})
    sac(time, data)
    union(time, data)
    print(data)
    return render_template('index.html', data=data)
def PriceNameSuggestion(name):
    print("entered Price name suggestion fn")
    try:
        my_url = 'http://scandid.in/search?q=' + name + '&type=products'
        print(my_url)
        uClient = uReq(my_url)
        page_soup = soup(uClient.read(), "html.parser")
        uClient.close()
        data = page_soup.findAll("a", {"class": "ellipsis multiline"})[0:8]
        names = []
        links = []
        for i in data:
            names.append(i.text)
            links.append('http://scandid.in/' + i['href'])
        print("name is ", names)
        print("link is ", links)
        return (names, links)
    except:
        print("Error opening the URL")
def scrapedata(data):
    print("entered scrapedata fn")
    try:
        my_url = 'https://www.goodguide.com/products?filter=' + data
        print(my_url)
        uClient = uReq(my_url)
        page_soup = soup(uClient.read(), "html.parser")
        uClient.close()
        results = page_soup.findAll("a", {"class": "entity-link"})[0:8]
        name = []
        link = []
        for i in results:
            name.append(i.get('title'))
            link.append(i.get('href'))
        return (name, link)
    except:
        print("Error opening the URL")
def flipkart(d):
    print(d)
    url = 'http://scandid.in/search?q=' + d + '&type=products'
    print(url)
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    print("start flipkart")
    containers = page_soup.findAll("a", {"class": "ellipsis multiline"})[0:10]
    print("found result")
    l = []
    for i in containers:
        print(i['href'])
        l.append(i['href'])  # collect the links so the returned list is not empty
    print("END")
    return l
def sac(currTime, data):
    sacRestaurants = ["Chick-fil-A", "P.O.D.", "Starbucks", "Taco Cabana", "Zen"]
    dayIndex = datetime.today().weekday()
    # dayIndex = getDayIndex(day)
    dining_url = 'https://universityunions.utexas.edu/sac-hours/fall-2019'
    uClient = uReq(dining_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("table", {"class": "tablesaw tablesaw-stack"})
    locations = containers[2].tbody.findAll("tr")
    for location in locations:
        times = location.findAll("td")
        name = times[0].text.strip()
        if name[:6] == "P.O.D.":
            name = "P.O.D."
        if name in sacRestaurants:
            if checkSacTime(times[dayIndex].text.strip(), currTime) == 1:
                data.append({"name": name})
def ScrapeResult(data):
    print("entered ScrapeResult fn")
    try:
        my_url = 'https://www.goodguide.com' + data
        print(my_url)
        uClient = uReq(my_url)
        page_soup = soup(uClient.read(), "html.parser")
        uClient.close()

        title = page_soup.find("h1")
        print(title.text)

        imgParent = page_soup.find("p", {"class": "text-center product-highlight-image"})
        img = imgParent.find("img")
        i = img['src']

        scoreParent = page_soup.find("p", {"class": "ring-value number"})
        score = scoreParent.find("a")
        print(score.text)

        contentParent = page_soup.find(
            "p", {"class": "rating-explained-ingredient-count number high"})
        HighHazardConcern = contentParent.find("a")
        print(HighHazardConcern.text)

        contentParent2 = page_soup.find(
            "p", {"class": "rating-explained-ingredient-count number medium"})
        MediumHazardConcern = contentParent2.find("a")
        print(MediumHazardConcern.text)

        contentParent3 = page_soup.find(
            "p", {"class": "rating-explained-ingredient-count number low"})
        LowHazardConcern = contentParent3.find("a")
        print(LowHazardConcern.text)

        print("END")
        return (title.text, i, score.text, HighHazardConcern.text,
                MediumHazardConcern.text, LowHazardConcern.text)
    except:
        print("Error opening the URL: Error in scrape result")
def union(currTime, data):
    unionRestaurants = ["Starbucks", "Chick-Fil-A", "P.O.D.", "Quiznos", "MoZZo",
                        "Panda Express", "Field of Greens Market Place",
                        "Wendy's @ Jester", "Java City @ PCL"]
    dayIndex = datetime.today().weekday()
    # print day
    # dayIndex = getDayIndex(day)
    dining_url = 'https://universityunions.utexas.edu/union-hours/fall-2019'
    uClient = uReq(dining_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("table", {"class": "tablesaw tablesaw-stack"})
    locations = containers[0].tbody.findAll("tr")
    # print dayIndex
    for location in locations:
        times = location.findAll("td")
        name = times[0].text.strip()
        if name[:4] == "Prov":
            name = "P.O.D."
        if name in unionRestaurants:
            # print name
            if checkUnionTime(times[dayIndex].text.strip(), currTime) == 0:
                data.append({"name": name})
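# main, sac, and union all call hour-checking helpers (checkTime, checkDay, checkSacTime,
# checkUnionTime) that are not shown, and whose parsing depends on how each page formats
# its hours. A rough illustrative sketch of such a predicate, assuming a simple
# '7am - 10pm' style string; the real helpers may well differ.
def check_hours(hours_text, curr_hour):
    def to_24h(t):
        hour = int(''.join(ch for ch in t if ch.isdigit()))
        if 'pm' in t.lower() and hour != 12:
            hour += 12
        if 'am' in t.lower() and hour == 12:
            hour = 0
        return hour

    try:
        start_text, end_text = [part.strip() for part in hours_text.split("-")]
        return 1 if to_24h(start_text) <= int(curr_hour) < to_24h(end_text) else 0
    except ValueError:
        # "Closed", blank cells, or anything unparseable counts as not open
        return 0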
def get_park_names(self):
    wikipedia_url = "https://en.wikipedia.org/wiki/List_of_national_parks_of_the_United_States"
    wikipedia_client = uReq(wikipedia_url)
    wikipedia_html = wikipedia_client.read()
    wikipedia_client.close()
    wikipedia_soup = soup(wikipedia_html, "html.parser")
    park_rows = wikipedia_soup.table.find_all('tr')
    park_rows.pop(0)
    num_parks = len(park_rows)
    parks = []
    for row in park_rows:
        parks.append(row.contents[1].contents[0].contents[0].encode(
            'ascii', 'ignore') + " National Park")
    fo = open("national_parks", "wb")
    for park in parks:
        fo.write(park)
        fo.write("\n")
    fo.close()
    return parks
def get_hh_info(hh_url):
    hh_df = pd.DataFrame({'Current_Price': [], 'Address': [], 'Neighbourhood': [],
                          'Last_sold_date': [], 'Last_Price': [], 'Other_info': [],
                          'House_details': []})
    all_hh_info = []
    uClient = uReq(hh_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, 'html.parser')

    # Feature list: split the block of text into individual detail strings
    pageinfo = page_soup.findAll("ul", {"class": "listInline mbn pdpFeatureList"})
    houseinfo = pageinfo[0].text.split('\n\n\n')
    text_content = [info.strip() for info in houseinfo if info != '']
    items = [item.split('\n') for item in text_content]
    strings = [string.split(',') for caption in items for string in caption]
    strings_ = [''.join(eles) if len(eles) > 1 else eles for eles in strings]
    house_details = flatten(strings_)

    # Headline price, address, and neighbourhood
    price = page_soup.select('span.h2.typeEmphasize')[0].text
    price1 = price.strip()
    address = page_soup.select('span.h2.typeEmphasize.pan.man.defaultLineHeight')[0].text
    address1 = address.strip()
    neighbourhood = page_soup.select(
        'span.h6.typeWeightNormal.pts.typeLowlight.miniHidden.xxsHidden')[0].text
    neighbourhood1 = neighbourhood.strip()

    # Sale history
    last_sold_date1 = page_soup.select('td.noWrap')[0].text
    his_price = page_soup.select('td.noWrap')[2].text
    his_price1 = his_price.strip()

    # Other bullet-point info
    others = page_soup.select('ul.listInlineBulleted.man.pts')[0].text
    others = others.split('\n')
    app_info1 = [other for other in others if other != '']

    all_hh_info.append(price1)
    all_hh_info.append(address1)
    all_hh_info.append(neighbourhood1)
    all_hh_info.append(last_sold_date1)
    all_hh_info.append(his_price1)
    all_hh_info.append(app_info1)
    all_hh_info.append(house_details)

    if len(all_hh_info) == len(hh_df.columns):
        hh_df.loc[len(hh_df.index)] = all_hh_info

    columns = ['Current_Price', 'Address', 'Neighbourhood', 'Last_sold_date',
               'Last_Price', 'Other_info', 'House_details']
    hh_df = pd.DataFrame(all_hh_info).T
    hh_df.columns = columns
    return hh_df
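# get_hh_info calls a flatten helper that the snippet never defines. Judging by how
# strings_ mixes plain strings with single-element lists, it presumably flattens one
# level of nesting; a sketch of such a helper, assumed rather than taken from the source.
def flatten(items):
    flat = []
    for item in items:
        if isinstance(item, list):
            flat.extend(item)
        else:
            flat.append(item)
    return flat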
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'

# Opening up connection, grabbing the webpage
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# HTML parsing
page_soup = soup(page_html, "html.parser")

# Grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})

# Creating a csv file and setting it to write mode
filename = "products.csv"
f = open(filename, "w")
headers = "brand,product_name,shipping\n"
f.write(headers)

for container in containers:
    brand = container.div.div.a.img["title"]

    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text

    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip()

    # Write one row per product (commas inside the name would break the columns)
    f.write(brand + "," + product_name.replace(",", "|") + "," + shipping + "\n")

f.close()
my_url = []
print("Enter wikia URL to get information:")
while True:
    urls = raw_input()
    if urls == "exit":
        break
    else:
        my_url.append(urls)

#my_url[0] = 'http://vampirediaries.wikia.com/wiki/Hybrid'
#my_url[1] = 'http://vampirediaries.wikia.com/wiki/Vampire'

# Opening a connection and getting the HTML contents
for url in my_url:
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "pi-item"})
    figures = page_soup.findAll("h2", {"class": "pi-item"})
    #for container, figure in zip(containers, figures):
    for container in containers:
        #print figure.text
        print(container.text)
        #print container
        print("\n")
        # print container.a
        # print "\n"
        # print container.li
from lxml import html
import requests
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

# Web scraper for xe.com

# These strings will be used to help build the URL.
# This program calculates the exchange rate of ONE currency, which you specify,
# with respect to a list of OTHER currencies, which you can add to.
from_currency = "USD"
to_currency_list = ["CAD", "EUR", "GBP"]

# List of the various time frames that XE provides
view_list = ["12h", "1D", "1W", "1M", "1Y", "2Y", "5Y", "10Y"]

for toCurr in to_currency_list:
    for view in view_list:
        my_url = ("https://www.xe.com/currencycharts/?from=" + from_currency +
                  "&to=" + toCurr + "&view=" + view)
        #page = requests.get(my_url)            # alternative: open the page with requests
        #tree = html.fromstring(page.content)   # and query it via XPath
        #low = tree.xpath('//*[@id="rates_detail_desc"]/strong[3]')
        #high = tree.xpath('//*[@id="rates_detail_desc"]/strong[4]')
        #print(low)

        uClient = uReq(my_url)        # Open webpage
        page_html = uClient.read()    # Read webpage
        uClient.close()               # Close webpage
        page_soup = soup(page_html, "html.parser")  # HTML parsing

        rates_detail = page_soup.find("div", {"id": "rates_detail_desc"})
        inner_text = rates_detail.text
        #rates_detail = page_soup.find("table", {"id": "crLive"})
        #rate = rates_detail.tbody
        print(inner_text)
#!/usr/bin/env python
# Developed by John Melody Mel
import bs4
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = "https://www.ubuntu.com"

# Grabbing the page:
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")

# Grab each Ubuntu download container
containers = page_soup.findAll("div", {"class": "Download Ubuntu"})
#len(containers)
#containers[0]
container = containers[0]
#for containers in container
download = container.div.div.a["href"]  # download link for Ubuntu 18.04.1 LTS
"""To extract relevant urls"""
from bs4 import BeautifulSoup as soup
from urllib2 import Request
from urllib2 import urlopen as uReq

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'
}

# Change URL here
req = Request(
    url='https://en.wikipedia.org/wiki/List_of_neighbourhoods_in_Bangalore',
    headers=headers)

uClient = uReq(req)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')

table_containers = page_soup.find('body').find('div', {
    'class': 'mw-parser-output'
}).findAll('table')

# Map each table to its rows
d = {}
for tcont in table_containers:
    d.update({tcont: tcont.findAll('tr')})

locale = ''
container = ''
for table in d:
    for container in d[table]:
        # Skip header rows that have no <td> cells
        if container.td is None:
            continue
import bs4
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

myUrl = "https://www.webopedia.com/Top_Category.asp"

# Open the page once, read it, then close that same connection
uClient = uReq(myUrl)
page_html = uClient.read()
uClient.close()
parsedPage = soup(page_html, "html.parser")

categories = parsedPage.findAll("div", {"class": "bullet_list"})

file = "terms.csv"
f = open(file, "w")
headers = "main_category_id, main_category_name, subCategory_id, subCategory_name, term_id, term_name, term_definition\n"
f.write(headers)

for index, category in enumerate(categories):
    main_category_id = index
    main_category_name = str(category.div.span.a["href"])
    subCategories1 = category.findAll("li", {"class": "listing-item"})
    subCategories2 = category.findAll("li", {"class": "listing-item-hidden"})
    subCategories = subCategories1 + subCategories2
    for indx, subCategory in enumerate(subCategories):
        subCategory_id = indx
        subCategory_name = str(subCategory.a.text)
        sublink = str(subCategory.a["href"])
        link = "https://www.webopedia.com" + sublink
        termsClient = uReq(link)
        terms_html = termsClient.read()
        termsClient.close()
        parsedTermsPage = soup(terms_html, "html.parser")
def scrape():
    url = 'http://www.meteokav.gr/weather/'
    client = uReq(url)
    page = client.read()
    client.close()
    page_soup = soup(page, "html.parser")

    # Helpers for the lookups that repeat throughout the table below
    def span_text(span_id):
        return page_soup.find("span", {"id": span_id}).text.strip()

    strongs = page_soup.find_all("strong")
    data1_rows = page_soup.find("table", {"class": "data1"}).find_all("tr")
    data1_cells = page_soup.find("table", {"class": "data1"}).find_all("td")
    extremes_rows = page_soup.find("td", {"rowspan": "3"}).find_all("tr")

    values_list = [
        ["Ενημέρωση απο το www.meteokav.gr:"],
        ["Θερμοκρασία:", span_text("ajaxtemp")[0:6]],
        [strongs[19].text.strip(), span_text("ajaxhumidity") + "%"],
        ["Αίσθηση σαν: ", span_text("ajaxfeelslike")],
        ["Διαφορά 24ώρου: ", strongs[0].text.strip()],
        ["Διαφορά ώρας: ", strongs[1].text.strip()],
        ["Ανεμος: " + span_text("ajaxwinddir") + "@" +
         span_text("ajaxbeaufortnum") + " Bft"],
        [strongs[21].text.strip() + " " + span_text("ajaxbaro") + " " +
         span_text("ajaxbarotrendtext")],
        ["Βροχή Σήμερα: " + span_text("ajaxrain")],
        # [page_soup.find("td", {"colspan": "2"}).find_all("tr")[1].find_all("td")[0].text.strip() +
        ["Μέγιστη Σήμερα: " + data1_rows[1].find_all("td")[1].text.strip()[0:6] + "@" +
         data1_rows[1].find_all("td")[1].text.strip()[-6:]],
        # [page_soup.find("td", {"colspan": "2"}).find_all("tr")[1].find_all("td")[0].text.strip() +
        ["Μέγιστη Χθες: " + data1_rows[1].find_all("td")[2].text.strip()[0:6] + "@" +
         data1_rows[1].find_all("td")[2].text.strip()[-6:]],
        ["Ελάχιστη Σήμερα: " + data1_rows[2].find_all("td")[1].text.strip()[0:4] + "@" +
         data1_rows[2].find_all("td")[1].text.strip()[-5:]],
        ["Ελάχιστη Χθες: " + data1_cells[5].text.strip()[0:4] + "@" +
         data1_cells[5].text.strip()[-5:]],
        [strongs[20].text.strip() + " " + span_text("ajaxdew")],
        ["Μέγιστη " + strongs[19].text.strip() + " " +
         extremes_rows[1].find_all("td")[1].text.strip()[0:3] + "@" +
         extremes_rows[1].find_all("td")[1].text.strip()[-5:]],
        ["Μέγιστη πιεση: " + extremes_rows[6].find_all("td")[1].text.strip()[0:10] + "@" +
         extremes_rows[6].find_all("td")[1].text.strip()[-5:]],
        ["Ελάχιστη πιεση: " + extremes_rows[7].find_all("td")[1].text.strip()[0:10] + "@" +
         extremes_rows[7].find_all("td")[1].text.strip()[-5:]]
    ]
    # y = values_list
    # uni_values = unicodedata.normalize('NFKD', y).encode('ascii', 'ignore')
    return tabulate(values_list)
import bs4
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

print("web scrap")
my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=video%20cards'

uClient = uReq(my_url)       # opening connection
page_html = uClient.read()   # grabbing page
uClient.close()              # close connection

# Load the entire HTML document into memory and parse it
page_soup = soup(page_html, "html.parser")

# We need to grab the list of video cards. Using Chrome's "inspect" tool on the page
# shows that each individual search result sits in a div with the class name
# item-container, so we select by that class below (an id would also work).
containers = page_soup.findAll("div", {"class": "item-container"})

# Putting it into a csv file
filename = "graphics_card.csv"                # name of file
f = open(filename, "w")                       # open file and declare access type
headers = "brand, product_name, shipping \n"  # the headers for each column
f.write(headers)
for i in xrange(1, len(data)):
    imdb_score.append(float(data["Column 27"][i]))
mean = np.mean(imdb_score)
std_Dev = np.std(imdb_score)
print("Population mean:" + str(mean))
print("Population Standard deviation:" + str(std_Dev))
print("Population length:" + str(len(data)))

######################### Script to get data of 2016 movies using web scraping #################
import bs4
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

url = "http://www.imdb.com/list/ls070941500/"
page = uReq(url)
page_html = page.read()
page.close()
page_soup = soup(page_html, "lxml")
# print page_soup

movie = page_soup.findAll("div", {"class": "info"})
file = open("imdb_2016.txt", "w")
imdb_score2016 = []
for x in xrange(0, len(movie)):
    movie_name = movie[x].b.a.get_text()
    imdb_score = movie[x].div.findAll("span", {"class": "value"})[0].get_text()
    imdb_score2016.append(float(imdb_score))
    file.write(str(x) + ". \t" + movie_name + "\t\t\t\t\t\t\t\t\t" + imdb_score + "\n")
file.close()
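# The statistics block at the top assumes a data table and an imdb_score list defined
# earlier in the script. A sketch of what that setup might look like; the CSV file name
# and the use of pandas are assumptions, only the "Column 27" name comes from the code.
import numpy as np
import pandas as pd

data = pd.read_csv("movies.csv")   # hypothetical input file with IMDb scores in "Column 27"
imdb_score = []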
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup
import urllib

my_url = 'https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/academic-major'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")

containers = page_soup.findAll("ul", {"id": "ullist"})
listpoints = containers[0].findAll("li")
for points in listpoints:
    links = points.a["href"]
    bas = "https://www.scholarships.com"
    finlink = bas + links
    u1Client = uReq(finlink)
    page1_html = u1Client.read()
    u1Client.close()
    page1_soup = soup(page1_html, "html.parser")
    cont = page1_soup.findAll("td")
    for pts in cont:
        lks = pts.a["href"]
        finlks = bas + lks
        '''u2Client=uReq(finlks)
        page2_html=u2Client.read()
        u2Client.close()
        page2_soup = soup(page2_html,"html.parser")'''
        u2client = urllib.urlopen(finlks.encode('utf-8')).read()
        page2_soup = soup(u2client, "html.parser")