def downloadLocationList():
    url = 'https://pilotflyingj.com/umbraco/surface/storelocations/download?Format=xls&PageSize=793&PageNumber=1'
    client = urlreq(url)
    sheet = client.read()
    client.close()
    print("Location list read from website.", file=sys.stderr)
    try:
        mkdir('cache')
    except FileExistsError:
        # ignore an error that indicates cache already exists
        pass
    file = open(cacheFileName, 'wb')
    file.write(sheet)
    file.close()
def downloadLocationList():
    url = 'http://www.ta-petro.com/assets/ce/Documents/Master-Location-List.xls'
    client = urlreq(url)
    sheet = client.read()
    client.close()
    print("Location list read from website.", file=sys.stderr)
    try:
        mkdir('cache')
    except FileExistsError:
        # ignore an error that indicates cache already exists
        pass
    file = open(cacheFileName, 'wb')
    file.write(sheet)
    file.close()
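# Both downloadLocationList() variants above lean on module-level names defined
# elsewhere in their scripts: the urlreq alias, sys, mkdir, and a cacheFileName
# path. A minimal sketch of those assumptions (the cacheFileName value here is
# illustrative only):
import sys
from os import mkdir
from urllib.request import urlopen as urlreq

cacheFileName = 'cache/locations.xls'  # hypothetical path; each script defines its own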
def nfl_get_num_weeks(year):
    # Work out how many weeks the given NFL season has by scraping its week-1 page.
    maxWeek = 1
    url = 'https://www.pro-football-reference.com/years/' + year + '/week_1.htm'
    try:
        # open connection and get page html
        urlClient = urlreq(url)
        page_html = urlClient.read()
        urlClient.close()
    except Exception:
        print("Could not open page: " + url)
        return maxWeek
    page_soup = soup(page_html, "html.parser")
    # the week navigation links live inside an HTML comment in the section wrapper;
    # counting the "/week_" links gives the number of weeks
    maxWeek = int(
        str(
            page_soup.find("div", {"class": "section_wrapper"}).find(
                string=lambda text: isinstance(text, Comment))).count("/week_"))
    return maxWeek
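# nfl_get_num_weeks() additionally needs BeautifulSoup and its Comment class,
# because Pro Football Reference keeps the week navigation links inside an HTML
# comment. A minimal usage sketch under those assumptions (the year is only an
# example value):
from bs4 import BeautifulSoup as soup, Comment

# print(nfl_get_num_weeks("2019"))  # counts the "/week_" links on that season's week-1 page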
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as urlreq

myurl = "https://www.flipkart.com/search?q=phones&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off"

# download and parse the search results page
uclient = urlreq(myurl)
page_html = uclient.read()
uclient.close()
page_soup = soup(page_html, "html.parser")
page_soup.body.p  # gives you the p value

filename = "products.csv"
f = open(filename, "w")
headers = "name, price, rating \n"
f.write(headers)

names = page_soup.find_all("div", {"class": "_3wU53n"})          # grabs all names using class
price = page_soup.find_all("div", {"class": "_1vC4OE _2rQ-NK"})  # grabs all prices using class
rating = page_soup.find_all("div", {"class": "hGSR34"})          # grabs all ratings using class

# walk the three lists together so a short results page cannot raise an IndexError
for name_tag, price_tag, rating_tag in zip(names, price, rating):
    product_name = name_tag.text
    product_price = price_tag.text
    product_rating = rating_tag.text
    print("name :" + product_name)
    print("price :" + product_price)
    print("rating :" + product_rating)
    f.write(product_name + "," +
            product_price.replace("₹", " ").replace(",", "") + "," +
            product_rating + "\n")

f.close()
def scrape(self):
    if self.first:
        titles = "brand, product_name, sale_percentage, price, shipping, total, url_link"
        file_name = "newegg.csv"
        try:
            f = open(file_name, "w")
        except PermissionError:
            print("------ERROR------ERROR------ERROR------ERROR------ERROR------ERROR------ERROR------")
            print("FILE OPEN ERROR! Please close the .csv file before running the program.")
            print("-----------------------------------------------------------------------------------")
            print('\n')
            print("Exiting program...")
            sleep(3)
            quit()
        currentDate = str(datetime.datetime.now())
        f.write(titles + ',,,' + currentDate + "\n")
        print('Collecting data, be patient!...')
        print('\n')
    else:
        file_name = "newegg.csv"
        try:
            f = open(file_name, "a")
        except PermissionError:
            print("------ERROR------ERROR------ERROR------ERROR------ERROR------ERROR------ERROR------")
            print("FILE OPEN ERROR! Please close the .csv file before running the program.")
            print("-----------------------------------------------------------------------------------")
            print('\n')
            print("Exiting program...")
            sleep(3)
            quit()

    # set the target URL, download the page, and read the contents into url_page_dump
    url_page_download = urlreq(self.target_url)
    url_page_dump = url_page_download.read()
    url_page_download.close()

    # parse the dumped page into html
    page_soup = bsoup(url_page_dump, "html.parser")

    # find the total number of pages
    try:
        num_of_pages = (page_soup.find_all("span", {"class": "list-tool-pagination-text"}))[0].strong.text[2:]
    except Exception:
        num_of_pages = '1'

    if self.debug:
        print('------------------------------DEBUG------------------------------')

    # loop through all the found pages
    for i in range(1, int(num_of_pages) + 1):
        if (i > self.page_limit) and (self.page_limit != -1):
            break
        if i == self.page_limit:
            page_limit_test = True
        else:
            page_limit_test = False

        # swap the trailing page number in the URL for the current page
        digit_length = len(str(i - 1))
        self.target_url = self.target_url[0:-digit_length] + str(i)
        if self.debug:
            print('PAGE URL: ' + str(self.target_url))

        url_page_download = urlreq(self.target_url)
        url_page_dump = url_page_download.read()
        page_soup = bsoup(url_page_dump, "html.parser")

        # if the "Search Within:" label is missing, Newegg is probably serving a bot-check page
        try:
            anchor_element = page_soup.find("div", {"class": "list-tool-search"}).label.text
            if anchor_element != "Search Within:":
                print("\n")
                print("------ERROR------ERROR------ERROR------ERROR------ERROR------ERROR------ERROR------")
                print("Newegg has likely detected this as a bot! Please consider adding a delay, limiting")
                print("pages, and changing the VPN server location!")
                print("-----------------------------------------------------------------------------------")
                print("\n")
                print("Exiting program...")
                sleep(3)
                quit()
        except AttributeError:
            print("\n")
            print("------ERROR------ERROR------ERROR------ERROR------ERROR------ERROR------ERROR------")
            print("Newegg has likely detected this as a bot! Please consider adding a delay, limiting")
            print("pages, and changing the VPN server location!")
            print("-----------------------------------------------------------------------------------")
            print("\n")
            print("Exiting program...")
            sleep(3)
            quit()

        # find every product listing on the current page
        item_containers = page_soup.find_all("div", {"class": "item-container"})
        #print(len(item_containers))

        # loop through all the products (containers) and pull out the useful information
        for container in item_containers:
            if self.debug:
                print('PAGE: ' + str(i))

            # find the product brand
            try:
                product_brand = container.find("a", {"class": "item-brand"}).img["title"]
            except Exception:
                product_brand = 'Unknown Brand'
            if self.debug:
                print('BRAND: ' + product_brand)

            # find the product name
            try:
                product_name = container.find("a", {"class": "item-title"}).text
            except Exception:
                product_name = "Unknown Name"
            if self.debug:
                print('PRODUCT NAME: ' + product_name)

            # find the sale percentage
            try:
                product_sale_percent = container.find("span", {"class": "price-save-percent"}).text
                if product_sale_percent[-1] != "%":
                    product_sale_percent = "0%"
            except Exception:
                product_sale_percent = "0%"
            if self.debug:
                print('SALE PERCENT: ' + product_sale_percent)

            # find the product price and strip the thousands comma if there is one
            try:
                product_price_dollars = container.find("li", {"class": "price-current"}).strong.text
                product_price_cents = container.find("li", {"class": "price-current"}).sup.text
                product_price = (product_price_dollars + product_price_cents)
            except Exception:
                product_price = "0"
            if ("," in product_price):
                product_price = product_price.replace(",", "")
            if self.debug:
                print('PRICE: ' + product_price)

            # find the product shipping price
            try:
                product_shipping_price = container.find("li", {"class": "price-ship"}).text
            except Exception:
                product_shipping_price = "0"
            if (product_shipping_price == ""):
                product_shipping_price = "0"
            elif (product_shipping_price[0] == "$"):
                product_shipping_price = product_shipping_price[1:product_shipping_price.find(" ")]
            elif (product_shipping_price == "Free Shipping"):
                product_shipping_price = "0"
            else:
                product_shipping_price = "0"
            if self.debug:
                print('SHIPPING PRICE: ' + product_shipping_price)

            # calculate the total product price with shipping included (limited to 2 decimal places)
            try:
                total_product_price = '%.2f' % (float(product_price) + float(product_shipping_price))
                if (total_product_price == "0"):
                    total_product_price = 'Unknown Price'
            except Exception:
                total_product_price = product_price
            if self.debug:
                print('TOTAL PRICE: ' + total_product_price)

            # find the product link
            try:
                product_link = container.find("a", {"class": "item-title"})["href"]
            except Exception:
                product_link = 'https://newegg.ca'
            if self.debug:
                print('URL: ' + product_link)

            f.write(product_brand + "," + product_name.replace(",", ";") + "," +
                    product_sale_percent + "," + product_price + "," +
                    product_shipping_price + "," + total_product_price + "," +
                    product_link + "\n")
            if self.debug:
                print('_____________________________________________________')

        # optional delay between pages (skipped after the last page or at the page limit)
        if (self.delay != 0) and (i != int(num_of_pages)) and (page_limit_test == False):
            if self.debug:
                print('\n')
                print('------------------------DEBUG------------------------')
                print('Delaying for ' + str(self.delay) + ' second(s)')
                print('-----------------------------------------------------')
            # count down with a separate name so the page counter i is not clobbered
            for sec in range(0, self.delay):
                if self.debug:
                    print('Delay time left: ' + str(self.delay - sec), end="\r")
                sleep(1)
            if self.debug:
                print('\n')
                print('_____________________________________________________')
    if self.debug:
        print('-----------------------------------------------------------------')
        print('\n')
        print('------------------------DEBUG------------------------')
        print('End of main sequence reached!')
        print('-----------------------------------------------------')

    f.close()

    if self.repeat:
        if self.debug:
            print('\n')
            print('------------------------DEBUG------------------------')
            print('Repeat sequence reached!')
            print('-----------------------------------------------------')
            print('\n')
        self.repeat = False
        self.first = False
        self.target_url = amd
        self.scrape()
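# scrape() above is written as a method: it reads its configuration from instance
# attributes (first, repeat, debug, delay, page_limit, target_url), expects urlreq,
# bsoup, sleep and datetime to be imported at module level, and re-runs itself
# against a module-level URL named amd when repeat is set. The class and the value
# of amd are not shown in this section; the sketch below is a hypothetical
# constructor illustrating what scrape() expects, not the original implementation.
class NeweggScraper:  # hypothetical name
    def __init__(self, target_url, page_limit=-1, delay=0, debug=False, repeat=False):
        self.target_url = target_url  # paginated search URL ending in a page number
        self.page_limit = page_limit  # -1 means "no page limit"
        self.delay = delay            # seconds to pause between pages
        self.debug = debug            # print per-item diagnostics
        self.repeat = repeat          # run a second pass against the module-level amd URL
        self.first = True             # the first pass writes the CSV header; later passes append

    scrape = scrape  # attach the scrape() defined above as the instance method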
def nfl_scores():
    # initialize year to current year and week to first week
    year = str(date.today().year)
    week = "1"
    maxWeek = 1
    try:
        if len(sys.argv) == 4:
            if len(str(sys.argv[2])) == 4:
                year = str(sys.argv[2])
            else:
                print("Must input a valid year!")
                raise Exception
            maxWeek = nfl_get_num_weeks(str(year))
            if int(sys.argv[3]) > 0:
                if int(sys.argv[3]) <= maxWeek:
                    week = str(sys.argv[3])
                else:
                    print("NFL " + year + " only has " + str(maxWeek) + " weeks!")
                    raise Exception
            else:
                print("Must enter a positive value for week!")
                raise Exception
        else:
            print("Arguments for football are <year> <week>")
            raise Exception
    except Exception:
        print("One or more of your arguments are not valid!")

    # Pro Football Reference
    my_url = 'https://www.pro-football-reference.com/years/' + year + '/week_' + week + '.htm'
    pageExist = True
    try:
        # open connection and get page html
        urlClient = urlreq(my_url)
        page_html = urlClient.read()
        urlClient.close()
    except HTTPError:
        pageExist = False
        print("Page does not exist!")

    if pageExist:
        # initiate soup
        page_soup = soup(page_html, "html.parser")
        gamePosts = page_soup.findAll("div", {"class": "game_summary expanded nohover"})

        # header
        print("Getting scores from Week " + week + " of the " + year + " Season!")
        print("================================================")

        for post in gamePosts:
            gameDate = post.find("tr", {"class": "date"}).text
            homeTeam = post.find("table", {"class": "teams"}).tbody.select("tr")[1]
            homeTeamName = homeTeam.select("td")[0].text
            awayTeam = post.find("table", {"class": "teams"}).tbody.select("tr")[2]
            awayTeamName = awayTeam.select("td")[0].text
            homeTeamPoints = homeTeam.select("td")[1].text
            awayTeamPoints = awayTeam.select("td")[1].text

            # per-game stat leaders (passing, rushing, and receiving yards)
            passYdName = post.find("table", {"class": "stats"}).select("tr")[0].select("td")[1].text
            passYdPts = post.find("table", {"class": "stats"}).select("tr")[0].select("td")[2].text
            rushYdName = post.find("table", {"class": "stats"}).select("tr")[1].select("td")[1].text
            rushYdPts = post.find("table", {"class": "stats"}).select("tr")[1].select("td")[2].text
            recYdName = post.find("table", {"class": "stats"}).select("tr")[2].select("td")[1].text
            recYdPts = post.find("table", {"class": "stats"}).select("tr")[2].select("td")[2].text

            # the winning team's row carries the "winner" class
            if post.find("table", {"class": "teams"}).find("tbody").select("tr")[1] == post.find("tr", {"class": "winner"}):
                winnerIndex = 1
            else:
                winnerIndex = 2

            print(gameDate + ":")
            print("(H) " + homeTeamName + " scored " + homeTeamPoints + " points.", end=" ")
            if winnerIndex == 1:
                print("[WINNER]")
            else:
                print()
            print("(A) " + awayTeamName + " scored " + awayTeamPoints + " points.", end=" ")
            if winnerIndex == 2:
                print("[WINNER]")
            else:
                print()
            print("\t-" + passYdName + " led in passing yards with " + passYdPts + " yards")
            print("\t-" + rushYdName + " led in rushing yards with " + rushYdPts + " yards")
            print("\t-" + recYdName + " led in receiving yards with " + recYdPts + " yards")
            print('------------------------------------------------------------')
            print()
def nba_scores():
    # Set default date to current day
    month = str(date.today().month)
    day = str(date.today().day)
    year = str(date.today().year)
    validInDate = True

    # check for inputs
    try:
        if len(sys.argv) >= 3:
            # input is today
            if sys.argv[2].lower() == "today" or sys.argv[2].lower() == "t":
                month = str(date.today().month) if len(str(date.today().month)) == 2 else "0" + str(date.today().month)
                day = str(date.today().day) if len(str(date.today().day)) == 2 else "0" + str(date.today().day)
                year = str(date.today().year)
            # input is yesterday
            elif sys.argv[2].lower() == "yesterday" or sys.argv[2].lower() == "y":
                daysBack = 1
                # set amount of days back
                if len(sys.argv) == 4:
                    if int(sys.argv[3]) >= 0:
                        daysBack = int(sys.argv[3])
                    else:
                        print("Can not go back negative days; going back 1 day instead!")
                # set month, day, and year
                newDate = date.today() - timedelta(daysBack)
                month = str(newDate.month)
                day = str(newDate.day)
                year = str(newDate.year)
            # date is given
            else:
                # input year is after current year
                if int(sys.argv[4]) > int(date.today().year):
                    validInDate = False
                elif int(sys.argv[4]) == int(date.today().year):
                    # input month is after current month
                    if int(sys.argv[2]) > int(date.today().month):
                        validInDate = False
                    elif int(sys.argv[2]) == int(date.today().month):
                        # input day is after current day
                        if int(sys.argv[3]) > int(date.today().day):
                            validInDate = False
                if not validInDate:
                    # reject dates in the future
                    raise Exception
                month = str(sys.argv[2])
                day = str(sys.argv[3])
                year = str(sys.argv[4])
    except Exception:
        print("One or more of your arguments are not valid!")

    # Basketball Reference
    my_url = 'https://www.basketball-reference.com/boxscores/?month=' + month + '&day=' + day + '&year=' + year
    pageExist = True
    try:
        # open connection and get page html
        urlClient = urlreq(my_url)
        page_html = urlClient.read()
        urlClient.close()
    except HTTPError:
        pageExist = False
        print("Page does not exist!")

    if pageExist:
        # html parsing
        page_soup = soup(page_html, "html.parser")
        # get all game posts
        gamePosts = page_soup.findAll("div", {"class": "game_summary expanded nohover"})

        # header
        print("Getting scores from " + month + "/" + day + "/" + year)
        print("=================================")

        # parse data for all games
        for post in gamePosts:
            leadScorer = post.find("table", {"class": "stats"}).tbody.tr.select('td')[1].text
            mostPoints = post.find("table", {"class": "stats"}).tbody.tr.select('td')[2].text
            homeTeam = post.select('table')[1].tbody.select('tr')[0].td.a.text
            awayTeam = post.select('table')[1].tbody.select('tr')[1].td.a.text
            winner = post.find("table", {"class": "teams"}).tbody.find("tr", {"class": "winner"})
            winnerTeam = winner.td.text
            winnerPoints = winner.find("td", {"class": "right"}).text
            loser = post.find("table", {"class": "teams"}).tbody.find("tr", {"class": "loser"})
            loserTeam = loser.td.text
            loserPoints = loser.find("td", {"class": "right"}).text

            print(("(H) " if homeTeam == winnerTeam else "(A) ") + winnerTeam + " scored " + winnerPoints + " points. [WINNER]")
            print(("(H) " if homeTeam == loserTeam else "(A) ") + loserTeam + " scored " + loserPoints + " points.")
            print("\t-The leading scorer was " + leadScorer + " with " + mostPoints + " points.")
            print('------------------------------------------------------------')
            print()
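# nfl_scores() and nba_scores() both read their options straight from sys.argv,
# which implies a small command-line front end that picks the sport first. The
# dispatch below is a hypothetical sketch of that front end together with the
# imports the two functions rely on; the "football"/"basketball" keywords are
# assumptions, not taken from the original scripts.
import sys
from datetime import date, timedelta
from urllib.error import HTTPError
from urllib.request import urlopen as urlreq
from bs4 import BeautifulSoup as soup

if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1].lower() in ("football", "nfl"):
        nfl_scores()   # e.g. "<script> football 2019 12"
    elif len(sys.argv) > 1 and sys.argv[1].lower() in ("basketball", "nba"):
        nba_scores()   # e.g. "<script> basketball 12 25 2019"
    else:
        print("Usage: <sport> [<year> <week> | <month> <day> <year> | today | yesterday]")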