def parse_page(url):
    # Open the URL, read the raw HTML, and return it as a parsed tree
    x = Ureq(url)
    page = x.read()
    x.close()
    page_parsed = Bsoup(page, 'html.parser')
    return page_parsed
def get_json(url):
    # Fetch a URL with the shared header dict and decode the body as JSON
    req = Request(url, headers=hdr)
    page = Ureq(req)
    try:
        js = json.loads(page.read().decode())
    except (ValueError, UnicodeDecodeError):  # malformed body: signal with None
        js = None
    return js
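# A minimal usage sketch for the two helpers above. It assumes the imports the
# helpers already rely on (urlopen as Ureq, Request, BeautifulSoup as Bsoup,
# json) plus an `hdr` header dict, which the snippet references but never
# defines; the URLs are placeholders, not endpoints from the original code.
hdr = {'User-Agent': 'Mozilla/5.0'}               # assumed header dict
tree = parse_page('https://example.com')          # BeautifulSoup tree
print(tree.title)
payload = get_json('https://httpbin.org/json')    # parsed dict, or None on bad JSON
print(payload)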
def hockey_bet():
    # Pull in url for schedule
    # TODO: Check date, and if it is not during the season, exit function
    url = 'https://www.hockey-reference.com/leagues/NHL_2020_games.html'
    # Run through BeautifulSoup steps
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='left')
    game = [team.get_text() for team in game]
    # Clean data: drop header cells and blanks
    drop_list = ['Date', 'Visitor', 'Home', 'Notes', '']
    game = [g for g in game if g not in drop_list]
    # Bin the flat cell list into (Date, Visitor, Home) rows
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week_list.append(game[start:end])
        start = end
        end = start + bin_len
    df = pd.DataFrame(week_list)
    df.columns = ['Date', 'Visitor', 'Home']
    # Clean team names into readable format (keep the last word of each name)
    visitor = df['Visitor'].str.split(" ", expand=True)
    home = df['Home'].str.split(" ", expand=True)
    row_count = 0
    while row_count < len(df):
        if visitor[2][row_count] is None:
            df.loc[row_count, 'Visitor'] = visitor[1][row_count]
        else:
            df.loc[row_count, 'Visitor'] = visitor[2][row_count]
        if home[2][row_count] is None:
            df.loc[row_count, 'Home'] = home[1][row_count]
        else:
            df.loc[row_count, 'Home'] = home[2][row_count]
        row_count += 1
    # Only select todays games
    todays_date = datetime.now().strftime('%Y-%m-%d')
    todays_games = df[df['Date'] == todays_date]
    todays_games = todays_games.reset_index()
    return todays_games[['Visitor', 'Home']]
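# The start/end/bin_len while-loop above reappears in almost every function in
# this collection. A small helper expresses the same "full bins only" slicing
# in one place; this is an editorial sketch, not part of the original code.
def chunk(seq, size):
    """Yield consecutive full slices of `seq` of length `size`; a trailing
    partial slice is dropped, matching the while-loop's behavior."""
    for i in range(0, len(seq) - size + 1, size):
        yield seq[i:i + size]

# e.g. df = pd.DataFrame(list(chunk(game, 3)), columns=['Date', 'Visitor', 'Home'])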
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

my_url = "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38"

# opening url and grabbing page
uClint = Ureq(my_url)
page_html = uClint.read()
uClint.close()

# html parser
page_soup = soup(page_html, "html.parser")
#print(page_soup.h1)        # prints h1
#print(page_soup.p)         # prints paragraphs
#print(page_soup.body.div)

# grab each product
containers = page_soup.findAll("div", {"class": "item-container"})
print(len(containers))

# open a csv file and write the header row
file_name = "product.csv"
f = open(file_name, "w")
headers = "Brand, product name, shipping\n"
f.write(headers)

# below 3 lines of code are for container 1, i.e. index 0
#container = containers[0]
#print(container.a)
#print(container.div.div.a.img["title"])  # will return the title
def basketball_bet():
    # Get the current month and day in order to get the games playing today
    current_month_text = datetime.now().strftime('%B').lower()
    current_day = datetime.now().strftime('%d')
    # Pull the url based on the current month; if the month page does not
    # exist the request itself fails, so guard the request, not the concat
    url = ('https://www.basketball-reference.com/leagues/NBA_2020_games-'
           + current_month_text + '.html')
    try:
        uClient = Ureq(url)
        raw_content = uClient.read()
    except Exception:
        print('There are currently no basketball games being played today')
        return
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    schedule_text = html.findAll(class_="left")
    # Get the text from the html
    schedule = [game.get_text() for game in schedule_text]
    # Fill dataframe with game date, visiting team name, and home team name
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(schedule) + 1):
        week_list.append(schedule[start:end])
        start = end + 1  # skip the extra 'left' cell between rows
        end = start + bin_len
    df_1 = pd.DataFrame(week_list)
    df_1.columns = ['Date', 'Visitor', 'Home']
    # Clean all of the columns: keep only the day number from the date cell
    new = df_1['Date'].str.split(" ", n=3, expand=True)
    row_count = 0
    while row_count < len(df_1):
        df_1.loc[row_count, 'Date'] = new[2][row_count][:-1]
        row_count += 1
    game_time = html.findAll(class_='right')
    game_time = [team.get_text() for team in game_time]
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_time) + 1):
        week_list.append(game_time[start:end])
        start = end
        end = start + bin_len
    df = pd.DataFrame(week_list)
    df.columns = ['Game_Time (EST)', 'Stat1', 'Stat2', 'Stat3']
    df = df['Game_Time (EST)']
    # Concat the dataframes to get desired data
    todays_games = pd.concat([df_1, df], axis=1, join='inner')
    todays_games = todays_games[todays_games['Date'] == current_day]
    # If there are no games being played, exit function
    if len(todays_games) == 0:
        print('There are currently no basketball games being played today.')
        return
    # Clean team names into more readable forms
    todays_games = todays_games.reset_index()
    todays_games = todays_games[['Visitor', 'Home', 'Game_Time (EST)']]
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(lambda x: teams_dict[x])
    # Return games being played today
    return todays_games
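# basketball_bet (and several functions below) look names up in a module-level
# `teams_dict` that never appears in these snippets. A two-entry sketch of its
# presumed shape, mapping scraped names to the short labels the code wants:
teams_dict = {
    'Boston Celtics': 'Celtics',      # assumed entries; the real dict
    'Los Angeles Lakers': 'Lakers',   # presumably covers every team
}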
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
from newsapi import NewsApiClient

# pre-processor
my_url = 'http://topforeignstocks.com/stock-lists/the-complete-list-of-biotech-stocks-trading-on-nasdaq/'  # List of biotech companies
uClient = Ureq(my_url)  # downloads webpage
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
# print(page_soup.tbody.td)
bio_tech_companies = page_soup.findAll("td", {"class": "column-2"})

newsapi = NewsApiClient(api_key='42eab217e53348febe920e907f524b0f')
# only the first company for now; widen the range to query more
for i in range(1):
    query = str(bio_tech_companies[i].text.strip())
    print(query)
    top_headlines = newsapi.get_top_headlines(q='biotech', language='en')
    print(top_headlines)
import sys
import urllib.error
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

# assumed from the comment below; the snippet never defines it
base_url_main_page = "https://pitchfork.com/reviews/albums/"

page_numbers = 1
headers = "Album | Artist | Score | Author | Genre | Review Date \n"

# open csv file
with open('albums_complete_second_half.csv', 'wb') as csvfile:
    csvfile.write(headers.encode('utf8'))

items = []
# Iterate through every page on https://pitchfork.com/reviews/albums/
while True:
    url = base_url_main_page + "?page=" + str(page_numbers)
    # iterate through until no page is found. Ignore other HTTP response errors
    try:
        response = Ureq(url)
    except urllib.error.HTTPError as e:
        error_message = e.read()
        if e.getcode() == 404:
            sys.exit("No page found")
        else:
            print(error_message)
    else:
        page_html = response.read()
        page_soup = soup(page_html, "html.parser")
        url_names = page_soup.findAll("div", class_="review")
        count = 0
        # enter urls of album reviews (the snippet is truncated here;
        # collecting each review's href is one plausible loop body)
        for item in url_names:
            items.append(item.a["href"])
    page_numbers += 1  # advance to the next page of reviews
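# Pitchfork, like many sites, may reject urllib's default User-Agent. If the
# crawl above starts returning errors, a hedged variant of the request step
# adds an explicit header (the header value here is an assumption) and a
# small delay per page to keep the load polite:
import time
from urllib.request import Request, urlopen

req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
response = urlopen(req)
time.sleep(1.0)  # arbitrary pause between page fetches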
def basketball_win(date):
    current_month = date[0:2]
    current_day = date[3:5]
    current_month_text = datetime.strptime(current_month, "%m")
    current_month_text = datetime.strftime(current_month_text, "%B").lower()
    # Pull the url based on the current month; if the month page does not
    # exist the request itself fails, so guard the request, not the concat
    url = ('https://www.basketball-reference.com/leagues/NBA_2020_games-'
           + current_month_text + '.html')
    try:
        uClient = Ureq(url)
        raw_content = uClient.read()
    except Exception:
        print('There are currently no basketball games being played today')
        return
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    schedule_text = html.findAll(class_="left")
    # Get the text from the html
    schedule = [game.get_text() for game in schedule_text]
    # Fill dataframe with game date, visiting team name, and home team name
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(schedule) + 1):
        week_list.append(schedule[start:end])
        start = end + 1  # skip the extra 'left' cell between rows
        end = start + bin_len
    df_1 = pd.DataFrame(week_list)
    df_1.columns = ['Date', 'Visitor', 'Home']
    # Clean all of the columns: keep only the day number from the date cell
    new = df_1['Date'].str.split(" ", n=3, expand=True)
    row_count = 0
    while row_count < len(df_1):
        df_1.loc[row_count, 'Date'] = new[2][row_count][:-1]
        row_count += 1
    game_time = html.findAll(class_='right')
    game_time = [team.get_text() for team in game_time]
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_time) + 1):
        week_list.append(game_time[start:end])
        start = end
        end = start + bin_len
    df = pd.DataFrame(week_list)
    df.columns = ['Game_Time', 'Visitor_Points', 'Home_Points', 'Stat3']
    df.drop(columns=['Stat3'], inplace=True)
    total_df = pd.concat([df_1, df], axis=1, join='inner')
    # Decide each game's winner; compare point totals as numbers, not strings
    win_list = []
    for row_count in range(len(total_df)):
        v = total_df['Visitor_Points'][row_count]
        h = total_df['Home_Points'][row_count]
        if v == '' or h == '':
            win_list.append('Incomplete')
        elif int(v) > int(h):
            win_list.append(total_df['Visitor'][row_count])
        elif int(h) > int(v):
            win_list.append(total_df['Home'][row_count])
        else:
            win_list.append('Tie')
    total_df['Winner'] = win_list
    todays_games = total_df[total_df['Date'] == current_day]
    if len(todays_games) == 0:
        print('There are currently no basketball games being played today.')
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(lambda x: teams_dict[x])
    # Pass 'Tie'/'Incomplete' through unmapped instead of raising KeyError
    todays_games['Winner'] = todays_games['Winner'].apply(lambda x: teams_dict.get(x, x))
    return todays_games
import bs4
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

# target web page
my_url = "https://www.shelflife.co.za/Online-store/sneakers"

# opening connection and grabbing page
uClient = Ureq(my_url)
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each product
containers = page_soup.find_all("div", {"class": "col-xs-6 col-sm-3"})
title = page_soup.find_all("div", {"class": "title"})
price = page_soup.findAll("div", {"class": "price"})

# finds sale products
sale_products = page_soup.findAll('div', {"class": "special_label sale"})

# open a new csv file to write data scraped from website
filename = "shelf_life_sneaks_sale.csv"
f = open(filename, "w")
headers = "product_name,price\n"
f.write(headers)

# writing the data to the csv file (the snippet is truncated here; writing
# the matching title/price pair per container is one plausible loop body)
for i in range(0, len(containers)):
    f.write(title[i].get_text(strip=True) + "," + price[i].get_text(strip=True) + "\n")
f.close()
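# Hand-rolled CSV writing breaks as soon as a product name contains a comma;
# the stdlib csv module quotes fields automatically. A sketch of the same
# output file built from the `title` and `price` lists above:
import csv

with open("shelf_life_sneaks_sale.csv", "w", newline="") as fh:
    writer = csv.writer(fh)
    writer.writerow(["product_name", "price"])
    for t, p in zip(title, price):
        writer.writerow([t.get_text(strip=True), p.get_text(strip=True)])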
def scrape():
    # Build the search URL from the Entry widget text
    concat = Sentry.get()
    #my_url = "file:///C:/Users/Adam-22-26/Desktop/graphics%20card%20-%20Newegg.com.html"
    my_url = 'https://www.newegg.com/global/ph-en/p/pl?d={}'.format(concat)
    my_url = my_url.replace(' ', '+')

    uClient = Ureq(my_url)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = Soup(page_html, "html.parser")

    # grab each product container
    containers = page_soup.findAll("div", {"class": "item-container"})
    #manufacturer = page_soup.findAll("label", {"class": "form-checkbox"})

    # save results to a csv file named after the search term
    fileName = "{}.csv".format(concat)
    f = open(fileName, "w")
    headers = "BRAND , PRICES , SAVES , TITLES , LINK \n"
    f.write(headers)

    for container in containers[4:]:
        brand_container = container.findAll("a", {"class": "item-brand"})
        brand = brand_container[0].img["title"]     # brand name
        title = container.a.img["title"]            # name of the listing
        hyper = brand_container[0]["href"]          # link to the listing

        price_container = container.findAll("li", {"class": "price-current"})
        price_container2 = price_container[0].strong
        price = re.findall(r'.\d.\d\d\d', str(price_container2))
        prices = ''.join(price)

        save_container = container.findAll("span", {"class": "price-save-percent"})
        save = re.findall(r'\d\d.', str(save_container))
        saves = ''.join(save)
        if saves == '':
            saves = "None"
        if prices == "":
            prices = "Not Available"

        brandlistbox.insert(END, " : " + brand)
        pricelistbox.insert(END, "₱ " + prices)
        savelistbox.insert(END, saves)
        Listbox4.insert(END, " : " + title)
        hyperlink.insert(END, ' ' + hyper)

        f.write(brand.replace(',', '') + ", "
                + prices.replace(',', '.').replace('>', ' ') + ','
                + saves.replace('None', '0%') + ', '
                + title.replace(',', '') + ', '
                + hyper + "\n")
    f.close()

    new_win = Button(window, width=10, text="New_Win", command=mainwindow,
                     height=1, font="Jokerman", relief=RAISED,
                     activebackground="LightBlue1", background='sky blue')
    new_win.place(x=105, y=90)
    messagebox.showinfo("Happens", "DONE! \n press ok to proceed")
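# The price regex above (r'.\d.\d\d\d') only matches values shaped exactly
# like '$1,299'. A slightly more tolerant sketch of that extraction step,
# still assuming Newegg's "<strong>1,299</strong>" price markup:
import re

m = re.search(r'[\d,]+(?:\.\d{2})?', str(price_container2))
prices = m.group(0) if m else 'Not Available'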
def notifyprice(price, price_type, updown):
    sub = 'attention! {} price {} to {}'.format(price_type, updown, price)
    attention_body = 'please attention {} price!!! current price is {} to {}'.format(price_type, updown, price)
    # wechat_msg(attention_body)
    mail_msg(sub, attention_body)


if __name__ == '__main__':
    # itchat.auto_login()
    while True:
        queryurl = ''  # Set datasource here
        req = Ureq(queryurl)
        # Parse the gold/silver prices out of `req` however your datasource
        # requires; left as placeholders in the original
        china_gold = china_silver = None  # your way of obtaining price data
        print('gold price is: {}'.format(china_gold))
        print('silver price is: {}'.format(china_silver))
        if china_silver >= silver_high or china_silver <= silver_low:
            if china_silver >= silver_high:
                notifyprice(china_silver, 'silver', 'up')
                silver_high += silver_range
                print('silver notify range raised to {}'.format(silver_high))
            if china_silver <= silver_low:
                notifyprice(china_silver, 'silver', 'down')
                silver_low -= silver_range
                print('silver notify range down to {}'.format(silver_low))
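# The alert loop above reads silver_high / silver_low / silver_range without
# defining them anywhere in the snippet. A placeholder configuration sketch
# (every value here is invented; tune to your market and units):
silver_high = 5000.0   # notify when the price rises to this level
silver_low = 4000.0    # notify when the price falls to this level
silver_range = 50.0    # widen the alert band by this much after each notice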
def football_bet():
    # Ensure that the football season is currently going on; compare real
    # datetime objects rather than mixed-format date strings
    today = datetime.now()
    if datetime(2020, 2, 2) < today < datetime(2020, 9, 10):
        print("The next football season hasn't begun yet. Please come back on September 10.")
        return
    elif today < datetime(2020, 2, 2):
        url = 'https://www.pro-football-reference.com/years/2019/games.htm'
    else:
        url = 'https://www.pro-football-reference.com/years/2020/games.htm'
    # Run through BeautifulSoup steps to pull wanted data
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    teams_win_loss = html.findAll(class_='left')
    game = html.findAll(class_='right')
    game = [team.get_text() for team in game]
    teams_win_loss = [team.get_text() for team in teams_win_loss]
    removal = ['Day']
    teams_win_loss = [item for item in teams_win_loss if item not in removal]
    # Set todays date that will be used to select todays games
    date = datetime.now().strftime('%B %d')
    # Clean stats
    bin_len = 8
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week_list.append(game[start:end])
        start = end + 1  # skip the extra 'right' cell between rows
        end = start + bin_len
    df_1 = pd.DataFrame(week_list)
    df_1.columns = ['Game_Week', 'Time (EST)', 'Stat1', 'Stat2', 'Stat3',
                    'Stat4', 'Stat5', 'Stat6']
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(teams_win_loss) + 1):
        week_list.append(teams_win_loss[start:end])
        start = end
        end = start + bin_len
    df_2 = pd.DataFrame(week_list)
    df_2.columns = ['Day_Of_Week', 'Date', 'Home', 'Visitor']
    # Concat data frames
    football = pd.concat(
        [df_1[['Game_Week', 'Time (EST)']], df_2[['Date', 'Home', 'Visitor']]],
        axis=1, join='inner')
    # Select only games being played today
    todays_games = football[football['Date'] == date]
    # Return dataframe
    return todays_games
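# Why football_bet now compares datetime objects: comparing mixed-format date
# strings orders them character by character, not chronologically. A quick
# demonstration of the pitfall the rewrite avoids:
from datetime import datetime

assert datetime(2020, 3, 1) > datetime(2020, 2, 2)   # true date order
assert not ('2020-03-01' > 'September 10 2020')      # '2' < 'S', so string order lies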
import bs4
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

my_url = []
for x in range(10):
    # page index goes before ".html" (the original string was missing the dot)
    my_url.append("https://de.aliexpress.com/category/2118/printers/"
                  + str(x) + ".html?site=deu&tag=")

for y in range(10):
    uClient = Ureq(my_url[y])
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    # grabs each product
    #containers = page_soup.findAll("li", {"class": "list-item"})
    #filename = "printers.csv"
    #f = open(filename, "w")
    #headers = "brand,product_name, shipping, price_real\n"
    #f.write(headers)
    ullist = page_soup.findAll("div", {"class": "col-main"})
    error_p = page_soup.findAll(
        "p", {"class": "ui-notice ui-notice-normal ui-notice-prompt"})
    error = []
    error.append(error_p)
    if error == [[]]:
        # no "not found" notice on the page, so the listing loaded
        # (the snippet is truncated here)
        pass
def hockey_win(date):
    url = 'https://www.hockey-reference.com/leagues/NHL_2020_games.html'
    # Run through BeautifulSoup steps
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='left')
    results = html.findAll(class_='right')
    game = [team.get_text() for team in game]
    results = [team.get_text() for team in results]
    results = [r for r in results if r != 'LOG']
    # Clean data: drop header cells and blanks
    drop_list = ['Date', 'Visitor', 'Home', 'Notes', '']
    game = [g for g in game if g not in drop_list]
    # Bin the flat cell lists into rows
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week_list.append(game[start:end])
        start = end
        end = start + bin_len
    df = pd.DataFrame(week_list)
    df.columns = ['Date', 'Visitor', 'Home']
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(results) + 1):
        week_list.append(results[start:end])
        start = end
        end = start + bin_len
    df_1 = pd.DataFrame(week_list)
    # Clean team names into readable format (keep the last word of each name)
    visitor = df['Visitor'].str.split(" ", expand=True)
    home = df['Home'].str.split(" ", expand=True)
    row_count = 0
    while row_count < len(df):
        if visitor[2][row_count] is None:
            df.loc[row_count, 'Visitor'] = visitor[1][row_count]
        else:
            df.loc[row_count, 'Visitor'] = visitor[2][row_count]
        if home[2][row_count] is None:
            df.loc[row_count, 'Home'] = home[1][row_count]
        else:
            df.loc[row_count, 'Home'] = home[2][row_count]
        row_count += 1
    df_1.columns = ['Visitor_Goals', 'Home_Goals', 'Attendance', 'Time']
    total_df = pd.concat([df, df_1], axis=1, join='inner')
    # Decide each game's winner; compare goal counts as numbers, not strings
    win_list = []
    for win_count in range(len(total_df)):
        v = total_df['Visitor_Goals'][win_count]
        h = total_df['Home_Goals'][win_count]
        if v == '' or h == '':
            win_list.append('Incomplete')
        elif int(v) > int(h):
            win_list.append(total_df['Visitor'][win_count])
        elif int(h) > int(v):
            win_list.append(total_df['Home'][win_count])
        else:
            win_list.append('Tie')
    total_df['Winner'] = win_list
    # Only select todays games
    todays_games = total_df[total_df['Date'] == date]
    todays_games = todays_games.reset_index()
    return todays_games
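# The per-row Visitor/Home cleanup in hockey_bet and hockey_win (third word if
# present, else second) reduces to "keep the last word of the team name". A
# vectorized sketch that should give the same result for NHL names:
df['Visitor'] = df['Visitor'].str.split().str[-1]
df['Home'] = df['Home'].str.split().str[-1]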
def baseball_bet():
    # Set the current date in a readable form and the form used for the html
    todays_date = datetime.now().strftime('%m-%d-%Y')
    date_html = datetime.now().strftime('%Y%m%d')
    # Set Opening Day date
    opening_day = "03-26-2020"
    # Parse OD date
    OD = datetime.strptime(opening_day, "%m-%d-%Y")
    # Set current date
    present = datetime.now()
    # If it is before OD, return from function
    if present.date() < OD.date():
        print('Opening Day is not until March 26. Please come back then.')
        return
    # Set url for todays date if season has already started
    url = 'https://www.espn.com/mlb/schedule/_/date/' + date_html
    # Make sure that there are actually games being played;
    # if there are not, the url will not work
    try:
        uClient = Ureq(url)
        raw_content = uClient.read()
    except Exception:
        print('There are no games being played on this day.')
        return
    # Run through BeautifulSoup steps to pull out desired data
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='external')
    game_date_list = []
    # Fix dates given into readable datetime format
    for x in range(1, len(game)):
        game_date = game[x]['href'].split('/')[5].split('-')[-3:-1]
        game_date.append('2020')
        sent_str = "-".join(str(i) for i in game_date)
        date = datetime.strptime(sent_str, '%m-%d-%Y')
        game_date_list.append(date.strftime('%m-%d-%Y'))
    # Get the names of the teams that are playing on that day
    game = html.findAll(class_='team-name')
    game = [team.get_text() for team in game]
    # The abbreviations are only the last three characters in the str
    game_list = [item[-3:] for item in game]
    # Split home and away teams from the list of cleaned teams
    bin_len = 2
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_list) + 1):
        week_list.append(game_list[start:end])
        start = end
        end = start + bin_len
    df = pd.DataFrame(week_list)
    df.columns = ['Visitor', 'Home']
    df['Date'] = game_date_list
    todays_games = df[df['Date'] == todays_date].copy()
    # Apply the lambda function that will clean the team names into more colloquial names
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(lambda x: teams_dict[x])
    # return data frame of games that are being played today
    return todays_games
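# A sketch of how the four *_bet helpers might be driven together; the loop
# and the printing are editorial assumptions, not part of the original scripts:
for fetch in (hockey_bet, basketball_bet, football_bet, baseball_bet):
    games = fetch()                      # each returns a DataFrame or None
    if games is not None and len(games):
        print(fetch.__name__, 'games today:')
        print(games.to_string(index=False))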