Example #1
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as Bsoup


def parse_page(url):
    # Download the page, close the connection, and return the parsed soup
    x = Ureq(url)
    page = x.read()
    x.close()
    return Bsoup(page, 'html.parser')
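A minimal usage sketch (the URL is only a placeholder):

doc = parse_page('https://example.com')
print(doc.title)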
Example #2
import json
from urllib.request import Request, urlopen as Ureq

# hdr is assumed to hold the request headers, e.g. a User-Agent string
hdr = {'User-Agent': 'Mozilla/5.0'}


def get_json(url):
    # Fetch the URL and decode the response body as JSON;
    # return None if the body is not valid JSON
    req = Request(url, headers=hdr)
    page = Ureq(req)
    try:
        js = json.loads(page.read().decode())
    except (ValueError, UnicodeDecodeError):
        js = None
    return js
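A quick usage sketch; httpbin.org/json is a public test endpoint that returns a small JSON document:

data = get_json('https://httpbin.org/json')
if data is not None:
    print(sorted(data.keys()))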
Example #3
from datetime import datetime
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
import pandas as pd


def hockey_bet():
    # Pull in url for schedule
    # TODO: Check date, and if it is not during the season, exit function
    url = 'https://www.hockey-reference.com/leagues/NHL_2020_games.html'
    # Run through BeautifulSoup steps
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='left')
    game = [team.get_text() for team in game]
    drop_list = ['Date','Visitor','Home','Notes','']
    # Clean data
    game = [game for game in game if game not in drop_list]
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Date','Visitor','Home']
    # Clean team names into readable format: keep the last word of each
    # team name (column 2 of the split if present, otherwise column 1)
    visitor = df['Visitor'].str.split(" ", expand=True)
    home = df['Home'].str.split(" ", expand=True)
    for row_count in range(len(df)):
        if visitor[2][row_count] is None:
            df.loc[row_count, 'Visitor'] = visitor[1][row_count]
        else:
            df.loc[row_count, 'Visitor'] = visitor[2][row_count]
        if home[2][row_count] is None:
            df.loc[row_count, 'Home'] = home[1][row_count]
        else:
            df.loc[row_count, 'Home'] = home[2][row_count]
    # Only select today's games
    todays_date = datetime.now().strftime('%Y-%m-%d')
    todays_games = df[df['Date'] == todays_date]
    todays_games = todays_games.reset_index()
    todays_games = todays_games[['Visitor','Home']]
    return todays_games
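The binning loops here and in the later examples just split a flat list into fixed-size rows (some later loops also skip one extra element between rows); a generic sketch of the pattern:

def chunk(flat_list, bin_len):
    # Split a flat list into consecutive rows of bin_len items;
    # a trailing partial row is dropped, matching the loops above
    return [flat_list[i:i + bin_len]
            for i in range(0, len(flat_list) - bin_len + 1, bin_len)]

# e.g. chunk(['a', 'b', 'c', 'd', 'e', 'f'], 3) -> [['a','b','c'], ['d','e','f']]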
Example #4
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
my_url = "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38"
#opening url and grabbing page
uClient = Ureq(my_url)
page_html = uClient.read()
uClient.close()

#html parser
page_soup = soup(page_html, "html.parser")
#print(page_soup.h1) # prints H1
#print(page_soup.p)# prints paragraphs

#print(page_soup.body.div)

#grab each product

containers = page_soup.findAll("div", {"class": "item-container"})
print(len(containers))

# open a CSV file for the scraped data
file_name = "product.csv"
f = open(file_name, "w")
headers = "Brand,product name,Shipping\n"

f.write(headers)

#below 3 lines of code is for container 1 that is 0th
#container = containers[0]
#print(container.a)
#print(container.div.div.a.img["title"]) # will return the title
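As an aside, Python's csv module handles separators and quoting for you; a minimal sketch of the same header write, assuming the scrape loop fills in the rows:

import csv

with open("product.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Brand", "product name", "Shipping"])
    # writer.writerow([brand, product_name, shipping])  # once per container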
Example #5
from datetime import datetime
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
import pandas as pd

# teams_dict, mapping scraped team names to readable ones, is assumed to be
# defined elsewhere


def basketball_bet():
    # Get the current month and day in order to get the games playing today
    current_month_text = datetime.now().strftime('%B').lower()
    current_day = datetime.now().strftime('%d')
    # Pull the url based on the current month; string concatenation cannot
    # fail, so guard the request itself instead
    url = ('https://www.basketball-reference.com/leagues/NBA_2020_games-'
           + current_month_text + '.html')
    try:
        uClient = Ureq(url)
        raw_content = uClient.read()
    except Exception:
        print('There are currently no basketball games being played today')
        return
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    schedule_text = html.findAll(class_="left")
    # Get the text from the html
    schedule = [game.get_text() for game in schedule_text]
    # Fill dataframe with game date, visiting team name, and home team name
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(schedule) + 1):
        week = schedule[start:end]
        start = end + 1
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    df_1.columns = ['Date', 'Visitor', 'Home']
    # Clean all of the columns: keep only the day number from the date text
    new = df_1['Date'].str.split(" ", n=3, expand=True)
    for row_count in range(len(df_1)):
        df_1.loc[row_count, 'Date'] = new[2][row_count][:-1]
    game_time = html.findAll(class_='right')
    game_time = [team.get_text() for team in game_time]
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_time) + 1):
        week = game_time[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Game_Time (EST)', 'Stat1', 'Stat2', 'Stat3']
    df = df['Game_Time (EST)']
    # Concat the dataframes to get desired data
    todays_games = pd.concat([df_1, df], axis=1, join='inner')
    todays_games = todays_games[todays_games['Date'] == current_day]
    # If there are no games being played, exit function
    if len(todays_games) == 0:
        print('There are currently no basketball games being played today.')
        return
    # Clean team names into more readable forms
    todays_games = todays_games.reset_index()
    todays_games = todays_games[['Visitor', 'Home', 'Game_Time (EST)']]
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(
        lambda x: teams_dict[x])
    # Return games being played today
    return todays_games
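Several of these examples rely on a teams_dict that is defined elsewhere; a hypothetical sketch of its shape, with made-up entries:

# Illustration only: the real mapping is not shown in these examples
teams_dict = {
    'Boston Celtics': 'Celtics',
    'Los Angeles Lakers': 'Lakers',
    # ... one entry per team name that appears in the scraped schedule
}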
Example #6
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
from newsapi import NewsApiClient
# pre-processor
my_url = 'http://topforeignstocks.com/stock-lists/the-complete-list-of-biotech-stocks-trading-on-nasdaq/'  # List of biotech companies
uClient = Ureq(my_url)  #downloads webpage
page_html = uClient.read()
page_soup = soup(page_html, "html.parser")
# print(page_soup.tbody.td)
bio_tech_companies = page_soup.findAll("td", {"class": "column-2"})
# Take the first company in the table as the news query
query = bio_tech_companies[0].text.strip()
print(query)
newsapi = NewsApiClient(api_key='42eab217e53348febe920e907f524b0f')
top_headlines = newsapi.get_top_headlines(q='biotech', language='en')
print(top_headlines)
uClient.close()
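A possible extension (a sketch, not in the original): query headlines for each scraped company rather than one hard-coded keyword:

# Loop over every company cell scraped above
for cell in bio_tech_companies:
    company = cell.text.strip()
    headlines = newsapi.get_top_headlines(q=company, language='en')
    print(company, headlines.get('totalResults'))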
Example #7
import sys
import urllib.error
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

base_url_main_page = "https://pitchfork.com/reviews/albums/"
page_numbers = 1
headers = "Album | Artist | Score | Author | Genre | Review Date \n"

#open csv file
with open('albums_complete_second_half.csv', 'wb') as csvfile:
	csvfile.write((headers).encode('utf8'))

	items = []

#Iterate through every page on https://pitchfork.com/reviews/albums/
	while True:
		url = (base_url_main_page+"?page="+str(page_numbers))

		#iterate through until no page is found. Ignore other HTTP response errors
		try:
			response = Ureq(url)
		except urllib.error.HTTPError as e:
			error_message = e.read()
			if e.getcode() == 404:
				sys.exit("No page found")
			else:
				print(error_message)
		else:
			page_html = response.read()
			page_soup = soup(page_html, "html.parser")

			url_names = page_soup.findAll("div",class_= "review")

			count = 0
			#enter urls of album reviews
			for item in url_names:
Example #8
from datetime import datetime
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
import pandas as pd

# teams_dict, mapping scraped team names to readable ones, is assumed to be
# defined elsewhere


def basketball_win(date):
    current_month = date[0:2]
    current_day = date[3:5]
    string = current_month
    current_month_text = datetime.strptime(string, "%m")
    current_month_text = datetime.strftime(current_month_text, "%B").lower()
    # Pull the url based on the given month; string concatenation cannot
    # fail, so guard the request itself instead
    url = ('https://www.basketball-reference.com/leagues/NBA_2020_games-'
           + current_month_text + '.html')
    try:
        uClient = Ureq(url)
        raw_content = uClient.read()
    except Exception:
        print('There are currently no basketball games being played today')
        return
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    schedule_text = html.findAll(class_="left")
    # Get the text from the html
    schedule = [game.get_text() for game in schedule_text]
    # Fill dataframe with game date, visiting team name, and home team name
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(schedule) + 1):
        week = schedule[start:end]
        start = end + 1
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    df_1.columns = ['Date', 'Visitor', 'Home']
    # Clean all of the columns: keep only the day number from the date text
    new = df_1['Date'].str.split(" ", n=3, expand=True)
    for row_count in range(len(df_1)):
        df_1.loc[row_count, 'Date'] = new[2][row_count][:-1]

    game_time = html.findAll(class_='right')
    game_time = [team.get_text() for team in game_time]
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_time) + 1):
        week = game_time[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Game_Time', 'Visitor_Points', 'Home_Points', 'Stat3']
    df.drop(columns=['Stat3'], inplace=True)
    total_df = pd.concat([df_1, df], axis=1, join='inner')
    # Determine the winner of each game; the scraped point totals are
    # strings, so convert to int before comparing (an empty string means
    # the game has not finished yet)
    win_list = []
    for row_count in range(len(total_df)):
        vp = total_df['Visitor_Points'][row_count]
        hp = total_df['Home_Points'][row_count]
        if vp == '' or hp == '':
            win_list.append('Incomplete')
        elif int(vp) > int(hp):
            win_list.append(total_df['Visitor'][row_count])
        elif int(hp) > int(vp):
            win_list.append(total_df['Home'][row_count])
        else:
            win_list.append('Tie')
    total_df['Winner'] = win_list
    todays_games = total_df[total_df['Date'] == current_day].copy()
    if len(todays_games) == 0:
        print('There are currently no basketball games being played today.')
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(
        lambda x: teams_dict[x])
    # 'Tie' and 'Incomplete' are not team names, so fall back to the value
    todays_games['Winner'] = todays_games['Winner'].apply(
        lambda x: teams_dict.get(x, x))
    return todays_games
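A usage sketch, assuming the date argument is an 'MM-DD' style string so that date[0:2] is the month and date[3:5] is the day:

# Hypothetical call; the date format is inferred from the slicing above
games = basketball_win('01-15')
print(games[['Visitor', 'Home', 'Winner']])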
Example #9
import bs4
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

#target web page
my_url = "https://www.shelflife.co.za/Online-store/sneakers"

#opening connection and grabbing page

uClient = Ureq(my_url)
page_html = uClient.read()
uClient.close()

#html parsing
page_soup = soup(page_html, "html.parser")

#grabs each product
containers = page_soup.find_all("div", {"class": "col-xs-6 col-sm-3"})
title = page_soup.find_all("div", {"class": "title"})
price = page_soup.find_all("div", {"class": "price"})
#finds sale products
sale_products = page_soup.find_all('div', {"class": "special_label sale"})

#open a new csv file to write data scraped from  website
filename = "shelf_life_sneaks_sale.csv"
f = open(filename, "w")
headers = "product_name,price\n"
f.write(headers)

#writing the data to the csv file
for i in range(len(containers)):
Example #10
    # Nested in a Tkinter app: Sentry, the listboxes, re, Ureq and Soup are
    # defined in the enclosing scope
    def scrape():
        ####################################################################################
        concat = Sentry.get()
        #my_url = "file:///C:/Users/Adam-22-26/Desktop/graphics%20card%20-%20Newegg.com.html"
        my_url = 'https://www.newegg.com/global/ph-en/p/pl?d={}'.format(concat)
        my_url = my_url.replace(' ', '+')
        ####################################################################################
        uClient = Ureq(my_url)

        page_html = uClient.read()
        uClient.close()
        #html_parsing
        page_soup = Soup(page_html, "html.parser")
        # grab each product container
        containers = page_soup.findAll("div", {"class": "item-container"})

        #manufacturer = page_soup.findAll("label",{"class": "form-checkbox"})
        #print(manufacturer )
        #print(len(containers))
        #print(containers[5:])
        #container = containers[5]
        #---------------------------------------- save the csv files
        fileName = "{}.csv".format(concat)

        f = open(fileName, "w")
        headers = "BRAND     , PRICES    ,  SAVES    , TITLES   , LINK    \n"  #
        f.write(headers)

        for container in containers[4:]:
            #---------------------------------------------------------
            brand_container = container.findAll("a", {"class": "item-brand"})
            brand = brand_container[0].img["title"]  #brand name

            #-------------------------------------------------------------------
            may_know = container.findAll("a", {"class": "item-title"})
            #print(may_know)

            ####################################################################
            title = container.a.img["title"]  #Name of selling
            #print(container)
            #######################################################3
            hyper = brand_container[0]["href"]
            #hyper = container.findAll("div",{"class": "item-info"})
            #hyper = hypers.a
            #print(hyper)
            #--------------------------------------------------------------
            price_container = container.findAll("li",
                                                {"class": "price-current"})
            price_container2 = price_container[0].strong
            price = re.findall(r'.\d.\d\d\d', str(price_container2))
            prices = ''.join(price)
            #------------------------------------------------------------------------
            save_container = container.findAll("span",
                                               {"class": "price-save-percent"})
            save = re.findall(r'\d\d.', str(save_container))
            saves = ''.join(save)

            # Fall back to placeholder text when a field was not found
            if saves == '':
                saves = "None"
            if prices == "":
                prices = "Not Available"

            brandlistbox.insert(END, " :   " + brand)
            pricelistbox.insert(END, "₱ " + prices)
            savelistbox.insert(END, saves)
            Listbox4.insert(END, " :   " + title)
            hyperlink.insert(END, '  ' + hyper)
            #-------------------------------------------------------------------------

            # Strip characters that would break the CSV layout; report a
            # missing discount as 0%
            f.write(
                brand.replace(',', '') + ", " +
                prices.replace(',', '.').replace('>', ' ') + ',' +
                saves.replace('None', '0%') + ', ' +
                title.replace(',', '') + ', ' + hyper + "\n")

        f.close()
        new_win = Button(window,
                         width=10,
                         text="New_Win",
                         command=mainwindow,
                         height=1,
                         font="Jokerman",
                         relief=RAISED,
                         activebackground="LightBlue1",
                         background='sky blue')
        new_win.place(x=105, y=90)
        messagebox.showinfo("Happens", "DONE! \n press ok to proceed")
Example #11
def notifyprice(price, metal, updown):
    # Send an alert that the given metal's price moved past a threshold
    sub = 'Attention! {} price is {} to {}'.format(metal, updown, price)
    attention_body = 'Attention: the {} price is {} to {}!'.format(metal, updown, price)
#    wechat_msg(attention_body)
    mail_msg(sub, attention_body)




if __name__ == '__main__':
    # silver_high, silver_low, silver_range and mail_msg are assumed to be
    # defined elsewhere in the script
#    itchat.auto_login()
    while True:
        queryurl = ''
        # Set datasource here
        req = Ureq(queryurl)
        # Placeholders: fill in your way of obtaining price data from req
        china_gold = 0.0
        china_silver = 0.0
        print('gold price is: {}'.format(china_gold))
        print('silver price is: {}'.format(china_silver))
        if china_silver>=silver_high or china_silver<=silver_low:
            if china_silver >= silver_high:
                notifyprice(china_silver,'silver','up')
                silver_high += silver_range
                print('silver notify range raised to {}'.format(silver_high))
            if china_silver <= silver_low:
                notifyprice(china_silver,'silver','down')
                silver_low -= silver_range
                print('silver notify range down to {}'.format(silver_low))
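mail_msg is not shown here; a minimal sketch of what it might look like, assuming placeholder addresses and a local SMTP server:

import smtplib
from email.message import EmailMessage

def mail_msg(sub, body):
    # Hypothetical sketch: sender, recipient and SMTP host are placeholders
    msg = EmailMessage()
    msg['Subject'] = sub
    msg['From'] = 'alerts@example.com'
    msg['To'] = 'me@example.com'
    msg.set_content(body)
    with smtplib.SMTP('localhost') as server:
        server.send_message(msg)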
Example #12
from datetime import datetime
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
import pandas as pd


def football_bet():
    # Ensure that the football season is currently going on; compare date
    # objects rather than mixed-format strings
    today = datetime.now().date()
    season_end_2019 = datetime(2020, 2, 2).date()
    season_start_2020 = datetime(2020, 9, 10).date()
    if season_end_2019 < today < season_start_2020:
        print(
            "The next football season hasn't begun yet. Please come back on September 10."
        )
        return
    elif today <= season_end_2019:
        url = 'https://www.pro-football-reference.com/years/2019/games.htm'
    else:
        url = 'https://www.pro-football-reference.com/years/2020/games.htm'
    # Run through BeautifulSoup steps to pull wanted data
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    teams_win_loss = html.findAll(class_='left')
    game = html.findAll(class_='right')
    game = [team.get_text() for team in game]
    teams_win_loss = [team.get_text() for team in teams_win_loss]
    removal = ['Day']
    teams_win_loss = [item for item in teams_win_loss if item not in removal]
    # Set todays date that will be used to select todays games
    date = datetime.now().strftime('%B %d')
    # Clean stats
    bin_len = 8
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end + 1
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    df_1.columns = [
        'Game_Week', 'Time (EST)', 'Stat1', 'Stat2', 'Stat3', 'Stat4', 'Stat5',
        'Stat6'
    ]

    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(teams_win_loss) + 1):
        week = teams_win_loss[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df_2 = pd.DataFrame(week_list)
    df_2.columns = ['Day_Of_Week', 'Date', 'Home', 'Visitor']
    # Concat data frames
    football = pd.concat(
        [df_1[['Game_Week', 'Time (EST)']], df_2[['Date', 'Home', 'Visitor']]],
        axis=1,
        join='inner')
    # Select only games being played today
    todays_games = football[football['Date'] == date]
    # Return dataframe
    return todays_games
Example #13
import bs4
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup

my_url = []

# Build the paginated category URLs
for x in range(10):
    my_url.append("https://de.aliexpress.com/category/2118/printers/" +
                  str(x) + ".html?site=deu&tag=")

for y in range(10):
    uClient = Ureq(my_url[y])
    page_html = uClient.read()
    uClient.close()

    page_soup = soup(page_html, "html.parser")
    #grabs each product
    #containers = page_soup.findAll("li",{"class":"list-item"})
    #filename = "printers.csv"
    #f = open(filename, "w")

    #headers ="brand,product_name, shipping, price_real\n"

    # f.write(headers)
    ullist = page_soup.findAll("div", {"class": "col-main"})
    # This notice paragraph only appears when the page has no results
    error_p = page_soup.findAll(
        "p", {"class": "ui-notice ui-notice-normal ui-notice-prompt"})

    if not error_p:
Example #14
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
import pandas as pd


def hockey_win(date):
    url = 'https://www.hockey-reference.com/leagues/NHL_2020_games.html'
    # Run through BeautifulSoup steps
    uClient = Ureq(url)
    raw_content = uClient.read()
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='left')
    results = html.findAll(class_='right')
    game = [team.get_text() for team in game]
    results = [team.get_text() for team in results]
    results_drop = ['LOG']
    results = [results for results in results if results not in results_drop]
    drop_list = ['Date','Visitor','Home','Notes','']
    # Clean data
    game = [game for game in game if game not in drop_list]
    bin_len = 3
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game) + 1):
        week = game[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Date','Visitor','Home']
    # Bin the game results into rows of four values
    bin_len = 4
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(results) + 1):
        week = results[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df_1 = pd.DataFrame(week_list)
    # Clean team names into readable format: keep the last word of each
    # team name (column 2 of the split if present, otherwise column 1)
    visitor = df['Visitor'].str.split(" ", expand=True)
    home = df['Home'].str.split(" ", expand=True)
    for row_count in range(len(df)):
        if visitor[2][row_count] is None:
            df.loc[row_count, 'Visitor'] = visitor[1][row_count]
        else:
            df.loc[row_count, 'Visitor'] = visitor[2][row_count]
        if home[2][row_count] is None:
            df.loc[row_count, 'Home'] = home[1][row_count]
        else:
            df.loc[row_count, 'Home'] = home[2][row_count]
    # Name the result columns and join the results onto the schedule
    df_1.columns = ['Visitor_Goals','Home_Goals','Attendance','Time']
    total_df = pd.concat([df,df_1],axis=1,join='inner')
    # Determine the winner of each game; the scraped goal totals are
    # strings, so convert to int before comparing (an empty string means
    # the game has not finished yet)
    win_list = []
    for win_count in range(len(total_df)):
        vg = total_df['Visitor_Goals'][win_count]
        hg = total_df['Home_Goals'][win_count]
        if vg == '' or hg == '':
            win_list.append('Incomplete')
        elif int(vg) > int(hg):
            win_list.append(total_df['Visitor'][win_count])
        elif int(hg) > int(vg):
            win_list.append(total_df['Home'][win_count])
        else:
            win_list.append('Tie')
    total_df['Winner'] = win_list
    todays_games = total_df[total_df['Date'] == date]
    todays_games = todays_games.reset_index()
    return todays_games
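A usage sketch; the date argument should match the 'YYYY-MM-DD' strings in the scraped schedule, as hockey_bet above suggests:

# Hypothetical call; the date format is inferred from hockey_bet
games = hockey_win('2020-01-15')
print(games[['Visitor', 'Home', 'Winner']])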
Example #15
from datetime import datetime
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
import pandas as pd

# teams_dict, mapping team abbreviations to readable names, is assumed to be
# defined elsewhere


def baseball_bet():
    # Set the current date in a readable form and the form used for the html
    todays_date = datetime.now().strftime('%m-%d-%Y')
    date_html = datetime.now().strftime('%Y%m%d')
    # Set Opening Day date
    opening_day = "03-26-2020"
    # Parse OD date
    OD = datetime.strptime(opening_day, "%m-%d-%Y")
    # Set current date
    present = datetime.now()
    # If it is before OD, return from function
    if present.date() < OD.date():
        print('Opening Day is not until March 26. Please come back then.')
        return
    # Set url for todays date if season has already started
    url = 'https://www.espn.com/mlb/schedule/_/date/' + date_html
    # Make sure that there are actually games being played
    # If there are not, the url will not work
    try:
        uClient = Ureq(url)
        raw_content = uClient.read()
    except Exception:
        print('There are no games being played on this day.')
        return
    # Run through BeautifulSoup steps to pull out desired data
    page_soup = soup(raw_content, "html.parser")
    html = list(page_soup.children)[3]
    game = html.findAll(class_='external')
    game_date_list = []
    # Fix dates given into readable datetime format
    for x in range(1, len(game)):
        game_date = game[x]['href'].split('/')[5].split('-')[-3:-1]
        game_date.append('2020')
        sent_str = ""
        for i in game_date:
            sent_str += str(i) + "-"
        sent_str = sent_str[:-1]
        date = datetime.strptime(sent_str, '%m-%d-%Y')
        date = date.strftime('%m-%d-%Y')
        game_date_list.append(date)
    # Get the names of the teams that are playing on that day
    game = html.findAll(class_='team-name')
    game = [team.get_text() for team in game]
    game_list = []
    for item in game:
        # The abbrvs are only the last three characters in the str
        item = item[-3:]
        game_list.append(item)
    # Split home and away teams from the list of cleaned teams
    bin_len = 2
    start = 0
    end = start + bin_len
    week_list = []
    while end < (len(game_list) + 1):
        week = game_list[start:end]
        start = end
        end = start + bin_len
        week_list.append(week)
    df = pd.DataFrame(week_list)
    df.columns = ['Visitor', 'Home']
    df['Date'] = game_date_list
    todays_games = df[df['Date'] == todays_date].copy()
    # Apply the lambda function that will clean the team names into more colloquial names
    todays_games['Home'] = todays_games['Home'].apply(lambda x: teams_dict[x])
    todays_games['Visitor'] = todays_games['Visitor'].apply(
        lambda x: teams_dict[x])
    # return data frame of games that are being played today
    return todays_games