Example #1
from urllib.request import urlopen as urequest
from bs4 import BeautifulSoup as soup


def scrape(player_type, stats, filename):
    my_url = 'https://www.mlb.com'

    client = urequest(my_url + '/players')
    page_html = client.read()
    client.close()

    page_soup = soup(page_html, "html.parser")
    player_list = page_soup.findAll("li", {"class": "p-related-links__item"})

    f = open('stat_scraper/generated_stats/' + filename, "w")
    f.write(",".join(stats) + '\n')

    for player in player_list:
        player_url = my_url + player.a['href']
        try:
            client = urequest(player_url)
            page_html = client.read()
            client.close()
        except Exception:
            # Skip players whose page fails to load.
            continue

        page_soup = soup(page_html, "html.parser")
        player_recent_html = page_soup.findAll(
            "div", {'class': 'player-splits--last player-splits--last-x'})

        if player_recent_html:
            position_html = page_soup.find("div",
                                           {'class': 'player-header--vitals'})
            position = position_html.ul.li.text

            if player_type == 'P' and position != player_type:
                continue
            elif player_type == 'H' and position == 'P':
                continue

            name = page_soup.find("span", {
                'class': 'player-header--vitals-name'
            }).text
            table = player_recent_html[0].div.div.div.div.table.tbody
            rows = table.findChildren(['th', 'tr'])
            for row in rows:
                cells = row.findChildren('td')
                row_values = [cell.span.text for cell in cells]
                row_values = [name, position] + row_values
                f.write(",".join(row_values) + '\n')

    f.close()
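
For context, a minimal sketch of how scrape might be invoked; the 'P'/'H' position codes come from the function body, but the column list and filename here are hypothetical:

# Hypothetical call; the stats header row and filename are illustrative.
pitcher_stats = ['Name', 'Position', 'IP', 'H', 'ER', 'BB', 'SO']
scrape('P', pitcher_stats, 'recent_pitching.csv')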
Example #2
from urllib.request import urlopen as urequest
from bs4 import BeautifulSoup as soup, Comment


def scrape_pitching(years):
    my_url = 'https://www.baseball-reference.com/leagues/MLB/'

    f = open('stat_scraper/generated_stats/team_pitching_statistics.csv', "w")
    pitching_columns = ['Team','#P','PAge','RA/G','W','L','W-L%','ERA','G','GS','GF','CG','tSho','cSho','SV','IP','H','R','ER','HR','BB',
        'IBB','SO','HBP','BK','WP','BF','ERA+','FIP','WHIP','H9','HR9','BB9','SO9','SO/W','LOB', 'Postseason']
    f.write(",".join(pitching_columns) + '\n')

    for year in years:
        link = my_url + str(year) + '.shtml'
        client = urequest(link)
        page_html = client.read()
        client.close()

        page_soup = soup(page_html, "html.parser")
        standard_pitching_html = page_soup.find("div", {'id': 'all_teams_standard_pitching'})
        new_page_soup = soup(str(standard_pitching_html), 'lxml')
        pitching_comments = new_page_soup.findAll(text=lambda text:isinstance(text, Comment))[0]
        pitching_soup = soup(pitching_comments, 'lxml')

        standard_pitching_html = pitching_soup.find('div', {'id': 'div_teams_standard_pitching'})
        pitching_table = standard_pitching_html.table.tbody
        pitching_rows = pitching_table.findChildren(['tr'])
        pitching_rows = pitching_rows[:-1]

        postseason_html = page_soup.find("div", {'id': 'all_postseason'})
        new_page_soup = soup(str(postseason_html), 'lxml')
        comments = new_page_soup.findAll(text=lambda text:isinstance(text, Comment))[0]
        commentsoup = soup(comments, 'lxml')
        postseason_table = commentsoup.find('div', {'id': 'div_postseason'}).table.tbody
        postseason_rows = postseason_table.findChildren(['tr'])

        postseason_teams = []
        for row in postseason_rows:
            td_list = row.find_all('td')
            look_at_td = td_list[2]
            teams = look_at_td.findAll('a')
            if teams[0].text not in postseason_teams:
                postseason_teams.append(teams[0].text)
            if teams[1].text not in postseason_teams:
                postseason_teams.append(teams[1].text)

        for row in pitching_rows:
            team_name = row.th.a['title']
            cells = row.findChildren('td')
            row_values = [cell.text for cell in cells]
            row_values = [str(year) + ' ' + team_name] + row_values
            if team_name in postseason_teams:
                row_values.append('1')
            else:
                row_values.append('0')
            f.write(",".join(row_values) + '\n')

    f.close()
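
Examples #2 and #4 both rely on the same quirk: baseball-reference.com wraps some tables in HTML comments, so the wrapper div has to be re-parsed before find() can see the table. A self-contained sketch of that technique (the markup below is a stand-in for the real page):

from bs4 import BeautifulSoup, Comment

html = ('<div id="all_example"><!-- <div id="div_example"><table><tbody>'
        '<tr><td>hidden cell</td></tr></tbody></table></div> --></div>')

outer = BeautifulSoup(html, 'html.parser')
# Pull the comment node out of the wrapper div, then parse its text as HTML.
comment = outer.find('div', {'id': 'all_example'}).find(
    string=lambda text: isinstance(text, Comment))
inner = BeautifulSoup(comment, 'html.parser')
print(inner.find('td').text)  # -> hidden cell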
Example #3
    def create_soup_page(url: str):
        page_soup = None
        u_client = None
        try:
            u_client = urequest(url)
            page_html = u_client.read()
            page_soup = URequest.html_to_soup(page_html)
        except Exception as e:
            print(e)
        finally:
            # Guard the close: u_client stays None if urequest raised.
            if u_client is not None:
                u_client.close()

        return page_soup
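
A hypothetical call site; the URequest class name is assumed from the URequest.html_to_soup call in the body:

# Illustrative only; create_soup_page returns None if the request fails.
page = URequest.create_soup_page('https://www.example.com')
if page is not None:
    print(page.title)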
Example #4
from urllib.request import urlopen as urequest
from bs4 import BeautifulSoup as soup, Comment


def scrape_hitting(years):
    my_url = 'https://www.baseball-reference.com/leagues/MLB/'

    f = open('stat_scraper/generated_stats/team_hitting_statistics.csv', "w")
    hitting_columns = ['Team', '#Bat', 'BatAge', 'R/G', 'G', 'PA', 'AB', 'R','H', '2B', '3B', 
        'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'LOB', 'Postseason']
    f.write(",".join(hitting_columns) + '\n')

    for year in years:
        link = my_url + str(year) + '.shtml'
        client = urequest(link)
        page_html = client.read()
        client.close()

        page_soup = soup(page_html, "html.parser")
        standard_batting_html = page_soup.find("div", {'id': 'all_teams_standard_batting'}).find('div', {'id': 'div_teams_standard_batting'})

        postseason_html = page_soup.find("div", {'id': 'all_postseason'})
        new_page_soup = soup(str(postseason_html), 'lxml')
        comments = new_page_soup.findAll(text=lambda text:isinstance(text, Comment))[0]
        commentsoup = soup(comments, 'lxml')
        postseason_table = commentsoup.find('div', {'id': 'div_postseason'}).table.tbody
        postseason_rows = postseason_table.findChildren(['tr'])

        postseason_teams = []
        for row in postseason_rows:
            td_list = row.find_all('td')
            look_at_td = td_list[2]
            teams = look_at_td.findAll('a')
            if teams[0].text not in postseason_teams:
                postseason_teams.append(teams[0].text)
            if teams[1].text not in postseason_teams:
                postseason_teams.append(teams[1].text)

        batting_table = standard_batting_html.table.tbody
        batting_rows = batting_table.findChildren(['tr'])
        batting_rows = batting_rows[:-1]

        for row in batting_rows:
            team_name = row.th.a['title']
            cells = row.findChildren('td')
            row_values = [cell.text for cell in cells]
            row_values = [str(year) + ' ' + team_name] + row_values
            if team_name in postseason_teams:
                row_values.append('1')
            else:
                row_values.append('0')
            f.write(",".join(row_values) + '\n')

    f.close()
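
A hypothetical driver tying Examples #2 and #4 together; the season range is illustrative:

# Illustrative only: scrape both team tables for a range of seasons.
seasons = range(2015, 2020)
scrape_pitching(seasons)
scrape_hitting(seasons)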
Example #5

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as urequest

url = 'https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off'
uClient = urequest(url)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, "html.parser")

containers = page_soup.findAll("div", {"class": "_3O0U0u"})

filename = "products.csv"
f = open(filename, "w")

headers = "Product_Name,Price,Rating\n"
f.write(headers)

for container in containers:
    product_name = container.div.img["alt"]

    price_container = container.findAll("div",
                                        {"class": "col col-5-12 _2o7WAb"})
    price = price_container[0].text

    rating_container = container.findAll("div", {"class": "hGSR34"})
    rating = rating_container[0].text

    print("product_name: " + product_name)
    print("price: " + price)
    print("rating: " + rating)

    # Write the row out; without this the CSV stays empty.
    f.write(",".join([product_name, price, rating]) + '\n')

f.close()
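
Joining fields with ',' breaks as soon as a value itself contains a comma, which Flipkart prices routinely do. A sketch of the same loop using Python's csv module, which quotes such fields automatically:

import csv

with open('products.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['Product_Name', 'Price', 'Rating'])
    for container in containers:
        product_name = container.div.img['alt']
        price = container.findAll('div', {'class': 'col col-5-12 _2o7WAb'})[0].text
        rating = container.findAll('div', {'class': 'hGSR34'})[0].text
        # csv.writer quotes any field containing a comma, so prices
        # like "61,999" stay in one column.
        writer.writerow([product_name, price, rating])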
Example #6
from urllib.request import urlopen as urequest
from bs4 import BeautifulSoup as soup

modulePage = 'http://www.open.ac.uk/courses/modules'
OU = "http://www.open.ac.uk"

# grabbing the OU module page
uClient = urequest(modulePage)

# making it readable
pageHtml = uClient.read()

# closing the connection
uClient.close()

# structuring the page into HTML
pageSoup = soup(pageHtml, "html.parser")

#print(pageSoup.h1)

# modules are located in class "int-grid7" on the page
modules = pageSoup.findAll("div", {"class": "int-grid7"})

for module in modules:
    print(module.text)

# levels are located in class "int-grid5" on the page
levels = pageSoup.findAll("div", {"class": "int-grid5"})

for level in levels:
    print(level.text)
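
If modules and levels line up one-to-one on the page (an assumption, not something the markup guarantees), the two lists can be paired directly:

# Assumes modules[i] describes the same course as levels[i].
for module, level in zip(modules, levels):
    print(module.text.strip(), "-", level.text.strip())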
Example #7
from urllib.request import urlopen as urequest
from bs4 import BeautifulSoup as bsoup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20cards'

# Opening up connection, accessing web page
uClient = urequest(my_url)
page_html = uClient.read()
uClient.close()

# html parser
page_soup = bsoup(page_html, "html.parser")

# grab each product
containers = page_soup.findAll("div", {"class": "item-container"})

filename = "Newegg_products.csv"
f = open(filename, "w")

headers = "brand,product_name,shipping\n"

f.write(headers)

for container in containers:
    branding = container.findAll("div", {"class": "item-branding"})
    brand = branding[0].a.img["title"]

    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text

    shipping_container = container.findAll("li", {"class": "price-ship"})