Python urlreq示例，urllib.request.urlopen（别名 urlreq）Python示例

示例#1

0

显示文件

文件： import_flyingj.py 项目： jstetic/webapp

def downloadLocationList():
    """Download the location spreadsheet from pilotflyingj.com into the cache file.

    Fetches the XLS export at the hard-coded URL, creates the ``cache``
    directory if it does not exist yet, and writes the raw response bytes to
    the path held in ``cacheFileName`` (defined outside this function).

    Raises whatever ``urlreq`` / ``open`` raise on network or file-system
    failure; nothing is caught here except the "directory already exists"
    case.
    """
    url = 'https://pilotflyingj.com/umbraco/surface/storelocations/download?Format=xls&PageSize=793&PageNumber=1'
    # The context manager closes the connection even when read() fails.
    with urlreq(url) as client:
        sheet = client.read()
    print("Location list read from website.", file=sys.stderr)
    try:
        mkdir('cache')
    except FileExistsError:
        # ignore an error that indicates cache already exists
        pass
    # Binary mode: the payload is an .xls file, not text.
    with open(cacheFileName, 'wb') as cache_file:
        cache_file.write(sheet)

示例#2

0

显示文件

文件： import_ta.py 项目： ahalliop/webapp

def downloadLocationList():
    """Download the master location list from ta-petro.com into the cache file.

    Fetches the XLS document at the hard-coded URL, creates the ``cache``
    directory if it does not exist yet, and writes the raw response bytes to
    the path held in ``cacheFileName`` (defined outside this function).

    Raises whatever ``urlreq`` / ``open`` raise on network or file-system
    failure; nothing is caught here except the "directory already exists"
    case.
    """
    url = 'http://www.ta-petro.com/assets/ce/Documents/Master-Location-List.xls'
    # The context manager closes the connection even when read() fails.
    with urlreq(url) as client:
        sheet = client.read()
    print("Location list read from website.", file=sys.stderr)

    try:
        mkdir('cache')
    except FileExistsError:
        # ignore an error that indicates cache already exists
        pass

    # Binary mode: the payload is an .xls file, not text.
    with open(cacheFileName, 'wb') as cache_file:
        cache_file.write(sheet)

示例#3

0

显示文件

文件： ss.py 项目： LyoRex/sport-score-scraper

def nfl_get_num_weeks(year):
    """Return how many weeks pro-football-reference.com lists for a season.

    Fetches the week-1 page for ``year`` (a string, concatenated into the
    URL) and counts the occurrences of ``"/week_"`` inside the first HTML
    comment found in the ``section_wrapper`` div.

    Returns the default of 1 when the page cannot be fetched or the
    ``section_wrapper`` div is missing, instead of failing on an unbound
    variable or a ``None`` lookup.
    """
    maxWeek = 1  # fallback used whenever the page cannot be read

    url = 'https://www.pro-football-reference.com/years/' + year + '/week_1.htm'

    try:
        # open connection and get page html; ``with`` closes it on any outcome
        with urlreq(url) as urlClient:
            page_html = urlClient.read()
    except Exception:
        # Best effort: any fetch failure is reported and the default returned.
        print("Could not open page: " + url)
        return maxWeek

    page_soup = soup(page_html, "html.parser")

    wrapper = page_soup.find("div", {"class": "section_wrapper"})
    if wrapper is None:
        return maxWeek

    # The count is taken from the text of the first comment node in the wrapper.
    week_markup = wrapper.find(string=lambda text: isinstance(text, Comment))
    maxWeek = str(week_markup).count("/week_")

    return maxWeek

示例#4

0

显示文件

文件： scrap_data.py 项目： ABHISHEK-T-S/Web_Scrapping

import csv

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as urlreq

# Scrape name / price / rating of the phones on one Flipkart search page
# and store them in products.csv.

myurl = "https://www.flipkart.com/search?q=phones&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off"

# The context manager closes the connection even if read() fails.
with urlreq(myurl) as uclient:
    page_html = uclient.read()

page_soup = soup(page_html, "html.parser")

filename = "products.csv"
headers = "name, price, rating \n"

names = page_soup.find_all("div", {"class": "_3wU53n"})  # grabs all names using class
price = page_soup.find_all("div", {"class": "_1vC4OE _2rQ-NK"})  # grabs all price using class
rating = page_soup.find_all("div", {"class": "hGSR34"})  # grabs all ratings using class

with open(filename, "w") as f:
    f.write(headers)
    # csv.writer quotes fields that contain commas (product names often do),
    # so a name can no longer spill into the price/rating columns.
    writer = csv.writer(f, lineterminator="\n")

    # At most 23 products are exported; zip() stops early when the page
    # yields fewer entries instead of raising IndexError.
    for name_tag, price_tag, rating_tag in zip(names[:23], price[:23], rating[:23]):
        product_name = name_tag.text
        product_price = price_tag.text
        product_rating = rating_tag.text

        print("name :" + product_name)
        print("price :" + product_price)
        print("rating :" + product_rating)

        # Drop the currency sign and thousands separators from the price.
        writer.writerow([
            product_name,
            product_price.replace("₹", " ").replace(",", ""),
            product_rating,
        ])

示例#5

0

显示文件

    def scrape(self):
        """Scrape the listing pages starting at ``self.target_url`` into ``newegg.csv``.

        When ``self.first`` is true the CSV is created and a header row plus
        the current timestamp is written; otherwise rows are appended.  The
        start page is fetched once to read the page count, then each page
        ``1..N`` is fetched by replacing the trailing characters of
        ``self.target_url`` with the page number, and one CSV row is written
        per ``item-container`` element.

        Instance attributes read: ``first``, ``debug`` (verbose printing),
        ``page_limit`` (``-1`` disables the limit), ``delay`` (whole seconds
        slept between pages) and ``repeat``.  When ``repeat`` is set it is
        cleared together with ``first``, ``target_url`` is set to the
        external name ``amd`` and the method runs once more.

        Exits the program through ``quit()`` if the CSV cannot be opened or a
        fetched page lacks the expected "Search Within:" label.
        """
        file_name = "newegg.csv"
        try:
            f = open(file_name, "w" if self.first else "a")
        except PermissionError:
            self._abort_scrape(
                "FILE OPEN ERROR! Please close the .csv file before running the program."
            )

        # ``with`` closes the CSV on every exit path, including quit() and
        # network errors raised while fetching pages.
        with f:
            if self.first:
                titles = "brand, product_name, sale_percentage, price, shipping, total, url_link"
                currentDate = str(datetime.datetime.now())
                f.write(titles + ',,,' + currentDate + "\n")
                print('Collecting data, be patient!...')
                print('\n')

            # The start page is only used to find the total number of pages.
            page_soup = self._fetch_page_soup(self.target_url)
            try:
                num_of_pages = (page_soup.find_all(
                    "span",
                    {"class": "list-tool-pagination-text"}))[0].strong.text[2:]
            except (IndexError, AttributeError):
                num_of_pages = '1'

            if self.debug:
                print(
                    '------------------------------DEBUG------------------------------'
                )
            total_pages = int(num_of_pages)

            # looping through all the found pages
            for i in range(1, total_pages + 1):
                if self.page_limit != -1 and i > self.page_limit:
                    break
                page_limit_test = i == self.page_limit

                # Swap the previous page number at the end of the URL for i.
                digit_length = len(str(i - 1))
                self.target_url = self.target_url[0:-digit_length] + str(i)
                if self.debug:
                    print('PAGE URL: ' + str(self.target_url))

                page_soup = self._fetch_page_soup(self.target_url)

                # A page without the expected search label is treated as a
                # bot-detection page.
                search_box = page_soup.find("div", {"class": "list-tool-search"})
                search_label = search_box.label if search_box is not None else None
                if search_label is None or search_label.text != "Search Within:":
                    self._abort_scrape(
                        "Newegg has likely detected this as a bot! Please consider adding a delay, limiting",
                        "pages, and changing the VPN server location!",
                        leading_blank=True,
                    )

                # every product listing on the current page
                for container in page_soup.find_all(
                        "div", {"class": "item-container"}):
                    (product_brand, product_name, product_sale_percent,
                     product_price, product_shipping_price,
                     total_product_price,
                     product_link) = self._parse_item(container)

                    if self.debug:
                        print('PAGE: ' + str(i))
                        print('BRAND: ' + product_brand)
                        print('PRODUCT NAME: ' + product_name)
                        print('SALE PERCENT: ' + product_sale_percent)
                        print('PRICE: ' + product_price)
                        print('SHIPPING PRICE: ' + product_shipping_price)
                        print('TOTAL PRICE: ' + total_product_price)
                        print('URL: ' + product_link)

                    # Commas in the product name would shift the CSV columns.
                    f.write(",".join((
                        product_brand,
                        product_name.replace(",", ";"),
                        product_sale_percent,
                        product_price,
                        product_shipping_price,
                        total_product_price,
                        product_link,
                    )) + "\n")
                    if self.debug:
                        print(
                            '_____________________________________________________'
                        )

                # Pause between pages, but not after the last page fetched.
                if self.delay != 0 and i != total_pages and not page_limit_test:
                    if self.debug:
                        print('\n')
                        print(
                            '------------------------DEBUG------------------------'
                        )
                        print('Delaying for ' + str(self.delay) + ' second(s)')
                        print(
                            '-----------------------------------------------------'
                        )

                    for elapsed in range(0, self.delay):
                        if self.debug:
                            print('Delay time left: ' + str(self.delay - elapsed),
                                  end="\r")
                        sleep(1)

                    if self.debug:
                        print('\n')
                        print(
                            '_____________________________________________________'
                        )

            if self.debug:
                print(
                    '-----------------------------------------------------------------'
                )
                print('\n')
                print('------------------------DEBUG------------------------')
                print('End of main sequence reached!')
                print('-----------------------------------------------------')

        if self.repeat:
            if self.debug:
                print('\n')
                print('------------------------DEBUG------------------------')
                print('Repeat sequence reached!')
                print('-----------------------------------------------------')
                print('\n')
            self.repeat = False
            self.first = False

            self.target_url = amd

            self.scrape()

    def _abort_scrape(self, *message_lines, leading_blank=False):
        """Print an error banner around ``message_lines``, wait 3 seconds and quit."""
        if leading_blank:
            print("\n")
        print(
            "------ERROR------ERROR------ERROR------ERROR------ERROR------ERROR------ERROR------"
        )
        for line in message_lines:
            print(line)
        print(
            "-----------------------------------------------------------------------------------"
        )
        print("\n")
        print("Exiting program...")
        sleep(3)
        quit()

    def _fetch_page_soup(self, url):
        """Download ``url`` and return its parsed HTML; the connection is always closed."""
        with urlreq(url) as url_page_download:
            url_page_dump = url_page_download.read()
        return bsoup(url_page_dump, "html.parser")

    @staticmethod
    def _parse_item(container):
        """Extract one product row from an ``item-container`` element.

        Returns ``(brand, name, sale_percent, price, shipping, total, link)``,
        all strings.  Every field falls back to a placeholder when the
        corresponding element is missing.
        """
        # product brand
        try:
            brand = container.find("a", {"class": "item-brand"}).img["title"]
        except (AttributeError, TypeError, KeyError):
            brand = 'Unknown Brand'

        # product name
        try:
            name = container.find("a", {"class": "item-title"}).text
        except AttributeError:
            name = "Unknown Name"

        # sale percentage; anything not ending in "%" counts as no sale
        try:
            sale_percent = container.find(
                "span", {"class": "price-save-percent"}).text
        except AttributeError:
            sale_percent = "0%"
        if not sale_percent.endswith("%"):
            sale_percent = "0%"

        # price = dollars (<strong>) + cents (<sup>), thousands commas removed
        try:
            price_tag = container.find("li", {"class": "price-current"})
            price = price_tag.strong.text + price_tag.sup.text
        except AttributeError:
            price = "0"
        price = price.replace(",", "")

        # shipping: only texts starting with "$" carry an amount; everything
        # else (empty text, "Free Shipping", unknown text) counts as 0
        try:
            shipping_text = container.find("li", {"class": "price-ship"}).text
        except AttributeError:
            shipping_text = "0"
        if shipping_text.startswith("$"):
            # take the amount up to the first space; a text without any space
            # is used whole instead of losing its last character
            shipping = shipping_text[1:].split(" ", 1)[0]
        else:
            shipping = "0"

        # total with shipping, limited to 2 decimal places
        try:
            total = '%.2f' % (float(price) + float(shipping))
        except ValueError:
            total = price
        else:
            # the formatted value is "0.00" (never "0") when nothing was found
            if total == "0.00":
                total = 'Unknown Price'

        # product link
        try:
            link = container.find("a", {"class": "item-title"})["href"]
        except (TypeError, KeyError):
            link = 'https://newegg.ca'

        return brand, name, sale_percent, price, shipping, total, link

示例#6

0

显示文件

文件： ss.py 项目： LyoRex/sport-score-scraper

def nfl_scores():
    """Print the NFL scores of one week from pro-football-reference.com.

    Command-line arguments are read from ``sys.argv``: ``sys.argv[2]`` is the
    4-digit year and ``sys.argv[3]`` the week number.  If the arguments are
    missing or invalid, a message is printed and the values accepted so far
    (defaults: current year, week 1) are used.

    For every game the date, both team names and scores, the winner, and the
    passing / rushing / receiving yard leaders are printed.  Prints
    "Page does not exist!" and returns when the page request raises
    ``HTTPError``.
    """
    # initialize year to current year and week to first week
    year = str(date.today().year)
    week = "1"
    maxWeek = 1
    try:
        if len(sys.argv) != 4:
            print("Arguments for football are <year> <week>")
            raise ValueError("expected <year> <week>")

        year_arg = str(sys.argv[2])
        if len(year_arg) != 4 or not year_arg.isdigit():
            print("Must input a valid year!")
            raise ValueError("invalid year")
        year = year_arg

        maxWeek = nfl_get_num_weeks(year)
        requested_week = int(sys.argv[3])
        if requested_week <= 0:
            print("Must enter a positive value for week!")
            raise ValueError("week must be positive")
        if requested_week > maxWeek:
            print("NFL " + year + " only has " + str(maxWeek) +
                  " weeks!")
            raise ValueError("week out of range")
        # Normalised through int() so inputs such as "03" still build a valid URL.
        week = str(requested_week)
    except Exception:
        # Any validation or lookup failure keeps the values accepted so far.
        print("One or more of your arguments are not valid!")

    #Pro Football Reference
    my_url = 'https://www.pro-football-reference.com/years/' + year + '/week_' + week + '.htm'

    try:
        # open connection and get page html; ``with`` closes it on any outcome
        with urlreq(my_url) as urlClient:
            page_html = urlClient.read()
    except HTTPError:
        print("Page does not exist!")
        return

    # initiate soup
    page_soup = soup(page_html, "html.parser")
    gamePosts = page_soup.findAll(
        "div", {"class": "game_summary expanded nohover"})
    #header
    print("Getting scores from Week " + week + " of the " + year +
          " Season!")
    print("================================================")

    for post in gamePosts:
        gameDate = post.find("tr", {"class": "date"}).text

        # Row 1 of the teams table is printed as "(H)", row 2 as "(A)".
        team_rows = post.find("table", {"class": "teams"}).tbody.select("tr")
        homeTeam = team_rows[1]
        awayTeam = team_rows[2]
        homeTeamName = homeTeam.select("td")[0].text
        awayTeamName = awayTeam.select("td")[0].text
        homeTeamPoints = homeTeam.select("td")[1].text
        awayTeamPoints = awayTeam.select("td")[1].text

        # Stats rows in order: passing, rushing, receiving;
        # cell 1 is the player name, cell 2 the yards.
        stat_rows = post.find("table", {"class": "stats"}).select("tr")
        passYdName = stat_rows[0].select("td")[1].text
        passYdPts = stat_rows[0].select("td")[2].text
        rushYdName = stat_rows[1].select("td")[1].text
        rushYdPts = stat_rows[1].select("td")[2].text
        recYdName = stat_rows[2].select("td")[1].text
        recYdPts = stat_rows[2].select("td")[2].text

        # The "(H)" row wins if it is the row carrying the "winner" class.
        if homeTeam == post.find("tr", {"class": "winner"}):
            winnerIndex = 1
        else:
            winnerIndex = 2

        print(gameDate + ":")
        print("(H) " + homeTeamName + " scored " + homeTeamPoints +
              " points.",
              end=" ")
        if winnerIndex == 1:
            print("[WINNER]")
        else:
            print()
        print("(A) " + awayTeamName + " scored " + awayTeamPoints +
              " points.",
              end=" ")
        if winnerIndex == 2:
            print("[WINNER]")
        else:
            print()

        print("\t-" + passYdName + " led in passing yards with " +
              passYdPts + " yards")
        print("\t-" + rushYdName + " led in rushing yards with " +
              rushYdPts + " yards")
        print("\t-" + recYdName + " led in receiving yards with " +
              recYdPts + " yards")

        print(
            '------------------------------------------------------------')
        print()

示例#7

0

显示文件

文件： ss.py 项目： LyoRex/sport-score-scraper

def nba_scores():
    """Print the NBA box scores of one day from basketball-reference.com.

    The date comes from ``sys.argv``:

    * no extra argument: today;
    * ``today`` / ``t``: today, with zero-padded month and day;
    * ``yesterday`` / ``y`` [days]: ``days`` days back (default 1; a negative
      value prints a notice and falls back to 1);
    * ``<month> <day> <year>``: that date, which must be a real calendar date.

    If the arguments are invalid a message is printed and today's date is
    used.  For every game the winner, the loser, their points and the leading
    scorer are printed.  Prints "Page does not exist!" and returns when the
    page request raises ``HTTPError``.
    """
    # Set default date to current day
    today = date.today()
    month = str(today.month)
    day = str(today.day)
    year = str(today.year)
    # check for inputs
    try:
        if len(sys.argv) >= 3:
            choice = sys.argv[2].lower()
            # input is today
            if choice in ("today", "t"):
                month = str(today.month).zfill(2)
                day = str(today.day).zfill(2)
                year = str(today.year)
            # input is yesterday
            elif choice in ("yesterday", "y"):
                daysBack = 1
                # set amount of days back
                if len(sys.argv) == 4:
                    requested_days = int(sys.argv[3])
                    if requested_days >= 0:
                        daysBack = requested_days
                    else:
                        print(
                            "Can not go back negative days; going back 1 day instead!"
                        )

                # set month, day, and year
                newDate = today - timedelta(daysBack)
                month = str(newDate.month)
                day = str(newDate.day)
                year = str(newDate.year)
            # date is given
            else:
                month_arg = sys.argv[2]
                day_arg = sys.argv[3]
                year_arg = sys.argv[4]
                # Building a date rejects non-numeric and impossible values
                # (ValueError) before they reach the URL.
                date(int(year_arg), int(month_arg), int(day_arg))
                month = str(month_arg)
                day = str(day_arg)
                year = str(year_arg)
    except Exception:
        # Missing or malformed arguments keep the default date.
        print("One or more of your arguments are not valid!")

    #Basketball Reference
    my_url = 'https://www.basketball-reference.com/boxscores/?month=' + month + '&day=' + day + '&year=' + year

    try:
        # open connection and get page html; ``with`` closes it on any outcome
        with urlreq(my_url) as urlClient:
            page_html = urlClient.read()
    except HTTPError:
        print("Page does not exist!")
        return

    # html parsing
    page_soup = soup(page_html, "html.parser")
    # get all game posts
    gamePosts = page_soup.findAll(
        "div", {"class": "game_summary expanded nohover"})

    # header
    print("Getting scores from " + month + "/" + day + "/" + year)
    print("=================================")

    # parse data for all games
    for post in gamePosts:
        # First row of the stats table: cell 1 is the player, cell 2 the points.
        stat_cells = post.find("table", {
            "class": "stats"
        }).tbody.tr.select('td')
        leadScorer = stat_cells[1].text
        mostPoints = stat_cells[2].text

        # The first row of the post's second table is labelled "(H)" below.
        homeTeam = post.select('table')[1].tbody.select('tr')[0].td.a.text

        teams_body = post.find("table", {"class": "teams"}).tbody

        winner = teams_body.find("tr", {"class": "winner"})
        winnerTeam = winner.td.text
        winnerPoints = winner.find("td", {"class": "right"}).text

        loser = teams_body.find("tr", {"class": "loser"})
        loserTeam = loser.td.text
        loserPoints = loser.find("td", {"class": "right"}).text

        print(("(H) " if homeTeam == winnerTeam else "(A) ") + winnerTeam +
              " scored " + winnerPoints + " points. [WINNER]")
        print(("(H) " if homeTeam == loserTeam else "(A) ") + loserTeam +
              " scored " + loserPoints + " points.")
        print("\t-The leading scorer was " + leadScorer + " with " +
              mostPoints + " points.")

        print(
            '------------------------------------------------------------')
        print()