from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as soup


def mainPageScrape(f):
    address = "https://www.newegg.com/Processors-Desktops/SubCategory/ID-343"

    # open the connection and grab the page
    uClient = UReq(address)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    # add each processor item container to a list of containers
    containers = page_soup.findAll("div", {"class": "item-container"})

    for container in containers:
        fields = containerScrape(container)
        csv_string = ",".join(fields[:7])
        if fields[1] in descriptionlog:
            print("Duplicate processor found. Not writing to list.")
        else:
            descriptionlog.append(fields[1])
            print(csv_string)
            f.write(csv_string + "\n")

    containers.clear()
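containerScrape itself is not part of this snippet; the indexing above only assumes it returns seven text fields per item container, with the product description at index 1. A minimal hypothetical sketch under that assumption (every CSS class name below is a placeholder, not taken from the original source):

def containerScrape(container):
    """Hypothetical sketch: pull seven comma-safe text fields out of one
    item-container div. Every selector here is an assumption."""
    def text_of(tag, cls):
        found = container.findAll(tag, {"class": cls})
        return found[0].text.strip().replace(",", " ") if found else ""

    brand = text_of("a", "item-brand")          # assumed class name
    description = text_of("a", "item-title")    # assumed class name
    shipping = text_of("li", "price-ship")      # assumed class name
    price = text_of("li", "price-current")      # assumed class name
    rating = text_of("a", "item-rating")        # assumed class name
    promo = text_of("p", "item-promo")          # assumed class name
    stock = text_of("p", "item-stock")          # assumed class name
    return [brand, description, shipping, price, rating, promo, stock]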
Example #2
from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as beau

myurl = 'https://campinascomprelocal.com.br/tipo/bares/'
print(myurl)

# open the connection and grab the page
uClient = UReq(myurl)
page_html = uClient.read()
uClient.close()

soup = beau(page_html, 'lxml')

contents = soup.title
print(contents)
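The 'lxml' parser used above is a separate package (pip install lxml); a small sketch of falling back to the bundled html.parser when it is missing (FeatureNotFound is the exception BeautifulSoup raises when the requested parser is unavailable):

from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup as beau, FeatureNotFound

uClient = UReq('https://campinascomprelocal.com.br/tipo/bares/')
page_html = uClient.read()
uClient.close()

try:
    soup = beau(page_html, 'lxml')
except FeatureNotFound:
    # lxml is not installed; html.parser ships with the standard library
    soup = beau(page_html, 'html.parser')

print(soup.title)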
Example #3
from bs4 import BeautifulSoup as soup
import requests
from urllib.request import urlopen as UReq

# Question 1: scrape all the article titles shown on this page, print each of them and the total count

my_url = 'https://www.dr.dk/nyheder/tema/coronavirus'

uClient = UReq(my_url)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, 'html.parser')
# print(page_soup.title)

# grab each article title span
containers = page_soup.findAll("span", {"class": "dre-teaser-title__text"})
# print(containers)

print(len(containers))

for container in containers:
    title_container = container.findAll("span", "dre-compact-teaser__title")
    # fall back to the span's own text when no nested title element is present
    article_name = title_container[0].text if title_container else container.text
    print(article_name)
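The requests import above is never used; for reference, a sketch of the same fetch done with requests instead of urlopen (the parsing and selection steps stay the same):

import requests
from bs4 import BeautifulSoup as soup

my_url = 'https://www.dr.dk/nyheder/tema/coronavirus'

# requests manages the connection itself; raise_for_status() surfaces HTTP errors
response = requests.get(my_url, timeout=10)
response.raise_for_status()

page_soup = soup(response.text, 'html.parser')
titles = page_soup.findAll("span", {"class": "dre-teaser-title__text"})

print(len(titles))
for title in titles:
    print(title.text)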
Example #4
    def getPage(self):
        uClient = UReq(self.url)
        self.page_html = uClient.read()
        uClient.close()
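getPage reads self.url and stores the raw bytes on self.page_html, so it presumably belongs to a small scraper class; a hypothetical sketch of such a wrapper (the class name, __init__, and getSoup helper are assumptions, only getPage comes from the snippet above):

from urllib.request import urlopen as UReq
from bs4 import BeautifulSoup


class PageScraper:
    """Hypothetical wrapper; only getPage is taken from the original snippet."""

    def __init__(self, url):
        self.url = url
        self.page_html = None

    def getPage(self):
        uClient = UReq(self.url)
        self.page_html = uClient.read()
        uClient.close()

    def getSoup(self, parser="html.parser"):
        # assumed convenience method: fetch on demand, then parse
        if self.page_html is None:
            self.getPage()
        return BeautifulSoup(self.page_html, parser)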
Example #5
def remainingPagesScrape(f):
    page = 2
    duplicateCount = 0
    link = 'https://www.newegg.com/Processors-Desktops/SubCategory/ID-343/Page-'

    while True:
        try:
            address = link + str(page)
            print()
            print("Preparing to Scrape Page: " + str(page))
            print("Address: " + address)
            print()

            # open the connection and grab the page
            uClient = UReq(address)
            page_html = uClient.read()
            uClient.close()

            # html parsing
            page_soup = soup(page_html, "html.parser")

            # add each processor item container to a list of containers
            containers = page_soup.findAll("div", {"class": "item-container"})

            for container in containers:
                fields = containerScrape(container)
                csv_string = ",".join(fields[:7])
                if fields[1] in descriptionlog:
                    print("Duplicate processor found. Not writing to list.")
                    duplicateCount = duplicateCount + 1
                else:
                    descriptionlog.append(fields[1])
                    print(csv_string)
                    f.write(csv_string + "\n")
            containers.clear()

            if duplicateCount > 100:
                print()
                print(
                    "Duplicate Count Is " + str(duplicateCount) +
                    ". This Suggests The Data Is Being Reiterated. The Script Will Stop."
                )
                print("Processor Scrape Complete")
                print()
                print("Traversed " + str(page) + " Pages")
                print(str(len(descriptionlog)) + " Unique Processors Found")
                print()
                print("Data Written To: " + f.name)
                f.close()
                break

            page = page + 1

        except IndexError as e:
            print()
            page = page + 1
            # f.close()
            print("So Far We Have Traversed " + str(page - 1) + " Pages")
            print(str(len(descriptionlog)) + " Unique Processors Found")
            print(str(duplicateCount) + " Duplicates Ignored")
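Both Newegg functions rely on a module-level descriptionlog list and on the containerScrape helper, and they expect an already-open file handle; a minimal driver sketch under those assumptions (the output filename is hypothetical):

descriptionlog = []  # shared by mainPageScrape and remainingPagesScrape

if __name__ == "__main__":
    # hypothetical output file name; the functions only need a writable text handle
    with open("newegg_processors.csv", "w", encoding="utf-8") as f:
        mainPageScrape(f)        # page 1
        remainingPagesScrape(f)  # pages 2..n; stops once duplicates keep repeating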