Example #1
def add_item(item, n, inlist, p):
    pika = "https://www.pikalytics.com/pokedex/vgc2018/" + p + "/"

    uData = ureq(pika)  #pull html
    html = uData.read()
    page_soup = soup(html, "html.parser")

    item = page_soup.findAll("div",
                             {"class": "inline-block pokemon-stat-container"})
    item = item[3].findAll("div", {"class": "pokedex-category-wrapper"})
    item = item[0].findAll("div", {"id": "items_wrapper"})
    item = item[0].findAll("div")
    item = item[0].findAll("div", {"class": "pokedex-move-entry-new"})
    item = item[n].findAll("div", {"style": "display:inline-block;"})
    item = str(item)
    item = item.split('>')
    item = item[1].split('<')
    item = item[0]
    if item not in inlist:
        inlist.append(item)
        return (item)
    else:
        n += 1
        return (add_item(item, n, inlist, p))
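# Hypothetical usage; the imports and the Pokémon name are assumptions, since
# the listing shows only the function itself.
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup

seen_items = []
print(add_item(None, 0, seen_items, "incineroar"))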
Example #2
def department(url):
    my_url = url
    uClient = ureq(my_url)

    # Open link, grab the page
    page_html = uClient.read()
    uClient.close()

    # html parse
    page_soup = soup(page_html, "html.parser")

    # grabs each class
    containers = page_soup.find("div", {"id": "crsebrowser"})

    a = []

    for string in page_soup.table.find_all("a", href=True):
        a.append(string.get_text())

    dict = {my_url: []}
    for i in a:
        dict[my_url].append(i)

    return (dict)
Example #3
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq
import csv

my_url = 'http://www.espn.com/college-sports/basketball/recruiting/playerrankings/_/class/2021'
client = ureq(my_url)  #opens connection, grabs page
page_html = client.read()
client.close()
#html parsing
page_soup = soup(page_html, "html.parser")
#grabs all item info via html
containers = page_soup.findAll("tr")

#each entry in p_info: [name, pos, hometown, ht, wt]; hometown is later split into [hometown, high school]
p_info = []

for c in containers:
    if ("oddrow" in c["class"]) or ("evenrow" in c["class"]):
        s = c.findAll("td")
        name = s[1].div.a.strong.text
        pos = s[2].b.text
        hometown = s[3].text
        ht = s[4].text
        wt = s[5].text
        p_info.append([name, pos, hometown, ht, wt])

#string formatting: split hometown into [hometown, high school]
for i in p_info:
    index = i[2].find(",") + 4
    i[2] = [i[2][:index], i[2][index:]]
    print(i[2])
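# The csv module imported above is never used; a minimal sketch of writing the
# scraped rows (the file name and column order are assumptions):
with open("recruits.csv", "w", newline="") as out:
    writer = csv.writer(out)
    writer.writerow(["name", "pos", "hometown", "high school", "ht", "wt"])
    for name, pos, (hometown, school), ht, wt in p_info:
        writer.writerow([name, pos, hometown.strip(), school.strip(), ht, wt])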
Example #4
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup

url = os.environ.get('URL_VAR')

browser = webdriver.Chrome(r'C:\Users\Leandro\Downloads\chromedriver.exe')
browser.get(url)

search = browser.find_element_by_name('field-keywords')
search.send_keys(os.environ.get('SEND_KEYS'))
search.send_keys(Keys.ENTER)

browser.implicitly_wait(15)
#passing my url from selenium to bs4
new_url = browser.current_url
browser.implicitly_wait(2)

client = ureq(new_url)
html_scrap = client.read()
client.close()
html_soup = soup(html_scrap, "html.parser")

containers = html_soup.findAll(
    "div", {"class": "s-include-content-margin s-border-bottom"})
page = containers[0]

title_tag = page.find("span",
                      {"class": "a-size-medium a-color-base a-text-normal"})
title = title_tag.text
price_tag = page.find("span", {"class": "a-offscreen"})
price = price_tag.text

filename = "portfolio-scraper.csv"
Example #5
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup

my_url = "https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&IsNodeId=1&N=100007709%20600499109"

# opening up a connection and grabbing the page
uclient = ureq(my_url)
page_html = uclient.read()
uclient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each product
containers = page_soup.find_all("div",{"class":"item-container"})
abrands = page_soup.find_all("a",{"class":"item-brand"})

filename = "Graphics Cards.csv"
f = open(filename, "w")

headers = "Make, Product_Name, Shipping\n"

f.write(headers)

for container, abrand in zip(containers, abrands):
	# pair each product container with its brand link instead of always
	# re-using the last element of abrands (the original inner loop bug)
	make = abrand.img["title"]

	title_container = container.findAll("a",{"class":"item-title"})
	product_name = title_container[0].text
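	# The original snippet stops here; a minimal completion sketch. The shipping
	# selector is an assumption based on the Shipping column in the header above.
	shipping_tag = container.find("li", {"class": "price-ship"})
	shipping = shipping_tag.text.strip() if shipping_tag is not None else ""
	f.write(make + "," + product_name.replace(",", "|") + "," + shipping + "\n")

f.close()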
Example #6
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq
from time import sleep

myurl = 'https://www.bestbuy.com/site/arlo-pro-3-6-camera-indoor-outdoor-wire-free-2k-hdr-security-camera-system-white/6364585.p?skuId=6364585'
for i in range(20):
    sleep(3)
    # re-fetch the page each iteration so the polled price can actually change
    client = ureq(myurl)
    page_html = client.read()
    client.close()
    page_soup = soup(page_html, 'html.parser')

    container = page_soup.findAll('div', {'class': 'price-box'})
    # print(container[0])
    # print(len(container))
    containers = container[0]
    print(containers)
# print(containers )
# price = containers.findAll('div',{'class': 'priceView-hero-price'})
# print(price[0].text)
Example #7
import csv  #to do operations on CSV
import pandas as pd  # file operations
from bs4 import BeautifulSoup as soup  #scraping tool
from urllib.request import urlopen as ureq  # For requesting data from link
import numpy as np
import re

url = "http://howstat.com/cricket/Statistics/Players/PlayerList.asp?Group=A"
pagehtml = ureq(url)
page_soup = soup(pagehtml, "html.parser")  #parse the html
table = page_soup.find("table", {"class": "TableLined"})
with open('AZ.csv', 'a', newline='') as csvfile:
    f = csv.writer(csvfile)
    rows = table.find_all('tr')  #find all tr tags (rows)
    for tr in rows:
        cols = tr.find_all('td')  #find all td tags (columns)
        data = [td.text.strip() for td in cols]
        f.writerow(data)  #write the row while the file is still open
        print(data)
Example #8
import csv
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup

filename = 'drive/MyDrive/clgscrap.csv'  #results storage
#read the old csv to get the previously stored notifications
checker = []
try:
    with open(filename, "r+", encoding="utf-16") as r:
        reader = csv.reader(r, delimiter="\n")
        for i, line in enumerate(reader):
            checker.append(line)  #collect the read lines into the checker list
except Exception as e:
    err = e  #first run: the file may not exist yet

#Connecting the website with scraper
myurl = 'http://gecskp.ac.in'  #provide the required URL
ucli = ureq(myurl)  #connect to the webpage using urllib
page_html = ucli.read()  #read the webpage into a variable
ucli.close()  #close the connection so it does not stay open and time out

page_soup = soup(page_html, "html.parser")  #parse the html with BeautifulSoup
#the tags that hold the notification titles and links
containers = page_soup.findAll(
    "a", {"style": "font-weight: 600; font-style: normal;"}, href=True)

#Scraping and updating the csv
#utf-16 encoding so that the stored text supports multiple languages
with open(filename, "w+", encoding="utf-16") as f:
    headers = "NEWS,LINK\n"
Example #9
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup

thehylia = "https://anime.thehylia.com/"
music = "https://anime.thehylia.com/soundtracks/browse/all"
animusic = "https://anime.thehylia.com/downloads/browse/all"

import sys
program_name = sys.argv[0]
arguments = sys.argv[1:]
count = len(arguments)


for x in arguments:
    # print("x is:", x)
    uclient = ureq(x)
    soup_html = uclient.read()
    uclient.close()
    htmlsoup = soup(soup_html, "html.parser")

    content = htmlsoup.find("table", {"class":"blog"})
    content = content.find("table")
    content = content.findAll("tr")
    for item in content:
        for val in item.findAll("td"):
            #print(val.text)
            if val.a is not None:
                print(val.a["href"])
        #print("\n")    

    #with open('./database.db', 'w') as currfile:
Example #10
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq
import re
print("Name the movie")
movie = input()
new_movie = ""
for i in movie:
    if i.isspace():
        new_movie = new_movie + '+'
    else:
        new_movie = new_movie + i
#print(new_movie)
meurl = "https://www.imdb.com/find?q=" + new_movie + "&ref_=nv_sr_sm"
print(meurl)
uclient = ureq(meurl)
page = uclient.read()  # Download the page
uclient.close()
pagesoup = soup(page, "html.parser")
main = pagesoup.find("div", {"id": "main"})
#print(main)
#print("type of main is")
#print(type(main))
links = main.findAll("a", href=re.compile("(/title/)+"))
#print("\nlength of link is ")
#print(len(links))

mainlink = links[0]
mainlink = mainlink['href']
#print(mainlink)

print("\nNow the final link is ")
Example #11
def get_json(local):
    f = ureq(
        'http://api.wunderground.com/api/0bf213c89d5d826c/geolookup/conditions/q/IA/'
        + local + '.json')
    data = json.loads(f.read())
    return data
Example #12
mycursor = mydb.cursor()
"""sql_table= (
    "CREATE TABLE IF NOT EXISTS Moviez ("
    "  `mov_no` int(11) NOT NULL AUTO_INCREMENT,"
    "  `release_year` varchar NOT NULL,"
    "  `mov_name` varchar(100) NOT NULL,"
    "  `genre` varchar(16) NOT NULL,"
    "  `rating` DECIMAL(2,2) NOT NULL,"
    "  `votes` INT(10) NOT NULL,"
    "  'gross' DECIMAL(4,2) NOT NULL,"
    "  'director' VARCHAR(40) NOT NULL,"
    "  PRIMARY KEY (`mov_no`)"
    ") ENGINE=InnoDB")"""

myUrl = 'https://www.imdb.com/search/title?groups=top_250&sort=user_rating'
html_data = ureq(myUrl)
read_html = html_data.read()
html_data.close()

page_soup = soup(read_html, 'html.parser')
#html_doc=print(page_soup.prettify())

my_container = page_soup.find_all('div', {'class': 'lister-item-content'})
#for string in page_soup.stripped_strings:
#   print(string)
#container=my_container[0]
"""mov_name=container.find_all('a')
print(mov_name[0].text)
year=container.find_all('span',{'class':'lister-item-year text-muted unbold'})
print(year[0].text)
rating=container.find_all('span',{'class':'global-sprite rating-star imdb-rating'})"""
Example #13
def scrap(x):
    x = chr(x)  #to change integer into character
    url = PRETEXT_URL + PLAYERLIST.format(x)
    try:
        print("\n")
        print(url)
        pagehtml = ureq(url)
    except HTTPError as e:
        print(e)
    except URLError as e:
        print("Website Can't be reached")
    else:
        soup = soups(pagehtml, "html.parser")  #parse the html
        table = soup.find("table", {"class": "TableLined"})
        if table is not None:
            #for x in table:
            rows = table.find_all('tr',
                                  attrs={"bgcolor":
                                         ["#FFFFFF",
                                          "#E3FBE9"]})  #find all tr tag(rows)
            localCricketData = []
            for tr in rows:
                data = []
                allLinks = tr.find_all('a', {"class": "LinkNormal"})
                if (len(allLinks)):
                    allLinks = allLinks[1:]

                cols = tr.find_all('td')  #find all td tags(columns)
                i = 0
                dataDist = {}
                isFullNameSet = 0
                for td in cols:
                    if (td.text.strip() not in IGNORE_VALUES
                            and td.text.strip().find('No. of Records') == -1):
                        textValue = td.text.strip()
                        if (textValue == ''):
                            textValue = 0
                        dataDist = mapData(dataDist, i, textValue)
                        data.append(td.text.strip())
                        i += 1

                for link in allLinks:
                    dataTitle = ''
                    subDataDist = {}
                    subUrl = PRETEXT_URL + link.get('href')
                    if re.search(ODI_URL, subUrl):
                        dataTitle = 'odistats'
                    elif re.search(T20_URL, subUrl):
                        dataTitle = 't20stats'
                    elif re.search(IPL_URL, subUrl):
                        dataTitle = 'iplstats'
                    elif re.search(TEST_URL, subUrl):
                        dataTitle = 'teststats'

                    try:
                        #print("\n")
                        #print(subUrl)
                        subPagehtml = ureq(subUrl)
                    except HTTPError as e:
                        print(e)
                    except URLError as e:
                        print("Website Can't be reached")
                    else:
                        subSoup = soups(subPagehtml, "lxml")  #parse the html

                        # Set Full Name, Batting Style and Bowling Style
                        if (isFullNameSet == 0):
                            dataDist = mapData(
                                dataDist, 6,
                                subSoup.find('td',
                                             text='Full Name:').find_next(
                                                 'td').text.strip())
                            dataDist = mapData(
                                dataDist, 7,
                                subSoup.find(
                                    'td',
                                    text='Bats:').find_next('td').text.strip())
                            dataDist = mapData(
                                dataDist, 8,
                                subSoup.find('td', text='Bowls:').find_next(
                                    'td').text.strip())
                            if (subSoup.find('a',
                                             text='IPL Profile & Statistics')):
                                allLinks.append(
                                    subSoup.find(
                                        'a', text='IPL Profile & Statistics'))
                            isFullNameSet = 1

                        if (dataTitle == 'iplstats'):
                            dataDist['ipl_teams'] = subSoup.find(
                                'td', text='Teams:').find_next(
                                    'td').text.strip().split(",")
                            dataDist['ipl'] = re.sub(
                                r"\([^()]*\)", "",
                                subSoup.find('td', text='Matches:').find_next(
                                    'td').text.strip())
                            dataDist['ipl'] = dataDist['ipl'].replace(
                                "\u00A0", "")

                        subTables = subSoup.find('table',
                                                 attrs={"width": ["270"]})
                        content = subTables.find_all('td')
                        mainKey = []
                        headers = subTables.find_all('td',
                                                     attrs={"colspan": "2"})
                        for heading in headers:
                            if (heading.text.strip() != ''):
                                mainKey.append(heading.text.strip())

                        dataDist[dataTitle] = extractTdAndMapValues(
                            subDataDist, mainKey, content)
                        #print(dataDist)

                if (bool(dataDist)):
                    localCricketData.append(dataDist)
            return localCricketData
Example #14
def get_soup(url_string):
    web_client = ureq(url_string)
    page_content = web_client.read()
    web_client.close()
    return soup(page_content, 'html5lib')
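# Hypothetical usage; the URL is a placeholder and the imports are assumptions,
# since the listing shows only the function itself.
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup

print(get_soup("https://example.com").title)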
Example #15
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq

my_url="https://www.flipkart.com/search?q=mobile%20phone&as=on&as-show=on&marketplace=FLIPKART&otracker=start&as-pos=2_q_mobile%20phone"

file_name="flipkartmobilerecord.csv"

f=open(file_name,"w",encoding="utf-8")
headers="product_name,price\n"

f.write(headers)

client=ureq(my_url)
page_html=client.read()
client.close()

page_soup=soup(page_html,"html.parser")

containers=page_soup.findAll("div",{"class":"_1-2Iqu row"})

for container in containers:
	name_container=container.findAll("div",{"class":"_3wU53n"})
	name=name_container[0].text

	price_container= container.findAll("div",{"class":"_1vC4OE _2rQ-NK"})
	price=price_container[0].text

	#print("name: " + name + " price: " + price + "\n")
Example #16
#TAKES MAL_ID AS AN ARGUMENT TO GET IMAGE AND DESCRIPTION

from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup
import sqlite3, time, sys

mal = "https://myanimelist.net/"
topanimelist = "https://myanimelist.net/topanime.php?limit=14150"
#0 to 14150
topmangalist = "https://myanimelist.net/topmanga.php?limit=46100"
#0 to 46100
searchmal = "https://myanimelist.net/search/prefix.json?type=all&keyword=boku&v=1"
malanime = "https://myanimelist.net/search/prefix.json?type=anime&keyword=boku&v=1"
malmanga = "https://myanimelist.net/search/prefix.json?type=manga&keyword=boku&v=1"
malcharacter = "https://myanimelist.net/search/prefix.json?type=character&keyword=boku&v=1"
maluser = "******"

mal_id = sys.argv[1]  #30276 for ONE PUNCH MAN

curr_url = "https://myanimelist.net/anime/" + str(mal_id)

uclient = ureq(curr_url)
soup_html = uclient.read()
uclient.close()
htmlsoup = soup(soup_html, "html.parser")

#print(str(htmlsoup.find("meta", {"property" : "og:url"})["content"].split("/")[5]))
print(str(htmlsoup.find("table", {"width": "100%"}).div.div.a.img["src"]))
print(str(htmlsoup.find("span", {"itemprop": "description"}).text))
Example #17

from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup


def get_url(page_number):
    return 'https://www.cars.com/for-sale/searchresults.action/?mdId=20823&mkId=20017&mlgId=28872&page=' + str(
        page_number
    ) + '&perPage=100&rd=99999&searchSource=GN_REFINEMENT&shippable-dealers-checkbox=true&showMore=false&sort=relevance&yrId=58487&yrId=30031936&yrId=35797618&yrId=36362520&zc=60542&localVehicles=false'


#this url is searching for 2018-2019 honda civics under 10k miles. You can configure it to whatever car(s) or model(s) you want.
filename = 'cars.csv'  #output filename
f = open(filename, 'w')

headers = 'Price,Color\n'
f.write(headers)
for i in range(1, 51):  #loop through pages of cars..
    uClient = ureq(get_url(i))  #opens url
    page_html = uClient.read()  #reads page html
    uClient.close()  #closes url
    page_soup = soup(page_html, 'html.parser')  #parses html
    containers = page_soup.findAll(
        'div', {'class': 'shop-srp-listings__inner'}
    )  #finds all of the listing blocks. I have included a sample picture of what a listing block looks like
    for container in containers:  #loops through each listing block on each page. I have it set to 100 listings per page in my link.
        price_container = container.find('div', 'payment-section')
        price = price_container.span.text.strip()
        if price == 'Not Priced':  #some prices are not listed, so we do not want to include these since the cars will be of no use
            continue
        price = price.replace(',', '')
        price = price.replace('$', '')  #gives price as a readable number

        color_container = container.find('ul',
Example #18
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq

user1 = input(
    "Please create a folder named 'database' and enter Y if done: ")
user2 = input(
    "Please create another folder named 'images' inside the 'database' folder and enter Y if done: ")
if (user1.lower() == 'y' and user2.lower() == 'y'):
    my_url = 'https://www.imdb.com/list/ls004440136/'
    #opening csv file for database
    filename = "database/database.csv"
    f = open(filename, "w")
    #headers
    headers = "Image,Actor_name,Profession,personality traits\n"
    f.write(headers)
    #opening up connection, grabbing the page
    uclint = ureq(my_url)
    page_html = uclint.read()
    uclint.close()
    #html parser
    page_soup = soup(page_html, "html.parser")
    #find all divs with class 'lister-item mode-detail' on the page
    #grab each celebrity name
    containers = page_soup.findAll("div", {"class": "lister-item mode-detail"})
    print("\n")
    print(
        str(len(containers)) + " records found and stored in the database.csv file")
    #grabbing only the actors' images
    imgs = page_soup.find_all("div", {"class": "lister-item-image"})
    links = []
    for imge in imgs:
        link = imge.a.img.get('src')
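        # The loop body is truncated in the original; collect the link and then
        # save each poster into the images folder created above. The numbered
        # file names and the urlretrieve call are assumptions.
        links.append(link)

    from urllib.request import urlretrieve
    for idx, img_link in enumerate(links):
        urlretrieve(img_link, "database/images/" + str(idx) + ".jpg")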
Example #19
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup
"""
from selenium import webdriver

def render_page(url):
    driver = webdriver.Chrome()
    driver.get(url)
    r = driver.page_source
    return r


    r = render_page(myurl)
"""
myurl = 'https://ktu.edu.in'
ucli = ureq(myurl)
page_html = ucli.read()
ucli.close()

page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div", {"class": "latest-news"})

for container in containers:
    title_container = container.findAll("li", {"style": ""})
    title_time = title_container[0].label.text[0:10] + " " + title_container[
        0].label.text[24:28]
    title_content = title_container[0].a.text
    print(title_time + " | " + " Announcement :" + title_content)
"""
filename='clgscrap.csv'
f= open(filename,"w+")
"""
Example #20
        data_json += [{
            "name": product_name,
            "discription": product_discription,
            "colors": product_colors,
            "price": product_price
        }]

        # if i < len(grides):
        #     data_csv =  data_csv + ','
        i += 1


if __name__ == '__main__':

    for x in my_url:
        uclient = ureq(my_url[x])
        page_html = uclient.read()
        uclient.close()
        pagesoup = soup(page_html, "html.parser")
        grides = pagesoup.findAll("div", {"class": "grid-item-info"})
        print('product category = ', x)
        print('number of products = ', len(grides))

        selection(grides, x)

    #file save
    with open(CSV_FILE, 'w') as csv_f:
        csv_f.write(data_csv)

    with open(JSON_FILE, 'w') as json_f:
        json_f.write(json.dumps(data_json) + '\n')
Example #21
def game_soup(gameId):
    match_url = 'http://www.espn.com/nfl/game?gameId=' + str(gameId)
    u_client = ureq(match_url)
    page_html = u_client.read()
    u_client.close()
    return Soup(page_html, 'html.parser')
Example #22
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup
import sqlite3, time

conn = sqlite3.connect('thehyliadatabasesql.db')
c = conn.cursor()


thehylia = "https://anime.thehylia.com/"
music = "https://anime.thehylia.com/soundtracks/browse/all"
animusic = "https://anime.thehylia.com/downloads/browse/all"

#Music first

uclient = ureq(music)
soup_html = uclient.read()
uclient.close()
htmlsoup = soup(soup_html, "html.parser")

content = htmlsoup.find("table", {"class":"blog"})
content = content.find("p", {"align":"left"})
content = content.findAll("a")

c.execute("CREATE TABLE IF NOT EXISTS albums (album_id INT, url TEXT, name TEXT, unix INT)")

album_id = 0

for link in content:
    unix = time.time()
    global album_id
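    # The loop body is truncated in the original; a minimal completion sketch
    # mirroring the CREATE TABLE statement above.
    c.execute("INSERT INTO albums (album_id, url, name, unix) VALUES (?, ?, ?, ?)",
              (album_id, link["href"], link.text, unix))
    album_id += 1

conn.commit()
conn.close()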
Example #23
# Import Libraries
import pandas as pd
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup

# URL
url = 'https://www.optikmelawai.com/frame?category=01&gender=3+5&page=1'

# Opening up connection, grabbing the page
uClient = ureq(url)
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each product
containers = page_soup.findAll("div",{"class":"pad-box2"})

# Creating dataset
titles = []
brands = []
prices = []
discount_prices = []

# loop each product in page
for container in containers:
    title = container.a["title"]
    titles.append(title)

    brand_container = container.findAll("p",{"class":"jdl-prod"})
Example #24
# coding: utf-8

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq
import pandas as pd

my_url= 'https://www.flipkart.com/search?q=mi&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off' #link of website to be scraped

client = ureq(my_url)               #giving link to request 
page_html = client.read()           #read the page html
client.close()

page_soup = soup(page_html, "html.parser")  #parsing the html of page

containers = page_soup.findAll("div", {"class" :"_3O0U0u"})     #HTML tag that contain the needed information
print("The number of components is ")
print (len(containers))                #no. of components in the tag 
#print(soup.prettify(containers[0]))    #prettify the html and print the specified component

n = int(input("How many products to list? "))
for i in range(n):
    container = containers[i]
    print(container.div.img["alt"])      #print the attribute needed(name) of the component

    price = container.findAll("div", {"class":"_1vC4OE _2rQ-NK"})  #tag and class containing price
    print(price[0].text)
#file= "getdata.csv"
Example #25
# Import statements
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup
from urllib.request import Request
import json


# Getting URL
my_url = input("Enter genius URL of artist: ")

# Setting a known browser user agent
req = Request(my_url, headers={'User-Agent': 'Mozilla/5.0'})

# Opening up connection, grabbing the page
u_client = ureq(req)
page_html = u_client.read()
u_client.close()

# Does HTML parsing
page_soup = soup(page_html, "html.parser")

# Instantiating json to store lyrics
song_json = {}
song_json["Lyrics"] = []

# Grabs each songs from artist page
containers = page_soup.findAll("div", {"class": "mini_card_grid-song"})

# For each song, go to URL and grab lyrics
for container in containers:
    song_url = container.a["href"]
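    # The loop body is truncated in the original; a minimal sketch of the likely
    # next step. The 'lyrics' class name is an assumption about the page markup.
    song_req = Request(song_url, headers={'User-Agent': 'Mozilla/5.0'})
    song_soup = soup(ureq(song_req).read(), "html.parser")
    lyrics_div = song_soup.find("div", {"class": "lyrics"})
    if lyrics_div is not None:
        song_json["Lyrics"].append(lyrics_div.get_text(strip=True))

# Dump the collected lyrics with the json module imported above.
print(json.dumps(song_json, indent=2))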
Example #26
carlist = []
filename = "ricardo_scraping_test_1.csv"

badbrands = (121, 141, 158, 154, 172, 181, 195, 198, 208, 255, 287, 293, 299,
             290, 341)
for x in range(1, 401):
    print('BRAND {}'.format(x))
    for z in range(1, 300):
        if x in badbrands: break
        print('PAGE {}'.format(z))
        my_url = ("https://auto.ricardo.ch/de/s?make=" + str(x) +
                  "&offer_type=classified&sort_type=registration_date&sort_order=asc&page="
                  + str(z))

        uClient = ureq(my_url)
        myData = uClient.read()
        uClient.close()  #without the parentheses the connection is never closed

        page_soup = soup(myData, "html.parser")

        articles = page_soup.findAll("a", {"class": "ric-article"}, href=True)

        #print(car.markestring,car.modelstring)

        for a in articles:
            if str(a['href'])[0] == '/':
                car = rst.run('https://auto.ricardo.ch' + str(a['href']))
            else:
                car = rst.run(str(a['href']))  #assumed counterpart of the branch above
Example #27
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup

import pandas as pd

import re
import nltk
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer 
nltk.download('stopwords')

containerlist = []
for i in range(25):  # number of result pages to fetch
    test11 = "https://www.trustpilot.com/review/www.sonetel.com?page=" + str(i)
    uclient = ureq(test11)
    page_html = uclient.read()
    uclient.close()
    page_soup = BeautifulSoup(page_html, "html.parser")
    container = page_soup.find_all("div", {"class": "review-body"})
    contain = container[0:-1]
    containerlist.extend(contain)
#    print(len(contain))
    
print(len(containerlist))

labels=['review1','review2','review3','review4','review5','review6','review7','review8','review9','review10',
Example #28
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup

my_url = 'https://accounts.google.com/signin'

uclient = ureq(my_url)  #opening a connection, grabbing a page
page_html = uclient.read()
uclient.close()

page_soup = soup(page_html, "html.parser")  #html parsing
search = page_soup.find_all('input', {'type': 'email'})
for result in search:
    # print(result, '\n')
    input_value = result.get('value')
    print('c', input_value, 'c')
Example #29
def download():
    # 'url' is expected to be defined at module level in the original script
    uclient = ureq(url).read()
    return uclient
Example #30
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup
import pandas


filename = 'products.csv'
f = open(filename,'w')
headers = 'brand, product_name, price, shipping\n'
f.write(headers)
url = "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38"
uclient = ureq(url)
page_html = uclient.read()
uclient.close()
page_soup = soup(page_html,'html.parser')
containers = page_soup.findAll('div',{'class':'item-container'})
for container in containers:
	brand = container.div.div.a.img['title']
	title = container.findAll('a',{'class':'item-title'})[0].text
	price_class = container.findAll('ul',{'class':'price'})[0]
	price = price_class.findAll('li',{'class':'price-current'})[0].strong.text
	shipping = container.findAll('li',{'class':'price-ship'})[0].text.strip()
	f.write(brand+',' + title.replace(',','|') + ',' + price + ',' + shipping + '\n')
f.close()

df_product = pandas.read_csv(filename)
print(df_product)

Example #31
enter_product = driver.find_element_by_xpath(
    '//*[@id="container"]/div/div[1]/div[1]/div[2]/div[2]/form/div/div/input'
).send_keys("iphone")

init_search = driver.find_element_by_xpath(
    '//*[@id="container"]/div/div[1]/div[1]/div[2]/div[2]/form/div/button')
init_search.click()

driver.set_page_load_timeout(15)  #timeout in seconds

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq

myurl = "https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=off&as=off"

uclient = ureq(myurl)  #opens the connection
req = uclient.read()  #read the complete html page and save in variable
uclient.close()  #close the connection

req = soup(req, 'html.parser')  #the html5lib parser also works here
containers = req.findAll('div', {'class': '_1UoZlX'})
#print (len(containers))

#print (soup.prettify(containers[0]))
container = containers[0]

name = container.findAll('div', {'class': '_3wU53n'})
#print (name[1].text)

price = container.findAll('div', {'class': '_1vC4OE _2rQ-NK'})
#print (price[0].text)