Example #1
def getSeasons(show):

    my_url = "https://www.imdb.com/title/{0}/episodes?ref_=tt_ov_epl".format(show)

    # open connection and grab page
    uClient = uReq(my_url)

    # put into a variable
    page_html = uClient.read()

    # close connection
    uClient.close()

    #html parsing
    page_soup = soup(page_html, "html.parser")

    # finds how many seasons are in this show
    containers = page_soup.findAll("div", {"class": "seasonAndYearNav"})

    container = containers[0]
    # the location of the season information
    seasons = container.div.div.text.split()

    seasons.pop(0)
    ## If errors occur, the following lines may be helpful in determining problems
    # for i in seasons:
    #     print type(i)

    # find the length of the list to determine the number of seasons in a show
    Seasons = len(seasons)
    # print Seasons # This is the answer!
    return Seasons # This is the answer!
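All of these examples assume module-level aliases for urlopen and BeautifulSoup, as imported in the later examples (#24, #25, #27, #30). A minimal sketch of those assumed imports and a call, using a hypothetical IMDb title ID:

from urllib import urlopen as uReq   # Python 2, matching the later examples
from bs4 import BeautifulSoup as soup

# Hypothetical title ID; any IMDb "tt..." identifier works here
print(getSeasons("tt0000000"))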
Example #2
def get_soup_content(url):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()

    page_soup = soup(page_html, 'html.parser')

    return page_soup
def get_page_number(url):
    pageClient = uReq(url)
    page_html = pageClient.read()
    pageClient.close()
    page_soup = soup(page_html, 'html.parser')
    pages = page_soup.find('div', {'class':'search_paginator'}).ul.find('li', {'class' : 'last-page'}).a.decode_contents(formatter='html')

    return pages
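A possible way to call these two helpers, assuming a hypothetical search URL; the 'search_paginator' markup they parse appears to be the freesound.org search page used in the later examples:

search_url = 'https://www.freesound.org/search/?q=rain'   # hypothetical query URL
page_soup = get_soup_content(search_url)    # parsed page, ready for further scraping
last_page = get_page_number(search_url)     # contents of the last-page link
print(last_page)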
Example #4
def page_crawler(urls, visited, category_data, page_data,
                 incomplete_categories):

    count = 0
    files_made = 1
    G = nx.DiGraph()

    while len(urls) > 0 and count < 25000:

        if count % 1000 == 0 and count != 0:

            pages_pickle = open(
                STARTING_TOPIC + "_pages0" + str(files_made) + ".pickle", "wb")
            pickle.dump(page_data, pages_pickle, pickle.HIGHEST_PROTOCOL)
            pages_pickle.close()

            category_pickle = open(
                STARTING_TOPIC + "_categories0" + str(files_made) + ".pickle",
                "wb")
            pickle.dump(category_data, category_pickle,
                        pickle.HIGHEST_PROTOCOL)
            category_pickle.close()

            files_made += 1

        if urls[0] not in visited:

            print urls[0]
            page_list = []
            uClient = uReq(urls[0])
            page_html = uClient.read()
            uClient.close()

            fixed_name = fix_name(urls[0])
            current_category = category(fixed_name)
            page_soup = soup(page_html, "html.parser")
            get_subcategories(current_category, page_soup, urls, BASE_URL,
                              fixed_name, incomplete_categories, G)
            get_supercategories(current_category, page_soup, BASE_URL, G)
            page_list = get_pages(current_category, page_soup, page_list,
                                  fixed_name, BASE_URL, SUBPAGE_WEIGHT_FACTOR,
                                  G)
            category_data.append(current_category)
            page_data += page_list
            visited.append(urls[0])
            urls.pop(0)
            count += 1
            print "count is " + str(count) + ", cue is " + str(len(urls))
            print current_category.name + "[pages] : " + str(len(page_list))

            #print "current_weight is " + str(len(current_category.subcategories))

        else:
            urls.pop(0)

    return G
Example #5
def getCountry(url):            # gets a country
    # open connection, grab the page
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    # html parser
    page_soup = soup(page_html, "html.parser")
    # grab the name containers and pull the country names out with a regex
    containerName = page_soup.findAll("span",{"class":"heavy"})
    countryName = findall(r'/names/usage/(.*?)">',str(containerName))
    return(countryName[random.randrange(0,len(countryName))])
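getCountry (and the later name-scraping examples) call findall and random unqualified, so they assume module-level imports like the following; the URL here is hypothetical and only needs to contain /names/usage/... links:

import random
from re import findall                  # findall() is used unqualified above
from urllib import urlopen as uReq
from bs4 import BeautifulSoup as soup

print(getCountry("https://www.example.com/names/"))  # hypothetical listing page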
Example #6
def info(str1):
    my_link = str1
    uClient = uReq(my_link)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers_link = page_soup.findAll("ul")[2].findAll("li")
    # info_list is assumed to be a module-level list defined elsewhere
    for item in containers_link:
        info_list.append(item.text)
Example #7
def submissionCount():

    # Opening connection, grabbing the page (url is assumed to be a module-level variable)
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()

    #Parse the HTML
    page_soup = soup(page_html, "html.parser")
    submissionCount = page_soup.findAll("span", {"class": "span-6"})[1].text
    print(submissionCount)
    return submissionCount
def get_page_content(url, error_link_array):
    try:
        contentClient = uReq(url)
        content_html = contentClient.read()
        contentClient.close()

        print(url + ' -> getting content from here')

        if content_html == None:
            error_link_array.append(url)
            return None

        return content_html
    except:
        print(url + '   ->error Link!')
        return get_page_content(iriToUri(url), error_link_array)  # retry with the IRI percent-encoded
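Several of these examples fall back to an iriToUri helper that is not shown in the excerpt. A minimal sketch of what such a helper typically does, percent-encoding the non-ASCII parts of an IRI so urlopen accepts it (Python 2, to match the surrounding code):

import urllib
import urlparse

def iriToUri(iri):
    # Percent-encode the path and query so non-ASCII characters do not break urlopen
    parts = urlparse.urlsplit(iri)
    path = urllib.quote(parts.path.encode('utf-8'))
    query = urllib.quote(parts.query.encode('utf-8'), safe='=&')
    return urlparse.urlunsplit((parts.scheme, parts.netloc, path, query, parts.fragment))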
def sample_page_number(url):

    try:
        pageClient = uReq(url)
        page_html = pageClient.read()
        pageClient.close()
        page_soup = soup(page_html, 'html.parser')
        if page_soup.find('div', {'class': 'search_paginator'}) and len(
                page_soup.find('div', {'class': 'search_paginator'}).find_all('li', {'class': 'other-page'})) > 1:
            pages = page_soup.find('div', {'class': 'search_paginator'}).ul.find_all('li', {'class': 'other-page'})[
                -1].a.decode_contents(formatter='html')
            return pages
        else:
            return 1
    except:
        return sample_page_number(iriToUri(url))  # retry with the IRI percent-encoded
Example #10
def scraping(my_url, old_url, n, f, t):
    n=n+1
    print ("n: " + str(n))
    navigator = []
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    authors = page_soup.find_all("span", class_="visible-contributors")
    author = authors[0].a.string.encode("utf-8")
    containers = page_soup.findAll("div",{"class":"item-container"})
    #container = containers[0]
    #title_container = container.div.findAll("div",{"class":"notranslate_title"})
    #i = 0
    for container in containers:
        title_container = container.find_all("div", class_="book-detail-line")
        title = str(title_container[0].p.string.encode("utf-8"))
        title_url = container.find_all("a", class_="notranslate_title")
        url = title_url[0]["href"]
        autori = container.find_all("span", class_="contributor-name")
        autore =  str(autori[0].string.encode("utf-8"))   
        ratings = container.find_all("div", class_="star-rating")
        if ratings:
            rating =  ratings[0]["aria-label"]
            splited_rating = rating.split(" ")
            rating = str(splited_rating[1])
        else:
            rating = "null"
        prezzi = container.find_all("p", class_="price")
        try:
            prezzo =  str(prezzi[0].span.span.string.encode("utf-8"))
        except:
            prezzo = "gratis"
        header_url="https://www.kobo.com"
        #print(header_url+url)
        navigator.append(header_url+url)
        full_url = header_url + url
        f.write(my_url+ "," + full_url + "\n")
        t.write(full_url + ";" + title + ";" + autore + ";" + rating + ";" + prezzo + "\n")
        #i=i+1
    if not navigator:
        return old_url, my_url
    old_url = my_url
    rnd = random.randint(0,len(navigator)-1)
    my_url = navigator[rnd]
    print(my_url)
    return my_url, old_url
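scraping() returns the next randomly chosen book URL together with the page it came from, so it can drive a random walk. A possible driver loop, with a hypothetical starting URL and output file names:

my_url = "https://www.kobo.com/ebooks"   # hypothetical entry page
old_url = ""
f = open("links.csv", "w")               # page-to-page edges
t = open("books.csv", "w")               # url;title;author;rating;price rows
for step in range(10):
    my_url, old_url = scraping(my_url, old_url, step, f, t)
f.close()
t.close()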
def get_file_meta(url):
    try:
        sound_array = {}
        soundClient = uReq(url)
        sound_html = soundClient.read()
        soundClient.close()
        tags = []
        sound_soup = soup(sound_html, 'html.parser')
        sound_name = sound_soup.find('div', {'id': 'single_sample_header'}).getText()
        sound_description = sound_soup.find('div', {'id': 'sound_description'}).p.getText()
        sound_tag = sound_soup.find('ul', {'class': 'tags'}).find_all('li')
        for sound in sound_tag:
            tags.append(sound.getText())

        sound_tags = ','.join(tags)
        sound_download = sound_soup.find('div', {'id': 'download'}).a['href']
        sound_license = sound_soup.find('div', {'id': 'sound_license'}).a.getText()
        sound_type = \
        sound_soup.find('dl', {'id': 'sound_information_box'}).find_all(
            'dd')[0].getText()
        sound_duration = \
        sound_soup.find('dl', {'id': 'sound_information_box'}).find_all(
            'dd')[1].getText()
        sound_filesize = \
        sound_soup.find('dl', {'id': 'sound_information_box'}).find_all(
            'dd')[2].getText()
        sound_bitrate = \
        sound_soup.find('dl', {'id': 'sound_information_box'}).find_all(
            'dd')[3].getText()
        sound_channels = \
        sound_soup.find('dl', {'id': 'sound_information_box'}).find_all(
            'dd')[4].getText()
        sound_array['name'] = sound_name.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['description'] = sound_description.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['tags'] = sound_tags.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['license'] = sound_license.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['type'] = sound_type.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['duration'] = sound_duration.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['filesize'] = sound_filesize.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['bitrate'] = sound_bitrate.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['channels'] = sound_channels.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['download'] = 'https://www.freesound.org' + sound_download.encode('utf-8')

        return sound_array
    except Exception as e:
        print(e)
        return get_file_meta(iriToUri(url))  # retry with the IRI percent-encoded
Example #12
def jax_enter_artikel_authoren(author_name, anzahl_bisheriger_artikel):
    url_jax_enter = 'https://jaxenter.de/author/' + author_name
    jax_enter_Client = uReq(url_jax_enter)
    jax_enter_page_html = jax_enter_Client.read()
    jax_enter_Client.close()

    jax_enter_page_soup = soup(jax_enter_page_html, "html.parser")
    container_jax_enter = jax_enter_page_soup.findAll("div", {"class": "info"})
    author_name = jax_enter_page_soup.findAll("span", {"class": "author-name"})

    for container in container_jax_enter:
        info_text_container = container.a

        print author_name[0].text.strip() + ": " + info_text_container.text

    if len(container_jax_enter) > anzahl_bisheriger_artikel:
        print "Unter der URL gibt es einen neuen Artikel: " + url_jax_enter
Example #13
def one_de_prices(notebook_types):
    my_url = 'https://www.one.de/notebooks/' + notebook_types
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()

    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "product--info-box"})

    for container in containers:
        info_text_container = container.findAll("a",
                                                {"class": "product--title"})
        info_price_container = container.findAll(
            "div", {"class": "price--default-list"})

        print info_text_container[0].text.strip() + ": " + info_price_container[0].text.strip()
Example #14
def getWords(url):
    letters = ['A-B/','C-D/','E-G/','H-K/','L-N/','O-P/','Q-R/','S/','T/','U-Z/']
    numPages = [ 5, 7, 6, 4, 4, 5, 3, 5, 3, 3 ]
    p = 0
    wordList = []
    while p < len(numPages):
        for elem in range(1, numPages[p] + 1):
            uClient = uReq(url + letters[p] + '?page=' + str(elem))
            page_html = uClient.read()
            uClient.close()
            page_soup = soup(page_html, "html.parser")
            containerName = page_soup.findAll("ul",{"class":"result-list1 wordlist-oxford3000 list-plain"})
            pageWords = findall(r'definition">(.*?)</a>',str(containerName))
            for item in pageWords:
                wordList.append(item)
        p += 1
    return wordList
Example #15
def firstNameLst(url):          # returns a list of first names
    # open connection, grab the page
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    # html parser
    page_soup = soup(page_html, "html.parser")
    # grabs each item in the class called browsename
    # searches out the names and creates a list we can use to draw from randomly
    containersName = page_soup.findAll("div",{"class":"browsename"})
    to_string_name = ''
    if len(containersName) == 0:
        return ''
    else:
        for items in range(0, len(containersName)):
            to_string_name += str((containersName[items].b.a))
    # non-greedy capture needs a terminator; match the name up to its closing </a> tag
    first_name = findall(r'">(.*?)</a>', to_string_name)
    return first_name[random.randrange(0, len(first_name))]
Example #16
def findNumPages(url):          # check the url to see if there are multiple pages
    # the path part of the current url, used to match the page-number option values
    # e.g. value="/names/usage/english/13">page 13</option></select>
    container_page_ref = url[30:] + '/'   # assumes a 30-character base URL
    # open connection, grab the page
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    # html parser
    page_soup = soup(page_html, "html.parser")
    # grabs each page option and finds the largest page number
    containersPage = str(page_soup.find("select",{"name": "page"}))
    page_num = findall(r'{0}(.*?)">page '.format(container_page_ref), containersPage)
    #checks len of list before conversion to integers
    if len(page_num) > 0:
        page_num_max = max(map(int, page_num))
        return page_num_max
    else:
        return ''
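findNumPages slices url[30:], so it assumes a base URL of exactly 30 characters. "https://www.behindthename.com/" fits and matches the /names/usage/... links in the other name examples, but the site is not named in this excerpt, so the call below is an assumption:

# Assumed 30-character base URL plus a usage page; adjust if the real base differs
print(findNumPages("https://www.behindthename.com/names/usage/english"))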
Example #17
def mainfun(my_url):
	uClient = uReq(my_url)
	page_html = uClient.read()
	uClient.close()
	page_soup = soup(page_html, "html.parser")
	# print page_soup
	product_available=""
	product_name = page_soup.findAll("h1",{"class":"_3eAQiD"})[0].text
	product_price = page_soup.findAll("div",{"class":"_1vC4OE _37U4_g"})[0].text
	try:
		product_available = page_soup.findAll("div",{"class":"_3xgqrA"})[0].text
	except:
		product_available = "Available"
	update_time = time.asctime(time.localtime(time.time()))
	print "Product Name: "+ product_name
	print "Product Cost: "+ product_price
	print "Availablity : "+ product_available
	print "Product Url : "+ my_url
	print "Update Time : "+ update_time
Example #18
def recupere():
    my_url = 'https://play.google.com/store/search?q=toutes%20les%20applications%20camerounaise'
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html,"html.parser")
    dts = page_soup.find_all("div",{"class":"details"})
    liste = []
    for detail in dts :
        
        title_detail = detail.find_all("a",{"class":"title"})
        title_name = title_detail[0].text

        description_detail = detail.find_all("div",{"class":"description"})
        description = description_detail[0].text
        

        price_detail = detail.find_all("span",{"class":"display-price"}) 
        price = price_detail[0].text.strip()
        descrip = (title_name, description, price)
        liste.append(descrip)
    return liste
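recupere() (French for "retrieve") returns a list of (title, description, price) tuples; a short usage sketch:

for title, description, price in recupere():
    print(title + " - " + price)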
def sample_more_search_result(url):

    try:
        sampleClient = uReq(url)
        sample_html = sampleClient.read()
        sampleClient.close()

        sample_soup = soup(sample_html, 'html.parser')

        sample_result = sample_soup.find('div', {'id': 'wrapper'}).find('div', {'id': 'container'}).find('div', {
            'id': 'content_full'}).find_all('div', {'class': 'sample_more_search_results'})

        samples = []

        for sample in sample_result:
            if sample != None:
                samples.append('https://www.freesound.org' + sample.a['href'])

        return samples
    except Exception as e:
        print(e)
        return sample_more_search_result(iriToUri(url))  # retry with the IRI percent-encoded
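These two freesound helpers can be chained: search-result URLs from sample_more_search_result feed get_file_meta. A sketch with a hypothetical search query:

for sample_url in sample_more_search_result('https://www.freesound.org/search/?q=rain') or []:
    meta = get_file_meta(sample_url)
    if meta:
        print(meta['name'] + " -> " + meta['download'])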
Example #20
def main():
    # Argument check
    if len(sys.argv) != 2:
        sys.exit("Usage: python web_scrape.py trialID")

    # Define URL
    url = "https://clinicaltrials.gov/ct2/show/" + sys.argv[1]

    # Open connection and grab html
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()

    # Write html to file 'html'
    fp = open("results/html", "w+")
    fp.write(page_html)
    fp.close()

    # Open 'html' for reading and new file 'hits' for writing
    fp = open("results/html", 'r')
    new_fp = open("results/hits", "w+")
    copy = False

    # Copy the text between "Criteria" and the location section, stripping the HTML tags
    for line in fp:
        if line.strip() == "<div class=\"header3\" style=\"margin-top:2ex\">Criteria</div>":
            copy = True
        elif line.strip() == "<!-- location_section -->":
            copy = False
        elif copy:
            # Regex to strip all html tags (everything between angle brackets)
            cleanr = re.compile('<.*?>')
            cleantext = re.sub(cleanr, '', line)
            new_fp.write(cleantext.strip().lstrip())

    # Close 'hits' and 'html'
    new_fp.close()
    fp.close()
Example #21
# disabled block from the source script:
# links = k.get_attribute('href')
# print links
# driver.get(links)

links = []  # result links collected below (assumed; defined earlier in the full script)

urls=driver.find_elements_by_css_selector('div.srg a')
#urls=driver.find_element_by_tag_name('h3').findNext('a');
#k=urls.find_element_by_css_selector('a')
for l in urls:
    links.append(l.get_attribute('href'))
print links

my_url=links[0]
my_url2=links[4]
uClient=uReq(my_url)
uClient2=uReq(my_url2)
page_html=uClient.read()
page_html2=uClient2.read()

# for getting all the things on the page
f=uReq(my_url)
g=soup(page_html,"html.parser")
s=g.get_text()
#//
uClient.close()
page_soup=soup(page_html,"html.parser")
page_soup2=soup(page_html2,"html.parser")
#containers=page_soup.findAll(text="introduction")
#print containers
Example #22
def retriever_soup(my_url):
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    return (page_soup)
Example #23
my_url = [
    'https://www.beatport.com/genre/tech-house/11/top-100',
    'https://www.beatport.com/genre/house/5/top-100',
    'https://www.beatport.com/genre/progressive-house/15/top-100',
    'https://www.beatport.com/genre/funk-soul-disco/40/top-100',
    'https://www.beatport.com/genre/indie-dance-nu-disco/37/top-100',
    'https://www.beatport.com/genre/funky-groove-jackin-house/81/top-100',
    'https://www.beatport.com/genre/leftfield-house-and-techno/80/top-100',
    'https://www.beatport.com/genre/dj-tools/16/top-100',
    'https://www.beatport.com/genre/minimal-deep-tech/14/top-100',
    'https://www.beatport.com/genre/techno/6/top-100'
]
# opening up the connection, grabbing the page

for url in my_url:

    uClient = uReq(url)
    # this will offload our content into a variable
    page_html = uClient.read()
    # closes our client
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    containers = page_soup.findAll("li",
                                   {"class": "bucket-item ec-item track"})

    print(url)

    conn = sqlite3.connect('Beatscrape.db')
    cursor = conn.cursor()
Example #24
from urllib import urlopen as uReq
from bs4 import BeautifulSoup as soup
if __name__ == '__main__':

    pages = []
    for i in range(1, 100):
        my_url = 'https://www.monster.se/jobb/sok/Data-IT_4?intcid=swoop_BrowseJobs_Data-IT&page={0}'.format(
            i)
        pages.append(my_url)
    for my_url in pages:
        try:
            uClient = uReq(my_url)
            pageHtml = uClient.read()
            uClient.close()
            page_soup = soup(pageHtml, "html.parser")
            print page_soup.h1.text.strip()
            containers = page_soup.findAll("article",
                                           {"class": "js_result_row"})
            for container in containers:
                job_title = container.findAll("div", {"class": "jobTitle"})
                print job_title[0].text.strip()
                company = container.findAll("div", {"class": "company"})
                print company[0].text.strip()
                location = container.findAll("div", {"class": "location"})
                print location[0].text.strip()
                print('-------------------------------')
        except AttributeError:
            break
    pages_stepstone = []
    for i in range(1, 100):
        my_url_stepstone = 'https://www.stepstone.se/lediga-jobb-i-hela-sverige/data-it/sida{0}/'.format(
Example #25
from urllib import urlopen as uReq
from bs4 import BeautifulSoup as soup
import re

jobs_url = 'https://www.indeed.com/jobs?q=web+developer&l=Roanoke%2C+TX'

#opening connection and grabbing the page
uClient = uReq(jobs_url)
page_html = uClient.read()
uClient.close()

#html parsing
page_soup = soup(page_html, "html.parser")

#grab all divs with a class of result
results = page_soup.findAll("div", {"class": "result"})
#print len(results)

filename = "jobs.csv"
f = open(filename, "w")

headers = "Title, Company, Location, Experience, Link \n"

f.write(headers)

for result in results:
    title = result.a["title"]

    company = result.findAll('span', {'class': 'company'})
    company_name = company[0].text.strip()
Example #26
    return int(s) / 20   # tail of strToNum(), which is called below to derive the page limit


for semt in semts:
    page_url = "https://www.hurriyetemlak.com/" + semt + "-satilik"

    buildings = []
    count = 0
    page_limit = 2
    page = 1
    while (page != page_limit):
        pagesUrl = page_url + "?page=" + str(page)
        print("Semt: " + semt + " Sayfa : " + str(page))

        # opens the connection and downloads the html for the current results page
        uClient = uReq(pagesUrl)

        page_soup = soup(uClient.read(), "html.parser")
        uClient.close()
        rows = page_soup.findAll("div",
                                 {"class": "list-item timeshare clearfix"})

        if (page == 1):
            numberofPost = page_soup.findAll("strong", {"data-ads-count": ""})
            page_limit = strToNum(numberofPost[5].text)

        for r in rows:
            suburl = "https://www.hurriyetemlak.com" + r.a["href"]
            count += 1
            try:
                RuClient = uReq(suburl)
Example #27
# removed urllib.request per stackoverflow
from urllib import urlopen as uReq
from bs4 import BeautifulSoup as soup
import sqlite3

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20cards'

# opening up the connection, grabbing the page
uClient = uReq(my_url)
# this will offload our content into a variable
page_html = uClient.read()
# closes our client
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})

# ------ commenting this out .... replacing with create database
#filename = "products.csv"
#f = open(filename, "w")

#headers = "brand, product_name, shipping\n"

#f.write(headers)

#for container in containers:
#	brand = container.div.div.a.img["title"]
Example #28
def getWebsite(url):
	
	page_html = uReq(url)
	# BeautifulSoup accepts the file-like response object directly, so .read() is skipped here
	page_soup = soup(page_html, "html.parser")
	findToday(page_soup)
Example #29
all_links = []
country_list = [
    'african', 'america', 'arabic', 'australian', 'christian', 'english',
    'french', 'german', 'indian', 'iranian', 'irish'
]
gender_list = ['boy', 'girl']
letter = [
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
]
my_url = 'https://www.babynamesdirect.com/baby-names'
for i in country_list:
    for j in gender_list:
        for k in letter:
            new_url = my_url + "/" + i + "/" + j + "/" + k
            uClient = uReq(new_url)
            page_html = uClient.read()
            uClient.close()
            page_soup = soup(page_html, "html.parser")
            containers_outer_div = page_soup.findAll("small")
            # pages_number = containers_outer_div[-1].string
            if len(containers_outer_div) != 0:
                pages_number = containers_outer_div[-1].string
                pages_number = pages_number.split("of ")
                lastpage = pages_number[-1]
                lastpage = int(lastpage) + 1
                for every in range(1, lastpage):
                    inner_new_url = new_url + "/" + str(every)
                    print(inner_new_url)
                    all_links.append(inner_new_url)
            else:
Example #30
from urllib import urlopen as uReq
from bs4 import BeautifulSoup as soup

myURL = 'https://www.newegg.ca/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphic%20cards'
#opening connection
uClient = uReq(myURL)
page_HTML = uClient.read()
uClient.close()
#parse the page_HTML using html parser
page_soup = soup(page_HTML, "html.parser")
containers = page_soup.findAll("div", {"class": "item-info"})

filename = "products.csv"
f = open(filename, "w")
headers = "brand, product_name, shipping\n"
f.write(headers)

for container in containers:
    brand = container.div.a.img["title"]

    title_container = container.findAll("a", {"class": "item-title"})
    productName = title_container[0].text

    shipping_container = container.findAll("li", {"class": "price-ship"})
    shippingPrice = shipping_container[0].text.strip()

    print("brand : " + brand)
    print("productName : " + productName)
    print("shippingPrice : " + shippingPrice)
    f.write(brand + "," + productName.replace(",", "| ") + "," +
            shippingPrice + "\n")