Example #1
import json
from urllib2 import urlopen as ureq


def ID(title_series):  # find the imdbID corresponding to a given TV series

    myurl1 = "http://www.omdbapi.com/?t=%s&apikey=8deaca41" % title_series  # OMDb API URL built from the title
    client1 = ureq(myurl1)

    page = json.load(client1)
    imdb_id = page['imdbID']  # IMDb ID used to build the scraping URL
    myurl = "https://www.imdb.com/title/%s/" % imdb_id

    return myurl  # the generated IMDb URL
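
A minimal usage sketch (not part of the original snippet), assuming the function above is in scope; "Fargo" is an arbitrary example title, and multi-word titles would need URL-encoding (e.g. urllib.quote) before being substituted into the OMDb URL:

if __name__ == '__main__':
    print ID("Fargo")  # prints the generated IMDb URL for the series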
Example #2
from urllib2 import urlopen as ureq
from urllib2 import Request as req
from urllib2 import HTTPError
from bs4 import BeautifulSoup as soup
import lxml  # not used directly; BeautifulSoup loads the "lxml" parser by name

agent = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 '
         '(KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19')
headers = {"User-Agent": agent}
url = 'https://www.google.com/search?q=pangolin'

request = req(url, headers=headers)  # send a browser User-Agent so Google serves the full results page
try:
    gold = ureq(request)
    page_soup = soup(gold, "lxml")
    divs = page_soup.findAll("div", {"class": "rc"})  # "rc" was Google's result-container class
    for div in divs:
        link = div.h3.a  # the result's title link
        print link.text
except HTTPError as e:
    print "HTTP error:", e.code
    exit(1)
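
Since urllib2 exists only in Python 2, a rough Python 3 port of the same fetch step is sketched below (urllib.request merged urlopen and Request); note that Google's result markup changes over time, so the "rc" class is not guaranteed to match current pages:

from urllib.request import urlopen, Request
from urllib.error import HTTPError
from bs4 import BeautifulSoup

agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # any browser-like User-Agent
request = Request('https://www.google.com/search?q=pangolin',
                  headers={'User-Agent': agent})
try:
    with urlopen(request) as response:
        page_soup = BeautifulSoup(response, 'lxml')
except HTTPError as e:
    print('HTTP error:', e.code)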
Example #3

import sys
from urllib2 import urlopen as ureq
from bs4 import BeautifulSoup as soup

# product = input("Please enter the product: ")
product = sys.argv[1]
fileName = product.title().replace(" ", "") + ".json"
product = product.replace(" ", "+").replace(",", "").title()

# upci = input("UPCI: ")


############ Searches mygrocerydeals.com ############


url = groceryUrlBeginning + product + groceryUrlEnd  # page URL (both URL fragments are defined elsewhere in the original script)

uclient = ureq(url)  # opening connection to website
pageHTML = uclient.read()  # reading HTML
uclient.close()
content = soup(pageHTML, "html.parser")

# grabs each product
itemContainers = content.findAll("div", {"data-type": "special"})

productName = []
size = []
price = []
dealEnd = []
storeName = []
pictureUrl = []

count = 0  # counter for going row by row
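
The snippet ends before the extraction loop; a hypothetical continuation is sketched below. The selector name is invented for illustration (the real one depends on mygrocerydeals.com's markup at the time):

for item in itemContainers:
    # NOTE: "product-name" is an illustrative guess, not the site's actual class
    name_tag = item.find("div", {"class": "product-name"})
    if name_tag is not None:
        productName.append(name_tag.text.strip())
    count += 1  # advance the row counter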
Example #4
def helper(List):  # scrape the airing status of each TV series in List

    status = []
    for series in List:
        status1 = []
        myurl = ID(series)  # IMDb URL built by the ID() helper from Example #1

        uclient = ureq(myurl, cafile=certifi.where())
        page_html = uclient.read()  # reading the content from myurl
        uclient.close()

        page_soup = soup(page_html, "html.parser")  # BeautifulSoup parse tree
        title = page_soup.h1

        status1.append("Title of series: %s" % title.text)  # scraped title of the TV series
        container = page_soup.findAll("div", {"class": "table full-width"})
        contain = container[0]

        sub = contain.findAll("div", {"class": "episode-widget-currentep"})
        if sub == []:  # no "current episode" widget: report the latest season instead

            final = contain.div.findAll("a")
            number = int(final[0].text)  # season number

            date = final[number].text
            DATE = int(date)  # year of release

            if DATE > 2018:
                status1.append("Upcoming season: %d" % number)
                status1.append("Airs in: %s" % date)  # year in which it will be released
            elif DATE == 2018:
                status1.append("Running season: %d" % number)
                status1.append("Aired in: %s" % date)  # year in which the season started streaming
            else:  # 2017 or earlier (the original tested DATE < 2017, silently skipping 2017)
                status1.append("The show has finished streaming all its episodes.")
                status1.append("Most recent season: %d" % number)  # last aired season
                status1.append("Aired in: %s" % date)  # year in which it was released
        else:  # a current episode is listed: report its air date and title
            subs = sub[0]
            dd = subs.findAll("div", {"class": "episode-widget-airdate"})
            ddtitle = subs.findAll("div", {"class": "episode-widget-title"})

            status1 = [contain.h4.text]  # widget heading (the original appended to the h4 Tag itself)
            status1.append(dd[0].text)
            status1.append(ddtitle[0].a.text)

        for element in status1:  # flatten this series' details into the overall list
            status.append(element)

    return status
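
A minimal usage sketch (again not from the original), assuming certifi, json, urlopen-as-ureq, BeautifulSoup-as-soup, and the ID() function from Example #1 are all in scope; the titles are arbitrary single-word examples:

if __name__ == '__main__':
    for line in helper(["Fargo", "Westworld"]):
        print line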
Example #5
# scrapes a website
from urllib2 import urlopen as ureq
from bs4 import BeautifulSoup as soup

my_url = 'http://www.goforevent.com/jntua_pixel18/cse_dashboard.php'

# opening the connection
uclient = ureq(my_url)
pagehtml = uclient.read()
uclient.close()

# html parser
page_soup = soup(pagehtml, 'html.parser')

# grabs all table rows
rows = page_soup.select('tr')
outfile = open('sample.txt', 'w')  # renamed from `file`, which shadows a builtin
total = 0  # renamed from `sum`, which shadows a builtin
for i in range(1, len(rows)):  # skip the header row
    rows_i = rows[i].select('td')
    # outfile.write(rows_i[2].text + " " + rows_i[3].text + "\n")
    total = total + float(rows_i[3].text)  # accumulate the value in the fourth column
outfile.write(str(total))
outfile.close()
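
A row whose fourth cell is missing or non-numeric would crash the float() call above; a slightly more defensive version of the same loop (a sketch, same logic plus a guard) might be:

total = 0
for row in rows[1:]:  # skip the header row
    cells = row.select('td')
    if len(cells) > 3:
        try:
            total += float(cells[3].text)
        except ValueError:
            pass  # ignore cells that do not hold a number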
Example #6
from urllib2 import urlopen as ureq
from bs4 import BeautifulSoup as soup
import lxml  # not used directly; BeautifulSoup loads the "lxml" parser by name

url = 'http://deschulz.net/plaincontent.html'
html = ureq(url)
page_soup = soup(html, "lxml")
body1 = page_soup.body
print(body1.text.strip())
Example #7
from urllib2 import urlopen as ureq
from bs4 import BeautifulSoup
import requests
import io
import json
from inflection import singularize

# Change csrf in security.js to false to use the script. Remember to change it back to true.

# create a set to make sure there are no duplicate recipe entries
recipe_set = set()
prepTimes = []

url2 = 'https://tasty.co/topic/one-pot'
uclient2 = ureq(url2)  # opens up connection, grabs web page
page_html = uclient2.read()
uclient2.close()  # close the client
parent_page = BeautifulSoup(page_html, 'html.parser')  # html parsing

# Put links to recipe pages in a list
links_container = parent_page.findAll('a',
                                      {'class': 'feed-item analyt-unit-tap'})

# First 20 links are on the page. The rest are loaded after clicking "Show more"
for i in range(20):
    # Parse page from <a> tag
    #print(links_container[i].get("href"))
    url_child = links_container[i].get('href')
    ch_client = ureq(url_child)  # opens up connection, grabs web page
    source = ch_client.read()  # read the page
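
The snippet is cut off inside the loop; following the read-close-parse pattern used for the parent page above, the loop body would presumably continue along these lines (a sketch, not the original code):

    ch_client.close()  # close the child-page connection
    child_page = BeautifulSoup(source, 'html.parser')  # parse the recipe page
    # ...recipe title, prep time, etc. would then be pulled from child_page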