Пример #1
0
def scrape_news():
    """Scrape news-card headlines and article bodies from the page at the
    module-level ``url`` and insert them into the ``Scrapped_data`` table.

    Returns:
        bool: True if a DB connection was obtained and rows were committed,
        False if ``db.create()`` returned no connection.
    """
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    headline = sp.findAll('div', {'class': 'news-card-title'})
    article = sp.findAll('div', {'itemprop': 'articleBody'})

    con = db.create()
    if con is None:
        return False

    # One cursor for all inserts (the original opened a new cursor on every
    # iteration and referenced an undefined `curr` when the result list was
    # empty).
    curr = con.cursor()
    try:
        # zip pairs each headline card with its article body; this replaces
        # the manual j index and stops at the shorter list instead of raising
        # IndexError when the two result sets differ in length.
        for card, body in zip(headline, article):
            head = card.span.text
            link = p1 + card.a['href']  # p1 is the module-level URL prefix
            a = body.text
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                         (page, link, head, a))
        con.commit()
    finally:
        curr.close()
    return True
Пример #2
0
def scrape_money():
    """Scrape list items (headline, link, summary paragraph) from the page at
    the module-level ``url`` and insert them into the ``Scrapped_data`` table.

    Returns:
        bool: True if a DB connection was obtained and rows were committed,
        False if ``db.create()`` returned no connection.
    """
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    items = sp.findAll('li', {'class': 'clearfix'})

    con = db.create()
    if con is None:
        return False

    # Single cursor for the whole batch (the original opened a new cursor per
    # iteration and crashed on `curr.close()` when `items` was empty).
    curr = con.cursor()
    try:
        for item in items:
            headline = item.a['title']
            link = item.a['href']  # stored as-is; no prefix is applied here
            summary = item.p.text
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                         (page, link, headline, summary))
        con.commit()
    finally:
        curr.close()
    return True
Пример #3
0
def scrape_sports():
    """Scrape sports story cards (title, link, first paragraph) from the page
    at the module-level ``url`` and insert them into ``Scrapped_data``.

    Returns:
        bool: True if a DB connection was obtained and rows were committed,
        False if ``db.create()`` returned no connection.
    """
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    cards = sp.findAll('div', {'class': 'detail'})

    con = db.create()
    if con is None:
        return False

    # One cursor reused for all inserts (the original created one per loop
    # iteration and referenced an undefined `curr` when `cards` was empty).
    curr = con.cursor()
    try:
        for card in cards:
            headline = card.h2['title']
            article = card.p.text
            link = u + card.a['href']  # u is the module-level site prefix
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                         (page, link, headline, article))
        con.commit()
    finally:
        curr.close()
    return True
Пример #4
0
def scrape_space():
    """Scrape article names, links and synopses from the page at the
    module-level ``url`` and insert them into ``Scrapped_data``.

    Returns:
        bool: True if a DB connection was obtained and rows were committed,
        False if ``db.create()`` returned no connection.
    """
    r = requests.get(url)

    sp = bs(r.content, 'html5lib')
    heads = sp.findAll('h3', {'class': 'article-name'})
    body = sp.findAll('p', {'class': 'synopsis'})
    links = sp.findAll('a', {'class': 'article-link'})

    con = db.create()
    if con is None:
        return False

    # Single cursor for the batch (original opened one per iteration and hit
    # an undefined `curr` at close time when the page yielded no results).
    curr = con.cursor()
    try:
        # The original looped over a separate `div` list while indexing these
        # three lists with a manual counter — an IndexError whenever the
        # counts differed. Zipping the parallel lists iterates exactly the
        # data that is inserted.
        for head_tag, link_tag, body_tag in zip(heads, links, body):
            headline = head_tag.text
            link = link_tag['href']
            article = body_tag.text
            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                         (page, link, headline, article))
        con.commit()
    finally:
        curr.close()
    return True
Пример #5
0
def scrape_quora():
    """Scrape question links from the Quora Web-Development topic page, fetch
    each question page, and store (source, link, question href, first answer
    paragraph) rows in ``Scrapped_data``.

    Returns:
        bool: True if a DB connection was obtained and rows were committed,
        False if ``db.create()`` returned no connection.
    """
    url = "https://www.quora.com/topic/Web-Development"
    page = "https://www.quora.com"
    source = "Quora"

    r = requests.get(url)
    sp = bs(r.content, 'html5lib')
    question_links = sp.findAll('a', attrs={'class': 'question_link'})

    con = db.create()
    if con is None:
        return False

    # One cursor for all inserts (the original opened a cursor inside the
    # loop and crashed on `curr.close()` when no question links were found).
    curr = con.cursor()
    try:
        for qlink in question_links:
            qhref = qlink['href']
            final_link = page + qhref

            r1 = requests.get(final_link)
            sp1 = bs(r1.content, 'html5lib')
            paras = sp1.findAll(
                'p',
                attrs={'class': 'ui_qtext_para u-ltr u-text-align--start'})
            if not paras:
                # Question page with no visible answer paragraph — the
                # original raised IndexError here; skip it instead.
                continue
            text1 = paras[0].text

            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                         (source, final_link, qhref, text1))
        con.commit()
    finally:
        curr.close()
    return True
Пример #6
0
def scrape_newegg():
    """Scrape product containers (brand + price, product title, link) from the
    page at the module-level ``myurl`` and insert them into ``Scrapped_data``.

    Returns:
        bool: True if a DB connection was obtained and rows were committed,
        False if ``db.create()`` returned no connection (the original had no
        guard and crashed with AttributeError; siblings all return a bool).
    """
    r = requests.get(myurl)
    s = bs(r.content, 'html5lib')
    containers = s.findAll('div', {'class': 'item-container'})

    con = db.create()
    if con is None:
        return False

    # Single cursor for the whole batch (the original opened one per
    # iteration and referenced an undefined `curr` on an empty page).
    curr = con.cursor()
    try:
        for contain in containers:
            price = contain.find("li", "price-current").text.strip()
            brand = contain.find(
                "div", "item-branding").img["title"].strip() + "\n" + price
            item_anchor = contain.find("a", "item-title")
            product = item_anchor.text.strip()
            linkk = item_anchor['href']

            curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                         (source, linkk, brand, product))
        con.commit()
    finally:
        curr.close()
    return True