def scrape_news():
    """Scrape news headlines + article bodies and insert them into the DB.

    Reads module-level globals: ``url`` (page to fetch), ``p1`` (link
    prefix, presumably the site base URL — TODO confirm), ``page``
    (source label stored with each row), plus ``requests``, ``bs``
    (BeautifulSoup) and ``db``.

    Returns:
        True when rows were inserted and committed, False when a DB
        connection could not be created.
    """
    r = requests.get(url)
    sp = bs(r.content, 'html5lib')
    headlines = sp.findAll('div', {'class': 'news-card-title'})
    articles = sp.findAll('div', {'itemprop': 'articleBody'})
    con = db.create()
    if con is None:
        return False
    # One cursor for the whole batch: the original opened a new cursor on
    # every iteration (leaking all but the last) and committed per row.
    curr = con.cursor()
    # zip pairs each headline card with its article body, replacing the
    # manual j index and protecting against unequal list lengths.
    for card, body in zip(headlines, articles):
        head = card.span.text
        link = p1 + card.a['href']
        curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                     (page, link, head, body.text))
    con.commit()
    curr.close()
    return True
def scrape_money():
    """Scrape money-section headlines, links and summaries into the DB.

    Reads module-level globals: ``url`` (listing page to fetch), ``page``
    (source label stored with each row), plus ``requests``, ``bs``
    (BeautifulSoup) and ``db``.

    Returns:
        True when rows were inserted and committed, False when a DB
        connection could not be created.
    """
    r = requests.get(url)
    sp = bs(r.content, 'html5lib')
    items = sp.findAll('li', {'class': 'clearfix'})
    con = db.create()
    if con is None:
        return False
    # Single cursor and single commit: the original opened a fresh cursor
    # on every iteration (leaking all but the last) and committed per row.
    curr = con.cursor()
    for item in items:
        headline = item.a['title']
        link = item.a['href']       # stored as-is; no prefix is applied here
        summary = item.p.text
        curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                     (page, link, headline, summary))
    con.commit()
    curr.close()
    return True
def scrape_sports():
    """Scrape sports-section stories (title, teaser, link) into the DB.

    Reads module-level globals: ``url`` (listing page to fetch), ``u``
    (link prefix, presumably the site base URL — TODO confirm), ``page``
    (source label stored with each row), plus ``requests``, ``bs``
    (BeautifulSoup) and ``db``.

    Returns:
        True when rows were inserted and committed, False when a DB
        connection could not be created.
    """
    r = requests.get(url)
    sp = bs(r.content, 'html5lib')
    stories = sp.findAll('div', {'class': 'detail'})
    con = db.create()
    if con is None:
        return False
    # One cursor + one commit for the batch; the original opened a cursor
    # per iteration (leaking all but the last) and committed per row.
    curr = con.cursor()
    for story in stories:
        headline = story.h2['title']
        article = story.p.text
        link = u + story.a['href']
        curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                     (page, link, headline, article))
    con.commit()
    curr.close()
    return True
def scrape_space():
    """Scrape space-news articles (name, link, synopsis) into the DB.

    Reads module-level globals: ``url`` (listing page to fetch), ``page``
    (source label stored with each row), plus ``requests``, ``bs``
    (BeautifulSoup) and ``db``.

    Returns:
        True when rows were inserted and committed, False when a DB
        connection could not be created.
    """
    r = requests.get(url)
    sp = bs(r.content, 'html5lib')
    heads = sp.findAll('h3', {'class': 'article-name'})
    bodies = sp.findAll('p', {'class': 'synopsis'})
    links = sp.findAll('a', {'class': 'article-link'})
    con = db.create()
    if con is None:
        return False
    # One cursor + one commit for the batch; the original opened a cursor
    # per iteration (leaking all but the last) and committed per row.
    curr = con.cursor()
    # zip walks the three parallel lists together; the original iterated a
    # fourth list ('div.content') and indexed these by j, which raised
    # IndexError whenever that list was longer than the others.
    for head, link_tag, body in zip(heads, links, bodies):
        curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                     (page, link_tag['href'], head.text, body.text))
    con.commit()
    curr.close()
    return True
def scrape_quora():
    """Scrape Quora Web-Development topic questions and first answers.

    For each question link on the topic page, fetches the question page
    and stores (source, full link, question href, first answer paragraph)
    in the DB. Uses module-level ``requests``, ``bs`` (BeautifulSoup) and
    ``db``; the URL constants are local to this function.

    Returns:
        True when scraping completed and rows were committed, False when
        a DB connection could not be created.
    """
    url = "https://www.quora.com/topic/Web-Development"
    page = "https://www.quora.com"
    source = "Quora"
    r = requests.get(url)
    sp = bs(r.content, 'html5lib')
    question_links = sp.findAll('a', attrs={'class': 'question_link'})
    con = db.create()
    if con is None:
        return False
    # One cursor + one final commit; the original opened a fresh cursor and
    # committed on every iteration, never closing the per-row cursors.
    curr = con.cursor()
    for q_link in question_links:
        q_href = q_link['href']
        final_link = page + q_href
        r1 = requests.get(final_link)
        sp1 = bs(r1.content, 'html5lib')
        paragraphs = sp1.findAll(
            'p', attrs={'class': 'ui_qtext_para u-ltr u-text-align--start'})
        # Skip questions with no answer text instead of raising IndexError
        # on paragraphs[0] as the original did.
        if not paragraphs:
            continue
        curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                     (source, final_link, q_href, paragraphs[0].text))
    con.commit()
    curr.close()
    return True
def scrape_newegg():
    """Scrape Newegg item listings (brand+price, product, link) into the DB.

    Reads module-level globals: ``myurl`` (listing page to fetch),
    ``source`` (source label stored with each row), plus ``requests``,
    ``bs`` (BeautifulSoup) and ``db``. Returns None (the original had no
    return value; kept for backward compatibility).
    """
    r = requests.get(myurl)
    s = bs(r.content, 'html5lib')
    containers = s.findAll('div', {'class': 'item-container'})
    con = db.create()
    # Guard missing in the original: every sibling scraper checks for a
    # failed db.create(); without it this crashed with AttributeError.
    if con is None:
        return
    # One cursor + one commit for the batch; the original opened a cursor
    # per iteration (leaking all but the last) and committed per row.
    curr = con.cursor()
    for contain in containers:
        price = contain.find("li", "price-current").text.strip()
        brand = contain.find(
            "div", "item-branding").img["title"].strip() + "\n" + price
        title_tag = contain.find("a", "item-title")
        product = title_tag.text.strip()
        curr.execute("INSERT INTO Scrapped_data VALUES(?,?,?,?)",
                     (source, title_tag['href'], brand, product))
    con.commit()
    curr.close()