def scrape():
    import os
    import re  # needed for the re.search() below; missing in the original imports
    import shutil
    from datetime import datetime

    import requests
    from bs4 import BeautifulSoup
    from django.conf import settings

    from news.models import Headline, UserProfile

    # user_p, created = UserProfile.objects.get_or_create(user=request.user)
    # user_p.last_scrape = datetime.now(timezone.utc)
    # user_p.save()

    url = "https://premierleague.com"
    session = requests.Session()
    session.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/79.0.3945.130 Safari/537.36"
    }
    # .content grabs the raw HTML of the news listing page
    content = session.get(url + "/news", verify=False).content
    soup = BeautifulSoup(content, 'html.parser')

    articles = soup.find_all("section", {"class": "featuredArticle"})
    for item in articles:
        url_suffix = item.find("a", {"class": "thumbnail thumbLong"})['href']
        # Build an absolute link unless the href is already absolute.
        news_link = url + url_suffix if not re.search('^https://', url_suffix) else url_suffix
        img_src = item.find("img")['src'].strip()

        new_headline = Headline()
        new_headline.url = news_link
        new_headline.pub_date = datetime.now()

        # Use img_src to download the actual image, then move it under MEDIA_ROOT.
        media_root_path = settings.MEDIA_ROOT
        local_fname = img_src.split("/")[-1].split("?")[0]
        try:
            if not local_fname.startswith("audioboomgraphics") and local_fname not in os.listdir(media_root_path):
                r = session.get(img_src, stream=True, verify=False)
                with open(local_fname, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        f.write(chunk)
                cur_img_abspath = os.path.abspath(local_fname)
                shutil.move(cur_img_abspath, media_root_path)
                new_headline.image = local_fname
            elif local_fname in os.listdir(media_root_path):
                new_headline.image = local_fname
        except Exception:
            pass

        info = get_summary(news_link)
        new_headline.title = info['title']
        new_headline.summary = info['summary']
        try:
            new_headline.save()
        except Exception:
            pass
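# The view above calls get_summary(news_link), which is not defined in this
# snippet. A minimal sketch, assuming it should return a dict with 'title' and
# 'summary' keys pulled from the article page; the selectors below are
# illustrative guesses, not the original implementation.
def get_summary(link):
    import requests
    from bs4 import BeautifulSoup

    html = requests.get(link, verify=False).content
    soup = BeautifulSoup(html, 'html.parser')
    title_tag = soup.find('title')
    first_para = soup.find('p')
    return {
        'title': title_tag.get_text(strip=True) if title_tag else '',
        'summary': first_para.get_text(strip=True) if first_para else '',
    }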
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://timesofindia.indiatimes.com/briefs"
    content = session.get(url, verify=False).content
    soup = BeautifulSoup(content, 'html.parser')

    News = soup.find_all('div', {"class": "brief_box"})
    for article in News:
        try:
            main = article.find('h2').find('a')
        except Exception as e:
            print(e)
            continue  # skip boxes without a headline link
        link = url + str(main['href'])
        title = main.text
        # image_src = article.find('a')
        # image_src = article.find('div', {"class": "posrel"})
        # image = image_src.find('img')['src']
        # print(image_src)

        new_headline = Headline()
        new_headline.title = title
        # new_headline.image = image
        new_headline.url = link
        new_headline.save()
    return redirect('../')
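# None of these snippets show the Headline model they save to, and the field
# set varies between them (title/url/image, plus description, text, summary,
# pub_date, date, leaning, etc. in individual scrapers). A minimal sketch of
# the common core, assuming a standard Django model in news/models.py:
from django.db import models


class Headline(models.Model):
    title = models.CharField(max_length=255)
    url = models.TextField()
    image = models.URLField(null=True, blank=True)

    def __str__(self):
        return self.title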
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")

    News = soup.find_all('article', {"class": "sc-1pw4fyi-5 RkwFH"})
    temp = ""  # fallback image, reused when an article has no <img> of its own
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        title = article.find('h4', {"class": "sc-1qoge05-0 eoIfRA"}).text
        img = article.find('img', {"class": "dv4r5q-2 iaqrWM"})
        if img is None:
            image_src = temp
        else:
            image_src = img['srcset'].split(' ')[0]
            temp = image_src

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=True).content
    soup = BSoup(content, "html.parser")

    News = soup.find_all('article', {"class": "sc-1pw4fyi-7 gDJTEP js_post_item"})
    for article in News:
        main = article.find_all('a')[0]
        title = article.find_all('h4')[0]
        link = main['href']
        images = main.find('img')
        if images is not None:
            if images.has_attr('srcset'):
                # Keep only the first candidate URL, up to and including ".jpg".
                image_src = str(images['srcset']).split(".jpg")[0]
                print('title: ', title.text)
                print('link: ', link)
                titlet = str(title.text)
                image_src = image_src + '.jpg'
                print('image_src: ', image_src)
                if link is not None and image_src is not None and title is not None:
                    new_headline = Headline(title=titlet, image=image_src, url=link)
                    new_headline.save()
    return redirect('news')
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")

    News = soup.find_all('article', {"class": "js_post_item"})
    for article in News:
        title = article.find_all('a', {"class": "js_link"})[-1].text
        link = article.find("a", {"class": "js_link"}).attrs["href"]
        image_src = article.find("a", {"class": "js_link"}).find("img")
        if image_src:
            # Prefer srcset; fall back to the lazy-loaded variant.
            try:
                image_src = image_src.attrs["srcset"].split(" ")[-4]
            except Exception:
                try:
                    image_src = image_src.attrs["data-expanded-srcset"].split(" ")[-4]
                except Exception:
                    continue
        else:
            continue

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.ynet.co.il/home/0,7340,L-8,00.html"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")

    News = soup.find_all('div', {"class": "str3s str3s_small str3s_type_small"})
    Titles = soup.find_all('div', {"class": "title"})
    TitlesText = [title.text for title in Titles]

    # Titles already stored, so the loop stops once it hits a known headline.
    new_headline_links = [article.title for article in Headline.objects.all()]

    i = 0
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['src']).split(" ")[0]
        if TitlesText[i] in new_headline_links:
            break
        if link.find("https") != -1:
            link2 = link
        else:
            link2 = "https://www.ynet.co.il/" + link
        link2 = link2.replace('#autoplay', '')

        articleContent = session.get(link2, verify=False).content
        print(link2)
        soup = BSoup(articleContent, "html.parser")

        new_headline = Headline()
        ok = "פורסם:"  # Hebrew for "Published:"
        # header = soup.find_all('div', {"class": "element B3 ghcite noBottomPadding"})[0]
        dates = soup.find_all('span', string=ok)
        print(dates)
        new_headline.date = dates[1].text
        new_headline.title = TitlesText[i]
        new_headline.url = link2
        new_headline.image = image_src
        # if new_headline.date != 'error#':
        #     new_headline.save()
        new_headline.save()
        i = i + 1
    return redirect("../")
def scrape(request):
    url = "https://www.theonion.com/"
    r = requests.get(url)
    soup = BSoup(r.content, 'html.parser')

    val = soup.find_all('article', {'class': "js_post_item"})
    for link in val:
        main = link.find('a')
        try:
            image_url = str(main.find('img')['data-srcset']).split(" ")[0]
            new_headline = Headline()
            new_headline.image = image_url
            new_headline.url = main['href']
            new_headline.title = link.find('h4').get_text()
            new_headline.save()
        except Exception:
            pass
    return redirect("../")
def scrape(request):
    # Scrape news articles from theonion.com
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")  # create a soup object

    News = soup.find_all('div', {"class": "curation-module__item"})
    for article in News:  # iterate over the curation items
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['srcset']).split(" ")[-4]
        title = main['title']

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
def scrape(request):
    Headline.objects.all().delete()
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }

    # freeCodeCamp news
    url = "https://www.freecodecamp.org/news/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class": "post-card"})
    for article in News:
        main = article.find_all('a')[0]
        link = "https://www.freecodecamp.org" + main['href']
        image_src = str(main.find('img')['src'])
        if "http" not in image_src:
            image_src = "https://www.freecodecamp.org" + image_src
        title = str(main.find('img')['alt'])

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()

    # Entrepreneur "coders" topic
    url2 = "https://www.entrepreneur.com/topic/coders"
    content = session.get(url2, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div', {"class": "hero"})
    for article in News:
        main = article.find_all('a')[0]
        link = "https://www.entrepreneur.com" + main['href']
        image_ = str(main.find('img')['src']).replace('&blur=50', '')
        title = str(main.find('img')['alt'])

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_
        new_headline.save()
    return redirect("../")
def scrape(request):
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")

    News = soup.find_all('article', {"class": "pg455-2 gegKxg js_post_item sc-1pw4fyi-7 dCsSCd"})
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['data-anim-src']).split(" ")[0]
        # No headline text in this markup, so derive a rough title from the URL slug.
        title = main['href'][30:80]

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("/")
def scrape(request):
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    url = "https://www.indiatoday.in/world"
    # for page in range(1, 5):
    #     url += '?page=%d' % page
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")

    # News = soup.find_all('div', {"class": "view-content"})
    News = soup.find_all('div', {"class": "catagory-listing"})
    for article in News:
        image_url = article.find('div', {"class": "pic"}).img['src']
        title = article.find('div', {"class": "detail"}).h2.a.contents[0]
        # url[:-6] strips the trailing "/world" to get the site root.
        link = str(url[:-6] + article.find('div', {"class": "detail"}).h2.a['href'])
        try:
            description = str(article.find('div', {"class": "detail"}).p.text)
        except Exception:
            description = str(article.find('div', {"class": "detail"}).p)

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_url
        new_headline.description = description
        try:
            new_headline.save()
        except IntegrityError as e:
            if 'unique constraint' in str(e):
                continue
    return redirect("../")
def scrape(request):
    session = requests.Session()
    session.headers = {
        'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    }
    url = 'https://www.stirileprotv.ro/'
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")

    News = soup.find_all('div', {"class": "curation-module__item"})
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['srcset']).split(" ")[-4]
        title = main['title']

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
def scrape(request):
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    url = "https://lite.cnn.com"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")

    News = soup.find('ul').find_all('li')
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        title = main.text
        try:
            new_headline = Headline()
            new_headline.title = title
            new_headline.url = url + link
            new_headline.save()
        except Exception as e:
            return HttpResponse(f"Failed {e}")
    return HttpResponse("Success Scraping Data")
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")

    # Candidate wrapper classes: sc-1pw4fyi-3 ziaet, sc-1qkakgd-0 eSMucW,
    # sc-1whp23a-1 kphRNd, a1de4o-5 cKrhTm
    News = soup.find_all('div', {"class": ["sc-1qkakgd-0 eSMucW", "sc-1whp23a-1 kphRNd"]})
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['srcset']).split(" ")[-4]
        title = "News"  # this markup carries no headline text

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
def scrape(request):
    Headline.objects.all().delete()
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    url = "https://www.theonion.com/latest"
    content = session.get(url).content
    soup = BSoup(content, "html.parser")

    News = soup.find_all('div', {"class": "cw4lnv-11 dFCKPx"})
    for article in News:
        main = article.find_all('a', href=True)
        linkx = article.find('a', {"class": "sc-1out364-0 hMndXN js_link"})
        link = linkx['href']
        imgx = main[0].find('img', src=True)
        image_src = imgx['srcset'].split(" ")[-4]
        titlex = article.find('h2', {"class": "sc-759qgu-0 cYlVdn cw4lnv-6 eXwNRE"})
        title = titlex.text

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    # This snippet fetches with urllib (assumed imported as
    # `import urllib.request as ur`); the requests session above is unused.
    html = ur.urlopen("http://www.sinovision.net/").read()
    soup = BSoup(html.decode("utf-8"), 'lxml')

    main_news = soup.find('div', {'class': 'centersection-r'})
    for news in main_news.find_all('li', {'class': 'rolltitle'}):
        title = news.find('a').get_text()
        main = news.find_all('a')[0]
        link = main['href']
        if Headline.objects.filter(url=link):
            continue  # already scraped
        elif link == '':
            continue
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.save()
    # The original had no return; a Django view must return a response.
    return redirect("../")
def scrape(request):
    session = requests.Session()
    session.verify = False
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://news.abplive.com/news"
    content = session.get(url, verify=True).content
    soup = BSoup(content, "html.parser")

    News = soup.find_all('div', {"class": "other_news"})
    for article in News:
        # Check if the headline already exists before saving it.
        if not Headline.objects.filter(title__iexact=article.a['title']).exists():
            print("not exist")
            new_headline = Headline()
            new_headline.title = article.a['title']
            new_headline.url = article.a['href']
            new_headline.image = article.img['data-src']
            new_headline.save()
    return redirect("../")
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://timesofindia.indiatimes.com/briefs"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")

    News = soup.find_all('div', {"class": "brief_box"})
    # Take up to 60 briefs, skipping every position where i % 6 == 4,
    # without indexing past what the page actually returned.
    for i in range(min(60, len(News))):
        if i % 6 == 4:
            continue
        article = News[i]
        link = "https://timesofindia.indiatimes.com" + article.h2.a['href']
        title = article.h2.text
        text = article.p.text

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.text = text
        new_headline.save()
    return redirect("../")
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    # content = session.get(url).text
    page = requests.get(url)
    html = page.text
    soup = BSoup(html, "html.parser")

    News = soup.find_all('article')
    for article in News:
        link = str(article.find('a')['href'])
        image_src = str(article.find('img')['srcset']).split(" ")[-4]
        title = str(article.find('h4').get_text())

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("/")
def scrape(request):
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")

    News = soup.find_all('div', {"class": "curation-module__item"})
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['srcset']).split(" ")[-4]
        title = main['title']

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")


# DataFlair
def news_list(request):
    headlines = Headline.objects.all()[::-1]
    context = {
        'object_list': headlines,
    }
    return render(request, "news/home.html", context)
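# How these views are wired into URL routes is not shown. A minimal sketch,
# assuming a news/urls.py that maps the scrape and news_list views above;
# the paths and route names here are illustrative, not from the original project.
from django.urls import path

from news import views

urlpatterns = [
    path('', views.news_list, name='home'),
    path('scrape/', views.scrape, name='scrape'),
]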
def get_news():
    STORIES = 50
    sites = 2
    per_site = STORIES // sites

    # Fox News politics section (tagged as leaning right).
    fox_html = requests.get('http://www.foxnews.com/politics')
    fox_soup = BeautifulSoup(fox_html.text, 'lxml')
    fox_list = fox_soup.find_all('div', class_='content article-list')
    fox_count = 0
    for lst in fox_list:
        fox = lst.find_all('article', class_='article')
        for article in fox:
            if fox_count < per_site:
                headline = Headline()
                pic = article.find('a').find('img')['src']
                url = article.find('a')['href']
                title = article.find('h4', class_='title').find('a').text
                time_ago = article.find('span', class_='time').text
                leaning = 'right'
                headline.img = pic
                if url[0] == '/':
                    headline.url = "http://www.foxnews.com" + url
                else:
                    headline.url = url
                headline.title = title
                headline.leaning = leaning
                headline.time_ago_str = time_ago
                headline.save()
                fox_count += 1

    # Politico politics section (tagged as leaning left).
    politico_html = requests.get('http://www.politico.com/politics')
    politico_soup = BeautifulSoup(politico_html.text, 'lxml')
    politico = politico_soup.find_all('article', class_='story-frag format-sm')
    politico_count = 0
    for article in politico:
        if len(article.find('a').text.split(" ")) > 4 and politico_count < per_site:
            headline = Headline()
            if article.find('img') is not None:
                pic = article.find('img')['src']
            else:
                pic = ""
            url = article.find('a')['href']
            title = article.find('a').text
            leaning = 'left'
            now = datetime.datetime.now()
            pub_datetime_str = article.find('time')['datetime']
            pub_datetime = datetime.datetime.strptime(pub_datetime_str, '%Y-%m-%d %H:%M:%S')
            # Minutes since publication; total_seconds() avoids the timedelta
            # .seconds pitfall, which ignores whole days.
            time_ago = int((now - pub_datetime).total_seconds() // 60)
            headline.img = pic
            headline.url = url
            headline.title = title
            headline.leaning = leaning
            headline.mins_ago = time_ago
            headline.save()
            politico_count += 1