Example No. 1
def scrape():
    import requests
    import re  # needed for the re.search() call below
    from bs4 import BeautifulSoup
    from django.conf import settings
    import os
    import shutil
    from news.models import Headline, UserProfile
    from datetime import datetime
    #user_p, created = UserProfile.objects.get_or_create(user=request.user)
    #user_p.last_scrape = datetime.now(timezone.utc)
    #user_p.save()
    url = "https://premierleague.com"
    session = requests.Session()
    session.headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
    }
    content = session.get(url + "/news",
                          verify=False).content  # .content grabs all html

    soup = BeautifulSoup(content, 'html.parser')
    articles = soup.find_all("section", {"class": "featuredArticle"})

    for item in articles:
        url_suffix = item.find("a", {"class": "thumbnail thumbLong"})['href']
        news_link = url + url_suffix if not re.search(
            '^https://', url_suffix) else url_suffix
        img_src = item.find("img")['src'].strip()

        new_headline = Headline()
        new_headline.url = news_link
        new_headline.pub_date = datetime.now()
        # use img_src to get the link,
        # then use the link to get the actual image,
        # and save the image in BASE_DIR/src/static

        media_root_path = settings.MEDIA_ROOT
        local_fname = img_src.split("/")[-1].split("?")[0]
        try:
            if not local_fname.startswith(
                    "audioboomgraphics") and local_fname not in os.listdir(
                        media_root_path):
                r = session.get(img_src, stream=True, verify=False)
                with open(local_fname, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        f.write(chunk)
                cur_img_abspath = os.path.abspath(local_fname)
                shutil.move(cur_img_abspath, media_root_path)
                new_headline.image = local_fname
            elif local_fname in os.listdir(media_root_path):
                new_headline.image = local_fname
        except Exception:  # skip images that fail to download or move
            pass
        info = get_summary(news_link)  # get_summary() is assumed to be defined elsewhere in the project
        new_headline.title = info['title']
        new_headline.summary = info['summary']
        try:
            new_headline.save()
        except Exception:  # e.g. a duplicate headline hitting a unique constraint
            pass
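
The remaining examples are view functions excerpted from larger Django projects, so their module-level imports are not shown. A minimal sketch of what they appear to assume, inferred from usage (BSoup is clearly an alias for BeautifulSoup; the app path is taken from Example No. 1):

# Module-level imports the excerpts below appear to assume (inferred from usage)
import requests
from bs4 import BeautifulSoup as BSoup
from django.shortcuts import redirect, render
from news.models import Headline  # app path as imported in Example No. 1

The Headline model itself is never shown; judging by the attributes the examples assign, it is roughly the following. This is a hypothetical sketch, not the authors' actual model:

# Hypothetical sketch of news/models.py; fields inferred from the examples below
from django.db import models

class Headline(models.Model):
    title = models.CharField(max_length=255)
    url = models.TextField()
    image = models.TextField(null=True, blank=True)
    pub_date = models.DateTimeField(null=True, blank=True)  # set in Example No. 1
    date = models.CharField(max_length=100, blank=True)     # set in Example No. 4
    summary = models.TextField(blank=True)                  # set in Example No. 1
    description = models.TextField(blank=True)              # set in Example No. 9
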
Example No. 2
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"

    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class": "js_post_item"})
    for article in News:
        title = article.find_all('a', {"class": "js_link"})[-1].text
        link = article.find("a", {"class": "js_link"}).attrs["href"]
        image_src = article.find("a", {"class": "js_link"}).find("img")
        if image_src:
            try:
                image_src = image_src.attrs["srcset"]
                image_src = image_src.split(" ")[-4]
            except (KeyError, IndexError):
                try:
                    image_src = image_src.attrs["data-expanded-srcset"]
                    image_src = image_src.split(" ")[-4]
                except (KeyError, IndexError, AttributeError):
                    # AttributeError covers the case where the first split failed
                    # after image_src had already been replaced by a string
                    continue
        else:
            continue
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
Example No. 3
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class": "sc-1pw4fyi-5 RkwFH"})
    temp = None  # fallback: reuse the previous article's image when one is missing
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        title = article.find('h4', {"class": "sc-1qoge05-0 eoIfRA"}).text
        News3 = article.find('img', {"class": "dv4r5q-2 iaqrWM"})
        if News3 is None:
            image_src = temp
        else:
            image_src = News3['srcset'].split(' ')[0]
            temp = image_src
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()

    return redirect("../")
Example No. 4
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)%22%7D"
    }
    url = "https://www.ynet.co.il/home/0,7340,L-8,00.html"

    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div',
                         {"class": "str3s str3s_small str3s_type_small"})
    Titles = soup.find_all('div', {"class": "title"})
    TitlesText = []
    for title in Titles:
        t = title.text
        TitlesText.append(t)

    i = 0
    existing_titles = []
    for article in Headline.objects.all():
        existing_titles.append(article.title)

    for article in News:
        main = article.find_all('a')[0]

        link = main['href']
        image_src = str(main.find('img')['src']).split(" ")[0]

        if TitlesText[i] in existing_titles:
            break  # everything from here on has already been scraped

        if link.startswith("https"):
            link2 = link
        else:
            link2 = "https://www.ynet.co.il/" + link

        link2 = link2.replace('#autoplay', '')
        articleContent = session.get(link2, verify=False).content
        print(link2)
        soup = BSoup(articleContent, "html.parser")

        new_headline = Headline()

        ok = "פורסם:"  # Hebrew for "Published:", used to locate the date element
        #header = soup.find_all('div', {"class":"element B3 ghcite noBottomPadding"})[0]
        dates = soup.find_all('span', string=ok)
        print(dates)

        new_headline.date = dates[1].text
        new_headline.title = TitlesText[i]
        new_headline.url = link2
        new_headline.image = image_src
        #if (new_headline.date != 'error#'):
        #    new_headline.save()
        new_headline.save()
        i = i + 1

    return redirect("../")
Example No. 5
def scrape(request):
    Headline.objects.all().delete()

    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }

    url = "https://www.freecodecamp.org/news/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class": "post-card"})
    for article in News:
        main = article.find_all('a')[0]
        link = "https://www.freecodecamp.org" + main['href']
        image_src = str(main.find('img')['src'])
        if "http" not in image_src:
            image_src = "https://www.freecodecamp.org" + str(
                main.find('img')['src'])
        title = str(main.find('img')['alt'])
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()

    url2 = "https://www.entrepreneur.com/topic/coders"
    content = session.get(url2, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div', {"class": "hero"})
    for article in News:
        main = article.find_all('a')[0]
        link = "https://www.entrepreneur.com" + main['href']
        image_ = str(main.find('img')['src'])
        image_ = image_.replace('&blur=50', '')
        title = str(main.find('img')['alt'])
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_
        new_headline.save()

    return redirect("../")
Example No. 6
def scrape(request):
    url = "https://www.theonion.com/"
    r = requests.get(url)
    soup = BSoup(r.content, 'html.parser')
    val = soup.find_all('article', {'class': "js_post_item"})
    for link in val:
        main = link.find('a')
        try:
            image_url = str(main.find('img')['data-srcset']).split(" ")[0]
            new_headline = Headline()
            new_headline.image = image_url
            new_headline.url = main['href']
            new_headline.title = link.find('h4').get_text()
            new_headline.save()
        except (TypeError, KeyError, AttributeError):
            # skip items missing an image, href, or <h4> title
            pass
    return redirect("../")
Example No. 7
def scrape(request): # scrape news articles from theonion.com
  session = requests.Session()
  session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
  url = "https://www.theonion.com/"
  content = session.get(url, verify=False).content
  soup = BSoup(content, "html.parser") # create a soup object
  News = soup.find_all('div', {"class":"curation-module__item"})
  for article in News: # iterate over the matched items
    main = article.find_all('a')[0]
    link = main['href']
    image_src = str(main.find('img')['srcset']).split(" ")[-4]
    title = main['title']
    new_headline = Headline()
    new_headline.title = title
    new_headline.url = link
    new_headline.image = image_src
    new_headline.save()
  return redirect("../")
Example No. 8
def scrape(request):
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class":"pg455-2 gegKxg js_post_item sc-1pw4fyi-7 dCsSCd"})
    for article in News:
        main = article.find_all('a')[0]

        link = main['href']
        image_src = str(main.find('img')['data-anim-src']).split(" ")[0]
        title = main['href'][30:80]  # crude: a slice of the URL slug stands in for the title
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("/")
Example No. 9
def scrape(request):
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    url = "https://www.indiatoday.in/world"

    #for page in range(1, 5):
    #    url += '?page=%d' % page

    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    #News = soup.find_all('div', {"class": "view-content"})
    #for article in News:

    News = soup.find_all('div', {"class": "catagory-listing"})

    for article in News:
        image_url = article.find('div', {"class": "pic"}).img['src']
        title = article.find('div', {"class": "detail"}).h2.a.contents[0]
        link = str(url[:-6] + article.find('div', {"class": "detail"}).h2.a['href'])
        try:
            description = str(article.find('div', {"class": "detail"}).p.text)
        except AttributeError:  # no <p> element under the detail block
            description = str(article.find('div', {"class": "detail"}).p)

        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_url
        new_headline.description = description

        try:
            new_headline.save()
        except IntegrityError as e:  # needs: from django.db import IntegrityError
            if 'unique constraint' in str(e):  # e.args rarely holds the phrase; match the message text
                continue

    return redirect("../")
Example No. 10
def scrape(request):
    session = requests.Session()
    session.headers = {
        'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    }
    url = 'https://www.stirileprotv.ro/'
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div', {"class": "curation-module__item"})
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['srcset']).split(" ")[-4]
        title = main['title']
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
Example No. 11
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    #   sc-1pw4fyi-3 ziaet, sc-1qkakgd-0 eSMucW, sc-1whp23a-1 kphRNd, a1de4o-5 cKrhTm
    News = soup.find_all(
        'div', {"class": ["sc-1qkakgd-0 eSMucW", "sc-1whp23a-1 kphRNd"]})
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['srcset']).split(" ")[-4]
        title = "News"
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
Example No. 12
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://news.abplive.com/news"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div', {"class": "other_news"})
    for article in News:
        # skip headlines that already exist
        if not Headline.objects.filter(
                title__iexact=article.a['title']).exists():
            new_headline = Headline()
            new_headline.title = article.a['title']
            new_headline.url = article.a['href']
            new_headline.image = article.img['data-src']
            new_headline.save()
    return redirect("../")
Example No. 13
def scrape(request):
  Headline.objects.all().delete()
  session = requests.Session()
  session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
  url = "https://www.theonion.com/latest"
  content = session.get(url).content
  soup = BSoup(content, "html.parser")
  News = soup.find_all('div', {"class":"cw4lnv-11 dFCKPx"})
  for article in News:
    main = article.find_all('a', href=True)
    linkx = article.find('a', {"class": "sc-1out364-0 hMndXN js_link"})
    link = linkx['href']
    imgx = main[0].find('img', src=True)
    image_src = imgx['srcset'].split(" ")[-4]
    titlex = article.find('h2', {"class": "sc-759qgu-0 cYlVdn cw4lnv-6 eXwNRE"})
    title = titlex.text
    new_headline = Headline()
    new_headline.title = title
    new_headline.url = link
    new_headline.image = image_src
    new_headline.save()
  return redirect("../")
Example No. 14
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    #content = session.get(url).text
    page = session.get(url)  # use the session so the User-Agent header is actually sent

    html = page.text
    soup = BSoup(html, "html.parser")
    News = soup.find_all('article')  # find_all is the modern name for findAll

    for article in News:
        link = str(article.find('a')['href'])  # href is a single URL; no split needed
        image_src = str(article.find('img')['srcset']).split(" ")[-4]
        title = str(article.find('h4').get_text())
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("/")
Example No. 15
def scrape(request):
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    url = "https://www.theonion.com/"

    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div', {"class": "curation-module__item"})
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['srcset']).split(" ")[-4]
        title = main['title']
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")  # outside the loop, so every article gets saved


# DataFlair 
def news_list(request):
    headlines = Headline.objects.all()[::-1]  # reverse so the newest headlines come first
    context = {
        'object_list': headlines,
    }
    return render(request, "news/home.html", context)
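
For completeness, a minimal URL configuration these views could plug into might look like the following. The route paths and names are assumptions for illustration, not taken from the examples above; they are merely consistent with redirect("../") returning from the scrape URL to the list page:

# Hypothetical news/urls.py; paths and route names are assumptions
from django.urls import path
from news import views

urlpatterns = [
    path('', views.news_list, name='home'),        # renders news/home.html
    path('scrape/', views.scrape, name='scrape'),  # runs a scrape, then redirect("../") lands on home
]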