Example #1
def nyt(query_subject):
    # Collect New York Times result URLs for the query.
    links = []
    for url in search_news(query_subject + ' site:https://www.nytimes.com',
                           pause=rando(),
                           stop=stop_limit):
        links.append(url)

    article_list = []
    target_article = []

    # Fetch each article, skipping links that fail to connect.
    for link in links:
        try:
            target_article.append(requests.get(link))
        except requests.exceptions.ConnectionError:
            print('connection error, skipping link')

    # Parse each response into a structured article record.
    for request in target_article:
        article = {
            'Title': '',
            'Authors': [],
            'Text': [],
            'Date': '',
            'Publication': 'New York Times'
        }

        article['Title'] = get_title_nyt(request)
        article['Authors'] = get_authors_nyt(request)
        article['Text'] = wordlist(get_content_nyt(request))

        article_list.append(article)

    return article_list
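This snippet leans on several helpers that are not shown: search_news (apparently the google search package that Example #5 imports), plus rando, stop_limit, wordlist, and the get_*_nyt parsers. A minimal, purely illustrative sketch of what the missing pieces might look like, assuming BeautifulSoup handles the parsing:

import random

from bs4 import BeautifulSoup

stop_limit = 10  # assumed cap on the number of search results


def rando():
    # Assumed: a random pause (in seconds) between search requests,
    # to avoid hammering the search endpoint.
    return random.uniform(2.0, 5.0)


def get_title_nyt(response):
    # Assumed: pull the page title out of a fetched requests.Response.
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.title.get_text(strip=True) if soup.title else ''


def wordlist(text):
    # Assumed: split article text into a flat list of words.
    return text.split()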
Example #2
def hp(query_subject):
    # Collect Huffington Post result URLs for the query.
    links = []
    for url in search_news(query_subject + ' site:http://www.huffingtonpost.com',
                           pause=rando(),
                           stop=20):
        links.append(url)

    article_list = []
    target_article = []

    # Fetch each article, skipping links that fail to connect.
    for link in links:
        try:
            target_article.append(requests.get(link))
        except requests.exceptions.ConnectionError:
            print('connection error, skipping link')

    # Parse each response into a structured article record.
    for request in target_article:
        article = {
            'Title': '',
            'Authors': [],
            'Text': [],
            'Date': '',
            'Publication': 'Huffington Post'
        }

        article['Text'] = wordlist(get_content_hp(request))
        article['Title'] = get_title_hp(request)
        article['Date'] = get_date_hp(request)
        article['Authors'] = get_authors_hp(request)
        article_list.append(article)

    return article_list
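Because nyt() and hp() return records with identical keys, a caller can merge their results directly. A short usage sketch, assuming the functions above and their helpers are defined:

articles = nyt('election') + hp('election')
for a in articles:
    print(a['Publication'], '-', a['Title'])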
Example #3
def main():
    # Create the file in which to store the scraped articles.
    article_content = open('webscrapertwo.json', 'w')

    links = []

    subject = input("What do you want to look up? ")
    for url in search_news(subject + ' site:https://www.nytimes.com', stop=5):
        links.append(url)

    target_article = []

    # Fetch each article, skipping links that fail to connect.
    for link in links:
        try:
            target_article.append(requests.get(link))
        except requests.exceptions.ConnectionError:
            print('connection error, skipping link')

    # Collect every record first: calling json.dump once per article would
    # concatenate separate JSON objects into a file json.load cannot parse.
    article_list = []

    for request in target_article:
        article = {
            'Title': '',
            'Authors': [],
            'Text': '',
            'Date': '',
            'Publication': 'New York Times'
        }

        article['Title'] = get_title_nyt(request)
        article['Authors'] = get_authors_nyt(request)
        article['Text'] = get_content_nyt(request)
        article_list.append(article)

    json.dump(article_list, article_content, indent=4)
    article_content.close()
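Dumping the collected list once keeps the output valid JSON, so it can be read back with a single json.load call:

import json

with open('webscrapertwo.json') as f:
    articles = json.load(f)
print(len(articles), 'articles loaded')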
Example #4
def fetch_news(query):
    print('Getting result...')
    result = google.search_news(query, num=4, stop=4)
    print('->Got result!')
    # Materialize the returned URLs.
    article_urls = list(result)
    if article_urls:
        # Return one of the matching articles at random.
        return random.choice(article_urls)
    return "Could not find a news article. Try typing a different query."
Example #5
from google import search_news
import urllib.request

from bs4 import BeautifulSoup

text_f = open("/home/suryansh/Desktop/debatenight", "w")
for url in search_news("debate night", stop=30):
    # Skip BBC pages and a handful of known-bad results.
    if url.startswith("http://www.bbc.co.uk/"):
        continue
    print("** " + url + "\n\n")
    if url in (
            "http://www.hamhigh.co.uk/news/hampstead_schoolgirls_hold_yoga_day_for_indian_village_1_4763832",
            "https://www.rt.com/sport/365009-rio-worker-wages-threaten-suit/",
            "http://www.bbc.co.uk/news/uk-england-manchester-37631537",
            "http://www.bbc.co.uk/newsround/37675611"):
        continue
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser", from_encoding="utf-8")

    # Kill all script and style elements.
    for script in soup(["script", "style"]):
        script.extract()  # rip it out
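The snippet breaks off after stripping the script and style tags, so the text_f file opened at the top is never written in the visible portion. A plausible continuation, assuming the intent was to save each page's visible text:

    # Assumed continuation: extract the visible text and write it out.
    text = soup.get_text(separator="\n")
    text_f.write(text + "\n\n")

text_f.close()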