Example #1
def get_articles(output_path):
    config = dict(
        language='pl',
        fetch_images=False,
        MIN_WORD_COUNT=100,
        MIN_SENT_COUNT=5,
        memoize_articles=False,
    )

    papers = {}
    for url in SITES_URLS:
        paper = newspaper.build(url, **config)
        print(f"{url} contains {paper.size()} articles")
        papers[url] = paper

    print("Downloading...")
    news_pool.set(papers.values(), threads_per_source=2)
    news_pool.join()

    print("Parsing...")
    for paper in papers.values():
        paper.parse_articles()

    articles = [
        art.text for paper in papers.values() for art in paper.articles
    ]
    articles = [art.replace("\n", "") for art in articles]

    print(f"Scraped {len(articles)} articles")

    with open(output_path, "w", encoding='utf-8') as f:
        json.dump(articles, f)
Example #2
    def download_and_parse(self, url_list=None):
        if not self.submissions and url_list is None:
            logging.error('Must give a "url_list" or "query_reddit" first.')
            return
        self.set_articles(url_list)
        news_pool.set(self.articles)
        news_pool.join()
Example #3
def loadNews(knownrealSites, s):
    articles = Article.objects.all()
    #for a in articles:
    #print(a)
    papers = []
    for url in knownrealSites:
        real_paper = None
        try:
            real_paper = newspaper.build(url)
            papers.append(real_paper)
            print(url + ' contains ' + str(len(real_paper.articles)) + ' ' +
                  s + ' articles')
        except:
            print(url)
            print('url is bad')
            continue
    news_pool.set(papers, threads_per_source=4)
    news_pool.join()
    for paper in papers:
        for article in paper.articles:
            #due to multithreading above we can assume every article has had download called on it.
            #for article in real_paper.articles:
            try:
                #article.download()
                article.parse()
                #print('article.authors:**************************\n');print(article.authors)
                #print('article.text:**************************\n');print(article.text)
                #print('article.url:**************************\n');print(article.url)
                #print('article.title:**************************\n');print(article.title)
                #article.nlp()
                #print('keywords:**************************\n');print(article.keywords)
                #print('summary:**************************\n');print(article.summary)
            except:
                print('issue with download/parse')
                continue
            #x,y,z = tweetParser.getSentiment(url,2000)
            #print(article.publish_date)
            a = Article(
                address=article.url,
                title=article.title,
                body=article.text,
                date=article.publish_date,
                result=s,
                #positive = x,
                #negative = y,
                #neutral = z,
            )
            #article.parse()
            #article.nlp()
            try:
                a.save()
                print(
                    '**************************article SAVED**************************'
                )
            except:
                print(
                    '**************** article failed to save with field **************'
                )
                continue
Example #4
def get_newspapers(source_urls):
    papers = []
    for url in source_urls:
        papers.append(newspaper.build(url))
    
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()
    return papers
Example #5
def scrape_news():

    #t = time.time()

    ## connect
    client = MongoDB()

    with client as db:

        # #connect (not necessary)
        # connect(db)

        ## multi-threading
        eu_paper = newspaper.build('https://theguardian.com',
                                   memoize_articles=False,
                                   fetch_images=False)
        us_paper = newspaper.build('https://www.cbsnews.com/',
                                   memoize_articles=False,
                                   fetch_images=False)
        hk_paper = newspaper.build('http://scmp.com',
                                   memoize_articles=False,
                                   fetch_images=False)
        jp_paper = newspaper.build('https://www.japantimes.co.jp/',
                                   memoize_articles=False,
                                   fetch_images=False)

        papers = [eu_paper, us_paper, hk_paper, jp_paper]
        news_pool.set(papers, threads_per_source=2)  # (4*2) = 8 threads total
        news_pool.join()

        print("Size of EU paper: " + str(eu_paper.size()))
        print("Size of US paper: " + str(us_paper.size()))
        print("Size of HK paper: " + str(hk_paper.size()))
        print("Size of JP paper: " + str(jp_paper.size()))

        for paper in papers:
            for article in paper.articles:
                try:
                    article.parse()
                    print(len(article.text))
                    if len(article.text) > 100:
                        article.nlp()
                        item = {
                            'url': article.url,
                            'brand': paper.brand,
                            'title': article.title,
                            'text': article.text,
                            'keywords': article.keywords,
                            'summary': article.summary,
                            'date': dt.today(),
                            'date_str': dt.today().strftime('%Y-%m-%d')
                        }
                        db.news_items.insert_one(item)
                except Exception as e:
                    #In case it fails, skip article
                    print(e)
                    print("continuing...")
                    continue
Example #6
    def build_sources(self, param):
        replies = list()
        for sources in param:
            replies.append(
                newspaper.build('http://' + str(sources) + '.com',
                                language='en'))

        news_pool.set(replies, threads_per_source=3)
        news_pool.join()

        return replies
Example #7
    def __init__(self):

        # create list containing news sites to scrape
        self.web_list = ['http://www.foxnews.com','http://www.usatoday.com']

        # setup newspaper to multi-thread news sources 
        self.newsWebList = [newspaper.build(i, memoize_articles=True, fetch_images=False) for i in self.web_list]
        news_pool.set(self.newsWebList, threads_per_source=10)
        news_pool.join()
        self.connectDB()
        self.compareArticle()
Example #8
    def download_all_articles(self):
        logging.info("Downloading all articles...")

        papers = self.create_source_feed_list()

        news_pool.set(papers, threads_per_source=self.THREADS_PER_NEWS_SOURCE)

        # Download feed from all sources in parallel threads
        news_pool.join()

        logging.info("Download complete.")
        logging.info(datetime.now())
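The create_source_feed_list() helper and THREADS_PER_NEWS_SOURCE are class members that are not part of this snippet. A minimal sketch of what the helper might look like, assuming a hypothetical self.source_urls attribute holding the feed URLs:

    def create_source_feed_list(self):
        # Hypothetical helper: build one newspaper source per configured URL.
        return [
            newspaper.build(url, memoize_articles=False, fetch_images=False)
            for url in self.source_urls  # assumed attribute, not shown in the snippet
        ]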
Example #9
    def get_who_articles(self):

        covid_articles = newspaper.build(self.newspaper_link, memoize_articles=False)
        papers = [covid_articles, ]
        news_pool.set(papers, threads_per_source=4)
        news_pool.join()

        for index, article in enumerate(covid_articles.articles):
            print(article.url)
            article.parse()
            write_file = open('sites/articles/article' + str(index) + '.txt', 'w', encoding='utf-8')
            write_file.write(str(article.title) + "\n")
            write_file.write(textwrap.fill(article.text, width=120))
            write_file.close()
Example #10
def main() -> List[List[str]]:
    papers = [newspaper.build(url,
                              memoize_articles=False,
                              fetch_images=False,
                              verbose=DEBUG)
              for url in SITE_URLS]
    news_pool.set(papers, threads_per_source=THREADS_PER_SOURCE)
    news_pool.join()

    articles = []

    for paper in papers:
        articles.extend(get_articles(paper))

    print('Final number of articles:', len(articles))

    return articles
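get_articles() is a project-specific helper that is not shown here; given the List[List[str]] return annotation on main(), a plausible sketch (the exact fields kept are an assumption) is:

def get_articles(paper):
    # Hypothetical helper: parse each downloaded article and keep [title, text] pairs.
    results = []
    for article in paper.articles:
        try:
            article.parse()
        except Exception:
            continue
        if article.text:
            results.append([article.title, article.text])
    return results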
Example #11
    def test_download_works(self):
        config = Configuration()
        config.memoize_articles = False
        slate_paper = newspaper.build('http://slate.com', config=config)
        tc_paper = newspaper.build('http://techcrunch.com', config=config)
        espn_paper = newspaper.build('http://espn.com', config=config)

        print('Slate has %d articles TC has %d articles ESPN has %d articles' %
              (slate_paper.size(), tc_paper.size(), espn_paper.size()))

        papers = [slate_paper, tc_paper, espn_paper]
        news_pool.set(papers, threads_per_source=2)

        news_pool.join()

        print('Downloaded Slate mthread len', len(slate_paper.articles[0].html))
        print('Downloaded ESPN mthread len', len(espn_paper.articles[-1].html))
        print('Downloaded TC mthread len', len(tc_paper.articles[1].html))
Example #12
def download_newspapers(sources):
    '''
    Download the articles from the newspaper built from the ``sources`` URL and
    return them so they can then be saved to the MongoDB database.
    '''

    res = []
    paper = []
    l = newspaper.build(sources, memoize_articles=False)
    paper.append(l)
    news_pool.set(paper, threads_per_source=2)
    news_pool.join()

    for r in paper[0].articles:
        res.append(r)

    return res
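The docstring mentions saving the articles to a MongoDB database, but that step happens outside this snippet. A possible follow-up, assuming pymongo and hypothetical database and collection names, could look like:

from pymongo import MongoClient

def save_articles(articles, db_name='news', collection_name='articles'):
    # Hypothetical persistence step: parse each downloaded article and store it.
    collection = MongoClient()[db_name][collection_name]
    for article in articles:
        try:
            article.parse()
        except Exception:
            continue
        collection.insert_one({
            'url': article.url,
            'title': article.title,
            'text': article.text,
        })

# save_articles(download_newspapers('https://www.reuters.com/'))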
Example #13
    def test_download_works(self):
        config = Configuration()
        config.memoize_articles = False
        slate_paper = newspaper.build('http://slate.com', config=config)
        tc_paper = newspaper.build('http://techcrunch.com', config=config)
        espn_paper = newspaper.build('http://espn.com', config=config)

        print ('slate has %d articles tc has %d articles espn has %d articles'
               % (slate_paper.size(), tc_paper.size(), espn_paper.size()))

        papers = [slate_paper, tc_paper, espn_paper]
        news_pool.set(papers, threads_per_source=2)

        news_pool.join()

        print('Downloaded slate mthread len', len(slate_paper.articles[0].html))
        print('Downloaded espn mthread len', len(espn_paper.articles[-1].html))
        print('Downloaded tc mthread len', len(tc_paper.articles[1].html))
Example #14
def collect_news():

    papers = []
    papers.append(newspaper.build('http://cnn.com', memoize_articles=True))
    papers.append(
        newspaper.build('http://www.bbc.com/news', memoize_articles=True))
    papers.append(
        newspaper.build('http://news.sky.com/world', memoize_articles=True))
    papers.append(
        newspaper.build('https://nytimes.com/section/world',
                        memoize_articles=True))
    papers.append(
        newspaper.build('https://washingtonpost.com/world',
                        memoize_articles=True))
    papers.append(
        newspaper.build('http://reuters.com/news/world',
                        memoize_articles=True))

    news_pool.set(papers, threads_per_source=1)
    news_pool.join()
    news_list = []

    categories = fetch_20newsgroups(subset='train', shuffle=True)
    clf = joblib.load(os.path.join(os.path.dirname(__file__), 'model.pkl'))

    for paper in papers:
        for current_article in itertools.islice(paper.articles, 0, 5):
            current_article.download()
            current_article.parse()
            current_article.nlp()

            news_to_add = {
                'title': current_article.title,
                'keywords': current_article.keywords,
                'url': current_article.url,
                'category': news_predictor([current_article.text], categories,
                                           clf),
                'source': paper.brand,
                'collected':
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            news_list.append(news_to_add)

    return news_list
Example #15
def extract_articles(url_list):
    """Extracts article text and keywords from url.

    Inputs
    ------
    url_list: list
    Returns
    -------
    generator with keywords parsed from article url list
    """
    articles = [Article(url) for url in url_list]
    news_pool.set(articles)
    news_pool.join()
    r = Rake()
    for article in articles:
        article.parse()
        r.extract_keywords_from_text(article.text)
        article_kwords = r.get_ranked_phrases()
        yield article_kwords
Example #16
def main():

    # Build a news source
    # use memoize_articles flag to turn off article caching
    fox = newspaper.build("http://www.foxnews.com", memoize_articles=False)
    print(fox.size())
    msnbc = newspaper.build("http://www.msnbc.com", memoize_articles=False)
    print(msnbc.size())
    bbc = newspaper.build("http://www.bbc.com", memoize_articles=False)
    print(bbc.size())

    papers = [fox, msnbc, bbc]

    news_pool.set(papers, threads_per_source=2)  #6 total
    news_pool.join()

    # extract and save articles
    saveFile("fox.json", downloadAndParse(fox))
    saveFile("msnbc.json", downloadAndParse(msnbc))
    saveFile("bbc.json", downloadAndParse(bbc))
Example #17
    def test_download_works(self):
        """
        """
        config = Configuration()
        config.memoize_articles = False
        slate_paper = newspaper.build('http://slate.com', config=config)
        tc_paper = newspaper.build('http://techcrunch.com', config=config)
        espn_paper = newspaper.build('http://espn.com', config=config)

        print('slate has %d articles tc has %d articles espn has %d articles'
              % (slate_paper.size(), tc_paper.size(), espn_paper.size()))

        papers = [slate_paper, tc_paper, espn_paper]
        news_pool.set(papers, threads_per_source=2)

        news_pool.join()

        print('Downloaded slate mthread len', len(slate_paper.articles[0].html))
        print('Downloaded espn mthread len', len(espn_paper.articles[-1].html))
        print('Downloaded tc mthread len', len(tc_paper.articles[1].html))
Example #18
def pool():
    #Download all new news articles from our sources
    papers = []
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 10
    config.memoize_articles = False

    #Build a model of all articles from the website. Get only those we haven't retrieved before
    reuters_paper = newspaper.build('https://www.reuters.com/', config=config)
    bbc_paper = newspaper.build('https://www.bbc.co.uk/news', config=config)

    #We add the models of the news sources
    papers.append(reuters_paper)
    papers.append(bbc_paper)

    news_pool.set(papers, threads_per_source=8)
    news_pool.join()

    return papers
Example #19
def main(argv):
    TOP_PATH = os.path.dirname(__file__)
    OUT_PATH = os.path.join(TOP_PATH, 'output')
    if not os.path.exists(OUT_PATH):
        os.makedirs(OUT_PATH)

    # Our permanent config for crawling
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize_articles = False
    config.fetch_images = False
    config.verbose = True

    # Get contents of our source file
    sourcefile = os.path.join(TOP_PATH, "sources.txt")
    with open(os.path.join(sourcefile), 'r') as f:
        sourcelist = f.read().strip().split('\n')

    # Initialize our sources
    sources = [IntiSource(source,config=config) for source in sourcelist]

    # Make domain directories inside our output path and build sources
    for s in sources:
        if not os.path.exists(os.path.join(OUT_PATH, s.domain)):
            dom_path = os.path.join(OUT_PATH, s.domain)
            os.makedirs(dom_path)

        # Build
        s.build()

        if config.verbose:
            s.print_summary()

    # Multithreaded source downloading and parsing
    news_pool.set(sources, threads_per_source = 4)
    news_pool.join()

    article_parse(sources)
Example #20
def scrape_website(urls):
    papers = []
    for url in urls:
        if url:
            papers.append(newspaper.build(url, memoize_articles=False))

    for paper in papers:
        delete_queue = [] # articles to be deleted
        for article in paper.articles:
            if 'video' in article.url:
                delete_queue.append(article)

        for article in delete_queue:
            paper.articles.remove(article)

    news_pool.set(papers, threads_per_source=2) # (2*2) = 4 threads in all
    news_pool.join()

    for paper in papers:
        paper.parse_articles()

    es = ElasticStorage.get_instance(dev=False)
    for paper in papers:
        es.store_articles(paper.articles, paper.url)
Example #21
def scrape_articles(domains=DOMAINS):
    """Crawls domains and scrapes new web articles.
    """
    papers = [newspaper.build(s, memoize_articles=False) for s in domains]
    news_pool.set(papers, threads_per_source=1)
    news_pool.join()

    for domain, paper in zip(domains, papers):
        paper_source = parse_source(domain)
        for article in paper.articles:
            article_source = parse_source(article.url)
            if article_source != paper_source:
                continue
            article.parse()
            a = Article(url=article.url, 
                        title=article.title,
                        text=article.text, 
                        image=article.top_image,
                        domain=domain)
            a.save()

    n_articles = sum(map(lambda p: len(p.articles), papers))
    logmsg = '{} articles crawled'.format(n_articles)
    logger.info(logmsg)
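parse_source() is another helper from that project; a reasonable sketch (purely an assumption) that reduces a URL to its host would be:

from urllib.parse import urlparse

def parse_source(url):
    # Hypothetical: normalize a URL down to its host,
    # e.g. 'https://www.example.com/some/story' -> 'example.com'
    netloc = urlparse(url).netloc.lower()
    return netloc[4:] if netloc.startswith('www.') else netloc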
Example #22
print(firstarg, secondarg)

allpapers = newspaper.popular_urls()
x = len(allpapers)

if int(secondarg) < x:
    print("there are that many papers", x)
    sourcearts = []
    for paper in allpapers[int(firstarg):int(secondarg)]:
        sourcepaper = newspaper.build(paper)
        sourcearts.append(sourcepaper)

    news_pool.set(sourcearts, threads_per_source=3)
    news_pool.join()
    iart = 0
    for iart in range(len(sourcearts)):
        print("newspaper {}: {}".format(iart + 1, sourcearts[iart].size()))

    iart = 0
    try:
        connection = mysql.connector.connect(host='localhost',
                                             database='newspy',
                                             user='',
                                             password='')
        if connection.is_connected():
            db_Info = connection.get_server_info()
            print("Connected to MySQL Server version ", db_Info)
            cursor = connection.cursor()
Example #23
def main():
	import newspaper
	from newspaper import news_pool
	import re
	import csv
	import unicodedata

	# Active list of news/media sources
	
	sources = ['http://fivethirtyeight.com']

	#sources = ['http://cnn.com','http://foxnews.com',
	#'http://npr.org','http://msnbc.com','http://cbs.com',
	#'http://economist.com','http://time.com','http://nytimes.com',
	#'http://espn.com','http://reuters.com','http://usatoday.com',
	#'http://bbc.com','http://fivethirtyeight.com']

	papers = {} # Empty dictionary

	print("Building papers\n....\n...\n...")

	# Build dictionary, using url name for keys, e.g. the 'http://cnn.com' key will be 'cnn'
	for i in range(len(sources)):
		key = re.sub(r'(^https?:\/\/|\.com$|\.org$)','',sources[i])
		papers[key] = newspaper.build(sources[i],memoize_articles=False)
		# Print number of articles added from "recent" list for logging purposes
		print(key,papers[key].size())

	print("Downloading articles (this may take a while)\n...\n...\n...")

	# Download all articles via multi-threading
	news_pool.set([x[1] for x in papers.items()], threads_per_source=2) # Test various thread counts
	news_pool.join()

	print("Extracting text from articles \n...\n...\n...")

	# Parse all articles
	for i in papers:
		for j in range(papers[i].size()):
			#call to "download()" deprecated by news_pool.set & news_pool.join
			#papers[i].articles[j].download()
			papers[i].articles[j].parse()
			#extract keywords
			papers[i].articles[j].nlp()

	print("Writing new articles to dump file \n...\n...\n...")

	# Append articles to csv
	# Prototype format: col(1) = source, col(2) = title, col(3) = authors, col(4) = text
	with open('papers.csv','a') as outcsv:
		writer = csv.writer(outcsv)
		writer.writerow(["Source","Date","Title","Authors","Text","Keywords"])
		for i in papers:
			source = i
			for j in range(papers[i].size()):
				# Grab key features
				title = unicodedata.normalize('NFKD',papers[i].articles[j].title).encode('ascii','ignore')
				authors = [x.encode('UTF-8') for x in papers[i].articles[j].authors]
				text = unicodedata.normalize('NFKD',papers[i].articles[j].text).encode('ascii','ignore')
				date = papers[i].articles[j].publish_date
				#date = unicodedata.normalize('NFKD',papers[i].articles[j].publish_date).encode('ascii','ignore')
				# Identify keywords, while we're at it
				keywords = [x.encode('UTF-8') for x in papers[i].articles[j].keywords]
				writer.writerow([source,date,title,authors,text,keywords])
Example #24
import newspaper
from newspaper import news_pool

hq_paper = newspaper.build('https://www.huanqiu.com', language="zh")
sh_paper = newspaper.build('http://news.sohu.com', language="zh")
sn_paper = newspaper.build('https://news.sina.com.cn', language="zh")

papers = [hq_paper, sh_paper, sn_paper]
# 3 sources * 2 = 6 threads in total
news_pool.set(papers, threads_per_source=2)
news_pool.join()
print(hq_paper.articles[0].html)
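Note that news_pool.join() only downloads the raw HTML, which is why the last line can print articles[0].html directly; fields such as .title and .text still require an explicit parse step, for example:

first = hq_paper.articles[0]
first.parse()  # extraction happens here, not in the download pool
print(first.title)
print(first.text[:200])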
Example #25
def crawl():
    import newspaper
    from newspaper import news_pool

    memoize_articles = True

    conn = connect()

    threads_per_source = 4

    round = 0

    #loop indefinitely
    while True:
        count = get_news_source_count(conn)
        offset = 0
        limit = 15
        round += 1
        log.info("Crawling round %s.", round)
        while offset <= count:
            papers = []
            sources = get_news_sources(conn, offset, limit)

            offset += limit

            for source in sources:
                log.info("Creating newspaper for source %s", source[1])
                news_paper = newspaper.build(source[1], memoize_articles=memoize_articles, MIN_WORD_COUNT=100)

                papers.append(news_paper)
                log.info("Found %s articles from %s.", news_paper.size(), source[1])

            log.info("Creating a pool of newspapers for %s newspapers.", len(papers))
            news_pool.set(papers, threads_per_source=threads_per_source)

            log.info("Downloading articles for all newspapers.")
            start_time = time.time()
            news_pool.join()

            end_time = time.time() - start_time
            log.info("Downloading finished in %s", end_time)

            log.info("Storing downloaded articles in the database.")
            for paper in papers:
                #get the source id for this newspaper
                news_source_id = get_news_source(conn, paper.url)[0]

                #Get already crawled articles for this newspaper
                crawled_urls = articles_exist(conn, paper.article_urls())
                crawled_urls_size = 0
                if crawled_urls:
                    crawled_urls_size = len(crawled_urls)
                else:
                    crawled_urls = ['']

                log.info("For newspaper %s %s articles already crawled.", paper.url, crawled_urls_size)
                #articles = []
                #crawled_articles = articles_for_news_source(conn, news_source_id)
                article_count = 0
                for article in paper.articles:
                    #if the article is not crawled already
                    if article.url not in crawled_urls:
                        #parse it
                        try:
                            article.parse()
                            #check if its a news article, and not some other page
                            if article.is_valid_body():
                                article_count += 1
                                insert_news_article(conn, article, news_source_id)
                        except:
                            pass

                            #Check if the combination title and publish date already exists for this newspaper
                            #publish_date = article.publish_date
                            #if publish_date:
                            #    publish_date = publish_date.replace(tzinfo=None)

                            #if (article.title, publish_date) not in crawled_articles:
                                #If not, add it for insertion
                            #    articles.append(article)
                            #    crawled_articles.append((article.title, publish_date))
                            #    log.info("Article '%s' publish date '%s' doesn't exists.", article.title, publish_date)
                            #else:
                            #    log.warn("Article '%s' already exists", article.url)
                log.info("For newspaper %s stored %s articles.", paper.url, article_count)
                #Insert the articles in the database
                #insert_news_articles(conn, list(set(articles)), news_source_id)

        time.sleep(1000) #sleep for 1000 seconds before continuing

Example #26
# ### Scraping articles

title = []
author = []
published = []
body = []

#downloading articles
#multi-threading to be nicer to medium
articles = [Article(link, fetch_images = False) for link in links]
news_pool.set(articles, threads_per_source = 6)
news_pool.join()

#getting title, author, publish date, and text body for each article
for i in range(0, len(articles)):
    
    try:
        articles[i].parse()
    
    except ArticleException:
        pass
    
    #appending each to the corresponding list
    title.append(articles[i].title)
    author.append(articles[i].authors)
    published.append(articles[i].publish_date)
    body.append(articles[i].text)
Example #27
  print "yahoo built"
  google = newspaper.build('http://www.usnews.com/')
  print "google built"
  bbc = newspaper.build('http://www.bbc.com/news/world/us_and_canada/')
  print "bbc built"
  nbc = newspaper.build('http://www.nbcnews.com/news/us-news')
  print "nbcbuild"
  cnn = newspaper.build('http://www.cnn.com/US/')
  print "cnn"
  abc = newspaper.build('http://abcnews.go.com/US/')
  print "abc built"
  fox = newspaper.build('http://www.foxnews.com/us/index.html')
  print "fox built"

  papers = [yahoo, google, bbc, nbc, cnn, abc, fox]
  news_pool.set(papers, threads_per_source=2)
  news_pool.join()

  for Source in papers:
      for article in Source.articles:
          url = article.url
          htmlcode = article.html

          print(url)
          filename = "html/" + article.title + ".html"
          filename = filename.replace("'", "")
          print(filename.encode('utf-8'))
          htmlfile = open(filename.encode('utf-8'), "wb")
          htmlfile.write(htmlcode.encode('utf-8'))
          htmlfile.close()
          #HTML(filename).write_png(pngfilename)
Example #28
            break
        elif str(veiculo) == '4':
            escolha = input('Digite o endereço (URL) do veículo que deseja: ')
            break
        else:
            print(
                'Você não digitou um valor da lista. Digite apenas um número entre 1 e 4'
            )
            print()
            continue
    except ValueError:
        print()

meio = newspaper.build(escolha, language='pt', memoize_articles=False)
fast = [meio]
news_pool.set(fast, threads_per_source=2)
print()
print('Total de registros coletados: ' + str(meio.size()))
listaurl = []
urlfinal = []
for article in meio.articles:
    listaurl.append(article.url)
for url in listaurl:
    if veiculo == "1":
        if "comments" not in url:
            if "especial" not in url:
                if "oauth" not in url:
                    if "aovivo" not in url:
                        if "2019" in url:
                            urlfinal.append(url)
    elif veiculo == "2":
Example #29
    Juila Sell
    
"""

# import required modules for webscraping and html parsing
import requests
import newspaper
from newspaper import news_pool
import sqlite3

# create list containing news sites to scrape
web_list = ['http://www.foxnews.com', 'http://www.usatoday.com']

# setup newspaper to multi-thread news sources 
newsWebList = [newspaper.build(i) for i in web_list]
news_pool.set(newsWebList, threads_per_source=2)
news_pool.join()

# connect to Sqlite database and initiate / build table 
con = sqlite3.connect('tnc.db')
with con:
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS NewsArticle")
    cur.execute("CREATE TABLE NewsArticle(Id TEXT, Number INT, Name TEXT, Count INT)")

# The News Counter Webscraper
def tncWebscraper():
    # iterates through sources
    for web_page in web_list:
        # set get request for html
        i = 0
Example #30
    def gather_different(self,
                         extra_urls=None,
                         only_extra=False,
                         ignore_gotten=False,
                         save=True):
        checklang = False
        if extra_urls:
            self.urls["extras"] = set(extra_urls)
            for url_ext in extra_urls:
                mine_article(url_ext)
        if not only_extra:
            print(self.newssites)
            if len(self.newssites) > 1 and type(self.newssites) is list:
                papers = [
                    build(paper, config=self.config)
                    for paper in self.newssites
                ]
            else:
                papers = build(self.newssites, config=self.config)
            log(f"Getting Data from {len(self.newssites)} newssites...")
            news_pool.set(papers, threads_per_source=2)
            news_pool.join()
            for art_pool, url in zip(papers, self.newssites):
                print(
                    f"Handling newssite {int(self.newssites.index(url)) + 1}/{len(self.newssites)}"
                )
                for art in art_pool.articles:
                    art.parse()
                    if (str(art.url)
                            not in self.urls["gotten"]) or ignore_gotten:
                        created = date_to_posix(dates=art.publish_date,
                                                list=False)
                        if created is not None and created != "None":
                            dic_temp = {
                                "link":
                                str(art.url),
                                "text":
                                str(
                                    art.text.replace("  ",
                                                     "").replace("\n", "")),
                                "title":
                                str(art.title),
                                "created":
                                float(created),
                                "keywords":
                                str(art.keywords),
                                "author":
                                str(art.authors)
                            }
                            self.urls["gotten"] = np.append(
                                self.urls["gotten"], art.url)
                            if checklang:
                                try:
                                    if check_lang_is_en(str(art.text)):
                                        self.df_art = self.df_art.append(
                                            dic_temp, ignore_index=True)
                                    else:
                                        print(f"Blocked: {dic_temp['text']}")
                                except json.decoder.JSONDecodeError as e:
                                    error(e)
                                    if check_lang_is_en(str(art.title)):
                                        self.df_art = self.df_art.append(
                                            dic_temp, ignore_index=True)
                                    else:
                                        print(f"Blocked: {dic_temp['text']}")
                                    print("fixed?")
                            else:
                                self.df_art = self.df_art.append(
                                    dic_temp, ignore_index=True)

        if save:
            print(self.df_art)
            try:
                pass
                #print(self.df_art.to_string())
            except:
                pass
            update_hdf5(files["news_store"],
                        "news_articles",
                        dataframe=self.df_art,
                        mode="a",
                        append=False)
Example #31
print(data.shape)
# create list of Article objects
urls = data[1:, 0].tolist()

# for each line in csv
articles = []
for i in range(len(urls)):
    # print "iteration:{} {} ".format(i,urls[i])
    articles.append(Article(url=urls[i]))

# create a source of articles
news_source = Source("https://www.dummyurl.com")
news_source.articles = articles
# create a news_pool for threading purposes
news_pool.set([news_source], threads_per_source=2)
news_pool.join()

# iterate through article list to create a column for the csv
print "Parsing articles..."

article_list = []
labels = ['title', 'authors', 'text', 'keywords', 'summary', 'tags']
for article in articles:
    print "Parsing article {}".format(article.url)
    article.parse()
    article_list.append({
        labels[0]: article.title,
        labels[1]: article.authors,
        labels[2]: article.text,
        labels[3]: article.keywords,
Example #32
__author__ = 'James'
import newspaper
from newspaper import Config, news_pool

config = Config()
config.set_language('en')
config.memoize_articles = False


reuters = newspaper.build(url='http://www.reuters.com', config=config)
indo = newspaper.build(url='http://www.independent.ie', config=config)

papers = [reuters, indo]

news_pool.set(paper_list=papers, threads_per_source=3)
news_pool.join()

for paper in papers:
    print(paper.brand + ": " + str(paper.size()) + " article(s)")
    # for article in paper.articles:
    #     print(article.title)

# print("-----------\nCATEGORIES\n-----------")
#
# for category in a.categories:
#     print(category.url)
#     b = newspaper.Source(url=category.url)
#     b.build()
#     print("\t-----------\n\tFEEDS\t\n-----------\t")
#     for feed_url in b.feed_urls():
#         print("\t->" + feed_url)
Example #33
 def post(self, request):
     form = SearchForm(request.POST)
     if form.is_valid():
         search_key = form.cleaned_data['search_keys']
         search_weight = form.cleaned_data['search_weight']
         search_key = search_key.split(',')
         search_weight = search_weight.split(',')
         search_key_weight = {}
         for l in range(len(search_key)):
             search_key_weight[search_key[l]] = search_weight[l]
         if detect(search_key[0]) != 'zh' and detect(
                 search_key[0]) != 'zh-cn':
             # cnn_paper = newspaper.build('http://cnn.com', memoize_articles=False)
             # print(cnn_paper.size())
             # times_paper = newspaper.build('https://www.nytimes.com/', memoize_articles=False)
             # print(times_paper.size())
             # guardian_paper = newspaper.build('https://www.theguardian.com/us', memoize_articles=False)
             # print(guardian_paper.size())
             # abc_paper = newspaper.build('https://abcnews.go.com/', memoize_articles=False)
             # print(abc_paper.size())
             # bbc_paper = newspaper.build('https://www.bbc.com/', memoize_articles=False)
             # print(bbc_paper.size())
             boston_paper = newspaper.build('https://www.bostonglobe.com//',
                                            memoize_articles=False)
             print(boston_paper.size())
             seattle_paper = newspaper.build(
                 'https://www.seattletimes.com/', memoize_articles=False)
             print(seattle_paper.size())
             # papers = [cnn_paper, times_paper, guardian_paper, abc_paper, bbc_paper]
             papers = [boston_paper, seattle_paper]
             news_pool.set(papers,
                           threads_per_source=2)  # (5*2) = 10 threads total
             news_pool.join()
             # for article in cnn_paper.articles:
             #     self.all_scrapy(article, search_key_weight)
             # for article in times_paper.articles:
             #     self.all_scrapy(article, search_key_weight)
             # for article in guardian_paper.articles:
             #     self.all_scrapy(article, search_key_weight)
             # for article in abc_paper.articles:
             #     self.all_scrapy(article, search_key_weight)
             # for article in bbc_paper.articles:
             #     self.all_scrapy(article, search_key_weight)
             for article in boston_paper.articles:
                 self.all_scrapy(article, search_key_weight)
             for article in seattle_paper.articles:
                 self.all_scrapy(article, search_key_weight)
         elif detect(search_key[0]) == 'zh-cn':
             qq_paper = newspaper.build('https://www.qq.com/',
                                        memoize_articles=False)
             print('qq_paper: ' + str(qq_paper.size()))
             # wy_paper = newspaper.build('https://news.163.com/', memoize_articles=False)
             # papers = [qq_paper, wy_paper]
             papers = [qq_paper]
             news_pool.set(papers,
                           threads_per_source=2)  # (3*2) = 6 threads total
             news_pool.join()
             for article in qq_paper.articles:
                 print('processing')
                 self.all_scrapy(article, search_key_weight)
             # for article in wy_paper.articles:
             #     print('processing')
             #     self.all_scrapy(article, search_key_weight)
     else:
         form = SearchForm()
     return HttpResponseRedirect(reverse('searching:results', args=()))
Example #34
                           memoize_articles=False)
globalnewsca = newspaper.build('https://globalnews.ca/',
                               memoize_articles=False)
thestar = newspaper.build('https://www.thestar.com/', memoize_articles=False)
cna = newspaper.build('https://www.channelnewsasia.com/news/international',
                      memoize_articles=False)

#Combine all the sources
list_of_sources = [
    cnn, bbc, slate, breitbart, politico, thehill, cbc, washingtonpost,
    globeandmail, tc, gamespot, globalnewsca, thestar, cna
]

#Initiate multi-threaded downloads
#WARNING: keep threads_per_source at a reasonable number
news_pool.set(list_of_sources,
              threads_per_source=4)  #4 threads per source
news_pool.join()

#Create our final dataframe
df_articles = pd.DataFrame()

#Create a download limit per sources
limit = 100

for source in list_of_sources:
    #temporary lists to store each element we want to extract
    list_title = []
    list_text = []
    list_source = []

    count = 0
Example #35
client = MongoClient()

db = client['news_crawls']

outlet_list_file_open = open(sys.argv[1], 'r')
outlet_list_file = outlet_list_file_open.read()
outlet_list = outlet_list_file.split('\n')
outlet_list.pop(-1)

build_objects = []

for outlet in outlet_list:
    build_objects.append(np.build('http://' + outlet, memoize_articles=False))

news_pool.set(build_objects, threads_per_source=2)
news_pool.join()

for outlet in build_objects:
    for article in outlet.articles:
        count = db.italian_outlets.find({"url": article.url})
        if count.count() != 0:
            continue
        else:
            article.parse()
            print(article.url)
            db.italian_outlets.insert_one({
                "title":
                article.title,
                "text":
                article.text,
Example #36
    def scrape_newspapers(self,
                          company_name,
                          start_date,
                          end_date,
                          bodies=False):
        """ Build a list of the newspapers articles from a given url """
        def build_papers(news_url):
            return newspaper.build(news_url,
                                   language=self.language,
                                   memoize_articles=False)

        """ Return a relevant article matching company name and optional params such as start_date, end_date, bodies """

        def relevant_articles(papers):
            try:
                for article in papers.articles:
                    """
                        Lets analyse the HTML of the article to inspect the h1 (title) of the article. 
                        Reading documentation of newspaper3k suggests parse() is expensive method so 
                        try to limit overhead and only parse articles with a relevant title.
                    """
                    soup = BeautifulSoup(article.html, "html.parser")
                    title = soup.find('h1').get_text()
                    #If the company name is found within the headline of a news article, then parse the article for more information
                    if title and company_name in title.lower():
                        article.parse()
                        if within_date_range(article.publish_date, start_date,
                                             end_date):
                            article_dict = {
                                "headline": article.title,
                                "source": article.url,
                                "published_date": article.publish_date,
                                "company_name": company_name
                            }
                            if bodies:
                                article_dict.update({"body": article.text})
                            yield article_dict

            except Exception as e:
                #log the error to a file, continue
                print("Exception:", e)
                pass

        articles = []
        company_name = company_name.lower()

        try:
            print("Downloading papers .....")
            papers = [build_papers(src) for src in self.news_urls]
            print("Papers downloaded", len(papers), papers)
            news_pool.set(papers, threads_per_source=2)
            news_pool.join()

        except Exception as e:
            #should log the error to a file in production then continue
            print("Exception:", e)
            pass

        finally:
            articles.extend(
                [article for p in papers for article in relevant_articles(p)])

        return articles
Example #37
#!/usr/bin/python
# -*- coding: utf-8 -*-

import newspaper
from newspaper import news_pool
from pprint import pprint

# slate_paper = newspaper.build('http://slate.com')
# tc_paper = newspaper.build('http://techcrunch.com')
# espn_paper = newspaper.build('http://espn.com')
elpais = newspaper.build('http://elpais.com')
elmundo = newspaper.build('http://www.elmundo.es')
publico = newspaper.build('http://www.publico.es')
papers = [elpais, elmundo, publico]
news_pool.set(papers, threads_per_source=2) # (3*2) = 6 threads total
news_pool.join()

print(len(papers))
pprint(papers)
print(len(elpais.articles))
print(len(elmundo.articles))
Example #38
def main():
	import newspaper # article download utility
	from newspaper import news_pool, Config, Article, Source
	import re # regex
	import csv # csv file-formatting
	import unicodedata # string cleaning
	from datetime import datetime # time-checking for cache-updates

	print("Retrieving sources and update times\n...")

	# Read active list of news/media sources
	f = open("sourcelist","r")
	sources = f.read().splitlines()
	times = []

	#
	# ONGOING: update time storage and retrieval
	#		-dependent on if caching is sufficient

	papers = {} # Empty dictionary

	print("Building papers\n....\n...\n...")

	# Store total and current number of articles for progress metrics
	total_articles = 0; current_articles = 0

	# Build dictionary, using url name for keys, e.g. the 'http://cnn.com' key will be 'cnn'
	for i in range(len(sources)):
		key = re.sub(r'(^https?:\/\/|\.com$|\.org$)','',sources[i])
		papers[key] = newspaper.build(sources[i],memoize_articles=True)
		
		# Print number of articles added from "recent" list for logging purposes
		total_articles = total_articles + papers[key].size()
		print(key,papers[key].size())

	print("Downloading articles (this may take a while)\n...\n...\n...")

	config = Config()
	config.fetch_images = False
	
	# Download all articles via multi-threading
	news_pool.set([x[1] for x in papers.items()], threads_per_source=2) # Test various thread counts
	news_pool.join()

	print("Extracting text from articles and writing to dump files \n...\n...\n...")

	# Append articles to aggregate and individual csv's
	# Format: col(1) = source, col(2) = date, col(3) = title, col(4) = authors, col(5) = text, col(6) = keywords
	with open('papers.csv','a') as outcsv:

		# Setup aggregate csv writer
		writer = csv.writer(outcsv)
		#writer.writerow(["Source","Date","Title","Authors","Text","Keywords"])

		# Traverse sources
		for i in papers:

			# Setup single_source csv writing
			source = i
			ind_outcsv = open(str(i+".csv"),'a')
			ind_writer = csv.writer(ind_outcsv)

			# Traverse articles in source			
			for j in range(papers[i].size()):

				# Parse articles and extract features
				current_articles += 1
				print("Processing " + str(i) + " article " + str(current_articles) + " of " + str(total_articles) + " (" + str("{0:.2f}".format((current_articles/float(total_articles)*100),2))
 + " %)")

				try:
					papers[i].articles[j].parse()

					# Grab key features
					title = unicodedata.normalize('NFKD',papers[i].articles[j].title).encode('ascii','ignore')
					authors = [x.encode('UTF-8') for x in papers[i].articles[j].authors]
					text = unicodedata.normalize('NFKD',papers[i].articles[j].text).encode('ascii','ignore')
					date = papers[i].articles[j].publish_date
					keywords = [x.encode('UTF-8') for x in papers[i].articles[j].keywords]
					
					# Add new row to both single-source and aggregate files
					ind_writer.writerow([source,date,title,authors,text,keywords])
					writer.writerow([source,date,title,authors,text,keywords])
					papers[i].articles[j].nlp()

				except httplib.BadStatusLine:
					print "httplib.BadStatusLine, no dice"
Example #39
def get_bot_response():
    while True:
        userText = request.args.get('msg')
        msg = str(userText)
        entrada = msg.lower()
        f = csv.writer(open('inputs.csv', 'a', encoding='utf-8'))
        f.writerow([msg])
        response = searchbot.get_response(userText)
        if float(response.confidence) >= 0.8:
            return str(searchbot.get_response(userText))
        elif userText == str('NÃO'):
            return str('Refaça a pergunta, por favor!')
        elif userText == str("SIM"):
            return str("Agradecemos o seu contato")
        elif float(response.confidence) == 0.0:
            entrada = msg
            # print(entrada)
            p1 = 'http://receita.economia.gov.br/@@busca?advanced_search=False&sort_on=&SearchableText='
            p2 = '&portal_type%3Alist=Document&created.query%3Arecord%3Alist%3Adate=1970-01-02&created.range%3Arecord=min'
            html = str(p1 + entrada + p2)
            stop2 = nltk.corpus.stopwords.words('portuguese')
            stop2.append('faço')
            stop2.append('um')
            stop2.append('gostaria')
            stop2.append('fazer')
            stop2.append('saber')
            stop2.append('posso')
            stop2.append('como')
            splitter = re.compile('\\W+')

            lista_palavras = []
            lista = [p for p in splitter.split(entrada) if p != '']
            for p in lista:
                if p not in stop2:
                    if len(p) > 1:
                        lista_palavras.append(p)
            ar = len(lista_palavras)
            ax = str(lista_palavras[0:ar])
            e = str(ax).replace(',', ' ').strip('[]')
            e.strip("'")
            #headers = {'User-Agent': 'Mozilla/5.0'}
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
            }
            try:
                page = requests.get(html,
                                    headers=headers,
                                    verify=False,
                                    stream=False,
                                    timeout=7)
                soup = BeautifulSoup(page.content, 'lxml')
                cla = soup.find(class_='searchResults')
                links = cla.find_all('a')
            except (KeyError, IndexError, AttributeError):
                pass
            # namess = soup.find_all('a')
            # ra = (lista_palavras)
            # CRIAR A LISTA DE LINKS SITE RFB

            listr = []
            for link in links:
                texto = str(link.get_text()).lower().replace('ã', 'a').replace(
                    '-', ' ').replace('ç', 'c').split()
                time.sleep(0.5)
                # print(len(texto))
                url = str(link.get('href'))
                time.sleep(0.5)
                # print(len(url))
                urls = str(link.get('href')).lower().replace('/', ' ').replace(
                    '-', ' ').replace('.', ' ').split()
                time.sleep(0.5)
                # print(len(urls))
                if entrada in texto:
                    listr.append(url)
                for i in range(0, ar):
                    if lista_palavras[i] in texto:
                        listr.append(url)
                    elif lista_palavras[i] in urls:
                        listr.append(url)
            else:
                listr == []
                pass

            listag = []
            rec = 'site:receita.economia.gov.br intitle:' + msg + " -filetype:pdf -.pdf"
            for urla in search(rec,
                               tld='com.br',
                               lang='pt-br',
                               stop=4,
                               pause=8):
                time.sleep(1)
                listag.append(urla)

            g = int(len(listag))
            # print(g)

            listago = []
            for z in range(0, g):
                ur = str(listag[z])
                listago.append(ur)

            # print(listago)
            # print(len(listago))
            qo = int(len(listago))
            # print(listr)
            # print(len(listr))
            listaunida = listago + listr
            conj = list(set(listaunida))
            # print(conj)
            # print(len(conj))
            # print(type(conj))

            # print(p)
            # print(len(p))
            # Build an Article object for every candidate URL, let the pool
            # download them, then parse and summarize each one.
            conj_articles = [Article(str(u), language="pt") for u in conj]
            news_pool.set(conj_articles, threads_per_source=2)
            news_pool.join()

            reports2 = []
            for article in conj_articles:
                try:
                    article.parse()
                    article.nlp()
                    reports2.append(str(article.summary).replace('\n', ' '))
                except Exception:
                    continue
            # print(len(reports2))

            resposta_finalc = set(reports2)
            print(resposta_finalc)

            if resposta_finalc == set():
                wikipedia.set_lang("pt")
                a = msg
                result = wikipedia.search(a, results=1)
                page = wikipedia.summary(result, sentences=6)
                content = page
                return str(content)
            else:
                try:
                    resposta_final = (str(resposta_finalc).replace(
                        '\n', ' ').replace('[', ' ').replace(']', ' ').replace(
                            ',',
                            ' ').replace("'",
                                         ' ').replace('{',
                                                      ' ').replace("}", ' '))
                    f = csv.writer(open('chats.csv', 'a', encoding='utf-8'))
                    f.writerow([msg + '\n' + resposta_final])
                    return str(
                        resposta_final + '\n' +
                        'Encontrou a resposta que precisava? SIM ou NÃO?')
                except:
                    return str(
                        'Desculpe! Não encontrei uma resposta para sua pergunta. Poderia repetir com outros termos?'
                    )
Example #40
def auto_article_go_getter():
    print("starting builds ", file=sys.stderr)
    cnn_paper = newspaper.build("https://www.cnn.com",  memorize_articles=True, language = 'en')
    print("cnn_paper built", file=sys.stderr)
    nbc_paper = newspaper.build("https://www.nbcnews.com",  memorize_articles=True, language = 'en')
    #print("nbc_paper built", file=sys.stderr)
    #nyt_paper = newspaper.build("https://www.nytimes.com/",  memorize_articles=True, language = 'en')
    #print("nyt_paper built", file=sys.stderr)
    apn_paper = newspaper.build("https://apnews.com/",  memorize_articles=True, language = 'en')
    print("apn_paper built", file=sys.stderr)
    abc_paper = newspaper.build("https://abcnews.go.com/",  memorize_articles=True, language = 'en')
    print("abc_paper built", file=sys.stderr)
    papers = [cnn_paper, nbc_paper, apn_paper, abc_paper]
    verge_paper = newspaper.build("https://www.theverge.com/",  memorize_articles=True, language = 'en')
    print("verge_paper built", file=sys.stderr)
    techP = [verge_paper]
    espn_paper = newspaper.build("https://www.espn.com/",  memorize_articles=True, language = 'en')
    print("espn_paper built", file=sys.stderr)
    sportP = [espn_paper]
    et_paper = newspaper.build("https://ew.com/",  memorize_articles=True, language = 'en')
    print("ew_paper built", file=sys.stderr)
    entertainmentP = [et_paper]
    crypto_paper = newspaper.build("https://cryptonews.com/",  memorize_articles=True, language = 'en')
    print("crypto_paper built", file=sys.stderr)
    cryptoP = [crypto_paper]
    climate_paper = newspaper.build("https://www.climatechangenews.com/",  memorize_articles=True, language = 'en')
    print("climate_paper built", file=sys.stderr)
    climateP = [climate_paper]
    print("all papers built", file=sys.stderr)
    count = 0
    article_list = []
    print("Starting pool threading", file=sys.stderr)
    print("Starting pool for papers", file=sys.stderr)
    news_pool.set(papers, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for papers", file=sys.stderr)
    print("Starting pool for techp", file=sys.stderr)
    news_pool.set(techP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for techp", file=sys.stderr)
    print("Starting pool for sportp", file=sys.stderr)
    news_pool.set(sportP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for sportp", file=sys.stderr)
    print("Starting pool for entertainmentp", file=sys.stderr)
    news_pool.set(entertainmentP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for entertainmentp", file=sys.stderr)
    print("Starting pool for cryptop", file=sys.stderr)
    news_pool.set(cryptoP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for cryptop", file=sys.stderr)
    print("Starting pool for climatep", file=sys.stderr)
    news_pool.set(climateP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for climatep", file=sys.stderr)
    print("Saving articles to mongodb", file=sys.stderr)
    for build in papers:
        for news in (build.articles):
            if "politics" in news.url and "cnnespanol" not in news.url:
                news.parse()
                #call on text summarizer with text of article
                textSum = text_summarizer(news.text)
                if "apnews.com" in news.url:
                    textSum = news.text
                article = NewsArticle(
                    link = news.url,
                    image = news.top_image,
                    wing = "political",
                    #text = news.text,
                    text = textSum,
                    title = news.title
                    ).save()
            #email_services = ["hotmail", "gmail", "yahoo"] 
            #email_contains_service = any(email_service in user_email for email_service in email_services)
            elif ["stock", "net", "loss", "Q1", "Q2", "Q3", "Q4", "Gain"] in word_tokenize(news.text):
                news.parse()
                #call on text summarizer with text of article
                textSum = text_summarizer(news.text)
                if "apnews.com" in news.url:
                    textSum = news.text
                article = NewsArticle(
                    link = news.url,
                    image = news.top_image,
                    wing = "buisness",
                    text = textSum,
                    title = news.title
                    ).save()
            elif "covid" in news.url or "corona" in news.url:
                news.parse()
                #call on text summarizer with text of article
                textSum = text_summarizer(news.text)
                if "apnews.com" in news.url:
                    textSum = news.text
                article = NewsArticle(
                    link = news.url,
                    image = news.top_image,
                    wing = "covid",
                    text = textSum,
                    title = news.title
                    ).save()
                count += 1
    for build in techP:
        for news in (build.articles):
            news.parse()
            #call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            if "apnews.com" in news.url:
                    textSum = news.text
            if "#comments" not in news.url:
                article = NewsArticle(
                    link = news.url,
                    image = news.top_image,
                    wing = "tech",
                    text = textSum,
                    title = news.title
                    ).save()
    for build in sportP:
        for news in (build.articles):
            news.parse()
            #call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            article = NewsArticle(
                link = news.url,
                image = news.top_image,
                wing = "sports",
                text = textSum,
                title = news.title
                ).save()
    for build in entertainmentP:
        for news in (build.articles):
            news.parse()
            #call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            article = NewsArticle(
                link = news.url,
                image = news.top_image,
                wing = "entertainment",
                text = textSum,
                title = news.title
                ).save()
    for build in cryptoP:
        for news in (build.articles):
            news.parse()
            #call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            article = NewsArticle(
                link = news.url,
                image = news.top_image,
                wing = "crypto",
                text = textSum,
                title = news.title
                ).save()
    for build in climateP:
        for news in (build.articles):
            news.parse()
            #call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            article = NewsArticle(
                link = news.url,
                image = news.top_image,
                wing = "climate",
                text = textSum,
                title = news.title
                ).save()            
    print("Articles saved in mongodb", file=sys.stderr)