def test_cache_categories(self): """ builds two same source objects in a row examines speeds of both """ s = Source('http://yahoo.com') s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] # reset and try again with caching s.set_categories() assert sorted(s.category_urls()) == sorted(saved_urls)
def test_cache_categories(self): """ builds two same source objects in a row examines speeds of both """ s = Source("http://yahoo.com") s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] # reset and try again with caching s.set_categories() assert sorted(s.category_urls()) == sorted(saved_urls)
def test_cache_categories(self): """Builds two same source objects in a row examines speeds of both """ url = 'http://uk.yahoo.com' mock_response_with(url, 'yahoo_main_site') s = Source(url) s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] s.set_categories() assert sorted(s.category_urls()) == sorted(saved_urls)
def test_cache_categories(self): """Builds two same source objects in a row examines speeds of both """ url = 'http://uk.yahoo.com' html = mock_resource_with('yahoo_main_site', 'html') s = Source(url) s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] s.set_categories() self.assertCountEqual(saved_urls, s.category_urls())
def main(): source="The Huffington Post" delivery_time="6:00" #config = Config() #config.memoize_articles = False hpost = Source("http://huffingtonpost.com/theworldpost", memoize_articles=False) hpost.download() hpost.parse() hpost.set_categories() hpost.categories = [hpost.categories[0]] hpost.categories[0].url = "http://huffingtonpost.com/theworldpost" hpost.download_categories() hpost.parse_categories() hpost.set_feeds() hpost.download_feeds() hpost.generate_articles() #for c in hpost.categories: # print(c) #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False) #news_pool.set([guardian], threads_per_source=2) #news_pool.join() #print(hpost.size()) for article in [x for x in hpost.articles if re.match(".*html.*world.*", x.url) is not None]: url = article.url a = Article(url, language='en') a.download() for i in range(10): if a.is_downloaded: break else: a.download() try: a.parse() a.nlp() except: print("Error: Not parsed/downloaded correctly.") continue html = a.html summary = a.summary keywords = a.keywords title = a.title text = a.text #print(html) #print(text) #print(summary) #print(keywords) #print(title) #print(a.publish_date) if source in title: title = None #print(title) findtime = re.search(r'Posted.*<time datetime="(.*?)">', html) if findtime is None: date=None time=None else: date,time = findtime.group(1).split("T") date = date.split("-") date[0], date[1], date[2] = date[1], date[2], date[0] date = "/".join(date) time = ":".join(time.split("-")[0].split(":")[0:2]) date_time = str(date) + " " + str(time) #print(title) #print(date_time) date_obj = datetime.datetime.strptime(date_time,'%m/%d/%Y %H:%M') #print(date_obj.strftime('%Y/%m/%d %I:%M %p')) try: article = { 'headline': title, 'url': url, 'text': text, 'date': date_obj } newspaper_article('Huffington Post', article, keywords=keywords) except Exception as ex: print 'Article could not be created due to following error' print ex
def main(): source="The Washington Post" delivery_time="6:00" #config = Config() #config.memoize_articles = False wpost = Source("http://washingtonpost.com/world", memoize_articles=False) wpost.download() wpost.parse() wpost.set_categories() wpost.categories = [wpost.categories[0]] wpost.categories[0].url = "http://washingtonpost.com/world" wpost.download_categories() wpost.parse_categories() wpost.set_feeds() wpost.download_feeds() wpost.generate_articles() #for c in wpost.categories: # print(c) #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False) #news_pool.set([guardian], threads_per_source=2) #news_pool.join() #print(wpost.size()) for article in [x for x in wpost.articles if re.match(".*com/world/.*", x.url) is not None and re.match(".*gallery.html", x.url) is None]: url = article.url a = Article(url, language='en') a.download() for i in range(10): if a.is_downloaded: break else: a.download() try: a.parse() a.nlp() except: print("Error: Not parsed/downloaded correctly.") continue html = a.html summary = a.summary keywords = a.keywords title = a.title text = a.text #print(html) #print(text) #print(summary) #print(keywords) #print(title) #print(a.publish_date) if source in title: title = None #print(title) if a.publish_date is not None: date = str(a.publish_date).split()[0].split("-") #print(date) date[0], date[1], date[2] = date[1], date[2], date[0] date = "/".join(date) else: date = None time = re.search(r'<span class="pb-timestamp">(.*?)</span>' , html) if time is None: print(url) date = None else: time = time.group(1) if ":" not in time: time = delivery_time else: time = time.split(" at ")[1] time = datetime.datetime.strptime(time,'%I:%M %p').strftime('%H:%M') date_time = str(date) + " " + str(time) #print(date_time) date_obj = datetime.datetime.strptime(date_time,'%m/%d/%Y %H:%M') #print(date_obj.strftime('%Y/%m/%d %I:%M %p')) #print(text) #print(date_time) #TODO: Add stuff to the DB try: article = { 'headline': title, 'url': url, 'text': text, 'date': date_obj } newspaper_article(source, article, keywords=keywords) except Exception as ex: print 'Article could not be created due to following error' print ex
def main(): source = "The Washington Post" delivery_time = "6:00" #config = Config() #config.memoize_articles = False wpost = Source("http://washingtonpost.com/world", memoize_articles=False) wpost.download() wpost.parse() wpost.set_categories() wpost.categories = [wpost.categories[0]] wpost.categories[0].url = "http://washingtonpost.com/world" wpost.download_categories() wpost.parse_categories() wpost.set_feeds() wpost.download_feeds() wpost.generate_articles() #for c in wpost.categories: # print(c) #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False) #news_pool.set([guardian], threads_per_source=2) #news_pool.join() #print(wpost.size()) for article in [ x for x in wpost.articles if re.match(".*com/world/.*", x.url) is not None and re.match(".*gallery.html", x.url) is None ]: url = article.url a = Article(url, language='en') a.download() for i in range(10): if a.is_downloaded: break else: a.download() try: a.parse() a.nlp() except: print("Error: Not parsed/downloaded correctly.") continue html = a.html summary = a.summary keywords = a.keywords title = a.title text = a.text #print(html) #print(text) #print(summary) #print(keywords) #print(title) #print(a.publish_date) if source in title: title = None #print(title) if a.publish_date is not None: date = str(a.publish_date).split()[0].split("-") #print(date) date[0], date[1], date[2] = date[1], date[2], date[0] date = "/".join(date) else: date = None time = re.search(r'<span class="pb-timestamp">(.*?)</span>', html) if time is None: print(url) date = None else: time = time.group(1) if ":" not in time: time = delivery_time else: time = time.split(" at ")[1] time = datetime.datetime.strptime(time, '%I:%M %p').strftime('%H:%M') date_time = str(date) + " " + str(time) #print(date_time) date_obj = datetime.datetime.strptime(date_time, '%m/%d/%Y %H:%M') #print(date_obj.strftime('%Y/%m/%d %I:%M %p')) #print(text) #print(date_time) #TODO: Add stuff to the DB try: article = { 'headline': title, 'url': url, 'text': text, 'date': date_obj } newspaper_article(source, article, keywords=keywords) except Exception as ex: print 'Article could not be created due to following error' print ex
import logging
from random import randint

from newspaper import Source  # Providers is a project-local helper

def paper(self):
    '''Get newspaper articles; the default source is the `herald` newspaper.
    Defaults to articles of this month and year.

        import newspaperzw
        news = newspaperzw.news()
    '''
    if self.summary and not self.nlp:
        # summaries require NLP support; the `nltk` module is missing
        raise Exception(self.error_msg)

    news_source = Providers().getUrl(self.provider).strip()
    name = Source(news_source, self.config)
    name.build()
    name.download()
    name.parse()
    name.download_articles()
    logging.debug(f"News source built and downloaded. url: {news_source}")

    news_data = {}
    news_article = []
    counter = 0
    for article in name.article_urls():
        images = ""
        keywords = ""
        try:
            name.articles[counter].download()
            name.articles[counter].parse()
            logging.debug(f"Article #{counter} downloaded and parsed successfully")
        except Exception:
            logging.error(f"Error downloading and parsing article #{counter}; continuing...")
            counter += 1
            continue

        title = name.articles[counter].title
        date_pub = name.articles[counter].publish_date
        top_image = name.articles[counter].top_image
        link = name.articles[counter].url
        text = name.articles[counter].text

        if self.nlp:
            # do nlp stuff
            name.articles[counter].nlp()
            summary = name.articles[counter].summary
            for words in name.articles[counter].keywords:
                keywords += str(words) + ','
            logging.debug("summary flag enabled. NLP summary obtained successfully")

        # add to the news pool; only add news of this year and month
        # date_pub format = 10-04-2018 21:28:09
        data = {}
        if self.nlp:
            data.update({
                "article_id": randint(555, 999),
                "title": title,
                "published": date_pub,
                "image": top_image,
                "news": text,
                "summary": summary,
                "keywords": keywords.rstrip(','),
                "url": link
            })
            logging.debug("article data with summary saved to news pool!")
        else:
            data.update({
                "article_id": randint(555, 999),
                "title": title,
                "published": date_pub,
                "image": top_image,
                "news": text,
                "url": link
            })
            logging.debug("article data added to news pool")
        news_article.append(data)
        counter += 1  # move on to the next article

    # build the main news storage
    news_data.update({
        'source': name.brand,
        'domain': name.domain,
        'news': news_article
    })
    logging.debug("News main data pool created on success")
    return news_data
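# A usage sketch based on the docstring above; the `news()` entry point and
# the idea that it returns an object exposing `paper()` are assumptions drawn
# from that docstring, not a verified API:
import newspaperzw

news = newspaperzw.news()  # defaults to the `herald` provider
data = news.paper()
print(data['source'], data['domain'])
for item in data['news']:
    # each item carries title, published date, image, text, and url
    print(item['title'], item['url'])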