def get_articles():
    # Get Chinese articles from each domain
    for url in open("list_ch.txt", 'r'):
        try:
            paper = newspaper.build(url, memoize_articles=True, language='zh')
            match_object = re.search(r'http://([^/]+)/', url)
            domain = match_object.group(1)
            for article in paper.articles:
                get_meta(article, domain)
        except Exception:
            pass

    # Get English articles from each domain
    for url in open("list_en.txt", 'r'):
        try:
            paper = newspaper.build(url, memoize_articles=True, language='en')
            match_object = re.search(r'http://([^/]+)/', url)
            domain = match_object.group(1)
            for article in paper.articles:
                get_meta(article, domain)
        except Exception:
            pass

    # Get Chinese articles from RSS feeds
    for url in open("list_rss_ch.txt", 'r'):
        try:
            feed = feedparser.parse(url)
            match_object = re.search(r'http://([^/]+)/', url)
            domain = match_object.group(1)
            chinese = True
            for post in feed.entries:
                link = post.link
                get_meta_rss(link, domain, chinese)
        except Exception:
            pass

    # Get English articles from RSS feeds
    for url in open("list_rss_en.txt", 'r'):
        try:
            feed = feedparser.parse(url)
            match_object = re.search(r'http://([^/]+)/', url)
            domain = match_object.group(1)
            chinese = False
            for post in feed.entries:
                link = post.link
                get_meta_rss(link, domain, chinese)
        except Exception:
            pass

    print("success!")
    return
def CheckForMoreArticles():
    print('Checking for more articles from CNN')
    cnn = newspaper.build(u'http://us.cnn.com/')
    print('Found ' + str(cnn.size()) + ' new articles from CNN')
    print('Checking for more articles from SMH')
    smh = newspaper.build(u'http://smh.com.au/')
    print('Found ' + str(smh.size()) + ' new articles from SMH')
    print('Checking for more articles from Slashdot')
    slashdot = newspaper.build(u'http://slashdot.org/')
    print('Found ' + str(slashdot.size()) + ' new articles from Slashdot')
    print('Checking for more articles from BBC')
    bbc = newspaper.build(u'http://www.bbc.com/')
    print('Found ' + str(bbc.size()) + ' new articles from BBC')
    return cnn.articles + smh.articles + slashdot.articles + bbc.articles
def populate_sites(sites):
    """ (list of str) -> list of [str, newspaper.source.Source]
    Parses the sites using the newspaper library and returns a list of
    sites populated with their available articles.

    Keyword arguments:
    sites -- list of [name, url] for each site
    """
    new_sites = []
    for s in range(len(sites)):
        # Check for any new command on the communication stream
        check_command()
        # Duplicate the name of the site
        new_sites.append([sites[s][0]])
        # Use the url and populate the site with articles
        new_sites[s].append(
            newspaper.build(
                sites[s][1],
                memoize_articles=False,
                keep_article_html=True,
                fetch_images=False,
                language="en",
                number_threads=1,
            )
        )
        # Append the site url
        new_sites[s].append(sites[s][1])
    return new_sites
def get_news_data():
    # Get the lists of source URLs and keywords from settings
    urllist: SettingsList = get_safe_settingslist('CryptoNewsUrls', urls)
    keylist: SettingsList = get_safe_settingslist('CrytoNewsKeywords', keywords)
    logger_name = 'main_scraper.' + 'bitcoin_news'
    logger = logging.getLogger(logger_name)
    for url in urllist.list:
        paper = newspaper.build(url, language='en')
        for article in paper.articles:
            try:
                article.download()
                article.parse()
                keys = [key for key in keylist.list if key in article.title.lower()]
                if len(keys) > 0:
                    # Check whether the article already exists
                    obj = CryptoNews.objects(title=article.title).first()
                    if obj is None:
                        news = CryptoNews()
                        news.title = article.title
                        news.description = article.meta_description
                        news.text = article.text
                        news.tags = keys
                        news.url = article.url
                        news.save()
                        logger.info(article.title)
            except BaseException as e:
                logger.error('Cryptonews error: {0}'.format(e))
def fetch_article_url(self, memoize=False):
    paper = newspaper.build(self.url, memoize_articles=memoize)
    self.narticles = paper.size()
    print('article count: %s' % self.narticles)
    pipe = redis.pipeline()
    date_fmt = r'\d{4}[-/]\d{2}[-/]\d{2}'
    for article in paper.articles:
        url = article.url
        print(url)
        date_keys = re.findall(date_fmt, url)
        if not date_keys:
            continue
        date_key = date_keys[0]
        key = self.key(date_key)
        pipe.sadd(key, url)
        if self.save and date_key in self.get_valid_days():
            print('processing....')
            try:
                article.download()
                article.parse()
                key = self.key(date_key, article.title)
                pipe.set(key, article.text)
            except Exception:
                pass
    pipe.execute()
def discover_feeds_urls(feed_url):
    """ Try to discover more feed URLs in one. """

    LOGGER.info(u'Trying to discover new RSS/Atom feeds from %s…', feed_url)

    # Initialize before the try block so a failed build leaves an empty set
    urls_to_try = set()

    try:
        site = newspaper.build(feed_url)
        urls_to_try = set(site.feed_urls())
    except Exception:
        LOGGER.exception(u'Newspaper did not help finding feeds '
                         u'from “%s”', feed_url)

    created = []
    known = []

    for url in urls_to_try:
        result = create_feeds_from_url(url, recurse=False)
        if result:
            # Keep feeds only if they have just been created
            created.extend(x[0] for x in result if x[1])
            known.extend(x[0] for x in result if not x[1])

    LOGGER.info(u'Done discovering %s: %s feeds created, %s already known.',
                feed_url, len(created), len(known))
def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs=True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)
    print("Created")

    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print(cnn_paper.size())

    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print(art.publish_date)
        print(art.text)
        print("Article" + str(i))
        print(art.publish_date is not None)
        print(art.text is not None)
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                    'domain': 'CNN',
                    'date': utc.localize(art.publish_date),
                    'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print("Doc" + str(i))
            except Exception:
                print("Doc not accepted")
def build_newspaper(self):
    ''' Builds a newspaper for each URL in self.site_urls using the
    newspaper library '''
    for site_url in self.site_urls:
        self.built_newspapers.append(newspaper.build(site_url,
                                                     memoize_articles=False))
def readArticleCollectionFile(site, filename, c):
    f = open(filename, 'w')
    paper = newspaper.build(site, memoize_articles=False)
    print(len(paper.articles))
    i = 0
    for article in paper.articles:
        article.download()
        article.parse()
        # Strip non-ASCII characters; decode back to str for writing
        title = article.title.encode('ascii', 'ignore').decode('ascii')
        text = article.text.encode('ascii', 'ignore').decode('ascii')
        #article.nlp()
        #keywords = article.keywords
        #summary = article.summary.encode('ascii', 'ignore')
        f.write('<article>\n')
        f.write('<class>' + str(c) + '</class>\n')
        f.write('<title>' + title + '</title>\n')
        f.write('<text>\n' + text + '</text>\n')
        #f.write('<keywords>' + str(keywords) + '</keywords>\n')
        #f.write('<summary>' + summary + '</summary>\n')
        f.write('</article>\n')
        i = i + 1
        if i > 40:
            break
    f.close()
def _get_articles(url):
    url = url.strip()
    for file in os.listdir(newspaper.settings.ANCHOR_DIRECTORY):
        # clearing newspaper categories cache
        os.unlink(os.path.join(newspaper.settings.ANCHOR_DIRECTORY, file))
    articles = newspaper.build(url).articles
    if url.split('.')[1] == 'jetbrains':
        # at least for now. Newspaper is a bit buggy on the JetBrains site
        articles = []
        for page in range(10):
            soup = BeautifulSoup(requests.get(url + '/page/' + str(page)).content,
                                 'html.parser')
            for title in soup.find_all('h2', {'class': 'entry-title'}):
                articles.append(NewspaperArticle(title.find('a').get('href')))
    for article in articles:
        article.download()
        if not article.is_downloaded:
            print("Failed to download article:", article.url)
            continue
        article.parse()
        article.nlp()
        publish_date = article.publish_date
        if publish_date is None and url.split('.')[1] == 'jetbrains':
            # newspaper is very buggy on the JetBrains blog and often cannot
            # parse the publish date, so scrape it directly
            soup = BeautifulSoup(requests.get(article.url).content, 'html.parser')
            publish_date = soup.find('span', {'class': 'entry-date'}).getText()
        print(publish_date)
        yield DataMiningArticle(article.html, article.title, article.summary,
                                article.text, "", article.canonical_link,
                                "", publish_date)
def Calculate():
    try:
        news = request.form['inputNews'].lower()
        topic = request.form['inputTopic']
        category = request.form['inputCategory']
        print(news + "\t" + topic + "\t" + category)

        from havenondemand.hodindex import HODClient
        client = HODClient(apikey='6b1f8438-56c7-45e0-98a6-6742c1be0d65',
                           apiversiondefault=1)

        """def get_bias(url):
            data = {'url': url}
            r = client.post('analyzesentiment', data)
            sentiment = r.json()['aggregate']['sentiment']
            score = r.json()['aggregate']['score']
            print(url + " | " + sentiment + " | " + str(score))
            return score"""

        paper = newspaper.build("http://" + news + ".com", language='en',
                                memoize_articles=False)
        url = []
        for article in paper.articles:
            url.append(article.url)

        cumulative_score = 0.0
        countNegative = 0
        countPositive = 0
        countNeutral = 0

        """import multiprocessing as mp
        p = mp.Pool(3)
        res = p.map(get_bias, url)"""

        for u in url:
            data = {'url': u}
            r = client.post('analyzesentiment', data)
            sentiment = r.json()['aggregate']['sentiment']
            score = r.json()['aggregate']['score']
            print(u + " | " + sentiment + " | " + str(score))
            cumulative_score += score
            if sentiment == 'positive':
                countPositive += 1
            elif sentiment == 'negative':
                countNegative += 1
            elif sentiment == 'neutral':
                countNeutral += 1

        print(cumulative_score)
        print(cumulative_score / len(url))
    except Exception as e:
        return json.dumps({'error': str(e)})
    return news + topic + category
def validate_site(site):
    try:
        s = newspaper.build(site, memoize_articles=False,
                            keep_article_html=True, fetch_images=False,
                            language="en")
        if s.size() == 0:
            raise ValidationError("%s is not a valid Referring Site!" % site)
    except Exception:
        raise ValidationError("%s is not a valid Referring Site!" % site)
def get_article_urls(self, rclient, source_url):
    paper = newspaper.build(
        source_url,
        memoize_articles=False,
        fetch_images=False,
        request_timeout=self.timeout,
        number_threads=self.threads,
        language=self.language,
        browser_user_agent=self.user_agent)
    urls = ((a.url, a.title) for a in paper.articles[:self.max_articles])
    # Skip URLs that are already recorded in Redis
    return filterfalse(lambda x: rclient.exists(x[0]), urls)
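# Hypothetical usage sketch for get_article_urls above — an addition, not part
# of the original corpus. It assumes `from itertools import filterfalse`, a
# local Redis server, and an owner object ("crawler") exposing the timeout,
# threads, language, user_agent, and max_articles attributes the method reads.
import redis

rclient = redis.StrictRedis(host='localhost', port=6379, db=0)
for url, title in crawler.get_article_urls(rclient, 'http://cnn.com'):
    print(url, title)
    rclient.set(url, title or '')  # mark as seen so the next run skips it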
def parse_seccion(self, response):
    el_territorio = newspaper.build(
        'https://www.elterritorio.com.ar/misiones-1-seccion')
    noticias = el_territorio.articles
    for noticia in noticias:
        request = scrapy.Request(url=noticia.url, callback=self.parse_noticia)
        yield request
def main():
    source_url, rabbit_url = parse_config()
    paper = newspaper.build(source_url)
    publisher = Publisher(
        rabbit_url=rabbit_url,
        publish_interval=0.25,
        article_urls=paper.article_urls())
    publisher.run()
def get_newspapers(source_urls):
    papers = []
    for url in source_urls:
        papers.append(newspaper.build(url))
    # Download all sources concurrently with newspaper's shared thread pool
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()
    return papers
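# Hypothetical usage sketch — an addition, not part of the original corpus.
# After get_newspapers() returns, news_pool has already downloaded every
# article's HTML, so only parse() is still needed per article. The source
# URLs below are placeholders.
papers = get_newspapers(['http://cnn.com', 'http://slate.com'])
for paper in papers:
    if paper.articles:
        first = paper.articles[0]
        first.parse()  # download() already happened in the pool
        print(paper.brand, '->', first.title)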
def newspaperproxyjs(domain):
    logger.info(' - '.join(['NEWSPAPER PROXY JS', domain]))
    # Route the build through the ScraperAPI proxy with JS rendering enabled
    return newspaper.build(
        'http://api.scraperapi.com?key=' + scraperapi_api_key +
        '&render=true&url=http://' + domain,
        memoize_articles=False, request_timeout=70, number_threads=2)
def newspaper_url(url):
    web_paper = newspaper.build(url, language="vi", memoize_articles=False)
    print("Extracting news page urls!!!")
    for article in web_paper.articles:
        print("News page url:", article.url)
        newspaper_information(article.url)
    print("Acquired %s articles in total" % web_paper.size())
def __generateArticles(self):
    source = build(self.__TargerUrl, language='id', memoize_articles=False)
    containers = []
    for article in source.articles:
        _href = article.url.split('?')[0]
        if _href not in containers and self.__ValidUrlRegex.match(_href):
            containers.append(_href)
    self.__Articles = containers
def test_newspaper(url):
    paper = newspaper.build(url)
    for article in paper.articles:
        print(article.url)
    for category in paper.category_urls():
        print(category)
def extract_article_urls(site_urls):
    article_urls = []
    for site_url in site_urls:
        paper = newspaper.build(site_url)
        urls = [a.url for a in paper.articles if '-' in a.url]
        print('Found {} articles on {}'.format(len(urls), site_url))
        article_urls += urls
    return article_urls
def processGeneric(self, feedid, url):
    p = newspaper.build(url)
    for a in p.articles:
        a.download()
        log.info('retrieving %s', a.title)
        self.articles.append(
            ArticleData(feedid, a.title, a.url, "",
                        str(datetime.datetime.utcnow())))
def get_article_links(url):
    """
    Takes a URL with articles and returns a list of URLs of all the
    contained articles.

    :param url: A URL
    :rtype: list
    """
    paper = newspaper.build(url, memoize_articles=False, language='zh')
    return [article.url for article in paper.articles]
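# Minimal usage sketch — an addition, not part of the original corpus; the
# front-page URL is only an example of a Chinese-language news site.
if __name__ == '__main__':
    for link in get_article_links('https://news.sina.com.cn/'):
        print(link)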
def getArticlesFromSource(self, source, external=False):
    paper = newspaper.build(source, memoize_articles=True,
                            browser_user_agent='BEEVA/Emojinews crawler 1.0')
    # Filter articles that are out of the source domain
    news = filter(lambda x: x.url.startswith(source) or external,
                  paper.articles)
    news = map(self.cleanNames, news)
    # Keep only articles that actually have a title
    news = filter(lambda x: x.title, news)
    news = map(lambda x: {'url': x.url, 'title': x.title}, news)
    return news
def __init__(self, src):
    self.url = src['url']
    self.domains = src['domains']
    self.left_cut_regex = src['left_cut_regex']
    self.right_cut_regex = src['right_cut_regex']
    self.paper = newspaper.build(self.url, language=config.LANGUAGE,
                                 memoize_articles=config.MEMOIZE_ARTICLES)
def update_db(self):
    self.paper = newspaper.build(self.news_source_url)
    for article in self.paper.articles:
        self.article = article
        self.get_article()
def news(source):
    paper = newspaper.build(source, memoize_articles=False)
    first_article = paper.articles[0]
    first_article.download()
    first_article.parse()
    json_str = staticLine(first_article.title)
    # msgs.append(json_str)
    return json_str
def build_source(feed: str):
    source = newspaper.build(feed,
                             memoize_articles=not self.include_old,
                             keep_article_html=True)
    if len(source.articles) == 0 and self.include_old:
        tqdm.write("Warning: The source `" + feed +
                   "` appears to have no articles.")
    articles.extend(source.articles)
def get_source_list():
    """Build newspaper objects for scraping. Returns a list."""
    # Build paper objects to be downloaded and parsed for data extraction.
    tech_crunch = newspaper.build('https://www.techcrunch.com/',
                                  memoize_articles=True, language='en')
    fox = newspaper.build('https://www.foxnews.com/',
                          memoize_articles=False, language='en')
    nytimes = newspaper.build('http://nytimes.com',
                              memoize_articles=False, language='en')
    wsj = newspaper.build('http://wsj.com',
                          memoize_articles=True, language='en')
    bbc = newspaper.build('http://bbcnews.com',
                          memoize_articles=True, language='en')
    cnn = newspaper.build('http://cnn.com',
                          memoize_articles=True, language='en')
    ap = newspaper.build('https://www.ap.org/en-us/',
                         memoize_articles=True, language='en')
    papers = [tech_crunch, fox, nytimes, wsj, bbc, cnn, ap]
    return papers
def main(fcsv):
    websites = ['https://www.horoscope.com/inspiration']
    for site in websites:
        print(site)
        # web = newspaper.build(site, config)
        # web = newspaper.build(site, memoize_articles=False, MIN_WORD_COUNT=1000)
        web = newspaper.build(site, memoize_articles=False)
        # web = newspaper.build(site)
        scrape(web, fcsv)
def scrapeNews():
    data = {}
    data['newspapers'] = {}

    # Load the JSON file with the news sites
    with open('NewsPapers.json') as companyList:
        companies = json.load(companyList)

    count = 1
    # Iterate through each news company
    for company, value in companies.items():
        # Use the python newspaper library to extract articles
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                # Any kind of 404 error should not stop the program altogether
                print(e)
                continue
            if content.publish_date is None:
                # Entries without a publish date are usually not news articles
                # (ads, video pages, etc.), so skip them.
                print(count, " Article has date of type None...")
                continue
            article = {}
            article['title'] = content.title
            article['text'] = content.text
            article['link'] = content.url
            article['published'] = content.publish_date.isoformat()
            newsPaper['articles'].append(article)
            del article
            count += 1
        count = 1
        data['newspapers'][company] = newsPaper

    # Finally, save the articles as a JSON file.
    try:
        with open('scraped_articles.json', 'w') as JSONfile:
            json.dump(data, JSONfile)
        del data
    except Exception as e:
        print(e)
def get_article_text(link):
    text = []
    paper = newspaper.build(link)
    for article in paper.articles:
        # Articles must be downloaded and parsed before .text is populated
        article.download()
        article.parse()
        text.append(article.text)
    return text
def extract_articles(news_source='http://cnn.com', num=10):
    news_source = newspaper.build(news_source, memoize_articles=False)
    output = []
    for art in news_source.articles[:num]:
        art.download()
        art.parse()
        raw = art.text
        output = output + nltk.word_tokenize(raw)
    return output
def scrapeFarRight():
    farRight = newspaper.build('http://www.breitbart.com/big-government/',
                               memoize_articles=False)
    i = 0
    for article in farRight.articles:
        article.download()
        article.parse()
        with open("farRight/article" + str(i) + ".txt", 'a') as out:
            out.write(article.text + '\n')
        i = i + 1
def scrapeFarLeft():
    farLeft = newspaper.build('http://dailykos.com/blogs/main')
    i = 0
    for article in farLeft.articles:
        article.download()
        article.parse()
        with open("farLeft/article" + str(i) + ".txt", 'a') as out:
            out.write(article.text + '\n')
        i = i + 1
def get_articles(self, news=None):
    assert news is not None, "news is not defined."
    assert news.url is not None, "news.url is not defined."
    url = copy.copy(news.url)
    url = NetworkTools.full_url(url)
    paper = newspaper.build(url, memoize_articles=False)
    articles = copy.deepcopy(paper.articles)
    return articles
def try_newspaper():
    vne = newspaper.build(TARGET)
    for article in vne.articles:
        print(article.url)
    # Download and parse the single test article once, not once per URL
    a = Article(ARTICLE, language='vi')
    a.download()
    a.parse()
    print("FIN")
def RefreshArticles(domain, directory, personality, log=Print, timeout=None):
    start_time = time.time()
    arts = np.build(domain, language='en', memoize_articles=False).articles
    log(domain + " has %d articles" % len(arts))
    for art in arts:
        if timeout is not None and time.time() - start_time > timeout:
            log("Timeout after %f seconds" % (time.time() - start_time))
            return
        DownloadAndProcess(art.url, directory, personality, log=log)
def __init__(self, url):
    try:
        nltk.download('punkt')
    except Exception as ex:
        print(ex)
    self.url = url
    self.isstarted = False
    self.ispaused = False
    self.newspaper = newspaper.build(url)
def collect_article_urls():
    url = request.args.get('url')
    paper = newspaper.build(url, memoize_articles=False)
    article_urls = set()
    for article in paper.articles:
        article_urls.add(article.url)
    return json.dumps(list(article_urls)), 200, {
        'Content-Type': 'application/json'
    }
def scrapeCenterLeft():
    centerL = newspaper.build('http://bbc.com/news/world/us_and_canada')
    i = 0
    for article in centerL.articles:
        article.download()
        article.parse()
        with open("centerLeft/article" + str(i) + ".txt", 'a') as out:
            out.write(article.text + '\n')
        i = i + 1
def crawl_web(url):
    paper = newspaper.build(url, memoize_articles=False, language='en')
    for content in paper.articles:
        if check_exist_url(content.url):
            article, text = crawl_web_page(content)
            if article and text and check_exist(article['id']):
                load_to_disk(article['id'], text)
                load_to_db(article)
def test_download_works(self):
    config = Configuration()
    config.memoize_articles = False
    slate_paper = newspaper.build('http://slate.com', config=config)
    tc_paper = newspaper.build('http://techcrunch.com', config=config)
    espn_paper = newspaper.build('http://espn.com', config=config)
    print('Slate has %d articles TC has %d articles ESPN has %d articles'
          % (slate_paper.size(), tc_paper.size(), espn_paper.size()))

    papers = [slate_paper, tc_paper, espn_paper]
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()

    print('Downloaded Slate mthread len', len(slate_paper.articles[0].html))
    print('Downloaded ESPN mthread len', len(espn_paper.articles[-1].html))
    print('Downloaded TC mthread len', len(tc_paper.articles[1].html))
def scrapeModerate():
    moderate = newspaper.build('http://cbsnews.com/us/', memoize_articles=False)
    i = 0
    for article in moderate.articles:
        article.download()
        article.parse()
        with open("moderate/article" + str(i) + ".txt", 'a') as out:
            out.write(article.text + '\n')
        i = i + 1
def scrapeCenterRight():
    centerR = newspaper.build('http://foxnews.com/us.html',
                              memoize_articles=False)
    i = 0
    for article in centerR.articles:
        article.download()
        article.parse()
        with open("centerRight/article" + str(i) + ".txt", 'a') as out:
            out.write(article.text + '\n')
        i = i + 1
def searchSite(self, siteURL):
    """searchSite collects the URLs of all known articles for the given site"""
    paper = newspaper.build(siteURL, language=self.language)
    self.articleURLs[siteURL] = []
    for article in paper.articles:
        self.articleURLs[siteURL].append(article.url)
def __init__(self, site, memoize_articles=False):
    self.site = site
    self.paper = newspaper.build(site, memoize_articles=memoize_articles,
                                 browser_user_agent=AGENT)
    self.purge_categories()
    self.purge_articles(self.paper.categories_to_articles())
def test_download_works(self):
    config = Configuration()
    config.memoize_articles = False
    slate_paper = newspaper.build('http://slate.com', config=config)
    tc_paper = newspaper.build('http://techcrunch.com', config=config)
    espn_paper = newspaper.build('http://espn.com', config=config)
    print('slate has %d articles tc has %d articles espn has %d articles'
          % (slate_paper.size(), tc_paper.size(), espn_paper.size()))

    papers = [slate_paper, tc_paper, espn_paper]
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()

    print('Downloaded slate mthread len', len(slate_paper.articles[0].html))
    print('Downloaded espn mthread len', len(espn_paper.articles[-1].html))
    print('Downloaded tc mthread len', len(tc_paper.articles[1].html))
def _handle_fallback(company, value, count, limit):
    """This is the fallback method if an RSS-feed link is not provided.

    It uses the python newspaper library to extract articles.
    """
    print(f"Building site for {company}")
    paper = newspaper.build(value["link"], memoize_articles=False)
    news_paper = {"link": value["link"], "articles": []}
    none_type_count = 0
    for content in paper.articles:
        if count > limit:
            break
        try:
            content.download()
            content.parse()
        except Exception as err:
            print(err)
            print("continuing...")
            continue
        # Again, for consistency, if there is no found publish date the
        # article will be skipped.
        #
        # After 10 downloaded articles from the same newspaper without a
        # publish date, the company will be skipped.
        if content.publish_date is None:
            print(f"{count} Article has date of type None...")
            none_type_count = none_type_count + 1
            if none_type_count > 10:
                print("Too many noneType dates, aborting...")
                none_type_count = 0
                break
            count = count + 1
            continue
        article = {
            "title": content.title,
            "text": content.text,
            "link": content.url,
            "published": content.publish_date.isoformat(),
        }
        news_paper["articles"].append(article)
        message = client.messages.create(
            body=content.title + content.url,
            from_=from_whatsapp_number,
            to=to_whatsapp_number
        )
        print(message.sid)
        print(
            f"{count} articles downloaded from {company} using newspaper, "
            f"url: {content.url}"
        )
        count = count + 1
        none_type_count = 0
    return count, news_paper
def validate_site(site):
    try:
        s = newspaper.build(site, memoize_articles=False,
                            keep_article_html=True, fetch_images=False,
                            language='en')
        if s.size() == 0:
            raise ValidationError('%s is not a valid monitoring site!' % site)
    except Exception:
        raise ValidationError('%s is not a valid monitoring site!' % site)
def get(self, site):
    print(site)
    paper = newspaper.build(site)
    articles = {}
    i = 0
    for article in paper.articles:
        articles[i] = {}
        articles[i]['url'] = article.url
        i = i + 1
    return {'size': i, 'articles': articles}
def get(self, site):
    paper = newspaper.build(site)
    feed_urls = {}
    i = 0
    for feed in paper.feed_urls():
        print(feed)
        print(type(feed))
        feed_urls[i] = feed
        i = i + 1
    return {'size': i, 'feed_urls': feed_urls}
def get_paper_from_url(url):
    """Return a build for the given url
    - Input type: url string
    - Return type: paper
    """
    paper = newspaper.build(url)
    print('Successfully built %s.' % paper.brand)
    return paper
def news_papers(cls, url=""): news_paper = newspaper.build(url) for kj in news_paper.articles: kj.download() kj.parse() kj.nlp() return kj.text, kj.keywords
def build(self):
    s = "building {} : {}".format(self.name, self.url)
    print(s)
    logger.info(s)
    self.paper = newspaper.build(self.url, language='en',
                                 memoize_articles=self.memoize)
    self.size = len(self.paper.articles)
    s = "total articles: {} for {}".format(self.size, self.name)
    print(s)
    logger.info(s)
def build_news_source():
    print("building news sources")
    paper_urls = [
        'http://edition.cnn.com',
        'http://www.washingtonpost.com',
        'http://www.bbc.com',
        'http://www.nytimes.com/',
    ]
    papers = [newspaper.build(paper_url) for paper_url in paper_urls]
    return papers
def pull(self, website=None):
    '''
    Builds a cached newspaper from the given website.
    By default, it looks at the website passed to the constructor.
    '''
    if website is not None:
        self.website = website
    if self.website is None:
        raise ValueError('NewsScraper does not have a website.')
    self.paper = newspaper.build(self.website,
                                 memoize_articles=self.memoize_articles,
                                 number_threads=20,
                                 fetch_images=False,
                                 verbose=False)
def gen_article_text(self):
    paper = newspaper.build(self.source)
    texts = []
    with open("article_text.txt", "a") as article_texts:
        for article in paper.articles:
            article.download()
            article.parse()
            texts.append(article.text)
            article_texts.write("{}\n".format(article.text))
    return texts
def get_news():
    cnn = newspaper.build('http://cnn.com')
    # Take the first three discovered articles
    articles = cnn.articles[:3]
    for article in articles:
        article.download()
        article.parse()
    return say_news(articles)