Example #1
def get_articles():
	# get Chinese articles from domain
	for url in open("list_ch.txt", 'r'):
		try: 
			paper = newspaper.build(url, memoize_articles = True, language = 'zh')
			match_object = re.search('http\:\/\/([^\/]+)\/', url)
			domain = match_object.group(1)

			for article in paper.articles:
				get_meta(article, domain)

		except:
			pass


	# get English articles from domain
	for url in open("list_en.txt", 'r'):
		try:
			paper = newspaper.build(url, memoize_articles = True, language = 'en')
			match_object = re.search('http\:\/\/([^\/]+)\/', url)
			domain = match_object.group(1)

			for article in paper.articles:
				get_meta(article, domain)

		except:
			pass


	# get articles from RSS
	for url in open("list_rss_ch.txt", 'r'):
		try:
			feed = feedparser.parse(url)
			match_object = re.search('http\:\/\/([^\/]+)\/', url)
			domain = match_object.group(1)
			chinese = True

			for post in feed.entries:
				link = post.link
				get_meta_rss(link, domain, chinese)

		except:
			pass

	for url in open("list_rss_en.txt", 'r'):
		try:
			feed = feedparser.parse(url)
			match_object = re.search('http\:\/\/([^\/]+)\/', url)
			domain = match_object.group(1)
			chinese = False

			for post in feed.entries:
				link = post.link
				get_meta_rss(link, domain, chinese)

		except:
			pass

	print "success!"
	return
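
Example #1 relies on get_meta and get_meta_rss helpers that are not shown. Below is a minimal hypothetical sketch of what get_meta might look like against the newspaper Article API; the returned field set is an assumption, not the original implementation:

def get_meta(article, domain):
    # Hypothetical helper: download and parse one newspaper Article and
    # collect basic metadata; the chosen fields are assumptions.
    try:
        article.download()
        article.parse()
    except Exception:
        return None
    return {
        'domain': domain,
        'url': article.url,
        'title': article.title,
        'publish_date': article.publish_date,
        'text': article.text,
    }
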
Example #2
def CheckForMoreArticles():
    print 'Checking for more articles from CNN'
    cnn = newspaper.build(u'http://us.cnn.com/')
    print 'Found ' + str(cnn.size()) + ' new articles from CNN'
    print 'Checking for more articles from SMH'
    smh = newspaper.build(u'http://smh.com.au/')
    print 'Found ' + str(smh.size()) + ' new articles from SMH'
    print 'Checking for more articles from Slashdot'
    slashdot = newspaper.build(u'http://slashdot.org/')
    print 'Found ' + str(slashdot.size()) + ' new articles from Slashdot'
    print 'Checking for more articles from BBC'
    bbc = newspaper.build(u'http://www.bbc.com/')
    print 'Found ' + str(bbc.size()) + ' new articles from BBC'
    return cnn.articles + smh.articles + slashdot.articles + bbc.articles
def populate_sites(sites):
    """ (list of str) -> list of [str, newspaper.source.Source]
    Parses through the sites using newspaper library and
    returns list of sites with available articles populated

    Keyword arguments:
    sites         -- List of [name, url] of each site
    """
    new_sites = []
    for s in range(len(sites)):
        # Check for any new command on communication stream
        check_command()
        # Duplicate the name of the sites
        new_sites.append([sites[s][0]])
        # Use the url and populate the site with articles
        new_sites[s].append(
            (
                newspaper.build(
                    sites[s][1],
                    memoize_articles=False,
                    keep_article_html=True,
                    fetch_images=False,
                    language="en",
                    number_threads=1,
                )
            )
        )
        # Append site url
        new_sites[s].append(sites[s][1])
    return new_sites
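
A minimal usage sketch of populate_sites; the site names and URLs are placeholders, and it assumes check_command is available in the surrounding module as used above:

sites = [['BBC', 'http://www.bbc.com'], ['CNN', 'http://cnn.com']]
for name, source, url in populate_sites(sites):
    # source is a newspaper.source.Source whose article list has been populated
    print(name, url, source.size())
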
def get_news_data():
    # Get list of settings
    urllist: SettingsList = get_safe_settingslist('CryptoNewsUrls', urls)
    keylist: SettingsList = get_safe_settingslist('CrytoNewsKeywords', keywords)

    logger_name = 'main_scraper.' + "bitcoin_news"
    logger = logging.getLogger(logger_name)

    for url in urllist.list:
        paper = newspaper.build(url, language='en')
        for article in paper.articles:
            try:
                article.download()
                article.parse()

                keys = [key for key in keylist.list if key in article.title.lower()]
                if len(keys) > 0:
                    # check if article already exists
                    obj = CryptoNews.objects(title=article.title).first()
                    if obj is None:
                        news = CryptoNews()
                        news.title = article.title
                        news.description = article.meta_description
                        news.text = article.text
                        news.tags = keys
                        news.url = article.url
                        news.save()
                        logger.info(article.title)

            except BaseException as e:
                logger.error('Cryptonews error{0}'.format(e))
                pass
    def fetch_article_url(self, memoize=False):
        paper = newspaper.build(self.url, memoize_articles=memoize) or []
        self.narticles = paper.size()
        print 'article count:%s' % self.narticles
        pipe = redis.pipeline()
        date_fmt = r'\d{4}[-/]\d{2}[-/]\d{2}'
        for article in paper.articles:
            url = article.url
            print url
            date_keys = re.findall(date_fmt, url)
            if not date_keys:
                continue

            date_key = date_keys[0]
            key = self.key(date_key)

            pipe.sadd(key, url)

            if self.save and date_key in self.get_valid_days():
                print 'processing....'
                try:
                    article.download()
                    article.parse()
                    key = self.key(date_key, article.title)
                    pipe.set(key, article.text)
                except:
                    pass
               
        pipe.execute()
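
The date_fmt pattern above pulls a YYYY-MM-DD or YYYY/MM/DD segment out of each article URL to build the Redis key. A quick standalone check of that pattern (the URL is a placeholder):

import re

date_fmt = r'\d{4}[-/]\d{2}[-/]\d{2}'
url = 'http://example.com/news/2023/05/17/some-story'
print(re.findall(date_fmt, url))  # ['2023/05/17']
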
Example #6
File: rssatom.py Project: 1flow/1flow
def discover_feeds_urls(feed_url):
    """ Try to discover more feed URLs in one. """

    LOGGER.info(u'Trying to discover new RSS/Atom feeds from %s…', feed_url)

    urls_to_try = set()  # stays defined even if newspaper.build() fails below

    try:
        site = newspaper.build(feed_url)

        urls_to_try = set(site.feed_urls())

    except:
        LOGGER.exception(u'Newspaper did not help finding feeds '
                         u'from “%s”', feed_url)

    created = []
    known = []

    for url in urls_to_try:
        result = create_feeds_from_url(url, recurse=False)

        if result:
            # keep feeds if they have been created
            created.extend(x[0] for x in result if x[1])
            known.extend(x[0] for x in result if not x[1])

    LOGGER.info(u'Done discovering %s: %s feeds created, %s already known.',
                feed_url, len(created), len(known))
Example #7
def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs= True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)

    print "Created"
    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()
    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                'domain': 'CNN',
                'date': utc.localize(art.publish_date), 
                'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
 def build_newspaper(self):
     '''
     This method builds newspaper using their url and newspaper library
     '''
     for site_url in self.site_urls:
         self.built_newspapers.append(newspaper.build(site_url,
                                                    memoize_articles=False))
Example #9
def readArticleCollectionFile(site, filename, c):
	f = open(filename, 'w')

	paper = newspaper.build(site, memoize_articles=False)

	print len(paper.articles)

	i = 0
	for article in paper.articles:
		article.download()
		article.parse()

		title = article.title.encode('ascii', 'ignore')
		text = article.text.encode('ascii', 'ignore')

		#article.nlp()
		#keywords = article.keywords
		#summary = article.summary.encode('ascii', 'ignore')

		f.write('<article>\n')
		f.write("<class>" + str(c) + "</class>\n")
		f.write('<title>' + title + '</title>\n')
		f.write('<text>\n' + text + '</text>\n')
		#f.write('<keywords>' + str(keywords) + '</keywords>\n')
		#f.write('<summary>' + summary + '</summary>\n')
		f.write("</article>\n")
		i = i + 1
		if i > 40:
			break
	f.close()
Example #10
 def _get_articles(url):
     url = url.strip()
     for file in os.listdir(newspaper.settings.ANCHOR_DIRECTORY):  # clearing newspaper categories cache
         os.unlink(os.path.join(newspaper.settings.ANCHOR_DIRECTORY, file))
     articles = newspaper.build(url).articles
     if url.split('.')[1] == 'jetbrains':  # at least for now. Newspaper is a bit buggy on JetBrains site
         articles = []
         for page in range(10):
             soup = BeautifulSoup(requests.get(url + '/page/' + str(page)).content, 'html.parser')
             for title in soup.find_all('h2', {'class': 'entry-title'}):
                 articles.append(NewspaperArticle(title.find('a').get('href')))
     for article in articles:
         article.download()
         if not article.is_downloaded:
             print("Failed to download article:", article.url)
             continue
         article.parse()
         article.nlp()
         publish_date = article.publish_date
         if publish_date is None and url.split('.')[1] == 'jetbrains':
             soup = BeautifulSoup(requests.get(article.url).content, 'html.parser')
             publish_date = soup.find('span', {'class': 'entry-date'}).getText()
             # actually, newspaper is very buggy on JetBrains blog and often cannot parse publish date
         print(publish_date)
         yield DataMiningArticle(article.html, article.title, article.summary, article.text,
                                 "", article.canonical_link, "", publish_date)
Example #11
def Calculate():
	try:
		news = request.form['inputNews'].lower()
		topic = request.form['inputTopic']
		category = request.form['inputCategory']

		print news + "\t" + topic + "\t" + category
		
		from havenondemand.hodindex import HODClient
		client = HODClient(apikey='6b1f8438-56c7-45e0-98a6-6742c1be0d65', apiversiondefault=1)

		"""def get_bias(url):
			print "Hello"
			data = {'url': url}
			r = client.post('analyzesentiment', data)
			sentiment = r.json()['aggregate']['sentiment']
			score = r.json()['aggregate']['score']
			print url + " | " + sentiment + " | " + str(score)
			return score"""

		paper = newspaper.build("http://" + news + ".com", language='en', memoize_articles=False)

		url = []

		for article in paper.articles:
			url.append(article.url)

		cumulative_score = 0.0
		countNegative = 0
		countPositive = 0
		countNeutral = 0

		"""import multiprocessing as mp

		p = mp.Pool(3)
		res = p.map(get_bias, url)"""

		print newspaper.category

		for u in url:
			data = {'url': u}
			r = client.post('analyzesentiment', data)
			sentiment = r.json()['aggregate']['sentiment']
			score = r.json()['aggregate']['score']
			print u + " | " + sentiment + " | " + str(score)
			cumulative_score += score
			if sentiment == 'positive':
				countPositive += 1
			elif sentiment == 'negative':
				countNegative += 1
			elif sentiment == 'neutral':
				countNeutral += 1				

		print cumulative_score
		print cumulative_score/len(url)

	except Exception as e:
		return json.dumps({'error':str(e)})

	return news + topic + category
Example #12
File: models.py Project: wangx173/Voyage
def validate_site(site):
    try:
        s = newspaper.build(site, memoize_articles=False, keep_article_html=True, fetch_images=False, language="en")
        if s.size() == 0:
            raise ValidationError("%s is not a valid Referring Site!" % site)
    except:
        raise ValidationError("%s is not a valid Referring Site!" % site)
Example #13
 def get_article_urls(self, rclient, source_url):
     paper = newspaper.build(
         source_url, memoize_articles=False, fetch_images=False,
         request_timeout=self.timeout, number_threads=self.threads,
         language=self.language, browser_user_agent=self.user_agent)
     urls = ((a.url, a.title) for a in paper.articles[:self.max_articles])
     return ifilterfalse(lambda x: rclient.exists(x[0]), urls)
Example #14
 def parse_seccion(self, response):
     el_territorio = newspaper.build(
         'https://www.elterritorio.com.ar/misiones-1-seccion')
     noticias = el_territorio.articles
     for noticia in noticias:
         request = scrapy.Request(url=noticia.url,
                                  callback=self.parse_noticia)
         yield request
def main():
    source_url, rabbit_url = parse_config()
    paper = newspaper.build(source_url)
    publisher = Publisher(
        rabbit_url=rabbit_url,
        publish_interval=0.25,
        article_urls=paper.article_urls())
    publisher.run()
def get_newspapers(source_urls):
    papers = []
    for url in source_urls:
        papers.append(newspaper.build(url))
    
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()
    return papers
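
A short usage sketch of get_newspapers; the source URLs are placeholders. Because news_pool has already downloaded each article's HTML, parse() can be called without a separate download():

papers = get_newspapers(['http://cnn.com', 'http://slate.com'])
for paper in papers:
    for article in paper.articles[:5]:
        try:
            article.parse()  # HTML was already fetched by news_pool
            print(article.title)
        except Exception as err:
            print('parse failed:', err)
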
Example #17
def newspaperproxyjs(domain):
    logger.info(' - '.join(['NEWSPAPER PROXY JS', domain]))
    return newspaper.build('http://api.scraperapi.com?key=' +
                           scraperapi_api_key + '&render=true&url=http://' +
                           domain,
                           memoize_articles=False,
                           request_timeout=70,
                           number_threads=2)
Example #18
def newspaper_url(url):
    web_paper = newspaper.build(url, language="vi", memoize_articles=False)
    print("Extracting news pages url!!!")
    for article in web_paper.articles:
        print("News page url:", article.url)
        newspaper_information(article.url)

    print("Total acquisition%s Article" % web_paper.size())  # Number of articles
Example #19
 def __generateArticles(self):
     source = build(self.__TargerUrl, language='id', memoize_articles=False)
     containers = []
     for article in source.articles:
         _href = article.url.split('?')[0]
         if not _href in containers and self.__ValidUrlRegex.match(_href):
             containers.append(_href)
     self.__Articles = containers
Example #20
File: top10.py Project: notnews/top10
def test_newspaper(url):
    paper = newspaper.build(url)

    for article in paper.articles:
        print(article.url)

    for category in paper.category_urls():
        print(category)
def extract_article_urls(site_urls):
    article_urls = []
    for site_url in site_urls:
        paper = newspaper.build(site_url)
        urls = [a.url for a in paper.articles if '-' in a.url]
        print('Found {} articles on {}'.format(len(urls), site_url))
        article_urls += urls
    return article_urls
Example #22
 def processGeneric(self, feedid, url):
     p = newspaper.build(url)
     for a in p.articles:
         a.download()
         log.info('retrieving %s' % a.title)
         self.articles.append(
             ArticleData(feedid, a.title, a.url, "",
                         str(datetime.datetime.utcnow())))
Example #23
def get_article_links(url):
    """
    Takes a URL with articles and returns a list of URLs of all the contained articles.
    :param url: A URL
    :rtype: list
    """
    links = newspaper.build(url, memoize_articles=False, language='zh')
    return [article.url for article in links.articles]
Example #24
	def getArticlesFromSource(self, source, external=False):
		paper = newspaper.build(source, memoize_articles=True, browser_user_agent='BEEVA/Emojinews crawler 1.0')
		#Filtering articles out of domain
		news = filter((lambda x: x.url.startswith(source) or external), paper.articles)
		news = map(self.cleanNames, news)
		news = filter((lambda x: x.title), news)
		news = map(lambda x: {'url':x.url, 'title':x.title}, news)
		return news
Example #25
 def __init__(self, src):
     self.url = src['url']
     self.domains = src['domains']
     self.left_cut_regex = src['left_cut_regex']
     self.right_cut_regex = src['right_cut_regex']
     self.paper = newspaper.build(self.url,
                                  language=config.LANGUAGE,
                                  memoize_articles=config.MEMOIZE_ARTICLES)
Example #26
    def update_db(self):

        self.paper = newspaper.build(self.news_source_url)

        for article in self.paper.articles:

            self.article = article
            self.get_article()
Example #27
def news(source):
    paper = newspaper.build(source, memoize_articles=False)
    first_article = paper.articles[0]
    first_article.download()
    first_article.parse()
    json_str = staticLine(first_article.title)
    # msgs.append(json_str)
    return json_str
Example #28
 def build_source(feed: str):
     source = newspaper.build(feed,
                              memoize_articles=not self.include_old,
                              keep_article_html=True)
     if len(source.articles) == 0 and self.include_old:
         tqdm.write("Warning: The source `" + feed +
                    "` appears to have no articles.")
     articles.extend(source.articles)
Example #29
File: scrape.py Project: petalrame/TLDR
def get_source_list():
    """Build newspaper objects for scraping. Returns a list."""
    # Build Papers Objects to be downloaded and parsed for data extraction.
    tech_crunch = newspaper.build('https://www.techcrunch.com/',
                                  memoize_articles=True,
                                  language='en')
    fox = newspaper.build('https://www.foxnews.com/',
                          memoize_articles=False,
                          language='en')
    nytimes = newspaper.build('http://nytimes.com',
                              memoize_articles=False,
                              language='en')
    wsj = newspaper.build('http://wsj.com',
                          memoize_articles=True,
                          language='en')
    bbc = newspaper.build('http://bbcnews.com',
                          memoize_articles=True,
                          language='en')
    cnn = newspaper.build('http://cnn.com',
                          memoize_articles=True,
                          language='en')
    ap = newspaper.build('https://www.ap.org/en-us/',
                         memoize_articles=True,
                         language='en')
    papers = [fox, nytimes, wsj, bbc, cnn, ap]
    return papers
Example #30
def main(fcsv):
    websites = ['https://www.horoscope.com/inspiration']
    for site in websites:
        print(site)
        # web = newspaper.build(site, config)
        # web = newspaper.build(site, memoize_articles=False, MIN_WORD_COUNT=1000)
        web = newspaper.build(site, memoize_articles=False)
        # web = newspaper.build(site)
        scrape(web, fcsv)
Example #31
def scrapeNews():    
    data = {}
    data['newspapers'] = {}
    
    # Loads the JSON files with news sites
    with open('NewsPapers.json') as companyList:
        companies = json.load(companyList)
        
    count = 1
    
    # Iterate through each news company
    for company, value in companies.items():            
        # It uses the python newspaper library to extract articles
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        #noneTypeCount = 0
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                # a 404 or similar download error should not stop the whole program
                print(e)
                continue
            
            if content.publish_date is None:
                print(count, " Article has date of type None...")
                continue
                #Articles without a publish date are usually not real news
                #articles (ads, video pages, etc.), so they are skipped.
            article = {}
            article['title'] = content.title
            article['text'] = content.text
            article['link'] = content.url
            article['published'] = content.publish_date.isoformat()
            newsPaper['articles'].append(article)
            del article
            ##print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
            count += 1
        count = 1
        data['newspapers'][company] = newsPaper
    
    # Finally it saves the articles as a JSON-file.
    try:
        with open('scraped_articles.json', 'w') as JSONfile:
            json.dump(data, JSONfile)
            JSONfile.close()
            del data
    except Exception as e: 
        print(e)
def get_article_text(link):

    text = []
    newspapr = newspaper.build(link)
    for article in newspapr.articles:
      # print(article.url)
      article.download()
      article.parse()  # article.text stays empty until the article is parsed
      text.append(article.text)

    return text
Example #33
def extract_articles(news_source='http://cnn.com',num=10):
    news_source = newspaper.build(news_source, memoize_articles=False)
    output=[]
    for art in news_source.articles[:min(num,len(news_source.articles))]:
        art.download()
        art.parse()
        raw=art.text
        output=output+nltk.word_tokenize(raw)
    return output
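
A brief usage sketch of extract_articles, assuming nltk and its punkt tokenizer data are installed; the frequency count is just one possible use of the returned token list:

import nltk

tokens = extract_articles('http://cnn.com', num=5)
freq = nltk.FreqDist(t.lower() for t in tokens if t.isalpha())
print(freq.most_common(20))
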
Example #34
def scrapeFarRight():
    farRight = newspaper.build('http://www.breitbart.com/big-government/',memoize_articles=False)
    i = 0
    for article in farRight.articles:
        article.download()
        article.parse()
        with open("farRight/article" + str(i) + ".txt", 'a') as out:
            out.write(article.text + '\n')
        i = i+1
Example #35
def scrapeFarLeft():
    farLeft = newspaper.build('http://dailykos.com/blogs/main')
    i = 0
    for article in farLeft.articles:
        article.download()
        article.parse()
        with open("centerLeft/article" + i + ".txt", 'a') as out:
            out.write(article.text + '\n')
        i = i+1
Example #36
	def get_articles(self, news=None):
		assert news     is not None, "news is not defined."
		assert news.url is not None, "news.url is not defined." 

		url      = copy.copy(news.url)
		url      = NetworkTools.full_url(url)
		paper    = newspaper.build(url, memoize_articles=False)
		articles = copy.deepcopy(paper.articles)
		return articles
Example #37
def try_newspaper():
    vne = newspaper.build(TARGET)
    a = Article(ARTICLE, language='vi')
    for article in vne.articles:
        print(article.url)

    a.download()
    a.parse()
    print("FIN")
Example #38
def RefreshArticles(domain, directory, personality, log=Print, timeout=None):
    start_time = time.time()
    arts = np.build(domain, language='en', memoize_articles=False).articles
    log(domain + " has %d articles" % len(arts))
    for art in arts:
        if timeout is not None and time.time() - start_time > timeout:
            log("Timeout after %f seconds" % (time.time() - start_time))
            return
        DownloadAndProcess(art.url, directory, personality, log=log)
Example #39
	def __init__(self,url):
		try:
			nltk.download('punkt')
		except Exception as ex:
			print(ex)
		self.url = url
		self.isstarted=False
		self.ispaused = False
		self.newspaper = newspaper.build(url)
Example #40
def collect_article_urls():
    url = request.args.get('url')
    paper = newspaper.build(url, memoize_articles=False)
    article_urls = set()
    for article in paper.articles:
        article_urls.add(article.url)
    return json.dumps(list(article_urls)), 200, {
        'Content-Type': 'application/json'
    }
Example #41
def scrapeCenterLeft():
    centerL = newspaper.build('http://bbc.com/news/world/us_and_canada')
    i = 0
    for article in centerL.articles:
        article.download()
        article.parse()
        with open("centerLeft/article" + i + ".txt", 'a') as out:
            out.write(article.text + '\n')
        i = i+1
Example #42
def crawl_web(url):
    paper = newspaper.build(url, memoize_articles=False, language='en')

    for content in paper.articles:
        if check_exist_url(content.url):
            article, text = crawl_web_page(content)
            if article and text and check_exist(article['id']):
                load_to_disk(article['id'], text)
                load_to_db(article)
Example #43
    def test_download_works(self):
        config = Configuration()
        config.memoize_articles = False
        slate_paper = newspaper.build('http://slate.com', config=config)
        tc_paper = newspaper.build('http://techcrunch.com', config=config)
        espn_paper = newspaper.build('http://espn.com', config=config)

        print('Slate has %d articles TC has %d articles ESPN has %d articles' %
              (slate_paper.size(), tc_paper.size(), espn_paper.size()))

        papers = [slate_paper, tc_paper, espn_paper]
        news_pool.set(papers, threads_per_source=2)

        news_pool.join()

        print 'Downloaded Slate mthread len', len(slate_paper.articles[0].html)
        print 'Downloaded ESPN mthread len', len(espn_paper.articles[-1].html)
        print 'Downloaded TC mthread len', len(tc_paper.articles[1].html)
Example #44
def scrapeModerate():
    moderate = newspaper.build('http://cbsnews.com/us/',memoize_articles=False)
    i = 0
    for article in moderate.articles:
        article.download()
        article.parse()
        with open("moderate/article" + i + ".txt", 'a') as out:
            out.write(article.text + '\n')
        i = i+1
Example #45
def scrapeCenterRight():
    centerR = newspaper.build('http://foxnews.com/us.html',memoize_articles=False)
    i = 0
    for article in centerR.articles:
        article.download()
        article.parse()
        with open("centerRight/article" + i + ".txt", 'a') as out:
            out.write(article.text + '\n')
        i = i+1
Example #47
  def searchSite(self, siteURL):
    "searchSite downloads all the known articles for the given site"

    paper = newspaper.build(siteURL,language=self.language)
    
    self.articleURLs[siteURL] = []

    for article in paper.articles:
      self.articleURLs[siteURL].append(article.url)
Example #48
    def __init__(self, site, memoize_articles=False):

        self.site = site
        self.paper = newspaper.build(site,
                                     memoize_articles=memoize_articles,
                                     browser_user_agent=AGENT)

        self.purge_categories()
        self.purge_articles(self.paper.categories_to_articles())
Example #49
    def test_download_works(self):
        config = Configuration()
        config.memoize_articles = False
        slate_paper = newspaper.build('http://slate.com', config=config)
        tc_paper = newspaper.build('http://techcrunch.com', config=config)
        espn_paper = newspaper.build('http://espn.com', config=config)

        print ('slate has %d articles tc has %d articles espn has %d articles'
               % (slate_paper.size(), tc_paper.size(), espn_paper.size()))

        papers = [slate_paper, tc_paper, espn_paper]
        news_pool.set(papers, threads_per_source=2)

        news_pool.join()

        print 'Downloaded slate mthread len', len(slate_paper.articles[0].html)
        print 'Downloaded espn mthread len', len(espn_paper.articles[-1].html)
        print 'Downloaded tc mthread len', len(tc_paper.articles[1].html)
def _handle_fallback(company, value, count, limit):
    """This is the fallback method if a RSS-feed link is not provided.

    It uses the python newspaper library to extract articles.

    """

    print(f"Building site for {company}")
    paper = newspaper.build(value["link"], memoize_articles=False)
    news_paper = {"link": value["link"], "articles": []}
    none_type_count = 0
    for content in paper.articles:
        if count > limit:
            break
        try:
            content.download()
            content.parse()
        except Exception as err:
            print(err)
            print("continuing...")
            continue
        # Again, for consistency, if there is no found publish date the
        # article will be skipped.
        #
        # After 10 downloaded articles from the same newspaper without
        # publish date, the company will be skipped.
        if content.publish_date is None:
            print(f"{count} Article has date of type None...")
            none_type_count = none_type_count + 1
            if none_type_count > 10:
                print("Too many noneType dates, aborting...")
                none_type_count = 0
                break
            count = count + 1
            continue
        article = {
            "title": content.title,
            "text": content.text,
            "link": content.url,
            "published": content.publish_date.isoformat(),
        }
        news_paper["articles"].append(article)

        message = client.messages.create(
            body=content.title+content.url,
            from_=from_whatsapp_number,
            to=to_whatsapp_number
        )
        print(message.sid)

        print(
            f"{count} articles downloaded from {company} using newspaper, url: {content.url}"
        )
        count = count + 1
        none_type_count = 0
    return count, news_paper
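
A hedged sketch of how _handle_fallback might be driven, mirroring the JSON layout of Example #31; the file name and limit value are assumptions, and it presumes the module already defines the Twilio client and WhatsApp numbers used above:

import json

with open('NewsPapers.json') as company_list:
    companies = json.load(company_list)

data = {'newspapers': {}}
count = 1
for company, value in companies.items():
    count, news_paper = _handle_fallback(company, value, count, limit=100)
    data['newspapers'][company] = news_paper
    count = 1  # reset the per-company counter, as in Example #31
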
Example #51
def validate_site(site):
    try:
        s = newspaper.build(site, memoize_articles=False,
                            keep_article_html=True,
                            fetch_images=False,
                            language='en')
        if s.size() == 0:
            raise ValidationError('%s is not a valid monitoring site!' % site)
    except:  
        raise ValidationError('%s is not a valid monitoring site!' % site)
Example #52
	def get(self,site):
		print site
		paper=newspaper.build(site)
		articles={}
		i=0
		for article in paper.articles:
			articles[i]={}
			articles[i]['url']=article.url
			i=i+1
		return {'size':i,'articles':articles}
Example #53
	def get(self,site):
		paper=newspaper.build(site)
		feed_urls={}
		i=0
		for feed in paper.feed_urls():
			print feed
			print type(feed)
			feed_urls[i]=feed
			i=i+1
		return {'size':i,'feed_urls':feed_urls}
def get_paper_from_url(url):
    """Return a build for the given url
    - Input type: url string
    - Return type: paper
    """

    paper = newspaper.build(url)
    print 'Successfully built %s.' % paper.brand

    return paper
Example #55
        def news_papers(cls, url=""):

                news_paper = newspaper.build(url)
                for kj in news_paper.articles:

                        kj.download()
                        kj.parse()
                        kj.nlp()

                        return kj.text, kj.keywords
 def build(self):
   s = "building {} : {}".format(self.name, self.url)
   print(s)
   logger.info(s)
   self.paper = newspaper.build(self.url, 
                           language='en', 
                           memoize_articles=self.memoize)
   self.size = len(self.paper.articles)
   s = "total articles: {} for {}".format(self.size, self.name)
   print(s)
   logger.info(s)
def build_news_source():
    print ("building news sources")
    paper_urls = []
    paper_urls.append('http://edition.cnn.com')
    paper_urls.append('http://www.washingtonpost.com')
    paper_urls.append('http://www.bbc.com')
    paper_urls.append('http://www.nytimes.com/')

    papers = [newspaper.build(paper_url) for paper_url in paper_urls]
    
    return papers
Example #58
	def pull(self, website = None):
		'''
		Builds a cached newspaper from the given website. 
		By default, it looks at the website passed to the constructor
		'''
		if not (website is None):
			self.website = website
		if self.website is None:
			raise ValueError('NewsScraper does not have a website.')

		self.paper = newspaper.build(self.website, memoize_articles= self.memoize_articles, number_threads = 20, fetch_images = False, verbose = False)
Example #59
	def gen_article_text(self): #Not currently functioning. 
		paper = newspaper.build(self.source)
		articles = paper.articles
		texts = []
		with open("article_text.txt","a") as articletexts:
			for i in articles:
				i.download()
				i.parse()
				texts.append(i.text)
				articletexts.write("{}\n".format(i.text))
		return texts
Example #60
def get_news():
	articles = []
	cnn = newspaper.build('http://cnn.com')
	i = 0
	while i <= 2:
		articles.append(cnn.articles[i])
		i += 1	
	for article in articles:
		article.download()
		article.parse()
	return say_news(articles)