示例#1
0
    def test_source_build(self):
        """
        Build a ``Source`` for http://cnn.com and check its brand and
        description against fixed expected values.

        The article count and the category URLs are only printed, not
        asserted, so they can be inspected by eye.
        """
        DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
        BRAND = "cnn"

        config = Configuration()
        config.verbose = False
        s = Source("http://cnn.com", config=config)
        s.clean_memo_cache()
        s.build()

        # NOTE(review): no mock is installed in this test, so these values
        # presumably come from the live site -- confirm before relying on them.
        assert s.brand == BRAND
        assert s.description == DESC

        # For this test case and a few more, the remaining values are ever
        # changing, so asserting exact equality is not practical.
        #
        # Instead, print them out so it is just as easy to take a glance and
        # see if they look OK.
        #
        # Each print call takes exactly one parenthesized argument, which is
        # valid syntax on both Python 2 and Python 3 and prints the same text.
        print("\t\tWe have %d articles currently!" % s.size())
        print("")  # blank separator line
        print("\t\t%s categories are: %s" % (s.url, str(s.category_urls())))
示例#2
0
    def test_source_build(self):
        """
        Build a ``Source`` for http://cnn.com, assert its brand and
        description, then print the article count and category URLs.

        Only the brand and description are asserted; the printed values are
        meant for manual inspection.
        """
        DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
        BRAND = 'cnn'

        config = Configuration()
        config.verbose = False
        s = Source('http://cnn.com', config=config)
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC

        # The values below change constantly, so they cannot sensibly be
        # asserted equal to fixed expectations. They are printed instead so a
        # quick glance shows whether they look OK.
        #
        # Single-argument parenthesized print works identically as a Python 2
        # print statement and as a Python 3 function call.
        article_count = s.size()
        print('\t\tWe have %d articles currently!' % article_count)
        print('')  # empty line between the two reports
        print('\t\t%s categories are: %s' % (s.url, str(s.category_urls())))
 def extract_from_source(self, source):
     """Build an ``NpSource`` for ``source`` and store its accepted articles.

     Every URL returned by ``article_urls()`` is first checked with
     ``is_available_url``; the article extracted from an accepted URL is
     then checked with ``is_available_article`` before being stored.
     """
     paper = NpSource(source, verbose=True)
     paper.clean_memo_cache()
     paper.build()
     logging.info('...build done!')

     for link in paper.article_urls():
         # Skip URLs this extractor rejects.
         if not self.is_available_url(link):
             continue
         extracted = self._extract_articles(link)
         # Skip extraction results this extractor rejects.
         if not self.is_available_article(extracted):
             continue
         self._store_article(extracted)
示例#4
0
class News:
    """Thin wrapper around a built ``Source`` that exposes its articles."""

    # Class-level default; __init__ rebinds ``articles`` on each instance.
    articles = []

    def __init__(self, url):
        """Create a ``Source`` for ``url``, clear its memo cache and build it."""
        paper = Source(url)
        self.newspaper = paper
        paper.clean_memo_cache()
        paper.build()
        self.articles = paper.articles

    def get_news(self, num_of_articles):
        """Return the first ``num_of_articles`` items of the source's articles."""
        available = self.newspaper.articles
        return available[:num_of_articles]
示例#5
0
    def test_source_build(self):
        """
        Build a ``Source`` for http://cnn.com with a mocked response and
        assert its brand, description, article count, category URLs and
        feed URLs against fixed expected values.
        """
        DESC = ('CNN.com International delivers breaking news from across '
                'the globe and information on the latest top stories, '
                'business, sports and entertainment headlines. Follow the '
                'news as it happens through: special reports, videos, '
                'audio, photo galleries plus interactive maps and timelines.')
        CATEGORY_URLS = [
            u'http://cnn.com/ASIA', u'http://connecttheworld.blogs.cnn.com',
            u'http://cnn.com/HLN', u'http://cnn.com/MIDDLEEAST',
            u'http://cnn.com', u'http://ireport.cnn.com',
            u'http://cnn.com/video', u'http://transcripts.cnn.com',
            u'http://cnn.com/espanol', u'http://partners.cnn.com',
            u'http://www.cnn.com', u'http://cnn.com/US',
            u'http://cnn.com/EUROPE', u'http://cnn.com/TRAVEL',
            u'http://cnn.com/cnni', u'http://cnn.com/SPORT',
            u'http://cnn.com/mostpopular', u'http://arabic.cnn.com',
            u'http://cnn.com/WORLD', u'http://cnn.com/LATINAMERICA',
            u'http://us.cnn.com', u'http://travel.cnn.com',
            u'http://mexico.cnn.com', u'http://cnn.com/SHOWBIZ',
            u'http://edition.cnn.com', u'http://amanpour.blogs.cnn.com',
            u'http://money.cnn.com', u'http://cnn.com/tools/index.html',
            u'http://cnnespanol.cnn.com', u'http://cnn.com/CNNI',
            u'http://business.blogs.cnn.com', u'http://cnn.com/AFRICA',
            u'http://cnn.com/TECH', u'http://cnn.com/BUSINESS'
        ]
        FEEDS = [u'http://rss.cnn.com/rss/edition.rss']
        BRAND = 'cnn'

        s = Source('http://cnn.com', verbose=False, memoize_articles=False)
        # Raw string: the backslash in ``\.`` reaches the regex engine as an
        # escaped dot without relying on Python passing an unknown string
        # escape through unchanged (deprecated in newer Python versions).
        url_re = re.compile(r".*cnn\.com")
        # Register the 'cnn_main_site' fixture for URLs matching url_re
        # before the source is built.
        mock_response_with(url_re, 'cnn_main_site')
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC
        assert s.size() == 241
        assert s.category_urls() == CATEGORY_URLS
        # TODO: A lot of the feed extraction is NOT being tested because feeds
        # are primarily extracted from the HTML of category URLs. We lose this
        # effect by just mocking CNN's main page HTML. Warning: tedious fix.
        assert s.feed_urls() == FEEDS
示例#6
0
    def test_source_build(self):
        """Build a ``Source`` for http://cnn.com, assert its brand and
        description, and print the current article count."""

        DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
        BRAND = 'cnn'

        configs = Configuration()
        configs.verbose = False
        # NOTE(review): the keyword used here is ``configs``; confirm it
        # matches the parameter name the Source constructor expects.
        s = Source('http://cnn.com', configs=configs)
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC

        # The article count is printed rather than asserted. A single
        # parenthesized argument keeps this line valid on both Python 2 and
        # Python 3 with identical output.
        print('\t\tWe have %d articles currently!' % s.size())
示例#7
0
    def test_source_build(self):
        """
        Build a ``Source`` for http://cnn.com while a mocked response is
        registered, then assert the brand, description, article count,
        category URLs and feed URLs it reports.
        """
        DESC = ('CNN.com International delivers breaking news from across '
                'the globe and information on the latest top stories, '
                'business, sports and entertainment headlines. Follow the '
                'news as it happens through: special reports, videos, '
                'audio, photo galleries plus interactive maps and timelines.')
        CATEGORY_URLS = [
            u'http://cnn.com/ASIA', u'http://connecttheworld.blogs.cnn.com',
            u'http://cnn.com/HLN', u'http://cnn.com/MIDDLEEAST',
            u'http://cnn.com', u'http://ireport.cnn.com',
            u'http://cnn.com/video', u'http://transcripts.cnn.com',
            u'http://cnn.com/espanol',
            u'http://partners.cnn.com', u'http://www.cnn.com',
            u'http://cnn.com/US', u'http://cnn.com/EUROPE',
            u'http://cnn.com/TRAVEL', u'http://cnn.com/cnni',
            u'http://cnn.com/SPORT', u'http://cnn.com/mostpopular',
            u'http://arabic.cnn.com', u'http://cnn.com/WORLD',
            u'http://cnn.com/LATINAMERICA', u'http://us.cnn.com',
            u'http://travel.cnn.com', u'http://mexico.cnn.com',
            u'http://cnn.com/SHOWBIZ', u'http://edition.cnn.com',
            u'http://amanpour.blogs.cnn.com', u'http://money.cnn.com',
            u'http://cnn.com/tools/index.html', u'http://cnnespanol.cnn.com',
            u'http://cnn.com/CNNI', u'http://business.blogs.cnn.com',
            u'http://cnn.com/AFRICA', u'http://cnn.com/TECH',
            u'http://cnn.com/BUSINESS']
        FEEDS = [u'http://rss.cnn.com/rss/edition.rss']
        BRAND = 'cnn'

        s = Source('http://cnn.com', verbose=False, memoize_articles=False)
        # The pattern is written as a raw string so that ``\.`` is not
        # treated as an (invalid) Python string escape; the compiled regex
        # is unchanged.
        url_re = re.compile(r".*cnn\.com")
        mock_response_with(url_re, 'cnn_main_site')
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC
        assert s.size() == 241
        assert s.category_urls() == CATEGORY_URLS
        # TODO: A lot of the feed extraction is NOT being tested because feeds
        # are primarily extracted from the HTML of category URLs. We lose this
        # effect by just mocking CNN's main page HTML. Warning: tedious fix.
        assert s.feed_urls() == FEEDS
示例#8
0
    def test_source_build(self):
        """
        Create a ``Source`` for http://cnn.com, clear its memo cache and
        build it.

        The expected-value constants are defined here, but the visible part
        of this test does not assert against them.
        """
        DESC = ('CNN.com International delivers breaking news from across '
                'the globe and information on the latest top stories, '
                'business, sports and entertainment headlines. Follow the '
                'news as it happens through: special reports, videos, '
                'audio, photo galleries plus interactive maps and timelines.')
        CATEGORY_URLS = [
            'http://cnn.com/ASIA',
            'http://connecttheworld.blogs.cnn.com',
            'http://cnn.com/HLN',
            'http://cnn.com/MIDDLEEAST',
            'http://cnn.com',
            'http://ireport.cnn.com',
            'http://cnn.com/video',
            'http://transcripts.cnn.com',
            'http://cnn.com/espanol',
            'http://partners.cnn.com',
            'http://www.cnn.com',
            'http://cnn.com/US',
            'http://cnn.com/EUROPE',
            'http://cnn.com/TRAVEL',
            'http://cnn.com/cnni',
            'http://cnn.com/SPORT',
            'http://cnn.com/mostpopular',
            'http://arabic.cnn.com',
            'http://cnn.com/WORLD',
            'http://cnn.com/LATINAMERICA',
            'http://us.cnn.com',
            'http://travel.cnn.com',
            'http://mexico.cnn.com',
            'http://cnn.com/SHOWBIZ',
            'http://edition.cnn.com',
            'http://amanpour.blogs.cnn.com',
            'http://money.cnn.com',
            'http://cnn.com/tools/index.html',
            'http://cnnespanol.cnn.com',
            'http://cnn.com/CNNI',
            'http://business.blogs.cnn.com',
            'http://cnn.com/AFRICA',
            'http://cnn.com/TECH',
            'http://cnn.com/BUSINESS',
        ]
        FEEDS = ['http://rss.cnn.com/rss/edition.rss']
        BRAND = 'cnn'

        cnn_source = Source(
            'http://cnn.com',
            verbose=False,
            memoize_articles=False,
        )
        # Mocking is currently disabled:
        # html = mock_resource_with('http://cnn.com', 'cnn_main_site')
        cnn_source.clean_memo_cache()
        cnn_source.build()
示例#9
0
    def test_source_build(self):
        """
        Create a ``Source`` for http://cnn.com, clear its memo cache and
        build it.

        NOTE(review): the expected-value constants below are not referenced
        in the visible part of this test -- confirm whether assertions
        follow.
        """
        # Expected description text.
        DESC = ('CNN.com International delivers breaking news from across '
                'the globe and information on the latest top stories, '
                'business, sports and entertainment headlines. Follow the '
                'news as it happens through: special reports, videos, '
                'audio, photo galleries plus interactive maps and timelines.')
        # Expected category URLs.
        CATEGORY_URLS = [
            'http://cnn.com/ASIA', 'http://connecttheworld.blogs.cnn.com',
            'http://cnn.com/HLN', 'http://cnn.com/MIDDLEEAST',
            'http://cnn.com', 'http://ireport.cnn.com',
            'http://cnn.com/video', 'http://transcripts.cnn.com',
            'http://cnn.com/espanol',
            'http://partners.cnn.com', 'http://www.cnn.com',
            'http://cnn.com/US', 'http://cnn.com/EUROPE',
            'http://cnn.com/TRAVEL', 'http://cnn.com/cnni',
            'http://cnn.com/SPORT', 'http://cnn.com/mostpopular',
            'http://arabic.cnn.com', 'http://cnn.com/WORLD',
            'http://cnn.com/LATINAMERICA', 'http://us.cnn.com',
            'http://travel.cnn.com', 'http://mexico.cnn.com',
            'http://cnn.com/SHOWBIZ', 'http://edition.cnn.com',
            'http://amanpour.blogs.cnn.com', 'http://money.cnn.com',
            'http://cnn.com/tools/index.html', 'http://cnnespanol.cnn.com',
            'http://cnn.com/CNNI', 'http://business.blogs.cnn.com',
            'http://cnn.com/AFRICA', 'http://cnn.com/TECH',
            'http://cnn.com/BUSINESS']
        # Expected feed URLs and brand.
        FEEDS = ['http://rss.cnn.com/rss/edition.rss']
        BRAND = 'cnn'

        s = Source('http://cnn.com', verbose=False, memoize_articles=False)
        # Response mocking is commented out here, so no mock is installed
        # by this block.
        # resp = mock_response_with('http://cnn.com', 'cnn_main_site')
        s.clean_memo_cache()
        s.build()