Example #1
    def test_config_build(self):
        """Test if our **kwargs to config building setup actually works.
        NOTE: No need to mock responses as we are just initializing the
        objects, not actually calling download(..)
        """
        a = Article(url='http://www.cnn.com/2013/11/27/'
                    'travel/weather-thanksgiving/index.html')
        assert a.config.language == 'en'
        assert a.config.memoize_articles is True
        assert a.config.use_meta_language is True

        a = Article(url='http://www.cnn.com/2013/11/27/travel/'
                    'weather-thanksgiving/index.html',
                    language='zh',
                    memoize_articles=False)
        assert a.config.language == 'zh'
        assert a.config.memoize_articles is False
        assert a.config.use_meta_language is False

        s = Source(url='http://cnn.com')
        assert s.config.language == 'en'
        assert s.config.MAX_FILE_MEMO == 20000
        assert s.config.memoize_articles is True
        assert s.config.use_meta_language is True

        s = Source(url="http://cnn.com",
                   memoize_articles=False,
                   MAX_FILE_MEMO=10000,
                   language='en')
        assert s.config.memoize_articles is False
        assert s.config.MAX_FILE_MEMO == 10000
        assert s.config.language == 'en'
        assert s.config.use_meta_language is False
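The kwargs shown above are simply forwarded onto attributes of newspaper's Configuration object. As a point of comparison, a minimal sketch of the explicit equivalent (assuming newspaper3k's top-level Config alias):

from newspaper import Article, Config

config = Config()
config.language = 'zh'
config.memoize_articles = False  # do not remember already-seen article URLs
a = Article('http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
            config=config)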
Example #2
def main():
    source = "The Guardian"
    #config = Config()
    #config.memoize_articles = False
    guardian = Source("http://www.theguardian.com/world", memoize_articles=False)
    guardian.build()
    #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False)
    #news_pool.set([guardian], threads_per_source=2)
    #news_pool.join()

    #print(guardian.size())

    for article in [x for x in guardian.articles
                    if re.match(".*/world/.*", x.url) is not None]:
        url = article.url
        a = Article(url, language='en')
        a.download()
        for i in range(10):
            if a.is_downloaded:
                break
            else:
                a.download()
        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        summary = a.summary
        keywords = a.keywords
        title = a.title
        text = a.text
        date = str(a.publish_date).split()[0].split("-")
        date[0], date[1], date[2] = date[1], date[2], date[0]
        date = "/".join(date)
        # Extract the dateline time (e.g. "13.00 GMT") and normalise it to "13:00"
        time = re.search(r'<span class="content__dateline-time">(.*)</span>',
                         html).group(1).replace(".", ":").split()[0]
        date_time = date + " " + time
        #print(title)
        #print(date_time)
        date_obj = datetime.datetime.strptime(date_time,'%m/%d/%Y %H:%M')
        #print(date_obj.strftime('%Y/%m/%d %I:%M %p'))
        #TODO: Add stuff to the DB

        try:
            article = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article('The Guardian', article, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)
Example #3
class News:
    articles = []

    def __init__(self, url):
        self.newspaper = Source(url)
        self.newspaper.clean_memo_cache()
        self.newspaper.build()
        self.articles = self.newspaper.articles

    def get_news(self, num_of_articles):
        return self.newspaper.articles[:num_of_articles]
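An illustrative use of this wrapper (the URL is just an example; get_news returns the first Article objects collected by Source.build()):

news = News('http://cnn.com')
for article in news.get_news(5):  # first five Article objects from the built source
    print(article.url)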
Example #4
    def test_config_build(self):
        """
        Test if our **kwargs to config building setup actually works.
        """
        a = Article(
            url=
            'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'
        )
        assert a.config.language == 'en'
        assert a.config.memoize_articles == True
        assert a.config.use_meta_language == True

        a = Article(
            url=
            'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
            language='zh',
            memoize_articles=False)
        assert a.config.language == 'zh'
        assert a.config.memoize_articles == False
        assert a.config.use_meta_language == False

        s = Source(url='http://cnn.com')
        assert s.config.language == 'en'
        assert s.config.MAX_FILE_MEMO == 20000
        assert s.config.memoize_articles == True
        assert s.config.use_meta_language == True

        s = Source(url="http://cnn.com",
                   memoize_articles=False,
                   MAX_FILE_MEMO=10000,
                   language='en')
        assert s.config.memoize_articles == False
        assert s.config.MAX_FILE_MEMO == 10000
        assert s.config.language == 'en'
        assert s.config.use_meta_language == False

        s = newspaper.build('http://cnn.com', dry=True)
        assert s.config.language == 'en'
        assert s.config.MAX_FILE_MEMO == 20000
        assert s.config.memoize_articles == True
        assert s.config.use_meta_language == True

        s = newspaper.build('http://cnn.com',
                            dry=True,
                            memoize_articles=False,
                            MAX_FILE_MEMO=10000,
                            language='zh')
        assert s.config.language == 'zh'
        assert s.config.MAX_FILE_MEMO == 10000
        assert s.config.memoize_articles == False
        assert s.config.use_meta_language == False
Example #5
    def test_source_build(self):
        """
        builds a source object, validates it has no errors, prints out
        all valid categories and feed urls
        """
        DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
        BRAND = 'cnn'

        config = Configuration()
        config.verbose = False
        s = Source('http://cnn.com', config=config)
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC

        # For this test case and a few more, I don't believe you can actually
        # assert two values to equal each other because some values are ever changing.

        # Instead, I'm just going to print some stuff out so it is just as easy to take
        # a glance and see if it looks OK.

        print('\t\tWe have %d articles currently!' % s.size())
        print()
        print('\t\t%s categories are: %s' % (s.url, str(s.category_urls())))
Example #6
 def test_source_custom_params(self):
     s = Source(url="http://cnn.com", memoize_articles=False,
                MAX_FILE_MEMO=10000, language='en')
     self.assertFalse(s.config.memoize_articles)
     self.assertEqual(10000, s.config.MAX_FILE_MEMO)
     self.assertEqual('en', s.config.language)
     self.assertFalse(s.config.use_meta_language)
Example #7
    def crawl_sites(self, parse=True, download=True, nlp=True):

        self.do_parse = parse
        self.do_nlp = nlp
        assert not (self.do_parse ^ self.do_nlp
                    ), """if nlp is set to true, parse must be set to true"""
        article_futures = []
        newspaper_config = self.crawler_config.crawl_option
        sources = {
            s.name: Source(s.url, newspaper_config)
            for s in self.crawler_config.sites
        }
        for s_name, source in sources.items():
            source.build()
            logger.info("Number of articles in newspaper %s is %d" %
                        (s_name, source.size()))

        logger.info("Built the sources for the newspapers")
        if not download: return sources

        logger.info("downloading the article data from the newspapers")
        for s_name, source in sources.items():
            article_futures.extend([
                self.news_pools[s_name].submit(self.download_article, article)
                for article in source.articles
            ])

        #download the actual content and parse
        for future_obj in as_completed(article_futures):
            self.article_callback(future_obj)
        return sources
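crawl_sites assumes one executor per site in self.news_pools plus a download_article worker, neither of which is shown. A minimal sketch of how they might be wired up, assuming concurrent.futures thread pools (the SiteCrawler name is hypothetical):

from concurrent.futures import ThreadPoolExecutor


class SiteCrawler:
    def __init__(self, crawler_config):
        self.crawler_config = crawler_config
        # one small executor per configured site, keyed by site name
        self.news_pools = {
            s.name: ThreadPoolExecutor(max_workers=2)
            for s in crawler_config.sites
        }

    def download_article(self, article):
        # worker submitted by crawl_sites; the completed future's result is the article
        article.download()
        return article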
Example #8
    def test_source_build(self):
        """
        builds a source object, validates it has no errors, prints out
        all valid categories and feed urls
        """
        DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
        BRAND = "cnn"

        config = Configuration()
        config.verbose = False
        s = Source("http://cnn.com", config=config)
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC

        # For this test case and a few more, I don't believe you can actually
        # assert two values to equal each other because some values are ever changing.

        # Instead, I'm just going to print some stuff out so it is just as easy to take
        # a glance and see if it looks OK.

        print("\t\tWe have %d articles currently!" % s.size())
        print()
        print("\t\t%s categories are: %s" % (s.url, str(s.category_urls())))
Example #9
 def build_sources(self, domains):
     """Build sources using newspaper API to scrape from selected domains."""
     try:
         for domain in domains:
             source = 'http://%s' % domain
             self.sources.append(source)
         for source in self.sources:
             self.paper = Source(source)
             self.paper = self.newspaper.build(source,
                                               memoize_articles=True,
                                               keep_article_html=True,
                                               verbose=True)
             print('Source: {} - Size: {}'.format(source,
                                                  self.paper.size()))
             self.papers.append(self.paper)
         self.news_pool.set(self.papers, threads_per_source=2)
         self.news_pool.join()
         return self.papers
     except:
         raise Exception
Example #10
    def test_source_build(self):
        """
        builds a source object, validates it has no errors, prints out
        all valid categories and feed urls
        """
        DESC = ('CNN.com International delivers breaking news from across '
                'the globe and information on the latest top stories, '
                'business, sports and entertainment headlines. Follow the '
                'news as it happens through: special reports, videos, '
                'audio, photo galleries plus interactive maps and timelines.')
        CATEGORY_URLS = [
            u'http://cnn.com/ASIA', u'http://connecttheworld.blogs.cnn.com',
            u'http://cnn.com/HLN', u'http://cnn.com/MIDDLEEAST',
            u'http://cnn.com', u'http://ireport.cnn.com',
            u'http://cnn.com/video', u'http://transcripts.cnn.com',
            u'http://cnn.com/espanol', u'http://partners.cnn.com',
            u'http://www.cnn.com', u'http://cnn.com/US',
            u'http://cnn.com/EUROPE', u'http://cnn.com/TRAVEL',
            u'http://cnn.com/cnni', u'http://cnn.com/SPORT',
            u'http://cnn.com/mostpopular', u'http://arabic.cnn.com',
            u'http://cnn.com/WORLD', u'http://cnn.com/LATINAMERICA',
            u'http://us.cnn.com', u'http://travel.cnn.com',
            u'http://mexico.cnn.com', u'http://cnn.com/SHOWBIZ',
            u'http://edition.cnn.com', u'http://amanpour.blogs.cnn.com',
            u'http://money.cnn.com', u'http://cnn.com/tools/index.html',
            u'http://cnnespanol.cnn.com', u'http://cnn.com/CNNI',
            u'http://business.blogs.cnn.com', u'http://cnn.com/AFRICA',
            u'http://cnn.com/TECH', u'http://cnn.com/BUSINESS'
        ]
        FEEDS = [u'http://rss.cnn.com/rss/edition.rss']
        BRAND = 'cnn'

        s = Source('http://cnn.com', verbose=False, memoize_articles=False)
        url_re = re.compile(r".*cnn\.com")
        mock_response_with(url_re, 'cnn_main_site')
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC
        assert s.size() == 241
        assert s.category_urls() == CATEGORY_URLS
        # TODO: A lot of the feed extraction is NOT being tested because feeds
        # are primarily extracted from the HTML of category URLs. We lose this
        # effect by just mocking CNN's main page HTML. Warning: tedious fix.
        assert s.feed_urls() == FEEDS
Example #11
    def test_source_build(self):
        """
        builds a source object, validates it has no errors, prints out
        all valid categories and feed urls
        """
        DESC = ('CNN.com International delivers breaking news from across '
                'the globe and information on the latest top stories, '
                'business, sports and entertainment headlines. Follow the '
                'news as it happens through: special reports, videos, '
                'audio, photo galleries plus interactive maps and timelines.')
        CATEGORY_URLS = [
            u'http://cnn.com/ASIA', u'http://connecttheworld.blogs.cnn.com',
            u'http://cnn.com/HLN', u'http://cnn.com/MIDDLEEAST',
            u'http://cnn.com', u'http://ireport.cnn.com',
            u'http://cnn.com/video', u'http://transcripts.cnn.com',
            u'http://cnn.com/espanol',
            u'http://partners.cnn.com', u'http://www.cnn.com',
            u'http://cnn.com/US', u'http://cnn.com/EUROPE',
            u'http://cnn.com/TRAVEL', u'http://cnn.com/cnni',
            u'http://cnn.com/SPORT', u'http://cnn.com/mostpopular',
            u'http://arabic.cnn.com', u'http://cnn.com/WORLD',
            u'http://cnn.com/LATINAMERICA', u'http://us.cnn.com',
            u'http://travel.cnn.com', u'http://mexico.cnn.com',
            u'http://cnn.com/SHOWBIZ', u'http://edition.cnn.com',
            u'http://amanpour.blogs.cnn.com', u'http://money.cnn.com',
            u'http://cnn.com/tools/index.html', u'http://cnnespanol.cnn.com',
            u'http://cnn.com/CNNI', u'http://business.blogs.cnn.com',
            u'http://cnn.com/AFRICA', u'http://cnn.com/TECH',
            u'http://cnn.com/BUSINESS']
        FEEDS = [u'http://rss.cnn.com/rss/edition.rss']
        BRAND = 'cnn'

        s = Source('http://cnn.com', verbose=False, memoize_articles=False)
        url_re = re.compile(r".*cnn\.com")
        mock_response_with(url_re, 'cnn_main_site')
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC
        assert s.size() == 241
        assert s.category_urls() == CATEGORY_URLS
        # TODO: A lot of the feed extraction is NOT being tested because feeds
        # are primarily extracted from the HTML of category URLs. We lose this
        # effect by just mocking CNN's main page HTML. Warning: tedious fix.
        assert s.feed_urls() == FEEDS
Example #12
    def test_source_build(self):
        """
        builds a source object, validates it has no errors, prints out
        all valid categories and feed urls
        """
        DESC = ('CNN.com International delivers breaking news from across '
                'the globe and information on the latest top stories, '
                'business, sports and entertainment headlines. Follow the '
                'news as it happens through: special reports, videos, '
                'audio, photo galleries plus interactive maps and timelines.')
        CATEGORY_URLS = [
            'http://cnn.com/ASIA', 'http://connecttheworld.blogs.cnn.com',
            'http://cnn.com/HLN', 'http://cnn.com/MIDDLEEAST',
            'http://cnn.com', 'http://ireport.cnn.com',
            'http://cnn.com/video', 'http://transcripts.cnn.com',
            'http://cnn.com/espanol',
            'http://partners.cnn.com', 'http://www.cnn.com',
            'http://cnn.com/US', 'http://cnn.com/EUROPE',
            'http://cnn.com/TRAVEL', 'http://cnn.com/cnni',
            'http://cnn.com/SPORT', 'http://cnn.com/mostpopular',
            'http://arabic.cnn.com', 'http://cnn.com/WORLD',
            'http://cnn.com/LATINAMERICA', 'http://us.cnn.com',
            'http://travel.cnn.com', 'http://mexico.cnn.com',
            'http://cnn.com/SHOWBIZ', 'http://edition.cnn.com',
            'http://amanpour.blogs.cnn.com', 'http://money.cnn.com',
            'http://cnn.com/tools/index.html', 'http://cnnespanol.cnn.com',
            'http://cnn.com/CNNI', 'http://business.blogs.cnn.com',
            'http://cnn.com/AFRICA', 'http://cnn.com/TECH',
            'http://cnn.com/BUSINESS']
        FEEDS = ['http://rss.cnn.com/rss/edition.rss']
        BRAND = 'cnn'

        s = Source('http://cnn.com', verbose=False, memoize_articles=False)
        # resp = mock_response_with('http://cnn.com', 'cnn_main_site')
        s.clean_memo_cache()
        s.build()
Example #13
 def extract_from_source(self, source):
     news = NpSource(source, verbose=True)
     news.clean_memo_cache()
     news.build()
     logging.info('...build done!')
     for url in news.article_urls():
         if self.is_available_url(url):
             article = self._extract_articles(url)
             if self.is_available_article(article):
                 self._store_article(article)
Example #14
 def test_feed_extraction(self):
     """Test that feeds are matched properly
     """
     url = 'http://theatlantic.com'
     html = mock_resource_with('theatlantic.com1', 'html')
     s = Source(url, memoize_articles=False)
     s.html = html
     s.parse()
     # mock in categories containing only homepage
     #s.set_categories()
     category = Category(url=url)
     category.html = html
     category.doc = s.doc
     s.categories = [
         category,
     ]
     #s.parse_categories()
     s.set_feeds()
     self.assertEqual(len(s.feeds), 3)
Example #15
    def test_source_build(self):
        """builds a source object, validates it has no errors, prints out
        all valid categories and feed urls"""

        DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
        BRAND = 'cnn'

        config = Configuration()
        config.verbose = False
        s = Source('http://cnn.com', config=config)
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC

        print('\t\tWe have %d articles currently!' % s.size())
Example #16
    def test_source_build(self):
        """
        builds a source object, validates it has no errors, prints out
        all valid categories and feed urls
        """
        DESC = ('CNN.com International delivers breaking news from across '
                'the globe and information on the latest top stories, '
                'business, sports and entertainment headlines. Follow the '
                'news as it happens through: special reports, videos, '
                'audio, photo galleries plus interactive maps and timelines.')
        CATEGORY_URLS = [
            'http://cnn.com/ASIA', 'http://connecttheworld.blogs.cnn.com',
            'http://cnn.com/HLN', 'http://cnn.com/MIDDLEEAST',
            'http://cnn.com', 'http://ireport.cnn.com', 'http://cnn.com/video',
            'http://transcripts.cnn.com', 'http://cnn.com/espanol',
            'http://partners.cnn.com', 'http://www.cnn.com',
            'http://cnn.com/US', 'http://cnn.com/EUROPE',
            'http://cnn.com/TRAVEL', 'http://cnn.com/cnni',
            'http://cnn.com/SPORT', 'http://cnn.com/mostpopular',
            'http://arabic.cnn.com', 'http://cnn.com/WORLD',
            'http://cnn.com/LATINAMERICA', 'http://us.cnn.com',
            'http://travel.cnn.com', 'http://mexico.cnn.com',
            'http://cnn.com/SHOWBIZ', 'http://edition.cnn.com',
            'http://amanpour.blogs.cnn.com', 'http://money.cnn.com',
            'http://cnn.com/tools/index.html', 'http://cnnespanol.cnn.com',
            'http://cnn.com/CNNI', 'http://business.blogs.cnn.com',
            'http://cnn.com/AFRICA', 'http://cnn.com/TECH',
            'http://cnn.com/BUSINESS'
        ]
        FEEDS = ['http://rss.cnn.com/rss/edition.rss']
        BRAND = 'cnn'

        s = Source('http://cnn.com', verbose=False, memoize_articles=False)
        # html = mock_resource_with('http://cnn.com', 'cnn_main_site')
        s.clean_memo_cache()
        s.build()
Example #17
 def test_source_url_input_none(self):
     with self.assertRaises(Exception):
         Source(url=None)
Example #18
def main():
    source = "Al Jazeera"
    aj = Source(
        "http://america.aljazeera.com/topics/topic/categories/international.html",
        memoize_articles=False)
    fetch_data(aj)
Example #19
import newspaper
from newspaper import Source

url = 'http://www.prothomalo.com/'
bangla_paper = Source(url, memoize_articles=False, number_threads=20)
bangla_paper.build()
print(bangla_paper.size())

for article in bangla_paper.articles:

    try:
        article.download()
        article.parse()
        print(article.url)
        print('Title :\n' + str(article.title) + '\n')
        print('Content :\n' + str(article.text) + '\n')

        if (len(article.tags) > 0):
            print('Tags :\n' + str(article.tags) + '\n')
        else:
            print('Tags :\n{}\n')

    except Exception as ex:
        print(ex)
'''
#print (newspaper.languages())
url = 'http://www.kalerkantho.com/online/Islamic-lifestylie/2017/12/29/583269';
#url = 'https://bdnews24.com/neighbours/2017/12/29/indian-state-of-assam-tense-ahead-of-citizens-list-targeting-illegal-bangladeshis'
article = Article(url, language='bn')
'''
Example #20
    def test_cache_categories(self):
        """Builds two same source objects in a row examines speeds of both
        """
        url = 'http://uk.yahoo.com'
        mock_response_with(url, 'yahoo_main_site')
        s = Source(url)
        s.download()
        s.parse()
        s.set_categories()

        saved_urls = s.category_urls()
        s.categories = []
        s.set_categories()
        assert sorted(s.category_urls()) == sorted(saved_urls)
Example #21
 def __init__(self, url):
     self.newspaper = Source(url)
     self.newspaper.clean_memo_cache()
     self.newspaper.build()
     self.articles = self.newspaper.articles
Example #22
    def test_cache_categories(self):
        """
        builds two same source objects in a row examines speeds of both
        """
        s = Source("http://yahoo.com")
        s.download()
        s.parse()
        s.set_categories()

        saved_urls = s.category_urls()
        s.categories = []  # reset and try again with caching
        s.set_categories()
        assert sorted(s.category_urls()) == sorted(saved_urls)
Example #23
class ExtractArticles:
    def __init__(self):
        self.sources = []
        self.papers = []
        self.pool = []
        self.categories = []
        self.category = None
        self.paper = None
        self.articles = []
        self.article = None
        self.newspaper = newspaper
        self.news_pool = news_pool

    def build_sources(self, domains):
        """Build sources using newspaper API to scrape from selected domains."""
        try:
            for domain in domains:
                source = 'http://%s' % domain
                self.sources.append(source)
            for source in self.sources:
                self.paper = Source(source)
                self.paper = self.newspaper.build(source,
                                                  memoize_articles=True,
                                                  keep_article_html=True,
                                                  verbose=True)
                print('Source: {} - Size: {}'.format(source,
                                                     self.paper.size()))
                self.papers.append(self.paper)
            self.news_pool.set(self.papers, threads_per_source=2)
            self.news_pool.join()
            return self.papers
        except:
            raise Exception

    def parse_article(self, paper, order=0):
        self.paper = paper
        try:
            self.article = paper.articles[order]
            article = self.article
            article.download()
            article.parse()
            brand = paper.brand
            url = article.url
            text = article.text
            html = article.article_html
            title = article.title
            images = article.images
            video = article.movies
            date = article.publish_date
            result = {
                'paper': brand,
                'article_url': url,
                'title': title,
                'text': text,
                'content': html,
                'video': video,
                'images': images,
                'publish_time': date
            }
            return result
        except:
            raise Exception

    def parse_articles(self, pool):
        try:
            for paper in pool:
                size = paper.size()
                brand = paper.brand
                index = 0  # reset for each paper so every paper's articles are parsed
                while index < size:
                    article = self.parse_article(paper, index)
                    self.articles.append(article)
                    index += 1
                print('Paper [{}] has new [{}] articles'.format(brand, size))
            return self.articles
        except:
            raise Exception

    def remove_invalid_articles(self, pool):
        """Remove scraped articles with duplicated or None titles."""
        try:
            title_list = []
            article_list = []
            print('Original articles: {}'.format(len(pool)))
            for article in pool:
                title = article['title']
                if title is None or title == "":
                    # skip articles without a usable title instead of mutating
                    # the list while iterating over it
                    continue
                if title not in title_list:
                    title_list.append(title)
                    article_list.append(article)
            print('Unique articles: {}'.format(len(article_list)))
            return article_list
        except:
            raise Exception
Example #24
    def test_cache_categories(self):
        """
        builds two same source objects in a row examines speeds of both
        """
        s = Source('http://yahoo.com')
        s.download()
        s.parse()
        s.set_categories()

        saved_urls = s.category_urls()
        s.categories = []  # reset and try again with caching

        s.set_categories()

        assert sorted(s.category_urls()) == sorted(saved_urls)
Example #25
 def failfunc():
     Source(url=None)
Example #26
    def test_cache_categories(self):
        """Builds two same source objects in a row examines speeds of both
        """
        url = 'http://uk.yahoo.com'
        html = mock_resource_with('yahoo_main_site', 'html')
        s = Source(url)
        s.download()
        s.parse()
        s.set_categories()

        saved_urls = s.category_urls()
        s.categories = []
        s.set_categories()
        self.assertCountEqual(saved_urls, s.category_urls())
Example #27
fname = "data.csv"
df_origin = pd.read_csv(fname, header=None)
data = df_origin.to_numpy()

print(data.shape)
# create list of Article objects
urls = data[1:, 0].tolist()

# for each line in csv
articles = []
for i in range(len(urls)):
    # print "iteration:{} {} ".format(i,urls[i])
    articles.append(Article(url=urls[i]))

# create a source of articles
news_source = Source("https://www.dummyurl.com")
news_source.articles = articles
# create a news_pool for threading purposes
news_pool.set([news_source], threads_per_source=2)
news_pool.join()

# iterate through article list to create a column for the csv
print "Parsing articles..."

article_list = []
labels = ['title', 'authors', 'text', 'keywords', 'summary', 'tags']
for article in articles:
    print "Parsing article {}".format(article.url)
    article.parse()
    article_list.append({
        labels[0]: article.title,
Example #28
 def failfunc():
     __ = Source(url=None)
Example #29
    def paper(self):
        '''
            get newspaper articles, default source is `herald` newspaper
            defaults to articles of this month and year
            import newspaperzw

            news = newspaperzw.news()
        '''

        if self.summary and not self.nlp:
            # raise exception. `nltk` module missing
            raise Exception(self.error_msg)

        news_source = Providers().getUrl(self.provider).strip()

        name = Source(news_source, self.config)
        name.build()
        name.download()
        name.parse()
        name.download_articles()

        # do logging
        logging.debug(f"News Source build and downloaded. url: {news_source}")

        news_data = {}
        news_article = []

        counter = 0
        for article in name.article_urls():
            images = ""
            keywords = ""

            try:
                name.articles[counter].download()
                name.articles[counter].parse()

                # log
                logging.debug(
                    f"Article #{counter} downloaded and parsed successfully")

            except Exception:
                counter += 1

                # log
                logging.error(
                    f"Error downloading and parsing article #{counter}; continuing.."
                )
                continue

            # get in data
            title = name.articles[counter].title
            date_pub = name.articles[counter].publish_date
            top_image = name.articles[counter].top_image
            link = name.articles[counter].url
            text = name.articles[counter].text

            if (self.nlp):
                # do nlp stuff
                name.articles[counter].nlp()
                summary = name.articles[counter].summary

                for words in name.articles[counter].keywords:
                    keywords += str(words) + ','

                # log
                logging.debug(
                    "summary flag enabled. NLP summary obtained successfully")

            # add to news pool, only add news of this year and month
            # data_pub format = 10-04-2018 21:28:09
            data = {}
            if (self.nlp):
                data.update({
                    "article_id": randint(555, 999),
                    "title": title,
                    "published": date_pub,
                    "image": top_image,
                    "news": text,
                    "summary": summary,
                    "keywords": keywords.rstrip(','),
                    "url": link
                })

                # log
                logging.debug("article data with summary saved to news pool!")

            else:
                data.update({
                    "article_id": randint(555, 999),
                    "title": title,
                    "published": date_pub,
                    "image": top_image,
                    "news": text,
                    "url": link
                })

                # log
                logging.debug("article data added to news pool")

            news_article.append(data)
            data = {}

            # increment to next articles
            counter += 1

        # build main news storage
        news_data.update({
            'source': name.brand,
            'domain': name.domain,
            'news': news_article
        })

        # log
        logging.debug("News main data pool created on success")

        return news_data
Example #30
    def test_cache_categories(self):
        """Builds two same source objects in a row examines speeds of both
        """
        url = 'http://uk.yahoo.com'
        mock_response_with(url, 'yahoo_main_site')
        s = Source(url)
        s.download()
        s.parse()
        s.set_categories()

        saved_urls = s.category_urls()
        s.categories = []
        s.set_categories()
        assert sorted(s.category_urls()) == sorted(saved_urls)
Example #31
    def test_cache_categories(self):
        """Builds two same source objects in a row examines speeds of both
        """
        url = 'http://uk.yahoo.com'
        html = mock_resource_with('yahoo_main_site', 'html')
        s = Source(url)
        s.download()
        s.parse()
        s.set_categories()

        saved_urls = s.category_urls()
        s.categories = []
        s.set_categories()
        self.assertCountEqual(saved_urls, s.category_urls())
Example #32
 def test_source_default_params(self):
     s = Source(url='http://cnn.com')
     self.assertEqual('en', s.config.language)
     self.assertEqual(20000, s.config.MAX_FILE_MEMO)
     self.assertTrue(s.config.memoize_articles)
     self.assertTrue(s.config.use_meta_language)
Example #33
def main():
    source="The Huffington Post"
    delivery_time="6:00"
    #config = Config()
    #config.memoize_articles = False
    hpost = Source("http://huffingtonpost.com/theworldpost", memoize_articles=False)
    hpost.download()
    hpost.parse()

    hpost.set_categories()
    
    hpost.categories = [hpost.categories[0]]
    hpost.categories[0].url = "http://huffingtonpost.com/theworldpost"
    hpost.download_categories()
    hpost.parse_categories()

    hpost.set_feeds()
    hpost.download_feeds()

    hpost.generate_articles()
    
    #for c in hpost.categories:
    #    print(c)
    #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False)
    #news_pool.set([guardian], threads_per_source=2)
    #news_pool.join()

    #print(hpost.size())

    for article in [x for x in hpost.articles if re.match(".*html.*world.*", x.url) is not None]:
        url = article.url
        a = Article(url, language='en')
        a.download()
        for i in range(10):
            if a.is_downloaded:
                break
            else:
                a.download()
        
        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        summary = a.summary
        keywords = a.keywords
        title = a.title
        text = a.text
        #print(html)
        #print(text)
        #print(summary)
        #print(keywords)
        #print(title)
        #print(a.publish_date)
        if source in title:
            title = None
        #print(title)
        findtime = re.search(r'Posted.*<time datetime="(.*?)">', html)
        if findtime is None:
            # no timestamp found; skip this article so strptime below does not fail
            continue
        else:
            date, time = findtime.group(1).split("T")
            date = date.split("-")
            date[0], date[1], date[2] = date[1], date[2], date[0]
            date = "/".join(date)
            
            time = ":".join(time.split("-")[0].split(":")[0:2])
        date_time = str(date) + " " + str(time)
        #print(title)
        #print(date_time)
        date_obj = datetime.datetime.strptime(date_time,'%m/%d/%Y %H:%M')
        #print(date_obj.strftime('%Y/%m/%d %I:%M %p'))

        try:
            article = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article('Huffington Post', article, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)
Example #34
def main():
    source = "The Washington Post"
    delivery_time = "6:00"
    #config = Config()
    #config.memoize_articles = False
    wpost = Source("http://washingtonpost.com/world", memoize_articles=False)
    wpost.download()
    wpost.parse()

    wpost.set_categories()

    wpost.categories = [wpost.categories[0]]
    wpost.categories[0].url = "http://washingtonpost.com/world"
    wpost.download_categories()
    wpost.parse_categories()

    wpost.set_feeds()
    wpost.download_feeds()

    wpost.generate_articles()

    #for c in wpost.categories:
    #    print(c)
    #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False)
    #news_pool.set([guardian], threads_per_source=2)
    #news_pool.join()

    #print(wpost.size())

    for article in [
            x for x in wpost.articles
            if re.match(".*com/world/.*", x.url) is not None
            and re.match(".*gallery.html", x.url) is None
    ]:
        url = article.url
        a = Article(url, language='en')
        a.download()

        for i in range(10):
            if a.is_downloaded:
                break
            else:
                a.download()

        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        summary = a.summary
        keywords = a.keywords
        title = a.title
        text = a.text
        #print(html)
        #print(text)
        #print(summary)
        #print(keywords)
        #print(title)
        #print(a.publish_date)
        if source in title:
            title = None
        #print(title)
        if a.publish_date is not None:
            date = str(a.publish_date).split()[0].split("-")
            #print(date)
            date[0], date[1], date[2] = date[1], date[2], date[0]
            date = "/".join(date)
        else:
            date = None
        time = re.search(r'<span class="pb-timestamp">(.*?)</span>', html)
        if time is None:
            # no timestamp found; skip this article so strptime below does not fail
            print(url)
            continue
        else:
            time = time.group(1)
            if ":" not in time:
                time = delivery_time
            else:
                time = time.split(" at ")[1]
                time = datetime.datetime.strptime(time,
                                                  '%I:%M %p').strftime('%H:%M')
        date_time = str(date) + " " + str(time)
        #print(date_time)
        date_obj = datetime.datetime.strptime(date_time, '%m/%d/%Y %H:%M')
        #print(date_obj.strftime('%Y/%m/%d %I:%M %p'))
        #print(text)
        #print(date_time)
        #TODO: Add stuff to the DB

        try:
            article = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article(source, article, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)
Example #35
def main():
    source="The Washington Post"
    delivery_time="6:00"
    #config = Config()
    #config.memoize_articles = False
    wpost = Source("http://washingtonpost.com/world", memoize_articles=False)
    wpost.download()
    wpost.parse()

    wpost.set_categories()
    
    wpost.categories = [wpost.categories[0]]
    wpost.categories[0].url = "http://washingtonpost.com/world"
    wpost.download_categories()
    wpost.parse_categories()

    wpost.set_feeds()
    wpost.download_feeds()

    wpost.generate_articles()
    
    #for c in wpost.categories:
    #    print(c)
    #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False)
    #news_pool.set([guardian], threads_per_source=2)
    #news_pool.join()

    #print(wpost.size())

    for article in [x for x in wpost.articles if re.match(".*com/world/.*", x.url) is not None and re.match(".*gallery.html", x.url) is None]:
        url = article.url
        a = Article(url, language='en')
        a.download()

        for i in range(10):
            if a.is_downloaded:
                break
            else:
                a.download()

        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        summary = a.summary
        keywords = a.keywords
        title = a.title
        text = a.text
        #print(html)
        #print(text)
        #print(summary)
        #print(keywords)
        #print(title)
        #print(a.publish_date)
        if source in title:
            title = None
        #print(title)
        if a.publish_date is not None:
            date = str(a.publish_date).split()[0].split("-")
            #print(date)
            date[0], date[1], date[2] = date[1], date[2], date[0]
            date = "/".join(date)
        else:
            date = None
        time = re.search(r'<span class="pb-timestamp">(.*?)</span>' , html)
        if time is None:
            print(url)
            date = None
        else:
            time = time.group(1)
            if ":" not in time:
                time = delivery_time
            else:
                time = time.split(" at ")[1]
                time = datetime.datetime.strptime(time,'%I:%M %p').strftime('%H:%M')
        date_time = str(date) + " " + str(time)
        #print(date_time)
        date_obj = datetime.datetime.strptime(date_time,'%m/%d/%Y %H:%M')
        #print(date_obj.strftime('%Y/%m/%d %I:%M %p'))
        #print(text)
        #print(date_time)
        #TODO: Add stuff to the DB

        try:
            article = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article(source, article, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)
Example #36
def main():
    source = "The Guardian"
    #config = Config()
    #config.memoize_articles = False
    guardian = Source("http://www.theguardian.com/world",
                      memoize_articles=False)
    guardian.build()
    #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False)
    #news_pool.set([guardian], threads_per_source=2)
    #news_pool.join()

    #print(guardian.size())

    for article in [
            x for x in guardian.articles
            if re.match(".*/world/.*", x.url) is not None
    ]:
        url = article.url
        a = Article(url, language='en')
        a.download()
        for i in range(10):
            if a.is_downloaded:
                break
            else:
                a.download()
        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        summary = a.summary
        keywords = a.keywords
        title = a.title
        text = a.text
        date = str(a.publish_date).split()[0].split("-")
        date[0], date[1], date[2] = date[1], date[2], date[0]
        date = "/".join(date)
        # Extract the dateline time (e.g. "13.00 GMT") and normalise it to "13:00"
        time = re.search(r'<span class="content__dateline-time">(.*)</span>',
                         html).group(1).replace(".", ":").split()[0]
        date_time = date + " " + time
        #print(title)
        #print(date_time)
        date_obj = datetime.datetime.strptime(date_time, '%m/%d/%Y %H:%M')
        #print(date_obj.strftime('%Y/%m/%d %I:%M %p'))
        #TODO: Add stuff to the DB

        try:
            article = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article('The Guardian', article, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)
Example #37
def main():
    source = "BBC"
    bbc = Source("http://www.bbc.com/news", memoize_articles=False)
    fetch_data(bbc)
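fetch_data is referenced here and in Example #18 but not defined in these examples. A minimal sketch of what such a helper might do, assuming the same Source/Article API used throughout (the body is hypothetical):

def fetch_data(paper):
    # hypothetical helper: build the source, then download and parse each article
    paper.build()
    for article in paper.articles:
        try:
            article.download()
            article.parse()
        except Exception as ex:
            print('Skipping {}: {}'.format(article.url, ex))
            continue
        print(article.title)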