def test_cache_categories(self): """ builds two same source objects in a row examines speeds of both """ s = Source('http://yahoo.com') s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] # reset and try again with caching s.set_categories() assert sorted(s.category_urls()) == sorted(saved_urls)
def test_cache_categories(self): """ builds two same source objects in a row examines speeds of both """ s = Source("http://yahoo.com") s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] # reset and try again with caching s.set_categories() assert sorted(s.category_urls()) == sorted(saved_urls)
def test_cache_categories(self): """Builds two same source objects in a row examines speeds of both """ url = 'http://uk.yahoo.com' mock_response_with(url, 'yahoo_main_site') s = Source(url) s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] s.set_categories() assert sorted(s.category_urls()) == sorted(saved_urls)
def test_cache_categories(self): """Builds two same source objects in a row examines speeds of both """ url = 'http://uk.yahoo.com' html = mock_resource_with('yahoo_main_site', 'html') s = Source(url) s.download() s.parse() s.set_categories() saved_urls = s.category_urls() s.categories = [] s.set_categories() self.assertCountEqual(saved_urls, s.category_urls())
def main(): source="The Huffington Post" delivery_time="6:00" #config = Config() #config.memoize_articles = False hpost = Source("http://huffingtonpost.com/theworldpost", memoize_articles=False) hpost.download() hpost.parse() hpost.set_categories() hpost.categories = [hpost.categories[0]] hpost.categories[0].url = "http://huffingtonpost.com/theworldpost" hpost.download_categories() hpost.parse_categories() hpost.set_feeds() hpost.download_feeds() hpost.generate_articles() #for c in hpost.categories: # print(c) #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False) #news_pool.set([guardian], threads_per_source=2) #news_pool.join() #print(hpost.size()) for article in [x for x in hpost.articles if re.match(".*html.*world.*", x.url) is not None]: url = article.url a = Article(url, language='en') a.download() for i in range(10): if a.is_downloaded: break else: a.download() try: a.parse() a.nlp() except: print("Error: Not parsed/downloaded correctly.") continue html = a.html summary = a.summary keywords = a.keywords title = a.title text = a.text #print(html) #print(text) #print(summary) #print(keywords) #print(title) #print(a.publish_date) if source in title: title = None #print(title) findtime = re.search(r'Posted.*<time datetime="(.*?)">', html) if findtime is None: date=None time=None else: date,time = findtime.group(1).split("T") date = date.split("-") date[0], date[1], date[2] = date[1], date[2], date[0] date = "/".join(date) time = ":".join(time.split("-")[0].split(":")[0:2]) date_time = str(date) + " " + str(time) #print(title) #print(date_time) date_obj = datetime.datetime.strptime(date_time,'%m/%d/%Y %H:%M') #print(date_obj.strftime('%Y/%m/%d %I:%M %p')) try: article = { 'headline': title, 'url': url, 'text': text, 'date': date_obj } newspaper_article('Huffington Post', article, keywords=keywords) except Exception as ex: print 'Article could not be created due to following error' print ex
def main(): source="The Washington Post" delivery_time="6:00" #config = Config() #config.memoize_articles = False wpost = Source("http://washingtonpost.com/world", memoize_articles=False) wpost.download() wpost.parse() wpost.set_categories() wpost.categories = [wpost.categories[0]] wpost.categories[0].url = "http://washingtonpost.com/world" wpost.download_categories() wpost.parse_categories() wpost.set_feeds() wpost.download_feeds() wpost.generate_articles() #for c in wpost.categories: # print(c) #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False) #news_pool.set([guardian], threads_per_source=2) #news_pool.join() #print(wpost.size()) for article in [x for x in wpost.articles if re.match(".*com/world/.*", x.url) is not None and re.match(".*gallery.html", x.url) is None]: url = article.url a = Article(url, language='en') a.download() for i in range(10): if a.is_downloaded: break else: a.download() try: a.parse() a.nlp() except: print("Error: Not parsed/downloaded correctly.") continue html = a.html summary = a.summary keywords = a.keywords title = a.title text = a.text #print(html) #print(text) #print(summary) #print(keywords) #print(title) #print(a.publish_date) if source in title: title = None #print(title) if a.publish_date is not None: date = str(a.publish_date).split()[0].split("-") #print(date) date[0], date[1], date[2] = date[1], date[2], date[0] date = "/".join(date) else: date = None time = re.search(r'<span class="pb-timestamp">(.*?)</span>' , html) if time is None: print(url) date = None else: time = time.group(1) if ":" not in time: time = delivery_time else: time = time.split(" at ")[1] time = datetime.datetime.strptime(time,'%I:%M %p').strftime('%H:%M') date_time = str(date) + " " + str(time) #print(date_time) date_obj = datetime.datetime.strptime(date_time,'%m/%d/%Y %H:%M') #print(date_obj.strftime('%Y/%m/%d %I:%M %p')) #print(text) #print(date_time) #TODO: Add stuff to the DB try: article = { 'headline': title, 'url': url, 'text': text, 'date': date_obj } newspaper_article(source, article, keywords=keywords) except Exception as ex: print 'Article could not be created due to following error' print ex
def main(): source = "The Washington Post" delivery_time = "6:00" #config = Config() #config.memoize_articles = False wpost = Source("http://washingtonpost.com/world", memoize_articles=False) wpost.download() wpost.parse() wpost.set_categories() wpost.categories = [wpost.categories[0]] wpost.categories[0].url = "http://washingtonpost.com/world" wpost.download_categories() wpost.parse_categories() wpost.set_feeds() wpost.download_feeds() wpost.generate_articles() #for c in wpost.categories: # print(c) #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False) #news_pool.set([guardian], threads_per_source=2) #news_pool.join() #print(wpost.size()) for article in [ x for x in wpost.articles if re.match(".*com/world/.*", x.url) is not None and re.match(".*gallery.html", x.url) is None ]: url = article.url a = Article(url, language='en') a.download() for i in range(10): if a.is_downloaded: break else: a.download() try: a.parse() a.nlp() except: print("Error: Not parsed/downloaded correctly.") continue html = a.html summary = a.summary keywords = a.keywords title = a.title text = a.text #print(html) #print(text) #print(summary) #print(keywords) #print(title) #print(a.publish_date) if source in title: title = None #print(title) if a.publish_date is not None: date = str(a.publish_date).split()[0].split("-") #print(date) date[0], date[1], date[2] = date[1], date[2], date[0] date = "/".join(date) else: date = None time = re.search(r'<span class="pb-timestamp">(.*?)</span>', html) if time is None: print(url) date = None else: time = time.group(1) if ":" not in time: time = delivery_time else: time = time.split(" at ")[1] time = datetime.datetime.strptime(time, '%I:%M %p').strftime('%H:%M') date_time = str(date) + " " + str(time) #print(date_time) date_obj = datetime.datetime.strptime(date_time, '%m/%d/%Y %H:%M') #print(date_obj.strftime('%Y/%m/%d %I:%M %p')) #print(text) #print(date_time) #TODO: Add stuff to the DB try: article = { 'headline': title, 'url': url, 'text': text, 'date': date_obj } newspaper_article(source, article, keywords=keywords) except Exception as ex: print 'Article could not be created due to following error' print ex
import logging
from random import randint

from newspaper import Source  # Providers is a project-local helper

def paper(self):
    '''Get newspaper articles; the default source is the `herald` newspaper.
    Defaults to articles of this month and year.

        import newspaperzw
        news = newspaperzw.news()
    '''
    if self.summary and not self.nlp:
        # summaries require NLP support; the `nltk` module is missing
        raise Exception(self.error_msg)

    news_source = Providers().getUrl(self.provider).strip()
    name = Source(news_source, self.config)
    name.build()
    name.download()
    name.parse()
    name.download_articles()
    logging.debug(f"News source built and downloaded. url: {news_source}")

    news_data = {}
    news_article = []
    counter = 0
    for article in name.article_urls():
        images = ""
        keywords = ""
        try:
            name.articles[counter].download()
            name.articles[counter].parse()
            logging.debug(f"Article #{counter} downloaded and parsed successfully")
        except Exception:
            logging.error(f"Error downloading and parsing article #{counter}; continuing...")
            counter += 1
            continue

        title = name.articles[counter].title
        date_pub = name.articles[counter].publish_date
        top_image = name.articles[counter].top_image
        link = name.articles[counter].url
        text = name.articles[counter].text

        if self.nlp:
            # do nlp stuff
            name.articles[counter].nlp()
            summary = name.articles[counter].summary
            for words in name.articles[counter].keywords:
                keywords += str(words) + ','
            logging.debug("summary flag enabled. NLP summary obtained successfully")

        # add to the news pool; only add news of this year and month
        # date_pub format = 10-04-2018 21:28:09
        data = {}
        if self.nlp:
            data.update({
                "article_id": randint(555, 999),
                "title": title,
                "published": date_pub,
                "image": top_image,
                "news": text,
                "summary": summary,
                "keywords": keywords.rstrip(','),
                "url": link
            })
            logging.debug("article data with summary saved to news pool!")
        else:
            data.update({
                "article_id": randint(555, 999),
                "title": title,
                "published": date_pub,
                "image": top_image,
                "news": text,
                "url": link
            })
            logging.debug("article data added to news pool")
        news_article.append(data)
        counter += 1  # move on to the next article

    # build the main news storage
    news_data.update({
        'source': name.brand,
        'domain': name.domain,
        'news': news_article
    })
    logging.debug("News main data pool created on success")
    return news_data
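# A usage sketch based on the docstring above; the `news()` entry point and
# the idea that it returns an object exposing `paper()` are assumptions drawn
# from that docstring, not a verified API:
import newspaperzw

news = newspaperzw.news()  # defaults to the `herald` provider
data = news.paper()
print(data['source'], data['domain'])
for item in data['news']:
    # each item carries title, published date, image, text, and url
    print(item['title'], item['url'])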