Example #1
class TestPool(TestCase):
    __timeout__ = 5
    size = 1

    def setUp(self):
        greentest.TestCase.setUp(self)
        self.pool = ThreadPool(self.size)

    def test_apply(self):
        papply = self.pool.apply
        self.assertEqual(papply(sqr, (5,)), sqr(5))
        self.assertEqual(papply(sqr, (), {'x': 3}), sqr(x=3))

    def test_map(self):
        pmap = self.pool.map
        self.assertEqual(pmap(sqr, range(10)), list(map(sqr, range(10))))
        self.assertEqual(pmap(sqr, range(100)), list(map(sqr, range(100))))

    def test_async(self):
        res = self.pool.apply_async(sqr, (7, TIMEOUT1,))
        get = TimingWrapper(res.get)
        self.assertEqual(get(), 49)
        self.assertAlmostEqual(get.elapsed, TIMEOUT1, 1)

    def test_async_callback(self):
        result = []
        res = self.pool.apply_async(sqr, (7, TIMEOUT1,), callback=lambda x: result.append(x))
        get = TimingWrapper(res.get)
        self.assertEqual(get(), 49)
        self.assertAlmostEqual(get.elapsed, TIMEOUT1, 1)
        gevent.sleep(0)  # let the callback run
        assert result == [49], result

    def test_async_timeout(self):
        res = self.pool.apply_async(sqr, (6, TIMEOUT2 + 0.2))
        get = TimingWrapper(res.get)
        self.assertRaises(gevent.Timeout, get, timeout=TIMEOUT2)
        self.assertAlmostEqual(get.elapsed, TIMEOUT2, 1)
        self.pool.join()

    def test_imap(self):
        it = self.pool.imap(sqr, range(10))
        self.assertEqual(list(it), list(map(sqr, range(10))))

        it = self.pool.imap(sqr, range(10))
        for i in range(10):
            self.assertEqual(six.advance_iterator(it), i * i)
        self.assertRaises(StopIteration, lambda: six.advance_iterator(it))

        it = self.pool.imap(sqr, range(1000))
        for i in range(1000):
            self.assertEqual(six.advance_iterator(it), i * i)
        self.assertRaises(StopIteration, lambda: six.advance_iterator(it))

    def test_imap_random(self):
        it = self.pool.imap(sqr_random_sleep, range(10))
        self.assertEqual(list(it), list(map(sqr, range(10))))

    def test_imap_unordered(self):
        it = self.pool.imap_unordered(sqr, range(1000))
        self.assertEqual(sorted(it), list(map(sqr, range(1000))))

        it = self.pool.imap_unordered(sqr, range(1000))
        self.assertEqual(sorted(it), list(map(sqr, range(1000))))

    def test_imap_unordered_random(self):
        it = self.pool.imap_unordered(sqr_random_sleep, range(10))
        self.assertEqual(sorted(it), list(map(sqr, range(10))))

    def test_terminate(self):
        result = self.pool.map_async(sleep, [0.1] * ((self.size or 10) * 2))
        gevent.sleep(0.1)
        kill = TimingWrapper(self.pool.kill)
        kill()
        assert kill.elapsed < 0.5, kill.elapsed
        result.join()

    def sleep(self, x):
        sleep(float(x) / 10.)
        return str(x)

    def test_imap_unordered_sleep(self):
        # testing that imap_unordered returns items in completion order
        result = list(self.pool.imap_unordered(self.sleep, [10, 1, 2]))
        if self.pool.size == 1:
            expected = ['10', '1', '2']
        else:
            expected = ['1', '2', '10']
        self.assertEqual(result, expected)
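The test above relies on a few helpers that are not shown in the snippet: sqr, sqr_random_sleep, a module-level sleep, TimingWrapper, and the TIMEOUT1/TIMEOUT2 constants. A minimal sketch of what they presumably look like (the values and implementations below are assumptions, not the originals from the gevent test suite):

import random
import time

# Assumed stand-ins; the real test suite defines its own versions and values.
TIMEOUT1 = 0.8
TIMEOUT2 = 0.35

def sqr(x, wait=0.0):
    # Square x, optionally blocking the worker thread first.
    time.sleep(wait)
    return x * x

def sqr_random_sleep(x):
    # Like sqr, but with a small random delay so completion order is shuffled.
    time.sleep(random.random() * 0.1)
    return x * x

def sleep(seconds):
    time.sleep(seconds)
    return seconds

class TimingWrapper(object):
    # Wraps a callable and records how long the last call took in .elapsed.
    def __init__(self, func):
        self.func = func
        self.elapsed = None

    def __call__(self, *args, **kwargs):
        start = time.time()
        try:
            return self.func(*args, **kwargs)
        finally:
            self.elapsed = time.time() - start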
Example #2
class TestPool(TestCase):
    __timeout__ = 5
    size = 1

    def setUp(self):
        greentest.TestCase.setUp(self)
        self.pool = ThreadPool(self.size)

    def test_apply(self):
        papply = self.pool.apply
        self.assertEqual(papply(sqr, (5, )), sqr(5))
        self.assertEqual(papply(sqr, (), {'x': 3}), sqr(x=3))

    def test_map(self):
        pmap = self.pool.map
        self.assertEqual(pmap(sqr, range(10)), list(map(sqr, range(10))))
        self.assertEqual(pmap(sqr, range(100)), list(map(sqr, range(100))))

    def test_async(self):
        res = self.pool.apply_async(sqr, (
            7,
            TIMEOUT1,
        ))
        get = TimingWrapper(res.get)
        self.assertEqual(get(), 49)
        self.assertAlmostEqual(get.elapsed, TIMEOUT1, 1)

    def test_async_callback(self):
        result = []
        res = self.pool.apply_async(sqr, (
            7,
            TIMEOUT1,
        ),
                                    callback=lambda x: result.append(x))
        get = TimingWrapper(res.get)
        self.assertEqual(get(), 49)
        self.assertAlmostEqual(get.elapsed, TIMEOUT1, 1)
        gevent.sleep(0)  # let the callback run
        assert result == [49], result

    def test_async_timeout(self):
        res = self.pool.apply_async(sqr, (6, TIMEOUT2 + 0.2))
        get = TimingWrapper(res.get)
        self.assertRaises(gevent.Timeout, get, timeout=TIMEOUT2)
        self.assertAlmostEqual(get.elapsed, TIMEOUT2, 1)
        self.pool.join()

    def test_imap(self):
        it = self.pool.imap(sqr, range(10))
        self.assertEqual(list(it), list(map(sqr, range(10))))

        it = self.pool.imap(sqr, range(10))
        for i in range(10):
            self.assertEqual(six.advance_iterator(it), i * i)
        self.assertRaises(StopIteration, lambda: six.advance_iterator(it))

        it = self.pool.imap(sqr, range(1000))
        for i in range(1000):
            self.assertEqual(six.advance_iterator(it), i * i)
        self.assertRaises(StopIteration, lambda: six.advance_iterator(it))

    def test_imap_random(self):
        it = self.pool.imap(sqr_random_sleep, range(10))
        self.assertEqual(list(it), list(map(sqr, range(10))))

    def test_imap_unordered(self):
        it = self.pool.imap_unordered(sqr, range(1000))
        self.assertEqual(sorted(it), list(map(sqr, range(1000))))

        it = self.pool.imap_unordered(sqr, range(1000))
        self.assertEqual(sorted(it), list(map(sqr, range(1000))))

    def test_imap_unordered_random(self):
        it = self.pool.imap_unordered(sqr_random_sleep, range(10))
        self.assertEqual(sorted(it), list(map(sqr, range(10))))

    def test_terminate(self):
        result = self.pool.map_async(sleep, [0.1] * ((self.size or 10) * 2))
        gevent.sleep(0.1)
        kill = TimingWrapper(self.pool.kill)
        kill()
        assert kill.elapsed < 0.5, kill.elapsed
        result.join()

    def sleep(self, x):
        sleep(float(x) / 10.)
        return str(x)

    def test_imap_unordered_sleep(self):
        # testing that imap_unordered returns items in completion order
        result = list(self.pool.imap_unordered(self.sleep, [10, 1, 2]))
        if self.pool.size == 1:
            expected = ['10', '1', '2']
        else:
            expected = ['1', '2', '10']
        self.assertEqual(result, expected)
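Outside of a test harness, the same ThreadPool calls can be exercised in a few lines. A minimal, self-contained sketch (the sqr helper is the same hypothetical stand-in as above):

import time
from gevent.threadpool import ThreadPool

def sqr(x, wait=0.0):
    time.sleep(wait)          # a blocking sleep is fine inside a worker thread
    return x * x

pool = ThreadPool(3)

print(pool.apply(sqr, (5,)))             # runs in the pool, blocks the caller -> 25
print(pool.map(sqr, range(5)))           # ordered results -> [0, 1, 4, 9, 16]

res = pool.apply_async(sqr, (7, 0.2), callback=print)   # callback also receives 49
print(res.get(timeout=1))                # 49

print(sorted(pool.imap_unordered(sqr, range(5))))        # completion order, sorted here
pool.join()                              # wait for any remaining tasks to finish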
Example #3
	def handle(self, *args, **options):
		self.verbose = options['verbosity'] > 1
		self.debug = options['debug']
		if self.debug:
			self.verbose=True
		self.full = options['full']

		if options['id']:
			feeds = Blog.objects.filter(pk=options['id'])
		else:
			# Fetch all feeds that are not archived. We do fetch feeds that are not
			# yet approved, to make sure they work.
			feeds = Blog.objects.filter(archived=False)

		# Fan out the fetching itself
		fetchers = [FeedFetcher(f, self.trace) for f in feeds]
		num = len(fetchers)
		pool = ThreadPool(options['parallelism'])
		pr = pool.map_async(self._fetch_one_feed, fetchers)
		while not pr.ready():
			gevent.sleep(1)
			self.trace("Fetching feeds (%s/%s done), please wait..." % (num-pool.task_queue.unfinished_tasks, num))

		total_entries = 0
		# Fetching was async, but results processing will be sync. Don't want to deal with
		# multithreaded database connections and such complications.
		try:
			with transaction.atomic():
				for feed, results in pr.get():
					if isinstance(results, ParserGotRedirect):
						# Received a redirect. If this is a redirect for exactly the same URL just
						# from http to https, special case this and allow it. For any other redirect,
						# we don't follow it since it might no longer be a properly filtered feed
						# for example.
						if results.url == feed.feedurl:
							# Redirect to itself! Should never happen, of course.
							AggregatorLog(feed=feed, success=False,
										  info="Feed returned redirect loop to itself!").save()
						elif results.url == feed.feedurl.replace('http://', 'https://'):
							# OK, update it!
							AggregatorLog(feed=feed, success=True,
										  info="Feed returned redirect to https, updating registration").save()
							send_simple_mail(settings.EMAIL_SENDER,
											 feed.user.email,
											 "Your blog at Planet PostgreSQL redirected",
											 u"The blog aggregator at Planet PostgreSQL has picked up a redirect for your blog.\nOld URL: {0}\nNew URL: {1}\n\nThe database has been updated, and new entries will be fetched from the secure URL in the future.\n".format(feed.feedurl, results.url),
											 sendername="Planet PostgreSQL",
											 receivername=u"{0} {1}".format(feed.user.first_name, feed.user.last_name),
											 )
							send_simple_mail(settings.EMAIL_SENDER,
											 settings.NOTIFICATION_RECEIVER,
											 "Blog redirect detected on Planet PostgreSQL",
											 u"The blog at {0} by {1}\nis returning a redirect to a https version of itself.\n\nThe database has automatically been updated, and will start fetching using https in the future,\n\n".format(feed.feedurl, feed.user),
											 sendername="Planet PostgreSQL",
											 receivername="Planet PostgreSQL Moderators",
							)
							feed.feedurl = results.url
							feed.save()
						else:
							AggregatorLog(feed=feed, success=False,
										  info="Feed returned redirect (http 301)").save()
					elif isinstance(results, Exception):
						AggregatorLog(feed=feed,
									  success=False,
									  info=results).save()
					else:
						if feed.approved:
							had_entries = True
						else:
							had_entries = feed.has_entries
						entries = 0
						titles = []
						ids = []

						for entry in results:
							self.trace("Found entry at %s" % entry.link)
							# Entry is a post, but we need to check if it's already there. Check
							# is done on guid. Some blogs use http and https in the guid, and
							# also change between them depending on how the blog is fetched,
							# so check for those two explicitly.
							if 'http://' in entry.guid:
								alternateguid = entry.guid.replace('http://', 'https://')
							elif 'https://' in entry.guid:
								alternateguid = entry.guid.replace('https://', 'http://')
							else:
								alternateguid = None
							# We check if this entry has been syndicated on any *other* blog as well,
							# so we don't accidentally post something more than once.
							if not Post.objects.filter(Q(guid=entry.guid) | Q(guid=alternateguid)).exists():
								self.trace("Saving entry at %s" % entry.link)
								entry.save()
								entry.update_shortlink()
								AggregatorLog(feed=feed,
											  success=True,
											  info="Fetched entry at '%s'" % entry.link).save()
								entries += 1
								titles.append(entry.title)
								ids.append(entry.pk)
								total_entries += 1
							else:
								self.trace("Skipping entry: %s" % entry.link)

						if entries > 0 and feed.approved:
							# If we picked "too many" entries, this might indicate a misconfigured blog that
							# stopped doing its filtering correctly.
							if entries > settings.MAX_SAFE_ENTRIES_PER_FETCH:
								self.trace("{0} new entries for {1}, >{2}, hiding".format(
									entries, feed.feedurl, settings.MAX_SAFE_ENTRIES_PER_FETCH))
								Post.objects.filter(id__in=ids).update(hidden=True)
								# Email a notification that they were picked up
								send_simple_mail(settings.EMAIL_SENDER,
												 feed.user.email,
												 "Many posts found at your blog at Planet PostgreSQL",
												 u"The blog aggregator at Planet PostgreSQL has just picked up the following\nposts from your blog at {0}:\n\n{1}\n\nSince this is a large number of posts, they have been fetched\nand marked as hidden, to avoid possible duplicates.\n\nPlease go to https://planet.postgresql.org/register/edit/{2}\nand confirm (by unhiding) which of these should be posted.\n\nThank you!\n\n".format(
													 feed.blogurl,
													 "\n".join(["* " + t for t in titles]),
													 feed.id),
												 sendername="Planet PostgreSQL",
												 receivername=u"{0} {1}".format(feed.user.first_name, feed.user.last_name),
								)
								send_simple_mail(settings.EMAIL_SENDER,
												 settings.NOTIFICATION_RECEIVER,
												 "Excessive posts from feed on Planet PostgreSQL",
												 u"The blog at {0} by {1}\nreceived {2} new posts in a single fetch.\nAs this may be incorect, the posts have been marked as hidden.\nThe author may individually mark them as visible depending on\nprevious posts, and has been sent a notification about this.".format(feed.feedurl, feed.user, len(ids)),
												 sendername="Planet PostgreSQL",
												 receivername="Planet PostgreSQL Moderators",
								)
							else:
								# Email a notification that they were picked up
								send_simple_mail(settings.EMAIL_SENDER,
												 feed.user.email,
												 "Posts found at your blog at Planet PostgreSQL",
												 u"The blog aggregator at Planet PostgreSQL has just picked up the following\nposts from your blog at {0}:\n\n{1}\n\nIf these entries are correct, you don't have to do anything.\nIf any entry should not be there, head over to\n\nhttps://planet.postgresql.org/register/edit/{2}/\n\nand click the 'Hide' button for those entries as soon\nas possible.\n\nThank you!\n\n".format(
													 feed.blogurl,
													 "\n".join(["* " + t for t in titles]),
													 feed.id),
												 sendername="Planet PostgreSQL",
												 receivername=u"{0} {1}".format(feed.user.first_name, feed.user.last_name),
								)

						if entries > 0 and not had_entries:
							# Entries showed up on a blog that was previously empty
							send_simple_mail(settings.EMAIL_SENDER,
											 settings.NOTIFICATION_RECEIVER,
											 "A blog was added to Planet PostgreSQL",
											 u"The blog at {0} by {1}\nwas added to Planet PostgreSQL, and has now received entries.\n\nTo moderate: https://planet.postgresql.org/register/moderate/\n\n".format(feed.feedurl, feed.user),
											 sendername="Planet PostgreSQL",
											 receivername="Planet PostgreSQL Moderators",
							)

						# If the blog URL changed, update it as requested
						if getattr(feed, 'new_blogurl', None):
							self.trace("URL changed for %s to %s" % (feed.feedurl, feed.new_blogurl))
							send_simple_mail(settings.EMAIL_SENDER,
											 settings.NOTIFICATION_RECEIVER,
											 "A blog url changed on Planet PostgreSQL",
											 u"When checking the blog at {0} by {1}\nthe blog URL was updated to:\n{2}\n(from previous value {3})\n\nTo moderate: https://planet.postgresql.org/register/moderate/\n\n".format(feed.feedurl, feed.user, feed.new_blogurl, feed.blogurl),
											 sendername="Planet PostgreSQL",
											 receivername="Planet PostgreSQL Moderators",
							)
							send_simple_mail(settings.EMAIL_SENDER,
											 feed.user.email,
											 "URL of your blog at Planet PostgreSQL updated",
											 u"The blog aggregator at Planet PostgreSQL has update the URL of your blog\nwith the feed at {0} to:\n{1} (from {2})\nIf this is correct, you don't have to do anything.\nIf not, please contact [email protected]\n".format(
												 feed.feedurl,
												 feed.new_blogurl,
												 feed.blogurl,
											 ),
											 sendername="Planet PostgreSQL",
											 receivername=u"{0} {1}".format(feed.user.first_name, feed.user.last_name),
											 )
							feed.blogurl = feed.new_blogurl
							feed.save()
				if self.debug:
					# Roll back transaction without error
					raise BreakoutException()
		except BreakoutException:
			self.stderr.write("Rolling back all changes")
			pass

		if total_entries > 0 and not self.debug:
			purge_root_and_feeds()
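The loop above unpacks each item of pr.get() as (feed, results), where results is either a ParserGotRedirect, some other Exception, or an iterable of entry objects. The _fetch_one_feed worker itself is not shown; a hedged sketch of what it presumably does (the parse() method and feed attribute on FeedFetcher are assumptions, not the project's actual API):

def _fetch_one_feed(self, fetcher):
    # Hypothetical sketch: run one FeedFetcher inside the thread pool and always
    # return a (feed, outcome) tuple so the caller can sort out errors afterwards.
    try:
        entries = fetcher.parse()            # assumed FeedFetcher API
        return fetcher.feed, entries
    except ParserGotRedirect as redirect:
        return fetcher.feed, redirect        # handled as the http->https special case
    except Exception as e:
        return fetcher.feed, e               # logged as a failed fetch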
Example #4
    def handle(self, *args, **options):
        self.verbose = options['verbosity'] > 1
        self.debug = options['debug']
        if self.debug:
            self.verbose = True
        self.full = options['full']

        if options['id']:
            feeds = Blog.objects.filter(pk=options['id'])
        else:
            # Fetch all feeds that are not archived. We do fetch feeds that are not
            # yet approved, to make sure they work.
            feeds = Blog.objects.filter(archived=False)

        # Fan out the fetching itself
        fetchers = [FeedFetcher(f, self.trace) for f in feeds]
        num = len(fetchers)
        pool = ThreadPool(options['parallelism'])
        pr = pool.map_async(self._fetch_one_feed, fetchers)
        while not pr.ready():
            gevent.sleep(1)
            self.trace("Fetching feeds (%s/%s done), please wait..." %
                       (num - pool.task_queue.unfinished_tasks, num))

        total_entries = 0
        # Fetching was async, but results processing will be sync. Don't want to deal with
        # multithreaded database connections and such complications.
        try:
            with transaction.atomic():
                for feed, results in pr.get():
                    if isinstance(results, Exception):
                        AggregatorLog(feed=feed, success=False,
                                      info=results).save()
                    else:
                        if feed.approved:
                            had_entries = True
                        else:
                            had_entries = feed.has_entries
                        entries = 0
                        titles = []

                        for entry in results:
                            self.trace("Found entry at %s" % entry.link)
                            # Entry is a post, but we need to check if it's already there. Check
                            # is done on guid.
                            if not Post.objects.filter(
                                    feed=feed, guid=entry.guid).exists():
                                self.trace("Saving entry at %s" % entry.link)
                                entry.save()
                                entry.update_shortlink()
                                AggregatorLog(feed=feed,
                                              success=True,
                                              info="Fetched entry at '%s'" %
                                              entry.link).save()
                                entries += 1
                                titles.append(entry.title)
                                total_entries += 1

                        if entries > 0 and feed.approved:
                            # Email a notification that they were picked up
                            send_simple_mail(
                                settings.EMAIL_SENDER,
                                feed.user.email,
                                "Posts found at your blog at Planet PostgreSQL",
                                u"The blog aggregator at Planet PostgreSQL has just picked up the "
                                u"following\nposts from your blog at {0}:\n\n{1}\n\nIf these entries are "
                                u"correct, you don't have to do anything.\nIf any entry should not be "
                                u"there, head over to\n\nhttps://planet.postgresql.org/register/edit/"
                                u"{2}/\n\nand click the 'Hide' button for those entries as soon\nas "
                                u"possible.\n\nThank you!\n\n".format(
                                    feed.blogurl,
                                    "\n".join(["* " + t for t in titles]),
                                    feed.id),
                                sendername="Planet PostgreSQL",
                                receivername=u"{0} {1}".format(
                                    feed.user.first_name, feed.user.last_name),
                            )

                        if entries > 0 and not had_entries:
                            # Entries showed up on a blog that was previously empty
                            send_simple_mail(
                                settings.EMAIL_SENDER,
                                settings.NOTIFICATION_RECEIVER,
                                "A blog was added to Planet PostgreSQL",
                                u"The blog at {0} by {1}\nwas added to Planet PostgreSQL, and has now "
                                u"received entries.\n\nTo moderate: "
                                u"https://planet.postgresql.org/register/moderate/\n\n"
                                .format(feed.feedurl, feed.user),
                                sendername="Planet PostgreSQL",
                                receivername="Planet PostgreSQL Moderators",
                            )

                        # If the blog URL changed, update it as requested
                        if getattr(feed, 'new_blogurl', None):
                            print("URL changed for %s to %s" %
                                  (feed.feedurl, feed.new_blogurl))
                            send_simple_mail(
                                settings.EMAIL_SENDER,
                                settings.NOTIFICATION_RECEIVER,
                                "A blog url changed on Planet PostgreSQL",
                                u"When checking the blog at {0} by {1}\nthe blog URL was updated to:\n"
                                u"{2}\n(from previous value {3})\n\nTo moderate: "
                                u"https://planet.postgresql.org/register/moderate/\n\n"
                                .format(feed.feedurl, feed.user,
                                        feed.new_blogurl, feed.blogurl),
                                sendername="Planet PostgreSQL",
                                receivername="Planet PostgreSQL Moderators",
                            )
                            send_simple_mail(
                                settings.EMAIL_SENDER,
                                feed.user.email,
                                "URL of your blog at Planet PostgreSQL updated",
                                u"The blog aggregator at Planet PostgreSQL has update the URL of your "
                                u"blog\nwith the feed at {0} to:\n{1} (from {2})\nIf this is correct, "
                                u"you don't have to do anything.\nIf not, please contact "
                                u"[email protected]\n".format(
                                    feed.feedurl,
                                    feed.new_blogurl,
                                    feed.blogurl,
                                ),
                                sendername="Planet PostgreSQL",
                                receivername=u"{0} {1}".format(
                                    feed.user.first_name, feed.user.last_name),
                            )
                            feed.blogurl = feed.new_blogurl
                            feed.save()
                if self.debug:
                    # Roll back transaction without error
                    raise BreakoutException()
        except BreakoutException:
            self.stderr.write("Rolling back all changes")
            pass

        if total_entries > 0 and not self.debug:
            purge_root_and_feeds()
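The debug path above deliberately raises BreakoutException inside transaction.atomic() so that Django rolls back every write made during the run, then swallows the exception outside the block. The pattern in isolation, as a minimal sketch (the do_work callable is illustrative):

from django.db import transaction

class BreakoutException(Exception):
    """Raised only to force transaction.atomic() to roll back."""

def run(do_work, dry_run=False):
    try:
        with transaction.atomic():
            do_work()                     # all database writes happen here
            if dry_run:
                # Any exception leaving the atomic() block rolls the writes back.
                raise BreakoutException()
    except BreakoutException:
        print("Rolling back all changes")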
Example #5
class WebPage(Parser, _ElementFactory):
    """Provides the apis for invoking parse and save functionality.

    usage::

        >>> from pywebcopy import WebPage, config
        >>> url = 'http://some-url.com/some-page.html'

        # You should always start by setting up the config or use the apis
        >>> config.setup_config(url, project_folder, project_name, **kwargs)

        # Create an instance of the WebPage object
        >>> wp = WebPage()

        # If you want to use `requests` to fetch the page then
        >>> wp.get(url)

        # Otherwise, if you want to use plain html or urllib, then use
        >>> wp.set_source(object_which_have_a_read_method, encoding=encoding)
        >>> wp.url = url   # you need to do this if you are using set_source()

        # Then you can access several methods like
        >>> wp.save_complete()
        >>> wp.save_html()
        >>> wp.save_assets()

    """
    __slots__ = 'url', 'path', '_threads', '_utx'

    def __init__(self, **kwargs):
        super(WebPage, self).__init__()

        if not config.is_set():
            import warnings
            warnings.warn(
                UserWarning(
                    "Global Configuration is not setup. You can ignore this if you are going manual."
                    "This is just one time warning regarding some unexpected behavior."
                )
            )

        # Some scripts might use apis specific to a previous version
        # which are not supported any more; these arguments will be
        # removed entirely in a later version.
        if kwargs.pop('url', None):
            raise DeprecationWarning(
                "Direct initialisation with url is not supported now. Please use"
                "the get() or set_source() methods for page fetching."
                "And use the config.setup_config() method to setup the kwargs."
                "Arguments will be completely removed in later versions."
            )

        self.url = config.get('project_url', None)
        self.path = config.get('project_folder', None)
        self._utx = None
        self._element_map = {
            'link': LinkTag,
            'style': LinkTag,
            'script': ScriptTag,
            'img': ImgTag,
            'a': AnchorTag,
            'form': AnchorTag,
            'default': TagBase,
        }
        self._threads = []

    def __repr__(self):
        return '<WebPage: [%s]>' % self.url

    # @property
    # def url(self):
    #     return self.utx.url
    #
    # @url.setter
    # def url(self, new_url):
    #     self.utx.url = new_url

    @property
    def utx(self):
        """Returns an URLTransformer() object made from the self.url string.

        :rtype: URLTransformer
        :returns: prepared URLTransformer object
        """
        if self._utx is None:
            self._utx = self._new_utx()
        return self._utx

    @utx.setter
    def utx(self, o):
        assert isinstance(o, URLTransformer), TypeError
        assert hasattr(o, 'url'), AttributeError
        self._utx = o
        self.url = getattr(o, 'url')
        self.path = getattr(o, 'to_path')

    file_path = property(attrgetter('utx.file_path'), doc="Path at which this object would be written.")
    project_path = property(attrgetter('utx.base_path'), doc="Base path of the project this object belongs to.")
    file_name = property(attrgetter('utx.file_name'), doc="Name to be used as the file node.")
    element_map = property(attrgetter('_element_map'), doc="Registry of different handler for different tags.")

    def _get_utx(self):
        return self.utx

    def _new_utx(self):
        assert self.url is not None, "Url not setup."
        assert self.path is not None, "Folder path not setup."

        o = URLTransformer(
            url=self.url,
            base_url=self.url,
            base_path=self.path,
            default_fn='index.html'
        )
        o.default_suffix = 'html'
        o.enforce_suffix = True
        return o

    def _make_element(self, tag):
        # print(self._element_map)
        elem = self._element_map.get(tag)
        if not elem:
            elem = self._element_map.get('default')
        LOGGER.debug('Element: <%r> selected for the tag: <%r>' % (elem, tag))
        return elem

    def get(self, url, **params):
        """Fetches the Html content from Internet using the requests.
        You can any requests params which will be passed to the library
        itself.
        The requests arguments you supply will also be applied to the
        global session meaning all the files will be downloaded using these
        settings.

        If you want to use some manual page fetch, like using urllib, then
        you should use the set_source() method which would do the right job.

        usage::

            >>> wp = WebPage()
            >>> wp.get(url, proxies=proxies, headers=headers, auth=auth, ...)
            >>> wp.save_complete()

        :param url: url of the page to fetch
        :param params: keyword arguments which `requests` module may accept.
        """
        req = SESSION.get(url, **params)
        req.raise_for_status()
        # Set some information about the content being loaded so
        # that the parser has a better idea about what it is parsing
        self.url = req.url
        # The internal parser assumes a read() method to be
        # present on the source, thus we need to pass the raw stream
        # io object which serves the purpose
        req.raw.decode_content = True
        self.set_source(req.raw, req.encoding)

    def shutdown(self, timeout=10):
        """
        Shuts down the working threads.
        :param timeout: timeout per thread before raising error.
        :return: None
        *New in 6.1.0*
        """
        for i in self._threads:
            if i.is_alive():
                i.join(timeout=timeout)
        del self._threads[:]

    def save_assets(self):
        """Save only the linked files to the disk.
        """
        if not self.elements:
            LOGGER.error("No elements found for downloading!")
            return

        LOGGER.info("Starting save_assets Action on url: {!r}".format(self.utx.url))

        elms = list(self.elements)

        # LOGGER.log(100, "Queueing download of <%d> asset files." % len(elms))

        self._tp = ThreadPool(maxsize=1000)
        self._tp.map_async(lambda e: e.run(), elms)

        # for elem in elms:
        #     elem.run()
        #     # don't use multi-threading
        #     #with POOL_LIMIT:
        #     #    t = threading.Thread(name=repr(elem), target=elem.run)
        #     #    t.start()
        #     #    self._threads.append(t)

    def save_html(self, file_name=None, raw_html=False):
        """Saves the html of the page to a default or specified file.
        :type file_name: str
        :type raw_html: bool
        :param file_name: path of the file to write the contents to
        :param raw_html: whether to write the unmodified html or the rewritten html
        """

        LOGGER.info("Starting save_html Action on url: {!r}".format(self.utx.url))

        if not file_name:
            file_name = self.utx.file_path

        # Create directories if necessary
        if not os.path.exists(os.path.dirname(file_name)):
            try:
                os.makedirs(os.path.dirname(file_name))
            except FileExistsError:
                pass

        if raw_html:
            src = getattr(self, '_source')
            if not src or not hasattr(src, 'read'):
                raise Exception(
                    "Parser source is not set yet!"
                    "Try .get() method or .set_source() method to feed the parser!"
                )
            with open(file_name, 'wb') as fh:
                fh.write(self.get_source().read())
        else:
            if not getattr(self, '_parseComplete'):
                self.parse()
            root = self.root
            if not hasattr(root, 'getroottree'):
                raise ParseError("Tree is not being generated by parser!")

            root.getroottree().write(file_name, method="html")
            LOGGER.info("WebPage saved successfully to %s" % file_name)

    def save_complete(self):
        """Saves the complete html+assets on page to a file and
        also writes its linked files to the disk.

        Implements the combined logic of save_assets and save_html in
        compact form with checks and validation.
        """

        assert self.url is not None, "Url is not setup."
        assert getattr(self, '_source') is not None, "Source is not setup."

        LOGGER.info("Starting save_complete Action on url: [%r]" % self.url)

        if not self._parseComplete:
            self.parse()  # call in the action

        self.save_assets()
        self.save_html(self.utx.file_path, raw_html=False)
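Note that save_assets() hands the downloads to a gevent ThreadPool via map_async() and returns without waiting, so save_complete() can finish before every asset is on disk. One way a caller could wait, sketched here as a hypothetical variant (not part of pywebcopy's API) that keeps the greenlet returned by map_async() and joins it:

    # Hypothetical variant of save_assets() that keeps the map_async result
    # so callers can wait for the queued downloads to finish.
    def save_assets_and_wait(self, timeout=None):
        elms = list(self.elements)
        if not elms:
            return
        self._tp = ThreadPool(maxsize=1000)
        result = self._tp.map_async(lambda e: e.run(), elms)
        result.join(timeout=timeout)   # block until every element has run

Calling result.get() instead of join() would also re-raise any exception raised inside an element's run().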
Example #6
#!/usr/bin/python
# -*- coding: utf8 -*-

import logging
import gevent
from gevent.threadpool import ThreadPool

logging.basicConfig(level=logging.INFO, format="%(message)s")


def callback_(args):
    for arg in args:
        logging.info(arg.strip())


def foo(args):
    return args


if __name__ == "__main__":
    tp = ThreadPool(20)
    f = open('/tmp/test.txt')
    greenlets = [tp.map_async(foo, f, callback=callback_)]
    gevent.joinall(greenlets)
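map_async() returns a result object that can be waited on directly, so the callback is optional and the results can also be collected with get(). A variant of the same script (same hypothetical /tmp/test.txt input) that keeps the file open until the pool has consumed it and logs the results from the main greenlet:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import logging
from gevent.threadpool import ThreadPool

logging.basicConfig(level=logging.INFO, format="%(message)s")


def strip_line(line):
    return line.strip()


if __name__ == "__main__":
    tp = ThreadPool(20)
    with open('/tmp/test.txt') as f:
        result = tp.map_async(strip_line, f)
        # get() blocks until every line has been processed by the pool.
        for line in result.get():
            logging.info(line)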