Example #1
import re  # used below to compile the per-site exclude patterns


class GenericSiteCrawler(BaseSiteCrawler):
    def __init__(self, hostname, dbconn, siteid, https=False):
        super(GenericSiteCrawler, self).__init__(hostname,
                                                 dbconn,
                                                 siteid,
                                                 https=https)

    def init_crawl(self):
        # Load robots.txt
        self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

        # We need to seed the crawler with every URL we've already seen, since
        # we don't recrawl the contents if they haven't changed.
        allpages = self.scantimes.keys()

        # Figure out if there are any excludes to deal with (beyond the
        # robots.txt ones)
        curs = self.dbconn.cursor()
        curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s",
                     {
                         'site': self.siteid,
                     })
        self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

        # We *always* crawl the root page, of course
        self.queue.put(("/", 0.5, False))

        # Now do all the other pages
        for x in allpages:
            self.queue.put((x, 0.5, False))

    def exclude_url(self, url):
        if self.robots and self.robots.block_url(url):
            return True
        for r in self.extra_excludes:
            if r.search(url):
                return True
        return False

    def queue_url(self, url):
        self.queue.put((url.strip(), 0.5, False))

    def post_process_page(self, url):
        for l in self.resolve_links(self.page.links, url):
            if l in self.pages_crawled or l + "/" in self.pages_crawled:
                continue
            if self.exclude_url(l):
                continue
            self.queue_url(l)
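
A minimal sketch of how this class might be driven, assuming a psycopg2 connection (the %(site)s parameter style in init_crawl suggests psycopg2) and a crawl() entry point inherited from BaseSiteCrawler; the DSN, the site id and the crawl() name do not appear in the example, so all of them are assumptions:

import psycopg2

def crawl_one_site(dsn, hostname, siteid, https=True):
    # Hypothetical driver; the DSN, site id and the crawl() entry point
    # are assumptions, not part of the example above.
    dbconn = psycopg2.connect(dsn)
    try:
        crawler = GenericSiteCrawler(hostname, dbconn, siteid, https=https)
        crawler.crawl()  # assumed to be provided by BaseSiteCrawler
        dbconn.commit()
    finally:
        dbconn.close()

crawl_one_site("dbname=search", "www.example.org", 1)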
Example #2
    def init_crawl(self):
        # Load robots.txt
        self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

        # We need to seed the crawler with every URL we've already seen, since
        # we don't recrawl the contents if they haven't changed.
        allpages = self.scantimes.keys()

        # Figure out if there are any excludes to deal with (beyond the
        # robots.txt ones)
        curs = self.dbconn.cursor()
        curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s",
                     {
                         'site': self.siteid,
                     })
        self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

        # We *always* crawl the root page, of course
        self.queue.put(("/", 0.5, False))

        # Now do all the other pages
        for x in allpages:
            self.queue.put((x, 0.5, False))
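
The only interface these examples require from RobotsParser is a constructor that takes the robots.txt URL and a block_url() method returning True for disallowed paths. Below is a minimal stand-in under those assumptions, built on the standard library's urllib.robotparser; the real class is not shown on this page:

from urllib import robotparser

class RobotsParser(object):
    # Stand-in sketch; only the constructor signature and block_url()
    # are taken from the examples above, everything else is assumed.
    def __init__(self, url):
        self._parser = robotparser.RobotFileParser()
        self._parser.set_url(url)
        try:
            self._parser.read()
            self._loaded = True
        except Exception:
            # If robots.txt cannot be fetched, block nothing
            self._loaded = False

    def block_url(self, url):
        if not self._loaded:
            return False
        return not self._parser.can_fetch("*", url)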
Example #3
import re  # used below to compile the per-site exclude patterns


class GenericSiteCrawler(BaseSiteCrawler):
	def __init__(self, hostname, dbconn, siteid):
		super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid)

	def init_crawl(self):
		# Load robots.txt
		self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

		# We need to seed the crawler with every URL we've already seen, since
		# we don't recrawl the contents if they haven't changed.
		allpages = self.scantimes.keys()

		# Figure out if there are any excludes to deal with (beyond the
		# robots.txt ones)
		curs = self.dbconn.cursor()
		curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s", {
				'site': self.siteid,
				})
		self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

		# We *always* crawl the root page, of course
		self.queue.put(("/", 0.5))

		# Now do all the other pages
		for x in allpages:
			self.queue.put((x, 0.5))

	def exclude_url(self, url):
		if self.robots and self.robots.block_url(url):
			return True
		for r in self.extra_excludes:
			if r.search(url):
				return True
		return False

	def queue_url(self, url):
		self.queue.put((url.strip(), 0.5))

	def post_process_page(self, url):
		for l in self.resolve_links(self.page.links, url):
			if l in self.pages_crawled or l + "/" in self.pages_crawled:
				continue
			if self.exclude_url(l):
				continue
			self.queue_url(l)
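
The exclude query in init_crawl implies a small site_excludes table keyed by site id. A sketch of that schema and of registering an exclude pattern follows, assuming PostgreSQL with psycopg2; only the site and suburlre columns are actually referenced by the examples, so the column types, the DSN and the sample regex are assumptions:

import psycopg2

def add_exclude(dbconn, siteid, pattern):
    # Store a regex that exclude_url() will later apply to candidate URLs
    curs = dbconn.cursor()
    curs.execute("INSERT INTO site_excludes (site, suburlre) VALUES (%(site)s, %(re)s)",
                 {'site': siteid, 're': pattern})

dbconn = psycopg2.connect("dbname=search")
curs = dbconn.cursor()
curs.execute("""
    CREATE TABLE IF NOT EXISTS site_excludes (
        site     integer NOT NULL,
        suburlre text    NOT NULL
    )
""")
add_exclude(dbconn, 1, r"^/docs/7\.")  # arbitrary example pattern
dbconn.commit()
dbconn.close()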
Example #4
	def init_crawl(self):
		# Load robots.txt
		self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

		# We need to seed the crawler with every URL we've already seen, since
		# we don't recrawl the contents if they haven't changed.
		allpages = self.scantimes.keys()

		# Figure out if there are any excludes to deal with (beyond the
		# robots.txt ones)
		curs = self.dbconn.cursor()
		curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s", {
				'site': self.siteid,
				})
		self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

		# We *always* crawl the root page, of course
		self.queue.put(("/", 0.5))

		# Now do all the other pages
		for x in allpages:
			self.queue.put((x, 0.5))