# Imports this method needs (Python 3; the original may route these
# through six.moves equivalents):
import time
import logging

import tornado.httpclient
from tornado import gen
from urllib.parse import urlsplit, urljoin
from urllib.robotparser import RobotFileParser

logger = logging.getLogger(__name__)


@gen.coroutine  # required for the yield / gen.Return style used below
def can_fetch(self, user_agent, url):
    """Check robots.txt for `url`, caching one parser per domain."""
    parsed = urlsplit(url)
    domain = parsed.netloc
    if domain in self.robots_txt_cache:
        robot_txt = self.robots_txt_cache[domain]
        # Expire cached rules older than self.robot_txt_age seconds.
        if time.time() - robot_txt.mtime() > self.robot_txt_age:
            robot_txt = None
    else:
        robot_txt = None

    if robot_txt is None:
        robot_txt = RobotFileParser()
        try:
            response = yield gen.maybe_future(self.http_client.fetch(
                urljoin(url, '/robots.txt'),
                connect_timeout=10, request_timeout=30))
            content = response.body
        except tornado.httpclient.HTTPError as e:
            logger.error('load robots.txt from %s error: %r', domain, e)
            content = b''  # bytes, so the decode below still works

        try:
            content = content.decode('utf8', 'ignore')
        except UnicodeDecodeError:
            content = ''

        # parse() calls modified() internally, which stamps the mtime()
        # used by the cache-age check above.
        robot_txt.parse(content.splitlines())
        self.robots_txt_cache[domain] = robot_txt

    raise gen.Return(robot_txt.can_fetch(user_agent, url))
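# A hedged usage sketch for the coroutine above: how another Tornado
# coroutine might await it. `fetcher` stands in for the object that owns
# can_fetch, robots_txt_cache, and robot_txt_age; the name `check_url`
# and the 'my-bot' user agent are illustrative assumptions only.
import logging
from tornado import gen

logger = logging.getLogger(__name__)


@gen.coroutine
def check_url(fetcher, url):
    # can_fetch is a coroutine, so its result must be yielded.
    allowed = yield fetcher.can_fetch('my-bot', url)
    if not allowed:
        logger.warning('%s is disallowed by robots.txt', url)
    raise gen.Return(allowed)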
# Imports this method relies on:
import requests
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser


def download_title(self):
    """Download the title of the bookmark.

    This may fail silently. If it does, the title is left as
    "Unknown Title".
    """
    self.title = "Unknown Title"
    try:
        # Honour robots.txt before fetching the page itself.
        url = urlparse(self.url)
        robots = "{}://{}/robots.txt".format(url.scheme, url.netloc)
        rfp = RobotFileParser(robots)
        rfp.read()
        if rfp.can_fetch("BMAT", self.url):
            r = requests.get(self.url, timeout=3.0,
                             headers={"User-Agent": "BMAT"})
            r.raise_for_status()
            p = HTMLTitleReader()
            p.feed(r.text)
            self.title = p.title
            self.save()
    except Exception:
        # Deliberately silent: any network or parse error leaves the
        # default title in place. (Catching Exception rather than using
        # a bare except avoids swallowing KeyboardInterrupt/SystemExit.)
        return
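# HTMLTitleReader is used above but not defined in this snippet. A
# minimal sketch of what such a helper might look like, built on the
# standard-library html.parser; the class name and the `title`
# attribute match the usage above, the rest is an assumption.
from html.parser import HTMLParser


class HTMLTitleReader(HTMLParser):
    def __init__(self):
        super().__init__()
        self.title = ""
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data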
def load_rules_from_url(self, robots_url, timeout=None):
    """Manually load the robots.txt file from the server.

    :param robots_url: URL of the robots.txt file to load.
    :param timeout: requests timeout.
    :return: the loaded parser. On HTTP 401/403 the parser has
        ``disallow_all`` set; on other 4xx errors or connection
        failures it has ``allow_all`` set.
    """
    _parser = RobotFileParser()
    try:
        req = requests.Request(method='GET', url=robots_url,
                               headers=self.headers, auth=self.auth,
                               cookies=self.cookies, hooks=self.hooks)
        prep = req.prepare()
        send_kwargs = {
            'stream': False,
            'timeout': timeout,
            'verify': self.verify,
            'cert': self.cert,
            'proxies': self.proxies,
            'allow_redirects': True,
        }
        f = super(Session, self).send(prep, **send_kwargs)
        f.raise_for_status()
        self.cookies.update(f.cookies)
    except requests.exceptions.HTTPError as err:
        code = err.response.status_code
        if code in (401, 403):
            # Unauthorized/forbidden robots.txt: treat everything as off-limits.
            _parser.disallow_all = True
        elif 400 <= code < 500:
            # Other 4xx (e.g. 404): no robots.txt, so everything is allowed.
            _parser.allow_all = True
    except requests.exceptions.ConnectionError:
        _parser.allow_all = True
    else:
        _parser.parse(f.text.splitlines())
    self.robots_registry[robots_url] = _parser
    #: Initiate a start time for delays
    _parser.modified()
    return _parser
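# The trailing `_parser.modified()` above stamps a start time for delays;
# a minimal sketch, under that assumption, of how mtime() and
# crawl_delay() could gate the next request. wait_for_crawl_delay and
# USER_AGENT are illustrative names, not part of the code above.
import time

USER_AGENT = 'example-bot'


def wait_for_crawl_delay(parser):
    delay = parser.crawl_delay(USER_AGENT)
    if delay is None:
        return
    elapsed = time.time() - parser.mtime()
    if elapsed < delay:
        time.sleep(delay - elapsed)
    parser.modified()  # restart the clock for the request that follows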
def can_fetch(self, url, user_agent=None):
    # Fall back to this parser's default user agent when none is given.
    return RobotFileParser.can_fetch(self, user_agent or self.user_agent, url)
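# A minimal sketch of the subclass the method above would belong to: a
# RobotFileParser that remembers a default user agent so callers may
# omit it. The class name UserAgentRobotFileParser and the example URL
# are assumptions for illustration.
from urllib.robotparser import RobotFileParser


class UserAgentRobotFileParser(RobotFileParser):
    def __init__(self, url='', user_agent='*'):
        RobotFileParser.__init__(self, url)
        self.user_agent = user_agent

    def can_fetch(self, url, user_agent=None):
        return RobotFileParser.can_fetch(self, user_agent or self.user_agent, url)


rp = UserAgentRobotFileParser('https://example.com/robots.txt', 'example-bot')
rp.read()
print(rp.can_fetch('https://example.com/page'))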