Example #1
    @gen.coroutine
    def can_fetch(self, user_agent, url):
        """Check the cached robots.txt rules for ``url`` before fetching it."""
        parsed = urlsplit(url)
        domain = parsed.netloc
        # Reuse the cached parser unless it is older than self.robot_txt_age.
        if domain in self.robots_txt_cache:
            robot_txt = self.robots_txt_cache[domain]
            if time.time() - robot_txt.mtime() > self.robot_txt_age:
                robot_txt = None
        else:
            robot_txt = None

        if robot_txt is None:
            # Cache miss or stale entry: download and parse robots.txt again.
            robot_txt = RobotFileParser()
            try:
                response = yield gen.maybe_future(
                    self.http_client.fetch(urljoin(url, '/robots.txt'),
                                           connect_timeout=10,
                                           request_timeout=30))
                content = response.body
            except tornado.httpclient.HTTPError as e:
                logger.error('load robots.txt from %s error: %r', domain, e)
                content = b''

            try:
                content = content.decode('utf8', 'ignore')
            except UnicodeDecodeError:
                content = ''

            robot_txt.parse(content.splitlines())
            self.robots_txt_cache[domain] = robot_txt

        raise gen.Return(robot_txt.can_fetch(user_agent, url))
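The coroutine above is meant to be driven by Tornado's IOLoop. A minimal sketch of calling it is shown below; here `fetcher` stands for an instance of the surrounding fetcher class, and the user-agent string is made up for illustration.

# Hedged usage sketch: "fetcher" is assumed to be an instance of the class
# that defines can_fetch() above; it is not part of the original example.
from tornado import gen, ioloop

@gen.coroutine
def crawl_if_allowed(fetcher, url, user_agent='mybot/1.0'):
    allowed = yield fetcher.can_fetch(user_agent, url)
    if not allowed:
        raise gen.Return(None)  # blocked by robots.txt
    response = yield gen.maybe_future(fetcher.http_client.fetch(url))
    raise gen.Return(response)

# ioloop.IOLoop.current().run_sync(
#     lambda: crawl_if_allowed(fetcher, 'http://example.com/page'))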
Example #2
    def download_title(self):
        """Downloads the title of the bookmark.

        This may fail silently. If it does, it sets the title to "Unknown Title".
        """
        self.title = "Unknown Title"

        try:
            # Ask the site's robots.txt whether our crawler may fetch this page.
            url = urlparse(self.url)
            robots = "{}://{}/robots.txt".format(url.scheme, url.netloc)

            rfp = RobotFileParser(robots)
            rfp.read()

            if rfp.can_fetch("BMAT", self.url):
                r = requests.get(self.url,
                                 timeout=3.0,
                                 headers={"User-Agent": "BMAT"})
                r.raise_for_status()

                # Pull the <title> text out of the downloaded HTML.
                p = HTMLTitleReader()
                p.feed(r.text)

                self.title = p.title
                self.save()

        except Exception:
            # Swallow network and parsing errors; the default title is kept.
            return
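The HTMLTitleReader class used above is a project-specific helper that is not shown here. Below is a hedged sketch of what such a parser could look like when built on the standard library's html.parser; only the class name and its title attribute are taken from the example, the rest is an assumption.

# Sketch of an HTMLTitleReader-like parser; not the original implementation.
from html.parser import HTMLParser

class HTMLTitleReader(HTMLParser):
    def __init__(self):
        super().__init__()
        self.title = ""          # collected <title> text
        self._in_title = False   # True while inside <title>...</title>

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data

# Usage sketch:
# p = HTMLTitleReader()
# p.feed("<html><head><title>Hello</title></head></html>")
# p.title  # -> "Hello"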
Example #3
    def load_rules_from_url(self, robots_url, timeout=None):
        """Manually load the robots.txt file from the server.

        :param robots_url: URL of the robots.txt file to load.
        :param timeout: requests timeout.
        :return: the parser holding the loaded rules; if the request fails,
            a parser with permissive or restrictive defaults is returned.
        """
        _parser = RobotFileParser()
        try:
            req = requests.Request(method='GET',
                                   url=robots_url,
                                   headers=self.headers,
                                   auth=self.auth,
                                   cookies=self.cookies,
                                   hooks=self.hooks)
            prep = req.prepare()
            send_kwargs = {
                'stream': False,
                'timeout': timeout,
                'verify': self.verify,
                'cert': self.cert,
                'proxies': self.proxies,
                'allow_redirects': True,
            }
            f = super(Session, self).send(prep, **send_kwargs)
            f.raise_for_status()
            self.cookies.update(f.cookies)
        except requests.exceptions.HTTPError as err:
            # Mirror urllib.robotparser.read(): 401/403 forbid everything,
            # any other 4xx means there is no robots.txt, so allow everything.
            code = err.response.status_code
            if code in (401, 403):
                _parser.disallow_all = True
            elif 400 <= code < 500:
                _parser.allow_all = True
        except requests.exceptions.ConnectionError:
            _parser.allow_all = True
        else:
            _parser.parse(f.text.splitlines())
        self.robots_registry[robots_url] = _parser
        #: Initiate a start time for crawl delays
        _parser.modified()
        return _parser
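A parser cached by load_rules_from_url() can then be consulted before issuing ordinary requests. The helper below is a hedged sketch of that lookup; the session argument and the robots_registry attribute come from the snippet above, while can_fetch() itself and its defaults are assumptions, not the library's documented API.

# Hedged sketch: consult the cached rules before fetching a page.
from urllib.parse import urljoin

def can_fetch(session, url, user_agent='*', timeout=None):
    robots_url = urljoin(url, '/robots.txt')
    parser = session.robots_registry.get(robots_url)
    if parser is None:
        # First visit to this host: download and cache its robots.txt.
        parser = session.load_rules_from_url(robots_url, timeout=timeout)
    return parser.can_fetch(user_agent, url)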
Example #4
    def can_fetch(self, url, user_agent=None):
        """Check ``url`` against the parsed rules, defaulting to self.user_agent."""
        return RobotFileParser.can_fetch(self, user_agent or self.user_agent, url)
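This override swaps the base class's argument order and falls back to a per-instance default agent. A minimal sketch of a subclass it could belong to follows; the class name and constructor signature are assumptions, not part of the original example.

# Hedged sketch of a RobotFileParser subclass with a default user agent.
from urllib.robotparser import RobotFileParser

class DefaultAgentRobotFileParser(RobotFileParser):
    def __init__(self, url='', user_agent='*'):
        RobotFileParser.__init__(self, url)
        self.user_agent = user_agent  # agent used when can_fetch() gets none

    def can_fetch(self, url, user_agent=None):
        return RobotFileParser.can_fetch(self, user_agent or self.user_agent, url)

# Usage sketch:
# rp = DefaultAgentRobotFileParser('https://example.com/robots.txt', 'mybot')
# rp.read()
# rp.can_fetch('https://example.com/private/')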