Example #1
    @gen.coroutine
    def can_fetch(self, user_agent, url):
        parsed = urlsplit(url)
        domain = parsed.netloc
        # Reuse the cached parser for this domain unless it has gone stale.
        if domain in self.robots_txt_cache:
            robot_txt = self.robots_txt_cache[domain]
            if time.time() - robot_txt.mtime() > self.robot_txt_age:
                robot_txt = None
                robot_txt = None
        else:
            robot_txt = None

        if robot_txt is None:
            # No usable cache entry: fetch this domain's robots.txt and parse it.
            robot_txt = RobotFileParser()
            try:
                response = yield gen.maybe_future(self.http_client.fetch(
                    urljoin(url, '/robots.txt'), connect_timeout=10, request_timeout=30))
                content = response.body
            except tornado.httpclient.HTTPError as e:
                logger.error('load robots.txt from %s error: %r', domain, e)
                content = b''  # bytes, so the decode below also works on this path

            try:
                content = content.decode('utf8', 'ignore')
            except UnicodeDecodeError:
                content = ''

            # parse() also updates mtime(), which drives the staleness check above.
            robot_txt.parse(content.splitlines())
            self.robots_txt_cache[domain] = robot_txt

        raise gen.Return(robot_txt.can_fetch(user_agent, url))
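
A minimal, self-contained sketch of how this coroutine might be driven. The host class, its attribute names (http_client, robots_txt_cache, robot_txt_age), and the one-hour staleness window are assumptions inferred from the snippet, not part of the source:

import time

import tornado.httpclient
from tornado import gen, ioloop
from urllib.parse import urljoin, urlsplit
from urllib.robotparser import RobotFileParser


class RobotsChecker(object):
    # Hypothetical host class; the attribute names mirror the snippet above.
    robot_txt_age = 60 * 60  # seconds before a cached robots.txt counts as stale

    def __init__(self):
        self.http_client = tornado.httpclient.AsyncHTTPClient()
        self.robots_txt_cache = {}  # domain -> RobotFileParser

    @gen.coroutine
    def can_fetch(self, user_agent, url):
        # Same caching scheme as Example #1, condensed.
        domain = urlsplit(url).netloc
        robot_txt = self.robots_txt_cache.get(domain)
        if robot_txt is None or time.time() - robot_txt.mtime() > self.robot_txt_age:
            robot_txt = RobotFileParser()
            try:
                response = yield self.http_client.fetch(
                    urljoin(url, '/robots.txt'),
                    connect_timeout=10, request_timeout=30)
                content = response.body.decode('utf8', 'ignore')
            except tornado.httpclient.HTTPError:
                content = ''
            robot_txt.parse(content.splitlines())
            self.robots_txt_cache[domain] = robot_txt
        raise gen.Return(robot_txt.can_fetch(user_agent, url))


if __name__ == '__main__':
    checker = RobotsChecker()
    allowed = ioloop.IOLoop.current().run_sync(
        lambda: checker.can_fetch('MyBot', 'https://example.com/'))
    print(allowed)

Keeping one RobotFileParser per domain avoids re-fetching robots.txt for every URL, while the mtime() comparison bounds how stale a cached copy can get.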
Example #2
    def download_title(self):
        """ Downloads the title of the bookmark
        
        This may fail silently. If it does, it sets the title to "Unknown Title".
        """
        self.title = "Unknown Title"

        try:
            # Check robots.txt before fetching the page itself.
            url = urlparse(self.url)
            robots = "{}://{}/robots.txt".format(url.scheme, url.netloc)

            rfp = RobotFileParser(robots)
            rfp.read()

            if rfp.can_fetch("BMAT", self.url):
                r = requests.get(self.url,
                                 timeout=3.0,
                                 headers={"User-Agent": "BMAT"})
                r.raise_for_status()

                p = HTMLTitleReader()
                p.feed(r.text)

                self.title = p.title
                self.save()

        except Exception:
            # Swallow network/parse errors deliberately; the title keeps its
            # "Unknown Title" default, as documented above.
            return
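
HTMLTitleReader is not defined in the source. A plausible minimal version, assuming it is an html.parser.HTMLParser subclass that captures the text of the first <title> element (the class shape is a guess):

from html.parser import HTMLParser


class HTMLTitleReader(HTMLParser):
    # Hypothetical sketch: collects the text of the first <title> tag.

    def __init__(self):
        super().__init__()
        self.title = "Unknown Title"
        self._in_title = False
        self._done = False

    def handle_starttag(self, tag, attrs):
        if tag == "title" and not self._done:
            self._in_title = True
            self.title = ""

    def handle_data(self, data):
        if self._in_title:
            self.title += data

    def handle_endtag(self, tag):
        if tag == "title" and self._in_title:
            self._in_title = False
            self._done = True


reader = HTMLTitleReader()
reader.feed("<html><head><title>Hello</title></head></html>")
print(reader.title)  # -> Hello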
Example #3
    def can_fetch(self, url, user_agent=None):
        # Swap the base-class argument order and fall back to the instance's
        # default user agent when none is given.
        return RobotFileParser.can_fetch(self, user_agent or self.user_agent, url)
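
The enclosing class is not shown; presumably it is a RobotFileParser subclass that stores a default user agent. A sketch of what that context might look like (the class name, constructor, and "MyBot" default are assumptions):

from urllib.robotparser import RobotFileParser


class DefaultAgentParser(RobotFileParser):
    # Hypothetical subclass: remembers a default user agent string.

    def __init__(self, url='', user_agent='MyBot'):
        RobotFileParser.__init__(self, url)
        self.user_agent = user_agent

    def can_fetch(self, url, user_agent=None):
        return RobotFileParser.can_fetch(self, user_agent or self.user_agent, url)


parser = DefaultAgentParser('https://example.com/robots.txt')
parser.read()  # fetches and parses the live robots.txt
print(parser.can_fetch('https://example.com/some/page'))

Note that the override reverses the base class's (useragent, url) argument order, so callers can pass just the URL.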