# Imports this method needs (Python 3; the original may route these
# through six.moves equivalents):
import time
import logging

import tornado.httpclient
from tornado import gen
from urllib.parse import urlsplit, urljoin
from urllib.robotparser import RobotFileParser

logger = logging.getLogger(__name__)


@gen.coroutine  # required for the yield / gen.Return style used below
def can_fetch(self, user_agent, url):
    """Check robots.txt for `url`, caching one parser per domain."""
    parsed = urlsplit(url)
    domain = parsed.netloc
    if domain in self.robots_txt_cache:
        robot_txt = self.robots_txt_cache[domain]
        # Expire cached rules older than self.robot_txt_age seconds.
        if time.time() - robot_txt.mtime() > self.robot_txt_age:
            robot_txt = None
    else:
        robot_txt = None

    if robot_txt is None:
        robot_txt = RobotFileParser()
        try:
            response = yield gen.maybe_future(self.http_client.fetch(
                urljoin(url, '/robots.txt'),
                connect_timeout=10, request_timeout=30))
            content = response.body
        except tornado.httpclient.HTTPError as e:
            logger.error('load robots.txt from %s error: %r', domain, e)
            content = b''  # bytes, so the decode below still works

        try:
            content = content.decode('utf8', 'ignore')
        except UnicodeDecodeError:
            content = ''

        # parse() calls modified() internally, which stamps the mtime()
        # used by the cache-age check above.
        robot_txt.parse(content.splitlines())
        self.robots_txt_cache[domain] = robot_txt

    raise gen.Return(robot_txt.can_fetch(user_agent, url))
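# A hedged usage sketch for the coroutine above: how another Tornado
# coroutine might await it. `fetcher` stands in for the object that owns
# can_fetch, robots_txt_cache, and robot_txt_age; the name `check_url`
# and the 'my-bot' user agent are illustrative assumptions only.
import logging
from tornado import gen

logger = logging.getLogger(__name__)


@gen.coroutine
def check_url(fetcher, url):
    # can_fetch is a coroutine, so its result must be yielded.
    allowed = yield fetcher.can_fetch('my-bot', url)
    if not allowed:
        logger.warning('%s is disallowed by robots.txt', url)
    raise gen.Return(allowed)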
# Imports this method relies on:
import requests
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser


def download_title(self):
    """Download the title of the bookmark.

    This may fail silently. If it does, the title is left as
    "Unknown Title".
    """
    self.title = "Unknown Title"
    try:
        # Honour robots.txt before fetching the page itself.
        url = urlparse(self.url)
        robots = "{}://{}/robots.txt".format(url.scheme, url.netloc)
        rfp = RobotFileParser(robots)
        rfp.read()
        if rfp.can_fetch("BMAT", self.url):
            r = requests.get(self.url, timeout=3.0,
                             headers={"User-Agent": "BMAT"})
            r.raise_for_status()
            p = HTMLTitleReader()
            p.feed(r.text)
            self.title = p.title
            self.save()
    except Exception:
        # Deliberately silent: any network or parse error leaves the
        # default title in place. (Catching Exception rather than using
        # a bare except avoids swallowing KeyboardInterrupt/SystemExit.)
        return
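# HTMLTitleReader is used above but not defined in this snippet. A
# minimal sketch of what such a helper might look like, built on the
# standard-library html.parser; the class name and the `title`
# attribute match the usage above, the rest is an assumption.
from html.parser import HTMLParser


class HTMLTitleReader(HTMLParser):
    def __init__(self):
        super().__init__()
        self.title = ""
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data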
def load_rules_from_url(self, robots_url, timeout=None):
    """Manually load the robots.txt file from the server.

    :param robots_url: URL of the robots.txt file to load.
    :param timeout: requests timeout.
    :return: the loaded parser. On HTTP 401/403 the parser has
        ``disallow_all`` set; on other 4xx errors or connection
        failures it has ``allow_all`` set.
    """
    _parser = RobotFileParser()
    try:
        req = requests.Request(method='GET', url=robots_url,
                               headers=self.headers, auth=self.auth,
                               cookies=self.cookies, hooks=self.hooks)
        prep = req.prepare()
        send_kwargs = {
            'stream': False,
            'timeout': timeout,
            'verify': self.verify,
            'cert': self.cert,
            'proxies': self.proxies,
            'allow_redirects': True,
        }
        f = super(Session, self).send(prep, **send_kwargs)
        f.raise_for_status()
        self.cookies.update(f.cookies)
    except requests.exceptions.HTTPError as err:
        code = err.response.status_code
        if code in (401, 403):
            # Unauthorized/forbidden robots.txt: treat everything as off-limits.
            _parser.disallow_all = True
        elif 400 <= code < 500:
            # Other 4xx (e.g. 404): no robots.txt, so everything is allowed.
            _parser.allow_all = True
    except requests.exceptions.ConnectionError:
        _parser.allow_all = True
    else:
        _parser.parse(f.text.splitlines())
    self.robots_registry[robots_url] = _parser
    #: Initiate a start time for delays
    _parser.modified()
    return _parser
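# The trailing `_parser.modified()` above stamps a start time for delays;
# a minimal sketch, under that assumption, of how mtime() and
# crawl_delay() could gate the next request. wait_for_crawl_delay and
# USER_AGENT are illustrative names, not part of the code above.
import time

USER_AGENT = 'example-bot'


def wait_for_crawl_delay(parser):
    delay = parser.crawl_delay(USER_AGENT)
    if delay is None:
        return
    elapsed = time.time() - parser.mtime()
    if elapsed < delay:
        time.sleep(delay - elapsed)
    parser.modified()  # restart the clock for the request that follows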
def can_fetch(self, url, user_agent=None):
    # Fall back to this parser's default user agent when none is given.
    return RobotFileParser.can_fetch(self, user_agent or self.user_agent, url)
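# A minimal sketch of the subclass the method above would belong to: a
# RobotFileParser that remembers a default user agent so callers may
# omit it. The class name UserAgentRobotFileParser and the example URL
# are assumptions for illustration.
from urllib.robotparser import RobotFileParser


class UserAgentRobotFileParser(RobotFileParser):
    def __init__(self, url='', user_agent='*'):
        RobotFileParser.__init__(self, url)
        self.user_agent = user_agent

    def can_fetch(self, url, user_agent=None):
        return RobotFileParser.can_fetch(self, user_agent or self.user_agent, url)


rp = UserAgentRobotFileParser('https://example.com/robots.txt', 'example-bot')
rp.read()
print(rp.can_fetch('https://example.com/page'))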