def found_terminator(self):
    """Handle a terminator event on this channel.

    If a request is already in progress the event is forwarded to it.
    Otherwise the buffered input is treated as a request header block:
    the request line is parsed, a ``Request`` is created and queued, the
    request counters are incremented, and the new request is told about
    the terminator.

    An input buffer holding only blank lines schedules the channel to
    close.  A request line that cannot be parsed is dropped (and logged
    only when ``self.server.debug`` is set); a request whose command is
    ``None`` is logged as an error and dropped.
    """
    self._last_use = int(time.time())

    if self._current_request:
        self._current_request.found_terminator()
        return

    # Take ownership of the buffered header text and reset the buffer.
    header, self._in_buffer = self._in_buffer, ''
    # str.split replaces the deprecated string.split() module function.
    lines = header.split('\r\n')

    # Skip any blank lines that precede the request line.
    while lines and not lines[0]:
        lines.pop(0)
    if not lines:
        self.close_when_done()
        return

    request = lines.pop(0)
    try:
        command, uri, version = crack_request(request)
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        if self.server.debug:
            self.log_info("Ignoring malformed HTTP request: " + request)
        return

    if '%' in request:
        request = unquote(request)

    if command is None:
        self.log_info('Bad HTTP request: %s' % repr(request), 'error')
        return

    header = _join_headers(lines)
    self._current_request = Request(self, request, command, uri, version,
                                    header)

    # The new request goes just before the last entry of the queue.
    # NOTE(review): presumably the final slot is reserved for something
    # that must stay last -- confirm against the queue's other users.
    requests = self._request_queue
    requests.insert(len(requests) - 1, self._current_request)

    self.request_counter.increment()
    self.server.total_requests.increment()
    self._current_request.found_terminator()
def redirect_request(self, newurl, req, fp, code, msg, headers):
    """Build the follow-up Request for a redirect response.

    This is called by the http_error_30x methods when a redirection
    response is received.

    A new Request for ``newurl`` is returned when ``code`` is 301, 302,
    303 or "refresh", or when it is 307 and ``req`` carries no data.  For
    any other combination an HTTPError built from the original request's
    URL, ``code``, ``msg``, ``headers`` and ``fp`` is raised.
    """
    should_follow = (
        code in (301, 302, 303, "refresh")
        or (code == 307 and not req.has_data())
    )
    if not should_follow:
        raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Strictly (according to RFC 2616), 301 or 302 in response to
    # a POST MUST NOT cause a redirection without confirmation
    # from the user (of urllib2, in this case).  In practice,
    # essentially all clients do redirect in this case, so we do
    # the same.
    # XXX really refresh redirections should be visiting; tricky to
    # fix, so this will wait until post-stable release
    redirected = Request(
        newurl,
        headers=req.headers,
        origin_req_host=req.get_origin_req_host(),
        unverifiable=True,
        visit=False,
    )
    # Remember the request that started the redirect chain: reuse the
    # marker already on req if it has one, otherwise req itself.
    redirected._origin_req = getattr(req, "_origin_req", req)
    return redirected
def read(self):
    """Reads the robots.txt URL and feeds it to the parser.

    The URL is fetched through ``self._opener`` (created on demand via
    ``set_opener()``) using ``self._timeout``.  An HTTPError is handled
    like an ordinary response so that its status code can be inspected;
    any other IOError / socket.error / OSError while opening is logged
    through ``debug_robots`` and the method returns without changing the
    parser.

    Outcome by status code:
      * 401 or 403     -> ``self.disallow_all = True``
      * any other >=400 -> ``self.allow_all = True``
      * 200 with at least one line read -> ``self.parse(lines)``
    """
    if self._opener is None:
        self.set_opener()
    req = Request(self.url, unverifiable=True, visit=False,
                  timeout=self._timeout)
    try:
        f = self._opener.open(req)
    except HTTPError as exc:
        # Bind explicitly: on Python 3 the ``as`` target is deleted when
        # the handler exits, which would leave ``f`` undefined below.
        f = exc
    except (IOError, socket.error, OSError) as exc:
        debug_robots("ignoring error opening %r: %s" % (self.url, exc))
        return

    try:
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        status = f.code
    finally:
        # Release the response once the body and status have been read.
        f.close()

    if status in (401, 403):
        self.disallow_all = True
        debug_robots("disallow all")
    elif status >= 400:
        self.allow_all = True
        debug_robots("allow all")
    elif status == 200 and lines:
        debug_robots("parse lines")
        self.parse(lines)
def read(self):
    """Open the robots.txt URL through the configured opener.

    ``self._opener`` is created on demand via ``set_opener()``.  An
    HTTPError raised while opening is caught and kept in place of the
    response.

    NOTE(review): the body visible here stops after opening the URL --
    nothing is read from the response or passed to the parser.  Confirm
    whether the remainder of this method was lost.
    """
    if self._opener is None:
        self.set_opener()
    req = Request(self.url, unverifiable=True, visit=False)
    try:
        f = self._opener.open(req)
    except HTTPError as exc:
        # ``except HTTPError, f`` is Python 2-only syntax.  The ``as`` form
        # works on 2.6+ and 3; the explicit assignment keeps ``f`` bound
        # after the handler on Python 3 as well.
        f = exc
def http_request(self, request):
    """Return a request object that supports ``add_unredirected_header``.

    A request that already has that attribute is returned unchanged.
    Otherwise a new ``Request`` is built from the old request's
    ``_Request__original`` value, data and headers, and the optional
    ``origin_req_host``, ``unverifiable`` and ``visit`` attributes are
    copied across when the old request has them.
    """
    if hasattr(request, "add_unredirected_header"):
        return request

    upgraded = Request(request._Request__original,
                       request.data,
                       request.headers)
    # Each attribute is optional; one that cannot be read or assigned is
    # simply skipped.
    for attr_name in ("origin_req_host", "unverifiable", "visit"):
        try:
            setattr(upgraded, attr_name, getattr(request, attr_name))
        except AttributeError:
            pass
    return upgraded
def _request(self, url_or_req, data, visit, timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    """Return a request object for ``url_or_req``.

    A string-like ``url_or_req`` is wrapped in a new ``Request`` created
    with ``data``, ``visit`` and ``timeout``.  Anything else is taken to
    be an existing request object and is updated in place: ``data`` is
    added when it is not None, and ``visit`` and ``timeout`` are passed to
    ``set_request_attr``.
    """
    if isstringlike(url_or_req):
        return Request(url_or_req, data, visit=visit, timeout=timeout)

    # already a urllib2.Request or mechanize.Request instance
    existing = url_or_req
    if data is not None:
        existing.add_data(data)
    # XXX yuck
    # NOTE(review): the last argument looks like the default against which
    # the value is compared -- confirm against set_request_attr.
    set_request_attr(existing, "visit", visit, None)
    set_request_attr(existing, "timeout", timeout,
                     _sockettimeout._GLOBAL_DEFAULT_TIMEOUT)
    return existing
def _request(self, url_or_req, data, visit):
    """Return a request object for ``url_or_req``.

    A string-like ``url_or_req`` is wrapped in a new ``Request`` created
    with ``data`` and ``visit``.  Anything else is taken to be an existing
    request object and is updated in place: ``data`` is added when it is
    not None, a missing ``.visit`` attribute is initialised to None, and
    ``.visit`` is overwritten when ``visit`` is not None.
    """
    if isstringlike(url_or_req):
        return Request(url_or_req, data, visit=visit)

    # already a urllib2.Request or mechanize.Request instance
    existing = url_or_req
    if data is not None:
        existing.add_data(data)

    # XXX yuck, give request a .visit attribute if it doesn't have one
    try:
        existing.visit
    except AttributeError:
        existing.visit = None

    if visit is not None:
        existing.visit = visit
    return existing