class cookie:
    def __init__(self):
        self.cookieObj = SimpleCookie()
        self.load()

    def load(self):
        if not os.environ.has_key("HTTP_COOKIE"):
            # No cookie present
            return
        self.cookieObj.load(os.environ["HTTP_COOKIE"])

    def readCookie(self, CookieName):
        if self.cookieObj == False:
            # There is no cookie
            return False
        if self.cookieObj.has_key(CookieName):
            return self.cookieObj[CookieName].value
        else:
            return False

    def debug(self):
        print "Cookie-Debug:"
        print "<hr><pre>"
        if not os.environ.has_key("HTTP_COOKIE"):
            print "There is no HTTP_COOKIE in os.environ:\n"
            for k, v in os.environ.iteritems():
                print k, v
        else:
            print self.cookieObj
        print "</pre><hr>"
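A short CGI-style usage sketch for the class above (the module-level imports are assumptions; the snippet itself does not show them):

import os
from Cookie import SimpleCookie

c = cookie()
sid = c.readCookie("session_id")  # returns False when the cookie is absent
if sid is False:
    print "Content-Type: text/html\n"
    print "no session cookie yet"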
def cookie_parts(name, kaka):
    cookie_obj = SimpleCookie(kaka)
    morsel = cookie_obj.get(name)
    if morsel:
        return morsel.value.split("|")
    else:
        return None
def make_cookie(name, load, seed, expire=0, domain="", path="", timestamp=""):
    """
    Create and return a cookie

    :param name: Cookie name
    :param load: Cookie load
    :param seed: A seed for the HMAC function
    :param expire: Number of minutes before this cookie goes stale
    :param domain: The domain of the cookie
    :param path: The path specification for the cookie
    :param timestamp: A timestamp; defaults to the current GMT time
    :return: A tuple to be added to headers
    """
    cookie = SimpleCookie()
    if not timestamp:
        timestamp = str(int(time.mktime(time.gmtime())))
    signature = cookie_signature(seed, load, timestamp)
    cookie[name] = "|".join([load, timestamp, signature])
    if path:
        cookie[name]["path"] = path
    if domain:
        cookie[name]["domain"] = domain
    if expire:
        cookie[name]["expires"] = _expiration(expire,
                                              "%a, %d-%b-%Y %H:%M:%S GMT")
    return tuple(cookie.output().split(": ", 1))
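`make_cookie` above and `parse_cookie` below both call helpers that are not shown in these snippets. A minimal sketch of what they could look like, assuming an HMAC-SHA1 signature and a GMT-formatted expiry (both assumptions, not taken from this source):

import hashlib
import hmac
import time

def cookie_signature(seed, *parts):
    # Assumed implementation: HMAC-SHA1 over the concatenated parts.
    mac = hmac.new(seed, digestmod=hashlib.sha1)
    for part in parts:
        if part:
            mac.update(part)
    return mac.hexdigest()

def _expiration(minutes, time_format="%a, %d-%b-%Y %H:%M:%S GMT"):
    # Assumed implementation: a point `minutes` in the future, formatted
    # the way the cookie 'expires' attribute expects.
    return time.strftime(time_format, time.gmtime(time.time() + minutes * 60))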
def getSessionId(request_cookie):
    cookie = SimpleCookie()
    cookie.load(request_cookie)
    try:
        sessionId = int(cookie['id'].value)
    except (CookieError, KeyError, ValueError):
        # Note: the original `except CookieError, ValueError:` bound the
        # exception to the name ValueError instead of catching both types,
        # and a missing 'id' key raises KeyError, so catch that as well.
        sessionId = sessions.AddNewSession({'num': 0, 'auth': False})
    return sessionId
def parse_cookie(name, seed, kaka):
    """Parses and verifies a cookie value

    :param name: The name of the cookie
    :param seed: A seed used for the HMAC signature
    :param kaka: The cookie
    :return: A tuple consisting of (payload, timestamp)
    """
    if not kaka:
        return None
    cookie_obj = SimpleCookie(kaka)
    morsel = cookie_obj.get(name)
    if morsel:
        parts = morsel.value.split("|")
        if len(parts) != 3:
            return None
        # verify the cookie signature
        sig = cookie_signature(seed, parts[0], parts[1])
        if sig != parts[2]:
            raise Exception("Invalid cookie signature")
        try:
            return parts[0].strip(), parts[1]
        except KeyError:
            return None
    else:
        return None
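A round-trip usage sketch of the make/parse pair above (the cookie name and seed value are illustrative):

seed = "server-side-secret"
header = make_cookie("pysess", "some-payload", seed)
# header looks like ('Set-Cookie', 'pysess="some-payload|<ts>|<sig>"')

payload, timestamp = parse_cookie("pysess", seed, header[1])
assert payload == "some-payload"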
def set_cookie(name, _, *args):
    cookie = SimpleCookie()
    cookie[name] = base64.b64encode(":".join(args))
    cookie[name]['path'] = "/"
    cookie[name]["expires"] = _expiration(5)  # 5 minutes from now
    logger.debug("Cookie expires: %s", cookie[name]["expires"])
    return tuple(cookie.output().split(": ", 1))
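A hedged sketch of the matching read side for this base64-joined cookie (the function name is an assumption; only the encoding scheme is taken from set_cookie above):

def get_cookie_values(name, kaka):
    # Reverse of set_cookie above: pull the named morsel out of a raw
    # Cookie header and split the base64 payload back into its parts.
    morsel = SimpleCookie(kaka).get(name)
    if morsel is None:
        return None
    return base64.b64decode(morsel.value).split(":")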
def __read_cookie(self):
    """Reads the HTTP Cookie and loads the sid and data from it (if any)."""
    try:
        # check the cookie to see if a session has been started
        cookie = SimpleCookie(os.environ['HTTP_COOKIE'])
        self.cookie_keys = filter(is_gaesessions_key, cookie.keys())
        if not self.cookie_keys:
            return  # no session yet
        self.cookie_keys.sort()
        data = ''.join(cookie[k].value for k in self.cookie_keys)
        i = SIG_LEN + SID_LEN
        sig, sid, b64pdump = data[:SIG_LEN], data[SIG_LEN:i], data[i:]
        pdump = b64decode(b64pdump)
        actual_sig = Session.__compute_hmac(self.base_key, sid, pdump)
        if sig == actual_sig:
            self.__set_sid(sid, False)
            # check for expiration and terminate the session if it has expired
            if self.get_expiration() != 0 and time.time() > self.get_expiration():
                return self.terminate()
            if pdump:
                self.data = self.__decode_data(pdump)
            else:
                self.data = None  # data is in memcache/db: load it on-demand
        else:
            logging.warn('cookie with invalid sig received from %s: %s' %
                         (os.environ.get('REMOTE_ADDR'), b64pdump))
    except (CookieError, KeyError, IndexError, TypeError):
        # there is no cookie (i.e., no session) or the cookie is invalid
        self.terminate(False)
def __login(self, username, password):
    """ login douban, get the session token """
    data = urllib.urlencode({'source': 'simple',
                             'form_email': username,
                             'form_password': password})
    contentType = "application/x-www-form-urlencoded"
    self.__get_bid()
    cookie = "bid=%s" % self.bid
    headers = {"Content-Type": contentType, "Cookie": cookie}
    with contextlib.closing(httplib.HTTPSConnection("www.douban.com")) as conn:
        conn.request("POST", "/accounts/login", data, headers)
        r1 = conn.getresponse()
        resultCookie = SimpleCookie(r1.getheader('Set-Cookie'))
        if not resultCookie.has_key('dbcl2'):
            raise DoubanLoginException()
        dbcl2 = resultCookie['dbcl2'].value
        if dbcl2 is not None and len(dbcl2) > 0:
            self.dbcl2 = dbcl2
            uid = self.dbcl2.split(':')[0]
            self.uid = uid
def set_cookie(name, _, value):
    cookie = SimpleCookie()
    cookie[name] = value
    cookie[name]['path'] = "/"
    cookie[name]["expires"] = _expiration(5)  # 5 minutes from now
    logger.debug("Cookie expires: %s" % cookie[name]["expires"])
    return tuple(cookie.output().split(": ", 1))
def cookie(name, sid, seed, expire=0, domain="", path=""):
    """
    Create and return a cookie

    :param name: Cookie name
    :param sid: Session identifier
    :param seed: A seed for the HMAC function
    :param expire: Number of minutes before this cookie goes stale
    :param domain: The domain of the cookie
    :param path: The path specification for the cookie
    :return: A tuple to be added to headers
    """
    cookie = SimpleCookie()
    timestamp = str(int(time.mktime(time.gmtime())))
    signature = cookie_signature(seed, sid, timestamp)
    cookie[name] = "|".join([sid, timestamp, signature])
    if path:
        cookie[name]["path"] = path
    if domain:
        cookie[name]["domain"] = domain
    if expire:
        cookie[name]["expires"] = _expiration(expire,
                                              "%a, %d-%b-%Y %H:%M:%S GMT")
    return tuple(cookie.output().split(": ", 1))
def COOKIES(self):
    if 'brick.cookies' not in self.environ:
        raw_dict = SimpleCookie(self.environ.get('HTTP_COOKIE', ''))
        self.environ['brick.cookies'] = {}
        for cookie in raw_dict.itervalues():
            self.environ['brick.cookies'][cookie.key] = cookie.value
    return self.environ['brick.cookies']
def getSession(self):
    """Return the existing session or a new session"""
    if self.session is not None:
        return self.session

    # Get value of cookie header that was sent
    cookie_str = self.headers.get('Cookie')
    if cookie_str:
        cookie_obj = SimpleCookie(cookie_str)
        sid_morsel = cookie_obj.get(self.SESSION_COOKIE_NAME, None)
        if sid_morsel is not None:
            sid = sid_morsel.value
        else:
            sid = None
    else:
        sid = None

    # If a session id was not set, create a new one
    if sid is None:
        sid = randomString(16, '0123456789abcdef')
        session = None
    else:
        session = self.server.sessions.get(sid)

    # If no session exists for this session ID, create one
    if session is None:
        session = self.server.sessions[sid] = {}
        session['id'] = sid

    self.session = session
    return session
def __init__(self, hnd, name = session.COOKIE_NAME, timeout = 0): super(DatastoreSession, self).__init__(hnd, name, timeout) SessionStore.clear() # check from cookie if not timeout: config = Config() timeout = config.get('session_timeout', 60*60) elif timeout == -1: timeout = 356*24*60*60*50 if name in hnd.request.cookies: self._id = hnd.request.cookies[name] res = SessionStore.gql("WHERE id = :1", self._id).get() if res: self._store = res session_data = self._store.value if session_data: self.update(pickle.loads(session_data)) else: self._create_store(self._id) else: # not in the cookie, set it c = SimpleCookie() c[name] = self._id c[name]['path'] = '/' c[name]['expires'] = rfc822.formatdate(time()+timeout) cs = c.output().replace('Set-Cookie: ', '') hnd.response.headers.add_header('Set-Cookie', cs) self._create_store(self._id)
def username(cookie, name=None):
    """ try to extract username from PAS cookie """
    if cookie is not None:
        cookies = SimpleCookie()
        try:
            cookies.load(cookie)
        except CookieError:
            return name
        if cookie_name in cookies:
            # Deal with doubly quoted cookies
            ac_cookie = repeatedly_unquote(cookies[cookie_name].value)
            try:
                ac = decodestring(ac_cookie + '=====')
            except (TypeError, binascii.Error):
                return name
            # plone.session 3.x (Plone 4.x)
            if '!' in ac[40:]:
                name, user_data = ac[40:].split('!', 1)
            # plone.session 2.x (Plone 3.x)
            elif ' ' in ac[20:21]:
                name = ac[21:]
            # PluggableAuthService.CookieAuthHelper
            elif ':' in ac:
                user, pwd = ac.split(':', 1)
                # PluggableAuthService >= 1.5
                try:
                    name = user.decode('hex')
                # PluggableAuthService < 1.5
                except TypeError:
                    name = user
    return name
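`repeatedly_unquote` is not shown in this snippet. A plausible sketch, assuming it simply peels URL quoting until the value is stable:

from urllib import unquote

def repeatedly_unquote(value):
    # Assumed helper: undo one or more layers of URL quoting.
    previous = None
    while value != previous:
        previous = value
        value = unquote(value)
    return value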
def _parse_cookie(self):
    cookiestr = self.environ.get('HTTP_COOKIE', '')
    if not cookiestr:
        return
    cookies = SimpleCookie(cookiestr)
    for c in cookies.values():
        self.cookie[c.key] = c.value
class CookieHandler(object):

    def __init__(self, *args, **kw):
        # Somewhere to store cookies between consecutive requests
        self.cookies = SimpleCookie()
        super(CookieHandler, self).__init__(*args, **kw)

    def httpCookie(self, path):
        """Return self.cookies as an HTTP_COOKIE environment value."""
        l = [m.OutputString() for m in self.cookies.values()
             if path.startswith(m['path'])]
        return '; '.join(l)

    def loadCookies(self, envstring):
        self.cookies.load(envstring)

    def saveCookies(self, response):
        """Save cookies from the response."""
        # Urgh - need to play with the response's privates to extract
        # cookies that have been set
        for k, v in response._cookies.items():
            k = k.encode('utf8')
            self.cookies[k] = v['value'].encode('utf8')
            if self.cookies[k].has_key('Path'):
                self.cookies[k]['Path'] = v['Path']
class CookieScraper(object):
    "Scraper that keeps track of getting and setting cookies."

    def __init__(self):
        self._cookies = SimpleCookie()

    def get_page(self, url, post_data=None, headers=()):
        """
        Helper method that gets the given URL, handling the sending and
        storing of cookies. Returns the requested page as a string.
        """
        socket.timeout(300)
        opener = urllib.URLopener()
        opener.addheader('Cookie', self._cookies.output(attrs=[],
                                                        header='',
                                                        sep=';').strip())
        for k, v in headers:
            opener.addheader(k, v)
        try:
            f = opener.open(url, post_data)
        except IOError, e:
            if e[1] == 302:
                # Got a 302 redirect, but check for cookies before
                # redirecting. e[3] is a httplib.HTTPMessage instance.
                if e[3].dict.has_key('set-cookie'):
                    self._cookies.load(e[3].dict['set-cookie'])
                return self.get_page(e[3].getheader('location'))
            else:
                raise
        if f.headers.dict.has_key('set-cookie'):
            self._cookies.load(f.headers.dict['set-cookie'])
        return f.read()
def get_cookie_dict(environ):
    """Return a *plain* dictionary of cookies as found in the request.

    Unlike ``get_cookies`` this returns a dictionary, not a
    ``SimpleCookie`` object.  For incoming cookies a dictionary fully
    represents the information.  Like ``get_cookies`` this caches and
    checks the cache.
    """
    header = environ.get('HTTP_COOKIE')
    if not header:
        return {}
    if environ.has_key('paste.cookies.dict'):
        cookies, check_header = environ['paste.cookies.dict']
        if check_header == header:
            return cookies
    cookies = SimpleCookie()
    try:
        cookies.load(header)
    except CookieError:
        pass
    result = {}
    for name in cookies:
        result[name] = cookies[name].value
    environ['paste.cookies.dict'] = (result, header)
    return result
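A minimal WSGI usage sketch for get_cookie_dict (the application and cookie name are illustrative):

def app(environ, start_response):
    # The parsed dict is cached in the environ, so repeated calls
    # during one request are cheap.
    cookies = get_cookie_dict(environ)
    user = cookies.get('user', 'anonymous')
    start_response('200 OK', [('Content-Type', 'text/plain')])
    return ['Hello %s' % user]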
def __init__(self, environ, backend, ttl, cookie_name, fp_use_ip, log):
    self.handler = backend
    self.ttl = ttl
    self.cookie_name = cookie_name
    self.sid = None
    self.data = {}
    self.log = log
    self.clear_cookie = False
    self.session_start = False
    fingerprint = '%s%s%s' % (environ.get('HTTP_USER_AGENT'),
                              environ.get('HTTP_ACCEPT_ENCODING'),
                              environ.get('HTTP_ACCEPT_LANGUAGE'))
    if fp_use_ip:
        fingerprint += environ.get('REMOTE_ADDR')
    self.fingerprint = hashlib.sha1(fingerprint).hexdigest()
    if 'HTTP_COOKIE' in environ:
        cookie = SimpleCookie(environ['HTTP_COOKIE'])
        if cookie.get(self.cookie_name):
            cookie_sid = cookie[self.cookie_name].value
            if cookie_sid:
                self.sid = cookie_sid
def status(self, environ, start_response):
    name1 = ''
    name1_key = '*empty*'
    if 'HTTP_COOKIE' in environ:
        c = SimpleCookie(environ.get('HTTP_COOKIE', ''))
        if 'name1' in c:
            key = c.get('name1').value
            name1 = usernames.get(key, '')
            name1_key = key
        data = """
<html>
<body>
Your username is """
        data += name1
        data += " and your key is "
        data += name1_key
        data += "<p><a href='/'>Home</a></body></html>"
    else:
        data = """
<html>
<body>
You're not logged in.....<p>
<a href='/'>Home</a>
</body>
</html>
"""
    start_response('200 OK', list(html_headers))
    return [data]
def get_cookie(self, name):
    cookie_str = self.request.headers.get('cookie')
    if not cookie_str:
        return None
    cookie = SimpleCookie()
    cookie.load(cookie_str)
    return cookie[name].value
def __get_params(self):
    self._kwargs = self.__get_query_params()
    self._cog_ajax = self._kwargs.get('cog_ajax')
    self.__cog_target = self._kwargs.get('cog_target')
    self._cog_raw = self._kwargs.get('cog_raw', None)
    self._cog_method = self._kwargs.get('cog_method', None)
    # cog_method must not contain non-word characters
    assert self._cog_method is None or \
        re.search('\W', self._cog_method) is None
    if self._cog_method is not None and self._cog_method[0] == '_':
        # we never should receive a protected method...
        self._cog_method = "w3error"
        self._kwargs['cog_method'] = "w3error"
        self._kwargs['cog_error'] = "Can't call a protected method!"
    self._cog_ref_oid = self._kwargs.get('cog_ref_oid', None)
    self._cog_oid_ = self._kwargs.get('cog_oid_', None)
    self._session_key = None
    if 'HTTP_COOKIE' in self._environ:
        cookie_string = self._environ.get('HTTP_COOKIE')
        cookie = SimpleCookie()
        cookie.load(cookie_string)
        if 'cog_session' in cookie:
            self._session_key = cookie['cog_session'].value
    self.__cog_environment = self.__get_env()
    self._cog_fqtn_ = self._kwargs.get('cog_fqtn_', None)
    if self._cog_ref_oid and self._cog_ref_oid == self._cog_oid_:
        self._cog_oid_ = None
    self._kwargs['cog_controller'] = self
    self._kwargs['cog_first_call'] = True
def __read_cookies(self):
    from Cookie import SimpleCookie
    cookies_raw = SimpleCookie(self.get_env('HTTP_COOKIE'))
    cookies = {}
    for key, field in cookies_raw.iteritems():
        cookies[key] = field.value
    return Table(cookies, allow_duplicates=False, readonly=True)
def application(environ, start_response):
    GET = parse_qs(environ['QUERY_STRING'])
    path = environ['PATH_INFO']
    cookies = SimpleCookie(environ.get('HTTP_COOKIE', ''))
    headers = {'Content-Type': 'text/html'}
    if path == '/':
        response = base % {'contenido': form}
    elif path == '/set':
        cookies['sessionId'] = store.add(GET.get('name', ['NULL McNULL'])[0])
        response = base % {'contenido':
            '<div style="background-color:green;color:white">Cookie set</div>'}
        headers.update({'Set-Cookie': cookies['sessionId'].OutputString()})
    else:
        cookie = cookies.get('sessionId', None)
        name = cookie and store.get(cookie.value, None) or None
        response = base % {'contenido':
            "<p>The session value is: %s</p>" % name if name else 'None'}
    headers.update({'Content-Length': str(len(response))})
    start_response("200 OK", headers.items())
    return [response]
def start(self, cookies, cookieopts=None):
    c = SimpleCookie(cookies)
    sid = c.get(self.cookiename)
    create = True
    if sid is not None:
        for m in self.get(sid.value):
            yield m
        if self.apiroutine.retvalue is not None:
            self.apiroutine.retvalue = (
                self.SessionHandle(self.apiroutine.retvalue, self.apiroutine),
                [])
            create = False
    if create:
        for m in self.create():
            yield m
        sh = self.apiroutine.retvalue
        m = Morsel()
        m.key = self.cookiename
        m.value = sh.id
        m.coded_value = sh.id
        opts = {"path": "/", "httponly": True}
        if cookieopts:
            opts.update(cookieopts)
            # Drop the attribute entirely when the caller opts out of
            # HttpOnly. (The original deleted from cookieopts, the wrong
            # dict, and raised KeyError if the key was absent.)
            if not cookieopts.get("httponly", True):
                del opts["httponly"]
        m.update(opts)
        self.apiroutine.retvalue = (sh, [m])
def __read_cookie(self):
    """Reads the HTTP Cookie and loads the sid and data from it (if any)."""
    print 'session: __read_cookie'
    try:
        if self.environ.get('HTTP_COOKIE') is None:
            return  # no cookies
        cookie = SimpleCookie(self.environ.get('HTTP_COOKIE'))
        self.cookie_keys = filter(is_mole_sessions_key, cookie.keys())
        if not self.cookie_keys:
            return  # no session
        self.cookie_keys.sort()
        data = ''.join(cookie[k].value for k in self.cookie_keys)
        i = SIG_LEN + SID_LEN
        sig, sid, b64pdump = data[:SIG_LEN], data[SIG_LEN:i], data[i:]
        pdump = b64decode(b64pdump)
        actual_sig = Session.__compute_hmac(self.base_key, sid, pdump)
        if sig == actual_sig:
            self.__set_sid(sid, False)
            if self.get_expiration() != 0 and time.time() > self.get_expiration():
                return self.terminate()
            if pdump:
                self.data = self.__decode_data(pdump)
            else:
                self.data = None
        else:
            logging.warn('cookie with invalid sig received from %s: %s' %
                         (os.environ.get('REMOTE_ADDR'), b64pdump))
    except (CookieError, KeyError, IndexError, TypeError):
        import traceback; traceback.print_exc()
        logging.error("session error:", exc_info=True)
        self.terminate(False)
def set_cookie(self, key, value, max_age=None, expires=None, path='/',
               domain=None):
    """
    Adds the given cookie to the response, so it will be set on the
    user's browser.
    """
    cookies = Cookie()
    cookies[key] = value
    if isinstance(max_age, timedelta):
        max_age = max_age.seconds + max_age.days * 24 * 60 * 60
    if max_age is not None and expires is None:
        expires = datetime.utcnow() + timedelta(seconds=max_age)
    if isinstance(expires, timedelta):
        expires = datetime.utcnow() + expires
    if isinstance(expires, datetime):
        expires = '"' + datetime_utils._serialize_cookie_date(expires) + '"'
    for var_name, var_value in [('max-age', max_age), ('path', path),
                                ('domain', domain), ('expires', expires)]:
        if var_value is not None:
            cookies[key][var_name] = str(var_value)
    cookies = cookies.output(header='').lstrip()
    if cookies:
        self.extra_headers.append(('Set-Cookie', cookies))
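A usage sketch, assuming a response object that exposes the method above (cookie names and values are illustrative):

from datetime import timedelta

response.set_cookie('theme', 'dark', max_age=timedelta(days=30))
response.set_cookie('lang', 'en', expires=timedelta(hours=1),
                    domain='.example.com')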
def getSession(self):
    """Return the existing session or a new session"""
    if self.session is not None:
        return self.session

    # Get value of cookie header that was sent
    cookie_str = self.headers.get('Cookie')
    if cookie_str:
        cookie_obj = SimpleCookie(cookie_str)
        sid_morsel = cookie_obj.get(self.SESSION_COOKIE_NAME, None)
        if sid_morsel is not None:
            sid = sid_morsel.value
        else:
            sid = None
    else:
        sid = None

    # If a session id was not set, create a new one
    if sid is None:
        # Pure pragmatism: Use function for nonce salt to generate session ID.
        sid = make_nonce_salt(16)
        session = None
    else:
        session = self.server.sessions.get(sid)

    # If no session exists for this session ID, create one
    if session is None:
        session = self.server.sessions[sid] = {}
        session['id'] = sid

    self.session = session
    return session
def set_cookie(self, cookie):
    """
    Set a cookie.

    The cookie will actually be recorded in the WSGI environ and the
    'Set-Cookie' header will be generated when the response is first
    sent.

    'cookie' can be one of four things:

    * a string: the value is considered a cookie header value, i.e. the
      bit that would normally be added after 'Set-Cookie: '.
    * (name, value) tuple: a persistent cookie is created.
    * (name, None) tuple: the named cookie will be removed.
    * cookie instance: e.g. one of the cookie types in Python's Cookie
      module.
    """
    if isinstance(cookie, str):
        pass
    elif isinstance(cookie, tuple):
        name, value = cookie
        cookie = SimpleCookie()
        cookie[name] = value or ''
        cookie[name]['path'] = self.environ['SCRIPT_NAME'] or '/'
        if value is None:
            cookie[name]['expires'] = 0
            cookie[name]['max-age'] = 0
        cookie = cookie.output(header='').strip()
    else:
        cookie = cookie.output(header='').strip()
    self.headers.append(cookie)
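The docstring enumerates four accepted forms; a usage sketch covering each (assuming a response object that exposes this method):

response.set_cookie('sid=abc123; Path=/')  # raw header value
response.set_cookie(('user', 'alice'))     # persistent cookie
response.set_cookie(('user', None))        # remove the named cookie

c = SimpleCookie()
c['theme'] = 'dark'
response.set_cookie(c)                     # a Cookie-module instance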
def _BaseCookie__set(self, key, real_value, coded_value):
    if not isinstance(key, bytes):
        key = key.encode('ascii')  # Python 2.x cannot handle unicode keys
    return SimpleCookie._BaseCookie__set(self, key, real_value, coded_value)
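For context, this override works through Python's private-name mangling: BaseCookie.__setitem__ calls self.__set, which resolves to _BaseCookie__set, so a subclass can intercept it under that mangled name. A minimal sketch of a class this method could live in (the class name is hypothetical):

class ByteKeyCookie(SimpleCookie):
    def _BaseCookie__set(self, key, real_value, coded_value):
        if not isinstance(key, bytes):
            key = key.encode('ascii')  # Python 2.x cannot handle unicode keys
        return SimpleCookie._BaseCookie__set(self, key, real_value, coded_value)

c = ByteKeyCookie()
c[u'session'] = 'abc'  # the unicode key is coerced to a byte string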
def clear_cookies(self):
    self._cookies = SimpleCookie()
class Retriever(object):
    'HTTP client.'

    def __init__(self, user_agent=None, cache=Default, timeout=20, sleep=0):
        # Use cache=None to explicitly turn off caching.
        # If you don't provide cache, then it will cache in
        # settings.HTTP_CACHE, or '/tmp/eb_scraper_cache' if
        # the setting is undefined.
        # sleep should be the number of seconds to sleep between requests.
        from django.conf import settings
        if cache is Default:
            cache = getattr(settings, 'HTTP_CACHE', '/tmp/eb_scraper_cache')
        self.h = httplib2.Http(cache, timeout=timeout)
        self.h.force_exception_to_status_code = False
        self.h.follow_redirects = False
        self.user_agent = user_agent or \
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)'
        self._cookies = SimpleCookie()
        self.logger = logging.getLogger('eb.retrieval.retriever')
        self.sleep = sleep
        # Keep track of whether we've downloaded any pages yet.
        # This makes sure we don't sleep before the very first requested page.
        self.page_downloaded = False

    def clear_cookies(self):
        self._cookies = SimpleCookie()

    def fetch_data_and_headers(self, uri, data=None, headers=None,
                               send_cookies=True, follow_redirects=True,
                               raise_on_error=True):
        "Retrieves the resource and returns a tuple of (content, header dictionary)."
        # Sleep, if necessary, but only if a page has already been downloaded
        # with this retriever. (We don't want to sleep before the very first
        # request that a retriever makes, because that would be unnecessary.)
        if self.sleep and self.page_downloaded:
            self.logger.debug('Sleeping for %s seconds', self.sleep)
            time.sleep(self.sleep)
        self.page_downloaded = True

        # Prepare the request.
        if not headers:
            headers = {}
        headers['user-agent'] = headers.get('user-agent', self.user_agent)
        if send_cookies and self._cookies:
            # Some broken ASP.NET servers put "\r\n" in there, so we replace
            # that with semicolon to get proper behavior.
            headers['Cookie'] = self._cookies.output(
                attrs=[], header='').strip().replace('\r\n', ';')
        method = data and "POST" or "GET"
        body = data and urlencode(data) or None
        if method == "POST" and body:
            headers.setdefault('Content-Type',
                               'application/x-www-form-urlencoded')

        # Get the response.
        resp_headers = None
        for attempt_number in range(3):
            self.logger.debug('Attempt %s: %s %s',
                              attempt_number + 1, method, uri)
            if data:
                self.logger.debug('%r', data)
            try:
                resp_headers, content = self.h.request(uri, method, body=body,
                                                       headers=headers)
                if resp_headers['status'] == '500':
                    self.logger.debug("Request got a 500 error: %s %s",
                                      method, uri)
                    continue  # Try again.
                break
            except socket.timeout:
                self.logger.debug("Request timed out after %s seconds: %s %s",
                                  self.h.timeout, method, uri)
                continue  # Try again.
            except socket.error, e:
                self.logger.debug("Got socket error: %s", e)
                continue  # Try again.
            except AttributeError, e:
                self.logger.debug("Got httplib bug where socket is None: %s", e)
                continue  # Try again.
            except httplib2.ServerNotFoundError:
                raise RetrievalError("Could not %s %r: server not found" %
                                     (method, uri))
def parse_cookie(request):
    """Translate request's cookie into a Cookie.SimpleCookie."""
    raw_cookie = request.message.get('Cookie', '')
    return SimpleCookie(raw_cookie)
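A small usage sketch (the request object with its `message` header mapping comes from the surrounding framework and is assumed here):

cookies = parse_cookie(request)
if 'sid' in cookies:
    session_id = cookies['sid'].value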
def load(cls, cookie_data):
    cookie = SessionCookie()
    SimpleCookie.load(cookie, cookie_data)
    return cookie
def convert_cookie(self, cookie):
    sc = SimpleCookie()
    for key, value in cookie.iteritems():
        sc[key] = value
    return sc
def error_reporter(request):
    """
    Grab an error submitted as a GET request
    """
    if not request.GET and not request.POST:
        ## If someone just hits this page at random, ignore it
        return HttpResponse('')
    url = request.GET.get('url', "")
    domain = Site.objects.get_current().domain
    if url[:4] == 'http' and (domain not in (url[7:(7 + len(domain))],
                                             url[8:(8 + len(domain))])):
        ## Punt responses not from us
        return HttpResponse('')  ## Return something, so we don't trigger an error
    cookies = StringIO()
    get = StringIO()
    meta = StringIO()
    post = StringIO()
    pprint(dict(request.COOKIES), cookies)
    pprint(dict(request.GET), get)
    pprint(dict(request.META), meta)
    user_str = request.user.username if hasattr(request, 'user') and \
        request.user.is_authenticated() else "(not authenticated)"
    user_agent_str = request.META.get('HTTP_USER_AGENT', "(not specified)")
    msg = request.GET.get('msg', "(no message)")
    json_flag = ""
    if request.POST:
        if request.raw_post_data.strip()[0] == '[':
            ## Probably a JSON error report
            ## Let's try to decode it
            try:
                err = json.loads(request.raw_post_data)
                ## Deal with messages that we don't want to deal with
                if is_quirk_should_be_ignored(err):
                    return HttpResponse('')
                json_flag = " (JSON-encoded)"
                for e in err:
                    try:
                        c = SimpleCookie()
                        c.load(str(e['data']['cookie']))
                        e['data']['cookie'] = dict(
                            (str(x), str(y)) for x, y in c.iteritems())
                    except:
                        ## Whoops, don't have cookie data after all
                        pass
                    ## Also pull out some data, if we can.
                    ## 'err' is an array, and we don't need to do this more
                    ## than once; but it should typically be an array of
                    ## either 0 or 1 elements, and we don't want to do it for
                    ## a 0-length array, so just do it in the loop
                    try:
                        if user_str == "(not authenticated)":
                            user_str = "%s %s" % (user_str, e['data']['cookie'])
                    except:
                        pass
                    try:
                        if user_agent_str == "(not specified)":
                            user_agent_str = e['env']['user_agent']
                    except:
                        pass
                    try:
                        if msg == "(no message)":
                            msg = e['exception']['message']
                    except:
                        pass
            except Exception, e:
                print "*** Exception!", e
                print json.__dict__
                err = request.raw_post_data
                pprint(err, post)
        else:
            pprint(dict(request.POST), post)
class Session(object):
    def __init__(self):
        self.data = {}
        self.started = False
        self._flock = None
        self.expires = 0  # delete right away
        self.__sid = sid = self.__getsid()
        self.path = os.path.join(S_DIR, sid + S_EXT)

    def isset(self, name):
        """Is the variable set in the session?"""
        if not self.started:
            raise NotStarted("Session must be started")
        return name in self

    def unset(self, name):
        """Unset the name from the session"""
        if not self.started:
            raise NotStarted("Session must be started")
        del self[name]

    @staticmethod
    def __newsid():
        """Create a new session ID"""
        h = hashlib.new("ripemd160")
        h.update(str(time.time() / time.clock()**-1) + str(os.getpid()))
        return h.hexdigest()

    def __getsid(self):
        """Get the current session ID or return a new one"""
        # try to load the sid from the HTTP cookie
        self.cookie = SimpleCookie()
        if os.environ.has_key('HTTP_COOKIE'):
            self.cookie.load(os.environ['HTTP_COOKIE'])
            if S_ID in self.cookie:
                sid = self.cookie[S_ID].value
                return sid
            else:
                raise NoCookiesError("Could not find any cookies")
        # if all else fails, return a new sid
        return self.__newsid()

    def getsid(self):
        """
        Return the name and value that the sid needs to have in a GET
        or POST request
        """
        if not self.started:
            raise NotStarted("Session must be started")
        return (S_ID, self.__sid)

    def start(self):
        """Start the session"""
        if self.started:
            return True  # session cannot be started more than once per script
        self._flock = FileLock(self.path)
        self._flock.acquire()
        # load the session if it exists
        if os.path.exists(self.path):
            with open(self.path, "rb") as f:
                self.data = dict(load(f))
            self.data["__date_loaded__"] = TODAY
        else:
            # create a session
            with open(self.path, "wb") as f:
                self.data = {"__date_loaded__": TODAY}
        # the session is officially started!
        self.started = True
        # store the sid in the cookie
        self.cookie[S_ID] = self.__sid
        self.cookie[S_ID]["expires"] = str(self.expires)
        self.cookie[S_ID]["version"] = "1"
        return True

    def commit(self):
        """Commit the changes to the session"""
        if not self.started:
            raise NotStarted("Session must be started")
        with open(self.path, "wb") as f:
            dump(self.data, f, HIGHEST_PROTOCOL)

    def destroy(self):
        """Destroy the session"""
        if not self.started:
            raise NotStarted("Session must be started")
        os.remove(self.path)
        if self._flock:
            self._flock.release()
        self.started = False

    def output(self):
        """Commit changes and send headers."""
        if not self.started:
            raise NotStarted("Session must be started")
        self.commit()
        return self.cookie.output()

    def setdefault(self, item, default=None):
        if not self.started:
            raise NotStarted("Session must be started")
        if not self.isset(item):
            self[item] = default
        return self[item]

    def set_expires(self, days):
        """Sets the expiration of the cookie"""
        date = datetime.date.today() + datetime.timedelta(days=days)
        self.expires = date.strftime("%a, %d-%b-%Y %H:%M:%S PST")
        self.cookie[S_ID]["expires"] = str(self.expires)

    def __getitem__(self, item):
        """Get the item from the session"""
        if not self.started:
            raise NotStarted("Session must be started")
        return self.data.__getitem__(item)

    def __setitem__(self, item, value):
        """Set the item into the session"""
        if not self.started:
            raise NotStarted("Session must be started")
        self.data.__setitem__(item, value)

    def __delitem__(self, item):
        if not self.started:
            raise NotStarted("Session must be started")
        self.data.__delitem__(item)

    def __contains__(self, item):
        """Return whether item is in the session"""
        if not self.started:
            raise NotStarted("Session must be started")
        return self.data.__contains__(item)

    def __iter__(self):
        """Go through the names of all the session variables"""
        if not self.started:
            raise NotStarted("Session must be started")
        return self.data.__iter__()
class Response(object):
    """An HTTP Response, including status, headers, and body.

    Application developers should use Response.headers (a dict) to
    set or modify HTTP response headers. When the response is finalized,
    Response.headers is transformed into Response.header_list as
    (key, value) tuples.
    """

    __metaclass__ = cherrypy._AttributeDocstrings

    # Class attributes for dev-time introspection.
    status = ""
    status__doc = """The HTTP Status-Code and Reason-Phrase."""

    header_list = []
    header_list__doc = """
    A list of the HTTP response headers as (name, value) tuples.
    In general, you should use response.headers (a dict) instead."""

    headers = httputil.HeaderMap()
    headers__doc = """
    A dict-like object containing the response headers. Keys are header
    names (in Title-Case format); however, you may get and set them in
    a case-insensitive manner. That is, headers['Content-Type'] and
    headers['content-type'] refer to the same value. Values are header
    values (decoded according to RFC 2047 if necessary).

    See also: httputil.HeaderMap, httputil.HeaderElement."""

    cookie = SimpleCookie()
    cookie__doc = """See help(Cookie)."""

    body = ResponseBody()
    body__doc = """The body (entity) of the HTTP response."""

    time = None
    time__doc = """The value of time.time() when created. Use in HTTP dates."""

    timeout = 300
    timeout__doc = """Seconds after which the response will be aborted."""

    timed_out = False
    timed_out__doc = """
    Flag to indicate the response should be aborted, because it has
    exceeded its timeout."""

    stream = False
    stream__doc = """If False, buffer the response body."""

    def __init__(self):
        self.status = None
        self.header_list = None
        self._body = []
        self.time = time.time()

        self.headers = httputil.HeaderMap()
        # Since we know all our keys are titled strings, we can
        # bypass HeaderMap.update and get a big speed boost.
        dict.update(self.headers, {
            "Content-Type": 'text/html',
            "Server": "CherryPy/" + cherrypy.__version__,
            "Date": httputil.HTTPDate(self.time),
        })
        self.cookie = SimpleCookie()

    def collapse_body(self):
        """Collapse self.body to a single string; replace it and return it."""
        if isinstance(self.body, basestring):
            return self.body
        newbody = ''.join([chunk for chunk in self.body])
        self.body = newbody
        return newbody

    def finalize(self):
        """Transform headers (and cookies) into self.header_list. (Core)"""
        try:
            code, reason, _ = httputil.valid_status(self.status)
        except ValueError, x:
            raise cherrypy.HTTPError(500, x.args[0])

        headers = self.headers

        self.output_status = str(code) + " " + headers.encode(reason)

        if self.stream:
            # The upshot: wsgiserver will chunk the response if
            # you pop Content-Length (or set it explicitly to None).
            # Note that lib.static sets C-L to the file's st_size.
            if dict.get(headers, 'Content-Length') is None:
                dict.pop(headers, 'Content-Length', None)
        elif code < 200 or code in (204, 205, 304):
            # "All 1xx (informational), 204 (no content),
            # and 304 (not modified) responses MUST NOT
            # include a message-body."
            dict.pop(headers, 'Content-Length', None)
            self.body = ""
        else:
            # Responses which are not streamed should have a Content-Length,
            # but allow user code to set Content-Length if desired.
            if dict.get(headers, 'Content-Length') is None:
                content = self.collapse_body()
                dict.__setitem__(headers, 'Content-Length', len(content))

        # Transform our header dict into a list of tuples.
        self.header_list = h = headers.output()

        cookie = self.cookie.output()
        if cookie:
            for line in cookie.split("\n"):
                if line.endswith("\r"):
                    # Python 2.4 emits cookies joined by LF but 2.5+ by CRLF.
                    line = line[:-1]
                name, value = line.split(": ", 1)
                if isinstance(name, unicode):
                    name = name.encode("ISO-8859-1")
                if isinstance(value, unicode):
                    value = headers.encode(value)
                h.append((name, value))
class WebClient:
    "Minimal webservice client to do POST request with multipart encoded FORM data"

    def __init__(self, location, enctype="multipart/form-data", trace=False,
                 cacert=None):
        kwargs = {}
        if httplib2.__version__ >= '0.7.0':
            kwargs['disable_ssl_certificate_validation'] = cacert is None
            kwargs['ca_certs'] = cacert
        self.http = httplib2.Http('.cache', **kwargs)
        self.trace = trace
        self.location = location
        self.enctype = enctype
        self.cookies = None
        self.method = "POST"
        self.referer = None

    def multipart_encode(self, vars):
        "Encode form data (vars dict)"
        boundary = mimetools.choose_boundary()
        buf = StringIO()
        for key, value in vars.items():
            if not isinstance(value, file):
                buf.write('--%s\r\n' % boundary)
                buf.write('Content-Disposition: form-data; name="%s"' % key)
                buf.write('\r\n\r\n' + value + '\r\n')
            else:
                fd = value
                file_size = os.fstat(fd.fileno())[stat.ST_SIZE]
                filename = fd.name.split('/')[-1]
                contenttype = mimetypes.guess_type(filename)[0] \
                    or 'application/octet-stream'
                buf.write('--%s\r\n' % boundary)
                buf.write('Content-Disposition: form-data; '
                          'name="%s"; filename="%s"\r\n' % (key, filename))
                buf.write('Content-Type: %s\r\n' % contenttype)
                # buffer += 'Content-Length: %s\r\n' % file_size
                fd.seek(0)
                buf.write('\r\n' + fd.read() + '\r\n')
        buf.write('--' + boundary + '--\r\n\r\n')
        buf = buf.getvalue()
        return boundary, buf

    def __call__(self, **vars):
        "Perform a GET/POST request and return the response"
        location = self.location
        if self.method == "GET":
            location += "?%s" % urlencode(vars)
        # prepare the request content suitable to be sent to the server:
        if self.enctype == "multipart/form-data":
            boundary, body = self.multipart_encode(vars)
            content_type = '%s; boundary=%s' % (self.enctype, boundary)
        elif self.enctype == "application/x-www-form-urlencoded":
            body = urlencode(vars)
            content_type = self.enctype
        # add headers according to method, cookies, etc.:
        headers = {}
        if self.method == "POST":
            headers.update({
                'Content-type': content_type,
                'Content-length': str(len(body)),
            })
        if self.cookies:
            headers['Cookie'] = self.cookies.output(attrs=(), header="",
                                                    sep=";")
        if self.referer:
            headers['Referer'] = self.referer
        if self.trace:
            print "-" * 80
            print "%s %s" % (self.method, location)
            print '\n'.join(["%s: %s" % (k, v) for k, v in headers.items()])
            print "\n%s" % body
        # send the request to the server and store the result:
        response, content = self.http.request(location, self.method,
                                              body=body, headers=headers)
        self.response = response
        self.content = content
        if self.trace:
            print
            print '\n'.join(["%s: %s" % (k, v) for k, v in response.items()])
            print content
            print "=" * 80
        # Parse and store the cookies (if any)
        if "set-cookie" in self.response:
            if not self.cookies:
                self.cookies = SimpleCookie()
            self.cookies.load(self.response["set-cookie"])
        return content
class Request(object):
    """An HTTP request.

    This object represents the metadata of an HTTP request message;
    that is, it contains attributes which describe the environment
    in which the request URL, headers, and body were sent (if you
    want tools to interpret the headers and body, those are elsewhere,
    mostly in Tools).

    This 'metadata' consists of socket data, transport characteristics,
    and the Request-Line. This object also contains data regarding the
    configuration in effect for the given URL, and the execution plan
    for generating a response.
    """

    __metaclass__ = cherrypy._AttributeDocstrings

    prev = None
    prev__doc = """
    The previous Request object (if any). This should be None
    unless we are processing an InternalRedirect."""

    # Conversation/connection attributes
    local = httputil.Host("127.0.0.1", 80)
    local__doc = \
        "An httputil.Host(ip, port, hostname) object for the server socket."

    remote = httputil.Host("127.0.0.1", 1111)
    remote__doc = \
        "An httputil.Host(ip, port, hostname) object for the client socket."

    scheme = "http"
    scheme__doc = """
    The protocol used between client and server. In most cases,
    this will be either 'http' or 'https'."""

    server_protocol = "HTTP/1.1"
    server_protocol__doc = """
    The HTTP version for which the HTTP server is at least
    conditionally compliant."""

    base = ""
    base__doc = """The (scheme://host) portion of the requested URL.
    In some cases (e.g. when proxying via mod_rewrite), this may contain
    path segments which cherrypy.url uses when constructing url's, but
    which otherwise are ignored by CherryPy. Regardless, this value
    MUST NOT end in a slash."""

    # Request-Line attributes
    request_line = ""
    request_line__doc = """
    The complete Request-Line received from the client. This is a
    single string consisting of the request method, URI, and protocol
    version (joined by spaces). Any final CRLF is removed."""

    method = "GET"
    method__doc = """
    Indicates the HTTP method to be performed on the resource identified
    by the Request-URI. Common methods include GET, HEAD, POST, PUT, and
    DELETE. CherryPy allows any extension method; however, various HTTP
    servers and gateways may restrict the set of allowable methods.
    CherryPy applications SHOULD restrict the set (on a per-URI basis)."""

    query_string = ""
    query_string__doc = """
    The query component of the Request-URI, a string of information to be
    interpreted by the resource. The query portion of a URI follows the
    path component, and is separated by a '?'. For example, the URI
    'http://www.cherrypy.org/wiki?a=3&b=4' has the query component,
    'a=3&b=4'."""

    query_string_encoding = 'utf8'
    query_string_encoding__doc = """
    The encoding expected for query string arguments after % HEX HEX decoding).
    If a query string is provided that cannot be decoded with this encoding,
    404 is raised (since technically it's a different URI). If you want
    arbitrary encodings to not error, set this to 'Latin-1'; you can then
    encode back to bytes and re-decode to whatever encoding you like later.
    """

    protocol = (1, 1)
    protocol__doc = """The HTTP protocol version corresponding to the set
    of features which should be allowed in the response. If BOTH
    the client's request message AND the server's level of HTTP compliance
    is HTTP/1.1, this attribute will be the tuple (1, 1). If either is 1.0,
    this attribute will be the tuple (1, 0). Lower HTTP protocol versions
    are not explicitly supported."""

    params = {}
    params__doc = """
    A dict which combines query string (GET) and request entity (POST)
    variables.
    This is populated in two stages: GET params are added before the
    'on_start_resource' hook, and POST params are added between the
    'before_request_body' and 'before_handler' hooks."""

    # Message attributes
    header_list = []
    header_list__doc = """
    A list of the HTTP request headers as (name, value) tuples.
    In general, you should use request.headers (a dict) instead."""

    headers = httputil.HeaderMap()
    headers__doc = """
    A dict-like object containing the request headers. Keys are header
    names (in Title-Case format); however, you may get and set them in
    a case-insensitive manner. That is, headers['Content-Type'] and
    headers['content-type'] refer to the same value. Values are header
    values (decoded according to RFC 2047 if necessary).

    See also: httputil.HeaderMap, httputil.HeaderElement."""

    cookie = SimpleCookie()
    cookie__doc = """See help(Cookie)."""

    body = None
    body__doc = """See help(cherrypy.request.body)"""

    rfile = None
    rfile__doc = """
    If the request included an entity (body), it will be available
    as a stream in this attribute. However, the rfile will normally
    be read for you between the 'before_request_body' hook and the
    'before_handler' hook, and the resulting string is placed into
    either request.params or the request.body attribute.

    You may disable the automatic consumption of the rfile by setting
    request.process_request_body to False, either in config for the desired
    path, or in an 'on_start_resource' or 'before_request_body' hook.

    WARNING: In almost every case, you should not attempt to read from the
    rfile stream after CherryPy's automatic mechanism has read it. If you
    turn off the automatic parsing of rfile, you should read exactly the
    number of bytes specified in request.headers['Content-Length'].
    Ignoring either of these warnings may result in a hung request thread
    or in corruption of the next (pipelined) request.
    """

    process_request_body = True
    process_request_body__doc = """
    If True, the rfile (if any) is automatically read and parsed,
    and the result placed into request.params or request.body."""

    methods_with_bodies = ("POST", "PUT")
    methods_with_bodies__doc = """
    A sequence of HTTP methods for which CherryPy will automatically
    attempt to read a body from the rfile."""

    body = None
    body__doc = """
    If the request Content-Type is 'application/x-www-form-urlencoded'
    or multipart, this will be None. Otherwise, this will contain the
    request entity body as an open file object (which you can .read());
    this value is set between the 'before_request_body' and
    'before_handler' hooks (assuming that process_request_body is True)."""

    body_params = None
    body_params__doc = """
    If the request Content-Type is 'application/x-www-form-urlencoded' or
    multipart, this will be a dict of the params pulled from the entity
    body; that is, it will be the portion of request.params that come
    from the message body (sometimes called "POST params", although they
    can be sent with various HTTP method verbs). This value is set between
    the 'before_request_body' and 'before_handler' hooks (assuming that
    process_request_body is True)."""

    # Dispatch attributes
    dispatch = cherrypy.dispatch.Dispatcher()
    dispatch__doc = """
    The object which looks up the 'page handler' callable and collects
    config for the current request based on the path_info, other
    request attributes, and the application architecture. The core
    calls the dispatcher as early as possible, passing it a 'path_info'
    argument.
    The default dispatcher discovers the page handler by matching path_info
    to a hierarchical arrangement of objects, starting at request.app.root.
    See help(cherrypy.dispatch) for more information."""

    script_name = ""
    script_name__doc = """
    The 'mount point' of the application which is handling this request.

    This attribute MUST NOT end in a slash. If the script_name refers to
    the root of the URI, it MUST be an empty string (not "/").
    """

    path_info = "/"
    path_info__doc = """
    The 'relative path' portion of the Request-URI. This is relative
    to the script_name ('mount point') of the application which is
    handling this request."""

    login = None
    login__doc = """
    When authentication is used during the request processing this is
    set to 'False' if it failed and to the 'username' value if it
    succeeded. The default 'None' implies that no authentication
    happened."""

    # Note that cherrypy.url uses "if request.app:" to determine whether
    # the call is during a real HTTP request or not. So leave this None.
    app = None
    app__doc = \
        """The cherrypy.Application object which is handling this request."""

    handler = None
    handler__doc = """
    The function, method, or other callable which CherryPy will call to
    produce the response. The discovery of the handler and the arguments
    it will receive are determined by the request.dispatch object.
    By default, the handler is discovered by walking a tree of objects
    starting at request.app.root, and is then passed all HTTP params
    (from the query string and POST body) as keyword arguments."""

    toolmaps = {}
    toolmaps__doc = """
    A nested dict of all Toolboxes and Tools in effect for this request,
    of the form: {Toolbox.namespace: {Tool.name: config dict}}."""

    config = None
    config__doc = """
    A flat dict of all configuration entries which apply to the
    current request. These entries are collected from global config,
    application config (based on request.path_info), and from handler
    config (exactly how is governed by the request.dispatch object in
    effect for this request; by default, handler config can be attached
    anywhere in the tree between request.app.root and the final handler,
    and inherits downward)."""

    is_index = None
    is_index__doc = """
    This will be True if the current request is mapped to an 'index'
    resource handler (also, a 'default' handler if path_info ends with
    a slash). The value may be used to automatically redirect the
    user-agent to a 'more canonical' URL which either adds or removes
    the trailing slash. See cherrypy.tools.trailing_slash."""

    hooks = HookMap(hookpoints)
    hooks__doc = """
    A HookMap (dict-like object) of the form: {hookpoint: [hook, ...]}.
    Each key is a str naming the hook point, and each value is a list
    of hooks which will be called at that hook point during this request.
    The list of hooks is generally populated as early as possible (mostly
    from Tools specified in config), but may be extended at any time.
    See also: _cprequest.Hook, _cprequest.HookMap, and cherrypy.tools."""

    error_response = cherrypy.HTTPError(500).set_response
    error_response__doc = """
    The no-arg callable which will handle unexpected, untrapped errors
    during request processing. This is not used for expected exceptions
    (like NotFound, HTTPError, or HTTPRedirect) which are raised in
    response to expected conditions (those should be customized either
    via request.error_page or by overriding HTTPError.set_response).
    By default, error_response uses HTTPError(500) to return a generic
    error response to the user-agent."""

    error_page = {}
    error_page__doc = """
    A dict of {error code: response filename or callable} pairs. The
    error code must be an int representing a given HTTP error code, or
    the string 'default', which will be used if no matching entry is
    found for a given numeric code.

    If a filename is provided, the file should contain a Python string-
    formatting template, and can expect by default to receive format
    values with the mapping keys %(status)s, %(message)s, %(traceback)s,
    and %(version)s. The set of format mappings can be extended by
    overriding HTTPError.set_response.

    If a callable is provided, it will be called by default with keyword
    arguments 'status', 'message', 'traceback', and 'version', as for a
    string-formatting template. The callable must return a string or
    iterable of strings which will be set to response.body. It may also
    override headers or perform any other processing.

    If no entry is given for an error code, and no 'default' entry exists,
    a default template will be used.
    """

    show_tracebacks = True
    show_tracebacks__doc = """
    If True, unexpected errors encountered during request processing will
    include a traceback in the response body."""

    show_mismatched_params = True
    show_mismatched_params__doc = """
    If True, mismatched parameters encountered during PageHandler invocation
    processing will be included in the response body."""

    throws = (KeyboardInterrupt, SystemExit, cherrypy.InternalRedirect)
    throws__doc = \
        """The sequence of exceptions which Request.run does not trap."""

    throw_errors = False
    throw_errors__doc = """
    If True, Request.run will not trap any errors (except HTTPRedirect and
    HTTPError, which are more properly called 'exceptions', not errors)."""

    closed = False
    closed__doc = """
    True once the close method has been called, False otherwise."""

    stage = None
    stage__doc = """
    A string containing the stage reached in the request-handling process.
    This is useful when debugging a live server with hung requests."""

    namespaces = _cpconfig.NamespaceSet(
        **{"hooks": hooks_namespace,
           "request": request_namespace,
           "response": response_namespace,
           "error_page": error_page_namespace,
           "tools": cherrypy.tools,
           })

    def __init__(self, local_host, remote_host, scheme="http",
                 server_protocol="HTTP/1.1"):
        """Populate a new Request object.

        local_host should be an httputil.Host object with the server info.
        remote_host should be an httputil.Host object with the client info.
        scheme should be a string, either "http" or "https".
        """
        self.local = local_host
        self.remote = remote_host
        self.scheme = scheme
        self.server_protocol = server_protocol

        self.closed = False

        # Put a *copy* of the class error_page into self.
        self.error_page = self.error_page.copy()

        # Put a *copy* of the class namespaces into self.
        self.namespaces = self.namespaces.copy()

        self.stage = None

    def close(self):
        """Run cleanup code. (Core)"""
        if not self.closed:
            self.closed = True
            self.stage = 'on_end_request'
            self.hooks.run('on_end_request')
            self.stage = 'close'

    def run(self, method, path, query_string, req_protocol, headers, rfile):
        """Process the Request. (Core)

        method, path, query_string, and req_protocol should be pulled
        directly from the Request-Line (e.g. "GET /path?key=val HTTP/1.0").

        path should be %XX-unquoted, but query_string should not be.
        They both MUST be byte strings, not unicode strings.

        headers should be a list of (name, value) tuples.

        rfile should be a file-like object containing the HTTP request
        entity.
        When run() is done, the returned object should have 3 attributes:

          status, e.g. "200 OK"
          header_list, a list of (name, value) tuples
          body, an iterable yielding strings

        Consumer code (HTTP servers) should then access these response
        attributes to build the outbound stream.
        """
        response = cherrypy.serving.response
        self.stage = 'run'
        try:
            self.error_response = cherrypy.HTTPError(500).set_response

            self.method = method
            path = path or "/"
            self.query_string = query_string or ''
            self.params = {}

            # Compare request and server HTTP protocol versions, in case our
            # server does not support the requested protocol. Limit our output
            # to min(req, server). We want the following output:
            #     request    server     actual written   supported response
            #     protocol   protocol  response protocol    feature set
            # a     1.0        1.0           1.0                1.0
            # b     1.0        1.1           1.1                1.0
            # c     1.1        1.0           1.0                1.0
            # d     1.1        1.1           1.1                1.1
            # Notice that, in (b), the response will be "HTTP/1.1" even though
            # the client only understands 1.0. RFC 2616 10.5.6 says we should
            # only return 505 if the _major_ version is different.
            rp = int(req_protocol[5]), int(req_protocol[7])
            sp = int(self.server_protocol[5]), int(self.server_protocol[7])
            self.protocol = min(rp, sp)
            response.headers.protocol = self.protocol

            # Rebuild first line of the request (e.g. "GET /path HTTP/1.0").
            url = path
            if query_string:
                url += '?' + query_string
            self.request_line = '%s %s %s' % (method, url, req_protocol)

            self.header_list = list(headers)
            self.headers = httputil.HeaderMap()

            self.rfile = rfile
            self.body = None

            self.cookie = SimpleCookie()
            self.handler = None

            # path_info should be the path from the
            # app root (script_name) to the handler.
            self.script_name = self.app.script_name
            self.path_info = pi = path[len(self.script_name):]

            self.stage = 'respond'
            self.respond(pi)

        except self.throws:
            raise
        except:
            if self.throw_errors:
                raise
            else:
                # Failure in setup, error handler or finalize. Bypass them.
                # Can't use handle_error because we may not have hooks yet.
                cherrypy.log(traceback=True, severity=40)
                if self.show_tracebacks:
                    body = format_exc()
                else:
                    body = ""
                r = bare_error(body)
                response.output_status, response.header_list, response.body = r

        if self.method == "HEAD":
            # HEAD requests MUST NOT return a message-body in the response.
            response.body = []

        try:
            cherrypy.log.access()
        except:
            cherrypy.log.error(traceback=True)

        if response.timed_out:
            raise cherrypy.TimeoutError()

        return response

    # Uncomment for stage debugging
    # stage = property(lambda self: self._stage, lambda self, v: print(v))

    def respond(self, path_info):
        """Generate a response for the resource at self.path_info. (Core)"""
        response = cherrypy.serving.response
        try:
            try:
                try:
                    if self.app is None:
                        raise cherrypy.NotFound()

                    # Get the 'Host' header, so we can HTTPRedirect properly.
                    self.stage = 'process_headers'
                    self.process_headers()

                    # Make a copy of the class hooks
                    self.hooks = self.__class__.hooks.copy()
                    self.toolmaps = {}

                    self.stage = 'get_resource'
                    self.get_resource(path_info)

                    self.body = _cpreqbody.RequestBody(
                        self.rfile, self.headers, request_params=self.params)

                    self.namespaces(self.config)

                    self.stage = 'on_start_resource'
                    self.hooks.run('on_start_resource')

                    # Parse the querystring
                    self.stage = 'process_query_string'
                    self.process_query_string()

                    # Process the body
                    if self.process_request_body:
                        if self.method not in self.methods_with_bodies:
                            self.process_request_body = False
                    self.stage = 'before_request_body'
                    self.hooks.run('before_request_body')
                    if self.process_request_body:
                        self.body.process()

                    # Run the handler
                    self.stage = 'before_handler'
                    self.hooks.run('before_handler')
                    if self.handler:
                        self.stage = 'handler'
                        response.body = self.handler()

                    # Finalize
                    self.stage = 'before_finalize'
                    self.hooks.run('before_finalize')
                    response.finalize()
                except (cherrypy.HTTPRedirect, cherrypy.HTTPError), inst:
                    inst.set_response()
                    self.stage = 'before_finalize (HTTPError)'
                    self.hooks.run('before_finalize')
                    response.finalize()
            finally:
                self.stage = 'on_end_resource'
                self.hooks.run('on_end_resource')
        except self.throws:
            raise
        except:
            if self.throw_errors:
                raise
            self.handle_error()

    def process_query_string(self):
        """Parse the query string into Python structures. (Core)"""
        try:
            p = httputil.parse_query_string(
                self.query_string, encoding=self.query_string_encoding)
        except UnicodeDecodeError:
            raise cherrypy.HTTPError(
                404, "The given query string could not be processed. Query "
                "strings for this resource must be encoded with %r." %
                self.query_string_encoding)

        # Python 2 only: keyword arguments must be byte strings (type 'str').
        for key, value in p.items():
            if isinstance(key, unicode):
                del p[key]
                p[key.encode(self.query_string_encoding)] = value

        self.params.update(p)

    def process_headers(self):
        """Parse HTTP header data into Python structures. (Core)"""
        # Process the headers into self.headers
        headers = self.headers
        for name, value in self.header_list:
            # Call title() now (and use dict.__method__(headers))
            # so title doesn't have to be called twice.
            name = name.title()
            value = value.strip()

            # Warning: if there is more than one header entry for cookies
            # (AFAIK, only Konqueror does that), only the last one will
            # remain in headers (but they will be correctly stored in
            # request.cookie).
            if "=?" in value:
                dict.__setitem__(headers, name, httputil.decode_TEXT(value))
            else:
                dict.__setitem__(headers, name, value)

            # Handle cookies differently because on Konqueror, multiple
            # cookies come on different lines with the same key
            if name == 'Cookie':
                try:
                    self.cookie.load(value)
                except CookieError:
                    msg = "Illegal cookie name %s" % value.split('=')[0]
                    raise cherrypy.HTTPError(400, msg)

        if not dict.__contains__(headers, 'Host'):
            # All Internet-based HTTP/1.1 servers MUST respond with a 400
            # (Bad Request) status code to any HTTP/1.1 request message
            # which lacks a Host header field.
            if self.protocol >= (1, 1):
                msg = "HTTP/1.1 requires a 'Host' request header."
                raise cherrypy.HTTPError(400, msg)
        host = dict.get(headers, 'Host')
        if not host:
            host = self.local.name or self.local.ip
        self.base = "%s://%s" % (self.scheme, host)

    def get_resource(self, path):
        """Call a dispatcher (which sets self.handler and .config). (Core)"""
        # First, see if there is a custom dispatch at this URI. Custom
        # dispatchers can only be specified in app.config, not in _cp_config
        # (since custom dispatchers may not even have an app.root).
        dispatch = self.app.find_config(path, "request.dispatch", self.dispatch)

        # dispatch() should set self.handler and self.config
        dispatch(path)

    def handle_error(self):
        """Handle the last unanticipated exception. (Core)"""
        try:
            self.hooks.run("before_error_response")
            if self.error_response:
                self.error_response()
            self.hooks.run("after_error_response")
            cherrypy.serving.response.finalize()
        except cherrypy.HTTPRedirect, inst:
            inst.set_response()
            cherrypy.serving.response.finalize()
class Request(object):
    """Creates a new Request object to hold information about a request.

    :param sock: The socket object of the request.
    :type sock: socket.socket

    :param method: The requested method.
    :type method: str

    :param scheme: The requested scheme.
    :type scheme: str

    :param path: The requested path.
    :type path: str

    :param protocol: The requested protocol.
    :type protocol: str

    :param qs: The query string of the request.
    :type qs: str
    """

    server = None
    """@cvar: A reference to the underlying server"""

    scheme = "http"
    protocol = (1, 1)
    server_protocol = (1, 1)
    host = ""
    local = Host("127.0.0.1", 80)
    remote = Host("127.0.0.1", 1111)

    xhr = False
    index = None
    script_name = ""
    login = None
    handled = False

    def __init__(self, sock, method, scheme, path, protocol, qs):
        "initializes x; see x.__class__.__doc__ for signature"
        self.sock = sock
        self.method = method
        self.scheme = scheme or Request.scheme
        self.path = path
        self.protocol = protocol
        self.qs = qs
        self.cookie = SimpleCookie()
        self._headers = None

        if sock:
            name = sock.getpeername()
            if name:
                self.remote = Host(*name)
            else:
                name = sock.getsockname()
                self.remote = Host(name, "", name)

        self.body = StringIO()

    def _getHeaders(self):
        return self._headers

    def _setHeaders(self, headers):
        self._headers = headers
        if "Cookie" in self.headers:
            self.cookie.load(self.headers["Cookie"])
        host = self.headers.get("Host", None)
        if not host:
            host = self.local.name or self.local.ip
        self.base = "%s://%s" % (self.scheme, host)
        self.xhr = self.headers.get("X-Requested-With", "").lower() == \
            "xmlhttprequest"

    headers = property(_getHeaders, _setHeaders)

    def __repr__(self):
        protocol = "HTTP/%d.%d" % self.protocol
        return "<Request %s %s %s>" % (self.method, self.path, protocol)

    def url(self):
        return url(self)
def track_page_view(request): """ // Track a page view, updates all the cookies and campaign tracker, // makes a server side request to Google Analytics and writes the transparent // gif byte data to the response. """ environ = request.environ time_tup = time.localtime(time.time() + COOKIE_USER_PERSISTENCE) # set some useful items in environ: environ['COOKIES'] = parse_cookie(environ.get('HTTP_COOKIE', '')) environ['GET'] = {} for key, value in parse_qsl(environ.get('QUERY_STRING', ''), True): environ['GET'][key] = value # we only have one value per key name, right? :) x_utmac = environ['GET'].get('x_utmac', None) domain = environ.get('HTTP_HOST', '') # Get the referrer from the HTTP Referer header; this is the referrer to the # page that contains the tracking pixel, not the referrer for the tracking # pixel. document_referer = environ.get("HTTP_REFERER", "") if not document_referer or document_referer == "0": document_referer = "-" else: document_referer = unquote(document_referer) document_path = request.url if document_path: document_path = unquote(document_path) account = environ.get('UTMAC', 'UA-29152694-1') user_agent = environ.get("HTTP_USER_AGENT", '') # // Try and get visitor cookie from the request. cookie = environ['COOKIES'].get(COOKIE_NAME) visitor_id = get_visitor_id(environ.get("HTTP_X_DCMGUID", ''), account, user_agent, cookie) # // Always try and add the cookie to the response. cookie = SimpleCookie() cookie[COOKIE_NAME] = visitor_id morsel = cookie[COOKIE_NAME] morsel['expires'] = time.strftime('%a, %d-%b-%Y %H:%M:%S %Z', time_tup) morsel['path'] = COOKIE_PATH utm_gif_location = "http://www.google-analytics.com/__utm.gif" for utmac in [account, x_utmac]: if not utmac: continue # ignore empty utmacs # // Construct the gif hit url. utm_url = "".join( (utm_gif_location, "?", "utmwv=", VERSION, "&utmn=", get_random_number(), "&utmhn=", quote(domain), "&utmsr=", environ['GET'].get('utmsr', ''), "&utme=", environ['GET'].get('utme', ''), "&utmr=", quote(document_referer), "&utmp=", quote(document_path), "&utmac=", utmac, "&utmcc=__utma%3D999.999.999.999.999.1%3B", "&utmvid=", visitor_id, "&utmip=", get_ip(environ.get("REMOTE_ADDR", '')))) logger.debug("utm_url: " + utm_url) # send_request_to_google_analytics(utm_url, environ) return utm_url
def run(self, method, path, query_string, req_protocol, headers, rfile): """Process the Request. (Core) method, path, query_string, and req_protocol should be pulled directly from the Request-Line (e.g. "GET /path?key=val HTTP/1.0"). path should be %XX-unquoted, but query_string should not be. They both MUST be byte strings, not unicode strings. headers should be a list of (name, value) tuples. rfile should be a file-like object containing the HTTP request entity. When run() is done, the returned object should have 3 attributes: status, e.g. "200 OK" header_list, a list of (name, value) tuples body, an iterable yielding strings Consumer code (HTTP servers) should then access these response attributes to build the outbound stream. """ response = cherrypy.serving.response self.stage = 'run' try: self.error_response = cherrypy.HTTPError(500).set_response self.method = method path = path or "/" self.query_string = query_string or '' self.params = {} # Compare request and server HTTP protocol versions, in case our # server does not support the requested protocol. Limit our output # to min(req, server). We want the following output: # request server actual written supported response # protocol protocol response protocol feature set # a 1.0 1.0 1.0 1.0 # b 1.0 1.1 1.1 1.0 # c 1.1 1.0 1.0 1.0 # d 1.1 1.1 1.1 1.1 # Notice that, in (b), the response will be "HTTP/1.1" even though # the client only understands 1.0. RFC 2616 10.5.6 says we should # only return 505 if the _major_ version is different. rp = int(req_protocol[5]), int(req_protocol[7]) sp = int(self.server_protocol[5]), int(self.server_protocol[7]) self.protocol = min(rp, sp) response.headers.protocol = self.protocol # Rebuild first line of the request (e.g. "GET /path HTTP/1.0"). url = path if query_string: url += '?' + query_string self.request_line = '%s %s %s' % (method, url, req_protocol) self.header_list = list(headers) self.headers = httputil.HeaderMap() self.rfile = rfile self.body = None self.cookie = SimpleCookie() self.handler = None # path_info should be the path from the # app root (script_name) to the handler. self.script_name = self.app.script_name self.path_info = pi = path[len(self.script_name):] self.stage = 'respond' self.respond(pi) except self.throws: raise except: if self.throw_errors: raise else: # Failure in setup, error handler or finalize. Bypass them. # Can't use handle_error because we may not have hooks yet. cherrypy.log(traceback=True, severity=40) if self.show_tracebacks: body = format_exc() else: body = "" r = bare_error(body) response.output_status, response.header_list, response.body = r if self.method == "HEAD": # HEAD requests MUST NOT return a message-body in the response. response.body = [] try: cherrypy.log.access() except: cherrypy.log.error(traceback=True) if response.timed_out: raise cherrypy.TimeoutError() return response
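The version arithmetic in run() is terse, so here is a small illustration of what it does: "HTTP/x.y" is sliced at fixed offsets 5 and 7 (which only works for single-digit versions), and tuple comparison lets min() pick the lower feature set, matching row (b) of the comment table in run() above.

req_protocol = "HTTP/1.0"
server_protocol = "HTTP/1.1"

# "HTTP/x.y" -> (x, y); offsets 5 and 7 pick the digits around the dot.
rp = int(req_protocol[5]), int(req_protocol[7])        # (1, 0)
sp = int(server_protocol[5]), int(server_protocol[7])  # (1, 1)

# Tuples compare element-wise, so min() yields the lower protocol.
assert min(rp, sp) == (1, 0)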
def COOKIES(self): if not self._COOKIES: self._COOKIES = SimpleCookie() return self._COOKIES
def __init__(self, input=None): SimpleCookie.__init__(self, input)
def __init__(self): self._cookies = SimpleCookie()
class Retriever(object): 'HTTP client.' def __init__(self, user_agent=None, timeout=20, sleep=0, disable_ssl_certificate_validation=True): # sleep should be the number of seconds to sleep between requests. self.h = httplib2.Http(timeout=timeout, disable_ssl_certificate_validation=disable_ssl_certificate_validation) self.h.force_exception_to_status_code = False self.h.follow_redirects = False self.user_agent = user_agent or 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)' self._cookies = SimpleCookie() self.logger = logging.getLogger('eb.retrieval.retriever') self.sleep = sleep # Keep track of whether we've downloaded any pages yet. # This makes sure we don't sleep before the very first requested page. self.page_downloaded = False def clear_cookies(self): self._cookies = SimpleCookie() def get_html_and_headers(self, uri, data=None, headers=None, send_cookies=True, follow_redirects=True, raise_on_error=True, basic_auth=None): "Retrieves the resource and returns a tuple of (content, header dictionary)." # Sleep, if necessary, but only if a page has already been downloaded # with this retriever. (We don't want to sleep before the very first # request that a retriever makes, because that would be unnecessary.) if self.sleep and self.page_downloaded: self.logger.debug('Sleeping for %s seconds', self.sleep) time.sleep(self.sleep) self.page_downloaded = True # Prepare the request. if not headers: headers = {} headers['user-agent'] = headers.get('user-agent', self.user_agent) # Take care of cookie header, if necessary. if send_cookies and self._cookies: # Some broken ASP.NET servers put "\r\n" in there, so we replace # that with semicolon to get proper behavior. headers['Cookie'] = self._cookies.output(attrs=[], header='').strip().replace('\r\n', ';') method = data and "POST" or "GET" body = urlencode(data) if isinstance(data, dict) else data if method == "POST" and body and 'Content-Type' not in headers: headers.setdefault('Content-Type', 'application/x-www-form-urlencoded') # Get the response. resp_headers = None for attempt_number in range(3): self.logger.debug('Attempt %s: %s', attempt_number + 1, method) try: resp_headers, content = self.h.request(uri, method, body=body, headers=headers) if resp_headers['status'] == '500': self.logger.debug("Request got a 500 error: %s", method) continue # Try again. break except socket.timeout: self.logger.debug("Request timed out after %s seconds: %s", self.h.timeout, method) continue # Try again. except socket.error, e: self.logger.debug("Got socket error: %s", e) continue # Try again. except httplib2.ServerNotFoundError: raise RetrievalError("Could not %s %r: server not found" % (method, uri))
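A hedged usage sketch of the Retriever above. The URL and form fields are illustrative, and it assumes the truncated tail of get_html_and_headers stores any Set-Cookie response headers back into self._cookies, which is what the send_cookies branch implies.

retriever = Retriever(timeout=10, sleep=2)

# POST a login form; passing a dict as `data` makes the method "POST"
# and urlencodes the body.
html, resp_headers = retriever.get_html_and_headers(
    'http://example.com/login',                 # hypothetical URL
    data={'username': 'u', 'password': 'p'})

# Cookies captured from the login response are replayed here because
# send_cookies defaults to True.
page, _ = retriever.get_html_and_headers('http://example.com/account')

retriever.clear_cookies()  # drop session state between jobs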
def convertAsync(self, config, connectionSettings=None): if (config != None): config['clientName'] = "PYTHON" config['clientVersion'] = PDFreactor.VERSION url = self.url + "/convert/async.json" if (self.apiKey != None): url += '?apiKey=' + self.apiKey result = "" if (connectionSettings != None and 'headers' in connectionSettings and len(connectionSettings['headers'].keys()) == 0): headers = connectionSettings['headers'] else: headers = {} if (connectionSettings != None and 'headers' in connectionSettings): for (key, value) in connectionSettings['headers'].items(): lcKey = key.lower() if lcKey != "content-type" and lcKey != "range" and lcKey != "user-agent": headers[key] = value headers['Content-Type'] = 'application/json' if (connectionSettings != None and 'cookies' in connectionSettings): headers['Cookie'] = '; '.join([ '%s=%s' % (key, value) for (key, value) in connectionSettings['cookies'].items() ]) headers['User-Agent'] = 'PDFreactor Python API v7' headers['X-RO-User-Agent'] = 'PDFreactor Python API v7' req = None if sys.version_info[0] == 2: from urllib2 import HTTPError else: from urllib.error import HTTPError try: if sys.version_info[0] == 2: import Cookie from Cookie import SimpleCookie import urllib2 from urllib2 import URLError options = json.dumps(config) req = urllib2.Request(url, options, headers) response = urllib2.urlopen(req) else: import http.cookies from http.cookies import SimpleCookie import urllib.request options = json.dumps(config) req = urllib.request.Request(url, options.encode(), headers) response = urllib.request.urlopen(req) except HTTPError as e: if (e.code == 422): raise Exception(json.loads(e.read().decode('utf-8'))['error']) elif (e.code == 400): raise Exception('Invalid client data. ' + json.loads(e.read().decode('utf-8'))['error']) elif (e.code == 401): raise Exception('Unauthorized. ' + json.loads(e.read().decode('utf-8'))['error']) elif (e.code == 413): raise Exception('The configuration is too large to process.') elif (e.code == 500): raise Exception(json.loads(e.read().decode('utf-8'))['error']) elif (e.code == 503): raise Exception('Asynchronous conversions are unavailable.') elif (e.code > 400): raise Exception('PDFreactor Web Service error (status: ' + str(e.code) + ').') except Exception as e: msg = e if hasattr(e, 'reason'): msg = e.reason raise Exception( 'Error connecting to PDFreactor Web Service at ' + self.url + '. Please make sure the PDFreactor Web Service is installed and running (Error: ' + str(msg) + ')') documentId = None if (response != None and response.info() != None): location = response.info().get("Location") if (location != None): documentId = location[location.rfind("/") + 1:len(location)] cookieHeader = response.info().get("Set-Cookie") if (cookieHeader != None and connectionSettings != None): if ('cookies' not in connectionSettings): connectionSettings['cookies'] = {} cookiesObj = SimpleCookie() cookiesObj.load(cookieHeader) for name in cookiesObj: connectionSettings['cookies'][name] = cookiesObj[name].value return documentId
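A hedged usage sketch of convertAsync(): passing the same connectionSettings dict across calls is how the client persists the web service's session cookie, per the Set-Cookie handling at the end of the method. The constructor call and config keys shown are assumptions, not the full PDFreactor API.

reactor = PDFreactor()            # assumes a default service URL
settings = {'headers': {}, 'cookies': {}}
config = {'document': '<html><body>Hello world</body></html>'}

document_id = reactor.convertAsync(config, settings)
# settings['cookies'] now contains any session cookie the service set;
# reusing `settings` on later calls sends it back as a Cookie header.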
def __init__(self, host, port=None): self.host = host self.port = port self.response = None self.cookies = SimpleCookie()
def get_cookie(self): if 'HTTP_COOKIE' in self.environ: return SimpleCookie(self.environ['HTTP_COOKIE']) return None
def test_signed_out_user_is_anonymous(self): self.make_participant('alice') alice = User.from_username('alice') assert not alice.ANON alice.sign_out(SimpleCookie()) assert alice.ANON
def cookies(self): """ Cookies parsed into a :class:`FormsDict`. Signed cookies are NOT decoded. Use :meth:`get_cookie` if you expect signed cookies. """ cookies = SimpleCookie(self.environ.get('HTTP_COOKIE', '')).values() return FormsDict((c.key, c.value) for c in cookies)
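A small illustration of the property above, assuming the Python 2 import path used elsewhere in this collection: HTTP_COOKIE carries the raw Cookie header, and SimpleCookie splits it into Morsel objects whose key/value pairs feed the FormsDict.

from Cookie import SimpleCookie  # Python 3: from http.cookies import SimpleCookie

environ = {'HTTP_COOKIE': 'sid=abc123; theme=dark'}
morsels = SimpleCookie(environ.get('HTTP_COOKIE', '')).values()
pairs = sorted((m.key, m.value) for m in morsels)
assert pairs == [('sid', 'abc123'), ('theme', 'dark')]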
def __init__(self): SimpleCookie.__init__(self) self.token = '' self.userid = '' self.passport = '' self._loaded = False
def __init__(self, *args, **kwargs): super(RpcHttpResponse, self).__init__(*args, **kwargs) self.cookies = SimpleCookie()
class HTTPBase(object): def __init__(self, verify=True, ca_bundle=None, key_file=None, cert_file=None): self.request_args = { "allow_redirects": False, } self.cookies = {} self.cookiejar = cookielib.CookieJar() self.request_args["verify"] = verify if ca_bundle: self.request_args["verify"] = ca_bundle if key_file: self.request_args["cert"] = (cert_file, key_file) self.sec = None def _cookies(self): cookie_dict = {} for _, a in list(self.cookiejar._cookies.items()): for _, b in list(a.items()): for cookie in list(b.values()): # print cookie cookie_dict[cookie.name] = cookie.value return cookie_dict def set_cookie(self, kaka, request): """Stores cookielib.Cookie objects in the cookiejar, built from a Set-Cookie header line""" # default rfc2109=False # max-age, httponly for cookie_name, morsel in kaka.items(): std_attr = ATTRS.copy() std_attr["name"] = cookie_name _tmp = morsel.coded_value if _tmp.startswith('"') and _tmp.endswith('"'): std_attr["value"] = _tmp[1:-1] else: std_attr["value"] = _tmp std_attr["version"] = 0 # copy attributes that have values for attr in morsel.keys(): if attr in ATTRS: if morsel[attr]: if attr == "expires": std_attr[attr] = _since_epoch(morsel[attr]) else: std_attr[attr] = morsel[attr] elif attr == "max-age": if morsel["max-age"]: std_attr["expires"] = _since_epoch(morsel["max-age"]) for att, paired in PAIRS.items(): if std_attr[att]: std_attr[paired] = True if std_attr["domain"] and std_attr["domain"].startswith("."): std_attr["domain_initial_dot"] = True if morsel["max-age"] in (0, "0"): # morsel attribute values arrive as strings try: self.cookiejar.clear(domain=std_attr["domain"], path=std_attr["path"], name=std_attr["name"]) except ValueError: pass else: new_cookie = cookielib.Cookie(**std_attr) self.cookiejar.set_cookie(new_cookie) def send(self, url, method="GET", **kwargs): _kwargs = copy.copy(self.request_args) if kwargs: _kwargs.update(kwargs) if self.cookiejar: _kwargs["cookies"] = self._cookies() #logger.info("SENT COOKIEs: %s" % (_kwargs["cookies"],)) try: r = requests.request(method, url, **_kwargs) except requests.ConnectionError, exc: raise ConnectionError("%s" % exc) try: #logger.info("RECEIVED COOKIEs: %s" % (r.headers["set-cookie"],)) self.set_cookie(SimpleCookie(r.headers["set-cookie"]), r) except AttributeError, err: pass
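The morsel handling in set_cookie() above is subtle because SimpleCookie stores every cookie attribute as a string; a short illustration (ATTRS and PAIRS are defined elsewhere in the module this snippet was taken from).

from Cookie import SimpleCookie  # Python 3: from http.cookies import SimpleCookie

kaka = SimpleCookie('sid=xyz; Path=/; Max-Age=0; Domain=.example.com')
morsel = kaka['sid']

assert morsel.value == 'xyz'
assert morsel['path'] == '/'
assert morsel['domain'] == '.example.com'
# Attribute values come back as strings, which is why the max-age
# comparison in set_cookie() must account for "0", not just 0.
assert morsel['max-age'] == '0'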
def __init__(self): logger.debug(locals()) self._cookies = SimpleCookie()
class Scraper(object): primary_key = UNDEFINED # Set to None if the scraper should never look for existing records. schema = None sleep = 0 timeout = 20 update = True # Whether to update old records. # If a record is found with an item_date more than 14 days old, the # pub_date will be set to the item_date. Set to None to bypass this # behavior for an individual scraper (i.e., always set pub_date to # the time the scraper runs). fresh_days = 14 # Set this to False for data sets where we don't have a natural item_date. # In this case, the scraper will set it to datetime.date.now() the first # time it sees a record, and it will stay that way for subsequent updates # to the same record. item_date_available = True # Hook for setting NewsItem.title from values in datadict. By default, just # use the dictionary key 'title'. # Other examples: # "{secondary_type.name}" (for a lookup object) # "News at {street_number} {street_name} {street_suffix}" title_format = u'{title}' def __init__(self, retriever=None): self.retriever = retriever self.logger = logging.getLogger('eb.retrieval.%s' % self.schema) # self._geocoder = SmartGeocoder() self.clear_cache() self.h = httplib2.Http(timeout=20, disable_ssl_certificate_validation=True) self.user_agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)' self._cookies = SimpleCookie() module_name = re.match('^everyblock\.(.*)\.scrape$', self.__module__) if module_name: self.scraper_name = module_name.group(1) else: self.scraper_name = None self.is_dry_run = False self.num_geocode_attempted = 0 self.num_geocode_succeeded = 0 def clear_cache(self): self._metro_object_cache = None self._schema_fields_cache = None self._schema_object_cache = None self.created_newsitem_ids = [] if hasattr(self, '_temp_files'): for filename in self._temp_files: os.unlink(filename) self._temp_files = [] self.cleanup() def cleanup(self): """ This is a hook for cleaning up after the scraper is done. It's guaranteed to be called at the end of run() and dry_run(). """ pass def dry_run(self, prepare=False, save_ungeocoded_addresses=False): """ Run the scraper, but do not create or update any NewsItem objects. If `prepare` is True, this method will geocode location_names and create Lookup objects. If `save_ungeocoded_addresses` is a string value, the string will be treated as a file path, and all addresses that fail to be geocoded will be saved into that file. """ for item in self.dry_run_iter(prepare, save_ungeocoded_addresses): pass def dry_run_iter(self, prepare=False, save_ungeocoded_addresses=False): """ Just like dry_run(), but returns a generator that iterates over the data dictionaries created. 
""" import pprint self.is_dry_run = True self.cache_retriever = CacheRetriever(self) self.start_time = datetime.datetime.now() self.start_date = self.start_time.date() if save_ungeocoded_addresses: self.ungeocoded_addresses = {} self.geocode = self.geocode_and_log try: for datadict in self.data(): if prepare: datadict = self.prepare_data(datadict) pprint.pprint(datadict) yield datadict finally: self.clear_cache() self.logger.info('Geocoding succeeded/attempted: {0}/{1}'.format( self.num_geocode_succeeded, self.num_geocode_attempted)) if save_ungeocoded_addresses: self.create_geocoding_report() def run(self, raise_errors=True): self.logger.info("run() started") self.num_added = self.num_changed = 0 self.start_time = datetime.datetime.now() self.start_date = self.start_time.date() # We use a try/finally here so that the DataUpdate object is created # regardless of whether the scraper raised an exception. results = None filename = self.schema + '-data.txt' ni = [] try: got_error = True for datadict in self.data(): ni.append(self.save(datadict)) got_error = False with open(filename, 'w+') as outfile: json.dump(ni, outfile) except: # Record exceptions in the finally block if raise_errors: raise else: pass finally: exc_type, exc_value, exc_traceback = sys.exc_info() # Rollback, in case the database is in an aborted transaction. This # avoids the "psycopg2.ProgrammingError: current transaction is aborted, # commands ignored until end of transaction block" error. # from django.db import connection # connection._rollback() finish_time = datetime.datetime.now() self.clear_cache() results = Results( schema=self.schema, update_start=self.start_time, update_finish=finish_time, num_added=self.num_added, num_changed=self.num_changed, num_skipped=0, num_hidden=0, got_error=got_error, traceback=''.join([ x for x in traceback.format_exception( exc_type, exc_value, exc_traceback) ]), num_geocode_succeeded=self.num_geocode_succeeded, num_geocode_attempted=self.num_geocode_attempted, ) self.logger.info('Records added: %s', self.num_added) self.logger.info('Records changed: %s', self.num_changed) self.logger.info('Geocoding succeeded/attempted: {0}/{1}'.format( self.num_geocode_succeeded, self.num_geocode_attempted)) return results def prepare_data(self, datadict): # Get/create Lookup objects for lookup fields and set the value to the # Lookup ID. # Set the NewsItem fields in case they don't exist. datadict['description'] = datadict.get('description', '') datadict['url'] = datadict.get('url', '') # Set the NewsItem.pub_date according to the fresh_days value. datadict['pub_date'] = datadict.get('pub_date', self.start_time) if self.fresh_days is not None and ( self.start_date - datadict['item_date']).days >= self.fresh_days: datadict['pub_date'] = datetime.datetime.combine( datadict['item_date'], datetime.time(0, 0)) # Calculate location. datadict['location'] = None # Calculate title. TITLE_MAXLENGTH = 255 datadict['title'] = self.title_format.format(**datadict) if len(datadict['title']) >= TITLE_MAXLENGTH - 3: datadict['title'] = datadict['title'][:TITLE_MAXLENGTH - 3] + u'...' # Convert non-many-to-many Lookup objects back to the IDs. return datadict def save(self, datadict): datadict = self.prepare_data(datadict) #if datadict['location'] is None: # return old_newsitem = None # Special case for item_date_available == False. # Unfortunately this logic can't live in prepare_data() because we # don't have old_newsitem at that point. 
if not self.item_date_available: if old_newsitem is None: datadict['item_date'] = datetime.date.today() else: datadict['item_date'] = old_newsitem.item_date ni = self.create_newsitem(datadict) self.num_added += 1 # self.logger.info(u'Created NewsItem %s (total created in this scrape: %s)', ni.id, self.num_added) # self.created_newsitem_ids.append(ni.id) return ni def create_newsitem(self, datadict): ni = {} #NewsItem.objects.create( #schema=self.schema_object, ni['title'] = datadict['title'] ni['description'] = datadict['description'] ni['url'] = datadict['url'] ni['pub_date'] = str(datadict['pub_date']) ni['item_date'] = str(datadict['item_date']) ni['location'] = datadict['location'] ni['location_name'] = datadict['location_name'] ni['location_id'] = None # Scrapers shouldn't post to locations. In theory. json_data = json.dumps(ni) return json_data def data(self): """ Yields final dictionaries of data, each of which MUST contain the following keys: title -- string item_date -- datetime.date location_name -- string Also, an item_date (datetime.date) must exist, except if you've specified item_date_available=False in the scraper. These keys might also exist: url location_name_geocoder Other data keys correspond to SchemaField.name. If a value of the dictionary is a list or dictionary, it will automatically be converted to JSON before being inserted into the db_attribute table as a string. If a key of the dictionary is a lookup=True SchemaField, then the value should be the Lookup.code value, not the Lookup ID or Lookup object. """ raise NotImplementedError() def broken(self, message): raise ScraperBroken(message) def get(self, uri, *args, **kwargs): "Returns HTML for the given URL and POST data." parse_result = urlparse.urlparse(uri) if parse_result.scheme == 'ftp': return self.ftp_get(parse_result) else: return self.get_html(uri, *args, **kwargs) def get_to_file(self, *args, **kwargs): """ Retrieves the given URL and POST data to a local file. Returns the filename. The Scraper automatically deletes the file when scraping is done. """ filename = self.retriever.get_to_file(*args, **kwargs) self._temp_files.append( filename) # Keep track so we can delete after scrape is done. return filename def cache_get(self, prefix, suffix, url, make_pretty=False, **kwargs): """ Download the file at the given URL and return its contents as a string. If a dry run is in process, save it as a file in a cache directory using the given prefix and suffix. """ if self.is_dry_run: return self.cache_retriever.get(prefix, suffix, url, make_pretty=make_pretty, **kwargs) else: return self.get(url, **kwargs) def cache_get_to_file(self, prefix, suffix, url, **kwargs): """ Download the file at the given URL, save it to disk, and return its filename. If a dry run is in process, save it as a file in a cache directory using the given prefix and suffix. """ if self.is_dry_run: return self.cache_retriever.get_to_file(prefix, suffix, url, **kwargs) else: return self.get_to_file(url, **kwargs) def get_to_file(self, *args, **kwargs): """ Downloads the given URI and saves it to a temporary file. Returns the full filename of the temporary file. (This definition shadows the earlier get_to_file above.)
""" import os from tempfile import mkstemp fd, name = mkstemp() fp = os.fdopen(fd, 'wb') fp.write(self.get_html(*args, **kwargs)) fp.close() return name def get_html(self, uri, data=None, headers=None, send_cookies=True, follow_redirects=True, raise_on_error=True, basic_auth=None): return self.get_html_and_headers(uri, data, headers, send_cookies, follow_redirects, raise_on_error, basic_auth)[0] def get_html_and_headers(self, uri, data=None, headers=None, send_cookies=True, follow_redirects=True, raise_on_error=True, basic_auth=None): if self.sleep and self.page_downloaded: self.logger.debug('Sleeping for %s seconds', self.sleep) time.sleep(self.sleep) self.page_downloaded = True # Prepare the request. if not headers: headers = {} headers['user-agent'] = headers.get('user-agent', self.user_agent) # Take care of basic auth header, if necessary. # See http://en.wikipedia.org/wiki/Basic_access_authentication if basic_auth is not None: import base64 auth_header = 'Basic %s' % base64.encodestring( '%s:%s' % (basic_auth[0], basic_auth[1])).strip() headers['Authorization'] = auth_header # Take care of cookie header, if necessary. if send_cookies and self._cookies: # Some broken ASP.NET servers put "\r\n" in there, so we replace # that with semicolon to get proper behavior. headers['Cookie'] = self._cookies.output( attrs=[], header='').strip().replace('\r\n', ';') method = data and "POST" or "GET" body = urlencode(data) if isinstance(data, dict) else data if method == "POST" and body and 'Content-Type' not in headers: headers.setdefault('Content-Type', 'application/x-www-form-urlencoded') # Get the response. resp_headers = None for attempt_number in range(3): self.logger.debug('Attempt %s: %s %s', attempt_number + 1, method, uri) if data: self.logger.debug('Data: %r', data) if headers: self.logger.debug('Headers: %r' % headers) try: resp_headers, content = self.h.request(uri, method, body=body, headers=headers) if resp_headers['status'] == '500': self.logger.debug("Request got a 500 error: %s %s", method, uri) continue # Try again. break except socket.timeout: self.logger.debug("Request timed out after %s seconds: %s %s", self.h.timeout, method, uri) continue # Try again. except socket.error, e: self.logger.debug("Got socket error: %s", e) continue # Try again. except httplib2.ServerNotFoundError: raise RetrievalError("Could not %s %r: server not found" % (method, uri))
def allow_access(environ, host): #print '\n'.join(map(lambda x: '%s: %s' % x, zip(environ.keys(), environ.values()))) #print environ['REQUEST_URI'], environ['SCRIPT_NAME'] try: cookie = SimpleCookie(environ['HTTP_COOKIE']) except KeyError: #No cookie == no permission return False #Special override for INTERNAL services, NOT FOR WEB SERVICES!!! #There should be no way they have the secret key! try: secret = cookie['secretkey'].value #print 'is "%s" == "%s"?' % (secret, settings.SECRET_KEY) if secret == settings.SECRET_KEY: #TODO: Add a TRUSTED_IPS env var, and make sure the request comes from one of those return True except KeyError: pass try: sessionId = cookie[settings.SESSION_COOKIE_NAME].value except KeyError: #No session id -> not logged in -> immediate access denied return False now = datetime.datetime.now(tz=pytz.utc) if now > allow_access.nextCheck: #if checkFrequency has passed, clear the list allow_access.validSessions = {} #print 'Cleared cache' allow_access.nextCheck = now + allow_access.checkFrequency try: #print 'Check in cache' #Check if in list expireTime = allow_access.validSessions[sessionId] #Get cached expire time #print 'In cache' except KeyError: #Session not in dictionary #print 'Not in cache', allow_access.validSessions try: #print 'check in session db', sessionId db.reset_queries() session = Session.objects.get(pk=sessionId) if SESSION_KEY not in session.get_decoded(): #If the session key is not in the decoded data, this is not a logged-in session return False except Session.DoesNotExist: #Not in session DB #print 'Not in session db' return False finally: db.connection.close() #if it's in the database allow_access.validSessions[sessionId] = session.expire_date #Add to cache expireTime = session.expire_date #print 'in session db' if expireTime > now: #Valid session by expire time #print 'Valid time' return True else: #print 'Invalid time' return False
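allow_access() reads three attributes on itself (validSessions, nextCheck, checkFrequency) that are not initialized in the snippet; a plausible module-level priming is shown below, with the five-minute interval being an assumption.

import datetime
import pytz

allow_access.validSessions = {}   # sessionId -> expire_date cache
allow_access.checkFrequency = datetime.timedelta(minutes=5)  # assumed interval
allow_access.nextCheck = datetime.datetime.now(tz=pytz.utc)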
def track_page_view(environ): """ // Track a page view, updates all the cookies and campaign tracker, // makes a server side request to Google Analytics and writes the transparent // gif byte data to the response. """ time_tup = time.localtime(time.time() + COOKIE_USER_PERSISTENCE) # set some useful items in environ: environ['COOKIES'] = parse_cookie(environ.get('HTTP_COOKIE', '')) environ['GET'] = {} for key, value in parse_qsl(environ.get('QUERY_STRING', ''), True): environ['GET'][key] = value # we only have one value per key name, right? :) x_utmac = environ['GET'].get('x_utmac', None) domain = environ.get('HTTP_X_FORWARDED_HOST', environ.get('HTTP_HOST', '')) # Get the referrer from the utmr parameter, this is the referrer to the # page that contains the tracking pixel, not the referrer for the tracking # pixel. document_referer = environ['GET'].get("utmr", "") if not document_referer or document_referer == "0": document_referer = "-" else: document_referer = unquote(document_referer) document_path = environ['GET'].get('utmp', "") if document_path: document_path = unquote(document_path) account = environ['GET'].get('utmac', '') user_agent = environ.get("HTTP_USER_AGENT", '') # // Try and get visitor cookie from the request. cookie = environ['COOKIES'].get(COOKIE_NAME) visitor_id = get_visitor_id(environ.get("HTTP_X_DCMGUID", ''), account, user_agent, cookie) # // Always try and add the cookie to the response. cookie = SimpleCookie() cookie[COOKIE_NAME] = visitor_id morsel = cookie[COOKIE_NAME] morsel['expires'] = time.strftime('%a, %d-%b-%Y %H:%M:%S %Z', time_tup) morsel['path'] = COOKIE_PATH utm_gif_location = "http://www.google-analytics.com/__utm.gif" client_ip = environ.get("HTTP_X_FORWARDED_FOR", environ.get("REMOTE_ADDR", "")) for utmac in [account, x_utmac]: if not utmac: continue # ignore empty utmacs # // Construct the gif hit url. utm_url = utm_gif_location + "?" + \ "utmwv=" + VERSION + \ "&utmn=" + get_random_number() + \ "&utmhn=" + quote(domain) + \ "&utmsr=" + environ['GET'].get('utmsr', '') + \ "&utme=" + environ['GET'].get('utme', '') + \ "&utmr=" + quote(document_referer) + \ "&utmp=" + quote(document_path) + \ "&utmac=" + utmac + \ "&utmcc=__utma%3D999.999.999.999.999.1%3B" + \ "&utmvid=" + visitor_id + \ "&utmip=" + get_ip(client_ip) dbgMsg("utm_url: " + utm_url) send_request_to_google_analytics(utm_url, environ) # // If the debug parameter is on, add a header to the response that contains # // the url that was used to contact Google Analytics. headers = [('Set-Cookie', str(cookie).split(': ')[1])] if environ['GET'].get('utmdebug', False): headers.append(('X-GA-MOBILE-URL', utm_url)) # Finally write the gif data to the response response = write_gif_data() response_headers = response['response_headers'] response_headers.extend(headers) return response
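Both track_page_view() variants above rely on a parse_cookie() helper that is not shown; below is a minimal stand-in built on SimpleCookie (Django ships a more forgiving implementation under django.http).

from Cookie import SimpleCookie, CookieError  # Python 3: from http.cookies import ...

def parse_cookie(raw_cookie):
    """Return a plain {name: value} dict from a raw Cookie header."""
    if not raw_cookie:
        return {}
    try:
        c = SimpleCookie(raw_cookie)
    except CookieError:
        return {}
    return dict((name, morsel.value) for name, morsel in c.items())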