def bookmarklet_follow(request):
    """Fetch the bookmarked page, process it and redirect to the new topic.

    Reads the page URI from the ``u`` query parameter and its title from the
    ``t`` query parameter, downloads the page, wraps it in a
    ``DiscoveredResource``, sends it through the processing service and
    creates a topic from the processed title, terms and entities.

    Returns the redirect response to the ``topics_show`` view for the
    created topic.
    """
    uri = request.GET["u"]
    title = request.GET["t"]

    # url opener
    # NOTE(review): `uri` comes straight from the query string and is fetched
    # server-side; confirm that callers are trusted or restrict the targets.
    redirect_handler = HTTPRedirectHandler()
    redirect_handler.max_redirections = settings.CONFIG["url"]["max_redirections"]
    opener = urllib2.build_opener(redirect_handler)

    # web page loading
    handle = opener.open(uri)
    try:
        encoding = detect_header_encoding(handle.headers.dict)
        content = decode_html(handle.read(), encoding)
    finally:
        # Always release the connection, even when reading/decoding fails.
        handle.close()

    # build a resource
    discovered_resource = DiscoveredResource()
    discovered_resource.uri = uri
    discovered_resource.title = title
    discovered_resource.content = content

    # process the discovered resource
    resource = processing_service_client.process(discovered_resource)
    topic = topic_manager.create_from_features(
        resource.title, resource.terms, resource.entities)
    return redirect("topic_tracking_web.demo.views.topics_show", topic._id)
def build_opener(self, debug=False):
    """Create an opener whose handlers use the requested debug level.

    We intentionally create new handlers because the OpenerDirector class in
    urllib2 is smart enough to replace its internal versions with ours if we
    pass them into the urllib2.build_opener function. This is much easier
    than trying to introspect into the OpenerDirector to find the existing
    handlers.

    Based on http://code.activestate.com/recipes/440574/#c1

    TODO: Implement workaround for http://bugs.python.org/issue7152

    :param debug: value forwarded as ``debuglevel`` to every handler.
    :returns: the opener produced by the module-level ``build_opener``.
    """
    handlers = [
        HTTPHandler(debuglevel=debug),
        HTTPSHandler(debuglevel=debug),
        ProxyHandler(debuglevel=debug),
        UnknownHandler(debuglevel=debug),
        HTTPDefaultErrorHandler(debuglevel=debug),
        HTTPRedirectHandler(debuglevel=debug),
        HTTPErrorProcessor(debuglevel=debug),
    ]
    # build_opener takes the handlers as separate positional arguments;
    # passing the list itself would hand it a single non-handler object.
    return build_opener(*handlers)
def redirect_request(self, req, fp, code, msg, headers, newurl):
    """Build the follow-up request for a redirect, keeping the payload.

    Delegates to `urllib2.HTTPRedirectHandler.redirect_request` and then
    copies the original request's ``data`` and ``timeout`` onto the request
    it returned, so a POST stays a POST with the same body.
    """
    # The base class raises HTTPError itself when the redirect cannot be
    # followed. It is invoked explicitly (not via super()) because
    # HTTPRedirectHandler is an old-style class.
    redirected = HTTPRedirectHandler.redirect_request(
        self, req, fp, code, msg, headers, newurl)

    # No need to check whether the original was a POST: a request with a
    # payload is a POST and one whose ``data`` is None is a GET, so copying
    # the attribute unconditionally carries the method over as well.
    redirected.data = req.data
    redirected.timeout = req.timeout
    return redirected
def build_opener(source_address=None, timeout=10):
    """Assemble an ``OpenerDirector`` with an explicit handler set.

    Works like ``urllib2.build_opener`` but installs exactly the handlers
    listed below, passes ``source_address`` (for binding) and ``timeout`` to
    the HTTP/HTTPS handlers, and sets our custom `User-Agent` header.
    """
    # Port 0 is paired with the address when one is given.
    bind_to = (source_address, 0) if source_address else None

    handler_chain = [
        ProxyHandler(),
        SpeedtestHTTPHandler(source_address=bind_to, timeout=timeout),
        SpeedtestHTTPSHandler(source_address=bind_to, timeout=timeout),
        HTTPDefaultErrorHandler(),
        HTTPRedirectHandler(),
        HTTPErrorProcessor(),
    ]

    director = OpenerDirector()
    director.addheaders = [('User-agent', build_user_agent())]
    for item in handler_chain:
        director.add_handler(item)
    return director
def http_error_301(self, req, fp, code, msg, headers):
    """Note the target of a 301 response, then follow it as usual.

    The redirect target is taken from the ``location`` header, falling back
    to ``uri``; when one is present it is passed, together with the original
    request, to ``__set_permanent``. The response handling itself is
    delegated to ``HTTPRedirectHandler.http_error_301``.
    """
    # Membership tests replace the deprecated ``has_key`` calls.
    for header_name in ('location', 'uri'):
        if header_name in headers:
            self.__set_permanent(req, headers[header_name])
            break
    return HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
def redirect_request(self, req, fp, code, msg, hdrs, newurl):
    """Follow a redirect unless the redirect budget has been used up.

    Every call increments ``self._counter``. Once the counter exceeds
    ``self._maxRedirects`` an ``HTTPError`` is raised for the original
    request URL; otherwise the base class builds the follow-up request.
    """
    self._counter += 1
    if self._counter <= self._maxRedirects:
        # TODO: really reuse referer-header?
        return HTTPRedirectHandler.redirect_request(
            self, req, fp, code, msg, hdrs, newurl)
    raise HTTPError(req.get_full_url(), code,
                    'Reached the maximum number of redirects', hdrs, fp)
def http_error_302(self, req, fp, code, msg, headers):
    """Log the absolute target of a 302 response, then follow it.

    The target is read from the ``location`` header, falling back to
    ``uri``, and resolved against the original request URL for the debug
    log. Handling of the response is delegated to
    ``HTTPRedirectHandler.http_error_302``, whose result is returned.
    """
    if 'location' in headers:
        newurl = headers.getheaders('location')[0]
    elif 'uri' in headers:
        newurl = headers.getheaders('uri')[0]
    else:
        # No target header at all: nothing to log. Previously this path
        # failed on the unbound ``newurl`` before the base class could run.
        newurl = None

    if newurl is not None:
        newurl = urljoin(req.get_full_url(), newurl)
        log.debug("302 %s" % newurl)

    return HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
def open(self, url, start, headers=None):
    """Open ``url`` within what is left of the overall time budget.

    :param url: address to request.
    :param start: ``time()`` value taken when the overall operation began.
    :param headers: optional mapping of extra request headers; it is copied,
        never modified. The ``User-Agent`` entry is always overridden.
    :returns: the response object returned by the opener.
    :raises TimeoutError: when ``TIMEOUT`` seconds or more have already
        elapsed since ``start``.
    """
    time_spent = int(time() - start)
    if time_spent >= TIMEOUT:
        raise TimeoutError(time_spent)

    # Work on a copy so the caller's dict is not changed behind its back.
    request_headers = dict(headers) if headers else {}
    request_headers['User-Agent'] = (
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; ' +
        'rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13')

    opener = build_opener(HTTPRedirectHandler(), HTTPCookieProcessor())
    # Never wait longer on the connection than the remaining budget.
    return opener.open(Request(url, headers=request_headers),
                       timeout=min(CONNECTION_TIMEOUT, TIMEOUT - time_spent))
def redirect_request(self, req, fp, code, msg, hdrs, newurl):
    """Follow redirects except those leading to the provider login URL.

    A redirect whose target starts with the login URL for ``provider`` is
    not followed; an ``HTTPError`` carrying the original response details is
    raised instead. Every other redirect is handled by the base class.
    """
    login_prefix = 'https://localhost.admin.eutaxia.eu:5000/login/%s' % provider
    if not newurl.startswith(login_prefix):
        return HTTPRedirectHandler.redirect_request(
            self, req, fp, code, msg, hdrs, newurl)
    raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
def http_error_302(self, req, fp, code, msg, headers):
    """Follow a 302 and record the redirect code on the final response.

    Delegates to ``HTTPRedirectHandler.http_error_302`` and stores ``code``
    as ``status`` on whatever response it returns. The base class returns
    None when the response carries no redirect target; that None is passed
    through untouched so the opener can fall back to its default error
    handling.
    """
    result = HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
    if result is not None:
        result.status = code
    return result
def http_error_301(self, req, fp, code, msg, headers):
    """Follow a 301 and record the redirect code on the final response.

    Delegates to ``HTTPRedirectHandler.http_error_301`` and stores ``code``
    as ``status`` on the response it returns. When the base class returns
    None (no redirect target in the headers) the None is returned as-is
    instead of failing on the attribute assignment.
    """
    response = HTTPRedirectHandler.http_error_301(
        self, req, fp, code, msg, headers)
    if response is not None:
        response.status = code
    return response
def http_error_302(self, req, fp, code, msg, headers):
    """Follow a 302 and record the redirect code on the final response.

    Delegates to ``HTTPRedirectHandler.http_error_302`` and stores ``code``
    as ``status`` on the response it returns. A None result from the base
    class (no redirect target in the headers) is returned unchanged.
    """
    result = HTTPRedirectHandler.http_error_302(
        self, req, fp, code, msg, headers)
    # The original bound ``results`` but returned ``result``; one name is
    # used throughout now.
    if result is not None:
        result.status = code
    return result
def http_error_302(self, req, fp, code, msg, headers):
    """Handle a 302 by deferring entirely to ``HTTPRedirectHandler``."""
    outcome = HTTPRedirectHandler.http_error_302(
        self, req, fp, code, msg, headers)
    return outcome
def redirect_request(self, *args):
    """Build the redirected request and attach cookies from the jar.

    All arguments are forwarded unchanged to
    ``HTTPRedirectHandler.redirect_request``; the request it returns is then
    handed to ``self.cookie_jar.add_cookie_header`` before being returned.
    """
    follow_up = HTTPRedirectHandler.redirect_request(self, *args)
    # The follow-up request is brand new, so the jar's cookies have to be
    # added to it explicitly.
    jar = self.cookie_jar
    jar.add_cookie_header(follow_up)
    return follow_up
pass # remove resource from collection self._resources_collection.remove_model(resource) if __name__ == '__main__': # configuration file config_file = sys.argv[1] config = yaml.load(file(config_file, 'r')) # logging logging.config.dictConfig(config['logging']) logger = logging.getLogger() # MongoDB mcm = mongo_from_config(config['mongo']) database = config['mongo']['databases']['discovery'] resources_collection = mcm.get_collection(database, 'resources', DiscoveredResource) # url opener redirect_handler = HTTPRedirectHandler() redirect_handler.max_redirections = config['url']['max_redirections'] opener = urllib2.build_opener(redirect_handler) # load pages to the queue loader = WebPageLoader(resources_collection, opener, config) loader.start()
def http_error_302(self, req, fp, code, msg, headers):
    """Handle a 302 error.

    Delegates to ``HTTPRedirectHandler.http_error_302`` and stores ``code``
    as ``status`` on the response it returns. If the base class returns None
    (the headers name no redirect target), None is returned unchanged rather
    than raising on the attribute assignment.
    """
    result = HTTPRedirectHandler.http_error_302(
        self, req, fp, code, msg, headers)
    if result is not None:
        result.status = code
    return result
def redirect_request(self, req, fp, code, msg, headers, newurl):
    """Create the follow-up request and tag the original with the code.

    The base class builds the new request (raising if the redirect is not
    allowed); afterwards ``code`` is stored as ``redirect_code`` on the
    *original* request, and the new request is returned.
    """
    follow_up = HTTPRedirectHandler.redirect_request(
        self, req, fp, code, msg, headers, newurl)
    # Only reached when the base class accepted the redirect.
    setattr(req, 'redirect_code', code)
    return follow_up
import base64 from datetime import datetime import pickle from robotparser import RobotFileParser import time from urlparse import urlparse from urllib2 import urlopen, Request, HTTPError, install_opener, build_opener, HTTPRedirectHandler from django.core.cache import cache from couchdb.mapping import Document, TextField, DateTimeField, ListField, FloatField import settings install_opener(build_opener(HTTPRedirectHandler())) class Page(Document): type = TextField(default="page") url = TextField() content = TextField() links = ListField(TextField()) rank = FloatField(default=0) last_checked = DateTimeField(default=datetime.now) def is_valid(self):
def http_error_302(self, req, fp, code, msg, headers):
    """Handle a 302 error.

    The redirect is followed by ``HTTPRedirectHandler.http_error_302``; the
    response it returns gets ``code`` stored in its ``status`` attribute.
    The base class may return None when the headers contain no redirect
    target, in which case None is handed back without touching it.
    """
    result = HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
    if result is None:
        return None
    result.status = code
    return result