def http_download(download_url, outfile, proxy_url=None, proxy_port=None):
    """Download ``download_url`` and write the response body to ``outfile``.

    Args:
        download_url: URL handed to the opener's ``open`` method.
        outfile: Path of the destination file. It is opened in binary write
            mode, so an existing file is overwritten.
        proxy_url: Optional proxy host. When truthy, the same proxy is used
            for both ``http`` and ``https`` requests.
        proxy_port: Optional proxy port, appended to ``proxy_url`` as
            ``host:port``. When ``None``, ``proxy_url`` is used unchanged.

    Exceptions raised while opening the URL, reading the response or writing
    the file propagate to the caller. The response and the output file are
    closed in every case.
    """
    if proxy_url:
        if proxy_port is None:
            # Avoid building the invalid address "host:None".
            proxy = proxy_url
        else:
            proxy = "{}:{}".format(proxy_url, proxy_port)
        mainlog.info("Using a proxy : {}".format(proxy))
        urlopener = build_opener(
            ProxyHandler({'https': proxy, 'http': proxy}),
            HTTPRedirectHandler())
    else:
        mainlog.info("Not using a proxy")
        urlopener = build_opener(
            HTTPHandler(), HTTPSHandler(), HTTPRedirectHandler())

    urlopener.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0'
    )]

    # Open the URL first so a failing request does not create the output file.
    datasource = urlopener.open(download_url)
    try:
        with open(outfile, 'wb') as out:
            while True:
                chunk = datasource.read(8192)
                if not chunk:
                    break
                out.write(chunk)
    finally:
        datasource.close()
def open(request):
    """Perform the HTTP request described by ``request`` and return the reply.

    ``request`` is first converted with ``request_vim_to_python``. The
    resulting mapping is read for the keys ``url``, ``data``, ``headers``,
    ``method``, ``username``, ``password``, ``max_redirect``, ``retry``,
    ``timeout`` and ``gzip_decompress``.

    Returns:
        A 3-tuple ``(url, header_text, body)``. ``header_text`` is the status
        line followed by the response headers, and ``body`` is the decoded
        response body.

    An ``HTTPError`` raised by the opener is not propagated; the error object
    itself is used as the response.
    """
    request = request_vim_to_python(request)

    redirect_handler = HTTPRedirectHandler()
    redirect_handler.max_redirections = request['max_redirect']
    opener = build_opener(redirect_handler)

    if request['username']:
        passmgr = HTTPPasswordMgrWithDefaultRealm()
        passmgr.add_password(
            None,
            request['url'],
            request['username'],
            request['password'],
        )
        opener.add_handler(HTTPBasicAuthHandler(passmgr))
        opener.add_handler(HTTPDigestAuthHandler(passmgr))

    req = Request(
        url=request['url'],
        data=request['data'],
        headers=request['headers'],
        method=request['method'],
    )
    if request['gzip_decompress']:
        req.add_header('Accept-encoding', 'gzip')

    try:
        res = retry(tries=request['retry'])(opener.open)(
            req, timeout=request['timeout'])
    except HTTPError as e:
        # HTTP error responses still carry a status, headers and a body.
        res = e

    if not hasattr(res, 'version'):
        # urllib2 does not have 'version' field
        import httplib
        res.version = httplib.HTTPConnection._http_vsn

    response_status = "HTTP/%s %d %s\n" % (
        '1.1' if res.version == 11 else '1.0',
        res.code,
        res.msg,
    )
    response_headers = str(res.headers)

    response_body = res.read()
    if (request['gzip_decompress']
            and res.headers.get('Content-Encoding') == 'gzip'):
        response_body = gzip_decompress(response_body)

    if hasattr(res.headers, 'get_content_charset'):
        # Python 3
        response_encoding = res.headers.get_content_charset()
    else:
        # Python 2
        response_encoding = res.headers.getparam('charset')
    # Both lookups yield None when the response declares no charset, and
    # decode(None) raises TypeError, so fall back to UTF-8 in that case.
    response_body = response_body.decode(response_encoding or 'utf-8')

    return (
        request['url'],
        response_status + response_headers,
        response_body,
    )
def http_error_302(self, req, fp, code, msg, headers):
    """Follow a 302 redirect and record the URL that was redirected.

    Delegates to ``HTTPRedirectHandler.http_error_302`` and appends
    ``req.url`` to the ``redirected_via`` list of the returned response,
    creating that list when the response does not have one yet.

    Returns:
        The response produced by the parent handler, or ``None`` when the
        parent handler does not handle the redirect.
    """
    previous_url = req.url
    result = HTTPRedirectHandler.http_error_302(
        self, req, fp, code, msg, headers)
    if result is None:
        # Nothing to annotate; hand the decision back to the opener.
        return None
    if not hasattr(result, "redirected_via"):
        result.redirected_via = []
    result.redirected_via.append(previous_url)
    # The handler's return value is what the opener uses as the response,
    # so the annotated response has to be returned explicitly.
    return result
def _handle_redirect(self, req, fp, code, msg, headers):
    """Re-quote the ``Location`` header, then follow the redirect.

    The header value is passed through ``quote_url`` and written back to
    ``headers`` before ``HTTPRedirectHandler.http_error_302`` is invoked.
    The parent handler's result is returned unchanged.
    """
    quoted_location = quote_url(headers.get('Location'))
    headers.replace_header('Location', quoted_location)
    return HTTPRedirectHandler.http_error_302(
        self, req, fp, code, msg, headers)
def http_error_302(self, req, fp, code, msg, headers):
    """Follow a 302 redirect, or raise after recording its target.

    When ``self.throw`` is truthy, the ``Location`` header is stored in
    ``self.location`` and ``RedirectionException`` is raised. Otherwise the
    redirect is delegated to ``HTTPRedirectHandler.http_error_302`` and its
    result is returned.
    """
    if self.throw:
        # ``get`` exists on the header objects of both Python 2 and
        # Python 3, whereas ``getheader`` exists only on Python 2's.
        self.location = headers.get('Location')
        raise RedirectionException()
    return HTTPRedirectHandler.http_error_302(
        self, req, fp, code, msg, headers)
def http_error_301(self, req, fp, code, msg, headers):
    """Follow a 301 redirect and record the URL that was redirected.

    Delegates to ``HTTPRedirectHandler.http_error_301`` and appends the full
    URL of ``req`` to the ``redirected_via`` list of the returned response,
    creating that list when the response does not have one yet.

    Returns:
        The response produced by the parent handler, or ``None`` when the
        parent handler does not handle the redirect.
    """
    # This is the URL being redirected *from*, captured before delegating.
    previous_url = req.get_full_url()
    result = HTTPRedirectHandler.http_error_301(
        self, req, fp, code, msg, headers)
    if result is None:
        # Nothing to annotate; hand the decision back to the opener.
        return None
    if not hasattr(result, "redirected_via"):
        result.redirected_via = []
    result.redirected_via.append(previous_url)
    # The handler's return value is what the opener uses as the response,
    # so the annotated response has to be returned explicitly.
    return result
def auth(*, email, password, client_id, scope):
    """Log in through the oauth.vk.com authorization form and return a token.

    The authorization page is requested and its login form is submitted with
    ``email`` and ``password``. If the flow does not end on ``/blank.html``
    after that, the returned page is treated as a permission form and is
    submitted as well.

    Args:
        email: Value placed in the login form's ``email`` field.
        password: Value placed in the login form's ``pass`` field.
        client_id: Application id inserted into the authorization URL.
        scope: A list of scope names, or a single value that is wrapped
            into a one-element list.

    Returns:
        A tuple ``(access_token, user_id)`` read from the fragment of the
        final URL.

    Raises:
        RuntimeError: If a form cannot be parsed, the flow does not end on
            ``/blank.html``, or the final URL lacks the expected values.
        NotImplementedError: If a parsed form uses a method other than POST.
    """
    def split_key_value(kv_pair):
        # Split on the first "=" only: values that contain "=" stay intact
        # and a pair without "=" yields an empty value instead of raising
        # IndexError.
        key, _, value = kv_pair.partition("=")
        return key, value

    def parse_form(doc):
        # Run the HTML document through a fresh form parser.
        parser = AuthFormParser()
        parser.feed(doc)
        parser.close()
        return parser

    def submit_form(parser, opener):
        # Only forms submitted via POST are supported.
        if parser.method != "POST":
            raise NotImplementedError("Method '%s'" % parser.method)
        return opener.open(
            parser.url, urlencode(parser.params).encode('ascii'))

    # Authorization form
    def auth_user(email, password, client_id, scope, opener):
        response = opener.open(
            "http://oauth.vk.com/oauth/authorize?" +
            "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" +
            "client_id=%s&scope=%s" % (client_id, ",".join(scope))
        )
        parser = parse_form(response.read().decode('utf-8'))
        if (not parser.form_parsed or parser.url is None
                or "pass" not in parser.params
                or "email" not in parser.params):
            raise RuntimeError("Something wrong")
        parser.params["email"] = email
        parser.params["pass"] = password
        response = submit_form(parser, opener)
        return response.read(), response.geturl()

    # Permission request form
    def give_access(doc, opener):
        parser = parse_form(doc)
        if not parser.form_parsed or parser.url is None:
            raise RuntimeError("Something wrong")
        return submit_form(parser, opener).geturl()

    if not isinstance(scope, list):
        scope = [scope]

    opener = build_opener(
        HTTPCookieProcessor(http.cookiejar.CookieJar()),
        HTTPRedirectHandler())
    doc, url = auth_user(email, password, client_id, scope, opener)
    if urlparse(url).path != "/blank.html":
        # Need to give access to requested scope
        url = give_access(doc.decode('utf-8'), opener)
    if urlparse(url).path != "/blank.html":
        raise RuntimeError("Expected success here")

    answer = dict(
        split_key_value(kv_pair)
        for kv_pair in urlparse(url).fragment.split("&"))
    if "access_token" not in answer or "user_id" not in answer:
        raise RuntimeError("Missing some values in answer")
    return answer["access_token"], answer["user_id"]
def setup_method(self, method):
    """Prepare a cookie-aware opener and start the application process.

    A fresh ``CookieJar`` is stored in ``self.cookies`` and wired into
    ``self.opener``. A ``Process`` targeting ``main`` is stored in
    ``self.application_process`` and started. ``method`` is not used.
    """
    self.cookies = CookieJar()
    handlers = (
        HTTPRedirectHandler(),
        HTTPHandler(debuglevel=0),
        HTTPSHandler(debuglevel=0),
        HTTPCookieProcessor(self.cookies),
    )
    self.opener = build_opener(*handlers)

    application = Process(target=main)
    self.application_process = application
    application.start()
def http_error_302(self, req, fp, code, msg, headers):
    """Follow a 302 redirect unless it points at an ``mms:`` URL.

    Raises:
        URLError: If the ``location`` header starts with ``mms:``. The
            error text is ``"MMS REDIRECT:"`` followed by the header value.

    Every other redirect is delegated to
    ``HTTPRedirectHandler.http_error_302`` and its result is returned.
    """
    # A missing header is treated as an empty target, which can never match
    # the "mms:" prefix.
    target = headers['location'] if 'location' in headers else ''
    if target.startswith('mms:'):
        raise URLError("MMS REDIRECT:" + headers["Location"])
    return HTTPRedirectHandler.http_error_302(
        self, req, fp, code, msg, headers)
def redirect_request(self, req, fp, code, msg, headers, newurl):
    """Build the redirected request without an ``Authorization`` header.

    The request is created by the parent implementation and the
    ``Authorization`` entry, when present, is removed from its headers
    before the request is returned.

    Returns:
        The request produced by the parent implementation, with the header
        stripped.
    """
    if PY2:
        # HTTPRedirectHandler is an old style class
        request = HTTPRedirectHandler.redirect_request(
            self, req, fp, code, msg, headers, newurl)
    else:
        request = super(S3HTTPRedirectHandler, self).redirect_request(
            req, fp, code, msg, headers, newurl)
    if request is not None:
        # ``pop`` with a default tolerates requests that never carried the
        # header, where ``del`` would raise KeyError.
        request.headers.pop('Authorization', None)
    return request
def redirect_request(self, request, fp, code, msg, headers, new_url):
    """Build the request used to follow a redirect.

    When ``code`` (as a string) is listed in ``self.redirect_codes`` and the
    request's method is listed in ``self.valid_methods``, a new ``Request``
    for ``new_url`` is created that keeps the original data and headers. If
    ``self.method`` is itself a valid method, the new request is forced to
    report that method. Every other case is delegated to
    ``HTTPRedirectHandler.redirect_request``.

    Returns:
        The request to issue for the redirect.
    """
    request_method = request.get_method()
    if (str(code) in self.redirect_codes
            and request_method in self.valid_methods):
        new_url = new_url.replace(' ', '%20')
        request = Request(
            new_url,
            data=request.data,
            headers=request.headers,
            # The attribute exists on both Python 2 and 3 requests; the
            # get_origin_req_host() accessor was removed in Python 3.4.
            origin_req_host=request.origin_req_host,
            unverifiable=True)
        if self.method in self.valid_methods:
            if request.get_method() != self.method:
                request.get_method = lambda: self.method
        return request
    # The parent implementation is called unbound, so ``self`` has to be
    # passed explicitly, and its result must be returned to the opener.
    return HTTPRedirectHandler.redirect_request(
        self, request, fp, code, msg, headers, new_url)
def download(self, url, error_message, timeout, tries):
    """Download ``url`` and return its content, retrying on transient errors.

    Proxies are taken from ``self.setting.http_proxy`` and
    ``self.setting.https_proxy``. When only an HTTP proxy is configured it
    is used for HTTPS as well.

    Args:
        url: The URL to fetch.
        error_message: Prefix used in the warnings that are logged.
        timeout: Timeout passed to the opener.
        tries: Maximum number of attempts.

    Returns:
        The response body, or ``False`` when no attempt succeeded.

    A 503 response and a timed-out connection trigger another attempt while
    tries remain; any other handled error ends the loop.
    """
    http_proxy = self.setting.http_proxy
    https_proxy = self.setting.https_proxy
    if http_proxy or https_proxy:
        # HTTPS traffic falls back to the HTTP proxy when it has none.
        proxies = {'https': https_proxy or http_proxy}
        if http_proxy:
            proxies['http'] = http_proxy
        proxy_handler = ProxyHandler(proxies)
    else:
        proxy_handler = ProxyHandler()

    # NOTE: no custom HTTPS certificate handler is installed here.
    opener = build_opener(proxy_handler, HTTPRedirectHandler())

    while tries > 0:
        tries -= 1
        try:
            request = Request(
                url, headers={"User-Agent": "OmniMarkup Downloader"})
            http_file = opener.open(request, timeout=timeout)
            try:
                return http_file.read()
            finally:
                # Release the connection even when reading fails.
                http_file.close()

        except HTTPException as e:
            log.warning('%s HTTP exception %s (%s) downloading %s.',
                        error_message, e.__class__.__name__, str(e), url)

        except HTTPError as e:
            # Bitbucket and Github ratelimit using 503 a decent amount
            if e.code == 503:
                log.warning(
                    'Downloading %s was rate limited, trying again', url)
                continue
            log.warning('%s HTTP error %s downloading %s.',
                        error_message, str(e.code), url)

        except URLError as e:
            # Bitbucket and Github timeout a decent amount
            if str(e.reason) in ('The read operation timed out',
                                 'timed out'):
                log.warning('Downloading %s timed out, trying again', url)
                continue
            log.warning('%s URL error %s downloading %s.',
                        error_message, str(e.reason), url)
        # Errors that did not ``continue`` above are not retried.
        break
    return False
def http_error_302(self, req, res, code, msg, headers):
    '''Follow a 302 redirect only when the request method is GET.

    For any other method the response ``res`` is returned as it is. GET
    requests are passed on to ``HTTPRedirectHandler.http_error_302``.
    '''
    if req.get_method() == 'GET':
        return HTTPRedirectHandler.http_error_302(
            self, req, res, code, msg, headers)
    # Non-GET requests are not redirected; the caller gets the 302 response.
    return res
def __init__(self, data_path, **kwargs):
    """Set up the cookie jar and opener for a session stored in ``data_path``.

    Args:
        data_path: Directory that holds the cookie file. It is checked with
            ``validation.is_data_path``.
        **kwargs: Passed on to the parent initializer.

    Raises:
        Exception: If ``data_path`` fails validation.
    """
    if not validation.is_data_path(data_path):
        raise Exception('invalid data_path: %s' % data_path)

    cookie_file = os.path.join(data_path, default.COOKIES_FILENAME)
    self.cookie_jar = MozillaCookieJar(cookie_file)
    try:
        self.cookie_jar.load()
    except EnvironmentError:
        # Loading is best effort: the session starts with an empty jar
        # when the cookie file cannot be read.
        pass

    cookie_processor = HTTPCookieProcessor(self.cookie_jar)
    self.opener = build_opener(HTTPRedirectHandler(), cookie_processor)
    super(Session, self).__init__(**kwargs)
def redirect_request(self, req, res, code, msg, hdrs, newurl):
    """Record the redirect in ``self.redirect_hdrs`` and build the new request.

    One dict per redirect is appended, holding the redirected URL, the
    headers of ``res``, the status code, the message and the new URL. The
    entry is stored before the parent implementation is called, so it is
    kept even when that call raises.

    Returns:
        The request created by ``HTTPRedirectHandler.redirect_request``.
    """
    self.redirect_hdrs.append(dict(
        url=req.get_full_url(),
        headers=res.headers,
        code=code,
        msg=msg,
        new_url=newurl,
    ))
    return HTTPRedirectHandler.redirect_request(
        self, req, res, code, msg, hdrs, newurl)
def http_error_302(self, request, fp, code, message, headers):
    """Follow a 302 redirect and wrap the outcome in an ``inesHTTPError``.

    The cookies sent with ``request`` are merged with the cookies set by
    the redirect response, and the combined value is stored in ``headers``
    under ``Cookie`` before the parent handler follows the redirect.

    Returns:
        An ``inesHTTPError`` built from the request, the parent handler's
        result, the status code, the message and the headers.
    """
    cookie = SimpleCookie()
    request_cookie = request.headers.get('Cookie')
    if request_cookie:
        cookie.load(request_cookie)

    # A response may carry several Set-Cookie headers. ``get`` returns a
    # single string for message-style headers, and looping over a string
    # would feed the cookie parser one character at a time.
    if hasattr(headers, 'get_all'):
        set_cookies = headers.get_all('set-cookie') or []
    else:
        set_cookies = headers.get('set-cookie') or []
        if not isinstance(set_cookies, (list, tuple)):
            set_cookies = [set_cookies]
    for value in set_cookies:
        cookie.load(value)

    headers['Cookie'] = cookie.output(header='', sep='; ')
    redirect_handler = HTTPRedirectHandler.http_error_302(
        self, request, fp, code, message, headers)
    return inesHTTPError(request, redirect_handler, code, message, headers)
def redirect_request(self, req, fp, code, msg, headers, newurl):
    """Build the redirected request, dropping credentials on a host change.

    The request comes from ``HTTPRedirectHandler.redirect_request``. When
    the original request has an ``Authorization`` header and the redirect
    leads to a different hostname, that header is removed from the new
    request and a debug message is logged.

    Returns:
        The request to issue for the redirect.
    """
    redirected = HTTPRedirectHandler.redirect_request(
        self, req, fp, code, msg, headers, newurl)

    if 'Authorization' in req.headers:
        origin_host = urlparse(req.get_full_url()).hostname
        target_host = urlparse(redirected.get_full_url()).hostname
        if target_host != origin_host:
            bot.debug('AuthRedirectHandler: stripping "Authorization" header '
                      "(%s != %s)" % (target_host, origin_host))
            del redirected.headers['Authorization']

    return redirected
def http_error_302(self, req, res, code, msg, headers):
    '''Redirect GET requests on a 302; hand back ``res`` for anything else.

    The decision depends only on ``req.get_method()``. GET requests go to
    ``HTTPRedirectHandler.http_error_302``, whose result is returned.
    '''
    follow_redirect = req.get_method() == 'GET'
    if not follow_redirect:
        # Leave the 302 response untouched for non-GET methods.
        return res
    parent_impl = HTTPRedirectHandler.http_error_302
    return parent_impl(self, req, res, code, msg, headers)
def login(self):
    """Log in to the catalogue when ``self.type`` is ``'geonetwork'``.

    The user name and password are posted to the ``xml.user.login`` service
    below ``self.base`` through a new cookie-aware opener, which is stored
    in ``self.opener``. On success ``self.connected`` is set to ``True``.
    For any other type nothing is done.

    Raises:
        AssertionError: If the root tag of the reply is not ``ok``.
    """
    if self.type != 'geonetwork':
        return

    url = "%sgeonetwork/srv/en/xml.user.login" % self.base
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "text/plain"
    }
    post = urlencode({
        "username": self.user,
        "password": self.password
    })
    if not isinstance(post, bytes):
        # Python 3 requires the request body as bytes. urlencode output is
        # percent-encoded, so ASCII is sufficient. On Python 2 the value is
        # already a byte string and is left alone.
        post = post.encode('ascii')
    request = Request(url, post, headers)
    self.opener = build_opener(
        HTTPCookieProcessor(), HTTPRedirectHandler())
    response = self.opener.open(request)
    doc = dlxml.fromstring(response.read())
    # Raised explicitly so the check is kept when Python runs with -O,
    # which strips ``assert`` statements.
    if doc.tag != 'ok':
        raise AssertionError("GeoNetwork login failed!")
    self.connected = True
def __init__(self, proxy=None):
    """Create the openers used for direct and proxied requests.

    Args:
        proxy: ``None`` for no proxy, ``'auto'`` for the built-in proxy
            address, or a proxy mapping passed to ``ProxyHandler``.

    ``self.rawopen`` never uses a proxy handler. ``self.opener`` is the same
    object when no proxy applies (``proxy`` is ``None`` or ``self.no_proxy``
    is truthy); otherwise it is a separate opener that goes through
    ``self.proxyh``. ``self.no_proxy`` is read here but not set.
    """
    self.redirh = HTTPRedirectHandler()
    self.cookie = HTTPCookieProcessor()
    self.rawopen = build_opener(self.redirh, self.cookie)

    # Resolve the proxy mapping first; None means "go direct".
    if proxy is None or self.no_proxy:
        proxy_map = None
    elif proxy == 'auto':
        proxy_map = {'http': "https://secure.uku.im:8443"}
    else:
        proxy_map = proxy

    if proxy_map is None:
        self.opener = self.rawopen
    else:
        self.proxyh = ProxyHandler(proxy_map)
        self.opener = build_opener(self.proxyh, self.redirh, self.cookie)

    self.extra_headers = {"User-Agent": USER_AGENT}
def http_error_301(self, req, res, code, msg, headers):
    '''Store the new location on the resource and redirect GET requests.

    When ``headers`` contains ``location``, its value is written to
    ``req.resource.location``. GET requests are then delegated to
    ``HTTPRedirectHandler.http_error_301``; for any other method ``res`` is
    returned unchanged.
    '''
    is_get = req.get_method() == 'GET'
    target = req.resource

    # Remember where the resource moved to, whatever the request method.
    if 'location' in headers:
        target.location = headers['location']

    if is_get:
        return HTTPRedirectHandler.http_error_301(
            self, req, res, code, msg, headers)
    return res
def __init__(self, args):
    """Store ``args``, build a cookie-aware opener and prepare ``TMP_DIR``.

    Cookies are loaded from ``COOKIES_FILENAME`` when that file exists. The
    opener sends browser-like ``User-Agent`` and ``Accept`` headers with
    every request. ``TMP_DIR`` is created when it does not exist yet.
    """
    self.args = args
    self.cj = http.cookiejar.MozillaCookieJar(COOKIES_FILENAME)
    if os.access(COOKIES_FILENAME, os.F_OK):
        # The jar was created with COOKIES_FILENAME, so load() reads the
        # very file that was just checked. Prefixing the working directory
        # by hand pointed at the wrong path for absolute filenames.
        self.cj.load()
    self.opener = build_opener(
        HTTPRedirectHandler(),
        HTTPHandler(debuglevel=0),
        HTTPSHandler(debuglevel=0),
        HTTPCookieProcessor(self.cj))
    self.opener.addheaders = [
        ('User-Agent',
         ('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36'
          )),
        ('Accept',
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
         )
    ]

    if not os.path.exists(TMP_DIR):
        os.makedirs(TMP_DIR)
def __init__(self):
    """Set up credentials, URLs and a cookie-aware opener, then log in.

    The credentials come from the module-level ``bugtracker_user`` and
    ``bugtracker_pass``. ``self.login()`` is called once, after the opener
    is ready, and ``self.old_events`` starts out as an empty set.
    """
    self.user = bugtracker_user
    self.password = bugtracker_pass

    self.login_page = 'https://bugs.archlinux.org/index.php?do=authenticate'
    self.target_page = 'https://bugs.archlinux.org/index.php?events%5B%5D=1&events%5B%5D=13&events%5B%5D=2&events%5B%5D=4&event_number=50&do=reports&project=0'

    self.cj = CookieJar()
    handlers = [
        HTTPRedirectHandler(),
        HTTPHandler(debuglevel=0),
        HTTPSHandler(debuglevel=0),
        HTTPCookieProcessor(self.cj),
    ]
    self.opener = build_opener(*handlers)
    self.opener.addheaders = [('User-agent', 'Mozilla/5.0')]

    # An earlier note said login was needed twice (once to set cookies,
    # once to log in); only a single call is active.
    self.login()

    self.old_events = set()
def http_error_301(self, req, res, code, msg, headers):
    '''Track a permanent move on the resource; follow it only for GET.

    The ``location`` header, when present, is copied to
    ``req.resource.location``. A GET request is passed on to
    ``HTTPRedirectHandler.http_error_301``; other methods get ``res`` back.
    '''
    http_method = req.get_method()
    moved_resource = req.resource

    new_location = headers['location'] if 'location' in headers else None
    if new_location is not None or 'location' in headers:
        # Keep the resource pointing at its latest known location.
        moved_resource.location = headers['location']

    if http_method != 'GET':
        return res
    parent_impl = HTTPRedirectHandler.http_error_301
    return parent_impl(self, req, res, code, msg, headers)
class Page(object):
    """Fetch pages through a shared opener and parse them with BeautifulSoup.

    The handlers and the opener are class attributes. They are created once,
    when the class body is executed, and are shared by all instances.
    """

    verb_handler = HTTPHandler()
    # HTTP-level debug output is switched on only when the root logger was
    # at DEBUG level at the time the class was defined.
    if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
        verb_handler.set_http_debuglevel(2)
    redir_handler = HTTPRedirectHandler()
    opener = build_opener(verb_handler, redir_handler)

    def __init__(self):
        pass

    @staticmethod
    def unenscape_Google_bang_URL(old_URL):
        """Rewrite a hash-bang or Google Groups topic URL to its crawlable form.

        See
        https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
        for more information.

        A URL containing ``#!`` has every ``#!`` replaced by
        ``?_escaped_fragment_=``. A URL starting with
        ``https://groups.google.com/d/topic/`` has its
        ``https://groups.google.com/d/`` part replaced by
        ``https://groups.google.com/forum/?_escaped_fragment_=``. Any other
        URL is returned unchanged.
        """
        if '#!' in old_URL:
            return old_URL.replace('#!', '?_escaped_fragment_=')
        if old_URL.startswith('https://groups.google.com/d/topic/'):
            # e.g. https://groups.google.com/d/topic/jbrout/dreCkob3KSs
            # becomes https://groups.google.com/forum/
            #         ?_escaped_fragment_=topic/jbrout/dreCkob3KSs
            return old_URL.replace(
                'https://groups.google.com/d/',
                'https://groups.google.com/forum/?_escaped_fragment_=')
        return old_URL

    def _get_page_BS(self, URL):
        """Download ``URL`` (after rewriting it) and return the parsed page."""
        res = self.opener.open(self.unenscape_Google_bang_URL(URL))
        try:
            in_str = res.read()
        finally:
            # Close the response even when reading fails.
            res.close()
        return BeautifulSoup(in_str)
def http_error_303(self, req, res, code, msg, hdrs):
    """Handle a 303 response by deferring to the parent implementation.

    All arguments are forwarded to ``HTTPRedirectHandler.http_error_303``
    and its result is returned unchanged.
    """
    parent_impl = HTTPRedirectHandler.http_error_303
    return parent_impl(self, req, res, code, msg, hdrs)
request.add_unredirected_header('Authorization', 'Bearer ' + auth_token) return self.parent.open(request, timeout=request.timeout) # Got some help from this example https://gist.github.com/FiloSottile/2077115 class HeadRequest(Request): def get_method(self): return "HEAD" better_urllib_get = OpenerDirector() better_urllib_get.addheaders = DEFAULT_HEADERS.copy() better_urllib_get.add_handler(HTTPHandler()) better_urllib_get.add_handler(HTTPSHandler()) better_urllib_get.add_handler(HTTPRedirectHandler()) better_urllib_get.add_handler(SocketFileHandler()) better_urllib_get.add_handler(Oauth2TokenAuthHandler()) class RegistryError(Exception): def __init__(self, response): self.response_obj = response # Util functions ############################################################################################# def parse_thresholds(spec, include_units=True, units_required=True): """ Given a spec string break it up into ':' separated chunks. Convert strings to ints as it makes sense
def redirect_request(self, req, fp, code, msg, headers, newurl):
    """Build the redirected request and note the status code on ``req``.

    After ``HTTPRedirectHandler.redirect_request`` has produced the new
    request, ``code`` is stored as ``req.redirect_code`` on the original
    request. The attribute is not set when the parent call raises.

    Returns:
        The request created by the parent implementation.
    """
    parent_impl = HTTPRedirectHandler.redirect_request
    follow_up = parent_impl(self, req, fp, code, msg, headers, newurl)
    setattr(req, 'redirect_code', code)
    return follow_up
import pickle import time import requests from celerycrawler import settings from datetime import datetime from urllib.parse import urlparse from urllib.robotparser import RobotFileParser from urllib.request import urlopen, Request, HTTPError from urllib.request import install_opener, build_opener, HTTPRedirectHandler from couchdb.mapping import Document, TextField, DateTimeField, ListField, FloatField from django.core.cache import cache install_opener(build_opener(HTTPRedirectHandler())) class Page(Document): type = TextField(default="page") url = TextField() raw = TextField() content = TextField() links = ListField(TextField()) rank = FloatField(default=0) last_checked = DateTimeField(default=datetime.now) def is_valid(self): return (datetime.now() - self.last_checked).days < 7 def update(self): print("updating page") parse = urlparse(self.url)
def http_error_307(self, req, res, code, msg, hdrs):
    """Follow only the first 307 redirect seen by this handler.

    Each call increments ``self.counter['307']``. The first call is
    delegated to ``HTTPRedirectHandler.http_error_307``; every later call
    returns ``None``.
    """
    self.counter['307'] += 1
    if self.counter['307'] > 1:
        # A 307 was already followed once; do not handle further ones.
        return None
    return HTTPRedirectHandler.http_error_307(
        self, req, res, code, msg, hdrs)
def redirect_request(self, req, fp, code, msg, hdrs, newurl):
    """Build the redirected request and force it to use ``HEAD``.

    ``newurl`` is stored in ``self.last_url`` before the parent
    implementation is called. The request it returns has ``get_method``
    replaced so that it always reports ``'HEAD'``.

    Returns:
        The request created by ``HTTPRedirectHandler.redirect_request``.
    """
    self.last_url = newurl

    def head_method():
        return 'HEAD'

    follow_up = HTTPRedirectHandler.redirect_request(
        self, req, fp, code, msg, hdrs, newurl)
    follow_up.get_method = head_method
    return follow_up
def __init__(self):
    """Initialise the handler with empty redirect bookkeeping.

    ``self.redirect_hdrs`` starts as an empty list and ``self.counter`` as
    an empty ``collections.Counter``.
    """
    HTTPRedirectHandler.__init__(self)
    self.counter = collections.Counter()
    self.redirect_hdrs = list()
def http_error_302(self, req, fp, code, msg, headers):
    """Follow a 302 redirect and tag the response with the redirect code.

    The response returned by ``HTTPRedirectHandler.http_error_302`` gets
    ``code`` assigned to its ``status`` attribute.

    Returns:
        The tagged response, or ``None`` when the parent handler does not
        handle the redirect.
    """
    result = HTTPRedirectHandler.http_error_302(
        self, req, fp, code, msg, headers)
    if result is not None:
        # The parent may decline the redirect; there is nothing to tag
        # then, and assigning to None would raise AttributeError.
        result.status = code
    return result
def redirect_request(self, req, *rest):
    """Build the redirected request and carry ``req.resource`` over to it.

    ``req.resource`` is read before the parent implementation is called,
    and the same object is attached to the new request as ``resource``.

    Returns:
        The request created by ``HTTPRedirectHandler.redirect_request``.
    """
    carried = req.resource
    follow_up = HTTPRedirectHandler.redirect_request(self, req, *rest)
    setattr(follow_up, 'resource', carried)
    return follow_up