def http_download(download_url, outfile, proxy_url=None, proxy_port=None):
    """Download ``download_url`` to the local path ``outfile``.

    :param download_url: URL to fetch.
    :param outfile: destination file path (opened in binary write mode).
    :param proxy_url: optional proxy host; when set, both http and https
        traffic are routed through ``proxy_url:proxy_port``.
    :param proxy_port: proxy port, only used together with ``proxy_url``.

    Fix over the original: the response and the output file are now closed
    even if reading or writing raises (previously both leaked on error),
    and the redundant ``flush()`` before ``close()`` was dropped.
    """
    if proxy_url:
        proxy = "{}:{}".format(proxy_url, proxy_port)
        mainlog.info("Using a proxy : {}".format(proxy))
        urlopener = build_opener(
            ProxyHandler({'https': proxy, 'http': proxy}),
            HTTPRedirectHandler())
    else:
        mainlog.info("Not using a proxy")
        urlopener = build_opener(HTTPHandler(), HTTPSHandler(),
                                 HTTPRedirectHandler())

    # Some servers refuse requests without a browser-like User-Agent.
    urlopener.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0'
    )]

    datasource = urlopener.open(download_url)
    try:
        with open(outfile, 'wb') as out:
            # Stream in 8 KiB chunks to keep memory bounded.
            while True:
                d = datasource.read(8192)
                if not d:
                    break
                out.write(d)
    finally:
        datasource.close()
def auth(*, email, password, client_id, scope):
    """Log in to VK via the OAuth "implicit flow" login form.

    Scrapes and submits the HTML login form (and, if shown, the permission
    form), then parses the token out of the final redirect URL's fragment.

    :param email: account login (e-mail).
    :param password: account password.
    :param client_id: VK application id.
    :param scope: permission name or list of permission names.
    :returns: tuple ``(access_token, user_id)`` (both strings).
    :raises RuntimeError: when a form cannot be parsed or login fails.
    :raises NotImplementedError: when a form uses a method other than POST.
    """
    def split_key_value(kv_pair):
        # "key=value" -> ("key", "value")
        kv = kv_pair.split("=")
        return kv[0], kv[1]

    # Authorization form
    def auth_user(email, password, client_id, scope, opener):
        # Request the OAuth authorize page; VK answers with an HTML login form.
        response = opener.open(
            "http://oauth.vk.com/oauth/authorize?" + \
            "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" + \
            "client_id=%s&scope=%s" % (client_id, ",".join(scope))
            )
        doc = response.read().decode('utf-8')
        # AuthFormParser (project class) extracts the form action URL,
        # method and input fields from the HTML.
        parser = AuthFormParser()
        parser.feed(doc)
        parser.close()
        if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
           "email" not in parser.params:
            raise RuntimeError("Something wrong")
        # Fill the credentials into the scraped form fields.
        parser.params["email"] = email
        parser.params["pass"] = password
        if parser.method == "POST":
            response = opener.open(parser.url, urlencode(parser.params).encode('ascii'))
        else:
            raise NotImplementedError("Method '%s'" % parser.method)
        return response.read(), response.geturl()

    # Permission request form
    def give_access(doc, opener):
        # Submit the "grant access" confirmation form as-is.
        parser = AuthFormParser()
        parser.feed(doc)
        parser.close()
        if not parser.form_parsed or parser.url is None:
            raise RuntimeError("Something wrong")
        if parser.method == "POST":
            response = opener.open(parser.url, urlencode(parser.params).encode('ascii'))
        else:
            raise NotImplementedError("Method '%s'" % parser.method)
        return response.geturl()

    if not isinstance(scope, list):
        scope = [scope]
    # Cookie-aware opener: the login session cookie must survive redirects.
    opener = build_opener(
        HTTPCookieProcessor(http.cookiejar.CookieJar()),
        HTTPRedirectHandler())
    doc, url = auth_user(email, password, client_id, scope, opener)
    if urlparse(url).path != "/blank.html":
        # Need to give access to requested scope
        url = give_access(doc.decode('utf-8'), opener)
    if urlparse(url).path != "/blank.html":
        raise RuntimeError("Expected success here")
    # On success VK redirects to blank.html with
    # "#access_token=...&expires_in=...&user_id=..." in the fragment.
    answer = dict(
        split_key_value(kv_pair)
        for kv_pair in urlparse(url).fragment.split("&"))
    if "access_token" not in answer or "user_id" not in answer:
        raise RuntimeError("Missing some values in answer")
    return answer["access_token"], answer["user_id"]
def setup_method(self, method):
    """Per-test setup: fresh cookie jar, opener, and application process."""
    self.cookies = CookieJar()
    handlers = (
        HTTPRedirectHandler(),
        HTTPHandler(debuglevel=0),
        HTTPSHandler(debuglevel=0),
        HTTPCookieProcessor(self.cookies),
    )
    self.opener = build_opener(*handlers)
    # Run the application under test in its own process.
    self.application_process = Process(target=main)
    self.application_process.start()
def download(self, url, error_message, timeout, tries):
    """Fetch *url* and return the response body as bytes, or False on failure.

    Retries (up to *tries* attempts total) on HTTP 503 rate limiting and on
    read timeouts; any other error logs a warning and aborts.
    """
    http_proxy = self.setting.http_proxy
    https_proxy = self.setting.https_proxy

    if http_proxy or https_proxy:
        # Explicit proxy settings; HTTPS falls back to the HTTP proxy
        # when no dedicated HTTPS proxy is configured.
        proxies = {}
        if http_proxy:
            proxies['http'] = http_proxy
        proxies['https'] = https_proxy or http_proxy
        proxy_handler = ProxyHandler(proxies)
    else:
        # No explicit settings: ProxyHandler() picks up environment proxies.
        proxy_handler = ProxyHandler()

    opener = build_opener(proxy_handler, HTTPRedirectHandler())

    while tries > 0:
        tries -= 1
        try:
            request = Request(
                url, headers={"User-Agent": "OmniMarkup Downloader"})
            return opener.open(request, timeout=timeout).read()
        except HTTPException as e:
            log.warning('%s HTTP exception %s (%s) downloading %s.',
                        error_message, e.__class__.__name__, str(e), url)
        except HTTPError as e:
            # Bitbucket and Github ratelimit using 503 a decent amount
            if str(e.code) == '503':
                log.warning(
                    'Downloading %s was rate limited, trying again', url)
                continue
            log.warning('%s HTTP error %s downloading %s.', error_message,
                        str(e.code), url)
        except URLError as e:
            # Bitbucket and Github timeout a decent amount
            if str(e.reason) in ('The read operation timed out', 'timed out'):
                log.warning('Downloading %s timed out, trying again', url)
                continue
            log.warning('%s URL error %s downloading %s.', error_message,
                        str(e.reason), url)
        # Non-retryable failure: leave the loop and report failure.
        break
    return False
def open(request):
    """Execute one HTTP request described by a Vim-side request dict.

    NOTE(review): this function shadows the builtin ``open``; presumably it
    is accessed as a module attribute — verify against callers.

    :param request: Vim request structure; ``request_vim_to_python`` (project
        helper) converts it into a dict with keys 'url', 'data', 'headers',
        'method', 'username', 'password', 'max_redirect', 'retry', 'timeout'
        and 'gzip_decompress'.
    :returns: tuple ``(url, status_line + raw_headers, decoded_body)``.
    """
    request = request_vim_to_python(request)
    rhandler = HTTPRedirectHandler()
    # Cap redirect chasing at the caller-supplied limit.
    rhandler.max_redirections = request['max_redirect']
    opener = build_opener(rhandler)
    if request['username']:
        # Attach both Basic and Digest auth so either challenge is answered.
        passmgr = HTTPPasswordMgrWithDefaultRealm()
        passmgr.add_password(
            None,
            request['url'],
            request['username'],
            request['password'],
        )
        opener.add_handler(HTTPBasicAuthHandler(passmgr))
        opener.add_handler(HTTPDigestAuthHandler(passmgr))
    req = Request(
        url=request['url'],
        data=request['data'],
        headers=request['headers'],
        method=request['method'],
    )
    if request['gzip_decompress']:
        req.add_header('Accept-encoding', 'gzip')
    try:
        # retry (project helper) wraps opener.open with N attempts.
        res = retry(tries=request['retry'])(opener.open)(
            req, timeout=request['timeout'])
    except HTTPError as e:
        # HTTPError doubles as a response object; report it instead of raising.
        res = e
    if not hasattr(res, 'version'):
        # urllib2 does not have 'version' field
        import httplib
        res.version = httplib.HTTPConnection._http_vsn
    # Rebuild a status line ("HTTP/1.1 200 OK") from the response fields.
    response_status = "HTTP/%s %d %s\n" % (
        '1.1' if res.version == 11 else '1.0',
        res.code,
        res.msg,
    )
    response_headers = str(res.headers)
    response_body = res.read()
    if (request['gzip_decompress']
            and res.headers.get('Content-Encoding') == 'gzip'):
        # gzip_decompress (project helper) inflates the body.
        response_body = gzip_decompress(response_body)
    if hasattr(res.headers, 'get_content_charset'):
        # Python 3
        response_encoding = res.headers.get_content_charset()
    else:
        # Python 2
        response_encoding = res.headers.getparam('charset')
    response_body = response_body.decode(response_encoding)
    return (
        request['url'],
        response_status + response_headers,
        response_body,
    )
def __init__(self, data_path, **kwargs):
    """Create a session whose cookies persist under *data_path*.

    :raises Exception: when *data_path* fails project validation.
    """
    if not validation.is_data_path(data_path):
        raise Exception('invalid data_path: %s' % data_path)
    cookie_path = os.path.join(data_path, default.COOKIES_FILENAME)
    self.cookie_jar = MozillaCookieJar(cookie_path)
    try:
        # A missing/unreadable cookie file just means a fresh jar.
        self.cookie_jar.load()
    except EnvironmentError:
        pass
    self.opener = build_opener(
        HTTPRedirectHandler(),
        HTTPCookieProcessor(self.cookie_jar),
    )
    super(Session, self).__init__(**kwargs)
def login(self):
    """Authenticate against the catalogue backend.

    For the 'geonetwork' type, POSTs the stored credentials to the
    GeoNetwork XML login service; the session cookie is kept in
    ``self.opener`` for subsequent requests. Sets ``self.connected``
    on success.

    :raises AssertionError: when GeoNetwork does not answer ``<ok>``.

    Fix over the original: ``urlencode()`` returns ``str``, but on
    Python 3 ``Request`` data must be bytes — the payload is now encoded.
    """
    if self.type == 'geonetwork':
        url = "%sgeonetwork/srv/en/xml.user.login" % self.base
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "text/plain"
        }
        post = urlencode({
            "username": self.user,
            "password": self.password
        }).encode('utf-8')
        request = Request(url, post, headers)
        self.opener = build_opener(HTTPCookieProcessor(), HTTPRedirectHandler())
        response = self.opener.open(request)
        doc = dlxml.fromstring(response.read())
        assert doc.tag == 'ok', "GeoNetwork login failed!"
        self.connected = True
def __init__(self, proxy=None):
    """Build the raw and (optionally) proxied openers.

    :param proxy: None for a direct connection, the string 'auto' for the
        built-in fallback proxy, or a dict accepted by ProxyHandler.
    """
    global USER_AGENT
    self.redirh = HTTPRedirectHandler()
    self.cookie = HTTPCookieProcessor()
    # Opener that always bypasses any proxy.
    self.rawopen = build_opener(self.redirh, self.cookie)
    if proxy is None or self.no_proxy:
        self.opener = self.rawopen
    else:
        if proxy == 'auto':
            # Built-in fallback proxy. Alternatives seen before:
            #   http://211.155.86.25:8888, proxy.uku.im:8888,
            #   https://proxy.uku.im:443
            self.proxyh = ProxyHandler({'http': "https://secure.uku.im:8443"})
        else:
            self.proxyh = ProxyHandler(proxy)
        self.opener = build_opener(self.proxyh, self.redirh, self.cookie)
    self.extra_headers = {"User-Agent": USER_AGENT}
def __init__(self, args):
    """Initialise the scraper: persistent cookies, opener, headers, temp dir."""
    self.args = args
    self.cj = http.cookiejar.MozillaCookieJar(COOKIES_FILENAME)
    # Reload cookies from a previous run, if the file exists.
    if os.access(COOKIES_FILENAME, os.F_OK):
        self.cj.load(os.getcwd() + "/" + COOKIES_FILENAME)
    handlers = (
        HTTPRedirectHandler(),
        HTTPHandler(debuglevel=0),
        HTTPSHandler(debuglevel=0),
        HTTPCookieProcessor(self.cj),
    )
    self.opener = build_opener(*handlers)
    # Browser-like headers; some sites reject obviously scripted clients.
    self.opener.addheaders = [
        ('User-Agent',
         ('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36'
          )),
        ('Accept',
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
         )
    ]
    if not os.path.exists(TMP_DIR):
        os.makedirs(TMP_DIR)
def __init__(self):
    """Set up the bug-tracker session and log in."""
    self.user = bugtracker_user
    self.password = bugtracker_pass
    self.login_page = 'https://bugs.archlinux.org/index.php?do=authenticate'
    # Recent-events report (events 1, 13, 2, 4 across all projects).
    self.target_page = 'https://bugs.archlinux.org/index.php?events%5B%5D=1&events%5B%5D=13&events%5B%5D=2&events%5B%5D=4&event_number=50&do=reports&project=0'
    self.cj = CookieJar()
    handlers = (
        HTTPRedirectHandler(),
        HTTPHandler(debuglevel=0),
        HTTPSHandler(debuglevel=0),
        HTTPCookieProcessor(self.cj),
    )
    self.opener = build_opener(*handlers)
    self.opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    # need this twice - once to set cookies, once to log in...
    self.login()
    self.old_events = set([])
class Page(object):
    """Fetches pages, rewriting Google Groups AJAX "#!" URLs into their
    crawlable ``_escaped_fragment_`` equivalents before opening them."""

    # Shared opener; HTTP wire tracing is enabled when the root logger
    # is at DEBUG level.
    verb_handler = HTTPHandler()
    if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
        verb_handler.set_http_debuglevel(2)
    redir_handler = HTTPRedirectHandler()
    opener = build_opener(verb_handler, redir_handler)

    def __init__(self):
        pass

    @staticmethod
    def unenscape_Google_bang_URL(old_URL):
        """Return a crawlable form of *old_URL*.

        See https://developers.google.com/webmasters\
        /ajax-crawling/docs/getting-started for more information.
        """
        if '#!' in old_URL:
            return old_URL.replace('#!', '?_escaped_fragment_=')
        if old_URL.startswith('https://groups.google.com/d/topic/'):
            # e.g. https://groups.google.com/d/topic/jbrout/dreCkob3KSs
            # ->   https://groups.google.com/forum/
            #          ?_escaped_fragment_=topic/jbrout/dreCkob3KSs
            return old_URL.replace(
                'https://groups.google.com/d/',
                'https://groups.google.com/forum/?_escaped_fragment_=')
        return old_URL

    def _get_page_BS(self, URL):
        """Open *URL* (after bang-URL rewriting) and parse it with BeautifulSoup."""
        response = self.opener.open(self.unenscape_Google_bang_URL(URL))
        markup = response.read()
        soup = BeautifulSoup(markup)
        response.close()
        return soup
request.add_unredirected_header('Authorization', 'Bearer ' + auth_token) return self.parent.open(request, timeout=request.timeout) # Got some help from this example https://gist.github.com/FiloSottile/2077115 class HeadRequest(Request): def get_method(self): return "HEAD" better_urllib_get = OpenerDirector() better_urllib_get.addheaders = DEFAULT_HEADERS.copy() better_urllib_get.add_handler(HTTPHandler()) better_urllib_get.add_handler(HTTPSHandler()) better_urllib_get.add_handler(HTTPRedirectHandler()) better_urllib_get.add_handler(SocketFileHandler()) better_urllib_get.add_handler(Oauth2TokenAuthHandler()) class RegistryError(Exception): def __init__(self, response): self.response_obj = response # Util functions ############################################################################################# def parse_thresholds(spec, include_units=True, units_required=True): """ Given a spec string break it up into ':' separated chunks. Convert strings to ints as it makes sense
import pickle
import time
import requests
from celerycrawler import settings
from datetime import datetime
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
# NOTE(review): HTTPError's canonical home is urllib.error; importing it
# from urllib.request relies on a re-export — consider urllib.error instead.
from urllib.request import urlopen, Request, HTTPError
from urllib.request import install_opener, build_opener, HTTPRedirectHandler
from couchdb.mapping import Document, TextField, DateTimeField, ListField, FloatField
from django.core.cache import cache

# Install a process-wide default opener that follows HTTP redirects
# (affects every subsequent urlopen() call in this process).
install_opener(build_opener(HTTPRedirectHandler()))


class Page(Document):
    """CouchDB document mapping for a crawled web page."""
    type = TextField(default="page")  # document discriminator
    url = TextField()                 # page URL (also used for robots lookup)
    raw = TextField()                 # raw fetched markup
    content = TextField()             # extracted text content
    links = ListField(TextField())    # outgoing link URLs
    rank = FloatField(default=0)      # computed page rank score
    last_checked = DateTimeField(default=datetime.now)

    def is_valid(self):
        """Return True while the last fetch is less than 7 days old."""
        return (datetime.now() - self.last_checked).days < 7

    def update(self):
        """Re-fetch and re-process this page.

        NOTE(review): truncated in the visible chunk — the body continues
        beyond this excerpt.
        """
        print("updating page")
        parse = urlparse(self.url)
def get_handlers(self):
    """Assemble the urllib handler chain for this connection.

    Builds, in order: an (optionally non-verifying) HTTPS handler, a
    redirect handler with raised limits, Basic/Digest auth handlers when
    credentials are set, Windows IWA (Kerberos/NTLM) handlers, PKI client
    certificate handlers, an optional proxy handler, and a cookie processor.

    :returns: list of handler instances for ``build_opener``.

    NOTE(review): the original indentation of this method was lost; the
    nesting chosen for the ``os.name == 'nt'`` section (function level,
    sibling of the credentials block) is a reconstruction — confirm against
    upstream before relying on it.
    """
    handlers = []
    if self._verify_cert == False:
        # Explicitly disable TLS certificate and hostname verification.
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        handler = request.HTTPSHandler(context=ctx)
        handlers.append(handler)
    from urllib.request import HTTPRedirectHandler
    redirect_handler = HTTPRedirectHandler()
    # Raise the default redirect limits on this handler instance.
    redirect_handler.max_redirections = 30
    redirect_handler.max_repeats = 30
    handlers.append(redirect_handler)
    if self._username and self._password:
        # Register the credentials for both Basic and Digest challenges.
        passman = request.HTTPPasswordMgrWithDefaultRealm()
        passman.add_password(None, self._parsed_org_url, self._username,
                             self._password)
        handlers.append(request.HTTPBasicAuthHandler(passman))
        passman = request.HTTPPasswordMgrWithDefaultRealm()
        passman.add_password(None, self._parsed_org_url, self._username,
                             self._password)
        handlers.append(request.HTTPDigestAuthHandler(passman))
    if os.name == 'nt':
        try:
            # SSPI-based Integrated Windows Authentication handlers.
            from arcgis._impl.common._iwa import NtlmSspiAuthHandler, KerberosSspiAuthHandler
            auth_krb = KerberosSspiAuthHandler()
            handlers.append(auth_krb)
            try:
                auth_NTLM = NtlmSspiAuthHandler()
                handlers.append(auth_NTLM)
            except:
                pass
        # NOTE(review): 'Error' is not a builtin; presumably imported or
        # defined elsewhere in this module — verify.
        except Error as err:
            _log.error(
                "winkerberos packages is required for IWA authentication (NTLM and Kerberos)."
            )
            _log.error(
                "Please install it:\n\tconda install winkerberos")
            _log.error(str(err))
    else:
        _log.error(
            'The GIS uses Integrated Windows Authentication which is currently only supported on the Windows platform'
        )
    # PKI client-certificate authentication, from this connection or the
    # parent portal connection.
    if self._auth == "PKI" or \
            (self.cert_file is not None and self.key_file is not None):
        handlers.append(
            HTTPSClientAuthHandler(self.key_file, self.cert_file))
    elif self._portal_connection and \
            self._portal_connection.cert_file is not None and \
            self._portal_connection.key_file is not None:
        handlers.append(
            HTTPSClientAuthHandler(self._portal_connection.key_file,
                                   self._portal_connection.cert_file))
    cj = cookiejar.CookieJar()
    if self.proxy_host:
        # Simple Proxy Support
        from urllib.request import ProxyHandler
        if self.proxy_port is None:
            self.proxy_port = 80
        proxies = {
            "http": "http://%s:%s" % (self.proxy_host, self.proxy_port),
            "https": "https://%s:%s" % (self.proxy_host, self.proxy_port)
        }
        proxy_support = ProxyHandler(proxies)
        handlers.append(proxy_support)
    handlers.append(request.HTTPCookieProcessor(cj))
    return handlers
import sys
from urllib.request import OpenerDirector, HTTPRedirectHandler, HTTPSHandler, urlretrieve

REPO_URL = 'https://github.com/facebook/rocksdb'

# Exactly one CLI argument: the directory to download into.
assert len(sys.argv) > 1, 'Please provide a download directory, e.g. /build'
assert len(sys.argv) < 3, f'Please omit the unexpected arguments: {sys.argv[2:]}'
download_dir = sys.argv[1]

# A bare OpenerDirector has no HTTPErrorProcessor, so the request below
# returns the redirect response itself; its Location header carries the
# latest release tag.
opener = OpenerDirector()
for handler in (HTTPSHandler(), HTTPRedirectHandler()):
    opener.add_handler(handler)

response = opener.open(f'{REPO_URL}/releases/latest/download/')
tag_name = response.headers['location'].split('/')[-1]

release_url = f'{REPO_URL}/archive/{tag_name}.tar.gz'
file_path, headers = urlretrieve(release_url, f'{download_dir}/latest.tar.gz')

# Status goes to stderr; stdout carries only the version (tag minus the
# leading 'v') so callers can capture it.
print(f'RocksDB {tag_name} was downloaded to {file_path}', file=sys.stderr)
print(tag_name[1:])