def getUrlold(url, timeout=20, returnres=False):
    global cookieJar
    global clientHeader
    try:
        post = None
        #print 'url',url
        #openner = urllib2.build_opener(urllib2.HTTPHandler, urllib2.HTTPSHandler)
        cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)
        openner = urllib2.build_opener(cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
        #print cookieJar
        if post:
            req = urllib2.Request(url, post)
        else:
            req = urllib2.Request(url)
        ua_header = False
        if clientHeader:
            for n, v in clientHeader:
                req.add_header(n, v)
                if n == 'User-Agent':
                    ua_header = True
        if not ua_header:
            req.add_header('User-Agent', 'AppleCoreMedia/1.0.0.12B411 (iPhone; U; CPU OS 8_1 like Mac OS X; en_gb)')
        #req.add_header('X-Playback-Session-Id','9A1E596D-6AB6-435F-85D1-59BDD0E62D24')
        if gproxy:
            req.set_proxy(gproxy, 'http')
        response = openner.open(req)
        if returnres:
            return response
        data = response.read()
        #print len(data)
        return data
    except:
        print 'Error in getUrl'
        traceback.print_exc()
        return None
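# A minimal usage sketch (not from the original source): getUrlold() relies on the
# module-level globals cookieJar, clientHeader and gproxy, so they must exist before the
# call. The URL below is a placeholder.
import urllib2
import cookielib
import traceback

cookieJar = cookielib.LWPCookieJar()
clientHeader = None   # or a list of (name, value) header tuples
gproxy = None         # or 'host:port' to route requests through an HTTP proxy

html = getUrlold('http://example.com/')
if html is not None:
    print len(html)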
ERR_STR_NO_DOI = 'No document object identifier found on the page: '
ERR_STR_NO_BIBTEX = 'No BibTeX entry found for the DOI: '
ERR_STR_NO_URL = 'No URL found in the BibTeX entry for the DOI: '
ERR_STR_REPORT = 'Please report the error to [email protected].'

# read url from std input
url = sys.stdin.readline()
# get rid of the newline at the end
url = url.strip()

cookie_jar = cookielib.CookieJar()
handlers = []
if "--debug" in sys.argv:
    handlers.append(urllib2.HTTPHandler(debuglevel=True))
handlers.append(urllib2.HTTPCookieProcessor(cookie_jar))
opener = urllib2.build_opener(*handlers)
opener.addheaders = [('User-agent', 'lwp-request/5.810')]
urllib2.install_opener(opener)

# link.aps.org/doi is basically a DOI resolver, but not necessarily to aps.org
if re.search(r'/doi/', url):
    f = urlopen(url)
    # Open the URL, which automatically follows redirects.
    # I think this just does similar to HEAD (i.e, no data downloaded)
    # but not sure.
    url = f.geturl()

# if not an APS url we can handle, let another plugin handle it
def read_body_and_headers(url, post=None, headers=[], follow_redirects=False, timeout=None):
    _log("read_body_and_headers " + url)

    if post is not None:
        _log("read_body_and_headers post=" + post)

    if len(headers) == 0:
        headers.append(["User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:18.0) Gecko/20100101 Firefox/18.0"])

    # Start cookie lib
    ficherocookies = os.path.join(get_data_path(), 'cookies.dat')
    _log("read_body_and_headers cookies_file=" + ficherocookies)

    cj = None
    ClientCookie = None
    cookielib = None

    # Let's see if cookielib is available
    try:
        _log("read_body_and_headers importing cookielib")
        import cookielib
    except ImportError:
        _log("read_body_and_headers cookielib not available")
        # If importing cookielib fails
        # let's try ClientCookie
        try:
            _log("read_body_and_headers importing ClientCookie")
            import ClientCookie
        except ImportError:
            _log("read_body_and_headers ClientCookie not available")
            # ClientCookie isn't available either
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            _log("read_body_and_headers ClientCookie available")
            # imported ClientCookie
            urlopen = ClientCookie.urlopen
            Request = ClientCookie.Request
            cj = ClientCookie.MozillaCookieJar()
    else:
        _log("read_body_and_headers cookielib available")
        # importing cookielib worked
        urlopen = urllib2.urlopen
        Request = urllib2.Request
        cj = cookielib.MozillaCookieJar()
        # This is a subclass of FileCookieJar
        # that has useful load and save methods

    if cj is not None:
        # we successfully imported
        # one of the two cookie handling modules
        _log("read_body_and_headers Cookies enabled")

        if os.path.isfile(ficherocookies):
            _log("read_body_and_headers Reading cookie file")
            # if we have a cookie file already saved
            # then load the cookies into the Cookie Jar
            try:
                cj.load(ficherocookies)
            except:
                _log("read_body_and_headers Wrong cookie file, deleting...")
                os.remove(ficherocookies)

        # Now we need to get our Cookie Jar
        # installed in the opener;
        # for fetching URLs
        if cookielib is not None:
            _log("read_body_and_headers opener using urllib2 (cookielib)")
            # if we use cookielib
            # then we get the HTTPCookieProcessor
            # and install the opener in urllib2
            if not follow_redirects:
                opener = urllib2.build_opener(
                    urllib2.HTTPHandler(debuglevel=http_debug_log_enabled),
                    urllib2.HTTPCookieProcessor(cj),
                    NoRedirectHandler())
            else:
                opener = urllib2.build_opener(
                    urllib2.HTTPHandler(debuglevel=http_debug_log_enabled),
                    urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)
        else:
            _log("read_body_and_headers opener using ClientCookie")
            # if we use ClientCookie
            # then we get the HTTPCookieProcessor
            # and install the opener in ClientCookie
            opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(cj))
            ClientCookie.install_opener(opener)

    # -------------------------------------------------
    # Cookies installed, send the request
    # -------------------------------------------------

    # Timer
    inicio = time.clock()

    # Dictionary for the headers
    txheaders = {}

    # Build the request
    if post is None:
        _log("read_body_and_headers GET request")
    else:
        _log("read_body_and_headers POST request")

    # Add the headers
    _log("read_body_and_headers ---------------------------")
    for header in headers:
        _log("read_body_and_headers header %s=%s" % (str(header[0]), str(header[1])))
        txheaders[header[0]] = header[1]
    _log("read_body_and_headers ---------------------------")

    req = Request(url, post, txheaders)
    if timeout is None:
        handle = urlopen(req)
    else:
        # Available in Python 2.6 and later --> handle = urlopen(req, timeout=timeout)
        # For all versions:
        try:
            import socket
            deftimeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(timeout)
            handle = urlopen(req)
            socket.setdefaulttimeout(deftimeout)
        except:
            import sys
            for line in sys.exc_info():
                _log("%s" % line)

    # Update the cookie store
    cj.save(ficherocookies)

    # Read the data and close
    if handle.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(handle.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read()
    else:
        data = handle.read()

    info = handle.info()
    _log("read_body_and_headers Response")

    returnheaders = []
    _log("read_body_and_headers ---------------------------")
    for header in info:
        _log("read_body_and_headers " + header + "=" + info[header])
        returnheaders.append([header, info[header]])
    handle.close()
    _log("read_body_and_headers ---------------------------")

    '''
    # Send the request
    try:
        response = urllib2.urlopen(req)
    # If it fails, retry with special characters escaped
    except:
        req = urllib2.Request(url.replace(" ", "%20"))
        # Add the headers
        for header in headers:
            req.add_header(header[0], header[1])
        response = urllib2.urlopen(req)
    '''

    # Elapsed time
    fin = time.clock()
    _log("read_body_and_headers Downloaded in %d seconds " % (fin - inicio + 1))
    _log("read_body_and_headers body=" + data)

    return data, returnheaders
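# A minimal usage sketch (not part of the original module): read_body_and_headers()
# returns the decoded body plus a list of [name, value] response headers. The URL and
# the extra header below are placeholders.
data, response_headers = read_body_and_headers(
    "http://example.com/",
    headers=[["Referer", "http://example.com/"]],
    timeout=10)
for name, value in response_headers:
    print name, "=", value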
access_token_secret = "LbCFs0erKoPpHNZFNVBeuaaT3In9CKkBSmwAohYGr8Tkl"
consumer_key = "GvbPz6XCMcOp8610jEifMg"
consumer_secret = "o7AT3QTrshnswlkQWmYoZiaCY5vYVzYBUQKWPn25zg"

_debug = 0

oauth_token = woof.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = woof.Consumer(key=consumer_key, secret=consumer_secret)

signature_method_hmac_sha1 = woof.SignatureMethod_HMAC_SHA1()

http_method = "GET"

http_handler = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)

'''
Construct, sign, and open a twitter request
using the hard-coded credentials above.
'''
def twitterreq(url, method, parameters):
    req = woof.Request.from_consumer_and_token(oauth_consumer,
                                               token=oauth_token,
                                               http_method=http_method,
                                               http_url=url,
                                               parameters=parameters)
    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)
import urllib
import urllib2
import HTMLParser

http_header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
    'Accept': 'image/png,image/*;q=0.8,*/*;q=0.5',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive'
}


class getBlogList(HTMLParser.HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                # str.index() raises ValueError on a miss and returns 0 (falsy) for a
                # prefix match, so startswith() is what this check actually needs.
                if name == 'href' and value.startswith('http://www.xxx.com/888/blog/'):
                    print value


host = 'http://www.xxx.com/888/blog'
http_debug_handler = urllib2.HTTPHandler(debuglevel=1)
urllib2.install_opener(urllib2.build_opener(http_debug_handler))
request = urllib2.Request(host, None, http_header)
response = urllib2.urlopen(request)
engine = getBlogList()
engine.feed(response.read())
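# The headers above advertise 'Accept-Encoding: gzip, deflate', so the server may reply
# with a gzip-compressed body that HTMLParser cannot parse. A hedged sketch of how the
# final read/feed step above could decompress first (same names as the snippet above):
import gzip
from StringIO import StringIO

raw = response.read()
if response.info().get('Content-Encoding') == 'gzip':
    raw = gzip.GzipFile(fileobj=StringIO(raw)).read()
engine.feed(raw)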
uwcicon = xbmc.translatePath(os.path.join(rootDir, 'icon.png'))
changelog = xbmc.translatePath(os.path.join(rootDir, 'changelog.txt'))
profileDir = addon.getAddonInfo('profile')
profileDir = xbmc.translatePath(profileDir).decode("utf-8")
cookiePath = os.path.join(profileDir, 'cookies.lwp')
kodiver = xbmc.getInfoLabel("System.BuildVersion").split(".")[0]

if not os.path.exists(profileDir):
    os.makedirs(profileDir)

urlopen = urllib2.urlopen
cj = cookielib.LWPCookieJar(xbmc.translatePath(cookiePath))
Request = urllib2.Request

handlers = [urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler()]

if (2, 7, 8) < sys.version_info < (2, 7, 12):
    try:
        import ssl
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE
        handlers += [urllib2.HTTPSHandler(context=ssl_context)]
    except:
        pass

if cj != None:
    if os.path.isfile(xbmc.translatePath(cookiePath)):
        try:
            cj.load()
def do_request(self, json_obj):
    headers = {
        'Content-Type': 'application/json-rpc',
        'User-Agent': 'python/zabbix_api'
    }

    if self.httpuser:
        self.debug(logging.INFO, "HTTP Auth enabled")
        auth = 'Basic ' + string.strip(base64.encodestring(self.httpuser + ':' + self.httppasswd))
        headers['Authorization'] = auth
    self.r_query.append(str(json_obj))

    self.debug(logging.INFO, "Sending: " + str(json_obj))
    self.debug(logging.DEBUG, "Sending headers: " + str(headers))

    request = urllib2.Request(url=self.url, data=json_obj.encode('utf-8'), headers=headers)
    if self.proto == "https":
        https_handler = urllib2.HTTPSHandler(debuglevel=0)
        opener = urllib2.build_opener(https_handler)
    elif self.proto == "http":
        http_handler = urllib2.HTTPHandler(debuglevel=0)
        opener = urllib2.build_opener(http_handler)
    else:
        raise ZabbixAPIException("Unknown protocol %s" % self.proto)

    urllib2.install_opener(opener)
    try:
        response = opener.open(request, timeout=self.timeout)
    except ssl.SSLError as e:
        if e.message == "The read operation timed out":
            raise APITimeout("SSL read timeout",)
        else:
            raise e
    except socket.timeout as e:
        raise APITimeout("HTTP read timeout",)
    except urllib2.URLError as e:
        if "Connection timed out" in e.message:
            raise APITimeout("HTTP read timeout",)
        else:
            raise e
    self.debug(logging.INFO, "Response Code: " + str(response.code))

    # NOTE: Getting a 412 response code means the headers are not in the
    # list of allowed headers.
    if response.code != 200:
        raise ZabbixAPIException("HTTP ERROR %s: %s" % (response.status, response.reason))
    reads = response.read()
    if len(reads) == 0:
        raise ZabbixAPIException("Received zero answer")
    try:
        jobj = json.loads(reads.decode('utf-8'))
    except ValueError as msg:
        print("unable to decode. returned string: %s" % reads)
        sys.exit(-1)
    self.debug(logging.DEBUG, "Response Body: " + str(jobj))

    self.id += 1

    if 'error' in jobj:  # some exception
        msg = "Error %s: %s, %s while sending %s" % (jobj['error']['code'],
                                                     jobj['error']['message'],
                                                     jobj['error']['data'],
                                                     str(json_obj))
        if re.search(".*already\sexists.*", jobj["error"]["data"], re.I):  # already exists
            raise Already_Exists(msg, jobj['error']['code'])
        else:
            raise ZabbixAPIException(msg, jobj['error']['code'])
    return jobj
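# A hedged usage sketch (not from the original module): it assumes an already
# constructed ZabbixAPI-style instance named `zapi` whose url, proto and timeout
# attributes are set. do_request() expects the JSON-RPC payload already serialized
# to a string; the request id below is a placeholder.
import json

payload = json.dumps({
    "jsonrpc": "2.0",
    "method": "apiinfo.version",
    "params": {},
    "id": 1,
})
result = zapi.do_request(payload)
print(result)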
def send(self, url, method="GET", payload=None, headers=None, cookie=None, auth=None, content=''): # initialize url service urlib = self.framework.urlib(url) if urlib.scheme == '': url = urlib.sub_service("http") # get random user agent if self.rand_agent: self.user_agent = self.framework.rand_uagent().get # Makes a web request and returns a response object. if method.upper() != "POST" and content: raise RequestException( "Invalid content type for the %s method: %s" % (method, content)) # Prime local mutable variables to prevent persistence if payload is None: payload = {} if headers is None: headers = {} if auth is None: auth = () # Set request arguments # Process user-agent header headers["User-Agent"] = self.user_agent if not headers["User-Agent"]: headers["User-Agent"] = self.user_agent # process payload if content.upper() == "JSON": headers["Content-Type"] = "application/json" payload = json.dumps(payload) else: payload = urlencode(encode_payload(payload)) # process basic authentication if len(auth) == 2: authorization = b64encode( utf_8_encode('%s:%s' % (auth[0], auth[1]))).replace('\n', '') headers["Authorization"] = "Basic %s" % (authorization) # Process socket timeout if self.timeout: socket.setdefaulttimeout(self.timeout) # Set handlers # Declare handlers list according to debug setting handlers = [ request.HTTPHandler(debuglevel=1), request.HTTPSHandler(debuglevel=1) ] if self.debug else [] # Process cookie handler if cookie is not None and cookie != {}: if isinstance(cookie, dict): str_cookie = "" for i in cookie: str_cookie += "%s=%s; " % (i, cookie[i]) headers["Cookie"] = str_cookie else: handlers.append(request.HTTPCookieProcessor(cookie)) # Process redirect and add handler if not self.redirect: handlers.append(NoRedirectHandler) # Process proxies and add handler if self.proxy: proxies = {"http": self.proxy, "https": self.proxy} handlers.append(request.ProxyHandler(proxies)) # Install opener opener = request.build_opener(*handlers) request.install_opener(opener) # Process method and make request if method == "GET": if payload: url = "%s?%s" % (url, payload) req = request.Request(url, headers=headers) elif method == "POST": req = request.Request(url, data=payload, headers=headers) elif method == "HEAD": if payload: url = "%s?%s" % (url, payload) req = request.Request(url, headers=headers) req.get_method = lambda: "HEAD" else: raise RequestException( "Request method \'%s\' is not a supported method." % (method)) try: resp = request.urlopen(req) except request.HTTPError as e: resp = e # Build and return response object return ResponseObject(resp, cookie)
def _post_multipart(self, host, selector,
                    fields, files,
                    ssl=False, port=80,
                    proxy_url=None, proxy_port=None):
    """ performs a multi-post to AGOL, Portal, or AGS
        Inputs:
           host - string - root url (no http:// or https://)
               ex: www.arcgis.com
           selector - string - everything after the host
               ex: /PWJUSsdoJDp7SgLj/arcgis/rest/services/GridIndexFeatures/FeatureServer/0/1/addAttachment
           fields - dictionary - additional parameters like token and format information
           files - tuple array - tuple with the file name type, filename, full path
           ssl - option to use SSL
           proxy_url - string - url to proxy server
           proxy_port - integer - port value if not on port 80
        Output:
           JSON response as dictionary
        Usage:
           import urlparse
           url = "http://sampleserver3.arcgisonline.com/ArcGIS/rest/services/SanFrancisco/311Incidents/FeatureServer/0/10261291"
           parsed_url = urlparse.urlparse(url)
           params = {"f":"json"}
           print _post_multipart(host=parsed_url.hostname,
                                 selector=parsed_url.path,
                                 files=files,
                                 fields=params)
    """
    content_type, body = self._encode_multipart_formdata(fields, files)
    if ssl:
        url = "https://%s%s" % (host, selector)
    else:
        url = "http://%s%s" % (host, selector)
    if proxy_url is not None:
        if proxy_port is None:
            proxy_port = 80
        proxies = {
            "http": "http://%s:%s" % (proxy_url, proxy_port),
            "https": "https://%s:%s" % (proxy_url, proxy_port)
        }
        proxy_support = urllib2.ProxyHandler(proxies)
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler(debuglevel=0))
        urllib2.install_opener(opener)
    request = urllib2.Request(url)
    request.add_header('User-agent', 'ArcREST')
    request.add_header('Content-type', content_type)
    request.add_header('Content-length', len(body))
    request.add_data(body)
    result = urllib2.urlopen(request).read()
    if result == "":
        return ""
    jres = json.loads(result)
    if 'error' in jres:
        if jres['error']['message'] == 'Request not made over ssl':
            if url.startswith('http://'):
                url = url.replace('http://', 'https://')
                return self._post_multipart(host, selector, fields, files,
                                            ssl=True, port=port,
                                            proxy_url=proxy_url,
                                            proxy_port=proxy_port)
    return self._unicode_convert(jres)
def getRegexParsed( regexs, url, cookieJar=None, forCookieJarOnly=False, recursiveCall=False, cachedPages={}, rawPost=False, cookie_jar_file=None): #0,1,2 = URL, regexOnly, CookieJarOnly doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url) setresolved = True for k in doRegexs: if k in regexs: m = regexs[k] cookieJarParam = False if 'cookiejar' in m: cookieJarParam = m['cookiejar'] if '$doregex' in cookieJarParam: cookieJar = getRegexParsed(regexs, m['cookiejar'], cookieJar, True, True, cachedPages) cookieJarParam = True else: cookieJarParam = True if cookieJarParam: if cookieJar == None: cookie_jar_file = None if 'open[' in m['cookiejar']: cookie_jar_file = m['cookiejar'].split( 'open[')[1].split(']')[0] cookieJar = getCookieJar(cookie_jar_file) if cookie_jar_file: saveCookieJar(cookieJar, cookie_jar_file) elif 'save[' in m['cookiejar']: cookie_jar_file = m['cookiejar'].split('save[')[1].split( ']')[0] complete_path = os.path.join(profile, cookie_jar_file) saveCookieJar(cookieJar, cookie_jar_file) if m['page'] and '$doregex' in m['page']: pg = getRegexParsed(regexs, m['page'], cookieJar, recursiveCall=True, cachedPages=cachedPages) if len(pg) == 0: pg = 'http://regexfailed' m['page'] = pg if 'setcookie' in m and m['setcookie'] and '$doregex' in m[ 'setcookie']: m['setcookie'] = getRegexParsed(regexs, m['setcookie'], cookieJar, recursiveCall=True, cachedPages=cachedPages) if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m[ 'appendcookie']: m['appendcookie'] = getRegexParsed(regexs, m['appendcookie'], cookieJar, recursiveCall=True, cachedPages=cachedPages) if 'post' in m and '$doregex' in m['post']: m['post'] = getRegexParsed(regexs, m['post'], cookieJar, recursiveCall=True, cachedPages=cachedPages) if 'rawpost' in m and '$doregex' in m['rawpost']: m['rawpost'] = getRegexParsed(regexs, m['rawpost'], cookieJar, recursiveCall=True, cachedPages=cachedPages, rawPost=True) if 'rawpost' in m and '$epoctime$' in m['rawpost']: m['rawpost'] = m['rawpost'].replace('$epoctime$', getEpocTime()) if 'rawpost' in m and '$epoctime2$' in m['rawpost']: m['rawpost'] = m['rawpost'].replace('$epoctime2$', getEpocTime2()) link = '' if m['page'] and m[ 'page'] in cachedPages and not 'ignorecache' in m and forCookieJarOnly == False: link = cachedPages[m['page']] else: if m['page'] and not m['page'] == '' and m['page'].startswith( 'http'): if '$epoctime$' in m['page']: m['page'] = m['page'].replace('$epoctime$', getEpocTime()) if '$epoctime2$' in m['page']: m['page'] = m['page'].replace('$epoctime2$', getEpocTime2()) page_split = m['page'].split('|') pageUrl = page_split[0] header_in_page = None if len(page_split) > 1: header_in_page = page_split[1] current_proxies = urllib2.ProxyHandler( urllib2.getproxies()) req = urllib2.Request(pageUrl) if 'proxy' in m: proxytouse = m['proxy'] if pageUrl[:5] == "https": proxy = urllib2.ProxyHandler({'https': proxytouse}) else: proxy = urllib2.ProxyHandler({'http': proxytouse}) opener = urllib2.build_opener(proxy) urllib2.install_opener(opener) req.add_header( 'User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1' ) proxytouse = None if 'referer' in m: req.add_header('Referer', m['referer']) if 'accept' in m: req.add_header('Accept', m['accept']) if 'agent' in m: req.add_header('User-agent', m['agent']) if 'x-req' in m: req.add_header('X-Requested-With', m['x-req']) if 'x-addr' in m: req.add_header('x-addr', m['x-addr']) if 'x-forward' in m: req.add_header('X-Forwarded-For', m['x-forward']) if 'setcookie' in m: 
req.add_header('Cookie', m['setcookie']) if 'appendcookie' in m: cookiestoApend = m['appendcookie'] cookiestoApend = cookiestoApend.split(';') for h in cookiestoApend: n, v = h.split('=') w, n = n.split(':') ck = cookielib.Cookie(version=0, name=n, value=v, port=None, port_specified=False, domain=w, domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False) cookieJar.set_cookie(ck) if 'origin' in m: req.add_header('Origin', m['origin']) if header_in_page: header_in_page = header_in_page.split('&') for h in header_in_page: n, v = h.split('=') req.add_header(n, v) if not cookieJar == None: cookie_handler = urllib2.HTTPCookieProcessor(cookieJar) opener = urllib2.build_opener( cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler()) opener = urllib2.install_opener(opener) if 'noredirect' in m: opener = urllib2.build_opener( cookie_handler, NoRedirection, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler()) opener = urllib2.install_opener(opener) elif 'noredirect' in m: opener = urllib2.build_opener( NoRedirection, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler()) opener = urllib2.install_opener(opener) if 'connection' in m: from keepalive import HTTPHandler keepalive_handler = HTTPHandler() opener = urllib2.build_opener(keepalive_handler) urllib2.install_opener(opener) post = None if 'post' in m: postData = m['post'] splitpost = postData.split(',') post = {} for p in splitpost: n = p.split(':')[0] v = p.split(':')[1] post[n] = v post = urllib.urlencode(post) if 'rawpost' in m: post = m['rawpost'] link = '' try: if post: response = urllib2.urlopen(req, post) else: response = urllib2.urlopen(req) if response.info().get('Content-Encoding') == 'gzip': from StringIO import StringIO import gzip buf = StringIO(response.read()) f = gzip.GzipFile(fileobj=buf) link = f.read() else: link = response.read() if 'proxy' in m and not current_proxies is None: urllib2.install_opener( urllib2.build_opener(current_proxies)) link = javascriptUnEscape(link) if 'includeheaders' in m: link += '$$HEADERS_START$$:' for b in response.headers: link += b + ':' + response.headers.get( b) + '\n' link += '$$HEADERS_END$$:' response.close() except: pass cachedPages[m['page']] = link if forCookieJarOnly: return cookieJar elif m['page'] and not m['page'].startswith('http'): if m['page'].startswith('$pyFunction:'): val = doEval(m['page'].split('$pyFunction:')[1], '', cookieJar, m) if forCookieJarOnly: return cookieJar link = val link = javascriptUnEscape(link) else: link = m['page'] if '$doregex' in m['expres']: m['expres'] = getRegexParsed(regexs, m['expres'], cookieJar, recursiveCall=True, cachedPages=cachedPages) if not m['expres'] == '': if '$LiveStreamCaptcha' in m['expres']: val = askCaptcha(m, link, cookieJar) url = url.replace("$doregex[" + k + "]", val) elif m['expres'].startswith( '$pyFunction:') or '#$pyFunction' in m['expres']: val = '' if m['expres'].startswith('$pyFunction:'): val = doEval(m['expres'].split('$pyFunction:')[1], link, cookieJar, m) else: val = doEvalFunction(m['expres'], link, cookieJar, m) if 'ActivateWindow' in m['expres']: return if forCookieJarOnly: return cookieJar if 'listrepeat' in m: listrepeat = m['listrepeat'] return listrepeat, eval(val), m, regexs, cookieJar try: url = url.replace(u"$doregex[" + k + "]", val) except: url = url.replace("$doregex[" + k + "]", val.decode("utf-8")) else: if 'listrepeat' in m: listrepeat = m['listrepeat'] 
ret = re.findall(m['expres'], link) return listrepeat, ret, m, regexs val = '' if not link == '': reg = re.compile(m['expres']).search(link) try: val = reg.group(1).strip() except: traceback.print_exc() elif m['page'] == '' or m['page'] == None: val = m['expres'] if rawPost: val = urllib.quote_plus(val) if 'htmlunescape' in m: import HTMLParser val = HTMLParser.HTMLParser().unescape(val) try: url = url.replace("$doregex[" + k + "]", val) except: url = url.replace("$doregex[" + k + "]", val.decode("utf-8")) else: url = url.replace("$doregex[" + k + "]", '') if '$epoctime$' in url: url = url.replace('$epoctime$', getEpocTime()) if '$epoctime2$' in url: url = url.replace('$epoctime2$', getEpocTime2()) if '$GUID$' in url: import uuid url = url.replace('$GUID$', str(uuid.uuid1()).upper()) if '$get_cookies$' in url: url = url.replace('$get_cookies$', getCookiesString(cookieJar)) if recursiveCall: return url if url == "": return else: return url, setresolved
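# A hedged usage sketch for getRegexParsed() above (the site URL and pattern are
# placeholders): each $doregex[name] token in the url is resolved by fetching the
# entry's 'page' and applying its 'expres' regex, and the rewritten url is returned
# together with the setresolved flag.
regexs = {
    'streamurl': {
        'page': 'http://example.com/player.html',
        'expres': 'file:\s*"([^"]+)"',
    }
}
resolved_url, setresolved = getRegexParsed(regexs, 'plugin://playmedia/$doregex[streamurl]')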
def request(url, close=True, redirect=True, error=False, proxy=None, post=None, headers=None, mobile=False, limit=None, referer=None, cookie=None, output='', timeout='30'):
    try:
        handlers = []

        if not proxy == None:
            handlers += [urllib2.ProxyHandler({'http': '%s' % (proxy)}), urllib2.HTTPHandler]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        if output == 'cookie' or output == 'extended' or not close == True:
            cookies = cookielib.LWPCookieJar()
            handlers += [urllib2.HTTPHandler(), urllib2.HTTPSHandler(), urllib2.HTTPCookieProcessor(cookies)]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        try:
            if sys.version_info < (2, 7, 9):
                raise Exception()
            import ssl
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE
            handlers += [urllib2.HTTPSHandler(context=ssl_context)]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)
        except:
            pass

        try:
            headers.update(headers)
        except:
            headers = {}

        if 'User-Agent' in headers:
            pass
        elif not mobile == True:
            #headers['User-Agent'] = agent()
            headers['User-Agent'] = cache.get(randomagent, 1)
        else:
            headers['User-Agent'] = 'Apple-iPhone/701.341'

        if 'Referer' in headers:
            pass
        elif referer == None:
            headers['Referer'] = '%s://%s/' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc)
        else:
            headers['Referer'] = referer

        if not 'Accept-Language' in headers:
            headers['Accept-Language'] = 'en-US'

        if 'Cookie' in headers:
            pass
        elif not cookie == None:
            headers['Cookie'] = cookie

        if redirect == False:
            class NoRedirection(urllib2.HTTPErrorProcessor):
                def http_response(self, request, response):
                    return response

            opener = urllib2.build_opener(NoRedirection)
            opener = urllib2.install_opener(opener)

            try:
                del headers['Referer']
            except:
                pass

        request = urllib2.Request(url, data=post, headers=headers)

        try:
            response = urllib2.urlopen(request, timeout=int(timeout))
        except urllib2.HTTPError as response:
            if response.code == 503:
                if 'cf-browser-verification' in response.read(5242880):
                    netloc = '%s://%s' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc)
                    ua = headers['User-Agent']
                    cf = cache.get(cfcookie().get, 168, netloc, ua, timeout)
                    headers['Cookie'] = cf
                    request = urllib2.Request(url, data=post, headers=headers)
                    response = urllib2.urlopen(request, timeout=int(timeout))
                elif error == False:
                    return
            elif error == False:
                return

        if output == 'cookie':
            try:
                result = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                result = cf
            except:
                pass
            if close == True:
                response.close()
            return result

        elif output == 'geturl':
            result = response.geturl()
            if close == True:
                response.close()
            return result

        elif output == 'headers':
            result = response.headers
            if close == True:
                response.close()
            return result

        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)
            if content < (2048 * 1024):
                return
            result = response.read(16 * 1024)
            if close == True:
                response.close()
            return result

        if limit == '0':
            result = response.read(224 * 1024)
        elif not limit == None:
            result = response.read(int(limit) * 1024)
        else:
            result = response.read(5242880)

        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)
            headers['Cookie'] = su
            request = urllib2.Request(url, data=post, headers=headers)
            response = urllib2.urlopen(request, timeout=int(timeout))
            if limit == '0':
                result = response.read(224 * 1024)
            elif not limit == None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

        if output == 'extended':
            response_headers = response.headers
            response_code = str(response.code)
            try:
                cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                cookie = cf
            except:
                pass
            if close == True:
                response.close()
            return (result, response_code, response_headers, headers, cookie)
        else:
            if close == True:
                response.close()
            return result
    except:
        return
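# A hedged usage sketch for the request() helper above (the URL is a placeholder).
# The default call returns the page body; output='cookie' returns the serialized
# cookie string, and output='extended' returns the
# (body, code, response_headers, request_headers, cookie) tuple built in the
# branches above.
html = request('http://example.com/')
cookie_string = request('http://example.com/', output='cookie', close=False)
body, code, resp_headers, req_headers, cookie = request('http://example.com/', output='extended')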
def __init__(self, url, close=True, proxy=None, post=None, mobile=False, referer=None, cookie=None, output='', timeout='10'):
    if not proxy == None:
        proxy_handler = urllib2.ProxyHandler({'http': '%s' % (proxy)})
        opener = urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
        opener = urllib2.install_opener(opener)
    if output == 'cookie' or not close == True:
        import cookielib
        cookie_handler = urllib2.HTTPCookieProcessor(cookielib.LWPCookieJar())
        opener = urllib2.build_opener(cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
        opener = urllib2.install_opener(opener)
    if not post == None:
        request = urllib2.Request(url, post)
    else:
        request = urllib2.Request(url, None)
    if mobile == True:
        request.add_header('User-Agent', 'Mozilla/5.0 (iPhone; CPU; CPU iPhone OS 4_0 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A293 Safari/6531.22.7')
    else:
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0')
    if not referer == None:
        request.add_header('Referer', referer)
    if not cookie == None:
        request.add_header('cookie', cookie)
    response = urllib2.urlopen(request, timeout=int(timeout))
    if output == 'cookie':
        result = str(response.headers.get('Set-Cookie'))
    elif output == 'geturl':
        result = response.geturl()
    else:
        result = response.read()
    if close == True:
        response.close()
    self.result = result
def getUrl(url, cookieJar=None, post=None, timeout=20, headers=None, noredir=False):
    cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)
    if noredir:
        opener = urllib2.build_opener(NoRedirection, cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
    else:
        opener = urllib2.build_opener(cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
    #opener = urllib2.install_opener(opener)
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36')
    if headers:
        for h, hv in headers:
            req.add_header(h, hv)
    response = opener.open(req, post, timeout=timeout)
    link = response.read()
    response.close()
    return link
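# A hedged usage sketch for getUrl() (URLs, form fields and header values are
# placeholders): a cookie jar created by the caller is passed to every call so the
# session cookies persist between requests.
import cookielib
import urllib

jar = cookielib.LWPCookieJar()
login_page = getUrl('http://example.com/login', cookieJar=jar,
                    post=urllib.urlencode({'user': 'name', 'pass': 'secret'}),
                    headers=[('Referer', 'http://example.com/')])
account_page = getUrl('http://example.com/account', cookieJar=jar)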
def request(url, close=True, redirect=True, error=False, proxy=None, post=None, headers=None, mobile=False, XHR=False, limit=None, referer=None, cookie=None, compression=True, output='', timeout='30'): try: handlers = [] if not proxy == None: handlers += [ urllib2.ProxyHandler({'http': '%s' % (proxy)}), urllib2.HTTPHandler ] opener = urllib2.build_opener(*handlers) opener = urllib2.install_opener(opener) if output == 'cookie' or output == 'extended' or not close == True: cookies = cookielib.LWPCookieJar() handlers += [ urllib2.HTTPHandler(), urllib2.HTTPSHandler(), urllib2.HTTPCookieProcessor(cookies) ] opener = urllib2.build_opener(*handlers) opener = urllib2.install_opener(opener) if (2, 7, 8) < sys.version_info < (2, 7, 12): try: import ssl ssl_context = ssl.create_default_context() ssl_context.check_hostname = False ssl_context.verify_mode = ssl.CERT_NONE handlers += [urllib2.HTTPSHandler(context=ssl_context)] opener = urllib2.build_opener(*handlers) opener = urllib2.install_opener(opener) except: pass if url.startswith('//'): url = 'http:' + url try: headers.update(headers) except: headers = {} if 'User-Agent' in headers: pass elif not mobile == True: #headers['User-Agent'] = agent() headers['User-Agent'] = cache.get(randomagent, 1) else: headers['User-Agent'] = 'Apple-iPhone/701.341' if 'Referer' in headers: pass elif referer is not None: headers['Referer'] = referer if not 'Accept-Language' in headers: headers['Accept-Language'] = 'en-US' if 'X-Requested-With' in headers: pass elif XHR == True: headers['X-Requested-With'] = 'XMLHttpRequest' if 'Cookie' in headers: pass elif not cookie == None: headers['Cookie'] = cookie if 'Accept-Encoding' in headers: pass elif compression and limit is None: headers['Accept-Encoding'] = 'gzip' if redirect == False: class NoRedirection(urllib2.HTTPErrorProcessor): def http_response(self, request, response): return response opener = urllib2.build_opener(NoRedirection) opener = urllib2.install_opener(opener) try: del headers['Referer'] except: pass if isinstance(post, dict): post = urllib.urlencode(post) request = urllib2.Request(url, data=post) _add_request_header(request, headers) try: response = urllib2.urlopen(request, timeout=int(timeout)) except urllib2.HTTPError as response: if response.code == 503: cf_result = response.read(5242880) try: encoding = response.info().getheader('Content-Encoding') except: encoding = None if encoding == 'gzip': cf_result = gzip.GzipFile( fileobj=StringIO.StringIO(cf_result)).read() if 'cf-browser-verification' in cf_result: netloc = '%s://%s' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc) ua = headers['User-Agent'] cf = cache.get(cfcookie().get, 168, netloc, ua, timeout) headers['Cookie'] = cf request = urllib2.Request(url, data=post) _add_request_header(request, headers) response = urllib2.urlopen(request, timeout=int(timeout)) else: log_utils.log( 'Request-Error (%s): %s' % (str(response.code), url), log_utils.LOGDEBUG) if error == False: return else: log_utils.log( 'Request-Error (%s): %s' % (str(response.code), url), log_utils.LOGDEBUG) if error == False: return if output == 'cookie': try: result = '; '.join( ['%s=%s' % (i.name, i.value) for i in cookies]) except: pass try: result = cf except: pass if close == True: response.close() return result elif output == 'geturl': result = response.geturl() if close == True: response.close() return result elif output == 'headers': result = response.headers if close == True: response.close() return result elif output == 'chunk': try: content = 
int(response.headers['Content-Length']) except: content = (2049 * 1024) if content < (2048 * 1024): return result = response.read(16 * 1024) if close == True: response.close() return result if limit == '0': result = response.read(224 * 1024) elif not limit == None: result = response.read(int(limit) * 1024) else: result = response.read(5242880) try: encoding = response.info().getheader('Content-Encoding') except: encoding = None if encoding == 'gzip': result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read() if 'sucuri_cloudproxy_js' in result: su = sucuri().get(result) headers['Cookie'] = su request = urllib2.Request(url, data=post) _add_request_header(request, headers) response = urllib2.urlopen(request, timeout=int(timeout)) if limit == '0': result = response.read(224 * 1024) elif not limit == None: result = response.read(int(limit) * 1024) else: result = response.read(5242880) try: encoding = response.info().getheader('Content-Encoding') except: encoding = None if encoding == 'gzip': result = gzip.GzipFile( fileobj=StringIO.StringIO(result)).read() if 'Blazingfast.io' in result and 'xhr.open' in result: netloc = '%s://%s' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc) ua = headers['User-Agent'] headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua, timeout) result = _basic_request(url, headers=headers, post=post, timeout=timeout, limit=limit) if output == 'extended': try: response_headers = dict([(item[0].title(), item[1]) for item in response.info().items()]) except: response_headers = response.headers response_code = str(response.code) try: cookie = '; '.join( ['%s=%s' % (i.name, i.value) for i in cookies]) except: pass try: cookie = cf except: pass if close == True: response.close() return (result, response_code, response_headers, headers, cookie) else: if close == True: response.close() return result except Exception as e: log_utils.log('Request-Error: (%s) => %s' % (str(e), url), log_utils.LOGDEBUG) return
def _download_file(self, url, save_path, file_name=None, param_dict=None, proxy_url=None, proxy_port=None):
    """ downloads a file """
    try:
        #if url.find("http://") > -1:
        #    url = url.replace("http://", "https://")
        if proxy_url is not None:
            if proxy_port is None:
                proxy_port = 80
            proxies = {
                "http": "http://%s:%s" % (proxy_url, proxy_port),
                "https": "https://%s:%s" % (proxy_url, proxy_port)
            }
            proxy_support = urllib2.ProxyHandler(proxies)
            opener = urllib2.build_opener(proxy_support,
                                          urllib2.HTTPHandler(debuglevel=0),
                                          AGOLRedirectHandler())
            urllib2.install_opener(opener)
        else:
            opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=0),
                                          AGOLRedirectHandler())
            urllib2.install_opener(opener)
        if param_dict is not None:
            encoded_args = urllib.urlencode(param_dict)
            url = url + '/?' + encoded_args
        file_data = urllib2.urlopen(url)
        file_data.getcode()
        file_data.geturl()
        if file_name is None:
            url = file_data.geturl()
            a = file_data.info().getheader('Content-Disposition')
            if a is not None:
                a = a.strip()
                file_name = re.findall(r'filename=\"(.+?)\"', a)[0]
            else:
                file_name = os.path.basename(file_data.geturl().split('?')[0])
        if hasattr(file_data, "status") and \
           (int(file_data.status) >= 300 and int(file_data.status) < 400):
            self._download_file(url=file_data.geturl(),
                                save_path=save_path,
                                file_name=file_name,
                                proxy_url=self._proxy_url,
                                proxy_port=self._proxy_port)
            return save_path + os.sep + file_name
        if (file_data.info().getheader('Content-Length')):
            total_size = int(file_data.info().getheader('Content-Length').strip())
            downloaded = 0
            CHUNK = 4096
            with open(save_path + os.sep + file_name, 'wb') as out_file:
                while True:
                    chunk = file_data.read(CHUNK)
                    downloaded += len(chunk)
                    if not chunk:
                        break
                    out_file.write(chunk)
        elif file_data.headers.maintype == 'image':
            with open(save_path + os.sep + file_name, 'wb') as out_file:
                buf = file_data.read()
                out_file.write(buf)
        return save_path + os.sep + file_name
    except urllib2.HTTPError, e:
        print "HTTP Error:", e.code, url
        return False
def get_cookie(self, netloc, ua, timeout):
    try:
        headers = {'User-Agent': ua}
        request = urllib2.Request(netloc)
        _add_request_header(request, headers)

        try:
            response = urllib2.urlopen(request, timeout=int(timeout))
        except urllib2.HTTPError as response:
            result = response.read(5242880)
            try:
                encoding = response.info().getheader('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()

        jschl = re.findall('name="jschl_vc" value="(.+?)"/>', result)[0]
        init = re.findall('setTimeout\(function\(\){\s*.*?.*:(.*?)};', result)[-1]
        builder = re.findall(r"challenge-form\'\);\s*(.*)a.v", result)[0]

        decryptVal = self.parseJSString(init)
        lines = builder.split(';')

        for line in lines:
            if len(line) > 0 and '=' in line:
                sections = line.split('=')
                line_val = self.parseJSString(sections[1])
                decryptVal = int(eval(str(decryptVal) + sections[0][-1] + str(line_val)))

        answer = decryptVal + len(urlparse.urlparse(netloc).netloc)

        query = '%s/cdn-cgi/l/chk_jschl?jschl_vc=%s&jschl_answer=%s' % (netloc, jschl, answer)

        if 'type="hidden" name="pass"' in result:
            passval = re.findall('name="pass" value="(.*?)"', result)[0]
            query = '%s/cdn-cgi/l/chk_jschl?pass=%s&jschl_vc=%s&jschl_answer=%s' % (
                netloc, urllib.quote_plus(passval), jschl, answer)
            time.sleep(6)

        cookies = cookielib.LWPCookieJar()
        handlers = [
            urllib2.HTTPHandler(),
            urllib2.HTTPSHandler(),
            urllib2.HTTPCookieProcessor(cookies)
        ]
        opener = urllib2.build_opener(*handlers)
        opener = urllib2.install_opener(opener)

        try:
            request = urllib2.Request(query)
            _add_request_header(request, headers)
            response = urllib2.urlopen(request, timeout=int(timeout))
        except:
            pass

        cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])

        if 'cf_clearance' in cookie:
            self.cookie = cookie
    except:
        pass
def request(url, close=True, redirect=True, error=False, proxy=None, post=None, headers=None, mobile=False, XHR=False, limit=None, referer=None, cookie=None, compression=True, output='', timeout='30', ignoreSsl=False, flare=True, ignoreErrors=None): try: if url is None: return None handlers = [] if proxy is not None: handlers += [ urllib2.ProxyHandler({'http': '%s' % (proxy)}), urllib2.HTTPHandler ] opener = urllib2.build_opener(*handlers) opener = urllib2.install_opener(opener) if output == 'cookie' or output == 'extended' or not close is True: cookies = cookielib.LWPCookieJar() handlers += [ urllib2.HTTPHandler(), urllib2.HTTPSHandler(), urllib2.HTTPCookieProcessor(cookies) ] opener = urllib2.build_opener(*handlers) opener = urllib2.install_opener(opener) if ignoreSsl or ((2, 7, 8) < sys.version_info < (2, 7, 12)): try: import ssl ssl_context = ssl.create_default_context() ssl_context.check_hostname = False ssl_context.verify_mode = ssl.CERT_NONE handlers += [urllib2.HTTPSHandler(context=ssl_context)] opener = urllib2.build_opener(*handlers) opener = urllib2.install_opener(opener) except: pass if url.startswith('//'): url = 'http:' + url try: headers.update(headers) except: headers = {} if 'User-Agent' in headers: pass elif mobile is not True: # headers['User-Agent'] = agent() headers['User-Agent'] = cache.get(randomagent, 1) else: headers['User-Agent'] = 'Apple-iPhone/701.341' if 'Referer' in headers: pass elif referer is not None: headers['Referer'] = referer if 'Accept-Language' not in headers: headers['Accept-Language'] = 'en-US' if 'X-Requested-With' in headers: pass elif XHR is True: headers['X-Requested-With'] = 'XMLHttpRequest' if 'Cookie' in headers: pass elif cookie is not None: headers['Cookie'] = cookie if 'Accept-Encoding' in headers: pass elif compression and limit is None: headers['Accept-Encoding'] = 'gzip' if redirect is False: class NoRedirection(urllib2.HTTPErrorProcessor): def http_response(self, request, response): return response opener = urllib2.build_opener(NoRedirection) opener = urllib2.install_opener(opener) try: del headers['Referer'] except: pass if isinstance(post, dict): # Gets rid of the error: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128) for key, value in post.iteritems(): try: post[key] = value.encode('utf-8') except: pass post = urlencode(post) request = urllib2.Request(url, data=post) _add_request_header(request, headers) try: response = urllib2.urlopen(request, timeout=int(timeout)) except urllib2.HTTPError as response: try: ignore = ignoreErrors and (int(response.code) == ignoreErrors or int( response.code) in ignoreErrors) except: ignore = False if not ignore: if response.code in [301, 307, 308, 503]: cf_result = response.read(5242880) try: encoding = response.info().getheader( 'Content-Encoding') except: encoding = None if encoding == 'gzip': cf_result = gzip.GzipFile( fileobj=StringIO(cf_result)).read() if flare and 'cloudflare' in str(response.info()).lower(): try: from openscrapers.modules import cfscrape if isinstance(post, dict): data = post else: try: data = parse_qs(post) except: data = None scraper = cfscrape.CloudScraper() response = scraper.request( method='GET' if post is None else 'POST', url=url, headers=headers, data=data, timeout=int(timeout)) result = response.content flare = 'cloudflare' # Used below try: cookies = response.request._cookies except: log_utils.error() except: log_utils.error() elif 'cf-browser-verification' in cf_result: netloc = '%s://%s' % (urlparse(url).scheme, 
urlparse(url).netloc) ua = headers['User-Agent'] cf = cache.get(cfcookie().get, 168, netloc, ua, timeout) headers['Cookie'] = cf request = urllib2.Request(url, data=post) _add_request_header(request, headers) response = urllib2.urlopen(request, timeout=int(timeout)) else: log_utils.log( 'Request-Error (%s): %s' % (str(response.code), url), log_utils.LOGDEBUG) if error is False: return None else: log_utils.log( 'Request-Error (%s): %s' % (str(response.code), url), log_utils.LOGDEBUG) if error is False: return None if output == 'cookie': try: result = '; '.join( ['%s=%s' % (i.name, i.value) for i in cookies]) except: pass try: result = cf except: pass if close is True: response.close() return result elif output == 'geturl': result = response.geturl() if close is True: response.close() return result elif output == 'headers': result = response.headers if close is True: response.close() return result elif output == 'chunk': try: content = int(response.headers['Content-Length']) except: content = (2049 * 1024) if content < (2048 * 1024): return result = response.read(16 * 1024) if close is True: response.close() return result if flare != 'cloudflare': if limit == '0': result = response.read(224 * 1024) elif limit is not None: result = response.read(int(limit) * 1024) else: result = response.read(5242880) try: encoding = response.info().getheader('Content-Encoding') except: encoding = None if encoding == 'gzip': result = gzip.GzipFile(fileobj=StringIO(result)).read() if 'sucuri_cloudproxy_js' in result: su = sucuri().get(result) headers['Cookie'] = su request = urllib2.Request(url, data=post) _add_request_header(request, headers) response = urllib2.urlopen(request, timeout=int(timeout)) if limit == '0': result = response.read(224 * 1024) elif limit is not None: result = response.read(int(limit) * 1024) else: result = response.read(5242880) try: encoding = response.info().getheader('Content-Encoding') except: encoding = None if encoding == 'gzip': result = gzip.GzipFile(fileobj=StringIO(result)).read() if 'Blazingfast.io' in result and 'xhr.open' in result: netloc = '%s://%s' % (urlparse(url).scheme, urlparse(url).netloc) ua = headers['User-Agent'] headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua, timeout) result = _basic_request(url, headers=headers, post=post, timeout=timeout, limit=limit) if output == 'extended': try: response_headers = dict([(item[0].title(), item[1]) for item in response.info().items()]) except: response_headers = response.headers try: response_code = str(response.code) except: response_code = str(response.status_code ) # object from CFScrape Requests object. try: cookie = '; '.join( ['%s=%s' % (i.name, i.value) for i in cookies]) except: pass try: cookie = cf except: pass if close is True: response.close() return (result, response_code, response_headers, headers, cookie) else: if close is True: response.close() return result except Exception as e: # log_utils.error() log_utils.log('Request-Error: (%s) => %s' % (str(e), url), log_utils.LOGDEBUG) return None
#coding=utf-8
'''
The basic urlopen() method does not support proxies, cookies, or other advanced
HTTP/HTTPS features. To support them:
  use the relevant Handler classes to create handler objects for the features you need;
  then pass those handler objects to urllib2.build_opener() to create a custom opener object;
  finally call the custom opener's open() method to send the request.
If every request in the program should go through the custom opener, urllib2.install_opener()
can register it as the global opener, so that any later call to urlopen() uses it
(choose according to your own needs).
'''
import urllib2

# Build an HTTPHandler object that handles HTTP requests
http_handler = urllib2.HTTPHandler()

# Build an HTTPSHandler object that handles HTTPS requests
# http_handler = urllib2.HTTPSHandler()

# With debuglevel=1 the debug log is switched on:
# while running, the headers of outgoing and incoming packets are printed to the screen automatically
# http_handler = urllib2.HTTPSHandler(debuglevel=1)

# Call urllib2.build_opener() to create an opener object that can handle HTTP requests
opener = urllib2.build_opener(http_handler)

# Build the Request
request = urllib2.Request("http://www.baidu.com/")

# Call the custom opener object's open() method to send the request
response = opener.open(request)

# Get the server's response content
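# A short continuation sketch (not part of the original tutorial code): read the body
# returned by the opener, and optionally install the opener globally so that plain
# urlopen() calls use it too, as the comment block above describes.
print response.read()

urllib2.install_opener(opener)
response = urllib2.urlopen("http://www.baidu.com/")
print response.read()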
def request(url, close=True, redirect=True, error=False, verify=True, proxy=None, post=None, headers=None, mobile=False, XHR=False, limit=None, referer=None, cookie=None, compression=True, output='', timeout='15'): try: if not url: return if not headers: if not mobile: headers = randomagent() else: headers = randommobileagent() handlers = [] if proxy is not None: handlers += [ urllib2.ProxyHandler({'http': '%s' % (proxy)}), urllib2.HTTPHandler ] opener = urllib2.build_opener(*handlers) urllib2.install_opener(opener) if output == 'cookie' or output == 'extended' or close is not True: cookies = cookielib.LWPCookieJar() handlers += [ urllib2.HTTPHandler(), urllib2.HTTPSHandler(), urllib2.HTTPCookieProcessor(cookies) ] opener = urllib2.build_opener(*handlers) urllib2.install_opener(opener) try: import platform is_XBOX = platform.uname()[1] == 'XboxOne' except Exception: is_XBOX = False if verify is False and sys.version_info >= (2, 7, 12): try: import ssl ssl_context = ssl._create_unverified_context() handlers += [urllib2.HTTPSHandler(context=ssl_context)] opener = urllib2.build_opener(*handlers) urllib2.install_opener(opener) except Exception: pass if verify is True and ((2, 7, 8) < sys.version_info < (2, 7, 12) or is_XBOX): try: import ssl ssl_context = ssl.create_default_context() ssl_context.check_hostname = False ssl_context.verify_mode = ssl.CERT_NONE handlers += [urllib2.HTTPSHandler(context=ssl_context)] opener = urllib2.build_opener(*handlers) urllib2.install_opener(opener) except Exception: pass if url.startswith('//'): url = 'http:' + url _headers = {} try: _headers.update(headers) except Exception: pass if 'User-Agent' in _headers: pass elif mobile is True: _headers['User-Agent'] = cache.get(randommobileagent, 1) else: _headers['User-Agent'] = cache.get(randomagent, 1) if 'Referer' in _headers: pass elif referer is not None: _headers['Referer'] = referer if 'Accept-Language' not in _headers: _headers['Accept-Language'] = 'en-US' if 'X-Requested-With' in _headers: pass elif XHR is True: _headers['X-Requested-With'] = 'XMLHttpRequest' if 'Cookie' in _headers: pass elif cookie is not None: _headers['Cookie'] = cookie if 'Accept-Encoding' in _headers: pass elif compression and limit is None: _headers['Accept-Encoding'] = 'gzip' if redirect is False: class NoRedirectHandler(urllib2.HTTPRedirectHandler): def http_error_302(self, req, fp, code, msg, headers): infourl = urllib.addinfourl(fp, headers, req.get_full_url()) infourl.status = code infourl.code = code return infourl http_error_300 = http_error_302 http_error_301 = http_error_302 http_error_303 = http_error_302 http_error_307 = http_error_302 opener = urllib2.build_opener(NoRedirectHandler()) urllib2.install_opener(opener) try: del _headers['Referer'] except Exception: pass if isinstance(post, dict): post = utils.byteify(post) post = urllib.urlencode(post) url = utils.byteify(url) request = urllib2.Request(url, data=post) _add_request_header(request, _headers) try: response = urllib2.urlopen(request, timeout=int(timeout)) except urllib2.HTTPError as response: if response.code == 503: cf_result = response.read() try: encoding = response.info().getheader('Content-Encoding') except Exception: encoding = None if encoding == 'gzip': cf_result = gzip.GzipFile( fileobj=StringIO.StringIO(cf_result)).read() if 'cf-browser-verification' in cf_result: while 'cf-browser-verification' in cf_result: netloc = '%s://%s/' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc) ua = _headers['User-Agent'] cf = cache.get(cfcookie().get, 1, 
netloc, ua, timeout) _headers['Cookie'] = cf request = urllib2.Request(url, data=post) _add_request_header(request, _headers) try: response = urllib2.urlopen(request, timeout=int(timeout)) cf_result = 'Success' except urllib2.HTTPError as response: cache.remove(cfcookie().get, netloc, ua, timeout) cf_result = response.read() else: log_utils.log( 'Request-Error (%s): %s' % (str(response.code), url), log_utils.LOGDEBUG) if error is False: return else: log_utils.log( 'Request-Error (%s): %s' % (str(response.code), url), log_utils.LOGDEBUG) if error is False: return if output == 'cookie': try: result = '; '.join( ['%s=%s' % (i.name, i.value) for i in cookies]) except Exception: pass try: result = cf except Exception: pass if close is True: response.close() return result elif output == 'geturl': result = response.geturl() if close is True: response.close() return result elif output == 'headers': result = response.headers if close is True: response.close() return result elif output == 'chunk': try: content = int(response.headers['Content-Length']) except Exception: content = (2049 * 1024) if content < (2048 * 1024): return result = response.read(16 * 1024) if close is True: response.close() return result elif output == 'file_size': try: content = int(response.headers['Content-Length']) except Exception: content = '0' response.close() return content if limit == '0': result = response.read(224 * 1024) elif limit is not None: result = response.read(int(limit) * 1024) else: result = response.read(5242880) try: encoding = response.info().getheader('Content-Encoding') except Exception: encoding = None if encoding == 'gzip': result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read() if 'sucuri_cloudproxy_js' in result: su = sucuri().get(result) _headers['Cookie'] = su request = urllib2.Request(url, data=post) _add_request_header(request, _headers) response = urllib2.urlopen(request, timeout=int(timeout)) if limit == '0': result = response.read(224 * 1024) elif limit is not None: result = response.read(int(limit) * 1024) else: result = response.read(5242880) try: encoding = response.info().getheader('Content-Encoding') except Exception: encoding = None if encoding == 'gzip': result = gzip.GzipFile( fileobj=StringIO.StringIO(result)).read() if 'Blazingfast.io' in result and 'xhr.open' in result: netloc = '%s://%s' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc) ua = _headers['User-Agent'] _headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua, timeout) result = _basic_request(url, headers=_headers, post=post, timeout=timeout, limit=limit) if output == 'extended': try: response_headers = dict([(item[0].title(), item[1]) for item in response.info().items()]) except Exception: response_headers = response.headers response_code = str(response.code) try: cookie = '; '.join( ['%s=%s' % (i.name, i.value) for i in cookies]) except Exception: pass try: cookie = cf except Exception: pass if close is True: response.close() return (result, response_code, response_headers, _headers, cookie) else: if close is True: response.close() return result except Exception as e: log_utils.log('Request-Error: (%s) => %s' % (str(e), url), log_utils.LOGDEBUG) return
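A hedged usage sketch of the request() helper defined above; the URLs and form fields are purely illustrative, and the helper's cache, log_utils and cfcookie dependencies are assumed to come from the surrounding add-on:

# plain body with the default options
html = request('http://www.example.com/')

# only the final URL after redirects
final_url = request('http://www.example.com/', output='geturl')

# body plus status code, response headers, request headers and cookie string
result, code, resp_headers, req_headers, cookie = request(
    'http://www.example.com/', output='extended', timeout='30')

# POST with a dict body and a bounded read (first 64 KB of the response)
page = request('http://www.example.com/login',
               post={'user': 'name', 'pass': 'secret'},
               limit='64')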
# -*- encoding:utf-8 -*-
'''
Created on 2012-10-21

@author: root
'''
"""
Send a data-stream to the stdin of a CGI and read
the data it returns to us
"""
import urllib2

# Enable debug logging so request/response headers are echoed to the console
httpHandler = urllib2.HTTPHandler(debuglevel=1)
httpsHandler = urllib2.HTTPSHandler(debuglevel=1)

opener = urllib2.build_opener(httpHandler, httpsHandler)
urllib2.install_opener(opener)

# Passing a data argument turns this into a POST request
req = urllib2.Request(url='http://10.1.1.10/cgi-bin/test.cgi',
                      data='this data is passed to stdin of the CGI')
f = urllib2.urlopen(req)
print f.read()
print f.info()

# The CGI program on the server side:
#!/usr/bin/env python
#import sys
#data = sys.stdin.read()
#print 'Content-length: 123\n\nContent-type: text-plain\n\nGot Data: "%s"' % data
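A small hedged sketch, reusing the same illustrative CGI address, that inspects the status code and individual response headers instead of dumping the whole f.info() block:

import urllib2

req = urllib2.Request(url='http://10.1.1.10/cgi-bin/test.cgi',
                      data='this data is passed to stdin of the CGI')
f = urllib2.urlopen(req)

# getcode() is the HTTP status; info() behaves like a case-insensitive header map
print f.getcode()
print f.info().getheader('Content-Type')
for name, value in f.info().items():
    print '%s: %s' % (name, value)
print f.read()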
def read_site_html(url_link): global Domain import requests, cookielib #check redirect headers = { 'Host': Domain, 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'he,he-IL;q=0.8,en-US;q=0.5,en;q=0.3', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', } handlers = [MyHTTPHandler] cookjar = cookielib.CookieJar() handlers += [ urllib2.HTTPHandler(), urllib2.HTTPSHandler(), urllib2.HTTPCookieProcessor(cookjar) ] opener = urllib2.build_opener(*handlers) request = urllib2.Request(url_link, headers=headers) try: html = opener.open(request).read() except: #xbmcgui.Dialog().ok("בא לי פרק","בעיה בחיבור לאתר\n החלף כתובת בהגדרות") r = requests.get(url_link) regex_domain = '//(.+?)/' mathc_domain = re.compile(regex_domain).findall(r.url) new_domain = mathc_domain[0] if new_domain != Domain: Addon.setSetting("domain", new_domain) Domain = Addon.getSetting("domain") xbmcgui.Dialog().ok( "סרט HD", " כתובת האתר הוחלפה ועודכנה פתח שנית " + '\n[COLOR aqua]' + Domain + '[/COLOR]') sys.exit() #html = urllib2.urlopen(request, timeout=int(30)).read() #html=requests.get(url_link,headers=headers) cookies = {} for item in cookjar: cookies[item.name] = item.value #a=a+1 try: first = Crypt('protect_own' + Domain) second = Crypt('protect_up' + Domain) third = Crypt('js' + Domain) oc1 = str(cookies[first]) oc2 = str(cookies[second]) co3 = Crypt(oc1 + oc2) + Crypt(oc2 + oc1) cookies = { third: (co3), first: oc1, second: oc2, 'expires': "14400", '__cfduid': cookies['__cfduid'], 'path': '/' } html = requests.get(url_link, headers=headers, cookies=cookies).content request = urllib2.Request(url_link, headers=headers) #html=requests.get(url_link,headers=headers,cookies=cookies) #html = opener.open(request, timeout=int(30)).read() return html #.encode('utf8') except: return html #.encode('utf8')
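read_site_html() above copies the cookielib jar into a plain dict before handing it to requests. A distilled sketch of that bridge, assuming both urllib2/cookielib and the requests package are available (the URL is illustrative):

import urllib2
import cookielib
import requests

cookie_jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))

# the first request populates the jar
opener.open('http://www.example.com/').read()

# convert the jar into the plain dict that requests.get(cookies=...) expects
cookies = dict((c.name, c.value) for c in cookie_jar)
html = requests.get('http://www.example.com/', cookies=cookies).content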
import datetime
import urllib2


def kelly(s, p, f):
    print 1.0 / (1.0 / s + 1.0 / p + 1.0 / f)


kelly(2.84, 3.4, 2.2)

s = datetime.datetime.strptime("2011-1-1 3:00", '%Y-%m-%d %H:%M')
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print s

str1 = "888"
c = type(str1)

request = urllib2.Request("http://www.500.com")
# To get the debug echo, an HTTPHandler has to be built by hand with debuglevel=1
opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1))
feeddata = opener.open(request).read()

loginUrl = "http://www.126.com"
request = urllib2.Request(loginUrl)
request.add_header(
    "User-Agent",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36"
)
request.add_header(
    "Accept",
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
)
request.add_header("Accept-Encoding", "gzip, deflate, sdch")
#request.add_header("X-Requested-With","XMLHttpRequest")
request.add_header("Accept-Language", "zh-CN,zh;q=0.8")
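The block above prepares request headers (including a gzip Accept-Encoding) but stops before sending. A minimal sketch, reusing the request object built above, of opening it through a debug opener and unpacking a gzip body if one comes back:

import gzip
import urllib2
from StringIO import StringIO

opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1))
response = opener.open(request)  # 'request' is the Request prepared above

body = response.read()
if response.info().getheader('Content-Encoding') == 'gzip':
    # the server honoured Accept-Encoding, so the body arrived compressed
    body = gzip.GzipFile(fileobj=StringIO(body)).read()
print response.getcode(), len(body)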
def getdata(url, params, headers=None, post=None, verbose=False, jsondecoder=True):
    """
    Invoke URL call and retrieve data from data-service based
    on provided URL and set of parameters. Use post=True to
    invoke POST request.
    """
    encoded_data = urllib.urlencode(params)
    if not post:
        if encoded_data:
            url = url + '?' + encoded_data
    if not headers:
        headers = {}
    if verbose:
        print '+++ getdata, url=%s, headers=%s' % (url, headers)
    obj = sys.version_info
    if obj[0] == 2 and obj[1] == 7 and obj[2] >= 9:
        # disable SSL verification, since it is default in python 2.7.9
        # and many CMS services do not verify SSL cert.
        # https://www.python.org/dev/peps/pep-0476/
        import ssl
        ssl._create_default_https_context = ssl._create_unverified_context
    req = urllib2.Request(url)
    for key, val in headers.iteritems():
        req.add_header(key, val)
    if verbose > 1:
        handler = urllib2.HTTPHandler(debuglevel=1)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    ckey, cert = get_key_cert()
    handler = HTTPSClientAuthHandler(ckey, cert, verbose)
    if verbose:
        print "handler", handler, handler.__dict__
    opener = urllib2.build_opener(handler)
    urllib2.install_opener(opener)
    try:
        if post:
            data = urllib2.urlopen(req, encoded_data)
        else:
            data = urllib2.urlopen(req)
        info = data.info()
        code = data.getcode()
        if verbose > 1:
            print "+++ response code:", code
            print "+++ response info\n", info
        if jsondecoder:
            data = json.load(data)
        else:
            data = data.read()
    except urllib2.HTTPError as httperror:
        msg = 'HTTPError, url=%s, args=%s, headers=%s' % (url, params, headers)
        data = {'error': 'Unable to contact %s' % url, 'reason': msg}
        try:
            data.update({'httperror': extract_http_error(httperror.read())})
        except Exception as exp:
            data.update({'httperror': None})
        data = json.dumps(data)
    except Exception as exp:
        msg = 'HTTPError, url=%s, args=%s, headers=%s, error=%s' \
            % (url, params, headers, str(exp))
        data = {'error': 'Unable to contact %s' % url, 'reason': msg}
        data = json.dumps(data)
    return data
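A hedged usage sketch of getdata(); the service URL and parameters are purely illustrative, and get_key_cert() / HTTPSClientAuthHandler are assumed to be supplied by the surrounding module:

# GET with JSON decoding (the default)
result = getdata('https://cmsweb.example.com/service/data',
                 params={'dataset': '/A/B/C', 'detail': 1},
                 verbose=True)

# POST, returning the raw body instead of decoded JSON
raw = getdata('https://cmsweb.example.com/service/data',
              params={'dataset': '/A/B/C'},
              post=True,
              jsondecoder=False)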
import csv
import re
import sys
import urllib
import urllib2
import getpass

if (len(sys.argv) < 2):
    print 'Usage: %s <word-file.csv>' % sys.argv[0]
    exit()

csvfile = open(sys.argv[1], 'rb')
csvobject = csv.reader(csvfile, delimiter='\t', quotechar='"')

# The credential prompt and login call were scrubbed in the source; this is a
# minimal reconstruction that posts the same form fields to the same login URL.
user = raw_input('User: ')
password = getpass.getpass('Password: ')
page = urllib2.urlopen('http://www.forvo.com/login/',
                       urllib.urlencode({
                           'login': user,
                           'password': password
                       }))
if (page.read().find('Login incorrect.') != -1):
    print "Error logging in"
    exit()
def get_data(host, query, idx, limit, debug, threshold=300, ckey=None, cert=None, das_headers=True): """Contact DAS server and retrieve data for given DAS query""" params = {'input': query, 'idx': idx, 'limit': limit} path = '/das/cache' pat = re.compile('http[s]{0,1}://') if not pat.match(host): msg = 'Invalid hostname: %s' % host raise Exception(msg) url = host + path headers = {"Accept": "application/json"} encoded_data = urllib.urlencode(params, doseq=True) url += '?%s' % encoded_data req = urllib2.Request(url=url, headers=headers) if ckey and cert: ckey = fullpath(ckey) cert = fullpath(cert) hdlr = HTTPSClientAuthHandler(ckey, cert, debug) else: hdlr = urllib2.HTTPHandler(debuglevel=debug) proxy_support = urllib2.ProxyHandler({}) opener = urllib2.build_opener(hdlr, proxy_support) fdesc = opener.open(req) data = fdesc.read() fdesc.close() pat = re.compile(r'^[a-z0-9]{32}') if data and isinstance(data, str) and pat.match(data) and len(data) == 32: pid = data elif data.find('"pid"') != -1 and data.find('"status"') != -1: pid = json.loads(data)['pid'] else: pid = None iwtime = 2 # initial waiting time in seconds wtime = 20 # final waiting time in seconds sleep = iwtime time0 = time.time() while pid: params.update({'pid': data}) encoded_data = urllib.urlencode(params, doseq=True) url = host + path + '?%s' % encoded_data req = urllib2.Request(url=url, headers=headers) try: fdesc = opener.open(req) data = fdesc.read() fdesc.close() except urllib2.HTTPError as err: return {"status": "fail", "reason": str(err)} if data and isinstance(data, str) and pat.match(data) and len(data) == 32: pid = data else: pid = None time.sleep(sleep) if sleep < wtime: sleep *= 2 elif sleep == wtime: sleep = iwtime # start new cycle else: sleep = wtime if (time.time() - time0) > threshold: reason = "client timeout after %s sec" % int(time.time() - time0) return {"status": "fail", "reason": reason} jsondict = json.loads(data) return jsondict
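get_data() above widens its sleep interval from iwtime up to wtime and then starts a new cycle; a minimal sketch of just that back-off schedule, with a hypothetical is_ready() callable standing in for the DAS pid check:

import time

def poll(is_ready, iwtime=2, wtime=20, threshold=300):
    """Call is_ready() with an increasing sleep until it returns True."""
    sleep = iwtime
    time0 = time.time()
    while not is_ready():
        time.sleep(sleep)
        if sleep < wtime:
            sleep *= 2          # back off: 2, 4, 8, 16, ...
        elif sleep == wtime:
            sleep = iwtime      # start a new cycle, as get_data() does
        else:
            sleep = wtime
        if (time.time() - time0) > threshold:
            return False        # give up, mirroring the client-timeout branch
    return True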
def downloadfile(url,nombrefichero,headers=[],silent=False,continuar=False): logger.info("[downloadtools.py] downloadfile: url="+url) logger.info("[downloadtools.py] downloadfile: nombrefichero="+nombrefichero) try: # Si no es XBMC, siempre a "Silent" try: import xbmcgui except: silent=True # antes #f=open(nombrefichero,"wb") try: import xbmc nombrefichero = xbmc.makeLegalFilename(nombrefichero) except: pass logger.info("[downloadtools.py] downloadfile: nombrefichero="+nombrefichero) # El fichero existe y se quiere continuar if os.path.exists(nombrefichero) and continuar: #try: # import xbmcvfs # f = xbmcvfs.File(nombrefichero) # existSize = f.size(nombrefichero) #except: f = open(nombrefichero, 'r+b') existSize = os.path.getsize(nombrefichero) logger.info("[downloadtools.py] downloadfile: el fichero existe, size=%d" % existSize) grabado = existSize f.seek(existSize) # el fichero ya existe y no se quiere continuar, se aborta elif os.path.exists(nombrefichero) and not continuar: logger.info("[downloadtools.py] downloadfile: el fichero existe, no se descarga de nuevo") return # el fichero no existe else: existSize = 0 logger.info("[downloadtools.py] downloadfile: el fichero no existe") #try: # import xbmcvfs # f = xbmcvfs.File(nombrefichero,"w") #except: f = open(nombrefichero, 'wb') grabado = 0 # Crea el diálogo de progreso if not silent: #progreso.create( "plugin" , "Descargando..." , os.path.basename(nombrefichero)+" desde "+urlparse.urlparse(url).hostname ) try: progreso = xbmcgui.DialogProgressBG() progreso.create("Descargas tvalacarta", "Descargando "+os.path.basename(nombrefichero)) except: progreso = xbmcgui.DialogProgress() progreso.create( "plugin" , "Descargando..." , url , os.path.basename(nombrefichero) ) else: progreso = "" try: xbmc.executebuiltin((u'XBMC.Notification("Descargas tvalacarta", "'+os.path.basename(nombrefichero)+'", 2000)')) except: pass # Login y password Filenium # http://abcd%40gmail.com:[email protected]/get/Oi8vd3d3/LmZpbGVz/ZXJ2ZS5j/b20vZmls/ZS9kTnBL/dm11/b0/?.zip if "filenium" in url: from servers import filenium url , authorization_header = filenium.extract_authorization_header(url) headers.append( [ "Authorization", authorization_header ] ) if "|" in url: additional_headers = url.split("|")[1] if "&" in additional_headers: additional_headers = additional_headers.split("&") else: additional_headers = [ additional_headers ] for additional_header in additional_headers: logger.info("[downloadtools.py] additional_header: "+additional_header) name = re.findall( "(.*?)=.*?" 
, additional_header )[0] value = urllib.unquote_plus(re.findall( ".*?=(.*?)$" , additional_header )[0]) headers.append( [ name,value ] ) url = url.split("|")[0] logger.info("[downloadtools.py] downloadfile: url="+url) # Timeout del socket a 60 segundos socket.setdefaulttimeout(60) h=urllib2.HTTPHandler(debuglevel=0) request = urllib2.Request(url) for header in headers: logger.info("[downloadtools.py] Header="+header[0]+": "+header[1]) request.add_header(header[0],header[1]) if existSize > 0: request.add_header('Range', 'bytes=%d-' % (existSize, )) opener = urllib2.build_opener(h) urllib2.install_opener(opener) try: connexion = opener.open(request) except urllib2.HTTPError,e: logger.info("[downloadtools.py] downloadfile: error %d (%s) al abrir la url %s" % (e.code,e.msg,url)) #print e.code #print e.msg #print e.hdrs #print e.fp f.close() if not silent: progreso.close() # El error 416 es que el rango pedido es mayor que el fichero => es que ya está completo if e.code==416: return 0 else: return -2 try: totalfichero = int(connexion.headers["Content-Length"]) except: totalfichero = 1 if existSize > 0: totalfichero = totalfichero + existSize logger.info("Content-Length=%s" % totalfichero) blocksize = 100*1024 bloqueleido = connexion.read(blocksize) logger.info("Iniciando descarga del fichero, bloqueleido=%s" % len(bloqueleido)) maxreintentos = 10 while len(bloqueleido)>0: try: # Escribe el bloque leido #try: # import xbmcvfs # f.write( bloqueleido ) #except: f.write(bloqueleido) grabado = grabado + len(bloqueleido) percent = int(float(grabado)*100/float(totalfichero)) totalmb = float(float(totalfichero)/(1024*1024)) descargadosmb = float(float(grabado)/(1024*1024)) # Lee el siguiente bloque, reintentando para no parar todo al primer timeout reintentos = 0 while reintentos <= maxreintentos: try: before = time.time() bloqueleido = connexion.read(blocksize) after = time.time() if (after - before) > 0: velocidad=len(bloqueleido)/((after - before)) falta=totalfichero-grabado if velocidad>0: tiempofalta=falta/velocidad else: tiempofalta=0 #logger.info(sec_to_hms(tiempofalta)) if not silent: #progreso.update( percent , "Descargando %.2fMB de %.2fMB (%d%%)" % ( descargadosmb , totalmb , percent),"Falta %s - Velocidad %.2f Kb/s" % ( sec_to_hms(tiempofalta) , velocidad/1024 ), os.path.basename(nombrefichero) ) progreso.update( percent , "%.2fMB/%.2fMB (%d%%) %.2f Kb/s %s falta " % ( descargadosmb , totalmb , percent , velocidad/1024 , sec_to_hms(tiempofalta))) break try: if xbmc.abortRequested: logger.error( "XBMC Abort requested 1" ) return -1 except: pass except: try: if xbmc.abortRequested: logger.error( "XBMC Abort requested 2" ) return -1 except: pass reintentos = reintentos + 1 logger.info("ERROR en la descarga del bloque, reintento %d" % reintentos) import traceback logger.error( traceback.format_exc() ) # El usuario cancelo la descarga try: if progreso.iscanceled(): logger.info("Descarga del fichero cancelada") f.close() progreso.close() return -1 except: pass # Ha habido un error en la descarga if reintentos > maxreintentos: logger.info("ERROR en la descarga del fichero") f.close() if not silent: progreso.close() return -2 except: import traceback logger.error( traceback.format_exc() ) f.close() if not silent: progreso.close() #advertencia = xbmcgui.Dialog() #resultado = advertencia.ok('Error al descargar' , 'Se ha producido un error' , 'al descargar el archivo') return -2
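downloadfile() above resumes partial downloads with a Range header and treats HTTP 416 as "already complete"; a stripped-down sketch of that resume logic with an illustrative URL and local path:

import os
import urllib2

def resume_download(url, path):
    # a minimal sketch of the Range/416 pattern used by downloadfile() above
    exist_size = os.path.getsize(path) if os.path.exists(path) else 0
    request = urllib2.Request(url)
    if exist_size > 0:
        request.add_header('Range', 'bytes=%d-' % exist_size)
    try:
        connexion = urllib2.urlopen(request)
    except urllib2.HTTPError, e:
        if e.code == 416:
            return 0            # requested range is past the end: file already complete
        raise
    f = open(path, 'r+b' if exist_size else 'wb')
    f.seek(exist_size)
    block = connexion.read(100 * 1024)
    while len(block) > 0:
        f.write(block)
        block = connexion.read(100 * 1024)
    f.close()
    return 0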
def downloadpage( url, post=None, headers=[[ 'User-Agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; es-ES; rv:1.9.2.12) Gecko/20101026 Firefox/3.6.12' ]], follow_redirects=True, timeout=socket.getdefaulttimeout()): if (DEBUG == True): logger.info("[scrapertools.py] downloadpage") if (DEBUG == True): logger.info("[scrapertools.py] url=" + url) if post is not None: if (DEBUG == True): logger.info("[scrapertools.py] post=" + post) else: if (DEBUG == True): logger.info("[scrapertools.py] post=None") # --------------------------------- # Instala las cookies # --------------------------------- # Inicializa la librería de las cookies ficherocookies = os.path.join(config.get_setting("cookies.dir"), 'cookies.dat') if (DEBUG == True): logger.info("[scrapertools.py] ficherocookies=" + ficherocookies) cj = None ClientCookie = None cookielib = None # Let's see if cookielib is available try: if (DEBUG == True): logger.info("[scrapertools.py] Importando cookielib") import cookielib except ImportError: if (DEBUG == True): logger.info("[scrapertools.py] cookielib no disponible") # If importing cookielib fails # let's try ClientCookie try: if (DEBUG == True): logger.info("[scrapertools.py] Importando ClientCookie") import ClientCookie except ImportError: if (DEBUG == True): logger.info("[scrapertools.py] ClientCookie no disponible") # ClientCookie isn't available either urlopen = urllib2.urlopen Request = urllib2.Request else: if (DEBUG == True): logger.info("[scrapertools.py] ClientCookie disponible") # imported ClientCookie urlopen = ClientCookie.urlopen Request = ClientCookie.Request cj = ClientCookie.MozillaCookieJar() else: if (DEBUG == True): logger.info("[scrapertools.py] cookielib disponible") # importing cookielib worked urlopen = urllib2.urlopen Request = urllib2.Request cj = cookielib.MozillaCookieJar() # This is a subclass of FileCookieJar # that has useful load and save methods if cj is not None: # we successfully imported # one of the two cookie handling modules if (DEBUG == True): logger.info("[scrapertools.py] Hay cookies") if os.path.isfile(ficherocookies): if (DEBUG == True): logger.info("[scrapertools.py] Leyendo fichero cookies") # if we have a cookie file already saved # then load the cookies into the Cookie Jar try: cj.load(ficherocookies) except: if (DEBUG == True): logger.info( "[scrapertools.py] El fichero de cookies existe pero es ilegible, se borra" ) os.remove(ficherocookies) # Now we need to get our Cookie Jar # installed in the opener; # for fetching URLs if cookielib is not None: if (DEBUG == True): logger.info( "[scrapertools.py] opener usando urllib2 (cookielib)") # if we use cookielib # then we get the HTTPCookieProcessor # and install the opener in urllib2 if not follow_redirects: opener = urllib2.build_opener( urllib2.HTTPHandler(debuglevel=DEBUG_LEVEL), urllib2.HTTPCookieProcessor(cj), NoRedirectHandler()) else: opener = urllib2.build_opener( urllib2.HTTPHandler(debuglevel=DEBUG_LEVEL), urllib2.HTTPCookieProcessor(cj)) urllib2.install_opener(opener) else: if (DEBUG == True): logger.info("[scrapertools.py] opener usando ClientCookie") # if we use ClientCookie # then we get the HTTPCookieProcessor # and install the opener in ClientCookie opener = ClientCookie.build_opener( ClientCookie.HTTPCookieProcessor(cj)) ClientCookie.install_opener(opener) # ------------------------------------------------- # Cookies instaladas, lanza la petición # ------------------------------------------------- # Contador inicio = time.clock() # Diccionario para las cabeceras txheaders = 
{} # Construye el request if post is None: if (DEBUG == True): logger.info("[scrapertools.py] petición GET") else: if (DEBUG == True): logger.info("[scrapertools.py] petición POST") # Añade las cabeceras if (DEBUG == True): logger.info("[scrapertools.py] ---------------------------") for header in headers: if (DEBUG == True): logger.info("[scrapertools.py] header %s=%s" % (str(header[0]), str(header[1]))) txheaders[header[0]] = header[1] if (DEBUG == True): logger.info("[scrapertools.py] ---------------------------") req = Request(url, post, txheaders) if timeout is None: handle = urlopen(req) else: #Disponible en python 2.6 en adelante --> handle = urlopen(req, timeout=timeout) #Para todas las versiones: deftimeout = socket.getdefaulttimeout() try: socket.setdefaulttimeout(timeout) handle = urlopen(req) except: import sys for line in sys.exc_info(): logger.error("%s" % line) socket.setdefaulttimeout(deftimeout) # Actualiza el almacén de cookies cj.save(ficherocookies) # Lee los datos y cierra data = handle.read() info = handle.info() if (DEBUG == True): logger.info("[scrapertools.py] Respuesta") if (DEBUG == True): logger.info("[scrapertools.py] ---------------------------") for header in info: if (DEBUG == True): logger.info("[scrapertools.py] " + header + "=" + info[header]) handle.close() if (DEBUG == True): logger.info("[scrapertools.py] ---------------------------") ''' # Lanza la petición try: response = urllib2.urlopen(req) # Si falla la repite sustituyendo caracteres especiales except: req = urllib2.Request(url.replace(" ","%20")) # Añade las cabeceras for header in headers: req.add_header(header[0],header[1]) response = urllib2.urlopen(req) ''' # Tiempo transcurrido fin = time.clock() if (DEBUG == True): logger.info("[scrapertools.py] Descargado en %d segundos " % (fin - inicio + 1)) return data
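The core of downloadpage() is a MozillaCookieJar loaded from disk before the request and saved again afterwards; a minimal standalone sketch of that round trip (the cookie file path and URL are illustrative):

import os
import urllib2
import cookielib

cookie_file = '/tmp/cookies.dat'        # illustrative path
cj = cookielib.MozillaCookieJar()
if os.path.isfile(cookie_file):
    try:
        cj.load(cookie_file)
    except Exception:
        os.remove(cookie_file)          # unreadable jar: drop it, as downloadpage() does

opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)

data = urllib2.urlopen('http://www.example.com/').read()
cj.save(cookie_file)                    # persist any cookies the site set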
import os
import sys
import cookielib
import urllib
import urllib2
import json
import time

cookies = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
                              urllib2.HTTPHandler(debuglevel=0),
                              urllib2.HTTPSHandler(debuglevel=0),
                              urllib2.HTTPCookieProcessor(cookies))
opener.addheaders = [('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                                     'Windows NT 5.2; .NET CLR 1.1.4322)'))]

site = 'https://stitcher.ncats.io/'
site = 'http://localhost:8080/'


def requestJson(uri):
    try:
        handle = opener.open(uri)
        response = handle.read()
        handle.close()
        obj = json.loads(response)
        return obj
    except:
        sys.stderr.write("failed: " + uri + "\n")
        sys.stderr.flush()
        time.sleep(5)
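A hedged usage sketch of requestJson(); the endpoint path is hypothetical and only illustrates that the helper returns parsed JSON, or None after logging a failure:

# hypothetical endpoint under the configured site
obj = requestJson(site + 'api/stitches/latest')   # path is illustrative only
if obj is not None:
    print json.dumps(obj, indent=2)[:200]
else:
    # requestJson() already wrote the failure to stderr and slept briefly
    print 'no data returned'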
def request(url, close=True, error=False, proxy=None, post=None, headers=None, mobile=False, safe=False, referer=None, cookie=None, output='', timeout='30'): try: handlers = [] if not proxy == None: handlers += [ urllib2.ProxyHandler({'http': '%s' % (proxy)}), urllib2.HTTPHandler ] opener = urllib2.build_opener(*handlers) opener = urllib2.install_opener(opener) if output == 'cookie' or output == 'extended' or not close == True: import cookielib cookies = cookielib.LWPCookieJar() handlers += [ urllib2.HTTPHandler(), urllib2.HTTPSHandler(), urllib2.HTTPCookieProcessor(cookies) ] opener = urllib2.build_opener(*handlers) opener = urllib2.install_opener(opener) try: if sys.version_info < (2, 7, 9): raise Exception() import ssl ssl_context = ssl.create_default_context() ssl_context.check_hostname = False ssl_context.verify_mode = ssl.CERT_NONE handlers += [urllib2.HTTPSHandler(context=ssl_context)] opener = urllib2.build_opener(*handlers) opener = urllib2.install_opener(opener) except: pass try: headers.update(headers) except: headers = {} if 'User-Agent' in headers: pass elif not mobile == True: #headers['User-Agent'] = agent() headers['User-Agent'] = cache.get(randomagent, 1) else: headers['User-Agent'] = 'Apple-iPhone/701.341' if 'Referer' in headers: pass elif referer == None: headers['Referer'] = '%s://%s/' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc) else: headers['Referer'] = referer if not 'Accept-Language' in headers: headers['Accept-Language'] = 'en-US' if 'Cookie' in headers: pass elif not cookie == None: headers['Cookie'] = cookie request = urllib2.Request(url, data=post, headers=headers) try: response = urllib2.urlopen(request, timeout=int(timeout)) except urllib2.HTTPError as response: if error == False: return if output == 'cookie': result = [] for c in cookies: result.append('%s=%s' % (c.name, c.value)) result = "; ".join(result) elif output == 'response': if safe == True: result = (str(response.code), response.read(224 * 1024)) else: result = (str(response.code), response.read()) elif output == 'chunk': try: content = int(response.headers['Content-Length']) except: content = (2049 * 1024) if content < (2048 * 1024): return result = response.read(16 * 1024) elif output == 'title': result = response.read(1 * 1024) result = parseDOM(result, 'title')[0] elif output == 'extended': cookie = [] for c in cookies: cookie.append('%s=%s' % (c.name, c.value)) cookie = "; ".join(cookie) content = response.headers result = response.read() return (result, headers, content, cookie) elif output == 'geturl': result = response.geturl() elif output == 'headers': content = response.headers return content else: if safe == True: result = response.read(224 * 1024) else: result = response.read() if close == True: response.close() return result except: return
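request() above disables certificate verification on Python 2.7.9+ by passing an unverified ssl context to HTTPSHandler (PEP 476 made verification the default in 2.7.9); a minimal standalone sketch of that pattern with an illustrative URL:

import sys
import ssl
import urllib2

handlers = []
if sys.version_info >= (2, 7, 9):
    # 2.7.9 turned on certificate verification by default; an unverified
    # context restores the old permissive behaviour for HTTPS requests
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE
    handlers.append(urllib2.HTTPSHandler(context=ssl_context))

opener = urllib2.build_opener(*handlers)
urllib2.install_opener(opener)
print urllib2.urlopen('https://www.example.com/', timeout=30).getcode()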