    'ym': '', 'neeaID': '', 'cities': '', 'citiesNames': '',
    'whichFirst': 'AS', 'isFilter': 0, 'isSearch': 1
}
QUERY_LIST = []
IS_LOGIN = False
WATCH_FLAG = False
POST_DATA = {'neeaID': '', 'pwd': ''}
CJ = CookieJar()
opener = build_opener(HTTPCookieProcessor(CJ), HTTPHandler)


class Common(object):
    """Global config object."""

    def __init__(self):
        """Load config from __config__."""
        self.CONFIG = ConfigParser()
        self.CONFIG.read(os.path.join(os.getcwd(), __config__), encoding='utf-8')
        self.MAIL_HOST = self.CONFIG.get('email', 'host')
        self.MAIL_USER = self.CONFIG.get('email', 'user')
        self.MAIL_PASS = self.CONFIG.get('email', 'pass')
        self.SENDER = self.CONFIG.get('email', 'user')
        self.RECEIVERS = self.CONFIG.get('email', 'receivers')
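
# --- Hedged sketch: the INI layout Common() above appears to expect. ---
# The section and option names come straight from the CONFIG.get() calls;
# the values and the 'config.ini' filename are illustrative assumptions.
SAMPLE_CONFIG = """\
[email]
host = smtp.example.com
user = sender@example.com
pass = app-password
receivers = alice@example.com
"""

def write_sample_config(path='config.ini'):
    # Write a template config file compatible with Common.__init__ above.
    with open(path, 'w', encoding='utf-8') as fh:
        fh.write(SAMPLE_CONFIG)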
def get_new_cookie(self):
    # Start by prompting user to input their credentials
    # Another Python2/3 workaround
    try:
        new_username = raw_input("Username: ")
    except NameError:
        new_username = input("Username: ")
    new_password = getpass.getpass(prompt="Password (will not be displayed): ")

    # Build URS4 Cookie request
    auth_cookie_url = self.asf_urs4['url'] + '?client_id=' + self.asf_urs4['client'] + \
        '&redirect_uri=' + self.asf_urs4['redir'] + '&response_type=code&state='

    try:
        # python2
        user_pass = base64.b64encode(bytes(new_username + ":" + new_password))
    except TypeError:
        # python3
        user_pass = base64.b64encode(bytes(new_username + ":" + new_password, "utf-8"))
    user_pass = user_pass.decode("utf-8")

    # Authenticate against URS, grab all the cookies
    self.cookie_jar = MozillaCookieJar()
    opener = build_opener(HTTPCookieProcessor(self.cookie_jar), HTTPHandler(),
                          HTTPSHandler(**self.context))
    request = Request(auth_cookie_url,
                      headers={"Authorization": "Basic {0}".format(user_pass)})

    # Watch out cookie rejection!
    try:
        response = opener.open(request)
    except HTTPError as e:
        if "WWW-Authenticate" in e.headers and "Please enter your Earthdata Login credentials" in e.headers["WWW-Authenticate"]:
            print(" > Username and Password combo was not successful. Please try again.")
            return False
        else:
            # If an error happens here, the user most likely has not confirmed EULA.
            print("\nIMPORTANT: There was an error obtaining a download cookie!")
            print("Your user appears to lack permission to download data from the ASF Datapool.")
            print("\n\nNew users: you must first log into Vertex and accept the EULA. In addition, your Study Area must be set at Earthdata https://urs.earthdata.nasa.gov")
            exit(-1)
    except URLError:
        print("\nIMPORTANT: There was a problem communicating with URS, unable to obtain cookie.")
        print("Try cookie generation later.")
        exit(-1)

    # Did we get a cookie?
    if self.check_cookie_is_logged_in(self.cookie_jar):
        # COOKIE SUCCESS!
        self.cookie_jar.save(self.cookie_jar_path)
        return True

    # If we aren't successful generating the cookie, nothing will work. Stop here!
    print("WARNING: Could not generate new cookie! Cannot proceed. Please try Username and Password again.")
    print("Response was {0}.".format(response.getcode()))
    print("\n\nNew users: you must first log into Vertex and accept the EULA. In addition, your Study Area must be set at Earthdata https://urs.earthdata.nasa.gov")
    exit(-1)
def new_opener(self):
    from cookielib import CookieJar
    from urllib2 import build_opener, HTTPCookieProcessor
    return build_opener(HTTPCookieProcessor(CookieJar()))
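
# --- Hedged sketch: a version-agnostic variant of new_opener above. ---
# cookielib/urllib2 only exist on Python 2; this falls back to the
# Python 3 module names. A sketch, not part of the original class.
def new_opener_compat():
    try:  # Python 2
        from cookielib import CookieJar
        from urllib2 import build_opener, HTTPCookieProcessor
    except ImportError:  # Python 3
        from http.cookiejar import CookieJar
        from urllib.request import build_opener, HTTPCookieProcessor
    return build_opener(HTTPCookieProcessor(CookieJar()))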
def request(url, close=True, redirect=True, error=False, proxy=None, post=None,
            headers=None, mobile=False, XHR=False, limit=None, referer=None,
            cookie=None, compression=True, output='', timeout='30',
            ignoreSsl=False, flare=True, ignoreErrors=None):
    try:
        if url is None:
            return None

        handlers = []

        if proxy is not None:
            handlers += [ProxyHandler({'http': '%s' % (proxy)}), HTTPHandler]
            opener = build_opener(*handlers)
            opener = install_opener(opener)

        if output == 'cookie' or output == 'extended' or not close is True:
            cookies = cookielib.LWPCookieJar()
            handlers += [HTTPHandler(), HTTPSHandler(), HTTPCookieProcessor(cookies)]
            opener = build_opener(*handlers)
            opener = install_opener(opener)

        if ignoreSsl or ((2, 7, 8) < sys.version_info < (2, 7, 12)):
            try:
                import ssl
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                handlers += [HTTPSHandler(context=ssl_context)]
                opener = build_opener(*handlers)
                opener = install_opener(opener)
            except:
                pass

        if url.startswith('//'):
            url = 'http:' + url

        try:
            headers.update(headers)
        except:
            headers = {}

        if 'User-Agent' in headers:
            pass
        elif mobile is not True:
            # headers['User-Agent'] = agent()
            headers['User-Agent'] = cache.get(randomagent, 1)
        else:
            headers['User-Agent'] = 'Apple-iPhone/701.341'

        if 'Referer' in headers:
            pass
        elif referer is not None:
            headers['Referer'] = referer

        if 'Accept-Language' not in headers:
            headers['Accept-Language'] = 'en-US'

        if 'X-Requested-With' in headers:
            pass
        elif XHR is True:
            headers['X-Requested-With'] = 'XMLHttpRequest'

        if 'Cookie' in headers:
            pass
        elif cookie is not None:
            headers['Cookie'] = cookie

        if 'Accept-Encoding' in headers:
            pass
        elif compression and limit is None:
            headers['Accept-Encoding'] = 'gzip'

        if redirect is False:
            class NoRedirection(HTTPErrorProcessor):
                def http_response(self, request, response):
                    return response

            opener = build_opener(NoRedirection)
            opener = install_opener(opener)

            try:
                del headers['Referer']
            except:
                pass

        if isinstance(post, dict):
            # Gets rid of the error: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
            try:
                iter_items = post.iteritems()
            except:
                iter_items = post.items()
            for key, value in iter_items:
                try:
                    post[key] = value.encode('utf-8')
                except:
                    pass
            post = urlencode(post)

        request = Request(url, data=post)
        _add_request_header(request, headers)

        try:
            response = urlopen(request, timeout=int(timeout))
        except HTTPError as response:
            try:
                ignore = ignoreErrors and (int(response.code) == ignoreErrors
                                           or int(response.code) in ignoreErrors)
            except:
                ignore = False

            if not ignore:
                if response.code in [301, 307, 308, 503]:
                    cf_result = response.read(5242880)
                    try:
                        encoding = response.info().getheader('Content-Encoding')
                    except:
                        encoding = None
                    if encoding == 'gzip':
                        cf_result = gzip.GzipFile(fileobj=StringIO(cf_result)).read()

                    if flare and 'cloudflare' in str(response.info()).lower():
                        log_utils.log('client module calling cfscrape: url=%s' % url,
                                      log_utils.LOGDEBUG)
                        try:
                            from openscrapers.modules import cfscrape
                            if isinstance(post, dict):
                                data = post
                            else:
                                try:
                                    data = parse_qs(post)
                                except:
                                    data = None
                            scraper = cfscrape.CloudScraper()
                            response = scraper.request(method='GET' if post is None else 'POST',
                                                       url=url, headers=headers, data=data,
                                                       timeout=int(timeout))
                            result = response.content
                            flare = 'cloudflare'  # Used below
                            try:
                                cookies = response.request._cookies
                            except:
                                log_utils.error()
                        except:
                            log_utils.error()
                    elif 'cf-browser-verification' in cf_result:
                        netloc = '%s://%s' % (urlparse(url).scheme, urlparse(url).netloc)
                        ua = headers['User-Agent']
                        cf = cache.get(cfcookie().get, 168, netloc, ua, timeout)
                        headers['Cookie'] = cf
                        request = Request(url, data=post)
                        _add_request_header(request, headers)
                        response = urlopen(request, timeout=int(timeout))
                    else:
                        log_utils.log('Request-Error (%s): %s' % (str(response.code), url),
                                      log_utils.LOGDEBUG)
                        if error is False:
                            return
                else:
                    log_utils.log('Request-Error (%s): %s' % (str(response.code), url),
                                  log_utils.LOGDEBUG)
                    if error is False:
                        return

        if output == 'cookie':
            try:
                result = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                result = cf
            except:
                pass
            if close is True:
                response.close()
            return result
        elif output == 'geturl':
            result = response.geturl()
            if close is True:
                response.close()
            return result
        elif output == 'headers':
            result = response.headers
            if close is True:
                response.close()
            return result
        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)
            if content < (2048 * 1024):
                return
            result = response.read(16 * 1024)
            if close is True:
                response.close()
            return result
        elif output == 'file_size':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = '0'
            response.close()
            return content

        if flare != 'cloudflare':
            if limit == '0':
                result = response.read(224 * 1024)
            elif limit is not None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

        try:
            encoding = response.info().getheader('Content-Encoding')
        except:
            encoding = None
        if encoding == 'gzip':
            result = gzip.GzipFile(fileobj=StringIO(result)).read()

        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)
            headers['Cookie'] = su
            request = Request(url, data=post)
            _add_request_header(request, headers)
            response = urlopen(request, timeout=int(timeout))
            if limit == '0':
                result = response.read(224 * 1024)
            elif limit is not None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)
            try:
                encoding = response.info().getheader('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO(result)).read()

        if 'Blazingfast.io' in result and 'xhr.open' in result:
            netloc = '%s://%s' % (urlparse(url).scheme, urlparse(url).netloc)
            ua = headers['User-Agent']
            headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua, timeout)
            result = _basic_request(url, headers=headers, post=post,
                                    timeout=timeout, limit=limit)

        if output == 'extended':
            try:
                response_headers = dict([(item[0].title(), item[1])
                                         for item in response.info().items()])
            except:
                response_headers = response.headers
            try:
                response_code = str(response.code)
            except:
                response_code = str(response.status_code)  # object from CFScrape Requests object.
            try:
                cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                cookie = cf
            except:
                pass
            if close is True:
                response.close()
            return (result, response_code, response_headers, headers, cookie)
        else:
            if close is True:
                response.close()
            return result
    except Exception as e:
        log_utils.log('Request-Error: (%s) => %s' % (str(e), url), log_utils.LOGDEBUG)
        return
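
# --- Hedged sketch: typical ways the request() helper above gets called. ---
# URLs are placeholders; the keyword semantics are taken from the parameter
# handling in the function body itself, not from the project's docs.
# html = request('https://example.com/page')                     # plain GET, gzip allowed
# hdrs = request('https://example.com/page', output='headers')   # headers only
# body, code, rheaders, sheaders, ck = request(
#     'https://example.com/login', post={'user': 'a', 'pw': 'b'},
#     output='extended', XHR=True)                               # POST, full tuple back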
def __init__(self, url, **kwargs):
    """Request init."""
    self.request = None
    self.response = None
    self.code = -1
    self.header = {}
    self.cookieJar = None
    self.reason = ''
    self.content = ''
    self.content_dict = {}
    # Whether to decode the server response from JSON into a dict.
    self.is_decode_response = kwargs.get('is_decode_response', False)

    data = kwargs.get('data', None)
    # For a GET request that also passes a data dict, post_type defaults to
    # 'form': the data is urlencoded and appended to the request URL.
    post_type = kwargs.get('post_type', 'form')
    if data is not None:
        if isinstance(data, dict):
            if post_type == 'json':
                data_str = json.dumps(data)
            else:
                # data = {"name": "meetbill", "age": "21"} ==> urlencode(data) = 'age=21&name=meetbill'
                data_str = urlencode(data)
        else:
            data_str = data  # already a string
        if not isinstance(data_str, basestring):
            raise ValueError('data must be string or dict')
    else:
        data_str = None

    request_type = kwargs.get('type', 'POST')
    if data_str and isinstance(request_type, basestring) and request_type.upper() != 'POST':
        # For GET requests, fold the data into the URL.
        url = '{}?{}'.format(url, data_str)
        data_str = None  # GET data must be None
    self.request = urlRequest(url, data_str)

    # Content-type defaults to 'application/x-www-form-urlencoded'.
    if request_type.upper() == 'POST' and post_type == "json":
        self.request.add_header('Content-type', 'application/json')

    # referer
    referer = kwargs.get('referer', None)
    if referer:
        self.request.add_header('referer', referer)

    # user-agent
    user_agent = kwargs.get('user_agent', None)
    if user_agent:
        self.request.add_header('User-Agent', user_agent)

    # auth
    auth = kwargs.get('auth', None)
    if auth and isinstance(auth, dict) and 'usr' in auth:
        auth_string = base64.b64encode('{}:{}'.format(auth.get('usr', ''),
                                                      auth.get('pwd', '')))
        self.request.add_header('Authorization', 'Basic {}'.format(auth_string))

    # cookie
    cookie = kwargs.get('cookie', None)
    cj = None
    if cookie:
        if isinstance(cookie, CookieJar):
            cj = cookie
        elif isinstance(cookie, dict):
            result = []
            for k, v in cookie.items():
                result.append('{}={}'.format(k, v))
            cookie = '; '.join(result)
        elif isinstance(cookie, Cookie.BaseCookie):
            cookie = cookie.output(header='')
        if isinstance(cookie, basestring):
            self.request.add_header('Cookie', cookie)
    if cj is None:
        cj = CookieJar()

    # TODO: proxy

    # build opener
    debuglevel = 1 if kwargs.get('debug', False) else 0
    opener = build_opener(HTTPHandler(debuglevel=debuglevel),
                          HTTPSHandler(debuglevel=debuglevel),
                          HTTPCookieProcessor(cj))

    # timeout
    timeout = kwargs.get('timeout')
    if not isinstance(timeout, int):
        timeout = _DEFAULT_TIMEOUT

    t_beginning = time.time()
    try:
        # opener.open accepts a URL or a Request object: a plain string is
        # treated as a URL, anything else as an already-built Request.
        self.response = opener.open(self.request, timeout=timeout)
        self.code = self.response.getcode()
        self.header = self.response.info().dict
        self.cookieJar = cj
        self.content = self.response.read()
        # Optionally decode the response into a dict.
        if self.is_decode_response:
            self.content_dict = json.loads(self.content)
            # Check whether the response content matches expectations.
            check_key = kwargs.get('check_key', None)
            check_value = kwargs.get('check_value', None)
            if check_key is not None and check_value is not None:
                # check_value may be a list of acceptable values.
                if isinstance(check_value, list):
                    if self.content_dict[check_key] not in check_value:
                        self.code = -1
                        self.reason = "[response not match: {response_value} not in {check_value}]".format(
                            response_value=self.content_dict[check_key],
                            check_value=check_value)
                elif self.content_dict[check_key] != check_value:
                    self.code = -1
                    self.reason = "[response not match: {response_value} != {check_value}]".format(
                        response_value=self.content_dict[check_key],
                        check_value=check_value)
    except HTTPError as e:
        self.code = e.code
        self.reason = '{}'.format(e)
    except URLError as e:
        self.code = -1
        self.reason = e.reason
    except Exception as e:
        self.code = -1
        self.reason = '{}'.format(e)

    seconds_passed = time.time() - t_beginning
    cost_str = "%.6f" % seconds_passed

    # Write the access log.
    f = inspect.currentframe().f_back
    file_name, lineno, func_name = self._get_backframe_info(f)
    log_msg = ("[file={file_name}:{func_name}:{lineno} "
               "type=http_{method} "
               "req_path={req_path} "
               "req_data={req_data} "
               "cost={cost} "
               "is_success={is_success} "
               "err_no={err_no} "
               "err_msg={err_msg} "
               "res_len={res_len} "
               "res_data={res_data} "
               "res_attr={res_attr}]".format(file_name=file_name,
                                             func_name=func_name,
                                             lineno=lineno,
                                             method=request_type,
                                             req_path=url,
                                             req_data=data,
                                             cost=cost_str,
                                             is_success=self.success(),
                                             err_no=self.code,
                                             err_msg=self.reason,
                                             res_len=len(self.content),
                                             res_data=self.content,
                                             res_attr=json.dumps(self.header)))
    if self.success():
        log.info(log_msg)
    else:
        log.error(log_msg)
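
# --- Hedged sketch (Python 3 names): the debug-opener trick used above,
# standalone. With debuglevel=1 the handlers print the raw request and
# response lines to stdout, which is what the 'debug' kwarg toggles.
from http.cookiejar import CookieJar
from urllib.request import build_opener, HTTPHandler, HTTPSHandler, HTTPCookieProcessor

def debug_opener(debug=False):
    level = 1 if debug else 0
    return build_opener(HTTPHandler(debuglevel=level),
                        HTTPSHandler(debuglevel=level),
                        HTTPCookieProcessor(CookieJar()))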
def getRegexParsed(regexs, url, cookieJar=None, forCookieJarOnly=False,
                   recursiveCall=False, cachedPages={}, rawPost=False,
                   cookie_jar_file=None):
    # 0,1,2 = URL, regexOnly, CookieJarOnly
    doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url)
    setresolved = True
    for k in doRegexs:
        if k in regexs:
            m = regexs[k]
            cookieJarParam = False
            if 'cookiejar' in m:
                # either create or reuse an existing jar
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    cookieJar = getRegexParsed(regexs, m['cookiejar'], cookieJar,
                                               True, True, cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            if cookieJarParam:
                if cookieJar is None:
                    cookie_jar_file = None
                    if 'open[' in m['cookiejar']:
                        cookie_jar_file = m['cookiejar'].split('open[')[1].split(']')[0]
                    cookieJar = getCookieJar(cookie_jar_file)
                    if cookie_jar_file:
                        saveCookieJar(cookieJar, cookie_jar_file)
                elif 'save[' in m['cookiejar']:
                    cookie_jar_file = m['cookiejar'].split('save[')[1].split(']')[0]
                    complete_path = os.path.join(profile, cookie_jar_file)
                    saveCookieJar(cookieJar, cookie_jar_file)

            if m['page'] and '$doregex' in m['page']:
                pg = getRegexParsed(regexs, m['page'], cookieJar,
                                    recursiveCall=True, cachedPages=cachedPages)
                if len(pg) == 0:
                    pg = 'http://regexfailed'
                m['page'] = pg

            if 'setcookie' in m and m['setcookie'] and '$doregex' in m['setcookie']:
                m['setcookie'] = getRegexParsed(regexs, m['setcookie'], cookieJar,
                                                recursiveCall=True, cachedPages=cachedPages)
            if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m['appendcookie']:
                m['appendcookie'] = getRegexParsed(regexs, m['appendcookie'], cookieJar,
                                                   recursiveCall=True, cachedPages=cachedPages)
            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs, m['post'], cookieJar,
                                           recursiveCall=True, cachedPages=cachedPages)
            if 'rawpost' in m and '$doregex' in m['rawpost']:
                m['rawpost'] = getRegexParsed(regexs, m['rawpost'], cookieJar,
                                              recursiveCall=True, cachedPages=cachedPages,
                                              rawPost=True)
            if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime$', getEpocTime())
            if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime2$', getEpocTime2())

            link = ''
            if m['page'] and m['page'] in cachedPages and 'ignorecache' not in m \
                    and forCookieJarOnly is False:
                link = cachedPages[m['page']]
            else:
                if m['page'] and not m['page'] == '' and m['page'].startswith('http'):
                    if '$epoctime$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime$', getEpocTime())
                    if '$epoctime2$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime2$', getEpocTime2())
                    page_split = m['page'].split('|')
                    pageUrl = page_split[0]
                    header_in_page = None
                    if len(page_split) > 1:
                        header_in_page = page_split[1]

                    # remember the environment proxies so they can be restored
                    current_proxies = ProxyHandler(getproxies())

                    req = Request(pageUrl)
                    if 'proxy' in m:
                        proxytouse = m['proxy']
                        if pageUrl[:5] == "https":
                            proxy = ProxyHandler({'https': proxytouse})
                        else:
                            proxy = ProxyHandler({'http': proxytouse})
                        opener = build_opener(proxy)
                        install_opener(opener)

                    req.add_header('User-Agent',
                                   'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1')
                    proxytouse = None

                    if 'referer' in m:
                        req.add_header('Referer', m['referer'])
                    if 'accept' in m:
                        req.add_header('Accept', m['accept'])
                    if 'agent' in m:
                        req.add_header('User-agent', m['agent'])
                    if 'x-req' in m:
                        req.add_header('X-Requested-With', m['x-req'])
                    if 'x-addr' in m:
                        req.add_header('x-addr', m['x-addr'])
                    if 'x-forward' in m:
                        req.add_header('X-Forwarded-For', m['x-forward'])
                    if 'setcookie' in m:
                        req.add_header('Cookie', m['setcookie'])
                    if 'appendcookie' in m:
                        # append cookies to the cookiejar
                        cookiestoApend = m['appendcookie']
                        cookiestoApend = cookiestoApend.split(';')
                        for h in cookiestoApend:
                            n, v = h.split('=')
                            w, n = n.split(':')
                            ck = cookielib.Cookie(version=0, name=n, value=v, port=None,
                                                  port_specified=False, domain=w,
                                                  domain_specified=False,
                                                  domain_initial_dot=False, path='/',
                                                  path_specified=True, secure=False,
                                                  expires=None, discard=True, comment=None,
                                                  comment_url=None, rest={'HttpOnly': None},
                                                  rfc2109=False)
                            cookieJar.set_cookie(ck)
                    if 'origin' in m:
                        req.add_header('Origin', m['origin'])
                    if header_in_page:
                        header_in_page = header_in_page.split('&')
                        for h in header_in_page:
                            n, v = h.split('=')
                            req.add_header(n, v)

                    if cookieJar is not None:
                        cookie_handler = HTTPCookieProcessor(cookieJar)
                        opener = build_opener(cookie_handler, HTTPBasicAuthHandler(),
                                              HTTPHandler())
                        opener = install_opener(opener)
                        if 'noredirect' in m:
                            opener = build_opener(cookie_handler, NoRedirection,
                                                  HTTPBasicAuthHandler(), HTTPHandler())
                            opener = install_opener(opener)
                    elif 'noredirect' in m:
                        opener = build_opener(NoRedirection, HTTPBasicAuthHandler(),
                                              HTTPHandler())
                        opener = install_opener(opener)

                    if 'connection' in m:
                        from keepalive import HTTPHandler
                        keepalive_handler = HTTPHandler()
                        opener = build_opener(keepalive_handler)
                        install_opener(opener)

                    post = None
                    if 'post' in m:
                        postData = m['post']
                        splitpost = postData.split(',')
                        post = {}
                        for p in splitpost:
                            n = p.split(':')[0]
                            v = p.split(':')[1]
                            post[n] = v
                        post = urlencode(post)
                    if 'rawpost' in m:
                        post = m['rawpost']

                    link = ''
                    try:
                        if post:
                            response = urlopen(req, post)
                        else:
                            response = urlopen(req)
                        if response.info().get('Content-Encoding') == 'gzip':
                            from StringIO import StringIO
                            import gzip
                            buf = StringIO(response.read())
                            f = gzip.GzipFile(fileobj=buf)
                            link = f.read()
                        else:
                            link = response.read()

                        if 'proxy' in m and current_proxies is not None:
                            install_opener(build_opener(current_proxies))

                        link = javascriptUnEscape(link)
                        if 'includeheaders' in m:
                            link += '$$HEADERS_START$$:'
                            for b in response.headers:
                                link += b + ':' + response.headers.get(b) + '\n'
                            link += '$$HEADERS_END$$:'
                        response.close()
                    except:
                        pass
                    cachedPages[m['page']] = link
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                elif m['page'] and not m['page'].startswith('http'):
                    if m['page'].startswith('$pyFunction:'):
                        val = doEval(m['page'].split('$pyFunction:')[1], '', cookieJar, m)
                        if forCookieJarOnly:
                            return cookieJar  # do nothing
                        link = val
                        link = javascriptUnEscape(link)
                    else:
                        link = m['page']

            if '$doregex' in m['expres']:
                m['expres'] = getRegexParsed(regexs, m['expres'], cookieJar,
                                             recursiveCall=True, cachedPages=cachedPages)
            if not m['expres'] == '':
                if '$LiveStreamCaptcha' in m['expres']:
                    val = askCaptcha(m, link, cookieJar)
                    url = url.replace("$doregex[" + k + "]", val)
                elif m['expres'].startswith('$pyFunction:') or '#$pyFunction' in m['expres']:
                    val = ''
                    if m['expres'].startswith('$pyFunction:'):
                        val = doEval(m['expres'].split('$pyFunction:')[1], link, cookieJar, m)
                    else:
                        val = doEvalFunction(m['expres'], link, cookieJar, m)
                    if 'ActivateWindow' in m['expres']:
                        return
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        return listrepeat, eval(val), m, regexs, cookieJar
                    try:
                        url = url.replace(u"$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]", val.decode("utf-8"))
                else:
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        ret = re.findall(m['expres'], link)
                        return listrepeat, ret, m, regexs
                    val = ''
                    if not link == '':
                        reg = re.compile(m['expres']).search(link)
                        try:
                            val = reg.group(1).strip()
                        except:
                            traceback.print_exc()
                    elif m['page'] == '' or m['page'] is None:
                        val = m['expres']
                    if rawPost:
                        val = quote_plus(val)
                    if 'htmlunescape' in m:
                        try:
                            from HTMLParser import HTMLParser
                        except ImportError:
                            from html.parser import HTMLParser
                        val = HTMLParser().unescape(val)
                    try:
                        url = url.replace("$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]", val.decode("utf-8"))
            else:
                url = url.replace("$doregex[" + k + "]", '')
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if '$epoctime2$' in url:
        url = url.replace('$epoctime2$', getEpocTime2())
    if '$GUID$' in url:
        import uuid
        url = url.replace('$GUID$', str(uuid.uuid1()).upper())
    if '$get_cookies$' in url:
        url = url.replace('$get_cookies$', getCookiesString(cookieJar))
    if recursiveCall:
        return url
    if url == "":
        return
    else:
        return url, setresolved
def __init__(self):
    """Initialize AdeConnectionUtil.

    Builds the site connector and the cookie handler."""
    self.CJ = CookieJar()
    self.connection = build_opener(HTTPCookieProcessor(self.CJ))
def _get_cookie(self, netloc, ua, timeout):
    class NoRedirection(HTTPErrorProcessor):
        def http_response(self, request, response):
            return response

    def parseJSString(s):
        try:
            offset = 1 if s[0] == '+' else 0
            val = int(eval(s.replace('!+[]', '1').replace('!![]', '1')
                           .replace('[]', '0').replace('(', 'str(')[offset:]))
            return val
        except:
            pass

    cookies = cookielib.LWPCookieJar()
    opener = build_opener(NoRedirection, HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-Agent', ua)]

    try:
        response = opener.open(netloc, timeout=int(timeout))
        result = response.read()
    except HTTPError as response:
        result = response.read()

    try:
        encoding = response.info().getheader('Content-Encoding')
    except:
        encoding = None
    if encoding == 'gzip':
        result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()

    jschl = re.compile('name="jschl_vc" value="(.+?)"/>').findall(result)[0]
    init = re.compile('setTimeout\(function\(\){\s*.*?.*:(.*?)};').findall(result)[0]
    builder = re.compile(r"challenge-form\'\);\s*(.*)a.v").findall(result)[0]

    if '/' in init:
        init = init.split('/')
        decryptVal = parseJSString(init[0]) / float(parseJSString(init[1]))
    else:
        decryptVal = parseJSString(init)

    lines = builder.split(';')
    for line in lines:
        if len(line) > 0 and '=' in line:
            sections = line.split('=')
            if '/' in sections[1]:
                subsecs = sections[1].split('/')
                line_val = parseJSString(subsecs[0]) / float(parseJSString(subsecs[1]))
            else:
                line_val = parseJSString(sections[1])
            decryptVal = float(eval('%.16f' % decryptVal + sections[0][-1] + '%.16f' % line_val))

    answer = float('%.10f' % decryptVal) + len(urlparse.urlparse(netloc).netloc)

    query = '%scdn-cgi/l/chk_jschl?jschl_vc=%s&jschl_answer=%s' % (netloc, jschl, answer)
    if 'type="hidden" name="pass"' in result:
        passval = re.findall('name="pass" value="(.*?)"', result)[0]
        query = '%scdn-cgi/l/chk_jschl?pass=%s&jschl_vc=%s&jschl_answer=%s' % (
            netloc, quote_plus(passval), jschl, answer)

    time.sleep(6)

    opener.addheaders = [('User-Agent', ua),
                         ('Referer', netloc),
                         ('Accept', 'text/html, application/xhtml+xml, application/xml, */*'),
                         ('Accept-Encoding', 'gzip, deflate')]
    response = opener.open(query)
    response.close()

    cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
    if 'cf_clearance' in cookie:
        self.cookie = cookie
def request(url, close=True, redirect=True, error=False, verify=True, proxy=None,
            post=None, headers=None, mobile=False, XHR=False, limit=None,
            referer=None, cookie=None, compression=True, output='', timeout='30'):
    try:
        if not url:
            return

        handlers = []

        if proxy is not None:
            handlers += [ProxyHandler({'http': '%s' % (proxy)}), HTTPHandler]
            opener = build_opener(*handlers)
            opener = install_opener(opener)

        if output == 'cookie' or output == 'extended' or not close is True:
            cookies = cookielib.LWPCookieJar()
            handlers += [HTTPHandler(), HTTPSHandler(), HTTPCookieProcessor(cookies)]
            opener = build_opener(*handlers)
            opener = install_opener(opener)

        try:
            import platform
            node = platform.node().lower()
            is_XBOX = platform.uname()[1] == 'XboxOne'
        except Exception:
            node = ''
            is_XBOX = False

        if verify is False and sys.version_info >= (2, 7, 12):
            try:
                import ssl
                ssl_context = ssl._create_unverified_context()
                handlers += [HTTPSHandler(context=ssl_context)]
                opener = build_opener(*handlers)
                opener = install_opener(opener)
            except:
                pass

        if verify is True and ((2, 7, 8) < sys.version_info < (2, 7, 12) or is_XBOX):
            try:
                import ssl
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                handlers += [HTTPSHandler(context=ssl_context)]
                opener = build_opener(*handlers)
                opener = install_opener(opener)
            except:
                pass

        if url.startswith('//'):
            url = 'http:' + url

        _headers = {}
        try:
            _headers.update(headers)
        except:
            pass

        if 'User-Agent' in _headers:
            pass
        elif mobile is True:
            _headers['User-Agent'] = Database.get(randommobileagent, 1)
        else:
            _headers['User-Agent'] = Database.get(randomagent, 1)

        if 'Referer' in _headers:
            pass
        elif referer is not None:
            _headers['Referer'] = referer

        if 'Accept-Language' not in _headers:
            _headers['Accept-Language'] = 'en-US'

        if 'X-Requested-With' in _headers:
            pass
        elif XHR is True:
            _headers['X-Requested-With'] = 'XMLHttpRequest'

        if 'Cookie' in _headers:
            pass
        elif cookie is not None:
            _headers['Cookie'] = cookie

        if 'Accept-Encoding' in _headers:
            pass
        elif compression and limit is None:
            _headers['Accept-Encoding'] = 'gzip'

        if redirect is False:
            class NoRedirectHandler(urllib2.HTTPRedirectHandler):
                def http_error_302(self, req, fp, code, msg, headers):
                    infourl = urllib.addinfourl(fp, headers, req.get_full_url())
                    infourl.status = code
                    infourl.code = code
                    return infourl

                http_error_300 = http_error_302
                http_error_301 = http_error_302
                http_error_303 = http_error_302
                http_error_307 = http_error_302

            opener = urllib2.build_opener(NoRedirectHandler())
            opener = install_opener(opener)

            try:
                del _headers['Referer']
            except:
                pass

        if isinstance(post, dict):
            post = utils.byteify(post)
            post = urlencode(post)

        url = utils.byteify(url)

        request = Request(url, data=post)
        _add_request_header(request, _headers)

        try:
            response = urlopen(request, timeout=int(timeout))
        except HTTPError as response:
            if response.code == 503:
                cf_result = response.read(5242880)
                try:
                    encoding = response.info().getheader('Content-Encoding')
                except:
                    encoding = None
                if encoding == 'gzip':
                    cf_result = gzip.GzipFile(fileobj=StringIO(cf_result)).read()

                if 'cf-browser-verification' in cf_result:
                    while 'cf-browser-verification' in cf_result:
                        netloc = '%s://%s/' % (urlparse(url).scheme, urlparse(url).netloc)
                        ua = _headers['User-Agent']
                        cf = Database.get(cfcookie().get, 1, netloc, ua, timeout)
                        _headers['Cookie'] = cf
                        request = Request(url, data=post)
                        _add_request_header(request, _headers)
                        try:
                            response = urlopen(request, timeout=int(timeout))
                            cf_result = 'Success'
                        except HTTPError as response:
                            Database.remove(cfcookie().get, netloc, ua, timeout)
                            cf_result = response.read()
                else:
                    controlo.log('Request-Error (%s): %s' % (str(response.code), url))
                    if error is False:
                        return
            else:
                controlo.log('Request-Error (%s): %s' % (str(response.code), url))
                if error is False:
                    return

        if output == 'cookie':
            try:
                result = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                result = cf
            except:
                pass
            if close is True:
                response.close()
            return result
        elif output == 'geturl':
            result = response.geturl()
            if close is True:
                response.close()
            return result
        elif output == 'headers':
            result = response.headers
            if close is True:
                response.close()
            return result
        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)
            if content < (2048 * 1024):
                return
            result = response.read(16 * 1024)
            if close is True:
                response.close()
            return result
        elif output == 'file_size':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = '0'
            response.close()
            return content

        if limit == '0':
            result = response.read(224 * 1024)
        elif limit is not None:
            result = response.read(int(limit) * 1024)
        else:
            result = response.read(5242880)

        try:
            encoding = response.info().getheader('Content-Encoding')
        except:
            encoding = None
        if encoding == 'gzip':
            result = gzip.GzipFile(fileobj=StringIO(result)).read()

        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)
            _headers['Cookie'] = su
            request = Request(url, data=post)
            _add_request_header(request, _headers)
            response = urlopen(request, timeout=int(timeout))
            if limit == '0':
                result = response.read(224 * 1024)
            elif limit is not None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)
            try:
                encoding = response.info().getheader('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO(result)).read()

        if 'Blazingfast.io' in result and 'xhr.open' in result:
            netloc = '%s://%s' % (urlparse(url).scheme, urlparse(url).netloc)
            ua = _headers['User-Agent']
            _headers['Cookie'] = Database.get(bfcookie().get, 168, netloc, ua, timeout)
            result = _basic_request(url, headers=_headers, post=post,
                                    timeout=timeout, limit=limit)

        if output == 'extended':
            try:
                response_headers = dict([(item[0].title(), item[1])
                                         for item in response.info().items()])
            except:
                response_headers = response.headers
            response_code = str(response.code)
            try:
                cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                cookie = cf
            except:
                pass
            if close is True:
                response.close()
            return (result, response_code, response_headers, _headers, cookie)
        else:
            if close is True:
                response.close()
            return result
    except Exception as e:
        controlo.log('Request-Error: (%s) => %s' % (str(e), url))
        return
def crawl_author():
    """Crawls Google Scholar in order to retrieve information about an author."""
    # The ID of the author in Google Scholar.
    scholar_id = request.form['scholar_id']
    print 'Crawl author ' + scholar_id + '.'

    # Retrieve the author with that ID (if any).
    author = Author.query.filter_by(scholar_id=scholar_id).first()
    if author is None:
        author = Author()

    cookie_jar = CookieJar()
    opener = build_opener(HTTPCookieProcessor(cookie_jar))
    install_opener(opener)

    url = 'https://scholar.google.com/citations'
    params = urlencode({'hl': 'en', 'view_op': 'list_works', 'sortby': 'pubdate',
                        'user': scholar_id, 'cstart': 0, 'pagesize': 20})
    req = Request(url + '?' + params)
    res = opener.open(req)
    doc = html.parse(res)

    no_content = doc.xpath('.//div[contains(text(), "Sorry, no content found for this URL")]')
    if len(no_content):
        print 'Author ' + scholar_id + ' not found.'
        return 'Done.'

    author.scholar_id = scholar_id

    nname = doc.find('.//div[@id="gsc_prf_in"]')
    if nname is not None:
        # The name of the author.
        author.name = nname.text_content()

    nemaildomain = doc.find('.//div[@id="gsc_prf_ivh"]')
    if nemaildomain is not None:
        # The domain where the author has an email.
        author.email_domain = nemaildomain.text_content().split(" - ")[0].split()[-1]

    ncitations = doc.find('.//table[@id="gsc_rsb_st"]')
    if ncitations is not None:
        # The total citations for the author.
        author.total_citations = ncitations.xpath('.//tr[2]/td')[1].text
        # The h-index for the author.
        author.h_index = ncitations.xpath('.//tr[3]/td')[1].text
        # The i10-index for the author.
        author.i10_index = ncitations.xpath('.//tr[4]/td')[1].text

    params = urlencode({'hl': 'en', 'view_op': 'citations_histogram', 'user': scholar_id})
    req = Request(url + '?' + params)
    res = opener.open(req)
    doc = html.parse(res)

    # The citations per year for the author.
    author_citations_per_year = []
    nhistogram = doc.find('.//div[@id="gsc_md_hist_b"]')
    if nhistogram is not None:
        years = [x.text for x in nhistogram.xpath('.//span[@class="gsc_g_t"]')]
        for a in nhistogram.xpath('.//a[@class="gsc_g_a"]'):
            i = int(a.get('style').split('z-index:')[1])
            year = int(years[-i])
            citations_per_year = AuthorCitationsPerYear.query.filter_by(
                author_id=author.id, year=year).first()
            if citations_per_year is None:
                citations_per_year = AuthorCitationsPerYear()
            citations_per_year.year = int(years[-i])
            citations_per_year.citations = int(a.xpath('./span[@class="gsc_g_al"]')[0].text)
            author_citations_per_year.append(citations_per_year)
    author.citations_per_year = author_citations_per_year

    params = urlencode({'hl': 'en', 'view_op': 'list_colleagues', 'user': scholar_id})
    req = Request(url + '?' + params)
    res = opener.open(req)
    doc = html.parse(res)

    # The co-authors of the author.
    author_coauthors = []
    for a in doc.xpath('.//h3[@class="gsc_1usr_name"]//a'):
        co_scholar_id = a.get('href').split('user=')[1].split('&hl')[0]
        coauthor = Author.query.filter_by(scholar_id=co_scholar_id).first()
        if coauthor is None:
            coauthor = Author()
        coauthor.scholar_id = co_scholar_id
        author_coauthors.append(coauthor)
    author.coauthors = author_coauthors

    # The publications.
    author_publications = []
    cstart = 0
    pagesize = 100
    while True:
        params = urlencode({'hl': 'en', 'view_op': 'list_works', 'sortby': 'pubdate',
                            'user': scholar_id, 'cstart': cstart, 'pagesize': pagesize})
        req = Request(url + '?' + params)
        res = opener.open(req)
        doc = html.parse(res)

        for tr in doc.xpath('.//tr[@class="gsc_a_tr"]'):
            a = tr.find('.//td[@class="gsc_a_t"]//a')
            # NOTE: When there are no publications, there is a single tr.
            # <tr class="gsc_a_tr"><td class="gsc_a_e" colspan="3">There are no articles in this profile.</td></tr>
            if a is None:
                continue
            purl = a.get('href')
            # The ID of the publication in Google Scholar.
            pub_scholar_id = purl.split('citation_for_view=')[1]
            # Retrieve the publication with that ID (if any).
            publication = Publication.query.filter_by(scholar_id=pub_scholar_id).first()
            if publication is None:
                publication = Publication()
            publication.scholar_id = pub_scholar_id
            # The title of the publication.
            publication.title = a.text_content()
            pub_nyear = tr.find('.//td[@class="gsc_a_y"]//span')
            if pub_nyear is not None:
                year_of_publication = pub_nyear.text_content().strip()
                if year_of_publication:
                    # The year of the publication.
                    publication.year_of_publication = int(year_of_publication)
            pub_ncitations = tr.find('.//a[@class="gsc_a_ac"]')
            if pub_ncitations is not None:
                total_citations = pub_ncitations.text_content().strip()
                if total_citations:
                    # The total citations for the publication.
                    publication.total_citations = int(total_citations)
            author_publications.append(publication)

        if doc.xpath('.//button[@id="gsc_bpf_next"]')[0].get("disabled"):
            break
        cstart += 100
    author.publications = author_publications

    # When information about the author was retrieved from Google Scholar.
    author.retrieved_at = datetime.datetime.now()
    db.session.add(author)
    db.session.commit()
    print 'Crawled author ' + scholar_id + '.'
    return 'Done.'
def crawl_publication():
    """Crawls Google Scholar in order to retrieve information about a publication."""
    # The ID of the publication in Google Scholar.
    scholar_id = request.form['scholar_id']
    print 'Crawl publication ' + scholar_id + '.'

    # Retrieve the publication with that ID (if any).
    publication = Publication.query.filter_by(scholar_id=scholar_id).first()
    if publication is None:
        publication = Publication()

    cookie_jar = CookieJar()
    opener = build_opener(HTTPCookieProcessor(cookie_jar))
    install_opener(opener)

    url = 'https://scholar.google.com/citations'
    params = urlencode({'hl': 'en', 'view_op': 'view_citation',
                        'citation_for_view': scholar_id})
    req = Request(url + '?' + params)
    res = opener.open(req)
    doc = html.parse(res)

    publication.scholar_id = scholar_id

    ntitle = doc.find('.//a[@class="gsc_title_link"]')
    if ntitle is not None:
        # The title of the publication.
        publication.title = ntitle.text_content()

    ntype = doc.find('.//div[@class="gs_scl"][3]//div[@class="gsc_field"]')
    if ntype is not None:
        # The type of the publication.
        publication.type = ntype.text_content()
        if publication.type == 'Description':
            publication.type = 'Other'

    nyear = doc.xpath('.//div[text()="Publication date"]/ancestor::div[@class="gs_scl"]//div[@class="gsc_value"]')
    if nyear is not None and len(nyear):
        # The year of the publication.
        publication.year_of_publication = int(nyear[0].text.split('/')[0])

    ncitations = doc.xpath('.//div[text()="Total citations"]/ancestor::div[@class="gs_scl"]//div[@class="gsc_value"]//a')
    if ncitations is not None and len(ncitations):
        # The total citations for the publication.
        publication.total_citations = ncitations[0].text.split(' ')[-1]

    nauthors = doc.xpath('.//div[text()="Authors"]/ancestor::div[@class="gs_scl"]//div[@class="gsc_value"]')
    if nauthors is not None and len(nauthors):
        # The authors of the publication.
        publication.author_names = nauthors[0].text

    # The citations per year for the publication.
    publication_citations_per_year = []
    nhistogram = doc.find('.//div[@id="gsc_graph_bars"]')
    if nhistogram is not None:
        years = [x.text for x in nhistogram.xpath('.//span[@class="gsc_g_t"]')]
        for a in nhistogram.xpath('.//a[@class="gsc_g_a"]'):
            i = int(a.get('style').split('z-index:')[1])
            year = int(years[-i])
            citations_per_year = PublicationCitationsPerYear.query.filter_by(
                publication_id=publication.id, year=year).first()
            if citations_per_year is None:
                citations_per_year = PublicationCitationsPerYear()
            citations_per_year.year = int(years[-i])
            citations_per_year.citations = int(a.xpath('./span[@class="gsc_g_al"]')[0].text)
            publication_citations_per_year.append(citations_per_year)
    publication.citations_per_year = publication_citations_per_year

    # When information about the publication was retrieved from Google Scholar.
    publication.retrieved_at = datetime.datetime.now()
    db.session.add(publication)
    db.session.commit()
    print 'Crawled publication ' + scholar_id + '.'
    return 'Done.'
info = rootLogger.info
warn = rootLogger.warning
debug = rootLogger.debug
error = rootLogger.error
log_exception = rootLogger.exception

# filepath constants
GAME_STORAGE_DIR = r'.'
COOKIES_FILENAME = r'gog-cookies.dat'
MANIFEST_FILENAME = r'gog-manifest.dat'
SERIAL_FILENAME = r'!serial.txt'
INFO_FILENAME = r'!info.txt'

# global web utilities
global_cookies = cookiejar.LWPCookieJar(COOKIES_FILENAME)
cookieproc = HTTPCookieProcessor(global_cookies)
opener = build_opener(cookieproc)
treebuilder = html5lib.treebuilders.getTreeBuilder('etree')
parser = html5lib.HTMLParser(tree=treebuilder, namespaceHTMLElements=False)

# GOG URLs
GOG_HOME_URL = r'https://www.gog.com'
GOG_ACCOUNT_URL = r'https://www.gog.com/account'
GOG_LOGIN_URL = r'https://login.gog.com/login_check'

# GOG Constants
GOG_MEDIA_TYPE_GAME = '1'
GOG_MEDIA_TYPE_MOVIE = '2'

# HTTP request settings
HTTP_FETCH_DELAY = 1  # in seconds
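
# --- Hedged sketch: persisting global_cookies between runs. ---
# LWPCookieJar was constructed above with COOKIES_FILENAME as its default
# file, so load()/save() need no filename argument. The helper names here
# are illustrative, not part of the original script.
def load_cookies():
    try:
        global_cookies.load(ignore_discard=True)
    except IOError:
        pass  # first run: no cookie file yet

def save_cookies():
    global_cookies.save(ignore_discard=True)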
def __init__(self, user_agent=DEFAULT_USERAGENT, timeout=DEFAULT_TIMEOUT):
    self.cj = CookieJar()
    self.opener = build_opener(HTTPCookieProcessor(self.cj))
    self.urlopen = self.opener.open
    self.user_agent = user_agent
    self.timeout = timeout
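
# --- Hedged sketch: a fetch helper consistent with the attributes set in
# __init__ above. The method name and the Python 3 import are assumptions.
def fetch(self, url):
    from urllib.request import Request
    req = Request(url, headers={'User-Agent': self.user_agent})
    # self.urlopen is the bound opener.open, so cookies persist across calls.
    return self.urlopen(req, timeout=self.timeout).read()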
def _request(url, headers, post, cookies):
    log(url)
    url = quote_plus(url, safe='%/:?=&')
    if post:
        if sys.version_info[0] >= 3:  # for Python 3
            post = post.encode('utf-8')
        req = Request(url, post)
        log('########POST!')
    else:
        req = Request(url)
    if headers:
        for key in headers:
            req.add_header(key, headers[key])
        # req.add_header('Content-Type', 'application/json')
        req.has_header = lambda header_name: (
            True if header_name == 'Content-Length'
            else Request.has_header(req, header_name))
    else:
        req.add_header('User-Agent',
                       'Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0')
    req.add_header('Accept-Encoding', 'gzip, deflate')
    if cookies:
        cj = CookieJar()
        log(pathUserdata(''))
        log(pathUserdata('cookies.txt'))
        if not f_exists(pathUserdata('')):
            f_mkdir(pathUserdata(''))
        if f_exists(pathUserdata('cookies.txt')):
            cookies_txt = f_open(pathUserdata('cookies.txt'))
            if cookies_txt:
                if sys.version_info[0] >= 3:  # for Python 3
                    if isinstance(cookies_txt, str):
                        cookies_txt = cookies_txt.encode('utf-8')
                c = pickle.loads(cookies_txt)
                for cookie in c:
                    cj.set_cookie(cookie)
        opener = build_opener(HTTPCookieProcessor(cj))
        response = opener.open(req)
        c = []
        for cookie in cj:
            log(str(cookie))
            c.append(cookie)
        log(str(cj))
        f_write(pathUserdata('cookies.txt'), pickle.dumps(c))
        # cj.save(cookiefile)
    else:
        response = urlopen(req)
    compressed = response.info().get('Content-Encoding') == 'gzip'
    link = response.read()
    response.close()
    if compressed:
        if sys.version_info[0] < 3:
            buf = StringIO(link)
        else:
            buf = BytesIO(link)
        f = gzip.GzipFile(fileobj=buf)
        link = f.read()
    if sys.version_info[0] >= 3:  # for Python 3
        link = link.decode('utf-8')
    return link
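
# --- Hedged sketch: an alternative to pickling cookies by hand as above. ---
# http.cookiejar's file-backed jars already implement load()/save(); this is
# a standalone sketch, not the addon's actual persistence code.
from http.cookiejar import LWPCookieJar

def make_cookie_opener(path='cookies.lwp'):
    from urllib.request import build_opener, HTTPCookieProcessor
    cj = LWPCookieJar(path)
    try:
        cj.load(ignore_discard=True)
    except (IOError, OSError):
        pass  # no saved cookies yet
    opener = build_opener(HTTPCookieProcessor(cj))
    return opener, cj
# After a request: cj.save(ignore_discard=True)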
def __get_http_opener(self):
    """Return an opener instance suitable for talking to the site over
    https, with client-key and cookie support."""
    return build_opener(HTTPHandler(debuglevel=self.DEBUG_LEVEL),
                        HTTPCookieProcessor(self._cookiejar))
def start(args):
    """Login and session handler."""
    # create cookiejar
    args._cj = LWPCookieJar()

    # let urllib handle cookies
    opener = build_opener(HTTPCookieProcessor(args._cj))
    opener.addheaders = [
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"),
        ("Accept-Encoding", "identity"),
        ("Accept", "*/*"),
        ("Content-Type", "application/x-www-form-urlencoded"),
        ("DNT", "1")]
    install_opener(opener)

    # load cookies
    try:
        args._cj.load(getCookiePath(args), ignore_discard=True)
    except IOError:
        # cookie file does not exist
        pass

    # get login information
    username = args._addon.getSetting("crunchyroll_username")
    password = args._addon.getSetting("crunchyroll_password")

    # session management
    if not (args._session_id and args._auth_token):
        # create new session
        payload = {"device_id": args._device_id,
                   "device_type": API.DEVICE,
                   "access_token": API.TOKEN}
        req = request(args, "start_session", payload, True)

        # check for error
        if req["error"]:
            return False
        args._session_id = req["data"]["session_id"]

        # make login
        payload = {"password": password,
                   "account": username}
        req = request(args, "login", payload, True)

        # check for error
        if req["error"]:
            return False
        args._auth_token = req["data"]["auth"]

    if getattr(args, "_session_restart", False):
        # restart session
        payload = {"device_id": args._device_id,
                   "device_type": API.DEVICE,
                   "access_token": API.TOKEN,
                   "auth": args._auth_token}
        req = request(args, "start_session", payload, True)

        # check for error
        if req["error"]:
            destroy(args)
            return False
        args._session_id = req["data"]["session_id"]
        args._auth_token = req["data"]["auth"]
        args._session_restart = False

    return True
def _request(self, chunk=None, info_request=False):
    """Do the request. Used for fetching information and for fetching data.

    chunk -- specifies which range (part) should be loaded.
    info_request -- specifies if only information should be fetched.
    """
    if self._response is not None:
        return self._response
    if self.url_parts.scheme == 'http':
        max_redirects = 0
        if info_request:
            # allow redirects only for info-requests
            max_redirects = self.source.max_redirects
        req = Request(self.url)
        cookie_processor = HTTPCookieProcessor()
        if self.source.cookie_objects is not None:
            # Use the cookies which were received by previous (info-)requests.
            for cookie in self.source.cookie_objects:
                cookie_processor.cookiejar.set_cookie(cookie)
        elif len(self.source.cookies) > 0 and info_request:
            # This is the first (info-)request where cookies are used.
            # Use user-defined cookies.
            fcres = FakeCookieResponse(self.source.cookies, self.url)
            cookie_processor.cookiejar.extract_cookies(fcres, req)
        if self.source.referrer != '':
            req.add_header('Referer', self.source.referrer)
        if self.source.user_agent != '':
            req.add_header('User-Agent', self.source.user_agent)
        if chunk is not None:
            start_offset = chunk.offset + chunk.loaded
            req.add_header('Range', 'bytes=' + str(start_offset) + '-')
        opener = build_opener(_LimitedHTTPRedirectHandler(max_redirects),
                              cookie_processor)
        self._response = opener.open(req, timeout=self.source.timeout)
        if self.source.cookie_objects is None:
            # save cookie objects for later use (e.g. DataSlots)
            cookie_objects = []
            for cookie in cookie_processor.cookiejar:
                cookie_objects.append(cookie)
            self.source.cookie_objects = cookie_objects
        return self._response
    elif self.url_parts.scheme == 'ftp':
        req = Request(self.url)
        if chunk is not None:
            start_offset = chunk.offset + chunk.loaded
            req.add_header('Offset', str(start_offset))
        opener = build_opener(FTPChunkHandler())
        self._response = opener.open(req, timeout=self.source.timeout)
        return self._response
    else:
        raise URLError('The protocol is not supported.')
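
# --- Hedged sketch: the Range mechanics used by _request above, standalone. ---
# Resumes a download from a byte offset; the server must answer 206 Partial
# Content for the offset to take effect. URL, offset, and timeout are
# illustrative values, not taken from the original class.
from urllib.request import Request, build_opener

def read_from_offset(url, start_offset):
    req = Request(url)
    req.add_header('Range', 'bytes=%d-' % start_offset)
    resp = build_opener().open(req, timeout=30)
    # resp.status == 206 means the server honoured the range request.
    return resp.read()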
# content = "<p><b>Cookie</b></p><br />" + cookie + "<br />"
#
# # Write the response to a new HTML page
# with open('case06.html', 'w') as fileWriter:
#     fileWriter.write(content)
# url = "case06.html"
#
# # Open the response page in a new tab
# webbrowser.get('firefox').open_new_tab(url)

# Imports
from urllib2 import Request, build_opener, HTTPCookieProcessor, HTTPHandler
import cookielib

# CookieJar object to hold the cookies
cJar = cookielib.CookieJar()

# Open page
opener = build_opener(HTTPCookieProcessor(cJar), HTTPHandler())

# Request
req = Request("https://www.wsb.com/Assignment2/case06.php")
res = opener.open(req)

# Check out the cookies
print "Cookie\n"
for cookie in cJar:
    print cookie
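
# --- Hedged sketch: the same cookie dump on Python 3, since the snippet
# above is Python 2 only (urllib2, print statements). Same target URL.
from http.cookiejar import CookieJar
from urllib.request import Request, build_opener, HTTPCookieProcessor, HTTPHandler

jar = CookieJar()
opener3 = build_opener(HTTPCookieProcessor(jar), HTTPHandler())
res3 = opener3.open(Request("https://www.wsb.com/Assignment2/case06.php"))
print("Cookie\n")
for ck in jar:
    print(ck)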
from urllib2 import HTTPCookieProcessor, build_opener
from cookielib import CookieJar, MozillaCookieJar
from redis_test import Redis

# 1. build a cookie jar backed by a file name
# 2. create a cookie handler
# 3. build an opener
fileName = 'cookie.txt'
cookie = MozillaCookieJar(fileName)
handler = HTTPCookieProcessor(cookie)
opener = build_opener(handler)

response = opener.open("http://www.baidu.com")
for item in cookie:
    print 'Name = ' + item.name
    print 'Value = ' + item.value
cookie.save(ignore_discard=True, ignore_expires=True)
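
# --- Hedged sketch: reloading the jar saved above on a later run. ---
# MozillaCookieJar remembers the filename it was constructed with, so load()
# needs no argument; the mirrored ignore_* flags keep the session cookies
# that save() kept.
cookie2 = MozillaCookieJar(fileName)
cookie2.load(ignore_discard=True, ignore_expires=True)
opener2 = build_opener(HTTPCookieProcessor(cookie2))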
def main():
    """Initializes and executes the program."""
    login_sucessful = []
    login_failed = []
    login_skipped = []

    version = check_revision(VERSION)
    print("%s\n\n%s %s (%s)\n" % (BANNER % tuple([color(_) for _ in BANNER_PASSWORDS]),
                                  NAME, version, URL))

    args = parse_args()

    if args.update:
        update()
        exit()

    sites = list_sites()
    if args.list:
        for _ in sites:
            print("- %s" % _)
        exit()

    if not args.password and not args.load_file:
        args.password = getpass("%s Please enter password: " % INFO)

    if args.proxy:
        match = re.search(r"(?P<type>[^:]+)://(?P<address>[^:]+)"
                          r":(?P<port>\d+)", args.proxy, re.I)
        if match:
            if match.group("type").upper() in ("HTTP", "HTTPS"):
                proxy_host = "%s:%s" % (match.group("address"), match.group("port"))
                proxy_handler = ProxyHandler({"http": proxy_host,
                                              "https": proxy_host})
            else:
                from thirdparty.socks import socks
                if match.group("type").upper() == "SOCKS4":
                    socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS4,
                                          match.group("address"),
                                          int(match.group("port")), True)
                elif match.group("type").upper() == "SOCKS5":
                    socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5,
                                          match.group("address"),
                                          int(match.group("port")), True)
                proxy_handler = None
        else:
            proxy_handler = ProxyHandler()
    else:
        proxy_handler = None

    opener = build_opener(HTTPHandler(), HTTPSHandler(),
                          HTTPCookieProcessor(cookie_handler))
    if proxy_handler:
        opener.add_handler(proxy_handler)
    install_opener(opener)

    with open(USER_AGENTS_FILE, 'r') as ua_file:
        args.user_agent = sample(ua_file.readlines(), 1)[0].strip()

    if args.only:
        sites = [site for site in sites if site in args.only]
    elif args.exclude:
        sites = [site for site in sites if site not in args.exclude]

    print("%s Loaded %d %s to test." % (INFO, len(sites),
                                        "site" if len(sites) == 1 else "sites"))

    if args.load_file:
        if not isfile(args.load_file):
            print("%s could not find the file \"%s\"" % (WARN, color(args.load_file)))
            exit()
        _ = sum(1 for line in open(args.load_file, "r"))
        if _ < 1:
            print("%s the file \"%s\" doesn't contain any valid credentials." %
                  (WARN, color(args.load_file)))
            exit()
        print("%s Loaded %d credential%s from \"%s\".\n" %
              (INFO, _, "s" if _ != 1 else "", color(args.load_file)))

    print("%s Starting tests at: \"%s\"\n" % (INFO, color(strftime("%X"), BW)))

    if not exists(OUTPUT_DIR):
        makedirs(OUTPUT_DIR)

    log = Logger("%s/credmap" % OUTPUT_DIR)
    log.open()

    def get_targets():
        """Retrieve and yield list of sites (targets) for testing."""
        for site in sites:
            _ = populate_site(site, args)
            if not _:
                continue
            target = Website(_, {"verbose": args.verbose})
            if not target.user_agent:
                target.user_agent = args.user_agent
            yield target

    def login():
        """Verify credentials for login and check if login was successful."""
        if (target.username_or_email == "email" and not credentials["email"] or
                target.username_or_email == "username" and not credentials["username"]):
            if args.verbose:
                print("%s Skipping %s\"%s\" since no \"%s\" was specified.\n" %
                      (INFO, "[%s:%s] on " % (credentials["username"] or
                                              credentials["email"],
                                              credentials["password"])
                       if args.load_file else "",
                       color(target.name), color(target.username_or_email, BW)))
            login_skipped.append(target.name)
            return

        print("%s Testing %s\"%s\"..." %
              (TEST, "[%s:%s] on " % (credentials["username"] or credentials["email"],
                                      credentials["password"]) if args.load_file else "",
               color(target.name, BW)))

        cookie_handler.clear()
        if target.perform_login(credentials, cookie_handler):
            log.write(">>> %s - %s:%s\n" % (target.name,
                                            credentials["username"] or
                                            credentials["email"],
                                            credentials["password"]))
            login_sucessful.append("%s%s" %
                                   (target.name, " [%s:%s]" %
                                    (credentials["username"] or credentials["email"],
                                     credentials["password"])
                                    if args.load_file else ""))
        else:
            login_failed.append(target.name)

    if args.load_file:
        if args.cred_format:
            separators = [re.escape(args.cred_format[1]),
                          re.escape(args.cred_format[3])
                          if len(args.cred_format) > 3 else "\n"]
            cred_format = re.match(r"(u|e|p)[^upe](u|e|p)(?:[^upe](u|e|p))?",
                                   args.cred_format)
            if not cred_format:
                print("%s Could not parse --format: \"%s\"" %
                      (ERROR, color(args.cred_format, BW)))
                exit()
            cred_format = [v.replace("e", "email").replace("u", "username")
                           .replace("p", "password")
                           for v in cred_format.groups() if v is not None]

        with open(args.load_file, "r") as load_list:
            for user in load_list:
                if args.cred_format:
                    match = re.match(r"([^{0}]+){0}([^{1}]+)(?:{1}([^\n]+))?".format(
                        separators[0], separators[1]), user)
                    credentials = dict(zip(cred_format, match.groups()))
                    credentials["password"] = quote(credentials["password"])
                    if ("email" in credentials and
                            not re.match(r"^[A-Za-z0-9._%+-]+@(?:[A-Z"
                                         r"a-z0-9-]+\.)+[A-Za-z]{2,12}$",
                                         credentials["email"])):
                        print("%s Specified e-mail \"%s\" does not appear "
                              "to be correct. Skipping...\n" %
                              (WARN, color(credentials["email"], BW)))
                        continue
                    if "email" not in credentials:
                        credentials["email"] = None
                    elif "username" not in credentials:
                        credentials["username"] = None
                else:
                    user = user.rstrip().split(":", 1)
                    if not user[0]:
                        if args.verbose:
                            print("%s Could not parse credentials: \"%s\"\n" %
                                  (WARN, color(user, BW)))
                        continue
                    match = re.match(r"^[A-Za-z0-9._%+-]+@(?:[A-Z"
                                     r"a-z0-9-]+\.)+[A-Za-z]{2,12}$", user[0])
                    credentials = {"email": user[0] if match else None,
                                   "username": None if match else user[0],
                                   "password": quote(user[1])}
                for target in get_targets():
                    login()
    else:
        credentials = {"username": args.username,
                       "email": args.email,
                       "password": quote(args.password)}
        for target in get_targets():
            login()

    log.close()

    if not args.verbose:
        print()

    if len(login_sucessful) > 0 or len(login_failed) > 0:
        _ = "%s/%s" % (color(len(login_sucessful), BW),
                       color(len(login_sucessful) + len(login_failed), BW))
        sign = PLUS if len(login_sucessful) > (len(login_failed) +
                                               len(login_skipped)) else INFO
        print("%s Successfully logged in%s." %
              (sign, " with %s credentials on the list." % _
               if args.load_file else "to %s websites." % _))
        print("%s An overall success rate of %s.\n" %
              (sign, color("%%%s" % (100 * len(login_sucessful) /
                                     (len(login_sucessful) + len(login_failed))), BW)))

    if len(login_sucessful) > 0:
        print("%s The provided credentials worked on the following website%s: "
              "%s\n" % (PLUS, "s" if len(login_sucessful) != 1 else "",
                        ", ".join(login_sucessful)))

    print("%s Finished tests at: \"%s\"\n" % (INFO, color(strftime("%X"), BW)))
def get_cookie(self, netloc, ua, timeout):
    try:
        headers = {'User-Agent': ua}
        request = Request(netloc)
        _add_request_header(request, headers)
        try:
            response = urlopen(request, timeout=int(timeout))
        except HTTPError as response:
            result = response.read(5242880)
            try:
                encoding = response.info().getheader('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO(result)).read()

        jschl = re.findall('name="jschl_vc" value="(.+?)"/>', result)[0]
        init = re.findall('setTimeout\(function\(\){\s*.*?.*:(.*?)};', result)[-1]
        builder = re.findall(r"challenge-form\'\);\s*(.*)a.v", result)[0]

        decryptVal = self.parseJSString(init)
        lines = builder.split(';')
        for line in lines:
            if len(line) > 0 and '=' in line:
                sections = line.split('=')
                line_val = self.parseJSString(sections[1])
                decryptVal = int(eval(str(decryptVal) + sections[0][-1] + str(line_val)))

        answer = decryptVal + len(urlparse(netloc).netloc)

        query = '%s/cdn-cgi/l/chk_jschl?jschl_vc=%s&jschl_answer=%s' % (
            netloc, jschl, answer)
        if 'type="hidden" name="pass"' in result:
            passval = re.findall('name="pass" value="(.*?)"', result)[0]
            query = '%s/cdn-cgi/l/chk_jschl?pass=%s&jschl_vc=%s&jschl_answer=%s' % (
                netloc, quote_plus(passval), jschl, answer)

        time.sleep(6)

        cookies = cookielib.LWPCookieJar()
        handlers = [HTTPHandler(), HTTPSHandler(), HTTPCookieProcessor(cookies)]
        opener = build_opener(*handlers)
        opener = install_opener(opener)

        try:
            request = Request(query)
            _add_request_header(request, headers)
            response = urlopen(request, timeout=int(timeout))
        except:
            pass

        cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
        if 'cf_clearance' in cookie:
            self.cookie = cookie
    except:
        pass
except:  # in case of Python 3.x
    import urllib.request as urllib2
    from urllib.request import HTTPRedirectHandler
    from urllib.request import HTTPCookieProcessor


class MyHTTPRedirectHandler(HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        return HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)

    # Treat all other redirect codes the same way as 302
    http_error_301 = http_error_303 = http_error_307 = http_error_302


cookieprocessor = HTTPCookieProcessor()

opener = urllib2.build_opener(MyHTTPRedirectHandler, cookieprocessor)
urllib2.install_opener(opener)


class rss(Scraper):
    '''
    Reads a generic RSS feed and proceeds only with items that are not already
    in the collection. Retrieves the full HTML content from the link provided
    in each RSS entry.

    Yields docs with keys from the RSS entry plus the full HTML source of the
    linked content.

    Subclasses will usually want to override the following:
    - parsehtml, so that more keys can be extracted
    - getlink, so the link can be modified before fetching, e.g. to bypass
      cookie walls
    '''
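# A hedged illustration of the override pattern the docstring describes. The
# method signatures are assumptions (only the method names appear above):
class cookiewall_rss(rss):
    def getlink(self, link):
        # hypothetical: some outlets expose the article without a cookie wall
        # under an alternate query parameter
        return link + '?printview=1'

    def parsehtml(self, htmlsource):
        # hypothetical: extract one extra key on top of the RSS fields
        return {'html_length': len(htmlsource)}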
def add_file(self, release, filename, folder):
    """
    release: the release name
    filename: path where to find the file on HD
    folder: the folder to the SRR e.g. "Sample" without the /
    """
    # http://stackoverflow.com/questions/680305/
    # using-multipartposthandler-to-post-form-data-with-python
    # Register the streaming http handlers with urllib2
    opener = register_openers()
    # https://bitbucket.org/chrisatlee/poster/issue/7/
    # multipart-form-post-doesnt-work-with
    if _PROXY:
        opener.add_handler(ProxyHandler({_PROXY_TYPE: _PROXY_URL}))

    # Start the multipart/form-data encoding of the file. Each key is the
    # name of a form parameter, normally set via the "name" attribute of the
    # HTML <input> tag.
    # Ensure the file name is Unicode:
    filename = filename.decode(sys.getfilesystemencoding())
    # new_headers contains the necessary Content-Type and Content-Length;
    # datagen is a generator object that yields the encoded parameters
    datagen, new_headers = multipart_encode({
        "folder": folder,
        "MAX_FILE_SIZE": _MAX_FILE_SIZE,
        "file": open(filename, "rb"),
        "add": "Add",
    })
    headers = dict(self.headers)  # makes a copy of the original dict
    headers.update(new_headers)
    url = self.baseurl + "release/add/" + release.replace(' ', '%20')
    request = Request(url, datagen, headers)
    opener.add_handler(HTTPCookieProcessor(self.cj))

    if folder != "":
        fn = folder + "/"
    else:
        fn = ""
    fn += os.path.basename(filename)

    # Actually do the request and get the response
    try:
        handle = urllib2.urlopen(request)
        html_source = handle.read()
        # re.escape prevents "sre_constants.error: unbalanced parenthesis"
        if len(re.findall(".*%s.*" % re.escape(fn), html_source)):
            # also gives this result if it was already there in the first place
            print("'%s' successfully uploaded." % fn)
            success = True
        # elif len(re.findall(".*an error occurred while adding the file.*",
        #                     html_source)):
        #     print("!!! '%s': file already added." % fn)
        #     success = False
        elif len(re.findall("You were redirected to this page", html_source)):
            # grab release name from top of details page
            match = re.search(".*RELEASE .*value=\"(.*)\".*", html_source)
            if match:
                release = match.group(1)
                print("??? Redirecting to '%s'." % release)
                success = self.add_file(release, filename, folder)
            else:
                print("!!! Error uploading file to '%s'." % release)
                success = False
        else:
            print(html_source)
            print("The site has been changed.")
            success = False
        if "<html" not in html_source:
            # keep retrying in this case
            raise httplib.HTTPException("No HTML received")
    except urllib2.HTTPError as e:
        if e.code == 404:
            print("!!! '%s': no such release." % release)
            success = False
        else:
            raise
    return success
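# Hedged usage sketch: the surrounding class is not shown above, so the
# constructor and its fields (headers, cj, baseurl) are assumptions here --
# only add_file() itself comes from the snippet:
# uploader = SrrUploader()  # hypothetical wrapper providing headers/cj/baseurl
# if uploader.add_file("Some.Release-GRP", "/tmp/sample.mkv", "Sample"):
#     print("sample stored next to the SRR")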
def __init__(self, agent=_msie, cookies=True, handlers=None):
    # Avoid the mutable-default-argument pitfall: a shared default list would
    # accumulate cookie processors across instances.
    handlers = list(handlers) if handlers else []
    self.agent = agent
    if cookies:
        handlers.append(HTTPCookieProcessor(CookieJar()))
    self.opener = build_opener(*handlers)
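# Hedged usage sketch ("Browser" is an assumed name -- the enclosing class is
# not shown above):
# browser = Browser()                 # cookie-aware opener, default agent
# browser = Browser(cookies=False)    # plain opener, no cookie jar
# browser = Browser(handlers=[ProxyHandler({'http': 'http://127.0.0.1:8080'})])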
    log(content, xbmc.LOGERROR)


def debug(content):
    log(content, xbmc.LOGDEBUG)


def log(msg, level=xbmc.LOGNOTICE):
    msg = py2_enc(msg)
    xbmc.log("[" + addon.getAddonInfo('id') + "-"
             + addon.getAddonInfo('version') + "]" + msg, level)


cookie = os.path.join(temp, 'cookie.jar')
cj = LWPCookieJar()
if xbmcvfs.exists(cookie):
    cj.load(cookie, ignore_discard=True, ignore_expires=True)
opener = build_opener(HTTPCookieProcessor(cj))

baseURL = "https://www.anime-on-demand.de"


class Infowindow(pyxbmct.AddonDialogWindow):
    text = ""
    pos = 0
    image = ""
    trailer = ""
    starttext = ""

    def __init__(self, text=''):
        self.ueberschrift = re.compile('<h1 style="margin: 0;">(.+?)</h1>',
                                       re.DOTALL).findall(text)[0]
        try:
            self.image = re.compile('class="newspic" src="(.+?)"',
                                    re.DOTALL).findall(text)[0]
            if self.image[:4] != "http":
                self.image = baseURL + self.image
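# The jar above is only ever loaded; a hedged sketch of the matching save step
# (typically run after a successful login so session cookies persist -- an
# assumption, the save is not shown in the original):
# cj.save(cookie, ignore_discard=True, ignore_expires=True)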
import csv
import json
import sys
from cookielib import CookieJar
from urllib2 import build_opener, HTTPCookieProcessor
from cStringIO import StringIO
from difflib import SequenceMatcher

spreadsheet_url = 'https://docs.google.com/spreadsheet/ccc?key=13bmt8pwh4x4GFTnoctxkxjKjsxDtYwwXbGS6ZEB-ik8&output=csv'
local_json_file = 'quiz.json'

opener = build_opener(HTTPCookieProcessor(CookieJar()))
resp = opener.open(spreadsheet_url)
data = resp.read()

res = []
for index, question in enumerate(csv.DictReader(StringIO(data))):
    # ids can be nonsequential
    if question['ID']:
        res.append({
            "ID": index,
            "question": question['Android Test Question'],
            "right": [i for i in question['Right Answer(s)'].split("\n") if i],
            "wrong": [i for i in question['Wrong Answer(s)'].split("\n") if i],
            "tags": [i.strip() for i in question['Question Tag'].split(",") if i],
            "docRef": question["Reference Link"],
        })
        print([i.strip() for i in question['Question Tag'].split(",") if i])

# cannot import local modules like
# from checked_questions import reviewed
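# This snippet is Python 2 (cookielib, urllib2, cStringIO). A hedged sketch of
# the equivalent fetch under Python 3, assuming the rest stays unchanged:
# from http.cookiejar import CookieJar
# from urllib.request import build_opener, HTTPCookieProcessor
# from io import StringIO
#
# opener = build_opener(HTTPCookieProcessor(CookieJar()))
# data = opener.open(spreadsheet_url).read().decode('utf-8')  # csv needs str, not bytes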
def download_file_with_cookiejar(self, url, file_count, total, recursion=False):
    # See if we've already downloaded this file and whether it has the correct size
    download_file = os.path.basename(url).split('?')[0]
    if os.path.isfile(download_file):
        try:
            request = Request(url)
            request.get_method = lambda: 'HEAD'
            response = urlopen(request, timeout=30)
            remote_size = self.get_total_size(response)
            # Check that we were able to derive a size.
            if remote_size:
                local_size = os.path.getsize(download_file)
                # Accept the local file if it is within 1% of the remote size
                if remote_size < (local_size + (local_size * .01)) and \
                        remote_size > (local_size - (local_size * .01)):
                    print(" > Download file {0} exists! \n > Skipping download of {1}. ".format(
                        download_file, url))
                    return None, None
                # Partial file size wasn't the full file size; blow away the
                # chunk and start again
                print(" > Found {0} but it wasn't fully downloaded. Removing file and downloading again.".format(
                    download_file))
                os.remove(download_file)
        except ssl.CertificateError as e:
            print(" > ERROR: {0}".format(e))
            print(" > Could not validate SSL Cert. You may be able to overcome this using the --insecure flag")
            return False, None
        except HTTPError as e:
            if e.code == 401:
                print(" > IMPORTANT: Your user may not have permission to download this type of data!")
            else:
                print(" > Unknown Error, Could not get file HEAD: {0}".format(e))
        except URLError as e:
            print("URL Error (from HEAD): {0}, {1}".format(e.reason, url))
            if "ssl.c" in "{0}".format(e.reason):
                print("IMPORTANT: Remote location may not be accepting your SSL configuration. This is a terminal error.")
                return False, None

    # Attempt https connection
    try:
        request = Request(url)
        response = urlopen(request, timeout=30)

        # Watch for redirect
        if response.geturl() != url:

            # See if we were redirected BACK to URS for re-auth.
            if 'https://urs.earthdata.nasa.gov/oauth/authorize' in response.geturl():
                if recursion:
                    print(" > Entering seemingly endless auth loop. Aborting. ")
                    return False, None

                # Make this easier: if there is no app_type=401, add it
                new_auth_url = response.geturl()
                if "app_type" not in new_auth_url:
                    new_auth_url += "&app_type=401"

                print(" > While attempting to download {0}....".format(url))
                print(" > Need to obtain new cookie from {0}".format(new_auth_url))
                old_cookies = [cookie.name for cookie in self.cookie_jar]
                opener = build_opener(HTTPCookieProcessor(self.cookie_jar),
                                      HTTPHandler(), HTTPSHandler(**self.context))
                request = Request(new_auth_url)
                try:
                    response = opener.open(request)
                    for cookie in self.cookie_jar:
                        if cookie.name not in old_cookies:
                            print(" > Saved new cookie: {0}".format(cookie.name))
                            # A little hack to save session cookies
                            if cookie.discard:
                                cookie.expires = int(time.time()) + 60 * 60 * 24 * 30
                                print(" > Saving session Cookie that should have been discarded! ")
                    self.cookie_jar.save(self.cookie_jar_path,
                                         ignore_discard=True, ignore_expires=True)
                except HTTPError as e:
                    print("HTTP Error: {0}, {1}".format(e.code, url))
                    return False, None

                # Okay, now we have more cookies! Let's try again, recursively!
                print(" > Attempting download again with new cookies!")
                return self.download_file_with_cookiejar(url, file_count, total,
                                                         recursion=True)

            print(" > 'Temporary' Redirect download @ Remote archive:\n > {0}".format(
                response.geturl()))

        # Seems to be working
        print("({0}/{1}) Downloading {2}".format(file_count, total, url))

        # Open our local file for writing and build the status bar
        tf = tempfile.NamedTemporaryFile(mode='w+b', delete=False, dir='.')
        self.chunk_read(response, tf, report_hook=self.chunk_report)

        # Reset download status
        sys.stdout.write('\n')

        tempfile_name = tf.name
        tf.close()

    # Handle errors
    except HTTPError as e:
        print("HTTP Error: {0}, {1}".format(e.code, url))
        if e.code == 401:
            print(" > IMPORTANT: Your user does not have permission to download this type of data!")
        if e.code == 403:
            print(" > Got a 403 Error trying to download this file. ")
            print(" > You MAY need to log in to this app and agree to a EULA. ")
        return False, None
    except URLError as e:
        print("URL Error (from GET): {0}, {1}, {2}".format(e, e.reason, url))
        if "ssl.c" in "{0}".format(e.reason):
            print("IMPORTANT: Remote location may not be accepting your SSL configuration. This is a terminal error.")
        return False, None
    except socket.timeout as e:
        print(" > timeout requesting: {0}; {1}".format(url, e))
        return False, None
    except ssl.CertificateError as e:
        print(" > ERROR: {0}".format(e))
        print(" > Could not validate SSL Cert. You may be able to overcome this using the --insecure flag")
        return False, None

    # Return the file size
    shutil.copy(tempfile_name, download_file)
    os.remove(tempfile_name)
    file_size = self.get_total_size(response)
    actual_size = os.path.getsize(download_file)
    if file_size is None:
        # We were unable to calculate the file size.
        file_size = actual_size
    return actual_size, file_size
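# chunk_read()/chunk_report() are referenced above but not shown. A hedged
# sketch of what a matching progress hook could look like (an assumption, not
# the original implementation):
def chunk_report(self, bytes_so_far, file_size):
    if file_size is not None:
        percent = round(float(bytes_so_far) / file_size * 100, 2)
        sys.stdout.write(" > Downloaded {0} of {1} bytes ({2:.2f}%)\r".format(
            bytes_so_far, file_size, percent))
    else:
        # File size unknown; just show a running byte count
        sys.stdout.write(" > Downloaded {0} bytes\r".format(bytes_so_far))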
from subprocess import Popen, PIPE
from lxml import html

COOKIES_FILE = '/usr/local/etc/bandcamp.cookies'
URL = 'https://bandcamp.com'
CDN_COVERS = 'https://f4.bcbits.com/img'

cj = LWPCookieJar()
if os.path.isfile(COOKIES_FILE):
    cj.load(COOKIES_FILE)

handler = HTTPHandler(debuglevel=0)
opener = build_opener(handler, HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Enter your own user agent!'),
                     ('Accept', '*/*'),
                     ('Accept-Encoding', 'deflate')]

TMP_PATH = ''
TMP_FILE_PREFIX = 'tmpS_'

queue = Queue()

# Do we have to download, then add the cover to the downloaded music file?
ADD_COVER = 1
# Keep the cover file?
KEEP_COVER_FILE = 0
# Infinite DL?
INFINITE_DL = 1
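# Hedged usage sketch: fetch a page through the cookie-aware opener and parse
# it with the lxml import above ('/discover' is just an illustrative path):
# page = html.fromstring(opener.open(URL + '/discover').read())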
def check_cookie(self):
    if self.cookie_jar is None:
        print(" > Cookiejar is bunk: {0}".format(self.cookie_jar))
        return False

    # File we know is valid, used to validate the cookie
    file_check = 'https://urs.earthdata.nasa.gov/profile'

    # Apply custom Redirect Handler
    opener = build_opener(HTTPCookieProcessor(self.cookie_jar),
                          HTTPHandler(), HTTPSHandler(**self.context))
    install_opener(opener)

    # Attempt a HEAD request
    request = Request(file_check)
    request.get_method = lambda: 'HEAD'
    try:
        print(" > attempting to download {0}".format(file_check))
        response = urlopen(request, timeout=30)
        resp_code = response.getcode()
        # Make sure we're logged in
        if not self.check_cookie_is_logged_in(self.cookie_jar):
            return False
        # Save cookiejar
        self.cookie_jar.save(self.cookie_jar_path)
    except HTTPError:
        # If we get this error, it likely means the user has not agreed to the current EULA
        print("\nIMPORTANT: ")
        print("Your user appears to lack permissions to download data from the ASF Datapool.")
        print("\n\nNew users: you must first log into Vertex and accept the EULA. In addition, your Study Area must be set at Earthdata https://urs.earthdata.nasa.gov")
        exit(-1)

    # These return codes indicate the user has not been approved to download the data
    if resp_code in (300, 301, 302, 303):
        try:
            redir_url = response.info().getheader('Location')
        except AttributeError:
            redir_url = response.getheader('Location')

        # Funky test env:
        if ("vertex.daac.asf.alaska.edu" in redir_url
                and "test" in self.asf_urs4['redir']):
            print("Cough, cough. It's dusty in this test env!")
            return True

        print("Redirect ({0}) occurred, invalid cookie value!".format(resp_code))
        return False

    # These are successes!
    if resp_code in (200, 307):
        return True
    return False
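# check_cookie_is_logged_in() is called above but not shown. A hedged sketch,
# assuming URS marks an authenticated session with a dedicated cookie (the
# cookie name below is an assumption):
def check_cookie_is_logged_in(self, cj):
    for cookie in cj:
        if cookie.name == 'urs_user_already_logged':
            # Only an authenticated session carries this URS cookie
            return True
    return False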
def __init__(self):
    proxy = ProxyHandler(PROXY)
    self.cj = CookieJar()
    opener = build_opener(HTTPCookieProcessor(self.cj), proxy)
    install_opener(opener)
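# Hedged usage note: install_opener() makes the proxy + cookie handling global,
# so any plain urlopen() call afterwards routes through this opener. PROXY is
# assumed to be a dict in the shape urllib expects, e.g.:
# PROXY = {'http': 'http://127.0.0.1:8080'}
# urlopen('http://example.com')  # goes via the proxy; cookies land in self.cj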