예제 #1
0
    def _update_opener(self, drop_tls_level=False):
        """
        Builds and installs a new opener to be used by all future calls to
        :func:`urllib2.urlopen`.
        """
        handlers = [
            urllib_request.HTTPCookieProcessor(self._cj),
            urllib_request.HTTPBasicAuthHandler()
        ]

        if self._http_debug:
            handlers += [urllib_request.HTTPHandler(debuglevel=1)]
        else:
            handlers += [urllib_request.HTTPHandler()]

        if self._proxy:
            handlers += [urllib_request.ProxyHandler({'http': self._proxy})]

        try:
            import platform
            node = platform.node().lower()
        except:
            node = ''

        if not self._ssl_verify or node == 'xboxone':
            try:
                import ssl
                ctx = ssl.create_default_context()
                ctx.check_hostname = False
                ctx.verify_mode = ssl.CERT_NONE
                if self._http_debug:
                    handlers += [
                        urllib_request.HTTPSHandler(context=ctx, debuglevel=1)
                    ]
                else:
                    handlers += [urllib_request.HTTPSHandler(context=ctx)]
            except:
                pass
        else:
            try:
                import ssl
                import certifi
                ctx = ssl.create_default_context(cafile=certifi.where())
                if drop_tls_level:
                    ctx.protocol = ssl.PROTOCOL_TLSv1_1
                if self._http_debug:
                    handlers += [
                        urllib_request.HTTPSHandler(context=ctx, debuglevel=1)
                    ]
                else:
                    handlers += [urllib_request.HTTPSHandler(context=ctx)]
            except:
                pass

        opener = urllib_request.build_opener(*handlers)
        urllib_request.install_opener(opener)
예제 #2
0
def urlopen(request):
	if not urlopen._opener_installed:
		handler = []
		proxy_http = ucr_get('proxy/http')
		if proxy_http:
			handler.append(urllib_request.ProxyHandler({'http': proxy_http, 'https': proxy_http}))
		handler.append(HTTPSHandler())
		opener = urllib_request.build_opener(*handler)
		urllib_request.install_opener(opener)
		urlopen._opener_installed = True
	return urllib_request.urlopen(request, timeout=60)
예제 #3
0
def install_opener(ucr):
    handler = []
    proxy_http = ucr.get('proxy/http')
    if proxy_http:
        handler.append(
            urllib_request.ProxyHandler({
                'http': proxy_http,
                'https': proxy_http
            }))
    handler.append(HTTPSHandler())
    opener = urllib_request.build_opener(*handler)
    urllib_request.install_opener(opener)
예제 #4
0
    def _update_opener(self):
        '''
        Builds and installs a new opener to be used by all future calls to
        :func:`urllib2.urlopen`.
        '''
        if self._http_debug:
            http = urllib_request.HTTPHandler(debuglevel=1)
        else:
            http = urllib_request.HTTPHandler()

        if self._proxy:
            opener = urllib_request.build_opener(
                urllib_request.HTTPCookieProcessor(self._cj),
                urllib_request.ProxyHandler({'http': self._proxy}),
                urllib_request.HTTPBasicAuthHandler(), http)
        else:
            opener = urllib_request.build_opener(
                urllib_request.HTTPCookieProcessor(self._cj),
                urllib_request.HTTPBasicAuthHandler(), http)
            urllib_request.install_opener(opener)
예제 #5
0
def getUrl(url, proxy={}, timeout=TIMEOUT, cookies=True):
    global cs
    cookie = []
    if proxy:
        urllib_request.install_opener(
            urllib_request.build_opener(urllib_request.ProxyHandler(proxy)))
    elif cookies:
        cookie = http_cookiejar.LWPCookieJar()
        opener = urllib_request.build_opener(
            urllib_request.HTTPCookieProcessor(cookie))
        urllib_request.install_opener(opener)
    req = urllib_request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
    )
    try:
        response = urllib_request.urlopen(req, timeout=timeout)
        linkSRC = response.read()
        response.close()
    except:
        linkSRC = ''
    cs = ''.join(['%s=%s;' % (c.name, c.value) for c in cookie])
    return linkSRC
예제 #6
0
def getRegexParsed(
        regexs,
        url,
        cookieJar=None,
        forCookieJarOnly=False,
        recursiveCall=False,
        cachedPages={},
        rawPost=False,
        cookie_jar_file=None):  #0,1,2 = URL, regexOnly, CookieJarOnly
    #cachedPages = {}
    #print 'url',url
    doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url)
    #        print 'doRegexs',doRegexs,regexs
    setresolved = True
    for k in doRegexs:
        if k in regexs:
            #print 'processing ' ,k
            m = regexs[k]
            #print m
            cookieJarParam = False
            if 'cookiejar' in m:  # so either create or reuse existing jar
                #print 'cookiejar exists',m['cookiejar']
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    cookieJar = getRegexParsed(regexs, m['cookiejar'],
                                               cookieJar, True, True,
                                               cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            #print 'm[cookiejar]',m['cookiejar'],cookieJar
            if cookieJarParam:
                if cookieJar == None:
                    #print 'create cookie jar'
                    cookie_jar_file = None
                    if 'open[' in m['cookiejar']:
                        cookie_jar_file = m['cookiejar'].split(
                            'open[')[1].split(']')[0]
#                            print 'cookieJar from file name',cookie_jar_file

                    cookieJar = getCookieJar(cookie_jar_file)
                    #                        print 'cookieJar from file',cookieJar
                    if cookie_jar_file:
                        saveCookieJar(cookieJar, cookie_jar_file)
                    #cookieJar = http_cookiejar.LWPCookieJar()
                    #print 'cookieJar new',cookieJar
                elif 'save[' in m['cookiejar']:
                    cookie_jar_file = m['cookiejar'].split('save[')[1].split(
                        ']')[0]
                    complete_path = os.path.join(profile, cookie_jar_file)
                    #                        print 'complete_path',complete_path
                    saveCookieJar(cookieJar, cookie_jar_file)

            if m['page'] and '$doregex' in m['page']:
                pg = getRegexParsed(regexs,
                                    m['page'],
                                    cookieJar,
                                    recursiveCall=True,
                                    cachedPages=cachedPages)
                if len(pg) == 0:
                    pg = 'http://regexfailed'
                m['page'] = pg

            if 'setcookie' in m and m['setcookie'] and '$doregex' in m[
                    'setcookie']:
                m['setcookie'] = getRegexParsed(regexs,
                                                m['setcookie'],
                                                cookieJar,
                                                recursiveCall=True,
                                                cachedPages=cachedPages)
            if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m[
                    'appendcookie']:
                m['appendcookie'] = getRegexParsed(regexs,
                                                   m['appendcookie'],
                                                   cookieJar,
                                                   recursiveCall=True,
                                                   cachedPages=cachedPages)

            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs,
                                           m['post'],
                                           cookieJar,
                                           recursiveCall=True,
                                           cachedPages=cachedPages)
#                    print 'post is now',m['post']

            if 'rawpost' in m and '$doregex' in m['rawpost']:
                m['rawpost'] = getRegexParsed(regexs,
                                              m['rawpost'],
                                              cookieJar,
                                              recursiveCall=True,
                                              cachedPages=cachedPages,
                                              rawPost=True)
                #print 'rawpost is now',m['rawpost']

            if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime$',
                                                    getEpocTime())

            if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime2$',
                                                    getEpocTime2())

            link = ''
            if m['page'] and m[
                    'page'] in cachedPages and not 'ignorecache' in m and forCookieJarOnly == False:
                #print 'using cache page',m['page']
                link = cachedPages[m['page']]
            else:
                if m['page'] and not m['page'] == '' and m['page'].startswith(
                        'http'):
                    if '$epoctime$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime$',
                                                      getEpocTime())
                    if '$epoctime2$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime2$',
                                                      getEpocTime2())

                    #print 'Ingoring Cache',m['page']
                    page_split = m['page'].split('|')
                    pageUrl = page_split[0]
                    header_in_page = None
                    if len(page_split) > 1:
                        header_in_page = page_split[1]

#                            if
#                            proxy = urllib_request.ProxyHandler({ ('https' ? proxytouse[:5]=="https":"http") : proxytouse})
#                            opener = urllib_request.build_opener(proxy)
#                            urllib_request.install_opener(opener)

#                        print 'urllib_request.getproxies',urllib_request.getproxies()
                    current_proxies = urllib_request.ProxyHandler(
                        urllib_request.getproxies())

                    #print 'getting pageUrl',pageUrl
                    req = urllib_request.Request(pageUrl)
                    if 'proxy' in m:
                        proxytouse = m['proxy']
                        #                            print 'proxytouse',proxytouse
                        #                            urllib_request.getproxies= lambda: {}
                        if pageUrl[:5] == "https":
                            proxy = urllib_request.ProxyHandler(
                                {'https': proxytouse})
                            #req.set_proxy(proxytouse, 'https')
                        else:
                            proxy = urllib_request.ProxyHandler(
                                {'http': proxytouse})
                            #req.set_proxy(proxytouse, 'http')
                        opener = urllib_request.build_opener(proxy)
                        urllib_request.install_opener(opener)

                    req.add_header(
                        'User-Agent',
                        'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'
                    )
                    proxytouse = None

                    if 'referer' in m:
                        req.add_header('Referer', m['referer'])
                    if 'accept' in m:
                        req.add_header('Accept', m['accept'])
                    if 'agent' in m:
                        req.add_header('User-agent', m['agent'])
                    if 'x-req' in m:
                        req.add_header('X-Requested-With', m['x-req'])
                    if 'x-addr' in m:
                        req.add_header('x-addr', m['x-addr'])
                    if 'x-forward' in m:
                        req.add_header('X-Forwarded-For', m['x-forward'])
                    if 'setcookie' in m:
                        #                            print 'adding cookie',m['setcookie']
                        req.add_header('Cookie', m['setcookie'])
                    if 'appendcookie' in m:
                        #                            print 'appending cookie to cookiejar',m['appendcookie']
                        cookiestoApend = m['appendcookie']
                        cookiestoApend = cookiestoApend.split(';')
                        for h in cookiestoApend:
                            n, v = h.split('=')
                            w, n = n.split(':')
                            ck = http_cookiejar.Cookie(
                                version=0,
                                name=n,
                                value=v,
                                port=None,
                                port_specified=False,
                                domain=w,
                                domain_specified=False,
                                domain_initial_dot=False,
                                path='/',
                                path_specified=True,
                                secure=False,
                                expires=None,
                                discard=True,
                                comment=None,
                                comment_url=None,
                                rest={'HttpOnly': None},
                                rfc2109=False)
                            cookieJar.set_cookie(ck)
                    if 'origin' in m:
                        req.add_header('Origin', m['origin'])
                    if header_in_page:
                        header_in_page = header_in_page.split('&')
                        for h in header_in_page:
                            n, v = h.split('=')
                            req.add_header(n, v)

                    if not cookieJar == None:
                        #                            print 'cookieJarVal',cookieJar
                        cookie_handler = urllib_request.HTTPCookieProcessor(
                            cookieJar)
                        opener = urllib_request.build_opener(
                            cookie_handler,
                            urllib_request.HTTPBasicAuthHandler(),
                            urllib_request.HTTPHandler())
                        opener = urllib_request.install_opener(opener)
                        #                            print 'noredirect','noredirect' in m

                        if 'noredirect' in m:
                            opener = urllib_request.build_opener(
                                cookie_handler, NoRedirection,
                                urllib_request.HTTPBasicAuthHandler(),
                                urllib_request.HTTPHandler())
                            opener = urllib_request.install_opener(opener)
                    elif 'noredirect' in m:
                        opener = urllib_request.build_opener(
                            NoRedirection,
                            urllib_request.HTTPBasicAuthHandler(),
                            urllib_request.HTTPHandler())
                        opener = urllib_request.install_opener(opener)

                    if 'connection' in m:
                        #                            print '..........................connection//////.',m['connection']
                        from keepalive import HTTPHandler
                        keepalive_handler = HTTPHandler()
                        opener = urllib_request.build_opener(keepalive_handler)
                        urllib_request.install_opener(opener)

                    #print 'after cookie jar'
                    post = None

                    if 'post' in m:
                        postData = m['post']
                        #if '$LiveStreamRecaptcha' in postData:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #        postData=postData.replace('$LiveStreamRecaptcha','manual_recaptcha_challenge_field:'+captcha_challenge+',recaptcha_response_field:'+catpcha_word+',id:'+idfield)
                        splitpost = postData.split(',')
                        post = {}
                        for p in splitpost:
                            n = p.split(':')[0]
                            v = p.split(':')[1]
                            post[n] = v
                        post = urllib_parse.urlencode(post)

                    if 'rawpost' in m:
                        post = m['rawpost']
                        #if '$LiveStreamRecaptcha' in post:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #       post=post.replace('$LiveStreamRecaptcha','&manual_recaptcha_challenge_field='+captcha_challenge+'&recaptcha_response_field='+catpcha_word+'&id='+idfield)
                    link = ''
                    try:

                        if post:
                            response = urllib_request.urlopen(req, post)
                        else:
                            response = urllib_request.urlopen(req)
                        if response.info().get('Content-Encoding') == 'gzip':
                            import gzip
                            buf = six.BytesIO(response.read())
                            f = gzip.GzipFile(fileobj=buf)
                            link = f.read()
                        else:
                            link = response.read()

                        if 'proxy' in m and not current_proxies is None:
                            urllib_request.install_opener(
                                urllib_request.build_opener(current_proxies))

                        link = javascriptUnEscape(link)
                        #print repr(link)
                        #print link This just print whole webpage in LOG
                        if 'includeheaders' in m:
                            #link+=str(response.headers.get('Set-Cookie'))
                            link += '$$HEADERS_START$$:'
                            for b in response.headers:
                                link += b + ':' + response.headers.get(
                                    b) + '\n'
                            link += '$$HEADERS_END$$:'

#                        print link

                        response.close()
                    except:
                        pass
                    cachedPages[m['page']] = link
                    #print link
                    #print 'store link for',m['page'],forCookieJarOnly

                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                elif m['page'] and not m['page'].startswith('http'):
                    if m['page'].startswith('$pyFunction:'):
                        val = doEval(m['page'].split('$pyFunction:')[1], '',
                                     cookieJar, m)
                        if forCookieJarOnly:
                            return cookieJar  # do nothing
                        link = val
                        link = javascriptUnEscape(link)
                    else:
                        link = m['page']

            if '$doregex' in m['expres']:
                m['expres'] = getRegexParsed(regexs,
                                             m['expres'],
                                             cookieJar,
                                             recursiveCall=True,
                                             cachedPages=cachedPages)

            if not m['expres'] == '':
                #print 'doing it ',m['expres']
                if '$LiveStreamCaptcha' in m['expres']:
                    val = askCaptcha(m, link, cookieJar)
                    #print 'url and val',url,val
                    url = url.replace("$doregex[" + k + "]", val)

                elif m['expres'].startswith(
                        '$pyFunction:') or '#$pyFunction' in m['expres']:
                    #print 'expeeeeeeeeeeeeeeeeeee',m['expres']
                    val = ''
                    if m['expres'].startswith('$pyFunction:'):
                        val = doEval(m['expres'].split('$pyFunction:')[1],
                                     link, cookieJar, m)
                    else:
                        val = doEvalFunction(m['expres'], link, cookieJar, m)
                    if 'ActivateWindow' in m['expres']: return
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        return listrepeat, eval(val), m, regexs, cookieJar

                    try:
                        url = url.replace(u"$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          six.ensure_text(val))
                else:
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        ret = re.findall(m['expres'], link)
                        return listrepeat, ret, m, regexs

                    val = ''
                    if not link == '':
                        #print 'link',link
                        reg = re.compile(m['expres']).search(link)
                        try:
                            val = reg.group(1).strip()
                        except:
                            traceback.print_exc()
                    elif m['page'] == '' or m['page'] == None:
                        val = m['expres']

                    if rawPost:
                        #                            print 'rawpost'
                        val = urllib_parse.quote_plus(val)
                    if 'htmlunescape' in m:
                        #val=urllib_parse.unquote_plus(val)
                        import HTMLParser
                        val = HTMLParser.HTMLParser().unescape(val)
                    try:
                        url = url.replace("$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          six.ensure_text(val))
                    #print 'ur',url
                    #return val
            else:
                url = url.replace("$doregex[" + k + "]", '')
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if '$epoctime2$' in url:
        url = url.replace('$epoctime2$', getEpocTime2())

    if '$GUID$' in url:
        import uuid
        url = url.replace('$GUID$', str(uuid.uuid1()).upper())
    if '$get_cookies$' in url:
        url = url.replace('$get_cookies$', getCookiesString(cookieJar))

    if recursiveCall: return url
    #print 'final url',repr(url)
    if url == "":
        return
    else:
        return url, setresolved