Example #1
# cookieJar, clientHeader and gproxy are module-level state shared with the
# rest of the script.
import traceback
import urllib2


def getUrlold(url, timeout=20, returnres=False):
    global cookieJar
    global clientHeader
    try:
        post = None  # this helper only issues GET requests; the POST branch below never fires

        cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)
        opener = urllib2.build_opener(cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())

        if post:
            req = urllib2.Request(url, post)
        else:
            req = urllib2.Request(url)
        
        ua_header=False
        if clientHeader:
            for n,v in clientHeader:
                req.add_header(n,v)
                if n=='User-Agent':
                    ua_header=True

        if not ua_header:
            req.add_header('User-Agent','AppleCoreMedia/1.0.0.12B411 (iPhone; U; CPU OS 8_1 like Mac OS X; en_gb)')
        
        #req.add_header('X-Playback-Session-Id','9A1E596D-6AB6-435F-85D1-59BDD0E62D24')
        if gproxy:
            req.set_proxy(gproxy, 'http')
        response = opener.open(req, timeout=timeout)  # honour the timeout parameter
        
        if returnres: return response
        data = response.read()
        return data

    except:
        print 'Error in getUrlold'
        traceback.print_exc()
        return None
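
A minimal usage sketch (hypothetical values; the function reads the module-level cookieJar, clientHeader and gproxy it declares as globals):

import cookielib

cookieJar = cookielib.LWPCookieJar()   # jar shared via the globals above
clientHeader = None                    # optional list of (name, value) header tuples
gproxy = None                          # optional 'host:port' HTTP proxy string

html = getUrlold('http://example.com/', timeout=15)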
Example #2
import re
import sys
import cookielib
import urllib2
from urllib2 import urlopen

ERR_STR_NO_DOI = 'No document object identifier found on the page: '
ERR_STR_NO_BIBTEX = 'No BibTeX entry found for the DOI: '
ERR_STR_NO_URL = 'No URL found in the BibTeX entry for the DOI: '
ERR_STR_REPORT = 'Please report the error to [email protected].'

# read url from std input
url = sys.stdin.readline()

# get rid of the newline at the end
url = url.strip()

cookie_jar = cookielib.CookieJar()

handlers = []
if "--debug" in sys.argv:
    handlers.append(urllib2.HTTPHandler(debuglevel=1))
handlers.append(urllib2.HTTPCookieProcessor(cookie_jar))

opener = urllib2.build_opener(*handlers)
opener.addheaders = [('User-agent', 'lwp-request/5.810')]
urllib2.install_opener(opener)

# link.aps.org/doi is basically a DOI resolver, but not necessarily to
# aps.org
if re.search(r'/doi/', url):
    f = urlopen(url)
    # Opening the URL automatically follows redirects. Note this is a full
    # GET, not a HEAD -- the response body simply never gets read here.
    url = f.geturl()
    # if not an APS url we can handle, let another plugin handle it
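
To resolve redirects without transferring the body at all, one can issue a real HEAD request; a sketch:

class HeadRequest(urllib2.Request):
    # urllib2 picks the HTTP verb from get_method()
    def get_method(self):
        return 'HEAD'

resolved_url = urllib2.urlopen(HeadRequest(url)).geturl()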
Example #3
# _log, get_data_path, NoRedirectHandler and http_debug_log_enabled are
# defined elsewhere in the module this function was lifted from.
import gzip
import os
import time
import urllib2
from StringIO import StringIO


def read_body_and_headers(url,
                          post=None,
                          headers=None,
                          follow_redirects=False,
                          timeout=None):
    _log("read_body_and_headers " + url)

    if post is not None:
        _log("read_body_and_headers post=" + post)

    # avoid a shared mutable default for headers
    if headers is None:
        headers = []
    if len(headers) == 0:
        headers.append([
            "User-Agent",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:18.0) Gecko/20100101 Firefox/18.0"
        ])

    # Start cookie lib
    ficherocookies = os.path.join(get_data_path(), 'cookies.dat')
    _log("read_body_and_headers cookies_file=" + ficherocookies)

    cj = None
    ClientCookie = None
    cookielib = None

    # Let's see if cookielib is available
    try:
        _log("read_body_and_headers importing cookielib")
        import cookielib
    except ImportError:
        _log("read_body_and_headers cookielib no disponible")
        # If importing cookielib fails
        # let's try ClientCookie
        try:
            _log("read_body_and_headers importing ClientCookie")
            import ClientCookie
        except ImportError:
            _log("read_body_and_headers ClientCookie not available")
            # ClientCookie isn't available either
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            _log("read_body_and_headers ClientCookie available")
            # imported ClientCookie
            urlopen = ClientCookie.urlopen
            Request = ClientCookie.Request
            cj = ClientCookie.MozillaCookieJar()

    else:
        _log("read_body_and_headers cookielib available")
        # importing cookielib worked
        urlopen = urllib2.urlopen
        Request = urllib2.Request
        cj = cookielib.MozillaCookieJar()
        # This is a subclass of FileCookieJar
        # that has useful load and save methods

    if cj is not None:
        # we successfully imported
        # one of the two cookie handling modules
        _log("read_body_and_headers Cookies enabled")

        if os.path.isfile(ficherocookies):
            _log("read_body_and_headers Reading cookie file")
            # if we have a cookie file already saved
            # then load the cookies into the Cookie Jar
            try:
                cj.load(ficherocookies)
            except:
                _log("read_body_and_headers Wrong cookie file, deleting...")
                os.remove(ficherocookies)

        # Now we need to get our Cookie Jar
        # installed in the opener;
        # for fetching URLs
        if cookielib is not None:
            _log("read_body_and_headers opener using urllib2 (cookielib)")
            # if we use cookielib
            # then we get the HTTPCookieProcessor
            # and install the opener in urllib2
            if not follow_redirects:
                opener = urllib2.build_opener(
                    urllib2.HTTPHandler(debuglevel=http_debug_log_enabled),
                    urllib2.HTTPCookieProcessor(cj), NoRedirectHandler())
            else:
                opener = urllib2.build_opener(
                    urllib2.HTTPHandler(debuglevel=http_debug_log_enabled),
                    urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)

        else:
            _log("read_body_and_headers opener using ClientCookie")
            # if we use ClientCookie
            # then we get the HTTPCookieProcessor
            # and install the opener in ClientCookie
            opener = ClientCookie.build_opener(
                ClientCookie.HTTPCookieProcessor(cj))
            ClientCookie.install_opener(opener)

    # -------------------------------------------------
    # Cookies installed, now issue the request
    # -------------------------------------------------

    # Timer
    inicio = time.clock()

    # Dictionary for the request headers
    txheaders = {}

    # Build the request
    if post is None:
        _log("read_body_and_headers GET request")
    else:
        _log("read_body_and_headers POST request")

    # Add the headers
    _log("read_body_and_headers ---------------------------")
    for header in headers:
        _log("read_body_and_headers header %s=%s" %
             (str(header[0]), str(header[1])))
        txheaders[header[0]] = header[1]
    _log("read_body_and_headers ---------------------------")

    req = Request(url, post, txheaders)
    if timeout is None:
        handle = urlopen(req)
    else:
        # Available from Python 2.6 onwards --> handle = urlopen(req, timeout=timeout)
        # For all versions:
        import socket
        deftimeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(timeout)
        try:
            handle = urlopen(req)
        finally:
            # always restore the process-wide default timeout; the old code
            # swallowed errors here and then crashed on the undefined handle
            socket.setdefaulttimeout(deftimeout)

    # Update the cookie store
    if cj is not None:
        cj.save(ficherocookies)

    # Read the data and close
    if handle.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(handle.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read()
    else:
        data = handle.read()

    info = handle.info()
    _log("read_body_and_headers Response")

    returnheaders = []
    _log("read_body_and_headers ---------------------------")
    for header in info.keys():  # rfc822.Message is not directly iterable
        _log("read_body_and_headers " + header + "=" + info[header])
        returnheaders.append([header, info[header]])
    handle.close()
    _log("read_body_and_headers ---------------------------")
    '''
    # Issue the request
    try:
        response = urllib2.urlopen(req)
    # If it fails, retry with special characters escaped
    except:
        req = urllib2.Request(url.replace(" ","%20"))

        # Add the headers
        for header in headers:
            req.add_header(header[0],header[1])

        response = urllib2.urlopen(req)
    '''

    # Elapsed time
    fin = time.clock()
    _log("read_body_and_headers Downloaded in %d seconds " %
         (fin - inicio + 1))
    _log("read_body_and_headers body=" + data)

    return data, returnheaders
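
A usage sketch with hypothetical values (the function also relies on _log, get_data_path, NoRedirectHandler and http_debug_log_enabled from its own module):

data, response_headers = read_body_and_headers(
    'http://example.com/login',
    post='user=foo&pass=bar',                      # already url-encoded, as the function expects
    headers=[['Referer', 'http://example.com/']],
    timeout=15)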
Example #4
access_token_secret = "LbCFs0erKoPpHNZFNVBeuaaT3In9CKkBSmwAohYGr8Tkl"

consumer_key = "GvbPz6XCMcOp8610jEifMg"
consumer_secret = "o7AT3QTrshnswlkQWmYoZiaCY5vYVzYBUQKWPn25zg"

_debug = 0

# NOTE: access_token_key is never defined in the published snippet; it is
# assumed to be set alongside access_token_secret above.
oauth_token    = woof.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = woof.Consumer(key=consumer_key, secret=consumer_secret)

signature_method_hmac_sha1 = woof.SignatureMethod_HMAC_SHA1()

http_method = "GET"


# "urllib" here is presumably urllib2 (or urllib.request on Python 3) bound to
# that name; the plain Python 2 urllib module has no HTTPHandler.
http_handler  = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)

'''
Construct, sign, and open a twitter request
using the hard-coded credentials above.
'''
def twitterreq(url, method, parameters):
  req = woof.Request.from_consumer_and_token(oauth_consumer,
                                             token=oauth_token,
                                             http_method=http_method,
                                             http_url=url, 
                                             parameters=parameters)

  req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)
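
  # The published snippet ends after signing. Assuming woof mirrors the
  # oauth2 package's Request API (to_header()/to_url()), a plausible
  # completion -- a sketch, not the original code -- looks like:
  opener = urllib.OpenerDirector()
  opener.add_handler(http_handler)
  opener.add_handler(https_handler)
  response = opener.open(req.to_url())  # the signature travels in the query string
  return response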
Example #5
import urllib
import urllib2
import HTMLParser

http_header ={
    'User-Agent':'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
    'Accept': 'image/png,image/*;q=0.8,*/*;q=0.5',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive'
    }


class getBlogList(HTMLParser.HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                # str.index() raises ValueError on a miss and returns a falsy 0
                # when the prefix matches at position 0; startswith() is what
                # was meant here
                if name == 'href' and value.startswith('http://www.xxx.com/888/blog/'):
                    print value

host = 'http://www.xxx.com/888/blog'
http_debug_handler = urllib2.HTTPHandler(debuglevel=1)
urllib2.install_opener(urllib2.build_opener(http_debug_handler))
request = urllib2.Request(host,None,http_header)
response = urllib2.urlopen(request)

engine = getBlogList()
engine.feed(response.read())
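
Note that the request advertises 'Accept-Encoding: gzip, deflate', so the body may arrive compressed and confuse the parser; a sketch of the guard one would use in place of the feed above:

import gzip
from StringIO import StringIO

body = response.read()
if response.info().get('Content-Encoding') == 'gzip':
    body = gzip.GzipFile(fileobj=StringIO(body)).read()
engine.feed(body)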

Example #6
uwcicon = xbmc.translatePath(os.path.join(rootDir, 'icon.png'))
changelog = xbmc.translatePath(os.path.join(rootDir, 'changelog.txt'))

profileDir = addon.getAddonInfo('profile')
profileDir = xbmc.translatePath(profileDir).decode("utf-8")
cookiePath = os.path.join(profileDir, 'cookies.lwp')
kodiver = xbmc.getInfoLabel("System.BuildVersion").split(".")[0]

if not os.path.exists(profileDir):
    os.makedirs(profileDir)

urlopen = urllib2.urlopen
cj = cookielib.LWPCookieJar(xbmc.translatePath(cookiePath))
Request = urllib2.Request

handlers = [urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler()]

if (2, 7, 8) < sys.version_info < (2, 7, 12):
    try:
        import ssl
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE
        handlers += [urllib2.HTTPSHandler(context=ssl_context)]
    except:
        pass

if cj is not None:
    if os.path.isfile(xbmc.translatePath(cookiePath)):
        try:
            cj.load()
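        # The snippet is cut off at cj.load(); a plausible continuation under
        # the usual Kodi add-on pattern (hypothetical, not the original code):
        except:
            pass  # a corrupt cookie file is simply ignored
    handlers += [urllib2.HTTPCookieProcessor(cj)]

urllib2.install_opener(urllib2.build_opener(*handlers))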
Example #7
    def do_request(self, json_obj):
        headers = {
            'Content-Type': 'application/json-rpc',
            'User-Agent': 'python/zabbix_api'
        }

        if self.httpuser:
            self.debug(logging.INFO, "HTTP Auth enabled")
            auth = 'Basic ' + string.strip(
                base64.encodestring(self.httpuser + ':' + self.httppasswd))
            headers['Authorization'] = auth
        self.r_query.append(str(json_obj))
        self.debug(logging.INFO, "Sending: " + str(json_obj))
        self.debug(logging.DEBUG, "Sending headers: " + str(headers))

        request = urllib2.Request(url=self.url,
                                  data=json_obj.encode('utf-8'),
                                  headers=headers)
        if self.proto == "https":
            https_handler = urllib2.HTTPSHandler(debuglevel=0)
            opener = urllib2.build_opener(https_handler)
        elif self.proto == "http":
            http_handler = urllib2.HTTPHandler(debuglevel=0)
            opener = urllib2.build_opener(http_handler)
        else:
            raise ZabbixAPIException("Unknow protocol %s" % self.proto)

        urllib2.install_opener(opener)
        try:
            response = opener.open(request, timeout=self.timeout)
        except ssl.SSLError as e:
            if e.message == "The read operation timed out":
                raise APITimeout("SSL read timeout", )
            else:
                raise e
        except socket.timeout as e:
            raise APITimeout("HTTP read timeout", )
        except urllib2.URLError as e:
            if "Connection timed out" in e.message:
                raise APITimeout("HTTP read timeout", )
            else:
                raise e
        self.debug(logging.INFO, "Response Code: " + str(response.code))

        # NOTE: Getting a 412 response code means the headers are not in the
        # list of allowed headers.
        if response.code != 200:
            raise ZabbixAPIException("HTTP ERROR %s: %s" %
                                     (response.code, response.msg))
        reads = response.read()
        if len(reads) == 0:
            raise ZabbixAPIException("Received zero answer")
        try:
            jobj = json.loads(reads.decode('utf-8'))
        except ValueError as msg:
            print("unable to decode. returned string: %s" % reads)
            sys.exit(-1)
        self.debug(logging.DEBUG, "Response Body: " + str(jobj))

        self.id += 1

        if 'error' in jobj:  # some exception
            msg = "Error %s: %s, %s while sending %s" % (
                jobj['error']['code'], jobj['error']['message'],
                jobj['error']['data'], str(json_obj))
            if re.search(".*already\sexists.*", jobj["error"]["data"],
                         re.I):  # already exists
                raise Already_Exists(msg, jobj['error']['code'])
            else:
                raise ZabbixAPIException(msg, jobj['error']['code'])
        return jobj
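
A hypothetical call, assuming a ZabbixAPI-style instance with url, proto, httpuser and timeout already configured; do_request() expects the JSON-RPC envelope as a string:

payload = json.dumps({
    "jsonrpc": "2.0",
    "method": "host.get",
    "params": {"output": ["hostid", "name"]},
    "auth": session_token,   # hypothetical token from a prior user.login call
    "id": 1,
})
result = api.do_request(payload)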
Example #8
    def send(self,
             url,
             method="GET",
             payload=None,
             headers=None,
             cookie=None,
             auth=None,
             content=''):

        # initialize url service
        urlib = self.framework.urlib(url)
        if urlib.scheme == '':
            url = urlib.sub_service("http")

        # get random user agent
        if self.rand_agent:
            self.user_agent = self.framework.rand_uagent().get
        # Makes a web request and returns a response object.
        if method.upper() != "POST" and content:
            raise RequestException(
                "Invalid content type for the %s method: %s" %
                (method, content))
        # Prime local mutable variables to prevent persistence
        if payload is None:
            payload = {}
        if headers is None:
            headers = {}
        if auth is None:
            auth = ()

        # Set request arguments
        # Process user-agent header: honor a caller-supplied value and fall
        # back to the instance default otherwise (the old code unconditionally
        # overwrote it, then re-set it in a branch that could never matter)
        if not headers.get("User-Agent"):
            headers["User-Agent"] = self.user_agent
        # process payload
        if content.upper() == "JSON":
            headers["Content-Type"] = "application/json"
            payload = json.dumps(payload)
        else:
            payload = urlencode(encode_payload(payload))
        # process basic authentication
        if len(auth) == 2:
            authorization = b64encode(
                utf_8_encode('%s:%s' % (auth[0], auth[1]))).replace('\n', '')
            headers["Authorization"] = "Basic %s" % (authorization)
        # Process socket timeout
        if self.timeout:
            socket.setdefaulttimeout(self.timeout)

        # Set handlers
        # Declare handlers list according to debug setting
        handlers = [
            request.HTTPHandler(debuglevel=1),
            request.HTTPSHandler(debuglevel=1)
        ] if self.debug else []
        # Process cookie handler
        if cookie is not None and cookie != {}:
            if isinstance(cookie, dict):
                str_cookie = ""
                for i in cookie:
                    str_cookie += "%s=%s; " % (i, cookie[i])
                headers["Cookie"] = str_cookie
            else:
                handlers.append(request.HTTPCookieProcessor(cookie))

        # Process redirect and add handler
        if not self.redirect:
            handlers.append(NoRedirectHandler)
        # Process proxies and add handler
        if self.proxy:
            proxies = {"http": self.proxy, "https": self.proxy}
            handlers.append(request.ProxyHandler(proxies))

        # Install opener
        opener = request.build_opener(*handlers)
        request.install_opener(opener)
        # Process method and make request
        if method == "GET":
            if payload:
                url = "%s?%s" % (url, payload)
            req = request.Request(url, headers=headers)
        elif method == "POST":
            req = request.Request(url, data=payload, headers=headers)
        elif method == "HEAD":
            if payload:
                url = "%s?%s" % (url, payload)
            req = request.Request(url, headers=headers)
            req.get_method = lambda: "HEAD"
        else:
            raise RequestException(
                "Request method \'%s\' is not a supported method." % (method))

        try:
            resp = request.urlopen(req)
        except request.HTTPError as e:
            resp = e

        # Build and return response object
        return ResponseObject(resp, cookie)
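
A hypothetical call from elsewhere in the same class, exercising the JSON and cookie paths through the handler setup above:

resp = self.send('https://api.example.com/search',   # hypothetical endpoint
                 method='POST',
                 payload={'q': 'example'},
                 content='JSON',                     # serialized via json.dumps
                 cookie={'session': 'abc123'})       # sent as a Cookie header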
Example #9
    def _post_multipart(self,
                        host,
                        selector,
                        fields,
                        files,
                        ssl=False,
                        port=80,
                        proxy_url=None,
                        proxy_port=None):
        """ performs a multi-post to AGOL, Portal, or AGS
            Inputs:
               host - string - root url (no http:// or https://)
                   ex: www.arcgis.com
               selector - string - everything after the host
                   ex: /PWJUSsdoJDp7SgLj/arcgis/rest/services/GridIndexFeatures/FeatureServer/0/1/addAttachment
               fields - dictionary - additional parameters like token and format information
               files - tuple array- tuple with the file name type, filename, full path
               ssl - option to use SSL
               proxy_url - string - url to proxy server
               proxy_port - integer - port value if not on port 80

            Output:
               JSON response as dictionary
            Usage:
               import urlparse
               url = "http://sampleserver3.arcgisonline.com/ArcGIS/rest/services/SanFrancisco/311Incidents/FeatureServer/0/10261291"
               parsed_url = urlparse.urlparse(url)
               params = {"f":"json"}
               print _post_multipart(host=parsed_url.hostname,
                               selector=parsed_url.path,
                               files=files,
                               fields=params
                               )
        """
        content_type, body = self._encode_multipart_formdata(fields, files)

        if ssl:
            url = "https://%s%s" % (host, selector)
        else:
            url = "http://%s%s" % (host, selector)
        if proxy_url is not None:
            if proxy_port is None:
                proxy_port = 80
            proxies = {
                "http": "http://%s:%s" % (proxy_url, proxy_port),
                "https": "https://%s:%s" % (proxy_url, proxy_port)
            }
            proxy_support = urllib2.ProxyHandler(proxies)
            opener = urllib2.build_opener(proxy_support,
                                          urllib2.HTTPHandler(debuglevel=0))
            urllib2.install_opener(opener)
        request = urllib2.Request(url)
        request.add_header('User-agent', 'ArcREST')
        request.add_header('Content-type', content_type)
        request.add_header('Content-length', str(len(body)))
        request.add_data(body)
        result = urllib2.urlopen(request).read()
        if result == "":
            return ""
        jres = json.loads(result)
        if 'error' in jres:
            if jres['error']['message'] == 'Request not made over ssl':
                if url.startswith('http://'):
                    url = url.replace('http://', 'https://')
                    return self._post_multipart(host,
                                                selector,
                                                fields,
                                                files,
                                                ssl=True,
                                                port=port,
                                                proxy_url=proxy_url,
                                                proxy_port=proxy_port)
        return self._unicode_convert(jres)
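
A usage sketch echoing the docstring (hypothetical attachment; the shape of each files tuple -- form field name, file name, full path -- follows the docstring and ultimately depends on _encode_multipart_formdata):

import urlparse

url = ("http://sampleserver3.arcgisonline.com/ArcGIS/rest/services/"
       "SanFrancisco/311Incidents/FeatureServer/0/10261291")
parsed = urlparse.urlparse(url)
files = [('attachment', 'photo.jpg', '/tmp/photo.jpg')]
res = self._post_multipart(host=parsed.hostname,
                           selector=parsed.path,
                           fields={'f': 'json'},
                           files=files)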
Example #10
# getCookieJar, saveCookieJar, getEpocTime, getEpocTime2, doEval,
# doEvalFunction, askCaptcha, javascriptUnEscape, getCookiesString,
# NoRedirection and `profile` are helpers defined elsewhere in the add-on.
import os
import re
import urllib
import urllib2
import cookielib
import traceback


def getRegexParsed(
        regexs,
        url,
        cookieJar=None,
        forCookieJarOnly=False,
        recursiveCall=False,
        cachedPages={},
        rawPost=False,
        cookie_jar_file=None):  #0,1,2 = URL, regexOnly, CookieJarOnly
    doRegexs = re.compile(r'\$doregex\[([^\]]*)\]').findall(url)
    setresolved = True
    for k in doRegexs:
        if k in regexs:
            m = regexs[k]
            cookieJarParam = False
            if 'cookiejar' in m:
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    cookieJar = getRegexParsed(regexs, m['cookiejar'],
                                               cookieJar, True, True,
                                               cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            if cookieJarParam:
                if cookieJar is None:
                    cookie_jar_file = None
                    if 'open[' in m['cookiejar']:
                        cookie_jar_file = m['cookiejar'].split(
                            'open[')[1].split(']')[0]
                    cookieJar = getCookieJar(cookie_jar_file)
                    if cookie_jar_file:
                        saveCookieJar(cookieJar, cookie_jar_file)
                elif 'save[' in m['cookiejar']:
                    cookie_jar_file = m['cookiejar'].split('save[')[1].split(
                        ']')[0]
                    complete_path = os.path.join(profile, cookie_jar_file)
                    saveCookieJar(cookieJar, cookie_jar_file)
            if m['page'] and '$doregex' in m['page']:
                pg = getRegexParsed(regexs,
                                    m['page'],
                                    cookieJar,
                                    recursiveCall=True,
                                    cachedPages=cachedPages)
                if len(pg) == 0:
                    pg = 'http://regexfailed'
                m['page'] = pg
            if 'setcookie' in m and m['setcookie'] and '$doregex' in m[
                    'setcookie']:
                m['setcookie'] = getRegexParsed(regexs,
                                                m['setcookie'],
                                                cookieJar,
                                                recursiveCall=True,
                                                cachedPages=cachedPages)
            if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m[
                    'appendcookie']:
                m['appendcookie'] = getRegexParsed(regexs,
                                                   m['appendcookie'],
                                                   cookieJar,
                                                   recursiveCall=True,
                                                   cachedPages=cachedPages)
            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs,
                                           m['post'],
                                           cookieJar,
                                           recursiveCall=True,
                                           cachedPages=cachedPages)
            if 'rawpost' in m and '$doregex' in m['rawpost']:
                m['rawpost'] = getRegexParsed(regexs,
                                              m['rawpost'],
                                              cookieJar,
                                              recursiveCall=True,
                                              cachedPages=cachedPages,
                                              rawPost=True)
            if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime$',
                                                    getEpocTime())
            if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime2$',
                                                    getEpocTime2())
            link = ''
            if m['page'] and m[
                    'page'] in cachedPages and not 'ignorecache' in m and forCookieJarOnly == False:
                link = cachedPages[m['page']]
            else:
                if m['page'] and not m['page'] == '' and m['page'].startswith(
                        'http'):
                    if '$epoctime$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime$',
                                                      getEpocTime())
                    if '$epoctime2$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime2$',
                                                      getEpocTime2())
                    page_split = m['page'].split('|')
                    pageUrl = page_split[0]
                    header_in_page = None
                    if len(page_split) > 1:
                        header_in_page = page_split[1]
                    current_proxies = urllib2.ProxyHandler(
                        urllib2.getproxies())
                    req = urllib2.Request(pageUrl)
                    if 'proxy' in m:
                        proxytouse = m['proxy']
                        if pageUrl[:5] == "https":
                            proxy = urllib2.ProxyHandler({'https': proxytouse})
                        else:
                            proxy = urllib2.ProxyHandler({'http': proxytouse})
                        opener = urllib2.build_opener(proxy)
                        urllib2.install_opener(opener)
                    req.add_header(
                        'User-Agent',
                        'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'
                    )
                    proxytouse = None
                    if 'referer' in m:
                        req.add_header('Referer', m['referer'])
                    if 'accept' in m:
                        req.add_header('Accept', m['accept'])
                    if 'agent' in m:
                        req.add_header('User-agent', m['agent'])
                    if 'x-req' in m:
                        req.add_header('X-Requested-With', m['x-req'])
                    if 'x-addr' in m:
                        req.add_header('x-addr', m['x-addr'])
                    if 'x-forward' in m:
                        req.add_header('X-Forwarded-For', m['x-forward'])
                    if 'setcookie' in m:
                        req.add_header('Cookie', m['setcookie'])
                    if 'appendcookie' in m:
                        cookiestoApend = m['appendcookie']
                        cookiestoApend = cookiestoApend.split(';')
                        for h in cookiestoApend:
                            n, v = h.split('=')
                            w, n = n.split(':')
                            ck = cookielib.Cookie(version=0,
                                                  name=n,
                                                  value=v,
                                                  port=None,
                                                  port_specified=False,
                                                  domain=w,
                                                  domain_specified=False,
                                                  domain_initial_dot=False,
                                                  path='/',
                                                  path_specified=True,
                                                  secure=False,
                                                  expires=None,
                                                  discard=True,
                                                  comment=None,
                                                  comment_url=None,
                                                  rest={'HttpOnly': None},
                                                  rfc2109=False)
                            cookieJar.set_cookie(ck)
                    if 'origin' in m:
                        req.add_header('Origin', m['origin'])
                    if header_in_page:
                        header_in_page = header_in_page.split('&')
                        for h in header_in_page:
                            n, v = h.split('=')
                            req.add_header(n, v)
                    if cookieJar is not None:
                        cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)
                        if 'noredirect' in m:
                            opener = urllib2.build_opener(
                                cookie_handler, NoRedirection,
                                urllib2.HTTPBasicAuthHandler(),
                                urllib2.HTTPHandler())
                        else:
                            opener = urllib2.build_opener(
                                cookie_handler, urllib2.HTTPBasicAuthHandler(),
                                urllib2.HTTPHandler())
                        # install_opener() returns None; the old code built and
                        # installed a throwaway opener before the noredirect one
                        urllib2.install_opener(opener)
                    elif 'noredirect' in m:
                        opener = urllib2.build_opener(
                            NoRedirection, urllib2.HTTPBasicAuthHandler(),
                            urllib2.HTTPHandler())
                        urllib2.install_opener(opener)
                    if 'connection' in m:
                        from keepalive import HTTPHandler
                        keepalive_handler = HTTPHandler()
                        opener = urllib2.build_opener(keepalive_handler)
                        urllib2.install_opener(opener)
                    post = None
                    if 'post' in m:
                        postData = m['post']
                        splitpost = postData.split(',')
                        post = {}
                        for p in splitpost:
                            n = p.split(':')[0]
                            v = p.split(':')[1]
                            post[n] = v
                        post = urllib.urlencode(post)
                    if 'rawpost' in m:
                        post = m['rawpost']
                    link = ''
                    try:
                        if post:
                            response = urllib2.urlopen(req, post)
                        else:
                            response = urllib2.urlopen(req)
                        if response.info().get('Content-Encoding') == 'gzip':
                            from StringIO import StringIO
                            import gzip
                            buf = StringIO(response.read())
                            f = gzip.GzipFile(fileobj=buf)
                            link = f.read()
                        else:
                            link = response.read()
                        if 'proxy' in m and not current_proxies is None:
                            urllib2.install_opener(
                                urllib2.build_opener(current_proxies))
                        link = javascriptUnEscape(link)
                        if 'includeheaders' in m:
                            link += '$$HEADERS_START$$:'
                            for b in response.headers:
                                link += b + ':' + response.headers.get(
                                    b) + '\n'
                            link += '$$HEADERS_END$$:'
                        response.close()
                    except:
                        pass
                    cachedPages[m['page']] = link
                    if forCookieJarOnly:
                        return cookieJar
                elif m['page'] and not m['page'].startswith('http'):
                    if m['page'].startswith('$pyFunction:'):
                        val = doEval(m['page'].split('$pyFunction:')[1], '',
                                     cookieJar, m)
                        if forCookieJarOnly:
                            return cookieJar
                        link = val
                        link = javascriptUnEscape(link)
                    else:
                        link = m['page']
            if '$doregex' in m['expres']:
                m['expres'] = getRegexParsed(regexs,
                                             m['expres'],
                                             cookieJar,
                                             recursiveCall=True,
                                             cachedPages=cachedPages)
            if not m['expres'] == '':
                if '$LiveStreamCaptcha' in m['expres']:
                    val = askCaptcha(m, link, cookieJar)
                    url = url.replace("$doregex[" + k + "]", val)
                elif m['expres'].startswith(
                        '$pyFunction:') or '#$pyFunction' in m['expres']:
                    val = ''
                    if m['expres'].startswith('$pyFunction:'):
                        val = doEval(m['expres'].split('$pyFunction:')[1],
                                     link, cookieJar, m)
                    else:
                        val = doEvalFunction(m['expres'], link, cookieJar, m)
                    if 'ActivateWindow' in m['expres']: return
                    if forCookieJarOnly:
                        return cookieJar
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        return listrepeat, eval(val), m, regexs, cookieJar
                    try:
                        url = url.replace(u"$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          val.decode("utf-8"))
                else:
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        ret = re.findall(m['expres'], link)
                        return listrepeat, ret, m, regexs
                    val = ''
                    if not link == '':
                        reg = re.compile(m['expres']).search(link)
                        try:
                            val = reg.group(1).strip()
                        except:
                            traceback.print_exc()
                    elif m['page'] == '' or m['page'] == None:
                        val = m['expres']
                    if rawPost:
                        val = urllib.quote_plus(val)
                    if 'htmlunescape' in m:
                        import HTMLParser
                        val = HTMLParser.HTMLParser().unescape(val)
                    try:
                        url = url.replace("$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          val.decode("utf-8"))
            else:
                url = url.replace("$doregex[" + k + "]", '')
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if '$epoctime2$' in url:
        url = url.replace('$epoctime2$', getEpocTime2())
    if '$GUID$' in url:
        import uuid
        url = url.replace('$GUID$', str(uuid.uuid1()).upper())
    if '$get_cookies$' in url:
        url = url.replace('$get_cookies$', getCookiesString(cookieJar))
    if recursiveCall: return url
    if url == "":
        return
    else:
        return url, setresolved
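
A sketch of the data structure the function consumes: each $doregex[name] token in the url is replaced by group(1) of the named regex, matched against its fetched page (names and URLs here are hypothetical):

regexs = {
    'token': {
        'page': 'http://example.com/player',        # page to fetch and scan
        'expres': r'token\s*=\s*"([^"]+)"',         # group(1) becomes the value
    }
}
resolved, setresolved = getRegexParsed(
    regexs, 'http://example.com/stream?token=$doregex[token]')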
Example #11
def request(url, close=True, redirect=True, error=False, proxy=None, post=None, headers=None, mobile=False, limit=None, referer=None, cookie=None, output='', timeout='30'):
    try:
        handlers = []

        if not proxy == None:
            handlers += [urllib2.ProxyHandler({'http':'%s' % (proxy)}), urllib2.HTTPHandler]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)


        if output == 'cookie' or output == 'extended' or not close == True:
            cookies = cookielib.LWPCookieJar()
            handlers += [urllib2.HTTPHandler(), urllib2.HTTPSHandler(), urllib2.HTTPCookieProcessor(cookies)]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)


        try:
            if sys.version_info < (2, 7, 9): raise Exception()
            import ssl; ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE
            handlers += [urllib2.HTTPSHandler(context=ssl_context)]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)
        except:
            pass


        if headers is None: headers = {}
        if 'User-Agent' in headers:
            pass
        elif not mobile == True:
            #headers['User-Agent'] = agent()
            headers['User-Agent'] = cache.get(randomagent, 1)
        else:
            headers['User-Agent'] = 'Apple-iPhone/701.341'
        if 'Referer' in headers:
            pass
        elif referer == None:
            headers['Referer'] = '%s://%s/' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc)
        else:
            headers['Referer'] = referer
        if not 'Accept-Language' in headers:
            headers['Accept-Language'] = 'en-US'
        if 'Cookie' in headers:
            pass
        elif not cookie == None:
            headers['Cookie'] = cookie


        if redirect == False:

            class NoRedirection(urllib2.HTTPErrorProcessor):
                def http_response(self, request, response): return response

            opener = urllib2.build_opener(NoRedirection)
            opener = urllib2.install_opener(opener)

            try: del headers['Referer']
            except: pass


        request = urllib2.Request(url, data=post, headers=headers)


        try:
            response = urllib2.urlopen(request, timeout=int(timeout))
        except urllib2.HTTPError as response:

            if response.code == 503:
                if 'cf-browser-verification' in response.read(5242880):

                    netloc = '%s://%s' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc)

                    ua = headers['User-Agent']

                    cf = cache.get(cfcookie().get, 168, netloc, ua, timeout)

                    headers['Cookie'] = cf

                    request = urllib2.Request(url, data=post, headers=headers)

                    response = urllib2.urlopen(request, timeout=int(timeout))

                elif error == False:
                    return

            elif error == False:
                return


        if output == 'cookie':
            try: result = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
            except: pass
            try: result = cf
            except: pass
            if close == True: response.close()
            return result

        elif output == 'geturl':
            result = response.geturl()
            if close == True: response.close()
            return result

        elif output == 'headers':
            result = response.headers
            if close == True: response.close()
            return result

        elif output == 'chunk':
            try: content = int(response.headers['Content-Length'])
            except: content = (2049 * 1024)
            if content < (2048 * 1024): return
            result = response.read(16 * 1024)
            if close == True: response.close()
            return result


        if limit == '0':
            result = response.read(224 * 1024)
        elif not limit == None:
            result = response.read(int(limit) * 1024)
        else:
            result = response.read(5242880)


        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)

            headers['Cookie'] = su

            request = urllib2.Request(url, data=post, headers=headers)

            response = urllib2.urlopen(request, timeout=int(timeout))

            if limit == '0':
                result = response.read(224 * 1024)
            elif not limit == None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)


        if output == 'extended':
            response_headers = response.headers
            response_code = str(response.code)
            try: cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
            except: pass
            try: cookie = cf
            except: pass
            if close == True: response.close()
            return (result, response_code, response_headers, headers, cookie)
        else:
            if close == True: response.close()
            return result
    except:
        return
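
Usage sketches (hypothetical URLs; the function leans on cache, randomagent, cfcookie and sucuri helpers from the same module):

html = request('http://example.com/')
session = request('http://example.com/', output='cookie', close=False)
body, code, resp_headers, req_headers, cookie = request(
    'http://example.com/', output='extended')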
Example #12
    def __init__(self, url, close=True, proxy=None, post=None, mobile=False, referer=None, cookie=None, output='', timeout='10'):
        if not proxy == None:
            proxy_handler = urllib2.ProxyHandler({'http':'%s' % (proxy)})
            opener = urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
            opener = urllib2.install_opener(opener)
        if output == 'cookie' or not close == True:
            import cookielib
            cookie_handler = urllib2.HTTPCookieProcessor(cookielib.LWPCookieJar())
            opener = urllib2.build_opener(cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
            opener = urllib2.install_opener(opener)
        if not post == None:
            request = urllib2.Request(url, post)
        else:
            request = urllib2.Request(url, None)
        if mobile == True:
            request.add_header('User-Agent', 'Mozilla/5.0 (iPhone; CPU; CPU iPhone OS 4_0 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8A293 Safari/6531.22.7')
        else:
            request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0')
        if not referer == None:
            request.add_header('Referer', referer)
        if not cookie == None:
            request.add_header('cookie', cookie)
        response = urllib2.urlopen(request, timeout=int(timeout))
        if output == 'cookie':
            result = str(response.headers.get('Set-Cookie'))
        elif output == 'geturl':
            result = response.geturl()
        else:
            result = response.read()
        if close == True:
            response.close()
        self.result = result
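
The class this __init__ belongs to is not shown; assuming it is called, say, client (hypothetical), usage would look like:

page = client('http://example.com/', mobile=True).result
final_url = client('http://example.com/redirect', output='geturl').result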
Example #13
# NoRedirection is a redirect-suppressing handler class defined elsewhere
# in this module.
import urllib2


def getUrl(url, cookieJar=None, post=None, timeout=20, headers=None, noredir=False):
    cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)

    if noredir:
        opener = urllib2.build_opener(NoRedirection,cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
    else:
        opener = urllib2.build_opener(cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
    req = urllib2.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36')
    if headers:
        for h,hv in headers:
            req.add_header(h,hv)

    response = opener.open(req,post,timeout=timeout)
    link=response.read()
    response.close()
    return link
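
A minimal usage sketch with a hypothetical URL:

import cookielib

cj = cookielib.LWPCookieJar()
html = getUrl('http://example.com/', cookieJar=cj,
              headers=[('Referer', 'http://example.com/')])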
Example #14
def getRegexParsed(regexs, url,cookieJar=None,forCookieJarOnly=False,recursiveCall=False,cachedPages={}, rawPost=False, cookie_jar_file=None):#0,1,2 = URL, regexOnly, CookieJarOnly
        #cachedPages = {}
        #print 'url',url
        doRegexs = re.compile(r'\$doregex\[([^\]]*)\]').findall(url)
#        print 'doRegexs',doRegexs,regexs
        setresolved=True
        for k in doRegexs:
            if k in regexs:
                #print 'processing ' ,k
                m = regexs[k]
                #print m
                cookieJarParam=False
                if  'cookiejar' in m: # so either create or reuse existing jar
                    #print 'cookiejar exists',m['cookiejar']
                    cookieJarParam=m['cookiejar']
                    if  '$doregex' in cookieJarParam:
                        cookieJar=getRegexParsed(regexs, m['cookiejar'],cookieJar,True, True,cachedPages)
                        cookieJarParam=True
                    else:
                        cookieJarParam=True
                #print 'm[cookiejar]',m['cookiejar'],cookieJar
                if cookieJarParam:
                    if cookieJar==None:
                        #print 'create cookie jar'
                        cookie_jar_file=None
                        if 'open[' in m['cookiejar']:
                            cookie_jar_file=m['cookiejar'].split('open[')[1].split(']')[0]
#                            print 'cookieJar from file name',cookie_jar_file

                        cookieJar=getCookieJar(cookie_jar_file)
#                        print 'cookieJar from file',cookieJar
                        if cookie_jar_file:
                            saveCookieJar(cookieJar,cookie_jar_file)
                        #import cookielib
                        #cookieJar = cookielib.LWPCookieJar()
                        #print 'cookieJar new',cookieJar
                    elif 'save[' in m['cookiejar']:
                        cookie_jar_file=m['cookiejar'].split('save[')[1].split(']')[0]
                        complete_path=os.path.join(profile,cookie_jar_file)
#                        print 'complete_path',complete_path
                        saveCookieJar(cookieJar,cookie_jar_file)

                if  m['page'] and '$doregex' in m['page']:
                    pg=getRegexParsed(regexs, m['page'],cookieJar,recursiveCall=True,cachedPages=cachedPages)
                    if len(pg)==0:
                        pg='http://regexfailed'
                    m['page']=pg

                if 'setcookie' in m and m['setcookie'] and '$doregex' in m['setcookie']:
                    m['setcookie']=getRegexParsed(regexs, m['setcookie'],cookieJar,recursiveCall=True,cachedPages=cachedPages)
                if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m['appendcookie']:
                    m['appendcookie']=getRegexParsed(regexs, m['appendcookie'],cookieJar,recursiveCall=True,cachedPages=cachedPages)


                if  'post' in m and '$doregex' in m['post']:
                    m['post']=getRegexParsed(regexs, m['post'],cookieJar,recursiveCall=True,cachedPages=cachedPages)
#                    print 'post is now',m['post']

                if  'rawpost' in m and '$doregex' in m['rawpost']:
                    m['rawpost']=getRegexParsed(regexs, m['rawpost'],cookieJar,recursiveCall=True,cachedPages=cachedPages,rawPost=True)
                    #print 'rawpost is now',m['rawpost']

                if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                    m['rawpost']=m['rawpost'].replace('$epoctime$',getEpocTime())

                if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                    m['rawpost']=m['rawpost'].replace('$epoctime2$',getEpocTime2())


                link=''
                if m['page'] and m['page'] in cachedPages and not 'ignorecache' in m and forCookieJarOnly==False :
                    #print 'using cache page',m['page']
                    link = cachedPages[m['page']]
                else:
                    if m['page'] and  not m['page']=='' and  m['page'].startswith('http'):
                        if '$epoctime$' in m['page']:
                            m['page']=m['page'].replace('$epoctime$',getEpocTime())
                        if '$epoctime2$' in m['page']:
                            m['page']=m['page'].replace('$epoctime2$',getEpocTime2())

                        #print 'Ingoring Cache',m['page']
                        page_split=m['page'].split('|')
                        pageUrl=page_split[0]
                        header_in_page=None
                        if len(page_split)>1:
                            header_in_page=page_split[1]

                        current_proxies=urllib2.ProxyHandler(urllib2.getproxies())
        
        
                        #print 'getting pageUrl',pageUrl
                        req = urllib2.Request(pageUrl)
                        if 'proxy' in m:
                            proxytouse= m['proxy']
#                            print 'proxytouse',proxytouse
#                            urllib2.getproxies= lambda: {}
                            if pageUrl[:5]=="https":
                                proxy = urllib2.ProxyHandler({ 'https' : proxytouse})
                                #req.set_proxy(proxytouse, 'https')
                            else:
                                proxy = urllib2.ProxyHandler({ 'http'  : proxytouse})
                                #req.set_proxy(proxytouse, 'http')
                            opener = urllib2.build_opener(proxy)
                            urllib2.install_opener(opener)
                            
                        
                        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1')
                        proxytouse=None

                        if 'referer' in m:
                            req.add_header('Referer', m['referer'])
                        if 'accept' in m:
                            req.add_header('Accept', m['accept'])
                        if 'agent' in m:
                            req.add_header('User-agent', m['agent'])
                        if 'x-req' in m:
                            req.add_header('X-Requested-With', m['x-req'])
                        if 'x-addr' in m:
                            req.add_header('x-addr', m['x-addr'])
                        if 'x-forward' in m:
                            req.add_header('X-Forwarded-For', m['x-forward'])
                        if 'setcookie' in m:
#                            print 'adding cookie',m['setcookie']
                            req.add_header('Cookie', m['setcookie'])
                        if 'appendcookie' in m:
#                            print 'appending cookie to cookiejar',m['appendcookie']
                            cookiestoApend=m['appendcookie']
                            cookiestoApend=cookiestoApend.split(';')
                            for h in cookiestoApend:
                                n,v=h.split('=')
                                w,n= n.split(':')
                                ck = cookielib.Cookie(version=0, name=n, value=v, port=None, port_specified=False, domain=w, domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)
                                cookieJar.set_cookie(ck)
                        if 'origin' in m:
                            req.add_header('Origin', m['origin'])
                        if header_in_page:
                            header_in_page=header_in_page.split('&')
                            for h in header_in_page:
                                n,v=h.split('=')
                                req.add_header(n,v)
                        
                        if not cookieJar==None:
#                            print 'cookieJarVal',cookieJar
                            cookie_handler = urllib2.HTTPCookieProcessor(cookieJar)
                            opener = urllib2.build_opener(cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
                            opener = urllib2.install_opener(opener)
#                            print 'noredirect','noredirect' in m
                            
                            if 'noredirect' in m:
                                opener = urllib2.build_opener(cookie_handler,NoRedirection, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
                                opener = urllib2.install_opener(opener)
                        elif 'noredirect' in m:
                            opener = urllib2.build_opener(NoRedirection, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler())
                            opener = urllib2.install_opener(opener)
                            

                        if 'connection' in m:
#                            print '..........................connection//////.',m['connection']
                            from keepalive import HTTPHandler
                            keepalive_handler = HTTPHandler()
                            opener = urllib2.build_opener(keepalive_handler)
                            urllib2.install_opener(opener)


                        #print 'after cookie jar'
                        post=None

                        if 'post' in m:
                            postData=m['post']
                            #if '$LiveStreamRecaptcha' in postData:
                            #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                            #    if captcha_challenge:
                            #        postData=postData.replace('$LiveStreamRecaptcha','manual_recaptcha_challenge_field:'+captcha_challenge+',recaptcha_response_field:'+catpcha_word+',id:'+idfield)
                            splitpost=postData.split(',');
                            post={}
                            for p in splitpost:
                                n=p.split(':')[0];
                                v=p.split(':')[1];
                                post[n]=v
                            post = urllib.urlencode(post)

                        if 'rawpost' in m:
                            post=m['rawpost']
                            #if '$LiveStreamRecaptcha' in post:
                            #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                            #    if captcha_challenge:
                            #       post=post.replace('$LiveStreamRecaptcha','&manual_recaptcha_challenge_field='+captcha_challenge+'&recaptcha_response_field='+catpcha_word+'&id='+idfield)
                        link=''
                        try:
                            
                            if post:
                                response = urllib2.urlopen(req,post)
                            else:
                                response = urllib2.urlopen(req)
                            if response.info().get('Content-Encoding') == 'gzip':
                                from StringIO import StringIO
                                import gzip
                                buf = StringIO( response.read())
                                f = gzip.GzipFile(fileobj=buf)
                                link = f.read()
                            else:
                                link=response.read()
                            
                        
                        
                            if 'proxy' in m and current_proxies is not None:
                                urllib2.install_opener(urllib2.build_opener(current_proxies))
                            
                            link=javascriptUnEscape(link)
                            #print repr(link)
                            #print link  # this would dump the whole webpage into the log
                            if 'includeheaders' in m:
                                #link+=str(response.headers.get('Set-Cookie'))
                                link+='$$HEADERS_START$$:'
                                for b in response.headers:
                                    link+= b+':'+response.headers.get(b)+'\n'
                                link+='$$HEADERS_END$$:'
    #                        print link

                            response.close()
                        except: 
                            pass
                        cachedPages[m['page']] = link
                        #print link
                        #print 'store link for',m['page'],forCookieJarOnly

                        if forCookieJarOnly:
                            return cookieJar# do nothing
                    elif m['page'] and  not m['page'].startswith('http'):
                        if m['page'].startswith('$pyFunction:'):
                            val=doEval(m['page'].split('$pyFunction:')[1],'',cookieJar,m )
                            if forCookieJarOnly:
                                return cookieJar# do nothing
                            link=val
                            link=javascriptUnEscape(link)
                        else:
                            link=m['page']

                if '$doregex' in m['expres']:
                    m['expres']=getRegexParsed(regexs, m['expres'],cookieJar,recursiveCall=True,cachedPages=cachedPages)
                  
                if m['expres'] != '':
                    #print 'doing it ',m['expres']
                    if '$LiveStreamCaptcha' in m['expres']:
                        val=askCaptcha(m,link,cookieJar)
                        #print 'url and val',url,val
                        url = url.replace("$doregex[" + k + "]", val)

                    elif m['expres'].startswith('$pyFunction:') or '#$pyFunction' in m['expres']:
                        #print 'expeeeeeeeeeeeeeeeeeee',m['expres']
                        val=''
                        if m['expres'].startswith('$pyFunction:'):
                            val=doEval(m['expres'].split('$pyFunction:')[1],link,cookieJar,m)
                        else:
                            val=doEvalFunction(m['expres'],link,cookieJar,m)
                        if 'ActivateWindow' in m['expres']: return
                        if forCookieJarOnly:
                            return cookieJar# do nothing
                        if 'listrepeat' in m:
                            listrepeat=m['listrepeat']
                            return listrepeat,eval(val), m,regexs,cookieJar

                        try:
                            url = url.replace(u"$doregex[" + k + "]", val)
                        except: url = url.replace("$doregex[" + k + "]", val.decode("utf-8"))
                    else:
                        if 'listrepeat' in m:
                            listrepeat=m['listrepeat']
                            ret=re.findall(m['expres'],link)
                            return listrepeat,ret, m,regexs
                             
                        val=''
                        if link != '':
                            #print 'link',link
                            reg = re.compile(m['expres']).search(link)
                            try:
                                val=reg.group(1).strip()
                            except: traceback.print_exc()
                        elif m['page'] == '' or m['page'] is None:
                            val=m['expres']
                            
                        if rawPost:
#                            print 'rawpost'
                            val=urllib.quote_plus(val)
                        if 'htmlunescape' in m:
                            #val=urllib.unquote_plus(val)
                            import HTMLParser
                            val=HTMLParser.HTMLParser().unescape(val)
                        try:
                            url = url.replace("$doregex[" + k + "]", val)
                        except: url = url.replace("$doregex[" + k + "]", val.decode("utf-8"))
                        #print 'ur',url
                        #return val
                else:
                    url = url.replace("$doregex[" + k + "]",'')
        if '$epoctime$' in url:
            url=url.replace('$epoctime$',getEpocTime())
        if '$epoctime2$' in url:
            url=url.replace('$epoctime2$',getEpocTime2())

        if '$GUID$' in url:
            import uuid
            url=url.replace('$GUID$',str(uuid.uuid1()).upper())
        if '$get_cookies$' in url:
            url=url.replace('$get_cookies$',getCookiesString(cookieJar))

        if recursiveCall: return url
        #print 'final url',repr(url)
        if url=="":
            return
        else:
            return url,setresolved
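
Stripped of the caching, captcha and cookie plumbing, the mechanism above boils down to this: the URL template carries $doregex[key] tokens, each key names a regex (and optionally a page to fetch), and the first captured group replaces the token. A minimal self-contained sketch of that idea follows; the function name and the regexs dict layout are illustrative, not the plugin's actual API.

import re
import urllib2

def resolve_doregex(url, regexs):
    # url: template such as 'http://host/play?id=$doregex[vid]'
    # regexs: {'vid': {'page': 'http://host/index.html', 'expres': 'data-vid="(.+?)"'}}
    for key, m in regexs.items():
        token = '$doregex[' + key + ']'
        if token not in url:
            continue
        # fetch the page whose body the regex should be applied to (if any)
        link = urllib2.urlopen(m['page']).read() if m.get('page') else ''
        match = re.search(m['expres'], link)
        val = match.group(1).strip() if match else ''
        url = url.replace(token, val)
    return url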
Example #15
def request(url,
            close=True,
            redirect=True,
            error=False,
            proxy=None,
            post=None,
            headers=None,
            mobile=False,
            XHR=False,
            limit=None,
            referer=None,
            cookie=None,
            compression=True,
            output='',
            timeout='30'):
    try:
        handlers = []

        if proxy is not None:
            handlers += [
                urllib2.ProxyHandler({'http': '%s' % (proxy)}),
                urllib2.HTTPHandler
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        if output == 'cookie' or output == 'extended' or close is not True:
            cookies = cookielib.LWPCookieJar()
            handlers += [
                urllib2.HTTPHandler(),
                urllib2.HTTPSHandler(),
                urllib2.HTTPCookieProcessor(cookies)
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        if (2, 7, 8) < sys.version_info < (2, 7, 12):
            try:
                import ssl
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                handlers += [urllib2.HTTPSHandler(context=ssl_context)]
                opener = urllib2.build_opener(*handlers)
                opener = urllib2.install_opener(opener)
            except:
                pass

        if url.startswith('//'): url = 'http:' + url

        if not isinstance(headers, dict):
            headers = {}
        if 'User-Agent' in headers:
            pass
        elif mobile is not True:
            #headers['User-Agent'] = agent()
            headers['User-Agent'] = cache.get(randomagent, 1)
        else:
            headers['User-Agent'] = 'Apple-iPhone/701.341'
        if 'Referer' in headers:
            pass
        elif referer is not None:
            headers['Referer'] = referer
        if 'Accept-Language' not in headers:
            headers['Accept-Language'] = 'en-US'
        if 'X-Requested-With' in headers:
            pass
        elif XHR is True:
            headers['X-Requested-With'] = 'XMLHttpRequest'
        if 'Cookie' in headers:
            pass
        elif cookie is not None:
            headers['Cookie'] = cookie
        if 'Accept-Encoding' in headers:
            pass
        elif compression and limit is None:
            headers['Accept-Encoding'] = 'gzip'

        if redirect is False:

            class NoRedirection(urllib2.HTTPErrorProcessor):
                def http_response(self, request, response):
                    return response

            opener = urllib2.build_opener(NoRedirection)
            opener = urllib2.install_opener(opener)

            try:
                del headers['Referer']
            except:
                pass

        if isinstance(post, dict):
            post = urllib.urlencode(post)

        request = urllib2.Request(url, data=post)
        _add_request_header(request, headers)

        try:
            response = urllib2.urlopen(request, timeout=int(timeout))
        except urllib2.HTTPError as response:

            if response.code == 503:
                cf_result = response.read(5242880)
                try:
                    encoding = response.info().getheader('Content-Encoding')
                except:
                    encoding = None
                if encoding == 'gzip':
                    cf_result = gzip.GzipFile(
                        fileobj=StringIO.StringIO(cf_result)).read()

                if 'cf-browser-verification' in cf_result:

                    netloc = '%s://%s' % (urlparse.urlparse(url).scheme,
                                          urlparse.urlparse(url).netloc)

                    ua = headers['User-Agent']

                    cf = cache.get(cfcookie().get, 168, netloc, ua, timeout)

                    headers['Cookie'] = cf

                    request = urllib2.Request(url, data=post)
                    _add_request_header(request, headers)

                    response = urllib2.urlopen(request, timeout=int(timeout))
                else:
                    log_utils.log(
                        'Request-Error (%s): %s' % (str(response.code), url),
                        log_utils.LOGDEBUG)
                    if error == False: return
            else:
                log_utils.log(
                    'Request-Error (%s): %s' % (str(response.code), url),
                    log_utils.LOGDEBUG)
                if error == False: return

        if output == 'cookie':
            try:
                result = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                result = cf
            except:
                pass
            if close == True: response.close()
            return result

        elif output == 'geturl':
            result = response.geturl()
            if close == True: response.close()
            return result

        elif output == 'headers':
            result = response.headers
            if close == True: response.close()
            return result

        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)
            if content < (2048 * 1024): return
            result = response.read(16 * 1024)
            if close == True: response.close()
            return result

        if limit == '0':
            result = response.read(224 * 1024)
        elif limit is not None:
            result = response.read(int(limit) * 1024)
        else:
            result = response.read(5242880)

        try:
            encoding = response.info().getheader('Content-Encoding')
        except:
            encoding = None
        if encoding == 'gzip':
            result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()

        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)

            headers['Cookie'] = su

            request = urllib2.Request(url, data=post)
            _add_request_header(request, headers)

            response = urllib2.urlopen(request, timeout=int(timeout))

            if limit == '0':
                result = response.read(224 * 1024)
            elif limit is not None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

            try:
                encoding = response.info().getheader('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(
                    fileobj=StringIO.StringIO(result)).read()

        if 'Blazingfast.io' in result and 'xhr.open' in result:
            netloc = '%s://%s' % (urlparse.urlparse(url).scheme,
                                  urlparse.urlparse(url).netloc)
            ua = headers['User-Agent']
            headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua,
                                          timeout)

            result = _basic_request(url,
                                    headers=headers,
                                    post=post,
                                    timeout=timeout,
                                    limit=limit)

        if output == 'extended':
            try:
                response_headers = dict([(item[0].title(), item[1])
                                         for item in response.info().items()])
            except:
                response_headers = response.headers
            response_code = str(response.code)
            try:
                cookie = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                cookie = cf
            except:
                pass
            if close == True: response.close()
            return (result, response_code, response_headers, headers, cookie)
        else:
            if close == True: response.close()
            return result
    except Exception as e:
        log_utils.log('Request-Error: (%s) => %s' % (str(e), url),
                      log_utils.LOGDEBUG)
        return
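
A few hypothetical call shapes for the request() helper above (the URL is illustrative; cache, randomagent, cfcookie, bfcookie and log_utils are assumed to come from the surrounding module):

html = request('http://example.com/page')                    # body, up to 5 MB
ck = request('http://example.com/page', output='cookie')     # serialized cookie string
body, code, resp_headers, req_headers, ck = request(
    'http://example.com/page', output='extended')            # full round-trip details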
Example #16
    def _download_file(self,
                       url,
                       save_path,
                       file_name=None,
                       param_dict=None,
                       proxy_url=None,
                       proxy_port=None):
        """ downloads a file """
        try:
            #if url.find("http://") > -1:
            #    url = url.replace("http://", "https://")
            if proxy_url is not None:
                if proxy_port is None:
                    proxy_port = 80
                proxies = {
                    "http": "http://%s:%s" % (proxy_url, proxy_port),
                    "https": "https://%s:%s" % (proxy_url, proxy_port)
                }
                proxy_support = urllib2.ProxyHandler(proxies)
                opener = urllib2.build_opener(
                    proxy_support, urllib2.HTTPHandler(debuglevel=0),
                    AGOLRedirectHandler())
                urllib2.install_opener(opener)
            else:
                opener = urllib2.build_opener(
                    urllib2.HTTPHandler(debuglevel=0), AGOLRedirectHandler())
                urllib2.install_opener(opener)

            if param_dict is not None:
                encoded_args = urllib.urlencode(param_dict)
                url = url + '/?' + encoded_args
            file_data = urllib2.urlopen(url)
            file_data.getcode()
            file_data.geturl()
            if file_name is None:
                url = file_data.geturl()
                a = file_data.info().getheader('Content-Disposition')
                if a is not None:
                    a = a.strip()
                    file_name = re.findall(r'filename=\"(.+?)\"', a)[0]
                else:
                    file_name = os.path.basename(
                        file_data.geturl().split('?')[0])
            if hasattr(file_data, "status") and \
               (int(file_data.status) >= 300 and int(file_data.status) < 400):
                self._download_file(url=file_data.geturl(),
                                    save_path=save_path,
                                    file_name=file_name,
                                    proxy_url=self._proxy_url,
                                    proxy_port=self._proxy_port)
                return save_path + os.sep + file_name
            if (file_data.info().getheader('Content-Length')):
                total_size = int(
                    file_data.info().getheader('Content-Length').strip())
                downloaded = 0
                CHUNK = 4096
                with open(save_path + os.sep + file_name, 'wb') as out_file:
                    while True:
                        chunk = file_data.read(CHUNK)
                        downloaded += len(chunk)
                        if not chunk: break
                        out_file.write(chunk)
            elif file_data.headers.maintype == 'image':
                with open(save_path + os.sep + file_name, 'wb') as out_file:
                    buf = file_data.read()
                    out_file.write(buf)
            return save_path + os.sep + file_name
        except urllib2.HTTPError, e:
            print "HTTP Error:", e.code, url
            return False
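
The heart of _download_file is the fixed-size chunk loop, which streams the response to disk so large files never need to fit in memory. A minimal standalone version of that loop, assuming a plain URL and a local path (names illustrative):

import urllib2

def download(url, path, chunk=4096):
    # stream the body to disk in fixed-size blocks instead of reading it all at once
    src = urllib2.urlopen(url)
    with open(path, 'wb') as out_file:
        while True:
            block = src.read(chunk)
            if not block:
                break
            out_file.write(block)
    src.close()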
Example #17
    def get_cookie(self, netloc, ua, timeout):
        try:
            headers = {'User-Agent': ua}

            request = urllib2.Request(netloc)
            _add_request_header(request, headers)

            try:
                response = urllib2.urlopen(request, timeout=int(timeout))
            except urllib2.HTTPError as response:
                result = response.read(5242880)
                try:
                    encoding = response.info().getheader('Content-Encoding')
                except:
                    encoding = None
                if encoding == 'gzip':
                    result = gzip.GzipFile(
                        fileobj=StringIO.StringIO(result)).read()

            jschl = re.findall('name="jschl_vc" value="(.+?)"/>', result)[0]

            init = re.findall(r'setTimeout\(function\(\){\s*.*?.*:(.*?)};',
                              result)[-1]

            builder = re.findall(r"challenge-form\'\);\s*(.*)a.v", result)[0]

            decryptVal = self.parseJSString(init)

            lines = builder.split(';')

            for line in lines:

                if len(line) > 0 and '=' in line:

                    sections = line.split('=')
                    line_val = self.parseJSString(sections[1])
                    decryptVal = int(
                        eval(
                            str(decryptVal) + sections[0][-1] + str(line_val)))

            answer = decryptVal + len(urlparse.urlparse(netloc).netloc)

            query = '%s/cdn-cgi/l/chk_jschl?jschl_vc=%s&jschl_answer=%s' % (
                netloc, jschl, answer)

            if 'type="hidden" name="pass"' in result:
                passval = re.findall('name="pass" value="(.*?)"', result)[0]
                query = '%s/cdn-cgi/l/chk_jschl?pass=%s&jschl_vc=%s&jschl_answer=%s' % (
                    netloc, urllib.quote_plus(passval), jschl, answer)
                time.sleep(6)

            cookies = cookielib.LWPCookieJar()
            handlers = [
                urllib2.HTTPHandler(),
                urllib2.HTTPSHandler(),
                urllib2.HTTPCookieProcessor(cookies)
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

            try:
                request = urllib2.Request(query)
                _add_request_header(request, headers)
                response = urllib2.urlopen(request, timeout=int(timeout))
            except:
                pass

            cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])

            if 'cf_clearance' in cookie: self.cookie = cookie
        except:
            pass
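
get_cookie leans on self.parseJSString, which is not shown in this example. In this family of Cloudflare-challenge helpers such a method typically rewrites the obfuscated JS arithmetic ('!+[]', '!![]', '[]') into Python and evaluates it; a sketch under that assumption:

    def parseJSString(self, s):
        # '+((!+[]+!![])...)' style expressions become plain integer arithmetic
        offset = 1 if s[0] == '+' else 0
        return int(eval(s.replace('!+[]', '1').replace('!![]', '1')
                         .replace('[]', '0').replace('(', 'str(')[offset:]))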
Example #18
def request(url,
            close=True,
            redirect=True,
            error=False,
            proxy=None,
            post=None,
            headers=None,
            mobile=False,
            XHR=False,
            limit=None,
            referer=None,
            cookie=None,
            compression=True,
            output='',
            timeout='30',
            ignoreSsl=False,
            flare=True,
            ignoreErrors=None):
    try:
        if url is None:
            return None

        handlers = []

        if proxy is not None:
            handlers += [
                urllib2.ProxyHandler({'http': '%s' % (proxy)}),
                urllib2.HTTPHandler
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        if output == 'cookie' or output == 'extended' or not close is True:
            cookies = cookielib.LWPCookieJar()
            handlers += [
                urllib2.HTTPHandler(),
                urllib2.HTTPSHandler(),
                urllib2.HTTPCookieProcessor(cookies)
            ]
            opener = urllib2.build_opener(*handlers)
            opener = urllib2.install_opener(opener)

        if ignoreSsl or ((2, 7, 8) < sys.version_info < (2, 7, 12)):
            try:
                import ssl
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                handlers += [urllib2.HTTPSHandler(context=ssl_context)]
                opener = urllib2.build_opener(*handlers)
                opener = urllib2.install_opener(opener)
            except:
                pass

        if url.startswith('//'):
            url = 'http:' + url

        if not isinstance(headers, dict):
            headers = {}

        if 'User-Agent' in headers:
            pass
        elif mobile is not True:
            # headers['User-Agent'] = agent()
            headers['User-Agent'] = cache.get(randomagent, 1)
        else:
            headers['User-Agent'] = 'Apple-iPhone/701.341'

        if 'Referer' in headers:
            pass
        elif referer is not None:
            headers['Referer'] = referer

        if 'Accept-Language' not in headers:
            headers['Accept-Language'] = 'en-US'

        if 'X-Requested-With' in headers:
            pass
        elif XHR is True:
            headers['X-Requested-With'] = 'XMLHttpRequest'

        if 'Cookie' in headers:
            pass
        elif cookie is not None:
            headers['Cookie'] = cookie

        if 'Accept-Encoding' in headers:
            pass
        elif compression and limit is None:
            headers['Accept-Encoding'] = 'gzip'

        if redirect is False:

            class NoRedirection(urllib2.HTTPErrorProcessor):
                def http_response(self, request, response):
                    return response

            opener = urllib2.build_opener(NoRedirection)
            opener = urllib2.install_opener(opener)

            try:
                del headers['Referer']
            except:
                pass

        if isinstance(post, dict):
            # Gets rid of the error: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
            for key, value in post.iteritems():
                try:
                    post[key] = value.encode('utf-8')
                except:
                    pass

            post = urlencode(post)

        request = urllib2.Request(url, data=post)
        _add_request_header(request, headers)

        try:
            response = urllib2.urlopen(request, timeout=int(timeout))
        except urllib2.HTTPError as response:
            try:
                ignore = ignoreErrors and (int(response.code) == ignoreErrors or
                                           int(response.code) in ignoreErrors)
            except:
                ignore = False

            if not ignore:
                if response.code in [301, 307, 308, 503]:
                    cf_result = response.read(5242880)

                    try:
                        encoding = response.info().getheader(
                            'Content-Encoding')
                    except:
                        encoding = None

                    if encoding == 'gzip':
                        cf_result = gzip.GzipFile(
                            fileobj=StringIO(cf_result)).read()

                    if flare and 'cloudflare' in str(response.info()).lower():
                        try:
                            from openscrapers.modules import cfscrape
                            if isinstance(post, dict):
                                data = post
                            else:
                                try:
                                    data = parse_qs(post)
                                except:
                                    data = None

                            scraper = cfscrape.CloudScraper()
                            response = scraper.request(
                                method='GET' if post is None else 'POST',
                                url=url,
                                headers=headers,
                                data=data,
                                timeout=int(timeout))
                            result = response.content
                            flare = 'cloudflare'  # Used below
                            try:
                                cookies = response.request._cookies
                            except:
                                log_utils.error()
                        except:
                            log_utils.error()

                    elif 'cf-browser-verification' in cf_result:
                        netloc = '%s://%s' % (urlparse(url).scheme,
                                              urlparse(url).netloc)
                        ua = headers['User-Agent']
                        cf = cache.get(cfcookie().get, 168, netloc, ua,
                                       timeout)
                        headers['Cookie'] = cf
                        request = urllib2.Request(url, data=post)
                        _add_request_header(request, headers)
                        response = urllib2.urlopen(request,
                                                   timeout=int(timeout))
                    else:
                        log_utils.log(
                            'Request-Error (%s): %s' %
                            (str(response.code), url), log_utils.LOGDEBUG)
                        if error is False:
                            return None
                else:
                    log_utils.log(
                        'Request-Error (%s): %s' % (str(response.code), url),
                        log_utils.LOGDEBUG)
                    if error is False:
                        return None

        if output == 'cookie':
            try:
                result = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                result = cf
            except:
                pass

            if close is True:
                response.close()
            return result

        elif output == 'geturl':
            result = response.geturl()
            if close is True:
                response.close()
            return result

        elif output == 'headers':
            result = response.headers
            if close is True:
                response.close()
            return result

        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)
            if content < (2048 * 1024):
                return
            result = response.read(16 * 1024)
            if close is True:
                response.close()
            return result

        if flare != 'cloudflare':
            if limit == '0':
                result = response.read(224 * 1024)
            elif limit is not None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

        try:
            encoding = response.info().getheader('Content-Encoding')
        except:
            encoding = None

        if encoding == 'gzip':
            result = gzip.GzipFile(fileobj=StringIO(result)).read()

        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)

            headers['Cookie'] = su

            request = urllib2.Request(url, data=post)
            _add_request_header(request, headers)

            response = urllib2.urlopen(request, timeout=int(timeout))

            if limit == '0':
                result = response.read(224 * 1024)
            elif limit is not None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

            try:
                encoding = response.info().getheader('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO(result)).read()

        if 'Blazingfast.io' in result and 'xhr.open' in result:
            netloc = '%s://%s' % (urlparse(url).scheme, urlparse(url).netloc)
            ua = headers['User-Agent']
            headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua,
                                          timeout)
            result = _basic_request(url,
                                    headers=headers,
                                    post=post,
                                    timeout=timeout,
                                    limit=limit)

        if output == 'extended':
            try:
                response_headers = dict([(item[0].title(), item[1])
                                         for item in response.info().items()])
            except:
                response_headers = response.headers

            try:
                response_code = str(response.code)
            except:
                # response is a cfscrape/requests Response object here
                response_code = str(response.status_code)

            try:
                cookie = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass

            try:
                cookie = cf
            except:
                pass

            if close is True:
                response.close()
            return (result, response_code, response_headers, headers, cookie)
        else:
            if close is True:
                response.close()
            return result

    except Exception as e:
        # log_utils.error()
        log_utils.log('Request-Error: (%s) => %s' % (str(e), url),
                      log_utils.LOGDEBUG)
        return None
Example #19
#coding=utf-8
'''
The basic urlopen() method does not support proxies, cookies, or other advanced HTTP/HTTPS features. To enable them:
use the appropriate Handler class to create a handler object for the feature you need;
pass those handler objects to urllib2.build_opener() to create a custom opener object;
call the custom opener's open() method to send the request.
If every request in the program should use the custom opener, urllib2.install_opener() registers it as the global opener: any later call to urlopen() will then go through this opener (choose according to your needs).

'''
import urllib2

# Build an HTTPHandler object that handles HTTP requests
http_handler = urllib2.HTTPHandler()

# Build an HTTPSHandler object that handles HTTPS requests
# http_handler = urllib2.HTTPSHandler()

# The debuglevel=1 argument turns the debug log on:
# while running, the headers of the packets sent and received are automatically printed to the screen
# http_handler = urllib2.HTTPSHandler(debuglevel=1)

# Call urllib2.build_opener() to create an opener object that can handle HTTP requests
opener = urllib2.build_opener(http_handler)

# Build the Request
request = urllib2.Request("http://www.baidu.com/")

# Call the custom opener's open() method to send the request
response = opener.open(request)

# Read the server's response content
print response.read()
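
The docstring above also mentions urllib2.install_opener() for registering the opener globally; a minimal sketch of that variant, reusing the handler built earlier:

# Register the opener globally: every later urlopen() call goes through it
urllib2.install_opener(opener)
response = urllib2.urlopen("http://www.baidu.com/")
print response.read()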
Example #20
def request(url,
            close=True,
            redirect=True,
            error=False,
            verify=True,
            proxy=None,
            post=None,
            headers=None,
            mobile=False,
            XHR=False,
            limit=None,
            referer=None,
            cookie=None,
            compression=True,
            output='',
            timeout='15'):
    try:
        if not url:
            return

        if not headers:
            if not mobile:
                headers = {'User-Agent': randomagent()}
            else:
                headers = {'User-Agent': randommobileagent()}

        handlers = []

        if proxy is not None:
            handlers += [
                urllib2.ProxyHandler({'http': '%s' % (proxy)}),
                urllib2.HTTPHandler
            ]
            opener = urllib2.build_opener(*handlers)
            urllib2.install_opener(opener)

        if output == 'cookie' or output == 'extended' or close is not True:
            cookies = cookielib.LWPCookieJar()
            handlers += [
                urllib2.HTTPHandler(),
                urllib2.HTTPSHandler(),
                urllib2.HTTPCookieProcessor(cookies)
            ]
            opener = urllib2.build_opener(*handlers)
            urllib2.install_opener(opener)

        try:
            import platform
            is_XBOX = platform.uname()[1] == 'XboxOne'
        except Exception:
            is_XBOX = False

        if verify is False and sys.version_info >= (2, 7, 12):
            try:
                import ssl
                ssl_context = ssl._create_unverified_context()
                handlers += [urllib2.HTTPSHandler(context=ssl_context)]
                opener = urllib2.build_opener(*handlers)
                urllib2.install_opener(opener)
            except Exception:
                pass

        if verify is True and ((2, 7, 8) < sys.version_info <
                               (2, 7, 12) or is_XBOX):
            try:
                import ssl
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                handlers += [urllib2.HTTPSHandler(context=ssl_context)]
                opener = urllib2.build_opener(*handlers)
                urllib2.install_opener(opener)
            except Exception:
                pass

        if url.startswith('//'):
            url = 'http:' + url

        _headers = {}

        try:
            _headers.update(headers)
        except Exception:
            pass

        if 'User-Agent' in _headers:
            pass
        elif mobile is True:
            _headers['User-Agent'] = cache.get(randommobileagent, 1)
        else:
            _headers['User-Agent'] = cache.get(randomagent, 1)

        if 'Referer' in _headers:
            pass
        elif referer is not None:
            _headers['Referer'] = referer
        if 'Accept-Language' not in _headers:
            _headers['Accept-Language'] = 'en-US'
        if 'X-Requested-With' in _headers:
            pass
        elif XHR is True:
            _headers['X-Requested-With'] = 'XMLHttpRequest'
        if 'Cookie' in _headers:
            pass
        elif cookie is not None:
            _headers['Cookie'] = cookie
        if 'Accept-Encoding' in _headers:
            pass
        elif compression and limit is None:
            _headers['Accept-Encoding'] = 'gzip'

        if redirect is False:

            class NoRedirectHandler(urllib2.HTTPRedirectHandler):
                def http_error_302(self, req, fp, code, msg, headers):
                    infourl = urllib.addinfourl(fp, headers,
                                                req.get_full_url())
                    infourl.status = code
                    infourl.code = code
                    return infourl

                http_error_300 = http_error_302
                http_error_301 = http_error_302
                http_error_303 = http_error_302
                http_error_307 = http_error_302

            opener = urllib2.build_opener(NoRedirectHandler())
            urllib2.install_opener(opener)

            try:
                del _headers['Referer']
            except Exception:
                pass

        if isinstance(post, dict):
            post = utils.byteify(post)
            post = urllib.urlencode(post)

        url = utils.byteify(url)

        request = urllib2.Request(url, data=post)
        _add_request_header(request, _headers)

        try:
            response = urllib2.urlopen(request, timeout=int(timeout))
        except urllib2.HTTPError as response:

            if response.code == 503:
                cf_result = response.read()
                try:
                    encoding = response.info().getheader('Content-Encoding')
                except Exception:
                    encoding = None
                if encoding == 'gzip':
                    cf_result = gzip.GzipFile(
                        fileobj=StringIO.StringIO(cf_result)).read()

                if 'cf-browser-verification' in cf_result:
                    while 'cf-browser-verification' in cf_result:
                        netloc = '%s://%s/' % (urlparse.urlparse(url).scheme,
                                               urlparse.urlparse(url).netloc)
                        ua = _headers['User-Agent']
                        cf = cache.get(cfcookie().get, 1, netloc, ua, timeout)

                        _headers['Cookie'] = cf

                        request = urllib2.Request(url, data=post)
                        _add_request_header(request, _headers)

                        try:
                            response = urllib2.urlopen(request,
                                                       timeout=int(timeout))
                            cf_result = 'Success'
                        except urllib2.HTTPError as response:
                            cache.remove(cfcookie().get, netloc, ua, timeout)
                            cf_result = response.read()
                else:
                    log_utils.log(
                        'Request-Error (%s): %s' % (str(response.code), url),
                        log_utils.LOGDEBUG)
                    if error is False:
                        return
            else:
                log_utils.log(
                    'Request-Error (%s): %s' % (str(response.code), url),
                    log_utils.LOGDEBUG)
                if error is False:
                    return

        if output == 'cookie':
            try:
                result = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except Exception:
                pass
            try:
                result = cf
            except Exception:
                pass
            if close is True:
                response.close()
            return result
        elif output == 'geturl':
            result = response.geturl()
            if close is True:
                response.close()
            return result
        elif output == 'headers':
            result = response.headers
            if close is True:
                response.close()
            return result
        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except Exception:
                content = (2049 * 1024)
            if content < (2048 * 1024):
                return
            result = response.read(16 * 1024)
            if close is True:
                response.close()
            return result
        elif output == 'file_size':
            try:
                content = int(response.headers['Content-Length'])
            except Exception:
                content = '0'
            response.close()
            return content

        if limit == '0':
            result = response.read(224 * 1024)
        elif limit is not None:
            result = response.read(int(limit) * 1024)
        else:
            result = response.read(5242880)

        try:
            encoding = response.info().getheader('Content-Encoding')
        except Exception:
            encoding = None
        if encoding == 'gzip':
            result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()

        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)

            _headers['Cookie'] = su

            request = urllib2.Request(url, data=post)
            _add_request_header(request, _headers)

            response = urllib2.urlopen(request, timeout=int(timeout))

            if limit == '0':
                result = response.read(224 * 1024)
            elif limit is not None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

            try:
                encoding = response.info().getheader('Content-Encoding')
            except Exception:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(
                    fileobj=StringIO.StringIO(result)).read()

        if 'Blazingfast.io' in result and 'xhr.open' in result:
            netloc = '%s://%s' % (urlparse.urlparse(url).scheme,
                                  urlparse.urlparse(url).netloc)
            ua = _headers['User-Agent']
            _headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua,
                                           timeout)

            result = _basic_request(url,
                                    headers=_headers,
                                    post=post,
                                    timeout=timeout,
                                    limit=limit)

        if output == 'extended':
            try:
                response_headers = dict([(item[0].title(), item[1])
                                         for item in response.info().items()])
            except Exception:
                response_headers = response.headers
            response_code = str(response.code)
            try:
                cookie = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except Exception:
                pass
            try:
                cookie = cf
            except Exception:
                pass
            if close is True:
                response.close()
            return (result, response_code, response_headers, _headers, cookie)
        else:
            if close is True:
                response.close()
            return result
    except Exception as e:
        log_utils.log('Request-Error: (%s) => %s' % (str(e), url),
                      log_utils.LOGDEBUG)
        return
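
With redirect=False the NoRedirectHandler hands back the 3xx response itself, so the target of a redirect can be inspected without following it. A hypothetical call shape (URL illustrative):

location = request('http://example.com/short-link', redirect=False,
                   output='headers')['Location']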
Example #21
# -*- encoding:utf-8 -*-
'''
Created on 2012-10-21

@author: root
'''
"""
Send a data-stream to the stdin of a CGI and read back the data it returns to us
"""

import urllib2

# turn on debug mode
httpHandler = urllib2.HTTPHandler(debuglevel=1)
httpsHandler = urllib2.HTTPSHandler(debuglevel=1)
opener = urllib2.build_opener(httpHandler, httpsHandler)
urllib2.install_opener(opener)

req = urllib2.Request(url='http://10.1.1.10/cgi-bin/test.cgi',
                      data='this data is passed to stdin of the CGI')
f = urllib2.urlopen(req)

print f.read()
print f.info()

#CGI PROGRAM
#!/usr/bin/env python
#import sys
#data = sys.stdin.read()
#print 'Content-type: text/plain\n\nGot Data: "%s"' % data
Example #22
def read_site_html(url_link):
    global Domain
    import requests, cookielib
    #check redirect

    headers = {
        'Host': Domain,
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'he,he-IL;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
    }

    handlers = [MyHTTPHandler]
    cookjar = cookielib.CookieJar()
    handlers += [
        urllib2.HTTPHandler(),
        urllib2.HTTPSHandler(),
        urllib2.HTTPCookieProcessor(cookjar)
    ]
    opener = urllib2.build_opener(*handlers)
    request = urllib2.Request(url_link, headers=headers)
    try:
        html = opener.open(request).read()
    except:
        #xbmcgui.Dialog().ok("בא לי פרק","בעיה בחיבור לאתר\n החלף כתובת בהגדרות")

        r = requests.get(url_link)
        regex_domain = '//(.+?)/'
        match_domain = re.compile(regex_domain).findall(r.url)

        new_domain = match_domain[0]
        if new_domain != Domain:
            Addon.setSetting("domain", new_domain)
            Domain = Addon.getSetting("domain")
        xbmcgui.Dialog().ok(
            "סרט HD", " כתובת האתר הוחלפה ועודכנה פתח שנית " +
            '\n[COLOR aqua]' + Domain + '[/COLOR]')
        sys.exit()
    #html = urllib2.urlopen(request, timeout=int(30)).read()

    #html=requests.get(url_link,headers=headers)
    cookies = {}
    for item in cookjar:
        cookies[item.name] = item.value

    #a=a+1
    try:

        first = Crypt('protect_own' + Domain)
        second = Crypt('protect_up' + Domain)
        third = Crypt('js' + Domain)

        oc1 = str(cookies[first])
        oc2 = str(cookies[second])

        co3 = Crypt(oc1 + oc2) + Crypt(oc2 + oc1)

        cookies = {
            third: (co3),
            first: oc1,
            second: oc2,
            'expires': "14400",
            '__cfduid': cookies['__cfduid'],
            'path': '/'
        }

        html = requests.get(url_link, headers=headers, cookies=cookies).content
        request = urllib2.Request(url_link, headers=headers)
        #html=requests.get(url_link,headers=headers,cookies=cookies)
        #html = opener.open(request, timeout=int(30)).read()

        return html  #.encode('utf8')
    except:
        return html  #.encode('utf8')
Example #23
import datetime
import urllib2


def kelly(s, p, f):
    # reciprocal of the summed implied probabilities of the three odds
    print 1.0 / (1.0 / s + 1.0 / p + 1.0 / f)


kelly(2.84, 3.4, 2.2)
s = datetime.datetime.strptime("2011-1-1 3:00", '%Y-%m-%d %H:%M')
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print s
str1 = "888"
c = type(str1)

request = urllib2.Request("http://www.500.com")
opener = urllib2.build_opener(
    urllib2.HTTPHandler(debuglevel=1))  #为了开启回显,需要手动构造一个HTTPHandler
feeddata = opener.open(request).read()

loginUrl = "http://www.126.com"
request = urllib2.Request(loginUrl)
request.add_header(
    "User-Agent",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36"
)
request.add_header(
    "Accept",
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
)
request.add_header("Accept-Encoding", "gzip, deflate, sdch")
#request.add_header("X-Requested-With","XMLHttpRequest")
request.add_header("Accept-Language", "zh-CN,zh;q=0.8")
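
The snippet stops before actually sending the 126.com request; a hedged completion, reusing the debug opener built above and un-gzipping the body if the server honoured the Accept-Encoding header:

import gzip
import StringIO

response = opener.open(request)
body = response.read()
# the request advertised gzip support, so decompress when the server used it
if response.info().get('Content-Encoding') == 'gzip':
    body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
print len(body)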
Example #24
def getdata(url,
            params,
            headers=None,
            post=None,
            verbose=False,
            jsondecoder=True):
    """
    Invoke URL call and retrieve data from data-service based
    on provided URL and set of parameters. Use post=True to
    invoke POST request.
    """
    encoded_data = urllib.urlencode(params)
    if not post:
        if encoded_data:
            url = url + '?' + encoded_data
    if not headers:
        headers = {}
    if verbose:
        print '+++ getdata, url=%s, headers=%s' % (url, headers)
    obj = sys.version_info
    if obj[0] == 2 and obj[1] == 7 and obj[2] >= 9:
        # disable SSL verification, since it is default in python 2.7.9
        # and many CMS services do not verify SSL cert.
        # https://www.python.org/dev/peps/pep-0476/
        import ssl
        ssl._create_default_https_context = ssl._create_unverified_context
    req = urllib2.Request(url)
    for key, val in headers.iteritems():
        req.add_header(key, val)
    if verbose > 1:
        handler = urllib2.HTTPHandler(debuglevel=1)
        opener = urllib2.build_opener(handler)
        urllib2.install_opener(opener)
    ckey, cert = get_key_cert()
    handler = HTTPSClientAuthHandler(ckey, cert, verbose)
    if verbose:
        print "handler", handler, handler.__dict__
    opener = urllib2.build_opener(handler)
    urllib2.install_opener(opener)
    try:
        if post:
            data = urllib2.urlopen(req, encoded_data)
        else:
            data = urllib2.urlopen(req)
        info = data.info()
        code = data.getcode()
        if verbose > 1:
            print "+++ response code:", code
            print "+++ response info\n", info
        if jsondecoder:
            data = json.load(data)
        else:
            data = data.read()
    except urllib2.HTTPError as httperror:
        msg  = 'HTTPError, url=%s, args=%s, headers=%s' \
                    % (url, params, headers)
        data = {'error': 'Unable to contact %s' % url, 'reason': msg}
        try:
            data.update({'httperror': extract_http_error(httperror.read())})
        except Exception as exp:
            data.update({'httperror': None})
        data = json.dumps(data)
    except Exception as exp:
        msg  = 'HTTPError, url=%s, args=%s, headers=%s, error=%s' \
                    % (url, params, headers, str(exp))
        data = {'error': 'Unable to contact %s' % url, 'reason': msg}
        data = json.dumps(data)
    return data
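
A hypothetical invocation of getdata (URL and parameters illustrative; get_key_cert and HTTPSClientAuthHandler are assumed to come from the surrounding module):

data = getdata('https://cmsweb.cern.ch/dbs/prod/global/DBSReader/datasets',
               {'dataset': '/ZMM*/*/*'}, verbose=True)
print data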
Example #25
import csv
import getpass
import re
import sys
import urllib
import urllib2

if (len(sys.argv) < 2):
    print 'Usage: %s <word-file.csv>' % sys.argv[0]
    exit()

csvfile = open(sys.argv[1], 'rb')
csvobject = csv.reader(csvfile, delimiter='\t', quotechar='"')

user = raw_input('User: ')
password = getpass.getpass('Password: ')

page = urllib2.urlopen('http://www.forvo.com/login/',
                       urllib.urlencode({
                           'login': user,
                           'password': password
                       }))

if (page.read().find('Login incorrect.') != -1):
    print "Error logging in"
    exit()
Example #26
def get_data(host,
             query,
             idx,
             limit,
             debug,
             threshold=300,
             ckey=None,
             cert=None,
             das_headers=True):
    """Contact DAS server and retrieve data for given DAS query"""
    params = {'input': query, 'idx': idx, 'limit': limit}
    path = '/das/cache'
    pat = re.compile('http[s]{0,1}://')
    if not pat.match(host):
        msg = 'Invalid hostname: %s' % host
        raise Exception(msg)
    url = host + path
    headers = {"Accept": "application/json"}
    encoded_data = urllib.urlencode(params, doseq=True)
    url += '?%s' % encoded_data
    req = urllib2.Request(url=url, headers=headers)
    if ckey and cert:
        ckey = fullpath(ckey)
        cert = fullpath(cert)
        hdlr = HTTPSClientAuthHandler(ckey, cert, debug)
    else:
        hdlr = urllib2.HTTPHandler(debuglevel=debug)
    proxy_support = urllib2.ProxyHandler({})
    opener = urllib2.build_opener(hdlr, proxy_support)
    fdesc = opener.open(req)
    data = fdesc.read()
    fdesc.close()

    pat = re.compile(r'^[a-z0-9]{32}')
    if data and isinstance(data, str) and pat.match(data) and len(data) == 32:
        pid = data
    elif data.find('"pid"') != -1 and data.find('"status"') != -1:
        pid = json.loads(data)['pid']
    else:
        pid = None
    iwtime = 2  # initial waiting time in seconds
    wtime = 20  # final waiting time in seconds
    sleep = iwtime
    time0 = time.time()
    while pid:
        params.update({'pid': data})
        encoded_data = urllib.urlencode(params, doseq=True)
        url = host + path + '?%s' % encoded_data
        req = urllib2.Request(url=url, headers=headers)
        try:
            fdesc = opener.open(req)
            data = fdesc.read()
            fdesc.close()
        except urllib2.HTTPError as err:
            return {"status": "fail", "reason": str(err)}
        if data and isinstance(data, str) and pat.match(data) and len(data) == 32:
            pid = data
        else:
            pid = None
        time.sleep(sleep)
        if sleep < wtime:
            sleep *= 2
        elif sleep == wtime:
            sleep = iwtime  # start new cycle
        else:
            sleep = wtime
        if (time.time() - time0) > threshold:
            reason = "client timeout after %s sec" % int(time.time() - time0)
            return {"status": "fail", "reason": reason}
    jsondict = json.loads(data)
    return jsondict
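
A hypothetical call shape for get_data against a DAS server (host and query illustrative; fullpath and HTTPSClientAuthHandler are assumed to come from the surrounding module):

result = get_data('https://cmsweb.cern.ch', 'dataset=/ZMM*/*/*', 0, 10, 0)
print result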
Example #27
def downloadfile(url,nombrefichero,headers=None,silent=False,continuar=False):
    logger.info("[downloadtools.py] downloadfile: url="+url)
    logger.info("[downloadtools.py] downloadfile: nombrefichero="+nombrefichero)

    # Avoid a shared mutable default argument: headers would persist across calls
    if headers is None:
        headers = []

    try:
        # If not running under XBMC, always go "silent"
        try:
            import xbmcgui
        except:
            silent=True
        
        # previously:
        #f=open(nombrefichero,"wb")
        try:
            import xbmc
            nombrefichero = xbmc.makeLegalFilename(nombrefichero)
        except:
            pass
        logger.info("[downloadtools.py] downloadfile: nombrefichero="+nombrefichero)
    
        # The file exists and the download should resume
        if os.path.exists(nombrefichero) and continuar:
            #try:
            #    import xbmcvfs
            #    f = xbmcvfs.File(nombrefichero)
            #    existSize = f.size(nombrefichero)
            #except:
            f = open(nombrefichero, 'r+b')
            existSize = os.path.getsize(nombrefichero)
            
            logger.info("[downloadtools.py] downloadfile: el fichero existe, size=%d" % existSize)
            grabado = existSize
            f.seek(existSize)

        # the file already exists and resuming was not requested, so abort
        elif os.path.exists(nombrefichero) and not continuar:
            logger.info("[downloadtools.py] downloadfile: el fichero existe, no se descarga de nuevo")
            return

        # the file does not exist yet
        else:
            existSize = 0
            logger.info("[downloadtools.py] downloadfile: el fichero no existe")
            
            #try:
            #    import xbmcvfs
            #    f = xbmcvfs.File(nombrefichero,"w")
            #except:
            f = open(nombrefichero, 'wb')
            grabado = 0
    
        # Create the progress dialog
        if not silent:            
            #progreso.create( "plugin" , "Descargando..." , os.path.basename(nombrefichero)+" desde "+urlparse.urlparse(url).hostname )

            try:
                progreso = xbmcgui.DialogProgressBG()
                progreso.create("Descargas tvalacarta", "Descargando "+os.path.basename(nombrefichero))
            except:
                progreso = xbmcgui.DialogProgress()
                progreso.create( "plugin" , "Descargando..." , url , os.path.basename(nombrefichero) )

        else:
            progreso = ""
            try:
                xbmc.executebuiltin((u'XBMC.Notification("Descargas tvalacarta", "'+os.path.basename(nombrefichero)+'", 2000)'))
            except:
                pass

        # Filenium login and password
        # http://abcd%40gmail.com:[email protected]/get/Oi8vd3d3/LmZpbGVz/ZXJ2ZS5j/b20vZmls/ZS9kTnBL/dm11/b0/?.zip
        if "filenium" in url:
            from servers import filenium
            url , authorization_header = filenium.extract_authorization_header(url)
            headers.append( [ "Authorization", authorization_header ] )
    
        if "|" in url:
            additional_headers = url.split("|")[1]
            if "&" in additional_headers:
                additional_headers = additional_headers.split("&")
            else:
                additional_headers = [ additional_headers ]
    
            for additional_header in additional_headers:
                logger.info("[downloadtools.py] additional_header: "+additional_header)
                name = re.findall( "(.*?)=.*?" , additional_header )[0]
                value = urllib.unquote_plus(re.findall( ".*?=(.*?)$" , additional_header )[0])
                headers.append( [ name,value ] )
    
            url = url.split("|")[0]
            logger.info("[downloadtools.py] downloadfile: url="+url)
    
        # Socket timeout set to 60 seconds
        socket.setdefaulttimeout(60)
    
        h=urllib2.HTTPHandler(debuglevel=0)
        request = urllib2.Request(url)
        for header in headers:
            logger.info("[downloadtools.py] Header="+header[0]+": "+header[1])
            request.add_header(header[0],header[1])
    
        if existSize > 0:
            request.add_header('Range', 'bytes=%d-' % (existSize, ))
    
        opener = urllib2.build_opener(h)
        urllib2.install_opener(opener)
        try:
            connexion = opener.open(request)
        except urllib2.HTTPError,e:
            logger.info("[downloadtools.py] downloadfile: error %d (%s) al abrir la url %s" % (e.code,e.msg,url))
            #print e.code
            #print e.msg
            #print e.hdrs
            #print e.fp
            f.close()
            if not silent:
                progreso.close()
            # Error 416 means the requested range starts beyond the end of the file => the download is already complete
            if e.code==416:
                return 0
            else:
                return -2
    
        try:
            totalfichero = int(connexion.headers["Content-Length"])
        except:
            # no Content-Length header: use 1 so the percent computation
            # below never divides by zero
            totalfichero = 1
                
        if existSize > 0:
            totalfichero = totalfichero + existSize
    
        logger.info("Content-Length=%s" % totalfichero)
    
        blocksize = 100*1024
    
        bloqueleido = connexion.read(blocksize)
        logger.info("Iniciando descarga del fichero, bloqueleido=%s" % len(bloqueleido))
    
        maxreintentos = 10
        
        while len(bloqueleido)>0:
            try:
                # Write the block just read
                #try:
                #    import xbmcvfs
                #    f.write( bloqueleido )
                #except:
                f.write(bloqueleido)
                grabado = grabado + len(bloqueleido)
                percent = int(float(grabado)*100/float(totalfichero))
                totalmb = float(float(totalfichero)/(1024*1024))
                descargadosmb = float(float(grabado)/(1024*1024))
    
                # Read the next block, retrying so a single timeout does not abort the whole download
                reintentos = 0
                while reintentos <= maxreintentos:
                    try:
                        before = time.time()
                        bloqueleido = connexion.read(blocksize)
                        after = time.time()
                        if (after - before) > 0:
                            velocidad=len(bloqueleido)/((after - before))
                            falta=totalfichero-grabado
                            if velocidad>0:
                                tiempofalta=falta/velocidad
                            else:
                                tiempofalta=0
                            #logger.info(sec_to_hms(tiempofalta))
                            if not silent:
                                #progreso.update( percent , "Descargando %.2fMB de %.2fMB (%d%%)" % ( descargadosmb , totalmb , percent),"Falta %s - Velocidad %.2f Kb/s" % ( sec_to_hms(tiempofalta) , velocidad/1024 ), os.path.basename(nombrefichero) )
                                progreso.update( percent , "%.2fMB/%.2fMB (%d%%) %.2f Kb/s %s falta " % ( descargadosmb , totalmb , percent , velocidad/1024 , sec_to_hms(tiempofalta)))
                        # check for an XBMC shutdown request before leaving the
                        # retry loop (in the original this check sat after the
                        # break and was unreachable)
                        try:
                            if xbmc.abortRequested:
                                logger.error( "XBMC Abort requested 1" )
                                return -1
                        except:
                            pass

                        break

                    except:
                        try:
                            if xbmc.abortRequested:
                                logger.error( "XBMC Abort requested 2" )
                                return -1
                        except:
                            pass

                        reintentos = reintentos + 1
                        logger.info("ERROR en la descarga del bloque, reintento %d" % reintentos)
                        import traceback
                        logger.error( traceback.format_exc() )
                
                # The user cancelled the download
                try:
                    if progreso.iscanceled():
                        logger.info("Descarga del fichero cancelada")
                        f.close()
                        progreso.close()
                        return -1
                except:
                    pass
    
                # There was an error in the download
                if reintentos > maxreintentos:
                    logger.info("ERROR en la descarga del fichero")
                    f.close()
                    if not silent:
                        progreso.close()
    
                    return -2
    
            except:
                import traceback
                logger.error( traceback.format_exc() )

                f.close()
                if not silent:
                    progreso.close()
                
                #advertencia = xbmcgui.Dialog()
                #resultado = advertencia.ok('Error al descargar' , 'Se ha producido un error' , 'al descargar el archivo')
                
                return -2
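
A minimal, self-contained sketch of the resume technique the function above relies on (illustrative only: resume_download is not part of downloadtools.py, and it assumes a server that honours Range requests):

import os
import urllib2

def resume_download(url, path, blocksize=100 * 1024):
    # ask the server for only the bytes we do not have yet
    offset = os.path.getsize(path) if os.path.exists(path) else 0
    req = urllib2.Request(url)
    if offset > 0:
        req.add_header('Range', 'bytes=%d-' % offset)
    try:
        conn = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        if e.code == 416:
            return  # requested range starts past EOF: already complete
        raise
    f = open(path, 'ab' if offset else 'wb')
    try:
        block = conn.read(blocksize)
        while block:
            f.write(block)
            block = conn.read(blocksize)
    finally:
        f.close()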
Example #28
def downloadpage(
    url,
    post=None,
    headers=[[
        'User-Agent',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; es-ES; rv:1.9.2.12) Gecko/20101026 Firefox/3.6.12'
    ]],
    follow_redirects=True,
    timeout=socket.getdefaulttimeout()):
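    # NOTE: both defaults are evaluated once at import time: the headers
    # list is shared between calls (pass a fresh list if you modify it) and
    # timeout is frozen to the default socket timeout at module load.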
    if (DEBUG == True): logger.info("[scrapertools.py] downloadpage")
    if (DEBUG == True): logger.info("[scrapertools.py] url=" + url)

    if post is not None:
        if (DEBUG == True): logger.info("[scrapertools.py] post=" + post)
    else:
        if (DEBUG == True): logger.info("[scrapertools.py] post=None")

    # ---------------------------------
    # Install the cookies
    # ---------------------------------

    # Initialize the cookie library
    ficherocookies = os.path.join(config.get_setting("cookies.dir"),
                                  'cookies.dat')
    if (DEBUG == True):
        logger.info("[scrapertools.py] ficherocookies=" + ficherocookies)

    cj = None
    ClientCookie = None
    cookielib = None

    # Let's see if cookielib is available
    try:
        if (DEBUG == True):
            logger.info("[scrapertools.py] Importando cookielib")
        import cookielib
    except ImportError:
        if (DEBUG == True):
            logger.info("[scrapertools.py] cookielib no disponible")
        # If importing cookielib fails
        # let's try ClientCookie
        try:
            if (DEBUG == True):
                logger.info("[scrapertools.py] Importando ClientCookie")
            import ClientCookie
        except ImportError:
            if (DEBUG == True):
                logger.info("[scrapertools.py] ClientCookie no disponible")
            # ClientCookie isn't available either
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            if (DEBUG == True):
                logger.info("[scrapertools.py] ClientCookie disponible")
            # imported ClientCookie
            urlopen = ClientCookie.urlopen
            Request = ClientCookie.Request
            cj = ClientCookie.MozillaCookieJar()

    else:
        if (DEBUG == True):
            logger.info("[scrapertools.py] cookielib disponible")
        # importing cookielib worked
        urlopen = urllib2.urlopen
        Request = urllib2.Request
        cj = cookielib.MozillaCookieJar()
        # This is a subclass of FileCookieJar
        # that has useful load and save methods

    if cj is not None:
        # we successfully imported
        # one of the two cookie handling modules
        if (DEBUG == True): logger.info("[scrapertools.py] Hay cookies")

        if os.path.isfile(ficherocookies):
            if (DEBUG == True):
                logger.info("[scrapertools.py] Leyendo fichero cookies")
            # if we have a cookie file already saved
            # then load the cookies into the Cookie Jar
            try:
                cj.load(ficherocookies)
            except:
                if (DEBUG == True):
                    logger.info(
                        "[scrapertools.py] El fichero de cookies existe pero es ilegible, se borra"
                    )
                os.remove(ficherocookies)

        # Now we need to get our Cookie Jar
        # installed in the opener;
        # for fetching URLs
        if cookielib is not None:
            if (DEBUG == True):
                logger.info(
                    "[scrapertools.py] opener usando urllib2 (cookielib)")
            # if we use cookielib
            # then we get the HTTPCookieProcessor
            # and install the opener in urllib2
            if not follow_redirects:
                opener = urllib2.build_opener(
                    urllib2.HTTPHandler(debuglevel=DEBUG_LEVEL),
                    urllib2.HTTPCookieProcessor(cj), NoRedirectHandler())
            else:
                opener = urllib2.build_opener(
                    urllib2.HTTPHandler(debuglevel=DEBUG_LEVEL),
                    urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)

        else:
            if (DEBUG == True):
                logger.info("[scrapertools.py] opener usando ClientCookie")
            # if we use ClientCookie
            # then we get the HTTPCookieProcessor
            # and install the opener in ClientCookie
            opener = ClientCookie.build_opener(
                ClientCookie.HTTPCookieProcessor(cj))
            ClientCookie.install_opener(opener)

    # -------------------------------------------------
    # Cookies installed, launch the request
    # -------------------------------------------------

    # Timer
    inicio = time.clock()

    # Dictionary for the headers
    txheaders = {}

    # Build the request
    if post is None:
        if (DEBUG == True): logger.info("[scrapertools.py] petición GET")
    else:
        if (DEBUG == True): logger.info("[scrapertools.py] petición POST")

    # Add the headers
    if (DEBUG == True):
        logger.info("[scrapertools.py] ---------------------------")
    for header in headers:
        if (DEBUG == True):
            logger.info("[scrapertools.py] header %s=%s" %
                        (str(header[0]), str(header[1])))
        txheaders[header[0]] = header[1]
    if (DEBUG == True):
        logger.info("[scrapertools.py] ---------------------------")

    req = Request(url, post, txheaders)
    if timeout is None:
        handle = urlopen(req)
    else:
        # Available from Python 2.6 onwards --> handle = urlopen(req, timeout=timeout)
        # For all versions:
        deftimeout = socket.getdefaulttimeout()
        try:
            socket.setdefaulttimeout(timeout)
            handle = urlopen(req)
        except:
            import sys
            for line in sys.exc_info():
                logger.error("%s" % line)
            # re-raise: otherwise handle would be undefined below
            raise
        finally:
            # always restore the process-wide timeout
            socket.setdefaulttimeout(deftimeout)

    # Update the cookie store (cj is None if no cookie library was available)
    if cj is not None:
        cj.save(ficherocookies)

    # Read the data and close
    data = handle.read()
    info = handle.info()
    if (DEBUG == True): logger.info("[scrapertools.py] Respuesta")
    if (DEBUG == True):
        logger.info("[scrapertools.py] ---------------------------")
    # mimetools.Message is not directly iterable in Python 2; use keys()
    for header in info.keys():
        if (DEBUG == True):
            logger.info("[scrapertools.py] " + header + "=" + info[header])
    handle.close()
    if (DEBUG == True):
        logger.info("[scrapertools.py] ---------------------------")
    '''
    # Launch the request
    try:
        response = urllib2.urlopen(req)
    # If it fails, retry after substituting special characters
    except:
        req = urllib2.Request(url.replace(" ","%20"))
    
        # Add the headers
        for header in headers:
            req.add_header(header[0],header[1])

        response = urllib2.urlopen(req)
    '''

    # Elapsed time
    fin = time.clock()
    if (DEBUG == True):
        logger.info("[scrapertools.py] Descargado en %d segundos " %
                    (fin - inicio + 1))

    return data
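
The save/restore dance around the global socket timeout above is a reusable pattern; here is a minimal sketch as a context manager (default_socket_timeout is an illustrative name, not part of scrapertools.py):

import socket
from contextlib import contextmanager

@contextmanager
def default_socket_timeout(seconds):
    # temporarily override the process-wide socket timeout and restore
    # the previous value even if the body raises
    previous = socket.getdefaulttimeout()
    socket.setdefaulttimeout(seconds)
    try:
        yield
    finally:
        socket.setdefaulttimeout(previous)

# usage:
#   with default_socket_timeout(30):
#       handle = urlopen(req)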
Example #29
import os
import sys
import cookielib
import urllib
import urllib2
import json
import time

cookies = cookielib.CookieJar()

opener = urllib2.build_opener(urllib2.HTTPRedirectHandler(),
                              urllib2.HTTPHandler(debuglevel=0),
                              urllib2.HTTPSHandler(debuglevel=0),
                              urllib2.HTTPCookieProcessor(cookies))
opener.addheaders = [('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                                     'Windows NT 5.2; .NET CLR 1.1.4322)'))]
site = 'https://stitcher.ncats.io/'
site = 'http://localhost:8080/'
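# NOTE: the second assignment wins, so all requests go to the local
# instance; comment it out to target the public stitcher.ncats.io server.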


def requestJson(uri):
    try:
        handle = opener.open(uri)
        response = handle.read()
        handle.close()
        obj = json.loads(response)
        return obj
    except:
        # on any failure, log the URI, back off briefly and return None
        sys.stderr.write("failed: " + uri + "\n")
        sys.stderr.flush()
        time.sleep(5)
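
A usage sketch for requestJson (the endpoint path below is hypothetical; substitute a real route from the Stitcher API):

if __name__ == '__main__':
    obj = requestJson(site + 'api/stitches/latest')  # hypothetical route
    if obj is not None:
        print json.dumps(obj, indent=2)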
Example #30
def request(url,
            close=True,
            error=False,
            proxy=None,
            post=None,
            headers=None,
            mobile=False,
            safe=False,
            referer=None,
            cookie=None,
            output='',
            timeout='30'):
    try:
        handlers = []
        if not proxy == None:
            handlers += [
                urllib2.ProxyHandler({'http': '%s' % (proxy)}),
                urllib2.HTTPHandler
            ]
            opener = urllib2.build_opener(*handlers)
            urllib2.install_opener(opener)
        if output == 'cookie' or output == 'extended' or not close == True:
            import cookielib
            cookies = cookielib.LWPCookieJar()
            handlers += [
                urllib2.HTTPHandler(),
                urllib2.HTTPSHandler(),
                urllib2.HTTPCookieProcessor(cookies)
            ]
            opener = urllib2.build_opener(*handlers)
            urllib2.install_opener(opener)
        try:
            if sys.version_info < (2, 7, 9): raise Exception()
            import ssl
            # deliberately disable certificate verification so hosts with
            # broken or self-signed certificates can still be scraped
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE
            handlers += [urllib2.HTTPSHandler(context=ssl_context)]
            opener = urllib2.build_opener(*handlers)
            urllib2.install_opener(opener)
        except:
            pass

        # normalise headers: any non-dict (including the default None) is
        # replaced with an empty dict, mirroring the old update() trick
        if not isinstance(headers, dict):
            headers = {}
        if 'User-Agent' in headers:
            pass
        elif not mobile == True:
            #headers['User-Agent'] = agent()
            headers['User-Agent'] = cache.get(randomagent, 1)
        else:
            headers['User-Agent'] = 'Apple-iPhone/701.341'
        if 'Referer' in headers:
            pass
        elif referer == None:
            headers['Referer'] = '%s://%s/' % (urlparse.urlparse(url).scheme,
                                               urlparse.urlparse(url).netloc)
        else:
            headers['Referer'] = referer
        if not 'Accept-Language' in headers:
            headers['Accept-Language'] = 'en-US'
        if 'Cookie' in headers:
            pass
        elif not cookie == None:
            headers['Cookie'] = cookie

        # build the request under a local name that does not shadow request()
        req = urllib2.Request(url, data=post, headers=headers)

        try:
            response = urllib2.urlopen(req, timeout=int(timeout))
        except urllib2.HTTPError as response:
            # an HTTPError is itself a file-like response; unless the caller
            # opted in with error=True, give up here
            if error == False: return

        if output == 'cookie':
            result = []
            for c in cookies:
                result.append('%s=%s' % (c.name, c.value))
            result = "; ".join(result)
        elif output == 'response':
            if safe == True:
                result = (str(response.code), response.read(224 * 1024))
            else:
                result = (str(response.code), response.read())
        elif output == 'chunk':
            # size probe: return a 16 KB sample only when the resource is at
            # least 2 MB (a missing Content-Length counts as big enough)
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)
            if content < (2048 * 1024): return
            result = response.read(16 * 1024)
        elif output == 'title':
            result = response.read(1 * 1024)
            result = parseDOM(result, 'title')[0]
        elif output == 'extended':
            cookie = []
            for c in cookies:
                cookie.append('%s=%s' % (c.name, c.value))
            cookie = "; ".join(cookie)
            content = response.headers
            result = response.read()
            return (result, headers, content, cookie)
        elif output == 'geturl':
            result = response.geturl()
        elif output == 'headers':
            content = response.headers
            return content
        else:
            if safe == True:
                result = response.read(224 * 1024)
            else:
                result = response.read()
        if close == True:
            response.close()

        return result
    except:
        return
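
A few usage sketches (assuming the module-level helpers this function references, such as cache, randomagent and parseDOM, are importable in the surrounding addon; the URL is illustrative):

# plain body fetch with the default headers
html = request('http://example.com/')

# response headers only
hdrs = request('http://example.com/', output='headers')

# body capped at 224 KB, tolerating HTTP errors
body = request('http://example.com/', safe=True, error=True)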