def extractReqTarget(full_link):
    # Pull the request target out of a Qunar link: the value of the "qrt="
    # parameter when present, 'qde' for html.ng pages, otherwise the host name.
    if "qunar" not in str(full_link):
        return None
    if "qrt=" in str(full_link):
        return full_link.partition('qrt=')[2]
    if "html.ng" in str(full_link):
        return 'qde'
    # 'ur' is presumably an alias for the legacy urllib module; splittype and
    # splithost are its deprecated URL-splitting helpers.
    proto, rest = ur.splittype(full_link)
    res, rest = ur.splithost(rest)
    return None if not res else res
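For context, a minimal Python 3 sketch of the same logic, assuming `ur` is the legacy urllib module: the deprecated splittype/splithost pair can be replaced with urllib.parse.urlsplit. The sample URLs and the "flight_list" value below are invented for illustration.

# Hedged Python 3 equivalent of extractReqTarget: urlsplit().netloc stands in
# for the deprecated splittype()/splithost() pair. Sample URLs are invented.
from urllib.parse import urlsplit

def extract_req_target(full_link):
    link = str(full_link)
    if "qunar" not in link:
        return None
    if "qrt=" in link:
        return link.partition("qrt=")[2]      # everything after the qrt= marker
    if "html.ng" in link:
        return "qde"
    host = urlsplit(link).netloc              # host part, like splithost()[0]
    return host or None

print(extract_req_target("http://www.qunar.com/some/page?qrt=flight_list"))  # flight_list
print(extract_req_target("http://touch.qunar.com/index.html"))               # touch.qunar.com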
Example No. 2
    def __init__(self, uri, basepath=None):
        self.basepath = basepath
        self.mimetype = None
        self.file = None
        self.data = None
        self.uri = None
        self.local = None
        self.tmp_file = None
        uri = uri or str()
        if not isinstance(uri, str):
            uri = uri.decode("utf-8")
        log.debug("FileObject %r, Basepath: %r", uri, basepath)

        # Data URI
        if uri.startswith("data:"):
            m = _rx_datauri.match(uri)
            self.mimetype = m.group("mime")
            b64 = urllib_unquote(m.group("data")).encode("utf-8")
            self.data = base64.b64decode(b64)

        else:
            # Check if we have an external scheme
            if basepath and not urlparse.urlparse(uri).scheme:
                urlParts = urlparse.urlparse(basepath)
            else:
                urlParts = urlparse.urlparse(uri)

            log.debug("URLParts: {}".format((urlParts, urlParts.scheme)))

            if urlParts.scheme == 'file':
                if basepath and uri.startswith('/'):
                    uri = urlparse.urljoin(basepath, uri[1:])
                urlResponse = urllib2.urlopen(uri)
                self.mimetype = urlResponse.info().get(
                    "Content-Type", '').split(";")[0]
                self.uri = urlResponse.geturl()
                self.file = urlResponse

            # Drive letters have len==1 but we are looking
            # for things like http:
            elif urlParts.scheme in ('http', 'https'):

                log.debug("Sending request for {} with httplib".format(uri))

                # External data
                if basepath:
                    uri = urlparse.urljoin(basepath, uri)

                log.debug("Uri parsed: {}".format(uri))

                #path = urlparse.urlsplit(url)[2]
                #mimetype = getMimeType(path)

                # Using HTTPLIB
                server, path = urllib2.splithost(uri[uri.find("//"):])
                if uri.startswith("https://"):
                    conn = httplib.HTTPSConnection(server,  **httpConfig)
                else:
                    conn = httplib.HTTPConnection(server)
                conn.request("GET", path)
                r1 = conn.getresponse()
                # log.debug("HTTP %r %r %r %r", server, path, uri, r1)
                if (r1.status, r1.reason) == (200, "OK"):
                    self.mimetype = r1.getheader(
                        "Content-Type", '').split(";")[0]
                    self.uri = uri
                    log.debug("here")
                    if r1.getheader("content-encoding") == "gzip":
                        import gzip

                        # gzip needs a bytes buffer, so wrap the body in BytesIO
                        self.file = gzip.GzipFile(
                            mode="rb", fileobj=six.BytesIO(r1.read()))
                    else:
                        self.file = pisaTempFile(r1.read())
                else:
                    log.debug(
                        "Received non-200 status: {}".format((r1.status, r1.reason)))
                    try:
                        urlResponse = urllib2.urlopen(uri)
                    except urllib2.HTTPError as e:
                        log.error("Could not process uri: {}".format(e))
                        return
                    self.mimetype = urlResponse.info().get(
                        "Content-Type", '').split(";")[0]
                    self.uri = urlResponse.geturl()
                    self.file = urlResponse

            else:

                log.debug("Unrecognized scheme, assuming local file path")

                # Local data
                if basepath:
                    if sys.platform == 'win32' and os.path.isfile(basepath):
                        basepath = os.path.dirname(basepath)
                    uri = os.path.normpath(os.path.join(basepath, uri))

                if os.path.isfile(uri):
                    self.uri = uri
                    self.local = uri

                    self.setMimeTypeByName(uri)
                    if self.mimetype and self.mimetype.startswith('text'):
                        self.file = open(uri, "r") #removed bytes... lets hope it goes ok :/
                    else:
                        # removed bytes... lets hope it goes ok :/
                        self.file = open(uri, "rb")
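The data-URI branch above relies on a module-level regex, _rx_datauri, that is not shown in this snippet. A minimal sketch of what such a pattern and the decoding step might look like, assuming RFC 2397-style URIs, follows; the regex is a plausible stand-in, not the library's actual one.

# Hedged data-URI decoding sketch (RFC 2397 style "data:<mime>;base64,<payload>").
# _rx_datauri here is a guess at the regex the snippet assumes.
import base64
import re
from urllib.parse import unquote

_rx_datauri = re.compile(
    r"^data:(?P<mime>[a-z]+/[a-z0-9.+-]+)?(;base64)?,(?P<data>.*)$",
    re.IGNORECASE | re.DOTALL)

def decode_data_uri(uri):
    m = _rx_datauri.match(uri)
    if not m:
        raise ValueError("not a data URI")
    mimetype = m.group("mime") or "text/plain"
    payload = unquote(m.group("data"))
    if ";base64" in uri.split(",", 1)[0]:
        return mimetype, base64.b64decode(payload)
    return mimetype, payload.encode("utf-8")

mime, data = decode_data_uri("data:image/png;base64,iVBORw0KGgo=")
print(mime, len(data))  # image/png 8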
Example No. 3
    def __init__(self, uri, basepath=None):
        self.basepath = basepath
        self.mimetype = None
        self.file = None
        self.data = None
        self.uri = None
        self.local = None
        self.tmp_file = None
        uri = uri or str()
        if not isinstance(uri, str):
            uri = uri.decode("utf-8")
        log.debug("FileObject %r, Basepath: %r", uri, basepath)

        # Data URI
        if uri.startswith("data:"):
            m = _rx_datauri.match(uri)
            self.mimetype = m.group("mime")
            self.data = base64.b64decode(m.group("data").encode("utf-8"))

        else:
            # Check if we have an external scheme
            if basepath and not urlparse.urlparse(uri).scheme:
                urlParts = urlparse.urlparse(basepath)
            else:
                urlParts = urlparse.urlparse(uri)

            log.debug("URLParts: {}".format((urlParts, urlParts.scheme)))

            if urlParts.scheme == 'file':
                if basepath and uri.startswith('/'):
                    uri = urlparse.urljoin(basepath, uri[1:])
                urlResponse = urllib2.urlopen(uri)
                self.mimetype = urlResponse.info().get("Content-Type",
                                                       '').split(";")[0]
                self.uri = urlResponse.geturl()
                self.file = urlResponse

            # Drive letters have len==1 but we are looking
            # for things like http:
            elif urlParts.scheme in ('http', 'https'):

                log.debug("Sending request for {} with httplib".format(uri))

                # External data
                if basepath:
                    uri = urlparse.urljoin(basepath, uri)

                log.debug("Uri parsed: {}".format(uri))

                #path = urlparse.urlsplit(url)[2]
                #mimetype = getMimeType(path)

                # Using HTTPLIB
                server, path = urllib2.splithost(uri[uri.find("//"):])
                if uri.startswith("https://"):
                    conn = httplib.HTTPSConnection(server)
                else:
                    conn = httplib.HTTPConnection(server)
                conn.request("GET", path)
                r1 = conn.getresponse()
                # log.debug("HTTP %r %r %r %r", server, path, uri, r1)
                if (r1.status, r1.reason) == (200, "OK"):
                    self.mimetype = r1.getheader("Content-Type",
                                                 '').split(";")[0]
                    self.uri = uri
                    log.debug("here")
                    if r1.getheader("content-encoding") == "gzip":
                        import gzip

                        # gzip needs a bytes buffer, so wrap the body in BytesIO
                        self.file = gzip.GzipFile(
                            mode="rb", fileobj=six.BytesIO(r1.read()))
                    else:
                        self.file = r1
                else:
                    log.debug("Received non-200 status: {}".format(
                        (r1.status, r1.reason)))
                    try:
                        urlResponse = urllib2.urlopen(uri)
                    except urllib2.HTTPError as e:
                        log.error("Could not process uri: {}".format(e))
                        return
                    self.mimetype = urlResponse.info().get("Content-Type",
                                                           '').split(";")[0]
                    self.uri = urlResponse.geturl()
                    self.file = urlResponse

            else:

                log.debug("Unrecognized scheme, assuming local file path")

                # Local data
                if basepath:
                    uri = os.path.normpath(os.path.join(basepath, uri))

                if os.path.isfile(uri):
                    self.uri = uri
                    self.local = uri

                    self.setMimeTypeByName(uri)
                    if self.mimetype and self.mimetype.startswith('text'):
                        self.file = open(
                            uri,
                            "r")  #removed bytes... lets hope it goes ok :/
                    else:
                        self.file = open(
                            uri,
                            "rb")  #removed bytes... lets hope it goes ok :/
Example No. 4
names = dir(urllib2)  # avoid shadowing the built-in name 'list'
for s in names:
    print(s)

url = 'http://www.bing.com/images/search?q=%d0%93%d1%80%d1%83%d0%b7%d0%b8%d1%8f&FORM=HDRSC2'
url = 'http://www.bing.com/images/search?q=Imanuel+Kant&view=detailv2&&&id=20C00C8B61AC086C2988CB7172395A3AD1B87A9C&selectedIndex=19&ccid=o1O3XNKY&simid=608053578826975455&thid=JN.INhbeB%2bbzfDQuY0MgDnrNA&ajaxhist=0'
s1 = urllib2.unquote(url)
print(s1)

print(url)

print('splitattr')
x = urllib2.splitattr(url)
print(x)
print('splithost')
x = urllib2.splithost(url)
print(x)
print('splitpasswd')
x = urllib2.splitpasswd(url)
print(x)
print('splitport')
x = urllib2.splitport(url)
print(x)
print('splittype')
x = urllib2.splittype(url)
print(x)
print('splituser')
x = urllib2.splituser(url)
print(x)
print('splitvalue')
x = urllib2.splitvalue(url)
print(x)
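The split* helpers exercised above are deprecated in Python 3's urllib.parse/urllib.request; roughly the same pieces can be read off a SplitResult, as in this sketch (it reuses the first Bing URL from the demo).

# Rough Python 3 equivalent of the demo above: the deprecated split* helpers
# map onto urllib.parse.urlsplit attributes.
from urllib.parse import urlsplit, unquote

url = 'http://www.bing.com/images/search?q=%d0%93%d1%80%d1%83%d0%b7%d0%b8%d1%8f&FORM=HDRSC2'

print(unquote(url))                     # urllib2.unquote
parts = urlsplit(url)
print(parts.scheme)                     # splittype -> 'http'
print(parts.netloc)                     # splithost -> 'www.bing.com'
print(parts.hostname, parts.port)       # splitport -> ('www.bing.com', None)
print(parts.username, parts.password)   # splituser / splitpasswd -> (None, None)
print(parts.path, parts.query)          # splitquery pieces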
Example No. 5
    def __init__(self, uri, basepath=None):

        self.basepath = basepath
        self.mimetype = None
        self.file = None
        self.data = None
        self.uri = None
        self.local = None
        self.tmp_file = None
        uri = uri or str()
        if not isinstance(uri, str):
            uri = uri.decode("utf-8")
        log.debug("FileObject %r, Basepath: %r", uri, basepath)

        # Data URI
        if uri.startswith("data:"):
            m = _rx_datauri.match(uri)
            self.mimetype = m.group("mime")

            b64 = urllib_unquote(m.group("data"))

            # The data may be incorrectly unescaped... repairs needed
            b64 = b64.strip("b'").strip("'").encode()
            b64 = re.sub(b"\\n", b'', b64)
            b64 = re.sub(rb'[^A-Za-z0-9+/]+', b'', b64)

            # Add padding as needed, to make length into a multiple of 4
            #
            b64 += b"=" * ((4 - len(b64) % 4) % 4)

            self.data = base64.b64decode(b64)

        else:
            # Check if we have an external scheme
            if basepath and not urlparse.urlparse(uri).scheme:
                urlParts = urlparse.urlparse(basepath)
            else:
                urlParts = urlparse.urlparse(uri)

            log.debug("URLParts: {}".format((urlParts, urlParts.scheme)))

            if urlParts.scheme == 'file':
                if basepath and uri.startswith('/'):
                    uri = urlparse.urljoin(basepath, uri[1:])
                urlResponse = urllib2.urlopen(uri)
                self.mimetype = urlResponse.info().get("Content-Type",
                                                       '').split(";")[0]
                self.uri = urlResponse.geturl()
                self.file = urlResponse

            # Drive letters have len==1 but we are looking
            # for things like http:
            elif urlParts.scheme in ('http', 'https'):

                log.debug("Sending request for {} with httplib".format(uri))

                # External data
                if basepath:
                    uri = urlparse.urljoin(basepath, uri)

                log.debug("Uri parsed: {}".format(uri))

                #path = urlparse.urlsplit(url)[2]
                #mimetype = getMimeType(path)

                # Using HTTPLIB
                server, path = urllib2.splithost(uri[uri.find("//"):])
                if uri.startswith("https://"):
                    conn = httplib.HTTPSConnection(server, **httpConfig)
                else:
                    conn = httplib.HTTPConnection(server)
                conn.request("GET", path)
                r1 = conn.getresponse()
                # log.debug("HTTP %r %r %r %r", server, path, uri, r1)
                if (r1.status, r1.reason) == (200, "OK"):
                    self.mimetype = r1.getheader("Content-Type",
                                                 '').split(";")[0]
                    self.uri = uri
                    log.debug("here")
                    if r1.getheader("content-encoding") == "gzip":
                        import gzip

                        self.file = gzip.GzipFile(mode="rb",
                                                  fileobj=six.BytesIO(
                                                      r1.read()))
                    else:
                        self.file = pisaTempFile(r1.read())
                else:
                    log.debug("Received non-200 status: {}".format(
                        (r1.status, r1.reason)))
                    try:
                        urlResponse = urllib2.urlopen(uri)
                    except urllib2.HTTPError as e:
                        log.error("Could not process uri: {}".format(e))
                        return
                    self.mimetype = urlResponse.info().get("Content-Type",
                                                           '').split(";")[0]
                    self.uri = urlResponse.geturl()
                    self.file = urlResponse

            else:

                log.debug("Unrecognized scheme, assuming local file path")

                # Local data
                if basepath:
                    if sys.platform == 'win32' and os.path.isfile(basepath):
                        basepath = os.path.dirname(basepath)
                    uri = os.path.normpath(os.path.join(basepath, uri))

                if os.path.isfile(uri):
                    self.uri = uri
                    self.local = uri

                    self.setMimeTypeByName(uri)
                    if self.mimetype and self.mimetype.startswith('text'):
                        self.file = open(
                            uri,
                            "r")  #removed bytes... lets hope it goes ok :/
                    else:
                        # removed bytes... lets hope it goes ok :/
                        self.file = open(uri, "rb")
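The base64 clean-up in the data-URI branch of this variant (strip stray quoting, keep only the base64 alphabet, re-pad to a multiple of four) can be isolated into a small helper; a hedged sketch follows, with an invented payload.

# Hedged sketch of the base64 repair above: drop stray repr() quoting and any
# non-alphabet characters, then re-pad before decoding. The payload is invented.
import base64
import re

def repair_b64(payload):
    raw = payload.encode("utf-8") if isinstance(payload, str) else payload
    raw = raw.strip(b"b'")                        # stray b'...' quoting, as in the snippet
    raw = re.sub(rb"[^A-Za-z0-9+/]", b"", raw)    # keep only the base64 alphabet
    raw += b"=" * ((4 - len(raw) % 4) % 4)        # pad to a multiple of 4
    return base64.b64decode(raw)

print(repair_b64("aGVsbG8gd29ybGQ\n"))  # b'hello world'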
>>> sp1=request.splitquery()
Traceback (most recent call last):
  ...
TypeError: splitquery() missing 1 required positional argument: 'url'
>>> sp1=request.splitquery(url1)
>>> spl
Traceback (most recent call last):
  File "<pyshell#31>", line 1, in <module>
    spl
NameError: name 'spl' is not defined
>>> sp1
('https://stackoverflow.com/questions/34475051/need-to-install-urllib2-for-python-3-5-1', 'admin=no&param=yes:8000')
>>> dir(request)
['AbstractBasicAuthHandler', 'AbstractDigestAuthHandler', 'AbstractHTTPHandler', 'BaseHandler', 'CacheFTPHandler', 'ContentTooShortError', 'DataHandler', 'FTPHandler', 'FancyURLopener', 'FileHandler', 'HTTPBasicAuthHandler', 'HTTPCookieProcessor', 'HTTPDefaultErrorHandler', 'HTTPDigestAuthHandler', 'HTTPError', 'HTTPErrorProcessor', 'HTTPHandler', 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm', 'HTTPPasswordMgrWithPriorAuth', 'HTTPRedirectHandler', 'HTTPSHandler', 'MAXFTPCACHE', 'OpenerDirector', 'ProxyBasicAuthHandler', 'ProxyDigestAuthHandler', 'ProxyHandler', 'Request', 'URLError', 'URLopener', 'UnknownHandler', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '__version__', '_cut_port_re', '_ftperrors', '_have_ssl', '_localhost', '_noheaders', '_opener', '_parse_proxy', '_proxy_bypass_macosx_sysconf', '_randombytes', '_safe_gethostbyname', '_thishost', '_url_tempfiles', 'addclosehook', 'addinfourl', 'base64', 'bisect', 'build_opener', 'contextlib', 'email', 'ftpcache', 'ftperrors', 'ftpwrapper', 'getproxies', 'getproxies_environment', 'getproxies_registry', 'hashlib', 'http', 'install_opener', 'io', 'localhost', 'noheaders', 'os', 'parse_http_list', 'parse_keqv_list', 'pathname2url', 'posixpath', 'proxy_bypass', 'proxy_bypass_environment', 'proxy_bypass_registry', 'quote', 're', 'request_host', 'socket', 'splitattr', 'splithost', 'splitpasswd', 'splitport', 'splitquery', 'splittag', 'splittype', 'splituser', 'splitvalue', 'ssl', 'string', 'sys', 'tempfile', 'thishost', 'time', 'to_bytes', 'unquote', 'unquote_to_bytes', 'unwrap', 'url2pathname', 'urlcleanup', 'urljoin', 'urlopen', 'urlparse', 'urlretrieve', 'urlsplit', 'urlunparse', 'warnings']
>>> sp1=request.splitattr(url1)
>>> sp1
('https://stackoverflow.com/questions/34475051/need-to-install-urllib2-for-python-3-5-1?admin=no&param=yes:8000', [])
>>> sp1=request.splithost(url1)
>>> sp1
(None, 'https://stackoverflow.com/questions/34475051/need-to-install-urllib2-for-python-3-5-1?admin=no&param=yes:8000')
>>> sp1=request.splittag(url1)
>>> sp1
('https://stackoverflow.com/questions/34475051/need-to-install-urllib2-for-python-3-5-1?admin=no&param=yes:8000', None)
>>> sp1=request.urlcleanup(url1)
Traceback (most recent call last):
  File "<pyshell#40>", line 1, in <module>
    sp1=request.urlcleanup(url1)
TypeError: urlcleanup() takes 0 positional arguments but 1 was given
>>> sp1=request.urlparse(url1)
>>> p1
Traceback (most recent call last):
  File "<pyshell#42>", line 1, in <module>
    p1
NameError: name 'p1' is not defined
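The split* helpers poked at in this session are deprecated in recent Python 3 releases; a hedged sketch of reading the same pieces off urlsplit/parse_qs, reusing the Stack Overflow URL the session passed as url1, follows.

# Hedged modern equivalent of the session's splitquery/splittag/splithost calls,
# reusing the URL that the transcript passed as url1.
from urllib.parse import urlsplit, parse_qs

url1 = ('https://stackoverflow.com/questions/34475051/'
        'need-to-install-urllib2-for-python-3-5-1?admin=no&param=yes:8000')

parts = urlsplit(url1)
print(parts.query)               # 'admin=no&param=yes:8000', cf. splitquery()[1]
print(parts.fragment or None)    # cf. splittag()[1]
print(parts.netloc)              # 'stackoverflow.com'; splithost() expects the '//...' form
print(parse_qs(parts.query))     # {'admin': ['no'], 'param': ['yes:8000']}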