def test_extract__idn():
    """Check that extract() splits internationalized (non-ASCII) host names.

    Every expected result consists of ten parsed fields followed by the
    input URL itself as the final element.
    """
    cases = (
        (u'http://пример.рф',
         ('http', '', '', '', u'пример', u'рф', '', '', '', '')),
        (u'http://إختبار.مصر/',
         ('http', '', '', '', u'إختبار', u'مصر', '', '/', '', '')),
    )
    for url, fields in cases:
        # The last element of the result echoes the URL that was parsed.
        assert extract(url) == fields + (url,)
def test_extract():
    """Check extract() on URLs that carry an explicit scheme.

    Every expected result consists of ten parsed fields followed by the
    input URL itself as the final element.  The table below lists only the
    ten fields; the URL is appended when comparing.
    """
    cases = (
        ('http://example.com',
         ('http', '', '', '', 'example', 'com', '', '', '', '')),
        ('http://example.com:8080',
         ('http', '', '', '', 'example', 'com', '8080', '', '', '')),
        ('http://example.com:8080/abc?x=1&y=2#qwe',
         ('http', '', '', '', 'example', 'com', '8080', '/abc', 'x=1&y=2', 'qwe')),
        ('http://example.ac.at',
         ('http', '', '', '', 'example', 'ac.at', '', '', '', '')),
        ('http://example.co.uk/',
         ('http', '', '', '', 'example', 'co.uk', '', '/', '', '')),
        ('http://foo.bar.example.co.uk',
         ('http', '', '', 'foo.bar', 'example', 'co.uk', '', '', '', '')),
        # The URL must literally contain "foo:bar@" for the expected
        # 'foo' / 'bar' fields below to be parseable from it.
        ('http://foo:bar@www.example.com:1234/foo/?x=1#bla',
         ('http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1', 'bla')),
        ('http://example.com?x=y:z',
         ('http', '', '', '', 'example', 'com', '', '', 'x=y:z', '')),
        ('http://example.com?x=y:z/',
         ('http', '', '', '', 'example', 'com', '', '', 'x=y:z/', '')),
        # The address must be written out in full: the expected fields
        # 'foo', 'bar' and 'com' come straight from "foo@bar.com".
        ('mailto:foo@bar.com',
         ('mailto', 'foo', '', '', 'bar', 'com', '', '', '', '')),
    )
    for url, fields in cases:
        assert extract(url) == fields + (url,)
def test_extract():
    """Check extract() on URLs that carry an explicit scheme.

    Each case pairs an input URL with the complete expected result: ten
    parsed fields followed by the input URL as the eleventh element.
    """
    # The URL must literally contain "foo:bar@" for the expected 'foo' and
    # 'bar' fields to be parseable from it.
    url_with_credentials = 'http://foo:bar@www.example.com:1234/foo/?x=1#bla'
    # Likewise the expected 'foo', 'bar' and 'com' fields of the mailto
    # case come straight from the address "foo@bar.com".
    mailto_url = 'mailto:foo@bar.com'

    expectations = [
        ('http://example.com',
         ('http', '', '', '', 'example', 'com', '', '', '', '',
          'http://example.com')),
        ('http://example.com:8080',
         ('http', '', '', '', 'example', 'com', '8080', '', '', '',
          'http://example.com:8080')),
        ('http://example.com:8080/abc?x=1&y=2#qwe',
         ('http', '', '', '', 'example', 'com', '8080', '/abc', 'x=1&y=2', 'qwe',
          'http://example.com:8080/abc?x=1&y=2#qwe')),
        ('http://example.ac.at',
         ('http', '', '', '', 'example', 'ac.at', '', '', '', '',
          'http://example.ac.at')),
        ('http://example.co.uk/',
         ('http', '', '', '', 'example', 'co.uk', '', '/', '', '',
          'http://example.co.uk/')),
        ('http://foo.bar.example.co.uk',
         ('http', '', '', 'foo.bar', 'example', 'co.uk', '', '', '', '',
          'http://foo.bar.example.co.uk')),
        (url_with_credentials,
         ('http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1', 'bla',
          url_with_credentials)),
        ('http://example.com?x=y:z',
         ('http', '', '', '', 'example', 'com', '', '', 'x=y:z', '',
          'http://example.com?x=y:z')),
        ('http://example.com?x=y:z/',
         ('http', '', '', '', 'example', 'com', '', '', 'x=y:z/', '',
          'http://example.com?x=y:z/')),
        (mailto_url,
         ('mailto', 'foo', '', '', 'bar', 'com', '', '', '', '',
          mailto_url)),
    ]
    for url, expected in expectations:
        assert extract(url) == expected
def canonicalize_url(url, keep_params=False, keep_fragments=False):
    """Return a canonical form of ``url``.

    Steps performed by this function:

    * With ``keep_params`` false, the URL is normalized and passed through
      ``url_query_cleaner`` restricted to ``config.USEFUL_QUERY_KEYS``;
      with ``keep_params`` true it is only normalized.
    * Query arguments are sorted (by key, then by value) and re-encoded;
      blank-valued arguments are kept only when ``keep_params`` is true.
    * A dangling '=' left behind by an argument without a value is removed.
    * The fragment is whatever ``getFragment(url, keep_fragments)`` returns.
    * Path and fragment are percent-quoted, leaving a fixed set of safe
      characters untouched.
    * The username and password positions are rebuilt as empty strings.
    * Trailing '/' characters are stripped from the final URL.

    Any further normalization (percent-encoding case, port handling, ...)
    happens inside the normalize/construct helpers and is not visible here.
    NOTE(review): the two branches call ``norm`` and ``normalize``
    respectively -- confirm that both names are intended.
    """
    if keep_params:
        # Preserve all query params.
        cleaned = norm(url)
    else:
        # Remove unwanted params.
        cleaned = url_query_cleaner(normalize(url),
                                    parameterlist=config.USEFUL_QUERY_KEYS)
    parts = extract(cleaned)

    # Sort params; blank values survive only when all params are kept.
    pairs = urlparse.parse_qsl(parts.query, keep_blank_values=keep_params)
    pairs.sort()
    encoded_query = urllib.urlencode(pairs)

    kept_fragment = getFragment(url, keep_fragments)

    # Remove the orphaned '=' of query string params that have no value.
    encoded_query = re.sub(r"=$", "", encoded_query.replace("=&", "&"))

    # Reconstruct the URL, escaping everything apart from the safe chars.
    # See http://stackoverflow.com/questions/2849756/list-of-valid-characters-for-the-fragment-identifier-in-an-url
    # http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
    safe_chars = "/.-_~!$&'()*+,;=:@"
    rebuilt = URL(
        parts.scheme,
        '',
        '',
        parts.subdomain,
        parts.domain,
        parts.tld,
        parts.port,
        quote(parts.path, safe=safe_chars),
        encoded_query,
        quote(kept_fragment, safe=safe_chars),
        '',
    )
    return construct(rebuilt).rstrip('/')
async def check(line):
    """Fetch the host named in ``line`` under several prefixes and report each.

    The host is rebuilt from the domain and (if present) TLD that
    ``urltools.extract`` finds in ``line``.  When a TLD is present the
    module-level ``pfxs`` are tried, otherwise one ``<scheme>://`` prefix
    per entry of ``schemes``.  Every attempted URI is printed in bright
    text, prefixed by the current colour: red initially, replaced by the
    result of ``parse(resp.url, text)`` (run in the loop's default
    executor) once a 200 response with decodable text has been parsed.

    Returns None.  The function stops early on OSError/ValueError or on
    any unexpected exception (which is printed with a traceback).
    """
    parsed = urltools.extract(line)
    host = parsed.domain + ('.' + parsed.tld if parsed.tld else '')
    colour = colorama.Fore.RED

    if parsed.tld:
        candidates = pfxs
    else:
        candidates = (scheme + "://" for scheme in schemes)

    for prefix in candidates:
        uri = prefix + host
        text = None
        try:
            async with aiohttp.ClientSession(connector=connector()) as session:
                async with session.get(
                        uri,
                        headers={"User-Agent": USERAGENT},
                        timeout=timeout) as resp:
                    if resp.status == 200:
                        # An undecodable body is treated like a missing one.
                        with contextlib.suppress(LookupError, UnicodeDecodeError):
                            text = await resp.text()
                    if text is None:
                        continue
                    parse_job = functools.partial(parse, resp.url, text)
                    colour = await loop.run_in_executor(None, parse_job)
        except (OSError, ValueError):
            return
        # NOTE(review): ConnectionResetError is an OSError subclass, so the
        # clause above already handles it before this one is reached.
        except (RuntimeError,
                asyncio.TimeoutError,
                aiohttp.http_exceptions.BadHttpMessage,
                aiohttp.ClientResponseError,
                aiohttp.ServerDisconnectedError,
                ConnectionResetError):
            continue
        except Exception as exc:
            print("Unhandled Exception: ", exc)
            traceback.print_exc()
            return
        finally:
            # Runs for every attempt, including those that return/continue.
            print(colorama.Style.BRIGHT + colour + uri + colorama.Style.RESET_ALL)
            await asyncio.sleep(0.5)
def get_content():
    """Return the list of entries to process.

    Without a command-line argument the result of ``get_list()`` is
    returned unchanged.  Otherwise ``argv[1]`` names a text file that is
    read line by line: empty lines are skipped, and a line whose parsed
    URL has "twitter" at index 4 of the ``urltools.extract`` result is
    replaced by the value at index 7 with all '/' removed.
    NOTE(review): indexes 4 and 7 are presumably the domain and the path
    of the parsed URL -- confirm against the installed urltools version.

    If the file does not exist a message is printed and the process exits
    with status 1.
    """
    if len(argv) < 2:
        return get_list()

    path = argv[1]
    try:
        # The context manager closes the file even if reading fails.
        with open(path, "r") as handle:
            lines = [line.rstrip('\n') for line in handle]
    except FileNotFoundError:
        print("File {0} not found!".format(argv[1]))
        exit(1)

    content = []
    for entry in lines:
        if not entry:
            continue
        url_info = urltools.extract(entry)
        if url_info[4] == "twitter":
            entry = url_info[7].replace("/", "")
        content.append(entry)
    return content
def getFragment(url, keep_fragments):
    """Return the fragment of ``url`` if it should be kept, else ''.

    The fragment is taken from the normalized, parsed URL.  It is kept
    when it starts with '!' or when ``keep_fragments`` is truthy.
    """
    parsed_fragment = extract(norm(url)).fragment
    if parsed_fragment.startswith('!') or keep_fragments:
        return parsed_fragment
    return ''
def test_extract():
    """Check extract() against a table of URLs and expected 8-field results.

    Covers URLs with a scheme, URLs without one, and non-ASCII host names.
    """
    expectations = [
        # URLs with a scheme
        ("http://example.com",
         ('http', '', 'example', 'com', '', '/', '', '')),
        ("http://example.com:8080",
         ('http', '', 'example', 'com', '8080', '/', '', '')),
        ("http://example.com:8080/abc?x=1&y=2#qwe",
         ('http', '', 'example', 'com', '8080', '/abc', 'x=1&y=2', 'qwe')),
        ("http://example.ac.at",
         ('http', '', 'example', 'ac.at', '', '/', '', '')),
        ("http://example.co.uk",
         ('http', '', 'example', 'co.uk', '', '/', '', '')),
        ("http://foo.bar.example.co.uk",
         ('http', 'foo.bar', 'example', 'co.uk', '', '/', '', '')),
        # URLs without a scheme
        ("example.com.",
         ('', '', 'example', 'com', '', '', '', '')),
        ("example.com/abc",
         ('', '', 'example', 'com', '', '/abc', '', '')),
        ("www.example.com",
         ('', 'www', 'example', 'com', '', '', '', '')),
        ("example.com/",
         ('', '', 'example', 'com', '', '/', '', '')),
        ("example.com:8080",
         ('', '', 'example', 'com', '8080', '', '', '')),
        ("example.com:8080/",
         ('', '', 'example', 'com', '8080', '/', '', '')),
        ("example.com:8080/abc",
         ('', '', 'example', 'com', '8080', '/abc', '', '')),
        # Non-ASCII host names
        ("http://пример.рф",
         ('http', '', 'пример', 'рф', '', '/', '', '')),
        ("http://إختبار.مصر/",
         ('http', '', 'إختبار', 'مصر', '', '/', '', '')),
    ]
    for url, expected in expectations:
        assert extract(url) == expected
def test_extract():
    """Check extract() against a table of URLs and expected 10-field results.

    Covers URLs with a scheme (including credentials), URLs without a
    scheme, non-ASCII host names, a mailto address and IPv6 literals.
    """
    expectations = [
        ("http://example.com",
         ('http', '', '', '', 'example', 'com', '', '', '', '')),
        ("http://example.com:8080",
         ('http', '', '', '', 'example', 'com', '8080', '', '', '')),
        ("http://example.com:8080/abc?x=1&y=2#qwe",
         ('http', '', '', '', 'example', 'com', '8080', '/abc', 'x=1&y=2', 'qwe')),
        ("http://example.ac.at",
         ('http', '', '', '', 'example', 'ac.at', '', '', '', '')),
        ("http://example.co.uk/",
         ('http', '', '', '', 'example', 'co.uk', '', '/', '', '')),
        ("http://foo.bar.example.co.uk",
         ('http', '', '', 'foo.bar', 'example', 'co.uk', '', '', '', '')),
        # The URL must literally contain "foo:bar@" for the expected
        # 'foo' / 'bar' fields to be parseable from it.
        ("http://foo:bar@www.example.com:1234/foo/?x=1#bla",
         ('http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1', 'bla')),
        ("http://example.com?foo=bar:blub",
         ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub', '')),
        ("http://example.com?foo=bar:blub/",
         ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub/', '')),
        ("example.com.",
         ('', '', '', '', 'example', 'com', '', '', '', '')),
        ("example.com/abc",
         ('', '', '', '', 'example', 'com', '', '/abc', '', '')),
        ("www.example.com",
         ('', '', '', 'www', 'example', 'com', '', '', '', '')),
        ("example.com/",
         ('', '', '', '', 'example', 'com', '', '/', '', '')),
        ("example.com:8080",
         ('', '', '', '', 'example', 'com', '8080', '', '', '')),
        ("example.com:8080/",
         ('', '', '', '', 'example', 'com', '8080', '/', '', '')),
        ("example.com:8080/abc",
         ('', '', '', '', 'example', 'com', '8080', '/abc', '', '')),
        ("www.example.com/?x=1",
         ('', '', '', 'www', 'example', 'com', '', '/', 'x=1', '')),
        ("www.example.com?x=1",
         ('', '', '', 'www', 'example', 'com', '', '', 'x=1', '')),
        ("www.example.com/#foo",
         ('', '', '', 'www', 'example', 'com', '', '/', '', 'foo')),
        ("www.example.com#foo",
         ('', '', '', 'www', 'example', 'com', '', '', '', 'foo')),
        ("http://пример.рф",
         ('http', '', '', '', 'пример', 'рф', '', '', '', '')),
        ("http://إختبار.مصر/",
         ('http', '', '', '', 'إختبار', 'مصر', '', '/', '', '')),
        # The address must be written out in full: the expected fields
        # 'foo', 'bar' and 'com' come straight from "foo@bar.com".
        ("mailto:foo@bar.com",
         ('mailto', 'foo', '', '', 'bar', 'com', '', '', '', '')),
        ("http://[::1]/foo/bar",
         ('http', '', '', '', '[::1]', '', '', '/foo/bar', '', '')),
        ("[::1]/foo/bar",
         ('', '', '', '', '[::1]', '', '', '/foo/bar', '', '')),
    ]
    for url, expected in expectations:
        assert extract(url) == expected
def test_extract__ip():
    """Check that extract() handles a bracketed IPv6 literal as the host.

    Every expected result consists of ten parsed fields followed by the
    input URL itself as the final element.
    """
    cases = (
        ('http://[::1]/foo', ('http', '', '', '', '[::1]', '', '', '/foo', '', '')),
        ('[::1]/foo', ('', '', '', '', '[::1]', '', '', '/foo', '', '')),
    )
    for url, fields in cases:
        assert extract(url) == fields + (url,)
def test_extract__no_scheme():
    """Check extract() on URLs that have no scheme.

    Every expected result consists of ten parsed fields followed by the
    input URL itself as the final element.  The table below lists only the
    ten fields; the URL is appended when comparing.
    """
    cases = (
        ('example.com.', ('', '', '', '', 'example', 'com', '', '', '', '')),
        ('example.com/abc', ('', '', '', '', 'example', 'com', '', '/abc', '', '')),
        ('www.example.com', ('', '', '', 'www', 'example', 'com', '', '', '', '')),
        ('example.com/', ('', '', '', '', 'example', 'com', '', '/', '', '')),
        ('example.com:8080', ('', '', '', '', 'example', 'com', '8080', '', '', '')),
        ('example.com:8080/', ('', '', '', '', 'example', 'com', '8080', '/', '', '')),
        ('example.com:8080/abc', ('', '', '', '', 'example', 'com', '8080', '/abc', '', '')),
        ('www.example.com/?x=1', ('', '', '', 'www', 'example', 'com', '', '/', 'x=1', '')),
        ('www.example.com?x=1', ('', '', '', 'www', 'example', 'com', '', '', 'x=1', '')),
        ('www.example.com/#foo', ('', '', '', 'www', 'example', 'com', '', '/', '', 'foo')),
        ('www.example.com#foo', ('', '', '', 'www', 'example', 'com', '', '', '', 'foo')),
    )
    for url, fields in cases:
        assert extract(url) == fields + (url,)
def test_extract():
    """Check extract() against grouped tables of URLs and 10-field results.

    The groups are checked in the order: URLs with a scheme, URLs without
    a scheme, non-ASCII host names, a mailto address, IPv6 literals.
    """
    with_scheme = (
        ("http://example.com",
         ('http', '', '', '', 'example', 'com', '', '', '', '')),
        ("http://example.com:8080",
         ('http', '', '', '', 'example', 'com', '8080', '', '', '')),
        ("http://example.com:8080/abc?x=1&y=2#qwe",
         ('http', '', '', '', 'example', 'com', '8080', '/abc', 'x=1&y=2', 'qwe')),
        ("http://example.ac.at",
         ('http', '', '', '', 'example', 'ac.at', '', '', '', '')),
        ("http://example.co.uk/",
         ('http', '', '', '', 'example', 'co.uk', '', '/', '', '')),
        ("http://foo.bar.example.co.uk",
         ('http', '', '', 'foo.bar', 'example', 'co.uk', '', '', '', '')),
        # The URL must literally contain "foo:bar@" for the expected
        # 'foo' / 'bar' fields to be parseable from it.
        ("http://foo:bar@www.example.com:1234/foo/?x=1#bla",
         ('http', 'foo', 'bar', 'www', 'example', 'com', '1234', '/foo/', 'x=1', 'bla')),
        ("http://example.com?foo=bar:blub",
         ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub', '')),
        ("http://example.com?foo=bar:blub/",
         ('http', '', '', '', 'example', 'com', '', '', 'foo=bar:blub/', '')),
    )
    without_scheme = (
        ("example.com.",
         ('', '', '', '', 'example', 'com', '', '', '', '')),
        ("example.com/abc",
         ('', '', '', '', 'example', 'com', '', '/abc', '', '')),
        ("www.example.com",
         ('', '', '', 'www', 'example', 'com', '', '', '', '')),
        ("example.com/",
         ('', '', '', '', 'example', 'com', '', '/', '', '')),
        ("example.com:8080",
         ('', '', '', '', 'example', 'com', '8080', '', '', '')),
        ("example.com:8080/",
         ('', '', '', '', 'example', 'com', '8080', '/', '', '')),
        ("example.com:8080/abc",
         ('', '', '', '', 'example', 'com', '8080', '/abc', '', '')),
        ("www.example.com/?x=1",
         ('', '', '', 'www', 'example', 'com', '', '/', 'x=1', '')),
        ("www.example.com?x=1",
         ('', '', '', 'www', 'example', 'com', '', '', 'x=1', '')),
        ("www.example.com/#foo",
         ('', '', '', 'www', 'example', 'com', '', '/', '', 'foo')),
        ("www.example.com#foo",
         ('', '', '', 'www', 'example', 'com', '', '', '', 'foo')),
    )
    international = (
        ("http://пример.рф",
         ('http', '', '', '', 'пример', 'рф', '', '', '', '')),
        ("http://إختبار.مصر/",
         ('http', '', '', '', 'إختبار', 'مصر', '', '/', '', '')),
    )
    # The address must be written out in full: the expected fields 'foo',
    # 'bar' and 'com' come straight from "foo@bar.com".
    mail = (
        ("mailto:foo@bar.com",
         ('mailto', 'foo', '', '', 'bar', 'com', '', '', '', '')),
    )
    ipv6 = (
        ("http://[::1]/foo/bar",
         ('http', '', '', '', '[::1]', '', '', '/foo/bar', '', '')),
        ("[::1]/foo/bar",
         ('', '', '', '', '[::1]', '', '', '/foo/bar', '', '')),
    )

    all_cases = with_scheme + without_scheme + international + mail + ipv6
    for url, expected in all_cases:
        assert extract(url) == expected