def test_finds_query(self):
    url = 'http://mysubdomain.example.com?myquery=test'
    result = url_parser.get_url(url)
    self.assertEqual(result.query['myquery'], 'test')

    url = 'http://mysubdomain.example.com?myquery=test&one=two&test'
    result = url_parser.get_url(url)
    self.assertEqual(result.query['myquery'], 'test')
    self.assertEqual(result.query['one'], 'two')
    self.assertIsNone(result.query['test'])

    url = 'http://mysubdomain.example.com/file.js?myquery=test&one=two'
    result = url_parser.get_url(url)
    self.assertEqual(result.query['myquery'], 'test')
    self.assertEqual(result.query['one'], 'two')

    url = 'http://mysubdomain.example.com/path/and/file.js?myquery=test&one=two'
    result = url_parser.get_url(url)
    self.assertEqual(result.query['myquery'], 'test')
    self.assertEqual(result.query['one'], 'two')

    url = 'http://mysubdomain.example.com/path/?myquery=test&one=two'
    result = url_parser.get_url(url)
    self.assertEqual(result.query['myquery'], 'test')
    self.assertEqual(result.query['one'], 'two')
def test_finds_file(self):
    url = 'http://mysubdomain.example.com/cool.jpg'
    result = url_parser.get_url(url)
    self.assertEqual(result.file, 'cool.jpg')

    url = 'http://mysubdomain.example.com/directory/here/sample.mp4'
    result = url_parser.get_url(url)
    self.assertEqual(result.file, 'sample.mp4')
def test_finds_top_domain(self):
    url = 'http://mysubdomain.example.com'
    result = url_parser.get_url(url)
    self.assertEqual(result.top_domain, 'com')

    url = 'http://mysubdomain.example.co.uk'
    result = url_parser.get_url(url)
    self.assertEqual(result.top_domain, 'co.uk')
def test_finds_dir(self):
    url = 'http://mysubdomain.example.com/folder/'
    result = url_parser.get_url(url)
    self.assertEqual(result.dir, '/folder/')

    url = 'http://mysubdomain.example.com/multiple/folders/'
    result = url_parser.get_url(url)
    self.assertEqual(result.dir, '/multiple/folders/')

    url = 'http://mysubdomain.example.com/multiple/folders/with_a_file.js'
    result = url_parser.get_url(url)
    self.assertEqual(result.dir, '/multiple/folders/')
def test_finds_protocol(self):
    url = 'http://mysubdomain.example.com'
    result = url_parser.get_url(url)
    self.assertEqual(result.protocol, 'http')

    url = 'https://mysubdomain.example.com'
    result = url_parser.get_url(url)
    self.assertEqual(result.protocol, 'https')

    url = 'ftp://mysubdomain.example.com'
    result = url_parser.get_url(url)
    self.assertEqual(result.protocol, 'ftp')
def test_finds_path(self):
    url = 'http://mysubdomain.example.com/path'
    result = url_parser.get_url(url)
    self.assertEqual(result.path, '/path')

    url = 'http://mysubdomain.example.com/this/is/the/path'
    result = url_parser.get_url(url)
    self.assertEqual(result.path, '/this/is/the/path')

    url = 'http://mysubdomain.example.com/path/with/file.js'
    result = url_parser.get_url(url)
    self.assertEqual(result.path, '/path/with/file.js')
async def filter_links(bot, message):
    # Non-members and moderators (manage_messages) are exempt.
    if (not isinstance(message.author, discord.Member)
            or message.author.permissions_in(message.channel).manage_messages):
        return
    regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|'
             r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    matches = re.findall(regex, message.content, re.MULTILINE)
    for link in matches:
        try:
            # Fetch the link so shorteners cannot hide a blocked target:
            # check the link itself plus every redirect hop.
            urls = []
            async with bot.http._HTTPClient__session.get(link) as resp:
                urls.append(url_parser.get_url(link)._asdict())
                for redirect in resp.history:
                    # redirect.real_url is a yarl.URL; url_parser expects str
                    urls.append(
                        url_parser.get_url(str(redirect.real_url))._asdict())
            for url in urls:
                for blocked in [
                        # "*" means any
                        # [http[s]://][sub.]<name>.<domain>[/path]
                        # Reason
                        '*.grabify.link/*',  # IP grabber
                        '*.pornhub.com/*',   # Porn
                ]:
                    parsed_blocked = url_parser.get_url(
                        blocked.replace('*', '-'))._asdict()
                    for k, v in parsed_blocked.items():
                        if k in ['protocol', 'www', 'dir', 'file',
                                 'fragment', 'query']:
                            continue  # irrelevant when matching
                        if v == url[k]:
                            continue  # exact match
                        if isinstance(v, str):
                            if v.replace('.', '') == '-':
                                continue  # '*' wildcard
                            if k == 'path' and v[1:] == '-':
                                continue  # '/*' wildcard path
                        break  # this part differs; try the next pattern
                    else:
                        # Every relevant part matched a blocked pattern.
                        await message.delete()
                        await message.channel.send(
                            f':warning: {message.author.mention} That link '
                            'is not allowed :warning:',
                            delete_after=15)
                        return
        except Exception as e:
            print(e)
async def check_link_base(url, block_list):
    url = url_parser.get_url(url)._asdict()
    for blocked in block_list:
        parsed_blocked = url_parser.get_url(
            blocked.replace('*', '-'))._asdict()
        delete = True
        for k in ['sub_domain', 'domain', 'top_domain', 'path']:
            rep = parsed_blocked[k]
            if k == 'path':
                rep = rep[1:]  # drop the leading '/'
            # A part rules out the match only if it differs and is not the
            # '-' wildcard produced by replace('*', '-') above.
            if url[k] != rep and rep.replace('.', '') != '-':
                delete = False
                break
        if delete:
            return True
    return False
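# A quick sanity check for check_link_base, assuming the same '*'-wildcard
# block-list format used in filter_links above; the sample URLs here are
# illustrative only, not taken from the original code.
import asyncio

block_list = ['*.grabify.link/*']

async def demo():
    print(await check_link_base('http://www.grabify.link/abc123', block_list))  # True -> delete
    print(await check_link_base('http://example.com/abc123', block_list))       # False -> allow

asyncio.run(demo())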
def test_finds_fragment(self):
    url = 'http://mysubdomain.example.com#my_fragment'
    result = url_parser.get_url(url)
    self.assertEqual(result.fragment, 'my_fragment')

    url = 'http://mysubdomain.example.com/path/#my_fragment'
    result = url_parser.get_url(url)
    self.assertEqual(result.fragment, 'my_fragment')

    url = 'http://mysubdomain.example.com/path/file.js#my_fragment'
    result = url_parser.get_url(url)
    self.assertEqual(result.fragment, 'my_fragment')

    url = 'http://mysubdomain.example.com#my_fragment?myargs=test'
    result = url_parser.get_url(url)
    self.assertEqual(result.fragment, 'my_fragment')

    url = 'http://mysubdomain.example.com/test/path.js#my_fragment?myargs=test'
    result = url_parser.get_url(url)
    self.assertEqual(result.fragment, 'my_fragment')
def submit():
    try:
        text = request.query.textin
        if text == '':
            return '<script>alert(\'Input is empty. Please enter the news text again. {}\')</script>'.format(
                'Thank you for using this service!')
    except Exception:
        return '<script>alert(\'{}\')</script>'.format(
            'Network error. Please try again! Thank you for using this service!')
    flag = url_parser.get_url(text)
    if flag:
        content = url_parser.get_content(flag)
        return text_deal_two(content)
    else:
        return text_deal_two(text)
def parseProductPage(URL):
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/79.0.3945.130 Safari/537.36')
    }
    page = requests.get(URL, headers=headers)
    URL_OBJECT = get_url(URL)
    domain = URL_OBJECT.domain
    soup = BeautifulSoup(page.content, 'html.parser')
    # Dispatch to the site-specific scraper for the parsed domain.
    parsers = {
        'flipkart': getFlipkartProduct,
        'bewakoof': getBewakoofProduct,
        'alibaba': getAlibabaProduct,
        'snapdeal': getSnapdealProduct,
        'amazon': getAmazonProduct,
    }
    if domain not in parsers:
        print('Incompatible Website URL')
        return {}
    ret = parsers[domain](soup, URL_OBJECT)
    ret['domain'] = domain
    return ret
from url_parser import parse_url, get_url, get_base_url

url = parse_url(
    "https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0"
)  # Returns the URL sections as a dict
url_object = get_url(
    "https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0"
)  # Does the same, but returns an object
basic_url = get_base_url(
    "https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0"
)  # Returns just the base URL

print(url["domain"])      # Outputs -> prospecta
print(url_object.domain)  # Outputs -> prospecta
print(basic_url)          # Outputs -> https://open.prospecta.app
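# Beyond .domain, the object returned by get_url exposes the parts exercised
# by the tests in this file (protocol, sub_domain, top_domain, path, query).
# A short look, reusing the example URL above; the printed values assume the
# parsing behaviour those tests describe.
result = get_url("https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0")
print(result.protocol)    # https
print(result.sub_domain)  # open
print(result.top_domain)  # app
print(result.path)        # /my_user_login
print(result.query)       # {'user': 'url-parser', 'password': 'H3ll0'}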
def test_does_not_mistake_file_for_dir(self):
    url = 'http://mysubdomain.example.com/folder/test'
    result = url_parser.get_url(url)
    self.assertEqual(result.dir, '/folder/')
    self.assertNotEqual(result.dir, '/folder/test')
def test_parses_url_with_www(self):
    url = 'www.example.com'
    result = url_parser.get_url(url)
    self.assertEqual(result.domain, 'example')
    self.assertEqual(result.top_domain, 'com')
def test_returns_null_if_protocol_is_missing(self):
    url = 'www.example.com'
    result = url_parser.get_url(url)
    self.assertIsNone(result.protocol)
def test_removes_extra_dot_from_www(self):
    url = 'http://www..example.com'
    result = url_parser.get_url(url)
    has_dot = '.' in result.www
    self.assertFalse(has_dot)
def test_returns_null_if_sub_domain_is_missing(self):
    url = 'http://example.com'
    result = url_parser.get_url(url)
    self.assertIsNone(result.sub_domain)
def test_finds_multiple_subdomains(self):
    url = 'my.subdomain.example.com'
    result = url_parser.get_url(url)
    self.assertEqual(result.sub_domain, 'my.subdomain')
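# For completeness, a minimal harness the test methods above could live in;
# the TestUrlParser class name is hypothetical, the rest is standard
# unittest plumbing.
import unittest

import url_parser

class TestUrlParser(unittest.TestCase):
    def test_finds_multiple_subdomains(self):
        result = url_parser.get_url('my.subdomain.example.com')
        self.assertEqual(result.sub_domain, 'my.subdomain')

if __name__ == '__main__':
    unittest.main()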