Пример #1
0
    def test_finds_query(self):
        """Query parameters are parsed with and without paths/files present."""
        parsed = url_parser.get_url('http://mysubdomain.example.com?myquery=test')
        self.assertEqual(parsed.query['myquery'], 'test')

        # A key given without '=value' must map to None, not be dropped.
        parsed = url_parser.get_url(
            'http://mysubdomain.example.com?myquery=test&one=two&test')
        self.assertEqual(parsed.query['myquery'], 'test')
        self.assertEqual(parsed.query['one'], 'two')
        self.assertIsNone(parsed.query['test'])

        # The same two-parameter query must survive a file, a nested path,
        # and a trailing-slash directory before the '?'.
        for address in (
                'http://mysubdomain.example.com/file.js?myquery=test&one=two',
                'http://mysubdomain.example.com/path/and/file.js?myquery=test&one=two',
                'http://mysubdomain.example.com/path/?myquery=test&one=two',
        ):
            parsed = url_parser.get_url(address)
            self.assertEqual(parsed.query['myquery'], 'test')
            self.assertEqual(parsed.query['one'], 'two')
Пример #2
0
    def test_finds_file(self):
        """The file name is extracted at the root and inside nested folders."""
        for address, expected in (
                ('http://mysubdomain.example.com/cool.jpg', 'cool.jpg'),
                ('http://mysubdomain.example.com/directory/here/sample.mp4',
                 'sample.mp4'),
        ):
            parsed = url_parser.get_url(address)
            self.assertEqual(parsed.file, expected)
Пример #3
0
    def test_finds_top_domain(self):
        """Top-level domains are found, including two-part ones like co.uk."""
        for address, expected in (
                ('http://mysubdomain.example.com', 'com'),
                ('http://mysubdomain.example.co.uk', 'co.uk'),
        ):
            parsed = url_parser.get_url(address)
            self.assertEqual(parsed.top_domain, expected)
Пример #4
0
    def test_finds_dir(self):
        """The dir part spans every folder, with or without a trailing file."""
        for address, expected in (
                ('http://mysubdomain.example.com/folder/', '/folder/'),
                ('http://mysubdomain.example.com/multiple/folders/',
                 '/multiple/folders/'),
                # A trailing file must be excluded from the directory part.
                ('http://mysubdomain.example.com/multiple/folders/with_a_file.js',
                 '/multiple/folders/'),
        ):
            parsed = url_parser.get_url(address)
            self.assertEqual(parsed.dir, expected)
Пример #5
0
    def test_finds_protocol(self):
        """Each URL scheme is reported back verbatim as the protocol."""
        for scheme in ('http', 'https', 'ftp'):
            parsed = url_parser.get_url(scheme + '://mysubdomain.example.com')
            self.assertEqual(parsed.protocol, scheme)
Пример #6
0
    def test_finds_path(self):
        """The path covers single segments, nested folders, and a file."""
        for expected in ('/path', '/this/is/the/path', '/path/with/file.js'):
            parsed = url_parser.get_url('http://mysubdomain.example.com' + expected)
            self.assertEqual(parsed.path, expected)
Пример #7
0
async def filter_links(bot, message):
    """Delete a message (and warn the author) if it links to a blocked site.

    Members with the manage_messages permission are exempt. Every HTTP(S)
    link in the message is fetched so that redirects cannot be used to hide
    a blocked destination; the final URL and every hop are all checked.

    BUG FIX: the original returned from the function on the FIRST blocked
    pattern that did not match, so only the first entry of the block list
    was ever checked. A non-matching pattern now moves on to the next one
    (same flag-and-break pattern as check_link_base).
    """
    if ((not isinstance(message.author, discord.Member))
            or message.author.permissions_in(message.channel).manage_messages):
        return
    regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|'
             r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    matches = re.findall(regex, message.content, re.MULTILINE)
    block_list = [  # "*" means any
        # [http[s]://][sub.]<name>.<domain>[/path]         # Reason
        ###########################################################
        '*.grabify.link/*',  # Ip Grabber
        '*.pornhub.com/*',  # P**n
    ]
    # Keys that are irrelevant for the block-list comparison.
    ignored_keys = ('protocol', 'www', 'dir', 'file', 'fragment', 'query')
    # Parse the patterns once instead of once per URL per message link.
    parsed_block_list = [
        url_parser.get_url(blocked.replace('*', '-'))._asdict()
        for blocked in block_list
    ]
    for link in matches:
        try:
            urls = []
            async with bot.http._HTTPClient__session.get(link) as resp:
                urls.append(url_parser.get_url(link)._asdict())
                for redirect in resp.history:
                    urls.append(
                        url_parser.get_url(redirect.real_url)._asdict())
            for url in urls:
                for parsed_blocked in parsed_block_list:
                    blocked_match = True
                    for k, v in parsed_blocked.items():
                        if k in ignored_keys:
                            continue
                        if v == url[k]:
                            continue
                        if isinstance(v, str):
                            # '-' is the wildcard placeholder for '*'.
                            if v.replace('.', '') == '-':
                                continue
                            if k == 'path' and v[1:] == '-':
                                continue
                        blocked_match = False
                        break
                    if blocked_match:
                        await message.delete()
                        await message.channel.send((
                            f':warning: {message.author.mention} That link is not '
                            'allowed :warning:'),
                                                   delete_after=15)
                        return
        except Exception as e:
            # Best-effort: a dead link or HTTP error must not crash the bot.
            print(e)
Пример #8
0
async def check_link_base(url, block_list):
    """Return True when *url* matches any wildcard pattern in *block_list*.

    Each pattern may use '*' as a wildcard; it is normalised to '-' before
    parsing so that a parsed component equal to '-' means "match anything".
    Returns None implicitly when nothing matches.
    """
    parts = url_parser.get_url(url)._asdict()
    for pattern in block_list:
        blocked = url_parser.get_url(pattern.replace('*', '-'))._asdict()
        matched = True
        for key in ('sub_domain', 'domain', 'top_domain', 'path'):
            expected = blocked[key]
            if key == 'path':
                # Drop the leading '/' so a bare '-' wildcard compares cleanly.
                expected = expected[1:]
            if parts[key] != expected and expected.replace('.', '') != '-':
                matched = False
                break
        if matched:
            return True
Пример #9
0
    def test_finds_fragment(self):
        """The fragment is found regardless of path, file, or query parts."""
        for address in (
                'http://mysubdomain.example.com#my_fragment',
                'http://mysubdomain.example.com/path/#my_fragment',
                'http://mysubdomain.example.com/path/file.js#my_fragment',
                # A query string after the fragment must not swallow it.
                'http://mysubdomain.example.com#my_fragment?myargs=test',
                'http://mysubdomain.example.com/test/path.js#my_fragment?myargs=test',
        ):
            parsed = url_parser.get_url(address)
            self.assertEqual(parsed.fragment, 'my_fragment')
Пример #10
0
def submit():
    """Handle the news-text form: extract content from a URL if one was
    submitted, otherwise process the raw text.

    Returns an alert <script> on empty input or request failure, otherwise
    the result of text_deal_two on either the fetched URL content or the
    submitted text itself.
    """
    try:
        text = request.query.textin
        if text == '':
            return '<script>alert(\'输入为空,请重新输入新闻内容,{}\')</script>'.format(
                '谢谢使用!')
    # BUG FIX: was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt; narrowed to Exception.
    except Exception:
        # BUG FIX: the alert argument was interpolated without surrounding
        # quotes, producing invalid JavaScript.
        return '<script>alert(\'{}\')</script>'.format('网络异常,请重试!谢谢使用!')

    flag = url_parser.get_url(text)

    if flag:
        content = url_parser.get_content(flag)
        return text_deal_two(content)

    else:
        return text_deal_two(text)
Пример #11
0
def parseProductPage(URL):
    """Fetch a product page and dispatch to the matching site scraper.

    Returns the scraper's dict with a 'domain' key added, or an empty dict
    (after printing a notice) for unsupported sites.
    """
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }
    page = requests.get(URL, headers=headers)

    URL_OBJECT = get_url(URL)
    domain = URL_OBJECT.domain
    soup = BeautifulSoup(page.content, 'html.parser')

    # One scraper per supported storefront, keyed by parsed domain.
    scrapers = {
        'flipkart': getFlipkartProduct,
        'bewakoof': getBewakoofProduct,
        'alibaba': getAlibabaProduct,
        'snapdeal': getSnapdealProduct,
        'amazon': getAmazonProduct,
    }
    scraper = scrapers.get(domain)
    if scraper is None:
        print('Incompatible Website URL')
        return {}

    ret = scraper(soup, URL_OBJECT)
    ret['domain'] = domain
    if domain == 'bewakoof':
        # The original implementation echoed bewakoof results; keep that.
        print(ret)
    return ret
Пример #12
0
from url_parser import parse_url, get_url, get_base_url

url = parse_url(
    "https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0"
)  # Returns the URL sections as a dict
url_object = get_url(
    "https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0"
)  # Does the same, but returns an object
basic_url = get_base_url(
    "https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0"
)  # Returns just the main URL (protocol + host)

print(url["domain"])  # Outputs -> prospecta
print(url_object.domain)  # Outputs -> prospecta
print(basic_url)  # Outputs -> https://open.prospecta.app
Пример #13
0
 def test_does_not_mistake_file_for_dir(self):
     """A trailing extensionless segment must not be folded into the dir."""
     parsed = url_parser.get_url('http://mysubdomain.example.com/folder/test')
     self.assertEqual(parsed.dir, '/folder/')
     self.assertNotEqual(parsed.dir, '/folder/test')
Пример #14
0
 def test_parses_url_with_www(self):
     """A bare www address still yields domain and top-level domain."""
     parsed = url_parser.get_url('www.example.com')
     self.assertEqual(parsed.domain, 'example')
     self.assertEqual(parsed.top_domain, 'com')
Пример #15
0
 def test_returns_null_if_protocol_is_missing(self):
     """Without a scheme the protocol attribute must be None."""
     parsed = url_parser.get_url('www.example.com')
     self.assertIsNone(parsed.protocol)
Пример #16
0
 def test_removes_extra_dot_from_www(self):
     """A doubled dot after www must not leak into the www part."""
     parsed = url_parser.get_url('http://www..example.com')
     self.assertFalse('.' in parsed.www)
Пример #17
0
 def test_returns_null_if_sub_domain_is_missing(self):
     """A URL with no subdomain yields sub_domain of None."""
     parsed = url_parser.get_url('http://example.com')
     self.assertIsNone(parsed.sub_domain)
Пример #18
0
 def test_finds_multiple_subdomains(self):
     """Chained subdomains are returned joined with dots."""
     parsed = url_parser.get_url('my.subdomain.example.com')
     self.assertEqual(parsed.sub_domain, 'my.subdomain')