Example #1
 def links(self, soup):
     # Resolve the href of every <a> tag against the url of the current page.
     links = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('a')
     ]
     # Also follow <meta http-equiv="refresh"> redirects whose content
     # attribute carries a 'url=' target.
     metas = filter(
         lambda meta: meta.attrs.get('http-equiv', '').lower() == 'refresh',
         soup.find_all('meta'))
     metas = filter(lambda meta: '=' in meta.attrs.get('content', ''),
                    metas)
     links += list(
         map(
             lambda meta: full_url_address(
                 meta.attrs['content'].split('=', 1)[1],
                 self.crawler_url.url), metas))
     for link in filter(bool, links):
         url = Url(link)
         if not url.is_valid():
             continue
         # Links outside the current domain or directory cost one level of depth.
         depth = self.crawler_url.depth
         if url.domain != self.crawler_url.url.domain or \
                 not url.path.startswith(self.crawler_url.url.directory_path):
             depth -= 1
         if depth <= 0:
             continue
         self.add_url(link, depth)
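
The meta-refresh branch above pulls the redirect target out of the content attribute by splitting on the first '='. A minimal self-contained sketch of that extraction with BeautifulSoup alone (no Url or full_url_address helpers needed):

from bs4 import BeautifulSoup

html = '<meta http-equiv="refresh" content="0; url=/new-location/">'
soup = BeautifulSoup(html, 'html.parser')
meta = soup.find('meta')
# Split only on the first '=' so the target itself may contain '=' characters.
target = meta.attrs['content'].split('=', 1)[1]
print(target)  # -> /new-location/
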
Example #2
 def get_links(self, text, soup=None):
     """
     :param text:
     :param soup:
     :return:
     """
     # Links and their metadata text nodes alternate as children of the <pre>.
     contents = list(
         filter(lambda x: isinstance(x, NavigableString) or is_link(x),
                soup.find('pre').contents))
     links = []
     for i, content in enumerate(contents):
         if not is_link(content) or '?' in content.attrs.get('href', ''):
             continue
         link = Url(
             full_url_address(content.attrs.get('href'),
                              self.processor.crawler_url.url))
         # The text node right after a link holds the entry's date and size.
         if i + 1 < len(contents) and isinstance(contents[i + 1],
                                                 NavigableString):
             extra = {}
             text = str(contents[i + 1])
             dt = DATETIME_PATTERN.findall(text)
             if dt:
                 extra['created_at'] = dt[0]
             size = FILESIZE_PATTERN.findall(text)
             if size:
                 extra['filesize'] = size[0].rstrip(' ')
             link.add_extra(extra)
         links.append(link)
     return links
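
DATETIME_PATTERN and FILESIZE_PATTERN are module constants not shown in the snippet. The regexes below are only illustrative assumptions for an Apache-style index line, sketching how the adjacent text node would be mined for the extras:

import re

# Hypothetical patterns; the real constants live elsewhere in the project.
DATETIME_PATTERN = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')
FILESIZE_PATTERN = re.compile(r'(\d[\d.]*[KMG]?) *$')

text = '          2021-03-04 10:21  1.5M '
extra = {}
dt = DATETIME_PATTERN.findall(text)
if dt:
    extra['created_at'] = dt[0]
size = FILESIZE_PATTERN.findall(text)
if size:
    extra['filesize'] = size[0].rstrip(' ')
print(extra)  # -> {'created_at': '2021-03-04 10:21', 'filesize': '1.5M'}
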
Example #3
 def get_links(self, text, soup=None):
     links = [
         full_url_address(link.attrs.get('href'),
                          self.processor.crawler_url.url)
         for link in soup.find_all('a')
     ]
     return [Url(link) for link in links]
Example #4
 def links(self, soup):
     links = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('a')
     ]
     for link in filter(bool, links):
         url = Url(link)
         if not url.is_valid():
             continue
         # Links outside the current domain or directory cost one level of depth.
         depth = self.crawler_url.depth
         if url.domain != self.crawler_url.url.domain or \
                 not url.path.startswith(self.crawler_url.url.directory_path):
             depth -= 1
         if depth <= 0:
             continue
         self.add_url(link, depth)
Example #5
 def process(self, text, soup=None):
     links = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('a')
     ]
     # Hrefs ending in '/' are queued as directories; every link is also kept in self.files.
     for link in filter(lambda x: x.url.endswith('/'), links):
         self.add_url(link, type='directory')
     self.files = [Url(link) for link in links]
Example #6
    def __init__(self,
                 crawler,
                 url,
                 depth=3,
                 source=None,
                 exists=None,
                 type=None,
                 timeout=10):
        """

        :type crawler: Crawler
        :type depth: int Máxima recursión sin haber subido respecto esta url
        """
        self.flags = set()
        self.depth = depth
        if not isinstance(url, Url):
            url = Url(url)
        if url.is_valid():
            # Normalize: drop the query string and the fragment.
            url.query = ''
            url.fragment = ''
        self.url = url
        self.crawler = crawler
        self.source = source
        self.exists = exists
        self.type = type
        self.timeout = timeout
        if url.is_valid() and (not url.path or url.path == '/'):
            # An empty or root path can only be a directory.
            self.type = 'directory'
        self.resp = None
Example #7
def is_url_loop(url, ignore_end=True):
    url = url if isinstance(url, Url) else Url(url)
    directories = list(filter(bool, url.directories))
    directories.reverse()
    # Starting from the deepest directory, look for a group of names that
    # repeats at least MATCHS_LOOP_NUM times in a row (e.g. /foo/foo/foo/).
    for i in range(1, (len(directories) // MATCHS_LOOP_NUM) + 1):
        groups = [
            tuple(directories[j:j + i])
            for j in range(0, MATCHS_LOOP_NUM * i, i)
        ]
        if len(set(groups)) == 1 and len(groups) >= MATCHS_LOOP_NUM:
            return True
    if ignore_end:
        # Retry on the parent so a trailing file name does not mask the loop.
        return is_url_loop(url.parent(), False)
    return False
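
MATCHS_LOOP_NUM is a module constant not shown here. A self-contained sketch of the same repetition check, assuming a value of 3 and working on a plain path string instead of a Url object:

MATCHS_LOOP_NUM = 3  # assumed value, for illustration only

def looks_like_loop(path):
    # Start from the deepest directory and look for a group of names that
    # repeats MATCHS_LOOP_NUM times in a row.
    directories = [d for d in path.split('/') if d]
    directories.reverse()
    for i in range(1, (len(directories) // MATCHS_LOOP_NUM) + 1):
        groups = [tuple(directories[j:j + i])
                  for j in range(0, MATCHS_LOOP_NUM * i, i)]
        if len(set(groups)) == 1 and len(groups) >= MATCHS_LOOP_NUM:
            return True
    return False

print(looks_like_loop('/foo/foo/foo/'))      # True: 'foo' repeats three times
print(looks_like_loop('/a/b/a/b/a/b/'))      # True: the pair repeats three times
print(looks_like_loop('/images/2020/01/'))   # False
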
Example #8
def full_url_address(address, url):
    """

    :type url: Url
    :type address: str
    :rtype :Url

    """
    if address is None:
        return
    # Reject addresses whose scheme is not in ACCEPTED_PROTOCOLS.
    protocol_match = address.split(':', 1)[0] if ':' in address else ''
    protocol_match = re.match(r'^([A-Za-z0-9\-]+)$', protocol_match)
    if protocol_match and protocol_match.group(1) not in ACCEPTED_PROTOCOLS:
        return
    # TODO: improve this. Handle other protocols that should be rejected.
    if address.startswith('//'):
        # Protocol-relative address: reuse the protocol of the current page.
        address = address.replace('//', '{}://'.format(url.protocol), 1)
    if '://' not in address or address.startswith('/'):
        # Relative path: keep the current host and replace only the path.
        url = url.copy()
        url.path = address
        return url
    url = Url(address)
    if url.is_valid():
        return url
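
For intuition only: the standard library's urljoin shows comparable resolution behaviour for root-relative paths and protocol-relative '//host/...' addresses; the snippet above relies on the project's own Url class instead:

from urllib.parse import urljoin

base = 'http://example.com/files/'
print(urljoin(base, '/static/app.js'))             # http://example.com/static/app.js
print(urljoin(base, '//cdn.example.com/lib.js'))   # http://cdn.example.com/lib.js
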
Example #9
 def _get_url_info(self):
     return UrlInfo(Sessions(), Url(self.url))
Example #10
 def test_callback(self):
     with patch.object(UrlsInfo, '_get_url_info') as m:
         UrlsInfo([self.url], Sessions()).callback(len(self.url),
                                                   Url(self.url), 0)
         m.assert_called_once()
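
The test stubs out _get_url_info with unittest.mock.patch.object. A minimal standalone illustration of that pattern with a toy class (all names below are made up):

from unittest.mock import patch

class Reporter:
    def _get_url_info(self):
        return 'expensive lookup'

    def callback(self):
        return self._get_url_info()

with patch.object(Reporter, '_get_url_info') as m:
    Reporter().callback()     # hits the mock, not the real method
    m.assert_called_once()    # passes: callback called _get_url_info once
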