Example #1
    def __init__(self,
                 crawler,
                 url,
                 depth=3,
                 source=None,
                 exists=None,
                 type=None,
                 timeout=10):
        """

        :type crawler: Crawler
        :type depth: int Máxima recursión sin haber subido respecto esta url
        """
        self.flags = set()
        self.depth = depth
        if not isinstance(url, Url):
            url = Url(url)
        if url.is_valid():
            # Normalize: crawl targets are compared without query string or fragment
            url.query = ''
            url.fragment = ''
        self.url = url
        self.crawler = crawler
        self.source = source
        self.exists = exists
        self.type = type
        self.timeout = timeout
        if url.is_valid() and (not url.path or url.path == '/'):
            # An empty or root path means this url points at a directory
            self.type = 'directory'
        self.resp = None
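
A minimal usage sketch (hypothetical: the class is assumed to be named CrawlerUrl, and the real Crawler is replaced by a bare stub) showing what the constructor normalizes:

class StubCrawler:  # stand-in for the project's real Crawler
    pass

cu = CrawlerUrl(StubCrawler(), 'http://example.com/', depth=3)
print(cu.type)       # 'directory' -- an empty or '/' path marks a directory
print(cu.url.query)  # ''          -- query and fragment are stripped from valid urls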
Example #2
 def links(self, soup):
     # Resolve the href of every <a> tag against the current url
     links = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('a')
     ]
     # Meta-refresh redirects also count as links:
     # <meta http-equiv="refresh" content="...; url=...">
     metas = [
         meta for meta in soup.find_all('meta')
         if meta.attrs.get('http-equiv', '').lower() == 'refresh'
         and '=' in meta.attrs.get('content', '')
     ]
     links += [
         full_url_address(meta.attrs['content'].split('=', 1)[1],
                          self.crawler_url.url)
         for meta in metas
     ]
     for link in filter(bool, links):
         url = Url(link)
         if not url.is_valid():
             continue
         depth = self.crawler_url.depth
         # Leaving the current domain or directory consumes one depth level
         if url.domain != self.crawler_url.url.domain or \
                 not url.path.startswith(self.crawler_url.url.directory_path):
             depth -= 1
         if depth <= 0:
             continue
         self.add_url(link, depth)
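
The meta-refresh branch is easy to check in isolation; a short sketch (BeautifulSoup only, parsing a literal tag instead of a fetched page) of how the redirect target is pulled out of the content attribute:

from bs4 import BeautifulSoup

html = '<meta http-equiv="refresh" content="5; url=/next.html">'
soup = BeautifulSoup(html, 'html.parser')
metas = [
    meta for meta in soup.find_all('meta')
    if meta.attrs.get('http-equiv', '').lower() == 'refresh'
    and '=' in meta.attrs.get('content', '')
]
# split('=', 1)[1] keeps everything after the first '=': '/next.html'
print([meta.attrs['content'].split('=', 1)[1] for meta in metas])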
Example #3
 def links(self, soup):
     links = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('a')
     ]
     for link in filter(bool, links):
         url = Url(link)
         if not url.is_valid():
             continue
         depth = self.crawler_url.depth
         if url.domain != self.crawler_url.url.domain or \
                 not url.path.startswith(self.crawler_url.url.directory_path):
             depth -= 1
         if depth <= 0:
             continue
         self.add_url(link, depth)
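
This variant drops the meta-refresh handling but applies the same depth rule: the recursion budget shrinks only when a link leaves the current domain or directory. A standalone illustration of that check with plain urllib.parse (hypothetical urls, not the project's Url class):

from urllib.parse import urlparse

base = urlparse('http://example.com/dir/')
link = urlparse('http://example.com/other/page.html')
depth = 3
# same domain, but the path escapes /dir/, so one depth level is consumed
if link.netloc != base.netloc or not link.path.startswith('/dir/'):
    depth -= 1
print(depth)  # 2; a link whose depth reaches 0 would be skipped entirely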
Example #4
import re

# Url and ACCEPTED_PROTOCOLS are defined elsewhere in the project


def full_url_address(address, url):
    """

    :type url: Url
    :type address: str
    :rtype :Url

    """
    if address is None:
        return
    protocol_match = address.split(':', 1)[0] if ':' in address else ''
    protocol_match = re.match('^([A-Za-z0-9\\-]+)$', protocol_match)
    if protocol_match and protocol_match.group(1) not in ACCEPTED_PROTOCOLS:
        return
    # TODO: improve this. Accept other protocols to be rejected
    if address.startswith('//'):
        address = address.replace('//', '{}://'.format(url.protocol), 1)
    if '://' not in address or address.startswith('/'):
        url = url.copy()
        url.path = address
        return url
    url = Url(address)
    if url.is_valid():
        return url
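
A few hedged example calls, assuming a Url class like the one used above and an ACCEPTED_PROTOCOLS list limited to web schemes:

base = Url('http://example.com/dir/index.html')

full_url_address('/about', base)                    # copy of base with path '/about'
full_url_address('//cdn.example.com/app.js', base)  # inherits base's protocol
full_url_address('mailto:user@example.com', base)   # None, if mailto is not accepted
full_url_address(None, base)                        # None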