def __init__(self, crawler, url, depth=3, source=None, exists=None, type=None, timeout=10):
    """
    :type crawler: Crawler
    :type depth: int Maximum recursion allowed without having ascended above this url
    """
    self.flags = set()
    self.depth = depth
    if not isinstance(url, Url):
        url = Url(url)
    if url.is_valid():
        # Normalize: the query string and fragment are ignored when crawling.
        url.query = ''
        url.fragment = ''
    self.url = url
    self.crawler = crawler
    self.source = source
    self.exists = exists
    self.type = type
    self.timeout = timeout
    if url.is_valid() and (not url.path or url.path == '/'):
        # A url with an empty path (or just '/') is the root of a directory.
        self.type = 'directory'
    self.resp = None
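# Illustrative sketch (not from the original source): the normalization that
# __init__ performs above, reproduced with only the standard library. The
# project's own Url class is assumed to behave equivalently for these fields.
from urllib.parse import urlsplit

parts = urlsplit('http://example.com/?page=2#top')
assert parts._replace(query='', fragment='').geturl() == 'http://example.com/'
# A path that is empty or just '/' is exactly what makes __init__ classify
# the url as a 'directory'.
assert parts.path == '/'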
def links(self, soup):
    links = [
        full_url_address(link.attrs.get('href'), self.crawler_url.url)
        for link in soup.find_all('a')
    ]
    # <meta http-equiv="refresh" content="0; url=..."> redirects count as links too.
    metas = filter(
        lambda meta: meta.attrs.get('http-equiv', '').lower() == 'refresh',
        soup.find_all('meta'))
    metas = filter(lambda meta: '=' in meta.attrs.get('content', ''), metas)
    links += list(map(
        lambda meta: full_url_address(meta.attrs['content'].split('=', 1)[1],
                                      self.crawler_url.url),
        metas))
    for link in filter(bool, links):
        url = Url(link)
        if not url.is_valid():
            continue
        depth = self.crawler_url.depth
        # Leaving the original domain or the starting directory costs one depth level.
        if url.domain != self.crawler_url.url.domain or \
                not url.path.startswith(self.crawler_url.url.directory_path):
            depth -= 1
        if depth <= 0:
            continue
        self.add_url(link, depth)
# A simpler variant of links() without the <meta http-equiv="refresh"> handling above.
def links(self, soup):
    links = [
        full_url_address(link.attrs.get('href'), self.crawler_url.url)
        for link in soup.find_all('a')
    ]
    for link in filter(bool, links):
        url = Url(link)
        if not url.is_valid():
            continue
        depth = self.crawler_url.depth
        if url.domain != self.crawler_url.url.domain or \
                not url.path.startswith(self.crawler_url.url.directory_path):
            depth -= 1
        if depth <= 0:
            continue
        self.add_url(link, depth)
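# Illustrative sketch (not from the original source): the depth rule used by
# links() above, restated with only the standard library. A link keeps the
# current depth while it stays on the same domain and under the starting
# directory, and loses one level otherwise; links whose resulting depth would
# be <= 0 are skipped. The base url and depth below are assumptions for the
# example, and the project's Url.domain may be smarter than a raw netloc compare.
from urllib.parse import urlsplit

def link_depth(link, base='http://example.com/blog/', depth=3):
    base_parts, link_parts = urlsplit(base), urlsplit(link)
    same_domain = link_parts.netloc == base_parts.netloc
    under_directory = link_parts.path.startswith(base_parts.path)
    return depth if same_domain and under_directory else depth - 1

assert link_depth('http://example.com/blog/post-1') == 3  # stays under /blog/
assert link_depth('http://example.com/about') == 2        # leaves the directory
assert link_depth('http://other.example/page') == 2       # leaves the domain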
import re

def full_url_address(address, url):
    """
    :type url: Url
    :type address: str
    :rtype: Url
    """
    if address is None:
        return
    # Extract the protocol (scheme) portion, if any, and reject unsupported ones.
    protocol_match = address.split(':', 1)[0] if ':' in address else ''
    protocol_match = re.match(r'^([A-Za-z0-9\-]+)$', protocol_match)
    if protocol_match and protocol_match.group(1) not in ACCEPTED_PROTOCOLS:
        return
    # TODO: improve this. Accept other protocols to reject
    if address.startswith('//'):
        # Protocol-relative address: inherit the protocol of the base url.
        address = address.replace('//', '{}://'.format(url.protocol), 1)
    if '://' not in address or address.startswith('/'):
        # Relative address: resolve it against the base url.
        url = url.copy()
        url.path = address
        return url
    url = Url(address)
    if url.is_valid():
        return url
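# Illustrative sketch (not from the original source): how the protocol check
# in full_url_address classifies addresses. ACCEPTED_PROTOCOLS is defined
# elsewhere in the project; the ('http', 'https') value here is an assumption
# made for this example.
import re

ACCEPTED = ('http', 'https')  # assumed stand-in for ACCEPTED_PROTOCOLS
for address, rejected in [
    ('mailto:user@example.com', True),   # unsupported protocol: rejected
    ('https://example.com/a', False),    # accepted protocol
    ('/admin/', False),                  # no ':' means no protocol to check
]:
    head = address.split(':', 1)[0] if ':' in address else ''
    match = re.match(r'^([A-Za-z0-9\-]+)$', head)
    assert (match is not None and match.group(1) not in ACCEPTED) == rejected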