def links(self, soup):
    links = [
        full_url_address(link.attrs.get('href'), self.crawler_url.url)
        for link in soup.find_all('a')
    ]
    # <meta http-equiv="refresh" content="...; url=..."> tags also point to new locations
    metas = filter(
        lambda meta: meta.attrs.get('http-equiv', '').lower() == 'refresh',
        soup.find_all('meta'))
    metas = filter(lambda meta: '=' in meta.attrs.get('content', ''), metas)
    links += list(
        map(
            lambda meta: full_url_address(
                meta.attrs['content'].split('=', 1)[1], self.crawler_url.url),
            metas))
    for link in filter(bool, links):
        url = Url(link)
        if not url.is_valid():
            continue
        depth = self.crawler_url.depth
        if url.domain != self.crawler_url.url.domain or \
                not url.path.startswith(self.crawler_url.url.directory_path):
            depth -= 1
        if depth <= 0:
            continue
        self.add_url(link, depth)
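# Illustrative sketch (not part of the crawler): a <meta http-equiv="refresh">
# tag carries its target as content="<seconds>; url=<target>", so splitting on
# the first '=' yields the redirect URL that links() resolves above. The sample
# markup below is hypothetical.
from bs4 import BeautifulSoup

sample = '<meta http-equiv="refresh" content="0; url=/backup/">'
meta = BeautifulSoup(sample, 'html.parser').find('meta')
print(meta.attrs['content'].split('=', 1)[1])  # -> '/backup/'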
def get_links(self, text, soup=None):
    links = [
        full_url_address(link.attrs.get('href'), self.processor.crawler_url.url)
        for link in soup.find_all('a')
    ]
    return [Url(link) for link in links]
def get_links(self, text, soup=None): """ :param text: :param soup: :return: """ contents = list( filter(lambda x: isinstance(x, NavigableString) or is_link(x), soup.find('pre').contents)) links = [] for i, content in enumerate(contents): if not is_link(content) or '?' in content.attrs.get('href', ''): continue link = Url( full_url_address(content.attrs.get('href'), self.processor.crawler_url.url)) if i + 1 < len(contents) and isinstance(contents[i + 1], NavigableString): extra = {} text = str(contents[i + 1]) dt = DATETIME_PATTERN.findall(text) if dt: extra['created_at'] = dt[0] size = FILESIZE_PATTERN.findall(text) if size: extra['filesize'] = size[0].rstrip(' ') link.add_extra(extra) links.append(link) return links
def assets(self, soup):
    assets = [
        full_url_address(link.attrs.get('href'), self.crawler_url.url)
        for link in soup.find_all('link')
    ]
    assets += [
        full_url_address(script.attrs.get('src'), self.crawler_url.url)
        for script in soup.find_all('script')
    ]
    assets += [
        full_url_address(img.attrs.get('src'), self.crawler_url.url)
        for img in soup.find_all('img')
    ]
    for asset in filter(bool, assets):
        self.analyze_asset(asset)
        self.add_url(asset, type='asset')
def process(self, text, soup=None):
    if sys.version_info > (3,) and isinstance(text, bytes):
        text = text.decode('utf-8')
    urls = [
        full_url_address(url[0], self.crawler_url.url)
        for url in re.findall(TEXT_PLAIN_PATH_STRING_REGEX, text, re.VERBOSE)
    ]
    for url in urls:
        self.add_url(url, depth=0, type='asset')
    return urls
def process(self, text, soup=None):
    if sys.version_info > (3,) and isinstance(text, bytes):
        text = text.decode('utf-8')
    urls = [
        full_url_address(url, self.crawler_url.url)
        for url in re.findall(r': *url\(["\']?(.+?)["\']?\)', text)
    ]
    for url in urls:
        self.add_url(url, depth=0, type='asset')
    return urls
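# Illustrative sketch (not part of the crawler): the pattern above pulls the
# target out of CSS url(...) values, with or without quotes. The sample
# stylesheet text below is hypothetical.
import re

css = 'h1 { background: url("/static/bg.png") } p { background-image: url(img/border.gif) }'
print(re.findall(r': *url\(["\']?(.+?)["\']?\)', css))
# -> ['/static/bg.png', 'img/border.gif']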
def links(self, soup):
    links = [
        full_url_address(link.attrs.get('href'), self.crawler_url.url)
        for link in soup.find_all('a')
    ]
    for link in filter(bool, links):
        url = Url(link)
        if not url.is_valid():
            continue
        depth = self.crawler_url.depth
        if url.domain != self.crawler_url.url.domain or \
                not url.path.startswith(self.crawler_url.url.directory_path):
            depth -= 1
        if depth <= 0:
            continue
        self.add_url(link, depth)
def __init__(self, response, crawler_url):
    super(ProcessRedirect, self).__init__(response, crawler_url)
    self.redirector = full_url_address(response.headers.get('Location'),
                                       self.crawler_url.url)
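# Illustrative sketch (not part of the crawler): a Location header may be
# relative, so it must be resolved against the requested URL. full_url_address
# is assumed here to behave like urllib.parse.urljoin; the sample values are
# hypothetical.
from urllib.parse import urljoin

print(urljoin('http://example.com/admin/', 'login.php'))
# -> 'http://example.com/admin/login.php'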