def links(self):
    """Yields all links in the page"""
    # CHANGED from PIP original:
    # use HTMLParser instead of re
    # and store data-requires-python
    class AnchorParser(html_parser.HTMLParser, object):

        def __init__(self, *args, **kwargs):
            super(AnchorParser, self).__init__(*args, **kwargs)
            self.anchors = []

        def handle_starttag(self, tag, attrs):
            # Record the full attribute dict of every <a> tag with an href.
            if tag != 'a':
                return
            for key, value in attrs:
                if key == 'href':
                    self.anchors.append(dict(attrs))
                    break

    parser = AnchorParser()
    parser.feed(self.content)
    parser.close()

    for anchor in parser.anchors:
        url = anchor['href']
        # CHANGED from PIP original: catch parsing errors
        try:
            url = self.clean_link(urljoin(self.base_url, url))
        except ValueError:
            continue
        pyrequire = anchor.get('data-requires-python')
        yield Link(url, self, requires_python=pyrequire)
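
# A minimal, self-contained sketch of the anchor-collection technique used
# above, assuming Python 3's html.parser module (the method itself relies on
# the surrounding module's html_parser import plus self.content and
# self.base_url). The sample HTML and the DemoAnchorParser name are
# hypothetical.
from html.parser import HTMLParser


class DemoAnchorParser(HTMLParser):

    def __init__(self, *args, **kwargs):
        super(DemoAnchorParser, self).__init__(*args, **kwargs)
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        # Keep the full attribute dict of every <a> tag that carries an href;
        # HTMLParser hands attrs over as a list of (name, value) pairs with
        # entity references in the values already decoded.
        if tag != 'a':
            return
        if any(key == 'href' for key, _ in attrs):
            self.anchors.append(dict(attrs))


demo = DemoAnchorParser()
demo.feed('<a href="pkg-1.0.tar.gz" data-requires-python="&gt;=3.4">pkg</a>')
demo.close()
print(demo.anchors)
# -> [{'href': 'pkg-1.0.tar.gz', 'data-requires-python': '>=3.4'}]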
def links(self):
    """Yields all links in the page"""
    for match in self._href_re.finditer(self.content):
        url = match.group(1) or match.group(2) or match.group(3)
        # CHANGED from PIP original: catch parsing errors
        try:
            url = self.clean_link(urljoin(self.base_url, url))
        except ValueError:
            continue
        yield Link(url, self)
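
# Sketch of the regex this version depends on. Historical pip's _href_re
# (approximated here; the real pattern is defined elsewhere in the class)
# captures double-quoted, single-quoted, and bare href values in three
# separate groups, which is why the method ORs group(1) through group(3).
import re

demo_href_re = re.compile(
    r'href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))',
    re.I | re.S,
)

m = demo_href_re.search("<a href='pkg-1.0.tar.gz'>pkg</a>")
print(m.group(1) or m.group(2) or m.group(3))  # -> pkg-1.0.tar.gz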
def scraped_rel_links(self):
    """Yields homepage and download links scraped from the page text"""
    for regex in (self._homepage_re, self._download_re):
        match = regex.search(self.content)
        if not match:
            continue
        href_match = self._href_re.search(self.content, pos=match.end())
        if not href_match:
            continue
        url = href_match.group(1) or href_match.group(2) or href_match.group(3)
        if not url:
            continue
        url = self.clean_link(urljoin(self.base_url, url))
        yield Link(url, self)
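
# Sketch of the scraping strategy above: locate a page label such as
# "Home Page", then take the first href that appears after it. The label
# regex here approximates historical pip's _homepage_re; the exact patterns
# live elsewhere in the class, and the sample HTML is made up.
import re

demo_homepage_re = re.compile(r'<th>\s*home\s*page', re.I)
demo_href_re = re.compile(r'href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))',
                          re.I | re.S)

content = '<th>Home Page:</th><td><a href="https://example.org/proj">proj</a></td>'
label = demo_homepage_re.search(content)
href = demo_href_re.search(content, pos=label.end())
print(href.group(1) or href.group(2) or href.group(3))
# -> https://example.org/proj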
def explicit_rel_links(self, rels=('homepage', 'download')):
    """Yields all links with the given relations"""
    for match in self._rel_re.finditer(self.content):
        found_rels = match.group(1).lower().split()
        for rel in rels:
            if rel in found_rels:
                break
        else:
            continue
        href_match = self._href_re.search(match.group(0))
        if not href_match:
            continue
        url = href_match.group(1) or href_match.group(2) or href_match.group(3)
        url = self.clean_link(urljoin(self.base_url, url))
        yield Link(url, self)
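
# Sketch of explicit rel matching: a regex approximating historical pip's
# _rel_re captures a tag's rel attribute value; the method then checks the
# wanted relations against that value and re-searches the matched tag text
# for its href. Note that group(0) only extends through the rel attribute,
# so the href is only found when it precedes rel inside the tag.
import re

demo_rel_re = re.compile(r'''<[^>]*\srel\s*=\s*['"]?([^'">]+)['"]?''', re.I)
demo_href_re = re.compile(r'href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))',
                          re.I | re.S)

content = '<a href="https://example.org/dl" rel="download">get it</a>'
for match in demo_rel_re.finditer(content):
    if 'download' in match.group(1).lower().split():
        href = demo_href_re.search(match.group(0))
        print(href.group(1) or href.group(2) or href.group(3))
# -> https://example.org/dl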