def search_regulars(self):
    """
    Search urls inside the <A> tags of the current response.

    Relative urls (those without a network location) are first fixed
    against the current page via ``self._fix_url`` before being
    normalized.

    Returns:
        set: normalized urls collected from the ``href`` attributes.
    """
    urls = set()
    tree = XPathExtractor().get_object(self.response.raw_html)
    for link_tag in tree.xpath("//a"):
        # Skip anchors that carry no href attribute at all.
        if "href" not in link_tag.attrib:
            continue
        url = link_tag.attrib["href"]
        # No netloc means the url is relative; resolve it first.
        if not urlparse.urlparse(url).netloc:
            url = self._fix_url(url)
        urls.add(self._normalize_url(url))
    return urls
def execute(self):
    """
    Fetch the url given as the first command argument and drop into
    an embedded IPython shell with the crawler ``response`` bound in
    the user namespace.

    Exits with an error message if IPython is not installed.
    """
    try:
        import IPython
    except ImportError:
        exit_with_error("Please install the ipython console")
    url = self.args[0]
    crawler = BaseCrawler()
    response = crawler._get_response(url)
    # NOTE(review): the original also parsed the response with
    # XPathExtractor into a local that was never used (not even
    # exposed in user_ns); dropped as dead code.
    shell = IPython.Shell.IPShellEmbed(argv=[], user_ns={'response': response})
    shell()
def _highlight_nodes(self, html, nodes):
    """
    Highlights the nodes selected by the user in the current page.

    For each xpath expression in *nodes*, the first matching element
    gets SELECTED_CLASS appended to its ``class`` attribute and its
    ``id`` set to the xpath itself, then the rewritten page is
    returned as an html string.
    """
    tree = XPathExtractor().get_object(html)
    for xpath in nodes:
        matches = tree.xpath(xpath)
        # Nothing matched this expression; move on.
        if not matches:
            continue
        node = matches[0]
        existing = node.attrib.get("class", "")
        node.attrib["class"] = ("%s %s" % (existing, SELECTED_CLASS)).strip()
        node.attrib["id"] = xpath
    return etree.tostring(tree.getroot(), pretty_print=True, method="html")
def __init__(self, url_regex, url, html):
    """
    Keep the url pattern and the page url, and parse *html* into an
    xpath-queryable tree stored on the instance.
    """
    self.url = url
    self._url_regex = url_regex
    self.html_tree = XPathExtractor().get_object(html)