def process_html(self, html, path):
    """Parse an HTML document and register its body and inline styles.

    :param html: the document as UTF-8 encoded bytes.
    :param path: identifier for the document, used to key style blocks.
    :raises ParserError: if lxml cannot produce a root element.

    Side effects: appends the <body> element to ``self._bodies``,
    populates ``self._all_ids`` / ``self._all_classes`` when
    ``self.optimize_lookup`` is set, and records each <style> tag's CSS
    text in ``self.blocks`` keyed by (line number, path).
    """
    parser = etree.HTMLParser(encoding='utf-8')
    # Decode once up front so both the parse and the line search below
    # operate on str. The original decoded only for parsing and then
    # called splitlines() on the bytes, so `line.count(first_line)`
    # compared bytes against str — a TypeError on Python 3.
    html = html.decode('utf-8')
    tree = etree.fromstring(html, parser).getroottree()
    page = tree.getroot()
    if page is None:
        print(repr(html))
        raise ParserError('Could not parse the html')
    lines = html.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)
    if self.optimize_lookup:
        # Pre-collect every id and class present in the body so later
        # selector matching can cheaply skip selectors that cannot match.
        for each in body.iter():
            identifier = each.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = each.attrib.get('class')
            if classes:
                for class_ in classes.split():
                    self._all_classes.add(class_)
    for style in CSSSelector('style')(page):
        try:
            first_line = style.text.strip().splitlines()[0]
        except IndexError:
            # the <style> tag contained only whitespace
            continue
        except AttributeError:
            # style.text is None: the tag was completely empty
            continue
        # Locate the style block's first line in the raw document so the
        # block can be keyed by its 1-based source line number.
        for i, line in enumerate(lines):
            if line.count(first_line):
                key = (i + 1, path)
                self.blocks[key] = style.text
                break
def process_html(self, html, path):
    """Parse an HTML document and register its body and inline styles.

    :param html: the document as UTF-8 encoded bytes.
    :param path: identifier for the document, used to key style blocks.
    :raises ParserError: if lxml cannot produce a root element.

    Side effects: appends the <body> element to ``self._bodies``,
    populates ``self._all_ids`` / ``self._all_classes`` when
    ``self.optimize_lookup`` is set, and records each <style> tag's CSS
    text in ``self.blocks`` keyed by (line number, path).
    """
    parser = etree.HTMLParser(encoding='utf-8')
    # Decode once up front so both the parse and the line search below
    # operate on str. The original decoded only for parsing and then
    # called splitlines() on the bytes, so `line.count(first_line)`
    # compared bytes against str — a TypeError on Python 3.
    html = html.decode('utf-8')
    tree = etree.fromstring(html, parser).getroottree()
    page = tree.getroot()
    if page is None:
        print(repr(html))
        raise ParserError('Could not parse the html')
    lines = html.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)
    if self.optimize_lookup:
        # Pre-collect every id and class present in the body so later
        # selector matching can cheaply skip selectors that cannot match.
        for each in body.iter():
            identifier = each.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = each.attrib.get('class')
            if classes:
                for class_ in classes.split():
                    self._all_classes.add(class_)
    for style in CSSSelector('style')(page):
        try:
            first_line = style.text.strip().splitlines()[0]
        except IndexError:
            # the <style> tag contained only whitespace
            continue
        except AttributeError:
            # style.text is None: the tag was completely empty
            continue
        # Locate the style block's first line in the raw document so the
        # block can be keyed by its 1-based source line number.
        for i, line in enumerate(lines):
            if line.count(first_line):
                key = (i + 1, path)
                self.blocks[key] = style.text
                break
def process_html(self, html, url):
    """Parse an HTML document and register its body, inline styles and
    linked stylesheets.

    :param html: the document as a str (encoded to UTF-8 for parsing).
    :param url: the document's URL, used to key inline blocks and to
        resolve relative stylesheet hrefs.
    :raises ParserError: if lxml cannot produce a root element.

    Side effects: appends the <body> element to ``self._bodies``,
    populates ``self._all_ids`` / ``self._all_classes`` when
    ``self.optimize_lookup`` is set, records each <style> tag's CSS in
    ``self.blocks`` keyed by (line number, url), and downloads every
    linked stylesheet into ``self.blocks`` keyed by (absolute url, href).
    """
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.fromstring(html.encode('utf-8'), parser).getroottree()
    page = tree.getroot()
    if page is None:
        print(repr(html))
        raise ParserError('Could not parse the html')
    lines = html.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)
    if self.optimize_lookup:
        # Pre-collect every id and class present in the body so later
        # selector matching can cheaply skip selectors that cannot match.
        for each in body.iter():
            identifier = each.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = each.attrib.get('class')
            if classes:
                for class_ in classes.split():
                    self._all_classes.add(class_)
    for style in CSSSelector('style')(page):
        try:
            first_line = style.text.strip().splitlines()[0]
        except IndexError:
            # the <style> tag contained only whitespace
            continue
        except AttributeError:
            # style.text is None: the tag was completely empty
            continue
        # Locate the style block's first line in the raw document so the
        # block can be keyed by its 1-based source line number.
        for i, line in enumerate(lines):
            if line.count(first_line):
                key = (i + 1, url)
                self.blocks[key] = style.text
                break
    for link in CSSSelector('link')(page):
        # A <link> without an href cannot reference a stylesheet; the
        # original's unconditional link.attrib['href'] raised KeyError
        # on such tags (e.g. <link rel="dns-prefetch">).
        href = link.attrib.get('href')
        if not href:
            continue
        if (
            link.attrib.get('rel', '') == 'stylesheet' or
            href.lower().split('?')[0].endswith('.css')
        ):
            link_url = self.make_absolute_url(url, href)
            key = (link_url, href)
            self.blocks[key] = self.download(link_url)
            if self.preserve_remote_urls:
                # Rewrite relative url() references inside the fetched
                # CSS so they stay valid once the CSS is inlined.
                self.blocks[key] = self._rewrite_urls(
                    self.blocks[key],
                    link_url
                )
def process_html(self, html, url):
    """Parse an HTML document and register its body, inline styles and
    linked stylesheets.

    :param html: the document as a str (encoded to UTF-8 for parsing).
    :param url: the document's URL, used to key inline blocks and to
        resolve relative stylesheet hrefs.
    :raises ParserError: if lxml cannot produce a root element.

    Side effects: appends the <body> element to ``self._bodies``,
    populates ``self._all_ids`` / ``self._all_classes`` when
    ``self.optimize_lookup`` is set, records each <style> tag's CSS in
    ``self.blocks`` keyed by (line number, url), and downloads every
    linked stylesheet into ``self.blocks`` keyed by (absolute url, href).
    """
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.fromstring(html.encode('utf-8'), parser).getroottree()
    page = tree.getroot()
    if page is None:
        print(repr(html))
        raise ParserError('Could not parse the html')
    lines = html.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)
    if self.optimize_lookup:
        # Pre-collect every id and class present in the body so later
        # selector matching can cheaply skip selectors that cannot match.
        for each in body.iter():
            identifier = each.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = each.attrib.get('class')
            if classes:
                for class_ in classes.split():
                    self._all_classes.add(class_)
    for style in CSSSelector('style')(page):
        try:
            first_line = style.text.strip().splitlines()[0]
        except IndexError:
            # the <style> tag contained only whitespace
            continue
        except AttributeError:
            # style.text is None: the tag was completely empty
            continue
        # Locate the style block's first line in the raw document so the
        # block can be keyed by its 1-based source line number.
        for i, line in enumerate(lines):
            if line.count(first_line):
                key = (i + 1, url)
                self.blocks[key] = style.text
                break
    for link in CSSSelector('link')(page):
        # A <link> without an href cannot reference a stylesheet; the
        # original's unconditional link.attrib['href'] raised KeyError
        # on such tags (e.g. <link rel="dns-prefetch">).
        href = link.attrib.get('href')
        if not href:
            continue
        if (link.attrib.get('rel', '') == 'stylesheet' or
                href.lower().split('?')[0].endswith('.css')):
            link_url = self.make_absolute_url(url, href)
            key = (link_url, href)
            self.blocks[key] = self.download(link_url)
            if self.preserve_remote_urls:
                # Rewrite relative url() references inside the fetched
                # CSS so they stay valid once the CSS is inlined.
                self.blocks[key] = self._rewrite_urls(
                    self.blocks[key], link_url)
def process_html(self, html, url):
    """Parse an HTML document and register its body, inline styles and
    linked stylesheets.

    :param html: the document markup to parse.
    :param url: the document's URL, used to key inline blocks and to
        resolve relative stylesheet hrefs.
    :raises ParserError: if lxml cannot produce a root element.

    Side effects: appends the <body> element to ``self._bodies``,
    populates ``self._all_ids`` / ``self._all_classes`` when
    ``self.optimize_lookup`` is set, records each <style> tag's CSS in
    ``self.blocks`` keyed by (line number, url), and downloads every
    linked stylesheet into ``self.blocks`` keyed by (absolute url, href).
    """
    parser = etree.HTMLParser()
    tree = etree.fromstring(html, parser).getroottree()
    page = tree.getroot()
    if page is None:
        # print(x) with a single argument behaves the same under
        # Python 2's print statement and Python 3's print function.
        print(repr(html))
        raise ParserError("Could not parse the html")
    lines = html.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)
    if self.optimize_lookup:
        # Pre-collect every id and class present in the body so later
        # selector matching can cheaply skip selectors that cannot match.
        for each in body.iter():
            # renamed from `id`, which shadowed the builtin
            identifier = each.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = each.attrib.get('class')
            if classes:
                for class_ in classes.split():
                    self._all_classes.add(class_)
    for style in CSSSelector('style')(page):
        try:
            first_line = style.text.strip().splitlines()[0]
        except IndexError:
            # the <style> tag contained only whitespace; the original
            # crashed here instead of skipping the tag
            continue
        except AttributeError:
            # style.text is None: the tag was completely empty
            continue
        # Locate the style block's first line in the raw document so the
        # block can be keyed by its 1-based source line number.
        for i, line in enumerate(lines):
            if line.count(first_line):
                key = (i + 1, url)
                self.blocks[key] = style.text
                break
    for link in CSSSelector('link')(page):
        # A <link> without an href cannot reference a stylesheet; the
        # original's unconditional link.attrib['href'] raised KeyError
        # on such tags (e.g. <link rel="dns-prefetch">).
        href = link.attrib.get('href')
        if not href:
            continue
        if (
            link.attrib.get('rel', '') == 'stylesheet' or
            href.lower().split('?')[0].endswith('.css')
        ):
            link_url = self.make_absolute_url(url, href)
            key = (link_url, href)
            self.blocks[key] = self._download(link_url)
            if self.preserve_remote_urls:
                # Rewrite relative url() references inside the fetched
                # CSS so they stay valid once the CSS is inlined.
                self.blocks[key] = self._rewrite_urls(
                    self.blocks[key],
                    link_url
                )