예제 #1
0
    def process_html(self, html, path):
        """Parse an HTML document and record its body, ids, classes and inline CSS.

        The ``<body>`` element is appended to ``self._bodies``.  When
        ``self.optimize_lookup`` is true, every ``id`` and ``class`` value
        found in the body is added to ``self._all_ids`` /
        ``self._all_classes``.  The text of each non-empty ``<style>`` tag is
        stored in ``self.blocks`` under the key ``(line_number, path)``.

        Args:
            html: UTF-8 encoded markup; it is decoded with
                ``.decode('utf-8')`` before being parsed.
            path: Identifier of the document, used as the second element of
                each key written to ``self.blocks``.

        Raises:
            ParserError: If parsing yields no root element.
            ValueError: If the document does not have exactly one ``<body>``
                (raised by the single-element unpacking).
        """
        parser = etree.HTMLParser(encoding='utf-8')
        tree = etree.fromstring(html.decode('utf-8'), parser).getroottree()
        page = tree.getroot()

        if page is None:
            print(repr(html))
            raise ParserError('Could not parse the html')

        # NOTE(review): ``lines`` comes from the undecoded ``html`` while
        # ``first_line`` below comes from the parsed tree -- confirm both are
        # the same string type on the interpreter this runs on.
        lines = html.splitlines()
        body, = CSSSelector('body')(page)
        self._bodies.append(body)
        if self.optimize_lookup:
            for each in body.iter():
                identifier = each.attrib.get('id')
                if identifier:
                    self._all_ids.add(identifier)
                classes = each.attrib.get('class')
                if classes:
                    for class_ in classes.split():
                        self._all_classes.add(class_)

        for style in CSSSelector('style')(page):
            # ``style.text`` is None for a completely empty tag and strips to
            # '' for a whitespace-only one; neither has a first line to
            # search for, so skip them instead of crashing.
            stripped = (style.text or '').strip()
            if not stripped:
                continue
            first_line = stripped.splitlines()[0]
            # Locate the block in the source by its first line; the key holds
            # the 1-based number of the first source line containing it.
            for i, line in enumerate(lines):
                if line.count(first_line):
                    key = (i + 1, path)
                    self.blocks[key] = style.text
                    break
예제 #2
0
	def process_html(self, html, path):
		"""Parse an HTML document and record its body, ids, classes and inline CSS.

		The ``<body>`` element is appended to ``self._bodies``.  When
		``self.optimize_lookup`` is true, every ``id`` and ``class`` value
		found in the body is added to ``self._all_ids`` /
		``self._all_classes``.  The text of each non-empty ``<style>`` tag is
		stored in ``self.blocks`` under the key ``(line_number, path)``.

		Args:
			html: UTF-8 encoded markup; it is decoded with
				``.decode('utf-8')`` before being parsed.
			path: Identifier of the document, used as the second element of
				each key written to ``self.blocks``.

		Raises:
			ParserError: If parsing yields no root element.
			ValueError: If the document does not have exactly one ``<body>``
				(raised by the single-element unpacking).
		"""
		parser = etree.HTMLParser(encoding='utf-8')
		tree = etree.fromstring(html.decode('utf-8'), parser).getroottree()
		page = tree.getroot()

		if page is None:
			print(repr(html))
			raise ParserError('Could not parse the html')

		lines = html.splitlines()
		body, = CSSSelector('body')(page)
		self._bodies.append(body)
		if self.optimize_lookup:
			for each in body.iter():
				identifier = each.attrib.get('id')
				if identifier:
					self._all_ids.add(identifier)
				classes = each.attrib.get('class')
				if classes:
					for class_ in classes.split():
						self._all_classes.add(class_)

		for style in CSSSelector('style')(page):
			# ``style.text`` is None for a completely empty tag and strips to
			# '' for a whitespace-only one; neither has a first line to
			# search for, so skip them instead of crashing.
			stripped = (style.text or '').strip()
			if not stripped:
				continue
			first_line = stripped.splitlines()[0]
			# Locate the block in the source by its first line; the key holds
			# the 1-based number of the first source line containing it.
			for i, line in enumerate(lines):
				if line.count(first_line):
					key = (i + 1, path)
					self.blocks[key] = style.text
					break
예제 #3
0
파일: processor.py 프로젝트: alanjds/mincss
    def process_html(self, html, url):
        """Parse an HTML page and collect its inline and linked stylesheets.

        The ``<body>`` element is appended to ``self._bodies``.  When
        ``self.optimize_lookup`` is true, every ``id`` and ``class`` value
        found in the body is added to ``self._all_ids`` /
        ``self._all_classes``.  Stylesheets are stored in ``self.blocks``:

        * inline ``<style>`` text under ``(line_number, url)``;
        * downloaded ``<link>`` content under ``(absolute_url, href)``.

        Args:
            html: Page markup as text; it is encoded to UTF-8 for parsing.
            url: Address of the page, used in the inline-style keys and as
                the base passed to ``self.make_absolute_url``.

        Raises:
            ParserError: If parsing yields no root element.
            ValueError: If the document does not have exactly one ``<body>``
                (raised by the single-element unpacking).
        """
        parser = etree.HTMLParser(encoding='utf-8')
        tree = etree.fromstring(html.encode('utf-8'), parser).getroottree()
        page = tree.getroot()

        if page is None:
            print(repr(html))
            raise ParserError('Could not parse the html')

        lines = html.splitlines()
        body, = CSSSelector('body')(page)
        self._bodies.append(body)
        if self.optimize_lookup:
            for each in body.iter():
                identifier = each.attrib.get('id')
                if identifier:
                    self._all_ids.add(identifier)
                classes = each.attrib.get('class')
                if classes:
                    for class_ in classes.split():
                        self._all_classes.add(class_)

        for style in CSSSelector('style')(page):
            # ``style.text`` is None when the tag holds absolutely nothing
            # and strips to '' when it holds only whitespace; both cases are
            # skipped because there is no first line to look for.
            stripped = (style.text or '').strip()
            if not stripped:
                continue
            first_line = stripped.splitlines()[0]
            # Locate the block in the source by its first line; the key holds
            # the 1-based number of the first source line containing it.
            for i, line in enumerate(lines):
                if line.count(first_line):
                    key = (i + 1, url)
                    self.blocks[key] = style.text
                    break

        for link in CSSSelector('link')(page):
            href = link.attrib.get('href')
            if href is None:
                # A <link> without an href has nothing to download; skipping
                # it avoids a KeyError on the attribute lookup.
                continue
            if (
                link.attrib.get('rel', '') == 'stylesheet' or
                href.lower().split('?')[0].endswith('.css')
            ):
                link_url = self.make_absolute_url(url, href)
                key = (link_url, href)
                self.blocks[key] = self.download(link_url)
                if self.preserve_remote_urls:
                    self.blocks[key] = self._rewrite_urls(
                        self.blocks[key],
                        link_url
                    )
예제 #4
0
    def process_html(self, html, url):
        """Extract the body, selectors and stylesheets of one HTML page.

        Side effects on the instance:

        * ``self._bodies`` gains the page's ``<body>`` element;
        * with ``self.optimize_lookup`` enabled, ``self._all_ids`` and
          ``self._all_classes`` gain every ``id`` / ``class`` value used in
          the body;
        * ``self.blocks`` gains one entry per non-empty inline ``<style>``
          (key ``(line_number, url)``) and one per stylesheet ``<link>``
          (key ``(absolute_url, href)``, value from ``self.download``).

        Args:
            html: Page markup as text; it is encoded to UTF-8 for parsing.
            url: Address of the page, used in the inline-style keys and as
                the base passed to ``self.make_absolute_url``.

        Raises:
            ParserError: If parsing yields no root element.
            ValueError: If the document does not have exactly one ``<body>``
                (raised by the single-element unpacking).
        """
        parser = etree.HTMLParser(encoding='utf-8')
        tree = etree.fromstring(html.encode('utf-8'), parser).getroottree()
        page = tree.getroot()

        if page is None:
            print(repr(html))
            raise ParserError('Could not parse the html')

        lines = html.splitlines()
        body, = CSSSelector('body')(page)
        self._bodies.append(body)
        if self.optimize_lookup:
            for each in body.iter():
                identifier = each.attrib.get('id')
                if identifier:
                    self._all_ids.add(identifier)
                classes = each.attrib.get('class')
                if classes:
                    for class_ in classes.split():
                        self._all_classes.add(class_)

        for style in CSSSelector('style')(page):
            # An inline style tag may hold nothing at all (``text`` is None)
            # or only whitespace (strips to ''); neither has a first line to
            # match against the source, so both are skipped.
            stripped = (style.text or '').strip()
            if not stripped:
                continue
            first_line = stripped.splitlines()[0]
            # The key holds the 1-based number of the first source line that
            # contains the style's first line.
            for i, line in enumerate(lines):
                if line.count(first_line):
                    key = (i + 1, url)
                    self.blocks[key] = style.text
                    break

        for link in CSSSelector('link')(page):
            href = link.attrib.get('href')
            if href is None:
                # No href means nothing to fetch; skipping also avoids the
                # KeyError a direct ``attrib['href']`` lookup would raise.
                continue
            if (link.attrib.get('rel', '') == 'stylesheet'
                    or href.lower().split('?')[0].endswith('.css')):
                link_url = self.make_absolute_url(url, href)
                key = (link_url, href)
                self.blocks[key] = self.download(link_url)
                if self.preserve_remote_urls:
                    self.blocks[key] = self._rewrite_urls(
                        self.blocks[key], link_url)
예제 #5
0
    def process_html(self, html, url):
        """Parse an HTML page and collect its inline and linked stylesheets.

        The ``<body>`` element is appended to ``self._bodies``.  When
        ``self.optimize_lookup`` is true, every ``id`` and ``class`` value
        found in the body is added to ``self._all_ids`` /
        ``self._all_classes``.  Stylesheets are stored in ``self.blocks``:

        * inline ``<style>`` text under ``(line_number, url)``;
        * content returned by ``self._download`` for each stylesheet
          ``<link>`` under ``(absolute_url, href)``.

        Args:
            html: Page markup, handed to the parser unchanged.
            url: Address of the page, used in the inline-style keys and as
                the base passed to ``self.make_absolute_url``.

        Raises:
            ParserError: If parsing yields no root element.
            ValueError: If the document does not have exactly one ``<body>``
                (raised by the single-element unpacking).
        """
        parser = etree.HTMLParser()
        tree = etree.fromstring(html, parser).getroottree()
        page = tree.getroot()

        if page is None:
            # Parenthesised single-argument form prints the same on Python 2
            # and is also valid syntax on Python 3.
            print(repr(html))
            raise ParserError("Could not parse the html")

        lines = html.splitlines()
        body, = CSSSelector('body')(page)
        self._bodies.append(body)
        if self.optimize_lookup:
            for each in body.iter():
                identifier = each.attrib.get('id')
                if identifier:
                    self._all_ids.add(identifier)
                classes = each.attrib.get('class')
                if classes:
                    for class_ in classes.split():
                        self._all_classes.add(class_)

        for style in CSSSelector('style')(page):
            # ``style.text`` is None for a completely empty tag and strips to
            # '' for a whitespace-only one; neither has a first line to
            # search for, so skip them instead of crashing.
            stripped = (style.text or '').strip()
            if not stripped:
                continue
            first_line = stripped.splitlines()[0]
            # Locate the block in the source by its first line; the key holds
            # the 1-based number of the first source line containing it.
            for i, line in enumerate(lines):
                if line.count(first_line):
                    key = (i + 1, url)
                    self.blocks[key] = style.text
                    break

        for link in CSSSelector('link')(page):
            href = link.attrib.get('href')
            if href is None:
                # A <link> without an href has nothing to download; skipping
                # it avoids a KeyError on the attribute lookup.
                continue
            if (
                link.attrib.get('rel', '') == 'stylesheet' or
                href.lower().split('?')[0].endswith('.css')
            ):
                link_url = self.make_absolute_url(url, href)
                key = (link_url, href)
                self.blocks[key] = self._download(link_url)
                if self.preserve_remote_urls:
                    self.blocks[key] = self._rewrite_urls(
                        self.blocks[key],
                        link_url
                    )