示例#1
0
    def perform(self,document,sourceHTML,sourceURL):
        """Inline the document's CSS into per-element ``style`` attributes.

        Gathers the CSS text of every ``<style>`` element and every linked
        stylesheet in ``document``, removes those elements from the tree,
        resolves the styles through ``self.getView`` and writes the result
        onto each element. If ``self.mediaRules`` is set it is re-inserted as
        a ``<style>`` element at the top of ``<body>``. When ``sourceURL`` is
        given, ``href``/``src`` attributes are made absolute against it.
        The serialised markup is stored in ``self.convertedHTML``.

        Args:
            document: parsed lxml tree of the page; modified in place.
            sourceHTML: not used by this method.
            sourceURL: base URL for resolving relative references; may be
                empty or None. A trailing ``/`` is appended when missing.

        Returns:
            ``self``, so that calls can be chained.

        Raises:
            IOError: if a linked stylesheet has no ``href`` or cannot be
                fetched or processed.
        """
        aggregateCSS = ""
        if sourceURL and not sourceURL.endswith('/'):
            sourceURL += '/'

        # retrieve CSS rel links and style elements and aggregate into one string
        CSSRelSelector = CSSSelector("link[rel=stylesheet],link[rel=StyleSheet],link[rel=STYLESHEET],style,Style")
        matching = CSSRelSelector.evaluate(document)
        for element in matching:
            if element.tag.lower() == 'style':
                # element.text is None for an empty <style></style>
                csstext = element.text or ''
                if sourceURL:
                    csstext = fix_relative_urls(csstext, sourceURL)
            else:
                href = element.get("href")
                try:
                    if not href:
                        raise ValueError('stylesheet link has no href')
                    csspath = href
                    if sourceURL:
                        csspath = urlparse.urljoin(sourceURL, csspath)
                    r = requests.get(csspath)
                    # an HTTP error page (404, 500, ...) is not a stylesheet
                    r.raise_for_status()
                    csstext = fix_relative_urls(r.text, csspath)
                except Exception:
                    # %-formatting keeps the message buildable when href is None
                    raise IOError('The stylesheet %s could not be found' % href)

            aggregateCSS += csstext
            element.getparent().remove(element)

        #convert  document to a style dictionary compatible with etree
        styledict = self.getView(document, aggregateCSS)

        #set inline style attribute if not one of the elements not worth styling
        ignoreList=['html','head','title','meta','link','script']
        for element, style in styledict.items():
            if element.tag not in ignoreList:
                v = style.getCssText(separator=u'')
                element.set('style', v)

        if self.mediaRules:
            bodyTag = document.find('body')
            if bodyTag is not None:
                styleTag = etree.Element('style', type="text/css")
                styleTag.text = self.mediaRules
                bodyTag.insert(0, styleTag)

        if sourceURL:
            for attr in ('href', 'src'):
                for item in document.xpath("//@%s" % attr):
                    parent = item.getparent()
                    # in-page anchors must stay relative
                    if attr == 'href' and parent.attrib[attr].startswith('#'):
                        continue
                    parent.attrib[attr] = urlparse.urljoin(sourceURL, parent.attrib[attr])

        #convert tree back to plain text html
        self.convertedHTML = etree.tostring(document, method="xml", pretty_print=True,encoding='UTF-8')
        # strip '&#13;' (the character reference for a carriage return) from the output
        self.convertedHTML= self.convertedHTML.replace('&#13;', '')

        return self
示例#2
0
	def perform(self,document,sourceHTML,sourceURL):
		"""Inline the document's CSS into per-element ``style`` attributes.

		Downloads every linked stylesheet, appends the text of every
		``<style>`` element, removes those elements from ``document``,
		resolves the styles through ``self.getView`` and writes the result
		onto each element. The serialised markup is stored in
		``self.convertedHTML``.

		Args:
			document: parsed lxml tree of the page; modified in place.
			sourceHTML: not used by this method.
			sourceURL: URL of the page, used to resolve relative stylesheet
				links; may be empty or None.

		Returns:
			``self``, so that calls can be chained.

		Raises:
			IOError: if a linked stylesheet has no ``href`` or cannot be read.
		"""
		aggregateCSS=""

		# retrieve CSS rel links from html pasted and aggregate into one string
		CSSRelSelector = CSSSelector("link[rel=stylesheet],link[rel=StyleSheet],link[rel=STYLESHEET]")
		matching = CSSRelSelector.evaluate(document)
		for element in matching:
			href=element.get("href")
			try:
				if not href:
					raise ValueError('stylesheet link has no href')
				csspath=href
				if sourceURL:
					# urljoin leaves absolute hrefs alone and resolves relative
					# ones against the page URL, keeping its port and directory
					csspath=urlparse.urljoin(sourceURL, href)
				f=urllib.urlopen(csspath)
				try:
					aggregateCSS+=f.read()
				finally:
					f.close()
				element.getparent().remove(element)
			except Exception:
				# %-formatting keeps the message buildable when href is None
				raise IOError('The stylesheet %s could not be found' % href)

		#include inline style elements
		CSSStyleSelector = CSSSelector("style,Style")
		matching = CSSStyleSelector.evaluate(document)
		for element in matching:
			# element.text is None for an empty <style></style>
			aggregateCSS+=element.text or ''
			element.getparent().remove(element)

		#convert  document to a style dictionary compatible with etree
		styledict = self.getView(document, aggregateCSS)

		#set inline style attribute if not one of the elements not worth styling
		ignoreList=['html','head','title','meta','link','script']
		for element, style in styledict.items():
			if element.tag not in ignoreList:
				v = style.getCssText(separator=u'')
				element.set('style', v)

		#convert tree back to plain text html
		self.convertedHTML = etree.tostring(document, method="xml", pretty_print=True,encoding='UTF-8')
		# strip '&#13;' (the character reference for a carriage return) from the output
		self.convertedHTML= self.convertedHTML.replace('&#13;', '')

		return self
示例#3
0
    def perform(self, document, sourceHTML, sourceURL):
        """Inline the document's CSS into per-element ``style`` attributes.

        Downloads every linked stylesheet, appends the text of every
        ``<style>`` element, removes those elements from ``document``,
        resolves the styles through ``self.getView`` and writes the result
        onto each element. The serialised markup is stored in
        ``self.convertedHTML``.

        Args:
            document: parsed lxml tree of the page; modified in place.
            sourceHTML: not used by this method.
            sourceURL: URL of the page, used to resolve relative stylesheet
                links; may be empty or None.

        Returns:
            ``self``, so that calls can be chained.

        Raises:
            IOError: if a linked stylesheet has no ``href`` or cannot be read.
        """
        aggregateCSS = ""

        # retrieve CSS rel links from html pasted and aggregate into one string
        CSSRelSelector = CSSSelector(
            "link[rel=stylesheet],link[rel=StyleSheet],link[rel=STYLESHEET]")
        for element in CSSRelSelector.evaluate(document):
            href = element.get("href")
            try:
                if not href:
                    raise ValueError('stylesheet link has no href')
                csspath = href
                if sourceURL:
                    # urljoin leaves absolute hrefs alone and resolves
                    # relative ones against the page URL, keeping its port
                    # and directory
                    csspath = urlparse.urljoin(sourceURL, href)
                f = urllib.urlopen(csspath)
                try:
                    aggregateCSS += f.read()
                finally:
                    f.close()
                element.getparent().remove(element)
            except Exception:
                # %-formatting keeps the message buildable when href is None
                raise IOError('The stylesheet %s could not be found' % href)

        #include inline style elements
        CSSStyleSelector = CSSSelector("style,Style")
        for element in CSSStyleSelector.evaluate(document):
            # element.text is None for an empty <style></style>
            aggregateCSS += element.text or ''
            element.getparent().remove(element)

        #convert  document to a style dictionary compatible with etree
        styledict = self.getView(document, aggregateCSS)

        #set inline style attribute if not one of the elements not worth styling
        ignoreList = ['html', 'head', 'title', 'meta', 'link', 'script']
        for element, style in styledict.items():
            if element.tag not in ignoreList:
                v = style.getCssText(separator=u'')
                element.set('style', v)

        #convert tree back to plain text html
        self.convertedHTML = etree.tostring(document,
                                            method="xml",
                                            pretty_print=True,
                                            encoding='UTF-8')
        # strip '&#13;' (the character reference for a carriage return)
        self.convertedHTML = self.convertedHTML.replace('&#13;', '')

        return self
示例#4
0
    def getView(self, document, css):
        """Resolve ``css`` against ``document`` into one declaration per element.

        Every style rule in ``css`` is matched against ``document``; for each
        matched element the winning value of each property is chosen by
        priority (``!important``) and then selector specificity, with the
        element's existing inline ``style`` seeded at specificity
        ``(1, 0, 0, 0)``.

        While doing so the method also records, from ``css_compliance.csv``
        located next to this module, which clients do not (or only
        partially) support each used property.

        Side effects:
            * ``self.CSSUnsupportErrors[property]`` gets a list of client
              names marked "N" or "P" in the CSV for that property.
            * ``self.CSSErrors`` collects the messages of selectors that
              raised ``ExpressionError``.
            * ``self.supportPercentage`` is set when at least one failure
              was counted.

        Args:
            document: parsed lxml tree the selectors are evaluated on.
            css: stylesheet text to apply.

        Returns:
            dict mapping each matched element to its
            ``cssutils.css.CSSStyleDeclaration``.
        """
        view = {}
        specificities = {}
        supportratios = {}
        supportFailRate = 0
        supportTotalRate = 0
        compliance = {}
        clientCount = 0

        #load CSV containing css property client support into dict
        csvPath = os.path.join(os.path.dirname(__file__), "css_compliance.csv")
        with open(csvPath) as csvFile:
            for row in csv.DictReader(csvFile, delimiter=','):
                #count clients so we can calculate an overall support percentage later
                clientCount = len(row)
                compliance[row['property'].strip()] = dict(row)

        #decrement client count to account for first col which is property name
        #(left at 0 when the CSV contained no rows)
        if clientCount:
            clientCount -= 1

        sheet = cssutils.parseString(css)

        rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
        for rule in rules:

            for selector in rule.selectorList:
                try:
                    cssselector = CSSSelector(selector.selectorText)
                    matching = cssselector.evaluate(document)

                    for element in matching:
                        # add styles for all matching DOM elements
                        if element not in view:
                            # add initial
                            view[element] = cssutils.css.CSSStyleDeclaration()
                            specificities[element] = {}

                            # add inline style if present
                            inlinestyletext = element.get('style')
                            if inlinestyletext:
                                inlinestyle = cssutils.css.CSSStyleDeclaration(cssText=inlinestyletext)
                                for p in inlinestyle:
                                    # set inline style specificity
                                    view[element].setProperty(p)
                                    specificities[element][p.name] = (1, 0, 0, 0)

                        for p in rule.style:
                            #create supportratio dic item for this property
                            if p.name not in supportratios:
                                supportratios[p.name] = {'usage': 0, 'failedClients': 0}
                            #increment usage
                            supportratios[p.name]['usage'] += 1

                            # properties absent from the CSV are simply not rated
                            if p.name not in self.CSSUnsupportErrors:
                                for client, support in compliance.get(p.name, {}).items():
                                    if support == "N" or support == "P":
                                        #increment client failure count for this property
                                        supportratios[p.name]['failedClients'] += 1
                                        if support == "P":
                                            label = client + ' (partial support)'
                                        else:
                                            label = client
                                        if p.name not in self.CSSUnsupportErrors:
                                            self.CSSUnsupportErrors[p.name] = [label]
                                        else:
                                            self.CSSUnsupportErrors[p.name].append(label)

                            # update styles
                            if p not in view[element]:
                                view[element].setProperty(p.name, p.value, p.priority)
                                specificities[element][p.name] = selector.specificity
                            else:
                                sameprio = (p.priority == view[element].getPropertyPriority(p.name))
                                if (not sameprio and bool(p.priority)) or (sameprio and selector.specificity >= specificities[element][p.name]):
                                    # later, more specific or higher prio
                                    view[element].setProperty(p.name, p.value, p.priority)
                                    # remember the winner's specificity, otherwise a later
                                    # selector that only beats the first match could
                                    # wrongly override this one
                                    specificities[element][p.name] = selector.specificity

                except ExpressionError as error:
                    message = str(error)
                    if message not in self.CSSErrors:
                        self.CSSErrors.append(message)

        for propvals in supportratios.values():
            supportFailRate += (propvals['usage']) * int(propvals['failedClients'])
            supportTotalRate += int(propvals['usage']) * clientCount

        if supportFailRate and supportTotalRate:
            self.supportPercentage = 100 - ((float(supportFailRate) / float(supportTotalRate)) * 100)
        return view
示例#5
0
    def getView(self, document, css):
        """Resolve ``css`` against ``document`` into one declaration per element.

        Every style rule in ``css`` is matched against ``document``; for each
        matched element the winning value of each property is chosen by
        priority (``!important``) and then selector specificity, with the
        element's existing inline ``style`` seeded at specificity
        ``(1, 0, 0, 0)``.

        While doing so the method also records, from ``css_compliance.csv``
        located next to this module, which clients do not (or only
        partially) support each used property.

        Side effects:
            * ``self.CSSUnsupportErrors[property]`` gets a list of client
              names marked "N" or "P" in the CSV for that property.
            * ``self.CSSErrors`` collects the messages of selectors that
              raised ``ExpressionError``.
            * ``self.supportPercentage`` is set when at least one failure
              was counted.

        Args:
            document: parsed lxml tree the selectors are evaluated on.
            css: stylesheet text to apply.

        Returns:
            dict mapping each matched element to its
            ``cssutils.css.CSSStyleDeclaration``.
        """
        view = {}
        specificities = {}
        supportratios = {}
        supportFailRate = 0
        supportTotalRate = 0
        compliance = {}
        clientCount = 0

        #load CSV containing css property client support into dict
        csvPath = os.path.join(os.path.dirname(__file__), "css_compliance.csv")
        with open(csvPath) as csvFile:
            for row in csv.DictReader(csvFile, delimiter=','):
                #count clients so we can calculate an overall support percentage later
                clientCount = len(row)
                compliance[row['property'].strip()] = dict(row)

        #decrement client count to account for first col which is property name
        #(left at 0 when the CSV contained no rows)
        if clientCount:
            clientCount -= 1

        sheet = cssutils.parseString(css)

        rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
        for rule in rules:

            for selector in rule.selectorList:
                try:
                    cssselector = CSSSelector(selector.selectorText)
                    matching = cssselector.evaluate(document)

                    for element in matching:
                        # add styles for all matching DOM elements
                        if element not in view:
                            # add initial
                            view[element] = cssutils.css.CSSStyleDeclaration()
                            specificities[element] = {}

                            # add inline style if present
                            inlinestyletext = element.get('style')
                            if inlinestyletext:
                                inlinestyle = cssutils.css.CSSStyleDeclaration(
                                    cssText=inlinestyletext)
                                for p in inlinestyle:
                                    # set inline style specificity
                                    view[element].setProperty(p)
                                    specificities[element][p.name] = (1, 0, 0, 0)

                        for p in rule.style:
                            #create supportratio dic item for this property
                            if p.name not in supportratios:
                                supportratios[p.name] = {
                                    'usage': 0,
                                    'failedClients': 0
                                }
                            #increment usage
                            supportratios[p.name]['usage'] += 1

                            # properties absent from the CSV are simply not rated
                            if p.name not in self.CSSUnsupportErrors:
                                clientSupport = compliance.get(p.name, {})
                                for client, support in clientSupport.items():
                                    if support == "N" or support == "P":
                                        #increment client failure count for this property
                                        supportratios[p.name]['failedClients'] += 1
                                        if support == "P":
                                            label = client + ' (partial support)'
                                        else:
                                            label = client
                                        if p.name not in self.CSSUnsupportErrors:
                                            self.CSSUnsupportErrors[p.name] = [label]
                                        else:
                                            self.CSSUnsupportErrors[p.name].append(label)

                            # update styles
                            if p not in view[element]:
                                view[element].setProperty(
                                    p.name, p.value, p.priority)
                                specificities[element][p.name] = selector.specificity
                            else:
                                sameprio = (p.priority ==
                                            view[element].getPropertyPriority(p.name))
                                if (not sameprio and bool(p.priority)) or (
                                        sameprio and selector.specificity >=
                                        specificities[element][p.name]):
                                    # later, more specific or higher prio
                                    view[element].setProperty(
                                        p.name, p.value, p.priority)
                                    # remember the winner's specificity, otherwise
                                    # a later selector that only beats the first
                                    # match could wrongly override this one
                                    specificities[element][p.name] = selector.specificity

                except ExpressionError as error:
                    message = str(error)
                    if message not in self.CSSErrors:
                        self.CSSErrors.append(message)

        for propvals in supportratios.values():
            supportFailRate += (propvals['usage']) * int(
                propvals['failedClients'])
            supportTotalRate += int(propvals['usage']) * clientCount

        if supportFailRate and supportTotalRate:
            self.supportPercentage = 100 - (
                (float(supportFailRate) / float(supportTotalRate)) * 100)
        return view
示例#6
0
    def perform(self, document, sourceHTML, sourceURL):
        """Inline the document's CSS into per-element ``style`` attributes.

        Gathers the CSS text of every ``<style>`` element and every linked
        stylesheet in ``document``, removes those elements from the tree,
        resolves the styles through ``self.getView`` and writes the result
        onto each element. If ``self.mediaRules`` is set it is re-inserted as
        a ``<style>`` element at the top of ``<body>``. When ``sourceURL`` is
        given, ``href``/``src`` attributes are made absolute against it.
        The serialised markup is stored in ``self.convertedHTML``.

        Args:
            document: parsed lxml tree of the page; modified in place.
            sourceHTML: not used by this method.
            sourceURL: base URL for resolving relative references; may be
                empty or None. A trailing ``/`` is appended when missing.

        Returns:
            ``self``, so that calls can be chained.

        Raises:
            IOError: if a linked stylesheet has no ``href`` or cannot be
                fetched or processed.
        """
        aggregateCSS = ""
        if sourceURL and not sourceURL.endswith('/'):
            sourceURL += '/'

        # retrieve CSS rel links and style elements and aggregate into one string
        CSSRelSelector = CSSSelector(
            "link[rel=stylesheet],link[rel=StyleSheet],link[rel=STYLESHEET],style,Style"
        )
        for element in CSSRelSelector.evaluate(document):
            if element.tag.lower() == 'style':
                # element.text is None for an empty <style></style>
                csstext = element.text or ''
                if sourceURL:
                    csstext = fix_relative_urls(csstext, sourceURL)
            else:
                href = element.get("href")
                try:
                    if not href:
                        raise ValueError('stylesheet link has no href')
                    csspath = href
                    if sourceURL:
                        csspath = urlparse.urljoin(sourceURL, csspath)
                    r = requests.get(csspath)
                    # an HTTP error page (404, 500, ...) is not a stylesheet
                    r.raise_for_status()
                    csstext = fix_relative_urls(r.text, csspath)
                except Exception:
                    # %-formatting keeps the message buildable when href is
                    # None
                    raise IOError('The stylesheet %s could not be found' %
                                  href)

            aggregateCSS += csstext
            element.getparent().remove(element)

        #convert  document to a style dictionary compatible with etree
        styledict = self.getView(document, aggregateCSS)

        #set inline style attribute if not one of the elements not worth styling
        ignoreList = ['html', 'head', 'title', 'meta', 'link', 'script']
        for element, style in styledict.items():
            if element.tag not in ignoreList:
                v = style.getCssText(separator=u'')
                element.set('style', v)

        if self.mediaRules:
            bodyTag = document.find('body')
            if bodyTag is not None:
                styleTag = etree.Element('style', type="text/css")
                styleTag.text = self.mediaRules
                bodyTag.insert(0, styleTag)

        if sourceURL:
            for attr in ('href', 'src'):
                for item in document.xpath("//@%s" % attr):
                    parent = item.getparent()
                    # in-page anchors must stay relative
                    if attr == 'href' and parent.attrib[attr].startswith('#'):
                        continue
                    parent.attrib[attr] = urlparse.urljoin(
                        sourceURL, parent.attrib[attr])

        #convert tree back to plain text html
        self.convertedHTML = etree.tostring(document,
                                            method="xml",
                                            pretty_print=True,
                                            encoding='UTF-8')
        # strip '&#13;' (the character reference for a carriage return)
        self.convertedHTML = self.convertedHTML.replace('&#13;', '')

        return self
示例#7
0
from lxml import etree
from cssselect import CSSSelector
from BeautifulSoup import BeautifulSoup

if len(sys.argv) < 2:
    print >> sys.stderr, 'usage: weather.py CITY, STATE'
    exit(2)

data = urllib.urlencode({'inputstring': ' '.join(sys.argv[1:])})
info = urllib2.urlopen('http://forecast.weather.gov/zipcity.php', data)
content = info.read()

# Solution #1
parser = lxml.etree.HTMLParser(encoding='utf-8')
tree = lxml.etree.fromstring(content, parser)
div = CSSSelector('div.pull-left')(tree)[0]
print 'Condition:', div[1].text.strip()
print 'Temperature:', div[2].text.strip()
tr = tree.xpath('.//td[b="Humidity"]')[0].getparent()
print 'Humidity:', tr.findall('td')[1].text
print

#Solution #2
soup = BeautifulSoup(content)
div = soup.find('div', 'pull-left')
print 'Condition:', div.contents[3].string.strip()
temp = div.contents[5].string or div.contents[7].string
print 'Temperature:', temp.replace('&deg;', ' ')
tr = soup.find('b', text='Humidity').parent.parent.parent
print 'Humidity:', tr('td')[1].string
print
示例#8
0
	def perform(self,document,sourceHTML,sourceURL,srcPrefix):
		aggregateCSS="";
		if len(srcPrefix) and not srcPrefix.endswith('/'):
			srcPrefix = srcPrefix + '/'

		# retrieve CSS rel links from html pasted and aggregate into one string
		CSSRelSelector = CSSSelector("link[rel=stylesheet],link[rel=StyleSheet],link[rel=STYLESHEET]")
		matching = CSSRelSelector.evaluate(document)
		for element in matching:
			try:
				csspath=element.get("href")
				if len(sourceURL):
					if element.get("href").lower().find("http://",0) < 0:
						parsedUrl=urlparse.urlparse(sourceURL);
						csspath=urlparse.urljoin(parsedUrl.scheme+"://"+parsedUrl.hostname, csspath)
				f=urllib.urlopen(csspath)
				aggregateCSS+=''.join(f.read())
				element.getparent().remove(element)
			except:
				raise IOError('The stylesheet '+element.get("href")+' could not be found')

		#include inline style elements
		print aggregateCSS
		CSSStyleSelector = CSSSelector("style,Style")
		matching = CSSStyleSelector.evaluate(document)
		for element in matching:
			aggregateCSS+=element.text
			element.getparent().remove(element)

		#convert  document to a style dictionary compatible with etree
		styledict = self.getView(document, aggregateCSS)

		#set inline style attribute if not one of the elements not worth styling
		ignoreList=['html','head','title','meta','link','script','repeater','singleline','multiline','br','layout']
		for element, style in styledict.items():
			if element.tag not in ignoreList:
				v = style.getCssText(separator=u'')
				element.set('style', v)

		#convert tree back to plain text html
		self.convertedHTML = etree.tostring(document, method="xml", pretty_print=True,encoding='UTF-8')
		self.convertedHTML= self.convertedHTML.replace('&#13;', '') #tedious raw conversion of line breaks.

		# We've inline styled the CSS, now do the HTML src tags
		soup = BeautifulSoup(self.convertedHTML)
		for img in soup.find_all("img"):
			img['src'] = srcPrefix + img.get('src')

		# Now we would like to set width and min-width on all our tables
		for table in soup.find_all("table"):
			if table.get('width') is not None:
				width = table.get('width')
				if not width.endswith('%'):
					if table.get('style') is None:
						style = []
					else:
						style = table.get('style').split(';')
					style = [x for x in style if x]
					style.append("min-width:" + width + "px")
					style.append("width:" + width + "px")
					table['style'] = ';'.join(style)

		# Might as well go ahead and throw a style tag in the head for iOS fixes
		if soup.html.head is None:
			soup.html.insert(0, soup.new_tag('head'))
		if soup.html.head.style is None:
			soup.html.head.append(soup.new_tag('style', type="text/css"))
		soup.html.head.style.append("""
			a[href^="x-apple-data-detectors:"] {
    color: #000000;
    text-decoration: none;
}
a[href^="tel"], a[href^="sms"], a[href^="mailto"] {
    color: #000000;
    text-decoration: none;
}
""")

		for img in soup.find_all('img'):
			if 'spacer.gif' in img.get('src'):
				classes = img.get('class')
				if classes is not None:
					if 'w' in classes:
						img.parent['width'] = img.get('width')
					if 'h' in classes:
						img.parent['height'] = img.get('height')

		self.convertedHTML = str(soup)

		return self