def __init__(self):
    """Load the rank threshold from the configuration and create the stack.

    Sets ``self.value`` to the float stored under section ``'rank'``,
    option ``'value'`` and ``self.stack`` to a new ``Stack``.
    """
    settings = Config()
    self.value = settings.getfloat('rank', 'value')
    self.stack = Stack()
class Parser:
    """Walk an HTML document in step with a style tree and colour its nodes.

    Each document node is paired with a style-tree element.  The pair is
    used to (a) mark the node as noise or content by comparing
    ``element.getCompImp()`` with the configured threshold, and (b) find the
    matching style node so the node's children can be paired with the style
    node's children and visited in turn.

    Typical use: ``setDic``, ``setSST`` and ``setSource`` first, then
    ``parse`` and finally ``tofile``.
    """

    def __init__(self):
        """Read the rank threshold from the config and create the work stack."""
        _config = Config()
        # Nodes whose getCompImp() is below this value are marked as noise.
        self.value = _config.getfloat('rank', 'value')
        self.stack = Stack()

    def setDic(self, dic):
        """Store the dictionary object handed to every ``StyleNode`` built here."""
        self.dic = dic

    def setSST(self, sst):
        """Store the style tree; ``parse`` starts from its ``body`` attribute."""
        self.styletree = sst

    def setSource(self, source):
        """Wrap ``source`` with ``pq`` and keep the resulting document.

        NOTE(review): ``pq`` is presumably pyquery's ``PyQuery`` -- confirm
        against the file's imports.
        """
        self.pq = pq(source)

    def parse(self):
        """Traverse the document starting at ``<body>``.

        Requires ``setSource`` and ``setSST`` to have been called, because it
        reads ``self.pq`` and ``self.styletree``.
        """
        body = self.pq('body')
        # Reset the stack and seed it with the root (node, element) pair.
        self.stack.init()
        self.stack.push([body, self.styletree.body])
        self.parseIter()

    def markNoiseNode(self, fnode, isnoise):
        """Colour ``fnode`` through its ``css`` method.

        Noise nodes get a gray background with a yellow border; all other
        nodes get a blue background with a red border.  Marking is best
        effort: a failure is reported on stdout and otherwise ignored.
        """
        if isnoise:
            background, border = 'gray', '2px solid yellow'
        else:
            background, border = 'blue', '2px solid red'
        try:
            fnode.css('background-color', background)
            fnode.css('border', border)
        except Exception:
            # Deliberately not a bare ``except``: Ctrl-C and SystemExit
            # must still propagate.
            print('mark wrong!!!!')

    def parseIter(self):
        """Process ``(node, element)`` pairs until the stack is empty."""
        while not self.stack.empty():
            node, element = self.stack.pop()
            self._scanDataNode(node, element)
            self._scanStyleNode(node, element)

    def _scanDataNode(self, fnode, element):
        """Mark ``fnode`` as noise when ``element.getCompImp()`` is below the threshold."""
        rank = element.getCompImp()
        self.markNoiseNode(fnode, rank < self.value)

    def _scanStyleNode(self, node, element):
        """Match ``node`` to a style node of ``element`` and queue its children.

        A ``StyleNode`` is generated from ``node`` and looked up with
        ``element.findStyleNode``.  Every child whose tag name is not in
        ``nodenames`` is pushed on the stack together with the next child of
        the matched style node.  A failure on one child is reported on
        stdout and does not stop the remaining children.
        """
        assert node is not None, "addStyleNode(None)"
        childnodes = node.children()

        stylenode = StyleNode(self.dic)
        stylenode.generateStyleNode(node)
        print('stylenode: %s' % (stylenode.generatePreview(),))

        # Debug listing of the element's child style nodes; the first entry
        # is skipped, as in the original listing.
        print('.. start stylenodes list:')
        for sibling in element.getChildStyleNodes()[1:]:
            print('stylenodes:  %s' % (sibling.getPreview(),))
        print('.. end stylenode list')

        matched = element.findStyleNode(stylenode)
        print('find _stylenode %s' % (matched,))

        # ``j`` indexes the matched style node's children; it advances only
        # for document children that are not skipped.
        j = -1
        for i in range(len(childnodes)):
            try:
                child = childnodes.eq(i)
                if getTagName(child) in nodenames:
                    continue
                j += 1
                self.stack.push([child, matched.getChild(j)])
            except Exception:
                print('wrong!!!!')

    def tofile(self, filename):
        """Write the document's HTML (``self.pq.html()``) to ``filename``."""
        # NOTE(review): on Python 2, str() of non-ASCII unicode markup raises
        # UnicodeEncodeError -- confirm the encoding expected by callers.
        with open(filename, 'w') as out:
            out.write(str(self.pq.html()))