def __init__(self, dic):
    '''
    Set up the collaborators used while parsing.

    dic is forwarded to StyleTree; a separate Dic instance is created,
    populated through its fromfile() method and kept on self.dic.
    '''
    self.styletree = StyleTree(dic)
    self.stack = Stack()
    self.datatagextractor = datatagextractor.DatatagExtractor()
    # central dic
    central_dic = Dic()
    self.dic = central_dic
    central_dic.fromfile()
class SourceParser:
    '''
    Parse html source and add nodes to the styletree.

    The visible call sequence is: setSource() to load a page, optionally
    setPagenum(), then parse(), which walks the <body> subtree with an
    explicit stack instead of recursion.
    '''

    def __init__(self, dic):
        '''
        Create the style tree, the work stack and the data-tag extractor.

        dic is forwarded to StyleTree only.
        '''
        self.styletree = StyleTree(dic)
        self.stack = Stack()
        self.datatagextractor = datatagextractor.DatatagExtractor()
        # central dic: populated by Dic.fromfile() and handed to every
        # StyleNode built while parsing.
        # NOTE(review): this is a different object from the `dic` argument
        # given to StyleTree above -- confirm that is intended.
        self.dic = Dic()
        self.dic.fromfile()

    def setSource(self, source):
        '''
        Wrap the html source in a pq object and keep it on self.pq.
        '''
        self.pq = pq(source)
        # NOTE: stripping of <script>/<style> elements is disabled:
        #   self.pq.remove('script')
        #   self.pq.remove('style')

    def setPagenum(self, num):
        '''
        Forward the page number to the style tree.
        '''
        self.styletree.setPagenum(num)

    def parse(self):
        '''
        Parse the document loaded by setSource(), starting at <body>.
        '''
        body = self.pq('body')
        # init node: pair the document body with the style tree's body element
        self.stack.init()
        self.stack.push([body, self.styletree.body])
        self.parseIter()

    def parseIter(self):
        '''
        Drain the work stack.

        Every entry is a [node, element] pair; for each one the data
        features and then the style node are registered on the element.
        Registering the style node may push the node's children, so the
        loop runs until the whole subtree has been visited.
        '''
        while not self.stack.empty():
            (node, element) = self.stack.pop()
            self._addDataNode(node, element)
            self._addStyleNode(node, element)

    def _addDataNode(self, fnode, element):
        '''
        Extract the features of fnode and add them to element's DataNode.
        '''
        dn = element.getDataNode()
        # the extractor is fed the serialized form of the node
        self.datatagextractor.setNode(str(fnode))
        features = self.datatagextractor.getFeatures()
        dn.addFeatures(features)

    def _addStyleNode(self, node, element):
        '''
        Register a StyleNode for node on element and queue node's children.

        Children whose tag name is listed in the module-level `nodenames`
        are skipped; every other child is pushed on the stack together with
        the matching child of the registered style node.
        '''
        # Validate before the first dereference so the message can be seen.
        assert node is not None, "addStyleNode(None)"
        childnodes = node.children()
        stylenode = StyleNode(self.dic)
        stylenode.generateStyleNode(node)
        # Use the node handed back by the element (not necessarily the one
        # just built) to look up children.
        registered = element.registerStyleNode(stylenode)
        # j indexes the children of the registered style node; it advances
        # only for children that are not skipped.
        j = -1
        for i in range(len(childnodes)):
            child = childnodes.eq(i)
            if getTagName(child) in nodenames:
                continue
            j += 1
            self.stack.push([child, registered.getChild(j)])

    def _getTag(self, node):
        '''
        Return the opening tag of node: its string form up to and
        including the first '>'.

        Raises ValueError when the string form contains no '>'.
        '''
        text = str(node)
        end = text.index('>')
        return text[:end + 1]