Пример #1
0
 def getFeatures(self):
     """Yield the features of ``self.root`` one by one (generator).

     For every entry ``tag`` of ``nodenames``, each node returned by
     ``self.root(tag)`` contributes:

     * its string form up to and including the last ``>`` (passed through
       ``trim``) as a single feature, and
     * each non-empty word of whatever text follows that last ``>``.

     After the nodes for ``tag`` have been processed they are removed from
     ``self.root``.  Finally the non-empty words of ``self.root.text()``
     are yielded.

     Note: consuming this generator mutates ``self.root`` because of the
     ``remove`` calls.
     """
     # some tag nodes
     for tag in nodenames:
         tags = self.root(tag)
         for i in range(len(tags)):
             # fix pq bug: mistakenly treat "<img>hello" as a node,
             # so split the string form into tag markup and text data
             data = str(tags.eq(i))
             _end = data.rfind('>')
             # tag
             yield trim(data[:_end + 1])
             # remaining text
             _res = trim(data[_end + 1:])
             if _res:
                 for w in [trim(word) for word in wordsplit.split(_res)]:
                     if w:
                         yield w
         # remove tags so their content is not counted again as plain text
         self.root.remove(tag)
     # words as features
     text = self.root.text()
     for word in [trim(w) for w in wordsplit.split(text)]:
         if word:
             yield word
Пример #2
0
    def getFeatures(self):
        """Generate features from ``self.root`` (generator).

        Walks the direct children of ``self.root``.  Every distinct,
        non-empty tag name reported by ``getTagName`` is remembered.  A
        child whose tag name is in ``nodenames`` yields two kinds of
        feature: its string form up to and including the last ``>``, and,
        when non-empty, the trimmed text after that ``>`` as one feature.

        Once all children have been visited, every remembered tag name is
        removed from ``self.root`` and the non-empty words of the remaining
        ``self.root.text()`` are yielded.

        Note: consuming this generator mutates ``self.root``.
        """
        kids = self.root.children()

        # distinct tag names seen among the children, in first-seen order
        seen_tags = []
        for idx in range(len(kids)):
            node = kids.eq(idx)
            name = getTagName(node)
            if name and name not in seen_tags:
                seen_tags.append(name)
            if name not in nodenames:
                continue
            # special tag node: emit it as a feature.
            # fix pq bug: mistakenly treat "<img>hello" as a node,
            # so cut the string form right after the last '>'
            markup = str(node)
            cut = markup.rfind('>') + 1
            yield trim(markup[:cut])
            tail = trim(markup[cut:])
            if tail:
                yield tail

        # remove all tags that were encountered
        for name in seen_tags:
            self.root.remove(name)

        # words as features
        remaining = self.root.text()
        cleaned = [trim(piece) for piece in wordsplit.split(remaining)]
        for token in cleaned:
            if token:
                yield token
Пример #3
0
 def setSource(self, source):
     """Store ``source`` and build ``self.root`` from it.

     ``source`` is always saved on ``self.source``.  When ``trim`` of it is
     falsy nothing else happens and ``False`` is returned.  Otherwise
     ``self.root`` becomes the ``'body'`` selection of ``pq(source)``, with
     ``script`` and ``style`` removed, and ``True`` is returned.
     """
     self.source = source
     if not trim(self.source):
         return False

     self.root = pq(self.source)('body')
     # clean source
     for selector in ('script', 'style'):
         self.root.remove(selector)
     return True
Пример #4
0
 def setNode(self, node):
     """Store ``str(node)`` as the source and build ``self.root`` from it.

     ``self.source`` is always set to ``str(node)``.  ``self.root`` is only
     replaced when that string is non-empty after ``trim`` and
     ``getTagName`` returns a truthy tag name for it; in that case
     ``self.root`` becomes ``pq(self.source)(tag)`` with ``script`` and
     ``style`` removed.

     Returns ``True`` when ``self.root`` was rebuilt, ``False`` otherwise.
     """
     self.source = str(node)
     if not trim(self.source):
         return False

     tag = getTagName(self.source)
     if not tag:
         # Explicit False (instead of an implicit None) so that every
         # path of this method returns a bool.
         return False

     self.root = pq(self.source)(tag)
     # clean source
     self.root.remove('script')
     self.root.remove('style')
     return True