def getFeatures(self):
    """Yield feature strings extracted from ``self.root``.

    For every tag name in ``nodenames`` the matching nodes are looked up
    under ``self.root``.  Each node is serialized with ``str()``; the part
    up to and including the last ``>`` is yielded as one feature, and any
    text after it is split with ``wordsplit`` and yielded word by word.
    The tag is then removed from ``self.root`` so that the final pass only
    sees the remaining text, whose non-empty words are yielded last.

    This is a generator, and ``self.root`` is modified while it is being
    consumed: the removals happen lazily as the caller iterates.
    """
    # Special tag nodes first.
    for tag in nodenames:
        tags = self.root(tag)
        # ``eq`` is an index-based accessor, hence the explicit index loop.
        for i in range(len(tags)):
            # Work around a pq quirk: a fragment such as "<img>hello" is
            # serialized as a single node, so the markup and the trailing
            # text have to be separated by hand at the last '>'.
            data = str(tags.eq(i))
            _end = data.rfind('>')
            # The tag markup itself (yielded unconditionally).
            yield trim(data[:_end + 1])
            # Whatever text follows the markup, word by word.
            _res = trim(data[_end + 1:])
            if _res:
                for word in wordsplit.split(_res):
                    word = trim(word)
                    if word:
                        yield word
        # Drop the handled tag so its content is not counted again below.
        self.root.remove(tag)

    # Remaining plain-text words as features.
    text = self.root.text()
    for word in wordsplit.split(text):
        word = trim(word)
        if word:
            yield word
def getFeatures(self):
    """Yield feature strings from the direct children of ``self.root``.

    Each child's tag name is obtained via ``getTagName``; distinct names
    are remembered in first-seen order.  A child whose name is listed in
    ``nodenames`` contributes its serialized markup (up to the last ``>``)
    as one feature and, if non-empty, the trailing text as a second
    feature.  Afterwards every remembered tag name is removed from
    ``self.root`` and the non-empty words of the remaining text are
    yielded.

    This is a generator; ``self.root`` is only modified once iteration
    reaches the removal step.
    """
    seen_tags = []
    kids = self.root.children()
    for index in range(len(kids)):
        node = kids.eq(index)
        name = getTagName(node)
        if not name:
            continue

        # Remember each distinct tag name once.
        if name not in seen_tags:
            seen_tags.append(name)

        if name not in nodenames:
            continue

        # Special tag node: emit it as a feature.  pq may serialize
        # "<img>hello" as a single node, so split markup from trailing
        # text at the last '>'.
        markup = str(node)
        cut = markup.rfind('>') + 1
        yield trim(markup[:cut])
        trailing = trim(markup[cut:])
        if trailing:
            yield trailing

    # Remove all tags that were encountered.
    for name in seen_tags:
        self.root.remove(name)

    # Remaining words as features.
    cleaned = [trim(piece) for piece in wordsplit.split(self.root.text())]
    for feature in cleaned:
        if feature:
            yield feature
def setSource(self, source):
    """Store ``source`` and build ``self.root`` from its ``body`` selection.

    ``self.source`` is always overwritten.  When ``trim(self.source)`` is
    falsy nothing else happens and False is returned.  Otherwise the
    source is parsed with ``pq``, ``self.root`` is set to the ``'body'``
    selection with ``script`` and ``style`` removed, and True is returned.
    """
    self.source = source
    if not trim(self.source):
        return False

    self.root = pq(self.source)('body')
    # Clean the source: these elements carry no feature text.
    for unwanted in ('script', 'style'):
        self.root.remove(unwanted)
    return True
def setNode(self, node):
    """Store ``str(node)`` as the source and build ``self.root`` from it.

    ``self.source`` is always overwritten with ``str(node)``.  The root is
    selected by the tag name that ``getTagName`` reports for the source,
    then ``script`` and ``style`` are removed from it.

    Returns True when ``self.root`` was set.  Returns False when the
    source is empty according to ``trim`` or when no tag name could be
    determined; in both cases ``self.root`` is left untouched.
    """
    self.source = str(node)
    if not trim(self.source):
        return False

    tag = getTagName(self.source)
    if not tag:
        # Return False rather than None so callers always receive a bool,
        # matching the other failure path and setSource.
        return False

    self.root = pq(self.source)(tag)
    # Clean the source.
    self.root.remove('script')
    self.root.remove('style')
    return True