def getFeatures(self):
    """Yield the features of ``self.root`` one string at a time.

    For every name in ``nodenames`` the nodes selected by
    ``self.root(name)`` are serialised with ``str``.  The part up to and
    including the last ``>`` is yielded as a single feature; whatever
    follows it is split with ``wordsplit`` and yielded word by word.
    ``self.root.remove(name)`` is then called for that name.  Finally the
    words of ``self.root.text()`` are yielded.

    Every feature is passed through ``trim``.  Empty words are skipped; the
    tag part is yielded unconditionally.

    NOTE(review): another ``getFeatures`` definition follows this one in
    the file; if both live in the same scope the later one replaces this
    one -- confirm which is meant to be active.
    """
    # some tag nodes
    for tag in nodenames:
        matches = self.root(tag)
        # NOTE(review): ``matches`` is presumably a PyQuery selection.
        # eq(i) is kept on purpose: str() is applied to whatever eq()
        # returns, and iterating the selection directly may hand back a
        # different object type -- confirm before changing.
        for i in range(len(matches)):
            # fix pq bug: mistakenly treat "<img>hello" as a node
            # split tag and text data
            markup = str(matches.eq(i))
            # rfind gives -1 when there is no '>': the tag part is then
            # empty and the whole string is handled as trailing text.
            end = markup.rfind('>')
            # tag
            yield trim(markup[:end + 1])
            # remaining text
            trailing = trim(markup[end + 1:])
            if trailing:
                for piece in wordsplit.split(trailing):
                    word = trim(piece)
                    if word:
                        yield word
        # remove tags, so they do not show up again in the text below
        self.root.remove(tag)

    # words as features
    text = self.root.text()
    for piece in wordsplit.split(text):
        word = trim(piece)
        if word:
            yield word
def getFeatures(self):
    """Yield the features of ``self.root`` one string at a time.

    Walks ``self.root.children()`` by index.  The name returned by
    ``getTagName`` for each child is recorded once, in first-seen order.
    When that name is listed in ``nodenames`` the child is serialised with
    ``str``: the part up to and including the last ``>`` is yielded as one
    feature, and any non-empty text after it is yielded as a second,
    unsplit feature.

    After the walk ``self.root.remove(name)`` is called for every recorded
    name, and the words obtained by splitting ``self.root.text()`` with
    ``wordsplit`` are yielded.  Every feature is passed through ``trim``;
    empty words are skipped.
    """
    children = self.root.children()
    seen_tags = []  # distinct tag names, in the order first encountered
    # NOTE(review): ``children`` is presumably a PyQuery selection; eq(i)
    # is kept because both getTagName() and str() receive its result.
    for i in range(len(children)):
        child = children.eq(i)
        tag_name = getTagName(child)
        if not tag_name:
            continue
        if tag_name not in seen_tags:
            seen_tags.append(tag_name)
        if tag_name not in nodenames:
            continue

        # add special tag nodes as feature
        # fix pq bug: mistakenly treat "<img>hello" as a node
        # split tag and text data
        markup = str(child)
        # rfind gives -1 when there is no '>': the tag part is then empty
        # and the whole string is handled as trailing text.
        end = markup.rfind('>')
        yield trim(markup[:end + 1])
        trailing = trim(markup[end + 1:])
        if trailing:
            yield trailing

    # remove all tags
    for tag_name in seen_tags:
        self.root.remove(tag_name)

    # words as features
    text = self.root.text()
    for piece in wordsplit.split(text):
        word = trim(piece)
        if word:
            yield word