def isNodeScoreThreshholdMet(self, node, e):
    """Decide whether element ``e`` scores well enough relative to ``node``.

    The threshold is 8% of the score that ``self.getScore`` reports for
    ``node``.  Several acceptance rules are tried before the threshold
    comparison; the first one that matches returns ``True``.

    Args:
        node: Reference element whose score defines the threshold.
        e: Candidate element being checked.

    Returns:
        ``False`` only when no acceptance rule matched, ``e`` scores below
        the threshold, and ``e`` is not a ``td`` element; ``True`` otherwise.
    """
    topNodeScore = self.getScore(node)
    currentNodeScore = self.getScore(e)
    thresholdScore = topNodeScore * 0.08

    # With both scores negative the candidate is accepted without any
    # threshold comparison.
    if topNodeScore < 0 and currentNodeScore < 0:
        return True

    # Accept when Parser.hasChildTags reports no 'a'/'img' tags for e.
    if not Parser.hasChildTags(e, ['a', 'img']):
        return True

    # A 'ul' is accepted when the second value returned by getTextStats
    # exceeds 5 (presumably a stop-word count -- confirm against
    # getTextStats).  The other two returned values are not needed here.
    if e.tag == 'ul':
        _, stopCount, _ = self.getTextStats(e)
        if stopCount > 5:
            return True

    # 'td' elements are exempt from the threshold comparison.
    return not (currentNodeScore < thresholdScore and e.tag != 'td')
def convertDivsToParagraphs(self, doc, domTypes):
    """Rewrite the ``domTypes`` elements found in ``doc``, in place.

    For every element returned by ``Parser.getElementsByTags(doc, domTypes)``:

    * if ``Parser.hasChildTags`` finds none of ``self.child_tags`` in it,
      the element is retagged as ``p``;
    * otherwise, if ``self.re_dontconvert`` matches its
      ``goose_attributes`` attribute, it is left untouched;
    * otherwise its content is replaced by the nodes from
      ``self.getReplacementNodes``, keeping its tail text and attributes.

    Args:
        doc: Document (root element) to process; it is mutated.
        domTypes: Tag names passed to ``Parser.getElementsByTags``.

    Returns:
        The same ``doc`` object that was passed in.
    """
    divs = Parser.getElementsByTags(doc, domTypes)
    tags = self.child_tags

    for div in divs:
        if div is None:
            continue

        attrs = div.attrib.get('goose_attributes', '')

        if not Parser.hasChildTags(div, tags):
            div.tag = 'p'
            continue

        if self.re_dontconvert.search(attrs) is not None:
            continue

        replaceNodes = self.getReplacementNodes(div)

        # Element.clear() (ElementTree/lxml API) drops the tail text and
        # the attributes along with the children, so snapshot both first
        # and restore them once the replacement nodes are attached.
        tail = div.tail
        savedAttrib = dict(div.attrib)
        div.clear()
        div.extend(replaceNodes)
        div.tail = tail
        div.attrib.update(savedAttrib)

    return doc