示例#1
0
    def isNodeScoreThreshholdMet(self, node, e):
        """Decide whether element ``e`` passes the score threshold set by ``node``.

        The threshold is 8% of the score of ``node`` (as reported by
        ``self.getScore``). The element is accepted (``True``) when any of
        the following holds, checked in this order:

        * both the top node's score and the element's score are negative;
        * ``Parser.hasChildTags(e, ['a', 'img'])`` is falsy;
        * ``e`` is a ``ul`` whose stop-word count from ``self.getTextStats``
          is greater than 5;
        * ``e`` is a ``td``, or its score is not below the threshold.

        Otherwise ``False`` is returned.
        """
        top_score = self.getScore(node)
        candidate_score = self.getScore(e)
        # The cut-off is a fixed fraction (8%) of the top node's score.
        cutoff = top_score * 0.08

        # Short-circuit keeps the Parser lookup from running when both
        # scores are already negative.
        both_negative = top_score < 0 and candidate_score < 0
        if both_negative or not Parser.hasChildTags(e, ['a', 'img']):
            return True

        if e.tag == 'ul':
            # Only the middle value of the stats triple is used here.
            _text_len, stop_count, _is_high_link = self.getTextStats(e)
            if stop_count > 5:
                return True

        # Table cells are exempt from the score cut-off.
        return not (candidate_score < cutoff and e.tag != 'td')
示例#2
0
    def convertDivsToParagraphs(self, doc, domTypes):
        """Rewrite matching elements of ``doc`` in place and return ``doc``.

        Every element returned by ``Parser.getElementsByTags(doc, domTypes)``
        is handled as follows:

        * ``None`` entries are skipped;
        * when ``Parser.hasChildTags(element, self.child_tags)`` is falsy,
          the element's tag is changed to ``p``;
        * when its ``goose_attributes`` attribute (empty string if absent)
          matches ``self.re_dontconvert``, the element is left untouched;
        * otherwise its content is replaced by the nodes from
          ``self.getReplacementNodes``, with the original tail text and
          attributes put back afterwards.
        """
        elements = Parser.getElementsByTags(doc, domTypes)
        child_tags = self.child_tags

        for element in elements:
            if element is None:
                continue

            goose_attrs = element.attrib.get('goose_attributes', '')

            if not Parser.hasChildTags(element, child_tags):
                element.tag = 'p'
                continue

            if self.re_dontconvert.search(goose_attrs) is not None:
                continue

            replacements = self.getReplacementNodes(element)

            # Snapshot the tail and attributes before clear(), then put
            # them back once the new children are attached.
            saved_tail = element.tail
            saved_attrs = dict(element.attrib.items())

            element.clear()
            element.extend(replacements)

            element.tail = saved_tail
            for key, value in saved_attrs.items():
                element.attrib[key] = value

        return doc