Exemplo n.º 1
0
    def removeDropCaps(self, doc):
        """\
        Call drop_tag() on every drop-cap span found in doc.

        A span qualifies when its class attribute contains the word
        ``dropcap`` or ``drop_cap``. The same doc object that was
        passed in is returned.
        """
        selector = "span[class~=dropcap], span[class~=drop_cap]"
        for span in cache.cssselect(selector, doc):
            span.drop_tag()

        return doc
Exemplo n.º 2
0
 def getElementsByTags(self, node, tags):
     """\
     Select the elements matching any of the given tag selectors.

     The entries of tags are joined with commas into a single CSS
     selector which is evaluated against node. When node itself is
     part of the result it is taken out before returning.
     """
     matches = cache.cssselect(','.join(tags), node)
     # the search root may satisfy the selector too; drop it
     if node in matches:
         matches.remove(node)
     return matches
Exemplo n.º 3
0
 def getElementsByTags(self, node, tags):
     """\
     Select the elements matching any of the given tag selectors.

     The entries of tags are joined with commas into a single CSS
     selector which is evaluated against node. When node itself is
     part of the result it is taken out before returning.
     """
     matches = cache.cssselect(','.join(tags), node)
     # the search root may satisfy the selector too; drop it
     if node in matches:
         matches.remove(node)
     return matches
Exemplo n.º 4
0
 def removeNodesWithNegativeScores(self):
     """\
     Remove the elements selected from self.topNode whose
     gravityScore attribute, read as an integer, is below 1
     (that is, zero or negative).

     Elements that have no parent are left in place because there
     is nothing to detach them from.
     """
     scoredNodes = cache.cssselect("*[gravityScore]", self.topNode)
     for scored in scoredNodes:
         # 0 is the fallback for a missing attribute. It used to be
         # passed to int() as the base, which made zero-padded values
         # such as "05" raise ValueError.
         score = int(scored.attrib.get('gravityScore', 0))
         if score >= 1:
             continue
         parent = scored.getparent()
         # a matched root element has no parent to be removed from
         if parent is not None:
             parent.remove(scored)
Exemplo n.º 5
0
 def removeNodesWithNegativeScores(self):
     """\
     Remove the elements selected from self.topNode whose
     gravityScore attribute, read as an integer, is below 1
     (that is, zero or negative).

     Elements that have no parent are left in place because there
     is nothing to detach them from.
     """
     scoredNodes = cache.cssselect("*[gravityScore]", self.topNode)
     for scored in scoredNodes:
         # 0 is the fallback for a missing attribute. It used to be
         # passed to int() as the base, which made zero-padded values
         # such as "05" raise ValueError.
         score = int(scored.attrib.get('gravityScore', 0))
         if score >= 1:
             continue
         parent = scored.getparent()
         # a matched root element has no parent to be removed from
         if parent is not None:
             parent.remove(scored)
Exemplo n.º 6
0
    def getMetaContent(self, doc, metaName):
        """\
        Extract the content of a given meta element from the document.

        metaName is used as the CSS selector. The ``content``
        attribute of the first match is returned stripped; an empty
        string is returned when nothing matches or the attribute is
        missing or empty.
        """
        matches = cache.cssselect(metaName, doc)
        if matches is None or len(matches) == 0:
            return ''

        value = matches[0].attrib.get('content')
        return value.strip() if value else ''
Exemplo n.º 7
0
 def getMetaContent(self, doc, metaName):
     """\
     Extract the content of a given meta element from the document.

     metaName is used as the CSS selector. The ``content``
     attribute of the first match is returned stripped; an empty
     string is returned when nothing matches or the attribute is
     missing or empty.
     """
     matches = cache.cssselect(metaName, doc)
     if matches is None or len(matches) == 0:
         return ''

     value = matches[0].attrib.get('content')
     return value.strip() if value else ''
Exemplo n.º 8
0
    def extractTags(self, article):
        """\
        Collect the texts of the elements in article.doc that match
        A_REL_TAG_SELECTOR.

        NO_STRINGS is returned when the document has no children or
        when the selection yields None; otherwise the result is a set
        of the non-empty texts obtained through Parser.getText.
        """
        doc = article.doc

        # a document without children has nothing to scan
        if not list(doc):
            return NO_STRINGS

        matched = cache.cssselect(A_REL_TAG_SELECTOR, doc)
        if matched is None:
            return NO_STRINGS

        texts = (Parser.getText(link) for link in matched)
        return {text for text in texts if text}
Exemplo n.º 9
0
    def extractTags(self, article):
        """\
        Collect the texts of the elements in article.doc that match
        A_REL_TAG_SELECTOR.

        NO_STRINGS is returned when the document has no children or
        when the selection yields None; otherwise the result is a set
        of the non-empty texts obtained through Parser.getText.
        """
        doc = article.doc

        # a document without children has nothing to scan
        if not list(doc):
            return NO_STRINGS

        matched = cache.cssselect(A_REL_TAG_SELECTOR, doc)
        if matched is None:
            return NO_STRINGS

        texts = (Parser.getText(link) for link in matched)
        return {text for text in texts if text}
Exemplo n.º 10
0
 def removeDropCaps(self, doc):
     """\
     Call drop_tag() on every drop-cap span found in doc.

     A span qualifies when its class attribute contains the word
     ``dropcap`` or ``drop_cap``. The same doc object that was
     passed in is returned.
     """
     selector = "span[class~=dropcap], span[class~=drop_cap]"
     for span in cache.cssselect(selector, doc):
         span.drop_tag()

     return doc
Exemplo n.º 11
0
 def cleanUpSpanTagsInParagraphs(self, doc):
     """\
     Call drop_tag() on every <span> that is a direct child of a
     <p> element in doc, then return the same doc object.
     """
     selector = 'p > span'
     for span in cache.cssselect(selector, doc):
         span.drop_tag()
     return doc
Exemplo n.º 12
0
 def cleanUpSpanTagsInParagraphs(self, doc):
     """\
     Call drop_tag() on every <span> that is a direct child of a
     <p> element in doc, then return the same doc object.
     """
     selector = 'p > span'
     for span in cache.cssselect(selector, doc):
         span.drop_tag()
     return doc