Exemplos de cssselect em Python, exemplos de goose.cache.cssselect em Python

Exemplo n.º 1

0

Exibir arquivo

    def removeDropCaps(self, doc):
        """\
        Call drop_tag() on every <span> under doc whose class list
        contains the word "dropcap" or "drop_cap", then return doc.
        """
        selector = "span[class~=dropcap], span[class~=drop_cap]"
        for span in cache.cssselect(selector, doc):
            # NOTE(review): presumably lxml.html's HtmlElement.drop_tag,
            # which removes the tag but keeps its text and children -
            # confirm against the element type returned by cache.cssselect.
            span.drop_tag()
        return doc

Exemplo n.º 2

0

Exibir arquivo

Arquivo: parsers.py Projeto: SalesLoft/python-goose

 def getElementsByTags(self, node, tags):
     """\
     Return the elements selected under node by any of the given tag
     names, leaving node itself out of the result.
     """
     # A comma-separated list of tag names is a CSS selector group.
     matches = cache.cssselect(','.join(tags), node)
     # The selection may contain the root node itself; the caller only
     # wants the other matches.
     if node in matches:
         matches.remove(node)
     return matches

Exemplo n.º 3

0

Exibir arquivo

Arquivo: parsers.py Projeto: toddwilson/python-goose

 def getElementsByTags(self, node, tags):
     """\
     Select every element matching one of tags starting from node and
     return them, with node removed if it was selected.
     """
     css = ','.join(tags)
     found = cache.cssselect(css, node)
     if node not in found:
         return found
     # node matched one of the tags: drop it so only the remaining
     # matches are returned
     found.remove(node)
     return found

Exemplo n.º 4

0

Exibir arquivo

Arquivo: outputformatters.py Projeto: SalesLoft/python-goose

 def removeNodesWithNegativeScores(self):
     """\
     Detach from its parent every element selected under self.topNode
     that carries a gravityScore attribute whose value is below 1.
     """
     for scored in cache.cssselect("*[gravityScore]", self.topNode):
         raw_score = scored.attrib.get('gravityScore')
         # Base 0 makes int() infer the radix from the literal's prefix.
         # NOTE(review): base 0 may be a misplaced parenthesis for a
         # .get() default - confirm before changing.
         if int(raw_score, 0) >= 1:
             continue
         # The threshold is "< 1", so a score of exactly 0 is removed too.
         scored.getparent().remove(scored)

Exemplo n.º 5

0

Exibir arquivo

 def removeNodesWithNegativeScores(self):
     """\
     Walk the elements under self.topNode that have a gravityScore
     attribute and remove each one scoring less than 1 from its parent.
     """
     candidates = cache.cssselect("*[gravityScore]", self.topNode)
     for node in candidates:
         # int(..., 0) parses with base 0, i.e. the radix is taken from
         # the string's own prefix.
         # NOTE(review): possibly meant as a default for .get() - verify.
         gravity = int(node.attrib.get('gravityScore'), 0)
         keep = gravity >= 1
         if not keep:
             parent = node.getparent()
             parent.remove(node)

Exemplo n.º 6

0

Exibir arquivo

    def getMetaContent(self, doc, metaName):
        """\
        Return the stripped "content" attribute of the first element in
        doc selected by metaName (a CSS selector string).

        An empty string is returned when nothing is selected or when the
        attribute is missing or empty.
        """
        matches = cache.cssselect(metaName, doc)
        if matches is None or len(matches) == 0:
            return ''

        # Only the first match is consulted.
        value = matches[0].attrib.get('content')
        if not value:
            return ''

        return value.strip()

Exemplo n.º 7

0

Exibir arquivo

Arquivo: extractors.py Projeto: SalesLoft/python-goose

 def getMetaContent(self, doc, metaName):
     """\
     Look up metaName (a CSS selector) in doc and return the first
     match's "content" attribute with surrounding whitespace stripped.

     Falls back to '' when there is no match or no usable content.
     """
     found = cache.cssselect(metaName, doc)
     no_match = found is None or len(found) == 0
     if no_match:
         return ''

     raw = found[0].attrib.get('content')
     return raw.strip() if raw else ''

Exemplo n.º 8

0

Exibir arquivo

    def extractTags(self, article):
        """\
        Collect the non-empty texts of the elements selected by
        A_REL_TAG_SELECTOR in article.doc and return them as a set.

        NO_STRINGS is returned when the document node has no children
        or when the selection yields None.
        """
        root = article.doc

        # nothing to search when the node has no children
        if not list(root):
            return NO_STRINGS

        links = cache.cssselect(A_REL_TAG_SELECTOR, root)
        if links is None:
            return NO_STRINGS

        texts = (Parser.getText(link) for link in links)
        # skip elements whose text is empty
        return set(text for text in texts if text)

Exemplo n.º 9

0

Exibir arquivo

Arquivo: extractors.py Projeto: SalesLoft/python-goose

    def extractTags(self, article):
        """\
        Return the set of non-empty tag texts found in article.doc via
        A_REL_TAG_SELECTOR, or NO_STRINGS when the document node has no
        children or the selection is None.
        """
        doc_node = article.doc

        # a childless node cannot contain any tag element
        child_count = len(list(doc_node))
        if child_count == 0:
            return NO_STRINGS

        selected = cache.cssselect(A_REL_TAG_SELECTOR, doc_node)
        if selected is None:
            return NO_STRINGS

        unique_tags = set()
        for element in selected:
            text = Parser.getText(element)
            if not text:
                continue
            unique_tags.add(text)

        return unique_tags

Exemplo n.º 10

0

Exibir arquivo

Arquivo: cleaners.py Projeto: SalesLoft/python-goose

 def removeDropCaps(self, doc):
     """\
     Apply drop_tag() to each <span> in doc whose class attribute
     includes the word "dropcap" or "drop_cap"; return doc afterwards.
     """
     drop_caps = cache.cssselect(
         "span[class~=dropcap], span[class~=drop_cap]", doc)
     for span in drop_caps:
         # NOTE(review): drop_tag() is presumably lxml.html's method that
         # unwraps the element while keeping its content - confirm.
         span.drop_tag()

     return doc

Exemplo n.º 11

0

Exibir arquivo

Arquivo: cleaners.py Projeto: SalesLoft/python-goose

 def cleanUpSpanTagsInParagraphs(self, doc):
     """\
     Call drop_tag() on every <span> that is a direct child of a <p>
     in doc, then return doc.
     """
     # 'p > span' is the child combinator: only spans directly under a
     # paragraph are selected.
     for span in cache.cssselect('p > span', doc):
         span.drop_tag()
     return doc

Exemplo n.º 12

0

Exibir arquivo

 def cleanUpSpanTagsInParagraphs(self, doc):
     """\
     Drop the tag of each <span> found immediately inside a <p> of doc
     and return the same doc object.
     """
     direct_child_spans = cache.cssselect('p > span', doc)
     for child_span in direct_child_spans:
         # NOTE(review): presumably lxml.html drop_tag(), which removes
         # the tag but not its text or children - confirm.
         child_span.drop_tag()
     return doc