def __init__(self):
    self._doc = html.parse(URL)
    xpath_cursos = u'/html/body/form/table/tbody/tr'  # the first element is the header row
    self._lista_cursos = []
    for nodo in self._doc.xml_select(xpath_cursos)[1:]:
        self._lista_cursos.append(Curso(nodo))
def tidy(body, ctype):
    '''
    Tidy arbitrary HTML (using html5lib)

    Sample request:
    curl --request POST --data-binary "<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>" --header "Content-Type: application/xml" "http://localhost:8880/akara.tidy"
    '''
    doc = htmldoc.parse(body)
    return doc
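# Hedged sketch (not part of the original module) of calling tidy() directly with
# the docstring's sample body, bypassing the Akara HTTP front end; it assumes
# htmldoc refers to amara.bindery.html as in the surrounding service module.
def tidy_example():
    body = '<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>'
    return tidy(body, 'application/xml')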
def extrae_nodos():
    from amara.bindery import html
    doc = html.parse('http://www.agrega2.es/web/')
    nodos = doc.xml_select(u'//div[@id="block-views-nodos-de-agrega-block-1"]//li//a')
    nodos_agrega = dict([(unicode(n), n.href) for n in nodos])
    return nodos_agrega
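# Minimal usage sketch for extrae_nodos() above (illustrative only, not from the
# original module): assumes amara is installed and the agrega2.es page is
# reachable. The returned dict maps each link's text to its href attribute.
def imprime_nodos():
    nodos = extrae_nodos()
    for nombre, enlace in sorted(nodos.items()):
        print u'%s -> %s' % (nombre, enlace)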
def test_parse_file(self):
    """Parse ugly HTML file"""
    f = filesource('nastytagsoup1.html')
    doc = html.parse(f.source)
    self.assertEqual(len(doc.xml_children), 1)
    self.assertEqual(doc.xml_children[0].xml_type, tree.element.xml_type)
    self.assertEqual(doc.xml_children[0].xml_qname, 'html')
    self.assertEqual(doc.xml_children[0].xml_namespace, None)
    self.assertEqual(doc.xml_children[0].xml_prefix, None)
    self.assertEqual(len(list(doc.html.xml_elements)), 2)
    return
def __init__(self):
    self._doc = html.parse(URL)
    xpath_pc_1 = u'//*[@id="atfResults"]/div'
    xpath_pc_2 = u'//*[@id="btfResults"]/div'
    self._lista_pc = []
    for nodo in self._doc.xml_select(xpath_pc_1):
        self._lista_pc.append(Catalogo(nodo))
    for nodo in self._doc.xml_select(xpath_pc_2):
        self._lista_pc.append(Catalogo(nodo))
def test_reserved_attributes_page_ns():
    EXPECTED = '<h1 xmlns="http://www.w3.org/1999/xhtml" xmlns:h="http://www.w3.org/1999/xhtml" id="akara:metadata">akara:metadata</h1>'
    f = filesource('tagsoup2.html')
    doc = html.parse(f.source, prefixes=XHTML_NSS, use_xhtml_ns=True)
    #import sys; print >> sys.stderr, doc.xml_select(u'*')[0].xml_name
    #import sys; print >> sys.stderr, doc.xml_select(u'//h:div[@id="content"]')[0].xml_first_child
    #content = doc.xml_select(u'//div[@id="content"]//h1')[0]
    #first_h1 = content.xml_select(u'.//h1')[0]
    first_h1 = doc.xml_select(u'//h:div[@id="content"]//h:h1')[0]
    treecompare.check_xml(first_h1.xml_encode(), EXPECTED)
    assert first_h1.id == u'akara:metadata', (first_h1.id, u'akara:metadata')
    return
def rdfascrape(source):
    from amara.lib import inputsource
    source = inputsource(source, None)
    doc = html.parse(source.stream)
    try:
        docuri = doc.html.head.base.href
    except AttributeError:
        #No <base href> in the document; fall back to the source URI
        docuri = source.uri
    statement_elems = doc.xml_select(u'//*[@property|@resource|@rel]')
    triples = (handle_statement(elem, docuri) for elem in statement_elems)
    return triples
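# Illustrative only (not from the original module): a sketch of consuming the
# generator returned by rdfascrape(), assuming handle_statement() yields one
# RDFa statement per element and the example URL is reachable.
def print_statements(source='http://www.example.org/somepage.html'):
    for statement in rdfascrape(source):
        print statement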
def test_reserved_attributes_page():
    EXPECTED = '<h1 id="akara:metadata">akara:metadata</h1>'
    f = filesource('tagsoup2.html')
    doc = html.parse(f.source)
    #import sys; print >> sys.stderr, [ d.xml_name for d in doc.xml_select(u'//div') ]
    #import sys; print >> sys.stderr, dict(doc.xml_select(u'//div')[1].xml_attributes)
    #import sys; print >> sys.stderr, doc.xml_select(u'*')[0].xml_name
    #content = doc.xml_select(u'//div[@id="content"]//h1')[0]
    #first_h1 = content.xml_select(u'.//h1')[0]
    #import sys; print >> sys.stderr, doc.xml_select(u'//div[@id="content"]')[0].xml_first_child
    first_h1 = doc.xml_select(u'//div[@id="content"]//h1')[0]
    treecompare.check_xml(first_h1.xml_encode(), EXPECTED)
    assert first_h1.id == u'akara:metadata', (first_h1.id, u'akara:metadata')
    return
def akara_xpath(body, ctype, **params):
    '''
    select - XPath expression to be evaluated against the document
    tidy - 'yes' to tidy HTML, or 'no'

    Sample request:
    curl --request POST --data-binary "@foo.xml" --header "Content-Type: application/xml" "http://localhost:8880/akara.xpath?select=/html/head/title&tidy=yes"
    '''
    if params.get("tidy") == 'yes':
        doc = html.parse(body)
    else:
        doc = amara.parse(body)
    result = simplify(doc.xml_select(params['select'].decode('utf-8')))
    return str(result)
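# Hedged sketch of calling akara_xpath() directly, outside the Akara server.
# The parameter names mirror the docstring's curl example; the sample XML body
# is made up for illustration, and the module-level simplify() helper is
# assumed to be importable alongside akara_xpath().
def akara_xpath_example():
    body = '<html><head><title>Hello</title></head><body/></html>'
    return akara_xpath(body, 'application/xml', select='/html/head/title', tidy='no')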
def rdfascrape(source):
    from amara.lib import inputsource
    source = inputsource(source, None)
    doc = html.parse(source.stream)
    try:
        docuri = doc.html.head.base.href
    except AttributeError:
        #No <base href> in the document; fall back to the source URI
        docuri = source.uri
    #https://github.com/zepheira/amara/issues/8
    #statement_elems = doc.xml_select(u'//*[@property|@resource|@rel]')
    statement_elems = chain(doc.xml_select(u'//*[@property]'),
                            doc.xml_select(u'//*[@resource]'),
                            doc.xml_select(u'//*[@rel]'))
    triples = (handle_statement(elem, docuri) for elem in statement_elems)
    return triples
def test_tagsoup1(self):
    """Test RDFa interpretation from tagsoup"""
    f = filesource('tagsouprdfa1.html')
    doc = html.parse(f.source)
    h = doc.xml_select(u'//h1')[0]
    self.assertEqual(h.property, u'dc:title')
    self.assertEqual(h.xml_attributes[None, u'property'], u'dc:title')
    #print h.xml_namespaces.copy()[u'xml']
    #print h.xml_namespaces.copy()
    self.assertEqual(h.xml_namespaces.copy()[u'xml'], u'http://www.w3.org/XML/1998/namespace')
    self.assertEqual(h.xml_namespaces[u'xml'], u'http://www.w3.org/XML/1998/namespace')
    self.assertEqual(h.xml_namespaces[u'd'], u'http://purl.org/dc/elements/1.1/')
    self.assertEqual(h.xml_namespaces[u'xlink'], u'http://www.w3.org/1999/xlink')
    self.assertEqual(h.xml_namespaces[u'mml'], u'http://www.w3.org/1998/Math/MathML')
    self.assertEqual(h.xml_namespaces[u'xs'], u'http://www.w3.org/2001/XMLSchema')
    self.assertEqual(h.xml_namespaces[u'aml'], u'http://topazproject.org/aml/')
    return
def akara_twc(body, ctype, max=None, html='no'):
    '''
    Take some POSTed markup and return a version with words trimmed, but
    intelligently, with understanding of markup, so that tags are not counted
    and the structure of sub-elements included in the same set is preserved.

    max (query parameter) - the maximum word count of the resulting text
    html (query parameter) - if 'yes', try to parse the input as HTML

    Sample request:
    curl --request POST --data-binary "<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>" --header "Content-Type: application/xml" "http://localhost:8880/akara.twc?max=7"
    '''
    #Raises ValueError
    #Is there a monadic approach we can provide for Akara for error handling? This cries out for "Maybe"
    #(OK OK, the idea of Maybe, but more of the simple expressiveness of assert)
    max_ = int(max) if max else 500
    if html == 'yes':
        doc = htmldoc.parse(body)
    else:
        doc = amara.parse(body)
    return trim_word_count(doc, max_)
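# Hedged sketch of exercising akara_twc() without the HTTP layer, mirroring the
# docstring's curl example; max is passed as a string, as it would arrive from
# a query parameter, and trim_word_count() is assumed to be in scope.
def akara_twc_example():
    body = '<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>'
    return akara_twc(body, 'application/xml', max='7')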
def tidy_content_element(root, check=u'//atom:title|//atom:summary|//atom:content', prefixes=PREFIXES):
    """
    Take all Atom content elements with type=html (i.e. a:title, a:summary or a:content)
    and convert them to type=xhtml.

    This operation mutates root in place.

    Example:

    import amara; from util import tidy_content_element
    A = '<entry xmlns="http://www.w3.org/2005/Atom"><id>urn:bogus:x</id><title type="html">&lt;div&gt;x&lt;p&gt;y&lt;p&gt;&lt;/div&gt;</title></entry>'
    doc = amara.parse(A)
    tidy_content_element(doc)
    doc.xml_write()
    """
    nodes = root.xml_select(check, prefixes)
    for node in nodes:
        if node.xml_select(u'@type = "html"') and node.xml_select(u'string(.)'):
            #unsouped = html.parse('<html xmlns="http://www.w3.org/1999/xhtml">%s</html>'%node.xml_select(u'string(.)').encode('utf-8'), encoding='utf-8')
            unsouped = html.parse('<html>%s</html>'%node.xml_select(u'string(.)').encode('utf-8'), encoding='utf-8')
            unsouped.html.xml_namespaces[None] = XHTML_NAMESPACE
            subtree = element_subtree_iter(unsouped)
            #Grab body before changing the namespaces, since that changes how it's bound
            #After the NS is changed, you'd need to remember to use unsouped.html_.body_
            body = unsouped.html.body
            for e in subtree:
                if isinstance(e, tree.element):
                    e.xml_namespace = XHTML_NAMESPACE
                    #Temporary fixup until bindery can handle namespace change better
                    e.xml_parent.xml_fixup(e)
            #amara.xml_print(unsouped, stream=sys.stderr, indent=True)
            while node.xml_children:
                node.xml_remove(node.xml_first_child)
            node.xml_append(amara.parse('<div xmlns="http://www.w3.org/1999/xhtml"/>').xml_first_child)
            #node.xml_append_fragment('<div xmlns="http://www.w3.org/1999/xhtml"/>')
            for child in body.xml_children:
                node.xml_first_child.xml_append(child)
            node.xml_attributes[None, u'type'] = u'xhtml'
    return root
def charsearch(q=None):
    '''
    name - a string to search for in Unicode information (using http://www.fileformat.info )

    Sample request:
    curl "http://*****:*****@class="list"]//*[starts-with(@class, "row")]')) ) ))
    return buf.getvalue()
def leer_especialidad(self, href):
    URL = href
    doc = html.parse(URL)
    xpath = u'/html/body/form/fieldset[2]/div[2]/div[2]'
    self.especialidad = U(doc.xml_select(xpath))
def dspace_adapter(search=None, id=None):
    '''
    Sample queries:
    curl "http://*****:*****@class="result_table"]//*[@class="article_title"]'):
    for li in islice(doc.xml_select(u'//*[@id="'+RESULTS_DIV+'"]//*[@class="artifact-description"]/..'), 0, maxarticles):
        row = li.xml_parent.xml_parent
        title = li.xml_select(u'.//*[@class="artifact-title"]')[0]
        rel_id = title.a.href.partition(u'/handle/')[2]
        dspace_id = DSPACE_ID_BASE + rel_id
        alt_link = DSPACE_ARTICLE_BASE + u'1721.1/7488'  #Do not quote. DSpace doesn't like that
        #alt_link = DSPACE_ARTICLE_BASE + urllib.quote(u'1721.1/7488', '')
        title = unicode(title)
        summary = unicode(row.xml_select(u'string(.//*[@class="summary"])'))
        updated = unicode(row.xml_select(u'string(.//*[@class="date"])')).strip().partition(u'Published: ')[2]
        #updated = time.strptime(updated, "%m/%d/%Y %H:%M:%S") #2/11/2008 2:20:00 AM
        authors = [ (name.strip(), None, None) for name in unicode(row.xml_select(u'string(.//*[@class="author"]//b)')).split(';') ]
        #Retrieve the DSpace page
        qstr = urllib.urlencode({'verb' : 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': dspace_id})
        url = DSPACE_OAI_ENDPOINT + '?' + qstr
        print >> sys.stderr, url
        #keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]
        doc = bindery.parse(url, model=OAI_MODEL)
        #print >> sys.stderr, list(generate_metadata(doc))
        resources, first_id = metadata_dict(generate_metadata(doc))
        record = doc.OAI_PMH
        resource = resources[first_id]
        authors = [ (a, None, None) for a in resource[u'creator'] ]
        links = [
            (DSPACE_ARTICLE_BASE + rel_id, u'alternate'),
            (u'dspace?id=' + dspace_id, u'self'),
        ]
        elements = [
            E((ATOM_NAMESPACE, u'content'), {u'src': alt_link}),
        ]
        f.append(
            dspace_id,
            U(resource['title']),
            updated=U(resource['date']),
            summary=U(resource['description']),
            authors=authors,
            links=links,
            #categories=categories,
            elements=elements,
        )
    #FIXME: indent
    return f.xml_encode()
def jove_adapter(search=None, id=None):
    '''
    Sample queries:
    curl "http://*****:*****@class="result_table"]//*[@class="article_title"]'):
    for item in islice(doc.xml_select(u'//*[@class="result_table"]//*[@class="article_title"]'), 0, maxarticles):
        row = item.xml_parent.xml_parent
        title = unicode(item)
        alt_link = item.a.href
        summary = unicode(row.xml_select(u'string(.//*[@class="summary"])'))
        updated = unicode(row.xml_select(u'string(.//*[@class="publication_date"])')).strip().partition(u'Published: ')[2]
        #updated = time.strptime(updated, "%m/%d/%Y %H:%M:%S") #2/11/2008 2:20:00 AM
        authors = [ (name.strip(), None, None) for name in unicode(row.xml_select(u'string(.//*[@class="authors"]//b)')).split(',') ]
        keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]
        icon = first_item(row.xml_select(u'.//*[@class="thumbnail"]')).img.src
        icon = ''.join(icon.split())
        jove_id = item.a.href[len(JOVE_ARTICLE):]
        links = [
            (JOVE_ADAPTER_BASE + '?id=' + jove_id, u'self'),
            (icon, u'icon'),
            #(NCBI_HTML_ARTICLE_LINK_BASE + unicode(aid), u'alternate'),
        ]
        #print >> sys.stderr, links
        #categories = [ (unicode(k), SD_NS+u'authorKeyword') for k in authkw(article) ]
        elements = [
            E((ATOM_NAMESPACE, u'content'), {u'src': item.a.href}),
            #E((SD_NS, u'sd:journal-cover'), unicode(article.journalCover).strip() if hasattr(article, 'journalCover') else DEFAULT_ICON),
            #E((SD_NS, u'sd:journal-name'), unicode(article.journalName)),
        ]
        elements.extend([
            #E((ATOM_NAMESPACE, u'link'), {u'rel': u'self', u'href': JOVE_ADAPTER_BASE + '/?id=' + jove_id}),
            E((ATOM_NAMESPACE, u'link'), {u'rel': u'icon', u'href': icon}),
        ])
        f.append(
            item.a.href,
            title,
            updated=datetime.datetime.now().isoformat(),
            summary=summary,
            authors=authors,
            links=links,
            categories=keywords,
            elements=elements,
        )
        #print >> sys.stderr, article.xml_select(u'//*[contains(name(), "journal")]')
        #entry['journal_cover'] =
    for e in f.feed.entry:
        ENTRY_CACHE[jove_id] = e.xml_encode()
    #FIXME: indent
    return f.xml_encode()
def test_simple_attr_update3():
    EXPECTED = """<html xmlns="http://www.w3.org/1999/xhtml"><head><title>HELLO</title></head><body><p>WORLD</body></html>"""
    doc = html.parse('<n:a xmlns:n="urn:bogus:x" x="1"/>')
    doc.a.x = unicode(int(doc.a.x) + 1)
    treecompare.check_xml(doc.xml_encode(), XMLDECL + EXPECTED)
    return
def crawl(self):
    number_of_pages_to_crawl = self.limit
    crawl_limit = self.limit
    if self.output_filename is not None:
        self.write_fd = open(self.output_filename, "w")
    if self.limit is None:
        crawl_limit = "Infinite"
        number_of_pages_to_crawl = 2**20
    for i in range(0, number_of_pages_to_crawl):
        try:
            url = self.seeds_queue.get(block=False)
        except Queue.Empty:
            break
        try:
            html_output = requests.get(url).text
        except (requests.exceptions.ConnectionError, requests.exceptions.InvalidURL) as e:
            frameinfo = getframeinfo(currentframe())
            self.error_code = 1
            self.error_message = "Exception:" + frameinfo.filename + ":%d." % (frameinfo.lineno) + " Invalid base url: " + url
            if self.verbose and url == self.baseurl:
                print bcolors.FAIL + "Exception:" + frameinfo.filename, "line number:%d." % (frameinfo.lineno), "Invalid base url: " + url + bcolors.ENDC
            continue
        html_output = html_output.encode('utf-8')
        source = html.inputsource(arg=html_output, sourcetype=1)
        self.sites_already_crawled.append(url)
        if self.verbose:
            print "Crawling %s, %d of %s." % (url, i+1, str(crawl_limit))
        if self.write_fd:
            self.write_fd.write("Crawling %s, %d of %s." % (url, i+1, str(crawl_limit)) + "\n")
        try:
            doc = html.parse(html_output)
        except ValueError:
            continue
        href_repo_list = list()
        hrefs = doc.xml_select(u"//a/@href")
        for href in hrefs:
            if (not self.keep_inpage_ref) and href.xml_value.startswith("#"):
                continue
            if not href.xml_value.startswith("http") or href.xml_value.startswith("/"):
                href.xml_value = self.baseurl + href.xml_value
            if href.xml_value.endswith("/"):
                href.xml_value = href.xml_value[:-1]
            if (not self.keep_duplicate_links) and (href.xml_value not in href_repo_list):
                href_repo_list.append(href.xml_value)
            if self.keep_duplicate_links:
                href_repo_list.append(href.xml_value)
            if (not self.recrawl_pages) and (href.xml_value not in self.sites_already_crawled):
                self.seeds_queue.put(href.xml_value)
            if self.recrawl_pages:
                self.seeds_queue.put(href.xml_value)
        page_href_dict = dict()
        page_href_dict["url"] = url
        page_href_dict["href_repo_list"] = href_repo_list
        self.crawled_list.append(page_href_dict)
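# Standalone sketch (not part of the crawler class) of the link-normalization
# rules applied per href in crawl() above: in-page fragments may be skipped,
# relative URLs are joined to the base URL, and trailing slashes are stripped.
# The function name and parameters are illustrative only.
def normalize_href(href, baseurl, keep_inpage_ref=False):
    if (not keep_inpage_ref) and href.startswith("#"):
        return None
    if not href.startswith("http") or href.startswith("/"):
        href = baseurl + href
    if href.endswith("/"):
        href = href[:-1]
    return href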