def __init__(self):
    self._doc = html.parse(URL)
    xpath_cursos = u'/html/body/form/table/tbody/tr'  # the first element is the header row
    self._lista_cursos = []
    for nodo in self._doc.xml_select(xpath_cursos)[1:]:
        self._lista_cursos.append(Curso(nodo))
def tidy(body, ctype):
    '''
    Tidy arbitrary HTML (using html5lib)

    Sample request:
    curl --request POST --data-binary "<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>" --header "Content-Type: application/xml" "http://localhost:8880/akara.tidy"
    '''
    doc = htmldoc.parse(body)
    return doc
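# Hedged sketch (not part of the original module) of calling tidy() directly with
# the docstring's sample body, bypassing the Akara HTTP front end; it assumes
# htmldoc refers to amara.bindery.html as in the surrounding service module.
def tidy_example():
    body = '<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>'
    return tidy(body, 'application/xml')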
def extrae_nodos():
    from amara.bindery import html
    doc = html.parse('http://www.agrega2.es/web/')
    nodos = doc.xml_select(u'//div[@id="block-views-nodos-de-agrega-block-1"]//li//a')
    nodos_agrega = dict([(unicode(n), n.href) for n in nodos])
    return nodos_agrega
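# Minimal usage sketch for extrae_nodos() above (illustrative only, not from the
# original module): assumes amara is installed and the agrega2.es page is
# reachable. The returned dict maps each link's text to its href attribute.
def imprime_nodos():
    nodos = extrae_nodos()
    for nombre, enlace in sorted(nodos.items()):
        print u'%s -> %s' % (nombre, enlace)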
def test_parse_file(self):
    """Parse ugly HTML file"""
    f = filesource('nastytagsoup1.html')
    doc = html.parse(f.source)
    self.assertEqual(len(doc.xml_children), 1)
    self.assertEqual(doc.xml_children[0].xml_type, tree.element.xml_type)
    self.assertEqual(doc.xml_children[0].xml_qname, 'html')
    self.assertEqual(doc.xml_children[0].xml_namespace, None)
    self.assertEqual(doc.xml_children[0].xml_prefix, None)
    self.assertEqual(len(list(doc.html.xml_elements)), 2)
    return
def __init__(self):
    self._doc = html.parse(URL)
    xpath_pc_1 = u'//*[@id="atfResults"]/div'
    xpath_pc_2 = u'//*[@id="btfResults"]/div'
    self._lista_pc = []
    for nodo in self._doc.xml_select(xpath_pc_1):
        self._lista_pc.append(Catalogo(nodo))
    for nodo in self._doc.xml_select(xpath_pc_2):
        self._lista_pc.append(Catalogo(nodo))
def test_reserved_attributes_page_ns():
    EXPECTED = '<h1 xmlns="http://www.w3.org/1999/xhtml" xmlns:h="http://www.w3.org/1999/xhtml" id="akara:metadata">akara:metadata</h1>'
    f = filesource('tagsoup2.html')
    doc = html.parse(f.source, prefixes=XHTML_NSS, use_xhtml_ns=True)
    #import sys; print >> sys.stderr, doc.xml_select(u'*')[0].xml_name
    #import sys; print >> sys.stderr, doc.xml_select(u'//h:div[@id="content"]')[0].xml_first_child
    #content = doc.xml_select(u'//div[@id="content"]//h1')[0]
    #first_h1 = content.xml_select(u'.//h1')[0]
    first_h1 = doc.xml_select(u'//h:div[@id="content"]//h:h1')[0]
    treecompare.check_xml(first_h1.xml_encode(), EXPECTED)
    assert first_h1.id == u'akara:metadata', (first_h1.id, u'akara:metadata')
    return
def rdfascrape(source):
    from amara.lib import inputsource
    source = inputsource(source, None)
    doc = html.parse(source.stream)
    try:
        docuri = doc.html.head.base.href
    except AttributeError:
        #No <base href> in the document; fall back to the source URI
        docuri = source.uri
    statement_elems = doc.xml_select(u'//*[@property|@resource|@rel]')
    triples = (handle_statement(elem, docuri) for elem in statement_elems)
    return triples
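# Illustrative only (not from the original module): a sketch of consuming the
# generator returned by rdfascrape(), assuming handle_statement() yields one
# RDFa statement per element and the example URL is reachable.
def print_statements(source='http://www.example.org/somepage.html'):
    for statement in rdfascrape(source):
        print statement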
def test_reserved_attributes_page():
    EXPECTED = '<h1 id="akara:metadata">akara:metadata</h1>'
    f = filesource('tagsoup2.html')
    doc = html.parse(f.source)
    #import sys; print >> sys.stderr, [ d.xml_name for d in doc.xml_select(u'//div') ]
    #import sys; print >> sys.stderr, dict(doc.xml_select(u'//div')[1].xml_attributes)
    #import sys; print >> sys.stderr, doc.xml_select(u'*')[0].xml_name
    #content = doc.xml_select(u'//div[@id="content"]//h1')[0]
    #first_h1 = content.xml_select(u'.//h1')[0]
    #import sys; print >> sys.stderr, doc.xml_select(u'//div[@id="content"]')[0].xml_first_child
    first_h1 = doc.xml_select(u'//div[@id="content"]//h1')[0]
    treecompare.check_xml(first_h1.xml_encode(), EXPECTED)
    assert first_h1.id == u'akara:metadata', (first_h1.id, u'akara:metadata')
    return
def akara_xpath(body, ctype, **params):
    '''
    select - XPath expression to be evaluated against the document
    tidy - 'yes' to tidy HTML, or 'no'

    Sample request:
    curl --request POST --data-binary "@foo.xml" --header "Content-Type: application/xml" "http://localhost:8880/akara.xpath?select=/html/head/title&tidy=yes"
    '''
    if params.get("tidy") == 'yes':
        doc = html.parse(body)
    else:
        doc = amara.parse(body)
    result = simplify(doc.xml_select(params['select'].decode('utf-8')))
    return str(result)
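# Hedged sketch of calling akara_xpath() directly, outside the Akara server.
# The parameter names mirror the docstring's curl example; the sample XML body
# is made up for illustration, and the module-level simplify() helper is
# assumed to be importable alongside akara_xpath().
def akara_xpath_example():
    body = '<html><head><title>Hello</title></head><body/></html>'
    return akara_xpath(body, 'application/xml', select='/html/head/title', tidy='no')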
def rdfascrape(source):
    from amara.lib import inputsource
    source = inputsource(source, None)
    doc = html.parse(source.stream)
    try:
        docuri = doc.html.head.base.href
    except AttributeError:
        #No <base href> in the document; fall back to the source URI
        docuri = source.uri
    #https://github.com/zepheira/amara/issues/8
    #statement_elems = doc.xml_select(u'//*[@property|@resource|@rel]')
    statement_elems = chain(doc.xml_select(u'//*[@property]'),
                            doc.xml_select(u'//*[@resource]'),
                            doc.xml_select(u'//*[@rel]'))
    triples = (handle_statement(elem, docuri) for elem in statement_elems)
    return triples
def test_tagsoup1(self):
    """Test RDFa interpretation from tagsoup"""
    f = filesource('tagsouprdfa1.html')
    doc = html.parse(f.source)
    h = doc.xml_select(u'//h1')[0]
    self.assertEqual(h.property, u'dc:title')
    self.assertEqual(h.xml_attributes[None, u'property'], u'dc:title')
    #print h.xml_namespaces.copy()[u'xml']
    #print h.xml_namespaces.copy()
    self.assertEqual(h.xml_namespaces.copy()[u'xml'], u'http://www.w3.org/XML/1998/namespace')
    self.assertEqual(h.xml_namespaces[u'xml'], u'http://www.w3.org/XML/1998/namespace')
    self.assertEqual(h.xml_namespaces[u'd'], u'http://purl.org/dc/elements/1.1/')
    self.assertEqual(h.xml_namespaces[u'xlink'], u'http://www.w3.org/1999/xlink')
    self.assertEqual(h.xml_namespaces[u'mml'], u'http://www.w3.org/1998/Math/MathML')
    self.assertEqual(h.xml_namespaces[u'xs'], u'http://www.w3.org/2001/XMLSchema')
    self.assertEqual(h.xml_namespaces[u'aml'], u'http://topazproject.org/aml/')
    return
def akara_twc(body, ctype, max=None, html='no'):
    '''
    Take some POSTed markup and return a version with words trimmed, but
    intelligently, with understanding of markup, so that tags are not counted
    and the structure of sub-elements included in the same set is preserved.

    max (query parameter) - the maximum word count of the resulting text
    html (query parameter) - if 'yes', try to parse the input as HTML

    Sample request:
    curl --request POST --data-binary "<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>" --header "Content-Type: application/xml" "http://localhost:8880/akara.twc?max=7"
    '''
    #Raises ValueError
    #Is there a monadic approach we can provide for Akara for error handling? This cries out for "Maybe"
    #(OK OK, the idea of Maybe, but more of the simple expressiveness of assert)
    max_ = int(max) if max else 500
    if html == 'yes':
        doc = htmldoc.parse(body)
    else:
        doc = amara.parse(body)
    return trim_word_count(doc, max_)
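# Hedged sketch of exercising akara_twc() without the HTTP layer, mirroring the
# docstring's curl example; max is passed as a string, as it would arrive from
# a query parameter, and trim_word_count() is assumed to be in scope.
def akara_twc_example():
    body = '<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>'
    return akara_twc(body, 'application/xml', max='7')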
def tidy_content_element(root, check=u'//atom:title|//atom:summary|//atom:content', prefixes=PREFIXES):
    """
    Take all Atom content elements with type=html (i.e. a:title, a:summary or a:content)
    and convert them to type=xhtml.

    This operation mutates root in place.

    Example:

    import amara; from util import tidy_content_element
    A = '<entry xmlns="http://www.w3.org/2005/Atom"><id>urn:bogus:x</id><title type="html">&lt;div&gt;x&lt;p&gt;y&lt;p&gt;&lt;/div&gt;</title></entry>'
    doc = amara.parse(A)
    tidy_content_element(doc)
    doc.xml_write()
    """
    nodes = root.xml_select(check, prefixes)
    for node in nodes:
        if node.xml_select(u'@type = "html"') and node.xml_select(u'string(.)'):
            #unsouped = html.parse('<html xmlns="http://www.w3.org/1999/xhtml">%s</html>'%node.xml_select(u'string(.)').encode('utf-8'), encoding='utf-8')
            unsouped = html.parse('<html>%s</html>'%node.xml_select(u'string(.)').encode('utf-8'), encoding='utf-8')
            unsouped.html.xml_namespaces[None] = XHTML_NAMESPACE
            subtree = element_subtree_iter(unsouped)
            #Grab body before changing the namespaces, since that changes how it's bound
            #After the NS is changed, you'd need to remember to use unsouped.html_.body_
            body = unsouped.html.body
            for e in subtree:
                if isinstance(e, tree.element):
                    e.xml_namespace = XHTML_NAMESPACE
                    #Temporary fixup until bindery can handle namespace change better
                    e.xml_parent.xml_fixup(e)
            #amara.xml_print(unsouped, stream=sys.stderr, indent=True)
            while node.xml_children:
                node.xml_remove(node.xml_first_child)
            node.xml_append(amara.parse('<div xmlns="http://www.w3.org/1999/xhtml"/>').xml_first_child)
            #node.xml_append_fragment('<div xmlns="http://www.w3.org/1999/xhtml"/>')
            for child in body.xml_children:
                node.xml_first_child.xml_append(child)
            node.xml_attributes[None, u'type'] = u'xhtml'
    return root
def charsearch(q=None):
    '''
    name - a string to search for in Unicode information (using http://www.fileformat.info )

    Sample request:
    curl "http://*****:*****@class="list"]//*[starts-with(@class, "row")]')) ) ))
    return buf.getvalue()
def leer_especialidad(self, href):
    URL = href
    doc = html.parse(URL)
    xpath = u'/html/body/form/fieldset[2]/div[2]/div[2]'
    self.especialidad = U(doc.xml_select(xpath))
def dspace_adapter(search=None, id=None):
    '''
    Sample queries:
    curl "http://*****:*****@class="result_table"]//*[@class="article_title"]'):
    for li in islice(doc.xml_select(u'//*[@id="'+RESULTS_DIV+'"]//*[@class="artifact-description"]/..'), 0, maxarticles):
        row = li.xml_parent.xml_parent
        title = li.xml_select(u'.//*[@class="artifact-title"]')[0]
        rel_id = title.a.href.partition(u'/handle/')[2]
        dspace_id = DSPACE_ID_BASE + rel_id
        alt_link = DSPACE_ARTICLE_BASE + u'1721.1/7488'  #Do not quote. DSpace doesn't like that
        #alt_link = DSPACE_ARTICLE_BASE + urllib.quote(u'1721.1/7488', '')
        title = unicode(title)
        summary = unicode(row.xml_select(u'string(.//*[@class="summary"])'))
        updated = unicode(row.xml_select(u'string(.//*[@class="date"])')).strip().partition(u'Published: ')[2]
        #updated = time.strptime(updated, "%m/%d/%Y %H:%M:%S") #2/11/2008 2:20:00 AM
        authors = [ (name.strip(), None, None) for name in unicode(row.xml_select(u'string(.//*[@class="author"]//b)')).split(';') ]
        #Retrieve the DSpace page
        qstr = urllib.urlencode({'verb' : 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': dspace_id})
        url = DSPACE_OAI_ENDPOINT + '?' + qstr
        print >> sys.stderr, url
        #keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]
        doc = bindery.parse(url, model=OAI_MODEL)
        #print >> sys.stderr, list(generate_metadata(doc))
        resources, first_id = metadata_dict(generate_metadata(doc))
        record = doc.OAI_PMH
        resource = resources[first_id]
        authors = [ (a, None, None) for a in resource[u'creator'] ]
        links = [
            (DSPACE_ARTICLE_BASE + rel_id, u'alternate'),
            (u'dspace?id=' + dspace_id, u'self'),
        ]
        elements = [
            E((ATOM_NAMESPACE, u'content'), {u'src': alt_link}),
        ]
        f.append(
            dspace_id,
            U(resource['title']),
            updated=U(resource['date']),
            summary=U(resource['description']),
            authors=authors,
            links=links,
            #categories=categories,
            elements=elements,
        )
    #FIXME: indent
    return f.xml_encode()
def jove_adapter(search=None, id=None):
    '''
    Sample queries:
    curl "http://*****:*****@class="result_table"]//*[@class="article_title"]'):
    for item in islice(doc.xml_select(u'//*[@class="result_table"]//*[@class="article_title"]'), 0, maxarticles):
        row = item.xml_parent.xml_parent
        title = unicode(item)
        alt_link = item.a.href
        summary = unicode(row.xml_select(u'string(.//*[@class="summary"])'))
        updated = unicode(row.xml_select(u'string(.//*[@class="publication_date"])')).strip().partition(u'Published: ')[2]
        #updated = time.strptime(updated, "%m/%d/%Y %H:%M:%S") #2/11/2008 2:20:00 AM
        authors = [ (name.strip(), None, None) for name in unicode(row.xml_select(u'string(.//*[@class="authors"]//b)')).split(',') ]
        keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]
        icon = first_item(row.xml_select(u'.//*[@class="thumbnail"]')).img.src
        icon = ''.join(icon.split())
        jove_id = item.a.href[len(JOVE_ARTICLE):]
        links = [
            (JOVE_ADAPTER_BASE + '?id=' + jove_id, u'self'),
            (icon, u'icon'),
            #(NCBI_HTML_ARTICLE_LINK_BASE + unicode(aid), u'alternate'),
        ]
        #print >> sys.stderr, links
        #categories = [ (unicode(k), SD_NS+u'authorKeyword') for k in authkw(article) ]
        elements = [
            E((ATOM_NAMESPACE, u'content'), {u'src': item.a.href}),
            #E((SD_NS, u'sd:journal-cover'), unicode(article.journalCover).strip() if hasattr(article, 'journalCover') else DEFAULT_ICON),
            #E((SD_NS, u'sd:journal-name'), unicode(article.journalName)),
        ]
        elements.extend([
            #E((ATOM_NAMESPACE, u'link'), {u'rel': u'self', u'href': JOVE_ADAPTER_BASE + '/?id=' + jove_id}),
            E((ATOM_NAMESPACE, u'link'), {u'rel': u'icon', u'href': icon}),
        ])
        f.append(
            item.a.href,
            title,
            updated=datetime.datetime.now().isoformat(),
            summary=summary,
            authors=authors,
            links=links,
            categories=keywords,
            elements=elements,
        )
        #print >> sys.stderr, article.xml_select(u'//*[contains(name(), "journal")]')
        #entry['journal_cover'] =
    for e in f.feed.entry:
        ENTRY_CACHE[jove_id] = e.xml_encode()
    #FIXME: indent
    return f.xml_encode()
def test_simple_attr_update3():
    EXPECTED = """<html xmlns="http://www.w3.org/1999/xhtml"><head><title>HELLO</title></head><body><p>WORLD</body></html>"""
    doc = html.parse('<n:a xmlns:n="urn:bogus:x" x="1"/>')
    doc.a.x = unicode(int(doc.a.x) + 1)
    treecompare.check_xml(doc.xml_encode(), XMLDECL + EXPECTED)
    return
def crawl(self):
    number_of_pages_to_crawl = self.limit
    crawl_limit = self.limit
    if self.output_filename is not None:
        self.write_fd = open(self.output_filename, "w")
    if self.limit is None:
        crawl_limit = "Infinite"
        number_of_pages_to_crawl = 2**20
    for i in range(0, number_of_pages_to_crawl):
        try:
            url = self.seeds_queue.get(block=False)
        except Queue.Empty:
            break
        try:
            html_output = requests.get(url).text
        except (requests.exceptions.ConnectionError, requests.exceptions.InvalidURL) as e:
            frameinfo = getframeinfo(currentframe())
            self.error_code = 1
            self.error_message = "Exception:" + frameinfo.filename + ":%d." % (frameinfo.lineno) + " Invalid base url: " + url
            if self.verbose and url == self.baseurl:
                print bcolors.FAIL + "Exception:" + frameinfo.filename, "line number:%d." % (frameinfo.lineno), "Invalid base url: " + url + bcolors.ENDC
            continue
        html_output = html_output.encode('utf-8')
        source = html.inputsource(arg=html_output, sourcetype=1)
        self.sites_already_crawled.append(url)
        if self.verbose:
            print "Crawling %s, %d of %s." % (url, i+1, str(crawl_limit))
        if self.write_fd:
            self.write_fd.write("Crawling %s, %d of %s." % (url, i+1, str(crawl_limit)) + "\n")
        try:
            doc = html.parse(html_output)
        except ValueError:
            continue
        href_repo_list = list()
        hrefs = doc.xml_select(u"//a/@href")
        for href in hrefs:
            if (not self.keep_inpage_ref) and href.xml_value.startswith("#"):
                continue
            if not href.xml_value.startswith("http") or href.xml_value.startswith("/"):
                href.xml_value = self.baseurl + href.xml_value
            if href.xml_value.endswith("/"):
                href.xml_value = href.xml_value[:-1]
            if (not self.keep_duplicate_links) and (href.xml_value not in href_repo_list):
                href_repo_list.append(href.xml_value)
            if self.keep_duplicate_links:
                href_repo_list.append(href.xml_value)
            if (not self.recrawl_pages) and (href.xml_value not in self.sites_already_crawled):
                self.seeds_queue.put(href.xml_value)
            if self.recrawl_pages:
                self.seeds_queue.put(href.xml_value)
        page_href_dict = dict()
        page_href_dict["url"] = url
        page_href_dict["href_repo_list"] = href_repo_list
        self.crawled_list.append(page_href_dict)
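# Standalone sketch (not part of the crawler class) of the link-normalization
# rules applied per href in crawl() above: in-page fragments may be skipped,
# relative URLs are joined to the base URL, and trailing slashes are stripped.
# The function name and parameters are illustrative only.
def normalize_href(href, baseurl, keep_inpage_ref=False):
    if (not keep_inpage_ref) and href.startswith("#"):
        return None
    if not href.startswith("http") or href.startswith("/"):
        href = baseurl + href
    if href.endswith("/"):
        href = href[:-1]
    return href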