def htmldiff(path1, path2):
    tree1 = parse(path1).getroot()
    tree2 = parse(path2).getroot()
    elementsA_hash = {}
    elementsB_hash = {}
    isLeafNodeA = {}
    isLeafNodeB = {}
    outputDict = {}
    numberOfChanges = 0
    hashNodes(tree1, elementsA_hash, isLeafNodeA, False, None)
    hashNodes(tree2, elementsB_hash, isLeafNodeB, True, elementsA_hash)
    sameKeys = findSimilarNodes(elementsA_hash, elementsB_hash)
    for key, value in elementsA_hash.iteritems():
        output = {}
        try:
            if key in sameKeys:
                continue
            isSameNode(elementsA_hash[key], elementsB_hash[key], isLeafNodeA[key], outputDict)
        except KeyError as e:
            node = elementsA_hash[key]
            # print 'I got a KeyError - reason "%s"' % str(e)
            output['afterText'] = ""
            output['afterAttribute'] = ""
            output['rawTextChange'] = node.text
            output['rawAttributeChange'] = node.attrib
            output['elementType'] = node.tag
            output['op'] = ""
            output['fullText'] = ""
            output['otherInfo'] = "DELETED NODE"
            outputDict[getDiffOutputNumber(outputDict)] = output
    for key, value in elementsB_hash.iteritems():
        output = {}
        try:
            # Check to see if this node exists in the original file
            tempNode = elementsA_hash[key]
        except KeyError as e:
            node = elementsB_hash[key]
            # print 'I got a KeyError - reason "%s"' % str(e)
            output['afterText'] = node.text
            output['afterAttribute'] = node.attrib
            output['rawTextChange'] = node.text
            output['rawAttributeChange'] = node.attrib
            output['elementType'] = node.tag
            output['op'] = ""
            output['fullText'] = ""
            output['otherInfo'] = "ADDED NODE"
            outputDict[getDiffOutputNumber(outputDict)] = output
    print getDiffOutputNumber(outputDict)
    return outputDict
def getfields(keys):
    for page in ['system', 'signal', 'status']:
        root = parse(fetch("%s/%s.asp" % (host, page)))
        for x in root.xpath(".//tr"):
            fields = [totext(t) for t in x.xpath('./td')]
            if "%s/%s" % (page, fields[0]) in keys:
                print fields[0], fields[1]
def from_str(self, html_str, partial=False) -> HtmlNode:
    '''
    Create an `HtmlNode` from a string.

    Keyword Arguments:
        partial: If True, the provided string is treated as an HTML fragment
            and wrapped in a full document before parsing.
    '''
    if partial:
        html_str = f"<html><body>{html_str}</body></html>"
        lxml_tree = soupparser.parse(StringIO(html_str))
        body = lxml_tree.getroot().find('body')
        return HtmlNode(list(body)[0])
    else:
        lxml_tree = soupparser.parse(StringIO(html_str))
        body = lxml_tree.getroot()
        return HtmlNode(body)
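# A minimal standalone sketch of the technique used by the partial branch above
# (the HtmlNode wrapper is not reproduced here; `fragment` and `div` are
# hypothetical names for illustration). BeautifulSoup-backed parsing always
# yields a full document, so a fragment has to be wrapped in <html><body> and
# then pulled back out of <body>.
from io import StringIO
from lxml.html import soupparser

fragment = "<div class='teaser'>hello</div>"
tree = soupparser.parse(StringIO("<html><body>%s</body></html>" % fragment))
div = list(tree.getroot().find('body'))[0]  # the original fragment element
print(div.tag, div.get('class'))            # -> div teaser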
def _getElementTreeRoot(self, url):
    import lxml.html.soupparser as soupparser
    import urllib.request

    _url = urllib.request.urlopen(url)
    tree = soupparser.parse(_url)
    return tree.getroot()
def readXMLlxml(filename):
    from lxml.html import soupparser

    tree = soupparser.parse(filename)
    root = tree.getroot()
    for child in root:
        yield child.tag, child.attrib
def fetch(url):
    # url to etree
    try:
        f = urlopen(url)
    except:
        return '[!] unable to open %s' % url
    return parse(f)
def unmeta(url, res):
    """ Finds any meta redirects in a httplib.response object that has text/html as content-type.

    Args:
        url (str): The url to follow one redirect
        res (httplib.response): a http.response object

    Returns:
        (str). The resolved url
    """
    if res and (res.getheader('Content-type') or "").startswith('text/html'):
        size = 65535
        if res.getheader('Content-Length'):
            try:
                tmp = int(res.getheader('Content-length'))
                if tmp < 65535:
                    size = tmp
            except:
                print "wrong content-length:", res.getheader('Content-length')
        root = parse(StringIO(res.read(size)))
        for x in root.xpath('//meta[@http-equiv="refresh"]'):
            newurl = x.get('content').split(';')
            if len(newurl) > 1:
                newurl = newurl[1].strip()[4:]
                parts = httplib.urlsplit(urllib.unquote_plus(newurl))
                if parts.scheme and parts.netloc:
                    url = newurl
    return weedparams(url)
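# Hedged sketch of the meta-refresh extraction step used above, runnable on its
# own with lxml (the HTML string and variable names here are made up for
# illustration; the real function reads from an httplib response instead).
from io import StringIO
from lxml.html import parse

html = '<html><head><meta http-equiv="refresh" content="0; url=http://example.com/next"></head></html>'
root = parse(StringIO(html))
for x in root.xpath('//meta[@http-equiv="refresh"]'):
    content = x.get('content').split(';')
    if len(content) > 1:
        target = content[1].strip()[4:]  # drop the leading "url="
        print(target)                    # -> http://example.com/next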
def fetch(url):
    # url to etree
    print >> sys.stderr, url
    f = urllib2.urlopen(url)
    raw = parse(f)
    f.close()
    return raw
def interfaces():
    print "Interface Name\tProvisioned\tState\tSpeed\tMAC Address"
    root = parse(fetch("%s/status.asp" % host))
    for x in root.xpath(".//tr"):
        fields = [totext(t) for t in x.xpath('./td')]
        if "status/%s" % fields[0] in ifs:
            print fields[0], '\t', '\t'.join(fields[1:])
def __parseto_xtree(self, xhtml_s):
    if isinstance(xhtml_s, dict):
        base_url = xhtml_s.pop("base_url", None)
        # print "IN"
        print base_url
        resolve_base = xhtml_s.pop("resolve_base", True)
        clean_xhtml = xhtml_s.pop("clean_xhtml", False)
        xhtml_s = xhtml_s.pop("xhtml_s", None)
        assert xhtml_s,\
            "LinkExtractor.__parseto_xtree() Error: Dictionary with <None> xhtml source"
    elif isinstance(xhtml_s, str):
        clean_xhtml = False
        base_url = None
    else:
        raise Exception(
            "LinkExtractor.__parseto_xtree() Error: string or dictionary instance expected"
        )
    if clean_xhtml:
        xhtml_clr = html_clr(
            scripts=True, javascript=True, comments=True, style=True,
            links=True, meta=True, page_structure=False, processing_instructions=True,
            embedded=True, annoying_tags=True, remove_unknown_tags=True
        )  # meta=False because we need MetaInfo
        xhtml_s = xhtml_clr.clean_html(xhtml_s)
    # The HTMLParser(s) should be defined in the thread (or process) where lxml.html.parse is dispatched
    htmlparser = lxml.html.HTMLParser(recover=True, no_network=False)  # recover mode and DTD download enabled
    # Now parse the XHTML source
    try:
        etree = lxml.html.parse(StringIO(xhtml_s), parser=htmlparser)
    except Exception as e:
        print("LinkExtractor Error: %s" % e)
        print("LinkExtractor: Now trying with the SOUP parser")
        try:
            etree = soup.parse(xhtml_s)
        except Exception as e:
            raise Exception("LinkExtractor Error: %s" % e)
    if base_url:
        eroot = etree.getroot()
        try:
            eroot.make_links_absolute(base_url, resolve_base_href=resolve_base)
        except Exception as e:
            raise Exception(
                "LinkExtractor.__parseto_xtree() while making links absolute Error: %s" % e
            )
    # Return the etree just created
    return etree
def dump(pages=['system', 'signal', 'status', 'log', 'emta']):
    for page in pages:
        root = parse(fetch("%s/%s.asp" % (host, page)))
        for x in root.xpath(".//tr"):
            fields = [totext(t) for t in x.xpath('./td')]
            if filter(None, fields) and fields != ['']:
                print ':'.join(fields)
        print
def fetch(url, retries=5, ignore=[], params=None):
    try:
        return parse(fetch_raw(url, retries, ignore, params))
    except:
        if retries > 0:
            time.sleep(4 * (6 - retries))
            return fetch(url, retries - 1, ignore=ignore)
        else:
            raise
def main(basedir='', outdir='', infile='', outfile='', sexpr='.SearchResults'):
    """basic setup from directory"""
    os.chdir(basedir)
    if not outfile:
        outfile = os.path.splitext(infile)[0] + '.csv'
    doc = soupparser.parse(infile)
    table = selecttable(doc, sexpr)
    rows = table2csv(table)
    write_csv(outfile, rows)
def munin():
    muninfields = ["system/Receive Power Level",
                   "system/Transmit Power Level",
                   "signal/Signal to Noise Ratio"]
    for page in ['system', 'signal', 'status']:
        root = parse(fetch("%s/%s.asp" % (host, page)))
        for x in root.xpath(".//tr"):
            fields = [totext(t) for t in x.xpath('./td')]
            key = "%s/%s" % (page, fields[0])
            if key in muninfields:
                print "%s.value %s" % (fields[0].lower().replace(' ', '_'), split(fields[1]))
def main():
    (options, args) = parseOpts()
    print "Fetch EPG data from '%s'." % get_url(options)
    raw = urllib2.urlopen(get_url(options), 'utf-8')
    content = parse(raw)
    exporter = XmltvExporter(options.output)
    parser = OtrParser(exporter)
    parser.parse(content)
    exporter.write()
def nextPage(req):
    response = opener.open(req)
    tree = parse(response)
    map(
        scrape,
        ["http://www.europarl.europa.eu/oeil/" + x.get("href")
         for x in tree.xpath('//a[@class="com_acronym"]')]
    )
    img = tree.xpath('//a/img[@src="img/cont/activities/navigation/navi_next_activities.gif"]')
    if len(img):
        next = "http://www.europarl.europa.eu/" + img[0].xpath("..")[0].get("href")
        print >>sys.stderr, ("retrieving next page")
        nextPage(next)
def title(request):
    url = request.GET.get('url', None)
    if url is None:
        return HttpResponseBadRequest()
    else:
        soup = parse(urlopen(url))
        title = soup.find('.//title').text
        return HttpResponse(dumps({
            'url': url,
            'title': title,
        }))
def fetch(url):
    # url to etree
    try:
        f = urllib2.urlopen(url)
    except urllib2.HTTPError:
        try:
            f = urllib2.urlopen(url)
        except urllib2.HTTPError:
            try:
                f = urllib2.urlopen(url)
            except urllib2.HTTPError:
                return ''
    return parse(f)
def fetchVotes(d):
    url = "%s%s%s" % ("http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+PV+",
                      d,
                      "+RES-RCV+DOC+WORD+V0//EN&language=EN")
    print >>sys.stderr, url
    f = urllib2.urlopen(url)
    tmp = mkstemp()
    fd = os.fdopen(tmp[0], 'w')
    fd.write(f.read())
    fd.close()
    f.close()
    res = subprocess.Popen(['/usr/bin/wvHtml', tmp[1], '-'],
                           stdout=subprocess.PIPE).communicate()[0]
    os.unlink(tmp[1])
    return parse(StringIO(res))
def fetch_broken_html(self, *args, **kwargs):
    '''
    like ``fetch_html`` with even more relaxed parsing by using
    ``BeautifulSoup`` as our parser
    '''
    from lxml.html import soupparser
    from eureka.xml import HTMLParser

    with self.fetch(*args, **kwargs) as fp:
        result = soupparser.parse(fp, makeelement=HTMLParser().makeelement).getroot()
        result.make_links_absolute(fp.geturl(), handle_failures='ignore')
        return result
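# For context, a small sketch (not part of the snippet above; the `broken`
# string is invented for illustration) of what the soup parser tolerates
# compared to the strict parser: soupparser.fromstring recovers from badly
# nested tag soup instead of raising.
from lxml.html import soupparser

broken = "<p>one<p>two</b></p>"
root = soupparser.fromstring(broken)   # BeautifulSoup fixes the nesting
print([p.text for p in root.xpath('//p')])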
def munin_speed():
    root = parse(fetch("%s/signal.asp" % host))
    modmap = {'BPSK': 1,
              'QPSK': 2,
              '8PSK': 3,
              '16QAM': 4,
              '32QAM': 5,
              '64QAM': 6,
              '256QAM': 8,
              }
    c = 0
    for x in root.xpath(".//tr"):
        fields = [totext(t) for t in x.xpath('./td')]
        if fields[0] == "Modulation":
            c = modmap[fields[1]]
            continue
        if fields[0] == "Bit Rate":
            print "downstream_bitrate.value %.3f" % (int(split(fields[1])) / 8000000.0)
            continue
        if fields[0] == "Symbol Rate":
            print "upstream_bitrate.value %.3f" % (int(split(fields[1])) * c / 8000.0)
            continue
def fetch(url):
    # url to etree
    print >> sys.stderr, url
    try:
        f = urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError):
        try:
            f = urllib2.urlopen(url)
        except (urllib2.HTTPError, urllib2.URLError):
            try:
                f = urllib2.urlopen(url)
            except (urllib2.HTTPError, urllib2.URLError):
                return ''
    raw = parse(f)
    f.close()
    return raw
def parsetoXtree(self, xhtml_s, clean_xhtml=False):
    if clean_xhtml:
        cleaner = Cleaner(
            scripts=True, javascript=True, comments=True, style=True,
            links=True, meta=True, page_structure=False, processing_instructions=True,
            embedded=True, annoying_tags=True, remove_unknown_tags=True
        )  # meta=False because we need MetaInfo
        try:
            xhtml_s = cleaner.clean_html(xhtml_s)
        except:
            pass
    # The HTML parsers with and without recover mode, but with the capability to download the proper DTD always ON.
    # In case lxml.html.parse will be dispatched to sub-processes or threads then
    # the HTMLParser(s) should be defined within these sub-processes or threads.
    htmlparser = lxml.html.HTMLParser(recover=False, no_network=False)
    htmlparser_rcv = lxml.html.HTMLParser(recover=True, no_network=False)
    # Parse the XHTML source
    parsing_errors = list()
    try:
        xhtml_t = lxml.html.parse(StringIO(xhtml_s), parser=htmlparser,
                                  base_url=self.due.base_url['url'])
    except Exception as error:
        # except ValueError, error:
        # except lxml.etree.XMLSyntaxError, error:
        # print(xhtml_s)
        # print("PARSE ERROR (no recovery mode): %s" % error)
        # parsing_errors.append(error)
        try:
            xhtml_t = lxml.html.parse(StringIO(xhtml_s), parser=htmlparser_rcv,
                                      base_url=self.due.base_url['url'])
        except Exception as error:
            print("PARSE ERROR (recovery mode): %s" % error)
            parsing_errors.append(error)
            try:
                print('DA ZOUP')
                xhtml_t = soup.parse(xhtml_s)  # StringIO(xhtml_s)
            except:
                print("F****D-UP PAGE")
                parsing_errors.append("BeautifulSoup failed")
                return {'xtree': None, 'parsing_errors': parsing_errors}
    # Get the root element and make the links absolute
    xhtml_troot = xhtml_t.getroot()
    try:
        xhtml_troot.make_links_absolute(self.due.base_url['url'], resolve_base_href=True)
    except:
        return {'xtree': None, 'parsing_errors': parsing_errors}
    for i in xhtml_t.iterlinks():
        pass
    return {'xtree': xhtml_t, 'parsing_errors': parsing_errors}
def parse_html(file):
    html = parser.parse(file)
    # extract the thread ID
    thread_id = html.xpath('//form/a[@name][1]/@name')
    thread_posts = html.xpath('count(//form/a[@name])')
    posts = []
    post_ids = html.xpath('//td[@class="reply"]/@id')
    # first post is special, unfortunately.
    post_id = post_ids.pop(0)
    author = html.xpath('//form/span[@class="postername"][1]/text()')[0]
    content = ElementTree.tostring(
        html.xpath('//form/blockquote')[0]).decode('UTF-8')
    date = html.xpath('//form/span[@class="posttime"][1]/text()')[0]
    attach = html.xpath('//form/span[@class="filesize"]/a[1]/@href')
    attach = attach[0] if len(attach) > 0 else None
    posts.append(Post(post_id, author, content, date, attach))
    # <a class="quotelink unkfunc" href="http://suptg.thisisnotatrueending.com/archive/17738107/#17745349" onclick="replyhl('17745349');">>>17745349</a>
    magic = re.compile(r'<a class=".*?" href=".*?" onclick=".*?">(.*?)</a>')
    # extract other posts
    for post in post_ids:
        author = html.xpath(
            '//td[@id={}]/span[@class="commentpostername"]/text()'.format(
                post))[0]
        content = ElementTree.tostring(
            html.xpath(
                '//td[@id={}]/blockquote'.format(post))[0]).decode('UTF-8')
        date = html.xpath(
            '//td[@id={}][span[@class="commentpostername"]]/text()[string-length()>1][1]'
            .format(post))[0]
        attach = html.xpath(
            '//td[@id={}][span[@class="filesize"]]/a/@href'.format(post))
        attach = attach[0] if len(attach) > 0 else None
        content = magic.sub(r'\\1', content)
        posts.append(Post(post, author, content, date, attach))
    return posts
def parse_product(id, cid, name):
    root = parse(os.path.join(base_dir, 'htmls', '%s.html' % id))
    xml_product_content = root.xpath("//div[@class='productContent']")[0]
    product_img = norm_image_url(
        xml_product_content.xpath("//img[@class='productPic']")[0].attrib['src'])
    product_name = xml_product_content.xpath("//p[@class='d_title']/span")[0].text
    product_for = xml_product_content.xpath("//p[@class='d_title']/label")[0].text.rstrip(u'。').strip()
    product_spec_texts = re.compile(u'/|/').split(
        xml_product_content.xpath("//p[@class='d_title']/text()")[1].strip()[3:])
    product_specs = [norm_spec(i) for i in product_spec_texts]
    xml_product_detail = xml_product_content.xpath("//div[@class='pl_detail']")[0]
    product_detail = parse_detail(id, cid, name, xml_product_content)
    product = {}
    product['id'] = id
    product['cid'] = cid
    product['name'] = product_name
    product['image'] = product_img
    product['for'] = product_for
    product['specs'] = product_specs
    product['detail'] = product_detail
    return product
def _initmap(self):
    pos = 0
    i = 0
    offset = 0
    paths = {}
    tree = parse(StringIO(self.doc.body.encode('utf8')))
    textnodes = tree.xpath('//div[@id="TexteOnly"]//text()')
    cut = 5
    if not textnodes:
        textnodes = tree.xpath('//text()')
        cut = 10
    texts = [unescape(x) for x in textnodes]
    #print texts
    #print self.doc.tokens
    while i < len(texts) and pos < len(self.doc.tokens):
        #print i,len(texts),len(self.doc.tokens),pos, self.doc.tokens[pos].encode('utf8')
        offset = texts[i].find(self.doc.tokens[pos], offset)
        if offset == -1:
            i += 1
            offset = 0
            continue
        if textnodes[i].is_tail:
            path = tree.getpath(textnodes[i].getparent().getparent())[cut:]
            siblings = textnodes[i].getparent().getparent().xpath('.//text()')
            adjust = len(''.join(siblings[:siblings.index(textnodes[i])]))
            paths[pos] = (path, adjust + offset)
            #print 'asdf', self.doc.tokens[pos], ''.join(siblings)[adjust+offset:adjust+offset+len(self.doc.tokens[pos])], adjust+offset, offset
        else:
            path = tree.getpath(textnodes[i].getparent())[cut:]
            paths[pos] = (path, offset)
            #print 'qwer', self.doc.tokens[pos], texts[i][offset:offset+len(self.doc.tokens[pos])], paths[pos], path, offset
        #print paths[pos]
        offset += len(self.doc.tokens[pos])
        if offset >= len(texts[i]):
            i += 1
            offset = 0
        pos += 1
    #for pos, (path, offset) in sorted(paths.items()):
    #    print self.doc.tokens[pos], pos, path, offset
    #print len(paths), len(self.doc.tokens)
    #print
    return paths
def readSemcor3File(filename):
    '''
    Reads an XML semcore3.0 file and returns a corresponding MLN database.
    '''
    if not java.isJvmRunning():
        java.startJvm()
    tree = p.parse(filename)
    parser = StanfordParser(grammarPath)
    for e in tree.iter():
        if e.tag == 's':
            s, atoms = reconstruct(e)
            print('//', s)
            for a in atoms:
                print(a)
            deps = parser.get_dependencies(s)
            depstr = list(map(str, deps))
            # do some sanity check
            for d in depstr:
                print(d)
            print('---')
def fetchVotes(d):
    url = "%s%s%s" % (
        "http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+PV+",
        d,
        "+RES-RCV+DOC+WORD+V0//EN&language=EN")
    print >> sys.stderr, url
    try:
        f = urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError):
        try:
            f = urllib2.urlopen(url)
        except (urllib2.HTTPError, urllib2.URLError):
            try:
                f = urllib2.urlopen(url)
            except (urllib2.HTTPError, urllib2.URLError):
                return ''
    tmp = mkstemp()
    fd = os.fdopen(tmp[0], 'w')
    fd.write(f.read())
    fd.close()
    f.close()
    res = subprocess.Popen(['/usr/bin/wvHtml', tmp[1], '-'],
                           stdout=subprocess.PIPE).communicate()[0]
    os.unlink(tmp[1])
    return parse(StringIO(res))
def from_str(self, html_str):
    return HtmlNode(soupparser.parse(StringIO(html_str)))
    v = etree.SubElement(pm, wp + 'meta_value')
    v.text = etree.CDATA(val)
    return pm


# get wp ids from file
wpidsfl = open('wpids.mrs')
wpids = marshal.load(wpidsfl)
wpidsfl.close()

# base is a boilerplate from wordpress export
doc = etree.parse(bdir + '/boilerplate_import.xml')
root = doc.getroot()
channel = root.find('channel')

# todo: parsing in commandline values
bld = soupparser.parse("/home/rik/Dropbox/jos compendium/jos/out/biografieen2.html")
broot = bld.getroot()

fl = open("/home/rik/Dropbox/jos compendium/jos/nin/2beeldmateriaal.html")
txt = fl.read()
ch = copyhelper.choptext(txt, ['Algemeen', '2a', '2b', '2c', '2d'])


def convert_imgs(broot=broot, hfdst=[], categories=[], rectype='biografie', startnr=10, log=''):
    s = CSSSelector('img')
    imgs = s(broot)
    # imgs.reverse()
import lxml.html.soupparser as soupparser
from lxml.etree import tostring
import lxml.html
import io
import sys
import re
import nltk
from django.utils.encoding import smart_str

# file = open('./webpages/romeo_juliet.html')
file = open(sys.argv[1])
html = file.read()
file.close()

tree = soupparser.parse(io.BytesIO(html))

fulltext = ""
for a in tree.xpath('//*[name()="a"]'):
    if a.text is not None:
        if 'name' in a.attrib:
            fulltext += " " + a.text
            # print a.attrib['name'] + a.text

oSentences = nltk.sent_tokenize(fulltext)
for s in oSentences:
    s = smart_str(re.sub(r'\s+', ' ', s))
    s = re.sub(r'^\s', '', s)
    print s
def getIpexData():
    page = parse(fetch('http://www.ipex.eu/IPEXL-WEB/epdoc.do'))
    title = None
    for url in page.xpath('//div[@id="widgetContent_LU_WID"]//a'):
        title = u''.join(url.xpath('text()'))
        if title == u'a. Legislative procedures (currently ongoing or ended during the 7th Parliamentary term)':
            a = url
            break
    assert title == u'a. Legislative procedures (currently ongoing or ended during the 7th Parliamentary term)', \
        "title changed on ipex: %s" % title
    url = "http://www.ipex.eu%s" % a.get('href')
    items = list(csv.DictReader(fetch(url), dialect="hash"))
    ipexmap = {}
    for item in items:
        date = None
        for k in cdates[::-1]:
            if item[k]:
                date = item[k]
                break
        item['Rapporteur'] = [[x['_id'], getMEPGroup(x, date), x['Name']['full']]
                              for x in filter(None, [getMEPRef(mep)
                                                     for mep in item['Rapporteur'].decode('raw_unicode_escape').split(', ')])]
        item['Shadows'] = [[x['_id'], getMEPGroup(x, date), x['Name']['full']]
                           for x in filter(None, [getMEPRef(mep)
                                                  for mep in item['Shadows'].decode('raw_unicode_escape').split(', ')])]
        item['Dates'] = []
        for k in dates.keys():
            tmp = item[k].split(' ')
            body = dates[k]['body']
            if len(tmp) == 1:
                try:
                    tmp1 = toDate(tmp[0])
                    if tmp1:
                        item['Dates'].append({'type': 'Event',
                                              'body': body,
                                              'date': tmp1,
                                              'type': k})
                except:
                    print k, tmp[0]
                    raise
            elif len(tmp) > 1:
                tmp1 = toDate(tmp[-1])
                if tmp1:
                    item['Dates'].append({'type': 'Event',
                                          'body': body,
                                          'date': tmp1,
                                          'type': k})
                else:
                    print >> sys.stderr, "[!]", k, item[k]
            del item[k]
        item['Dates'] = sorted(item['Dates'])
        tmp = basre.match(item['Bas Doc'])
        if tmp:
            item['Base Doc'] = u"%s/%s/%s" % tmp.groups()
        del item['Bas Doc']
        item['Com Opinion'] = filter(None, item['Com Avis'].split(';'))
        item['title'] = item['Titre EN'].decode('raw_unicode_escape')
        item['subject'] = item['Theme'].decode('raw_unicode_escape')
        item['Com Responible'] = item['ComFond'].decode('raw_unicode_escape')
        for k in ['ComFond', 'Theme', ' ', 'Titre EN', 'Com Avis']:
            del item[k]
        for k in item.keys():
            if not item[k]:
                del item[k]
        ipexmap[item['ProcRef']] = item
    # other fields
    #   'ComFond': 'BUDG',
    #   'Phase': '8.10 Ended',
    #   'Pol Group': 'PPE',
    #   'Type': 'DBA',
    #   'url OEIL': 'http://www.europarl.europa.eu/oeil/FindByProcnum.do?lang=en&procnum=BUD/2009/2048'
    #   'Scrutiny': 'http://www.ipex.eu/ipex/cms/home/Documents/dossier_CNS20110817'
    return ipexmap
opener.addheaders = [('User-agent', 'weurstchen/0.5')]


def fetch(url, retries=5):
    # url to etree
    try:
        f = urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError), e:
        if hasattr(e, 'code') and e.code >= 400 and e.code not in [504]:
            print >> sys.stderr, "[!] %d %s" % (e.code, url)
            raise
        if retries > 0:
            f = fetch(url, retries - 1)
        else:
            raise
    return parse(f)


def getNewItems(root):
    for d in root.xpath('//td[@class="listlevelthree"]/../td/a'):
        dossier = fetch((URL + d.attrib['href']).encode('utf8'))
        for e in dossier.xpath('//a[@class="com_acronym"]'):
            d_url = e.attrib['href']
            if not db.dossiers.find_one({'meta.source': URL + d_url}):
                oeil_scrape(URL + d_url)
                # print '[!] NEW ITEM: %s%s scraped!!' % (URL, d_url)


def scrape(url):
    root = fetch(url)
    # TODO optimize this!! (reduce steps)
# is actually the html of a google maps page. We use the lxml library to load it as a python object,
# and the html soup parser to fix any issues understanding the html and still retrieve acceptable,
# well structured xml
def urlToET(url, tryAgain=True):
    # This just attempts to load the URL twice to account for the possibility of network trouble.
    # This is not a good example of error-proof code, but it still greatly reduces the probability
    # of errors in retrieval affecting your application.
    try:
        fURL = urllib2.urlopen(url)
    except Exception, e:
        if tryAgain:
            return urlToET(url, tryAgain=False)
        else:
            return None
    return soup.parse(fURL)


def parseMap(tree, multiple=False):
    # This function parses the inputted xml data for either one or more routes, finding the relevant
    # information about each. It uses xpath, a way of searching for a particular node within a
    # tree of xml data.
    routes = []
    # These next two lines find all nodes that look like <div class='dir-altroute-inner'>....</div> (inside
    # certain parent nodes), which correspond to the route information we're looking for. You can look at
    # ./map.html if you want to see the raw info, although it's pretty ugly.
    route_div = '//ol[@id="dir_altroutes_body"]/li[@class="dir-altroute"]/div[@class="dir-altroute-inner"]'
    div_list = tree.xpath(route_div)
    x = 0
    for div in div_list:
        # This bit of arguably ugly xml parsing is just pulling names and projected times from the route
        # info, and remembering if they include projected traffic or not.
def _initmap(self):
    pos = 0
    i = 0
    offset = 0
    paths = {}
    tree = parse(StringIO(self.doc.body.encode('utf8')))
    textnodes = tree.xpath('//div[@id="TexteOnly"]//text()')
    cut = 5
    if not textnodes:
        textnodes = tree.xpath('//text()')
        cut = 10
    texts = [unescape(x) for x in textnodes]
    #tmp = [token for frag in texts if frag for token in nltk.tokenize.wordpunct_tokenize(frag)]
    #for line in difflib.context_diff(tmp, self.doc.tokens):
    #    print repr(line)
    #print texts
    #print self.doc.tokens
    lastgood = (i, offset)
    while pos < len(self.doc.tokens):
        if i >= len(texts):
            print "guessing frag: %s, reset to %s, %s" % (
                self.doc.tokens[pos].encode('utf8'), lastgood[0], lastgood[1])
            (i, offset) = lastgood
            path = tree.getpath(textnodes[i].getparent())[cut:]
            paths[pos] = (path, offset)
            offset += len(self.doc.tokens[pos])
            if offset >= len(texts[i]):
                i += 1
                offset = 0
            pos += 1
            continue
        offset = texts[i].find(self.doc.tokens[pos], offset)
        if offset == -1:
            i += 1
            offset = 0
            continue
        if textnodes[i].is_tail:
            path = tree.getpath(textnodes[i].getparent().getparent())[cut:]
            siblings = textnodes[i].getparent().getparent().xpath('.//text()')
            adjust = len(''.join(siblings[:siblings.index(textnodes[i])]))
            paths[pos] = (path, adjust + offset)
            #print 'asdf', self.doc.tokens[pos:pos+l], ''.join(siblings)[adjust+offset:adjust+offset+len(self.doc.tokens[pos])], adjust+offset, offset
        else:
            path = tree.getpath(textnodes[i].getparent())[cut:]
            paths[pos] = (path, offset)
            #print 'qwer', self.doc.tokens[pos], texts[i][offset:offset+len(self.doc.tokens[pos])], paths[pos], path, offset
        #print "frag: %s(%s) @%s" % (i,len(texts), paths[pos][1]),"token: %s(%s)" % (pos, len(self.doc.tokens)), self.doc.tokens[pos].encode('utf8')
        #print paths[pos]
        offset += len(self.doc.tokens[pos])
        if offset >= len(texts[i]):
            i += 1
            offset = 0
        lastgood = (i, offset)
        pos += 1
    #for pos, (path, offset) in sorted(paths.items()):
    #    print self.doc.tokens[pos], pos, path, offset
    #print len(paths), len(self.doc.tokens)
    #print
    return paths
def listings():
    for page in rawpages():
        tree = soupparser.parse(page)
        _listings = tree.xpath('//*[@id="search-results"]/li[*]/div')
        for listing in _listings:
            yield listing
    #return False
    #if a.tail != b.tail:
    #    return False
    #if a.values() != b.values():  # redundant to the attrib matching
    #    return False
    # if sorted(a.keys()) != sorted(b.keys()):  # may also be redundant to the attrib matching,
    #     # See if any attributes were added/removed
    #     str1 = ''.join(sorted(a.keys()))
    #     str2 = ''.join(sorted(b.keys()))
    #     reportStringChange(str1, str2, a.tag, "ATTRIBUTE CHANGE")
    return True


path1 = sys.argv[1]
path2 = sys.argv[2]
tree1 = parse(path1).getroot()
tree2 = parse(path2).getroot()
elementsA_hash = {}
elementsB_hash = {}
isLeafNodeA = {}
isLeafNodeB = {}
hashNodes(tree1, elementsA_hash, isLeafNodeA)
hashNodes(tree2, elementsB_hash, isLeafNodeB)
noofchanges = 0
for key, value in elementsA_hash.iteritems():
    try:
        isSameNode(elementsA_hash[key], elementsB_hash[key], isLeafNodeA[key])
def get_revisions(self):
    root = parse(self.page).getroot()
    list_rev = root.xpath("//ul[@id='pagehistory']/li")
    for rev in list_rev:
        yield Revision(rev)
def transformHTML(i, o, root_dir='.', prefix=None, exclude=None):
    """
    @param root_dir: Path to look for resources from.
    @param prefix: If provided, don't inline stuff.  Instead, prepend the
        prefix to relative paths.
    """
    exclude = exclude or []
    root = soupparser.parse(i)
    html = root.getroot()

    # links (css)
    if 'link' not in exclude:
        for link in html.xpath('//link'):
            href = link.attrib.get('href', '')
            if prefix:
                # prefix
                link.attrib['href'] = prefix + href
            else:
                # inline
                loaded = loadThing(href, root_dir)
                style_tag = etree.Element('style')
                style_tag.text = loaded['content']
                link.getparent().replace(link, style_tag)

    # css
    if 'css' not in exclude:
        r_import = re.compile(r'(@import\s+url\((.*?)\)\s*;)')
        r_url = re.compile(r'(url\((.*?)\))', re.S | re.M)
        for style in html.xpath('//style'):
            # imports
            while True:
                imports = r_import.findall(style.text)
                if not imports:
                    break
                for rule, url in imports:
                    # inline
                    loaded = loadThing(url, root_dir)
                    style.text = style.text.replace(rule, loaded['content'])
            # other urls
            urls = r_url.findall(style.text)
            for match, url in urls:
                if prefix:
                    # prefix
                    pass
                else:
                    # inline
                    loaded = loadThing(url, root_dir)
                    style.text = style.text.replace(
                        match, 'url(' + toDataURL(**loaded) + ')')

    # images
    if 'img' not in exclude:
        for image in html.xpath('//img'):
            src = image.attrib.get('src', '')
            if src.startswith('data:'):
                # already a data url
                continue
            if prefix:
                # prefix
                if src.startswith('//') or src.startswith('http:') or src.startswith('https:'):
                    pass
                else:
                    image.attrib['src'] = prefix + src
            else:
                # inline
                loaded = loadThing(src, root_dir)
                image.attrib['src'] = toDataURL(**loaded)

    o.write(etree.tostring(html, method='html'))
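# toDataURL and loadThing are not shown in the snippet above. As a hedged,
# self-contained sketch of the inlining idea (an assumption, not the author's
# code), a data: URL can be built by base64-encoding the resource bytes and
# prepending the MIME type, which is all the inline branches need:
import base64

def to_data_url_sketch(content, content_type='image/png'):
    # content: raw bytes of the resource; returns a data: URI usable in src/href
    return 'data:%s;base64,%s' % (content_type, base64.b64encode(content).decode('ascii'))

print(to_data_url_sketch(b'\x89PNG...')[:40])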
import lxml.html.soupparser as soupparser
import lxml.html
import io
import sys
import re
import nltk
from django.utils.encoding import smart_str

file = open(sys.argv[1])
html = file.read()
file.close()

tree = soupparser.parse(io.BytesIO(html))

original = ""
modern = ""
for t in tree.xpath('//*[name()="div"]'):
    if t.text is not None:
        if 'class' in t.attrib:
            if t.attrib['class'] == 'original-line':
                oline = t.text.replace('\n', ' ')
                oline = smart_str(re.sub(r'\s+', ' ', oline))
                original += " " + oline
            elif t.attrib['class'] == 'modern-line':
                mline = t.text.replace('\n', ' ')
                mline = smart_str(re.sub(r'\s+', ' ', mline))
                modern += " " + mline
def munin_freq():
    root = parse(fetch("%s/signal.asp" % host))
    for x in root.xpath(".//tr"):
        fields = [totext(t) for t in x.xpath('./td')]
        if fields[0].endswith("stream Frequency"):
            print "%s.value %s" % (fields[0].lower().replace(' ', '_'), split(fields[1]))
# urllib2.ProxyHandler({'http': 'http://*****:*****'})


def getNewItems(root):
    for d in root.xpath('//td[@class="listlevelthree"]/../td/a'):
        dossier = fetch((URL+d.attrib['href']).encode('utf8'))
        for e in dossier.xpath('//a[@class="com_acronym"]'):
            d_url = e.attrib['href']
            if not db.dossiers.find_one({'meta.source': URL+d_url}):
                oeil_scrape(URL+d_url)
                # print '[!] NEW ITEM: %s%s scraped!!' % (URL, d_url)


def scrape(url):
    root = fetch(url)
    # TODO optimize this!! (reduce steps)
    if not exists(LAST_UPDATED_CACHE) or open(LAST_UPDATED_CACHE).read() != strip(root.xpath('//div[text()="Data updated on :"]/span/text()')[0]):
        print >>sys.stderr, '[!] Site modification found, scraping unfinished dossiers....'