def GetBillLinks():
    committees = []
    billyears = []
    uin = urllib.urlopen(url_pbc_previous)
    s = uin.read()
    uin.close()
    billyears = re.findall('<a href="([^"]*)"[^>]*>(Session .*?)</a>(?is)', s)
    for billyear in billyears:
        match = re.match("Session (\d\d\d\d)-\d\d(?:\d\d)?", billyear[1])
        if not match:
            raise Exception, "Did not find session dates in %s" % billyear[1]
        year = match.group(1)
        if miscfuncs.IsNotQuiet():
            print "year=", year
        for link, text in get_oldstyle_bill_links(billyear[0]):
            committees.append((year, link, text))
    committees += (
        ('2012', 'http://services.parliament.uk/bills/2010-12/financeno4/committees/houseofcommonspublicbillcommitteeonthefinancebill201213.html', 'Finance Bill'),
        ('2014', 'http://services.parliament.uk/bills/2013-14/finance/committees/houseofcommonspublicbillcommitteeonthefinancebill201415.html', 'Finance Bill'),
    )
    return get_committee_attributes(committees)
def CmIndexFromNewPage(date, type='commons'):
    if type == 'lords':
        if date >= mx.DateTime.Date(2012,7,31) and date <= mx.DateTime.Date(2012,9,17):
            return []
        if date == mx.DateTime.Date(2012,10,1):
            return []
        urllinkpage = '%s?d=%s&m=%d&y=%d' % (url_bydate_index_lords, date.day, date.month, date.year)
    else:
        urllinkpage = '%s?d=%s&m=%d&y=%d' % (url_bydate_index, date.day, date.month, date.year)
    urlinkpage = urllib.urlopen(urllinkpage)
    srlinkpage = urlinkpage.read()
    urlinkpage.close()

    entries = []
    for link1 in re.findall('<a[^>]*?href="(http://www\.publications\.[^"#]+)(?:#[^"]*)?">([^<]*)</a>(?i)', srlinkpage):
        linkhref = link1[0]
        linktext = link1[1]
        if not re.search('debate|westminster|written(?i)', linktext):
            continue
        if linkhref.endswith('pdf'):
            if miscfuncs.IsNotQuiet():
                print "Skipping PDF: %s" % linktext
            continue
        uind = re.sub('(?:\s|%20)', '', linkhref)
        typ = re.sub('\s+', ' ', linktext).strip()
        if entries and entries[-1][1] == uind:
            continue
        entries.append((typ, uind))
    return entries
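# Illustrative usage sketch for CmIndexFromNewPage (hypothetical output; the real
# values depend on the live index pages behind url_bydate_index / url_bydate_index_lords):
#
#   >>> CmIndexFromNewPage(mx.DateTime.Date(2011, 3, 1))
#   [('Debates and Oral Answers', 'http://www.publications.parliament.uk/...'),
#    ('Written Ministerial Statements', 'http://www.publications.parliament.uk/...')]
#
# Each entry is a (recordType, link) pair; PullGluePages() below unpacks them that way.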
def PullGlueToday(forcescrape):
    # Fetch 'Today in the Commons' index page
    frontpagedata = fetchTextFromUrl(TodayInTheCommonsIndexPageUrl)
    link01url = re.search("<a href=\"(01\.htm)\">Go to Full Report</a>", frontpagedata).group(1)
    pageurl = urlparse.urljoin(TodayInTheCommonsIndexPageUrl, link01url)

    preparedDateMatch = re.search("<p class=\"prepared\">Prepared: <strong>(\d+:\d+) on (\d+ [a-zA-Z]+ \d+)</strong></p>", frontpagedata)
    preparedDateTime = mx.DateTime.DateTimeFrom(preparedDateMatch.group(1) + " " + preparedDateMatch.group(2))
    spreparedDateTime = "%s" % preparedDateTime  # convert to string (can't find the real way to do it)

    # extract the date from the browse links lower down
    headingDateMatch = re.search('''(?x)<h2>Browse\sReport\sBy\sSection</h2>\s*
        <ul>\s*
        <p\sclass="indextext"\salign=left><a\shref="01.htm\#hddr_1"><b>House\sof\sCommons</b></a></p>\s*
        <p\sclass="indextext"\salign=left><a\shref="01.htm\#hddr_2"><i>([^<]*)</i></a></p>''', frontpagedata)
    headingDateTime = mx.DateTime.DateTimeFrom(headingDateMatch.group(1))
    sdate = headingDateTime.date
    assert sdate <= preparedDateTime.date  # prepared date must come after date from heading

    # make files which we will copy into
    lddaymap, pwcmfolder = MakeDayMap("debates", "debates")
    dgflatest, dgflatestdayalpha, dgfnext, dgfnextdayalpha = GetFileDayVersions(sdate, lddaymap, pwcmfolder, "debates")

    # See if we actually want to proceed with scraping, or if there already exists a 'printed' version,
    # in which case we avoid replacing it with the 'today' version
    latestScrapedFileMetaData = readPageX(dgflatest)
    if latestScrapedFileMetaData.get('type') == 'printed':
        print "'Printed' version of hansard for today has already been scraped. Skipping scrape of 'Today' version"
        return None
    if not forcescrape and latestScrapedFileMetaData.get('prepareddatetime') == spreparedDateTime:
        if miscfuncs.IsNotQuiet():
            print "Prepared datetime", spreparedDateTime, "already done"
        return None

    tempFileHandle = open(tempfilename, "w")
    tempFileHandle.write('<pagex url="%s" scrapedate="%s" scrapetime="%s" prepareddatetime="%s" type="today" />\n' % (TodayInTheCommonsIndexPageUrl, time.strftime('%Y-%m-%d', time.gmtime()), time.strftime('%X', time.gmtime()), spreparedDateTime))
    GlueByToday(tempFileHandle, pageurl)
    tempFileHandle.close()

    comp = CompareScrapedFiles(dgflatest, tempfilename)
    # now commit the file
    if comp == 'DIFFERENT':
        print "writing: ", dgfnext
        os.rename(tempfilename, dgfnext)
        return sdate
    elif comp == 'EXTENSION':
        print "OVER-writing: ", dgflatest
        shutil.copyfile(tempfilename, dgflatest)
        os.remove(tempfilename)
        return sdate
    else:
        assert comp == 'SAME'
        print "download exactly the same: ", dgflatest
        return None
def GlueByToday(outputFileHandle, pageurl):
    pagenumber = 1
    while pageurl:
        assert pagenumber == int(re.search('(\d+)\.htm$', pageurl).group(1))
        preparedDateTime, nextLink, body = ScrapeTodayPage(pageurl)
        if miscfuncs.IsNotQuiet():
            print "Processed [%s] which was prepared [%s]" % (pageurl, preparedDateTime)
        now = time.gmtime()
        outputFileHandle.write('<page url="%s" prepareddatetime="%s" />\n' % (pageurl, preparedDateTime))
        outputFileHandle.write(body)
        outputFileHandle.write('\n')
        if nextLink:
            pageurl = urlparse.urljoin(pageurl, nextLink)
        else:
            pageurl = None
        pagenumber += 1
def PullGluePages(options, folder, typ):
    daymap, scrapedDataOutputPath = MakeDayMap(folder, typ)

    scrape = []
    # Post 2010 election scraping done directly, not via index
    if options.dateto >= '2010-05-18':
        if options.datefrom > '2010-05-18':
            date = mx.DateTime.DateTimeFrom(options.datefrom)
        else:
            date = mx.DateTime.DateTimeFrom('2010-05-18')
        while date.date <= options.dateto and date < mx.DateTime.today():
            for recordType, link in CmIndexFromNewPage(date):
                if recordType == 'Written Statements':
                    recordType = 'Written Ministerial Statements'
                if recordType == 'Debates and Oral Answers':
                    recordType = 'Debates'
                if re.search(typ, recordType, re.I):
                    scrape.append(CommonsIndexElement(date.date, recordType, link))
            date += mx.DateTime.DateTimeDelta(1)

    # loop through the index file previously made by createhansardindex
    for commonsIndexRecord in CommonsIndex().res:
        # implement date range
        if not re.search(typ, commonsIndexRecord.recordType, re.I):
            continue
        if commonsIndexRecord.date < options.datefrom or commonsIndexRecord.date > options.dateto:
            continue
        scrape.append(commonsIndexRecord)

    for commonsIndexRecord in scrape:
        latestFilePath, latestFileStem, nextFilePath, nextFileStem = \
            GetFileDayVersions(commonsIndexRecord.date, daymap, scrapedDataOutputPath, typ)
        try:
            # hansard index page
            urlx = commonsIndexRecord.url
            if commonsIndexRecord.recordType == 'Votes and Proceedings' or commonsIndexRecord.recordType == 'questionbook':
                urla = [urlx]  # FIXME - should we be detecting somehow? I don't think this bit is currently used.
                glue_function = GlueByNext
            else:
                urla, new_type_index = ProcessIndexUrl(urlx, latestFilePath, options.forcescrape)  # this checks the url at start of file
                glue_function = GlueByNextNew if new_type_index else GlueByNext
            if not urla:
                continue
            if miscfuncs.IsNotQuiet():
                print commonsIndexRecord.date, (latestFilePath and 'RE-scraping' or 'scraping'), re.sub(".*?cmhansrd/", "", urlx)

            # now we take out the local pointer and start the gluing
            glue_function(tempfilename, urla, urlx, commonsIndexRecord.date)
        except Exception, e:
            options.anyerrors = True
            if options.quietc:
                print e
                print "\tERROR! %s failed to scrape on %s, quietly moving to next day" % (typ, commonsIndexRecord.date)
                continue
            else:
                raise

        if CompareScrapedFiles(latestFilePath, tempfilename) == "SAME":
            if miscfuncs.IsNotQuiet():
                print " matched with:", latestFilePath
            continue

        # before we copy over the file from tempfilename to nextFilePath, copy over the patch if there is one.
        ReplicatePatchToNewScrapedVersion(folder, latestFileStem, latestFilePath, nextFilePath, nextFileStem)

        # now commit the file
        os.rename(tempfilename, nextFilePath)

        # make the message
        print commonsIndexRecord.date, (latestFilePath and 'RE-scraped' or 'scraped'), re.sub(".*?cmpages/", "", nextFilePath)
def GlueByNext(outputFileName, urla, urlx, sdate):
    fout = open(outputFileName, "w")
    # put out the indexlink for comparison with the hansardindex file
    lt = time.gmtime()
    fout.write('<pagex url="%s" scrapedate="%s" scrapetime="%s" type="printed" />\n' % \
        (urlx, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

    # Patches
    if sdate=='2006-05-09' and urla[0]=='http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060508/text/60508w0308.htm':
        urla = urla[1:]
    if sdate=='2006-05-10' and urla[0]=='http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060509/text/60510w0332.htm':
        urla = urla[1:]
    if urla[0]=='http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo060524/debtext/60524-0001.htm':
        urla = [urla[0]]
    if sdate=='2006-06-05' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060605/text/60605w0640.htm':
        urla = ['http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060602/text/60602w0601.htm',
                'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060605/text/60605w0602.htm'] + urla
    if sdate=='2006-06-07' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060607/text/60607w0001.htm':
        urla = urla[0:2] + ['http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060607/text/60607w0003.htm'] + urla[2:]
    if sdate=='2006-06-14' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo060614/halltext/60614h0001.htm':
        urla = [urla[0]]
    if sdate=='2006-06-13' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo060613/halltext/60613h0001.htm':
        urla = [urla[0]]
    if sdate=='2006-07-17' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo060717/text/60717w0001.htm':
        urla = [urla[0]]
    if sdate=='2006-10-30' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm061030/text/61030w0001.htm':
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/vo0(\d{5})/debtext/\1-0001.htm', urla[0]) and \
            (sdate=='2006-10-17' or sdate=='2006-10-26' or sdate=='2006-10-11' or sdate=='2006-07-12'):
        urla = [urla[0]]
    if sdate=='2006-11-21' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm061121/debtext/61121-0001.htm':
        urla = urla[0:11] + urla[13:]  # Incorrect link in middle of index
    if sdate=='2007-03-28' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070328/halltext/70328h0001.htm':
        urla = [urla[0]]
    if sdate=='2007-04-24' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070424/debtext/70424-0001.htm':
        urla = urla[0:14] + urla[16:]
    if sdate=='2007-05-15' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070515/halltext/70515h0001.htm':
        urla = urla[0:4] + urla[6:]
    if urla[0] == 'http://www.publications.parliament.uk/pa/cm200506/cmhansrd/cm060614/halltext/60614h0178.htm':
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm0(\d{5})/debtext/\1-0001.htm', urla[0]) and \
            (sdate=='2007-10-15' or sdate=='2007-10-23' or sdate=='2007-10-09' or sdate=='2007-02-05' or sdate=='2007-03-26' or \
             sdate=='2007-01-15' or sdate=='2006-11-29' or sdate=='2006-11-22' or sdate=='2007-07-11' or sdate=='2007-07-05'):
        urla = [urla[0]]
    if sdate=='2007-10-01' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm071001/text/71001w0001.htm':
        urla = [urla[0]]
    if sdate=='2007-07-19' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200607/cmhansrd/cm070719/wmstext/70719m0001.htm':
        urla = [urla[0]]
    if sdate=='2008-01-24' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm080124/halltext/80124h0001.htm':
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm200809/cmhansrd/cm0(\d{5})/halltext/\1h0001.htm', urla[0]) and \
            (sdate=='2009-02-12' or sdate=='2009-02-24' or sdate=='2009-06-10'):
        urla = [urla[0]]
    if sdate=='2009-02-12' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200809/cmhansrd/cm090212/wmstext/90212m0001.htm':
        urla = [urla[0]]
    if sdate=='2010-02-23' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200910/cmhansrd/cm100223/wmstext/100223m0001.htm':
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm200809/cmhansrd/cm0(\d{5})/text/\1w0001.htm', urla[0]) and \
            (sdate=='2009-02-09' or sdate=='2009-02-25' or sdate=='2009-02-26' or sdate=='2009-02-27' or sdate=='2009-09-01' or \
             sdate=='2009-10-19' or sdate=='2009-06-01' or sdate=='2009-05-05'):
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm0(\d{5})/text/\1w0001.htm', urla[0]) and \
            (sdate=='2008-04-21' or sdate=='2008-03-13' or sdate=='2008-01-28' or sdate=='2008-01-16' or sdate=='2008-01-14' or sdate=='2007-11-28'):
        urla = [urla[0]]
    if sdate=='2008-11-17' and urla[0] == 'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm081117/text/81117w0001.htm':
        urla = urla[0:27] + urla[29:]
    if re.match(r'http://www.publications.parliament.uk/pa/cm200708/cmhansrd/cm0(\d{5})/debtext/\1-0001.htm', urla[0]) and \
            (sdate=='2008-06-17' or sdate=='2008-07-07' or sdate=='2008-03-06' or sdate=='2008-01-14' or sdate=='2008-06-30' or sdate=='2008-11-20'):
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm200809/cmhansrd/cm0(\d{5})/debtext/\1-0001.htm', urla[0]) and \
            (sdate=='2009-03-24' or sdate=='2009-06-30' or sdate=='2009-10-19' or sdate=='2009-07-20'):
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm200910/cmhansrd/cm(\d{6})/debtext/\1-0001.htm', urla[0]) and \
            (sdate=='2010-04-08'):
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm(\d{6})/debtext/\1-0001.htm', urla[0]) and \
            sdate in ('2010-09-06', '2011-01-25'):
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm(\d{6})/wmsindx/\1-x.htm', urlx) and \
            sdate in ('2010-06-14', '2010-09-07', '2010-09-08', '2010-09-09', '2010-09-13', '2010-09-14', '2010-09-15', '2010-09-16',
                      '2010-10-11', '2010-10-12', '2010-10-14'):
        # The first link in here points erroneously to wrans
        urla.pop(0)
    if re.match(r'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm(\d{6})/debindx/\1-x.htm', urlx) and \
            sdate=='2010-10-19':
        urla = urla[:4]
    if re.match(r'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm100906/index/100906-x.htm', urlx) and sdate=='2010-09-06':
        if urla[6] == 'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm100906/text/100906w0004.htm':
            urla.pop(6)
        if urla[5] == 'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm100906/text/100906w0013.htm':
            urla.pop(5)
        if urla[0] == 'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm100906/text/100906w0013.htm':
            urla.pop(0)
    if re.match(r'http://www.publications.parliament.uk/pa/cm201011/cmhansrd/cm100713/debindx/100713-x.htm', urlx) and sdate=='2010-07-13':
        urla[1:6] = []

    #print "urla"
    #for aaa in urla:
    #    print aaa
    #sys.exit(1)

    # loop which scrapes through all the pages following the nextlinks
    while urla:
        # import pdb;pdb.set_trace()
        url2 = url = urla[0]
        if sdate=='2009-02-27':
            url2 = re.sub('\s+', '', url2)
        #print " reading " + url
        ur = urllib.urlopen(url2)
        sr = ur.read()
        ur.close()

        # write the marker telling us which page this comes from
        if url2 != urlx:
            fout.write('<page url="' + url2 + '"/>\n')

        sr = re.sub('<!-- end of variable data -->.*<hr>(?si)', '<hr>', sr)

        # To cope with post 2006-05-08, turn <body> into <hr>
        sr = re.sub('<body><br>', '<body><hr><br>', sr)
        sr = re.sub('<body>\s+<notus', '<body><hr> <notus', sr)
        sr = re.sub('<body><h3 align="center"', '<body><hr><h3 align="center"', sr)
        sr = re.sub('<body><p>', '<body><hr><p>', sr)
        sr = re.sub('<body>\s+<!--<hd>--><br>', '<body><hr><!--<hd>--><br>', sr)

        # To cope with post 2006-09; need a better way of doing this!
        sr = re.sub('<div id="maincontent1">\s*<(p|br)>', r'<hr><\1>', sr)
        sr = re.sub('<div id="maincontent1">\s*<h3', '<hr><h3', sr)
        sr = re.sub('<div id="maincontent1">\s*<!--<hd>-->', '<hr>', sr)
        sr = re.sub('<div id="maincontent1">\s*<(notus|meta|a)', r'<hr> <\1', sr)  # 2006-05-09 / 2006-10-20
        sr = re.sub('<div id="maincontent1">\s*<link[^>]*>\s*<(br|p|h3|notus|meta|a)', r'<hr><\1', sr)  # 2008-06-17 / 2008-10...
        if sdate=='2006-11-07' or sdate=='2006-11-08':
            sr = re.sub('<!--end of UK Parliament banner for Publications -->\s*<div class="breadcrumb">.*?</div>\s*<h2(?s)', '<hr> <h2', sr)
        sr = re.sub("</?mekon[^>]*>", "", sr)
        sr = re.sub("</?vbcrlf>", "", sr)

        # To cope with post 2011-03
        sr = re.sub('<div id="content-small">', '<div id="content-small"><hr/>', sr)

        # Make sure correction is before written answer question number - XXX right place?
        sr = re.sub('(\[\d+\])\s*((?:</p>)?)\s*(<a href="[^"]*corrtext[^"]*">.*?</a>)', r'\3 \1\2', sr)

        # split by sections
        # hrsections = re.split('<hr(?: size=3)>(?i)', sr)
        # hrsections = re.split('<hr(?: size=3)?(?: width="90%" align="left")?/?>(?i)', sr)
        hrsections = re.split('<hr[^>]*>(?i)', sr)
        # import pdb;pdb.set_trace()
        hrsections = [ re.sub('^\s*<table\s*width\s*=\s*"90%">\s*<tr>\s*<td>\s*(.*)</td>\s*</tr>\s*</table>\s*$(?s)', r'\1', x) for x in hrsections ]

        # this is the case for debates on 2003-03-13 page 30
        # http://www.publications.parliament.uk/pa/cm200203/cmhansrd/vo030313/debtext/30313-32.htm
        if len(hrsections) == 1:
            if miscfuncs.IsNotQuiet():
                print len(hrsections), 'page missing', url
            fout.write('<UL><UL><UL></UL></UL></UL>\n')
            urla = urla[1:]
            continue

        # Grr, missing footers ALL OVER THE PLACE now
        if len(hrsections) == 2:
            WriteCleanText(fout, hrsections[1], url, sdate)

        # write the body of the text
        for i in range(1, len(hrsections) - 1):
            WriteCleanText(fout, hrsections[i], url, sdate)

        # find the lead on with the footer
        footer = hrsections[-1]

        # the files are sectioned by the <hr> tag into header, body and footer.
        nextsectionlink = re.findall('<\s*a\s+href\s*=\s*"?(.*?)"?\s*>next(?: section)?</(?:a|td)>(?i)', footer)
        if len(nextsectionlink) > 1:
            raise Exception, "More than one Next Section!!!"

        if not nextsectionlink:
            urla = urla[1:]
            if urla and miscfuncs.IsNotQuiet():
                print "Bridging the missing next section link at %s" % url
        else:
            currenturl = url
            url = urlparse.urljoin(url, nextsectionlink[0])
            if len(urla) > 1 and urla[1] == url:
                urla = urla[1:]
            else:
                for uo in urla:
                    if uo == url:
                        print "previous URLs:\n"
                        print string.join(urla, "\n")
                        print "\nbad next url:\n"
                        print url
                        print "\ncurrent url:\n"
                        print currenturl
                        raise Exception, "Next Section misses out the urla list"
                urla[0] = url

    fout.close()
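# Illustrative sketch (not used by the scraper itself): shows how the <hr>-splitting in
# GlueByNext above divides a fetched page into header, body sections and footer, and how
# the "next section" link is then pulled out of the footer.  The sample HTML string is
# invented for demonstration only; it assumes `re` is imported at the top of this module
# as it is for the real functions.
def _demo_hr_sectioning():
    sample = ('<html><body>banner and navigation<hr>first column of text'
              '<hr>second column of text'
              '<hr>footer with <a href="60524-0002.htm">next section</a></body></html>')
    # same split used on real pages: header is [0], footer is [-1], body is everything between
    hrsections = re.split('<hr[^>]*>(?i)', sample)
    assert len(hrsections) == 4
    body_sections = hrsections[1:-1]
    footer = hrsections[-1]
    # same footer regex as above, yielding the relative URL of the following page
    nextsectionlink = re.findall('<\s*a\s+href\s*=\s*"?(.*?)"?\s*>next(?: section)?</(?:a|td)>(?i)', footer)
    return body_sections, nextsectionlink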
def GlueByNextNew(outputFileName, urla, urlx, sdate):
    fout = open(outputFileName, "w")
    # put out the indexlink for comparison with the hansardindex file
    lt = time.gmtime()
    fout.write('<pagex url="%s" scrapedate="%s" scrapetime="%s" type="printed" />\n' % \
        (urlx, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt)))

    if re.match(r'http://www.publications.parliament.uk/pa/cm201314/cmhansrd/cm140224/debindx/140224-x.htm', urlx) and sdate=='2014-02-24':
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm201314/cmhansrd/cm140319/debindx/140319-x.htm', urlx) and sdate=='2014-03-19':
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm201314/cmhansrd/cm140512/debindx/140512-x.htm', urlx) and sdate=='2014-05-12':
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm201415/cmhansrd/cm140911/index/140911-x.htm', urlx) and sdate=='2014-09-11':
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm201415/cmhansrd/cm150210/debindx/150210-x.htm', urlx) and sdate=='2015-02-10':
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm201516/cmhansrd/cm151208/debindx/151208-x.htm', urlx) and sdate=='2015-12-08':
        urla = [urla[0]]
    if re.match(r'http://www.publications.parliament.uk/pa/cm201516/cmhansrd/cm151208/hallindx/151208-x.htm', urlx) and sdate=='2015-12-08':
        urla = [urla[0]]

    # loop which scrapes through all the pages following the nextlinks
    while urla:
        url = urla[0]
        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()

        # write the marker telling us which page this comes from
        if url != urlx:
            fout.write('<page url="' + url + '"/>\n')

        # Hopefully this comment is always present now:
        content = re.sub('^.*?<!--end of page header-->(?s)', '', sr)
        # Pages bar first one have a <hr> before the main content table, but first page does not.
        # After line above, first <hr> will be at the end of the main content.
        # import pdb;pdb.set_trace()
        if '<div class="navLinks">' in content:
            content = re.sub('<hr[^>]*>.*(?s)', '', content)
        else:
            content = re.sub('</td>\s*</tr>\s*</table>\s*<hr[^>]*>.*(?s)', '', content)
        WriteCleanText(fout, content, url, sdate)

        nextsectionlink = re.findall('<\s*a\s+href\s*=\s*"?(.*?)"?\s*>next(?: section)?</(?:a|td)>(?i)', sr)
        if len(nextsectionlink) > 1:
            raise Exception, "More than one Next Section!!!"

        if not nextsectionlink:
            urla = urla[1:]
            if urla and miscfuncs.IsNotQuiet():
                print "Bridging the missing next section link at %s" % url
        else:
            currenturl = url
            url = urlparse.urljoin(url, nextsectionlink[0])
            if len(urla) > 1 and urla[1] == url:
                urla = urla[1:]
            else:
                for uo in urla:
                    if uo == url:
                        print "previous URLs:\n", "\n".join(urla)
                        print "\nbad next url:\n", url
                        print "\ncurrent url:\n", currenturl
                        raise Exception, "Next Section misses out the urla list"
                urla[0] = url

    fout.close()
def FactorChangesWrans(majblocks, scrapeversion):
    # we need to break the scrape version
    # we separate out and match the major headings separately
    # (anyway, these aren't really used)
    # and then match the questions

    # first extract all the oldtype gid-redirects that will have been put in here by the pre-2005 bMakeOldWransGidsToNew cases
    res = re.findall('<gidredirect oldgid="[^"]*" newgid="[^"]*" matchtype="oldwransgid"/>\n', scrapeversion)

    # extract major headings and match injectively exactly (till we find a failed example).
    mhchks = re.findall('<major-heading id="([^"]*)"[^>]*>\n\s*([\s\S]*?)\s*?\n</major-heading>', scrapeversion)
    majblocknames = [ "".join(majblock[0].stext).strip() for majblock in majblocks ]
    for mhchk in mhchks:
        if mhchk[1] in majblocknames:
            i = majblocknames.index(mhchk[1])
            res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="perfectmatch"/>\n' % (mhchk[0], majblocks[i][0].qGID))
            majblocknames[i] = None  # take it out of circulation
        else:
            res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="removed"/>\n' % (mhchk[0], majblocks[0][0].qGID))

    # break into question blocks
    # [0]=headingGID, [1]=further choss, [2]=headingtext, [3]=question+reply text
    # the "<publicwhip> tags have been removed, so split to end of document
    qebchks = re.findall('<minor-heading id="([^"]*)"([^>]*)>\n([\s\S]*?)</minor-heading>\n([\s\S]*?)\s*(?=<(?:major-heading|minor-heading|gidredirect[^>]*oldwranstype)|$)', scrapeversion)

    # make the map from qnums to blocks
    qnummissings = []
    qnummapq = {}
    for majblock in majblocks:
        for qblock in majblock[1]:
            for qnum in qblock.qnums:
                assert qnum not in qnummapq  # failure means this qnum is found twice in the newly parsed file.
                qnummapq[qnum] = qblock
                if re.match("ZZZZerror", qnum):
                    qnummissings.append(qnum)

    # for each block, find the map forward and check if we want to reprint it in full.
    for qebchk in qebchks:
        qqnums = re.findall('<p [^>]*?qnum="([\d\w]+)">', qebchk[3])
        assert qqnums

        # make sure that they all link to the same qnum in the new one
        qblock = None
        for qqnum in qqnums:
            if qblock:
                if qblock.headingqb.qGID != qnummapq[qqnum].headingqb.qGID:
                    print qblock.headingqb.qGID, qnummapq[qqnum].headingqb.qGID
                assert qblock.headingqb.qGID == qnummapq[qqnum].headingqb.qGID
            elif qqnum != '0' and qqnum in qnummapq:  # 0 is when there is a missing qnum
                qblock = qnummapq[qqnum]

        # in this case the qnums fail for finding the match, so we either drop it, or find
        # the match by closest in text.
        if not qblock:
            # find the closest match for this block out of the missing qnum blocks on the new page
            # (this will need to account for all blocks if in future the correction is to add in the qnum)
            if qnummissings:
                qmissblocksscore = []
                for qqnum in qnummissings:
                    similarity = MeasureBlockSimilarity(qebchk[3], qnummapq[qqnum])
                    qmissblocksscore.append((similarity, qqnum))
                qmissblockscorebest = max(qmissblocksscore)
                qblock = qnummapq[qmissblockscorebest[1]]
                if miscfuncs.IsNotQuiet():
                    print "Missing qnum; mapping %s to %s with score %f" % (qebchk[0], qblock.headingqb.qGID, qmissblockscorebest[0])
                assert qmissblockscorebest[0] > 0.8  # otherwise it's not really a match and we need to look harder.
                # perhaps it's matched to a block in the new file which newly has a qnum, and we then have to scan against all of them.

        # now have to check matching.
        # convert both to strings and compare.
        essxfq = []  # this forms the string which we will be comparing against.
        qebchkquesids = []  # expect only one of each
        qebchkreplids = []
        for wd in re.findall("<[^>]*>|&\w+;|[^<>\s]+", qebchk[3]):
            mwd = re.match('<(p|tr|reply|ques)\s*(?:p?id="([^"]*)")?[^>]*>', wd)
            if mwd:
                essxfq.append("<%s>" % mwd.group(1))
                assert mwd.group(1) not in ("reply", "ques") or mwd.group(2)
                if mwd.group(1) == "ques":
                    qebchkquesids.append(mwd.group(2))
                elif mwd.group(1) == "reply":
                    qebchkreplids.append(mwd.group(2))
            elif not re.match("<gidredirect", wd):
                essxfq.append(wd)

        if not qblock and not qnummissings:
            res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="removed"/>\n' % (qebchk[0], majblocks[0][0].qGID))
            for qebq in qebchkquesids:
                res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="removed"/>\n' % (qebq, majblocks[0][0].qGID))
            for qebqr in qebchkreplids:
                res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="removed"/>\n' % (qebqr, majblocks[0][0].qGID))
            # Is the lred current-gidredirects bit needed here too?  Don't think so, but not sure
            continue

        # build up the same summary from the question block
        essbkfq = []
        for qblockqr in (qblock.queses, qblock.replies):
            for qb in qblockqr:
                essbkfq.append("<%s>" % qb.typ)
                for wd in re.findall("<[^>]*>|&\w+;|[^<>\s]+", "\n".join(qb.stext)):
                    mwd = re.match("<(p|tr)[^>]*>", wd)
                    if mwd:
                        essbkfq.append("<%s>" % mwd.group(1))
                    elif not re.match("<gidredirect", wd):
                        essbkfq.append(wd)
                essbkfq.append("</%s>" % qb.typ)

        # print the link forwards
        bchanges = (essxfq != essbkfq)
        matchtype = bchanges and "changes" or "perfectmatch"
        if bchanges:
            res.append("\n")
        res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n' % (qebchk[0], qblock.headingqb.qGID, matchtype))

        # write the parallel redirects for the question and reply (both mapping to same parts of each)
        # this may be more sophisticated once we see an example of failure
        # ultimately this is a job for paragraph matching

        # sometimes we get more than one question.
        # when we find a mismatch we'll deal with it as a special paragraph problem, or not bother.
        if len(qebchkquesids) != len(qblock.queses):
            print len(qebchkquesids), len(qblock.queses), qblock.queses[0].qGID
        assert len(qebchkquesids) == len(qblock.queses)
        for i in range(len(qebchkquesids)):
            res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n' % (qebchkquesids[i], qblock.queses[i].qGID, matchtype))

        assert len(qebchkreplids) == len(qblock.replies) == 1
        for qebqr in qebchkreplids:
            res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n' % (qebqr, qblock.replies[0].qGID, matchtype))

        # if changes write out the original, else just the gidmaps
        if bchanges:
            res.append('<minor-heading id="%s"%s>\n' % qebchk[0:2])
            res.append(qebchk[2])
            res.append('</minor-heading>\n')
            res.append(qebchk[3])
            res.append("\n\n")
        else:
            for lred in re.findall("<gidredirect[^>]*>\n", qebchk[3]):
                res.append("\t")
                res.append(lred)

    return res
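# Sketch of the redirect records FactorChangesWrans() accumulates in `res` (the gids
# below are made up purely for illustration):
#
#   <gidredirect oldgid="uk.org.publicwhip/wrans/2005-01-10a.12345.h"
#                newgid="uk.org.publicwhip/wrans/2005-01-10b.12345.h"
#                matchtype="perfectmatch"/>
#
# matchtype is one of "oldwransgid", "perfectmatch", "changes" or "removed"; when it is
# "changes" the original minor-heading and its question/reply text are replayed after
# the redirect so the difference against the re-parsed version can be inspected.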
def GlueByNext(fout, urlx, billtitle):
    # put out the indexlink for comparison with the hansardindex file
    lt = time.gmtime()
    fout.write('<pagex url="%s" scrapedate="%s" scrapetime="%s" billtitle="%s"/>\n' % \
        (urlx, time.strftime('%Y-%m-%d', lt), time.strftime('%X', lt), billtitle))
    url = urlx

    year = int(re.search('cm(\d{4})', urlx).group(1))
    if year >= 2010:
        pageheader = '<div id="content"'
        pagefooter = '<a name="end"/>'
    else:
        pageheader = '<img\s*src="/pa/img/portsgrn.gif"\s*alt="House\s*of\s*Commons\s*portcullis"><BR>'
        # there are various green button gifs, including two which say "continue", but with different filenames
        pagefooter = '<a href\s*=\s*"[^"]*">\s*<img border=0(?: align=top)? src="/pa/img/(?:ctntgrn|conugrn|prevgrn|contgrn).gif"'

    if re.search("/pa/cm200203/cmstand/d/st030401/am/30401s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications -->"
    if re.search("/pa/cm200102/cmstand/d/st020115/am/20115s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications -->"
    if re.search("/pa/cm200304/cmstand/c/st040428/pm/40428s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications-->"
    if re.search("/pa/cm200203/cmstand/c/st030402/30402s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications-->"
    if re.search("/pa/cm200102/cmstand/g/st020213/am/20213s01.htm$", urlx):
        pageheader = "<!--end of UK Parliament banner for Publications-->"
    if re.search("/pa/cm199900/cmstand/f/st000525/00525s10.htm#pm$", urlx):
        pageheader = "<a name=pm>"
        url = re.sub("#pm", "", url)
    if re.search("/pa/cm200910/cmpublic/bribery/100323/am", urlx):
        pageheader = '<div id="maincontent">'
        pagefooter = '<a name="end"/>'
    if re.search("/pa/cm200910/cmpublic/cooperativeandcommunity/100303/am", urlx):
        pagefooter = '<a name="end">'
    if re.search("/pa/cm200910/cmpublic/marriagewales/100224/pm", urlx):
        pagefooter = '<a name="end">'
    if re.search('/pa/cm200910/cmpublic/thirdparties/100316/am', urlx):
        pagefooter = '<a name="end">'
    if re.search("/pa/cm200910/cmpublic/gromarket/100330/am", urlx):
        pageheader = '<div id="maincontent">'
        pagefooter = '<a name="end"/>'

    # loop which scrapes through all the pages following the nextlinks
    # knocking off the known links as we go in case a "next page" is missing.
    while True:
        if re.search("/pa/cm199798/cmstand/b/st971106/am/71106s04.htm$", url):
            url = re.sub("s04.htm", "s05.htm", url)  # skip over missing page

        ur = urllib.urlopen(url)
        sr = ur.read()
        ur.close()

        # write the marker telling us which page this comes from
        fout.write('<page url="' + url + '"/>\n')

        repagebody = '(?si).*?%s(.*?)%s' % (pageheader, pagefooter)
        mbody = re.match(repagebody, sr)
        if not mbody:
            if re.search("/pa/cm199899/cmstand/e/st990429/am/90429s03.htm$", url):  # continuation does not exist
                break
            if re.search("/pa/cm199899/cmstand/special/st990420/pm/pt3/90420s12.htm$", url):  # continuation does not exist
                break
            if re.search("/pa/cm200203/cmstand/d/st031016/pm/31016s06.htm$", url):  # continuation does not exist
                break
            print "\n", pageheader, "\n\n", pagefooter, "\n\n"
            print "header", re.search('(?si)' + pageheader, sr)
            print "footer", re.search('(?si)' + pagefooter, sr)
            print url
            print sr[:2000]
            assert False

        miscfuncs.WriteCleanText(fout, mbody.group(1), False)

        # the files are sectioned by the <hr> tag into header, body and footer.
        mnextsectionlink = re.search('(?si)<\s*a\s+href\s*=\s*"?([^"]*?)"?\s*>\s*<img border=0 align=top src="/pa/img/conugrn.gif"', sr[mbody.end(1):])
        #print " nextsectionlink", mnextsectionlink
        if not mnextsectionlink:
            break
        url = urlparse.urljoin(url, mnextsectionlink.group(1))
        if miscfuncs.IsNotQuiet():
            print " ", re.sub(".*?cmstand/", "", url)

        # second and subsequent pages
        pageheader = '<p align=right>\[<a href="[^"]*">back to previous text</a>\]'
    # endwhile urla
def StandingPullGluePages(datefrom, dateto, bforcescrape):
    # make the output directory
    if not os.path.isdir(pwstandingpages):
        os.mkdir(pwstandingpages)

    # load the index file previously made by createhansardindex
    cstandingindex = LoadStandingIndex(pwstandingindex)

    # scan through the directory and make a mapping of all the copies for each
    lshortnamemap = {}
    for ldfile in os.listdir(pwstandingpages):
        mnums = re.match("(standing.*?)([a-z]*)\.html$", ldfile)
        if mnums:
            lshortnamemap.setdefault(mnums.group(1), []).append((AlphaStringToOrder(mnums.group(2)), mnums.group(2), ldfile))
        elif os.path.isfile(os.path.join(pwstandingpages, ldfile)):
            print "not recognized file:", ldfile, " in ", pwstandingpages

    # loop through each line of the index
    for dnu in cstandingindex.res:
        # implement date range
        if dnu[2] < datefrom or dnu[2] > dateto:
            continue

        # make the filename
        dgflatestalpha, dgflatest = "", None
        if dnu[0] in lshortnamemap:
            ldgf = max(lshortnamemap[dnu[0]])
            dgflatestalpha = ldgf[1]
            dgflatest = os.path.join(pwstandingpages, ldgf[2])
        dgfnextalpha = NextAlphaString(dgflatestalpha)
        ldgfnext = '%s%s.html' % (dnu[0], dgfnextalpha)
        dgfnext = os.path.join(pwstandingpages, ldgfnext)
        assert not dgflatest or os.path.isfile(dgflatest)
        assert not os.path.isfile(dgfnext), dgfnext

        dgfnextstem = "%s%s" % (dnu[0], dgfnextalpha)
        dgflateststem = "%s%s" % (dnu[0], dgflatestalpha)

        # hansard index page
        urlx = dnu[1]

        # if not force scrape, skip this record when the previous scrape's header url matches
        if not bforcescrape and dgflatest:
            fpgx = open(dgflatest, "r")
            pgx = fpgx.readline()
            fpgx.close()
            if pgx:
                pgx = re.findall('<pagex url="([^"]*)"[^/]*/>', pgx)
                if pgx:
                    if pgx[0] == urlx:
                        continue

        # make the message
        if miscfuncs.IsNotQuiet():
            print dnu[0], (dgflatest and 'RE-scraping' or 'scraping'), re.sub(".*?cmstand/", "", urlx)
            print dnu[3]

        # now we take out the local pointer and start the gluing
        # we could check that all our links above get cleared.
        dtemp = open(tempfilename, "w")
        GlueByNext(dtemp, urlx, dnu[3])
        dtemp.close()

        # now we have to decide whether it's actually new and should be copied onto dgfnext.
        if dgflatest:  # the removal of \r makes testing sizes unreliable -- : and os.path.getsize(tempfilename) == os.path.getsize(dgflatest):
            # load in as strings and check matching
            fdgflatest = open(dgflatest)
            sdgflatest = fdgflatest.readlines()
            fdgflatest.close()

            fdgfnext = open(tempfilename)
            sdgfnext = fdgfnext.readlines()
            fdgfnext.close()

            # first line contains the scrape date
            if sdgflatest[1:] == sdgfnext[1:]:
                if miscfuncs.IsNotQuiet():
                    print " matched with:", dgflatest
                continue

        ReplicatePatchToNewScrapedVersion('standing', dgflateststem, dgflatest, dgfnext, dgfnextstem)

        print dnu[0], (dgflatest and 'RE-scraped' or 'scraped'), re.sub(".*?cmpages[/\\\\]", "", dgfnext)
        os.rename(tempfilename, dgfnext)