def StripWestminhallHeadings(headspeak, sdate):
    """Check and strip the opening headings of a Westminster Hall file.

    The headings are consumed in order: the 'Initial' heading, the
    "Westminster Hall" title, then the date line.  The heading that
    follows (e.g. "[Mr. John McWilliam in the Chair]") is deliberately
    left in place as a title.

    headspeak -- sequence of heading entries.  Entry [0] is the heading
                 text, entry [1] is handed to StampUrl.UpdateStampUrl, and
                 entry [2] must be empty for the date heading (presumably
                 the speeches filed under the heading -- confirm against
                 the code that builds headspeak).
    sdate     -- date of the file, compared against the `.date` attribute
                 of the parsed date heading.

    Returns (ih, stampurl): the index of the first heading not consumed
    and a StampUrl updated from every consumed heading.

    Raises Exception if the date heading disagrees with sdate or carries
    content in entry [2], or if no stamp / page url was found in the
    consumed headings.  StripDebateHeading may raise as well.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed copy of this function -- confirm against the
    # repository version.
    ih = StripDebateHeading('Initial', 0, headspeak)

    # Westminster Hall
    # The trailing (?i) is left exactly as it was: the pattern is consumed
    # by StripDebateHeading, which is not visible from here.
    ih = StripDebateHeading('westminster hall(?i)', ih, headspeak)

    # date line
    givendate = re.sub('</?i>', ' ', headspeak[ih][0])
    # re.I replaces the original trailing inline (?i), which is rejected
    # by Python 3.11+ and means the same thing on older versions.
    gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
    if gd:
        givendate = gd.group(1)
    if (sdate != mx.DateTime.DateTimeFrom(givendate).date) or headspeak[ih][2]:
        raise Exception('date heading %s mismatches with date %s'
                        % (repr(headspeak[ih]), sdate))
    ih += 1

    # next line is:
    #   <H3><center>[Mr. John McWilliam in the Chair]</center></H3>
    # but we leave it as a title.

    # find the url, colnum and time stamps that occur before anything else
    # in the unspoken text
    stampurl = StampUrl(sdate)
    stampurl.timestamp = '<stamp time="%s"/>' % "unknown"
    for j in range(ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
def StripWestminhallHeadings(headspeak, sdate):
    """Check and strip the opening headings of a Westminster Hall file.

    The headings are consumed in order: the 'Initial' heading, the
    "Westminster Hall" title, then the date line.  The heading that
    follows (e.g. "[Mr. John McWilliam in the Chair]") is deliberately
    left in place as a title.

    headspeak -- sequence of heading entries.  Entry [0] is the heading
                 text, entry [1] is handed to StampUrl.UpdateStampUrl, and
                 entry [2] must be empty for the date heading (presumably
                 the speeches filed under the heading -- confirm against
                 the code that builds headspeak).
    sdate     -- date of the file, compared against the `.date` attribute
                 of the parsed date heading.

    Returns (ih, stampurl): the index of the first heading not consumed
    and a StampUrl updated from every consumed heading.

    Raises Exception if the date heading disagrees with sdate or carries
    content in entry [2], or if no stamp / page url was found in the
    consumed headings.  StripDebateHeading may raise as well.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed copy of this function -- confirm against the
    # repository version.
    ih = StripDebateHeading('Initial', 0, headspeak)

    # Westminster Hall
    # The trailing (?i) is left exactly as it was: the pattern is consumed
    # by StripDebateHeading, which is not visible from here.
    ih = StripDebateHeading('westminster hall(?i)', ih, headspeak)

    # date line
    givendate = re.sub('</?i>', ' ', headspeak[ih][0])
    # re.I replaces the original trailing inline (?i), which is rejected
    # by Python 3.11+ and means the same thing on older versions.
    gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
    if gd:
        givendate = gd.group(1)
    if (sdate != mx.DateTime.DateTimeFrom(givendate).date) or headspeak[ih][2]:
        raise Exception('date heading %s mismatches with date %s'
                        % (repr(headspeak[ih]), sdate))
    ih += 1

    # next line is:
    #   <H3><center>[Mr. John McWilliam in the Chair]</center></H3>
    # but we leave it as a title.

    # find the url, colnum and time stamps that occur before anything else
    # in the unspoken text
    stampurl = StampUrl(sdate)
    stampurl.timestamp = '<stamp time="%s"/>' % "unknown"
    for j in range(ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
def StripDebateHeadings(headspeak, sdate):
    """Check and strip the opening headings of a House of Commons debate file.

    Consumes, in order: the 'Initial' heading, the optional bound-volume
    title headings, the "House of Commons" title, the date line (unless
    the next heading is already "the house met at ..."), the "The House
    met at ..." line, and the optional prayers / standing order / "in the
    Chair" headings.

    headspeak -- sequence of heading entries.  Entry [0] is the heading
                 text, entry [1] is handed to StampUrl.UpdateStampUrl, and
                 entry [2] must be empty for the date and "house met at"
                 headings (presumably the speeches filed under the
                 heading -- confirm against the code that builds
                 headspeak).
    sdate     -- date of the file; compared against the parsed date
                 heading and against a few hard-coded ISO date strings.

    Returns (ih, stampurl): the index of the first heading not consumed
    and a StampUrl updated from every consumed heading.  When a start time
    was recognised, stampurl.timestamp is set to the matching
    '<stamp time="HH:MM:SS"/>'.

    Raises Exception on a date mismatch or when no stamp / page url was
    found; raises ContextException on a non-conforming "the house met at"
    heading or an unrecognised start time.  StripDebateHeading may raise
    as well.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed copy of this function -- confirm against the
    # repository version.
    # Patterns handed to StripDebateHeading keep any trailing (?i) exactly
    # as before, because that helper is not visible from here.

    # the 'Initial' is inserted by the splitheadingsspeakers function
    ih = StripDebateHeading('Initial', 0, headspeak)

    # volume type heading
    if re.search('THE$', headspeak[ih][0]):
        ih = StripDebateHeading('THE', ih, headspeak)
        # NOTE(review): "(?: )+" looks like it once matched a non-breaking
        # space entity or character -- confirm against the repository copy.
        ih = StripDebateHeading('PARLIAMENTARY(?: )+DEBATES', ih, headspeak)
    elif re.search('THE PARLIAMENTARY DEBATES', headspeak[ih][0]):
        ih = StripDebateHeading('THE PARLIAMENTARY DEBATES', ih, headspeak)
    if re.search('OFFICIAL REPORT', headspeak[ih][0]):
        ih = StripDebateHeading('OFFICIAL REPORT', ih, headspeak)
        ih = StripDebateHeading('IN THE .*? SESSION OF THE .*? PARLIAMENT OF THE', ih, headspeak, True)
        ih = StripDebateHeading('UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND', ih, headspeak, True)
        ih = StripDebateHeading(r'\[WHICH OPENED .*?\]', ih, headspeak, True)
        ih = StripDebateHeading('.*? YEAR OF THE REIGN OF.*?', ih, headspeak, True)
        ih = StripDebateHeading('HER MAJESTY QUEEN ELIZABETH II', ih, headspeak, True)
        ih = StripDebateHeading(r'SI.*? SERIES.*?VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('SI.*? SERIES', ih, headspeak, True)
        ih = StripDebateHeading(r'VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('.*? VOLUME OF SESSION .*?', ih, headspeak)

    # House of Commons
    ih = StripDebateHeading('house of commons(?i)', ih, headspeak)

    # Tuesday 9 December 2003
    # re.I replaces the original trailing inline (?i) flags, which are
    # rejected by Python 3.11+ and mean the same thing on older versions.
    if not re.match('the house met at .*', headspeak[ih][0], re.I):
        # NOTE(review): as visible here this swaps a space for a space; the
        # pattern was presumably a non-breaking space entity or character
        # before the text was normalised -- confirm against the repository.
        givendate = re.sub(' ', ' ', headspeak[ih][0])
        givendate = re.sub('</?i>', ' ', givendate)
        gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
        if gd:
            givendate = gd.group(1)
        if (sdate != mx.DateTime.DateTimeFrom(givendate).date) or headspeak[ih][2]:
            raise Exception('date heading %s mismatches with date %s'
                            % (repr(headspeak[ih]), sdate))
        ih += 1

    gstarttime = None
    if sdate != "2001-06-13":
        # The House met at half-past Ten o'clock
        gstarttime = re.match(
            r'(?:<stamp aname="[^"]*"/>)*(?:<i>)?\s*the\s+house (?:being |having )?met at?\s+(?:</i><i>\s*)?(.*?)(?:, and the Speaker-Elect having taken the Chair;)?(?:</i>)?$',
            headspeak[ih][0], re.I)
        if (not gstarttime) or headspeak[ih][2]:
            raise ContextException(
                'non-conforming "the house met at" heading %s' % repr(headspeak[ih]), "")
        ih += 1

    # Start of a new parliament is special
    if sdate not in ["2001-06-14", "2001-06-13", "2005-05-11", "2005-05-12"]:
        # PRAYERS
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)
        ih = StripDebateHeading(r'pursuant to the Standing Order\.', ih, headspeak, True)
        # in the chair
        ih = StripDebateHeading(r'\[.*?[ >]in the chair[<>i/\.]*\](?i)', ih, headspeak, True)

    # find the url, colnum and time stamps that occur before anything else
    # in the unspoken text
    stampurl = StampUrl(sdate)

    # set the time from the wording 'house met at' thing.
    if gstarttime:
        met_at = re.sub('</?i>', ' ', gstarttime.group(1))
        met_at = re.sub(r'\s+', ' ', met_at)
        # Known wordings, tried in this order; each is matched
        # case-insensitively against the start of the captured text.
        start_times = (
            ("half-past Nine", '09:30:00'),
            ("a quarter to Ten o", '09:45:00'),
            ("Ten o'clock", '10:00:00'),
            ("half-past Ten", '10:30:00'),
            ("Eleven o'clock", '11:00:00'),
            (r"twenty-five minutes past\s*Eleven", '11:25:00'),
            (r"twenty-six minutes past\s*Eleven", '11:26:00'),
            (r"twenty-nine minutes past\s*Eleven", '11:29:00'),
            ("half-past Eleven", '11:30:00'),
            ("Twelve noon", '12:00:00'),
            ("half-past One", '13:30:00'),
            ("half-past Two", '14:30:00'),
            ("twenty minutes to Three", '14:40:00'),
            ("10 minutes past Three", '15:10:00'),
            ("Six o'clock", '18:00:00'),
        )
        for wording, clock in start_times:
            if re.match(wording, met_at, re.I):
                newtime = clock
                break
        else:
            raise ContextException("Start time not known: " + met_at)
        stampurl.timestamp = '<stamp time="%s"/>' % newtime

    for j in range(ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
def StripDebateHeadings(headspeak, sdate):
    """Check and strip the opening headings of a House of Commons debate file.

    Consumes, in order: the 'Initial' heading, the optional bound-volume
    title headings, the "House of Commons" title, the date line (unless
    the next heading is already "the house met at ..."), the "The House
    met at ..." line, and the optional prayers / standing order / "in the
    Chair" headings.

    headspeak -- sequence of heading entries.  Entry [0] is the heading
                 text, entry [1] is handed to StampUrl.UpdateStampUrl, and
                 entry [2] must be empty for the date and "house met at"
                 headings (presumably the speeches filed under the
                 heading -- confirm against the code that builds
                 headspeak).
    sdate     -- date of the file; compared against the parsed date
                 heading and against a few hard-coded ISO date strings.

    Returns (ih, stampurl): the index of the first heading not consumed
    and a StampUrl updated from every consumed heading.  When a start time
    was recognised, stampurl.timestamp is set to the matching
    '<stamp time="HH:MM:SS"/>'.

    Raises Exception on a date mismatch or when no stamp / page url was
    found; raises ContextException on a non-conforming "the house met at"
    heading or an unrecognised start time.  StripDebateHeading may raise
    as well.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed copy of this function -- confirm against the
    # repository version.
    # Patterns handed to StripDebateHeading keep any trailing (?i) exactly
    # as before, because that helper is not visible from here.

    # the 'Initial' is inserted by the splitheadingsspeakers function
    ih = StripDebateHeading('Initial', 0, headspeak)

    # volume type heading
    if re.search('THE$', headspeak[ih][0]):
        ih = StripDebateHeading('THE', ih, headspeak)
        # NOTE(review): "(?: )+" looks like it once matched a non-breaking
        # space entity or character -- confirm against the repository copy.
        ih = StripDebateHeading('PARLIAMENTARY(?: )+DEBATES', ih, headspeak)
    elif re.search('THE PARLIAMENTARY DEBATES', headspeak[ih][0]):
        ih = StripDebateHeading('THE PARLIAMENTARY DEBATES', ih, headspeak)
    if re.search('OFFICIAL REPORT', headspeak[ih][0]):
        ih = StripDebateHeading('OFFICIAL REPORT', ih, headspeak)
        ih = StripDebateHeading('IN THE .*? SESSION OF THE .*? PARLIAMENT OF THE', ih, headspeak, True)
        ih = StripDebateHeading('UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND', ih, headspeak, True)
        ih = StripDebateHeading(r'\[WHICH OPENED .*?\]', ih, headspeak, True)
        ih = StripDebateHeading('.*? YEAR OF THE REIGN OF.*?', ih, headspeak, True)
        ih = StripDebateHeading('HER MAJESTY QUEEN ELIZABETH II', ih, headspeak, True)
        ih = StripDebateHeading(r'SI.*? SERIES.*?VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('SI.*? SERIES', ih, headspeak, True)
        ih = StripDebateHeading(r'VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('.*? VOLUME OF SESSION .*?', ih, headspeak)

    # House of Commons
    ih = StripDebateHeading('house of commons(?i)', ih, headspeak)

    # Tuesday 9 December 2003
    # re.I replaces the original trailing inline (?i) flags, which are
    # rejected by Python 3.11+ and mean the same thing on older versions.
    if not re.match('the house met at .*', headspeak[ih][0], re.I):
        # NOTE(review): as visible here this swaps a space for a space; the
        # pattern was presumably a non-breaking space entity or character
        # before the text was normalised -- confirm against the repository.
        givendate = re.sub(' ', ' ', headspeak[ih][0])
        givendate = re.sub('</?i>', ' ', givendate)
        gd = re.match('(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
        if gd:
            givendate = gd.group(1)
        if (sdate != mx.DateTime.DateTimeFrom(givendate).date) or headspeak[ih][2]:
            raise Exception('date heading %s mismatches with date %s'
                            % (repr(headspeak[ih]), sdate))
        ih += 1

    gstarttime = None
    if sdate != "2001-06-13":
        # The House met at half-past Ten o'clock
        gstarttime = re.match(
            r'(?:<stamp aname="[^"]*"/>)*(?:<i>)?\s*the\s+house (?:being |having )?met at?\s+(?:</i><i>\s*)?(.*?)(?:, and the Speaker-Elect having taken the Chair;)?(?:</i>)?$',
            headspeak[ih][0], re.I)
        if (not gstarttime) or headspeak[ih][2]:
            raise ContextException(
                'non-conforming "the house met at" heading %s' % repr(headspeak[ih]), "")
        ih += 1

    # Start of a new parliament is special
    if sdate not in ["2001-06-14", "2001-06-13", "2005-05-11", "2005-05-12"]:
        # PRAYERS
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)
        ih = StripDebateHeading(r'pursuant to the Standing Order\.', ih, headspeak, True)
        # in the chair
        ih = StripDebateHeading(r'\[.*?[ >]in the chair[<>i/\.]*\](?i)', ih, headspeak, True)

    # find the url, colnum and time stamps that occur before anything else
    # in the unspoken text
    stampurl = StampUrl(sdate)

    # set the time from the wording 'house met at' thing.
    if gstarttime:
        met_at = re.sub('</?i>', ' ', gstarttime.group(1))
        met_at = re.sub(r'\s+', ' ', met_at)
        # Known wordings, tried in this order; each is matched
        # case-insensitively against the start of the captured text.
        start_times = (
            ("half-past Nine", '09:30:00'),
            ("a quarter to Ten o", '09:45:00'),
            ("Ten o'clock", '10:00:00'),
            ("half-past Ten", '10:30:00'),
            ("Eleven o'clock", '11:00:00'),
            (r"twenty-five minutes past\s*Eleven", '11:25:00'),
            (r"twenty-six minutes past\s*Eleven", '11:26:00'),
            (r"twenty-nine minutes past\s*Eleven", '11:29:00'),
            ("half-past Eleven", '11:30:00'),
            ("Twelve noon", '12:00:00'),
            ("half-past One", '13:30:00'),
            ("half-past Two", '14:30:00'),
            ("twenty minutes to Three", '14:40:00'),
            ("10 minutes past Three", '15:10:00'),
            ("Six o'clock", '18:00:00'),
        )
        for wording, clock in start_times:
            if re.match(wording, met_at, re.I):
                newtime = clock
                break
        else:
            raise ContextException("Start time not known: " + met_at)
        stampurl.timestamp = '<stamp time="%s"/>' % newtime

    for j in range(ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)