Пример #1
0
def StripWestminhallHeadings(headspeak, sdate):
    """Skip the opening headings of a Westminster Hall sitting.

    Asks StripDebateHeading to step past the 'Initial' and
    'westminster hall' headings, checks that the next heading carries the
    sitting date, and collects the stamps found in the skipped headings.

    Args:
        headspeak: indexable sequence of heading records.  Element 0 of a
            record is read as the heading text, element 1 is passed to
            StampUrl.UpdateStampUrl, and element 2 must be falsy for the
            date heading (presumably the text that follows the heading --
            TODO confirm against the code that builds headspeak).
        sdate: sitting date, compared for equality with the ``date``
            attribute of the mx.DateTime value parsed from the date heading.

    Returns:
        Tuple ``(ih, stampurl)``: the index of the first heading that was
        not consumed, and the StampUrl updated from headings ``0..ih-1``.

    Raises:
        Exception: if the date heading disagrees with ``sdate`` or has a
            truthy element 2, or if no stamp / page url was collected.
    """
    # check and strip the first two headings in as much as they are there
    ih = StripDebateHeading('Initial', 0, headspeak)

    # Westminster Hall
    # (pattern handed over unchanged: how StripDebateHeading embeds it is
    # not visible from here, so its inline flag is left where it was)
    ih = StripDebateHeading('westminster hall(?i)', ih, headspeak)

    # date line: drop italic tags and any leading <stamp aname=.../> markers
    givendate = re.sub('</?i>', ' ', headspeak[ih][0])
    # re.I replaces the trailing inline "(?i)", which newer versions of the
    # re module reject when it is not at the start of the pattern
    gd = re.match(r'(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
    if gd:
        givendate = gd.group(1)
    headingdate = mx.DateTime.DateTimeFrom(givendate).date
    if sdate != headingdate or headspeak[ih][2]:
        raise Exception('date heading %s mismatches with date %s' % (
            repr(headspeak[ih]), sdate))
    ih += 1

    # next line is:
    # <H3><center>[Mr. John McWilliam in the Chair]</center></H3>
    # but we leave it as a title.

    # find the url, colnum and time stamps that occur before anything else
    # in the unspoken text
    stampurl = StampUrl(sdate)
    stampurl.timestamp = '<stamp time="%s"/>' % "unknown"

    for j in range(ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
Пример #2
0
def StripWestminhallHeadings(headspeak, sdate):
    """Skip the opening headings of a Westminster Hall sitting.

    Asks StripDebateHeading to step past the 'Initial' and
    'westminster hall' headings, checks that the next heading carries the
    sitting date, and collects the stamps found in the skipped headings.

    Args:
        headspeak: indexable sequence of heading records.  Element 0 of a
            record is read as the heading text, element 1 is passed to
            StampUrl.UpdateStampUrl, and element 2 must be falsy for the
            date heading (presumably the text that follows the heading --
            TODO confirm against the code that builds headspeak).
        sdate: sitting date, compared for equality with the ``date``
            attribute of the mx.DateTime value parsed from the date heading.

    Returns:
        Tuple ``(ih, stampurl)``: the index of the first heading that was
        not consumed, and the StampUrl updated from headings ``0..ih-1``.

    Raises:
        Exception: if the date heading disagrees with ``sdate`` or has a
            truthy element 2, or if no stamp / page url was collected.
    """
    # NOTE: the body uses one indentation style throughout; mixing tabs
    # with runs of spaces only parses where a tab is expanded to 8 columns.

    # check and strip the first two headings in as much as they are there
    ih = StripDebateHeading('Initial', 0, headspeak)

    # Westminster Hall
    # (pattern handed over unchanged: how StripDebateHeading embeds it is
    # not visible from here, so its inline flag is left where it was)
    ih = StripDebateHeading('westminster hall(?i)', ih, headspeak)

    # date line: drop italic tags and any leading <stamp aname=.../> markers
    givendate = re.sub('</?i>', ' ', headspeak[ih][0])
    # re.I replaces the trailing inline "(?i)", which newer versions of the
    # re module reject when it is not at the start of the pattern
    gd = re.match(r'(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
    if gd:
        givendate = gd.group(1)
    headingdate = mx.DateTime.DateTimeFrom(givendate).date
    if sdate != headingdate or headspeak[ih][2]:
        raise Exception('date heading %s mismatches with date %s' % (
            repr(headspeak[ih]), sdate))
    ih += 1

    # next line is:
    # <H3><center>[Mr. John McWilliam in the Chair]</center></H3>
    # but we leave it as a title.

    # find the url, colnum and time stamps that occur before anything else
    # in the unspoken text
    stampurl = StampUrl(sdate)
    stampurl.timestamp = '<stamp time="%s"/>' % "unknown"

    for j in range(ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)
Пример #3
0
def StripDebateHeadings(headspeak, sdate):
    """Skip the opening headings of a House of Commons debate file.

    Asks StripDebateHeading to step past the volume boilerplate and the
    'house of commons' heading, checks the date heading against ``sdate``,
    reads the "the House met at ..." heading, optionally steps past the
    prayers / standing order / "in the chair" headings, and collects the
    stamps found in the headings that were skipped.

    Args:
        headspeak: indexable sequence of heading records.  Element 0 of a
            record is read as the heading text, element 1 is passed to
            StampUrl.UpdateStampUrl, and element 2 must be falsy for the
            date and "house met at" headings (presumably the text that
            follows the heading -- TODO confirm against the code that
            builds headspeak).
        sdate: sitting date; compared with literals such as "2001-06-13"
            and with the ``date`` attribute of the mx.DateTime value parsed
            from the date heading.

    Returns:
        Tuple ``(ih, stampurl)``: the index of the first heading that was
        not consumed, and the StampUrl updated from headings ``0..ih-1``.
        ``stampurl.timestamp`` is assigned only when a "house met at"
        heading was read.

    Raises:
        Exception: if the date heading disagrees with ``sdate`` or has a
            truthy element 2, or if no stamp / page url was collected.
        ContextException: if the "house met at" heading does not conform,
            or its time wording is not one of the known phrases.
    """
    # check and strip the first two headings in as much as they are there
    # (the 'Initial' is inserted by the splitheadingsspeakers function)
    ih = StripDebateHeading('Initial', 0, headspeak)

    # volume type heading
    # Patterns given to StripDebateHeading keep their original text (raw
    # string prefixes do not change the value); how the helper embeds them
    # is not visible from here.
    if re.search('THE$', headspeak[ih][0]):
        ih = StripDebateHeading('THE', ih, headspeak)
        ih = StripDebateHeading('PARLIAMENTARY(?:&nbsp;)+DEBATES', ih,
                                headspeak)
    elif re.search('THE PARLIAMENTARY DEBATES', headspeak[ih][0]):
        ih = StripDebateHeading('THE PARLIAMENTARY DEBATES', ih, headspeak)
    if re.search('OFFICIAL REPORT', headspeak[ih][0]):
        ih = StripDebateHeading('OFFICIAL REPORT', ih, headspeak)
        ih = StripDebateHeading(
            'IN THE .*? SESSION OF THE .*? PARLIAMENT OF THE', ih, headspeak,
            True)
        ih = StripDebateHeading(
            'UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND', ih,
            headspeak, True)
        ih = StripDebateHeading(r'\[WHICH OPENED .*?\]', ih, headspeak, True)
        ih = StripDebateHeading('.*? YEAR OF THE REIGN OF.*?', ih, headspeak,
                                True)
        ih = StripDebateHeading('HER MAJESTY QUEEN ELIZABETH II', ih,
                                headspeak, True)
        ih = StripDebateHeading(r'SI.*? SERIES.*?VOLUME \d+', ih, headspeak,
                                True)
        ih = StripDebateHeading('SI.*? SERIES', ih, headspeak, True)
        ih = StripDebateHeading(r'VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('.*? VOLUME OF SESSION .*?', ih, headspeak)

    # House of Commons
    ih = StripDebateHeading('house of commons(?i)', ih, headspeak)

    # Tuesday 9 December 2003
    # (skipped when the current heading is already "the house met at ...")
    if not re.match('the house met at .*', headspeak[ih][0], re.I):
        givendate = re.sub('&nbsp;', ' ', headspeak[ih][0])
        givendate = re.sub('</?i>', ' ', givendate)
        # re.I replaces the trailing inline "(?i)", which newer versions of
        # the re module reject when it is not at the start of the pattern
        gd = re.match(r'(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
        if gd:
            givendate = gd.group(1)
        headingdate = mx.DateTime.DateTimeFrom(givendate).date
        if sdate != headingdate or headspeak[ih][2]:
            raise Exception('date heading %s mismatches with date %s' % (
                repr(headspeak[ih]), sdate))
        ih += 1

    gstarttime = None
    if sdate != "2001-06-13":
        # The House met at half-past Ten o'clock
        gstarttime = re.match(
            r'(?:<stamp aname="[^"]*"/>)*(?:<i>)?\s*the\s+house '
            r'(?:being |having )?met at?\s+(?:</i><i>\s*)?(.*?)'
            r'(?:, and the Speaker-Elect having taken the Chair;)?'
            r'(?:</i>)?$',
            headspeak[ih][0], re.I)
        if (not gstarttime) or headspeak[ih][2]:
            raise ContextException(
                'non-conforming "the house met at" heading %s' %
                repr(headspeak[ih]), "")
        ih += 1

    # Start of a new parliament is special
    if sdate not in ["2001-06-14", "2001-06-13", "2005-05-11", "2005-05-12"]:

        # PRAYERS
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)

        ih = StripDebateHeading(r'pursuant to the Standing Order\.', ih,
                                headspeak, True)

        # in the chair
        ih = StripDebateHeading(r'\[.*?[ >]in the chair[<>i/\.]*\](?i)', ih,
                                headspeak, True)

    # find the url, colnum and time stamps that occur before anything else
    # in the unspoken text
    stampurl = StampUrl(sdate)

    # set the time from the wording 'house met at' thing.
    if gstarttime:
        newtime = _HouseMetStartTime(gstarttime.group(1))
        stampurl.timestamp = '<stamp time="%s"/>' % newtime

    for j in range(ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)


def _HouseMetStartTime(timetext):
    """Turn the wording of a "the House met at ..." time into HH:MM:SS.

    Italic tags are replaced by spaces and whitespace runs are collapsed
    before matching.  Each known phrase is matched case-insensitively at
    the start of the text; the first phrase that matches wins.

    Raises:
        ContextException: if the wording matches none of the known phrases.
    """
    timetext = re.sub('</?i>', ' ', timetext)
    timetext = re.sub(r'\s+', ' ', timetext)

    # The apostrophe in "o'clock" is accepted both literally and as the
    # HTML entity &#039;.
    oclock = r"o(?:'|&#039;)clock"
    knowntimes = (
        ("half-past Nine", '09:30:00'),
        ("a quarter to Ten o", '09:45:00'),
        ("Ten " + oclock, '10:00:00'),
        ("half-past Ten", '10:30:00'),
        ("Eleven " + oclock, '11:00:00'),
        (r"twenty-five minutes past\s*Eleven", '11:25:00'),
        (r"twenty-six minutes past\s*Eleven", '11:26:00'),
        (r"twenty-nine minutes past\s*Eleven", '11:29:00'),
        ("half-past Eleven", '11:30:00'),
        ("Twelve noon", '12:00:00'),
        ("half-past One", '13:30:00'),
        ("half-past Two", '14:30:00'),
        ("twenty minutes to Three", '14:40:00'),
        ("10 minutes past Three", '15:10:00'),
        ("Six " + oclock, '18:00:00'),
    )
    for phrase, clock in knowntimes:
        if re.match(phrase, timetext, re.I):
            return clock
    raise ContextException("Start time not known: " + timetext)
Пример #4
0
def StripDebateHeadings(headspeak, sdate):
    """Skip the opening headings of a House of Commons debate file.

    Asks StripDebateHeading to step past the volume boilerplate and the
    'house of commons' heading, checks the date heading against ``sdate``,
    reads the "the House met at ..." heading, optionally steps past the
    prayers / standing order / "in the chair" headings, and collects the
    stamps found in the headings that were skipped.

    Args:
        headspeak: indexable sequence of heading records.  Element 0 of a
            record is read as the heading text, element 1 is passed to
            StampUrl.UpdateStampUrl, and element 2 must be falsy for the
            date and "house met at" headings (presumably the text that
            follows the heading -- TODO confirm against the code that
            builds headspeak).
        sdate: sitting date; compared with literals such as "2001-06-13"
            and with the ``date`` attribute of the mx.DateTime value parsed
            from the date heading.

    Returns:
        Tuple ``(ih, stampurl)``: the index of the first heading that was
        not consumed, and the StampUrl updated from headings ``0..ih-1``.
        ``stampurl.timestamp`` is assigned only when a "house met at"
        heading was read.

    Raises:
        Exception: if the date heading disagrees with ``sdate`` or has a
            truthy element 2, or if no stamp / page url was collected.
        ContextException: if the "house met at" heading does not conform,
            or its time wording is not one of the known phrases.
    """
    # NOTE: the body uses one indentation style throughout; mixing tabs
    # with runs of spaces only parses where a tab is expanded to 8 columns.

    # check and strip the first two headings in as much as they are there
    # (the 'Initial' is inserted by the splitheadingsspeakers function)
    ih = StripDebateHeading('Initial', 0, headspeak)

    # volume type heading
    # Patterns given to StripDebateHeading keep their original text (raw
    # string prefixes do not change the value); how the helper embeds them
    # is not visible from here.
    if re.search('THE$', headspeak[ih][0]):
        ih = StripDebateHeading('THE', ih, headspeak)
        ih = StripDebateHeading('PARLIAMENTARY(?:&nbsp;)+DEBATES', ih,
                                headspeak)
    elif re.search('THE PARLIAMENTARY DEBATES', headspeak[ih][0]):
        ih = StripDebateHeading('THE PARLIAMENTARY DEBATES', ih, headspeak)
    if re.search('OFFICIAL REPORT', headspeak[ih][0]):
        ih = StripDebateHeading('OFFICIAL REPORT', ih, headspeak)
        ih = StripDebateHeading(
            'IN THE .*? SESSION OF THE .*? PARLIAMENT OF THE', ih, headspeak,
            True)
        ih = StripDebateHeading(
            'UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND', ih,
            headspeak, True)
        ih = StripDebateHeading(r'\[WHICH OPENED .*?\]', ih, headspeak, True)
        ih = StripDebateHeading('.*? YEAR OF THE REIGN OF.*?', ih, headspeak,
                                True)
        ih = StripDebateHeading('HER MAJESTY QUEEN ELIZABETH II', ih,
                                headspeak, True)
        ih = StripDebateHeading(r'SI.*? SERIES.*?VOLUME \d+', ih, headspeak,
                                True)
        ih = StripDebateHeading('SI.*? SERIES', ih, headspeak, True)
        ih = StripDebateHeading(r'VOLUME \d+', ih, headspeak, True)
        ih = StripDebateHeading('.*? VOLUME OF SESSION .*?', ih, headspeak)

    # House of Commons
    ih = StripDebateHeading('house of commons(?i)', ih, headspeak)

    # Tuesday 9 December 2003
    # (skipped when the current heading is already "the house met at ...")
    if not re.match('the house met at .*', headspeak[ih][0], re.I):
        givendate = re.sub('&nbsp;', ' ', headspeak[ih][0])
        givendate = re.sub('</?i>', ' ', givendate)
        # re.I replaces the trailing inline "(?i)", which newer versions of
        # the re module reject when it is not at the start of the pattern
        gd = re.match(r'(?:<stamp aname="[^"]*"/>)*(.*)$', givendate, re.I)
        if gd:
            givendate = gd.group(1)
        headingdate = mx.DateTime.DateTimeFrom(givendate).date
        if sdate != headingdate or headspeak[ih][2]:
            raise Exception('date heading %s mismatches with date %s' % (
                repr(headspeak[ih]), sdate))
        ih += 1

    gstarttime = None
    if sdate != "2001-06-13":
        # The House met at half-past Ten o'clock
        gstarttime = re.match(
            r'(?:<stamp aname="[^"]*"/>)*(?:<i>)?\s*the\s+house '
            r'(?:being |having )?met at?\s+(?:</i><i>\s*)?(.*?)'
            r'(?:, and the Speaker-Elect having taken the Chair;)?'
            r'(?:</i>)?$',
            headspeak[ih][0], re.I)
        if (not gstarttime) or headspeak[ih][2]:
            raise ContextException(
                'non-conforming "the house met at" heading %s' %
                repr(headspeak[ih]), "")
        ih += 1

    # Start of a new parliament is special
    if sdate not in ["2001-06-14", "2001-06-13", "2005-05-11", "2005-05-12"]:

        # PRAYERS
        ih = StripDebateHeading('prayers(?i)', ih, headspeak, True)

        ih = StripDebateHeading(r'pursuant to the Standing Order\.', ih,
                                headspeak, True)

        # in the chair
        ih = StripDebateHeading(r'\[.*?[ >]in the chair[<>i/\.]*\](?i)', ih,
                                headspeak, True)

    # find the url, colnum and time stamps that occur before anything else
    # in the unspoken text
    stampurl = StampUrl(sdate)

    # set the time from the wording 'house met at' thing.
    if gstarttime:
        newtime = _HouseMetStartTime(gstarttime.group(1))
        stampurl.timestamp = '<stamp time="%s"/>' % newtime

    for j in range(ih):
        stampurl.UpdateStampUrl(headspeak[j][1])

    if (not stampurl.stamp) or (not stampurl.pageurl):
        raise Exception(' missing stamp url at beginning of file ')
    return (ih, stampurl)


def _HouseMetStartTime(timetext):
    """Turn the wording of a "the House met at ..." time into HH:MM:SS.

    Italic tags are replaced by spaces and whitespace runs are collapsed
    before matching.  Each known phrase is matched case-insensitively at
    the start of the text; the first phrase that matches wins.

    Raises:
        ContextException: if the wording matches none of the known phrases.
    """
    timetext = re.sub('</?i>', ' ', timetext)
    timetext = re.sub(r'\s+', ' ', timetext)

    # The apostrophe in "o'clock" is accepted both literally and as the
    # HTML entity &#039;.
    oclock = r"o(?:'|&#039;)clock"
    knowntimes = (
        ("half-past Nine", '09:30:00'),
        ("a quarter to Ten o", '09:45:00'),
        ("Ten " + oclock, '10:00:00'),
        ("half-past Ten", '10:30:00'),
        ("Eleven " + oclock, '11:00:00'),
        (r"twenty-five minutes past\s*Eleven", '11:25:00'),
        (r"twenty-six minutes past\s*Eleven", '11:26:00'),
        (r"twenty-nine minutes past\s*Eleven", '11:29:00'),
        ("half-past Eleven", '11:30:00'),
        ("Twelve noon", '12:00:00'),
        ("half-past One", '13:30:00'),
        ("half-past Two", '14:30:00'),
        ("twenty minutes to Three", '14:40:00'),
        ("10 minutes past Three", '15:10:00'),
        ("Six " + oclock, '18:00:00'),
    )
    for phrase, clock in knowntimes:
        if re.match(phrase, timetext, re.I):
            return clock
    raise ContextException("Start time not known: " + timetext)