Python qspeech示例，clsinglespeech.qspeech Python示例

示例#1

0

显示文件

文件： sections.py 项目： emmaclarke/parlparse

def GrabWestminDivisionInterruptProced(qbp, rawtext):
    """Split a trailing "sitting suspended" interruption off a speech.

    The last paragraphs of ``qbp.stext`` are inspected.  When they record
    a suspension of the sitting (optionally followed by an "on resuming"
    paragraph) those paragraphs are moved into a new no-speaker speech.

    Args:
        qbp: speech object; its ``stext`` list is read and, when a split
            happens, trimmed in place.  ``sstampurl`` is copied.
        rawtext: raw text of the speech; only used to build the text that
            is fed through a throw-away ``qspeech`` (see below).

    Returns:
        The new no-speaker speech holding the skimmed paragraphs, or
        ``None`` when ``qbp.stext`` has fewer than three paragraphs or no
        interruption is detected.

    Raises:
        ContextException: an "on resuming" paragraph is not preceded by a
            suspended/adjourned paragraph.
    """
    if len(qbp.stext) < 3:
        return None

    # The inline flags are written at the START of every pattern: a global
    # flag group anywhere else is rejected by Python 3.11+ (and deprecated
    # since 3.6), while a leading group means the same thing on Python 2.
    last_para = qbp.stext[-1]
    iskip = 0
    if re.search(r"(?i)italic.*?>on resuming&\S*</p>", last_para):
        before_last = qbp.stext[-2]
        if not re.search(r"(?i)italic.*?>(?:sitting )?(?:suspended|adjourned)(?: for (?:a division|divisions) in the house)?[\.\s]*", before_last):
            raise ContextException('failed to detect sitting suspended interruption',
                                   fragment=before_last)
        iskip = -2
    elif re.search(r"(?i)italic.*?>sitting suspended(?: for| until| till|\.)", last_para):
        iskip = -1

    if not iskip:
        return None

    # copy the lines into a non-speaking paragraph.
    dumtext = re.sub(r'(?si)<p>(?:<stamp aname="[^"]*?"/>)?<i>sitting suspended.*', '', rawtext)

    # The original author left no note on why a first qspeech is built from
    # dumtext and then discarded ("must be something to do with the
    # timestamps").  NOTE(review): presumably constructing it advances the
    # copied stamp object `s` -- confirm against qspeech before removing.
    s = copy.copy(qbp.sstampurl)
    qspeech('nospeaker="true"', dumtext, s)
    qbdp = qspeech('nospeaker="true"', "", s)
    qbdp.typ = 'speech'
    qbdp.stext = qbp.stext[iskip:]

    # trim the skimmed paragraphs (one or two) off the given speech
    qbp.stext = qbp.stext[:iskip]
    return qbdp

示例#2

0

显示文件

文件： sections.py 项目： JonathanBowker/parlparse

def NewGrabLordDivisionProced(qbp, qbd):
    """Skim the division preamble off the end of the preceding speech.

    Searches backwards through ``qbp.stext`` for the paragraph matching
    ``redivisionon`` (the division "title"), checks that the paragraph
    after it matches ``renewlorddiv`` (the totals), and moves everything
    from the title onwards into a new no-speaker speech.

    Args:
        qbp: the speech preceding the division; ``stext`` is modified.
        qbd: the division object (not used by this function).

    Returns:
        The new no-speaker speech, or ``None`` when ``qbp`` is itself
        already a no-speaker speech (its text is then only passed through
        ``SubsPWtextset``).

    Raises:
        ContextException: ``qbp`` is not a speech/motion or is empty, the
            title cannot be found, or no totals paragraph follows it.
    """
    if not re.match("speech|motion", qbp.typ) or len(qbp.stext) < 1:
        print(qbp.stext)
        raise ContextException("previous to division not speech", stamp=qbp.sstampurl)

    # walk backwards from the last paragraph until the title is found
    iskim = 1
    while iskim <= len(qbp.stext) and not redivisionon.match(qbp.stext[-iskim]):
        iskim += 1
    if iskim > len(qbp.stext):
        raise ContextException("Could not find Division 'title'", stamp=qbp.sstampurl)

    # The totals must be the paragraph right after the title.  When the
    # title is the very last paragraph there is no such paragraph; the
    # index -iskim+1 would be 0 and silently test the FIRST paragraph.
    if iskim == 1:
        raise ContextException("no totals before division", stamp=qbp.sstampurl)
    totals = qbp.stext[-iskim + 1]
    if not renewlorddiv.match(totals):
        print(totals)
        raise ContextException("no totals before division", stamp=qbp.sstampurl)

    # if previous thing is already a no-speaker, we don't need to break it out
    # (the coding on the question put is complex and multilined)
    if re.search('nospeaker="true"', qbp.speaker):
        qbp.stext = SubsPWtextset(qbp.stext)
        return None

    # copy the skimmed paragraphs into a non-speaking paragraph.
    qbdp = qspeech('nospeaker="true"', "", qbp.sstampurl)
    qbdp.typ = 'speech'
    qbdp.stext = SubsPWtextset(qbp.stext[-iskim:])

    # trim the same paragraphs off the given speech
    qbp.stext = qbp.stext[:-iskim]

    return qbdp

示例#3

0

显示文件

文件： sections.py 项目： JonathanBowker/parlparse

def GrabLordDivisionProced(qbp, qbd):
    """Skim the last two paragraphs of a speech into a no-speaker speech.

    The final paragraph must match ``relorddiv`` and the one before it
    must match ``resaidamend``; both are then moved out of ``qbp`` into a
    new no-speaker speech reporting on the division.

    Args:
        qbp: the speech preceding the division; ``stext`` is modified.
        qbd: the division object (not used by this function).

    Returns:
        The new no-speaker speech, or ``None`` when ``qbp`` is itself
        already a no-speaker speech (its text is then only passed through
        ``SubsPWtextset``).

    Raises:
        ContextException: ``qbp`` is not a speech/motion or is empty, or
            either of the two expected paragraphs is missing.
    """
    if not re.match("speech|motion", qbp.typ) or len(qbp.stext) < 1:
        print(qbp.stext)
        raise ContextException("previous to division not speech", stamp=qbp.sstampurl)

    last_para = qbp.stext[-1]
    if not relorddiv.match(last_para):
        print(last_para)
        raise ContextException("no lordships divided before division", stamp=qbp.sstampurl)

    # if previous thing is already a no-speaker, we don't need to break it out
    # (the coding on the question put is complex and multilined)
    if re.search('nospeaker="true"', qbp.speaker):
        qbp.stext = SubsPWtextset(qbp.stext)
        return None

    # look back at previous paragraphs and skim off a part of what's there
    # to make a non-spoken bit reporting on the division.
    # A one-paragraph speech has no "on said amendment" line at all; report
    # that as a context error rather than letting stext[-2] raise IndexError.
    if len(qbp.stext) < 2:
        raise ContextException("no on said amendment", stamp=qbp.sstampurl, fragment=last_para)
    said_para = qbp.stext[-2]
    if not resaidamend.match(said_para):
        print(said_para)
        raise ContextException("no on said amendment", stamp=qbp.sstampurl, fragment=said_para)
    iskim = 2

    # copy the two lines into a non-speaking paragraph.
    qbdp = qspeech('nospeaker="true"', "", qbp.sstampurl)
    qbdp.typ = 'speech'
    qbdp.stext = SubsPWtextset(qbp.stext[-iskim:])

    # trim back the given one by two lines
    qbp.stext = qbp.stext[:-iskim]

    return qbdp

示例#4

0

显示文件

文件： divisionsections.py 项目： henare/parlparse

def DivisionParsingPart(divno, unspoketxt, stampurl, sdate):
    """Split a division off the front of ``unspoketxt`` and wrap it.

    The end of the division is located with ``regenddiv``; the text before
    group 1 of that match becomes the division text.  When no end is found
    the whole text is taken as the division and a hint is printed.

    Args:
        divno: division number, written into the speech attributes.
        unspoketxt: text starting with the division.
        stampurl: stamp object handed on to ``qspeech`` / ``FilterDivision``.
        sdate: date string, written into the speech attributes.

    Returns:
        Tuple ``(remaining_text, qbd)`` where ``qbd`` is the division
        speech object with ``typ == "division"``.
    """
    # find the ending of the division and split it off.
    gquesacc = re.search(regenddiv, unspoketxt)
    if gquesacc:
        cut = gquesacc.start(1)
        divtext = unspoketxt[:cut]
        unspoketxt = unspoketxt[cut:]
        # strip off the explicit signal tag; use the length of what actually
        # matched rather than the length of the pattern source, so the cut
        # stays right even if the pattern ever contains regex syntax.
        explicit_end = re.match(strexplicitenddiv, unspoketxt)
        if explicit_end:
            unspoketxt = unspoketxt[explicit_end.end():]
    else:
        divtext = unspoketxt
        print(unspoketxt)
        print("division missing %s" % regenddiv)
        print("try inserting <explicit-end-division>")
        unspoketxt = ""

    # Add a division object (will contain votes and motion text)
    spattr = 'nospeaker="true" divdate="%s" divnumber="%s"' % (sdate, divno)
    qbd = qspeech(spattr, divtext, stampurl)
    qbd.typ = "division"  # this type field seems easiest way

    # filtering divisions here because we may need more sophisticated detection
    # of end of division than the "Question accordingly" marker.
    qbd.stext = FilterDivision(qbd.text, stampurl, sdate)

    return (unspoketxt, qbd)

示例#5

0

显示文件

文件： sections.py 项目： JonathanBowker/parlparse

def FilterWMSSections(text, sdate, lords=False):
    """Flatten written-ministerial-statement text into a list of speeches.

    After applying the fix substitutions the text is split by
    ``SplitHeadingsSpeakers``; for every section from index ``ih`` on, one
    heading object (from ``NormalHeadingPart``) is emitted followed by one
    speech object per (speaker, text) pair.

    Args:
        text: the page text to process.
        sdate: date string passed to the helpers.
        lords: passed through to ``StripWMSHeadings`` / ``NormalHeadingPart``.

    Returns:
        The flat list of heading and speech objects.

    Raises:
        ContextException: text other than tags, whitespace or ``&nbsp;``
            sits between a heading and its first speech, or any other
            exception occurred while handling a section (re-raised with
            the current stamp attached).
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    # split into list of triples of (heading, pre-first speech text, [ (speaker, text) ])
    headspeak = SplitHeadingsSpeakers(text)

    (ih, stampurl) = StripWMSHeadings(headspeak, sdate, lords)

    flatb = []
    for sht in headspeak[ih:]:
        try:
            # we're getting stamps inside the headings sometimes
            headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
            unspoketxt = sht[1]
            speechestxt = sht[2]

            if not re.match(r'(?:<[^>]*>|\s|&nbsp;)*$', unspoketxt):
                raise ContextException("unspoken text under heading in WMS", stamp=stampurl, fragment=unspoketxt)

            qbh = NormalHeadingPart(headingtxt, stampurl, sdate, speechestxt, lords)
            flatb.append(qbh)
            stampurl.UpdateStampUrl(unspoketxt)

            # Put everything in XML, de-dupe elsewhere
            for ss in speechestxt:
                qb = qspeech(ss[0], ss[1], stampurl)
                qb.typ = 'speech'
                FilterWMSSpeech(qb)
                flatb.append(qb)

        except ContextException:
            # already carries its context; pass through untouched
            raise
        except Exception as e:
            # add extra stamp info to the exception
            raise ContextException(str(e), stamp=stampurl)

    # the collected list was previously built but never handed back
    return flatb

示例#6

0

显示文件

文件： sections.py 项目： JonathanBowker/parlparse

def NormalHeadingPart(headingtxt, stampurl, sdate, speechestxt, lords):
    """Build a major- or minor-heading speech object for a heading.

    Args:
        headingtxt: heading text.
        stampurl: stamp object; its ``aname`` is inspected and it is passed
            on to ``qspeech``.
        sdate: date string; after '2006-05-07' a heading is major exactly
            when it has no speeches.
        speechestxt: the speeches under the heading (only its truthiness
            is used).
        lords: when true, the upper-case / anchor-name rules are skipped.

    Returns:
        A no-speaker speech whose ``typ`` is 'major-heading' or
        'minor-heading' and whose ``stext`` is the single heading string.

    Raises:
        ContextException: a major heading is not a key of
            ``parlPhrases.wransmajorheadings``.
    """
    bmajorheading = False
    if not lords:
        if not re.search('[a-z]', headingtxt) and headingtxt != 'BNFL':
            bmajorheading = True
        elif re.search('_dpthd', stampurl.aname) or re.search('_head', stampurl.aname):
            bmajorheading = True

    # a sub-heading anchor always demotes the heading
    if re.search('_sbhd', stampurl.aname):
        bmajorheading = False

    # Assume major heading if no speeches in new style
    if sdate > '2006-05-07':
        bmajorheading = not speechestxt

    if bmajorheading:
        heading_key = headingtxt.upper()
        if heading_key not in parlPhrases.wransmajorheadings:
            raise ContextException("unrecognized major heading, please add to parlPhrases.wransmajorheadings (a)", fragment = headingtxt, stamp = stampurl)
        # no need to fix since text is from a map.
        headingtxt = parlPhrases.wransmajorheadings[heading_key]

    headingtxtfx = FixHTMLEntities(headingtxt)
    qb = qspeech('nospeaker="true"', headingtxtfx, stampurl)
    if bmajorheading:
        qb.typ = 'major-heading'
    else:
        qb.typ = 'minor-heading'

    # headings become one unmarked paragraph of text
    qb.stext = [headingtxtfx]
    return qb

示例#7

0

显示文件

文件： sections.py 项目： JonathanBowker/parlparse

def LordsHeadingPart(headingtxt, stampurl, major):
    """Wrap a Lords heading in a no-speaker speech object.

    The heading is typed 'major-heading' only when ``major`` is truthy and
    ``stampurl.sdate`` is later than '2008-12-01'; otherwise it is a
    'minor-heading'.  The entity-fixed heading text becomes the single
    paragraph of the returned object.
    """
    fixed_heading = FixHTMLEntities(headingtxt)
    heading = qspeech('nospeaker="true"', fixed_heading, stampurl)

    is_major = major and stampurl.sdate > '2008-12-01'
    heading.typ = 'major-heading' if is_major else 'minor-heading'

    # headings become one unmarked paragraph of text
    heading.stext = [fixed_heading]
    return heading

示例#8

0

显示文件

文件： divisions.py 项目： JonathanBowker/parlparse

def LordsDivisionParsingPart(divno, unspoketxt, stampurl, sdate):
    """Split a Lords division off the front of ``unspoketxt`` and wrap it.

    The end of the division is found with ``regenddiv``.  For dates after
    '2008-12-01' without such a marker, the division is taken to run up to
    the last ", X.</p>" or, failing that, the last "<br>".

    Args:
        divno: division number, written into the speech attributes.
        unspoketxt: text starting with the division.
        stampurl: stamp object; must carry a timestamp.
        sdate: date string.

    Returns:
        Tuple ``(remaining_text, qbd)`` where ``qbd`` is the division
        speech object with ``typ == 'division'``.

    Raises:
        ContextException: the end of the division cannot be located, or
            ``stampurl.timestamp`` is empty.
    """
    # find the ending of the division and split it off.
    gquesacc = re.search(regenddiv, unspoketxt)
    if gquesacc:
        cut = gquesacc.start(1)
        divtext = unspoketxt[:cut]
        unspoketxt = re.sub(':ENDDIVISION:', '', unspoketxt[cut:])
    elif sdate > '2008-12-01':  # Sigh XXX
        # (?s) is given at the start of the pattern; trailing global flags
        # are rejected by Python 3.11+.  The greedy .* finds the LAST hit.
        m = re.match(r'(?s).*, [A-Z]\.</p>', unspoketxt)
        if not m:
            m = re.match(r'(?s).*<br>', unspoketxt)
        if not m:
            # neither fallback matched: report it instead of failing with
            # an AttributeError on None
            raise ContextException("Division missing recognisable end of voting lists", stamp=stampurl, fragment="Division")
        divtext = m.group()
        unspoketxt = unspoketxt[m.end():]
    else:
        print("division missing %s" % regenddiv)
        print(unspoketxt)
        print("is there a linefeed before the </center> on the CONTENTS?")
        raise ContextException("Division missing resolved in the", stamp=stampurl, fragment="Division") # newly added

    divtext = re.sub(' style="margin-bottom:[^"]*"', '', divtext)

    # Add a division object (will contain votes and motion text)
    spattr = 'nospeaker="true" divdate="%s" divnumber="%s"' % (sdate, divno)
    qbd = qspeech(spattr, divtext, stampurl)
    qbd.typ = 'division' # this type field seems easiest way

    if not stampurl.timestamp:
        raise ContextException("Division missing any timestamps; need to put one in to make it consistent.  like <h5>2.44 pm</h5>", stamp=stampurl, fragment="Division")

    # filtering divisions here because we may need more sophisticated detection
    # of end of division than the "Question accordingly" marker.
    qbd.stext = LordsFilterDivision(qbd.text, stampurl, sdate)

    return (unspoketxt, qbd)

示例#9

0

显示文件

文件： sections.py 项目： JonathanBowker/parlparse

def LordsFilterSections(text, sdate):
    """Flatten a Lords debate page into a list of speech objects.

    The text is split into sections by ``SplitHeadingsSpeakers``.  Each
    section yields either a heading object or a division object, followed
    by the filtered unspoken text and the filtered speeches.

    Args:
        text: the page text to process.
        sdate: date string; selects several date-dependent rules.

    Returns:
        The flat list of speech objects (see each object's ``typ``), or
        ``None`` when ``StripLordsDebateHeadings`` reports no start index.
    """
    # deal with one exceptional case of indenting
    if sdate == "2005-10-26":
        l = len(text)
        text = re.sub("<ul><ul>(<ul>)?", "<ul>", text)
        text = re.sub("</ul></ul>(</ul>)?", "</ul>", text)

        # regsection1 = '<h\d><center>.*?\s*</center></h\d>' in splitheadingsspeakers.py
        print("Duplicate <ul>s removed and <center> sorted on %s which shortened text by %d" % (sdate, l - len(text)))

    # split into a list of (heading, pre-first speech text, [ (speaker, text) ], major flag)
    headspeak = SplitHeadingsSpeakers(text)

    # break down into lists of headings and lists of speeches
    (ih, stampurl) = StripLordsDebateHeadings(headspeak, sdate)
    if ih is None:
        return None

    # loop through each detected heading and the detected partitioning of speeches which follow.
    # this is a flat output of qspeeches, some encoding headings, and some divisions.
    # see the typ variable for the type.
    flatb = []

    for sht in headspeak[ih:]:
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        unspoketxt = sht[1]
        speechestxt = sht[2]
        headingmajor = sht[3]

        # the heading detection, as a division or a heading speech object
        # detect division headings
        gdiv = re.search(r'Division No\.(?:\s|&#160;)+(\d+)', headingtxt)
        assert not re.match("(?:NOT-)?CONTENTS", headingtxt)

        # heading type
        if not gdiv:
            qbh = LordsHeadingPart(headingtxt, stampurl, headingmajor)

            # A minor heading is rammed into the previous entry when that
            # entry is a minor heading, or (after 2008-12-01) a major one.
            prev_typ = flatb[-1].typ if flatb else None
            merge_into_prev = qbh.typ == 'minor-heading' and (
                prev_typ == 'minor-heading'
                or (sdate > '2008-12-01' and prev_typ == 'major-heading'))

            if merge_into_prev:
                flatb[-1].stext.append(" &mdash; ")
                flatb[-1].stext.extend(qbh.stext)
            # otherwise put out this heading
            else:
                flatb.append(qbh)

        # division type
        else:
            (unspoketxt, qbd) = LordsDivisionParsingPart(int(gdiv.group(1)), unspoketxt, stampurl, sdate)

            # grab some division text off the back end of the previous speech
            # and wrap into a new no-speaker speech
            if sdate >= '2008-12-01':
                qbdp = NewGrabLordDivisionProced(flatb[-1], qbd)
            else:
                qbdp = GrabLordDivisionProced(flatb[-1], qbd)
            if qbdp:
                flatb.append(qbdp)
            flatb.append(qbd)

        # continue and output unaccounted for unspoken text occurring after a
        # division, or after a heading
        if not re.match(r'(?:<[^>]*>|\s)*$', unspoketxt):
            qb = qspeech('nospeaker="true"', unspoketxt, stampurl)
            qb.typ = 'speech'
            flatb.extend(FilterLordsSpeech(qb))

        # there is no text; update from stamps if there are any
        else:
            stampurl.UpdateStampUrl(unspoketxt)

        # go through each of the speeches in a block and put it into our batch of speeches
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            qb.typ = 'speech'
            flatb.extend(FilterLordsSpeech(qb))

    # we now have everything flattened out in a series of speeches
    return flatb

示例#10

0

显示文件

文件： sections.py 项目： JonathanBowker/parlparse

def FilterLordsSpeech(qb):
    """Split one Lords speech into spoken and procedural parts.

    After the normal debate filtering, the leading paragraphs are examined
    to find where spoken text starts, and trailing motion/amendment
    statements are spliced into a separate no-speaker speech whose
    paragraphs get a ``pwmotiontext`` attribute.

    Args:
        qb: the speech object; ``speaker`` is read, ``stext`` is read and
            modified in place.

    Returns:
        A list of one or more speech objects replacing ``qb``.

    Raises:
        ContextException: the opening paragraphs of a speech without a
            colon fit none of the known forms, or the speech boundary
            cannot be established.
    """
    # pull in the normal filtering that gets done on debate speeches
    # does the paragraph indents and tables.  Maybe should be inlined for lords
    FilterDebateSpeech(qb)

    # the colon attr is blank or has a : depending on what was there after the name that was matched
    ispeechstartp1 = 0  # plus 1

    # no colonattr or colon, must be making a speech
    recol = re.search('colon="(:?)"', qb.speaker)
    bSpeakerExists = not re.match('nospeaker="true"', qb.speaker)
    if bSpeakerExists and (not recol or recol.group(1)):
        # text of this kind at the beginning should not be spoken, assume there wasn't a colon
        # ((?i) leads each pattern: trailing global flags fail on Python 3.11+)
        if not re.search("(?i)<p>(?:moved|asked|rose to move,) ", qb.stext[0]) or re.search("(?i)<p>moved formally", qb.stext[0]):
            ispeechstartp1 = 1  # 0th paragraph is speech text

    res = []  # output list
    preparagraphtype = ""
    if bSpeakerExists and (ispeechstartp1 == 0):
        # this alternation also covers "<p>&mdash;Took the Oath"
        if re.match("<p>asked Her Majesty's Government|<p>asked the|<p>&mdash;Took the Oath", qb.stext[0]):
            preparagraphtype = "asked"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)
            if ispeechstartp1 == len(qb.stext):  # No Noble Lord said, the usual
                ispeechstartp1 = 1
            if ispeechstartp1 != 1:
                print("Noble Lord Said on  %s paragraph" % ispeechstartp1)
                raise ContextException("Noble Lord Said missing in second paragraph", stamp=qb.sstampurl)
            # ensure that the noble lord said doesn't say an amendment withdrawn
            assert not MatchPWmotionStuff(qb, ispeechstartp1)

        elif re.match("<p>rose to (?:ask|call|draw attention|consider)", qb.stext[0]):
            preparagraphtype = "asked"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)
            if ispeechstartp1 not in [1, 2]:
                print("Noble Lord Said on  %s paragraph" % ispeechstartp1)
                raise ContextException("Noble Lord Said missing in second paragraph", stamp=qb.sstampurl)

            # ensure that the noble lord said doesn't say an amendment withdrawn
            assert not MatchPWmotionStuff(qb, ispeechstartp1)

        # identify a writ of summons (single line)
        elif re.match(r"<p>(?:[\s,]*having received a [Ww]rit of [Ss]ummons .*?)?[Tt]ook the [Oo]ath\.</p>$", qb.stext[0]):
            assert len(qb.stext) == 1
            qb.stext[0] = re.sub('^<p>', '<p pwmotiontext="summons">', qb.stext[0])  # cludgy; already have the <p>-tag embedded in the string
            res.append(qb)
            return res  # bail out

        # an introduction is re-issued as a no-speaker speech
        elif re.search("having been created.*?Was, in (his|her) robes, introduced", qb.stext[0]):
            assert len(qb.stext) == 1
            qbunspo = qspeech('nospeaker="true"', "", qb.sstampurl)
            qbunspo.typ = 'speech'
            qbunspo.stext = qb.stext
            qbunspo.stext[0] = re.sub('^<p>', '<p pwmotiontext="introduced">', qbunspo.stext[0])
            res.append(qbunspo)
            return res

        # (a separate "<p>&mdash;Took the Oath" branch used to sit here; it
        # could never run because the first branch already matches it)

        # identify a moved amendment
        elif re.match("<p>moved,? |<p>Amendments? |<p>had given notice|<p>(?:rose )?to move|<p>had given his intention", qb.stext[0]):

            # find where the speech begins, and strip out "The noble lord said:"
            preparagraphtype = "moved"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)

            # everything up to this point is non-speech
            assert ispeechstartp1 > 0
            qbprev = qspeech(qb.speaker, "", qb.sstampurl)
            qbprev.typ = 'speech'
            qbprev.stext = qb.stext[:ispeechstartp1]

            res.append(qbprev)
            if ispeechstartp1 == len(qb.stext):
                return res

            # upgrade the spoken part
            qb.speaker = qb.speaker.replace('colon=""', 'colon=":"')
            del qb.stext[:ispeechstartp1]
            assert qb.stext
            ispeechstartp1 = 1  # the spoken text must reach at least here (after the line, "The noble lord said:")

        # error, no moved amendment found
        else:
            print(qb.stext)
            print("no moved amendment; is a colon missing after the name?")
            raise ContextException("missing moved amendment", stamp=qb.sstampurl)

    # advance to place where non-speeches happen
    if ispeechstartp1 > len(qb.stext):
        print("ispeechstartp1 problem; speeches running through %s %s" % (ispeechstartp1, len(qb.stext)))
        print(qb.stext)
        raise ContextException("end of speech boundary unclear running through; need to separate paragraphs?", stamp=qb.sstampurl)

    # a common end of speech is to withdraw an amendment
    # we go through paragraphs until we match that or some other motion text type statement
    sAmendmentStatement = None
    while bSpeakerExists and (ispeechstartp1 < len(qb.stext)):
        sAmendmentStatement = MatchPWmotionStuff(qb, ispeechstartp1)
        if sAmendmentStatement:
            break

        ispeechstartp1 += 1

    # there are no further lines after the withdrawal
    if ispeechstartp1 == len(qb.stext):
        assert not sAmendmentStatement
        res.append(qb)
        return res

    # do the further lines after withdrawal
    assert (not bSpeakerExists) or sAmendmentStatement

    # splice off the unspoken text running off from the amendment statements
    if ispeechstartp1 != 0:
        qbunspo = qspeech('nospeaker="true"', "", qb.sstampurl)
        qbunspo.typ = 'speech'
        qbunspo.stext = qb.stext[ispeechstartp1:]
        del qb.stext[ispeechstartp1:]
        res.append(qb)
        res.append(qbunspo)
    else:
        res.append(qb)
        qbunspo = qb

    # check that once we begin pwmotion amendment statements, all statements are of this type
    # (index loop kept on purpose: entries are rewritten in place and each
    # one is re-read after the helper call)
    for i in range(len(qbunspo.stext)):
        if not re.match('<p', qbunspo.stext[i]):
            continue
        sAmendmentStatement = MatchKnownAsPWmotionStuff(qbunspo, i)
        if not sAmendmentStatement:
            if IsNotQuiet():
                print("UNRECOGNIZED-MOTION-TEXT%s: %s" % (" " if bSpeakerExists else "(*)", qbunspo.stext[i]))
            sAmendmentStatement = "unrecognized"
        qbunspo.stext[i] = re.sub('^<p(.*?)>', '<p\\1 pwmotiontext="%s">' % sAmendmentStatement, qbunspo.stext[i])

    return res

示例#11

0

显示文件

def LordsFilterSections(text, sdate):
    """Turn a Lords debate page into a flat list of speech objects.

    ``SplitHeadingsSpeakers`` cuts the text into sections.  For every
    section from the start index on, this emits a heading object (possibly
    merged into the previous heading) or a division object, then the
    filtered unspoken text, then the filtered speeches.

    Args:
        text: the page text to process.
        sdate: date string; selects several date-dependent rules.

    Returns:
        The flat list of speech objects (see each object's ``typ``), or
        ``None`` when ``StripLordsDebateHeadings`` gives no start index.
    """
    # deal with one exceptional case of indenting
    if sdate == "2005-10-26":
        length_before = len(text)
        text = re.sub("<ul><ul>(<ul>)?", "<ul>", text)
        text = re.sub("</ul></ul>(</ul>)?", "</ul>", text)

        # regsection1 = '<h\d><center>.*?\s*</center></h\d>' in splitheadingsspeakers.py
        print("Duplicate <ul>s removed and <center> sorted on %s which shortened text by %d" % (
            sdate, length_before - len(text)))

    # split into a list of (heading, pre-first speech text, [ (speaker, text) ], major flag)
    headspeak = SplitHeadingsSpeakers(text)

    # break down into lists of headings and lists of speeches
    (ih, stampurl) = StripLordsDebateHeadings(headspeak, sdate)
    if ih is None:
        return None

    # loop through each detected heading and the detected partitioning of speeches which follow.
    # this is a flat output of qspeeches, some encoding headings, and some divisions.
    # see the typ variable for the type.
    flatb = []

    for sht in headspeak[ih:]:
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        unspoketxt = sht[1]
        speechestxt = sht[2]
        headingmajor = sht[3]

        # the heading detection, as a division or a heading speech object
        # detect division headings
        gdiv = re.search(r'Division No\.(?:\s|&#160;)+(\d+)', headingtxt)
        assert not re.match("(?:NOT-)?CONTENTS", headingtxt)

        if gdiv:
            # division type
            (unspoketxt, qbd) = LordsDivisionParsingPart(
                int(gdiv.group(1)), unspoketxt, stampurl, sdate)

            # grab some division text off the back end of the previous speech
            # and wrap into a new no-speaker speech
            if sdate >= '2008-12-01':
                qbdp = NewGrabLordDivisionProced(flatb[-1], qbd)
            else:
                qbdp = GrabLordDivisionProced(flatb[-1], qbd)
            if qbdp:
                flatb.append(qbdp)
            flatb.append(qbd)
        else:
            # heading type
            qbh = LordsHeadingPart(headingtxt, stampurl, headingmajor)

            # A minor heading is joined onto the previous entry when that
            # entry is a minor heading, or (after 2008-12-01) a major one.
            absorb = False
            if qbh.typ == 'minor-heading' and flatb:
                previous_typ = flatb[-1].typ
                if previous_typ == 'minor-heading':
                    absorb = True
                elif sdate > '2008-12-01' and previous_typ == 'major-heading':
                    absorb = True

            if absorb:
                flatb[-1].stext.append(" &mdash; ")
                flatb[-1].stext.extend(qbh.stext)
            else:
                # otherwise put out this heading
                flatb.append(qbh)

        # continue and output unaccounted for unspoken text occurring after a
        # division, or after a heading
        if not re.match(r'(?:<[^>]*>|\s)*$', unspoketxt):
            qb = qspeech('nospeaker="true"', unspoketxt, stampurl)
            qb.typ = 'speech'
            flatb.extend(FilterLordsSpeech(qb))

        # there is no text; update from stamps if there are any
        else:
            stampurl.UpdateStampUrl(unspoketxt)

        # go through each of the speeches in a block and put it into our batch of speeches
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            qb.typ = 'speech'
            flatb.extend(FilterLordsSpeech(qb))

    # we now have everything flattened out in a series of speeches
    return flatb

示例#12

0

显示文件

文件： sections.py 项目： samknight/parlparse

def FilterWransSections(text, sdate, lords=False):
    """Flatten written-answers text into headings, questions and replies.

    Sections without speeches become major headings (looked up in
    ``parlPhrases.wransmajorheadings``).  Sections with speeches become a
    series of question/reply blocks, each preceded by a minor heading.

    Args:
        text: the page text to process.
        sdate: date string; after '2006-05-07' the new-style major heading
            rules apply.
        lords: passed through to ``FilterQuestion``.

    Returns:
        The flat list of speech objects; headings have ``typ``
        'major-heading' / 'minor-heading', the rest 'ques' / 'reply'.

    Raises:
        ContextException: unspoken text under a heading, an unknown or
            broken major heading, a reply without a question, or a
            question left without an answer.
        Exception: speeches were found under a known major heading.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    headspeak = SplitHeadingsSpeakers(text)

    # break down into lists of headings and lists of speeches
    (ih, stampurl) = StripWransHeadings(headspeak, sdate)

    # full list of question batches
    flatb = []

    def append_major_heading(mapped_text):
        # no need to fix the text since it comes from the map.
        qbH = qspeech('nospeaker="true"', mapped_text, stampurl)
        qbH.typ = 'major-heading'
        qbH.stext = [mapped_text]
        flatb.append(qbH)

    justhadnewtitle = False  # For when they put another "Written Answers to Questions" and date
    for sht in headspeak[ih:]:
        # entries are ( heading, unspokentext, [(speaker, text)], ... )
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        unspoketxt = sht[1]
        speechestxt = sht[2]

        # update the stamps from the pre-spoken text
        if not re.match(r'(?:<[^>]*>|\s)*$', unspoketxt):
            raise ContextException("unspoken text under heading in wrans",
                                   stamp=stampurl,
                                   fragment=unspoketxt)
        stampurl.UpdateStampUrl(unspoketxt)

        # detect if this is a major heading
        if not re.search('[a-z]', headingtxt) and not speechestxt:
            if headingtxt not in parlPhrases.wransmajorheadings:
                raise ContextException(
                    "unrecognized major heading, please add to parlPhrases.wransmajorheadings (a)",
                    fragment=headingtxt,
                    stamp=stampurl)
            append_major_heading(parlPhrases.wransmajorheadings[headingtxt])
            continue
        elif not speechestxt and sdate > '2006-05-07':
            if headingtxt == 'Written Answers to Questions':
                justhadnewtitle = True
                continue
            heading_key = headingtxt.upper()
            if heading_key not in parlPhrases.wransmajorheadings:
                # the heading right after a repeated title (the date) is skipped
                if justhadnewtitle:
                    justhadnewtitle = False
                    continue
                raise ContextException(
                    "unrecognized major heading, please add to parlPhrases.wransmajorheadings (b)",
                    fragment=headingtxt,
                    stamp=stampurl)
            append_major_heading(parlPhrases.wransmajorheadings[heading_key])
            justhadnewtitle = False
            continue
        elif not speechestxt:
            raise ContextException('broken heading %s' % headingtxt,
                                   stamp=stampurl,
                                   fragment=headingtxt)

        # non-major heading; to a question batch
        if headingtxt in parlPhrases.wransmajorheadings:
            raise Exception(' speeches found in major heading %s' % headingtxt)

        # headings become one unmarked paragraph of text
        headingtxtfx = FixHTMLEntities(headingtxt)
        headingmark = 'nospeaker="true"'
        bNextStartofQ = True

        # go through each of the speeches in a block and put it into our batch of speeches
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            lqnums = re.findall(r'\[(?:HL)?(\d+)R?\]', ss[1])

            # question posed
            # ((?i) leads the pattern: trailing global flags fail on Python 3.11+)
            if re.match(r"(?i)(?:<[^>]*?>|\s)*?(to ask|asked (Her Majesty(&#039;|&#146;|')s Government|the ))", qb.text) or \
                    '<wrans-question>' in qb.text:
                qb.text = qb.text.replace('<wrans-question>', '')
                qb.typ = 'ques'

                # put out the heading for this question-reply block.
                # we don't assert true since we can have multiple questions answered in a block.
                if bNextStartofQ:
                    # we need to make the heading from the same stampurl as the first question
                    qbh = qspeech(headingmark, headingtxtfx, qb.sstampurl)
                    qbh.typ = 'minor-heading'
                    qbh.stext = [headingtxtfx]
                    flatb.append(qbh)

                    bNextStartofQ = False

                    # used to show that the subsequent headings in this block have been created,
                    # and weren't in the original text.
                    headingmark = 'nospeaker="true" inserted-heading="true"'

                qb.stext = FilterQuestion(qb, sdate, lords)
                if not lqnums:
                    errmess = ' <p class="error">Question number missing in Hansard, possibly truncated question.</p> '
                    qb.stext.append(errmess)

                flatb.append(qb)

            # do the reply
            else:
                if bNextStartofQ:
                    raise ContextException('start of question expected',
                                           stamp=qb.sstampurl,
                                           fragment=qb.text)
                qb.typ = 'reply'

                # this case is so rare we flag them in the corrections of the html with this tag
                if "<another-answer-to-follow>" in qb.text:
                    qb.text = qb.text.replace("<another-answer-to-follow>", "")
                else:
                    bNextStartofQ = True

                # Question numbers repeated inside an answer are deliberately
                # not checked.
                qb.stext = FilterReply(qb)
                flatb.append(qb)

        if not bNextStartofQ:
            print(speechestxt)
            # Note - not sure if this should be speechestxt[-1][1] here.  Does what I want for now...
            raise ContextException("missing answer to question",
                                   stamp=stampurl,
                                   fragment=speechestxt[-1][1])

    # we now have everything flattened out in a series of speeches,
    # where some of the speeches are headings (inserted and otherwise).
    return flatb

示例#13

0

显示文件

def FilterDebateSections(text, sdate, typ):
    """Flatten debate or Westminster Hall text into one list of qspeech objects.

    The text is split into heading sections.  Each section contributes a
    heading object (or a division object when the heading reads
    "Division No. N"), an optional no-speaker speech for any text that
    precedes the first speaker, and one speech per (speaker, text) pair.

    Args:
        text: the raw text to split up.
        sdate: date value handed on to the fix-up, heading-stripping and
            division-parsing helpers.
        typ: "debate" or "westminhall"; anything else fails an assertion.

    Returns:
        The list of objects built up, in document order.

    Raises:
        ContextException: propagated unchanged from the helpers called here.
    """
    # Make the corrections at this level so that the headings can be resolved
    # (old-style fixing, from before patches existed).
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)
    else:
        assert typ == "westminhall"
        # crude clean-up: collapse triple-nested lists, drop empty <h5> tags
        text = re.sub('(?i)<ul><ul><ul>', '<ul>', text)
        text = re.sub('(?i)</ul></ul></ul>', '</ul>', text)
        text = re.sub('(?i)<h5></h5>', '', text)

    # split into a list of (heading, pre-first-speech text, [(speaker, text)], major?)
    headspeak = SplitHeadingsSpeakers(text)

    # strip the leading headings; the loop below starts at index ih
    if typ == "debate":
        (ih, stampurl) = StripDebateHeadings(headspeak, sdate)
    elif typ == "westminhall":
        (ih, stampurl) = StripWestminhallHeadings(headspeak, sdate)
    else:
        assert False  # to be for writminstat?

    # Loop through each detected heading and the speeches which follow it.
    # The output is flat: some entries encode headings, some divisions;
    # see each entry's typ attribute.
    flatb = []
    state = {}  # carried between NormalHeadingPart calls
    for sht in headspeak[ih:]:
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        headingmajor = sht[3]
        # major headings (and the final section) are upper-cased for debates
        if typ == 'debate' and (headingmajor or sht == headspeak[-1]):
            headingtxt = headingtxt.upper()
        unspoketxt = sht[1]
        speechestxt = sht[2]

        # a "Division No. N" heading marks a division rather than a heading
        gdiv = re.match(r'(?i)(?:<b>)?Division No. (\d+)', headingtxt)

        if not gdiv:
            qbh = NormalHeadingPart(headingtxt, stampurl, state, typ)

            # ram together minor headings into previous ones which have no speeches
            if qbh.typ == 'minor-heading' and flatb and flatb[-1].typ == 'minor-heading':
                flatb[-1].stext.append(" &mdash; ")
                flatb[-1].stext.extend(qbh.stext)

            # ram together major headings into previous ones which have no speeches
            elif qbh.typ == 'major-heading' and flatb and flatb[-1].typ == 'major-heading':
                flatb[-1].stext.append(" &mdash; ")
                flatb[-1].stext.extend(qbh.stext)

            # "Allotted/Allocated Day" sub-headings, and anything directly
            # under a "Petition" major heading, join the major heading
            elif (qbh.typ == 'minor-heading' and flatb
                  and flatb[-1].typ == 'major-heading'
                  and (re.search('(?i)(Allotted|Allocated) Day', qbh.stext[-1])
                       or re.search('(?i)^Petition$', flatb[-1].stext[-1]))):
                flatb[-1].stext.append(" &mdash; ")
                flatb[-1].stext.extend(qbh.stext)

            # suspension / resumption notices become no-speaker speeches when
            # they follow a speech, and are dropped otherwise
            elif re.search(
                    r"(?i)(?:sitting suspended(?: for| until| till|\.))|(on resuming&)",
                    qbh.stext[0]):
                if flatb and flatb[-1].typ == 'speech':
                    qb = qspeech('nospeaker="true"', qbh.stext[0], stampurl)
                    qb.typ = 'speech'
                    FilterDebateSpeech(qb)
                    flatb.append(qb)

            # "[X in the Chair]" after a speech is also a no-speaker speech
            elif (re.match(r"(?i)\[.*? in\s*the\s*Chair\.?\]$", qbh.stext[0])
                  and flatb and flatb[-1].typ == 'speech'):
                qb = qspeech('nospeaker="true"', qbh.stext[0], stampurl)
                qb.typ = 'speech'
                FilterDebateSpeech(qb)
                flatb.append(qb)

            # this is where we suck in a trailing "Clause" part of the title
            # that is mistakenly outside the heading.
            elif (qbh.typ in ('minor-heading', 'major-heading')
                  and flatb and flatb[-1].typ == 'speech'):
                mmm = re.match(
                    r'(?i)\s*<p>\s*((?:New )?(?:clause|schedule) \d+\w?)</p>',
                    flatb[-1].stext[-1])
                if mmm:
                    if IsNotQuiet():
                        print("Clause/schedule moving %s" % flatb[-1].stext[-1])
                    qbh.stext.insert(0, " &mdash; ")
                    qbh.stext.insert(0, mmm.group(1))
                    flatb[-1].stext = flatb[-1].stext[:-1]  # delete final value

                    # remove an empty speech
                    if not flatb[-1].stext:
                        if IsNotQuiet():
                            print("removing empty speech after moving 'clause/schedule' out")
                        assert flatb[-1].speaker == 'nospeaker="true"'
                        del flatb[-1]

                # double check with a looser search that nothing clause-like
                # was missed by the strict match above
                elif re.search(
                        r'(?i)<p>\s*((?:New )?\s*(?:clause|schedule)\s*\w+)\s*</p>',
                        flatb[-1].stext[-1]):
                    print(flatb[-1].stext[-1])
                    assert False

                flatb.append(qbh)

            # otherwise put out this heading
            else:
                flatb.append(qbh)

        # division case
        else:
            (unspoketxt, qbd) = DivisionParsingPart(
                int(gdiv.group(1)), unspoketxt, stampurl, sdate)

            # grab some division text off the back end of the previous speech
            # and wrap into a new no-speaker speech
            qbdp = GrabDivisionProced(flatb[-1], qbd)
            if qbdp:
                flatb.append(qbdp)
            flatb.append(qbd)

            # write out our file with the report of all divisions
            PreviewDivisionTextGuess(flatb)

        # output unaccounted-for unspoken text occurring after a division,
        # or after a heading (anything beyond tags and whitespace)
        if not re.match(r'(?:<[^>]*>|\s)*$', unspoketxt):
            qb = qspeech('nospeaker="true"', unspoketxt, stampurl)
            qb.typ = 'speech'
            FilterDebateSpeech(qb)
            flatb.append(qb)

        # there is no text; update from stamps if there are any
        else:
            stampurl.UpdateStampUrl(unspoketxt)

        # go through each of the speeches in a block and put it into our batch
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            qb.typ = 'speech'
            FilterDebateSpeech(qb, bDebateBegToMove=True)

            # captures tail off westminster hall speeches
            qbdp = GrabWestminDivisionInterruptProced(qb, ss[1])
            flatb.append(qb)
            if qbdp:
                flatb.append(qbdp)

    # everything is now flattened out into a series of speeches,
    # where some of the speeches are headings
    return flatb

示例#14

0

显示文件

文件： divisionsections.py 项目： samknight/parlparse

def GrabDivisionProced(qbp, qbd):
    """Split the procedural text that precedes a division off the previous speech.

    Args:
        qbp: the object immediately before the division; its ``stext``
            paragraph list is modified in place.
        qbd: the division object (not used here).

    Returns:
        A new no-speaker qspeech holding the trailing paragraphs of ``qbp``
        that report on the division, or None when nothing is split off
        (the previous object is already no-speaker, or one of the two
        hard-coded correction dates applies).

    Raises:
        Exception: the previous object is not a speech with paragraphs.
        ContextException: no "house divided" text was found before the
            division.
    """
    if qbp.typ != 'speech' or len(qbp.stext) < 1:

        # this is that crazy correction one
        if qbp.sstampurl.sdate == '2003-12-18':
            return None

        print(qbp.stext)
        raise Exception("previous to division not speech")

    # join up italics that were split in two
    qbp.stext[-1] = re.sub(' </i><i> ', ' ', qbp.stext[-1])
    qbp.stext[-1] = re.sub('</i><i> ', ' ', qbp.stext[-1])

    hdg = rehousediv.match(qbp.stext[-1])
    hdg_b = None
    if not hdg:
        # the announcement may be split over the last two paragraphs;
        # a one-paragraph speech simply has no first half to look at
        hdg_a = len(qbp.stext) >= 2 and rehousediv_a.match(qbp.stext[-2])
        hdg_b = rehousediv_b.match(qbp.stext[-1])
        if hdg_a and hdg_b:
            hdg = hdg_b
        elif hdg_b:
            # They are occasionally putting "The" "Committee"
            # "divided" in two or three separate paragraphs
            two_prev = re.sub('</p><p[^>]*>', '', ''.join(qbp.stext[-3:-1]))
            three_prev = re.sub('</p><p[^>]*>', '', ''.join(qbp.stext[-4:-1]))
            if rehousediv_a.match(three_prev):
                qbp.stext = qbp.stext[:-4] + [three_prev, qbp.stext[-1]]
                hdg = hdg_b
            elif rehousediv_a.match(two_prev):
                qbp.stext = qbp.stext[:-3] + [two_prev, qbp.stext[-1]]
                hdg = hdg_b

    # four-paragraph form, with a rehousediv_england paragraph before the last
    # one; only attempted when there are four paragraphs to index
    if not hdg and len(qbp.stext) >= 4:
        if (rehousediv_a.match(qbp.stext[-4])
                and rehousediv_b.match(qbp.stext[-3])
                and rehousediv_england.match(qbp.stext[-2])):
            hdg = hdg_b

    if not hdg:
        hdg = redivshouldappear.match(qbp.stext[-1])
    if not hdg:
        # another correction one
        if qbp.sstampurl.sdate != '2003-09-16':
            raise ContextException(
                "no house divided before division: %s" % qbp.stext[-1])
        return None

    # if previous thing is already a no-speaker, we don't need to break it out
    # (the coding on the question put is complex and multilined)
    if re.search('nospeaker="true"', qbp.speaker):
        qbp.stext = SubsPWtextset(qbp.stext)
        return None

    # look back at previous paragraphs and skim off a part of what's there
    # to make a non-spoken bit reporting on the division.
    iskim = 1
    if not (len(qbp.stext) >= 2
            and re.search('Serjeant at Arms', qbp.stext[-2])):
        # walk backwards until the paragraph matching reqput
        while len(qbp.stext) >= iskim:
            if reqput.match(qbp.stext[-iskim]):
                break
            iskim += 1

        # haven't found a question put before we reach the front;
        # fall back to skimming just the final paragraph
        if len(qbp.stext) < iskim:
            iskim = 1

    # copy the skimmed lines into a non-speaking paragraph.
    qbdp = qspeech('nospeaker="true"', "", qbp.sstampurl)
    qbdp.typ = 'speech'
    qbdp.stext = SubsPWtextset(qbp.stext[-iskim:])

    # trim the skimmed lines off the given speech
    qbp.stext = qbp.stext[:-iskim]

    return qbdp

示例#15

0

显示文件

文件： sections.py 项目： emmaclarke/parlparse

def FilterDebateSections(text, sdate, typ):
    """Flatten debate or Westminster Hall text into one list of qspeech objects.

    The text is split into heading sections.  Each section contributes a
    heading object (or a division object when the heading reads
    "Division No. N"), an optional no-speaker speech for any text that
    precedes the first speaker, and one speech per (speaker, text) pair.

    Args:
        text: the raw text to split up.
        sdate: date value handed on to the fix-up, heading-stripping and
            division-parsing helpers.
        typ: "debate" or "westminhall"; anything else fails an assertion.

    Returns:
        The list of objects built up, in document order.

    Raises:
        ContextException: propagated unchanged from the helpers called here.
    """
    # Make the corrections at this level so that the headings can be resolved
    # (old-style fixing, from before patches existed).
    if typ == "debate":
        text = ApplyFixSubstitutions(text, sdate, fixsubs)
    else:
        assert typ == "westminhall"
        # crude clean-up: collapse triple-nested lists, drop empty <h5> tags
        text = re.sub('(?i)<ul><ul><ul>', '<ul>', text)
        text = re.sub('(?i)</ul></ul></ul>', '</ul>', text)
        text = re.sub('(?i)<h5></h5>', '', text)

    # split into a list of (heading, pre-first-speech text, [(speaker, text)], major?)
    headspeak = SplitHeadingsSpeakers(text)

    # strip the leading headings; the loop below starts at index ih
    if typ == "debate":
        (ih, stampurl) = StripDebateHeadings(headspeak, sdate)
    elif typ == "westminhall":
        (ih, stampurl) = StripWestminhallHeadings(headspeak, sdate)
    else:
        assert False  # to be for writminstat?

    # Loop through each detected heading and the speeches which follow it.
    # The output is flat: some entries encode headings, some divisions;
    # see each entry's typ attribute.
    flatb = []
    state = {}  # carried between NormalHeadingPart calls
    for sht in headspeak[ih:]:
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        headingmajor = sht[3]
        # major headings (and the final section) are upper-cased for debates
        if typ == 'debate' and (headingmajor or sht == headspeak[-1]):
            headingtxt = headingtxt.upper()
        unspoketxt = sht[1]
        speechestxt = sht[2]

        # a "Division No. N" heading marks a division rather than a heading
        gdiv = re.match(r'(?i)(?:<b>)?Division No. (\d+)', headingtxt)

        if not gdiv:
            qbh = NormalHeadingPart(headingtxt, stampurl, state, typ)

            # ram together minor headings into previous ones which have no speeches
            if qbh.typ == 'minor-heading' and flatb and flatb[-1].typ == 'minor-heading':
                flatb[-1].stext.append(" &mdash; ")
                flatb[-1].stext.extend(qbh.stext)

            # ram together major headings into previous ones which have no speeches
            elif qbh.typ == 'major-heading' and flatb and flatb[-1].typ == 'major-heading':
                flatb[-1].stext.append(" &mdash; ")
                flatb[-1].stext.extend(qbh.stext)

            # "Allotted Day" sub-headings, and anything directly under a
            # "Petition" major heading, join the major heading
            elif (qbh.typ == 'minor-heading' and flatb
                  and flatb[-1].typ == 'major-heading'
                  and (re.search('(?i)Allotted Day', qbh.stext[-1])
                       or re.search('(?i)^Petition$', flatb[-1].stext[-1]))):
                flatb[-1].stext.append(" &mdash; ")
                flatb[-1].stext.extend(qbh.stext)

            # suspension / resumption notices become no-speaker speeches when
            # they follow a speech, and are dropped otherwise
            elif re.search(
                    r"(?i)(?:sitting suspended(?: for| until| till|\.))|(on resuming&)",
                    qbh.stext[0]):
                if flatb and flatb[-1].typ == 'speech':
                    qb = qspeech('nospeaker="true"', qbh.stext[0], stampurl)
                    qb.typ = 'speech'
                    FilterDebateSpeech(qb)
                    flatb.append(qb)

            # "[X in the Chair]" after a speech is also a no-speaker speech
            elif (re.match(r"(?i)\[.*? in\s*the\s*Chair\.?\]$", qbh.stext[0])
                  and flatb and flatb[-1].typ == 'speech'):
                qb = qspeech('nospeaker="true"', qbh.stext[0], stampurl)
                qb.typ = 'speech'
                FilterDebateSpeech(qb)
                flatb.append(qb)

            # this is where we suck in a trailing "Clause" part of the title
            # that is mistakenly outside the heading.
            elif (qbh.typ in ('minor-heading', 'major-heading')
                  and flatb and flatb[-1].typ == 'speech'):
                mmm = re.match(
                    r'(?i)\s*<p>\s*((?:New )?(?:clause|schedule) \d+\w?)</p>',
                    flatb[-1].stext[-1])
                if mmm:
                    if IsNotQuiet():
                        print("Clause/schedule moving %s" % flatb[-1].stext[-1])
                    qbh.stext.insert(0, " &mdash; ")
                    qbh.stext.insert(0, mmm.group(1))
                    flatb[-1].stext = flatb[-1].stext[:-1]  # delete final value

                    # remove an empty speech
                    if not flatb[-1].stext:
                        if IsNotQuiet():
                            print("removing empty speech after moving 'clause/schedule' out")
                        assert flatb[-1].speaker == 'nospeaker="true"'
                        del flatb[-1]

                # double check with a looser search that nothing clause-like
                # was missed by the strict match above
                elif re.search(
                        r'(?i)<p>\s*((?:New )?\s*(?:clause|schedule)\s*\w+)\s*</p>',
                        flatb[-1].stext[-1]):
                    print(flatb[-1].stext[-1])
                    assert False

                flatb.append(qbh)

            # otherwise put out this heading
            else:
                flatb.append(qbh)

        # division case
        else:
            (unspoketxt, qbd) = DivisionParsingPart(
                int(gdiv.group(1)), unspoketxt, stampurl, sdate)

            # grab some division text off the back end of the previous speech
            # and wrap into a new no-speaker speech
            qbdp = GrabDivisionProced(flatb[-1], qbd)
            if qbdp:
                flatb.append(qbdp)
            flatb.append(qbd)

            # write out our file with the report of all divisions
            PreviewDivisionTextGuess(flatb)

        # output unaccounted-for unspoken text occurring after a division,
        # or after a heading (anything beyond tags and whitespace)
        if not re.match(r'(?:<[^>]*>|\s)*$', unspoketxt):
            qb = qspeech('nospeaker="true"', unspoketxt, stampurl)
            qb.typ = 'speech'
            FilterDebateSpeech(qb)
            flatb.append(qb)

        # there is no text; update from stamps if there are any
        else:
            stampurl.UpdateStampUrl(unspoketxt)

        # go through each of the speeches in a block and put it into our batch
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            qb.typ = 'speech'
            FilterDebateSpeech(qb, bDebateBegToMove=True)

            # captures tail off westminster hall speeches
            qbdp = GrabWestminDivisionInterruptProced(qb, ss[1])
            flatb.append(qb)
            if qbdp:
                flatb.append(qbdp)

    # everything is now flattened out into a series of speeches,
    # where some of the speeches are headings
    return flatb

示例#16

0

显示文件

文件： sections.py 项目： emmaclarke/parlparse

def NormalHeadingPart(headingtxt, stampurl, state, typ):
    """Classify a heading and wrap it in a no-speaker qspeech object.

    This is an attempt at major heading detection.  The main wrap code spots
    adjournment debates, and does its best with some procedural things, but
    it's pretty flawed.  Also, the Oral questions heading is a super-major
    heading, so doesn't fit into the scheme.

    Args:
        headingtxt: the heading text; <i> and <sup> tags are stripped.
        stampurl: current stamp; its ``sdate`` and ``aname`` are read.
        state: dict carried between calls.  The keys
            'just_had_points_of_order' and 'remaining_private_bills' are
            read and written here.
        typ: section type; 'westminhall' forces a minor heading.

    Returns:
        A qspeech whose ``typ`` is one of 'minor-heading', 'major-heading',
        'oral-heading' or 'inserted-heading', and whose ``stext`` is a
        one-element list holding the entity-fixed heading text.

    Raises:
        ContextException: the heading loosely resembles "Oral Answers to
            Questions" or "in the Chair" without matching precisely, or a
            tag remains in the heading after entity fixing.
    """
    # remove junk italic settings that appear in the today pages
    headingtxt = re.sub("(?i)</?(?:i|sup)>", "", headingtxt)

    # detect if this is a major heading and record it in the correct variable
    bmajorheading = False
    boralheading = False
    binsertedheading = False

    if re.search('(?i)-- lost heading --', headingtxt):
        binsertedheading = True

    # Oral question are really a major heading
    elif re.match("(?i)Oral Answers to Questions", headingtxt):
        boralheading = True

    # Check if there are any other spellings of "Oral Answers to Questions"
    # with a loose match (two dates have a genuine title with Oral in it)
    elif (re.search('(?i)oral', headingtxt)
          and re.search('(?i)ques', headingtxt)
          and not re.search(" Not ", headingtxt)
          and not re.search("electoral", headingtxt)
          and stampurl.sdate not in ("2002-06-11", "2012-02-09")):
        print(headingtxt)
        raise ContextException('Oral question match not precise enough', stamp=stampurl, fragment=headingtxt)

    # All upper case headings, excluding slash/digit reference codes and
    # "... Bill" headings while in the remaining-private-bills state
    elif (not re.search('[a-z]', headingtxt)
          and not re.match(r'[A-Z\d/]+[\d/][A-Z\d/]+$', headingtxt)
          and not ('remaining_private_bills' in state
                   and re.search('(?i) Bill$', headingtxt))):
        bmajorheading = True

    # the heading straight after "Points of Order" is major
    elif 'just_had_points_of_order' in state:
        bmajorheading = True
        del state['just_had_points_of_order']

    # If this is labeled major, then it gets concatenated with the
    # subsequent major heading.  It's kind of a procedural info about the
    # running of things, so fair to have it as a minor heading alone.
    elif re.match(r"(?i)\[.*? in\s*the\s*Chair\.?\]$", headingtxt):
        bmajorheading = False

    elif re.search(r"(?i)in\s*the\s*chair", headingtxt):
        print(headingtxt)
        raise ContextException('in the chair match not precise enough', stamp=stampurl, fragment=headingtxt)

    # Other major headings, marked by _head in their anchor tag
    elif re.search('"topichd_|"hd_|_head', stampurl.aname):
        bmajorheading = True

    # from this date on, certain procedural titles are always major
    if stampurl.sdate > '2006-05-07':
        if re.match("(?i)(Private business|Business of the House|Orders of the day|Opposition Day|Deferred Division|Petition)", headingtxt):
            bmajorheading = True
        if re.match("(?i)Points? of Order", headingtxt):
            bmajorheading = True
            state['just_had_points_of_order'] = True
        if re.match("(?i)Remaining Private Members[^ ]* Bills", headingtxt):
            bmajorheading = True
            state['remaining_private_bills'] = True

    # we're not writing a block for division headings
    # write out block for headings
    headingtxtfx = FixHTMLEntities(headingtxt)
    # explicit check rather than an assert, so it still runs under -O
    if re.search("[<>]", headingtxtfx):
        raise ContextException('Tag found in heading text', stamp=stampurl, fragment=headingtxt)

    qb = qspeech('nospeaker="true"', headingtxtfx, stampurl)
    if typ == 'westminhall':
        qb.typ = 'minor-heading'
    elif binsertedheading:
        qb.typ = 'inserted-heading'
    elif boralheading:
        qb.typ = 'oral-heading'
    elif bmajorheading:
        qb.typ = 'major-heading'
    else:
        qb.typ = 'minor-heading'

    # headings become one unmarked paragraph of text
    qb.stext = [headingtxtfx]
    return qb

示例#17

0

显示文件

def NormalHeadingPart(headingtxt, stampurl, state, typ):
    """Classify a heading and wrap it in a no-speaker qspeech object.

    This is an attempt at major heading detection.  The main wrap code spots
    adjournment debates, and does its best with some procedural things, but
    it's pretty flawed.  Also, the Oral questions heading is a super-major
    heading, so doesn't fit into the scheme.

    Args:
        headingtxt: the heading text; <i> and <sup> tags are stripped.
        stampurl: current stamp; its ``sdate`` and ``aname`` are read.
        state: dict carried between calls.  The keys
            'just_had_points_of_order' and 'remaining_private_bills' are
            read and written here.
        typ: section type; 'westminhall' forces a minor heading.

    Returns:
        A qspeech whose ``typ`` is one of 'minor-heading', 'major-heading',
        'oral-heading' or 'inserted-heading', and whose ``stext`` is a
        one-element list holding the entity-fixed heading text.

    Raises:
        ContextException: the heading loosely resembles "Oral Answers to
            Questions" or "in the Chair" without matching precisely, or a
            tag remains in the heading after entity fixing.
    """
    # remove junk italic settings that appear in the today pages
    headingtxt = re.sub("(?i)</?(?:i|sup)>", "", headingtxt)

    # detect if this is a major heading and record it in the correct variable
    bmajorheading = False
    boralheading = False
    binsertedheading = False

    if re.search('(?i)-- lost heading --', headingtxt):
        binsertedheading = True

    # Oral question are really a major heading
    elif re.match("(?i)Oral Answers to Questions", headingtxt):
        boralheading = True

    # Check if there are any other spellings of "Oral Answers to Questions"
    # with a loose match (two dates have a genuine title with Oral in it)
    elif (re.search('(?i)oral', headingtxt)
          and re.search('(?i)ques', headingtxt)
          and not re.search(" Not ", headingtxt)
          and not re.search("electoral", headingtxt)
          and stampurl.sdate not in ("2002-06-11", "2012-02-09")):
        print(headingtxt)
        raise ContextException('Oral question match not precise enough',
                               stamp=stampurl,
                               fragment=headingtxt)

    # All upper case headings, excluding slash/digit reference codes and
    # "... Bill" headings while in the remaining-private-bills state
    elif (not re.search('[a-z]', headingtxt)
          and not re.match(r'[A-Z\d/]+[\d/][A-Z\d/]+$', headingtxt)
          and not ('remaining_private_bills' in state
                   and re.search('(?i) Bill$', headingtxt))):
        bmajorheading = True

    # the heading straight after "Points of Order" is major
    elif 'just_had_points_of_order' in state:
        bmajorheading = True
        del state['just_had_points_of_order']

    # If this is labeled major, then it gets concatenated with the
    # subsequent major heading.  It's kind of a procedural info about the
    # running of things, so fair to have it as a minor heading alone.
    elif re.match(r"(?i)\[.*? in\s*the\s*Chair\.?\]$", headingtxt):
        bmajorheading = False

    elif re.search(r"(?i)in\s*the\s*chair", headingtxt):
        print(headingtxt)
        raise ContextException('in the chair match not precise enough',
                               stamp=stampurl,
                               fragment=headingtxt)

    # Other major headings, marked by _head in their anchor tag
    elif re.search('"topichd_|"ordayhd_|"hd_|_head', stampurl.aname):
        bmajorheading = True

    # from this date on, certain procedural titles are always major
    if stampurl.sdate > '2006-05-07':
        if re.match(
                "(?i)(Private business|Business of the House|Orders of the day|Opposition Day|Deferred Division|Petition)",
                headingtxt):
            bmajorheading = True
        if re.match("(?i)Points? of Order", headingtxt):
            bmajorheading = True
            state['just_had_points_of_order'] = True
        if re.match("(?i)Remaining Private Members[^ ]* Bills", headingtxt):
            bmajorheading = True
            state['remaining_private_bills'] = True

    # we're not writing a block for division headings
    # write out block for headings
    headingtxtfx = FixHTMLEntities(headingtxt)
    # explicit check rather than an assert, so it still runs under -O
    if re.search("[<>]", headingtxtfx):
        raise ContextException('Tag found in heading text',
                               stamp=stampurl,
                               fragment=headingtxt)

    qb = qspeech('nospeaker="true"', headingtxtfx, stampurl)
    if typ == 'westminhall':
        qb.typ = 'minor-heading'
    elif binsertedheading:
        qb.typ = 'inserted-heading'
    elif boralheading:
        qb.typ = 'oral-heading'
    elif bmajorheading:
        qb.typ = 'major-heading'
    else:
        qb.typ = 'minor-heading'

    # headings become one unmarked paragraph of text
    qb.stext = [headingtxtfx]
    return qb

示例#18

0

显示文件

文件： divisionsections.py 项目： henare/parlparse

def GrabDivisionProced(qbp, qbd):
    """Split the procedural text that precedes a division off the previous speech.

    Args:
        qbp: the object immediately before the division; its ``stext``
            paragraph list is modified in place.
        qbd: the division object (not used here).

    Returns:
        A new no-speaker qspeech holding the trailing paragraphs of ``qbp``
        that report on the division, or None when nothing is split off
        (the previous object is already no-speaker, or one of the two
        hard-coded correction dates applies).

    Raises:
        Exception: the previous object is not a speech with paragraphs.
        ContextException: no "house divided" text was found before the
            division.
    """
    if qbp.typ != "speech" or len(qbp.stext) < 1:

        # this is that crazy correction one
        if qbp.sstampurl.sdate == "2003-12-18":
            return None

        print(qbp.stext)
        raise Exception("previous to division not speech")

    # join up italics that were split in two
    qbp.stext[-1] = re.sub(" </i><i> ", " ", qbp.stext[-1])
    qbp.stext[-1] = re.sub("</i><i> ", " ", qbp.stext[-1])

    hdg = rehousediv.match(qbp.stext[-1])
    if not hdg:
        # the announcement may be split over the last two paragraphs;
        # a one-paragraph speech simply has no first half to look at
        hdg_a = len(qbp.stext) >= 2 and rehousediv_a.match(qbp.stext[-2])
        hdg_b = rehousediv_b.match(qbp.stext[-1])
        if hdg_a and hdg_b:
            hdg = hdg_b
        elif hdg_b:
            # They are occasionally putting "The" "Committee"
            # "divided" in two or three separate paragraphs
            two_prev = re.sub("</p><p[^>]*>", "", "".join(qbp.stext[-3:-1]))
            three_prev = re.sub("</p><p[^>]*>", "", "".join(qbp.stext[-4:-1]))
            if rehousediv_a.match(three_prev):
                qbp.stext = qbp.stext[:-4] + [three_prev, qbp.stext[-1]]
                hdg = hdg_b
            elif rehousediv_a.match(two_prev):
                qbp.stext = qbp.stext[:-3] + [two_prev, qbp.stext[-1]]
                hdg = hdg_b
    if not hdg:
        hdg = redivshouldappear.match(qbp.stext[-1])
    if not hdg:
        # another correction one
        if qbp.sstampurl.sdate != "2003-09-16":
            raise ContextException(
                "no house divided before division: %s" % qbp.stext[-1])
        return None

    # if previous thing is already a no-speaker, we don't need to break it out
    # (the coding on the question put is complex and multilined)
    if re.search('nospeaker="true"', qbp.speaker):
        qbp.stext = SubsPWtextset(qbp.stext)
        return None

    # look back at previous paragraphs and skim off a part of what's there
    # to make a non-spoken bit reporting on the division.
    iskim = 1
    if not (len(qbp.stext) >= 2
            and re.search("Serjeant at Arms", qbp.stext[-2])):
        # walk backwards until the paragraph matching reqput
        while len(qbp.stext) >= iskim:
            if reqput.match(qbp.stext[-iskim]):
                break
            iskim += 1

        # haven't found a question put before we reach the front;
        # fall back to skimming just the final paragraph
        if len(qbp.stext) < iskim:
            iskim = 1

    # copy the skimmed lines into a non-speaking paragraph.
    qbdp = qspeech('nospeaker="true"', "", qbp.sstampurl)
    qbdp.typ = "speech"
    qbdp.stext = SubsPWtextset(qbp.stext[-iskim:])

    # trim the skimmed lines off the given speech
    qbp.stext = qbp.stext[:-iskim]

    return qbdp

示例#19

0

显示文件

def FilterLordsSpeech(qb):
    """Split a Lords speech into its spoken and unspoken (motion text) parts.

    Runs the ordinary debate-speech filtering on ``qb`` first, then works out
    where the spoken text starts and stops, tagging unspoken paragraphs with
    a ``pwmotiontext`` attribute.

    Args:
        qb: the qspeech to process; its ``stext`` and ``speaker`` may be
            modified in place.

    Returns:
        A list of one or more qspeech objects, in order, replacing ``qb``.

    Raises:
        ContextException: the opening paragraphs do not follow any of the
            recognised patterns, or the end of the spoken text is unclear.
    """
    # pull in the normal filtering that gets done on debate speeches
    # does the paragraph indents and tables.  Maybe should be inlined for lords
    FilterDebateSpeech(qb)

    # index of the paragraph where speech text has started, plus 1
    ispeechstartp1 = 0

    # the colon attr is blank or has a : depending on what was there after
    # the name that was matched; no colon attr, or a colon, means a speech
    recol = re.search('colon="(:?)"', qb.speaker)
    bSpeakerExists = not re.match('nospeaker="true"', qb.speaker)
    if bSpeakerExists and (not recol or recol.group(1)):
        # text of this kind at the beginning should not be spoken,
        # assume there wasn't a colon
        if (not re.search("(?i)<p>(?:moved|asked|rose to move,) ", qb.stext[0])
                or re.search("(?i)<p>moved formally", qb.stext[0])):
            ispeechstartp1 = 1  # 0th paragraph is speech text

    res = []  # output list
    preparagraphtype = ""
    if bSpeakerExists and (ispeechstartp1 == 0):
        if re.match(
                "<p>asked Her Majesty's Government|<p>asked the|<p>&mdash;Took the Oath",
                qb.stext[0]):
            preparagraphtype = "asked"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)
            if ispeechstartp1 == len(qb.stext):  # No Noble Lord said, the usual
                ispeechstartp1 = 1
            if ispeechstartp1 != 1:
                print("Noble Lord Said on  %s paragraph" % ispeechstartp1)
                raise ContextException(
                    "Noble Lord Said missing in second paragraph",
                    stamp=qb.sstampurl)
            # ensure that the noble lord said doesn't say an amendment withdrawn
            assert not MatchPWmotionStuff(qb, ispeechstartp1)

        elif re.match("<p>rose to (?:ask|call|draw attention|consider)",
                      qb.stext[0]):
            preparagraphtype = "asked"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)
            if ispeechstartp1 not in [1, 2]:
                print("Noble Lord Said on  %s paragraph" % ispeechstartp1)
                raise ContextException(
                    "Noble Lord Said missing in second paragraph",
                    stamp=qb.sstampurl)

            # ensure that the noble lord said doesn't say an amendment withdrawn
            assert not MatchPWmotionStuff(qb, ispeechstartp1)

        # identify a writ of summons (single line)
        elif re.match(
                r"<p>(?:[\s,]*having received a [Ww]rit of [Ss]ummons .*?)?[Tt]ook the [Oo]ath\.</p>$",
                qb.stext[0]):
            assert len(qb.stext) == 1
            # cludgy; already have the <p>-tag embedded in the string
            qb.stext[0] = re.sub('^<p>', '<p pwmotiontext="summons">',
                                 qb.stext[0])
            res.append(qb)
            return res  # bail out

        # introduction of a new member: wholly unspoken
        elif re.search(
                "having been created.*?Was, in (his|her) robes, introduced",
                qb.stext[0]):
            assert len(qb.stext) == 1
            qbunspo = qspeech('nospeaker="true"', "", qb.sstampurl)
            qbunspo.typ = 'speech'
            qbunspo.stext = qb.stext
            qbunspo.stext[0] = re.sub('^<p>', '<p pwmotiontext="introduced">',
                                      qbunspo.stext[0])
            res.append(qbunspo)
            return res

        # identify a moved amendment
        elif re.match(
                "<p>moved,? |<p>Amendments? |<p>had given notice|<p>(?:rose )?to move|<p>had given his intention",
                qb.stext[0]):

            # find where the speech begins, and strip out "The noble lord said:"
            preparagraphtype = "moved"
            ispeechstartp1 = SearchForNobleLordSaid(qb, preparagraphtype)

            # everything up to this point is non-speech
            assert ispeechstartp1 > 0
            qbprev = qspeech(qb.speaker, "", qb.sstampurl)
            qbprev.typ = 'speech'
            qbprev.stext = qb.stext[:ispeechstartp1]

            res.append(qbprev)
            if ispeechstartp1 == len(qb.stext):
                return res

            # upgrade the spoken part
            qb.speaker = qb.speaker.replace('colon=""', 'colon=":"')
            del qb.stext[:ispeechstartp1]
            assert qb.stext
            # the spoken text must reach at least here
            # (after the line, "The noble lord said:")
            ispeechstartp1 = 1

        # error, no moved amendment found
        else:
            print(qb.stext)
            print("no moved amendment; is a colon missing after the name?")
            raise ContextException("missing moved amendment",
                                   stamp=qb.sstampurl)

    # advance to place where non-speeches happen
    if ispeechstartp1 > len(qb.stext):
        print("ispeechstartp1 problem; speeches running through %s %s"
              % (ispeechstartp1, len(qb.stext)))
        print(qb.stext)
        raise ContextException(
            "end of speech boundary unclear running through; need to separate paragraphs?",
            stamp=qb.sstampurl)

    # a common end of speech is to withdraw an amendment
    # we go through paragraphs until we match that or some other motion text type statement
    sAmendmentStatement = None
    while bSpeakerExists and (ispeechstartp1 < len(qb.stext)):
        sAmendmentStatement = MatchPWmotionStuff(qb, ispeechstartp1)
        if sAmendmentStatement:
            break

        ispeechstartp1 += 1

    # there are no further lines after the withdrawal
    if ispeechstartp1 == len(qb.stext):
        assert not sAmendmentStatement
        res.append(qb)
        return res

    # do the further lines after withdrawal
    assert (not bSpeakerExists) or sAmendmentStatement

    # splice off the unspoken text running off from the amendment statements
    if ispeechstartp1 != 0:
        qbunspo = qspeech('nospeaker="true"', "", qb.sstampurl)
        qbunspo.typ = 'speech'
        qbunspo.stext = qb.stext[ispeechstartp1:]
        del qb.stext[ispeechstartp1:]
        res.append(qb)
        res.append(qbunspo)
    else:
        # nothing spoken at all: the whole of qb is the unspoken part
        res.append(qb)
        qbunspo = qb

    # check that once we begin pwmotion amendment statements, all statements are of this type
    for i in range(len(qbunspo.stext)):
        if not re.match('<p', qbunspo.stext[i]):
            continue
        sAmendmentStatement = MatchKnownAsPWmotionStuff(qbunspo, i)
        if not sAmendmentStatement:
            if IsNotQuiet():
                # "(*)" marks text that came from a no-speaker block
                marker = " " if bSpeakerExists else "(*)"
                print("UNRECOGNIZED-MOTION-TEXT%s: %s" % (marker,
                                                          qbunspo.stext[i]))
            sAmendmentStatement = "unrecognized"
        qbunspo.stext[i] = re.sub(
            '^<p(.*?)>', '<p\\1 pwmotiontext="%s">' % sAmendmentStatement,
            qbunspo.stext[i])

    return res

示例#20

0

显示文件

文件： filterwranssections.py 项目： scotm/parlparse

def _MakeHeadingSpeech(speakerattr, typ, headingtext, stampurl):
    """Return a qspeech of type `typ` whose only paragraph is `headingtext`."""
    qbh = qspeech(speakerattr, headingtext, stampurl)
    qbh.typ = typ
    qbh.stext = [headingtext]
    return qbh


def FilterWransSections(text, sdate, lords=False):
    """Flatten a written-answers page into a list of qspeech objects.

    The text is passed through ApplyFixSubstitutions, split into
    (heading, unspoken text, speeches) triplets by SplitHeadingsSpeakers,
    and everything from the index returned by StripWransHeadings onwards
    is converted into qspeech objects with `typ` set to one of
    'major-heading', 'minor-heading', 'ques' or 'reply'.

    Parameters:
        text   -- raw text of the written answers page.
        sdate  -- date of the page; compared as a string against
                  '2006-05-07' (presumably ISO yyyy-mm-dd -- confirm
                  against callers).
        lords  -- forwarded unchanged to FilterQuestion.

    Returns:
        The flat list of qspeech objects in document order.

    Raises:
        ContextException -- on non-markup text directly under a heading, an
            unrecognised or broken major heading, a reply that is not
            preceded by a question, or a question left without a reply.
        Exception -- when a known major heading has speeches under it.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    headspeak = SplitHeadingsSpeakers(text)

    # break down into lists of headings and lists of speeches
    (ih, stampurl) = StripWransHeadings(headspeak, sdate)

    # full list of question batches, flattened into one list of speeches
    flatb = []
    # For when they put another "Written Answers to Questions" and date
    justhadnewtitle = False
    for sht in headspeak[ih:]:
        # triplet of ( heading, unspokentext, [(speaker, text)] )
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        unspoketxt = sht[1]
        speechestxt = sht[2]

        # the pre-spoken text may only hold tags and whitespace; it is
        # used solely to update the stamps
        if not re.match(r'(?:<[^>]*>|\s)*$', unspoketxt):
            raise ContextException(
                "unspoken text under heading in wrans",
                stamp=stampurl, fragment=unspoketxt)
        stampurl.UpdateStampUrl(unspoketxt)

        # headings become one unmarked paragraph of text

        # detect if this is a major heading (no lower-case letters, no speeches)
        if not re.search('[a-z]', headingtxt) and not speechestxt:
            if headingtxt not in parlPhrases.wransmajorheadings:
                raise ContextException(
                    "unrecognized major heading, please add to parlPhrases.wransmajorheadings (a)",
                    fragment=headingtxt, stamp=stampurl)
            # no need to fix entities since the text comes from a map
            majheadingtxtfx = parlPhrases.wransmajorheadings[headingtxt]
            flatb.append(_MakeHeadingSpeech(
                'nospeaker="true"', 'major-heading', majheadingtxtfx, stampurl))
            continue
        elif not speechestxt and sdate > '2006-05-07':
            # after this date major headings are looked up by their
            # upper-cased text
            if headingtxt == 'Written Answers to Questions':
                justhadnewtitle = True
                continue
            if headingtxt.upper() not in parlPhrases.wransmajorheadings:
                if justhadnewtitle:
                    # the unknown heading directly after a repeated page
                    # title is skipped (the date line, per the note on
                    # justhadnewtitle above)
                    justhadnewtitle = False
                    continue
                raise ContextException(
                    "unrecognized major heading, please add to parlPhrases.wransmajorheadings (b)",
                    fragment=headingtxt, stamp=stampurl)
            # no need to fix entities since the text comes from a map
            majheadingtxtfx = parlPhrases.wransmajorheadings[headingtxt.upper()]
            flatb.append(_MakeHeadingSpeech(
                'nospeaker="true"', 'major-heading', majheadingtxtfx, stampurl))
            justhadnewtitle = False
            continue
        elif not speechestxt:
            raise ContextException(
                'broken heading %s' % headingtxt,
                stamp=stampurl, fragment=headingtxt)

        # non-major heading; to a question batch
        if headingtxt in parlPhrases.wransmajorheadings:
            raise Exception(' speeches found in major heading %s' % headingtxt)

        headingtxtfx = FixHTMLEntities(headingtxt)
        headingmark = 'nospeaker="true"'
        bNextStartofQ = True

        # go through each of the speeches in a block and put it into our
        # batch of speeches
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)

            # question posed
            bQuestion = re.match(
                r"(?:<[^>]*?>|\s)*?(to ask|asked (Her Majesty(&#039;|&#146;|')s Government|the ))",
                qb.text, re.I) or '<wrans-question>' in qb.text
            if bQuestion:
                qb.text = qb.text.replace('<wrans-question>', '')
                qb.typ = 'ques'

                # put out the heading for this question-reply block.
                # we don't assert true since we can have multiple questions
                # answered in a block.
                if bNextStartofQ:
                    # we need to make the heading from the same stampurl as
                    # the first question
                    flatb.append(_MakeHeadingSpeech(
                        headingmark, 'minor-heading', headingtxtfx, qb.sstampurl))
                    bNextStartofQ = False

                    # used to show that the subsequent headings in this
                    # block have been created, and weren't in the original
                    # text.
                    headingmark = 'nospeaker="true" inserted-heading="true"'

                qb.stext = FilterQuestion(qb, sdate, lords)
                # question numbers look like [12345], [HL123] or [12345R]
                if not re.findall(r'\[(?:HL)?(\d+)R?\]', ss[1]):
                    errmess = ' <p class="error">Question number missing in Hansard, possibly truncated question.</p> '
                    qb.stext.append(errmess)

                flatb.append(qb)

            # do the reply
            else:
                if bNextStartofQ:
                    raise ContextException(
                        'start of question expected',
                        stamp=qb.sstampurl, fragment=qb.text)
                qb.typ = 'reply'

                # this case is so rare we flag them in the corrections of
                # the html with this tag
                if "<another-answer-to-follow>" in qb.text:
                    qb.text = qb.text.replace("<another-answer-to-follow>", "")
                else:
                    bNextStartofQ = True

                # Question numbers that reappear inside an answer are
                # deliberately not validated.
                qb.stext = FilterReply(qb)
                flatb.append(qb)

        if not bNextStartofQ:
            # dump the offending batch before failing, to help diagnosis
            print(speechestxt)
            # Note - not sure if this should be speechestxt[-1][1] here.
            # Does what I want for now...
            raise ContextException(
                "missing answer to question",
                stamp=stampurl, fragment=speechestxt[-1][1])

    # we now have everything flattened out in a series of speeches,
    # where some of the speeches are headings (inserted and otherwise).
    return flatb