Exemplos de FixHTMLEntities em Python, exemplos de miscfuncs.FixHTMLEntities em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: filtersentence.py Projeto: mashedkeyboard/parlparse

def TokenStandingOrder(mstandingo, phrtok):
    """Build the phrase token for a matched standing-order reference.

    mstandingo is the regex match: group(1) supplies the code attribute and
    group(2), when non-empty, supplies the title attribute after any markup
    tags have been stripped from it.  phrtok is accepted but not used here.
    Returns a ('phrase', attribute-string) pair.
    """
    code = mstandingo.group(1)
    title = mstandingo.group(2)

    if not title:
        # no title captured: emit the code alone, exactly as it was matched
        return ('phrase', ' class="standing-order" code="%s"' % code)

    attrs = ' class="standing-order" code="%s" title="%s"' % (
        FixHTMLEntities(code),
        FixHTMLEntities(re.sub('<[^>]*>', '', title)))
    return ('phrase', attrs)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: filtersentence.py Projeto: mashedkeyboard/parlparse

    def TokenizePhraseRecurse(self, qs, stex, itc):
        """Split stex on token type itc, recursing on the text in between.

        Each entry of the module-level tokenchain is indexed here as
        [1] a regex locating the token, [2] an optional "marginal" regex and
        [3] a callable returning the first two fields of the toklist entry.
        Text lying between matches of entry itc is handed on to entry
        itc + 1; once itc runs off the end of the chain the text is appended
        to self.toklist as an untyped ('', '', text) entry.
        """
        # end of the chain: nothing left to look for, emit the plain text
        if itc == len(tokenchain):
            plaintext = FixHTMLEntities(stex,
                                        stampurl=(qs and qs.sstampurl))
            self.toklist.append(('', '', plaintext))
            return

        remaining = stex
        while remaining:
            found = tokenchain[itc][1].search(remaining)

            # the text ahead of the match, or all of it when nothing matched
            if found:
                leading = remaining[:found.start(0)]
            else:
                leading = remaining

            # marginal check: evaluated, but nothing is done with the result
            marginal = tokenchain[itc][2]
            if marginal and marginal.search(leading):
                pass

            # let the later token types work on the unmatched text
            if leading:
                self.TokenizePhraseRecurse(qs, leading, itc + 1)

            # no token in what was left, so we are done
            if not found:
                break

            # record the token itself, tagged by its handler
            tagged = tokenchain[itc][3](found, self)
            self.toklist.append(
                (tagged[0], tagged[1],
                 FixHTMLEntities(found.group(0),
                                 stampurl=(qs and qs.sstampurl))))

            # carry on with whatever follows the token
            remaining = remaining[found.end(0):]

Exemplo n.º 3

0

Exibir arquivo

def LordsHeadingPart(headingtxt, stampurl, major):
    """Wrap a Lords heading in a speaker-less qspeech block.

    The heading text is passed through FixHTMLEntities and becomes both the
    qspeech text and its single stext paragraph.  The block is typed
    'major-heading' only when major is true and stampurl.sdate compares
    greater than '2008-12-01'; otherwise it is a 'minor-heading'.
    """
    fixedheading = FixHTMLEntities(headingtxt)
    qb = qspeech('nospeaker="true"', fixedheading, stampurl)

    ismajor = major and stampurl.sdate > '2008-12-01'
    qb.typ = 'major-heading' if ismajor else 'minor-heading'

    # a heading is carried as one unmarked paragraph of text
    qb.stext = [fixedheading]
    return qb

Exemplo n.º 4

0

Exibir arquivo

Arquivo: filtersentence.py Projeto: mashedkeyboard/parlparse

def TokenDate(ldate, phrtok):
    """Build the phrase token for a matched date and record it on phrtok.

    ldate is the regex match for the date text.  A false group(2) is treated
    as "no year given", in which case the first four characters of
    phrtok.sdate are appended as the year.  The text is parsed with
    mx.DateTime.DateTimeFrom and the .date value of the result is stored in
    phrtok.lastdate; if parsing fails phrtok.lastdate is set to '' instead.

    Returns a ('phrase', attribute-string) pair whose code attribute is
    phrtok.lastdate passed through FixHTMLEntities.
    """
    sdate_year = phrtok.sdate[0:4]
    tdate = ldate.group(0).replace('&nbsp;', ' ')
    if not ldate.group(2):
        # No year in the matched text: assume the year of phrtok.sdate.
        # No roll-back is applied if that puts the date in the future.
        tdate += " %s" % sdate_year
    try:
        phrtok.lastdate = mx.DateTime.DateTimeFrom(tdate).date
    except Exception:
        # Best effort: an unparseable date yields an empty code.  Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        phrtok.lastdate = ''
    return ('phrase',
            ' class="date" code="%s"' % FixHTMLEntities(phrtok.lastdate))

Exemplo n.º 5

0

Exibir arquivo

Arquivo: division.py Projeto: samknight/parlparse

def MpTellerList(fsm, vote, stampurl, sdate):
    """Turn the teller lines of a division into <mpname teller="yes"> lines.

    fsm is the sequence of text fragments for one side's tellers.  The
    fragments '</b>' and '<b> and</b>' are skipped; every other non-empty
    fragment is split into "name (constituency)" pieces joined by " and",
    and each name is resolved with memberList.matchfullnamecons for sdate.

    Returns the list of generated XML lines (at most two).
    Raises ContextException when a fragment cannot be parsed, when more than
    two tellers are found, or when a name cannot be matched.
    """
    # name, optional "(constituency)", optional " and <rest of line>"
    tellerline = re.compile(
        r"\s*(?:and )?([ \w.\-'&#;]*?)(?:\(([ \w.\-'&#;]*)\))?(?: and(.*))?\s*\.?\s*$")

    res = []
    for fss in fsm:
        if fss == '</b>':
            continue  # The end </b> on Tellers for the (Ayes|Noes):
        if fss == '<b> and</b>':
            continue  # The 'and' now gets a paragraph of its own
        while fss:  # split by lines, but linefeed sometimes missing
            gftell = tellerline.match(fss)
            if not gftell:
                raise ContextException("no match on teller line",
                                       stamp=stampurl,
                                       fragment=fss)

            fssf = gftell.group(1)
            fssfcons = gftell.group(2)
            # whatever followed " and" is parsed on the next pass
            fss = gftell.group(3)

            if len(res) >= 2:
                # print() with one argument behaves the same on Python 2
                # and Python 3
                print(fsm)
                raise ContextException(' too many tellers ',
                                       stamp=stampurl,
                                       fragment=fss)

            # It always is
            if fssf == 'Mr. Michael Foster':
                fssfcons = 'Worcester'

            (mpid, remadename, remadecons) = memberList.matchfullnamecons(
                fssf.strip(), fssfcons, sdate)
            if not mpid:
                raise ContextException("teller name bad match",
                                       stamp=stampurl,
                                       fragment=fssf)
            res.append(
                '\t<mpname person_id="%s" vote="%s" teller="yes">%s</mpname>' %
                (mpid, vote, FixHTMLEntities(fssf)))

    return res

Exemplo n.º 6

0

Exibir arquivo

Arquivo: divisions.py Projeto: samknight/parlparse

def LordsFilterDivision(text, stampurl, sdate):
    """Convert the text of a Lords division into a list of XML lines.

    text is split on <br>, <p> and </p> (any case).  A fragment matched by
    recontma switches the current side to 'content' or 'not-content'; three
    kinds of bracketed clerk note are reported and dropped; every other
    fragment is taken as a voter's name, resolved with
    lordsList.MatchRevName and emitted as a <lord> element (with
    teller="yes" when retellma matches).

    Returns the lines: a <divisioncount>, then the content <lordlist>, then
    the not-content <lordlist>.
    Raises ContextException for an unknown side heading, a name appearing
    before any side heading, or a line with no name left on it.
    """
    # the intention is to splice out the known parts of the division
    # (the inline flag has to lead the pattern: Python 3.11+ rejects it
    # anywhere else)
    fs = re.split(r'(?i)\s*(?:<br>|</?p>)\s*', text)

    contentlords = []
    notcontentlords = []
    contstate = ''

    for fss in fs:
        if not fss:
            continue
        cfs = recontma.match(fss)
        if cfs:
            if cfs.group(1) == "CONTENTS":
                assert contstate == ''
                contstate = 'content'
            elif cfs.group(1) == 'NOT-CONTENTS' or cfs.group(1) == 'NOT CONTENTS':
                assert contstate == 'content'
                contstate = 'not-content'
            else:
                print("$$$%s$$$" % cfs.group(1))
                raise ContextException("unrecognised content state", stamp=stampurl, fragment=fss)

        elif re.match(r"(?:\[\*|\*\[)[Ss]ee col\. \d+\]", fss):
            print("Disregarding cross-reference in Division %s" % fss)
        elif re.match(r"\[\*\s*The Tellers.*?[Tt]he Clerks.*?\]", fss):
            print("Disregarding clerk comment on numbers %s" % fss)
        elif re.match(r"\[\*\s*The name of a .*? removed from the voting lists\.\]", fss):
            print("Disregarding removed from list comment %s" % fss)

        else:
            if not contstate:
                raise ContextException("empty contstate", stamp=stampurl, fragment=fss)

            # split off teller case
            teller = retellma.match(fss)
            tels = ''
            lfss = fss
            if teller:
                lfss = teller.group(1)
                tels = ' teller="yes"'

            # strip out the office
            offm = reoffma.match(lfss)
            if offm:
                lfss = offm.group(1)
            if not lfss:
                raise ContextException("no name on line", stamp=stampurl, fragment=fss)
            lordid = lordsList.MatchRevName(lfss, sdate, stampurl)
            # the element text is the whole original fragment, not the
            # stripped-down name used for matching
            lordw = '\t<lord person_id="%s" vote="%s"%s>%s</lord>' % (lordid, contstate, tels, FixHTMLEntities(fss))

            if contstate == 'content':
                contentlords.append(lordw)
            else:
                notcontentlords.append(lordw)

    # now build up the return value
    stext = ['<divisioncount content="%d" not-content="%d"/>' % (len(contentlords), len(notcontentlords))]
    stext.append('<lordlist vote="content">')
    stext.extend(contentlords)
    stext.append('</lordlist>')
    stext.append('<lordlist vote="not-content">')
    stext.extend(notcontentlords)
    stext.append('</lordlist>')

    return stext

Exemplo n.º 7

0

Exibir arquivo

def NormalHeadingPart(headingtxt, stampurl, state, typ):
    """Classify a heading and wrap it in a speaker-less qspeech block.

    headingtxt is the raw heading (stray <i>/<sup> tags are removed first).
    stampurl supplies sdate and aname, which take part in the classification.
    state is a dict mutated here: the keys 'just_had_points_of_order' and
    'remaining_private_bills' are set, read and (for the former) deleted to
    carry context from one heading to the next.
    typ of 'westminhall' forces the result to be a minor heading.

    Returns the qspeech with typ set to one of 'minor-heading',
    'inserted-heading', 'oral-heading' or 'major-heading'.
    Raises ContextException for headings that loosely look like "Oral
    Answers to Questions" or "in the Chair" without matching precisely, and
    when the fixed-up heading still contains '<' or '>'.
    """
    # This is an attempt at major heading detection.
    # The main wrap code spots adjournment debates, and does its best with
    # some procedural things.  But it's pretty flawed.  Also, the Oral
    # questions heading is a super-major heading, so doesn't fit into the
    # scheme.

    # NOTE: inline (?i) flags are written at the start of each pattern;
    # Python 3.11+ rejects them anywhere else.

    # remove junk italic settings that appear in the today pages
    headingtxt = re.sub(r"(?i)</?(?:i|sup)>", "", headingtxt)

    # detect if this is a major heading and record it in the correct variable
    bmajorheading = False
    boralheading = False
    binsertedheading = False

    if re.search('(?i)-- lost heading --', headingtxt):
        binsertedheading = True

    # Oral question are really a major heading
    elif re.match("(?i)Oral Answers to Questions", headingtxt):
        boralheading = True

    # Check if there are any other spellings of "Oral Answers to Questions"
    # with a loose match; the listed dates have a genuine title with Oral
    # in it
    elif (re.search('(?i)oral', headingtxt)
          and re.search('(?i)ques', headingtxt)
          and not re.search(" Not ", headingtxt)
          and not re.search("electoral", headingtxt)
          and stampurl.sdate not in ("2002-06-11", "2012-02-09")):
        print(headingtxt)
        raise ContextException('Oral question match not precise enough',
                               stamp=stampurl,
                               fragment=headingtxt)

    # All upper case headings - UGH
    elif (not re.search('[a-z]', headingtxt)
          and not re.match(r'[A-Z\d/]+[\d/][A-Z\d/]+$', headingtxt)
          and not ('remaining_private_bills' in state
                   and re.search('(?i) Bill$', headingtxt))):
        bmajorheading = True

    elif 'just_had_points_of_order' in state:
        bmajorheading = True
        del state['just_had_points_of_order']

    # If this is labeled major, then it gets concatenated with the
    # subsequent major heading.  It's kind of a procedural info about the
    # running of things, so fair to have it as a minor heading alone.
    elif re.match(r"(?i)\[.*? in\s*the\s*Chair\.?\]$", headingtxt):
        bmajorheading = False

    elif re.search(r"(?i)in\s*the\s*chair", headingtxt):
        print(headingtxt)
        raise ContextException('in the chair match not precise enough',
                               stamp=stampurl,
                               fragment=headingtxt)

    # Other major headings, marked by _head in their anchor tag
    elif re.search('"topichd_|"ordayhd_|"hd_|_head', stampurl.aname):
        bmajorheading = True

    # Wah
    if stampurl.sdate > '2006-05-07':
        if re.match(
                "(?i)(Private business|Business of the House|Orders of the day|Opposition Day|Deferred Division|Petition)",
                headingtxt):
            bmajorheading = True
        if re.match("(?i)Points? of Order", headingtxt):
            bmajorheading = True
            state['just_had_points_of_order'] = True
        if re.match("(?i)Remaining Private Members[^ ]* Bills", headingtxt):
            bmajorheading = True
            state['remaining_private_bills'] = True

    # we're not writing a block for division headings
    # write out block for headings
    headingtxtfx = FixHTMLEntities(headingtxt)
    # Checked explicitly rather than with assert, so the check still runs
    # when Python is started with -O (the original comment notes a matching
    # assertion in gidmatching).
    if re.search("[<>]", headingtxtfx):
        raise ContextException('Tag found in heading text',
                               stamp=stampurl,
                               fragment=headingtxt)

    qb = qspeech('nospeaker="true"', headingtxtfx, stampurl)
    if typ == 'westminhall':
        qb.typ = 'minor-heading'
    elif binsertedheading:
        qb.typ = 'inserted-heading'
    elif boralheading:
        qb.typ = 'oral-heading'
    elif bmajorheading:
        qb.typ = 'major-heading'
    else:
        qb.typ = 'minor-heading'

    # headings become one unmarked paragraph of text
    qb.stext = [headingtxtfx]
    return qb

Exemplo n.º 8

0

Exibir arquivo

Arquivo: filter.py Projeto: JonathanBowker/parlparse

def _write_regmem_item(fout, subcategory, content):
    """Write one <item> line, tagged with subcategory when one is active."""
    if subcategory:
        fout.write('\t\t<item subcategory="%s">%s</item>\n' %
                   (subcategory, content))
    else:
        fout.write('\t\t<item>%s</item>\n' % content)


def RunRegmemFilters(fout, text, sdate, sdatever):
    """Convert a Register of Members' Interests HTML page to XML on fout.

    Registers dated '2010-09-01' or later are handed straight to
    RunRegmemFilters2010 (the only use made of sdatever here).  Otherwise
    the <TR> rows of text are cleaned up, split into <TD> cells and
    dispatched on the number of cells and which of them are empty: a
    one-cell row starts a new member, a two-cell row with a leading number
    starts a category, and the remaining shapes are items or (a)/(b)
    subcategory markers.

    After the rows are written, the members seen are compared with
    memberList.mpslistondate(sdate) and any missing or extra ids printed.
    Raises ContextException when a member line cannot be split or matched,
    when a category number cannot be parsed, or for an unknown row shape.
    """
    if sdate >= '2010-09-01':
        return RunRegmemFilters2010(fout, text, sdate, sdatever)

    # message for cron so I check I'm using this
    print("New register of members interests!  Check it is working properly (via mpinfoin.pl) - %s" % sdate)

    text = ApplyFixSubstitutions(text, sdate, fixsubs)

    WriteXMLHeader(fout)
    fout.write("<publicwhip>\n")

    text = re.sub('Rt Shaun', 'Shaun', text)  # Always get his name wrong
    text = re.sub('&#128;', '&#163;',
                  text)  # Always get some pound signs wrong

    # (pattern, replacement) pairs applied to every row, in this order
    rowcleanups = (
        (r"&nbsp;", " "),
        (r"<B>|</B>|<BR>|`", ""),
        (r'<span style="background-color: #FFFF00">|</span>', ''),
        (r'<IMG SRC="3lev.gif">', ""),
        (r"&#173;", "-"),
        (r'\[<A NAME="n\d+"><A HREF="\#note\d+">\d+</A>\]', ''),
        (r'\[<A NAME="n\d+">\d+\]', ''),
        # Fix incorrect tabling of categories when highlighting is in play
        (r'<TD COLSPAN=4>(\d\.) ([^<]*?)</TD>',
         r'<TD>\1</TD><TD COLSPAN=3>\2</TD>'),
    )
    rows = []
    for rawrow in re.findall("<TR>(.*)</TR>", text):
        for pattern, replacement in rowcleanups:
            rawrow = re.sub(pattern, replacement, rawrow)
        # split into cells within a row
        rows.append(re.findall(r"<TD.*?>\s*(.*?)\s*</TD>", rawrow))

    memberset = set()
    needmemberend = False
    category = None
    subcategory = None
    for row in rows:
        striprow = re.sub('</?[^>]+>', '', "".join(row))
        if striprow.strip() == "":
            # There is no text on the row, just tags
            pass
        elif len(row) == 1 and re.match("(?i)(<i>)? +(</i>)?", row[0]):
            # <TR><TD COLSPAN=4>&nbsp;</TD></TR>
            pass
        elif len(row) == 1:
            # <TR><TD COLSPAN=4><B>JACKSON, Robert (Wantage)</B></TD></TR>
            res = re.search(r"^([^,]*), ([^(]*) \((.*)\)$", row[0])
            if not res:
                print(row)
                raise ContextException(
                    "Failed to break up into first/last/cons: %s" % row[0])
            (lastname, firstname, constituency) = res.groups()
            constituency = constituency.replace(')', '')
            constituency = constituency.replace('(', '')
            firstname = memberList.striptitles(firstname)[0]

            # Register came out after they stood down
            if (firstname == 'Ian' and lastname == 'GIBSON' and sdate > '2009-06-08') \
                or (firstname == 'Michael' and lastname == 'MARTIN' and sdate > '2009-06-22'):
                check_date = '2009-06-08'
            else:
                check_date = sdate
            (personid, remadename, remadecons) = memberList.matchfullnamecons(
                firstname + " " + memberList.lowercaselastname(lastname),
                constituency, check_date)
            if not personid:
                raise ContextException(
                    "Failed to match name %s %s (%s) date %s" %
                    (firstname, lastname, constituency, sdate))

            # close whatever the previous member left open
            if category:
                fout.write('\t</category>\n')
            if needmemberend:
                fout.write('</regmem>\n')
                needmemberend = False
            # NOTE(review): this one write passes a latin-1 encoded byte
            # string while every other write passes a plain string; that is
            # fine on Python 2 but needs a decision before a Python 3 port.
            fout.write(('<regmem personid="%s" membername="%s" date="%s">\n' %
                        (personid, remadename, sdate)).encode("latin-1"))
            memberset.add(personid)
            needmemberend = True
            category = None
            subcategory = None
        elif len(row) == 2 and row[0] == '' and re.match(r'Nil\.\.?', row[1]):
            # <TR><TD></TD><TD COLSPAN=3><B>Nil.</B></TD></TR>
            fout.write('Nil.\n')
        elif len(row) == 2 and row[0] != '':
            # <TR><TD><B>1.</B></TD><TD COLSPAN=3><B>Remunerated directorships</B></TD></TR>
            if category:
                fout.write('\t</category>\n')
            digitsmatch = re.match(r"\s*(\d\d?)\.$", row[0])
            if not digitsmatch:
                # previously an AttributeError on None.group
                print(row)
                raise ContextException(
                    "Failed to parse category number: %s" % row[0])
            category = digitsmatch.group(1)
            categoryname = row[1]
            subcategory = None
            fout.write('\t<category type="%s" name="%s">\n' %
                       (category, categoryname))
        elif len(row) == 2 and row[0] == '':
            # <TR><TD></TD><TD COLSPAN=3><B>Donations to the Office of the Leader of the Liberal Democrats received from:</B></TD></TR>
            _write_regmem_item(fout, subcategory, FixHTMLEntities(row[1]))
        elif len(row) == 3 and row[0] == '' and row[1] == '':
            # <TR><TD></TD><TD></TD><TD COLSPAN=2>19 and 20 September 2002, two days fishing on the River Tay in Scotland as a guest of Scottish Coal. (Registered 3 October 2002)</TD></TR>
            _write_regmem_item(fout, subcategory, FixHTMLEntities(row[2]))
        elif len(row) == 3 and row[0] == '':
            # <TR><TD></TD><TD><B>(a)</B></TD><TD COLSPAN=2>Smithville Associates; training consultancy.</TD></TR>
            _write_regmem_item(fout, subcategory,
                               FixHTMLEntities(row[1] + ' ' + row[2]))
        elif len(row) == 4 and row[0] == '' and (row[1] == '' or row[1]
                                                 == '<IMG SRC="3lev.gif">'):
            # <TR><TD></TD><TD></TD><TD>(b)</TD><TD>Great Portland Estates PLC</TD></TR>
            subcategorymatch = re.match(r"\(([ab])\)$", row[2])
            if subcategorymatch:
                # an (a)/(b) marker stays in force for the following items
                subcategory = subcategorymatch.group(1)
                fout.write('\t\t(%s)\n' % subcategory)
                _write_regmem_item(fout, subcategory,
                                   FixHTMLEntities(row[3]))
            else:
                _write_regmem_item(fout, subcategory,
                                   FixHTMLEntities(row[2] + " " + row[3]))
        else:
            print(row)
            raise ContextException(
                "Unknown row type match, length %d" % (len(row)))
    if category:
        fout.write('\t</category>\n')
    if needmemberend:
        fout.write('</regmem>\n')

    membersetexpect = set(
        [m['person_id'] for m in memberList.mpslistondate(sdate)])

    # check for missing/extra entries
    # (the message ends in a newline, so the Python 2 print statement put no
    # space before the set; the single format string reproduces that)
    missing = membersetexpect.difference(memberset)
    if len(missing) > 0:
        print("Missing %d MP entries:\n%s" % (len(missing), missing))
    extra = memberset.difference(membersetexpect)
    if len(extra) > 0:
        print("Extra %d MP entries:\n%s" % (len(extra), extra))

    fout.write("</publicwhip>\n")

Exemplo n.º 9

0

Exibir arquivo

Arquivo: division.py Projeto: samknight/parlparse

def MpList(fsm, vote, stampurl, sdate):
    """Turn the voter lines of a division into <mpname> XML lines.

    fsm is the sequence of text lines for one side of the division.  Lines
    matched by reconstnm (a constituency on its own) are joined onto the
    previous line, run-together names are split apart, and each name is
    resolved with memberList.matchfullnamecons for sdate.  A name that
    matches several people ("MultipleMatch") is given the candidate ids in
    turn, and every candidate must be used up by the end of the list.

    Returns the list of generated XML lines.
    Raises ContextException when a constituency line has no line before it,
    a name cannot be parsed or matched, or an ambiguous name appears more or
    fewer times than it has candidates.
    """
    # Merge lone listed constituencies onto end of previous line
    newfsm = []
    for fss in fsm:
        if not fss:
            continue
        if reconstnm.match(fss):
            if not newfsm:
                # previously an IndexError on newfsm[-1]
                raise ContextException(
                    "constituency line with no preceding name (division)",
                    stamp=stampurl,
                    fragment=fss)
            newfsm[-1] += " " + fss
        else:
            newfsm.append(fss)

    res = []
    multimatches = {}  # from tuple to number of matches accounted, and name

    for fss in newfsm:
        # break up concattenated lines
        # Beresford, Sir PaulBlunt, Crispin
        while re.search(r'\S', fss):
            # there was an & in [A-Z] on line below, but it broke up this incorrectly:
            # Simon, Si&#244;n <i>(B'ham Erdington)</i>
            regsep = re.search(r'(.*?,.*?(?:[a-z]|</i>|\.|\)))([A-Z].*?,.*)$',
                               fss)
            if regsep and re.search('  Mc$', regsep.group(1)):
                # the split landed inside a "Mc..." surname; reject it
                regsep = None
            if not regsep:
                # fall back to two names separated by a double space
                regsep = re.match('(.*?,.*?)  ([A-Z].*?,.*)$', fss)
            if regsep:
                fssf = regsep.group(1)
                fss = regsep.group(2)
            else:
                fssf = fss
                fss = ''

            # (no alphabetical-order check: "rh" and so on confound it)

            # flipround the name
            # Bradley, rh Keith <i>(Withington)</i>
            # Simon, Sio(r)n <i>(Withington)</i>
            ginp = reflipname.match(fssf)
            if ginp:
                fnam = '%s %s' % (ginp.group(2), ginp.group(1))
                cons = ginp.group(3)

            # name not being flipped, is firstname lastname
            else:
                ginp = renoflipname.match(fssf)
                if not ginp:
                    raise ContextException(
                        "No flipped or non-flipped name match (division)",
                        stamp=stampurl,
                        fragment=fssf)
                fnam = ginp.group(1)
                cons = ginp.group(2)

            (mpid, remadename,
             remadecons) = memberList.matchfullnamecons(fnam,
                                                        cons,
                                                        sdate,
                                                        alwaysmatchcons=False)
            if not mpid and remadename == "MultipleMatch":
                assert isinstance(remadecons, tuple)  # actually the list of ids
                i = len(multimatches.setdefault(remadecons,
                                                []))  # the index we work with
                if i >= len(remadecons):
                    print("Name %s used too many times for list %s where other instances are %s" %
                          (fnam, remadecons, multimatches[remadecons]))
                    raise ContextException("Too many instances",
                                           stamp=stampurl,
                                           fragment=fnam)
                mpid = remadecons[i]
                multimatches[remadecons].append(fnam)

                # appears with multiple matching which is ignorable when both ambiguous people vote on same side of a division

            elif not mpid and remadename != "MultipleMatch":
                print("division.py: no match for %s %s %s" %
                      (fnam, cons, sdate))
                raise ContextException("No match on name",
                                       stamp=stampurl,
                                       fragment=fnam)
            res.append('\t<mpname person_id="%s" vote="%s">%s</mpname>' %
                       (mpid, vote, FixHTMLEntities(fssf)))

    # now we have to check if the multimatched names were all exhausted
    for ids in multimatches:
        if len(multimatches[ids]) != len(ids):
            print("Insufficient vote matches on name %s ids taken to %s" %
                  (multimatches[ids], ids))
            raise ContextException("Not enough vote match on ambiguous name",
                                   stamp=stampurl,
                                   fragment=multimatches[ids][0])
    return res

Exemplo n.º 10

0

Exibir arquivo

Arquivo: ques.py Projeto: samknight/parlparse

def FilterQuestion(qs, sdate, lords):
    """Turn the text of one written question into a list of <p> strings.

    qs supplies the question text (qs.text) and its stamp (qs.sstampurl).
    The text is split into paragraphs with SplitParaIndents and the question
    number of each is pulled out with ExtractQnum.

    When lords is true, every paragraph becomes its own <p qnum="...">,
    except that an opening "asked Her Majesty's Government" / "asked the"
    paragraph is emitted without a qnum.  Otherwise a multi-paragraph
    question is treated as a lead-in followed by numbered parts "(1)",
    "(2)", ..., and a single paragraph as one plain question; in both cases
    the text is run through PhraseTokenize.

    Raises ContextException when there are no paragraphs, when the first
    paragraph of a multi-part question has no "(1)" and none can be
    inserted, or when a later paragraph does not start with a number.
    """
    text = qs.text
    stampurl = qs.sstampurl

    # split into paragraphs.  The second results is a parallel array of bools
    (textp, textpindent) = SplitParaIndents(text, stampurl)
    if not textp:
        raise ContextException('no paragraphs in result',
                               stamp=stampurl,
                               fragment=text)

    textn = []

    # special case exceptions.  Indented text in questions nearly always marks numbered sections
    # - rarely is it quoted text like this:
    # 2002-11-07 - happened again.  Did a patch.
    if sdate == '2004-01-05' and len(textp) > 1 and re.search(
            '"Given that 98.5 per cent', text):
        # if this happens a lot - do this properly, so the indented bit gets its own paragraph
        textp = (" ".join(textp), )

    # I /think/ this is to match Lords written answers
    if lords:
        stext = []
        start = 0
        # inline flags lead the pattern: Python 3.11+ rejects them elsewhere
        if re.match(
                r"(?i)asked Her Majesty(&#039;|&#146;|')s Government|asked the ",
                textp[0]):
            stext.append('<p>%s</p>' % FixHTMLEntities(textp[0]))
            start = 1
        for para in textp[start:]:
            eqnum = ExtractQnum(para, stampurl)
            stext.append('<p qnum="%s">%s</p>' %
                         (eqnum[1], FixHTMLEntities(eqnum[0])))
        return stext

    # multi-part type
    if len(textp) > 1:
        # find the first (1)
        gbone = re.search(r'\(1\)', textp[0])
        if not gbone:
            # no explicit (1): insert one straight after the "To ask the
            # <department>" opening so the parts number consistently
            m = re.match(
                '(?i)To ask the ((Secretary|Minister) of State,? (Ministry of|for( the)?) )?(%s),? '
                % '|'.join(parlPhrases.wransmajorheadings.keys()), textp[0])
            if not m:
                raise ContextException('no (1) in first multipart para',
                                       fragment=text,
                                       stamp=stampurl)
            textp[0] = textp[0][:m.end()] + '(1) ' + textp[0][m.end():]
            gbone = re.search(r'\(1\)', textp[0])
        textn.append((textp[0][:gbone.span(0)[0]], ''))
        eqnum = ExtractQnum(textp[0][gbone.span(0)[1]:], stampurl)
        textn.append(eqnum)

        # scan through the rest of the numbered paragraphs
        for para in textp[1:]:
            gbnum = re.search(r'^\((\d+)\)', para)
            if not gbnum:
                raise ContextException('no number match in paragraph',
                                       fragment=para,
                                       stamp=stampurl)
            # MPS 2007-06-22: the printed number is not checked for being
            # consecutive; parts are renumbered by position on output
            eqnum = ExtractQnum(para[gbnum.span(0)[1]:], stampurl)
            textn.append(eqnum)

    # single paragraph type
    else:
        eqnum = ExtractQnum(textp[0], stampurl)
        textn.append(eqnum)

    # put the paragraphs back in together, with their numbering
    # should do some blocking out of this, especially the "to ask" phrase.
    pht = PhraseTokenize(qs, textn[0][0])
    firstpara = re.sub('</?p[^>]*>', '', pht.GetPara(''))

    if len(textn) > 1:
        stext = ['<p>%s</p>' % firstpara]
        for i in range(1, len(textn)):
            pht = PhraseTokenize(qs, textn[i][0])
            stext.append(
                '<p class="numindent" qnum="%s">(%d) %s</p>' %
                (textn[i][1], i, re.sub('</?p[^>]*>', '', pht.GetPara(''))))

    else:
        stext = ['<p qnum="%s">%s</p>' % (textn[0][1], firstpara)]

    return stext

Exemplo n.º 11

0

Exibir arquivo

def ParseTable(lstable, stampur):
    """Convert one HTML <table>...</table> chunk into a list of XML lines.

    lstable must be a single, un-nested table.  Its rows are split out
    (tolerating unclosed <tr>), any text before the first row becomes a
    <caption>, the rows before the first one containing a <td> become the
    <thead> and the rest the <tbody>; each row is rendered by ParseRow.
    stampur is passed on to ParseRow, FixHTMLEntities and any exception.

    Returns the list of output lines, from '<table>' to '</table>'.
    Raises ContextException when the table bracketing is missing, when there
    is stray text between rows, or when the title has an unexpected shape;
    raises Exception when a nested table tag is found.
    """
    # NOTE: inline (?i) flags are written at the start of each pattern;
    # Python 3.11+ rejects them anywhere else.

    # remove the table bracketing
    stable = re.match(r'(?i)<table[^>]*>\s*([\s\S]*?)\s*</table>$', lstable)
    if not stable:
        # report the offending input (the failed match itself is just None)
        raise ContextException('Missing </table> somewhere...',
                               stamp=stampur,
                               fragment=lstable)
    stable = stable.group(1)
    if re.search('(?i)<table[^>]*>|</table>', stable):
        print(lstable)
        raise Exception('Double <table> start tag in table parse chunk')

    # break into rows, making sure we can deal with non-closed <tr> symbols
    sprows = re.split(r'(?i)(<tr[^>]*>[\s\S]*?(?:</tr>|(?=<tr[^>]*>)))',
                      stable)

    # build the rows
    stitle = ''
    srows = []
    for sprow in sprows:
        trg = re.match(r'(?i)<tr[^>]*>([\s\S]*?)(?:</tr>)?$', sprow)

        if trg:
            srows.append(trg.group(1))

        elif re.search(r'\S', sprow):
            # text outside any row: the first such piece before any row is
            # the title; anything else must be nothing but closing junk
            if (not srows) and (not stitle):
                stitle = sprow
            elif not re.match(
                    r'(?i)(?:</t[dhr]>|</font>|</?tbody>|</?thead>|\s)*$',
                    sprow):
                raise ContextException("non-row text",
                                       stamp=stampur,
                                       fragment=sprow)

    # take out tags round the title; they're always out of order
    stitle = re.sub(
        '(?i)</?font[^>]*>|</?p>|</?i>|<br>|<tbody>|</?thead>|&nbsp;', '',
        stitle).strip()
    ctitle = ''
    if stitle:
        ts = re.match(
            r'(?i)(?:\s|<b>|<center>)+([\s\S]*?)(?:</b>|</center>)+\s*([\s\S]*?)\s*$',
            stitle)
        if not ts:
            raise ContextException(' non-standard table title: %s ' % stitle,
                                   stamp=stampur,
                                   fragment=stitle)
        # NOTE(review): the pattern strings handed to FixHTMLEntities are
        # left exactly as they were, trailing (?i) included, because how
        # that function uses them is not visible from here.
        captionparts = ['\t<caption>']
        captionparts.append(
            FixHTMLEntities(ts.group(1),
                            '</?font[^>]*>|</?p>|\n(?i)',
                            stampurl=stampur))
        if ts.group(2):
            captionparts.append(' -- ')
            captionparts.append(
                FixHTMLEntities(ts.group(2),
                                '</?font[^>]*>|</?p>|\n(?i)',
                                stampurl=stampur))
        captionparts.append('</caption>')
        ctitle = ''.join(captionparts)

    # split into header and body: ih is the index of the first row holding
    # a <td>.  It starts at 0 so that a table with no rows at all no longer
    # fails on an unbound ih; when no row holds a <td>, ih ends on the last
    # row, as it always did.
    ih = 0
    for ih, srow in enumerate(srows):
        if re.search('(?i)<td[^>]*>', srow):
            break

    # construct the text for writing the table
    res = ['<table>']
    if ctitle:
        res.append(ctitle)

    if ih > 0:
        res.append('\t<thead>')
        for srow in srows[:ih]:
            res.append(ParseRow(srow, 'th', stampur))
        res.append('\t</thead>')

    res.append('\t<tbody>')
    for srow in srows[ih:]:
        res.append(ParseRow(srow, 'td', stampur))
    res.append('\t</tbody>')

    res.append('</table>')

    return res

Exemplo n.º 12

0

Exibir arquivo

Arquivo: sections.py Projeto: samknight/parlparse

def _WransHeadingSpeech(mark, headingfx, stamp, typ):
    """Build a heading pseudo-speech of type *typ* whose only text is *headingfx*."""
    qbh = qspeech(mark, headingfx, stamp)
    qbh.typ = typ
    qbh.stext = [headingfx]
    return qbh


def FilterWransSections(text, sdate, lords=False):
    """Flatten written-answers text into one list of speech objects.

    The text is run through ApplyFixSubstitutions and SplitHeadingsSpeakers,
    and the resulting (heading, unspokentext, [(speaker, text)]) triplets are
    walked starting at the index returned by StripWransHeadings.  Each
    triplet contributes either a single 'major-heading' entry, or a
    'minor-heading' entry followed by alternating 'ques' / 'reply' entries.

    Args:
        text: the raw text to split up.
        sdate: date string; compared as a string against '2006-05-07'
            (presumably ISO 'YYYY-MM-DD' -- confirm against callers).
        lords: passed straight through to FilterQuestion.

    Returns:
        A list of qspeech objects with .typ and .stext filled in.

    Raises:
        ContextException: on unspoken text under a heading, an unrecognised
            or broken heading, a reply with no preceding question, or a
            question left without a reply at the end of a heading.
        Exception: when a heading listed in parlPhrases.wransmajorheadings
            has speeches under it.
    """
    text = ApplyFixSubstitutions(text, sdate, fixsubs)
    headspeak = SplitHeadingsSpeakers(text)

    # break down into lists of headings and lists of speeches
    (ih, stampurl) = StripWransHeadings(headspeak, sdate)

    # Patterns used for every speech; compiled once up front.  The
    # case-insensitive flag is passed explicitly rather than as a trailing
    # "(?i)", which newer versions of the re module reject.
    requestion = re.compile(
        r"(?:<[^>]*?>|\s)*?"
        r"(to ask|asked (Her Majesty(&#039;|&#146;|')s Government|the ))",
        re.I)
    reqnum = re.compile(r'\[(?:HL)?(\d+)R?\]')

    # full list of question batches, flattened into one list of speeches
    flatb = []
    # For when they put another "Written Answers to Questions" and date
    justhadnewtitle = False
    for sht in headspeak[ih:]:
        # triplet of ( heading, unspokentext, [(speaker, text)] )
        # we're getting stamps inside the headings sometimes
        headingtxt = stampurl.UpdateStampUrl(sht[0].strip())
        unspoketxt = sht[1]
        speechestxt = sht[2]

        # the pre-spoken text may only hold tags and whitespace;
        # update the stamps from it
        if not re.match(r'(?:<[^>]*>|\s)*$', unspoketxt):
            raise ContextException("unspoken text under heading in wrans",
                                   stamp=stampurl,
                                   fragment=unspoketxt)
        stampurl.UpdateStampUrl(unspoketxt)

        # detect if this is a major heading: no lower-case letters and no
        # speeches underneath
        if not re.search('[a-z]', headingtxt) and not speechestxt:
            if headingtxt not in parlPhrases.wransmajorheadings:
                raise ContextException(
                    "unrecognized major heading, please add to parlPhrases.wransmajorheadings (a)",
                    fragment=headingtxt,
                    stamp=stampurl)
            # no need to fix entities since the text comes from a map
            flatb.append(_WransHeadingSpeech(
                'nospeaker="true"',
                parlPhrases.wransmajorheadings[headingtxt],
                stampurl, 'major-heading'))
            continue
        elif not speechestxt and sdate > '2006-05-07':
            # after this date a speechless heading in mixed case is looked
            # up by its upper-cased form
            if headingtxt == 'Written Answers to Questions':
                justhadnewtitle = True
                continue
            if headingtxt.upper() not in parlPhrases.wransmajorheadings:
                # the line straight after a repeated title is skipped
                # rather than treated as an error
                if justhadnewtitle:
                    justhadnewtitle = False
                    continue
                raise ContextException(
                    "unrecognized major heading, please add to parlPhrases.wransmajorheadings (b)",
                    fragment=headingtxt,
                    stamp=stampurl)
            # no need to fix entities since the text comes from a map
            flatb.append(_WransHeadingSpeech(
                'nospeaker="true"',
                parlPhrases.wransmajorheadings[headingtxt.upper()],
                stampurl, 'major-heading'))
            justhadnewtitle = False
            continue
        elif not speechestxt:
            raise ContextException('broken heading %s' % headingtxt,
                                   stamp=stampurl,
                                   fragment=headingtxt)

        # non-major heading; to a question batch
        if headingtxt in parlPhrases.wransmajorheadings:
            raise Exception(' speeches found in major heading %s' % headingtxt)

        headingtxtfx = FixHTMLEntities(headingtxt)
        headingmark = 'nospeaker="true"'
        bNextStartofQ = True

        # go through each of the speeches in a block and put it into our
        # batch of speeches
        for ss in speechestxt:
            qb = qspeech(ss[0], ss[1], stampurl)
            lqnums = reqnum.findall(ss[1])

            # question posed
            if requestion.match(qb.text) or '<wrans-question>' in qb.text:
                qb.text = qb.text.replace('<wrans-question>', '')
                qb.typ = 'ques'

                # put out the heading for this question-reply block.
                # we don't assert true since we can have multiple questions
                # answered in a block.
                if bNextStartofQ:
                    # the heading takes the same stampurl as the first
                    # question
                    flatb.append(_WransHeadingSpeech(
                        headingmark, headingtxtfx, qb.sstampurl,
                        'minor-heading'))
                    bNextStartofQ = False

                    # marks that subsequent headings in this block were
                    # created here and weren't in the original text
                    headingmark = 'nospeaker="true" inserted-heading="true"'

                qb.stext = FilterQuestion(qb, sdate, lords)
                if not lqnums:
                    errmess = ' <p class="error">Question number missing in Hansard, possibly truncated question.</p> '
                    qb.stext.append(errmess)

                flatb.append(qb)

            # do the reply
            else:
                if bNextStartofQ:
                    raise ContextException('start of question expected',
                                           stamp=qb.sstampurl,
                                           fragment=qb.text)
                qb.typ = 'reply'

                # this case is so rare we flag them in the corrections of
                # the html with this tag
                if "<another-answer-to-follow>" in qb.text:
                    qb.text = qb.text.replace("<another-answer-to-follow>", "")
                else:
                    bNextStartofQ = True

                # question numbers repeated inside an answer are
                # deliberately not checked
                qb.stext = FilterReply(qb)
                flatb.append(qb)

        if not bNextStartofQ:
            # dump the whole batch to help locate the unanswered question
            print(speechestxt)
            # Note - not sure if this should be speechestxt[-1][1] here.
            # Does what I want for now...
            raise ContextException("missing answer to question",
                                   stamp=stampurl,
                                   fragment=speechestxt[-1][1])

    # we now have everything flattened out in a series of speeches,
    # where some of the speeches are headings (inserted and otherwise).
    return flatb