Пример #1
0
                '\\2\\1\\3', 1, '2003-02-24'),

        ( '(<B> Mr\. Spellar)\)', '\\1', 1, '2003-03-31'),

        # wrong constituency in debates
        ( 'Sir Archy Kirkwood  \(Brecon and Radnorshire\)', 'Sir Archy Kirkwood (Roxburgh and Berwickshire)', 1, '2003-06-26'),

]

# 2. <B> Mr. Colin Breed  (South-East Cornwall)</B> (LD):
# <B> Mr. Hutton: </B>
# 2. <stamp aname="40205-06_para4"/><B> Mr. Colin Breed</B>:

# Q4.  [161707]<a name="40317-03_wqn5"><B> Mr. Andy Reed  (Loughborough)</B>

# Alternation of lower-cased party names, spliced into the speaker regexes.
# recomb is compiled with the verbose (?x) flag, which discards bare
# whitespace inside the pattern, so every space is written as the character
# class [ ]; otherwise multi-word entries such as "in the chair" could never
# match text that actually contains spaces.
parties = "|".join(name.lower().replace(' ', '[ ]') for name in memberList.partylist()) + "|uup|ld|dup|in[ ]the[ ]chair"

# Splitting condition: matches one speaker heading, wrapped in a single
# capturing group.
# This must remain a generalization of the pattern that follows it in the
# file, so any change made here needs to be reflected there as well.
# The pattern is a raw string so that the regex escapes (\d, \s, \., \( ...)
# reach the re module verbatim rather than relying on Python passing unknown
# string escapes through unchanged. The trailing comments live inside the
# pattern and are ignored by the verbose (?x) flag.
recomb = re.compile(r'''(?ix)(
    (?:[QT]?\d+\.\s*)?(?:\[\d+\]\s*)?   # optional question number, then optional [reference number]
    (?:<stamp\saname="[^"]*"/>\s*)?     # optional stamp before <b>
    <b>
    (?:<stamp\saname="[^"]*"/>)*        # any stamps straight after <b>
    [^<]*                               # bold text up to the next tag
    </b>(?!</h[34]>)                    # closing bold, unless an h3/h4 close follows
    \s*\)?                              # optional stray closing bracket
    (?:\s*\([^)]*\))?                   # optional bracketed text
    (?:\s*\((?:%s)\))?                  # optional bracketed party
    \s*:?                               # optional trailing colon
    )''' % parties)

Пример #2
0
                '\\2\\1\\3', 1, '2003-02-24'),

        ( '(<B> Mr\. Spellar)\)', '\\1', 1, '2003-03-31'),

        # wrong constituency in debates
        ( 'Sir Archy Kirkwood  \(Brecon and Radnorshire\)', 'Sir Archy Kirkwood (Roxburgh and Berwickshire)', 1, '2003-06-26'),

]

# 2. <B> Mr. Colin Breed  (South-East Cornwall)</B> (LD):
# <B> Mr. Hutton: </B>
# 2. <stamp aname="40205-06_para4"/><B> Mr. Colin Breed</B>:

# Q4.  [161707]<a name="40317-03_wqn5"><B> Mr. Andy Reed  (Loughborough)</B>

# Party alternation for the speaker patterns: every name from the member list
# is lower-cased and has its spaces rewritten as [ ], then the fixed
# abbreviations are appended to the end of the alternation.
parties = "|".join(
    party.lower().replace(' ', '[ ]') for party in memberList.partylist()
) + "|uup|ld|dup|in[ ]the[ ]chair|ind|sdlp|snp|con|lab|pc|lab/[ ]?co-op"

# Regex fragment for a <p class="tabletext"> paragraph whose only content is
# one bold run of tag-free text. The space is spelled [ ] so it stays a
# literal space even inside a verbose (re.X) pattern.
# NOTE(review): where this fragment is used is not visible in this excerpt.
retabletext = (
    '<p[ ]class="tabletext">'
    '<b>[^<]*</b>'
    '</p>'
)
# Splitting condition
# this must be a generalization of the one below.  so changes need to be reflected in both.
respeaker = '''
    (?:[QT]?\d+\.\s*)?(?:\[\d+\]\s*)?  # Question number before speaker name
    (?:<stamp\saname="[^"]*"/>\s*)?    # Stamps before <b>
    <b>
    (?:<stamp\saname="[^"]*"/>)*       # Stamps after <b>
    [^<]*                              # Any non-HTML
    </b>(?!</h[34]>)                   # End bold as long as not followed by end heading
    \s*\)?                             # Possible random closing bracket
    (?:\s*\([^)]*\))?                  # Possible string in brackets (e.g. constituency)
    (?:\s*\((?:%s)\))?                 # Possible party string in brackets
    \s*:?                              # Possible colon at the end