def saveStatement(self, t): def mcUp(match): return 'Mc' + match.group(1).upper() if t['topic']: # Question No. 139-- -> Question No. 139 t['topic'] = re.sub(r'\-+$', '', t['topic']) t['topic'] = re.sub(r"'S", "'s", t['topic']) t['topic'] = re.sub(r'Mc([a-z])', mcUp, t['topic']) if t.hasText(): if not t['member_title']: t['member_title'] = 'Proceedings' print "WARNING: No title for %s" % t.getText() timestamp = t['timestamp'] if not isinstance(timestamp, datetime.datetime): # The older parser provides only datetime.time objects timestamp = datetime.datetime.combine(self.date, timestamp) statement = Statement(hansard=self.hansard, heading=t['heading'], topic=t['topic'], time=timestamp, member=t['member'], politician=t['politician'], who=t['member_title'], text=t.getText(), sequence=self.statement_index, written_question=bool(t['written_question'])) if r_notamember.search(t['member_title']) and 'Speaker' in t['member_title']: statement.speaker = True self.statement_index += 1 self.statements.append(statement) if ENABLE_PRINT: print u"HEADING: %s" % t['heading'] print u"TOPIC: %s" % t['topic'] print u"MEMBER TITLE: %s" % t['member_title'] print u"MEMBER: %s" % t['member'] print u"TIME: %s" % t['timestamp'] print u"TEXT: %s" % t.getText() if ENABLE_READLINE: sys.stdin.readline() t.onward()
def saveStatement(self, t):
    """Turn the current parse context ``t`` into a Statement and queue it.

    Normalizes the topic heading, resolves the timestamp to a full
    datetime, appends the Statement to ``self.statements``, and always
    advances the parse context with ``t.onward()``.
    """
    def mcUp(match):
        # Uppercase the letter following "Mc" (e.g. Mcdonald -> McDonald).
        return 'Mc' + match.group(1).upper()
    if t['topic']:
        # Question No. 139-- -> Question No. 139
        t['topic'] = re.sub(r'\-+$', '', t['topic'])
        # Fix possessives coming from all-caps headings: 'S -> 's
        t['topic'] = re.sub(r"'S", "'s", t['topic'])
        t['topic'] = re.sub(r'Mc([a-z])', mcUp, t['topic'])
    if t.hasText():
        if not t['member_title']:
            t['member_title'] = 'Proceedings'
            # ascii/'replace' keeps this debug warning from raising
            # UnicodeEncodeError on non-ASCII statement text.
            print "WARNING: No title for %s" % t.getText().encode(
                'ascii', 'replace')
        timestamp = t['timestamp']
        if not isinstance(timestamp, datetime.datetime):
            # The older parser provides only datetime.time objects
            timestamp = datetime.datetime.combine(self.date, timestamp)
        statement = Statement(hansard=self.hansard,
            heading=t['heading'],
            topic=t['topic'],
            time=timestamp,
            member=t['member'],
            politician=t['politician'],
            who=t['member_title'],
            text=t.getText(),
            sequence=self.statement_index,
            written_question=bool(t['written_question']))
        # Flag statements delivered from the chair (non-member titles
        # mentioning the Speaker or The Chair).
        if r_notamember.search(t['member_title'])\
           and ('Speaker' in t['member_title'] or 'The Chair' in t['member_title']):
            statement.speaker = True
        self.statement_index += 1
        self.statements.append(statement)
        if ENABLE_PRINT:
            # Interactive debugging dump of the parsed statement.
            print u"HEADING: %s" % t['heading']
            print u"TOPIC: %s" % t['topic']
            print u"MEMBER TITLE: %s" % t['member_title']
            print u"MEMBER: %s" % t['member']
            print u"TIME: %s" % t['timestamp']
            print u"TEXT: %s" % t.getText()
        if ENABLE_READLINE:
            # Pause for Enter between statements when stepping through.
            sys.stdin.readline()
    t.onward()
def saveProceedingsStatement(self, text, t):
    """Queue an unattributed procedural statement built from raw ``text``.

    Whitespace and quotes are normalized; empty text is silently ignored.
    The timestamp is taken from the parse context ``t``.
    """
    text = parsetools.sane_quotes(parsetools.tameWhitespace(text.strip()))
    if len(text):
        timestamp = t['timestamp']
        if not isinstance(timestamp, datetime.datetime):
            # FIX: the older parser provides only datetime.time objects,
            # but newer contexts supply a full datetime, and
            # datetime.combine raises TypeError on a datetime argument.
            # Mirror the normalization done in saveStatement.
            timestamp = datetime.datetime.combine(self.date, timestamp)
        statement = Statement(hansard=self.hansard,
            time=timestamp,
            text=text,
            sequence=self.statement_index,
            who='Proceedings')
        self.statement_index += 1
        self.statements.append(statement)
def saveProceedingsStatement(self, text, t):
    """Queue an unattributed procedural statement built from raw ``text``.

    Whitespace and quotes are normalized; empty text is silently ignored.
    The timestamp is taken from the parse context ``t``.
    """
    cleaned = parsetools.sane_quotes(parsetools.tameWhitespace(text.strip()))
    if not cleaned:
        return
    when = t['timestamp']
    if not isinstance(when, datetime.datetime):
        # The older parser provides only datetime.time objects; anchor
        # them to the sitting date.
        when = datetime.datetime.combine(self.date, when)
    statement = Statement(
        hansard=self.hansard,
        time=when,
        text=cleaned,
        sequence=self.statement_index,
        who='Proceedings',
    )
    self.statement_index += 1
    self.statements.append(statement)
def import_document(document, interactive=True, reimport_preserving_sequence=False): old_statements = None if document.statement_set.all().exists(): if reimport_preserving_sequence: if OldSequenceMapping.objects.filter(document=document).exists(): logger.error("Sequence mapping already exits for %s" % document) return old_statements = list(document.statement_set.all()) document.statement_set.all().delete() else: if not interactive: return sys.stderr.write("Statements already exist for %r.\nDelete them? (y/n) " % document) if raw_input().strip() != 'y': return document.statement_set.all().delete() document.download() xml_en = document.get_cached_xml('en') pdoc_en = alpheus.parse_file(xml_en) xml_en.close() xml_fr = document.get_cached_xml('fr') pdoc_fr = alpheus.parse_file(xml_fr) xml_fr.close() if document.date and document.date != pdoc_en.meta['date']: # Sometimes they get the date wrong if document.date != pdoc_fr.meta['date']: logger.error("Date mismatch on document #%s: %s %s" % ( document.id, document.date, pdoc_en.meta['date'])) else: document.date = pdoc_en.meta['date'] document.number = pdoc_en.meta['document_number'] document.public = True statements = [] for pstate in pdoc_en.statements: s = Statement( document=document, sequence=len(statements), content_en=pstate.content, time=pstate.meta['timestamp']) s.source_id = pstate.meta['id'] s.h1 = pstate.meta.get('h1', '') s.h2 = pstate.meta.get('h2', '') s.h3 = pstate.meta.get('h3', '') if s.h3 and not s.h2: s.h2 = s.h3 s.h3 = '' s.who = pstate.meta.get('person_attribution', '') s.who_hocid = int(pstate.meta['person_id']) if pstate.meta.get('person_id') else None s.who_context = pstate.meta.get('person_context', '') s.statement_type = pstate.meta.get('intervention_type', '').lower() s.written_question = pstate.meta.get('written_question', '').upper()[:1] if s.who_hocid and not pstate.meta.get('person_type'): # At the moment. person_type is only set if we know the person # is a non-politician. 
This might change... try: s.politician = Politician.objects.get_by_parl_id(s.who_hocid, session=document.session) s.member = ElectedMember.objects.get_by_pol(s.politician, date=document.date) except Politician.DoesNotExist: logger.info("Could not resolve speaking politician ID %s for %r" % (s.who_hocid, s.who)) s._related_pols = set() s._related_bills = set() s.content_en = _process_related_links(s.content_en, s) statements.append(s) if len(statements) != len(pdoc_fr.statements): logger.info("French and English statement counts don't match for %r" % document) _r_paragraphs = re.compile(ur'<p[^>]* data-HoCid=.+?</p>') _r_paragraph_id = re.compile(ur'<p[^>]* data-HoCid="(?P<id>\d+)"') fr_paragraphs = dict() def _get_paragraph_id(p): return int(_r_paragraph_id.match(p).group('id')) for st in pdoc_fr.statements: for p in _r_paragraphs.findall(st.content): fr_paragraphs[_get_paragraph_id(p)] = p def _substitute_french_content(match): try: return fr_paragraphs[_get_paragraph_id(match.group(0))] except KeyError: logger.error("Paragraph ID %s not found in French for %s" % (match.group(0), document)) return match.group(0) for st in statements: st.content_fr = _process_related_links( _r_paragraphs.sub(_substitute_french_content, st.content_en), st ) document.multilingual = True Statement.set_slugs(statements) if old_statements: for mapping in _align_sequences(statements, old_statements): OldSequenceMapping.objects.create( document=document, sequence=mapping[0], slug=mapping[1] ) for s in statements: s.save() s.mentioned_politicians.add(*list(s._related_pols)) s.bills.add(*list(s._related_bills)) if getattr(s, '_related_vote', False): s._related_vote.context_statement = s s._related_vote.save() document.save() return document
def import_document(document, interactive=True, reimport_preserving_sequence=False): old_statements = None if document.statement_set.all().exists(): if reimport_preserving_sequence: if OldSequenceMapping.objects.filter(document=document).exists(): logger.error("Sequence mapping already exits for %s" % document) return old_statements = list(document.statement_set.all()) document.statement_set.all().delete() else: if not interactive: return sys.stderr.write("Statements already exist for %r.\nDelete them? (y/n) " % document) if raw_input().strip() != 'y': return document.statement_set.all().delete() if not document.downloaded: return False xml_en = document.get_cached_xml('en') pdoc_en = alpheus.parse_file(xml_en) xml_en.close() xml_fr = document.get_cached_xml('fr') pdoc_fr = alpheus.parse_file(xml_fr) xml_fr.close() if document.date and document.date != pdoc_en.meta['date']: # Sometimes they get the date wrong if document.date != pdoc_fr.meta['date']: logger.error("Date mismatch on document #%s: %s %s" % ( document.id, document.date, pdoc_en.meta['date'])) else: document.date = pdoc_en.meta['date'] document.number = pdoc_en.meta['document_number'] document.public = True statements = [] for pstate in pdoc_en.statements: s = Statement( document=document, sequence=len(statements), content_en=pstate.content, time=pstate.meta['timestamp']) s.source_id = pstate.meta['id'] s.h1_en = pstate.meta.get('h1', '') s.h2_en = pstate.meta.get('h2', '') s.h3_en = pstate.meta.get('h3', '') if s.h1_en and not s.h2_en: s.h2_en = s.h3_en s.h3_en = '' s.who_en = pstate.meta.get('person_attribution', '') s.who_hocid = int(pstate.meta['person_id']) if pstate.meta.get('person_id') else None s.who_context_en = pstate.meta.get('person_context', '') s.statement_type = pstate.meta.get('intervention_type', '').lower() s.written_question = pstate.meta.get('written_question', '').upper()[:1] if s.who_hocid and not pstate.meta.get('person_type'): # At the moment. 
person_type is only set if we know the person # is a non-politician. This might change... try: s.politician = Politician.objects.get_by_parl_id(s.who_hocid, session=document.session) s.member = ElectedMember.objects.get_by_pol(s.politician, date=document.date) except Politician.DoesNotExist: logger.info("Could not resolve speaking politician ID %s for %r" % (s.who_hocid, s.who)) s._related_pols = set() s._related_bills = set() s.content_en = _process_related_links(s.content_en, s) statements.append(s) if len(statements) != len(pdoc_fr.statements): logger.info("French and English statement counts don't match for %r" % document) _r_paragraphs = re.compile(ur'<p[^>]* data-HoCid=.+?</p>') _r_paragraph_id = re.compile(ur'<p[^>]* data-HoCid="(?P<id>\d+)"') fr_paragraphs = dict() fr_statements = dict() missing_id_count = 0 def _get_paragraph_id(p): return int(_r_paragraph_id.match(p).group('id')) def _get_paragraphs_and_ids(content): return [(p, _get_paragraph_id(p)) for p in _r_paragraphs.findall(content)] for st in pdoc_fr.statements: if st.meta['id']: fr_statements[st.meta['id']] = st for p, pid in _get_paragraphs_and_ids(st.content): if pid: fr_paragraphs[pid] = p else: missing_id_count += 1 def _substitute_french_content(match): try: pid = _get_paragraph_id(match.group(0)) if pid: return fr_paragraphs[pid] else: return match.group(0) except KeyError: logger.error("Paragraph ID %s not found in French for %s" % (match.group(0), document)) return match.group(0) if missing_id_count > float(len(fr_paragraphs)): logger.error("French paragraphs not available") document.multilingual = False else: document.multilingual = True for st in statements: fr_data = fr_statements.get(st.source_id) pids_en = [pid for p, pid in _get_paragraphs_and_ids(st.content_en)] pids_fr = [pid for p, pid in _get_paragraphs_and_ids(fr_data.content)] if fr_data else None if fr_data and pids_en == pids_fr: # Match by statement st.content_fr = _process_related_links(fr_data.content, st) elif 
all(pids_en): # Match by paragraph st.content_fr = _process_related_links( _r_paragraphs.sub(_substitute_french_content, st.content_en), st ) else: logger.warning("Could not do multilingual match of statement %s", st.source_id) document.multilingual = False if fr_data: st.h1_fr = fr_data.meta.get('h1', '') st.h2_fr = fr_data.meta.get('h2', '') st.h3_fr = fr_data.meta.get('h3', '') if st.h1_fr and not st.h2_fr: st.h2_fr = s.h3_fr st.h3_fr = '' st.who_fr = fr_data.meta.get('person_attribution', '') st.who_context_fr = fr_data.meta.get('person_context', '') Statement.set_slugs(statements) if old_statements: for mapping in _align_sequences(statements, old_statements): OldSequenceMapping.objects.create( document=document, sequence=mapping[0], slug=mapping[1] ) for s in statements: s.save() s.mentioned_politicians.add(*list(s._related_pols)) s.bills.add(*list(s._related_bills)) if getattr(s, '_related_vote', False): s._related_vote.context_statement = s s._related_vote.save() document.save() return document