예제 #1
0
 def saveStatement(self, t):
     def mcUp(match):
         return 'Mc' + match.group(1).upper()
     if t['topic']:
         # Question No. 139-- -> Question No. 139
         t['topic'] = re.sub(r'\-+$', '', t['topic'])
         t['topic'] = re.sub(r"'S", "'s", t['topic'])
         t['topic'] = re.sub(r'Mc([a-z])', mcUp, t['topic'])
     if t.hasText():
         if not t['member_title']:
             t['member_title'] = 'Proceedings'
             print "WARNING: No title for %s" % t.getText()
         timestamp = t['timestamp']
         if not isinstance(timestamp, datetime.datetime):
             # The older parser provides only datetime.time objects
             timestamp = datetime.datetime.combine(self.date, timestamp)
         statement = Statement(hansard=self.hansard, heading=t['heading'], topic=t['topic'],
          time=timestamp, member=t['member'],
          politician=t['politician'], who=t['member_title'],
          text=t.getText(), sequence=self.statement_index, written_question=bool(t['written_question']))
         if r_notamember.search(t['member_title']) and 'Speaker' in t['member_title']:
             statement.speaker = True
         self.statement_index += 1
         self.statements.append(statement)
         
         if ENABLE_PRINT:
             print u"HEADING: %s" % t['heading']
             print u"TOPIC: %s" % t['topic']
             print u"MEMBER TITLE: %s" % t['member_title']
             print u"MEMBER: %s" % t['member']
             print u"TIME: %s" % t['timestamp']
             print u"TEXT: %s" % t.getText()
         if ENABLE_READLINE:
             sys.stdin.readline()
     t.onward()
예제 #2
0
    def saveStatement(self, t):
        def mcUp(match):
            return 'Mc' + match.group(1).upper()

        if t['topic']:
            # Question No. 139-- -> Question No. 139
            t['topic'] = re.sub(r'\-+$', '', t['topic'])
            t['topic'] = re.sub(r"'S", "'s", t['topic'])
            t['topic'] = re.sub(r'Mc([a-z])', mcUp, t['topic'])
        if t.hasText():
            if not t['member_title']:
                t['member_title'] = 'Proceedings'
                print "WARNING: No title for %s" % t.getText().encode(
                    'ascii', 'replace')
            timestamp = t['timestamp']
            if not isinstance(timestamp, datetime.datetime):
                # The older parser provides only datetime.time objects
                timestamp = datetime.datetime.combine(self.date, timestamp)
            statement = Statement(hansard=self.hansard,
                                  heading=t['heading'],
                                  topic=t['topic'],
                                  time=timestamp,
                                  member=t['member'],
                                  politician=t['politician'],
                                  who=t['member_title'],
                                  text=t.getText(),
                                  sequence=self.statement_index,
                                  written_question=bool(t['written_question']))
            if r_notamember.search(t['member_title'])\
              and ('Speaker' in t['member_title'] or 'The Chair' in t['member_title']):
                statement.speaker = True
            self.statement_index += 1
            self.statements.append(statement)

            if ENABLE_PRINT:
                print u"HEADING: %s" % t['heading']
                print u"TOPIC: %s" % t['topic']
                print u"MEMBER TITLE: %s" % t['member_title']
                print u"MEMBER: %s" % t['member']
                print u"TIME: %s" % t['timestamp']
                print u"TEXT: %s" % t.getText()
            if ENABLE_READLINE:
                sys.stdin.readline()
        t.onward()
예제 #3
0
 def saveProceedingsStatement(self, text, t):
     """Queue ``text`` as a Statement attributed to 'Proceedings'.

     ``text`` is stripped and passed through parsetools.tameWhitespace and
     parsetools.sane_quotes; if nothing is left, no statement is created.
     Otherwise the statement is appended to ``self.statements`` and
     ``self.statement_index`` is incremented.  The time comes from
     ``t['timestamp']``.
     """
     text = parsetools.sane_quotes(parsetools.tameWhitespace(text.strip()))
     if not text:
         return
     timestamp = t['timestamp']
     if not isinstance(timestamp, datetime.datetime):
         # The older parser provides only datetime.time objects; a full
         # datetime must not be passed to combine(), which would raise.
         timestamp = datetime.datetime.combine(self.date, timestamp)
     statement = Statement(hansard=self.hansard,
                           time=timestamp,
                           text=text,
                           sequence=self.statement_index,
                           who='Proceedings')
     self.statement_index += 1
     self.statements.append(statement)
예제 #4
0
 def saveProceedingsStatement(self, text, t):
     """Record ``text`` as a 'Proceedings' statement timed by ``t['timestamp']``.

     The text is stripped, whitespace-tamed and quote-normalised via
     parsetools; empty results are ignored.  A created statement is added
     to ``self.statements`` and ``self.statement_index`` advances by one.
     """
     cleaned = parsetools.tameWhitespace(text.strip())
     cleaned = parsetools.sane_quotes(cleaned)
     if len(cleaned) == 0:
         return

     when = t['timestamp']
     if not isinstance(when, datetime.datetime):
         # The older parser provides only datetime.time objects
         when = datetime.datetime.combine(self.date, when)

     self.statements.append(
         Statement(hansard=self.hansard,
                   time=when,
                   text=cleaned,
                   sequence=self.statement_index,
                   who='Proceedings'))
     self.statement_index += 1
예제 #5
0
def import_document(document, interactive=True, reimport_preserving_sequence=False):
    old_statements = None
    if document.statement_set.all().exists():
        if reimport_preserving_sequence:
            if OldSequenceMapping.objects.filter(document=document).exists():
                logger.error("Sequence mapping already exits for %s" % document)
                return
            old_statements = list(document.statement_set.all())
            document.statement_set.all().delete()
        else:
            if not interactive:
                return
            sys.stderr.write("Statements already exist for %r.\nDelete them? (y/n) " % document)
            if raw_input().strip() != 'y':
                return
            document.statement_set.all().delete()

    document.download()
    xml_en = document.get_cached_xml('en')
    pdoc_en = alpheus.parse_file(xml_en)
    xml_en.close()

    xml_fr = document.get_cached_xml('fr')
    pdoc_fr = alpheus.parse_file(xml_fr)
    xml_fr.close()
    
    if document.date and document.date != pdoc_en.meta['date']:
        # Sometimes they get the date wrong
        if document.date != pdoc_fr.meta['date']:
            logger.error("Date mismatch on document #%s: %s %s" % (
                document.id, document.date, pdoc_en.meta['date']))
    else:
        document.date = pdoc_en.meta['date']
    document.number = pdoc_en.meta['document_number']
    document.public = True

    statements = []

    for pstate in pdoc_en.statements:
        s = Statement(
            document=document,
            sequence=len(statements),
            content_en=pstate.content,
            time=pstate.meta['timestamp'])
        s.source_id = pstate.meta['id']
        s.h1 = pstate.meta.get('h1', '')
        s.h2 = pstate.meta.get('h2', '')
        s.h3 = pstate.meta.get('h3', '')

        if s.h3 and not s.h2:
            s.h2 = s.h3
            s.h3 = ''

        s.who = pstate.meta.get('person_attribution', '')
        s.who_hocid = int(pstate.meta['person_id']) if pstate.meta.get('person_id') else None
        s.who_context = pstate.meta.get('person_context', '')

        s.statement_type = pstate.meta.get('intervention_type', '').lower()
        s.written_question = pstate.meta.get('written_question', '').upper()[:1]

        if s.who_hocid and not pstate.meta.get('person_type'):
            # At the moment. person_type is only set if we know the person
            # is a non-politician. This might change...
            try:
                s.politician = Politician.objects.get_by_parl_id(s.who_hocid, session=document.session)
                s.member = ElectedMember.objects.get_by_pol(s.politician, date=document.date)
            except Politician.DoesNotExist:
                logger.info("Could not resolve speaking politician ID %s for %r" % (s.who_hocid, s.who))

        s._related_pols = set()
        s._related_bills = set()
        s.content_en = _process_related_links(s.content_en, s)

        statements.append(s)

    if len(statements) != len(pdoc_fr.statements):
        logger.info("French and English statement counts don't match for %r" % document)

    _r_paragraphs = re.compile(ur'<p[^>]* data-HoCid=.+?</p>')
    _r_paragraph_id = re.compile(ur'<p[^>]* data-HoCid="(?P<id>\d+)"')
    fr_paragraphs = dict()

    def _get_paragraph_id(p):
        return int(_r_paragraph_id.match(p).group('id'))

    for st in pdoc_fr.statements:
        for p in _r_paragraphs.findall(st.content):
            fr_paragraphs[_get_paragraph_id(p)] = p

    def _substitute_french_content(match):
        try:
            return fr_paragraphs[_get_paragraph_id(match.group(0))]
        except KeyError:
            logger.error("Paragraph ID %s not found in French for %s" % (match.group(0), document))
            return match.group(0)

    for st in statements:
        st.content_fr = _process_related_links(
            _r_paragraphs.sub(_substitute_french_content, st.content_en),
            st
        )
    document.multilingual = True

    Statement.set_slugs(statements)

    if old_statements:
        for mapping in _align_sequences(statements, old_statements):
            OldSequenceMapping.objects.create(
                document=document,
                sequence=mapping[0],
                slug=mapping[1]
            )
        
    for s in statements:
        s.save()

        s.mentioned_politicians.add(*list(s._related_pols))
        s.bills.add(*list(s._related_bills))
        if getattr(s, '_related_vote', False):
            s._related_vote.context_statement = s
            s._related_vote.save()

    document.save()

    return document
예제 #6
0
def import_document(document, interactive=True, reimport_preserving_sequence=False):
    old_statements = None
    if document.statement_set.all().exists():
        if reimport_preserving_sequence:
            if OldSequenceMapping.objects.filter(document=document).exists():
                logger.error("Sequence mapping already exits for %s" % document)
                return
            old_statements = list(document.statement_set.all())
            document.statement_set.all().delete()
        else:
            if not interactive:
                return
            sys.stderr.write("Statements already exist for %r.\nDelete them? (y/n) " % document)
            if raw_input().strip() != 'y':
                return
            document.statement_set.all().delete()

    if not document.downloaded:
        return False
    xml_en = document.get_cached_xml('en')
    pdoc_en = alpheus.parse_file(xml_en)
    xml_en.close()

    xml_fr = document.get_cached_xml('fr')
    pdoc_fr = alpheus.parse_file(xml_fr)
    xml_fr.close()
    
    if document.date and document.date != pdoc_en.meta['date']:
        # Sometimes they get the date wrong
        if document.date != pdoc_fr.meta['date']:
            logger.error("Date mismatch on document #%s: %s %s" % (
                document.id, document.date, pdoc_en.meta['date']))
    else:
        document.date = pdoc_en.meta['date']
    document.number = pdoc_en.meta['document_number']
    document.public = True

    statements = []

    for pstate in pdoc_en.statements:
        s = Statement(
            document=document,
            sequence=len(statements),
            content_en=pstate.content,
            time=pstate.meta['timestamp'])
        s.source_id = pstate.meta['id']
        s.h1_en = pstate.meta.get('h1', '')
        s.h2_en = pstate.meta.get('h2', '')
        s.h3_en = pstate.meta.get('h3', '')

        if s.h1_en and not s.h2_en:
            s.h2_en = s.h3_en
            s.h3_en = ''

        s.who_en = pstate.meta.get('person_attribution', '')
        s.who_hocid = int(pstate.meta['person_id']) if pstate.meta.get('person_id') else None
        s.who_context_en = pstate.meta.get('person_context', '')

        s.statement_type = pstate.meta.get('intervention_type', '').lower()
        s.written_question = pstate.meta.get('written_question', '').upper()[:1]

        if s.who_hocid and not pstate.meta.get('person_type'):
            # At the moment. person_type is only set if we know the person
            # is a non-politician. This might change...
            try:
                s.politician = Politician.objects.get_by_parl_id(s.who_hocid, session=document.session)
                s.member = ElectedMember.objects.get_by_pol(s.politician, date=document.date)
            except Politician.DoesNotExist:
                logger.info("Could not resolve speaking politician ID %s for %r" % (s.who_hocid, s.who))

        s._related_pols = set()
        s._related_bills = set()
        s.content_en = _process_related_links(s.content_en, s)

        statements.append(s)

    if len(statements) != len(pdoc_fr.statements):
        logger.info("French and English statement counts don't match for %r" % document)

    _r_paragraphs = re.compile(ur'<p[^>]* data-HoCid=.+?</p>')
    _r_paragraph_id = re.compile(ur'<p[^>]* data-HoCid="(?P<id>\d+)"')
    fr_paragraphs = dict()
    fr_statements = dict()
    missing_id_count = 0

    def _get_paragraph_id(p):
        return int(_r_paragraph_id.match(p).group('id'))

    def _get_paragraphs_and_ids(content):
        return [(p, _get_paragraph_id(p)) for p in _r_paragraphs.findall(content)]

    for st in pdoc_fr.statements:
        if st.meta['id']:
            fr_statements[st.meta['id']] = st
        for p, pid in _get_paragraphs_and_ids(st.content):
            if pid:
                fr_paragraphs[pid] = p
            else:
                missing_id_count += 1

    def _substitute_french_content(match):
        try:
            pid = _get_paragraph_id(match.group(0))
            if pid:
                return fr_paragraphs[pid]
            else:
                return match.group(0)
        except KeyError:
            logger.error("Paragraph ID %s not found in French for %s" % (match.group(0), document))
            return match.group(0)

    if missing_id_count > float(len(fr_paragraphs)):
        logger.error("French paragraphs not available")
        document.multilingual = False
    else:
        document.multilingual = True
        for st in statements:
            fr_data = fr_statements.get(st.source_id)
            pids_en = [pid for p, pid in _get_paragraphs_and_ids(st.content_en)]
            pids_fr = [pid for p, pid in _get_paragraphs_and_ids(fr_data.content)] if fr_data else None
            if fr_data and pids_en == pids_fr:
                # Match by statement
                st.content_fr = _process_related_links(fr_data.content, st)
            elif all(pids_en):
                # Match by paragraph
                st.content_fr = _process_related_links(
                    _r_paragraphs.sub(_substitute_french_content, st.content_en),
                    st
                )
            else:
                logger.warning("Could not do multilingual match of statement %s", st.source_id)
                document.multilingual = False
            if fr_data:
                st.h1_fr = fr_data.meta.get('h1', '')
                st.h2_fr = fr_data.meta.get('h2', '')
                st.h3_fr = fr_data.meta.get('h3', '')
                if st.h1_fr and not st.h2_fr:
                    st.h2_fr = s.h3_fr
                    st.h3_fr = ''
                st.who_fr = fr_data.meta.get('person_attribution', '')
                st.who_context_fr = fr_data.meta.get('person_context', '')

    Statement.set_slugs(statements)

    if old_statements:
        for mapping in _align_sequences(statements, old_statements):
            OldSequenceMapping.objects.create(
                document=document,
                sequence=mapping[0],
                slug=mapping[1]
            )
        
    for s in statements:
        s.save()

        s.mentioned_politicians.add(*list(s._related_pols))
        s.bills.add(*list(s._related_bills))
        if getattr(s, '_related_vote', False):
            s._related_vote.context_statement = s
            s._related_vote.save()

    document.save()

    return document