Пример #1
0
    def parse_document(self):
        """Register the referenced people as speakers, then walk the debate.

        Each ``TLCPerson`` reference found through ``self.tree`` is matched
        to a ``Person`` (by ``popit_id``) and to a ``Speaker`` of
        ``self.instance``.  Records that do not exist yet are built and, when
        ``self.commit`` is set, saved.  The speaker is stored in
        ``self.speakers`` under the reference's ``id`` attribute.

        If the preface holds a ``docDate`` element, its ``date`` attribute is
        parsed into ``self.start_date``.  Finally ``debateBody`` is passed to
        ``self.visit`` with no parent section.
        """
        debate = self.xml.debate

        if self.ns:
            references = self.tree.findall(
                'an:debate/an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns})
        else:
            references = self.tree.findall('debate/meta/references/TLCPerson')

        for reference in references or []:
            reference_id = reference.get('id')
            popit_url = reference.get('href')

            try:
                person_record = Person.objects.get(popit_id=popit_url)
            except Person.DoesNotExist:
                person_record = Person(popit_id=popit_url,
                                       api_instance=self.ai)
                if self.commit:
                    person_record.save()

            try:
                speaker = Speaker.objects.get(
                    instance=self.instance, person=person_record)
            except Speaker.DoesNotExist:
                speaker = Speaker(
                    instance=self.instance,
                    name=reference.get('showAs'),
                    person=person_record)
                if self.commit:
                    speaker.save()

            self.speakers[reference_id] = speaker

        # Pick the lookup path matching the document's namespace usage.
        if self.ns:
            date_path = 'an:preface//an:docDate'
            lookup = {'namespaces': {'an': self.ns}}
        else:
            date_path = 'preface//docDate'
            lookup = {}
        date_element = debate.find(date_path, **lookup)
        if date_element is not None:
            self.start_date = dateutil.parse(date_element.get('date'))

        self.visit(debate.debateBody, None)
Пример #2
0
    def parse_document(self):
        """Register speakers, read the date and title, then walk the debate.

        Each ``TLCPerson`` reference under ``meta/references`` is matched to
        a ``Person`` (by ``popit_id``) and a ``Speaker`` of ``self.instance``;
        missing records are built and, when ``self.commit`` is set, saved.
        Speakers are stored in ``self.speakers`` under the reference's ``id``.

        ``docDate`` and ``docTitle`` are looked up under ``coverPage`` first
        and ``preface`` second.  A ``docDate`` sets ``self.start_date`` from
        its ``date`` attribute; a ``docTitle`` produces a top-level
        ``Section`` (via ``self.make``) that becomes the parent passed to
        ``self.visit`` along with ``debateBody``.
        """
        debate = self.xml.debate
        ns_map = {'an': self.ns} if self.ns else None

        def find_front_matter(tag):
            """Return the first ``tag`` under coverPage, else preface, else None."""
            # find() is backed by ElementPath, which has no ``|`` union
            # operator: a combined "coverPage//x|preface//x" path never
            # matches.  Query each container on its own instead.
            for container in ('coverPage', 'preface'):
                if ns_map:
                    found = debate.find(
                        'an:%s//an:%s' % (container, tag), namespaces=ns_map)
                else:
                    found = debate.find('%s//%s' % (container, tag))
                if found is not None:
                    return found
            return None

        if ns_map:
            people = debate.findall('an:meta/an:references/an:TLCPerson',
                                    namespaces=ns_map)
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                p = Person.objects.get(popit_id=href)
            except Person.DoesNotExist:
                p = Person(popit_id=href, api_instance=self.ai)
                if self.commit:
                    p.save()

            try:
                speaker = Speaker.objects.get(instance=self.instance, person=p)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance,
                                  name=person.get('showAs'),
                                  person=p)
                if self.commit:
                    speaker.save()

            self.speakers[person_id] = speaker

        doc_date = find_front_matter('docDate')
        if doc_date is not None:
            self.start_date = dateutil.parse(doc_date.get('date'))

        doc_title = find_front_matter('docTitle')
        if doc_title is None:
            section = None
        else:
            section = self.make(Section, parent=None, title=doc_title.text)

        self.visit(debate.debateBody, section)
Пример #3
0
    def get_person(self, name, party, pombola_person_slug=None):
        """Return the ``Speaker`` to use for ``name``.

        Lookup order:

        1. a speaker carrying ``pombola_person_slug`` as an identifier (the
           Code4SA / PMG identification of speakers seems to be better than
           that from popolo_name_resolver, so it wins when available);
        2. a truthy entry in ``self.person_cache`` for ``name``;
        3. the speaker of the person returned by ``self.resolver`` for the
           name and ``party`` (only attempted when ``name`` is truthy);
        4. a speaker of ``self.instance`` with that name, created if it does
           not exist and saved only when ``self.commit`` is set.

        An empty ``name`` is looked up as ``'(narrative)'``.  Results of
        steps 3 and 4 are stored in ``self.person_cache`` under ``name``.
        """
        if pombola_person_slug is not None:
            slug_match = Speaker.objects.filter(
                identifiers__scheme='pombola_person_slug',
                identifiers__identifier=pombola_person_slug,
            ).first()
            if slug_match:
                return slug_match

        remembered = self.person_cache.get(name)
        if remembered:
            return remembered

        label = name if name else '(narrative)'

        speaker = None
        if name:
            resolved = self.resolver.get_person(label, party)
            speaker = resolved.speaker if resolved else None

        if not speaker:
            try:
                speaker = Speaker.objects.get(
                    instance=self.instance, name=label)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance, name=label)
                if self.commit:
                    speaker.save()

        self.person_cache[name] = speaker
        return speaker
Пример #4
0
    def parse_document(self):
        """Register speakers, read the date and title, then walk the debate.

        Each ``TLCPerson`` reference under ``meta/references`` is matched to
        a ``Speaker`` of ``self.instance`` through its ``href`` identifier.
        A missing speaker is built from the ``showAs`` attribute and, when
        ``self.commit`` is set, saved together with an identifier of scheme
        ``'Akoma Ntoso import'``.  Speakers are stored in ``self.speakers``
        under the reference's ``id``.

        ``docDate`` and ``docTitle`` are looked up under ``coverPage`` first
        and ``preface`` second.  A ``docDate`` sets ``self.start_date``; a
        ``docTitle`` produces the top-level ``Section`` that is passed to
        ``self.visit`` along with ``debateBody``.
        """
        debate = self.xml.debate

        def first_in_front_matter(tag):
            """Return the first ``tag`` element below coverPage or preface."""
            # The original single path joined both containers with ``|``.
            # find() uses ElementPath, which does not implement unions, so
            # that path never matched.  Try the containers one at a time.
            if self.ns:
                template = 'an:%s//an:%s'
                lookup = {'namespaces': {'an': self.ns}}
            else:
                template = '%s//%s'
                lookup = {}
            for parent in ('coverPage', 'preface'):
                element = debate.find(template % (parent, tag), **lookup)
                if element is not None:
                    return element
            return None

        if self.ns:
            people = debate.findall('an:meta/an:references/an:TLCPerson',
                                    namespaces={'an': self.ns})
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(instance=self.instance,
                                              identifiers__identifier=href)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance,
                                  name=person.get('showAs'))
                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(identifier=href,
                                               scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        doc_date = first_in_front_matter('docDate')
        if doc_date is not None:
            self.start_date = dateutil.parse(doc_date.get('date'))

        doc_title = first_in_front_matter('docTitle')
        if doc_title is None:
            section = None
        else:
            section = self.make(Section, parent=None, title=doc_title.text)

        self.visit(debate.debateBody, section)
Пример #5
0
 def get_persons_data(self, dp):
     """Save a ``Speaker`` for every row of the 'persons-person' resource.

     ``dp.resources`` is searched for the first resource whose descriptor
     name is ``'persons-person'``; nothing happens when there is none.
     Each row supplies the speaker's given name, family name, e-mail and
     gender, and the full name is the two name parts joined by a space.

     The 'persons-position' and 'persons-persons-to-positions' resources
     are not read yet.
     """
     person_resource = None
     for resource in dp.resources:
         if resource.descriptor['name'] == 'persons-person':
             person_resource = resource
             break

     if person_resource is None:
         return

     for row in person_resource.data:
         speaker = Speaker(instance=self.instance)
         first = row['first_name']
         speaker.given_name = first
         last = row['last_name']
         speaker.family_name = last
         speaker.name = first + ' ' + last
         speaker.email = row['email']
         speaker.gender = row['gender_description']
         speaker.save()
Пример #6
0
    def get_person(self, name, party, pombola_person_slug=None):
        """Find, or build, the ``Speaker`` that stands for ``name``.

        A speaker identified by ``pombola_person_slug`` is returned straight
        away when one exists: the Code4SA / PMG identification of speakers
        seems to be better than that from popolo_name_resolver.  Otherwise a
        truthy ``self.person_cache`` entry for ``name`` is returned.

        Failing both, ``self.resolver`` is asked for the person (only when
        ``name`` is truthy) and that person's speaker is used.  As a last
        resort a speaker of ``self.instance`` with the display name is
        fetched, or built and saved only if ``self.commit`` is set.  The
        display name of an empty ``name`` is ``'(narrative)'``.  The outcome
        is cached under ``name`` before being returned.
        """
        if pombola_person_slug is not None:
            candidates = Speaker.objects.filter(
                identifiers__scheme='pombola_person_slug',
                identifiers__identifier=pombola_person_slug)
            found = candidates.first()
            if found:
                return found

        found = self.person_cache.get(name, None)
        if found:
            return found

        shown_as = name or '(narrative)'

        resolved_person = None
        if name:
            resolved_person = self.resolver.get_person(shown_as, party)

        speaker = None
        if resolved_person:
            speaker = resolved_person.speaker

        if not speaker:
            lookup = dict(instance=self.instance, name=shown_as)
            try:
                speaker = Speaker.objects.get(**lookup)
            except Speaker.DoesNotExist:
                speaker = Speaker(**lookup)
                if self.commit:
                    speaker.save()

        self.person_cache[name] = speaker
        return speaker
Пример #7
0
    def get_person(self, name):
        """Return the ``Speaker`` for ``name``, creating one when necessary.

        A truthy ``self.person_cache`` entry is returned immediately.
        Otherwise, for a truthy ``name``, ``self.speakers_count`` is
        incremented and ``self.resolver`` (when set) is asked for the
        matching person.  A match increments ``self.speakers_matched`` and
        the speaker linked to that person is looked up; a miss is logged.

        When no speaker was found that way, one is fetched by display name
        for ``self.instance`` or built (``'(narrative)'`` stands in for an
        empty name).  A resolved person is then attached to it.  Saving only
        happens when ``self.commit`` is set.  The result is cached under
        ``name``.
        """
        hit = self.person_cache.get(name)
        if hit:
            return hit

        display_name = name if name else '(narrative)'

        speaker = None
        popit_person = None

        if name:
            self.speakers_count += 1

        if name and self.resolver:
            popit_person = self.resolver.get_person(display_name)
            if not popit_person:
                logger.info(" - Failed to get user %s" % display_name)
            else:
                self.speakers_matched += 1
                try:
                    speaker = Speaker.objects.get(
                        instance=self.instance, person=popit_person)
                except Speaker.DoesNotExist:
                    # No speaker linked to this person yet; fall through
                    # to the name-based lookup below.
                    speaker = None

        if not speaker:
            try:
                speaker = Speaker.objects.get(
                    instance=self.instance, name=display_name)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance, name=display_name)
                if self.commit:
                    speaker.save()

            if popit_person:
                speaker.person = popit_person
                if self.commit:
                    speaker.save()

        self.person_cache[name] = speaker
        return speaker
Пример #8
0
    def parse_document(self):
        """Import the document's speakers and front matter, then its body.

        Each ``TLCPerson`` reference is matched to a ``Speaker`` of
        ``self.instance`` by its ``href`` identifier.  A missing speaker is
        built from ``showAs``, counted in ``self.stats`` and, when
        ``self.commit`` is set, saved with an ``'Akoma Ntoso import'``
        identifier.  Speakers are stored in ``self.speakers`` by ``id``.

        The front-matter tags ``docDate``, ``docTitle``, ``docNumber``,
        ``legislature``, ``session`` and ``link`` are read through
        ``self.get_preface_tag``.  With a title, a top-level section is
        requested from ``self.make_section``; a falsy result ends the import
        early.  Otherwise ``debateBody`` is passed to ``self.visit``.

        Returns:
            ``self.stats``; its ``Speaker`` entry is incremented for every
            speaker built by this call.

        Raises:
            Exception: a TLCPerson without an existing speaker has no
                ``showAs`` attribute.
        """
        self.stats = {Speaker: 0}
        debate = self.xml.debate

        def front_matter_text(tag):
            """Return the text of front-matter ``tag``, or None if absent."""
            # Compare with None rather than truth-testing: an lxml element's
            # truth value reflects its content, not whether it was found.
            element = self.get_preface_tag(debate, tag)
            return None if element is None else element.text

        if self.ns:
            people = debate.findall(
                'an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns},
                )
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(
                    instance=self.instance, identifiers__identifier=href)
            except Speaker.DoesNotExist:
                name = person.get('showAs')
                if not name:
                    raise Exception("TLCPerson '%s' is missing showAs" % href)
                speaker = Speaker(instance=self.instance, name=name)
                self.stats[Speaker] += 1

                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(
                        identifier=href, scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        doc_date = self.get_preface_tag(debate, 'docDate')
        if doc_date is not None:
            date = doc_date.get('date')
            if date:
                try:
                    self.start_date = dateutil.parse(date)
                except ValueError:
                    # Logger.warn is a deprecated alias of Logger.warning.
                    logger.warning(
                        "docDate element did not parse '%s'", date)
            else:
                logger.warning(
                    "docDate element missing required date attribute")

        title = front_matter_text('docTitle')
        number = front_matter_text('docNumber')
        legislature = front_matter_text('legislature')
        session = front_matter_text('session')

        link = self.get_preface_tag(debate, 'link')
        source_url = link.get('href') if link is not None else None

        self.imported_section_ids = set()

        section = None
        if title:
            kwargs = {
                'parent': None,
                'heading': title,
                'start_date': self.start_date,
                'number': number or '',
                'legislature': legislature or '',
                'session': session or '',
            }

            section = self.make_section(source_url=source_url or '', **kwargs)

            if not section:
                return self.stats

        self.visit(debate.debateBody, section)
        return self.stats
Пример #9
0
    def parse_document(self):
        """Import the document's speakers and front matter, then its body.

        Each ``TLCPerson`` reference is matched to a ``Speaker`` of
        ``self.instance`` by its ``href`` identifier.  A missing speaker is
        built from ``showAs`` and, when ``self.commit`` is set, saved with
        an ``'Akoma Ntoso import'`` identifier.  Speakers are stored in
        ``self.speakers`` by ``id``.

        ``docDate`` sets ``self.start_date`` from its ``date`` attribute.
        With a ``docTitle``, a top-level section is requested from
        ``self.make_section`` using the title, date, number, legislature
        and session; a falsy result ends the import without visiting the
        body.  Otherwise ``debateBody`` is passed to ``self.visit``.
        """
        debate = self.xml.debate

        def preface_text(tag):
            """Return the text of front-matter ``tag``, or None if absent."""
            element = self.get_preface_tag(debate, tag)
            return None if element is None else element.text

        if self.ns:
            people = debate.findall(
                'an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns},
            )
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(instance=self.instance,
                                              identifiers__identifier=href)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance,
                                  name=person.get('showAs'))
                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(identifier=href,
                                               scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        # Test for presence, not truth: an lxml element's truth value
        # reflects its content, so a <docDate date="..."/> without text
        # would otherwise be skipped and the date lost.
        doc_date = self.get_preface_tag(debate, 'docDate')
        if doc_date is not None:
            self.start_date = dateutil.parse(doc_date.get('date'))

        title = preface_text('docTitle')
        number = preface_text('docNumber')
        legislature = preface_text('legislature')
        session = preface_text('session')

        self.imported_section_ids = set()

        section = None
        if title:
            section = self.make_section(
                parent=None,
                heading=title,
                start_date=self.start_date,
                number=number or '',
                legislature=legislature or '',
                session=session or '',
            )
            if not section:
                return

        self.visit(debate.debateBody, section)
    def handle(self, *args, **options):
        """Import one Shakespeare play as sections and speeches.

        With the ``list`` option, or without exactly one positional
        argument, the titles in ``PLAYS`` are written to ``self.stdout``.
        Otherwise the play's XML is downloaded, and a ``Section`` is made
        for the play, for each ``ACT`` and for each scene, using the first
        child's text as the heading.  Stage directions and sub-headings
        become narrative speeches; everything else becomes a speech with
        its lines joined by ``<br>``.

        Speaker names have square brackets removed and one ``Speaker`` is
        used per distinct name.  Speakers are written to the database only
        when the ``commit`` option is set.

        Raises:
            CommandError: no play was given, the play is unknown, or no
                instance has the requested label.
        """
        if options['list'] or len(args) != 1:
            self.stdout.write('Plays:\n')
            for title in sorted(PLAYS.values()):
                self.stdout.write('* %s\n' % title)
            if not options['list']:
                raise CommandError("Please specify a play")
            return

        play = args[0]
        filename = next(
            (f for f, title in PLAYS.items() if title == play), None)
        if not filename:
            raise CommandError("No matching play found")

        try:
            self.instance = Instance.objects.get(label=options['instance'])
        except Instance.DoesNotExist:
            # Only a missing instance means "not found"; a bare except
            # would also hide interrupts and unrelated failures.
            raise CommandError("Instance specified not found")

        self.commit = options['commit']

        xml = urlopen('http://www.ibiblio.org/xml/examples/shakespeare/%s' %
                      filename).read()
        play_xml = etree.fromstring(xml)
        play_section = self.make(Section, heading=play)

        narrative_tags = ('STAGEDIR', 'SUBHEAD', 'SUBTITLE')
        speakers = {}
        for act in play_xml:
            if act.tag != 'ACT':
                continue
            act_section = self.make(Section,
                                    heading=act[0].text,
                                    parent=play_section)
            for scene in act[1:]:
                scene_section = self.make(Section,
                                          heading=scene[0].text,
                                          parent=act_section)
                for sp in scene[1:]:
                    if sp.tag in narrative_tags:
                        self.make(Speech,
                                  section=scene_section,
                                  text='<p><i>%s</i></p>' % sp.text,
                                  type='narrative')
                        continue

                    raw_name = sp[0].text
                    if not raw_name:
                        speaker = None
                    else:
                        # Clean the name in both modes so a dry run shows
                        # the same speakers a committed import would create.
                        name = raw_name.replace('[', '').replace(']', '')
                        speaker = speakers.get(name)
                        if speaker is None:
                            if self.commit:
                                speaker = Speaker.objects.create(
                                    name=name, instance=self.instance)
                            else:
                                speaker = Speaker(name=name,
                                                  instance=self.instance)
                            speakers[name] = speaker

                    parts = []
                    for line in sp[1:]:
                        if len(line):
                            # The element has a child: emphasise the child's
                            # text and append whatever text follows it.
                            part = '<i>%s</i>' % line[0].text
                            if line[0].tail:
                                part += ' %s' % line[0].tail.strip()
                            parts.append(part + '<br>\n')
                        elif line.tag == 'LINE':
                            parts.append('%s<br>\n' % line.text)
                        elif line.tag == 'STAGEDIR':
                            parts.append('<i>%s</i><br>\n' % line.text)

                    self.make(Speech,
                              speaker=speaker,
                              section=scene_section,
                              text='<p>%s</p>' % ''.join(parts),
                              type='speech')
Пример #11
0
    def parse_document(self):
        """Import the document's speakers and front matter, then its body.

        Each ``TLCPerson`` reference is matched to a ``Speaker`` of
        ``self.instance`` by its ``href`` identifier.  A missing speaker is
        built from ``showAs`` and, when ``self.commit`` is set, saved with
        an ``'Akoma Ntoso import'`` identifier.  Speakers are stored in
        ``self.speakers`` by ``id``.

        ``docDate`` sets ``self.start_date`` from its ``date`` attribute.
        With a ``docTitle``, a top-level ``Section`` is made from the title,
        date, number, legislature and session, and becomes the parent
        passed to ``self.visit`` along with ``debateBody``.
        """
        debate = self.xml.debate

        if self.ns:
            people = debate.findall(
                'an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns},
                )
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(
                    instance=self.instance, identifiers__identifier=href)
            except Speaker.DoesNotExist:
                speaker = Speaker(
                    instance=self.instance, name=person.get('showAs'))
                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(
                        identifier=href, scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        # Front-matter elements are tested against None.  An lxml element's
        # truth value reflects its content, so truth-testing would skip a
        # <docDate date="..."/> that carries no text.
        doc_date = self.get_preface_tag(debate, 'docDate')
        if doc_date is not None:
            self.start_date = dateutil.parse(doc_date.get('date'))

        texts = {}
        for tag in ('docTitle', 'docNumber', 'legislature', 'session'):
            element = self.get_preface_tag(debate, tag)
            texts[tag] = element.text if element is not None else None

        section = None
        if texts['docTitle']:
            section = self.make(
                Section,
                parent=None,
                heading=texts['docTitle'],
                start_date=self.start_date,
                number=texts['docNumber'] or '',
                legislature=texts['legislature'] or '',
                session=texts['session'] or '',
            )

        self.visit(debate.debateBody, section)
Пример #12
0
    def parse_document(self):
        """Import the document's speakers and front matter, then its body.

        Each ``TLCPerson`` reference is matched to a ``Speaker`` of
        ``self.instance`` by its ``href`` identifier.  A missing speaker is
        built from ``showAs`` and, when ``self.commit`` is set, saved with
        an ``"Akoma Ntoso import"`` identifier.  Speakers are stored in
        ``self.speakers`` by ``id``.

        ``docDate`` sets ``self.start_date``; the ``link`` tag's ``href``
        becomes the section's ``source_url`` (empty string when missing).

        With a ``docTitle`` a top-level ``Section`` is made.  Before that,
        if ``self.clobber`` is not None, an existing section with the same
        heading, date, number, legislature and session is looked up: it is
        deleted with its descendant speeches when ``self.clobber`` is
        truthy, and otherwise the import stops without visiting the body.
        """
        debate = self.xml.debate

        def preface_text(tag):
            """Return the text of front-matter ``tag``, or None if absent."""
            element = self.get_preface_tag(debate, tag)
            return None if element is None else element.text

        if self.ns:
            people = debate.findall("an:meta/an:references/an:TLCPerson",
                                    namespaces={"an": self.ns})
        else:
            people = debate.findall("meta/references/TLCPerson")

        for person in people or []:
            person_id = person.get("id")
            href = person.get("href")
            try:
                speaker = Speaker.objects.get(instance=self.instance,
                                              identifiers__identifier=href)
            except Speaker.DoesNotExist:
                speaker = Speaker(instance=self.instance,
                                  name=person.get("showAs"))
                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(identifier=href,
                                               scheme="Akoma Ntoso import")

            self.speakers[person_id] = speaker

        # Test elements for presence, not truth: an lxml element's truth
        # value reflects its content, so attribute-only elements such as
        # <docDate date="..."/> or <link href="..."/> would be skipped.
        doc_date = self.get_preface_tag(debate, "docDate")
        if doc_date is not None:
            self.start_date = dateutil.parse(doc_date.get("date"))

        title = preface_text("docTitle")
        number = preface_text("docNumber")
        legislature = preface_text("legislature")
        session = preface_text("session")

        link = self.get_preface_tag(debate, "link")
        # A link without an href yields "" rather than None.
        source_url = (link.get("href") if link is not None else None) or ""

        section = None
        if title:
            kwargs = {
                "parent": None,
                "heading": title,
                "start_date": self.start_date,
                "number": number or "",
                "legislature": legislature or "",
                "session": session or "",
            }

            # If the importer has no opinion on clobbering, just import the
            # section, potentially creating a duplicate section.
            if self.clobber is not None:
                try:
                    section = Section.objects.for_instance(self.instance).get(**kwargs)
                    if self.clobber:
                        logger.info("Clobbering %s" % title)
                        for speech in section.descendant_speeches():
                            speech.delete()
                        section.delete()
                    else:
                        logger.info("Skipping %s" % title)
                        return
                except Section.DoesNotExist:
                    logger.info("Importing %s" % title)

            section = self.make(Section, source_url=source_url, **kwargs)

        self.visit(debate.debateBody, section)
Пример #13
0
    def parse_document(self):
        """Import the document's speakers and front matter, then its body.

        Each ``TLCPerson`` reference is matched to a ``Speaker`` of
        ``self.instance`` by its ``href`` identifier.  A missing speaker is
        built from ``showAs`` and, when ``self.commit`` is set, saved with
        an ``'Akoma Ntoso import'`` identifier.  Speakers are stored in
        ``self.speakers`` by ``id``.

        ``docDate`` sets ``self.start_date`` from its ``date`` attribute.

        With a ``docTitle`` a top-level ``Section`` is made.  Before that,
        if ``self.clobber`` is not None, an existing section with the same
        heading, date, number, legislature and session is looked up: it is
        deleted with its descendant speeches when ``self.clobber`` is
        truthy, and otherwise the import stops without visiting the body.
        """
        debate = self.xml.debate

        if self.ns:
            people = debate.findall(
                'an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns},
                )
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(
                    instance=self.instance, identifiers__identifier=href)
            except Speaker.DoesNotExist:
                speaker = Speaker(
                    instance=self.instance, name=person.get('showAs'))
                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(
                        identifier=href, scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        # Test elements for presence, not truth: an lxml element's truth
        # value reflects its content, so a <docDate date="..."/> that
        # carries no text would otherwise be skipped.
        doc_date = self.get_preface_tag(debate, 'docDate')
        if doc_date is not None:
            self.start_date = dateutil.parse(doc_date.get('date'))

        title_element = self.get_preface_tag(debate, 'docTitle')
        title = None if title_element is None else title_element.text

        number_element = self.get_preface_tag(debate, 'docNumber')
        number = None if number_element is None else number_element.text

        legislature_element = self.get_preface_tag(debate, 'legislature')
        legislature = (None if legislature_element is None
                       else legislature_element.text)

        session_element = self.get_preface_tag(debate, 'session')
        session = None if session_element is None else session_element.text

        section = None
        if title:
            kwargs = {
                'parent': None,
                'heading': title,
                'start_date': self.start_date,
                'number': number or '',
                'legislature': legislature or '',
                'session': session or '',
            }

            # If the importer has no opinion on clobbering, just import the
            # section, potentially creating a duplicate section.
            if self.clobber is not None:
                try:
                    section = Section.objects.for_instance(self.instance).get(**kwargs)
                    if self.clobber:
                        logger.info('Clobbering %s' % title)
                        for speech in section.descendant_speeches():
                            speech.delete()
                        section.delete()
                    else:
                        logger.info('Skipping %s' % title)
                        return
                except Section.DoesNotExist:
                    logger.info('Importing %s' % title)

            section = self.make(Section, **kwargs)

        self.visit(debate.debateBody, section)
    def parse_document(self):
        """Import the document's speakers and front matter, then its body.

        Each ``TLCPerson`` reference is matched to a ``Speaker`` of
        ``self.instance`` by its ``href`` identifier.  A missing speaker is
        built from ``showAs``, counted in ``self.stats`` and, when
        ``self.commit`` is set, saved with an ``'Akoma Ntoso import'``
        identifier.  Speakers are stored in ``self.speakers`` by ``id``.

        A ``docDate`` whose ``date`` attribute is missing or fails to parse
        is logged as a warning and leaves ``self.start_date`` untouched.
        With a ``docTitle``, a top-level section is requested from
        ``self.make_section``; a falsy result ends the import early.
        Otherwise ``debateBody`` is passed to ``self.visit``.

        Returns:
            ``self.stats``; its ``Speaker`` entry is incremented for every
            speaker built by this call.

        Raises:
            Exception: a TLCPerson without an existing speaker has no
                ``showAs`` attribute.
        """
        self.stats = {Speaker: 0}
        debate = self.xml.debate

        if self.ns:
            people = debate.findall(
                'an:meta/an:references/an:TLCPerson',
                namespaces={'an': self.ns},
            )
        else:
            people = debate.findall('meta/references/TLCPerson')

        for person in people or []:
            person_id = person.get('id')
            href = person.get('href')
            try:
                speaker = Speaker.objects.get(instance=self.instance,
                                              identifiers__identifier=href)
            except Speaker.DoesNotExist:
                name = person.get('showAs')
                if not name:
                    raise Exception("TLCPerson '%s' is missing showAs" % href)
                speaker = Speaker(instance=self.instance, name=name)
                self.stats[Speaker] += 1

                if self.commit:
                    speaker.save()
                    speaker.identifiers.create(identifier=href,
                                               scheme='Akoma Ntoso import')

            self.speakers[person_id] = speaker

        doc_date = self.get_preface_tag(debate, 'docDate')
        if doc_date is not None:
            date = doc_date.get('date')
            if not date:
                # Logger.warn is a deprecated alias of Logger.warning.
                logger.warning(
                    "docDate element missing required date attribute")
            else:
                try:
                    self.start_date = dateutil.parse(date)
                except ValueError:
                    logger.warning(
                        "docDate element did not parse '%s'", date)

        # Collect the text of each front-matter tag.  Elements are compared
        # with None because an lxml element's truth value reflects its
        # content rather than whether it was found.
        front = {}
        for tag in ('docTitle', 'docNumber', 'legislature', 'session'):
            element = self.get_preface_tag(debate, tag)
            front[tag] = element.text if element is not None else None

        link = self.get_preface_tag(debate, 'link')
        source_url = None
        if link is not None:
            source_url = link.get('href')

        self.imported_section_ids = set()

        section = None
        if front['docTitle']:
            section = self.make_section(
                source_url=source_url or '',
                parent=None,
                heading=front['docTitle'],
                start_date=self.start_date,
                number=front['docNumber'] or '',
                legislature=front['legislature'] or '',
                session=front['session'] or '',
            )

            if not section:
                return self.stats

        self.visit(debate.debateBody, section)
        return self.stats