Пример #1
0
    def import_doc(self, info):
        """Import one meeting document and reconcile its agenda items.

        Looks up (or creates) the ``MeetingDocument``, ``Policymaker`` and
        ``Meeting`` rows described by ``info``, downloads and parses the
        document, writes the cleaned XML under ``self.xml_path``, saves the
        document row and finally reconciles the meeting's ``AgendaItem`` rows
        with the parsed items before passing every parsed item to
        ``self.store_issue``.

        Args:
            info: Mapping describing the document. Keys read here are
                ``origin_id``, ``last_modified``, ``date`` (year, month and
                day separated by ``-``), ``policymaker_id``, ``meeting_nr``,
                ``org``, ``policymaker``, ``url``, ``doc_type`` and,
                optionally, ``policymaker_abbr``.

        Returns:
            None. Returns early when the stored document is at least as new
            as ``info['last_modified']`` (unless the ``full_update`` option
            is set), or when an agenda arrives for a meeting that already
            has minutes.

        Raises:
            ParseError: Re-raised from ``adoc.import_from_zip`` after the
                error is logged and the id is appended to
                ``self.failed_import_list``.
            AssertionError: If the parsed type is ``'agenda'`` or
                ``'minutes'`` and differs from ``info['doc_type']``.
            Exception: On an unresolvable meeting date mismatch, or when the
                parsed policymaker id differs from ``info['policymaker_id']``.
        """
        origin_id = info['origin_id']
        try:
            doc = MeetingDocument.objects.get(origin_id=origin_id)
        except MeetingDocument.DoesNotExist:
            print("Adding new document %s" % origin_id)
            doc = MeetingDocument(origin_id=origin_id)
        else:
            if not self.options['full_update'] and doc.last_modified_time >= info['last_modified']:
                if self.verbosity >= 2:
                    self.logger.info("Up-to-date document %s (last modified %s)" % (origin_id, info['last_modified']))
                return
            print("Re-importing document %s" % origin_id)

        doc_date = datetime.date(*[int(x) for x in info['date'].split('-')])

        try:
            policymaker = Policymaker.objects.get(origin_id=info['policymaker_id'])
        except Policymaker.DoesNotExist:
            # No policymaker yet: build one from the organization that shares
            # the same origin id. Organization.DoesNotExist propagates.
            org = Organization.objects.get(origin_id=info['policymaker_id'])
            print("Creating new policymaker for %s" % org)
            args = {'name': org.name_fi, 'abbreviation': org.abbreviation,
                    'type': org.type, 'origin_id': info['policymaker_id']}
            policymaker = Policymaker(**args)
            policymaker.slug = org.slug
            policymaker.save()
            org.policymaker = policymaker
            org.save(update_fields=['policymaker'])

        if not policymaker.abbreviation and 'policymaker_abbr' in info:
            self.logger.info("Saving abbreviation '%s' for %s" % (info['policymaker_abbr'], policymaker))
            policymaker.abbreviation = info['policymaker_abbr']
            policymaker.save()

        args = {'policymaker': policymaker, 'number': info['meeting_nr'],
                'year': doc_date.year}
        try:
            meeting = Meeting.objects.get(**args)
        except Meeting.DoesNotExist:
            meeting = Meeting(**args)
            meeting.minutes = False
            meeting.date = info['date']
            meeting.save()

        doc.meeting = meeting
        doc.organisation = info['org']
        doc.policymaker = info['policymaker']
        doc.date = doc_date
        if str(meeting.date) != str(doc.date):
            # If the new meeting date comes from a document with the latest
            # modification time, assume the earlier meeting date is incorrect.
            # Otherwise, bail out. A meeting without any stored document has
            # nothing backing its current date, so the incoming date wins
            # there as well (indexing [0] used to raise IndexError).
            latest_docs = list(meeting.meetingdocument_set.order_by('-last_modified_time')[:1])
            if not latest_docs or info['last_modified'] > latest_docs[0].last_modified_time:
                self.logger.warning("Fixing date mismatch between doc and meeting (%s vs. %s)" % (meeting.date, doc.date))
                meeting.date = doc.date
                meeting.save(update_fields=['date'])
            else:
                raise Exception("Date mismatch between doc and meeting (%s vs. %s)" % (meeting.date, doc.date))
        doc.meeting_nr = info['meeting_nr']
        doc.origin_url = info['url']

        adoc = AhjoDocument(verbosity=self.verbosity, options=self.options)
        zipf = self.scanner.download_document(info)
        try:
            adoc.import_from_zip(zipf)
        except ParseError as e:
            self.logger.error("Error importing document %s" % origin_id, exc_info=e)
            self.failed_import_list.append(origin_id)
            raise

        fname = info['origin_id'] + '.xml'
        print("Storing cleaned XML to %s" % fname)
        doc.type = adoc.type
        # Check the type before opening the target file so that a mismatch
        # neither truncates an existing file nor leaks an open handle.
        # NOTE(review): assert is stripped under ``python -O``.
        if doc.type in ('agenda', 'minutes'):
            assert info['doc_type'] == doc.type
        with open(os.path.join(self.xml_path, fname), 'w') as xmlf:
            adoc.output_cleaned_xml(xmlf)
        doc.xml_file = os.path.join(settings.AHJO_PATHS['xml'], fname)
        doc.publish_time = adoc.publish_time
        doc.last_modified_time = info['last_modified']
        doc.save()

        # NOTE(review): the document row is already saved at this point, so a
        # mismatch below leaves it marked as up to date -- confirm intended.
        if info['policymaker_id'] != adoc.policymaker_id:
            raise Exception("Policymaker id mismatch (%s vs. %s)" % (info['policymaker_id'], adoc.policymaker_id))

        if meeting.minutes and info['doc_type'] == 'agenda':
            self.logger.info("Skipping agenda doc because minutes already exists")
            return

        # Perform some sanity checks.
        existing_ais = AgendaItem.objects.filter(meeting=meeting).order_by('index')
        existing_count = existing_ais.count()
        if existing_count > len(adoc.items):
            self.logger.warning("More agenda items in DB (%d) than in document (%d)" % (existing_count, len(adoc.items)))
            existing_ais.delete()

        register_ids = set()
        for adi in adoc.items:
            register_id = adi.get('register_id', None)
            if register_id is None:
                continue
            if register_id in register_ids:
                # Fires on the second occurrence, hence "more than once".
                self.logger.warning("Issue %s listed more than once in a meeting" % register_id)
            else:
                register_ids.add(register_id)

        stale_ais = []
        for ai in existing_ais:
            # First incoming item with the same number, or None.
            incoming = next((item for item in adoc.items if item['number'] == ai.index), None)
            if incoming is None:
                self.logger.warning("Agenda item %s not found in incoming items" % ai)
                stale_ais.append(ai)
                # There is no counterpart to compare against; previously the
                # check below ran against an unrelated leftover loop variable.
                continue

            obj_register_id = ai.issue.register_id if ai.issue is not None else None
            incoming_register_id = incoming.get('register_id', None)
            if incoming_register_id != obj_register_id:
                self.logger.warning("Issue mismatch at index %d: %s vs. %s" % (ai.index, incoming_register_id, obj_register_id))
                # Drop this item and everything after it.
                AgendaItem.objects.filter(meeting=meeting, index__gte=ai.index).delete()
                break

        for ai in stale_ais:
            self.logger.warning("Deleting stale agenda item %s" % ai)
            ai.delete()

        for issue in adoc.items:
            self.store_issue(meeting, doc, issue, adoc)

        if doc.type == 'minutes':
            meeting.minutes = True
            meeting.save()

        if not self.options['no_videos']:
            self.import_videos(meeting)
Пример #2
0
    def import_doc(self, info):
        """Import one meeting document and reconcile its agenda items.

        Looks up (or creates) the ``MeetingDocument`` and ``Meeting`` rows
        described by ``info``, downloads and parses the document, writes the
        cleaned XML under ``self.xml_path``, saves the document row and then
        compares the meeting's ``AgendaItem`` rows with the parsed items
        before passing every parsed item to ``self.store_issue``.

        Args:
            info: Mapping describing the document. Keys read here are
                ``origin_id``, ``last_modified``, ``date`` (year, month and
                day separated by ``-``), ``policymaker_id``, ``meeting_nr``,
                ``org``, ``policymaker``, ``url``, ``doc_type`` and,
                optionally, ``policymaker_abbr``.

        Returns:
            None. Returns early when the stored document is at least as new
            as ``info['last_modified']`` (unless the ``full_update`` option
            is set), or when an agenda arrives for a meeting that already
            has minutes.

        Raises:
            Policymaker.DoesNotExist: Propagated when no policymaker has the
                given ``policymaker_id``.
            ParseError: Re-raised from ``adoc.import_from_zip`` after the
                error is logged and the id is appended to
                ``self.failed_import_list``.
            AssertionError: If the parsed type is ``'agenda'`` or
                ``'minutes'`` and differs from ``info['doc_type']``.
            Exception: When the meeting date differs from the document date,
                or when the parsed policymaker id differs from
                ``info['policymaker_id']``.
        """
        origin_id = info['origin_id']
        try:
            doc = MeetingDocument.objects.get(origin_id=origin_id)
        except MeetingDocument.DoesNotExist:
            print("Adding new document %s" % origin_id)
            doc = MeetingDocument(origin_id=origin_id)
        else:
            if not self.options['full_update'] and doc.last_modified_time >= info['last_modified']:
                if self.verbosity >= 2:
                    self.logger.info("Up-to-date document %s (last modified %s)" % (origin_id, info['last_modified']))
                return
            print("Re-importing document %s" % origin_id)

        doc_date = datetime.date(*[int(x) for x in info['date'].split('-')])

        policymaker = Policymaker.objects.get(origin_id=info['policymaker_id'])
        args = {'policymaker': policymaker, 'number': info['meeting_nr'],
                'year': doc_date.year}
        if not policymaker.abbreviation and 'policymaker_abbr' in info:
            self.logger.info("Saving abbreviation '%s' for %s" % (info['policymaker_abbr'], policymaker))
            policymaker.abbreviation = info['policymaker_abbr']
            policymaker.save()
        try:
            meeting = Meeting.objects.get(**args)
        except Meeting.DoesNotExist:
            meeting = Meeting(**args)
            meeting.minutes = False
            meeting.date = info['date']
            meeting.save()

        doc.meeting = meeting
        doc.organisation = info['org']
        doc.policymaker = info['policymaker']
        doc.date = doc_date
        if str(meeting.date) != str(doc.date):
            raise Exception("Date mismatch between doc and meeting (%s vs. %s)" % (meeting.date, doc.date))
        doc.meeting_nr = info['meeting_nr']
        doc.origin_url = info['url']

        adoc = AhjoDocument(verbosity=self.verbosity, options=self.options)
        zipf = self.scanner.download_document(info)
        try:
            adoc.import_from_zip(zipf)
        except ParseError as e:
            self.logger.error("Error importing document %s" % origin_id, exc_info=e)
            self.failed_import_list.append(origin_id)
            raise

        fname = info['origin_id'] + '.xml'
        print("Storing cleaned XML to %s" % fname)
        doc.type = adoc.type
        # Check the type before opening the target file so that a mismatch
        # neither truncates an existing file nor leaks an open handle.
        # NOTE(review): assert is stripped under ``python -O``.
        if doc.type in ('agenda', 'minutes'):
            assert info['doc_type'] == doc.type
        with open(os.path.join(self.xml_path, fname), 'w') as xmlf:
            adoc.output_cleaned_xml(xmlf)
        doc.xml_file = os.path.join(settings.AHJO_PATHS['xml'], fname)
        doc.publish_time = adoc.publish_time
        doc.last_modified_time = info['last_modified']
        doc.save()

        # NOTE(review): the document row is already saved at this point, so a
        # mismatch below leaves it marked as up to date -- confirm intended.
        if info['policymaker_id'] != adoc.policymaker_id:
            raise Exception("Policymaker id mismatch (%s vs. %s)" % (info['policymaker_id'], adoc.policymaker_id))

        if meeting.minutes and info['doc_type'] == 'agenda':
            self.logger.info("Skipping agenda doc because minutes already exists")
            return

        # Perform some sanity checks.
        existing_ais = AgendaItem.objects.filter(meeting=meeting).order_by('index')
        existing_count = existing_ais.count()
        if existing_count > len(adoc.items):
            self.logger.warning("More agenda items in DB (%d) than in document (%d)" % (existing_count, len(adoc.items)))
            existing_ais.delete()
        # Existing items are compared positionally with the incoming ones; the
        # count check above keeps ``idx`` within ``adoc.items``.
        for idx, ai in enumerate(existing_ais):
            adi = adoc.items[idx]
            # Tolerate agenda items without an issue and incoming items
            # without a register id instead of failing with
            # AttributeError/KeyError. Assumes incoming items are dict-like.
            obj_register_id = ai.issue.register_id if ai.issue is not None else None
            incoming_register_id = adi.get('register_id', None)
            if incoming_register_id == obj_register_id and adi['number'] == ai.index:
                continue
            self.logger.warning("Issue mismatch at index %d: %s vs. %s" % (idx, incoming_register_id, obj_register_id))
            # Drop this item and everything after it.
            AgendaItem.objects.filter(meeting=meeting, index__gte=ai.index).delete()
            break

        for issue in adoc.items:
            self.store_issue(meeting, doc, issue, adoc)

        if doc.type == 'minutes':
            meeting.minutes = True
            meeting.save()

        if not self.options['no_videos']:
            self.import_videos(meeting)
Пример #3
0
    def import_doc(self, info):
        """Import one meeting document and reconcile its agenda items.

        Looks up (or creates) the ``MeetingDocument``, ``Policymaker`` and
        ``Meeting`` rows described by ``info``, downloads and parses the
        document, writes the cleaned XML under ``self.xml_path``, saves the
        document row and finally reconciles the meeting's ``AgendaItem`` rows
        with the parsed items before passing every parsed item to
        ``self.store_issue``.

        Args:
            info: Mapping describing the document. Keys read here are
                ``origin_id``, ``last_modified``, ``date`` (year, month and
                day separated by ``-``), ``policymaker_id``, ``meeting_nr``,
                ``org``, ``policymaker``, ``url``, ``doc_type`` and,
                optionally, ``policymaker_abbr``.

        Returns:
            None. Returns early when the stored document is at least as new
            as ``info['last_modified']`` (unless the ``full_update`` option
            is set), or when an agenda arrives for a meeting that already
            has minutes.

        Raises:
            ParseError: Re-raised from ``adoc.import_from_zip`` after the
                error is logged and the id is appended to
                ``self.failed_import_list``.
            AssertionError: If the parsed type is ``'agenda'`` or
                ``'minutes'`` and differs from ``info['doc_type']``.
            Exception: On an unresolvable meeting date mismatch, or when the
                parsed policymaker id differs from ``info['policymaker_id']``.
        """
        origin_id = info['origin_id']
        try:
            doc = MeetingDocument.objects.get(origin_id=origin_id)
        except MeetingDocument.DoesNotExist:
            print("Adding new document %s" % origin_id)
            doc = MeetingDocument(origin_id=origin_id)
        else:
            up_to_date = doc.last_modified_time >= info['last_modified']
            if not self.options['full_update'] and up_to_date:
                if self.verbosity >= 2:
                    self.logger.info(
                        "Up-to-date document %s (last modified %s)" %
                        (origin_id, info['last_modified']))
                return
            print("Re-importing document %s" % origin_id)

        doc_date = datetime.date(*[int(x) for x in info['date'].split('-')])

        try:
            policymaker = Policymaker.objects.get(
                origin_id=info['policymaker_id'])
        except Policymaker.DoesNotExist:
            # No policymaker yet: build one from the organization that shares
            # the same origin id. Organization.DoesNotExist propagates.
            org = Organization.objects.get(origin_id=info['policymaker_id'])
            print("Creating new policymaker for %s" % org)
            args = {
                'name': org.name_fi,
                'abbreviation': org.abbreviation,
                'type': org.type,
                'origin_id': info['policymaker_id']
            }
            policymaker = Policymaker(**args)
            policymaker.slug = org.slug
            policymaker.save()
            org.policymaker = policymaker
            org.save(update_fields=['policymaker'])

        if not policymaker.abbreviation and 'policymaker_abbr' in info:
            self.logger.info("Saving abbreviation '%s' for %s" %
                             (info['policymaker_abbr'], policymaker))
            policymaker.abbreviation = info['policymaker_abbr']
            policymaker.save()

        args = {
            'policymaker': policymaker,
            'number': info['meeting_nr'],
            'year': doc_date.year
        }
        try:
            meeting = Meeting.objects.get(**args)
        except Meeting.DoesNotExist:
            meeting = Meeting(**args)
            meeting.minutes = False
            meeting.date = info['date']
            meeting.save()

        doc.meeting = meeting
        doc.organisation = info['org']
        doc.policymaker = info['policymaker']
        doc.date = doc_date
        if str(meeting.date) != str(doc.date):
            # If the new meeting date comes from a document with the latest
            # modification time, assume the earlier meeting date is incorrect.
            # Otherwise, bail out. A meeting without any stored document has
            # nothing backing its current date, so the incoming date wins
            # there as well (indexing [0] used to raise IndexError).
            latest_docs = list(
                meeting.meetingdocument_set.order_by(
                    '-last_modified_time')[:1])
            if (not latest_docs or
                    info['last_modified'] > latest_docs[0].last_modified_time):
                self.logger.warning(
                    "Fixing date mismatch between doc and meeting (%s vs. %s)"
                    % (meeting.date, doc.date))
                meeting.date = doc.date
                meeting.save(update_fields=['date'])
            else:
                raise Exception(
                    "Date mismatch between doc and meeting (%s vs. %s)" %
                    (meeting.date, doc.date))
        doc.meeting_nr = info['meeting_nr']
        doc.origin_url = info['url']

        adoc = AhjoDocument(verbosity=self.verbosity, options=self.options)
        zipf = self.scanner.download_document(info)
        try:
            adoc.import_from_zip(zipf)
        except ParseError as e:
            self.logger.error("Error importing document %s" % origin_id,
                              exc_info=e)
            self.failed_import_list.append(origin_id)
            raise

        fname = info['origin_id'] + '.xml'
        print("Storing cleaned XML to %s" % fname)
        doc.type = adoc.type
        # Check the type before opening the target file so that a mismatch
        # neither truncates an existing file nor leaks an open handle.
        # NOTE(review): assert is stripped under ``python -O``.
        if doc.type in ('agenda', 'minutes'):
            assert info['doc_type'] == doc.type
        with open(os.path.join(self.xml_path, fname), 'w') as xmlf:
            adoc.output_cleaned_xml(xmlf)
        doc.xml_file = os.path.join(settings.AHJO_PATHS['xml'], fname)
        doc.publish_time = adoc.publish_time
        doc.last_modified_time = info['last_modified']
        doc.save()

        # NOTE(review): the document row is already saved at this point, so a
        # mismatch below leaves it marked as up to date -- confirm intended.
        if info['policymaker_id'] != adoc.policymaker_id:
            raise Exception("Policymaker id mismatch (%s vs. %s)" %
                            (info['policymaker_id'], adoc.policymaker_id))

        if meeting.minutes and info['doc_type'] == 'agenda':
            self.logger.info(
                "Skipping agenda doc because minutes already exists")
            return

        # Perform some sanity checks.
        existing_ais = AgendaItem.objects.filter(
            meeting=meeting).order_by('index')
        existing_count = existing_ais.count()
        if existing_count > len(adoc.items):
            self.logger.warning(
                "More agenda items in DB (%d) than in document (%d)" %
                (existing_count, len(adoc.items)))
            existing_ais.delete()

        register_ids = set()
        for adi in adoc.items:
            register_id = adi.get('register_id', None)
            if register_id is None:
                continue
            if register_id in register_ids:
                # Fires on the second occurrence, hence "more than once".
                self.logger.warning(
                    "Issue %s listed more than once in a meeting" %
                    register_id)
            else:
                register_ids.add(register_id)

        stale_ais = []
        for ai in existing_ais:
            # First incoming item with the same number, or None.
            incoming = next(
                (item for item in adoc.items if item['number'] == ai.index),
                None)
            if incoming is None:
                self.logger.warning(
                    "Agenda item %s not found in incoming items" % ai)
                stale_ais.append(ai)
                # There is no counterpart to compare against; previously the
                # check below ran against an unrelated leftover loop variable.
                continue

            if ai.issue is not None:
                obj_register_id = ai.issue.register_id
            else:
                obj_register_id = None
            incoming_register_id = incoming.get('register_id', None)
            if incoming_register_id != obj_register_id:
                self.logger.warning(
                    "Issue mismatch at index %d: %s vs. %s" %
                    (ai.index, incoming_register_id, obj_register_id))
                # Drop this item and everything after it.
                AgendaItem.objects.filter(meeting=meeting,
                                          index__gte=ai.index).delete()
                break

        for ai in stale_ais:
            self.logger.warning("Deleting stale agenda item %s" % ai)
            ai.delete()

        for issue in adoc.items:
            self.store_issue(meeting, doc, issue, adoc)

        if doc.type == 'minutes':
            meeting.minutes = True
            meeting.save()

        if not self.options['no_videos']:
            self.import_videos(meeting)