def import_doc(self, info):
    """Import the meeting document described by ``info`` into the database.

    The document is skipped when a stored copy is at least as recent as
    ``info['last_modified']`` (unless the ``full_update`` option is set).
    Otherwise the policymaker and meeting are looked up (and created when
    missing), the document is downloaded and parsed, its cleaned XML is
    written under ``self.xml_path``, the ``MeetingDocument`` row is saved,
    the meeting's stored agenda items are reconciled against the parsed
    ones, and every parsed item is handed to ``self.store_issue``.

    Args:
        info: Mapping describing the document. Keys read here:
            ``origin_id``, ``last_modified``, ``date`` (dash-separated
            year-month-day integers), ``policymaker_id``, ``meeting_nr``,
            ``org``, ``policymaker``, ``url``, ``doc_type`` and,
            optionally, ``policymaker_abbr``.

    Raises:
        ParseError: Re-raised from ``adoc.import_from_zip`` after the
            failure has been logged and ``origin_id`` appended to
            ``self.failed_import_list``.
        AssertionError: The parsed document type is ``'agenda'`` or
            ``'minutes'`` but differs from ``info['doc_type']``.
        Exception: The meeting date disagrees with the document date and
            this document is not newer than the meeting's most recently
            modified document, or the parsed policymaker id differs from
            ``info['policymaker_id']``.
    """
    origin_id = info['origin_id']
    try:
        doc = MeetingDocument.objects.get(origin_id=origin_id)
    except MeetingDocument.DoesNotExist:
        print("Adding new document %s" % origin_id)
        doc = MeetingDocument(origin_id=origin_id)
    else:
        if not self.options['full_update'] and doc.last_modified_time >= info['last_modified']:
            if self.verbosity >= 2:
                self.logger.info("Up-to-date document %s (last modified %s)" %
                                 (origin_id, info['last_modified']))
            return
        print("Re-importing document %s" % origin_id)

    d = [int(x) for x in info['date'].split('-')]
    doc_date = datetime.date(*d)

    try:
        policymaker = Policymaker.objects.get(origin_id=info['policymaker_id'])
    except Policymaker.DoesNotExist:
        # No policymaker yet: build one from the organization that shares
        # the same origin id and link the two together.
        org = Organization.objects.get(origin_id=info['policymaker_id'])
        print("Creating new policymaker for %s" % org)
        args = {'name': org.name_fi, 'abbreviation': org.abbreviation,
                'type': org.type, 'origin_id': info['policymaker_id']}
        policymaker = Policymaker(**args)
        policymaker.slug = org.slug
        policymaker.save()
        org.policymaker = policymaker
        org.save(update_fields=['policymaker'])

    if not policymaker.abbreviation and 'policymaker_abbr' in info:
        self.logger.info("Saving abbreviation '%s' for %s" %
                         (info['policymaker_abbr'], policymaker))
        policymaker.abbreviation = info['policymaker_abbr']
        policymaker.save()

    args = {'policymaker': policymaker, 'number': info['meeting_nr'],
            'year': doc_date.year}
    try:
        meeting = Meeting.objects.get(**args)
    except Meeting.DoesNotExist:
        meeting = Meeting(**args)
        meeting.minutes = False
        meeting.date = info['date']
        meeting.save()

    doc.meeting = meeting
    doc.organisation = info['org']
    doc.policymaker = info['policymaker']
    doc.date = doc_date
    if str(meeting.date) != str(doc.date):
        # If the new meeting date comes from a document with the latest modification
        # time, assume the earlier meeting date is incorrect. Otherwise, bail out.
        latest_doc = meeting.meetingdocument_set.order_by('-last_modified_time')[0]
        if info['last_modified'] > latest_doc.last_modified_time:
            self.logger.warning("Fixing date mismatch between doc and meeting (%s vs. %s)" %
                                (meeting.date, doc.date))
            meeting.date = doc.date
            meeting.save(update_fields=['date'])
        else:
            raise Exception("Date mismatch between doc and meeting (%s vs. %s)" %
                            (meeting.date, doc.date))
    doc.meeting_nr = info['meeting_nr']
    doc.origin_url = info['url']

    adoc = AhjoDocument(verbosity=self.verbosity, options=self.options)
    zipf = self.scanner.download_document(info)
    try:
        adoc.import_from_zip(zipf)
    except ParseError as e:
        self.logger.error("Error importing document %s" % origin_id, exc_info=e)
        self.failed_import_list.append(origin_id)
        raise

    # Check the parsed type before touching the output file, so that a
    # mismatch cannot leave a truncated XML file behind.
    doc.type = adoc.type
    if doc.type in ('agenda', 'minutes'):
        assert info['doc_type'] == doc.type

    fname = info['origin_id'] + '.xml'
    print("Storing cleaned XML to %s" % fname)
    # The context manager closes the file even if writing the XML raises.
    with open(os.path.join(self.xml_path, fname), 'w') as xmlf:
        adoc.output_cleaned_xml(xmlf)

    doc.xml_file = os.path.join(settings.AHJO_PATHS['xml'], fname)
    doc.publish_time = adoc.publish_time
    doc.last_modified_time = info['last_modified']
    doc.save()

    if info['policymaker_id'] != adoc.policymaker_id:
        raise Exception("Policymaker id mismatch (%s vs. %s)" %
                        (info['policymaker_id'], adoc.policymaker_id))

    if meeting.minutes and info['doc_type'] == 'agenda':
        self.logger.info("Skipping agenda doc because minutes already exists")
        return

    # Perform some sanity checks.
    existing_ais = AgendaItem.objects.filter(meeting=meeting).order_by('index')
    existing_count = existing_ais.count()
    if existing_count > len(adoc.items):
        self.logger.warning("More agenda items in DB (%d) than in document (%d)" %
                            (existing_count, len(adoc.items)))
        existing_ais.delete()

    # Warn about register ids that occur on more than one incoming item.
    register_ids = set()
    for adi in adoc.items:
        register_id = adi.get('register_id', None)
        if register_id is None:
            continue
        if register_id in register_ids:
            self.logger.warning("Issue %s listed more than twice in a meeting" % register_id)
        else:
            register_ids.add(register_id)

    for ai in existing_ais:
        # Find the incoming item carrying the same number as the stored one.
        for adi in adoc.items:
            if adi['number'] == ai.index:
                break
        else:
            self.logger.warning("Agenda item %s not found in incoming items" % ai)
            ai.should_delete = True
            # NOTE(review): execution falls through here with ``adi`` still
            # bound to the last element of adoc.items, so the register-id
            # comparison below runs against an unrelated item. Confirm
            # whether a ``continue`` was intended.

        if ai.issue is not None:
            obj_register_id = ai.issue.register_id
        else:
            obj_register_id = None
        # Read the id once with .get() so that the log line below cannot
        # raise KeyError for an incoming item without a register id.
        doc_register_id = adi.get('register_id', None)
        if doc_register_id != obj_register_id:
            self.logger.warning("Issue mismatch at index %d: %s vs. %s" %
                                (ai.index, doc_register_id, obj_register_id))
            AgendaItem.objects.filter(meeting=meeting, index__gte=ai.index).delete()
            break

    # Presumably relies on the queryset handing back the same instances
    # that were flagged above -- TODO confirm for the Django version in use.
    for ai in existing_ais:
        if getattr(ai, 'should_delete', False):
            self.logger.warning("Deleting stale agenda item %s" % ai)
            ai.delete()

    for issue in adoc.items:
        self.store_issue(meeting, doc, issue, adoc)

    if doc.type == 'minutes':
        meeting.minutes = True
        meeting.save()

    if not self.options['no_videos']:
        self.import_videos(meeting)
def import_doc(self, info):
    """Import the meeting document described by ``info`` into the database.

    The document is skipped when a stored copy is at least as recent as
    ``info['last_modified']`` (unless the ``full_update`` option is set).
    Otherwise the policymaker is looked up, the meeting is looked up or
    created, the document is downloaded and parsed, its cleaned XML is
    written under ``self.xml_path``, the ``MeetingDocument`` row is saved,
    the meeting's stored agenda items are compared position by position
    with the parsed ones, and every parsed item is handed to
    ``self.store_issue``.

    Args:
        info: Mapping describing the document. Keys read here:
            ``origin_id``, ``last_modified``, ``date`` (dash-separated
            year-month-day integers), ``policymaker_id``, ``meeting_nr``,
            ``org``, ``policymaker``, ``url``, ``doc_type`` and,
            optionally, ``policymaker_abbr``.

    Raises:
        Policymaker.DoesNotExist: No policymaker has the given
            ``policymaker_id`` (propagates from the ORM lookup).
        ParseError: Re-raised from ``adoc.import_from_zip`` after the
            failure has been logged and ``origin_id`` appended to
            ``self.failed_import_list``.
        AssertionError: The parsed document type is ``'agenda'`` or
            ``'minutes'`` but differs from ``info['doc_type']``.
        Exception: The meeting date differs from the document date, or the
            parsed policymaker id differs from ``info['policymaker_id']``.
    """
    origin_id = info['origin_id']
    try:
        doc = MeetingDocument.objects.get(origin_id=origin_id)
    except MeetingDocument.DoesNotExist:
        print("Adding new document %s" % origin_id)
        doc = MeetingDocument(origin_id=origin_id)
    else:
        if not self.options['full_update'] and doc.last_modified_time >= info['last_modified']:
            if self.verbosity >= 2:
                self.logger.info("Up-to-date document %s (last modified %s)" %
                                 (origin_id, info['last_modified']))
            return
        print("Re-importing document %s" % origin_id)

    d = [int(x) for x in info['date'].split('-')]
    doc_date = datetime.date(*d)

    policymaker = Policymaker.objects.get(origin_id=info['policymaker_id'])
    args = {'policymaker': policymaker, 'number': info['meeting_nr'],
            'year': doc_date.year}
    if not policymaker.abbreviation and 'policymaker_abbr' in info:
        self.logger.info("Saving abbreviation '%s' for %s" %
                         (info['policymaker_abbr'], policymaker))
        policymaker.abbreviation = info['policymaker_abbr']
        policymaker.save()

    try:
        meeting = Meeting.objects.get(**args)
    except Meeting.DoesNotExist:
        meeting = Meeting(**args)
        meeting.minutes = False
        meeting.date = info['date']
        meeting.save()

    doc.meeting = meeting
    doc.organisation = info['org']
    doc.policymaker = info['policymaker']
    doc.date = doc_date
    if str(meeting.date) != str(doc.date):
        raise Exception("Date mismatch between doc and meeting (%s vs. %s)" %
                        (meeting.date, doc.date))
    doc.meeting_nr = info['meeting_nr']
    doc.origin_url = info['url']

    adoc = AhjoDocument(verbosity=self.verbosity, options=self.options)
    zipf = self.scanner.download_document(info)
    try:
        adoc.import_from_zip(zipf)
    except ParseError as e:
        self.logger.error("Error importing document %s" % origin_id, exc_info=e)
        self.failed_import_list.append(origin_id)
        raise

    # Check the parsed type before touching the output file, so that a
    # mismatch cannot leave a truncated XML file behind.
    doc.type = adoc.type
    if doc.type in ('agenda', 'minutes'):
        assert info['doc_type'] == doc.type

    fname = info['origin_id'] + '.xml'
    print("Storing cleaned XML to %s" % fname)
    # The context manager closes the file even if writing the XML raises.
    with open(os.path.join(self.xml_path, fname), 'w') as xmlf:
        adoc.output_cleaned_xml(xmlf)

    doc.xml_file = os.path.join(settings.AHJO_PATHS['xml'], fname)
    doc.publish_time = adoc.publish_time
    doc.last_modified_time = info['last_modified']
    doc.save()

    if info['policymaker_id'] != adoc.policymaker_id:
        raise Exception("Policymaker id mismatch (%s vs. %s)" %
                        (info['policymaker_id'], adoc.policymaker_id))

    if meeting.minutes and info['doc_type'] == 'agenda':
        self.logger.info("Skipping agenda doc because minutes already exists")
        return

    # Perform some sanity checks.
    existing_ais = AgendaItem.objects.filter(meeting=meeting).order_by('index')
    existing_count = existing_ais.count()
    if existing_count > len(adoc.items):
        self.logger.warning("More agenda items in DB (%d) than in document (%d)" %
                            (existing_count, len(adoc.items)))
        existing_ais.delete()

    # Stored items are matched to parsed items by position. The first
    # mismatch removes that stored item and everything after it.
    for idx, ai in enumerate(existing_ais):
        adi = adoc.items[idx]
        # Compare None-safely: a stored item may have no issue and an
        # incoming item may carry no register id; neither should crash.
        doc_register_id = adi.get('register_id', None)
        if ai.issue is not None:
            obj_register_id = ai.issue.register_id
        else:
            obj_register_id = None
        if doc_register_id == obj_register_id and adi['number'] == ai.index:
            continue
        self.logger.warning("Issue mismatch at index %d: %s vs. %s" %
                            (idx, doc_register_id, obj_register_id))
        AgendaItem.objects.filter(meeting=meeting, index__gte=ai.index).delete()
        break

    for issue in adoc.items:
        self.store_issue(meeting, doc, issue, adoc)

    if doc.type == 'minutes':
        meeting.minutes = True
        meeting.save()

    if not self.options['no_videos']:
        self.import_videos(meeting)
def import_doc(self, info):
    """Import the meeting document described by ``info`` into the database.

    Steps, in order:

    1. Return early when a stored ``MeetingDocument`` with the same origin
       id is at least as recent as ``info['last_modified']`` and the
       ``full_update`` option is off.
    2. Look up the policymaker, creating it from the matching
       ``Organization`` when it does not exist, and fill in a missing
       abbreviation from ``info['policymaker_abbr']`` when provided.
    3. Look up or create the meeting and reconcile its date with the
       document date.
    4. Download and parse the document, write the cleaned XML under
       ``self.xml_path`` and save the ``MeetingDocument``.
    5. Reconcile stored agenda items with the parsed ones, then pass every
       parsed item to ``self.store_issue``.
    6. Flag the meeting as having minutes when the document is of that
       type, and import videos unless the ``no_videos`` option is set.

    Args:
        info: Mapping describing the document. Keys read here:
            ``origin_id``, ``last_modified``, ``date`` (dash-separated
            year-month-day integers), ``policymaker_id``, ``meeting_nr``,
            ``org``, ``policymaker``, ``url``, ``doc_type`` and,
            optionally, ``policymaker_abbr``.

    Raises:
        ParseError: Re-raised from ``adoc.import_from_zip`` after the
            failure has been logged and ``origin_id`` appended to
            ``self.failed_import_list``.
        AssertionError: The parsed document type is ``'agenda'`` or
            ``'minutes'`` but differs from ``info['doc_type']``.
        Exception: The meeting date disagrees with the document date and
            this document is not newer than the meeting's most recently
            modified document, or the parsed policymaker id differs from
            ``info['policymaker_id']``.
    """
    origin_id = info['origin_id']
    try:
        doc = MeetingDocument.objects.get(origin_id=origin_id)
    except MeetingDocument.DoesNotExist:
        print("Adding new document %s" % origin_id)
        doc = MeetingDocument(origin_id=origin_id)
    else:
        if not self.options['full_update'] and doc.last_modified_time >= info['last_modified']:
            if self.verbosity >= 2:
                self.logger.info(
                    "Up-to-date document %s (last modified %s)" %
                    (origin_id, info['last_modified']))
            return
        print("Re-importing document %s" % origin_id)

    d = [int(x) for x in info['date'].split('-')]
    doc_date = datetime.date(*d)

    try:
        policymaker = Policymaker.objects.get(
            origin_id=info['policymaker_id'])
    except Policymaker.DoesNotExist:
        # No policymaker yet: build one from the organization that shares
        # the same origin id and link the two together.
        org = Organization.objects.get(origin_id=info['policymaker_id'])
        print("Creating new policymaker for %s" % org)
        args = {
            'name': org.name_fi,
            'abbreviation': org.abbreviation,
            'type': org.type,
            'origin_id': info['policymaker_id']
        }
        policymaker = Policymaker(**args)
        policymaker.slug = org.slug
        policymaker.save()
        org.policymaker = policymaker
        org.save(update_fields=['policymaker'])

    if not policymaker.abbreviation and 'policymaker_abbr' in info:
        self.logger.info("Saving abbreviation '%s' for %s" %
                         (info['policymaker_abbr'], policymaker))
        policymaker.abbreviation = info['policymaker_abbr']
        policymaker.save()

    args = {
        'policymaker': policymaker,
        'number': info['meeting_nr'],
        'year': doc_date.year
    }
    try:
        meeting = Meeting.objects.get(**args)
    except Meeting.DoesNotExist:
        meeting = Meeting(**args)
        meeting.minutes = False
        meeting.date = info['date']
        meeting.save()

    doc.meeting = meeting
    doc.organisation = info['org']
    doc.policymaker = info['policymaker']
    doc.date = doc_date
    if str(meeting.date) != str(doc.date):
        # If the new meeting date comes from a document with the latest modification
        # time, assume the earlier meeting date is incorrect. Otherwise, bail out.
        latest_doc = meeting.meetingdocument_set.order_by(
            '-last_modified_time')[0]
        if info['last_modified'] > latest_doc.last_modified_time:
            self.logger.warning(
                "Fixing date mismatch between doc and meeting (%s vs. %s)" %
                (meeting.date, doc.date))
            meeting.date = doc.date
            meeting.save(update_fields=['date'])
        else:
            raise Exception(
                "Date mismatch between doc and meeting (%s vs. %s)" %
                (meeting.date, doc.date))
    doc.meeting_nr = info['meeting_nr']
    doc.origin_url = info['url']

    adoc = AhjoDocument(verbosity=self.verbosity, options=self.options)
    zipf = self.scanner.download_document(info)
    try:
        adoc.import_from_zip(zipf)
    except ParseError as e:
        self.logger.error("Error importing document %s" % origin_id,
                          exc_info=e)
        self.failed_import_list.append(origin_id)
        raise

    # Check the parsed type before touching the output file, so that a
    # mismatch cannot leave a truncated XML file behind.
    doc.type = adoc.type
    if doc.type in ('agenda', 'minutes'):
        assert info['doc_type'] == doc.type

    fname = info['origin_id'] + '.xml'
    print("Storing cleaned XML to %s" % fname)
    # The context manager closes the file even if writing the XML raises.
    with open(os.path.join(self.xml_path, fname), 'w') as xmlf:
        adoc.output_cleaned_xml(xmlf)

    doc.xml_file = os.path.join(settings.AHJO_PATHS['xml'], fname)
    doc.publish_time = adoc.publish_time
    doc.last_modified_time = info['last_modified']
    doc.save()

    if info['policymaker_id'] != adoc.policymaker_id:
        raise Exception("Policymaker id mismatch (%s vs. %s)" %
                        (info['policymaker_id'], adoc.policymaker_id))

    if meeting.minutes and info['doc_type'] == 'agenda':
        self.logger.info(
            "Skipping agenda doc because minutes already exists")
        return

    # Perform some sanity checks.
    existing_ais = AgendaItem.objects.filter(
        meeting=meeting).order_by('index')
    existing_count = existing_ais.count()
    if existing_count > len(adoc.items):
        self.logger.warning(
            "More agenda items in DB (%d) than in document (%d)" %
            (existing_count, len(adoc.items)))
        existing_ais.delete()

    # Warn about register ids that occur on more than one incoming item.
    register_ids = set()
    for adi in adoc.items:
        register_id = adi.get('register_id', None)
        if register_id is None:
            continue
        if register_id in register_ids:
            self.logger.warning(
                "Issue %s listed more than twice in a meeting" %
                register_id)
        else:
            register_ids.add(register_id)

    for ai in existing_ais:
        # Find the incoming item carrying the same number as the stored one.
        for adi in adoc.items:
            if adi['number'] == ai.index:
                break
        else:
            self.logger.warning(
                "Agenda item %s not found in incoming items" % ai)
            ai.should_delete = True
            # NOTE(review): execution falls through here with ``adi`` still
            # bound to the last element of adoc.items, so the register-id
            # comparison below runs against an unrelated item. Confirm
            # whether a ``continue`` was intended.

        if ai.issue is not None:
            obj_register_id = ai.issue.register_id
        else:
            obj_register_id = None
        # Read the id once with .get() so that the log line below cannot
        # raise KeyError for an incoming item without a register id.
        doc_register_id = adi.get('register_id', None)
        if doc_register_id != obj_register_id:
            self.logger.warning(
                "Issue mismatch at index %d: %s vs. %s" %
                (ai.index, doc_register_id, obj_register_id))
            AgendaItem.objects.filter(meeting=meeting,
                                      index__gte=ai.index).delete()
            break

    # Presumably relies on the queryset handing back the same instances
    # that were flagged above -- TODO confirm for the Django version in use.
    for ai in existing_ais:
        if getattr(ai, 'should_delete', False):
            self.logger.warning("Deleting stale agenda item %s" % ai)
            ai.delete()

    for issue in adoc.items:
        self.store_issue(meeting, doc, issue, adoc)

    if doc.type == 'minutes':
        meeting.minutes = True
        meeting.save()

    if not self.options['no_videos']:
        self.import_videos(meeting)