def body(self, lib_object: JSON, body: Body) -> Body:
    """Populate the body-specific fields of ``body`` from ``lib_object``.

    Normalizes the short name, cleans up the Amtliche Gemeindeschlüssel
    (``ags``) and stores the body's location either as center (point
    geometry) or as outline (polygon geometry).

    :param lib_object: JSON representation of the body.
    :param body: Object to fill; it is modified in place.
    :return: The same ``body`` instance.
    :raises RuntimeError: If the ags has more than 8 characters and the
        characters after the eighth are not all zeros.
    """
    body.short_name = self.utils.normalize_body_name(body.short_name)

    ags = lib_object.get("ags")
    if ags:
        ags = ags.replace(" ", "")
    body.ags = ags

    if len(ags or "") > 8:
        surplus = ags[8:]
        # Special case for https://ris.krefeld.de/webservice/oparl/v1/body/1:
        # a surplus consisting only of zeros is simply cut off
        if surplus != "0" * len(surplus):
            raise RuntimeError(
                "The Amtliche Gemeindeschlüssel of {} is longer than 8 characters: '{}'".format(
                    body, body.ags
                )
            )
        body.ags = ags[:8]

    # We don't really need the location because we have our own outline
    # importing logic and don't need the city, but we import it for comprehensiveness
    location = self.retrieve(Location, lib_object.get("location"), body.oparl_id)
    if not location or not location.geometry:
        return body

    geometry_type = location.geometry["type"]
    if geometry_type == "Point":
        body.center, body.outline = location, None
    elif geometry_type == "Polygon":
        logger.warning("Overriding outline of Body with api version")
        body.center, body.outline = None, location
    else:
        logger.warning(
            "Location object is of type {}, which is neither 'Point' nor 'Polygon'."
            "Skipping this location.".format(geometry_type)
        )
    return body
def person(self, lib_object: JSON, person: Person) -> Person:
    """Populate name fields and location of ``person`` from ``lib_object``.

    Missing parts are derived where possible: the full name is built from
    given and family name, and given/family name are taken from the last
    two space-separated parts of the full name. Whatever is still missing
    afterwards is replaced by the translated string "Unknown".

    :param lib_object: JSON representation of the person.
    :param person: Object to fill; it is modified in place.
    :return: The same ``person`` instance.
    """
    full_name = lib_object.get("name")
    first = lib_object.get("givenName")
    last = lib_object.get("familyName")

    if not full_name:
        if first and last:
            full_name = first + " " + last
        else:
            logger.warning("Person without name: {}".format(lib_object["id"]))
            full_name = _("Unknown")

    if not (first or last) and " " in full_name:
        # Only the last two parts are used, anything before them is ignored
        parts = full_name.split(" ")
        first, last = parts[-2], parts[-1]
        logger.warning("Inferring given and family name from compound name")

    if not first:
        logger.warning("Person without given name: {}".format(lib_object["id"]))
        first = _("Unknown")

    if not last:
        logger.warning("Person without family name: {}".format(lib_object["id"]))
        last = _("Unknown")

    person.name = full_name
    person.given_name = first
    person.family_name = last
    person.location = self.retrieve(
        Location, lib_object.get("location"), person.oparl_id
    )
    return person
def meeting_related(self, libobject: JSON, meeting: Meeting) -> None:
    """Set the related object collections of ``meeting``.

    For each relation the referenced objects are fetched with
    ``self.retrieve_many`` and handed to the relation's ``set()``.

    :param libobject: JSON representation of the meeting.
    :param meeting: Meeting whose relations are replaced.
    """
    relations = (
        (meeting.auxiliary_files, File, "auxiliaryFile"),
        (meeting.persons, Person, "participant"),
        (meeting.organizations, Organization, "organization"),
    )
    for manager, model, json_key in relations:
        manager.set(self.retrieve_many(model, libobject.get(json_key)))
def paper(self, lib_object: JSON, paper: Paper) -> Paper:
    """Populate type, reference number, main file and dates of ``paper``.

    :param lib_object: JSON representation of the paper.
    :param paper: Object to fill; it is modified in place.
    :return: The same ``paper`` instance.
    """
    type_name = lib_object.get("paperType")
    if type_name:
        paper_type, created = PaperType.objects.get_or_create(paper_type=type_name)
        paper.paper_type = paper_type
        if created:
            logging.info(
                "Created new paper type {} through {}".format(
                    paper_type, lib_object["id"]
                )
            )

    paper.reference_number = lib_object.get("reference")
    paper.main_file = self.retrieve(
        File, lib_object.get("mainFile"), paper.oparl_id
    )

    legal_date = self.utils.parse_date(lib_object.get("date"))
    paper.legal_date = legal_date
    # At this point we don't have the agenda items yet. We'll fix up the
    # cases where there are consultations but no legal date later
    paper.display_date = legal_date
    # If we don't have a good date, sort them behind those with a good date
    paper.sort_date = self.utils.date_to_datetime(legal_date) or fallback_date
    return paper
def paper_related(self, libobject: JSON, paper: Paper) -> None:
    """Set the related object collections of ``paper``.

    For each relation the referenced objects are fetched with
    ``self.retrieve_many`` and handed to the relation's ``set()``.

    :param libobject: JSON representation of the paper.
    :param paper: Paper whose relations are replaced.
    """
    relations = (
        (paper.files, File, "auxiliaryFile"),
        (paper.organizations, Organization, "underDirectionOf"),
        (paper.persons, Person, "originatorPerson"),
    )
    for manager, model, json_key in relations:
        manager.set(self.retrieve_many(model, libobject.get(json_key)))
def legislative_term(self, libobject: JSON, term: LegislativeTerm) -> Optional[LegislativeTerm]:
    """Set start and end of ``term`` from ``libobject``.

    :param libobject: JSON representation of the legislative term.
    :param term: Object to fill; it is modified in place.
    :return: ``term``, or ``None`` if the start or the end date is missing
        (an error is logged in that case).
    """
    raw_start = libobject.get("startDate")
    raw_end = libobject.get("endDate")
    if not (raw_start and raw_end):
        logger.error("Term has no start or end date - skipping")
        return None

    term.start = self.utils.parse_date(raw_start)
    term.end = self.utils.parse_date(raw_end)
    return term
def init_base(
    self, lib_object: JSON, base: E, name_fixup: Optional[str] = None
) -> E:
    """Sets common fields

    Copies the id and the deleted flag onto ``base``. For instances of
    ``ShortableNameFields`` the name and short name are set as well.

    :param lib_object: JSON representation of the object; must have an "id" key.
    :param base: Object to fill; it is modified in place.
    :param name_fixup: Name to use when ``lib_object`` has no (non-empty) name.
    :return: The same ``base`` instance.
    :raises RuntimeError: If the id is empty.
    """
    oparl_id = lib_object["id"]
    if not oparl_id:
        raise RuntimeError("id is none: " + str(lib_object))

    base.oparl_id = oparl_id
    base.deleted = bool(lib_object.get("deleted", False))

    if not isinstance(base, ShortableNameFields):
        return base

    base.name = lib_object.get("name") or name_fixup
    # Without an explicit short name the (possibly fixed up) name is used
    base.set_short_name(lib_object.get("shortName") or base.name)
    return base
def visit_object(self, response: JSON):
    """Fix known quirks of File and Body objects in place.

    * File: ``files//rim`` in the access and download url becomes ``files/rim``.
    * Body: a 7 character ags gets a leading zero prepended.

    :param response: The JSON object; it is modified in place.
    """
    oparl_type = response.get("type")

    if oparl_type == "https://schema.oparl.org/1.0/File":
        for url_key in ("accessUrl", "downloadUrl"):
            if url_key in response:
                response[url_key] = response[url_key].replace(
                    r"files//rim", r"files/rim"
                )
    elif oparl_type == "https://schema.oparl.org/1.0/Body":
        # Check for a missing leading zero
        ags = response.get("ags")
        if ags and len(ags) == 7:
            # noinspection PyTypeChecker
            response["ags"] = "0" + ags
def visit(self, data: JSON):
    """
    Removes quirks like `"streetAddress": " "` in Location

    String values that are "N/A", empty or whitespace-only are deleted from
    ``data``; nested objects are cleaned recursively. Lists are left alone.
    """
    # Iterate over a snapshot because keys are deleted while walking
    for key, value in list(data.items()):
        if isinstance(value, dict):
            self.visit(value)
            continue
        if isinstance(value, str) and (value == "N/A" or not value.strip()):
            del data[key]
def organization(self, libobject: JSON, organization: Organization) -> Organization:
    """Populate type, body, dates, location and short name of ``organization``.

    :param libobject: JSON representation of the organization.
    :param organization: Object to fill; it is modified in place.
    :return: The same ``organization`` instance.
    """
    known_types = self.utils.organization_classification

    type_name = libobject.get("organizationType")
    # E.g. Leipzig sets organizationType: "Gremium" and classification: "Fraktion" for factions,
    # so we give priority to classification
    if libobject.get("classification") in known_types:
        type_name = libobject["classification"]

    type_id = known_types.get(type_name)
    if type_id:
        orgtype = OrganizationType.objects.get(id=type_id)
    else:
        # Unknown types are stored under the name given as organizationType
        orgtype, _created = OrganizationType.objects.get_or_create(
            name=libobject.get("organizationType")
        )
    organization.organization_type = orgtype

    # If we really have a case with an extra body then this should error
    # because then we need some extra handling
    organization.body = (
        Body.by_oparl_id(libobject["body"])
        if libobject.get("body")
        else self.default_body
    )

    organization.start = self.utils.parse_date(libobject.get("startDate"))
    organization.end = self.utils.parse_date(libobject.get("endDate"))
    organization.location = self.retrieve(Location, libobject.get("location"))

    # When no distinct short name was given, strip the type name (with one
    # adjacent dash/space) from it, ignoring case
    if organization.name == organization.short_name and type_name:
        type_pattern = "[- ]?" + re.escape(type_name) + "[ ]?"
        organization.short_name = re.sub(
            type_pattern, "", organization.short_name, flags=re.IGNORECASE
        )
    return organization
def externalize(libobject: JSON, key_callback: Optional[Set[str]] = None) -> List[CachedObject]:
    """Converts an oparl object with embedded objects to multiple flat json objects

    Embedded objects (single or in a list) are replaced by their id inside
    ``libobject`` and returned as separate entries, each tagged with a
    "mst:backref" to its parent. Embedded objects without an id are dropped
    with a warning.

    :param libobject: The object to flatten; it is modified in place.
    :param key_callback: If a set is given, the keys under which embedded
        objects were found are added to it.
    :return: The flattened embedded objects followed by ``libobject`` itself.
    """
    flat_objects = []
    record_keys = isinstance(key_callback, set)

    # sorted() copies the keys, so deleting entries below is safe
    for key in sorted(libobject):
        # Skip the geojson object
        if key == "geojson":
            continue

        value = libobject[key]

        if isinstance(value, dict):
            if "id" not in value:
                logger.warning(
                    f"Embedded object '{key}' in {libobject['id']} does not have an id, skipping: {value}"
                )
                del libobject[key]
                continue

            if record_keys:
                key_callback.add(key)
            value["mst:backref"] = libobject["id"]
            flat_objects.extend(externalize(value))
            libobject[key] = value["id"]

        elif isinstance(value, list) and value and isinstance(value[0], dict):
            if record_keys:
                key_callback.add(key)
            for position, child in enumerate(value):
                if "id" not in child:
                    logger.warning(
                        f"Embedded object '{key}' in {libobject['id']} does not have an id, skipping: {child}"
                    )
                    # The whole list is dropped, not only this child
                    del libobject[key]
                    break
                child["mst:backref"] = libobject["id"]
                # We need this for agenda items
                child["mst:backrefPosition"] = position
                flat_objects.extend(externalize(child))
                value[position] = child["id"]

    flat_objects.append(
        CachedObject(
            url=libobject["id"],
            data=libobject,
            oparl_type=libobject["type"].split("/")[-1],
        )
    )
    return flat_objects
def location(self, libobject: JSON, location: Location) -> Location:
    """Populate ``location`` from ``libobject``.

    Copies description, geometry and the address fields. If there is no
    description, one is assembled from room, street address and locality.
    If there is a street address but no geometry, the address is geocoded.

    :param libobject: JSON representation of the location.
    :param location: Object to fill; it is modified in place.
    :return: The same ``location`` instance.
    """
    location.description = libobject.get("description")
    location.is_official = self.utils.official_geojson
    # "geojson" may be missing or explicitly null
    location.geometry = (libobject.get("geojson") or {}).get("geometry")
    location.street_address = libobject.get("streetAddress")
    location.room = libobject.get("room")
    location.postal_code = libobject.get("postalCode")
    location.locality = libobject.get("locality")

    # "<postal code> <locality>" or just the locality; a postal code
    # without locality is not used
    if location.locality and location.postal_code:
        place = location.postal_code + " " + location.locality
    else:
        place = location.locality

    if not location.description:
        # Join only the parts that exist, so there is no dangling separator
        parts = [location.room, location.street_address, place]
        location.description = ", ".join(part for part in parts if part)

    # If a street_address is present, we try to find the exact location on the map
    if location.street_address and not location.geometry:
        search_str = location.street_address + ", "
        if place:
            search_str += place
        elif self.default_body:
            # Without a locality, fall back to the name of the default body
            search_str += self.default_body.short_name
        search_str += " " + settings.GEOEXTRACT_SEARCH_COUNTRY
        location.geometry = geocode(search_str)

    return location
def meeting(self, libobject: JSON, meeting: Meeting) -> Meeting:
    """Populate times, location, files and cancelled flag of ``meeting``.

    :param libobject: JSON representation of the meeting.
    :param meeting: Object to fill; it is modified in place.
    :return: The same ``meeting`` instance.
    """
    parse_datetime = self.utils.parse_datetime
    meeting.start = parse_datetime(libobject.get("start"))
    meeting.end = parse_datetime(libobject.get("end"))

    meeting.location = self.retrieve(Location, libobject.get("location"))

    file_fields = (
        ("invitation", "invitation"),
        ("verbatim_protocol", "verbatimProtocol"),
        ("results_protocol", "resultsProtocol"),
    )
    for attribute, json_key in file_fields:
        setattr(meeting, attribute, self.retrieve(File, libobject.get(json_key)))

    meeting.cancelled = libobject.get("cancelled", False)
    return meeting
def get_ags(self, body: Body, system: JSON, userinput: str) -> Tuple[str, str]:
    """
    Determines the Amtliche Gemeindeschlüssel (ags) of a body.

    The ags field of the body is used if it has 5 or 8 characters.
    Otherwise ``city_to_ags`` is asked in this order with
        a) the body's short name
        b) the user's input
        c) the body's full name
        d) the system's name (normalized)
        e) locality in the location

    Returns the ags and the name that did match.
    Raises a RuntimeError if none of the candidates yields an ags.
    """
    ags = body.ags
    if ags:
        if len(ags) in (5, 8):
            return ags, body.short_name
        logger.error("Ignoring ags '{}' with invalid legth {}".format(
            ags, len(ags)))

    is_district = bool(
        re.match(settings.DISTRICT_REGEX, body.name, re.IGNORECASE))

    candidates = [
        ("body short name", body.short_name),
        ("user input", userinput),
        ("body name", body.name),
    ]

    system_name = system.get("name")
    if system_name:
        candidates.append(
            ("system name", self.utils.normalize_body_name(system_name)))

    center = body.center
    if center and center.locality:
        candidates.append(("body location locality", center.locality))

    for source, value in candidates:
        found = city_to_ags(value, is_district)
        if found:
            logger.debug("Found ags using the {}: '{}'".format(
                source, value))
            return found, value

    raise RuntimeError(
        "Could not determine the Amtliche Gemeindeschlüssel using {}".
        format(candidates))
def consultation(self, libobject: JSON, consultation: Consultation) -> Consultation:
    """Populate ``consultation`` from ``libobject``.

    The paper is taken from the "paper" key, falling back to "mst:backref"
    when the consultation does not name its paper itself.

    :param libobject: JSON representation of the consultation.
    :param consultation: Object to fill; it is modified in place.
    :return: The same ``consultation`` instance.
    """
    consultation.authoritative = libobject.get("authoritative")
    consultation.role = libobject.get("role")

    paper_backref = libobject.get("paper") or libobject.get("mst:backref")
    consultation.paper = self.retrieve(Paper, paper_backref)
    consultation.meeting = self.retrieve(Meeting, libobject.get("meeting"))
    return consultation
def visit(self, data: JSON):
    """Removes quirks like `"streetAddress": " "` in Location

    String values that are "N/A", empty or whitespace-only are deleted from
    ``data``. Nested objects, including those inside lists, are cleaned
    recursively. A single object under "auxiliaryFile" is wrapped in a list.
    """
    # `"auxiliaryFile": { ... }` -> `"auxiliaryFile": [{ ... }]`
    if isinstance(data.get("auxiliaryFile"), dict):
        logger.warning(
            f"auxiliaryFile is supposed to be an array of objects, "
            f"but is an object (in {data.get('id')})")
        data["auxiliaryFile"] = [data["auxiliaryFile"]]

    # Iterate over a snapshot because keys are deleted while walking
    for key, value in list(data.items()):
        if isinstance(value, str):
            if value == "N/A" or not value.strip():
                del data[key]
        elif isinstance(value, dict):
            self.visit(value)
        elif isinstance(value, list):
            for element in value:
                if isinstance(element, dict):
                    self.visit(element)
def membership(self, lib_object: JSON, membership: Membership) -> Membership:
    """Populate dates, role, person and organization of ``membership``.

    A missing role becomes the translated string "Unknown". The person is
    taken from the "person" key, falling back to "mst:backref".

    :param lib_object: JSON representation of the membership.
    :param membership: Object to fill; it is modified in place.
    :return: The same ``membership`` instance.
    """
    parse_date = self.utils.parse_date
    membership.start = parse_date(lib_object.get("startDate"))
    membership.end = parse_date(lib_object.get("endDate"))
    membership.role = lib_object.get("role") or _("Unknown")

    person_ref = lib_object.get("person") or lib_object.get("mst:backref")
    membership.person = self.retrieve(Person, person_ref, membership.oparl_id)
    membership.organization = self.retrieve(
        Organization, lib_object.get("organization"), membership.oparl_id
    )
    return membership
def externalize(libobject: JSON, key_callback: Optional[Set[str]] = None) -> List[CachedObject]:
    """
    Converts an oparl object with embedded objects to multiple flat json objects

    Embedded objects (single or in a list) are replaced by their id inside
    ``libobject`` and returned as separate entries, each tagged with a
    "mst:backref" to its parent. Embedded objects without an id cannot be
    referenced and are dropped with a warning instead of raising a KeyError.

    :param libobject: The object to flatten; it is modified in place.
    :param key_callback: If a set is given, the keys under which embedded
        objects were found are added to it.
    :return: The flattened embedded objects followed by ``libobject`` itself.
    """
    externalized = []
    record_keys = isinstance(key_callback, set)

    # Iterate over a snapshot of the keys (in their original order),
    # because keys of id-less embedded objects are deleted below
    for key in list(libobject):
        # Skip the geojson object
        if key == "geojson":
            continue

        entry = libobject[key]

        if isinstance(entry, dict):
            if "id" not in entry:
                logger.warning(
                    f"Embedded object '{key}' in {libobject['id']} does not have an id, skipping: {entry}"
                )
                del libobject[key]
                continue

            if record_keys:
                key_callback.add(key)
            entry["mst:backref"] = libobject["id"]
            externalized += externalize(entry)
            libobject[key] = entry["id"]

        elif isinstance(entry, list) and entry and isinstance(entry[0], dict):
            if record_keys:
                key_callback.add(key)
            for pos, child in enumerate(entry):
                if "id" not in child:
                    logger.warning(
                        f"Embedded object '{key}' in {libobject['id']} does not have an id, skipping: {child}"
                    )
                    # The whole list is dropped, not only this child
                    del libobject[key]
                    break
                child["mst:backref"] = libobject["id"]
                # We need this for agenda items
                child["mst:backrefPosition"] = pos
                externalized += externalize(child)
                entry[pos] = child["id"]

    externalized.append(
        CachedObject(
            url=libobject["id"],
            data=libobject,
            oparl_type=libobject["type"].split("/")[-1],
        ))
    return externalized
def paper(self, libobject: JSON, paper: Paper) -> Paper:
    """Populate type, dates, reference number and main file of ``paper``.

    The sort date is the legal date if there is one, otherwise the creation
    date, otherwise the current time.

    :param libobject: JSON representation of the paper.
    :param paper: Object to fill; it is modified in place.
    :return: The same ``paper`` instance.
    """
    type_name = libobject.get("paperType")
    if type_name:
        # The lookup must be by the type's name. With only ``defaults``
        # given, get_or_create matches any existing row (and fails once
        # there is more than one).
        paper_type, created = PaperType.objects.get_or_create(
            paper_type=type_name)
        paper.paper_type = paper_type
        if created:
            logging.info("Created new paper type {} through {}".format(
                paper_type, libobject["id"]))

    paper.legal_date = self.utils.parse_date(libobject.get("date"))
    paper.sort_date = (self.utils.date_to_datetime(paper.legal_date)
                       or self.utils.parse_datetime(
                           libobject.get("created")) or timezone.now())

    paper.reference_number = libobject.get("reference")
    paper.main_file = self.retrieve(File, libobject.get("mainFile"))
    return paper
def body_related(self, lib_object: JSON, body: Body) -> None:
    """Set the legislative terms of ``body``.

    The terms referenced under "legislativeTerm" are fetched with
    ``self.retrieve_many`` and handed to ``body.legislative_terms.set()``.

    :param lib_object: JSON representation of the body; must have an "id" key.
    :param body: Body whose legislative terms are replaced.
    """
    terms = self.retrieve_many(
        LegislativeTerm, lib_object.get("legislativeTerm"), lib_object["id"]
    )
    body.legislative_terms.set(terms)
def agenda_item(self, libobject: JSON, item: AgendaItem) -> AgendaItem:
    """Populate ``item`` from ``libobject``.

    A missing number becomes the key "-". The meeting is taken from the
    "meeting" key, falling back to "mst:backref"; the position comes from
    "mst:backrefPosition".

    :param libobject: JSON representation of the agenda item.
    :param item: Object to fill; it is modified in place.
    :return: The same ``item`` instance.
    """
    item.key = libobject.get("number") or "-"

    plain_fields = (
        ("name", "name"),
        ("public", "public"),
        ("result", "result"),
        ("resolution_text", "resolutionText"),
    )
    for attribute, json_key in plain_fields:
        setattr(item, attribute, libobject.get(json_key))

    parse_datetime = self.utils.parse_datetime
    item.start = parse_datetime(libobject.get("start"))
    item.end = parse_datetime(libobject.get("end"))

    meeting_ref = libobject.get("meeting") or libobject.get("mst:backref")
    item.meeting = self.retrieve(Meeting, meeting_ref)
    item.position = libobject.get("mst:backrefPosition")

    item.consultation = self.retrieve(
        Consultation, libobject.get("consultation"))
    item.resolution_file = self.retrieve(
        File, libobject.get("resolutionFile"))
    return item
def agenda_item_related(self, lib_object: JSON, item: AgendaItem) -> None:
    """Set the auxiliary files of ``item``.

    The files referenced under "auxiliaryFile" are fetched with
    ``self.retrieve_many`` and handed to ``item.auxiliary_file.set()``.

    :param lib_object: JSON representation of the agenda item; must have an "id" key.
    :param item: Agenda item whose auxiliary files are replaced.
    """
    aux_files = self.retrieve_many(
        File, lib_object.get("auxiliaryFile"), lib_object["id"]
    )
    item.auxiliary_file.set(aux_files)
def agenda_item(self, lib_object: JSON, item: AgendaItem) -> AgendaItem:
    """Populate ``item`` from ``lib_object``.

    A missing number becomes the key "-"; keys longer than 20 characters
    are cut off with a warning. The meeting is taken from the "meeting"
    key, falling back to "mst:backref"; the position comes from
    "mst:backrefPosition".

    :param lib_object: JSON representation of the agenda item.
    :param item: Object to fill; it is modified in place.
    :return: The same ``item`` instance.
    """
    key = lib_object.get("number") or "-"
    if len(key) > 20:
        logger.warning(
            f"Overly long AgendaItem key, limiting to 20 character: {key}"
        )
        key = key[:20]
    item.key = key

    plain_fields = (
        ("name", "name"),
        ("public", "public"),
        ("result", "result"),
        ("resolution_text", "resolutionText"),
    )
    for attribute, json_key in plain_fields:
        setattr(item, attribute, lib_object.get(json_key))

    parse_datetime = self.utils.parse_datetime
    item.start = parse_datetime(lib_object.get("start"))
    item.end = parse_datetime(lib_object.get("end"))

    meeting_ref = lib_object.get("meeting") or lib_object.get("mst:backref")
    item.meeting = self.retrieve(Meeting, meeting_ref, item.oparl_id)
    item.position = lib_object.get("mst:backrefPosition")

    item.consultation = self.retrieve(
        Consultation, lib_object.get("consultation"), item.oparl_id
    )
    item.resolution_file = self.retrieve(
        File, lib_object.get("resolutionFile"), item.oparl_id
    )
    return item
def file(self, lib_object: JSON, file: File) -> File:
    """Populate ``file`` from ``lib_object``.

    The filename is the given "fileName" if present, otherwise the
    slugified name with a pdf extension, otherwise the slugified last path
    segment of the access url. Display names longer than 200 characters
    are shortened and end with an ellipsis.

    :param lib_object: JSON representation of the file.
    :param file: Object to fill; it is modified in place.
    :return: The same ``file`` instance.
    :raises KeyError: If neither "fileName" nor "name" nor "accessUrl" is
        available to derive a filename from.
    """
    cutoff = self.utils.filename_length_cutoff
    # "name" may be missing or explicitly null
    name = lib_object.get("name") or ""

    if lib_object.get("fileName"):
        filename = lib_object["fileName"]
    elif name:
        # The extension is always the one for pdf, independent of "mimeType"
        extension = mimetypes.guess_extension("application/pdf") or ""
        filename = slugify(name)[: cutoff - len(extension)] + extension
    else:
        access_url = lib_object["accessUrl"]
        filename = slugify(access_url.split("/")[-1])[-cutoff:]

    if len(name) > 200:
        # The first wrapped line has at most 199 characters, leaving room
        # for the ellipsis. Whitespace-only names wrap to nothing.
        wrapped = textwrap.wrap(name, 199)
        name = (wrapped[0] + "\u2026") if wrapped else ""

    file.name = name
    file.filename = filename
    file.mime_type = lib_object.get("mimeType") or "application/octet-stream"
    file.legal_date = self.utils.parse_date(lib_object.get("date"))
    file.sort_date = (
        self.utils.date_to_datetime(file.legal_date)
        or self.utils.parse_datetime(lib_object.get("created"))
        or timezone.now()
    )

    file.oparl_access_url = lib_object.get("accessUrl")
    file.oparl_download_url = lib_object.get("downloadUrl")
    file.filesize = None
    file.parsed_text = lib_object.get("text")
    file.license = lib_object.get("fileLicense")

    # We current do not handle locations attached to files due
    # to the lack of data and our own location extraction
    return file