def status() -> None:
    """Print per-state indexing status: bill count, % missing search, % errored."""
    init_django()
    from openstates.data.models import Bill

    states = sorted(CONVERSION_FUNCTIONS.keys())
    click.secho("state | bills | missing | errors ", fg="white")
    click.secho("===================================", fg="white")
    for state in states:
        all_bills = Bill.objects.filter(
            legislative_session__jurisdiction_id=abbr_to_jid(state)
        )
        missing_search = all_bills.filter(searchable__isnull=True).count()
        errors = all_bills.filter(searchable__is_error=True).count()
        all_bills = all_bills.count()
        errcolor = mscolor = "green"
        if missing_search > 0:
            # convert the raw count to a rounded-up percentage for display
            missing_search = math.ceil(missing_search / all_bills * 100)
            mscolor = "yellow"
        if missing_search > 1:
            mscolor = "red"
        if errors > 0:
            errcolor = "yellow"
            errors = math.ceil(errors / all_bills * 100)
        if errors > 5:
            errcolor = "red"
        click.echo(
            f"{state:5} | {all_bills:7} | "
            + click.style(f"{missing_search:6}%", fg=mscolor)
            + " | "
            + click.style(f"{errors:6}%", fg=errcolor)
        )

def to_csv(abbreviations: list[str], upload: bool) -> None:
    """
    Generate CSV files for YAML and optionally sync to S3.
    """
    if not abbreviations:
        abbreviations = get_all_abbreviations()

    if upload:
        s3 = boto3.client("s3")

    for abbr in abbreviations:
        click.secho("==== {} ====".format(abbr), bold=True)
        jurisdiction_id = abbr_to_jid(abbr)
        directory = get_data_path(abbr)
        person_files = sorted((directory / "legislature").glob("*.yml"))
        fname = f"{abbr}.csv"
        write_csv(person_files, jurisdiction_id, fname)

        if upload:
            s3.upload_file(
                fname,
                "data.openstates.org",
                f"people/current/{abbr}.csv",
                ExtraArgs={"ContentType": "text/csv", "ACL": "public-read"},
            )
            click.secho(
                f"uploaded to data.openstates.org/people/current/{abbr}.csv",
                fg="green",
            )

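# Example usage (a sketch: assumes the repo's data directories exist, and that
# AWS credentials are configured when uploading):
#
#   to_csv(["nc", "va"], upload=False)  # regenerate nc.csv and va.csv locally
#   to_csv([], upload=True)             # all states, synced to S3
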
def main(input_dir: str) -> None:
    """
    Convert scraped JSON in INPUT_DIR to YAML files for this repo.

    Will put data into incoming/ directory for usage with merge.py's
    --incoming option.
    """
    # abbr is last piece of directory name
    abbr = ""
    for piece in input_dir.split("/")[::-1]:
        if piece:
            abbr = piece
            break

    jurisdiction_id = abbr_to_jid(abbr)
    output_dir = get_data_path(abbr)
    output_dir = Path(str(output_dir).replace("data", "incoming")) / "legislature"
    assert "incoming" in str(output_dir)

    try:
        output_dir.mkdir()
    except FileExistsError:
        for file in output_dir.glob("*.yml"):
            file.unlink()

    process_dir(Path(input_dir), output_dir, jurisdiction_id)

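# The loop above takes the abbreviation from the last non-empty path piece, so
# trailing slashes are harmless (a minimal illustration, not part of the
# original module):
#
#   "scraped/nc".split("/")[::-1]   -> ["nc", "scraped"]      -> abbr = "nc"
#   "scraped/nc/".split("/")[::-1]  -> ["", "nc", "scraped"]  -> abbr = "nc"
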
def reindex_state(state: str, session: str = None) -> None:
    init_django()
    from openstates.data.models import SearchableBill

    if session:
        bills = SearchableBill.objects.filter(
            bill__legislative_session__jurisdiction_id=abbr_to_jid(state),
            bill__legislative_session__identifier=session,
        )
    else:
        bills = SearchableBill.objects.filter(
            bill__legislative_session__jurisdiction_id=abbr_to_jid(state)
        )
    ids = list(bills.values_list("id", flat=True))
    print(f"reindexing {len(ids)} bills for {state}")
    reindex(ids)

def to_dict(self):
    party = PARTIES.get(self.party.lower(), self.party)
    d = OrderedDict(
        {
            "id": f"ocd-person/{uuid.uuid4()}",
            "name": str(self.name),
            "party": [{"name": party}],
            "roles": [
                {
                    "district": self.district,
                    "type": self.chamber,
                    "jurisdiction": abbr_to_jid(self.state),
                }
            ],
            "links": self.links,
            "sources": self.sources,
        }
    )
    if self.given_name:
        d["given_name"] = str(self.given_name)
    if self.family_name:
        d["family_name"] = str(self.family_name)
    if self.suffix:
        d["suffix"] = str(self.suffix)
    if self.image:
        d["image"] = str(self.image)
    if self.email:
        d["email"] = str(self.email)
    if self.ids:
        d["ids"] = self.ids
    if self.extras:
        d["extras"] = self.extras

    # contact details
    d["contact_details"] = []
    if self.district_office.to_dict():
        d["contact_details"].append(self.district_office.to_dict())
    if self.capitol_office.to_dict():
        d["contact_details"].append(self.capitol_office.to_dict())
    return d

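# For reference, to_dict() yields YAML-ready data shaped roughly like this
# (illustrative values; the id is a fresh UUID on every call):
#
#   {
#       "id": "ocd-person/0a1b...",
#       "name": "Jane Doe",
#       "party": [{"name": "Democratic"}],
#       "roles": [{"district": "4", "type": "upper", "jurisdiction": "ocd-jurisdiction/..."}],
#       "links": [...],
#       "sources": [...],
#       "contact_details": [...],
#   }
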
def _resample(state: str, n: int = 50) -> None:
    """
    Grab new versions for a state from the database.
    """
    init_django()
    from openstates.data.models import BillVersion

    versions = BillVersion.objects.filter(
        bill__legislative_session__jurisdiction_id=abbr_to_jid(state)
    ).order_by("?")[:n]

    count = 0
    fieldnames = [
        "id",
        "session",
        "identifier",
        "title",
        "jurisdiction_id",
        "media_type",
        "url",
        "note",
    ]
    with open(get_raw_dir() / f"{state}.csv", "w") as outf:
        out = csv.DictWriter(outf, fieldnames=fieldnames)
        out.writeheader()
        for v in versions:
            for link in v.links.all():
                out.writerow(
                    {
                        "id": v.id,
                        "session": v.bill.legislative_session.identifier,
                        "jurisdiction_id": v.bill.legislative_session.jurisdiction_id,
                        "identifier": v.bill.identifier,
                        "title": v.bill.title,
                        "url": link.url,
                        "media_type": link.media_type,
                        "note": v.note,
                    }
                )
                count += 1
    click.secho(f"wrote new sample csv with {count} records")

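# Usage sketch: _resample("nc") pulls 50 random bill versions for North
# Carolina and rewrites nc.csv in the raw data directory. order_by("?")
# delegates the shuffle to the database, which keeps the sample unbiased but
# can be slow on large tables; that tradeoff is fine at sample sizes like the
# default n=50.
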
def create_person(
    fname: str,
    lname: str,
    name: str,
    state: str,
    district: str,
    party: str,
    rtype: str,
    url: str,
    image: str,
    email: str,
    start_date: str,
) -> None:
    role = Role(
        type=rtype,
        district=district,
        jurisdiction=abbr_to_jid(state),
        start_date=start_date,
    )
    if rtype in ("upper", "lower", "legislature"):
        directory = "legislature"
    elif rtype in ("mayor",):
        directory = "municipalities"
    elif rtype in ("governor", "lt_governor"):
        directory = "executive"
    else:
        # fail fast rather than hit an unbound `directory` below
        raise ValueError(f"unknown role type: {rtype}")
    person = Person(
        id=ocd_uuid("person"),
        name=name or f"{fname} {lname}",
        given_name=fname,
        family_name=lname,
        image=image,
        email=email,
        party=[Party(name=party)],
        roles=[role],
        links=[Link(url=url)],
        sources=[Link(url=url)],
    )
    output_dir = get_data_path(state) / directory
    dump_obj(person.dict(exclude_defaults=True), output_dir=output_dir)

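# Example invocation (a sketch; all values are illustrative, not real data):
#
#   create_person(
#       fname="Jane", lname="Doe", name="", state="nc", district="4",
#       party="Democratic", rtype="upper", url="https://example.com/janedoe",
#       image="", email="", start_date="2021-01-01",
#   )
#
# writes a new YAML file with a fresh ocd-person id under the state's
# legislature data directory.
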
def update(
    state: str, n: int, clear_errors: bool, checkpoint: int, session: str = None
) -> None:
    """Build search data for bills that are missing it, reindexing in chunks."""
    init_django()
    from openstates.data.models import Bill, SearchableBill

    # print status five times per checkpoint (// avoids a fractional modulus)
    status_num = checkpoint // 5 or 1

    if state == "all":
        all_bills = Bill.objects.all()
    elif session:
        all_bills = Bill.objects.filter(
            legislative_session__jurisdiction_id=abbr_to_jid(state),
            legislative_session__identifier=session,
        )
    else:
        all_bills = Bill.objects.filter(
            legislative_session__jurisdiction_id=abbr_to_jid(state)
        )

    if clear_errors:
        if state == "all":
            print("--clear-errors only works with specific states, not all")
            return
        errs = SearchableBill.objects.filter(bill__in=all_bills, is_error=True)
        print(f"clearing {len(errs)} errors")
        errs.delete()

    missing_search = all_bills.filter(searchable__isnull=True)

    if state == "all":
        MAX_UPDATE = 1000
        aggregates = missing_search.values(
            "legislative_session__jurisdiction__name"
        ).annotate(count=Count("id"))
        for agg in aggregates:
            state_name = agg["legislative_session__jurisdiction__name"]
            if agg["count"] > MAX_UPDATE:
                click.secho(
                    f"Too many bills to update for {state_name}: {agg['count']}, skipping",
                    fg="red",
                )
                missing_search = missing_search.exclude(
                    legislative_session__jurisdiction__name=state_name
                )
        print(f"{len(missing_search)} missing, updating")
    else:
        print(
            f"{state}: {len(all_bills)} bills, {len(missing_search)} without search results"
        )

    if n:
        missing_search = missing_search[: int(n)]
    else:
        n = len(missing_search)

    ids_to_update = []
    updated_count = 0

    # going to manage our own transactions here so we can save in chunks
    transaction.set_autocommit(False)
    for b in missing_search:
        ids_to_update.append(update_bill(b))
        updated_count += 1
        if updated_count % status_num == 0:
            print(f"{state}: updated {updated_count} out of {n}")
        if updated_count % checkpoint == 0:
            reindex(ids_to_update)
            transaction.commit()
            ids_to_update = []

    # be sure to reindex the final set
    reindex(ids_to_update)
    transaction.commit()
    transaction.set_autocommit(True)

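# The chunked-commit pattern used by update(), in isolation. A minimal sketch,
# not part of the original module; queryset, do_work, and chunk_size are
# hypothetical stand-ins.
def _chunked_commit_example(queryset, do_work, chunk_size: int = 100) -> None:
    from django.db import transaction

    transaction.set_autocommit(False)
    try:
        for i, item in enumerate(queryset, start=1):
            do_work(item)
            if i % chunk_size == 0:
                transaction.commit()  # persist each completed batch
        transaction.commit()  # commit the final partial batch
    finally:
        transaction.set_autocommit(True)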