def test_person_add_party():
    """Adding a party records a related object whose organization pseudo-id names it."""
    person = Person("Groot")
    person.add_party("Green")

    # add_party is expected to have appended exactly one related object.
    related = person._related[0]
    related.validate()

    expected = {"name": "Green", "classification": "party"}
    assert get_pseudo_id(related.organization_id) == expected
예제 #2
0
    def scrape(self):
        """Yield committees and legislators parsed from the legislator CSV.

        Downloads the legislator database spreadsheet, checks its header row
        against ``HEADERS``, and builds one ``Person`` per row with party,
        link, contact details and committee memberships.  Each committee
        ``Organization`` is yielded once, the first time its name is seen,
        and always before the legislator row that referenced it.

        Raises:
            AssertionError: if the header row differs from ``HEADERS`` or a
                district is not purely numeric.
            KeyError: if a row's office code is neither "H" nor "S".
            ValueError: if an email value is non-empty, has no ``@`` and is
                not an http(s) URL.
        """
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = self.get(leg_url)

        # Committees already created, keyed by name, so each is yielded once.
        committees = {}

        # Ensure that the spreadsheet's structure hasn't generally changed
        _row_headers = page.text.split("\r\n")[0].replace('"', "").split(",")
        assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

        rows = open_csv(page)
        for row in rows:
            chamber = {"H": "lower", "S": "upper"}[row["office code"]]

            district = row["dist"].lstrip("0")
            assert district.isdigit(), "Invalid district found: {}".format(
                district)

            # Full name: first [middle initial] last [suffix]
            name = row["first name"]
            mid = row["middle initial"].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row["last name"]
            suffix = row["suffix"].strip()
            if suffix:
                name += " %s" % suffix

            party = row["party"]
            if party == "Democrat":
                party = "Democratic"

            leg = Person(primary_org=chamber,
                         name=name,
                         district=district,
                         party=party)

            legislator_url = row["URL"].replace("\\", "//").strip()
            if legislator_url:
                if not legislator_url.startswith("http"):
                    # Prepend the scheme; previously the whole URL was
                    # overwritten with the bare string "http://".
                    legislator_url = "http://" + legislator_url
                leg.add_link(legislator_url)

            leg.add_party(party=party)

            office_address = "%s\nRoom %s\nHartford, CT 06106" % (
                row["capitol street address"],
                row["room number"],
            )

            email = row["email"].strip()
            if "@" not in email:
                # An empty value or a web URL in the email column is not an
                # email address, so it is dropped; anything else is
                # unexpected data.
                if not email or email.startswith(("http://", "https://")):
                    email = None
                else:
                    raise ValueError(
                        "Problematic email found: {}".format(email))

            leg.add_contact_detail(type="address",
                                   value=office_address,
                                   note="Capitol Office")
            leg.add_contact_detail(type="voice",
                                   value=row["capitol phone"],
                                   note="Capitol Office")
            if email:
                leg.add_contact_detail(type="email", value=email)

            home_address = "{}\n{}, {} {}".format(
                row["home street address"],
                row["home city"],
                row["home state"],
                row["home zip code"],
            )
            # A "home" address naming the Legislative Office Building is not
            # recorded as a district office.
            if "Legislative Office Building" not in home_address:
                leg.add_contact_detail(type="address",
                                       value=home_address,
                                       note="District Office")
                if row["home phone"].strip():
                    leg.add_contact_detail(type="voice",
                                           value=row["home phone"],
                                           note="District Office")
            leg.add_source(leg_url)

            # Entries are ";"-separated, each optionally "Name (Role)".
            for comm_name in row["committee member1"].split(";"):
                if " (" in comm_name:
                    comm_name, role = comm_name.split(" (")
                    role = role.strip(")").lower()
                else:
                    role = "member"
                comm_name = comm_name.strip()
                if not comm_name:
                    continue

                com = committees.get(comm_name)
                if com is None:
                    com = Organization(comm_name,
                                       classification="committee",
                                       chamber=chamber)
                    com.add_source(leg_url)
                    committees[comm_name] = com
                    yield com

                leg.add_membership(name_or_org=com, role=role)

            yield leg
예제 #3
0
    def scrape_chamber(self, chamber):
        """Yield one ``Person`` per seated member listed on the chamber roster.

        Args:
            chamber: "lower" or "upper"; selects the roster's body code.

        Each roster row is expected to hold exactly six cells (name,
        district, party, email, room, phone).  Rows whose email cell has the
        ``vacantmember`` class are skipped.  The member's own page is fetched
        only to look up a photo URL.

        Raises:
            KeyError: if ``chamber`` is neither "lower" nor "upper".
        """
        body_code = {"lower": "H", "upper": "S"}[chamber]
        url = "http://www.azleg.gov/MemberRoster/?body=" + body_code

        # there is a bad comment closing tag on this page
        roster_html = self.get(url).text.replace("--!>", "-->")
        root = html.fromstring(roster_html)

        # Drop the first table row (presumably the header row -- confirm).
        for row in root.xpath("//table//tr")[1:]:
            (name_cell, district_cell, party_cell,
             email_cell, room_cell, phone_cell) = row.xpath("td")

            # Skip any vacant members.
            if email_cell.attrib.get("class") == "vacantmember":
                continue

            link = name_cell.xpath("string(a/@href)")

            # A name cell with a single child element carries only the name;
            # otherwise the first child holds the name and the cell's tail
            # text is used as the position.
            if len(name_cell) == 1:
                position = ""
                name = name_cell.text_content().strip()
            else:
                position = name_cell.tail.strip()
                name = name_cell[0].text_content().strip()
            # Keep only the text before any "--" in the name.
            name = name.partition("--")[0].strip()

            member_html = self.get(link).text.replace("--!>", "-->")
            member_root = html.fromstring(member_html)
            member_root.make_links_absolute(link)

            photo_nodes = member_root.xpath(
                "//img[contains(@src, 'MemberPhoto')]")
            if len(photo_nodes) == 1:
                photo_url = photo_nodes[0].attrib["src"]
            else:
                self.warning("no photo on " + link)
                photo_url = ""

            district = district_cell.text_content().strip()
            party_text = party_cell.text_content().strip()
            email_text = email_cell.text_content().strip()

            # The cell holds only the mailbox name after an "Email: " label;
            # the azleg.gov domain is appended here.
            email = ""
            if email_text.startswith("Email: "):
                email = email_text.replace("Email: ", "").lower() + "@azleg.gov"

            party = self.get_party(party_text)
            room = room_cell.text_content().strip()

            building = (
                "House of Representatives\n" if chamber == "lower" else "Senate\n"
            )
            address = "".join(
                [building, "1700 West Washington\n Room ", room, "\nPhoenix, AZ 85007"]
            )

            phone = phone_cell.text_content().strip()
            # Prefix the 602 area code unless one digit group is already "602".
            digit_groups = re.findall(r"(\d+)", phone)
            if "602" not in digit_groups:
                phone = "602-" + phone

            leg = Person(
                primary_org=chamber,
                image=photo_url,
                name=name,
                district=district,
                party=party,
            )
            leg.add_contact_detail(type="address", value=address, note="Capitol Office")
            leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
            leg.add_party(party=party)
            leg.add_link(link)

            if email:
                leg.add_contact_detail(type="email", value=email)
            if position:
                leg.add_membership(name_or_org=party, role=position)

            leg.add_source(url)

            # Probably just get this from the committee scraper
            yield leg