def test_person_add_membership_name():
    """Adding a membership by organization *name* should record a pseudo-id
    for the organization and copy role/start_date onto the membership."""
    person = Person("Leonardo DiCaprio")
    person.add_membership(
        "Academy of Motion Picture Arts and Sciences",
        role="winner",
        start_date="2016",
    )

    membership = person._related[0]
    membership.validate()

    # The org was given by name, so its id is a pseudo-id keyed on that name.
    assert get_pseudo_id(membership.organization_id) == {
        "name": "Academy of Motion Picture Arts and Sciences"
    }
    assert membership.person_id == person._id
    assert membership.role == "winner"
    assert membership.start_date == "2016"
def test_person_add_membership_org():
    """Adding a membership with a concrete Organization should link the two
    objects by their real ids and preserve the date range."""
    person = Person("Bob B. Bear")
    person.add_source("http://example.com")
    org = Organization("test org", classification="unknown")
    person.add_membership(
        org,
        role="member",
        start_date="2007",
        end_date=datetime.date(2015, 5, 8),
    )

    assert len(person._related) == 1
    membership = person._related[0]
    membership.validate()

    assert membership.person_id == person._id
    assert membership.organization_id == org._id
    assert membership.start_date == "2007"
    assert membership.end_date == datetime.date(2015, 5, 8)
def scrape(self):
    """Scrape Connecticut legislators from the state's public CSV roster.

    Yields each committee Organization once (the first time a member
    references it), then a Person for every legislator row.

    Raises:
        AssertionError: if the spreadsheet headers or a district value
            no longer match the expected format.
        ValueError: if an email field contains something that is neither
            empty, a URL, nor an address containing '@'.
    """
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = self.get(leg_url)
    committees = {}

    # Ensure that the spreadsheet's structure hasn't generally changed.
    _row_headers = page.text.split("\r\n")[0].replace('"', "").split(",")
    assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

    page = open_csv(page)
    for row in page:
        chamber = {"H": "lower", "S": "upper"}[row["office code"]]

        district = row["dist"].lstrip("0")
        assert district.isdigit(), "Invalid district found: {}".format(district)

        # Assemble "First [Middle] Last [Suffix]".
        name = row["first name"]
        mid = row["middle initial"].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row["last name"]
        suffix = row["suffix"].strip()
        if suffix:
            name += " %s" % suffix

        party = row["party"]
        if party == "Democrat":
            party = "Democratic"

        leg = Person(
            primary_org=chamber, name=name, district=district, party=party
        )

        legislator_url = row["URL"].replace("\\", "//").strip()
        if legislator_url != "":
            if not legislator_url.startswith("http"):
                # BUG FIX: the original overwrote the URL with the bare
                # scheme ("http://"); prepend it instead so the real
                # address is preserved.
                legislator_url = "http://" + legislator_url
            leg.add_link(legislator_url)

        leg.add_party(party=party)

        office_address = "%s\nRoom %s\nHartford, CT 06106" % (
            row["capitol street address"],
            row["room number"],
        )

        email = row["email"].strip()
        if "@" not in email:
            if not email:
                email = None
            elif email.startswith("http://") or email.startswith("https://"):
                # The field holds a contact-form URL, not an email address;
                # drop it rather than record a bogus email.
                email = None
            else:
                raise ValueError("Problematic email found: {}".format(email))

        leg.add_contact_detail(
            type="address", value=office_address, note="Capitol Office"
        )
        leg.add_contact_detail(
            type="voice", value=row["capitol phone"], note="Capitol Office"
        )
        if email:
            leg.add_contact_detail(type="email", value=email)

        home_address = "{}\n{}, {} {}".format(
            row["home street address"],
            row["home city"],
            row["home state"],
            row["home zip code"],
        )
        # A "home" address at the LOB is really the capitol; skip it.
        if "Legislative Office Building" not in home_address:
            leg.add_contact_detail(
                type="address", value=home_address, note="District Office"
            )
            if row["home phone"].strip():
                leg.add_contact_detail(
                    type="voice",
                    value=row["home phone"],
                    note="District Office",
                )

        leg.add_source(leg_url)

        # Committees are listed as "Name (role);Name;..." in one cell.
        for comm_name in row["committee member1"].split(";"):
            if " (" in comm_name:
                comm_name, role = comm_name.split(" (")
                role = role.strip(")").lower()
            else:
                role = "member"
            comm_name = comm_name.strip()
            if comm_name:
                if comm_name in committees:
                    com = committees[comm_name]
                else:
                    com = Organization(
                        comm_name, classification="committee", chamber=chamber
                    )
                    com.add_source(leg_url)
                    committees[comm_name] = com
                    # Yield each committee exactly once, on first sighting.
                    yield com
                leg.add_membership(name_or_org=com, role=role)

        yield leg
def scrape_chamber(self, chamber):
    """Scrape South Carolina legislators for one chamber.

    Yields each committee Organization once (the first time a member
    links to it), followed by a Person for every legislator listed on
    the chamber roster page.
    """
    if chamber == "lower":
        url = "http://www.scstatehouse.gov/member.php?chamber=H"
    else:
        url = "http://www.scstatehouse.gov/member.php?chamber=S"

    # committee name -> Organization; lets us yield each committee once
    seen_committees = {}

    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    for a in doc.xpath('//a[@class="membername"]'):
        full_name = a.text
        leg_url = a.get("href")

        # Strip the title prefix so only the bare name remains.
        if full_name.startswith("Senator"):
            full_name = full_name.replace("Senator ", "")
        if full_name.startswith("Representative"):
            full_name = full_name.replace("Representative ", "")

        leg_html = self.get(leg_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        # Members who have resigned are skipped entirely.
        if "Resigned effective" in leg_html:
            self.info("Resigned")
            continue

        # The specifically-styled <p> yields exactly three text nodes:
        # party, district line, and one more we discard.
        party, district, _ = leg_doc.xpath(
            '//p[@style="font-size: 17px;'
            ' margin: 0 0 0 0; padding: 0;"]/text()')
        if "Republican" in party:
            party = "Republican"
        elif "Democrat" in party:
            party = "Democratic"

        # District # - County - Map
        district = district.split()[1]

        try:
            photo_url = leg_doc.xpath(
                '//img[contains(@src,"/members/")]/@src')[0]
        except IndexError:
            self.warning("No Photo URL for {}".format(full_name))
            photo_url = ""

        person = Person(
            name=full_name,
            district=district,
            party=party,
            primary_org=chamber,
            image=photo_url,
        )

        # capitol office address
        try:
            capitol_address = lxml.etree.tostring(
                leg_doc.xpath('//h2[text()="Columbia Address"]/../p[1]')
                [0]).decode()
            if capitol_address:
                capitol_address = parse_address(capitol_address)
                person.add_contact_detail(type="address",
                                          value=capitol_address,
                                          note="Capitol Office")
        except IndexError:
            self.warning("no capitol address for {0}".format(full_name))

        # capitol office phone
        try:
            capitol_phone = (
                leg_doc.xpath('//h2[text()="Columbia Address"]/../p[2]')
                [0].text_content().strip())
            label, number = parse_phone(capitol_phone)
            if number:
                person.add_contact_detail(type="voice",
                                          value=number,
                                          note="Capitol Office")
        except IndexError:
            self.warning("no capitol phone for {0}".format(full_name))

        # home address
        try:
            home_address = lxml.etree.tostring(
                leg_doc.xpath('//h2[text()="Home Address"]/../p[1]')
                [0]).decode()
            if home_address:
                home_address = parse_address(home_address)
                person.add_contact_detail(type="address",
                                          value=home_address,
                                          note="District Office")
        except IndexError:
            self.warning("no home address for {0}".format(full_name))

        # home or business phone
        try:
            home_phone = (
                leg_doc.xpath('//h2[text()="Home Address"]/../p[2]')
                [0].text_content().strip())
            label, number = parse_phone(home_phone)
            if number:
                label = ("Primary Office" if label == "Business"
                         else "District Office")
                person.add_contact_detail(type="voice",
                                          value=number,
                                          note=label)
        except IndexError:
            self.warning(
                "no home or business phone for {0}".format(full_name))

        # business or home phone (third <p> is optional, so no warning)
        try:
            business_phone = (
                leg_doc.xpath('//h2[text()="Home Address"]/../p[3]')
                [0].text_content().strip())
            label, number = parse_phone(business_phone)
            if number:
                label = ("Primary Office" if label == "Business"
                         else "District Office")
                person.add_contact_detail(type="voice",
                                          value=number,
                                          note=label)
        except IndexError:
            pass

        person.add_link(leg_url)
        person.add_source(url)
        person.add_source(leg_url)

        # committees (skip first link)
        for com in leg_doc.xpath(
                '//a[contains(@href, "committee.php")]')[1:]:
            # A trailing ", " in the raw link text means a role suffix
            # follows the committee name in the full text content.
            if com.text.endswith(", "):
                committee, role = com.text_content().rsplit(", ", 1)

                # known roles; an unmapped abbreviation raises KeyError
                # on purpose so new roles get noticed and added here
                role = {
                    "Treas.": "treasurer",
                    "Secy.": "secretary",
                    "Secy./Treas.": "secretary/treasurer",
                    "V.C.": "vice-chair",
                    "1st V.C.": "first vice-chair",
                    "Co 1st V.C.": "co-first vice-chair",
                    "2nd V.C.": "second vice-chair",
                    "3rd V.C.": "third vice-chair",
                    "Ex.Officio Member": "ex-officio member",
                    "Chairman": "chairman",
                }[role]
            else:
                committee = com.text
                role = "member"

            # only yield each committee once; note `com` is rebound from
            # the anchor element to the Organization here
            if committee not in seen_committees:
                com = Organization(name=committee,
                                   classification="committee",
                                   chamber=chamber)
                com.add_source(url)
                seen_committees[committee] = com
                yield com
            else:
                com = seen_committees[committee]

            person.add_membership(com, role=role)

        yield person
def scrape_chamber(self, chamber):
    """Yield a Person for every sitting member of the given Arizona chamber."""
    body_code = {"lower": "H", "upper": "S"}[chamber]
    url = "http://www.azleg.gov/MemberRoster/?body=" + body_code
    page_text = self.get(url).text

    # there is a bad comment closing tag on this page
    page_text = page_text.replace("--!>", "-->")
    root = html.fromstring(page_text)

    # first table row is the header; the rest are members
    for row in root.xpath("//table//tr")[1:]:
        position = ""
        (name_td, district_td, party_td,
         email_td, room_td, phone_td) = row.xpath("td")

        # Skip any vacant members.
        if email_td.attrib.get("class") == "vacantmember":
            continue

        link = name_td.xpath("string(a/@href)")
        if len(name_td) == 1:
            name = name_td.text_content().strip()
        else:
            # extra child elements mean a leadership position trails the name
            position = name_td.tail.strip()
            name = name_td[0].text_content().strip()
        if "--" in name:
            name = name.split("--")[0].strip()

        detail_text = self.get(link).text
        detail_text = detail_text.replace("--!>", "-->")
        detail_root = html.fromstring(detail_text)
        detail_root.make_links_absolute(link)

        photos = detail_root.xpath("//img[contains(@src, 'MemberPhoto')]")
        if len(photos) != 1:
            self.warning("no photo on " + link)
            photo_url = ""
        else:
            photo_url = photos[0].attrib["src"]

        district = district_td.text_content().strip()
        party = party_td.text_content().strip()

        # Roster shows "Email: user"; the domain is implied.
        email = email_td.text_content().strip()
        if email.startswith("Email: "):
            email = email.replace("Email: ", "").lower() + "@azleg.gov"
        else:
            email = ""

        party = self.get_party(party)
        room = room_td.text_content().strip()

        chamber_name = (
            "House of Representatives\n" if chamber == "lower" else "Senate\n"
        )
        address = (
            chamber_name
            + "1700 West Washington\n Room "
            + room
            + "\nPhoenix, AZ 85007"
        )

        phone = phone_td.text_content().strip()
        # add the Phoenix area code when no digit group already matches it
        if "602" not in re.findall(r"(\d+)", phone):
            phone = "602-" + phone

        leg = Person(
            primary_org=chamber,
            image=photo_url,
            name=name,
            district=district,
            party=party,
        )
        leg.add_contact_detail(type="address", value=address,
                               note="Capitol Office")
        leg.add_contact_detail(type="voice", value=phone,
                               note="Capitol Office")
        leg.add_party(party=party)
        leg.add_link(link)
        if email:
            leg.add_contact_detail(type="email", value=email)
        if position:
            leg.add_membership(name_or_org=party, role=position)
        leg.add_source(url)
        yield leg