def test_same_name_people():
    """Check how PersonImporter treats several scraped people sharing a name.

    Walks through, in order: two same-named people imported together (error),
    a single import (ok), the pair again (error), the pair with distinct birth
    dates (one insert, one update), a third same-named person without a birth
    date (error), and finally an image change plus a birth-date change (one
    update, one insert).
    """
    create_jurisdiction()
    org = Organization.objects.create(name="WWE", jurisdiction_id="jid")

    def run_import(*scraped):
        # A fresh importer per call, fed the dict form of each scraped person.
        return PersonImporter("jid").import_data(
            [item.as_dict() for item in scraped])

    def attach_memberships():
        # create fake memberships so that future lookups work on the imported
        # people
        for person in Person.objects.all():
            Membership.objects.create(person=person, organization=org)

    first = ScrapePerson("Dwayne Johnson", image="http://example.com/1")
    second = ScrapePerson("Dwayne Johnson", image="http://example.com/2")

    # importing two people with the same name to a pristine database should
    # error
    with pytest.raises(SameNameError):
        run_import(first, second)

    # importing one person should pass
    run_import(first)
    attach_memberships()

    # importing another person with the same name should fail
    with pytest.raises(SameNameError):
        run_import(first, second)

    # adding birth dates should pass
    first.birth_date = "1970"
    second.birth_date = "1930"
    counts = run_import(first, second)["person"]
    assert counts["insert"] == 1
    assert counts["noop"] == 0
    assert counts["update"] == 1
    assert Person.objects.count() == 2
    attach_memberships()

    # adding a third person with the same name but without a birthday should
    # error
    third = ScrapePerson("Dwayne Johnson", image="http://example.com/3")
    with pytest.raises(SameNameError):
        run_import(third)

    # and now test that an update works and we can insert a new one with the
    # same name
    first.image = "http://example.com/1.jpg"
    second.birth_date = "1931"  # change birth_date, means a new insert
    counts = run_import(first, second)["person"]
    assert Person.objects.count() == 3
    assert counts["insert"] == 1
    assert counts["noop"] == 0
    assert counts["update"] == 1
Пример #2
0
    def _parse_person(self, row, chamber, seat_map):
        """Build a Person from one roster row.

        Args:
            row: mapping of roster column names to values. Keys read here:
                FirstName, MiddleName, LastName, District, party, WorkEmail,
                Address, address2, city, State, Zipcode, Phone, plus County
                and seatno for the lower chamber.
            chamber: "upper" or "lower".
            seat_map: mapping from a row's ``seatno`` to the value substituted
                into ``self.house_profile_url`` (lower chamber only).

        Returns:
            The populated Person, or None when the computed district is "0".
        """
        # Capture legislator vitals.
        first_name = row["FirstName"]
        middle_name = row["MiddleName"]
        last_name = row["LastName"]
        full_name = "{} {} {}".format(first_name, middle_name, last_name)
        # Collapse the double space left behind by an empty middle name.
        full_name = re.sub(r"[\s]{2,}", " ", full_name)

        if chamber == "lower":
            district = "{} {}".format(row["County"],
                                      int(row["District"])).strip()
        else:
            district = str(int(row["District"])).strip()

        party = self.party_map[row["party"].upper()]
        email = row["WorkEmail"]

        if district == "0":
            self.warning("Skipping {}, district is set to 0".format(full_name))
            return

        person = Person(primary_org=chamber,
                        district=district,
                        name=full_name,
                        party=party)

        person.extras = {
            "first_name": first_name,
            "middle_name": middle_name,
            "last_name": last_name,
        }

        if email:
            office = "Capitol" if email.endswith(
                "@leg.state.nh.us") else "District"
            person.add_contact_detail(type="email",
                                      value=email,
                                      note=office + " Office")

        # Capture legislator office contact information.
        address_parts = (
            row["Address"],
            row["address2"],
            row["city"],
            row["State"],
            row["Zipcode"],
        )
        # The template's literal ", " survives strip(), so the formatted
        # string is never empty. Inspect the components themselves to decide
        # whether there is an address worth recording.
        has_address = any(str(part).strip() for part in address_parts)
        district_address = "{}\n{}\n{}, {} {}".format(*address_parts).strip()

        phone = row["Phone"].strip()

        if has_address:
            office = "Capitol" if chamber == "upper" else "District"
            person.add_contact_detail(type="address",
                                      value=district_address,
                                      note=office + " Office")
        if phone:
            office = "Capitol" if "271-" in phone else "District"
            person.add_contact_detail(type="voice",
                                      value=phone,
                                      note=office + " Office")

        # Retrieve legislator portrait.
        profile_url = None
        if chamber == "upper":
            profile_url = self.senate_profile_url.format(row["District"])
        elif chamber == "lower":
            try:
                seat_number = seat_map[row["seatno"]]
                profile_url = self.house_profile_url.format(seat_number)
            except KeyError:
                # No seat mapping (or no seatno column): leave the member
                # without a portrait/profile source.
                pass

        if profile_url:
            person.image = self._get_photo(profile_url, chamber)
            person.add_source(profile_url)

        return person
Пример #3
0
    def _scrape_representative(self, url, parties):
        """
        Yield a Person object representing a member of the lower
        legislative chamber.

        This is a generator: it yields at most one Person, and yields
        nothing when the page's name has the form "District <number>".

        Args:
            url: address of the member page; also recorded on the Person as
                both a link and a source.
            parties: mapping indexed by the district string scraped from the
                page, giving the party passed to Person.
        """
        member_page = self.lxmlize(url)

        photo_url = member_page.xpath('//img[@class="member-photo"]/@src')[0]
        # A photo URL ending in "/.jpg" is treated as "no photo".
        if photo_url.endswith("/.jpg"):
            photo_url = None

        # Exactly two <h2> elements are expected here (name, then district);
        # any other count makes this unpacking raise ValueError.
        scraped_name, district_text = member_page.xpath(
            '//div[@class="member-info"]/h2')
        scraped_name = scraped_name.text_content().strip().replace("Rep. ", "")
        # Collapse runs of whitespace inside the name.
        scraped_name = " ".join(scraped_name.split())

        # Turn "Last, First" into "First Last".
        name = " ".join(scraped_name.split(", ")[::-1])

        district_text = district_text.text_content().strip()
        # self.district_re is defined outside this block; its first group is
        # used as the district.
        district = str(self.district_re.search(district_text).group(1))

        # Vacant house "members" are named after their district numbers:
        if re.match(r"^District \d+$", scraped_name):
            return None

        party = parties[district]

        person = Person(name=name,
                        district=district,
                        party=party,
                        primary_org="lower")

        if photo_url is not None:
            person.image = photo_url

        person.add_link(url)
        person.add_source(url)

        def office_name(element):
            """Returns the office address type."""
            # Text of the nearest preceding <h4> sibling, minus trailing ":".
            return element.xpath("preceding-sibling::h4[1]/text()")[0].rstrip(
                ":")

        # One entry per office paragraph: the heading text, a lower-cased
        # type derived from it, and the paragraph's raw text.
        offices_text = [{
            "name":
            office_name(p_tag),
            "type":
            office_name(p_tag).replace(" Address", "").lower(),
            "details":
            p_tag.text_content(),
        } for p_tag in member_page.xpath(
            '//h4/following-sibling::p[@class="double-space"]')]

        for office_text in offices_text:
            details = office_text["details"].strip()

            # A few member pages have blank office listings:
            if details == "":
                continue

            # At the time of writing, this case of multiple district
            # offices occurs exactly once, for the representative at
            # District 43:
            if details.count("Office") > 1:
                # Split the paragraph into one chunk per "<word> Office ..."
                # section.
                district_offices = [
                    district_office.strip() for district_office in re.findall(
                        r"(\w+ Office.+?(?=\w+ Office|$))",
                        details,
                        flags=re.DOTALL)
                ]
                # The new entries are appended to the list being iterated, so
                # this same loop visits them on later iterations.
                # NOTE(review): there is no `continue` here, so the combined
                # paragraph is still matched against address_re below as
                # well -- confirm that this is intended.
                offices_text += [{
                    "name":
                    re.match(r"\w+ Office", office).group(),
                    "type":
                    "district",
                    "details":
                    re.search(r"(?<=Office).+(?=\w+ Office|$)?", office,
                              re.DOTALL).group(),
                } for office in district_offices]

            # self.address_re is defined outside this block.
            match = self.address_re.search(details)
            if match is not None:
                # Drop carriage returns, collapse blank lines, and trim
                # trailing spaces from every line of the matched address.
                address = re.sub(
                    " +$",
                    "",
                    match.group().replace("\r", "").replace("\n\n", "\n"),
                    flags=re.MULTILINE,
                )
            else:
                # No valid address found in the details.
                continue

            # extract_phone / extract_fax are defined outside this block;
            # falsy results are simply not recorded.
            phone_number = extract_phone(details)
            fax_number = extract_fax(details)

            if address:
                person.add_contact_detail(type="address",
                                          value=address,
                                          note=office_text["name"])
            if phone_number:
                person.add_contact_detail(type="voice",
                                          value=phone_number,
                                          note=office_text["name"])
            if fax_number:
                person.add_contact_detail(type="fax",
                                          value=fax_number,
                                          note=office_text["name"])

        yield person
Пример #4
0
    def _scrape_senator(self, url, parties):
        """
        Yield a Person object representing a member of the upper
        legislative chamber.

        This is a generator: it yields at most one Person, and yields
        nothing when the page's name has the form "District <number>".

        Args:
            url: address of the member page; also recorded on the Person as
                both a link and a source.
            parties: mapping indexed by the district string scraped from the
                page title, giving the party passed to Person.
        """
        # Scrape legislator information from roster URL
        # Example: view-source:https://senate.texas.gov/member.php?d=1
        member_page = self.lxmlize(url)

        photo_url = member_page.xpath('//img[@id="memhead"]/@src')[0]
        # The page title is split on ":" into the name part and district part.
        title_text = member_page.xpath('//div[@class="pgtitle"]/text()')[0]
        scraped_name, district_text = title_text.split(":")
        name = " ".join(scraped_name.replace("Senator ", "").split()).strip()
        district = str(district_text.split()[1]).strip()
        # Vacant seats are named after their district numbers:
        if re.match(r"^District \d+$", name):
            return
        bio = " ".join(member_page.xpath('//div[@class="bio"]/text()'))
        party = parties[district]

        person = Person(
            name=name,
            district=district,
            party=party,
            primary_org="upper",
            biography=bio,
        )

        if photo_url is not None:
            person.image = photo_url
        person.add_link(url)
        person.add_source(url)

        # Get offices based on table headers: each <th> with both an id and a
        # text label names one office column.
        office_ids = []
        for th_tag in member_page.xpath('//table[@class="memdir"]/tr/th'):
            header_ids = th_tag.xpath("@id")
            header_texts = th_tag.xpath("text()")
            header_id = header_ids[0] if header_ids else ""
            label = header_texts[0].strip() if header_texts else ""
            if header_id != "" and label != "":
                office_ids.append({"id": header_id, "label": label})

        for office in office_ids:
            cells = member_page.xpath(
                f'//table[@class="memdir"]/tr/td[@headers="{office["id"]}"]')
            # A few member pages have broken ids for office listings:
            if not cells:
                cells = member_page.xpath(
                    '//table[@class="memdir"]/tr/td[@headers="dDA1"]')
            if not cells:
                # Neither the header's own cell nor the fallback exists.
                # Skip, rather than reuse the previous office's details (or
                # hit an unbound name on the first office).
                continue

            details = " ".join(cells[0].xpath("text()")).strip()
            details = details.replace("\r", "").replace("\n", "")
            # A few member pages have blank office listings:
            if details == "":
                continue

            # self.address_re is defined outside this block.
            match = self.address_re.search(details)
            if match is None:
                # No valid address found in the details.
                continue
            address = re.sub(
                " +$",
                "",
                match.group().replace("\r", "").replace("\n", ""),
                flags=re.MULTILINE,
            )

            # extract_phone / extract_fax are defined outside this block;
            # falsy results are simply not recorded.
            phone_number = extract_phone(details)
            fax_number = extract_fax(details)

            if address:
                person.add_contact_detail(type="address",
                                          value=address,
                                          note=office["label"])
            if phone_number:
                person.add_contact_detail(type="voice",
                                          value=phone_number,
                                          note=office["label"])
            if fax_number:
                person.add_contact_detail(type="fax",
                                          value=fax_number,
                                          note=office["label"])

        yield person
Пример #5
0
    def legislators(self, latest_only):
        """Collect one Person per legislator name across all memberships.

        Args:
            latest_only: passed straight through to ``self._memberships``.

        Returns:
            A dict mapping legislator name to a ``(Person, terms)`` pair,
            where ``terms`` is a list of ``(chamber, district, term, party)``
            tuples, one for each membership row seen for that name.
        """
        party_names = {"D": "Democratic", "R": "Republican", "I": "Independent"}
        bio_unavailable_notice = (
            "Senate Biography Information for the 98th General "
            "Assembly is not currently available."
        )
        office_tables = {
            "Capitol Office": '//table[contains(string(), "Springfield Office")]',
            "District Office": '//table[contains(string(), "District Office")]',
        }
        legs = {}

        for member, chamber, term, url in self._memberships(latest_only):
            name_td, _, _, district_td, party_td = member.xpath("td")
            district = district_td.text
            detail_url = name_td.xpath("a/@href")[0]

            # A blank party cell is recorded as Independent.
            if party_td.text_content().strip() == "":
                party = "Independent"
            else:
                party = party_names[party_td.text]
            name = name_td.text_content().strip()

            # inactive legislator, skip them for now
            if name.endswith("*"):
                continue

            name = AKA.get(name, name)

            term_record = (chamber, district, term, party)
            known = legs.get(name)
            if known is None:
                person = Person(name, party=party)
                legs[name] = person, [term_record]
            else:
                person, terms = known
                terms.append(term_record)

            person.add_source(url)
            person.add_source(detail_url)
            person.add_link(detail_url)

            birth_date = BIRTH_DATES.get(name, None)
            if birth_date:
                person.birth_date = birth_date

            leg_html = self.get(detail_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(detail_url)

            if bio_unavailable_notice in leg_html:
                # The legislator's bio isn't available yet.
                self.logger.warning("No legislator bio available for " + name)
                continue

            person.image = leg_doc.xpath(
                '//img[contains(@src, "/members/")]/@src')[0]

            # Contact details are rebuilt from scratch on every pass.
            person.contact_details = []
            # email
            email_labels = leg_doc.xpath('//b[text()="Email: "]')
            if email_labels:
                person.add_contact_detail(
                    type="email",
                    value=email_labels[0].tail.strip(),
                    note="Capitol Office",
                )

            for location, table_xpath in office_tables.items():
                tables = leg_doc.xpath(table_xpath)
                if not tables:
                    continue
                # The fourth matching table is the one handed to
                # _table_to_office.
                for kind, value in self._table_to_office(tables[3]):
                    if kind in ("fax", "voice") and not validate_phone_number(
                        value
                    ):
                        continue
                    person.add_contact_detail(
                        type=kind, value=value, note=location)

        return legs
Пример #6
0
    def scrape_chamber(self, chamber, session):
        """Yield a Person for each occupied seat on the chamber's roster page.

        Args:
            chamber: "upper" (senate roster) or "lower" (assembly roster).
            session: value substituted into the roster URL path.

        Yields:
            One Person per roster row that contains a link and is not marked
            "Vacant", populated from that member's "Details" page.

        Raises:
            KeyError: if ``chamber`` is neither "upper" nor "lower".
            AssertionError: if a member's page yields no party text.
        """
        chamber_slug = {"upper": "senate", "lower": "assembly"}[chamber]
        url = "https://docs.legis.wisconsin.gov/{}/legislators/{}".format(
            session, chamber_slug)

        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        for row in page.xpath(
                ".//div[@class='box-content']/div[starts-with(@id,'district')]"
        ):
            # Skip rows without any link and rows explicitly marked vacant.
            if not row.xpath(".//a/@href") or row.xpath(
                    ".//a[text()='Vacant']"):
                continue

            details_href = row.xpath(".//a[text()='Details']/@href")[0]
            # Force the https scheme. The scheme is removed as an anchored
            # prefix: str.strip("https://") treats its argument as a set of
            # characters and would also eat matching characters from the
            # start of the host name and from the end of the URL.
            rep_url = "https://" + re.sub(r"^https?://", "", details_href)
            rep_doc = lxml.html.fromstring(self.get(rep_url).text)
            rep_doc.make_links_absolute(rep_url)

            full_name = (rep_doc.xpath('.//div[@id="district"]/h1/text()')
                         [0].replace("Senator ",
                                     "").replace("Representative ", ""))

            # The party code is the text before the first "-", minus "(".
            party = rep_doc.xpath('.//div[@id="district"]//small/text()')
            if len(party) > 0:
                party = PARTY_DICT[party[0].split("-")[0].strip("(").strip()]
            else:
                party = None

            # The district number is the last path segment of the second
            # link under the <h3>.
            district = rep_doc.xpath(
                './/div[@id="district"]/h3/a/@href')[1]
            district = district.split("/")[-1]
            district = str(int(district))

            # email
            email = rep_doc.xpath("//span[@class='info email']/a/text()")
            if email:
                email = email[0]
            else:
                email = ""

            assert party is not None, "{} is missing party".format(
                full_name)

            person = Person(name=full_name,
                            district=district,
                            primary_org=chamber,
                            party=party)

            img = rep_doc.xpath('.//div[@id="district"]/img/@src')
            if img:
                person.image = img[0]

            # office ####
            address_lines = rep_doc.xpath(
                './/span[@class="info office"]/text()')
            address = "\n".join([
                line.strip() for line in address_lines
                if line.strip() != ""
            ])
            person.add_contact_detail(type="address",
                                      value=address,
                                      note="Capitol Office")

            # For phone and fax the second text node is the one used.
            phone = rep_doc.xpath(
                './/span[@class="info telephone"]/text()')
            if phone:
                phone = re.sub(r"\s+", " ", phone[1]).strip()
                person.add_contact_detail(type="voice",
                                          value=phone,
                                          note="Capitol Office")

            fax = rep_doc.xpath('.//span[@class="info fax"]/text()')
            if fax:
                fax = re.sub(r"\s+", " ", fax[1]).strip()
                person.add_contact_detail(type="fax",
                                          value=fax,
                                          note="Capitol Office")

            if email:
                person.add_contact_detail(type="email",
                                          value=email,
                                          note="Capitol Office")

            person.add_link(rep_url)
            person.add_source(rep_url)

            yield person
Пример #7
0
    def _scrape_upper_chamber(self):
        """Yield a Person for each seated senator listed on the roster page.

        Rows whose name starts with "Vacant"/"Vacancy" or ends with "Vacant"
        are skipped with a warning, as are senators whose details page
        contains "currently vacant". Address, phone, fax and email are taken
        from the details page and recorded under "Capitol Office".
        """
        self.info("Scraping upper chamber for legislators.")

        chamber = "upper"
        party_names = {"D": "Democratic", "R": "Republican"}
        phone_pattern = re.compile(r"\(\d{3}\) \d{3}-\d{4}")

        source_url = self._senators_url
        roster = lxml.html.fromstring(self.get(source_url).text)
        rows = roster.xpath('//*[@id="content-2"]//table//tr')

        # the first two rows are headers, skip:
        for tr in rows[2:]:
            tds = tr.xpath("td")
            full_name = tds[0].xpath("div/a")[0].text_content().strip()

            if full_name.startswith(
                ("Vacant", "Vacancy")) or full_name.endswith("Vacant"):
                self.warning("Skipping vacancy, named '{}'".format(full_name))
                continue

            # The cell holds "<party code>-<district>".
            party_and_district = tds[1].text_content().strip().split("-")
            # Always derive the party from this row, so an unrecognised code
            # can never inherit the previous senator's party or leave the
            # name unbound. Unknown codes pass through unchanged and a blank
            # one becomes "Other", as in the lower-chamber scraper.
            party_code = party_and_district[0]
            party = party_names.get(party_code, party_code)
            if party.strip() == "":
                party = "Other"

            district = party_and_district[1].lstrip("0")
            url = self._senator_details_url.format(int(district))

            details_html = self.get(url).text
            if "currently vacant" in details_html:
                continue

            person = Person(name=full_name,
                            primary_org=chamber,
                            district=district,
                            party=party)

            person.add_source(source_url)
            person.add_source(url)
            person.add_link(url)

            details_doc = lxml.html.fromstring(details_html)
            photo_url = details_doc.xpath(
                '//*[@id="content-2"]//img[contains(@src, "uploads")]/@src')[0]

            # Lines of the first paragraph of the text widget, minus any line
            # carrying the "Capitol Office:" label.
            widget_text = details_doc.xpath(
                '//div[@class="textwidget"]/p[1]')[0].text_content()
            contact_info = [
                line.strip() for line in widget_text.split("\n")
                if "Capitol Office:" not in line
            ]
            # The first two remaining lines are used as the address.
            address = "\n".join(contact_info[:2])
            email = next((line for line in contact_info if "@" in line), None)

            phone_lines = [
                line for line in contact_info
                if phone_pattern.search(line) is not None
            ]
            # The first line containing a phone number supplies the voice
            # number; tolerate pages that list none.
            phone = (phone_pattern.search(phone_lines[0]).group()
                     if phone_lines else None)
            fax = next(
                (phone_pattern.search(line).group()
                 for line in phone_lines if "fax" in line.lower()),
                None,
            )

            person.add_contact_detail(type="address",
                                      value=address,
                                      note="Capitol Office")
            if phone:
                person.add_contact_detail(type="voice",
                                          value=phone,
                                          note="Capitol Office")
            if fax:
                person.add_contact_detail(type="fax",
                                          value=fax,
                                          note="Capitol Office")
            if email:
                person.add_contact_detail(type="email",
                                          value=email,
                                          note="Capitol Office")

            person.image = photo_url

            yield person
Пример #8
0
    def _scrape_lower_chamber(self):
        """Yield a Person for each non-vacant row of the House roster table.

        Rows whose last name is "Vacant" are not yielded; the Person built
        for them is passed to ``self._save_vacant_legislator`` instead. The
        generator stops with a warning as soon as it meets a row whose class
        is "dxgvEmptyDataRow".
        """
        self.info("Scraping lower chamber for legislators.")

        chamber = "lower"
        party_names = {"D": "Democratic", "R": "Republican"}
        party_override = {" Green": "Democratic", " Sisco": "Republican"}

        roster_url = self._reps_url
        roster = lxml.html.fromstring(self.get(roster_url).text)
        # This is the ASP.net table container
        member_table = roster.xpath("//table[@id='theTable']")[0]

        for tr in member_table.xpath("tr")[3:]:
            # If a given term hasn't occurred yet, then ignore it
            # Eg, in 2017, the 2018 term page will have a blank table
            if tr.attrib.get("class") == "dxgvEmptyDataRow":
                self.warning("No House members found")
                return

            cells = tr.xpath("td")
            last_name = cells[1].text_content().strip()
            first_name = cells[2].text_content().strip()
            full_name = "{} {}".format(first_name, last_name)
            district = str(int(cells[3].text_content().strip()))

            # Expand the known one-letter codes; anything else passes through.
            party_code = cells[4].text_content().strip()
            party = party_names.get(party_code, party_code)
            if not party.strip():  # Workaround for now.
                party = "Other"

            phone = cells[6].text_content().strip()
            room = cells[7].text_content().strip()
            address = self._assumed_address_fmt.format(room or "")

            if last_name == "Vacant":
                vacancy = Person(name=full_name,
                                 primary_org=chamber,
                                 district=district,
                                 party=party)
                vacancy.extras = {
                    "first_name": first_name,
                    "last_name": last_name,
                }
                vacancy.add_contact_detail(type="address",
                                           value=address,
                                           note="Capitol Office")
                if phone.strip():
                    vacancy.add_contact_detail(type="voice",
                                               value=phone,
                                               note="Capitol Office")
                vacancy.add_source(roster_url)

                self._save_vacant_legislator(vacancy)
                continue

            # NOTE(review): a blank party has already been replaced by
            # "Other" above, so this override can never fire as written --
            # confirm whether it should be applied first or removed.
            if party == "" and full_name in party_override:
                party = party_override[full_name]

            details_url = self._rep_details_url.format(district)
            details_page = lxml.html.fromstring(self.get(details_url).text)

            person = Person(name=full_name,
                            primary_org=chamber,
                            district=district,
                            party=party)
            person.extras = {
                "first_name": first_name,
                "last_name": last_name,
            }
            person.add_source(roster_url)
            person.add_source(details_url)
            person.add_link(details_url)

            # Use the first mailto: link in the addresses block, unless it is
            # a bare "mailto:" with no address.
            mailto_links = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_lblAddresses"] '
                '//a[starts-with(@href,"mailto:")]/@href')
            email = None
            if mailto_links and mailto_links[0].lower() != "mailto:":
                email = mailto_links[0].split(":")[1]

            person.add_contact_detail(type="address",
                                      value=address,
                                      note="Capitol Office")
            if phone:
                person.add_contact_detail(type="voice",
                                          value=phone,
                                          note="Capitol Office")
            if email:
                person.add_contact_detail(type="email",
                                          value=email,
                                          note="Capitol Office")

            portraits = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
            if portraits:
                person.image = portraits[0]

            yield person