def test_committee_add_member_person():
    """Adding a Person to a committee creates a correctly-linked membership."""
    committee = Organization("Defense", classification="committee")
    chairman = Person("John Adams")
    committee.add_member(chairman, role="chairman")
    membership = committee._related[0]
    assert membership.person_id == chairman._id
    assert membership.organization_id == committee._id
    assert membership.role == "chairman"
Exemplo n.º 2
0
    def handle_list_item(self, item):
        """Build a Senator Person from one roster list item (a link element).

        Returns the Person, or None for vacant seats.
        """
        # Collapse whitespace and fix "space before comma" artifacts in the name.
        name = " ".join(item.xpath(".//text()"))
        name = re.sub(r"\s+", " ", name).replace(" ,", ",").strip()

        if "Vacant" in name:
            return

        # District and party live in sibling table cells of the link's row.
        district = item.xpath("string(../../td[1])")
        party = item.xpath("string(../../td[2])")
        if party == "Democrat":
            party = "Democratic"

        leg_url = item.get("href")

        name = fix_name(name)
        leg = Person(
            name=name,
            district=district,
            party=party,
            primary_org="upper",
            role="Senator",
        )
        leg.add_link(leg_url)
        leg.add_source(self.url)
        leg.add_source(leg_url)

        # Detail-page scrape enriches `leg` in place.
        self.scrape_page(SenDetail, leg_url, obj=leg)

        return leg
Exemplo n.º 3
0
    def scrape_member(self, chamber, link):
        """Scrape a single Iowa legislator from a roster-table link.

        Yields a Person built from the row's cells plus the member's
        detail page.
        """
        name = link.text.strip()
        leg_url = link.get("href")
        # District and party are the 3rd and 4th cells of the same row.
        district = link.xpath("string(../../td[3])")
        party = link.xpath("string(../../td[4])")

        # we get email on the next page now
        # email = link.xpath("string(../../td[5])")

        if party == "Democrat":
            party = "Democratic"
        elif party == "No Party Specified":
            party = "Independent"

        # The photo service is keyed by general assembly number + person id.
        pid = re.search(r"personID=(\d+)", link.attrib["href"]).group(1)
        photo_url = ("https://www.legis.iowa.gov/photo"
                     "?action=getPhoto&ga=%s&pid=%s" %
                     (self.latest_session(), pid))

        leg = Person(
            name=name,
            primary_org=chamber,
            district=district,
            party=party,
            image=photo_url,
        )

        leg.add_link(leg_url)
        leg.add_source(leg_url)

        # The detail page adds further data to `leg` in place.
        leg_page = lxml.html.fromstring(self.get(link.attrib["href"]).text)
        self.scrape_member_page(leg, leg_page)
        yield leg
Exemplo n.º 4
0
    def scrape_member_page(self, chamber, url):
        """Scrape every member card on an Ohio chamber roster page.

        Yields a Person per non-vacant member, with capitol office
        address/phone/email contact details attached.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # One 'memberModule' div per legislator card on the roster page.
        for legislator in page.xpath(
            "//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' memberModule ')]"
        ):
            img = legislator.xpath(".//div[@class='thumbnail']//img")[0].attrib["src"]
            data = legislator.xpath(".//div[@class='data']")[0]
            homepage = data.xpath(".//a[@class='black']")[0]
            full_name = homepage.text_content()

            if "Vacant" in full_name:
                continue

            homepage = homepage.attrib["href"]
            party = data.xpath(".//span[@class='partyLetter']")[0].text_content()
            party = {"R": "Republican", "D": "Democratic"}[party]
            # Free text nodes hold the office lines; the last one is the phone.
            office_lines = data.xpath("child::text()")
            phone = office_lines.pop(-1)

            # A leadership title may precede the address lines; drop it.
            if re.search(r"(Leader|Whip|Speaker)", office_lines[0]):
                office_lines.pop(0)

            office = "\n".join(office_lines)
            # District is either after the <br> inside the h3, or encoded in
            # the card's background-image filename (e.g. ".../42.png").
            h3 = data.xpath("./h3")
            if len(h3):
                h3 = h3[0]
                district = h3.xpath("./br")[0].tail.replace("District", "").strip()
            else:
                district = re.findall(r"\d+\.png", legislator.attrib["style"])[
                    -1
                ].split(".", 1)[0]

            full_name = re.sub(r"\s+", " ", full_name).strip()
            # Email is derived from the zero-padded district number.
            # NOTE(review): width=2 pads only to two digits — confirm intended
            # for any three-digit districts.
            email = (
                "rep{0:0{width}}@ohiohouse.gov"
                if chamber == "lower"
                else "sd{0:0{width}}@ohiosenate.gov"
            ).format(int(district), width=2)

            leg = Person(
                name=full_name,
                district=district,
                party=party,
                primary_org=chamber,
                image=img,
            )

            leg.add_contact_detail(type="address", value=office, note="Capitol Office")
            leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
            leg.add_contact_detail(type="email", value=email, note="Capitol Office")

            # Homepage scrape adds more details to `leg` in place.
            self.scrape_homepage(leg, chamber, homepage)

            leg.add_source(url)
            leg.add_link(homepage)
            yield leg
Exemplo n.º 5
0
def test_save_object_invalid():
    """Saving a Person with no source must fail validation."""
    scraper = Scraper(juris, "/tmp/")
    person = Person("Michael Jordan")

    # no add_source() call → save_object should refuse it
    with pytest.raises(ValueError):
        scraper.save_object(person)
Exemplo n.º 6
0
    def scrape_legislator(self, chamber, name, url):
        """Scrape one legislator's bio page and yield a Person."""
        html = self.get(url).text
        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        # Header reads e.g. "... DISTRICT 07": take the number, strip the
        # leading zero.
        district = (page.xpath('//h1[contains(., "DISTRICT")]/text()').pop().
                    split()[1].strip().lstrip("0"))

        # Party letter appears parenthesized in the last <h2>, e.g. "(R-...)".
        party = page.xpath("//h2").pop().text_content()
        party = re.search(r"\((R|D|I)[ \-\]]", party).group(1)

        if party == "D":
            party = "Democratic"
        elif party == "R":
            party = "Republican"
        elif party == "I":
            party = "Independent"

        photo_url = page.xpath(
            "//img[contains(@src, 'images/members/')]")[0].attrib["src"]

        leg = Person(name,
                     district=district,
                     party=party,
                     image=photo_url,
                     primary_org=chamber)
        leg.add_link(url)
        leg.add_source(url)
        # Office details are added to `leg` in place.
        self.scrape_offices(leg, page)

        yield leg
Exemplo n.º 7
0
    def scrape_lower(self, chamber):
        """Scrape Michigan House members from the public roster table.

        Yields a Person per occupied seat; vacant districts are logged
        and skipped.
        """
        url = "http://www.house.mi.gov/mhrpublic/frmRepList.aspx"
        # Column order of the roster table, left to right.
        table = ["website", "district", "name", "party", "location", "phone", "email"]

        data = self.get(url).text
        doc = lxml.html.fromstring(data)

        # skip two rows at top
        for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
            tds = row.xpath(".//td")
            if len(tds) == 0:
                continue
            # Map each cell element to its column name.
            metainf = {}
            for i in range(0, len(table)):
                metainf[table[i]] = tds[i]
            district = str(int(metainf["district"].text_content().strip()))
            party = metainf["party"].text_content().strip()
            phone = metainf["phone"].text_content().strip()
            email = metainf["email"].text_content().strip()
            name = metainf["name"].text_content().strip()
            # Vacant seats show "Vacant" or just a bare "District NN" label.
            if name == "Vacant" or re.match(r"^District \d{1,3}$", name):
                self.warning(
                    "District {} appears vacant, and will be skipped".format(district)
                )
                continue
            leg_url = metainf["website"].xpath("./a")[0].attrib["href"]

            # Expand building abbreviations into full mailing addresses.
            office = metainf["location"].text_content().strip()
            office = re.sub(
                " HOB",
                " Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933",
                office,
            )
            office = re.sub(" CB", " State Capitol Building\nLansing, MI 48909", office)

            # Photo is best-effort; missing photos are logged, not fatal.
            try:
                photo_url = self.get_photo_url(leg_url)[0]
            except (scrapelib.HTTPError, IndexError):
                photo_url = ""
                self.warning("no photo url for %s", name)

            person = Person(
                name=name,
                district=district,
                party=abbr[party],
                primary_org="lower",
                image=photo_url,
            )

            person.add_link(leg_url)
            person.add_source(leg_url)

            person.add_contact_detail(
                type="address", value=office, note="Capitol Office"
            )
            person.add_contact_detail(type="voice", value=phone, note="Capitol Office")
            person.add_contact_detail(type="email", value=email, note="Capitol Office")

            yield person
def test_person_add_party():
    """add_party creates a validating membership to a pseudo-id party org."""
    person = Person("Groot")
    person.add_party("Green")
    membership = person._related[0]
    membership.validate()
    expected = {"name": "Green", "classification": "party"}
    assert get_pseudo_id(membership.organization_id) == expected
Exemplo n.º 9
0
    def scrape_chamber(self, chamber):
        """Scrape Indiana legislators for one chamber via the IGA API.

        Yields a Person per legislator whose HTML bio page and district
        heading can be fetched; others are logged and skipped.
        """
        client = ApiClient(self)
        session = self.latest_session()
        base_url = "http://iga.in.gov/legislative"
        api_base_url = "https://api.iga.in.gov"
        chamber_name = "senate" if chamber == "upper" else "house"
        r = client.get("chamber_legislators",
                       session=session,
                       chamber=chamber_name)
        all_pages = client.unpaginate(r)
        for leg in all_pages:
            firstname = leg["firstName"]
            lastname = leg["lastName"]
            party = leg["party"]
            link = leg["link"]
            api_link = api_base_url + link
            # The HTML bio page uses a "legislator_" prefix in its path.
            html_link = base_url + link.replace("legislators/",
                                                "legislators/legislator_")
            try:
                html = get_with_increasing_timeout(self,
                                                   html_link,
                                                   fail=True,
                                                   kwargs={"verify": False})
            except scrapelib.HTTPError:
                self.logger.warning("Legislator's page is not available.")
                continue
            doc = lxml.html.fromstring(html.text)
            doc.make_links_absolute(html_link)
            # The bio page carries exactly two <address> elements: mailing
            # address first, then phone.
            address, phone = doc.xpath("//address")
            address = address.text_content().strip()
            address = "\n".join([ln.strip() for ln in address.split("\n")])
            phone = phone.text_content().strip()
            try:
                district = (doc.xpath("//span[@class='district-heading']")
                            [0].text.lower().replace("district", "").strip())
            except IndexError:
                self.warning("skipping legislator w/o district")
                continue
            # Portrait URL mirrors the bio-page path with a different prefix.
            image_link = base_url + link.replace("legislators/",
                                                 "portraits/legislator_")
            legislator = Person(
                primary_org=chamber,
                district=district,
                name=" ".join([firstname, lastname]),
                party=party,
                image=image_link,
            )
            legislator.add_contact_detail(type="address",
                                          note="Capitol Office",
                                          value=address)
            legislator.add_contact_detail(type="voice",
                                          note="Capitol Office",
                                          value=phone)
            legislator.add_link(html_link)
            legislator.add_source(html_link)
            legislator.add_source(api_link)

            yield legislator
Exemplo n.º 10
0
    def scrape_senator_page(self, chamber, url):
        """Scrape Ohio senators from the senate roster page.

        Yields a Person per non-vacant portrait container, enriched with
        contact details from each senator's homepage.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for legislator in page.xpath(
            "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' portraitContainer ')]"
        ):
            # Portrait URL is embedded in an inline style: "url(...)".
            img = legislator.xpath(
                ".//div[@class='profileThumbnailBoundingBox']/@style"
            )[0]
            img = img[img.find("(") + 1 : img.find(")")]
            full_name = legislator.xpath(".//div[@class='profileName']/a/text()")[0]
            homepage_url = legislator.xpath(".//a[@class='profileImageLink']")[
                0
            ].attrib["href"]
            # District text contains "#NN"; keep the part after the hash.
            district = legislator.xpath(".//div[@class='profileDistrict']" "/a/text()")[
                0
            ].split("#")[1]

            if "Vacant" in full_name:
                continue

            homepage = self.get(homepage_url).text
            page = lxml.html.fromstring(homepage)
            phone = page.xpath("//div[@class='phone']/span/text()")[0]

            address_lines = page.xpath("//div[@class='address']/descendant::*/text()")
            address = "\n".join(address_lines)

            # Party is inferred from the party-logo image filename.
            # NOTE(review): `party` stays unbound if the filename matches
            # neither string, which would raise NameError below — confirm the
            # roster only ever shows R/D images.
            party_image = page.xpath('//div[@class="senatorParty"]/img/@src')[0]
            if "Republican" in party_image:
                party = "Republican"
            elif "Democrat" in party_image:
                party = "Democratic"

            # Email is derived from the zero-padded district number.
            email = (
                "rep{0:0{width}}@ohiohouse.gov"
                if chamber == "lower"
                else "sd{0:0{width}}@ohiosenate.gov"
            ).format(int(district), width=2)

            leg = Person(
                name=full_name,
                district=district,
                primary_org=chamber,
                image=img,
                party=party,
            )

            leg.add_contact_detail(type="address", value=address, note="Capitol Office")
            leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
            leg.add_contact_detail(type="email", value=email, note="Capitol Office")

            leg.add_source(url)
            leg.add_link(homepage_url)
            yield leg
Exemplo n.º 11
0
    def scrape_rep(self, url):
        """Scrape one House member's profile page and yield a Person.

        Resigned or deceased members are skipped: the generator simply
        returns without yielding.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        main = page.xpath('//div[@id="main-info"]')[0]
        # BUG FIX: `raise StopIteration` inside a generator becomes a
        # RuntimeError under PEP 479 (Python 3.7+); a bare `return` is the
        # correct way to finish without yielding.
        if "Resigned" in main.text_content():
            print("Member resigned {}".format(url))
            return  # don't yield anything

        if "Deceased" in main.text_content():
            print("Member is deceased {}".format(url))
            return  # don't yield anything

        name = page.xpath('//div[@class="member-name"]/text()')[0].strip()
        name = re.sub(r"\s+", " ", name)
        district_number = page.xpath(
            '//span[contains(text(), "House District:")]'
            "/following-sibling::span/text()")[0].strip()
        # remove anything after first whitespace
        district_number = re.sub(r"\s.*", "", district_number.strip())

        # Email link (if any) is the anchor wrapping an envelope icon.
        email = None
        email_content = page.xpath(
            '//a[./i[contains(@class,"fa-envelope")]]/text()')
        if email_content and email_content[0].strip():
            email = email_content[0].strip()

        photo_url = page.xpath('//header[@id="home"]/img/@src')[0]

        party = self.get_rep_table_by_header(page,
                                             "Party Affiliation").text.strip()
        party = _party_map[party[0]]  # standardize

        # First non-empty paragraph in main-info is the district address.
        main_p_text = page.xpath('//div[@id="main-info"]/p/text()')
        address = [t.strip() for t in main_p_text if t.strip()][0]

        person = Person(
            name=name,
            district=district_number,
            primary_org="lower",
            party=party,
            image=photo_url,
        )

        person.add_contact_detail(type="address",
                                  value=address,
                                  note="District Office")
        if email:
            person.add_contact_detail(type="email",
                                      value=email,
                                      note="District Office")

        person.add_link(url)
        person.add_source(url)

        yield person
Exemplo n.º 12
0
    def get_member(self, session, chamber, kpid):
        """Scrape one Kansas legislator from the KS legislature JSON API.

        `kpid` is the member's API id; yields a Person with capitol
        office contact details.
        """
        url = "%smembers/%s" % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)["content"]

        party = content["PARTY"]
        if party == "Democrat":
            party = "Democratic"

        # Biennium slug used in member bio URLs.
        slug = {
            "2013-2014": "b2013_14",
            "2015-2016": "b2015_16",
            "2017-2018": "b2017_18",
            "2019-2020": "b2019_20",
        }[session]
        leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (slug,
                                                                      kpid)

        try:
            legislator_page = self.lxmlize(leg_url)
            (photo_url,
             ) = legislator_page.xpath('//img[@class="profile-picture"]/@src')
        except scrapelib.HTTPError:
            # Bio page missing: drop both the link and the photo.
            self.warning("{}'s legislator bio page not found".format(
                content["FULLNAME"]))
            leg_url = ""
            photo_url = ""

        person = Person(
            name=content["FULLNAME"],
            district=str(content["DISTRICT"]),
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.extras = {"occupation": content["OCCUPATION"]}

        # All members share the same capitol street address; only the room
        # number comes from the API.
        address = "\n".join([
            "Room {}".format(content["OFFICENUM"]),
            "Kansas State Capitol Building",
            "300 SW 10th St.",
            "Topeka, KS 66612",
        ])

        note = "Capitol Office"
        person.add_contact_detail(type="address", value=address, note=note)
        person.add_contact_detail(type="email",
                                  value=content["EMAIL"],
                                  note=note)
        if content["OFFPH"]:
            person.add_contact_detail(type="voice",
                                      value=content["OFFPH"],
                                      note=note)

        person.add_source(url)
        person.add_link(leg_url)

        yield person
def test_basic_invalid_person():
    """A Person validates with a name but fails once the name is removed."""
    bob = Person("Bob B. Johnson")
    bob.add_source(url="http://example.com")
    # valid while the name is present
    bob.validate()

    # removing the name must break validation
    bob.name = None
    with pytest.raises(ScrapeValueError):
        bob.validate()
def test_person_add_term():
    """add_term creates a dated membership against an org classification."""
    person = Person("Eternal")
    person.add_term("eternal", "council", start_date="0001", end_date="9999")
    term = person._related[0]
    term.validate()
    assert get_pseudo_id(term.organization_id) == {"classification": "council"}
    assert (term.start_date, term.end_date) == ("0001", "9999")
Exemplo n.º 15
0
    def _scrape_legislator(self, row, chamber):
        """Scrape one Montana legislator from a roster table row.

        Yields a Person with a capitol address (from the details page)
        plus any phone/email found in the row.
        """
        name_cell = row.xpath('./td[@class="rosterCell nameCell"]/a')[0]
        # The name may be split across lines; rejoin the non-empty pieces.
        name = " ".join([
            line.strip() for line in name_cell.text_content().split("\n")
            if len(line.strip()) > 0
        ])

        party_letter = row.xpath(
            './td[@class="rosterCell partyCell"]/text()')[0].strip()
        party = dict(D="Democratic", R="Republican")[party_letter]

        # The seat cell prefixes the district with a chamber abbreviation.
        chamber_abbr = self._chamber_map[chamber]
        district = (row.xpath('./td[@class="rosterCell seatCell"]'
                              "/text()")[0].replace(chamber_abbr, "").strip())
        try:
            email = (row.xpath('./td[@class="rosterCell emailCell"]'
                               "/a/@href")[0].replace("mailto:", "").strip())
        except IndexError:
            email = None

        phone = (row.xpath('./td[@class="rosterCell phoneCell"]'
                           "/text()")[0].strip() or None)

        # The member's details page holds the mailing address.
        details_url = "https://leg.mt.gov{}".format(name_cell.attrib["href"])
        response = self.get(details_url)
        details_page = lxml.html.fromstring(response.text)

        address_lines = (details_page.xpath(
            '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]'
            '/p[contains(text(), "Address")]')[0].text_content().replace(
                "Address", "").split("\n"))
        address = "\n".join(
            [line.strip() for line in address_lines if len(line.strip()) > 0])

        legislator = Person(name=name,
                            district=district,
                            party=party,
                            primary_org=chamber)

        legislator.add_contact_detail(type="address",
                                      value=address,
                                      note="Capitol Office")
        if phone is not None:
            legislator.add_contact_detail(type="voice",
                                          value=phone,
                                          note="Capitol Office")

        if email is not None:
            legislator.add_contact_detail(type="email",
                                          value=email,
                                          note="E-mail")

        legislator.add_link(details_url)
        legislator.add_source(self._roster_url)

        yield legislator
Exemplo n.º 16
0
    def handle_list_item(self, row):
        """Build a Senator Person from one roster row (dict of columns).

        Returns the Person, or None when the row has no first name.
        """
        if not row["First Name"]:
            return
        name = "{} {}".format(row["First Name"], row["Last Name"])
        party = PARTIES[row["Party"]]
        leg = Person(
            name=name,
            district=row["District"].lstrip("0"),
            party=party,
            primary_org="upper",
            role="Senator",
            image=self.extra_info[name]["image"],
        )
        leg.add_link(self.extra_info[name]["url"])
        leg.add_contact_detail(type="voice",
                               value=self.extra_info[name]["office_phone"],
                               note="capitol")
        if "email" in self.extra_info[name]:
            leg.add_contact_detail(type="email",
                                   value=self.extra_info[name]["email"],
                                   note="capitol")

        row["Zipcode"] = row["Zipcode"].strip()
        # Accommodate for multiple address column naming conventions.
        address1_fields = [row.get("Address"), row.get("Office Building")]
        address2_fields = [row.get("Address2"), row.get("Office Address")]
        row["Address"] = next((a for a in address1_fields if a is not None),
                              False)
        row["Address2"] = next((a for a in address2_fields if a is not None),
                               False)

        # BUG FIX: the original tested a bare generator expression, which is
        # always truthy, so every address was filed under the "capitol" note.
        # Use any() — and require a real Address2 — so only known capitol
        # buildings are treated as capitol offices.
        if row["Address2"] and any(
                a in row["Address2"] for a in
                ["95 University Avenue W", "100 Rev. Dr. Martin Luther King"]):
            address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(
                **row)
            if "Rm. Number" in row:
                address = "{0} {1}".format(row["Rm. Number"], address)
            leg.add_contact_detail(type="address",
                                   value=address,
                                   note="capitol")
        elif row["Address2"]:
            address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(
                **row)
            leg.add_contact_detail(type="address",
                                   value=address,
                                   note="district")
        else:
            address = "{Address}\n{City}, {State} {Zipcode}".format(**row)
            leg.add_contact_detail(type="address",
                                   value=address,
                                   note="district")

        leg.add_source(self.url)
        leg.add_source(self._html_url)

        return leg
Exemplo n.º 17
0
    def scrape_senator(self, district):
        """Scrape one Maine senator from their district page.

        Yields a Person with any district office address/phone/email found.
        """
        link = "https://legislature.maine.gov/District-{}".format(district)
        page = lxml.html.fromstring(self.get(link).text)
        page.make_links_absolute(link)

        main = page.xpath('//div[@id="main"]/div[@id="content"]')[0]
        title = main.xpath("h1")[0].text
        # e.g. District 25 - State Senator Catherine Breen (D - Cumberland)...
        title_match = re.match(
            r"District (\d+) - State Senator ([^\(]+) \(([DRI])", title)
        _, name, party = title_match.groups()
        name = re.sub(r"\s+", " ", name.strip())
        party = _party_map[party]

        image_url = address = phone = email = None

        # Fields appear as "Label: value" paragraphs; first <img> is the photo.
        for p in main.xpath("p"):
            if p.xpath(".//img") and not image_url:
                image_url = p.xpath(".//img/@src")[0]
                continue
            field, _, value = p.text_content().partition(":")
            value = value.strip()
            if field in ("Address", "Mailing Address"):
                address = value
            elif field in ("Phone", "Home Phone"):
                phone = value
            elif field == "Email":
                email = value

        person = Person(
            name=name,
            district=district,
            image=image_url,
            primary_org="upper",
            party=party,
        )

        person.add_link(link)
        person.add_source(link)

        if address:
            person.add_contact_detail(type="address",
                                      value=address,
                                      note="District Office")

        if phone:
            person.add_contact_detail(type="voice",
                                      value=clean_phone(phone),
                                      note="District Phone")

        # BUG FIX: the email contact was previously added unconditionally, so
        # a page without an Email field produced a contact detail with a None
        # value. Guard it like address and phone above.
        if email:
            person.add_contact_detail(type="email",
                                      value=email,
                                      note="District Email")

        yield person
def test_legislator_related_party():
    """pre_save on a Person with a party creates one party membership."""
    adams = Person("John Adams", party="Democratic-Republican")
    adams.pre_save("jurisdiction-id")

    # exactly one related object: the party membership
    assert len(adams._related) == 1
    membership = adams._related[0]
    assert membership.person_id == adams._id
    assert membership.role == "member"
    assert get_pseudo_id(membership.organization_id) == {
        "classification": "party",
        "name": "Democratic-Republican",
    }
def test_person_add_membership_name():
    """add_membership by name links via a pseudo-id org and keeps fields."""
    person = Person("Leonardo DiCaprio")
    person.add_membership("Academy of Motion Picture Arts and Sciences",
                          role="winner",
                          start_date="2016")
    membership = person._related[0]
    membership.validate()
    assert get_pseudo_id(membership.organization_id) == {
        "name": "Academy of Motion Picture Arts and Sciences"
    }
    assert membership.person_id == person._id
    assert membership.role == "winner"
    assert membership.start_date == "2016"
Exemplo n.º 20
0
def test_save_object_basics():
    """save_object writes one JSON dump under the expected filename."""
    scraper = Scraper(juris, "/tmp/")
    person = Person("Michael Jordan")
    person.add_source("http://example.com")

    with mock.patch("json.dump") as json_dump:
        scraper.save_object(person)

    # the output name is derived from the object's type and id
    expected_name = "person_{}.json".format(person._id)
    assert expected_name in scraper.output_names["person"]
    json_dump.assert_called_once_with(person.as_dict(), mock.ANY, cls=mock.ANY)
def test_legislator_related_district():
    """pre_save with a district creates one org membership with a post."""
    adams = Person("John Adams", district="1", primary_org="legislature")
    adams.pre_save("jurisdiction-id")

    assert len(adams._related) == 1
    membership = adams._related[0]
    assert membership.person_id == adams._id
    assert get_pseudo_id(membership.organization_id) == {
        "classification": "legislature"
    }
    assert get_pseudo_id(membership.post_id) == {
        "organization__classification": "legislature",
        "label": "1",
    }
Exemplo n.º 22
0
    def scrape_lower_legislator(self, url, leg_info):
        """Scrape one House member's detail page and yield a Person.

        `leg_info` supplies district ("dist"), office, and phone from the
        roster listing. Vacant seats are logged and yield nothing.
        """
        page = self.lxmlize(url)

        name = page.xpath(
            '//span[@id="body_FormView5_FULLNAMELabel"]/text()')[0].strip()
        # Vacant seats show a placeholder instead of a member name.
        if name.startswith("District ") or name.startswith("Vacant "):
            self.warning("Seat is vacant: {}".format(name))
            return

        photo = page.xpath(
            '//img[contains(@src, "/h_reps/RepPics")]')[0].attrib["src"]
        # Normalize the page's party label to the canonical spelling.
        party_flags = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent",
        }
        party_info = page.xpath(
            '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()'
        )[0].strip()
        party = party_flags[party_info]
        try:
            email = page.xpath(
                '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()'
            )[0].strip()
        except IndexError:
            email = None
        district = leg_info["dist"].replace("Dist", "").strip()

        person = Person(name=name,
                        party=party,
                        district=district,
                        primary_org="lower",
                        image=photo)

        # Only add contact details that are actually present.
        contacts = [
            (leg_info["office"], "address"),
            (leg_info["phone"], "voice"),
            (email, "email"),
        ]

        for value, key in contacts:
            if value:
                person.add_contact_detail(type=key,
                                          value=value,
                                          note="District Office")

        person.add_source(url)
        person.add_link(url)

        yield person
def test_person_add_membership_org():
    """add_membership with an Organization links by real id and keeps dates."""
    person = Person("Bob B. Bear")
    person.add_source("http://example.com")
    org = Organization("test org", classification="unknown")
    person.add_membership(org,
                          role="member",
                          start_date="2007",
                          end_date=datetime.date(2015, 5, 8))
    assert len(person._related) == 1
    membership = person._related[0]
    membership.validate()
    assert membership.person_id == person._id
    assert membership.organization_id == org._id
    assert membership.start_date == "2007"
    assert membership.end_date == datetime.date(2015, 5, 8)
Exemplo n.º 24
0
def table_row_to_legislator_and_profile_url(table_row_element, chamber):
    """Derive a Legislator from an HTML table row lxml Element, and a link to their profile.

    The row's six cells are, in order: role, name, district, party,
    phone, email. Returns a (Person, profile_url) tuple.
    """
    td_elements = table_row_element.xpath("td")
    (
        role_element,
        name_element,
        district_element,
        party_element,
        phone_element,
        email_element,
    ) = td_elements

    # Name comes in the form "Last, First"; flip it to "First Last".
    full_name = name_element.text_content().strip()
    if full_name.count(", ") == 1:
        full_name = " ".join(full_name.split(", ")[::-1]).strip()
    district = district_element.text_content().strip()
    party = party_element.text_content().strip()
    if party == "Democrat":
        party = "Democratic"
    elif party == "Unaffiliated":
        party = "Independent"

    # Capitol office address depends on the member's leadership role.
    role = role_element.text_content().strip()
    address = co_address_from_role(role)
    phone = phone_element.text_content().strip()
    email = email_element.text_content().strip()

    (profile_url, ) = name_element.xpath("a/@href")
    # (removed a leftover debug print of chamber/district/party)
    legislator = Person(primary_org=chamber,
                        name=full_name,
                        district=district,
                        party=party)
    legislator.add_contact_detail(type="address",
                                  value=address,
                                  note="Capitol Office")
    if phone:
        legislator.add_contact_detail(type="voice",
                                      value=phone,
                                      note="Capitol Office")
    if email:
        legislator.add_contact_detail(type="email",
                                      value=email,
                                      note="Capitol Office")

    return legislator, profile_url
Exemplo n.º 25
0
def test_save_related():
    """Saving an object must also persist every object in its _related list."""
    scraper = Scraper(juris, "/tmp/")

    person = Person("Michael Jordan")
    person.add_source("http://example.com")

    committee = Organization("Chicago Bulls", classification="committee")
    committee.add_source("http://example.com")

    person._related.append(committee)

    with mock.patch("json.dump") as json_dump:
        scraper.save_object(person)

    # Both the person and the related committee should have been dumped.
    expected_calls = [
        mock.call(person.as_dict(), mock.ANY, cls=mock.ANY),
        mock.call(committee.as_dict(), mock.ANY, cls=mock.ANY),
    ]
    assert json_dump.mock_calls == expected_calls
Exemplo n.º 26
0
    def scrape_chamber(self, chamber):
        """Yield a Person for every member on the chamber's roster page.

        Args:
            chamber: chamber key ("upper"/"lower"); indexes utils.urls.

        Yields:
            Person objects with district, party, email, offices and photo set.
        """
        leg_list_url = utils.urls["people"][chamber]
        page = self.get(leg_list_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(leg_list_url)

        # Email addresses are hidden away on a separate contacts page now,
        # at least for Senators.
        contact_url = utils.urls["contacts"][chamber]
        contact_page = self.get(contact_url).text
        contact_page = lxml.html.fromstring(contact_page)

        for link in page.xpath("//a[contains(@href, '_bio.cfm')]"):
            # Names render as "Last, First" -> convert to "First Last".
            full_name = " ".join(link.text.split(", ")[::-1]).strip()
            full_name = re.sub(r"\s+", " ", full_name)
            district = link.getparent().getnext().tail.strip()
            district = re.search(r"District (\d+)", district).group(1)

            # The party letter sits two characters from the end, e.g. "(R)".
            party = link.getparent().tail.strip()[-2]
            if party == "R":
                party = "Republican"
            elif party == "D":
                party = "Democratic"
            elif party == "I":
                party = "Independent"

            url = link.get("href")
            leg_id = url.split("?id=")[1]

            person = Person(name=full_name,
                            district=district,
                            party=party,
                            primary_org=chamber)
            person.add_link(leg_list_url)
            person.add_source(leg_list_url)

            # Scrape email, offices, photo from the member's detail page.
            # BUG FIX: previously this reassigned `page`, clobbering the
            # parsed roster page mid-loop; use a distinct name instead.
            detail_html = self.get(url).text
            doc = lxml.html.fromstring(detail_html)
            doc.make_links_absolute(url)

            email = self.scrape_email_address(contact_page, leg_id)
            self.scrape_offices(url, doc, person, email)
            self.scrape_photo_url(url, doc, person)

            yield person
def test_legislator_related_chamber_district_role():
    """Creating a Person with district/primary_org/role yields one membership."""
    leg = Person("John Adams", district="1", primary_org="lower", role="Speaker")
    leg.pre_save("jurisdiction-id")

    assert len(leg._related) == 1
    membership = leg._related[0]

    # The membership should point back at the person and at pseudo-ids for
    # the chamber organization and the district post.
    assert membership.person_id == leg._id

    expected_org = {"classification": "lower"}
    assert get_pseudo_id(membership.organization_id) == expected_org

    expected_post = {
        "organization__classification": "lower",
        "label": "1",
        "role": "Speaker",
    }
    assert get_pseudo_id(membership.post_id) == expected_post
    assert membership.role == "Speaker"
Exemplo n.º 28
0
    def scrape_chamber(self, session):
        """Yield a Person for each legislator the API returns for *session*.

        Args:
            session: session name; keys into SESSION_KEYS for the API's
                session identifier.

        Yields:
            Person objects with party, district, image and contact details.
        """
        session_key = SESSION_KEYS[session]
        # Fixed local-variable typo: "reponse" -> "response".
        legislators_response = self.api_client.get("legislators", session=session_key)

        for legislator in legislators_response:
            # The photo URL is derived from the member's website slug.
            url_name = legislator["WebSiteUrl"].split("/")[-1]
            chamber_name = "house" if legislator["Chamber"] == "H" else "senate"
            img = "https://www.oregonlegislature.gov/{}/MemberPhotos/{}.jpg".format(
                chamber_name, url_name
            )

            # Normalize the party label to the project's conventional name.
            party = legislator["Party"]
            if party == "Democrat":
                party = "Democratic"

            person = Person(
                name="{} {}".format(legislator["FirstName"], legislator["LastName"]),
                primary_org={"S": "upper", "H": "lower"}[legislator["Chamber"]],
                party=party,
                district=legislator["DistrictNumber"],
                image=img,
            )
            person.add_link(legislator["WebSiteUrl"])
            person.add_source(legislator["WebSiteUrl"])

            if legislator["CapitolAddress"]:
                person.add_contact_detail(
                    type="address",
                    value=legislator["CapitolAddress"],
                    note="Capitol Office",
                )

            if legislator["CapitolPhone"]:
                person.add_contact_detail(
                    type="voice",
                    value=legislator["CapitolPhone"],
                    note="Capitol Office",
                )

            person.add_contact_detail(
                type="email", value=legislator["EmailAddress"], note="Capitol Office"
            )

            yield person
Exemplo n.º 29
0
    def handle_list_item(self, item):
        """Build a Person from one member card on the roster page.

        Args:
            item: lxml Element for a member card containing an <img>, a
                linked <h5> name of the form "Name (01A, PARTY)", and a
                <div> with address/phone text plus a mailto link.

        Yields:
            A Person with district, party, image, link and contact details.
        """
        photo_url = item.xpath("./img/@src")[0]
        url = item.xpath(".//h5/a/@href")[0]
        name_text = item.xpath(".//h5/a/b/text()")[0]

        # Name text looks like "Jane Doe (01A, DFL)".
        name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
        name = name_match.group(1).strip()
        district = name_match.group(2).lstrip("0").upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [
            x.strip() for x in item.xpath("./div/text()[normalize-space()]")
            if x.strip()
        ]
        address = "\n".join((info_texts[0], info_texts[1]))

        # BUG FIX: phone/email were previously left unbound (NameError at
        # the add_contact_detail calls) whenever validation failed. Default
        # them to None and only record contact details that validated.
        phone = None
        phone_text = info_texts[2]
        if validate_phone_number(phone_text):
            phone = phone_text

        email = None
        email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
        if validate_email_address(email_text):
            email = email_text

        rep = Person(
            name=name,
            district=district,
            party=party,
            primary_org="lower",
            role="Representative",
            image=photo_url,
        )
        rep.add_link(url)
        rep.add_contact_detail(type="address", value=address, note="capitol")
        if phone:
            rep.add_contact_detail(type="voice", value=phone, note="capitol")
        if email:
            rep.add_contact_detail(type="email", value=email, note="capitol")
        rep.add_source(self.url)

        yield rep
Exemplo n.º 30
0
    def scrape_upper_chamber(self, term):
        """Yield a Person for each sitting Oklahoma state senator.

        Args:
            term: unused here; retained so existing callers keep working.

        Yields:
            Person objects with district, party, links and office details.
        """
        list_url = "http://oksenate.gov/Senators/Default.aspx"
        html = self.get(list_url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(list_url)

        for a in doc.xpath("//table[@summary]")[0].xpath(
                './/td//a[contains(@href, "biographies")]'):
            # The district number appears either in the link's tail text or
            # in a sibling <span>, depending on the row's markup.
            tail = a.xpath("..")[0].tail
            if tail:
                district = tail.split()[1]
            else:
                district = a.xpath("../../span")[1].text.split()[1]

            if a.text is None or a.text.strip() == "Vacant":
                self.warning(
                    "District {} appears to be empty".format(district))
                continue

            # Link text looks like "Jane Doe (R)". Flattened the redundant
            # else-after-continue into guard clauses.
            match = re.match(r"(.+) \(([A-Z])\)", a.text.strip())
            if not match:
                self.warning(
                    "District {} appears to have empty Representative name,party"
                    .format(district))
                continue
            name, party = match.group(1), self._parties[match.group(2)]

            # BUG FIX: previously this reassigned `url`, clobbering the
            # roster-page URL variable inside the loop; use a distinct name.
            bio_url = a.get("href")

            person = Person(primary_org="upper",
                            district=district,
                            name=name.strip(),
                            party=party)
            person.add_link(bio_url)
            person.add_source(bio_url)
            self.scrape_upper_offices(person, bio_url)
            yield person