Пример #1
0
    def scrape_member(self, chamber, link):
        """Build and yield one legislator from a link in the member listing.

        ``link`` is an anchor element: its text is the member's name, its
        ``href`` is the member's page, and the third and fourth cells of the
        row two levels up hold the district and the party.
        """
        member_name = link.text.strip()
        member_url = link.get('href')
        district = link.xpath("string(../../td[3])")
        listed_party = link.xpath("string(../../td[4])")

        # The e-mail address is no longer read from this row (formerly
        # td[5]); per the original author it comes from the next page.

        # Normalise the two known labels; any other value passes through.
        party = {
            'Democrat': 'Democratic',
            'No Party Specified': 'Independent',
        }.get(listed_party, listed_party)

        person_id = re.search(r"personID=(\d+)", link.attrib['href']).group(1)
        photo_template = ("https://www.legis.iowa.gov/photo"
                          "?action=getPhoto&ga=%s&pid=%s")
        photo_url = photo_template % (self.latest_session(), person_id)

        leg = Person(
            name=member_name,
            primary_org=chamber,
            district=district,
            party=party,
            image=photo_url,
        )
        leg.add_link(member_url)
        leg.add_source(member_url)

        # Fetch the member's own page and let scrape_member_page add to leg.
        response = self.get(link.attrib['href'])
        self.scrape_member_page(leg, lxml.html.fromstring(response.text))
        yield leg
Пример #2
0
    def scrape(self):
        """Yield the council Organization, then one Person per table row."""
        urls = Urls(dict(list=legislators_url), self)

        council = Organization(
            'Temecula City Council', classification='legislature')
        council.add_source(urls.list.url)
        yield council

        # The first row of the second table is skipped (presumably a header).
        member_rows = urls.list.xpath('//table[2]//tr')[1:]
        for row in member_rows:
            # Exactly two text nodes are expected: the name and the role.
            name, role = row.xpath('td/p[1]//font/text()')
            portrait = row.xpath('td/img/@src')[-1]

            person = Person(name, image=portrait)
            membership = person.add_membership(council, role=role)

            # Exactly two hrefs are expected: the e-mail link and the
            # detail page. The first seven characters of the e-mail href
            # are dropped (presumably the "mailto:" prefix).
            mailto_href, detail_url = row.xpath('td//a/@href')
            membership.contact_details.append(
                {'type': 'email', 'value': mailto_href[7:], 'note': 'work'})

            person.add_source(urls.list.url)
            person.add_source(detail_url)

            yield person
Пример #3
0
    def scrape_legislator(self, chamber, name, url):
        """Fetch the legislator page at ``url`` and yield a Person from it.

        District, party and photo are read from the page; ``name`` and
        ``chamber`` are passed through to the Person unchanged.
        """
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        # Second whitespace-separated token of the "DISTRICT" heading,
        # with leading zeros removed.
        district_heading = page.xpath(
            '//h1[contains(., "DISTRICT")]/text()')[-1]
        district = district_heading.split()[1].strip().lstrip('0')

        # The party letter follows an opening parenthesis in the last <h2>.
        party_heading = page.xpath('//h2')[-1].text_content()
        party_code = re.search(r'\((R|D|I)[ \-\]]', party_heading).group(1)
        party = {
            'D': 'Democratic',
            'R': 'Republican',
            'I': 'Independent',
        }[party_code]

        portrait = page.xpath("//img[contains(@src, 'images/members/')]")[0]
        photo_url = portrait.attrib['src']

        leg = Person(
            name,
            district=district,
            party=party,
            image=photo_url,
            primary_org=chamber,
        )
        leg.add_link(url)
        leg.add_source(url)
        self.scrape_offices(leg, page)

        yield leg
Пример #4
0
def test_committee_add_member_person():
    """add_member with a Person stores both ids and the role on the first related object."""
    committee = Organization('Defense', classification='committee')
    adams = Person('John Adams')

    committee.add_member(adams, role='chairman')

    membership = committee._related[0]
    assert membership.person_id == adams._id
    assert membership.organization_id == committee._id
    assert membership.role == 'chairman'
Пример #5
0
    def scrape_alderman(self, ward_num):
        """Return a Person for the alderman of ward ``ward_num``.

        The ward URL is derived from ``Urls.ALDERMEN_HOME``; the alderman's
        profile URL is resolved from it via ``self.alderman_url``.
        """
        ward_url = "{}/ward-{}".format(Urls.ALDERMEN_HOME, ward_num)
        alderman_url = self.alderman_url(ward_url)
        profile = self.lxmlize(alderman_url)

        # Per the original author, the name is the only <h1> on the page.
        heading_text = profile.xpath("//h1/text()")[0]

        # Per the original author, passing district/role/primary_org lets
        # pupa create the membership linking this person to a post in the
        # jurisdiction's "Board of Aldermen" organization.
        person = Person(
            name=HumanName.name_firstandlast(heading_text),
            district="Ward {} Alderman".format(ward_num),
            role="Alderman",
            primary_org="legislature",
        )

        person.image = profile.xpath("//div/img/@src")[0]

        # Second text node of the element that contains the "Phone:" label.
        phone_texts = profile.xpath("//strong[text()='Phone:']/../text()")
        person.add_contact_detail(type="voice", value=phone_texts[1].strip())

        person.add_source(alderman_url, note="profile")
        person.add_source(ward_url, note="ward")

        return person
    def handle_list_item(self, item):
        """Turn one senator link into a Person, or return None for a vacancy.

        ``item`` is a link element; district and party are read from the
        first and second cells of the row two levels up.
        """
        # Join every text node, collapse whitespace runs, and close up the
        # gap that joining can leave before a comma.
        joined = " ".join(item.xpath('.//text()'))
        name = re.sub(r'\s+', " ", joined).replace(" ,", ",").strip()
        if 'Vacant' in name:
            return None

        district = item.xpath("string(../../td[1])")
        listed_party = item.xpath("string(../../td[2])")
        party = 'Democratic' if listed_party == 'Democrat' else listed_party

        detail_url = item.get('href')
        senator = Person(
            name=name,
            district=district,
            party=party,
            primary_org='upper',
            role='Senator',
        )
        senator.add_link(detail_url)
        senator.add_source(self.url)
        senator.add_source(detail_url)

        # The detail page scraper receives the Person through ``obj``.
        self.scrape_page(SenDetail, detail_url, obj=senator)

        return senator
    def handle_list_item(self, item):
        """Turn one representative entry into a Person, or return None.

        Entries whose name contains 'Vacant', 'Resigned' or 'Pending' are
        skipped. A party code other than 'D' or 'R' raises KeyError.
        """
        anchor = item.xpath('.//div[contains(@class, "rep_style")]/a')[0]
        name = anchor.text_content().strip()

        if any(marker in name for marker in ('Vacant', 'Resigned', 'Pending')):
            return None

        party_code = item.xpath(
            './/div[contains(@class, "party_style")]/text()')[0].strip()
        party = {'D': 'Democratic', 'R': 'Republican'}[party_code]

        district = item.xpath(
            './/div[contains(@class, "district_style")]/text()')[0].strip()

        # The portrait URL is keyed by the MemberId query parameter of the
        # member's link.
        detail_url = anchor.get('href')
        query = parse.parse_qs(parse.urlsplit(detail_url).query)
        member_id = query['MemberId'][0]
        image = "http://www.flhouse.gov/FileStores/Web/Imaging/Member/{}.jpg".format(
            member_id)

        rep = Person(
            name=name,
            district=district,
            party=party,
            primary_org='lower',
            role='Representative',
            image=image,
        )
        rep.add_link(detail_url)
        rep.add_source(detail_url)
        rep.add_source(self.url)

        # The detail page scraper receives the Person through ``obj``.
        self.scrape_page(RepDetail, detail_url, obj=rep)

        return rep
Пример #8
0
def test_save_object_invalid():
    """save_object raises ValueError for a Person that has no source."""
    scraper = Scraper('jurisdiction', '/tmp/')
    # No source is added, so the object is expected not to validate.
    sourceless = Person('Michael Jordan')

    with pytest.raises(ValueError):
        scraper.save_object(sourceless)
Пример #9
0
    def scrape_upper_chamber(self, term):
        """Yield a Person for each occupied seat on the senators page.

        ``term`` is accepted but not used. Seats whose link has no text or
        reads 'Vacant' are logged with a warning and skipped.
        """
        url = "http://oksenate.gov/Senators/Default.aspx"
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        roster = doc.xpath('//table[@summary]')[0]
        for anchor in roster.xpath(
                './/td//a[contains(@href, "biographies")]'):
            # The district is the second token of the text trailing the
            # link's parent, or failing that of the row's second <span>.
            trailing = anchor.xpath('..')[0].tail
            district = (trailing.split()[1] if trailing
                        else anchor.xpath('../../span')[1].text.split()[1])

            label = anchor.text
            if label is None or label.strip() == 'Vacant':
                self.warning(
                    "District {} appears to be empty".format(district))
                continue

            # Link text has the form "Name (X)" with X a party letter.
            parsed = re.match(r'(.+) \(([A-Z])\)', label.strip())
            name = parsed.group(1)
            party = self._parties[parsed.group(2)]

            profile_url = anchor.get('href')

            person = Person(
                primary_org='upper',
                district=district,
                name=name.strip(),
                party=party,
            )
            person.add_link(profile_url)
            person.add_source(profile_url)
            self.scrape_upper_offices(person, profile_url)
            yield person
Пример #10
0
    def scrape_senator_page(self, chamber, url):
        """Yield a Person for each occupied seat listed on ``url``.

        Each portrait container on the listing supplies the name, photo,
        district and profile link; the profile page supplies phone, address
        and party. The e-mail address is derived from the district number.

        Raises:
            ValueError: if the party image on a profile page names neither
                "Republican" nor "Democrat". Previously this case silently
                reused the party of the preceding legislator (or failed
                with UnboundLocalError on the first one).
        """
        listing = lxml.html.fromstring(self.get(url).text)
        listing.make_links_absolute(url)

        for legislator in listing.xpath(
                "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
                "' portraitContainer ')]"):
            full_name = legislator.xpath(
                ".//div[@class='profileName']/a/text()")[0]
            # Skip vacancies before touching fields a vacant seat may lack.
            if "Vacant" in full_name:
                continue

            # The thumbnail URL sits between the parentheses of the inline
            # style value (e.g. a CSS url(...) expression).
            style = legislator.xpath(
                ".//div[@class='profileThumbnailBoundingBox']/@style")[0]
            img = style[style.find("(") + 1:style.find(")")]
            homepage_url = legislator.xpath(
                ".//a[@class='profileImageLink']")[0].attrib["href"]
            # The district number follows the '#' in the district link text.
            district = legislator.xpath(".//div[@class='profileDistrict']"
                                        "/a/text()")[0].split("#")[1]

            # A separate name is used for the profile document so the
            # listing document is never shadowed inside the loop.
            profile = lxml.html.fromstring(self.get(homepage_url).text)
            phone = profile.xpath("//div[@class='phone']/span/text()")[0]
            address = "\n".join(
                profile.xpath("//div[@class='address']/span/text()"))

            party_image = profile.xpath(
                '//div[@class="senatorParty"]/img/@src')[0]
            if "Republican" in party_image:
                party = "Republican"
            elif "Democrat" in party_image:
                party = "Democratic"
            else:
                raise ValueError(
                    "unrecognized party image {!r} for {}".format(
                        party_image, full_name))

            # Addresses follow a fixed pattern keyed by the zero-padded,
            # two-digit district number.
            email = ("rep{0:0{width}}@ohiohouse.gov" if chamber == "lower" else
                     "sd{0:0{width}}@ohiosenate.gov").format(int(district),
                                                             width=2)

            leg = Person(
                name=full_name,
                district=district,
                primary_org=chamber,
                image=img,
                party=party,
            )

            leg.add_contact_detail(type="address",
                                   value=address,
                                   note="Capitol Office")
            leg.add_contact_detail(type="voice",
                                   value=phone,
                                   note="Capitol Office")
            leg.add_contact_detail(type="email",
                                   value=email,
                                   note="Capitol Office")

            leg.add_source(url)
            leg.add_link(homepage_url)
            yield leg
Пример #11
0
    def scrape(self):
        """Yield a Person for every member row of both chamber listings.

        The lower-chamber listing is scraped first, then the upper-chamber
        listing. Each member is sourced to the listing page it was actually
        read from; previously lower-chamber members were sourced to the
        upper-chamber URL.
        """
        # lower
        url = 'http://164.100.47.194/Loksabha/Members/AlphabeticalList.aspx'
        page = lxml.html.fromstring(self.get(url).content)
        page.make_links_absolute(url)

        for tr in page.xpath(
                '//table[contains(@class,"member_list_table")]/tr'):
            name = tr.xpath('td[2]/a[1]/@title')[0]
            photo_url = tr.xpath('td[2]/a[1]/img/@src')[0]
            party = tr.xpath('td[3]/text()')[0].strip()
            state = tr.xpath('td[4]/text()')[0].strip()
            member = Person(name=name,
                            role="member",
                            primary_org="lower",
                            party=party,
                            image=photo_url,
                            district=state)

            member.add_source(url)
            yield member

        # upper
        url = 'http://164.100.47.5/Newmembers/memberlist.aspx'
        page = lxml.html.fromstring(self.get(url).content)
        page.make_links_absolute(url)

        # The first row of the grid is skipped (presumably a header row).
        for tr in page.xpath(
                '//table[@id="ContentPlaceHolder1_GridView2"]/tr')[1:]:
            name = tr.xpath('td[2]/font/a/text()')[0]
            party_abbr = tr.xpath('td[3]/font/text()')[0].strip()
            state = tr.xpath('td[4]/font/text()')[0].strip()

            member = Person(name=name,
                            role="member",
                            primary_org="upper",
                            party=party_abbr,
                            district=state)

            member.add_source(url)
            yield member
Пример #12
0
    def scrape_chamber(self, chamber):
        """Yield a Person for each living member listed for ``chamber``.

        Also (re)sets ``self._party_map``, which translates the site's
        party labels. Members flagged as deceased are logged and skipped.
        """
        self._party_map = {
            'Democrat': 'Democratic',
            'Republican': 'Republican',
            'Non Affiliated': 'Independent',
            'Not Affiliated': 'Independent',
        }

        url = ('http://senate.legis.state.ak.us/' if chamber == 'upper'
               else 'http://house.legis.state.ak.us/')
        page = self.lxmlize(url)

        # Members are the children of the second <ul class="item">.
        member_list = page.xpath('//ul[@class="item"]')[1]

        for entry in member_list.getchildren():
            photo_url = entry.xpath('.//img/@src')[0]
            name = entry.xpath('.//strong/text()')[0]
            leg_url = entry.xpath('.//a/@href')[0]

            # When no "Email Me" link exists, ``email`` stays the empty
            # xpath result and is passed on as such.
            email = entry.xpath('.//a[text()="Email Me"]/@href')
            if not email:
                self.warning('no email for ' + name)
            else:
                email = email[0].replace('mailto:', '')

            party = None
            district = None
            deceased = False

            # Each <dt> label is paired with its first following <dd>.
            for term in entry.xpath('.//dt'):
                value = term.xpath('following-sibling::dd')[0].text_content()
                label = term.text.strip()
                if label == 'Party:':
                    party = value
                elif label == 'District:':
                    district = value
                elif label.startswith('Deceased'):
                    deceased = True
                    self.warning('skipping deceased ' + name)
                    break

            if deceased:
                continue

            person = Person(
                primary_org=chamber,
                district=district,
                name=name,
                party=self._party_map[party],
                image=photo_url,
            )
            person.add_source(leg_url)
            person.add_link(leg_url)

            # Office details are filled in from the member's own page.
            self._scrape_offices(person, leg_url, email)

            yield person
Пример #13
0
    def scrape_upper_chamber(self, term):
        """Yield a Person for each senator linked from the senators list.

        ``term`` is accepted but not used. Every senator page is fetched
        for the name, party, photo, district, phone and e-mail.

        Raises:
            ValueError: if the second profile link is neither
                "Senador por Distrito" nor "Senador por Acumulación".
                Previously this case silently reused the district of the
                preceding senator (or failed with UnboundLocalError on the
                first one).
        """
        url = 'https://senado.pr.gov/Pages/Senadores.aspx'

        doc = self.lxmlize(url)
        links = self.get_nodes(doc, '//ul[@class="senadores-list"]/li/a/@href')

        for link in links:
            senator_page = self.lxmlize(link)
            profile_links = self.get_nodes(senator_page,
                                           '//ul[@class="profiles-links"]/li')

            name_text = self.get_node(
                senator_page, '//span[@class="name"]').text_content().strip()
            # Drop a leading "Hon." and convert to title case, as some
            # names are in all-caps.
            name = re.sub(r'^Hon\.', '', name_text,
                          flags=re.IGNORECASE).strip().title()

            party = profile_links[0].text_content().strip()
            # Only the "independent" label is translated to English.
            if party == "Independiente":
                party = "Independent"

            photo_url = self.get_node(senator_page,
                                      '//div[@class="avatar"]//img/@src')

            seat_type = profile_links[1].text_content().strip()
            if seat_type == "Senador por Distrito":
                district_text = self.get_node(
                    senator_page,
                    '//div[@class="module-distrito"]//span[@class="headline"]'
                ).text_content()
                # Remove the first "DISTRITO" and any zero-width spaces.
                district = district_text.replace('DISTRITO', '',
                                                 1).replace('\u200b',
                                                            '').strip()
            elif seat_type == "Senador por Acumulación":
                district = "At-Large"
            else:
                raise ValueError(
                    "unrecognized seat type {!r} for {}".format(
                        seat_type, name))

            phone_node = self.get_node(senator_page,
                                       '//a[@class="contact-data tel"]')
            phone = phone_node.text_content().strip()
            email_node = self.get_node(senator_page,
                                       '//a[@class="contact-data email"]')
            email = email_node.text_content().replace('\u200b', '').strip()

            person = Person(primary_org='upper',
                            district=district,
                            name=name,
                            party=party,
                            image=photo_url)
            person.add_contact_detail(type='email',
                                      value=email,
                                      note='Capitol Office')
            person.add_contact_detail(type='voice',
                                      value=phone,
                                      note='Capitol Office')
            person.add_link(link)
            person.add_source(link)

            yield person
Пример #14
0
def test_person_add_party():
    """add_party attaches a valid related object pointing at a party pseudo-id."""
    groot = Person('Groot')
    groot.add_party('Green')

    membership = groot._related[0]
    membership.validate()

    expected = {'name': 'Green', 'classification': 'party'}
    assert get_pseudo_id(membership.organization_id) == expected
Пример #15
0
    def scrape_chamber(self, chamber):
        """Yield a Person for each legislator the API lists for ``chamber``.

        Names, party and link come from the API; address, phone and
        district come from each legislator's HTML page. Legislators whose
        page cannot be fetched, or whose page has no district heading, are
        skipped with a warning.
        """
        client = ApiClient(self)
        session = self.latest_session()
        base_url = "http://iga.in.gov/legislative"
        api_base_url = "https://api.iga.in.gov"
        chamber_name = {"upper": "senate"}.get(chamber, "house")

        response = client.get("chamber_legislators",
                              session=session,
                              chamber=chamber_name)
        for record in client.unpaginate(response):
            firstname = record["firstName"]
            lastname = record["lastName"]
            party = record["party"]
            link = record["link"]

            api_link = api_base_url + link
            html_link = base_url + link.replace("legislators/",
                                                "legislators/legislator_")
            image_link = base_url + link.replace("legislators/",
                                                 "portraits/legislator_")

            # NOTE(review): certificate verification is disabled for this
            # request via the kwargs passed through to the fetch helper.
            try:
                html = get_with_increasing_timeout(self,
                                                   html_link,
                                                   fail=True,
                                                   kwargs={"verify": False})
            except scrapelib.HTTPError:
                self.logger.warning("Legislator's page is not available.")
                continue

            doc = lxml.html.fromstring(html.text)
            doc.make_links_absolute(html_link)

            # Exactly two <address> elements are expected: the postal
            # address followed by the phone number.
            address_node, phone_node = doc.xpath("//address")
            address = "\n".join(
                line.strip()
                for line in address_node.text_content().strip().split("\n"))
            phone = phone_node.text_content().strip()

            headings = doc.xpath("//span[@class='district-heading']")
            if not headings:
                self.warning("skipping legislator w/o district")
                continue
            district = headings[0].text.lower().replace("district", "").strip()

            legislator = Person(
                primary_org=chamber,
                district=district,
                name=" ".join([firstname, lastname]),
                party=party,
                image=image_link,
            )
            legislator.add_contact_detail(type="address",
                                          note="Capitol Office",
                                          value=address)
            legislator.add_contact_detail(type="voice",
                                          note="Capitol Office",
                                          value=phone)
            legislator.add_link(html_link)
            legislator.add_source(html_link)
            legislator.add_source(api_link)

            yield legislator
Пример #16
0
    def scrape_chamber(self, chamber):
        """Yield a Person for each listed legislator of ``chamber``.

        The listing URL is built from ``BASE_URL`` and ``CHAMBERS``. Rows
        whose text contains 'Resigned' or 'Substitute' are skipped.
        Address, phones and fax are extracted by the module's
        ``get_address`` / ``get_phones`` / ``get_fax`` helpers, and only
        the details that are present are attached.
        """
        url = BASE_URL % CHAMBERS[chamber].lower()
        # NOTE(review): certificate verification is disabled for this fetch.
        index = self.get(url, verify=False).text
        html = lxml.html.fromstring(index)
        html.make_links_absolute(url)

        rows = html.xpath('//div[contains(@class, "row-equal-height")]')

        for row in rows:
            img_url = row.xpath('.//img/@src')[0]

            # The second inner wrapper of the row holds the member's text.
            inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
            inner_text = inner.text_content()
            if 'Resigned' in inner_text or 'Substitute' in inner_text:
                continue

            # The first <strong> holds the name; the text right after it
            # is the party label looked up in PARTY.
            strong = inner.xpath('p/strong')[0]
            name = strong.text.replace('\xa0', ' ').strip()
            # Raw string: '\s' in a plain literal is an invalid escape.
            name = re.sub(r'\s+', ' ', name)
            party = PARTY[strong.tail.strip()]
            email = inner.xpath('p/strong/a')[0].text
            district = inner.xpath('p/a')[0].text.replace('District ', '')

            person_url = inner.xpath('p/a/@href')[0]
            # Roles are not scraped for now, so an empty role is passed.
            role = ''

            person = Person(name=name, district=district,
                            party=party, primary_org=chamber,
                            image=img_url, role=role)

            phones = get_phones(inner)
            # The district phone prefers the home number over the business one.
            phone = phones.get('home') or phones.get('business')
            office_phone = phones.get('office')
            address = get_address(inner)
            fax = get_fax(inner)

            if address:
                person.add_contact_detail(type='address', value=address,
                                          note='District Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone,
                                          note='District Office')
            if fax:
                person.add_contact_detail(type='fax', value=fax,
                                          note='District Office')
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='District Office')
            if office_phone:
                person.add_contact_detail(type='voice', value=office_phone,
                                          note='Capitol Office')
            person.add_source(url)
            person.add_link(person_url)
            yield person
Пример #17
0
def test_basic_invalid_person():
    """A Person that validates stops validating once its name is cleared."""
    person = Person("Bob B. Johnson")
    person.add_source(url='foo')
    # With a name and a source the object validates.
    person.validate()

    person.name = None

    with pytest.raises(ValidationError):
        person.validate()
Пример #18
0
    def scrape_lower(self, chamber):
        """Yield a Person for each occupied seat in the representative table.

        Rows without <td> cells are ignored, and rows whose name is
        'Vacant' or just 'District N' are logged and skipped. ``chamber``
        is accepted but the Person is always created with
        ``primary_org='lower'``.
        """
        url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
        # Column names, in the order the cells appear in each row.
        columns = (
            "website",
            "district",
            "name",
            "party",
            "location",
            "phone",
            "email",
        )

        doc = lxml.html.fromstring(self.get(url).text)

        for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
            cells = row.xpath('.//td')
            if not cells:
                # Rows with no data cells are ignored.
                continue

            by_column = {column: cells[idx]
                         for idx, column in enumerate(columns)}
            text = {column: cell.text_content().strip()
                    for column, cell in by_column.items()}

            # Round-tripping through int normalises the district number.
            district = str(int(text['district']))
            party = text['party']
            phone = text['phone']
            email = text['email']
            leg_url = by_column['website'].xpath("./a")[0].attrib['href']
            name = text['name']

            if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
                self.warning('District {} appears vacant, and will be skipped'.format(district))
                continue

            # Expand the building abbreviations into full postal addresses.
            office = text['location']
            for pattern, expansion in (
                (' HOB',
                 ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933'),
                (' CB',
                 ' State Capitol Building\nLansing, MI 48909'),
            ):
                office = re.sub(pattern, expansion, office)

            photo_url = self.get_photo_url(leg_url)
            image = photo_url[0] if photo_url else None
            person = Person(name=name, district=district, party=abbr[party],
                            primary_org='lower', image=image)

            person.add_link(leg_url)
            person.add_source(leg_url)

            for kind, value in (('address', office),
                                ('voice', phone),
                                ('email', email)):
                person.add_contact_detail(type=kind, value=value, note='Capitol Office')

            yield person
Пример #19
0
    def get_member(self, session, chamber, kpid):
        """Yield a Person for member ``kpid`` built from the API record.

        The member's data comes from the ``content`` key of the JSON
        returned by the members API. The bio page on kslegislature.org is
        fetched for the profile picture; if that page is unavailable a
        warning is logged, the image is left empty, and no link is added
        (previously an empty-string link was recorded).

        Raises:
            KeyError: if ``session`` is not one of the sessions in the
                slug table below.
            ValueError: if the bio page does not contain exactly one
                profile picture.
        """
        url = "%smembers/%s" % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)["content"]

        party = content["PARTY"]
        if party == "Democrat":
            party = "Democratic"

        # Maps a session name to the path segment of the legislature site.
        slug = {
            "2013-2014": "b2013_14",
            "2015-2016": "b2015_16",
            "2017-2018": "b2017_18",
            "2019-2020": "b2019_20",
        }[session]
        leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (slug,
                                                                      kpid)

        try:
            legislator_page = self.lxmlize(leg_url)
        except scrapelib.HTTPError:
            self.warning("{}'s legislator bio page not found".format(
                content["FULLNAME"]))
            leg_url = ""
            photo_url = ""
        else:
            # Exactly one profile picture is expected on the page.
            (photo_url,
             ) = legislator_page.xpath('//img[@class="profile-picture"]/@src')

        person = Person(
            name=content["FULLNAME"],
            district=str(content["DISTRICT"]),
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.extras = {"occupation": content["OCCUPATION"]}

        address = "\n".join([
            "Room {}".format(content["OFFICENUM"]),
            "Kansas State Capitol Building",
            "300 SW 10th St.",
            "Topeka, KS 66612",
        ])

        note = "Capitol Office"
        person.add_contact_detail(type="address", value=address, note=note)
        person.add_contact_detail(type="email",
                                  value=content["EMAIL"],
                                  note=note)
        if content["OFFPH"]:
            person.add_contact_detail(type="voice",
                                      value=content["OFFPH"],
                                      note=note)

        person.add_source(url)
        # Only link to the bio page when it was actually reachable.
        if leg_url:
            person.add_link(leg_url)

        yield person
Пример #20
0
def test_basic_invalid_person():
    """A Person that validates raises ScrapeValueError once its name is cleared."""
    person = Person("Bob B. Johnson")
    person.add_source(url='http://example.com')
    # With a name and a source the object validates.
    person.validate()

    person.name = None

    with pytest.raises(ScrapeValueError):
        person.validate()
Пример #21
0
def test_person_add_term():
    """add_term attaches a valid related object with the org pseudo-id and dates."""
    eternal = Person('Eternal')
    eternal.add_term('eternal', 'council', start_date='0001', end_date='9999')

    term = eternal._related[0]
    term.validate()

    expected_org = {'classification': 'council'}
    assert get_pseudo_id(term.organization_id) == expected_org
    assert term.start_date == '0001'
    assert term.end_date == '9999'
Пример #22
0
def test_legislator_related_chamber_district():
    """pre_save() turns district + primary_org into exactly one related membership."""
    legislator = Person('John Adams', district='1', primary_org='upper')
    legislator.pre_save('jurisdiction-id')

    assert len(legislator._related) == 1

    membership = legislator._related[0]
    assert membership.person_id == legislator._id

    # Organization and post are referenced through pseudo ids, not real ids.
    expected_org = {'classification': 'upper'}
    expected_post = {"organization__classification": "upper", "label": "1"}
    assert get_pseudo_id(membership.organization_id) == expected_org
    assert get_pseudo_id(membership.post_id) == expected_post
Пример #23
0
    def handle_list_item(self, row):
        """Build a Person for one senator row of the roster listing.

        Args:
            row: mapping of column name to value for one senator. It is
                mutated in place: 'Zipcode' is stripped and 'Address' /
                'Address2' are normalised from the alternative column names.

        Returns:
            The populated Person, or None when the row has no first name.
        """
        if not row['First Name']:
            return
        name = '{} {}'.format(row['First Name'], row['Last Name'])
        party = PARTIES[row['Party']]
        info = self.extra_info[name]

        leg = Person(name=name,
                     district=row['District'].lstrip('0'),
                     party=party,
                     primary_org='upper',
                     role='Senator',
                     image=info['image'])
        leg.add_link(info['url'])
        leg.add_contact_detail(type='voice',
                               value=info['office_phone'],
                               note='capitol')
        if 'email' in info:
            leg.add_contact_detail(type='email',
                                   value=info['email'],
                                   note='capitol')

        row['Zipcode'] = row['Zipcode'].strip()
        # Accommodate for multiple address column naming conventions.
        address1_fields = [row.get('Address'), row.get('Office Building')]
        address2_fields = [row.get('Address2'), row.get('Office Address')]
        row['Address'] = next((a for a in address1_fields if a is not None),
                              False)
        row['Address2'] = next((a for a in address2_fields if a is not None),
                               False)

        # Street fragments that mark an address as a capitol office.
        capitol_markers = ('95 University Avenue W',
                           '100 Rev. Dr. Martin Luther King')
        # 'Address2' is False when neither source column exists; fall back to
        # an empty string so the substring test below cannot raise TypeError.
        address2 = row['Address2'] or ''

        # any() is required here: a bare generator expression is always
        # truthy, which previously sent every row down the capitol branch.
        if any(marker in address2 for marker in capitol_markers):
            address = (
                '{Address}\n{Address2}\n{City}, {State} {Zipcode}'.format(
                    **row))
            if 'Rm. Number' in row:
                address = '{0} {1}'.format(row['Rm. Number'], address)
            note = 'capitol'
        elif row['Address2']:
            address = (
                '{Address}\n{Address2}\n{City}, {State} {Zipcode}'.format(
                    **row))
            note = 'district'
        else:
            address = '{Address}\n{City}, {State} {Zipcode}'.format(**row)
            note = 'district'

        leg.add_contact_detail(type='address', value=address, note=note)

        leg.add_source(self.url)
        leg.add_source(self._html_url)

        return leg
Пример #24
0
    def scrape_member_page(self, chamber, url):
        """Yield a Person for every non-vacant member module on a roster page.

        Args:
            chamber: chamber identifier; 'lower' selects the ohiohouse.gov
                email pattern, any other value the ohiosenate.gov one.
            url: address of the roster page; also recorded as the source of
                each yielded person.

        Yields:
            Person objects carrying photo, party, district and 'Capitol
            Office' contact details, after self.scrape_homepage() has been
            called on them.
        """
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        party_names = {"R": "Republican", "D": "Democratic"}
        if chamber == 'lower':
            email_template = 'rep{0:0{width}}@ohiohouse.gov'
        else:
            email_template = 'sd{0:0{width}}@ohiosenate.gov'

        # Match "memberModule" as a whole class token, not as a substring.
        modules = doc.xpath(
            "//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' memberModule ')]"
        )

        for module in modules:
            thumbnail = module.xpath(".//div[@class='thumbnail']//img")[0]
            photo = thumbnail.attrib['src']
            info = module.xpath(".//div[@class='data']")[0]
            name_link = info.xpath(".//a[@class='black']")[0]
            raw_name = name_link.text_content()

            # Modules whose name contains "Vacant" are skipped entirely.
            if "Vacant" in raw_name:
                continue

            member_url = name_link.attrib['href']
            letter_node = info.xpath(".//span[@class='partyLetter']")[0]
            party = party_names[letter_node.text_content()]

            # The last bare text node is used as the phone number; everything
            # before it is joined into the office address.
            text_nodes = info.xpath("child::text()")
            phone = text_nodes.pop()
            office = "\n".join(text_nodes)

            headings = info.xpath("./h3")
            if headings:
                after_break = headings[0].xpath("./br")[0].tail
                district = after_break.replace("District", "").strip()
            else:
                # No heading: use the number of the last "<digits>.png"
                # mentioned in the module's inline style.
                png_names = re.findall(r"\d+\.png", module.attrib['style'])
                district = png_names[-1].split(".", 1)[0]

            full_name = re.sub(r"\s+", " ", raw_name).strip()
            # District number is zero-padded to two digits in the address.
            email = email_template.format(int(district), width=2)

            leg = Person(name=full_name, district=district,
                         party=party, primary_org=chamber,
                         image=photo)

            capitol = 'Capitol Office'
            leg.add_contact_detail(type='address', value=office, note=capitol)
            leg.add_contact_detail(type='voice', value=phone, note=capitol)
            leg.add_contact_detail(type='email', value=email, note=capitol)

            self.scrape_homepage(leg, chamber, member_url)

            leg.add_source(url)
            leg.add_link(member_url)
            yield leg
Пример #25
0
    def get_member(self, session, chamber, kpid):
        """Yield a Person built from the API record of one member.

        Args:
            session: session key; must be one of the keys of the slug table
                below, otherwise KeyError is raised.
            chamber: value passed through as the person's primary_org.
            kpid: member identifier used in both the API and bio page URLs.

        Yields:
            A single Person with occupation, 'Capitol Office' contact
            details, the API URL as source and the bio page URL as link.
        """
        api_url = '%smembers/%s' % (ksapi.url, kpid)
        member = json.loads(self.get(api_url).text)['content']

        raw_party = member['PARTY']
        party = 'Democratic' if raw_party == 'Democrat' else raw_party

        session_slugs = {
            '2013-2014': 'b2013_14',
            '2015-2016': 'b2015_16',
            '2017-2018': 'b2017_18'
        }
        bio_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (
            session_slugs[session], kpid)

        try:
            bio_page = self.lxmlize(bio_url)
            photos = bio_page.xpath('//img[@class="profile-picture"]/@src')
            # Exactly one profile picture is expected; anything else raises.
            (photo_url,) = photos
        except scrapelib.HTTPError:
            self.warning("{}'s legislator bio page not found".format(
                member['FULLNAME']))
            # Without a bio page there is neither a link nor a photo.
            bio_url = ''
            photo_url = ''

        person = Person(
            name=member['FULLNAME'],
            district=str(member['DISTRICT']),
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.extras = {'occupation': member['OCCUPATION']}

        address_lines = [
            'Room {}'.format(member['OFFICENUM']),
            'Kansas State Capitol Building',
            '300 SW 10th St.',
            'Topeka, KS 66612',
        ]
        office = 'Capitol Office'
        person.add_contact_detail(type='address',
                                  value='\n'.join(address_lines),
                                  note=office)
        person.add_contact_detail(type='email',
                                  value=member['EMAIL'],
                                  note=office)
        # The phone number is optional in the API record.
        office_phone = member['OFFPH']
        if office_phone:
            person.add_contact_detail(type='voice',
                                      value=office_phone,
                                      note=office)

        person.add_source(api_url)
        person.add_link(bio_url)

        yield person
Пример #26
0
    def _scrape_legislator(self, row, chamber):
        """Yield a Person for one roster table row.

        Args:
            row: element for the roster row; its rosterCell columns supply
                name, party, seat, email and phone.
            chamber: key into self._chamber_map; also used as primary_org.

        Yields:
            A single Person with the address taken from the member's details
            page, plus phone and email when the row provides them.
        """
        name_cell = row.xpath('./td[@class="rosterCell nameCell"]/a')[0]
        # Collapse the multi-line cell text into one space-separated name.
        name_parts = (
            piece.strip() for piece in name_cell.text_content().split("\n")
        )
        name = " ".join(piece for piece in name_parts if piece)

        party_letter = row.xpath(
            './td[@class="rosterCell partyCell"]/text()')[0].strip()
        party = {"D": "Democratic", "R": "Republican"}[party_letter]

        chamber_abbr = self._chamber_map[chamber]
        seat_text = row.xpath('./td[@class="rosterCell seatCell"]/text()')[0]
        # Dropping the chamber abbreviation leaves the district label.
        district = seat_text.replace(chamber_abbr, "").strip()

        mailto_links = row.xpath('./td[@class="rosterCell emailCell"]/a/@href')
        if mailto_links:
            email = mailto_links[0].replace("mailto:", "").strip()
        else:
            email = None

        phone_text = row.xpath('./td[@class="rosterCell phoneCell"]/text()')[0]
        # An empty phone cell is normalised to None.
        phone = phone_text.strip() or None

        details_url = "https://leg.mt.gov{}".format(name_cell.attrib["href"])
        details_page = lxml.html.fromstring(self.get(details_url).text)

        address_block = details_page.xpath(
            '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]'
            '/p[contains(text(), "Address")]')[0]
        raw_lines = address_block.text_content().replace("Address", "")
        stripped_lines = (piece.strip() for piece in raw_lines.split("\n"))
        address = "\n".join(piece for piece in stripped_lines if piece)

        legislator = Person(name=name,
                            district=district,
                            party=party,
                            primary_org=chamber)

        legislator.add_contact_detail(type="address",
                                      value=address,
                                      note="Capitol Office")
        if phone is not None:
            legislator.add_contact_detail(type="voice",
                                          value=phone,
                                          note="Capitol Office")
        if email is not None:
            legislator.add_contact_detail(type="email",
                                          value=email,
                                          note="E-mail")

        legislator.add_link(details_url)
        legislator.add_source(self._roster_url)

        yield legislator
Пример #27
0
def test_person_add_membership_name():
    """add_membership() with an organization name references the org by name."""
    org_name = 'Academy of Motion Picture Arts and Sciences'
    person = Person('Leonardo DiCaprio')
    person.add_membership(org_name, role='winner', start_date='2016')

    membership = person._related[0]
    membership.validate()

    # A name-only organization is referenced through a pseudo id.
    assert get_pseudo_id(membership.organization_id) == {'name': org_name}
    assert membership.person_id == person._id
    assert membership.role == 'winner'
    assert membership.start_date == '2016'
Пример #28
0
def test_legislator_related_party():
    """pre_save() turns the party argument into exactly one party membership."""
    legislator = Person('John Adams', party='Democratic-Republican')
    legislator.pre_save('jurisdiction-id')

    # Only the party membership should have been generated.
    assert len(legislator._related) == 1

    membership = legislator._related[0]
    expected_org = {
        'classification': 'party',
        'name': 'Democratic-Republican',
    }
    assert membership.person_id == legislator._id
    assert get_pseudo_id(membership.organization_id) == expected_org
    assert membership.role == 'member'
Пример #29
0
def test_person_add_membership():
    """add_membership() with an Organization object links person and org by id."""
    person = Person('Bob B. Bear')
    person.add_source('http://example.com')
    org = Organization('test org')
    person.add_membership(org, 'member', start_date='2007')

    assert len(person._related) == 1

    membership = person._related[0]
    membership.validate()
    # With a real Organization the membership stores its actual id.
    assert membership.person_id == person._id
    assert membership.organization_id == org._id
    assert membership.start_date == '2007'
Пример #30
0
    def parse_senate(self, div, chamber):
        """Build a Person from one senator's listing block.

        Args:
            div: element holding the senator's heading, district, photo,
                link and office paragraphs.
            chamber: value used as primary_org and passed to
                self._construct_email().

        Returns:
            The populated Person, or None (after a warning) when the heading
            does not end in ' (R)' or ' (D)'.
        """
        heading = div.xpath('.//h3/text()')[0]
        party_by_suffix = {' (R)': 'Republican', ' (D)': 'Democratic'}
        party = next(
            (label for suffix, label in party_by_suffix.items()
             if heading.endswith(suffix)),
            None,
        )
        if party is None:
            self.warning('skipping ' + heading)
            return None
        # Everything before the first " (" is the senator's name.
        name = heading.split(' (')[0]

        district_text = div.xpath(
            './/div[contains(@class, "senator-district")]/div/text()'
        )[0]
        district = district_text.strip().lstrip('0')
        photo_url = div.xpath('.//img/@src')[0]

        person = Person(
            name=name,
            party=party,
            district=district,
            primary_org=chamber,
            image=photo_url,
        )

        person.add_link(div.xpath('.//a/@href')[0])

        # CA senators have working emails, but they're not putting them on
        # their public pages anymore
        person.add_contact_detail(
            type='email',
            value=self._construct_email(chamber, name),
            note='Senate Office',
        )

        office_path = './/div[contains(@class, "{}")]//p'

        # Capitol office paragraphs are expected as "<address>; <phone>".
        capitol_xpath = office_path.format(
            'views-field-field-senator-capitol-office')
        for paragraph in div.xpath(capitol_xpath):
            street, phone = paragraph.text_content().split('; ')
            person.add_contact_detail(
                type='address', value=street.strip(), note='Senate Office')
            person.add_contact_detail(
                type='voice', value=phone.strip(), note='Senate Office')

        # District offices are numbered per paragraph, starting at 1.
        district_xpath = office_path.format(
            'views-field-field-senator-district-office')
        for number, paragraph in enumerate(div.xpath(district_xpath), 1):
            note = 'District Office #{}'.format(number)
            for entry in paragraph.text_content().strip().splitlines():
                try:
                    entry, phone = (
                        entry.strip().replace(u'\xa0', ' ').split('; '))
                    person.add_contact_detail(
                        type='address', value=entry.strip(), note=note)
                    person.add_contact_detail(
                        type='voice', value=phone.strip(), note=note)
                except ValueError:
                    # Line is not "<address>; <phone>": keep it as an
                    # address-only entry.
                    entry = entry.strip().replace(u'\xa0', ' ')
                    person.add_contact_detail(
                        type='address', value=entry.strip(), note=note)

        return person