Example #1
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod

        base_url = ("http://www.arkleg.state.ar.us/assembly/2011/2011R/"
                    "Pages/Committees.aspx?committeetype=")

        for chamber, url_ext in COMM_TYPES.iteritems():
            chamber_url = urlescape(base_url + url_ext)
            with self.urlopen(chamber_url) as page:
                page = lxml.html.fromstring(page)

                for a in page.xpath('//td[@class="dxtl dxtl__B0"]/a'):
                    if a.attrib.get('colspan') == '2':
                        # colspan=2 signals a subcommittee, but it's easier
                        # to pick those up from links on the committee page,
                        # so we do that in scrape_committee() and skip
                        # it here
                        continue

                    name = re.sub(r'\s*-\s*(SENATE|HOUSE)$', '',
                                  a.text).strip()

                    comm_url = urlescape(a.attrib['href'])
                    if chamber == 'task_force':
                        chamber = 'joint'
                    self.scrape_committee(chamber, name, comm_url)
Example #2
    def scrape(self, chamber, term):

        # Get start year of term.
        for termdict in self.metadata['terms']:
            if termdict['name'] == term:
                break
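        # After the break, termdict is bound to the matching term's metadata.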
        start_year = termdict['start_year']

        base_url = ("http://www.arkleg.state.ar.us/assembly/%s/%sR/"
                    "Pages/Committees.aspx?committeetype=")
        base_url = base_url % (start_year, start_year)

        for chamber, url_ext in COMM_TYPES.iteritems():
            chamber_url = urlescape(base_url + url_ext)
            page = self.urlopen(chamber_url)
            page = lxml.html.fromstring(page)

            for a in page.xpath('//td[@class="dxtl dxtl__B0"]/a'):
                if a.attrib.get('colspan') == '2':
                    # colspan=2 signals a subcommittee, but it's easier
                    # to pick those up from links on the committee page,
                    # so we do that in scrape_committee() and skip
                    # it here
                    continue

                name = re.sub(r'\s*-\s*(SENATE|HOUSE)$', '', a.text).strip()

                comm_url = urlescape(a.attrib['href'])
                if chamber == 'task_force':
                    chamber = 'joint'
                self.scrape_committee(chamber, name, comm_url)
Example #3
    def scrape_bill(self, chamber, session, url):
        with self.urlopen(url) as data:
            if "Bill does not exist." in data:
                return

            bill = self.parse_bill_xml(chamber, session, data)
            bill.add_source(urlescape(url))

            versions_url = url.replace('billhistory', 'billtext/html')
            # Version URLs inexplicably use (H|S)(J|C) instead of (H|S)(JR|CR)
            versions_url = versions_url.replace('JR', 'J').replace('CR', 'C')
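            # Drop the filename component to get the FTP directory of versions.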
            versions_url = '/'.join(versions_url.split('/')[0:-1])

            bill_prefix = bill['bill_id'].split()[0]
            bill_num = int(bill['bill_id'].split()[1])
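            # Zero-pad the bill number to five digits to match the version filenames.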
            long_bill_id = "%s%05d" % (bill_prefix, bill_num)

            try:
                with self.urlopen(versions_url) as versions_list:
                    bill.add_source(urlescape(versions_url))
                    for version in parse_ftp_listing(versions_list):
                        if version.startswith(long_bill_id):
                            version_name = version.split('.')[0]
                            version_url = urlparse.urljoin(
                                versions_url + '/',
                                version)
                            bill.add_version(version_name,
                                             urlescape(version_url))
            except urllib2.URLError:
                # Sometimes the text is missing
                pass

            self.save_bill(bill)
Example #4
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod

        base_url = ("http://www.arkleg.state.ar.us/assembly/2011/2011R/"
                    "Pages/Committees.aspx?committeetype=")

        for chamber, url_ext in COMM_TYPES.iteritems():
            chamber_url = urlescape(base_url + url_ext)
            with self.urlopen(chamber_url) as page:
                page = lxml.html.fromstring(page)

                for a in page.xpath('//td[@class="dxtl dxtl__B0"]/a'):
                    if a.attrib.get('colspan') == '2':
                        # colspan=2 signals a subcommittee, but it's easier
                        # to pick those up from links on the committee page,
                        # so we do that in scrape_committee() and skip
                        # it here
                        continue

                    name = re.sub(r'\s*-\s*(SENATE|HOUSE)$', '', a.text).strip()

                    comm_url = urlescape(a.attrib['href'])
                    if chamber == 'task_force':
                        chamber = 'joint'
                    self.scrape_committee(chamber, name, comm_url)
Example #5
    def scrape(self, chamber, term):

        # Get start year of term.
        for termdict in self.metadata['terms']:
            if termdict['name'] == term:
                break
        start_year = termdict['start_year']

        base_url = ("http://www.arkleg.state.ar.us/assembly/%s/%sR/"
                    "Pages/Committees.aspx?committeetype=")
        base_url = base_url % (start_year, start_year)

        for chamber, url_ext in COMM_TYPES.iteritems():
            chamber_url = urlescape(base_url + url_ext)
            page = self.get(chamber_url).text
            page = lxml.html.fromstring(page)
            page.make_links_absolute(chamber_url)

            for a in page.xpath('//td[@class="dxtl dxtl__B0"]/a'):
                if a.attrib.get('colspan') == '2':
                    # colspan=2 signals a subcommittee, but it's easier
                    # to pick those up from links on the committee page,
                    # so we do that in scrape_committee() and skip
                    # it here
                    continue

                name = re.sub(r'\s*-\s*(SENATE|HOUSE)$', '', a.text).strip()

                comm_url = urlescape(a.attrib['href'])
                if chamber == 'task_force':
                    chamber = 'joint'

                self.scrape_committee(chamber, name, comm_url)
Example #6
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.urlopen(url))
        except scrapelib.HTTPError as e:
            self.warning("error (%s) fetching %s, skipping" % (e, url))
            return

        title = page.xpath("string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        if "JR" in bill_id:
            bill_type = ["joint resolution"]
        elif "CR" in bill_id:
            bill_type = ["concurrent resolution"]
        elif "R" in bill_id:
            bill_type = ["resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill["subjects"] = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if "otherAuth" in link.attrib["id"]:
                bill.add_sponsor("coauthor", name)
            else:
                bill.add_sponsor("author", name)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == "None":
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == "H":
                actor = "lower"
            elif actor == "S":
                actor = "upper"

            bill.add_action(actor, action, date, type=action_type(action))

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
            version_url = link.attrib["href"]
            if "COMMITTEE REPORTS" in version_url:
                continue

            name = link.text.strip()
            bill.add_version(name, version_url)

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            self.scrape_votes(bill, urlescape(link.attrib["href"]))

        self.save_bill(bill)
Example #7
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=True)

        if chamber == 'upper':
            chamber_abbrev = 'sen'
            title_abbrev = 'sen'
        else:
            chamber_abbrev = 'hse'
            title_abbrev = 'del'

        url = "http://www.legis.state.wv.us/districts/maps/%s_dist.cfm" % (
            chamber_abbrev)
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        view_url = '%smemview' % title_abbrev
        for link in page.xpath("//a[contains(@href, '%s')]" % view_url):
            name = link.xpath("string()").strip()
            leg_url = urlescape(link.attrib['href'])

            if name in [
                    'Members', 'Senate Members', 'House Members', 'Vacancy',
                    'VACANT'
            ]:
                continue

            self.scrape_legislator(chamber, term, name, leg_url)
Example #8
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=True)

        if chamber == 'upper':
            chamber_abbrev = 'sen'
            title_abbrev = 'sen'
        else:
            chamber_abbrev = 'hse'
            title_abbrev = 'del'

        url = "http://www.legis.state.wv.us/districts/maps/%s_dist.cfm" % (
            chamber_abbrev)
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        view_url = '%smemview' % title_abbrev
        for link in page.xpath("//a[contains(@href, '%s')]" % view_url):
            name = link.xpath("string()").strip()
            leg_url = urlescape(link.attrib['href'])

            if name in ['Members', 'Senate Members', 'House Members',
                        'Vacancy']:
                continue

            self.scrape_legislator(chamber, term, name, leg_url)
Example #9
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=True)

        if chamber == 'upper':
            chamber_abbrev = 'Senate1'
        else:
            chamber_abbrev = 'House'

        url = 'http://www.legis.state.wv.us/%s/roster.cfm' % chamber_abbrev
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        for link in page.xpath("//a[contains(@href, '?member=')]"):
            if not link.text:
                continue
            name = link.xpath("string()").strip()
            leg_url = urlescape(link.attrib['href'])

            if name in [
                    'Members', 'Senate Members', 'House Members', 'Vacancy',
                    'VACANT', 'Vacant', 'To Be Announced', 'To Be Appointed'
            ]:
                continue

            self.scrape_legislator(chamber, term, name, leg_url)
Example #10
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for tr in page.xpath('//table[@class="gridtable"]/'
                                 'tr[position()>1]'):
                if tr.xpath('string(td[1])'):
                    mtype = tr.xpath('string(td[1])')
                else:
                    mtype = 'member'
                member = tr.xpath('string(td[3])').split()
                member = ' '.join(member[1:])
                comm.add_member(member, mtype)

            for a in page.xpath('//ul/li/a'):
                sub_name = a.text.strip()
                sub_url = urlescape(a.attrib['href'])
                self.scrape_committee(chamber,
                                      name,
                                      sub_url,
                                      subcommittee=sub_name)

            self.save_committee(comm)
Example #11
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        name = self._fix_committee_name(name)
        name = self._fix_committee_case(name)

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Get the subcommittee name.
        xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()'

        if subcommittee:
            subcommittee = page.xpath(xpath)
            if subcommittee:
                subcommittee = page.xpath(xpath).pop(0)
                subcommittee = self._fix_committee_name(
                    subcommittee, parent=name, subcommittee=True)
                subcommittee = self._fix_committee_case(subcommittee)
            else:
                subcommittee = None

        # Dedupe.
        if (chamber, name, subcommittee) in self._seen:
            return
        self._seen.add((chamber, name, subcommittee))

        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        for tr in page.xpath('//table[@class="dxgvTable"]/tr[position()>1]'):
            if tr.xpath('string(td[1])').strip():
                mtype = tr.xpath('string(td[1])').strip()
            else:
                mtype = 'member'

            member = tr.xpath('string(td[3])').split()
            title = member[0]
            member = ' '.join(member[1:])

            if title == 'Senator':
                mchamber = 'upper'
            elif title == 'Representative':
                mchamber = 'lower'
            else:
                # skip non-legislative members
                continue

            comm.add_member(member, mtype, chamber=mchamber)

        for a in page.xpath('//ul/li/a'):
            sub_name = a.text.strip()
            sub_url = urlescape(a.attrib['href'])
            self.scrape_committee(chamber, name, sub_url,
                                  subcommittee=sub_name)

        if not comm['members']:
            self.warning('not saving empty committee %s' % name)
        else:
            self.save_committee(comm)
Example #12
    def scrape_bill(self, chamber, session, bill_id, url):
        page = lxml.html.fromstring(self.urlopen(url))

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsor('coauthor', name)
            else:
                bill.add_sponsor('author', name)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            bill.add_action(actor, action, date,
                            type=action_type(action))

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
            version_url = link.attrib['href']
            if 'COMMITTEE REPORTS' in version_url:
                continue

            name = link.text.strip()
            bill.add_version(name, version_url)

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            self.scrape_votes(bill, urlescape(link.attrib['href']))

        self.save_bill(bill)
Example #13
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod

        base_url = ("http://www.arkleg.state.ar.us/assembly/2011/2011R/"
                    "Pages/Committees.aspx?committeetype=")

        for chamber, url_ext in COMM_TYPES.iteritems():
            chamber_url = urlescape(base_url + url_ext)
            with self.urlopen(chamber_url) as page:
                page = lxml.html.fromstring(page)

                for a in page.xpath('//td[@class="dxtl dxtl__B0"]/a'):
                    name = a.text.strip()
                    comm_url = urlescape(a.attrib['href'])
                    if chamber == 'task_force':
                        chamber = 'joint'
                    self.scrape_committee(chamber, name, comm_url)
Example #14
    def scrape(self, chamber, term):
        url = ('http://www.arkleg.state.ar.us/assembly/2011/2011R/Pages/'
               'LegislatorSearchResults.aspx?member=&committee=All&chamber=')

        with self.urlopen(url) as page:
            root = lxml.html.fromstring(page)

            for a in root.xpath('//table[@class="dxgvTable"]'
                                '/tr[contains(@class, "dxgvDataRow")]'
                                '/td[1]/a'):
                member_url = urlescape(a.attrib['href'])
                self.scrape_member(chamber, term, member_url)
Example #15
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        name = self._fix_committee_name(name)
        name = self._fix_committee_case(name)

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)

        # Get the subcommittee name.
        xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()'

        if subcommittee:
            subcommittee = page.xpath(xpath).pop(0)
            subcommittee = self._fix_committee_name(subcommittee, parent=name, subcommittee=True)
            subcommittee = self._fix_committee_case(subcommittee)

        # Dedupe.
        if (chamber, name, subcommittee) in self._seen:
            return
        self._seen.add((chamber, name, subcommittee))

        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        for tr in page.xpath('//table[@class="gridtable"]/tr[position()>1]'):
            if tr.xpath("string(td[1])"):
                mtype = tr.xpath("string(td[1])")
            else:
                mtype = "member"

            member = tr.xpath("string(td[3])").split()
            title = member[0]
            member = " ".join(member[1:])

            if title == "Senator":
                mchamber = "upper"
            elif title == "Representative":
                mchamber = "lower"
            else:
                # skip non-legislative members
                continue

            comm.add_member(member, mtype, chamber=mchamber)

        for a in page.xpath("//ul/li/a"):
            sub_name = a.text.strip()
            sub_url = urlescape(a.attrib["href"])
            self.scrape_committee(chamber, name, sub_url, subcommittee=sub_name)

        if not comm["members"]:
            self.warning("not saving empty committee %s" % name)
        else:
            self.save_committee(comm)
Example #16
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        if subcommittee:
            split_sub = subcommittee.split('-')
            if len(split_sub) > 1:
                subcommittee = '-'.join(split_sub[1:])
            subcommittee = re.sub(r'^(HOUSE|SENATE)\s+', '',
                                  subcommittee.strip())

        if (name, subcommittee) in self._seen:
            return
        self._seen.add((name, subcommittee))

        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for tr in page.xpath('//table[@class="gridtable"]/'
                                 'tr[position()>1]'):
                if tr.xpath('string(td[1])'):
                    mtype = tr.xpath('string(td[1])')
                else:
                    mtype = 'member'

                member = tr.xpath('string(td[3])').split()
                title = member[0]
                member = ' '.join(member[1:])

                if title == 'Senator':
                    mchamber = 'upper'
                elif title == 'Representative':
                    mchamber = 'lower'
                else:
                    # skip non-legislative members
                    continue

                comm.add_member(member, mtype, chamber=mchamber)

            for a in page.xpath('//ul/li/a'):
                sub_name = a.text.strip()
                sub_url = urlescape(a.attrib['href'])
                self.scrape_committee(chamber,
                                      name,
                                      sub_url,
                                      subcommittee=sub_name)

            if not comm['members']:
                self.warning('not saving empty committee %s' % name)
            else:
                self.save_committee(comm)
Example #17
    def scrape(self, chamber, term):
        if term != "2011-2012":
            raise NoDataForPeriod

        url = (
            "http://www.arkleg.state.ar.us/assembly/2011/2011R/Pages/"
            "LegislatorSearchResults.aspx?member=&committee=All&chamber="
        )

        with self.urlopen(url) as page:
            root = lxml.html.fromstring(page)

            for a in root.xpath('//table[@class="dxgvTable"]' '/tr[contains(@class, "dxgvDataRow")]' "/td[1]/a"):
                member_url = urlescape(a.attrib["href"])
                self.scrape_member(chamber, term, member_url)
Example #18
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        if subcommittee:
            split_sub = subcommittee.split('-')
            if len(split_sub) > 1:
                subcommittee = '-'.join(split_sub[1:])
            subcommittee = re.sub(r'^(HOUSE|SENATE)\s+', '', subcommittee.strip())

        if (name, subcommittee) in self._seen:
            return
        self._seen.add((name, subcommittee))

        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for tr in page.xpath('//table[@class="gridtable"]/'
                                 'tr[position()>1]'):
                if tr.xpath('string(td[1])'):
                    mtype = tr.xpath('string(td[1])')
                else:
                    mtype = 'member'

                member = tr.xpath('string(td[3])').split()
                title = member[0]
                member = ' '.join(member[1:])

                if title == 'Senator':
                    mchamber = 'upper'
                elif title == 'Representative':
                    mchamber = 'lower'
                else:
                    # skip non-legislative members
                    continue

                comm.add_member(member, mtype, chamber=mchamber)

            for a in page.xpath('//ul/li/a'):
                sub_name = a.text.strip()
                sub_url = urlescape(a.attrib['href'])
                self.scrape_committee(chamber, name, sub_url,
                                      subcommittee=sub_name)

            if not comm['members']:
                self.warning('not saving empty committee %s' % name)
            else:
                self.save_committee(comm)
Example #19
    def scrape(self, chamber, term):

        # Get start year of term.
        for termdict in self.metadata['terms']:
            if termdict['name'] == term:
                break
        start_year = termdict['start_year']

        url = ('http://www.arkleg.state.ar.us/assembly/%s/%sR/Pages/'
               'LegislatorSearchResults.aspx?member=&committee=All&chamber=')
        url = url % (start_year, start_year)
        page = self.urlopen(url)
        root = lxml.html.fromstring(page)

        for a in root.xpath('//table[@class="dxgvTable"]'
                            '/tr[contains(@class, "dxgvDataRow")]'
                            '/td[1]/a'):
            member_url = urlescape(a.attrib['href'])
            self.scrape_member(chamber, term, member_url)
Example #20
    def scrape(self, chamber, term):

        # Get start year of term.
        for termdict in self.metadata['terms']:
            if termdict['name'] == term:
                break
        start_year = termdict['start_year']

        url = ('http://www.arkleg.state.ar.us/assembly/%s/%sR/Pages/'
               'LegislatorSearchResults.aspx?member=&committee=All&chamber=')
        url = url % (start_year, start_year)
        page = self.get(url).text
        root = lxml.html.fromstring(page)

        for a in root.xpath('//table[@class="dxgvTable"]'
                            '/tr[contains(@class, "dxgvDataRow")]'
                            '/td[1]/a'):
            member_url = urlescape(a.attrib['href'])
            self.scrape_member(chamber, term, member_url)
Example #21
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        if subcommittee:
            split_sub = subcommittee.split("-")
            if len(split_sub) > 1:
                subcommittee = "-".join(split_sub[1:])
            subcommittee = re.sub(r"^(HOUSE|SENATE)\s+", "", subcommittee.strip())

        if (name, subcommittee) in self._seen:
            return
        self._seen.add((name, subcommittee))

        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for tr in page.xpath('//table[@class="gridtable"]/' "tr[position()>1]"):
                if tr.xpath("string(td[1])"):
                    mtype = tr.xpath("string(td[1])")
                else:
                    mtype = "member"

                member = tr.xpath("string(td[3])").split()
                title = member[0]
                member = " ".join(member[1:])

                if title == "Senator":
                    mchamber = "upper"
                elif title == "Representative":
                    mchamber = "lower"

                comm.add_member(member, mtype, chamber=mchamber)

            for a in page.xpath("//ul/li/a"):
                sub_name = a.text.strip()
                sub_url = urlescape(a.attrib["href"])
                self.scrape_committee(chamber, name, sub_url, subcommittee=sub_name)

            self.save_committee(comm)
Example #22
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=True)

        if chamber == "upper":
            chamber_abbrev = "sen"
            title_abbrev = "sen"
        else:
            chamber_abbrev = "hse"
            title_abbrev = "del"

        url = "http://www.legis.state.wv.us/districts/maps/%s_dist.cfm" % (chamber_abbrev)
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        view_url = "%smemview" % title_abbrev
        for link in page.xpath("//a[contains(@href, '%s')]" % view_url):
            name = link.xpath("string()").strip()
            leg_url = urlescape(link.attrib["href"])

            if name in ["Members", "Senate Members", "House Members", "Vacancy", "VACANT", "Vacant"]:
                continue

            self.scrape_legislator(chamber, term, name, leg_url)
Example #23
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=True)

        if chamber == 'upper':
            chamber_abbrev = 'Senate1'
        else:
            chamber_abbrev = 'House'

        url = 'http://www.legis.state.wv.us/%s/roster.cfm' % chamber_abbrev
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        for link in page.xpath("//a[contains(@href, '?member=')]"):
            if not link.text:
                continue
            name = link.xpath("string()").strip()
            leg_url = urlescape(link.attrib['href'])

            if name in ['Members', 'Senate Members', 'House Members',
                        'Vacancy', 'VACANT', 'Vacant', "To Be Announced"]:
                continue

            self.scrape_legislator(chamber, term, name, leg_url)
Example #24
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for tr in page.xpath('//table[@class="gridtable"]/'
                                 'tr[position()>1]'):
                if tr.xpath('string(td[1])'):
                    mtype = tr.xpath('string(td[1])')
                else:
                    mtype = 'member'
                member = tr.xpath('string(td[3])').split()
                member = ' '.join(member[1:])
                comm.add_member(member, mtype)

            for a in page.xpath('//ul/li/a'):
                sub_name = a.text.strip()
                sub_url = urlescape(a.attrib['href'])
                self.scrape_committee(chamber, name, sub_url,
                                      subcommittee=sub_name)

            self.save_committee(comm)
Example #25
    def scrape_member(self, chamber, term, member_url):
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

            sdiv = root.xpath('//div[@class="subtitle"]')[0]
            table = sdiv.getnext()

            photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                    '_imgMember"]')[0].attrib['src']

            td = table.xpath('//td[@valign="top"]')[0]

            type = td.xpath('string(//div[1]/strong)').strip()

            full_name = td.xpath('string(//div[2]/strong)').strip()
            full_name = re.sub(r'\s+', ' ', full_name)

            district = td.xpath('string(//div[3])').strip()
            district = district.replace('District ', '')

            addrs = {}
            for atype, text in (('capital_address', 'Capitol address:'),
                                ('district_address', 'District address:')):
                aspan = root.xpath("//span[. = '%s']" % text)
                addrs[atype] = None

                if aspan:
                    addrs[atype] = aspan[0].tail
                    elem = aspan[0].getnext()
                    while elem is not None and elem.tag == 'br':
                        if elem.tail:
                            addrs[atype] += "\n" + elem.tail
                        elem = elem.getnext()

            party = td.xpath('string(//div[4])').strip()[0]
            if party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'

            if type == 'Lt. Gov.':
                leg = Person(full_name)
                leg.add_role('Lt. Governor', term, party=party, **addrs)
            else:
                leg = Legislator(term,
                                 chamber,
                                 district,
                                 full_name,
                                 party=party,
                                 photo_url=photo_url,
                                 **addrs)

            leg.add_source(urlescape(member_url))

            comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                                  '/following-sibling::div'
                                  '[@class="rcwcontent"]')[0]

            for link in comm_div.xpath('*/a'):
                name = link.text

                if '(Vice Chair)' in name:
                    mtype = 'vice chair'
                elif '(Chair)' in name:
                    mtype = 'chair'
                else:
                    mtype = 'member'

                name = clean_committee_name(link.text)

                # There's no easy way to determine whether a committee
                # is joint or not using the mobile legislator directory
                # (without grabbing a whole bunch of pages, at least)
                # so for now we will hard-code the one broken case
                if (name == "Oversight of HHS Eligibility System"
                        and term == '82'):
                    comm_chamber = 'joint'
                else:
                    comm_chamber = chamber

                if name.startswith('Appropriations-S/C on '):
                    sub = name.replace('Appropriations-S/C on ', '')
                    leg.add_role('committee member',
                                 term,
                                 chamber=comm_chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member',
                                 term,
                                 chamber=comm_chamber,
                                 committee=name,
                                 position=mtype)

            if type == 'Lt. Gov.':
                self.save_person(leg)
            else:
                if district:
                    self.save_legislator(leg)
Example #26
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.urlopen(url))
        except scrapelib.HTTPError as e:
            self.warning('error (%s) fetching %s, skipping' % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsor('cosponsor', name)
            else:
                bill.add_sponsor('primary', name)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            bill.add_action(actor, action, date,
                            type=action_type(action))

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
            version_url = link.attrib['href']
            if 'COMMITTEE REPORTS' in version_url:
                continue

            name = link.text.strip()
            bill.add_version(name, version_url, mimetype='application/msword')

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            self.scrape_votes(bill, urlescape(link.attrib['href']))

        # If the bill has no actions and no versions, it's a bogus bill on
        # their website, which appears to happen occasionally. Skip.
        has_no_actions = not bill['actions']
        has_no_versions = not bill['versions']
        has_no_title = (bill['title'] == "Short Title Not Found.")
        first_sponsor_is_bogus = (bill['sponsors'] and
                                  bill['sponsors'][0]['name'] == "Author Not Found.")
        has_no_sponsors = (len(bill['sponsors']) == 1) and first_sponsor_is_bogus
        if has_no_actions and has_no_versions:
            if has_no_title or has_no_sponsors:
                msg = '%r appears to be bogus. Skipping it.' % bill_id
                self.logger.warning(msg)
                return
        else:
            # Otherwise, save the bills.
            self.save_bill(bill)
Example #27
    def scrape_member(self, chamber, term, member_url):
        page = self.get(member_url).text
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        sdiv = root.xpath('//div[@class="subtitle"]')[0]
        table = sdiv.getnext()

        photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                '_imgMember"]')[0].attrib['src']

        td = table.xpath('//td[@valign="top"]')[0]

        type = td.xpath('string(//div[1]/strong)').strip()

        full_name = td.xpath('//div/strong/text()')
        full_name = [re.sub(r'\s+', ' ', x).strip() for x in full_name]
        if full_name == []:
            self.warning("ERROR: CAN'T GET FULL NAME")
            return

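        # Use the last bold entry, which holds the member's full name.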
        full_name = full_name[-1]

        district = td.xpath('string(//div[3])').strip()
        district = district.replace('District ', '')

        party = td.xpath('string(//div[4])').strip()[0]
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'

        if type == 'Lt. Gov.':
            leg = Person(full_name)
            leg.add_role('Lt. Governor', term, party=party)
        else:
            leg = Legislator(term,
                             chamber,
                             district,
                             full_name,
                             party=party,
                             photo_url=photo_url,
                             url=member_url)

        leg.add_source(urlescape(member_url))

        # add addresses
        for atype, text in (('capitol', 'Capitol address'),
                            ('district', 'District address')):
            aspan = root.xpath("//span[. = '%s:']" % text)
            addr = ''
            phone = None
            if aspan:
                # cycle through brs
                addr = aspan[0].tail.strip()
                elem = aspan[0].getnext()
                while elem is not None and elem.tag == 'br':
                    if elem.tail:
                        if not phone_re.match(elem.tail):
                            addr += "\n" + elem.tail
                        else:
                            phone = elem.tail
                    elem = elem.getnext()
                # now add the addresses
                leg.add_office(atype, text, address=addr, phone=phone)

        # add committees
        comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                              '/following-sibling::div'
                              '[@class="rcwcontent"]')[0]

        for link in comm_div.xpath('*/a'):
            name = link.text

            if '(Vice Chair)' in name:
                mtype = 'vice chair'
            elif '(Chair)' in name:
                mtype = 'chair'
            else:
                mtype = 'member'

            name = clean_committee_name(link.text)

            # There's no easy way to determine whether a committee
            # is joint or not using the mobile legislator directory
            # (without grabbing a whole bunch of pages, at least)
            # so for now we will hard-code the one broken case
            if (name == "Oversight of HHS Eligibility System"
                    and term == '82'):
                comm_chamber = 'joint'
            else:
                comm_chamber = chamber

            if name.startswith('Appropriations-S/C on '):
                sub = name.replace('Appropriations-S/C on ', '')
                leg.add_role('committee member',
                             term,
                             chamber=comm_chamber,
                             committee='Appropriations',
                             subcommittee=sub,
                             position=mtype)
            else:
                leg.add_role('committee member',
                             term,
                             chamber=comm_chamber,
                             committee=name,
                             position=mtype)

        if type == 'Lt. Gov.':
            self.save_object(leg)
        else:
            if district:
                self.save_legislator(leg)
Example #28
    def scrape_member(self, chamber, term, member_url):
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

            sdiv = root.xpath('//div[@class="subtitle"]')[0]
            table = sdiv.getnext()

            photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1' '_imgMember"]')[0].attrib["src"]

            td = table.xpath('//td[@valign="top"]')[0]

            type = td.xpath("string(//div[1]/strong)").strip()

            full_name = td.xpath("string(//div[2]/strong)").strip()
            full_name = re.sub(r"\s+", " ", full_name)

            district = td.xpath("string(//div[3])").strip()
            district = district.replace("District ", "")

            party = td.xpath("string(//div[4])").strip()[0]
            if party == "D":
                party = "Democratic"
            elif party == "R":
                party = "Republican"

            if type == "Lt. Gov.":
                leg = Person(full_name)
                leg.add_role("Lt. Governor", term, party=party)
            else:
                leg = Legislator(term, chamber, district, full_name, party=party, photo_url=photo_url, url=member_url)

            leg.add_source(urlescape(member_url))

            # add addresses
            for atype, text in (("capitol", "Capitol address"), ("district", "District address")):
                aspan = root.xpath("//span[. = '%s:']" % text)
                addr = ""
                phone = None
                if aspan:
                    # cycle through brs
                    addr = aspan[0].tail.strip()
                    elem = aspan[0].getnext()
                    while elem is not None and elem.tag == "br":
                        if elem.tail:
                            if not phone_re.match(elem.tail):
                                addr += "\n" + elem.tail
                            else:
                                phone = elem.tail
                        elem = elem.getnext()
                    # now add the addresses
                    leg.add_office(atype, text, address=addr, phone=phone)

            # add committees
            comm_div = root.xpath(
                '//div[string() = "Committee Membership:"]' "/following-sibling::div" '[@class="rcwcontent"]'
            )[0]

            for link in comm_div.xpath("*/a"):
                name = link.text

                if "(Vice Chair)" in name:
                    mtype = "vice chair"
                elif "(Chair)" in name:
                    mtype = "chair"
                else:
                    mtype = "member"

                name = clean_committee_name(link.text)

                # There's no easy way to determine whether a committee
                # is joint or not using the mobile legislator directory
                # (without grabbing a whole bunch of pages, at least)
                # so for now we will hard-code the one broken case
                if name == "Oversight of HHS Eligibility System" and term == "82":
                    comm_chamber = "joint"
                else:
                    comm_chamber = chamber

                if name.startswith("Appropriations-S/C on "):
                    sub = name.replace("Appropriations-S/C on ", "")
                    leg.add_role(
                        "committee member",
                        term,
                        chamber=comm_chamber,
                        committee="Appropriations",
                        subcommittee=sub,
                        position=mtype,
                    )
                else:
                    leg.add_role("committee member", term, chamber=comm_chamber, committee=name, position=mtype)

            if type == "Lt. Gov.":
                self.save_object(leg)
            else:
                if district:
                    self.save_legislator(leg)
Example #29
    def scrape_member(self, chamber, term, member_url):
        page = self.urlopen(member_url)
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        sdiv = root.xpath('//div[@class="subtitle"]')[0]
        table = sdiv.getnext()

        photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                '_imgMember"]')[0].attrib['src']

        td = table.xpath('//td[@valign="top"]')[0]

        type = td.xpath('string(//div[1]/strong)').strip()

        full_name = td.xpath('string(//div[2]/strong)').strip()
        full_name = re.sub(r'\s+', ' ', full_name)

        district = td.xpath('string(//div[3])').strip()
        district = district.replace('District ', '')

        party = td.xpath('string(//div[4])').strip()[0]
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'

        if type == 'Lt. Gov.':
            leg = Person(full_name)
            leg.add_role('Lt. Governor', term, party=party)
        else:
            leg = Legislator(term, chamber, district, full_name,
                             party=party, photo_url=photo_url,
                             url=member_url)

        leg.add_source(urlescape(member_url))

        # add addresses
        for atype, text in (('capitol', 'Capitol address'),
                            ('district', 'District address')):
            aspan = root.xpath("//span[. = '%s:']" % text)
            addr = ''
            phone = None
            if aspan:
                # cycle through brs
                addr = aspan[0].tail.strip()
                elem = aspan[0].getnext()
                while elem is not None and elem.tag == 'br':
                    if elem.tail:
                        if not phone_re.match(elem.tail):
                            addr += "\n" + elem.tail
                        else:
                            phone = elem.tail
                    elem = elem.getnext()
                # now add the addresses
                leg.add_office(atype, text, address=addr, phone=phone)

        # add committees
        comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                              '/following-sibling::div'
                              '[@class="rcwcontent"]')[0]

        for link in comm_div.xpath('*/a'):
            name = link.text

            if '(Vice Chair)' in name:
                mtype = 'vice chair'
            elif '(Chair)' in name:
                mtype = 'chair'
            else:
                mtype = 'member'

            name = clean_committee_name(link.text)

            # There's no easy way to determine whether a committee
            # is joint or not using the mobile legislator directory
            # (without grabbing a whole bunch of pages, at least)
            # so for now we will hard-code the one broken case
            if (name == "Oversight of HHS Eligibility System" and
                term == '82'):
                comm_chamber = 'joint'
            else:
                comm_chamber = chamber

            if name.startswith('Appropriations-S/C on '):
                sub = name.replace('Appropriations-S/C on ', '')
                leg.add_role('committee member', term,
                             chamber=comm_chamber,
                             committee='Appropriations',
                             subcommittee=sub,
                             position=mtype)
            else:
                leg.add_role('committee member', term,
                             chamber=comm_chamber,
                             committee=name,
                             position=mtype)

        if type == 'Lt. Gov.':
            self.save_object(leg)
        else:
            if district:
                self.save_legislator(leg)
Example #30
    def scrape_member(self, chamber, term, member_url):
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

            sdiv = root.xpath('//div[@class="subtitle"]')[0]
            table = sdiv.getnext()

            photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                    '_imgMember"]')[0].attrib['src']

            td = table.xpath('//td[@valign="top"]')[0]

            type = td.xpath('string(//div[1]/strong)').strip()

            full_name = td.xpath('string(//div[2]/strong)').strip()
            full_name = re.sub(r'\s+', ' ', full_name)

            district = td.xpath('string(//div[3])').strip()
            district = district.replace('District ', '')

            addrs = {}
            for atype, text in (('capital_address', 'Capitol address:'),
                                ('district_address', 'District address:')):
                aspan = root.xpath("//span[. = '%s']" % text)
                addrs[atype] = None

                if aspan:
                    addrs[atype] = aspan[0].tail
                    elem = aspan[0].getnext()
                    while elem is not None and elem.tag == 'br':
                        if elem.tail:
                            addrs[atype] += "\n" + elem.tail
                        elem = elem.getnext()

            party = td.xpath('string(//div[4])').strip()[0]
            if party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'

            if type == 'Lt. Gov.':
                leg = Person(full_name)
                leg.add_role('Lt. Governor', term, party=party, **addrs)
            else:
                leg = Legislator(term, chamber, district, full_name,
                                 party=party, photo_url=photo_url,
                                 **addrs)

            leg.add_source(urlescape(member_url))

            comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                                  '/following-sibling::div'
                                  '[@class="rcwcontent"]')[0]

            for link in comm_div.xpath('*/a'):
                name = link.text

                if '(Vice Chair)' in name:
                    mtype = 'vice chair'
                elif '(Chair)' in name:
                    mtype = 'chair'
                else:
                    mtype = 'member'

                name = clean_committee_name(link.text)

                if name.startswith('Appropriations-S/C on '):
                    sub = name.replace('Appropriations-S/C on ', '')
                    leg.add_role('committee member', term,
                                 chamber=chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member', term,
                                 chamber=chamber,
                                 committee=name,
                                 position=mtype)

            if type == 'Lt. Gov.':
                self.save_person(leg)
            else:
                if district:
                    self.save_legislator(leg)
Example #31
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.urlopen(url))
        except scrapelib.HTTPError as e:
            self.warning("error (%s) fetching %s, skipping" % (e, url))
            return

        title = page.xpath("string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        if "JR" in bill_id:
            bill_type = ["joint resolution"]
        elif "CR" in bill_id:
            bill_type = ["concurrent resolution"]
        elif "R" in bill_id:
            bill_type = ["resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill["subjects"] = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if ":" in name:
                raise Exception(name)
            if "otherAuth" in link.attrib["id"]:
                bill.add_sponsor("cosponsor", name)
            else:
                bill.add_sponsor("primary", name)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == "None":
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == "H":
                actor = "lower"
            elif actor == "S":
                actor = "upper"

            attrs = dict(actor=actor, action=action, date=date)
            attrs.update(**self.categorizer.categorize(action))
            bill.add_action(**attrs)

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib["href"]
            if "COMMITTEE REPORTS" in version_url:
                continue

            name = link.text.strip()
            bill.add_version(name, version_url, mimetype="application/pdf")

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if "HT_" not in link.attrib["href"]:
                self.scrape_votes(bill, urlescape(link.attrib["href"]))

        # A placeholder title means this is an empty page for a bogus bill,
        # which appears to happen occasionally on their website. Skip it;
        # otherwise save the bill.
        if bill["title"] == "Short Title Not Found.":
            return

        self.save_bill(bill)
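Note that the bill-type chain near the top of this example depends on checking the most specific substring first: every "JR" and "CR" id also contains "R". A tiny stand-alone sketch (the classify helper is hypothetical, extracted here only to illustrate the ordering):

def classify(bill_id):
    # More specific substrings must be tested before the bare 'R'.
    if 'JR' in bill_id:
        return ['joint resolution']
    elif 'CR' in bill_id:
        return ['concurrent resolution']
    elif 'R' in bill_id:
        return ['resolution']
    return ['bill']

assert classify('SJR 12') == ['joint resolution']  # contains 'R' too
assert classify('HCR 5') == ['concurrent resolution']
assert classify('HB 1001') == ['bill']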
Example #34
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.get(url).text)
        except scrapelib.HTTPError as e:
            self.warning('error (%s) fetching %s, skipping' % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if ':' in name:
                raise Exception(name)
            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsor('cosponsor', name)
            else:
                bill.add_sponsor('primary', name)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            attrs = dict(actor=actor, action=action, date=date)
            attrs.update(**self.categorizer.categorize(action))
            bill.add_action(**attrs)

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib['href']
            name = link.text.strip()

            if 'COMMITTEE REPORTS' in version_url:
                bill.add_document(name, version_url, mimetype='application/pdf')
                continue

            bill.add_version(name, version_url, mimetype='application/pdf')

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if 'HT_' not in link.attrib['href']:
                self.scrape_votes(bill, urlescape(link.attrib['href']))

        # A placeholder title means this is an empty page for a bogus bill,
        # which appears to happen occasionally on their website. Skip it;
        # otherwise save the bill.
        if bill['title'] == "Short Title Not Found.":
            return

        self.save_bill(bill)
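The action loop above leans on self.categorizer.categorize(action) returning a dict of extra keyword arguments that gets merged into the add_action() call. A minimal sketch of that contract, with illustrative rules and type names rather than the real categorizer:

import re

class SimpleCategorizer(object):
    # Illustrative (pattern, type) rules; the real categorizer's rules
    # and type vocabulary will differ.
    _rules = [
        (re.compile(r'first reading', re.I), 'bill:reading:1'),
        (re.compile(r'second reading', re.I), 'bill:reading:2'),
        (re.compile(r'signed by.*governor', re.I), 'governor:signed'),
    ]

    def categorize(self, action):
        for pattern, atype in self._rules:
            if pattern.search(action):
                return {'type': atype}
        return {'type': 'other'}

# attrs.update(**self.categorizer.categorize(action)) then merges the
# returned keys into the add_action() keyword arguments.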
Example #35
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.get(url).text)
        except scrapelib.HTTPError as e:
            self.warning('error (%s) fetching %s, skipping' % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if ':' in name:
                raise Exception(name)
            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsor('cosponsor', name)
            else:
                bill.add_sponsor('primary', name)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            attrs = dict(actor=actor, action=action, date=date)
            attrs.update(**self.categorizer.categorize(action))
            bill.add_action(**attrs)

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib['href']
            name = link.text.strip()

            if 'COMMITTEE REPORTS' in version_url:
                bill.add_document(name,
                                  version_url,
                                  mimetype='application/pdf')
                continue

            bill.add_version(name, version_url, mimetype='application/pdf')

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if 'HT_' not in link.attrib['href']:
                self.scrape_votes(bill, urlescape(link.attrib['href']))

        # A placeholder title means this is an empty page for a bogus bill,
        # which appears to happen occasionally on their website. Skip it;
        # otherwise save the bill.
        if bill['title'] == "Short Title Not Found.":
            return

        self.save_bill(bill)
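urlescape() is imported from the scraper's utilities in all of these examples. A stand-in with the behavior the call sites appear to rely on (percent-escaping unsafe characters such as spaces while leaving the URL's structure intact) might look like this; the exact safe set is an assumption:

try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote        # Python 2

def urlescape(url):
    # Escape spaces and other unsafe characters, but keep the characters
    # that give the URL its structure (scheme, path, query, fragment).
    return quote(url, safe="%/:=&?~#+!$,;'@()*[]")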
Example #36
0
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        name = self._fix_committee_name(name)
        name = self._fix_committee_case(name)

        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Get the subcommittee name.
        xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()'

        if subcommittee:
            sub_names = page.xpath(xpath)
            if sub_names:
                subcommittee = self._fix_committee_name(sub_names[0],
                                                        parent=name,
                                                        subcommittee=True)
                subcommittee = self._fix_committee_case(subcommittee)
            else:
                subcommittee = None

        # Dedupe.
        if (chamber, name, subcommittee) in self._seen:
            return
        self._seen.add((chamber, name, subcommittee))

        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        member_nodes = page.xpath('//table[@class="dxgvTable"]/tr')

        for member_node in member_nodes:
            # Skip empty rows.
            if member_node.attrib['class'] == 'dxgvEmptyDataRow':
                continue

            mtype = member_node.xpath('string(td[1])').strip()

            if not mtype:
                mtype = 'member'

            member = member_node.xpath('string(td[3])').split()

            title = member[0]
            member = ' '.join(member[1:])

            if title == 'Senator':
                mchamber = 'upper'
            elif title == 'Representative':
                mchamber = 'lower'
            else:
                # skip non-legislative members
                continue

            comm.add_member(member, mtype, chamber=mchamber)

        for a in page.xpath('//table[@id="ctl00_m_g_a194465c_f092_46df_b753_'
                            '354150ac7dbd_ctl00_tblContainer"]//ul/li/a'):
            sub_name = a.text.strip()
            sub_url = urlescape(a.attrib['href'])
            self.scrape_committee(chamber,
                                  name,
                                  sub_url,
                                  subcommittee=sub_name)

        if not comm['members']:
            if subcommittee:
                self.warning(
                    'Not saving empty subcommittee {}.'.format(subcommittee))
            else:
                self.warning('Not saving empty committee {}.'.format(name))
        else:
            self.save_committee(comm)
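The dedupe step above assumes self._seen is a set created once per run, so that the recursive subcommittee calls can never save the same (chamber, name, subcommittee) triple twice. A hypothetical sketch of that setup:

class CommitteeScraper(object):  # illustrative base, not the real class
    def __init__(self):
        # Triples already scraped this run; checked and updated in
        # scrape_committee() before any committee is saved.
        self._seen = set()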
Example #37
0
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        name = self._fix_committee_name(name)
        name = self._fix_committee_case(name)

        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Get the subcommittee name.
        xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()'

        if subcommittee:
            sub_names = page.xpath(xpath)
            if sub_names:
                subcommittee = self._fix_committee_name(
                    sub_names[0], parent=name, subcommittee=True)
                subcommittee = self._fix_committee_case(subcommittee)
            else:
                subcommittee = None

        # Dedupe.
        if (chamber, name, subcommittee) in self._seen:
            return
        self._seen.add((chamber, name, subcommittee))

        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        member_nodes = page.xpath('//table[@class="dxgvTable"]/tr')

        for member_node in member_nodes:
            # Skip empty rows.
            if member_node.attrib['class'] == 'dxgvEmptyDataRow':
                continue

            mtype = member_node.xpath('string(td[1])').strip()

            if not mtype:
                mtype = 'member'

            member = member_node.xpath('string(td[3])').split()

            title = member[0]
            member = ' '.join(member[1:])

            if title == 'Senator':
                mchamber = 'upper'
            elif title == 'Representative':
                mchamber = 'lower'
            else:
                # skip non-legislative members
                continue

            comm.add_member(member, mtype, chamber=mchamber)

        for a in page.xpath('//table[@id="ctl00_m_g_a194465c_f092_46df_b753_'
                            '354150ac7dbd_ctl00_tblContainer"]//ul/li/a'):
            sub_name = a.text.strip()
            sub_url = urlescape(a.attrib['href'])
            self.scrape_committee(chamber, name, sub_url,
                                  subcommittee=sub_name)

        if not comm['members']:
            if subcommittee:
                self.warning('Not saving empty subcommittee {}.'.format(
                    subcommittee))
            else:
                self.warning('Not saving empty committee {}.'.format(name))
        else:
            self.save_committee(comm)
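The member cell parsed in these committee examples is assumed to read "<Title> <Full Name>", so the first whitespace token is the honorific and everything after it is the name. In isolation:

cell = 'Senator Jane Q. Public'
parts = cell.split()
title, member = parts[0], ' '.join(parts[1:])
assert (title, member) == ('Senator', 'Jane Q. Public')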
Example #38
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.urlopen(url))
        except scrapelib.HTTPError as e:
            self.warning('error (%s) fetching %s, skipping' % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsor('cosponsor', name)
            else:
                bill.add_sponsor('primary', name)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            bill.add_action(actor, action, date, type=action_type(action))

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
            version_url = link.attrib['href']
            if 'COMMITTEE REPORTS' in version_url:
                continue

            name = link.text.strip()
            bill.add_version(name, version_url)

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            self.scrape_votes(bill, urlescape(link.attrib['href']))

        # If the bill has no actions and no versions and only placeholder
        # metadata, it's a bogus bill on their website, which appears to
        # happen occasionally. Skip it; otherwise save it.
        has_no_actions = not bill['actions']
        has_no_versions = not bill['versions']
        has_no_title = (bill['title'] == "Short Title Not Found.")
        sponsors = bill['sponsors']
        first_sponsor_is_bogus = (bool(sponsors) and
                                  sponsors[0]['name'] == "Author Not Found.")
        has_no_sponsors = (len(sponsors) == 1) and first_sponsor_is_bogus
        if has_no_actions and has_no_versions and (has_no_title or
                                                   has_no_sponsors):
            msg = '%r appears to be bogus. Skipping it.' % bill_id
            self.logger.warning(msg)
            return

        self.save_bill(bill)