Example #1
    def scrape_bill(self, chamber, session, url):
        with self.urlopen(url) as data:
            if "Bill does not exist." in data:
                return

            bill = self.parse_bill_xml(chamber, session, data)
            bill.add_source(urlescape(url))

            versions_url = url.replace('billhistory', 'billtext/html')
            # Version URLs inexplicably use (H|S)(J|C) instead of (H|S)(JR|CR)
            versions_url = versions_url.replace('JR', 'J').replace('CR', 'C')
            versions_url = '/'.join(versions_url.split('/')[0:-1])

            # Zero-pad the bill number to five digits, e.g. 'HB 1' -> 'HB00001'
            bill_prefix = bill['bill_id'].split()[0]
            bill_num = int(bill['bill_id'].split()[1])
            long_bill_id = "%s%05d" % (bill_prefix, bill_num)

            try:
                with self.urlopen(versions_url) as versions_list:
                    bill.add_source(urlescape(versions_url))
                    for version in parse_ftp_listing(versions_list):
                        if version.startswith(long_bill_id):
                            version_name = version.split('.')[0]
                            version_url = urlparse.urljoin(
                                versions_url + '/',
                                version)
                            bill.add_version(version_name,
                                             urlescape(version_url))
            except urllib2.URLError:
                # Sometimes the text is missing
                pass

            self.save_bill(bill)
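
The helpers used above (parse_ftp_listing, urlescape) come from the scraper's shared utilities and are not shown in these examples. As a rough, hypothetical sketch, assuming the FTP listing is plain text with the entry name in the last whitespace-separated column:

    # Hypothetical sketches only; the real implementations are not shown here.
    import urllib

    def parse_ftp_listing(text):
        # Yield the entry name (last column) of each non-empty listing line.
        for line in text.strip().splitlines():
            if line:
                yield line.split()[-1]

    def urlescape(url):
        # Percent-escape characters that are unsafe inside a URL.
        return urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")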
Example #2
    def scrape(self, chamber, session):
        self.validate_session(session)

        if len(session) == 2:
            session = "%sR" % session

        for btype in ["bills", "concurrent_resolutions", "joint_resolutions", "resolutions"]:
            billdirs_path = "/bills/%s/billhistory/%s_%s/" % (session, chamber_name(chamber), btype)
            billdirs_url = urlparse.urljoin(self._ftp_root, billdirs_path)

            with self.urlopen(billdirs_url) as bill_dirs:
                for dir in parse_ftp_listing(bill_dirs):
                    bill_url = urlparse.urljoin(billdirs_url, dir) + "/"
                    with self.urlopen(bill_url) as bills:
                        for history in parse_ftp_listing(bills):
                            self.scrape_bill(chamber, session, urlparse.urljoin(bill_url, history))
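
For illustration only, here is how billdirs_path expands for one hypothetical combination of inputs, assuming chamber_name('lower') returns 'house' (that mapping is not shown in these examples):

    # Hypothetical expansion of the path template in scrape() above.
    session, btype = '81R', 'bills'
    billdirs_path = "/bills/%s/billhistory/%s_%s/" % (session, 'house', btype)
    assert billdirs_path == "/bills/81R/billhistory/house_bills/"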
Example #3
    def scrape(self, chamber, session):
        self.validate_session(session)

        if len(session) == 2:
            session = "%sR" % session

        for btype in ['bills', 'concurrent_resolutions',
                      'joint_resolutions', 'resolutions']:
            billdirs_path = '/bills/%s/billhistory/%s_%s/' % (
                session, chamber_name(chamber), btype)
            billdirs_url = urlparse.urljoin(self._ftp_root, billdirs_path)

            with self.urlopen(billdirs_url) as bill_dirs:
                for dir in parse_ftp_listing(bill_dirs):
                    bill_url = urlparse.urljoin(billdirs_url, dir) + '/'
                    with self.urlopen(bill_url) as bills:
                        for history in parse_ftp_listing(bills):
                            self.scrape_bill(chamber, session,
                                             urlparse.urljoin(bill_url,
                                                              history))
Example #4
    def scrape(self, chamber, session):
        self.validate_session(session)

        if len(session) == 2:
            session = "%sR" % session

        journal_root = urlparse.urljoin(self._ftp_root,
                                        ("/journals/" + session + "/html/"),
                                        True)

        if chamber == 'lower':
            journal_root = urlparse.urljoin(journal_root, "house/", True)
        else:
            journal_root = urlparse.urljoin(journal_root, "senate/", True)

        with self.urlopen(journal_root) as listing:
            for name in parse_ftp_listing(listing):
                if not name.startswith(session):
                    continue
                url = urlparse.urljoin(journal_root, name)
                self.scrape_journal(url, chamber, session)
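
As a worked example of how journal_root is built, using a placeholder FTP root (the actual value of self._ftp_root is not shown in these examples):

    import urlparse  # Python 2 module, as in the examples above

    # Hypothetical expansion for chamber == 'lower' and session == '81R'.
    root = "ftp://ftp.example.gov/"
    journal_root = urlparse.urljoin(root, "/journals/" + "81R" + "/html/", True)
    journal_root = urlparse.urljoin(journal_root, "house/", True)
    assert journal_root == "ftp://ftp.example.gov/journals/81R/html/house/"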
Example #5
    def scrape(self, chamber, session):
        self.validate_session(session)

        if len(session) == 2:
            session = "%sR" % session

        journal_root = urlparse.urljoin(self._ftp_root, ("/journals/" +
                                                         session +
                                                         "/html/"),
                                        True)

        if chamber == 'lower':
            journal_root = urlparse.urljoin(journal_root, "house/", True)
        else:
            journal_root = urlparse.urljoin(journal_root, "senate/", True)

        with self.urlopen(journal_root) as listing:
            for name in parse_ftp_listing(listing):
                # Unlike Example #4, the session prefix is hardcoded to '81'
                # here, and scrape_journal is not passed the session.
                if not name.startswith('81'):
                    continue
                url = urlparse.urljoin(journal_root, name)
                self.scrape_journal(url, chamber)