Exemplo n.º 1
0
    def scrape_bill(self, session, chamber, bill_type, url):
        bill_html = self.get(url).text
        bill_page = lxml.html.fromstring(bill_html)

        qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
        bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])
        versions = bill_page.xpath(
            "//table[contains(@id, 'GridViewVersions')]")[0]

        metainf_table = bill_page.xpath(
            '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath(
            '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        subs = [s.strip() for s in meta["Report Title"].split(";")]
        if "" in subs:
            subs.remove("")
        b = Bill(
            bill_id,
            session,
            meta["Measure Title"],
            chamber=chamber,
            classification=bill_type,
        )
        if meta["Description"]:
            b.add_abstract(meta["Description"], "description")
        for subject in subs:
            b.add_subject(subject)
        if url:
            b.add_source(url)

        prior_session = "{} Regular Session".format(str(int(session[:4]) - 1))
        companion = meta["Companion"].strip()
        if companion:
            b.add_related_bill(
                identifier=companion.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )
        if bill_page.xpath(
                "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
        ):
            prior = bill_page.xpath(
                "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
            )[-1]
            if "carried over" in prior.lower():
                b.add_related_bill(
                    identifier=bill_id.replace(u"\xa0", " "),
                    legislative_session=prior_session,
                    relation_type="companion",
                )
        for sponsor in meta["Introducer(s)"]:
            b.add_sponsorship(sponsor, "primary", "person", True)
        versions = self.parse_bill_versions_table(b, versions)
        yield from self.parse_bill_actions_table(b, action_table, bill_id,
                                                 session, url, chamber)
        yield b
Exemplo n.º 2
0
    def scrape(self):
        self.session = '2011'

        for i, page in enumerate(self.searchLegislation()):
            for legislation_summary in self.parseSearchResults(page):
                title = legislation_summary['Title'].strip()
                if title == "":
                    continue

                bill = Bill(name=legislation_summary['Record #'],
                            session=self.session,
                            title=title,
                            type=[legislation_summary['Type'].lower()],
                            organization=self.jurisdiction.name)

                bill.add_source(legislation_summary['URL'])

                legislation_details = self.expandLegislationSummary(
                    legislation_summary)

                for related_bill in legislation_details.get(
                        'Related files', []):
                    bill.add_related_bill(name=related_bill,
                                          session=self.session,
                                          relation='other-session',
                                          chamber=None)

                for i, sponsor in enumerate(
                        legislation_details.get('Sponsors', [])):
                    if i == 0:
                        primary = True
                        sponsorship_type = "Primary"
                    else:
                        primary = False
                        sponsorship_type = "Regular"

                    bill.add_sponsor(sponsor, sponsorship_type, 'person',
                                     primary)

                for subject in legislation_details.get(u'Topics', []):
                    bill.add_subject(subject)

                for attachment in legislation_details.get(u'Attachments', []):
                    bill.add_version_link('PDF',
                                          attachment['url'],
                                          mimetype="application/pdf")

                yield bill
Exemplo n.º 3
0
    def scrape(self):
        self.session = '2011'

        for i, page in enumerate(self.searchLegislation()) :
            for legislation_summary in self.parseSearchResults(page) :
                title = legislation_summary['Title'].strip()
                if title == "":
                    continue

                bill = Bill(name=legislation_summary['Record #'],
                            session=self.session,
                            title=title,
                            type=[legislation_summary['Type'].lower()],
                            organization=self.jurisdiction.name)

                bill.add_source(legislation_summary['URL'])

                legislation_details = self.expandLegislationSummary(legislation_summary)

                for related_bill in legislation_details.get('Related files', []) :
                    bill.add_related_bill(name = related_bill,
                                          session = self.session,
                                          relation='other-session',
                                          chamber=None)

                for i, sponsor in enumerate(legislation_details.get('Sponsors', [])) :
                    if i == 0 :
                        primary = True
                        sponsorship_type = "Primary"
                    else :
                        primary = False
                        sponsorship_type = "Regular"

                    bill.add_sponsor(sponsor, sponsorship_type,
                                     'person', primary)

                for subject in legislation_details.get(u'Topics', []) :
                    bill.add_subject(subject)

                for attachment in legislation_details.get(u'Attachments', []) :
                    bill.add_version_link('PDF',
                                          attachment['url'],
                                          mimetype="application/pdf")


                yield bill
Exemplo n.º 4
0
    def scrape_bill(self, session, chamber, bill_type, url):
        bill_html = self.get(url).text
        bill_page = lxml.html.fromstring(bill_html)

        qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
        bill_id = '{}{}'.format(qs['billtype'], qs['billnumber'])
        versions = bill_page.xpath("//table[contains(@id, 'GridViewVersions')]")[0]

        metainf_table = bill_page.xpath('//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath('//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        subs = [s.strip() for s in meta['Report Title'].split(";")]
        if "" in subs:
            subs.remove("")
        b = Bill(bill_id, session, meta['Measure Title'],
                 chamber=chamber,
                 classification=bill_type)
        if meta['Description']:
            b.add_abstract(meta['Description'], 'description')
        for subject in subs:
            b.add_subject(subject)
        if url:
            b.add_source(url)

        prior_session = '{} Regular Session'.format(str(int(session[:4]) - 1))
        companion = meta['Companion'].strip()
        if companion:
            b.add_related_bill(identifier=companion.replace(u'\xa0', ' '),
                               legislative_session=prior_session,
                               relation_type="companion")
        prior = bill_page.xpath(
            "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()")[-1]
        if 'carried over' in prior.lower():
            b.add_related_bill(identifier=bill_id.replace(u'\xa0', ' '),
                               legislative_session=prior_session,
                               relation_type="companion")
        for sponsor in meta['Introducer(s)']:
            b.add_sponsorship(sponsor, 'primary', 'person', True)
        versions = self.parse_bill_versions_table(b, versions)
        yield from self.parse_bill_actions_table(b, action_table, bill_id, session, url, chamber)
        yield b
Exemplo n.º 5
0
    def parse_bill_status_page(self, status_url, bill_url, session, chamber):
        status_page = lxml.html.fromstring(self.get(status_url).text)
        # see 2007 HB 2... weird.
        bill_re = r'.*?/([A-Z]+)0*(\d+)\.pdf'
        bill_xpath = '//a[contains(@href, ".pdf") and contains(@href, "billpdf")]/@href'
        bill_id = re.search(bill_re, status_page.xpath(bill_xpath)[0],
                            re.IGNORECASE).groups()
        bill_id = "{0} {1}".format(bill_id[0], int(bill_id[1]))

        try:
            xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
            title = status_page.xpath(xp).pop()
        except IndexError:
            title = status_page.xpath('//tr[1]/td[2]')[0].text_content()

        # Add bill type.
        _bill_id = bill_id.lower()
        if 'b' in _bill_id:
            classification = 'bill'
        elif 'j' in _bill_id or 'jr' in _bill_id:
            classification = 'joint resolution'
        elif 'cr' in _bill_id:
            classification = 'concurrent resolution'
        elif 'r' in _bill_id:
            classification = 'resolution'

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=title, classification=classification)

        self.add_actions(bill, status_page)
        votes = self.add_votes(bill, status_page, status_url)

        tabledata = self._get_tabledata(status_page)

        # Add sponsor info.
        bill.add_sponsorship(tabledata['primary sponsor:'][0], classification='primary',
                             entity_type='person', primary=True)

        # A various plus fields MT provides.
        plus_fields = [
            'requester',
            ('chapter number:', 'chapter'),
            'transmittal date:',
            'drafter',
            'fiscal note probable:',
            'bill draft number:',
            'preintroduction required:',
            'by request of',
            'category:']

        for x in plus_fields:
            if isinstance(x, tuple):
                _key, key = x
            else:
                _key = key = x
                key = key.replace(' ', '_')

            try:
                val = tabledata[_key]
            except KeyError:
                continue

            if len(val) == 1:
                val = val[0]

            bill.extras[key] = val

        # Add bill subjects.
        xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
        subjects = []
        for tr in status_page.xpath(xp):
            try:
                subj = tr.xpath('td')[0].text_content()
            except:
                continue
            subjects.append(subj)

        for s in subjects:
            bill.add_subject(s)

        self.add_fiscal_notes(status_page, bill)

        return bill, list(votes)
Exemplo n.º 6
0
    def scrape(self):
        unreachable_urls = []

        for leg_summary in self.legislation(
                created_after=datetime.datetime(2015, 5, 17)):
            title = leg_summary['Title'].strip()

            if not title or not leg_summary['Intro\xa0Date']:
                continue
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
                # doesn't have an intro date

            bill_type = BILL_TYPES[leg_summary['Type']]

            bill_session = self.session(
                self.toTime(leg_summary['Intro\xa0Date']))
            bill = Bill(identifier=leg_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Chicago City Council"})

            bill.add_source(leg_summary['url'])

            try:
                leg_details = self.legDetails(leg_summary['url'])
            except IndexError:
                unreachable_urls.append(leg_summary['url'])
                yield bill
                continue

            for related_bill in leg_details.get('Related files', []):
                lower_title = title.lower()
                if "sundry" in title or "miscellaneous" in title:  #these are ominbus
                    bill.add_related_bill(
                        identifier=related_bill['label'],
                        legislative_session=bill.legislative_session,
                        relation_type='replaces')
                #for now we're skipping related bills if they
                #don't contain words that make us think they're
                #in a ominbus relationship with each other

            for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
                if i == 0:
                    primary = True
                    sponsorship_type = "Primary"
                else:
                    primary = False
                    sponsorship_type = "Regular"

                sponsor_name = sponsor['label']

                # Does the Mayor/Clerk introduce legisislation as
                # individuals role holders or as the OFfice of City
                # Clerk and the Office of the Mayor?
                entity_type = 'person'
                if sponsor_name.startswith(('City Clerk', 'Mendoza, Susana')):
                    sponsor_name = 'Office of the City Clerk'
                    entity_type = 'organization'
                elif sponsor_name.startswith(('Emanuel, Rahm', )):
                    sponsor_name = 'Office of the Mayor'
                    entity_type = 'organization'
                if not sponsor_name.startswith(
                    ('Misc. Transmittal', 'No Sponsor', 'Dept./Agency')):
                    bill.add_sponsorship(
                        sponsor_name,
                        sponsorship_type,
                        entity_type,
                        primary,
                        entity_id=_make_pseudo_id(name=sponsor_name))

            if 'Topic' in leg_details:
                for subject in leg_details[u'Topic'].split(','):
                    bill.add_subject(subject)

            for attachment in leg_details.get('Attachments', []):
                if attachment['label']:
                    bill.add_version_link(attachment['label'],
                                          attachment['url'],
                                          media_type="application/pdf")

            for action in self.history(leg_summary['url']):
                action_description = action['Action']
                try:
                    action_date = self.toTime(
                        action['Date']).date().isoformat()
                except AttributeError:  # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                    continue

                if action_description:
                    try:
                        responsible_org = action['Action\xa0By']['label']
                    except TypeError:
                        responsible_org = action['Action\xa0By']
                    if responsible_org == 'City Council':
                        responsible_org = 'Chicago City Council'

                    act = bill.add_action(
                        action_description,
                        action_date,
                        organization={'name': responsible_org},
                        classification=ACTION_CLASSIFICATION[
                            action_description])

                    if action_description == 'Referred':
                        try:
                            leg_details[
                                'Current Controlling Legislative Body'][
                                    'label']
                            controlling_bodies = [
                                leg_details[
                                    'Current Controlling Legislative Body']
                            ]
                        except TypeError:
                            controlling_bodies = leg_details[
                                'Current Controlling Legislative Body']
                        if controlling_bodies:
                            for controlling_body in controlling_bodies:
                                body_name = controlling_body['label']
                                if body_name.startswith("Joint Committee"):
                                    act.add_related_entity(
                                        body_name, 'organization')
                                else:
                                    act.add_related_entity(
                                        body_name,
                                        'organization',
                                        entity_id=_make_pseudo_id(
                                            name=body_name))

                    if 'url' in action['Action\xa0Details']:
                        action_detail_url = action['Action\xa0Details']['url']
                        result, votes = self.extractVotes(action_detail_url)

                        if votes and result:  # see https://github.com/datamade/municipal-scrapers-us/issues/15
                            action_vote = VoteEvent(
                                legislative_session=bill.legislative_session,
                                motion_text=action_description,
                                organization={'name': responsible_org},
                                classification=None,
                                start_date=action_date,
                                result=result,
                                bill=bill)
                            action_vote.add_source(action_detail_url)

                            for option, voter in votes:
                                action_vote.vote(option, voter)

                            yield action_vote

            bill.extras = {'local_classification': leg_summary['Type']}

            yield bill
        print(unreachable_urls)
Exemplo n.º 7
0
    def get_bill_info(self, chamber, session, bill_detail_url,
                      version_list_url):
        """
        Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        chamber = 'lower' if chamber.lower() == 'house' else chamber
        chamber = 'upper' if chamber.lower() == 'senate' else chamber

        # Get html and parse
        doc = self.lxmlize(bill_detail_url)

        # Check if bill hasn't been transmitted to the other chamber yet
        transmit_check = self.get_node(
            doc,
            '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()'
        )
        if (transmit_check is not None
                and 'has not been transmitted' in transmit_check.strip()):
            self.logger.debug('Bill has not been transmitted to other chamber '
                              '... skipping {0}'.format(bill_detail_url))
            return

        # Get the basic parts of the bill
        bill_id = self.get_node(doc, '//h1/text()')
        self.logger.debug(bill_id)
        bill_title_text = self.get_node(
            doc,
            '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()'
        )
        if bill_title_text is not None:
            bill_title = bill_title_text.strip()
        else:
            long_desc_url = self.get_node(
                doc, '//a[text()[contains(.,"Long Description")]]/@href')
            long_desc_page = self.lxmlize(long_desc_url)
            long_desc_text = self.get_node(
                long_desc_page, '//h1/'
                'following-sibling::p/text()')
            if long_desc_text is not None:
                bill_title = long_desc_text.strip()
            else:
                bill_title = 'No title found.'
                self.logger.warning('No title found for {}.'.format(bill_id))
        self.logger.debug(bill_title)
        bill_type = {
            'F': 'bill',
            'R': 'resolution',
            'C': 'concurrent resolution'
        }[bill_id[1]]
        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=bill_title,
                    classification=bill_type)

        # Add source
        bill.add_source(bill_detail_url)

        for subject in self._subject_mapping[bill_id]:
            bill.add_subject(subject)

        # Get companion bill.
        companion = doc.xpath('//table[@class="status_info"]//tr[1]/td[2]'
                              '/a[starts-with(@href, "?")]/text()')
        companion = self.make_bill_id(
            companion[0]) if len(companion) > 0 else None
        companion_chamber = self.chamber_from_bill(companion)
        if companion is not None:
            bill.add_companion(companion, chamber=companion_chamber)

        # Grab sponsors
        bill = self.extract_sponsors(bill, doc, chamber)

        # Add Actions performed on the bill.
        bill = self.extract_actions(bill, doc, chamber)

        # Get all versions of the bill.
        bill = self.extract_versions(bill, doc, chamber, version_list_url)

        yield bill
Exemplo n.º 8
0
    def scrape_matter(self, matter_link, sess):
        matter_types = {
        "Additions":"other",
        "Administrative Order":"order",
        "Annual Evaluation":"other",
        "Bid Advertisement":"other",
        "Bid Awards":"other",
        "Bid Contract":"contract",
        "Bid Protest":"other",
        "Bid Rejection":"other",
        "Birthday Scroll":"commemoration",
        "Certificate of Appreciation":"commemoration",
        "Change Order":"order",
        "Citizen's Presentation":"other",
        "Commendation":"commemoration",
        "Conflict Waiver":"other",
        "Congratulatory Certificate":"commemoration",
        "Deferrals":"other",
        "Discussion Item":"other",
        "Distinguished Visitor":"other",
        "Joint Meeting/Workshop":"other",
        "Mayoral Veto":"other",
        "Miscellaneous":"other",
        "Nomination":"nomination",
        "Oath of Office":"other",
        "Omnibus Reserve":"bill",
        "Ordinance":"ordinance",
        "Plaque":"commemoration",
        "Presentation":"other",
        "Proclamation":"proclamation",
        "Professional Service Agreement":"contract",
        "Public Hearing":"other",
        "Report":"other",
        "Request for Proposals":"other",
        "Request for Qualifications":"other",
        "Request to Advertise":"other",
        "Resolution":"resolution",
        "Resolution of Sympathy":"resolution",
        "Service Awards":"commemoration",
        "Special Item":"other",
        "Special Presentation":"other",
        "Supplement":"other",
        "Swearing-In":"other",
        "Time Sensitive Items":"other",
        "Withdrawals":"other",
        "Workshop Item":"other",
        "Zoning":"other",
        "Zoning Resolution":"resolution"
        }
        matter_doc = self.lxmlize(matter_link)
        info_dict = self.matter_table_to_dict(matter_doc)
        #we're going to use the year of the intro date as the session
        #until/unless we come up with something better
        intro_date = datetime.strptime(info_dict["Introduced"],"%m/%d/%Y")
        session = sess["identifier"]
        category = matter_types[info_dict["File Type"]]
        if 'File Name' in info_dict:
            title = info_dict["File Name"]
        elif "Title" in info_dict and info_dict["Title"].strip():
            title = info_dict["Title"].strip()
        else:
            self.warning("bill has no title")
            return
        if category == 'other':
            bill = Bill(identifier=info_dict["File Number"],
                legislative_session=session,
                title=title
                )
        else:
            bill = Bill(identifier=info_dict["File Number"],
                legislative_session=session,
                title=title,
                classification=category
                )
        for spons in info_dict["Sponsors"]:
            if spons == "NONE":
                continue
            try:
                name,spons_type = spons.rsplit(",",1)
            except ValueError:
                name = spons
                spons_type = "Sponsor"
            primary = True if "Prime Sponsor" in spons_type else False
            entity = "person"
            if "committee" in name:
                entity = committee
            bill.add_sponsorship(name,spons_type,entity,primary)
        if "Indexes" in info_dict:
            for subj in info_dict["Indexes"]:
                if subj.strip() and subj.strip() != "NONE":
                    bill.add_subject(subj.strip())
        if "Title" in info_dict and info_dict["Title"].strip():
            note = "bill's long title'"
            if ("Note" in info_dict and info_dict["Note"].strip()):
                note = info_dict["Note"]
            bill.add_abstract(abstract=info_dict["Title"],note=note)
        self.process_action_table(matter_doc,bill)
        bill.add_source(matter_link, note='web')

        yield bill
Exemplo n.º 9
0
    def scrape(self):
        unreachable_urls = []

        for leg_summary in self.legislation(created_after=datetime.datetime(2015, 5, 17)) :
            title = leg_summary['Title'].strip()

            if not title or not leg_summary['Intro\xa0Date'] :
                continue
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
                # doesn't have an intro date

            bill_type = BILL_TYPES[leg_summary['Type']]

            bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))
            bill = Bill(identifier=leg_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Chicago City Council"})

            bill.add_source(leg_summary['url'])

            try :
                leg_details = self.legDetails(leg_summary['url'])
            except IndexError :
                unreachable_urls.append(leg_summary['url'])
                yield bill
                continue

            for related_bill in leg_details.get('Related files', []) :
                lower_title = title.lower()
                if "sundry" in title or "miscellaneous" in title: #these are ominbus
                    bill.add_related_bill(identifier = related_bill['label'],
                                          legislative_session = bill.legislative_session,
                                          relation_type='replaces')
                #for now we're skipping related bills if they
                #don't contain words that make us think they're
                #in a ominbus relationship with each other
                
            for i, sponsor in enumerate(leg_details.get('Sponsors', [])) :
                if i == 0 :
                    primary = True
                    sponsorship_type = "Primary"
                else :
                    primary = False
                    sponsorship_type = "Regular"

                sponsor_name = sponsor['label']

                # Does the Mayor/Clerk introduce legisislation as
                # individuals role holders or as the OFfice of City
                # Clerk and the Office of the Mayor?
                entity_type = 'person'
                if sponsor_name.startswith(('City Clerk', 
                                            'Mendoza, Susana')) :
                    sponsor_name = 'Office of the City Clerk'
                    entity_type = 'organization'
                elif sponsor_name.startswith(('Emanuel, Rahm',)) :
                    sponsor_name = 'Office of the Mayor'
                    entity_type = 'organization'
                if not sponsor_name.startswith(('Misc. Transmittal',
                                                'No Sponsor',
                                                'Dept./Agency')) :
                    bill.add_sponsorship(sponsor_name, 
                                         sponsorship_type,
                                         entity_type,
                                         primary,
                                         entity_id = _make_pseudo_id(name=sponsor_name))

            if 'Topic' in leg_details :
                for subject in leg_details[u'Topic'].split(',') :
                    bill.add_subject(subject)

            for attachment in leg_details.get('Attachments', []) :
                if attachment['label'] :
                    bill.add_version_link(attachment['label'],
                                          attachment['url'],
                                          media_type="application/pdf")

            for action in self.history(leg_summary['url']) :
                action_description = action['Action']
                try :
                    action_date =  self.toTime(action['Date']).date().isoformat()
                except AttributeError : # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                    continue

                if action_description :
                    try :
                        responsible_org = action['Action\xa0By']['label']
                    except TypeError  :
                        responsible_org = action['Action\xa0By']
                    if responsible_org == 'City Council' :
                        responsible_org = 'Chicago City Council'

                    act = bill.add_action(action_description,
                                          action_date,
                                          organization={'name': responsible_org},
                                          classification=ACTION_CLASSIFICATION[action_description])

                    if action_description == 'Referred' :
                        try :
                            leg_details['Current Controlling Legislative Body']['label']
                            controlling_bodies = [leg_details['Current Controlling Legislative Body']]
                        except TypeError :
                            controlling_bodies = leg_details['Current Controlling Legislative Body']
                        if controlling_bodies :
                            for controlling_body in controlling_bodies :
                                body_name = controlling_body['label']
                                if body_name.startswith("Joint Committee") :
                                    act.add_related_entity(body_name,
                                                           'organization')
                                else :
                                    act.add_related_entity(body_name,
                                                           'organization',
                                                           entity_id = _make_pseudo_id(name=body_name))


                    if 'url' in action['Action\xa0Details'] :
                        action_detail_url = action['Action\xa0Details']['url']
                        result, votes = self.extractVotes(action_detail_url)

                        if votes and result : # see https://github.com/datamade/municipal-scrapers-us/issues/15
                            action_vote = VoteEvent(legislative_session=bill.legislative_session, 
                                               motion_text=action_description,
                                               organization={'name': responsible_org},
                                               classification=None,
                                               start_date=action_date,
                                               result=result,
                                               bill=bill)
                            action_vote.add_source(action_detail_url)

                            for option, voter in votes :
                                action_vote.vote(option, voter)

                            yield action_vote

            bill.extras = {'local_classification' : leg_summary['Type']}
                            
            yield bill
        print(unreachable_urls)
    def scrape_bill(self, chamber, session, bill_id, bill_type, url):
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        title = doc.xpath('//b[text()="TITLE:"]')
        if title:
            title = title[0].tail.strip().strip('"')
        else:
            self.warning("skipping bill %s, no information" % url)
            return

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            classification=bill_type,
            legislative_session=session,
        )
        bill.add_source(url)

        # Get sponsors
        spons_str = doc.xpath(
            '//b[contains(text(), "SPONSOR")]')[0].tail.strip()
        sponsors_match = re.match(
            '(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})', spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')
            sponsor = sponsors[0].strip()

            if sponsor:
                bill.add_sponsorship(
                    sponsors[0],
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsorship(
                        sponsor,
                        entity_type='person',
                        classification='cosponsor',
                        primary=False,
                    )
        else:
            # Committee sponsorship
            spons_str = spons_str.strip()

            if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
                spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$', '',
                                   spons_str).title()
                spons_str = (spons_str +
                             " Committee (by request of the governor)")

            if spons_str:
                bill.add_sponsorship(
                    spons_str,
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

        # Get actions from second myth table
        self._current_comm = None
        act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
        for row in act_rows:
            date, journal, raw_chamber, action = row.xpath('td')

            act_date = datetime.datetime.strptime(date.text_content().strip(),
                                                  '%m/%d/%y')
            raw_chamber = raw_chamber.text_content().strip()
            action = action.text_content().strip()

            if raw_chamber == "(H)":
                act_chamber = "lower"
            elif raw_chamber == "(S)":
                act_chamber = "upper"

            if re.match("\w+ Y(\d+)", action):
                vote_href = journal.xpath('.//a/@href')
                if vote_href:
                    yield from self.parse_vote(bill, action, act_chamber,
                                               act_date, vote_href[0])

            action, atype = self.clean_action(action)

            match = re.match('^Prefile released (\d+/\d+/\d+)$', action)
            if match:
                action = 'Prefile released'
                act_date = datetime.datetime.strptime(match.group(1),
                                                      '%m/%d/%y')

            bill.add_action(action,
                            chamber=act_chamber,
                            date=act_date.strftime('%Y-%m-%d'),
                            classification=atype)

        # Get subjects
        for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
            bill.add_subject(subj.strip())

        # Get versions
        text_list_url = ("http://www.legis.state.ak.us/"
                         "basis/get_fulltext.asp?session=%s&bill=%s") % (
                             session, bill_id)
        bill.add_source(text_list_url)

        text_doc = lxml.html.fromstring(self.get(text_list_url).text)
        text_doc.make_links_absolute(text_list_url)
        for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
            name = link.xpath('../preceding-sibling::td/text()')[0].strip()
            text_url = link.get('href')
            bill.add_version_link(name, text_url, media_type="text/html")

        # Get documents
        doc_list_url = ("http://www.legis.state.ak.us/"
                        "basis/get_documents.asp?session=%s&bill=%s") % (
                            session, bill_id)
        doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
        doc_list.make_links_absolute(doc_list_url)
        bill.add_source(doc_list_url)
        for href in doc_list.xpath(
                '//a[contains(@href, "get_documents")][@onclick]'):
            h_name = href.text_content()
            h_href = href.attrib['href']
            if h_name.strip():
                bill.add_document_link(h_name, h_href)

        yield bill
Exemplo n.º 11
0
    def scrape_bill(self, chamber, session, bill_id):
        # there will be a space in bill_id if we're doing a one-off bill scrape
        # convert HB 102 into H102
        if ' ' in bill_id:
            bill_id = bill_id[0] + bill_id.split(' ')[-1]

        # if chamber comes in as House/Senate convert to lower/upper
        if chamber == 'Senate':
            chamber = 'upper'
        elif chamber == 'House':
            chamber = 'lower'

        bill_detail_url = (
            'http://www.ncleg.net/gascripts/'
            'BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all'
        ) % (session, bill_id)

        # parse the bill data page, finding the latest html text
        data = self.get(bill_detail_url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(bill_detail_url)

        title_div_txt = doc.xpath(
            '//td[@style="text-align: center; white-space: nowrap; '
            'width: 60%; font-weight: bold; font-size: x-large;"]/text()')[0]
        if 'Joint Resolution' in title_div_txt:
            bill_type = 'joint resolution'
            bill_id = bill_id[0] + 'JR ' + bill_id[1:]
        elif 'Resolution' in title_div_txt:
            bill_type = 'resolution'
            bill_id = bill_id[0] + 'R ' + bill_id[1:]
        elif 'Bill' in title_div_txt:
            bill_type = 'bill'
            bill_id = bill_id[0] + 'B ' + bill_id[1:]

        bill_title = doc.xpath('//div[@id="title"]')[0].text_content()

        bill = Bill(bill_id,
                    legislative_session=session,
                    title=bill_title,
                    chamber=chamber,
                    classification=bill_type)
        bill.add_source(bill_detail_url)

        # skip first PDF link (duplicate link to cur version)
        if chamber == 'lower':
            link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]'
        else:
            link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]'
        for vlink in doc.xpath(link_xpath)[1:]:
            # get the name from the PDF link...
            version_name = vlink.text.replace(u'\xa0', ' ')
            # but neighboring span with anchor inside has the HTML version
            version_url = vlink.xpath('./following-sibling::span/a/@href')
            version_url = 'http://www.ncleg.net' + version_url[0]
            bill.add_version_link(version_name,
                                  version_url,
                                  media_type='text/html',
                                  on_duplicate='ignore')

        # sponsors
        spon_td = doc.xpath(
            '//th[text()="Sponsors:"]/following-sibling::td')[0]
        # first sponsors are primary, until we see (Primary)
        spon_type = 'primary'
        for leg in spon_td.text_content().split(';'):
            name = leg.replace(u'\xa0', ' ').strip()
            if name.startswith('(Primary)'):
                name = name.replace('(Primary)', '').strip()
                spon_type = 'cosponsor'
            if not name:
                continue
            bill.add_sponsorship(name,
                                 classification=spon_type,
                                 entity_type='person',
                                 primary=(spon_type == 'primary'))

        # keywords
        kw_td = doc.xpath('//th[text()="Keywords:"]/following-sibling::td')[0]
        for subject in kw_td.text_content().split(', '):
            bill.add_subject(subject)

        # actions
        action_tr_xpath = '//td[starts-with(text(),"History")]/../../tr'
        # skip two header rows
        for row in doc.xpath(action_tr_xpath)[2:]:
            tds = row.xpath('td')
            act_date = tds[0].text
            actor = tds[1].text or ''
            # if text is blank, try diving in
            action = tds[2].text.strip() or tds[2].text_content().strip()

            act_date = dt.datetime.strptime(act_date,
                                            '%m/%d/%Y').strftime('%Y-%m-%d')

            if actor == 'Senate':
                actor = 'upper'
            elif actor == 'House':
                actor = 'lower'
            else:
                actor = 'executive'

            for pattern, atype in self._action_classifiers.items():
                if action.startswith(pattern):
                    break
            else:
                atype = None

            bill.add_action(action,
                            act_date,
                            chamber=actor,
                            classification=atype)

        yield from self.scrape_votes(bill, doc)

        yield bill
Exemplo n.º 12
0
    def _scrape_bills(self):
        """
        Does the following

        1) Scrapes bill data from unitedstates project and saves the data to path specified in UnitedStates module
        2) Iterates over bill data and converts each one to an OCD-compliant bill model.
        3) Yields the OCD-compliant bill model instance

        @return: generator for federal US bills in OCD-compliant format
        @rtype: generator
        """

        # run scraper first to pull in all the bill data
        self._run_unitedstates_bill_scraper()
        # iterate over all the files and build and yield Bill objects
        for filename in find_files(settings.SCRAPED_DATA_DIR, '.*/data/[0-9]+/bills/[^\/]+/[^\/]+/data.json'):
            try:
                with open(filename) as json_file:
                    json_data = json.load(json_file)

                    # Initialize Object
                    bill = Bill(constants.TYPE_MAP[json_data['bill_type']]['canonical'] + ' ' + json_data['number'],
                                json_data['congress'],
                                json_data['official_title'],
                                chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber']
                    )

                    # add source of data
                    bill.add_source(json_data['url'], note='all')

                    # add subjects
                    for subject in json_data['subjects']:
                        bill.add_subject(subject)

                    # add summary
                    if 'summary' in json_data and json_data['summary'] is not None:
                        bill.add_abstract(json_data['summary']['text'],
                                          json_data['summary']['as'],
                                          json_data['summary']['date'])

                    # add titles
                    for item in json_data['titles']:
                        bill.add_title(item['title'], item['type'])

                    # add other/related Bills
                    for b in json_data['related_bills']:
                        if 'type' in b and b['type'] == 'bill':
                            split = b['bill_id'].split('-')
                            m = UnitedStatesBillScraper.BILL_SPLIT.match(split[0])

                            bill.add_related_bill(constants.TYPE_MAP[m.group(1)]['canonical'] + ' ' + m.group(2),
                                                  legislative_session=split[1],
                                                  relation_type='companion')

                    # add sponsor
                    bill.add_sponsorship_by_identifier(json_data['sponsor']['name'], 'person', 'person', True,
                                                       scheme='thomas_id', identifier=json_data['sponsor']['thomas_id'],
                                                       chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'])

                    # add cosponsors
                    for cs in json_data['cosponsors']:
                        bill.add_sponsorship_by_identifier(cs['name'], 'person', 'person', False,
                                                           scheme='thomas_id', identifier=cs['thomas_id'],
                                                           chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'])

                    # add introduced_at and actions
                    bill.add_action('date of introduction', datetime_to_date(json_data['introduced_at']),
                                    chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'],
                                    related_entities=[])

                    # add other actions
                    for action in json_data['actions']:
                        bill.actions.append({'date': datetime_to_date(action['acted_at']),
                                             'type': [action['type']],
                                             'description': action['text'],
                                             'actor': constants.TYPE_MAP[json_data['bill_type']]['chamber'],
                                             'related_entities': []
                                             })

                    # add bill versions
                    for version_path in find_files(os.path.join(settings.SCRAPED_DATA_DIR,
                                                   'data', bill.legislative_session, 'bills', json_data['bill_type'],
                                                   json_data['bill_type'] + json_data['number'],
                                                   'text-versions'), '/.*/*\.json'):
                        try:
                            with open(version_path) as version_file:
                                version_json_data = json.load(version_file)
                                for k, v in version_json_data['urls'].items():
                                    bill.versions.append({'date': datetime_to_date(version_json_data['issued_on']),
                                      'type': version_json_data['version_code'],
                                      'name': constants.VERSION_MAP[version_json_data['version_code']],
                                      'links': [{'mimetype': k, 'url': v}]})
                        except IOError:
                            print("Unable to open or parse file with path " + version_path)
                            continue

                    # finally yield bill object
                    yield bill

            except IOError:
                print("Unable to open file with path " + filename)
                print(traceback.format_exc())
                continue
            except KeyError:
                print("Unable to parse file with path " + filename)
                print(traceback.format_exc())
                continue
            except:
                print('Unknown error with ' + filename)
                print(traceback.format_exc())
                continue
Exemplo n.º 13
0
    def scrape_bill(self, session, history_url):
        history_xml = self.get(history_url).text
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if (bill_title is None or
                "Bill does not exist" in history_xml):
            self.warning("Bill does not appear to exist")
            return
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        if bill_id[1] == 'B':
            bill_type = ['bill']
        elif bill_id[1] == 'R':
            bill_type = ['resolution']
        elif bill_id[1:3] == 'CR':
            bill_type = ['concurrent resolution']
        elif bill_id[1:3] == 'JR':
            bill_type = ['joint resolution']
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type
        )

        bill.add_source(history_url)

        for subject in root.iterfind('subjects/subject'):
            bill.add_subject(subject.text.strip())

        versions = [x for x in self.versions if x[0] == bill_id]
        for version in versions:
            bill.add_version_link(
                note=self.NAME_SLUGS[version[1][-5]],
                url=version[1],
                media_type='text/html'
            )

        analyses = [x for x in self.analyses if x[0] == bill_id]
        for analysis in analyses:
            bill.add_document_link(
                note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]),
                url=analysis[1],
                media_type='text/html'
            )

        fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
        for fiscal_note in fiscal_notes:
            bill.add_document_link(
                note="Fiscal Note ({})".format(self.NAME_SLUGS
                                               [fiscal_note[1][-5]]),
                url=fiscal_note[1],
                media_type='text/html'
            )

        witnesses = [x for x in self.witnesses if x[0] == bill_id]
        for witness in witnesses:
            bill.add_document_link(
                note="Witness List ({})".format(self.NAME_SLUGS
                                                [witness[1][-5]]),
                url=witness[1],
                media_type='text/html'
            )

        for action in root.findall('actions/action'):
            act_date = datetime.datetime.strptime(action.findtext('date'),
                                                  "%m/%d/%Y").date()

            action_number = action.find('actionNumber').text
            actor = {'H': 'lower',
                     'S': 'upper',
                     'E': 'executive'}[action_number[0]]

            desc = action.findtext('description').strip()

            if desc == 'Scheduled for public hearing on . . .':
                self.warning("Skipping public hearing action with no date")
                continue

            introduced = False

            if desc == 'Amended':
                atype = 'amendment-passage'
            elif desc == 'Amendment(s) offered':
                atype = 'amendment-introduction'
            elif desc == 'Amendment amended':
                atype = 'amendment-amendment'
            elif desc == 'Amendment withdrawn':
                atype = 'amendment-withdrawal'
            elif desc == 'Passed' or desc == 'Adopted':
                atype = 'passage'
            elif re.match(r'^Received (by|from) the', desc):
                if 'Secretary of the Senate' not in desc:
                    atype = 'introduction'
                else:
                    atype = 'filing'
            elif desc.startswith('Sent to the Governor'):
                # But what if it gets lost in the mail?
                atype = 'executive-receipt'
            elif desc.startswith('Signed by the Governor'):
                atype = 'executive-signature'
            elif desc == 'Vetoed by the Governor':
                atype = 'executive-veto'
            elif desc == 'Read first time':
                atype = ['introduction', 'reading-1']
                introduced = True
            elif desc == 'Read & adopted':
                atype = ['passage']
                if not introduced:
                    introduced = True
                    atype.append('introduction')
            elif desc == "Passed as amended":
                atype = 'passage'
            elif (desc.startswith('Referred to') or
                    desc.startswith("Recommended to be sent to ")):
                atype = 'referral-committee'
            elif desc == "Reported favorably w/o amendment(s)":
                atype = 'committee-passage'
            elif desc == "Filed":
                atype = 'filing'
            elif desc == 'Read 3rd time':
                atype = 'reading-3'
            elif desc == 'Read 2nd time':
                atype = 'reading-2'
            elif desc.startswith('Reported favorably'):
                atype = 'committee-passage-favorable'
            else:
                atype = None

            act = bill.add_action(
                action.findtext('description'),
                act_date,
                chamber=actor,
                classification=atype
            )

            if atype and 'referral-committee' in atype:
                repls = [
                    'Referred to',
                    "Recommended to be sent to "
                ]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                act.add_related_entity(name=ctty, entity_type='organization')

        for author in root.findtext('authors').split(' | '):
            if author != "":
                bill.add_sponsorship(author, classification='primary',
                                     entity_type='person', primary=True)
        for coauthor in root.findtext('coauthors').split(' | '):
            if coauthor != "":
                bill.add_sponsorship(coauthor, classification='cosponsor',
                                     entity_type='person', primary=False)
        for sponsor in root.findtext('sponsors').split(' | '):
            if sponsor != "":
                bill.add_sponsorship(sponsor, classification='primary',
                                     entity_type='person', primary=True)
        for cosponsor in root.findtext('cosponsors').split(' | '):
            if cosponsor != "":
                bill.add_sponsorship(cosponsor, classification='cosponsor',
                                     entity_type='person', primary=False)

        if root.findtext('companions'):
            self._get_companion(bill)

        yield bill
Exemplo n.º 14
0
    def scrape_bills(self, chamber_to_scrape, session):
        url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

        bill_dir_page = self.get(url)
        root = lxml.etree.fromstring(bill_dir_page.content)
        for mr in root.xpath('//LASTACTION/MSRGROUP'):
            bill_id = mr.xpath('string(MEASURE)').replace(" ", "")
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"

            bill_type = {'B': 'bill', 'C': 'concurrent resolution',
                         'R': 'resolution', 'N': 'nomination'}[bill_id[1]]

            # just skip past bills that are of the wrong chamber
            if chamber != chamber_to_scrape:
                continue

            link = mr.xpath('string(ACTIONLINK)').replace("..", "")
            main_doc = mr.xpath('string(MEASURELINK)').replace("../../../", "")
            main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
            bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf%s' % (session, link)
            try:
                details_page = self.get(bill_details_url)
            except scrapelib.HTTPError:
                self.warning('Bill page not loading for {}; skipping'.format(bill_id))
                continue

            page = details_page.content
            # Some pages have the (invalid) byte 11 sitting around. Just drop
            # them out. Might as well.

            details_root = lxml.etree.fromstring(page)
            title = details_root.xpath('string(//SHORTTITLE)')
            longtitle = details_root.xpath('string(//LONGTITLE)')

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=title,
                        classification=bill_type)
            bill.extras['summary'] = longtitle
            bill.add_source(main_doc_url)
            # sponsors
            main_sponsor = details_root.xpath('string(//P_NAME)').split()
            if main_sponsor:
                main_sponsor = main_sponsor[0]
                main_sponsor_link = details_root.xpath('string(//P_LINK)').replace(" ", "_")
                main_sponsor_url = ('http://billstatus.ls.state.ms.us/%s/'
                                    'pdf/%s') % (session, main_sponsor_link.strip('../'))
                type = "primary"
                bill.add_source(main_sponsor_url)
                bill.add_sponsorship(main_sponsor,
                                     classification=type,
                                     entity_type='person',
                                     primary=True)

            for author in details_root.xpath('//AUTHORS/ADDITIONAL'):
                leg = author.xpath('string(CO_NAME)').replace(" ", "_")
                if leg:
                    leg_url = ('http://billstatus.ls.state.ms.us/%s/'
                               'pdf/House_authors/%s.xml') % (session, leg)
                    type = "cosponsor"
                    bill.add_source(leg_url)
                    bill.add_sponsorship(leg,
                                         classification=type,
                                         entity_type='person',
                                         primary=False
                                         )
            # Versions
            curr_version = details_root.xpath('string(//CURRENT_OTHER'
                                              ')').replace("../../../../", "")
            if curr_version != "":
                curr_version_url = "http://billstatus.ls.state.ms.us/" \
                        + curr_version
                bill.add_version_link("Current version", curr_version_url,
                                      on_duplicate="ignore",
                                      media_type="text/html"
                                      )

            intro_version = details_root.xpath('string(//INTRO_OTHER)').replace("../../../../", "")
            if intro_version != "":
                intro_version_url = "http://billstatus.ls.state.ms.us/"\
                        + intro_version
                bill.add_version_link("As Introduced", intro_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

            comm_version = details_root.xpath('string(//CMTESUB_OTHER'
                                              ')').replace("../../../../", "")
            if comm_version.find("documents") != -1:
                comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                bill.add_version_link("Committee Substitute", comm_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')
            passed_version = details_root.xpath('string(//PASSED_OTHER'
                                                ')').replace("../../../../", "")
            if passed_version.find("documents") != -1:
                passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
                title = "As Passed the " + chamber
                bill.add_version_link(title, passed_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

            asg_version = details_root.xpath('string(//ASG_OTHER)').replace("../../../../", "")
            if asg_version.find("documents") != -1:
                asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                bill.add_version_link("Approved by the Governor", asg_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

            # amendments
            # ex: http://billstatus.ls.state.ms.us/2018/pdf/history/HB/HB1040.xml
            for amd in details_root.xpath('//AMENDMENTS/*'):
                if amd.tag == 'HAM':
                    name = amd.xpath('HAM_DESC[1]/text()')[0]
                    name = append_parens(amd, 'HAM_DISP', name)
                    name = append_parens(amd, 'HAM_VDESC', name)

                    pdf_url = amd.xpath('string(HAM_PDF'
                                        ')').replace("../", "")

                    html_url = amd.xpath('string(HAM_OTHER'
                                         ')').replace("../", "")
                elif amd.tag == 'SAM':
                    name = amd.xpath('SAM_DESC[1]/text()')[0]
                    name = append_parens(amd, 'SAM_DISP', name)
                    name = append_parens(amd, 'SAM_VDESC', name)

                    pdf_url = amd.xpath('string(SAM_PDF'
                                        ')').replace("../", "")

                    html_url = amd.xpath('string(SAM_OTHER'
                                         ')').replace("../", "")
                elif amd.tag == 'AMRPT':
                    name = amd.xpath('AMRPT_DESC[1]/text()')[0]
                    pdf_url = amd.xpath('string(AMRPT_PDF'
                                        ')').replace("../", "")

                    html_url = amd.xpath('string(AMRPT_OTHER'
                                         ')').replace("../", "")

                pdf_url = 'http://billstatus.ls.state.ms.us/' + pdf_url
                html_url = 'http://billstatus.ls.state.ms.us/' + html_url

                if 'adopted' in name.lower() or 'amendment report' in name.lower():
                    bill.add_version_link(name, pdf_url,
                                          on_duplicate='ignore',
                                          media_type='application/pdf')
                    bill.add_version_link(name, html_url,
                                          on_duplicate='ignore',
                                          media_type='text/html')

            # avoid duplicate votes
            seen_votes = set()

            # Actions
            for action in details_root.xpath('//HISTORY/ACTION'):
                # action_num  = action.xpath('string(ACT_NUMBER)').strip()
                # action_num = int(action_num)
                act_vote = action.xpath('string(ACT_VOTE)').replace("../../../..", "")
                action_desc = action.xpath('string(ACT_DESC)')
                date, action_desc = action_desc.split(" ", 1)
                date = date + "/" + session[0:4]
                date = datetime.strptime(date, "%m/%d/%Y")

                if action_desc.startswith("(H)"):
                    actor = "lower"
                    action = action_desc[4:]
                elif action_desc.startswith("(S)"):
                    actor = "upper"
                    action = action_desc[4:]
                else:
                    actor = "executive"
                    action = action_desc

                if "Veto" in action and actor == 'executive':
                    version_path = details_root.xpath("string(//VETO_OTHER)")
                    version_path = version_path.replace("../../../../", "")
                    version_url = "http://billstatus.ls.state.ms.us/" + version_path
                    bill.add_document_link("Veto", version_url)

                atype = 'other'
                for prefix, prefix_type in self._action_types:
                    if action.startswith(prefix):
                        atype = prefix_type
                        break

                bill.add_action(action, self._tz.localize(date),
                                chamber=actor,
                                classification=atype if atype != 'other' else None)

                # use committee names as scraped subjects
                subjects = details_root.xpath('//H_NAME/text()')
                subjects += details_root.xpath('//S_NAME/text()')

                for subject in subjects:
                    if subject not in bill.subject:
                        bill.add_subject(subject)

                if act_vote:
                    vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                    if vote_url not in seen_votes:
                        seen_votes.add(vote_url)
                        yield from self.scrape_votes(vote_url, action,
                                                     date, actor, bill)

            bill.add_source(bill_details_url)
            yield bill
Exemplo n.º 15
0
    def get_bill(self, matter):
        '''Make Bill object from given matter.'''
        
        '''
        Currently, NYC Legistar does not have conventional "Types" for 
        three newly added committees: https://legistar.council.nyc.gov/Departments.aspx
        We communicated the issue to NYC, and until we learn more, we will
        skip the bills attached to those committees.
        '''
        orgs_without_type = ['Charter Revision Commission 2019',
                             'New York City Advisory Commission on Property Tax Reform',
                             'Democratic Conference of the Council of the City of New York']
        if matter['MatterBodyName'].strip() in orgs_without_type:
            return None

        matter_id = matter['MatterId']
        if matter_id in DUPLICATED_ACTIONS:
            return None

        date = matter['MatterIntroDate']
        title = matter['MatterName']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            return None

        leg_type = BILL_TYPES[matter['MatterTypeName']]

        bill_session = self.sessions(self.toTime(date))

        bill = Bill(identifier=identifier,
                    title=title,
                    classification=leg_type,
                    legislative_session=bill_session,
                    from_organization={"name": "New York City Council"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        if matter['MatterTitle']:
            bill.add_title(matter['MatterTitle'])

        if matter['MatterEXText5']:
            bill.add_abstract(matter['MatterEXText5'], note='')

        try:
            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        for attachment in self.attachments(matter_id):

            if attachment['MatterAttachmentId'] == 103315:  # Duplicate
                return None

            if attachment['MatterAttachmentName']:
                bill.add_document_link(attachment['MatterAttachmentName'],
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type='application/pdf')

        for topic in self.topics(matter_id) :
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                return None
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(identifier=identifier,
                                      legislative_session=related_bill_session,
                                      relation_type='companion')

        try:
            text = self.text(matter_id)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        bill.extras['local_classification'] = matter['MatterTypeName']

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain'].replace(u'\u0000', '')

            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

        return bill
Exemplo n.º 16
0
    def scrape(self, window=28) :
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        for matter in self.matters(n_days_ago) :
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)) :
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Board of Directors"})
            
            legistar_web = matter['legistar_url']
            
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id) :
                act = bill.add_action(**action)

                if action['description'] == 'Referred' :
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(body_name,
                                           'organization',
                                           entity_id = _make_pseudo_id(name=body_name))

                result, votes = vote
                if result :
                    vote_event = VoteEvent(legislative_session=bill.legislative_session, 
                                           motion_text=action['description'],
                                           organization=action['organization'],
                                           classification=None,
                                           start_date=action['date'],
                                           result=result,
                                           bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes :
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                             raw_option)
                        vote_event.vote(clean_option, 
                                        vote['VotePersonName'].strip())

                    yield vote_event


            for sponsorship in self.sponsorships(matter_id) :
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id) :
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill. 
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(identifier=identifier,
                                          legislative_session=related_bill_session,
                                          relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link('Board Report',
                                  'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id),
                                   media_type="application/pdf")

            for attachment in self.attachments(matter_id) :
                if attachment['MatterAttachmentName'] :
                    bill.add_document_link(attachment['MatterAttachmentName'],
                                           attachment['MatterAttachmentHyperlink'],
                                           media_type="application/pdf")

            bill.extras = {'local_classification' : matter['MatterTypeName']}

            text = self.text(matter_id)

            if text :
                if text['MatterTextPlain'] :
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf'] :
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

            yield bill
Exemplo n.º 17
0
    def scrape_bill(self, session, bill_url):
        page = self.get(bill_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        try:
            bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
        except IndexError:
            self.logger.warning("Something is wrong with bill page, skipping.")
            return
        secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

        # checking if there is a matching bill
        if secondary_bill_id:
            secondary_bill_id = secondary_bill_id[0].text
            # swap ids if * is in secondary_bill_id
            if '*' in secondary_bill_id:
                bill_id, secondary_bill_id = secondary_bill_id, bill_id
                secondary_bill_id = secondary_bill_id.strip()
            secondary_bill_id = secondary_bill_id.replace('  ', ' ')

        bill_id = bill_id.replace('*', '').replace('  ', ' ').strip()

        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'

        primary_chamber = 'lower' if 'H' in bill_id else 'upper'
        # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

        title = page.xpath("//span[@id='lblAbstract']")[0].text
        if title is None:
            msg = '%s detail page was missing title info.'
            self.logger.warning(msg % bill_id)
            return

        # bill subject
        subject_pos = title.find('-')
        subjects = [s.strip() for s in title[:subject_pos - 1].split(',')]
        subjects = filter(None, subjects)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=primary_chamber,
            title=title,
            classification=bill_type,
        )
        for subject in subjects:
            bill.add_subject(subject)

        if secondary_bill_id:
            bill.add_identifier(secondary_bill_id)

        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = page.xpath("//span[@id='lblBillPrimeSponsor']")[0].text_content().split("by")[-1]
        sponsor = sponsor.replace('*', '').strip()
        if sponsor:
            bill.add_sponsorship(
                sponsor,
                classification='primary',
                entity_type='person',
                primary=True,
            )

        # bill text
        btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
        bill.add_version_link('Current Version', btext.get('href'),
                              media_type='application/pdf')

        # documents
        summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
        if summary:
            bill.add_document_link('Summary', summary[0].get('href'))
        fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
        if fiscal:
            bill.add_document_link('Fiscal Note', fiscal[0].get('href'))
        amendments = page.xpath('//a[contains(@href, "/Amend/")]')
        for amendment in amendments:
            bill.add_document_link('Amendment ' + amendment.text, amendment.get('href'))
        # amendment notes in image with alt text describing doc inside <a>
        amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
        for afn in amend_fns:
            bill.add_document_link(
                afn.get('alt'),
                afn.getparent().get('href'),
                on_duplicate='ignore'
            )

        # actions
        atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
        actions_from_table(bill, atable)

        # if there is a matching bill
        if secondary_bill_id:
            # secondary sponsor
            secondary_sponsor = page.xpath(
                "//span[@id='lblCompPrimeSponsor']")[0].text_content().split("by")[-1]
            secondary_sponsor = secondary_sponsor.replace('*', '').replace(')', '').strip()
            # Skip black-name sponsors.
            if secondary_sponsor:
                bill.add_sponsorship(
                    secondary_sponsor,
                    classification='primary',
                    entity_type='person',
                    primary=True,
                )

            # secondary actions
            cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
            actions_from_table(bill, cotable)

        # votes
        yield from self.scrape_vote_events(bill, page, bill_url)

        bill.actions.sort(key=lambda a: a['date'])
        yield bill
Exemplo n.º 18
0
    def scrape_details(self, bill_detail_url, session, chamber, bill_id):
        """
        Create the Bill and add the information obtained from the provided bill_detail_url.
        and then yield the bill object.
        :param bill_detail_url:
        :param session:
        :param chamber:
        :param bill_id:
        :return:
        """
        page = self.get(bill_detail_url).text

        if 'INVALID BILL NUMBER' in page:
            self.warning('INVALID BILL %s' % bill_detail_url)
            return

        doc = lxml.html.fromstring(page)
        doc.make_links_absolute(bill_detail_url)

        bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

        bill_type = bill_div.xpath('span/text()')[0]

        if 'General Bill' in bill_type:
            bill_type = 'bill'
        elif 'Concurrent Resolution' in bill_type:
            bill_type = 'concurrent resolution'
        elif 'Joint Resolution' in bill_type:
            bill_type = 'joint resolution'
        elif 'Resolution' in bill_type:
            bill_type = 'resolution'
        else:
            raise ValueError('unknown bill type: %s' % bill_type)

        # this is fragile, but less fragile than it was
        b = bill_div.xpath('./b[text()="Summary:"]')[0]
        bill_summary = b.getnext().tail.strip()

        bill = Bill(
            bill_id,
            legislative_session=session,  # session name metadata's `legislative_sessions`
            chamber=chamber,  # 'upper' or 'lower'
            title=bill_summary,
            classification=bill_type
        )

        subjects = list(self._subjects[bill_id])

        for subject in subjects:
            bill.add_subject(subject)

        # sponsors
        for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
            bill.add_sponsorship(
                name=sponsor,
                classification='primary',
                primary=True,
                entity_type='person'
            )
        for sponsor in doc.xpath('//a[contains(@href, "committee.php")]/text()'):
            sponsor = sponsor.replace(u'\xa0', ' ').strip()
            bill.add_sponsorship(
                name=sponsor,
                classification='primary',
                primary=True,
                entity_type='organization'
            )

        # find versions
        version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
        version_html = self.get(version_url).text
        version_doc = lxml.html.fromstring(version_html)
        version_doc.make_links_absolute(version_url)
        for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
            # duplicate versions with same date, use first appearance

            bill.add_version_link(
                note=version.text,  # Description of the version from the state;
                                    #  eg, 'As introduced', 'Amended', etc.
                url=version.get('href'),
                on_duplicate='ignore',
                media_type='text/html'  # Still a MIME type
            )

        # actions
        for row in bill_div.xpath('table/tr'):
            date_td, chamber_td, action_td = row.xpath('td')

            date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")
            action_chamber = {'Senate': 'upper',
                              'House': 'lower',
                              None: 'legislature'}[chamber_td.text]

            action = action_td.text_content()
            action = action.split('(House Journal')[0]
            action = action.split('(Senate Journal')[0].strip()

            atype = action_type(action)

            bill.add_action(
                description=action,  # Action description, from the state
                date=date.strftime('%Y-%m-%d'),  # `YYYY-MM-DD` format
                chamber=action_chamber,  # 'upper' or 'lower'
                classification=atype  # Options explained in the next section
            )

        # votes
        vurl = doc.xpath('//a[text()="View Vote History"]/@href')
        if vurl:
            vurl = vurl[0]
            yield from self.scrape_vote_history(bill, vurl)

        bill.add_source(bill_detail_url)
        yield bill
Exemplo n.º 19
0
    def get_bill(self, matter):
        '''Make Bill object from given matter.'''
        matter_id = matter['MatterId']
        if matter_id in DUPLICATED_ACTIONS:
            return None

        date = matter['MatterIntroDate']
        title = matter['MatterName']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            return None

        leg_type = BILL_TYPES[matter['MatterTypeName']]

        bill_session = self.sessions(self.toTime(date))

        bill = Bill(identifier=identifier,
                    title=title,
                    classification=leg_type,
                    legislative_session=bill_session,
                    from_organization={"name": "New York City Council"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        if matter['MatterTitle']:
            bill.add_title(matter['MatterTitle'])

        if matter['MatterEXText5']:
            bill.add_abstract(matter['MatterEXText5'], note='')

        try:
            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        for attachment in self.attachments(matter_id):

            if attachment['MatterAttachmentId'] == 103315:  # Duplicate
                return None

            if attachment['MatterAttachmentName']:
                bill.add_document_link(attachment['MatterAttachmentName'],
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type='application/pdf')

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                related_bill = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                return None
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(identifier=identifier,
                                      legislative_session=related_bill_session,
                                      relation_type='companion')

        try:
            text = self.text(matter_id)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        bill.extras['local_classification'] = matter['MatterTypeName']

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain'].replace(
                    u'\u0000', '')

            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        return bill
Exemplo n.º 20
0
    def scrape_bill(self, chamber, session, bill_id, bill_type, url):
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        title = doc.xpath('//b[text()="TITLE:"]')
        if title:
            title = title[0].tail.strip().strip('"')
        else:
            self.warning("skipping bill %s, no information" % url)
            return

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            classification=bill_type,
            legislative_session=session,
        )
        bill.add_source(url)

        # Get sponsors
        spons_str = doc.xpath('//b[contains(text(), "SPONSOR")]')[0].tail.strip()
        sponsors_match = re.match(
            '(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
            spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')
            sponsor = sponsors[0].strip()

            if sponsor:
                bill.add_sponsorship(
                    sponsors[0],
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsorship(
                        sponsor,
                        entity_type='person',
                        classification='cosponsor',
                        primary=False,
                    )
        else:
            # Committee sponsorship
            spons_str = spons_str.strip()

            if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
                spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$',
                                   '', spons_str).title()
                spons_str = (spons_str +
                             " Committee (by request of the governor)")

            if spons_str:
                bill.add_sponsorship(
                    spons_str,
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

        # Get actions from second myth table
        self._current_comm = None
        act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
        for row in act_rows:
            date, journal, raw_chamber, action = row.xpath('td')

            act_date = datetime.datetime.strptime(date.text_content().strip(),
                                                  '%m/%d/%y')
            raw_chamber = raw_chamber.text_content().strip()
            action = action.text_content().strip()

            if raw_chamber == "(H)":
                act_chamber = "lower"
            elif raw_chamber == "(S)":
                act_chamber = "upper"

            if re.match("\w+ Y(\d+)", action):
                vote_href = journal.xpath('.//a/@href')
                if vote_href:
                    yield from self.parse_vote(bill, action, act_chamber, act_date,
                                               vote_href[0])

            action, atype = self.clean_action(action)

            match = re.match('^Prefile released (\d+/\d+/\d+)$', action)
            if match:
                action = 'Prefile released'
                act_date = datetime.datetime.strptime(match.group(1), '%m/%d/%y')

            bill.add_action(
                action, chamber=act_chamber, date=act_date.strftime('%Y-%m-%d'),
                classification=atype)

        # Get subjects
        for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
            bill.add_subject(subj.strip())

        # Get versions
        text_list_url = (
            "http://www.legis.state.ak.us/"
            "basis/get_fulltext.asp?session=%s&bill=%s"
        ) % (session, bill_id)
        bill.add_source(text_list_url)

        text_doc = lxml.html.fromstring(self.get(text_list_url).text)
        text_doc.make_links_absolute(text_list_url)
        for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
            name = link.xpath('../preceding-sibling::td/text()')[0].strip()
            text_url = link.get('href')
            bill.add_version_link(name, text_url, media_type="text/html")

        # Get documents
        doc_list_url = (
            "http://www.legis.state.ak.us/"
            "basis/get_documents.asp?session=%s&bill=%s"
        ) % (session, bill_id)
        doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
        doc_list.make_links_absolute(doc_list_url)
        bill.add_source(doc_list_url)
        for href in doc_list.xpath('//a[contains(@href, "get_documents")][@onclick]'):
            h_name = href.text_content()
            h_href = href.attrib['href']
            if h_name.strip():
                bill.add_document_link(h_name, h_href)

        yield bill
Exemplo n.º 21
0
    def scrape_bill(self, chamber, session, bill_id, title, url):
        page = self.lxmlize(url)

        if re.match(r'^(S|H)B ', bill_id):
            btype = ['bill']
        elif re.match(r'(S|H)C ', bill_id):
            btype = ['commemoration']
        elif re.match(r'(S|H)JR ', bill_id):
            btype = ['joint resolution']
        elif re.match(r'(S|H)CR ', bill_id):
            btype = ['concurrent resolution']
        else:
            btype = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=btype)
        bill.add_source(url)

        regex_ns = "http://exslt.org/regular-expressions"
        version_links = page.xpath(
            r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
            namespaces={'re': regex_ns})
        for link in version_links:
            bill.add_version_link(link.xpath('string()').strip(),
                                  link.attrib['href'],
                                  media_type='text/html',
                                  on_duplicate='ignore')

        sponsor_links = page.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' +
            '/label[contains(text(), "Sponsors:")]' +
            '/following-sibling::div[1]/p/a')
        for link in sponsor_links:
            if link.attrib['href'].startswith(
                    'https://sdlegislature.gov/Legislators/'):
                sponsor_type = 'person'
            elif link.attrib['href'].startswith(
                    'https://sdlegislature.gov/Legislative_Session/Committees'
            ):
                sponsor_type = 'organization'
            else:
                raise ScrapeError('Found unexpected sponsor, URL: ' +
                                  link.attrib['href'])
            bill.add_sponsorship(link.text,
                                 classification='primary',
                                 primary=True,
                                 entity_type=sponsor_type)

        actor = chamber
        use_row = False

        for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
            # Some tables have null rows, that are just `<tr></tr>`
            # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018
            if row.text_content() == '':
                self.debug(
                    'Skipping action table row that is completely empty')
                continue

            if 'Date' in row.text_content() and 'Action' in row.text_content():
                use_row = True
                continue
            elif not use_row:
                continue

            action = row.xpath("string(td[2])").strip()

            atypes = []
            if action.startswith('First read'):
                atypes.append('introduction')
                atypes.append('reading-1')

            if re.match(r'Signed by (?:the\s)*Governor', action,
                        re.IGNORECASE):
                atypes.append('executive-signature')
                actor = 'executive'

            match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                             action)
            if match:
                if match.group(1) in ['Senate', 'House of Representatives']:
                    first = ''
                else:
                    first = 'committee-'
                if match.group(3).lower() == 'passed':
                    second = 'passage'
                elif match.group(3).lower() == 'failed':
                    second = 'failure'
                atypes.append("%s%s" % (first, second))

            if 'referred to' in action.lower():
                atypes.append('referral-committee')

            if 'Motion to amend, Passed Amendment' in action:
                atypes.append('amendment-introduction')
                atypes.append('amendment-passage')

            if 'Veto override, Passed' in action:
                atypes.append('veto-override-passage')
            elif 'Veto override, Failed' in action:
                atypes.append('veto-override-failure')

            if 'Delivered to the Governor' in action:
                atypes.append('executive-receipt')

            match = re.match("First read in (Senate|House)", action)
            if match:
                if match.group(1) == 'Senate':
                    actor = 'upper'
                else:
                    actor = 'lower'

            date = row.xpath("string(td[1])").strip()
            match = re.match(r'\d{2}/\d{2}/\d{4}', date)
            if not match:
                self.warning("Bad date: %s" % date)
                continue
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                yield from self.scrape_vote(bill, date, link.attrib['href'])

            bill.add_action(action, date, chamber=actor, classification=atypes)

        for link in page.xpath("//a[contains(@href, 'Keyword')]"):
            bill.add_subject(link.text.strip())

        yield bill
Exemplo n.º 22
0
    def scrape_bill(self, chamber, session, session_id, bill_id, url):
        sidebar = lxml.html.fromstring(self.get(url).text)
        sidebar.make_links_absolute("https://www.legis.iowa.gov")

        try:
            hist_url = sidebar.xpath(
                '//a[contains(., "Bill History")]')[0].attrib['href']
        except IndexError:
            # where is it?
            return

        page = lxml.html.fromstring(self.get(hist_url).text)
        page.make_links_absolute("https://www.legis.iowa.gov")

        title = page.xpath('string(//div[@id="content"]/div[@class='
                           '"divideVert"]/div[not(@class)])').strip()

        if title == '':
            self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url)
            return

        if title.lower().startswith("in"):
            title = page.xpath("string(//table[2]/tr[3])").strip()

        if 'HR' in bill_id or 'SR' in bill_id:
            bill_type = ['resolution']
        elif 'HJR' in bill_id or 'SJR' in bill_id:
            bill_type = ['joint resolution']
        elif 'HCR' in bill_id or 'SCR' in bill_id:
            bill_type = ['concurrent resolution']
        else:
            bill_type = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)

        bill.add_source(hist_url)

        # base url for text version (version_abbrev, session_id, bill_id)
        version_html_url_template = 'https://www.legis.iowa.gov/docs/'\
            'publications/LG{}/{}/attachments/{}.html'
        version_pdf_url_template = 'https://www.legis.iowa.gov/docs/'\
            'publications/LG{}/{}/{}.pdf'

        # get pieces of version_link
        vpieces = sidebar.xpath('//select[@id="billVersions"]/option')
        if vpieces:
            for version in vpieces:
                version_name = version.text
                version_abbrev = version.xpath('string(@value)')

                # Get HTML document of bill version.
                version_html_url = version_html_url_template.format(
                    version_abbrev.upper(), session_id,
                    bill_id.replace(' ', ''))

                bill.add_version_link(note=version_name,
                                      url=version_html_url,
                                      media_type='text/html')

                # Get PDF document of bill version.
                version_pdf_url = version_pdf_url_template.format(
                    version_abbrev.upper(), session_id,
                    bill_id.replace(' ', ''))

                bill.add_version_link(note=version_name,
                                      url=version_pdf_url,
                                      media_type='application/pdf')

        sponsors_str = page.xpath(
            "string(//div[@id='content']/div[@class='divideVert']/div[@class='divideVert'])"
        ).strip()
        if re.search('^By ', sponsors_str):
            sponsors = re.split(',| and ', sponsors_str.split('By ')[1])
        # for some bills sponsors listed in different format
        else:
            sponsors = re.findall('[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)',
                                  sponsors_str)

        for sponsor in sponsors:
            sponsor = sponsor.replace(' and', '').strip(' .,')

            # a few sponsors get mangled by our regex
            sponsor = {
                'Means': 'Ways & Means',
                'Iowa': 'Economic Growth/Rebuild Iowa',
                'Safety': 'Public Safety',
                'Resources': 'Human Resources',
                'Affairs': 'Veterans Affairs',
                'Protection': 'Environmental Protection',
                'Government': 'State Government',
                'Boef': 'De Boef'
            }.get(sponsor, sponsor)

            if sponsor[0].islower():
                # SSBs catch cruft in it ('charges', 'overpayments')
                # https://sunlight.atlassian.net/browse/DATA-286
                continue

            bill.add_sponsorship(name=sponsor,
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)

        for tr in page.xpath(
                "//table[contains(@class, 'billActionTable')]/tbody/tr"):
            date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
            if date.startswith("***"):
                continue
            elif "No history is recorded at this time." in date:
                return
            if date == "":
                continue

            date = datetime.datetime.strptime(date, "%B %d, %Y").date()

            action = tr.xpath("string(td[2])").strip()
            action = re.sub(r'\s+', ' ', action)

            # Capture any amendment links.
            links = [
                link
                for link in [version['links'] for version in bill.versions]
            ]
            version_urls = [
                link['url'] for link in [i for sub in links for i in sub]
            ]
            if 'amendment' in action.lower():
                for anchor in tr.xpath('td[2]/a'):
                    if '-' in anchor.text:
                        # These links aren't given hrefs for some reason
                        # (needs to be fixed upstream)
                        try:
                            url = anchor.attrib['href']
                        except KeyError:
                            continue

                        if url not in version_urls:
                            bill.add_version_link(note=anchor.text,
                                                  url=url,
                                                  media_type='text/html')
                            version_urls.append(url)

            if 'S.J.' in action or 'SCS' in action:
                actor = 'upper'
            elif 'H.J.' in action or 'HCS' in action:
                actor = 'lower'
            else:
                actor = "legislature"

            action = re.sub(r'(H|S)\.J\.\s+\d+\.$', '', action).strip()

            if action.startswith('Introduced'):
                atype = ['introduction']
                if ', referred to' in action:
                    atype.append('referral-committee')
            elif action.startswith('Read first time'):
                atype = 'reading-1'
            elif action.startswith('Referred to'):
                atype = 'referral-committee'
            elif action.startswith('Sent to Governor'):
                atype = 'executive-receipt'
            elif action.startswith('Reported Signed by Governor'):
                atype = 'executive-signature'
            elif action.startswith('Signed by Governor'):
                atype = 'executive-signature'
            elif action.startswith('Vetoed by Governor'):
                atype = 'executive-veto'
            elif action.startswith('Item veto'):
                atype = 'executive-veto-line-item'
            elif re.match(r'Passed (House|Senate)', action):
                atype = 'passage'
            elif re.match(r'Amendment (S|H)-\d+ filed', action):
                atype = ['amendment-introduction']
                if ', adopted' in action:
                    atype.append('amendment-passage')
            elif re.match(r'Amendment (S|H)-\d+( as amended,)? adopted',
                          action):
                atype = 'amendment-passage'
            elif re.match('Amendment (S|N)-\d+ lost', action):
                atype = 'amendment-failure'
            elif action.startswith('Resolution filed'):
                atype = 'introduction'
            elif action.startswith('Resolution adopted'):
                atype = 'passage'
            elif (action.startswith('Committee report')
                  and action.endswith('passage.')):
                atype = 'committee-passage'
            elif action.startswith('Withdrawn'):
                atype = 'withdrawal'
            else:
                atype = None

            if action.strip() == "":
                continue

            if re.search('END OF \d+ ACTIONS', action):
                continue

            if '$history' not in action:
                bill.add_action(description=action,
                                date=date,
                                chamber=actor,
                                classification=atype)

        for subject in self._subjects[bill_id]:
            bill.add_subject(subject['Name'])

        yield bill
Exemplo n.º 23
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.
        """
        url = BILL_URL % (session, bill_id.replace(' ', ''))
        bill_page = self.get(url, verify=False).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session)
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(legislative_session=session, chamber=chamber, identifier=bill_id, title=title,
                    classification=bill_type)
        bill.add_source(url)
        for subject in self._subjects[bill_id.replace(' ', '')]:
            bill.add_subject(subject)

        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, 'short title')

        # documents
        doc_links = html.xpath('//div[contains(@class,"pf-content")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get('href')
            if 'Engrossment' in name or 'Bill Text' in name:
                bill.add_version_link(note=name, url=href, media_type="application/pdf")
            else:
                bill.add_document_link(note=name, url=href, media_type="application/pdf")

        def _split(string):
            return re.split(r"\w+[,|AND]\s+", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split('by')
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                if 'COMMITTEE' in sponsors.upper():
                    bill.add_sponsorship(name=sponsors.strip(), entity_type="organization",
                                         primary=True, classification='primary')
                else:
                    for person in _split(sponsors):
                        person = person.strip()
                        if person != "":
                            bill.add_sponsorship(classification='primary', name=person,
                                                 entity_type="person", primary=True)

        actor = chamber
        last_date = None
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            if date:
                last_date = date
            else:
                date = last_date
            date = datetime.datetime.strptime(date + '/' + session[0:4],
                                              "%m/%d/%Y").strftime('%Y-%m-%d')
            if action.startswith('House'):
                actor = 'lower'
            elif action.startswith('Senate'):
                actor = 'upper'

            # votes
            if 'AYES' in action or 'NAYS' in action:
                yield from self.parse_vote(actor, date, row[2], session, bill_id, chamber, url)
                # bill.add_vote_event(vote)
            # some td's text is seperated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u'\xa0', ' ').strip()
            atype = get_action(actor, action)
            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if 'to House' in action:
                actor = 'lower'
            elif 'to Senate' in action:
                actor = 'upper'
        yield bill
Exemplo n.º 24
0
    def scrape_bill(self, chamber, session, bill_id, title, url):
        page = self.lxmlize(url)

        if re.match(r'^(S|H)B ', bill_id):
            btype = ['bill']
        elif re.match(r'(S|H)C ', bill_id):
            btype = ['commemoration']
        elif re.match(r'(S|H)JR ', bill_id):
            btype = ['joint resolution']
        elif re.match(r'(S|H)CR ', bill_id):
            btype = ['concurrent resolution']
        else:
            btype = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=btype
                    )
        bill.add_source(url)

        regex_ns = "http://exslt.org/regular-expressions"
        version_links = page.xpath(
            r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
            namespaces={'re': regex_ns})
        for link in version_links:
            bill.add_version_link(
                link.xpath('string()').strip(),
                link.attrib['href'],
                media_type='text/html',
                on_duplicate='ignore'
            )

        sponsor_links = page.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' +
            '/label[contains(text(), "Sponsors:")]' +
            '/following-sibling::div[1]/p/a'
        )
        for link in sponsor_links:
            if link.attrib['href'].startswith('https://sdlegislature.gov/Legislators/'):
                sponsor_type = 'person'
            elif link.attrib['href'].startswith(
                'https://sdlegislature.gov/Legislative_Session/Committees'
            ):
                sponsor_type = 'organization'
            else:
                raise ScrapeError(
                    'Found unexpected sponsor, URL: ' +
                    link.attrib['href']
                )
            bill.add_sponsorship(
                link.text,
                classification='primary',
                primary=True,
                entity_type=sponsor_type
            )

        actor = chamber
        use_row = False

        for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
            # Some tables have null rows, that are just `<tr></tr>`
            # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018
            if row.text_content() == '':
                self.debug('Skipping action table row that is completely empty')
                continue

            if 'Date' in row.text_content() and 'Action' in row.text_content():
                use_row = True
                continue
            elif not use_row:
                continue

            action = row.xpath("string(td[2])").strip()

            atypes = []
            if action.startswith('First read'):
                atypes.append('introduction')
                atypes.append('reading-1')

            if re.match(r'Signed by (?:the\s)*Governor', action, re.IGNORECASE):
                atypes.append('executive-signature')
                actor = 'executive'

            match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                             action)
            if match:
                if match.group(1) in ['Senate',
                                      'House of Representatives']:
                    first = ''
                else:
                    first = 'committee-'
                if match.group(3).lower() == 'passed':
                    second = 'passage'
                elif match.group(3).lower() == 'failed':
                    second = 'failure'
                atypes.append("%s%s" % (first, second))

            if 'referred to' in action.lower():
                atypes.append('referral-committee')

            if 'Motion to amend, Passed Amendment' in action:
                atypes.append('amendment-introduction')
                atypes.append('amendment-passage')
                amd = row.xpath('td[2]/a[contains(@href,"Amendment.aspx")]')[0]
                version_name = amd.xpath('string(.)')
                version_url = amd.xpath('@href')[0]
                if 'htm' in version_url:
                    mimetype = 'text/html'
                elif 'pdf' in version_url:
                    mimetype = 'application/pdf'
                bill.add_version_link(
                    version_name,
                    version_url,
                    media_type=mimetype,
                    on_duplicate='ignore'
                )

            if 'Veto override, Passed' in action:
                atypes.append('veto-override-passage')
            elif 'Veto override, Failed' in action:
                atypes.append('veto-override-failure')

            if 'Delivered to the Governor' in action:
                atypes.append('executive-receipt')

            match = re.match("First read in (Senate|House)", action)
            if match:
                if match.group(1) == 'Senate':
                    actor = 'upper'
                else:
                    actor = 'lower'

            date = row.xpath("string(td[1])").strip()
            match = re.match(r'\d{2}/\d{2}/\d{4}', date)
            if not match:
                self.warning("Bad date: %s" % date)
                continue
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                yield from self.scrape_vote(bill, date, link.attrib['href'])

            bill.add_action(action, date, chamber=actor, classification=atypes)

        for link in page.xpath("//a[contains(@href, 'Keyword')]"):
            bill.add_subject(link.text.strip())

        yield bill
Exemplo n.º 25
0
    def scrape_bill(self, session, bill_url):
        page = self.get(bill_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        try:
            bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
        except IndexError:
            self.logger.warning("Something is wrong with bill page, skipping.")
            return
        secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

        # checking if there is a matching bill
        if secondary_bill_id:
            secondary_bill_id = secondary_bill_id[0].text
            # swap ids if * is in secondary_bill_id
            if '*' in secondary_bill_id:
                bill_id, secondary_bill_id = secondary_bill_id, bill_id
                secondary_bill_id = secondary_bill_id.strip()
            secondary_bill_id = secondary_bill_id.replace('  ', ' ')

        bill_id = bill_id.replace('*', '').replace('  ', ' ').strip()

        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'

        primary_chamber = 'lower' if 'H' in bill_id else 'upper'
        # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

        title = page.xpath("//span[@id='lblAbstract']")[0].text
        if title is None:
            msg = '%s detail page was missing title info.'
            self.logger.warning(msg % bill_id)
            return

        # bill subject
        subject_pos = title.find('-')
        subjects = [s.strip() for s in title[:subject_pos - 1].split(',')]
        subjects = filter(None, subjects)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=primary_chamber,
            title=title,
            classification=bill_type,
        )
        for subject in subjects:
            bill.add_subject(subject)

        if secondary_bill_id:
            bill.add_identifier(secondary_bill_id)

        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = page.xpath("//span[@id='lblBillPrimeSponsor']")[0].text_content().split("by")[-1]
        sponsor = sponsor.replace('*', '').strip()
        if sponsor:
            bill.add_sponsorship(
                sponsor,
                classification='primary',
                entity_type='person',
                primary=True,
            )

        # bill text
        btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
        bill.add_version_link('Current Version', btext.get('href'),
                              media_type='application/pdf')

        # documents
        summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
        if summary:
            bill.add_document_link('Summary', summary[0].get('href'))
        fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
        if fiscal:
            bill.add_document_link('Fiscal Note', fiscal[0].get('href'))
        amendments = page.xpath('//a[contains(@href, "/Amend/")]')
        for amendment in amendments:
            bill.add_document_link('Amendment ' + amendment.text, amendment.get('href'))
        # amendment notes in image with alt text describing doc inside <a>
        amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
        for afn in amend_fns:
            bill.add_document_link(
                afn.get('alt'),
                afn.getparent().get('href'),
                on_duplicate='ignore'
            )

        # actions
        atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
        actions_from_table(bill, atable)

        # if there is a matching bill
        if secondary_bill_id:
            # secondary sponsor
            secondary_sponsor = page.xpath(
                "//span[@id='lblCompPrimeSponsor']")[0].text_content().split("by")[-1]
            secondary_sponsor = secondary_sponsor.replace('*', '').replace(')', '').strip()
            # Skip black-name sponsors.
            if secondary_sponsor:
                bill.add_sponsorship(
                    secondary_sponsor,
                    classification='primary',
                    entity_type='person',
                    primary=True,
                )

            # secondary actions
            cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
            actions_from_table(bill, cotable)

        # votes
        yield from self.scrape_vote_events(bill, page, bill_url)

        bill.actions.sort(key=lambda a: a['date'])
        yield bill
Exemplo n.º 26
0
    def scrape(self) :
        three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
        for matter in self.matters(three_days_ago) :
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)) :
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Chicago City Council"})

            legistar_web = self.legislation_detail_url(matter_id)
            legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id) :
                act = bill.add_action(**action)

                if action['description'] == 'Referred' :
                    body_name = matter['MatterBodyName']
                    if body_name != 'City Council' :
                        act.add_related_entity(body_name,
                                               'organization',
                                               entity_id = _make_pseudo_id(name=body_name))

                result, votes = vote
                if result :
                    vote_event = VoteEvent(legislative_session=bill.legislative_session, 
                                           motion_text=action['description'],
                                           organization=action['organization'],
                                           classification=None,
                                           start_date=action['date'],
                                           result=result,
                                           bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes :
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                             raw_option)
                        vote_event.vote(clean_option, 
                                        vote['VotePersonName'].strip())

                    yield vote_event


            for sponsorship in self.sponsorships(matter_id) :
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id) :
                bill.add_subject(topic['MatterIndexName'].strip())

            for attachment in self.attachments(matter_id) :
                if attachment['MatterAttachmentName'] :
                    bill.add_version_link(attachment['MatterAttachmentName'],
                                          attachment['MatterAttachmentHyperlink'],
                                          media_type="application/pdf")

            bill.extras = {'local_classification' : matter['MatterTypeName']}

            text = self.text(matter_id)

            if text :
                if text['MatterTextPlain'] :
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf'] :
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

            yield bill
Exemplo n.º 27
0
    def scrape_bills(self, chamber_to_scrape, session):
        url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

        bill_dir_page = self.get(url)
        root = lxml.etree.fromstring(bill_dir_page.content)
        for mr in root.xpath('//LASTACTION/MSRGROUP'):
            bill_id = mr.xpath('string(MEASURE)').replace(" ", "")
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"

            bill_type = {
                'B': 'bill',
                'C': 'concurrent resolution',
                'R': 'resolution',
                'N': 'nomination'
            }[bill_id[1]]

            # just skip past bills that are of the wrong chamber
            if chamber != chamber_to_scrape:
                continue

            link = mr.xpath('string(ACTIONLINK)').replace("..", "")
            main_doc = mr.xpath('string(MEASURELINK)').replace("../../../", "")
            main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
            bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (
                session, link)
            try:
                details_page = self.get(bill_details_url)
            except scrapelib.HTTPError:
                self.warning(
                    'Bill page not loading for {}; skipping'.format(bill_id))
                continue

            page = details_page.content
            # Some pages have the (invalid) byte 11 sitting around. Just drop
            # them out. Might as well.

            details_root = lxml.etree.fromstring(page)
            title = details_root.xpath('string(//SHORTTITLE)')
            longtitle = details_root.xpath('string(//LONGTITLE)')

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=title,
                        classification=bill_type)
            bill.extras['summary'] = longtitle
            bill.add_source(main_doc_url)
            # sponsors
            main_sponsor = details_root.xpath('string(//P_NAME)').split()
            if main_sponsor:
                main_sponsor = main_sponsor[0]
                main_sponsor_link = details_root.xpath(
                    'string(//P_LINK)').replace(" ", "_")
                main_sponsor_url = (
                    'http://billstatus.ls.state.ms.us/%s/'
                    'pdf/%s') % (session, main_sponsor_link.strip('../'))
                type = "primary"
                bill.add_source(main_sponsor_url)
                bill.add_sponsorship(main_sponsor,
                                     classification=type,
                                     entity_type='person',
                                     primary=True)

            for author in details_root.xpath('//AUTHORS/ADDITIONAL'):
                leg = author.xpath('string(CO_NAME)').replace(" ", "_")
                if leg:
                    leg_url = ('http://billstatus.ls.state.ms.us/%s/'
                               'pdf/House_authors/%s.xml') % (session, leg)
                    type = "cosponsor"
                    bill.add_source(leg_url)
                    bill.add_sponsorship(leg,
                                         classification=type,
                                         entity_type='person',
                                         primary=False)
            # Versions
            curr_version = details_root.xpath('string(//CURRENT_OTHER'
                                              ')').replace("../../../../", "")
            if curr_version != "":
                curr_version_url = "http://billstatus.ls.state.ms.us/" \
                        + curr_version
                bill.add_version_link("Current version",
                                      curr_version_url,
                                      on_duplicate="ignore",
                                      media_type="text/html")

            intro_version = details_root.xpath(
                'string(//INTRO_OTHER)').replace("../../../../", "")
            if intro_version != "":
                intro_version_url = "http://billstatus.ls.state.ms.us/"\
                        + intro_version
                bill.add_version_link("As Introduced",
                                      intro_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

            comm_version = details_root.xpath('string(//CMTESUB_OTHER'
                                              ')').replace("../../../../", "")
            if comm_version.find("documents") != -1:
                comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                bill.add_version_link("Committee Substitute",
                                      comm_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')
            passed_version = details_root.xpath('string(//PASSED_OTHER'
                                                ')').replace(
                                                    "../../../../", "")
            if passed_version.find("documents") != -1:
                passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
                title = "As Passed the " + chamber
                bill.add_version_link(title,
                                      passed_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

            asg_version = details_root.xpath('string(//ASG_OTHER)').replace(
                "../../../../", "")
            if asg_version.find("documents") != -1:
                asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                bill.add_version_link("Approved by the Governor",
                                      asg_version_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

            # avoid duplicate votes
            seen_votes = set()

            # Actions
            for action in details_root.xpath('//HISTORY/ACTION'):
                # action_num  = action.xpath('string(ACT_NUMBER)').strip()
                # action_num = int(action_num)
                act_vote = action.xpath('string(ACT_VOTE)').replace(
                    "../../../..", "")
                action_desc = action.xpath('string(ACT_DESC)')
                date, action_desc = action_desc.split(" ", 1)
                date = date + "/" + session[0:4]
                date = datetime.strptime(date, "%m/%d/%Y")

                if action_desc.startswith("(H)"):
                    actor = "lower"
                    action = action_desc[4:]
                elif action_desc.startswith("(S)"):
                    actor = "upper"
                    action = action_desc[4:]
                else:
                    actor = "executive"
                    action = action_desc

                if "Veto" in action and actor == 'executive':
                    version_path = details_root.xpath("string(//VETO_OTHER)")
                    version_path = version_path.replace("../../../../", "")
                    version_url = "http://billstatus.ls.state.ms.us/" + version_path
                    bill.add_document_link("Veto", version_url)

                atype = 'other'
                for prefix, prefix_type in self._action_types:
                    if action.startswith(prefix):
                        atype = prefix_type
                        break

                bill.add_action(
                    action,
                    self._tz.localize(date),
                    chamber=actor,
                    classification=atype if atype is not 'other' else None)

                # use committee names as scraped subjects
                subjects = details_root.xpath('//H_NAME/text()')
                subjects += details_root.xpath('//S_NAME/text()')

                for subject in subjects:
                    if subject not in bill.subject:
                        bill.add_subject(subject)

                if act_vote:
                    vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                    if vote_url not in seen_votes:
                        seen_votes.add(vote_url)
                        yield from self.scrape_votes(vote_url, action, date,
                                                     actor, bill)

            bill.add_source(bill_details_url)
            yield bill
Exemplo n.º 28
0
    def scrape_bill(self, chamber, session, bill_id):
        # there will be a space in bill_id if we're doing a one-off bill scrape
        # convert HB 102 into H102
        if ' ' in bill_id:
            bill_id = bill_id[0] + bill_id.split(' ')[-1]

        # if chamber comes in as House/Senate convert to lower/upper
        if chamber == 'Senate':
            chamber = 'upper'
        elif chamber == 'House':
            chamber = 'lower'

        bill_detail_url = ('http://www.ncleg.net/gascripts/'
                           'BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all') % (
                               session, bill_id)

        # parse the bill data page, finding the latest html text
        data = self.get(bill_detail_url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(bill_detail_url)

        title_div_txt = doc.xpath('//div[contains(@class, "h2")]/text()')[0]
        if 'Joint Resolution' in title_div_txt:
            bill_type = 'joint resolution'
            bill_id = bill_id[0] + 'JR ' + bill_id[1:]
        elif 'Resolution' in title_div_txt:
            bill_type = 'resolution'
            bill_id = bill_id[0] + 'R ' + bill_id[1:]
        elif 'Bill' in title_div_txt:
            bill_type = 'bill'
            bill_id = bill_id[0] + 'B ' + bill_id[1:]

        bill_title = doc.xpath(
            '/html/body/div/div/main/div[2]/div[contains(@class,"col-12")]/a')[0]
        bill_title = bill_title.text_content().strip()

        bill = Bill(bill_id, legislative_session=session, title=bill_title, chamber=chamber,
                    classification=bill_type)
        bill.add_source(bill_detail_url)

        # skip first PDF link (duplicate link to cur version)
        if chamber == 'lower':
            link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]'
        else:
            link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]'
        for vlink in doc.xpath(link_xpath)[1:]:
            # get the name from the PDF link...
            version_name = vlink.text.replace(u'\xa0', ' ')
            version_url = vlink.attrib['href']

            media_type = 'text/html'
            if version_url.lower().endswith(".pdf"):
                media_type = 'application/pdf'

            bill.add_version_link(version_name, version_url, media_type=media_type,
                                  on_duplicate='ignore')

        # rows with a 'adopted' in the text and an amendment link, skip failed amds
        for row in doc.xpath('//div[@class="card-body"]/div[contains(., "Adopted")'
                             ' and contains(@class,"row")]//a[@title="Amendment"]'):
            version_url = row.xpath('@href')[0]
            version_name = row.xpath('string(.)').strip()
            bill.add_version_link(version_name, version_url, media_type='application/pdf',
                                  on_duplicate='ignore')

        # sponsors
        spon_row = doc.xpath('//div[contains(text(), "Sponsors")]/following-sibling::div')[0]
        # first sponsors are primary, until we see (Primary)
        spon_type = 'primary'
        spon_lines = spon_row.text_content().replace('\r\n', ';')
        for leg in spon_lines.split(';'):
            name = leg.replace(u'\xa0', ' ').strip()
            if name.startswith('(Primary)') or name.endswith('(Primary)'):
                name = name.replace('(Primary)', '').strip()
                spon_type = 'cosponsor'
            if not name:
                continue
            bill.add_sponsorship(name, classification=spon_type, entity_type='person',
                                 primary=(spon_type == 'primary'))

        # keywords
        kw_row = doc.xpath('//div[contains(text(), "Keywords:")]/following-sibling::div')[0]
        for subject in kw_row.text_content().split(', '):
            bill.add_subject(subject)

        # actions
        action_tr_xpath = (
            '//h6[contains(text(), "History")]'
            '/ancestor::div[contains(@class, "gray-card")]'
            '//div[contains(@class, "card-body")]'
            '/div[@class="row"]'
        )
        # skip two header rows
        for row in doc.xpath(action_tr_xpath):
            cols = row.xpath('div')
            act_date = cols[1].text
            actor = cols[3].text or ''
            # if text is blank, try diving in
            action = (cols[5].text or '').strip() or cols[5].text_content().strip()

            act_date = dt.datetime.strptime(act_date, '%m/%d/%Y').strftime('%Y-%m-%d')

            if actor == 'Senate':
                actor = 'upper'
            elif actor == 'House':
                actor = 'lower'
            else:
                actor = 'executive'

            for pattern, atype in self._action_classifiers.items():
                if action.startswith(pattern):
                    break
            else:
                atype = None

            bill.add_action(action, act_date, chamber=actor, classification=atype)

        # TODO: Fix vote scraper
        # yield from self.scrape_votes(bill, doc)

        yield bill
Exemplo n.º 29
0
    def scrape(self):
        three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
        for matter in self.matters(three_days_ago):
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']

            if not all((date, title)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            bill = Bill(identifier=matter['MatterFile'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Chicago City Council"})

            legistar_web = self.legislation_detail_url(matter_id)
            legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
                matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    if body_name != 'City Council':
                        act.add_related_entity(
                            body_name,
                            'organization',
                            entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_version_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Exemplo n.º 30
0
    def scrape_bill(self, bill_id):
        old = self.api('bills/' + bill_id + '?')

        # not needed
        old.pop('id')
        old.pop('state')
        old.pop('level', None)
        old.pop('country', None)
        old.pop('created_at')
        old.pop('updated_at')
        old.pop('action_dates')
        old.pop('+bill_type',None)
        old.pop('+subject', None)
        old.pop('+scraped_subjects', None)
        old.pop('subjects', [])

        classification = old.pop('type')

        # ca weirdness
        if 'fiscal committee' in classification:
            classification.remove('fiscal committee')
        if 'urgency' in classification:
            classification.remove('urgency')
        if 'local program' in classification:
            classification.remove('local program')
        if 'tax levy' in classification:
            classification.remove('tax levy')

        if classification[0] in ['miscellaneous', 'jres', 'cres']:
            return

        if classification == ['memorial resolution'] and self.state == 'ar':
            classification = ['memorial']
        if classification == ['concurrent memorial resolution'] and self.state == 'ar':
            classification = ['concurrent memorial']
        if classification == ['joint session resolution'] and self.state == 'il':
            classification = ['joint resolution']
        if classification == ['legislative resolution'] and self.state == 'ny':
            classification = ['resolution']
        if classification == ['address'] and self.state == 'nh':
            classification = ['resolution']

        if not old['title'] and self.state == 'me':
            old['title'] = '(unknown)'

        chamber = old.pop('chamber')
        if self.state in ('ne', 'dc'):
            chamber = 'legislature'
        elif chamber in ('joint', 'conference'):
            chamber = 'legislature'

        new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
                   chamber=chamber, classification=classification)

        abstract = old.pop('summary', None)
        if abstract:
            new.add_abstract(abstract, note='')

        for title in old.pop('alternate_titles'):
            new.add_title(title)

        for doc in old.pop('documents'):
            new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')

        for doc in old.pop('versions'):
            new.add_version_link(doc['name'], doc['url'], media_type=doc.pop('mimetype', ''))

        for subj in old.pop('scraped_subjects', []):
            if subj:
                new.add_subject(subj)

        for spon in old.pop('sponsors'):
            if spon.get('committee_id') is not None:
                entity_type = 'organization'
            elif spon.get('leg_id') is not None:
                entity_type = 'person'
            else:
                entity_type = ''
            new.add_sponsorship(spon['name'], spon['type'], entity_type,
                                spon['type'] == 'primary')

        for act in old.pop('actions'):
            actor = act['actor']
            if actor.lower() in ('governor', 'mayor', 'secretary of state'):
                actor = 'executive'
            elif actor.lower() == 'house' or (actor.lower().startswith('lower (') and self.state == 'ca'):
                actor = 'lower'
            elif actor.lower() in ('senate', 'upper`') or (actor.lower().startswith('upper (') and self.state == 'ca'):
                actor = 'upper'
            elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                           'Office of the Legislative Fiscal Analyst', 'Became Law w',
                           'conference') or (actor.lower().startswith('legislature (') and self.state == 'ca'):
                actor = 'legislature'

            if actor in ('committee', 'sponsor') and self.state == 'pr':
                actor = 'legislature'

            # nebraska & DC
            if actor in ('upper','council') and self.state in ('ne', 'dc'):
                actor = 'legislature'

            if act['action']:
                newact = new.add_action(act['action'], act['date'][:10], chamber=actor,
                                        classification=[action_types[c] for c in act['type'] if c != 'other'])
                for re in act.get('related_entities', []):
                    if re['type'] == 'committee':
                        re['type'] = 'organization'
                    elif re['type'] == 'legislator':
                        re['type'] = 'person'
                    newact.add_related_entity(re['name'], re['type'])

        for comp in old.pop('companions', []):
            if self.state in ('nj', 'ny', 'mn'):
                rtype = 'companion'
            new.add_related_bill(comp['bill_id'], comp['session'], rtype)

        for abid in old.pop('alternate_bill_ids', []) + old.pop('+alternate_bill_ids', []):
            new.add_identifier(abid)


        # generic OpenStates stuff
        for id in old.pop('all_ids'):
            new.add_identifier(id, scheme='openstates')

        for source in old.pop('sources'):
            source.pop('retrieved', None)
            new.add_source(**source)

        ext_title = old.pop('+extended_title', None)
        if ext_title:
            new.add_title(ext_title, note='Extended Title')
        official_title = old.pop('+official_title', None)
        if official_title:
            new.add_title(official_title, note='Official Title')

        to_extras = ['+status', '+final_disposition', '+volume_chapter', '+ld_number', '+referral',
                     '+companion', '+description', '+fiscal_note_probable:',
                     '+preintroduction_required:', '+drafter', '+category:', '+chapter',
                     '+requester', '+transmittal_date:', '+by_request_of', '+bill_draft_number:',
                     '+bill_lr', '+bill_url', '+rcs_num', '+fiscal_note', '+impact_clause', '+fiscal_notes',
                     '+short_title', '+type_', '+conference_committee', 'conference_committee',
                     '+companion_bill_ids', '+additional_information']
        for k in to_extras:
            v = old.pop(k, None)
            if v:
                new.extras[k.replace('+', '')] = v

        # votes
        vote_no = 1
        for vote in old.pop('votes'):
            vote.pop('id')
            vote.pop('state')
            vote.pop('bill_id')
            vote.pop('bill_chamber', None)
            vote.pop('+state', None)
            vote.pop('+country', None)
            vote.pop('+level', None)
            vote.pop('+vacant', None)
            vote.pop('+not_voting', None)
            vote.pop('+amended', None)
            vote.pop('+excused', None)
            vote.pop('+NV', None)
            vote.pop('+AB', None)
            vote.pop('+P', None)
            vote.pop('+V', None)
            vote.pop('+E', None)
            vote.pop('+EXC', None)
            vote.pop('+EMER', None)
            vote.pop('+present', None)
            vote.pop('+absent', None)
            vote.pop('+seconded', None)
            vote.pop('+moved', None)
            vote.pop('+vote_type', None)
            vote.pop('+actual_vote', None)
            vote.pop('+skip_votes', None)
            vote.pop('vote_id')
            vote.pop('+bill_chamber', None)
            vote.pop('+session', None)
            vote.pop('+bill_id', None)
            vote.pop('+bill_session', None)
            vote.pop('committee', None)
            vote.pop('committee_id', None)
            vtype = vote.pop('type', 'passage')

            if vtype == 'veto_override':
                vtype = ['veto-override']
            elif vtype == 'amendment':
                vtype = ['amendment-passage']
            elif vtype == 'other':
                vtype = ''
            else:
                vtype = ['bill-passage']

            # most states need identifiers for uniqueness, just do it everywhere
            identifier = vote['date'] + '-' + str(vote_no)
            vote_no += 1

            chamber = vote.pop('chamber')
            if chamber == 'upper' and self.state in ('ne', 'dc'):
                chamber = 'legislature'
            elif chamber == 'joint':
                chamber = 'legislature'

            newvote = VoteEvent(legislative_session=vote.pop('session'),
                           motion_text=vote.pop('motion'),
                           result='pass' if vote.pop('passed') else 'fail',
                           chamber=chamber,
                           start_date=vote.pop('date'),
                           classification=vtype,
                           bill=new,
                           identifier=identifier)
            for vt in ('yes', 'no', 'other'):
                newvote.set_count(vt, vote.pop(vt + '_count'))
                for name in vote.pop(vt + '_votes'):
                    newvote.vote(vt, name['name'])

            for source in vote.pop('sources'):
                source.pop('retrieved', None)
                newvote.add_source(**source)

            if not newvote.sources:
                newvote.sources = new.sources

            to_extras = ['+record', '+method', 'method', '+filename', 'record', '+action',
                         '+location', '+rcs_num', '+type_', '+threshold', '+other_vote_detail',
                         '+voice_vote']
            for k in to_extras:
                v = vote.pop(k, None)
                if v:
                    newvote.extras[k.replace('+', '')] = v

            assert not vote, vote.keys()
            yield newvote

        assert not old, old.keys()

        yield new
Exemplo n.º 31
0
    def parse_bill_status_page(self, url, page, session, chamber):
        # see 2007 HB 2... weird.
        parsed_url = urllib.parse.urlparse(url)
        parsed_query = dict(urllib.parse.parse_qsl(parsed_url.query))
        bill_id = "{0} {1}".format(
            parsed_query['P_BLTP_BILL_TYP_CD'],
            parsed_query['P_BILL_NO1'])

        try:
            xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
            title = page.xpath(xp).pop()
        except IndexError:
            title = page.xpath('//tr[1]/td[2]')[0].text_content()

        # Add bill type.
        _bill_id = bill_id.lower()
        if 'b' in _bill_id:
            classification = 'bill'
        elif 'j' in _bill_id or 'jr' in _bill_id:
            classification = 'joint resolution'
        elif 'cr' in _bill_id:
            classification = 'concurrent resolution'
        elif 'r' in _bill_id:
            classification = 'resolution'

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=title, classification=classification)

        self.add_actions(bill, page)
        votes = self.add_votes(bill, page, url)

        tabledata = self._get_tabledata(page)

        # Add sponsor info.
        bill.add_sponsorship(tabledata['primary sponsor:'][0], classification='primary',
                             entity_type='person', primary=True)

        # A various plus fields MT provides.
        plus_fields = [
            'requester',
            ('chapter number:', 'chapter'),
            'transmittal date:',
            'drafter',
            'fiscal note probable:',
            'bill draft number:',
            'preintroduction required:',
            'by request of',
            'category:']

        for x in plus_fields:
            if isinstance(x, tuple):
                _key, key = x
            else:
                _key = key = x
                key = key.replace(' ', '_')

            try:
                val = tabledata[_key]
            except KeyError:
                continue

            if len(val) == 1:
                val = val[0]

            bill.extras[key] = val

        # Add bill subjects.
        xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
        subjects = []
        for tr in page.xpath(xp):
            try:
                subj = tr.xpath('td')[0].text_content()
            except IndexError:
                continue
            subjects.append(subj)

        for s in subjects:
            bill.add_subject(s)

        self.add_fiscal_notes(page, bill)

        return bill, list(votes)
Exemplo n.º 32
0
    def scrape(self):
        state = 'MN'
        session = self.jurisdiction.legislative_sessions[0]
        apiKey = 'd2c0db7e-6a6e-4606-a9b0-83c18e647ff6'
        pyopenstates.set_api_key(apiKey)
        bills_upper = pyopenstates.search_bills(state=state,
                                                chamber="upper",
                                                updated_since="2017-01-01")
        bills_lower = pyopenstates.search_bills(state=state,
                                                chamber="lower",
                                                updated_since="2017-01-01")

        for b in bills_lower:
            number = b['bill_id']
            title = b['title']
            bill_id = b['id']
            dbill = pyopenstates.get_bill(bill_id)
            url = dbill['sources'][0]['url']

            bill = Bill(identifier=number,
                        legislative_session=session['identifier'],
                        title=title,
                        classification=b['type'][0],
                        chamber='upper')
            bill.add_source(url)
            bill.add_identifier(bill_id, scheme='openstatesv1')

            subjects = b['subjects']
            for s in subjects:
                bill.add_subject(s)

            sponsors = dbill['sponsors']
            for sponsor in sponsors:
                if not sponsor['leg_id'] == None:
                    l = pyopenstates.get_legislator(sponsor['leg_id'])
                    full_name = l['full_name'].split(' ')
                    if len(full_name) == 3:
                        full_name.pop(1)
                    full_name = (' ').join(full_name)
                    primary = False
                    if sponsor['type'] == 'primary':
                        primary = True
                    try:
                        bill.add_sponsorship(name=full_name,
                                             classification=sponsor['type'],
                                             entity_type='person',
                                             primary=primary)
                    except:
                        pass

            actions = dbill['actions']
            for act in actions:
                action = act['action']
                actor = act['actor']
                date = tz.localize(datetime.strptime(act['date'], DATE_FORMAT))
                Action_Type = act['type']
                bill.add_action(action, date, chamber=actor)

            action_dates = dbill['action_dates']
            for act in action_dates.items():
                k, v = act[0], act[1]
                if '_' in k:
                    chamber = k.split('_')[1]
                elif k == 'signed':
                    chamber = 'executive'
                else:
                    chamber = None
                k.replace('_', ' ')
                if not v == None and not k in ['first', 'last']:
                    bill.add_action(k, tz.localize(v), chamber=chamber)
            yield bill

        for b in bills_upper:
            number = b['bill_id']
            title = b['title']
            bill_id = b['id']
            dbill = pyopenstates.get_bill(bill_id)
            url = dbill['sources'][0]['url']

            bill = Bill(identifier=number,
                        legislative_session=session['identifier'],
                        title=title,
                        classification=b['type'][0],
                        chamber='upper')
            bill.add_source(url)
            bill.add_identifier(bill_id, scheme='openstatesv1')

            subjects = b['subjects']
            for s in subjects:
                bill.add_subject(s)

            sponsors = dbill['sponsors']
            for sponsor in sponsors:
                if not sponsor['leg_id'] == None:
                    l = pyopenstates.get_legislator(sponsor['leg_id'])
                    full_name = l['full_name'].split(' ')
                    if len(full_name) == 3:
                        full_name.pop(1)
                    full_name = (' ').join(full_name)
                    primary = False
                    if sponsor['type'] == 'primary':
                        primary = True
                    try:
                        bill.add_sponsorship(name=full_name,
                                             classification=sponsor['type'],
                                             entity_type='person',
                                             primary=primary)
                    except:
                        pass

            actions = dbill['actions']
            for act in actions:
                action = act['action']
                actor = act['actor']
                date = tz.localize(datetime.strptime(act['date'], DATE_FORMAT))
                Action_Type = act['type']
                bill.add_action(action, date, chamber=actor)

            action_dates = dbill['action_dates']
            for act in action_dates.items():
                k, v = act[0], act[1]
                if '_' in k:
                    chamber = k.split('_')[1]
                elif k == 'signed':
                    chamber = 'executive'
                else:
                    chamber = None
                k.replace('_', ' ')
                if not v == None and not k in ['first', 'last']:
                    bill.add_action(k, tz.localize(v), chamber=chamber)
            yield bill
Exemplo n.º 33
0
    def scrape(self, window=28, matter_ids=None):
        '''By default, scrape board reports updated in the last 28 days.
        Optionally specify a larger or smaller window of time from which to
        scrape updates, or specific matters to scrape.
        Note that passing a value for :matter_ids supercedes the value of
        :window, such that the given matters will be scraped regardless of
        when they were updated.

        Optional parameters
        :window (numeric) - Amount of time for which to scrape updates, e.g.
        a window of 7 will scrape legislation updated in the last week. Pass
        a window of 0 to scrape all legislation.
        :matter_ids (str) - Comma-separated list of matter IDs to scrape
        '''

        if matter_ids:
            matters = [
                self.matter(matter_id) for matter_id in matter_ids.split(',')
            ]
            matters = filter(
                None, matters)  # Skip matters that are not yet in Legistar
        elif float(window):  # Support for partial days, i.e., window=0.15
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
            matters = self.matters(n_days_ago)
        else:
            # Scrape all matters, including those without a last-modified date
            matters = self.matters()

        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        for matter in matters:
            # Skip this bill, until Metro cleans up duplicate in Legistar API
            if matter['MatterFile'] == '2017-0447':
                continue

            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            # Do not scrape private bills introduced before this timestamp.
            if self._is_restricted(matter) and (
                    date < self.START_DATE_PRIVATE_SCRAPE):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            # The Metro scraper scrapes private bills.
            # However, we do not want to capture significant data about private bills,
            # other than the value of the helper function `_is_restricted` and a last modified timestamp.
            # We yield private bills early, wipe data from previously imported once-public bills,
            # and include only data *required* by the pupa schema.
            # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
            bill.extras = {'restrict_view': self._is_restricted(matter)}

            # Add API source early.
            # Private bills should have this url for debugging.
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
            bill.add_source(legistar_api, note='api')

            if self._is_restricted(matter):
                # required fields
                bill.title = 'Restricted View'

                # wipe old data
                bill.extras['plain_text'] = ''
                bill.extras['rtf_text'] = ''
                bill.sponsorships = []
                bill.related_bills = []
                bill.versions = []
                bill.documents = []
                bill.actions = []

                yield bill
                continue

            legistar_web = matter['legistar_url']
            bill.add_source(legistar_web, note='web')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        try:
                            raw_option = vote['VoteValueName'].lower()
                        except AttributeError:
                            raw_option = None
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill.
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint(
                        '/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(
                        identifier=identifier,
                        legislative_session=related_bill_session,
                        relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'].strip(),
                        media_type="application/pdf")

            bill.extras['local_classification'] = matter['MatterTypeName']

            matter_version_value = matter['MatterVersion']
            text = self.text(matter_id, matter_version_value)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Exemplo n.º 34
0
    def scrape_bill(self, chamber, session, bill_id):
        # there will be a space in bill_id if we're doing a one-off bill scrape
        # convert HB 102 into H102
        if ' ' in bill_id:
            bill_id = bill_id[0] + bill_id.split(' ')[-1]

        # if chamber comes in as House/Senate convert to lower/upper
        if chamber == 'Senate':
            chamber = 'upper'
        elif chamber == 'House':
            chamber = 'lower'

        bill_detail_url = (
            'http://www.ncleg.net/gascripts/'
            'BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all'
        ) % (session, bill_id)

        # parse the bill data page, finding the latest html text
        data = self.get(bill_detail_url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(bill_detail_url)

        title_div_txt = doc.xpath('//div[contains(@class, "h2")]/text()')[0]
        if 'Joint Resolution' in title_div_txt:
            bill_type = 'joint resolution'
            bill_id = bill_id[0] + 'JR ' + bill_id[1:]
        elif 'Resolution' in title_div_txt:
            bill_type = 'resolution'
            bill_id = bill_id[0] + 'R ' + bill_id[1:]
        elif 'Bill' in title_div_txt:
            bill_type = 'bill'
            bill_id = bill_id[0] + 'B ' + bill_id[1:]

        bill_title = doc.xpath(
            '//div[contains(@class, "h5")]')[0].text_content().strip()

        bill = Bill(bill_id,
                    legislative_session=session,
                    title=bill_title,
                    chamber=chamber,
                    classification=bill_type)
        bill.add_source(bill_detail_url)

        # skip first PDF link (duplicate link to cur version)
        if chamber == 'lower':
            link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]'
        else:
            link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]'
        for vlink in doc.xpath(link_xpath)[1:]:
            # get the name from the PDF link...
            version_name = vlink.text.replace(u'\xa0', ' ')
            version_url = vlink.attrib['href']

            media_type = 'text/html'
            if version_url.lower().endswith(".pdf"):
                media_type = 'application/pdf'

            bill.add_version_link(version_name,
                                  version_url,
                                  media_type=media_type,
                                  on_duplicate='ignore')

        # sponsors
        spon_row = doc.xpath(
            '//div[contains(text(), "Sponsors")]/following-sibling::div')[0]
        # first sponsors are primary, until we see (Primary)
        spon_type = 'primary'
        for leg in spon_row.text_content().split(';'):
            name = leg.replace(u'\xa0', ' ').strip()
            if name.startswith('(Primary)'):
                name = name.replace('(Primary)', '').strip()
                spon_type = 'cosponsor'
            if not name:
                continue
            bill.add_sponsorship(name,
                                 classification=spon_type,
                                 entity_type='person',
                                 primary=(spon_type == 'primary'))

        # keywords
        kw_row = doc.xpath(
            '//div[contains(text(), "Keywords:")]/following-sibling::div')[0]
        for subject in kw_row.text_content().split(', '):
            bill.add_subject(subject)

        # actions
        action_tr_xpath = ('//h6[contains(text(), "History")]'
                           '/ancestor::div[contains(@class, "gray-card")]'
                           '//div[contains(@class, "card-body")]'
                           '/div[@class="row"]')
        # skip two header rows
        for row in doc.xpath(action_tr_xpath):
            cols = row.xpath('div')
            act_date = cols[1].text
            actor = cols[3].text or ''
            # if text is blank, try diving in
            action = (cols[5].text
                      or '').strip() or cols[5].text_content().strip()

            act_date = dt.datetime.strptime(act_date,
                                            '%m/%d/%Y').strftime('%Y-%m-%d')

            if actor == 'Senate':
                actor = 'upper'
            elif actor == 'House':
                actor = 'lower'
            else:
                actor = 'executive'

            for pattern, atype in self._action_classifiers.items():
                if action.startswith(pattern):
                    break
            else:
                atype = None

            bill.add_action(action,
                            act_date,
                            chamber=actor,
                            classification=atype)

        yield from self.scrape_votes(bill, doc)

        yield bill
Exemplo n.º 35
0
    def scrape(self, session=None):
        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)

        self._bill_prefix_map = {
            "HB": {"type": "bill", "url_segment": "bills/house"},
            "HR": {"type": "resolution", "url_segment": "resolutions/house/simple"},
            "HCR": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/house/concurrent",
            },
            "HJR": {
                "type": "joint resolution",
                "url_segment": "resolutions/house/joint",
            },
            "HC": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/house/concurrent",
            },
            "HJ": {
                "type": "joint resolution",
                "url_segment": "resolutions/house/joint",
            },
            "SB": {"type": "bill", "url_segment": "bills/senate"},
            "SR": {"type": "resolution", "url_segment": "resolutions/senate/simple"},
            "SCR": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/senate/concurrent",
            },
            "SJR": {
                "type": "joint resolution",
                "url_segment": "resolutions/senate/joint",
            },
            "SC": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/senate/concurrent",
            },
            "SJ": {
                "type": "joint resolution",
                "url_segment": "resolutions/senate/joint",
            },
        }

        api_base_url = "https://api.iga.in.gov"
        proxy = {"url": "http://in-proxy.openstates.org"}

        # ah, indiana. it's really, really hard to find
        # pdfs in their web interface. Super easy with
        # the api, but a key needs to be passed
        # in the headers. To make these documents
        # viewable to the public and our scrapers,
        # sunlight's put up a proxy service at this link
        # using our api key for pdf document access.

        client = ApiClient(self)
        r = client.get("bills", session=session)
        all_pages = client.unpaginate(r)
        for b in all_pages:
            bill_id = b["billName"]
            for idx, char in enumerate(bill_id):
                try:
                    int(char)
                except ValueError:
                    continue
                disp_bill_id = bill_id[:idx] + " " + str(int(bill_id[idx:]))
                break

            bill_link = b["link"]
            api_source = api_base_url + bill_link
            try:
                bill_json = client.get("bill", session=session, bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning("Bill could not be accessed. Skipping.")
                continue

            title = bill_json["title"]
            if title == "NoneNone":
                title = None
            # sometimes title is blank
            # if that's the case, we can check to see if
            # the latest version has a short description
            if not title:
                title = bill_json["latestVersion"]["shortDescription"]

            # and if that doesn't work, use the bill_id but throw a warning
            if not title:
                title = bill_id
                self.logger.warning("Bill is missing a title, using bill id instead.")

            bill_prefix = self._get_bill_id_components(bill_id)[0]

            original_chamber = (
                "lower" if bill_json["originChamber"].lower() == "house" else "upper"
            )
            bill_type = self._bill_prefix_map[bill_prefix]["type"]
            bill = Bill(
                disp_bill_id,
                legislative_session=session,
                chamber=original_chamber,
                title=title,
                classification=bill_type,
            )

            bill.add_source(self._get_bill_url(session, bill_id))
            bill.add_source(api_source)

            # sponsors
            for s in bill_json["authors"]:
                bill.add_sponsorship(
                    classification="author",
                    name=self._get_name(s),
                    entity_type="person",
                    primary=True,
                )

            for s in bill_json["coauthors"]:
                bill.add_sponsorship(
                    classification="coauthor",
                    name=self._get_name(s),
                    entity_type="person",
                    primary=False,
                )

            for s in bill_json["sponsors"]:
                bill.add_sponsorship(
                    classification="sponsor",
                    name=self._get_name(s),
                    entity_type="person",
                    primary=True,
                )

            for s in bill_json["cosponsors"]:
                bill.add_sponsorship(
                    classification="cosponsor",
                    name=self._get_name(s),
                    entity_type="person",
                    primary=False,
                )

            # actions
            action_link = bill_json["actions"]["link"]
            api_source = api_base_url + action_link

            try:
                actions = client.get(
                    "bill_actions", session=session, bill_id=bill_id.lower()
                )
            except scrapelib.HTTPError:
                self.logger.warning("Could not find bill actions page")
                actions = {"items": []}

            for a in actions["items"]:
                action_desc = a["description"]
                if "governor" in action_desc.lower():
                    action_chamber = "executive"
                elif a["chamber"]["name"].lower() == "house":
                    action_chamber = "lower"
                else:
                    action_chamber = "upper"
                date = a["date"]

                if not date:
                    self.logger.warning("Action has no date, skipping")
                    continue

                # convert time to pupa fuzzy time
                date = date.replace("T", " ")
                # TODO: if we update pupa to accept datetimes we can drop this line
                date = date.split()[0]

                action_type = []
                d = action_desc.lower()
                committee = None

                reading = False
                if "first reading" in d:
                    action_type.append("reading-1")
                    reading = True

                if "second reading" in d or "reread second time" in d:
                    action_type.append("reading-2")
                    reading = True

                if "third reading" in d or "reread third time" in d:
                    action_type.append("reading-3")
                    if "passed" in d:
                        action_type.append("passage")
                    if "failed" in d:
                        action_type.append("failure")
                    reading = True

                if "adopted" in d and reading:
                    action_type.append("passage")

                if (
                    "referred" in d
                    and "committee on" in d
                    or "reassigned" in d
                    and "committee on" in d
                ):
                    committee = d.split("committee on")[-1].strip()
                    action_type.append("referral-committee")

                if "committee report" in d:
                    if "pass" in d:
                        action_type.append("committee-passage")
                    if "fail" in d:
                        action_type.append("committee-failure")

                if "amendment" in d and "without amendment" not in d:
                    if "pass" in d or "prevail" in d or "adopted" in d:
                        action_type.append("amendment-passage")
                    if "fail" or "out of order" in d:
                        action_type.append("amendment-failure")
                    if "withdraw" in d:
                        action_type.append("amendment-withdrawal")

                if "signed by the governor" in d:
                    action_type.append("executive-signature")

                if len(action_type) == 0:
                    # calling it other and moving on with a warning
                    self.logger.warning(
                        "Could not recognize an action in '{}'".format(action_desc)
                    )
                    action_type = None

                a = bill.add_action(
                    chamber=action_chamber,
                    description=action_desc,
                    date=date,
                    classification=action_type,
                )
                if committee:
                    a.add_related_entity(committee, entity_type="organization")

            # subjects
            subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
            for subject in subjects:
                bill.add_subject(subject)

            # versions and votes
            for version in bill_json["versions"][::-1]:
                try:
                    version_json = client.get(
                        "bill_version",
                        session=session,
                        bill_id=version["billName"],
                        version_id=version["printVersionName"],
                    )
                except scrapelib.HTTPError:
                    self.logger.warning("Bill version does not seem to exist.")
                    continue

                yield from self.deal_with_version(
                    version_json, bill, bill_id, original_chamber, session, proxy
                )

            yield bill
Exemplo n.º 36
0
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """
        Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        chamber = 'lower' if chamber.lower() == 'house' else chamber
        chamber = 'upper' if chamber.lower() == 'senate' else chamber

        # Get html and parse
        doc = self.lxmlize(bill_detail_url)

        # Check if bill hasn't been transmitted to the other chamber yet
        transmit_check = self.get_node(
            doc,
            '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()'
        )
        if (transmit_check is not None and
                'has not been transmitted' in transmit_check.strip()):
            self.logger.debug('Bill has not been transmitted to other chamber '
                              '... skipping {0}'.format(bill_detail_url))
            return

        # Get the basic parts of the bill
        bill_id = self.get_node(doc, '//h1[contains(@class,"card-title float-left mr-4")]/text()')
        self.logger.debug(bill_id)
        bill_title_text = self.get_node(
            doc,
            '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()'
        )
        if bill_title_text is not None:
            bill_title = bill_title_text.strip()
        else:
            long_desc_url = self.get_node(
                doc,
                '//a[text()[contains(.,"Long Description")]]/@href'
            )
            long_desc_page = self.lxmlize(long_desc_url)
            long_desc_text = self.get_node(long_desc_page, '//h1/'
                                           'following-sibling::p/text()')
            if long_desc_text is not None:
                bill_title = long_desc_text.strip()
            else:
                bill_title = 'No title found.'
                self.logger.warning('No title found for {}.'.format(bill_id))
        self.logger.debug(bill_title)
        bill_type = {'F': 'bill', 'R': 'resolution',
                     'C': 'concurrent resolution'}[bill_id[1].upper()]
        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=bill_title, classification=bill_type)

        # Add source
        bill.add_source(bill_detail_url)

        for subject in self._subject_mapping[bill_id]:
            bill.add_subject(subject)

        # Get companion bill.
        companion = doc.xpath('//table[@class="status_info"]//tr[1]/td[2]'
                              '/a[starts-with(@href, "?")]/text()')
        companion = self.make_bill_id(companion[0]) if len(companion) > 0 else None
        companion_chamber = self.chamber_from_bill(companion)
        if companion is not None:
            bill.add_companion(companion, chamber=companion_chamber)

        # Grab sponsors
        bill = self.extract_sponsors(bill, doc, chamber)

        # Add Actions performed on the bill.
        bill = self.extract_actions(bill, doc, chamber)

        # Get all versions of the bill.
        bill = self.extract_versions(bill, doc, chamber, version_list_url)

        yield bill
Exemplo n.º 37
0
    def scrape_bill(self, session, bill_url):
        page = self.get(bill_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        try:
            bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
        except IndexError:
            self.logger.warning("Something is wrong with bill page, skipping.")
            return
        secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

        # checking if there is a matching bill
        if secondary_bill_id:
            secondary_bill_id = secondary_bill_id[0].text
            # swap ids if * is in secondary_bill_id
            if "*" in secondary_bill_id:
                bill_id, secondary_bill_id = secondary_bill_id, bill_id
                secondary_bill_id = secondary_bill_id.strip()
            secondary_bill_id = secondary_bill_id.replace("  ", " ")

        bill_id = bill_id.replace("*", "").replace("  ", " ").strip()

        if "B" in bill_id:
            bill_type = "bill"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"

        primary_chamber = "lower" if "H" in bill_id else "upper"
        # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

        title = page.xpath("//span[@id='lblAbstract']")[0].text
        if title is None:
            msg = "%s detail page was missing title info."
            self.logger.warning(msg % bill_id)
            return

        # bill subject
        subject_pos = title.find("-")
        subjects = [s.strip() for s in title[: subject_pos - 1].split(",")]
        subjects = filter(None, subjects)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=primary_chamber,
            title=title,
            classification=bill_type,
        )
        for subject in subjects:
            bill.add_subject(subject)

        if secondary_bill_id:
            bill.add_identifier(secondary_bill_id)

        if page.xpath('//span[@id="lblCompNumber"]/a'):
            companion_id = page.xpath('//span[@id="lblCompNumber"]/a')[0].text_content().strip()
            bill.add_related_bill(
                identifier=companion_id,
                legislative_session=session,
                relation_type="companion",
            )

        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = (
            page.xpath("//span[@id='lblBillPrimeSponsor']")[0]
            .text_content()
            .split("by")[-1]
        )
        sponsor = sponsor.replace("*", "").strip()
        if sponsor:
            bill.add_sponsorship(
                sponsor, classification="primary", entity_type="person", primary=True
            )

        # bill text
        btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
        bill.add_version_link(
            "Current Version", btext.get("href"), media_type="application/pdf"
        )

        # documents
        summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
        if summary:
            bill.add_document_link("Summary", summary[0].get("href"))
        fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
        if fiscal:
            bill.add_document_link("Fiscal Note", fiscal[0].get("href"))
        amendments = page.xpath('//a[contains(@href, "/Amend/")]')
        for amendment in amendments:
            bill.add_document_link("Amendment " + amendment.text, amendment.get("href"))
        # amendment notes in image with alt text describing doc inside <a>
        amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
        for afn in amend_fns:
            bill.add_document_link(
                afn.get("alt"), afn.getparent().get("href"), on_duplicate="ignore"
            )

        # actions
        atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
        actions_from_table(bill, atable)

        # if there is a matching bill
        if secondary_bill_id:
            # secondary sponsor
            secondary_sponsor = (
                page.xpath("//span[@id='lblCompPrimeSponsor']")[0]
                .text_content()
                .split("by")[-1]
            )
            secondary_sponsor = (
                secondary_sponsor.replace("*", "").replace(")", "").strip()
            )
            # Skip black-name sponsors.
            if secondary_sponsor:
                bill.add_sponsorship(
                    secondary_sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # secondary actions
            cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
            actions_from_table(bill, cotable)

        # votes
        yield from self.scrape_vote_events(bill, page, bill_url)

        bill.actions.sort(key=lambda a: a["date"])
        yield bill
Exemplo n.º 38
0
    def scrape_matter(self, matter_link, sess):
        matter_types = {
            "Additions": "other",
            "Administrative Order": "order",
            "Annual Evaluation": "other",
            "Bid Advertisement": "other",
            "Bid Awards": "other",
            "Bid Contract": "contract",
            "Bid Protest": "other",
            "Bid Rejection": "other",
            "Birthday Scroll": "commemoration",
            "Certificate of Appreciation": "commemoration",
            "Change Order": "order",
            "Citizen's Presentation": "other",
            "Commendation": "commemoration",
            "Conflict Waiver": "other",
            "Congratulatory Certificate": "commemoration",
            "Deferrals": "other",
            "Discussion Item": "other",
            "Distinguished Visitor": "other",
            "Joint Meeting/Workshop": "other",
            "Mayoral Veto": "other",
            "Miscellaneous": "other",
            "Nomination": "nomination",
            "Oath of Office": "other",
            "Omnibus Reserve": "bill",
            "Ordinance": "ordinance",
            "Plaque": "commemoration",
            "Presentation": "other",
            "Proclamation": "proclamation",
            "Professional Service Agreement": "contract",
            "Public Hearing": "other",
            "Report": "other",
            "Request for Proposals": "other",
            "Request for Qualifications": "other",
            "Request to Advertise": "other",
            "Resolution": "resolution",
            "Resolution of Sympathy": "resolution",
            "Service Awards": "commemoration",
            "Special Item": "other",
            "Special Presentation": "other",
            "Supplement": "other",
            "Swearing-In": "other",
            "Time Sensitive Items": "other",
            "Withdrawals": "other",
            "Workshop Item": "other",
            "Zoning": "other",
            "Zoning Resolution": "resolution"
        }
        matter_doc = self.lxmlize(matter_link)
        info_dict = self.matter_table_to_dict(matter_doc)
        #we're going to use the year of the intro date as the session
        #until/unless we come up with something better
        intro_date = datetime.strptime(info_dict["Introduced"], "%m/%d/%Y")
        session = sess["identifier"]
        category = matter_types[info_dict["File Type"]]
        if 'File Name' in info_dict:
            title = info_dict["File Name"]
        elif "Title" in info_dict and info_dict["Title"].strip():
            title = info_dict["Title"].strip()
        else:
            self.warning("bill has no title")
            return
        if category == 'other':
            bill = Bill(identifier=info_dict["File Number"],
                        legislative_session=session,
                        title=title)
        else:
            bill = Bill(identifier=info_dict["File Number"],
                        legislative_session=session,
                        title=title,
                        classification=category)
        for spons in info_dict["Sponsors"]:
            if spons == "NONE":
                continue
            try:
                name, spons_type = spons.rsplit(",", 1)
            except ValueError:
                name = spons
                spons_type = "Sponsor"
            primary = True if "Prime Sponsor" in spons_type else False
            entity = "person"
            if "committee" in name:
                entity = committee
            bill.add_sponsorship(name, spons_type, entity, primary)
        if "Indexes" in info_dict:
            for subj in info_dict["Indexes"]:
                if subj.strip() and subj.strip() != "NONE":
                    bill.add_subject(subj.strip())
        if "Title" in info_dict and info_dict["Title"].strip():
            note = "bill's long title'"
            if ("Note" in info_dict and info_dict["Note"].strip()):
                note = info_dict["Note"]
            bill.add_abstract(abstract=info_dict["Title"], note=note)
        self.process_action_table(matter_doc, bill)
        bill.add_source(matter_link, note='web')

        yield bill
Exemplo n.º 39
0
    def parse_bill_status_page(self, status_url, bill_url, session, chamber):
        status_page = lxml.html.fromstring(self.get(status_url).text)
        status_page.make_links_absolute(status_url)
        # see 2007 HB 2... weird.
        bill_re = r'.*?/([A-Z]+)0*(\d+)\.pdf'
        bill_xpath = '//a[contains(@href, ".pdf") and contains(@href, "billpdf")]/@href'
        bill_id = re.search(bill_re, status_page.xpath(bill_xpath)[0],
                            re.IGNORECASE).groups()
        bill_id = "{0} {1}".format(bill_id[0], int(bill_id[1]))

        try:
            xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
            title = status_page.xpath(xp).pop()
        except IndexError:
            title = status_page.xpath('//tr[1]/td[2]')[0].text_content()

        # Add bill type.
        _bill_id = bill_id.lower()
        if 'b' in _bill_id:
            classification = 'bill'
        elif 'j' in _bill_id or 'jr' in _bill_id:
            classification = 'joint resolution'
        elif 'cr' in _bill_id:
            classification = 'concurrent resolution'
        elif 'r' in _bill_id:
            classification = 'resolution'

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=title, classification=classification)

        self.add_actions(bill, status_page)
        votes = self.add_votes(bill, status_page, status_url)

        tabledata = self._get_tabledata(status_page)

        # Add sponsor info.
        bill.add_sponsorship(tabledata['primary sponsor:'][0], classification='primary',
                             entity_type='person', primary=True)

        # A various plus fields MT provides.
        plus_fields = [
            'requester',
            ('chapter number:', 'chapter'),
            'transmittal date:',
            'drafter',
            'fiscal note probable:',
            'bill draft number:',
            'preintroduction required:',
            'by request of',
            'category:']

        for x in plus_fields:
            if isinstance(x, tuple):
                _key, key = x
            else:
                _key = key = x
                key = key.replace(' ', '_')

            try:
                val = tabledata[_key]
            except KeyError:
                continue

            if len(val) == 1:
                val = val[0]

            bill.extras[key] = val

        # Add bill subjects.
        xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
        subjects = []
        for tr in status_page.xpath(xp):
            try:
                subj = tr.xpath('td')[0].text_content()
            except IndexError:
                continue
            subjects.append(subj)

        for s in subjects:
            bill.add_subject(s)

        self.add_fiscal_notes(status_page, bill)

        return bill, list(votes)
Exemplo n.º 40
0
    def scrape(self, window=30):
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        self.retry_wait_seconds = 20

        for matter in self.matters(n_days_ago):
            matter_id = matter["MatterId"]

            date = matter["MatterIntroDate"]
            title = matter["MatterTitle"]
            identifier = matter["MatterFile"]

            # If a bill has a duplicate action item that"s causing the entire scrape
            # to fail, add it to the `problem_bills` array to skip it.
            # For the time being...nothing to skip!

            problem_bills = []

            if identifier in problem_bills:
                continue

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))

            if matter["MatterTypeName"] in BILL_TYPES:
                ocd_bill_type = BILL_TYPES[matter["MatterTypeName"]]
            else:
                ocd_bill_type = None

            if identifier.startswith("S"):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=ocd_bill_type,
                        from_organization={"name": "Pittsburgh City Council"})

            legistar_web = matter["legistar_url"]
            legistar_api = "http://webapi.legistar.com/v1/pittsburgh/matters/{0}".format(matter_id)
            bill.add_source(legistar_web, note="web")
            bill.add_source(legistar_api, note="api")

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                responsible_person = action.pop("responsible person")
                act = bill.add_action(**action)

                if responsible_person:
                    act.add_related_entity(responsible_person,
                                           "person",
                                           entity_id=_make_pseudo_id(name=responsible_person))

                if action["description"] == "Referred":
                    body_name = matter["MatterBodyName"]
                    if body_name != "City Council":
                        act.add_related_entity(body_name,
                                               "organization",
                                               entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote

                if result:
                    vote_event = VoteEvent(legislative_session=bill.legislative_session,
                                           motion_text=action["description"],
                                           organization=action["organization"],
                                           classification=None,
                                           start_date=action["date"],
                                           result=result,
                                           bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + "/histories")

                    for vote in votes:
                        raw_option = vote["VoteValueName"].lower()
                        clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                             raw_option)
                        vote_event.vote(clean_option,
                                        vote["VotePersonName"].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic["MatterIndexName"].strip())

            for attachment in self.attachments(matter_id):
                if attachment["MatterAttachmentName"]:
                    bill.add_version_link(attachment["MatterAttachmentName"],
                                          attachment["MatterAttachmentHyperlink"],
                                          media_type="application/pdf")

            bill.extras = {"local_classification": matter["MatterTypeName"]}
            text = self.text(matter_id)

            if text:
                if text["MatterTextPlain"]:
                    bill.extras["plain_text"] = text["MatterTextPlain"]

                if text["MatterTextRtf"]:
                    bill.extras["rtf_text"] = text["MatterTextRtf"].replace(u"\u0000", "")

            yield bill
    def scrape_details(self, bill_detail_url, session, chamber, bill_id):
        """
        Create the Bill and add the information obtained from the provided bill_detail_url.
        and then yield the bill object.
        :param bill_detail_url:
        :param session:
        :param chamber:
        :param bill_id:
        :return:
        """
        page = self.get(bill_detail_url).text

        if 'INVALID BILL NUMBER' in page:
            self.warning('INVALID BILL %s' % bill_detail_url)
            return

        doc = lxml.html.fromstring(page)
        doc.make_links_absolute(bill_detail_url)

        bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

        bill_type = bill_div.xpath('span/text()')[0]

        if 'General Bill' in bill_type:
            bill_type = 'bill'
        elif 'Concurrent Resolution' in bill_type:
            bill_type = 'concurrent resolution'
        elif 'Joint Resolution' in bill_type:
            bill_type = 'joint resolution'
        elif 'Resolution' in bill_type:
            bill_type = 'resolution'
        else:
            raise ValueError('unknown bill type: %s' % bill_type)

        # this is fragile, but less fragile than it was
        b = bill_div.xpath('./b[text()="Summary:"]')[0]
        bill_summary = b.getnext().tail.strip()

        bill = Bill(
            bill_id,
            legislative_session=
            session,  # session name metadata's `legislative_sessions`
            chamber=chamber,  # 'upper' or 'lower'
            title=bill_summary,
            classification=bill_type)

        subjects = list(self._subjects[bill_id])

        for subject in subjects:
            bill.add_subject(subject)

        # sponsors
        for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
            bill.add_sponsorship(name=sponsor,
                                 classification='primary',
                                 primary=True,
                                 entity_type='person')
        for sponsor in doc.xpath(
                '//a[contains(@href, "committee.php")]/text()'):
            sponsor = sponsor.replace(u'\xa0', ' ').strip()
            bill.add_sponsorship(name=sponsor,
                                 classification='primary',
                                 primary=True,
                                 entity_type='organization')

        # find versions
        version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
        version_html = self.get(version_url).text
        version_doc = lxml.html.fromstring(version_html)
        version_doc.make_links_absolute(version_url)
        for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
            # duplicate versions with same date, use first appearance

            bill.add_version_link(
                note=version.
                text,  # Description of the version from the state;
                #  eg, 'As introduced', 'Amended', etc.
                url=version.get('href'),
                on_duplicate='ignore',
                media_type='text/html'  # Still a MIME type
            )

        # actions
        for row in bill_div.xpath('table/tr'):
            date_td, chamber_td, action_td = row.xpath('td')

            date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")
            action_chamber = {
                'Senate': 'upper',
                'House': 'lower',
                None: 'legislature'
            }[chamber_td.text]

            action = action_td.text_content()
            action = action.split('(House Journal')[0]
            action = action.split('(Senate Journal')[0].strip()

            atype = action_type(action)

            bill.add_action(
                description=action,  # Action description, from the state
                date=date.strftime('%Y-%m-%d'),  # `YYYY-MM-DD` format
                chamber=action_chamber,  # 'upper' or 'lower'
                classification=atype  # Options explained in the next section
            )

        # votes
        vurl = doc.xpath('//a[text()="View Vote History"]/@href')
        if vurl:
            vurl = vurl[0]
            yield from self.scrape_vote_history(bill, vurl)

        bill.add_source(bill_detail_url)
        yield bill
Exemplo n.º 42
0
    def scrape(self, session=None, chambers=None):
        # Bills endpoint can sometimes take a very long time to load
        self.timeout = 300

        if not session:
            session = self.latest_session()
            self.info('no session, using %s', session)

        if int(session) < 128:
            raise AssertionError("No data for period {}".format(session))

        elif int(session) < 131:
            # they changed their data format starting in 131st and added
            # an undocumented API
            yield from self.old_scrape(session)

        else:
            chamber_dict = {"Senate": "upper", "House": "lower",
                            "House of Representatives": "lower",
                            "house": "lower", "senate": "upper"}

            # so presumanbly not everything passes, but we haven't
            # seen anything not pass yet, so we'll need to wait
            # till it fails and get the right language in here
            vote_results = {"approved": True,
                            "passed": True,
                            "adopted": True,
                            "true": True,
                            "false": False,
                            "failed": False,
                            True: True,
                            False: False}

            action_dict = {"ref_ctte_100": "referral-committee",
                           "intro_100": "introduction",
                           "intro_101": "introduction",
                           "pass_300": "passage",
                           "intro_110": "reading-1",
                           "refer_210": "referral-committee",
                           "crpt_301": None,
                           "crpt_317": None,
                           "concur_606": "passage",
                           "pass_301": "passage",
                           "refer_220": "referral-committee",
                           "intro_102": ["introduction", "passage"],
                           "intro_105": ["introduction", "passage"],
                           "intro_ref_ctte_100": "referral-committee",
                           "refer_209": None,
                           "intro_108": ["introduction", "passage"],
                           "intro_103": ["introduction", "passage"],
                           "msg_reso_503": "passage",
                           "intro_107": ["introduction", "passage"],
                           "imm_consid_360": "passage",
                           "refer_213": None,
                           "adopt_reso_100": "passage",
                           "adopt_reso_110": "passage",
                           "msg_507": "amendment-passage",
                           "confer_713": None,
                           "concur_603": None,
                           "confer_712": None,
                           "msg_506": "amendment-failure",
                           "receive_message_100": "passage",
                           "motion_920": None,
                           "concur_611": None,
                           "confer_735": None,
                           "third_429": None,
                           "final_501": None,
                           "concur_608": None,
                           }

            base_url = "http://search-prod.lis.state.oh.us"
            first_page = base_url
            first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
            legislators = self.get_legislator_ids(first_page)
            all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
            all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
            all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
            all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

            for row in self.get_bill_rows(session):
                spacer, number_link, _ga, title, primary_sponsor, status, spacer = row.xpath('td')

                # S.R.No.1 -> SR1
                bill_id = number_link.text_content().replace('No.', '')
                bill_id = bill_id.replace('.', '').replace(' ', '')
                # put one space back in between type and number
                bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

                title = title.text_content().strip()
                title = re.sub(r'^Title', '', title)

                chamber = 'lower' if 'H' in bill_id else 'upper'
                classification = 'bill' if 'B' in bill_id else 'resolution'

                bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                            title=title, classification=classification)
                bill.add_source(number_link.xpath('a/@href')[0])

                # get bill from API
                bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                                'general_assembly_{}/{}/{}/'.format(
                                    session,
                                    'bills' if 'B' in bill_id else 'resolutions',
                                    bill_id.lower().replace(' ', '')
                                ))
                data = self.get(bill_api_url).json()

                # add title if no short title
                if not bill.title:
                    bill.title = data['items'][0]['longtitle']
                bill.add_title(data['items'][0]['longtitle'], 'long title')

                # this stuff is version-specific
                for version in data['items']:
                    version_name = version["version"]
                    version_link = base_url+version["pdfDownloadLink"]
                    bill.add_version_link(version_name, version_link, media_type='application/pdf')

                # we'll use latest bill_version for everything else
                bill_version = data['items'][0]
                bill.add_source(bill_api_url)

                # subjects
                for subj in bill_version["subjectindexes"]:
                    try:
                        bill.add_subject(subj["primary"])
                    except KeyError:
                        pass
                    try:
                        secondary_subj = subj["secondary"]
                    except KeyError:
                        secondary_subj = ""
                    if secondary_subj:
                        bill.add_subject(secondary_subj)

                # sponsors
                sponsors = bill_version["sponsors"]
                for sponsor in sponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                                        sponsor_name,
                                        classification='primary',
                                        entity_type='person',
                                        primary=True
                        )

                cosponsors = bill_version["cosponsors"]
                for sponsor in cosponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                                         sponsor_name,
                                         classification='cosponsor',
                                         entity_type='person',
                                         primary=False,
                        )

                try:
                    action_doc = self.get(base_url+bill_version["action"][0]["link"])
                except scrapelib.HTTPError:
                    pass
                else:

                    actions = action_doc.json()
                    for action in reversed(actions["items"]):
                        actor = chamber_dict[action["chamber"]]
                        action_desc = action["description"]
                        try:
                            action_type = action_dict[action["actioncode"]]
                        except KeyError:
                            self.warning("Unknown action {desc} with code {code}."
                                         " Add it to the action_dict"
                                         ".".format(desc=action_desc,
                                                    code=action["actioncode"]))
                            action_type = None

                        date = self._tz.localize(datetime.datetime.strptime(
                                                 action["datetime"],
                                                 "%Y-%m-%dT%H:%M:%S"))
                        date = "{:%Y-%m-%d}".format(date)

                        bill.add_action(action_desc,
                                        date, chamber=actor,
                                        classification=action_type)

                # attach documents gathered earlier
                self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
                self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
                self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
                self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

                # votes
                vote_url = base_url+bill_version["votes"][0]["link"]
                vote_doc = self.get(vote_url)
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url,
                                             base_url, bill, legislators,
                                             chamber_dict, vote_results)

                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning("Vote page not "
                                 "loading; skipping: {}".format(vote_url))
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url,
                                             base_url, bill, legislators,
                                             chamber_dict, vote_results)

                if data["items"][0]["effective_date"]:
                    effective_date = datetime.datetime.strptime(data["items"][0]["effective_date"],
                                                                "%Y-%m-%d")
                    effective_date = self._tz.localize(effective_date)
                    # the OH website adds an action that isn't in the action list JSON.
                    # It looks like:
                    # Effective 7/6/18
                    effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                    effective_action = "Effective {}".format(effective_date_oh)
                    bill.add_action(effective_action,
                                    effective_date,
                                    chamber="executive",
                                    classification=["became-law"])

                # we have never seen a veto or a disapprove, but they seem important.
                # so we'll check and throw an error if we find one
                # life is fragile. so are our scrapers.
                if "veto" in bill_version:
                    veto_url = base_url+bill_version["veto"][0]["link"]
                    veto_json = self.get(veto_url).json()
                    if len(veto_json["items"]) > 0:
                        raise AssertionError("Whoa, a veto! We've never"
                                             " gotten one before."
                                             " Go write some code to deal"
                                             " with it: {}".format(veto_url))

                if "disapprove" in bill_version:
                    disapprove_url = base_url+bill_version["disapprove"][0]["link"]
                    disapprove_json = self.get(disapprove_url).json()
                    if len(disapprove_json["items"]) > 0:
                        raise AssertionError("Whoa, a disapprove! We've never"
                                             " gotten one before."
                                             " Go write some code to deal "
                                             "with it: {}".format(disapprove_url))

                yield bill
    def scrape_bill(self, chamber, session, bill_id, title, url):
        page = self.lxmlize(url)

        if re.match(r'^(S|H)B ', bill_id):
            btype = ['bill']
        elif re.match(r'(S|H)C ', bill_id):
            btype = ['commemoration']
        elif re.match(r'(S|H)JR ', bill_id):
            btype = ['joint resolution']
        elif re.match(r'(S|H)CR ', bill_id):
            btype = ['concurrent resolution']
        else:
            btype = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=btype
                    )
        bill.add_source(url)

        regex_ns = "http://exslt.org/regular-expressions"
        version_links = page.xpath(
            "//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
            namespaces={'re': regex_ns})
        for link in version_links:
            bill.add_version_link(
                                link.xpath('string()').strip(),
                                link.attrib['href'],
                                media_type='text/html',
                                on_duplicate='ignore'
                )

        sponsor_links = page.xpath(
            "//td[contains(@id, 'tdSponsors')]/a")
        for link in sponsor_links:
            bill.add_sponsorship(
                    link.text,
                    classification='primary',
                    primary=True,
                    entity_type='person'
                )

        actor = chamber
        use_row = False
        self.debug(bill_id)
        for row in page.xpath("//table[contains(@id, 'BillActions')]/tr"):

            if 'Date' in row.text_content() and 'Action' in row.text_content():
                use_row = True
                continue
            elif not use_row:
                continue

            action = row.xpath("string(td[2])").strip()

            atypes = []
            if action.startswith('First read'):
                atypes.append('introduction')
                atypes.append('reading-1')
            elif action.startswith('Signed by Governor'):
                atypes.append('executive-signature')
                actor = 'executive'

            match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                             action)
            if match:
                if match.group(1) in ['Senate',
                                      'House of Representatives']:
                    first = ''
                else:
                    first = 'committee-'
                if match.group(3).lower() == 'passed':
                    second = 'passage'
                elif match.group(3).lower() == 'failed':
                    second = 'failure'
                atypes.append("%s%s" % (first, second))

            if 'referred to' in action.lower():
                atypes.append('referral-committee')

            if 'Motion to amend, Passed Amendment' in action:
                atypes.append('amendment-introduction')
                atypes.append('amendment-passage')

            if 'Veto override, Passed' in action:
                atypes.append('veto-override-passage')
            elif 'Veto override, Failed' in action:
                atypes.append('veto-override-failure')

            if 'Delivered to the Governor' in action:
                atypes.append('executive-receipt')

            match = re.match("First read in (Senate|House)", action)
            if match:
                if match.group(1) == 'Senate':
                    actor = 'upper'
                else:
                    actor = 'lower'

            date = row.xpath("string(td[1])").strip()
            match = re.match('\d{2}/\d{2}/\d{4}', date)
            if not match:
                self.warning("Bad date: %s" % date)
                continue
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                yield from self.scrape_vote(bill, date, link.attrib['href'])

            bill.add_action(action, date, chamber=actor, classification=atypes)

        for link in page.xpath("//a[contains(@href, 'Keyword')]"):
            bill.add_subject(link.text.strip())

        yield bill
Exemplo n.º 44
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.
        """
        url = BILL_URL % (session, bill_id.replace(" ", ""))
        bill_page = self.get(url).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute(
            "http://legislature.idaho.gov/legislation/%s/" % session)
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(
            legislative_session=session,
            chamber=chamber,
            identifier=bill_id,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        for subject in self._subjects[bill_id.replace(" ", "")]:
            bill.add_subject(subject)

        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, "short title")

        # documents
        doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get("href")
            if "Engrossment" in name or "Bill Text" in name or "Amendment" in name:
                bill.add_version_link(note=name,
                                      url=href,
                                      media_type="application/pdf")
            else:
                bill.add_document_link(note=name,
                                       url=href,
                                       media_type="application/pdf")

        def _split(string):
            return re.split(r"\w+[,|AND]\s+", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split("by")
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                if "COMMITTEE" in sponsors.upper():
                    bill.add_sponsorship(
                        name=sponsors.strip(),
                        entity_type="organization",
                        primary=True,
                        classification="primary",
                    )
                else:
                    for person in _split(sponsors):
                        person = person.strip()
                        if person != "":
                            bill.add_sponsorship(
                                classification="primary",
                                name=person,
                                entity_type="person",
                                primary=True,
                            )

        actor = chamber
        last_date = None
        # if a bill has passed a chamber or been 'received from'
        # then the next committee passage is in the opposite chamber
        has_moved_chambers = False
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            if date:
                last_date = date
            else:
                date = last_date
            date = datetime.datetime.strptime(date + "/" + session[0:4],
                                              "%m/%d/%Y").strftime("%Y-%m-%d")
            if action.startswith("House"):
                actor = "lower"
            elif action.startswith("Senate"):
                actor = "upper"

            # votes
            if "AYES" in action or "NAYS" in action:
                yield from self.parse_vote(actor, date, row[2], session,
                                           bill_id, chamber, url)
                # bill.add_vote_event(vote)
            # some td's text is seperated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u"\xa0", " ").strip()
            atype = get_action(actor, action)
            if atype and "passage" in atype:
                has_moved_chambers = True

            if atype and "committee-passage" in atype and has_moved_chambers:
                actor = _OTHER_CHAMBERS[actor]

            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if "to House" in action:
                actor = "lower"
            elif "to Senate" in action:
                actor = "upper"
        yield bill
Exemplo n.º 45
0
    def scrape(self):
        three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
        for matter in self.matters(three_days_ago):
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            legistar_web = self.legislation_detail_url(matter_id)
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Exemplo n.º 46
0
    def scrape(self, window=28, matter_ids=None):
        '''By default, scrape board reports updated in the last 28 days.
        Optionally specify a larger or smaller window of time from which to
        scrape updates, or specific matters to scrape.
        Note that passing a value for :matter_ids supercedes the value of
        :window, such that the given matters will be scraped regardless of
        when they were updated.
        
        Optional parameters
        :window (numeric) - Amount of time for which to scrape updates, e.g.
        a window of 7 will scrape legislation updated in the last week. Pass
        a window of 0 to scrape all legislation.
        :matter_ids (str) - Comma-separated list of matter IDs to scrape
        '''

        if matter_ids:
            matters = [
                self.matter(matter_id) for matter_id in matter_ids.split(',')
            ]
            matters = filter(
                None, matters)  # Skip matters that are not yet in Legistar
        elif float(window):  # Support for partial days, i.e., window=0.15
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
            matters = self.matters(n_days_ago)
        else:
            # Scrape all matters, including those without a last-modified date
            matters = self.matters()

        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        for matter in matters:
            # If this Boolean field is True, then do not scrape the Bill.
            # This issue explains why a restricted Bill might appear (unwelcome) in the Legistar API:
            # https://github.com/datamade/la-metro-councilmatic/issues/345#issuecomment-421184826
            if matter['MatterRestrictViewViaWeb']:
                continue

            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            legistar_web = matter['legistar_url']

            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill.
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint(
                        '/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(
                        identifier=identifier,
                        legislative_session=related_bill_session,
                        relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Exemplo n.º 47
0
    def scrape(self, session=None, chambers=None):
        # Bills endpoint can sometimes take a very long time to load
        self.timeout = 300

        if not session:
            session = self.latest_session()
            self.info('no session, using %s', session)

        if int(session) < 128:
            raise AssertionError("No data for period {}".format(session))

        elif int(session) < 131:
            # they changed their data format starting in 131st and added
            # an undocumented API
            yield from self.old_scrape(session)

        else:
            chamber_dict = {"Senate": "upper", "House": "lower",
                            "House of Representatives": "lower",
                            "house": "lower", "senate": "upper"}

            # so presumanbly not everything passes, but we haven't
            # seen anything not pass yet, so we'll need to wait
            # till it fails and get the right language in here
            vote_results = {"approved": True,
                            "passed": True,
                            "adopted": True,
                            "true": True,
                            "false": False,
                            "failed": False,
                            True: True,
                            False: False}

            action_dict = {"ref_ctte_100": "referral-committee",
                           "intro_100": "introduction",
                           "pass_300": "passage",
                           "intro_110": "reading-1",
                           "refer_210": "referral-committee",
                           "crpt_301": None,
                           "crpt_317": None,
                           "concur_606": "passage",
                           "pass_301": "passage",
                           "refer_220": "referral-committee",
                           "intro_102": ["introduction", "passage"],
                           "intro_105": ["introduction", "passage"],
                           "intro_ref_ctte_100": "referral-committee",
                           "refer_209": None,
                           "intro_108": ["introduction", "passage"],
                           "intro_103": ["introduction", "passage"],
                           "msg_reso_503": "passage",
                           "intro_107": ["introduction", "passage"],
                           "imm_consid_360": "passage",
                           "refer_213": None,
                           "adopt_reso_100": "passage",
                           "msg_507": "amendment-passage",
                           "confer_713": None,
                           "concur_603": None,
                           "confer_712": None,
                           "msg_506": "amendment-failure",
                           "receive_message_100": "passage",
                           "motion_920": None,
                           "concur_611": None,
                           "confer_735": None
                           }

            base_url = "http://search-prod.lis.state.oh.us"
            first_page = base_url
            first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
            legislators = self.get_legislator_ids(first_page)
            all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
            all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
            all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
            all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

            for row in self.get_bill_rows(session):
                number_link, ga, title, primary_sponsor, status = row.xpath('td')

                bill_id = number_link.text_content()
                title = title.text_content().strip()
                chamber = 'lower' if 'H' in bill_id else 'upper'
                classification = 'bill' if 'B' in bill_id else 'resolution'

                bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                            title=title, classification=classification)
                bill.add_source(number_link.xpath('a/@href')[0])

                # get bill from API
                bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                                'general_assembly_{}/{}/{}/'.format(
                                    session,
                                    'bills' if 'B' in bill_id else 'resolutions',
                                    bill_id.lower().replace(' ', '')
                                ))
                data = self.get(bill_api_url).json()

                # add title if no short title
                if not bill.title:
                    bill.title = data['items'][0]['longtitle']
                bill.add_title(data['items'][0]['longtitle'], 'long title')

                # this stuff is version-specific
                for version in data['items']:
                    version_name = version["version"]
                    version_link = base_url+version["pdfDownloadLink"]
                    bill.add_version_link(version_name, version_link, media_type='application/pdf')

                # we'll use latest bill_version for everything else
                bill_version = data['items'][0]
                bill.add_source(bill_api_url)

                # subjects
                for subj in bill_version["subjectindexes"]:
                    try:
                        bill.add_subject(subj["primary"])
                    except KeyError:
                        pass
                    try:
                        secondary_subj = subj["secondary"]
                    except KeyError:
                        secondary_subj = ""
                    if secondary_subj:
                        bill.add_subject(secondary_subj)

                # sponsors
                sponsors = bill_version["sponsors"]
                for sponsor in sponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                                        sponsor_name,
                                        classification='primary',
                                        entity_type='person',
                                        primary=True
                        )

                cosponsors = bill_version["cosponsors"]
                for sponsor in cosponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                                         sponsor_name,
                                         classification='cosponsor',
                                         entity_type='person',
                                         primary=False,
                        )

                try:
                    action_doc = self.get(base_url+bill_version["action"][0]["link"])
                except scrapelib.HTTPError:
                    pass
                else:

                    actions = action_doc.json()
                    for action in reversed(actions["items"]):
                        actor = chamber_dict[action["chamber"]]
                        action_desc = action["description"]
                        try:
                            action_type = action_dict[action["actioncode"]]
                        except KeyError:
                            self.warning("Unknown action {desc} with code {code}."
                                         " Add it to the action_dict"
                                         ".".format(desc=action_desc,
                                                    code=action["actioncode"]))
                            action_type = None

                        date = self._tz.localize(datetime.datetime.strptime(
                                                 action["datetime"],
                                                 "%Y-%m-%dT%H:%M:%S"))
                        date = "{:%Y-%m-%d}".format(date)

                        bill.add_action(action_desc,
                                        date, chamber=actor,
                                        classification=action_type)

                # attach documents gathered earlier
                self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
                self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
                self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
                self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

                # votes
                vote_url = base_url+bill_version["votes"][0]["link"]
                vote_doc = self.get(vote_url)
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url,
                                             base_url, bill, legislators,
                                             chamber_dict, vote_results)

                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning("Vote page not "
                                 "loading; skipping: {}".format(vote_url))
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url,
                                             base_url, bill, legislators,
                                             chamber_dict, vote_results)

                # we have never seen a veto or a disapprove, but they seem important.
                # so we'll check and throw an error if we find one
                # life is fragile. so are our scrapers.
                if "veto" in bill_version:
                    veto_url = base_url+bill_version["veto"][0]["link"]
                    veto_json = self.get(veto_url).json()
                    if len(veto_json["items"]) > 0:
                        raise AssertionError("Whoa, a veto! We've never"
                                             " gotten one before."
                                             " Go write some code to deal"
                                             " with it: {}".format(veto_url))

                if "disapprove" in bill_version:
                    disapprove_url = base_url+bill_version["disapprove"][0]["link"]
                    disapprove_json = self.get(disapprove_url).json()
                    if len(disapprove_json["items"]) > 0:
                        raise AssertionError("Whoa, a disapprove! We've never"
                                             " gotten one before."
                                             " Go write some code to deal "
                                             "with it: {}".format(disapprove_url))

                yield bill
Exemplo n.º 48
0
    def get_bill(self, matter):
        '''Make Bill object from given matter.'''
        '''
        Currently, NYC Legistar does not have conventional "Types" for 
        three newly added committees: https://legistar.council.nyc.gov/Departments.aspx
        We communicated the issue to NYC, and until we learn more, we will
        skip the bills attached to those committees.
        '''
        orgs_without_type = [
            'Charter Revision Commission 2019',
            'New York City Advisory Commission on Property Tax Reform',
            'Democratic Conference of the Council of the City of New York'
        ]
        if matter['MatterBodyName'].strip() in orgs_without_type:
            return None

        matter_id = matter['MatterId']
        if matter_id in DUPLICATED_ACTIONS:
            return None

        date = matter['MatterIntroDate']
        title = matter['MatterName']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            return None

        leg_type = BILL_TYPES[matter['MatterTypeName']]

        bill_session = self.sessions(self.toTime(date))

        bill = Bill(identifier=identifier,
                    title=title,
                    classification=leg_type,
                    legislative_session=bill_session,
                    from_organization={"name": "New York City Council"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        if matter['MatterTitle']:
            bill.add_title(matter['MatterTitle'])

        if matter['MatterEXText5']:
            bill.add_abstract(matter['MatterEXText5'], note='')

        try:
            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        for attachment in self.attachments(matter_id):

            if attachment['MatterAttachmentId'] == 103315:  # Duplicate
                return None

            if attachment['MatterAttachmentName']:
                bill.add_document_link(attachment['MatterAttachmentName'],
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type='application/pdf')

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                related_bill = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                return None
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(identifier=identifier,
                                      legislative_session=related_bill_session,
                                      relation_type='companion')

        try:
            text = self.text(matter_id)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        bill.extras['local_classification'] = matter['MatterTypeName']

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain'].replace(
                    u'\u0000', '')

            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        return bill
Exemplo n.º 49
0
    def scrape(self):
        session_name = self.latest_session()
        session = session_name[0:5]
        self._bill_prefix_map = {
            'HB':  {
                'type': 'bill',
                'url_segment': 'bills/house',
            },
            'HR':  {
                'type': 'resolution',
                'url_segment': 'resolutions/house/simple',
            },
            'HCR': {
                'type': 'concurrent resolution',
                'url_segment': 'resolutions/house/concurrent',
            },
            'HJR': {
                'type': 'joint resolution',
                'url_segment': 'resolutions/house/joint'
            },
            'HC': {
                'type': 'concurrent resolution',
                'url_segment': 'resolutions/house/concurrent',
            },
            'HJ': {
                'type': 'joint resolution',
                'url_segment': 'resolutions/house/joint',
            },
            'SB': {
                'type': 'bill',
                'url_segment': 'bills/senate',
            },
            'SR': {
                'type': 'resolution',
                'url_segment': 'resolutions/senate/simple',
            },
            'SCR': {
                'type': 'concurrent resolution',
                'url_segment': 'resolutions/senate/concurrent',
            },
            'SJR': {
                'type': 'joint resolution',
                'url_segment': 'resolutions/senate/joint',
            },
            'SC': {
                'type': 'concurrent resolution',
                'url_segment': 'resolutions/senate/concurrent',
            },
            'SJ': {
                'type': 'joint resolution',
                'url_segment': 'resolutions/senate/joint',
            },
        }

        api_base_url = "https://api.iga.in.gov"
        proxy = {"url": "http://in-proxy.openstates.org"}

        # ah, indiana. it's really, really hard to find
        # pdfs in their web interface. Super easy with
        # the api, but a key needs to be passed
        # in the headers. To make these documents
        # viewable to the public and our scrapers,
        # sunlight's put up a proxy service at this link
        # using our api key for pdf document access.

        client = ApiClient(self)
        r = client.get("bills", session=session)
        all_pages = client.unpaginate(r)
        for b in all_pages:
            bill_id = b["billName"]
            for idx, char in enumerate(bill_id):
                try:
                    int(char)
                except ValueError:
                    continue
                disp_bill_id = bill_id[:idx]+" "+str(int(bill_id[idx:]))
                break

            bill_link = b["link"]
            api_source = api_base_url + bill_link
            try:
                bill_json = client.get("bill", session=session, bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning('Bill could not be accessed. Skipping.')
                continue

            title = bill_json["title"]
            if title == "NoneNone":
                title = None
            # sometimes title is blank
            # if that's the case, we can check to see if
            # the latest version has a short description
            if not title:
                title = bill_json["latestVersion"]["shortDescription"]

            # and if that doesn't work, use the bill_id but throw a warning
            if not title:
                title = bill_id
                self.logger.warning("Bill is missing a title, using bill id instead.")

            bill_prefix = self._get_bill_id_components(bill_id)[0]

            original_chamber = ("lower" if bill_json["originChamber"].lower() == "house"
                                else "upper")
            bill_type = self._bill_prefix_map[bill_prefix]['type']
            bill = Bill(disp_bill_id,
                        legislative_session=session,
                        chamber=original_chamber,
                        title=title,
                        classification=bill_type)

            bill.add_source(self._get_bill_url(session, bill_id))
            bill.add_source(api_source)

            # sponsors
            for s in bill_json["authors"]:
                bill.add_sponsorship(classification="author",
                                     name=self._get_name(s),
                                     entity_type='person',
                                     primary=True)

            for s in bill_json["coauthors"]:
                bill.add_sponsorship(classification="coauthor",
                                     name=self._get_name(s),
                                     entity_type='person',
                                     primary=False)

            for s in bill_json["sponsors"]:
                bill.add_sponsorship(classification="sponsor",
                                     name=self._get_name(s),
                                     entity_type='person', primary=True)

            for s in bill_json["cosponsors"]:
                bill.add_sponsorship(classification="cosponsor",
                                     name=self._get_name(s),
                                     entity_type='person',
                                     primary=False)

            # actions
            action_link = bill_json["actions"]["link"]
            api_source = api_base_url + action_link

            try:
                actions = client.get("bill_actions", session=session, bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning("Could not find bill actions page")
                actions = {"items": []}

            for a in actions["items"]:
                action_desc = a["description"]
                if "governor" in action_desc.lower():
                    action_chamber = "executive"
                elif a["chamber"]["name"].lower() == "house":
                    action_chamber = "lower"
                else:
                    action_chamber = "upper"
                date = a["date"]

                if not date:
                    self.logger.warning("Action has no date, skipping")
                    continue

                # convert time to pupa fuzzy time
                date = date.replace('T', ' ')
                # TODO: if we update pupa to accept datetimes we can drop this line
                date = date.split()[0]

                action_type = []
                d = action_desc.lower()
                committee = None

                reading = False
                if "first reading" in d:
                    action_type.append("reading-1")
                    reading = True

                if ("second reading" in d or "reread second time" in d):
                    action_type.append("reading-2")
                    reading = True

                if ("third reading" in d or "reread third time" in d):
                    action_type.append("reading-3")
                    if "passed" in d:
                        action_type.append("passage")
                    if "failed" in d:
                        action_type.append("failure")
                    reading = True

                if "adopted" in d and reading:
                    action_type.append("passage")

                if ("referred" in d and "committee on" in d
                        or "reassigned" in d and "committee on" in d):
                    committee = d.split("committee on")[-1].strip()
                    action_type.append("referral-committee")

                if "committee report" in d:
                    if "pass" in d:
                        action_type.append("committee-passage")
                    if "fail" in d:
                        action_type.append("committee-failure")

                if "amendment" in d and "without amendment" not in d:
                    if "pass" in d or "prevail" in d or "adopted" in d:
                        action_type.append("amendment-passage")
                    if "fail" or "out of order" in d:
                        action_type.append("amendment-failure")
                    if "withdraw" in d:
                        action_type.append("amendment-withdrawal")

                if "signed by the governor" in d:
                    action_type.append("executive-signature")

                if len(action_type) == 0:
                    # calling it other and moving on with a warning
                    self.logger.warning("Could not recognize an action in '{}'".format(
                        action_desc))
                    action_type = None

                a = bill.add_action(chamber=action_chamber,
                                    description=action_desc,
                                    date=date,
                                    classification=action_type)
                if committee:
                    a.add_related_entity(committee, entity_type='organization')

            # subjects
            subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
            for subject in subjects:
                bill.add_subject(subject)

            # versions and votes
            for version in bill_json["versions"][::-1]:
                try:
                    version_json = client.get("bill_version",
                                              session=session,
                                              bill_id=version["billName"],
                                              version_id=version["printVersionName"])
                except scrapelib.HTTPError:
                    self.logger.warning("Bill version does not seem to exist.")
                    continue

                yield from self.deal_with_version(version_json, bill, bill_id,
                                                  original_chamber, session, proxy)

            yield bill
Exemplo n.º 50
0
    def scrape(self, session=None, chambers=None):
        # Bills endpoint can sometimes take a very long time to load
        self.timeout = 300

        if not session:
            session = self.latest_session()
            self.info('no session, using %s', session)

        if int(session) < 128:
            raise AssertionError("No data for period {}".format(session))

        elif int(session) < 131:
            # they changed their data format starting in 131st and added
            # an undocumented API
            yield from self.old_scrape(session)

        else:
            chamber_dict = {"Senate": "upper", "House": "lower",
                            "House of Representatives": "lower",
                            "house": "lower", "senate": "upper"}

            # so presumanbly not everything passes, but we haven't
            # seen anything not pass yet, so we'll need to wait
            # till it fails and get the right language in here
            vote_results = {"approved": True,
                            "passed": True,
                            "adopted": True,
                            "true": True,
                            "false": False,
                            "failed": False,
                            True: True,
                            False: False}

            action_dict = {"ref_ctte_100": "referral-committee",
                           "intro_100": "introduction",
                           "pass_300": "passage",
                           "intro_110": "reading-1",
                           "refer_210": "referral-committee",
                           "crpt_301": None,
                           "crpt_317": None,
                           "concur_606": "passage",
                           "pass_301": "passage",
                           "refer_220": "referral-committee",
                           "intro_102": ["introduction", "passage"],
                           "intro_105": ["introduction", "passage"],
                           "intro_ref_ctte_100": "referral-committee",
                           "refer_209": None,
                           "intro_108": ["introduction", "passage"],
                           "intro_103": ["introduction", "passage"],
                           "msg_reso_503": "passage",
                           "intro_107": ["introduction", "passage"],
                           "imm_consid_360": "passage",
                           "refer_213": None,
                           "adopt_reso_100": "passage",
                           "msg_507": "amendment-passage",
                           "confer_713": None,
                           "concur_603": None,
                           "confer_712": None,
                           "msg_506": "amendment-failure",
                           "receive_message_100": "passage",
                           "motion_920": None,
                           "concur_611": None,
                           "confer_735": None
                           }

            base_url = "http://search-prod.lis.state.oh.us"
            first_page = base_url
            first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
            legislators = self.get_legislator_ids(first_page)
            all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
            all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
            all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
            all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")
            doc_types = ["bills", "resolutions"]
            for doc_type in doc_types:
                bill_versions = {}
                for doc in self.pages(base_url, first_page+doc_type):
                    for v in doc["items"]:
                        # bills is actually a list of versions
                        # going to create a dictionary as follows:
                        # key=bill_id
                        # value = dict of all versions, where keys are versionids
                        # and values are the bill data from the api.
                        # then loop through that to avoid duplicate bills

                        try:
                            bill_id = v["number"]
                        except KeyError:
                            self.warning("Apparent bill has no information:\n{}".format(v))
                            continue

                        version_id = v["versionid"]
                        if bill_id in bill_versions:
                            if version_id in bill_versions[bill_id]:
                                self.logger.warning("There are two versions of {bill_id}"
                                                    " called the same thing."
                                                    " Bad news bears!".format(bill_id=bill_id))
                            else:
                                bill_versions[bill_id][version_id] = v
                        else:
                            bill_versions[bill_id] = {}
                            bill_versions[bill_id][version_id] = v

                for b in bill_versions:
                    bill = None
                    for bill_version in bill_versions[b].values():
                        if not bill:
                            bill_id = bill_version["number"]
                            title = bill_version["shorttitle"] or bill_version["longtitle"]
                            title = title.strip()

                            if not len(title):
                                self.warning("Missing title for {bill_id}".format(bill_id=bill_id))
                                next

                            chamber = "lower" if "h" in bill_id else "upper"

                            subjects = []
                            for subj in bill_version["subjectindexes"]:
                                try:
                                    subjects.append(subj["primary"])
                                except KeyError:
                                    pass
                                try:
                                    secondary_subj = subj["secondary"]
                                except KeyError:
                                    secondary_subj = ""
                                if secondary_subj:
                                    subjects.append(secondary_subj)

                            # they use bill id of format HB 1 on the site
                            # but hb1 in the API.

                            for idx, char in enumerate(bill_id):
                                try:
                                    int(char)
                                except ValueError:
                                    continue

                                display_id = bill_id[:idx]+" "+bill_id[idx:]
                                break
                            classification = {'bills': 'bill',
                                              'resolutions': 'resolution'}[doc_type]
                            bill = Bill(display_id.upper(),
                                        legislative_session=session,
                                        chamber=chamber,
                                        title=title,
                                        classification=classification
                                        )

                            for subject in subjects:
                                bill.add_subject(subject)
                            # this stuff is the same for all versions

                            bill.add_source(first_page+doc_type+"/"+bill_id)

                            sponsors = bill_version["sponsors"]
                            for sponsor in sponsors:
                                sponsor_name = self.get_sponsor_name(sponsor)
                                bill.add_sponsorship(
                                                    sponsor_name,
                                                    classification='primary',
                                                    entity_type='person',
                                                    primary=True
                                    )

                            cosponsors = bill_version["cosponsors"]
                            for sponsor in cosponsors:
                                sponsor_name = self.get_sponsor_name(sponsor)
                                bill.add_sponsorship(
                                                     sponsor_name,
                                                     classification='cosponsor',
                                                     entity_type='person',
                                                     primary=False,
                                    )
                            try:
                                action_doc = self.get(base_url+bill_version["action"][0]["link"])
                            except scrapelib.HTTPError:
                                pass
                            else:

                                actions = action_doc.json()
                                for action in reversed(actions["items"]):
                                    actor = chamber_dict[action["chamber"]]
                                    action_desc = action["description"]
                                    try:
                                        action_type = action_dict[action["actioncode"]]
                                    except KeyError:
                                        self.warning("Unknown action {desc} with code {code}."
                                                     " Add it to the action_dict"
                                                     ".".format(desc=action_desc,
                                                                code=action["actioncode"]))
                                        action_type = None

                                    date = self._tz.localize(datetime.datetime.strptime(
                                                             action["datetime"],
                                                             "%Y-%m-%dT%H:%M:%S"))
                                    date = "{:%Y-%m-%d}".format(date)

                                    bill.add_action(action_desc,
                                                    date, chamber=actor,
                                                    classification=action_type)
                            self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
                            self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
                            self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
                            self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

                            vote_url = base_url+bill_version["votes"][0]["link"]
                            vote_doc = self.get(vote_url)
                            votes = vote_doc.json()
                            yield from self.process_vote(votes, vote_url,
                                                         base_url, bill, legislators,
                                                         chamber_dict, vote_results)

                            vote_url = base_url
                            vote_url += bill_version["cmtevotes"][0]["link"]
                            try:
                                vote_doc = self.get(vote_url)
                            except scrapelib.HTTPError:
                                self.warning("Vote page not "
                                             "loading; skipping: {}".format(vote_url))
                                continue
                            votes = vote_doc.json()
                            yield from self.process_vote(votes, vote_url,
                                                         base_url, bill, legislators,
                                                         chamber_dict, vote_results)

                            # we have never seen a veto or a disapprove, but they seem important.
                            # so we'll check and throw an error if we find one
                            # life is fragile. so are our scrapers.
                            if "veto" in bill_version:
                                veto_url = base_url+bill_version["veto"][0]["link"]
                                veto_json = self.get(veto_url).json()
                                if len(veto_json["items"]) > 0:
                                    raise AssertionError("Whoa, a veto! We've never"
                                                         " gotten one before."
                                                         " Go write some code to deal"
                                                         " with it: {}".format(veto_url))

                            if "disapprove" in bill_version:
                                disapprove_url = base_url+bill_version["disapprove"][0]["link"]
                                disapprove_json = self.get(disapprove_url).json()
                                if len(disapprove_json["items"]) > 0:
                                    raise AssertionError("Whoa, a disapprove! We've never"
                                                         " gotten one before."
                                                         " Go write some code to deal "
                                                         "with it: {}".format(disapprove_url))

                        # this stuff is version-specific
                        version_name = bill_version["version"]
                        version_link = base_url+bill_version["pdfDownloadLink"]
                        if version_link.endswith("pdf"):
                            mimetype = "application/pdf"
                        else:
                            mimetype = "application/octet-stream"
                        bill.add_version_link(version_name, version_link, media_type=mimetype)
                    # Need to sort bill actions, since they may be jumbled

                    yield bill