Пример #1
0
    def prepare_for_db(self, data):
        """Replace scraped references on a vote event with database ids.

        The ``legislative_session``, ``organization``, ``bill`` and
        ``bill_action`` entries are popped from ``data``; the resolved
        ``legislative_session_id``, ``organization_id`` and ``bill_id`` are
        stored instead.  ``bill_action_id`` is only set when the
        ``BillAction`` lookup returns a single action that no other vote
        event has claimed.  Every entry of ``data['votes']`` gets its
        ``voter_id`` resolved with ``allow_no_match=True``.

        Returns the same ``data`` dict, mutated in place.
        """
        session = data.pop('legislative_session')
        data['legislative_session_id'] = self.get_session_id(session)
        organization = data.pop('organization')
        data['organization_id'] = self.org_importer.resolve_json_id(
            organization)

        bill = data.pop('bill')
        if bill and bill.startswith('~'):
            # Pseudo id: unpack it, let the bill importer's transformers
            # adjust the fields, then pack it back up before resolving.
            pseudo_fields = get_pseudo_id(bill)
            self.bill_importer.apply_transformers(pseudo_fields)
            bill = _make_pseudo_id(**pseudo_fields)
        data['bill_id'] = self.bill_importer.resolve_json_id(bill)

        bill_action = data.pop('bill_action')
        if bill_action:
            try:
                action = BillAction.objects.get(
                    bill_id=data['bill_id'],
                    description=bill_action,
                    date=data['start_date'],
                    organization_id=data['organization_id'],
                )
            except BillAction.DoesNotExist:
                self.warning('could not match VoteEvent to %s %s %s',
                             bill, bill_action, data['start_date'])
            except BillAction.MultipleObjectsReturned as e:
                self.warning('could not match VoteEvent to %s %s %s: %s',
                             bill, bill_action, data['start_date'], e)
            else:
                # seen_action_ids tracks actions claimed during this import;
                # an existing ``vote`` attribute means the action was
                # already claimed on a prior import.
                already_claimed = (action.id in self.seen_action_ids
                                   or hasattr(action, 'vote'))
                if already_claimed:
                    self.warning('can not match two VoteEvents to %s: %s',
                                 action.id, bill_action)
                else:
                    data['bill_action_id'] = action.id
                    self.seen_action_ids.add(action.id)

        for vote in data['votes']:
            voter = vote['voter_id']
            vote['voter_id'] = self.person_importer.resolve_json_id(
                voter, allow_no_match=True)
        return data
Пример #2
0
    def prepare_for_db(self, data):
        """Resolve the references held by a scraped event before saving it.

        Sets ``jurisdiction_id``, converts ``location`` with
        ``self.get_location`` and defaults ``end_date`` to ``""``.  Each
        participant and each agenda item's related entity has the first id
        key it carries (checked in the order person, organization, bill,
        vote event; participants only person and organization) resolved
        through the matching importer with ``allow_no_match=True``.

        Returns the same ``data`` dict, mutated in place.
        """
        data['jurisdiction_id'] = self.jurisdiction_id
        data['location'] = self.get_location(data['location'])

        # Self-assignment kept on purpose: it raises KeyError when the
        # scraped event has no start_date.
        data['start_date'] = data['start_date']
        data['end_date'] = data.get('end_date', "")

        for participant in data['participants']:
            if 'person_id' in participant:
                key, importer = 'person_id', self.person_importer
            elif 'organization_id' in participant:
                key, importer = 'organization_id', self.org_importer
            else:
                continue
            participant[key] = importer.resolve_json_id(
                participant[key], allow_no_match=True)

        for item in data['agenda']:
            for entity in item['related_entities']:
                if 'person_id' in entity:
                    key, importer = 'person_id', self.person_importer
                    json_id = entity[key]
                elif 'organization_id' in entity:
                    key, importer = 'organization_id', self.org_importer
                    json_id = entity[key]
                elif 'bill_id' in entity:
                    # Unpack and repack the bill pseudo id in case the bill
                    # importer's transformers alter it.
                    key = 'bill_id'
                    bill_fields = get_pseudo_id(entity[key])
                    importer = self.bill_importer
                    importer.apply_transformers(bill_fields)
                    json_id = _make_pseudo_id(**bill_fields)
                elif 'vote_event_id' in entity:
                    key, importer = 'vote_event_id', self.vote_event_importer
                    json_id = entity[key]
                else:
                    continue
                entity[key] = importer.resolve_json_id(
                    json_id, allow_no_match=True)

        return data
Пример #3
0
    def prepare_for_db(self, data):
        """Resolve the references held by a scraped event before saving it.

        Sets ``jurisdiction_id``, converts a truthy ``location`` with
        ``self.get_location`` (a falsy one is left untouched) and defaults
        ``end_date`` to ``""``.  Participants and agenda related entities
        have the first id key they carry resolved through the matching
        importer with ``allow_no_match=True``.

        Returns the same ``data`` dict, mutated in place.
        """
        data['jurisdiction_id'] = self.jurisdiction_id
        location = data['location']
        if location:
            data['location'] = self.get_location(location)

        # Self-assignment kept on purpose: it raises KeyError when the
        # scraped event has no start_date.
        data['start_date'] = data['start_date']
        data['end_date'] = data.get('end_date', "")

        participant_importers = (
            ('person_id', 'person_importer'),
            ('organization_id', 'org_importer'),
        )
        for participant in data['participants']:
            # Only the first id key present is resolved, person first.
            for key, importer_attr in participant_importers:
                if key in participant:
                    importer = getattr(self, importer_attr)
                    participant[key] = importer.resolve_json_id(
                        participant[key], allow_no_match=True)
                    break

        for item in data['agenda']:
            for entity in item['related_entities']:
                if 'person_id' in entity:
                    key, importer = 'person_id', self.person_importer
                    json_id = entity[key]
                elif 'organization_id' in entity:
                    key, importer = 'organization_id', self.org_importer
                    json_id = entity[key]
                elif 'bill_id' in entity:
                    # Unpack and repack the bill pseudo id in case the bill
                    # importer's transformers alter it.
                    key = 'bill_id'
                    bill_fields = get_pseudo_id(entity[key])
                    importer = self.bill_importer
                    importer.apply_transformers(bill_fields)
                    json_id = _make_pseudo_id(**bill_fields)
                elif 'vote_event_id' in entity:
                    key, importer = 'vote_event_id', self.vote_event_importer
                    json_id = entity[key]
                else:
                    continue
                entity[key] = importer.resolve_json_id(
                    json_id, allow_no_match=True)

        return data
Пример #4
0
    def prepare_for_db(self, data):
        """Turn the scraped references of a vote event into database ids.

        Pops ``legislative_session``, ``organization``, ``bill`` and
        ``bill_action`` from ``data`` and writes ``legislative_session_id``,
        ``organization_id`` and ``bill_id``.  When ``bill_action`` is set, a
        ``BillAction`` with the same bill, description, date and
        organization is looked up; its id becomes ``bill_action_id`` unless
        another vote event already claimed it.  Lookup failures only emit a
        warning.  Voter ids in ``data['votes']`` are resolved with
        ``allow_no_match=True``.

        Returns the same ``data`` dict, mutated in place.
        """
        session_name = data.pop('legislative_session')
        data['legislative_session_id'] = self.get_session_id(session_name)
        org_ref = data.pop('organization')
        data['organization_id'] = self.org_importer.resolve_json_id(org_ref)

        bill = data.pop('bill')
        is_pseudo_id = bool(bill) and bill.startswith('~')
        if is_pseudo_id:
            # Unpack the pseudo id and run the bill importer's transformers
            # over it in case any of them alter it, then repack.
            unpacked = get_pseudo_id(bill)
            self.bill_importer.apply_transformers(unpacked)
            bill = _make_pseudo_id(**unpacked)

        data['bill_id'] = self.bill_importer.resolve_json_id(bill)

        bill_action = data.pop('bill_action')
        if bill_action:
            lookup = {
                'bill_id': data['bill_id'],
                'description': bill_action,
                'date': data['start_date'],
                'organization_id': data['organization_id'],
            }
            try:
                action = BillAction.objects.get(**lookup)
                # seen_action_ids covers actions claimed in this import;
                # action.vote is already set if the action was claimed on a
                # prior import.
                claimed_now = action.id in self.seen_action_ids
                if claimed_now or hasattr(action, 'vote'):
                    self.warning('can not match two VoteEvents to %s: %s',
                                 action.id, bill_action)
                else:
                    data['bill_action_id'] = action.id
                    self.seen_action_ids.add(action.id)
            except BillAction.DoesNotExist:
                self.warning('could not match VoteEvent to %s %s %s',
                             bill, bill_action, data['start_date'])
            except BillAction.MultipleObjectsReturned as e:
                self.warning('could not match VoteEvent to %s %s %s: %s',
                             bill, bill_action, data['start_date'], e)

        for vote in data['votes']:
            resolved = self.person_importer.resolve_json_id(
                vote['voter_id'], allow_no_match=True)
            vote['voter_id'] = resolved
        return data
Пример #5
0
    def prepare_for_db(self, data):
        """Replace scraped references on a vote event with database ids.

        Pops ``legislative_session``, ``organization``, ``bill`` and
        ``bill_action`` from ``data`` and writes ``legislative_session_id``,
        ``organization_id`` and ``bill_id``.  A bill pseudo id has its
        ``identifier`` passed through ``fix_bill_id`` before it is resolved.
        ``bill_action_id`` is set when the ``BillAction`` lookup succeeds and
        the action has not been claimed earlier in this import.  Voter ids
        in ``data['votes']`` are resolved with ``allow_no_match=True``.

        Returns the same ``data`` dict, mutated in place.
        """
        session = data.pop('legislative_session')
        data['legislative_session_id'] = self.get_session_id(session)
        organization = data.pop('organization')
        data['organization_id'] = self.org_importer.resolve_json_id(
            organization)

        bill = data.pop('bill')
        if bill and bill.startswith('~'):
            # Normalise the identifier inside the pseudo id, then repack it.
            pseudo_fields = get_pseudo_id(bill)
            pseudo_fields['identifier'] = fix_bill_id(
                pseudo_fields['identifier'])
            bill = _make_pseudo_id(**pseudo_fields)
        data['bill_id'] = self.bill_importer.resolve_json_id(bill)

        bill_action = data.pop('bill_action')
        if bill_action:
            try:
                action = BillAction.objects.get(
                    bill_id=data['bill_id'],
                    description=bill_action,
                    date=data['start_date'],
                    organization_id=data['organization_id'],
                )
            except BillAction.DoesNotExist:
                self.warning('could not match VoteEvent to %s %s %s',
                             bill, bill_action, data['start_date'])
            except BillAction.MultipleObjectsReturned as e:
                self.warning('could not match VoteEvent to %s %s %s: %s',
                             bill, bill_action, data['start_date'], e)
            else:
                if action.id not in self.seen_action_ids:
                    data['bill_action_id'] = action.id
                    self.seen_action_ids.add(action.id)
                else:
                    self.warning('can not match two VoteEvents to %s: %s',
                                 action.id, bill_action)

        for vote in data['votes']:
            voter = vote['voter_id']
            vote['voter_id'] = self.person_importer.resolve_json_id(
                voter, allow_no_match=True)
        return data
Пример #6
0
    def prepare_for_db(self, data):
        """Resolve the references held by a scraped event before saving it.

        Sets ``jurisdiction_id``, converts ``location`` with
        ``self.get_location`` and defaults ``end_date`` to ``""``.
        Participants and agenda related entities have the first id key they
        carry resolved through the matching importer with
        ``allow_no_match=True``; bill pseudo ids get their ``identifier``
        passed through ``fix_bill_id`` first.

        Returns the same ``data`` dict, mutated in place.
        """
        data['jurisdiction_id'] = self.jurisdiction_id
        data['location'] = self.get_location(data['location'])

        # Self-assignment kept on purpose: it raises KeyError when the
        # scraped event has no start_date.
        data['start_date'] = data['start_date']
        data['end_date'] = data.get('end_date', "")

        for participant in data['participants']:
            if 'person_id' in participant:
                key, importer = 'person_id', self.person_importer
            elif 'organization_id' in participant:
                key, importer = 'organization_id', self.org_importer
            else:
                continue
            participant[key] = importer.resolve_json_id(
                participant[key], allow_no_match=True)

        for item in data['agenda']:
            for entity in item['related_entities']:
                if 'person_id' in entity:
                    key, importer = 'person_id', self.person_importer
                    json_id = entity[key]
                elif 'organization_id' in entity:
                    key, importer = 'organization_id', self.org_importer
                    json_id = entity[key]
                elif 'bill_id' in entity:
                    # Normalise the identifier inside the bill pseudo id
                    # and repack it before resolving.
                    key = 'bill_id'
                    bill_fields = get_pseudo_id(entity[key])
                    bill_fields['identifier'] = fix_bill_id(
                        bill_fields['identifier'])
                    json_id = _make_pseudo_id(**bill_fields)
                    importer = self.bill_importer
                elif 'vote_event_id' in entity:
                    key, importer = 'vote_event_id', self.vote_event_importer
                    json_id = entity[key]
                else:
                    continue
                entity[key] = importer.resolve_json_id(
                    json_id, allow_no_match=True)

        return data
Пример #7
0
    def scrape(self, start_time=None):
        """Yield an ``Event`` for each finalized congressional hearing.

        Hearings come from ``self.congressional_hearings(start_time)``.  For
        each one the MODS record is downloaded and turned into an event with
        host committees, sources and transcript documents.  Events are
        de-duplicated through ``self._unique_event`` and then handed to
        ``self._house_docs`` before being yielded.

        Package ids that do not end in a run of digits are skipped and are
        written, one per line, to ``bad_ids.txt`` after every event has been
        yielded.

        :param start_time: passed to ``self.congressional_hearings``;
            defaults to 2017-01-01 00:00 UTC.
        """
        if start_time is None:
            start_time = datetime.datetime(2017, 1, 1, 0, 0, tzinfo=pytz.utc)

        dupes = {}
        uniq = {}
        bad_ids = []

        for hearing in self.congressional_hearings(start_time):
            package_id = hearing['packageId']
            try:
                # Single-element unpacking raises ValueError when the id
                # does not end in digits.
                package_num, = re.findall(r'\d+$', package_id)
            except ValueError:
                bad_ids.append(package_id)
                continue
            # For appropriations hearings, the committees tend to
            # publish portions of the hearings as they are completed,
            # and then the final hearing are usually compiled,
            # printed, and added to the repository at the request of
            # the Committee.
            #
            # packages with 8 digits after hrg are the in-process
            # version
            #
            # There could be some time between the in-process and
            # final packages. Publication of hearings is the purview
            # of the committee.
            #
            # https://github.com/usgpo/api/issues/21#issuecomment-435926223
            if len(package_num) == 8:
                continue

            mods_link = hearing['download']['modsLink']
            response = self.get(mods_link)
            mods = xmltodict.parse(response.content)
            # Flatten the list of extension mappings into one lookup.
            extension = collections.ChainMap(*mods['mods']['extension'])

            granule_class = extension.get('granuleClass', 'boo')
            if granule_class == 'ERRATA':
                continue

            meeting_type = self._meeting_type(extension)
            if meeting_type is None:
                continue

            # heldDate may be a single value or a list; use the earliest.
            held_date = extension['heldDate']
            if isinstance(held_date, list):
                start_date = min(held_date)
            else:
                start_date = held_date

            event = Event(name=self._title(mods),
                          start_date=start_date,
                          classification=meeting_type,
                          location_name='unknown')
            if not event.name:
                continue

            if 'number' in extension:
                hearing_number = '{docClass} {congress}-{number}'.format(
                    **extension)
                event.extras['hearing_number'] = hearing_number

            for committee_d in self._unique(extension.get('congCommittee',
                                                          [])):
                names = committee_d['name']
                committee_name = self._name_type(names, 'authority-standard')
                if committee_name is None:
                    committee_name = self._name_type(names, 'authority-short')

                if committee_d['@chamber'] == 'H':
                    committee_name = 'House ' + committee_name
                elif committee_d['@chamber'] == 'S':
                    committee_name = 'Senate ' + committee_name

                try:
                    thomas_id = committee_d['@authorityId'].upper()
                except KeyError:
                    thomas_id = None

                sub_committees = self._subcommittees(committee_d)
                if sub_committees:
                    # Subcommittees host the hearing in place of the parent.
                    for sub_committee_d in sub_committees:
                        sub_committee_name = sub_committee_d['name']['#text']
                        sub_committee_name = sub_committee_name.strip(
                            string.punctuation)
                        sub_committee_id = _make_pseudo_id(
                            name=sub_committee_name,
                            parent__identifiers__identifier=thomas_id)
                        event.participants.append({
                            "name": sub_committee_name,
                            "entity_type": 'organization',
                            "note": 'host',
                            "organization_id": sub_committee_id,
                        })
                elif thomas_id:
                    event.participants.append({
                        "name": committee_name,
                        "entity_type": 'organization',
                        "note": 'host',
                        "organization_id":
                        _make_pseudo_id(identifiers__identifier=thomas_id),
                    })
                else:
                    event.add_committee(committee_name, note='host')

            links = mods['mods']['location']['url']
            for link in self._unique(links):
                label = link['@displayLabel']
                if label == 'Content Detail':
                    event.add_source(link['#text'], note='web')
                elif label == 'HTML rendition':
                    event.add_document('transcript',
                                       link['#text'],
                                       media_type='text/html')
                elif label == 'PDF rendition':
                    event.add_document('transcript',
                                       link['#text'],
                                       media_type='application/pdf')

            event.add_source(mods_link, note='API')

            self._unique_event(uniq, event, dupes)

        self._house_docs(uniq)

        for event in uniq.values():
            yield event

        with open('bad_ids.txt', 'w') as f:
            for bad_id in bad_ids:
                f.write(bad_id + '\n')
Пример #8
0
    def _house_docs(self, uniq):
        """Merge hearings scraped by ``HouseCommittee`` into ``uniq``.

        Each hearing is looked up in ``uniq`` under the key
        ``(upper-cased meeting title, meeting date)``.  A matching event gets
        its location replaced, a docs.house.gov source added and its
        documents attached through ``self._add_house_docs``.  When nothing
        matches, a new event is built with the hearing's subcommittees and
        committees as hosts and stored in ``uniq`` under that key.

        :param uniq: dict of events keyed as described above; mutated in
            place.
        :raises ValueError: if a hearing does not carry exactly one meeting
            title or exactly one meeting date.
        """
        house_scraper = HouseCommittee(
            cache_storage=self.cache_storage,
            requests_per_minute=self.requests_per_minute)

        for link, hearing_xml in house_scraper.scrape():
            meeting_title, = hearing_xml.xpath('//meeting-title/text()')
            start_date, = hearing_xml.xpath(
                '//meeting-date/calendar-date/text()')

            try:
                room, = hearing_xml.xpath('//room/text()')
            except ValueError:
                # Zero or several rooms: no usable location.
                location = 'unknown'
            else:
                location = '{} {}'.format(
                    hearing_xml.xpath('//building/text()')[0], room)

            meeting_title = meeting_title.upper()
            key = (meeting_title, start_date)
            event = uniq.get(key)

            if event is not None:
                event.location = {
                    "name": location,
                    "note": "",
                    "coordinates": None
                }
                event.add_source(link, note='docs.house.gov XML')
                self._add_house_docs(event, hearing_xml)
                continue

            # The name is truncated to 1000 characters; the key keeps the
            # full title.
            event = Event(name=meeting_title[:1000],
                          start_date=start_date,
                          location_name=location)

            event.add_source(link, note='docs.house.gov XML')

            for sub_committee in hearing_xml.xpath(
                    '//subcommittees/committee-name'):
                name, = sub_committee.xpath('.//text()')
                thomas_id = sub_committee.attrib['parent-id']
                event.participants.append({
                    "name": name,
                    "entity_type": 'organization',
                    "note": 'host',
                    "organization_id":
                    _make_pseudo_id(
                        name=name,
                        parent__identifiers__identifier=thomas_id),
                })

            for committee in hearing_xml.xpath(
                    '//committees/committee-name'):
                name, = committee.xpath('.//text()')
                thomas_id = committee.attrib['id']
                event.participants.append({
                    "name": name,
                    "entity_type": 'organization',
                    "note": 'host',
                    "organization_id":
                    _make_pseudo_id(identifiers__identifier=thomas_id),
                })

            self._add_house_docs(event, hearing_xml)

            uniq[key] = event
Пример #9
0
    def scrape(self, window=28, matter_ids=None):
        '''By default, scrape board reports updated in the last 28 days.
        Optionally specify a larger or smaller window of time from which to
        scrape updates, or specific matters to scrape.
        Note that passing a value for :matter_ids supersedes the value of
        :window, such that the given matters will be scraped regardless of
        when they were updated.

        Yields a VoteEvent for every action that carries a vote result,
        followed by the Bill itself, for each matter that is not restricted
        from web view and has an intro date, title and file number.

        Optional parameters
        :window (numeric) - Amount of time for which to scrape updates, e.g.
        a window of 7 will scrape legislation updated in the last week. Pass
        a window of 0 to scrape all legislation.
        :matter_ids (str) - Comma-separated list of matter IDs to scrape
        '''

        if matter_ids:
            matters = [
                self.matter(matter_id) for matter_id in matter_ids.split(',')
            ]
            matters = filter(
                None, matters)  # Skip matters that are not yet in Legistar
        elif float(window):  # Support for partial days, i.e., window=0.15
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
            matters = self.matters(n_days_ago)
        else:
            # Scrape all matters, including those without a last-modified date
            matters = self.matters()

        for matter in matters:
            # If this Boolean field is True, then do not scrape the Bill.
            # This issue explains why a restricted Bill might appear (unwelcome) in the Legistar API:
            # https://github.com/datamade/la-metro-councilmatic/issues/345#issuecomment-421184826
            if matter['MatterRestrictViewViaWeb']:
                continue

            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            # A leading 'S' is dropped from the identifier; the original
            # form is kept as an alternate identifier.
            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            legistar_web = matter['legistar_url']

            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for alternate_identifier in alternate_identifiers:
                bill.add_identifier(alternate_identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for member_vote in votes:
                        raw_option = member_vote['VoteValueName'].lower()
                        # Fall back to the raw option when it has no mapping.
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        member_vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill.
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, skip the relation and continue.
                    related_bill = self.endpoint(
                        '/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    related_date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(
                        self.toTime(related_date))
                    related_identifier = related_bill['MatterFile']
                    bill.add_related_bill(
                        identifier=related_identifier,
                        legislative_session=related_bill_session,
                        relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilities: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    # Strip NUL characters from the RTF text.
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Пример #10
0
    def scrape(self):
        """Scrape the matters returned by ``self.matters`` for the last three days.

        For every matter that has an intro date, a title and a file number,
        yields one VoteEvent per action that carries a vote result and then
        the Bill itself, complete with sources, actions, sponsorships,
        subjects, the board report link, attachments and any available text.
        """
        cutoff = datetime.datetime.now() - datetime.timedelta(3)
        for matter in self.matters(cutoff):
            matter_id = matter['MatterId']

            intro_date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not (intro_date and title and identifier):
                continue

            bill_session = self.session(self.toTime(intro_date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            # A leading 'S' is dropped from the identifier; the original
            # form is kept as an alternate identifier.
            alternate_identifiers = []
            if identifier.startswith('S'):
                alternate_identifiers.append(identifier)
                identifier = identifier[1:]

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            web_url = self.legislation_detail_url(matter_id)
            api_url = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(web_url, note='web')
            bill.add_source(api_url, note='api')

            for alternate in alternate_identifiers:
                bill.add_identifier(alternate)

            for action, tally in self.actions(matter_id):
                recorded_action = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    body_id = _make_pseudo_id(name=body_name)
                    recorded_action.add_related_entity(
                        body_name, 'organization', entity_id=body_id)

                result, ballots = tally
                if not result:
                    continue

                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(web_url)
                vote_event.add_source(api_url + '/histories')

                for ballot in ballots:
                    raw_option = ballot['VoteValueName'].lower()
                    # Fall back to the raw option when it has no mapping.
                    option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                    vote_event.vote(option, ballot['VotePersonName'].strip())

                yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                subject = topic['MatterIndexName'].strip()
                bill.add_subject(subject)

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                attachment_name = attachment['MatterAttachmentName']
                if attachment_name:
                    bill.add_document_link(
                        attachment_name,
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                plain_text = text['MatterTextPlain']
                if plain_text:
                    bill.extras['plain_text'] = plain_text

                rtf_text = text['MatterTextRtf']
                if rtf_text:
                    # Strip NUL characters from the RTF text.
                    bill.extras['rtf_text'] = rtf_text.replace(u'\u0000', '')

            yield bill
Пример #11
0
    def scrape(self, window=28):
        """Yield a Bill, and a VoteEvent per voted action, for recent matters.

        ``window`` is the look-back period in days, counted from the current
        UTC time; the resulting cutoff is handed to ``self.matters``.
        """
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))

        for matter in self.matters(cutoff):
            matter_id = matter['MatterId']

            intro_date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            file_number = matter['MatterFile']

            # All three values are needed to build a Bill.
            if not (intro_date and title and file_number):
                continue

            session = self.session(self.toTime(intro_date))
            classification = BILL_TYPES[matter['MatterTypeName']]

            # A leading 'S' is dropped from the primary identifier; the full
            # file number is kept as an alternate identifier.
            alternates = []
            if file_number.startswith('S'):
                alternates.append(file_number)
                file_number = file_number[1:]

            bill = Bill(identifier=file_number,
                        legislative_session=session,
                        title=title,
                        classification=classification,
                        from_organization={"name": "Board of Directors"})

            web_url = matter['legistar_url']
            api_url = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(web_url, note='web')
            bill.add_source(api_url, note='api')

            for alternate in alternates:
                bill.add_identifier(alternate)

            for action, outcome in self.actions(matter_id):
                recorded_action = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    recorded_action.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = outcome
                if not result:
                    continue

                event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                event.add_source(web_url)
                event.add_source(api_url + '/histories')

                for ballot in votes:
                    raw_option = ballot['VoteValueName'].lower()
                    # Fall back to the raw label when no mapping exists.
                    event.vote(self.VOTE_OPTIONS.get(raw_option, raw_option),
                               ballot['VotePersonName'].strip())

                yield event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                # Fetch the related matter to learn its identifier
                # ('MatterFile') and, through 'MatterIntroDate', its session.
                # The related matter does not always exist yet; skip the
                # relation when the request fails.
                try:
                    related = self.endpoint('/matters/{0}',
                                            relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue

                related_session = self.session(
                    self.toTime(related['MatterIntroDate']))

                # The allowed relation types are listed at
                # https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro only describes these as related files, so they are
                # recorded as 'companion'.
                bill.add_related_bill(identifier=related['MatterFile'],
                                      legislative_session=related_session,
                                      relation_type='companion')

            report_url = 'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id)
            bill.add_version_link('Board Report',
                                  report_url,
                                  media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                attachment_name = attachment['MatterAttachmentName']
                if attachment_name:
                    bill.add_document_link(
                        attachment_name,
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                plain_text = text['MatterTextPlain']
                if plain_text:
                    bill.extras['plain_text'] = plain_text

                rtf_text = text['MatterTextRtf']
                if rtf_text:
                    # NUL characters are removed from the RTF text.
                    bill.extras['rtf_text'] = rtf_text.replace(u'\u0000', '')

            yield bill
Пример #12
0
    def scrape(self, window=3):
        """Yield Chicago bills, and their vote events, for recent matters.

        ``window`` is the look-back period in days, counted from the current
        UTC time; the resulting cutoff is handed to ``self.matters``.
        """
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))

        # No bill currently has duplicate action items. If one appears and
        # makes the whole scrape fail, add its identifier here to skip it.
        problem_bills = []

        for matter in self.matters(cutoff):
            matter_id = matter['MatterId']

            intro_date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            file_number = matter['MatterFile']

            if file_number in problem_bills:
                continue

            # All three values are needed to build a Bill.
            if not (intro_date and title and file_number):
                continue

            session = self.session(self.toTime(intro_date))
            classification = BILL_TYPES[matter['MatterTypeName']]

            # A leading 'S' is dropped from the primary identifier; the full
            # file number is kept as an alternate identifier.
            alternates = []
            if file_number.startswith('S'):
                alternates.append(file_number)
                file_number = file_number[1:]

            bill = Bill(identifier=file_number,
                        legislative_session=session,
                        title=title,
                        classification=classification,
                        from_organization={"name": "Chicago City Council"})

            web_url = matter['legistar_url']
            api_url = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
                matter_id)

            bill.add_source(web_url, note='web')
            bill.add_source(api_url, note='api')

            for alternate in alternates:
                bill.add_identifier(alternate)

            for action, outcome in self.actions(matter_id):
                # 'responsible person' is removed before the remaining keys
                # are passed to add_action as keyword arguments.
                person = action.pop('responsible person')
                recorded_action = bill.add_action(**action)

                if person:
                    recorded_action.add_related_entity(
                        person,
                        'person',
                        entity_id=_make_pseudo_id(name=person))

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    if body_name != 'City Council':
                        recorded_action.add_related_entity(
                            body_name,
                            'organization',
                            entity_id=_make_pseudo_id(name=body_name))

                result, votes = outcome
                if not result:
                    continue

                event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                event.add_source(web_url)
                event.add_source(api_url + '/histories')

                for ballot in votes:
                    raw_option = ballot['VoteValueName'].lower()
                    # Fall back to the raw label when no mapping exists.
                    event.vote(self.VOTE_OPTIONS.get(raw_option, raw_option),
                               ballot['VotePersonName'].strip())

                yield event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for attachment in self.attachments(matter_id):
                attachment_name = attachment['MatterAttachmentName']
                if attachment_name:
                    bill.add_version_link(
                        attachment_name,
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                plain_text = text['MatterTextPlain']
                if plain_text:
                    bill.extras['plain_text'] = plain_text

                rtf_text = text['MatterTextRtf']
                if rtf_text:
                    # NUL characters are removed from the RTF text.
                    bill.extras['rtf_text'] = rtf_text.replace(u'\u0000', '')

            yield bill
Пример #13
0
    def scrape(self, window=3):
        """Yield Chicago bills, and their vote events, for recent matters.

        ``window`` is the look-back period in days, counted from the current
        UTC time; the resulting cutoff is handed to ``self.matters``.
        """
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))

        # Bills listed here are skipped for now. See
        # https://chicago.legistar.com/LegislationDetail.aspx?ID=3291304&GUID=72ACF5FE-0803-46E8-90B4-604119803293
        # They carry duplicate action items that make the whole scrape fail.
        # Once the Chicago clerk's office corrects the record this list can
        # be emptied.
        problem_bills = ['CL2017-1281']

        for matter in self.matters(cutoff):
            matter_id = matter['MatterId']

            intro_date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            file_number = matter['MatterFile']

            if file_number in problem_bills:
                continue

            # All three values are needed to build a Bill.
            if not (intro_date and title and file_number):
                continue

            session = self.session(self.toTime(intro_date))
            classification = BILL_TYPES[matter['MatterTypeName']]

            # A leading 'S' is dropped from the primary identifier; the full
            # file number is kept as an alternate identifier.
            alternates = []
            if file_number.startswith('S'):
                alternates.append(file_number)
                file_number = file_number[1:]

            bill = Bill(identifier=file_number,
                        legislative_session=session,
                        title=title,
                        classification=classification,
                        from_organization={"name": "Chicago City Council"})

            web_url = matter['legistar_url']
            api_url = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
                matter_id)

            bill.add_source(web_url, note='web')
            bill.add_source(api_url, note='api')

            for alternate in alternates:
                bill.add_identifier(alternate)

            for action, outcome in self.actions(matter_id):
                # 'responsible person' is removed before the remaining keys
                # are passed to add_action as keyword arguments.
                person = action.pop('responsible person')
                recorded_action = bill.add_action(**action)

                if person:
                    recorded_action.add_related_entity(
                        person,
                        'person',
                        entity_id=_make_pseudo_id(name=person))

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    if body_name != 'City Council':
                        recorded_action.add_related_entity(
                            body_name,
                            'organization',
                            entity_id=_make_pseudo_id(name=body_name))

                result, votes = outcome
                if not result:
                    continue

                event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                event.add_source(web_url)
                event.add_source(api_url + '/histories')

                for ballot in votes:
                    raw_option = ballot['VoteValueName'].lower()
                    # Fall back to the raw label when no mapping exists.
                    event.vote(self.VOTE_OPTIONS.get(raw_option, raw_option),
                               ballot['VotePersonName'].strip())

                yield event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for attachment in self.attachments(matter_id):
                attachment_name = attachment['MatterAttachmentName']
                if attachment_name:
                    bill.add_version_link(
                        attachment_name,
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                plain_text = text['MatterTextPlain']
                if plain_text:
                    bill.extras['plain_text'] = plain_text

                rtf_text = text['MatterTextRtf']
                if rtf_text:
                    # NUL characters are removed from the RTF text.
                    bill.extras['rtf_text'] = rtf_text.replace(u'\u0000', '')

            yield bill
Пример #14
0
    def extract_actions(self, bill, doc, current_chamber):
        """
        Read the action tables of a bill page and add each action to ``bill``.

        Every ``table`` with class ``actions`` in ``doc`` is processed in
        order. Rows of the first table are attributed to ``current_chamber``;
        the chamber is flipped between 'upper' and 'lower' after each table.
        Actions whose type starts with 'governor' are attributed to
        'executive' instead. Returns ``bill``.
        """
        # First-column labels that mark rows which are not real actions.
        non_actions = ('Chapter number', 'See also', 'See',
                       'Effective date', 'Secretary of State')
        collected = []

        for table in doc.xpath('//table[@class="actions"]'):
            for row in table.xpath('.//tr'):
                date_cell, detail_cell = row.xpath('td')

                # The second cell mixes the action text with links (full
                # text, pages, committees) and other spanned content.
                raw_date = date_cell.text_content().strip()
                action_text = detail_cell.text.strip()
                committee = detail_cell.xpath(
                    "a[contains(@href,'committee')]/text()")
                extra = ''.join(detail_cell.xpath(
                    'span[not(@style)]/text() | a/text()'))

                if action_text in non_actions:
                    continue

                # The date is not always in the date cell; sometimes it is
                # in the extra text, with a two- or four-digit year. Try
                # each source/format in order and keep the first that parses.
                action_date = None
                attempts = ((raw_date, '%m/%d/%Y'),
                            (extra, '%m/%d/%y'),
                            (extra, '%m/%d/%Y'))
                for candidate_text, date_format in attempts:
                    try:
                        action_date = datetime.datetime.strptime(
                            candidate_text, date_format).date()
                    except ValueError:
                        continue
                    break

                if action_date is None:
                    self.warning('ACTION without date: %s' % action_text)
                    continue

                # Classify using the first categorizer whose pattern matches.
                record = {}
                action_type = None
                for pattern, matched_type in self._categorizers:
                    if not re.match(pattern, action_text):
                        continue
                    action_type = matched_type
                    if 'referral-committee' in action_type and committee:
                        record['committees'] = committee[0]
                    break

                if extra:
                    action_text += ' ' + extra

                # action_type may be a single value or a list of values.
                if isinstance(action_type, list):
                    type_names = action_type
                else:
                    type_names = [action_type]
                by_governor = any(
                    name is not None and name.startswith('governor')
                    for name in type_names)

                record['action_text'] = action_text
                record['action_chamber'] = (
                    'executive' if by_governor else current_chamber)
                record['action_date'] = action_date
                record['action_type'] = action_type
                collected.append(record)

                # Vote extraction from action rows is currently disabled.

            # A following table, if any, belongs to the other chamber.
            current_chamber = 'lower' if current_chamber == 'upper' else 'upper'

        # Attach the collected actions to the bill.
        for record in collected:
            added = bill.add_action(record['action_text'],
                                    record['action_date'],
                                    chamber=record['action_chamber'],
                                    classification=record['action_type'])

            if 'committees' in record:
                committee_name = record['committees']
                added.add_related_entity(
                    committee_name, 'organization',
                    entity_id=_make_pseudo_id(name=committee_name))

        return bill
Пример #15
0
    def scrape(self):
        """Scrape Chicago legislation created after 2015-05-17.

        Yields a ``Bill`` for every summary from ``self.legislation`` that
        has both a title and an intro date, and a ``VoteEvent`` for every
        history action whose detail page provides both a result and votes.

        When fetching a bill's detail page raises ``IndexError`` the bill is
        yielded with only its summary data and its URL is remembered; the
        list of such URLs is printed after the scrape finishes.
        """
        unreachable_urls = []

        for leg_summary in self.legislation(
                created_after=datetime.datetime(2015, 5, 17)):
            title = leg_summary['Title'].strip()

            # Some records have no intro date, e.g.
            # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
            if not title or not leg_summary['Intro\xa0Date']:
                continue

            bill_type = BILL_TYPES[leg_summary['Type']]

            bill_session = self.session(
                self.toTime(leg_summary['Intro\xa0Date']))
            bill = Bill(identifier=leg_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Chicago City Council"})

            bill.add_source(leg_summary['url'])

            try:
                leg_details = self.legDetails(leg_summary['url'])
            except IndexError:
                unreachable_urls.append(leg_summary['url'])
                yield bill
                continue

            # Related files are only recorded when the title suggests an
            # omnibus bill; other related files are skipped for now. The
            # check is case-insensitive so that titles such as
            # "Sundry claims" or "Miscellaneous ..." are recognised.
            lower_title = title.lower()
            if "sundry" in lower_title or "miscellaneous" in lower_title:
                for related_bill in leg_details.get('Related files', []):
                    bill.add_related_bill(
                        identifier=related_bill['label'],
                        legislative_session=bill.legislative_session,
                        relation_type='replaces')

            for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
                # The first listed sponsor is treated as the primary one.
                if i == 0:
                    primary = True
                    sponsorship_type = "Primary"
                else:
                    primary = False
                    sponsorship_type = "Regular"

                sponsor_name = sponsor['label']

                # Open question: do the Mayor and Clerk introduce
                # legislation as individual role holders or as the Office of
                # the City Clerk / Office of the Mayor? For now they are
                # recorded as organizations.
                entity_type = 'person'
                if sponsor_name.startswith(('City Clerk', 'Mendoza, Susana')):
                    sponsor_name = 'Office of the City Clerk'
                    entity_type = 'organization'
                elif sponsor_name.startswith(('Emanuel, Rahm', )):
                    sponsor_name = 'Office of the Mayor'
                    entity_type = 'organization'
                # Placeholder sponsor labels are not recorded.
                if not sponsor_name.startswith(
                    ('Misc. Transmittal', 'No Sponsor', 'Dept./Agency')):
                    bill.add_sponsorship(
                        sponsor_name,
                        sponsorship_type,
                        entity_type,
                        primary,
                        entity_id=_make_pseudo_id(name=sponsor_name))

            if 'Topic' in leg_details:
                for subject in leg_details[u'Topic'].split(','):
                    bill.add_subject(subject)

            for attachment in leg_details.get('Attachments', []):
                if attachment['label']:
                    bill.add_version_link(attachment['label'],
                                          attachment['url'],
                                          media_type="application/pdf")

            for action in self.history(leg_summary['url']):
                action_description = action['Action']
                try:
                    action_date = self.toTime(
                        action['Date']).date().isoformat()
                except AttributeError:
                    # Action without a usable date, e.g.
                    # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                    continue

                if action_description:
                    # 'Action By' is either a mapping with a 'label' or a
                    # value that cannot be indexed by a string.
                    try:
                        responsible_org = action['Action\xa0By']['label']
                    except TypeError:
                        responsible_org = action['Action\xa0By']
                    if responsible_org == 'City Council':
                        responsible_org = 'Chicago City Council'

                    act = bill.add_action(
                        action_description,
                        action_date,
                        organization={'name': responsible_org},
                        classification=ACTION_CLASSIFICATION[
                            action_description])

                    if action_description == 'Referred':
                        # Probe for a 'label' key: success means a single
                        # controlling body, TypeError means the value is
                        # used as-is (it is then iterated if truthy).
                        try:
                            leg_details[
                                'Current Controlling Legislative Body'][
                                    'label']
                            controlling_bodies = [
                                leg_details[
                                    'Current Controlling Legislative Body']
                            ]
                        except TypeError:
                            controlling_bodies = leg_details[
                                'Current Controlling Legislative Body']
                        if controlling_bodies:
                            for controlling_body in controlling_bodies:
                                body_name = controlling_body['label']
                                # Joint committees are attached without a
                                # pseudo id.
                                if body_name.startswith("Joint Committee"):
                                    act.add_related_entity(
                                        body_name, 'organization')
                                else:
                                    act.add_related_entity(
                                        body_name,
                                        'organization',
                                        entity_id=_make_pseudo_id(
                                            name=body_name))

                    if 'url' in action['Action\xa0Details']:
                        action_detail_url = action['Action\xa0Details']['url']
                        result, votes = self.extractVotes(action_detail_url)

                        # Both are required; see
                        # https://github.com/datamade/municipal-scrapers-us/issues/15
                        if votes and result:
                            action_vote = VoteEvent(
                                legislative_session=bill.legislative_session,
                                motion_text=action_description,
                                organization={'name': responsible_org},
                                classification=None,
                                start_date=action_date,
                                result=result,
                                bill=bill)
                            action_vote.add_source(action_detail_url)

                            for option, voter in votes:
                                action_vote.vote(option, voter)

                            yield action_vote

            bill.extras = {'local_classification': leg_summary['Type']}

            yield bill
        print(unreachable_urls)
Пример #16
0
    def scrape(self):
        """Yield Chicago bills, and their vote events, for recent matters.

        The cutoff handed to ``self.matters`` is three days before the
        current local time.
        """
        cutoff = datetime.datetime.now() - datetime.timedelta(3)

        for matter in self.matters(cutoff):
            matter_id = matter['MatterId']

            intro_date = matter['MatterIntroDate']
            title = matter['MatterTitle']

            # Both values are needed to build a Bill.
            if not (intro_date and title):
                continue

            session = self.session(self.toTime(intro_date))
            classification = BILL_TYPES[matter['MatterTypeName']]

            bill = Bill(identifier=matter['MatterFile'],
                        legislative_session=session,
                        title=title,
                        classification=classification,
                        from_organization={"name": "Chicago City Council"})

            web_url = self.legislation_detail_url(matter_id)
            api_url = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
                matter_id)

            bill.add_source(web_url, note='web')
            bill.add_source(api_url, note='api')

            for action, outcome in self.actions(matter_id):
                recorded_action = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    if body_name != 'City Council':
                        recorded_action.add_related_entity(
                            body_name,
                            'organization',
                            entity_id=_make_pseudo_id(name=body_name))

                result, votes = outcome
                if not result:
                    continue

                event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                event.add_source(web_url)
                event.add_source(api_url + '/histories')

                for ballot in votes:
                    raw_option = ballot['VoteValueName'].lower()
                    # Fall back to the raw label when no mapping exists.
                    event.vote(self.VOTE_OPTIONS.get(raw_option, raw_option),
                               ballot['VotePersonName'].strip())

                yield event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for attachment in self.attachments(matter_id):
                attachment_name = attachment['MatterAttachmentName']
                if attachment_name:
                    bill.add_version_link(
                        attachment_name,
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                plain_text = text['MatterTextPlain']
                if plain_text:
                    bill.extras['plain_text'] = plain_text

                rtf_text = text['MatterTextRtf']
                if rtf_text:
                    # NUL characters are removed from the RTF text.
                    bill.extras['rtf_text'] = rtf_text.replace(u'\u0000', '')

            yield bill
Пример #17
0
    def scrape(self):
        for leg_summary in self.legislation(created_after=datetime.datetime(2014, 1, 1)) :
            leg_type = BILL_TYPES[leg_summary['Type']]
            
            bill = Bill(identifier=leg_summary['File\xa0#'],
                        title=leg_summary['Title'],
                        legislative_session=None,
                        classification=leg_type,
                        from_organization={"name":"New York City Council"})
            bill.add_source(leg_summary['url'], note='web')

            leg_details = self.legDetails(leg_summary['url'])
            history = self.history(leg_summary['url'])

            bill.add_title(leg_details['Name'], 
                           note='created by administrative staff')

            if 'Summary' in leg_details :
                bill.add_abstract(leg_details['Summary'], note='')

            if leg_details['Law number'] :
                bill.add_identifier(leg_details['Law number'], 
                                    note='law number')

            for sponsorship in self._sponsors(leg_details.get('Sponsors', [])) :
                sponsor, sponsorship_type, primary = sponsorship
                bill.add_sponsorship(sponsor, sponsorship_type,
                                     'person', primary)

            
            for attachment in leg_details.get('Attachments', []) :
                bill.add_document_link(attachment['label'],
                                       attachment['url'],
                                       media_type="application/pdf")

            history = list(history)

            if history :
                earliest_action = min(self.toTime(action['Date']) 
                                      for action in history)

                bill.legislative_session = self.sessions(earliest_action)
            else :
                bill.legislative_session = str(self.SESSION_STARTS[0])

            for action in history :
                action_description = action['Action']
                if not action_description :
                    continue
                    
                action_class = ACTION_CLASSIFICATION[action_description]

                action_date = self.toDate(action['Date'])
                responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council' :
                    responsible_org = 'New York City Council'
                elif responsible_org == 'Administration' :
                    responsible_org = 'Mayor'
                   
                if responsible_org == 'Town Hall Meeting' :
                    continue
                else :
                    act = bill.add_action(action_description,
                                          action_date,
                                          organization={'name': responsible_org},
                                          classification=action_class)

                if 'url' in action['Action\xa0Details'] :
                    action_detail_url = action['Action\xa0Details']['url']
                    if action_class == 'committee-referral' :
                        action_details = self.actionDetails(action_detail_url)
                        referred_committee = action_details['Action text'].rsplit(' to the ', 1)[-1]
                        act.add_related_entity(referred_committee,
                                               'organization',
                                               entity_id = _make_pseudo_id(name=referred_committee))
                    result, votes = self.extractVotes(action_detail_url)
                    if result and votes :
                        action_vote = VoteEvent(legislative_session=bill.legislative_session, 
                                           motion_text=action_description,
                                           organization={'name': responsible_org},
                                           classification=action_class,
                                           start_date=action_date,
                                           result=result,
                                           bill=bill)
                        action_vote.add_source(action_detail_url, note='web')

                        for option, voter in votes :
                            action_vote.vote(option, voter)


                        yield action_vote
            
            text = self.text(leg_summary['url'])

            if text :
                bill.extras = {'local_classification' : leg_summary['Type'],
                               'full_text' : text}
            else :
                bill.extras = {'local_classification' : leg_summary['Type']}

            yield bill
Пример #18
0
    def scrape(self, window=28, matter_ids=None):
        '''Yield a Bill, plus a VoteEvent for each voted action, per matter.

        By default, scrape board reports updated in the last 28 days.
        Optionally specify a larger or smaller window of time from which to
        scrape updates, or specific matters to scrape.
        Note that passing a value for :matter_ids supersedes the value of
        :window, such that the given matters will be scraped regardless of
        when they were updated (:window is not even parsed in that case).

        Optional parameters
        :window (numeric) - Amount of time, in days, for which to scrape
        updates, e.g. a window of 7 will scrape legislation updated in the
        last week. Pass a window of 0 to scrape all legislation.
        :matter_ids (str) - Comma-separated list of matter IDs to scrape

        VoteEvents for a bill are yielded before the bill itself.
        '''

        if matter_ids:
            matters = [
                self.matter(matter_id) for matter_id in matter_ids.split(',')
            ]
            # Skip matters that are not yet in Legistar
            matters = filter(None, matters)
        elif float(window):  # Support for partial days, i.e., window=0.15
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
            matters = self.matters(n_days_ago)
        else:
            # Scrape all matters, including those without a last-modified date
            matters = self.matters()

        for matter in matters:
            # Skip this bill, until Metro cleans up duplicate in Legistar API
            if matter['MatterFile'] == '2017-0447':
                continue

            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            # The restriction flag drives three decisions below; look it up
            # once per matter.
            is_restricted = self._is_restricted(matter)

            # Do not scrape private bills introduced before this timestamp.
            if is_restricted and date < self.START_DATE_PRIVATE_SCRAPE:
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            # Identifiers with a leading 'S' are stored without it; the
            # original spelling is kept as an alternate identifier.
            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            # The Metro scraper scrapes private bills.
            # However, we do not want to capture significant data about
            # private bills, other than the value of the helper function
            # `_is_restricted` and a last modified timestamp.
            # We yield private bills early, wipe data from previously
            # imported once-public bills, and include only data *required*
            # by the pupa schema.
            # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
            bill.extras = {'restrict_view': is_restricted}

            # Add API source early.
            # Private bills should have this url for debugging.
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
            bill.add_source(legistar_api, note='api')

            if is_restricted:
                # required fields
                bill.title = 'Restricted View'

                # wipe old data
                bill.extras['plain_text'] = ''
                bill.extras['rtf_text'] = ''
                bill.sponsorships = []
                bill.related_bills = []
                bill.versions = []
                bill.documents = []
                bill.actions = []

                yield bill
                continue

            legistar_web = matter['legistar_url']
            bill.add_source(legistar_web, note='web')

            for alternate_identifier in alternate_identifiers:
                bill.add_identifier(alternate_identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for ballot in votes:
                        try:
                            raw_option = ballot['VoteValueName'].lower()
                        except AttributeError:
                            # VoteValueName has no .lower() (e.g. it is None)
                            raw_option = None
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        ballot['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill.
                    # Then, we can find the 'MatterFile' (i.e., identifier)
                    # and the 'MatterIntroDate' (i.e., to determine its
                    # legislative session).
                    # Sometimes, the related bill does not yet exist: in
                    # this case, the request fails and we skip the relation.
                    related_bill = self.endpoint(
                        '/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    related_date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(
                        self.toTime(related_date))
                    # Currently, the relation type for bills can be one of a
                    # few possibilities:
                    # https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files,
                    # suggesting that they receive a relation of 'companion'.
                    bill.add_related_bill(
                        identifier=related_bill['MatterFile'],
                        legislative_session=related_bill_session,
                        relation_type='companion')

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'].strip(),
                        media_type="application/pdf")

            bill.extras['local_classification'] = matter['MatterTypeName']

            matter_version_value = matter['MatterVersion']
            text = self.text(matter_id, matter_version_value)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    # Drop NUL characters from the RTF text before storing it.
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Пример #19
0
    def scrape(self, window=3):
        """Yield a Bill, plus a VoteEvent for each voted action, per matter.

        :window (numeric) - Number of days before the current local time
        used as the cutoff handed to ``self.matters``. Defaults to 3, the
        value that used to be hard-coded. The value is passed through
        ``float`` so strings and partial days (e.g. ``0.5``) are accepted.

        VoteEvents for a bill are yielded before the bill itself.
        """
        n_days_ago = datetime.datetime.now() - datetime.timedelta(
            float(window))

        for matter in self.matters(n_days_ago):
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            # A matter needs an intro date, a title and a file number.
            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            # Identifiers with a leading 'S' are stored without it; the
            # original spelling is kept as an alternate identifier.
            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Chicago City Council"})

            legistar_web = self.legislation_detail_url(matter_id)
            legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for alternate_identifier in alternate_identifiers:
                bill.add_identifier(alternate_identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    # A referral to the council itself gets no related entity.
                    if body_name != 'City Council':
                        act.add_related_entity(
                            body_name,
                            'organization',
                            entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for ballot in votes:
                        raw_option = ballot['VoteValueName'].lower()
                        # Fall back to the raw label when there is no mapping.
                        clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                             raw_option)
                        vote_event.vote(clean_option,
                                        ballot['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_version_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    # Drop NUL characters from the RTF text before storing it.
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Пример #20
0
    def scrape(self):
        """Yield a Bill, plus a VoteEvent for each voted action, per record.

        Iterates ``self.legislation`` for records created after 2015-05-17.
        Records with a blank title or no intro date are skipped. When the
        detail page cannot be parsed (``IndexError`` from ``legDetails``)
        the bare bill is yielded and its URL is collected; the collected
        URLs are printed once the scrape is finished.

        VoteEvents for a bill are yielded before the bill itself.
        """
        unreachable_urls = []

        for leg_summary in self.legislation(created_after=datetime.datetime(2015, 5, 17)):
            title = leg_summary['Title'].strip()

            # Some records have no intro date, e.g.
            # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
            if not title or not leg_summary['Intro\xa0Date']:
                continue

            bill_type = BILL_TYPES[leg_summary['Type']]

            bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))
            bill = Bill(identifier=leg_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Chicago City Council"})

            bill.add_source(leg_summary['url'])

            try:
                leg_details = self.legDetails(leg_summary['url'])
            except IndexError:
                unreachable_urls.append(leg_summary['url'])
                yield bill
                continue

            # Titles mentioning "sundry" or "miscellaneous" mark omnibus
            # bills. Compare against the lowercased title so capitalized
            # spellings ("Sundry ...") are recognized too.
            lower_title = title.lower()
            is_omnibus = "sundry" in lower_title or "miscellaneous" in lower_title

            for related_bill in leg_details.get('Related files', []):
                if is_omnibus:
                    bill.add_related_bill(identifier=related_bill['label'],
                                          legislative_session=bill.legislative_session,
                                          relation_type='replaces')
                # for now we're skipping related bills if they
                # don't contain words that make us think they're
                # in an omnibus relationship with each other

            for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
                # The first listed sponsor is treated as the primary one.
                if i == 0:
                    primary = True
                    sponsorship_type = "Primary"
                else:
                    primary = False
                    sponsorship_type = "Regular"

                sponsor_name = sponsor['label']

                # Does the Mayor/Clerk introduce legislation as
                # individual role holders or as the Office of City
                # Clerk and the Office of the Mayor?
                entity_type = 'person'
                if sponsor_name.startswith(('City Clerk',
                                            'Mendoza, Susana')):
                    sponsor_name = 'Office of the City Clerk'
                    entity_type = 'organization'
                elif sponsor_name.startswith(('Emanuel, Rahm',)):
                    sponsor_name = 'Office of the Mayor'
                    entity_type = 'organization'
                # Placeholder sponsor labels are not recorded at all.
                if not sponsor_name.startswith(('Misc. Transmittal',
                                                'No Sponsor',
                                                'Dept./Agency')):
                    bill.add_sponsorship(sponsor_name,
                                         sponsorship_type,
                                         entity_type,
                                         primary,
                                         entity_id=_make_pseudo_id(name=sponsor_name))

            if 'Topic' in leg_details:
                for subject in leg_details[u'Topic'].split(','):
                    # Splitting on ',' leaves the blank that followed each
                    # comma; strip it so subjects are stored cleanly.
                    bill.add_subject(subject.strip())

            for attachment in leg_details.get('Attachments', []):
                if attachment['label']:
                    bill.add_version_link(attachment['label'],
                                          attachment['url'],
                                          media_type="application/pdf")

            for action in self.history(leg_summary['url']):
                action_description = action['Action']
                try:
                    action_date = self.toTime(action['Date']).date().isoformat()
                except AttributeError:
                    # Action without a usable date, see
                    # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                    continue

                if action_description:
                    # 'Action By' is either a mapping with a 'label' or a
                    # value that cannot be indexed by key (TypeError).
                    try:
                        responsible_org = action['Action\xa0By']['label']
                    except TypeError:
                        responsible_org = action['Action\xa0By']
                    if responsible_org == 'City Council':
                        responsible_org = 'Chicago City Council'

                    act = bill.add_action(action_description,
                                          action_date,
                                          organization={'name': responsible_org},
                                          classification=ACTION_CLASSIFICATION[action_description])

                    if action_description == 'Referred':
                        # Probe for a single body (indexable by 'label');
                        # on TypeError use the value as-is, which the loop
                        # below expects to be iterable (or falsy).
                        try:
                            leg_details['Current Controlling Legislative Body']['label']
                            controlling_bodies = [leg_details['Current Controlling Legislative Body']]
                        except TypeError:
                            controlling_bodies = leg_details['Current Controlling Legislative Body']
                        if controlling_bodies:
                            for controlling_body in controlling_bodies:
                                body_name = controlling_body['label']
                                if body_name.startswith("Joint Committee"):
                                    act.add_related_entity(body_name,
                                                           'organization')
                                else:
                                    act.add_related_entity(body_name,
                                                           'organization',
                                                           entity_id=_make_pseudo_id(name=body_name))

                    if 'url' in action['Action\xa0Details']:
                        action_detail_url = action['Action\xa0Details']['url']
                        result, votes = self.extractVotes(action_detail_url)

                        # see https://github.com/datamade/municipal-scrapers-us/issues/15
                        if votes and result:
                            action_vote = VoteEvent(legislative_session=bill.legislative_session,
                                                    motion_text=action_description,
                                                    organization={'name': responsible_org},
                                                    classification=None,
                                                    start_date=action_date,
                                                    result=result,
                                                    bill=bill)
                            action_vote.add_source(action_detail_url)

                            for option, voter in votes:
                                action_vote.vote(option, voter)

                            yield action_vote

            bill.extras = {'local_classification': leg_summary['Type']}

            yield bill
        print(unreachable_urls)
Пример #21
0
    def scrape(self, window=30):
        """Yield a Bill, plus a VoteEvent for each voted action, per matter.

        ``window`` is converted with ``float`` and treated as a number of
        days before the current UTC time; the resulting cutoff is passed to
        ``self.matters``. ``self.retry_wait_seconds`` is set to 20 before
        any matter is requested.

        VoteEvents for a bill are yielded before the bill itself.
        """
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        self.retry_wait_seconds = 20

        # If a bill has a duplicate action item that's causing the entire
        # scrape to fail, add its identifier to this list to skip it.
        # For the time being...nothing to skip!
        problem_bills = []

        for matter in self.matters(cutoff):
            matter_id = matter["MatterId"]
            intro_date = matter["MatterIntroDate"]
            title = matter["MatterTitle"]
            file_number = matter["MatterFile"]

            if file_number in problem_bills:
                continue

            # A matter needs an intro date, a title and a file number.
            if not (intro_date and title and file_number):
                continue

            session = self.session(self.toTime(intro_date))

            # Matter types without a mapping get no classification.
            type_name = matter["MatterTypeName"]
            ocd_bill_type = BILL_TYPES[type_name] if type_name in BILL_TYPES else None

            # Identifiers with a leading "S" are stored without it; the
            # original spelling is kept as an alternate identifier.
            alternates = []
            if file_number.startswith("S"):
                alternates.append(file_number)
                file_number = file_number[1:]

            bill = Bill(
                identifier=file_number,
                legislative_session=session,
                title=title,
                classification=ocd_bill_type,
                from_organization={"name": "Pittsburgh City Council"},
            )

            web_url = matter["legistar_url"]
            api_url = "http://webapi.legistar.com/v1/pittsburgh/matters/{0}".format(matter_id)
            bill.add_source(web_url, note="web")
            bill.add_source(api_url, note="api")

            for alternate in alternates:
                bill.add_identifier(alternate)

            for action, outcome in self.actions(matter_id):
                # "responsible person" is not an add_action argument, so it
                # is removed from the dict before the dict is unpacked.
                person = action.pop("responsible person")
                act = bill.add_action(**action)

                if person:
                    act.add_related_entity(
                        person,
                        "person",
                        entity_id=_make_pseudo_id(name=person),
                    )

                if action["description"] == "Referred":
                    referred_to = matter["MatterBodyName"]
                    # A referral to the council itself gets no related entity.
                    if referred_to != "City Council":
                        act.add_related_entity(
                            referred_to,
                            "organization",
                            entity_id=_make_pseudo_id(name=referred_to),
                        )

                result, ballots = outcome
                if not result:
                    continue

                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action["description"],
                    organization=action["organization"],
                    classification=None,
                    start_date=action["date"],
                    result=result,
                    bill=bill,
                )
                vote_event.add_source(web_url)
                vote_event.add_source(api_url + "/histories")

                for ballot in ballots:
                    label = ballot["VoteValueName"].lower()
                    # Fall back to the raw label when there is no mapping.
                    option = self.VOTE_OPTIONS.get(label, label)
                    vote_event.vote(option, ballot["VotePersonName"].strip())

                yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic["MatterIndexName"].strip())

            for attachment in self.attachments(matter_id):
                attachment_name = attachment["MatterAttachmentName"]
                if attachment_name:
                    bill.add_version_link(
                        attachment_name,
                        attachment["MatterAttachmentHyperlink"],
                        media_type="application/pdf",
                    )

            bill.extras = {"local_classification": matter["MatterTypeName"]}

            text = self.text(matter_id)
            if text:
                plain_text = text["MatterTextPlain"]
                if plain_text:
                    bill.extras["plain_text"] = plain_text

                rtf_text = text["MatterTextRtf"]
                if rtf_text:
                    # Drop NUL characters from the RTF text before storing it.
                    bill.extras["rtf_text"] = rtf_text.replace(u"\u0000", "")

            yield bill
Пример #22
0
    def scrape(self):
        """Yield a Bill, plus a VoteEvent for each voted action, per record.

        Iterates ``self.legislation`` for records created after 2014-01-01.
        The bill's legislative session is derived from its earliest history
        entry, or from the first entry of ``self.SESSION_STARTS`` when the
        record has no history.

        VoteEvents for a bill are yielded before the bill itself.
        """
        created_after = datetime.datetime(2014, 1, 1)

        for summary in self.legislation(created_after=created_after):
            classification = BILL_TYPES[summary['Type']]

            # The session is filled in below, once the history is known.
            bill = Bill(
                identifier=summary['File\xa0#'],
                title=summary['Title'],
                legislative_session=None,
                classification=classification,
                from_organization={"name": "New York City Council"},
            )
            page_url = summary['url']
            bill.add_source(page_url)

            details = self.legDetails(page_url)
            history = self.history(page_url)

            bill.add_title(details['Name'],
                           note='created by administrative staff')

            if 'Summary' in details:
                bill.add_abstract(details['Summary'], note='')

            law_number = details['Law number']
            if law_number:
                bill.add_identifier(law_number, note='law number')

            sponsors = self._sponsors(details.get('Sponsors', []))
            for sponsor, sponsorship_type, primary in sponsors:
                bill.add_sponsorship(
                    sponsor,
                    sponsorship_type,
                    'person',
                    primary,
                    entity_id=_make_pseudo_id(name=sponsor),
                )

            for attachment in details.get('Attachments', []):
                bill.add_document_link(
                    attachment['label'],
                    attachment['url'],
                    media_type="application/pdf",
                )

            # Materialize the history: it is walked twice below.
            entries = list(history)

            if not entries:
                bill.legislative_session = str(self.SESSION_STARTS[0])
            else:
                first_action_time = min(
                    self.toTime(entry['Date']) for entry in entries)
                bill.legislative_session = self.sessions(first_action_time)

            for entry in entries:
                description = entry['Action']
                if not description:
                    continue

                action_class = ACTION_CLASSIFICATION[description]
                action_date = self.toDate(entry['Date'])

                # Normalize the acting body's name.
                actor = entry['Action\xa0By']
                if actor == 'City Council':
                    actor = 'New York City Council'
                elif actor == 'Administration':
                    actor = 'Mayor'

                # Town hall meetings are not recorded as actions.
                if actor == 'Town Hall Meeting':
                    continue

                act = bill.add_action(
                    description,
                    action_date,
                    organization={'name': actor},
                    classification=action_class,
                )

                # Without a detail link there is nothing more to collect.
                if 'url' not in entry['Action\xa0Details']:
                    continue
                detail_url = entry['Action\xa0Details']['url']

                if action_class == 'committee-referral':
                    # The committee name is whatever follows the last
                    # ' to the ' in the action text.
                    action_text = self.actionDetails(detail_url)['Action text']
                    committee = action_text.rsplit(' to the ', 1)[-1]
                    act.add_related_entity(
                        committee,
                        'organization',
                        entity_id=_make_pseudo_id(name=committee),
                    )

                result, votes = self.extractVotes(detail_url)
                if not votes:
                    continue

                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=description,
                    organization={'name': actor},
                    classification=action_class,
                    start_date=action_date,
                    result=result,
                    bill=bill,
                )
                vote_event.add_source(detail_url)

                for option, voter in votes:
                    vote_event.vote(option, voter)

                yield vote_event

            full_text = self.text(page_url)

            # 'full_text' is only stored when some text was returned.
            extras = {'local_classification': summary['Type']}
            if full_text:
                extras['full_text'] = full_text
            bill.extras = extras

            yield bill
Пример #23
0
    def extract_actions(self, bill, doc, current_chamber):
        """
        Extract the actions taken on a bill and attach them to ``bill``.

        A bill can have actions taken from either chamber.  The current
        chamber's actions will be the first table of actions. The other
        chamber's actions will be in the second table.

        Args:
            bill: object exposing ``add_action(text, date, chamber=...,
                classification=...)``.  The value it returns must expose
                ``add_related_entity`` when a committee referral is found.
            doc: parsed page supporting ``xpath``; every
                ``//table[@class="actions"]`` in it is scanned.
            current_chamber: chamber the first actions table belongs to.
                It flips between ``'upper'`` and ``'lower'`` after each
                table.

        Returns:
            The ``bill`` that was passed in, with one action added for
            every row that has a parseable date.
        """
        # Rows whose leading text is one of these labels are not actions
        # (they don't have a date), so they are skipped outright.
        non_action_labels = ('Chapter number', 'See also', 'See',
                             'Effective date', 'Secretary of State')

        bill_actions = []
        action_tables = doc.xpath('//table[@class="actions"]')

        for cur_table in action_tables:
            for row in cur_table.xpath('.//tr'):
                bill_action = {}

                # Split up columns
                date_col, the_rest = row.xpath('td')

                # The second column can hold a link to full text
                # and pages (what should be in another column),
                # but also links to committee elements or other spanned
                # content.
                raw_date = date_col.text_content().strip()
                # ``.text`` is None when the cell starts with a child
                # element (a link or span) rather than plain text; treat
                # that as empty text instead of failing on None.strip().
                action_text = (the_rest.text or '').strip()
                committee = the_rest.xpath(
                    "a[contains(@href,'committee')]/text()")
                extra = ''.join(
                    the_rest.xpath('span[not(@style)]/text() | a/text()'))

                if action_text in non_action_labels:
                    continue

                # Dates are really inconsistent here: normally in the first
                # column, sometimes in the span/link text instead, with a
                # two- or four-digit year.  Try the candidates in order and
                # keep the first one that parses.
                action_date = None
                for candidate, date_format in ((raw_date, '%m/%d/%Y'),
                                               (extra, '%m/%d/%y'),
                                               (extra, '%m/%d/%Y')):
                    try:
                        action_date = datetime.datetime.strptime(
                            candidate, date_format).date()
                    except ValueError:
                        continue
                    break
                if action_date is None:
                    self.warning('ACTION without date: %s' %
                                 action_text)
                    continue

                # categorize actions: first matching pattern wins
                action_type = None
                for pattern, atype in self._categorizers:
                    if re.match(pattern, action_text):
                        action_type = atype
                        if 'referral-committee' in action_type and committee:
                            bill_action['committees'] = committee[0]
                        break

                if extra:
                    action_text += ' ' + extra

                # A classification may be a single value or a list of
                # values; any "governor*" entry makes it an executive action.
                type_list = (action_type if isinstance(action_type, list)
                             else [action_type])
                is_executive = any(
                    entry is not None and entry.startswith('governor')
                    for entry in type_list)

                bill_action['action_text'] = action_text
                bill_action['action_chamber'] = (
                    'executive' if is_executive else current_chamber)
                bill_action['action_date'] = action_date
                bill_action['action_type'] = action_type
                bill_actions.append(bill_action)

                # NOTE: vote extraction from action rows
                # (extract_vote_from_action) is currently disabled.

            # if there's a second table, toggle the current chamber
            if current_chamber == 'upper':
                current_chamber = 'lower'
            else:
                current_chamber = 'upper'

        # Add actions to bill
        for action in bill_actions:
            act = bill.add_action(action['action_text'],
                                  action['action_date'],
                                  chamber=action['action_chamber'],
                                  classification=action['action_type'])

            if 'committees' in action:
                committee = action['committees']
                act.add_related_entity(
                    committee,
                    'organization',
                    entity_id=_make_pseudo_id(name=committee))

        return bill