def scrape_meeting_notice(self, chamber, item, url):
    """Build an Event from one DE meeting-notice JSON record and attach
    its agenda items fetched from the meeting-items endpoint.

    Yields the populated Event.
    """
    # Since Event Name is not provided for all meetings, use the committee name.
    event_name = str(item['CommitteeName'])
    # MeetingDateTime looks like: 04/25/2012 03:00:00 PM
    # A four-digit year and a seconds field are present, so the format
    # must use %Y and %I:%M:%S — the previous "%m/%d/%y %I:%M %p" could
    # not parse this value at all.
    fmt = "%m/%d/%Y %I:%M:%S %p"
    start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
    location_name = str(item['AddressAliasNickname'])
    event = Event(location_name=location_name,
                  start_date=self._tz.localize(start_time),
                  name=event_name,
                  description='Committee Meeting Status: {}'
                  .format(item['CommitteeMeetingStatusName'])
                  )
    event.add_source(url)
    event.add_committee(name=str(item['CommitteeName']), id=item['CommitteeId'])
    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                    item['CommitteeMeetingId'])
                )
    event.add_source(page_url)
    page_data = self.post(page_url).json()['Data']
    # Distinct loop name so the meeting record `item` is not shadowed.
    for agenda_row in page_data:
        event.add_agenda_item(description=str(agenda_row['ItemDescription']))
        event.add_person(name=str(agenda_row['PrimarySponsorShortName']),
                         id=str(agenda_row['PrimarySponsorPersonId']),
                         note='Sponsor')

    yield event
def scrape_meetings(self, meetings, group):
    """
    Scrape and save event data from a list of meetings.

    Arguments:
    meetings -- A list of lxml elements containing event information
    group -- The type of meeting. The legislature site applies
             different formatting to events based on which group
             they correspond to. `group` should be one of the
             following strings: 'house', 'senate', or 'commission'.
    """
    for meeting in meetings:
        when = self.get_date(meeting)
        description = self.get_description(meeting)
        location = self.get_location(meeting)
        if when and description and location:
            event = Event(name=description,
                          start_date=when.replace(tzinfo=self.tz),
                          description=description,
                          location_name=location)
            agenda = self.get_agenda(meeting)
            if agenda:
                event.add_agenda_item(agenda)
            # NOTE(review): `url` is not defined in this method — unless it
            # is a module-level global this raises NameError. Confirm and
            # pass the real source URL for the scraped page here.
            event.add_source(url)
            yield event
def parse_div(self, row, chamber, com): cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0] # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip() title, location, start_date, end_date = self.parse_gcal(cal_link) event = Event( start_date=start_date, end_date=end_date, name=title, location_name=location, ) event.add_source('http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx') for item in row.xpath('.//div[@class="col-xs-12a Item"]'): description = item.xpath('string(.)').strip() agenda = event.add_agenda_item(description=description) for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'): description = item.xpath('string(.)').strip() agenda = event.add_agenda_item(description=description) event.add_document( description, item.xpath('@href')[0], media_type="application/pdf", on_duplicate="ignore" ) for item in row.xpath('.//div[contains(@class,"ItemContainer")]' '[./div[@class="col-xs-1 Item"]]'): description = item.xpath('string(.)').strip() agenda = event.add_agenda_item(description=description) bill = item.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip() agenda.add_bill(bill) video = row.xpath('.//a[./span[@class="OnDemand"]]') if video: event.add_media_link( 'Video of Hearing', video[0].xpath('@href')[0], 'text/html' ) if 'subcommittee' in title.lower(): subcom = title.split('-')[0].strip() event.add_participant( subcom, type='committee', note='host', ) else: event.add_participant( com, type='committee', note='host', ) yield event
def scrape_upper(self):
    """Scrape MO Senate hearings from the hearings-schedule page (yields Events)."""
    listing_url = 'https://www.senate.mo.gov/hearingsschedule/hrings.htm'
    html = self.get(listing_url).text

    # The HTML here isn't wrapped in a container per-event
    # which makes xpath a pain. So string split by <hr>
    # then parse each event's fragment for cleaner results
    for fragment in html.split('<hr />')[1:]:
        page = lxml.html.fromstring(fragment)

        when_date = self.row_content(page, 'Date:')
        when_time = self.row_content(page, 'Time:')
        location = self.row_content(page, 'Room:')

        location = '{}, {}'.format(
            location,
            '201 W Capitol Ave, Jefferson City, MO 65101'
        )

        # com = self.row_content(page, 'Committee:')
        com = page.xpath('//td[descendant::b[contains(text(),"Committee")]]/a/text()')[0]
        # Drop the trailing chair name, e.g. "Ways and Means, Senator X".
        com = com.split(', Senator')[0].strip()

        start_date = self._TZ.localize(
            dateutil.parser.parse('{} {}'.format(when_date, when_time))
        )

        event = Event(
            start_date=start_date,
            name=com,
            location_name=location
        )

        event.add_source(listing_url)

        event.add_participant(
            com,
            type='committee',
            note='host',
        )

        # Each agenda entry is its own small table; row layout differs
        # depending on whether a bill link is present.
        for bill_table in page.xpath('//table[@width="85%" and @border="0"]'):
            bill_link = ''
            if bill_table.xpath(self.bill_link_xpath):
                agenda_line = bill_table.xpath('string(tr[2])').strip()
                agenda_item = event.add_agenda_item(description=agenda_line)

                bill_link = bill_table.xpath(self.bill_link_xpath)[0].strip()
                agenda_item.add_bill(bill_link)
            else:
                agenda_line = bill_table.xpath('string(tr[1])').strip()
                agenda_item = event.add_agenda_item(description=agenda_line)

        yield event
def scrape_event_page(self, session, chamber, url, datetime):
    """Parse a TX committee-hearing notice page into an Event (yielded).

    `datetime` is the naive meeting datetime already parsed by the
    caller (note: it shadows the stdlib module name in this scope).
    """
    page = self.lxmlize(url)
    info = page.xpath("//p")
    metainfo = {}
    plaintext = ""
    # Collect "KEY: value" pairs from the notice paragraphs and keep the
    # whole notice text around for bill-number extraction below.
    for p in info:
        content = re.sub("\s+", " ", p.text_content())
        plaintext += content + "\n"
        if ":" in content:
            key, val = content.split(":", 1)
            metainfo[key.strip()] = val.strip()
    committee = metainfo['COMMITTEE']
    where = metainfo['PLACE']
    # The chair is sometimes jammed into the PLACE line.
    if "CHAIR" in where:
        where, chair = where.split("CHAIR:")
        metainfo['PLACE'] = where.strip()
        metainfo['CHAIR'] = chair.strip()

    chair = None
    if "CHAIR" in metainfo:
        chair = metainfo['CHAIR']

    plaintext = re.sub("\s+", " ", plaintext).strip()
    # Bill ids like "HB 123", "SJR 4" (chamber letter, type, number).
    regexp = r"(S|J|H)(B|M|R) (\d+)"
    bills = re.findall(regexp, plaintext)

    event = Event(
        name=committee,
        start_date=self._tz.localize(datetime),
        location_name=where
    )

    event.add_source(url)
    event.add_participant(committee, type='committee', note='host')
    if chair is not None:
        event.add_participant(chair, type='legislator', note='chair')

    for bill in bills:
        chamber, type, number = bill
        bill_id = "%s%s %s" % (chamber, type, number)
        item = event.add_agenda_item('Bill up for discussion')
        item.add_bill(bill_id)

    # Keep the full notice text as a catch-all agenda entry.
    event.add_agenda_item(plaintext)

    yield event
def scrape_event_page(self, url, event_type):
    """Scrape one MA legislature hearing/event detail page (yields an Event).

    `event_type` is the listing category; only 'Hearing' pages get the
    committee added as a host participant.
    """
    page = self.lxmlize(url)
    page.make_links_absolute('https://malegislature.gov/')

    title = page.xpath('string(//div[contains(@class,"followable")]/h1)')
    title = title.replace('Hearing Details', '').strip()
    title = title.replace('Special Event Details', '')

    # The eventInformation <dl> lays out day/time/location/description
    # in fixed <dd> positions.
    start_day = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[2])').strip()
    start_time = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[3])').strip()
    location = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[4]//a)').strip()
    description = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[5])').strip()

    start_date = self._TZ.localize(
        dateutil.parser.parse(
            '{} {}'.format(start_day, start_time),
        )
    )

    event = Event(
        start_date=start_date,
        name=title,
        location_name=location,
        description=description
    )

    event.add_source(url)

    agenda_rows = page.xpath(
        '//div[contains(@class,"col-sm-8") and .//h2[contains(@class,"agendaHeader")]]'
        '/div/div/div[contains(@class,"panel-default")]')

    for row in agenda_rows:
        # only select the text node, not the spans
        agenda_title = row.xpath('string(.//h4/a/text()[normalize-space()])').strip()
        if agenda_title == '':
            agenda_title = row.xpath('string(.//h4/text()[normalize-space()])').strip()

        agenda = event.add_agenda_item(description=agenda_title)

        bills = row.xpath('.//tbody/tr/td[1]/a/text()')
        for bill in bills:
            # "H.123" -> "H 123"
            bill = bill.strip().replace('.', ' ')
            agenda.add_bill(bill)

    if event_type == 'Hearing':
        event.add_participant(
            title,
            type='committee',
            note='host',
        )

    yield event
def scrape(self):
    """Scrape DC Council calendar events (yields Events)."""
    calendar_url = "http://dccouncil.us/calendar"
    data = self.get(calendar_url).text
    doc = lxml.html.fromstring(data)

    # Matches the hosting committee inside the free-text description.
    committee_regex = re.compile("(Committee .*?)will")

    event_list = doc.xpath("//div[@class='event-description-dev']")
    for event in event_list:
        place_and_time = event.xpath(".//div[@class='event-description-dev-metabox']/p/text()")
        when = " ".join([place_and_time[0].strip(), place_and_time[1].strip()])
        if len(place_and_time) > 2:
            location = place_and_time[2]
        else:
            location = "unknown"
        # when is now of the following format:
        # Wednesday, 2/25/2015 9:30am
        when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p")
        description_content = event.xpath(".//div[@class='event-description-content-dev']")[0]
        description_lines = description_content.xpath("./*")
        name = description_lines[0].text_content()
        desc_without_title = " ".join(d.text_content() for d in description_lines[1:])
        description = re.sub(r'\s+', " ", description_content.text_content()).strip()
        potential_bills = description_content.xpath(".//li")

        committee = committee_regex.search(desc_without_title)
        event_type = 'other'
        if committee is not None:
            committee = committee.group(1).strip()
            event_type = 'committee:meeting'

        e = Event(name=name,
                  description=description,
                  start_date=self._tz.localize(when),
                  location_name=location,
                  classification=event_type,
                  )

        for b in potential_bills:
            bill = b.xpath("./a/text()")
            if len(bill) == 0:
                continue
            bill = bill[0]
            bill_desc = b.text_content().replace(bill, "").strip(", ").strip()
            # Normalize e.g. "B 21-0123" -> "B21-0123" with zero-padded number.
            ses, num = bill.split("-")
            bill = ses.replace(" ", "") + "-" + num.zfill(4)
            item = e.add_agenda_item(bill_desc)
            item.add_bill(bill)

        e.add_source(calendar_url)

        if committee:
            e.add_participant(committee, type='organization', note='host')

        yield e
def scrape(self, chamber=None):
    """Scrape UT committee agendas from the Granicus RSS feed (yields Events)."""
    URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
    doc = self.lxmlize(URL)
    events = doc.xpath('//item')

    for info in events:
        # RSS titles look like "<committee> - <Mon DD, YYYY>".
        title_and_date = info.xpath('title/text()')[0].split(" - ")
        title = title_and_date[0]
        when = title_and_date[-1]
        # if not when.endswith(session[ :len("20XX")]):
        #     continue

        event = Event(name=title,
                      start_date=self._tz.localize(datetime.datetime.strptime(when, '%b %d, %Y')),
                      location_name='State Capitol'
                      )
        event.add_source(URL)

        # Follow the per-event agenda page linked from the item body.
        url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
        try:
            doc = self.lxmlize(url)
        except HTTPError:
            self.logger.warning("Page missing, skipping")
            continue
        event.add_source(url)

        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            event.add_participant(committee_name, type='committee', note='host')

        documents = doc.xpath('.//td')
        for document in documents:
            # Document links are stashed in the cell's onclick handler.
            url = re.search(r'(http://.*?pdf)', document.xpath('@onclick')[0])
            if url is None:
                continue
            url = url.group(1)
            event.add_document(
                note=document.xpath('text()')[0],
                url=url,
                media_type='application/pdf'
            )
            bills = document.xpath('@onclick')
            for bill in bills:
                if "bills/static" in bill:
                    bill_name = bill.split("/")[-1].split(".")[0]
                    item = event.add_agenda_item('Bill up for discussion')
                    item.add_bill(bill_name)
        yield event
def scrape(self):
    """Scrape the committee agenda calendar (yields Events).

    NOTE(review): `calurl` is assumed to be a module-level constant not
    visible in this chunk — confirm it is defined at import time.
    """
    page = self.lxmlize(calurl)
    events = page.xpath("//table[@class='agenda-body']//tr")[1:]

    for event in events:
        comit_url = event.xpath(
            ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

        # Exactly one committee link is expected per row.
        if len(comit_url) != 1:
            raise Exception

        comit_url = comit_url[0]
        who = self.scrape_participants(comit_url.attrib['href'])

        tds = event.xpath("./*")
        date = tds[0].text_content().strip()
        cttie = tds[1].text_content().strip()
        _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
        info = tds[2]
        name = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = name.attrib['href']
        name = name.text
        time, where = info.xpath("./i/text()")
        what = tds[3].text_content()
        what = what.replace("Items: ", "")
        if "(None)" in what:
            continue
        what = [x.strip() for x in what.split(";")]

        # The listing omits the year; assume the current year.
        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

        event = Event(
            name=name,
            location_name=where,
            start_date=self._tz.localize(when),
        )

        event.add_source(calurl)

        event.add_committee(cttie, note='host')
        event.add_document("notice", notice, media_type='application/pdf')

        for entry in what:
            item = event.add_agenda_item(entry)
            # Entries that are bill ids (AB/SB ...) get linked as bills.
            if entry.startswith('AB') or entry.startswith('SB'):
                item.add_bill(entry)

        for thing in who:
            event.add_person(thing['name'])

        yield event
def scrape(self):
    """Scrape HI committee hearings from the capitol calendar grid (yields Events)."""
    tz = pytz.timezone("US/Eastern")
    get_short_codes(self)
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()

        # Exactly one description span is expected per row.
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            raise Exception
        descr = descr[0].replace('.', '').strip()

        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        when = pytz.utc.localize(when)
        event = Event(name=descr, start_time=when,
                      classification='committee-meeting',
                      description=descr, location_name=where,
                      timezone=tz.zone)

        # Joint hearings list multiple committees separated by "/".
        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]

        for committee in committees:
            if "INFO" not in committee:
                # Look up the short code by the committee string itself.
                # The previous code passed the literal string "committee",
                # so every lookup silently fell through to the default.
                committee = self.short_ids.get(committee,
                                               {"chamber": "unknown",
                                                "name": committee})
            else:
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }
            event.add_committee(committee['name'], note='host')

        event.add_source(URL)
        event.add_document(notice_name, notice_href, media_type='text/html')
        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill['descr'])
            a.add_bill(
                bill['bill_id'],
                note=bill['type']
            )
        yield event
def scrape_lower_item(self, page):
    """Parse one MO House hearing listing fragment into an Event (yielded)."""
    # print(lxml.etree.tostring(page, pretty_print=True))
    com = self.table_row_content(page, 'Committee:')
    when_date = self.table_row_content(page, 'Date:')
    when_time = self.table_row_content(page, 'Time:')
    location = self.table_row_content(page, 'Location:')

    if 'house hearing room' in location.lower():
        location = '{}, {}'.format(
            location,
            '201 W Capitol Ave, Jefferson City, MO 65101'
        )

    # fix some broken times, e.g. '12 :00'
    when_time = when_time.replace(' :', ':')
    # some times have extra info after the AM/PM
    if 'upon' in when_time:
        when_time = when_time.split('AM', 1)[0]
        when_time = when_time.split('PM', 1)[0]

    start_date = self._TZ.localize(
        dateutil.parser.parse('{} {}'.format(when_date, when_time))
    )

    event = Event(
        start_date=start_date,
        name=com,
        location_name=location
    )

    event.add_source('https://house.mo.gov/HearingsTimeOrder.aspx')

    event.add_participant(
        com,
        type='committee',
        note='host',
    )

    # different from general MO link xpath due to the <b>
    house_link_xpath = './/a[contains(@href, "Bill.aspx") ' \
                       'or contains(@href, "bill.aspx")]/b/text()'

    for bill_title in page.xpath(house_link_xpath):
        # Title looks like "HCS HB 123 -- Some description".
        bill_no = bill_title.split('--')[0].strip()
        bill_no = bill_no.replace('HCS', '').strip()

        agenda_item = event.add_agenda_item(description=bill_title)
        agenda_item.add_bill(bill_no)

    yield event
def scrape_page(self, url, session, chamber):
    """Scrape one IL hearing-notice page; returns a populated Event."""
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

    tables = doc.xpath("//table[@cellpadding='3']")
    # First table holds the "Key: value" metadata rows.
    info = tables[0]
    rows = info.xpath(".//tr")
    metainf = {}
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value

    where = metainf['Location:']
    subject_matter = metainf['Subject Matter:']
    description = "{}, {}".format(ctty_name, subject_matter)

    datetime = metainf['Scheduled Date:']
    datetime = re.sub(r"\s+", " ", datetime)
    repl = {
        "AM": " AM",
        "PM": " PM"  # Space shim.
    }
    for r in repl:
        datetime = datetime.replace(r, repl[r])
    datetime = self.localize(dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p"))

    event = Event(description,
                  start_date=datetime,
                  location_name=where)
    event.add_source(url)

    if ctty_name.startswith('Hearing Notice For'):
        # The previous code discarded the result of str.replace(), leaving
        # the prefix in place; assign it back (and strip the leftover
        # whitespace) so the participant is the bare committee name.
        ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
    event.add_participant(ctty_name, 'organization')

    # Second table lists the bills on the agenda.
    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        agenda_item = event.add_agenda_item(bill_id)
        agenda_item.add_bill(bill_id)

    return event
def parse_event(self, row, chamber):
    """Parse one AK committee-schedule XML row into an Event (yielded)."""
    # sample event available at http://www.akleg.gov/apptester.html
    committee_code = row.xpath('string(Sponsor)').strip()
    committee_name = '{} {}'.format(
        self.COMMITTEES_PRETTY[chamber],
        self.COMMITTEES[chamber][committee_code]['name']
    )

    name = '{} {}'.format(
        self.COMMITTEES_PRETTY[chamber],
        row.xpath('string(Title)').strip()
    )

    # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
    # NOTE(review): `name` always contains the chamber prefix plus a
    # space, so this equality check can never be true — confirm whether
    # it should test the raw Title instead.
    if name == '':
        name = committee_name

    location = row.xpath('string(Location)').strip()

    # events with no location all seem to be committee hearings
    if location == '':
        location = 'Alaska State Capitol, 120 4th St, Juneau, AK 99801'

    start_date = dateutil.parser.parse(row.xpath('string(Schedule)'))
    # todo: do i need to self._TZ.localize() ?

    event = Event(
        start_date=start_date,
        name=name,
        location_name=location
    )

    event.add_source('http://w3.akleg.gov/index.php#tab4')

    event.add_participant(
        committee_name,
        type='committee',
        note='host',
    )

    for item in row.xpath('Agenda/Item'):
        agenda_desc = item.xpath('string(Text)').strip()
        if agenda_desc != '':
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if item.xpath('BillRoot'):
                bill_id = item.xpath('string(BillRoot)')
                # AK Bill ids have a bunch of extra spaces
                bill_id = re.sub(r'\s+', ' ', bill_id)
                agenda_item.add_bill(bill_id)

    yield event
def scrape_chamber(self, chamber):
    """Scrape PA committee meetings for one chamber (yields Events)."""
    url = utils.urls['events'][chamber]
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for table in page.xpath('//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
        # The date lives on an anchor name in the enclosing day container.
        date_string = table.xpath('ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name')[0]
        for row in table.xpath('tr'):
            time_string = row.xpath('td[@class="CMS-MeetingDetail-Time"]/text()')[0].strip()
            description = row.xpath(
                'td[@class="CMS-MeetingDetail-Agenda"]/div/div'
            )[-1].text_content().strip()
            location = row.xpath(
                'td[@class="CMS-MeetingDetail-Location"]'
            )[0].text_content().strip()
            committees = row.xpath('.//div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a')
            bills = row.xpath('.//a[contains(@href, "billinfo")]')

            try:
                start_time = datetime.datetime.strptime(
                    '{} {}'.format(date_string, time_string),
                    '%m/%d/%Y %I:%M %p',
                )
            except ValueError:
                break

            event = Event(
                name=description,
                start_time=self._tz.localize(start_time),
                location_name=location,
                timezone=self._tz.zone,
            )
            event.add_source(url)
            if bills or committees:
                item = event.add_agenda_item(description)
                for bill in bills:
                    parsed = urllib.parse.urlparse(bill.get('href'))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # parse_qs returns a list per key; index [0] so the bill
                    # id is e.g. "HB 123" rather than "['H']['B'] ['123']".
                    item.add_bill('{}{} {}'.format(
                        qs['body'][0], qs['type'][0], qs['bn'][0]))
                for committee in committees:
                    parsed = urllib.parse.urlparse(committee.get('href'))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # Same list-unwrap for the committee code (may be absent).
                    com_code = qs.get('Code')
                    item.add_committee(
                        re.sub(r' \([S|H]\)$', '', committee.text),
                        id=com_code[0] if com_code else None,
                    )
            yield event
def scrape(self, session=None):
    """Scrape NJ committee agendas out of the legislature's Access DB (yields Events)."""
    if session is None:
        session = self.latest_session()
        self.info('no session specified, using %s', session)
    # Sessions are numbered from 209 = 2000-01; convert to the start year.
    year_abr = ((int(session) - 209) * 2) + 2000
    self._init_mdb(year_abr)
    self.initialize_committees(year_abr)
    records = self.access_to_csv("Agendas")
    for record in records:
        if record['Status'] != "Scheduled":
            continue
        description = record['Comments']
        related_bills = []

        # Bill ids like "A-1234" / "S1234" mentioned in the comments.
        for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
            related_bills.append({
                "bill_id": "%s %s" % (bill[0], bill[2]),
                "descr": description
            })

        date_time = "%s %s" % (record['Date'], record['Time'])
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

        try:
            hr_name = self._committees[record['CommHouse']]
        except KeyError:
            self.warning('unknown committee code %s, skipping',
                         record['CommHouse'])
            # Previously fell through with hr_name undefined, raising
            # NameError on the next line; actually skip as the log says.
            continue

        description = 'Meeting of the {}'.format(hr_name)

        event = Event(
            name=description,
            start_date=self._tz.localize(date_time),
            location_name=record['Location'] or 'Statehouse',
        )
        # All related bills share a single agenda item, created lazily.
        item = None
        for bill in related_bills:
            item = item or event.add_agenda_item(description)
            item.add_bill(bill['bill_id'])

        event.add_committee(
            hr_name,
            id=record['CommHouse'],
            note='host',
        )
        event.add_source('http://www.njleg.state.nj.us/downloads.asp')
        yield event
def scrape_chamber(self, chamber):
    """Scrape CA hearings for one chamber from the mirrored DB (yields Events).

    Hearings are grouped by (location, date) so that all bills heard in
    one committee meeting land on a single Event.
    """
    grouped_hearings = defaultdict(list)

    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description

        date = self._tz.localize(hearing.hearing_date)

        # Location strings start with "Asm" or "Sen" — use that to pick
        # the chamber and skip rows for the other one.
        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

        if event_chamber != chamber:
            continue

        grouped_hearings[(location, date)].append(hearing)

    for ((location, date), hearings) in grouped_hearings.items():

        # Get list of bill_ids from the database.
        bill_ids = [hearing.bill_id for hearing in hearings]
        bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                 for bill in bill_ids]

        # Dereference the committee_nr number and get display name.
        msg = 'More than one committee meeting at (location, date) %r'
        msg = msg % ((location, date),)
        assert len(set(hearing.committee_nr for hearing in hearings)) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]

        desc = 'Committee Meeting: ' + committee_name
        event = Event(
            name=desc,
            start_date=date,
            location_name=committee_name,
        )
        for bill_id in bills:
            if 'B' in bill_id:
                type_ = 'bill'
            else:
                type_ = 'resolution'
            item = event.add_agenda_item('consideration')
            item.add_bill(bill_id, note=type_)

        event.add_person(committee_name + ' Committee', note='host')
        event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

        yield event
def scrape_chamber(self, chamber, session):
    """Scrape WA committee agendas for one chamber over the next 30 days
    (yields Events).
    """
    # Chamber id used by the agendas site query string.
    cha = {"upper": "7", "lower": "3", "other": "4"}[chamber]

    print_format = "%m/%d/%Y"
    now = dt.datetime.now()
    start = now.strftime(print_format)
    end = (now+timedelta(days=30)).strftime(print_format)
    url = event_page % (cha, start, end)
    page = self.lxmlize(url)

    committees = page.xpath("//a[contains(@href,'Agendas?CommitteeId')]/@href")
    for comm in committees:
        comm_page = self.lxmlize(comm)
        meetings = comm_page.xpath("//li[contains(@class, 'partialagendaitems')]")
        for meeting in meetings:
            heading, content = meeting.xpath("./ul/li")
            who, when = heading.text.split(" - ")
            meeting_title = "Scheduled meeting of %s" % who.strip()
            # NOTE(review): lines 6-9 of the content block are assumed to
            # hold the room/address — confirm against the live markup.
            where_lines = content.text_content().split("\r\n")
            where = "\r\n".join([l.strip() for l in where_lines[6:9]])

            when = dt.datetime.strptime(when.strip(), "%m/%d/%Y %I:%M:%S %p")

            location = (where or '').strip() or "unknown"

            event = Event(name=meeting_title, start_time=self._tz.localize(when),
                          timezone=self._tz.zone,
                          location_name=location,
                          description=meeting_title)

            event.add_participant(who.strip(), type='committee', note='host')
            event.add_source(url)

            # only scraping public hearing bills for now.
            bills = meeting.xpath(".//div[text() = 'Public Hearing']/following-sibling::li"
                                  "[contains(@class, 'visible-lg')]")
            for bill in bills:
                bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                item = event.add_agenda_item(descr.strip())
                item.add_bill(bill_id.strip())

            yield event
def scrape(self):
    """Scrape committee events in a rolling window (10 days back, 30 ahead).

    Yields Events built from the calendar list plus each event's popover
    detail page.
    """
    start = dt.datetime.utcnow()
    start = start - dt.timedelta(days=10)  # include recently-past events
    end = start + dt.timedelta(days=30)

    url = URL.format(**{"from": start.strftime("%Y/%m/%d"),
                        "til": end.strftime("%Y/%m/%d")})

    page = self.lxmlize(url)
    events = page.xpath("//ul[contains(@class, 'committee-events')]//li")

    for event in events:
        string = event.text_content()

        po = CLICK_INFO.match(event.xpath(".//span")[0].attrib["onclick"])
        if po is None:
            continue

        poid = po.groupdict()["info_id"]  # This is used to get more deetz on

        popage = self.popOverUrl(poid)

        when = dt.datetime.strptime(popage.xpath("//strong")[0].text,
                                    "%B %d, %Y @ %I:%M %p")
        who = popage.xpath("//h1")[0].text

        related = []

        for item in popage.xpath("//div"):
            t = item.text
            if t is None:
                continue
            t = t.strip()
            for related_entity in ORD_INFO.findall(t):
                related.append({
                    "ord_no": related_entity,
                    "what": t,
                })

        # Use the pupa Event keyword names — the previous `when=` /
        # `location=` are not Event kwargs and raised TypeError.
        e = Event(name=who,
                  start_date=when,
                  location_name="unknown")
        e.add_source(url)

        for o in related:
            i = e.add_agenda_item(o["what"])
            i.add_bill(o["ord_no"], note="consideration")

        yield e
def scrape(self):
    """Migrate events from the legacy OpenStates API into pupa Events.

    Pops every consumed key off each API record so the trailing assert
    can prove no field was silently dropped. Yields Events.
    """
    method = 'events/?state={}&dtstart=1776-07-04'.format(self.state)
    self.events = self.api(method)

    seen = set()

    for event in self.events:
        begin = self._date_parse(event.pop('when'))
        end = self._date_parse(event.pop('end'))
        all_day = event.pop('all_day', False)

        e = Event(name=event.pop('description'),
                  classification=event.pop('type'),
                  location_name=event.pop('location'),
                  timezone=event.pop('timezone'),
                  start_time=begin,
                  end_time=end,
                  all_day=all_day,)

        # Truncate over-long fields to fit the schema's length limits.
        if len(e.name) >= 300:
            e.name = e.name[:290]

        if len(e.location['name']) >= 100:
            e.location['name'] = e.location['name'][:90]

        # De-dupe on (name, description, start) across the whole feed.
        composite_key = (e.name, e.description, e.start_time)
        if composite_key in seen:
            print("Duplicate found: %s/%s/%s" % (composite_key))
            continue

        seen.add(composite_key)

        for source in event.pop('sources'):
            if 'retrieved' in source:
                source.pop('retrieved')
            e.add_source(**source)

        if e.sources == []:
            continue

        ignore = ['country', 'level', 'state', 'created_at', 'updated_at',
                  'notes', '+location_url', 'session', 'id', '+chamber',
                  '+agenda', '+cancelled', '+media_contact', '+contact',
                  '+details']
        # +agenda:
        #   Agenda on old (very old) OpenStates data is actually a string
        #   and not any sort of structured data we can use in the items
        #   schema, and is only present for a handful of events.

        for i in ignore:
            if i in event:
                event.pop(i)

        for link in ['+link', 'link']:
            if link in event:
                e.add_source(url=event.pop(link))

        for p in event.pop('participants', []):
            type_ = {
                "committee": "organization",
                "legislator": "person",
                None: None,
            }[p.get('participant_type')]

            if type_ is None:
                # Garbage data.
                continue

            e.add_participant(name=p['participant'],
                              note=p['type'],
                              type=type_,)

        for b in event.pop('related_bills', []):
            item = e.add_agenda_item(
                b.pop('description', b.pop('+description', None)))
            item.add_bill(bill=b['bill_id'],
                          note=b.pop('type', b.pop('+type', None)))

        seen_documents = set([])
        for document in event.pop('documents', []):
            if document['url'] in seen_documents:
                print("XXX: Buggy data in: Duped Document URL: %s (%s)" % (
                    document['url'], document['name']
                ))
                continue

            seen_documents.add(document['url'])

            e.add_document(url=document['url'],
                          note=document['name'])

        # Every key should have been consumed above.
        assert event == {}, "Unknown fields: %s" % (
            ", ".join(event.keys())
        )

        yield e
def scrape_events_range(self, start_date, end_date):
    """Scrape Toronto council/committee meetings between two dates.

    Yields an Event per meeting; additionally yields a Bill for each
    ACTION agenda item that is being introduced at that meeting.
    """
    def daterange(start_date, end_date):
        # Yield each day in [start_date, end_date).
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + dt.timedelta(n)

    for date in daterange(start_date, end_date):
        events = self.extract_events_by_day(date)
        for event in events:
            tz = pytz.timezone("America/Toronto")
            time = dt.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(date.replace(hour=time.hour, minute=time.minute,
                                             second=0, microsecond=0))
            source_url = CALENDAR_DAY_TEMPLATE.format(start.year, start.month,
                                                      start.day)
            org_name = event['meeting']
            e = Event(
                name=org_name,
                start_time=start,
                timezone=tz.zone,
                location_name=event['location'],
                status=STATUS_DICT.get(event['meeting_status'])
            )
            e.add_source(source_url)
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_participant(
                name=org_name,
                type='organization',
            )

            def is_agenda_available(event):
                return event['publishing_status'] in [
                    'Agenda Published', 'Minutes Published'
                ]

            def is_council(event):
                # Full-council meetings use a different agenda template.
                return True if event[
                    'meeting'] == self.jurisdiction.name else False

            if is_agenda_available(event):
                template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(
                    event) else AGENDA_FULL_STANDARD_TEMPLATE
                agenda_url = template.format(event['meeting_id'])
                full_identifiers = list(
                    self.full_identifiers(event['meeting_id'],
                                          is_council(event)))

                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):

                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    def is_vote_event(item):
                        return True if item['type'] == 'ACTION' else False

                    def normalize_wards(raw):
                        if not raw:
                            raw = 'All'
                        if raw == 'All':
                            return raw.lower()
                        else:
                            return raw.split(', ')

                    def is_being_introduced(item, event):
                        # An item is "introduced" here when it originates
                        # from the committee holding this meeting.
                        org_name = event['meeting']
                        identifier = item['identifier']

                        # `org_code` is two-letter code for committee
                        current_org_code = self.committees_by_name.get(
                            org_name)[0]['code']
                        originating_org_code = re.search(
                            r'([A-Z]{2})[0-9]+\.[0-9]+', identifier).group(1)

                        return current_org_code == originating_org_code

                    if is_vote_event(item):
                        wards = normalize_wards(item['wards'])
                        identifier_regex = re.compile(
                            r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                        # Match the short identifier to exactly one full id.
                        [full_identifier] = [
                            id for id in full_identifiers
                            if identifier_regex.match(id).group(1) ==
                            item['identifier']
                        ]
                        a.add_bill(full_identifier)
                        if is_being_introduced(item, event):
                            b = Bill(
                                # TODO: Fix this hardcode
                                legislative_session='2014-2018',
                                identifier=full_identifier,
                                title=item['title'],
                                from_organization={'name': org_name},
                            )

                            b.add_source(agenda_url)
                            b.add_document_link(
                                note='canonical',
                                media_type='text/html',
                                url=AGENDA_ITEM_TEMPLATE.format(
                                    full_identifier))
                            b.extras = {
                                'wards': wards,
                            }

                            yield b

            yield e
def scrape(self):
    """Merge Legistar API events with rows scraped from the web calendar.

    Each API event is matched to its web-calendar row by
    (body name, date, time); the web row supplies audio/minutes links
    and the preferred source URL. Yields Events.
    """
    web_results = self.scrapeWebCalendar()

    for event in self.events():
        # Create a key for lookups in the web_results dict.
        key = (event['EventBodyName'].strip(),
               self.toTime(event['EventDate']).date(),
               event['EventTime'])

        # The \xa0 placeholders mirror the site's "Not available" cells.
        web_event_dict = web_results.get(key, {
            'Meeting Details': 'Meeting\xa0details',
            'Audio': 'Not\xa0available',
            'Recap/Minutes': 'Not\xa0available'
        })

        body_name = event["EventBodyName"]
        if 'Board of Directors -' in body_name:
            body_name, event_name = [part.strip()
                                     for part in body_name.split('-')]
        else:
            event_name = body_name

        # Map Legistar agenda status to the pupa event status vocabulary.
        status_name = event['EventAgendaStatusName']
        if status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Final':
            status = 'passed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = ''

        e = Event(event_name,
                  start_time=event["start"],
                  timezone=self.TIMEZONE,
                  description='',
                  location_name=event["EventLocation"],
                  status=status)

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

        e.add_participant(name=body_name,
                          type="organization")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                     note='api')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists.
        if web_event_dict['Audio'] != 'Not\xa0available':
            # The audio link is a redirect; store its final destination.
            redirect_url = self.head(
                web_event_dict['Audio']['url']).headers['Location']

            e.add_media_link(note=web_event_dict['Audio']['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event_dict['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event_dict['Recap/Minutes']['label'],
                           url=web_event_dict['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if web_event_dict['Meeting Details'] != 'Meeting\xa0details':
            # Prefer the per-meeting detail page when it resolves.
            if requests.head(web_event_dict['Meeting Details']
                             ['url']).status_code == 200:
                e.add_source(web_event_dict['Meeting Details']['url'],
                             note='web')
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

        yield e
def scrape_meeting(self, url):
    """Scrape one LA committee-meeting notice page (yields an Event).

    Meetings whose title names no chamber keyword are skipped entirely.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    title = page.xpath("//a[@id='linkTitle']//text()")[0]
    date = page.xpath("//span[@id='lDate']/text()")[0]
    time = page.xpath("//span[@id='lTime']/text()")[0]
    location = page.xpath("//span[@id='lLocation']/text()")[0]

    # Normalize AM/PM spellings so strptime's %p can parse them.
    substs = {
        "AM": ["A.M.", "a.m."],
        "PM": ["P.M.", "p.m.", "Noon"],
    }

    for key, values in substs.items():
        for value in values:
            time = time.replace(value, key)

    # Make sure there's a space between the time's minutes and its AM/PM
    if re.search(r'(?i)\d[AP]M$', time):
        time = time[:-2] + " " + time[-2:]

    # Times like "UPON ADJOURNMENT" / "TBA" become all-day events.
    if re.search("UPON ADJ|TBA", ' '.join(time.split()).upper()):
        all_day = True
        when = datetime.datetime.strptime(date, "%B %d, %Y")
    else:
        all_day = False
        when = datetime.datetime.strptime("%s %s" % (
            date,
            time
        ), "%B %d, %Y %I:%M %p")

    # when = self._tz.localize(when)

    description = "Meeting on %s of the %s" % (date, title)
    chambers = {"house": "lower",
                "senate": "upper",
                "joint": "legislature"}

    # Skip pages whose title mentions no chamber at all.
    for chamber_ in chambers.keys():
        if chamber_ in title.lower():
            break
    else:
        return

    event = Event(name=description,
                  start_date=self._tz.localize(when),
                  location_name=location,
                  all_day=all_day)
    event.add_source(url)
    event.add_participant(title, note='host', type='committee')

    trs = iter(page.xpath("//tr[@valign='top']"))
    next(trs)

    for tr in trs:
        try:
            _, _, bill, whom, descr = tr.xpath("./td")
        except ValueError:
            continue

        bill_title = bill.text_content()
        if "S" in bill_title or "H" in bill_title:
            item = event.add_agenda_item(descr.text_content())
            item.add_bill(bill_title)
        else:
            continue

    yield event
def scrape_meeting(self, url):
    """Scrape a single committee-meeting page and yield an Event.

    url -- absolute URL of the meeting detail page.

    Yields nothing when the meeting title does not mention one of
    "house", "senate", or "joint". Meetings whose time field reads
    "UPON ADJ" or "TBA" become all-day events pinned to the date only.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    title = page.xpath("//a[@id='linkTitle']//text()")[0]
    date = page.xpath("//span[@id='lDate']/text()")[0]
    time = page.xpath("//span[@id='lTime']/text()")[0]
    location = page.xpath("//span[@id='lLocation']/text()")[0]
    # Map the site's various AM/PM spellings to strptime-friendly tokens;
    # "Noon" is treated as PM.
    substs = {"AM": ["A.M.", "a.m."], "PM": ["P.M.", "p.m.", "Noon"]}
    for key, values in substs.items():
        for value in values:
            time = time.replace(value, key)
    # Make sure there's a space between the time's minutes and its AM/PM
    if re.search(r"(?i)\d[AP]M$", time):
        time = time[:-2] + " " + time[-2:]
    # "UPON ADJ"/"TBA" means no fixed start time: record an all-day event.
    if re.search("UPON ADJ|TBA", " ".join(time.split()).upper()):
        all_day = True
        when = datetime.datetime.strptime(date, "%B %d, %Y")
    else:
        all_day = False
        when = datetime.datetime.strptime(
            "%s %s" % (date, time), "%B %d, %Y %I:%M %p"
        )
    # when = self._tz.localize(when)
    description = "Meeting on %s of the %s" % (date, title)
    chambers = {"house": "lower", "senate": "upper", "joint": "legislature"}
    # for/else: fall through to `return` when no chamber keyword appears
    # anywhere in the title.
    for chamber_ in chambers.keys():
        if chamber_ in title.lower():
            break
    else:
        return
    event = Event(
        name=description,
        start_date=self._tz.localize(when),
        location_name=location,
        all_day=all_day,
    )
    event.add_source(url)
    event.add_participant(title, note="host", type="committee")
    # The first <tr valign='top'> is a header row; skip it.
    trs = iter(page.xpath("//tr[@valign='top']"))
    next(trs)
    for tr in trs:
        try:
            _, _, bill, whom, descr = tr.xpath("./td")
        except ValueError:
            # Not a five-cell agenda row; ignore.
            continue
        bill_title = bill.text_content()
        # Heuristic: cells containing S or H look like bill identifiers.
        if "S" in bill_title or "H" in bill_title:
            item = event.add_agenda_item(descr.text_content())
            item.add_bill(bill_title)
        else:
            continue
    yield event
def scrape_lower(self):
    """Scrape Ohio House committee meetings from the weekly PDF calendar.

    Downloads the calendar PDF, converts it to text, and yields one Event
    per meeting with the hosting committee attached; any House bill ids
    (e.g. "H. B. 123") found in the description become agenda items.
    """
    PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text-nolayout').decode()
    os.remove(path)

    # Split on day headers like "Tuesday, June 5, 2018". With a capturing
    # group, re.split yields [preamble, header, body, header, body, ...].
    # BUG FIX: the previous pattern r'(\wF+day, ...)' cannot match any real
    # weekday name ("Friday" is F-r-i-d-a-y, not \wF+day), so the split
    # never produced day chunks and nothing was ever scraped. Use \w+day,
    # matching the sibling scrape_upper().
    days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
    date = None
    for i, day in enumerate(days[1:]):
        if i % 2 == 0:
            # Even offsets are the captured day headers.
            date = day
        else:
            # Odd offsets are that day's body, which itself alternates
            # committee names with meeting blocks.
            events = re.split(r'\n((?:\w+\s?)+)\n', day)
            comm = ''
            for j, event in enumerate(events[1:]):
                if j % 2 == 0:
                    comm = event.strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[ap]\.m\.) # Meeting time
                            .*?,\s # Potential extra text for meeting time
                            (.*?),\s # Location, usually a room
                            .*?\n # Chairman of committee holding event
                            (.*) # Description of event
                            ''',
                            event).groups()
                    except AttributeError:
                        # Block doesn't look like a meeting entry; skip.
                        continue
                    # "1:30 p.m." -> "1:30 PM" so strptime's %p matches.
                    time = time.replace(".", "").upper()
                    time = datetime.datetime.strptime(
                        time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                    time = self._tz.localize(time)
                    location = location.strip()
                    # Drop blank lines and digit-led lines (page/line
                    # numbers from the PDF conversion).
                    description = '\n'.join([
                        x.strip()
                        for x in description.split('\n')
                        if x.strip() and not x.strip()[0].isdigit()
                    ])
                    if not description:
                        description = '[No description provided by state]'
                    event = Event(name=description,
                                  start_date=time,
                                  location_name=location,
                                  description=description)
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')
                    # Attach House bills mentioned in the description.
                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)
                    yield event
def scrape(self, window=3):
    """Yield Events for Legistar meetings starting within the last `window` days.

    window -- lookback horizon in days (may be fractional).

    For each paired (API record, web record) from self.events(): build the
    Event from API fields, then attach recording link, Agenda/Minutes
    documents, participating committees, agenda items (with bill ids),
    roll-call attendees, and api/web sources.
    """
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
        float(window))
    for api_event, event in self.events(n_days_ago):
        when = api_event['start']
        location = api_event['EventLocation']
        description = event['Meeting\xa0Topic']
        # Placeholder topics carry no information; treat as no description.
        if any(each in description
               for each in ('Multiple meeting items',
                            'AGENDA TO BE ANNOUNCED')):
            description = None
        if description:
            e = Event(name=api_event["EventBodyName"],
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=api_event['status'])
        else:
            # Build without the description kwarg when there is none.
            e = Event(name=api_event["EventBodyName"],
                      start_date=when,
                      location_name=location,
                      status=api_event['status'])
        # Stable dedupe key for the pupa import layer.
        e.pupa_id = str(api_event['EventId'])
        if event['Multimedia'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Multimedia']['url'],
                             type="recording",
                             media_type='text/html')
        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Minutes')
        location_string = event[u'Meeting Location']
        location_notes, other_orgs = self._parse_location(location_string)
        if location_notes:
            e.extras = {'location note': ' '.join(location_notes)}
        # Determine which committees/bodies take part in this event.
        if e.name == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in e.name.lower():
            participating_orgs = [e.name]
        else:
            participating_orgs = []
        if other_orgs:
            # e.g. "Jointly with the X and the Y" -> ["X", "Y"]
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)
        for org in participating_orgs:
            e.add_committee(name=org)
        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)
        # Attendees: anyone marked "Present" in the roll call (set to
        # collapse duplicate roll-call rows).
        participants = set()
        for call in self.rollcalls(api_event):
            if call['RollCallValueName'] == 'Present':
                participants.add(call['RollCallPersonName'].strip())
        for person in participants:
            e.add_participant(name=person, type="person")
        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event),
                     note='api')
        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            # 'Meeting Details' is presumably not a dict when there is no
            # detail page (subscripting raises TypeError); fall back to the
            # events listing page as the web source.
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            # Only cite the detail page if it actually resolves.
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')
        yield e
def scrape_events_range(self, start_date, end_date):
    """Yield Events (and newly-seen Bills) for each day in [start_date, end_date).

    start_date, end_date -- datetime bounds; end_date is exclusive.

    For each calendar day, pulls that day's meetings, builds an Event in
    the America/Toronto timezone, and — when an agenda is published —
    attaches agenda items and yields a Bill for each agenda identifier not
    seen before (tracked in self.seen_agenda_items).
    """
    def daterange(start_date, end_date):
        # Generate each day from start_date up to but excluding end_date.
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + dt.timedelta(n)

    for date in daterange(start_date, end_date):
        events = self.extract_events_by_day(date)
        for event in events:
            tz = pytz.timezone("America/Toronto")
            # event['time'] is a wall-clock string like "9:30 AM"; combine
            # it with the day being scraped.
            time = dt.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(date.replace(hour=time.hour,
                                             minute=time.minute,
                                             second=0,
                                             microsecond=0))
            source_url = CALENDAR_DAY_TEMPLATE.format(start.year,
                                                      start.month,
                                                      start.day)
            org_name = event['meeting']
            e = Event(
                name = org_name,
                start_time = start,
                timezone = tz.zone,
                location_name = event['location'],
                status=STATUS_DICT.get(event['meeting_status'])
            )
            e.add_source(source_url)
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_participant(
                name = org_name,
                type = 'organization',
            )

            def is_agenda_available(event):
                # Minutes imply the agenda was published earlier.
                return event['publishing_status'] in ['Agenda Published',
                                                      'Minutes Published']

            def is_council(event):
                # Full-council meetings use a different agenda URL template.
                return True if event['meeting'] == self.jurisdiction.name else False

            if is_agenda_available(event):
                template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event) else AGENDA_FULL_STANDARD_TEMPLATE
                agenda_url = template.format(event['meeting_id'])
                full_identifiers = list(self.full_identifiers(event['meeting_id'],
                                                              is_council(event)))
                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    def normalize_wards(raw):
                        # Empty or 'All' -> 'all'; otherwise a list of
                        # ward names split on commas.
                        if not raw:
                            raw = 'All'
                        if raw == 'All':
                            return raw.lower()
                        else:
                            return raw.split(', ')

                    wards = normalize_wards(item['wards'])
                    # Agenda identifiers look like "2017.EX1.2"; match the
                    # short form (group 1) against the item identifier.
                    # NOTE: list-unpacking asserts exactly one match.
                    identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                    [full_identifier] = [id for id in full_identifiers
                                         if identifier_regex.match(id).group(1) == item['identifier']]
                    a.add_bill(full_identifier)
                    # First sighting of this agenda item: emit it as a Bill.
                    if full_identifier not in self.seen_agenda_items:
                        b = Bill(
                            # TODO: Fix this hardcode
                            legislative_session = '2014-2018',
                            identifier = full_identifier,
                            title = item['title'],
                            from_organization = {'name': self.jurisdiction.name},
                        )
                        b.add_source(agenda_url)
                        b.add_document_link(note='canonical',
                                            media_type='text/html',
                                            url=AGENDA_ITEM_TEMPLATE.format(full_identifier))
                        b.extras = {
                            'wards': wards,
                        }
                        self.seen_agenda_items.append(full_identifier)
                        yield b
            yield e
def scrape(self, window=30):
    """Yield Events for Pittsburgh Legistar meetings within the last `window` days.

    window -- lookback horizon in days (may be fractional).

    The second element of the API status tuple carries free-text status
    notes; a cascade of phrase checks maps those to a cancelled status, a
    location amendment, or a skip, falling back to the raw API status.
    """
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    self.retry_wait_seconds = 20
    for api_event, event in self.events(n_days_ago):
        description = api_event["EventComment"]
        when = api_event["start"]
        location = api_event["EventLocation"]
        # Expand the bare chamber name to a full street address.
        if location == "Council Chambers":
            location = "Council Chambers, 5th Floor, City-County Building, " \
                       "414 Grant Street, Pittsburgh, PA 15219"
        if not location :
            continue
        status_string = api_event["status"]
        if len(status_string) > 1 and status_string[1] :
            status_text = status_string[1].lower()
            # Phrases indicating the meeting as listed will not happen.
            if any(phrase in status_text for phrase in ("rescheduled to", "postponed to", "reconvened to", "rescheduled to", "meeting recessed", "recessed meeting", "postponed to", "recessed until", "deferred", "time change", "date change", "recessed meeting - reconvene", "cancelled", "new date and time", "rescheduled indefinitely", "rescheduled for",)) :
                status = "cancelled"
            elif status_text in ("rescheduled", "recessed") :
                status = "cancelled"
            # Notes that don't change the status; keep the API's value.
            elif status_text in ("meeting reconvened", "reconvened meeting", "recessed meeting", "reconvene meeting", "rescheduled hearing", "rescheduled meeting",) :
                status = api_event["status"]
            elif status_text in ("amended notice of meeting", "room change", "amended notice", "change of location", "revised - meeting date and time") :
                status = api_event["status"]
            # A room note amends the location rather than the status.
            elif "room" in status_text :
                location = status_string[1] + ", " + location
            elif status_text in ("wrong meeting date",):
                continue
            else :
                # Unrecognized note: surface it for scraper maintenance.
                print(status_text)
                status = api_event["status"]
        else :
            status = api_event["status"]
        # Normalize the two special meeting names to friendlier labels.
        if event["Meeting Name"] == "Post Agenda":
            event_name = "Agenda Announcement"
        elif event["Meeting Name"] == "City Council":
            event_name = "Regular meeting"
        else:
            event_name = event["Meeting Name"]
        if description:
            e = Event(name=event_name,
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=status)
        else:
            # Omit the description kwarg entirely when there is none.
            e = Event(name=event_name,
                      start_date=when,
                      location_name=location,
                      status=status)
        # Stable dedupe key for the pupa import layer.
        e.pupa_id = str(api_event["EventId"])
        if event["Meeting video"] != "Not\xa0available":
            if "url" not in event["Meeting video"]:
                pass
            else:
                video_url = self.get_meeting_video_link(event["Meeting video"]["url"])
                e.add_media_link(note="Recording",
                                 url=video_url,
                                 type="recording",
                                 media_type="text/html")
        self.addDocs(e, event, "Published agenda")
        self.addDocs(e, event, "Published minutes")
        participant = event["Meeting Name"]
        if participant == "City Council" or participant == "Post Agenda":
            participant = "Pittsburgh City Council"
        e.add_participant(name=participant, type="organization")
        for item in self.agenda(api_event):
            clean_title = self.clean_agenda_item_title(item["EventItemTitle"])
            agenda_item = e.add_agenda_item(clean_title)
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)
            # Per-item video: same player URL with item-specific meta_id.
            if item["EventItemVideo"] and event["Meeting video"] != "Not\xa0available":
                item_video_url = self.get_meeting_video_link(event["Meeting video"]["url"]) + \
                    '?view_id=2&meta_id=' + str(item["EventItemVideo"])
                agenda_item.add_media_link(note="Recording",
                                           url=item_video_url,
                                           type="recording",
                                           media_type="text/html")
        # Attendees: anyone marked "Present" in the roll call.
        participants = set()
        for call in self.rollcalls(api_event):
            if call["RollCallValueName"] == "Present":
                participants.add(call["RollCallPersonName"])
        for person in participants:
            e.add_participant(name=person, type="person")
        e.add_source(self.BASE_URL + "/events/{EventId}".format(**api_event),
                     note="api")
        try:
            detail_url = event["Meeting Details"]["url"]
        except TypeError:
            # 'Meeting Details' presumably isn't a dict when there is no
            # detail page; fall back to the events listing page.
            e.add_source(self.EVENTSPAGE, note="web")
        else:
            # Only cite the detail page if it actually resolves.
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note="web")
        yield e
def scrape_agenda(self, url):
    """Scrape one agenda-notice page and yield an Event with its bills.

    url -- the agenda page URL.

    Returns (yields nothing) when the page has no time/place table or the
    meeting is marked CANCELLED. Note: the local variable `datetime` is a
    string until strptime succeeds; it shadows nothing used here since the
    module is imported as `dt`.
    """
    page = self.lxmlize(url)
    # Get the date/time info:
    date_time = page.xpath("//table[@class='time_place']")
    if date_time == []:
        return
    date_time = date_time[0]
    lines = date_time.xpath("./tr")
    # Build DATE:/TIME:/PLACE: -> value from the table's label/value rows.
    metainf = {}
    for line in lines:
        tds = line.xpath("./td")
        metainf[tds[0].text_content()] = tds[1].text_content()
    date = metainf["DATE:"]
    time = metainf["TIME:"]
    where = metainf["PLACE:"]
    # check for duration in time
    if " - " in time:
        start, end = time.split(" - ")
        am_pm_srch = re.search("(?i)(am|pm)", end)
        if am_pm_srch:
            # Start lacks its own AM/PM; borrow it from the end time.
            time = " ".join([start, am_pm_srch.group().upper()])
        else:
            time = start
    fmts = [
        "%A, %B %d, %Y",
        "%A, %B %d, %Y %I:%M %p",
        "%A, %B %d, %Y %I:%M"
    ]
    event_desc = "Meeting Notice"
    if "Rise" in time:
        # "Rise of the Senate"-style times have no clock value; keep the
        # date only and put the phrasing into the event name.
        datetime = date
        event_desc = "Meeting Notice: Starting at {}".format(time)
    else:
        datetime = "%s %s" % (date, time)
    if "CANCELLED" in datetime.upper():
        return
    # Strip punctuation variants and scheduling noise before parsing.
    transtable = {
        "P.M": "PM",
        "PM.": "PM",
        "P.M.": "PM",
        "A.M.": "AM",
        "POSTPONED": "",
        "RESCHEDULED": "",
        "and Rise of the Senate": "",
    }
    for trans in transtable:
        datetime = datetime.replace(trans, transtable[trans])
    datetime = datetime.strip()
    # Try each accepted format; first success wins.
    for fmt in fmts:
        try:
            datetime = dt.datetime.strptime(datetime, fmt)
            break
        except ValueError:
            continue
    event = Event(name=event_desc,
                  start_date=self._tz.localize(datetime),
                  location_name=where)
    event.add_source(url)
    # aight. Let's get us some bills!
    bills = page.xpath("//b/a")
    for bill in bills:
        bill_ft = bill.attrib["href"]
        event.add_document(bill.text_content(), bill_ft,
                           media_type="application/pdf")
        root = bill.xpath("../../*")
        root = [x.text_content() for x in root]
        bill_id = "".join(root)
        if "SCHEDULED FOR" in bill_id:
            continue
        # Description lives two siblings past the bill's great-grandparent.
        descr = (bill.getparent().getparent().getparent().getnext().
                 getnext().text_content())
        # `replace` is a module-level substitution table defined elsewhere
        # in this file.
        for thing in replace:
            bill_id = bill_id.replace(thing, replace[thing])
        item = event.add_agenda_item(descr)
        item.add_bill(bill.text_content())
    committee = page.xpath("//span[@id='lblSession']")[0].text_content()
    event.add_participant(committee, "committee", note="host")
    yield event
def scrape(self):
    """Migrate all legacy OpenStates events for self.state into pupa Events.

    Consumes each API event dict destructively via pop() so that the
    trailing `assert event == {}` catches any field this migration does
    not know about. Duplicate (name, description, start_time) triples and
    events with no usable sources are skipped.
    """
    method = 'events/?state={}&dtstart=1776-07-04'.format(self.state)
    self.events = self.api(method)
    seen = set()
    for event in self.events:
        e = Event(name=event.pop('description'),
                  classification=event.pop('type'),
                  location=event.pop('location'),
                  timezone=event.pop('timezone'),
                  start_time=self._date_parse(event.pop('when')),
                  end_time=self._date_parse(event.pop('end')),)
        # Truncate over-long fields to fit schema limits.
        if len(e.name) >= 300:
            e.name = e.name[:290]
        if len(e.location['name']) >= 100:
            e.location['name'] = e.location['name'][:90]
        composite_key = (e.name, e.description, e.start_time)
        if composite_key in seen:
            print("Duplicate found: %s/%s/%s" % (composite_key))
            continue
        seen.add(composite_key)
        for source in event.pop('sources'):
            # 'retrieved' is legacy metadata add_source won't accept.
            if 'retrieved' in source:
                source.pop('retrieved')
            e.add_source(**source)
        if e.sources == []:
            continue
        # Legacy fields with no pupa equivalent; discard them.
        ignore = ['country', 'level', 'state', 'created_at', 'updated_at',
                  'notes', '+location_url', 'session', 'id', '+chamber',
                  '+agenda', '+cancelled', '+media_contact', '+contact',
                  '+details']
        # +agenda:
        #   Agenda on old (very old) OpenStates data is actually a string
        #   and not any sort of structured data we can use in the items
        #   schema, and is only present for a handful of events.
        for i in ignore:
            if i in event:
                event.pop(i)
        for link in ['+link', 'link']:
            if link in event:
                e.add_source(url=event.pop(link))
        for p in event.pop('participants', []):
            # Map legacy participant types onto pupa types; unknown types
            # fall through as None.
            type_ = {
                "committee": "organization",
                "legislator": "person",
                None: None,
            }[p.get('participant_type')]
            if type_ is None:
                # Garbage data.
                continue
            e.add_participant(name=p['participant'],
                              note=p['type'],
                              type=type_,)
        for b in event.pop('related_bills', []):
            # Prefer 'description'/'type'; fall back to the '+'-prefixed
            # legacy variants.
            item = e.add_agenda_item(
                b.pop('description', b.pop('+description', None)))
            item.add_bill(bill=b['bill_id'],
                          note=b.pop('type', b.pop('+type', None)))
        # Drop documents with duplicate URLs (known dirty data).
        seen_documents = set([])
        for document in event.pop('documents', []):
            if document['url'] in seen_documents:
                print("XXX: Buggy data in: Duped Document URL: %s (%s)" % (
                    document['url'], document['name']
                ))
                continue
            seen_documents.add(document['url'])
            e.add_document(url=document['url'],
                           note=document['name'])
        # Everything should have been consumed by now.
        assert event == {}, "Unknown fields: %s" % (
            ", ".join(event.keys())
        )
        yield e
def scrape(self, window=3):
    """Yield Events for Chicago Legistar meetings within the last `window` days.

    window -- lookback horizon in days (may be fractional).

    The location string encodes status notes after 'Chicago, Illinois';
    a cascade of phrase checks maps those notes to a cancelled status, a
    location amendment, a free-text description, or a skip, falling back
    to the raw API status.
    """
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
        float(window))
    for api_event, event in self.events(n_days_ago):
        description = None
        when = api_event['start']
        location_string = event[u'Meeting Location']
        # First two '--'-separated segments form the location proper.
        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue
        # Anything after 'Chicago, Illinois' is a free-text status note.
        status_string = location_list[-1].split('Chicago, Illinois')
        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            # Phrases meaning the meeting as listed will not happen.
            if any(phrase in status_text for phrase in (
                    'rescheduled to', 'postponed to', 'reconvened to',
                    'rescheduled to', 'meeting recessed',
                    'recessed meeting', 'postponed to', 'recessed until',
                    'deferred', 'time change', 'date change',
                    'recessed meeting - reconvene', 'cancelled',
                    'new date and time', 'rescheduled indefinitely',
                    'rescheduled for', )):
                status = 'cancelled'
            elif status_text in ('rescheduled', 'recessed'):
                status = 'cancelled'
            # Notes that don't change the status; keep the API's value.
            elif status_text in (
                    'meeting reconvened', 'reconvened meeting',
                    'recessed meeting', 'reconvene meeting',
                    'rescheduled hearing', 'rescheduled meeting', ):
                status = api_event['status']
            elif status_text in ('amended notice of meeting', 'room change',
                                 'amended notice', 'change of location',
                                 'revised - meeting date and time'):
                status = api_event['status']
            # A room note amends the location rather than the status.
            elif 'room' in status_text:
                location = status_string[1] + ', ' + location
            elif status_text in ('wrong meeting date', ):
                continue
            else:
                # Unrecognized note: surface it, and keep it as the
                # event description (minus the '--em--' markup).
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()
                status = api_event['status']
        else:
            status = api_event['status']
        if description:
            e = Event(name=event["Name"]["label"],
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=status)
        else:
            # Omit the description kwarg entirely when there is none.
            e = Event(name=event["Name"]["label"],
                      start_date=when,
                      location_name=location,
                      status=status)
        # Stable dedupe key for the pupa import layer.
        e.pupa_id = str(api_event['EventId'])
        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Video']['url'],
                             type="recording",
                             media_type='text/html')
        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Notice')
        self.addDocs(e, event, 'Transcript')
        self.addDocs(e, event, 'Summary')
        # Normalize body names to their canonical committee names.
        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'
        e.add_participant(name=participant, type="organization")
        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)
        # Attendees: anyone marked "Present" in the roll call.
        participants = set()
        for call in self.rollcalls(api_event):
            if call['RollCallValueName'] == 'Present':
                participants.add(call['RollCallPersonName'])
        for person in participants:
            e.add_participant(name=person, type="person")
        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event),
                     note='api')
        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            # 'Meeting Details' presumably isn't a dict when there is no
            # detail page; fall back to the events listing page.
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            # Only cite the detail page if it actually resolves.
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')
        yield e
def scrape_upper(self):
    """Scrape Ohio Senate committee meetings from the weekly PDF calendar.

    Downloads the calendar PDF, converts it to text, and yields one Event
    per meeting with the hosting committee attached; any Senate bill ids
    (e.g. "S. B. 123") found in the description become agenda items.
    """
    PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text').decode()
    os.remove(path)
    # Split on day headers like "Tuesday, June 5". With a capturing group,
    # re.split alternates [preamble, header, body, header, body, ...].
    days = re.split(r'(\w+day, \w+ \d{1,2})', text)
    date = None
    for day in enumerate(days[1:]):
        if day[0] % 2 == 0:
            # Calendar is put out for the current week, so use that year
            date = day[1] + ", " + str(datetime.datetime.now().year)
        else:
            # Day body alternates committee names with meeting blocks.
            events = re.split(r'\n\n((?:\w+\s?)+),\s', day[1])
            comm = ''
            for event in enumerate(events[1:]):
                if event[0] % 2 == 0:
                    comm = event[1].strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[AP]M) # Meeting time
                            .*?,\s # Potential extra text for meeting time
                            (.*?)\n # Location, usually a room
                            .*?\n # Chairman of committee holding event
                            (.*) # Description of event
                            ''', event[1]).groups()
                    except AttributeError:
                        # Block doesn't look like a meeting entry; skip.
                        continue
                    time = datetime.datetime.strptime(
                        time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                    time = self._tz.localize(time)
                    location = location.strip()
                    # Drop blank lines and known boilerplate from the PDF.
                    description = '\n'.join([
                        x.strip() for x in description.split('\n')
                        if x.strip()
                        and not x.strip().startswith("Page ")
                        and not x.strip().startswith("*Possible Vote")
                        and not x.strip() == "NO OTHER COMMITTEES WILL MEET"
                    ])
                    if not description:
                        description = '[No description provided by state]'
                    event = Event(name=description,
                                  start_date=time,
                                  location_name=location,
                                  description=description)
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')
                    # Attach Senate bills mentioned in the description.
                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)
                    yield event
def scrape(self):
    """Yield Events for NYC Legistar meetings since 2011.

    The location field may carry an '--em--'-delimited note; note parts
    starting with 'Join' name co-hosting committees, the rest are stored
    as a location note. A small deque of recent (name, when) pairs guards
    against the listing repeating the same event.
    """
    last_events = deque(maxlen=10)
    for event, agenda in self.events(since=2011):
        other_orgs = ''
        extras = []
        if '--em--' in event[u'Meeting Location']:
            location_string, note = event[u'Meeting Location'].split(
                '--em--')[:2]
            for each in note.split(' - '):
                if each.startswith('Join'):
                    # e.g. "Jointly with the ..." lists co-hosting orgs.
                    other_orgs = each
                else:
                    extras.append(each)
        else:
            location_string = event[u'Meeting Location']
        # First two '-'-separated segments form the location proper.
        location_list = location_string.split('-', 2)
        location = ', '.join([each.strip() for each in location_list[0:2]])
        if not location:
            continue
        when = self.toTime(event[u'Meeting Date'])
        # The date field has no clock time; take it from the iCalendar
        # attachment's DTSTART.
        event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour,
                            minute=event_time.minute)
        time_string = event['Meeting Time']
        if time_string in ('Deferred', ):
            status = 'cancelled'
        elif self.now() < when:
            status = 'confirmed'
        else:
            status = 'passed'
        description = event['Meeting\xa0Topic']
        # Placeholder topics carry no information; drop them.
        if any(each in description
               for each in ('Multiple meeting items',
                            'AGENDA TO BE ANNOUNCED')):
            description = ''
        event_name = event['Name']
        # Skip events we've just seen (listing can repeat entries).
        event_id = (event_name, when)
        if event_id in last_events:
            continue
        else:
            last_events.append(event_id)
        e = Event(name=event_name,
                  start_time=when,
                  timezone=self.TIMEZONE,
                  description=description,
                  location_name=location,
                  status=status)
        if extras:
            e.extras = {'location note': ' '.join(extras)}
        if event['Multimedia'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Multimedia']['url'],
                             type="recording",
                             media_type='text/html')
        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Minutes')
        # Determine which committees/bodies take part in this event.
        if event['Name'] == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in event['Name'].lower():
            participating_orgs = [event["Name"]]
        else:
            participating_orgs = []
        if other_orgs:
            # e.g. "Jointly with the X and the Y" -> ["X", "Y"]
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)
        for org in participating_orgs:
            e.add_committee(name=org)
        if agenda:
            e.add_source(event["Meeting Details"]['url'])
            for item, _, _ in agenda:
                if item["Name"]:
                    agenda_item = e.add_agenda_item(item["Name"])
                    if item["File\xa0#"]:
                        # Use the recorded action as the bill note when
                        # present; otherwise a generic 'consideration'.
                        if item['Action']:
                            note = item['Action']
                        else:
                            note = 'consideration'
                        agenda_item.add_bill(item["File\xa0#"]['label'],
                                             note=note)
        else:
            e.add_source(self.EVENTSPAGE)
        yield e
def scrape(self, window=None):
    """Yield Events for LA Metro meetings, merged from API and web records.

    window -- lookback horizon in days; None scrapes everything.

    Raises ValueError when an agenda contains duplicate
    EventItemAgendaSequence values (known Legistar data defect) so the
    scrape fails until Metro cleans the data.
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
    else:
        n_days_ago = None
    events = self.events(since_datetime=n_days_ago)
    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]
        # "Board of Directors - X" splits into body and event names.
        if 'Board of Directors -' in body_name:
            body_name, event_name = [
                part.strip() for part in body_name.split('-')
            ]
        else:
            event_name = body_name
        # Events can have an EventAgendaStatusName of "Final", "Final Revised",
        # and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'
        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'
        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)
        # Stable dedupe key for the pupa import layer.
        e.pupa_id = str(event['EventId'])
        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}
        legistar_api_url = self.BASE_URL + '/events/{0}'.format(
            event['EventId'])
        e.add_source(legistar_api_url, note='api')
        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']
        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)
                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(
                        item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)
                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item[
                    'EventItemAgendaSequence']
            # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
            # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
            # cleans the data.
            item_agenda_sequences = [
                item['extras']['item_agenda_sequence'] for item in e.agenda
            ]
            if len(item_agenda_sequences) != len(
                    set(item_agenda_sequences)):
                error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
{event_name} on {event_date} ({legistar_api_url}). \
Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'
                raise ValueError(
                    error_msg.format(
                        event_name=e.name,
                        event_date=e.start_date.strftime("%B %d, %Y"),
                        legistar_api_url=legistar_api_url))
        e.add_participant(name=body_name, type="organization")
        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL +
                         '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')
        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")
        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")
        if web_event['Published minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Published minutes']['label'],
                           url=web_event['Published minutes']['url'],
                           media_type="application/pdf")
        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue
            # Sometimes if there is an issue getting the Spanish
            # audio created, Metro has the Spanish Audio link
            # go to the English Audio.
            #
            # Pupa does not allow the for duplicate media links,
            # so we'll ignore the the second media link if it's
            # the same as the first media link.
            #
            # Because of the way that the event['audio'] is created
            # the first audio link is always English and the
            # second is always Spanish
            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html',
                             on_duplicate='ignore')
        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')
        yield e
def transform_parse(self, parsed_form, response):
    """Transform a parsed LDA Form LD-1 into pupa entities and yield them.

    Yields, in order: the registrant (Person or Organization), the
    self-employment Organization (if any), the client, the main contact,
    affiliated organizations, foreign entities, lobbyists, the
    registration Event, and finally the Disclosure itself.

    :param parsed_form: dict produced by the LD-1 form parser (registrant,
        client, lobbyists, foreign_entities, affiliated_organizations,
        registration_type, datetimes, signature, _meta keys).
    :param response: HTTP response the form came from; only ``.url`` is used.
    """
    _source = {
        "url": response.url,
        "note": "LDA Form LD-1"
    }

    # basic disclosure fields
    _disclosure = Disclosure(
        effective_date=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        timezone='America/New_York',
        submitted_date=datetime.strptime(
            parsed_form['datetimes']['signature_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification="lobbying"
    )

    _disclosure.add_authority(name=self.authority.name,
                              type=self.authority._type,
                              id=self.authority._id)

    _disclosure.add_identifier(
        identifier=parsed_form['_meta']['document_id'],
        scheme="urn:sopr:filing"
    )

    # disclosure extras
    _disclosure.extras = {}
    _disclosure.extras['registrant'] = {
        'self_employed_individual': parsed_form['registrant']['self_employed_individual'],
        'general_description': parsed_form['registrant']['registrant_general_description'],
        'signature': {
            "signature_date": parsed_form['datetimes']['signature_date'],
            "signature": parsed_form['signature']
        }
    }

    _disclosure.extras['client'] = {
        'same_as_registrant': parsed_form['client']['client_self'],
        'general_description': parsed_form['client']['client_general_description']
    }

    _disclosure.extras['registration_type'] = {
        'is_amendment': parsed_form['registration_type']['is_amendment'],
        'new_registrant': parsed_form['registration_type']['new_registrant'],
        'new_client_for_existing_registrant': parsed_form['registration_type'][
            'new_client_for_existing_registrant'],
    }

    #
    # Registrant
    # build registrant
    _registrant_self_employment = None

    if parsed_form['registrant']['self_employed_individual']:
        # Individual registrant: model the person plus a synthetic
        # "self-employment" Organization that the person belongs to.
        n = ' '.join([p for p in [
            parsed_form['registrant']['registrant_individual_prefix'],
            parsed_form['registrant']['registrant_individual_firstname'],
            parsed_form['registrant']['registrant_individual_lastname']
        ] if len(p) > 0]).strip()

        _registrant = Person(
            name=n,
            source_identified=True
        )

        _registrant_self_employment = Organization(
            name='SELF-EMPLOYMENT of {n}'.format(n=n),
            classification='company',
            source_identified=True
        )

        _registrant.add_membership(
            organization=_registrant_self_employment,
            role='self_employed',
            label='self-employment of {n}'.format(n=n),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant = Organization(
            name=parsed_form['registrant']['registrant_org_name'],
            classification='company',
            source_identified=True
        )

    if len(parsed_form['registrant']['registrant_house_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_house_id'],
            scheme='urn:house_clerk:registrant'
        )

    if len(parsed_form['registrant']['registrant_senate_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_senate_id'],
            scheme='urn:sopr:registrant'
        )

    registrant_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['registrant']['registrant_address_one'],
                    parsed_form['registrant']['registrant_address_two'],
                    parsed_form['registrant']['registrant_city'],
                    parsed_form['registrant']['registrant_state'],
                    parsed_form['registrant']['registrant_zip'],
                    parsed_form['registrant']['registrant_country']]
                if len(p) > 0]).strip(),
        },
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        },
    ]

    registrant_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['registrant']['registrant_ppb_city'],
                parsed_form['registrant']['registrant_ppb_state'],
                parsed_form['registrant']['registrant_ppb_zip'],
                parsed_form['registrant']['registrant_ppb_country']]
            if len(p) > 0]).strip(),
    }

    # Only attach the principal-place-of-business address if any part of
    # it was actually filled in on the form.
    if registrant_contact_ppb["value"]:
        registrant_contact_details.append(registrant_contact_ppb)

    for cd in registrant_contact_details:
        _registrant.add_contact_detail(**cd)

    # Keep the un-joined address parts around for downstream consumers.
    _registrant.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address_one",
                        "value": parsed_form['registrant'][
                            'registrant_address_one'],
                    },
                    {
                        "note": "address_two",
                        "value": parsed_form['registrant'][
                            'registrant_address_two'],
                    },
                    {
                        "note": "city",
                        "value": parsed_form['registrant'][
                            'registrant_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['registrant'][
                            'registrant_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['registrant'][
                            'registrant_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['registrant'][
                            'registrant_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['registrant'][
                            'registrant_ppb_country'],
                    }
                ],
            },
        ]
    }

    #
    # People
    # build contact
    _main_contact = Person(
        name=parsed_form['registrant']['registrant_contact_name'],
        source_identified=True
    )

    main_contact_contact_details = [
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        }
    ]

    for cd in main_contact_contact_details:
        _main_contact.add_contact_detail(**cd)

    # The main contact belongs to the registrant org, or — for individual
    # registrants — to the synthetic self-employment org.
    if _registrant._type == 'organization':
        _registrant.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant_self_employment.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )

    #
    # Client
    # build client
    _client = Organization(
        name=parsed_form['client']['client_name'],
        classification='company',
        source_identified=True
    )

    client_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['client']['client_address'],
                    parsed_form['client']['client_city'],
                    parsed_form['client']['client_state'],
                    parsed_form['client']['client_zip'],
                    parsed_form['client']['client_country']]
                if len(p) > 0]).strip(),
        },
    ]

    client_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['client']['client_ppb_city'],
                parsed_form['client']['client_ppb_state'],
                parsed_form['client']['client_ppb_zip'],
                parsed_form['client']['client_ppb_country']]
            if len(p) > 0]).strip(),
    }

    if client_contact_ppb["value"]:
        client_contact_details.append(client_contact_ppb)

    for cd in client_contact_details:
        _client.add_contact_detail(**cd)

    _client.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": parsed_form['client']['client_address'],
                    },
                    {
                        "note": "city",
                        "value": parsed_form['client']['client_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['client']['client_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['client']['client_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['client']['client_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": parsed_form['client']['client_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['client']['client_ppb_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['client']['client_ppb_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['client'][
                            'client_ppb_country'],
                    }
                ],
            },
        ],
    }

    # Collect Foreign Entities, de-duplicating on name.
    _foreign_entities = []
    _foreign_entities_by_name = {}
    for fe in parsed_form['foreign_entities']:
        fe_extras = {}
        fe_name = fe['foreign_entity_name']

        # check for name-based duplicates
        if fe_name in _foreign_entities_by_name:
            _foreign_entity = _foreign_entities_by_name[fe_name]
        else:
            _foreign_entity = Organization(
                name=fe_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        foreign_entity_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_address'],
                        fe['foreign_entity_city'],
                        fe['foreign_entity_state'],
                        fe['foreign_entity_country']]
                    if len(p) > 0]).strip(),
            },
            {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_ppb_state'],
                        fe['foreign_entity_ppb_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        foreign_entity_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    fe['foreign_entity_ppb_city'],
                    fe['foreign_entity_ppb_state'],
                    fe['foreign_entity_ppb_country']]
                if len(p) > 0]),
        }

        # NOTE(review): a state/country "principal place of business" entry
        # is always present above, and a second city-level one is appended
        # here when non-empty — possibly intentional, kept as-is.
        if foreign_entity_contact_ppb["value"]:
            foreign_entity_contact_details.append(
                foreign_entity_contact_ppb)

        # add contact details
        for cd in foreign_entity_contact_details:
            if cd['value'] != '':
                _foreign_entity.add_contact_detail(**cd)

        # add extras
        fe_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": fe['foreign_entity_address'],
                    },
                    {
                        "note": "city",
                        "value": fe['foreign_entity_city'],
                    },
                    {
                        "note": "state",
                        "value": fe['foreign_entity_state'],
                    },
                    {
                        "note": "country",
                        "value": fe['foreign_entity_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "state",
                        "value": fe['foreign_entity_ppb_state'],
                    },
                    {
                        "note": "country",
                        "value": fe['foreign_entity_ppb_country'],
                    }
                ],
            },
        ]

        _foreign_entity.extras = combine_dicts(_foreign_entity.extras,
                                               fe_extras)
        _foreign_entities_by_name[fe_name] = _foreign_entity

    for unique_foreign_entity in _foreign_entities_by_name.values():
        _foreign_entities.append(unique_foreign_entity)

        # TODO: add a variant on memberships to represent inter-org
        # relationships (associations, ownership, etc)
        #
        # _client['memberships'].append({
        #     "id": _foreign_entity['id'],
        #     "classification": "organization",
        #     "name": _foreign_entity['name'],
        #     "extras": {
        #         "ownership_percentage":
        #             fe['foreign_entity_amount']
        #     }
        # })

    # Collect Lobbyists
    # TODO: deal with wierd non-name line continuation cases (blanks, "continued")
    _lobbyists_by_name = {}

    for l in parsed_form['lobbyists']:
        l_extras = {}
        l_name = ' '.join([l['lobbyist_first_name'],
                           l['lobbyist_last_name'],
                           l['lobbyist_suffix']
                           ]).strip()

        if l_name in _lobbyists_by_name:
            _lobbyist = _lobbyists_by_name[l_name]
        else:
            _lobbyist = Person(
                name=l_name,
                source_identified=True
            )

        if l['lobbyist_covered_official_position']:
            l_extras['lda_covered_official_positions'] = [
                {
                    'date_reported':
                        parsed_form['datetimes']['effective_date'],
                    'covered_official_position':
                        l['lobbyist_covered_official_position']
                },
            ]

        _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)
        _lobbyists_by_name[l_name] = _lobbyist

    _lobbyists = []
    for unique_lobbyist in _lobbyists_by_name.values():
        _lobbyists.append(unique_lobbyist)

    # Lobbyists are members of the registrant org (or the self-employment
    # org for individual registrants).
    if _registrant._type == 'organization':
        for l in _lobbyists:
            _registrant.add_member(
                l,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
    else:
        for l in _lobbyists:
            _registrant_self_employment.add_member(
                l,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )

    #
    # Document
    # build document
    _disclosure.add_document(
        note='submitted filing',
        date=parsed_form['datetimes']['effective_date'][:10],
        url=response.url
    )

    # Collect Affiliated orgs, de-duplicating on name.
    _affiliated_organizations = []
    _affiliated_organizations_by_name = {}
    for ao in parsed_form['affiliated_organizations']:
        ao_extras = {}
        ao_name = ao['affiliated_organization_name']
        if ao_name in _affiliated_organizations_by_name:
            # There's already one by this name
            _affiliated_organization = _affiliated_organizations_by_name[ao_name]
        else:
            # New affiliated org
            _affiliated_organization = Organization(
                name=ao_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        affiliated_organization_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        ao['affiliated_organization_address'],
                        ao['affiliated_organization_city'],
                        ao['affiliated_organization_state'],
                        ao['affiliated_organization_zip'],
                        ao['affiliated_organization_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        affiliated_organization_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    ao['affiliated_organization_ppb_city'],
                    ao['affiliated_organization_ppb_state'],
                    ao['affiliated_organization_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if affiliated_organization_contact_ppb["value"]:
            affiliated_organization_contact_details.append(
                affiliated_organization_contact_ppb)

        # add contact details
        for cd in affiliated_organization_contact_details:
            _affiliated_organization.add_contact_detail(**cd)

        # BUGFIX: this assignment previously ended with a trailing comma,
        # wrapping the list in a 1-tuple (unlike the registrant/client/
        # foreign-entity structures).
        ao_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": ao['affiliated_organization_address'],
                    },
                    {
                        "note": "city",
                        "value": ao['affiliated_organization_city'],
                    },
                    {
                        "note": "state",
                        "value": ao['affiliated_organization_state'],
                    },
                    {
                        "note": "zip",
                        "value": ao['affiliated_organization_zip'],
                    },
                    {
                        "note": "country",
                        "value": ao['affiliated_organization_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": ao['affiliated_organization_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": ao['affiliated_organization_ppb_state'],
                    },
                    {
                        "note": "country",
                        "value": ao['affiliated_organization_ppb_country'],
                    }
                ],
            },
        ]

        _affiliated_organization.extras = combine_dicts(
            _affiliated_organization.extras, ao_extras)
        # BUGFIX: new orgs were never stored in the dedup dict (the
        # foreign-entity loop above does this), so affiliated orgs were
        # never yielded or attached to the event.
        _affiliated_organizations_by_name[ao_name] = _affiliated_organization

    for unique_affiliated_organization in _affiliated_organizations_by_name.values():
        _affiliated_organizations.append(unique_affiliated_organization)

    #
    # Events & Agendas
    # name
    if parsed_form['registration_type']['new_registrant']:
        registration_type = 'New Client, New Registrant'
    elif parsed_form['registration_type']['is_amendment']:
        registration_type = 'Amended Registration'
    else:
        registration_type = 'New Client for Existing Registrant'

    # Create registration event
    _event = Event(
        name="{rn} - {rt}, {cn}".format(rn=_registrant.name,
                                        rt=registration_type,
                                        cn=_client.name),
        timezone='America/New_York',
        location='United States',
        start_time=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification='registration'
    )

    # add participants
    _event.add_participant(type=_registrant._type,
                           id=_registrant._id,
                           name=_registrant.name,
                           note="registrant")

    if _registrant._type == 'person':
        # BUGFIX: previously re-added the registrant person a second time;
        # add the self-employment organization instead (always built when
        # the registrant is a person).
        _event.add_participant(type=_registrant_self_employment._type,
                               id=_registrant_self_employment._id,
                               name=_registrant_self_employment.name,
                               note="registrant")

    _event.add_participant(type=_client._type,
                           id=_client._id,
                           name=_client.name,
                           note="client")

    for l in _lobbyists:
        _event.add_participant(type=l._type,
                               id=l._id,
                               name=l.name,
                               note='lobbyist')

    for fe in _foreign_entities:
        _event.add_participant(type=fe._type,
                               id=fe._id,
                               name=fe.name,
                               note='foreign_entity')

    for ao in _affiliated_organizations:
        _event.add_participant(type=ao._type,
                               id=ao._id,
                               name=ao.name,
                               note='affiliated_organization')

    # add agenda item
    _agenda = _event.add_agenda_item(
        description='issues lobbied on',
    )

    _agenda['notes'].append(
        parsed_form['lobbying_issues_detail']
    )

    for li in parsed_form['lobbying_issues']:
        if li['general_issue_area'] != '':
            _agenda.add_subject(li['general_issue_area'])

    _disclosure.add_disclosed_event(
        name=_event.name,
        type=_event._type,
        classification=_event.classification,
        id=_event._id
    )

    # add registrant to disclosure's _related and related_entities fields
    _disclosure.add_registrant(name=_registrant.name,
                               type=_registrant._type,
                               id=_registrant._id)

    _registrant.add_source(
        url=_source['url'],
        note='registrant'
    )
    yield _registrant

    if _registrant_self_employment is not None:
        _registrant_self_employment.add_source(
            url=_source['url'],
            note='registrant_self_employment'
        )
        yield _registrant_self_employment

    _client.add_source(
        url=_source['url'],
        note='client'
    )
    yield _client

    _main_contact.add_source(
        url=_source['url'],
        note='main_contact'
    )
    yield _main_contact

    for ao in _affiliated_organizations:
        ao.add_source(
            url=_source['url'],
            note='affiliated_organization'
        )
        yield ao

    for fe in _foreign_entities:
        fe.add_source(
            url=_source['url'],
            note='foreign_entity'
        )
        yield fe

    for l in _lobbyists:
        l.add_source(
            url=_source['url'],
            note='lobbyist'
        )
        yield l

    _event.add_source(**_source)
    yield _event

    _disclosure.add_source(**_source)
    yield _disclosure
def _parse_house_floor_xml_legislative_activity(self, xml):
    """
    Parses XML string of House floor updates and yields them in loop.

    Builds one Event per ``<floor_action>`` element, attaching agenda
    items for any bills, public laws, and votes the action references,
    plus documents for referenced reports and committees mentioned in
    the action text.

    @param xml: XML of field update
    @type xml: string
    @return: complete Event object
    @rtype: Event
    """
    tree = self._xml_parser(xml)

    # Congress number is used to build pseudo-ids for bills and votes below.
    congress = tree.xpath('.//legislative_congress')[0].get('congress')

    house_committees = self._get_current_house_committee_names()
    for fa in tree.xpath('.//floor_action'):
        fa_text = fa.xpath('.//action_description')[0].xpath('string()')

        # Timestamps in the feed are naive US/Eastern; localize then store UTC.
        eastern = pytz.timezone('US/Eastern')
        dt = datetime.datetime.strptime(fa.xpath('action_time')[0].get('for-search'),
                                        '%Y%m%dT%H:%M:%S')

        # NOTE(review): positional args here ('US/Eastern', '') depend on the
        # Event constructor signature of the pupa version in use — confirm.
        event = Event('House Floor Update on {0} at {1}.'.format(dt.strftime('%Y-%m-%d'),
                                                                 dt.strftime('%H:%M:%S')),
                      eastern.localize(dt).astimezone(pytz.utc),
                      'US/Eastern',
                      '',
                      description=fa_text,
                      classification='floor_update')

        event.set_location("East Capitol Street Northeast & First St SE, Washington, DC 20004",
                           note='House Floor', url='http://www.house.gov',
                           coordinates={'latitude': '38.889931', 'longitude': '-77.009003'})

        event.add_source(self._house_floor_src_url(date_str=tree.xpath('.//legislative_day')[0].get('date')),
                         note="Scraped from the Office of the Clerk, U.S. House of Representatives website.")

        # Clerk-assigned identifiers for this floor action.
        event.extras['act-id'] = fa.get('act-id')
        event.extras['unique-id'] = fa.get('unique-id')

        # bills
        ai_b = event.add_agenda_item(description='Bills referenced by this update.')
        for bill in fa.xpath(".//a[@rel='bill']"):
            bill_name = bill.xpath('string()')
            ai_b.add_bill(bill_name,
                          id=make_pseudo_id(identifier=bill_code_to_id(bill_name),
                                            congress=congress),
                          note="Bill was referenced on the House floor.")

        # publaws
        ai_p = event.add_agenda_item(description='Public laws referenced by this update.')
        for law in fa.xpath(".//a[@rel='publaw']"):
            # The link points at a specific rendition; its detail page lives
            # two path components up.
            detail_url = '/'.join(law.get('href').split('/')[0:-2]) + '/content-detail.html'
            ai_p.add_bill(law.xpath('string()'),
                          id=make_pseudo_id(**self._public_law_detail_scraper(url=detail_url)),
                          note='Law was referenced on the House floor.')

        # votes
        ai_v = event.add_agenda_item(description='Votes referenced by this update.')
        for vote in fa.xpath(".//a[@rel='vote']"):
            vote_name = vote.xpath('string()')
            ai_v.add_vote(vote_name,
                          id=make_pseudo_id(identifier=vote_code_to_id(vote_name),
                                            congress=congress),
                          note='Vote was referenced on the House floor.')

        # reports
        for report in fa.xpath(".//a[@rel='report']"):
            event.add_document('Document referenced by this update.',
                               report.get('href'), media_type='text/html')

        # Committee names in the text usually omit the "House " prefix,
        # so match on the stripped form.
        for name in house_committees:
            if name.replace('House ', '') in fa_text:
                event.add_committee(name, id=make_pseudo_id(name=name))

        # TODO identify legislators and add them as participants?

        yield event
def scrape(self, window=None):
    """Yield Event objects for LA Metro meetings.

    Merges Legistar API events with their web-calendar counterparts.

    :param window: optional number of days to look back; when omitted,
        all events returned by ``self.events`` are scraped.
    """
    # Restrict the scrape window when one was requested.
    since = None
    if window:
        since = datetime.datetime.utcnow() - datetime.timedelta(float(window))

    # Map Legistar agenda statuses onto OCD event statuses; anything
    # unrecognized falls back to 'tentative'.
    status_by_agenda = {
        'Draft': 'confirmed',
        'Final': 'passed',
        'Canceled': 'cancelled',
    }

    for event, web_event in self._merge_events(self.events(since)):
        body_name = event["EventBodyName"]

        # "Board of Directors - <meeting name>" carries the event name
        # after the dash; otherwise the body name doubles as event name.
        if 'Board of Directors -' in body_name:
            body_name, event_name = [part.strip()
                                     for part in body_name.split('-')]
        else:
            event_name = body_name

        status = status_by_agenda.get(event['EventAgendaStatusName'],
                                      'tentative')

        # We expect some events to have no location. LA Metro would like
        # these displayed in the Councilmatic interface, but OCD requires
        # a value — substitute a sane default.
        location = event["EventLocation"] or 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Also record the Spanish event GUID when present.
        e.extras = {'guid': event['EventGuid']}
        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        # Only capture agenda data from the API when a meeting detail
        # page exists on Legistar.
        if 'event_details' in event:
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])

                if item["EventItemMatterFile"]:
                    agenda_item.add_bill(item["EventItemMatterFile"])

                if item["EventItemAgendaNumber"]:
                    # Record the item number as given in the agenda minutes.
                    agenda_item['notes'].append(
                        "Agenda number, {}".format(item["EventItemAgendaNumber"]))

        e.add_participant(name=body_name, type="organization")

        e.add_source(self.BASE_URL + '/events/{0}'.format(event['EventId']),
                     note='api')
        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # The redirect does not yet point at the audio file;
                # skip this one and retry on the next scrape.
                continue

            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self):
    """Yield Event objects for Chicago City Council meetings.

    The meeting-location string doubles as a status field: text after
    "Chicago, Illinois" encodes cancellations, reschedules, room
    changes, etc., and is classified by phrase-matching below.
    """
    for event, agenda in self.events():
        description = None

        # Location strings look like "<room>--<address>--<status text>".
        location_string = event[u'Meeting Location']
        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue

        # Date comes from the listing; time-of-day from the iCalendar feed.
        when = self.toTime(event[u'Meeting Date'])
        event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour,
                            minute=event_time.minute)

        # Anything after "Chicago, Illinois" is status/annotation text.
        status_string = location_list[-1].split('Chicago, Illinois')

        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            # Phrases that imply this listing no longer stands as scheduled.
            if any(phrase in status_text for phrase in
                   ('rescheduled to',
                    'postponed to',
                    'reconvened to',
                    'rescheduled to',
                    'meeting recessed',
                    'recessed meeting',
                    'postponed to',
                    'recessed until',
                    'deferred',
                    'time change',
                    'date change',
                    'recessed meeting - reconvene',
                    'cancelled',
                    'new date and time',
                    'rescheduled indefinitely',
                    'rescheduled for',)):
                status = 'cancelled'
            elif status_text in ('rescheduled', 'recessed'):
                status = 'cancelled'
            elif status_text in ('meeting reconvened',
                                 'reconvened meeting',
                                 'recessed meeting',
                                 'reconvene meeting',
                                 'rescheduled hearing',
                                 'rescheduled meeting',):
                status = confirmedOrPassed(when)
            elif status_text in ('amended notice of meeting',
                                 'room change',
                                 'amended notice',
                                 'change of location',
                                 'revised - meeting date and time'):
                status = confirmedOrPassed(when)
            elif 'room' in status_text:
                # NOTE(review): this branch never assigns `status`; on the
                # first loop iteration the Event construction below would
                # raise NameError — confirm intent.
                location = status_string[1] + ', ' + location
            elif status_text in ('wrong meeting date',):
                continue
            else:
                # NOTE(review): `print` of unrecognized status text looks
                # like leftover debugging — consider the scraper's logger.
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()
                status = confirmedOrPassed(when)
        else:
            status = confirmedOrPassed(when)

        if description:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      description=description,
                      timezone='US/Central',
                      location_name=location,
                      status=status)
        else:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      timezone='US/Central',
                      location_name=location,
                      status=status)

        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                            url=event['Video']['url'],
                            type="recording",
                            media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Notice')
        self.addDocs(e, event, 'Transcript')
        self.addDocs(e, event, 'Summary')

        # Normalize a couple of known participant names.
        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'
        e.add_participant(name=participant,
                          type="organization")

        if agenda:
            e.add_source(event['Meeting Details']['url'], note='web')

            for item, _, _ in agenda:
                agenda_item = e.add_agenda_item(item["Title"])

                if item["Record #"]:
                    identifier = item["Record #"]['label']
                    # Strip a leading 'S' (substitute designation) so the
                    # identifier matches the underlying bill.
                    if identifier.startswith('S'):
                        identifier = identifier[1:]
                    agenda_item.add_bill(identifier)
        else:
            e.add_source(self.EVENTSPAGE, note='web')

        yield e
def get_events(self):
    """Yield Event objects for Toronto council/committee meetings.

    Downloads per-member attendance CSVs and the meeting-schedule CSV
    from the TMMIS admin-report endpoints, then builds one Event per
    meeting with its attendees and agenda items.

    Report endpoints used:
      http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport
      http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport
    """
    # scrape attendance: one CSV per council member, kept in a temp dir
    # that is now cleaned up even if the scrape aborts mid-way.
    tmpdir = tempfile.mkdtemp()
    try:
        page = self.lxmlize("http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport")
        members = page.xpath('//td[@class="inputText"]/select[@name="memberId"]/option')
        for member in members:
            post = {
                "function": "getMemberAttendanceReport",
                "download": "csv",
                "exportPublishReportId": 1,
                "termId": 4,
                "memberId": member.attrib["value"],
                "decisionBodyId": 0,
            }
            r = self.post("http://app.toronto.ca/tmmis/getAdminReport.do", data=post)
            if r.headers["content-type"] != "application/vnd.ms-excel":
                # Not a CSV payload (likely an error page); skip this member.
                continue
            with open(tmpdir + "/" + member.text + ".csv", "w") as attendance_file:
                attendance_file.write(r.text)

        # scrape events
        post = {
            "function": "getMeetingScheduleReport",
            "download": "csv",
            "exportPublishReportId": 3,
            "termId": 4,
            "decisionBodyId": 0,
        }
        r = self.post("http://app.toronto.ca/tmmis/getAdminReport.do", data=post)
        empty = []

        with open("meetings.csv", "w") as meeting_file:
            meeting_file.write(r.text)

        try:
            # NOTE(review): "rb" + csv.reader is Python-2 style; under
            # Python 3 csv expects a text-mode file — confirm runtime.
            with open("meetings.csv", "rb") as csvfile:
                reader = csv.reader(csvfile, delimiter=",")
                next(reader)  # skip the header row

                committee = ""
                agenda_items = []
                for row in reader:
                    name = row[0]
                    when = row[2]
                    when = dt.datetime.strptime(when, "%Y-%m-%d")
                    location = row[5]

                    # Rows are grouped by committee; refresh the agenda
                    # cache only when the committee changes.
                    if name != committee:
                        committee = name
                        agenda_items = find_items(committee)

                    e = Event(name=name,
                              session=self.session,
                              when=when,
                              location=location)

                    attendees = find_attendees(tmpdir, row)
                    if len(attendees) == 0:
                        empty.append(row)
                    # Reuse the already-computed list instead of calling
                    # find_attendees a second time.
                    for attendee in attendees:
                        e.add_person(attendee)

                    e.add_source("http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport")

                    # Attach only the agenda items dated on this meeting's day.
                    for item in agenda_items:
                        if item["date"].date() == when.date():
                            i = e.add_agenda_item(item["description"])
                            i.add_committee(committee)
                            i["order"] = item["order"]
                            for link in item["links"]:
                                i.add_media_link(link["name"],
                                                 link["url"],
                                                 on_duplicate="ignore")
                            if "notes" in item:
                                i["notes"] = [item["notes"]]

                    yield e
        finally:
            os.remove("meetings.csv")
    finally:
        shutil.rmtree(tmpdir)
def scrape_lower(self):
    """Yield Event objects for Ohio House committee hearings.

    Parses the chamber's schedule PDF: the text alternates day headers
    ("Wednesday, May 2, 2018") with day bodies, which in turn alternate
    committee names with hearing details.
    """
    PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text-nolayout').decode()
    os.remove(path)

    # BUGFIX: was r'(\wF+day, ...)', which cannot match any English
    # weekday name, so the PDF never split into days and no events were
    # emitted. \w+day matches the full %A weekday names parsed below.
    days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
    date = None
    # Captured headers and day bodies alternate: even offsets are dates,
    # odd offsets are the day's schedule text.
    for day_idx, day_chunk in enumerate(days[1:]):
        if day_idx % 2 == 0:
            date = day_chunk
        else:
            # Within a day, committee names alternate with hearing blocks.
            events = re.split(r'\n((?:\w+\s?)+)\n', day_chunk)
            comm = ''
            for ev_idx, ev_chunk in enumerate(events[1:]):
                if ev_idx % 2 == 0:
                    comm = ev_chunk.strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                            .*?,\s  # Potential extra text for meeting time
                            (.*?),\s  # Location, usually a room
                            .*?\n  # Chairman of committee holding event
                            (.*)  # Description of event
                            ''', ev_chunk).groups()
                    except AttributeError:
                        # Block doesn't look like a hearing entry; skip it.
                        continue

                    # Normalize "3:00 p.m." -> "3:00 PM" for strptime.
                    time = time.replace(".", "").upper()
                    time = datetime.datetime.strptime(
                        time + "_" + date,
                        '%I:%M %p_%A, %B %d, %Y'
                    )
                    time = self._tz.localize(time)

                    location = location.strip()

                    # Drop line numbers and blank lines from the description.
                    description = '\n'.join([
                        x.strip() for x in
                        description.split('\n') if
                        x.strip() and not x.strip()[0].isdigit()
                    ])
                    if not description:
                        description = '[No description provided by state]'

                    event = Event(
                        name=description,
                        start_date=time,
                        location_name=location,
                        description=description
                    )
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')

                    # Lines like "H. B. 123  Some relation" link bills
                    # to the hearing.
                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)

                    yield event
def scrape(self):
    """Yield Event objects for Chicago City Council meetings.

    The meeting-location string doubles as a status field: text after
    "Chicago, Illinois" encodes cancellations, reschedules, room
    changes, etc., and is classified by phrase-matching below.
    """
    for event, agenda in self.events():
        description = None

        # Location strings look like "<room>--<address>--<status text>".
        location_string = event[u'Meeting Location']
        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue

        # Date comes from the listing; time-of-day from the iCalendar feed.
        when = self.toTime(event[u'Meeting Date'])
        event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour,
                            minute=event_time.minute)

        # Anything after "Chicago, Illinois" is status/annotation text.
        status_string = location_list[-1].split('Chicago, Illinois')

        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            # Phrases that imply this listing no longer stands as scheduled.
            if any(phrase in status_text for phrase in
                   ('rescheduled to',
                    'postponed to',
                    'reconvened to',
                    'rescheduled to',
                    'meeting recessed',
                    'recessed meeting',
                    'postponed to',
                    'recessed until',
                    'deferred',
                    'time change',
                    'date change',
                    'recessed meeting - reconvene',
                    'cancelled',
                    'new date and time',
                    'rescheduled indefinitely',
                    'rescheduled for',)):
                status = 'cancelled'
            elif status_text in ('rescheduled', 'recessed'):
                status = 'cancelled'
            elif status_text in ('meeting reconvened',
                                 'reconvened meeting',
                                 'recessed meeting',
                                 'reconvene meeting',
                                 'rescheduled hearing',
                                 'rescheduled meeting',):
                status = confirmedOrPassed(when)
            elif status_text in ('amended notice of meeting',
                                 'room change',
                                 'amended notice',
                                 'change of location',
                                 'revised - meeting date and time'):
                status = confirmedOrPassed(when)
            elif 'room' in status_text:
                # NOTE(review): this branch never assigns `status`; on the
                # first loop iteration the Event construction below would
                # raise NameError — confirm intent.
                location = status_string[1] + ', ' + location
            elif status_text in ('wrong meeting date',):
                continue
            else:
                # NOTE(review): `print` of unrecognized status text looks
                # like leftover debugging — consider the scraper's logger.
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()
                status = confirmedOrPassed(when)
        else:
            status = confirmedOrPassed(when)

        if description:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      description=description,
                      timezone='US/Central',
                      location_name=location,
                      status=status)
        else:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      timezone='US/Central',
                      location_name=location,
                      status=status)

        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                            url=event['Video']['url'],
                            type="recording",
                            media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Notice')
        self.addDocs(e, event, 'Transcript')
        self.addDocs(e, event, 'Summary')

        # Normalize a couple of known participant names.
        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'
        e.add_participant(name=participant,
                          type="organization")

        if agenda:
            e.add_source(event['Meeting Details']['url'], note='web')

            for item, _, _ in agenda:
                agenda_item = e.add_agenda_item(item["Title"])

                if item["Record #"]:
                    agenda_item.add_bill(item["Record #"]['label'])
        else:
            e.add_source(self.EVENTSPAGE, note='web')

        yield e
def scrape(self, chamber=None, session=None):
    """Scrape Colorado committee hearings.

    Walks the committee index for each chamber, then each committee's
    scheduled hearings, yielding one Event per hearing page.
    """
    url = 'http://leg.colorado.gov/content/committees'

    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    chambers = [chamber] if chamber else ['upper', 'lower']

    for chamber in chambers:
        if chamber == 'lower':
            xpath = '//div/h3[text()="House Committees of Reference"]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'
        elif chamber == 'upper':
            xpath = '//div/h3[text()="Senate Committees of Reference"]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'
        elif chamber == 'other':
            # All the links under the headers that don't contain "House" or "Senate"
            xpath = '//div/h3[not(contains(text(),"House")) and ' \
                    'not(contains(text(),"Senate"))]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'

        page = self.lxmlize(url)
        com_links = page.xpath(xpath)

        for link in com_links:
            page = self.lxmlize(link)

            hearing_links = page.xpath(
                '//div[contains(@class,"schedule-item-content")]'
                '/h4/a/@href')

            for link in hearing_links:
                try:
                    page = self.lxmlize(link)

                    title = page.xpath(
                        '//header/h1[contains(@class,"node-title")]')[0]
                    title = title.text_content().strip()

                    date_day = page.xpath(
                        '//div[contains(@class,"calendar-date")]')[0]
                    date_day = date_day.text_content().strip()

                    # Details are "time | location".
                    details = page.xpath(
                        '//span[contains(@class, "calendar-details")]')[0]
                    details = details.text_content().split('|')

                    date_time = details[0].strip()
                    location = details[1].strip()

                    if 'Upon Adjournment' in date_time:
                        # No clock time available; use midnight of that day.
                        date = dt.datetime.strptime(
                            date_day, '%A %B %d, %Y')
                    else:
                        date_str = '{} {}'.format(date_day, date_time)
                        date = dt.datetime.strptime(
                            date_str, '%A %B %d, %Y %I:%M %p')

                    agendas = []
                    # they overload the bills table w/ other agenda items.
                    # colspan=2 marks a non-bill agenda row.
                    non_bills = page.xpath(
                        '//td[@data-label="Hearing Item" and @colspan="2"]'
                    )
                    for row in non_bills:
                        content = row.text_content().strip()
                        agendas.append(content)

                    agenda = "\n".join(agendas) if agendas else ''

                    event = Event(name=title,
                                  start_date=self._tz.localize(date),
                                  location_name=location)

                    if agenda:
                        event.add_agenda_item(agenda)
                    event.add_source(link)

                    bills = page.xpath(
                        '//td[@data-label="Hearing Item"]/a')
                    for bill in bills:
                        bill_id = bill.text_content().strip()

                        item = event.add_agenda_item("hearing item")
                        item.add_bill(bill_id)

                    yield event
                except Exception as exc:
                    # BUG FIX: previously this swallowed every error silently;
                    # log the skipped page so parse failures are visible.
                    self.warning('skipping hearing page %s: %s', link, exc)
def scrape(self, chamber=None, session=None):
    """Scrape Colorado committee hearings from the committee index pages."""
    url = 'http://leg.colorado.gov/content/committees'

    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    chambers = [chamber] if chamber else ['upper', 'lower']

    for chamber in chambers:
        if chamber == 'lower':
            xpath = '//div/h3[text()="House Committees of Reference"]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'
        elif chamber == 'upper':
            xpath = '//div/h3[text()="Senate Committees of Reference"]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'
        elif chamber == 'other':
            # All the links under the headers that don't contain "House" or "Senate"
            xpath = '//div/h3[not(contains(text(),"House")) and ' \
                    'not(contains(text(),"Senate"))]/../' \
                    'following-sibling::div[contains(@class,"view-content")]/' \
                    'table//td//span[contains(@class,"field-content")]/a/@href'

        page = self.lxmlize(url)
        com_links = page.xpath(xpath)

        for link in com_links:
            page = self.lxmlize(link)
            hearing_links = page.xpath('//div[contains(@class,"schedule-item-content")]'
                                       '/h4/a/@href')

            for link in hearing_links:
                try:
                    page = self.lxmlize(link)

                    title = page.xpath('//header/h1[contains(@class,"node-title")]')[0]
                    title = title.text_content().strip()

                    date_day = page.xpath('//div[contains(@class,"calendar-date")]')[0]
                    date_day = date_day.text_content().strip()

                    # "time | location"
                    details = page.xpath('//span[contains(@class, "calendar-details")]')[0]
                    details = details.text_content().split('|')

                    date_time = details[0].strip()
                    location = details[1].strip()

                    if 'Upon Adjournment' in date_time:
                        # No clock time published; fall back to midnight.
                        date = dt.datetime.strptime(date_day, '%A %B %d, %Y')
                    else:
                        date_str = '{} {}'.format(date_day, date_time)
                        date = dt.datetime.strptime(date_str, '%A %B %d, %Y %I:%M %p')

                    agendas = []
                    # they overload the bills table w/ other agenda items.
                    # colspan=2 marks a non-bill agenda row.
                    non_bills = page.xpath('//td[@data-label="Hearing Item" and @colspan="2"]')
                    for row in non_bills:
                        content = row.text_content().strip()
                        agendas.append(content)

                    agenda = "\n".join(agendas) if agendas else ''

                    event = Event(name=title,
                                  start_date=self._tz.localize(date),
                                  location_name=location
                                  )

                    if agenda:
                        event.add_agenda_item(agenda)
                    event.add_source(link)

                    bills = page.xpath('//td[@data-label="Hearing Item"]/a')
                    for bill in bills:
                        bill_id = bill.text_content().strip()
                        item = event.add_agenda_item("hearing item")
                        item.add_bill(bill_id)

                    yield event
                except Exception as exc:
                    # BUG FIX: was a bare `except: pass`, which also traps
                    # KeyboardInterrupt/SystemExit and hides parse failures.
                    self.warning('skipping hearing page %s: %s', link, exc)
def scrape_event_page(self, url, chamber):
    """Scrape one Michigan committee-meeting page into an Event.

    The page is a key/value table (Date, Time, Location, Committee, Chair,
    Agenda); the meeting time string is heavily normalized before parsing.
    """
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
    metainf = {}
    for tr in trs:
        tds = tr.xpath(".//td")
        if len(tds) <= 1:
            continue
        key = tds[0].text_content().strip()
        val = tds[1]
        metainf[key] = {"txt": val.text_content().strip(), "obj": val}

    if metainf == {}:
        return

    # Wednesday, 5/16/2012 3:00 pm
    datetime = "%s %s" % (
        metainf["Date"]["txt"],
        metainf["Time"]["txt"].replace(".", ""),
    )
    if "Cancelled" in datetime:
        return

    translate = {
        "noon": " PM",
        "a.m.": " AM",
        "am": " AM",  # This is due to a nasty line they had.
        "a.m": "AM",  # another weird one
    }

    for t in translate:
        if t in datetime:
            datetime = datetime.replace(t, translate[t])

    datetime = re.sub(r"\s+", " ", datetime)

    for text_to_remove in [
        "or after committees are given leave",
        "or later immediately after committees are given leave",
        "or later after committees are given leave by the House to meet",
        "**Please note time**",
    ]:
        datetime = datetime.split(text_to_remove)[0].strip()

    datetime = datetime.replace("p.m.", "pm")
    datetime = datetime.replace("Noon", "pm")
    try:
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
    except ValueError:
        # Some meetings list an hour with no minutes.
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I %p")

    where = metainf["Location"]["txt"]
    title = metainf["Committee"]["txt"]  # XXX: Find a better title

    if chamber == "other":
        chamber = "joint"

    event = Event(name=title, start_date=self._tz.localize(datetime), location_name=where)
    event.add_source(url)
    event.add_source(mi_events)

    chair_name = metainf["Chair"]["txt"].strip()
    if chair_name:
        event.add_participant(chair_name, type="legislator", note="chair")
    else:
        self.warning("No chair found for event '{}'".format(title))

    event.add_participant(metainf["Committee"]["txt"], type="committee", note="host")

    agenda = metainf["Agenda"]["obj"]
    agendas = agenda.text_content().split("\r")

    # BUG FIX: use a relative xpath (".//a"); a bare "//a" searches the
    # whole document, not just the agenda cell.
    related_bills = agenda.xpath(".//a[contains(@href, 'getObject')]")
    for bill in related_bills:
        # BUG FIX: the fallback description was the raw lxml element
        # (`description = agenda`); default to the bill link's text and
        # upgrade to the matching agenda line when one is found.
        description = bill.text_content()
        for a in agendas:
            if bill.text_content() in a:
                description = a
        item = event.add_agenda_item(description)
        item.add_bill(bill.text_content())

    yield event
def scrape(self, window=None) :
    """Scrape LA Metro events from the Legistar API, merged with the
    matching web-calendar rows.

    window -- optional look-back horizon in days; when omitted, events
    are requested with no date floor (``n_days_ago`` stays None).
    Yields pupa ``Event`` objects.
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    else:
        n_days_ago = None

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        # "Board of Directors - Special Meeting" style names carry the
        # event name after the dash; plain names double as the event name.
        if 'Board of Directors -' in body_name:
            body_name, event_name = [part.strip() for part in body_name.split('-')]
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final", "Final Revised",
        # and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]

        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}

        legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']

            # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
            # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
            # cleans the data.
            item_agenda_sequences = [item['extras']['item_agenda_sequence'] for item in e.agenda]
            if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
{event_name} on {event_date} ({legistar_api_url}). \
Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'

                raise ValueError(error_msg.format(event_name=e.name,
                                                  event_date=e.start_date.strftime("%B %d, %Y"),
                                                  legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name,
                          type="organization")

        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note= 'Agenda',
                           url = event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note= 'Minutes',
                           url = event['EventMinutesFile'],
                           media_type="application/pdf")

        # Audio links are behind a redirect; resolve it now.
        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue

            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

        yield e
def scrape(self, session=None, start=None, end=None):
    """Scrape Maine committee hearings (public hearings / work sessions).

    start/end -- optional 'YYYY-MM-DD' bounds; defaults to now .. now+30d.
    Bills and submitted testimony documents are attached to each event.
    """
    if session is None:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    # testimony url, we'll need it later in a loop
    # testimony query looks gnarly but breaks down to:
    # $filter: (Request/PaperNumber eq 'SP0219') and (Request/Legislature eq 129)
    # $orderby: LastName,FirstName,Organization
    # $expand: Request
    # $select: Id,FileType,NamePrefix,FirstName,LastName,Organization,
    #          PresentedDate,FileSize,Topic
    testimony_url_base = 'http://legislature.maine.gov/backend/' \
        'breeze/data/CommitteeTestimony?' \
        '$filter=(Request%2FPaperNumber%20eq%20%27{}%27)%20and' \
        '%20(Request%2FLegislature%20eq%20{})' \
        '&$orderby=LastName%2CFirstName%2COrganization&' \
        '$expand=Request&$select=Id%2CFileType%2CNamePrefix' \
        '%2CFirstName%2CLastName%2COrganization%2CPresentedDate%2CFileSize%2CTopic'

    if start is None:
        start_date = datetime.datetime.now().isoformat()
    else:
        start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
        start_date = start_date.isoformat()

    # default to 30 days if no end
    if end is None:
        dtdelta = datetime.timedelta(days=30)
        end_date = datetime.datetime.now() + dtdelta
        end_date = end_date.isoformat()
    else:
        end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
        end_date = end_date.isoformat()

    # Index the bills on each event's agenda by event id.
    bills_by_event = {}
    bills_url = 'http://legislature.maine.gov/backend/breeze/data/' \
        'getCalendarEventsBills?startDate={}&endDate={}'
    bills_url = bills_url.format(start_date, end_date)
    page = json.loads(self.get(bills_url).content)

    for row in page:
        bills_by_event.setdefault(row['EventId'], [])
        bills_by_event[row['EventId']].append(row)

    # http://legislature.maine.gov/backend/breeze/data/getCalendarEventsRaw?startDate=2019-03-01T05%3A00%3A00.000Z&endDate=2019-04-01T03%3A59%3A59.999Z&OnlyPHWS=false
    url = 'http://legislature.maine.gov/backend/breeze/data/' \
        'getCalendarEventsRaw?startDate={}&endDate={}&OnlyPHWS=true'
    url = url.format(start_date, end_date)

    page = json.loads(self.get(url).content)

    for row in page:
        if row['Cancelled'] is True or row['Postponed'] is True:
            continue

        start_date = self._TZ.localize(
            dateutil.parser.parse(row['FromDateTime']))
        end_date = self._TZ.localize(
            dateutil.parser.parse(row['ToDateTime']))

        name = row['CommitteeName']
        if name is None:
            name = row['Host']

        address = row['Location']
        address = address.replace(
            'Cross Building',
            'Cross Office Building, 111 Sewall St, Augusta, ME 04330')
        address = address.replace(
            'State House',
            'Maine State House, 210 State St, Augusta, ME 04330')

        event = Event(
            start_date=start_date,
            end_date=end_date,
            name=name,
            location_name=address,
        )

        event.add_source(
            'http://legislature.maine.gov/committee/#Committees/{}'.format(
                row['CommitteeCode']))

        if bills_by_event.get(row['Id']):
            for bill in bills_by_event[row['Id']]:
                description = 'LD {}: {}'.format(bill['LD'], bill['Title'])
                agenda = event.add_agenda_item(description=description)
                agenda.add_bill('LD {}'.format(bill['LD']))

                if bill['TestimonyCount'] > 0:
                    test_url = testimony_url_base.format(
                        bill['PaperNumber'], session)
                    test_page = json.loads(self.get(test_url).content)
                    for test in test_page:
                        title = '{} {} - {}'.format(
                            test['FirstName'],
                            test['LastName'],
                            test['Organization'])
                        if test['NamePrefix'] is not None:
                            title = '{} {}'.format(test['NamePrefix'], title)

                        # Renamed from `test_url` to avoid shadowing the
                        # testimony query URL above.
                        doc_url = 'http://legislature.maine.gov/backend/app/services' \
                            '/getDocument.aspx?doctype=test&documentId={}'.format(test['Id'])

                        # BUG FIX: media_type was only assigned for PDFs,
                        # so the first non-pdf testimony raised NameError
                        # (or silently reused a stale value).
                        if test['FileType'] == 'pdf':
                            media_type = "application/pdf"
                        else:
                            media_type = "application/octet-stream"

                        event.add_document(note=title, url=doc_url,
                                           media_type=media_type)

        yield event
def scrape_upper(self): PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf' (path, _response) = self.urlretrieve(PDF_URL) text = convert_pdf(path, type='text').decode() os.remove(path) days = re.split(r'(\w+day, \w+ \d{1,2})', text) date = None for day in enumerate(days[1:]): if day[0] % 2 == 0: # Calendar is put out for the current week, so use that year date = day[1] + ", " + str(datetime.datetime.now().year) else: events = re.split(r'\n\n((?:\w+\s?)+),\s', day[1]) comm = '' for event in enumerate(events[1:]): if event[0] % 2 == 0: comm = event[1].strip() else: try: (time, location, description) = re.search( r'''(?mxs) (\d{1,2}:\d{2}\s[AP]M) # Meeting time .*?,\s # Potential extra text for meeting time (.*?)\n # Location, usually a room .*?\n # Chairman of committee holding event (.*) # Description of event ''', event[1]).groups() except AttributeError: continue time = datetime.datetime.strptime( time + "_" + date, '%I:%M %p_%A, %B %d, %Y' ) time = self._tz.localize(time) location = location.strip() description = '\n'.join([ x.strip() for x in description.split('\n') if x.strip() and not x.strip().startswith("Page ") and not x.strip().startswith("*Possible Vote") and not x.strip() == "NO OTHER COMMITTEES WILL MEET" ]) if not description: description = '[No description provided by state]' event = Event( name=description, start_date=time, location_name=location, description=description ) event.add_source(PDF_URL) event.add_participant(comm, type='committee', note='host') for line in description.split('\n'): related_bill = re.search(r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line) if related_bill: (related_bill, relation) = related_bill.groups() relation = relation.strip() related_bill = related_bill.replace(".", "") item = event.add_agenda_item(relation) item.add_bill(related_bill) yield event
def scrape_event_page(self, url, chamber):
    """Scrape one Michigan committee-meeting detail page into an Event."""
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
    metainf = {}
    for tr in trs:
        tds = tr.xpath(".//td")
        if len(tds) <= 1:
            continue
        key = tds[0].text_content().strip()
        val = tds[1]
        metainf[key] = {
            "txt": val.text_content().strip(),
            "obj": val
        }

    if metainf == {}:
        return

    # Wednesday, 5/16/2012 3:00 pm
    datetime = "%s %s" % (
        metainf['Date']['txt'],
        metainf['Time']['txt'].replace(".", "")
    )
    if "Cancelled" in datetime:
        return

    translate = {
        "noon": " PM",
        "a.m.": " AM",
        "am": " AM",  # This is due to a nasty line they had.
        "a.m": "AM"  # another weird one
    }

    for t in translate:
        if t in datetime:
            datetime = datetime.replace(t, translate[t])

    datetime = re.sub(r"\s+", " ", datetime)

    for text_to_remove in [
            "or after committees are given leave",
            "or later immediately after committees are given leave",
            "or later after committees are given leave by the House to meet",
            "**Please note time**"]:
        datetime = datetime.split(text_to_remove)[0].strip()

    datetime = datetime.replace('p.m.', 'pm')
    datetime = datetime.replace('Noon', "pm")
    try:
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
    except ValueError:
        # Some meetings list an hour with no minutes.
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I %p")

    where = metainf['Location']['txt']
    title = metainf['Committee']['txt']  # XXX: Find a better title

    if chamber == 'other':
        chamber = 'joint'

    event = Event(
        name=title,
        start_date=self._tz.localize(datetime),
        location_name=where,
    )
    event.add_source(url)
    event.add_source(mi_events)

    chair_name = metainf['Chair']['txt'].strip()
    if chair_name:
        event.add_participant(chair_name, type='legislator', note='chair')
    else:
        self.warning("No chair found for event '{}'".format(title))

    event.add_participant(metainf['Committee']['txt'], type='committee', note='host')

    agenda = metainf['Agenda']['obj']
    agendas = agenda.text_content().split("\r")

    # BUG FIX: relative xpath (".//a") — a bare "//a" searches the whole
    # document rather than just the agenda cell.
    related_bills = agenda.xpath(".//a[contains(@href, 'getObject')]")
    for bill in related_bills:
        # BUG FIX: the fallback was `description = agenda`, the raw lxml
        # element; default to the bill link's text instead, upgrading to
        # the matching agenda line when one exists.
        description = bill.text_content()
        for a in agendas:
            if bill.text_content() in a:
                description = a
        item = event.add_agenda_item(description)
        item.add_bill(bill.text_content())

    yield event
def scrape_events_range(self, start_date, end_date): def daterange(start_date, end_date): number_of_days = int((end_date - start_date).days) for n in range(number_of_days): yield start_date + datetime.timedelta(n) for date in daterange(start_date, end_date): calendar_day_url = CALENDAR_DAY_TEMPLATE.format(date.year, date.month - 1, date.day) events = self.extract_events_by_url(calendar_day_url) for event in events: tz = pytz.timezone("America/Toronto") time = datetime.datetime.strptime(event['time'], '%I:%M %p') start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0)) org_name = event['meeting'] e = Event( name=org_name, start_time=start, timezone=tz.zone, location_name=event['location'], status=STATUS_DICT.get(event['meeting_status']) ) e.extras = { 'meeting_number': event['no'], 'tmmis_meeting_id': event['meeting_id'], } e.add_source(calendar_day_url) e.add_participant( name=org_name, type='organization', ) def is_agenda_available(event): return event['publishing_status'] in ['Agenda Published', 'Minutes Published'] def is_council(event): return True if event['meeting'] == self.jurisdiction.name else False if is_agenda_available(event): agenda_url_template = AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event) else AGENDA_FULL_STANDARD_TEMPLATE agenda_url = agenda_url_template.format(event['meeting_id']) full_identifiers = list(self.full_identifiers(event['meeting_id'], is_council(event))) e.add_source(agenda_url) agenda_items = self.agenda_from_url(agenda_url) for i, item in enumerate(agenda_items): a = e.add_agenda_item(item['title']) a.add_classification(item['type'].lower()) a['order'] = str(i) def normalize_wards(raw): if not raw: raw = 'All' if raw == 'All': return raw.lower() else: return raw.split(', ') identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$') [full_identifier] = [id for id in full_identifiers if identifier_regex.match(id).group(1) == item['identifier']] a.add_bill(full_identifier) yield e
def scrape(self): calendar_url = "http://dccouncil.us/calendar" data = self.get(calendar_url).text doc = lxml.html.fromstring(data) committee_regex = re.compile("(Committee .*?)will") event_list = doc.xpath("//div[@class='event-description-dev']") for event in event_list: place_and_time = event.xpath( ".//div[@class='event-description-dev-metabox']/p/text()") when = " ".join( [place_and_time[0].strip(), place_and_time[1].strip()]) if len(place_and_time) > 2: location = place_and_time[2] else: location = "unknown" # when is now of the following format: # Wednesday, 2/25/2015 9:30am when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p") description_content = event.xpath( ".//div[@class='event-description-content-dev']")[0] description_lines = description_content.xpath("./*") desc_without_title = " ".join(d.text_content() for d in description_lines[1:]) description = re.sub(r'\s+', " ", description_content.text_content()).strip() potential_bills = description_content.xpath(".//li") committee = committee_regex.search(desc_without_title) event_type = 'other' if committee is not None: committee = committee.group(1).strip() event_type = 'committee:meeting' e = Event( name=description, description=description, start_time=self._tz.localize(when), timezone=self._tz.zone, location_name=location, classification=event_type, ) for b in potential_bills: bill = b.xpath("./a/text()") if len(bill) == 0: continue bill = bill[0] bill_desc = b.text_content().replace(bill, "").strip(", ").strip() ses, num = bill.split("-") bill = ses.replace(" ", "") + "-" + num.zfill(4) item = e.add_agenda_item(bill_desc) item.add_bill(bill) e.add_source(calendar_url) if committee: e.add_participant(committee, type='orgnization', note='host') yield e
def scrape(self):
    """Scrape Minnesota House calendar entries into Events.

    NOTE(review): ``house_base``, ``format1``, ``tz``, and
    ``pull_middle_name`` are names defined elsewhere in this file.
    """
    for c in house_base:
        # Defaults so later lookups can't KeyError on sparse entries
        # (previously 'room'/'chair'/'cmt' were only set conditionally).
        m = {
            'notice': c.xpath('.//p/span[@class="cal_special"]/text()'),
            'room': 'n/a',
            'chair': None,
            'cmt': None,
        }
        bills = []

        links = c.xpath('.//h3/a/@href')
        if len(links) > 0:
            m['cmt'] = c.xpath('.//h3/a/text()')[0]
            m['link'] = c.xpath('.//h3/a/@href')[0]
            title = c.xpath('.//h3/text()')[0]
            if title == 'Agenda:':
                m['title'] = c.xpath('.//h3/a/text()')[0]
            else:
                m['title'] = c.xpath('.//h3/text()')[0]
        else:
            m['title'] = c.xpath('.//h3/text()')[0]
            m['link'] = None

        info_div = c.xpath('.//*[@class="calendar_p_indent"]')
        if len(info_div) > 0:
            info_div = info_div[0]
            info_list = info_div.xpath('.//text()')
            info_list = [x.replace('\n', '').strip() for x in info_list]
            info_list = [x for x in info_list if len(x) > 0]

            if info_list and info_list[0].startswith('Room:'):
                m['room'] = info_list[1]

            if len(info_list) > 2 and info_list[2].startswith('Chair:'):
                chair = info_list[3]
                if ',' in chair:
                    # Multiple chairs, comma separated.
                    chairs = chair.replace('\xa0', '').split(',')
                    nchairs = []
                    for chair in chairs:
                        if chair.startswith('Rep.') or chair.startswith('Sen.'):
                            cname = pull_middle_name(chair[4:])
                            nchairs.append(cname.strip())
                    m['chair'] = nchairs
                elif chair.startswith('Rep.') or chair.startswith('Sen.'):
                    cname = pull_middle_name(chair[4:].strip())
                    m['chair'] = [cname.strip()]

        bill_rows = c.xpath('.//*/table[@class="cal_bills"]/tbody/tr')
        for brs in bill_rows:
            cells = brs.xpath('.//td')
            if len(cells) == 3:
                b = {}
                b['bill'] = cells[0].xpath('.//text()')[0]
                b['author'] = cells[1].xpath('./text()')[0]
                b['summary'] = cells[2].xpath('./text()')[0]
                bills.append(b)

        if len(m['notice']) > 0:
            m['notice'] = m['notice'][0]
        else:
            m['notice'] = 'N/A'

        date = c.xpath('.//p/b/text()')
        if len(date) < 1:
            # Replaced the debug print with a log message; can't build
            # an event without a date.
            self.warning('skipping calendar entry with no date: %s', m['title'])
            continue
        m['date'] = datetime.datetime.strptime(date[0], format1)

        if 'House Meets in Session' in m['title']:
            m['room'] = 'State leg'
            m['cmt'] = 'Minnesota House of Representatives'
            m['chair'] = None
            m['link'] = 'https://www.leg.state.mn.us/cal?type=all'

        event = Event(name=m['title'],
                      start_date=tz.localize(m['date']),
                      location_name=m['room'])

        for bill in bills:
            nbill = event.add_agenda_item(description=bill['summary'])
            nbill.add_bill(bill['bill'].replace('HF', 'HF '))

        if m['cmt']:
            event.add_committee(m['cmt'])
        if m['link'] is not None:
            event.add_source(m['link'])
        if m['chair'] is not None:
            for chair in m['chair']:
                event.add_person(name=chair, note="Chair")

        yield event
def scrape(self, window=3): n_days_ago = datetime.datetime.utcnow() - datetime.timedelta( float(window)) for api_event, web_event in self.events(n_days_ago): when = api_event['start'] location = api_event[u'EventLocation'] extracts = self._parse_comment(api_event[u'EventComment']) description, room, status, invalid_event = extracts if invalid_event: continue if room: location = room + ', ' + location if not status: status = api_event['status'] if description: e = Event(name=api_event["EventBodyName"], start_date=when, description=description, location_name=location, status=status) else: e = Event(name=api_event["EventBodyName"], start_date=when, location_name=location, status=status) e.pupa_id = str(api_event['EventId']) if web_event['Meeting video'] != 'Not\xa0available': e.add_media_link(note='Recording', url=web_event['Meeting video']['url'], type="recording", media_type='text/html') self.addDocs(e, web_event, 'Published agenda') self.addDocs(e, web_event, 'Notice') self.addDocs(e, web_event, 'Published summary') if 'Captions' in web_event: self.addDocs(e, web_event, 'Captions') participant = api_event["EventBodyName"] if participant == 'City Council': participant = 'Mountain View City Council' e.add_participant(name=participant, type="organization") for item in self.agenda(api_event): agenda_item = e.add_agenda_item(item["EventItemTitle"]) if item["EventItemMatterFile"]: identifier = item["EventItemMatterFile"] agenda_item.add_bill(identifier) participants = set() for call in self.rollcalls(api_event): if call['RollCallValueName'] == 'Present': participants.add(call['RollCallPersonName']) for person in participants: e.add_participant(name=person, type="person") e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event), note='api') e.add_source(web_event['Meeting Name']['url'], note='web') yield e