def scrape_meetings(self, meetings, group):
        """
        Scrape and save event data from a list of meetings.

        Arguments:
        meetings -- A list of lxml elements containing event information
        group -- The type of meeting. The legislature site applies
                 different formatting to events based on which group
                 they correspond to.  `group` should be one of the
                 following strings: 'house', 'senate', or 'commission'.

        """
        for meeting in meetings:
            when = self.get_date(meeting)
            description = self.get_description(meeting)
            location = self.get_location(meeting)

            if when and description and location:
                event = Event(
                    name=description,
                    start_date=when.replace(tzinfo=self.tz),
                    description=description,
                    location_name=location,
                )
                agenda = self.get_agenda(meeting)
                if agenda:
                    event.add_agenda_item(agenda)
                # assumes the scraper stores the listing URL on the instance;
                # `url` was not defined in this method
                event.add_source(self.url)
                yield event
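    # A minimal driver sketch for scrape_meetings, assuming (hypothetically)
    # that the scraper exposes an lxmlize() helper and stores the listing URL
    # on self.url; the per-group XPath selectors below are placeholders, not
    # the real page structure.
    def scrape(self):
        page = self.lxmlize(self.url)
        groups = {
            "house": '//div[@id="house-meetings"]/div',
            "senate": '//div[@id="senate-meetings"]/div',
            "commission": '//div[@id="commission-meetings"]/div',
        }
        for group, xpath in groups.items():
            # scrape_meetings is a generator, so re-yield its events
            yield from self.scrape_meetings(page.xpath(xpath), group)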
Example #2
    def scrape_meeting_notice(self, chamber, item, url):
        # Event Name is not provided for all meetings, so use the committee name.
        event_name = str(item["CommitteeName"])
        # 04/25/2012 03:00:00 PM
        fmt = "%m/%d/%Y %I:%M:%S %p"  # matches the sample format above
        start_time = dt.datetime.strptime(str(item["MeetingDateTime"]), fmt)
        location_name = str(item["AddressAliasNickname"])
        event = Event(
            location_name=location_name,
            start_date=self._tz.localize(start_time),
            name=event_name,
            description="Committee Meeting Status: {}".format(
                item["CommitteeMeetingStatusName"]),
        )

        event.add_source(url)
        event.add_committee(name=str(item["CommitteeName"]),
                            id=item["CommitteeId"])

        page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                    "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                        item["CommitteeMeetingId"]))

        event.add_source(page_url)
        page_data = self.post(page_url).json()["Data"]
        for agenda_row in page_data:
            event.add_agenda_item(description=str(agenda_row["ItemDescription"]))
            event.add_person(
                name=str(agenda_row["PrimarySponsorShortName"]),
                id=str(agenda_row["PrimarySponsorPersonId"]),
                note="Sponsor",
            )

        yield event
Example #3
    def scrape_lower_event(self, url):
        html = self.get(url).text

        if "not meeting" in html.lower():
            self.info(f"Skipping {url}, not meeting")
            return

        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        com = (page.xpath('//div[contains(@class,"sectionhead")]/h1')
               [0].text_content().strip())

        com = f"House {com}"

        start = self.get_meeting_row(page, "Start Date")
        start = self.tz.localize(dateutil.parser.parse(start))

        end = self.get_meeting_row(page, "End Date") or None
        if end:
            end = self.tz.localize(dateutil.parser.parse(end))
        location = self.get_meeting_row(page, "Location")

        summary = ""
        if page.xpath('//div[contains(text(),"Meeting Overview")]'):
            summary = (page.xpath(
                '//div[div[contains(text(),"Meeting Overview")]]/div[contains(@class,"ml-3")]'
            )[0].text_content().strip())

        if end:
            event = Event(
                name=com,
                start_date=start,
                end_date=end,
                location_name=location,
                description=summary,
            )
        else:
            event = Event(name=com,
                          start_date=start,
                          location_name=location,
                          description=summary)
        event.add_source(url)

        for h5 in page.xpath(
                '//div[contains(@class,"meeting-actions-bills")]/h5'):
            event.add_agenda_item(h5.text_content().strip())
            for agenda_item in h5.xpath("following-sibling::ul/li"):
                agenda_text = agenda_item.text_content().strip()
                agenda_text = re.sub(r"\s+\u2013\s+", " - ", agenda_text)
                item = event.add_agenda_item(agenda_text)
                found_bills = re.findall(r"H.*\s+\d+", agenda_text)
                if found_bills:
                    item.add_bill(found_bills[0])

        yield event
Example #4
    def scrape_upper(self):
        listing_url = "https://www.senate.mo.gov/hearingsschedule/hrings.htm"

        html = self.get(listing_url).text

        # The HTML here isn't wrapped in a container per-event
        # which makes xpath a pain. So string split by <hr>
        # then parse each event's fragment for cleaner results
        for fragment in html.split("<hr />")[1:]:
            page = lxml.html.fromstring(fragment)

            when_date = self.row_content(page, "Date:")
            when_time = self.row_content(page, "Time:")
            location = self.row_content(page, "Room:")

            location = "{}, {}".format(
                location, "201 W Capitol Ave, Jefferson City, MO 65101")

            # com = self.row_content(page, 'Committee:')
            com = page.xpath(
                '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
            )[0]
            com = com.split(", Senator")[0].strip()

            start_date = self._TZ.localize(
                dateutil.parser.parse("{} {}".format(when_date, when_time)))

            event = Event(start_date=start_date,
                          name=com,
                          location_name=location)

            event.add_source(listing_url)

            event.add_participant(com, type="committee", note="host")

            for bill_table in page.xpath(
                    '//table[@width="85%" and @border="0"]'):
                bill_link = ""
                if bill_table.xpath(self.bill_link_xpath):
                    agenda_line = bill_table.xpath("string(tr[2])").strip()
                    agenda_item = event.add_agenda_item(
                        description=agenda_line)

                    bill_link = bill_table.xpath(
                        self.bill_link_xpath)[0].strip()
                    agenda_item.add_bill(bill_link)
                else:
                    agenda_line = bill_table.xpath("string(tr[1])").strip()
                    agenda_item = event.add_agenda_item(
                        description=agenda_line)

            yield event
Example #5
    def parse_div(self, row, chamber, com):
        cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
        # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
        title, location, start_date, end_date = self.parse_gcal(cal_link)

        event = Event(start_date=start_date,
                      end_date=end_date,
                      name=title,
                      location_name=location)

        event.add_source(
            "http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx")

        for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
            description = item.xpath("string(.)").strip()
            agenda = event.add_agenda_item(description=description)

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
            description = item.xpath("string(.)").strip()
            agenda = event.add_agenda_item(description=description)

            event.add_document(
                description,
                item.xpath("@href")[0],
                media_type="application/pdf",
                on_duplicate="ignore",
            )

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]'
                              '[./div[@class="col-xs-1 Item"]]'):
            description = item.xpath("string(.)").strip()
            agenda = event.add_agenda_item(description=description)

            bill = item.xpath(
                './/div[@class="col-xs-1 Item"]/a/text()')[0].strip()
            agenda.add_bill(bill)

        video = row.xpath('.//a[./span[@class="OnDemand"]]')
        if video:
            event.add_media_link("Video of Hearing",
                                 video[0].xpath("@href")[0], "text/html")

        if "subcommittee" in title.lower():
            subcom = title.split("-")[0].strip()
            event.add_participant(subcom, type="committee", note="host")
        else:
            event.add_participant(com, type="committee", note="host")
        yield event
Example #6
    def scrape_lower_event(self, url):
        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        table = page.xpath('//section[@id="leg-agenda-mod"]/div/table')[0]
        meta = table.xpath("tr[1]/td[1]/text()")

        # Careful: the committee name in the page's #committee_div
        # is inserted via JS, so use the one from the table
        # and strip the chair name.
        com_name = re.sub(r"\(.*\)", "", meta[0])
        com_name = f"Assembly {com_name}"

        when = dateutil.parser.parse(meta[1])
        when = self._tz.localize(when)
        location = meta[2]

        event = Event(
            name=com_name,
            start_date=when,
            location_name=location,
        )

        event.add_participant(com_name, type="committee", note="host")

        event.add_source(url)

        if table.xpath('.//a[contains(@href, "/leg/")]'):
            agenda = event.add_agenda_item("Bills under Consideration")
            for bill_link in table.xpath('.//a[contains(@href, "/leg/")]'):
                agenda.add_bill(bill_link.text_content().strip())

        yield event
Example #7
    def upper_parse_agenda_item(self, item):
        response = self.api_client.get(
            "meeting",
            year=item["agendaId"]["year"],
            agenda_id=item["agendaId"]["number"],
            committee=item["committeeId"]["name"],
        )

        data = response["result"]

        chamber = data["committee"]["committeeId"]["chamber"].title()
        com_code = data["committee"]["committeeId"]["name"]
        com_name = f"{chamber} {com_code}"

        # each "meeting" is actually a listing page of multiple meetings of the same committee
        # broken out by different addendumId
        for addendum in data["committee"]["addenda"]["items"]:
            if addendum["addendumId"] != item["addendum"]:
                continue

            meeting = addendum["meeting"]

            when = dateutil.parser.parse(meeting["meetingDateTime"])
            when = self._tz.localize(when)

            location = meeting["location"]
            description = meeting["notes"]

            if location == "":
                location = "See Committee Site"

            if "canceled" in description.lower():
                continue

            event = Event(
                name=com_name,
                start_date=when,
                location_name=location,
                description=description,
            )

            event.add_participant(com_name, type="committee", note="host")

            com_code = (com_code.lower()
                        .replace("'", "")
                        .replace(" ", "-")
                        .replace(",", ""))
            url = f"https://www.nysenate.gov/committees/{com_code}"
            event.add_source(url)

            bills = addendum["bills"]["items"]

            if len(bills) > 0:
                agenda = event.add_agenda_item("Bills under consideration")

            for bill in bills:
                agenda.add_bill(bill["billId"]["printNo"])

            yield event
Example #8
    def scrape_events(self, session, start_date):
        session_key = SESSION_KEYS[session]

        if start_date is None:
            start_date = datetime.date.today()
        else:
            start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")

        committees_by_code = {}

        committees_response = self.api_client.get("committees",
                                                  session=session_key)
        for committee in committees_response:
            committees_by_code[
                committee["CommitteeCode"]] = committee["CommitteeName"]

        meetings_response = self.api_client.get(
            "committee_meetings",
            start_date=start_date.strftime(self._DATE_FORMAT),
            session=session_key,
        )

        if len(meetings_response) == 0:
            raise EmptyScrape

        for meeting in meetings_response:
            event_date = self._TZ.localize(
                datetime.datetime.strptime(meeting["MeetingDate"],
                                           self._DATE_FORMAT))
            com_name = committees_by_code[meeting["CommitteeCode"]]

            event = Event(start_date=event_date,
                          name=com_name,
                          location_name=meeting["Location"])

            event.add_source(meeting["AgendaUrl"])

            event.extras["meeting_guid"] = meeting["MeetingGuid"]
            event.extras["committee_code"] = committee["CommitteeCode"]

            event.add_participant(com_name, type="committee", note="host")

            agenda = None
            for row in meeting["CommitteeAgendaItems"]:
                if row["Comments"] is not None:
                    agenda = event.add_agenda_item(row["Comments"])

                # skip measures with no agenda item to attach to
                if row["MeasureNumber"] is not None and agenda is not None:
                    bill_id = "{} {}".format(row["MeasurePrefix"],
                                             row["MeasureNumber"])
                    agenda.add_bill(bill_id)

            for row in meeting["CommitteeMeetingDocuments"]:
                event.add_document(
                    note=row["ExhibitTitle"],
                    url=row["DocumentUrl"],
                    on_duplicate="ignore",
                )
            yield event
Example #9
    def scrape_events(self, page):

        page = lxml.html.fromstring(page)

        if page.xpath(
                "//h3[contains(text(),'There are no hearings for the date range')]"
        ):
            raise EmptyScrape

        for meeting in page.xpath('//div[@class="card mb-4"]'):
            com = meeting.xpath(
                'div[contains(@class, "card-header")]/text()')[0].strip()
            details = meeting.xpath(
                'div[contains(@class, "card-header")]/small/text()'
            )[0].strip()

            (location, time) = details.split(" - ")

            # turn room numbers into the full address
            if location.lower().startswith("room"):
                location = "1445 K St, Lincoln, NE 68508, {}".format(location)

            day = meeting.xpath(
                "./preceding-sibling::h2[@class='text-center']/text()"
            )[-1].strip()

            # Thursday February 27, 2020 1:30 PM
            date = "{} {}".format(day, time)
            event_date = self._tz.localize(
                datetime.datetime.strptime(date, "%A %B %d, %Y %I:%M %p"))

            event = Event(
                name=com,
                start_date=event_date,
                classification="committee-meeting",
                description="Committee Meeting",
                location_name=location,
            )

            event.add_committee(com, note="host")

            for row in meeting.xpath("div/table/tr"):
                if not row.xpath("td[3]"):
                    continue
                agenda_desc = row.xpath("td[3]/text()")[0].strip()
                agenda_item = event.add_agenda_item(description=agenda_desc)

                if row.xpath("td[1]/a"):
                    # bill link
                    agenda_item.add_bill(
                        row.xpath("td[1]/a/text()")[0].strip())

            event.add_source(
                "https://nebraskalegislature.gov/calendar/calendar.php")

            yield event
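Example #10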
    def scrape_event_page(self, session, chamber, url, datetime):
        page = self.lxmlize(url)
        info = page.xpath("//p")
        metainfo = {}
        plaintext = ""
        for p in info:
            content = re.sub(r"\s+", " ", p.text_content())
            plaintext += content + "\n"
            if ":" in content:
                key, val = content.split(":", 1)
                metainfo[key.strip()] = val.strip()
        committee = metainfo["COMMITTEE"]
        where = metainfo["PLACE"]
        if "CHAIR" in where:
            where, chair = where.split("CHAIR:")
            metainfo["PLACE"] = where.strip()
            metainfo["CHAIR"] = chair.strip()

        chair = None
        if "CHAIR" in metainfo:
            chair = metainfo["CHAIR"]

        plaintext = re.sub(r"\s+", " ", plaintext).strip()
        regexp = r"(S|J|H)(B|M|R) (\d+)"
        bills = re.findall(regexp, plaintext)

        event = Event(
            name=committee, start_date=self._tz.localize(datetime), location_name=where
        )

        event.add_source(url)
        event.add_participant(committee, type="committee", note="host")
        if chair is not None:
            event.add_participant(chair, type="legislator", note="chair")

        for bill in bills:
            bill_chamber, bill_type, number = bill
            bill_id = "%s%s %s" % (bill_chamber, bill_type, number)
            item = event.add_agenda_item("Bill up for discussion")
            item.add_bill(bill_id)

        event.add_agenda_item(plaintext)

        yield event
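# A quick sanity check of the bill regex used above; the sample text is
# invented for illustration.
import re

sample = "The committee will consider HB 123, SB 45 and HR 6."
for bill in re.findall(r"(S|J|H)(B|M|R) (\d+)", sample):
    print("%s%s %s" % bill)  # HB 123, SB 45, HR 6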
Example #11
    def scrape(self):
        # `calurl` is assumed to be a module-level constant holding the calendar URL
        page = self.lxmlize(calurl)
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(".//a[contains(@title,'Committee Details')]")
            if len(comit_url) != 1:
                continue

            comit_url = comit_url[0]
            who = self.scrape_participants(comit_url.attrib["href"])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib["href"]
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            if cttie:
                cttie = cttie.replace("Committee on", "").strip()
                cttie = f"{chamber} {cttie}"
                name = cttie

            event = Event(
                name=name, location_name=where, start_date=self._tz.localize(when)
            )

            event.add_source(calurl)

            event.add_committee(cttie, note="host")

            event.add_document("notice", notice, media_type="application/pdf")

            for entry in what:
                item = event.add_agenda_item(entry)
                if entry.startswith("AB") or entry.startswith("SB"):
                    item.add_bill(entry)

            for thing in who:
                event.add_person(thing["name"])

            yield event
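Example #12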
    def scrape_chamber(self, chamber):
        url = utils.urls["events"][chamber]
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for table in page.xpath(
                '//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
            date_string = table.xpath(
                'ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name')[0]
            for row in table.xpath("tr"):
                time_string = row.xpath(
                    'td[@class="CMS-MeetingDetail-Time"]/text()')[0].strip()
                description = (
                    row.xpath('td[@class="CMS-MeetingDetail-Agenda"]/div/div')
                    [-1].text_content().strip())
                location = (row.xpath('td[@class="CMS-MeetingDetail-Location"]'
                                      )[0].text_content().strip())
                committees = row.xpath(
                    './/div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a'
                )
                bills = row.xpath('.//a[contains(@href, "billinfo")]')

                try:
                    start_date = datetime.datetime.strptime(
                        "{} {}".format(date_string, time_string),
                        "%m/%d/%Y %I:%M %p")
                except ValueError:
                    break

                event = Event(
                    name=description,
                    start_date=self._tz.localize(start_date),
                    location_name=location,
                )
                event.add_source(url)

                if bills or committees:
                    item = event.add_agenda_item(description)
                    for bill in bills:
                        parsed = urllib.parse.urlparse(bill.get("href"))
                        qs = urllib.parse.parse_qs(parsed.query)
                        # parse_qs returns lists, so take the first value of each
                        item.add_bill("{}{} {}".format(qs["body"][0],
                                                       qs["type"][0],
                                                       qs["bn"][0]))
                    for committee in committees:
                        parsed = urllib.parse.urlparse(committee.get("href"))
                        qs = urllib.parse.parse_qs(parsed.query)
                        item.add_committee(
                            re.sub(r" \([SH]\)$", "", committee.text),
                            id=qs["Code"][0] if "Code" in qs else None,
                        )

                yield event
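Example #13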
    def parse_event(self, row, chamber):
        # sample event available at http://www.akleg.gov/apptester.html
        committee_code = row.xpath("string(Sponsor)").strip()

        if committee_code in self.COMMITTEES[chamber]:
            committee_name = "{} {}".format(
                self.COMMITTEES_PRETTY[chamber],
                self.COMMITTEES[chamber][committee_code]["name"],
            )
        else:
            committee_name = "{} {}".format(
                self.COMMITTEES_PRETTY[chamber],
                "MISCELLANEOUS",
            )

        name = "{} {}".format(self.COMMITTEES_PRETTY[chamber],
                              row.xpath("string(Title)").strip())

        # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
        if name == "":
            name = committee_name

        location = row.xpath("string(Location)").strip()

        # events with no location all seem to be committee hearings
        if location == "":
            location = "Alaska State Capitol, 120 4th St, Juneau, AK 99801"

        start_date = dateutil.parser.parse(row.xpath("string(Schedule)"))
        # localize for consistency with the scraper's other datetimes
        start_date = self._TZ.localize(start_date)

        event = Event(start_date=start_date, name=name, location_name=location)

        event.add_source("http://w3.akleg.gov/index.php#tab4")

        if committee_code in self.COMMITTEES[chamber]:
            event.add_participant(committee_name,
                                  type="committee",
                                  note="host")

        for item in row.xpath("Agenda/Item"):
            agenda_desc = item.xpath("string(Text)").strip()
            if agenda_desc != "":
                agenda_item = event.add_agenda_item(description=agenda_desc)
                if item.xpath("BillRoot"):
                    bill_id = item.xpath("string(BillRoot)")
                    # AK Bill ids have a bunch of extra spaces
                    bill_id = re.sub(r"\s+", " ", bill_id)
                    agenda_item.add_bill(bill_id)

        yield event
Example #14
    def scrape_lower_item(self, page):
        # print(lxml.etree.tostring(page, pretty_print=True))
        com = self.table_row_content(page, "Committee:")
        when_date = self.table_row_content(page, "Date:")
        when_time = self.table_row_content(page, "Time:")
        location = self.table_row_content(page, "Location:")

        if "house hearing room" in location.lower():
            location = "{}, {}".format(
                location, "201 W Capitol Ave, Jefferson City, MO 65101")

        # fix some broken times, e.g. '12 :00'
        when_time = when_time.replace(" :", ":")
        # a.m. and p.m. seem to confuse dateutil.parser
        when_time = when_time.replace("A.M.", "AM").replace("P.M.", "PM")

        # some times have extra info after the AM/PM
        if "upon" in when_time:
            when_time = when_time.split("AM", 1)[0]
            when_time = when_time.split("PM", 1)[0]

        # fix '- Upcoming', '- In Progress'  in dates
        when_date = re.sub(r"- (.*)", "", when_date).strip()

        try:
            start_date = dateutil.parser.parse(f"{when_date} {when_time}")
        except dateutil.parser._parser.ParserError:
            start_date = dateutil.parser.parse(when_date)

        start_date = self._TZ.localize(start_date)

        event = Event(start_date=start_date, name=com, location_name=location)

        event.add_source("https://house.mo.gov/HearingsTimeOrder.aspx")

        event.add_participant(com, type="committee", note="host")

        # different from general MO link xpath due to the <b>
        house_link_xpath = ('.//a[contains(@href, "Bill.aspx") '
                            'or contains(@href, "bill.aspx")]/b/text()')

        for bill_title in page.xpath(house_link_xpath):
            bill_no = bill_title.split("--")[0].strip()
            bill_no = bill_no.replace("HCS", "").strip()

            agenda_item = event.add_agenda_item(description=bill_title)
            agenda_item.add_bill(bill_no)

        yield event
Example #15
    def scrape_chamber(self, chamber):
        grouped_hearings = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = (self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description)

            date = self._tz.localize(hearing.hearing_date)

            chamber_abbr = location[0:3]
            event_chamber = {"Asm": "lower", "Sen": "upper"}[chamber_abbr]

            if event_chamber != chamber:
                continue

            grouped_hearings[(location, date)].append(hearing)

        for ((location, date), hearings) in grouped_hearings.items():

            # Get list of bill_ids from the database.
            bill_ids = [hearing.bill_id for hearing in hearings]
            bills = [
                "%s %s" % re.match(r"\d+([^\d]+)(\d+)", bill).groups()
                for bill in bill_ids
            ]

            # Dereference the committee_nr number and get display name.
            msg = "More than one committee meeting at (location, date) %r"
            msg = msg % ((location, date), )
            assert len(set(hearing.committee_nr
                           for hearing in hearings)) == 1, msg
            committee_name = _committee_nr[hearings.pop().committee_nr]

            desc = "Committee Meeting: " + committee_name
            event = Event(name=desc,
                          start_date=date,
                          location_name=committee_name)
            for bill_id in bills:
                if "B" in bill_id:
                    type_ = "bill"
                else:
                    type_ = "resolution"
                item = event.add_agenda_item("consideration")
                item.add_bill(bill_id, note=type_)

            event.add_person(committee_name + " Committee", note="host")
            event.add_source("https://downloads.leginfo.legislature.ca.gov/")

            yield event
Example #16
    def scrape(self, session=None):
        if session is None:
            session = self.latest_session()
            self.info("no session specified, using %s", session)

        year_abr = ((int(session) - 209) * 2) + 2000
        self._init_mdb(year_abr)
        self.initialize_committees(year_abr)
        # Keep record of all events
        records = self.access_to_csv("Agendas")
        for record in records:
            if record["Status"] != "Scheduled":
                continue
            description = record["Comments"]
            related_bills = []

            for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
                related_bills.append({
                    "bill_id": "%s %s" % (bill[0], bill[2]),
                    "descr": description
                })

            date_time = "%s %s" % (record["Date"], record["Time"])
            date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

            try:
                hr_name = self._committees[record["CommHouse"]]
            except KeyError:
                self.warning("unknown committee code %s, skipping",
                             record["CommHouse"])
                continue

            description = "Meeting of the {}".format(hr_name)

            event = Event(
                name=description,
                start_date=self._tz.localize(date_time),
                location_name=record["Location"] or "Statehouse",
            )
            item = None
            for bill in related_bills:
                item = item or event.add_agenda_item(description)
                item.add_bill(bill["bill_id"])
            # Add committee to event
            event.add_committee(hr_name, id=record["CommHouse"], note="host")
            event.add_source("http://www.njleg.state.nj.us/downloads.asp")

            yield event
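Example #17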
    def scrape_page(self, url, session, chamber):
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

        tables = doc.xpath("//table[@cellpadding='3']")
        info = tables[0]
        rows = info.xpath(".//tr")
        metainf = {}
        for row in rows:
            tds = row.xpath(".//td")
            key = tds[0].text_content().strip()
            value = tds[1].text_content().strip()
            metainf[key] = value

        where = metainf["Location:"]
        subject_matter = metainf["Subject Matter:"]
        description = "{}, {}".format(ctty_name, subject_matter)

        datetime = metainf["Scheduled Date:"]
        datetime = re.sub(r"\s+", " ", datetime)
        repl = {"AM": " AM", "PM": " PM"}  # Space shim.
        for r in repl:
            datetime = datetime.replace(r, repl[r])
        datetime = self.localize(
            dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p"))

        event = Event(description, start_date=datetime, location_name=where)
        event.add_source(url)

        if ctty_name.startswith("Hearing Notice For"):
            ctty_name = ctty_name.replace("Hearing Notice For", "").strip()
        event.add_participant(ctty_name, "organization")

        bills = tables[1]
        for bill in bills.xpath(".//tr")[1:]:
            tds = bill.xpath(".//td")
            if len(tds) < 4:
                continue
            # First, let's get the bill ID:
            bill_id = tds[0].text_content()
            agenda_item = event.add_agenda_item(bill_id)
            agenda_item.add_bill(bill_id)

        return event
Example #18
    def scrape_meeting_notice(self, item, url):
        # Event Name is not provided for all meetings, so build it from the committee fields.
        if "Joint" in str(item["CommitteeName"]):
            event_name = str(item["CommitteeName"])
        else:
            event_name = "{} {}".format(str(item["CommitteeTypeName"]),
                                        str(item["CommitteeName"]))
        # 04/25/2012 03:00:00 PM
        fmt = "%m/%d/%Y %I:%M:%S %p"  # matches the sample format above
        start_time = dt.datetime.strptime(str(item["MeetingDateTime"]), fmt)
        location_name = str(item["AddressAliasNickname"])
        event = Event(
            location_name=location_name,
            start_date=self._tz.localize(start_time),
            name=event_name,
            description="Committee Meeting Status: {}".format(
                item["CommitteeMeetingStatusName"]),
        )

        event.add_committee(name=str(item["CommitteeName"]),
                            id=item["CommitteeId"])

        html_url = f'https://legis.delaware.gov/MeetingNotice?committeeMeetingId={item["CommitteeMeetingId"]}'
        event.add_source(html_url)

        page_url = f'https://legis.delaware.gov/json/MeetingNotice/GetCommitteeMeetingItems?committeeMeetingId={item["CommitteeMeetingId"]}'

        page_data = []
        try:
            page_data = self.post(page_url).json()["Data"]
        except json.decoder.JSONDecodeError:
            # No agenda items
            self.info(f"POST returned nothing on {page_url}")

        for agenda_row in page_data:
            a = event.add_agenda_item(description=str(agenda_row["ItemDescription"]))
            if agenda_row["LegislationDisplayText"] is not None:
                a.add_bill(agenda_row["LegislationDisplayText"])

            event.add_person(
                name=str(agenda_row["PrimarySponsorShortName"]),
                id=str(agenda_row["PrimarySponsorPersonId"]),
                note="Sponsor",
            )

        yield event
Example #19
    def scrape_senate(self):
        url = "https://www.senate.gov/general/committee_schedules/hearings.xml"

        page = self.get(url).content
        page = lxml.etree.fromstring(page)

        rows = page.xpath("//meeting")

        for row in rows:
            com = row.xpath("string(committee)")

            if com == "":
                continue

            com = "Senate {}".format(com)

            address = row.xpath("string(room)")
            parts = address.split("-")
            building_code = parts[0]

            if self.buildings.get(building_code):
                address = "{}, Room {}".format(
                    self.buildings.get(building_code), parts[1])

            agenda = row.xpath("string(matter)")

            try:
                # %I (12-hour clock) is required for %p to take effect
                event_date = datetime.datetime.strptime(
                    row.xpath("string(date)"), "%d-%b-%Y %I:%M %p")
            except ValueError:
                event_date = datetime.datetime.strptime(
                    row.xpath("string(date)"), "%d-%b-%Y")

            event_date = self._TZ.localize(event_date)

            event = Event(start_date=event_date,
                          name=com,
                          location_name=address)

            agenda_item = event.add_agenda_item(description=agenda)

            # ex: Business meeting to consider S.785, to improve mental...
            matches = re.findall(r"\s(\w+)\.(\d+),", agenda)

            if matches:
                match = matches[0]
                bill_type = match[0]
                bill_number = match[1]
                bill_name = "{} {}".format(bill_type, bill_number)
                agenda_item.add_bill(bill_name)

            event.add_participant(
                com,
                type="committee",
                note="host",
            )

            event.add_source(
                "https://www.senate.gov/committees/hearings_meetings.htm")

            yield event
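# Sanity check of the agenda bill pattern above, using the scraper's own
# inline example string.
import re

agenda = "Business meeting to consider S.785, to improve mental health care"
print(re.findall(r"\s(\w+)\.(\d+),", agenda))  # [('S', '785')] -> "S 785"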
Example #20
    def scrape(self, chamber=None, session=None):
        url = "http://leg.colorado.gov/content/committees"
        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)
        chambers = [chamber] if chamber else ["upper", "lower"]
        for chamber in chambers:
            if chamber == "lower":
                xpath = (
                    '//div/h3[text()="House Committees of Reference"]/../'
                    'following-sibling::div[contains(@class,"view-content")]/'
                    'table//td//span[contains(@class,"field-content")]/a/@href'
                )
            elif chamber == "upper":
                xpath = (
                    '//div/h3[text()="Senate Committees of Reference"]/../'
                    'following-sibling::div[contains(@class,"view-content")]/'
                    'table//td//span[contains(@class,"field-content")]/a/@href'
                )
            elif chamber == "other":
                # All the links under the headers that don't contain "House" or "Senate"
                xpath = (
                    '//div/h3[not(contains(text(),"House")) and '
                    'not(contains(text(),"Senate"))]/../'
                    'following-sibling::div[contains(@class,"view-content")]/'
                    'table//td//span[contains(@class,"field-content")]/a/@href'
                )

            page = self.lxmlize(url)
            com_links = page.xpath(xpath)

            for link in com_links:
                page = self.lxmlize(link)

                hearing_links = page.xpath(
                    '//div[contains(@class,"schedule-item-content")]'
                    "/h4/a/@href")

                for link in hearing_links:
                    try:
                        page = self.lxmlize(link)

                        title = page.xpath(
                            '//header/h1[contains(@class,"node-title")]')[0]
                        title = title.text_content().strip()

                        date_day = page.xpath(
                            '//div[contains(@class,"calendar-date")]')[0]
                        date_day = date_day.text_content().strip()

                        details = page.xpath(
                            '//span[contains(@class, "calendar-details")]')[0]
                        details = details.text_content().split("|")

                        date_time = details[0].strip()
                        location = details[1].strip()

                        if "Upon Adjournment" in date_time:
                            date = dt.datetime.strptime(
                                date_day, "%A %B %d, %Y")
                        else:
                            date_str = "{} {}".format(date_day, date_time)
                            date = dt.datetime.strptime(
                                date_str, "%A %B %d, %Y %I:%M %p")

                        agendas = []
                        # they overload the bills table w/ other agenda items; colspan=2 is agenda
                        non_bills = page.xpath(
                            '//td[@data-label="Hearing Item" and @colspan="2"]'
                        )
                        for row in non_bills:
                            content = row.text_content().strip()
                            agendas.append(content)

                        agenda = "\n".join(agendas) if agendas else ""

                        event = Event(
                            name=title,
                            start_date=self._tz.localize(date),
                            location_name=location,
                        )
                        if agenda.strip():
                            event.add_agenda_item(agenda)

                        event.add_source(link)
                        bills = page.xpath(
                            '//td[@data-label="Hearing Item"]/a')
                        for bill in bills:
                            bill_id = bill.text_content().strip()

                            item = event.add_agenda_item("hearing item")
                            item.add_bill(bill_id)

                        yield event
                    except Exception as e:  # TODO: narrow this down
                        self.warning("skipping event at %s: %s", link, e)
Example #21
    def scrape_event_page(self, url, chamber):
        html = self.get(url).text
        page = lxml.html.fromstring(html)
        trs = page.xpath(
            "//table[@id='frg_mcommitteemeeting_MeetingTable']/tr")
        metainf = {}
        for tr in trs:
            tds = tr.xpath(".//td")
            if len(tds) <= 1:
                continue
            key = tds[0].text_content().strip()
            val = tds[1]
            metainf[key] = {"txt": val.text_content().strip(), "obj": val}

        if metainf == {}:
            return

        # Wednesday, 5/16/2012 3:00 pm
        datetime = "%s %s" % (
            metainf["Date"]["txt"],
            metainf["Time"]["txt"].replace(".", ""),
        )
        if "Cancelled" in datetime:
            return

        translate = {
            "noon": " PM",
            "a.m.": " AM",
            "am": " AM",  # This is due to a nasty line they had.
            "a.m": "AM",  # another weird one
        }

        for t in translate:
            if t in datetime:
                datetime = datetime.replace(t, translate[t])

        datetime = re.sub(r"\s+", " ", datetime)

        for text_to_remove in [
                "or after committees are given leave",
                "or later immediately after committees are given leave",
                "or later after committees are given leave by the House to meet",
                "**Please note time**",
        ]:
            datetime = datetime.split(text_to_remove)[0].strip()

        datetime = datetime.replace("p.m.", "pm")
        datetime = datetime.replace("Noon", "pm")
        try:
            datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
        except ValueError:
            datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I %p")
        where = metainf["Location"]["txt"]
        title = metainf["Committee(s)"]["txt"]  # XXX: Find a better title

        if chamber == "other":
            chamber = "joint"

        event = Event(name=title,
                      start_date=self._tz.localize(datetime),
                      location_name=where)
        event.add_source(url)
        event.add_source(mi_events)

        chair_name = metainf["Chair"]["txt"].strip()
        if chair_name:
            event.add_participant(chair_name, type="legislator", note="chair")
        else:
            self.warning("No chair found for event '{}'".format(title))

        event.add_participant(metainf["Committee(s)"]["txt"],
                              type="committee",
                              note="host")

        agenda = metainf["Agenda"]["obj"]
        agendas = agenda.text_content().split("\r")

        # .// keeps the search inside the agenda cell; a bare // would search
        # the whole document
        related_bills = agenda.xpath(".//a[contains(@href, 'getObject')]")
        for bill in related_bills:
            # default to the link text; the agenda element itself is not a
            # usable description
            description = bill.text_content()
            for a in agendas:
                if bill.text_content() in a:
                    description = a

            item = event.add_agenda_item(description)
            item.add_bill(bill.text_content())

        yield event
Example #22
    def scrape(self):
        url = "https://www.ncleg.gov/LegislativeCalendar/"
        page = self.lxmlize(url)
        page.make_links_absolute(url)
        for day_row in page.xpath('//div[@class="row cal-event-day"]'):

            date = day_row.xpath(
                './/div[contains(@class, "cal-event-day-full")]/text()'
            )[0].strip()
            for row in day_row.xpath(
                    './/div[contains(@class, "cal-event row")]'):
                # first cal-event-row sometimes contains full date, skip that
                time = row.xpath(
                    'div[contains(@class,"col-12 text-left col-sm-3 text-sm-right")]/text()'
                )[0].strip()

                event_row = row.xpath(
                    'div[contains(@class,"col-12 col-sm-9 col-md-12 ")]')[0]

                # skip floor sessions
                if event_row.xpath(
                        './/a[contains(text(), "Session Convenes")]'):
                    continue

                chamber = ""
                if len(
                        event_row.xpath(
                            'span[contains(@class, "text-dark font-weight-bold")]/text()'
                        )):
                    chamber = event_row.xpath(
                        'span[contains(@class, "text-dark font-weight-bold")]/text()'
                    )[0].strip()
                    chamber = chamber.replace(":", "")

                # sometimes there are unlinked events, usually just press conferences
                if not event_row.xpath('a[contains(@href,"/Committees/")]'):
                    continue

                com_link = event_row.xpath(
                    'a[contains(@href,"/Committees/")]')[0]
                com_name = com_link.text_content().strip()
                com_name = f"{chamber} {com_name}".strip()

                com_url = com_link.xpath("@href")[0]

                where = (row.xpath('div[contains(@class,"col-12 offset-sm-3")]'
                                   )[0].text_content().strip())
                where = where.replace("STREAM", "")

                when = f"{date} {time}"
                try:
                    when = dateutil.parser.parse(when)
                    # occasionally they'd do 9am-1pm which confuses the TZ detection
                    when = self._tz.localize(when)
                except (ParserError, ValueError):
                    self.warning(
                        f"Unable to parse {time}, only using day component")
                    when = dateutil.parser.parse(date)
                    when = self._tz.localize(when).date()

                event = Event(
                    name=com_name,
                    start_date=when,
                    location_name=where,
                    classification="committee-meeting",
                )
                event.add_source(com_url)

                event.add_participant(com_name, type="committee", note="host")

                # NOTE: if you follow the committee link, there are agenda PDF links
                # but they don't load at all as of 2021-02-01 -- showerst

                for agenda_row in event_row.xpath(".//p"):
                    agenda_text = agenda_row.text_content().strip()
                    if agenda_text != "":
                        agenda = event.add_agenda_item(agenda_text)

                        for bill_row in agenda_row.xpath(
                                './/a[contains(@href,"BillLookUp")]/text()'):
                            agenda.add_bill(bill_row.split(":")[0])

                yield event
Example #23
    def scrape_meeting_page(self, url):
        page = self.lxmlize(url)
        page.make_links_absolute(url)

        if page.xpath('//div[text()="Error"]'):
            return

        if not page.xpath('//div[@id="wrapleftcol"]/h3'):
            return

        com = page.xpath('//div[@id="wrapleftcol"]/h3[1]/text()')[0].strip()
        when = page.xpath('//div[@id="wrapleftcol"]/h1[1]/text()')[0].strip()

        if "time to be announced" in when.lower() or "tba" in when.lower():
            when = re.sub("time to be announced",
                          "",
                          when,
                          flags=re.IGNORECASE)
            when = re.sub("TBA", "", when, flags=re.IGNORECASE)

        when = re.sub(r"or\s+conclusion\s+(.*)", "", when, flags=re.IGNORECASE)

        when = when.split("-")[0]
        when = self.clean_date(when)
        when = dateutil.parser.parse(when)
        when = self._tz.localize(when)

        # we check for this elsewhere, but just in case the very first event on a committee page is way in the past
        if when.year < datetime.datetime.today().year:
            return

        where = page.xpath(
            '//div[@id="wrapleftcol"]/*[contains(text(), "Location")]/text()'
        )[0].strip()
        desc = (page.xpath('//div[@id="wrapleftcol"]/blockquote[1]')
                [0].text_content().strip())

        event = Event(
            name=com,
            start_date=when,
            location_name=where,
            classification="committee-meeting",
            description=desc,
        )

        for row in page.xpath('//div[@id="wrapleftcol"]/blockquote[1]/p'):
            if row.text_content().strip() != "":
                agenda = event.add_agenda_item(row.text_content().strip())
                for bill in re.findall(self.bill_regex, row.text_content()):
                    bill_id = re.sub(r"\.\s*",
                                     "",
                                     bill[0],
                                     flags=re.IGNORECASE)
                    bill_id = re.sub(r"house bill",
                                     "HB",
                                     bill_id,
                                     flags=re.IGNORECASE)
                    bill_id = re.sub(r"senate bill",
                                     "SB",
                                     bill_id,
                                     flags=re.IGNORECASE)
                    agenda.add_bill(bill_id)

        event.add_source(url)

        yield event
Example #24
    def scrape(self, session=None, start=None, end=None):

        if session is None:
            session = self.latest_session()
            self.info("no session specified, using %s", session)

        # testimony url, we'll need it later in a loop

        # the testimony query looks gnarly but breaks down to:
        # $filter: (Request/PaperNumber eq 'SP0219') and (Request/Legislature eq 129)
        # $orderby: LastName,FirstName,Organization
        # $expand: Request
        # $select: Id,FileType,NamePrefix,FirstName,LastName,Organization,
        # PresentedDate,FileSize,Topic

        testimony_url_base = (
            "http://legislature.maine.gov/backend/"
            "breeze/data/CommitteeTestimony?"
            "$filter=(Request%2FPaperNumber%20eq%20%27{}%27)%20and"
            "%20(Request%2FLegislature%20eq%20{})"
            "&$orderby=LastName%2CFirstName%2COrganization&"
            "$expand=Request&$select=Id%2CFileType%2CNamePrefix"
            "%2CFirstName%2CLastName%2COrganization%2CPresentedDate%2CFileSize%2CTopic"
        )

        if start is None:
            start_date = datetime.datetime.now().isoformat()
        else:
            start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
            start_date = start_date.isoformat()

        # default to 30 days if no end
        if end is None:
            dtdelta = datetime.timedelta(days=30)
            end_date = datetime.datetime.now() + dtdelta
            end_date = end_date.isoformat()
        else:
            end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
            end_date = end_date.isoformat()

        bills_by_event = {}

        bills_url = ("http://legislature.maine.gov/backend/breeze/data/"
                     "getCalendarEventsBills?startDate={}&endDate={}")
        bills_url = bills_url.format(start_date, end_date)
        page = json.loads(self.get(bills_url).content)

        for row in page:
            bills_by_event.setdefault(row["EventId"], [])
            bills_by_event[row["EventId"]].append(row)

        # http://legislature.maine.gov/backend/breeze/data/getCalendarEventsRaw?startDate=2019-03-01T05%3A00%3A00.000Z&endDate=2019-04-01T03%3A59%3A59.999Z&OnlyPHWS=false
        url = ("http://legislature.maine.gov/backend/breeze/data/"
               "getCalendarEventsRaw?startDate={}&endDate={}&OnlyPHWS=true")
        url = url.format(start_date, end_date)

        page = json.loads(self.get(url).content)

        for row in page:
            if row["Cancelled"] is True or row["Postponed"] is True:
                continue

            start_date = self._TZ.localize(
                dateutil.parser.parse(row["FromDateTime"]))
            end_date = self._TZ.localize(
                dateutil.parser.parse(row["ToDateTime"]))

            name = row["CommitteeName"]

            if name is None:
                name = row["Host"]

            address = row["Location"]
            address = address.replace(
                "Cross Building",
                "Cross Office Building, 111 Sewall St, Augusta, ME 04330",
            )

            address = address.replace(
                "State House",
                "Maine State House, 210 State St, Augusta, ME 04330")

            event = Event(
                start_date=start_date,
                end_date=end_date,
                name=name,
                location_name=address,
            )

            event.add_source(
                "http://legislature.maine.gov/committee/#Committees/{}".format(
                    row["CommitteeCode"]))

            if bills_by_event.get(row["Id"]):
                for bill in bills_by_event[row["Id"]]:
                    description = "LD {}: {}".format(bill["LD"], bill["Title"])
                    agenda = event.add_agenda_item(description=description)
                    agenda.add_bill("LD {}".format(bill["LD"]))

                    if bill["TestimonyCount"] > 0:
                        test_url = testimony_url_base.format(
                            bill["PaperNumber"], session)
                        test_page = json.loads(self.get(test_url).content)
                        for test in test_page:
                            title = "{} {} - {}".format(
                                test["FirstName"],
                                test["LastName"],
                                test["Organization"],
                            )
                            if test["NamePrefix"] is not None:
                                title = "{} {}".format(test["NamePrefix"],
                                                       title)

                            test_url = (
                                "http://legislature.maine.gov/backend/app/services"
                                "/getDocument.aspx?doctype=test&documentId={}".
                                format(test["Id"]))

                            if test["FileType"] == "pdf":
                                media_type = "application/pdf"
                            else:
                                # assumption: generic fallback so media_type
                                # is never unbound for non-pdf files
                                media_type = "application/octet-stream"

                            event.add_document(note=title,
                                               url=test_url,
                                               media_type=media_type)
            yield event
Example #25
    def scrape_lower(self):
        url = "https://www.house.leg.state.mn.us/Schedules/All"
        page = self.lxmlize(url)

        for row in page.xpath('//div[contains(@class,"my-2 d-print-block")]'):
            # print(row.text_content())

            # skip floor sessions and unlinked events
            if not row.xpath(
                    'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b'
            ):
                continue

            # skip joint ones, we'll get those from the senate API
            if row.xpath('div[contains(@class,"card-header bg-joint")]'):
                continue

            # top-level committee
            com = row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
            )[0].strip()
            com_link = row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/@href'
            )[0]

            when = (row.xpath(
                'div[contains(@class,"card-header")]/span[contains(@class,"text-white")]/text()'
            )[0].replace("\r\n", "").strip())
            when = dateutil.parser.parse(when)
            when = self._tz.localize(when)

            if row.xpath('.//b[.="Location:"]'):
                where = row.xpath(
                    './/b[.="Location:"]/following-sibling::text()[1]'
                )[0].strip()
            else:
                where = "See committee page"

            if row.xpath('.//b[.="Agenda:"]'):
                desc = "\n".join(
                    row.xpath('.//b[.="Agenda:"]/following-sibling::div/text()'
                              )).strip()
            else:
                desc = "See committee page"

            event = Event(
                name=com,
                start_date=when,
                location_name=where,
                classification="committee-meeting",
                description=desc,
            )

            event.add_source(com_link)

            for bill in get_bill_ids(desc):
                event.add_bill(bill)

            if row.xpath(
                    ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]"
            ):
                agenda = event.add_agenda_item("Bills")
                for bill_id in row.xpath(
                        ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]/text()"
                ):
                    agenda.add_bill(bill_id.strip())

            for attachment in row.xpath(".//ul/li/div/a"):
                doc_url = attachment.xpath("@href")[0]
                doc_name = attachment.xpath("text()")[0].strip()
                # if they don't provide a name just use the filename
                if doc_name == "":
                    parsed_url = urlparse(doc_url)
                    doc_name = os.path.basename(parsed_url.path)

                # sometimes broken links to .msg files (emails?) are attached,
                # they always 404.
                if doc_url.endswith(".msg"):
                    continue
                media_type = get_media_type(doc_url)
                event.add_document(doc_name,
                                   doc_url,
                                   media_type=media_type,
                                   on_duplicate="ignore")

            for committee in row.xpath(
                    'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
            ):
                event.add_participant(committee, type="committee", note="host")

            yield event
Example #26
    def scrape_committee_page(self, url):
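        """Scrape meetings from an Indiana committee agenda page and yield Events."""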
        page = self.get(url, headers=self.cf_headers).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        com = page.xpath(
            '//div[contains(@class, "pull-left span8")]/h1/text()')[0].strip()

        for row in page.xpath('//div[contains(@id, "agenda-item")]'):
            # status = "tentative"
            meta = row.xpath(
                'div[contains(@class,"accordion-heading-agenda")]/a')[0]

            date = meta.xpath("text()")[0].strip()

            time_and_loc = meta.xpath("span/text()")[0].strip()
            time_and_loc = time_and_loc.split("\n")
            time = time_and_loc[0]
            loc = time_and_loc[1]

            if loc == "":
                loc = "See Agenda"

            com = com.replace("(S)", "Senate").replace("(H)", "House")

            # Indiana has a LOT of undefined times, stuff like "15 mins after adj. of elections"
            # so just remove the time component if it won't parse, and the user can go to the agenda
            try:
                when = dateutil.parser.parse(f"{date} {time}")
            except dateutil.parser._parser.ParserError:
                when = dateutil.parser.parse(date)
            when = self._tz.localize(when)

            if "cancelled" in time.lower():
                continue

            event = Event(
                name=com,
                start_date=when,
                location_name=loc,
                classification="committee-meeting",
            )

            event.add_source(url)
            event.add_participant(com, type="committee", note="host")

            if row.xpath('.//a[contains(text(), "View Agenda")]'):
                agenda_url = row.xpath(
                    './/a[contains(text(), "View Agenda")]/@href')[0]
                event.add_document("Agenda",
                                   agenda_url,
                                   media_type="application/pdf")

            if row.xpath('.//a[contains(text(), "Watch")]'):
                vid_url = row.xpath('.//a[contains(text(), "Watch")]/@href')[0]
                event.add_media_link("Video of Hearing",
                                     vid_url,
                                     media_type="text/html")

            if row.xpath('.//tr[contains(@class,"bill-container")]/td'):
                agenda = event.add_agenda_item("Bills under consideration")
                for bill_row in row.xpath(
                        './/tr[contains(@class,"bill-container")]'):
                    bill_id = bill_row.xpath(
                        ".//a[contains(@class,'bill-name-link')]/text()")[0]
                    agenda.add_bill(bill_id)

            yield event
Example #27
    def house_meeting(self, xml, source_url):
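        """Parse a US House committee meeting XML notice and yield an Event."""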

        title = xml.xpath("string(//meeting-details/meeting-title)")

        meeting_date = xml.xpath("string(//meeting-date/calendar-date)")
        start_time = xml.xpath("string(//meeting-date/start-time)")
        end_time = xml.xpath("string(//meeting-date/end-time)")

        start_dt = datetime.datetime.strptime(
            "{} {}".format(meeting_date, start_time), "%Y-%m-%d %H:%M:%S")

        start_dt = self._TZ.localize(start_dt)

        end_dt = None

        if end_time != "":
            end_dt = datetime.datetime.strptime(
                "{} {}".format(meeting_date, end_time), "%Y-%m-%d %H:%M:%S")
            end_dt = self._TZ.localize(end_dt)

        building = xml.xpath(
            "string(//meeting-details/meeting-location/capitol-complex/building)"
        )

        address = "US Capitol"
        if building != "Select one":
            if self.buildings.get(building):
                building = self.buildings.get(building)

            room = xml.xpath(
                "string(//meeting-details/meeting-location/capitol-complex/room)"
            )
            address = "{}, Room {}".format(building, room)

        event = Event(start_date=start_dt, name=title, location_name=address)

        event.add_source(source_url)

        coms = xml.xpath(
            "//committees/committee-name | //subcommittees/committee-name")
        for com in coms:
            com_name = com.xpath("string(.)")
            com_name = "House {}".format(com_name)
            event.add_participant(
                com_name,
                type="committee",
                note="host",
            )

        docs = xml.xpath("//meeting-documents/meeting-document")
        for doc in docs:
            doc_name = doc.xpath("string(description)")
            doc_files = doc.xpath("files/file")
            for doc_file in doc_files:
                media_type = self.media_types[doc_file.get("doc-type")]
                url = doc_file.get("doc-url")

                if doc.get("type") in ["BR", "AM", "CA"]:
                    if doc_name == "":
                        doc_name = doc.xpath("string(legis-num)").strip()
                    matches = re.findall(r"([\w|\.]+)\s+(\d+)", doc_name)

                    if matches:
                        match = matches[0]
                        bill_type = match[0].replace(".", "")
                        bill_number = match[1]
                        bill_name = "{} {}".format(bill_type, bill_number)
                        agenda = event.add_agenda_item(description=bill_name)
                        agenda.add_bill(bill_name)

                if doc_name == "":
                    try:
                        doc_name = self.hearing_document_types[doc.get("type")]
                    except KeyError:
                        self.warning("Unable to find document type: {}".format(
                            doc.get("type")))

                event.add_document(doc_name,
                                   url,
                                   media_type=media_type,
                                   on_duplicate="ignore")

        yield event
Example #28
    def scrape(self):
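        """Scrape the legislature's hearing notice table and yield committee-meeting Events."""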

        # get_short_codes() populates self.short_ids, mapping each committee
        # abbreviation to its chamber and full name
        get_short_codes(self)
        page = self.lxmlize(URL)

        if page.xpath("//td[contains(string(.),'No Hearings')]"):
            raise EmptyScrape

        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()

            # Multi-committee events will be CODE1/CODE2/CODE3
            if "/" in committee:
                coms = committee.split("/")
                com_names = []
                for com in coms:
                    com_names.append("{} {}".format(
                        self.chambers[self.short_ids[com]["chamber"]],
                        self.short_ids[com]["name"],
                    ))
                descr = ", ".join(com_names)
            elif self.short_ids.get(committee):
                descr = "{} {}".format(
                    self.chambers[self.short_ids[committee]["chamber"]],
                    self.short_ids[committee]["name"],
                )
            else:
                descr = [x.text_content() for x in tds[1].xpath(".//span")]
                if len(descr) != 1:
                    raise Exception(
                        "expected a single committee description, got {}".
                        format(len(descr)))
                descr = descr[0].replace(".", "").strip()

            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib["href"]
            notice_name = notice.text

            # the listing page shows the same hearing in multiple rows.
            # combine these -- get_related_bills() will take care of adding the bills
            # and descriptions
            if notice_href in self.seen_hearings:
                continue
            self.seen_hearings.append(notice_href)

            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = TIMEZONE.localize(when)
            event = Event(
                name=descr,
                start_date=when,
                classification="committee-meeting",
                description=descr,
                location_name=where,
            )

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee and committee in self.short_ids:
                    committee = "{} {}".format(
                        self.chambers[self.short_ids[committee]["chamber"]],
                        self.short_ids[committee]["name"],
                    )
                event.add_committee(committee, note="host")

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               media_type="text/html")
            for bill in self.get_related_bills(notice_href):
                a = event.add_agenda_item(description=bill["descr"].strip())
                bill["bill_id"] = bill["bill_id"].split(",")[0]
                a.add_bill(bill["bill_id"], note=bill["type"])
            yield event
Example #29
    def scrape_chamber(self, chamber):
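        """Scrape the Idaho legislature's agenda calendar for the given chamber."""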
        if chamber == "upper":
            url = "https://legislature.idaho.gov/sessioninfo/agenda/sagenda/"
        elif chamber == "lower":
            url = "https://legislature.idaho.gov/sessioninfo/agenda/hagenda/"

        page = self.get(url).content
        page = lxml.html.fromstring(page)

        for row in page.xpath('//div[@id="ai1ec-container"]/div'):
            month = row.xpath(
                ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'date')]/text()"
            )[0].strip()
            day = row.xpath(
                ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'date')]/span/text()"
            )[0].strip()

            time_and_loc = row.xpath(
                ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'abbr')]/h2/text()"
            )
            time = time_and_loc[0].strip()
            loc = time_and_loc[1].strip()

            if "not meet" in time.lower():
                continue

            try:
                start = dateutil.parser.parse(f"{month} {day} {time}")
            except dateutil.parser._parser.ParserError:
                start = dateutil.parser.parse(f"{month} {day}")

            start = self._tz.localize(start)

            com = row.xpath(
                ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'day')]/h2/a/text()"
            )[0].strip()

            event = Event(
                name=com,
                start_date=start,
                location_name=loc,
                classification="committee-meeting",
            )

            event.add_participant(com, type="committee", note="host")

            # not every row links a "Full Agenda" PDF, so guard the lookup
            agenda_urls = row.xpath(
                './/a[contains(text(), "Full Agenda")]/@href')
            if agenda_urls:
                event.add_document("Agenda",
                                   agenda_urls[0],
                                   media_type="application/pdf")

            agenda_rows = row.xpath(
                './/div[contains(@class,"card")]/div[contains(@id, "Agenda")]/div/table/tbody/tr'
            )[1:]

            for agenda_row in agenda_rows:
                subject = agenda_row.xpath("string(td[1])").strip()
                description = agenda_row.xpath("string(td[2])").strip()
                presenter = agenda_row.xpath("string(td[3])").strip()
                if presenter != "":
                    agenda_text = (
                        f"{subject} {description} Presenter: {presenter}".
                        strip())
                    # the presenter, not the whole agenda line, is the person
                    event.add_participant(presenter,
                                          type="person",
                                          note="Presenter")
                else:
                    agenda_text = f"{subject} {description}".strip()

                agenda = event.add_agenda_item(agenda_text)

                if agenda_row.xpath(
                        'td[1]/a[contains(@href,"/legislation/")]'):
                    agenda.add_bill(
                        agenda_row.xpath(
                            'td[1]/a[contains(@href,"/legislation/")]/text()')
                        [0].strip())

            event.add_source(url)
            yield event
Example #30
    def scrape_upper(self):
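        """Scrape upcoming hearings from the Minnesota Senate schedule API."""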
        url = "https://www.senate.mn/api/schedule/upcoming"
        data = self.get(url).json()

        for row in data["events"]:
            com = row["committee"]["committee_name"]
            start = dateutil.parser.parse(row["hearing_start"])
            start = self._tz.localize(start)

            if (row["hearing_room"] and "hearing_building" in row
                    and row["hearing_building"]):
                where = f"{row['hearing_building']} {row['hearing_room']}"
            elif "hearing_building" in row and row["hearing_building"]:
                where = row["hearing_building"]
            else:
                where = "TBD"

            description = ""

            if "hearing_notes" in row and row["hearing_notes"]:
                description = row["hearing_notes"]

            event = Event(
                name=com,
                location_name=where,
                start_date=start,
                classification="committee-meeting",
                description=description,
            )

            for bill in get_bill_ids(description):
                event.add_bill(bill)

            if "lrl_schedule_link" in row:
                event.add_source(row["lrl_schedule_link"])
            else:
                if "link" in row["committee"]:
                    if row["committee"]["link"].startswith("http"):
                        event.add_source(row["committee"]["link"])
                    elif row["committee"]["link"].startswith("www"):
                        event.add_source(f"http://{row['committee']['link']}")
                    else:
                        event.add_source(
                            f"https://www.senate.mn/{row['committee']['link']}"
                        )
                elif "senate_chair_link" in row["committee"]:
                    event.add_source(
                        f"https://www.senate.mn/{row['committee']['senate_chair_link']}"
                    )

            if "agenda" in row:
                for agenda_row in row["agenda"]:
                    if (agenda_row["description"] is None
                            or agenda_row["description"].strip() == ""):
                        # sometimes they have blank agendas but bills or files
                        agenda_row["description"] = "Agenda"
                    agenda = event.add_agenda_item(agenda_row["description"])
                    if "bill_type" in agenda_row:
                        agenda.add_bill("{} {}".format(
                            agenda_row["bill_type"].replace(".", ""),
                            agenda_row["bill_number"],
                        ))

                    if "files" in agenda_row:
                        for file_row in agenda_row["files"]:
                            doc_name = file_row["filename"]
                            doc_url = file_row["file_path"]

                            # if they don't provide a name just use the filename
                            if doc_name == "":
                                parsed_url = urlparse(doc_url)
                                doc_name = os.path.basename(parsed_url.path)

                            event.add_document(
                                doc_name,
                                f"https://www.senate.mn/{doc_url}",
                                media_type="text/html",
                                on_duplicate="ignore",
                            )

            if "video_link" in row:
                event.add_media_link("Video", row["video_link"], "text/html")

            if "audio_link" in row:
                event.add_media_link("Audio", row["audio_link"], "text/html")

            yield event