Example #1
    def scrape_lower_event(self, url):
        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        table = page.xpath('//section[@id="leg-agenda-mod"]/div/table')[0]
        meta = table.xpath("tr[1]/td[1]/text()")

        # careful: the committee name in the page's #committee_div
        # is inserted via JS,
        # so use the one from the table and strip the chair name
        com_name = re.sub(r"\(.*\)", "", meta[0])
        com_name = f"Assembly {com_name}"

        when = dateutil.parser.parse(meta[1])
        when = self._tz.localize(when)
        location = meta[2]

        event = Event(
            name=com_name,
            start_date=when,
            location_name=location,
        )

        event.add_participant(com_name, type="committee", note="host")

        event.add_source(url)

        if table.xpath('.//a[contains(@href, "/leg/")]'):
            agenda = event.add_agenda_item("Bills under Consideration")
            for bill_link in table.xpath('.//a[contains(@href, "/leg/")]'):
                agenda.add_bill(bill_link.text_content().strip())

        yield event
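
The chair name is stripped with re.sub(r"\(.*\)", "", ...) before the chamber prefix is added, and the parsed time is localized to the scraper's timezone. A standalone sketch of that cleanup, with hypothetical sample values in place of the scraped table cells:

    import re

    import dateutil.parser
    import pytz

    # hypothetical values standing in for the scraped table cells
    meta = ["Ways and Means (Chair: A. Smith)", "January 5, 2022 10:00 AM", "Room 923"]

    com_name = re.sub(r"\(.*\)", "", meta[0]).strip()
    com_name = f"Assembly {com_name}"  # -> "Assembly Ways and Means"

    tz = pytz.timezone("America/New_York")  # assumed; the scraper's _tz is equivalent
    when = tz.localize(dateutil.parser.parse(meta[1]))

    print(com_name, when.isoformat(), meta[2])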
Example #2
    def upper_parse_agenda_item(self, item):
        response = self.api_client.get(
            "meeting",
            year=item["agendaId"]["year"],
            agenda_id=item["agendaId"]["number"],
            committee=item["committeeId"]["name"],
        )

        data = response["result"]

        chamber = data["committee"]["committeeId"]["chamber"].title()
        com_code = data["committee"]["committeeId"]["name"]
        com_name = f"{chamber} {com_code}"

        # each "meeting" is actually a listing page of multiple meetings of the same committee
        # broken out by different addendumId
        for addendum in data["committee"]["addenda"]["items"]:
            if addendum["addendumId"] != item["addendum"]:
                continue

            meeting = addendum["meeting"]

            when = dateutil.parser.parse(meeting["meetingDateTime"])
            when = self._tz.localize(when)

            location = meeting["location"]
            description = meeting["notes"]

            if location == "":
                location = "See Committee Site"

            if "canceled" in description.lower():
                continue

            event = Event(
                name=com_name,
                start_date=when,
                location_name=location,
                description=description,
            )

            event.add_participant(com_name, type="committee", note="host")

            com_code = (
                com_code.lower()
                .replace("'", "")
                .replace(" ", "-")
                .replace(",", "")
            )
            url = f"https://www.nysenate.gov/committees/{com_code}"
            event.add_source(url)

            bills = addendum["bills"]["items"]

            if len(bills) > 0:
                agenda = event.add_agenda_item("Bills under consideration")

            for bill in bills:
                agenda.add_bill(bill["billId"]["printNo"])

            yield event
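
The committee source URL is built by slugifying the committee code. A minimal helper capturing the same transformation (the committee name below is hypothetical):

    def nysenate_committee_slug(com_code: str) -> str:
        # lowercase, drop apostrophes and commas, hyphenate spaces
        return com_code.lower().replace("'", "").replace(",", "").replace(" ", "-")

    slug = nysenate_committee_slug("Children's Families")
    assert slug == "childrens-families"
    print(f"https://www.nysenate.gov/committees/{slug}")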
Example #3
    def scrape_events(self, session, start_date):
        session_key = SESSION_KEYS[session]

        if start_date is None:
            start_date = datetime.date.today()
        else:
            start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")

        committees_by_code = {}

        committees_response = self.api_client.get("committees",
                                                  session=session_key)
        for committee in committees_response:
            committees_by_code[
                committee["CommitteeCode"]] = committee["CommitteeName"]

        meetings_response = self.api_client.get(
            "committee_meetings",
            start_date=start_date.strftime(self._DATE_FORMAT),
            session=session_key,
        )

        if len(meetings_response) == 0:
            raise EmptyScrape

        for meeting in meetings_response:
            event_date = self._TZ.localize(
                datetime.datetime.strptime(meeting["MeetingDate"],
                                           self._DATE_FORMAT))
            com_name = committees_by_code[meeting["CommitteeCode"]]

            event = Event(start_date=event_date,
                          name=com_name,
                          location_name=meeting["Location"])

            event.add_source(meeting["AgendaUrl"])

            event.extras["meeting_guid"] = meeting["MeetingGuid"]
            event.extras["committee_code"] = committee["CommitteeCode"]

            event.add_participant(com_name, type="committee", note="host")

            agenda = None
            for row in meeting["CommitteeAgendaItems"]:
                if row["Comments"] is not None:
                    agenda = event.add_agenda_item(row["Comments"])

                # guard: a measure can appear before any agenda item exists
                if row["MeasureNumber"] is not None and agenda is not None:
                    bill_id = "{} {}".format(row["MeasurePrefix"],
                                             row["MeasureNumber"])
                    agenda.add_bill(bill_id)

            for row in meeting["CommitteeMeetingDocuments"]:
                event.add_document(
                    note=row["ExhibitTitle"],
                    url=row["DocumentUrl"],
                    on_duplicate="ignore",
                )
            yield event
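
The scraper resolves each meeting's committee code against a map built in a first pass over the committees endpoint. That map can also be expressed as a dict comprehension; the response shape below is assumed from the loop above:

    # assumed response shape, matching the loop above
    committees_response = [
        {"CommitteeCode": "HAG", "CommitteeName": "House Agriculture"},
        {"CommitteeCode": "SED", "CommitteeName": "Senate Education"},
    ]

    committees_by_code = {
        c["CommitteeCode"]: c["CommitteeName"] for c in committees_response
    }

    print(committees_by_code["HAG"])  # House Agriculture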
Example #4
    def scrape(self, start=None, end=None):
        if start is None:
            start = dt.datetime.today()
        else:
            start = dateutil.parser.parse(start)

        if end is None:
            end = start + relativedelta(months=+3)
        else:
            end = dateutil.parser.parse(end)

        start = start.strftime("%Y-%m-%d")
        end = end.strftime("%Y-%m-%d")

        url = f"{self.base_url}calendar-data?start={start}&end={end}"
        data = json.loads(self.scraper.get(url).content)

        for item in data:
            name = item["title"].strip()
            if "canceled" in name.lower():
                continue

            if "house session" in name.lower(
            ) or "senate session" in name.lower():
                continue

            url = f"{self.base_url}{item['url']}"

            when = dateutil.parser.parse(item["start"])
            when = self._tz.localize(when)

            page = self.scraper.get(url).content
            page = lxml.html.fromstring(page)

            location = page.xpath(
                '//div[contains(@class,"eventModule") and h3[contains(text(), "Location")]]/text()'
            )[0].strip()
            agenda_url = page.xpath(
                '//a[contains(@class,"linkButton") and contains(text(),"Agenda")]/@href'
            )[0]

            event = Event(
                name=name,
                start_date=when,
                location_name=location,
            )

            event.add_participant(name, type="committee", note="host")
            event.add_document("Agenda",
                               agenda_url,
                               media_type="application/pdf")
            event.add_source(url)

            yield event
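
When no bounds are passed, the scrape window defaults to three months starting today. A self-contained sketch of that windowing logic:

    import datetime as dt

    import dateutil.parser
    from dateutil.relativedelta import relativedelta

    def date_window(start=None, end=None):
        # default to [today, today + 3 months]; otherwise parse the given strings
        start = dt.datetime.today() if start is None else dateutil.parser.parse(start)
        end = start + relativedelta(months=+3) if end is None else dateutil.parser.parse(end)
        return start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d")

    print(date_window())                            # e.g. ("2022-01-05", "2022-04-05")
    print(date_window("2021-01-01", "2021-02-01"))  # ("2021-01-01", "2021-02-01")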
Example #5
    def scrape_upper(self):
        listing_url = "https://www.senate.mo.gov/hearingsschedule/hrings.htm"

        html = self.get(listing_url).text

        # The HTML here isn't wrapped in a container per-event
        # which makes xpath a pain. So string split by <hr>
        # then parse each event's fragment for cleaner results
        for fragment in html.split("<hr />")[1:]:
            page = lxml.html.fromstring(fragment)

            when_date = self.row_content(page, "Date:")
            when_time = self.row_content(page, "Time:")
            location = self.row_content(page, "Room:")

            location = "{}, {}".format(
                location, "201 W Capitol Ave, Jefferson City, MO 65101")

            # com = self.row_content(page, 'Committee:')
            com = page.xpath(
                '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
            )[0]
            com = com.split(", Senator")[0].strip()

            start_date = self._TZ.localize(
                dateutil.parser.parse("{} {}".format(when_date, when_time)))

            event = Event(start_date=start_date,
                          name=com,
                          location_name=location)

            event.add_source(listing_url)

            event.add_participant(com, type="committee", note="host")

            for bill_table in page.xpath(
                    '//table[@width="85%" and @border="0"]'):
                bill_link = ""
                if bill_table.xpath(self.bill_link_xpath):
                    agenda_line = bill_table.xpath("string(tr[2])").strip()
                    agenda_item = event.add_agenda_item(
                        description=agenda_line)

                    bill_link = bill_table.xpath(
                        self.bill_link_xpath)[0].strip()
                    agenda_item.add_bill(bill_link)
                else:
                    agenda_line = bill_table.xpath("string(tr[1])").strip()
                    agenda_item = event.add_agenda_item(
                        description=agenda_line)

            yield event
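
Splitting the raw HTML on "<hr />" gives each hearing its own fragment, so each lxml.html.fromstring() call produces an independent tree and relative xpath cannot leak across events. A minimal demonstration with inline sample markup:

    import lxml.html

    # hypothetical listing markup in the shape the comment above describes
    html = "<p>header</p><hr /><b>Date:</b> 1/5/2022<hr /><b>Date:</b> 1/6/2022"

    for fragment in html.split("<hr />")[1:]:
        # each fragment parses into its own tree, scoped to one event
        page = lxml.html.fromstring(fragment)
        print(page.xpath("string(.)").strip())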
Example #6
    def scrape_cal_page(self, url):
        page = self.get(url).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for row in page.xpath("//article[contains(@class,'accordion')]"):
            when = row.xpath(".//time/@datetime")[0]
            when = dateutil.parser.parse(when)

            title = row.xpath(
                ".//h3[contains(@class,'heading-link')]/text()")[0].strip()

            description = row.xpath(
                "section/div[contains(@class,'large-8')]/div[contains(@class,'base')]"
            )[0].text_content()

            # fix special chars
            description = (description.replace("\n\u2013", " ").replace(
                "\n", " ").replace("\u203a", ""))
            description = description.replace("More about this event",
                                              "").strip()

            location = row.xpath(
                "header/div/div[contains(@class,'large-8')]/div/div[contains(@class,'text-right')]/p"
            )[0].text_content()

            event = Event(
                name=title,
                description=description,
                start_date=when,
                location_name=location,
            )

            agenda_url = row.xpath(
                ".//a[contains(text(),'More about this event')]/@href")
            if agenda_url != []:
                event.add_document("Details and Agenda",
                                   agenda_url[0],
                                   media_type="text/html")

            if "committee meeting" in title.lower():
                com_name = title.replace("Committee Meeting", "").strip()
                event.add_participant(com_name, type="commitee", note="host")

            event.add_source(url)

            yield event

        if page.xpath("//a[contains(text(), 'Upcoming Events')]"):
            next_url = page.xpath(
                "//a[contains(text(), 'Upcoming Events')]/@href")[0]
            yield from self.scrape_cal_page(next_url)
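
scrape_cal_page pages itself: whenever an "Upcoming Events" link is present, it recurses into the next page via yield from, so callers see one flat stream of events. The pattern in the abstract, with hypothetical fetch/find_next callables standing in for the lxml code:

    def scrape_pages(url, fetch, find_next):
        # fetch(url) -> (items, parsed_page); find_next(page) -> next URL or None
        items, page = fetch(url)
        yield from items
        next_url = find_next(page)
        if next_url:
            yield from scrape_pages(next_url, fetch, find_next)

Since Python does not optimize tail calls, a very long chain of pages could in principle hit the recursion limit; a while loop is the iterative equivalent.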
Example #7
    def parse_event(self, row, chamber):
        # sample event available at http://www.akleg.gov/apptester.html
        committee_code = row.xpath("string(Sponsor)").strip()

        if committee_code in self.COMMITTEES[chamber]:
            committee_name = "{} {}".format(
                self.COMMITTEES_PRETTY[chamber],
                self.COMMITTEES[chamber][committee_code]["name"],
            )
        else:
            committee_name = "{} {}".format(
                self.COMMITTEES_PRETTY[chamber],
                "MISCELLANEOUS",
            )

        name = "{} {}".format(self.COMMITTEES_PRETTY[chamber],
                              row.xpath("string(Title)").strip())

        # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
        if name == "":
            name = committee_name

        location = row.xpath("string(Location)").strip()

        # events with no location all seem to be committee hearings
        if location == "":
            location = "Alaska State Capitol, 120 4th St, Juneau, AK 99801"

        start_date = dateutil.parser.parse(row.xpath("string(Schedule)"))
        # todo: do i need to self._TZ.localize() ?

        event = Event(start_date=start_date, name=name, location_name=location)

        event.add_source("http://w3.akleg.gov/index.php#tab4")

        if committee_code in self.COMMITTEES[chamber]:
            event.add_participant(committee_name,
                                  type="committee",
                                  note="host")

        for item in row.xpath("Agenda/Item"):
            agenda_desc = item.xpath("string(Text)").strip()
            if agenda_desc != "":
                agenda_item = event.add_agenda_item(description=agenda_desc)
                if item.xpath("BillRoot"):
                    bill_id = item.xpath("string(BillRoot)")
                    # AK Bill ids have a bunch of extra spaces
                    bill_id = re.sub(r"\s+", " ", bill_id)
                    agenda_item.add_bill(bill_id)

        yield event
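
The Alaska bill ids arrive padded with runs of whitespace, collapsed by a single re.sub. The same one-liner in isolation:

    import re

    bill_id = "HB    123"  # hypothetical padded id, as described in the comment
    bill_id = re.sub(r"\s+", " ", bill_id)
    print(bill_id)  # "HB 123"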
Example #8
    def scrape_lower_item(self, page):
        # print(lxml.etree.tostring(page, pretty_print=True))
        com = self.table_row_content(page, "Committee:")
        when_date = self.table_row_content(page, "Date:")
        when_time = self.table_row_content(page, "Time:")
        location = self.table_row_content(page, "Location:")

        if "house hearing room" in location.lower():
            location = "{}, {}".format(
                location, "201 W Capitol Ave, Jefferson City, MO 65101")

        # fix some broken times, e.g. '12 :00'
        when_time = when_time.replace(" :", ":")
        # a.m. and p.m. seem to confuse dateutil.parser
        when_time = when_time.replace("A.M.", "AM").replace("P.M.", "PM")

        # some times have extra info after the AM/PM
        if "upon" in when_time:
            when_time = when_time.split("AM", 1)[0]
            when_time = when_time.split("PM", 1)[0]

        # fix '- Upcoming', '- In Progress'  in dates
        when_date = re.sub(r"- (.*)", "", when_date).strip()

        try:
            start_date = dateutil.parser.parse(f"{when_date} {when_time}")
        except dateutil.parser._parser.ParserError:
            start_date = dateutil.parser.parse(when_date)

        start_date = self._TZ.localize(start_date)

        event = Event(start_date=start_date, name=com, location_name=location)

        event.add_source("https://house.mo.gov/HearingsTimeOrder.aspx")

        event.add_participant(com, type="committee", note="host")

        # different from general MO link xpath due to the <b>
        house_link_xpath = ('.//a[contains(@href, "Bill.aspx") '
                            'or contains(@href, "bill.aspx")]/b/text()')

        for bill_title in page.xpath(house_link_xpath):
            bill_no = bill_title.split("--")[0].strip()
            bill_no = bill_no.replace("HCS", "").strip()

            agenda_item = event.add_agenda_item(description=bill_title)
            agenda_item.add_bill(bill_no)

        yield event
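
The try/except around dateutil is the "parse date+time, fall back to date only" idiom several of these scrapers share. A compact version; note that dateutil.parser.ParserError is the public spelling of the _parser.ParserError caught above (available in dateutil >= 2.8.1):

    import dateutil.parser

    def parse_with_fallback(when_date, when_time):
        # try the full timestamp; fall back to the bare date if the time is junk
        try:
            return dateutil.parser.parse(f"{when_date} {when_time}")
        except dateutil.parser.ParserError:
            return dateutil.parser.parse(when_date)

    print(parse_with_fallback("Jan 5, 2022", "12:00 PM"))          # full precision
    print(parse_with_fallback("Jan 5, 2022", "upon adjournment"))  # date only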
Example #9
    def scrape_chamber(self, chamber, session, start, end):
        page = self.get_xml(start, end)

        for row in xpath(page, "//wa:CommitteeMeeting"):
            event_cancelled = xpath(row, "string(wa:Cancelled)")
            if event_cancelled == "true":
                continue

            event_chamber = xpath(row, "string(wa:Agency)")
            if self.chambers[event_chamber] != chamber:
                continue

            event_date = datetime.datetime.strptime(
                xpath(row, "string(wa:Date)"), "%Y-%m-%dT%H:%M:%S"
            )
            event_date = self._tz.localize(event_date)
            event_com = xpath(row, "string(wa:Committees/wa:Committee/wa:LongName)")
            agenda_id = xpath(row, "string(wa:AgendaId)")
            notes = xpath(row, "string(wa:Notes)")
            room = xpath(row, "string(wa:Room)")
            building = xpath(row, "string(wa:Building)")
            # XML has a wa:Address but it seems useless
            city = xpath(row, "string(wa:City)")
            state = xpath(row, "string(wa:State)")

            location = "{}, {}, {} {}".format(room, building, city, state)

            event = Event(
                name=event_com,
                start_date=event_date,
                location_name=location,
                description=notes,
            )

            source_url = (
                "https://app.leg.wa.gov/committeeschedules/Home/Agenda/{}".format(
                    agenda_id
                )
            )
            event.add_source(source_url)

            event.add_participant(event_com, type="committee", note="host")

            event.extras["agendaId"] = agenda_id

            self.scrape_agenda_items(agenda_id, event)

            yield event
Example #10
    def parse_div(self, row, chamber, com):
        cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
        # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
        title, location, start_date, end_date = self.parse_gcal(cal_link)

        event = Event(start_date=start_date,
                      end_date=end_date,
                      name=title,
                      location_name=location)

        event.add_source(
            "http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx")

        for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
            description = item.xpath("string(.)").strip()
            agenda = event.add_agenda_item(description=description)

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
            description = item.xpath("string(.)").strip()
            agenda = event.add_agenda_item(description=description)

            event.add_document(
                description,
                item.xpath("@href")[0],
                media_type="application/pdf",
                on_duplicate="ignore",
            )

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]'
                              '[./div[@class="col-xs-1 Item"]]'):
            description = item.xpath("string(.)").strip()
            agenda = event.add_agenda_item(description=description)

            bill = item.xpath(
                './/div[@class="col-xs-1 Item"]/a/text()')[0].strip()
            agenda.add_bill(bill)

        video = row.xpath('.//a[./span[@class="OnDemand"]]')
        if video:
            event.add_media_link("Video of Hearing",
                                 video[0].xpath("@href")[0], "text/html")

        if "subcommittee" in title.lower():
            subcom = title.split("-")[0].strip()
            event.add_participant(subcom, type="committee", note="host")
        else:
            event.add_participant(com, type="committee", note="host")
        yield event
Example #11
    def scrape(self, session=None, chamber=None):
        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)

        url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
        page = self.get(url)
        page = csv.reader(StringIO(page.text), delimiter="|")

        for row in page:
            # Deal with embedded newline characters, which cause fake new rows
            LINE_LENGTH = 11
            while len(row) < LINE_LENGTH:
                row += next(page)

            desc = row[7].strip()

            match = re.match(r"^(.*)- (HOUSE|SENATE)$", desc)
            if match:

                comm = match.group(1).strip()
                comm = re.sub(r"\s+", " ", comm)
                location = row[5].strip() or "Unknown"
                when = datetime.datetime.strptime(row[2], "%Y-%m-%d %H:%M:%S")
                when = self._tz.localize(when)
                # Only assign events to a session if they are in the same year
                # Given that session metadata have some overlap and
                # missing end dates, this is the best option available
                session_year = int(session[:4])
                if session_year != when.year:
                    continue

                description = "%s MEETING" % comm
                event = Event(
                    name=description,
                    start_date=when,
                    location_name=location,
                    description=description,
                )
                event.add_source(url)

                event.add_participant(comm, type="committee", note="host")
                # time = row[3].strip()
                # if time in _TIMECODES:
                #     event['notes'] = TIMECODES[time]

                yield event
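
The row-merging while loop compensates for unescaped newlines in the pipe-delimited feed: a short row keeps absorbing the next physical line until it reaches the expected field count. The same idea on a hypothetical two-line record:

    import csv
    from io import StringIO

    LINE_LENGTH = 4
    # hypothetical feed; the second record has an embedded newline in its third field
    text = "a|b|c|d\ne|f|g1\ng2|h\n"

    reader = csv.reader(StringIO(text), delimiter="|")
    for row in reader:
        while len(row) < LINE_LENGTH:
            row += next(reader)  # absorb the continuation line
        print(row)  # ['a', 'b', 'c', 'd'], then ['e', 'f', 'g1', 'g2', 'h']

As in the scraper, a merged row may end up longer than LINE_LENGTH; the loop only guarantees a minimum field count.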
Example #12
    def scrape_page(self, url, session, chamber):
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

        tables = doc.xpath("//table[@cellpadding='3']")
        info = tables[0]
        rows = info.xpath(".//tr")
        metainf = {}
        for row in rows:
            tds = row.xpath(".//td")
            key = tds[0].text_content().strip()
            value = tds[1].text_content().strip()
            metainf[key] = value

        where = metainf["Location:"]
        subject_matter = metainf["Subject Matter:"]
        description = "{}, {}".format(ctty_name, subject_matter)

        datetime = metainf["Scheduled Date:"]
        datetime = re.sub(r"\s+", " ", datetime)
        repl = {"AM": " AM", "PM": " PM"}  # Space shim.
        for r in repl:
            datetime = datetime.replace(r, repl[r])
        datetime = self.localize(
            dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p"))

        event = Event(description, start_date=datetime, location_name=where)
        event.add_source(url)

        if ctty_name.startswith("Hearing Notice For"):
            ctty_name = ctty_name.replace("Hearing Notice For", "").strip()
        event.add_participant(ctty_name, "organization")

        bills = tables[1]
        for bill in bills.xpath(".//tr")[1:]:
            tds = bill.xpath(".//td")
            if len(tds) < 4:
                continue
            # First, let's get the bill ID:
            bill_id = tds[0].text_content()
            agenda_item = event.add_agenda_item(bill_id)
            agenda_item.add_bill(bill_id)

        return event
Example #13
    def scrape_event_page(self, session, chamber, url, datetime):
        page = self.lxmlize(url)
        info = page.xpath("//p")
        metainfo = {}
        plaintext = ""
        for p in info:
            content = re.sub(r"\s+", " ", p.text_content())
            plaintext += content + "\n"
            if ":" in content:
                key, val = content.split(":", 1)
                metainfo[key.strip()] = val.strip()
        committee = metainfo["COMMITTEE"]
        where = metainfo["PLACE"]
        if "CHAIR" in where:
            where, chair = where.split("CHAIR:")
            metainfo["PLACE"] = where.strip()
            metainfo["CHAIR"] = chair.strip()

        chair = None
        if "CHAIR" in metainfo:
            chair = metainfo["CHAIR"]

        plaintext = re.sub(r"\s+", " ", plaintext).strip()
        regexp = r"(S|J|H)(B|M|R) (\d+)"
        bills = re.findall(regexp, plaintext)

        event = Event(name=committee,
                      start_date=self._tz.localize(datetime),
                      location_name=where)
        event.dedupe_key = url

        event.add_source(url)
        event.add_participant(committee, type="committee", note="host")
        if chair is not None:
            event.add_participant(chair, type="legislator", note="chair")

        # add a single agenda item, attach all bills
        agenda = event.add_agenda_item(plaintext)

        for bill in bills:
            chamber, type, number = bill
            bill_id = "%s%s %s" % (chamber, type, number)
            agenda.add_bill(bill_id)

        yield event
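
re.findall with a grouped pattern returns tuples, which the loop above reassembles into bill ids. The extraction step alone, on hypothetical agenda text:

    import re

    plaintext = "Public hearing on HB 123 and SB 45; also SR 9."  # hypothetical
    for chamber, bill_type, number in re.findall(r"(S|J|H)(B|M|R) (\d+)", plaintext):
        print(f"{chamber}{bill_type} {number}")  # HB 123, SB 45, SR 9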
Example #14
    def scrape(self):
        url = "https://www.ncleg.gov/LegislativeCalendar/"
        page = self.lxmlize(url)
        page.make_links_absolute(url)
        for day_row in page.xpath('//div[@class="row cal-event-day"]'):

            date = day_row.xpath(
                './/div[contains(@class, "cal-event-day-full")]/text()'
            )[0].strip()
            for row in day_row.xpath(
                    './/div[contains(@class, "cal-event row")]'):
                # first cal-event-row sometimes contains full date, skip that
                time = row.xpath(
                    'div[contains(@class,"col-12 text-left col-sm-3 text-sm-right")]/text()'
                )[0].strip()

                event_row = row.xpath(
                    'div[contains(@class,"col-12 col-sm-9 col-md-12 ")]')[0]

                # skip floor sessions
                if event_row.xpath(
                        './/a[contains(text(), "Session Convenes")]'):
                    continue

                chamber = ""
                if len(
                        event_row.xpath(
                            'span[contains(@class, "text-dark font-weight-bold")]/text()'
                        )):
                    chamber = event_row.xpath(
                        'span[contains(@class, "text-dark font-weight-bold")]/text()'
                    )[0].strip()
                    chamber = chamber.replace(":", "")

                # sometimes there are unlinked events, usually just press conferences
                if not event_row.xpath('a[contains(@href,"/Committees/")]'):
                    continue

                com_link = event_row.xpath(
                    'a[contains(@href,"/Committees/")]')[0]
                com_name = com_link.text_content().strip()
                com_name = f"{chamber} {com_name}".strip()

                com_url = com_link.xpath("@href")[0]

                where = (row.xpath('div[contains(@class,"col-12 offset-sm-3")]'
                                   )[0].text_content().strip())
                where = where.replace("STREAM", "")

                when = f"{date} {time}"
                try:
                    when = dateutil.parser.parse(when)
                    # occasionally they'd do 9am-1pm which confuses the TZ detection
                    when = self._tz.localize(when)
                except (ParserError, ValueError):
                    self.warning(
                        f"Unable to parse {time}, only using day component")
                    when = dateutil.parser.parse(date)
                    when = self._tz.localize(when).date()

                event = Event(
                    name=com_name,
                    start_date=when,
                    location_name=where,
                    classification="committee-meeting",
                )
                event.add_source(com_url)

                event.add_participant(com_name, type="committee", note="host")

                # NOTE: if you follow the committee link, there are agenda PDF links
                # but they don't load at all as of 2021-02-01 -- showerst

                for agenda_row in event_row.xpath(".//p"):
                    agenda_text = agenda_row.text_content().strip()
                    if agenda_text != "":
                        agenda = event.add_agenda_item(agenda_text)

                        for bill_row in agenda_row.xpath(
                                './/a[contains(@href,"BillLookUp")]/text()'):
                            agenda.add_bill(bill_row.split(":")[0])

                yield event
Example #15
    def scrape_lower(self):
        url = "https://www.house.leg.state.mn.us/Schedules/All"
        page = self.lxmlize(url)

        for row in page.xpath('//div[contains(@class,"my-2 d-print-block")]'):
            # print(row.text_content())

            # skip floor sessions and unlinked events
            if not row.xpath(
                    'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b'
            ):
                continue

            # skip joint ones, we'll get those from the senate API
            if row.xpath('div[contains(@class,"card-header bg-joint")]'):
                continue

            # top-level committee
            com = row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
            )[0].strip()
            com_link = row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/@href'
            )[0]

            when = (row.xpath(
                'div[contains(@class,"card-header")]/span[contains(@class,"text-white")]/text()'
            )[0].replace("\r\n", "").strip())
            when = dateutil.parser.parse(when)
            when = self._tz.localize(when)

            if row.xpath('.//b[.="Location:"]'):
                where = row.xpath(
                    './/b[.="Location:"]/following-sibling::text()[1]'
                )[0].strip()
            else:
                where = "See committee page"

            if row.xpath('.//b[.="Agenda:"]'):
                desc = "\n".join(
                    row.xpath('.//b[.="Agenda:"]/following-sibling::div/text()'
                              )).strip()
            else:
                desc = "See committee page"

            event = Event(
                name=com,
                start_date=when,
                location_name=where,
                classification="committee-meeting",
                description=desc,
            )

            event.add_source(com_link)

            for bill in get_bill_ids(desc):
                event.add_bill(bill)

            if row.xpath(
                    ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]"
            ):
                agenda = event.add_agenda_item("Bills")
                for bill_id in row.xpath(
                        ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]/text()"
                ):
                    agenda.add_bill(bill_id.strip())

            for attachment in row.xpath(".//ul/li/div/a"):
                doc_url = attachment.xpath("@href")[0]
                doc_name = attachment.xpath("text()")[0].strip()
                # if they don't provide a name just use the filename
                if doc_name == "":
                    parsed_url = urlparse(doc_url)
                    doc_name = os.path.basename(parsed_url)

                # sometimes broken links to .msg files (emails?) are attached,
                # they always 404.
                if doc_url.endswith(".msg"):
                    continue
                media_type = get_media_type(doc_url)
                event.add_document(doc_name,
                                   doc_url,
                                   media_type=media_type,
                                   on_duplicate="ignore")

            for committee in row.xpath(
                    'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
            ):
                event.add_participant(committee, type="committee", note="host")

            yield event
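
When a document link has no text, the filename from the URL path serves as the name. Note that os.path.basename needs the path string, not the ParseResult itself. In isolation, with a hypothetical URL:

    import os
    from urllib.parse import urlparse

    doc_url = "https://www.house.leg.state.mn.us/comm/docs/agenda.pdf"  # hypothetical
    doc_name = os.path.basename(urlparse(doc_url).path)
    print(doc_name)  # agenda.pdf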
Example #16
    def scrape_senate(self):
        url = "https://www.senate.gov/general/committee_schedules/hearings.xml"

        page = self.get(url).content
        page = lxml.etree.fromstring(page)

        rows = page.xpath("//meeting")

        for row in rows:
            com = row.xpath("string(committee)")

            if com == "":
                continue

            com = "Senate {}".format(com)

            address = row.xpath("string(room)")
            parts = address.split("-")
            building_code = parts[0]

            if self.buildings.get(building_code):
                address = "{}, Room {}".format(
                    self.buildings.get(building_code), parts[1])

            agenda = row.xpath("string(matter)")

            try:
                event_date = datetime.datetime.strptime(
                    row.xpath("string(date)"), "%d-%b-%Y %H:%M %p")
            except ValueError:
                event_date = datetime.datetime.strptime(
                    row.xpath("string(date)"), "%d-%b-%Y")

            event_date = self._TZ.localize(event_date)

            event = Event(start_date=event_date,
                          name=com,
                          location_name=address)

            agenda_item = event.add_agenda_item(description=agenda)

            # ex: Business meeting to consider S.785, to improve mental...
            matches = re.findall(r"\s(\w+)\.(\d+),", agenda)

            if matches:
                match = matches[0]
                bill_type = match[0]
                bill_number = match[1]
                bill_name = "{} {}".format(bill_type, bill_number)
                agenda_item.add_bill(bill_name)

            event.add_participant(
                com,
                type="committee",
                note="host",
            )

            event.add_source(
                "https://www.senate.gov/committees/hearings_meetings.htm")

            yield event
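
The pattern r"\s(\w+)\.(\d+)," picks the first dotted bill reference out of the agenda blurb. Using the sample from the comment above:

    import re

    agenda = "Business meeting to consider S.785, to improve mental health care"
    matches = re.findall(r"\s(\w+)\.(\d+),", agenda)
    if matches:
        bill_type, bill_number = matches[0]
        print(f"{bill_type} {bill_number}")  # "S 785"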
Example #17
    def house_meeting(self, xml, source_url):

        title = xml.xpath("string(//meeting-details/meeting-title)")

        meeting_date = xml.xpath("string(//meeting-date/calendar-date)")
        start_time = xml.xpath("string(//meeting-date/start-time)")
        end_time = xml.xpath("string(//meeting-date/end-time)")

        start_dt = datetime.datetime.strptime(
            "{} {}".format(meeting_date, start_time), "%Y-%m-%d %H:%M:%S")

        start_dt = self._TZ.localize(start_dt)

        end_dt = None

        if end_time != "":
            end_dt = datetime.datetime.strptime(
                "{} {}".format(meeting_date, end_time), "%Y-%m-%d %H:%M:%S")
            end_dt = self._TZ.localize(end_dt)

        building = xml.xpath(
            "string(//meeting-details/meeting-location/capitol-complex/building)"
        )

        address = "US Capitol"
        if building != "Select one":
            if self.buildings.get(building):
                building = self.buildings.get(building)

            room = xml.xpath(
                "string(//meeting-details/meeting-location/capitol-complex/room)"
            )
            address = "{}, Room {}".format(building, room)

        event = Event(start_date=start_dt, name=title, location_name=address)

        event.add_source(source_url)

        coms = xml.xpath(
            "//committees/committee-name | //subcommittees/committee-name")
        for com in coms:
            com_name = com.xpath("string(.)")
            com_name = "House {}".format(com_name)
            event.add_participant(
                com_name,
                type="committee",
                note="host",
            )

        docs = xml.xpath("//meeting-documents/meeting-document")
        for doc in docs:
            doc_name = doc.xpath("string(description)")
            doc_files = doc.xpath("files/file")
            for doc_file in doc_files:
                media_type = self.media_types[doc_file.get("doc-type")]
                url = doc_file.get("doc-url")

                if doc.get("type") in ["BR", "AM", "CA"]:
                    if doc_name == "":
                        doc_name = doc.xpath("string(legis-num)").strip()
                    matches = re.findall(r"([\w|\.]+)\s+(\d+)", doc_name)

                    if matches:
                        match = matches[0]
                        bill_type = match[0].replace(".", "")
                        bill_number = match[1]
                        bill_name = "{} {}".format(bill_type, bill_number)
                        agenda = event.add_agenda_item(description=bill_name)
                        agenda.add_bill(bill_name)

                if doc_name == "":
                    try:
                        doc_name = self.hearing_document_types[doc.get("type")]
                    except KeyError:
                        self.warning("Unable to find document type: {}".format(
                            doc.get("type")))

                event.add_document(doc_name,
                                   url,
                                   media_type=media_type,
                                   on_duplicate="ignore")

        yield event
Example #18
    def scrape_event_page(self, url, event_type):
        page = self.lxmlize(url)
        page.make_links_absolute("https://malegislature.gov/")

        title = page.xpath('string(//div[contains(@class,"followable")]/h1)')
        title = title.replace("Hearing Details", "").strip()
        title = title.replace("Special Event Details", "")

        start_day = page.xpath(
            '//dl[contains(@class,"eventInformation")]/dd[2]/text()[last()]'
        )[0].strip()
        start_time = page.xpath(
            'string(//dl[contains(@class,"eventInformation")]/dd[3])').strip()

        # If an event gets moved, ignore the original time
        start_time = re.sub(
            r"Original Start Time(.*)New Start Time(\n*)",
            "",
            start_time,
            flags=re.IGNORECASE | re.MULTILINE | re.DOTALL,
        )
        location = page.xpath(
            'string(//dl[contains(@class,"eventInformation")]/dd[4]//a)'
        ).strip()

        if location == "":
            location = page.xpath(
                'string(//dl[contains(@class,"eventInformation")]/dd[4])'
            ).strip()

        description = page.xpath(
            'string(//dl[contains(@class,"eventInformation")]/dd[5])').strip()

        start_date = self._TZ.localize(
            dateutil.parser.parse("{} {}".format(start_day, start_time)))

        event = Event(
            start_date=start_date,
            name=title,
            location_name=location,
            description=description,
        )

        event.add_source(url)

        agenda_rows = page.xpath(
            '//div[contains(@class,"col-sm-8") and .//h2[contains(@class,"agendaHeader")]]'
            '/div/div/div[contains(@class,"panel-default")]')

        for row in agenda_rows:
            # only select the text node, not the spans
            agenda_title = row.xpath(
                "string(.//h4/a/text()[normalize-space()])").strip()

            if agenda_title == "":
                agenda_title = row.xpath(
                    "string(.//h4/text()[normalize-space()])").strip()

            agenda = event.add_agenda_item(description=agenda_title)

            bills = row.xpath(".//tbody/tr/td[1]/a/text()")
            for bill in bills:
                bill = bill.strip().replace(".", " ")
                agenda.add_bill(bill)

        if event_type == "Hearing":
            event.add_participant(title, type="committee", note="host")

        yield event
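
The moved-event cleanup relies on re.DOTALL so that ".*" can span line breaks between the original and new start times. The substitution in isolation, on hypothetical rescheduled text:

    import re

    start_time = "Original Start Time 10:00 AM New Start Time\n1:00 PM"  # hypothetical
    start_time = re.sub(
        r"Original Start Time(.*)New Start Time(\n*)",
        "",
        start_time,
        flags=re.IGNORECASE | re.MULTILINE | re.DOTALL,
    )
    print(start_time.strip())  # "1:00 PM"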
Example #19
    def scrape_agenda(self, chamber, url):
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")
        if date_time == []:
            return

        date_time = date_time[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf["DATE:"]
        time = metainf["TIME:"]
        where = metainf["PLACE:"]

        # check for duration in time
        if " - " in time:
            start, end = time.split(" - ")
            am_pm_srch = re.search("(?i)(am|pm)", end)
            if am_pm_srch:
                time = " ".join([start, am_pm_srch.group().upper()])
            else:
                time = start

        fmts = [
            "%A, %B %d, %Y", "%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y %I:%M"
        ]

        event_desc = "Meeting Notice"
        if "Rise" in time:
            datetime = date
            event_desc = "Meeting Notice: Starting at {}".format(time)
        else:
            datetime = "%s %s" % (date, time)
        if "CANCELLED" in datetime.upper() or "CANCELED" in datetime.upper():
            return

        if page.xpath("//span[@id='lblSession']"):
            event_desc = (page.xpath("//span[@id='lblSession']")
                          [0].text_content().strip())

        transtable = {
            "P.M": "PM",
            "PM.": "PM",
            "P.M.": "PM",
            "A.M.": "AM",
            "POSTPONED": "",
            "RESCHEDULED": "",
            "and Rise of the Senate": "",
        }
        for trans in transtable:
            datetime = datetime.replace(trans, transtable[trans])

        datetime = datetime.strip()

        for fmt in fmts:
            try:
                datetime = dt.datetime.strptime(datetime, fmt)
                break
            except ValueError:
                continue

        event = Event(name=event_desc,
                      start_date=self._tz.localize(datetime),
                      location_name=where)

        event.add_document("Agenda",
                           url,
                           media_type="text/html",
                           on_duplicate="ignore")
        event.add_source(url)

        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib["href"]
            event.add_document(
                bill.text_content(),
                bill_ft,
                media_type="application/pdf",
                on_duplicate="ignore",
            )
            root = bill.xpath("../../*")
            root = [x.text_content() for x in root]
            bill_id = "".join(root).replace("\u00a0", "")

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = bill.getparent().getparent().text_content().replace(
                "\u00a0", " ")

            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            item = event.add_agenda_item(descr)
            item.add_bill(bill_id)

        # sometimes bill references are just plain links or plain text.
        bill_links = page.xpath('//a[contains(@href,"/BillText/")]/@href')
        linked_bills = set()
        for bill_link in bill_links:
            bill_nums = re.findall(r"\/(\w+\d+)\.pdf",
                                   bill_link,
                                   flags=re.IGNORECASE)
            for bill_num in bill_nums:
                linked_bills.add(bill_num)

        # sometimes (H 1234) ends up in the title or somewhere else unlinked
        text_bill_nums = re.findall(r"\((\w{1,3}\s?\d+)\)",
                                    page.text_content(),
                                    flags=re.IGNORECASE)
        for bill_num in text_bill_nums:
            bill_num = bill_num.replace(" ", "")
            linked_bills.add(bill_num)

        if len(linked_bills) != 0:
            item = event.add_agenda_item("Bills under consideration")
            for bill in linked_bills:
                item.add_bill(bill)

        if page.xpath("//span[@id='lblSession']"):
            committee = page.xpath(
                "//span[@id='lblSession']")[0].text_content()
            event.add_participant(committee, "committee", note="host")

        yield event
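
The fmts loop is a format-list fallback: try each strptime layout and keep the first that parses. As a standalone helper (note the scraper above leaves datetime as the raw string if every format fails, which _tz.localize would then reject):

    import datetime as dt

    FMTS = [
        "%A, %B %d, %Y %I:%M %p",
        "%A, %B %d, %Y %I:%M",
        "%A, %B %d, %Y",
    ]

    def parse_first_match(text):
        # return the first layout that parses; raise if none do
        for fmt in FMTS:
            try:
                return dt.datetime.strptime(text, fmt)
            except ValueError:
                continue
        raise ValueError(f"no known format matched: {text!r}")

    print(parse_first_match("Tuesday, March 01, 2022 1:30 PM"))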
Example #20
    def scrape_chamber(self, chamber):
        session = self.latest_session()
        session_id = session_metadata.session_id_meta_data[session]

        chamber_abbr = self.chamber_codes[chamber]

        com_url = (
            "https://apps.azleg.gov/api/Committee/?includeOnlyCommitteesWithAgendas=true"
            "&legislativeBody={}&sessionId={}&standingOnly=true&interimOnly=false&jointCommitteesOnly=false"
        )
        com_url = com_url.format(chamber_abbr, session_id)

        coms = self.get(com_url).json()

        for com in coms:
            # joint committees get returned by both endpoints, so skip one
            if com["LegislativeBody"] != chamber_abbr:
                continue

            #  https://apps.azleg.gov/api/Agenda/?showPassed=true&sessionId=123
            #  &isInterimAgenda=false&body=S&includeItems=false&committeeId=1960
            events_url = (
                "https://apps.azleg.gov/api/Agenda/?includeItems=true&showPassed=true"
                "&sessionId={}&isInterimAgenda=false&body={}&committeeId={}")
            events_url = events_url.format(session_id, chamber_abbr,
                                           com["CommitteeId"])
            events_list = self.get(events_url).json()

            for row in events_list:
                if (row["AgendaCanceled"] is True
                        or "not meeting" in row["Time"].lower()):
                    continue

                title = "{} {}".format(self.code_chambers[chamber_abbr],
                                       row["CommitteeName"])

                # fix for dateutil parser confusion
                row["Time"] = row["Time"].replace("A.M.",
                                                  "AM").replace("P.M.", "PM")

                if "upon rec" not in row["Time"].lower():
                    time = re.findall(r"(\d+:\d+\s+[A|P]M)", row["Time"])
                    if len(time) == 0:
                        self.warning(
                            f"Unable to get time for {row['Time']} on {title}")
                        time = "00:00:00"
                    else:
                        time = time[0]

                    time = re.sub(r"\s+", " ", time)
                else:
                    time = ""

                when = dateutil.parser.parse(f"{row['Date']} {time}")
                when = self._tz.localize(when)

                where = "{}, Room {}".format(self.address, row["Room"])

                description = ""

                event = Event(
                    name=title,
                    location_name=where,
                    start_date=when,
                    description=description,
                )

                event.add_document("Agenda",
                                   row["HttpPath"],
                                   media_type="text/html")
                event.add_document("Agenda",
                                   row["HttpPdfPath"],
                                   media_type="application/pdf")

                event.add_participant(row["CommitteeName"],
                                      type="committee",
                                      note="host")

                for item in row["Items"]:
                    agenda_item = event.add_agenda_item(item["Description"])
                    bill_id = re.findall(r"^(.*?)\s", item["Description"])
                    bill_id = bill_id[0]
                    agenda_item.add_bill(bill_id)

                    for speaker in item["RequestsToSpeak"]:
                        speaker_title = speaker["Name"]
                        if speaker["Representing"] != "Self":
                            speaker_title = (
                                f"{speaker['Name']} ({speaker['Representing']})"
                            )

                        event.add_participant(speaker_title,
                                              type="person",
                                              note="speaker")

                event.add_source(
                    "https://apps.azleg.gov/BillStatus/AgendaSearch")
                yield event
Example #21
    def scrape_committee_page(self, url):
        page = self.get(url, headers=self.cf_headers).content
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        com = page.xpath(
            '//div[contains(@class, "pull-left span8")]/h1/text()')[0].strip()

        for row in page.xpath('//div[contains(@id, "agenda-item")]'):
            # status = "tentative"
            meta = row.xpath(
                'div[contains(@class,"accordion-heading-agenda")]/a')[0]

            date = meta.xpath("text()")[0].strip()

            time_and_loc = meta.xpath("span/text()")[0].strip()
            time_and_loc = time_and_loc.split("\n")
            time = time_and_loc[0]
            loc = time_and_loc[1]

            if loc == "":
                loc = "See Agenda"

            com = com.replace("(S)", "Senate").replace("(H)", "House")

            # Indiana has a LOT of undefined times, stuff like "15 mins after adj. of elections"
            # so just remove the time component if it won't parse, and the user can go to the agenda
            try:
                when = dateutil.parser.parse(f"{date} {time}")
            except dateutil.parser._parser.ParserError:
                when = dateutil.parser.parse(date)
            when = self._tz.localize(when)

            if "cancelled" in time.lower():
                continue

            event = Event(
                name=com,
                start_date=when,
                location_name=loc,
                classification="committee-meeting",
            )

            event.add_source(url)
            event.add_participant(com, type="committee", note="host")

            if row.xpath('.//a[contains(text(), "View Agenda")]'):
                agenda_url = row.xpath(
                    './/a[contains(text(), "View Agenda")]/@href')[0]
                event.add_document("Agenda",
                                   agenda_url,
                                   media_type="application/pdf")

            if row.xpath('.//a[contains(text(), "Watch")]'):
                vid_url = row.xpath('.//a[contains(text(), "Watch")]/@href')[0]
                event.add_media_link("Video of Hearing",
                                     vid_url,
                                     media_type="text/html")

            if row.xpath('.//tr[contains(@class,"bill-container")]/td'):
                agenda = event.add_agenda_item("Bills under consideration")
                for bill_row in row.xpath(
                        './/tr[contains(@class,"bill-container")]'):
                    bill_id = bill_row.xpath(
                        ".//a[contains(@class,'bill-name-link')]/text()")[0]
                    agenda.add_bill(bill_id)

            yield event
Example #22
    def scrape_event_page(self, url, chamber):
        html = self.get(url).text
        page = lxml.html.fromstring(html)
        trs = page.xpath(
            "//table[@id='frg_mcommitteemeeting_MeetingTable']/tr")
        metainf = {}
        for tr in trs:
            tds = tr.xpath(".//td")
            if len(tds) <= 1:
                continue
            key = tds[0].text_content().strip()
            val = tds[1]
            metainf[key] = {"txt": val.text_content().strip(), "obj": val}

        if metainf == {}:
            return

        # Wednesday, 5/16/2012 3:00 pm
        datetime = "%s %s" % (
            metainf["Date"]["txt"],
            metainf["Time"]["txt"].replace(".", ""),
        )
        if "Cancelled" in datetime:
            return

        translate = {
            "noon": " PM",
            "a.m.": " AM",
            "am": " AM",  # This is due to a nasty line they had.
            "a.m": "AM",  # another weird one
        }

        for t in translate:
            if t in datetime:
                datetime = datetime.replace(t, translate[t])

        datetime = re.sub(r"\s+", " ", datetime)

        for text_to_remove in [
                "or after committees are given leave",
                "or later immediately after committees are given leave",
                "or later after committees are given leave by the House to meet",
                "**Please note time**",
        ]:
            datetime = datetime.split(text_to_remove)[0].strip()

        datetime = datetime.replace("p.m.", "pm")
        datetime = datetime.replace("Noon", "pm")
        try:
            datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
        except ValueError:
            datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I %p")
        where = metainf["Location"]["txt"]
        title = metainf["Committee(s)"]["txt"]  # XXX: Find a better title

        if chamber == "other":
            chamber = "joint"

        event = Event(name=title,
                      start_date=self._tz.localize(datetime),
                      location_name=where)
        event.add_source(url)
        event.add_source(mi_events)

        chair_name = metainf["Chair"]["txt"].strip()
        if chair_name:
            event.add_participant(chair_name, type="legislator", note="chair")
        else:
            self.warning("No chair found for event '{}'".format(title))

        event.add_participant(metainf["Committee(s)"]["txt"],
                              type="committee",
                              note="host")

        agenda = metainf["Agenda"]["obj"]
        agendas = agenda.text_content().split("\r")

        related_bills = agenda.xpath("//a[contains(@href, 'getObject')]")
        for bill in related_bills:
            # fall back to the agenda's full text if no line matches this bill
            description = agenda.text_content()
            for a in agendas:
                if bill.text_content() in a:
                    description = a

            item = event.add_agenda_item(description)
            item.add_bill(bill.text_content())

        yield event
Example #23
    def scrape_upper(self):
        listing_url = "https://www.senate.mo.gov/hearingsschedule/hrings.htm"

        html = self.get(listing_url).text

        # The HTML here isn't wrapped in a container per-event
        # which makes xpath a pain. So string split by <hr>
        # then parse each event's fragment for cleaner results
        for fragment in html.split("<hr />")[1:]:
            page = lxml.html.fromstring(fragment)

            when_date = self.row_content(page, "Date:")
            when_time = self.row_content(page, "Time:")
            when_time = re.sub("or upon .* recess", "", when_time)

            # fix for upon adjournment
            when_time = when_time.replace(
                "or upon morning adjournment whichever is later", "").strip()
            # 15/30/45 minutes/hours upon adjournment/recess
            when_time = re.sub(r"\d+ \w+ upon \w+",
                               "",
                               when_time,
                               flags=re.IGNORECASE)
            # a.m. and p.m. seem to confuse dateutil.parser
            when_time = when_time.replace("A.M.", "AM").replace("P.M.", "PM")

            location = self.row_content(page, "Room:")

            location = "{}, {}".format(
                location, "201 W Capitol Ave, Jefferson City, MO 65101")

            if not page.xpath(
                    '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
            ):
                continue

            com = page.xpath(
                '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
            )[0]
            com = com.split(", Senator")[0].strip()

            try:
                start_date = dateutil.parser.parse(f"{when_date} {when_time}")
            except dateutil.parser._parser.ParserError:
                start_date = dateutil.parser.parse(when_date)

            start_date = self._TZ.localize(start_date)

            event = Event(start_date=start_date,
                          name=com,
                          location_name=location)

            event.add_source(listing_url)

            event.add_participant(com, type="committee", note="host")

            for bill_table in page.xpath(
                    '//table[@width="85%" and @border="0"]'):
                bill_link = ""
                if bill_table.xpath(self.bill_link_xpath):
                    agenda_line = bill_table.xpath("string(tr[2])").strip()
                    agenda_item = event.add_agenda_item(
                        description=agenda_line)

                    bill_link = bill_table.xpath(
                        self.bill_link_xpath)[0].strip()
                    agenda_item.add_bill(bill_link)
                else:
                    agenda_line = bill_table.xpath("string(tr[1])").strip()
                    agenda_item = event.add_agenda_item(
                        description=agenda_line)

            yield event
Example #24
    def scrape_chamber(self, chamber, session):
        today = datetime.date.today()
        start_date = today - datetime.timedelta(days=10)
        end_date = today + datetime.timedelta(days=10)

        if chamber == "upper":
            chamber_abbrev = "S"
        else:
            chamber_abbrev = "H"

        url = ("http://www.legis.iowa.gov/committees/meetings/meetingsList"
               "Chamber?chamber=%s&bDate=%02d/%02d/"
               "%d&eDate=%02d/%02d/%d" % (
                   chamber_abbrev,
                   start_date.month,
                   start_date.day,
                   start_date.year,
                   end_date.month,
                   end_date.day,
                   end_date.year,
               ))

        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)
        for link in page.xpath("//div[contains(@class, 'meetings')]/table[1]/"
                               "tbody/tr[not(contains(@class, 'hidden'))]"):
            comm = link.xpath("string(./td[2]/a[1]/text())").strip()
            desc = comm + " Committee Hearing"

            location = link.xpath("string(./td[3]/text())").strip()

            when = link.xpath("string(./td[1]/span[1]/text())").strip()

            if "cancelled" in when.lower() or "upon" in when.lower():
                continue
            if "To Be Determined" in when:
                continue

            if "AM" in when:
                when = when.split("AM")[0] + " AM"
            else:
                when = when.split("PM")[0] + " PM"

            junk = ["Reception"]
            for key in junk:
                when = when.replace(key, "")

            when = re.sub(r"\s+", " ", when).strip()
            if "tbd" in when.lower():
                # OK. This is a partial date of some sort.
                when = datetime.datetime.strptime(when,
                                                  "%m/%d/%Y TIME - TBD %p")
            else:
                try:
                    when = datetime.datetime.strptime(when,
                                                      "%m/%d/%Y %I:%M %p")
                except ValueError:
                    try:
                        when = datetime.datetime.strptime(
                            when, "%m/%d/%Y %I %p")
                    except ValueError:
                        self.warning("error parsing timestamp %s", when)
                        continue

            event = Event(
                name=desc,
                description=desc,
                start_date=self._tz.localize(when),
                location_name=location,
            )

            event.add_source(url)
            event.add_participant(comm, note="host", type="committee")

            yield event
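
The nested try/except chain above can be flattened into a single loop over the known timestamp formats; a behavior-preserving sketch (parse_iowa_when is a hypothetical name, formats copied from the code above):

import datetime

def parse_iowa_when(when):
    # Try each known format in order; return None if none match.
    formats = ("%m/%d/%Y TIME - TBD %p", "%m/%d/%Y %I:%M %p", "%m/%d/%Y %I %p")
    for fmt in formats:
        try:
            return datetime.datetime.strptime(when, fmt)
        except ValueError:
            continue
    return None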
Example #25
    def lower_parse_page(self, url):
        page = self.lxmlize(url)
        tables = page.xpath("//table[@class='pubhrgtbl']")
        date = None
        coms = []
        for table in tables:
            metainf = {}
            rows = table.xpath(".//tr")
            for row in rows:
                tds = row.xpath("./*")
                if len(tds) < 2:
                    continue
                key, value = tds

                if key.tag == "th" and key.get("class") == "hrgdate":
                    date = key.text_content()
                    date = re.sub(r"\s+", " ", date)
                    date = re.sub(".*POSTPONED NEW DATE", "", date).strip()

                # Due to the html structure this shouldn't be an elif
                # It needs to fire twice in the same loop iteration
                if value.tag == "th" and value.get("class") == "commtitle":
                    coms = value.xpath(
                        './/div[contains(@class,"comm-txt")]/text()')

                elif key.tag == "td":
                    key = key.text_content().strip()
                    value = value.text_content().strip()
                    value = value.replace(u"\x96", "-")
                    value = re.sub(r"\s+", " ", value)
                    metainf[key] = value

            time = metainf["Time:"]
            repl = {"A.M.": "AM", "P.M.": "PM"}
            drepl = {"Sept": "Sep"}
            for r in repl:
                time = time.replace(r, repl[r])

            for r in drepl:
                date = date.replace(r, drepl[r])

            time = re.sub("-.*", "", time)
            time = time.strip()

            year = dt.datetime.now().year

            date = "%s %s %s" % (date, year, time)

            if "tbd" in date.lower():
                continue

            date = date.replace(" PLEASE NOTE NEW TIME", "")

            # Check if the event has been postponed.
            postponed = "POSTPONED" in date
            if postponed:
                date = date.replace(" POSTPONED", "")

            date_formats = ["%B %d %Y %I:%M %p", "%b. %d %Y %I:%M %p"]
            datetime = None
            for fmt in date_formats:
                try:
                    datetime = dt.datetime.strptime(date, fmt)
                except ValueError:
                    pass

            # If the datetime can't be parsed, bail.
            if datetime is None:
                return

            title_key = set(metainf) & set([
                "Public Hearing:",
                "Summit:",
                "Roundtable:",
                "Public Roundtable:",
                "Public Meeting:",
                "Public Forum:",
                "Meeting:",
            ])
            assert len(title_key) == 1, "Couldn't determine event title."
            title_key = list(title_key).pop()
            title = metainf[title_key]

            title = re.sub(r"\*\*Click here to view public hearing notice\*\*",
                           "", title)

            # If event was postponed, add a warning to the title.
            if postponed:
                title = "POSTPONED: %s" % title

            event = Event(
                name=title,
                start_date=self._tz.localize(datetime),
                location_name=metainf["Place:"],
            )
            event.extras = {"contact": metainf["Contact:"]}
            if "Media Contact:" in metainf:
                event.extras.update(media_contact=metainf["Media Contact:"])
            event.add_source(url)

            for com in coms:
                event.add_participant(com.strip(),
                                      type="committee",
                                      note="host")
                participant = event.participants[-1]
                participant["extras"] = ({
                    "chamber":
                    self.classify_committee(com)
                }, )

            yield event
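
classify_committee is not shown in this excerpt; it is assumed to map a committee name to a chamber label. A plausible sketch:

    def classify_committee(self, name):
        # Hypothetical: infer the chamber from the committee name.
        name = name.lower()
        if "assembly" in name:
            return "lower"
        if "senate" in name:
            return "upper"
        return "legislature"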
Example #26
    def scrape(self):
        url = "https://apps.legislature.ky.gov/legislativecalendar"

        page = self.get(url).content
        page = lxml.html.fromstring(page)

        for time_row in page.xpath(
                '//div[contains(@class,"TimeAndLocation")]'):
            date = (time_row.xpath(
                'preceding-sibling::div[contains(@class,"DateHeading")][1]')
                    [0].text_content().strip())

            status = "tentative"

            if time_row.xpath('div[contains(@class,"Cancelled")]'):
                status = "cancelled"

            row_text = time_row.text_content()
            row_text = row_text.replace("Noon", "PM")
            # upon recess (of House|Senate)
            row_text = re.sub(r"Upon Recess(\sof\s)?(House|Senate)?", "",
                              row_text)
            parts = re.split(r",|AM|PM", row_text)
            time = parts[0].strip()
            location = " ".join(
                x.replace(r"\xa0", "").strip() for x in parts[1:])

            when = f"{date} {time}"
            when = dateutil.parser.parse(when)
            when = self._tz.localize(when)

            com_links = time_row.xpath(
                'following-sibling::div[contains(@class,"CommitteeName")][1]/a'
            )
            if not com_links:
                continue

            com_name = com_links[0].text_content().strip()

            event = Event(
                name=com_name,
                start_date=when,
                classification="committee-meeting",
                location_name=location,
                status=status,
            )

            agenda_divs = time_row.xpath(
                'following-sibling::div[contains(@class,"Agenda")][1]')
            if agenda_divs:
                agenda_row = agenda_divs[0]
                agenda_text = agenda_row.text_content().strip()

                agenda = event.add_agenda_item(agenda_text)

                for bill_link in agenda_row.xpath(
                        './/a[contains(@href,"/record/")]'):
                    agenda.add_bill(bill_link.text_content().strip())

            event.add_participant(com_name, note="host", type="committee")

            com_page_link = com_links[0].get("href")

            docs = self.scrape_com_docs(com_page_link)
            lookup_date = when.strftime("%Y-%m-%d")

            if lookup_date in docs["mats"]:
                for mat in docs["mats"][lookup_date]:
                    event.add_document(mat["text"],
                                       mat["url"],
                                       on_duplicate="ignore")

            if lookup_date in docs["minutes"]:
                for mat in docs["minutes"][lookup_date]:
                    event.add_document(mat["text"],
                                       mat["url"],
                                       on_duplicate="ignore")

            event.add_source(url)

            yield event
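
scrape_com_docs is not shown here; the lookups above imply it returns committee documents grouped by kind and then by date. A hypothetical shape:

docs = {
    "mats": {
        "2023-02-01": [{"text": "Meeting Materials", "url": "https://example.com/mat.pdf"}],
    },
    "minutes": {
        "2023-02-01": [{"text": "Minutes", "url": "https://example.com/min.pdf"}],
    },
}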
Example #27
    def scrape_house_weekly_schedule(self):
        url = "https://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [
            row for row in meeting_rows
            if row.xpath("./td[1]")[0].text_content().replace("\xa0", "")
            and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
            and "Not Meeting" not in row.xpath("./td[2]")[0].text_content()
        ]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath("./td/a[descendant::img[contains(@src,"
                                     '"PDF-AGENDA.png")]]/@href')[0]
                self.logger.debug(guid)
            except IndexError:
                # Sometimes we have a dead link. This is only on
                # dead entries.
                continue

            committee_name = meeting.xpath("./td[1]/text()")[0].strip()
            meeting_string = meeting.xpath("./td[2]")[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
            date, time, location = (
                [s.strip()
                 for s in meeting_string.split(",") if s] + [None] * 3)[:3]

            # check for time in date because of missing comma
            time_srch = re.search(r"\d{2}:\d{2} (AM|PM)", date)
            if time_srch:
                location = time
                time = time_srch.group()
                date = date.replace(time, "")

            self.logger.debug(location)

            year = datetime.datetime.now().year
            datetime_string = " ".join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string,
                                              "%b %d %Y %I:%M %p")
            when = self._tz.localize(when)

            description = "Committee Meeting: {}".format(committee_name)
            self.logger.debug(description)

            event = Event(
                name=description,
                start_date=when,  # already localized above
                location_name=location,
            )
            event.add_source(url)
            event.add_participant(committee_name,
                                  type="committee",
                                  note="host")
            event.add_document(note="Agenda",
                               url=guid,
                               text="agenda",
                               media_type="application/pdf")

            yield event
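
lxmlize is not shown in this excerpt; it is assumed to be the usual fetch-and-parse helper used by several of these scrapers. A minimal sketch:

    def lxmlize(self, url):
        # Hypothetical helper: fetch a URL, parse it, and absolutize links.
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)
        return page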
Example #28
    def scrape_meeting(self, url):
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        title = page.xpath("//a[@id='linkTitle']//text()")[0]
        date = page.xpath("//span[@id='lDate']/text()")[0]
        time = page.xpath("//span[@id='lTime']/text()")[0]
        location = page.xpath("//span[@id='lLocation']/text()")[0]

        substs = {"AM": ["A.M.", "a.m."], "PM": ["P.M.", "p.m.", "Noon"]}

        for key, values in substs.items():
            for value in values:
                time = time.replace(value, key)

        # Make sure there's a space between the time's minutes and its AM/PM
        if re.search(r"(?i)\d[AP]M$", time):
            time = time[:-2] + " " + time[-2:]

        if re.search("UPON ADJ|TBA", " ".join(time.split()).upper()):
            all_day = True
            when = datetime.datetime.strptime(date, "%B %d, %Y")
        else:
            all_day = False
            when = dateutil.parser.parse(f"{date} {time}".strip())

        description = "Meeting on %s of the %s" % (date, title)
        chambers = {
            "house": "lower",
            "senate": "upper",
            "joint": "legislature"
        }

        for chamber_ in chambers:
            if chamber_ in title.lower():
                break
        else:
            return

        event = Event(
            name=description,
            start_date=self._tz.localize(when),
            location_name=location,
            all_day=all_day,
        )
        event.add_source(url)

        event.add_participant(title, note="host", type="committee")

        trs = iter(page.xpath("//tr[@valign='top']"))
        next(trs)

        for tr in trs:
            try:
                _, _, bill, whom, descr = tr.xpath("./td")
            except ValueError:
                continue

            bill_title = bill.text_content()

            if "S" in bill_title or "H" in bill_title:
                item = event.add_agenda_item(descr.text_content())
                item.add_bill(bill_title)
            else:
                continue

        yield event
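
The substitution table and regex above normalize meridian spellings before parsing; a standalone check of that logic (hypothetical helper name and inputs):

import re

def normalize_meridian(time):
    # Same normalization as the scraper: unify A.M./P.M./Noon variants,
    # then ensure a space separates the minutes from the meridian.
    substs = {"AM": ["A.M.", "a.m."], "PM": ["P.M.", "p.m.", "Noon"]}
    for key, values in substs.items():
        for value in values:
            time = time.replace(value, key)
    if re.search(r"(?i)\d[AP]M$", time):
        time = time[:-2] + " " + time[-2:]
    return time

assert normalize_meridian("12:00 Noon") == "12:00 PM"
assert normalize_meridian("9:30a.m.") == "9:30 AM"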
Example #29
    def scrape_chamber(self, chamber):
        if chamber == "upper":
            url = "https://legislature.idaho.gov/sessioninfo/agenda/sagenda/"
        elif chamber == "lower":
            url = "https://legislature.idaho.gov/sessioninfo/agenda/hagenda/"

        page = self.get(url).content
        page = lxml.html.fromstring(page)

        for row in page.xpath('//div[@id="ai1ec-container"]/div'):
            month = row.xpath(
                ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'date')]/text()"
            )[0].strip()
            day = row.xpath(
                ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'date')]/span/text()"
            )[0].strip()

            time_and_loc = row.xpath(
                ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'abbr')]/h2/text()"
            )
            time = time_and_loc[0].strip()
            loc = time_and_loc[1].strip()

            if "not meet" in time.lower():
                continue

            try:
                start = dateutil.parser.parse(f"{month} {day} {time}")
            except dateutil.parser.ParserError:
                start = dateutil.parser.parse(f"{month} {day}")

            start = self._tz.localize(start)

            com = row.xpath(
                ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'day')]/h2/a/text()"
            )[0].strip()

            event = Event(
                name=com,
                start_date=start,
                location_name=loc,
                classification="committee-meeting",
            )

            event.add_participant(com, type="committee", note="host")

            agenda_url = row.xpath(
                './/a[contains(text(), "Full Agenda")]/@href')[0]
            event.add_document("Agenda",
                               agenda_url,
                               media_type="application/pdf")

            agenda_rows = row.xpath(
                './/div[contains(@class,"card")]/div[contains(@id, "Agenda")]/div/table/tbody/tr'
            )[1:]

            for agenda_row in agenda_rows:
                subject = agenda_row.xpath("string(td[1])").strip()
                description = agenda_row.xpath("string(td[2])").strip()
                presenter = agenda_row.xpath("string(td[3])").strip()
                if presenter != "":
                    agenda_text = (
                        f"{subject} {description} Presenter: {presenter}".
                        strip())
                    event.add_participant(agenda_text,
                                          type="person",
                                          note="Presenter")
                else:
                    agenda_text = f"{subject} {description}".strip()

                agenda = event.add_agenda_item(agenda_text)

                bill_links = agenda_row.xpath(
                    'td[1]/a[contains(@href,"/legislation/")]/text()')
                if bill_links:
                    agenda.add_bill(bill_links[0].strip())

            event.add_source(url)
            yield event
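
All of these scrapers rely on a self._tz pytz timezone defined once on the class; for the Idaho scraper above it would be something like (hypothetical):

import pytz

_tz = pytz.timezone("America/Boise")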
Example #30
    def scrape_lower(self):
        list_url = (
            "https://virginiageneralassembly.gov/house/schedule/meetingSchedule.php"
        )

        page = self.get(list_url).content
        page = lxml.html.fromstring(page)

        page.make_links_absolute(list_url)

        for row in page.xpath("//table[contains(@class, 'CODayTable')]/tbody/tr"):

            # TODO: it would be nice to go back in and update the record to mark it as cancelled,
            # but since there's no ics link it makes the day logic way more complicated
            if row.xpath(".//span[contains(@class, 'COCancelled')]"):
                continue

            # fallback for unlinked events
            source = list_url

            if row.xpath(".//a[1]/text()"):
                title = row.xpath(".//a[1]/text()")[0].strip()
                source = row.xpath(".//a[1]/@href")[0]
                event_type = "committee-meeting"
            else:
                # skip unlinked misc events
                if row.xpath("td[contains(@class, 'COCommType')]/text()"):
                    title = row.xpath("td[contains(@class, 'COCommType')]/text()")[
                        0
                    ].strip()
                    event_type = "other"
                else:
                    continue

            date_link = row.xpath(".//a[@title='Add to Calendar']/@href")[0]
            parsed = parse.parse_qs(parse.urlparse(date_link).query)
            date_raw = parsed["dt"][0]
            location = parsed["loc"][0]

            start = dateutil.parser.parse(date_raw, tzinfos=self.tzinfos)

            # If there's a chair in parentheticals, remove them from the title
            # and add as a person instead
            chair_note = re.findall(r"\(.*\)", title)
            chair = None
            for chair_str in chair_note:
                title = title.replace(chair_str, "").strip()
                # drop the outer parens
                chair = chair_str[1:-1]

            event = Event(
                name=title,
                start_date=start,
                location_name=location,
                classification=event_type,
            )
            event.add_source(source)

            if chair is not None:
                event.add_participant(chair, type="person", note="chair")

            if event_type == "committee-meeting":
                event.add_participant(title, type="committee", note="host")

            if row.xpath(".//a[contains(@class,'COAgendaLink')]"):
                agenda_url = row.xpath(".//a[contains(@class,'COAgendaLink')]/@href")[0]
                event.add_document("Agenda", agenda_url, media_type="text/html")
                self.scrape_lower_agenda(event, agenda_url)

            yield event
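
self.tzinfos is not shown in this excerpt; dateutil uses it to resolve timezone abbreviations embedded in the calendar links. A plausible setup (hypothetical):

import dateutil.tz

tzinfos = {
    "EST": dateutil.tz.gettz("America/New_York"),
    "EDT": dateutil.tz.gettz("America/New_York"),
}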