示例#1
0
    def scrape_meeting_notice(self, chamber, item, url):
        """Yield a single Event built from one committee meeting record.

        ``item`` is the mapping describing the meeting; ``url`` is attached
        as the event's first source. ``chamber`` is accepted but not used in
        this method. Agenda items and their sponsors come from a POST to the
        meeting-items endpoint, which is attached as a second source.
        """
        # Not every meeting carries an event name, so the committee name is
        # used as the event name as well.
        committee_name = str(item["CommitteeName"])

        # The timestamp is parsed with a two-digit year and no seconds,
        # e.g. 04/25/12 03:00 PM.
        start_time = dt.datetime.strptime(
            str(item["MeetingDateTime"]), "%m/%d/%y %I:%M %p"
        )

        event = Event(
            location_name=str(item["AddressAliasNickname"]),
            start_date=self._tz.localize(start_time),
            name=committee_name,
            description="Committee Meeting Status: {}".format(
                item["CommitteeMeetingStatusName"]
            ),
        )
        event.add_source(url)
        event.add_committee(name=committee_name, id=item["CommitteeId"])

        items_url = (
            "http://legis.delaware.gov/json/MeetingNotice/"
            "GetCommitteeMeetingItems?committeeMeetingId={}"
        ).format(item["CommitteeMeetingId"])
        event.add_source(items_url)

        # Each returned entry becomes an agenda item plus a sponsor.
        for agenda_entry in self.post(items_url).json()["Data"]:
            event.add_agenda_item(
                description=str(agenda_entry["ItemDescription"])
            )
            event.add_person(
                name=str(agenda_entry["PrimarySponsorShortName"]),
                id=str(agenda_entry["PrimarySponsorPersonId"]),
                note="Sponsor",
            )

        yield event
示例#2
0
    def scrape_events(self, page):
        """Parse the hearing calendar HTML and yield one Event per meeting.

        Args:
            page: raw HTML of the calendar page.

        Yields:
            Event: a committee meeting with its host committee, agenda items
            and any bills linked from the first column of the agenda table.

        Raises:
            EmptyScrape: when the page states that there are no hearings for
            the requested date range.
        """
        page = lxml.html.fromstring(page)

        if page.xpath(
                "//h3[contains(text(),'There are no hearings for the date range')]"
        ):
            raise EmptyScrape

        for meeting in page.xpath('//div[@class="card mb-4"]'):
            # The card header holds the committee name; its <small> child
            # holds "<location> - <time>".
            com = meeting.xpath(
                'div[contains(@class, "card-header")]/text()')[0].strip()
            details = meeting.xpath(
                'div[contains(@class, "card-header")]/small/text()')[0].strip(
                )

            (location, time) = details.split(" - ")

            # turn room numbers into the full address
            if location.lower().startswith("room"):
                location = "1445 K St, Lincoln, NE 68508, {}".format(location)

            # The nearest preceding day heading gives the date of this card.
            day = meeting.xpath(
                "./preceding-sibling::h2[@class='text-center']/text()"
            )[-1].strip()

            # Thursday February 27, 2020 1:30 PM
            date = "{} {}".format(day, time)
            event_date = self._tz.localize(
                datetime.datetime.strptime(date, "%A %B %d, %Y %I:%M %p"))

            event = Event(
                name=com,
                start_date=event_date,
                classification="committee-meeting",
                description="Committee Meeting",
                location_name=location,
            )

            event.add_committee(com, note="host")

            for row in meeting.xpath("div/table/tr"):
                # Rows without a third cell carry no agenda description.
                if not row.xpath("td[3]"):
                    continue
                agenda_desc = row.xpath("td[3]/text()")[0].strip()
                agenda_item = event.add_agenda_item(description=agenda_desc)

                if row.xpath("td[1]/a"):
                    # bill link
                    agenda_item.add_bill(
                        row.xpath("td[1]/a/text()")[0].strip())

            event.add_source(
                "https://nebraskalegislature.gov/calendar/calendar.php")

            yield event
示例#3
0
    def scrape(self):
        """Yield one Event per row of the agenda table at ``calurl``.

        Rows without exactly one "Committee Details" link, and rows whose
        item list contains "(None)", are skipped. The listing does not
        include a year, so the current year is used when parsing the date.
        """
        page = self.lxmlize(calurl)
        # The first row of the table is skipped.
        rows = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for row in rows:
            comit_links = row.xpath(".//a[contains(@title,'Committee Details')]")
            if len(comit_links) != 1:
                continue
            comit_href = comit_links[0].attrib["href"]

            tds = row.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            # Cell text is "<chamber> - <committee>".
            chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            notice_link = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = notice_link.attrib["href"]
            name = notice_link.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content().replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            # NOTE(review): the year always comes from the current date, so
            # rows that belong to the next calendar year would be misdated.
            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            if cttie:
                cttie = cttie.replace("Committee on", "").strip()
                cttie = f"{chamber} {cttie}"
                name = cttie

            event = Event(
                name=name, location_name=where, start_date=self._tz.localize(when)
            )

            event.add_source(calurl)

            event.add_committee(cttie, note="host")

            event.add_document("notice", notice, media_type="application/pdf")

            for entry in what:
                item = event.add_agenda_item(entry)
                if entry.startswith(("AB", "SB")):
                    item.add_bill(entry)

            # Participants are looked up only for rows that actually produce
            # an event, so skipped rows no longer trigger the lookup.
            for thing in self.scrape_participants(comit_href):
                event.add_person(thing["name"])

            yield event
示例#4
0
    def scrape(self, session=None):
        """Yield an Event for every scheduled record in the "Agendas" table.

        Args:
            session: session identifier; when omitted, the value returned by
                ``self.latest_session()`` is used.

        Records whose status is not "Scheduled" are skipped, as are records
        whose committee code is not present in ``self._committees``.
        """
        if session is None:
            session = self.latest_session()
            self.info("no session specified, using %s", session)

        year_abr = ((int(session) - 209) * 2) + 2000
        self._init_mdb(year_abr)
        self.initialize_committees(year_abr)
        # Keep record of all events
        records = self.access_to_csv("Agendas")
        for record in records:
            if record["Status"] != "Scheduled":
                continue

            # Bill references such as "A1234", "S-1234" found in the comments.
            bill_ids = [
                "%s %s" % (prefix, number)
                for prefix, _, number in re.findall(
                    r"(A|S)(-)?(\d{4})", record["Comments"])
            ]

            date_time = "%s %s" % (record["Date"], record["Time"])
            date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

            try:
                hr_name = self._committees[record["CommHouse"]]
            except KeyError:
                self.warning("unknown committee code %s, skipping",
                             record["CommHouse"])
                # Without this the loop would go on with an undefined (or
                # stale, from the previous record) committee name.
                continue

            description = "Meeting of the {}".format(hr_name)

            event = Event(
                name=description,
                start_date=self._tz.localize(date_time),
                location_name=record["Location"] or "Statehouse",
            )
            # All bills of a record share one agenda item, created only when
            # there is at least one bill.
            # NOTE(review): the agenda item is described by the meeting
            # title rather than the record's comments - confirm intended.
            if bill_ids:
                item = event.add_agenda_item(description)
                for bill_id in bill_ids:
                    item.add_bill(bill_id)
            # Add committee to event
            event.add_committee(hr_name, id=record["CommHouse"], note="host")
            event.add_source("http://www.njleg.state.nj.us/downloads.asp")

            yield event
示例#5
0
    def scrape_meeting_notice(self, item, url):
        """Yield a single Event built from one committee meeting record.

        Args:
            item: mapping describing the meeting (committee name, type and
                id, meeting date/time, location nickname, status, meeting id).
            url: accepted for the caller's convenience; not used here. The
                event's source is the public meeting-notice page instead.

        Agenda items are fetched with a POST to the meeting-items endpoint;
        a response that is not valid JSON is treated as "no agenda items".
        """
        # Since Event Name is not provided for all meetings, build one from
        # the committee type and name (joint committees already read well).
        if "Joint" in str(item["CommitteeName"]):
            event_name = str(item["CommitteeName"])
        else:
            event_name = "{} {}".format(str(item["CommitteeTypeName"]),
                                        str(item["CommitteeName"]))
        # Parsed with a two-digit year and no seconds, e.g. 04/25/12 03:00 PM
        fmt = "%m/%d/%y %I:%M %p"
        start_time = dt.datetime.strptime(str(item["MeetingDateTime"]), fmt)
        location_name = str(item["AddressAliasNickname"])
        event = Event(
            location_name=location_name,
            start_date=self._tz.localize(start_time),
            name=event_name,
            description="Committee Meeting Status: {}".format(
                item["CommitteeMeetingStatusName"]),
        )

        event.add_committee(name=str(item["CommitteeName"]),
                            id=item["CommitteeId"])

        html_url = f'https://legis.delaware.gov/MeetingNotice?committeeMeetingId={item["CommitteeMeetingId"]}'
        event.add_source(html_url)

        page_url = f'https://legis.delaware.gov/json/MeetingNotice/GetCommitteeMeetingItems?committeeMeetingId={item["CommitteeMeetingId"]}'

        page_data = []
        try:
            page_data = self.post(page_url).json()["Data"]
        except json.decoder.JSONDecodeError:
            # No agenda items
            self.info(f"POST returned nothing on {page_url}")

        # A distinct loop name keeps the ``item`` parameter intact.
        for agenda_entry in page_data:
            a = event.add_agenda_item(
                description=str(agenda_entry["ItemDescription"]))
            if agenda_entry["LegislationDisplayText"] is not None:
                a.add_bill(agenda_entry["LegislationDisplayText"])

            # str(None) would otherwise register a sponsor literally named
            # "None" for entries that have no sponsor.
            if agenda_entry["PrimarySponsorShortName"] is not None:
                event.add_person(
                    name=str(agenda_entry["PrimarySponsorShortName"]),
                    id=str(agenda_entry["PrimarySponsorPersonId"]),
                    note="Sponsor",
                )

        yield event
    def scrape(self, session=None):
        """Yield an Event for every meeting listed for the session's year.

        When ``session`` is omitted, ``self.latest_session()`` supplies it.
        A meeting whose ``TimeSlot`` is empty or 1 is treated as all-day;
        otherwise the start time comes from ``TimeSlot``, falling back to
        ``StartTime`` when ``TimeSlot`` does not parse.
        """
        if session is None:
            session = self.latest_session()

        url = "http://legislature.vermont.gov/committee/loadAllMeetings/{}".format(
            self.jurisdiction.get_year_slug(session))

        meetings = json.loads(self.get(url).text)["data"]

        date_fmt = "%A, %B %d, %Y"
        date_time_fmt = "%A, %B %d, %Y, %I:%M %p"

        for info in meetings:
            # Determine when the committee meets
            time_slot = info["TimeSlot"]
            all_day = time_slot in ("", "1", 1)

            if all_day:
                start_time = datetime.datetime.strptime(
                    info["MeetingDate"], date_fmt)
            else:
                try:
                    start_time = datetime.datetime.strptime(
                        info["MeetingDate"] + ", " + time_slot, date_time_fmt)
                except ValueError:
                    # The time slot is not a clock time; use StartTime.
                    start_time = datetime.datetime.strptime(
                        info["MeetingDate"] + ", " + info["StartTime"],
                        date_time_fmt)

            committee = info["LongName"]
            event = Event(
                start_date=self.TIMEZONE.localize(start_time),
                all_day=all_day,
                name="Meeting of the {}".format(committee),
                description="committee meeting",
                location_name="{}, Room {}".format(info["BuildingName"],
                                                   info["RoomNbr"]),
            )
            event.add_source(url)
            event.add_committee(name=committee, note="host")

            yield event
示例#7
0
    def scrape(self, session=None):
        """Yield an Event for every meeting listed for the session's year.

        A meeting whose ``TimeSlot`` is empty or 1 is treated as all-day.
        Otherwise the date and time slot are parsed together; if that fails
        with ``ParserError`` only the date is parsed, and the event is still
        marked as not all-day.
        """
        url = "http://legislature.vermont.gov/committee/loadAllMeetings/{}".format(
            self.jurisdiction.get_year_slug(session))

        meetings = json.loads(self.get(url).text)["data"]

        for info in meetings:
            # Determine when the committee meets
            time_slot = info["TimeSlot"]
            all_day = time_slot in ("", "1", 1)

            if all_day:
                start_time = dateutil.parser.parse(info["MeetingDate"])
            else:
                try:
                    start_time = dateutil.parser.parse(
                        "{}, {}".format(info["MeetingDate"], time_slot))
                except ParserError:
                    # Unparseable time slot: keep the date alone.
                    start_time = dateutil.parser.parse(info["MeetingDate"])

            committee = info["LongName"]
            event = Event(
                start_date=self.TIMEZONE.localize(start_time),
                all_day=all_day,
                name=f"Meeting of the {committee}",
                description="committee meeting",
                location_name="{}, Room {}".format(info["BuildingName"],
                                                   info["RoomNbr"]),
            )
            event.add_source(url)
            event.add_committee(name=committee, note="host")

            yield event
示例#8
0
    def scrape(self):
        """Yield one Event per distinct hearing notice in the listing table.

        The committee cell holds one short code, or several joined by "/".
        Known codes are expanded to "<chamber> <committee name>" via
        ``self.short_ids`` and ``self.chambers``; unknown codes are kept
        as-is. Rows whose notice link was already seen are skipped.

        Raises:
            EmptyScrape: when the page contains a "No Hearings" cell.
            Exception: when a row with an unknown single code does not have
                exactly one description span to fall back on.
        """

        def full_name(code):
            # Expand a known short code to "<chamber> <committee name>".
            entry = self.short_ids[code]
            return "{} {}".format(self.chambers[entry["chamber"]],
                                  entry["name"])

        get_short_codes(self)
        page = self.lxmlize(URL)

        if page.xpath("//td[contains(string(.),'No Hearings')]"):
            raise EmptyScrape

        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        # The first table row is skipped.
        for row in table.xpath(".//tr")[1:]:
            tds = row.xpath("./td")
            committee = tds[0].text_content().strip()
            # Multi-committee events will be CODE1/CODE2/CODE3; a single
            # code simply splits into a one-element list.
            codes = committee.split("/")

            if "/" in committee:
                # Unknown codes are kept verbatim instead of raising
                # KeyError, matching the host-committee handling below.
                descr = ", ".join(
                    full_name(code) if code in self.short_ids else code
                    for code in codes)
            elif self.short_ids.get(committee):
                descr = full_name(committee)
            else:
                spans = [x.text_content() for x in tds[1].xpath(".//span")]
                if len(spans) != 1:
                    raise Exception(
                        "expected exactly one description span for "
                        "committee {!r}, found {}".format(
                            committee, len(spans)))
                descr = spans[0].replace(".", "").strip()

            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib["href"]
            notice_name = notice.text

            # the listing page shows the same hearing in multiple rows.
            # combine these -- get_related_bills() will take care of adding the bills
            # and descriptions
            if notice_href in self.seen_hearings:
                continue
            self.seen_hearings.append(notice_href)

            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = TIMEZONE.localize(when)
            event = Event(
                name=descr,
                start_date=when,
                classification="committee-meeting",
                description=descr,
                location_name=where,
            )

            for code in codes:
                if "INFO" not in code and code in self.short_ids:
                    host = full_name(code)
                else:
                    host = code
                event.add_committee(host, note="host")

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               media_type="text/html")
            for bill in self.get_related_bills(notice_href):
                a = event.add_agenda_item(description=bill["descr"].strip())
                # Only the first id of a comma-separated list is attached.
                bill["bill_id"] = bill["bill_id"].split(",")[0]
                a.add_bill(bill["bill_id"], note=bill["type"])
            yield event
    def scrape_chamber(self, chamber=None):
        """Yield an Event for each usable row of the weekly calendar tables.

        Args:
            chamber: key into ``cal_chamber_text``; when it maps to nothing
                (e.g. ``None``) no rows are excluded by chamber.

        Rows are skipped when they have no cells, belong to another chamber,
        are cancelled, follow the House floor session, or have a "TBA" time.
        """
        # If chamber is None, don't exclude any events from the results based on chamber
        chmbr = cal_chamber_text.get(chamber)
        # Column order of the cells in each calendar row.
        order = ("time", "chamber", "type", "agenda", "location", "video")
        dtfmt = "%A, %B %d, %Y %I:%M %p"
        dtfmt_no_time = "%A, %B %d, %Y"

        tables = url_xpath(cal_weekly_events, "//table[@class='date-table']")
        for table in tables:
            # The date text is read from the element preceding the table's
            # parent.
            date = table.xpath("../.")[0].getprevious().text_content()
            for tr in table.xpath("./tr"):
                tds = tr.xpath("./td")
                if not tds:
                    continue

                metainf = {
                    column: tds[index] for index, column in enumerate(order)
                }

                if chmbr and metainf["chamber"].text_content() != chmbr:
                    self.info("Skipping event based on chamber.")
                    continue

                time = metainf["time"].text_content()
                datetime_string = "%s %s" % (date.strip(" \r\n"),
                                             time.strip(" \r\n"))
                location = metainf["location"].text_content()
                description = metainf["type"].text_content()

                # Compare the stripped cell text so surrounding whitespace
                # does not hide a cancellation.
                if time.strip() == "Cancelled":
                    self.log("Skipping cancelled event.")
                    continue
                if "Immediately follows H-FLOOR" in datetime_string:
                    continue
                if " Immediately follows" in datetime_string:
                    # Keep only the part before the marker.
                    datetime_string = datetime_string.split(
                        "Immediately follows", 1)[0]
                # Both spellings are checked: "cancelled" does not contain
                # the substring "canceled".
                lowered = datetime_string.lower()
                if "canceled" in lowered or "cancelled" in lowered:
                    continue
                if "TBA" in datetime_string:
                    continue

                datetime_string = datetime_string.strip()

                try:
                    when = dt.datetime.strptime(datetime_string, dtfmt)
                except ValueError:
                    # No clock time present; parse the date alone.
                    when = dt.datetime.strptime(datetime_string,
                                                dtfmt_no_time)
                when = self._utc.localize(when)

                event = Event(
                    name=description,
                    start_date=when,
                    location_name=location,
                    description=description,
                )
                # The description is a committee name
                event.add_committee(name=description)
                event.add_source(cal_weekly_events)

                for doc in metainf["agenda"].xpath(".//a"):
                    # Links without visible text are ignored.
                    if not doc.text_content():
                        continue
                    agenda_url = doc.attrib["href"]
                    self.add_agenda(agenda_url, doc.text_content(), event)
                yield event