Exemplo n.º 1
0
    def scrape_upper(self):
        """Scrape upcoming MN Senate committee hearings from the Senate API.

        Fetches the upcoming-schedule JSON feed and yields one Event per
        hearing, attaching location, description, referenced bills, agenda
        items, documents, and media links where available.

        :yield: Event objects, one per hearing row
        """
        url = "https://www.senate.mn/api/schedule/upcoming"
        data = self.get(url).json()

        for row in data["events"]:
            com = row["committee"]["committee_name"]
            start = dateutil.parser.parse(row["hearing_start"])
            start = self._tz.localize(start)

            # Build the location from building + room when both exist,
            # fall back to building alone, then to a placeholder.
            if (row["hearing_room"] and "hearing_building" in row
                    and row["hearing_building"]):
                where = f"{row['hearing_building']} {row['hearing_room']}"
            elif "hearing_building" in row and row["hearing_building"]:
                where = row["hearing_building"]
            else:
                where = "TBD"

            description = row["hearing_notes"] if row.get("hearing_notes") else ""

            event = Event(
                name=com,
                location_name=where,
                start_date=start,
                classification="committee-meeting",
                description=description,
            )

            # Attach any bill ids mentioned in the free-text notes.
            # (Fixed: previously added the whole description string instead
            # of the individual bill id.)
            for bill in get_bill_ids(description):
                event.add_bill(bill)

            # Prefer the LRL schedule link; otherwise fall back to the
            # committee's own link, normalizing relative/schemaless URLs.
            if "lrl_schedule_link" in row:
                event.add_source(row["lrl_schedule_link"])
            else:
                if "link" in row["committee"]:
                    com_link = row["committee"]["link"]
                    if com_link.startswith("http"):
                        event.add_source(com_link)
                    elif com_link.startswith("www"):
                        event.add_source(f"http://{com_link}")
                    else:
                        event.add_source(f"https://www.senate.mn/{com_link}")
                elif "senate_chair_link" in row["committee"]:
                    event.add_source(
                        f"https://www.senate.mn/{row['committee']['senate_chair_link']}"
                    )

            if "agenda" in row:
                for agenda_row in row["agenda"]:
                    if (agenda_row["description"] is None
                            or agenda_row["description"].strip() == ""):
                        # sometimes they have blank agendas but bills or files
                        agenda_row["description"] = "Agenda"
                    agenda = event.add_agenda_item(agenda_row["description"])
                    if "bill_type" in agenda_row:
                        agenda.add_bill("{} {}".format(
                            agenda_row["bill_type"].replace(".", ""),
                            agenda_row["bill_number"],
                        ))

                    if "files" in agenda_row:
                        for file_row in agenda_row["files"]:
                            doc_name = file_row["filename"]
                            doc_url = file_row["file_path"]

                            # if they don't provide a name just use the filename
                            if doc_name == "":
                                parsed_url = urlparse(doc_url)
                                doc_name = os.path.basename(parsed_url.path)

                            event.add_document(
                                doc_name,
                                f"https://www.senate.mn/{doc_url}",
                                media_type="text/html",
                                on_duplicate="ignore",
                            )

            if "video_link" in row:
                event.add_media_link("Video", row["video_link"], "text/html")

            if "audio_link" in row:
                event.add_media_link("Audio", row["audio_link"], "text/html")

            yield event
Exemplo n.º 2
0
    def scrape_lower(self):
        """Scrape MN House committee hearings from the House schedules page.

        Parses the "All Schedules" HTML page and yields one Event per
        committee meeting, skipping floor sessions, unlinked events, and
        joint meetings (joint meetings come from the Senate API instead).

        :yield: Event objects, one per committee meeting card
        """
        url = "https://www.house.leg.state.mn.us/Schedules/All"
        page = self.lxmlize(url)

        for row in page.xpath('//div[contains(@class,"my-2 d-print-block")]'):
            # skip floor sessions and unlinked events
            if not row.xpath(
                    'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b'
            ):
                continue

            # skip joint ones, we'll get those from the senate API
            if row.xpath('div[contains(@class,"card-header bg-joint")]'):
                continue

            # top-level committee
            com = row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
            )[0].strip()
            com_link = row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/@href'
            )[0]

            when = (row.xpath(
                'div[contains(@class,"card-header")]/span[contains(@class,"text-white")]/text()'
            )[0].replace("\r\n", "").strip())
            when = dateutil.parser.parse(when)
            when = self._tz.localize(when)

            if row.xpath('.//b[.="Location:"]'):
                where = row.xpath(
                    './/b[.="Location:"]/following-sibling::text()[1]'
                )[0].strip()
            else:
                where = "See committee page"

            if row.xpath('.//b[.="Agenda:"]'):
                desc = "\n".join(
                    row.xpath('.//b[.="Agenda:"]/following-sibling::div/text()'
                              )).strip()
            else:
                desc = "See committee page"

            event = Event(
                name=com,
                start_date=when,
                location_name=where,
                classification="committee-meeting",
                description=desc,
            )

            event.add_source(com_link)

            # Attach any bill ids mentioned in the agenda text.
            # (Fixed: previously added the whole description string instead
            # of the individual bill id.)
            for bill in get_bill_ids(desc):
                event.add_bill(bill)

            if row.xpath(
                    ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]"
            ):
                agenda = event.add_agenda_item("Bills")
                for bill_id in row.xpath(
                        ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]/text()"
                ):
                    agenda.add_bill(bill_id.strip())

            for attachment in row.xpath(".//ul/li/div/a"):
                doc_url = attachment.xpath("@href")[0]
                doc_name = attachment.xpath("text()")[0].strip()
                # if they don't provide a name just use the filename
                # (Fixed: basename must be taken from the URL's path, not
                # from the ParseResult object itself.)
                if doc_name == "":
                    parsed_url = urlparse(doc_url)
                    doc_name = os.path.basename(parsed_url.path)

                # sometimes broken links to .msg files (emails?) are attached,
                # they always 404.
                if doc_url.endswith(".msg"):
                    continue
                media_type = get_media_type(doc_url)
                event.add_document(doc_name,
                                   doc_url,
                                   media_type=media_type,
                                   on_duplicate="ignore")

            for committee in row.xpath(
                    'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
            ):
                event.add_participant(committee, type="committee", note="host")

            yield event
Exemplo n.º 3
0
    def scrape(self, chamber=None, session=None):
        """
        Scrape the events data from all dates from the sc meetings page,
        then create and yield the events objects from the data.
        :param chamber: 'upper', 'lower', 'other', or None (joint schedule)
        :param session: unused; kept for interface compatibility
        :return: yielded Event objects
        """

        chambers = {
            "upper": {"name": "Senate", "title": "Senator"},
            "lower": {"name": "House", "title": "Representative"},
        }
        if chamber == "other":
            return

        if chamber is None:
            self.info("no chamber specified, using Joint Committee Meeting Schedule")
            events_url = "http://www.scstatehouse.gov/meetings.php"
        else:
            # The site filters by chamber via the first letter of its name
            # ("S" or "H").
            events_url = "http://www.scstatehouse.gov/meetings.php?chamber=%s" % (
                chambers[chamber]["name"].upper()[0]
            )

        page = self.get_page_from_url(events_url)

        # Pull the 4-digit year out of the "Week of <Month> <day>, <year>"
        # page header; used for dates that omit their own year.
        meeting_year = page.xpath('//h2[@class="barheader"]/span')[0].text_content()
        meeting_year = re.search(
            r"Week of [A-Z][a-z]+\s+[0-9]{1,2}, ([0-9]{4})", meeting_year
        ).group(1)

        dates = page.xpath("//div[@id='contentsection']/ul")

        for date in dates:
            date_string = date.xpath("span")

            if len(date_string) == 1:
                date_string = date_string[0].text_content()
            else:
                continue

            # If a event is in the next calendar year, the date_string
            # will have a year in it
            if date_string.count(",") == 2:
                event_year = date_string[-4:]
                date_string = date_string[:-6]
            elif date_string.count(",") == 1:
                event_year = meeting_year
            else:
                # Fixed: .format() was previously applied after the raise
                # (dead code), so the placeholder was never filled in.
                raise AssertionError(
                    "This is not a valid date: '{}'".format(date_string)
                )

            for meeting in date.xpath("li"):
                time_string = meeting.xpath("span")[0].text_content()

                # Skip cancelled meetings (marker may be the whole string or
                # embedded in a child span).
                if (
                    time_string == "CANCELED"
                    or len(meeting.xpath('.//span[contains(text(), "CANCELED")]')) > 0
                ):
                    continue

                time_string = normalize_time(time_string)
                date_time = datetime.datetime.strptime(
                    event_year + " " + date_string + " " + time_string,
                    "%Y %A, %B %d %I:%M %p",
                )

                date_time = self._tz.localize(date_time)
                # Meeting line looks like "... -- <location> -- <description>"
                meeting_info = meeting.xpath("br[1]/preceding-sibling::node()")[1]
                location, description = re.search(
                    r"-- (.*?) -- (.*)", meeting_info
                ).groups()

                event = Event(
                    name=description,  # Event Name
                    start_date=date_time,  # When the event will take place
                    location_name=location,
                )  # Where the event will be

                event.add_source(events_url)

                agenda_url = meeting.xpath(".//a[contains(@href,'agendas')]")

                if agenda_url:
                    agenda_url = agenda_url[0].attrib["href"]
                    event.add_source(agenda_url)
                    event.add_document(
                        note="Agenda", url=agenda_url, media_type="application/pdf"
                    )

                    agenda_page = self.get_page_from_url(agenda_url)

                    # Each bill link on the agenda page becomes an associated
                    # bill id (dots and spaces stripped, e.g. "S. 123" -> "S123").
                    for bill in agenda_page.xpath(
                        ".//a[contains(@href,'billsearch.php')]"
                    ):
                        bill_id = bill.text_content().replace(".", "").replace(" ", "")
                        event.add_bill(bill_id)

                yield event