Example #1
    def parse_versions(self, page, bill):
        # Version links live in the "Bill Documents" row of the summary table.
        xpath_expr = '//tr[th[text()="Bill Documents"]]/td[1]/a'
        version_count = 0
        for row in page.xpath(xpath_expr):
            source_url = row.attrib["href"]
            version_title = row.xpath("text()")[0].strip()

            mimetype = get_media_type(source_url)
            if mimetype is None:
                self.warning("Unknown mimetype for {}".format(source_url))

            bill.add_version_link(version_title, source_url, media_type=mimetype)
            version_count += 1
        return version_count
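Example #1 assumes a get_media_type helper that maps a document URL to a MIME type and returns None when it cannot. The real helper is not shown here; a minimal sketch with the same contract, built on the standard library's mimetypes module rather than whatever the actual implementation does:

import mimetypes

def get_media_type(url):
    # guess_type derives the MIME type from the URL's file extension and
    # returns (None, None) for unknown extensions -- that None result is
    # what triggers the warning in parse_versions above.
    media_type, _encoding = mimetypes.guess_type(url)
    return media_type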
Example #2
    def parse_bill(self, chamber, session, bill_id, url):
        try:
            page = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            self.logger.warning(e)
            return

        last_action_field = self.parse_bill_field(page, "Last Action")
        if last_action_field != "":
            last_action = last_action_field.xpath("text()")[0]
            if "WITHDRAWN" in last_action.upper():
                self.info("{} Withdrawn, skipping".format(bill_id))
                return

        title = self.parse_bill_field(page, "Title").text_content()

        if "CR" in bill_id:
            bill_type = "concurrent resolution"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        version_ct = self.parse_versions(page, bill)

        if version_ct < 1:
            # No version documents means the bill was withdrawn.
            self.logger.warning("{} has no versions, skipping".format(bill_id))
            return

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)
        self.parse_proposed_amendments(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            source_url = fiscal_note.attrib["href"]
            mimetype = get_media_type(source_url)

            bill.add_document_link("Fiscal Note",
                                   source_url,
                                   media_type=mimetype)

        for link in page.xpath(
                "//td/span/a[contains(@href, 'Legislator-Profile')]"):
            bill.add_sponsorship(
                link.text.strip(),
                classification="primary",
                entity_type="person",
                primary=True,
            )

        if page.xpath("//th[contains(text(),'Votes')]"):
            vote_url = page.xpath(
                "//a[contains(text(),'Vote History')]/@href")[0]
            yield from self.scrape_votes(vote_url, bill, chamber)

        bdr_no = self.parse_bill_field(page, "Bill Request Number")
        if bdr_no != "" and bdr_no.xpath("text()"):
            bdr = bdr_no.xpath("text()")[0].strip()
            bill.extras["BDR"] = bdr

        yield bill
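Both parse_bill examples treat self.parse_bill_field(page, label) as returning either an lxml element (supporting .xpath() and .text_content()) or the empty string when the field is absent, which is why callers compare the result against "". The page markup is an assumption here, but a helper honoring that contract could be sketched as:

    def parse_bill_field(self, page, field_name):
        # Return the value cell next to a labeled header cell, or "" when
        # the row is missing (hypothetical markup: one <tr> per field).
        matches = page.xpath(
            '//tr[th[contains(text(), "{}")]]/td[1]'.format(field_name)
        )
        return matches[0] if matches else ""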
Example #3
    def scrape_lower(self):
        url = "https://www.house.leg.state.mn.us/Schedules/All"
        page = self.lxmlize(url)

        for row in page.xpath('//div[contains(@class,"my-2 d-print-block")]'):
            # skip floor sessions and unlinked events
            if not row.xpath(
                    'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b'
            ):
                continue

            # skip joint ones, we'll get those from the senate API
            if row.xpath('div[contains(@class,"card-header bg-joint")]'):
                continue

            # top-level committee
            com = row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
            )[0].strip()
            com_link = row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/@href'
            )[0]

            when = row.xpath(
                'div[contains(@class,"card-header")]/span[contains(@class,"text-white")]/text()'
            )[0].replace("\r\n", "").strip()
            when = dateutil.parser.parse(when)
            when = self._tz.localize(when)

            if row.xpath('.//b[.="Location:"]'):
                where = row.xpath(
                    './/b[.="Location:"]/following-sibling::text()[1]'
                )[0].strip()
            else:
                where = "See committee page"

            if row.xpath('.//b[.="Agenda:"]'):
                desc = "\n".join(
                    row.xpath('.//b[.="Agenda:"]/following-sibling::div/text()'
                              )).strip()
            else:
                desc = "See committee page"

            event = Event(
                name=com,
                start_date=when,
                location_name=where,
                classification="committee-meeting",
                description=desc,
            )

            event.add_source(com_link)

            for bill_id in get_bill_ids(desc):
                event.add_bill(bill_id)

            if row.xpath(
                    ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]"
            ):
                agenda = event.add_agenda_item("Bills")
                for bill_id in row.xpath(
                        ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]/text()"
                ):
                    agenda.add_bill(bill_id.strip())

            for attachment in row.xpath(".//ul/li/div/a"):
                doc_url = attachment.xpath("@href")[0]

                # sometimes broken links to .msg files (emails?) are attached;
                # they always 404, so skip them before anything else.
                if doc_url.endswith(".msg"):
                    continue

                doc_name = attachment.xpath("text()")[0].strip()
                # if they don't provide a name, just use the filename
                if doc_name == "":
                    parsed_url = urlparse(doc_url)
                    doc_name = os.path.basename(parsed_url.path)
                media_type = get_media_type(doc_url)
                event.add_document(doc_name,
                                   doc_url,
                                   media_type=media_type,
                                   on_duplicate="ignore")

            for committee in row.xpath(
                    'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
            ):
                event.add_participant(committee, type="committee", note="host")

            yield event
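Example #3 relies on a get_bill_ids helper to pull bill identifiers out of free-form agenda text. Assuming Minnesota-style identifiers such as "HF 1234" and "SF 56" (the pattern below is a guess, not necessarily the scraper's actual one), a regex-based sketch:

import re

BILL_ID_RE = re.compile(r"\b([HS]F)\s*(\d+)\b")

def get_bill_ids(text):
    # Normalize matches like "HF1234" or "HF  1234" to "HF 1234".
    return ["{} {}".format(prefix, number)
            for prefix, number in BILL_ID_RE.findall(text)]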
Example #4
    def parse_bill(self, chamber, session, bill_id, url):
        try:
            page = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            self.logger.warning(e)
            return

        withdrawn = False

        last_action_field = self.parse_bill_field(page, "Last Action")
        if last_action_field != "":
            last_action = last_action_field.xpath("text()")[0]
            if "WITHDRAWN" in last_action.upper():
                self.info("{} withdrawn".format(bill_id))
                withdrawn = True

        if withdrawn:
            title = "Withdrawn."
        else:
            title = self.parse_bill_field(page, "Title").text_content()

        if "CR" in bill_id:
            bill_type = "concurrent resolution"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        self.parse_versions(page, bill)

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)
        self.parse_proposed_amendments(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            source_url = fiscal_note.attrib["href"]
            mimetype = get_media_type(source_url)

            bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

        # only grab links in the first table, because proposed amendments have sponsors that are not bill sponsors.
        for link in page.xpath(
            "//div[contains(@class,'bill-table')][1]//td/span/a[contains(@href, 'Legislator-Profile')]"
        ):
            bill.add_sponsorship(
                link.text.strip(),
                classification="primary",
                entity_type="person",
                primary=True,
            )

        if page.xpath("//th[contains(text(),'Votes')]"):
            vote_url = page.xpath("//a[contains(text(),'Vote History')]/@href")[0]
            yield from self.scrape_votes(vote_url, bill, chamber)

        bdr_no = self.parse_bill_field(page, "Bill Request Number")
        if bdr_no != "" and bdr_no.xpath("text()"):
            bdr = bdr_no.xpath("text()")[0].strip()
            bill.extras["BDR"] = bdr

        if self.parse_bill_field(page, "Summary of Original Version") != "":
            summary = (
                self.parse_bill_field(page, "Summary of Original Version")
                .text_content()
                .strip()
            )
            bill.add_abstract(summary, note="Summary of Original Version")

        if withdrawn:
            action = self.parse_bill_field(page, "Last Action").text_content().strip()
            wd_date = re.findall(r"\d{2}/\d{2}/\d+", action)[0]
            wd_date = dateutil.parser.parse(wd_date).date()
            bill.add_action(
                action, wd_date, chamber=chamber, classification="withdrawal"
            )

        yield bill
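All four examples fetch pages through self.lxmlize(url) and then run xpath queries against the result. The method itself is not shown; a self-contained sketch of the usual pattern, using plain requests where the scrapers presumably use their session's retrying self.get:

import lxml.html
import requests

def lxmlize(url):
    # Parse the fetched page into an lxml tree; make_links_absolute turns
    # relative hrefs into absolute URLs, so they can be stored directly as
    # source and document links.
    response = requests.get(url)
    page = lxml.html.fromstring(response.content)
    page.make_links_absolute(url)
    return page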