Python CSS.text_content示例，spatula.CSS.text_content Python示例

示例#1

0

显示文件

    def process_item(self, item):
        """Build a ScrapePerson for a Hawaii legislator from one listing item.

        The second ``a`` element matched inside ``item`` supplies the family
        name (its text) and the profile URL (its ``href``).  Every remaining
        field is read from the element whose id ends with the corresponding
        value in ``self.LABELS``, with surrounding whitespace stripped.

        If the link selector raises ``SelectorError`` the item is handed to
        ``self.skip()`` (presumably aborting this item -- confirm against the
        page base class).

        Returns:
            The populated ScrapePerson.
        """
        try:
            link = CSS("a").match(item)[1]
        except SelectorError:
            self.skip()

        fields = {"last_name": link.text_content(), "url": link.get("href")}
        for key, label in self.LABELS.items():
            node = CSS(f"[id$={label}]").match_one(item)
            fields[key] = node.text_content().strip()

        # Only these two party markers are recognised; anything else raises
        # KeyError, exactly like a missing field would.
        party_names = {"(D)": "Democratic", "(R)": "Republican"}
        party = party_names[fields["party"]]
        address = f"Hawaii State Capitol, Room {fields['room']}"
        if fields["chamber"] == "S":
            chamber = "upper"
        else:
            chamber = "lower"

        first_name = fields["first_name"]
        last_name = fields["last_name"]
        person = ScrapePerson(
            name=f"{first_name} {last_name}",
            state="hi",
            chamber=chamber,
            district=fields["district"],
            given_name=first_name,
            family_name=last_name,
            party=party,
            email=fields["email"],
        )

        capitol = person.capitol_office
        capitol.address = address
        capitol.voice = fields["voice"]
        capitol.fax = fields["fax"]

        profile_url = fields["url"]
        person.add_source(profile_url)
        person.add_link(profile_url)
        return person

示例#2

0

显示文件

文件： committees.py 项目： jealob/openstates-scrapers

    def process_item(self, item):
        """Turn a committee list entry into a detail-page request.

        The ``h3`` heading text preceding ``item`` is looked up with a first
        XPath and, if that raises ``SelectorError``, with a second one that
        starts three levels further up.  For the first heading found:

        * "Standing Committees" yields a plain committee;
        * "Appropriations Subcommittees" yields a subcommittee whose parent
          is "Appropriations";
        * any other heading is passed to ``self.skip()``.

        Returns:
            A SenateCommitteeDetail wrapping the new ScrapeCommittee, with the
            committee link's ``href`` as its source.
        """
        try:
            headings = XPath("..//preceding-sibling::h3/text()").match(item)
        except SelectorError:
            headings = XPath("../../..//preceding-sibling::h3/text()").match(item)

        wanted = ("Standing Committees", "Appropriations Subcommittees")
        for heading in headings:
            if heading not in wanted:
                self.skip()
                continue

            name_link = CSS("a").match_one(item)
            name = name_link.text_content()
            source = name_link.get("href")

            if heading != "Standing Committees":
                # The only other accepted heading is the Appropriations one.
                com = ScrapeCommittee(
                    name=name,
                    classification="subcommittee",
                    chamber=self.chamber,
                    parent="Appropriations",
                )
            else:
                com = ScrapeCommittee(name=name, chamber=self.chamber)
            return SenateCommitteeDetail(com, source=source)

示例#3

0

显示文件

    def process_item(self, item):
        """Build a ScrapePerson from one table row and queue its detail page.

        Rows that contain a single ``td``, or whose first ``td`` has the
        class "header", are passed to ``self.skip()``.  For other rows the
        first link gives the name and detail URL, the fourth cell the
        district and the fifth cell a one-letter party code.

        Returns:
            A LegDetail wrapping the new ScrapePerson, sourced from the
            detail URL.
        """
        cells = CSS("td").match(item)

        # skip header rows
        is_header = len(cells) == 1 or cells[0].get("class") == "header"
        if is_header:
            self.skip()

        first_link = CSS("td a").match(item)[0]
        name = first_link.text_content()
        detail_link = first_link.get("href")

        district = cells[3].text_content()
        party_letter = cells[4].text_content()
        # An unknown letter raises KeyError.
        party = {
            "D": "Democratic",
            "R": "Republican",
            "I": "Independent",
        }[party_letter]

        person = ScrapePerson(
            name=name,
            state="il",
            party=party,
            chamber=self.chamber,
            district=district,
        )

        for url in (self.source.url, detail_link):
            person.add_source(url)
        person.add_link(detail_link, note="homepage")

        return LegDetail(person, source=detail_link)

示例#4

0

显示文件

文件： committees.py 项目： jealob/openstates-scrapers

 def process_item(self, item):
     """Create a committee from the first link in ``item``.

     The link text becomes the committee name and its ``href`` is recorded
     as both a source and a "homepage" link.

     Returns:
         A CommitteeDetail wrapping the new ScrapeCommittee, sourced from
         that ``href``.
     """
     anchor = CSS("a").match(item)[0]
     homepage = anchor.get("href")

     committee = ScrapeCommittee(
         name=anchor.text_content(),
         classification="committee",
         chamber=self.chamber,
     )
     committee.add_source(homepage)
     committee.add_link(homepage, note="homepage")

     return CommitteeDetail(committee, source=homepage)

示例#5

0

显示文件

文件： people.py 项目： csnardi/openstates

    def process_page(self):
        """Add district, title and contact extras to the incoming person.

        Works on ``self.input`` in place, reading everything from the
        ``div.hidden-xs.mem-info`` part of ``self.root``:

        * the ``h3`` text is split into a title and a district number; the
          title is stored in ``extras`` unless it is "Representative";
        * the first link gives the assistant's name and mailto address, and
          the second ``p.no-margin`` gives two phone numbers;
        * the last ``small-block`` gives the press contact's name, phone and
          e-mail.

        A regular expression that fails to match raises AttributeError
        because the ``None`` result is used directly.

        Returns:
            The same person object that was passed in as ``self.input``.
        """
        person = self.input

        heading = CSS("div.hidden-xs.mem-info h3").match_one(self.root).text_content()
        heading_match = re.search(r"(.+)\s\|\sDistrict\s(\d+)", heading)
        title, district = heading_match.groups()
        person.district = district
        if title != "Representative":
            person.extras["title"] = title

        assistant = CSS("div.hidden-xs.mem-info a").match(self.root)[0]
        assistant_name = assistant.text_content()
        (assistant_email,) = re.search(r"mailto:(.+)", assistant.get("href")).groups()
        phone_paragraphs = CSS("div.hidden-xs.mem-info p.no-margin").match(self.root)
        assistant_phones = phone_paragraphs[1].text_content()
        phone1, phone2 = re.search(r"Phone:\s(.+)\s\|\s(.+)", assistant_phones).groups()

        person.extras["assistant name"] = assistant_name
        person.extras["assistant email"] = assistant_email
        person.extras["assistant phone1"] = phone1
        person.extras["assistant phone2"] = phone2

        # Name and phone sit in the first two paragraphs of the same block.
        press_paragraphs = CSS(
            "div.hidden-xs.mem-info div.small-block.last p"
        ).match(self.root)
        press_name = press_paragraphs[0].text_content()
        press_phone_line = press_paragraphs[1].text_content()
        (press_phone,) = re.search(r"Phone:\s(.+)", press_phone_line).groups()
        press_link = CSS("div.hidden-xs.mem-info div.small-block.last a").match_one(
            self.root
        )
        press_email = press_link.text_content()

        person.extras["press contact name"] = press_name
        person.extras["press contact phone"] = press_phone
        person.extras["press contact email"] = press_email

        return person

示例#6

0

显示文件

    def process_item(self, item):
        """Create a committee from the single link in ``item``.

        The link text is the committee name; the list page URL and the
        link's ``href`` are both recorded as sources, and the ``href`` is
        also added as a "homepage" link.

        Returns:
            A DetailCommitteePage wrapping the committee, or the bare
            committee for the one detail URL that is special-cased below.
        """
        anchor = CSS("a").match_one(item)
        committee = ScrapeCommittee(
            name=anchor.text_content(),
            chamber=self.chamber,
        )
        homepage = anchor.get("href")

        for url in (self.source.url, homepage):
            committee.add_source(url)
        committee.add_link(homepage, note="homepage")

        # this link has broken html (not able to grab member info)
        # just returning name, chamber, and link
        broken_page = "https://legislature.idaho.gov/sessioninfo/2021/joint/cec/"
        if homepage != broken_page:
            return DetailCommitteePage(committee, source=homepage)
        return committee

示例#7

0

显示文件

文件： people.py 项目： jealob/openstates-scrapers

    def process_page(self):
        """Build a ScrapePerson for an Alabama legislator from a detail page.

        Reads the member name, the label/value table and the photo URL from
        ``self.root``.  The chamber comes from ``self.input.chamber``; both
        ``self.source.url`` and ``self.input.url`` are recorded as sources.

        Returns:
            The populated ScrapePerson.  Contact details go to the capitol
            office when the street contains "11 South Union Street" and to
            the district office otherwise; postal code and county are stored
            in ``extras``.

        Raises:
            ValueError: if ``self.input.chamber`` is neither "upper" nor
                "lower", or if a table row does not hold exactly two cells.
        """
        name = (
            CSS(".container-main #ContentPlaceHolder1_lblMember")
            .match_one(self.root)
            .text_content()
        )

        chamber = self.input.chamber
        if chamber == "upper":
            name_split = re.split("SENATOR|, ", name)
        elif chamber == "lower":
            name_split = re.split("REPRESENTATIVE|, ", name)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on name_split; fail with a clear message.
            raise ValueError(f"unexpected chamber: {chamber!r}")
        # Presumably the label reads "<TITLE> <last>, <first>", so the pieces
        # are swapped to put the given name first -- confirm against the page.
        full_name = name_split[2] + name_split[1]

        table = CSS("#ContentPlaceHolder1_TabSenator_TabLeg_gvLEG").match_one(self.root)

        party = district = county = phone = fax = ""
        street = office = city = postal = email = ""

        # "Democratic" matches the party name used by the other scrapers.
        party_names = {"(R)": "Republican", "(D)": "Democratic"}

        for tr in CSS("tr").match(table):
            # Each row is expected to be exactly a label cell and a value cell.
            label_cell, value_cell = CSS("td").match(tr)
            label = label_cell.text_content()
            value = value_cell.text_content()

            if label == "Affiliation:":
                # Unknown markers are kept verbatim.
                party = party_names.get(value, value)
            elif label == "District:":
                district = value.split(" ")[2]
            elif label == "County:":
                county = value
            elif label == "Phone Number:":
                phone = value
            elif label == "Fax Number:":
                if value != "":
                    fax = value
            elif label == "Street:":
                street = value
            elif label == "Office:":
                office = value
            elif label == "City:":
                city = value
            elif label == "Postal Code:":
                postal = value
            elif label == "Email:":
                email = value

        address = f"{street}, {office}, {city} AL"

        image = (
            CSS("#ContentPlaceHolder1_TabSenator_TabLeg_imgLEG")
            .match_one(self.root)
            .get("src")
        )

        p = ScrapePerson(
            name=full_name.title(),
            state="al",
            chamber=chamber,
            party=party,
            district=district,
            email=email,
            image=image,
        )
        p.add_source(self.source.url)
        p.add_source(self.input.url)

        # This address is the capitol office
        if "11 South Union Street" in street:
            contact = p.capitol_office
        else:
            contact = p.district_office
        contact.address = address
        contact.voice = phone
        try:
            contact.fax = fax
        except ValueError:
            # Best effort: a fax value the office object rejects is dropped.
            pass

        p.extras["postal code"] = postal
        p.extras["county"] = county

        return p