class SenateCommitteeList(HtmlListPage):
    """Scrape the CA Senate committees listing page."""

    source = URL("http://senate.ca.gov/committees")
    selector = XPath("//h2/../following-sibling::div//a")

    def process_item(self, item):
        # The anchor text is the committee name; a couple of anchors are
        # informational links, not committees.
        name = XPath("text()").match_one(item)
        if name in ["Teleconference How-To Information", "Legislative Process"]:
            self.skip()

        url = XPath("@href").match_one(item)

        if name.startswith("Joint"):
            com = ScrapeCommittee(
                name=name, classification="committee", chamber="legislature"
            )
        elif name.startswith("Subcommittee"):
            # The parent committee name is the first child four levels up
            # in the markup.
            container = item.getparent().getparent().getparent().getparent()
            parent_name = container.getchildren()[0].text_content()
            com = ScrapeCommittee(
                name=name,
                classification="subcommittee",
                chamber="upper",
                parent=parent_name,
            )
        else:
            com = ScrapeCommittee(
                name=name, classification="committee", chamber="upper"
            )

        com.add_source(self.source.url)
        com.add_source(url)
        com.add_link(url, note="homepage")
        return ChooseType(com, source=URL(url))
class House(HtmlListPage):
    """Scrape PR House members from the chamber composition page."""

    source = URL(
        "http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/Biografia.aspx"
    )
    selector = CSS("ul.list-article li", num_items=49)

    def process_item(self, item):
        # The bio block lists name / title / district on separate lines.
        lines = (
            CSS("div.biodiv a").match_one(item).text_content().strip().split("\n")
        )
        # Drop the honorific prefix from the name.
        name = re.sub(r"^Hon\.", "", lines[0].strip(), flags=re.IGNORECASE).strip()

        district = lines[2].strip()
        if district == "Representante por Acumulación":
            district = "At-Large"
        else:
            district = re.search(
                r"Representante\sdel\sDistrito\s(.+)", district
            ).groups()[0]

        partial = PartialRep(name=name, district=district, source=self.source.url)
        return RepDetail(partial, source=CSS("a").match_one(item).get("href"))
def process_page(self):
    """Yield a detail-page task for every DE legislator in the JSON list."""
    parties = {"D": "Democratic", "R": "Republican", "I": "Independent"}
    for member in self.data["Data"]:
        p = ScrapePerson(
            name=member["PersonFullName"],
            state="de",
            party=parties[member["PartyCode"]],
            chamber=self.chamber,
            district=member["DistrictNumber"],
        )
        p.add_source(self.source.url)
        detail = URL(
            f"https://legis.delaware.gov/LegislatorDetail?personId={member['PersonId']}"
        )
        p.add_source(detail.url)
        p.add_link(detail.url, note="homepage")
        yield LegDetail(p, source=detail.url)
def process_item(self, item):
    """Build a SC legislator from one member listing row."""
    raw_name = CSS("a.membername").match_one(item).text_content()
    name = re.search(r"(Senator|Representative)\s(.+)", raw_name).groups()[1]

    # Party abbreviation is the tail text after the name link; anything
    # other than (D)/(R) is passed through unchanged.
    party = CSS("a.membername").match_one(item).tail.strip()
    party = {"(D)": "Democratic", "(R)": "Republican"}.get(party, party)

    district_text = CSS("div.district a").match_one(item).text_content().strip()
    district = re.search(r"District\s(.+)", district_text).groups()[0]

    p = ScrapePerson(
        name=name,
        state="sc",
        chamber=self.chamber,
        district=district,
        party=party,
    )

    detail_link = CSS("div.district a").match_one(item).get("href")
    p.add_source(self.source.url)
    p.add_source(detail_link)
    p.add_link(detail_link, note="homepage")
    p.image = CSS("img").match_one(item).get("src")
    return LegDetail(p, source=URL(detail_link, timeout=20))
class EmailAugmentation(HtmlListPage):
    """
    WA Email addresses are listed on a separate page.
    """

    source = URL("https://app.leg.wa.gov/memberemail/Default.aspx")

    def find_rows(self):
        return CSS("#membertable tbody tr").match(self.root, num_items=147)

    def process_page(self):
        # Map cleaned member name -> (email, party, district, position).
        # Position is enough to discriminate the chamber too
        # (0 = upper, 1,2 = lower).
        mapping = {}
        for row in self.find_rows():
            cells = row.getchildren()
            raw_name = CSS("a").match_one(cells[0]).text_content().strip()
            member = re.sub(r"^(Rep\.\s|Senator\s)", "", raw_name)
            mapping[member] = (
                cells[1].text_content().strip(),  # email
                cells[4].text_content().strip(),  # party
                cells[2].text_content().strip(),  # district
                cells[3].text_content().strip(),  # position
            )
        return mapping
class PartyAugmentation(HtmlPage):
    """
    NY Assembly does not have partisan information on their site.
    In the past we scraped NYBOE, but that broke.  This is our best option
    besides hard-coding... and it isn't good.
    """

    source = URL("https://en.wikipedia.org/wiki/New_York_State_Assembly")

    def find_rows(self):
        # The member table is the first wikitable with ~150 rows.
        for table in CSS("table.wikitable").match(self.root):
            rows = CSS("tr").match(table)
            if len(rows) >= 150:
                return rows

    def process_page(self):
        # district -> (name, party)
        mapping = {}
        for row in self.find_rows():
            cells = row.getchildren()
            dist = cells[0].text_content().strip()
            name = cells[1].text_content().strip()
            party = cells[2].text_content().strip()
            # Strip Wikipedia footnote markers like "[a]".
            if "[" in party:
                party = party.split("[")[0]
            mapping[dist] = (name, party)
        return mapping
class SenList(HtmlListPage):
    """Scrape the AR Senate committees list.

    The page mixes Senate, Joint, and Task Force sections under headings;
    only Senate committees are kept here (joint ones are scraped elsewhere).
    """

    source = URL("https://senate.arkansas.gov/senators/committees/")
    selector = CSS("ins > ul > li", num_items=45)

    def process_item(self, item):
        comm_name = CSS("a").match(item)[0].text_content().strip()

        # Walk back to the nearest childless preceding sibling: its text is
        # the section heading naming the committee type.
        chamber_type = None
        for sib in item.getparent().itersiblings(preceding=True):
            if len(sib.getchildren()) == 0:
                chamber_type = sib.text_content().strip()
                break

        if chamber_type == "Senate Committees":
            chamber = "upper"
        else:
            # "Joint Committees", "Task Forces", or an unrecognized/missing
            # heading: not a Senate committee, skip it.  (Previously an
            # unknown heading crashed with an unbound `chamber` NameError.)
            self.skip()

        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber=chamber,
        )
        detail_link = CSS("a").match(item)[0].get("href")
        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")
        return SenDetail(com, source=detail_link)
class JointSubComms(HtmlListPage):
    """Scrape AR joint subcommittees."""

    source = URL("https://www.arkleg.state.ar.us/Committees/List?type=Joint")
    selector = CSS("div#bodyContent li a", num_items=31)

    def process_item(self, item):
        sub_name = item.text_content().strip()

        # The parent committee name is the first child of the
        # great-great-grandparent container.
        container = item.getparent().getparent().getparent().getparent()
        parent = container.getchildren()[0].text_content().strip()

        # Budget hearings are not a real subcommittee.
        if parent.title() == "Alc-Jbc Budget Hearings":
            self.skip()

        com = ScrapeCommittee(
            name=sub_name.title(),
            classification="subcommittee",
            chamber="legislature",
            parent=parent.title(),
        )
        detail_link = item.get("href")
        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")
        return HouseJointDetail(com, source=detail_link)
class CouncilList(HtmlListPage):
    """Scrape DC council members."""

    source = URL("http://dccouncil.us/councilmembers/")
    selector = CSS("li.column", num_items=14)

    def process_item(self, item):
        # Not every card has an <h3> title.
        try:
            title = CSS("h3").match_one(item).text_content()
        except SelectorError:
            title = None
        # this member is listed twice. skip the 1st time
        if title == "Chair Pro Tempore":
            self.skip()

        partial_p = PartialPerson(
            state="dc",
            chamber="legislature",
            image=CSS("img").match_one(item).get("src"),
            source1=self.source.url,
            source2=CSS("a").match(item)[1].get("href"),
            link=CSS("a").match(item)[1].get("href"),
        )
        return CouncilDetail(partial_p, source=CSS("a").match(item)[1].get("href"))
class Representatives(HtmlListPage):
    """Scrape MO House members from the member grid."""

    # note: there is a CSV, but it requires a bunch of ASP.net hoops to actually get
    source = URL(
        "https://house.mo.gov/MemberGridCluster.aspx?year=2021&code=R+&filter=clear"
    )
    selector = CSS("tr")

    def process_item(self, item):
        cells = CSS("td").match(item, min_items=0, max_items=8)
        # Header rows have no <td> cells.
        if not cells:
            self.skip()
        _, last, first, district, party, town, phone, room = cells
        if last.text_content() == "Vacant":
            self.skip()
        partial = HousePartial(
            last_name=last.text_content(),
            first_name=first.text_content(),
            district=int(district.text_content()),
            party=party.text_content(),
            hometown=town.text_content().strip(),
            voice=phone.text_content(),
            room=room.text_content(),
            url=CSS("a").match_one(last).get("href"),
        )
        return HouseDetail(partial)
class RepublicanHouse(RedRepList):
    # Caucus roster page; ?pos=0,100,100 looks like a paging querystring
    # that returns up to 100 members — TODO confirm.
    source = URL(
        "https://www.indianahouserepublicans.com/members/?pos=0,100,100",
        timeout=10)
    # Member count fluctuates between sessions, so accept a loose range.
    selector = CSS("div.member-list a", min_items=60, max_items=100)
    chamber = "lower"
    party = "Republican"
def process_item(self, item):
    """Build a CA Assembly committee from one listing link."""
    comm_name = CSS("a").match_one(item).text_content()
    comm_url = CSS("a").match_one(item).get("href")

    # "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst" has no members
    if comm_url == "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst":
        self.skip()

    # Joint Committees are being skipped to avoid duplicates (they were
    # already grabbed during SenateCommitteeList())
    if comm_name.startswith(("Joint Committee", "Joint Legislative")):
        self.skip()
    elif comm_name.startswith("Subcommittee"):
        # Parent committee name is the first child two levels up.
        parent_comm = (
            item.getparent().getparent().getchildren()[0].text_content()
        )
        com = ScrapeCommittee(
            name=comm_name,
            classification="subcommittee",
            chamber="lower",
            parent=parent_comm,
        )
    else:
        com = ScrapeCommittee(
            name=comm_name, classification="committee", chamber="lower"
        )

    com.add_source(self.source.url)
    com.add_source(comm_url)
    com.add_link(comm_url, note="homepage")
    return ChooseType(com, source=URL(comm_url))
def process_item(self, item):
    """Build a PA legislator from one member listing entry."""
    # Name is listed "Last, First"; flip it around.
    parts = CSS("a").match_one(item).text_content().strip().split(", ")
    name = f"{parts[1]} {parts[0]}"

    district_text = CSS("br").match(item)[-1].tail.strip()
    district = re.search(r"District\s(.+)", district_text).groups()[0]

    # Anything other than (D)/(R)/(I) is passed through unchanged.
    party = CSS("b").match_one(item).tail.strip()
    party = {
        "(D)": "Democratic",
        "(R)": "Republican",
        "(I)": "Independent",
    }.get(party, party)

    p = ScrapePerson(
        name=name,
        state="pa",
        chamber=self.chamber,
        district=district,
        party=party,
    )
    detail_link = CSS("a").match_one(item).get("href")
    p.add_source(self.source.url)
    p.add_source(detail_link)
    p.add_link(detail_link, note="homepage")
    return LegDetail(p, source=URL(detail_link, timeout=10))
def process_item(self, item):
    """Follow only the Standing and Statutory committee-type links."""
    if item.text_content() not in ("Standing", "Statutory"):
        self.skip()
    return SenateTypeCommitteeList(source=URL(item.get("href"), timeout=30))
def process_item(self, item):
    """Build a CA Senate committee from one listing link."""
    comm_name = XPath("text()").match_one(item)

    # Informational links on the page, not committees.
    if comm_name in ["Teleconference How-To Information", "Legislative Process"]:
        self.skip()

    comm_url = XPath("@href").match_one(item)

    if comm_name.startswith("Joint"):
        com = ScrapeCommittee(
            name=comm_name, classification="committee", chamber="legislature"
        )
    elif comm_name.startswith("Subcommittee"):
        # Parent committee name is the first child four levels up.
        ancestor = item.getparent().getparent().getparent().getparent()
        com = ScrapeCommittee(
            name=comm_name,
            classification="subcommittee",
            chamber="upper",
            parent=ancestor.getchildren()[0].text_content(),
        )
    else:
        com = ScrapeCommittee(
            name=comm_name, classification="committee", chamber="upper"
        )

    com.add_source(self.source.url)
    com.add_source(comm_url)
    com.add_link(comm_url, note="homepage")
    return ChooseType(com, source=URL(comm_url))
class CommitteeList(JsonListPage):
    """Scrape the GA committee list from the legislature's JSON API."""

    source = URL(
        "https://www.legis.ga.gov/api/committees/List/1029",
        headers={"Authorization": get_token()},
    )

    # API chamber codes observed in the feed: 1 = House, 2 = Senate.
    chambers = {2: "upper", 1: "lower"}

    def process_item(self, item):
        try:
            chamber = self.chambers[item["chamber"]]
        except KeyError:
            # Previously an unknown code left `chamber` unbound and crashed
            # with a NameError; raise a meaningful error instead.
            raise ValueError(f"unexpected chamber code: {item['chamber']}")

        source = URL(
            f"https://www.legis.ga.gov/api/committees/details/{item['id']}/1029",
            headers={"Authorization": get_token()},
        )
        com = ScrapeCommittee(
            name=item["name"],
            chamber=chamber,
        )
        com.add_source(
            self.source.url,
            note="Initial list page (requires authorization token)",
        )
        return CommitteeDetail(
            com,
            source=source,
        )
def process_item(self, item):
    """Build a MO House committee from one listing link."""
    committee_name = item.text_content()

    # only scrape joint coms on senate scrape
    if any(
        marker in committee_name
        for marker in ("Joint", "Task Force", "Conference")
    ):
        self.skip()

    committee_name = remove_comm(committee_name).strip()

    if "Subcommittee" in committee_name:
        name = committee_name.replace("Subcommittee on ", "").replace(
            ", Subcommittee", ""
        )
        parent_el = XPath("..//..//preceding-sibling::a").match(item)[0]
        com = ScrapeCommittee(
            name=name,
            chamber=self.chamber,
            classification="subcommittee",
            parent=remove_comm(parent_el.text_content()),
        )
    else:
        com = ScrapeCommittee(name=committee_name, chamber=self.chamber)

    # We can construct a URL that would make scraping easier, as opposed to the link that is directly given
    comm_link = item.get("href").replace("https://www.house.mo.gov/", "")
    source = f"https://www.house.mo.gov/MemberGridCluster.aspx?filter=compage&category=committee&{comm_link}"
    return HouseCommitteeDetail(com, source=URL(source, timeout=30))
class DemocraticSenate(BlueSenList):
    # Caucus roster page for Indiana Senate Democrats.
    source = URL("https://www.indianasenatedemocrats.org/senators/")
    # One <li> per senator; exact count pinned so roster changes fail loudly.
    selector = CSS(
        "article ul li",
        num_items=11,
    )
    chamber = "upper"
    party = "Democratic"
def test_html_page():
    """HtmlPage.postprocess_response parses the body and absolutizes links."""
    page = HtmlPage(source=URL(SOURCE))
    page.response = Response(b"<html><a href='/test'>link</a></html>")
    page.postprocess_response()
    # postprocessing must have populated page.root
    anchor = page.root.xpath("//a")[0]
    # relative hrefs are normalized against example.com
    assert anchor.get("href") == "https://example.com/test"
def process_item(self, item):
    """Build a GA legislator from one API member record and chain to detail."""
    # chamberType discriminates House/Senate via the class's lookup tables
    # (chamber_types / chamber_names, defined on the class elsewhere).
    chamber_id = item["district"]["chamberType"]
    p = ScrapePerson(
        state="ga",
        chamber=self.chamber_types[chamber_id],
        district=str(item["district"]["number"]),
        name=item["fullName"],
        family_name=item["name"]["familyName"],
        given_name=item["name"]["first"],
        suffix=item["name"]["suffix"] or "",
        party=self.party_ids[item["party"]],
    )
    # district address
    da = item["districtAddress"]
    if da["email"]:
        p.email = da["email"]
    if da["phone"]:
        p.district_office.voice = da["phone"]
    if da["fax"]:
        p.district_office.fax = da["fax"]
    if da["address1"]:
        # Build "addr1; addr2; city, state zip" — city/state/zip are only
        # appended when address1 exists.
        p.district_office.address = da["address1"]
        if da["address2"]:
            p.district_office.address += "; " + da["address2"]
        p.district_office.address += "; {city}, {state} {zip}".format(**da)
        p.district_office.address = p.district_office.address.strip()
    # photos
    if not item["photos"]:
        pass
    elif len(item["photos"]) == 1:
        p.image = item["photos"][0]["url"].split("?")[
            0]  # strip off ?size=mpSm for full size
    else:
        # More than one photo is unexpected; fail loudly so the scraper
        # is updated rather than silently picking one.
        raise Exception("unknown photos configuration: " + str(item["photos"]))
    # extras
    p.extras["residence"] = item["residence"]
    p.extras["city"] = item["city"].strip()
    p.extras["georgia_id"] = item["id"]
    # Human-facing profile URL recorded as the source; the API detail URL
    # below (token-authenticated) is what actually gets scraped next.
    url = (
        f"https://www.legis.ga.gov/members/{self.chamber_names[chamber_id]}/"
        f"{item['id']}?session={item['sessionId']}")
    p.add_source(url, note="Initial list page (requires authorization token)")
    source = URL(
        f"https://www.legis.ga.gov/api/members/detail/{item['id']}?session=1029&chamber={chamber_id}",
        headers={"Authorization": get_token()},
    )
    return LegDetail(p, source=source)
def process_page(self):
    """Yield a member-detail task for every KS House and Senate member."""
    members = (
        self.data["content"]["house_members"]
        + self.data["content"]["senate_members"]
    )
    for member in members:
        # source is a URL object, we need the .url member
        detail_url = self.source.url + member["KPID"] + "/"
        yield MembersDetail(source=URL(detail_url, timeout=10))
def process_item(self, item):
    """Build a committee from its listing link and chain to its detail page."""
    link = CSS("a").match(item)[0]
    com = ScrapeCommittee(
        name=link.text_content(),
        classification="committee",
        chamber=self.chamber,
    )
    homepage = link.get("href")
    com.add_source(homepage)
    com.add_link(homepage, "homepage")
    return HouseCommitteeDetail(com, source=URL(homepage, timeout=30))
def process_item(self, item):
    """Chain to a member detail page, skipping members who have left office."""
    name = item.text
    # Drop rows annotated as no longer serving.
    if any(word in name.lower() for word in ("resigned", "vacated", "retired")):
        return
    name, action, date = clean_name(name)
    url = item.get("href")
    return self.next_page_cls(
        PartialMember(name=name, url=url),
        source=URL(url, timeout=10),
    )
class CommitteeList(HtmlListPage):
    """Scrape the AK committee list (House, Senate, and Joint sections)."""

    source = URL("http://www.akleg.gov/basis/Committee/List/32")
    selector = CSS("div.area-frame ul.list li", num_items=112)

    def process_item(self, item):
        comm_name = (
            item.text_content().strip().split(" (")[0].title().replace(
                "(Fin Sub)", "")
        )
        if "Conference" in comm_name:
            self.skip()

        # Section heading two siblings back names the chamber; an
        # unrecognized heading passes through as-is.
        heading = (
            item.getparent().getprevious().getprevious().text_content().strip()
        )
        chamber = {
            "House": "lower",
            "Senate": "upper",
            "Joint Committee": "legislature",
        }.get(heading, heading)

        classification = item.getparent().getprevious().text_content().strip()
        kwargs = {"name": comm_name, "chamber": chamber}
        if classification == "Finance Subcommittee":
            # work around duplicate name of Judiciary committees
            # a current limitation in how Open States can handle committees
            # see https://github.com/openstates/issues/issues/598
            if comm_name == "Judiciary":
                kwargs["name"] = "Judiciary (Finance)"
            kwargs.update(classification="subcommittee", parent="Finance")
        else:
            kwargs["classification"] = "committee"
        com = ScrapeCommittee(**kwargs)

        detail_link = CSS("a").match_one(item).get("href")
        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")
        return CommiteeDetail(com, source=URL(detail_link, timeout=30))
class CommitteeList(HtmlListPage):
    """Scrape DC council committees for council period 23."""

    source = URL("https://dccouncil.us/committees-for-council-period-23/")
    selector = CSS("div ul li div")
    chamber = "legislature"

    def process_item(self, item):
        anchor = CSS("a").match(item)[0]
        homepage = anchor.get("href")
        com = ScrapeCommittee(
            name=anchor.text_content(),
            classification="committee",
            chamber=self.chamber,
        )
        com.add_source(homepage)
        com.add_link(homepage, note="homepage")
        return CommitteeDetail(com, source=homepage)
class Legislators(JsonListPage):
    """Scrape SD legislators from the session-members API."""

    source = URL("https://sdlegislature.gov/api/SessionMembers/Session/44")

    def process_item(self, item):
        first = item["FirstName"]
        last = item["LastName"]
        initial = item["Initial"]
        # V. J. puts his initials as his first name
        if initial and first != "V. J.":
            name = f"{first} {initial}. {last}"
        else:
            name = f"{first} {last}"

        p = ScrapePerson(
            name=name,
            family_name=last,
            given_name=first,
            state="sd",
            district=item["District"].lstrip("0"),
            chamber="upper" if item["MemberType"] == "S" else "lower",
            party=item["Politics"],
            email=item["EmailState"] or "",
            image=(
                "https://lawmakerdocuments.blob.core.usgovcloudapi.net/photos/"
                + item["Picture"].lower()
            ),
        )

        # Home address: "addr1[; addr2]; city, state zip"
        address_parts = [item["HomeAddress1"]]
        if item["HomeAddress2"]:
            address_parts.append(item["HomeAddress2"])
        address_parts.append(
            f"{item['HomeCity']}, {item['HomeState']} {item['HomeZip']}"
        )
        p.district_office.address = "; ".join(address_parts)
        p.district_office.voice = item["HomePhone"] or ""
        p.capitol_office.voice = item["CapitolPhone"] or ""
        p.extras["occupation"] = item["Occupation"]

        url = f"https://sdlegislature.gov/Legislators/Profile/{item['SessionMemberId']}/Detail"
        p.add_link(url)
        p.add_source(url)
        return p
class Senate(HtmlListPage):
    """Scrape PR senators."""

    source = URL("https://senado.pr.gov/Pages/Senadores.aspx")
    selector = CSS("ul.senadores-list li", num_items=27)

    def process_item(self, item):
        # Convert names to title case as they are in all-caps
        raw = CSS("span.name").match_one(item).text_content().strip()
        name = re.sub(r"^Hon\.", "", raw, flags=re.IGNORECASE).strip().title()

        party = CSS("span.partido").match_one(item).text_content().strip()
        # Translate to English since being an Independent is a universal construct
        if party == "Independiente":
            party = "Independent"

        partial = PartialSen(name=name, party=party, source=self.source.url)
        return SenDetail(partial, source=CSS("a").match_one(item).get("href"))
class Legislators(XmlListPage):
    # BASIS session number; interpolated into the source URL below and
    # into per-member detail links.
    session_num = "32"
    source = URL(
        "http://www.legis.state.ak.us/publicservice/basis/members"
        f"?minifyresult=false&session={session_num}",
        headers={"X-Alaska-Legislature-Basis-Version": "1.4"},
    )
    selector = XPath("//Member/MemberDetails")

    def process_item(self, item):
        """Build an AK legislator from one <MemberDetails> XML element."""
        # Pull each tag named in ELEMENTS (module-level list, defined
        # elsewhere); _get_if_exists presumably returns None for missing
        # tags — TODO confirm.
        item_dict = {elem: _get_if_exists(item, elem) for elem in ELEMENTS}
        chamber = item.attrib["chamber"]
        code = item.attrib["code"].lower()
        party = item_dict["Party"]
        if party == "N":
            # "N" presumably means nonpartisan/no party — mapped to Independent.
            party = "Independent"
        person = ScrapePerson(
            name="{FirstName} {LastName}".format(**item_dict),
            given_name=item_dict["FirstName"],
            family_name=item_dict["LastName"],
            state="ak",
            party=party,
            chamber=("upper" if chamber == "S" else "lower"),
            district=item_dict["District"],
            image=f"http://akleg.gov/images/legislators/{code}.jpg",
            email=item_dict["EMail"],
        )
        person.add_link(
            "http://www.akleg.gov/basis/Member/Detail/{}?code={}".format(
                self.session_num, code))
        person.add_source("http://w3.akleg.gov/")
        if item_dict["Phone"]:
            # Phone arrives as 7 digits; prepend Alaska's statewide 907
            # area code and hyphenate.
            phone = "907-" + item_dict["Phone"][0:3] + "-" + item_dict[
                "Phone"][3:]
            person.capitol_office.voice = phone
        if item_dict["Building"] == "CAPITOL":
            person.capitol_office.address = (
                "State Capitol Room {}; Juneau, AK, 99801".format(
                    item_dict["Room"]))
        return person
class SenateCommittee(HtmlListPage):
    """Scrape NH Senate committees."""

    source = URL(
        "http://www.gencourt.state.nh.us/Senate/committees/senate_committees.aspx",
        timeout=30,
    )
    chamber = "upper"
    selector = CSS("#form1 div h5")

    def process_item(self, item):
        anchor = CSS("a").match(item)[0]
        homepage = anchor.get("href")
        com = ScrapeCommittee(
            name=anchor.text_content(),
            classification="committee",
            chamber=self.chamber,
        )
        com.add_source(homepage)
        com.add_link(homepage, "homepage")
        return SenateCommitteeDetail(com, source=URL(homepage, timeout=30))
class LegList(HtmlListPage):
    """Scrape ND legislators from the members-by-district listing."""

    source = URL(
        "https://www.legis.nd.gov/assembly/67-2021/members/members-by-district"
    )
    selector = CSS("div.view-content > div", num_items=142)

    def process_item(self, item):
        raw_name = CSS("div.name").match_one(item).text_content().strip()
        name = re.search(r"(Senator|Representative)\s(.+)", raw_name).groups()[1]
        # Luke Simons was expelled on 3/4/21
        if name == "Luke Simons":
            self.skip()

        chamber = CSS("div.chamber").match_one(item).text_content().strip()
        chamber = {"Senate": "upper", "House": "lower"}.get(chamber, chamber)

        # District number comes from the nearest preceding "title" heading.
        for heading in item.itersiblings(preceding=True):
            if heading.get("class") == "title":
                district = re.search(
                    r"District\s(.+)", heading.text_content().strip()
                ).groups()[0]
                break

        party = CSS("div.party").match_one(item).text_content().strip()
        if party == "Democrat":
            party = "Democratic"

        p = ScrapePerson(
            name=name,
            state="nd",
            chamber=chamber,
            district=district,
            party=party,
        )
        detail_link = CSS("div.name a").match_one(item).get("href")
        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")
        return LegDetail(p, source=detail_link)