def process_page(self):
    """Scrape a single NC legislator detail page into a ScrapePerson.

    Control flow notes (important — do not "simplify"):
    - Names bound only in some branches (``legislative_assistant``,
      ``occupation``, ``military``) are intentionally left unbound in the
      other branches; the later ``except UnboundLocalError`` blocks use
      that as feature detection for which page layout was seen.
    - ``except IndexError`` around ``address_header[1]`` handles pages
      with only one address header (senators).
    """
    # Pages with a legislative assistant have two mailto links; .match()
    # raises ValueError when the count doesn't fit the 2-tuple unpack,
    # in which case only the member's own email link is present.
    try:
        email, legislative_assistant = XPath(
            "//a[contains(@href, 'mailto')]"
        ).match(self.root)
    except ValueError:
        # Single mailto link: legislative_assistant stays unbound on purpose.
        email = XPath("//a[contains(@href, 'mailto')]").match_one(self.root)
    image = (
        XPath("//img[contains(@src, '/Members/MemberImage')]")
        .match_one(self.root)
        .get("src")
    )
    table = XPath("//div[@class='row mx-md-0']/div").match(self.root)
    # there are different combinations of information the page can have;
    # each branch unpacks label/value cell pairs (labels discarded as __).
    if len(table) == 6:
        # NOTE(review): `assistant` is bound here but never used below —
        # the assistant data actually comes from the mailto match above.
        __, terms, __, main_phone, __, assistant = table
    elif len(table) == 10:
        __, terms, __, occupation, __, main_phone, __, military, __, __ = table
    else:
        # Default layout: occupation present, no military service cell.
        __, terms, __, occupation, __, main_phone, __, __ = table
    p = ScrapePerson(
        name=self.input.name,
        state="nc",
        chamber=self.input.chamber,
        party=self.input.party,
        district=self.input.district,
        email=email.text_content(),
        image=image,
    )
    # Address sections are headed by <h6 class="mt-3"> elements; the first
    # header's two following <p> siblings form the primary address.
    address_header = XPath("//h6[@class='mt-3']").match(self.root)
    address = XPath(".//following-sibling::p").match(address_header[0])
    address = address[0].text_content() + "; " + address[1].text_content()
    main_phone = main_phone.text_content().replace("\r\n", "").strip()
    # representatives have both legislative office addresses and mailing addresses,
    # while senators only have mailing addresses
    try:
        # address_header[1] raises IndexError when only one header exists.
        if address_header[1].text_content() == "Mailing Address:":
            # NOTE(review): this re-matches from address_header[0], so
            # mailing_address duplicates `address` above — possibly meant
            # to be address_header[1]; confirm against a live rep page.
            mailing_address = XPath(".//following-sibling::p").match(
                address_header[0]
            )
            mailing_address = (
                mailing_address[0].text_content()
                + "; "
                + mailing_address[1].text_content()
            )
            p.extras["mailing address"] = mailing_address
            # The capitol office phone is the <p> immediately before the
            # "Mailing Address:" header.
            office_number = (
                XPath(".//preceding-sibling::p[1]")
                .match_one(address_header[1])
                .text_content()
                .replace("\r\n", "")
                .strip()
            )
            # some reps have main phones and capitol office phones,
            # and senators only have capitol office phones
            if office_number != main_phone:
                p.capitol_office.voice = office_number
                p.extras["main phone"] = main_phone
            else:
                p.capitol_office.voice = main_phone
        # NOTE(review): if a second header exists but is not
        # "Mailing Address:", capitol_office.voice is never set here.
    except IndexError:
        # Only one address header (senator layout): main phone is the
        # capitol office phone.
        p.capitol_office.voice = main_phone
    p.capitol_office.address = address
    # Normalize "( N )" spacing to "(N)" in the terms string.
    p.extras["terms in senate"] = (
        terms.text_content().replace("( ", "(").replace(" )", ")")
    )
    p.extras["represented counties"] = self.input.counties
    # Each block below probes an optionally-bound name; UnboundLocalError
    # means the corresponding section was absent from this page layout.
    try:
        p.extras["legislative assistant"] = legislative_assistant.text_content()
        # href is "mailto:<addr>"; take the part after the colon.
        p.extras["legislative assistant email"] = legislative_assistant.get(
            "href"
        ).split(":")[1]
    except UnboundLocalError:
        pass
    try:
        p.extras["occupation"] = occupation.text_content()
    except UnboundLocalError:
        pass
    try:
        p.extras["military experience"] = military.text_content()
    except UnboundLocalError:
        pass
    if self.input.appointment:
        p.extras["appointment date"] = self.input.appointment
    p.add_source(self.source.url)
    p.add_source(self.input.url)
    # Nav-pill links (e.g. committees, votes) become person links.
    for url in XPath(
        "//nav[contains(@class, 'nav nav-pills')]/a[@class='nav-item nav-link']"
    ).match(self.root):
        p.add_link(url.get("href"))
    return p
def process_item(self, item):
    """Scrape one CA Senate roster card into a ScrapePerson.

    Entries whose name does not carry an " (R)"/" (D)" suffix are
    skipped. Parses the capitol office plus any district office lines,
    tolerating one known formatting variant (period instead of
    semicolon between address and phone).
    """
    full_name = XPath(".//h3/text()").match(item)[0]
    # Party is encoded as a suffix on the displayed name.
    if full_name.endswith(" (R)"):
        party = "Republican"
    elif full_name.endswith(" (D)"):
        party = "Democratic"
    else:
        # skip() aborts processing of this roster entry.
        self.skip("skipping " + full_name)
    person_name = full_name.split(" (")[0]

    district_text = XPath(
        './/div[contains(@class, "senator-district")]/div/text()'
    ).match(item)[0]
    # Drop zero-padding from district numbers (e.g. "09" -> "9").
    district = district_text.strip().lstrip("0")

    p = ScrapePerson(
        name=person_name,
        state="ca",
        chamber="upper",
        district=district,
        party=party,
        image=XPath(".//img/@src").match_one(item),
    )

    def _add_district_office(addr, phone):
        # Every office parsed below is a district office.
        p.add_office(
            classification="district",
            address=addr.strip(),
            voice=phone.strip(),
        )

    capitol_cell = XPath(
        ".//div[contains(@class, 'views-field-field-senator-capitol-office')]//p"
    ).match_one(item)
    # Cell text is "address; phone" with non-breaking spaces.
    cap_addr, cap_phone = (
        capitol_cell.text_content().replace("\xa0", " ").split("; ")
    )
    p.capitol_office.address = cap_addr.strip()
    p.capitol_office.voice = cap_phone.strip()

    district_cell = XPath(
        ".//div[contains(@class, 'views-field-field-senator-district-office')]"
    ).match_one(item)
    for line in district_cell.text_content().strip().splitlines():
        try:
            if re.search(r"District Offices?", line):
                # Section heading, not an address line.
                continue
            addr, phone = line.strip().replace("\xa0", " ").split("; ")
            _add_district_office(addr, phone)
        except ValueError:
            # Steven Bradford address/phone separated by period instead of semi-colon
            if re.search(r"\w+\.\s\(\d{3}\)", line):
                addr, phone = line.strip().replace("\xa0", " ").split(". (")
                _add_district_office(addr, "(" + phone)

    p.add_link(XPath(".//a/@href").match(item)[0])
    p.add_source(self.source.url)
    return p