Example #1
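Scrapes a city council member listing (MEMBER_LIST): each member's photo, name, and role are pulled from the listing table, and the member's homepage is then scraped for a biography and, when available, a better photo. Assumes the class provides lxmlize, get_one, and scrape_homepage helpers and that clean_name is defined elsewhere.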
    def bos_scrape_people(self):
        page = self.lxmlize(MEMBER_LIST)
        people = page.xpath(
            "//table[@width='100%']//td[@style='TEXT-ALIGN: center']")

        for person in people:
            image, name = [
                self.get_one(person, x) for x in [
                    ".//img",
                    ".//a[contains(@href, 'councillors') and (text()!='')]"
                ]
            ]
            role = person.xpath(".//br")[0].tail.strip()
            # Fallback image; replaced if the homepage provides one.
            image = image.attrib['src']
            homepage = name.attrib['href']
            name = clean_name(name.text)
            info = self.scrape_homepage(homepage)
            if info.get('image', None):
                image = info['image']

            p = Legislator(name=name,
                           post_id=role,
                           image=image,
                           biography=info['bio'])
            p.add_link(homepage, 'homepage')
            p.add_source(homepage)
            p.add_source(MEMBER_LIST)
            yield p
Example #2
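Scrapes the Albuquerque council roster. The listing is a flat run of elements, so the code zips one iterator with itself to consume it three elements at a time, then follows each member's info page for a photo and a biography.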
    def _scrape_people(self):
        url = 'http://www.cabq.gov/council/councilors'
        page = self.lxmlize(url)
        names = page.xpath("//div[@id='parent-fieldname-text']/*")[3:]
        # Zip one iterator with itself to consume the flat element list
        # three elements at a time; the third element of each group is unused.
        it = iter(names)
        for entry in zip(it, it, it):
            name, info, _ = entry
            image_small = name.xpath(".//img")[0].attrib['src']
            name = name.text_content()
            infopage, email, policy_analyst = info.xpath(".//a")
            phone = info.xpath(".//b")[-1].tail.strip()
            district = infopage.text_content()
            homepage = self.lxmlize(infopage.attrib['href'])
            photo = homepage.xpath(
                "//div[@class='featureContent']//img")[0].attrib['src']

            bio = "\n".join((x.text_content() for x in homepage.xpath(
                "//div[@class='featureContent']//div[@class='stx']/p")))

            p = Legislator(name=name,
                           district=district,
                           image=photo,
                           biography=bio)

            p.add_source(url)
            p.add_source(infopage.attrib['href'])
            yield p
Example #3
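Builds Committee and Legislator objects from a committee table whose columns are role, members, and term expiry. Each member string bundles a name with an address and an optional chair annotation, which are pulled apart with regular expressions and a small role lookup table. Assumes re is imported and clean_address is defined elsewhere.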
    def scrape(self):
        page = self.lxmlize(MEMBER_LIST)
        for row in page.xpath("//table[@frame='void']/tbody/tr")[1:]:
            role, whos, expire = row.xpath("./*")
            people = zip([x.text_content() for x in whos.xpath(".//font")],
                         [x.text_content() for x in expire.xpath(".//font")])
            thing = role.text_content()

            comm = Committee(name=thing)
            url = role.xpath(".//a")[0].attrib['href']
            comm.add_link(url=url, note='homepage')

            for person, expire in people:
                if "TBA" in person:
                    continue
                info = {}

                try:
                    info = re.match(r"(?P<name>.*), (?P<addr>\d+\w* .*)",
                                    person).groupdict()
                except AttributeError:
                    # No comma between name and address; fall back to a
                    # space-separated match.
                    info = re.match(r"(?P<name>.*) (?P<addr>\d+\w* .*)",
                                    person).groupdict()

                addr = info['addr']

                roles = {"Vice Chair": "Vice Chair",
                         "Chair": "Chair",
                         "CHAIR": "Chair",
                         "Appt": "member",}

                position = "member"

                if "Resigned" in addr:
                    continue

                for role in roles:
                    if role in addr:
                        addr, chair = [x.strip() for x in addr.rsplit(role, 1)]
                        position = roles[role]

                addr = clean_address(addr)
                leg = Legislator(name=info['name'], district=position)
                leg.add_contact_detail(type="address",
                                       value=addr,
                                       note="Address")
                leg.add_source(MEMBER_LIST)
                # Attach the membership before yielding so consumers of the
                # generator see the complete object.
                leg.add_membership(comm)
                yield leg
            comm.add_source(MEMBER_LIST)
            yield comm
Example #4
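The minimal pattern: fetch the page, parse it with lxml.html (assumed to be imported at module level), and zip the parallel 'biotitle' and 'bioname' divs into Legislator objects.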
    def get_people(self):

        html = self.urlopen(self.url)
        doc = lxml.html.fromstring(html)

        title_xpath = '//div[contains(@class, "biotitle")]'
        name_xpath = '//div[contains(@class, "bioname")]'
        for title, name in zip(doc.xpath(title_xpath), doc.xpath(name_xpath)):
            name = name.text_content().strip()
            title = title.text_content().strip()
            p = Legislator(name=name, post_id=title)
            p.add_source(self.url)
            yield p
Example #5
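Scrapes the Cleveland council member table. The INFOSLUG regex (defined elsewhere) extracts the district and gender from each role string; each member's detail page, when linked, is scraped for a biography and committee memberships, with 'Chair'/'Vice Chair' suffixes split off the committee names.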
    def cleveland_scrape_people(self):
        listing = "http://www.clevelandcitycouncil.org/council-members/"
        page = self.lxmlize(listing)

        table = page.xpath("//div[@class='standard-content column']//table")[0]
        for person in table.xpath(".//td[@align='center']"):
            strong = person.xpath(".//strong")[0]
            who = strong.text.strip()
            role = strong.xpath("./br")[0].tail.strip()
            img = person.xpath(".//img")[0].attrib['src']
            info = INFOSLUG.match(role).groupdict()

            scraped_info = {}
            page = person.xpath(".//a")
            if page:
                page = page[0].attrib['href']
                scraped_info = self.scrape_page(page)

            kwargs = {}
            biography = scraped_info.get('bio', None)
            if biography:
                kwargs['biography'] = biography

            p = Legislator(name=who,
                           post_id=info['district'],
                           gender=info['gender'],
                           image=img,
                           **kwargs)
            p.add_source(listing)

            valid_titles = ["Chair", "Vice Chair"]

            for what in scraped_info.get('committees', []):
                what = what.strip()
                if what == "":
                    continue

                role = "member"
                if "-" in what:
                    c, title = (x.strip() for x in what.rsplit("-", 1))
                    if title in valid_titles:
                        what = c
                        role = title
                p.add_committee_membership(what, role=role)
            yield p
Example #6
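Scrapes a single ward page: an 'Alderman' prefix is stripped from the name, and the labelled rows of the contact table are mapped to typed contact details through the type_types lookup, with cells that pack several labels and values split on newlines. Note that a label missing from the lookup raises a KeyError.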
    def scrape_ward(self, el):
        url = el.attrib['href']
        page = self.lxmlize(url)
        name = page.xpath("//div[@id='content-content']/h3")[0].text_content()
        badthings = ["Alderman"]
        for thing in badthings:
            if name.startswith(thing):
                name = name[len(thing):].strip()

        district = page.xpath("//h1[@class='page-heading']/text()")[0]
        leg = Legislator(name=name, post_id=district)
        leg.add_source(url)

        type_types = {
            "City Hall Office:": ("address", "City Hall Office"),
            "City Hall Phone:": ("phone", "City Hall Phone"),
            "Phone:": ("phone", "Personal Phone"),
            "Office:": ("address", "Personal Office"),
            "Fax:": ("fax", "Fax"),
            "Fax": ("fax", "Fax"),
        }

        for row in page.xpath("//table//tr"):
            type_, val = (x.text_content().strip() for x in row.xpath("./td"))
            if val == "":
                continue

            types = [type_]
            vals = [val]

            if "\n" in type_:
                if "\n" in val:
                    types = type_.split("\n")
                    vals = val.split("\n")
                else:
                    continue

            for type_ in types:
                for val in vals:
                    ctype, note = type_types[type_]
                    leg.add_contact(ctype, val, note)

        return leg
Example #7
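Scrapes one member's homepage (an ASP.NET page, judging by the ctl00_... container ids) for a photo, a name, and a paragraph-by-paragraph biography.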
    def scrape_homepage(self, folk):
        url = folk.attrib['href']
        page = self.lxmlize(url)
        image = page.xpath(
            "//img[contains(@src, 'uploadedImages/City_Council/Members/')]"
        )[0].attrib['src']

        name = page.xpath("//div[@id='ctl00_ctl00_Body_body_cntCommon']/h3")
        name, = name  # exactly one heading is expected

        bio = "\n\n".join([
            x.text_content() for x in page.xpath(
                "//div[@id='ctl00_ctl00_Body_body_cntCommon']/p")
        ])

        leg = Legislator(name=name.text,
                         post_id='member',
                         biography=bio,
                         image=image)
        leg.add_source(url)
        return leg
Example #8
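Scrapes the New York City council member table (MEMBER_PAGE). Each row holds name, district, borough, and party cells; the member's homepage is also scraped (its result is currently unused) and recorded as a source, and an empty party string falls back to 'other'.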
    def nyc_scrape_people(self):
        page = self.lxmlize(MEMBER_PAGE)
        for entry in page.xpath("//table[@id='members_table']//tr"):
            entries = entry.xpath(".//td")
            if entries == []:
                continue

            name, district, borough, party = entries
            name = name.xpath(".//a")[0]
            homepage = name.attrib['href']
            name, district, borough, party = [
                x.text for x in [name, district, borough, party]
            ]

            info = self.scrape_homepage(homepage)
            p = Legislator(
                name=name,
                post_id=district,
                # borough=borough,
                party=party.strip() or "other")
            p.add_link(homepage, 'homepage')
            p.add_source(homepage)
            p.add_source(MEMBER_PAGE)
            yield p
Example #9
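A stub scraper over a hard-coded list of people; each district is slugified to build an office email address. The address template was redacted in the source, so the value in the snippet below is a hypothetical placeholder.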
    def get_people(self):
        people = [
            {
                "name": "Mckenzie A. Cannon",
                "district": "10a",
            },
            {
                "name": "Yandel V. Watkins",
                "district": "Second Fnord and Norfolk",
            },
            {
                "name": "Adrien A. Coffey",
                "district": "A",
            },
            {
                "district": "10c",
                "name": "Natasha Moon",
            },
            {
                "district": "Berkshire, Hampshire, Franklin and Hampden",
                "name": "Ramon Harmon",
            },
            {
                "district": "5",
                "name": "Sam Sellers",
            },
            {
                "district": "6",
                "name": "Estrella Hahn",
            },
            {
                "district": "B",
                "name": "Teagan Rojas",
            },
            {
                "district": "C",
                "name": "Barrett Adams",
            },
            {
                "district": "D",
                "name": "Kayla Shelton",
            },
            {
                "district": "E",
                "name": "Kohen Dudley",
            },
            {
                "district": "F",
                "name": "Cayden Norman",
            },
            {
                "district": "ZZ",
                "name": "Shayla Fritz",
            },
            {
                "district": "Ward 2",
                "name": "Gunnar Luna",
            },
            {
                "district": "Green",
                "name": "Regina Cruz",
            },
            {
                "district": "Blue",
                "name": "Makenzie Keller",
            },
            {
                "district": "Red",
                "name": "Eliana Meyer",
            },
            {
                "district": "Yellow",
                "name": "Taylor Parrish",
            },
            {
                "district": "Silver",
                "name": "Callie Craig",
            },
        ]

        for person in people:
            l = Legislator(**person)
            l.add_source("http://example.com")
            dslug = (person['district'].lower().replace(" ",
                                                        "-").replace(",", ""))
            l.add_contact_detail(type='email',
                                 value="*****@*****.**" % (dslug),
                                 note='office email')
            yield l