Exemplo n.º 1
0
    def scrape(self, chamber, term):
        """Scrape committee rosters from the Alberta Assembly membership list.

        Committees appear as repeating triples of <div>s (name, spacer,
        content); "committeetype_header" divs are filtered out first.
        """
        url = "http://www.assembly.ab.ca/net/index.aspx?p=membership_list"
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        committees = doc.xpath('//div[@id="_ctl0_Panel_committees"]')
        # Drop header divs; the rest come in (name, spacer, content) triples.
        divs = [
            div for div in committees[0].xpath("div")[1:]
            if div.attrib.get("class") != "committeetype_header"
        ]
        divs = iter(divs)

        while True:
            try:
                # Unpacking the slice raises ValueError when fewer than
                # three items remain.
                name, _, content = itertools.islice(divs, 3)
            except (ValueError, StopIteration):
                # BUG FIX: the old ``except ValueError, StopIteration`` was
                # Python 2 syntax that caught only ValueError and rebound
                # the builtin StopIteration name.
                break

            # Drop the first four characters of the heading (site boilerplate).
            committee_name = name.text_content()[4:]
            committee = Committee("lower", committee_name)
            for td in content.xpath("table/descendant::td"):
                # Member cells link to a "number" (member detail) URL.
                if td.xpath('a[contains(@href, "number")]'):
                    name = td.xpath("a")[0].text_content()
                    # Role, if any, trails the link in parentheses.
                    role = (td.xpath("a")[0].tail or "").strip("() ")
                    committee.add_member(name, role or "member")

            xpath = 'table/descendant::td/a[contains(@href, "committees")]/@href'
            committee_url = content.xpath(xpath).pop()
            committee.add_source(url)
            committee.add_source(committee_url)
            self.save_committee(committee)
Exemplo n.º 2
0
    def scrape_committee(self, name, url, chamber):
        """Build a Committee from the member list on *url* and save it."""
        committee = Committee(chamber, name)
        committee.add_source(url)
        doc = lxml.html.fromstring(self.get(url).text)

        member_xpath = '//div[@id="members"]/div[@id="members"]/p/a/text()'
        for member in doc.xpath(member_xpath):
            # Strip chamber titles preceding the name.
            member = member.replace('Representative ', '')
            member = member.replace('Senator ', '').strip()
            role = 'member'
            if ' (' in member:
                member, raw_role = member.split(' (')
                # Most-specific labels first: "Vice-Chair" and "Co-Chair"
                # both contain the substring "Chair".
                if 'Vice-Chair' in raw_role:
                    role = 'vice-chair'
                elif 'Co-Chair' in raw_role:
                    role = 'co-chair'
                elif 'Chair' in raw_role:
                    role = 'chair'
                else:
                    raise Exception('unknown role: %s' % raw_role)
            committee.add_member(member, role)

        self.save_committee(committee)
Exemplo n.º 3
0
    def select_special_comm(self):
        """Scrape Nebraska select/special committees (upper chamber).

        BUG FIX: the original duplicated the entire scraping body across two
        branches (differing only in role capitalization) and compared with
        ``!= None``; the branches are merged and the role normalized.
        """
        main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            boxes = page.xpath(
                '/html/body/div[@id="wrapper"]/div[@id="content"]'
                '/div[@id="content_text"]/div[@class="content_box_container"]'
                '/div[@class="content_box"]')
            for comm_names in boxes:
                # The committee name is either plain <h2> text or, when that
                # is empty, the text of an <h2><a> link.
                name = comm_names.xpath('h2')[0].text
                if name is None:
                    name = comm_names.xpath('h2/a')[0].text

                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if 'Chairperson' in senator:
                        # e.g. "Sen. Jane Doe, Chairperson": strip the leading
                        # 5-char prefix and the trailing ", Chairperson".
                        role = 'Chairperson'
                        senator = senator[5:-13]
                    else:
                        role = 'member'
                        senator = senator[5:-1]
                    committee.add_member(senator, role)
                self.save_committee(committee)
Exemplo n.º 4
0
    def scrape(self, term, chambers):
        """Scrape DC Council committees and their memberships."""
        com_url = 'http://www.dccouncil.washington.dc.us/committees'
        doc = lxml.html.fromstring(self.urlopen(com_url))
        doc.make_links_absolute(com_url)

        for url in set(doc.xpath('//a[contains(@href, "committee-on")]/@href')):
            doc = lxml.html.fromstring(self.urlopen(url))

            # The name lives in an <h1> on most pages, an <h2> on a few.
            headings = doc.xpath('//h1/text()') or doc.xpath('//h2/text()')
            name = headings[0].replace('Committee on ', '')

            # skip link to Committees page
            if name == 'Committees':
                continue

            com = Committee('upper', name)

            chair_xpath = '//h3[text()="Committee Chair"]/following-sibling::p'
            for chair in doc.xpath(chair_xpath):
                com.add_member(chair.text_content(), role='chairperson')

            member_xpath = '//h3[text()="Councilmembers"]/following-sibling::ul//a'
            for member in doc.xpath(member_xpath):
                com.add_member(member.text_content(), role='member')

            com.add_source(url)
            self.save_committee(com)
Exemplo n.º 5
0
    def scrape_joint_committee(self, url):
        """Scrape a joint committee page, de-duplicating member entries."""
        doc = lxml.html.fromstring(self.urlopen(url))

        # Prefer the <h1> heading; fall back to <h2>.
        heading = doc.xpath('//h1/text()') or doc.xpath('//h2/text()')
        comm = Committee('joint', heading[0].strip())
        comm.add_source(url)

        member_links = chain(doc.xpath('//a[contains(@href, "MemberId")]'),
                             doc.xpath('//a[contains(@href, "Senators")]'))

        seen = set()
        for link in member_links:
            parent_text = link.getparent().text_content()
            # A "Title: names..." prefix on the parent marks a role.
            title = (parent_text.split(':')[0].strip()
                     if ':' in parent_text else 'member')

            # Drop any trailing parenthetical (party/district).
            member = link.text.split(' (')[0].strip()
            if (member, title) in seen:
                continue
            comm.add_member(member, title)
            seen.add((member, title))

        # Only save committees that actually gained members.
        if comm['members']:
            self.save_committee(comm)
Exemplo n.º 6
0
    def scrape_senate_committee(self, url):
        """Scrape a senate committee page and record member roles."""
        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        name = doc.xpath('//h3/text()')[0]
        name = name.replace(' Committee', '')

        com = Committee(chamber='upper', committee=name)

        for member in doc.xpath('//div[@id="committeeright"]//a'):
            member_name = member.text.strip()

            # don't add clerks
            if member_name == 'Committee Clerk':
                continue

            # skip phone links
            if member.get("href").startswith("tel:"):
                continue

            # BUG FIX: ``tail`` is None when nothing follows the link; the
            # old code crashed on the ``in`` membership test in that case.
            tail = member.tail or ''
            if 'Committee Chair' in tail:
                role = 'chair'
            elif 'Majority Vice' in tail:
                role = 'majority vice chair'
            elif 'Minority Vice' in tail:
                role = 'minority vice chair'
            else:
                role = 'member'

            com.add_member(member_name, role=role)

        com.add_source(url)
        self.save_committee(com)
Exemplo n.º 7
0
    def scrape_senate_committee(self, term, link):
        """Scrape a Minnesota Senate committee roster page."""
        with self.urlopen(link) as html:
            doc = lxml.html.fromstring(html)

            # strip first 30 and last 10 characters:
            # "Minnesota Senate Committees - __________ Committee"
            committee_name = doc.xpath('//title/text()')[0][30:-10]

            com = Committee('upper', committee_name)

            # BUG FIX: ``role`` was unbound (NameError) until the first
            # "Role: Name" row appeared; default to plain membership.
            role = 'member'

            # first id=bio table is members
            for row in doc.xpath('//table[@id="bio"]')[0].xpath('tr'):
                row = fix_whitespace(row.text_content())

                # A "Position: Name" row switches the role for this and any
                # following unlabelled rows.
                if ':' in row:
                    position, name = row.split(': ')
                    role = position.lower().strip()
                else:
                    name = row

                # add the member
                com.add_member(name.strip(), role)

            com.add_source(link)
            self.save_committee(com)
Exemplo n.º 8
0
    def standing_comm(self):
        """Scrape Nebraska standing committees and their members."""
        main_url = 'http://www.nebraskalegislature.gov/committees/standing-committees.php'
        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            link_xpath = ('/html/body/div[@id="wrapper"]/div[@id="content"]'
                          '/div[@id="content_text"]'
                          '/div[@class="content_box_container"]'
                          '/div[@class="content_box"][1]'
                          '/ul[@class="nobullet"]/li/a')
            for comm_link in page.xpath(link_xpath):
                detail_link = comm_link.attrib['href']

                with self.urlopen(detail_link) as detail_page:
                    detail_page = lxml.html.fromstring(detail_page)
                    name_xpath = ('/html/body[@class="home blog"]/div[@id="page"]'
                                  '/div[@id="content"]/div[@class="content_header"]'
                                  '/div[@class="content_header_right"]/a')
                    heading = detail_page.xpath(name_xpath)[0].text
                    # Drop the trailing word of the heading (e.g. "Committee").
                    comm_name = ' '.join(heading.split()[0:-1])
                    committee = Committee('upper', comm_name)

                    senator_xpath = ('/html/body[@class="home blog"]/div[@id="page"]'
                                     '/div[@id="sidebar"]/ul[1]/li[1]/ul/li/a')
                    for senator_link in detail_page.xpath(senator_xpath):
                        senator = senator_link.text
                        if 'Chairperson' in senator:
                            # Strip the 6-char prefix and ", Chairperson" suffix.
                            role = 'Chairperson'
                            senator = senator[6: -13]
                        else:
                            role = 'member'
                            senator = senator[6:-1]
                        committee.add_member(senator, role)
                    committee.add_source(main_url)
                    committee.add_source(detail_link)
                    self.save_committee(committee)
Exemplo n.º 9
0
    def scrape_committee(self, chamber, term, name, url):
        """Scrape a committee page whose members are listed as prose."""
        page = lxml.html.fromstring(self.urlopen(url))

        committee = Committee(chamber, name)
        committee.add_source(url)

        member_text = page.xpath("//strong[contains(., 'Members:')]")[0].tail
        member_text = re.sub(r'\s+', ' ', member_text)

        # split on periods not preceeded by capital letters
        for member in re.split('(?<![A-Z])[.,] ', member_text):
            # Drop trailing "R.M." / "R.M.M." honorifics.
            member = re.sub(r'R\.M\.(M\.)?$', '', member.strip()).strip()
            if member:
                committee.add_member(member)

        # The chair and vice chair are labelled in their own <strong> tags.
        for label, role in (('Chair:', 'chair'), ('Vice Chair:', 'vice chair')):
            node = page.xpath("//strong[contains(., '%s')]" % label)[0]
            officer = node.tail.strip()
            if officer:
                committee.add_member(officer, role)

        self.save_committee(committee)
Exemplo n.º 10
0
    def scrape_lower_committee(self, name, url):
        """Scrape a PR lower-chamber committee; member names are img tails."""
        com = Committee('lower', name)
        com.add_source(url)
        doc = self.lxmlize(url)

        contact, directiva, reps = doc.xpath('//div[@class="sbox"]/div[2]')
        # all members are tails of images (they use img tags for bullets)
        # first three members are in the directiva div
        chair = directiva.xpath('b[text()="Presidente:"]/following-sibling::img[1]')
        vchair = directiva.xpath('b[text()="Vice Presidente:"]/following-sibling::img[1]')
        sec = directiva.xpath('b[text()="Secretario(a):"]/following-sibling::img[1]')
        member = 0
        if chair and chair[0].tail is not None:
            com.add_member(clean_spaces(chair[0].tail), 'chairman')
            member += 1
        if vchair and vchair[0].tail is not None:
            com.add_member(clean_spaces(vchair[0].tail), 'vice chairman')
            member += 1
        # BUG FIX: the old test ``sec and sec is not None`` was redundant and
        # never checked the tail, so a secretary bullet with no trailing text
        # passed None to clean_spaces; mirror the chair/vchair checks.
        if sec and sec[0].tail is not None:
            com.add_member(clean_spaces(sec[0].tail), 'secretary')
            member += 1

        for img in reps.xpath('.//img'):
            member_name = clean_spaces(img.tail)
            if member_name is not None:
                com.add_member(member_name)
                member += 1
        # Skip committees with no discovered members.
        if member > 0:
            self.save_committee(com)
Exemplo n.º 11
0
    def _scrape_upper_committee(self, name, url2):
        """Scrape an upper-chamber committee's assignments page.

        The first two listed members are the Chairman and Vice-Chairman.
        """
        cat = "Assignments.asp"
        url3 = "".join((url2, cat))

        committee = Committee('upper', name)
        committee.add_source(url2)

        page = self.lxmlize(url3)

        members = page.xpath('//table[@id="table38"]//font/a/b')

        # BUG FIX: the old code evaluated ``members[1]`` on every iteration
        # and raised IndexError when the committee listed only one member;
        # positional comparison via enumerate avoids the lookup entirely.
        for position, link in enumerate(members):
            if position == 0:
                role = "Chairman"
            elif position == 1:
                role = "Vice-Chairman"
            else:
                role = "member"

            member_name = link.xpath('string()')
            member_name = member_name.replace('Senator ', '')
            # Collapse runs of whitespace (raw string fixes the escape warning).
            member_name = re.sub(r'[\s]{2,}', ' ', member_name).strip()

            committee.add_member(member_name, role)

        self.save_committee(committee)
Exemplo n.º 12
0
    def scrape_house_committee(self, committee_name, link):
        """Scrape individual committee page and add members"""
        doc = lxml.html.fromstring(self.urlopen(link))

        # A page whose <h1> mentions "subcommittee" is treated as one.
        subcommittee = any('subcommittee' in h1.lower()
                           for h1 in doc.xpath('//h1/text()'))

        if subcommittee:
            com = Committee('lower',
                            committee_name.replace(' Subcommittee', ''),
                            'Subcommittee')
        else:
            com = Committee('lower', committee_name, None)

        # Members sit in the first two lists of the left column.
        for anchor in doc.xpath("//div[@class='col1']/ul[position()<3]/li/a"):
            member = anchor.text
            # Any text trailing the link is the role, e.g. ", Chair".
            role = (anchor.tail or '').strip(', ') or 'member'
            if member:
                com.add_member(member, role)

        com.add_source(link)
        if com['members']:
            self.save_committee(com)
Exemplo n.º 13
0
    def scrape_lower_committee(self, name, parent, url):
        """Scrape a (sub)committee roster; chamber is joint when named so."""
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        is_joint = "Joint" in name or (parent and "Joint" in parent)
        chamber = "joint" if is_joint else "lower"

        # A parent name means this page describes a subcommittee.
        comm = (Committee(chamber, parent, subcommittee=name)
                if parent else Committee(chamber, name))
        comm.add_source(url)

        for link in page.xpath("//a[contains(@href, 'District')]"):
            member = re.sub(r"\s+", " ", link.xpath("string()").strip())
            if not member:
                continue

            # An optional "(Co-)(Vice )Chair" prefix becomes the role.
            match = re.match(r"((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)", member)
            comm.add_member(match.group(4).strip(),
                            (match.group(1) or "member").lower())

        self.save_committee(comm)
Exemplo n.º 14
0
    def scrape_upper_committee(self, url):
        """Scrape upper-chamber committees from a PDF rendered to XML.

        Each page holds a committee header line starting with "Comisi"
        followed by "Hon. Name - Title" member lines.
        """
        filename, resp = self.urlretrieve(url)
        root = lxml.etree.fromstring(convert_pdf(filename, 'xml'))
        try:
            for page in root.xpath('/pdf2xml/page'):
                comm = None
                for line in page.findall('text'):
                    text = line.findtext('b')
                    if text is not None and text.startswith('Comisi'):
                        comm = Committee('upper', text)
                        comm.add_source(url)
                    elif line.text and line.text.startswith('Hon.'):
                        # BUG FIX: member lines appearing before any committee
                        # header used to crash with AttributeError on None.
                        if comm is None:
                            continue
                        line_text = line.text.replace(u'–', '-')
                        name_split = line_text.split(u'-', 1)
                        title = 'member'
                        if len(name_split) >= 2:
                            name_split[1] = name_split[1].strip()
                            if name_split[1] in ('Presidenta', 'Presidente'):
                                title = 'chairman'
                            elif name_split[1] in ('Vicepresidente', 'Vicepresidenta'):
                                title = 'vicechairman'
                            elif name_split[1] in ('Secretaria', 'Secretario'):
                                title = 'secretary'
                        # Skip vacant seats.
                        if name_split[0] != 'VACANTE':
                            comm.add_member(name_split[0].replace('Hon.', ''), title)
                # BUG FIX: pages with no committee header used to save None.
                # NOTE(review): only the last committee on a page is saved; if
                # a page can hold several, earlier ones are lost — confirm.
                if comm is not None:
                    self.save_committee(comm)
        finally:
            # Always clean up the downloaded PDF, even on scrape errors.
            os.remove(filename)
Exemplo n.º 15
0
    def scrape_upper(self):
        """Scrape PR Senate committees from the committee listing table."""
        url = "http://senadopr.us/Lists/Listado%20de%20Comisiones/Comisiones%20del%20Senado.aspx"
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(url)
            table = doc.xpath(
                '//table[@id="{C05AFE0D-D977-4033-8D7B-C43ABF948A4A}-{3E52C91B-AFC8-4493-967A-C8A47AC4E7B6}"]'
            )

            for row in table[0].iterchildren("tr"):
                cells = list(row)
                name_link = cells[0].find("a")
                if name_link is None:
                    continue

                com_source = name_link.get("href")
                # if committee does not have a url use the default.
                if com_source == "http://senadopr.us/":
                    com_source = url

                # check the committee name to see if it's a join one.
                chamber = ("joint" if cells[1].text == "Comisi\xf3n Conjunta"
                           else "upper")
                com = Committee(chamber, name_link.text)
                com.add_source(com_source)
                com.add_member(clean_spaces(cells[2].find("a").text), "chairman")
                self.save_committee(com)
Exemplo n.º 16
0
    def scrape_comm(self, chamber, term_name):
        """Scrape Mississippi committee membership from the chamber XML feed."""
        url = 'http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml' % chamber
        comm_page = self.urlopen(url)
        root = lxml.etree.fromstring(comm_page.bytes)
        chamber = 'lower' if chamber == 'h' else 'upper'

        for node in root.xpath('//COMMITTEE'):
            comm = Committee(chamber, node.xpath('string(NAME)'))

            # Officers carry their role as a suffix in the feed.
            chair = node.xpath('string(CHAIR)').replace(", Chairman", "")
            if chair:
                comm.add_member(chair, role="Chairman")

            vice_chair = node.xpath('string(VICE_CHAIR)').replace(", Vice-Chairman", "")
            if vice_chair:
                comm.add_member(vice_chair, role="Vice-Chairman")

            # Plain members come as one semicolon-separated string.
            # BUG FIX: the old code removed only a single "" entry, so
            # multiple or whitespace-only entries were added as empty
            # member names; skip every entry that strips to nothing.
            for leg in node.xpath('string(MEMBERS)').split(";"):
                leg = leg.strip()
                if leg:
                    comm.add_member(leg)

            comm.add_source(url)
            self.save_committee(comm)
Exemplo n.º 17
0
    def scrape_lower_committee(self, name, parent, url):
        """Scrape a lower/joint (sub)committee roster page."""
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        # "Joint" in either name marks a joint-chamber committee.
        if 'Joint' in name or (parent and 'Joint' in parent):
            chamber = 'joint'
        else:
            chamber = 'lower'

        # A parent name means this page is a subcommittee of it.
        if parent:
            comm = Committee(chamber, parent, subcommittee=name)
        else:
            comm = Committee(chamber, name)
        comm.add_source(url)

        for link in page.xpath("//a[contains(@href, 'District')]"):
            raw = link.xpath('string()').strip()
            raw = re.sub(r'\s+', ' ', raw)
            if not raw:
                continue

            # An optional "(Co-)(Vice )Chair" prefix becomes the role.
            match = re.match(r'((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)', raw)
            member = match.group(4).strip()
            role = (match.group(1) or 'member').lower()
            comm.add_member(member, role)

        self.save_committee(comm)
Exemplo n.º 18
0
    def scrape_comm(self, url, chamber):
        """Scrape committees from the JSON endpoint at *url*."""
        data = self.post(url).json()['Data']

        for item in data:
            comm_name = item['CommitteeName']
            committee = Committee(chamber, comm_name)

            chair_man = str(item['ChairName'])
            vice_chair = str(item['ViceChairName'])
            # The string 'None' marks a vacant seat in the feed.
            if vice_chair != 'None':
                committee.add_member(vice_chair, 'Vice-Chair')
            if chair_man != 'None':
                committee.add_member(chair_man, 'Chairman')

            comm_url = self.get_comm_url(chamber, item['CommitteeId'], comm_name)
            for member in self.scrape_member_info(comm_url):
                # vice_chair and chair_man already added.
                if chair_man in member or vice_chair in member:
                    continue
                member = " ".join(member.split())
                if member:
                    committee.add_member(member)

            committee.add_source(comm_url)
            committee.add_source(url)
            self.save_committee(committee)
Exemplo n.º 19
0
    def scrape_reps_comm(self):
        """Scrape Maine House committees from the static listing page."""
        url = 'http://www.maine.gov/legis/house/hsecoms.htm'

        with self.urlopen(url) as page:
            root = lxml.html.fromstring(page)

            # Committee headings live in the odd-numbered <center> blocks;
            # the matching member list is the count-th <ul> on the page.
            for count, n in enumerate(range(1, 12, 2), start=1):
                comm_name = root.xpath('string(//body/center[%s]/h1/a)' % (n))
                committee = Committee('lower', comm_name)

                for el in root.xpath('/html/body/ul[%s]/li/a' % (count)):
                    rep = el.text
                    mark = rep.find('(')
                    if mark != -1:
                        # Strip the 15-char leading title and the trailing
                        # parenthetical district.
                        rep = rep[15: mark]
                    committee.add_member(rep)
                committee.add_source(url)

                self.save_committee(committee)
Exemplo n.º 20
0
    def scrape(self, chamber, term):
        """Scrape Nevada committees for the latest session of *term*.

        Builds the session-specific URL (e.g. "76th2011") and walks the
        chamber's committee index.
        """
        session = None
        for t in self.metadata['terms']:
            if t['name'] == term:
                session = t['sessions'][-1]
        if session is None:
            # BUG FIX: ``session`` used to stay silently unbound for an
            # unknown term, crashing later with a confusing NameError.
            raise ValueError('unknown term: %s' % term)

        # BUG FIX: sessions ending in 11/12/13 take "th", not "st"/"nd"/"rd".
        session_str = str(session)
        if session_str[-2:] in ('11', '12', '13'):
            sessionsuffix = 'th'
        elif session_str[-1] == '1':
            sessionsuffix = 'st'
        elif session_str[-1] == '2':
            sessionsuffix = 'nd'
        elif session_str[-1] == '3':
            sessionsuffix = 'rd'
        else:
            sessionsuffix = 'th'
        insert = session_str + sessionsuffix + str(term[0:4])

        chamber_letter = {'lower': 'A', 'upper': 'S'}[chamber]

        url = 'http://www.leg.state.nv.us/Session/%s/Committees/%s_Committees/' % (
            insert, chamber_letter)

        page = self.urlopen(url)
        root = lxml.html.fromstring(page)
        for com_a in root.xpath('//strong/a'):
            com_url = url + com_a.get('href')
            # The floor-wide committee is not a real standing committee.
            if com_a.text == 'Committee of the Whole':
                continue
            com = Committee(chamber, com_a.text)
            com.add_source(com_url)
            self.scrape_comm_members(chamber, com, com_url)
            self.save_committee(com)
Exemplo n.º 21
0
    def scrape_approp_subcommittees(self, url):
        """Scrape Appropriations subcommittees; rosters follow each <strong>."""
        doc = lxml.html.fromstring(self.urlopen(url))

        for strong in doc.xpath('//strong'):
            com = Committee(chamber='upper', committee='Appropriations',
                            subcommittee=strong.text.strip())
            com.add_source(url)

            # Member names trail the next element, prefixed with "Senators"
            # and separated by commas / "and".
            roster = strong.getnext().tail.replace('Senators', '').strip()
            for leg in re.split(', | and ', roster):
                # A parenthetical suffix marks an officer.
                if leg.endswith('(MVC)'):
                    role, leg = 'minority vice chairman', leg[:-6]
                elif leg.endswith('(VC)'):
                    role, leg = 'vice chairman', leg[:-5]
                elif leg.endswith('(C)'):
                    role, leg = 'chairman', leg[:-4]
                else:
                    role = 'member'
                com.add_member(leg, role=role)

            self.save_committee(com)
Exemplo n.º 22
0
    def scrape_committee(self, term, chambers, href, name):
        """Scrape one committee page; the chamber is inferred from the URL path."""
        page = self.get(href).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(href)
        # Member links live inside the site's view-content block.
        members = page.xpath("//div[@class='view-content']" "//a[contains(@href, 'members')]")

        if "/joint/" in href:
            chamber = "joint"
        elif "/senate/" in href:
            chamber = "upper"
        elif "/house/" in href:
            chamber = "lower"
        else:
            # NOTE: Python 2 print statement; unrecognized URL layouts are
            # reported and skipped rather than scraped with a wrong chamber.
            print "XXX: Fail! %s" % (href)
            return

        cttie = Committee(chamber, name)

        for a in members:
            member = a.text
            # The pane heading above the link states the member's role;
            # an unknown heading raises KeyError and surfaces new layouts.
            role = a.xpath("ancestor::div/h2[@class='pane-title']/text()")[0]
            role = {"Legislative Members": "member", "Chairman": "chair", "Vice Chairman": "member"}[role]

            # Skip empty links and district placeholders.
            if member is None or member.startswith("District"):
                continue

            cttie.add_member(member, role=role)

        cttie.add_source(href)
        self.save_committee(cttie)
Exemplo n.º 23
0
    def scrape_assembly(self):
        """Scrape Assembly Committees"""
        assembly_committees_url = "http://assembly.state.ny.us/comm/"

        with self.urlopen(assembly_committees_url) as html:
            doc = lxml.html.fromstring(html)
            standing_committees, subcommittees, legislative_commissions, task_forces = doc.cssselect('#sitelinks ul')
            # Only membership pages ("?sec=mem...") of standing committees.
            committee_paths = set([l.get('href') for l in standing_committees.cssselect("li a[href]")
                              if l.get("href").startswith('?sec=mem')])

        for committee_path in committee_paths:
            committee_url = assembly_committees_url + committee_path
            with self.urlopen(committee_url) as chtml:
                cdoc = lxml.html.fromstring(chtml)
                # First non-empty page heading, minus the boilerplate suffix.
                for heading in cdoc.cssselect("#content .pagehdg"):
                    if heading.text:
                        committee_name = heading.text.split('Committee Members')[0].strip()
                        break

                committee = Committee("lower", committee_name)
                committee.add_source(committee_url)
                members = cdoc.cssselect("#sitelinks")[0]

                # The first listed member is the committee chair.
                is_first = True
                for span in members.iter('span'):
                    member = span.xpath('li/a')[0].text
                    if is_first:
                        committee.add_member(member, 'chair')
                        is_first = False
                    else:
                        committee.add_member(member)

                self.save_committee(committee)
Exemplo n.º 24
0
    def scrape_senate_committee(self, url):
        """Scrape a senate committee page listed under #committeelist."""
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)

        name = doc.xpath('//h6/text()')[0]

        com = Committee(chamber='upper', committee=name)

        for member in doc.xpath('//div[@id="committeelist"]//a'):
            member_name = member.text.strip()

            # don't add clerks
            if member_name == 'Committee Clerk':
                continue

            # BUG FIX: ``tail`` is None when nothing follows the link; the
            # old code crashed on the ``in`` membership test in that case.
            tail = member.tail or ''
            if 'Committee Chair' in tail:
                role = 'chair'
            elif 'Majority Vice' in tail:
                role = 'majority vice chair'
            elif 'Minority Vice' in tail:
                role = 'minority vice chair'
            else:
                role = 'member'

            com.add_member(member_name, role=role)

        com.add_source(url)
        self.save_committee(com)
Exemplo n.º 25
0
    def standing_comm(self):
        main_url = "http://www.nebraskalegislature.gov/committees/standing-committees.php"
        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            for comm_links in page.xpath(
                '//div[@id="content_text"]/div[@class="content_box_container"]/div[@class="content_box"][1]/ul[@class="nobullet"]/li/a'
            ):
                detail_link = comm_links.attrib["href"]

                with self.urlopen(detail_link) as detail_page:
                    detail_page = lxml.html.fromstring(detail_page)
                    name = detail_page.xpath(
                        '//div[@id="content"]/div[@class="content_header"]/div[@class="content_header_right"]/a'
                    )[0].text
                    name = name.split()
                    name = name[0:-1]
                    comm_name = ""
                    for x in range(len(name)):
                        comm_name += name[x] + " "
                    comm_name = comm_name[0:-1]
                    committee = Committee("upper", comm_name)

                    for senators in detail_page.xpath('//div[@id="sidebar"]/ul[1]/li[1]/ul/li/a'):
                        senator = senators.text
                        if "Chairperson" in senator:
                            role = "Chairperson"
                            senator = senator[6:-13].strip()
                        else:
                            role = "member"
                            senator = senator[6:].strip()
                        committee.add_member(senator, role)
                    committee.add_source(main_url)
                    committee.add_source(detail_link)
                    self.save_committee(committee)
Exemplo n.º 26
0
    def scrape_upper_committee(self, name, url):
        """Scrape an upper-chamber committee, merging split name links."""
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            comm = Committee('upper', name)
            comm.add_source(url)

            member_div = page.xpath("//div[@class = 'committee-members']")[0]

            seen = set()
            for link in member_div.xpath(".//a"):
                if not link.text:
                    continue

                member = link.text.strip()

                # Sometimes NY is cool and splits names across a
                # couple links
                sibling = link.getnext()
                if (sibling is not None
                        and sibling.tag == 'a'
                        and sibling.attrib['href'] == link.attrib['href']):
                    member = "%s %s" % (member, sibling.text.strip())

                member = re.sub(r'\s+', ' ', member)

                # De-duplicate and skip empties.
                if member in seen or not member:
                    continue
                seen.add(member)

                member_name, role = parse_name(member)
                comm.add_member(member_name, role)

            self.save_committee(comm)
Exemplo n.º 27
0
    def scrape_committee(self, chamber, url):
        """Scrape an MA committee page; known-malformed pages are skipped."""
        doc = lxml.html.fromstring(self.urlopen(url))

        name = doc.xpath('//span[@class="committeeShortName"]/text()')
        if not name:
            # Because of http://www.malegislature.gov/Committees/Senate/S29 this
            # XXX: hack had to be pushed in. Remove me ASAP. This just skips
            #      malformed pages.
            self.warning("Had to skip this malformed page.")
            return

        com = Committee(chamber, name[0])
        com.add_source(url)

        # get both titles and names, order is consistent
        titles = doc.xpath('//p[@class="rankingMemberTitle"]/text()')
        names = doc.xpath('//p[@class="rankingMemberName"]/a/text()')
        for title, ranking_name in zip(titles, names):
            com.add_member(ranking_name, title)

        for member in doc.xpath('//div[@class="committeeRegularMembers"]//a/text()'):
            com.add_member(member)

        if com['members']:
            self.save_committee(com)
Exemplo n.º 28
0
    def scrape_lower_committee(self, name, url):
        """Scrape a PR lower-chamber committee page (members are img tails)."""
        com = Committee("lower", name)
        com.add_source(url)

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            contact, directiva, reps = doc.xpath('//div[@class="sbox"]/div[2]')

            # all members are tails of images (they use img tags for bullets)

            # first three members are in the directiva div
            chair = directiva.xpath('b[text()="Presidente:"]/following-sibling::img[1]')
            vchair = directiva.xpath('b[text()="Vice Presidente:"]/following-sibling::img[1]')
            sec = directiva.xpath('b[text()="Secretario(a):"]/following-sibling::img[1]')

            # BUG FIX: the old code used ``++member`` — a double unary plus,
            # a no-op in Python — so the counter stayed at 0 and committees
            # were NEVER saved.
            member = 0
            if chair:
                com.add_member(clean_spaces(chair[0].tail), "chairman")
                member += 1
            if vchair:
                com.add_member(clean_spaces(vchair[0].tail), "vice chairman")
                member += 1
            if sec:
                com.add_member(clean_spaces(sec[0].tail), "secretary")
                member += 1

            for img in reps.xpath(".//img"):
                com.add_member(clean_spaces(img.tail))
                member += 1
            # Skip committees with no discovered members.
            if member > 0:
                self.save_committee(com)
Exemplo n.º 29
0
    def _scrape_lower_special_committees(self):
        """Scrape Louisiana House special committees and their members."""
        url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
        page = self.lxmlize(url)

        accordion = page.xpath(
            '//table[@id="table106"]//div[@class="exBody1A"]'
            '/div[@class="accordion"]')[0]

        # Each committee is an <h3> header followed by a "pane" div of rows.
        for header in accordion.xpath('./h3'):
            committee_name = self._normalize_committee_name(
                header.xpath('string()').strip())

            # "Joint ..." committees span both chambers.
            chamber = 'joint' if committee_name.startswith('Joint') else 'lower'

            committee = Committee(chamber, committee_name)
            committee.add_source(url)

            member_rows = header.xpath(
                './following-sibling::div[@class="pane"]'
                '//tr[@class="linkStyle2"]')

            for row in member_rows:
                raw_name = row.xpath('normalize-space(string(./td[1]))')
                member_name = ' '.join(filter(None, name_tools.split(raw_name)))
                member_role = self._normalize_member_role(
                    row.xpath('normalize-space(string(./td[2]))'))
                committee.add_member(member_name, member_role)

            self.save_committee(committee)
Exemplo n.º 30
0
    def select_special_comm(self):
        """Scrape Nebraska select/special committees (upper chamber).

        The two original branches were identical except for where the
        heading text lives (h2 vs h2/a) and the capitalization of the
        chair role; they are deduplicated here with behavior preserved.
        """
        main_url = "http://www.nebraskalegislature.gov/committees/select-committees.php"
        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            for comm_names in page.xpath('//div[@class="content_box"]'):
                name = comm_names.xpath("h2")[0].text
                # Identity-compare with None (PEP 8); some headings wrap
                # the name in a link, so fall back to h2/a.
                if name is not None:
                    chair_role = "Chairperson"
                else:
                    name = comm_names.xpath("h2/a")[0].text
                    chair_role = "chairperson"

                committee = Committee("upper", name)
                committee.add_source(main_url)
                for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if "Chairperson" in senator:
                        role = chair_role
                        # Strip leading "Sen. " and trailing ", Chairperson".
                        senator = senator[5:-13].strip()
                    else:
                        role = "member"
                        senator = senator[5:].strip()
                    committee.add_member(senator, role)
                self.save_committee(committee)
Exemplo n.º 31
0
    def scrape_joint_committee(self, url):
        """Scrape one joint committee page and record its members."""
        doc = lxml.html.fromstring(self.urlopen(url))

        # The committee name is the first h1, falling back to the first h2.
        headings = doc.xpath('//h1/text()') or doc.xpath('//h2/text()')
        comm = Committee('joint', headings[0])
        comm.add_source(url)

        # Members are links to legislator pages in either chamber.
        member_links = chain(doc.xpath('//a[contains(@href, "MemberId")]'),
                             doc.xpath('//a[contains(@href, "Senators")]'))

        for link in member_links:
            context = link.getparent().text_content()
            # A "Title: Name" parent marks an officer; otherwise plain member.
            title = context.split(':')[0].strip() if ':' in context else 'member'
            comm.add_member(link.text.split(' (')[0].strip(), title)

        self.save_committee(comm)
Exemplo n.º 32
0
    def scrape(self, chamber, term):
        """Scrape Delaware committees for the given chamber and term."""
        urls = {
            'upper': 'http://legis.delaware.gov/LIS/LIS%s.nsf/SCommittees',
            'lower': 'http://legis.delaware.gov/LIS/LIS%s.nsf/HCommittees'
        }

        # Mapping of term names to session numbers (see metadata).
        term2session = {"2011-2012": "146"}

        session = term2session[term]

        url = urls[chamber] % (session, )
        self.log(url)
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        committees = {}

        # Collect committee name -> detail-page URL pairs.
        for row in page.xpath('//td[@width="96%"]/table/tr[@valign="top"]'):
            link = row.xpath('td/font/a[contains(@href, "opendocument")]')[0]
            committees[link.text] = link.attrib['href']
            self.log(link.attrib['href'])

        # IMPROVEMENT: iterate items() directly instead of looping over
        # keys and re-looking each one up.
        for name, url in committees.items():
            page = lxml.html.fromstring(self.urlopen(url))
            page.make_links_absolute(url)
            committee = Committee(chamber, name)
            committee.add_source(url)

            for tr in page.xpath('//td[@width="96%"]/table/tr'):
                role_section = tr.xpath('td/b/font')
                if len(role_section) > 0:
                    # Headers read e.g. "Members:" / "Chairs:"; strip the
                    # optional plural "s" and trailing colon, lowercase.
                    role = re.sub(r's?:$', '', role_section[0].text).lower()
                    for member in tr.xpath('td/font/a'):
                        committee.add_member(member.text, role)

            self.save_committee(committee)
Exemplo n.º 33
0
    def scrape_house_committees(self, term):
        """Scrape MN House committees from the member-list index page."""
        url = 'http://www.house.leg.state.mn.us/comm/commemlist.asp'

        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)

        for com in doc.xpath('//h2[@class="commhighlight"]'):
            # Each committee heading is followed by a paragraph containing
            # a "Members" link to the roster page.
            members_url = com.xpath(
                'following-sibling::p[1]/a[text()="Members"]/@href')[0]

            com = Committee('lower', com.text)
            com.add_source(members_url)

            member_html = self.urlopen(members_url)
            mdoc = lxml.html.fromstring(member_html)

            # each legislator in their own table
            # first row, second column contains all the info
            for ltable in mdoc.xpath('//table/tr[1]/td[2]/p/b[1]'):

                # name is tail string of last element
                name = ltable.text_content()
                text = ltable.text
                if text and name != text:
                    name = name.replace(text, '')

                # role is inside a nested b tag
                role = ltable.xpath('b/*/text()')
                if role:
                    # if there was a role, remove it from name
                    role = role[0]
                    name = name.replace(role, '')
                else:
                    role = 'member'
                # Drop any trailing parenthetical from the name.
                name = name.split(' (')[0]
                com.add_member(name, role)

            # save
            self.save_committee(com)
Exemplo n.º 34
0
    def scrape_committee(self, chamber, name, url):
        """Scrape one committee roster page; joint pages override chamber."""
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            # The page flags joint committees with a dedicated h3 header.
            if page.xpath("//h3[. = 'Joint Committee']"):
                chamber = 'joint'

            comm = Committee(chamber, name)
            comm.add_source(url)

            # Member links carry a "member=" query parameter; the role
            # label sits in the preceding table cell.
            for link in page.xpath("//a[contains(@href, 'member=')]"):
                role = link.xpath("string(../preceding-sibling::td[1])")
                role = role.strip(": \r\n\t").lower()
                comm.add_member(link.text.strip(), role)

            if comm['members']:
                self.save_committee(comm)
            else:
                self.warning('not saving %s, appears to be empty' % name)
Exemplo n.º 35
0
    def get_jfac(self, name, url):
        """gets membership info for the Joint Finance and Appropriations
        Committee.

        Each table row pairs a Senate member (first bold cell) with a
        House member (second); a trailing ", role" marks officers.
        """
        with self.urlopen(url) as jfac_page:
            html = lxml.html.fromstring(jfac_page)
            table = html.xpath('body/table/tr/td[2]/table')[0]
            committee = Committee('joint', name)

            for row in table.xpath('tr')[1:]:
                senate_cell, house_cell = row.xpath('td/strong')
                for raw, member_chamber in ((senate_cell.text, 'upper'),
                                            (house_cell.text, 'lower')):
                    # Normalize non-breaking spaces before splitting.
                    text = raw.replace(u'\xa0', ' ')
                    if ',' in text:
                        committee.add_member(*text.split(','),
                                             chamber=member_chamber)
                    else:
                        committee.add_member(text, chamber=member_chamber)

            committee.add_source(url)
            self.save_committee(committee)
Exemplo n.º 36
0
    def _scrape_committee(self, committee_name, link, chamber):
        """Scrape individual committee page and add members"""

        page = self.get(link).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(link)

        # Subcommittee pages carry a breadcrumb link back to "Committee".
        is_subcommittee = bool(page.xpath('//li/a[text()="Committee"]'))
        if is_subcommittee:
            com = Committee(chamber,
                            re.sub(r'\s*Subcommittee\s*', '', committee_name),
                            committee_name)
        else:
            com = Committee(chamber, committee_name)

        OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \
                     'following-sibling::div/ul/li/a'
        MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \
                     'following-sibling::div/ul/li/a'
        HOUSE_SEARCH = '//h2[contains(text(), "House Members")]/' \
                     'following-sibling::div/ul/li/a'
        # BUG FIX: this previously searched for "House Members" (a copy-
        # paste of HOUSE_SEARCH), so Senate members were never matched.
        SENATE_SEARCH = '//h2[contains(text(), "Senate Members")]/' \
                     'following-sibling::div/ul/li/a'

        # Also walk the per-chamber lists; the original defined HOUSE_SEARCH
        # and SENATE_SEARCH but never used them, so members listed only
        # under the per-chamber headings were silently dropped.
        for a in (page.xpath(OFFICER_SEARCH) + page.xpath(MEMBER_SEARCH) +
                  page.xpath(HOUSE_SEARCH) + page.xpath(SENATE_SEARCH)):

            member_name = ' '.join([
                x.strip() for x in a.xpath('text()') + a.xpath('span/text()')
                if x.strip()
            ])
            # An inline <small> element holds the role (e.g. "Chair").
            role = a.xpath('small')
            if role:
                role = role[0].xpath('text()')[0].strip()
            else:
                role = 'member'

            com.add_member(member_name, role)

        com.add_source(link)
        self.save_committee(com)
Exemplo n.º 37
0
    def scrapeProcedural(self, chamber, page, url):
        """Scrape procedural committees.

        The Nth committee link on the page pairs with the Nth members
        table, tracked via comm_count.
        """
        comm_count = 1
        for comm_names in page.xpath('//div[@class="content"][1]/p/a'):
            # Collapse punctuation to spaces, then remove the spaces --
            # net effect is stripping all non-alphanumeric characters.
            name = re.sub('[^A-Za-z0-9]+', ' ',
                          comm_names.text).replace(' ', '')
            comm = Committee(chamber, name)

            # NOTE: this path is absolute, so it searches the whole
            # document even though it is evaluated from a link element.
            members_path = '//div[@class="content"][1]/table[@class="p"][%s]//tr/td[2]/a' % (
                str(comm_count))
            for members in comm_names.xpath(members_path):
                member = members.text
                member = re.sub('[^A-Za-z0-9]+', ' ', member)
                role = members.tail
                # IDIOM FIX: identity-compare with None (PEP 8) instead
                # of `!= None`.
                if role is not None and 'Chairman' in role:
                    role = 'Chairman'
                else:
                    role = 'Member'
                comm.add_member(member, role)

            comm.add_source(url)
            self.save_committee(comm)
            comm_count += 1
Exemplo n.º 38
0
    def scrape_senate_comm(self):
        """Scrape Maine Senate standing committees and their members."""
        url = ('http://legislature.maine.gov/committee-information/'
               'standing-committees-of-the-senate')
        doc = lxml.html.fromstring(self.get(url).text)

        # Committee names are bold headings; members follow in successive
        # paragraphs until a paragraph without a link ends the list.
        for heading in doc.xpath('//p/strong'):
            committee = Committee('upper', heading.text.strip(':'))
            committee.add_source(url)

            paragraph = heading.getparent().getnext()
            while True:
                links = paragraph.xpath('a')
                if len(links) == 0:
                    break
                match = self.senate_committee_pattern.search(links[0].text)
                member_name, chair_marker = match.groups()
                role = 'member' if chair_marker is None else 'chair'
                committee.add_member(member_name, role)
                paragraph = paragraph.getnext()

            self.save_committee(committee)
Exemplo n.º 39
0
    def scrape_senate_committee(self, name, url):
        """Scrape a Senate committee's Assignments page for its roster."""
        # Membership lives on the Assignments page, not the default page.
        url = url.replace('Default.asp', 'Assignments.asp')

        committee = Committee('upper', name)
        committee.add_source(url)

        page = lxml.html.fromstring(self.urlopen(url))

        for link in page.xpath('//table[@bordercolor="#EBEAEC"]/tr/td/font/a'):
            # A parenthesized tail like "(Chair)" marks an officer role.
            tail = link.tail
            role = tail.strip().strip("() ") if tail else "member"

            member = link.xpath('string()').replace('Senator ', '').strip()
            committee.add_member(member, role)

        self.save_committee(committee)
Exemplo n.º 40
0
    def scrape(self, term, chambers):
        """Scrape Utah interim committees for the latest term."""
        self.validate_term(term, latest_only=True)

        url = "http://le.utah.gov/asp/interim/Main.asp?ComType=All&Year=2014&List=2#Results"
        page = self.lxmlize(url)

        for comm_link in page.xpath("//a[contains(@href, 'Com=')]"):
            comm_name = comm_link.text.strip()

            # BUG FIX: the second test must be `elif`; with two separate
            # `if` statements a "House ..." committee fell through to the
            # `else` branch and was misfiled as "joint".
            if "House" in comm_name:
                chamber = "lower"
            elif "Senate" in comm_name:
                chamber = "upper"
            else:
                chamber = "joint"

            # Drop leading "House" or "Senate" from name
            comm_name = re.sub(r"^(House|Senate) ", "", comm_name)
            comm = Committee(chamber, comm_name)

            committee_page = self.lxmlize(comm_link.attrib['href'])

            for mbr_link in committee_page.xpath(
                    "//table[@class='memberstable']//a"):

                name = mbr_link.text.strip()
                # The link's tail text like ", Chair," carries the role.
                role = mbr_link.tail.strip().strip(",").strip()
                type = "member"
                if role:
                    type = role

                comm.add_member(name, type)

            comm.add_source(url)
            comm.add_source(comm_link.get('href'))

            self.save_committee(comm)
Exemplo n.º 41
0
    def select_special_comm(self):
        """Scrape Nebraska select committees (upper chamber).

        The two original branches were identical except for where the
        heading text lives (h2 vs h2/a) and the capitalization of the
        chair role; they are deduplicated here with behavior preserved.
        """
        main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            for comm_names in page.xpath(
                    '/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[@class="content_box_container"]/div[@class="content_box"]'
            ):
                name = comm_names.xpath('h2')[0].text
                # Identity-compare with None (PEP 8); some headings wrap
                # the name in a link, so fall back to h2/a.
                if name is not None:
                    chair_role = 'Chairperson'
                else:
                    name = comm_names.xpath('h2/a')[0].text
                    chair_role = 'chairperson'

                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath(
                        'ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if 'Chairperson' in senator:
                        role = chair_role
                        # Strip leading "Sen. " and trailing ", Chairperson".
                        senator = senator[5:-13].strip()
                    else:
                        role = 'member'
                        senator = senator[5:].strip()
                    committee.add_member(senator, role)
                self.save_committee(committee)
Exemplo n.º 42
0
    def scrape_assembly(self):
        """Scrape Assembly Committees"""
        assembly_committees_url = "http://assembly.state.ny.us/comm/"

        with self.urlopen(assembly_committees_url) as html:
            doc = lxml.html.fromstring(html)
            # Four lists appear under #sitelinks; only the standing
            # committees list is scraped here.
            (standing_committees, subcommittees, legislative_commissions,
             task_forces) = doc.cssselect('#sitelinks ul')
            committee_paths = {
                link.get('href')
                for link in standing_committees.cssselect("li a[href]")
                if link.get("href").startswith('?sec=mem')
            }

        for committee_path in committee_paths:
            committee_url = assembly_committees_url + committee_path
            with self.urlopen(committee_url) as chtml:
                cdoc = lxml.html.fromstring(chtml)
                # The page heading reads "<name> Committee Members".
                for h in cdoc.cssselect("#content .pagehdg"):
                    if h.text:
                        committee_name = h.text.split(
                            'Committee Members')[0].strip()
                        break

                committee = Committee("lower", committee_name)
                committee.add_source(committee_url)
                members = cdoc.cssselect("#sitelinks")[0]

                # The first listed member is the chair; the rest are
                # ordinary members.
                for position, member in enumerate(members.iter('span')):
                    member_name = member.xpath('li/a')[0].text
                    if position == 0:
                        committee.add_member(member_name, 'chair')
                    else:
                        committee.add_member(member_name)

                self.save_committee(committee)
Exemplo n.º 43
0
    def scrape_lower_committee(self, name, url):
        """Build (but do not save) a lower-chamber committee from its page.

        Returns the populated Committee object to the caller.
        """
        page = self.lxmlize(url)

        committee = Committee('lower', name)
        committee.add_source(url)

        seen = set()

        member_links = self.get_nodes(
            page, '//div[@class="commlinks"]//a[contains(@href, "mem")]')

        for member_link in member_links:
            member_name = None
            member_role = None

            member_text = member_link.text
            if member_text is not None:
                member = member_text.strip()
                member = re.sub(r'\s+', ' ', member)
                member_name, member_role = self._parse_name(member)

            if member_name is None:
                continue

            # Figure out if this person is the chair.
            # NOTE: this intentionally overrides any role parsed from the
            # name text above.
            role_type = self.get_node(
                member_link, '../../preceding-sibling::div[1]/text()')

            if role_type in (['Chair'], ['Co-Chair']):
                member_role = 'chair'
            else:
                member_role = 'member'

            # BUG FIX: deduplicate on the member's name.  The original
            # tested `name` (the committee name) against `seen`, which
            # never matched, so duplicate members were added.
            if member_name not in seen:
                committee.add_member(member_name, member_role)
                seen.add(member_name)

        return committee
Exemplo n.º 44
0
    def scrape_chamber(self, url, orig_chamber):
        """Scrape all committees linked from a chamber index page.

        Joint committees are cached in self.joint_coms so that members
        found on both chambers' pages accumulate into one committee.
        """
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        for a in doc.xpath('//a[contains(@href, "committee.aspx")]'):
            com_name = a.text
            com_url = a.get('href')
            com_html = self.urlopen(com_url)
            com_data = lxml.html.fromstring(com_html)

            if 'Joint' in com_name:
                chamber = 'joint'
            else:
                chamber = orig_chamber

            if chamber == 'joint':
                # setdefault collapses the original create/get/re-store
                # sequence into a single cache lookup.
                com = self.joint_coms.setdefault(
                    com_name, Committee(chamber, com_name))
            else:
                com = Committee(chamber, com_name)

            for a in com_data.xpath('//a[contains(@href, "Member=")]'):
                member = a.text
                role = a.xpath('../following-sibling::span/text()')
                if role:
                    role = role[0].lower().replace(u'\xa0', ' ')
                    # skip former members
                    if 'until' in role:
                        continue
                else:
                    role = 'member'
                com.add_member(member, role)

            com.add_source(com_url)
            self.save_committee(com)
Exemplo n.º 45
0
    def scrape_house_committees(self):
        """Scrape Michigan House committees from the committee drop-down."""
        base_url = 'http://house.mi.gov/MHRPublic/CommitteeInfo.aspx?comkey='
        html = self.urlopen('http://house.mi.gov/mhrpublic/committee.aspx')
        doc = lxml.html.fromstring(html)

        # get values out of drop down
        for opt in doc.xpath('//option'):
            name = opt.text
            # skip invalid choice
            if opt.text in ('Statutory Committees', 'Select One'):
                continue
            if 'have not been created' in opt.text:
                self.warning('no committees yet for the house')
                return
            com_url = base_url + opt.get('value')
            com_html = self.urlopen(com_url)
            cdoc = lxml.html.fromstring(com_html)
            com = Committee(chamber='lower', committee=name)
            com.add_source(com_url)

            # BUG FIX: removed a dead loop over the *index* page's
            # "memberLink" anchors that only clobbered `name` and added
            # nothing to the committee.

            # all links to http:// pages in servicecolumn2 are legislators
            for a in cdoc.xpath(
                    '//div[@class="servicecolumn2"]//a[starts-with(@href, "http")]'
            ):
                name = a.text.strip()
                text = a.xpath('following-sibling::span/text()')[0]
                if 'Committee Chair' in text:
                    role = 'chair'
                elif 'Vice-Chair' in text:
                    role = 'vice chair'
                else:
                    role = 'member'
                com.add_member(name, role=role)

            self.save_committee(com)
Exemplo n.º 46
0
    def scrape_lower_committee(self, name, parent, url):
        """Scrape a House committee page.

        When *parent* is given, the committee is recorded as a
        subcommittee of it.  Raises if the page yields no members.
        """
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        # Committees with "Joint" in either name span both chambers.
        if 'Joint' in name or (parent and 'Joint' in parent):
            chamber = 'joint'
        else:
            chamber = 'lower'

        if parent:
            comm = Committee(chamber, parent, subcommittee=name)
        else:
            comm = Committee(chamber, name)
        comm.add_source(url)

        # Member links point at district pages.
        xpath = "//a[contains(@href, 'District')]"
        for link in page.xpath(xpath):
            member = link.xpath('string()').strip()
            member = re.sub(r'\s+', ' ', member)

            # Skip blanks and the navigation link that shares the href.
            if not member or member == 'House District Maps':
                continue

            # Link text looks like "[Co-][Vice ]Chair Rep. Name (...)".
            # NOTE(review): match is assumed non-None here; an unexpected
            # format would raise AttributeError -- confirm against source.
            match = re.match(r'((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)', member)
            member = match.group(4).strip()
            role = match.group(1) or 'member'

            comm.add_member(member, role.lower())

        if not comm['members']:
            if comm['subcommittee'] == 'test':
                # Whoopsie, prod data.
                return

            raise Exception('no members for %s (%s)' %
                            (comm['committee'], comm['subcommittee']))

        self.save_committee(comm)
Exemplo n.º 47
0
    def scrape_senate_committees(self, term_name, chamber):
        """Scrape Senate standing committees for each year of a term.

        *term_name* looks like "2011-2012"; the two-digit year forms part
        of the committee-list URL.
        """
        years = [t[2:] for t in term_name.split('-')]

        for year in years:
            # Skip years that have not happened yet.
            if int(year) > int(str(dt.datetime.now().year)[2:]):
                self.log("Not running session %s, it's in the future." %
                         (term_name))
                continue
            url = '{base}{year}info/com-standing.htm'.format(
                base=self.senate_url_base, year=year)
            page_string = self.urlopen(url)
            page = lxml.html.fromstring(page_string)
            ps = page.xpath('id("mainContent")/table/*[3]/p')
            for p in ps:
                # The first link in each paragraph is the committee page.
                links = p.xpath('a[1]')
                if not links:
                    continue
                a = links[0]
                committee_name = a.text_content().strip()
                committee_url = a.attrib.get('href')
                committee = Committee(chamber, committee_name)
                committee_page_string = self.urlopen(committee_url)
                committee_page = lxml.html.fromstring(committee_page_string)
                # Prefer the nested member list; fall back to any li on
                # the page when the structure differs.
                lis = committee_page.xpath(
                    "//div[@id='mainContent']/ul/ul[1]/li")
                if len(lis) == 0:
                    lis = committee_page.xpath("//div[@id='mainContent']//li")
                    # This MIGHT cause issues.
                for li in lis:
                    # Entries look like "Name, District[, Role]".
                    mem_parts = li.text_content().strip().split(',')
                    mem_name = mem_parts[0]
                    mem_role = 'member'
                    if len(mem_parts) > 2:
                        mem_role = mem_parts[2].lower()
                    committee.add_member(mem_name, role=mem_role)
                committee.add_source(url)
                committee.add_source(committee_url)
                self.save_committee(committee)
Exemplo n.º 48
0
    def scrape(self, chamber, term):
        """Scrape Arizona standing committees for one chamber/term."""
        self.validate_term(term)
        session = self.get_session_for_term(term)
        try:
            session_id = self.get_session_id(session)
        except KeyError:
            raise NoDataForPeriod

        url = 'http://www.azleg.gov/StandingCom.asp'
        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        chamber_name = dict(upper="Senate",
                            lower="House of Representatives")[chamber]
        # Committee cells follow the chamber's bold section header.
        xpath = '//strong[contains(text(), "%s")]/../../following-sibling::tr/td'
        tds = doc.xpath(xpath % chamber_name)
        for td in tds:
            name = td.text_content().strip()
            source_url = td.xpath('a/@href')[0]
            # The committee and session ids live in the link's query
            # string; this deliberately overrides the session_id above.
            query = urlparse.urlparse(source_url).query
            params = dict(urlparse.parse_qsl(query))
            c_id = params['Committee_ID']
            session_id = params['Session_ID']

            c = Committee(chamber, name, session=session, az_committee_id=c_id)

            c.add_source(source_url)
            #for some reason they don't always have any info on the committees'
            try:
                self.scrape_com_info(session, session_id, c_id, c)
            except HTTPError:
                pass

            if not c['members']:
                msg = 'No members found: not saving {committee}.'
                self.logger.warning(msg.format(**c))
                continue
            self.save_committee(c)
Exemplo n.º 49
0
    def scrape_senate_committee(self, committee_name, link):
        """Scrape individual committee page and add members"""
        # Members appear in the first two lists of the left column.
        find_expr = "//div[@class='col1']/ul[position()<3]/li"

        com = Committee('upper', committee_name)

        with self.urlopen(link) as page:
            # Find individual committee urls
            page = lxml.html.fromstring(page)

            for el in page.xpath(find_expr):
                # Entries read "Name" or "Name, Role".
                pieces = [part.strip()
                          for part in el.text_content().split(',', 1)]
                if len(pieces) > 1:
                    member_name, role = pieces
                else:
                    member_name, role = pieces[0], 'member'

                if member_name != "":
                    com.add_member(member_name, role)

        com.add_source(link)
        self.save_committee(com)
Exemplo n.º 50
0
    def scrape(self, chamber, term):
        """Scrape WA committees for one chamber via the legislature's
        XML web service."""
        # e.g. term "2011-2012" -> biennium "2011-12".
        biennium = "%s-%s" % (term[0:4], term[7:9])

        url = "%s/GetActiveCommittees?biennium=%s" % (self._base_url, biennium)
        page = self.urlopen(url)
        page = lxml.etree.fromstring(page.bytes)

        for comm in xpath(page, "//wa:Committee"):
            agency = xpath(comm, "string(wa:Agency)")
            comm_chamber = {'House': 'lower', 'Senate': 'upper'}[agency]
            # The service returns both chambers; keep only the requested one.
            if comm_chamber != chamber:
                continue

            name = xpath(comm, "string(wa:Name)")
            comm_id = xpath(comm, "string(wa:Id)")
            # acronym = xpath(comm, "string(wa:Acronym)")
            phone = xpath(comm, "string(wa:Phone)")

            comm = Committee(chamber, name, _code=comm_id, office_phone=phone)
            self.scrape_members(comm, agency)
            comm.add_source(url)
            # Only save committees that yielded members.
            if comm['members']:
                self.save_committee(comm)
Exemplo n.º 51
0
    def scrape_upper_committee(self, name, url):
        """Scrape an upper-chamber committee roster, deduplicating names."""
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            comm = Committee('upper', name)
            comm.add_source(url)

            member_div = page.xpath("//div[@class = 'committee-members']")[0]

            added = set()
            for link in member_div.xpath(".//a"):
                text = link.text
                if not text:
                    continue

                member = text.strip()
                # Skip blanks and repeats (members may be linked twice).
                if not member or member in added:
                    continue
                added.add(member)

                comm.add_member(member)

            self.save_committee(comm)
Exemplo n.º 52
0
    def scrape_upper_committee(self, name, url):
        """Scrape an upper-chamber committee; raises if no members found."""
        page = lxml.html.fromstring(self.urlopen(url))

        comm = Committee('upper', name)
        comm.add_source(url)

        # Member links point at legislator biography pages.
        for link in page.xpath("//a[contains(@href, 'biographies')]"):
            member = link.xpath("string()").strip()
            member = re.sub(r'\s+', ' ', member)
            if not member:
                continue
            # The link's tail text carries any officer designation.
            role = link.tail
            if not role:
                role = 'member'
            elif 'Vice Chair' in role:
                role = 'vice chair'
            elif 'Chair' in role:
                role = 'chair'
            comm.add_member(member, role=role)

        if not comm['members']:
            # BUG FIX: the original passed the name as a second positional
            # argument to Exception instead of %-formatting it into the
            # message, producing a tuple-shaped error string.
            raise Exception('no members for %s' % comm['name'])
        self.save_committee(comm)
Exemplo n.º 53
0
    def scrape(self, term, chambers):
        """Scrape DC Council committees (recorded as upper chamber)."""
        com_url = 'http://www.dccouncil.washington.dc.us/committees'
        doc = lxml.html.fromstring(self.urlopen(com_url))

        # Committee pages all contain "committee-on" in their URL.
        for url in set(doc.xpath('//a[contains(@href, "committee-on")]/@href')):
            page = lxml.html.fromstring(self.urlopen(url))

            name = page.xpath('//h1/text()')[0].replace('Committee on ', '')
            com = Committee('upper', name)

            # Chairs and councilmembers sit under separate h3 headings.
            chair_xpath = '//h3[text()="Committee Chair"]/following-sibling::p'
            for chair in page.xpath(chair_xpath):
                com.add_member(chair.text_content(), role='chairperson')

            member_xpath = '//h3[text()="Councilmembers"]/following-sibling::p/a'
            for member in page.xpath(member_xpath):
                com.add_member(member.text_content(), role='member')

            com.add_source(url)
            self.save_committee(com)
Exemplo n.º 54
0
    def scrape_senate_comm(self):
        """Scrape Maine Senate standing committees and their members."""
        url = 'http://www.maine.gov/legis/senate/Senate-Standing-Committees.html'

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # Committee titles are styled with an 11pt span.
            for span in doc.xpath('//span[@style="FONT-SIZE: 11pt"]'):
                title = span.text_content().strip()
                # Skip blank spans and stray 'COMMITTEE ON' heading fragments.
                if not title or title.startswith('COMMITTEE'):
                    continue

                com = Committee('upper', title.title())
                com.add_source(url)

                # Members live in the <ul> following the span's grandparent;
                # each entry reads 'Name of Town' -- keep only the name part.
                for entry in span.xpath('../../following-sibling::ul[1]/li'):
                    member_name = entry.text_content().strip().split(' of ')[0]
                    com.add_member(member_name)

                self.save_committee(com)
Exemplo n.º 55
0
    def scrape(self, term, chambers):
        """Scrape North Carolina standing and select committees.

        Fetches the printable committee listing for each chamber/type
        combination and scrapes every committee linked from it.
        """
        base_url = 'http://www.ncga.state.nc.us/gascripts/Committees/Committees.asp?bPrintable=true&sAction=ViewCommitteeType&sActionDetails='

        chamber_slugs = {
            'upper': ['Senate%20Standing', 'Senate%20Select'],
            'lower': ['House%20Standing', 'House%20Select']
        }

        for chamber in chambers:
            for ctype in chamber_slugs[chamber]:
                data = self.urlopen(base_url + ctype)
                doc = lxml.html.fromstring(data)
                doc.make_links_absolute(base_url + ctype)
                for comm in doc.xpath('//ul/li/a'):
                    name = comm.text
                    # Bug fix: anchors without text made the 'in' test below
                    # raise TypeError on None; skip them instead.
                    if not name:
                        continue
                    # skip committee of whole Senate
                    if 'Whole Senate' in name:
                        continue
                    url = comm.get('href')
                    committee = Committee(chamber, name)
                    self.scrape_committee(committee, url)
                    committee.add_source(url)
                    self.save_committee(committee)
Exemplo n.º 56
0
    def scrape(self, chamber, term):
        """Scrape Nevada legislative committees for one chamber of a term.

        :param chamber: 'upper' or 'lower'
        :param term: term name; its latest session decides the URL path
        :raises NoDataForPeriod: for terms/sessions with no committee pages
        """
        # Latest session belonging to the requested term.
        session = None
        for t in self.metadata['terms']:
            if t['name'] == term:
                session = t['sessions'][-1]
        if session is None:
            # Bug fix: an unknown term previously fell through to a
            # NameError on the unbound 'session' variable below.
            raise NoDataForPeriod(term)

        # English ordinal suffix for the session number, e.g. 77 -> '77th'.
        digits = str(session)
        if digits[-2:] in ('11', '12', '13'):
            # Bug fix: 11/12/13 take 'th', not 'st'/'nd'/'rd'.
            sessionsuffix = 'th'
        elif digits[-1] == '1':
            sessionsuffix = 'st'
        elif digits[-1] == '2':
            sessionsuffix = 'nd'
        elif digits[-1] == '3':
            sessionsuffix = 'rd'
        else:
            sessionsuffix = 'th'
        insert = digits + sessionsuffix + str(term[0:4])

        chamber_letter = {'lower': 'A', 'upper': 'S'}[chamber]

        # Session metadata may override the computed URL path segment.
        insert = self.metadata['session_details'][session].get(
            '_committee_session', insert
        )

        url = 'http://www.leg.state.nv.us/Session/%s/Committees/%s_Committees/' % (
            insert, chamber_letter)
        # The 2014 special session has no committee pages.
        if insert in ['28th2014Special']:
            raise NoDataForPeriod(insert)

        page = self.urlopen(url)
        root = lxml.html.fromstring(page)
        for com_a in root.xpath('//strong/a'):
            com_url = url + com_a.get('href')
            # The Committee of the Whole is not a real standing committee.
            if com_a.text == 'Committee of the Whole':
                continue
            com = Committee(chamber, com_a.text)
            com.add_source(com_url)
            self.scrape_comm_members(chamber, com, com_url)
            self.save_committee(com)
Exemplo n.º 57
0
    def scrape_lower_committee(self, committee_name, url):
        """Scrape a lower-chamber committee page; save it only if members were found."""
        page = self.lxmlize(url)

        committee = Committee('lower', committee_name.strip())
        committee.add_source(url)

        info_node = self.get_node(
            page, './/div[@id = "dnn_ctr1109_ViewWebCommission_WebCommission1_'
            'pnlCommission"]')

        # Member names follow <br> tags inside the two-column layout; this
        # query may also pick up whitespace-only text nodes.
        member_nodes = self.get_nodes(
            info_node,
            './/div[@class="two-cols com"]/div[@class="col"]//text()'
            '[normalize-space() and preceding-sibling::br]')

        members_added = 0
        for raw_member in member_nodes:
            # Strip the honorific prefix and surrounding whitespace.
            member = re.sub(r'Hon\.\s*', '', raw_member).strip()
            if not member:
                continue

            member, title = self._match_title(member)
            if title is None:
                committee.add_member(member)
            else:
                committee.add_member(member, title)
            members_added += 1

        # Do not persist empty committees.
        if members_added > 0:
            self.save_committee(committee)
Exemplo n.º 58
0
    def scrape_committees(self, chamber, url):
        """Scrape Maryland committees and their subcommittees from msa.md.gov.

        Each committee page lists members as links; headings containing
        'SUBCOMMITTEE'/'OVERSIGHT COMMITTEE' start a new subcommittee record.
        """
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        # distinct URLs containing /com/
        committees = set([
            l.get('href') for l in doc.xpath('//li/a')
            if l.get('href', '').find('/com/') != -1
        ])

        for com in committees:
            com_url = 'http://www.msa.md.gov' + com
            chtml = self.urlopen(com_url)
            cdoc = lxml.html.fromstring(chtml)

            # The committee name is the first non-empty h2/h3 heading.
            committee_name = None
            for h in cdoc.xpath('//*[self::h2 or self::h3]'):
                if h.text:
                    committee_name = h.text
                    break
            # Bug fix: a page without a usable heading used to raise a
            # NameError on the membership test below; skip it instead.
            if committee_name is None:
                continue

            # non committees
            if 'DEFUNCT' in committee_name or 'ORGANIZATION' in committee_name:
                continue

            cur_com = Committee(chamber, committee_name)
            cur_com.add_source(com_url)
            for l in cdoc.xpath('//a[@href]'):
                txt = l.text or ''
                if ' SUBCOMMITTEE' in txt or 'OVERSIGHT COMMITTEE' in txt:
                    # A subcommittee heading ends the current member list:
                    # save it, then start a fresh subcommittee record.
                    self.save_committee(cur_com)
                    cur_com = Committee(chamber, committee_name, l.text)
                    cur_com.add_source(com_url)
                elif 'html/msa' in l.get('href'):
                    name = l.text
                    # Bug fix: anchors without text used to crash on
                    # endswith(); also removed the unused getprevious() local.
                    if not name:
                        continue
                    if name.endswith(','):
                        name = name[:-1]
                    cur_com.add_member(name)
            self.save_committee(cur_com)
Exemplo n.º 59
0
    def select_special_comm(self):
        """Scrape Nebraska select/special committees from the Unicameral site.

        Committees whose member list turns out empty are logged and skipped
        rather than saved.
        """
        main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
        page = self.get(main_url).text
        page = lxml.html.fromstring(page)

        for content_box in page.xpath('//div[@class="content_box"]'):
            # The name is usually the plain <h2> text; some committees link
            # the heading, in which case it lives in h2/a instead.
            name = content_box.xpath('h2')[0].text
            if name is None:  # idiom fix: was '!= None'
                name = content_box.xpath('h2/a')[0].text

            committee = Committee('upper', name)
            committee.add_source(main_url)

            # The two original branches duplicated this loop verbatim except
            # for role capitalization; unified on lowercase 'chairperson' for
            # consistency with the other scrapers in this file.
            for entry in content_box.xpath('ul[@class="nobullet"]/li'):
                senator = entry[0].text
                if 'Chairperson' in senator:
                    role = 'chairperson'
                    # Drop the 'Sen. ' prefix and ', Chairperson' suffix.
                    senator = senator[5:-13].strip()
                else:
                    role = 'member'
                    senator = senator[5:].strip()
                committee.add_member(senator, role)

            if not committee['members']:
                self.warning('no members in %s', committee['committee'])
            else:
                self.save_committee(committee)
Exemplo n.º 60
0
    def scrapeStanding(self, chamber, page, url):
        """Scrape North Dakota standing committees from a listing page.

        The page lays out one <span> heading per committee and a parallel
        sequence of member tables, so the heading index (comm_count) selects
        the matching table and member_count selects the row within it.
        """
        comm_count = 1
        for comm_names in page.xpath('//div[@class="content"][1]/p//span'):
            # Normalize punctuation out of the committee name.
            name = re.sub('[^A-Za-z0-9]+', ' ', comm_names.text).replace(' ', '')
            comm = Committee(chamber, name)

            member_count = 1
            members_path = '//div[@class="content"][1]/table[@class="p"][%s]//tr/td[2]' % (str(comm_count))
            for members in comm_names.xpath(members_path):
                memberName = members.xpath('a')[0].text
                # Idiom fix: '== None' -> 'is None'. Special case for Randy
                # Boehning under Government and Veterans Affairs in the House,
                # whose name sits in the second <a> of the cell.
                if memberName is None:
                    memberName = members.xpath('a')[1].text
                memberName = re.sub('[^A-Za-z0-9]+', ' ', memberName)

                # Role: explicit tail text on the row's link wins; otherwise
                # fall back to position (row 1 chairs, row 2 vice-chairs).
                role_path = '//div[@class="content"][1]/table[@class="p"][%s]//tr[%s]/td[2]/a' % (comm_count, member_count)
                role_text = page.xpath(role_path)[0].tail
                if role_text is not None:  # idiom fix: was '!= None'
                    if "Vice" in role_text:
                        role = "Vice-Chairman"
                    elif "Chairman" in role_text:
                        role = "Chairman"
                    else:
                        role = "Member"
                    comm.add_member(memberName, role)
                else:
                    if member_count == 1:
                        role = "Chairman"
                    elif member_count == 2:
                        role = "Vice-Chairman"
                    else:
                        role = "Member"
                    comm.add_member(memberName, role)
                member_count += 1
            comm.add_source(url)
            self.save_committee(comm)
            comm_count += 1