Пример #1
0
    def scrape_reps_comm(self, chamber, session):
        """Scrape the committees listed on the Maine House committee page.

        For each committee heading a ``Committee`` is created, filled with
        the names from the matching member list, given the page URL as its
        source and handed to ``self.save_committee``.

        :param chamber: chamber value passed straight to ``Committee``.
        :param session: not used by this method.
        """
        url = 'http://www.maine.gov/legis/house/hsecoms.htm'

        with self.urlopen(url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            # Headings are read from the odd-numbered <center> elements
            # (1, 3, ..., 11); the n-th heading is paired with the n-th <ul>
            # directly under <body>.
            for list_index, center_index in enumerate(range(1, 12, 2), 1):
                comm_name = root.xpath(
                    'string(//body/center[%s]/h1/a)' % center_index)
                committee = Committee(chamber, comm_name)

                for el in root.xpath('/html/body/ul[%s]/li/a' % list_index):
                    rep = el.text
                    if not rep:
                        # An anchor without text carries no member name.
                        continue

                    mark = rep.find('(')
                    if mark != -1:
                        # Entries containing a parenthesis lose their first
                        # 15 characters and everything from the parenthesis
                        # onwards.
                        rep = rep[15:mark]

                    # The slice can leave whitespace around the name.
                    rep = rep.strip()
                    if rep:
                        committee.add_member(rep)

                committee.add_source(url)
                self.save_committee(committee)
Пример #2
0
    def scrape_senate_committee(self, term, link):
        """Scrape one Minnesota Senate committee page and save the committee.

        :param term: not used by this method.
        :param link: URL of the committee page; also recorded as the source.
        """
        with self.urlopen(link) as html:
            doc = lxml.html.fromstring(html)

            # strip first 30 and last 10
            # Minnesota Senate Committees - __________ Committee
            committee_name = doc.xpath('//title/text()')[0][30:-10]

            com = Committee('upper', committee_name)

            # Rows without a "position:" prefix reuse the most recent role.
            # Starting from 'member' keeps a leading un-prefixed row from
            # hitting an unbound ``role``.
            role = 'member'

            # first id=bio table is members
            for row in doc.xpath('//table[@id="bio"]')[0].xpath('tr'):
                text = fix_whitespace(row.text_content())

                if ':' in text:
                    # Split on the first colon only, so a missing space after
                    # it or a second colon later in the row cannot break the
                    # two-value unpacking.
                    position, name = text.split(':', 1)
                    role = position.lower().strip()
                    name = name.strip()
                else:
                    name = text

                com.add_member(name, role)

            com.add_source(link)
            self.save_committee(com)
Пример #3
0
    def scrape(self, chamber, term):
        """Scrape Utah standing committees for one chamber.

        :param chamber: 'upper' or 'lower'; any other value raises KeyError.
        :param term: must be '2011-2012'.
        :raises NoDataForPeriod: for any other term.
        """
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        chamber_abbr = {'upper': 's', 'lower': 'h'}[chamber]

        url = "http://le.utah.gov/asp/interim/standing.asp?house=%s" % chamber_abbr
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for comm_link in page.xpath("//a[contains(@href, 'Com=')]"):
                comm_name = (comm_link.text or '').strip()
                if not comm_name:
                    # A link without text cannot name a committee.
                    continue

                # Drop leading "House" or "Senate" from name
                comm_name = re.sub(r"^(House|Senate) ", "", comm_name)

                comm = Committee(chamber, comm_name)
                # Record where the data came from, as the other committee
                # scrapers do.
                comm.add_source(url)

                for mbr_link in comm_link.xpath(
                    "../../../font[2]/a[not(contains(@href, 'mailto'))]"):

                    name = (mbr_link.text or '').strip()
                    if not name:
                        continue

                    # An <i> element right after the member link holds the
                    # role; without one the person is a plain member.
                    role = 'member'
                    next_el = mbr_link.getnext()
                    if next_el is not None and next_el.tag == 'i':
                        role = (next_el.text or '').strip() or 'member'

                    comm.add_member(name, role)

                self.save_committee(comm)
Пример #4
0
    def scrape_reps_comm(self, chamber, term):
        """Scrape Ohio House committee pages over a fixed range of ids.

        Each id from 87 through 123 is requested from the House site; the
        committee found there is given the page URL as its source, filled
        with the member links on the page and saved.

        :param chamber: chamber used for committees with id 92 and above.
        :param term: not used by this method.
        """
        url_template = (
            "http://www.house.state.oh.us/index.php?option="
            "com_displaycommittees&task=2&type=Regular&"
            "committeeId=%d"
        )

        # id range for committees on the House website
        for comm_id in range(87, 124):
            comm_url = url_template % comm_id

            with self.urlopen(comm_url) as page:
                page = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

                header = page.xpath('string(//table/tr[@class="committeeHeader"]/td)')
                comm_name = header.replace("/", " ")

                # ids below 92 are filed under the "joint" chamber
                comm_chamber = "joint" if comm_id < 92 else chamber

                committee = Committee(comm_chamber, comm_name)
                committee.add_source(comm_url)

                for link in page.xpath("//a[contains(@href, 'district')]"):
                    member = (link.text or "").strip()
                    if member:
                        committee.add_member(member)

                self.save_committee(committee)
Пример #5
0
    def scrape_house_committees(self, term):
        """Scrape Minnesota House committees and their member lists.

        Every highlighted committee heading on the listing page is followed
        to its "Members" page; each legislator found there is added with
        either the role shown next to the name or 'member'.

        :param term: not used by this method.
        """
        url = 'http://www.house.leg.state.mn.us/comm/commemlist.asp'

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            for heading in doc.xpath('//h2[@class="commhighlight"]'):
                member_links = heading.xpath(
                    'following-sibling::p[1]/a[text()="Members"]/@href')
                if not member_links:
                    # No "Members" link follows this heading, so there is no
                    # roster to fetch.
                    continue
                members_url = member_links[0]

                com = Committee('lower', heading.text)
                com.add_source(members_url)

                with self.urlopen(members_url) as member_html:
                    mdoc = lxml.html.fromstring(member_html)

                    # each legislator in their own table
                    # first row, second column contains all the info
                    for entry in mdoc.xpath('//table/tr[1]/td[2]/p/b[1]'):
                        # the bold element's text holds the name, plus the
                        # role when there is one
                        name = entry.text_content()

                        # role is inside a nested b tag
                        role = entry.xpath('b/*/text()')
                        if role:
                            # if there was a role, remove it from name
                            role = role[0]
                            name = name.replace(role, '')
                        else:
                            role = 'member'

                        # Removing the role can leave whitespace behind.
                        com.add_member(name.strip(), role)

                # save
                self.save_committee(com)
Пример #6
0
    def scrape_reps_comm(self, chamber, year):
        """Scrape Ohio House committee pages over a fixed range of ids.

        :param chamber: chamber used for committees with id 92 and above.
        :param year: not used by this method.
        """
        # Rows from the third onwards of the content table list the members.
        path = '/html/body[@id="bd"]/div[@id="ja-wrapper"]/div[@id="ja-containerwrap-f"]/div[@id="ja-container"]/div[@id="ja-mainbody-f"]/div[@id="ja-contentwrap"]/div[@id="ja-content"]/table/tr[position() >=3]'

        # id range for committees on the House website
        for comm_id in range(87, 124):
            comm_url = 'http://www.house.state.oh.us/index.php?option=com_displaycommittees&task=2&type=Regular&committeeId=' + str(comm_id)
            with self.urlopen(comm_url) as page:
                root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

                comm_name = root.xpath('string(//table/tr[@class="committeeHeader"]/td)')
                comm_name = comm_name.replace("/", " ")

                # joint legislative committees
                if comm_id < 92:
                    comm_chamber = "joint_legislation"
                else:
                    comm_chamber = chamber

                committee = Committee(comm_chamber, comm_name)

                for el in root.xpath(path):
                    # Each row can hold two members, in columns 1 and 4.
                    for cell_path in ('string(td[1]/a)', 'string(td[4]/a)'):
                        rep = el.xpath(cell_path).strip()
                        if rep:
                            # string() yields '' for a missing link; such
                            # blanks are not members.
                            committee.add_member(rep)

                committee.add_source(comm_url)
                self.save_committee(committee)
Пример #7
0
    def scrape_house(self):
        """Scrape Louisiana House committee assignments.

        The page lists one representative per table row with that person's
        committees in the second cell, so committees are accumulated in a
        cache keyed by name and saved once all rows have been read.
        """
        url = "http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp"

        # (suffix following the committee name, role recorded for it).
        # The misspelled "Co-Chairmain" entry is kept as it was; the
        # correctly spelled variant is recognised as well.
        role_suffixes = (
            (', Chairman', 'chairman'),
            (', Co-Chairmain', 'co-chairmain'),
            (', Co-Chairman', 'co-chairman'),
            (', Vice Chair', 'vice chair'),
            (', Ex Officio', 'ex officio'),
        )

        comm_cache = {}
        with self.urlopen(url) as text:
            page = lxml.html.fromstring(text)

            for row in page.xpath("//table[@bordercolorlight='#EAEAEA']/tr"):
                cells = row.xpath('td')
                if len(cells) < 2:
                    # Not a representative row.
                    continue

                name = cells[0].xpath('string()').strip()

                if name.startswith('Vacant'):
                    continue

                fonts = cells[1].xpath('font')
                if not fonts:
                    continue
                font = fonts[0]

                # Committee names are the text chunks separated by <br>.
                pieces = [font.text]
                for br in font.xpath('br'):
                    pieces.append(br.text)
                    pieces.append(br.tail)

                for piece in pieces:
                    # Strip every chunk so the suffix checks below are not
                    # defeated by trailing whitespace.
                    comm_name = (piece or '').strip()
                    if not comm_name:
                        continue

                    mtype = 'member'
                    for suffix, suffix_role in role_suffixes:
                        if comm_name.endswith(suffix):
                            mtype = suffix_role
                            # Remove only the trailing suffix.
                            comm_name = comm_name[:-len(suffix)].rstrip()
                            break

                    if comm_name.startswith('Joint'):
                        chamber = 'joint'
                    else:
                        chamber = 'lower'

                    try:
                        committee = comm_cache[comm_name]
                    except KeyError:
                        committee = Committee(chamber, comm_name)
                        committee.add_source(url)
                        comm_cache[comm_name] = committee

                    committee.add_member(name, mtype)

            for committee in comm_cache.values():
                self.save_committee(committee)
Пример #8
0
    def scrape_committee(self, chamber, term, name, url):
        """Scrape one committee page and save the committee.

        Members are read from the text following the "Members:" label; the
        chair and vice chair are read from the text following their labels
        and added with the roles 'chair' and 'vice chair'.

        :param chamber: chamber value passed to ``Committee``.
        :param term: not used by this method.
        :param name: committee name passed to ``Committee``.
        :param url: page to fetch; also recorded as the source.
        """
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            committee = Committee(chamber, name)
            committee.add_source(url)

            members_label = page.xpath("//strong[contains(., 'Members:')]")
            mlist = members_label[0].tail if members_label else None
            # A missing label or empty tail yields no members instead of an
            # exception.
            mlist = re.sub(r'\s+', ' ', mlist or '')

            for member in mlist.split(','):
                member = re.sub(r'R\.M\.(M\.)?$', '', member.strip()).strip()
                if member:
                    committee.add_member(member)

            # "Vice Chair:" contains "Chair:", so the chair lookup has to
            # exclude labels mentioning "Vice".
            leaders = (
                ("//strong[contains(., 'Chair:') and not(contains(., 'Vice'))]",
                 'chair'),
                ("//strong[contains(., 'Vice Chair:')]", 'vice chair'),
            )
            for label_path, role in leaders:
                labels = page.xpath(label_path)
                if not labels:
                    continue
                leader = (labels[0].tail or '').strip()
                if leader:
                    committee.add_member(leader, role)

            self.save_committee(committee)
Пример #9
0
    def scrape(self, chamber, year):
        """Scrape Pennsylvania committee assignments for one chamber.

        The page lists legislators with their committee assignments, so
        committees are accumulated in a dict keyed by (chamber, committee,
        subcommittee) and saved after the whole page has been read.

        :param chamber: 'upper' selects the senators page; anything else
            selects the representatives page.
        :param year: must be '2009'.
        :raises NoDataForPeriod: for any other year.
        """
        if year != '2009':
            raise NoDataForPeriod(year)

        if chamber == 'upper':
            url = ('http://www.legis.state.pa.us/cfdocs/legis/'
                   'home/member_information/senators_ca.cfm')
        else:
            url = ('http://www.legis.state.pa.us/cfdocs/legis/'
                   'home/member_information/representatives_ca.cfm')

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            committees = {}

            for li in page.xpath("//a[contains(@href, 'bio.cfm')]/../.."):
                name = li.xpath("string(b/a[contains(@href, 'bio.cfm')])")
                # The last four characters of the link text are dropped.
                name = name[0:-4]

                for link in li.xpath("a"):
                    if not link.tail:
                        continue

                    committee_name = re.sub(r"\s+", " ", link.tail.strip())
                    if not committee_name:
                        # A whitespace-only tail names no committee.
                        continue

                    subcommittee_name = None
                    role = 'member'

                    # NOTE(review): '../i' selects the first <i> of the
                    # parent for every link - confirm against the page that
                    # this is the intended element.
                    rest = link.xpath('string(../i)')
                    if rest:
                        match = re.match(r',\s+(Subcommittee on .*)\s+-',
                                         rest)

                        if match:
                            subcommittee_name = match.group(1)
                            # Everything after the matched dash is the role;
                            # this keeps hyphenated roles intact.
                            role = rest[match.end():].strip()
                        else:
                            role = rest.replace(', ', '').strip()

                    key = (chamber, committee_name, subcommittee_name)
                    try:
                        committee = committees[key]
                    except KeyError:
                        committee = Committee(chamber, committee_name)
                        committee.add_source(url)
                        if subcommittee_name:
                            committee['subcommittee'] = subcommittee_name

                        committees[key] = committee

                    committee.add_member(name, role)

            for committee in committees.values():
                self.save_committee(committee)
Пример #10
0
    def scrape_senate(self):
        """Scrape Senate Committees

        Builds one upper-chamber ``Committee`` per entry of
        ``nyss_openlegislation.models.committees`` and saves it with the
        full names of its members.
        """
        all_committees = nyss_openlegislation.models.committees

        for raw_name, comm in all_committees.items():
            # Title-case the name, then lower-case every 'And' in it.
            display_name = raw_name.title()
            display_name = display_name.replace('And', 'and')

            committee = Committee('upper', display_name)
            for senator in comm.members:
                committee.add_member(senator.fullname)

            self.save_committee(committee)
Пример #11
0
    def scrape(self, chamber, year):
        """Scrape Maryland House committees and their subcommittees.

        Only the House listing is read, and every committee is created with
        the 'lower' chamber regardless of ``chamber``.

        :param chamber: not used by this method.
        :param year: not used by this method.
        """
        # TODO: scrape senate committees
        house_url = 'http://www.msa.md.gov/msa/mdmanual/06hse/html/hsecom.html'

        with self.urlopen(house_url) as html:
            doc = lxml.html.fromstring(html)
            # distinct URLs containing /com/
            committees = set([l.get('href') for l in doc.cssselect('li a')
                              if l.get('href', '').find('/com/') != -1])

        for com in committees:
            com_url = 'http://www.msa.md.gov' + com
            with self.urlopen(com_url) as chtml:
                cdoc = lxml.html.fromstring(chtml)

                # The first h2/h3 with text is the committee name.
                committee_name = None
                for h in cdoc.cssselect('h2, h3'):
                    if h.text:
                        committee_name = h.text
                        break
                if committee_name is None:
                    # Without a heading the name of the previous page would
                    # be reused (or be unbound on the first page).
                    continue

                cur_com = Committee('lower', committee_name)
                cur_com.add_source(com_url)
                for l in cdoc.cssselect('a[href]'):
                    link_text = l.text or ''
                    if ' SUBCOMMITTEE' in link_text:
                        # A subcommittee link closes the committee collected
                        # so far and starts a new one for later member links.
                        self.save_committee(cur_com)
                        cur_com = Committee('lower', l.text, committee_name)
                        cur_com.add_source(com_url)
                    elif 'html/msa' in l.get('href') and link_text.strip():
                        cur_com.add_member(l.text)
                self.save_committee(cur_com)
Пример #12
0
    def scrape_senate_committee(self, name, url):
        """Scrape the member assignments of one Louisiana Senate committee.

        :param name: committee name passed to ``Committee``.
        :param url: committee URL; 'Default.asp' is swapped for
            'Assignments.asp' before the page is fetched.
        """
        url = url.replace('Default.asp', 'Assignments.asp')

        committee = Committee('upper', name)
        # Record the page the members are read from.
        committee.add_source(url)

        with self.urlopen(url) as text:
            page = lxml.html.fromstring(text)

            links = page.xpath('//table[@bordercolor="#EBEAEC"]/tr/td/font/a')

            for link in links:
                # A separate local keeps the ``name`` parameter intact.
                member = link.xpath('string()')
                member = member.replace('Senator ', '').strip()

                if member:
                    committee.add_member(member)

        self.save_committee(committee)
Пример #13
0
 def scrape_index(self, chamber, session, session_id, committee_type):
     """Scrape the committee index for one chamber and committee type.

     Every <committee> element under the chamber's <body> element becomes
     a ``Committee``; its details are filled in by ``scrape_com_info``
     before it is saved.

     :param chamber: 'upper' or 'lower'; anything else raises KeyError.
     :param session: stored on each committee and passed to
         ``scrape_com_info``.
     :param session_id: used in the index URL.
     :param committee_type: used in the index URL.
     """
     query = 'xml/committees.asp?session=%s&type=%s' % (session_id,
                                                         committee_type)
     url = base_url + query

     with self.urlopen(url) as page:
         root = etree.fromstring(page, etree.XMLParser(recover=True))

         body_code = {'upper': 'S', 'lower': 'H'}[chamber]
         committee_path = '//body[@Body="%s"]/committee' % body_code

         # TODO need to and make sure to add sub committees
         for node in root.xpath(committee_path):
             # The element's attribute values are unpacked positionally.
             c_id, name, short_name, sub = node.values()
             committee = Committee(chamber, name, short_name=short_name,
                                   session=session, az_committee_id=c_id)
             committee.add_source(url)
             self.scrape_com_info(session, session_id, c_id, committee)
             self.save_committee(committee)
Пример #14
0
    def scrape(self, chamber, term):
        """Scrape North Carolina standing and select committees.

        For each committee type of the chamber, the listing page is fetched
        and every linked committee is scraped through
        ``self.scrape_committee`` and then saved.

        :param chamber: 'upper' or 'lower'; anything else raises KeyError.
        :param term: not used by this method.
        """
        base_url = 'http://www.ncga.state.nc.us/gascripts/Committees/Committees.asp?bPrintable=true&sAction=ViewCommitteeType&sActionDetails='

        committee_types = {
            'upper': ['Senate%20Standing', 'Senate%20Select'],
            'lower': ['House%20Standing', 'House%20Select'],
        }

        for ctype in committee_types[chamber]:
            listing_url = base_url + ctype

            with self.urlopen(listing_url) as data:
                doc = lxml.html.fromstring(data)
                doc.make_links_absolute(listing_url)

                for anchor in doc.xpath('//ul/li/a'):
                    url = anchor.get('href')
                    committee = Committee(chamber, anchor.text)
                    # members are filled in before the source is recorded
                    self.scrape_committee(committee, url)
                    committee.add_source(url)
                    self.save_committee(committee)
Пример #15
0
    def scrape_senate(self):
        """Scrape Senate Committees

        Committee pages are discovered from the links on the committees
        listing; each page supplies the committee name, its chair and its
        members.
        """
        senate_url = "http://www.nysenate.gov"
        senate_committees_url = senate_url + "/committees"

        with self.urlopen(senate_committees_url) as html:
            doc = lxml.html.fromstring(html)
            committee_paths = set([l.get("href") for l in doc.cssselect("li a")
                                   if l.get("href", "").find("/committee/") != -1])

        for committee_path in committee_paths:
            committee_url = senate_url + committee_path
            with self.urlopen(committee_url) as chtml:
                cdoc = lxml.html.fromstring(chtml)

                committee_name = None
                for h in cdoc.cssselect(".committee_name"):
                    if h.text:
                        committee_name = h.text
                        break
                if committee_name is None:
                    # Without a name element the name of the previous page
                    # would be reused (or be unbound on the first page).
                    continue

                committee = Committee("upper", committee_name)
                committee.add_source(committee_url)

                prefix = "Sen."
                for l in cdoc.cssselect(".committee-chair a[href]"):
                    if "/senator/" in l.get("href") and l.text and l.text.startswith(prefix):
                        # Slice off the prefix instead of splitting on
                        # 'Sen. ', which fails when no space follows it.
                        chair = l.text[len(prefix):].strip()
                        if chair:
                            committee.add_member(chair, "chair")

                for l in cdoc.cssselect(".committee-members a[href]"):
                    if "/senator/" in l.get("href") and l.text:
                        committee.add_member(l.text)

                self.save_committee(committee)
Пример #16
0
    def scrape_assembly(self):
        """Scrape Assembly Committees

        Only the standing committees (the first list under ``#sitelinks``)
        are followed. On each committee page the first listed member is
        recorded as chair.
        """
        assembly_committees_url = "http://assembly.state.ny.us/comm/"

        with self.urlopen(assembly_committees_url) as html:
            doc = lxml.html.fromstring(html)
            # The first list holds the standing committees. The later lists
            # (subcommittees, legislative commissions, task forces) are not
            # scraped, so their exact number does not matter.
            link_lists = doc.cssselect('#sitelinks ul')
            committee_paths = set()
            if link_lists:
                committee_paths = set(
                    [l.get('href') for l in link_lists[0].cssselect("li a[href]")
                     if l.get("href").startswith('?sec=mem')])

        for committee_path in committee_paths:
            committee_url = assembly_committees_url + committee_path
            with self.urlopen(committee_url) as chtml:
                cdoc = lxml.html.fromstring(chtml)

                committee_name = None
                for h in cdoc.cssselect("#content .pagehdg"):
                    if h.text:
                        committee_name = h.text.split('Committee Members')[0].strip()
                        break
                if committee_name is None:
                    # Without a heading the name of the previous page would
                    # be reused (or be unbound on the first page).
                    continue

                committee = Committee("lower", committee_name)
                committee.add_source(committee_url)

                sitelinks = cdoc.cssselect("#sitelinks")
                chair_added = False
                if sitelinks:
                    for span in sitelinks[0].iter('span'):
                        anchors = span.xpath('li/a')
                        if not anchors or not anchors[0].text:
                            # A span without a named link is not a member.
                            continue
                        member = anchors[0].text
                        if not chair_added:
                            committee.add_member(member, 'chair')
                            chair_added = True
                        else:
                            committee.add_member(member)

                self.save_committee(committee)
Пример #17
0
    def get_committees(self, term, chamber, laws_year):
        """Return the committees of one chamber from the committee list page.

        Each option of the ``P_COM_NM`` select box is expected to read
        "<prefix> <name>", where the second character of the prefix is 'H'
        or 'S'. Options that do not fit that shape are ignored.

        :param term: not used by this method.
        :param chamber: 'lower' keeps 'H' committees, 'upper' keeps 'S'
            committees; any other value yields an empty list.
        :param laws_year: substituted into the list and committee URL
            templates.
        :returns: list of ``Committee`` objects, each with its committee URL
            as source. Nothing is saved here.
        """
        committee_list = []

        committee_list_url = self.committee_list_url_template % laws_year
        list_page = ElementTree(lxml.html.fromstring(self.urlopen(committee_list_url)))
        com_select = list_page.find('//select[@name="P_COM_NM"]')
        if com_select is None:
            # Page without the select box: nothing to list.
            return committee_list

        wanted_letter = {'lower': 'H', 'upper': 'S'}.get(chamber)
        if wanted_letter is None:
            return committee_list

        for option in com_select.findall("option"):
            if not option.text:
                continue

            label = option.text.strip()
            parts = label.split(" ", 1)
            if len(parts) != 2 or len(parts[0]) < 2:
                # No "<prefix> <name>" shape to take a chamber letter from.
                continue

            prefix, name = parts
            if prefix[1] != wanted_letter:
                continue

            committee_url = self.committee_url_template % (laws_year,
                                                           urllib.quote(label))
            # committee = Committee(term['name'], chamber, name)
            committee = Committee(chamber, name)
            committee.add_source(committee_url)
            committee_list.append(committee)
        return committee_list
Пример #18
0
    def scrape_committees(self, year_abr, session):
        """Scrape New Jersey committees from the legislature's DBF files.

        COMMITT.DBF describes the committees and COMEMB.DBF lists one
        member per record; the two are joined on the committee code.

        :param year_abr: substituted into both FTP URLs.
        :param session: not used by this method.
        :raises KeyError: if a member record refers to a committee code that
            is missing from the committee file.
        """
        members_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/COMEMB.DBF' % (year_abr)
        comm_info_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/COMMITT.DBF' % (year_abr)

        COMEMB_dbf, resp = self.urlretrieve(members_url)
        COMMIT_dbf, resp2 = self.urlretrieve(comm_info_url)

        members_db = dbf.Dbf(COMEMB_dbf)
        info_db = dbf.Dbf(COMMIT_dbf)

        # Codes starting with "A" belong to the Assembly (lower chamber) and
        # those starting with "S" to the Senate (upper chamber).
        # NOTE(review): any other prefix is filed as 'joint' - confirm
        # against the data.
        chamber_by_prefix = {"A": "lower", "S": "upper"}

        comm_dictionary = {}

        # Committee Info Database
        for name_rec in info_db:
            abrv = name_rec["code"]
            chamber = chamber_by_prefix.get(abrv[:1], "joint")

            comm = Committee(chamber, name_rec["descriptio"],
                             comm_type=name_rec["type"],
                             aide=name_rec["aide"],
                             contact_info=name_rec["phone"])
            comm.add_source(members_url)
            comm.add_source(comm_info_url)
            comm_dictionary[abrv] = comm

        # Committee Member Database
        codes_with_members = []
        for member_rec in members_db:
            abr = member_rec["code"]
            comm = comm_dictionary[abr]
            if not comm['members'] if False else abr not in codes_with_members:
                codes_with_members.append(abr)
            comm.add_member(member_rec["member"])

        # Save each committee once, after all of its members are attached,
        # instead of once per member record.
        for abr in codes_with_members:
            self.save_committee(comm_dictionary[abr])
Пример #19
0
    def scrape_senate_comm(self, chamber, term):
        """Scrape Ohio Senate standing committees from a fixed list of pages.

        The committee name is derived from the URL slug (dashes become
        spaces, then title-cased); members are read from the senator links
        on the page.

        :param chamber: chamber value passed to ``Committee``.
        :param term: not used by this method.
        """
        committees = [
            "agriculture",
            "education",
            "energy-and-public-utilities",
            "environment-and-natural-resources",
            "finance-and-financial-institutions",
            "government-oversight",
            "health-human-services-and-aging",
            "highways-and-transportation",
            "insurance-commerce-and-labor",
            "judiciary-civil-justice",
            "judiciary-criminal-justice",
            "reference",
            "rules",
            "state-and-local-government-and-veterans-affairs",
            "ways-and-means-and-economic-development",
        ]

        for name in committees:
            comm_url = "http://www.ohiosenate.gov/committees/standing/detail/" "%s.html" % name

            with self.urlopen(comm_url) as page:
                root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

                comm_name = name.replace("-", " ").title()
                committee = Committee(chamber, comm_name)
                committee.add_source(comm_url)

                for el in root.xpath("//table/tr/td"):
                    sen_name = el.xpath('string(a[@class="senatorLN"])')
                    # Keep the text before the first "(". Splitting leaves
                    # the name whole when there is no parenthesis, where
                    # slicing with find()'s -1 would drop its last character.
                    full_name = sen_name.split("(", 1)[0].strip()
                    if full_name:
                        committee.add_member(full_name)

                self.save_committee(committee)
Пример #20
0
 def scrape(self, chamber, term):
     """Scrape the committees and subcommittees of one chamber for a term.

     :param chamber: 'upper' or 'lower'; anything else raises KeyError.
     :param term: validated through ``self.validate_term`` and mapped to a
         session.
     :raises NoDataForPeriod: when no session id is known for the term's
         session.
     """
     self.validate_term(term)
     session = self.get_session_for_term(term)
     try:
         session_id = self.get_session_id(session)
     except KeyError:
         # Report which term has no data.
         raise NoDataForPeriod(term)

     # not getting the floor committees maybe try it during the new session
     # for committee_type in ('S', 'F'):
     #     self.scrape_index(chamber, session, session_id, committee_type)

     url = base_url + 'xml/committees.asp?session=%s' % session_id

     with self.urlopen(url) as page:
         root = etree.fromstring(page, etree.XMLParser(recover=True))

         body = '//body[@Body="%s"]/committee' % {'upper': 'S',
                                                  'lower': 'H'}[chamber]
         for com in root.xpath(body):
             # The element's attribute values are unpacked positionally.
             c_id, name, short_name, sub = com.values()

             marker = name.find('Subcommittee')
             if sub == '1' and marker != -1:
                 # "<parent> Subcommittee ..." is split into the parent
                 # committee and the subcommittee name.
                 parent = name[:marker].strip()
                 c = Committee(chamber, parent, short_name=short_name,
                               subcommittee=name[marker:], session=session,
                               az_committee_id=c_id)
             else:
                 # Also covers an entry flagged as subcommittee whose name
                 # lacks the word, which cannot be split.
                 c = Committee(chamber, name, short_name=short_name,
                               session=session, az_committee_id=c_id)

             c.add_source(url)
             # for some reason they don't always have any info on the
             # committees, so a failed lookup is deliberately ignored and
             # the committee is saved without the extra details
             try:
                 self.scrape_com_info(session, session_id, c_id, c)
             except HTTPError:
                 pass

             self.save_committee(c)
Пример #21
0
    def scrape_joint_comm(self, chamber, session):
        """Scrape Maine joint committees from the committee list spreadsheet.

        The spreadsheet is downloaded to ``me_joint.xls`` in the current
        directory and read with xlrd. It holds one member per row; a new
        committee starts whenever the name in the first column changes.

        :param chamber: ignored; every committee is created as 'joint'.
        :param session: not used by this method.
        """
        fileurl = 'http://www.maine.gov/legis/house/commlist.xls'

        joint = urllib.urlopen(fileurl).read()
        # The workbook is binary data, so it must be written in binary mode.
        with open('me_joint.xls', 'wb') as f:
            f.write(joint)

        wb = xlrd.open_workbook('me_joint.xls')
        sh = wb.sheet_by_index(0)

        cur_comm_name = ''
        chamber = 'joint'
        committee = None

        # Row 0 is skipped.
        for rownum in range(1, sh.nrows):

            comm_name = sh.cell(rownum, 0).value

            first_name = sh.cell(rownum, 3).value
            middle_name = sh.cell(rownum, 4).value
            last_name = sh.cell(rownum, 5).value
            jrsr = sh.cell(rownum, 6).value
            # Join only the parts that are present so blank middle names or
            # suffixes do not leave doubled or trailing spaces.
            full_name = " ".join(
                [part for part in (first_name, middle_name, last_name, jrsr)
                 if part])

            party = sh.cell(rownum, 7).value
            legalres = sh.cell(rownum, 8).value
            address1 = sh.cell(rownum, 9).value
            address2 = sh.cell(rownum, 10).value
            town = sh.cell(rownum, 11).value
            state = sh.cell(rownum, 12).value
            zipcode = int(sh.cell(rownum, 13).value)
            phone = str(sh.cell(rownum, 14).value)
            home_email = sh.cell(rownum, 15).value
            leg_email = sh.cell(rownum, 16).value

            leg_chamber = sh.cell(rownum, 2).value
            chair = sh.cell(rownum, 1).value
            role = "member"

            if chair == 1:
                role = leg_chamber + " " + "Chair"

            if committee is None or comm_name != cur_comm_name:
                # The previous committee is complete: save it once instead
                # of after every row.
                if committee is not None:
                    self.save_committee(committee)
                cur_comm_name = comm_name
                committee = Committee(chamber, comm_name)
                committee.add_source(fileurl)

            committee.add_member(full_name, role=role, party=party,
                                 legalres=legalres, address1=address1,
                                 address2=address2, town=town, state=state,
                                 zipcode=zipcode, phone=phone,
                                 home_email=home_email, leg_email=leg_email)

        # Save the committee that was still being filled at the last row.
        if committee is not None:
            self.save_committee(committee)
Пример #22
0
    def scrape(self, chamber, term):
        """Scrape Maryland committees and subcommittees for one chamber.

        :param chamber: 'lower' or 'upper'; anything else raises KeyError.
        :param term: not used by this method.
        """
        listing_urls = {'lower': 'http://www.msa.md.gov/msa/mdmanual/06hse/html/hsecom.html',
                        'upper': 'http://www.msa.md.gov/msa/mdmanual/05sen/html/sencom.html'}
        # joint: http://www.msa.md.gov/msa/mdmanual/07leg/html/ga.html

        with self.urlopen(listing_urls[chamber]) as html:
            doc = lxml.html.fromstring(html)
            # distinct URLs containing /com/
            committees = set([l.get('href') for l in doc.cssselect('li a')
                              if l.get('href', '').find('/com/') != -1])

        for com in committees:
            com_url = 'http://www.msa.md.gov' + com
            with self.urlopen(com_url) as chtml:
                cdoc = lxml.html.fromstring(chtml)

                # The first h2/h3 with text is the committee name.
                committee_name = None
                for h in cdoc.cssselect('h2, h3'):
                    if h.text:
                        committee_name = h.text
                        break
                if committee_name is None:
                    # Without a heading the name of the previous page would
                    # be reused (or be unbound on the first page).
                    continue

                cur_com = Committee(chamber, committee_name)
                cur_com.add_source(com_url)
                for l in cdoc.cssselect('a[href]'):
                    link_text = l.text or ''
                    if ' SUBCOMMITTEE' in link_text:
                        # A subcommittee link closes the committee collected
                        # so far and starts a new one for later member links.
                        self.save_committee(cur_com)
                        cur_com = Committee(chamber, committee_name, l.text)
                        cur_com.add_source(com_url)
                    elif 'html/msa' in l.get('href'):
                        name = link_text.strip()
                        if name.endswith(','):
                            name = name[:-1].rstrip()
                        if not name:
                            # Links without text (e.g. images) are not
                            # members.
                            continue

                        # A member link directly preceded by an <i> element
                        # is recorded as ex-officio.
                        prev = l.getprevious()
                        if prev is not None and prev.tag == 'i':
                            cur_com.add_member(name, 'ex-officio')
                        else:
                            cur_com.add_member(name)
                self.save_committee(cur_com)
Пример #23
0
    def scrape_comm(self, chamber, term_name):
        """Scrape Mississippi committee membership for one chamber.

        :param chamber: chamber code used in the URL; "h" is saved as
            'lower', any other value as 'upper'.
        :param term_name: not used by this method.
        """
        url = "http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml" % chamber
        with self.urlopen(url) as comm_page:
            root = lxml.etree.fromstring(comm_page, lxml.etree.HTMLParser())
            if chamber == "h":
                chamber = "lower"
            else:
                chamber = "upper"

            for mr in root.xpath("//committee"):
                name = mr.xpath("string(name)")
                comm = Committee(chamber, name)

                chair = mr.xpath("string(chair)").replace(", Chairman", "")
                if chair:
                    comm.add_member(chair, role="Chairman")

                vice_chair = mr.xpath("string(vice_chair)")
                vice_chair = vice_chair.replace(", Vice-Chairman", "")
                if vice_chair:
                    comm.add_member(vice_chair, role="Vice-Chairman")

                # Members are separated by ";". Empty pieces (an empty list
                # or a trailing separator) are skipped rather than indexed.
                for leg in mr.xpath("string(members)").split(";"):
                    leg = leg.strip()
                    if leg:
                        comm.add_member(leg)

                comm.add_source(url)
                self.save_committee(comm)
Пример #24
0
    def scrape(self, chamber, year):
        """Example scraper: build and save a committee and a subcommittee.

        The data is hard-coded for illustration.

        :param chamber: not used by this example.
        :param year: not used by this example.
        """
        com = Committee('lower', 'Committee on Finance')
        com.add_source('http://example.com')
        # can optionally specify role
        com.add_member('Lou Adams', 'chairman')
        com.add_member('Bill Smith')
        self.save_committee(com)

        # can also specify subcommittees
        subcom = Committee('lower', 'Finance Subcommittee on Banking', 'Committee on Finance')
        # The source and member belong to the subcommittee, not the parent.
        subcom.add_source('http://example.com')
        subcom.add_member('Bill Smith')
        self.save_committee(subcom)
Пример #25
0
    def scrape_senate_comm(self, chamber, insert, session):
        """Scrape and save every Senate committee listed for one session.

        For each committee page name returned by ``self.scrape_comm``, the
        page is fetched, the committee name and member names are pulled out
        with session-specific XPath expressions, and the resulting
        ``Committee`` is handed to ``self.save_committee``.

        chamber -- forwarded to ``self.scrape_comm`` and ``Committee``.
        insert  -- path segment placed after ``/Session/`` in the page URL.
        session -- session number; compared against the integers 71, 72 and
                   73 to select the page layout.
        """
        committees = self.scrape_comm(chamber, insert, session)

        for committee in committees:
            leg_url = ('http://www.leg.state.nv.us/Session/' + insert +
                       '/Committees/S_Committees/' + committee)

            with self.urlopen(leg_url) as page:
                root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

                # The heading holding the committee name sits in a different
                # place depending on the session (and on the NR.cfm page).
                if session == 73:
                    name_path = 'string(/html/body/div[@id="content"]/h2[1])'
                elif session == 72:
                    name_path = 'string(/html/body/h2[1]/font)'
                elif session == 71:
                    name_path = 'string(/html/body/h2)'
                elif committee == 'NR.cfm':
                    name_path = 'string(/html/body/div[@id="content"]/h2)'
                else:
                    name_path = 'string(/html/body/div[@id="content"]/center/h2)'
                comm_name = root.xpath(name_path)

                # Keep only the committee name: skip the first 7 characters
                # when the heading mentions "Senate", and for sessions after
                # 73 cut the text off 3 characters before the session number.
                if comm_name.find("Senate") == -1:
                    startmark = 0
                else:
                    startmark = 7

                if session <= 73:
                    comm_name = comm_name[startmark:]
                else:
                    endmark = comm_name.find(str(session))
                    comm_name = comm_name[startmark: endmark - 3]

                comm = Committee(chamber, comm_name)

                # Location of the member list items for each layout.
                if session == 73 or session == 71:
                    path = '//li'
                elif session == 72:
                    path = '/html/body/ul/li'
                else:
                    path = '/html/body/div[@id="content"]/ul/li'

                # The first two list items are the Chair and Vice Chair,
                # except on the EPE page.  The prefix must be compared with
                # 'EPE' (three characters); comparing the three-character
                # slice with 'EPE.cfm' could never match.
                positional_roles = committee[0:3] != 'EPE'

                for position, mr in enumerate(root.xpath(path), 1):
                    if session == 72:
                        # Session 72 items carry the role inside the text
                        # rather than wrapping the name in a link.
                        name = mr.xpath('string()')
                        name = name.replace('\r\n', '')
                        name = name.replace(' -Vice Chair', '')
                        name = name.replace(' -Chair', '')
                    else:
                        name = mr.xpath('string(a)')
                        name = name.replace(' \r\n ', '')

                    if position == 1 and positional_roles:
                        role = 'Chair'
                    elif position == 2 and positional_roles:
                        role = 'Vice Chair'
                    else:
                        role = 'member'

                    # List items that yield no name still count towards the
                    # position but are not recorded as members.
                    if len(name) > 0:
                        comm.add_member(name, role)
                comm.add_source(leg_url)
                self.save_committee(comm)
Пример #26
0
    def scrape_assem_comm(self, chamber, insert, year, session):
        """Scrape and save every Assembly committee listed for one session.

        For each committee page name returned by ``self.scrape_comm``, the
        page is fetched, the committee name and member names are pulled out
        with session-specific XPath expressions, and the resulting
        ``Committee`` is handed to ``self.save_committee``.

        chamber -- forwarded to ``self.scrape_comm`` and ``Committee``.
        insert  -- path segment placed after ``/Session/`` in the page URL.
        year    -- compared against the strings '2005', '2007' and '2009'
                   for the EPE.cfm special cases.
        session -- session number; compared against the integers 71, 72 and
                   73 to select the page layout.
        """
        committees = self.scrape_comm(chamber, insert, session)

        for committee in committees:
            leg_url = ('http://www.leg.state.nv.us/Session/' + insert +
                       '/Committees/A_Committees/' + committee)

            with self.urlopen(leg_url) as page:
                root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

                # The heading holding the committee name sits in a different
                # place depending on the session; session 73 uses the same
                # location as the default layout.
                if session == 72:
                    name_path = 'string(/html/body/h2[1]/font)'
                elif session == 71:
                    name_path = 'string(/html/body/h2)'
                elif committee == 'NR.cfm' and session != 73:
                    name_path = 'string(/html/body/div[@id="content"]/h2)'
                else:
                    name_path = 'string(/html/body/div[@id="content"]/h1)'
                comm_name = root.xpath(name_path)

                # Keep only the committee name: skip the first 9 characters
                # when the heading mentions "Assembly", and for sessions
                # after 73 cut the text off 3 characters before the session
                # number.
                if comm_name.find("Assembly") == -1:
                    startmark = 0
                else:
                    startmark = 9

                if session <= 73:
                    comm_name = comm_name[startmark:]
                else:
                    endmark = comm_name.find(str(session))
                    comm_name = comm_name[startmark: endmark - 3]

                comm_name = comm_name.replace(' \r\n ', '')

                # The 2005 and 2007 EPE pages start with two list items that
                # are stored on the committee as notes.
                if committee == 'EPE.cfm' and (year == '2005' or year == '2007'):
                    note1 = root.xpath('string(/html/body/div[@id="content"]/ul[1]/li[1])')
                    note2 = root.xpath('string(/html/body/div[@id="content"]/ul[1]/li[2])')
                    comm = Committee(chamber, comm_name, note1 = note1, note2 = note2)
                else:
                    comm = Committee(chamber, comm_name)

                # special case: the 2009 EPE page names two members in links
                # inside a paragraph; each gets a chair role plus a second
                # (vice chair) role.  Only the first two words of each link
                # text are kept as the name.
                if committee == 'EPE.cfm' and year == '2009':
                    words1 = root.xpath('string(/html/body/div[@id="content"]/p/a[1])').split()
                    special_name1 = words1[0] + " " + words1[1]
                    name1_2ndrole = "Constitutional Amendments Vice Chair"

                    words2 = root.xpath('string(/html/body/div[@id="content"]/p/a[2])').split()
                    special_name2 = words2[0] + " " + words2[1]
                    name2_2ndrole = "Elections Procedures and Ethics Vice Chair"

                    comm.add_member(special_name1, role="Elections Procedures and Ethics Chair", name1_2ndrole = name1_2ndrole)
                    # Role label spelled "Amendments" so it agrees with the
                    # vice-chair label used for the other member.
                    comm.add_member(special_name2, role="Constitutional Amendments Chair", name2_2ndrole = name2_2ndrole)

                # paths for grabbing names
                if session == 73 or session == 71:
                    path = '//li'
                elif session == 72:
                    path = '/html/body/ul/li'
                else:
                    path = '/html/body/div[@id="content"]/ul/li'

                # The first two list items are the Chair and Vice Chair,
                # except on the EPE page and in session 72.
                positional_roles = committee[0:3] != 'EPE' and session != 72

                # grabbing names
                for position, mr in enumerate(root.xpath(path), 1):
                    if session == 72 or session == 71:
                        # These layouts carry the role inside the item text
                        # rather than wrapping the name in a link.
                        name = mr.xpath('string()')
                        name = name.replace('\r\n', '')
                        name = name.replace(' -Vice Chair', '')
                        name = name.replace(' -Chair', '')
                        name = name.replace('-Chair', '')
                        # Remove non-breaking spaces.  The escape has to be
                        # in a unicode literal: in a plain literal '\u00a0'
                        # is a backslash sequence under Python 2 and a bare
                        # '\u' is a syntax error under Python 3.
                        name = name.replace(u'\u00a0', '')
                        name = name.replace('  ', ' ')
                    else:
                        name = mr.xpath('string(a)').strip()

                    if position == 1 and positional_roles:
                        role = 'Chair'
                    elif position == 2 and positional_roles:
                        role = 'Vice Chair'
                    else:
                        role = 'member'

                    # List items that yield no name still count towards the
                    # position but are not recorded as members.
                    if len(name) > 0:
                        comm.add_member(name, role = role)
                comm.add_source(leg_url)
                self.save_committee(comm)