def scrape_reps_comm(self, chamber, session):
    # Scrape Maine House committee rosters from the static hsecoms.htm page.
    # Committees and their member lists are matched positionally: the Nth
    # committee heading corresponds to the Nth <ul> on the page.
    url = 'http://www.maine.gov/legis/house/hsecoms.htm'
    with self.urlopen(url) as page:
        root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())
        count = 0
        # Committee headings sit in the odd-numbered <center> elements.
        for n in range(1, 12, 2):
            path = 'string(//body/center[%s]/h1/a)' % (n)
            comm_name = root.xpath(path)
            committee = Committee(chamber, comm_name)
            count = count + 1
            # Members of this committee are the links of the count-th <ul>.
            path2 = '/html/body/ul[%s]/li/a' % (count)
            for el in root.xpath(path2):
                rep = el.text
                if rep.find('(') != -1:
                    mark = rep.find('(')
                    # Slice off a fixed-width prefix and the trailing
                    # "(...)" annotation.
                    # NOTE(review): the 15-char prefix width is assumed from
                    # the page's formatting — confirm it still holds.
                    rep = rep[15: mark]
                committee.add_member(rep)
            committee.add_source(url)
            self.save_committee(committee)
def scrape_senate_committee(self, term, link):
    """Scrape one Minnesota Senate committee page and save its members.

    The committee name is recovered from the page <title>; member rows sit
    in the first table with id="bio". A row of the form "Position: Name"
    switches the role applied to that row and all subsequent plain rows.
    """
    with self.urlopen(link) as html:
        doc = lxml.html.fromstring(html)

        # strip first 30 and last 10
        # Minnesota Senate Committees - __________ Committee
        committee_name = doc.xpath('//title/text()')[0][30:-10]

        com = Committee('upper', committee_name)

        # Bug fix: previously `role` was unbound (NameError) if the first
        # row had no ':'; default to 'member' until a position row appears.
        role = 'member'

        # first id=bio table is members
        for row in doc.xpath('//table[@id="bio"]')[0].xpath('tr'):
            row = fix_whitespace(row.text_content())

            # switch role
            if ':' in row:
                position, name = row.split(': ')
                role = position.lower().strip()
            else:
                name = row

            # add the member
            com.add_member(name, role)

        com.add_source(link)
        self.save_committee(com)
def scrape(self, chamber, term):
    """Scrape Utah standing committees for one chamber of the given term."""
    # This page layout is only valid for the 2011-2012 term.
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    house_code = {'upper': 's', 'lower': 'h'}[chamber]
    url = "http://le.utah.gov/asp/interim/standing.asp?house=%s" % house_code

    with self.urlopen(url) as page:
        doc = lxml.html.fromstring(page)
        doc.make_links_absolute(url)

        for comm_link in doc.xpath("//a[contains(@href, 'Com=')]"):
            # Drop leading "House" or "Senate" from name
            comm_name = re.sub(r"^(House|Senate) ", "",
                               comm_link.text.strip())
            comm = Committee(chamber, comm_name)

            member_links = comm_link.xpath(
                "../../../font[2]/a[not(contains(@href, 'mailto'))]")
            for mbr_link in member_links:
                member_name = mbr_link.text.strip()
                # An <i> element immediately following the link names the
                # member's role (e.g. chair); otherwise plain membership.
                sibling = mbr_link.getnext()
                if sibling is not None and sibling.tag == 'i':
                    member_role = sibling.text.strip()
                else:
                    member_role = 'member'
                comm.add_member(member_name, member_role)

            self.save_committee(comm)
def scrape_reps_comm(self, chamber, term):
    """Scrape Ohio House committees by walking the site's committee ids."""
    original_chamber = chamber
    # id range for senate committees on their website
    for comm_id in range(87, 124):
        # Reset each pass: the previous iteration may have set "joint".
        chamber = original_chamber
        comm_url = (
            "http://www.house.state.oh.us/index.php?option="
            "com_displaycommittees&task=2&type=Regular&"
            "committeeId=%d" % comm_id
        )
        with self.urlopen(comm_url) as page:
            doc = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            comm_name = doc.xpath(
                'string(//table/tr[@class="committeeHeader"]/td)')
            comm_name = comm_name.replace("/", " ")

            # Ids below 92 belong to joint committees, not House ones.
            if comm_id < 92:
                chamber = "joint"

            committee = Committee(chamber, comm_name)
            committee.add_source(comm_url)

            for link in doc.xpath("//a[contains(@href, 'district')]"):
                member = link.text
                if member and member.strip():
                    committee.add_member(member.strip())

            self.save_committee(committee)
def scrape_house_committees(self, term):
    """Scrape Minnesota House committees and their memberships."""
    url = 'http://www.house.leg.state.mn.us/comm/commemlist.asp'
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        for heading in doc.xpath('//h2[@class="commhighlight"]'):
            members_url = heading.xpath(
                'following-sibling::p[1]/a[text()="Members"]/@href')[0]

            com = Committee('lower', heading.text)
            com.add_source(members_url)

            with self.urlopen(members_url) as member_html:
                mdoc = lxml.html.fromstring(member_html)

                # each legislator in their own table
                # first row, second column contains all the info
                for cell in mdoc.xpath('//table/tr[1]/td[2]/p/b[1]'):
                    name = cell.text_content()
                    # role is inside a nested b tag
                    role = cell.xpath('b/*/text()')
                    if role:
                        # if there was a role, remove it from name
                        role = role[0]
                        name = name.replace(role, '')
                    else:
                        role = 'member'
                    com.add_member(name, role)

            # save
            self.save_committee(com)
def scrape_reps_comm(self, chamber, year): save_chamber = chamber #id range for senate committees on their website for comm_id in range(87, 124): chamber = save_chamber comm_url = 'http://www.house.state.oh.us/index.php?option=com_displaycommittees&task=2&type=Regular&committeeId=' + str(comm_id) with self.urlopen(comm_url) as page: root = lxml.etree.fromstring(page, lxml.etree.HTMLParser()) comm_name = root.xpath('string(//table/tr[@class="committeeHeader"]/td)') comm_name = comm_name.replace("/", " ") #joint legislative committiees if comm_id < 92: chamber = "joint_legislation" committee = Committee(chamber, comm_name) path = '/html/body[@id="bd"]/div[@id="ja-wrapper"]/div[@id="ja-containerwrap-f"]/div[@id="ja-container"]/div[@id="ja-mainbody-f"]/div[@id="ja-contentwrap"]/div[@id="ja-content"]/table/tr[position() >=3]' for el in root.xpath(path): rep1 = el.xpath('string(td[1]/a)') rep2 = el.xpath('string(td[4]/a)') committee.add_member(rep1) committee.add_member(rep2) committee.add_source(comm_url) self.save_committee(committee)
def scrape_house(self):
    """Scrape Louisiana House members' committee assignments.

    Committees are cached by name so that assignments from many legislator
    rows accumulate on a single Committee object before saving.
    """
    url = "http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp"
    comm_cache = {}

    with self.urlopen(url) as text:
        page = lxml.html.fromstring(text)

        for row in page.xpath("//table[@bordercolorlight='#EAEAEA']/tr"):
            cells = row.xpath('td')
            name = cells[0].xpath('string()').strip()
            if name.startswith('Vacant'):
                continue

            # Committee names are the text and <br> tails of a <font> cell.
            font = cells[1].xpath('font')[0]
            committees = []
            if font.text:
                committees.append(font.text.strip())
            for br in font.xpath('br'):
                if br.text:
                    committees.append(br.text.strip())
                # Bug fix: tails were previously appended unstripped, which
                # left stray whitespace in names and broke the role-suffix
                # matching below; whitespace-only tails are now skipped.
                tail = br.tail.strip() if br.tail else ''
                if tail:
                    committees.append(tail)

            for comm_name in committees:
                mtype = 'member'
                # NOTE: "Co-Chairmain" deliberately matches the site's own
                # misspelling of the suffix.
                if comm_name.endswith(', Chairman'):
                    mtype = 'chairman'
                    comm_name = comm_name.replace(', Chairman', '')
                elif comm_name.endswith(', Co-Chairmain'):
                    mtype = 'co-chairmain'
                    comm_name = comm_name.replace(', Co-Chairmain', '')
                elif comm_name.endswith(', Vice Chair'):
                    mtype = 'vice chair'
                    comm_name = comm_name.replace(', Vice Chair', '')
                elif comm_name.endswith(', Ex Officio'):
                    mtype = 'ex officio'
                    comm_name = comm_name.replace(', Ex Officio', '')

                chamber = 'joint' if comm_name.startswith('Joint') else 'lower'

                try:
                    committee = comm_cache[comm_name]
                except KeyError:
                    committee = Committee(chamber, comm_name)
                    committee.add_source(url)
                    comm_cache[comm_name] = committee

                committee.add_member(name, mtype)

    for committee in comm_cache.values():
        self.save_committee(committee)
def scrape_committee(self, chamber, term, name, url):
    """Scrape a single committee page: member list, chair, and vice chair."""
    with self.urlopen(url) as page:
        doc = lxml.html.fromstring(page)

        committee = Committee(chamber, name)
        committee.add_source(url)

        # Members are a comma-separated text run after the "Members:" label.
        mlist = doc.xpath("//strong[contains(., 'Members:')]")[0].tail
        for member in re.sub(r'\s+', ' ', mlist).split(','):
            # Strip a trailing "R.M." / "R.M.M." honorific if present.
            member = re.sub(r'R\.M\.(M\.)?$', '', member.strip())
            committee.add_member(member.strip())

        chair_name = doc.xpath(
            "//strong[contains(., 'Chair:')]")[0].tail.strip()
        if chair_name:
            committee.add_member(chair_name, 'chair')

        vc_name = doc.xpath(
            "//strong[contains(., 'Vice Chair:')]")[0].tail.strip()
        if vc_name:
            committee.add_member(vc_name, 'vice chair')

        self.save_committee(committee)
def scrape(self, chamber, year):
    # Scrape Pennsylvania committee assignments from the per-chamber
    # member-information listing pages.
    if year != '2009':
        raise NoDataForPeriod(year)
    if chamber == 'upper':
        url = ('http://www.legis.state.pa.us/cfdocs/legis/'
               'home/member_information/senators_ca.cfm')
    else:
        url = ('http://www.legis.state.pa.us/cfdocs/legis/'
               'home/member_information/representatives_ca.cfm')
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        # Committees keyed by (chamber, name, subcommittee) so members from
        # many legislator entries accumulate before saving.
        committees = {}
        for li in page.xpath("//a[contains(@href, 'bio.cfm')]/../.."):
            name = li.xpath("string(b/a[contains(@href, 'bio.cfm')])")
            # Drop a trailing 4-character suffix from the legislator name.
            # NOTE(review): presumably a " (R)"-style party tag — confirm
            # against the live page.
            name = name[0:-4]
            for link in li.xpath("a"):
                if not link.tail:
                    continue
                committee_name = link.tail.strip()
                committee_name = re.sub(r"\s+", " ", committee_name)
                subcommittee_name = None
                role = 'member'
                # Italic text after the link holds subcommittee and/or
                # role info, e.g. ", Subcommittee on X - Chair".
                rest = link.xpath('string(../i)')
                if rest:
                    match = re.match(r',\s+(Subcommittee on .*)\s+-', rest)
                    if match:
                        subcommittee_name = match.group(1)
                        role = rest.split('-')[1].strip()
                    else:
                        role = rest.replace(', ', '').strip()
                try:
                    committee = committees[(chamber, committee_name,
                                            subcommittee_name)]
                except KeyError:
                    committee = Committee(chamber, committee_name)
                    if subcommittee_name:
                        committee['subcommittee'] = subcommittee_name
                    committees[(chamber, committee_name,
                                subcommittee_name)] = committee
                committee.add_member(name, role)
        for committee in committees.values():
            self.save_committee(committee)
def scrape_senate(self):
    """Scrape Senate Committees"""
    for raw_name, comm in nyss_openlegislation.models.committees.items():
        # Title-case the name but keep the conjunction lowercase.
        display_name = raw_name.title().replace('And', 'and')
        committee = Committee('upper', display_name)
        for member in comm.members:
            committee.add_member(member.fullname)
        self.save_committee(committee)
def scrape(self, chamber, year):
    # Scrape Maryland House committees (and inline subcommittees) from the
    # Maryland Manual pages.
    # TODO: scrape senate committees
    house_url = 'http://www.msa.md.gov/msa/mdmanual/06hse/html/hsecom.html'
    with self.urlopen(house_url) as html:
        doc = lxml.html.fromstring(html)
        # distinct URLs containing /com/
        committees = set([l.get('href') for l in doc.cssselect('li a')
                          if l.get('href', '').find('/com/') != -1])
        for com in committees:
            com_url = 'http://www.msa.md.gov'+com
            with self.urlopen(com_url) as chtml:
                cdoc = lxml.html.fromstring(chtml)
                # The first non-empty h2/h3 heading is the committee name.
                for h in cdoc.cssselect('h2, h3'):
                    if h.text:
                        committee_name = h.text
                        break
                cur_com = Committee('lower', committee_name)
                cur_com.add_source(com_url)
                # Links are members of the current committee until a
                # "... SUBCOMMITTEE" link starts a new subcommittee; the
                # committee built so far is saved at that point.
                for l in cdoc.cssselect('a[href]'):
                    if ' SUBCOMMITTEE' in (l.text or ''):
                        self.save_committee(cur_com)
                        cur_com = Committee('lower', l.text, committee_name)
                        cur_com.add_source(com_url)
                    elif 'html/msa' in l.get('href'):
                        cur_com.add_member(l.text)
                self.save_committee(cur_com)
def scrape_senate_committee(self, name, url):
    """Scrape a Senate committee's assignment page and save its members."""
    # The member roster lives on the Assignments page, not the default one.
    url = url.replace('Default.asp', 'Assignments.asp')

    committee = Committee('upper', name)
    with self.urlopen(url) as text:
        doc = lxml.html.fromstring(text)
        member_links = doc.xpath(
            '//table[@bordercolor="#EBEAEC"]/tr/td/font/a')
        for link in member_links:
            member = link.xpath('string()')
            committee.add_member(member.replace('Senator ', '').strip())

    self.save_committee(committee)
def scrape_index(self, chamber, session, session_id, committee_type):
    """Scrape the Arizona committee index XML for one chamber and type."""
    url = base_url + 'xml/committees.asp?session=%s&type=%s' % (
        session_id, committee_type)
    body_xpath = '//body[@Body="%s"]/committee' % {
        'upper': 'S', 'lower': 'H'}[chamber]

    with self.urlopen(url) as page:
        # recover=True: the feed is not always well-formed XML.
        root = etree.fromstring(page, etree.XMLParser(recover=True))
        # TODO need to and make sure to add sub committees
        for com in root.xpath(body_xpath):
            c_id, name, short_name, sub = com.values()
            c = Committee(chamber, name, short_name=short_name,
                          session=session, az_committee_id=c_id)
            c.add_source(url)
            self.scrape_com_info(session, session_id, c_id, c)
            self.save_committee(c)
def scrape(self, chamber, term):
    """Scrape NC standing and select committees for the given chamber."""
    base_url = ('http://www.ncga.state.nc.us/gascripts/Committees/'
                'Committees.asp?bPrintable=true'
                '&sAction=ViewCommitteeType&sActionDetails=')
    committee_types = {'upper': ['Senate%20Standing', 'Senate%20Select'],
                       'lower': ['House%20Standing', 'House%20Select']}

    for ctype in committee_types[chamber]:
        listing_url = base_url + ctype
        with self.urlopen(listing_url) as data:
            doc = lxml.html.fromstring(data)
            doc.make_links_absolute(listing_url)
            for link in doc.xpath('//ul/li/a'):
                url = link.get('href')
                committee = Committee(chamber, link.text)
                self.scrape_committee(committee, url)
                committee.add_source(url)
                self.save_committee(committee)
def scrape_senate(self):
    """Scrape Senate Committees"""
    senate_url = "http://www.nysenate.gov"
    senate_committees_url = senate_url + "/committees"
    with self.urlopen(senate_committees_url) as html:
        doc = lxml.html.fromstring(html)
        # Distinct committee page paths linked from the listing page.
        committee_paths = set([l.get("href") for l in doc.cssselect("li a")
                               if l.get("href", "").find("/committee/") != -1])
        for committee_path in committee_paths:
            committee_url = senate_url+committee_path
            with self.urlopen(committee_url) as chtml:
                cdoc = lxml.html.fromstring(chtml)
                # The first non-empty .committee_name element is the name.
                for h in cdoc.cssselect(".committee_name"):
                    if h.text:
                        committee_name = h.text
                        break
                committee = Committee("upper", committee_name)
                committee.add_source(committee_url)
                # Chair links read "Sen. First Last"; strip the title.
                for l in cdoc.cssselect(".committee-chair a[href]"):
                    if "/senator/" in l.get("href") and l.text and l.text.startswith("Sen."):
                        committee.add_member(l.text.split('Sen. ', 1)[1], "chair")
                for l in cdoc.cssselect(".committee-members a[href]"):
                    if "/senator/" in l.get("href"):
                        committee.add_member(l.text)
                self.save_committee(committee)
def scrape_assembly(self):
    """Scrape Assembly Committees"""
    assembly_committees_url = "http://assembly.state.ny.us/comm/"
    with self.urlopen(assembly_committees_url) as html:
        doc = lxml.html.fromstring(html)
        # #sitelinks holds four <ul> groups in a fixed order; only the
        # standing-committees group is scraped here.
        standing_committees, subcommittees, legislative_commissions, task_forces = doc.cssselect('#sitelinks ul')
        # Only committees with a member-list link ("?sec=mem...").
        committee_paths = set([l.get('href') for l in standing_committees.cssselect("li a[href]")
                               if l.get("href").startswith('?sec=mem')])
        for committee_path in committee_paths:
            committee_url = assembly_committees_url+committee_path
            with self.urlopen(committee_url) as chtml:
                cdoc = lxml.html.fromstring(chtml)
                # Page heading reads "<Name> Committee Members"; keep the name.
                for h in cdoc.cssselect("#content .pagehdg"):
                    if h.text:
                        committee_name = h.text.split('Committee Members')[0].strip()
                        break
                committee = Committee("lower", committee_name)
                committee.add_source(committee_url)
                members = cdoc.cssselect("#sitelinks")[0]
                # The first listed member is treated as the chair.
                first = 1
                for member in members.iter('span'):
                    member = member.xpath('li/a')[0].text
                    if first == 1:
                        committee.add_member(member, 'chair')
                        first = 0
                    else:
                        committee.add_member(member)
                self.save_committee(committee)
def get_committees(self, term, chamber, laws_year):
    # Build (but do not save) Committee objects for one chamber from the
    # legislature's committee-selection dropdown.
    committee_list = []
    committee_list_url = self.committee_list_url_template % laws_year
    list_page = ElementTree(lxml.html.fromstring(self.urlopen(committee_list_url)))
    com_select = list_page.find('//select[@name="P_COM_NM"]')
    for option in com_select.findall("option"):
        if option.text:
            committee_url = self.committee_url_template % (laws_year, urllib.quote(option.text.strip()))
            # Option text looks like "(H) Name ..."; the letter inside the
            # parentheses marks the chamber.
            c_chamber, name = option.text.split(" ", 1)
            c_chamber = c_chamber[1]
            if (('H' == c_chamber and 'lower' == chamber) or
                ('S' == c_chamber and 'upper' == chamber)):
                # committee = Committee(term['name'], chamber, name)
                committee = Committee(chamber, name)
                committee.add_source(committee_url)
                committee_list.append(committee)
    return committee_list
def scrape_committees(self, year_abr, session):
    """Scrape New Jersey committees and memberships from the DBF exports.

    COMMITT.DBF describes each committee; COMEMB.DBF lists members, joined
    to committees via the shared "code" field.
    """
    members_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/COMEMB.DBF' % (year_abr)
    comm_info_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/COMMITT.DBF' % (year_abr)
    COMEMB_dbf, resp = self.urlretrieve(members_url)
    COMMIT_dbf, resp2 = self.urlretrieve(comm_info_url)
    members_db = dbf.Dbf(COMEMB_dbf)
    info_db = dbf.Dbf(COMMIT_dbf)

    comm_dictionary = {}

    # Committee Info Database
    for name_rec in info_db:
        abrv = name_rec["code"]
        comm_name = name_rec["descriptio"]
        comm_type = name_rec["type"]
        aide = name_rec["aide"]
        contact_info = name_rec["phone"]
        # Bug fix: the chamber mapping was inverted. Codes starting with
        # "A" are Assembly (lower chamber) committees and "S" are Senate
        # (upper chamber) committees.
        if abrv[0] == "A":
            chamber = "lower"
        elif abrv[0] == "S":
            chamber = "upper"
        comm = Committee(chamber, comm_name, comm_type=comm_type,
                         aide=aide, contact_info=contact_info)
        comm.add_source(members_url)
        comm.add_source(comm_info_url)
        comm_dictionary[abrv] = comm

    # Committee Member Database
    for member_rec in members_db:
        abr = member_rec["code"]
        committee = comm_dictionary[abr]
        committee.add_member(member_rec["member"])
        self.save_committee(committee)
def scrape_senate_comm(self, chamber, term):
    """Scrape Ohio Senate standing committees from their detail pages."""
    committee_slugs = [
        "agriculture",
        "education",
        "energy-and-public-utilities",
        "environment-and-natural-resources",
        "finance-and-financial-institutions",
        "government-oversight",
        "health-human-services-and-aging",
        "highways-and-transportation",
        "insurance-commerce-and-labor",
        "judiciary-civil-justice",
        "judiciary-criminal-justice",
        "reference",
        "rules",
        "state-and-local-government-and-veterans-affairs",
        "ways-and-means-and-economic-development",
    ]

    for slug in committee_slugs:
        comm_url = ("http://www.ohiosenate.gov/committees/standing/detail/"
                    "%s.html" % slug)
        with self.urlopen(comm_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            # Derive a display name from the slug,
            # e.g. "ways-and-means-..." -> "Ways And Means ...".
            committee = Committee(chamber, slug.replace("-", " ").title())
            committee.add_source(comm_url)

            for cell in root.xpath("//table/tr/td"):
                sen_name = cell.xpath('string(a[@class="senatorLN"])')
                # Keep everything before the "(District)" annotation.
                full_name = sen_name[:sen_name.find("(")].strip()
                if full_name:
                    committee.add_member(full_name)

            self.save_committee(committee)
def scrape(self, chamber, term):
    # Scrape Arizona committees for the session that matches this term.
    self.validate_term(term)
    session = self.get_session_for_term(term)
    try:
        session_id = self.get_session_id(session)
    except KeyError:
        raise NoDataForPeriod
    # not getting the floor committees maybe try it during the new session
    # for committee_type in ('S', 'F'):
    #     self.scrape_index(chamber, session, session_id, committee_type)
    url = base_url + 'xml/committees.asp?session=%s' % session_id
    with self.urlopen(url) as page:
        # recover=True because the feed is not always well-formed XML.
        root = etree.fromstring(page, etree.XMLParser(recover=True))
        body = '//body[@Body="%s"]/committee' % {'upper': 'S', 'lower': 'H'}[chamber]
        for com in root.xpath(body):
            c_id, name, short_name, sub = com.values()
            # sub == '1' marks a subcommittee: record the parent's name and
            # keep only the "Subcommittee ..." part as this committee's name.
            if sub == '1':
                parent = name.split('Subcommittee')[0].strip()
                name = name[name.index('Subcommittee'):]
                c = Committee(chamber, parent, short_name=short_name,
                              subcommittee=name, session=session,
                              az_committee_id=c_id)
            else:
                c = Committee(chamber, name, short_name=short_name,
                              session=session, az_committee_id=c_id)
            c.add_source(url)
            #for some reason they don't always have any info on the committees'
            try:
                self.scrape_com_info(session, session_id, c_id, c)
            except HTTPError:
                pass
            self.save_committee(c)
def scrape_joint_comm(self, chamber, session):
    # Scrape Maine joint committees from the downloadable Excel roster.
    fileurl = 'http://www.maine.gov/legis/house/commlist.xls'
    joint = urllib.urlopen(fileurl).read()
    f = open('me_joint.xls', 'w')
    f.write(joint)
    f.close()
    wb = xlrd.open_workbook('me_joint.xls')
    sh = wb.sheet_by_index(0)
    cur_comm_name = ''
    chamber = 'joint'
    # Rows are grouped by committee; a change in the committee-name column
    # starts a new committee.
    for rownum in range(1, sh.nrows):
        comm_name = sh.cell(rownum, 0).value
        first_name = sh.cell(rownum, 3).value
        middle_name = sh.cell(rownum, 4).value
        last_name = sh.cell(rownum, 5).value
        jrsr = sh.cell(rownum, 6).value
        full_name = first_name + " " + middle_name + " " + last_name + " " + jrsr
        party = sh.cell(rownum, 7).value
        legalres = sh.cell(rownum, 8).value
        address1 = sh.cell(rownum, 9).value
        address2 = sh.cell(rownum, 10).value
        town = sh.cell(rownum, 11).value
        state = sh.cell(rownum, 12).value
        zipcode = int(sh.cell(rownum, 13).value)
        phone = str(sh.cell(rownum, 14).value)
        home_email = sh.cell(rownum, 15).value
        leg_email = sh.cell(rownum, 16).value
        leg_chamber = sh.cell(rownum, 2).value
        chair = sh.cell(rownum, 1).value
        role = "member"
        # A 1 in the chair column marks this member as that chamber's chair.
        if chair == 1:
            role = leg_chamber + " " + "Chair"
        if comm_name != cur_comm_name:
            cur_comm_name = comm_name
            committee = Committee(chamber, comm_name)
            committee.add_member(full_name, role = role, party = party,
                                 legalres= legalres, address1 = address1,
                                 address2 = address2, town = town,
                                 state = state, zipcode = zipcode,
                                 phone = phone, home_email = home_email,
                                 leg_email = leg_email)
            committee.add_source(fileurl)
        else:
            committee.add_member(full_name, role = role, party = party,
                                 legalres = legalres, address1 = address1,
                                 address2 = address2, town = town,
                                 state = state, zipcode = zipcode,
                                 phone = phone, home_email = home_email,
                                 leg_email = leg_email)
        # NOTE(review): saving inside the row loop persists each committee
        # once per member row; confirm whether a single save per committee
        # was intended (the original's indentation is ambiguous).
        self.save_committee(committee)
def scrape(self, chamber, term): com_url = {'lower': 'http://www.msa.md.gov/msa/mdmanual/06hse/html/hsecom.html', 'upper': 'http://www.msa.md.gov/msa/mdmanual/05sen/html/sencom.html'} # joint: http://www.msa.md.gov/msa/mdmanual/07leg/html/ga.html with self.urlopen(com_url[chamber]) as html: doc = lxml.html.fromstring(html) # distinct URLs containing /com/ committees = set([l.get('href') for l in doc.cssselect('li a') if l.get('href', '').find('/com/') != -1]) for com in committees: com_url = 'http://www.msa.md.gov'+com with self.urlopen(com_url) as chtml: cdoc = lxml.html.fromstring(chtml) for h in cdoc.cssselect('h2, h3'): if h.text: committee_name = h.text break cur_com = Committee(chamber, committee_name) cur_com.add_source(com_url) for l in cdoc.cssselect('a[href]'): if ' SUBCOMMITTEE' in (l.text or ''): self.save_committee(cur_com) cur_com = Committee(chamber, committee_name, l.text) cur_com.add_source(com_url) elif 'html/msa' in l.get('href'): prev = l.getprevious() name = l.text if name.endswith(','): name = name[:-1] if prev is not None and prev.tag == 'i': cur_com.add_member(name, 'ex-officio') else: cur_com.add_member(name) self.save_committee(cur_com)
def scrape_comm(self, chamber, term_name):
    """Scrape Mississippi committee membership from the chamber's XML feed.

    The `chamber` argument arrives as the site's code ("h"/other) and is
    translated to "lower"/"upper" for the Committee objects.
    """
    url = "http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml" % chamber
    with self.urlopen(url) as comm_page:
        root = lxml.etree.fromstring(comm_page, lxml.etree.HTMLParser())

        if chamber == "h":
            chamber = "lower"
        else:
            chamber = "upper"

        for mr in root.xpath("//committee"):
            name = mr.xpath("string(name)")
            comm = Committee(chamber, name)

            chair = mr.xpath("string(chair)").replace(", Chairman", "")
            if len(chair) > 0:
                comm.add_member(chair, role="Chairman")

            vice_chair = mr.xpath(
                "string(vice_chair)").replace(", Vice-Chairman", "")
            if len(vice_chair) > 0:
                comm.add_member(vice_chair, role="Vice-Chairman")

            # Bug fix: guard against empty entries (e.g. from a trailing
            # ";") which previously raised IndexError on leg[0]; leading
            # spaces are stripped rather than dropping exactly one char.
            for leg in mr.xpath("string(members)").split(";"):
                leg = leg.lstrip(" ")
                if leg:
                    comm.add_member(leg)

            comm.add_source(url)
            self.save_committee(comm)
def scrape(self, chamber, year):
    """Example scraper showing how to build committees and subcommittees."""
    com = Committee('lower', 'Committee on Finance')
    com.add_source('http://example.com')
    # can optionally specify role
    com.add_member('Lou Adams', 'chairman')
    com.add_member('Bill Smith')
    self.save_committee(com)

    # can also specify subcommittees
    subcom = Committee('lower', 'Finance Subcommittee on Banking',
                       'Committee on Finance')
    # Bug fix: the source and member were previously added to the parent
    # committee (com) instead of the subcommittee, and neither committee
    # was ever saved.
    subcom.add_source('http://example.com')
    subcom.add_member('Bill Smith')
    self.save_committee(subcom)
def scrape_senate_comm(self, chamber, insert, session):
    """Scrape Nevada Senate committees; page layout varies by session."""
    committees = self.scrape_comm(chamber, insert, session)
    for committee in committees:
        leg_url = 'http://www.leg.state.nv.us/Session/' + insert + '/Committees/S_Committees/' + committee
        with self.urlopen(leg_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())
            comm_name = root.xpath('string(/html/body/div[@id="content"]/center/h2)')
            #special cases for each session to grab the name
            if session == 73:
                comm_name = root.xpath('string(/html/body/div[@id="content"]/h2[1])')
            elif session == 72:
                comm_name = root.xpath('string(/html/body/h2[1]/font)')
            elif session == 71:
                comm_name = root.xpath('string(/html/body/h2)')
            elif committee == 'NR.cfm' and session != 72 and session != 71:
                comm_name = root.xpath('string(/html/body/div[@id="content"]/h2)')
            #Marking for grabbing only the name of the committee
            startmark = comm_name.find("Senate")
            if startmark == -1:
                startmark = 0
            else:
                startmark = 7
            endmark = comm_name.find(str(session))
            if session <= 73:
                comm_name = comm_name[startmark: len(comm_name)]
            else:
                comm_name = comm_name[startmark: endmark - 3]
            comm = Committee(chamber, comm_name)
            count = 0
            if session == 73 or session == 71:
                path = '//li'
            elif session == 72:
                path = '/html/body/ul/li'
            else:
                path = '/html/body/div[@id="content"]/ul/li'
            for mr in root.xpath(path):
                name = mr.xpath('string(a)')
                name = name.replace(' \r\n ', '')
                if session == 72:
                    name = mr.xpath('string()')
                    name = name.replace('\r\n', '')
                    name = name.replace(' -Vice Chair', '')
                    name = name.replace(' -Chair', '')
                count = count + 1
                # First two members listed are chair and vice chair, except
                # on the EPE committee.
                # Bug fix: the comparison was `committee[0:3] != 'EPE.cfm'`,
                # which is always true (a 3-char slice can never equal a
                # 7-char string), so EPE members were wrongly given chair
                # roles; compare against 'EPE' as the Assembly scraper does.
                if count == 1 and committee[0:3] != 'EPE':
                    role = 'Chair'
                elif count == 2 and committee[0:3] != 'EPE':
                    role = 'Vice Chair'
                else:
                    role = 'member'
                comm.add_member(name, role)
            comm.add_source(leg_url)
            self.save_committee(comm)
def scrape_assem_comm(self, chamber, insert, year, session):
    # Scrape Nevada Assembly committees; page layout varies by session,
    # with several special cases for the EPE committee in specific years.
    committees = self.scrape_comm(chamber, insert, session)
    for committee in committees:
        leg_url = 'http://www.leg.state.nv.us/Session/' + insert + '/Committees/A_Committees/' + committee
        with self.urlopen(leg_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())
            comm_name = root.xpath('string(/html/body/div[@id="content"]/h1)')
            #special cases for each session to grab the name
            if session == 73:
                comm_name = root.xpath('string(/html/body/div[@id="content"]/h1)')
            elif session == 72:
                comm_name = root.xpath('string(/html/body/h2[1]/font)')
            elif session == 71:
                comm_name = root.xpath('string(/html/body/h2)')
            elif committee == 'NR.cfm' and session != 72 and session != 71:
                comm_name = root.xpath('string(/html/body/div[@id="content"]/h2)')
            #Marking for grabbing only the name of the committee
            startmark = comm_name.find("Assembly")
            if startmark == -1:
                startmark = 0
            else:
                startmark = 9
            endmark = comm_name.find(str(session))
            if session <= 73:
                comm_name = comm_name[startmark: len(comm_name)]
            else:
                comm_name = comm_name[startmark: endmark - 3]
            comm_name = comm_name.replace(' \r\n ', '')
            # EPE in 2005/2007 carries two note bullets stored on the
            # committee itself.
            if committee == 'EPE.cfm' and (year == '2005' or year == '2007'):
                note1 = root.xpath('string(/html/body/div[@id="content"]/ul[1]/li[1])')
                note2 = root.xpath('string(/html/body/div[@id="content"]/ul[1]/li[2])')
                comm = Committee(chamber, comm_name, note1 = note1, note2 = note2)
            else:
                comm = Committee(chamber, comm_name)
            count = 0
            #special case
            # EPE in 2009 lists two co-chairs with dual roles in a <p>
            # block instead of the normal member list.
            if committee == 'EPE.cfm' and year == '2009':
                special_name1 = root.xpath('string(/html/body/div[@id="content"]/p/a[1])')
                special_name1 = special_name1.split()[0] + " " + special_name1.split()[1]
                name1_2ndrole = "Constitutional Amendments Vice Chair"
                special_name2 = root.xpath('string(/html/body/div[@id="content"]/p/a[2])')
                special_name2 = special_name2.split()[0] + " " + special_name2.split()[1]
                name2_2ndrole = "Elections Procedures and Ethics Vice Chair"
                comm.add_member(special_name1,
                                role="Elections Procedures and Ethics Chair",
                                name1_2ndrole = name1_2ndrole)
                comm.add_member(special_name2,
                                role="Constitutional Admendments Chair",
                                name2_2ndrole = name2_2ndrole)
            #paths for grabbing names
            if session == 73 or session == 71:
                path = '//li'
            elif session == 72:
                path = '/html/body/ul/li'
            else:
                path = '/html/body/div[@id="content"]/ul/li'
            #grabbing names
            for mr in root.xpath(path):
                name = mr.xpath('string(a)')
                name = name.strip()
                # Older sessions embed role text in the list item itself;
                # strip it and normalize whitespace artifacts.
                # NOTE(review): the '\u' / '\u00a0' replaces only behave as
                # literal text under Python 2 byte strings — confirm before
                # any Python 3 migration.
                if session == 72 or session == 71:
                    name = mr.xpath('string()')
                    name = name.replace('\r\n', '')
                    name = name.replace(' -Vice Chair', '')
                    name = name.replace(' -Chair', '')
                    name = name.replace('-Chair', '')
                    name = name.replace('\u', '')
                    name = name.replace('\u00a0', '')
                    name = name.replace(' ', ' ')
                count = count + 1
                # First two members listed are chair and vice chair, except
                # for EPE and session 72 pages.
                if count == 1 and committee[0:3] != 'EPE' and session != 72:
                    role = 'Chair'
                elif count == 2 and committee[0:3] != 'EPE' and session != 72:
                    role = 'Vice Chair'
                else:
                    role = 'member'
                if len(name) > 0:
                    comm.add_member(name, role = role)
            comm.add_source(leg_url)
            self.save_committee(comm)