def parse_cosponsors_from_bill(self, bill, url):
    """Register the cosponsors listed on a bill's page with ``bill``.

    Fetches ``url``, parses it with lxml and looks up the ``//p/span``
    nodes matching ``INTRODUCED`` via ``find_nodes_with_matching_text``.
    The text of the first such node is split on commas; the first two
    entries are skipped and every remaining name is passed to
    ``bill.add_sponsor('cosponsor', name)``.

    :param bill: object exposing ``add_sponsor(type, name)``.
    :param url: address of the bill page to fetch via ``self.urlopen``.
    :returns: ``None``; the method only mutates ``bill``.
    """
    # The inline flag has to lead the pattern: a ``(?i)`` placed mid-pattern
    # is deprecated since Python 3.6 and raises ``re.error`` from 3.11 on.
    # Older interpreters applied it to the whole pattern anyway, so this
    # matches exactly what ``' (?i)and '`` used to match.
    and_separator = re.compile(r'(?i) and ')

    with self.urlopen(url) as bill_page:
        document = lxml.html.fromstring(bill_page)
        intro_nodes = find_nodes_with_matching_text(
            document, '//p/span', r'\s*INTRODUCED.*')
        if not intro_nodes:
            # No "INTRODUCED" line on the page: the bill was probably
            # withdrawn, so there is nothing to record.
            return

        intro_text = intro_nodes[0].text_content()
        entries = clean_text(intro_text).split(',')
        if len(entries) <= 1:
            # Only one comma-separated entry: no cosponsor list to walk.
            return

        # The sponsor and the first cosponsor were already collected from
        # the previous page, so the first two entries are skipped.
        for entry in entries[2:]:
            for fragment in and_separator.split(entry):
                cosponsor_name = clean_text(fragment)
                if cosponsor_name == "":
                    continue
                # Non-breaking spaces are normalised to plain spaces before
                # splitting on the upper-case "AND" separator.
                cosponsor_name = cosponsor_name.replace(u'\u00a0', " ")
                for name in re.split(r'\s+AND\s+', cosponsor_name):
                    name = name.strip()
                    if name:
                        bill.add_sponsor('cosponsor', name)
def parse_cosponsors_from_bill(self, bill, url):
    """Add the cosponsors found on the bill page at ``url`` to ``bill``.

    The page is fetched with ``self.urlopen`` and parsed with lxml. The
    text of the first ``//p/span`` node returned by
    ``find_nodes_with_matching_text`` for the ``INTRODUCED`` pattern is
    split on commas. When there is more than one entry, the first two are
    dropped and each remaining one is broken on "and"/"AND" into
    individual names, which are handed to
    ``bill.add_sponsor('cosponsor', name)``.

    :param bill: object exposing ``add_sponsor(type, name)``.
    :param url: address of the bill page.
    :returns: ``None``; ``bill`` is updated in place.
    """
    page = lxml.html.fromstring(self.urlopen(url))
    matches = find_nodes_with_matching_text(
        page, '//p/span', r'\s*INTRODUCED.*')
    if not matches:
        # Nothing matched "INTRODUCED": probably a withdrawn bill.
        return

    listed = clean_text(matches[0].text_content()).split(',')
    if len(listed) <= 1:
        # A single entry means there is no comma-separated list to process.
        return

    # Global inline flags must open the pattern; the former ' (?i)and '
    # spelling is rejected with ``re.error`` on Python 3.11+ (and only
    # warned before that, while still applying to the whole pattern).
    split_on_and = re.compile(r'(?i) and ').split

    # The sponsor and the cosponsor were already taken from the previous
    # page, so the first two entries are ignored.
    for chunk in listed[2:]:
        for candidate in split_on_and(chunk):
            cleaned = clean_text(candidate)
            if cleaned == "":
                continue
            # Swap non-breaking spaces for regular ones before the
            # whitespace-delimited "AND" split below.
            cleaned = cleaned.replace(u'\u00a0', " ")
            for name in re.split(r'\s+AND\s+', cleaned):
                name = name.strip()
                if name:
                    bill.add_sponsor('cosponsor', name)
def parse_cosponsors_from_bill(self, bill, url):
    """Record the cosponsors named on the bill page at ``url``.

    Opens ``url`` through ``self.urlopen``, parses the page with lxml and
    takes the text of the first ``//p/span`` node that
    ``find_nodes_with_matching_text`` returns for the ``INTRODUCED``
    pattern. That text is split on commas; if there are several entries,
    the first two are skipped and the rest are split on " and "
    (case-insensitively). Each non-empty cleaned name is added through
    ``bill.add_sponsor('cosponsor', name)``.

    :param bill: object exposing ``add_sponsor(type, name)``.
    :param url: address of the bill page.
    :returns: ``None``; ``bill`` is updated in place.
    """
    with self.urlopen(url) as bill_page:
        tree = lxml.html.fromstring(bill_page)
        intro_spans = find_nodes_with_matching_text(
            tree, '//p/span', r'\s*INTRODUCED.*')
        if not intro_spans:
            # No "INTRODUCED" text at all: the bill was probably withdrawn.
            return

        pieces = clean_text(intro_spans[0].text_content()).split(',')
        if len(pieces) <= 1:
            # No comma-separated list of names to go through.
            return

        # The sponsor and the cosponsor were already obtained from the
        # previous page, so the first two entries are ignored.
        for piece in pieces[2:]:
            # ``(?i)`` has to sit at the very start of the pattern: placed
            # mid-pattern it is an error on Python 3.11+ (deprecated since
            # 3.6), while older versions treated it as global regardless.
            for raw_name in re.split(r'(?i) and ', piece):
                name = clean_text(raw_name)
                # Skip fragments that clean down to nothing (e.g. after a
                # trailing comma) instead of registering a blank cosponsor.
                if name:
                    bill.add_sponsor('cosponsor', name)