def scrape_vote(self, url, date, chamber, passed, motion,
                re_digit=re.compile(r'\d{1,3}'),
                re_totals=re.compile(
                    r'(?:Yes|No|Not Voting|Absent):\s{,3}(\d{,3})', re.I)):
    """Scrape a single roll-call page and build a ``Vote`` from it.

    :param url: address of the vote page; also recorded as the vote source.
    :param date: passed straight through to ``Vote``.
    :param chamber: passed straight through to ``Vote``.
    :param passed: passed straight through to ``Vote``.
    :param motion: passed straight through to ``Vote``.
    :param re_digit: compiled pattern used to pull the four tallies out of
        the tally row's text.
    :param re_totals: compiled pattern used to pull the tallies out of the
        whole page text when the tally row cannot be located.
    :returns: the populated ``Vote``, or ``None`` when ``url`` is one of the
        known links that answer with a bogus 404.
    :raises scrapelib.HTTPError: for any other HTTP failure.
    :raises IndexError: if the page has no "Vote Type:" label.
    """
    namespaces = {"re": "http://exslt.org/regular-expressions"}

    try:
        doc = lxml.html.fromstring(self.urlopen(url))
    except scrapelib.HTTPError as e:
        known_fail_links = [
            "http://legis.delaware.gov/LIS/lis146.nsf/7712cf7cc0e9227a852568470077336f/cdfd8149e79c2bb385257a24006e9f7a?OpenDocument"
        ]
        # XXX: there was no quick way to read the actual status off the
        # response object, hence the string match on "404".  -PRT
        if "404" in str(e.response) and url in known_fail_links:
            return None
        raise

    xpath = ("//font[re:match(., '^(Yes|No|Not Voting|Absent):', 'i')]"
             "/ancestor::tr[1]")

    # Get the vote tallies.
    try:
        totals = doc.xpath(xpath, namespaces=namespaces)
        totals = totals[0].text_content()
    except IndexError:
        # The page does not have the typical format; maybe it was edited
        # by hand.  Log it and try to parse the tallies from plain text.
        self.log('Found an unusual votes page at url: "%s"' % url)
        totals = re_totals.findall(doc.text_content())
        if len(totals) == 4:
            self.log('...was able to parse vote tallies from "%s"' % url)
    else:
        totals = re_digit.findall(totals)

    try:
        yes_count, no_count, abstentions, absent = [int(t) for t in totals]
    except ValueError:
        # Either a tally was not numeric or there were not exactly four of
        # them.  This is probably a "voice vote" lacking actual tallies.
        yes_count, no_count, other_count = 0, 0, 0
    else:
        other_count = abstentions + absent

    # Create the vote object and record where it came from.
    vote = Vote(chamber, date, motion, passed,
                yes_count, no_count, other_count)
    vote.add_source(url)

    # Get the "vote type"; the value is either a sibling of the label or a
    # sibling of the label's parent.
    el = doc.xpath('//font[contains(., "Vote Type:")]')[0]
    try:
        vote_type = el.xpath('following-sibling::font[1]/text()')[0]
    except IndexError:
        vote_type = el.xpath('../following-sibling::font[1]/text()')[0]
    vote['vote_type'] = vote_type

    # Get a flat sequence like: name1, vote1, name2, vote2, ...
    xpath = ("//font[re:match(., '^[A-Z]$')]"
             "/../../../descendant::td/font/text()")
    data = [s for s in doc.xpath(xpath, namespaces=namespaces) if s.strip()]

    # Handle the rare case where not all names have corresponding text
    # indicating the vote value (see e.g. session 146 HB10): re-read every
    # cell of the last matching table so the blanks keep their position.
    data_len = len(data) // 2
    tally = sum(v for (k, v) in vote.items() if '_count' in k)
    if 0 < data_len and data_len != tally:
        xpath = "//font[re:match(., '^[A-Z]$')]/ancestor::table"
        table = doc.xpath(xpath, namespaces=namespaces)[-1]
        data = [td.text_content().strip()
                for td in table.xpath('descendant::td')]

    # Add names and vote values.  Zipping one iterator with itself yields
    # consecutive (name, value) pairs and drops a trailing unpaired item.
    vote_map = {
        'Y': 'yes',
        'N': 'no',
    }
    cells = iter(data)
    for name, value in zip(cells, cells):
        # Evidently the motion can be rescinded before the vote is cast,
        # perhaps due to a quorum failure.  (See the Senate vote of
        # 1/26/2011 for HB 10 w/HA 1.)  In that rare case the values in
        # the vote column are whitespace; skip them.
        if not value.strip():
            continue
        getattr(vote, vote_map.get(value, 'other'))(name)

    return vote
def scrape_vote(self, url,
                re_digit=re.compile(r'\d{1,3}'),
                re_totals=re.compile(
                    r'(?:Yes|No|Not Voting|Absent):\s{,3}(\d{,3})', re.I)):
    """Scrape a single roll-call page and build a ``Vote`` from it.

    Date, outcome, tallies, chamber and motion are all read from the page.

    :param url: address of the vote page; also recorded as the vote source.
    :param re_digit: no longer read; kept so existing callers that pass it
        keep working.
    :param re_totals: compiled pattern used only to report whether tallies
        can be recovered from plain text on an unusually formatted page.
    :returns: the populated ``Vote``, or ``None`` when the page is skipped
        (known bogus 404 link, committee report, or unrecognised chamber).
    :raises scrapelib.HTTPError: for any other HTTP failure.
    :raises ValueError: if the page has no "Date:" label or the date text
        cannot be parsed.
    :raises IndexError: if the page has no "Vote Type:" label.
    """
    namespaces = {"re": "http://exslt.org/regular-expressions"}

    try:
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
    except scrapelib.HTTPError as e:
        known_fail_links = [
            "http://legis.delaware.gov/LIS/lis146.nsf/7712cf7cc0e9227a852568470077336f/cdfd8149e79c2bb385257a24006e9f7a?OpenDocument"
        ]
        # XXX: there was no quick way to read the actual status off the
        # response object, hence the string match on "404".  -PRT
        if "404" in str(e.response) and url in known_fail_links:
            msg = 'Recieved a bogus 22/404 return code. Skipping vote.'
            self.warning(msg)
            return None
        raise

    if 'Committee Report' in lxml.html.tostring(doc):
        # This was a committee vote with weird formatting.
        self.info('Skipping committee report.')
        return None

    # Diagnostics only: flag pages whose tally row is not where it usually
    # is, and say whether the tallies could still be found in plain text.
    # The counts actually stored on the vote are read further down.
    xpath = ("//font[re:match(., '^(Yes|No|Not Voting|Absent):', 'i')]"
             "/ancestor::tr[1]")
    if not doc.xpath(xpath, namespaces=namespaces):
        self.warning('Found an unusual votes page at url: "%s"' % url)
        if len(re_totals.findall(doc.text_content())) == 4:
            self.warning('...was able to parse vote tallies from "%s"' % url)

    # Values are located by their offset from a label in the flat list of
    # <font> text nodes (presumably the skipped node is layout filler --
    # TODO confirm against a live page).
    font_text = [s.strip() for s in doc.xpath('//font/text()')]
    date_index = font_text.index('Date:')
    date_string = font_text[date_index + 2]
    try:
        # %p only affects the parsed hour when the hour is read with %I;
        # with %H every "PM" time would be stored as a morning time.
        date = datetime.strptime(date_string, '%m/%d/%Y %I:%M %p')
    except ValueError:
        # Hour outside 1-12: fall back to the format used previously.
        date = datetime.strptime(date_string, '%m/%d/%Y %H:%M %p')

    # NOTE(review): any non-empty text here counts as "passed"; confirm
    # that the page really leaves this cell blank for a failed vote.
    passed = bool(font_text[date_index + 4])

    counts = defaultdict(int)
    for key, label in [
            ('yes_count', 'Yes:'),
            ('no_count', 'No:'),
            ('absent_count', 'Absent:'),
            ('not_voting', 'Not Voting:')]:
        try:
            counts[key] = int(font_text[font_text.index(label) + 2])
        except (ValueError, IndexError):
            # Label missing, value not numeric, or label at the very end
            # of the page: leave the count at its default of 0.
            continue
    counts['other_count'] = counts['absent_count'] + counts['not_voting']

    chamber_string = doc.xpath('string(//b/u/font/text())').lower()
    if 'senate' in chamber_string:
        chamber = 'upper'
    elif 'house' in chamber_string:
        chamber = 'lower'
    else:
        # Without this branch ``chamber`` would be unbound below.
        self.warning('Could not determine the chamber for the vote at '
                     '"%s". Skipping vote.' % url)
        return None

    # Take the first location that yields any motion text.  An empty motion
    # is left for schema validation to reject.
    for motion_xpath in (
            'string(//td/b/text())',
            'string(//td/b/font/text())',
            'string(//form/b/font/text())'):
        motion = doc.xpath(motion_xpath)
        if motion:
            break

    # Create the vote object and record where it came from.
    vote = Vote(chamber, date, motion, passed,
                counts['yes_count'], counts['no_count'],
                counts['other_count'])
    vote.add_source(url)

    # Get the "vote type"; the value is either a sibling of the label or a
    # sibling of the label's parent.
    el = doc.xpath('//font[contains(., "Vote Type:")]')[0]
    try:
        vote_type = el.xpath('following-sibling::font[1]/text()')[0]
    except IndexError:
        vote_type = el.xpath('../following-sibling::font[1]/text()')[0]
    vote['vote_type'] = vote_type

    # Get a flat sequence like: name1, vote1, name2, vote2, ...
    xpath = ("//font[re:match(., '^[A-Z]$')]"
             "/../../../descendant::td/font/text()")
    data = [s for s in doc.xpath(xpath, namespaces=namespaces) if s.strip()]

    # Handle the rare case where not all names have corresponding text
    # indicating the vote value (see e.g. session 146 HB10): re-read every
    # cell of the last matching table so the blanks keep their position.
    data_len = len(data) // 2
    tally = sum(v for (k, v) in vote.items() if '_count' in k)
    if 0 < data_len and data_len != tally:
        xpath = "//font[re:match(., '^[A-Z]$')]/ancestor::table"
        table = doc.xpath(xpath, namespaces=namespaces)[-1]
        data = [td.text_content().strip()
                for td in table.xpath('descendant::td')]

    # Add names and vote values.  Zipping one iterator with itself yields
    # consecutive (name, value) pairs and drops a trailing unpaired item.
    vote_map = {
        'Y': 'yes',
        'N': 'no',
    }
    cells = iter(data)
    for name, value in zip(cells, cells):
        # Evidently the motion can be rescinded before the vote is cast,
        # perhaps due to a quorum failure.  (See the Senate vote of
        # 1/26/2011 for HB 10 w/HA 1.)  In that rare case the values in
        # the vote column are whitespace; skip them.
        if not value.strip():
            continue
        getattr(vote, vote_map.get(value, 'other'))(name)

    return vote
def scrape_vote(self, url, date, chamber, passed, motion,
                re_digit=re.compile(r'\d{1,3}'),
                re_totals=re.compile(
                    r'(?:Yes|No|Not Voting|Absent):\s{,3}(\d{,3})', re.I)):
    """Scrape a single roll-call page and build a ``Vote`` from it.

    :param url: address of the vote page; also recorded as the vote source.
    :param date: passed straight through to ``Vote``.
    :param chamber: passed straight through to ``Vote``.
    :param passed: passed straight through to ``Vote``.
    :param motion: passed straight through to ``Vote``.
    :param re_digit: compiled pattern used to pull the four tallies out of
        the tally row's text.
    :param re_totals: compiled pattern used to pull the tallies out of the
        whole page text when the tally row cannot be located.
    :returns: the populated ``Vote``, or ``None`` when ``url`` is one of the
        known links that answer with a bogus 404.
    :raises scrapelib.HTTPError: for any other HTTP failure.
    :raises IndexError: if the page has no "Vote Type:" label.
    """
    namespaces = {"re": "http://exslt.org/regular-expressions"}

    try:
        doc = lxml.html.fromstring(self.urlopen(url))
    except scrapelib.HTTPError as e:
        known_fail_links = [
            "http://legis.delaware.gov/LIS/lis146.nsf/7712cf7cc0e9227a852568470077336f/cdfd8149e79c2bb385257a24006e9f7a?OpenDocument"
        ]
        # XXX: there was no quick way to read the actual status off the
        # response object, hence the string match on "404".  -PRT
        if "404" in str(e.response) and url in known_fail_links:
            return None
        raise

    xpath = ("//font[re:match(., '^(Yes|No|Not Voting|Absent):', 'i')]"
             "/ancestor::tr[1]")

    # Get the vote tallies.
    try:
        totals = doc.xpath(xpath, namespaces=namespaces)
        totals = totals[0].text_content()
    except IndexError:
        # The page does not have the typical format; maybe it was edited
        # by hand.  Log it and try to parse the tallies from plain text.
        self.log('Found an unusual votes page at url: "%s"' % url)
        totals = re_totals.findall(doc.text_content())
        if len(totals) == 4:
            self.log('...was able to parse vote tallies from "%s"' % url)
    else:
        totals = re_digit.findall(totals)

    try:
        yes_count, no_count, abstentions, absent = [int(t) for t in totals]
    except ValueError:
        # Either a tally was not numeric or there were not exactly four of
        # them.  This is probably a "voice vote" lacking actual tallies.
        yes_count, no_count, other_count = 0, 0, 0
    else:
        other_count = abstentions + absent

    # Create the vote object and record where it came from.
    vote = Vote(chamber, date, motion, passed,
                yes_count, no_count, other_count)
    vote.add_source(url)

    # Get the "vote type"; the value is either a sibling of the label or a
    # sibling of the label's parent.
    el = doc.xpath('//font[contains(., "Vote Type:")]')[0]
    try:
        vote_type = el.xpath('following-sibling::font[1]/text()')[0]
    except IndexError:
        vote_type = el.xpath('../following-sibling::font[1]/text()')[0]
    vote['vote_type'] = vote_type

    # Get a flat sequence like: name1, vote1, name2, vote2, ...
    xpath = ("//font[re:match(., '^[A-Z]$')]"
             "/../../../descendant::td/font/text()")
    data = [s for s in doc.xpath(xpath, namespaces=namespaces) if s.strip()]

    # Handle the rare case where not all names have corresponding text
    # indicating the vote value (see e.g. session 146 HB10): re-read every
    # cell of the last matching table so the blanks keep their position.
    data_len = len(data) // 2
    tally = sum(v for (k, v) in vote.items() if '_count' in k)
    if 0 < data_len and data_len != tally:
        xpath = "//font[re:match(., '^[A-Z]$')]/ancestor::table"
        table = doc.xpath(xpath, namespaces=namespaces)[-1]
        data = [td.text_content().strip()
                for td in table.xpath('descendant::td')]

    # Add names and vote values.  Zipping one iterator with itself yields
    # consecutive (name, value) pairs and drops a trailing unpaired item.
    vote_map = {
        'Y': 'yes',
        'N': 'no',
    }
    cells = iter(data)
    for name, value in zip(cells, cells):
        # Evidently the motion can be rescinded before the vote is cast,
        # perhaps due to a quorum failure.  (See the Senate vote of
        # 1/26/2011 for HB 10 w/HA 1.)  In that rare case the values in
        # the vote column are whitespace; skip them.
        if not value.strip():
            continue
        getattr(vote, vote_map.get(value, 'other'))(name)

    return vote
def scrape_vote(self, url,
                re_digit=re.compile(r'\d{1,3}'),
                re_totals=re.compile(
                    r'(?:Yes|No|Not Voting|Absent):\s{,3}(\d{,3})', re.I)):
    """Scrape a single roll-call page and build a ``Vote`` from it.

    Date, outcome, tallies, chamber and motion are all read from the page.

    :param url: address of the vote page; also recorded as the vote source.
    :param re_digit: no longer read; kept so existing callers that pass it
        keep working.
    :param re_totals: compiled pattern used only to report whether tallies
        can be recovered from plain text on an unusually formatted page.
    :returns: the populated ``Vote``, or ``None`` when the page is skipped
        (known bogus 404 link, committee report, or unrecognised chamber).
    :raises scrapelib.HTTPError: for any other HTTP failure.
    :raises ValueError: if the page has no "Date:" label or the date text
        cannot be parsed.
    :raises IndexError: if the page has no "Vote Type:" label.
    """
    namespaces = {"re": "http://exslt.org/regular-expressions"}

    try:
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
    except scrapelib.HTTPError as e:
        known_fail_links = [
            "http://legis.delaware.gov/LIS/lis146.nsf/7712cf7cc0e9227a852568470077336f/cdfd8149e79c2bb385257a24006e9f7a?OpenDocument",
            'http://legis.delaware.gov/LIS/lis147.nsf/7712cf7cc0e9227a852568470077336f/5f86852ea6649fa285257d08001bbe06?OpenDocument'
        ]
        # XXX: there was no quick way to read the actual status off the
        # response object, hence the string match on "404".  -PRT
        # (Blame for the workaround is assigned to Delaware.  -TWN)
        if "404" in str(e.response) and url in known_fail_links:
            msg = 'Recieved a bogus 22/404 return code. Skipping vote.'
            self.warning(msg)
            return None
        raise

    if 'Committee Report' in lxml.html.tostring(doc):
        # This was a committee vote with weird formatting.
        self.info('Skipping committee report.')
        return None

    # Diagnostics only: flag pages whose tally row is not where it usually
    # is, and say whether the tallies could still be found in plain text.
    # The counts actually stored on the vote are read further down.
    xpath = ("//font[re:match(., '^(Yes|No|Not Voting|Absent):', 'i')]"
             "/ancestor::tr[1]")
    if not doc.xpath(xpath, namespaces=namespaces):
        self.warning('Found an unusual votes page at url: "%s"' % url)
        if len(re_totals.findall(doc.text_content())) == 4:
            self.warning('...was able to parse vote tallies from "%s"' % url)

    # Values are located by their offset from a label in the flat list of
    # <font> text nodes (presumably the skipped node is layout filler --
    # TODO confirm against a live page).
    font_text = [s.strip() for s in doc.xpath('//font/text()')]
    date_index = font_text.index('Date:')
    date_string = font_text[date_index + 2]
    try:
        # %p only affects the parsed hour when the hour is read with %I;
        # with %H every "PM" time would be stored as a morning time.
        date = datetime.strptime(date_string, '%m/%d/%Y %I:%M %p')
    except ValueError:
        # Hour outside 1-12: fall back to the format used previously.
        date = datetime.strptime(date_string, '%m/%d/%Y %H:%M %p')

    # NOTE(review): any non-empty text here counts as "passed"; confirm
    # that the page really leaves this cell blank for a failed vote.
    passed = bool(font_text[date_index + 4])

    counts = defaultdict(int)
    for key, label in [('yes_count', 'Yes:'),
                       ('no_count', 'No:'),
                       ('absent_count', 'Absent:'),
                       ('not_voting', 'Not Voting:')]:
        try:
            counts[key] = int(font_text[font_text.index(label) + 2])
        except (ValueError, IndexError):
            # Label missing, value not numeric, or label at the very end
            # of the page: leave the count at its default of 0.
            continue
    counts['other_count'] = counts['absent_count'] + counts['not_voting']

    chamber_string = doc.xpath('string(//b/u/font/text())').lower()
    if 'senate' in chamber_string:
        chamber = 'upper'
    elif 'house' in chamber_string:
        chamber = 'lower'
    else:
        # Without this branch ``chamber`` would be unbound below.
        self.warning('Could not determine the chamber for the vote at '
                     '"%s". Skipping vote.' % url)
        return None

    # Take the first location that yields any motion text.  An empty motion
    # is left for schema validation to reject.
    for motion_xpath in ('string(//td/b/text())',
                         'string(//td/b/font/text())',
                         'string(//form/b/font/text())'):
        motion = doc.xpath(motion_xpath)
        if motion:
            break

    # Create the vote object and record where it came from.
    vote = Vote(chamber, date, motion, passed,
                counts['yes_count'], counts['no_count'],
                counts['other_count'])
    vote.add_source(url)

    # Get the "vote type"; the value is either a sibling of the label or a
    # sibling of the label's parent.
    el = doc.xpath('//font[contains(., "Vote Type:")]')[0]
    try:
        vote_type = el.xpath('following-sibling::font[1]/text()')[0]
    except IndexError:
        vote_type = el.xpath('../following-sibling::font[1]/text()')[0]
    vote['vote_type'] = vote_type

    # Get a flat sequence like: name1, vote1, name2, vote2, ...
    xpath = ("//font[re:match(., '^[A-Z]$')]"
             "/../../../descendant::td/font/text()")
    data = [s for s in doc.xpath(xpath, namespaces=namespaces) if s.strip()]

    # Handle the rare case where not all names have corresponding text
    # indicating the vote value (see e.g. session 146 HB10): re-read every
    # cell of the last matching table so the blanks keep their position.
    data_len = len(data) // 2
    tally = sum(v for (k, v) in vote.items() if '_count' in k)
    if 0 < data_len and data_len != tally:
        xpath = "//font[re:match(., '^[A-Z]$')]/ancestor::table"
        table = doc.xpath(xpath, namespaces=namespaces)[-1]
        data = [td.text_content().strip()
                for td in table.xpath('descendant::td')]

    # Add names and vote values.  Zipping one iterator with itself yields
    # consecutive (name, value) pairs and drops a trailing unpaired item.
    vote_map = {
        'Y': 'yes',
        'N': 'no',
    }
    cells = iter(data)
    for name, value in zip(cells, cells):
        # Evidently the motion can be rescinded before the vote is cast,
        # perhaps due to a quorum failure.  (See the Senate vote of
        # 1/26/2011 for HB 10 w/HA 1.)  In that rare case the values in
        # the vote column are whitespace; skip them.
        if not value.strip():
            continue
        getattr(vote, vote_map.get(value, 'other'))(name)

    return vote