def scrape_chamber(self, chamber, session):
    """Scrape all bills for one chamber of a New Hampshire session.

    Downloads the legislature's pipe-delimited dump files (LSRs,
    legislators, sponsors, docket), builds Bill objects keyed by LSR
    number, attaches versions/amendments/sponsors/actions, then yields
    the bills (and vote events via scrape_votes).  Sessions before 2017
    are delegated wholesale to NHLegacyBillScraper.
    """
    if int(session) < 2017:
        legacy = NHLegacyBillScraper(self.metadata, self.datadir)
        yield from legacy.scrape(chamber, session)
        # This throws an error because object_count isn't being properly incremented,
        # even though it saves fine. So fake the output_names
        self.output_names = ["1"]
        return

    # bill basics
    self.bills = {}  # LSR->Bill
    self.bills_by_id = {}  # need a second table to attach votes
    self.versions_by_lsr = {}  # mapping of bill ID to lsr
    self.amendments_by_lsr = {}

    # pre load the mapping table of LSR -> version id
    self.scrape_version_ids()
    self.scrape_amendments()

    # LSRs.txt rows are pipe-delimited; a well-formed row has 36 fields.
    # Rows occasionally wrap across two physical lines, so we buffer the
    # previous short row in last_line and try to rejoin.
    last_line = []
    for line in (
        self.get("http://gencourt.state.nh.us/dynamicdatafiles/LSRs.txt")
        .content.decode("utf-8")
        .split("\n")
    ):
        line = line.split("|")
        if len(line) < 1:
            continue
        if len(line) < 36:
            if len(last_line + line[1:]) == 36:
                # combine two lines for processing
                # (skip an empty entry at beginning of second line)
                # NOTE(review): the length check uses line[1:] but the
                # combine uses the full line, which leaves an extra (37th)
                # field at the join point — fields 0-10 read below appear
                # to come from last_line, so this may be benign; confirm
                # against the actual data file.
                line = last_line + line
                self.warning("used bad line")
            else:
                # skip this line, maybe we'll use it later
                self.warning("bad line: %s" % "|".join(line))
                last_line = line
                continue

        session_yr = line[0]
        lsr = line[1]
        title = line[2]
        body = line[3]
        # type_num = line[4]
        expanded_bill_id = line[9]
        bill_id = line[10]

        # Only keep rows for the requested chamber + session year.
        if body == body_code[chamber] and session_yr == session:
            # Classify from the expanded id prefix; fall back to the
            # bill_type_map keyed on the id's letters past the first one.
            if expanded_bill_id.startswith("CACR"):
                bill_type = "constitutional amendment"
            elif expanded_bill_id.startswith("PET"):
                bill_type = "petition"
            elif expanded_bill_id.startswith("AR") and bill_id.startswith("CACR"):
                bill_type = "constitutional amendment"
            elif expanded_bill_id.startswith("SSSB") or expanded_bill_id.startswith(
                "SSHB"
            ):
                # special session house/senate bills
                bill_type = "bill"
            else:
                bill_type = bill_type_map[expanded_bill_id.split(" ")[0][1:]]

            # Strip a leading parenthesized prefix, e.g. "(New Title) ..."
            if title.startswith("("):
                title = title.split(")", 1)[1].strip()

            self.bills[lsr] = Bill(
                legislative_session=session,
                chamber=chamber,
                identifier=bill_id,
                title=title,
                classification=bill_type,
            )
            # http://www.gencourt.state.nh.us/bill_status/billText.aspx?sy=2017&id=95&txtFormat=html
            if lsr in self.versions_by_lsr:
                version_id = self.versions_by_lsr[lsr]
                version_url = (
                    "http://www.gencourt.state.nh.us/bill_status/"
                    "billText.aspx?sy={}&id={}&txtFormat=html".format(
                        session, version_id
                    )
                )
                self.bills[lsr].add_version_link(
                    note="latest version", url=version_url, media_type="text/html"
                )
            # http://gencourt.state.nh.us/bill_status/billtext.aspx?sy=2017&txtFormat=amend&id=2017-0464S
            if lsr in self.amendments_by_lsr:
                amendment_id = self.amendments_by_lsr[lsr]
                amendment_url = (
                    "http://www.gencourt.state.nh.us/bill_status/"
                    "billText.aspx?sy={}&id={}&txtFormat=amend".format(
                        session, amendment_id
                    )
                )
                amendment_name = "Amendment #{}".format(amendment_id)
                self.bills[lsr].add_version_link(
                    note=amendment_name,
                    url=amendment_url,
                    media_type="application/pdf",
                )
            self.bills_by_id[bill_id] = self.bills[lsr]

    # load legislators (employee number -> name/seat), used to resolve
    # sponsor rows below
    self.legislators = {}
    for line in (
        self.get("http://gencourt.state.nh.us/dynamicdatafiles/legislators.txt")
        .content.decode("utf-8")
        .split("\n")
    ):
        if len(line) < 1:
            continue
        line = line.split("|")
        employee_num = line[0]
        # first, last, middle
        if line[3]:
            name = "%s %s %s" % (line[2], line[3], line[1])
        else:
            name = "%s %s" % (line[2], line[1])
        self.legislators[employee_num] = {"name": name, "seat": line[5]}
        # body = line[4]

    # sponsors: exactly 5 pipe-delimited fields expected per row
    for line in (
        self.get("http://gencourt.state.nh.us/dynamicdatafiles/LsrSponsors.txt")
        .content.decode("utf-8")
        .split("\n")
    ):
        if len(line) < 1:
            continue
        session_yr, lsr, _seq, employee, primary = line.strip().split("|")

        if session_yr == session and lsr in self.bills:
            sp_type = "primary" if primary == "1" else "cosponsor"
            try:
                self.bills[lsr].add_sponsorship(
                    classification=sp_type,
                    name=self.legislators[employee]["name"],
                    entity_type="person",
                    primary=True if sp_type == "primary" else False,
                )
                # NOTE(review): extras is reassigned for every sponsor row,
                # so only the last sponsor's seat code survives on the bill
                # — confirm this is intended (an update() may be meant).
                self.bills[lsr].extras = {
                    "_code": self.legislators[employee]["seat"]
                }
            except KeyError:
                # sponsor row references an employee number we never
                # loaded; log and keep going (best-effort)
                self.warning("Error, can't find person %s" % employee)

    # actions
    for line in (
        self.get("http://gencourt.state.nh.us/dynamicdatafiles/Docket.txt")
        .content.decode("utf-8")
        .split("\n")
    ):
        if len(line) < 1:
            continue
        # a few blank/irregular lines, irritating
        if "|" not in line:
            continue
        (session_yr, lsr, timestamp, bill_id, body, action, _) = line.split("|")
        if session_yr == session and lsr in self.bills:
            actor = "lower" if body == "H" else "upper"
            # NOTE(review): %p only influences parsing with %I (12-hour);
            # paired with %H the AM/PM token is effectively ignored.  Only
            # the date portion is used below, so this looks harmless here,
            # but confirm against the timestamp format in Docket.txt.
            time = dt.datetime.strptime(timestamp, "%m/%d/%Y %H:%M:%S %p")
            action = action.strip()
            atype = classify_action(action)
            self.bills[lsr].add_action(
                chamber=actor,
                description=action,
                date=time.strftime("%Y-%m-%d"),
                classification=atype,
            )
            # docket text can reference an amendment; attach it as a document
            amendment_id = extract_amendment_id(action)
            if amendment_id:
                self.bills[lsr].add_document_link(
                    note="amendment %s" % amendment_id,
                    url=AMENDMENT_URL % amendment_id,
                )

    yield from self.scrape_votes(session)

    # save all bills
    for bill in self.bills:
        # bill.add_source(zip_url)
        self.add_source(self.bills[bill], bill, session)
        yield self.bills[bill]
def scrape(self, chamber, session):
    """Scrape all bills for one chamber of a New Hampshire session
    (older scraper interface: save_bill / add_sponsor / add_version).

    Downloads the legislature's pipe-delimited dump files (LSRs,
    legislators, sponsors, docket), builds Bill objects keyed by LSR
    number, and saves each one.  Sessions before 2017 are delegated
    wholesale to NHLegacyBillScraper.
    """
    if int(session) < 2017:
        legacy = NHLegacyBillScraper(self.metadata, self.output_dir, self.strict_validation)
        legacy.scrape(chamber, session)
        # This throws an error because object_count isn't being properly incremented,
        # even though it saves fine. So fake the output_names
        self.output_names = ['1']
        return

    # bill basics
    self.bills = {}  # LSR->Bill
    self.bills_by_id = {}  # need a second table to attach votes
    self.versions_by_lsr = {}  # mapping of bill ID to lsr
    self.amendments_by_lsr = {}

    # pre load the mapping table of LSR -> version id
    self.scrape_version_ids()
    self.scrape_amendments()

    # LSRs.txt rows are pipe-delimited; a well-formed row has 36 fields.
    # Rows occasionally wrap across two physical lines, so we buffer the
    # previous short row in last_line and try to rejoin.
    last_line = []
    for line in self.get(
            'http://gencourt.state.nh.us/dynamicdatafiles/LSRs.txt'
    ).content.split("\n"):
        line = line.split('|')
        if len(line) < 1:
            continue
        if len(line) < 36:
            if len(last_line + line[1:]) == 36:
                # combine two lines for processing
                # (skip an empty entry at beginning of second line)
                # NOTE(review): the length check uses line[1:] but the
                # combine uses the full line, leaving an extra field at
                # the join — fields 0-10 read below appear to come from
                # last_line, so this may be benign; confirm.
                line = last_line + line
                self.warning('used bad line')
            else:
                # skip this line, maybe we'll use it later
                self.warning('bad line: %s' % '|'.join(line))
                last_line = line
                continue

        session_yr = line[0]
        lsr = line[1]
        title = line[2]
        body = line[3]
        type_num = line[4]  # read but unused
        expanded_bill_id = line[9]
        bill_id = line[10]

        # Only keep rows for the requested chamber + session year.
        if body == body_code[chamber] and session_yr == session:
            # Classify from the expanded id prefix; fall back to the
            # bill_type_map keyed on the id's letters past the first one.
            if expanded_bill_id.startswith('CACR'):
                bill_type = 'constitutional amendment'
            elif expanded_bill_id.startswith('PET'):
                bill_type = 'petition'
            elif expanded_bill_id.startswith('AR') and bill_id.startswith(
                    'CACR'):
                bill_type = 'constitutional amendment'
            else:
                bill_type = bill_type_map[expanded_bill_id.split(' ')[0][1:]]

            # Strip a leading parenthesized prefix, e.g. "(New Title) ..."
            if title.startswith('('):
                title = title.split(')', 1)[1].strip()

            self.bills[lsr] = Bill(session, chamber, bill_id, title,
                                   type=bill_type)
            # http://www.gencourt.state.nh.us/bill_status/billText.aspx?sy=2017&id=95&txtFormat=html
            if lsr in self.versions_by_lsr:
                version_id = self.versions_by_lsr[lsr]
                version_url = 'http://www.gencourt.state.nh.us/bill_status/' \
                              'billText.aspx?sy={}&id={}&txtFormat=html' \
                              .format(session, version_id)
                self.bills[lsr].add_version('latest version', version_url,
                                            mimetype='text/html',
                                            on_duplicate='use_new')
            # http://gencourt.state.nh.us/bill_status/billtext.aspx?sy=2017&txtFormat=amend&id=2017-0464S
            if lsr in self.amendments_by_lsr:
                amendment_id = self.amendments_by_lsr[lsr]
                amendment_url = 'http://www.gencourt.state.nh.us/bill_status/' \
                                'billText.aspx?sy={}&id={}&txtFormat=amend' \
                                .format(session, amendment_id)
                amendment_name = 'Amendment #{}'.format(amendment_id)
                self.bills[lsr].add_version(amendment_name, amendment_url,
                                            mimetype='application/pdf',
                                            on_duplicate='use_new')
            self.bills_by_id[bill_id] = self.bills[lsr]

    # load legislators (employee number -> name/seat), used to resolve
    # sponsor rows below
    self.legislators = {}
    for line in self.get(
            'http://gencourt.state.nh.us/dynamicdatafiles/legislators.txt'
    ).content.split("\n"):
        if len(line) < 1:
            continue
        line = line.split('|')
        employee_num = line[0]
        # first, last, middle
        if line[3]:
            name = '%s %s %s' % (line[2], line[3], line[1])
        else:
            name = '%s %s' % (line[2], line[1])
        self.legislators[employee_num] = {'name': name, 'seat': line[5]}
        #body = line[4]

    # sponsors: exactly 5 pipe-delimited fields expected per row
    for line in self.get(
            'http://gencourt.state.nh.us/dynamicdatafiles/LsrSponsors.txt'
    ).content.split("\n"):
        if len(line) < 1:
            continue
        session_yr, lsr, seq, employee, primary = line.strip().split('|')

        if session_yr == session and lsr in self.bills:
            sp_type = 'primary' if primary == '1' else 'cosponsor'
            try:
                self.bills[lsr].add_sponsor(
                    sp_type,
                    self.legislators[employee]['name'],
                    _code=self.legislators[employee]['seat'])
            except KeyError:
                # sponsor row references an employee number we never
                # loaded; log and keep going (best-effort)
                self.warning("Error, can't find person %s" % employee)

    # actions
    for line in self.get(
            'http://gencourt.state.nh.us/dynamicdatafiles/Docket.txt'
    ).content.split("\n"):
        if len(line) < 1:
            continue
        # a few blank/irregular lines, irritating
        if '|' not in line:
            continue
        (session_yr, lsr, timestamp, bill_id, body, action, _) = line.split('|')
        if session_yr == session and lsr in self.bills:
            actor = 'lower' if body == 'H' else 'upper'
            # NOTE(review): %p only influences parsing with %I (12-hour);
            # paired with %H the AM/PM token is effectively ignored, so PM
            # timestamps may parse with a morning hour — confirm against
            # the timestamp format in Docket.txt since the full datetime
            # is passed to add_action here.
            time = dt.datetime.strptime(timestamp, '%m/%d/%Y %H:%M:%S %p')
            action = action.strip()
            atype = classify_action(action)
            self.bills[lsr].add_action(actor, action, time, type=atype)
            # docket text can reference an amendment; attach it as a document
            amendment_id = extract_amendment_id(action)
            if amendment_id:
                self.bills[lsr].add_document('amendment %s' % amendment_id,
                                             AMENDMENT_URL % amendment_id)

    self.scrape_votes(session)

    # save all bills
    for bill in self.bills:
        #bill.add_source(zip_url)
        self.add_source(self.bills[bill], bill, session)
        self.save_bill(self.bills[bill])
def scrape_chamber(self, chamber, session):
    """Scrape all bills for one chamber of a New Hampshire session.

    Downloads the legislature's pipe-delimited dump files (LSRs,
    legislators, sponsors, docket), builds Bill objects keyed by LSR
    number, attaches versions/amendments/sponsors/actions, then yields
    the bills (and vote events via scrape_votes).  Sessions before 2017
    are delegated wholesale to NHLegacyBillScraper.
    """
    if int(session) < 2017:
        legacy = NHLegacyBillScraper(self.metadata, self.datadir)
        yield from legacy.scrape(chamber, session)
        # This throws an error because object_count isn't being properly incremented,
        # even though it saves fine. So fake the output_names
        self.output_names = ['1']
        return

    # bill basics
    self.bills = {}  # LSR->Bill
    self.bills_by_id = {}  # need a second table to attach votes
    self.versions_by_lsr = {}  # mapping of bill ID to lsr
    self.amendments_by_lsr = {}

    # pre load the mapping table of LSR -> version id
    self.scrape_version_ids()
    self.scrape_amendments()

    # LSRs.txt rows are pipe-delimited; a well-formed row has 36 fields.
    # Rows occasionally wrap across two physical lines, so we buffer the
    # previous short row in last_line and try to rejoin.
    last_line = []
    for line in self.get('http://gencourt.state.nh.us/dynamicdatafiles/LSRs.txt') \
            .content.decode('utf-8').split("\n"):
        line = line.split('|')
        if len(line) < 1:
            continue
        if len(line) < 36:
            if len(last_line + line[1:]) == 36:
                # combine two lines for processing
                # (skip an empty entry at beginning of second line)
                # NOTE(review): the length check uses line[1:] but the
                # combine uses the full line, leaving an extra field at
                # the join — fields 0-10 read below appear to come from
                # last_line, so this may be benign; confirm.
                line = last_line + line
                self.warning('used bad line')
            else:
                # skip this line, maybe we'll use it later
                self.warning('bad line: %s' % '|'.join(line))
                last_line = line
                continue

        session_yr = line[0]
        lsr = line[1]
        title = line[2]
        body = line[3]
        # type_num = line[4]
        expanded_bill_id = line[9]
        bill_id = line[10]

        # Only keep rows for the requested chamber + session year.
        if body == body_code[chamber] and session_yr == session:
            # Classify from the expanded id prefix; fall back to the
            # bill_type_map keyed on the id's letters past the first one.
            if expanded_bill_id.startswith('CACR'):
                bill_type = 'constitutional amendment'
            elif expanded_bill_id.startswith('PET'):
                bill_type = 'petition'
            elif expanded_bill_id.startswith('AR') and bill_id.startswith('CACR'):
                bill_type = 'constitutional amendment'
            else:
                bill_type = bill_type_map[expanded_bill_id.split(' ')[0][1:]]

            # Strip a leading parenthesized prefix, e.g. "(New Title) ..."
            if title.startswith('('):
                title = title.split(')', 1)[1].strip()

            self.bills[lsr] = Bill(legislative_session=session,
                                   chamber=chamber,
                                   identifier=bill_id,
                                   title=title,
                                   classification=bill_type)
            # http://www.gencourt.state.nh.us/bill_status/billText.aspx?sy=2017&id=95&txtFormat=html
            if lsr in self.versions_by_lsr:
                version_id = self.versions_by_lsr[lsr]
                version_url = 'http://www.gencourt.state.nh.us/bill_status/' \
                              'billText.aspx?sy={}&id={}&txtFormat=html' \
                              .format(session, version_id)
                self.bills[lsr].add_version_link(note='latest version',
                                                 url=version_url,
                                                 media_type='text/html')
            # http://gencourt.state.nh.us/bill_status/billtext.aspx?sy=2017&txtFormat=amend&id=2017-0464S
            if lsr in self.amendments_by_lsr:
                amendment_id = self.amendments_by_lsr[lsr]
                amendment_url = 'http://www.gencourt.state.nh.us/bill_status/' \
                                'billText.aspx?sy={}&id={}&txtFormat=amend' \
                                .format(session, amendment_id)
                amendment_name = 'Amendment #{}'.format(amendment_id)
                self.bills[lsr].add_version_link(note=amendment_name,
                                                 url=amendment_url,
                                                 media_type='application/pdf')
            self.bills_by_id[bill_id] = self.bills[lsr]

    # load legislators (employee number -> name/seat), used to resolve
    # sponsor rows below
    self.legislators = {}
    for line in self.get('http://gencourt.state.nh.us/dynamicdatafiles/legislators.txt') \
            .content.decode('utf-8').split("\n"):
        if len(line) < 1:
            continue
        line = line.split('|')
        employee_num = line[0]
        # first, last, middle
        if line[3]:
            name = '%s %s %s' % (line[2], line[3], line[1])
        else:
            name = '%s %s' % (line[2], line[1])
        self.legislators[employee_num] = {'name': name, 'seat': line[5]}
        # body = line[4]

    # sponsors: exactly 5 pipe-delimited fields expected per row
    for line in self.get('http://gencourt.state.nh.us/dynamicdatafiles/LsrSponsors.txt') \
            .content.decode('utf-8').split("\n"):
        if len(line) < 1:
            continue
        session_yr, lsr, seq, employee, primary = line.strip().split('|')

        if session_yr == session and lsr in self.bills:
            sp_type = 'primary' if primary == '1' else 'cosponsor'
            try:
                self.bills[lsr].add_sponsorship(classification=sp_type,
                                                name=self.legislators[employee]['name'],
                                                entity_type='person',
                                                primary=True if sp_type == 'primary' else False)
                # NOTE(review): extras is reassigned for every sponsor row,
                # so only the last sponsor's seat code survives on the bill
                # — confirm this is intended (an update() may be meant).
                self.bills[lsr].extras = {'_code': self.legislators[employee]['seat']}
            except KeyError:
                # sponsor row references an employee number we never
                # loaded; log and keep going (best-effort)
                self.warning("Error, can't find person %s" % employee)

    # actions
    for line in self.get('http://gencourt.state.nh.us/dynamicdatafiles/Docket.txt') \
            .content.decode('utf-8').split("\n"):
        if len(line) < 1:
            continue
        # a few blank/irregular lines, irritating
        if '|' not in line:
            continue
        (session_yr, lsr, timestamp, bill_id, body, action, _) = line.split('|')
        if session_yr == session and lsr in self.bills:
            actor = 'lower' if body == 'H' else 'upper'
            # NOTE(review): %p only influences parsing with %I (12-hour);
            # paired with %H the AM/PM token is effectively ignored.  Only
            # the date portion is used below, so this looks harmless here,
            # but confirm against the timestamp format in Docket.txt.
            time = dt.datetime.strptime(timestamp, '%m/%d/%Y %H:%M:%S %p')
            action = action.strip()
            atype = classify_action(action)
            self.bills[lsr].add_action(chamber=actor,
                                       description=action,
                                       date=time.strftime("%Y-%m-%d"),
                                       classification=atype)
            # docket text can reference an amendment; attach it as a document
            amendment_id = extract_amendment_id(action)
            if amendment_id:
                self.bills[lsr].add_document_link(note='amendment %s' % amendment_id,
                                                  url=AMENDMENT_URL % amendment_id)

    yield from self.scrape_votes(session)

    # save all bills
    for bill in self.bills:
        # bill.add_source(zip_url)
        self.add_source(self.bills[bill], bill, session)
        yield self.bills[bill]