def scrape_approp_subcommittees(self):
    """Scrape Senate Appropriations subcommittees and yield each Organization."""
    URL = 'http://www.senate.michigan.gov/committee/appropssubcommittee.html'
    doc = lxml.html.fromstring(self.get(URL).text)

    # Member names carry a parenthesized suffix marking their role; each
    # entry is (suffix, role) and the suffix plus its leading space is dropped.
    ROLE_SUFFIXES = (
        ('(MVC)', 'minority vice chairman'),
        ('(VC)', 'vice chairman'),
        ('(C)', 'chairman'),
    )

    for strong in doc.xpath('//strong'):
        com = Organization(
            name=strong.text.strip(),
            parent_id=self._senate_appropriations,
            classification='committee',
        )
        com.add_source(URL)

        # The roster is the text that follows the <strong> heading.
        roster = strong.getnext().tail.replace('Senators', '').strip()
        for leg in re.split(', | and ', roster):
            role = 'member'
            for suffix, suffix_role in ROLE_SUFFIXES:
                if leg.endswith(suffix):
                    role = suffix_role
                    # Strip the suffix and the space before it.
                    leg = leg[:-(len(suffix) + 1)]
                    break
            com.add_member(leg, role=role)
        yield com
def scrape_page(self, link, chamber=None):
    """Scrape one committee's page and yield the Organization with its roster."""
    url = link.attrib['href']
    page = self.lxmlize(url)

    role_map = {
        "Chair": "chair",
        "Vice Chair": "vice-chair",
        "Vice-Chair": "vice-chair",
    }

    committee = Organization(link.text, chamber=chamber, classification='committee')
    committee.add_source(url)

    roster = page.xpath('//div[@class="members"]/div[@class="roster-item"]')
    for member in roster:
        details = member.xpath('.//div[@class="member-details"]')[0]
        raw_name = details.xpath('./h4')[0].text_content()
        # This page does random weird things with whitepace to names
        person = ' '.join(raw_name.strip().split())
        if not person:
            continue

        role_node = details.xpath('./span[@class="member-role"]')
        role = role_map[role_node[0].text] if role_node else 'member'
        committee.add_member(person, role=role)
    yield committee
def scrape(self, chamber=None):
    """Yield NELIS committee Organizations for one or both chambers."""
    chamber_names = {'lower': 'Assembly', 'upper': 'Senate'}
    chambers = [chamber] if chamber else ['upper', 'lower']

    for chamber in chambers:
        insert = self.jurisdiction.session_slugs[self.latest_session()]
        list_url = '%s/%s/HomeCommittee/LoadCommitteeListTab' % (nelis_root, insert)
        doc = lxml.html.fromstring(self.get(list_url).text)

        panel_id = 'panel%sCommittees' % chamber_names[chamber]
        ul = doc.xpath('//ul[@id="%s"]' % panel_id)[0]

        for com in ul.xpath('li/div/div/div[@class="col-md-4"]/a'):
            name = com.text.strip()
            # The numeric committee key is embedded in the overview URL.
            com_id = re.match(r'.*/Committee/(?P<id>[0-9]+)/Overview',
                              com.attrib['href']).group('id')
            com_url = ('%s/%s/Committee/FillSelectedCommitteeTab?committeeOrSubCommitteeKey=%s'
                       '&selectedTab=Overview' % (nelis_root, insert, com_id))

            org = Organization(name=name, chamber=chamber, classification="committee")
            org.add_source(com_url)
            self.scrape_comm_members(chamber, org, com_url)
            yield org
def _scrape_lower_special_committees(self):
    """Scrape the House special-committees accordion page; yields each committee."""
    url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
    page = self.lxmlize(url)

    accordion = page.xpath('//div[@class="accordion"]')[0]
    for header in accordion.xpath('./h3'):
        name = self._normalize_committee_name(header.xpath('string()').strip())

        # Joint committees belong to the whole legislature, not just the House.
        chamber = 'legislature' if name.startswith('Joint') else 'lower'

        committee = Organization(name, chamber=chamber, classification='committee')
        committee.add_source(url)

        rows = header.xpath('./following-sibling::div[@class="pane"]'
                            '//tr[@class="linkStyle2"]')
        for row in rows:
            member_name = self._normalize_member_name(
                row.xpath('normalize-space(string(./th[1]))'))
            member_role = self._normalize_member_role(
                row.xpath('normalize-space(string(./th[2]))'))
            committee.add_member(member_name, member_role)
        yield committee
def test_fix_bill_id():
    """The 'bill' import transformer should normalize identifiers so a vote
    referencing 'HB1' resolves to the imported bill 'HB 1'."""
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    org1 = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Test Bill ID',
                      classification='bill', chamber='lower')

    oi = OrganizationImporter('jid')
    oi.import_data([org1.as_dict()])

    from pupa.settings import IMPORT_TRANSFORMERS
    IMPORT_TRANSFORMERS['bill'] = {
        'identifier': lambda x: re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', x, 1)
    }

    bi = BillImporter('jid', oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    ve = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                         start_date='1900-04-02', classification='passage:bill',
                         result='fail', bill_chamber='lower', bill='HB1',
                         identifier='4', bill_action='passage',
                         organization=org1._id)
    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve.as_dict(),
    ])
    # Reset the transformer so other tests are unaffected.
    IMPORT_TRANSFORMERS['bill'] = {}

    ve = VoteEvent.objects.get()
    # BUG FIX: this line was previously a bare comparison expression with no
    # `assert`, so the test could never fail on it.
    assert ve.bill.identifier == 'HB 1'
def scrape_committees(self, session):
    """Yield Oregon committee Organizations with members for a session."""
    session_key = SESSION_KEYS[session]
    committees = self.api_client.get('committees', session=session_key)
    legislators = index_legislators(self, session_key)

    chamber_map = {'S': 'upper', 'H': 'lower', 'J': 'legislature'}

    for committee in committees:
        committee_name = committee['CommitteeName']
        org = Organization(
            chamber=chamber_map[committee['HouseOfAction']],
            name=committee_name,
            classification='committee')
        org.add_source(
            'https://olis.leg.state.or.us/liz/{session}'
            '/Committees/{committee}/Overview'.format(session=session_key,
                                                      committee=committee_name))

        members = self.api_client.get('committee_members',
                                      session=session_key,
                                      committee=committee['CommitteeCode'])
        for member in members:
            code = member['LegislatorCode']
            try:
                member_name = legislators[code]
            except KeyError:
                # Fall back to the raw code when the roster lookup misses.
                logger.warn('Legislator {} not found in session {}'.format(
                    code, session_key))
                member_name = code
            org.add_member(member_name,
                           role=member['Title'] if member['Title'] else '')
        yield org
def get_organizations(self):
    """Yield the jurisdiction's Organization plus leader/member Posts.

    Posts are yielded individually (rather than attached via add_post) so
    that subclasses such as ca_on_toronto can modify them before import.
    """
    organization = Organization(self.name, classification=self.classification)

    leader_role = styles_of_address[self.division_id]['Leader']
    member_role = self.member_role or styles_of_address[self.division_id]['Member']

    parent = Division.get(self.division_id)

    # Don't yield posts for premiers.
    if parent._type not in ('province', 'territory'):
        # Yield posts to allow ca_on_toronto to make changes.
        post = Post(role=leader_role, label=parent.name,
                    division_id=parent.id, organization_id=organization._id)
        yield post

    children = [child for child in parent.children()
                if child._type != 'place' and child._type not in self.exclude_types]

    for child in children:
        # NOTE(review): `and` binds tighter than `or`, so this reads as
        #   (not skip_null_valid_from AND no validFrom)
        #   OR (validFrom AND (validFrom <= today OR validFrom == self.valid_from))
        # Presumably intentional — confirm before restructuring.
        if not self.skip_null_valid_from and not child.attrs.get('validFrom') or child.attrs.get('validFrom') and (child.attrs['validFrom'] <= datetime.now().strftime('%Y-%m-%d') or child.attrs['validFrom'] == self.valid_from):
            if self.use_type_id:
                # Derive a label like "Ward 3" from the division id's last path segment.
                label = child.id.rsplit('/', 1)[1].capitalize().replace(':', ' ')
            else:
                label = child.name
            # Yield posts to allow ca_on_toronto to make changes.
            post = Post(role=member_role, label=label,
                        division_id=child.id, organization_id=organization._id)
            yield post

    # With no child divisions, fall back to numbered at-large seats.
    if not children and parent.attrs['posts_count']:
        for i in range(1, int(parent.attrs['posts_count'])):  # exclude Mayor
            organization.add_post(role=member_role,
                                  label='{} (seat {})'.format(parent.name, i),
                                  division_id=parent.id)

    yield organization
def test_parent_id_resolution():
    """Importing a parent and child together should link them correctly."""
    un = ScrapeOrganization('UN', classification='international')
    unesco = ScrapeOrganization('UNESCO', classification='unknown',
                                parent_id=un._id)
    OrganizationImporter('jurisdiction-id').import_data(
        [un.as_dict(), unesco.as_dict()])
    assert Organization.objects.count() == 2
    assert Organization.objects.get(name='UN').children.count() == 1
    assert Organization.objects.get(name='UNESCO').parent.name == 'UN'
def test_deduplication_other_name_exists():
    """An org matching an existing other-name should dedupe, not insert."""
    create_jurisdictions()
    create_org()
    scraped = ScrapeOrganization('UN', classification='international')
    OrganizationImporter('jid1').import_data([scraped.as_dict()])
    assert Organization.objects.all().count() == 1
def scrape(self):
    """Yield the Temecula City Council organization and its members."""
    urls = Urls(dict(list=legislators_url), self)

    council = Organization('Temecula City Council', classification='legislature')
    council.add_source(urls.list.url)
    yield council

    for tr in urls.list.xpath('//table[2]//tr')[1:]:
        # Name and role share a cell; the img is the member photo.
        name, role = tr.xpath('td/p[1]//font/text()')
        image = tr.xpath('td/img/@src').pop()

        person = Person(name, image=image)
        membership = person.add_membership(council, role=role)

        # First link is a mailto:, second is the member detail page.
        email, detail_url = tr.xpath('td//a/@href')
        email = email[7:]  # drop the 'mailto:' prefix
        membership.contact_details.append(
            dict(type='email', value=email, note='work'))

        person.add_source(urls.list.url)
        person.add_source(detail_url)
        yield person
def scrape(self):
    """Search Missouri Ethics Commission committee records by last-name vowel
    and yield each candidate Person along with the committee Organization."""
    url = 'http://www.mec.mo.gov/EthicsWeb/CampaignFinance/CF11_SearchComm.aspx'

    # Searching on each vowel (plus 'y') should hit nearly every last name.
    for letter in ['a', 'e', 'i', 'o', 'u', 'y']:
        print("Searching '{}'".format(letter))
        initial = self.get(url).text
        parsed = lxml.html.fromstring(initial)

        page_n = 0
        data = get_form_data(parsed, first_time=True)
        data['ctl00$ContentPlaceHolder$txtCandLast'] = letter

        while True:
            page_n += 1
            print("Page: {}".format(page_n))

            # NOTE(review): the PageIndex cookie is always '1' even as page_n
            # grows — paging appears to be driven by the re-posted form data
            # (get_form_data below); confirm the cookie is actually needed.
            r = self.post(url, data=data, cookies=dict(PageIndex=str(1)))
            output = lxml.html.fromstring(r.text)

            rows = output.cssselect('#ctl00_ContentPlaceHolder_grvSearch tr')
            for r in rows:
                tds = r.cssselect('td')
                # Data rows have more than 3 cells; header/pager rows do not.
                if len(tds) > 3:
                    name = tds[2].text_content().strip()
                    _registrant = Person(
                        name=name,
                        source_identified=True
                    )
                    committee_name = tds[1].text_content().strip()
                    _office = Organization(
                        name=committee_name,
                        classification='Committee',
                        # parent_id=self.jurisdiction._state,
                        source_identified=True
                    )
                    _office.add_member(
                        _registrant,
                        role='committee candidate',
                        label='candidate for {n}'.format(n=_office.name),
                    )
                    yield _registrant
                    yield _office

            # Stop when the results grid has no "next page" link.
            if not output.xpath("//*[@id='ctl00_ContentPlaceHolder_grvSearch_ctl28_lbtnNextPage']"):
                print(output.xpath("//*[@id='ctl00_ContentPlaceHolder_grvSearch_ctl28_lbtnNextPage']"))
                break
            data = get_form_data(output)
def _scrape_upper_committee(self, name, url2):
    """Scrape a Senate committee's assignments page; yields the committee.

    `url2` is the committee's default.asp URL; the member roster lives on the
    sibling Assignments.asp page.
    """
    url3 = url2.replace("default.asp", "Assignments.asp")

    committee = Organization(name, chamber="upper", classification="committee")
    committee.add_source(url2)

    page = self.lxmlize(url3)
    members = page.xpath('//table[@id="table38"]//font/a/b')
    for link in members:
        # The first two listed members are the committee officers.
        if link == members[0]:
            role = "Chairman"
        elif link == members[1]:
            role = "Vice-Chairman"
        else:
            role = "member"
        member_name = link.xpath('string()').replace('Senator ', '')
        # BUG FIX: use a raw string for the regex; the old '[\s]{2,}' relied on
        # Python passing the invalid '\s' escape through unchanged, which is a
        # DeprecationWarning (and future SyntaxError) on Python 3.
        member_name = re.sub(r'\s{2,}', ' ', member_name).strip()
        committee.add_member(member_name, role)
    yield committee
def scrape_approp_subcommittees(self, url):
    """Yield Appropriations subcommittees scraped from the given page."""
    doc = lxml.html.fromstring(self.get(url).text)

    # (suffix, role) pairs; the suffix plus its leading space is stripped.
    role_suffixes = (
        ('(MVC)', 'minority vice chairman'),
        ('(VC)', 'vice chairman'),
        ('(C)', 'chairman'),
    )

    for strong in doc.xpath('//strong'):
        com = Organization(
            name=strong.text.strip(),
            parent_id={
                'name': 'Appropriations',
                'classification': 'committee',
            },
            classification='committee',
        )
        com.add_source(url)

        # The membership list is the text following the <strong> heading.
        roster = strong.getnext().tail.replace('Senators', '').strip()
        for leg in re.split(', | and ', roster):
            role = 'member'
            for suffix, suffix_role in role_suffixes:
                if leg.endswith(suffix):
                    role = suffix_role
                    leg = leg[:-(len(suffix) + 1)]
                    break
            com.add_member(leg, role=role)
        yield com
def get_organizations(self):
    """Yield the jurisdiction's Organization plus its leader and member Posts."""
    exclude_type_ids = getattr(self, 'exclude_type_ids', [])
    use_type_id = getattr(self, 'use_type_id', False)

    organization = Organization(self.name, classification=self.classification)
    parent = Division.get(self.division_id)
    styles = styles_of_address[self.division_id]

    # Provinces and territories get no leader post here.
    if parent._type not in ('province', 'territory'):
        yield Post(role=styles['Leader'],
                   label=parent.name,
                   division_id=parent.id,
                   organization_id=organization._id)

    children = [c for c in parent.children()
                if c._type != 'place' and c._type not in exclude_type_ids]

    for child in children:
        if child:
            if use_type_id:
                # Derive a label like "Ward 3" from the id's last path segment.
                label = child.id.rsplit('/', 1)[1].capitalize().replace(':', ' ')
            else:
                label = child.name
            yield Post(role=styles['Member'],
                       label=label,
                       division_id=child.id,
                       organization_id=organization._id)

    # With no child divisions, fall back to numbered at-large seats.
    if not children and parent.attrs['posts_count']:
        for seat in range(1, int(parent.attrs['posts_count'])):  # exclude Mayor
            organization.add_post(role=styles['Member'],
                                  label='{} (seat {})'.format(parent.name, seat),
                                  division_id=parent.id)

    yield organization
def scrape_senate_committee(self, url):
    """Scrape one Senate committee page; yields its Organization."""
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    headers = doc.xpath('(//div[@class="row"])[2]//h1')
    assert len(headers) == 1
    name = ' '.join(headers[0].xpath('./text()'))
    # Strip the trailing "... Committee" suffix from the heading.
    name = re.sub(r'\s+Committee.*$', '', name)

    com = Organization(chamber='upper', name=name, classification='committee')

    for member in doc.xpath('(//div[@class="row"])[3]/div[1]/ul[1]/li'):
        text = member.text_content()
        # NOTE(review): this strips a 'Representative ' prefix in a Senate
        # scraper — presumably the site labels members that way; confirm
        # against the live page.
        member_name = member.xpath('./a/text()')[0].replace('Representative ', '')
        # Role keywords appear in the surrounding list-item text.
        if 'Committee Chair' in text:
            role = 'chair'
        elif 'Minority Vice' in text:
            role = 'minority vice chair'
        elif 'Vice' in text:
            role = 'majority vice chair'
        else:
            role = 'member'
        com.add_member(member_name, role=role)

    com.add_source(url)
    yield com
def scrape_reps_comm(self):
    """Scrape Maine House committee rosters; yields each committee."""
    # As of 1/27/15, the committee page has the wrong
    # session number (126th) at the top, but
    # has newly elected people, so we're rolling with it.
    url = 'http://legislature.maine.gov/house/hsecoms.htm'
    page = self.get(url).text
    root = lxml.html.fromstring(page)

    count = 0

    # Committee headings sit in odd-numbered <center> elements; the matching
    # member list is the count-th <ul> in document order.
    for n in range(1, 12, 2):
        path = 'string(//body/center[%s]/h1/a)' % (n)
        comm_name = root.xpath(path)
        committee = Organization(chamber='lower', name=comm_name, classification='committee')
        count = count + 1

        path2 = '/html/body/ul[%s]/li/a' % (count)
        for el in root.xpath(path2):
            rep = el.text
            if rep.find('(') != -1:
                mark = rep.find('(')
                # NOTE(review): slicing from index 15 presumably skips a fixed
                # 'Representative ' prefix (15 chars) before the parenthetical
                # — confirm against the live page.
                rep = rep[15: mark].strip()
            if 'chair' in rep.lower():
                role = 'chair'
                # Drop the trailing ", Chair" (any case) from the name.
                rep = re.sub(r'(?i)[\s,]*chair\s*$', '', rep).strip()
            else:
                role = 'member'
            committee.add_member(rep, role)
        committee.add_source(url)

        yield committee
def scrape_lower_committee(self, name, url):
    """Scrape a lower-chamber committee page; returns the Organization."""
    page = self.lxmlize(url)

    committee = Organization(chamber='lower', name=name, classification="committee")
    committee.add_source(url)

    # Member names already added, to avoid duplicate memberships.
    seen = set()

    member_links = self.get_nodes(
        page,
        '//div[@class="mod-inner"]//a[contains(@href, "mem")]')

    for member_link in member_links:
        member_name = member_link.text
        if member_name is None:
            continue

        # The first listed member is the chair.
        if member_link == member_links[0]:
            member_role = 'chair'
        else:
            member_role = 'member'

        # BUG FIX: this previously tested the committee `name` against `seen`
        # (always true, since only member names were ever added), so duplicate
        # members were never filtered out.
        if member_name not in seen:
            committee.add_member(member_name, member_role)
            seen.add(member_name)

    return committee
def scrape(self):
    """Scrape the council page and yield each district's Municipal Council
    Organization plus its member Person.

    NOTE(review): the str.decode(...) calls mean this is Python 2 code; the
    decodes work around mis-encoded dash characters in the page text.
    """
    page = self.lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//div[@class="entry-content"]//p/strong')
    for councillor in councillors:
        # District name comes from the nearest preceding <h2>, split on an en dash.
        district = councillor.xpath('./ancestor::p/preceding-sibling::h2')[-1].text_content().split('–'.decode('utf-8'))[0]
        # The member's name is taken as the last two words of the <strong> text.
        name = ' '.join(councillor.text_content().split()[-2:]).replace('-Â'.decode('utf-8'), '')
        role = councillor.text_content().replace(name, '').split('-')[0]
        # Skip administrative officers and entries with no role.
        if 'SAO' in role or not role:
            continue

        org = Organization(name=district + ' Municipal Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)
        yield org

        p = Person(primary_org='legislature', name=name, district=district)
        p.add_source(COUNCIL_PAGE)
        membership = p.add_membership(org, role=role, district=district)

        # Contact lines follow the <strong> inside the same paragraph.
        info = councillor.xpath('./ancestor::p/text()')
        for contact in info:
            if 'NT' in contact:
                membership.add_contact_detail('address', contact.strip(), 'legislature')
            if 'Tel' in contact:
                # Normalize "(867) 555-1234" style numbers to "867-555-1234".
                contact = contact.replace('Tel. ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('voice', contact, 'legislature')
            if 'Fax' in contact:
                contact = contact.replace('Fax ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('fax', contact, 'legislature')
        email = self.get_email(councillor, './parent::p')
        membership.add_contact_detail('email', email)

        # The second link in the paragraph is the member's website, if any.
        if 'Website' in councillor.xpath('./parent::p')[0].text_content():
            p.add_link(councillor.xpath('./parent::p//a')[1].attrib['href'])
        yield p
def scrape(self, chamber=None):
    """Yield NC standing/select committees for one or both chambers."""
    base_url = ('http://www.ncga.state.nc.us/gascripts/Committees/'
                'Committees.asp?bPrintable=true&sAction=ViewCommitteeType&sActionDetails=')
    chamber_slugs = {'upper': ['Senate%20Standing', 'Senate%20Select'],
                     'lower': ['House%20Standing', 'House%20Select']}

    chambers = [chamber] if chamber else ['upper', 'lower']

    for chamber in chambers:
        for ctype in chamber_slugs[chamber]:
            list_url = base_url + ctype
            doc = lxml.html.fromstring(self.get(list_url).text)
            doc.make_links_absolute(list_url)

            for comm in doc.xpath('//ul/li/a'):
                name = comm.text
                # skip committee of whole Senate
                if 'Whole Senate' in name:
                    continue
                url = comm.get('href')
                committee = Organization(name=name, chamber=chamber,
                                         classification="committee")
                self.scrape_committee(committee, url)
                committee.add_source(url)
                if committee._related:
                    yield committee
                else:
                    self.warning('empty committee: %s', name)
def scrape_chamber(self, chamber, session):
    """Yield active WA committees for the given chamber and biennium."""
    url = "%s/GetActiveCommittees?biennium=%s" % (self._base_url, session)
    page = lxml.etree.fromstring(self.get(url).content)

    chamber_by_agency = {'House': 'lower', 'Senate': 'upper'}

    for comm_xml in xpath(page, "//wa:Committee"):
        agency = xpath(comm_xml, "string(wa:Agency)")
        # Only keep committees belonging to the requested chamber.
        if chamber_by_agency[agency] != chamber:
            continue

        name = xpath(comm_xml, "string(wa:Name)")
        # comm_id = xpath(comm_xml, "string(wa:Id)")
        # acronym = xpath(comm_xml, "string(wa:Acronym)")
        phone = xpath(comm_xml, "string(wa:Phone)")

        org = Organization(name, chamber=chamber, classification='committee')
        org.extras['phone'] = phone
        self.scrape_members(org, agency)
        org.add_source(url)
        if org._related:
            yield org
        else:
            self.warning('empty committee: %s', name)
def test_committee_add_member_person():
    """add_member with a Person records ids and role on the membership."""
    committee = Organization('Defense', classification='committee')
    chairman = Person('John Adams')
    committee.add_member(chairman, role='chairman')
    membership = committee._related[0]
    assert membership.person_id == chairman._id
    assert membership.organization_id == committee._id
    assert membership.role == 'chairman'
def scrape_committee(self, name, url, chamber):
    """Build and return a committee Organization with its membership."""
    org = Organization(name=name, chamber=chamber, classification='committee')
    org.add_source(url)

    doc = lxml.html.fromstring(self.get(url).text)

    # Checked in order: Vice-Chair must be tested before Chair.
    role_keywords = (('Vice-Chair', 'vice-chair'),
                     ('Co-Chair', 'co-chair'),
                     ('Chair', 'chair'))

    for leg in doc.xpath('//div[@id="members"]/div[@id="members"]/p/a/text()'):
        leg = leg.replace('Representative ', '').replace('Senator ', '').strip()
        role = 'member'
        if ' (' in leg:
            # A parenthetical after the name carries the role text.
            leg, raw_role = leg.split(' (')
            for keyword, mapped in role_keywords:
                if keyword in raw_role:
                    role = mapped
                    break
            else:
                raise Exception('unknown role: %s' % raw_role)
        org.add_member(leg, role)
    return org
def scrape_committee(self, term, href, name):
    """Scrape one committee page; yields the Organization unless the chamber
    cannot be determined from the URL."""
    page = lxml.html.fromstring(self.get(href).text)
    page.make_links_absolute(href)

    members = page.xpath("//div[@class='view-content']"
                         "//a[contains(@href, 'members')]")

    if '/joint/' in href:
        chamber = 'legislature'
    elif '/senate/' in href:
        chamber = 'upper'
    elif '/house/' in href:
        chamber = 'lower'
    else:
        # interim committees and others were causing duplicate committee issues, skipping
        self.warning('Failed to identify chamber for {}; skipping'.format(href))
        return

    cttie = Organization(name, chamber=chamber, classification='committee')

    role_map = {"Legislative Members": "member",
                "Chairman": "chair",
                "Vice Chairman": "member"}

    for anchor in members:
        member = anchor.text
        # Role comes from the pane heading above the member link.
        heading = anchor.xpath("ancestor::div/h2[@class='pane-title']/text()")[0].strip()
        role = role_map[heading]
        if member is None or member.startswith("District"):
            continue
        member = member.replace('Senator ', '').replace('Representative ', '')
        cttie.add_member(member, role=role)

    cttie.add_source(href)
    yield cttie
def add_committees(self, legislator_page, legislator, chamber, url):
    """Record the legislator's committee assignments, creating each committee
    Organization on first sight (cached in self.committees)."""
    # as of today, both chambers do committees the same way! Yay!
    rows = self.get_nodes(
        legislator_page,
        '//div[@id="ContentPlaceHolder1_TabSenator_TabCommittees"]//table/'
        'tr')

    if not rows:
        return

    # Skip the header row.
    for row in rows[1:]:
        committee_name = self.get_node(row, './td[2]').text_content().strip()
        if not committee_name:
            continue

        role = self.get_node(row, './td[3]').text_content().strip()

        if committee_name not in self.committees:
            committee = Organization(
                name=committee_name, chamber=chamber,
                classification='committee')
            committee.add_source(url)
            self.committees[committee_name] = committee

        self.committees[committee_name].add_member(
            legislator.name,
            role=role,
        )
def board_of_aldermen(self):
    """Yield the St Louis Board of Aldermen with one post per ward."""
    board = Organization(name="St Louis Board of Aldermen",
                         classification="legislature")
    # add a post for each Ward
    for ward in range(1, self.WARD_COUNT + 1):
        board.add_post(label="Ward {} Alderman".format(ward), role="Alderman")
    yield board
def test_extras_organization():
    """Organization.extras should round-trip through the importer."""
    scraped = ScrapeOrganization('United Nations', classification='international')
    scraped.extras = {"hello": "world", "foo": {"bar": "baz"}}
    OrganizationImporter('jurisdiction-id').import_data([scraped.as_dict()])
    imported = Organization.objects.get()
    assert imported.extras['foo']['bar'] == 'baz'
def get_organizations(self):
    """Yield Waterloo's council: a mayor plus seven ward councillors."""
    council = Organization(self.name, classification=self.classification)
    council.add_post(role='Mayor', label='Waterloo', division_id=self.division_id)
    for ward in range(1, 8):
        council.add_post(role='Councillor', label='Ward {}'.format(ward))
    yield council
def get_organizations(self):
    """Yield Mercier's council: a mayor plus six district councillors."""
    # @todo Eliminate once shapefile is found and ocd-division-ids is updated.
    council = Organization(self.name, classification=self.classification)
    council.add_post(role='Maire', label='Mercier', division_id=self.division_id)
    for district in range(1, 7):
        council.add_post(role='Conseiller', label='District {}'.format(district))
    yield council
def get_organizations(self):
    """Yield Grande Prairie's council: a mayor plus eight at-large seats."""
    council = Organization(self.name, classification=self.classification)
    council.add_post(role='Mayor', label='Grande Prairie',
                     division_id=self.division_id)
    for seat in range(1, 9):
        council.add_post(role='Councillor',
                         label='Grande Prairie (seat {})'.format(seat),
                         division_id=self.division_id)
    yield council
def get_organizations(self):
    """Yield Welland's council: a mayor plus two councillors per ward."""
    council = Organization(self.name, classification=self.classification)
    council.add_post(role='Mayor', label='Welland', division_id=self.division_id)
    for ward in range(1, 7):
        for seat in range(1, 3):
            council.add_post(role='Councillor',
                             label='Ward {} (seat {})'.format(ward, seat))
    yield council
def get_organizations(self):
    """Yield the council: a mayor plus two aldermen in each of four wards."""
    council = Organization(self.name, classification=self.classification)
    council.add_post(role='Mayor', label=self.division_name,
                     division_id=self.division_id)
    for ward in range(1, 5):
        for seat in range(1, 3):
            council.add_post(
                role='Alderman',
                label='Ward {} (seat {})'.format(ward, seat),
                division_id='{}/ward:{}'.format(self.division_id, ward))
    yield council
def get_organizations(self):
    """Yield the council: a mayor plus ward councillors, with a different
    number of seats per ward."""
    council = Organization(self.name, classification=self.classification)
    council.add_post(role='Mayor', label=self.division_name,
                     division_id=self.division_id)
    # Per-ward exclusive stop values: ward 1 has seats 1-2, ward 6 has 1-6, etc.
    for ward, stop in enumerate((3, 4, 3, 3, 3, 7), 1):
        for seat in range(1, stop):
            council.add_post(
                role='Councillor',
                label='Ward {} (seat {})'.format(ward, seat),
                division_id='{}/ward:{}'.format(self.division_id, ward))
    yield council
def scrape_committee(self, chamber, url):
    """Scrape one committee page; returns the Organization, or None when the
    page lists no members."""
    doc = lxml.html.fromstring(self.get(url, verify=False).text)

    com = Organization(doc.xpath('//title/text()')[0],
                       chamber=chamber,
                       classification='committee')
    com.add_source(url)

    members = doc.xpath('//a[contains(@href, "/Legislators/Profile")]')
    for member in members:
        # The role (e.g. Chair) sits in a sibling <span>; absent means member.
        title = member.xpath('../span')
        role = title[0].text.lower() if title else 'member'
        com.add_member(member.text, role)

    if members:
        return com
def get_organizations(self):
    """Yield Whitchurch-Stouffville's council: a mayor plus six at-large seats."""
    council = Organization(self.name, classification=self.classification)
    council.add_post(role='Mayor', label=self.division_name,
                     division_id=self.division_id)
    # @todo Until a boundary set is received and loaded into Represent, we treat this as having no divisions.
    # organization.add_post(role='Councillor', label='Ward {}'.format(ward_number), division_id=self.division_id)
    for seat in range(1, 7):
        council.add_post(
            role='Councillor',
            label='Whitchurch-Stouffville (seat {})'.format(seat),
            division_id=self.division_id)
    yield council
def get_organizations(self):
    """Yield Milwaukee's Common Council (15 aldermanic districts), the city
    executive organization, and the current mayor."""
    council = Organization(name="Common Council", classification="legislature")
    for district in range(1, 16):
        council.add_post(
            "District {}".format(district),
            "Alderman",
            division_id='ocd-division/country:us/state:wi/place:milwaukee'
                        '/council_district:{}'.format(district))
    yield council

    city = Organization("City of Milwaukee", classification='executive')
    city.add_post(
        'Mayor',
        'Mayor',
        division_id='ocd-division/country:us/state:wi/place:milwaukee')
    yield city

    barrett = Person(name="Barrett, Tom")
    barrett.add_term('Mayor', 'executive',
                     start_date=datetime.date(2004, 4, 15),
                     appointment=True)
    barrett.add_source('https://milwaukee.legistar.com/People.aspx')
    yield barrett
def scrape_house_committees(self):
    """Scrape MN House committees and their membership; yields Organizations."""
    url = 'http://www.house.leg.state.mn.us/comm/commemlist.asp'
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    for com in doc.xpath('//h2[@class="commhighlight"]'):
        members_url = com.xpath(
            'following-sibling::p[1]/a[text()="Members"]/@href')[0]

        com = Organization(com.text, chamber='lower', classification='committee')
        com.add_source(members_url)

        try:
            member_html = self.get(members_url).text
            mdoc = lxml.html.fromstring(member_html)
        except HTTPError:
            # Best-effort: skip committees whose member pages are down.
            self.warning(
                "Member list for {} failed to respond; skipping".format(
                    com.name))
            continue

        # each legislator in their own table
        # first row, second column contains all the info
        for ltable in mdoc.xpath('//table/tr[1]/td[2]/p/b[1]'):
            # name is tail string of last element
            name = ltable.text_content()
            text = ltable.text
            if text and name != text:
                # text_content() includes the element's own leading text;
                # strip it so only the trailing name remains.
                name = name.replace(text, '')

            # role is inside a nested b tag
            role = ltable.xpath('b/*/text()')
            if role:
                # if there was a role, remove it from name
                role = role[0]
                name = name.replace(role, '')
            else:
                role = 'member'
            # Drop any trailing parenthetical from the name.
            name = name.split(' (')[0]
            com.add_member(name, role)

        # save
        yield com
def scrape_senate_committee(self, url):
    """Scrape one Senate committee page; yields the Organization."""
    doc = lxml.html.fromstring(self.get(url).text)

    com_name = doc.xpath('//a[contains(@href, "committee_bio")]/text()')[0]

    # A committee_bio link inside an <h4> indicates a parent committee.
    parent = doc.xpath('//h4//a[contains(@href, "committee_bio")]/text()')
    if parent:
        self.log("%s is subcommittee of %s", com_name, parent[0])
        com = Organization(
            com_name,
            chamber="upper",
            classification="committee",
            parent_id={"name": parent[0], "classification": "upper"},
        )
    else:
        com = Organization(com_name, chamber="upper", classification="committee")

    position_map = {
        "Chair:": "chair",
        "Vice Chair:": "vice chair",
        "Ranking Minority Member:": "ranking minority member",
    }

    for link in doc.xpath('//div[@id="members"]//a[contains(@href, "member_bio")]'):
        name = link.text_content().strip()
        if not name:
            continue
        labels = link.xpath(".//preceding-sibling::b/text()")
        if not labels:
            position = "member"
        elif labels[0] in position_map:
            position = position_map[labels[0]]
        else:
            raise ValueError("unknown position: %s" % labels[0])
        # Drop any trailing parenthetical from the name.
        name = name.split(" (")[0]
        com.add_member(name.strip(), position)

    com.add_source(url)
    yield com
def scrape_committee(self, chamber, name, url):
    """Scrape a committee (or subcommittee) page; yields it unless empty."""
    page = lxml.html.fromstring(self.get(url).text)

    if page.xpath("//h3[. = 'Joint Committee']"):
        chamber = "joint"

    heading = page.xpath("//h3[@align='center']/text()")[0]
    if "Subcommittee" in heading:
        # Subcommittees hang off the named parent committee.
        comm = Organization(
            name=heading,
            classification="committee",
            parent_id={
                "classification": chamber,
                "name": name
            },
        )
    else:
        comm = Organization(chamber=chamber, name=name, classification="committee")
    comm.add_source(url)

    for link in page.xpath("//a[contains(@href, 'member=')]"):
        member = link.text.strip()
        # Member type label sits in the preceding table cell.
        mtype = link.xpath("string(../preceding-sibling::td[1])")
        comm.add_member(member, mtype.strip(": \r\n\t").lower())

    if comm._related:
        yield comm
    else:
        self.warning("not saving %s, appears to be empty" % name)
def scrape(self, session=None):
    """Scrape NJ committees and their active members from the legislature's
    Access database exports; yields each committee Organization."""
    if not session:
        session = self.jurisdiction.legislative_sessions[-1]['name']
        self.info('no session specified, using %s', session)

    year_abr = session[0:4]
    self._init_mdb(year_abr)

    members_csv = self.access_to_csv('COMember')
    info_csv = self.access_to_csv('Committee')

    # Committee Info Database: map committee code -> Organization.
    org_dictionary = {}
    for rec in info_csv:
        abrv = rec["Code"]
        # "A"-prefixed codes are Assembly committees, "S"-prefixed Senate.
        if abrv[0] == "A":
            chamber = "lower"
        elif abrv[0] == "S":
            chamber = "upper"
        org = Organization(
            name=rec["Description"],
            chamber=chamber,
            classification='committee',
        )
        org.add_source('http://www.njleg.state.nj.us/downloads.asp')
        org_dictionary[abrv] = org

    # Committee Member Database
    POSITIONS = {'C': 'chair', 'V': 'vice-chair', '': 'member'}
    for member_rec in members_csv:
        # assignment=P means they are active, assignment=R means removed
        if member_rec['Assignment_to_Committee'] != 'P':
            continue
        org = org_dictionary[member_rec["Code"]]
        # Flip "Last, First" to "First Last".
        leg = ' '.join(member_rec["Member"].split(', ')[::-1])
        org.add_member(leg, role=POSITIONS[member_rec["Position_on_Committee"]])

    for org in org_dictionary.values():
        yield org
def get_organizations(self):
    """Yield the council: a mayor plus two councillors per named ward."""
    wards = ('Grantham', 'Merritton', 'Port Dalhousie',
             "St. Andrew's", "St. George's", "St. Patrick's")

    council = Organization(self.name, classification=self.classification)
    council.add_post(role='Mayor', label=self.division_name,
                     division_id=self.division_id)
    for ward in wards:
        for seat in range(1, 3):
            council.add_post(
                role='Councillor',
                label='{} (seat {})'.format(ward, seat),
                division_id='{}/ward:{}'.format(self.division_id,
                                                clean_type_id(ward)))
    yield council
def get_organizations(self):
    """Yield Sacramento City Council: the mayor plus eight district members."""
    council = Organization(name="Sacramento City Council",
                           classification="legislature")
    council.add_post(
        'Mayor of the City of Sacramento',
        'Mayor',
        division_id='ocd-division/country:us/state:ca/place:sacramento')
    for district in range(1, 9):
        council.add_post(
            'Sacramento City Council Member, District {}'.format(district),
            'Member',
            division_id='ocd-division/country:us/state:ca/place:sacramento'
                        '/council_district:{}'.format(district))
    yield council
def scrape(self, session=None):
    """Build NJ committee Organizations from the legislative Access database
    and yield each with its active members attached."""
    if not session:
        session = self.jurisdiction.legislative_sessions[-1]["name"]
        self.info("no session specified, using %s", session)

    self._init_mdb(session[0:4])

    member_rows = self.access_to_csv("COMember")
    committee_rows = self.access_to_csv("Committee")

    committees = {}

    # Committee Info Database: one Organization per committee code.
    for row in committee_rows:
        code = row["Code"]
        # "A"-prefixed codes are Assembly committees, "S"-prefixed Senate.
        if code[0] == "A":
            chamber = "lower"
        elif code[0] == "S":
            chamber = "upper"
        committee = Organization(
            name=row["Description"], chamber=chamber, classification="committee"
        )
        committee.add_source("http://www.njleg.state.nj.us/downloads.asp")
        committees[code] = committee

    # Committee Member Database
    role_for_code = {"C": "chair", "V": "vice-chair", "": "member"}
    for row in member_rows:
        # assignment=P means they are active, assignment=R means removed
        if row["Assignment_to_Committee"] == "P":
            committee = committees[row["Code"]]
            # Flip "Last, First" to "First Last".
            full_name = " ".join(row["Member"].split(", ")[::-1])
            committee.add_member(full_name,
                                 role=role_for_code[row["Position_on_Committee"]])

    for committee in committees.values():
        yield committee
def scrape_house_committees(self):
    """Scrape MI House committees from the committee drop-down; yields each."""
    base_url = 'http://house.mi.gov/MHRPublic/CommitteeInfo.aspx?comkey='
    html = self.get('http://house.mi.gov/mhrpublic/committee.aspx').text
    doc = lxml.html.fromstring(html)

    # get values out of drop down
    for opt in doc.xpath('//option'):
        name = opt.text
        # skip invalid choice
        if opt.text in ('Statutory Committees', 'Select One'):
            continue
        if 'have not been created' in opt.text:
            self.warning('no committees yet for the house')
            return
        com_url = base_url + opt.get('value')
        com_html = self.get(com_url).text
        cdoc = lxml.html.fromstring(com_html)
        com = Organization(chamber='lower', name=name, classification='committee')
        com.add_source(com_url)

        # BUG FIX: removed a dead loop over //a[starts-with(@id,"memberLink")]
        # on the list page that only reassigned (clobbered) the local `name`
        # variable and had no other effect.

        members = cdoc.xpath('//div[contains(@id,"memberPanelRow")]')
        for mem in members:
            name_links = mem.xpath('./a')
            if not name_links:
                # this is a blank row
                continue
            member_name = name_links[0].text.strip()
            text = mem.xpath('./span')[0].text
            if 'Committee Chair' in text:
                role = 'chair'
            elif 'Vice-Chair' in text:
                role = 'vice chair'
            else:
                role = 'member'
            com.add_member(member_name, role=role)
        yield com
def handle_page(self):
    """Walk the committee list in document order, remembering the most
    recent parent committee so subcommittees can point back at it."""
    # don't use handle_page_item because we need to look back at prior element
    current_parent = None
    for anchor in self.doc.xpath(self.list_xpath):
        is_parent = 'parentcommittee' in anchor.attrib.get('class', '')
        committee_name = anchor.text_content().strip()
        if is_parent:
            current_parent = None
            chamber = 'lower'
        committee = Organization(
            name=committee_name,
            classification="committee",
            chamber=chamber,
            parent_id=current_parent,
        )
        yield self.scrape_page(HouseComDetail, anchor.attrib['href'], obj=committee)
        # remember this committee as parent for the rows that follow
        if is_parent:
            current_parent = committee._id
            chamber = None
def _scrape_select_special_committees(self):
    """Scrapes the Select and Special Committees page of the Nebraska state
    legislature."""
    main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
    page = self.lxmlize(main_url)

    committee_nodes = self.get_nodes(
        page,
        '//a[contains(@class, "accordion-switch")]'
        '/ancestor::div[@class="panel panel-leg"]')

    for committee_node in committee_nodes:
        committee_name = self.get_node(
            committee_node,
            './/h2[@class="panel-title"]/text()[normalize-space()]')

        if committee_name is None:
            committee_name = self.get_node(
                committee_node,
                './/h2[@class="panel-title"]/a/text()[normalize-space()]')

        org = Organization(
            name=committee_name,
            chamber='legislature',
            classification='committee')
        org.add_source(main_url)

        members = self.get_nodes(
            committee_node,
            './/a[@class="list-group-item"]'
            '/text()[normalize-space()]')

        for member in members:
            # BUG FIX: was r'\Sen\.\s+' — `\S` matches ANY non-whitespace
            # character, so e.g. "Ten. " would have been stripped too.
            # Anchor on the literal "Sen." title at a word boundary.
            member_name = re.sub(r'\bSen\.\s+', '', member)
            member_name = re.sub(r', Chairperson', '', member_name).strip()
            if 'Chairperson' in member:
                member_role = 'Chairperson'
            else:
                member_role = 'member'
            org.add_member(member_name, member_role)

        # only yield committees that actually have members
        if not org._related:
            self.warning('No members found in {} committee.'.format(
                org.name))
        else:
            yield org
def scrape_lower_committee(self, link, name):
    """Build a lower-chamber committee Organization from its roster page."""
    # the href sometimes contains stray whitespace; squeeze it out
    url = re.sub(r'\s+', '', link.attrib['href'])
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    comm = Organization(name=name, chamber='lower', classification='committee')
    comm.add_source(url)

    for member_link in doc.xpath('//a[contains(@href, "?member=")]'):
        member = member_link.text_content().strip()
        member = re.sub(r'^Delegate\s+', '', member)
        # the element immediately after the link holds the role, if any
        role = member_link.getnext().text or 'member'
        comm.add_member(member, role.strip())

    return comm
def scrape_upper_committee(self, link, name):
    """Build an upper-chamber committee Organization from its roster page."""
    # collapse any stray whitespace embedded in the href
    url = re.sub(r"\s+", "", link.attrib["href"])
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    comm = Organization(name=name, chamber="upper", classification="committee")
    comm.add_source(url)

    member_anchors = doc.xpath('//a[contains(@href, "?member=")]')
    for anchor in member_anchors:
        member_name = anchor.text_content().strip()
        member_name = re.sub(r"^Delegate\s+", "", member_name)
        # the sibling element after the link carries the role, if present
        role = anchor.getnext().text or "member"
        comm.add_member(member_name, role.strip())

    return comm
def scrape_committee(self, chamber, link, parent_comm=None):
    """Yield a committee (and, recursively, its subcommittees) from a
    committee home-page link.

    ``parent_comm`` is the parent committee's name when scraping a
    subcommittee; ``None`` at the top level.
    """
    home_link = link.attrib["href"]
    # strip a trailing "(H)"/"(S)" chamber marker and normalize the name
    name = re.sub(r'\s+\((H|S)\)$', '', link.text).strip().title()
    name = name.replace(".", "").strip()
    if "Subcommittee " in name and parent_comm:
        # keep only the part after "Subcommittee", dropping a leading "on"
        name = name.split("Subcommittee")[1]
        name = name.replace(" on ", "").replace(" On ", "")
        name = name.strip()
        comm = Organization(name, parent_id={
            'name': parent_comm,
            'classification': chamber
        }, classification='committee')
    else:
        # trim generic suffixes like "... Committee" from top-level names
        for c in ["Committee", "Comm", "Sub", "Subcommittee"]:
            if name.endswith(c):
                name = name[:-1 * len(c)].strip()
        comm = Organization(name, chamber=chamber, classification='committee')
    comm.add_source(home_link)
    # members live on a sibling page of the committee home page
    comm_url = home_link.replace('home.htm', 'members.htm')
    self.scrape_members(comm, comm_url)
    if comm._related:
        yield comm
    else:
        self.logger.warning("Empty committee, skipping.")
    # deal with subcommittees
    if parent_comm is None:
        # checking parent_comm so we don't look for subcommittees
        # in subcommittees leaving us exposed to infinity
        page = self.get(home_link).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(home_link)
        sub_links = page.xpath("//li/a[contains(@href, '/home.htm')]")
        for l in sub_links:
            if "committee" in l.text.lower():
                yield from self.scrape_committee(chamber, l, name)
def _scrape_lower_standing_committee(self, committee_name, url):
    """Yield a lower-chamber standing committee scraped from ``url``."""
    page = self.lxmlize(url)

    committee = Organization(
        committee_name,
        chamber="lower",
        classification="committee",
    )
    committee.add_source(url)

    member_rows = page.xpath(
        '//table[@id="body_ListView1_itemPlaceholderContainer"]'
        '/tr[@class="linkStyle2"]')

    for member_row in member_rows:
        # first cell: legislator name; second cell: their committee role
        raw_name = member_row.xpath('normalize-space(string(./td[1]/a))')
        member_name = self._normalize_member_name(raw_name)
        raw_role = member_row.xpath('normalize-space(string(./td[2]))')
        member_role = self._normalize_member_role(raw_role)
        committee.add_member(member_name, member_role)

    yield committee
def get_organizations(self):
    """Yield Parliament, then the Senate and House of Commons chambers,
    adding an MP post for each electoral district already in effect."""
    parliament = Organization(self.name, classification=self.classification)
    yield parliament

    senate = Organization('Senate', classification='upper', parent_id=parliament)
    commons = Organization('House of Commons', classification='lower',
                           parent_id=parliament)

    for riding in Division.get(self.division_id).children('ed'):
        valid_from = riding.attrs.get('validFrom')
        # only districts whose validFrom date has arrived get a post
        if valid_from and valid_from <= datetime.now().strftime('%Y-%m-%d'):
            commons.add_post(role='MP', label=riding.name,
                             division_id=riding.id)

    yield senate
    yield commons
def scrape_lower_committee(self, name, url):
    """Scrape a lower-chamber committee roster, deduplicating members."""
    page = self.lxmlize(url)

    committee = Organization(chamber='lower', name=name, classification="committee")
    committee.add_source(url)

    # the same legislator can appear in multiple link blocks; dedup by name
    seen = set()
    member_links = self.get_nodes(
        page,
        '//div[@class="commlinks"]//a[contains(@href, "mem")]')

    for member_link in member_links:
        member_name = None
        member_role = None

        member_text = member_link.text
        if member_text is not None:
            member = member_text.strip()
            member = re.sub(r'\s+', ' ', member)
            member_name, member_role = self._parse_name(member)

        if member_name is None:
            continue

        # Figure out if this person is the chair.
        role_type = self.get_node(
            member_link,
            '../../preceding-sibling::div[1]/text()')
        if role_type in (['Chair'], ['Co-Chair']):
            member_role = 'chair'
        else:
            member_role = 'member'

        # BUG FIX: previously tested `if name not in seen` — the committee
        # *name* against the set of member names — which is always true, so
        # duplicate members were added. Test the member's name instead.
        if member_name not in seen:
            committee.add_member(member_name, member_role)
            seen.add(member_name)

    return committee
def scrape_committee(self, term, href, name):
    """Scrape one committee page, inferring its chamber from the URL path."""
    page = lxml.html.fromstring(self.get(href).text)
    page.make_links_absolute(href)

    member_anchors = page.xpath(
        "//div[@class='view-content']"
        "//a[contains(@href, 'members')]")

    if '/joint/' in href:
        chamber = 'legislature'
    elif '/senate/' in href:
        chamber = 'upper'
    elif '/house/' in href:
        chamber = 'lower'
    else:
        # interim committees and others were causing duplicate committee issues, skipping
        self.warning(
            'Failed to identify chamber for {}; skipping'.format(href))
        return

    cttie = Organization(name, chamber=chamber, classification='committee')

    for anchor in member_anchors:
        member = anchor.text
        # the pane heading above the link tells us the member's role
        heading = anchor.xpath(
            "ancestor::div/h2[@class='pane-title']/text()")[0].strip()
        role = {
            "Legislative Members": "member",
            "Chairman": "chair",
            "Vice Chairman": "member",
        }[heading]
        if member is None or member.startswith("District"):
            continue
        member = member.replace('Senator ', '').replace('Representative ', '')
        cttie.add_member(member, role=role)

    cttie.add_source(href)
    yield cttie
def get_organizations(self):
    """Yield the Chicago City Council (with its 50 ward posts) plus the
    Mayor's and City Clerk's offices."""
    council = Organization(name="Chicago City Council",
                           classification="legislature")
    for ward in range(1, 51):
        council.add_post(
            "Ward {}".format(ward),
            "Alderman",
            division_id='ocd-division/country:us/state:il/place:chicago/ward:{}'.format(ward))
    yield council

    yield Organization('Office of the Mayor', classification='executive')

    # I'm not sure how to model the office of the city clerk: it's
    # a separately elected executive, I think.
    yield Organization('Office of the City Clerk', classification='executive')
def scrape_committees(self, session):
    """Scrape Oregon committees and their memberships from the OLIS API."""
    session_key = SESSION_KEYS[session]
    committees_response = self.api_client.get("committees", session=session_key)

    # map legislator codes -> display names for member resolution
    legislators = index_legislators(self, session_key)

    for committee in committees_response:
        org = Organization(
            chamber={
                "S": "upper",
                "H": "lower",
                "J": "legislature"
            }[committee["HouseOfAction"]],
            name=committee["CommitteeName"],
            classification="committee",
        )
        org.add_source("https://olis.leg.state.or.us/liz/{session}"
                       "/Committees/{committee}/Overview".format(
                           session=session_key,
                           committee=committee["CommitteeName"]))
        members_response = self.api_client.get(
            "committee_members",
            session=session_key,
            committee=committee["CommitteeCode"],
        )
        for member in members_response:
            try:
                member_name = legislators[member["LegislatorCode"]]
            except KeyError:
                # fall back to the raw code when the roster lookup misses.
                # FIX: logger.warn is a deprecated alias of logger.warning.
                logger.warning("Legislator {} not found in session {}".format(
                    member["LegislatorCode"], session_key))
                member_name = member["LegislatorCode"]
            org.add_member(member_name,
                           role=member["Title"] if member["Title"] else "")

        yield org
def scrape_senate_comm(self):
    """Scrape Maine Senate standing committees; each <strong> heading is a
    committee, followed by paragraphs of member links."""
    url = ('http://legislature.maine.gov/committee-information/'
           'standing-committees-of-the-senate')
    doc = lxml.html.fromstring(self.get(url).text)

    for heading in doc.xpath('//p/strong'):
        committee = Organization(
            chamber='upper',
            name=heading.text.strip(':'),
            classification='committee')
        committee.add_source(url)

        # members start in the paragraph after the heading's parent and
        # continue until a paragraph with no links
        paragraph = heading.getparent().getnext()
        while True:
            anchors = paragraph.xpath('a')
            if len(anchors) == 0:
                break
            match = self.senate_committee_pattern.search(anchors[0].text)
            member_name, chair_marker = match.groups()
            role = 'member' if chair_marker is None else 'chair'
            committee.add_member(member_name, role)
            paragraph = paragraph.getnext()

        yield committee
def scrape_upper_committee(self, name, url):
    """Scrape an upper-chamber committee and its members from ``url``.

    Raises if the page yields no members, since an empty roster indicates
    a scraper or markup problem rather than a real empty committee.
    """
    page = lxml.html.fromstring(self.get(url).text)

    comm = Organization(name=name, chamber="upper", classification="committee")
    comm.add_source(url)

    for link in page.xpath("//a[contains(@href, 'biographies')]"):
        member = link.xpath("string()").strip()
        member = re.sub(r"\s+", " ", member)
        if not member:
            continue

        # the text after the link (tail) carries the role, when present;
        # check "Vice Chair" before "Chair" since the latter is a substring
        role = link.tail
        if not role:
            role = "member"
        elif "Vice Chair" in role:
            role = "vice chair"
        elif "Chair" in role:
            role = "chair"

        member = member.replace("Senator ", "")
        comm.add_member(member, role=role)

    if not comm._related:
        # BUG FIX: was Exception("no members for %s", comm.name), which never
        # applied the %-format and raised with a (template, name) tuple.
        raise Exception("no members for %s" % comm.name)

    yield comm
def get_organizations(self):
    """Yield Maryland's government bodies: the Governor's office, the
    General Assembly, and its Senate and House chambers with posts."""
    legislature_name = "Maryland General Assembly"
    lower_chamber_name = "House"
    lower_title = "Delegate"
    upper_chamber_name = "Senate"
    upper_seats = 47
    upper_title = "Senator"

    legislature = Organization(name=legislature_name,
                               classification="legislature")
    upper = Organization(upper_chamber_name, classification='upper',
                         parent_id=legislature._id)
    lower = Organization(lower_chamber_name, classification='lower',
                         parent_id=legislature._id)

    # Senate districts are simply numbered 1..47.
    for n in range(1, upper_seats + 1):
        upper.add_post(
            label=str(n), role=upper_title,
            division_id='{}/sldu:{}'.format(self.division_id, n))

    # House districts are irregular: plain numbers elect up to three
    # delegates at large, while lettered subdistricts (e.g. 1A/1B/1C)
    # elect one or two each. Replaces ~65 copy-pasted add_post calls
    # (the former per-post `maximum=` counts were already commented out).
    lower_districts = [
        '1A', '1B', '1C', '2A', '2B', '3A', '3B', '4', '5', '6', '7', '8',
        '9A', '9B', '10', '11', '12', '13', '14', '15', '16', '17', '18',
        '19', '20', '21', '22', '23A', '23B', '24', '25', '26', '27A',
        '27B', '27C', '28', '29A', '29B', '29C', '30A', '30B', '31A',
        '31B', '32', '33', '34A', '34B', '35A', '35B', '36', '37A', '37B',
        '38A', '38B', '38C', '39', '40', '41', '42A', '42B', '43', '44A',
        '44B', '45', '46', '47A', '47B',
    ]
    for district in lower_districts:
        lower.add_post(
            district, role=lower_title,
            division_id='ocd-division/country:us/state:md/sldl:{}'.format(
                district.lower()))

    yield Organization('Office of the Governor', classification='executive')
    yield legislature
    yield upper
    yield lower
def get_organizations(self):
    """Yield Niagara Region's council: the regional chair plus each
    municipality's mayor and its regional councillors."""
    region = Organization(self.name, classification=self.classification)
    region.add_post(role='Regional Chair', label=self.division_name,
                    division_id=self.division_id)

    # per-municipality seat counts (stop = councillor seats + 1) and
    # census-subdivision type ids
    municipalities = {
        'Fort Erie': {'stop': 3, 'type_id': '3526003'},
        'Grimsby': {'stop': 3, 'type_id': '3526065'},
        'Lincoln': {'stop': 3, 'type_id': '3526057'},
        'Niagara Falls': {'stop': 5, 'type_id': '3526043'},
        'Niagara-on-the-Lake': {'stop': 3, 'type_id': '3526047'},
        'Pelham': {'stop': 3, 'type_id': '3526028'},
        'Port Colborne': {'stop': 3, 'type_id': '3526011'},
        'St. Catharines': {'stop': 8, 'type_id': '3526053'},
        'Thorold': {'stop': 3, 'type_id': '3526037'},  # can be 2
        'Wainfleet': {'stop': 2, 'type_id': '3526014'},  # can be 3
        'Welland': {'stop': 4, 'type_id': '3526032'},
        'West Lincoln': {'stop': 2, 'type_id': '3526021'},
    }

    for municipality, info in municipalities.items():
        division_id = 'ocd-division/country:ca/csd:{}'.format(info['type_id'])
        region.add_post(role='Mayor', label=municipality,
                        division_id=division_id)
        for seat_number in range(1, info['stop']):
            region.add_post(
                role='Councillor',
                label='{} (seat {})'.format(municipality, seat_number),
                division_id=division_id)

    yield region
def test_full_bill():
    """End-to-end test: scrape a bill with every field populated, import it
    alongside its people/organizations, and verify each piece round-tripped
    into the database."""
    create_jurisdiction()
    sp = ScrapePerson('Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee', classification='committee',
                             parent_id=org._id)

    oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=org._id)

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee', '1900-04-04', chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99", legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship('Adam Smith', classification='extra sponsor',
                         entity_type='person', primary=False, entity_id=sp._id)
    bill.add_sponsorship('Jane Smith', classification='lead sponsor',
                         entity_type='person', primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.',
                      note="official", date='1969-10-20')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.html',
                           media_type='text/html')
    bill.add_version_link('Fiscal Note', 'http://example.com/v/1',
                          media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    pi.import_data([sp.as_dict()])

    BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'
    assert b.abstracts.get().date == '1969-10-20'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'

    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name='Adam Smith')
    for ss in sponsorships:
        if ss.primary:
            # 'Jane Smith' was scraped without an entity_id, so the primary
            # sponsorship stays unlinked to any person/organization
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
def get_organizations(self):
    """Yield Thunder Bay's city council: the mayor, five at-large seats,
    and one councillor per ward."""
    council = Organization(self.name, classification=self.classification)

    council.add_post(role='Mayor', label='Thunder Bay',
                     division_id=self.division_id)
    for seat_number in range(1, 6):
        council.add_post(
            role='Councillor at Large',
            label='Thunder Bay (seat {})'.format(seat_number),
            division_id=self.division_id)

    # ward councillors (no division ids available for wards)
    wards = ('Current River', 'Red River', 'McKellar', 'McIntyre',
             'Northwood', 'Westfort', 'Neebing')
    for ward in wards:
        council.add_post(role='Councillor', label=ward)

    yield council