def get_people(self):
    """Yield an Organization and a Legislator per official on COUNCIL_PAGE.

    Every ``<strong>`` inside a paragraph of the ``entry-content`` div is
    treated as one official.  Entries whose role is empty or contains
    ``SAO`` are skipped.
    """
    doc = lxmlize(COUNCIL_PAGE)

    for strong in doc.xpath('//div[@class="entry-content"]//p/strong'):
        # District: text before the dash in the last <h2> that precedes
        # the enclosing paragraph.
        heading = strong.xpath('./ancestor::p/preceding-sibling::h2')[-1]
        district = heading.text_content().split('–'.decode('utf-8'))[0]

        label = strong.text_content()
        # Name: the last two whitespace-separated words of the label.
        name = ' '.join(label.split()[-2:]).replace('-Â'.decode('utf-8'), '')
        # Role: whatever is left of the label, up to the first hyphen.
        role = label.replace(name, '').split('-')[0]
        if not role or 'SAO' in role:
            continue

        council = Organization(
            name=district + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        council.add_source(COUNCIL_PAGE)
        yield council

        person = Legislator(name=name, post_id=district)
        person.add_source(COUNCIL_PAGE)
        membership = person.add_membership(
            council, role=role, post_id=district)

        for text in strong.xpath('./ancestor::p/text()'):
            if 'NT' in text:
                membership.add_contact_detail(
                    'address', text.strip(), 'legislature')
            if 'Tel' in text:
                text = text.replace('Tel. ', '')
                text = text.replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('voice', text, 'legislature')
            if 'Fax' in text:
                text = text.replace('Fax ', '')
                text = text.replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('fax', text, 'legislature')

        mail_xpath = './parent::p//a[contains(@href, "mailto:")]/text()'
        membership.add_contact_detail(
            'email', strong.xpath(mail_xpath)[0], None)

        paragraph = strong.xpath('./parent::p')[0]
        if 'Website' in paragraph.text_content():
            # The second link of the paragraph is used as the website.
            website = strong.xpath('./parent::p//a')[1].attrib['href']
            person.add_link(website, None)
        yield person
def get_people(self):
    """Yield the Temecula City Council, then one Person per table row.

    Rows are read from the second table of the legislators list page; the
    first row is skipped.
    """
    urls = Urls(dict(list=legislators_url), self)

    council = Organization(
        'Temecula City Council', classification='legislature')
    council.add_source(urls.list.url)
    yield council

    rows = urls.list.xpath('//table[2]//tr')
    for row in rows[1:]:
        # The first paragraph of the row must hold exactly two text nodes.
        name, role = row.xpath('td/p[1]//font/text()')
        image = row.xpath('td/img/@src').pop()

        person = Person(name, image=image)
        membership = person.add_membership(council, role=role)

        # The row must hold exactly two links: a mailto: link followed by
        # the detail page.
        mailto, detail_url = row.xpath('td//a/@href')
        address = mailto[7:]  # drop the first 7 characters ("mailto:")
        membership.contact_details.append(
            {'type': 'email', 'value': address, 'note': 'work'})

        person.add_source(urls.list.url)
        person.add_source(detail_url)
        yield person
def get_people(self):
    """Yield the Temecula City Council, then one Person per table row.

    Rows are read from the second table of the legislators list page; the
    first row is skipped.
    """
    urls = Urls(dict(list=legislators_url), self)

    council = Organization(
        'Temecula City Council', classification='legislature')
    council.add_source(urls.list.url)
    yield council

    rows = urls.list.xpath('//table[2]//tr')
    for row in rows[1:]:
        # The first paragraph of the row must hold exactly two text nodes.
        name, role = row.xpath('td/p[1]//font/text()')
        image = row.xpath('td/img/@src').pop()

        person = Person(name, image=image)
        membership = person.add_membership(council, role=role)

        # The row must hold exactly two links: a mailto: link followed by
        # the detail page.
        mailto, detail_url = row.xpath('td//a/@href')
        address = mailto[7:]  # drop the first 7 characters ("mailto:")
        membership.contact_details.append(
            {'type': 'email', 'value': address, 'note': 'work'})

        person.add_source(urls.list.url)
        person.add_source(detail_url)
        yield person
def import_jurisdiction(org_importer, jurisdiction):
    """Save a jurisdiction and import its legislature and party organizations.

    The jurisdiction's database object is stamped with ``_type``, ``_id``
    and ``latest_update``, validated against ``jurisdiction_schema`` and
    saved to ``db.jurisdictions``.  One ``legislature`` organization named
    after the jurisdiction and one ``party`` organization per entry of
    ``jurisdiction.parties`` are then handed to
    ``org_importer.import_object``.

    Any exception raised by the validator (including ``ValueError``)
    propagates to the caller with its original traceback.
    """
    obj = jurisdiction.get_db_object()
    obj['_type'] = 'jurisdiction'
    obj['_id'] = jurisdiction.jurisdiction_id
    obj['latest_update'] = datetime.datetime.utcnow()

    # validate jurisdiction -- no try/except: catching ValueError only to
    # ``raise ve`` again would replace the traceback in Python 2.
    validator = DatetimeValidator()
    validator.validate(obj, jurisdiction_schema)

    db.jurisdictions.save(obj)

    # create organization(s)
    # (TODO: if there are multiple chambers this isn't right)
    org = Organization(name=jurisdiction.name,
                       classification='legislature',
                       jurisdiction_id=jurisdiction.jurisdiction_id)
    if jurisdiction.other_names:
        org.other_names = jurisdiction.other_names
    if jurisdiction.parent_id:
        org.parent_id = jurisdiction.parent_id
    org_importer.import_object(org)

    # create parties
    for party in jurisdiction.parties:
        org = Organization(classification='party',
                           name=party['name'],
                           parent_id=None)
        org_importer.import_object(org)
def get_people(self):
    """Yield one Organization per district row, followed by its members.

    District rows are every third ``<tr>`` of the search-content div,
    starting at index 5.  All members of a district share the district's
    address, fax, phone and email.
    """
    doc = lxmlize(COUNCIL_PAGE)
    rows = doc.xpath('//div[@id="ctl00_PublicContent_divSearchContent"]//tr')

    for row in rows[5::3]:
        cell_text = row.xpath('.//td//text()')
        # When the first text node is at most one character long, the
        # first two nodes are joined to form the title.
        if len(cell_text[0]) > 1:
            title = cell_text[0]
        else:
            title = ''.join(cell_text[:2])
        # @todo Need to distinguish between, e.g., R.M. and Town
        title = title.title()

        council = Organization(
            name=title + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        council.add_source(COUNCIL_PAGE)
        yield council

        direct_text = row.xpath('.//td/text()')
        # Address: first four text nodes, minus anything from "Fax:" on.
        address = re.sub(r'(Fax:.*)', '', ' '.join(direct_text[:4])).strip()
        fax_nodes = [node for node in direct_text if 'Fax' in node]
        fax = fax_nodes[0].split(':')[1].strip()
        phone_nodes = row.xpath('.//b[contains(text(), "Phone")]/text()')
        phone = phone_nodes[0].split(':')[1].strip()
        email = row.xpath('.//a[contains(@href, "mailto:")]/text()')[0].strip()

        names = row.xpath('.//td[3]/text()')
        positions = row.xpath('.//td[2]/b/text()')
        for idx, member_name in enumerate(names):
            person = Legislator(name=member_name, post_id=title)
            person.add_source(COUNCIL_PAGE)

            # The first two names take their role from the bold labels;
            # everyone after that is a councillor.
            # @todo "Resident Administrator & Chief Administrative Officer"
            # is split on two lines
            role = 'Councillor' if idx >= 2 else positions[idx]
            membership = person.add_membership(council, role=role)
            membership.post_id = title

            membership.add_contact_detail('address', address, 'legislature')
            membership.add_contact_detail('fax', fax, 'legislature')
            membership.add_contact_detail('voice', phone, 'legislature')
            membership.add_contact_detail('email', email, None)
            yield person
def test_add_contact():
    """An organization stays valid after a contact detail is added."""
    org = Organization("name")
    org.add_source(url='foo')
    org.validate()

    detail = {'type': 'voice', 'value': '555-393-2821', 'note': 'nothing'}
    org.add_contact_detail(**detail)
    org.validate()
def get_people(self):
    """Yield an Organization and its mayor/warden for each entry of the PDF.

    The PDF at COUNCIL_PAGE is written to a temporary file, converted to
    text with ``pdftotext`` and split on the ``Mayor `` / ``Warden ``
    prefixes.  The temporary file is opened in binary mode and is removed
    as soon as the conversion finishes or fails.

    A fax contact detail is only added when the entry actually has a Fax
    line.
    """
    import tempfile  # local import: only needed for the scratch PDF

    response = urllib2.urlopen(COUNCIL_PAGE).read()
    # NamedTemporaryFile opens in binary mode and deletes the file on close.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as pdf:
        pdf.write(response)
        pdf.flush()
        data = subprocess.check_output(['pdftotext', pdf.name, '-'])

    emails = re.findall(r'(?<=E-mail: ).+', data)
    entries = re.split(r'Mayor |Warden ', data)[1:]
    for entry in entries:
        lines = entry.splitlines(True)
        name = lines.pop(0).strip()
        if name == "Jim Smith":
            continue

        district = lines.pop(0).strip()
        # A following line without digits still belongs to the district.
        if not re.findall(r'[0-9]', lines[0]):
            district = district + ' ' + lines.pop(0).strip()

        org = Organization(
            name=district + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)
        yield org

        p = Legislator(name=name, post_id=district)
        p.add_source(COUNCIL_PAGE)
        membership = p.add_membership(org, role='Mayor', post_id=district)

        # The address spans two to four lines, ending before "Phone".
        address = lines.pop(0).strip() + ', ' + lines.pop(0).strip()
        if 'Phone' not in lines[0]:
            address = address + ', ' + lines.pop(0).strip()
        if 'Phone' not in lines[0]:
            address = address + ', ' + lines.pop(0).strip()

        phone = lines.pop(0).split(':')[1].strip()

        # Reset per entry so a missing Fax line can neither raise NameError
        # nor reuse the previous entry's number.
        fax = None
        if 'Fax' in lines.pop(0):
            fax = lines.pop(0).strip()

        membership.add_contact_detail('address', address, 'legislature')
        membership.add_contact_detail('voice', phone, 'legislature')
        if fax:
            membership.add_contact_detail('fax', fax, 'legislature')

        # @todo emails are being assigned incorrectly, e.g. Town of Berwick
        # picks up Cape Breton Regional Municipality and Region of Queens
        # Municipality
        # NOTE(review): popping from ``emails`` while enumerating it skips
        # the element that follows each match; left unchanged on purpose
        # because changing it alters which emails get assigned.
        for idx, email in enumerate(emails):
            regex = (name.split()[-1].lower() + '|' +
                     '|'.join(district.split()[-2:]).replace('of', '').lower())
            regex = regex.replace('||', '|')
            matches = re.findall(r'%s' % regex, email)
            if matches:
                membership.add_contact_detail('email', emails.pop(idx), None)
        yield p
def get_people(self):
    """Yield an Organization per district page, followed by its members.

    The first four links of the index list on COUNCIL_PAGE are followed.
    Each leads to a table of district pages, and each district page yields
    one organization plus one Legislator per non-vacant name.  The position
    of the index link selects the organization-name suffix from
    ``org_types``.
    """
    index = lxmlize(COUNCIL_PAGE)
    type_links = index.xpath(
        '//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href')

    for org_type, link in enumerate(type_links[:4]):
        listing = lxmlize(link)
        district_urls = listing.xpath(
            '//div[@class="parbase list section cplist"]'
            '/table/tr/td[1]/b/a/@href')

        for district_url in district_urls:
            page = lxmlize(district_url)
            header = page.xpath('//div[@class="pageHeader"]/h1/text()')[0]
            district = header.split(' - ')[1].strip()

            org = Organization(
                name=district + org_types[org_type],
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(district_url)
            yield org

            address = ', '.join(
                page.xpath('//div[@class="left_contents"]/p[1]/text()'))
            contacts = page.xpath(
                '//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()')
            phone = contacts[0].split(':')[1].strip().replace(' ', '-')
            fax = contacts[1].split(':')[1].strip().replace(' ', '-')

            # email and site stay falsy when the page has no such link.
            email = page.xpath(
                '//div[@class="left_contents"]//a[contains(@href, "mailto:")]')
            if email:
                email = email[0].text_content()
            site = page.xpath(
                '//div[@class="left_contents"]'
                '//a[not(contains(@href,"mailto:"))]')
            if site:
                site = site[0].text_content()

            names = page.xpath('//div[@class="right_contents"]//p/text()')
            for position, member_name in enumerate(names):
                if 'Vacant' in member_name:
                    continue

                person = Legislator(name=member_name, post_id=district)
                for source in (COUNCIL_PAGE, link, district_url):
                    person.add_source(source)

                # The first listed name is the mayor.
                role = 'Mayor' if position == 0 else 'Councillor'
                membership = person.add_membership(org, role=role)
                membership.post_id = district

                membership.add_contact_detail(
                    'address', address, 'legislature')
                if phone:
                    membership.add_contact_detail(
                        'voice', phone, 'legislature')
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if email:
                    membership.add_contact_detail('email', email, None)
                if site:
                    person.add_link(site, None)
                yield person
def get_people(self):
    """Yield an Organization and a Legislator per official on COUNCIL_PAGE.

    Every ``<strong>`` inside a paragraph of the ``entry-content`` div is
    treated as one official.  Entries whose role is empty or contains
    ``SAO`` are skipped.
    """
    doc = lxmlize(COUNCIL_PAGE)

    for strong in doc.xpath('//div[@class="entry-content"]//p/strong'):
        # District: text before the dash in the last <h2> that precedes
        # the enclosing paragraph.
        heading = strong.xpath('./ancestor::p/preceding-sibling::h2')[-1]
        district = heading.text_content().split('–'.decode('utf-8'))[0]

        label = strong.text_content()
        # Name: the last two whitespace-separated words of the label.
        name = ' '.join(label.split()[-2:]).replace('-Â'.decode('utf-8'), '')
        # Role: whatever is left of the label, up to the first hyphen.
        role = label.replace(name, '').split('-')[0]
        if not role or 'SAO' in role:
            continue

        council = Organization(
            name=district + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        council.add_source(COUNCIL_PAGE)
        yield council

        person = Legislator(name=name, post_id=district)
        person.add_source(COUNCIL_PAGE)
        membership = person.add_membership(
            council, role=role, post_id=district)

        for text in strong.xpath('./ancestor::p/text()'):
            if 'NT' in text:
                membership.add_contact_detail(
                    'address', text.strip(), 'legislature')
            if 'Tel' in text:
                text = text.replace('Tel. ', '')
                text = text.replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('voice', text, 'legislature')
            if 'Fax' in text:
                text = text.replace('Fax ', '')
                text = text.replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('fax', text, 'legislature')

        mail_xpath = './parent::p//a[contains(@href, "mailto:")]/text()'
        membership.add_contact_detail(
            'email', strong.xpath(mail_xpath)[0], None)

        paragraph = strong.xpath('./parent::p')[0]
        if 'Website' in paragraph.text_content():
            # The second link of the paragraph is used as the website.
            website = strong.xpath('./parent::p//a')[1].attrib['href']
            person.add_link(website, None)
        yield person
def get_people(self):
    """Yield an Organization per linked district page, then its officials.

    Every link in the ``left-content`` / ``right-content`` divs of
    COUNCIL_PAGE is followed.  Contact values are read from the ``<dl>``
    entries that precede the "Officials" entry, and officials are read
    line by line from the "Elected Officials" ``<pre>`` block.

    Contact values are reset for every district, and a contact detail is
    only added when the district page actually provides that value.
    """
    index = lxmlize(COUNCIL_PAGE)
    districts = index.xpath(
        '//div[@id="left-content" or @id="right-content"]//a')

    for district in districts:
        url = district.attrib['href']
        district_name = district.text_content()
        page = lxmlize(url)

        org = Organization(
            name=district_name + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(url)
        yield org

        # Reset per district: otherwise a missing entry raises NameError on
        # the first district or silently reuses the previous district's
        # value on later ones.
        phone = fax = address = email = site = None
        for entry in page.xpath('//div[@style="WIDTH:750"]/dl'):
            contact_type = entry.xpath('./dt')[0].text_content()
            value = entry.xpath('./dd')[0].text_content()
            value = value.replace('(', '').replace(') ', '-')
            if 'Officials' in contact_type:
                break
            if 'Tel' in contact_type:
                phone = value
            if 'Fac' in contact_type:
                fax = value
            if 'Address' in contact_type:
                address = value
            if 'Email' in contact_type:
                email = value
            if 'Website' in contact_type:
                site = value

        officials = page.xpath(
            '//div[@style="WIDTH:750"]/dl'
            '/dt[contains(text(), "Elected Officials")]'
            '/parent::dl/dd/pre/text()')[0].splitlines(True)
        for line in officials:
            name = line
            for title in ('(Mayor)', '(Deputy Mayor)', '(Chairperson)'):
                name = name.replace(title, '')
            name = name.strip()

            # Whatever remains once the name is removed is the role; an
            # empty remainder means a plain councillor.
            role = re.sub(r'\(|\)', '', line.replace(name, '').strip())
            if not role:
                role = 'Councillor'

            p = Legislator(name=name, post_id=district_name)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            membership = p.add_membership(
                org, role=role, post_id=district_name)

            if phone:
                membership.add_contact_detail(
                    'voice', clean_telephone_number(phone), 'legislature')
            if fax:
                membership.add_contact_detail(
                    'fax', clean_telephone_number(fax), 'legislature')
            if address:
                membership.add_contact_detail(
                    'address', clean_address(address), 'legislature')
            if email:
                membership.add_contact_detail('email', email, None)
            if site:
                p.add_link(site, None)
            yield p
def test_basic_invalid_organization():
    """An organization whose name is set to None must fail validation."""
    org = Organization("name")
    org.add_source(url='foo')
    org.validate()  # valid while the name is set

    org.name = None
    with assert_raises(ValidationError):
        org.validate()
def import_jurisdiction(org_importer, jurisdiction):
    """Save a jurisdiction and import its legislature, chambers and parties.

    The jurisdiction's database object is stamped with ``_type``, ``_id``
    and ``latest_update``, validated against ``jurisdiction_schema`` and
    saved to ``db.jurisdictions``.  A ``legislature`` organization named
    after the jurisdiction is imported first; the value returned by that
    import is used as ``parent_id`` of one organization per entry of
    ``jurisdiction.chambers``.  Finally one ``party`` organization is
    imported per entry of ``jurisdiction.parties``.

    Raises:
        ValueError: if the jurisdiction id does not start with
            ``ocd-jurisdiction/``.  Exceptions raised by the validator
            propagate with their original traceback.
    """
    obj = jurisdiction.get_db_object()
    obj['_type'] = 'jurisdiction'
    obj['_id'] = jurisdiction.jurisdiction_id

    if not obj['_id'].startswith("ocd-jurisdiction/"):
        raise ValueError("The Jurisdiction appears to have an ID that does not"
                         " begin with 'ocd-jurisdiction'. I found '%s'" % (
                             jurisdiction.jurisdiction_id))

    obj['latest_update'] = datetime.datetime.utcnow()

    # validate jurisdiction -- no try/except: catching ValueError only to
    # ``raise ve`` again would replace the traceback in Python 2.
    validator = DatetimeValidator()
    validator.validate(obj, jurisdiction_schema)

    db.jurisdictions.save(obj)

    # create organization(s)
    org = Organization(name=jurisdiction.name,
                       classification='legislature',
                       jurisdiction_id=jurisdiction.jurisdiction_id)
    if jurisdiction.other_names:
        org.other_names = jurisdiction.other_names
    if jurisdiction.parent_id:
        org.parent_id = jurisdiction.parent_id
    parent_id = org_importer.import_object(org)

    if jurisdiction.chambers:
        for chamber, properties in jurisdiction.chambers.items():
            org = Organization(name=properties['name'],
                               classification='legislature',
                               chamber=chamber,
                               parent_id=parent_id,
                               jurisdiction_id=jurisdiction.jurisdiction_id)
            org_importer.import_object(org)

    # create parties
    for party in jurisdiction.parties:
        org = Organization(classification='party',
                           name=party['name'],
                           parent_id=None)
        org_importer.import_object(org)
def get_people(self):
    """Yield one Organization per district row, followed by its members.

    District rows are every third ``<tr>`` of the search-content div,
    starting at index 5.  All members of a district share the district's
    address, fax, phone and email.
    """
    doc = lxmlize(COUNCIL_PAGE)
    rows = doc.xpath('//div[@id="ctl00_PublicContent_divSearchContent"]//tr')

    for row in rows[5::3]:
        cell_text = row.xpath('.//td//text()')
        # When the first text node is at most one character long, the
        # first two nodes are joined to form the title.
        if len(cell_text[0]) > 1:
            title = cell_text[0]
        else:
            title = ''.join(cell_text[:2])
        # @todo Need to distinguish between, e.g., R.M. and Town
        title = title.title()

        council = Organization(
            name=title + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        council.add_source(COUNCIL_PAGE)
        yield council

        direct_text = row.xpath('.//td/text()')
        # Address: first four text nodes, minus anything from "Fax:" on.
        address = re.sub(r'(Fax:.*)', '', ' '.join(direct_text[:4])).strip()
        fax_nodes = [node for node in direct_text if 'Fax' in node]
        fax = fax_nodes[0].split(':')[1].strip()
        phone_nodes = row.xpath('.//b[contains(text(), "Phone")]/text()')
        phone = phone_nodes[0].split(':')[1].strip()
        email = row.xpath('.//a[contains(@href, "mailto:")]/text()')[0].strip()

        names = row.xpath('.//td[3]/text()')
        positions = row.xpath('.//td[2]/b/text()')
        for idx, member_name in enumerate(names):
            person = Legislator(name=member_name, post_id=title)
            person.add_source(COUNCIL_PAGE)

            # The first two names take their role from the bold labels;
            # everyone after that is a councillor.
            # @todo "Resident Administrator & Chief Administrative Officer"
            # is split on two lines
            role = 'Councillor' if idx >= 2 else positions[idx]
            membership = person.add_membership(council, role=role)
            membership.post_id = title

            membership.add_contact_detail('address', address, 'legislature')
            membership.add_contact_detail('fax', fax, 'legislature')
            membership.add_contact_detail('voice', phone, 'legislature')
            membership.add_contact_detail('email', email, None)
            yield person
def get_people(self):
    """Yield the Boise City Council, then one Person per linked detail page.

    The first link of the list page is skipped; every other link is loaded
    as a detail page, from which the image, title, name and email are read.
    """
    urls = Urls(dict(list=legislators_url), self)

    council = Organization('Boise City Council')
    council.add_source(legislators_url)
    yield council

    links = urls.list.xpath('//div[@id="content"]/div/a/@href')
    # Skip the mayor because his page has no name or email.
    for url in links[1:]:
        urls.add(detail=url)

        image = urls.detail.xpath('//div[@id="content"]/p/img/@src').pop()
        heading = urls.detail.xpath('//h1/text()').pop()
        heading = heading.replace('Council ', '')
        # First word of the heading is the role, the rest is the name.
        role, _, name = heading.partition(' ')

        person = Person(name, image=image)

        membership = person.add_membership(council, role=role)
        membership.add_source(urls.detail.url)

        mailto = urls.detail.xpath(
            '//a[contains(@href, "mailto")]/@href').pop()
        membership.contact_details.append(
            {'type': 'email', 'value': mailto[7:], 'note': 'work'})

        person.add_source(urls.list.url)
        person.add_source(urls.detail.url)
        yield person
def get_people(self):
    """Yield a sample committee, its subcommittee and one chairman."""
    source = 'https://example.com'

    # committee
    committee = Organization('Technology')
    committee.add_post('Chairman', 'chairman')
    committee.add_source(source)
    yield committee

    # subcommittee
    subcommittee = Organization('Subcommittee on E-Commerce', parent=committee)
    subcommittee.add_source(source)
    yield subcommittee

    chairman = Person('Paul Tagliamonte', district='6', chamber='upper')
    chairman.add_membership(committee, role='chairman')
    chairman.add_source(source)
    yield chairman
def migrate_legislatures(self, state):
    """Create legislature organizations and jurisdictions from billy metadata.

    For every billy metadata document (restricted to ``state`` when it is
    truthy) one Organization is built and saved per chamber, with one
    "Member" post per district of that chamber.  The metadata document is
    then re-keyed by the organization's ``jurisdiction_id`` and saved to
    ``db.jurisdictions``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm that the metadata section belongs
    inside the per-chamber loop.
    """
    spec = {}
    if state:
        spec['_id'] = state

    for metad in self.billy_db.metadata.find(spec, timeout=False):
        abbr = metad['abbreviation']
        geoid = "ocd-division/country:us/state:%s" % (abbr)
        for chamber in metad['chambers']:
            cn = metad['chambers'][chamber]['name']
            cow = Organization("%s, %s" % (metad['legislature_name'], cn),
                               classification="legislature",
                               chamber=chamber,
                               division_id=geoid,
                               abbreviation=abbr)
            # Identifier of the form "<abbr>-<chamber>".
            cow._openstates_id = "%s-%s" % (abbr, chamber)
            cow.add_source(metad['legislature_url'])

            # Only districts belonging to this chamber become posts.
            for post in self.billy_db.districts.find({"abbr": abbr}):
                if post['chamber'] != chamber:
                    continue

                cow.add_post(label="Member", role="member",
                             num_seats=post['num_seats'],
                             id=post['name'])

            self.save_object(cow)

            meta = self.billy_db.metadata.find_one({"_id": cow.abbreviation})
            if meta is None:
                raise Exception

            # Replace the billy _id with the organization's jurisdiction id.
            meta.pop("_id")
            meta['_id'] = cow.jurisdiction_id

            # Drop the download-related keys before saving.
            for badtag in ["latest_json_url", "latest_json_date",
                           "latest_csv_url", "latest_csv_date"]:
                meta.pop(badtag, None)

            meta['division_id'] = "ocd-division/country:us/state:%s" % (
                cow.abbreviation
            )

            db.jurisdictions.save(meta)
def get_people(self):
    """Yield an Organization per district page, followed by its members.

    The first four links of the index list on COUNCIL_PAGE are followed.
    Each leads to a table of district pages, and each district page yields
    one organization plus one Legislator per non-vacant name.  The position
    of the index link selects the organization-name suffix from
    ``org_types``.
    """
    index = lxmlize(COUNCIL_PAGE)
    type_links = index.xpath(
        '//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href')

    for org_type, link in enumerate(type_links[:4]):
        listing = lxmlize(link)
        district_urls = listing.xpath(
            '//div[@class="parbase list section cplist"]'
            '/table/tr/td[1]/b/a/@href')

        for district_url in district_urls:
            page = lxmlize(district_url)
            header = page.xpath('//div[@class="pageHeader"]/h1/text()')[0]
            district = header.split(' - ')[1].strip()

            org = Organization(
                name=district + org_types[org_type],
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(district_url)
            yield org

            address = ', '.join(
                page.xpath('//div[@class="left_contents"]/p[1]/text()'))
            contacts = page.xpath(
                '//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()')
            phone = contacts[0].split(':')[1].strip().replace(' ', '-')
            fax = contacts[1].split(':')[1].strip().replace(' ', '-')

            # email and site stay falsy when the page has no such link.
            email = page.xpath(
                '//div[@class="left_contents"]//a[contains(@href, "mailto:")]')
            if email:
                email = email[0].text_content()
            site = page.xpath(
                '//div[@class="left_contents"]'
                '//a[not(contains(@href,"mailto:"))]')
            if site:
                site = site[0].text_content()

            names = page.xpath('//div[@class="right_contents"]//p/text()')
            for position, member_name in enumerate(names):
                if 'Vacant' in member_name:
                    continue

                person = Legislator(name=member_name, post_id=district)
                for source in (COUNCIL_PAGE, link, district_url):
                    person.add_source(source)

                # The first listed name is the mayor.
                role = 'Mayor' if position == 0 else 'Councillor'
                membership = person.add_membership(org, role=role)
                membership.post_id = district

                membership.add_contact_detail(
                    'address', address, 'legislature')
                if phone:
                    membership.add_contact_detail(
                        'voice', phone, 'legislature')
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if email:
                    membership.add_contact_detail('email', email, None)
                if site:
                    person.add_link(site, None)
                yield person
def get_people(self):
    """Yield an Organization per district of the PDF, then its officials.

    The PDF at COUNCIL_PAGE is written to a temporary file (binary mode,
    removed as soon as the conversion finishes or fails) and converted
    with ``pdftotext -layout``.  The text is grouped into blocks of
    consecutive content lines, and each block is split into a left and a
    right column, every column being one district.  Districts without any
    named official are skipped.
    """
    import tempfile  # local import: only needed for the scratch PDF

    response = urllib2.urlopen(COUNCIL_PAGE).read()
    # NamedTemporaryFile opens in binary mode and deletes the file on close.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as pdf:
        pdf.write(response)
        pdf.flush()
        data = subprocess.check_output(
            ['pdftotext', '-layout', pdf.name, '-'])

    # Group consecutive content lines; blank lines and header lines close
    # the current group.
    pages = []
    page = []
    for line in data.splitlines(True):
        is_content = (line.strip()
                      and 'Page' not in line
                      and 'CITIES' not in line
                      and 'NORTHERN TOWNS, VILLAGES' not in line)
        if is_content:
            page.append(line)
        elif page:
            pages.append(page)
            page = []
    if page:
        # Keep the last group when the text does not end with a separator.
        pages.append(page)

    # Split each group at the first run of six or more whitespace
    # characters found on its first line.
    districts = []
    for page in pages:
        gap = re.search(r'(\s{6,})', page[0])
        index = gap.end() - 1 if gap else -1
        districts.append([line[:index].strip() for line in page])
        districts.append([line[index:].strip() for line in page])

    for district in districts:
        district_name = district.pop(0).split(',')[0].title()

        org = Organization(
            name=district_name + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)

        councillors = []
        contacts = {}
        for i, line in enumerate(district):
            if 'Phone' in line:
                phone = line.split(':')[1]
                phone = phone.replace('(', '').replace(') ', '-').strip()
                if phone:
                    contacts['voice'] = phone
            if 'Fax' in line:
                fax = line.split(':')[1]
                fax = fax.replace('(', '').replace(') ', '-').strip()
                if fax:
                    contacts['fax'] = fax
            if 'E-Mail' in line:
                email = line.split(':')[1].strip()
                if email:
                    contacts['email'] = email
            if 'Address' in line and line.split(':')[1].strip():
                # The address continues on all remaining lines.
                address = (line.split(':')[1].strip() + ', ' +
                           ', '.join(district[i + 1:]).replace(' ,', ''))
                contacts['address'] = address
            if 'Mayor' in line or 'Councillor' in line or 'Alderman' in line:
                councillor = line.split(':')[1]
                for honorific in ('Mr.', 'Mrs.', 'Ms.',
                                  'His Worship', 'Her Worship'):
                    councillor = councillor.replace(honorific, '')
                councillor = councillor.strip()
                role = line.split(':')[0].strip()
                if councillor:
                    councillors.append([councillor, role])

        if not councillors:
            continue
        yield org

        for name, role in councillors:
            p = Legislator(name=name, post_id=district_name)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(
                org, role=role, post_id=district_name)
            for key, value in contacts.items():
                note = None if key == 'email' else 'legislature'
                membership.add_contact_detail(key, value, note)
            yield p
def get_people(self):
    """Yield an Organization and its mayor per row of the directory PDF.

    The "Municipal Directory" PDF linked from COUNCIL_PAGE is written to a
    temporary file (binary mode, removed as soon as the conversion finishes
    or fails) and converted with ``pdftotext -layout``.  On every page the
    header line containing "Official Name" fixes the character columns
    from which district, name, phone, email and address are sliced.  Rows
    without a name or district are skipped.
    """
    import tempfile  # local import: only needed for the scratch PDF

    page = lxmlize(COUNCIL_PAGE)
    url = page.xpath('//a[contains(text(),"Municipal Directory")]/@href')[0]

    response = urllib2.urlopen(url).read()
    # NamedTemporaryFile opens in binary mode and deletes the file on close.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as pdf:
        pdf.write(response)
        pdf.flush()
        data = subprocess.check_output(
            ['pdftotext', '-layout', pdf.name, '-'])

    pages = data.split('Municipal Directory')[1:]
    for page in pages:
        page = page.splitlines(True)

        # Derive column boundaries from the positions of the header labels.
        column_index = {}
        for line in page:
            if 'Official Name' in line:
                column_index['dist_end'] = re.search('Region', line).start()
                column_index['name_start'] = re.search('Mayor', line).start() + 1
                column_index['name_end'] = re.search('Clerk', line).start() - 1
                column_index['phone_start'] = re.search('Line 1', line).start()
                column_index['phone_end'] = re.search('Line 2', line).start() - 1
                column_index['fax_start'] = re.search('Fax', line).start()
                column_index['fax_end'] = re.search('E-mail', line).start() - 2
                column_index['email_start'] = column_index['fax_end'] + 1
                column_index['email_end'] = re.search('Address', line).start() - 1
                column_index['address_start'] = column_index['email_end'] + 1
                column_index['address_end'] = re.search('Days', line).start() - 1
                break

        for line in page:
            if 'Official Name' in line or not line.strip():
                continue

            district = line[:column_index['dist_end']]
            name = line[column_index['name_start']:
                        column_index['name_end']].strip()
            phone = line[column_index['phone_start']:
                         column_index['phone_end']].strip()
            phone = phone.replace('(', '').replace(') ', '-')
            # Fax is deliberately not extracted: that column isn't
            # properly aligned in the PDF text.
            email = line[column_index['email_start']:
                         column_index['email_end']].strip()
            address = line[column_index['address_start']:
                           column_index['address_end']].strip()
            address = re.sub(r'\s{2,}', ', ', address)

            if not name or not district:
                continue

            org = Organization(
                name=district + ' Municipal Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            org.add_source(url)
            yield org

            p = Legislator(name=name, post_id=district)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            membership = p.add_membership(org, role='Mayor', post_id=district)
            if phone:
                membership.add_contact_detail('voice', phone, 'legislature')
            if email:
                membership.add_contact_detail('email', email, None)
            if address:
                membership.add_contact_detail(
                    'address', address, 'legislature')
            yield p
def get_people(self):
    """Yield an Organization per municipality block, then its officials.

    The PDF at COUNCIL_PAGE is written to a temporary file (binary mode,
    removed as soon as the conversion finishes or fails) and converted
    with ``pdftotext -layout``.  The text is split on blank lines, and only
    blocks containing "Councillors" are processed.  Column boundaries are
    derived from the first line of each block.
    """
    import tempfile  # local import: only needed for the scratch PDF

    response = urllib2.urlopen(COUNCIL_PAGE).read()
    # NamedTemporaryFile opens in binary mode and deletes the file on close.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as pdf:
        pdf.write(response)
        pdf.flush()
        data = subprocess.check_output(
            ['pdftotext', '-layout', pdf.name, '-'])

    phone_pattern = r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})'
    fax_pattern = r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})'

    for municipality in re.split(r'\n\s*\n', data):
        if 'Councillors' not in municipality:
            continue

        lines = municipality.split('\n')
        # Drop a leading page marker and a leading blank line.
        if 'Page' in lines[0]:
            lines.pop(0)
        if not lines[0].strip():
            lines.pop(0)

        first = lines[0].strip()
        col1end = re.search(r'\s{2,}(\w)', first).end()
        col2end = re.search(r':\s{2,}(\w)', first).end()

        # A district name may wrap onto a second line containing "Council";
        # the two address lines then start one line later.
        if 'Council' in lines[1]:
            address = (lines[2][:col1end - 1].strip() + ' ' +
                       lines[3][:col1end - 1].strip())
            district = (lines[0][:col1end - 1].strip() + ' ' +
                        lines[1][:col1end - 1].strip())
        else:
            address = (lines[1][:col1end - 1].strip() + ' ' +
                       lines[2][:col1end - 1].strip())
            district = lines[0][:col1end - 1].strip()

        organization = Organization(
            name=district + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        organization.add_source(COUNCIL_PAGE)
        yield organization

        phone = re.findall(phone_pattern, municipality)[0].replace(') ', '-')
        email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
        fax = None
        if 'Fax' in municipality:
            fax = re.findall(fax_pattern, municipality)[0].replace(') ', '-')
        website = None
        if 'Website' in municipality:
            website = re.findall(
                r'((http:\/\/|www.)(\S*))', municipality)[0][0]

        # Names are read from the lines that follow a "Mayor:" or
        # "Councillors" label; the label line itself is skipped.
        councillor_or_mayor = False
        for line in lines:
            if 'Mayor:' in line:
                councillor_or_mayor = True
                role = 'Mayor'
                continue
            if 'Councillors' in line:
                councillor_or_mayor = True
                role = 'Councillor'
                continue
            if not councillor_or_mayor:
                continue

            councillor = line[col1end - 1:col2end - 1].strip()
            if not councillor:
                continue

            p = Legislator(name=councillor, post_id=district)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(
                organization, role=role, post_id=district)
            membership.add_contact_detail('address', address, 'legislature')
            membership.add_contact_detail('voice', phone, 'legislature')
            membership.add_contact_detail('email', email, None)
            if fax:
                membership.add_contact_detail('fax', fax, 'legislature')
            if website:
                p.add_link(website, None)
            yield p
def get_people(self):
    """Yield an Organization per district of the PDF, then its officials.

    The PDF at COUNCIL_PAGE is written to a temporary file (binary mode,
    removed as soon as the conversion finishes or fails) and converted
    with ``pdftotext -layout``.  The text is grouped into blocks of
    consecutive content lines, and each block is split into a left and a
    right column, every column being one district.  Districts without any
    named official are skipped.
    """
    import tempfile  # local import: only needed for the scratch PDF

    response = urllib2.urlopen(COUNCIL_PAGE).read()
    # NamedTemporaryFile opens in binary mode and deletes the file on close.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as pdf:
        pdf.write(response)
        pdf.flush()
        data = subprocess.check_output(
            ['pdftotext', '-layout', pdf.name, '-'])

    # Group consecutive content lines; blank lines and header lines close
    # the current group.
    pages = []
    page = []
    for line in data.splitlines(True):
        is_content = (line.strip()
                      and 'Page' not in line
                      and 'CITIES' not in line
                      and 'NORTHERN TOWNS, VILLAGES' not in line)
        if is_content:
            page.append(line)
        elif page:
            pages.append(page)
            page = []
    if page:
        # Keep the last group when the text does not end with a separator.
        pages.append(page)

    # Split each group at the first run of six or more whitespace
    # characters found on its first line.
    districts = []
    for page in pages:
        gap = re.search(r'(\s{6,})', page[0])
        index = gap.end() - 1 if gap else -1
        districts.append([line[:index].strip() for line in page])
        districts.append([line[index:].strip() for line in page])

    for district in districts:
        district_name = district.pop(0).split(',')[0].title()

        org = Organization(
            name=district_name + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)

        councillors = []
        contacts = {}
        for i, line in enumerate(district):
            if 'Phone' in line:
                phone = line.split(':')[1]
                phone = phone.replace('(', '').replace(') ', '-').strip()
                if phone:
                    contacts['voice'] = phone
            if 'Fax' in line:
                fax = line.split(':')[1]
                fax = fax.replace('(', '').replace(') ', '-').strip()
                if fax:
                    contacts['fax'] = fax
            if 'E-Mail' in line:
                email = line.split(':')[1].strip()
                if email:
                    contacts['email'] = email
            if 'Address' in line and line.split(':')[1].strip():
                # The address continues on all remaining lines.
                address = (line.split(':')[1].strip() + ', ' +
                           ', '.join(district[i + 1:]).replace(' ,', ''))
                contacts['address'] = address
            if 'Mayor' in line or 'Councillor' in line or 'Alderman' in line:
                councillor = line.split(':')[1]
                for honorific in ('Mr.', 'Mrs.', 'Ms.',
                                  'His Worship', 'Her Worship'):
                    councillor = councillor.replace(honorific, '')
                councillor = councillor.strip()
                role = line.split(':')[0].strip()
                if councillor:
                    councillors.append([councillor, role])

        if not councillors:
            continue
        yield org

        for name, role in councillors:
            p = Legislator(name=name, post_id=district_name)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(
                org, role=role, post_id=district_name)
            for key, value in contacts.items():
                note = None if key == 'email' else 'legislature'
                membership.add_contact_detail(key, value, note)
            yield p
def get_people(self):
    """Yield the Denver City Council, then one Person per council member.

    Image URLs, names, detail-page URLs and post ids are collected from the
    list page and paired up positionally with ``zip``.  Phone and email are
    searched for in the contact text of each member's detail page, and a
    contact detail is only added when a match is found.
    """
    urls = Urls(dict(list=legislators_url), self)

    council = Organization('Denver City Council')
    council.add_source(legislators_url)

    # Get image urls, names, detail urls, and districts.
    image_xpath = '//a[contains(@href, "councildistrict")]/img/@src'
    image_urls = urls.list.xpath(image_xpath)

    name_xpath = '//a[contains(@href, "councildistrict")]'
    names = [a.text_content() for a in urls.list.xpath(name_xpath)][:-1]
    names = [name for name in names if name]

    person_urls_xpath = '//a[contains(@href, "councildistrict")]/@href'
    person_urls = urls.list.xpath(person_urls_xpath)

    post_ids = []
    xpath = '//a[contains(@href, "councildistrict")]/img/ancestor::td'
    for td in urls.list.xpath(xpath):
        text = td.text_content()
        m = re.search(r'Council District \d+', text)
        if m:
            post_ids.append(m.group())
            continue
        m = re.search('Council At-Large', text)
        if m:
            post_ids.append('Council At-Large')

    for post_id in post_ids:
        council.add_post(post_id, post_id)
    yield council

    # The email pattern previously in this position was the literal text
    # "[email protected]" (an email-obfuscation artifact), which cannot
    # match a real address; a generic address pattern is used instead.
    phone_regex = r'\(\d{3}\)[ -]*\d{3}-\d{4}'
    email_regex = r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+'

    data = zip(image_urls, names, person_urls, post_ids)
    for image_url, name, person_url, post_id in data:
        # Create legislator.
        person = Person(name, image=image_url)

        # Add sources.
        urls.add(detail=person_url)
        person.add_source(urls.list.url, note='list')
        person.add_source(urls.detail.url, note='detail')

        # Add membership on council.
        memb = person.add_membership(council, post_id=post_id.strip())
        memb.add_source(urls.detail.url)

        # Fall back to the wide right pane when the first column is empty.
        xpath = '//div[@id="dnn_column3"]'
        contact_text = urls.detail.xpath(xpath)[0].text_content()
        if not contact_text.strip():
            xpath = '//div[contains(@id, "dnn_RightPaneWide")]'
            contact_text = urls.detail.xpath(xpath)[0].text_content()

        phone = re.search(phone_regex, contact_text)
        if phone:
            memb.contact_details.append(
                dict(type='phone', value=phone.group(), note='work'))

        # Add email address.
        email = re.search(email_regex, contact_text)
        if email:
            memb.contact_details.append(
                dict(type='email', value=email.group(), note='work'))
        yield person
def test_add_post():
    """Posts and identifiers can be added to an organization on the fly."""
    org = Organization("name")
    org.add_source(url='foo')
    org.validate()

    org.add_post("Human Readable Name", "Chef")
    post = org.posts[0]
    assert post['role'] == "Chef"
    assert post['label'] == "Human Readable Name"

    # Unknown keyword arguments must be rejected.
    with assert_raises(TypeError):
        org.add_identifier("id10t", foo="bar")

    org.add_identifier("id10t")
    org.add_identifier("l0l", scheme="kruft")

    assert org.identifiers[-1]['scheme'] == "kruft"
    first = org.identifiers[0]
    assert first['identifier'] == "id10t"
    # NOTE(review): hasattr() tests attributes, not keys; if identifiers are
    # plain dicts this assertion is always true -- confirm the intent.
    assert not hasattr(first, "scheme")
def migrate_committees(self, state):
    """Copy billy committees and their members into Organization records.

    Top-level committees (``subcommittee`` is None) are saved in a first
    pass so that the second pass, over subcommittees, can resolve a parent
    through ``lookup_entry_id``.  When ``state`` is truthy only that state's
    committees are read; otherwise every committee is migrated.
    """
    def save_membership(person_id, org, term, role):
        # Build one membership starting in the current term's first year,
        # tag it with the term name, persist it and hand it back.
        membership = Membership(person_id, org._id,
                                role=role,
                                chamber=org.chamber,
                                start_date=str(term['start_year']))
        membership.add_extra('term', term['name'])
        self.save_object(membership)
        return membership

    def attach_members(committee, org):
        term = get_current_term(obj_to_jid(org))
        for member in committee['members']:
            person_id = lookup_entry_id('people', member.get('leg_id'))
            if not person_id:
                continue

            # No end date is recorded: these are current members, and a
            # departure would not be known about yet.
            membership = save_membership(person_id, org, term, member['role'])
            if membership.role != 'member':
                # Chairs / vice-chairs are additionally recorded as plain
                # members.
                save_membership(person_id, org, term, 'member')

    query = {"subcommittee": None}
    if state:
        query['state'] = state

    # Root committees go first so their IDs exist for the subcommittee pass.
    for record in self.billy_db.committees.find(query, timeout=False):
        org = Organization(record['committee'], classification="committee")
        org.chamber = record['chamber']
        org.parent_id = lookup_entry_id('organizations', record['state'])
        org.identifiers = [{'scheme': 'openstates',
                            'identifier': record['_id']}]
        org._openstates_id = record['_id']
        org.sources = record['sources']
        org.created_at = record['created_at']
        org.updated_at = record['updated_at']
        # Look into posts; but we can't be sure.
        self.save_object(org)
        attach_members(record, org)

    query["subcommittee"] = {"$ne": None}

    for record in self.billy_db.committees.find(query, timeout=False):
        org = Organization(record['subcommittee'], classification="committee")
        # Prefer the parent committee; fall back to the state organization
        # when the parent cannot be resolved.
        parent_id = lookup_entry_id('organizations', record['parent_id'])
        if not parent_id:
            parent_id = lookup_entry_id('organizations', record['state'])
        org.parent_id = parent_id
        org.identifiers = [{'scheme': 'openstates',
                            'identifier': record['_id']}]
        org._openstates_id = record['_id']
        org.sources = record['sources']
        org.chamber = record['chamber']
        # Look into posts; but we can't be sure.
        self.save_object(org)
        attach_members(record, org)
def get_people(self):
    """Scrape mayors from the "Municipal Directory" PDF linked on COUNCIL_PAGE.

    The PDF is converted to text with ``pdftotext -layout`` and every page
    is parsed as a fixed-width table whose column boundaries come from the
    header row (the line containing "Official Name").  For each row that has
    both a district and a name, yields the district's municipal council
    ``Organization`` followed by a ``Legislator`` holding the role "Mayor".
    """
    import tempfile

    page = lxmlize(COUNCIL_PAGE)
    url = page.xpath('//a[contains(text(),"Municipal Directory")]/@href')[0]
    response = urllib2.urlopen(url).read()

    # The PDF is binary, so it is written in binary mode to a private
    # temporary file that is removed as soon as pdftotext has read it, even
    # if the conversion fails or the generator is never exhausted.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as pdf:
        pdf.write(response)
        pdf.flush()
        data = subprocess.check_output(
            ['pdftotext', '-layout', pdf.name, '-'])

    for page_text in data.split('Municipal Directory')[1:]:
        lines = page_text.splitlines(True)

        # Derive the column boundaries from the positions of the header
        # labels on this page.
        column_index = {}
        for line in lines:
            if 'Official Name' in line:
                column_index['dist_end'] = re.search('Region', line).start()
                column_index['name_start'] = re.search('Mayor', line).start() + 1
                column_index['name_end'] = re.search('Clerk', line).start() - 1
                column_index['phone_start'] = re.search('Line 1', line).start()
                column_index['phone_end'] = re.search('Line 2', line).start() - 1
                # The fax column itself is not read (it is not reliably
                # aligned), but its end marks where the email column starts.
                column_index['fax_end'] = re.search('E-mail', line).start() - 2
                column_index['email_start'] = column_index['fax_end'] + 1
                column_index['email_end'] = re.search('Address', line).start() - 1
                column_index['address_start'] = column_index['email_end'] + 1
                column_index['address_end'] = re.search('Days', line).start() - 1
                break

        if not column_index:
            # No header row on this page, so there is no table to slice.
            continue

        for line in lines:
            if 'Official Name' in line or not line.strip():
                continue

            # Strip the fixed-width padding so it does not leak into the
            # organization name / post id and so blank cells are detected.
            district = line[:column_index['dist_end']].strip()
            name = line[column_index['name_start']:column_index['name_end']].strip()
            phone = line[column_index['phone_start']:column_index['phone_end']].strip().replace('(', '').replace(') ', '-')
            email = line[column_index['email_start']:column_index['email_end']].strip()
            address = line[column_index['address_start']:column_index['address_end']].strip()
            address = re.sub(r'\s{2,}', ', ', address)

            if not name or not district:
                continue

            org = Organization(
                name=district + ' Municipal Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            org.add_source(url)
            yield org

            p = Legislator(name=name, post_id=district)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            membership = p.add_membership(org, role='Mayor', post_id=district)
            if phone:
                membership.add_contact_detail('voice', phone, 'legislature')
            if email:
                membership.add_contact_detail('email', email, None)
            if address:
                membership.add_contact_detail('address', address, 'legislature')
            yield p
def get_people(self):
    """Scrape municipal councils and their members from the COUNCIL_PAGE PDF.

    The PDF is converted to text with ``pdftotext -layout`` and split on
    blank lines into one chunk per municipality; chunks without a
    "Councillors" label are ignored.  For each remaining chunk, yields the
    council ``Organization`` and then a ``Legislator`` for every name found
    in the middle column after the "Mayor:" / "Councillors" labels.  All
    members of a municipality share its address, phone, email and, when
    listed, fax and website.
    """
    import tempfile

    response = urllib2.urlopen(COUNCIL_PAGE).read()

    # The PDF is binary, so it is written in binary mode to a private
    # temporary file that is removed as soon as pdftotext has read it, even
    # if the conversion fails or the generator is never exhausted.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as pdf:
        pdf.write(response)
        pdf.flush()
        data = subprocess.check_output(
            ['pdftotext', '-layout', pdf.name, '-'])

    for municipality in re.split(r'\n\s*\n', data):
        if 'Councillors' not in municipality:
            continue

        lines = municipality.split('\n')
        # Drop a leading page marker and/or blank line.
        if 'Page' in lines[0]:
            lines.pop(0)
        if not lines[0].strip():
            lines.pop(0)

        # Column boundaries are inferred from the whitespace gaps on the
        # first line of the chunk.
        col1end = re.search(r'\s{2,}(\w)', lines[0].strip()).end()
        col2end = re.search(r':\s{2,}(\w)', lines[0].strip()).end()

        # The district name takes two lines when the second one contains
        # "Council"; the two address lines follow it.
        if 'Council' in lines[1]:
            address = lines[2][:col1end - 1].strip() + ' ' + lines[3][:col1end - 1].strip()
            district = lines[0][:col1end - 1].strip() + ' ' + lines[1][:col1end - 1].strip()
        else:
            address = lines[1][:col1end - 1].strip() + ' ' + lines[2][:col1end - 1].strip()
            district = lines[0][:col1end - 1].strip()

        organization = Organization(
            name=district + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        organization.add_source(COUNCIL_PAGE)
        yield organization

        phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
        email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]

        fax = None
        if 'Fax' in municipality:
            fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')

        # Accept http and https URLs as well as bare "www." hosts, and
        # tolerate a "Website" label that is not followed by a usable URL.
        website = None
        if 'Website' in municipality:
            match = re.search(r'(https?://|www\.)\S*', municipality)
            if match:
                website = match.group()

        # Names are only read once a "Mayor:" or "Councillors" label has
        # been seen; the label line itself is skipped, and the most recent
        # label determines the role.
        councillor_or_mayor = False
        for line in lines:
            if 'Mayor:' in line:
                councillor_or_mayor = True
                role = 'Mayor'
                continue
            if 'Councillors' in line:
                councillor_or_mayor = True
                role = 'Councillor'
                continue

            if councillor_or_mayor:
                councillor = line[col1end - 1:col2end - 1].strip()
                if not councillor:
                    continue

                p = Legislator(name=councillor, post_id=district)
                p.add_source(COUNCIL_PAGE)
                membership = p.add_membership(organization, role=role, post_id=district)
                membership.add_contact_detail('address', address, 'legislature')
                membership.add_contact_detail('voice', phone, 'legislature')
                membership.add_contact_detail('email', email, None)
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if website:
                    p.add_link(website, None)
                yield p