def get_people(self):
    """Scrape COUNCIL_PAGE and yield each council followed by its members.

    Every third table row (starting with the sixth) inside the
    ``ctl00_PublicContent_divSearchContent`` div is treated as one district.
    For each district an ``Organization`` is yielded first, then one
    ``Legislator`` per text node found in the row's third cell. All members
    of a district share the district's address, fax, phone and email.

    Raises:
        IndexError: if a row has no text, no "Fax" line, no "Phone" label or
            no mailto link, since the first match is indexed directly.
    """
    root = lxmlize(COUNCIL_PAGE)
    rows = root.xpath('//div[@id="ctl00_PublicContent_divSearchContent"]//tr')[5::3]
    for row in rows:
        cell_text = row.xpath('.//td//text()')
        # When the first text node is a single character, the name is
        # rebuilt from the first two nodes instead.
        if len(cell_text[0]) > 1:
            raw_title = cell_text[0]
        else:
            raw_title = ''.join(cell_text[:2])

        # @todo Need to distinguish between, e.g., R.M. and Town
        title = raw_title.title()
        council = Organization(
            name=title + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        council.add_source(COUNCIL_PAGE)
        yield council

        td_text = row.xpath('.//td/text()')
        # The first four text nodes hold the address; anything from "Fax:"
        # onwards is cut off.
        address = re.sub(r'(Fax:.*)', '', ' '.join(td_text[:4])).strip()
        fax_lines = [text for text in td_text if 'Fax' in text]
        fax = fax_lines[0].split(':')[1].strip()
        phone_label = row.xpath('.//b[contains(text(), "Phone")]/text()')[0]
        phone = phone_label.split(':')[1].strip()
        email = row.xpath('.//a[contains(@href, "mailto:")]/text()')[0].strip()

        names = row.xpath('.//td[3]/text()')
        positions = row.xpath('.//td[2]/b/text()')
        for index, person_name in enumerate(names):
            person = Legislator(name=person_name, post_id=title)
            person.add_source(COUNCIL_PAGE)
            # Only the first two names take their role from the bold labels;
            # everyone after that is a plain councillor.
            # @todo "Resident Administrator & Chief Administrative Officer" is split on two lines
            role = positions[index] if index < 2 else 'Councillor'
            membership = person.add_membership(council, role=role)
            membership.post_id = title
            membership.add_contact_detail('address', address, 'legislature')
            membership.add_contact_detail('fax', fax, 'legislature')
            membership.add_contact_detail('voice', phone, 'legislature')
            membership.add_contact_detail('email', email, None)
            yield person
def get_people(self):
    """Walk type listing -> district pages and yield councils and members.

    The first four links in the first "bluearrow" list on COUNCIL_PAGE are
    followed; each leads to a listing of district pages. For every district
    page an ``Organization`` is yielded (its name is the district plus the
    ``org_types`` entry for the listing's position), followed by one
    ``Legislator`` per text node in the right-hand column. The first name is
    given the role 'Mayor', the rest 'Councillor'; entries containing
    'Vacant' are skipped.
    """
    index_page = lxmlize(COUNCIL_PAGE)
    type_links = index_page.xpath(
        '//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href')[:4]
    for type_index, type_link in enumerate(type_links):
        listing = lxmlize(type_link)
        district_urls = listing.xpath(
            '//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href')
        for district_url in district_urls:
            detail = lxmlize(district_url)
            heading = detail.xpath('//div[@class="pageHeader"]/h1/text()')[0]
            district = heading.split(' - ')[1].strip()

            council = Organization(
                name=district + org_types[type_index],
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            council.add_source(district_url)
            yield council

            address = ', '.join(
                detail.xpath('//div[@class="left_contents"]/p[1]/text()'))
            contact_lines = detail.xpath(
                '//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()')
            # First contact line is the phone, second the fax; spaces become dashes.
            phone = contact_lines[0].split(':')[1].strip().replace(' ', '-')
            fax = contact_lines[1].split(':')[1].strip().replace(' ', '-')

            email_nodes = detail.xpath(
                '//div[@class="left_contents"]//a[contains(@href, "mailto:")]')
            email = email_nodes[0].text_content() if email_nodes else None
            site_nodes = detail.xpath(
                '//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]')
            site = site_nodes[0].text_content() if site_nodes else None

            names = detail.xpath('//div[@class="right_contents"]//p/text()')
            for position, person_name in enumerate(names):
                if 'Vacant' in person_name:
                    continue
                person = Legislator(name=person_name, post_id=district)
                person.add_source(COUNCIL_PAGE)
                person.add_source(type_link)
                person.add_source(district_url)

                role = 'Mayor' if position == 0 else 'Councillor'
                membership = person.add_membership(council, role=role)
                membership.post_id = district
                membership.add_contact_detail('address', address, 'legislature')
                # Optional details are only attached when non-empty.
                if phone:
                    membership.add_contact_detail('voice', phone, 'legislature')
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if email:
                    membership.add_contact_detail('email', email, None)
                if site:
                    person.add_link(site, None)
                yield person
def get_people(self):
    """Yield an Organization and a Legislator for each ``<strong>`` entry.

    Each ``<strong>`` inside a paragraph of the "entry-content" div is read
    as "<role> <first> <last>". The district comes from the nearest
    preceding ``<h2>`` (text before the dash). Entries whose role is empty
    or contains 'SAO' are skipped. Address, phone and fax are taken from the
    text nodes of the enclosing paragraph; the email comes from its first
    mailto link.

    Raises:
        IndexError: if there is no preceding ``<h2>`` or no mailto link.
    """
    root = lxmlize(COUNCIL_PAGE)
    for strong in root.xpath('//div[@class="entry-content"]//p/strong'):
        heading = strong.xpath('./ancestor::p/preceding-sibling::h2')[-1]
        district = heading.text_content().split('–'.decode('utf-8'))[0]

        label = strong.text_content()
        # The name is taken to be the last two whitespace-separated words.
        # NOTE(review): the literal removed below looks like a mis-encoded
        # dash + non-breaking space -- confirm against the source page.
        name = ' '.join(label.split()[-2:]).replace('-Â'.decode('utf-8'), '')
        role = label.replace(name, '').split('-')[0]
        if not role or 'SAO' in role:
            continue

        council = Organization(
            name=district + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        council.add_source(COUNCIL_PAGE)
        yield council

        person = Legislator(name=name, post_id=district)
        person.add_source(COUNCIL_PAGE)
        membership = person.add_membership(council, role=role, post_id=district)

        for text in strong.xpath('./ancestor::p/text()'):
            # The three checks are independent: one text node may match
            # more than one of them.
            if 'NT' in text:
                membership.add_contact_detail('address', text.strip(), 'legislature')
            if 'Tel' in text:
                text = text.replace('Tel. ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('voice', text, 'legislature')
            if 'Fax' in text:
                text = text.replace('Fax ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('fax', text, 'legislature')

        email = strong.xpath('./parent::p//a[contains(@href, "mailto:")]/text()')[0]
        membership.add_contact_detail('email', email, None)

        paragraph = strong.xpath('./parent::p')[0]
        if 'Website' in paragraph.text_content():
            # The second link in the paragraph is used as the website.
            website = strong.xpath('./parent::p//a')[1].attrib['href']
            person.add_link(website, None)
        yield person
def get_people(self):
    """Scrape COUNCIL_PAGE and yield each council followed by its members.

    Every third table row (starting with the sixth) inside the
    ``ctl00_PublicContent_divSearchContent`` div is treated as one district.
    For each district an ``Organization`` is yielded first, then one
    ``Legislator`` per text node found in the row's third cell. All members
    of a district share the district's address, fax, phone and email.

    Raises:
        IndexError: if a row has no text, no "Fax" line, no "Phone" label or
            no mailto link, since the first match is indexed directly.
    """
    root = lxmlize(COUNCIL_PAGE)
    rows = root.xpath('//div[@id="ctl00_PublicContent_divSearchContent"]//tr')[5::3]
    for row in rows:
        cell_text = row.xpath('.//td//text()')
        # When the first text node is a single character, the name is
        # rebuilt from the first two nodes instead.
        if len(cell_text[0]) > 1:
            raw_title = cell_text[0]
        else:
            raw_title = ''.join(cell_text[:2])

        # @todo Need to distinguish between, e.g., R.M. and Town
        title = raw_title.title()
        council = Organization(
            name=title + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        council.add_source(COUNCIL_PAGE)
        yield council

        td_text = row.xpath('.//td/text()')
        # The first four text nodes hold the address; anything from "Fax:"
        # onwards is cut off.
        address = re.sub(r'(Fax:.*)', '', ' '.join(td_text[:4])).strip()
        fax_lines = [text for text in td_text if 'Fax' in text]
        fax = fax_lines[0].split(':')[1].strip()
        phone_label = row.xpath('.//b[contains(text(), "Phone")]/text()')[0]
        phone = phone_label.split(':')[1].strip()
        email = row.xpath('.//a[contains(@href, "mailto:")]/text()')[0].strip()

        names = row.xpath('.//td[3]/text()')
        positions = row.xpath('.//td[2]/b/text()')
        for index, person_name in enumerate(names):
            person = Legislator(name=person_name, post_id=title)
            person.add_source(COUNCIL_PAGE)
            # Only the first two names take their role from the bold labels;
            # everyone after that is a plain councillor.
            # @todo "Resident Administrator & Chief Administrative Officer" is split on two lines
            role = positions[index] if index < 2 else 'Councillor'
            membership = person.add_membership(council, role=role)
            membership.post_id = title
            membership.add_contact_detail('address', address, 'legislature')
            membership.add_contact_detail('fax', fax, 'legislature')
            membership.add_contact_detail('voice', phone, 'legislature')
            membership.add_contact_detail('email', email, None)
            yield person
def get_people(self):
    """Download the council PDF, convert it to text and yield mayors.

    The PDF at COUNCIL_PAGE is written to a scratch file, converted with
    ``pdftotext`` and split on "Mayor " / "Warden ". Each section is read
    line by line: name, district (one or two lines), address (two to four
    lines), phone and, if present, fax. An ``Organization`` is yielded for
    each section, followed by a ``Legislator`` with the role 'Mayor'.

    Changes from the previous implementation:
      * the scratch file is written in binary mode and removed as soon as
        the text has been extracted, even when ``pdftotext`` fails or the
        generator is not fully consumed;
      * ``fax`` is reset for every section (it used to be undefined or to
        carry over from the previous section when no "Fax" line followed
        the phone line) and is stripped of its trailing newline;
      * the e-mail pool is no longer mutated while being iterated, which
        used to skip the entry following every match.

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
        IndexError: if a section has fewer lines than expected.
    """
    response = urllib2.urlopen(COUNCIL_PAGE).read()
    pdf_path = '/tmp/ns.pdf'
    # Binary mode: the payload is a PDF, not text.
    with open(pdf_path, 'wb') as pdf:
        pdf.write(response)
    try:
        data = subprocess.check_output(['pdftotext', pdf_path, '-'])
    finally:
        # Everything needed is in memory now; never leave the file behind.
        os.remove(pdf_path)

    emails = re.findall(r'(?<=E-mail: ).+', data)
    sections = re.split(r'Mayor |Warden ', data)[1:]
    for section in sections:
        lines = section.splitlines(True)
        name = lines.pop(0).strip()
        if name == "Jim Smith":
            continue
        district = lines.pop(0).strip()
        # A district name may wrap onto a second line; a line containing a
        # digit is taken to be the start of the address instead.
        if not re.findall(r'[0-9]', lines[0]):
            district = district + ' ' + lines.pop(0).strip()

        org = Organization(
            name=district + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)
        yield org

        p = Legislator(name=name, post_id=district)
        p.add_source(COUNCIL_PAGE)
        membership = p.add_membership(org, role='Mayor', post_id=district)

        # The address is at least two lines and at most four, ending at the
        # line that contains "Phone".
        address = lines.pop(0).strip() + ', ' + lines.pop(0).strip()
        if 'Phone' not in lines[0]:
            address = address + ', ' + lines.pop(0).strip()
        if 'Phone' not in lines[0]:
            address = address + ', ' + lines.pop(0).strip()

        phone = lines.pop(0).split(':')[1].strip()
        fax = None
        # The line after the phone is consumed either way; when it mentions
        # "Fax", the line following it is taken as the fax value.
        if 'Fax' in lines.pop(0):
            fax = lines.pop(0).strip()

        membership.add_contact_detail('address', address, 'legislature')
        membership.add_contact_detail('voice', phone, 'legislature')
        if fax:
            membership.add_contact_detail('fax', fax, 'legislature')

        # @todo emails are being assigned incorrectly, e.g. Town of Berwick picks
        # up Cape Breton Regional Municipality and Region of Queens Municipality
        pattern = name.split()[-1].lower() + '|' + '|'.join(district.split()[-2:]).replace('of', '').lower()
        pattern = pattern.replace('||', '|')
        unclaimed = []
        for email in emails:
            if re.search(pattern, email):
                membership.add_contact_detail('email', email, None)
            else:
                unclaimed.append(email)
        # Matched addresses leave the pool so later sections cannot reuse them.
        emails = unclaimed
        yield p
def get_people(self):
    """Yield an Organization and a Legislator for each ``<strong>`` entry.

    Each ``<strong>`` inside a paragraph of the "entry-content" div is read
    as "<role> <first> <last>". The district comes from the nearest
    preceding ``<h2>`` (text before the dash). Entries whose role is empty
    or contains 'SAO' are skipped. Address, phone and fax are taken from the
    text nodes of the enclosing paragraph; the email comes from its first
    mailto link.

    Raises:
        IndexError: if there is no preceding ``<h2>`` or no mailto link.
    """
    root = lxmlize(COUNCIL_PAGE)
    for strong in root.xpath('//div[@class="entry-content"]//p/strong'):
        heading = strong.xpath('./ancestor::p/preceding-sibling::h2')[-1]
        district = heading.text_content().split('–'.decode('utf-8'))[0]

        label = strong.text_content()
        # The name is taken to be the last two whitespace-separated words.
        # NOTE(review): the literal removed below looks like a mis-encoded
        # dash + non-breaking space -- confirm against the source page.
        name = ' '.join(label.split()[-2:]).replace('-Â'.decode('utf-8'), '')
        role = label.replace(name, '').split('-')[0]
        if not role or 'SAO' in role:
            continue

        council = Organization(
            name=district + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        council.add_source(COUNCIL_PAGE)
        yield council

        person = Legislator(name=name, post_id=district)
        person.add_source(COUNCIL_PAGE)
        membership = person.add_membership(council, role=role, post_id=district)

        for text in strong.xpath('./ancestor::p/text()'):
            # The three checks are independent: one text node may match
            # more than one of them.
            if 'NT' in text:
                membership.add_contact_detail('address', text.strip(), 'legislature')
            if 'Tel' in text:
                text = text.replace('Tel. ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('voice', text, 'legislature')
            if 'Fax' in text:
                text = text.replace('Fax ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('fax', text, 'legislature')

        email = strong.xpath('./parent::p//a[contains(@href, "mailto:")]/text()')[0]
        membership.add_contact_detail('email', email, None)

        paragraph = strong.xpath('./parent::p')[0]
        if 'Website' in paragraph.text_content():
            # The second link in the paragraph is used as the website.
            website = strong.xpath('./parent::p//a')[1].attrib['href']
            person.add_link(website, None)
        yield person
def get_people(self):
    """Follow every district link on COUNCIL_PAGE and yield its council.

    For each link in the left/right content divs the district page is
    fetched, an ``Organization`` is yielded, and the ``<dl>`` blocks are
    scanned for phone, fax, address, email and website until the
    "Officials" block is reached. Each line of the "Elected Officials"
    ``<pre>`` block then becomes a ``Legislator``; a "(Mayor)",
    "(Deputy Mayor)" or "(Chairperson)" suffix supplies the role, and
    everyone else is a 'Councillor'.

    Changes from the previous implementation:
      * contact details are reset for every district. They used to persist
        across iterations, so a district lacking e.g. a fax inherited the
        previous district's value, or raised ``NameError`` on the first
        district;
      * a contact detail is only attached when the page supplied it;
      * blank lines in the officials block no longer produce a legislator
        with an empty name.

    Raises:
        IndexError: if a district page has no "Elected Officials" block.
    """
    index_page = lxmlize(COUNCIL_PAGE)
    districts = index_page.xpath('//div[@id="left-content" or @id="right-content"]//a')
    for district in districts:
        url = district.attrib['href']
        district_name = district.text_content()
        page = lxmlize(url)

        org = Organization(
            name=district_name + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(url)
        yield org

        # Start from a clean slate so nothing leaks in from the last district.
        phone = fax = address = email = site = None
        for entry in page.xpath('//div[@style="WIDTH:750"]/dl'):
            contact_type = entry.xpath('./dt')[0].text_content()
            contact = entry.xpath('./dd')[0].text_content().replace('(', '').replace(') ', '-')
            # Contact blocks precede the officials block; stop once it is reached.
            if 'Officials' in contact_type:
                break
            if 'Tel' in contact_type:
                phone = contact
            if 'Fac' in contact_type:
                fax = contact
            if 'Address' in contact_type:
                address = contact
            if 'Email' in contact_type:
                email = contact
            if 'Website' in contact_type:
                site = contact

        councillors = page.xpath(
            '//div[@style="WIDTH:750"]/dl/dt[contains(text(), "Elected Officials")]'
            '/parent::dl/dd/pre/text()')[0].splitlines(True)
        for councillor in councillors:
            name = councillor.replace('(Mayor)', '').replace('(Deputy Mayor)', '').replace('(Chairperson)', '').strip()
            if not name:
                continue
            # Whatever remains after removing the name is the bracketed role.
            role = re.sub(r'\(|\)', '', councillor.replace(name, '').strip())
            if not role:
                role = 'Councillor'

            p = Legislator(name=name, post_id=district_name)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            membership = p.add_membership(org, role=role, post_id=district_name)
            if phone:
                membership.add_contact_detail('voice', clean_telephone_number(phone), 'legislature')
            if fax:
                membership.add_contact_detail('fax', clean_telephone_number(fax), 'legislature')
            if address:
                membership.add_contact_detail('address', clean_address(address), 'legislature')
            if email:
                membership.add_contact_detail('email', email, None)
            if site:
                p.add_link(site, None)
            yield p
def get_people(self):
    """Walk type listing -> district pages and yield councils and members.

    The first four links in the first "bluearrow" list on COUNCIL_PAGE are
    followed; each leads to a listing of district pages. For every district
    page an ``Organization`` is yielded (its name is the district plus the
    ``org_types`` entry for the listing's position), followed by one
    ``Legislator`` per text node in the right-hand column. The first name is
    given the role 'Mayor', the rest 'Councillor'; entries containing
    'Vacant' are skipped.
    """
    index_page = lxmlize(COUNCIL_PAGE)
    type_links = index_page.xpath(
        '//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href')[:4]
    for type_index, type_link in enumerate(type_links):
        listing = lxmlize(type_link)
        district_urls = listing.xpath(
            '//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href')
        for district_url in district_urls:
            detail = lxmlize(district_url)
            heading = detail.xpath('//div[@class="pageHeader"]/h1/text()')[0]
            district = heading.split(' - ')[1].strip()

            council = Organization(
                name=district + org_types[type_index],
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            council.add_source(district_url)
            yield council

            address = ', '.join(
                detail.xpath('//div[@class="left_contents"]/p[1]/text()'))
            contact_lines = detail.xpath(
                '//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()')
            # First contact line is the phone, second the fax; spaces become dashes.
            phone = contact_lines[0].split(':')[1].strip().replace(' ', '-')
            fax = contact_lines[1].split(':')[1].strip().replace(' ', '-')

            email_nodes = detail.xpath(
                '//div[@class="left_contents"]//a[contains(@href, "mailto:")]')
            email = email_nodes[0].text_content() if email_nodes else None
            site_nodes = detail.xpath(
                '//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]')
            site = site_nodes[0].text_content() if site_nodes else None

            names = detail.xpath('//div[@class="right_contents"]//p/text()')
            for position, person_name in enumerate(names):
                if 'Vacant' in person_name:
                    continue
                person = Legislator(name=person_name, post_id=district)
                person.add_source(COUNCIL_PAGE)
                person.add_source(type_link)
                person.add_source(district_url)

                role = 'Mayor' if position == 0 else 'Councillor'
                membership = person.add_membership(council, role=role)
                membership.post_id = district
                membership.add_contact_detail('address', address, 'legislature')
                # Optional details are only attached when non-empty.
                if phone:
                    membership.add_contact_detail('voice', phone, 'legislature')
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if email:
                    membership.add_contact_detail('email', email, None)
                if site:
                    person.add_link(site, None)
                yield person
def get_people(self):
    """Parse the "Municipal Directory" PDF and yield one mayor per row.

    The PDF linked from COUNCIL_PAGE is downloaded, converted with
    ``pdftotext -layout`` and split into pages on the text
    'Municipal Directory'. On every page the header line containing
    'Official Name' fixes the character columns for district, mayor name,
    phone, email and address; every other non-blank line is sliced with
    those columns. Rows with an empty name or district are skipped. For
    each remaining row an ``Organization`` and then a ``Legislator`` with
    the role 'Mayor' are yielded.

    Changes from the previous implementation:
      * the scratch file is written in binary mode and removed as soon as
        the text has been extracted, even when ``pdftotext`` fails or the
        generator is not fully consumed;
      * the district slice is stripped, so the "empty district" guard
        actually fires and names no longer carry column padding;
      * the unused ``fax`` local was removed (fax is deliberately not
        emitted, see the comment in the loop).

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
        KeyError: if a page has data lines but no 'Official Name' header.
    """
    page = lxmlize(COUNCIL_PAGE)
    url = page.xpath('//a[contains(text(),"Municipal Directory")]/@href')[0]
    response = urllib2.urlopen(url).read()
    pdf_path = '/tmp/nl.pdf'
    # Binary mode: the payload is a PDF, not text.
    with open(pdf_path, 'wb') as pdf:
        pdf.write(response)
    try:
        data = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-'])
    finally:
        # Everything needed is in memory now; never leave the file behind.
        os.remove(pdf_path)

    pages = data.split('Municipal Directory')[1:]
    for page in pages:
        page = page.splitlines(True)

        # Column boundaries are derived from where the header labels start.
        column_index = {}
        for line in page:
            if 'Official Name' in line:
                column_index['dist_end'] = re.search('Region', line).start()
                column_index['name_start'] = re.search('Mayor', line).start() + 1
                column_index['name_end'] = re.search('Clerk', line).start() - 1
                column_index['phone_start'] = re.search('Line 1', line).start()
                column_index['phone_end'] = re.search('Line 2', line).start() - 1
                # fax_start is kept only for the disabled fax column below.
                column_index['fax_start'] = re.search('Fax', line).start()
                column_index['fax_end'] = re.search('E-mail', line).start() - 2
                column_index['email_start'] = column_index['fax_end'] + 1
                column_index['email_end'] = re.search('Address', line).start() - 1
                column_index['address_start'] = column_index['email_end'] + 1
                column_index['address_end'] = re.search('Days', line).start() - 1
                break

        for line in page:
            if 'Official Name' in line or not line.strip():
                continue
            district = line[:column_index['dist_end']].strip()
            name = line[column_index['name_start']:column_index['name_end']].strip()
            phone = line[column_index['phone_start']:column_index['phone_end']].strip()
            phone = phone.replace('(', '').replace(') ', '-')
            email = line[column_index['email_start']:column_index['email_end']].strip()
            address = line[column_index['address_start']:column_index['address_end']].strip()
            # Runs of spaces inside the address cell separate its parts.
            address = re.sub(r'\s{2,}', ', ', address)

            if not name or not district:
                continue

            org = Organization(
                name=district + ' Municipal Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            org.add_source(url)
            yield org

            p = Legislator(name=name, post_id=district)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            membership = p.add_membership(org, role='Mayor', post_id=district)
            if phone:
                membership.add_contact_detail('voice', phone, 'legislature')
            # Fax is excluded because that column isn't properly aligned.
            if email:
                membership.add_contact_detail('email', email, None)
            if address:
                membership.add_contact_detail('address', address, 'legislature')
            yield p
def get_people(self):
    """Parse the council PDF and yield each council with its members.

    The PDF at COUNCIL_PAGE is converted with ``pdftotext -layout`` and
    split on blank lines. Only chunks mentioning 'Councillors' are used.
    The first line of a chunk fixes two column boundaries: the first
    column holds the district name and address, the second the member
    names. An ``Organization`` is yielded per chunk, then a ``Legislator``
    for every non-empty second-column cell from the 'Mayor:' line onwards.
    Names after the 'Councillors' line get the role 'Councillor'.

    Changes from the previous implementation: the scratch file is written
    in binary mode and removed as soon as the text has been extracted,
    even when ``pdftotext`` fails or the generator is not fully consumed
    (it used to be deleted only after the last item was consumed).

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
        IndexError: if a chunk has no phone number or e-mail address in the
            expected format.
        AttributeError: if the first line of a chunk lacks the expected
            column gaps.
    """
    response = urllib2.urlopen(COUNCIL_PAGE).read()
    pdf_path = '/tmp/yt.pdf'
    # Binary mode: the payload is a PDF, not text.
    with open(pdf_path, 'wb') as pdf:
        pdf.write(response)
    try:
        data = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-'])
    finally:
        # Everything needed is in memory now; never leave the file behind.
        os.remove(pdf_path)

    for municipality in re.split(r'\n\s*\n', data):
        if 'Councillors' not in municipality:
            continue
        lines = municipality.split('\n')
        # Drop a leading page marker and any blank line that follows it.
        if 'Page' in lines[0]:
            lines.pop(0)
            if not lines[0].strip():
                lines.pop(0)

        header = lines[0].strip()
        col1end = re.search(r'\s{2,}(\w)', header).end()
        col2end = re.search(r':\s{2,}(\w)', header).end()

        # A district name may wrap onto a second line; the address then
        # starts one line later.
        if 'Council' in lines[1]:
            address = lines[2][:col1end - 1].strip() + ' ' + lines[3][:col1end - 1].strip()
            district = lines[0][:col1end - 1].strip() + ' ' + lines[1][:col1end - 1].strip()
        else:
            address = lines[1][:col1end - 1].strip() + ' ' + lines[2][:col1end - 1].strip()
            district = lines[0][:col1end - 1].strip()

        organization = Organization(
            name=district + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        organization.add_source(COUNCIL_PAGE)
        yield organization

        phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
        email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
        fax = None
        if 'Fax' in municipality:
            fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
        website = None
        if 'Website' in municipality:
            website = re.findall(r'((http:\/\/|www.)(\S*))', municipality)[0][0]

        # Nothing is emitted until the 'Mayor:' or 'Councillors' label is seen.
        councillor_or_mayor = False
        for line in lines:
            if 'Mayor:' in line:
                councillor_or_mayor = True
                role = 'Mayor'
                continue
            if 'Councillors' in line:
                councillor_or_mayor = True
                role = 'Councillor'
                continue
            if councillor_or_mayor:
                councillor = line[col1end - 1:col2end - 1].strip()
                if not councillor:
                    continue
                p = Legislator(name=councillor, post_id=district)
                p.add_source(COUNCIL_PAGE)
                membership = p.add_membership(organization, role=role, post_id=district)
                membership.add_contact_detail('address', address, 'legislature')
                membership.add_contact_detail('voice', phone, 'legislature')
                membership.add_contact_detail('email', email, None)
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if website:
                    p.add_link(website, None)
                yield p
def get_people(self):
    """Parse the council PDF and yield each council with its members.

    The PDF at COUNCIL_PAGE is converted with ``pdftotext -layout`` and
    split on blank lines. Only chunks mentioning 'Councillors' are used.
    The first line of a chunk fixes two column boundaries: the first
    column holds the district name and address, the second the member
    names. An ``Organization`` is yielded per chunk, then a ``Legislator``
    for every non-empty second-column cell from the 'Mayor:' line onwards.
    Names after the 'Councillors' line get the role 'Councillor'.

    Changes from the previous implementation: the scratch file is written
    in binary mode and removed as soon as the text has been extracted,
    even when ``pdftotext`` fails or the generator is not fully consumed
    (it used to be deleted only after the last item was consumed).

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
        IndexError: if a chunk has no phone number or e-mail address in the
            expected format.
        AttributeError: if the first line of a chunk lacks the expected
            column gaps.
    """
    response = urllib2.urlopen(COUNCIL_PAGE).read()
    pdf_path = '/tmp/yt.pdf'
    # Binary mode: the payload is a PDF, not text.
    with open(pdf_path, 'wb') as pdf:
        pdf.write(response)
    try:
        data = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-'])
    finally:
        # Everything needed is in memory now; never leave the file behind.
        os.remove(pdf_path)

    for municipality in re.split(r'\n\s*\n', data):
        if 'Councillors' not in municipality:
            continue
        lines = municipality.split('\n')
        # Drop a leading page marker and any blank line that follows it.
        if 'Page' in lines[0]:
            lines.pop(0)
            if not lines[0].strip():
                lines.pop(0)

        header = lines[0].strip()
        col1end = re.search(r'\s{2,}(\w)', header).end()
        col2end = re.search(r':\s{2,}(\w)', header).end()

        # A district name may wrap onto a second line; the address then
        # starts one line later.
        if 'Council' in lines[1]:
            address = lines[2][:col1end - 1].strip() + ' ' + lines[3][:col1end - 1].strip()
            district = lines[0][:col1end - 1].strip() + ' ' + lines[1][:col1end - 1].strip()
        else:
            address = lines[1][:col1end - 1].strip() + ' ' + lines[2][:col1end - 1].strip()
            district = lines[0][:col1end - 1].strip()

        organization = Organization(
            name=district + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        organization.add_source(COUNCIL_PAGE)
        yield organization

        phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
        email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
        fax = None
        if 'Fax' in municipality:
            fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
        website = None
        if 'Website' in municipality:
            website = re.findall(r'((http:\/\/|www.)(\S*))', municipality)[0][0]

        # Nothing is emitted until the 'Mayor:' or 'Councillors' label is seen.
        councillor_or_mayor = False
        for line in lines:
            if 'Mayor:' in line:
                councillor_or_mayor = True
                role = 'Mayor'
                continue
            if 'Councillors' in line:
                councillor_or_mayor = True
                role = 'Councillor'
                continue
            if councillor_or_mayor:
                councillor = line[col1end - 1:col2end - 1].strip()
                if not councillor:
                    continue
                p = Legislator(name=councillor, post_id=district)
                p.add_source(COUNCIL_PAGE)
                membership = p.add_membership(organization, role=role, post_id=district)
                membership.add_contact_detail('address', address, 'legislature')
                membership.add_contact_detail('voice', phone, 'legislature')
                membership.add_contact_detail('email', email, None)
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if website:
                    p.add_link(website, None)
                yield p
def get_people(self):
    """Parse the "Municipal Directory" PDF and yield one mayor per row.

    The PDF linked from COUNCIL_PAGE is downloaded, converted with
    ``pdftotext -layout`` and split into pages on the text
    'Municipal Directory'. On every page the header line containing
    'Official Name' fixes the character columns for district, mayor name,
    phone, email and address; every other non-blank line is sliced with
    those columns. Rows with an empty name or district are skipped. For
    each remaining row an ``Organization`` and then a ``Legislator`` with
    the role 'Mayor' are yielded.

    Changes from the previous implementation:
      * the scratch file is written in binary mode and removed as soon as
        the text has been extracted, even when ``pdftotext`` fails or the
        generator is not fully consumed;
      * the district slice is stripped, so the "empty district" guard
        actually fires and names no longer carry column padding;
      * the unused ``fax`` local was removed (fax is deliberately not
        emitted, see the comment in the loop).

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
        KeyError: if a page has data lines but no 'Official Name' header.
    """
    page = lxmlize(COUNCIL_PAGE)
    url = page.xpath('//a[contains(text(),"Municipal Directory")]/@href')[0]
    response = urllib2.urlopen(url).read()
    pdf_path = '/tmp/nl.pdf'
    # Binary mode: the payload is a PDF, not text.
    with open(pdf_path, 'wb') as pdf:
        pdf.write(response)
    try:
        data = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-'])
    finally:
        # Everything needed is in memory now; never leave the file behind.
        os.remove(pdf_path)

    pages = data.split('Municipal Directory')[1:]
    for page in pages:
        page = page.splitlines(True)

        # Column boundaries are derived from where the header labels start.
        column_index = {}
        for line in page:
            if 'Official Name' in line:
                column_index['dist_end'] = re.search('Region', line).start()
                column_index['name_start'] = re.search('Mayor', line).start() + 1
                column_index['name_end'] = re.search('Clerk', line).start() - 1
                column_index['phone_start'] = re.search('Line 1', line).start()
                column_index['phone_end'] = re.search('Line 2', line).start() - 1
                # fax_start is kept only for the disabled fax column below.
                column_index['fax_start'] = re.search('Fax', line).start()
                column_index['fax_end'] = re.search('E-mail', line).start() - 2
                column_index['email_start'] = column_index['fax_end'] + 1
                column_index['email_end'] = re.search('Address', line).start() - 1
                column_index['address_start'] = column_index['email_end'] + 1
                column_index['address_end'] = re.search('Days', line).start() - 1
                break

        for line in page:
            if 'Official Name' in line or not line.strip():
                continue
            district = line[:column_index['dist_end']].strip()
            name = line[column_index['name_start']:column_index['name_end']].strip()
            phone = line[column_index['phone_start']:column_index['phone_end']].strip()
            phone = phone.replace('(', '').replace(') ', '-')
            email = line[column_index['email_start']:column_index['email_end']].strip()
            address = line[column_index['address_start']:column_index['address_end']].strip()
            # Runs of spaces inside the address cell separate its parts.
            address = re.sub(r'\s{2,}', ', ', address)

            if not name or not district:
                continue

            org = Organization(
                name=district + ' Municipal Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            org.add_source(url)
            yield org

            p = Legislator(name=name, post_id=district)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            membership = p.add_membership(org, role='Mayor', post_id=district)
            if phone:
                membership.add_contact_detail('voice', phone, 'legislature')
            # Fax is excluded because that column isn't properly aligned.
            if email:
                membership.add_contact_detail('email', email, None)
            if address:
                membership.add_contact_detail('address', address, 'legislature')
            yield p
def get_people(self):
    """Parse the two-column council PDF and yield councils and members.

    The PDF at COUNCIL_PAGE is converted with ``pdftotext -layout``.
    Consecutive non-blank lines (ignoring lines containing 'Page', 'CITIES'
    or 'NORTHERN TOWNS, VILLAGES') form a group. Each group is cut in two
    at the first run of six or more whitespace characters on its first
    line, giving a left and a right district. The first line of a district
    is its name; the remaining lines are scanned for phone, fax, e-mail,
    address and 'Mayor' / 'Councillor' / 'Alderman' entries. Districts with
    no named members are skipped; for the others the ``Organization`` is
    yielded followed by one ``Legislator`` per member, all sharing the
    district's contact details.

    Changes from the previous implementation:
      * the scratch file is written in binary mode and removed as soon as
        the text has been extracted, even when ``pdftotext`` fails or the
        generator is not fully consumed;
      * a final group that is not followed by a separator line is no
        longer dropped;
      * ``dict.items()`` replaces ``iteritems()``; the result is the same.

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
        IndexError: if a labelled line (e.g. one containing 'Phone') has
            no colon.
    """
    response = urllib2.urlopen(COUNCIL_PAGE).read()
    pdf_path = '/tmp/sk.pdf'
    # Binary mode: the payload is a PDF, not text.
    with open(pdf_path, 'wb') as pdf:
        pdf.write(response)
    try:
        data = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-'])
    finally:
        # Everything needed is in memory now; never leave the file behind.
        os.remove(pdf_path)

    # Group consecutive content lines; separator lines close a group.
    pages = []
    page = []
    for line in data.splitlines(True):
        is_content = (line.strip()
                      and 'Page' not in line
                      and 'CITIES' not in line
                      and 'NORTHERN TOWNS, VILLAGES' not in line)
        if is_content:
            page.append(line)
        elif page:
            pages.append(page)
            page = []
    if page:
        # Flush the trailing group when the output ends without a separator.
        pages.append(page)

    # Split every group into its left and right column.
    districts = []
    for page in pages:
        gap = re.search(r'(\s{6,})', page[0])
        index = gap.end() - 1 if gap else -1
        districts.append([line[:index].strip() for line in page])
        districts.append([line[index:].strip() for line in page])

    for district in districts:
        district_name = district.pop(0).split(',')[0].title()
        org = Organization(
            name=district_name + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)

        councillors = []
        contacts = {}
        for i, line in enumerate(district):
            if 'Phone' in line:
                phone = line.split(':')[1].replace('(', '').replace(') ', '-').strip()
                if phone:
                    contacts['voice'] = phone
            if 'Fax' in line:
                fax = line.split(':')[1].replace('(', '').replace(') ', '-').strip()
                if fax:
                    contacts['fax'] = fax
            if 'E-Mail' in line:
                email = line.split(':')[1].strip()
                if email:
                    contacts['email'] = email
            if 'Address' in line and line.split(':')[1].strip():
                # The address runs from this line to the end of the district.
                address = line.split(':')[1].strip() + ', ' + ', '.join(district[i + 1:]).replace(' ,', '')
                contacts['address'] = address
            if 'Mayor' in line or 'Councillor' in line or 'Alderman' in line:
                councillor = line.split(':')[1].replace('Mr.', '').replace('Mrs.', '').replace('Ms.', '').replace('His Worship', '').replace('Her Worship', '').strip()
                role = line.split(':')[0].strip()
                if councillor:
                    councillors.append([councillor, role])

        if not councillors:
            continue
        yield org
        for councillor in councillors:
            p = Legislator(name=councillor[0], post_id=district_name)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(org, role=councillor[1], post_id=district_name)
            for key, value in contacts.items():
                membership.add_contact_detail(key, value, None if key == 'email' else 'legislature')
            yield p
def get_people(self):
    """Parse the two-column council PDF and yield councils and members.

    The PDF at COUNCIL_PAGE is converted with ``pdftotext -layout``.
    Consecutive non-blank lines (ignoring lines containing 'Page', 'CITIES'
    or 'NORTHERN TOWNS, VILLAGES') form a group. Each group is cut in two
    at the first run of six or more whitespace characters on its first
    line, giving a left and a right district. The first line of a district
    is its name; the remaining lines are scanned for phone, fax, e-mail,
    address and 'Mayor' / 'Councillor' / 'Alderman' entries. Districts with
    no named members are skipped; for the others the ``Organization`` is
    yielded followed by one ``Legislator`` per member, all sharing the
    district's contact details.

    Changes from the previous implementation:
      * the scratch file is written in binary mode and removed as soon as
        the text has been extracted, even when ``pdftotext`` fails or the
        generator is not fully consumed;
      * a final group that is not followed by a separator line is no
        longer dropped;
      * ``dict.items()`` replaces ``iteritems()``; the result is the same.

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
        IndexError: if a labelled line (e.g. one containing 'Phone') has
            no colon.
    """
    response = urllib2.urlopen(COUNCIL_PAGE).read()
    pdf_path = '/tmp/sk.pdf'
    # Binary mode: the payload is a PDF, not text.
    with open(pdf_path, 'wb') as pdf:
        pdf.write(response)
    try:
        data = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-'])
    finally:
        # Everything needed is in memory now; never leave the file behind.
        os.remove(pdf_path)

    # Group consecutive content lines; separator lines close a group.
    pages = []
    page = []
    for line in data.splitlines(True):
        is_content = (line.strip()
                      and 'Page' not in line
                      and 'CITIES' not in line
                      and 'NORTHERN TOWNS, VILLAGES' not in line)
        if is_content:
            page.append(line)
        elif page:
            pages.append(page)
            page = []
    if page:
        # Flush the trailing group when the output ends without a separator.
        pages.append(page)

    # Split every group into its left and right column.
    districts = []
    for page in pages:
        gap = re.search(r'(\s{6,})', page[0])
        index = gap.end() - 1 if gap else -1
        districts.append([line[:index].strip() for line in page])
        districts.append([line[index:].strip() for line in page])

    for district in districts:
        district_name = district.pop(0).split(',')[0].title()
        org = Organization(
            name=district_name + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)

        councillors = []
        contacts = {}
        for i, line in enumerate(district):
            if 'Phone' in line:
                phone = line.split(':')[1].replace('(', '').replace(') ', '-').strip()
                if phone:
                    contacts['voice'] = phone
            if 'Fax' in line:
                fax = line.split(':')[1].replace('(', '').replace(') ', '-').strip()
                if fax:
                    contacts['fax'] = fax
            if 'E-Mail' in line:
                email = line.split(':')[1].strip()
                if email:
                    contacts['email'] = email
            if 'Address' in line and line.split(':')[1].strip():
                # The address runs from this line to the end of the district.
                address = line.split(':')[1].strip() + ', ' + ', '.join(district[i + 1:]).replace(' ,', '')
                contacts['address'] = address
            if 'Mayor' in line or 'Councillor' in line or 'Alderman' in line:
                councillor = line.split(':')[1].replace('Mr.', '').replace('Mrs.', '').replace('Ms.', '').replace('His Worship', '').replace('Her Worship', '').strip()
                role = line.split(':')[0].strip()
                if councillor:
                    councillors.append([councillor, role])

        if not councillors:
            continue
        yield org
        for councillor in councillors:
            p = Legislator(name=councillor[0], post_id=district_name)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(org, role=councillor[1], post_id=district_name)
            for key, value in contacts.items():
                membership.add_contact_detail(key, value, None if key == 'email' else 'legislature')
            yield p