def get_people(self):
    """Yield an Organization and a Legislator for each listed council member.

    Every ``<strong>`` element inside a paragraph of the ``entry-content``
    div is treated as one candidate member. Candidates whose role text is
    empty or contains ``SAO`` are skipped. For each remaining candidate the
    organization is yielded first, then the legislator with its contact
    details attached to the membership.
    """
    root = lxmlize(COUNCIL_PAGE)

    for entry in root.xpath('//div[@class="entry-content"]//p/strong'):
        # The closest preceding <h2> names the district; keep only the part
        # before the dash separator.
        headings = entry.xpath('./ancestor::p/preceding-sibling::h2')
        district = headings[-1].text_content().split('–'.decode('utf-8'))[0]

        label = entry.text_content()
        # The last two whitespace-separated words are taken as the name.
        name = ' '.join(label.split()[-2:]).replace('-Â'.decode('utf-8'), '')
        # Whatever precedes the name (up to the first hyphen) is the role.
        role = label.replace(name, '').split('-')[0]
        if not role or 'SAO' in role:
            continue

        council = Organization(
            name=district + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        council.add_source(COUNCIL_PAGE)
        yield council

        person = Legislator(name=name, post_id=district)
        person.add_source(COUNCIL_PAGE)
        membership = person.add_membership(council, role=role, post_id=district)

        # Bare text nodes of the enclosing paragraph carry address/phone/fax.
        for text in entry.xpath('./ancestor::p/text()'):
            if 'NT' in text:
                membership.add_contact_detail('address', text.strip(), 'legislature')
            if 'Tel' in text:
                text = text.replace('Tel. ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('voice', text, 'legislature')
            # Checked against the (possibly rewritten) text, as a separate test.
            if 'Fax' in text:
                text = text.replace('Fax ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('fax', text, 'legislature')

        mailto_texts = entry.xpath('./parent::p//a[contains(@href, "mailto:")]/text()')
        membership.add_contact_detail('email', mailto_texts[0], None)

        paragraph_text = entry.xpath('./parent::p')[0].text_content()
        if 'Website' in paragraph_text:
            # The second anchor in the paragraph is used as the website link.
            anchors = entry.xpath('./parent::p//a')
            person.add_link(anchors[1].attrib['href'], None)
        yield person
def get_people(self):
    """Yield an Organization per district page and a Legislator per member.

    The first four links of the landing page's first "bluearrow" list are
    followed; the position of each link selects the name suffix from
    ``org_types``. Each linked page lists district URLs, and each district
    page supplies the shared contact details and the member names.
    """
    landing = lxmlize(COUNCIL_PAGE)
    type_links = landing.xpath(
        '//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href')[:4]

    for type_index, link in enumerate(type_links):
        listing = lxmlize(link)
        for district_url in listing.xpath(
                '//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href'):
            detail = lxmlize(district_url)

            # The page heading has the form "<prefix> - <district>".
            heading = detail.xpath('//div[@class="pageHeader"]/h1/text()')[0]
            district = heading.split(' - ')[1].strip()

            council = Organization(
                name=district + org_types[type_index],
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            council.add_source(district_url)
            yield council

            address = ', '.join(detail.xpath('//div[@class="left_contents"]/p[1]/text()'))

            # Text nodes of the "Contact" paragraph: phone first, then fax,
            # each written as "<label>: <number>".
            contact_lines = detail.xpath(
                '//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()')
            phone = contact_lines[0].split(':')[1].strip().replace(' ', '-')
            fax = contact_lines[1].split(':')[1].strip().replace(' ', '-')

            # Both stay as empty lists (falsy) when no matching anchor exists.
            email = detail.xpath(
                '//div[@class="left_contents"]//a[contains(@href, "mailto:")]')
            if email:
                email = email[0].text_content()
            site = detail.xpath(
                '//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]')
            if site:
                site = site[0].text_content()

            member_names = detail.xpath('//div[@class="right_contents"]//p/text()')
            for position, member_name in enumerate(member_names):
                if 'Vacant' in member_name:
                    continue

                person = Legislator(name=member_name, post_id=district)
                for source in (COUNCIL_PAGE, link, district_url):
                    person.add_source(source)

                # The first listed entry is the mayor; the rest are councillors.
                role = 'Mayor' if position == 0 else 'Councillor'
                membership = person.add_membership(council, role=role)
                membership.post_id = district

                membership.add_contact_detail('address', address, 'legislature')
                if phone:
                    membership.add_contact_detail('voice', phone, 'legislature')
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if email:
                    membership.add_contact_detail('email', email, None)
                if site:
                    person.add_link(site, None)
                yield person
def get_people(self):
    """Yield an Organization and a Legislator for each listed council member.

    Candidates are the ``<strong>`` elements found in paragraphs of the
    ``entry-content`` div. A candidate is dropped when its role text is
    empty or contains ``SAO``. The organization is yielded before the
    legislator it belongs to.
    """
    document = lxmlize(COUNCIL_PAGE)
    candidates = document.xpath('//div[@class="entry-content"]//p/strong')

    for node in candidates:
        # District comes from the nearest preceding <h2>, cut at the dash.
        section_headings = node.xpath('./ancestor::p/preceding-sibling::h2')
        district = section_headings[-1].text_content().split('–'.decode('utf-8'))[0]

        raw_label = node.text_content()
        # Name = final two words of the label, with the stray '-Â' removed.
        name = ' '.join(raw_label.split()[-2:]).replace('-Â'.decode('utf-8'), '')
        # Role = label with the name removed, truncated at the first hyphen.
        role = raw_label.replace(name, '').split('-')[0]
        if not role or 'SAO' in role:
            continue

        organization = Organization(
            name=district + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        organization.add_source(COUNCIL_PAGE)
        yield organization

        legislator = Legislator(name=name, post_id=district)
        legislator.add_source(COUNCIL_PAGE)
        membership = legislator.add_membership(organization, role=role, post_id=district)

        # Address, phone and fax live in the paragraph's bare text nodes.
        for fragment in node.xpath('./ancestor::p/text()'):
            if 'NT' in fragment:
                membership.add_contact_detail('address', fragment.strip(), 'legislature')
            if 'Tel' in fragment:
                fragment = fragment.replace('Tel. ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('voice', fragment, 'legislature')
            # Independent test, evaluated on the fragment as it stands now.
            if 'Fax' in fragment:
                fragment = fragment.replace('Fax ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('fax', fragment, 'legislature')

        email = node.xpath('./parent::p//a[contains(@href, "mailto:")]/text()')[0]
        membership.add_contact_detail('email', email, None)

        parent_paragraph = node.xpath('./parent::p')[0]
        if 'Website' in parent_paragraph.text_content():
            # The website is the paragraph's second anchor.
            second_anchor = node.xpath('./parent::p//a')[1]
            legislator.add_link(second_anchor.attrib['href'], None)
        yield legislator
def get_people(self):
    """Yield an Organization per district and a Legislator per elected official.

    Each anchor in the landing page's ``left-content`` / ``right-content``
    divs links to a district page. That page's ``<dl>`` entries provide the
    district-wide contact details (read up to the "Officials" entry), and
    the ``<pre>`` block under "Elected Officials" lists one official per
    line, optionally tagged ``(Mayor)``, ``(Deputy Mayor)`` or
    ``(Chairperson)``. Untagged officials get the role ``Councillor``.

    Contact details are only attached when the district page supplies them.
    """
    page = lxmlize(COUNCIL_PAGE)
    districts = page.xpath('//div[@id="left-content" or @id="right-content"]//a')

    for district in districts:
        url = district.attrib['href']
        district_name = district.text_content()
        page = lxmlize(url)

        org = Organization(
            name=district_name + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(url)
        yield org

        # Reset for every district: otherwise a missing field would either
        # raise NameError or silently reuse the previous district's value.
        phone = fax = address = email = site = None

        for entry in page.xpath('//div[@style="WIDTH:750"]/dl'):
            contact_type = entry.xpath('./dt')[0].text_content()
            value = entry.xpath('./dd')[0].text_content().replace('(', '').replace(') ', '-')
            # Contact entries precede the officials list; stop once we reach it.
            if 'Officials' in contact_type:
                break
            if 'Tel' in contact_type:
                phone = value
            if 'Fac' in contact_type:
                fax = value
            if 'Address' in contact_type:
                address = value
            if 'Email' in contact_type:
                email = value
            if 'Website' in contact_type:
                site = value

        councillors = page.xpath(
            '//div[@style="WIDTH:750"]/dl/dt[contains(text(), "Elected Officials")]'
            '/parent::dl/dd/pre/text()')[0].splitlines(True)
        for councillor in councillors:
            name = (councillor
                    .replace('(Mayor)', '')
                    .replace('(Deputy Mayor)', '')
                    .replace('(Chairperson)', '')
                    .strip())
            # Blank lines in the <pre> block do not describe anyone.
            if not name:
                continue

            # What is left after removing the name is the parenthesised tag.
            role = re.sub(r'\(|\)', '', councillor.replace(name, '').strip())
            if not role:
                role = 'Councillor'

            p = Legislator(name=name, post_id=district_name)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            membership = p.add_membership(org, role=role, post_id=district_name)

            if phone:
                membership.add_contact_detail('voice', clean_telephone_number(phone), 'legislature')
            if fax:
                membership.add_contact_detail('fax', clean_telephone_number(fax), 'legislature')
            if address:
                membership.add_contact_detail('address', clean_address(address), 'legislature')
            if email:
                membership.add_contact_detail('email', email, None)
            if site:
                p.add_link(site, None)
            yield p
def get_people(self):
    """Yield an Organization per district page and a Legislator per member.

    Walks three levels of pages: the landing page (first four links of the
    first "bluearrow" list), the per-type listing of district URLs, and the
    district page itself. The index of the type link picks the suffix that
    is appended to the district name from ``org_types``.
    """
    index_page = lxmlize(COUNCIL_PAGE)
    links = index_page.xpath(
        '//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href')
    links = links[:4]

    for kind, link in enumerate(links):
        type_page = lxmlize(link)
        urls = type_page.xpath(
            '//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href')

        for district_url in urls:
            district_page = lxmlize(district_url)

            # Heading text looks like "<prefix> - <district>"; keep the latter.
            title = district_page.xpath('//div[@class="pageHeader"]/h1/text()')[0]
            district = title.split(' - ')[1].strip()

            org = Organization(
                name=district + org_types[kind],
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(district_url)
            yield org

            address_parts = district_page.xpath('//div[@class="left_contents"]/p[1]/text()')
            address = ', '.join(address_parts)

            # First text node holds the phone, second the fax ("label: number").
            contacts = district_page.xpath(
                '//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()')
            phone = contacts[0].split(':')[1].strip().replace(' ', '-')
            fax = contacts[1].split(':')[1].strip().replace(' ', '-')

            # When no anchor matches, these remain empty lists and test falsy.
            email = district_page.xpath(
                '//div[@class="left_contents"]//a[contains(@href, "mailto:")]')
            if email:
                email = email[0].text_content()
            site = district_page.xpath(
                '//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]')
            if site:
                site = site[0].text_content()

            names = district_page.xpath('//div[@class="right_contents"]//p/text()')
            for index, person_name in enumerate(names):
                if 'Vacant' in person_name:
                    continue

                legislator = Legislator(name=person_name, post_id=district)
                legislator.add_source(COUNCIL_PAGE)
                legislator.add_source(link)
                legislator.add_source(district_url)

                # Entry 0 is treated as the mayor, everyone else as councillor.
                role = 'Councillor' if index else 'Mayor'
                membership = legislator.add_membership(org, role=role)
                membership.post_id = district

                membership.add_contact_detail('address', address, 'legislature')
                if phone:
                    membership.add_contact_detail('voice', phone, 'legislature')
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if email:
                    membership.add_contact_detail('email', email, None)
                if site:
                    legislator.add_link(site, None)
                yield legislator
def get_people(self):
    """Yield an Organization and its Legislators for each municipality in the PDF.

    The document at ``COUNCIL_PAGE`` is downloaded to ``/tmp/yt.pdf`` and
    converted to text with ``pdftotext -layout``. The text is split on blank
    lines; every chunk that mentions ``Councillors`` is parsed as one
    municipality. Column boundaries are derived from the chunk's first line,
    the left column provides district name and address, and the names that
    follow the ``Mayor:`` / ``Councillors`` labels become legislators.

    The temporary PDF is removed as soon as the conversion finishes, even if
    ``pdftotext`` fails.
    """
    response = urllib2.urlopen(COUNCIL_PAGE).read()

    # PDF content is binary, so write it in binary mode; the context manager
    # guarantees the handle is closed before pdftotext reads the file.
    with open('/tmp/yt.pdf', 'wb') as pdf:
        pdf.write(response)
    try:
        data = subprocess.check_output(['pdftotext', '-layout', '/tmp/yt.pdf', '-'])
    finally:
        # The extracted text is all we need; never leave the file behind.
        os.remove('/tmp/yt.pdf')

    for municipality in re.split(r'\n\s*\n', data):
        if 'Councillors' not in municipality:
            continue

        lines = municipality.split('\n')
        # Drop a leading page-header line and a leading blank line so that
        # lines[0] is the first line of the municipality itself.
        if 'Page' in lines[0]:
            lines.pop(0)
        if not lines[0].strip():
            lines.pop(0)

        # Column ends are measured on the stripped first line: the first
        # word after a run of 2+ spaces, and the first word after a colon
        # followed by 2+ spaces.
        header = lines[0].strip()
        col1end = re.search(r'\s{2,}(\w)', header).end()
        col2end = re.search(r':\s{2,}(\w)', header).end()
        left = col1end - 1

        # A district name may wrap onto a second line (detected by 'Council'
        # appearing in it); the two address lines follow the name.
        if 'Council' in lines[1]:
            address = lines[2][:left].strip() + ' ' + lines[3][:left].strip()
            district = lines[0][:left].strip() + ' ' + lines[1][:left].strip()
        else:
            address = lines[1][:left].strip() + ' ' + lines[2][:left].strip()
            district = lines[0][:left].strip()

        organization = Organization(
            name=district + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        organization.add_source(COUNCIL_PAGE)
        yield organization

        phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
        email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
        fax = None
        if 'Fax' in municipality:
            fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
        website = None
        if 'Website' in municipality:
            website = re.findall(r'((http:\/\/|www.)(\S*))', municipality)[0][0]

        # Names only start after the 'Mayor:' / 'Councillors' label lines;
        # the most recent label decides the role.
        councillor_or_mayor = False
        for line in lines:
            if 'Mayor:' in line:
                councillor_or_mayor = True
                role = 'Mayor'
                continue
            if 'Councillors' in line:
                councillor_or_mayor = True
                role = 'Councillor'
                continue
            if not councillor_or_mayor:
                continue

            councillor = line[col1end - 1:col2end - 1].strip()
            if not councillor:
                continue

            p = Legislator(name=councillor, post_id=district)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(organization, role=role, post_id=district)
            membership.add_contact_detail('address', address, 'legislature')
            membership.add_contact_detail('voice', phone, 'legislature')
            membership.add_contact_detail('email', email, None)
            if fax:
                membership.add_contact_detail('fax', fax, 'legislature')
            if website:
                p.add_link(website, None)
            yield p
def get_people(self):
    """Yield an Organization and its Legislators for each municipality in the PDF.

    Downloads ``COUNCIL_PAGE`` to ``/tmp/yt.pdf``, converts it to text with
    ``pdftotext -layout`` and splits the text on blank lines. Each chunk
    containing ``Councillors`` is parsed as one municipality: column
    boundaries come from its first line, the left column yields the district
    name and address, and names after the ``Mayor:`` / ``Councillors``
    labels become legislators.

    The temporary PDF is deleted right after conversion, including when
    ``pdftotext`` raises.
    """
    response = urllib2.urlopen(COUNCIL_PAGE).read()

    # Binary mode keeps the PDF bytes intact; ``with`` closes the handle
    # before pdftotext opens the file.
    with open('/tmp/yt.pdf', 'wb') as pdf:
        pdf.write(response)
    try:
        data = subprocess.check_output(
            ['pdftotext', '-layout', '/tmp/yt.pdf', '-'])
    finally:
        # Clean up unconditionally; only the extracted text is used below.
        os.remove('/tmp/yt.pdf')

    for municipality in re.split(r'\n\s*\n', data):
        if 'Councillors' not in municipality:
            continue

        lines = municipality.split('\n')
        # Skip a leading page-header line and a leading blank line so that
        # lines[0] is the municipality's own first line.
        if 'Page' in lines[0]:
            lines.pop(0)
        if not lines[0].strip():
            lines.pop(0)

        # Column ends are located on the stripped first line.
        first_line = lines[0].strip()
        col1end = re.search(r'\s{2,}(\w)', first_line).end()
        col2end = re.search(r':\s{2,}(\w)', first_line).end()
        left = col1end - 1

        # When the second line contains 'Council' the district name spans
        # two lines and the address starts one line later.
        if 'Council' in lines[1]:
            address = lines[2][:left].strip() + ' ' + lines[3][:left].strip()
            district = lines[0][:left].strip() + ' ' + lines[1][:left].strip()
        else:
            address = lines[1][:left].strip() + ' ' + lines[2][:left].strip()
            district = lines[0][:left].strip()

        organization = Organization(
            name=district + ' Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        organization.add_source(COUNCIL_PAGE)
        yield organization

        phone = re.findall(
            r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
            municipality)[0].replace(') ', '-')
        email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
        fax = None
        if 'Fax' in municipality:
            fax = re.findall(
                r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                municipality)[0].replace(') ', '-')
        website = None
        if 'Website' in municipality:
            website = re.findall(
                r'((http:\/\/|www.)(\S*))', municipality)[0][0]

        # No names are read until a 'Mayor:' or 'Councillors' label is seen;
        # the latest label sets the role for the names that follow.
        councillor_or_mayor = False
        for line in lines:
            if 'Mayor:' in line:
                councillor_or_mayor = True
                role = 'Mayor'
                continue
            if 'Councillors' in line:
                councillor_or_mayor = True
                role = 'Councillor'
                continue
            if not councillor_or_mayor:
                continue

            councillor = line[col1end - 1:col2end - 1].strip()
            if not councillor:
                continue

            p = Legislator(name=councillor, post_id=district)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(
                organization, role=role, post_id=district)
            membership.add_contact_detail('address', address, 'legislature')
            membership.add_contact_detail('voice', phone, 'legislature')
            membership.add_contact_detail('email', email, None)
            if fax:
                membership.add_contact_detail('fax', fax, 'legislature')
            if website:
                p.add_link(website, None)
            yield p