def parse_profile(self, response):
    """Build a ``Councilor`` item from one Yilan County councilor profile page.

    The response is re-decoded as Big5, then the ``<table bgcolor="#333333">``
    elements are read by position: the first holds label/value cell pairs,
    the second the experience lines, the third the platform text.

    Args:
        response: the downloaded profile page. ``response.request.meta`` may
            carry an ``'area'`` entry whose first element is appended to the
            county name as the constituency and whose second element is the
            district.

    Returns:
        The populated ``Councilor`` item.

    Raises:
        IndexError: if the portrait ``<img>`` or any of the three tables is
            missing from the page.
    """
    response = parse.get_decoded_response(response, 'Big5')
    meta = response.request.meta
    sel = Selector(response)
    curr_url = response.url
    county = u'宜蘭縣'
    tables = sel.xpath('//table[@bgcolor="#333333"]')

    item = Councilor()
    item['contact_details'] = []
    item['election_year'] = '2009'
    item['term_end'] = {'date': '2014-12-25'}
    item['term_start'] = '%s-12-25' % item['election_year']
    item['in_office'] = True
    item['county'] = county
    item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
    img_url = sel.xpath('.//div[@id="Layer2"]/img/@src').extract()[0]
    item['image'] = urljoin(curr_url, img_url)

    # A non-empty meta does not guarantee an 'area' entry, so look it up
    # defensively instead of indexing.
    area = meta.get('area') if meta else None
    if area:
        item['constituency'] = county + area[0]
        item['district'] = area[1]

    key_map = {u'黨籍': 'party', u'姓名': 'name'}
    tds = tables[0].xpath('.//td')
    # Cells alternate label, value. Pairing by slices needs no division (the
    # old ``len(tds) / 2`` is a float on Python 3) and drops a trailing
    # unpaired cell, exactly like floor division did.
    for k, v in zip(tds[0::2], tds[1::2]):
        key = parse.get_inner_text(k, remove_white=True)
        value = parse.get_inner_text(v).strip()
        k_eng = key_map.get(key)
        if k_eng:
            item[k_eng] = value
        elif key == 'E-mail':
            if value:
                misc.append_contact(item, 'email', key, value)
        elif u'電話' in key:
            misc.append_contact_list(item, 'voice', key, value.split(u'、'))
        elif key == u'服務處所':
            misc.append_contact(item, 'address', key, value)
        elif key == u'學歷':
            item['education'] = value.split()

    experience = []
    for cell in tables[1].xpath('.//td[@bgcolor="#FFFFFF"]'):
        experience.extend(parse.get_inner_text(cell).split())
    item['experience'] = experience

    # The title is read from the first experience line; an empty experience
    # table falls back to the plain-member title instead of raising.
    m = re.search(u'(副?議長)。?$', experience[0]) if experience else None
    item['title'] = m.group(1) if m else u'議員'

    item['platform'] = parse.get_inner_text(
        tables[2].xpath('.//td[@bgcolor="#FFFFFF"]')).split()
    return item
def parse_profile(self, response):
    """Build a ``Councilor`` item from one Yilan County councilor profile page.

    The response is re-decoded as Big5, then the ``<table bgcolor="#333333">``
    elements are read by position: the first holds label/value cell pairs,
    the second the experience lines, the third the platform text.

    Args:
        response: the downloaded profile page. ``response.request.meta`` may
            carry an ``'area'`` entry whose first element is appended to the
            county name as the constituency and whose second element is the
            district.

    Returns:
        The populated ``Councilor`` item.

    Raises:
        IndexError: if the portrait ``<img>`` or any of the three tables is
            missing from the page.
    """
    response = parse.get_decoded_response(response, 'Big5')
    meta = response.request.meta
    sel = Selector(response)
    curr_url = response.url
    county = u'宜蘭縣'
    tables = sel.xpath('//table[@bgcolor="#333333"]')

    item = Councilor()
    item['contact_details'] = []
    item['election_year'] = '2009'
    item['term_end'] = {'date': '2014-12-25'}
    item['term_start'] = '%s-12-25' % item['election_year']
    item['in_office'] = True
    item['county'] = county
    item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
    img_url = sel.xpath('.//div[@id="Layer2"]/img/@src').extract()[0]
    item['image'] = urljoin(curr_url, img_url)

    # A non-empty meta does not guarantee an 'area' entry, so look it up
    # defensively instead of indexing.
    area = meta.get('area') if meta else None
    if area:
        item['constituency'] = county + area[0]
        item['district'] = area[1]

    key_map = {u'黨籍': 'party', u'姓名': 'name'}
    tds = tables[0].xpath('.//td')
    # Cells alternate label, value. Pairing by slices needs no division (the
    # old ``len(tds) / 2`` is a float on Python 3) and drops a trailing
    # unpaired cell, exactly like floor division did.
    for k, v in zip(tds[0::2], tds[1::2]):
        key = parse.get_inner_text(k, remove_white=True)
        value = parse.get_inner_text(v).strip()
        k_eng = key_map.get(key)
        if k_eng:
            item[k_eng] = value
        elif key == 'E-mail':
            if value:
                misc.append_contact(item, 'email', key, value)
        elif u'電話' in key:
            misc.append_contact_list(item, 'voice', key, value.split(u'、'))
        elif key == u'服務處所':
            misc.append_contact(item, 'address', key, value)
        elif key == u'學歷':
            item['education'] = value.split()

    experience = []
    for cell in tables[1].xpath('.//td[@bgcolor="#FFFFFF"]'):
        experience.extend(parse.get_inner_text(cell).split())
    item['experience'] = experience

    # The title is read from the first experience line; an empty experience
    # table falls back to the plain-member title instead of raising.
    m = re.search(u'(副?議長)。?$', experience[0]) if experience else None
    item['title'] = m.group(1) if m else u'議員'

    item['platform'] = parse.get_inner_text(
        tables[2].xpath('.//td[@bgcolor="#FFFFFF"]')).split()
    return item
def parse(self, response): sel = Selector(response) rows = sel.xpath('//table[@bordercolordark="#4292d6"]/tbody/tr') sitting = None for row in rows: text = ''.join(row.xpath('.//text()').extract()).strip() text = parse.remove_whitespaces(text) if not text: continue if re.match(u'第.*屆', text): sitting = text continue anchors = row.xpath(".//a") links = [] for anchor in anchors: link_text = parse.get_inner_text(anchor) if link_text: links.append(anchor.xpath('@href').extract()[0]) url = parse.take_first(links) item = Bills() print sitting, text if url: url = self.base_url + url yield Request(url, callback=self.parse_files, meta={'item': item})
def parse_profile(self, response): sel = Selector(response) main_node = sel.xpath( '//table[@class="specpage_data_table"]//table[2]') info_node = main_node.xpath('.//table[2]') curr_url = response.url logging.info('to setup item: curr_url: %s', curr_url) item = Councilor() item['contact_details'] = [] item['county'] = u'桃園縣' item['election_year'] = '2009' item['term_start'] = '%s-12-25' % item['election_year'] item['term_end'] = {'date': '2014-12-25'} item['in_office'] = True item['name'], item['title'] = \ sel.xpath('//span[@id="ctl04_ctl08_pageControl_LB_MEM_NAME"]/text()').extract()[0].split() item['links'] = [{'url': response.url, 'note': u'議會個人官網'}] img_url = main_node.xpath('.//img[@class="memImg"]/@src').extract()[0] item['image'] = urljoin(response.url, urllib.quote(img_url.encode('utf8'))) logging.info('after image: item: %s', item) key_map = {u'學歷': 'education', u'經歷': 'experience'} rows = main_node.xpath('.//tr') is_contact_info = False for row in rows: key = parse.get_extracted(row.xpath('.//img/@alt')) if key == u'聯絡資訊': is_contact_info = True elif key == u'首頁圖示': info = parse.get_inner_text(row).split() logging.info('info: %s', info) address_str = info[0] if u'電話:' not in info[1]: address_str += info[1] address = re.sub(ur'.*服務處.*:', '', address_str).strip() misc.append_contact(item, 'address', '服務處', address) for group in info: if re.search(ur'電話:', group): tel_val = re.sub(ur'/.*', '', re.sub(ur'.*電話:', '', group)).strip() if tel_val: misc.append_contact(item, 'voice', '電話', tel_val) if re.search(ur'傳真:', group): fax_val = re.sub(ur'/.*', '', re.sub(ur'.*傳真:', '', group)).strip() if fax_val: misc.append_contact(item, 'fax', '傳真', fax_val)
def parse_profile(self, response): sel = Selector(response) main_node = sel.xpath('//table[@class="specpage_data_table"]//table[2]') info_node = main_node.xpath('.//table[2]') curr_url = response.url logging.info('to setup item: curr_url: %s', curr_url) item = Councilor() item['contact_details'] = [] item['county'] = u'桃園縣' item['election_year'] = '2009' item['term_start'] = '%s-12-25' % item['election_year'] item['term_end'] = {'date': '2014-12-25'} item['in_office'] = True item['name'], item['title'] = \ sel.xpath('//span[@id="ctl04_ctl08_pageControl_LB_MEM_NAME"]/text()').extract()[0].split() item['links'] = [{'url': response.url, 'note': u'議會個人官網'}] img_url = main_node.xpath('.//img[@class="memImg"]/@src').extract()[0] item['image'] = urljoin(response.url, urllib.quote(img_url.encode('utf8'))) logging.info('after image: item: %s', item) key_map = { u'學歷': 'education', u'經歷': 'experience' } rows = main_node.xpath('.//tr') is_contact_info = False for row in rows: key = parse.get_extracted(row.xpath('.//img/@alt')) if key == u'聯絡資訊': is_contact_info = True elif key == u'首頁圖示': info = parse.get_inner_text(row).split() logging.info('info: %s', info) address_str = info[0] if u'電話:' not in info[1]: address_str += info[1] address = re.sub(ur'.*服務處.*:', '', address_str).strip() misc.append_contact(item, 'address', '服務處', address) for group in info: if re.search(ur'電話:', group): tel_val = re.sub(ur'/.*', '', re.sub(ur'.*電話:', '', group)).strip() if tel_val: misc.append_contact(item, 'voice', '電話', tel_val) if re.search(ur'傳真:', group): fax_val = re.sub(ur'/.*', '', re.sub(ur'.*傳真:', '', group)).strip() if fax_val: misc.append_contact(item, 'fax', '傳真', fax_val)
def parse_files(self, response): sel = Selector(response) meta = response.request.meta item = meta['item'] rows = sel.xpath('//table[@bordercolordark="#4292d6"]/tbody/tr') for row in rows: anchors = row.xpath('.//a') if anchors: text = parse.get_inner_text(row) url = anchors.xpath('@href').extract() url = parse.take_first(url) print text, url return item
def parse_profile(self, response):
    """Build a ``Councilor`` item from a councilor profile page.

    The first text node of the basic-info paragraph is taken as the name;
    the following ``label:value`` lines fill gender, party and constituency.
    The nested table supplies platform, address, e-mail, phone and extra
    links, keyed by the text of each row's first cell.

    Args:
        response: the downloaded profile page.

    Returns:
        The populated ``Councilor`` item.
    """
    sel = Selector(response)
    main_node = sel.xpath('/html/body/table/tbody/tr[1]/td/table[2]/tbody')
    basic_info_node = main_node.xpath('tr[1]/td[2]/p')
    sub_table_node = main_node.xpath('.//tbody')
    base_url = self.base_url + '/content/'

    item = Councilor()
    item['contact_details'] = []
    item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
    item['image'] = base_url + parse.get_extracted(sel.xpath(u'//div/img/@src'))

    key_map = {
        u'性別': 'gender',
        u'黨籍': 'party',
        u'選區': 'constituency',
    }
    for i, line in enumerate(basic_info_node.xpath('.//text()').extract()):
        line = line.strip()
        if i == 0:
            item['name'] = line
            continue
        cols = line.split(u':')
        if len(cols) < 2:
            # Blank or label-less text node (e.g. whitespace between tags):
            # there is no value to read, so skip it instead of raising.
            continue
        k_eng = key_map.get(parse.remove_whitespaces(cols[0]))
        if k_eng:
            item[k_eng] = cols[1]

    for tr in sub_table_node.xpath('tr'):
        cols = tr.xpath('td')
        if len(cols) < 2:
            # Spacer or header row without a label/value cell pair.
            continue
        left = parse.remove_whitespaces(
            parse.get_extracted(cols[0].xpath('text()')).strip())
        right = parse.get_inner_text(cols[1])
        # Independent checks on purpose: a label such as one containing both
        # u'電話' and u'網址' would match more than one branch.
        if left == u'政見':
            item['platform'] = parse.get_inner_text_lines(cols[1])
        if left == u'服務處地址':
            misc.append_contact(item, 'address', left, right)
        if left == u'電子郵件信箱':
            misc.append_contact(item, 'email', left, right)
        if u'電話' in left:
            misc.append_contact(item, 'voice', left, right)
        if u'網址' in left:
            item['links'].append({'url': right, 'note': left})
    return item
def parse_bill(self, response):
    """Build a ``Bills`` item from one Yilan County Council bill page.

    The response is re-decoded as Big5. Fixed fields (election year, id,
    proposers, petitioners) come from named spans and the URL query; the
    remaining fields and the list of motions are read from the page's table
    rows after ``misc.rows_to_pairs`` groups them.

    Args:
        response: the downloaded bill page. Its URL must carry a
            ``Fmotion_instanceOS`` query parameter.

    Returns:
        The populated ``Bills`` item.

    Raises:
        IndexError: if the ``lbFmotion_expireb`` or ``lbFmotion_People`` span
            yields no text.
        KeyError: if the URL has no ``Fmotion_instanceOS`` parameter, or the
            number read from the page is not a key of ``self.election_year``.
    """
    response = parse.get_decoded_response(response, 'Big5')
    sel = Selector(response)
    # convert to list of pairs
    rows = sel.xpath('//tr')
    pairs = misc.rows_to_pairs(rows)

    item = Bills()
    expire_no = sel.xpath('//span[@id="lbFmotion_expireb"]/text()').re(r'\d+')[0]
    item['election_year'] = self.election_year[int(expire_no)]
    item['county'] = u'宜蘭縣'
    item['links'] = response.url
    print(response.url)
    get_param = parse_qs(urlparse(response.url).query)
    item['id'] = get_param['Fmotion_instanceOS'][0].decode('Big5')
    proposers = sel.xpath('//*[@id="lbFmotion_People"]/text()').extract()[0]
    item['proposed_by'] = re.sub(u'、', ' ', proposers).split()
    petitioned_by = sel.xpath('//*[@id="lbFmotion_AddTo"]/text()').extract()
    item['petitioned_by'] = (
        re.sub(u'、', ' ', petitioned_by[0]).split() if petitioned_by else [])
    item['motions'] = []

    main_title = parse.get_inner_text(
        sel.xpath('//font[@color="#800000"]'), remove_white=True)
    m = re.match(u'宜蘭縣議會(.*)議案資料', main_title)
    # Always bind main_sitting: it is read further down when a u'建檔日期' row
    # is found, and used to be undefined when the title did not match.
    main_sitting = m.group(1) if m else None

    # Page labels left unmapped: 建檔日期 (handled separately below), 議案程序,
    # 系統編號, 小組, 法規名稱, 附件, 審議日期, 大會決議.
    k_map = {
        u'來源別': 'type',
        u'案號': 'bill_no',
        u'類別': 'category',
        u'案由': 'abstract',
        u'辦法': 'methods',
        u'理由': 'description',
    }

    curr_motion = None
    for pair in pairs:
        n = len(pair)
        if n < 2:
            if n == 1:
                td = pair[0]
                text = parse.get_inner_text(td, remove_white=True)
                if td.xpath(u'.//img[@alt="小圖示"]'):
                    # An icon cell is a section heading; every heading except
                    # the bill-body one opens a new motion.
                    if text != u'案由、辦法、理由及附件':
                        if curr_motion:
                            item['motions'].append(curr_motion)
                        curr_motion = {'motion': text}
                elif curr_motion is not None and not curr_motion.get('sitting'):
                    # First icon-less single cell after a heading.
                    curr_motion['sitting'] = ' '.join(
                        td.xpath('.//span/text()').extract())
            continue

        k_raw, v_raw = pair
        k = parse.get_inner_text(k_raw, remove_white=True)
        v = parse.get_inner_text(v_raw)
        k_eng = k_map.get(k)
        if k_eng:
            item[k_eng] = v
        elif k == u'建檔日期':
            misc.append_motion(item, u'建檔', None, v, main_sitting)

        if curr_motion is not None:
            if u'日期' in k:
                curr_motion['date'] = v
            elif 'date' in curr_motion:
                # Any pair after the motion's date is stored as its resolution.
                curr_motion['resolution'] = v

    if curr_motion:
        item['motions'].append(curr_motion)
    return item
def parse_bill(self, response):
    """Build a ``Bills`` item from one Yilan County Council bill page.

    The response is re-decoded as Big5, its table rows are grouped by
    ``misc.rows_to_pairs``, and each label/value pair is either mapped onto
    an item field or attached to the motion currently being collected.

    Args:
        response: the downloaded bill page.

    Returns:
        The populated ``Bills`` item; ``item['motions']`` lists the motions
        in page order.
    """
    response = parse.get_decoded_response(response, 'Big5')
    sel = Selector(response)
    # convert to list of pairs
    rows = sel.xpath('//tr')
    pairs = misc.rows_to_pairs(rows)

    item = Bills()
    item['links'] = response.url
    item['motions'] = []

    main_title = parse.get_inner_text(
        sel.xpath('//font[@color="#800000"]'), remove_white=True)
    m = re.match(u'宜蘭縣議會(.*)議案資料', main_title)
    # Always bind main_sitting: it is read further down when a u'建檔日期' row
    # is found, and used to be undefined when the title did not match.
    main_sitting = m.group(1) if m else None

    # Page labels left unmapped: 來源別, 建檔日期 (handled separately below),
    # 議案程序, 系統編號, 小組, 法規名稱, 附件, 審議日期, 大會決議.
    k_map = {
        u'動議人': 'proposed_by',
        u'提案單位': 'proposed_by',
        u'案號': 'bill_no',
        u'附議人': 'petitioned_by',
        u'類別': 'category',
        u'案由': 'abstract',
        u'辦法': 'methods',
        u'理由': 'description',
    }
    # Fields stored as a whitespace-split list of names rather than raw text.
    list_fields = ('petitioned_by', 'proposed_by')

    curr_motion = None
    for pair in pairs:
        n = len(pair)
        if n < 2:
            if n == 1:
                td = pair[0]
                text = parse.get_inner_text(td, remove_white=True)
                if td.xpath(u'.//img[@alt="小圖示"]'):
                    # An icon cell is a section heading; every heading except
                    # the bill-body one opens a new motion.
                    if text != u'案由、辦法、理由及附件':
                        if curr_motion:
                            item['motions'].append(curr_motion)
                        curr_motion = {'motion': text}
                elif curr_motion is not None and not curr_motion.get('sitting'):
                    # First icon-less single cell after a heading.
                    curr_motion['sitting'] = ' '.join(
                        td.xpath('.//span/text()').extract())
            continue

        k_raw, v_raw = pair
        k = parse.get_inner_text(k_raw, remove_white=True)
        v = parse.get_inner_text(v_raw)
        k_eng = k_map.get(k)
        if k_eng:
            item[k_eng] = v.split() if k_eng in list_fields else v
        elif k == u'建檔日期':
            misc.append_motion(item, u'建檔', None, v, main_sitting)

        if curr_motion is not None:
            if u'日期' in k:
                curr_motion['date'] = v
            elif 'date' in curr_motion:
                # Any pair after the motion's date is stored as its resolution.
                curr_motion['resolution'] = v

    if curr_motion:
        item['motions'].append(curr_motion)
    return item
def parse_profile(self, response):
    """Build a ``Councilor`` item from a Taoyuan County profile page.

    Rows of the info table are classified by the ``alt`` text of their icon:
    ``聯絡資訊`` switches the parser into contact mode for the rows that
    follow, ``首頁圖示`` carries ``label:value`` address tokens, ``學歷`` and
    ``經歷`` hold multi-line lists, and ``選區`` holds the constituency and
    optional district.

    Args:
        response: the downloaded profile page.

    Returns:
        The populated ``Councilor`` item.

    Raises:
        IndexError: if the name span or the portrait image is missing.
    """
    sel = Selector(response)
    main_node = sel.xpath('//table[@class="specpage_data_table"]//table[2]')
    info_node = main_node.xpath('.//table[2]')
    curr_url = response.url

    item = Councilor()
    item['contact_details'] = []
    item['name'] = info_node.xpath(
        './/span[@id="ctl04_ctl08_pageControl_LB_MEM_NAME"]/text()'
    ).extract()[0].split()[0]
    item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
    img_url = main_node.xpath('.//img[@class="memImg"]/@src').extract()[0]
    item['image'] = urljoin(curr_url, img_url)

    key_map = {u'學歷': 'education', u'經歷': 'experience'}
    county = u'桃園縣'
    is_contact_info = False
    for row in info_node.xpath('.//tr'):
        key = parse.get_extracted(row.xpath('.//img/@alt'))
        if key == u'聯絡資訊':
            is_contact_info = True
        elif key == u'首頁圖示':
            for group in parse.get_inner_text(row).split():
                # Split on the first colon only, so a value that itself
                # contains a colon no longer breaks the two-way unpacking.
                parts = group.split(u':', 1)
                if len(parts) > 1:
                    left, right = parts
                    misc.append_contact(item, 'address', left, right)

        td = row.xpath('./td[2]')
        value = parse.get_inner_text(td)
        if not value:
            continue
        k_eng = key_map.get(key)
        if is_contact_info:
            # partition() tolerates values with no colon or with several
            # (e.g. a URL after the label), where split()+unpack raised.
            left, _, right = value.partition(u':')
            url = parse.get_extracted(row.xpath('.//a/@href'))
            if left == 'EMAIL':
                # Remove the scheme as a prefix. str.lstrip() treats its
                # argument as a character set and also ate leading letters
                # of the address itself.
                for prefix in ('mailto://', 'mailto:'):
                    if url.startswith(prefix):
                        url = url[len(prefix):]
                        break
                for address in url.split(';'):
                    misc.append_contact(item, 'email', left, address.strip())
            if left == u'聯絡電話':
                for number in right.split(';'):
                    misc.append_contact(item, 'voice', left, number.strip())
            if left == u'傳真':
                for number in right.split(';'):
                    misc.append_contact(item, 'fax', left, number.strip())
            if left in (u'部落格', u'FACEBOOK', u'臉書'):
                item['links'].append({'url': url, 'note': left})
        elif k_eng:
            item[k_eng] = [parse.remove_whitespaces(line)
                           for line in parse.get_inner_text_lines(td)]
        elif key == u'選區':
            tokens = value.split()
            item['county'] = county
            item['district'] = tokens[1] if len(tokens) > 1 else ''
            item['constituency'] = county + tokens[0]
    return item
def parse_profile(self, response): response = parse.get_decoded_response(response, 'Big5') sel = Selector(response) name_node = sel.xpath('//td[@class="w06"]') logging.warning('name_node: %s', name_node) name_str = parse.get_inner_text(name_node) logging.warning('name_str: %s', name_str) item = response.request.meta['item'] item['county'] = u'新竹縣' item['election_year'] = '2009' item['term_start'] = '%s-12-25' % item['election_year'] item['term_end'] = {'date': '2014-12-25'} item['in_office'] = True item['name'] = name_str.split('-')[-1] item['title'] = re.search(u'(副?議長|議員)', name_str).group() w02_nodes = sel.xpath('//th[@class="w02"]') for each_node in w02_nodes: key = parse.get_inner_text(each_node).strip() logging.warning('w02_node: key: %s', key) if key != u'學歷': continue education_node = each_node.xpath('../td') education_str = parse.get_inner_text(education_node) logging.warning('key: %s education_str: %s', key, education_str) item['education'] = education_str.split('\n') image_node = each_node.xpath('../../../../td[2]/img/@src') image_str = parse.get_extracted(image_node) logging.warning('key: %s education_str: %s image_str: %s', key, education_str, image_str) item['image'] = urljoin(response.url, urllib.quote(image_str.encode('utf8'))) main_nodes = sel.xpath('//tr[@class="line_02"]') contact_details = [] links = [{'url': response.url, 'note': u'議會個人官網'}] for each_node in main_nodes: key = parse.get_inner_text(each_node.xpath('./th')) item_key = _key_map.get(key, '') if item_key == 'experience': val_nodes = each_node.xpath('./td/ol/li') if val_nodes: val = [re.sub(ur' ', '', re.sub(ur'。', '', parse.get_inner_text(each_each_node))) for each_each_node in val_nodes] else: val = parse.get_inner_text(each_node.xpath('./td')).split("\n") val = [re.sub(ur' ', '', each_val) for each_val in val] elif item_key == 'platform': val_nodes = each_node.xpath('./td/ol/li') val = [re.sub(ur' ', '', parse.get_inner_text(each_each_node)) for each_each_node in val_nodes] else: 
val = parse.get_inner_text(each_node.xpath('./td')) if key not in _key_map: logging.error('key not in _key_map!: key: %s', key) continue if item_key in ['email', 'address', 'voice']: contact_details.append({"type": item_key, "value": val, "label": key}) elif item_key in ['link']: val = re.sub(ur'^\.\.', 'http://www.hcc.gov.tw', val) links.append({"url": val, "note": key}) else: item[item_key] = val logging.warning('key: %s val: %s item_key: %s', key, val, item_key) # item[item_key] = val item['contact_details'] = contact_details item['links'] = links return item
class Spider(scrapy.Spider): name = "councilors" start_urls = [ "http://www.tycc.gov.tw/page.aspx?wtp=1&wnd=204", ] download_delay = 0.5 def parse(self, response): sel = Selector(response) urls = sel.xpath('//map/area/@href').extract() for url in urls: url = urljoin(response.url, url) yield Request(url, callback=self.parse_selection_index) # XXX hack for correcting information special_urls = [ "http://www.tycc.gov.tw/page.aspx?wtp=1&wnd=204&town=%E5%B1%B1%E5%9C%B0%E5%8E%9F%E4%BD%8F%E6%B0%91", "http://www.tycc.gov.tw/page.aspx?wtp=1&wnd=204&page=2&town=%E7%AC%AC%E4%B8%80%E9%81%B8%E5%8D%80" ] for special_url in special_urls: yield Request(special_url, callback=self.parse_selection_index) def parse_selection_index(self, response): sel = Selector(response) urls = sel.xpath( '//div[@id="ctl04_ctl08_pageControl_PN_LIST"]//a/@href').extract() for url in urls: url = urljoin(response.url, url) logging.info('to request id: url: %s', url) yield Request(url, callback=self.parse_profile) def parse_profile(self, response): sel = Selector(response) main_node = sel.xpath( '//table[@class="specpage_data_table"]//table[2]') info_node = main_node.xpath('.//table[2]') curr_url = response.url logging.info('to setup item: curr_url: %s', curr_url) item = Councilor() item['contact_details'] = [] item['county'] = u'桃園縣' item['election_year'] = '2009' item['term_start'] = '%s-12-25' % item['election_year'] item['term_end'] = {'date': '2014-12-25'} item['in_office'] = True item['name'], item['title'] = \ sel.xpath('//span[@id="ctl04_ctl08_pageControl_LB_MEM_NAME"]/text()').extract()[0].split() item['links'] = [{'url': response.url, 'note': u'議會個人官網'}] img_url = main_node.xpath('.//img[@class="memImg"]/@src').extract()[0] item['image'] = urljoin(response.url, urllib.quote(img_url.encode('utf8'))) logging.info('after image: item: %s', item) key_map = {u'學歷': 'education', u'經歷': 'experience'} rows = main_node.xpath('.//tr') is_contact_info = False for row in rows: key = 
parse.get_extracted(row.xpath('.//img/@alt')) if key == u'聯絡資訊': is_contact_info = True elif key == u'首頁圖示': info = parse.get_inner_text(row).split() logging.info('info: %s', info) address_str = info[0] if u'電話:' not in info[1]: address_str += info[1] address = re.sub(ur'.*服務處.*:', '', address_str).strip() misc.append_contact(item, 'address', '服務處', address) for group in info: if re.search(ur'電話:', group): tel_val = re.sub(ur'/.*', '', re.sub(ur'.*電話:', '', group)).strip() if tel_val: misc.append_contact(item, 'voice', '電話', tel_val) if re.search(ur'傳真:', group): fax_val = re.sub(ur'/.*', '', re.sub(ur'.*傳真:', '', group)).strip() if fax_val: misc.append_contact(item, 'fax', '傳真', fax_val) td = row.xpath('./td[2]') value = parse.get_inner_text(td) if not value: continue logging.info( 'contact_info: key: %s value: %s td: %s is_contact_info: %s', key, value, td, is_contact_info) k_eng = key_map.get(key) if is_contact_info: blog_url = td.xpath( './/span[@id="ctl04_ctl08_pageControl_LB_MEM_BLOG"]/a/@href' ).extract() if blog_url: blog_url = blog_url[0].strip() logging.info('blog_url: %s dir: %s', blog_url, dir(blog_url)) item['links'].append({"url": blog_url, "note": "部落格"}) facebook_url = td.xpath( './/span[@id="ctl04_ctl08_pageControl_LB_MEM_FACEBOOK"]/a/@href' ).extract() if facebook_url: facebook_url = facebook_url[0].strip() logging.info('facebook_url: %s', facebook_url) item['links'].append({"url": facebook_url, "note": "臉書"}) emails = td.xpath( './/span[@id="ctl04_ctl08_pageControl_LB_MEM_EMAIL"]/a/@href' ).extract() if emails: emails = emails[0] emails = emails.split(';') emails = [ re.sub(ur'^mailto://', '', email.strip()) for email in emails ] logging.info('emails: %s', emails) for each_email in emails: misc.append_contact(item, 'email', 'EMAIL', each_email)
def parse_profile(self, response): response = parse.get_decoded_response(response, 'Big5') sel = Selector(response) name_node = sel.xpath('//td[@class="w06"]') logging.warning('name_node: %s', name_node) name_str = parse.get_inner_text(name_node) logging.warning('name_str: %s', name_str) item = response.request.meta['item'] item['county'] = u'新竹縣' item['election_year'] = '2009' item['term_start'] = '%s-12-25' % item['election_year'] item['term_end'] = {'date': '2014-12-25'} item['in_office'] = True item['name'] = name_str.split('-')[-1] item['title'] = re.search(u'(副?議長|議員)', name_str).group() w02_nodes = sel.xpath('//th[@class="w02"]') for each_node in w02_nodes: key = parse.get_inner_text(each_node).strip() logging.warning('w02_node: key: %s', key) if key != u'學歷': continue education_node = each_node.xpath('../td') education_str = parse.get_inner_text(education_node) logging.warning('key: %s education_str: %s', key, education_str) item['education'] = education_str.split('\n') image_node = each_node.xpath('../../../../td[2]/img/@src') image_str = parse.get_extracted(image_node) logging.warning('key: %s education_str: %s image_str: %s', key, education_str, image_str) item['image'] = urljoin(response.url, urllib.quote(image_str.encode('utf8'))) main_nodes = sel.xpath('//tr[@class="line_02"]') contact_details = [] links = [{'url': response.url, 'note': u'議會個人官網'}] for each_node in main_nodes: key = parse.get_inner_text(each_node.xpath('./th')) item_key = _key_map.get(key, '') if item_key == 'experience': val_nodes = each_node.xpath('./td/ol/li') if val_nodes: val = [ re.sub( ur' ', '', re.sub(ur'。', '', parse.get_inner_text(each_each_node))) for each_each_node in val_nodes ] else: val = parse.get_inner_text( each_node.xpath('./td')).split("\n") val = [re.sub(ur' ', '', each_val) for each_val in val] elif item_key == 'platform': val_nodes = each_node.xpath('./td/ol/li') val = [ re.sub(ur' ', '', parse.get_inner_text(each_each_node)) for each_each_node in val_nodes ] 
else: val = parse.get_inner_text(each_node.xpath('./td')) if key not in _key_map: logging.error('key not in _key_map!: key: %s', key) continue if item_key in ['email', 'address', 'voice']: contact_details.append({ "type": item_key, "value": val, "label": key }) elif item_key in ['link']: val = re.sub(ur'^\.\.', 'http://www.hcc.gov.tw', val) links.append({"url": val, "note": key}) else: item[item_key] = val logging.warning('key: %s val: %s item_key: %s', key, val, item_key) # item[item_key] = val item['contact_details'] = contact_details item['links'] = links return item