Python get_inner_text примеры, crawler_lib.parse.get_inner_text Python примеры использования

Пример #1

0

Показать файл

Файл: councilors.py Проект: AlcHawk/councilor-voter-guide

    def parse_profile(self, response):
        """Build a ``Councilor`` item from one councilor profile page.

        The response is re-decoded as Big5 and three
        ``<table bgcolor="#333333">`` elements are read in order: key/value
        profile cells, experience cells and platform cells.

        :param response: response of the profile page; when its
            ``request.meta`` is non-empty it must carry an ``'area'`` pair
            used for constituency and district.
        :returns: the populated ``Councilor`` item.
        """
        response = parse.get_decoded_response(response, 'Big5')
        meta = response.request.meta
        sel = Selector(response)
        curr_url = response.url
        county = u'宜蘭縣'

        tables = sel.xpath('//table[@bgcolor="#333333"]')

        item = Councilor()
        item['contact_details'] = []
        item['election_year'] = '2009'
        item['term_end'] = {'date': '2014-12-25'}
        item['term_start'] = '%s-12-25' % item['election_year']
        item['in_office'] = True
        item['county'] = county
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        img_url = sel.xpath('.//div[@id="Layer2"]/img/@src').extract()[0]
        item['image'] = urljoin(curr_url, img_url)

        if meta:
            area = meta['area']
            item['constituency'] = county + area[0]
            item['district'] = area[1]

        # Labels whose value is copied verbatim into the item.
        key_map = {
            u'黨籍': 'party',
            u'姓名': 'name'
        }
        tds = tables[0].xpath('.//td')
        # Cells alternate label/value.  Floor division keeps the pair count
        # an int on Python 3 as well; a trailing unpaired cell is ignored.
        pairs = [(tds[2 * i], tds[2 * i + 1]) for i in range(len(tds) // 2)]
        for k, v in pairs:
            key = parse.get_inner_text(k, remove_white=True)
            value = parse.get_inner_text(v).strip()

            k_eng = key_map.get(key)
            if k_eng:
                item[k_eng] = value
            elif key == 'E-mail':
                if value:
                    misc.append_contact(item, 'email', key, value)
            elif u'電話' in key:
                misc.append_contact_list(item, 'voice', key, value.split(u'、'))
            elif key == u'服務處所':
                misc.append_contact(item, 'address', key, value)
            elif key == u'學歷':
                item['education'] = value.split()

        # Flatten the whitespace-separated tokens of every experience cell.
        experience = []
        for ex in tables[1].xpath('.//td[@bgcolor="#FFFFFF"]'):
            experience += parse.get_inner_text(ex).split()

        item['experience'] = experience
        # The title is read from the end of the first experience entry.  With
        # no entries at all, fall back to the plain councilor title instead of
        # raising IndexError.
        first_entry = experience[0] if experience else u''
        m = re.search(u'(副?議長)。?$', first_entry)
        item['title'] = m.group(1) if m else u'議員'
        item['platform'] = parse.get_inner_text(tables[2].xpath('.//td[@bgcolor="#FFFFFF"]')).split()
        return item

Пример #2

0

Показать файл

    def parse_profile(self, response):
        """Turn a councilor profile page into a ``Councilor`` item.

        After re-decoding the response as Big5, the method reads the three
        ``<table bgcolor="#333333">`` elements of the page: the first for
        label/value profile cells, the second for experience, the third for
        the platform.

        :param response: profile page response; a non-empty
            ``request.meta`` must contain ``'area'`` (constituency,
            district).
        :returns: the populated ``Councilor`` item.
        """
        response = parse.get_decoded_response(response, 'Big5')
        meta = response.request.meta
        sel = Selector(response)
        curr_url = response.url
        county = u'宜蘭縣'

        tables = sel.xpath('//table[@bgcolor="#333333"]')

        item = Councilor()
        item['contact_details'] = []
        item['election_year'] = '2009'
        item['term_end'] = {'date': '2014-12-25'}
        item['term_start'] = '%s-12-25' % item['election_year']
        item['in_office'] = True
        item['county'] = county
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        img_url = sel.xpath('.//div[@id="Layer2"]/img/@src').extract()[0]
        item['image'] = urljoin(curr_url, img_url)

        if meta:
            area = meta['area']
            item['constituency'] = county + area[0]
            item['district'] = area[1]

        # Labels that map one-to-one onto item fields.
        key_map = {u'黨籍': 'party', u'姓名': 'name'}
        tds = tables[0].xpath('.//td')
        # ``//`` (not ``/``) so the pair count stays an integer under
        # Python 3 true division; an odd trailing cell is dropped.
        pair_count = len(tds) // 2
        pairs = [(tds[2 * i], tds[2 * i + 1]) for i in range(pair_count)]
        for k, v in pairs:
            key = parse.get_inner_text(k, remove_white=True)
            value = parse.get_inner_text(v).strip()

            k_eng = key_map.get(key)
            if k_eng:
                item[k_eng] = value
            elif key == 'E-mail':
                if value:
                    misc.append_contact(item, 'email', key, value)
            elif u'電話' in key:
                misc.append_contact_list(item, 'voice', key, value.split(u'、'))
            elif key == u'服務處所':
                misc.append_contact(item, 'address', key, value)
            elif key == u'學歷':
                item['education'] = value.split()

        exp_node = tables[1].xpath('.//td[@bgcolor="#FFFFFF"]')
        experience = []
        for ex in exp_node:
            experience.extend(parse.get_inner_text(ex).split())

        item['experience'] = experience
        # Guard against an empty experience list: the original indexed
        # ``[0]`` unconditionally and crashed on such pages.
        m = re.search(u'(副?議長)。?$', experience[0]) if experience else None
        item['title'] = m.group(1) if m else u'議員'
        item['platform'] = parse.get_inner_text(
            tables[2].xpath('.//td[@bgcolor="#FFFFFF"]')).split()
        return item

Пример #3

0

Показать файл

Файл: bills.py Проект: theacat/councilor-voter-guide

    def parse(self, response):
        sel = Selector(response)
        rows = sel.xpath('//table[@bordercolordark="#4292d6"]/tbody/tr')
        sitting = None
        for row in rows:
            text = ''.join(row.xpath('.//text()').extract()).strip()
            text = parse.remove_whitespaces(text)
            if not text:
                continue

            if re.match(u'第.*屆', text):
                sitting = text
                continue

            anchors = row.xpath(".//a")
            links = []
            for anchor in anchors:
                link_text = parse.get_inner_text(anchor)
                if link_text:
                    links.append(anchor.xpath('@href').extract()[0])

            url = parse.take_first(links)
            item = Bills()
            print sitting, text

            if url:
                url = self.base_url + url
                yield Request(url,
                              callback=self.parse_files,
                              meta={'item': item})

Пример #4

0

Показать файл

    def parse_profile(self, response):
        """Start filling a ``Councilor`` item from a Taoyuan profile page.

        NOTE(review): this is an excerpt. The visible code ends inside the
        row loop and never returns the item; ``info_node``, ``key_map`` and
        ``is_contact_info`` are assigned but not read in the visible part.

        :param response: response of the councilor profile page.
        """
        sel = Selector(response)

        main_node = sel.xpath(
            '//table[@class="specpage_data_table"]//table[2]')
        info_node = main_node.xpath('.//table[2]')
        curr_url = response.url

        logging.info('to setup item: curr_url: %s', curr_url)

        item = Councilor()
        item['contact_details'] = []
        item['county'] = u'桃園縣'
        item['election_year'] = '2009'
        item['term_start'] = '%s-12-25' % item['election_year']
        item['term_end'] = {'date': '2014-12-25'}
        item['in_office'] = True
        # The name span is expected to hold exactly "<name> <title>".
        item['name'], item['title'] = \
            sel.xpath('//span[@id="ctl04_ctl08_pageControl_LB_MEM_NAME"]/text()').extract()[0].split()
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        img_url = main_node.xpath('.//img[@class="memImg"]/@src').extract()[0]
        item['image'] = urljoin(response.url,
                                urllib.quote(img_url.encode('utf8')))

        logging.info('after image: item: %s', item)

        key_map = {u'學歷': 'education', u'經歷': 'experience'}

        rows = main_node.xpath('.//tr')
        is_contact_info = False
        for row in rows:
            # The icon's alt text identifies what the row contains.
            key = parse.get_extracted(row.xpath('.//img/@alt'))
            if key == u'聯絡資訊':
                is_contact_info = True
            elif key == u'首頁圖示':
                info = parse.get_inner_text(row).split()
                logging.info('info: %s', info)

                # The address may spill into a second token.  Check the token
                # count before indexing so a short row no longer raises
                # IndexError.
                if info:
                    address_str = info[0]
                    if len(info) > 1 and u'電話:' not in info[1]:
                        address_str += info[1]
                    address = re.sub(u'.*服務處.*：', '', address_str).strip()
                    misc.append_contact(item, 'address', '服務處', address)

                for group in info:
                    # Keep what follows the label, up to the first '/'.
                    if u'電話:' in group:
                        tel_val = re.sub(u'/.*', '',
                                         re.sub(u'.*電話:', '', group)).strip()
                        if tel_val:
                            misc.append_contact(item, 'voice', '電話', tel_val)
                    if u'傳真:' in group:
                        fax_val = re.sub(u'/.*', '',
                                         re.sub(u'.*傳真:', '', group)).strip()
                        if fax_val:
                            misc.append_contact(item, 'fax', '傳真', fax_val)

Пример #5

0

Показать файл

Файл: councilors.py Проект: AlcHawk/councilor-voter-guide

    def parse_profile(self, response):
        """Begin building a ``Councilor`` item from a Taoyuan profile page.

        NOTE(review): the snippet is an excerpt. It stops inside the row
        loop without returning, and ``info_node``, ``key_map`` and
        ``is_contact_info`` are not read in the part that is visible.

        :param response: response of the councilor profile page.
        """
        sel = Selector(response)

        main_node = sel.xpath('//table[@class="specpage_data_table"]//table[2]')
        info_node = main_node.xpath('.//table[2]')
        curr_url = response.url

        logging.info('to setup item: curr_url: %s', curr_url)

        item = Councilor()
        item['contact_details'] = []
        item['county'] = u'桃園縣'
        item['election_year'] = '2009'
        item['term_start'] = '%s-12-25' % item['election_year']
        item['term_end'] = {'date': '2014-12-25'}
        item['in_office'] = True
        # Unpacking requires the span text to be exactly two tokens.
        item['name'], item['title'] = \
            sel.xpath('//span[@id="ctl04_ctl08_pageControl_LB_MEM_NAME"]/text()').extract()[0].split()
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        img_url = main_node.xpath('.//img[@class="memImg"]/@src').extract()[0]
        item['image'] = urljoin(response.url, urllib.quote(img_url.encode('utf8')))

        logging.info('after image: item: %s', item)

        key_map = {
            u'學歷': 'education',
            u'經歷': 'experience'
        }

        rows = main_node.xpath('.//tr')
        is_contact_info = False
        for row in rows:
            # Rows are told apart by the alt text of their icon.
            key = parse.get_extracted(row.xpath('.//img/@alt'))
            if key == u'聯絡資訊':
                is_contact_info = True
            elif key == u'首頁圖示':
                info = parse.get_inner_text(row).split()
                logging.info('info: %s', info)

                # Only index tokens that exist: the original read info[0] and
                # info[1] unconditionally and crashed on short rows.
                if info:
                    address_str = info[0]
                    if len(info) > 1 and u'電話:' not in info[1]:
                        address_str += info[1]
                    address = re.sub(u'.*服務處.*：', '', address_str).strip()
                    misc.append_contact(item, 'address', '服務處', address)

                for group in info:
                    # Drop the label and anything from the first '/' onwards.
                    if u'電話:' in group:
                        tel_val = re.sub(u'/.*', '', re.sub(u'.*電話:', '', group)).strip()
                        if tel_val:
                            misc.append_contact(item, 'voice', '電話', tel_val)
                    if u'傳真:' in group:
                        fax_val = re.sub(u'/.*', '', re.sub(u'.*傳真:', '', group)).strip()
                        if fax_val:
                            misc.append_contact(item, 'fax', '傳真', fax_val)

Пример #6

0

Показать файл

Файл: bills.py Проект: theacat/councilor-voter-guide

 def parse_files(self, response):
     sel = Selector(response)
     meta = response.request.meta
     item = meta['item']
     rows = sel.xpath('//table[@bordercolordark="#4292d6"]/tbody/tr')
     for row in rows:
         anchors = row.xpath('.//a')
         if anchors:
             text = parse.get_inner_text(row)
             url = anchors.xpath('@href').extract()
             url = parse.take_first(url)
             print text, url
     return item

Пример #7

0

Показать файл

Файл: councilors.py Проект: theacat/councilor-voter-guide

    def parse_profile(self, response):
        """Build a ``Councilor`` item from a councilor profile page.

        The first text node of the basic-info paragraph is taken as the name.
        The remaining text nodes are ``label：value`` lines mapped through
        ``key_map``.  The nested table supplies platform, contact details and
        extra links.

        :param response: response of the profile page.
        :returns: the populated ``Councilor`` item.
        """
        sel = Selector(response)
        main_node = sel.xpath('/html/body/table/tbody/tr[1]/td/table[2]/tbody')
        basic_info_node = main_node.xpath('tr[1]/td[2]/p')
        sub_table_node = main_node.xpath('.//tbody')
        base_url = self.base_url + '/content/'

        item = Councilor()
        item['contact_details'] = []
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        item['image'] = base_url + parse.get_extracted(sel.xpath(u'//div/img/@src'))

        key_map = {
            u'性別': 'gender',
            u'黨籍': 'party',
            u'選區': 'constituency',
        }

        for i, line in enumerate(basic_info_node.xpath('.//text()').extract()):
            line = line.strip()
            if i == 0:
                item['name'] = line
                continue

            cols = line.split(u'：')
            # Blank or label-less text nodes have no separator; the original
            # raised IndexError on ``cols[1]`` for them.
            if len(cols) < 2:
                continue
            k_chinese = parse.remove_whitespaces(cols[0])
            value = cols[1]

            k_eng = key_map.get(k_chinese)
            if k_eng:
                item[k_eng] = value

        for tr in sub_table_node.xpath('tr'):
            cols = tr.xpath('td')
            # Rows need a label cell and a value cell; skip anything shorter.
            if len(cols) < 2:
                continue
            left = parse.remove_whitespaces(parse.get_extracted(cols[0].xpath('text()')).strip())
            right = parse.get_inner_text(cols[1])

            if left == u'政見':
                item['platform'] = parse.get_inner_text_lines(cols[1])
            if left == u'服務處地址':
                misc.append_contact(item, 'address', left, right)
            if left == u'電子郵件信箱':
                misc.append_contact(item, 'email', left, right)
            if u'電話' in left:
                misc.append_contact(item, 'voice', left, right)
            if u'網址' in left:
                item['links'].append({'url': right, 'note': left})

        return item

Пример #8

0

Показать файл

Файл: bills.py Проект: paullo0106/councilor-voter-guide

    def parse_bill(self, response):
        """Build a ``Bills`` item from one Yilan County bill page.

        The response is re-decoded as Big5.  Fixed fields are read from
        element ids and from the URL query string.  The table rows, grouped
        by ``misc.rows_to_pairs``, fill the mapped fields and the list of
        motions.

        :param response: response of the bill detail page.
        :returns: the populated ``Bills`` item.
        """
        response = parse.get_decoded_response(response, 'Big5')
        sel = Selector(response)

        # convert to list of pairs
        rows = sel.xpath('//tr')
        pairs = misc.rows_to_pairs(rows)

        item = Bills()
        # Raw string so ``\d`` is a regex class, not a string escape.
        item['election_year'] = self.election_year[int(sel.xpath('//span[@id="lbFmotion_expireb"]/text()').re(r'\d+')[0])]
        item['county'] = u'宜蘭縣'
        item['links'] = response.url
        print(response.url)
        get_param = parse_qs(urlparse(response.url).query)
        item['id'] = get_param['Fmotion_instanceOS'][0].decode('Big5')
        item['proposed_by'] = re.sub(u'、', ' ', sel.xpath('//*[@id="lbFmotion_People"]/text()').extract()[0]).split()
        petitioned_by = sel.xpath('//*[@id="lbFmotion_AddTo"]/text()').extract()
        item['petitioned_by'] = re.sub(u'、', ' ', petitioned_by[0]).split() if petitioned_by else []
        item['motions'] = []
        main_title = parse.get_inner_text(sel.xpath('//font[@color="#800000"]'), remove_white=True)
        # Default to None so the name is always bound; the original raised
        # UnboundLocalError below when the title did not match.
        # NOTE(review): confirm misc.append_motion accepts a None sitting.
        main_sitting = None
        m = re.match(u'宜蘭縣議會(.*)議案資料', main_title)
        if m:
            main_sitting = m.group(1)

        # Page labels copied straight into item fields.
        k_map = {
            u'來源別':'type',
            # u'建檔日期':'',
            # u'議案程序':'',
            # u'系統編號':'',
            u'案號': 'bill_no',
            u'類別': 'category',
            # u'小組':'',
            u'案由': 'abstract',
            # u'法規名稱':'',
            u'辦法': 'methods',
            u'理由': 'description',
            # u'附件':'',
            # u'審議日期':'',
            # u'大會決議':'',
        }

        curr_motion = None
        for i, pair in enumerate(pairs):
            n = len(pair)
            if n < 2:
                if n == 1:
                    td = pair[0]
                    text = parse.get_inner_text(td, remove_white=True)
                    if td.xpath(u'.//img[@alt="小圖示"]'):
                        # An icon cell opens a new motion section, except for
                        # the abstract/method/reason/attachment header.
                        if text != u'案由、辦法、理由及附件':
                            if curr_motion:
                                item['motions'].append(curr_motion)
                            curr_motion = {'motion': text}
                    elif curr_motion is not None and not curr_motion.get('sitting'):
                        curr_motion['sitting'] = ' '.join(td.xpath('.//span/text()').extract())

                continue

            k_raw, v_raw = pair
            k = parse.get_inner_text(k_raw, remove_white=True)
            v = parse.get_inner_text(v_raw)
            k_eng = k_map.get(k)

            if k_eng:
                item[k_eng] = v
            elif k == u'建檔日期':
                misc.append_motion(item, u'建檔', None, v, main_sitting)

            if curr_motion is not None:
                # Inside a motion: a "date" label sets the date, and any later
                # pair overwrites the resolution.
                if u'日期' in k:
                    curr_motion['date'] = v
                elif 'date' in curr_motion:
                    curr_motion['resolution'] = v

        if curr_motion:
            item['motions'].append(curr_motion)

        return item

Пример #9

0

Показать файл

Файл: bills.py Проект: theacat/councilor-voter-guide

    def parse_bill(self, response):
        """Build a ``Bills`` item from one Yilan County bill page.

        The response is re-decoded as Big5.  The table rows, grouped by
        ``misc.rows_to_pairs``, fill the mapped item fields and the list of
        motions.

        :param response: response of the bill detail page.
        :returns: the populated ``Bills`` item.
        """
        response = parse.get_decoded_response(response, 'Big5')
        sel = Selector(response)

        # convert to list of pairs
        rows = sel.xpath('//tr')
        pairs = misc.rows_to_pairs(rows)

        item = Bills()
        item['links'] = response.url
        item['motions'] = []
        main_title = parse.get_inner_text(
            sel.xpath('//font[@color="#800000"]'), remove_white=True)
        # Bind the name up front: without a title match the original hit
        # UnboundLocalError when the filing-date row was processed.
        # NOTE(review): confirm misc.append_motion accepts a None sitting.
        main_sitting = None
        m = re.match(u'宜蘭縣議會(.*)議案資料', main_title)
        if m:
            main_sitting = m.group(1)

        # Page labels mapped to item fields.
        k_map = {
            # u'來源別':'',
            # u'建檔日期':'',
            # u'議案程序':'',
            # u'系統編號':'',
            u'動議人': 'proposed_by',
            u'提案單位': 'proposed_by',
            u'案號': 'bill_no',
            u'附議人': 'petitioned_by',
            u'類別': 'category',
            # u'小組':'',
            u'案由': 'abstract',
            # u'法規名稱':'',
            u'辦法': 'methods',
            u'理由': 'description',
            # u'附件':'',
            # u'審議日期':'',
            # u'大會決議':'',
        }

        curr_motion = None
        for i, pair in enumerate(pairs):
            n = len(pair)
            if n < 2:
                if n == 1:
                    td = pair[0]
                    text = parse.get_inner_text(td, remove_white=True)
                    if td.xpath(u'.//img[@alt="小圖示"]'):
                        # An icon cell opens a new motion section, except for
                        # the abstract/method/reason/attachment header.
                        if text != u'案由、辦法、理由及附件':
                            if curr_motion:
                                item['motions'].append(curr_motion)
                            curr_motion = {'motion': text}
                    elif curr_motion is not None and not curr_motion.get(
                            'sitting'):
                        curr_motion['sitting'] = ' '.join(
                            td.xpath('.//span/text()').extract())

                continue

            k_raw, v_raw = pair
            k = parse.get_inner_text(k_raw, remove_white=True)
            v = parse.get_inner_text(v_raw)
            k_eng = k_map.get(k)

            if k_eng:
                # People fields are stored as lists of names.
                if k_eng in ('petitioned_by', 'proposed_by'):
                    item[k_eng] = v.split()
                else:
                    item[k_eng] = v
            elif k == u'建檔日期':
                misc.append_motion(item, u'建檔', None, v, main_sitting)

            if curr_motion is not None:
                # Inside a motion: a "date" label sets the date, and any later
                # pair overwrites the resolution.
                if u'日期' in k:
                    curr_motion['date'] = v
                elif 'date' in curr_motion:
                    curr_motion['resolution'] = v

        if curr_motion:
            item['motions'].append(curr_motion)

        return item

Пример #10

0

Показать файл

Файл: councilors.py Проект: alecchen/councilor-voter-guide

    def parse_profile(self, response):
        """Build a ``Councilor`` item from a Taoyuan profile page.

        Rows of the info table are classified by the alt text of their icon.
        Once the contact-information marker row has been seen, every later
        row with a value is parsed as a ``label：value`` contact entry.
        Before that, rows feed education/experience or the constituency.

        :param response: response of the profile page.
        :returns: the populated ``Councilor`` item.
        """
        sel = Selector(response)

        main_node = sel.xpath('//table[@class="specpage_data_table"]//table[2]')
        info_node = main_node.xpath('.//table[2]')
        curr_url = response.url

        item = Councilor()
        item['contact_details'] = []
        item['name'] = \
            info_node.xpath('.//span[@id="ctl04_ctl08_pageControl_LB_MEM_NAME"]/text()').extract()[0].split()[0]
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        img_url = main_node.xpath('.//img[@class="memImg"]/@src').extract()[0]
        item['image'] = urljoin(curr_url, img_url)

        key_map = {
            u'學歷': 'education',
            u'經歷': 'experience'
        }

        county = u'桃園縣'
        rows = info_node.xpath('.//tr')
        is_contact_info = False
        for row in rows:
            key = parse.get_extracted(row.xpath('.//img/@alt'))
            if key == u'聯絡資訊':
                is_contact_info = True
            elif key == u'首頁圖示':
                info = parse.get_inner_text(row).split()
                for group in info:
                    # maxsplit=1 yields at most two parts, so a value that
                    # itself contains '：' no longer breaks the unpacking.
                    split = group.split(u'：', 1)
                    if len(split) > 1:
                        left, right = split
                        misc.append_contact(item, 'address', left, right)

            td = row.xpath('./td[2]')
            value = parse.get_inner_text(td)
            if not value:
                continue

            k_eng = key_map.get(key)
            if is_contact_info:
                parts = value.split(u'：', 1)
                if len(parts) < 2:
                    # Not a "label：value" row; the original raised ValueError.
                    continue
                left, right = parts
                url = parse.get_extracted(row.xpath('.//a/@href'))
                if left == 'EMAIL' and url:
                    # ``str.lstrip('mailto://')`` strips a *set of characters*
                    # and ate leading letters of the address (e.g. "tom@..."
                    # became "@...").  Remove the scheme as a prefix instead.
                    if url.startswith('mailto:'):
                        url = url[len('mailto:'):].lstrip('/')
                    for addr in url.split(';'):
                        misc.append_contact(item, 'email', left, addr.strip())
                if left == u'聯絡電話':
                    for x in right.split(';'):
                        misc.append_contact(item, 'voice', left, x.strip())
                if left == u'傳真':
                    for x in right.split(';'):
                        misc.append_contact(item, 'fax', left, x.strip())
                if left in [u'部落格', u'FACEBOOK', u'臉書']:
                    item['links'].append({'url': url, 'note': left})
            elif k_eng:
                values = parse.get_inner_text_lines(td)
                values = [parse.remove_whitespaces(v) for v in values]
                item[k_eng] = values
            elif key == u'選區':
                # Value is "<constituency> [<district>]".
                split = value.split()
                item['county'] = county
                item['district'] = split[1] if len(split) > 1 else ''
                item['constituency'] = county + split[0]

        return item

Пример #11

0

Показать файл

Файл: councilors.py Проект: AlcHawk/councilor-voter-guide

    def parse_profile(self, response):
        """Complete the ``Councilor`` item carried in ``meta`` from a profile page.

        The response is re-decoded as Big5.  Name and title come from the
        ``td.w06`` cell, education and portrait from the ``th.w02`` row
        labelled 學歷, and the ``tr.line_02`` rows are dispatched through the
        module-level ``_key_map``.

        :param response: profile page response; ``request.meta['item']``
            must hold the item to fill in.
        :returns: the same item, populated.
        """
        response = parse.get_decoded_response(response, 'Big5')
        sel = Selector(response)
        name_node = sel.xpath('//td[@class="w06"]')
        logging.warning('name_node: %s', name_node)
        name_str = parse.get_inner_text(name_node)

        logging.warning('name_str: %s', name_str)

        item = response.request.meta['item']
        item['county'] = u'新竹縣'
        item['election_year'] = '2009'
        item['term_start'] = '%s-12-25' % item['election_year']
        item['term_end'] = {'date': '2014-12-25'}
        item['in_office'] = True
        item['name'] = name_str.split('-')[-1]
        # Fall back to the plain councilor title when the heading carries
        # none; the original called ``.group()`` on a possible None.
        title_match = re.search(u'(副?議長|議員)', name_str)
        item['title'] = title_match.group() if title_match else u'議員'

        w02_nodes = sel.xpath('//th[@class="w02"]')
        for each_node in w02_nodes:
            key = parse.get_inner_text(each_node).strip()
            logging.warning('w02_node: key: %s', key)
            if key != u'學歷':
                continue
            education_node = each_node.xpath('../td')
            education_str = parse.get_inner_text(education_node)
            logging.warning('key: %s education_str: %s', key, education_str)
            item['education'] = education_str.split('\n')

            # The portrait is located relative to the education row.
            image_node = each_node.xpath('../../../../td[2]/img/@src')
            image_str = parse.get_extracted(image_node)

            logging.warning('key: %s education_str: %s image_str: %s', key, education_str, image_str)
            item['image'] = urljoin(response.url, urllib.quote(image_str.encode('utf8')))

        main_nodes = sel.xpath('//tr[@class="line_02"]')

        contact_details = []
        links = [{'url': response.url, 'note': u'議會個人官網'}]
        for each_node in main_nodes:
            key = parse.get_inner_text(each_node.xpath('./th'))

            # Reject unknown labels before extracting a value for them.
            if key not in _key_map:
                logging.error('key not in _key_map!: key: %s', key)
                continue
            item_key = _key_map[key]

            if item_key == 'experience':
                val_nodes = each_node.xpath('./td/ol/li')
                if val_nodes:
                    # List form: one entry per <li>, full stops and spaces
                    # removed.
                    val = [re.sub(u' ', '', re.sub(u'。', '', parse.get_inner_text(each_each_node))) for each_each_node in val_nodes]
                else:
                    # Plain-text form: one entry per line.
                    val = parse.get_inner_text(each_node.xpath('./td')).split("\n")
                    val = [re.sub(u' ', '', each_val) for each_val in val]
            elif item_key == 'platform':
                val_nodes = each_node.xpath('./td/ol/li')
                val = [re.sub(u' ', '', parse.get_inner_text(each_each_node)) for each_each_node in val_nodes]
            else:
                val = parse.get_inner_text(each_node.xpath('./td'))

            if item_key in ['email', 'address', 'voice']:
                contact_details.append({"type": item_key, "value": val, "label": key})
            elif item_key in ['link']:
                # Replace a leading ".." with the site root.
                val = re.sub(u'^\\.\\.', 'http://www.hcc.gov.tw', val)
                links.append({"url": val, "note": key})
            else:
                item[item_key] = val

            logging.warning('key: %s val: %s item_key: %s', key, val, item_key)

        item['contact_details'] = contact_details
        item['links'] = links

        return item

Пример #12

0

Показать файл

class Spider(scrapy.Spider):
    """Crawl councilor profile pages starting from the tycc.gov.tw district map.

    Flow: ``parse`` (district map) -> ``parse_selection_index`` (member
    list) -> ``parse_profile`` (one councilor).
    """

    name = "councilors"
    start_urls = [
        "http://www.tycc.gov.tw/page.aspx?wtp=1&wnd=204",
    ]
    download_delay = 0.5

    def parse(self, response):
        """Request the member list of every area linked from the image map."""
        sel = Selector(response)
        urls = sel.xpath('//map/area/@href').extract()
        for url in urls:
            url = urljoin(response.url, url)
            yield Request(url, callback=self.parse_selection_index)

        # XXX hack for correcting information

        special_urls = [
            "http://www.tycc.gov.tw/page.aspx?wtp=1&wnd=204&town=%E5%B1%B1%E5%9C%B0%E5%8E%9F%E4%BD%8F%E6%B0%91",
            "http://www.tycc.gov.tw/page.aspx?wtp=1&wnd=204&page=2&town=%E7%AC%AC%E4%B8%80%E9%81%B8%E5%8D%80"
        ]
        for special_url in special_urls:
            yield Request(special_url, callback=self.parse_selection_index)

    def parse_selection_index(self, response):
        """Request the profile page of every councilor linked in the list panel."""
        sel = Selector(response)
        urls = sel.xpath(
            '//div[@id="ctl04_ctl08_pageControl_PN_LIST"]//a/@href').extract()
        for url in urls:
            url = urljoin(response.url, url)
            logging.info('to request id: url: %s', url)
            yield Request(url, callback=self.parse_profile)

    def parse_profile(self, response):
        """Fill a ``Councilor`` item from one profile page.

        NOTE(review): this is an excerpt. The visible code ends inside the
        row loop and never returns the item; ``info_node``, ``key_map`` and
        ``k_eng`` are assigned but not read in the visible part.
        """
        sel = Selector(response)

        main_node = sel.xpath(
            '//table[@class="specpage_data_table"]//table[2]')
        info_node = main_node.xpath('.//table[2]')
        curr_url = response.url

        logging.info('to setup item: curr_url: %s', curr_url)

        item = Councilor()
        item['contact_details'] = []
        item['county'] = u'桃園縣'
        item['election_year'] = '2009'
        item['term_start'] = '%s-12-25' % item['election_year']
        item['term_end'] = {'date': '2014-12-25'}
        item['in_office'] = True
        # The name span is expected to hold exactly "<name> <title>".
        item['name'], item['title'] = \
            sel.xpath('//span[@id="ctl04_ctl08_pageControl_LB_MEM_NAME"]/text()').extract()[0].split()
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        img_url = main_node.xpath('.//img[@class="memImg"]/@src').extract()[0]
        item['image'] = urljoin(response.url,
                                urllib.quote(img_url.encode('utf8')))

        logging.info('after image: item: %s', item)

        key_map = {u'學歷': 'education', u'經歷': 'experience'}

        rows = main_node.xpath('.//tr')
        is_contact_info = False
        for row in rows:
            # The icon's alt text identifies what the row contains.
            key = parse.get_extracted(row.xpath('.//img/@alt'))
            if key == u'聯絡資訊':
                is_contact_info = True
            elif key == u'首頁圖示':
                info = parse.get_inner_text(row).split()
                logging.info('info: %s', info)

                # Check the token count before indexing: the original read
                # info[0] and info[1] unconditionally and raised IndexError
                # on short rows.
                if info:
                    address_str = info[0]
                    if len(info) > 1 and u'電話:' not in info[1]:
                        address_str += info[1]
                    address = re.sub(u'.*服務處.*：', '', address_str).strip()
                    misc.append_contact(item, 'address', '服務處', address)

                for group in info:
                    # Keep what follows the label, up to the first '/'.
                    if u'電話:' in group:
                        tel_val = re.sub(u'/.*', '',
                                         re.sub(u'.*電話:', '', group)).strip()
                        if tel_val:
                            misc.append_contact(item, 'voice', '電話', tel_val)
                    if u'傳真:' in group:
                        fax_val = re.sub(u'/.*', '',
                                         re.sub(u'.*傳真:', '', group)).strip()
                        if fax_val:
                            misc.append_contact(item, 'fax', '傳真', fax_val)

            td = row.xpath('./td[2]')
            value = parse.get_inner_text(td)
            if not value:
                continue

            logging.info(
                'contact_info: key: %s value: %s td: %s is_contact_info: %s',
                key, value, td, is_contact_info)

            k_eng = key_map.get(key)
            if is_contact_info:
                # After the contact marker row, links and e-mail addresses are
                # read from spans with fixed ids.
                blog_url = td.xpath(
                    './/span[@id="ctl04_ctl08_pageControl_LB_MEM_BLOG"]/a/@href'
                ).extract()
                if blog_url:
                    blog_url = blog_url[0].strip()
                    logging.info('blog_url: %s dir: %s', blog_url,
                                 dir(blog_url))
                    item['links'].append({"url": blog_url, "note": "部落格"})

                facebook_url = td.xpath(
                    './/span[@id="ctl04_ctl08_pageControl_LB_MEM_FACEBOOK"]/a/@href'
                ).extract()
                if facebook_url:
                    facebook_url = facebook_url[0].strip()
                    logging.info('facebook_url: %s', facebook_url)
                    item['links'].append({"url": facebook_url, "note": "臉書"})

                emails = td.xpath(
                    './/span[@id="ctl04_ctl08_pageControl_LB_MEM_EMAIL"]/a/@href'
                ).extract()
                if emails:
                    # One href may list several ';'-separated addresses.
                    emails = emails[0]
                    emails = emails.split(';')
                    emails = [
                        re.sub(u'^mailto://', '', email.strip())
                        for email in emails
                    ]
                    logging.info('emails: %s', emails)
                    for each_email in emails:
                        misc.append_contact(item, 'email', 'EMAIL', each_email)

Пример #13

0

Показать файл

Файл: councilors.py Проект: theacat/councilor-voter-guide

    def parse_profile(self, response):
        """Populate the ``Councilor`` item passed in ``meta`` from a profile page.

        The response is re-decoded as Big5.  Name and title are read from the
        ``td.w06`` cell, education and portrait from the ``th.w02`` row
        labelled 學歷, and each ``tr.line_02`` row is routed through the
        module-level ``_key_map``.

        :param response: profile page response; ``request.meta['item']``
            must hold the item to fill in.
        :returns: the same item, populated.
        """
        response = parse.get_decoded_response(response, 'Big5')
        sel = Selector(response)
        name_node = sel.xpath('//td[@class="w06"]')
        logging.warning('name_node: %s', name_node)
        name_str = parse.get_inner_text(name_node)

        logging.warning('name_str: %s', name_str)

        item = response.request.meta['item']
        item['county'] = u'新竹縣'
        item['election_year'] = '2009'
        item['term_start'] = '%s-12-25' % item['election_year']
        item['term_end'] = {'date': '2014-12-25'}
        item['in_office'] = True
        item['name'] = name_str.split('-')[-1]
        # ``re.search`` returns None when the heading has no title; default
        # to the plain councilor title rather than raising AttributeError.
        title_match = re.search(u'(副?議長|議員)', name_str)
        item['title'] = title_match.group() if title_match else u'議員'

        w02_nodes = sel.xpath('//th[@class="w02"]')
        for each_node in w02_nodes:
            key = parse.get_inner_text(each_node).strip()
            logging.warning('w02_node: key: %s', key)
            if key != u'學歷':
                continue
            education_node = each_node.xpath('../td')
            education_str = parse.get_inner_text(education_node)
            logging.warning('key: %s education_str: %s', key, education_str)
            item['education'] = education_str.split('\n')

            # The portrait is located relative to the education row.
            image_node = each_node.xpath('../../../../td[2]/img/@src')
            image_str = parse.get_extracted(image_node)

            logging.warning('key: %s education_str: %s image_str: %s', key,
                            education_str, image_str)
            item['image'] = urljoin(response.url,
                                    urllib.quote(image_str.encode('utf8')))

        main_nodes = sel.xpath('//tr[@class="line_02"]')

        contact_details = []
        links = [{'url': response.url, 'note': u'議會個人官網'}]
        for each_node in main_nodes:
            key = parse.get_inner_text(each_node.xpath('./th'))

            # Unknown labels are logged and skipped before any value is
            # extracted for them.
            if key not in _key_map:
                logging.error('key not in _key_map!: key: %s', key)
                continue
            item_key = _key_map[key]

            if item_key == 'experience':
                val_nodes = each_node.xpath('./td/ol/li')
                if val_nodes:
                    # List form: one entry per <li>, full stops and spaces
                    # removed.
                    val = [
                        re.sub(
                            u' ', '',
                            re.sub(u'。', '',
                                   parse.get_inner_text(each_each_node)))
                        for each_each_node in val_nodes
                    ]
                else:
                    # Plain-text form: one entry per line.
                    val = parse.get_inner_text(
                        each_node.xpath('./td')).split("\n")
                    val = [re.sub(u' ', '', each_val) for each_val in val]
            elif item_key == 'platform':
                val_nodes = each_node.xpath('./td/ol/li')
                val = [
                    re.sub(u' ', '', parse.get_inner_text(each_each_node))
                    for each_each_node in val_nodes
                ]
            else:
                val = parse.get_inner_text(each_node.xpath('./td'))

            if item_key in ['email', 'address', 'voice']:
                contact_details.append({
                    "type": item_key,
                    "value": val,
                    "label": key
                })
            elif item_key in ['link']:
                # Replace a leading ".." with the site root.
                val = re.sub(u'^\\.\\.', 'http://www.hcc.gov.tw', val)
                links.append({"url": val, "note": key})
            else:
                item[item_key] = val

            logging.warning('key: %s val: %s item_key: %s', key, val, item_key)

        item['contact_details'] = contact_details
        item['links'] = links

        return item

Python get_inner_text примеры использования