Example #1
 def extract_topics(cell):
     topics = []
     for li in cell.xpath('.//li'):
         topics.append(clean_topic(extract_text(li)))
     if len(topics) == 0:
         topics.append(clean_topic(extract_text(cell)))
     return topics
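A minimal usage sketch for the helper above, assuming extract_topics is in scope. The stand-ins for extract_text and clean_topic and the HTML fragments are illustrative assumptions, not the project's real implementations:

from scrapy.selector import Selector

def extract_text(sel):
    # stand-in: join all text nodes under the selector (assumption)
    return ''.join(sel.xpath('.//text()').getall())

def clean_topic(text):
    # stand-in: trim surrounding whitespace (assumption)
    return text.strip()

cell = Selector(text='<table><tr><td><ul><li>予算</li><li>決算</li></ul></td></tr></table>').xpath('//td')[0]
print(extract_topics(cell))  # ['予算', '決算']

cell = Selector(text='<table><tr><td>国政調査</td></tr></table>').xpath('//td')[0]
print(extract_topics(cell))  # ['国政調査'] -- falls back to the cell text when no <li> is present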
    def parse_keika(self, response):
        url = build_url(response.url,
                        title=UrlTitle.IINKAI_KEIKA,
                        domain=self.domain)
        self.gql_client.merge(url)

        contents = response.xpath('//div[@id="ContentsBox"]')
        h2_text = contents.xpath('.//h2/text()').get()
        assert h2_text[-2:] == '経過'
        committee_name = '参議院' + h2_text[:-2]

        h4_list = contents.xpath('./h4')
        pre_list = contents.xpath('./pre')
        assert len(h4_list) == len(pre_list)
        for h4, pre in zip(h4_list, pre_list):
            dt = DateConverter.convert(extract_text(h4))
            summary = ''.join(extract_text(pre).strip().split())
            if '誤りにつき訂正' in summary:
                LOGGER.warning(f'skip non summary: {summary}')
                continue
            minutes_list = self.minutes_finder.find(committee_name, dt)
            if len(minutes_list) != 1:
                LOGGER.warning(
                    f'found {len(minutes_list)} Minutes that match with ({committee_name}, {dt}): {minutes_list}'
                )
            for minutes in minutes_list:
                minutes.summary = summary
                self.gql_client.merge(minutes)
                self.gql_client.link(url.id, minutes.id)
 def parse_meisai_table(table):
     data = dict()
     for row in table.xpath('./tr'):
         key = extract_text(row.xpath('./th'))
         val = extract_text(row.xpath('./td'))
         data[key] = val
     return data
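A sketch of how parse_meisai_table (defined above) turns a th/td definition table into a dict. The extract_text stand-in and the sample row are assumptions for illustration only:

from scrapy.selector import Selector

def extract_text(sel):
    # stand-in: join all text nodes and trim (assumption)
    return ''.join(sel.xpath('.//text()').getall()).strip()

html = '<table><tr><th>議案件名</th><td>サンプル法案</td></tr></table>'
table = Selector(text=html).xpath('//table')[0]
print(parse_meisai_table(table))  # {'議案件名': 'サンプル法案'}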
    def scrape_bills_and_urls_from_table(self, table, bill_category,
                                         response_url):
        bills, urls = [], []
        for row in table.xpath('./tr')[1:]:  # skip header
            cells = row.xpath('./td')
            assert len(cells) == 5

            # build Bill instance with necessary info
            try:
                diet_number = int(extract_text(cells[0]))
                submission_number = int(extract_text(cells[1]))
                bill_name = extract_text(cells[2])
            except Exception as e:
                LOGGER.warning(f'failed to parse row:\n{row.get()}\n{e}')
                continue
            bill = build_bill(bill_category, diet_number, submission_number,
                              bill_name)
            bills.append(bill)

            # build meisai URL if exists
            maybe_meisai_href = extract_full_href_or_none(
                cells[2], response_url)
            if maybe_meisai_href:
                url = build_url(maybe_meisai_href, UrlTitle.GIAN_ZYOUHOU,
                                self.domain)
                url.meta = {'bill_id': bill.id}
                urls.append(url)

        return bills, urls
Example #5
    def scrape_bills_and_urls_from_table(table, response_url):
        def get_bill_category_or_none(caption):
            if caption == '閣法の一覧':
                return BillCategory.KAKUHOU
            elif caption == '衆法の一覧':
                return BillCategory.SHUHOU
            elif caption == '参法の一覧':
                return BillCategory.SANHOU
            else:
                return None

        bills, urls = [], []

        caption = extract_text(table.xpath('./caption')).strip()
        maybe_bill_category = get_bill_category_or_none(caption)
        if not maybe_bill_category:
            return bills, urls
        bill_category = maybe_bill_category

        for row in table.xpath('./tr')[1:]:  # skip header
            cells = row.xpath('./td')
            assert len(cells) == 6

            # build Bill instance with necessary info
            try:
                diet_number = int(extract_text(cells[0]))
                submission_number = int(extract_text(cells[1]))
                bill_name = extract_text(cells[2])
            except Exception as e:
                LOGGER.warning(f'failed to parse row:\n{row.get()}\n{e}')
                continue
            bill = build_bill(bill_category, diet_number, submission_number,
                              bill_name)
            bills.append(bill)

            # build keika URL if exists
            maybe_keika_href = extract_full_href_or_none(
                cells[4], response_url)
            if maybe_keika_href:
                url = build_url(maybe_keika_href, UrlTitle.KEIKA,
                                ShugiinSpider.domain)
                url.meta = {'bill_id': bill.id}
                urls.append(url)

            # build honbun URL if exists
            maybe_honbun_href = extract_full_href_or_none(
                cells[5], response_url)
            if maybe_honbun_href:
                url = build_url(maybe_honbun_href, UrlTitle.HONBUN,
                                ShugiinSpider.domain)
                url.meta = {'bill_id': bill.id}
                urls.append(url)

        return bills, urls
    def parse_minutes(self, response):
        # merge url if exists
        maybe_href = extract_full_href_or_none(response.xpath('//h4'),
                                               response.url)
        if not maybe_href:
            LOGGER.warning(f'failed to find url in {response.url}')
            return
        url = build_url(maybe_href,
                        title=UrlTitle.GAIYOU_PDF,
                        domain=self.domain)
        self.gql_client.merge(url)
        LOGGER.debug(f'merged {url.id}')

        # link to minutes
        title = extract_text(response.xpath('//title'))
        committee_name = response.meta['committee_name']
        date_time = self.extract_datetime_from_title(title)
        minutes = build_minutes(committee_name, date_time)
        try:
            self.gql_client.get(minutes.id,
                                ['id'])  # minutes should already exist
            self.gql_client.link(url.id, minutes.id)
        except GraphQLException:
            LOGGER.warning(
                f'failed to find minutes ({committee_name}, {date_time})')
    def scrape_bills_and_urls(self, response):
        def get_bill_category_or_none(caption):
            if caption == '法律案(内閣提出)一覧':
                return BillCategory.KAKUHOU
            elif caption == '法律案(衆法)一覧':
                return BillCategory.SHUHOU
            elif caption == '法律案(参法)一覧':
                return BillCategory.SANHOU
            else:
                return None

        bills, urls = [], []

        div = response.xpath('//div[@id="ContentsBox"]')[0]
        tables = div.xpath('./table')
        captions = [extract_text(x) for x in div.css('h2.title_text')]
        assert len(tables) == len(captions)

        for table, caption in zip(tables, captions):
            maybe_bill_category = get_bill_category_or_none(caption)
            if maybe_bill_category:
                res = self.scrape_bills_and_urls_from_table(
                    table, maybe_bill_category, response.url)
                bills.extend(res[0])
                urls.extend(res[1])
        return bills, urls
Example #8
 def scrape_committees_from_table(table):
     committees = []
     for row in table.xpath('.//tr')[1:]:  # skip header
         cells = row.xpath('.//td')
         assert len(cells) == 3
         try:
             committee_name = '衆議院' + extract_text(cells[0]).strip()
             num_members = int(extract_text(cells[1]).replace('人', ''))
             topics = ShugiinCommitteeSpider.extract_topics(cells[2])
         except Exception as e:
             LOGGER.warning(f'failed to parse row:\n{row.get()}\n{e}')
             continue
         committee = build_committee(committee_name, 'REPRESENTATIVES')
         committee.num_members = num_members
         committee.topics = topics
         committees.append(committee)
     return committees
 def scrape_topics_list(div):
     ret = []
     for oul in div.css('ol, ul'):
         topics = []
         for li in oul.css('li'):
             topics.append(clean_topic(extract_text(li)))
         ret.append(topics)
     return ret
 def scrape_committees_from_table(table, root_url):
     committees = []
     for row in table.xpath('./tr'):
         for cell in row.xpath('./td'):
             committee = Committee(None)
             committee.name = '衆議院' + extract_text(cell.xpath('./span/a'))
             committee.url = extract_full_href_or_none(cell, root_url)
             committees.append(committee)
     return committees
 def scrape_num_members_list(div):
     ret = []
     for p in div.css('p'):
         text = extract_text(p)
         pattern = r'委員数:([0-9]+)人'
         match = re.fullmatch(pattern, text)
         if match:
             ret.append(int(match.group(1)))
     return ret
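A quick check of the regular expression used above; the sample strings are illustrative, not taken from the scraped pages:

import re

pattern = r'委員数:([0-9]+)人'
print(re.fullmatch(pattern, '委員数:45人').group(1))  # '45'
print(re.fullmatch(pattern, '委員長について'))  # None -- paragraphs without a member count are skipped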
Example #12
 def extract_urls(self, cell):
     urls = []
     for a in cell.xpath('.//a'):
         text = extract_text(a)
         href = urljoin(self.start_urls[0], a.xpath('./@href').get())
         if '概要' in text:
             urls.append(build_url(href, UrlTitle.GAIYOU_PDF, self.domain))
         elif '新旧' in text:
             urls.append(build_url(href, UrlTitle.SINKYU_PDF, self.domain))
     return urls
Example #13
 def parse_table(self, table, bill_category=None, diet_number=None):
     for row in table.xpath('.//tr'):
         cells = row.xpath('.//td')
         if len(cells) > max(self.bill_col, self.url_col):
             try:
                 bill_query = extract_text(cells[self.bill_col]).strip()
                 urls = self.extract_urls(cells[self.url_col])
                 LOGGER.info(f'scraped {len(urls)} urls for {bill_query}')
                 self.store_urls_for_bill(urls, bill_query, bill_category, diet_number)
             except Exception as e:
                 LOGGER.warning(f'failed to parse {row}: {e}')
                 continue
Example #14
 def parse_table(self, response):
     table = response.xpath('//table')[self.table_idx]
     for row in table.xpath('.//tr'):
         cells = row.xpath('.//td')
         if len(cells) > max(self.bill_col, self.url_col):
             try:
                 bill_query = extract_text(cells[self.bill_col]).strip()
                 urls = self.extract_urls(cells[self.url_col])
                 self.store_urls_for_bill(urls, bill_query)
                 LOGGER.info(f'scraped {len(urls)} urls for {bill_query}')
             except Exception as e:
                 LOGGER.warning(f'failed to parse {row}: {e}')
                 continue
Example #15
    def scrape_members_and_urls(self, response):
        members, urls = [], []
        table = response.xpath('//table[@summary="議員一覧(50音順)"]')[0]
        for row in table.xpath('./tr')[1:]:  # skip header
            cells = row.xpath('./td')
            assert len(cells) == 6

            name = ''.join(extract_text(cells[0]).strip().split())
            tags = [  # store faction (会派) and electoral district (選挙区) as tags for now
                extract_text(cells[2]).strip(),
                extract_text(cells[3]).strip()
            ]
            member = build_member(name)
            member.tags = tags
            member.house = 'COUNCILORS'
            members.append(member)

            maybe_href = extract_full_href_or_none(cells[0], response.url)
            if maybe_href:
                url = build_url(maybe_href, UrlTitle.GIIN_ZYOUHOU, self.domain)
                url.meta = {'member_id': member.id}
                urls.append(url)
        return members, urls
 def scrape_name_list(div):
     ret = []
     # h4 headers carrying all three classes (ta_l, mt20, fl_l)
     for h4 in div.css('h4.ta_l.mt20.fl_l'):
         name = '参議院' + extract_text(h4).strip()
         ret.append(name)
     return ret