Exemplo n.º 1
0
def push_item(json_list, item: DataItem, title, name):
    """
    Copy one abstract entry from a detail-page payload onto ``item``.

    Walks ``json_list['abstractInfoDTO']['abstractItemList']`` and, for the
    first entry whose ``indexCnName`` contains ``name``, stores
    ``ResultItem(title=name, value=<entry's 'value'>)`` on ``item`` under the
    attribute named by ``title``. When nothing matches, or the payload lacks
    the expected keys, ``item`` is returned unchanged.

    :param json_list: mapping accessed with ``.get`` (despite the name)
    :param item: object that receives the attribute
    :param title: name of the attribute to set on ``item``
    :param name: substring searched for in each entry's ``indexCnName``
    :return: the same ``item`` instance that was passed in
    """
    # A missing or null section is treated as "no entries" instead of
    # failing with AttributeError on None.
    abstract_info = json_list.get('abstractInfoDTO') or {}
    for entry in abstract_info.get('abstractItemList') or ():
        # An entry without a label cannot match; compare against ''.
        if name in (entry.get('indexCnName') or ''):
            setattr(item, title,
                    ResultItem(title=name, value=entry.get('value')))
            break
    return item
Exemplo n.º 2
0
    def parse_not_first_page(self, response):
        """
        Parse a search-result page other than the first one.

        Every element with class ``item`` is re-parsed on its own, passed to
        each crawler registered under ``url_page_turning['crawler_id']`` and
        then converted into a follow-up request via ``turn_to_request``.

        :param response: response providing the HTML and ``meta['sipo']``
        :return: generator yielding the results of ``self.turn_to_request``
        """
        sipo = response.meta['sipo']
        # ``response.text`` replaces the deprecated ``body_as_unicode()``.
        soup = BeautifulSoup(response.text, 'lxml')
        for item in soup.find_all(attrs={"class": "item"}):
            data_item = DataItem()
            # Render the markup once; it feeds both the per-item soup and
            # every crawler below.
            item_html = item.prettify()
            item_soup = BeautifulSoup(item_html, 'lxml')
            patent_id = item_soup.find(attrs={'name': 'idHidden'}).get('value')
            nrd_an = item_soup.find(attrs={'name': 'nrdAnHidden'}).get('value')
            nrd_pn = item_soup.find(attrs={'name': 'nrdPnHidden'}).get('value')

            for crawler in info.crawler_dict.get(
                    url_page_turning.get('crawler_id')):
                crawler.parse(item_html, data_item, item_soup)

            yield self.turn_to_request(int(url_page_turning.get('crawler_id')),
                                       patent_id=patent_id,
                                       nrdPn=nrd_pn,
                                       nrdAn=nrd_an,
                                       sipo=sipo,
                                       data_item=data_item)
Exemplo n.º 3
0
    def parse_not_first_page(self, response):
        """
        Parse a search-result page other than the first one.

        Wraps the response text in ``SearchResultUtil``, and for every record
        it lists runs the crawlers registered under key ``'0'`` before
        yielding a follow-up request built by ``turn_to_request``.

        :param response: response providing the body and ``meta['sipo']``
        :return: generator yielding the results of ``self.turn_to_request``
        """
        # ``response.text`` replaces the deprecated ``body_as_unicode()``.
        body = response.text
        sipo = response.meta['sipo']
        page_result = SearchResultUtil(body)

        # Handle the details of each record
        for raw_record in page_result.get_searchResultRecord_list():
            record = SearchResultRecord(raw_record)
            nrd_an = record.get_nrdAn()
            nrd_pn = record.get_nrdPn()
            patent_id = record.get_patent_id()
            data_item = DataItem()

            for crawler in info.crawler_dict.get('0'):
                crawler.parse(body, data_item, patent_id)
            yield self.turn_to_request(
                int(url_search.get('crawler_id')),
                data_item=data_item,
                nrdAn=nrd_an,
                nrdPn=nrd_pn,
                patent_id=patent_id,
                sipo=sipo,
            )
Exemplo n.º 4
0
    def parse(self, response):
        """
        Parse the first page of search results and schedule the other pages.

        For each record on this page the crawlers registered under key
        ``'0'`` are run and a follow-up request is yielded. Afterwards one
        POST ``FormRequest`` per remaining page is yielded, handled by
        ``parse_not_first_page``.

        :param response: response providing the body and ``meta['sipo']``
        :return: generator of follow-up requests
        """
        # ``response.text`` replaces the deprecated ``body_as_unicode()``.
        body = response.text
        sipo = response.meta['sipo']
        top_page_result = SearchResultUtil(body)

        total_count = top_page_result.get_totalCount()
        if total_count == 0:
            logger.info('共0页')
            return

        limit = top_page_result.get_limit()
        page_sum = int(math.ceil(total_count / limit))
        logger.info('共 %s 页', page_sum)
        executable_exp = top_page_result.get_executableSearchExp()
        if executable_exp is None:
            return

        # Handle the details of each record
        for raw_record in top_page_result.get_searchResultRecord_list():
            record = SearchResultRecord(raw_record)
            nrd_an = record.get_nrdAn()
            nrd_pn = record.get_nrdPn()
            patent_id = record.get_patent_id()
            data_item = DataItem()

            for crawler in info.crawler_dict.get('0'):
                crawler.parse(body, data_item, patent_id)
            yield self.turn_to_request(
                int(url_search.get('crawler_id')),
                data_item=data_item,
                nrdAn=nrd_an,
                nrdPn=nrd_pn,
                patent_id=patent_id,
                sipo=sipo,
            )

        # Handle pagination: pages 2..page_sum
        for index in range(1, page_sum):
            # Work on a copy so the shared form-data template is not
            # modified from one request to the next.
            formdata = dict(url_page_turning.get('form_data'))
            formdata['resultPagination.start'] = str(limit * index)
            formdata['resultPagination.totalCount'] = str(total_count)
            formdata['searchCondition.searchExp'] = top_page_result.get_searchExp()
            formdata['searchCondition.executableSearchExp'] = executable_exp
            yield FormRequest(
                url=url_page_turning.get('url'),
                callback=self.parse_not_first_page,
                method="POST",
                headers=url_page_turning.get('headers'),
                formdata=formdata,
                meta={
                    'sipo': sipo
                }
            )
Exemplo n.º 5
0
def push_item(details_str, item: DataItem, title, name):
    """
    Store ``details_str`` on ``item`` wrapped in a ``ResultItem``.

    :param details_str: value placed in the ``ResultItem``
    :param item: object that receives the attribute
    :param title: name of the attribute to set on ``item``
    :param name: passed to ``ResultItem`` as its ``title``
    :return: the same ``item`` instance that was passed in
    """
    wrapped = ResultItem(title=name, value=details_str)
    setattr(item, title, wrapped)
    return item
Exemplo n.º 6
0
    def parse(self, response):
        """
        Parse the first page of search results and schedule the other pages.

        Reads the patent count from the ``page_top`` element, processes every
        ``item`` element on this page, then yields one POST ``FormRequest``
        per remaining page, handled by ``parse_not_first_page``.

        :param response: response providing the HTML and ``meta['sipo']``
        :return: generator of follow-up requests
        """
        # ``response.text`` replaces the deprecated ``body_as_unicode()``.
        body = response.text
        sipo = response.meta['sipo']
        soup = BeautifulSoup(body, 'lxml')
        # Work out the total number of patents and of result pages
        page_top = soup.find(attrs={"class": "page_top"})
        # ``find`` returns None (never 0) when no element matches.
        if page_top is None:
            logger.info('共0页')
            return

        page_top_line = page_top.get_text(strip=True)
        # The count is sliced from just after the first "页" found at index 2
        # or later, up to the first "条".
        count_start = page_top_line[2:].find("页") + 3
        count_end = page_top_line.find("条")
        patent_sum = int(page_top_line[count_start:count_end])
        page_size = 12  # hard-coded number of results per page
        page_sum = int(math.ceil(patent_sum / page_size))
        logger.info('共 %s 页', page_sum)
        search_en_div = soup.find(id='result_executableSearchExp')
        if search_en_div is None:
            return

        for item in soup.find_all(attrs={"class": "item"}):
            data_item = DataItem()
            # Render the markup once; it feeds both the per-item soup and
            # every crawler below.
            item_html = item.prettify()
            item_soup = BeautifulSoup(item_html, 'lxml')

            for crawler in info.crawler_dict.get('0'):
                crawler.parse(item_html, data_item, item_soup)

            patent_id = item_soup.find(attrs={'name': 'idHidden'}).get('value')
            nrd_an = item_soup.find(attrs={'name': 'nrdAnHidden'}).get('value')
            nrd_pn = item_soup.find(attrs={'name': 'nrdPnHidden'}).get('value')

            yield self.turn_to_request(int(url_search.get('crawler_id')),
                                       data_item=data_item,
                                       nrdAn=nrd_an,
                                       nrdPn=nrd_pn,
                                       patent_id=patent_id,
                                       sipo=sipo)

        # Pagination: pages 2..page_sum
        for index in range(1, page_sum):
            # Work on a copy so the shared form-data template is not
            # modified from one request to the next.
            formdata = dict(url_page_turning.get('form_data'))
            formdata['resultPagination.start'] = str(page_size * index)
            formdata['resultPagination.totalCount'] = str(patent_sum)
            formdata['searchCondition.searchExp'] = sipo.search_exp_cn
            formdata['searchCondition.executableSearchExp'] = \
                search_en_div.get_text()
            yield FormRequest(url=url_page_turning.get('url'),
                              callback=self.parse_not_first_page,
                              method="POST",
                              headers=url_page_turning.get('headers'),
                              formdata=formdata,
                              meta={'sipo': sipo})