示例#1
0
 def parseFirstPage(self, response):
     """
     Extract the patent id of the first hit from a search response.

     :param response: HTTP response object exposing ``status_code``,
         ``text`` (a JSON document) and ``headers``
     :return: the ``patent_id`` of the first parsed record, or None when
         the status is not 200, the record list is empty, or parsing fails
     """
     if response.status_code != 200:
         return None
     try:
         payload = json.loads(response.text)
         records = payload['searchResultDTO']['searchResultRecord']
         if not records:
             logger.error('检索列表出错了!')
             return None
         parsed = self._parse_basic(records)
         if len(parsed) == 0:
             logger.info('无记录!')
             return None
         # Only the first entry is used.
         first_hit = parsed[0]
         return first_hit.get('patent_id')
     except Exception as e:
         # Dump the raw response so the malformed payload can be inspected.
         logger.error('{},\n{}'.format(response.text, response.headers))
         logger.error(e)
         return None
示例#2
0
def login(username=None, password=None):
    """
    Log in to the patent search site, retrying a bounded number of times.

    When either argument is None, both are replaced by ``account.username``
    and ``account.password``. ``ctrl.BEING_LOG`` is set to True while the
    attempt is in progress and is always reset to False on exit.

    Each attempt refreshes the proxy and cookies, solves a captcha and posts
    the login form. Both a rejected login and an exception raised during an
    attempt count against the same retry budget, so a persistent error can no
    longer keep the loop spinning forever.

    :param username: account name; defaults to ``account.username``
    :param password: account password; defaults to ``account.password``
    :return: True: login succeeded (or ``check_login_status()`` already
        reported a session); False: login failed after all retries
    """
    if username is None or password is None:
        username = account.username
        password = account.password
    ctrl.BEING_LOG = True
    try:
        if check_login_status():
            return True

        error_times = 0
        while True:
            try:
                update_proxy()
                update_cookies()
                busername = change_to_base64(username)
                bpassword = change_to_base64(password)
                captcha = get_captcha()
                logger.info('验证码识别结果:%s' % captcha)
                form_data = url_login.get('form_data')
                form_data['j_validation_code'] = captcha
                form_data['j_username'] = busername
                form_data['j_password'] = bpassword

                resp = requests.post(url=url_login.get('url'),
                                     headers=url_login.get('headers'),
                                     data=form_data,
                                     cookies=ctrl.COOKIES,
                                     proxies=ctrl.PROXIES,
                                     timeout=TIMEOUT)
                # The landing page greets the user by name on success.
                if (username + ',欢迎访问') in resp.text:
                    # Keep the JSESSIONID we logged in with: drop the one in
                    # the response and re-add ours under the site's domain.
                    jsession = ctrl.COOKIES.get('JSESSIONID')
                    del resp.cookies['JSESSIONID']
                    resp.cookies.set('JSESSIONID',
                                     jsession,
                                     domain='www.pss-system.gov.cn')
                    update_cookies(resp.cookies)
                    requests.post(
                        'http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/showViewList-jumpToView.shtml',
                        cookies=ctrl.COOKIES,
                        proxies=ctrl.PROXIES)
                    logger.info('登录成功')
                    return True
                logger.error('登录失败')
            except Exception as e:
                logger.error(e)

            # Reached after a rejected login or an exception; give up once
            # the budget is exhausted (same bound as before: 7 attempts).
            if error_times > 5:
                return False
            error_times += 1
    finally:
        # Always release the "login in progress" flag, even on an
        # unexpected error, so other callers are not blocked indefinitely.
        ctrl.BEING_LOG = False
示例#3
0
def check_login_status():
    """
    Report whether an existing session can be reused.

    Only meaningful when ``USE_PROXY`` is enabled: if ``ctrl.PROXIES`` is
    set and ``notify_ip_address()`` completes without raising, the current
    state is treated as logged in.

    :return: True when the check above succeeds; False otherwise, including
        when ``USE_PROXY`` is off or the check raises
    """
    if not USE_PROXY:
        return False
    try:
        if ctrl.PROXIES is not None:
            notify_ip_address()
            logger.info('当前已有登录状态')
            return True
    except Exception as e:
        # Best-effort check: a failure just means "not logged in", but log
        # the cause and let KeyboardInterrupt/SystemExit propagate.
        logger.error(e)
    return False
示例#4
0
 def write_patent_item(self, count, patent_item):
     """
     Write the title, abstract, claims and description to a text file.

     The file is named ``<request_number>.txt`` inside ``PATENT_TEXT_DIR``
     and holds the four sections separated by newlines.

     :param count: ordinal of the patent, used only in the log message
     :param patent_item: mapping with the keys ``request_number``,
         ``title``, ``abstract``, ``claim`` and ``instructions``
     :return: None
     """
     if not patent_item:
         logger.error('专利全文写到本地失败!')
         return

     # Build the path of the full-text file for this patent.
     file_name = patent_item['request_number'] + '.txt'
     patent_path = os.path.join(PATENT_TEXT_DIR, file_name)
     sections = ('title', 'abstract', 'claim', 'instructions')
     with open(patent_path, 'w', encoding='utf-8') as out:
         out.write("{}\n{}\n{}\n{}".format(
             *(patent_item[key] for key in sections)))
     logger.info('第{}篇专利全文写入{} 完成!'.format(count, patent_path))
示例#5
0
    def get_cookies(self):
        """
        Refresh ``self.cookies`` from a (re-)login and log the refresh.

        ``login()`` is called immediately unless ``ctrl.BEING_LOG`` is set.
        While no login has succeeded, the method sleeps and retries. Then it
        copies ``ctrl.COOKIES`` to ``self.cookies``, bumps
        ``self.deal_with_cookies`` and logs the seconds elapsed since the
        previous refresh (reported as 0 on the first call).
        """
        logged_in = login() if ctrl.BEING_LOG is False else False
        while not logged_in:
            # NOTE(review): 30 * 60 * 60 seconds is 30 hours; if 30 minutes
            # was intended this should be 30 * 60 -- confirm with the owner.
            time.sleep(30 * 60 * 60)
            logged_in = login()
        self.cookies = ctrl.COOKIES

        self.deal_with_cookies += 1
        # Time since the previous refresh, then move the reference point.
        elapsed = time.time() - self.cookies_time
        self.cookies_time = time.time()
        if self.deal_with_cookies == 1:
            # First refresh: there is no earlier point to measure from.
            elapsed = 0
        message = '第{}次,输入cookies值,间隔时间:{:.1f} 秒, {}'.format(
            self.deal_with_cookies, elapsed, self.cookies)
        logger.info(message)
示例#6
0
    def start_requests(self):
        """
        Download the full text of every application number in ``self.sipoList``.

        For each number not already in ``self.down_set`` three requests are
        made in sequence: the main search (to obtain the patent id), the
        detail search (abstract and title) and the full-text search (claims
        and description). If any step yields nothing, the cookies are
        refreshed via ``get_cookies()`` and the record is skipped; otherwise
        the assembled record is written with ``write_patent_item``.
        """
        main_search = url_config.mainSearch
        search_headers = main_search.get('headers')
        self.get_cookies()
        # One record dict is reused; every field is overwritten before writing.
        record = {}
        for count, request_number in enumerate(self.sipoList, start=1):
            if request_number in self.down_set:
                logger.info('{}已经存在,跳过下载'.format(request_number))
                continue

            record['request_number'] = request_number
            # Expression looks like: 申请号=(CN201410811795+)
            search_exp = self.componet_search(request_number)
            logger.info('第{}个检索表达式--- {}'.format(count, search_exp))
            search_form = main_search.get('form_data')
            search_form['searchCondition.searchExp'] = search_exp
            # Step 1: search for the patent id.
            search_resp = requests.post(
                url=url_config.mainSearch.get('url'),
                headers=search_headers,
                cookies=self.cookies,
                data=search_form)
            patent_id = self.parseFirstPage(search_resp)
            if not patent_id:
                logger.error('patent_id is {}'.format(patent_id))
                # Refresh cookies once; this record is incomplete, so drop it
                # and move on to the next one.
                self.get_cookies()
                continue
            record['patent_id'] = patent_id

            # The detail and full-text forms take the same id triple.
            pid_text = str(patent_id)
            id_fields = (('nrdAn', pid_text.split('.')[0]),
                         ('cid', pid_text),
                         ('sid', pid_text))

            # Step 2: abstract and title.
            detail_form = url_config.detailSearch.get('form_data')
            for key, value in id_fields:
                detail_form[key] = value
            logger.info('获取专利ID:{}\n'.format(patent_id))
            detail_resp = requests.post(
                url=url_config.detailSearch.get('url'),
                headers=url_config.detailSearch.get('headers'),
                cookies=self.cookies,
                data=detail_form)
            abstract, title = self.parsePatentDetail(detail_resp)
            if not (abstract or title):
                logger.error('abstract is {}, title is {}'.format(
                    abstract, title))
                # Refresh cookies once and skip this incomplete record.
                self.get_cookies()
                continue
            record['abstract'] = abstract
            record['title'] = title

            # Step 3: claims and description.
            full_text_form = url_config.full_text.get('form_data')
            for key, value in id_fields:
                full_text_form[key] = value
            full_text_resp = requests.post(
                url=url_config.full_text.get('url'),
                headers=url_config.full_text.get('headers'),
                cookies=self.cookies,
                data=full_text_form)
            claim, instructions = self.parse_full_text(full_text_resp)
            if not (claim or instructions):
                logger.error('claim is {}, instructions is {}'.format(
                    claim, instructions))
                # Refresh cookies once and skip this incomplete record.
                self.get_cookies()
                continue
            record['claim'] = claim
            record['instructions'] = instructions
            # Persist the completed record to disk.
            self.write_patent_item(count, record)
示例#7
0
        result = ''
        for seg in split_list:
            result += seg + '\n'
        return result

    def write_patent_item(self, count, patent_item):
        """
        Write the title, abstract, claims and description to a text file.

        The file is named ``<request_number>.txt`` inside ``PATENT_TEXT_DIR``
        and holds the four sections separated by newlines.

        :param count: ordinal of the patent, used only in the log message
        :param patent_item: mapping with the keys ``request_number``,
            ``title``, ``abstract``, ``claim`` and ``instructions``
        :return: None
        """
        if not patent_item:
            logger.error('专利全文写到本地失败!')
            return

        # Build the path of the full-text file for this patent.
        file_name = patent_item['request_number'] + '.txt'
        patent_path = os.path.join(PATENT_TEXT_DIR, file_name)
        sections = ('title', 'abstract', 'claim', 'instructions')
        with open(patent_path, 'w', encoding='utf-8') as out:
            out.write("{}\n{}\n{}\n{}".format(
                *(patent_item[key] for key in sections)))
        logger.info('第{}篇专利全文写入{} 完成!'.format(count, patent_path))


if __name__ == '__main__':
    # Script entry point: build the spider, log the start of the crawl,
    # then run the whole download loop.
    patent_spider = PatentSpider()
    logger.info('爬取专利全文开始...')
    patent_spider.start_requests()