def parseFirstPage(self, response):
    """Pull the patent id of the first hit out of a search response.

    :param response: HTTP response exposing ``status_code``, ``text``
        (a JSON document) and ``headers``.
    :return: the ``patent_id`` of the first record produced by
        ``self._parse_basic``; ``None`` when the status is not 200, the
        record list is empty, nothing was parsed, or any exception occurs
        while decoding/parsing (the exception is logged, not raised).
    """
    if response.status_code != 200:
        return None

    try:
        payload = json.loads(response.text)
        records = payload['searchResultDTO']['searchResultRecord']
        if not records:
            logger.error('检索列表出错了!')
            return None

        parsed = self._parse_basic(records)
        if len(parsed) > 0:
            # Only the first hit is of interest.
            return parsed[0].get('patent_id')

        logger.info('无记录!')
        return None
    except Exception as e:
        # Dump the raw body and headers so a malformed reply can be diagnosed.
        logger.error('{},\n{}'.format(response.text, response.headers))
        logger.error(e)
        return None
def login(username=None, password=None):
    """
    Login API: authenticate against the patent search site with bounded retries.

    When either credential is missing, both are taken from ``account``.
    ``ctrl.BEING_LOG`` is set while the login is in progress and is always
    cleared again on the way out. If ``check_login_status()`` already
    reports a session, no request is sent.

    Every attempt refreshes proxy and cookies, solves a captcha and posts
    the login form. Both a rejected login and an exception raised during an
    attempt count against the retry budget, so a persistent failure can no
    longer loop forever.

    :param username: account name; defaults to ``account.username``
    :param password: account password; defaults to ``account.password``
    :return: True: login succeeded; False: every attempt failed
    """
    if username is None or password is None:
        username = account.username
        password = account.password

    # Same number of tries the original loop allowed for rejected logins.
    max_attempts = 7

    ctrl.BEING_LOG = True
    try:
        if check_login_status():
            return True

        for _ in range(max_attempts):
            try:
                update_proxy()
                update_cookies()
                busername = change_to_base64(username)
                bpassword = change_to_base64(password)
                captcha = get_captcha()
                logger.info('验证码识别结果:%s' % captcha)

                # Work on a copy so credentials and the captcha never
                # linger in the shared url_login configuration.
                form_data = dict(url_login.get('form_data'))
                form_data['j_validation_code'] = captcha
                form_data['j_username'] = busername
                form_data['j_password'] = bpassword

                resp = requests.post(url=url_login.get('url'),
                                     headers=url_login.get('headers'),
                                     data=form_data,
                                     cookies=ctrl.COOKIES,
                                     proxies=ctrl.PROXIES,
                                     timeout=TIMEOUT)
                if username + ',欢迎访问' not in resp.text:
                    logger.error('登录失败')
                    continue

                # Keep the JSESSIONID we logged in with, pinned to the
                # site's domain, instead of the one in the response.
                jsession = ctrl.COOKIES.get('JSESSIONID')
                del resp.cookies['JSESSIONID']
                resp.cookies.set('JSESSIONID', jsession,
                                 domain='www.pss-system.gov.cn')
                update_cookies(resp.cookies)

                # Follow-up request made right after a successful login;
                # the timeout keeps a stalled server from hanging us.
                requests.post(
                    'http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/showViewList-jumpToView.shtml',
                    cookies=ctrl.COOKIES,
                    proxies=ctrl.PROXIES,
                    timeout=TIMEOUT)
                logger.info('登录成功')
                return True
            except Exception as e:
                # Best effort: log and let the loop spend another attempt.
                logger.error(e)
        return False
    finally:
        ctrl.BEING_LOG = False
def check_login_status():
    """Report whether an existing login state can be reused.

    The answer is True only when proxies are enabled (``USE_PROXY``), a
    proxy is configured in ``ctrl.PROXIES`` and ``notify_ip_address()``
    completes without raising. Any ordinary exception is logged and treated
    as "not logged in"; ``KeyboardInterrupt`` and ``SystemExit`` are no
    longer swallowed.

    :return: True when the current state is considered logged in, else False
    """
    if not USE_PROXY:
        return False
    try:
        if ctrl.PROXIES is not None:
            notify_ip_address()
            logger.info('当前已有登录状态')
            return True
    except Exception as e:
        # Deliberately best-effort: a failed probe just means "log in again".
        logger.error('检查登录状态失败: {}'.format(e))
    return False
def write_patent_item(self, count, patent_item):
    """
    Save one patent's title, abstract, claims and description as a text file.

    The file is named ``<request_number>.txt`` inside ``PATENT_TEXT_DIR``
    and holds the four fields separated by newlines.

    :param count: sequence number of the patent, used only in the log line
    :param patent_item: mapping with the keys ``request_number``, ``title``,
        ``abstract``, ``claim`` and ``instructions``; a falsy value is
        logged as a failure and nothing is written
    :return: None
    """
    if not patent_item:
        logger.error('专利全文写到本地失败!')
        return

    # Assemble the path of the patent's full-text file.
    file_name = patent_item['request_number'] + '.txt'
    patent_path = os.path.join(PATENT_TEXT_DIR, file_name)

    field_order = ('title', 'abstract', 'claim', 'instructions')
    with open(patent_path, 'w', encoding='utf-8') as out:
        out.write("{}\n{}\n{}\n{}".format(
            *[patent_item[key] for key in field_order]))
    logger.info('第{}篇专利全文写入{} 完成!'.format(count, patent_path))
def get_cookies(self):
    """Log in (waiting and retrying until it works) and refresh ``self.cookies``.

    Also bumps ``self.deal_with_cookies``, records the refresh time in
    ``self.cookies_time`` and logs the seconds elapsed since the previous
    refresh (reported as 0 on the first call).

    :return: None; the cookies are stored on the instance
    """
    # No immediate attempt is made while another login is flagged as running.
    logged_in = login() if ctrl.BEING_LOG is False else False
    while not logged_in:
        # NOTE(review): 30 * 60 * 60 seconds is 30 hours -- confirm that
        # 30 minutes (30 * 60) was not what was intended.
        time.sleep(30 * 60 * 60)
        logged_in = login()
    self.cookies = ctrl.COOKIES

    self.deal_with_cookies += 1
    # Seconds since the previous refresh, then move the reference point.
    elapsed = time.time() - self.cookies_time
    self.cookies_time = time.time()
    if self.deal_with_cookies == 1:
        elapsed = 0
    logger.info('第{}次,输入cookies值,间隔时间:{:.1f} 秒, {}'.format(
        self.deal_with_cookies, elapsed, self.cookies))
def start_requests(self):
    """Download the full text of every patent listed in ``self.sipoList``.

    Application numbers already present in ``self.down_set`` are skipped.
    For each remaining number three POST requests are issued:

    1. main search      -> patent id (``parseFirstPage``)
    2. detail search    -> abstract and title (``parsePatentDetail``)
    3. full-text search -> claims and description (``parse_full_text``)

    When a step yields nothing, the cookies are refreshed through
    ``get_cookies()`` and the record is abandoned; otherwise the assembled
    record is handed to ``write_patent_item``.

    Each request works on a copy of the configured form data and each
    patent gets its own record dict, so no state leaks between iterations
    or into the shared ``url_config`` dictionaries.

    :return: None
    """
    main_search = url_config.mainSearch
    detail_search = url_config.detailSearch
    full_text = url_config.full_text
    headers = main_search.get('headers')

    self.get_cookies()

    for count, request_number in enumerate(self.sipoList, start=1):
        if request_number in self.down_set:
            logger.info('{}已经存在,跳过下载'.format(request_number))
            continue

        # Search expression, e.g. "申请号=(CN201410811795+)".
        searchExpCn = self.componet_search(request_number)
        logger.info('第{}个检索表达式--- {}'.format(count, searchExpCn))
        search_form = dict(main_search.get('form_data'))
        search_form['searchCondition.searchExp'] = searchExpCn

        # Step 1: look up the patent id.
        first_response = requests.post(
            url=main_search.get('url'),
            headers=headers,
            cookies=self.cookies,
            data=search_form)
        patent_id = self.parseFirstPage(first_response)
        if not patent_id:
            logger.error('patent_id is {}'.format(patent_id))
            # Refresh cookies once; the record is incomplete, move on.
            self.get_cookies()
            continue

        # Both follow-up forms identify the patent with the same fields.
        id_fields = {
            'nrdAn': str(patent_id).split('.')[0],
            'cid': str(patent_id),
            'sid': str(patent_id),
        }
        logger.info('获取专利ID:{}\n'.format(patent_id))

        # Step 2: abstract and title.
        detail_form = dict(detail_search.get('form_data'))
        detail_form.update(id_fields)
        abstract_title_response = requests.post(
            url=detail_search.get('url'),
            headers=detail_search.get('headers'),
            cookies=self.cookies,
            data=detail_form)
        abstract, title = self.parsePatentDetail(abstract_title_response)
        if not abstract and not title:
            logger.error('abstract is {}, title is {}'.format(
                abstract, title))
            # Refresh cookies once; the record is incomplete, move on.
            self.get_cookies()
            continue

        # Step 3: claims and description.
        full_text_form = dict(full_text.get('form_data'))
        full_text_form.update(id_fields)
        full_text_response = requests.post(
            url=full_text.get('url'),
            headers=full_text.get('headers'),
            cookies=self.cookies,
            data=full_text_form)
        claim, instructions = self.parse_full_text(full_text_response)
        if not claim and not instructions:
            logger.error('claim is {}, instructions is {}'.format(
                claim, instructions))
            # Refresh cookies once; the record is incomplete, move on.
            self.get_cookies()
            continue

        # Persist the completed record locally.
        self.write_patent_item(count, {
            'request_number': request_number,
            'patent_id': patent_id,
            'abstract': abstract,
            'title': title,
            'claim': claim,
            'instructions': instructions,
        })
# Tail of a helper whose header lies above this chunk: it rebuilds a string
# from split_list, appending a newline after every segment.
result = ''
for seg in split_list:
    result += seg + '\n'
return result


def write_patent_item(self, count, patent_item):
    """
    Write the title, abstract, claims and description of one patent to disk.

    :param count: sequence number of the patent, used only in the log message
    :param patent_item: mapping providing 'request_number', 'title',
        'abstract', 'claim' and 'instructions'; a falsy value is logged as
        a failure and nothing is written
    :return: None
    """
    if patent_item:
        # Build the path of the patent's full-text file
        patent_path = os.path.join(PATENT_TEXT_DIR,
                                   patent_item['request_number'] + '.txt')
        with open(patent_path, 'w', encoding='utf-8') as w:
            w.write("{}\n{}\n{}\n{}".format(patent_item['title'],
                                            patent_item['abstract'],
                                            patent_item['claim'],
                                            patent_item['instructions']))
        logger.info('第{}篇专利全文写入{} 完成!'.format(count, patent_path))
    else:
        logger.error('专利全文写到本地失败!')


# Script entry point: build the spider and start crawling.
if __name__ == '__main__':
    patent_spider = PatentSpider()
    logger.info('爬取专利全文开始...')
    patent_spider.start_requests()