def startProgress(self):
    """Announce the start of the crawl and open the target URL.

    Any exception raised by ``connectUrl`` is recorded through
    ``Config.writeException`` instead of propagating to the caller.
    """
    start_message = "开始爬取进程"
    print(start_message)
    Config.writeLog(start_message)
    try:
        self.__connection.connectUrl()
    except Exception as error:
        Config.writeException(error)
def __isElementLoadingSuccess(self):
    """Report whether every control of the search form is located and shown.

    Returns:
        True when the search button, inventor input, proposer input, time
        selector and time input are all found and displayed. False when one
        of them is not displayed, or when locating or inspecting them raises
        (the exception is logged, not propagated).
    """
    try:
        driver = self.__driver
        # All five lookups run before any visibility check, so a missing
        # element is always reported through the exception path.
        controls = [
            driver.find_element_by_xpath(Query.search_button_xpath),
            driver.find_element_by_id(Query.inventor_input_id),
            driver.find_element_by_id(Query.proposer_input_id),
            driver.find_element_by_id(Query.time_select_id),
            driver.find_element_by_id(Query.time_input_id),
        ]
        # all() stops at the first hidden control, like the chained `and`.
        if all(control.is_displayed() for control in controls):
            return True
        print("元素没显示")
        Config.writeLog("元素没显示")
        return False
    except Exception as error:
        print("元素抛异常")
        Config.writeLog("元素抛异常")
        Config.writeException(error)
        return False
def __wait_for_law_state_loading(self):
    """Wait for the law-state view to load.

    Returns:
        True when ``self.wait_state.wait_for_loading()`` succeeds; False
        after logging and printing a timeout notice otherwise. The failure
        callback is deliberately not invoked here.
    """
    if self.wait_state.wait_for_loading():
        return True
    Config.writeLog("等待超时")
    print("等待超时")
    return False
def changePageSuccessfully(self):
    """Resume collecting once the result list has moved to the wanted page.

    Logs the page change, pauses three seconds, then waits for the page to
    load. On success, collection starts at the item index stored in the
    progress info; on a load timeout ``changePageUnsuccessfully`` is called.

    The stored item index is forwarded so that a retry after a failed item
    resumes at that item instead of restarting the page from item 0. This
    matches how ``queryTargetSuccessfully`` starts collection on page 1.
    ``collectingSuccessfully`` resets the stored index to 0 before a normal
    page advance, so that path is unaffected.
    """
    Config.writeLog("换页成功")
    time.sleep(3)
    if WaitEngine(self.__driver).wait_for_loading():
        self.__pageCollection.startCollecting(
            self.__progressInfo.getPatentTypeIndex(),
            self.__progressInfo.getItemIndex())
    else:
        self.changePageUnsuccessfully()
def __check_if_lost(self):
    """Report a failed law-state load when the query result state is falsy.

    Nothing happens when ``self.wait_state.query_result_state()`` is truthy.
    """
    if not self.wait_state.query_result_state():
        print("加载异常")
        Config.writeLog("加载异常")
        # TODO: add a dedicated handler for load failures.
        self.__itemCollection.collectingLawDataUnsuccessfully()
def changePageUnsuccessfully(self):
    """Log a failed page change and re-issue the current query.

    The query is rebuilt from the progress info: current inventor, proposer,
    start date and patent type index.
    """
    failure_message = "换页失败"
    Config.writeLog(failure_message)
    print(failure_message)

    progress = self.__progressInfo
    query_info = progress.getQueryInfo()
    current_inventor = query_info.getInventorList()[
        progress.getInventorIndex()]
    self.__query.queryTarget(
        current_inventor,
        query_info.getProposer(),
        query_info.getStartDate(),
        progress.getPatentTypeIndex())
def collectingUnsuccessfully(self, itemIndex):
    """Record the failing item index and re-issue the current query.

    Args:
        itemIndex: index of the item whose collection failed; it is stored
            in the progress info before the query is repeated.
    """
    failure_message = "收集信息失败"
    Config.writeLog(failure_message)
    print(failure_message)

    progress = self.__progressInfo
    progress.setItemIndex(itemIndex)
    query_info = progress.getQueryInfo()
    current_inventor = query_info.getInventorList()[
        progress.getInventorIndex()]
    self.__query.queryTarget(
        current_inventor,
        query_info.getProposer(),
        query_info.getStartDate(),
        progress.getPatentTypeIndex())
def collectingItemSuccessfullyWithOutData(self):
    """Advance past an item that yielded no data.

    Moves to the next item index. If items remain on the page, collection of
    the next one starts; otherwise the progress controller is told the page
    is complete.
    """
    Config.writeLog("采集空item成功")
    self.__itemIndex += 1
    items_remaining = self.__itemIndex < self.__itemLength
    if not items_remaining:
        self.__progressController.collectingSuccessfully()
        return
    next_item = ItemCollection(
        self.__driver,
        self,
        CollectionResult.PATENT_TYPE[self.__patentTypeIndex],
        self.__itemIndex)
    next_item.collectingData()
def loadUrlUnsuccessfully(self):
    """React to a failed URL load by refreshing, reconnecting or restarting.

    Each call increments the refresh-failure counter. Unless that counter is
    a multiple of five, the page is refreshed. On every fifth failure the
    reconnect counter is incremented and the URL is connected again; when
    the reconnect counter is even, the browser is first quit and recreated
    after a ten-second pause.

    NOTE(review): the original indentation was lost. This layout assumes
    ``connectUrl`` runs on every fifth failure and the ``else`` belongs to
    the modulo-five test -- the only reading in which every path triggers a
    reload. Confirm against the original file.
    """
    Config.writeLog("url连接失败")
    self.__refreshLostTime += 1
    if self.__refreshLostTime % 5 != 0:
        self.__connection.refreshUrl()
        return

    self.__rConnectLostTime += 1
    if self.__rConnectLostTime % 2 == 0:
        # Replace the browser instance before reconnecting.
        self.__driver.quit()
        time.sleep(10)
        self.__driver = self.__generateWebDriver(self.__browser)
    self.__connection.connectUrl()
def collectingItemSuccessfully(self, itemData):
    """Store a collected item and move on to the next one.

    Args:
        itemData: the collected item; it is added to the collection result.

    If items remain on the page, collection of the next one starts;
    otherwise the progress controller is told the page is complete.
    """
    Config.writeLog("采集item成功")
    self.__collectionResult.addItem(itemData)
    self.__itemIndex += 1
    Config.writeLog("采集item成功itemIndex = {0}, itemLength = {1}".format(
        self.__itemIndex, self.__itemLength))

    items_remaining = self.__itemIndex < self.__itemLength
    if not items_remaining:
        self.__progressController.collectingSuccessfully()
        return
    next_item = ItemCollection(
        self.__driver,
        self,
        CollectionResult.PATENT_TYPE[self.__patentTypeIndex],
        self.__itemIndex)
    next_item.collectingData()
def loadUrlSuccessfully(self):
    """Start the current query once the URL has loaded.

    If the page source contains the "operating too frequently" notice, the
    rejection hint is printed and the progress is ended. Otherwise both
    failure counters are reset to 1 and the query described by the progress
    info is issued.
    """
    Config.writeLog("成功连接url")
    if "您的操作太过频繁" in self.__driver.page_source:
        Config.writeLog("操作太过频繁")
        print(Config.REJECT_WAY)
        self.endProgress()
        return

    self.__refreshLostTime = 1
    self.__rConnectLostTime = 1

    progress = self.__progressInfo
    query_info = progress.getQueryInfo()
    current_inventor = query_info.getInventorList()[
        progress.getInventorIndex()]
    self.__query.queryTarget(
        current_inventor,
        query_info.getProposer(),
        query_info.getStartDate(),
        progress.getPatentTypeIndex())
def queryTargetSuccessfully(self, pageSum):
    """Decide what to do after a query returned ``pageSum`` result pages.

    Args:
        pageSum: number of result pages reported for the query; it is
            stored in the progress info.

    With results, either the stored page is requested (page index other
    than 1) or collection starts on the first page at the stored item
    index. Without results, the progress moves to the next patent type, or
    to the next inventor once the type index has reached 2, and the query
    is issued again. The progress ends when the inventors are exhausted.

    NOTE(review): the original indentation was lost. The re-query is assumed
    to follow the patent-type if/else, so both the next-type and the
    next-inventor paths query again. Confirm against the original file.
    """
    Config.writeLog("检索成功")
    self.__refreshLostTime = 1
    self.__rConnectLostTime = 1
    progress = self.__progressInfo
    progress.setPageSum(pageSum)

    if pageSum != 0:
        Config.writeLog("pageSum != 0")
        if progress.getPageIndex() == 1:
            self.__pageCollection.startCollecting(
                progress.getPatentTypeIndex(), progress.getItemIndex())
        else:
            self.__query.changePage(progress.getPageIndex())
        return

    # No results: reset the position and pick the next thing to query.
    Config.writeLog("pageSum = 0")
    progress.setItemIndex(0)
    progress.setPageIndex(1)
    patent_type = progress.getPatentTypeIndex()
    if patent_type >= 2:
        Config.writeLog("pt >= 2")
        progress.setPatentTypeIndex(0)
        next_inventor = progress.getInventorIndex() + 1
        if next_inventor >= len(progress.getQueryInfo().getInventorList()):
            self.endProgress()
        else:
            progress.setInventorIndex(next_inventor)
            query_info = progress.getQueryInfo()
            print(query_info.getInventorList()[progress.getInventorIndex()])
    else:
        Config.writeLog("pt < 2")
        progress.setPatentTypeIndex(patent_type + 1)

    query_info = progress.getQueryInfo()
    current_inventor = query_info.getInventorList()[
        progress.getInventorIndex()]
    self.__query.queryTarget(
        current_inventor,
        query_info.getProposer(),
        query_info.getStartDate(),
        progress.getPatentTypeIndex())
def collectingData(self):
    """Scrape one result item and hand over to the law-state collector.

    The name and type are stored first. If either is empty, the item is
    reported as collected without data. Otherwise each child of the item's
    ``item-content-body`` element is read and, depending on the label it
    contains, stored as request number, request date, announcement date,
    proposer or inventor, after which law-state collection starts.

    Returns:
        False when an exception occurred (it is logged and the item is
        reported as failed); None otherwise.

    NOTE(review): the original indentation was lost. The law-state hand-off
    is assumed to run once after the field loop. Confirm against the
    original file.
    """
    try:
        item_name = self.collecting_name()
        self.__item_data.set_name(item_name)
        item_type = self.collecting_type()
        self.__item_data.set_type(item_type)

        if item_name == "" or item_type == "":
            self.__pageCollection.collectingItemSuccessfullyWithOutData()
            return

        body_selector = (
            "document.getElementsByClassName(\"item-content-body\")["
            + str(self.__whichItem) + "]")
        field_count = self.__driver.execute_script(
            "return " + body_selector + ".children.length;")
        for field_index in range(field_count):
            raw_text = self.__driver.execute_script(
                "return " + body_selector + ".children["
                + str(field_index) + "].innerText;")
            text = str(raw_text)
            # The slice offsets skip the label prefix of each field.
            if "申请号" in text:
                self.__item_data.set_request_number(text[7:])
            elif "申请日" in text:
                self.__item_data.set_request_date(text[6:])
            elif "公告" in text and "日" in text:
                self.__item_data.set_announcement_date(text[10:])
            elif "申请" in text and "人" in text:
                self.__item_data.set_proposer_name(text[11:-2])
            elif "发明人" in text:
                self.__item_data.set_inventor_name(
                    text[6:-2].replace('\n', ''))

        Config.writeLog("准备收集法律信息")
        LawState(self.__driver, self).collectingLawState(self.__whichItem)
    except Exception as error:
        Config.writeException(error)
        self.__pageCollection.collectingItemUnsuccessfully()
        return False
def __writeToExcel(self, index, patentType, name, lawState, lawStateDate,
                   aDate, requestNumber, requestDate, proposerName,
                   inventorName):
    """Write one patent record into row ``index`` of the first sheet.

    The values go to columns 0 to 8 in parameter order (patent type, name,
    law state, law state date, announcement date, request number, request
    date, proposer, inventor) and the workbook is committed afterwards.
    Any exception is printed and logged rather than raised.
    """
    row_values = (patentType, name, lawState, lawStateDate, aDate,
                  requestNumber, requestDate, proposerName, inventorName)
    try:
        editor = ExcelUtil(Config.FILE_NAME).edit()
        sheet = editor.getSheet(0)
        for column, value in enumerate(row_values):
            sheet.write(index, column, value)
        editor.commit()
    except Exception as error:
        print("写excel报错")
        Config.writeLog("写excel报错")
        Config.writeException(error)
def startCollecting(self, patentTypeIndex, startItemIndex=0):
    """Begin collecting the items of the current result page.

    Args:
        patentTypeIndex: index into ``CollectionResult.PATENT_TYPE`` for the
            patent type being collected.
        startItemIndex: index of the first item to collect (default 0).

    Returns:
        False when the item count could not be read from the page; True
        otherwise, including when the start index is out of range and the
        failure callback was invoked.

    NOTE(review): the original indentation was lost. The final
    "itemIndex = ..." log line is assumed to belong to the failure branch.
    """
    self.__itemLength = 0
    self.__patentTypeIndex = patentTypeIndex
    self.__itemIndex = startItemIndex
    try:
        self.__itemLength = self.__driver.execute_script(
            "return document.getElementsByClassName(\"item\").length;")
    except Exception as e:
        Config.writeException(e)
        print(e)
        self.__itemLength = 0
        # The failure callback requires the item index. Calling it without
        # an argument raised TypeError and hid the real error.
        self.__progressController.collectingUnsuccessfully(self.__itemIndex)
        return False

    if self.__itemIndex < self.__itemLength:
        Config.writeLog("开始收集")
        itemCollectiong = ItemCollection(
            self.__driver, self,
            CollectionResult.PATENT_TYPE[patentTypeIndex],
            self.__itemIndex)
        itemCollectiong.collectingData()
    else:
        Config.writeLog("收集失败")
        print("收集失败")
        self.__progressController.collectingUnsuccessfully(self.__itemIndex)
        Config.writeLog("itemIndex = {0}".format(self.__itemIndex))
    return True
def queryTarget(self, inventor, proposer, startDate, patentTypeIndex):
    """Run one search and report the outcome to the progress controller.

    Steps, each of which must succeed: wait for the page to load, check that
    the form controls are present, fill in and submit the form, wait for the
    results to load, pause one second, read the page count.

    Args:
        inventor: inventor name to search for.
        proposer: proposer name to search for.
        startDate: start date to enter into the form.
        patentTypeIndex: index of the patent type to select.

    Returns:
        True after ``queryTargetSuccessfully(pageSum)`` was invoked; False
        after a failing step was printed and logged and
        ``queryTargetUnsuccessfully()`` was invoked.
    """
    def report_failure(reason):
        # Shared failure path: announce the reason, notify the controller.
        print(reason)
        Config.writeLog(reason)
        self.__progressController.queryTargetUnsuccessfully()
        return False

    if not self.__waitEngine.wait_for_loading():
        return report_failure("url加载超时")
    if not self.__isElementLoadingSuccess():
        return report_failure("元素未加载")
    if not self.__inputQueryTargetData(inventor, proposer, startDate,
                                       patentTypeIndex):
        return report_failure("查询失败")
    if not self.__waitEngine.wait_for_loading():
        return report_failure("查询等待超时")

    self.__waitEngine.waitForSeconds(1)
    page_sum = self.__getPageSum()
    if page_sum is None:
        return report_failure("页码为零")

    self.__progressController.queryTargetSuccessfully(page_sum)
    return True
def collectingSuccessfully(self):
    """Advance the crawl after a whole page was collected.

    If another page exists, it is requested with the item index reset to 0.
    Otherwise the position is reset to page 1, item 0, and the progress
    moves to the next patent type, or to the next inventor once the type
    index has reached 2 (printing the new inventor). The query is then
    issued again. The progress ends when no inventor is left.
    """
    Config.writeLog("收集信息成功")
    progress = self.__progressInfo
    next_page = progress.getPageIndex() + 1

    pages_exhausted = next_page > progress.getPageSum()
    if not pages_exhausted:
        Config.writeLog("pageIndex = {0}".format(next_page))
        progress.setPageIndex(next_page)
        progress.setItemIndex(0)
        self.__query.changePage(next_page)
        return

    progress.setPageIndex(1)
    progress.setItemIndex(0)
    patent_type = progress.getPatentTypeIndex()
    switching_inventor = patent_type >= 2
    if switching_inventor:
        progress.setPatentTypeIndex(0)
        inventor_index = progress.getInventorIndex()
        has_next_inventor = inventor_index < len(
            progress.getQueryInfo().getInventorList()) - 1
        if not has_next_inventor:
            Config.writeLog("InventorIndex = {0}".format(inventor_index))
            self.endProgress()
            return
        progress.setInventorIndex(inventor_index + 1)
    else:
        progress.setPatentTypeIndex(patent_type + 1)

    query_info = progress.getQueryInfo()
    current_inventor = query_info.getInventorList()[
        progress.getInventorIndex()]
    proposer = query_info.getProposer()
    start_date = query_info.getStartDate()
    patent_type_index = progress.getPatentTypeIndex()
    if switching_inventor:
        print(current_inventor)
    self.__query.queryTarget(current_inventor, proposer, start_date,
                             patent_type_index)
def __wait_for_law_state(self):
    """Run the three law-state readiness checks, reporting each failure.

    The checks are: the loading wait, the close-button wait and the query
    result state. Each failed check is logged, printed and reported through
    ``collectingLawDataUnsuccessfully``. A failure does not stop the later
    checks from running.

    NOTE(review): the original indentation was lost. The checks are assumed
    to be sequential rather than nested.
    """
    if not self.wait_state.wait_for_loading():
        Config.writeLog("等待超时")
        print("等待超时")
        self.__itemCollection.collectingLawDataUnsuccessfully()

    if not self.__wait_for_close_button():
        Config.writeLog("关闭按钮没出来")
        print("关闭按钮没出来")
        self.__itemCollection.collectingLawDataUnsuccessfully()

    if not self.wait_state.query_result_state():
        Config.writeLog("加载异常")
        print("加载异常")
        # TODO: add a dedicated handler for load failures.
        self.__itemCollection.collectingLawDataUnsuccessfully()
    return
def collectingLawDataSuccessfully(self, lawUpdate, lawState):
    """Attach the law-state data to the item and report the item as done.

    Args:
        lawUpdate: value stored as the item's law state date.
        lawState: value stored as the item's law state.
    """
    Config.writeLog("采集法律信息成功")
    item = self.__item_data
    item.set_law_state(lawState)
    item.set_law_state_date(lawUpdate)
    self.__pageCollection.collectingItemSuccessfully(item)
def collectingItemUnsuccessfully(self):
    """Report the current item index to the progress controller as failed."""
    failure_message = "采集item失败"
    print(failure_message)
    Config.writeLog(failure_message)
    self.__progressController.collectingUnsuccessfully(self.__itemIndex)
def endProgress(self):
    """Quit the browser and terminate the process.

    The process is ended with ``os._exit(0)`` one second after the driver
    is quit, so this method does not return.
    """
    end_message = "结束进程"
    Config.writeLog(end_message)
    print(end_message)
    self.__driver.quit()
    time.sleep(1)
    os._exit(0)
def init_excel_config():
    """Write the column titles into row 0 of the first sheet and commit."""
    headers = (
        "专利类型",
        "专利名称",
        "法律状态",
        "法律状态最后修改日期",
        "申请公布日/授权公告日",
        "申请号",
        "申请日",
        "申请人/专利权人",
        "发明人",
    )
    editor = ExcelUtil(Config.FILE_NAME).edit()
    sheet = editor.getSheet(0)
    for column, title in enumerate(headers):
        sheet.write(0, column, title)
    editor.commit()
    return


if __name__ == '__main__':
    initProgress()
    # Very important: raise Python's recursion depth, otherwise the program
    # blows up after roughly 900 nested calls.
    sys.setrecursionlimit(1000000)  # e.g. one million here
    startDate = input("请输入公布日开始日期,如{0}:".format(
        TimeUtil.getFormatTime("%Y-%m-%d")))
    Config.writeLog("程序启动,输入的公布开始日期为{0}".format(startDate))
    init_excel_config()
    progress = ProgressController(Config.BROSWER_NAME)
    Config.writeLog("启动{0}浏览器".format(Config.BROSWER_NAME))
    queryInfo = progress.getQueryInfo()
    queryInfo.setStartDate(startDate)
    progress.startProgress()
def collecting_law_state(self, which_item):
    """Open an item's law-state view, read it and report the result.

    Args:
        which_item: index of the item whose law-state button is clicked.

    The button is clicked up to twice; if the view fails to load both
    times, the item is reported as failed. Otherwise the law state is read
    (and its date, unless the state text contains the "no data" marker),
    the view is closed and the data is reported as collected. Any exception
    is printed and logged, and the item is reported as failed.
    """
    try:
        # A second attempt is made only when the first load wait fails.
        for _attempt in range(2):
            Config.writeLog("点击按钮")
            WaitEngine.waitForSeconds(2)
            self.__click_law_state_button(which_item)
            if self.__wait_for_law_state_loading() is True:
                self.__check_for_colse_button()
                self.__check_if_lost()
                break
        else:
            self.__itemCollection.collectingLawDataUnsuccessfully()
            return

        Config.writeLog("法律状态")
        law_state = self.__get_law_state()
        if law_state.find("无数据") != -1:
            law_update = "无数据"
        else:
            Config.writeLog("法律日期")
            law_update = self.__get_law_update()

        Config.writeLog("关闭按钮")
        self.__close_law_state()
        Config.writeLog("采集成功")
        self.__itemCollection.collectingLawDataSuccessfully(
            law_update, law_state)
    except Exception as error:
        print("采集异常")
        Config.writeLog("采集异常")
        Config.writeException(error)
        self.__itemCollection.collectingLawDataUnsuccessfully()
        return
def __check_for_colse_button(self):
    """Report a failure when the close button does not show up.

    Nothing happens when ``__wait_for_close_button()`` returns a truthy
    value.
    """
    if not self.__wait_for_close_button():
        Config.writeLog("关闭按钮没出来")
        self.__itemCollection.collectingLawDataUnsuccessfully()
def __inputQueryTargetData(self, inventor, proposer, startDate,
                           patentTypeIndex):
    """Fill in the search form and click the search button.

    Args:
        inventor: value written into the inventor input.
        proposer: value written into the proposer input.
        startDate: value written into the time input.
        patentTypeIndex: patent type handed to ``__choosePatentType``.

    Returns:
        True when every step ran without raising; False when any step
        raised (the exception is logged and printed).

    The three values are passed to the page as ``execute_script`` arguments
    instead of being concatenated into the JavaScript source, so a quote,
    backslash or newline in them can no longer break or change the script.
    """
    set_value_script = (
        "document.getElementById(arguments[0])"
        ".setAttribute(\"value\", arguments[1])")
    try:
        # Fill in the inventor.
        self.__driver.execute_script(
            set_value_script, Query.inventor_input_id, inventor)
        Config.writeLog("发明人")
        # Fill in the proposer.
        self.__driver.execute_script(
            set_value_script, Query.proposer_input_id, proposer)
        Config.writeLog("申请人")
        # Open the time selector's option list.
        self.__driver.execute_script(
            "document.getElementById(\"" + Query.time_select_id +
            "\").firstElementChild.firstElementChild.click();")
        WaitEngine.waitForSeconds(2)  # wait two seconds
        self.__driver.execute_script(
            "document.getElementById(\"" + Query.time_select_id +
            "\").firstElementChild.childNodes[2].childNodes[2].firstElementChild.click();"
        )
        Config.writeLog("点击时间")
        # Fill in the start date.
        self.__driver.execute_script(
            set_value_script, Query.time_input_id, startDate)
        Config.writeLog("填写时间")
        # Select the patent type.
        self.__choosePatentType(patentTypeIndex)
        Config.writeLog("专利类型")
        WaitEngine.waitForSeconds(3)  # wait three seconds
        # Click the search button.
        self.__driver.execute_script(
            "document.getElementsByClassName(\"box-content-bottom\").item(0).childNodes.item(5).click();"
        )
        Config.writeLog("点击按钮")
        return True
    except Exception as e:
        Config.writeException(e)
        print(e)
        return False
def collectingLawDataUnsuccessfully(self):
    """Log a failed law-state collection and report the item as failed."""
    failure_message = "收集法律信息失败"
    Config.writeLog(failure_message)
    self.__pageCollection.collectingItemUnsuccessfully()