def work(self, spider_cla):
    """Instantiate *spider_cla*, run it, and always release its resources.

    Exceptions from ``spider.start()`` are logged and swallowed so that one
    failing spider does not kill the scheduler loop.
    """
    spider = spider_cla()
    try:
        spider.start()
    except Exception as e:
        # BUGFIX: the original concatenated `spider.url` directly, which
        # raised inside the handler when `url` was unset or not a string
        # (the duplicate of this method near __main__ already uses str()).
        logging_except(e, cls_name=spider_cla.__name__ + ' ' + str(getattr(spider, 'url', '')))
        print(e)
    finally:
        # pop/close must run even on failure so drivers/connections
        # are not leaked.
        spider.pop()
        spider.close()
def start(self):
    """Main scheduling loop.

    Seeds the basic-info queue whenever it has drained, then runs every
    registered spider class whose Redis queue still holds work.
    Loops forever.
    """
    logging_except(module='init')
    logging_sql()
    while True:
        # Re-seed the basic-info queue when it is empty.
        seed_queue = RedisClient("info_basic_info")
        if not seed_queue.get():
            self.main_spider()
        seed_queue.close()
        # Dispatch each spider class that still has queued items.
        # Queue-key convention: "XxxSpider" -> "Xxxinfo".
        for spider_cls in self.spider_list:
            queue = RedisClient(spider_cls.__name__.replace("Spider", "info"))
            if queue.get():
                self.work(spider_cls)
            queue.close()
        printf("Wait for all spider end!")
def start(self):
    """Crawl one fund company's page and persist each of its sub-sections."""
    self.initialization("info_company", "jj_info_company")
    self.url = self.new_url()
    # One MySQL client per target table, in storage order.
    target_tables = (
        'jj_info_company',
        'jj_info_company_honor',
        'jj_info_company_admin',
        'jj_info_company_party',
    )
    self._mysqlclient = [Mysql_Client(table) for table in target_tables]
    page_source = self._webdriver.get(self.url)
    company = self.jj_info_company(page_source)
    # The honor section is frequently absent: log the failure, report
    # "no data", and carry on with the remaining sections.
    try:
        self.jj_info_company_honor(company)
    except Exception as e:
        logging_except(e)
        printf("jj_info_company_honor", "NoData!!")
    self.jj_info_company_admin(company)
    self.jj_info_company_party(company)
        # NOTE(review): the statements below are the interior of a
        # Schedule.start() loop whose `def start` header and enclosing
        # `while True:` lie outside this view — indentation reconstructed
        # from the sibling copy of start() in this file; confirm against
        # the full module.
        # Re-seed the basic-info queue when it has drained.
        if not main_redis.get():
            self.main_spider()
        main_redis.close()
        # Dispatch every registered spider class that still has queued work.
        for cls in self.spider_list:
            spider_name = cls.__name__
            # Queue-key convention: "XxxSpider" -> "Xxxinfo".
            redis_key = spider_name.replace("Spider", "info")
            redis_client = RedisClient(redis_key)
            if redis_client.get():
                self.work(cls)
            redis_client.close()
        printf("Wait for all spider end!")

    def work(self, spider_cla):
        # Run one spider instance; log and swallow failures so the scheduler
        # keeps going, and always release the spider's resources.
        spider = spider_cla()
        try:
            spider.start()
        except Exception as e:
            logging_except(e, cls_name=spider_cla.__name__ + ':' + str(spider.url))
            print(e)
        finally:
            spider.pop()
            spider.close()


if __name__ == "__main__":
    # NOTE(review): this looks like a debug placeholder left in place of the
    # real entry point (the commented-out Schedule below) — confirm before
    # shipping.
    logging_except("????", cls_name="dsadsadsa")
    # program = Schedule()
    # program.start()
def start(self):
    """Scrape every announcement ("gonggao") page for the current fund code.

    Pages through the announcement table; for each row, opens the
    announcement in a second browser tab to capture its full text, stores
    the row, then reports how many announcements were stored.
    """
    self.initialization("info_all_gonggao", "jj_info_all_gonggao")
    self._webdriver.get(self.url)
    js = "window.open('%s');"
    # Pager XPaths: the last label is "next"; the one before it holds the
    # total page count.
    next_botton = "//div[@id='pagebar']//label[last()]"
    current_xpath = "//div[@id='pagebar']//label[@value='%s']"
    pages_num = self._webdriver.find_element_by_xpath(
        "//div[@id='pagebar']//label[last()-1]").text
    current_page = 0
    amount = 0  # number of announcements actually stored
    while int(current_page) < int(pages_num):
        current_page += 1
        # Prefer jumping straight to the page's label; fall back to "next".
        try:
            self._webdriver.find_element_by_xpath(
                current_xpath % current_page).click()
        except Exception as e:
            logging_except(e)
            self._webdriver.find_element_by_xpath(next_botton).click()
        time.sleep(3)
        table = BeautifulSoup(
            self._webdriver._brower.page_source,
            'lxml').find('div', id='ggtable').find('tbody').find_all('tr')
        for tr in table:
            tds = tr.find_all('td')
            # Data rows have exactly 3 cells: title, type, date.
            if len(tds) == 3:
                values = [self.code]
                title = tds[0].text.replace(' ', '').replace('\n', '')
                report_type = tds[1].text.replace(' ', '').replace('\n', '')
                date = tds[-1].text.replace(' ', '').replace('\n', '')
                href = tds[0].find('a')['href']
                values.extend([title, report_type, date])
                # Open the announcement in a second tab and poll until its
                # body element has rendered.
                self._webdriver._brower.execute_script(js % href)
                self._webdriver._brower.switch_to_window(
                    self._webdriver._brower.window_handles[1])
                time.sleep(3)
                while not BeautifulSoup(
                        self._webdriver._brower.page_source, "lxml").find(
                        'pre', id='jjggzwcontentbody'):
                    time.sleep(3)
                values.append(
                    special_repace(
                        BeautifulSoup(self._webdriver._brower.page_source,
                                      "lxml").find(
                            'pre', id='jjggzwcontentbody').text))
                time.sleep(3)
                # Close the detail tab and return to the listing tab.
                self._webdriver._brower.close()
                self._webdriver._brower.switch_to_window(
                    self._webdriver._brower.window_handles[0])
                sel = ("code", self.code, "title", title,
                       'date', date, 'type', report_type)
                self.storage(values, sel)
                # BUGFIX: amount was incremented twice per row (once before
                # processing and once after storage), doubling the reported
                # count. Count once, after a successful store.
                amount += 1
    printf("jj_info_all_gonggao storage ,CODE:%s,AMOUNT:%s" % (self.code, amount))