import time

from bs4 import BeautifulSoup as bs
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains


def view(keyword, driver_path):
    # for ubuntu: run Chrome inside a virtual display
    display = Display(visible=1, size=(1920, 1080))
    display.start()

    #chrome_options = webdriver.ChromeOptions()
    #chrome_options.add_argument('headless')
    #chrome_options.add_argument('--disable-gpu')
    #chrome_options.add_argument('lang=ko_KR')
    #driver = webdriver.Chrome(str(driver_path), chrome_options=chrome_options)  # driver setup

    driver = webdriver.Chrome(driver_path)  # driver setup (Selenium 3 style API)
    keyword = '{}'.format(keyword)
    driver.get(
        "https://search.naver.com/search.naver?where=view&sm=tab_jum&query={}&qvt=0"
        .format(keyword))  # search for the keyword
    driver.implicitly_wait(time_to_wait=0.3)

    # Scroll to the footer repeatedly until the "loading more" trigger
    # disappears, i.e. every result has been loaded onto the page.
    while True:
        last = driver.find_element_by_xpath('//*[@id="footer"]')
        action = ActionChains(driver)
        action.move_to_element(last).perform()
        driver.implicitly_wait(time_to_wait=0.3)
        height = driver.execute_script("return document.body.scrollHeight")
        print(height)
        time.sleep(0.5)
        if len(
                driver.find_elements_by_xpath(
                    '//*[@class="review_loading _trigger_base"]')) == 0:
            print("Scroll Finished, Please Check.")
            break

    # Parse the fully loaded result list (note: XPath attribute names are
    # case-sensitive, so it must be @class, not @Class).
    li = driver.find_element_by_xpath('//ul[@class="lst_total _list_base"]')
    html = li.get_attribute('innerHTML')
    soup = bs(html, 'html.parser')
    Urls = [
        k.attrs['href']
        for k in soup.find_all(attrs={'class': 'api_txt_lines total_tit'})
    ]
    title = [
        k.get_text()
        for k in soup.find_all(attrs={'class': 'api_txt_lines total_tit'})
    ]
    rank = [
        li_.get_attribute('data-cr-rank')
        for li_ in li.find_elements_by_xpath('//li[@class="bx _svp_item"]')
    ]
    date = [
        k.get_text()
        for k in soup.find_all(attrs={'class': 'sub_time sub_txt'})
    ]
    print(len(Urls), len(rank), len(title), len(date))
    driver.close()

    # for ubuntu
    display.quit()
    return Urls, rank, title, date
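# A minimal usage sketch (assumptions: chromedriver sits next to this script at
# './chromedriver' and the search keyword is arbitrary; swap in your own values):
if __name__ == '__main__':
    urls, ranks, titles, dates = view('파이썬', './chromedriver')
    for row in zip(ranks, titles, dates, urls):
        # each row is (data-cr-rank, post title, post date, post URL)
        print(','.join(str(col) for col in row))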
import scrapy
from pyvirtualdisplay import Display
from selenium import webdriver


class MailcrawlSpider(scrapy.Spider):
    name = 'mailcrawl'
    allowed_domains = ['industrie-expo.com']
    #start_urls = ['http://www.industrie-expo.com/liste-catalogue-exposants/']

    def start_requests(self):
        self.setUp()
        self.driver.get(
            "http://www.industrie-expo.com/liste-catalogue-exposants/")
        # Page through the exhibitor catalogue by calling the site's own
        # searchExposant() JavaScript until it raises (no more pages).
        pageid = 2
        while True:
            try:
                self.driver.execute_script("searchExposant(" + str(pageid) + ", '#')")
                pageid += 1
                print(pageid)
            except Exception:
                break
        self.tearDown()
        #for url in urls:
        #    yield scrapy.Request(url=url, callback=self.parse)
        # Nothing is yielded yet; the unreachable yield below just turns this
        # method into a generator so Scrapy can iterate it without failing.
        return
        yield

    def parse(self, response):
        pass

    def setUp(self):
        self.display = Display(visible=0, size=[800, 600])
        self.display.start()
        self.driver = webdriver.Firefox()

    def tearDown(self):
        self.driver.close()
        self.display.quit()
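# A sketch for running the spider outside a full Scrapy project via
# CrawlerProcess (the LOG_LEVEL setting is an assumption, not project config):
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(MailcrawlSpider)
    process.start()  # blocks until the Selenium-driven crawl finishes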
# Tail of dump_load(lst, driver, url); the start of the function, and the file's
# imports (Display, webdriver, ...), are not shown here. It pulls the category
# out of the red <span> tags, builds one CSV row and persists it.
    for tag2 in supers1.find_all('span', attrs={'style': 'color: #ff0000'}):
        m_context2 = tag2.find('p').get_text()
        #print(m_context2)
        dict['tag'] = m_context2.replace('类别:', '')  # strip the "Category:" prefix
        #break
    st1 = (dict['url'] + ',' + dict['name'] + ',' + dict['author'] + ',' +
           dict['tag'] + ',' + str(dict['num']) + ',' + dict['format'] + ',' +
           dict['iframeid'] + '\n')
    storestr(st1)
    lst.append(dict['name'])
    store(lst)
    print(lst)
    return dict


if __name__ == "__main__":
    display = Display(visible=0, size=(900, 800))
    display.start()
    driver = webdriver.Firefox(executable_path='./geckodriver')
    lst = load()
    url = "https://www.5tps.com/mlist/46_1.html"  # index page listing the novels to crawl
    for i in range(2, 65):
        url = "https://www.5tps.com/mlist/46_%d.html" % i
        # fetch each list page and parse out the audio URL and each episode's title
        dict1 = dump_load(lst, driver, url)
        #st1 = dict1['url']+','+dict1['name']+','+dict1['author']+','+dict1['tag']+','+str(dict1['num'])+','+dict1['format']
        #print(st1)
        #lst.append(dict1)
        #storestr(st1)
        #print(lst)
    driver.quit()
    display.quit()
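# The script above calls load(), store() and storestr(), whose definitions are
# not shown. A minimal sketch of what they might look like, assuming the crawl
# state lives in plain text files next to the script (both file names below are
# assumptions, not taken from the original; in a real script these would sit
# above the __main__ block):
import os


def load():
    # names of the novels already crawled, one per line (assumed file name)
    if not os.path.exists('crawled_names.txt'):
        return []
    with open('crawled_names.txt', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def store(lst):
    # rewrite the name list so a later run can skip finished items
    with open('crawled_names.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(lst) + '\n')


def storestr(st1):
    # append one CSV row: url,name,author,tag,num,format,iframeid (assumed file name)
    with open('novels.csv', 'a', encoding='utf-8') as f:
        f.write(st1)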