Example #1
def view(keyword, driver_path):
    # for Ubuntu: run the browser inside a virtual display (pyvirtualdisplay)
    display = Display(visible=1, size=(1920, 1080))
    display.start()

    #chrome_options = webdriver.ChromeOptions()
    #chrome_options.add_argument('headless')
    #chrome_options.add_argument('--disable-gpu')
    #chrome_options.add_argument('lang=ko_KR')
    #driver = webdriver.Chrome(str(driver_path), chrome_options=chrome_options)  # driver setup (headless)
    driver = webdriver.Chrome(driver_path)  # driver setup

    keyword = '{}'.format(keyword)
    driver.get(
        "https://search.naver.com/search.naver?where=view&sm=tab_jum&query={}&qvt=0"
        .format(keyword))  # search for the keyword
    driver.implicitly_wait(time_to_wait=0.3)
    # keep scrolling to the footer until the lazy-load trigger disappears
    while True:
        last = driver.find_element_by_xpath('//*[@id="footer"]')
        action = ActionChains(driver)
        action.move_to_element(last).perform()
        driver.implicitly_wait(time_to_wait=0.3)
        height = driver.execute_script("return document.body.scrollHeight")
        print(height)
        time.sleep(0.5)
        if len(
                driver.find_elements_by_xpath(
                    '//*[@class="review_loading _trigger_base"]')) == 0:
            print("Scroll Finished, Please Check.")
            break

    li = driver.find_element_by_xpath('//ul[@class="lst_total _list_base"]')  # result list (@class, not @Class: XPath is case-sensitive)
    html = li.get_attribute('innerHTML')
    soup = bs(html, 'html.parser')

    Urls = [
        k.attrs['href']
        for k in soup.find_all(attrs={'class': 'api_txt_lines total_tit'})
    ]

    title = [
        k.get_text()
        for k in soup.find_all(attrs={'class': 'api_txt_lines total_tit'})
    ]
    # use a relative XPath ('.//') so the lookup stays inside the <ul> found above
    rank = [
        li_.get_attribute('data-cr-rank')
        for li_ in li.find_elements_by_xpath('.//li[@class="bx _svp_item"]')
    ]

    date = [
        k.get_text()
        for k in soup.find_all(attrs={'class': 'sub_time sub_txt'})
    ]
    print(len(Urls), len(rank), len(title), len(date))
    driver.close()
    # for Ubuntu: stop the virtual display
    display.quit()

    return Urls, rank, title, date
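
The snippet above omits its imports. A plausible preamble, inferred from the names it uses (Display, webdriver, ActionChains, bs, time), would be:

# Assumed imports for Example #1 (not part of the original snippet)
import time

from bs4 import BeautifulSoup as bs
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

# Hypothetical call: the keyword and chromedriver path are placeholders
urls, ranks, titles, dates = view("python", "./chromedriver")
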
Example #2
class MailcrawlSpider(scrapy.Spider):
    name = 'mailcrawl'

    allowed_domains = ['industrie-expo.com']

    #start_urls = ['http://www.industrie-expo.com/liste-catalogue-exposants/']

    def start_requests(self):

        self.setUp()
        self.driver.get(
            "http://www.industrie-expo.com/liste-catalogue-exposants/")

        pageid = 2
        while True:
            try:
                driver.execute_script("searchExposant(" + str(pageid) +
                                      ", '#')")
                pageid += 1
                print(pageid)
            except Exception:  # stop paginating once the JS call fails
                break

        self.tearDown()

        #for url in urls:
        #    yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        pass

    def setUp(self):
        self.display = Display(visible=0, size=[800, 600])
        self.display.start()
        self.driver = webdriver.Firefox()

    def tearDown(self):
        self.driver.close()
        self.display.quit()
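
Example #2 also leaves out its imports; assuming the same pyvirtualdisplay/Selenium stack, the spider would need roughly:

# Assumed imports for Example #2 (not shown in the original snippet)
import scrapy
from pyvirtualdisplay import Display
from selenium import webdriver

With those in place the spider can be launched with Scrapy's command-line runner, e.g. scrapy crawl mailcrawl. Note that start_requests drives Selenium directly and, as written, does not yield any Scrapy requests (that part is commented out).
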
Example #3
                for tag2 in supers1.find_all('span',attrs={'style': 'color: #ff0000'}):
                    m_context2 = tag2.find('p').get_text()
                    #print(m_context2)
                    dict['tag'] = m_context2.replace('类别:','')  # strip the "类别:" ("category:") prefix

            #break
            st1 = dict['url']+','+dict['name']+','+dict['author']+','+dict['tag']+','+str(dict['num'])+','+dict['format']+','+dict['iframeid']+'\n'
            storestr(st1)
            lst.append(dict['name'])
            store(lst)
            print(lst)
    return dict

if __name__=="__main__":
    display = Display(visible=0, size=(900, 800))
    display.start()
    driver = webdriver.Firefox(executable_path='./geckodriver')
    lst = load()
    url = "https://www.5tps.com/mlist/46_1.html" #请求要访问小说页面的主页面
    for i in range(2, 65):  # listing pages 2..64
        url = "https://www.5tps.com/mlist/46_%d.html"%i
        dict1 = dump_load(lst, driver, url)  # fetch each listing page and parse out the audio URLs and episode titles
        #st1 = dict1['url']+','+dict1['name']+','+dict1['author']+','+dict1['tag']+','+str(dict1['num'])+','+dict1['format']
        #print(st1)
        #lst.append(dict1)
        #storestr(st1)
    #print(lst)
    driver.quit()
    display.quit()
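
Example #3 is a fragment: the top of the parsing function and the helpers load(), store(), storestr() and dump_load() are defined elsewhere in the original module and are not reproduced here. Judging by the calls it makes, the surrounding module would at least need:

# Assumed imports for Example #3 (the helper functions come from the original module)
from bs4 import BeautifulSoup  # supers1.find_all(...) is a BeautifulSoup call
from pyvirtualdisplay import Display
from selenium import webdriver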