示例#1
0
 def parse(self, response):
     """Parse a Youku video listing page.

     For each video block, build a VideocrawlItem and hand it to
     parse_classify via a follow-up Request; then follow the "next
     page" link recursively with this same callback.
     """
     res_sel = Selector(response)
     for sel in res_sel.xpath('//div[@class="v va"]'):
         item = VideocrawlItem()
         item['title'] = sel.xpath(
             'div[@class="v-meta"]/div/a/text()').extract()
         # desc reuses the title text; the listing has no separate summary.
         item['desc'] = sel.xpath(
             'div[@class="v-meta"]/div/a/text()').extract()
         item['video_url'] = sel.xpath(
             'div[@class="v-meta"]/div/a/@href').extract()
         # Guard: entries without a link made item['video_url'][0] below
         # raise IndexError and abort the whole page parse.
         if not item['video_url']:
             continue
         item['date'] = sel.xpath(
             'div/div/span[@class="v-publishtime"]/text()').extract()
         # Normalize the publish time only when one was actually scraped.
         if item['date']:
             item['date'][0] = date_cal(item['date'][0])
         item['duration'] = sel.xpath(
             'div/div/span[@class="v-time"]/text()').extract()
         item['classify'] = "01"
         yield Request(url=str(item['video_url'][0]),
                       meta={'item': item},
                       callback=self.parse_classify)
     # Follow the pagination link with the same callback.
     links = response.xpath('//li[@class="next"]')
     for every_url in links:
         url = str(self.start_urls[0] +
                   every_url.xpath('a/@href').extract()[0])
         yield Request(url, callback=self.parse)
示例#2
0
 def parse(self, response):
     driver = webdriver.PhantomJS()
     driver.get(self.start_urls[0])
     while True:
         #将滚轮滑到最下端
         js = "var q=document.documentElement.scrollTop=10000"
         driver.execute_script(js)
         time.sleep(3)
         element = WebDriverWait(driver, 20).until(
             lambda x: x.find_element_by_xpath('//p[@class="mB5 lh14"]'))
         page_html = driver.page_source
         sell = etree.HTML(page_html)
         #TODO
         titles = sell.xpath('//li[@class="l pl27 pb15 dr_li"]')
         i = 0
         while (i < len(titles)):
             item = VideocrawlItem()
             item['title'] = titles[i].xpath('p[1]/a/text()')
             item['desc'] = titles[i].xpath('p[1]/a/text()')
             item['video_url'] = titles[i].xpath('p[1]/a/@href')
             item['date'] = titles[i].xpath('p[2]/a[2]/span/text()')
             item['date'][0] = date_cal(item['date'][0])
             item['duration'] = "null"
             item['classify'] = u'数码'
             i = i + 1
             yield item
         try:
             next_page = driver.find_element_by_xpath(
                 '//a[@class="a btnR"]')
             next_page.click()
             time.sleep(3)
         except:
             print "next_page is over"
             break
示例#3
0
 def parse(self, response):
     driver = webdriver.PhantomJS()
     driver.get(self.start_urls[0])
     while True:
         page_html = driver.page_source
         selector = etree.HTML(page_html)
         sel = selector.xpath('//div[@class="site-piclist_pic"]')
         index = 0
         while (index < len(sel)):
             item = VideocrawlItem()
             item['title'] = sel[index].xpath('a/@title')
             item['desc'] = sel[index].xpath('a/@title')
             item['video_url'] = sel[index].xpath('a/@href')
             if not item['video_url']:
                 continue
             item['date'] = "null"
             item['duration'] = sel[index].xpath('a/div/div/span/text()')
             item['classify'] = u'综艺'
             index = index + 1
             yield item
         try:
             next_page = driver.find_element_by_xpath(
                 '//a[@data-key="down"]')
             next_page.click()
             time.sleep(5)
         except:
             print "next_page is over"
             break
示例#4
0
 def parse(self,response):
     time.sleep(5)
     sell=Selector(response)
     driver=webdriver.PhantomJS()
     driver.maximize_window()
     driver.get(self.start_urls[0])
     try:
         js="var q=document.documentElement.scrollTop=10000"
         driver.execute_script(js)
         time.sleep(10)
         movis=driver.page_source
         sell=etree.HTML(movis)
         titles=sell.xpath('//div[@class="yk-col4 yk-pack  p-list mb16"]')
     except:
         print "scoll errot"
     index=0
     while (index<len(titles)):
           item=VideocrawlItem()
           item['title']=titles[index].xpath('div/a/@title')
           item['desc']=titles[index].xpath('div/a/@title')
           item['video_url']=titles[index].xpath('div/a/@href')
           if not item['video_url']:
                continue
           item['date']="null"
           item['duration']=titles[index].xpath('ul/li/span/span/text()')
           item['classify']=u'综艺'
           index=index+1
           yield item
示例#5
0
 def parse(self, response):
     sel_res = Selector(response)
     driver = webdriver.PhantomJS()
     driver.maximize_window()
     driver.get(self.start_urls[0])
     while True:
         time.sleep(5)
         for sel in sel_res.xpath('//li[@j-delegate="colitem"]'):
             item = VideocrawlItem()
             item['title'] = sel.xpath('div[1]/a/@data-title').extract()
             item['desc'] = sel.xpath('div[1]/a/@data-title').extract()
             item['video_url'] = sel.xpath('div[1]/a/@href').extract()
             if not item['video_url']:
                 continue
             item['date'] = sel.xpath(
                 'div[2]/p[2]/span[2]/text()').extract()
             item['date'][0] = date_cal(item['date'][0])
             item['duration'] = sel.xpath(
                 'div[1]/a/div/div/span/text()').extract()
             item['classify'] = u'综艺'
             yield item
         try:
             next_page = driver.find_element_by_xpath('//a[@class="a1"]')
             next_page.click()
         except:
             print "next page is over"
             break
示例#6
0
 def parse(self, response):
     """Search soku.com for self.keys via PhantomJS.

     Scrolls bottom -> top -> bottom to force lazy rendering, yields one
     deduplicated item per result, and pages forward until the "next"
     link disappears.
     """
     try:
         driver = webdriver.PhantomJS()
     except:
         print "driver error"
     driver.maximize_window()
     driver.get("http://www.soku.com/search_video/q_qww?f=1&kb=040200000000000__qww&")
     time.sleep(2)
     # Replace the canned query with the spider's configured keywords.
     driver.find_element_by_id("headq").clear()
     driver.find_element_by_xpath('//input[@id="headq"]').send_keys(self.keys)
     driver.find_element_by_xpath('//button[@class="btn btn_search"]').click()
     time.sleep(3)
     while True:
         # Scroll bottom -> top -> bottom so lazily-loaded results render.
         target = driver.find_element_by_xpath('//div[@class="about"]')
         driver.execute_script("arguments[0].scrollIntoView();", target)
         time.sleep(3)
         target = driver.find_element_by_xpath('//div[@class="sk_wrap"]')
         driver.execute_script("arguments[0].scrollIntoView();", target)
         time.sleep(3)
         target = driver.find_element_by_xpath('//div[@class="about"]')
         driver.execute_script("arguments[0].scrollIntoView();", target)
         time.sleep(3)
         movis = driver.page_source
         selector = etree.HTML(movis)
         for node in selector.xpath('//div[@class="v"]'):
             item = VideocrawlItem()
             item['title'] = node.xpath('.//div[@class="v-link"]/a/@title')
             if not item['title']:
                 continue
             # Skip results that duplicate something already crawled.
             if is_similar(item['title'][0]):
                 continue
             item['desc'] = node.xpath('.//div[@class="v-link"]/a/@title')
             item['video_url'] = node.xpath('div[@class="v-link"]/a/@href')
             # Guard link-less results before any [0] indexing below.
             if not item['video_url']:
                 continue
             item['date'] = node.xpath('.//span[@class="r"]/text()')
             # Bug fix: the original indexed item['date'][0] without a
             # check, raising IndexError on date-less results.
             if item['date']:
                 item['date'][0] = date_cal(item['date'][0])
             item['duration'] = node.xpath('div/div/span[@class="v-time"]/text()')
             item['classify'] = self.classify(str(item['video_url'][0]))
             yield item
         try:
             next_page = driver.find_element_by_xpath('//li[@class="next"]/a')
             next_page.click()
             time.sleep(3)
         except:
             print "next page is over"  # fixed message typo ("nexr")
             break
示例#7
0
 def parse(self, response):
     temp = Selector(response)
     driver = webdriver.PhantomJS()
     driver.maximize_window()
     driver.get(self.start_urls[0])
     time.sleep(2)
     driver.find_element_by_id("data-widget-searchword").clear()
     driver.find_element_by_xpath(
         '//input[@id="data-widget-searchword"]').send_keys(self.keys)
     driver.find_element_by_xpath('//input[@class="search_btn"]').click()
     time.sleep(3)
     while True:
         target = driver.find_element_by_xpath('//div[@class="qy_footer"]')
         driver.execute_script("arguments[0].scrollIntoView();", target)
         time.sleep(3)
         target = driver.find_element_by_xpath('//div[@class="logo_wrap"]')
         driver.execute_script("arguments[0].scrollIntoView();", target)
         time.sleep(3)
         target = driver.find_element_by_xpath('//div[@class="qy_footer"]')
         driver.execute_script("arguments[0].scrollIntoView();", target)
         movis = driver.page_source
         selector = etree.HTML(movis)
         sel = selector.xpath('//li[@class="list_item"]')
         index = 0
         while (index < len(sel)):
             item = VideocrawlItem()
             item['title'] = sel[index].xpath('div/h3/a/@title')
             if not len(item['title']):
                 index = index + 1
                 continue
             if is_similar(item['title'][0]):
                 index = index + 1
                 continue
             item['desc'] = sel[index].xpath('div/h3/a/@title')
             item['video_url'] = sel[index].xpath('a/@href')
             item['date'] = sel[index].xpath(
                 'div/div/div/em[@class="result_info_desc"]/text()')
             if not item['date']:
                 index = index + 1
                 continue
             item['date'][0] = date_cal(item['date'][0])
             item['duration'] = sel[index].xpath('a/p/span/text()')
             item['classify'] = 'null'
             url = str(item['video_url'][0])
             item['classify'] = self.classify(url)
             index = index + 1
             yield item
         try:
             next_page = driver.find_element_by_xpath(
                 '//a[@data-key="down"]')
             next_page.click()
             time.sleep(3)
         except:
             print "next_page is over"
             break
示例#8
0
 def parse(self, response):
     """Yield one item per Tencent half-width entry on this response.

     Dates and durations are not available in this listing, so they are
     recorded as the literal string "null".
     """
     page_sel = Selector(response)
     entries = page_sel.xpath('//span[@class="item item_half"]')
     for entry in entries:
         item = VideocrawlItem()
         item['title'] = entry.xpath('a/@title').extract()
         # desc reuses the title attribute; no separate summary exists.
         item['desc'] = entry.xpath('a/@title').extract()
         relative_href = entry.xpath('a/@href').extract()[0]
         # Listing hrefs are site-relative; prefix the Tencent base URL.
         item['video_url'] = self.tengxun_url + relative_href
         item['date'] = "null"
         item['duration'] = "null"
         item['classify'] = u'动物萌宠'
         yield item
示例#9
0
 def parse(self, response):
     """Build an item for each result card and hand it to parse_classify.

     Cards without a link are skipped; classification is filled in later
     by the parse_classify callback.
     """
     page_sel = Selector(response)
     for card in page_sel.xpath('//div[@class="item"]'):
         item = VideocrawlItem()
         item['title'] = card.xpath('div/div/@title').extract()
         # desc reuses the title attribute; no separate summary exists.
         item['desc'] = card.xpath('div/div/@title').extract()
         hrefs = card.xpath('div/div/a/@href').extract()
         if not hrefs:
             continue
         item['video_url'] = hrefs
         item['date'] = "null"
         item['duration'] = card.xpath(
             'div/div/a/div/span[@class="c-time"]/span/text()').extract()
         item['classify'] = ""
         yield Request(url=str(hrefs[0]),
                       meta={'item': item},
                       callback=self.parse_classify)
示例#10
0
 def parse(self, response):
     """Parse a Youku row listing.

     Yields a classify Request per video cell and follows the "next
     page" link with this same callback.
     """
     sell = Selector(response)
     for sel in sell.xpath('//div[@class="yk-row"]/div'):
         item = VideocrawlItem()
         item['title'] = sel.xpath('div/div/a/@title').extract()
         item['desc'] = sel.xpath('div/div/a/@title').extract()
         item['video_url'] = sel.xpath('div/div/a/@href').extract()
         # Bug fix: cells without a link made item['video_url'][0] below
         # raise IndexError and abort the whole page (the sibling spider
         # guards this the same way).
         if not item['video_url']:
             continue
         item['date'] = "null"
         item['duration'] = sel.xpath(
             'div/ul/li/span/span/text()').extract()
         item['classify'] = "01"
         yield Request(url=str(item['video_url'][0]),
                       meta={'item': item},
                       callback=self.parse_classify)
     # Follow pagination with the same callback.
     links = response.xpath('//li[@class="next"]')
     for every_url in links:
         url = str(every_url.xpath('a/@href').extract()[0])
         yield Request(url, callback=self.parse)
示例#11
0
 def parse(self, response):
     """Yield one Tencent item per rendered list entry.

     Waits for the list to appear in PhantomJS, then iterates the
     entries; stops at the first entry lacking a data-title attribute
     (same early-exit as the original).
     """
     driver = webdriver.PhantomJS()
     driver.get(self.start_urls[0])
     element = WebDriverWait(driver, 20).until(
         lambda x: x.find_element_by_xpath('//li[@class="list_item"]'))
     page_html = driver.page_source
     tree = etree.HTML(page_html)
     entries = tree.xpath('//li[@class="list_item"]')
     for entry in entries:
         item = VideocrawlItem()
         item['title'] = entry.xpath('@data-title')
         if not item['title']:
             break
         # desc reuses the title attribute; no separate summary exists.
         item['desc'] = entry.xpath('@data-title')
         item['video_url'] = entry.xpath('a/@href')
         # Hrefs are site-relative; prefix the Tencent base URL.
         item['video_url'][0] = self.tengxun_url + str(item['video_url'][0])
         item['date'] = "null"
         item['duration'] = entry.xpath('a/div/div/span[@class="figure_info"]/text()')
         item['classify'] = u'母婴育儿'
         yield item
示例#12
0
 def parse(self, response):
     """Search bilibili for self.keys via PhantomJS.

     Scrolls bottom -> top -> bottom to force lazy rendering, yields one
     deduplicated item per result, and pages forward until the
     "nextPage" link disappears.
     """
     try:
         driver = webdriver.PhantomJS()
     except:
         print "driver error"
     driver.maximize_window()
     driver.get("http://search.bilibili.com/all?keyword=TFBOYS")
     time.sleep(2)
     # Replace the canned query with the spider's configured keywords.
     driver.find_element_by_id("search-keyword").clear()
     driver.find_element_by_xpath(
         '//input[@id="search-keyword"]').send_keys(self.keys)
     driver.find_element_by_xpath('//div[@id="search-button"]').click()
     time.sleep(3)
     while True:
         # Scroll bottom -> top -> bottom so lazy results render.
         target = driver.find_element_by_xpath('//a[@id="weixin"]')
         driver.execute_script("arguments[0].scrollIntoView();", target)
         time.sleep(3)
         target = driver.find_element_by_xpath('//div[@id="header-search"]')
         driver.execute_script("arguments[0].scrollIntoView();", target)
         time.sleep(3)
         target = driver.find_element_by_xpath('//a[@id="weixin"]')
         driver.execute_script("arguments[0].scrollIntoView();", target)
         time.sleep(3)
         page_html = driver.page_source
         selector = etree.HTML(page_html)
         for node in selector.xpath('//li[@class="video matrix "]'):
             item = VideocrawlItem()
             item['title'] = node.xpath('a/@title')
             # Bug fix: the original tested item['title'][0], which
             # raised IndexError on an empty list instead of skipping
             # (siblings test the list itself).
             if not item['title']:
                 continue
             if is_similar(item['title'][0]):
                 continue
             item['desc'] = node.xpath('a/@title')
             item['video_url'] = node.xpath('a/@href')
             if not item['video_url']:
                 continue
             # The second "time" span holds the upload date; guard the
             # [1] index (the original crashed when it was missing).
             dates = node.xpath('.//span[@class="so-icon time"]/text()')
             if len(dates) < 2:
                 continue
             item['date'] = dates[1].replace(
                 ' ', '').replace('\n', '').replace('\t', '')
             item['date'] = date_cal(item['date'])
             durations = node.xpath('a/div/span/text()')
             # Guard the [0] index the same way.
             if not durations:
                 continue
             item['duration'] = durations[0].replace(
                 ' ', '').replace('\n', '').replace('\t', '')
             item['classify'] = self.classify(str(item['video_url'][0]))
             yield item
         try:
             next_page = driver.find_element_by_xpath(
                 '//a[@class="nextPage"]')
             next_page.click()
             time.sleep(3)
         except:
             print "next page is over"  # fixed message typo ("nexr")
             break
示例#13
0
 def parse(self, response):
     """Search v.qq.com for self.keys via PhantomJS.

     Scrolls bottom -> top -> bottom to force lazy rendering, yields one
     deduplicated item per result, and pages forward with the
     "page_next" link.
     """
     try:
         driver = webdriver.PhantomJS()
     except:
         print "driver error"
     driver.maximize_window()
     driver.get("http://v.qq.com/x/search/?q=redis&stag=102&smartbox_ab=")
     time.sleep(2)
     # Replace the canned query with the spider's configured keywords.
     driver.find_element_by_id("keywords").clear()
     driver.find_element_by_xpath('//input[@id="keywords"]').send_keys(
         self.keys)
     driver.find_element_by_xpath('//button[@class="search_btn"]').click()
     time.sleep(3)
     while True:
         # Scroll bottom -> top -> bottom so lazy results render.
         target = driver.find_element_by_xpath('//div[@class="footermenu"]')
         driver.execute_script("arguments[0].scrollIntoView();", target)
         time.sleep(3)
         target = driver.find_element_by_xpath('//div[@class="site_logo"]')
         driver.execute_script("arguments[0].scrollIntoView();", target)
         time.sleep(3)
         target = driver.find_element_by_xpath('//div[@class="footermenu"]')
         driver.execute_script("arguments[0].scrollIntoView();", target)
         time.sleep(3)
         movis = driver.page_source
         selector = etree.HTML(movis)
         for node in selector.xpath('//div[@class="result_item result_item_h"]'):
             item = VideocrawlItem()
             item['title'] = node.xpath('h2/a/text()')
             if not item['title']:
                 continue
             # Titles split around the highlighted keyword come back as
             # two fragments; splice the search term back in between.
             if len(item['title']) > 1:
                 item['title'][0] = item['title'][0] + str(
                     self.keys) + item['title'][1]
                 item['title'] = item['title'][:1]
             if is_similar(item['title'][0]):
                 continue
             item['desc'] = node.xpath('h2/a/text()')
             item['video_url'] = node.xpath('a/@href')
             if not item['video_url']:
                 continue
             item['date'] = node.xpath(
                 'div/div/div/span[@class="content"]/text()')
             # Bug fix: the original indexed item['date'][0] without a
             # check, raising IndexError on date-less results.
             if not item['date']:
                 continue
             item['date'] = date_cal(item['date'][0])
             item['duration'] = node.xpath('a/span/span/text()')
             # Guard the [0] index and the [2:] prefix strip the same way.
             if not item['duration']:
                 continue
             item['duration'][0] = item['duration'][0][2:]
             # Classification is intentionally left as 'null' here; the
             # self.classify(url) call was disabled upstream.
             item['classify'] = 'null'
             yield item
         try:
             next_page = driver.find_element_by_xpath(
                 '//a[@class="page_next"]')
             next_page.click()
             time.sleep(3)
         except:
             print "next page is over"  # fixed message typo ("nexr")
             break