Example #1
    def parse_do(self, response):
        item = DianyingItem()
        # extract the download links
        #link=response.xpath('//*[@id="Zoom"]/table[2]/tbody/tr/td/a/@href').extract()
        link = response.xpath('//*[@id="Zoom"]//a/text()').extract()
        print("zzzzzzzz=")
        print link
        if (len(link) == 0):
            print "1...begin"
            link = response.xpath(
                '//*[@id="Zoom"]/table[1]/tbody/tr/td/anchor/a/text()'
            ).extract()
            print(link)
            print('1...over')
        if link:
            video_link = ""
            for i in link:
                print "i=========" + i
                video_link += i
                video_link += "##"
            item['VideoId'] = str(uuid.uuid4())
            try:
                item['VideoLink'] = video_link
            except Exception as e:
                item['VideoLink'] = ""

            item['VideoTitle'] = response.xpath(
                '//*[@id="header"]/div/div[3]/div[2]/div[6]/div[1]/h1/text()'
            ).extract()[0]

            try:
                item['VideoTag'] = response.xpath(
                    '//*[@id="header"]/div/div[3]/div[2]/div[6]/div[2]/ul/div[1]/span[2]/a/text()'
                ).extract()[0]
            except Exception as e:
                item['VideoTag'] = ""
            try:
                item['VideoND'] = response.xpath(
                    '//*[@id="header"]/div/div[3]/div[2]/div[6]/div[2]/ul/div[1]/span[3]/text()'
                ).extract()[0]
            except Exception as e:
                item['VideoND'] = ""
            try:
                item['VideoPF'] = response.xpath(
                    '//*[@id="header"]/div/div[3]/div[2]/div[6]/div[2]/ul/div[1]/span[1]/strong/text()'
                ).extract()[0]
            except Exception as e:
                item['VideoPF'] = ""
            try:
                item['VideoContent'] = response.xpath(
                    '//*[@id="Zoom"]').extract()
            except Exception as e:
                item['VideoContent'] = ""

            imglink = response.xpath('//*[@id="Zoom"]/p[1]/img/@src').extract()
            item['VideoImg'] = imglink
            if imglink:
                item['VideoImgName'] = imglink[0].split('/')[-1]
            print('will yield...')
            yield item
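These snippets populate a DianyingItem defined in the project's items.py, which the source never shows (Example #1 also assumes "import uuid" at module level). A minimal sketch of an item class covering the fields used in Example #1, as an assumption rather than the project's actual definition:

    # hypothetical items.py sketch matching the fields referenced in Example #1
    import scrapy

    class DianyingItem(scrapy.Item):
        VideoId = scrapy.Field()       # uuid4 string generated per item
        VideoLink = scrapy.Field()     # "##"-joined download links
        VideoTitle = scrapy.Field()
        VideoTag = scrapy.Field()
        VideoND = scrapy.Field()
        VideoPF = scrapy.Field()
        VideoContent = scrapy.Field()  # raw HTML of the #Zoom node
        VideoImg = scrapy.Field()
        VideoImgName = scrapy.Field()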
Example #2
    def parse_get(self, response):
        print('start crawling')
        lists = response.xpath("//tbody[starts-with(@id,'normalthread')]")
        for row in lists:
            b = '百度云'  # "Baidu cloud"
            # grab the title and URL of every thread
            print('getting title')
            lname = row.xpath("./tr/th/a[2]/text()").extract()

            # xpath returns a list; join it into a str
            names = ''.join(lname)
            print(names)
            # keep only the Baidu cloud threads
            if b in names:
                print('resource URL:')
                lurl = row.xpath("./tr/th/a[2]/@href").extract()

                # same list-to-str conversion as above
                urls = ''.join(lurl)
                print(urls)
                baiduurl, tiquma = self.get_url(urls)
                print(baiduurl, tiquma)
                # build a fresh item per match instead of mutating one shared instance
                item = DianyingItem()
                item['name'] = names
                item['baiduurl'] = baiduurl
                item['tiquma'] = tiquma

                yield item
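Example #2 delegates to a self.get_url(urls) helper that the source does not include; going by Example #7, it presumably fetches the thread page and pulls out the Baidu pan link and the extraction code. A hypothetical standalone sketch, where requests, lxml, and the XPaths (borrowed from Example #7) are all assumptions:

    # hypothetical sketch of the missing get_url helper, modeled on Example #7
    import requests
    from lxml import etree

    def get_url(url):
        html = etree.HTML(requests.get(url).text)
        baiduurl, tiquma = '', ''
        for href in html.xpath("//td[@class='t_f']//a/@href"):
            if 'baidu.com' in href:
                baiduurl = str(href)
        for text in html.xpath("//td[@class='t_f']//text()"):
            if '提取' in text:  # the line carrying the extraction code
                tiquma = text
        return baiduurl, tiquma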
Example #3
 def neirong(self, response):
     data = Selector(response)
     item = DianyingItem()
     item['moviename'] = data.xpath(
         '//h1[@class="font14w"]/text()').extract()
     item['jianjie'] = ''.join(
         data.xpath(
             '//div[@class="info" and child::h1[@class="font14w"]]/span/text()'
         )[0:2].extract()).strip()
     item['actor'] = data.xpath(
         '//span/a[contains(@href,"actor")]/text()').extract()
     item['kind'] = data.xpath(
         '//span/a[contains(@href,"----")]/text()').extract()
     item['country'] = data.xpath(
         '//span[child::span[contains(text(),"地区")]]/a/text()').extract()
     item['language'] = data.xpath(
         '//span[child::span[contains(text(),"语言")]]/a/text()').extract()
     item['daoyan'] = data.xpath(
         '//span/a[contains(@href,"dir")]/text()').extract()
     item['sysj'] = data.re(r'上映日期:.*?(\d{4}-\d{2}-\d{2})')
     item['pc'] = data.re(r'片长:\D+?(\d+[\u4E00-\u9FA5]+)')
     item['gxsj'] = data.re(r'更新日期:.*?(\d{4}-\d{2}-\d{2})')
     item['jqjs'] = ''.join(
         data.xpath('//div[@id="movie_content"]/text()').extract()).strip()
     item['dbpf'] = data.xpath(
         '//span[child::span[contains(text(),"豆瓣评分")]]/text()').re(
             r'\d+\.\d+')
     downlink = data.xpath(
         '//div[@id="cpdl2list"]//a[@rel="nofollow"]/@href').extract()
     downlink_2 = data.xpath('//input[@class="checkone"]/@value').extract()
     downlink.extend(downlink_2)
     item['downlink'] = downlink
     print(item)
     return item
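Note that neirong wraps the response in an explicit Selector (which requires "from scrapy.selector import Selector"); in current Scrapy versions the response object exposes the same API directly, so the wrapper can be dropped. A brief equivalent sketch:

    # equivalent calls without constructing Selector(response) by hand
    item['moviename'] = response.xpath('//h1[@class="font14w"]/text()').extract()
    item['sysj'] = response.selector.re(r'上映日期:.*?(\d{4}-\d{2}-\d{2})')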
Example #4
    def parse_1(self, response):

        item = DianyingItem()
        item['title'] = response.css('title::text').extract_first()
        item['url'] = response.url

        return item
Example #5
    def parse(self, response):

        # extract every movie entry from the response body
        video_list = response.xpath("//div[@class='co_content8']//table")
        for video in video_list:
            item = DianyingItem()
            item['title'] = video.xpath(".//a/text()").extract_first()
            item['data'] = video.xpath(".//font/text()").extract_first()
            url_next = "http://www.dytt8.net" + video.xpath(
                ".//a/@href").extract_first()
            yield scrapy.Request(url=url_next,
                                 callback=self.parse_next,
                                 meta={'item': item})
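The parse_next callback is not part of the source; the meta={'item': item} argument implies it reads the half-filled item back out of response.meta and completes it on the detail page. A minimal sketch of what it could look like, where the download-link XPath is an assumption echoing Example #6:

    # hypothetical parse_next: finish the item handed over through meta
    def parse_next(self, response):
        item = response.meta['item']
        # download-link XPath is an assumption borrowed from Example #6
        item['link'] = response.xpath(
            '//*[@id="Zoom"]/table[2]/tbody/tr/td/a/@href').extract()
        yield item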
Example #6
 def parse(self, response):
     print(self.start_urls[0])
     items = DianyingItem()
     # extract the movie title
     a = response.xpath('//*[@id="header"]/div/div[3]/div[2]/div[6]/div[1]/h1/text()').extract()[0]
     items['name'] = a
     # extract the download links
     link = response.xpath('//*[@id="Zoom"]/table[2]/tbody/tr/td/a/@href').extract()
     if link:
         items['link'] = link
         yield items
     if self.i < 98050:
         self.i += 1
         yield scrapy.Request(self.url + str(self.i) + ".html", callback=self.parse)
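Example #6 reads self.url, self.i, and start_urls, none of which appear in the snippet; they are presumably class attributes on the spider. A sketch with placeholder values, where the concrete URL prefix, starting index, and class name are all assumptions:

    # assumed spider attributes for Example #6; values are placeholders
    import scrapy

    class DianyingSpider(scrapy.Spider):
        name = 'dianying'
        url = 'http://www.dytt8.net/some/listing/'  # URL prefix; real value not in the source
        i = 1                                       # running page index, capped at 98050 above
        start_urls = [url + '1.html']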
Example #7
    def parse_onepage(self, response):
        item = DianyingItem()
        # the page may contain several hrefs and xpath returns a list, so iterate over it
        print('collecting every link on the page')
        lists = response.xpath("//td[@class='t_f']//a/@href").extract()
        for href in lists:
            if 'baidu.com' in href:
                # cast to str: with several a tags the values come back as lxml unicode objects
                item['baiduurl'] = str(href)

        # scan all text on the page for the extraction code
        texts = response.xpath("//td[@class='t_f']//text()").extract()
        for i in texts:
            a = '提取'  # "extraction (code)"
            if a in i:
                item['tiquma'] = i

        # grab the thread title
        name = response.xpath(
            "//span[@id='thread_subject']/text()").extract_first()
        item['name'] = name
        yield item