Example No. 1
    def get_main_link(self, response):
        # Module-level imports assumed by this snippet: import re, import scrapy.
        channel = response.meta['channel']
        node_list = response.xpath("//*[@class='vervideo-bd']")
        for node in node_list:
            # Build a fresh item for every video node; reusing a single item
            # across iterations would let later nodes overwrite the data of
            # requests that are still waiting to be scheduled.
            item = NewsItem()
            item['channel'] = channel
            a_link = node.xpath("./a/@href").extract_first()
            a_link = response.urljoin(a_link)
            title = node.xpath(
                "./a/div[@class='vervideo-title']/text()").extract_first()
            duration = node.xpath(
                "./a/div[@class='vervideo-img']/div[@class='cm-duration']/text()"
            ).extract_first()
            cut_url = node.xpath(
                "./a/div[@class='vervideo-img']/div[@class='verimg-view']/div[@class='img']/@style"
            ).extract_first()
            # The cover image URL sits inside an inline style attribute,
            # e.g. "background-image: url(...);", so pull it out with a regex.
            match = re.search(r'url\((.*?)\);', cut_url or '')
            cut_url = match.group(1) if match else None
            authorname = node.xpath(
                "./div[@class='actcont-auto']/a/text()").extract_first()
            print(title, authorname, duration, cut_url)
            if a_link and title:
                item['title'] = title
                item['duration'] = duration
                item['cut_url'] = cut_url
                item['name'] = authorname
                yield scrapy.Request(url=a_link,
                                     callback=self.parse_detail,
                                     meta={'item': item})
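
Every snippet in this collection fills a NewsItem, whose definition is not included here. A minimal sketch of what it would have to declare, assuming only the field names these examples assign (the real items.py may well contain more), is:

import scrapy


class NewsItem(scrapy.Item):
    # Fields inferred from the assignments in the examples; the names come
    # from the snippets, everything else here is an assumption.
    url = scrapy.Field()
    title = scrapy.Field()
    pub_time = scrapy.Field()
    content_code = scrapy.Field()
    # Extra fields used only by Example No. 1.
    channel = scrapy.Field()
    duration = scrapy.Field()
    cut_url = scrapy.Field()
    name = scrapy.Field()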
Example No. 2
    def parse_item(self, response):
        item = NewsItem()

        item['url'] = response.url
        item['title'] = response.xpath('/html/body/main/section/article/div/div[1]/div[2]/div/div[1]/h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath('/html/body/main/section/article/div/div[1]/div[2]/div/div[2]/ul/li[2]/text()').extract()[0].strip()
        item['content_code'] = response.xpath('/html/body/main/section/article/div/div[2]').extract()[0].strip()
        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
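
Example No. 2 and most of the examples below index the result of extract() with [0], which raises IndexError as soon as an XPath matches nothing on a redesigned page. A hedged variant of the same callback using Scrapy's get() with a default instead (the XPaths are copied from Example No. 2 and remain assumptions about that site's markup):

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        # get() returns the first match, or the given default when the
        # selector matches nothing, instead of raising IndexError.
        item['title'] = response.xpath(
            '/html/body/main/section/article/div/div[1]/div[2]/div/div[1]/h1/text()'
        ).get(default='').strip()
        item['pub_time'] = response.xpath(
            '/html/body/main/section/article/div/div[1]/div[2]/div/div[2]/ul/li[2]/text()'
        ).get(default='').strip()
        item['content_code'] = response.xpath(
            '/html/body/main/section/article/div/div[2]').get(default='').strip()
        yield item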
Example No. 3
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath('/html/body/div[5]/div/div[1]/div/div[1]/h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath('/html/body/div[5]/div/div[1]/div/div[1]/div[5]/span[1]/text()').extract()[0].strip()
        item['content_code'] = response.xpath('/html/body/div[5]/div/div[1]/div/div[1]/div[6]').extract()[0]

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 4
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath('//*[@id="site-content"]/div/div[2]/h1/text()').extract()[0]
        item['pub_time'] = response.xpath('//*[@id="site-content"]/div/div[1]/div[2]/div/div/div/a[2]/text()').extract()[0][4:]
        item['content_code'] = response.xpath('//*[@id="site-content"]/div/div[3]').extract()[0]

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 5
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath('//*[@id="wrapper"]/article/div/header/h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath('//*[@id="wrapper"]/article/div/header/time/a/text()').extract()[0].strip()
        item['content_code'] = response.xpath('//*[@id="wrapper"]/article/div/div').extract()[0].strip()

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 6
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath('//*[@id="artibodyTitle"]/text()').extract()[0].strip()
        item['pub_time'] = response.xpath('//*[@id="pub_date"]/text()').extract()[0].strip()
        item['content_code'] = response.xpath('//*[@id="artibody"]').extract()[0].strip()

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 7
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath('//*[@id="main"]/div[2]/h2/text()').extract()[0]
        item['pub_time'] = response.xpath('//*[@id="main"]/div[2]/div[1]/p/span[2]/text()').extract()[0]
        item['content_code'] = response.xpath('//*[@id="main"]/div[2]/div[2]').extract()[0]

        # Yield each item.
        yield item
Example No. 8
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath('//h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//span[@class="time"]/text()').extract()[0][:10]
        item['content_code'] = response.xpath('//article/div[2]').extract()[0]

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 9
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath('//h1').extract()[0].strip()
        item['pub_time'] = "now"
        item['content_code'] = response.xpath(
            '//section[@class="textblock"]').extract()[0].strip()

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 10
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath('//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[3]/text()').extract()[0].strip()
        item['content_code'] = response.xpath('//*[@id="Cnt-Main-Article-QQ"]').extract()[0].strip()


        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 11
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath('//*[@id="article"]/div[1]/div[1]/h1/text()').extract()[0].strip()
        item['pub_time'] = response.url.split("/")[-2]
        item['content_code'] = response.xpath('//*[@id="article"]/div[1]/div[2]').extract()[0].strip()


        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 12
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="articleTitle"]/text()').extract()[0].strip()
        item['pub_time'] = '2017-08-27'
        item['content_code'] = response.xpath(
            '//*[@id="articleContent"]/div/div[1]').extract()[0]

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 13
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="artical_topic"]/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="artical_sth"]/p/span[1]/text()').extract()[0].strip()
        item['content_code'] = response.xpath(
            '//*[@id="main_content"]').extract()[0]

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 14
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//h1/span[2]/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//time[1]/text()').extract()[0].strip()
        item['content_code'] = response.xpath(
            '//main/div/div[1]/div/article/div[1]').extract()[0].strip()

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 15
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="conTit"]/h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="artInfo"]/text()').extract()[0].strip()[:11]
        item['content_code'] = response.xpath(
            '//*[@id="the_content"]/div[3]/div[2]').extract()[0].strip()

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 16
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="activity-name"]/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="post-date"]/text()').extract()[0]
        item['content_code'] = response.xpath(
            '//*[@id="js_content"]').extract()[0]

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 17
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="epContentLeft"]/h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="epContentLeft"]/div[1]/text()').extract()[0].strip()[:10]
        item['content_code'] = response.xpath(
            '//*[@id="endText"]').extract()[0].strip()

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 18
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        # response.xpath() already wraps the body in a Selector, so the
        # explicit Selector(response) construction is unnecessary.
        item['title'] = response.xpath(
            '/html/body/div[1]/div/div[1]/div/h2/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '/html/body/div[1]/div/div[1]/div/div[1]/span[4]/text()'
        ).extract()[0].strip()[:11]
        item['content_code'] = response.xpath(
            '/html/body/div[1]/div/div[1]/div/div[2]').extract()[0].strip()

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 19
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="content"]//h1/text()').extract()[0].strip()
        # item['pub_time'] = response.url.split('/')[-3] + '-' + response.url.split('/')[-2]
        item['pub_time'] = response.xpath(
            '//*[@id="content"]/span/text()').re_first('\d{4}年\d+月\d+日')
        item['content_code'] = ''.join(
            response.xpath('//*[@id="content"]//div/p//text()').extract())

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 20
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//div[@id="singlepost-wrap"]/article/header/div[2]/h1/text()'
        ).extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//div[@id="singlepost-wrap"]/article/header/div[3]/div[1]/div[3]/a/time/text()'
        ).extract()[0].strip()
        item['content_code'] = response.xpath(
            '//div[@id="entry-content"]').extract()[0]

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 21
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="main-container"]/div/div/div/div[1]/div/div[1]/h2/text()'
        ).extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="main-container"]/div/div/div/div[1]/div/div[1]/p/span/span/text()[2]'
        ).extract()[0].strip()[5:15]
        item['content_code'] = response.xpath(
            '//*[@id="contents"]').extract()[0].strip()

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 22
    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="container"]/section/div/div[1]/div[2]/h1/text()'
        ).extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="container"]/section/div/div[1]/div[2]/p/em[5]/text()'
        ).extract()[0].strip()
        item['content_code'] = response.xpath(
            '//*[@id="container"]/section/div/div[1]/div[4]').extract()[0]

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 23
    def parse_item(self, response):
        item = NewsItem()

        # extract article info by xpath
        node = response.xpath('//*[@id="articles-show"]/article')
        # get content_div, title, publication time and url
        content_code = response.xpath(
            '//*[@id="js-article-content"]').extract()
        title = node.xpath('./h1/text()').extract()
        pub_time = node.xpath('./div/span/text()').extract()

        item['content_code'] = content_code[0]
        item['title'] = title[0]
        item['pub_time'] = pub_time[0]
        item['url'] = response.url

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
Example No. 24
    def parse_item(self, response):
        item = NewsItem()

        item['url'] = response.url
        # derive the article id from the URL: first six characters of the last path segment
        article_id = response.url.split('/')[-1][:6]
        # build the XPaths that embed the article id
        title_xpath = f'//*[@id="article{article_id}"]/div[2]/div[2]/h1/text()'
        pub_time_xpath = f'//*[@id="article{article_id}"]/div[2]/div[2]/div[1]/div/span[1]/text()'
        content_xpath = f'//*[@id="article_content{article_id}"]'

        item['title'] = response.xpath(title_xpath).extract()[0].strip()
        item['pub_time'] = response.xpath(pub_time_xpath).extract()[0].strip()
        item['content_code'] = response.xpath(
            content_xpath).extract()[0].strip()

        # Yield each extracted item to the item pipeline for processing; execution then resumes after this line.
        yield item
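
The comments above hand each yielded item to an item pipeline that is not part of these snippets. A minimal sketch of such a pipeline, writing items to a JSON Lines file (the class and file names here are arbitrary choices, and it would still need to be enabled under ITEM_PIPELINES in the project settings):

import json


class NewsPipeline:
    def open_spider(self, spider):
        # One output file per crawl; 'news.jl' is an arbitrary name.
        self.file = open('news.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Serialize the item as one JSON line, then pass it along so any
        # later pipeline stage can still see it.
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()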