def parseSecond(self, response):
    """Parse one section page: extract each article's link and metadata.

    Reads the section title from ``response.meta['title']`` (set by the
    upstream parse) and yields one Request per article to
    ``self.getContent`` with the partially-filled item in meta.
    """
    div_list = response.xpath('//div[@class="ndi_main"]/div')
    for div in div_list:
        head = div.xpath(
            ".//div[@class='news_title']/h3/a/text()").extract_first()
        url = div.xpath(
            ".//div[@class='news_title']/h3/a/@href").extract_first()
        img_url = div.xpath("./a/img/@src").extract_first()
        tag = ",".join(
            div.xpath(".//div[@class='keywords']//a/text()").extract())
        # Renamed local from `time` to avoid shadowing the stdlib module.
        pub_time = div.xpath(
            ".//div[@class='news_tag']/span/text()").extract_first()
        comments = div.xpath(
            ".//div[@class='post_recommend_tie_wrap']/span/text()"
        ).extract_first()
        # Guard: ad/placeholder rows have no href; scrapy.Request raises
        # ValueError on a None url.
        if url is None:
            continue
        # Instantiate the item and store the parsed values on it.
        item = WangyiproItem()
        item["head"] = head
        item["url"] = url
        item["img_url"] = img_url
        item["tag"] = tag
        item["title"] = response.meta["title"]
        item["time"] = pub_time
        item["comments"] = comments
        yield scrapy.Request(url=url, callback=self.getContent,
                             meta={"item": item})
def parse_second(self, response):
    """Parse a section listing page (body is dynamically loaded upstream).

    Reads the section title from ``response.meta['title']`` and yields one
    Request per article to ``self.get_content`` carrying the item in meta.
    """
    div_list = response.xpath(
        '//div[@class="data_row news_article clearfix "]')
    # Section title forwarded from parse() via request meta.
    title = response.meta['title']
    for div in div_list:
        news_title = div.xpath(
            './/div[@class="news_title"]/h3/a/text()').extract_first()
        news_url = div.xpath(
            './/div[@class="news_title"]/h3/a/@href').extract_first()
        img_url = div.xpath('./a/img/@src').extract_first()
        tag_list = div.xpath('.//div[@class="news_tag"]//text()').extract()
        # Trim whitespace around each tag fragment before joining.
        tag = "-".join(t.strip(' \n \t') for t in tag_list)
        # Guard: scrapy.Request raises ValueError on a None url.
        if news_url is None:
            continue
        item = WangyiproItem()
        item['news_title'] = news_title
        item['news_url'] = news_url
        item['img_url'] = img_url
        item['tag'] = tag
        item['title'] = title
        # Request the article page to extract its body content.
        yield scrapy.Request(url=news_url, callback=self.get_content,
                             meta={'item': item})
def parse_model(self, response):
    """Parse one board page and yield a detail-page request per news row.

    NOTE: the raw response for a board page does NOT include the
    AJAX-loaded news rows; the download middleware replaces the body with
    rendered HTML so the xpath below can match. Detail-page responses
    (the "n" extra callbacks) need no such treatment.
    """
    rows = response.xpath(
        '/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
    for row in rows:
        detail_url = row.xpath('./a/@href').extract_first()
        title = row.xpath('./div/div[1]/h3/a/text()').extract_first()
        item = WangyiproItem()
        item['title'] = title
        # Skip placeholder rows that are missing either field.
        if detail_url is None or title is None:
            continue
        # Manually request the news detail page, forwarding the item.
        yield scrapy.Request(detail_url, callback=self.parse_new_detail,
                             meta={'item': item})
def parseSecond(self, response):
    """Parse a section page: extract article metadata and follow each url.

    ``response.meta['title']`` carries the section title set by parse();
    every article yields a Request to ``self.getContent``.
    """
    div_list = response.xpath(
        '//div[@class="data_row news_article clearfix "]')
    for div in div_list:
        head = div.xpath(
            './/div[@class="news_title"]/h3/a/text()').extract_first()
        url = div.xpath(
            './/div[@class="news_title"]/h3/a/@href').extract_first()
        imgUrl = div.xpath('./a/img/@src').extract_first()
        # Collect the tag/time text fragments, strip whitespace, and join
        # into a single string (distinct names: list vs. final string).
        fragments = div.xpath('.//div[@class="news_tag"]//text()').extract()
        tag = "".join(t.strip(' \n \t') for t in fragments)
        # Section title forwarded through request meta.
        title = response.meta['title']
        # Guard: a missing href would make scrapy.Request raise ValueError.
        if url is None:
            continue
        # Instantiate the item and store the parsed values on it.
        item = WangyiproItem()
        item['head'] = head
        item['url'] = url
        item['imgUrl'] = imgUrl
        item['tag'] = tag
        item['title'] = title
        # Request the article page for its stored news content.
        yield scrapy.Request(url=url, callback=self.getContent,
                             meta={'item': item})
def parse_model(self, response):
    """Parse a board page and request each news detail page.

    Assumes the response body already contains the dynamically loaded
    news rows (substituted by the download middleware).
    """
    div_list = response.xpath(
        '/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
    for div in div_list:
        title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
        new_detail_url = div.xpath(
            './div/div[1]/h3/a/@href').extract_first()
        # Guard: skip rows without a link — scrapy.Request raises
        # ValueError for a None url.
        if new_detail_url is None:
            continue
        item = WangyiproItem()
        item['title'] = title
        yield scrapy.Request(url=new_detail_url,
                             callback=self.parse_detail,
                             meta={'item': item})
def parse_model(self, response):
    """Parse each board page: extract the news title and detail-page url
    per entry and follow the url to ``self.parse_detail``."""
    divs = response.xpath('//div[@class="ndi_main"]/div')
    for div in divs:
        title = div.xpath('.//div[@class="news_title"]/h3/a/text()').get()
        new_detail_url = div.xpath(
            './/div[@class="news_title"]/h3/a/@href').get()
        # Guard: placeholder/ad rows have no link; a None url would raise
        # ValueError inside scrapy.Request. (Debug prints removed.)
        if new_detail_url is None:
            continue
        item = WangyiproItem()
        item['title'] = title
        yield scrapy.Request(url=new_detail_url,
                             callback=self.parse_detail,
                             meta={'item': item})
def parse_detail(self, response):
    """Parse a board page: extract each news title and request its body.

    Only the title is resolved here; the article content is obtained by a
    follow-up request to ``self.newContent_parse``, with the
    partially-filled item passed along via request meta.
    """
    print('parse_detail()')
    div_list = response.xpath('//div[@class="ndi_main"]/div')
    for div in div_list:
        item = WangyiproItem()
        new_title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
        new_detail_url = div.xpath(
            './div/div[1]/h3/a/@href').extract_first()
        item['new_title'] = new_title
        # Guard: skip entries without a detail link — scrapy.Request
        # raises ValueError on a None url.
        if new_detail_url is None:
            continue
        yield scrapy.Request(url=new_detail_url,
                             callback=self.newContent_parse,
                             meta={'item': item})
def parse_model(self, response):
    """Extract each board entry's title and detail-page url, then follow
    the url to ``self.parse_detail`` with the item attached."""
    rows = response.xpath(
        '/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
    for row in rows:
        item = WangyiproItem()
        title = row.xpath('./div[1]/div[1]/h3/a/text()').extract_first()
        item['title'] = title
        print('------> 标题是:', title)
        new_detail_url = row.xpath(
            './div[1]/div[1]/h3/a/@href').extract_first()
        # Only rows carrying an actual link are followed.
        if not new_detail_url:
            continue
        print('-----> 详情url是:', new_detail_url)
        yield scrapy.Request(url=new_detail_url, meta={'item': item},
                             callback=self.parse_detail)
def parse_model(self, response):
    """Parse a board page whose body was rendered by Selenium in the
    download middleware, issuing one detail-page request per news row."""
    for entry in response.xpath(
            '/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div'):
        title = entry.xpath('./div/div[1]/h3/a/text()').extract_first()
        detail_url = entry.xpath('./div/div[1]/h3/a/@href').extract_first()
        # Rows with no link are skipped outright.
        if not detail_url:
            continue
        item = WangyiproItem()
        item['title'] = title
        # Follow the detail page; the next callback extracts the article
        # body and receives the item through request meta.
        yield scrapy.Request(detail_url, callback=self.parse_new_content,
                             meta={'item': item})
def parse_module(self, response):
    """Parse one board page: news title + detail-page url per entry,
    then request the detail page with the item in meta."""
    div_list = response.xpath(
        '/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
    for div in div_list:
        title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
        new_detail_url = div.xpath(
            './div/div[1]/h3/a/@href').extract_first()
        # Guard: rows without an href (ads/placeholders) would make
        # scrapy.Request raise ValueError on a None url.
        if new_detail_url is None:
            continue
        # Hand the item to the detail-page callback via request meta.
        item = WangyiproItem()
        item['title'] = title
        yield scrapy.Request(url=new_detail_url,
                             callback=self.parse_detail,
                             meta={'item': item})
def parse(self, response):
    """Entry-point parse: pick the target section <li> nodes and request
    each section page, forwarding its title through meta."""
    lis = response.xpath('//div[@class="ns_area list"]/ul/li')
    # Only these board positions are crawled (single pass, no temp list).
    for li in (lis[i] for i in [3, 4, 6, 7]):
        url = li.xpath('./a/@href').extract_first()
        title = li.xpath('./a/text()').extract_first()
        item = WangyiproItem()
        item['title'] = title
        yield scrapy.Request(url=url, callback=self.parseSecond,
                             meta={'item': item})
def parse_model(self, response):
    """Parse a board page.

    The stock response cannot provide the news rows (they are loaded via
    AJAX); the download middleware swaps the response body for the
    dynamically-rendered HTML, which is what this xpath matches. The only
    difference between the two responses is the body data.
    """
    rows = response.xpath(
        '/html/body/div/div[3]/div[4]/div[1]/div[1]/div/ul/li/div/div')
    for row in rows:
        title = row.xpath('./div/div[1]/h3/a/text()').extract_first()
        detail_url = row.xpath('./div/div[1]/h3/a/@href').extract_first()
        # Only rows that actually carry a link are followed.
        if not detail_url:
            continue
        item = WangyiproItem()
        item['title'] = title
        # Request the news detail page, passing the item via meta.
        yield scrapy.Request(url=detail_url,
                             callback=self.parse_new_detail,
                             meta={'item': item})
def parse_model(self, response):
    """Parse each board page for news titles and detail-page urls, then
    request every detail page with the item attached."""
    rows = response.xpath(
        '/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
    for row in rows:
        # Row layout used by the domestic-news board.
        title = row.xpath('./div/div[1]/h3/a/text()').extract_first()
        url = row.xpath('./div/div[1]/h3/a/@href').extract_first()
        item = WangyiproItem()
        item['title'] = title
        # Follow the detail page only when both fields were found.
        if url is not None and title is not None:
            print(url, title)
            yield scrapy.Request(url=url, callback=self.parse_detail,
                                 meta={'item': item})
def parse_model(self, response):
    """Parse one board page.

    Directly parsing the original response yields nothing — the news
    titles are dynamically loaded. The download middleware rewrites the
    response data to include the AJAX-loaded content, turning it into a
    response this xpath can work with (only the body differs).
    """
    selector = '/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div'
    for div in response.xpath(selector):
        title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
        url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
        if not url:
            continue
        item = WangyiproItem()
        item['title'] = title
        # Hand the item to the detail-page callback via request meta.
        yield scrapy.Request(url=url, callback=self.parse_new_detail,
                             meta={'item': item})
def parseSecond(self, response):
    """Callback for a section page: extract per-article metadata and
    follow each article url to ``self.getContent``.

    The class selector deliberately ends with a trailing space — it must
    match the site's markup exactly. A non-empty ``div_list`` confirms
    both the xpath and that the dynamic page data reached the response.
    """
    div_list = response.xpath(
        '//div[@class="data_row news_article clearfix "]')
    for div in div_list:
        # Article headline.
        head = div.xpath(
            './/div[@class="news_title"]/h3/a/text()').extract_first()
        # Article url.
        url = div.xpath(
            './/div[@class="news_title"]/h3/a/@href').extract_first()
        # Thumbnail image.
        imgUrl = div.xpath('./a/img/@src').extract_first()
        # Publish time and tags: collect all text fragments, strip
        # spaces/newlines/tabs, and join into a single string.
        fragments = div.xpath('.//div[@class="news_tag"]//text()').extract()
        tag = "".join(t.strip(' \n \t') for t in fragments)
        # Guard (bug fix): extract_first() returns None for ad/placeholder
        # rows; the string concatenation in the print below used to raise
        # TypeError, and scrapy.Request rejects a None url anyway.
        if head is None or url is None or imgUrl is None:
            continue
        print(head + ':' + url + ':' + imgUrl + ':' + tag)
        # Section title forwarded from parse() via request meta.
        title = response.meta['title']
        # Instantiate the item and store the parsed values on it.
        item = WangyiproItem()
        item['head'] = head
        item['url'] = url
        item['imgUrl'] = imgUrl
        item['tag'] = tag
        item['title'] = title
        # Request the article page for its stored news content.
        yield scrapy.Request(url=url, callback=self.getContent,
                             meta={"item": item})
def parseSecond(self, response):
    """Parse a photo-view section page and follow each article url to
    ``self.getContent`` with the item attached via meta."""
    div_list = response.xpath(
        '//div[@class="data_row news_photoview clearfix"]')
    for div in div_list:
        head = div.xpath(
            './/div[@class="news_title"]/h3/a/text()').extract_first()
        url = div.xpath(
            './/div[@class="news_title"]/h3/a/@href').extract_first()
        imgUrl = div.xpath('./a/img/@src').extract_first()
        # Join all tag/time text fragments into one string.
        tag = "".join(
            div.xpath('.//div[@class="news_tag"]//text()').extract())
        # Section title passed along through request meta.
        title = response.meta['title']
        # Guard: scrapy.Request raises ValueError for a None url.
        if url is None:
            continue
        # Instantiate the item and store the parsed values on it.
        item = WangyiproItem()
        item['head'] = head
        item['url'] = url
        item['imgUrl'] = imgUrl
        item['tag'] = tag
        item['title'] = title
        # Request the article page for its stored news content.
        yield scrapy.Request(url=url, callback=self.getContent,
                             meta={'item': item})