def parse(self, response):
    # The returned data is a SelectorList, which itself inherits from Selector
    contentLeft = response.xpath("//div[@id='content-left']/div")
    # items = []
    for duanzidiv in contentLeft:
        # print(duanzidiv)
        author = duanzidiv.xpath(".//h2/text()").get().strip()
        content = duanzidiv.xpath(
            ".//div[@class='content']//text()").getall()
        content = "".join(content).strip()
        # duanzi = {'author': author, 'content': content}
        item = QsbkItem(author=author, content=content)
        # Return an item rather than a plain dict
        yield item
        # items.append(item)
    # return items
    next_url = response.xpath(
        "//ul[@class='pagination']/li[last()]/a/@href").get()
    if not next_url:
        return
    else:
        print('page ' + next_url)
        print("=" * 50)
        # Send another request; callback tells Scrapy to run self.parse()
        # on the response once it comes back
        yield scrapy.Request(self.base_domain + next_url,
                             callback=self.parse)
def parse(self, response):
    # The result is a SelectorList
    duanzidivs = response.xpath("//div[@id='content-left']/div")
    items = []
    for duanzidiv in duanzidivs:
        # Each iteration yields a Selector; get() returns a unicode string
        author = duanzidiv.xpath(".//h2/text()").get().strip()
        # getall() fetches all matching text nodes and returns a list
        content = duanzidiv.xpath(
            ".//div[@class='content']//text()").getall()
        # "".join converts the list into a single string
        content = "".join(content).strip()
        # The Item class constrains which fields may be passed
        item = QsbkItem(author=author, content=content)
        # Option 1: make parse() a generator
        yield item
        # Option 2: append to a list and return the list
        # items.append(item)
    # return items
    next_url = response.xpath(
        "//ul[@class='pagination']/li[last()]/a/@href").get()
    if not next_url:
        return
    else:
        # Request the next page's url and call self.parse on it again
        yield scrapy.Request(self.base_domain + next_url,
                             callback=self.parse)
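Every variant below constructs QsbkItem with keyword arguments, which only works if matching fields are declared in items.py. A minimal sketch of the Item these spiders appear to assume is shown here; only author and content are grounded in the calls above, and some later variants pass additional fields (stats_vote, joke, links, url, mid, nick, etc.) that would need their own Field declarations.

# items.py -- minimal sketch of the assumed Item, not the authors' actual file
import scrapy

class QsbkItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()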
def parse(self, response):
    content_left = response.xpath('//div[@id="content-left"]/div')
    for duanzidiv in content_left:
        # Check whether a "read the full text" button exists
        content_for_all = duanzidiv.xpath(
            './/span[@class="contentForAll"]')
        if content_for_all:
            content_url = duanzidiv.xpath(
                './/a[@class="contentHerf"]/@href').get()
            # Request the detail page to get the full joke
            yield scrapy.Request(self.base_domain + content_url,
                                 callback=self.parse_content)
            continue
        # Author
        author = duanzidiv.xpath('.//h2/text()').get().strip()
        # "Funny" vote count
        stats_vote = duanzidiv.xpath(
            './/span[@class="stats-vote"]/i/text()').get()
        # Content
        contents = duanzidiv.xpath(
            './/div[@class="content"]//span/text()').getall()
        content = "".join(contents).strip().replace('\"', ' ')
        item = QsbkItem(author=author, stats_vote=stats_vote,
                        content=content)
        yield item
    next_url = response.xpath(
        '//ul[@class="pagination"]/li[last()]/a/@href').get()
    if not next_url:
        return
    else:
        # Request the next page
        yield scrapy.Request(self.base_domain + next_url,
                             callback=self.parse)
def parse(self, response): """ 提取数据 :param response: 下载完成数据;类型为HtmlResponse """ # response.xpath() 提取出来的数据类型为SelectorList liList = response.xpath('//div[@class="j-r-list"]/ul/li') for li in liList: # Selector: get()将Selector转换为uncode author = li.xpath('.//div[@class="u-txt"]/a/text()').get() content = li.xpath( './/div[@class="j-r-list-c"]/div[@class="j-r-list-c-desc"]/a/text()' ).getall() content = "".join(content).strip() # 规范,固定传递指定参数 item = QsbkItem(author=author, content=content) yield item # ==> itmes.append(item) ''' 爬取下一页:获取"下一页"的链接,再次回调parse方法。直到无"下一页"链接则return方法 ''' next_url = response.xpath( '//div[@class="j-page"]//a[@class="pagenxt"]/@href').get() if not next_url: return elif int(next_url) == 51: # 51页无数据 print("最后一页啦 %s" % next_url) return else: # 返回当前请求给parse();不能用return(会停止执行parse方法) yield scrapy.Request(self.base_domains + next_url, callback=self.parse)
def parse(self, response):
    base_url = "https://www.qiushibaike.com"
    # SelectorList
    duanziDivs = response.xpath("//div[@id='content-left']/div")
    # Iterate over the SelectorList to get each Selector
    for duanziDiv in duanziDivs:
        # extract() returns the HTML of every matching element as a list,
        # equivalent to getall()
        # extract_first() returns the HTML of the first match, equivalent to get()
        author = duanziDiv.xpath(".//h2/text()").get().strip()
        content = duanziDiv.xpath(
            ".//div[@class='content']//text()").getall()
        content = "".join(content).strip()
        item = QsbkItem(author=author, content=content)
        yield item
    next_url = response.xpath(
        "//ul[@class='pagination']/li[last()]/a/@href").get()
    if not next_url:
        return
    else:
        next_url = base_url + next_url
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    duanzidivs = response.xpath('//div[@id="content-left"]/div')
    for duanzi in duanzidivs:
        author = duanzi.xpath(".//h2/text()").get().strip()
        content = duanzi.xpath(".//div[@class='content']//text()").getall()
        content = "".join(content).strip()
        item = QsbkItem(author=author, content=content)
        yield item
def parse(self, response):
    i = QsbkItem()
    i['content'] = response.xpath(
        '//div[@class="content"]/span/text()').extract()
    # i['link'] = response.xpath('//div[@id="qiushi_tag_120320318"]/@href').extract()
    # i['data'] = response.xpath('//i[@class="number"]').extract()
    print(i['content'])
    return i
def parse(self, response):
    coon = response.xpath('//div[@id="footzoon"]')
    contents = coon.xpath('.//div[@id="endtext"]/text()').getall()
    for content in contents:
        content = content.strip()
        item = QsbkItem(content=content)
        print(content)
        yield item
def parse_item(self, response):
    i = QsbkItem()
    selector = Selector(response)
    i['content'] = selector.xpath(
        '//*[@id="single-next-link"]/div/text()').extract()
    # i['link'] = response.xpath('//div[@id="qiushi_tag_120320318"]/@href').extract()
    # i['data'] = response.xpath('//i[@class="number"]').extract()
    print(i['content'])
    time.sleep(10)  # blocking delay; must run before return to have any effect
    return i
def parse(self, response):
    print('*' * 40)
    # node_list = response.xpath("//div[@id='big_pic']")
    node_list = response.xpath("//a[@href='/weimei/48095.html']")
    for node in node_list:
        item = QsbkItem()
        name = node.xpath("img/@src").extract()
        item['url'] = name[0]
        yield item
    print('*' * 40)
def parse(self, response):
    duanzidiv = response.xpath("//div[@id='content-left']/div")
    for dz in duanzidiv:
        author = dz.xpath(".//h2/text()").get().strip()
        content = dz.xpath(".//div[@class='content']//text()").getall()
        content = ''.join(content).strip()
        # print(author)
        # print(content)
        # duanzi2_ = {'author': author, 'content': content}
        item = QsbkItem(author=author, content=content)  # shorter way to build the item
        yield item  # hand the data over to the pipelines
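The comment above notes that yield hands the item over to the pipelines, but none of these snippets include a pipelines.py. The following is only a minimal sketch of a JSON-lines pipeline such a spider could pair with; the class name, the duanzi.json output filename, and the need to enable it under ITEM_PIPELINES in settings.py are assumptions, not taken from the code above.

# pipelines.py -- hypothetical sketch, not the authors' actual pipeline
import json

class QsbkPipeline:
    def open_spider(self, spider):
        # open the output file once when the spider starts
        self.fp = open('duanzi.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # write one JSON object per line; ensure_ascii=False keeps Chinese text readable
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()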
def parse_content(self, response):
    author = response.xpath(
        './/div[@class="detail-col0"]/a/img/@alt').get()
    stats_vote = response.xpath(
        './/span[@class="stats-vote"]/i/text()').get()
    content_page = response.xpath('.//div[@class="col1 new-style-col1"]')
    contents = content_page.xpath(
        './/div[@class="content"]/text()').getall()
    content = "".join(contents).strip().replace('\"', ' ')
    item = QsbkItem(author=author, stats_vote=stats_vote, content=content)
    yield item
def parse(self, response):
    duanzi_div = response.xpath('//div[@id="content-left"]/div')
    for duanzi in duanzi_div:
        author = duanzi.xpath('.//h2/text()').get().strip()
        content = duanzi.xpath('.//div[@class="content"]//text()').getall()
        content = ''.join(content).strip()
        item = QsbkItem(author=author, content=content)
        yield item
    next_url = response.xpath(
        '//ul[@class="pagination"]/li[last()]/a/@href').get()
    if next_url:
        yield scrapy.Request(self.base_url + next_url, self.parse)
def parse(self, response):
    duanzidiv = response.xpath("//div[@id = 'content-left']/div")
    for duanzi in duanzidiv:
        author = duanzi.xpath(".//h2/text()").get().strip()
        content = duanzi.xpath(".//div[@class= 'content']//text()").getall()
        content = ''.join(content)
        item = QsbkItem(author=author, content=content)
        yield item
    next_url = response.xpath(
        "//ul[@class='pagination']/li[last()]/a/@href").get()
    if not next_url:
        return
    else:
        yield scrapy.Request(self.base_domain + next_url,
                             callback=self.parse)
def parse(self, response):
    duanzidivs = response.xpath('//div[@id="content-left"]/div')
    for duanzidiv in duanzidivs:
        author = duanzidiv.xpath(".//h2/text()").get().strip()
        content = duanzidiv.xpath(
            './/div[@class="content"]//text()').getall()  # extract() == getall()
        content = ''.join(content).strip()
        # Option 1: yield a plain dict (works, but not ideal)
        # duanzi = {'author': author, 'content': content}
        # yield duanzi
        # Option 2: use the Item defined in items.py
        item = QsbkItem(author=author, content=content)
        yield item
def parse2(self, response):
    try:
        # Strip the non-JSON wrapper (likely a JSONP callback prefix and
        # trailing character) before decoding
        text = json.loads(response.text[20:-1])
        comment_lists = text['result']['cmntlist']
        # print(comment_lists)
        for comment in comment_lists:
            item = QsbkItem()
            item['mid'] = comment['mid']
            item['content'] = comment['content']
            item['nick'] = comment['nick']
            # print(item)
            yield item
    except Exception:
        pass
def parse(self, response):
    # Parse the page; the result is a SelectorList
    content_left = response.xpath('//div[@id="content-left"]/div')
    # Extract the data
    for dz_div in content_left:
        # Each dz_div is a Selector
        author = dz_div.xpath(".//h2/text()").get().strip()
        content_tmp = dz_div.xpath(
            ".//div[@class='content']//text()").getall()
        content = ''.join(content_tmp).strip()
        item = QsbkItem(author=author, content=content)
        # yield hands the item to the pipeline
        yield item
def parse(self, response):
    contents = response.xpath("//div[@class='content']")
    for content in contents:
        text = content.xpath("text()").get().strip()
        if text != "":
            item = QsbkItem(content=text)
            yield item
    print("page {} finished...".format(response.url))
    page_next = response.xpath(
        "//div[@class='pagebar']/a[text()='下一页']/@href").get()
    if not page_next:
        return
    else:
        page_next_url = self.url_head + page_next
        yield scrapy.Request(page_next_url, callback=self.parse)
def parse(self, response):
    # SelectorList
    duanziDivs = response.xpath("//div[@class='col1 old-style-col1']/div")
    i = 0
    for div in duanziDivs:
        # div is a Selector; div.xpath() returns a SelectorList
        # get() returns the first result as a str, get() <=> extract_first()
        # extract() returns every result as a list of str
        # strip() removes leading/trailing whitespace
        author = div.xpath(".//h2/text()").get().strip()
        # Value of the href attribute on the <a> tag
        href = div.xpath('./a/@href').get()
        # Absolute url of the detail page
        url = urljoin(self.start_urls[0], href)
        # The content sits in <span> tags inside this div; //text() extracts it directly
        content = div.xpath(".//div[@class='content']//text()").extract()
        content = ''.join(content).strip()
        # tplt = "{0:{2}<20}\t{1:^50}"
        # print(tplt.format(author, url, chr(12288)))
        # print(content)
        # print('=' * 100)
        # Each record could be a dict, e.g. duanzi = {'author': author, 'content': content},
        # but an Item is used instead, as follows:
        item = QsbkItem(author=author, content=content)
        i += 1
        print('generator call:', i)
        # yield duanzi
        yield item
        # Without a generator (i.e. with the yield above commented out),
        # the items could be collected and returned instead; the pipelines
        # can parse that returned list as well:
        # items = []
        # items.append(item)
        # return items
    # The last <li> in the pagination bar is the "next page" link
    next_path = response.xpath(
        "//ul[@class='pagination']/li[last()]/a/@href").get()
    if not next_path:
        # No next page
        return
    else:
        next_url = urljoin(self.start_urls[0], next_path)
        # Request the next page's url and run the parse function above on it
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    duanzidivs = response.xpath('//div[@class="col1 old-style-col1"]/div')
    for duanzidiv in duanzidivs:
        author = duanzidiv.xpath('.//h2/text()').get().strip()
        content = duanzidiv.xpath(
            './/div[@class="content"]//text()').getall()
        content = ''.join(content).strip()
        item = QsbkItem(author=author, content=content)
        yield item
    nextUrl = response.xpath(
        '//ul[@class="pagination"]/li[last()]/a/@href').get()
    if not nextUrl:
        return
    else:
        yield scrapy.Request(self.baseDomain + nextUrl,
                             callback=self.parse)
def parse(self, response):
    # SelectorList
    duanzidivs = response.xpath("//div[@id = 'content-left']/div")
    for duanzidiv in duanzidivs:
        # Selector
        author = duanzidiv.xpath(".//h2/text()").get().strip()
        content = duanzidiv.xpath(".//span/text()").getall()
        content = "".join(content).strip()
        item = QsbkItem(author=author, content=content)
        # duanzi = {"author": author, "content": content}
        yield item
    next_url = response.xpath(
        "//ul[@class= 'pagination']/li[last()]/a/@href").get()
    if not next_url:
        return
    else:
        yield scrapy.Request(self.base_domin + next_url,
                             callback=self.parse)
def parse(self, response):
    duanzidivs = response.xpath('//div[@id="content-left"]/div')  # SelectorList
    for duanzidiv in duanzidivs:
        # duanzidiv is a Selector
        author = duanzidiv.xpath(".//h2/text()").get().strip()
        content = duanzidiv.xpath(
            ".//div[@class='content']//text()").getall()
        content = ''.join(content).strip()
        # duanzi = {'author': author, 'content': content}
        item = QsbkItem(author=author, content=content)
        yield item
    next_url = response.xpath(
        "//ul[@class='pagination']/li[last()]/a/@href").get()
    if not next_url:
        return
    else:
        # yield scrapy.Request(self.base_domains + next_url, callback=self.parse)
        # response.follow accepts a relative url, so no base domain is needed
        yield response.follow(next_url, callback=self.parse)
def parse(self, response):
    duanzidivs = response.xpath(".//div[@id='content-left']/div")
    for duanzidiv in duanzidivs:
        author = duanzidiv.xpath(".//h2//text()").get().strip()
        content = duanzidiv.xpath(
            ".//div[@class='content']//text()").getall()
        content = "".join(content).strip()
        item = QsbkItem(author=author, content=content)
        yield item
    next_url = response.xpath(
        '//*[@id="content-left"]/ul/li[last()]/a/@href').get()
    if not next_url:
        return
    else:
        yield scrapy.Request(self.base_domain + next_url,
                             callback=self.parse)
def parse(self, response):
    # items = []
    outerbox = response.xpath("//div[@id='content-left']/div")
    for box in outerbox:
        author = box.xpath(
            ".//div[contains(@class,'author')]//h2/text()").extract_first().strip()
        content = box.xpath(
            ".//div[@class='content']/span/text()").extract_first().strip()
        item = QsbkItem(author=author, content=content)
        # item["author"] = author
        # item["content"] = content
        yield item
        # items.append(item)
    next_url = response.xpath(
        "//ul[@class='pagination']/li[last()]/a/@href").get()
    if not next_url:
        return
    else:
        yield scrapy.Request('https://www.qiushibaike.com' + next_url,
                             callback=self.parse)
    # return items
def parse(self, response):
    # Returns a SelectorList
    article_divs = response.xpath('//div[@id="content-left"]/div')
    for article_div in article_divs:
        # Each article_div is a Selector
        author = article_div.xpath(
            './/div[@class="author clearfix"]//h2/text()').get()
        # Get all text under the content div
        content = article_div.xpath(
            './/div[@class="content"]//text()').getall()
        # Join the text fragments and strip whitespace
        content = "".join(content).strip()
        # Receive the data with the fields defined in items.py
        item = QsbkItem()
        item['author'] = author
        item['content'] = content
        yield item
def parse(self, response):
    descs = response.xpath("//div[@class='j-r-list-c-desc']")
    for desc in descs:
        jokes = desc.xpath(".//text()").getall()
        joke = "\n".join(jokes).strip()
        link = desc.xpath(".//a/@href").get()
        links = str(self.base_domain + link)
        # link = self.base_domain + links
        item = QsbkItem(joke=joke, links=links)
        yield item
    next_url = response.xpath("//a[@class='pagenxt']/@href").get()
    if not next_url:
        return
    else:
        yield scrapy.Request(self.base_domain + "/text/" + next_url,
                             callback=self.parse)
def parse(self, response):
    divs = response.xpath('//div[@id="content-left"]/div')
    for div in divs:
        author = div.xpath('.//h2/text()').get().strip()
        content = div.xpath(
            './/div[@class="content"]/span/text()').getall()
        content = "".join(content).strip()
        item = QsbkItem(author=author, content=content)
        yield item
    href = response.xpath(
        '//ul[@class="pagination"]/li[last()]/a/@href').get()
    if not href:
        return
    else:
        nex_url = self.domain_url + href
        yield scrapy.Request(nex_url, callback=self.parse)
def parse(self, response):
    duanziDivs = response.xpath("//div[@id='content-left']/div")
    for duanzidiv in duanziDivs:
        author = duanzidiv.xpath(".//h2/text()").get().strip()
        content = duanzidiv.xpath(
            ".//div[@class='content']//text()").getall()
        content = "".join(content).strip()
        # duanzi = {"author": author, "content": content}
        # yield duanzi
        item = QsbkItem(author=author, content=content)
        yield item
    next_url = response.xpath(
        "//ul[@class='pagination']/li[last()]/a/@href").get()
    if not next_url:
        return
    else:
        yield scrapy.Request(self.base_domain + next_url, self.parse)
def parse(self, response):
    duanzidivs = response.xpath(r"//div[@class='col1 old-style-col1']/div")
    for duanzidiv in duanzidivs:
        author = duanzidiv.xpath(r".//a[2]/h2/text()").get()
        author = "".join(author).strip()  # strip surrounding whitespace
        content = duanzidiv.xpath(
            r".//div[@class='content']/span/text()").getall()
        content = "".join(content).strip()  # strip surrounding whitespace
        item = QsbkItem(author=author, content=content)
        yield item
    next_url = response.xpath(
        r"//ul[@class='pagination']/li[last()]/a/@href").get()
    if not next_url:
        return
    else:
        yield scrapy.Request(self.base_domain + next_url,
                             callback=self.parse)
def parse(self, response):
    contentDivs = response.xpath("//div[@id='j_p_postlist']/div")
    for contentDiv in contentDivs:
        author = contentDiv.xpath('.//li[@class="d_name"]/a/text()').get()
        content = contentDiv.xpath('.//cc/div[2]/text()').get().strip()
        # print(author, content)
        # tieba = {'author': author, 'content': content}
        item = QsbkItem(author=author, content=content)
        yield item
    next_url = response.xpath(
        '//ul[@class="l_posts_num"]/li/a[last()-1]/@href').get()
    if not next_url:
        return
    else:
        yield scrapy.Request(self.base_domain + next_url, self.parse)