def parse(self, response):
    movies = response.xpath('//div[@class="info"]')
    for each in movies:
        # create a fresh item per movie so each yield is independent
        item = MyscrapyItem()
        # use relative ".//" paths: an absolute "//" inside the loop would
        # match against the whole document and return the first movie every time
        title = each.xpath(
            './/div[@class="hd"]/a/span[@class="title"]/text()').extract()
        content = each.xpath('.//div[@class="bd"]/p/text()').extract()
        score = each.xpath(
            './/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()
        ratetotal = each.xpath(
            './/div[@class="bd"]/div[@class="star"]/span[4]/text()'
        ).extract()
        info = each.xpath(
            './/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'
        ).extract()
        item['title'] = title[0]
        item['content'] = ';'.join(content)
        item['score'] = score[0]
        item['ratetotal'] = ratetotal[0]
        item['info'] = info[0]
        yield item
    if self.start <= 225:
        self.start += 25
        yield scrapy.Request(self.url + str(self.start) + self.end,
                             callback=self.parse)
def parse(self, response):
    i = 0
    for item in response.xpath(
            "//div[@class='seo-recommended-notes']/div"):
        i = i + 1
        myItem = MyscrapyItem()
        myItem['author'] = item.xpath(
            "./a[@class='author']/span/text()").extract()[0]
        print("Author: " + myItem['author'])
        myItem['author_icon_url'] = item.xpath(
            "./a[@class='author']/@href").extract()[0]
        myItem['blog_title'] = item.xpath(
            "./a[@class='title']/text()").extract()[0]
        print("Title: " + myItem['blog_title'])
        myItem['content_summary'] = item.xpath("./p/text()").extract()[0]
        print("Content: " + myItem['content_summary'])
        myItem['content_url'] = item.xpath(
            "./a[@class='title']/@href").extract()[0]
        print("url: " + self.base_url + myItem['content_url'])
        time.sleep(1)
        if myItem['content_url'] not in self.viewed:
            self.url = myItem['content_url']
            self.viewed.append(self.url)
        # after the last note on this page, move on to the next page
        if i == len(response.xpath(
                "//div[@class='seo-recommended-notes']/div")):
            self.page_order = self.page_order + 1
            yield self.parse_more()
def parse(self, response):
    names = response.xpath(
        '//div[@class="channel-detail movie-item-title"]/@title').extract()
    scores = [
        score.xpath('string(.)').extract_first()
        for score in response.xpath(
            '//div[@class="channel-detail channel-detail-orange"]')
    ]
    # for score in scores_div:
    #     scores.append(score.xpath('string(.)').extract_first())

    # use dictionary to push item
    # for name, score in zip(names, scores):
    #     # print(name, ':', score)
    #     yield {"name": name, "score": score}

    # use object to push item
    for name, score in zip(names, scores):
        # create a fresh item per record; reusing one item object across
        # yields would let later iterations overwrite earlier items
        item = MyscrapyItem()
        item['name'] = name
        item['score'] = score
        if response.url.find('catId=2') != -1:
            item['type'] = 'comedy'
        elif response.url.find('catId=3') != -1:
            item['type'] = 'romantic'
        yield item
def parse(self, response):
    # current_url = response.url                  # URL requested for this crawl
    # body = response.body                        # returned HTML
    # unicode_body = response.body_as_unicode()   # returned HTML decoded to unicode
    selector = scrapy.Selector(response)
    # sites = hxs.select('//ul/li/div/a/img/@src').extract()
    books = selector.xpath('//div[@class="bd doulist-subject"]')
    for book in books:
        item = MyscrapyItem()
        title = book.xpath('div[@class="title"]/a/text()').extract()[0]
        rate = book.xpath(
            'div[@class="rating"]/span[@class="rating_nums"]/text()').extract()[0]
        author = book.xpath('div[@class="abstract"]/text()').extract()[0]
        title = title.replace(' ', '').replace('\n', '')
        author = author.replace(' ', '').replace('\n', '')
        item['title'] = title
        item['rate'] = rate
        item['author'] = author
        yield item
    nextpage = selector.xpath('//span[@class="next"]/link/@href').extract()
    if nextpage:
        next_url = nextpage[0]
        yield scrapy.http.Request(next_url, callback=self.parse)
def parse(self, response):
    quotes = response.xpath("//div[@class='quote']/span[@class='text']")
    for x in quotes:
        item = MyscrapyItem()
        title = x.xpath(".//text()").get()
        item['title'] = title
        yield item
def parse(self, response):
    movie = MyscrapyItem()
    movie['name'] = response.xpath(
        '//div[@id="content"]/h1/span[1]/text()')[0].extract()
    movie['director'] = response.xpath(
        '//div[@id="info"]/span[@class="attrs"]/text()')[0].extract()
    actor_list = []
    actor_list.extend(response.xpath('//span[@class="actor"]'))
    # the original snippet stops here without emitting the item; assuming
    # MyscrapyItem has an 'actors' field, extract the text and yield
    movie['actors'] = [a.xpath('string(.)').get() for a in actor_list]
    yield movie
def parse(self, response):
    # avoid shadowing the built-in "list"
    node_list = response.xpath('//div[@class="li_txt"]')
    for element in node_list:
        item = MyscrapyItem()
        name = element.xpath('./h3/text()').extract()
        title = element.xpath('./h4/text()').extract()
        info = element.xpath('./p/text()').extract()
        item['name'] = name[0].strip()
        item['title'] = title[0].strip()
        item['info'] = info[0].strip()
        yield item
def parse(self, response): # print("请求:"+response.xpath("//ul[@class='note-list']/li").extract()[0]) for i in response.xpath("//ul[@class='note-list']/li"): print("数量:" + str( len(response.xpath("//ul[@class='note-list']/li").extract()))) item = MyscrapyItem() try: item['_id'] = i.xpath("./@id").extract()[0] except Exception as e: print(e) return try: # text() item['blog_title'] = i.xpath( ".//div[@class='content']/a/text()").extract()[0] except Exception as e: print(e) return try: item['content_url'] = self.get_full_url( i.xpath(".//div[@class='content']/a/@href").extract()[0]) except Exception as e: item['content_url'] = '' try: item['content_summary'] = i.xpath( ".//div[@class='content']/p/text()").extract()[0] except Exception as e: item['content_summary'] = '' try: item['content_figure_url'] = self.get_full_url( i.xpath("./a/img/@src").extract()[0]) except Exception as e: item['content_figure_url'] = '' try: item['author'] = i.xpath( ".//div[@class='author']/div/a/text()").extract()[0] except Exception as e: item['author'] = '' try: item['date'] = i.xpath( ".//div[@class='author']/div/span/@data-shared - at" ).extract()[0] except Exception as e: item['date'] = '' try: item['author_icon_url'] = self.get_full_url( i.xpath(".//div[@class='author']/a/@href").extract()[0]) except Exception as e: item['author_icon_url'] = '' yield item
def parse(self, response):
    # print('start $$$$$$$$$$$$$$$$$$$$$$$')
    # print(response.xpath('//div[@class="bai"]/a/@href|//div[@class="du"]/a/@href').extract())
    # print('end $$$$$$$$$$$$$$$$$$$$$$$')
    a = 0
    # follow at most the first two links; detail pages are handled in second_parse
    for item in response.xpath(
            '//div[@class="bai"]/a/@href|//div[@class="du"]/a/@href'
    ).extract():
        if a > 1:
            break
        a = a + 1
        yield scrapy.Request(url=item, callback=self.second_parse)
def parse(self, response):
    items = []
    soup = BeautifulSoup(response.text, 'lxml')
    sectionList = soup.find_all(class_='stream-list__item')
    for section in sectionList:
        title = section.find(class_='title').a.get_text()
        quote = section.find(
            class_='excerpt wordbreak hidden-xs').get_text().strip()
        time = section.find(class_='col-xs-10').get_text().strip()[-12:-2]
        itemLoader = ItemLoader(item=MyscrapyItem(), response=response)
        itemLoader.add_value('title', title)
        itemLoader.add_value('quote', quote)
        itemLoader.add_value('time', time)
        items.append(itemLoader.load_item())
    return items
def parse(self, response):
    tr_list = response.xpath(
        '//div[@class="greyframe"]/table[2]/tr/td/table/tr')
    for tr in tr_list:
        item = MyscrapyItem()
        item['num'] = tr.xpath('./td[1]/text()').extract_first()
        item['title'] = tr.xpath('./td[2]/a[2]/text()').extract_first()
        item['href'] = tr.xpath('./td[2]/a[2]/@href').extract_first()
        item['status'] = tr.xpath('./td[3]/span/text()').extract_first()
        item['name'] = tr.xpath('./td[4]/text()').extract_first()
        item['publish_date'] = tr.xpath('./td[5]/text()').extract_first()
        yield scrapy.Request(item['href'],
                             callback=self.parse_detail,
                             meta={'item': item})
    # build the next-page request (pagination)
    next_url = response.xpath('//a[text()=">"]/@href').extract_first()
    if next_url is not None:
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    titles = response.xpath(
        '//*[@id="main_content"]/div[2]/ul/li/dl/dt[2]/a/text()').extract()
    authors = response.css('.writing::text').extract()
    previews = response.css('.lede::text').extract()
    items = []
    for idx in range(len(titles)):
        item = MyscrapyItem()
        item['title'] = titles[idx]
        item['author'] = authors[idx]
        item['preview'] = previews[idx]
        items.append(item)
    return items
def parse_travellist(self, response):
    """Get the list of travel notes for each destination."""
    if response.status == 200:
        print("enter")
        res = response.text
        content = json.loads(res)
        s = content.get('list', 0)
        page_info = content.get('page', 0)
        # extract the request parameters for the next page
        # (renamed from "next" to avoid shadowing the built-in)
        next_re = re.compile(
            r'<a class="pi pg-next" href="/yj/(\d+)/1-0-(\d+).html" title')
        next_match = re.search(next_re, page_info)
        if s:
            s = "<html>" + s + "</html>"
            html = etree.HTML(s)
            lis = html.xpath('//div[@class="tn-item clearfix"]')
            for li in lis:
                href1 = li.xpath('.//a[@class="title-link"]//@href')[0]
                title = li.xpath('.//a[@class="title-link"]/text()')
                content = li.xpath('./div[@class="tn-wrapper"]/dl/dd/a/text()')
                zan = li.xpath(
                    './div[@class="tn-wrapper"]/div/span[@class="tn-ding"]/em/text()')
                user_name = li.xpath(
                    './div[@class="tn-wrapper"]/div/span[@class="tn-user"]/a/text()')
                item = MyscrapyItem()
                item['title'] = title[0] if title else ''
                item['content'] = content[0] if content else ''
                item['zan'] = zan[0] if zan else ''
                item['user_name'] = user_name[0] if user_name else ''
                yield item
                # url = 'http://www.mafengwo.cn' + href1
                # yield Request(url, callback=self.parse_detail, dont_filter=False)
        if next_match:
            next_page = next_match.group(1)  # value of the mddid parameter
            next_num = next_match.group(2)   # value of the page parameter
            every_page_params = self.params
            every_page_params['mddid'] = next_page
            every_page_params['page'] = next_num
            yield FormRequest(self.travel_url,
                              callback=self.parse_travellist,
                              dont_filter=False,
                              formdata=every_page_params)
    else:
        logging.warning(f"parse_travellist failed!: {response.status}")

# def parse_detail(self, response):
#     """Detailed content of each travel note."""
#     print(response.text)
def parse(self, response):
    # create an instance of the MyscrapyItem class
    item = MyscrapyItem()
    sectionList = response.xpath('//*[@id="all"]/div[1]/section').extract()
    for section in sectionList:
        bs = BeautifulSoup(section, 'lxml')
        articleDict = {}
        a = bs.find('a')
        articleDict['title'] = a.text
        articleDict['href'] = 'https://geekori.com/' + a.get('href')
        p = bs.find('p', class_='excerpt')
        articleDict['abstract'] = p.text
        # assign the three fields of the MyscrapyItem object
        item['title'] = articleDict['title']
        item['href'] = articleDict['href']
        item['abstract'] = articleDict['abstract']
        # this example only saves the first blog post's information,
        # so exit the for loop after a single iteration
        break
    # return the MyscrapyItem object
    return item
def parse(self, response):
    items = []
    sectionList = response.xpath('//*[@id="all"]/div[1]/section').extract()
    for section in sectionList:
        bs = BeautifulSoup(section, 'lxml')
        articleDict = {}
        a = bs.find('a')
        articleDict['title'] = a.text
        articleDict['href'] = 'https://geekori.com/' + a.get('href')
        p = bs.find('p', class_='excerpt')
        articleDict['abstract'] = p.text
        itemLoader = ItemLoader(item=MyscrapyItem(), response=response)
        itemLoader.add_value('title', articleDict['title'])
        itemLoader.add_value('href', articleDict['href'])
        itemLoader.add_value('abstract', articleDict['abstract'])
        items.append(itemLoader.load_item())
    return items
def parse(self, response):
    # start crawling
    print("spider start")
    # select all <li> tags whose style is _width:183px;
    node_list = response.xpath("//li[@style='_width:183px;']")
    # iterate over the list
    for node in node_list:
        # create an item object to hold the scraped fields
        item = MyscrapyItem()
        # extract(): convert the xpath objects to unicode strings
        novelName = node.xpath("./a/@alt").extract()
        authorName = node.xpath("./a/label/text()").extract()
        novelContent = node.xpath("./a/@href").extract()
        # lightly process the extracted info and store it in the item
        item['novelName'] = novelName[0].split(" ")[0]
        item['authorName'] = authorName[0]
        item['novelContent'] = "http://www.jjwxc.net/" + novelContent[0]
        yield item
def parse(self, response):
    item = MyscrapyItem()
    tr_list = response.xpath(
        "//div[@class='greyframe']/table[2]/tr/td/table/tr")
    for tr in tr_list:
        item["title"] = tr.xpath(
            "./td[2]/a[@class='news14']/@title").extract_first()
        item["href"] = tr.xpath(
            "./td[2]/a[@class='news14']/@href").extract_first()
        item["who"] = tr.xpath("./td[4]/text()").extract_first()
        item["time"] = tr.xpath("./td[5]/text()").extract_first()
        # deepcopy so each detail request carries its own snapshot of the
        # item, which parse_detail completes and yields
        yield scrapy.Request(item["href"],
                             callback=self.parse_detail,
                             meta={"item": deepcopy(item)})
    # "@text()" is not valid XPath; match the link text with text()
    next_url = response.xpath(
        "//div[@class='pagination']/a[text()='>']/@href").extract_first()
    if next_url is not None:
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    data = response.body
    soup = BeautifulSoup(data, "lxml")
    data = soup.find("table", class_="tablelist")
    trs = data.select("tr")
    # skip the header row and the trailing navigation rows
    for tr in trs[1:-2]:
        td = tr.select("td")
        item = MyscrapyItem()
        item["title"] = td[0].string
        item["position"] = td[3].string
        item["date"] = td[4].string
        yield item
    div = soup.find('div', class_="pagenav")
    next_link = div.find("a", id="next")
    next_url = "https://hr.tencent.com/" + next_link['href']
    yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    content = response.body.decode('utf-8')
    # with open('youbain.html', 'w', encoding='utf-8') as fp:
    #     fp.write(content)
    # extract the data
    tree = etree.HTML(content)
    cell_list = tree.xpath('//div[@class="cinema-cell"]')
    for cell in cell_list:
        item = MyscrapyItem()
        # cinema name
        name = cell.xpath('./div[@class="cinema-info"]/a/text()')[0]
        item['name'] = name
        # address (the field is spelled 'adress' in the Item definition,
        # so the key is kept as-is)
        adress = cell.xpath('./div[@class="cinema-info"]/p/text()')[0]
        item['adress'] = adress
        yield item
def parse(self, response):
    # updated from Python 2 print statements and "except Exception, e" syntax
    try:
        print('url : ', response.url)
        urls = response.xpath(
            '/html/body/div[8]/div/div[3]/img/@src').extract()
        title = response.xpath(
            '/html/body/div[8]/div/div[3]/h1/text()').extract()
        item = MyscrapyItem()
        item['title'] = title
        item['urls'] = urls
        yield item
        page_urls = response.xpath(
            '/html/body/div[8]/div/ul/span/a/@href').extract()
        print(len(page_urls))
        for url in page_urls:
            if str(url).lower().startswith('http'):
                full_url = url
            else:
                full_url = response.urljoin(url)
            yield Request(url=full_url, callback=self.parse)
    except Exception as e:
        print("ERROR : ", e)
def parse(self, response):
    blogs = response.xpath("//div[@class='post_item_body']")
    for b_item in blogs:
        item = MyscrapyItem()
        item["title"] = b_item.xpath(
            "./h3/a[@class='titlelnk']/text()").extract()[0].strip()
        item["link"] = b_item.xpath(
            "./h3/a[@class='titlelnk']/@href").extract_first().strip()
        item["discribe"] = b_item.xpath(
            "./p[@class='post_item_summary']/text()").extract_first().strip()
        item["author"] = b_item.xpath(
            "./div[@class='post_item_foot']/a[@class='lightblue']/text()"
        ).extract_first().strip()
        item["comment"] = b_item.xpath(
            "./div[@class='post_item_foot']/span[@class='article_comment']"
            "/a[@class='gray']/text()").extract_first().strip()
        yield item
    next_links = response.xpath("//div[@class='pager']/a[last()]")
    for next_link in next_links:
        if next_link.xpath("./text()").extract_first() == 'Next >':
            next_link_href = next_link.xpath("./@href").extract()[0]
            yield self.make_requests_from_url(
                u"https://www.cnblogs.com" + next_link_href)
def parse(self, response):
    for sel in response.xpath(
            '//div[@class="mod-info-flow"]/div/div[@class="mob-ctt"]'):
        item = MyscrapyItem()
        item['title'] = sel.xpath('h3/a/text()')[0].extract()
        print(item['title'])
        # the original snippet only printed the title; yield the item so
        # it actually reaches the pipeline
        yield item