def manHuan_Title(self, response):
    manHuna_Book_Name = response.meta['manHuna_Book_Name']
    manHuna_Book_Type = response.meta['manHuna_Book_Type']
    manHuna_Book_Auth = response.meta['manHuna_Book_Auth']
    manHuna_Book_profile = response.meta['manHuna_Book_profile']
    manHuna_Book_cover_url = response.meta['manHuna_Book_cover_url']
    manHuna_Book_title = response.xpath('//div[@class="wrap"]/span/text()').extract()[0]
    manHuna_Book_img_url = self.get_section_page(response.url)
    if manHuna_Book_title.strip() == '':
        print('manHuna_Book_title is null')
        manHuna_Book_title = 'null'
    i = MyprojectItem()
    i['manHuna_Book_Name'] = manHuna_Book_Name
    i['manHuna_Book_Type'] = manHuna_Book_Type
    i['manHuna_Book_Auth'] = manHuna_Book_Auth
    i['manHuna_Book_profile'] = manHuna_Book_profile
    i['manHuna_Book_title'] = manHuna_Book_title
    i['manHuna_Book_img_url'] = manHuna_Book_img_url
    i['manHuna_Book_cover_url'] = manHuna_Book_cover_url
    yield i
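# Hedged sketch, not from the source: manHuan_Title() reads five fields out of
# response.meta, so an earlier callback must have put them there. This is one
# shape that upstream request could take; the method name, XPaths, and field
# sources are assumptions for illustration only.
def parse_book_page(self, response):
    meta = {
        'manHuna_Book_Name': response.xpath('//h1/text()').extract_first(),
        'manHuna_Book_Type': response.xpath('//p[@class="type"]/text()').extract_first(),
        'manHuna_Book_Auth': response.xpath('//p[@class="auth"]/text()').extract_first(),
        'manHuna_Book_profile': response.xpath('//p[@class="profile"]/text()').extract_first(),
        'manHuna_Book_cover_url': response.xpath('//img[@class="cover"]/@src').extract_first(),
    }
    # request each chapter page, carrying the book-level fields along in meta
    for href in response.xpath('//ul[@class="chapter-list"]/li/a/@href').extract():
        yield scrapy.Request(response.urljoin(href), callback=self.manHuan_Title, meta=meta)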
def parse(self, response):
    job_list = response.css('.sight_item')
    for page in job_list:
        # a fresh item per attraction avoids mutating already-yielded items
        item = MyprojectItem()
        item['name'] = page.xpath(".//a[@class='name']/text()").extract()[0]
        level_handler = page.xpath(".//span[@class='level']/text()").extract()
        item['level'] = level_handler[0] if level_handler else ''
        # drop the 3-character label prefix from the heat rating
        item['hot'] = page.xpath(".//span[@class='product_star_level']/em/span/text()").extract()[0][3:]
        item['area'] = "[" + page.css('.area').xpath(".//a/text()").extract()[0] + "]"
        item['address'] = page.css(".address").xpath(".//span/text()").extract()[0]
        price_temp = page.css(".sight_item_price").xpath(".//em/text()").extract()
        item['price'] = price_temp[0] if price_temp else 0
        item['info'] = page.css(".intro").xpath("./text()").extract()[0]
        num_handler = page.xpath(".//span[@class='hot_num']/text()").extract()
        item['num'] = num_handler[0] if num_handler else 0
        yield item
    for key in position:
        print("Crawling {}".format(key))
        # crawl the first 13 pages
        for page in range(1, 14):
            print("Crawling page {}".format(page))
            yield scrapy.Request(url_template.format(key, page), callback=self.parse)
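# Hedged sketch, not from the source: the parse() above refers to module-level
# `position` and `url_template` defined elsewhere in the spider file. Their real
# values are unknown; something of this shape would satisfy the
# url_template.format(key, page) call (both values below are assumptions):
position = ['attractions', 'museums']  # assumed search keywords
url_template = 'https://example.com/search?keyword={}&page={}'  # assumed URL pattern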
def parse(self, response):
    movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
    for i_item in movie_list:
        douban_item = MyprojectItem()
        # detailed XPaths pull each field out of the list entry
        douban_item['serial_number'] = i_item.xpath(".//div[@class='item']//em/text()").extract_first()
        douban_item['movie_name'] = i_item.xpath(".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()
        content = i_item.xpath(".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
        # the description spans several text nodes; squash the whitespace in each
        for i_content in content:
            content_s = "".join(i_content.split())
            douban_item['introduce'] = content_s
        douban_item['star'] = i_item.xpath(".//span[@class='rating_num']/text()").extract_first()
        douban_item['evaluate'] = i_item.xpath(".//div[@class='star']//span[4]/text()").extract_first()
        douban_item['describe'] = i_item.xpath(".//p[@class='quote']/span/text()").extract_first()
        # hand the item to the pipelines for cleaning and storage
        yield douban_item
    # parse the next-page link and follow it
    next_link = response.xpath("//span[@class='next']/link/@href").extract()
    if next_link:
        next_link = next_link[0]
        yield scrapy.Request("https://movie.douban.com/top250" + next_link, callback=self.parse)
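# Hedged sketch, not from the source: the comment above hands douban_item to
# the pipelines for cleaning and storage. A minimal pipeline of that kind
# (class name and output file are assumptions; it would be enabled via
# ITEM_PIPELINES in settings.py) could look like this:
import json

class MyprojectPipeline(object):
    def open_spider(self, spider):
        self.file = open('items.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # serialize each item as one JSON line
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()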
def parse_question(self, response):
    print(response.xpath('//div[@class="ask_cont"]/div[1]/p/text()').extract())
    print(response.xpath('//div[@class="sele_all marg_top"]/div[1]/p/text()').extract())
    item = MyprojectItem()
    title = response.xpath('//div[@class="ask_cont"]/div[1]/p/text()').extract()
    item['title'] = title[0] if title else ''
    dafu1 = response.xpath('//div[@class="sele_all marg_top"]/p/text()').extract()
    item['dafu1'] = dafu1[0] if dafu1 else ''
    dafu2 = response.xpath('//div[@class="sele_all"]/p/text()').extract()
    item['dafu2'] = dafu2[0] if dafu2 else ''
    yield item
def parse(self, response):
    # response carries the fetched page
    item = MyprojectItem()  # the item class defined in items.py
    item['title'] = response.xpath(self.xpath['title']).extract()[0]
    item['link'] = response.url
    # current time, shifted +9 hours to Japan time
    item['date'] = datetime.datetime.utcnow() + datetime.timedelta(hours=9)
    yield item
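# Hedged sketch, not from the source: MyprojectItem is the class "defined in
# items.py" that the callbacks here instantiate. Field sets vary per project;
# for the parse() above it would need at least these three:
import scrapy

class MyprojectItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    date = scrapy.Field()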
def parse_detail(self, response):
    x = HtmlXPathSelector(response)
    item = MyprojectItem()
    item['link'] = response.url
    item['title'] = x.select('//*[@id="BookCon"]/h1/text()').extract()
    #item['desc'] = x.select('//*[@id="pagecontent"]/text()').extract()
    item['desc'] = x.select('//div[contains(@id, "pagecontent")]/text()').extract()
    return item
def parse_detail(self, response):
    x = HtmlXPathSelector(response)
    item = MyprojectItem()
    item['link'] = response.url
    item['title'] = x.select('//title/text()').extract()
    #item['desc'] = x.select('//*[@id="pagecontent"]/text()').extract()
    item['desc'] = x.select('//*[@id="content"]/p').extract()
    return item
def parse(self, response):
    # pull the image addresses out with XPath
    images = response.selector.xpath("//img[@class='illustration']/@src").extract()
    items = MyprojectItem()
    for i in images:
        # write the URL into the item, then request the image itself
        items['image_urls'] = ['http:' + i.strip()]
        yield scrapy.Request(url=items['image_urls'][0], callback=self.parse_image)
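# Hedged sketch, not from the source: parse_image is referenced above but not
# shown. One plausible version simply writes the downloaded bytes to disk; the
# filename logic is an assumption, and Scrapy's ImagesPipeline would be the
# more idiomatic way to download image_urls.
def parse_image(self, response):
    filename = response.url.split('/')[-1]
    with open(filename, 'wb') as f:
        f.write(response.body)  # raw image bytes from the response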
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//ul/li')
    items = []
    for site in sites:
        item = MyprojectItem()
        # note: the absolute //* paths below select from the whole document,
        # so title and desc repeat the same values for every <li>
        item['title'] = site.select('//*[@id="BookCon"]/h1/text()').extract()
        item['link'] = site.select('a/@href').extract()
        item['desc'] = site.select('//*[@id="pagecontent"]/text()').extract()
        items.append(item)
    return items
def parse(self, response):
    jsondata = json.loads(response.text)
    result = jsondata["data"]["results"]
    for item in result:
        myitem = MyprojectItem()
        deurl = item["positionURL"]
        myitem["jobname"] = item["jobName"]
        myitem["salary"] = item["salary"]
        myitem["companyName"] = item["company"]["name"]
        # yield myitem
        yield scrapy.Request(deurl, callback=self.parse22, meta={"istem": myitem})
def parse(self, response):
    jsondata = json.loads(response.text)
    result = jsondata["data"]["results"]
    for item in result:
        myitem = MyprojectItem()  # instantiate the item class from items.py
        deurl = item["positionURL"]
        myitem["jobname"] = item["jobName"]
        myitem["salary"] = item["salary"]
        myitem["companyName"] = item["company"]["name"]
        # the fields above are meant for the database (pipelines handle that);
        # the request below goes on to crawl the job-description page
        # yield myitem  # not yielded here; meta passes myitem along to the next callback instead
        yield scrapy.Request(deurl, callback=self.parse22, meta={"istem": myitem})
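# Hedged sketch, not from the source: parse22 is referenced above but not shown.
# It would read the half-filled item back out of response.meta, add the job
# description, and only then yield it to the pipelines. The 'jobdesc' field
# name and the XPath are assumptions for illustration.
def parse22(self, response):
    myitem = response.meta["istem"]
    myitem["jobdesc"] = "".join(response.xpath('//div[@class="pos-ul"]//text()').extract()).strip()
    yield myitem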
def parse(self, response):
    for sel in response.xpath('//article'):
        item = MyprojectItem()
        item['title'] = sel.xpath('h1/a/text()').extract()
        item['link'] = sel.xpath('h1/a/@href').extract()
        yield item
    # next page
    next_uri = response.xpath('//li[@class="next"]/a/@href').extract()
    if len(next_uri):
        # insert a '/' after the host,
        # e.g. http://fidding.me?page=2 => http://fidding.me/?page=2
        uri = next_uri[0]
        uri = uri[:17] + '/' + uri[17:]
        yield scrapy.Request(uri, method='GET', callback=self.parse)
def parse(self, response):
    target = response.css("div.r-ent")
    for tag in target:
        try:
            # a fresh item per row avoids mutating already-yielded items
            item = MyprojectItem()
            item['title'] = tag.css("div.title a::text")[0].extract()
            item['author'] = tag.css('div.author::text')[0].extract()
            item['date'] = tag.css('div.date::text')[0].extract()
            item['push'] = tag.css('span::text')[0].extract()
            item['url'] = tag.css('div.title a::attr(href)')[0].extract()
            yield item
        except IndexError:
            # skip rows with missing fields
            continue
def parse(self, response):
    jsondata = json.loads(response.text)
    result = jsondata["data"]["results"]
    for item in result:
        # instantiate the item that carries the data to be saved
        myitem = MyprojectItem()
        # the next-level URL to keep crawling
        deurl = item["positionURL"]
        # the fields to save
        myitem["salary"] = item["salary"]
        myitem["jobName"] = item["jobName"]
        myitem["companyName"] = item["company"]["name"]
        # asynchronous hand-off: yield returns the item to the engine,
        # which routes it to the pipelines for storage
        yield myitem
        # when there is a next-level URL, keep digging
        yield scrapy.Request(deurl, callback=self.parse22, meta={"istem": myitem})
def parse(self, response):
    selector = response.css(".mctable1 tr td::text").getall()
    selector1 = list(map(lambda x: x.strip().lower().replace(',', ''), selector))
    lst1 = []
    lst2 = []
    lst3 = []
    lst4 = []
    years = []
    l = ItemLoader(item=MyprojectItem(), selector=response)
    ind = 0
    for line in selector1:
        if line == 'total share capital':
            for i in range(1, 6):
                lst1.append(float(selector1[ind + i]))
        elif line in ('networth', 'net worth'):
            for i in range(1, 6):
                lst2.append(float(selector1[ind + i]))
        elif line == 'reserves':
            for i in range(1, 6):
                lst3.append(float(selector1[ind + i]))
        elif line == 'total debt':
            for i in range(1, 6):
                lst4.append(float(selector1[ind + i]))
        ind += 1
    for i in range(1, 6):
        data = response.css("tr.lightbg td::text")[i].get()
        years.append(data)
    if len(lst4) == 0:
        lst4.extend(itertools.repeat(0.0, 5))
    l.add_value('cmp_name', response.css(".pcstname::text").get())
    l.add_value('tsc', lst1[::-1])
    l.add_value('net', lst2[::-1])
    l.add_value('res', lst3[::-1])
    l.add_value('debt', lst4[::-1])
    l.add_value('years', years[::-1])
    yield l.load_item()
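# Hedged note, not from the source: with no processors declared, ItemLoader's
# add_value() accumulates everything into lists, so the item yielded above has
# list-valued fields (cmp_name is a one-element list, tsc the five yearly
# values). Declaring an output processor in items.py unwraps single values;
# a sketch with assumed field names:
import scrapy
from scrapy.loader.processors import TakeFirst

class MyprojectItem(scrapy.Item):
    cmp_name = scrapy.Field(output_processor=TakeFirst())  # unwrap to a string
    tsc = scrapy.Field()  # stays a list of yearly values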
def parse(self, response):
    # extract the post text with XPath
    contents = response.selector.xpath("//div[@class='content']/span/text()").extract()
    for i in contents:
        # the item acts as a temporary container for the data
        item = MyprojectItem()
        item['content'] = i.strip()
        # yield sends the data on to the pipeline for further processing
        yield item
    self.log('A response from %s just arrived!' % response.url)
def parse(self, response):
    myselector = response.css("div.dataList a::attr(title)").getall()
    l = ItemLoader(item=MyprojectItem(), selector=response)
    l.add_value('cmp_list', myselector)
    yield l.load_item()
def parse(self, response):
    item = MyprojectItem()
    item['name'] = response.xpath('//title/text()').extract()
    return item
def parse(self, response):
    item = MyprojectItem()  # store a MyprojectItem() instance in a variable
    for pdf in response.css('div.body'):
        #item['link'] = pdf.css('table span::text').extract()
        item['url'] = pdf.css('table a::attr(href)').extract()  # the PDF paths
        yield item
def parse1(self, response):
    urls = response.css('a.reference.download.internal::attr(href)').extract()
    for url in urls:
        yield ExamplesItem(file_urls=[response.urljoin(url)])
    # count the reviews processed so far
    autohomeSpider.count += 1
    print("Review number:", autohomeSpider.count)
    # print(AutohomeSpider.count)
    # grab all review divs
    divs = response.xpath('//*[@id="maodian"]/div/div/div[2]/div[@class="mouthcon"]')
    mcount = 0
    for div in divs:
        print("----------------------------------")
        item = MyprojectItem()
        # car ID
        item['CAR_ID'] = div.xpath('div/div[1]/div[2]/dl[1]/dd/a[1]/@href')[0].extract().replace('/', '')
        print(item['CAR_ID'])
        # car name
        item['CAR_NAME'] = div.xpath('div/div[1]/div[2]/dl[1]/dd/a[1]/text()')[0].extract()
        # user ID
        USER_ID1 = div.xpath('div/div[1]/div[1]/div/div[1]/div[2]/p/a/@href')[0].extract()
        item['USER_ID'] = re.findall(r'\d{1,15}', USER_ID1)[0]
        item['USER_NAME'] = div.xpath('div/div[1]/div[1]/div/div[1]/div[2]/p/a/text()')[0].extract().strip()
        # purchase place
        PURCHASE_PLACE = div.xpath('div/div[1]/div[2]/dl[2]/dd')[0]
        item['PURCHASE_PLACE'] = PURCHASE_PLACE.xpath('string(.)').extract()[0].strip()
        # the <dl> entries share the same markup and their count varies,
        # so branch on how many there are
        dls = div.xpath('div/div[1]/div[2]/dl')
        # the normal case has 7 entries
        if len(dls) == 7:
            # purchase time
            item['PURCHASE_TIME'] = div.xpath('div/div[1]/div[2]/dl[4]/dd/text()')[0].extract().strip()
            # bare-car purchase price
            CAR_PRICE = div.xpath('div/div[1]/div[2]/dl[5]/dd')[0]
            item['CAR_PRICE'] = CAR_PRICE.xpath('string(.)').extract()[0].strip().replace('\xa0', '')
            # purchase purpose
            PURCHASE_PURPOSE = div.xpath('div/div[1]/div[2]/dl[7]/dd')[0]
            item['PURCHASE_PURPOSE'] = PURCHASE_PURPOSE.xpath('string(.)').extract()[0].strip().replace('\r\n', '').replace(' ', ';')
        # the irregular case has 6, in two flavours: missing dealer or missing fuel consumption
        elif len(dls) == 6:
            p = div.xpath('div/div[1]/div[2]/dl[5]/dt/p')
            # a <p> tag here means fuel consumption is present and the dealer is missing
            if p:
                # purchase time
                item['PURCHASE_TIME'] = div.xpath('div/div[1]/div[2]/dl[3]/dd/text()')[0].extract().strip()
                # bare-car purchase price
                CAR_PRICE = div.xpath('div/div[1]/div[2]/dl[4]/dd')[0]
                item['CAR_PRICE'] = CAR_PRICE.xpath('string(.)').extract()[0].strip().replace('\xa0', '')
                # purchase purpose
                PURCHASE_PURPOSE = div.xpath('div/div[1]/div[2]/dl[6]/dd')[0]
                item['PURCHASE_PURPOSE'] = PURCHASE_PURPOSE.xpath('string(.)').extract()[0].strip().replace('\r\n', '').replace(' ', ';')
            # no <p> means fuel consumption is missing and the dealer is present
            else:
                # purchase time
                item['PURCHASE_TIME'] = div.xpath('div/div[1]/div[2]/dl[4]/dd/text()')[0].extract().strip()
                # bare-car purchase price
                CAR_PRICE = div.xpath('div/div[1]/div[2]/dl[5]/dd')[0]
                item['CAR_PRICE'] = CAR_PRICE.xpath('string(.)').extract()[0].strip().replace('\xa0', '')
                # purchase purpose
                PURCHASE_PURPOSE = div.xpath('div/div[1]/div[2]/dl[6]/dd')[0]
                item['PURCHASE_PURPOSE'] = PURCHASE_PURPOSE.xpath('string(.)').extract()[0].strip().replace('\r\n', '').replace(' ', ';')
        # score - space
        item['SCORE_SPACE'] = div.xpath('div/div[1]/div[2]/div[1]/dl/dd/span[2]/text()')[0].extract()
        # score - power
        item['SCORE_POWER'] = div.xpath('div/div[1]/div[2]/div[2]/dl/dd/span[2]/text()')[0].extract()
        # score - handling
        item['SCORE_CONTROL'] = div.xpath('div/div[1]/div[2]/div[3]/dl/dd/span[2]/text()')[0].extract()
        # score - fuel consumption
        item['SCORE_FUEL_CONSUMPTION'] = div.xpath('div/div[1]/div[2]/div[4]/dl/dd/span[2]/text()')[0].extract()
        # score - comfort
        item['SCORE_COMFORT'] = div.xpath('div/div[1]/div[2]/div[5]/dl/dd/span[2]/text()')[0].extract()
        # score - exterior
        item['SCORE_EXTERIOR'] = div.xpath('div/div[1]/div[2]/div[6]/dl/dd/span[2]/text()')[0].extract()
        # score - interior
        item['SCORE_INTERIOR'] = div.xpath('div/div[1]/div[2]/div[7]/dl/dd/span[2]/text()')[0].extract()
        # score - value for money
        item['SCORE_COST_EFFECTIVE'] = div.xpath('div/div[1]/div[2]/div[8]/dl/dd/span[2]/text()')[0].extract()
        #item['SCORE_COST_EFFECTIVE'] = div.xpath('div/div[1]/div[2]/div[8]/dl/dd/span[2]/text()').extract()
        # review URL
        #url_id_pre = div.xpath('div/div[1]/div[1]/div/div[2]/div[2]/@id')[0].extract()  # yields e.g. DivRelatedTopics_1565672
        # url_id_pre = div.xpath('//div[@class="allcont border-b-solid"]/a[1]/@href').extract()
        url_id_pre = div.xpath('//div[@class="allcont border-b-solid"]/a[1]/@href')[mcount].extract()
        # extract the id
        #url_id = re.findall(r'\d{1,20}', url_id_pre)[0]
        # store the review URL
        item['COMMENT_URL'] = url_id_pre
        # "http://k.autohome.com.cn/FrontAPI/GetFeelingByEvalId?evalId=" + url_id
        COMMENT_URL = 'https:' + item['COMMENT_URL']
        mcount = mcount + 1
        print(item)
        yield SplashRequest(url=COMMENT_URL,
                            callback=self.parse_recommand,
                            magic_response=True,
                            args={'timeout': 8, 'wait': 0.5})
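# Hedged sketch, not from the source: parse_recommand is the SplashRequest
# callback referenced above but not shown. It receives the Splash-rendered
# review page; the XPath below is an assumption for illustration.
def parse_recommand(self, response):
    recommend = response.xpath('string(//div[@class="text-con"])').extract_first()
    print(recommend)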