def get_detail(self, response):
    """Parse a v.qq.com video detail page and enrich the item forwarded in
    ``response.meta['item']`` with description, tags, episode links and play count.

    Yields the enriched item.
    """
    item = response.meta['item']
    # extract_first with a default avoids the IndexError the old
    # ``extract()[0]`` raised on pages without a summary paragraph.
    description = response.xpath("//p[@class='summary']/text()").extract_first('')
    tags_text = response.xpath("//div[@class='video_tags _video_tags']/a/text()").extract()
    tags = ','.join(tags_text)  # '' when the page has no tags
    # 动漫这里显示集数有几种方式 — the episode list appears under one of two
    # layouts; take whichever selector yields results.
    episode_links = response.css('.item_detail_half a::attr(href)').extract()
    if not episode_links:
        episode_links = response.css('.mod_episode .item a::attr(href)').extract()
    # Build "第N集:https://v.qq.com<href>," entries; a single join replaces the
    # quadratic ``+=`` loop and the redundant nested length check.
    jishu = ''.join(
        "第%s集:https://v.qq.com%s," % (i, href)
        for i, href in enumerate(episode_links, start=1)
    )
    play_time = response.xpath(
        "//div[@class='figure_count']/span[@class='num']/text()"
    ).extract_first()
    item['url_object_id'] = get_md5(response.url)
    item['jishu'] = jishu
    item['tags'] = tags
    item['description'] = description
    item['play_time'] = play_time
    yield item
def parse_item(self, response):
    """Load one MovieItem from a movie detail page.

    Returns the populated item (each listing page links to many movies; this
    is called once per movie page).
    """
    loader = ItemLoader(item=MovieItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    css_fields = (
        ('main_title', '.video_title a::text'),
        ('title', '.video_title::text'),
        ('tags', '.video_info a::text'),
        ('score1', '.video_score .units::text'),   # 评分两部分构成 — score comes in two parts
        ('score2', '.video_score .decimal::text'),
        ('info', '.summary::text'),
        ('role', '.director a::text'),
        ('image_url', '.figure_pic::attr(style)'),
    )
    for field, selector in css_fields:
        loader.add_css(field, selector)
    loader.add_value('movie_url', self.movie_detail_url + response.url)
    return loader.load_item()
def parse_detail(self, response):
    """Parse a Zhilian job-detail page into a ZhilianzhaopinItem.

    Returns the loaded item, or None when loading fails (best-effort: one bad
    page should not stop the crawl).
    """
    try:
        loader = ItemLoader(item=ZhilianzhaopinItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        # NOTE(review): the field selectors below were disabled in the
        # original; re-enable once the page structure is confirmed.
        # loader.add_css('title', '.l.info-h3::text')
        # loader.add_css('all', 'strong::text')  # salary work_years degree_need job_addvantage scale company_type address
        # loader.add_css('tags', '.icon-promulgator-person a::text')
        # loader.add_css('job_info', '.pos-ul span::text')
        # loader.add_css('company_name', '.companny a::text')
        return loader.load_item()
    except Exception as e:
        # Deliberate broad catch: report the failure and return None rather
        # than crash the spider on a malformed page.
        print(e)
        return None
def parse_item(self, response):
    """Load a LagouJobItem from a Lagou job-detail page."""
    loader = ItemLoader(item=LagouJobItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    css_rules = (
        ('title', '.job-name .name::text'),
        ('salary_min', '.salary::text'),          # 工资范围 1k-2k
        ('publish_time', '.publish_time::text'),  # 14:46 发布于拉勾网
        ('job_desc', '.job_bt div p'),            # 列表 需要','.join()
        ('company_url', '#job_company dt a::attr(href)'),
    )
    xpath_rules = (
        ('job_city', '/html/body/div[3]/div/div[1]/dd/p[1]/span[2]/text()'),  # 有斜线 /上海/
        ('work_years_min', '/html/body/div[3]/div/div[1]/dd/p[1]/span[3]/text()'),
        ('degree_need', '/html/body/div[3]/div/div[1]/dd/p[1]/span[4]/text()'),
        ('work_type', '/html/body/div[3]/div/div[1]/dd/p[1]/span[5]/text()'),
        # 例如 ['移动互联网', '房产服务', '金融', '智能硬件', 'ERP', '后台']
        ('tags', '/html/body/div[3]/div/div[1]/dd/ul/li/text()'),
        ('job_addvantage', '//*[@id="job_detail"]/dd[1]/p/text()'),
        ('company_name', '//*[@id="job_company"]/dt/a/div/h2/text()'),   # 空格处理
        ('company_area', '//*[@id="job_detail"]/dd[3]/div[1]/a/text()'), # 空格处理
        ('company_develop_state', '//*[@id="job_company"]/dd/ul/li[2]/text()'),
        ('company_scale', '//*[@id="job_company"]/dd/ul/li[4]/text()'),
    )
    for field, selector in css_rules:
        loader.add_css(field, selector)
    for field, xpath in xpath_rules:
        loader.add_xpath(field, xpath)
    return loader.load_item()
def parse_item(self, response):
    """Load a ShixisengItem from an internship detail page."""
    loader = ItemLoader(item=ShixisengItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    field_selectors = {
        'title': '.new_job_name::text',
        'upgrade_time': '.job_date span::text',
        'salary_min': '.job_money::text',  # 范围 — a salary range string
        'job_city': '.job_position::attr(title)',
        'degree_need': '.job_academic::text',
        'work_per_week': '.job_week::text',
        'shixi_time': '.job_time::text',
        'job_addvantage': '.job_good::text',
        'job_info': '.job_part ::text',
        'company_name': '.job_com_name::text',
        'company_url': '.job_link::text',
        'work_address': '.com_position::text',
        'tags': '.job_detail_msg span::text',
        'end_time': '.deadline .job_detail::text',
    }
    for field, selector in field_selectors.items():
        loader.add_css(field, selector)
    return loader.load_item()
def parse_detail(self, response):
    """Extract the concrete fields of a Jobbole article page.

    Field cleaning (date parsing, count extraction, tag filtering) is
    delegated to the per-field processors configured on the custom
    ArticleItemLoder / JobboleArticalItem rather than done inline here.

    Yields the loaded JobboleArticalItem.

    NOTE: the previous ~40-line commented-out manual-extraction block that
    duplicated this loader logic has been removed as dead code.
    """
    # 文章封面图 — the cover image URL is forwarded from the listing page
    # via request meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loder = ArticleItemLoder(item=JobboleArticalItem(), response=response)
    item_loder.add_css('title', '.entry-header h1::text')
    item_loder.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    item_loder.add_css('praise_nums', '.vote-post-up h10::text')
    item_loder.add_css('fav_nums', '.bookmark-btn::text')
    item_loder.add_css('comment_nums', "a[href='#article-comment'] span::text")
    item_loder.add_css('content', '.entry p::text')
    item_loder.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')
    item_loder.add_value('url', response.url)
    item_loder.add_value('url_object_id', get_md5(response.url))
    item_loder.add_value('front_image_url', [front_image_url])
    # load_item() is what actually runs the processors and builds the item.
    article_item = item_loder.load_item()
    yield article_item