def parse_article(self, response):
    """Parse one article detail page and yield the populated article item.

    Expects ``response.meta['item']`` to carry the partially filled item
    (title / url / front image) handed over by the listing-page callback.

    Fixes over the previous version:
    - raw string for the regex (``'\\d+'`` is an invalid escape sequence);
    - every selector ``.get()`` may return ``None`` on a miss, so each
      ``.strip()`` / ``int()`` is guarded instead of raising
      ``AttributeError`` / ``ValueError`` mid-crawl.
    """
    article_item = response.meta['item']

    pub_date = response.css('.entry-meta-hide-on-mobile::text').get()
    cate = response.xpath('//a[@rel="category tag"]/text()').get()
    tags = response.xpath(
        '//p[@class="entry-meta-hide-on-mobile"]/a[contains(@href,"tag")]/text()'
    ).getall()
    votetotal = response.xpath(
        '//h10[contains(@id,"votetotal")]/text()').get()
    booktotal = response.css('span.bookmark-btn::text').get()
    commenttotal = response.xpath(
        '//a[@href="#article-comment"]//text()').get()
    content = response.xpath('//div[@class="entry"]').get()

    # Compile once, raw string so '\d' is a regex class, not an escape.
    digits = re.compile(r'\d+')
    book_match = digits.match(booktotal.strip()) if booktotal else None
    comment_match = digits.match(commenttotal.strip()) if commenttotal else None

    article_item['url_md5_id'] = get_md5(response.url)
    # pub_date text is e.g. " 2017/01/01 · ..." — keep only the date token.
    article_item['pub_date'] = pub_date.strip().split()[0] if pub_date else ''
    article_item['cate'] = cate
    article_item['tags'] = tags
    # The vote counter may contain non-numeric text; fall back to 0.
    try:
        article_item['votetotal'] = int(votetotal) if votetotal else 0
    except ValueError:
        article_item['votetotal'] = 0
    article_item['booktotal'] = int(book_match.group()) if book_match else 0
    article_item['commenttotal'] = (
        int(comment_match.group()) if comment_match else 0)
    article_item['content'] = content.strip() if content else ''

    yield article_item
def parse_article(self, response):
    """Parse one article detail page with an ItemLoader and yield the item.

    ``response.meta['item']`` carries the title / url / front-image values
    collected on the listing page; the remaining fields are extracted here
    and cleaned by the loader's input/output processors.
    """
    item = response.meta['item']

    # Populate the item through an ItemLoader instance so all cleaning and
    # type conversion lives in the loader's processors, not inline here.
    loader = ArticleItemloader(item=BoleArticle(), response=response)
    loader.add_value('title', item['title'])
    loader.add_value('url', item['url'])
    loader.add_value('front_img_url', item['front_img_url'])
    loader.add_css('pub_date', '.entry-meta-hide-on-mobile::text')
    loader.add_xpath(
        'cate',
        '//p[@class="entry-meta-hide-on-mobile"]/a[@rel="category tag"]/text()'
    )
    loader.add_xpath(
        'tags',
        '//p[@class="entry-meta-hide-on-mobile"]/a[contains(@href,"tag")]/text()'
    )
    loader.add_xpath('votetotal', '//h10[contains(@id,"votetotal")]/text()')
    loader.add_css('booktotal', 'span.bookmark-btn::text')
    loader.add_xpath('commenttotal', '//a[@href="#article-comment"]//text()')
    loader.add_xpath('content', '//div[@class="entry"]')
    loader.add_value('url_md5_id', get_md5(response.url))

    yield loader.load_item()
def parse_item(self, response):
    """Parse a Lagou job-detail page and return the populated job item.

    All field extraction is delegated to ``LaGouItemloader`` so that
    cleaning/conversion is handled by the loader's processors.
    """
    # NOTE: the previous version dumped every response body to a single
    # 'lagou.html' file (debug leftover, overwritten on each item) and
    # carried a dead string literal listing the item's fields; both removed.
    loader = LaGouItemloader(item=LagoujobItem(), response=response)

    loader.add_value('url', response.url)
    loader.add_value('url_md5_id', get_md5(response.url))
    loader.add_css('title', '.job-name::attr(title)')
    loader.add_css('salary', 'span.salary::text')
    # The job_request <span>s are positional: 2=city, 3=experience,
    # 4=degree, 5=job type.
    loader.add_css('job_city', 'dd.job_request > p >span:nth-child(2)::text')
    loader.add_css('work_years', 'dd.job_request > p >span:nth-child(3)::text')
    loader.add_css('degree_need', 'dd.job_request > p >span:nth-child(4)::text')
    loader.add_css('job_type', 'dd.job_request > p >span:nth-child(5)::text')
    loader.add_css('pub_time', 'p.publish_time::text')
    loader.add_css('tags', '.labels::text')
    loader.add_css('job_advantage', '.job-advantage > p::text')
    loader.add_css('job_desc', '.job-detail')
    loader.add_css('job_address', '.work_addr')
    loader.add_css('company_name', 'img.b2::attr(alt)')
    loader.add_css('company_url', '.job_company > dt > a::attr(href)')
    loader.add_value('crawl_time', datetime.datetime.now())

    return loader.load_item()