def parse(self, response):
    selector = Selector(response)
    posts = selector.xpath('//div[@class="post_item_body"]')
    for post in posts:
        # Create a fresh item for every post instead of reusing one instance.
        item = BlogItem()
        # Further fields available on each post:
        # Title = post.xpath('h3/a[@class="titlelnk"]/text()').extract()
        # path = "http:" + post.xpath('p[@class="post_item_summary"]/a/img/@src').extract()[0]
        # num = post.xpath('span[@class="article_view"]/a[@class="gray"]/text()').extract()
        item['Name'] = post.xpath(
            'div[@class="post_item_foot"]/a[@class="lightblue"]/text()'
        ).extract()[0]
        yield item
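# Every callback in this file fills a BlogItem. The item class itself is not
# part of these snippets; below is a minimal hypothetical sketch of what the
# project's items.py could declare (field names collected from the snippets;
# each spider uses only the subset it actually fills).
import scrapy


class BlogItem(scrapy.Item):
    Name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    url = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    content = scrapy.Field()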
def parse(self, response):
    # print(response.body)  # debug
    entries = response.xpath('//div[@class="showlist"]/li')
    for entry in entries:
        blogitem = BlogItem()
        # Relative paths (.//) keep each field scoped to this entry rather
        # than matching against the whole document.
        blogitem['url'] = entry.xpath('.//p[@class="showpic"]/a/@href').extract()
        blogitem['img'] = entry.xpath('.//p[@class="showpic"]/a/img/@src').extract()
        blogitem['title'] = entry.xpath('.//p[@class="showpic"]/a/img/@alt').extract()
        yield blogitem
import re
from urllib import parse


def parse_details(self, response):
    item = BlogItem()
    item['url'] = response.url
    query_str = parse.parse_qs(parse.urlsplit(response.url).query)
    item['author'] = query_str['blogId'][0]

    def clean(text):
        # Strip quotes and HTML tags, drop control/zero-width characters,
        # then collapse runs of spaces.
        text = re.sub(r'<.*?>', ' ', text.replace('"', ''))
        for ch in ('\r\n', '\n', '\t', '\u200b'):
            text = text.replace(ch, '')
        return re.sub(' +', ' ', text.strip())

    title = ""
    content = ""
    if 'blog.naver.com' in response.url:
        # Try the newest editor markup first, then fall back to older layouts.
        title = str(response.xpath(
            "//div[@class='se-module se-module-text se-title-text']/p/span/text()").get())
        item['date'] = response.xpath(
            "//span[contains(@class, 'se_publishDate')]/text()").get()
        content = str(response.xpath("//div[@class='se-main-container']").get())
        if content == 'None':
            title = str(response.xpath("//div[contains(@class,'se_title')]//h3").get())
            item['date'] = response.xpath(
                "//span[contains(@class, 'se_publishDate')]/text()").get()
            content = str(response.xpath("//div[contains(@class, 'sect_dsc')]").get())
        if content == 'None':
            title = str(response.xpath("//div[@class='htitle']/span/text()").get())
            item['date'] = response.xpath(
                "//p[contains(@class,'_postAddDate')]/text()").get()
            content = str(response.xpath("//div[@id='postViewArea']/div").get())

    item['title'] = clean(title)
    item['content'] = clean(content)
    yield item
def parse(self, response):
    sel = Selector(response)
    articles = sel.xpath('//div[@id="content"]/article')
    for article in articles:
        item = BlogItem()
        item['title'] = article.xpath(
            'header/h1[@class="entry-title"]/a/text()').extract()
        item['link'] = article.xpath(
            'header/h1[@class="entry-title"]/a/@href').extract()
        # The first two paragraphs of the entry body serve as the description.
        item['description'] = article.xpath(
            'div[@class="entry-content"]/p[position()<3]/text()').extract()
        yield item
def parse(self, response):
    # Yield one item per title/link pair; extract_first() keeps the two
    # fields aligned even when a node holds several text children.
    for sel in response.xpath('//*[@class="postTitle2"]'):
        title = sel.xpath('text()').extract_first()
        link = sel.xpath('@href').extract_first()
        if title is None:
            continue
        self.logger.info("<TITLE> : \t" + title)
        item = BlogItem()
        item['title'] = title
        item['link'] = link
        yield item
def detail_parse(self, response):
    posts = response.xpath('//div[contains(@class,"article-item-box")]')
    for post in posts:
        item = BlogItem()
        # Relative paths (./ and .//) keep each field scoped to this post.
        item["is_origin"] = s_format(post.xpath('.//a/span/text()').get())
        item["title"] = s_format(post.xpath('./h4//text()[2]').get())
        item["commit"] = post.xpath(
            './/div[contains(@class,"info-box")]/p[3]/span/text()'
        ).get().split(':')[1].strip()
        item["look"] = post.xpath(
            './/div[contains(@class,"info-box")]/p[2]/span/text()'
        ).get().split(':')[1].strip()
        item["create_time"] = post.xpath(
            './/div[contains(@class,"info-box")]/p[1]/span/text()').get()
        yield item
def parse(self, response):
    entries = response.xpath(
        '//div[@class="blog_list_wrap"]/dl[@class="blog_list clearfix"]')
    for entry in entries:
        # Create a fresh item per entry instead of reusing one instance.
        item = BlogItem()
        item['name'] = entry.xpath(
            './/h3[@class="csdn-tracking-statistics"]/a/text()').extract()
        item['url'] = entry.xpath(
            './/h3[@class="csdn-tracking-statistics"]/a/@href').extract()
        item['author'] = entry.xpath('.//dt/a[2]/text()').extract()
        item['skim'] = entry.xpath('.//dd/div[2]/div[2]/span/em/text()').extract()
        item['sort'] = entry.xpath('.//dd/div[2]/div[1]/span/a/text()').extract()
        yield item
    # Follow the remaining list pages; `num` (the total page count) is
    # assumed to be defined on the spider or module.
    for i in range(1, num):
        next_url = 'http://blog.csdn.net/?&page={}'.format(i)
        yield Request(next_url, headers=self.headers)
import requests


def parse_item(self, response):
    items = BlogItem()
    sel = Selector(response)
    btitle = sel.xpath('//*[@class="articalTitle"]/h2/text()').extract()[0]
    btime = sel.xpath('//*[@class="time SG_txtc"]/text()').extract()[0]

    # Main article content.
    acontent = "".join(
        response.xpath('//div[@id="sina_keyword_ad_area2"]//text()').extract())
    acontent = acontent.replace('\n', '').replace('\t', '')

    # The publication time comes wrapped in parentheses.
    btime = btime.replace('(', '').replace(')', '')

    # Author id, taken from the alternate-feed <link> element.
    nameid = response.css('link[rel*=alternate]::attr(href)').extract()[0] \
        .split('/')[-1].split('.')[0]
    # The user-info API returns the display name as \uXXXX escapes.
    name = requests.get(
        'http://uic.sso.sina.com.cn/uic/Mutiquery.php?UID=0&Check=null&UIDS=['
        + nameid + ']&UserInfoTypes=[1]&ProductType=2&varname=requestId_8481872'
    ).text.split('"')[-2].encode().decode("unicode-escape")

    # View/like counters: the trailing id in the URL embeds the uid and the
    # article id separated by "01".
    viewids = response.url.split('_')[-1].split('.')[0].split('01')
    con = requests.get(
        'http://comet.blog.sina.com.cn/api?maintype=num&uid=' + viewids[0]
        + "&aids=" + viewids[1] + "&requestId=aritlces_number_3610").text
    arr = con.split('{')[-1].split('}')[0].split(',')
    view = 0
    like = 0
    for a in arr:
        if a.find("r") != -1:  # read count
            view = a.split(":")[-1]
        if a.find("d") != -1:  # like ("digg") count
            like = a.split(":")[-1]

    # Fill the item for the pipeline to store.
    items['name'] = name
    items['time'] = btime
    items['title'] = btitle
    items['content'] = acontent
    items['view'] = view
    items['like'] = like
    items['arturl'] = response.url
    yield items
import re
import scrapy
from scrapy.http import FormRequest

# UsersConfig, UserInfoSpider, ArticleDataSpider, and BlogItem come from the
# surrounding project.


class UserRelationshipNetsSpider(scrapy.Spider):
    name = "user_relationship_nets"
    count_dict = {}
    maxpage_dict = {}
    focus_user_dict = {}
    save_uid = []
    single_user_uid_save = []
    single_user_uid_has_requested = []
    headers = {
        'Host': 'blog.cnfol.com',
        'Accept': "application/json, text/javascript, */*; q=0.01",
        'Accept-Language': "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        'Accept-Encoding': "gzip, deflate",
        'Connection': "keep-alive",
        'Cache-Control': "max-age=0",
        'Cookie': "SUV=1471944245695628",
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
    }

    def __init__(self, first_uid, runner):
        super().__init__()
        self.user_url = 'http://blog.cnfol.com/%s/myfocus/friend'
        self.first_uid = first_uid
        self.runner = runner

    def start_requests(self):
        yield scrapy.Request(
            self.user_url % self.first_uid,
            method='GET',
            headers=self.headers,
            callback=self.request_info,
            meta={
                'cookiejar': 1,
                'request_id': self.first_uid,
                'proxy': 'http://%s' % UsersConfig['proxy'],
            },
        )

    def request_info(self, response):
        request_id = response.request.meta.get('request_id')
        # The pager reads "current/total"; fall back to a single page.
        try:
            page_num = response.xpath(
                '//*[@class="CoRed"]/text()').extract()[0].split('/')[1]
        except Exception:
            page_num = 1
        for j in range(1, int(page_num) + 1):
            focus_url = 'http://blog.cnfol.com/%s/myfocus/friend?type=&&p=%s' % (request_id, j)
            yield FormRequest(
                focus_url,
                method='GET',
                headers=self.headers,
                callback=self.parse_page,
                meta={
                    'proxy': 'http://%s' % UsersConfig['proxy'],
                    'maxpage': page_num,
                    'cookiejar': response.meta['cookiejar'],
                },
                dont_filter=True,
            )

    def parse_page(self, response):
        maxpage = response.request.meta.get('maxpage')
        rows = response.xpath('//div[starts-with(@class,"MyFollowBox FirstMesg")]')
        page_data_len = len(rows)
        if page_data_len == 0:
            return
        pattern2 = 'http://blog.cnfol.com/(.*)/myfocus/frien.*'
        key_user_id2 = re.findall(pattern2, response.url)[0]
        # Count how many pages of this user's friend list have been parsed.
        if key_user_id2 in self.count_dict:
            self.count_dict[key_user_id2] += 1
        else:
            self.count_dict[key_user_id2] = 1
            self.maxpage_dict[key_user_id2] = maxpage

        if int(self.count_dict[key_user_id2]) <= int(self.maxpage_dict[key_user_id2]):
            # First pass: follow each friend's own friend list.
            for i in range(page_data_len):
                try:
                    focus_link = response.xpath(
                        '//div[starts-with(@class,"MyFollowBox FirstMesg")]/div[2]/p[2]/span[1]/a/@href'
                    )[i].extract()
                    if 'returnbolg' in focus_link:
                        continue
                    request_id = re.findall(pattern2, focus_link)[0]
                    friends_count = response.xpath(
                        '//div[starts-with(@class,"MyFollowBox FirstMesg")]/div[2]/p[2]/span[1]/a/em/text()'
                    )[i].extract()
                    if int(friends_count) < 500:
                        yield scrapy.Request(
                            self.user_url % request_id,
                            method='GET',
                            headers=self.headers,
                            callback=self.request_info,
                            meta={
                                'proxy': 'http://%s' % UsersConfig['proxy'],
                                'request_id': request_id,
                                'cookiejar': response.meta['cookiejar'],
                            },
                        )
                except Exception:
                    print('first datalength')

        # Second pass: hand influential users (few friends, many followers)
        # to the info/article spiders and record the relationship edges.
        for i in range(page_data_len):
            try:
                focus_link = response.xpath(
                    '//div[starts-with(@class,"MyFollowBox FirstMesg")]/div[2]/p[2]/span[1]/a/@href'
                )[i].extract()
                if 'returnbolg' in focus_link:
                    continue
                request_id = re.findall(pattern2, focus_link)[0]
                friends_count = response.xpath(
                    '//div[starts-with(@class,"MyFollowBox FirstMesg")]/div[2]/p[2]/span[1]/a/em/text()'
                )[i].extract()
                follows_count = response.xpath(
                    '//div[starts-with(@class,"MyFollowBox FirstMesg")]/div[2]/p[2]/span[2]/a/em/text()'
                )[i].extract()
                if int(friends_count) < 500 and int(follows_count) > 5000:
                    pass_id = request_id
                    if pass_id not in self.single_user_uid_has_requested:
                        data_dict = {'uid': pass_id}
                        self.runner.crawl(UserInfoSpider, data_dict)
                        self.runner.crawl(ArticleDataSpider, data_dict)
                        self.single_user_uid_has_requested.append(pass_id)
                    self.focus_user_dict.setdefault(key_user_id2, []).append('%s' % pass_id)
            except Exception:
                print('second datalength')

        # Once every page for this user has been parsed, emit one item
        # holding the full uid -> followed-uids edge list.
        if self.count_dict[key_user_id2] == self.maxpage_dict[key_user_id2]:
            if (str(key_user_id2) not in self.save_uid
                    and key_user_id2 in self.focus_user_dict):
                item = BlogItem()
                item['user_relationship_nets'] = '%s\t%s\n' % (
                    key_user_id2.strip(),
                    ','.join(self.focus_user_dict[key_user_id2]))
                yield item
                self.save_uid.append(key_user_id2)
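# UserRelationshipNetsSpider takes a `runner` argument and schedules
# UserInfoSpider/ArticleDataSpider itself, so it has to be driven from a
# CrawlerRunner rather than the `scrapy crawl` CLI. Below is a minimal launch
# sketch, assuming the spider classes are importable; the starting uid is a
# placeholder, not a value from the project.
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()
# Pass the runner into the spider so it can enqueue the follow-up spiders.
d = runner.crawl(UserRelationshipNetsSpider, first_uid='example_uid', runner=runner)
d.addBoth(lambda _: reactor.stop())  # stop the reactor once the crawl finishes
reactor.run()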