class ZhihuPipeline(object):
    """Persist crawled Zhihu users into MySQL."""

    def open_spider(self, spider):
        self.write_db = DB(MySQL['db_host'], MySQL['db_port'], MySQL['db_user'],
                           MySQL['db_password'], MySQL['db_dbname'])

    def close_spider(self, spider):
        self.write_db.__delete__()

    def process_item(self, item, spider):
        sql = """insert ignore into user
                 (u_id, name, avatar, remark, agree, thanks, location, business,
                  gender, employment, education, education_extra, asks, answers,
                  posts, collections, logs)
                 values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
        self.write_db.execute(sql, (
            item['id'], item['name'], item['avatar'], item['remark'],
            item['agree'], item['thanks'], item['location'], item['business'],
            item['gender'], item['employment'], item['education'],
            item['education_extra'], item['asks'], item['answers'],
            item['posts'], item['collections'], item['logs']))
        return item
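# The DB helper and the MySQL settings dict used throughout this section are
# not shown anywhere in it. Below is a minimal sketch of what they might look
# like, assuming MySQLdb; the method names (query, execute, __delete__) and
# the dict-style rows are inferred from the call sites above, not from a
# published implementation.
import MySQLdb
import MySQLdb.cursors

MySQL = {
    'db_host': '127.0.0.1',   # assumption: local MySQL instance
    'db_port': 3306,
    'db_user': 'root',
    'db_password': '',
    'db_dbname': 'spider',
}

class DB(object):
    def __init__(self, host, port, user, password, dbname):
        self.conn = MySQLdb.connect(
            host=host, port=port, user=user, passwd=password, db=dbname,
            charset='utf8', cursorclass=MySQLdb.cursors.DictCursor)
        self.conn.autocommit(True)

    def query(self, sql, args):
        # Read query: returns rows as dicts, since callers index results
        # like res[0]['url'].
        cursor = self.conn.cursor()
        cursor.execute(sql, args)
        return cursor.fetchall()

    def execute(self, sql, args):
        # Write query: returns the affected row count.
        cursor = self.conn.cursor()
        return cursor.execute(sql, args)

    def __delete__(self):
        # Called explicitly by the spiders to release the connection.
        self.conn.close()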
    def parse_urls(self, urls):
        master_db = DB(MySQL['db_host'], MySQL['db_port'], MySQL['db_user'],
                       MySQL['db_password'], MySQL['db_dbname'])
        for i in urls:
            sql = """insert ignore into url (url) values (%s)"""
            print 'queueing url: ', i
            # Note the one-element tuple: (i,), not (i).
            resQuery = master_db.execute(sql, (i,))
        master_db.__delete__()
    def set_refer(self, response):
        print '--------------------set_refer------------------'
        if response.status == 200:
            master_db = DB(MySQL['db_host'], MySQL['db_port'], MySQL['db_user'],
                           MySQL['db_password'], MySQL['db_dbname'])
            sql = """select * from topic_url where status = 0 limit 1"""
            res = master_db.query(sql, ())
            print res
            if not res:
                master_db.__delete__()
                return
            url = res[0]['url']
            sql = """update topic_url set status = 1 where id = %s limit 1"""
            master_db.execute(sql, (res[0]['id'],))
            master_db.__delete__()
            topic_url = self.base_url + url + '/followers'
            yield scrapy.Request(url=topic_url,
                                 meta={'cookiejar': 1},
                                 callback=self.parse_follower)
    def parse_item(self, response):
        write_db = DB(MySQL['db_host'], MySQL['db_port'], MySQL['db_user'],
                      MySQL['db_password'], MySQL['db_dbname'])
        # The country id is the second-to-last path segment of the listing URL.
        tmp_res = response.url.split('/')
        country_id = tmp_res[-2]
        item = MafengwoItem()
        scenics = response.css('.row-allPlace li a')
        for i in scenics:
            item['cn_name'] = i.css('a strong::text').extract()
            item['en_name'] = i.css('a::text').extract()
            item['url'] = i.css('a::attr("href")').extract()
            # Collapse each selector result to its first match, or 0 when absent.
            item['cn_name'] = item['cn_name'][0].encode('utf-8').strip() if item['cn_name'] else 0
            item['en_name'] = item['en_name'][0].encode('utf-8').strip() if item['en_name'] else 0
            item['url'] = item['url'][0].encode('utf-8') if item['url'] else 0
            if item['cn_name']:
                print item
                sql = "insert ignore into scenic (cn_name, en_name, url, country_id) values (%s, %s, %s, %s)"
                resQuery = write_db.execute(sql, (item['cn_name'], item['en_name'],
                                                  item['url'], country_id))
        write_db.execute('update country set status = 1 where country_id = %s',
                         (country_id,))
        write_db.__delete__()
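# MafengwoItem is imported from the project's items module but not shown in
# this section. A minimal sketch, assuming only the three fields parse_item
# assigns above:
import scrapy

class MafengwoItem(scrapy.Item):
    cn_name = scrapy.Field()   # Chinese name of the scenic spot
    en_name = scrapy.Field()   # English name
    url = scrapy.Field()       # relative detail-page URL, e.g. "/poi/....html"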
class pic(scrapy.Spider):
    name = "pic"
    base_url = 'http://www.mafengwo.cn/mdd/ajax_photolist.php?act=getPoiPhotoList&poiid='
    allowed_domains = ["www.mafengwo.cn"]
    start_urls = []

    def start_requests(self):
        # Override the spider's default entry point to issue a custom request;
        # on success Scrapy invokes the callback.
        print '--------------------start request------------------'
        request_url = self.get_url()
        if request_url:
            print request_url
            yield scrapy.Request(url=request_url,
                                 callback=self.parse_item,
                                 dont_filter=True,
                                 errback=self.errback_httpbin)

    def get_url(self):
        self.master_db = DB(MySQL['db_host'], MySQL['db_port'], MySQL['db_user'],
                            MySQL['db_password'], MySQL['db_dbname'])
        sql = """select * from scenic_info where pic_list is null order by id desc limit 1"""
        res = self.master_db.query(sql, ())
        if not res:
            self.master_db.__delete__()
            return False
        url = res[0]['url']
        # Derive the poi id from a "/poi/<id>.html" style URL.
        request_url = self.base_url + url[5:url.index('.')]
        return request_url

    def errback_httpbin(self, failure):
        self.master_db.__delete__()

    def parse_item(self, response):
        print '++++++++++++++++++++++++++++++++++++++++++'
        # Rebuild the "/poi/<id>.html" row key from the ajax URL's poiid parameter.
        item_url = response.url
        item_url = "/poi/" + item_url[item_url.index('d=') + 2:].encode('utf-8') + ".html"
        res_pic_list = response.css('.column .cover img::attr("src")').extract()
        item_pic_list = ","
        if res_pic_list:
            for i in res_pic_list:
                # Strip the resize query string; skip sources without one.
                if '?' in i:
                    item_pic_list = item_pic_list + i[0:i.index('?')].encode('utf-8') + ","
        sql = "update scenic_info set pic_list = %s where url = %s"
        resQuery = self.master_db.execute(sql, (item_pic_list, item_url))
        self.master_db.__delete__()
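# Worked example of the URL round-trip in get_url/parse_item above, with a
# hypothetical poi id; scenic_info.url values are assumed to look like
# "/poi/<id>.html":
url = "/poi/12345.html"
poi_id = url[5:url.index('.')]              # -> "12345"
ajax_url = pic.base_url + poi_id            # append the id to the photolist endpoint
item_url = "/poi/" + ajax_url[ajax_url.index('d=') + 2:] + ".html"
assert item_url == url                      # parse_item recovers the original row key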
    def parse_item(self, response):
        print 'parsing response ', response.url
        zhihu_item = ZhiHuItem()
        zhihu_item['id'] = response.css('.zm-rich-follow-btn::attr("data-id")').extract()
        zhihu_item['name'] = response.css('.title-section .name::text').extract()
        zhihu_item['avatar'] = response.css('.zm-profile-header-main .Avatar::attr("src")').extract()
        zhihu_item['remark'] = response.css('.title-section .bio::text').extract()
        zhihu_item['agree'] = response.css('.zm-profile-header-user-agree strong::text').extract()
        zhihu_item['thanks'] = response.css('.zm-profile-header-user-thanks strong::text').extract()
        zhihu_item['location'] = response.css(
            '.zm-profile-header-user-describe .items .info-wrap .location::attr("title")').extract()
        zhihu_item['business'] = response.css(
            '.zm-profile-header-user-describe .items .info-wrap .business::attr("title")').extract()
        zhihu_item['gender'] = response.css(
            '.zm-profile-header-user-describe .items .edit-wrap input[checked=checked]::attr("value")').extract()
        zhihu_item['employment'] = response.css(
            '.zm-profile-header-user-describe .items .info-wrap .employment::attr("title")').extract()
        zhihu_item['education'] = response.css(
            '.zm-profile-header-user-describe .items .education::attr("title")').extract()
        zhihu_item['education_extra'] = response.css(
            '.zm-profile-header-user-describe .items .education-extra::attr("title")').extract()
        zhihu_item['asks'] = response.css('.profile-navbar a[href*=asks] span::text').extract()
        zhihu_item['answers'] = response.css('.profile-navbar a[href*=answers] span::text').extract()
        zhihu_item['posts'] = response.css('.profile-navbar a[href*=posts] span::text').extract()
        zhihu_item['collections'] = response.css('.profile-navbar a[href*=collections] span::text').extract()
        zhihu_item['logs'] = response.css('.profile-navbar a[href*=logs] span::text').extract()

        # Collapse each selector result to its first match (utf-8 encoded),
        # or 0 when the profile omits the field.
        for field in ('id', 'name', 'avatar', 'remark', 'agree', 'thanks',
                      'location', 'business', 'gender', 'employment',
                      'education', 'education_extra', 'asks', 'answers',
                      'posts', 'collections', 'logs'):
            zhihu_item[field] = zhihu_item[field][0].encode('utf-8') if zhihu_item[field] else 0

        print zhihu_item
        if zhihu_item['id']:
            print '----> db <----'
            write_db = DB(MySQL['db_host'], MySQL['db_port'], MySQL['db_user'],
                          MySQL['db_password'], MySQL['db_dbname'])
            sql = """insert ignore into user
                     (u_id, name, avatar, remark, agree, thanks, location, business,
                      gender, employment, education, education_extra, asks, answers,
                      posts, collections, logs, url)
                     values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
            resQuery = write_db.execute(sql, (
                zhihu_item['id'], zhihu_item['name'], zhihu_item['avatar'],
                zhihu_item['remark'], zhihu_item['agree'], zhihu_item['thanks'],
                zhihu_item['location'], zhihu_item['business'],
                zhihu_item['gender'], zhihu_item['employment'],
                zhihu_item['education'], zhihu_item['education_extra'],
                zhihu_item['asks'], zhihu_item['answers'], zhihu_item['posts'],
                zhihu_item['collections'], zhihu_item['logs'], response.url))
            # Mark the queue row done; the 21-char prefix strips
            # "https://www.zhihu.com". Note the one-element tuple.
            write_db.execute('update url set status = 1 where url = %s limit 1',
                             (response.url[21:],))
            write_db.__delete__()
            print 'resQuery ------- ', resQuery
class user(scrapy.Spider):
    name = "user"
    base_url = 'https://www.zhihu.com'
    allowed_domains = ["www.zhihu.com", "xpisme.com"]
    start_urls = []

    def errback_httpbin(self, failure):
        url = failure.request.url
        print failure
        sql = """update url set status = 1 where url = %s"""
        self.master_db.execute(sql, (url[21:],))
        self.master_db.__delete__()

    def start_requests(self):
        # Override the spider's default entry point: keep pulling pending
        # profile URLs from MySQL and issue roughly one request per second;
        # on success Scrapy invokes the callback.
        print '--------------------start request------------------'
        while True:
            time.sleep(1)
            request_url = self.get_url()
            if request_url:
                print request_url
                yield scrapy.Request(url=request_url,
                                     callback=self.parse_item,
                                     dont_filter=True,
                                     errback=self.errback_httpbin)

    def get_employment(self, employments):
        # The react payload nests employment either under ['job']['name'] or
        # directly under ['name']; fall back to 0 when both are absent.
        if not len(employments):
            return 0
        if employments[0].get('job'):
            return employments[0]['job']['name'].encode('utf-8')
        if employments[0].get('name'):
            return employments[0]['name'].encode('utf-8')
        return 0

    def get_url(self):
        self.master_db = DB(MySQL['db_host'], MySQL['db_port'], MySQL['db_user'],
                            MySQL['db_password'], MySQL['db_dbname'])
        sql = """select * from url where status = 0 limit 1"""
        res = self.master_db.query(sql, ())
        if not res:
            self.master_db.__delete__()
            return False
        url = res[0]['url']
        # Claim the row (status = 2) so concurrent runs do not refetch it.
        sql = """update url set status = 2 where url = %s"""
        self.master_db.execute(sql, (url,))
        request_url = self.base_url + url
        return request_url

    def parse_item(self, response):
        print 'parsing response ', response.url
        # New-style profiles embed their state as JSON in #data[data-state];
        # old-style profiles still expose everything in the markup.
        if len(response.css('#data::attr("data-state")')) < 1:
            self.parse_old(response)
        else:
            self.parse_react(response)

    def parse_old(self, response):
        zhihu_item = ZhiHuItem()
        zhihu_item['id'] = response.css('.zm-rich-follow-btn::attr("data-id")').extract()
        zhihu_item['name'] = response.css('.title-section .name::text').extract()
        zhihu_item['avatar'] = response.css('.zm-profile-header-main .Avatar::attr("src")').extract()
        zhihu_item['remark'] = response.css('.title-section .bio::text').extract()
        zhihu_item['agree'] = response.css('.zm-profile-header-user-agree strong::text').extract()
        zhihu_item['thanks'] = response.css('.zm-profile-header-user-thanks strong::text').extract()
        zhihu_item['location'] = response.css(
            '.zm-profile-header-user-describe .items .info-wrap .location::attr("title")').extract()
        zhihu_item['business'] = response.css(
            '.zm-profile-header-user-describe .items .info-wrap .business::attr("title")').extract()
        zhihu_item['gender'] = response.css(
            '.zm-profile-header-user-describe .items .edit-wrap input[checked=checked]::attr("value")').extract()
        zhihu_item['employment'] = response.css(
            '.zm-profile-header-user-describe .items .info-wrap .employment::attr("title")').extract()
        zhihu_item['education'] = response.css(
            '.zm-profile-header-user-describe .items .education::attr("title")').extract()
        zhihu_item['education_extra'] = response.css(
            '.zm-profile-header-user-describe .items .education-extra::attr("title")').extract()
        zhihu_item['asks'] = response.css('.profile-navbar a[href*=asks] span::text').extract()
        zhihu_item['answers'] = response.css('.profile-navbar a[href*=answers] span::text').extract()
        zhihu_item['posts'] = response.css('.profile-navbar a[href*=posts] span::text').extract()
        zhihu_item['collections'] = response.css('.profile-navbar a[href*=collections] span::text').extract()
        zhihu_item['logs'] = response.css('.profile-navbar a[href*=logs] span::text').extract()
        # The 21-char prefix strips "https://www.zhihu.com".
        zhihu_item['url'] = response.url[21:]
        zhihu_item['following'] = response.css('.zm-profile-side-following strong')[0].extract()
        zhihu_item['followers'] = response.css('.zm-profile-side-following strong')[1].extract()
        zhihu_item['topics'] = response.css('.zg-link-litblue')[1].extract()
        zhihu_item['columns'] = response.css('.zg-link-litblue')[0].extract()

        # Collapse each selector result to its first match (utf-8 encoded),
        # or 0 when the profile omits the field.
        for field in ('id', 'name', 'avatar', 'remark', 'agree', 'thanks',
                      'location', 'business', 'gender', 'employment',
                      'education', 'education_extra', 'asks', 'answers',
                      'posts', 'collections', 'logs'):
            zhihu_item[field] = zhihu_item[field][0].encode('utf-8') if zhihu_item[field] else 0

        # The counters come back as full markup; keep only the digits.
        for field in ('following', 'followers', 'topics', 'columns'):
            zhihu_item[field] = filter(str.isdigit, zhihu_item[field].encode('utf-8'))

        if zhihu_item['id']:
            sql = """insert ignore into user (
                         u_id, name, avatar, remark, agree, thanks, location,
                         business, gender, employment, education, education_extra,
                         asks, answers, posts, collections, logs, url, followers,
                         topics, columns)
                     values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                             %s, %s, %s, %s, %s, %s, %s, %s)"""
            resQuery = self.master_db.execute(sql, (
                zhihu_item['id'], zhihu_item['name'], zhihu_item['avatar'],
                zhihu_item['remark'], zhihu_item['agree'], zhihu_item['thanks'],
                zhihu_item['location'], zhihu_item['business'],
                zhihu_item['gender'], zhihu_item['employment'],
                zhihu_item['education'], zhihu_item['education_extra'],
                zhihu_item['asks'], zhihu_item['answers'], zhihu_item['posts'],
                zhihu_item['collections'], zhihu_item['logs'],
                zhihu_item['url'], zhihu_item['followers'],
                zhihu_item['topics'], zhihu_item['columns']))
            sql = """update url set status = 1 where url = %s"""
            self.master_db.execute(sql, (zhihu_item['url'],))
            print 'resQuery ------- ', resQuery
        self.master_db.__delete__()

    def parse_react(self, response):
        body = response.css('#data::attr("data-state")')[0].extract().encode('utf-8')
        state = json.loads(body)
        # The path after "/people/" (29-char prefix) keys the users map;
        # some profiles are keyed under 'null'.
        people = response.url[29:]
        if state['entities']['users'].get(people):
            users = state['entities']['users'][people]
        else:
            users = state['entities']['users']['null']
        zhihu_item = ZhiHuItem()
        zhihu_item['id'] = users['id']
        zhihu_item['name'] = users['name'].encode('utf-8')
        # Swap the small avatar size for the extra-large one.
        zhihu_item['avatar'] = users['avatarUrl'].replace('_is.', '_xl.')
        zhihu_item['remark'] = users['headline'].encode('utf-8') if users['headline'] else 0
        zhihu_item['agree'] = users['voteupCount']
        zhihu_item['thanks'] = users['thankedCount']
        zhihu_item['location'] = users['locations'][0]['name'].encode('utf-8') if users['locations'] else 0
        zhihu_item['business'] = users['business']['name'].encode('utf-8') if users.get('business', False) else 0
        zhihu_item['gender'] = users['gender']
        zhihu_item['employment'] = self.get_employment(users['employments'])
        zhihu_item['education'] = users['educations'][0]['school']['name'].encode('utf-8') \
            if users.get('educations', False) and users['educations'][0].get('school', False) else 0
        zhihu_item['education_extra'] = users['educations'][0]['major']['name'].encode('utf-8') \
            if users.get('educations', False) and users['educations'][0].get('major', False) else 0
        zhihu_item['asks'] = users['questionCount']
        zhihu_item['answers'] = users['answerCount']
        zhihu_item['posts'] = users['articlesCount']
        zhihu_item['collections'] = users['favoriteCount']
        zhihu_item['logs'] = users['logsCount']
        zhihu_item['url'] = response.url[21:]
        zhihu_item['following'] = users['followingCount']
        zhihu_item['followers'] = users['followerCount']
        zhihu_item['lives'] = users['hostedLiveCount']
        zhihu_item['topics'] = users['followingTopicCount']
        zhihu_item['columns'] = users['followingColumnsCount']
        zhihu_item['questions'] = users['followingQuestionCount']
        zhihu_item['weibo'] = users['sinaWeiboUrl'] if users.get('sinaWeiboUrl', False) else 0
        print zhihu_item
        if zhihu_item['id']:
            print '----> db <----'
            sql = """insert ignore into user (
                         u_id, name, avatar, remark, agree, thanks, location,
                         business, gender, employment, education, education_extra,
                         asks, answers, posts, collections, logs, url, following,
                         followers, lives, topics, columns, questions, weibo)
                     values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                             %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
            self.master_db.execute(sql, (
                zhihu_item['id'], zhihu_item['name'], zhihu_item['avatar'],
                zhihu_item['remark'], zhihu_item['agree'], zhihu_item['thanks'],
                zhihu_item['location'], zhihu_item['business'],
                zhihu_item['gender'], zhihu_item['employment'],
                zhihu_item['education'], zhihu_item['education_extra'],
                zhihu_item['asks'], zhihu_item['answers'], zhihu_item['posts'],
                zhihu_item['collections'], zhihu_item['logs'],
                zhihu_item['url'], zhihu_item['following'],
                zhihu_item['followers'], zhihu_item['lives'],
                zhihu_item['topics'], zhihu_item['columns'],
                zhihu_item['questions'], zhihu_item['weibo']))
            sql = """update url set status = 1 where url = %s"""
            self.master_db.execute(sql, (zhihu_item['url'],))
        self.master_db.__delete__()
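# ZhiHuItem is imported from the project's items module but not shown in this
# section. A minimal sketch, declaring exactly the fields that parse_old and
# parse_react assign above:
import scrapy

class ZhiHuItem(scrapy.Item):
    id = scrapy.Field()
    name = scrapy.Field()
    avatar = scrapy.Field()
    remark = scrapy.Field()
    agree = scrapy.Field()
    thanks = scrapy.Field()
    location = scrapy.Field()
    business = scrapy.Field()
    gender = scrapy.Field()
    employment = scrapy.Field()
    education = scrapy.Field()
    education_extra = scrapy.Field()
    asks = scrapy.Field()
    answers = scrapy.Field()
    posts = scrapy.Field()
    collections = scrapy.Field()
    logs = scrapy.Field()
    url = scrapy.Field()
    following = scrapy.Field()
    followers = scrapy.Field()
    lives = scrapy.Field()
    topics = scrapy.Field()
    columns = scrapy.Field()
    questions = scrapy.Field()
    weibo = scrapy.Field()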
class info(scrapy.Spider):
    name = "info"
    base_url = 'http://www.mafengwo.cn'
    allowed_domains = ["www.mafengwo.cn"]
    start_urls = []
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.mafengwo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
        'Referer': 'https://www.mafengwo.cn',
        'Origin': 'https://www.mafengwo.cn',
    }

    def start_requests(self):
        # Override the spider's default entry point: keep pulling unvisited
        # scenic URLs from MySQL and issue a request every 1.5 seconds;
        # on success Scrapy invokes the callback.
        print '--------------------start request------------------'
        while True:
            time.sleep(1.5)
            request_url = self.get_url()
            if request_url:
                print request_url
                yield scrapy.Request(url=request_url,
                                     callback=self.parse_item,
                                     dont_filter=True,
                                     errback=self.errback_httpbin)

    def get_url(self):
        self.master_db = DB(MySQL['db_host'], MySQL['db_port'], MySQL['db_user'],
                            MySQL['db_password'], MySQL['db_dbname'])
        sql = """select * from scenic where status = 0 limit 1"""
        res = self.master_db.query(sql, ())
        if not res:
            self.master_db.__delete__()
            return False
        url = res[0]['url']
        # Claim the row (status = 2) so concurrent runs do not refetch it.
        sql = """update scenic set status = 2 where id = %s"""
        self.master_db.execute(sql, (res[0]['id'],))
        request_url = self.base_url + url
        return request_url

    def errback_httpbin(self, failure):
        url = failure.request.url
        print failure
        # Mark the row failed (status = 3); the 22-char prefix strips
        # "http://www.mafengwo.cn".
        sql = """update scenic set status = 3 where url = %s"""
        self.master_db.execute(sql, (url[22:],))
        self.master_db.__delete__()

    def parse_item(self, response):
        print '++++++++++++++++++++++++++++++++++++++++++'
        item_url = response.url[22:].encode('utf-8')
        item_location = response.css('.mod-location .mhd .sub::text')[0].extract().encode('utf-8')
        item_summary = response.css('.summary')[0].extract().encode('utf-8') if len(response.css('.summary')) else 0
        sql = "insert into scenic_info (url, location, summary) values (%s, %s, %s)"
        resQuery = self.master_db.execute(sql, (item_url, item_location, item_summary))
        self.master_db.execute('update scenic set status = 1 where url = %s', (item_url,))
        self.master_db.__delete__()
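# The queue tables are never defined in this section. A hypothetical bootstrap
# sketch for the Mafengwo side, inferring the status lifecycle from the
# spiders above (0 = pending, 2 = claimed, 1 = done, 3 = failed) and keeping
# only the columns the queries actually touch; types and key choices are
# assumptions:
def bootstrap_schema():
    db = DB(MySQL['db_host'], MySQL['db_port'], MySQL['db_user'],
            MySQL['db_password'], MySQL['db_dbname'])
    db.execute("""create table if not exists scenic (
                      id int auto_increment primary key,
                      cn_name varchar(255),
                      en_name varchar(255),
                      url varchar(255) unique,   -- unique key makes "insert ignore" deduplicate
                      country_id varchar(32),
                      status tinyint default 0
                  )""", ())
    db.execute("""create table if not exists scenic_info (
                      id int auto_increment primary key,
                      url varchar(255),
                      location text,
                      summary text,
                      pic_list text
                  )""", ())
    db.__delete__()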