class XQUserInfoWeiboSpider(Spider):
    start_at = datetime.now()
    name = 'xq_user_info_weibo'
    logger = util.set_logger(name, LOG_FILE_USER_INFO)
    #handle_httpstatus_list = [404]

    def start_requests(self):
        start_url = "https://xueqiu.com/account/oauth/user/show.json?source=sina&userid="
        # get start url from MongoDB
        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))
        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            now_page_n = i
            owner_id = owner_ids[i]
            url = start_url + str(owner_id)
            # progress
            if i % 1000 == 0:
                self.logger.info('%s (%s / %s) %s%%' % (owner_id, str(now_page_n), str(all_page_n),
                                 str(round(float(now_page_n) / all_page_n * 100, 1))))
            #util.get_progress(now_page = i, all_page = all_page_n, logger = self.logger, spider_name = self.name, start_at = self.start_at)
            yield Request(url=url, meta={'user_id': owner_id}, callback=self.parse)

    def parse(self, response):
        try:
            if response.status == 200 and str(response.url) != "https://xueqiu.com/service/captcha":
                body = json.loads(response.body.decode('utf-8'))
                if 'id' in body:
                    item = XQItem()
                    content = {}
                    content['user_id'] = response.meta['user_id']
                    content['weibo_id'] = body['id']
                    item['url'] = response.url
                    item['content'] = content
                    item['fp'] = request_fingerprint(response.request)
                    yield item
            elif str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: UID %s' % (response.meta['user_id']))
        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))
class ArtistSpider(Spider):
    #custom_settings = {'CONCURRENT_REQUESTS', 1}
    name = 'artist_id'
    #allow_domains = ['music.163.com']
    logger = util.set_logger(name, LOG_FILE_ARTIST)

    def start_requests(self):
        ls1 = [1001, 1002, 1003, 2001, 2002, 2003, 6001, 6002, 6003,
               7001, 7002, 7003, 4001, 4002, 4003]  # id
        ls2 = [0, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
               80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90]  # initial
        for i in ls1:
            for j in ls2:
                start_url = ('https://music.163.com/#/discover/artist/cat?id=' +
                             str(i) + '&initial=' + str(j))
                yield Request(url=start_url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        hxs = response.body.decode("utf-8")
        paths = Selector(text=hxs).xpath('//li//a[@class="nm nm-icn f-thide s-fc0"]').extract()
        for path in paths:
            # build a fresh item per artist so earlier results are not overwritten
            item = NetItem()
            ids = Selector(text=path).xpath('//a/@href').extract()[0]
            id = re.search("\?id=(.+)", ids).group(1)
            item['artist_id'] = id
            name = Selector(text=path).xpath('//a/text()').extract()[0]
            item['artist_name'] = name
            yield item
class HotelaahSpider(Spider):
    name = 'CrawlerHotelaah'
    logger = util.set_logger(name, LOG_FILE_POLITICIAN)

    def start_requests(self):
        start_url = 'http://www.hotelaah.com/liaoning/dijishi.html'
        yield Request(url=start_url, callback=self.parse)

    def parse(self, response):
        citylist = response.xpath(
            '//td/a[not(contains(@href, "index")) and not(contains(@href, "dijishi")) and not(contains(@href, "ditu")) and not(contains(@href, "www"))]'
        ).extract()
        for city in citylist:
            item = HotelaahItem()
            city_name = Selector(text=city).xpath('//a/text()').extract()[0]
            item['city_name'] = city_name
            print(item)
            city_url = Selector(text=city).xpath('//a/@href').extract()[0]
            print(city_url)
            yield Request(url='http://www.hotelaah.com/liren/' + 'liaoning' + '_' + city_url,
                          meta={'item': copy.deepcopy(item)},
                          callback=self.parse_mayor)

    def parse_mayor(self, response):
        pass
def __init__(self):
    # set logger
    self.logger = util.set_logger('pipeline', LOG_FILE_PIPELINE)
    # set up the MongoDB server
    self.db = util.set_mongo_server()
    # set up the Redis server
    self.redis_server = util.set_redis_server()
class GubaReplyUserInfo(Spider):
    start_at = datetime.now()
    name = 'guba_reply_user_info'
    logger = util.set_logger(name, LOG_FILE_GUBA_REPLY_USER_INFO)

    def start_requests(self):
        db = util.set_mongo_server()
        reply_author_urls = []
        #replys = list(db.CrawlerGuba.aggregate([{'$project':{'_id': 0, 'reply': 1}} ,{'$unwind': '$reply'}]))
        for url in db.guba_stock_posts.find({}, {'reply.reply_author_url': 1, '_id': 0}):
            if 'reply' in url:
                for e in url['reply']:
                    if 'reply_author_url' in e:
                        reply_author_urls.append(e['reply_author_url'])
        reply_author_urls = list(set(reply_author_urls))
        all_page_n = len(reply_author_urls)
        for i in range(all_page_n):
            reply_author_url = reply_author_urls[i]
            url = reply_author_url
            if i % 1000 == 0:
                self.logger.info('%s / %s' % (str(i), str(all_page_n)))
                util.get_progress(all_page=all_page_n, logger=self.logger,
                                  spider_name=self.name, start_at=self.start_at)
            yield Request(url=url, meta={'reply_author_url': reply_author_url}, callback=self.parse)

    def parse(self, response):
        try:
            if response.status == 200:
                hxs = Selector(response)
                reply_author_url = response.meta['reply_author_url']
                item = GubaItem()
                item['content'] = {}
                reply_author_name = hxs.xpath('//div[@class="taname"]/text()').extract()[0]
                item['content']['reply_author_name'] = reply_author_name.strip()
                sign_up_time = hxs.xpath('//div[@id="influence"]').extract()[0]
                sign_up_time = re.search('999;">\((.+)\)<\/span', sign_up_time).group(1).strip()
                sign_up_time = datetime.strptime(sign_up_time, "%Y-%m-%d")
                item['content']['sign_up_time'] = sign_up_time
                item['content']['reply_author_url'] = reply_author_url
                yield item
        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))
class citspider(Spider):
    name = "cit_info"
    logger = util.set_logger(name, LOG_FILE_CIT)

    def start_requests(self):
        start_url = "http://www.ccdi.gov.cn/special/zyxszt/"
        yield Request(url=start_url, callback=self.parse)

    def parse(self, response):
        inspt_urls = response.xpath('//div[@class="tith2"]//a/@href').extract()
        for inspt_url in inspt_urls:
            inspt_url = "http://www.ccdi.gov.cn/special/zyxszt" + re.sub("\.", "", inspt_url)
            yield Request(url=inspt_url, callback=self.parse_list, meta={'inspt_url': inspt_url})

    def parse_list(self, response):
        post_nums = response.xpath('//div[@class="page"]/script[@type = "text/javascript"]/text()').extract()[0]
        # total number of list pages: first argument of the pagination call; capture one or more digits
        post_num = re.search("\((\d+),.+\)", post_nums).group(1)
        inspt_url = response.meta['inspt_url']
        for i in range(1, int(post_num) + 1):
            if i == 1:
                page_url = inspt_url + "index.html"
            else:
                page_url = inspt_url + "index_" + str(i - 1) + ".html"
            yield Request(url=copy.deepcopy(page_url), callback=self.parse_page,
                          meta={'inspt_url': copy.deepcopy(inspt_url)})
            #print(page_url)

    def parse_page(self, response):
        post_urls = response.xpath('//li[@class="fixed"]//a/@href').extract()
        for post_url in post_urls:
            inspt_url = response.meta['inspt_url']
            post_url = inspt_url + re.search("\.\/(.+)", post_url).group(1)
            yield Request(url=post_url, callback=self.parse_post)

    def parse_post(self, response):
        item = CitItem()
        inspt = response.xpath('//div[@class="fl"]/span').extract()[0]
        inspt = re.search("专题>(.+)<\/span>", inspt).group(1)
        inspt_title = re.search("(.+)>(.+)", inspt).group(1)
        item['inspt_title'] = inspt_title
        inspt_tag = re.search("(.+)>(.+)", inspt).group(2)
        item['inspt_tag'] = inspt_tag
        title = response.xpath('//h2[@class="tit"]/text()').extract()[0].strip()
        item['title'] = title
        time = response.xpath('//em[@class="e2"]/text()').extract()[0].strip()
        time = re.search("发布时间:(.+)", time).group(1).strip()
        item['time'] = time
        content = response.xpath('//p[@align="justify"]/text()').extract()
        item['content'] = content
        yield item
class sodaspider(Spider):
    name = "soda_green"
    logger = util.set_logger(name, LOG_FILE_SODA)

    def start_requests(self):
        start_url = "https://music.163.com/#/artist/album?id=12707&limit=48"
        yield Request(url=start_url, callback=self.parse)

    def parse(self, response):
        print(response)
        pass
class XQUserInfo(Spider):
    start_at = datetime.now()
    name = 'xq_user_cube'
    logger = util.set_logger(name, LOG_FILE_USER_STOCK)
    #handle_httpstatus_list = [404]

    def start_requests(self):
        #start_url = "https://xueqiu.com/stock/portfolio/stocks.json?size=5000&tuid="
        start_url = "https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=10000&category=3&pid=-120&uid="
        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))
        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            now_page_n = i
            owner_id = owner_ids[i]
            url = start_url + str(owner_id)
            # progress
            if i % 1000 == 0:
                self.logger.info('%s (%s / %s) %s%%' % (owner_id, str(now_page_n), str(all_page_n),
                                 str(round(float(now_page_n) / all_page_n * 100, 1))))
            yield Request(url=url, meta={'user_id': owner_id}, callback=self.parse)

    def parse(self, response):
        try:
            if response.status == 200 and str(response.url) != "https://xueqiu.com/service/captcha":
                content = json.loads(response.body.decode('utf-8'))
                item = XQItem()
                content['user_id'] = response.meta['user_id']
                item['url'] = response.url
                item['content'] = content
                item['fp'] = request_fingerprint(response.request)
                yield item
            if str(response.url) == "https://xueqiu.com/service/captcha":
                # the request meta carries 'user_id', not 'owner_id'
                self.logger.error('CAPTURE ERROR: User ID %s' % (response.meta['user_id']))
        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))
class DTRank(Spider):
    name = 'DTRank'
    logger = util.set_logger(name, LOG_FILE_DTRank)

    def start_requests(self):
        page_num = 87
        for i in range(1, page_num + 1):
            urls = 'http://data.10jqka.com.cn/market/jgzy/field/enddate/order/desc/page/' + str(i)
            yield Request(url=urls, callback=self.parse)

    def parse(self, response):
        paths = response.xpath('//tbody/tr').extract()
        for path in paths:
            # build a fresh item per table row so earlier rows are not overwritten
            item = DTRankItem()
            date = Selector(text=path).xpath('//td[@class = "tc cur"]/text()').extract()[0]
            item['date'] = date
            stock_symbol = Selector(text=path).xpath(
                '//td[@class="tc"][position() = 1]/a[@target = "_blank"]/text()').extract()[0]
            item['stock_symbol'] = stock_symbol
            stock_name = Selector(text=path).xpath(
                '//td[@class="tc"][position() = 2]/a[@target = "_blank"]/text()').extract()[0]
            # stock_name = stock_name.decode("UTF-8")
            item['stock_name'] = stock_name
            buy_inst_num = Selector(text=path).xpath('//tr/td[@class="c-rise "]/text()').extract()[0]
            item['buy_inst_num'] = buy_inst_num
            sell_inst_num = Selector(text=path).xpath('//tr/td[@class="c-fall "]/text()').extract()[0]
            item['sell_inst_num'] = sell_inst_num
            rank_reason = Selector(text=path).xpath('//tr/td[@class = "tl "]/text()').extract()[0]
            # rank_reason = rank_reason.decode("UTF-8")
            item['rank_reason'] = rank_reason
            yield item
class PoliticianSpider(Spider):
    name = 'CrawlerPolitician'
    logger = util.set_logger(name, LOG_FILE_POLITICIAN)

    def start_requests(self):
        # read the list of politician ids and close the file afterwards
        with open("C:/Code/Testing/scrt.csv") as Polist:
            lines = set(Polist.readlines())
        start_url = "http://www.chinavitae.com/biography/"
        for line in lines:
            line = line.rstrip('\n')
            yield Request(url=start_url + line + "/career", callback=self.parse)

    def parse(self, response):
        hxs = Selector(response)
        item = PoliticianItem()
        name = hxs.xpath('//div[@class="bioName"]/text()').extract()[0].strip()
        item['name'] = name
        try:
            # borndate
            birth = response.xpath('//div[@class="bioDetails"]//text()').extract()
            if birth:
                birth = ' '.join(birth).strip()
                borndate = re.findall('\d+', birth)[0]
                item['born'] = borndate
        except Exception as ex:
            print("With no born:" + response.url)
        careers = hxs.xpath('//tr[@valign="top"]').extract()
        for career in careers:
            duration = re.search('<td width="90" class="cdCell">(.+)<\/td>', career)
            if duration:
                duration = re.sub("—", "-", duration.group(1))
                item['duration'] = duration
            occupation = re.search('<strong>(.+)<\/strong>', career)
            if occupation:
                item['occupation'] = occupation.group(1).strip()
            branches = Selector(text=career).xpath(
                '//a[contains(@class,"link11")]/text()').extract()[0].strip()
            if branches:
                item['branch'] = branches
            yield item
class XQCubeRBSpider(Spider):
    start_at = datetime.now()
    name = 'xq_cube_rb'
    logger = util.set_logger(name, LOG_FILE_CUBE_RB)
    handle_httpstatus_list = [400]
    cube_type = 'SP'
    # time of the last maintenance run; update it before each crawl
    start_time = time.strptime("2020-01-01", "%Y-%m-%d")

    def start_requests(self):
        zh_url = 'https://xueqiu.com/cubes/rebalancing/history.json?count=50&page=1&cube_symbol='
        sp_url = 'https://xueqiu.com/service/tc/snowx/PAMID/cubes/rebalancing/history?count=20&page=1&cube_symbol='
        # get start url from MongoDB
        db = util.set_mongo_server()
        symbols = []
        for s in db.xq_cube_info.find({'cube_type': self.cube_type}, {'symbol': 1, '_id': 0}):
            symbols.append(s['symbol'])
        symbols = list(set(symbols))
        # iterate each symbol
        all_page_n = len(symbols)
        for i in range(all_page_n):
            symbol = symbols[i].strip()
            now_page_n = i
            if self.cube_type == 'SP':
                url = sp_url + symbol
            elif self.cube_type == 'ZH':
                url = zh_url + symbol
            # progress
            if i % 500 == 0:
                self.logger.info('%s (%s / %s) %s%%' % (symbol, str(now_page_n), str(all_page_n),
                                 str(round(float(now_page_n) / all_page_n * 100, 1))))
            yield Request(url=url, callback=self.parse,
                          meta={'cube_type': self.cube_type, 'symbol': symbol, 'page': 1})

    def parse(self, response):
        try:
            if response.status == 200 and str(response.url) != "https://xueqiu.com/service/captcha":
                cube_type = response.meta['cube_type']
                symbol = response.meta['symbol']
                page = response.meta['page']
                body = re.sub('[\s]', '', response.body.decode('utf-8'))
                body = json.loads(body)
                if body['maxPage']:
                    max_page = body['maxPage']
                if body['list']:
                    page_first_time = body['list'][0]['updated_at']
                    page_first_time = time.gmtime(page_first_time / 1000)
                    if page_first_time < self.start_time:
                        return
                    else:
                        for i in body['list']:
                            item = XQItem()
                            # i is of type dict
                            i['cube_symbol'] = symbol
                            i['cube_type'] = cube_type
                            item['url'] = response.url
                            item['content'] = i
                            item['fp'] = request_fingerprint(response.request)
                            yield item
                        # Second + page
                        if page < max_page:
                            page = page + 1
                            page_string = '&page=' + str(page)
                            url = re.sub(r'&page=(\d+)', page_string, response.url)
                            yield Request(url=url,
                                          meta={'cube_type': cube_type, 'symbol': symbol, 'page': page},
                                          callback=self.parse)
            elif str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: %s' % (response.meta['symbol']))
        except Exception as ex:
            self.logger.error('Parse Exception: %s %s' % (str(ex), response.url))
class XQUserFensi(Spider):
    start_at = datetime.now()
    name = 'xq_user_fans'
    logger = util.set_logger(name, LOG_FILE_USER_FENSI)
    #handle_httpstatus_list = [404]
    cube_type = 'SP'

    def start_requests(self):
        start_url = "http://xueqiu.com/friendships/followers.json?size=1000&uid="
        # get start url from MongoDB
        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({'cube_type': self.cube_type}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))
        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            now_page_n = i
            owner_id = owner_ids[i]
            url = start_url + str(owner_id)
            # progress
            if i % 1000 == 0:
                self.logger.info('%s (%s / %s) %s%%' % (owner_id, str(now_page_n), str(all_page_n),
                                 str(round(float(now_page_n) / all_page_n * 100, 1))))
            #util.get_progress(now_page = i, all_page = all_page_n, logger = self.logger, spider_name = self.name, start_at = self.start_at)
            yield Request(url=url, meta={'user_id': owner_id}, callback=self.parse)

    def parse(self, response):
        try:
            if response.status == 200 and str(response.url) != "https://xueqiu.com/service/captcha":
                content = json.loads(response.body.decode('utf-8'))
                if content['maxPage']:
                    max_page = content['maxPage']
                    # First page, use parse_gz
                    for item in self.parse_gz(response=response):
                        yield item
                    # Second + page, use parse_gz
                    if max_page > 1:
                        for i in range(2, max_page + 1):
                            url = response.url + '&pageNo=' + str(i)
                            yield Request(url=url, meta={'user_id': response.meta['user_id']},
                                          callback=self.parse_gz)
            if str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: User ID %s' % (response.meta['user_id']))
        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))

    def parse_gz(self, response):
        try:
            body = json.loads(response.body.decode('utf-8'))
            content = {}
            content['user_id'] = response.meta['user_id']
            content['count'] = body['count']
            content['anonymous_count'] = body['anonymous_count']
            users = []
            for user in body['followers']:
                users.append(user['id'])
            content['fans'] = users
            content['lastcrawl'] = int(time.time())
            item = XQItem()
            item['url'] = response.url
            item['content'] = content
            item['fp'] = request_fingerprint(response.request)
            yield item
        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))
class GBFuture(Spider):
    name = 'GBFuture'
    logger = util.set_logger(name, LOG_FILE_GBFuture)

    def start_requests(self):
        start_url = "http://guba.eastmoney.com/remenba.aspx?type=2"
        yield Request(url=start_url, callback=self.parse)

    def parse(self, response):
        paths = response.xpath('//div[@class = "gbboxb"]//li/a').extract()
        paths = set(paths)
        # print(paths)
        for path in paths:
            item = MntItem()
            item['content'] = {}
            semi_url = Selector(text=path).xpath('//a/@href').extract()[0]
            semi_url = re.match('(.+)\.', semi_url).group(0)
            semi_url = re.sub('\.', ',', semi_url)
            bar_name = Selector(text=path).xpath('//a/text()').extract()[0]
            item['content']['bar_name'] = bar_name
            url = 'http://guba.eastmoney.com/' + semi_url + 'f.html'
            yield Request(url=url, callback=self.parse_page,
                          meta={'page': 1,
                                'semi_url': copy.deepcopy(semi_url),
                                'item': copy.deepcopy(item)})

    def parse_page(self, response):
        # work out how many pages need to be crawled
        page = response.meta['page']
        item = response.meta['item']
        semi_url = response.meta['semi_url']
        dates = response.xpath(
            '//div[contains(@class, "normal_post")]//span[contains(@class, "l5")]/text()').extract()
        md = date.today().strftime("%m-%d")
        tag_date = []
        # take the month-day prefix ("MM-DD") of each post time
        for d in dates:
            tag_date.append(d[:5])
        tag_date = set(tag_date)
        if md in tag_date:
            yield Request(url='http://guba.eastmoney.com/' + semi_url + 'f_' + str(page + 1) + '.html',
                          callback=self.parse_page,
                          dont_filter=True,
                          meta={'page': page + 1, 'item': item, 'semi_url': semi_url})
        elif md not in tag_date:
            for i in range(1, page):
                yield Request(url='http://guba.eastmoney.com/' + semi_url + 'f_' + str(i) + '.html',
                              callback=self.parse_post,
                              dont_filter=True,
                              meta={'item': item, 'md': md})

    def parse_post(self, response):
        item = response.meta['item']
        md = response.meta['md']
        hxs = response.xpath('//div[contains(@class, "normal_post")]').extract()
        for hx in hxs:
            post_date = Selector(text=hx).xpath('//span[contains(@class, "l5")]/text()').extract()[0]
            post_tag_date = post_date[:5]
            if post_tag_date == md:
                try:
                    post_date = Selector(text=hx).xpath('//span[contains(@class, "l5")]/text()').extract()[0]
                    post_date = str(date.today().year) + "-" + post_date
                    post_date = datetime.strptime(post_date, "%Y-%m-%d %H:%M")
                    item['content']['post_date'] = post_date
                except:
                    pass
                try:
                    post_title = Selector(text=hx).xpath('//span[contains(@class, "l3")]/a/@title').extract()[0]
                    item['content']['post_title'] = post_title
                except:
                    pass
                try:
                    reply_num = Selector(text=hx).xpath('//span[contains(@class, "l2")]/text()').extract()[0]
                    item['content']['reply_num'] = reply_num
                except:
                    pass
                try:
                    read_num = Selector(text=hx).xpath('//span[contains(@class, "l1")]/text()').extract()[0]
                    item['content']['read_num'] = read_num
                except:
                    pass
                try:
                    post_author = Selector(text=hx).xpath(
                        '//span[contains(@class, "l4")]/a[@target = "_blank"]/font/text()').extract()[0]
                    item['content']['post_author'] = post_author
                except:
                    pass
                try:
                    author_title = Selector(text=hx).xpath(
                        '//span[contains(@class, "l4")]/a/em/@title').extract()[0]
                    item['content']['author_title'] = author_title
                except:
                    pass
                yield item
# -*- coding: utf-8 -*-
import random
import redis
import time
import json
import base64

from crawler.settings import *
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from twisted.web._newclient import ResponseNeverReceived
from twisted.python.failure import Failure
from twisted.internet.error import TimeoutError, ConnectionRefusedError, ConnectError, TCPTimedOutError, ConnectionDone
from datetime import datetime
from scrapy import signals
from crawler.spiders import util

logger = util.set_logger("http_proxy_middleware", LOG_FILE_MIDDLEWARE)


class RandomRequestHeaders(object):
    """Randomly rotate user agents based on a list of predefined ones"""

    def __init__(self, agents, cookies):
        self.agents = agents
        self.cookies = cookies

    @classmethod
    def from_crawler(cls, crawler):
        ua = crawler.settings.getlist('USER_AGENTS')
        ck = crawler.settings.getlist('COOKIES')
        return cls(ua, ck)

    def process_request(self, request, spider):
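        # NOTE: the body of process_request is missing from the source. The lines below are an
        # assumed minimal sketch consistent with the class docstring: rotate the User-Agent
        # (and optionally the Cookie) header per request from the lists loaded in from_crawler.
        if self.agents:
            request.headers['User-Agent'] = random.choice(self.agents)
        if self.cookies:
            # COOKIES entries are assumed here to be header-ready strings
            request.headers['Cookie'] = random.choice(self.cookies)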
class GBFuture(Spider):
    name = 'GBFuturetest'
    logger = util.set_logger(name, LOG_FILE_GBFuture)

    def start_requests(self):
        start_url = "http://guba.eastmoney.com/list,rb,f_1.html"
        item = MntItem()
        item['content'] = {}
        item['content']['bar_name'] = "螺纹钢吧"
        yield Request(url=start_url, callback=self.parse, dont_filter=True,
                      meta={'page': 1, 'item': item})

    def parse(self, response):
        # work out how many pages need to be crawled
        page = response.meta['page']
        item = response.meta['item']
        dates = response.xpath(
            '//div[contains(@class, "normal_post")]//span[contains(@class, "l5")]/text()').extract()
        md = date.today().strftime("%m-%d")
        tag_date = []
        # take the month-day prefix ("MM-DD") of each post time
        for d in dates:
            tag_date.append(d[:5])
        tag_date = set(tag_date)
        if md in tag_date:
            yield Request(url='http://guba.eastmoney.com/list,rb,f_' + str(page + 1) + '.html',
                          callback=self.parse,
                          dont_filter=True,
                          meta={'page': page + 1, 'item': item})
        elif md not in tag_date:
            for i in range(1, page):
                yield Request(url='http://guba.eastmoney.com/list,rb,f_' + str(i) + '.html',
                              callback=self.parse_page,
                              dont_filter=True,
                              meta={'item': item, 'md': md})

    def parse_page(self, response):
        item = response.meta['item']
        md = response.meta['md']
        hxs = response.xpath('//div[contains(@class, "normal_post")]').extract()
        for hx in hxs:
            post_date = Selector(text=hx).xpath('//span[contains(@class, "l5")]/text()').extract()[0]
            post_tag_date = post_date[:5]
            if post_tag_date == md:
                try:
                    post_date = Selector(text=hx).xpath('//span[contains(@class, "l5")]/text()').extract()[0]
                    post_date = str(date.today().year) + "-" + post_date
                    post_date = datetime.strptime(post_date, "%Y-%m-%d %H:%M")
                    item['content']['post_date'] = post_date
                except:
                    pass
                try:
                    post_title = Selector(text=hx).xpath('//span[contains(@class, "l3")]/a/@title').extract()[0]
                    item['content']['post_title'] = post_title
                except:
                    pass
                try:
                    reply_num = Selector(text=hx).xpath('//span[contains(@class, "l2")]/text()').extract()[0]
                    item['content']['reply_num'] = reply_num
                except:
                    pass
                try:
                    read_num = Selector(text=hx).xpath('//span[contains(@class, "l1")]/text()').extract()[0]
                    item['content']['read_num'] = read_num
                except:
                    pass
                try:
                    post_author = Selector(text=hx).xpath(
                        '//span[contains(@class, "l4")]/a[@target = "_blank"]/font/text()').extract()[0]
                    item['content']['post_author'] = post_author
                except:
                    pass
                try:
                    author_title = Selector(text=hx).xpath(
                        '//span[contains(@class, "l4")]/a/em/@title').extract()[0]
                    item['content']['author_title'] = author_title
                except:
                    pass
                yield item
class ChinaVitaeSpider(Spider):
    name = 'CrawlerChinaVitae'
    logger = util.set_logger(name, LOG_FILE_CHINAVITAE)

    def start_requests(self):
        start_url = 'http://www.chinavitae.com/biography_browse.php?l='
        for i in range(97, 123):
            start_urls = start_url + chr(i)
            yield Request(url=start_urls, callback=self.parse)

    # crawl every name and url of the vitae
    def parse(self, response):
        hxs = Selector(response)
        names = hxs.xpath('//a[@class="link11"]').extract()
        for name in names:
            # build a fresh item per person so careers do not accumulate across biographies
            item = ChinaVitaeItem()
            item['content'] = {}
            item['content']['careers'] = []
            name_pinyin = Selector(text=name).xpath('//a[@class="link11"]/text()').extract()[0]
            item['content']['name_pinyin'] = name_pinyin
            name_url = Selector(text=name).xpath('//a/@href').extract()[0]
            name_url = "http://www.chinavitae.com" + name_url + "/full"
            yield Request(url=name_url, meta={'item': item}, callback=self.parse_biog)

    def parse_biog(self, response):
        item = response.meta['item']
        try:
            # some biographies have no Chinese name
            name = response.xpath(
                '//span[@style="font-family:Courier New, Courier, mono;"]/text()').extract()[0]
            item['content']['name'] = name
        except Exception as ex:
            print("With no Chinese name: " + response.url)
        try:
            # title of the biography
            biotitle = response.xpath('//div[@class="bioTitle"]/text()').extract()[0]
            item['content']['biotitle'] = biotitle
        except Exception as ex:
            print("With no biotitle: " + response.url)
        try:
            # whole biography paragraph
            bigph = response.xpath('//div[@id="dataPanel"]/p').extract()[0].strip()
            bigph = re.sub('\r\n', ' ', bigph)
            bigph = re.sub('<br>', '', bigph)
            bigph = re.search('<p>(.+)<\/p>', bigph).group(1)
            item['content']['biography'] = bigph
        except Exception as ex:
            print("With no bigph:" + response.url)
        try:
            # borndate
            birth = response.xpath('//div[@class="bioDetails"]//text()').extract()
            if birth:
                birth = ' '.join(birth).strip()
                borndate = re.findall('\d+', birth)[0]
                item['content']['borndate'] = borndate
        except Exception as ex:
            print("With no borndate:" + response.url)
        try:
            # birthplace
            birth = response.xpath('//div[@class="bioDetails"]//text()').extract()
            if birth:
                birth = ' '.join(birth).strip()
                birthplace = re.search('Birthplace:(.+)', birth).group(1)
                birthplace = re.sub(' ', '', birthplace)
                item['content']['birthplace'] = birthplace
        except Exception as ex:
            print("With no birthplace:" + response.url)
        try:
            # careers
            careers = response.xpath('//tr[@valign="top"]').extract()
            career = {}
            for c in careers:
                duration = re.search('<td width="90" class="cdCell">(.+)<\/td>', c)
                if duration:
                    duration = re.sub("—", "-", duration.group(1))
                    career['duration'] = duration
                occupation = re.search('<strong>(.+)<\/strong>', c)
                if occupation:
                    career['occupation'] = occupation.group(1)
                branch = Selector(text=c).xpath('//a[contains(@class,"link11")]/text()').extract()
                if branch:
                    career['branch'] = branch
                item['content']['careers'].append(copy.deepcopy(career))
        except Exception as ex:
            print("With no careers:" + response.url)
        yield item
class XQUserStatus(Spider):
    start_at = datetime.now()
    name = 'xq_user_cmt'
    logger = util.set_logger(name, LOG_FILE_USER_STATUS)
    #handle_httpstatus_list = [404]

    def start_requests(self):
        start_url = "https://xueqiu.com/v4/statuses/user_timeline.json?&count=20&user_id="
        ## get start url from MongoDB
        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))
        #owner_ids = ["1001223822"]
        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            owner_id = owner_ids[i]
            now_page_n = i
            url = start_url + str(owner_id)
            # progress
            if i % 1000 == 0:
                self.logger.info('%s (%s / %s) %s%%' % (owner_id, str(now_page_n), str(all_page_n),
                                 str(round(float(now_page_n) / all_page_n * 100, 1))))
            #util.get_progress(all_page = all_page_n, logger = self.logger, spider_name = self.name, start_at = self.start_at)
            yield Request(url=url, meta={'user_id': owner_id}, callback=self.parse)

    def parse(self, response):
        try:
            if response.status == 200 and str(response.url) != "https://xueqiu.com/service/captcha":
                body = json.loads(response.body.decode('utf-8'))
                if body['maxPage']:
                    max_page = body['maxPage']
                    page = body['page']
                    # First page
                    if page == 1:
                        content = {}
                        content['user_id'] = response.meta['user_id']
                        content['statuses'] = body['statuses']
                        content['total'] = body['total']
                        content['max_page'] = body['maxPage']
                        content['page'] = body['page']
                        item = XQItem()
                        item['content'] = content
                        yield item
                    # Second + page
                    if max_page > 1:
                        for i in range(2, max_page + 1):
                            url = response.url + '&page=' + str(i)
                            yield Request(url=url, meta={'user_id': response.meta['user_id']},
                                          callback=self.parse_status)
            elif str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: User ID %s' % (response.meta['user_id']))
        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))

    def parse_status(self, response):
        try:
            body = json.loads(response.body.decode('utf-8'))
            content = {}
            content['user_id'] = response.meta['user_id']
            content['statuses'] = body['statuses']
            content['total'] = body['total']
            content['max_page'] = body['maxPage']
            content['page'] = body['page']
            item = XQItem()
            item['content'] = content
            item['fp'] = request_fingerprint(response.request)
            yield item
        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))
class XQCubeInfoSpider(Spider):
    start_at = datetime.now()
    name = 'xq_cube_info'
    logger = util.set_logger(name, LOG_FILE_CUBE_INFO)
    handle_httpstatus_list = [404]
    website_possible_httpstatus_list = [404]
    cube_type = 'SP'

    def start_requests(self):
        start_url = "https://xueqiu.com/p/"
        # For ZH, scan from 100 up to 2,000,000 (XQ-1803: 1,320,315 ZH cubes; 1,354,210 (33,895) SP cubes)
        # For SP, scan from 1,000,000 to 1,100,000
        start_page = 1100000
        end_page = 1500000
        # iterate each page
        all_page_n = end_page - start_page + 1
        for i in range(start_page, end_page):
            now_page_n = i - start_page
            if self.cube_type == 'ZH':
                if i <= 999999:
                    symbol = "ZH" + str(i).zfill(6)
                    url = start_url + symbol
                elif i >= 1000000:
                    symbol = "ZH" + str(i).zfill(7)
                    url = start_url + symbol
            elif self.cube_type == 'SP':
                symbol = "SP" + str(i).zfill(7)
                url = start_url + symbol
            # custom progress log
            if i % 500 == 0:
                self.logger.info('%s (%s / %s) %s%%' % (symbol, str(now_page_n), str(all_page_n),
                                 str(round(float(now_page_n) / all_page_n * 100, 1))))
            #util.get_progress(all_page = all_page_n, logger = self.logger, spider_name = self.name, start_at = self.start_at)
            yield Request(url=url, callback=self.parse, meta={'cube_type': self.cube_type})

    def parse(self, response):
        try:
            #print(response.url)
            #print(response.status)
            if response.status == 200 and str(response.url) != "https://xueqiu.com/service/captcha":
                item = XQItem()
                hxs = Selector(response)
                info_script = ''.join(hxs.xpath('//script[contains(., "cubeInfo")]//text()').extract())
                info_script = re.sub("[\s]", "", info_script)
                m = re.search("SNB.cubeInfo=({\S+?});SNB.cube", info_script)
                if m:
                    content = json.loads(m.group(1).strip())
                    content['lastcrawl'] = int(time.time())
                    content['cube_type'] = response.meta['cube_type']
                    item['content'] = content
                    item['fp'] = request_fingerprint(response.request)
                    item['url'] = response.url
                    yield item
            # A 404 without a captcha means the cube symbol does not exist; still write the url
            # fingerprint to redis so it is not crawled again next time.
            elif response.status == 404 and str(response.url) != "https://xueqiu.com/service/captcha":
                item = XQItem()
                item['fp'] = request_fingerprint(response.request)
                item['url'] = response.url
                yield item
                #self.logger.warn('404: %s' % (str(response.url)))
            elif str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: %s' % (response.url))
        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))
class SinaNewsSpider(Spider): name = "CrawlerSinaNews" logger = util.set_logger(name, LOG_FILE_SINANEWS) handle_httpstatus_list = [404] def start_requests(self): # 新闻类别 col 取值 (90:国内,91:国际,92:社会,94:体育,95:娱乐,93:军事,96:科技,97:财经,98:股市,99:美股) col = "90" start_url = "http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=%s&num=5010&date=" % (col) start_date = datetime.strptime("2010-01-01", "%Y-%m-%d").date() end_date = datetime.strptime("2018-09-01", "%Y-%m-%d").date() url_date = [] s_d = start_date c_d = s_d.strftime("%Y-%m-%d") url_date.append(c_d) while s_d < end_date: s_d = s_d + timedelta(days=1) c_d = s_d.strftime("%Y-%m-%d") url_date.append(c_d) for i in range(len(url_date)): url = start_url + url_date[i] # 每抓取15天print log if i%15 == 0: self.logger.info("Now crawl: " + url_date[i]) yield Request(url = url, callback = self.parse) def parse(self, response): item = SinaNewsItem() body = re.sub("[\s]", "", response.body.decode("gbk")) m = re.search("varjsonData=({\S+?});", body) if m: js = demjson.decode(m.group(1).strip()) for i in js['list']: item['content'] = i if item['content']['time']: item['content']['time'] = datetime.fromtimestamp(int(item['content']['time'])) url = i['url'] yield Request(url = url, callback = self.parse_content, meta = {'item' : item}) # 抓取正文 def parse_content(self, response): if response.status == 200: # 不抓取video、blog 、guba 以及 weibo 子域名 if re.search("://.*video|://blog|://passport.weibo|://guba|://slide|://survey", response.url): return item = response.meta['item'] # 老的页面可能用gbk/gb2312编码,新的页面一般用utf8编码,因此两种编码都要试一下 try: filter_body = response.body.decode('utf8') except: try: filter_body = response.body.decode("gbk") except: try: filter_body = response.body.decode("gb2312") except Exception as ex: print("Decode webpage failed: " + response.url) return filter_body = re.sub('<[A-Z]+[0-9]*[^>]*>|</[A-Z]+[^>]*>', '', filter_body) response = response.replace(body = filter_body) hxs =Selector(response) # parse news_id # news_id只可能来源于name="publishid",如果不存在,则放弃该条新闻 news_id = hxs.xpath('//head/*[@name="publishid"]/@content').extract() if news_id: item['content']['news_id'] = news_id[0] else: self.logger.info("No 'news_id'! 
Skip: %s" % (response.url)) return # parse cmt_id # cmt_id可能来源于name="comment"(较新的网页),也可能来源于对html的正则解析(较旧的网页) cid = hxs.xpath('//head/*[@name="comment"]/@content').extract() if cid: # 新网页主要是这种格式 d = cid[0].split(":") cmt_id = {"channel":d[0], "comment_id":d[1]} item['content']['cmt_id'] = cmt_id #print("cmt_id 1") else: # 旧网页主要是这种格式 filter_body = re.sub("[\s]", "", filter_body) m = re.search('''channel:["'](.+?)["'],.*newsid:["'](.+?)['"]''', filter_body) if m: cmt_id = {"channel":m.group(1), "comment_id":m.group(2)} item['content']['cmt_id'] = cmt_id #print("cmt_id 2") else: # 个别特例 m = re.search('channel=(.+?)&newsid', filter_body) if m: cmt_id = {"channel":m.group(1), "comment_id":item['content']['news_id']} item['content']['cmt_id'] = cmt_id #print("cmt_id 3") else: self.logger.info("No 'cmt_id' found: %s" % (response.url)) # keywords / tag key_words = hxs.xpath('//head/*[@name = "keywords"]/@content').extract() if key_words: item['content']['keywords'] = key_words[0] tags = hxs.xpath('//head/*[@name = "tags"]/@content').extract() if tags: item['content']['tags'] = tags[0] # article create / update / publish time create = hxs.xpath('//head/*[@name = "weibo: article:create_at"]/@content').extract() if create: item['content']['news_create_time'] = create[0] update = hxs.xpath('//head/*[@name = "weibo: article:update_at"]/@content').extract() if update: item['content']['news_update_time'] = update[0] publish = hxs.xpath('//head/*[@property = "article:published_time"]/@content').extract() if publish: item['content']['news_publish_time'] = publish[0] # parse content content = hxs.xpath('//*[@id="artibody"]/p/text()').extract() if content: item['content']['content'] = "\n".join(content) item['url'] = response.url # parse source / author source = hxs.xpath('//head/*[@name="mediaid"]/@content').extract() if source: item['content']['source'] = source[0] author = hxs.xpath('//head/*[@property="article:author"]/@content').extract() if author: item['content']['author'] = author[0] # parse reply # cmt_id 包含了新闻的id和channel,用于生成reply_url if "cmt_id" in item['content']: reply_url_stat = "http://comment5.news.sina.com.cn/page/info?version=1&format=json&compress=1&ie=utf-8&oe=utf-8&page=1&page_size=20&channel=" + item['content']['cmt_id']['channel'] + "&newsid=" + item['content']['cmt_id']['comment_id'] reply_url = "http://comment5.news.sina.com.cn/page/info?version=1&format=json&compress=1&ie=utf-8&oe=utf-8&page_size=100&channel=" + item['content']['cmt_id']['channel'] + "&newsid=" + item['content']['cmt_id']['comment_id'] + "&page=" yield Request(url = reply_url_stat, meta = {'item':item, 'cmt_url':reply_url}, callback = self.parse_reply) # 如果解析不出comment reply,那么就不抓reply else: yield item elif response.status == 404: self.logger.error("Page 404: %s" % (response.url)) return # parse_reply并不解析回复正文,只用来确定总回复数replynum,总翻页数rptotal等。解析回复正文在parse_reply_json def parse_reply(self, response): d_json = json.loads(response.body.decode('utf8')) item = response.meta['item'] cmt_url = response.meta['cmt_url'] try: reply = {} if d_json['result']: # 存在回复的情况 if 'count' in d_json['result']: reply['replynum'] = int(d_json['result']['count']['show']) reply['hotness'] = int(d_json['result']['count']['total']) reply['qreply'] = int(d_json['result']['count']['qreply']) # 并不知道qreply是什么 item['content']['reply'] = reply # 确定需要翻页数 rptotal = 0 if reply['replynum']%100 == 0: rptotal = reply['replynum']/100 else: rptotal = int(reply['replynum']/100) + 1 if rptotal > 0: yield Request(url = cmt_url + str(1), meta = {'item':item, 
'rptotal':rptotal, 'page':1, 'cmt_url':cmt_url},callback = self.parse_reply_json) else: yield item # 不存在回复,直接返回item except Exception as ex: yield item # parse_reply_json才真正用来解析回复正文 def parse_reply_json(self, response): item = response.meta['item'] cmt_url = response.meta['cmt_url'] page = response.meta['page'] rptotal = response.meta['rptotal'] d_json = json.loads(response.body.decode('utf8')) if d_json['result']: if 'cmntlist' in d_json['result']: # 如果reply_content是空的,说明page=1,直接赋值;否则说明page>1,使用extend方法 if "reply_content" in item['content']['reply']: item['content']['reply']['reply_content'].extend(d_json['result']['cmntlist']) else: item['content']['reply']['reply_content'] = d_json['result']['cmntlist'] # page为当前所处页面,直到page和总页数相等才停止抓取 if page == rptotal: yield item elif page < rptotal: yield Request(url = cmt_url + str(page+1), meta = {'item':item, 'rptotal':rptotal, 'cmt_url':response.meta['cmt_url'], 'page':page+1}, callback = self.parse_reply_json)
class XQUserStatus(Spider):
    start_at = datetime.now()
    name = 'xq_user_cmt'
    logger = util.set_logger(name, LOG_FILE_USER_STATUS)
    #handle_httpstatus_list = [404]
    # time of the last maintenance run; update it before each crawl
    start_time = time.strptime("2020-01-01", "%Y-%m-%d")

    def start_requests(self):
        # Xueqiu shows at most 20 comments per page
        start_url = "https://xueqiu.com/v4/statuses/user_timeline.json?&count=20&page=1&user_id="
        ## get start url from MongoDB
        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))
        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            owner_id = owner_ids[i]
            now_page_n = i
            url = start_url + str(owner_id)
            # progress
            if i % 1000 == 0:
                self.logger.info('%s (%s / %s) %s%%' % (owner_id, str(now_page_n), str(all_page_n),
                                 str(round(float(now_page_n) / all_page_n * 100, 1))))
            yield Request(url=url, meta={'user_id': owner_id}, callback=self.parse)

    def parse(self, response):
        try:
            if response.status == 200 and str(response.url) != "https://xueqiu.com/service/captcha":
                body = json.loads(response.body.decode('utf-8'))
                if body['maxPage']:
                    max_page = body['maxPage']
                    page = body['page']
                if body['statuses']:
                    page_first_time = body['statuses'][0]['created_at']
                    page_first_time = time.gmtime(page_first_time / 1000)
                    if page_first_time < self.start_time:
                        return
                    content = {}
                    content['user_id'] = response.meta['user_id']
                    content['statuses'] = body['statuses']
                    content['total'] = body['total']
                    content['max_page'] = body['maxPage']
                    content['page'] = body['page']
                    item = XQItem()
                    item['content'] = content
                    yield item
                    # Second + page
                    if page < max_page:
                        page = page + 1
                        page_string = '&page=' + str(page)
                        url = re.sub(r'&page=(\d+)', page_string, response.url)
                        yield Request(url=url, meta={'user_id': response.meta['user_id']},
                                      callback=self.parse)
            elif str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: User ID %s' % (response.meta['user_id']))
        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))
class VippearSpider(Spider):
    name = 'CrawlerVippear'
    logger = util.set_logger(name, LOG_FILE_VIPPEAR)

    def start_requests(self):
        start_url = 'http://www.chinavitae.com/vip/index.php?mode=officials&map=show&type=cv'
        yield Request(url=start_url, callback=self.parse)

    def parse(self, response):
        names = response.xpath('//td[@align="left"]//a[@class="link12"]').extract()
        for name in names:
            item = ChinaVitaeItem()
            item['content'] = {}
            name_pinyin = Selector(text=name).xpath('//a[@class="link12"]/text()').extract()[0]
            item['content']['name_pinyin'] = name_pinyin
            name_url = Selector(text=name).xpath('//a/@href').extract()[0]
            res_url = 'http://www.chinavitae.com/vip/' + name_url
            yield Request(url=res_url, meta={'url': res_url, 'item': item}, callback=self.parse_year)

    def parse_year(self, response):
        year = response.xpath('//p/a/text()').extract()
        year = ' '.join(year).strip()
        years = re.findall('\d{4}', year)
        item = response.meta['item']
        for y in years:
            res_url = response.meta['url']
            yield Request(url=res_url + '&filter_year=' + y, meta={'item': item}, callback=self.parse_list)

    def parse_list(self, response):
        urls = response.xpath('//div[@class="link12b"]/a/@href').extract()
        item = response.meta['item']
        for url in urls:
            vippurl = 'http://www.chinavitae.com' + url
            yield Request(url=vippurl, meta={'item': item}, callback=self.parse_vipp)

    def parse_vipp(self, response):
        item = response.meta['item']
        acti = response.xpath('//html//tr[2]/td[2]').extract()[0]
        acti = re.sub('\r\n', '', acti)
        acti = re.search('td>(.+)<\/td', acti).group(1)
        item['content']['activity'] = acti
        infos = response.xpath('//*[contains(@class, "link12")]//text()').extract()
        infos = ','.join(infos).strip()
        infos = re.sub('\n', '', infos)
        infos = re.sub('\t', '', infos)
        date = re.search('Date: ,(.+),Activity', infos).group(1).strip()
        item['content']['date'] = date
        try:
            location = re.search('Location: ,(.+),Attendees', infos).group(1).strip()
            item['content']['location'] = location
        except Exception as ex:
            print("With no Location: " + response.url)
        try:
            attendees = re.search('Attendees: ,(.+),Source', infos).group(1).strip()
            item['content']['attendees'] = attendees
        except:
            try:
                attendees = re.search('Attendees: ,(.+),Topics', infos).group(1).strip()
                item['content']['attendees'] = attendees
            except:
                try:
                    attendees = re.search('Attendees: ,(.+)', infos).group(1).strip()
                    item['content']['attendees'] = attendees
                except Exception as ex:
                    print("With no Attendees: " + response.url)
        try:
            source = re.search('Source: ,(.+),Topics', infos).group(1).strip()
            item['content']['source'] = source
        except Exception as ex:
            print("With no Source: " + response.url)
        try:
            topics = re.search('Topics: ,(.+)', infos).group(1).strip()
            item['content']['topics'] = topics
        except Exception as ex:
            print("With no Topics: " + response.url)
        yield item
class GubaSpider(Spider):
    name = 'CrawlerGuba2'
    logger = util.set_logger(name, LOG_FILE_GUBA)

    def start_requests(self):
        start_urls = "http://guba.eastmoney.com/news,v,47652005.html"
        yield Request(url=start_urls, meta={'replynum': 1832}, callback=self.parse)

    def parse(self, response):
        hxs = Selector(response)
        posts = hxs.xpath('//div[@class="articleh"]').extract()
        for post in posts:
            item = GubaItem()
            item['content'] = {}
            readnum = Selector(text=post).xpath('//span[@class="l1"]/text()').extract()
            if readnum:
                readnum = readnum[0]
            replynum = Selector(text=post).xpath('//span[@class="l2"]/text()').extract()
            if replynum:
                replynum = replynum[0]
            url = Selector(text=post).xpath('//span[@class="l3"]/a/@href').extract()
            if url:
                url = url[0]
                guba_id = re.search(',(.+).html', response.url).group(1)
                if str(guba_id) in str(url):
                    m_stock = re.search("(^\/.+)", url)
                    if m_stock:
                        post_url = "http://guba.eastmoney.com" + m_stock.group(1)
                        post_id = re.search('\/(n.+)\.html', url).group(1)
                        item['content']['readnum'] = readnum
                        item['content']['replynum'] = replynum
                        item['content']['post_id'] = post_id
                        yield Request(url=post_url, meta={'item': item, 'replynum': replynum},
                                      callback=self.parse_post)

    ## crawl the post details and paginate through the replies
    def parse_post(self, response):
        if response.status == 200:
            hxs = Selector(response)
            item = response.meta['item']
            dt = hxs.xpath('//div[@class="zwfbtime"]/text()').extract()[0]
            dt = re.search('\D+(\d{4}-\d{2}-.+:\d{2})', dt).group(1)
            creat_time = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
            item['content']['create_time'] = creat_time
            try:
                author_url = hxs.xpath('//div[@id="zwconttbn"]/strong/a/@href').extract()[0]
                item['content']['author_url'] = author_url
            except:
                try:
                    author = hxs.xpath('//div[@id="zwconttbn"]//span/text()').extract()[0]
                    item['content']['author'] = author
                except Exception as ex:
                    print("Decode webpage failed: " + response.url)
                    return
            try:
                # ordinary posts
                postcontent = hxs.xpath(
                    '//div[@id="zwconbody"]/div[@class="stockcodec"]/text()').extract()[0].strip()
                if postcontent:
                    item['content']['content'] = postcontent
                postitle = hxs.xpath(
                    '//div[@class="zwcontentmain"]/div[@id="zwconttbt"]/text()').extract()[0].strip()
                item['content']['title'] = postitle
            except:
                # Q&A posts
                try:
                    postcontent = hxs.xpath(
                        '//div[@class="qa"]//div[contains(@class,"content")]/text()').extract()
                    postquestion = postcontent[0]
                    postanswer = postcontent[2].strip() + postcontent[3].strip()
                    item['content']['content'] = postquestion
                    item['content']['answer'] = postanswer
                    postanswer_time = hxs.xpath('//div[@class="sign"]/text()').extract()
                    try:
                        postanswer_time = hxs.xpath('//div[@class="sign"]/text()').extract()
                        postanswer_time = re.search('\D+(\d{4}-\d{2}-.+:\d{2})',
                                                    postanswer_time[1].strip()).group(1)
                        answer_time = datetime.strptime(postanswer_time, "%Y-%m-%d %H:%M:%S")
                        item['content']['answer_time'] = answer_time
                    except Exception as ex:
                        item['content']['answer_time'] = None
                    postitle = "Q&A"
                    item['content']['title'] = postitle
                except Exception as ex:
                    print("Parse Exception: " + response.url)
                    return
            replynum = response.meta['replynum']
            item['content']['reply'] = []
            if int(replynum) % 30 == 0:
                rptotal = int(int(replynum) / 30)
            else:
                rptotal = int(int(replynum) / 30) + 1
            if rptotal > 0:
                head = re.search('(.+)\.html', response.url).group(1)
                reply_url = head + "_" + str(1) + ".html"
                yield Request(url=reply_url,
                              meta={'item': item, 'page': 1, 'rptotal': rptotal, 'head': head},
                              callback=self.parse_reply)
            else:
                yield item

    def parse_reply(self, response):
        hxs = Selector(response)
        page = response.meta['page']
        rptotal = response.meta['rptotal']
        item = response.meta['item']
        head = response.meta['head']
        replists = hxs.xpath('//div[@id="zwlist"]/div[@class="zwli clearfix"]').extract()
        for replist in replists:
            reply = {}
            try:
                reply_author = Selector(text=replist).xpath(
                    '//div[@class="zwlianame"]//a/text()').extract()[0]
                reply['reply_author'] = reply_author
                reply_author_url = Selector(text=replist).xpath(
                    '//div[@class="zwlianame"]//a/@href').extract()[0]
                reply['reply_author_url'] = reply_author_url
            except:
                try:
                    reply_author = Selector(text=replist).xpath(
                        '//span[@class="zwnick"]/span').extract()[0]
                    reply_author = re.search('"gray">(.+)<\/span>', reply_author).group(1)
                    reply['reply_author'] = reply_author
                except Exception as ex:
                    print("Decode webpage failed: " + response.url)
                    return
            reply_time = Selector(text=replist).xpath('//div[@class="zwlitime"]/text()').extract()[0]
            reply_time = re.search('\D+(\d{4}-\d{2}-.+:\d{2})', reply_time).group(1)
            reply_time = datetime.strptime(reply_time, "%Y-%m-%d %H:%M:%S")
            reply['reply_time'] = reply_time
            reply_content = Selector(text=replist).xpath(
                '//div[@class="zwlitext stockcodec"]/text()').extract()
            if reply_content:
                reply['reply_content'] = reply_content[0]
            reply_quote_author = Selector(text=replist).xpath(
                '//div[@class="zwlitalkboxtext "]//a/text()').extract()
            if reply_quote_author:
                reply_quote_author = reply_quote_author[0]
                reply['reply_quote_author'] = reply_quote_author
            reply_quote_author_url = Selector(text=replist).xpath(
                '//div[@class="zwlitalkboxtext "]//a/@href').extract()
            if reply_quote_author_url:
                reply_quote_author_url = reply_quote_author_url[0]
                reply['reply_quote_author_url'] = reply_quote_author_url
            reply_quote_text = Selector(text=replist).xpath(
                '//div[@class= "zwlitalkboxtext "]/span/text()').extract()
            print(reply_quote_text)
            if reply_quote_text:
                reply_quote_content = reply_quote_text[0]
                reply['reply_quote_content'] = reply_quote_content
            item['content']['reply'].append(reply)
        print(item)
class MMBHistSpider(Spider):
    name = 'MMBHist'
    logger = util.set_logger(name, LOG_FILE_MMB)
    handle_httpstatus_list = [404, 460, 504]
    db = util.set_mongo_server()
    # crawl items sold by a single store ("一家在售")
    if_crawl_onestore = True
    # crawl items sold by multiple stores ("多家在售")
    if_crawl_multstore = False

    def start_requests(self):
        # items sold by a single store
        if self.if_crawl_onestore:
            bjids = []
            for id in self.db["MMB"].find({'bjid': {'$exists': True}}, {'bjid': 1, '_id': 0}):
                bjids.append(id['bjid'])
            bjids = list(set(bjids))
            # iterate each bjid
            all_page_n = len(bjids)
            for i in range(all_page_n):
                bjid = bjids[i].strip()
                now_page_n = i
                url = "http://tool.manmanbuy.com/history.aspx?action=gethistory&bjid=" + str(bjid)
                # progress
                if i % 500 == 0:
                    self.logger.info('single-store items: (%s / %s) %s%%' % (str(now_page_n), str(all_page_n),
                                     str(round(float(now_page_n) / all_page_n * 100, 1))))
                yield Request(url=url, callback=self.parse)
        # items sold by multiple stores
        if self.if_crawl_multstore:
            p_infos = []
            # pick the records with distinct spid, name and url
            pipeline = [
                {'$match': {'bjid': {'$exists': False}}},
                {'$group': {'_id': {'spid': '$spid', 'name': '$name', 'url': '$url'}}},
            ]
            cur = self.db.MMB.aggregate(pipeline)
            for i in cur:
                p_infos.append(i['_id'])
            all_page_n_mult = len(p_infos)
            for i in range(all_page_n_mult):
                p_info = p_infos[i]
                url = p_info['url']
                now_page_n = i
                # progress
                if i % 500 == 0:
                    self.logger.info('multi-store items: (%s / %s) %s%%' % (str(now_page_n), str(all_page_n_mult),
                                     str(round(float(now_page_n) / all_page_n_mult * 100, 1))))
                yield Request(url=url, meta={"p_info": p_info}, callback=self.parse_mult)
                #yield Request(url = 'http://www.manmanbuy.com/pb_567731.aspx', meta = {"p_info":p_info}, callback = self.parse_mult)

    def parse_mult(self, response):
        try:
            if response.status == 200:
                # carry over the item from the previous step
                p_info = response.meta['p_info']
                # parse the per-platform links for the same product
                nodes = response.xpath('//div[contains(@class, "pro-mall-list")]//ul//li//div[contains(@class, "item ")]')
                for n in nodes:
                    # the shop name differs from siteName: for the same siteName = 天猫 (Tmall),
                    # seller_name can be "vivo旗舰店" or "vivo天诚专卖店"
                    seller_name = n.xpath('div[contains(@class, "mall")]//text()').extract()
                    seller_name = ' '.join(' '.join(seller_name).split())
                    # get skuid
                    skuid = n.xpath('@skuid').extract()[0]
                    # get bjid
                    bjid = n.xpath('@v').extract()[0].strip()
                    bjid = ast.literal_eval(bjid)['bjid']
                    p_info.update({"seller_name": seller_name, "skuid": skuid, "bjid": bjid})
                    # build the request
                    url = "http://tool.manmanbuy.com/history.aspx?action=gethistory&bjid=" + str(bjid)
                    yield Request(url=url, meta={"p_info": p_info}, callback=self.parse)
            else:
                self.logger.error('HTTP status not 200: %s \n %s' % (response.url, response.body))
        except Exception as ex:
            self.logger.error('Parse Exception - "parse_mult": %s %s' % (str(ex), response.url))

    def parse(self, response):
        try:
            # parse normally on a 200 response
            if response.status == 200:
                # carry over the item from the previous step (if any)
                p_info = {}
                if "p_info" in response.meta:
                    p_info = response.meta['p_info']
                # parse the price json
                body = re.sub('[\s]', '', response.body.decode('gbk'))
                body = json.loads(body)
                # add the basic product info to p_info
                p_info.update({k: body[k] for k in ('siteName', 'siteId', 'zouShi', 'bjid',
                                                    'spName', 'spUrl', 'spbh', 'zouShi_test')})
                # p_hist only contains the price/date pairs
                p_hist = body['datePrice']
                p_hist = re.findall("\[(.+?)\]", p_hist)
                # "unroll" the price list
                docs = []
                lastcrawl = datetime.datetime.utcnow()
                for p in p_hist:
                    # date
                    m = re.search("Date.UTC\((.+?)\),([\d\.]+)", p)
                    if m:
                        date = m.group(1)
                        # convert the strptime result to UTC
                        date = datetime.datetime.strptime(date, "%Y,%m,%d") - datetime.timedelta(hours=8)
                        # price
                        price = float(m.group(2).strip())
                        # create doc and add to docs
                        doc = p_info
                        doc.update({"date": date, "price": price, "lastcrawl": lastcrawl})
                        docs.append(copy.deepcopy(doc))
                item = PriceItem()
                item['content'] = docs
                yield item
            else:
                self.logger.error('Got %s: %s' % (response.status, response.url))
        except Exception as ex:
            self.logger.error('Parse Exception - "parse": %s %s' % (str(ex), response.url))
            self.logger.info(str(response.body))
class GubaSpider(Spider):
    name = 'CrawlerGuba2'
    logger = util.set_logger(name, LOG_FILE_GUBA)

    def start_requests(self):
        start_urls = "http://guba.eastmoney.com/news,600029,18449146.html"
        yield Request(url=start_urls, meta={'replynum': 0}, callback=self.parse)

    # crawl the post details and paginate through the replies
    def parse(self, response):
        try:
            if response.status == 200:
                try:
                    filter_body = response.body.decode('utf8')
                except:
                    try:
                        filter_body = response.body.decode("gbk")
                    except:
                        try:
                            filter_body = response.body.decode("gb2312")
                        except Exception as ex:
                            print("Decode webpage failed: " + response.url)
                            return
                filter_body = re.sub('<[A-Z]+[0-9]*[^>]*>|</[A-Z]+[^>]*>', '', filter_body)
                response = response.replace(body=filter_body)
                hxs = Selector(response)
                item = GubaItem()
                dt = hxs.xpath('//div[@class="zwfbtime"]/text()').extract()[0]
                dt = re.search('\D+(\d{4}-\d{2}-.+:\d{2}).+', dt).group(1)
                creat_time = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
                item['content'] = {}
                item['content']['create_time'] = creat_time
                try:
                    # the poster is a registered member
                    author_url = hxs.xpath('//div[@id="zwconttbn"]/strong/a/@href').extract()[0]
                    item['content']['author_url'] = author_url
                except Exception as ex:
                    # the poster is not a registered member
                    author = hxs.xpath('//div[@id="zwconttbn"]//span').extract()[0]
                    author = re.search('gray">(.+)<\/span', author).group(1)
                    item['content']['author'] = author
                try:
                    # ordinary posts
                    postcontent = hxs.xpath(
                        '//div[@id="zwconbody"]/div[@class="stockcodec"]/text()').extract()[0].strip()
                    if postcontent:
                        item['content']['content'] = postcontent
                    postitle = hxs.xpath(
                        '//div[@class="zwcontentmain"]/div[@id="zwconttbt"]/text()').extract()[0].strip()
                    item['content']['title'] = postitle
                except:
                    # Q&A posts
                    try:
                        postcontent = hxs.xpath(
                            '//div[@class="qa"]//div[contains(@class,"content")]/text()').extract()
                        postquestion = postcontent[0]
                        postanswer = postcontent[2].strip() + postcontent[3].strip()
                        item['content']['content'] = postquestion
                        item['content']['answer'] = postanswer
                        try:
                            postanswer_time = hxs.xpath('//div[@class="sign"]/text()').extract()
                            postanswer_time = re.search('\D+(\d{4}-\d{2}-.+:\d{2})',
                                                        postanswer_time[1].strip()).group(1)
                            answer_time = datetime.strptime(postanswer_time, "%Y-%m-%d %H:%M:%S")
                            item['content']['answer_time'] = answer_time
                        except Exception as ex:
                            item['content']['answer_time'] = None
                        postitle = "Q&A"
                        item['content']['title'] = postitle
                    except Exception as ex:
                        print("Decode webpage content failed: " + response.url)
                        return
                replynum = response.meta['replynum']
                item['content']['replynum'] = replynum
                item['content']['reply'] = []
                if int(replynum) % 30 == 0:
                    rptotal = int(int(replynum) / 30)
                else:
                    rptotal = int(int(replynum) / 30) + 1
                if rptotal > 0:
                    head = re.search('(.+)\.html', response.url).group(1)
                    reply_url = head + "_" + str(1) + ".html"
                    yield Request(url=reply_url,
                                  meta={'item': item, 'page': 1, 'rptotal': rptotal, 'head': head},
                                  callback=self.parse_reply)
                else:
                    yield item
                    print(item)
        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))

    def parse_reply(self, response):
        page = response.meta['page']
        rptotal = response.meta['rptotal']
        item = response.meta['item']
        head = response.meta['head']
        hxs = Selector(response)
        replists = hxs.xpath('//div[@id="zwlist"]/div[@class="zwli clearfix"]').extract()
        for replist in replists:
            reply = {}
            try:
                reply_author = Selector(text=replist).xpath(
                    '//div[@class="zwlianame"]//a/text()').extract()[0]
                reply['reply_author'] = reply_author
                reply_author_url = Selector(text=replist).xpath(
                    '//div[@class="zwlianame"]//a/@href').extract()[0]
                reply['reply_author_url'] = reply_author_url
            except:
                try:
                    reply_author = Selector(text=replist).xpath(
                        '//span[@class="zwnick"]/span').extract()[0]
                    reply_author = re.search('"gray">(.+)<\/span>', reply_author).group(1)
                    reply['reply_author'] = reply_author
                except Exception as ex:
                    print("Decode webpage reply_author failed : " + response.url)
                    return
            reply_time = Selector(text=replist).xpath('//div[@class="zwlitime"]/text()').extract()[0]
            reply_time = re.search('\D+(\d{4}-\d{2}-.+:\d{2})', reply_time).group(1)
            reply_time = datetime.strptime(reply_time, "%Y-%m-%d %H:%M:%S")
            reply['reply_time'] = reply_time
            reply_content = Selector(text=replist).xpath(
                '//div[contains(@class, "stockcodec")]').extract()[0]
            try:
                reply_content = re.search('stockcodec">(.+)<', reply_content).group(1).strip()
                reply['reply_content'] = reply_content
            except Exception as ex:
                reply['reply_content'] = reply_content
            reply_quote_author = Selector(text=replist).xpath(
                '//div[@class="zwlitalkboxuinfo"]//a/text()').extract()
            if reply_quote_author:
                reply_quote_author = reply_quote_author[0]
                reply['reply_quote_author'] = reply_quote_author
            reply_quote_author_url = Selector(text=replist).xpath(
                '//div[@class="zwlitalkboxuinfo"]//a/@href').extract()
            if reply_quote_author_url:
                reply_quote_author_url = reply_quote_author_url[0]
                reply['reply_quote_author_url'] = reply_quote_author_url
            reply_quote_text = Selector(text=replist).xpath(
                '//div[@class= "zwlitalkboxtext"]').extract()
            if reply_quote_text:
                reply_quote_text = reply_quote_text[0]
                reply_quote_content = re.search('"zwlitalkboxtext">(.+)<\/div>',
                                                str(reply_quote_text)).group(1)
                reply['reply_quote_content'] = reply_quote_content
            reply_quote_timestamp = Selector(text=replist).xpath(
                '//div[@class="zwlitalkboxtime"]/text()').extract()
            if reply_quote_timestamp:
                reply_quote_timestamp = re.search('\D+(\d{4}.+:\d{2})',
                                                  reply_quote_timestamp[0]).group(1)
                reply_quote_timestamp = re.sub("/", "-", reply_quote_timestamp)
                reply_quote_time = datetime.strptime(str(reply_quote_timestamp), "%Y-%m-%d %H:%M:%S")
                reply['reply_quote_time'] = reply_quote_time
                print(reply_quote_author_url)
            item['content']['reply'].append(reply)
        if page == rptotal:
            author_url = item['content']['author_url']
            yield Request(url=author_url, meta={'item': item}, callback=self.parse_author)
        elif page < rptotal:
            reply_url = head + "_" + str(page + 1) + ".html"
            yield Request(url=reply_url,
                          meta={'item': item, 'rptotal': rptotal, 'page': page + 1, 'head': head},
                          callback=self.parse_reply)

    def parse_author(self, response):
        item = response.meta['item']
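        # NOTE: the remainder of parse_author is missing from the source; as an assumed minimal
        # completion, emit the accumulated item so the post and its replies are not lost.
        yield item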
class XQCubeRetSpider(RedisSpider):
    name = 'xq_cube_ret'
    start_at = datetime.now()
    logger = util.set_logger(name, LOG_FILE_CUBE_RET)
    website_possible_httpstatus_list = [301, 302, 404]
    handle_httpstatus_list = [301, 302]
    cube_type = 'SP'
    # date of the last maintenance run; update this before each crawl
    start_time = '2020-01-01'

    def start_requests(self):
        zh_url = 'https://xueqiu.com/cubes/nav_daily/all.json?cube_symbol='
        sp_url = 'https://xueqiu.com/service/tc/snowx/PAMID/cubes/nav_daily/all?cube_symbol='
        # get start urls from MongoDB
        db = util.set_mongo_server()
        symbols = []
        for s in db.xq_cube_info.find({'cube_type': self.cube_type}, {'symbol': 1, '_id': 0}):
            symbols.append(s['symbol'])
        symbols = list(set(symbols))
        for s in db.fail.find({}, {'cube_symbol': 1, '_id': 0}):
            symbols.append(s['cube_symbol'])
        symbols = list(set(symbols))
        # iterate over each symbol
        all_page_n = len(symbols)
        for i in range(all_page_n):
            now_page_n = i
            symbol = symbols[i].strip()
            if self.cube_type == 'SP':
                url = sp_url + symbol
            elif self.cube_type == 'ZH':
                url = zh_url + symbol
            # progress
            if i % 1000 == 0:
                self.logger.info('%s (%s / %s) %s%%' % (symbol, str(now_page_n), str(all_page_n),
                                 str(round(float(now_page_n) / all_page_n * 100, 1))))
            yield Request(url=url,
                          meta={'symbol': symbol, 'cube_type': self.cube_type},
                          callback=self.parse)

    def parse(self, response):
        try:
            if response.status == 200 and str(response.url) != "https://xueqiu.com/service/captcha":
                item = XQItem()
                body = re.sub('[\s]', '', response.body.decode('utf-8'))
                body = json.loads(body)
                if body:
                    total_num = len(body[0]['list'])
                    # walk the daily NAV list from newest to oldest; stop once rows are older than start_time
                    for i in range(total_num - 1, -1, -1):
                        content = body[0]['list'][i]
                        if content['date'] < self.start_time:
                            return
                        else:
                            content['cube_symbol'] = response.meta['symbol']
                            content['cube_type'] = response.meta['cube_type']
                            item['url'] = response.url
                            item['content'] = content
                            item['fp'] = request_fingerprint(response.request)
                            yield item
            if response.status == 302 or str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: %s' % (response.meta['symbol']))
                oldmeta = response.request.meta
                oldmeta["change_proxy"] = True
                yield Request(url=response.request.url, meta=oldmeta, callback=self.parse)
        except Exception as ex:
            self.logger.warning('Parse Exception: %s %s' % (str(ex), response.url))
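# XQCubeRetSpider.parse scans the cube's daily NAV list from newest to oldest and stops at the
# first record older than `start_time`, so only rows added since the last maintenance run are
# emitted. A minimal standalone sketch of that cutoff; the function name and sample data are
# illustrative only and not part of the spider above.
def nav_rows_since(nav_list, start_time):
    """Yield NAV rows with date >= start_time, scanning from the end of the list backwards."""
    for row in reversed(nav_list):
        if row['date'] < start_time:   # dates are ISO "YYYY-MM-DD" strings, so string comparison works
            return
        yield row

# e.g. list(nav_rows_since([{'date': '2019-12-31'}, {'date': '2020-01-02'}], '2020-01-01'))
# -> [{'date': '2020-01-02'}]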
class SinaNewsSpider(Spider):
    name = "sina_news"
    logger = util.set_logger(name, LOG_FILE_SINANEWS)
    handle_httpstatus_list = [404]

    def start_requests(self):
        # news channel ids (lid): 2510 domestic, 2511 international, 2669 society, 2512 sports,
        # 2513 entertainment, 2514 military, 2515 tech, 2516 finance, 2517 stock market, 2518 US stocks
        channel_list = {
            '2510': '国内', '2511': '国际', '2669': '社会', '2512': '体育', '2513': '娱乐',
            '2514': '军事', '2515': '科技', '2516': '财经', '2517': '股市', '2518': '美股'
        }
        for lid in channel_list.keys():
            lid = "2516"  # temporary override: only crawl the finance (财经) channel
            # timestamps for the start and end of the crawl window
            etime = time.strptime("2018-10-01 00:00:00", "%Y-%m-%d %H:%M:%S")
            stime = time.strptime("2019-01-02 00:00:00", "%Y-%m-%d %H:%M:%S")
            etime = str(int(time.mktime(etime)))
            stime = str(int(time.mktime(stime)))
            ctime = stime
            # channel
            channel = {'title': channel_list[lid], 'id': lid, 'cType': 'lid', 'url': ''}
            start_url = ("https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=%s&etime=%s&stime=%s"
                         "&ctime=%s&k=&num=50&page=1" % (lid, etime, stime, ctime))
            yield Request(url=start_url,
                          meta={'start_url': start_url, 'channel': channel},
                          callback=self.parse_page)

    def parse_page(self, response):
        js = json.loads(response.body)
        # 50 headlines per roll-API page
        if js['result']['total'] % 50:
            page_total = js['result']['total'] // 50 + 1
        else:
            page_total = js['result']['total'] // 50
        start_url = response.meta['start_url']
        start_url = start_url.rstrip('1')
        for i in range(1, page_total + 1):
            url = start_url + str(i)
            yield Request(url=url, meta={'channel': response.meta['channel']}, callback=self.parse)

    def parse(self, response):
        js = json.loads(response.body.decode(response.encoding))
        for i in js['result']['data']:
            item = SinaNewsItem()
            item['content'] = {}
            # the API exposes several timestamps; ctime is used here
            item['content']['time'] = datetime.fromtimestamp(int(i['ctime']))
            # author, source, keywords, title, news_id, type, pic, channel
            item['content']['author'] = i['author']
            item['content']['source'] = i['media_name']
            item['content']['keywords'] = i['keywords']
            item['content']['title'] = i['title']
            item['content']['news_id'] = i['docid']
            item['content']['type'] = i['categoryid']
            item['content']['pic'] = i['images']
            item['content']['channel'] = response.meta['channel']
            # cmt_id
            cmt_id = {}
            cmtid = re.search('(.+?):(.+):', i['commentid'])
            if cmtid:
                cmt_id['channel'] = cmtid.group(1)
                cmt_id['comment_id'] = cmtid.group(2)
            item['content']['cmt_id'] = cmt_id
            # url
            url = i['url']
            item['content']['url'] = url
            # reply number
            if 'comment_show' in i:
                replynum = int(i['comment_show'])
            else:
                replynum = 0
            yield Request(url=url,
                          meta={'item': item, 'cmt_id': cmt_id, 'replynum': replynum},
                          callback=self.parse_content)

    def parse_content(self, response):
        if response.status == 200:
            item = response.meta['item']
            tags = response.xpath('//head/*[@name = "tags"]/@content').extract()
            if tags:
                item['content']['tags'] = tags[0]
            # article create / update / publish time
            create = response.xpath('//head/*[@name = "weibo: article:create_at"]/@content').extract()
            if create:
                item['content']['news_create_time'] = create[0]
            update = response.xpath('//head/*[@name = "weibo: article:update_at"]/@content').extract()
            if update:
                item['content']['news_update_time'] = update[0]
            publish = response.xpath('//head/*[@property = "article:published_time"]/@content').extract()
            if publish:
                item['content']['news_publish_time'] = publish[0]
            # parse content
            content = response.xpath('//*[@id="artibody"]/p/text()').extract()
            if content:
                item['content']['content'] = "\n".join(content)
            # parse replies
            replynum = response.meta['replynum']
            cmt_id = response.meta['cmt_id']
            if replynum and ('channel' in cmt_id):
                # number of pages in the comment API (20 comments per page)
                if replynum % 20:
                    rptotal = replynum // 20 + 1
                else:
                    rptotal = replynum // 20
                page = 1
                cmt_url = ('http://comment5.news.sina.com.cn/page/info?format=json&channel=%s&newsid=%s&page='
                           % (cmt_id['channel'], cmt_id['comment_id']))
                reply_url = cmt_url + str(page)
                reply = {}
                yield Request(url=reply_url,
                              meta={'item': item, 'page': page, 'rptotal': rptotal,
                                    'cmt_url': cmt_url, 'reply': reply},
                              callback=self.parse_reply)
            else:
                yield item
        elif response.status == 404:
            self.logger.error("Page 404: %s" % (response.url))
            return

    def parse_reply(self, response):
        item = response.meta['item']
        page = response.meta['page']
        rptotal = response.meta['rptotal']
        cmt_url = response.meta['cmt_url']
        reply = response.meta['reply']
        d_json = json.loads(response.body.decode(response.encoding))
        if 'cmntlist' in d_json['result']:
            if 'reply' in item['content']:
                item['content']['reply']['reply_content'].extend(d_json['result']['cmntlist'])
            else:
                if 'count' in d_json['result']:
                    reply['replynum'] = d_json['result']['count']['show']
                    reply['hotness'] = d_json['result']['count']['total']
                    reply['qreply'] = d_json['result']['count']['qreply']
                item['content']['reply'] = reply
                item['content']['reply']['reply_content'] = d_json['result']['cmntlist']
        if page == rptotal:
            yield item
        else:
            reply_url = cmt_url + str(page + 1)
            yield Request(url=reply_url,
                          meta={'item': item, 'page': page + 1, 'rptotal': rptotal,
                                'cmt_url': cmt_url, 'reply': reply},
                          callback=self.parse_reply)
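# SinaNewsSpider pages both the roll API (50 headlines per page) and the comment API
# (20 comments per page) with the same ceiling division. A tiny sketch of that calculation,
# kept separate here purely for illustration; the spider inlines this arithmetic.
def page_count(total, per_page):
    """Number of API pages needed to cover `total` items."""
    return total // per_page + (1 if total % per_page else 0)

# e.g. page_count(101, 50) -> 3 roll pages; page_count(40, 20) -> 2 comment pages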
class GubaExFundSpider(Spider):
    name = 'guba_stock_posts'
    logger = util.set_logger(name, LOG_FILE_GUBAEXFUND)
    #handle_httpstatus_list = [404]
    #website_possible_httpstatus_list = [404]

    def start_requests(self):
        start_url = 'http://guba.eastmoney.com/remenba.aspx?type='
        for type in range(1, 5):
            start_urls = start_url + str(type)
            yield Request(url=start_urls, meta={'type': type}, callback=self.parse)

    # parse the board index page for each board type
    def parse(self, response):
        type = response.meta['type']
        hxs = Selector(response)
        # board list XPaths: 1 = individual-stock boards (个股吧), 2 = topic boards (主题吧),
        # 3 = industry boards (行业吧), 4 = concept boards (概念吧); the four branches of the
        # original code used the same extraction loop, so they are folded into one here
        board_xpaths = {
            1: '//div[@class="ngbglistdiv"]/ul[@class="ngblistul2"]/li/a',
            2: '//div[@class="allzhutilistb"]/ul/li/a',
            3: '//ul[@class="ngblistitemul"]/li/a',
            4: '//ul[@class="ngblistitemul"]/li/a',
        }
        #fund_orgs = hxs.xpath('//div[@class="ngbglistdiv"]/ul[@class="ngblistul2"]/div[@class="ngbglistjjt"]/a').extract()
        #funds = hxs.xpath('//div[@class="ngbglistdiv"]/ul[@class="ngblistul2"]/ul[@class="ngblistul3"]/li/a').extract()
        stocks = hxs.xpath(board_xpaths[type]).extract()
        # extract each board's URL and name
        for stock in stocks:
            m_stocks = re.search('href="(.+)">(.+)<\/a', stock)
            if m_stocks:
                item = GubaItem()
                item['content'] = {}
                url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                item['content']['guba_url'] = url_stock
                item['content']['guba_name'] = m_stocks.group(2)
                yield Request(url=url_stock, meta={'item': item}, callback=self.parse_page_num)

    # parse the number of post-list pages for each board
    def parse_page_num(self, response):
        item = response.meta['item']
        #forum_url = response.meta['forum_url']
        hxs = Selector(response)
        p = hxs.xpath('//div[@id="mainbody"]//div[@class="pager"]//@data-pager').extract()[0]
        m = re.search('(.*_)\|(.*)\|(.+)\|(.*)', p)
        postnums = m.group(2)
        heads = m.group(1)
        #sfnums = headnums.group(1)
        item['content']['postnums'] = int(postnums)
        #item['content']['s&f_nums'] = sfnums
        # 80 posts per list page
        if item['content']['postnums'] % 80 == 0:
            ptotal = item['content']['postnums'] // 80
        else:
            ptotal = item['content']['postnums'] // 80 + 1
        if int(ptotal) == 0:
            yield item
        else:
            for i in range(int(ptotal)):
                p_url = "http://guba.eastmoney.com/" + heads + str(i) + ".html"
                yield Request(p_url, meta={'item': item}, callback=self.parse_post_list)

    # read post stats from each board list page and follow every post on it
    def parse_post_list(self, response):
        hxs = Selector(response)
        posts = hxs.xpath('//div[@class="articleh"]').extract()
        item = response.meta['item']
        for post in posts:
            readnum = Selector(text=post).xpath('//span[@class="l1"]/text()').extract()
            if readnum:
                readnum = readnum[0]
                item['content']['readnum'] = readnum
            replynum = Selector(text=post).xpath('//span[@class="l2"]/text()').extract()
            if replynum:
                replynum = replynum[0]
                item['content']['replynum'] = replynum
            url = Selector(text=post).xpath('//span[@class="l3"]/a/@href').extract()
            if url:
                url = url[0]
                guba_id = re.search(',(.+)_\d+\.html', response.url).group(1)
                if guba_id in url:
                    m_stock = re.search("(^\/.+)", url)
                    if m_stock:
                        post_url = "http://guba.eastmoney.com" + m_stock.group(1)
                        item['url'] = post_url
                        post_id = re.search('\/(n.+)\.html', url).group(1)
                        item['content']['post_id'] = post_id
                        yield Request(url=post_url,
                                      meta={'item': copy.deepcopy(item), 'replynum': replynum},
                                      callback=self.parse_post)

    def parse_post(self, response):
        try:
            if response.status == 200:
                # Eastmoney serves a mix of encodings; fall back from utf-8 to gbk to gb2312
                try:
                    filter_body = response.body.decode('utf8')
                except Exception:
                    try:
                        filter_body = response.body.decode("gbk")
                    except Exception:
                        try:
                            filter_body = response.body.decode("gb2312")
                        except Exception as ex:
                            print("Decode webpage failed: " + response.url)
                            return
                filter_body = re.sub('<[A-Z]+[0-9]*[^>]*>|</[A-Z]+[^>]*>', '', filter_body)
                response = response.replace(body=filter_body)
                hxs = Selector(response)
                item = response.meta['item']
                dt = hxs.xpath('//div[@class="zwfbtime"]/text()').extract()[0]
                dt = re.search('\D+(\d{4}-\d{2}-.+:\d{2})', dt).group(1)
                create_time = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
                item['content']['create_time'] = create_time
                author_url = hxs.xpath('//div[@id="zwconttbn"]/strong/a/@href').extract()[0]
                item['content']['author_url'] = author_url
                try:
                    # regular posts
                    postcontent = hxs.xpath('//div[@id="zwconbody"]/div[@class="stockcodec"]/text()').extract()[0].strip()
                    if postcontent:
                        item['content']['content'] = postcontent
                    postitle = hxs.xpath('//div[@class="zwcontentmain"]/div[@id="zwconttbt"]/text()').extract()[0].strip()
                    item['content']['title'] = postitle
                except Exception:
                    # Q&A posts
                    try:
                        postcontent = hxs.xpath('//div[@class="qa"]//div[contains(@class,"content")]/text()').extract()
                        postquestion = postcontent[0]
                        postanswer = postcontent[2].strip() + postcontent[3].strip()
                        item['content']['content'] = postquestion
                        item['content']['answer'] = postanswer
                        try:
                            postanswer_time = hxs.xpath('//div[@class="sign"]/text()').extract()
                            postanswer_time = re.search('\D+(\d{4}-\d{2}-.+:\d{2})', postanswer_time[1].strip()).group(1)
                            answer_time = datetime.strptime(postanswer_time, "%Y-%m-%d %H:%M:%S")
                            item['content']['answer_time'] = answer_time
                        except Exception as ex:
                            item['content']['answer_time'] = None
                        postitle = "Q&A"
                        item['content']['title'] = postitle
                    except Exception as ex:
                        print("Parse Exception: " + response.url)
                        return
                replynum = response.meta['replynum']
                item['content']['reply'] = []
                # 30 replies per reply page
                if int(replynum) % 30 == 0:
                    rptotal = int(int(replynum) / 30)
                else:
                    rptotal = int(int(replynum) / 30) + 1
                if rptotal > 0:
                    head = re.search('(.+)\.html', response.url).group(1)
                    reply_url = head + "_" + str(1) + ".html"
                    yield Request(url=reply_url,
                                  meta={'item': item, 'page': 1, 'rptotal': rptotal, 'head': head},
                                  callback=self.parse_reply)
                else:
                    yield item
        except Exception as ex:
            self.logger.warning('Parse Exception all: %s %s' % (str(ex), response.url))

    def parse_reply(self, response):
        page = response.meta['page']
        rptotal = response.meta['rptotal']
        item = response.meta['item']
        head = response.meta['head']
        hxs = Selector(response)
        replists = hxs.xpath('//div[@id="zwlist"]/div[@class="zwli clearfix"]').extract()
        for replist in replists:
            reply = {}
            try:
                reply_author = Selector(text=replist).xpath('//div[@class="zwlianame"]//a/text()').extract()[0]
                reply['reply_author'] = reply_author
                reply_author_url = Selector(text=replist).xpath('//div[@class="zwlianame"]//a/@href').extract()[0]
                reply['reply_author_url'] = reply_author_url
            except Exception:
                try:
                    reply_author = Selector(text=replist).xpath('//span[@class="zwnick"]/span').extract()[0]
                    reply_author = re.search('"gray">(.+)<\/span>', reply_author).group(1)
                    reply['reply_author'] = reply_author
                except Exception as ex:
                    print("Decode webpage failed: " + response.url)
                    return
            reply_time = Selector(text=replist).xpath('//div[@class="zwlitime"]/text()').extract()[0]
            reply_time = re.search('\D+(\d{4}-\d{2}-.+:\d{2})', reply_time).group(1)
            reply_time = datetime.strptime(reply_time, "%Y-%m-%d %H:%M:%S")
            reply['reply_time'] = reply_time
            reply_content = Selector(text=replist).xpath('//div[@class="zwlitext stockcodec"]/text()').extract()
            if reply_content:
                reply['reply_content'] = reply_content[0].strip()
            reply_quote_author = Selector(text=replist).xpath('//div[@class="zwlitalkboxtext "]//a/text()').extract()
            if reply_quote_author:
                reply_quote_author = reply_quote_author[0]
                reply['reply_quote_author'] = reply_quote_author
            reply_quote_author_url = Selector(text=replist).xpath('//div[@class="zwlitalkboxtext "]//a/@href').extract()
            if reply_quote_author_url:
                reply_quote_author_url = reply_quote_author_url[0]
                reply['reply_quote_author_url'] = reply_quote_author_url
            reply_quote_text = Selector(text=replist).xpath('//div[@class= "zwlitalkboxtext "]/span/text()').extract()
            if reply_quote_text:
                reply_quote_text = reply_quote_text[0]
                reply['reply_quote_content'] = reply_quote_text
            item['content']['reply'].append(reply)
        if page == rptotal:
            yield item
        elif page < rptotal:
            reply_url = head + "_" + str(page + 1) + ".html"
            yield Request(url=reply_url,
                          meta={'item': item, 'rptotal': rptotal, 'page': page + 1, 'head': head},
                          callback=self.parse_reply)
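# Both Guba spiders try utf-8, then gbk, then gb2312 when decoding a post page, because Eastmoney
# serves a mix of encodings. A flattened sketch of that fallback chain; the function name is
# illustrative and the spiders above inline this logic rather than calling it.
def decode_guba_body(raw_bytes):
    """Decode a Guba response body, trying utf-8 first and Chinese legacy encodings after."""
    for enc in ('utf-8', 'gbk', 'gb2312'):
        try:
            return raw_bytes.decode(enc)
        except UnicodeDecodeError:
            continue
    return None  # caller logs the URL and skips the page, as parse_post does

# e.g. decode_guba_body('股吧'.encode('gbk')) -> '股吧'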
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import logging
from datetime import datetime, timedelta

from twisted.web._newclient import ResponseNeverReceived
from twisted.internet.error import TimeoutError, ConnectionRefusedError, ConnectError

from crawler import fetch_free_proxyes
from crawler.spiders import util
from crawler.settings import *

logger = util.set_logger("http_proxy", LOG_FILE_PROXY)


class HttpProxyMiddleware(object):
    # Treat these error types as "proxy unusable" and discard the proxy directly,
    # instead of passing the failure on to the retry middleware.
    DONT_RETRY_ERRORS = (TimeoutError, ConnectionRefusedError, ResponseNeverReceived,
                         ConnectError, ValueError)

    def __init__(self, settings):
        # time of the last direct (proxy-less) connection
        self.last_no_proxy_time = datetime.now()
        # switch back to direct connections after this many minutes, since proxies slow crawling down
        self.recover_interval = 20
        # if a proxy keeps timing out before it has been used this many times, remove it permanently;
        # set to 0 to never modify the proxy file
        self.dump_count_threshold = 20
        # file holding the proxy list, one proxy per line in the form proto://ip:port;
        # this file is modified in place, so keep a backup
        self.proxy_file = "proxyes.dat"
        # whether to disable a proxy after a timeout
        self.invalid_proxy_flag = True
        # when the number of valid proxies (including the direct connection) drops below this,
        # fetch new proxies from the web; set it high enough that each IP gets a long enough rest
        # after it is asked for a captcha