from aloe import *
from util import static_vals
from util import logger as log
from util.test_logic import api_test_logic as api_utils
from util.threading_logic import pool_logic as pool
from util.neighbor_logic import neighbor_logic as neighbors
from util.response_logic import response_handling as responses
from time import sleep, time

logger = log.getLogger(__name__)

testAddress = static_vals.TEST_ADDRESS

world.config = {}
world.responses = {}


@step(r'"([^"]+)" is called on "([^"]+)" with:')
def api_method_is_called(step, api_call, node_name):
    """
    This is the general api calling function. There are 3 inputs:

    :param api_call: The api call that will be requested
    :param node_name: The name identifying the node you would like to make this request on
    :param step.hashes: A gherkin table outlining any arguments needed for the call
                        (see tests/features/machine1/4_api_tests.feature for examples)

    The table parameter is unique in that there are several input types available
    depending on the call being made.

    :type string: Basic string argument, will be taken as is
    :type int: Basic integer argument, will be converted to int before the call is made
    """
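# For reference, aloe exposes the gherkin table to this function as
# `step.hashes`: a list of dicts keyed by the table's header row. The call
# name and column names below are hypothetical; the real tables live in
# tests/features/machine1/4_api_tests.feature.
#
#     When "getNodeInfo" is called on "nodeA" with:
#       |keys    |values       |type  |
#       |address |TEST_ADDRESS |string|
#
#     step.hashes == [{'keys': 'address', 'values': 'TEST_ADDRESS', 'type': 'string'}]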
import datetime
import json
import random
from logging import getLogger
from string import Template

import redis
import scrapy
from bs4 import BeautifulSoup
from scrapy_redis.spiders import RedisSpider  # provided by the scrapy-redis package


class SharesSpider(RedisSpider):
    name = 'shares'
    allowed_domains = ['szse.cn']
    logger = getLogger('Shenzhen')

    # Stock code listings
    url = Template("http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1110x&TABKEY=${key}&PAGENO=${pageno}&random=${random}")
    keys = ['tab1', 'tab2', 'tab3', 'tab4']
    # Historical daily line for a given date
    historyDay = Template('http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1815_stock&TABKEY=${key}&radioClass=00%2C20%2C30&txtSite=all&txtDMorJC=${code}&txtBeginDate=${date}&txtEndDate=${date}&random=${random}')
    # Company information
    company = Template('http://www.szse.cn/api/report/index/companyGeneralization?secCode=${code}&random=${random}')
    # Key indicators
    IndexGeneralization = Template('http://www.szse.cn/api/report/index/stockKeyIndexGeneralization?secCode=${code}&random=${random}')
    # Latest announcements
    annIndex = Template('http://www.szse.cn/api/disc/announcement/annIndex?secCode=${code}&random=${random}&channelCode=${channel}')
    # Market quotation data
    market = Template('http://www.szse.cn/api/market/ssjjhq/getTimeData?code=${code}&random=${random}&marketId=1')
    # Historical data
    history = Template('http://www.szse.cn/api/market/ssjjhq/getHistoryData?code=${code}&random=${random}&marketId=1&cycleType=${type}')
    # Announcements
    annList = 'http://www.szse.cn/api/disc/announcement/annList'

    pool = redis.ConnectionPool(host='127.0.0.1', port=6379)
    cache = redis.Redis(connection_pool=pool)

    def start_requests(self):
        pageno = 1
        for key in self.keys:
            meta = {'key': key, 'pageno': pageno, 'random': random.random()}
            url = self.url.substitute(meta)
            yield scrapy.Request(url, meta=meta, callback=self.parse)

    def parse(self, response):
        self.logger.info(response.url)
        key = response.meta['key']
        js = json.loads(response.body)
        for data in js:
            if data['metadata']['tabkey'] == key:
                for each in data['data']:
                    soup = BeautifulSoup(each['gsjc'], "lxml")
                    item = {}
                    item['code'] = each['zqdm']           # stock code
                    item['details'] = soup.a.get('href')  # details link
                    item['name'] = soup.a.get_text()      # short name
                    item['fullName'] = each['gsqc']       # full name
                    item['industry'] = each['sshymc']     # industry
                    item['url'] = each['http']            # company website
                    item["meta"] = data['metadata']['name']
                    item["type"] = "generalization"
                    yield item

                    code = item['code']

                    # Historical data: daily line for the cached cursor date
                    date = self.getDay(code, "historyDay")
                    meta = {'key': key, 'code': code, 'date': date, 'random': random.random()}
                    yield scrapy.Request(self.historyDay.substitute(meta), meta=meta, callback=self.parseHistoryDay)

                    if key != 'tab1':
                        continue

                    # Stock quotation
                    soup = BeautifulSoup(each['jqhq'], 'lxml')
                    url = soup.a.get('a-param')
                    url = 'http://www.szse.cn/api/report' + url
                    yield scrapy.Request(url, callback=self.parseQuotation)

                    # Company information
                    meta = {'key': key, 'code': code, 'random': random.random()}
                    yield scrapy.Request(self.company.substitute(meta), meta=meta, callback=self.parseCompany)
                    # Key indicators
                    yield scrapy.Request(self.IndexGeneralization.substitute(meta), meta=meta, callback=self.parseIndex)

                    # Latest announcements
                    meta = {'key': key, 'code': code, 'random': random.random(), "channel": "listedNotice_disc"}
                    yield scrapy.Request(self.annIndex.substitute(meta), meta=meta, callback=self.parseAnnIndex)
                    # Periodic reports
                    meta = {'key': key, 'code': code, 'random': random.random(), "channel": "fixed_disc"}
                    yield scrapy.Request(self.annIndex.substitute(meta), meta=meta, callback=self.parseAnnIndex)

                    # Market quotation data
                    meta = {'key': key, 'code': code, 'random': random.random()}
                    yield scrapy.Request(self.market.substitute(meta), meta=meta, callback=self.parseMarket)

                    # Historical data: daily
                    meta = {'key': key, 'code': code, 'random': random.random(), 'type': 32}
                    yield scrapy.Request(self.history.substitute(meta), meta=meta, callback=self.parseHistory)
                    # Historical data: weekly
                    meta = {'key': key, 'code': code, 'random': random.random(), 'type': 33}
                    yield scrapy.Request(self.history.substitute(meta), meta=meta, callback=self.parseHistory)
                    # Historical data: monthly
                    meta = {'key': key, 'code': code, 'random': random.random(), 'type': 34}
                    yield scrapy.Request(self.history.substitute(meta), meta=meta, callback=self.parseHistory)

                    # Announcements ("fixed_disc" is the other channel)
                    formdata = {
                        'channelCode': ["listedNotice_disc"],
                        'pageNum': '1',
                        'pageSize': '30',
                        'seDate': ["", ""],
                        'stock': [code]
                    }
                    yield scrapy.FormRequest(
                        url=self.annList + '?random=' + str(random.random()),
                        method="POST",
                        headers={'Content-Type': 'application/json'},
                        body=json.dumps(formdata),
                        meta=formdata,
                        callback=self.parseAnnList
                    )

                if data['metadata']['pageno'] * data['metadata']['pagesize'] < data['metadata']['recordcount']:
                    # After a page is processed, request the next one: bump
                    # pageno, rebuild the url, and reuse self.parse as callback.
                    pageno = response.meta['pageno'] + 1
                    meta = {'key': key, 'pageno': pageno, 'random': random.random()}
                    yield scrapy.Request(self.url.substitute(meta), meta=meta, callback=self.parse)

    # Stock quotation
    def parseQuotation(self, response):
        js = json.loads(response.body)
        for data in js:
            for each in data['data']:
                item = {}
                item['date'] = each['jyrq']          # trade date
                item['code'] = each['zqdm']          # stock code
                item['name'] = each['zqjc']          # short name
                item['settlement'] = each['qss']     # previous close
                item['trade'] = each['ss']           # close
                item['changepercent'] = each['sdf']  # change (%)
                item['amount'] = each['cjje']        # turnover
                item['pb'] = each['syl1']            # P/E ratio
                item['type'] = 'quotation'
                yield item

    # Company information
    def parseCompany(self, response):
        js = json.loads(response.body)
        if js['code'] == '0' and js['data'] is not None:
            item = {}
            for key in js['cols'].keys():
                item[js['cols'][key]] = js['data'][key]
            item['full'] = js['data']['gsqc']
            item['type'] = 'company'
            yield item

    # Key indicators
    def parseIndex(self, response):
        js = json.loads(response.body)
        if js['code'] == '0':
            now = js['data'][0]
            last = js['data'][1]
            change = js['data'][2]
            # cjje: total turnover (1e8 CNY)    cjbs: total volume (1e8 shares)
            # zgb:  total share capital (1e8)   ltgb: floating share capital (1e8)
            # sjzz: total market cap (1e8 CNY)  ltsz: floating market cap (1e8 CNY)
            # syl:  average P/E ratio           hsl:  average turnover ratio
            item = {}
            item['now'] = {
                'amount': now['now_cjje'],
                'volume': now['now_cjbs'],
                'capital': now['now_zgb'],
                'flowcapital': now['now_ltgb'],
                'mktcap': now['now_sjzz'],
                'nmc': now['now_ltsz'],
                'pb': now['now_syl'],
                'turnoverratio': now['now_hsl']
            }
            item['last'] = {
                'amount': last['last_cjje'],
                'volume': last['last_cjbs'],
                'capital': last['last_zgb'],
                'flowcapital': last['last_ltgb'],
                'mktcap': last['last_sjzz'],
                'nmc': last['last_ltsz'],
                'pb': last['last_syl'],
                'turnoverratio': last['last_hsl']
            }
            item['change'] = {
                'amount': change['change_cjje'],
                'volume': change['change_cjbs'],
                'capital': change['change_zgb'],
                'flowcapital': change['change_ltgb'],
                'mktcap': change['change_sjzz'],
                'nmc': change['change_ltsz'],
                'pb': change['change_syl'],
                'turnoverratio': change['change_hsl']
            }
            item['lastDate'] = js['lastDate']
            item['code'] = response.meta['code']
            item['type'] = 'index'
            yield item

    # Latest announcements
    def parseAnnIndex(self, response):
        js = json.loads(response.body)
        code = response.meta['code']
        for each in js['data']:
            item = {}
            item['code'] = code
            item['type'] = 'annIndex'
            item['title'] = each['title']
            item['publishTime'] = each['publishTime']
            item['attachPath'] = each['attachPath']
            item['attachFormat'] = each['attachFormat']
            yield item

    # Intraday market data -- polled once per minute in real time
    def parseMarket(self, response):
        self.logger.info(response.url)
        js = json.loads(response.body)
        if js['code'] != '0':
            return
        data = js['data']
        data['type'] = 'market'
        yield data
        # Field reference:
        # data['datatime']      market time      data['marketTime']  time
        # data['code']          code             data['name']        name
        # data['close']         previous close   data['delta']       change
        # data['deltaPercent']  change (%)       data['high']        high
        # data['low']           low              data['open']        open
        # data['now']           latest price     data['amount']      turnover
        # data['volume']        volume (lots)    data['lastVolume']  previous day volume (lots)
        # Minute-level data:
        # data['picavgprice']   average price per minute
        # data['picdowndata']   volume per minute
        # data['picupdata']     detail rows per minute:
        #   [0] time  [1] latest price  [2] average price  [3] change
        #   [4] change (%)  [5] volume  [6] turnover
        # data['sellbuy5']      order book: first 5 entries are asks, last 5 are bids

    # Historical market data
    def parseHistory(self, response):
        cycle = response.meta['type']
        if cycle == 32:
            cycle = 'day'
        elif cycle == 33:
            cycle = 'week'
        elif cycle == 34:
            cycle = 'month'
        js = json.loads(response.body)
        if js['code'] == '0':
            data = js['data']
            # Volume bars
            for each in data['picdowndata']:
                item = {}
                item["type"] = 'volume'
                item["code"] = data['code']
                item["cycle"] = cycle
                item['date'] = each[0]    # time
                item['volume'] = each[1]  # volume
                item['status'] = each[2]  # direction ("minus": down, "plus": up)
                yield item
            # Trade bars
            for each in data['picupdata']:
                item = {}
                item["type"] = 'transaction'
                item["code"] = data['code']
                item["cycle"] = cycle
                item['date'] = each[0]           # time
                item['open'] = each[1]           # open
                item['high'] = each[2]           # high
                item['low'] = each[3]            # low
                item['trade'] = each[4]          # close
                item['pricechange'] = each[5]    # change
                item['changepercent'] = each[6]  # change (%)
                item['volume'] = each[7]         # volume
                item['amount'] = each[8]         # turnover
                yield item

    def getDay(self, code, type_):
        if type_ == 'historyDay':
            date = self.cache.get(type_ + "-" + code)
            if date is None:
                return "1990-12-01"
            return str(date.decode('utf-8'))

    def setDay(self, code, type_, day):
        if type_ == 'historyDay':
            return self.cache.set(type_ + "-" + code, day)

    # Fetch data for a specific date
    def parseHistoryDay(self, response):
        js = json.loads(response.body)
        data = js[0]['data']
        if len(data) == 1:
            each = js[0]['data'][0]
            # "jyrq": trade date          "zqdm": stock code
            # "zqjc": short name          "qss":  previous close
            # "ss":   close               "sdf":  change (%)
            # "cjje": turnover (1e4 CNY)  "syl1": P/E ratio
            item = {}
            item["type"] = "historyDay"
            item["code"] = each["zqdm"]
            item["name"] = each["zqjc"]
            item["date"] = each["jyrq"]
            item["settlement"] = each["qss"]
            item["trade"] = each["ss"]
            item["changepercent"] = each["sdf"]
            item["amount"] = each["cjje"]
            item["pb"] = each["syl1"]
            yield item
        key = response.meta["key"]
        code = response.meta["code"]
        date = response.meta["date"]
        # Advance the cached date cursor one day and re-request until today.
        d = datetime.datetime.strptime(date, '%Y-%m-%d')
        d = d + datetime.timedelta(days=1)
        date = d.strftime('%Y-%m-%d')
        self.setDay(code, "historyDay", date)
        now = datetime.datetime.now().strftime('%Y-%m-%d')
        if date < now:
            meta = {'key': key, 'code': code, 'date': date, 'random': random.random()}
            yield scrapy.Request(self.historyDay.substitute(meta), meta=meta, callback=self.parseHistoryDay)

    # Announcements
    def parseAnnList(self, response):
        pageNum = int(response.meta['pageNum'])
        pagesize = int(response.meta['pageSize'])
        js = json.loads(response.body)
        totalCount = js['announceCount']
        index = totalCount - (pageNum - 1) * pagesize
        for each in js["data"]:
            index -= 1
            item = {}
            item['code'] = each["secCode"][0]
            item['type'] = 'annIndex'
            item['sortID'] = index
            item['title'] = each['title']
            item['publishTime'] = each['publishTime']
            item['attachPath'] = each['attachPath']
            item['attachFormat'] = each['attachFormat']
            item['attachSize'] = each['attachSize']
            yield item
        if "totalCount" in response.meta:
            totalCount = response.meta['totalCount']
        if pageNum * pagesize < int(totalCount):
            formdata = response.meta
            formdata['pageNum'] = pageNum + 1
            formdata['totalCount'] = totalCount
            yield scrapy.FormRequest(
                url=self.annList + '?random=' + str(random.random()),
                method="POST",
                headers={'Content-Type': 'application/json'},
                body=json.dumps(formdata),
                meta=formdata,
                callback=self.parseAnnList
            )
import copy
from logging import getLogger
from types import GeneratorType

import scrapy

# NOTE: `RulesControl` and `settings` are provided by project-local modules;
# their import paths are not shown in the original source.


class SharesSpider(scrapy.Spider):
    name = 'shares'
    rules = RulesControl()
    allowed_domains = rules.get_domains()
    logger = getLogger('Shares')
    keys = ['tab1', 'tab2', 'tab3', 'tab4']
    config = [
        {'scrapy:domain': 'Shenzhen', 'scrapy:type': 'Report', 'key': keys[0], 'pageno': 1},
        {'scrapy:domain': 'Shenzhen', 'scrapy:type': 'Report', 'key': keys[1], 'pageno': 1},
        {'scrapy:domain': 'Shenzhen', 'scrapy:type': 'Report', 'key': keys[2], 'pageno': 1},
        {'scrapy:domain': 'Shenzhen', 'scrapy:type': 'Report', 'key': keys[3], 'pageno': 1},
        {'scrapy:domain': 'Shanghai', 'scrapy:type': 'Stock', 'pageno': 0},
    ]

    def start_requests(self):
        for each in self.config:
            rule, core = self.rules.findRule(each)
            if rule is None:
                self.logger.error("find rule empty: %s", each)
                continue
            data = core['get'](core, each)
            if isinstance(data, GeneratorType):
                for req in data:
                    yield self.getRequest(core, req)
            else:
                yield self.getRequest(core, data)

    def Generator3Layer(self, data):
        # Flatten generators nested up to three levels deep.
        if isinstance(data, GeneratorType):
            for a in data:
                if isinstance(a, GeneratorType):
                    for b in a:
                        if isinstance(b, GeneratorType):
                            for c in b:
                                if isinstance(c, GeneratorType):
                                    for d in c:
                                        yield d
                                else:
                                    yield c
                        else:
                            yield b
                else:
                    yield a
        else:
            yield data

    def getRequest(self, core, req):
        key = 'scrapy:request'
        if key in core and len(core[key].keys()) > 0:
            if 'method' not in req:
                return scrapy.Request(req['url'], meta=req['meta'], **core[key], callback=self.parse)
            else:
                return scrapy.FormRequest(**req, **core[key], callback=self.parse)
        else:
            if 'method' not in req:
                return scrapy.Request(req['url'], meta=req['meta'], callback=self.parse)
            else:
                return scrapy.FormRequest(**req, callback=self.parse)

    def parse(self, response):
        meta = copy.deepcopy(response.meta)
        # Strip scrapy's bookkeeping keys before matching the meta to a rule.
        del meta['download_slot']     # slot (domain)
        del meta['download_latency']  # latency
        del meta['download_timeout']  # timeout
        del meta['depth']             # depth
        if settings.get('IGNOREREQUEST'):
            return
        rule, core = self.rules.findRule(meta)
        if rule is None:
            self.logger.error("find rule empty: %s", meta)
            return
        if rule.language != '':
            body = response.body.decode(rule.language)
        else:
            body = response.body
        result = core['parse'](core, meta, body)
        for data in result:
            if data is None:
                continue
            if isinstance(data, GeneratorType):
                for each in self.Generator3Layer(data):
                    if each is None:
                        continue
                    if not isinstance(each, dict):
                        self.logger.error("type error: %s", each)
                        continue
                    c = self.rules.findCore(rule, each)
                    for req in c['get'](c, each):
                        yield self.getRequest(c, req)
            else:
                data['scrapy:type'] = core['type']
                data['scrapy:domain'] = rule.static_domain
                yield data
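# Generator3Layer above hard-codes three levels of nesting. An equivalent
# recursive flattener (not part of the original source) handles any depth:
def flatten_generators(data):
    """Recursively yield the non-generator leaves of arbitrarily nested generators."""
    if isinstance(data, GeneratorType):
        for each in data:
            yield from flatten_generators(each)
    else:
        yield data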
import json
import random
import re
import time
from logging import getLogger
from string import Template

import demjson
import scrapy
from bs4 import BeautifulSoup

# NOTE: getLastClosingDate() is a project-local helper; its import is not
# shown in the original source.


class SharesSpider(scrapy.Spider):
    name = 'shares'
    logger = getLogger('Sina')
    allowed_domains = ['sina.com', '*.sina.com.cn', 'sina.com.cn', 'hq.sinajs.cn',
                       'data.gtimg.cn', 'stock.gtimg.cn', 'stock.finance.qq.com']
    url = "http://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php"
    # http://vip.stock.finance.sina.com.cn/mkt/
    codes = {
        'count': 'Market_Center.getHQNodeStockCount',
        'data': 'Market_Center.getHQNodeData',
        'nodes': 'Market_Center.getHQNodes'
    }
    nodes = {}

    def start_requests(self):
        param = self.getNodes_request()
        yield scrapy.Request(param['url'], callback=self.parseNodes)

    def getNodes_request(self):
        code = self.codes['nodes']
        url = self.url + '/' + code
        return {'url': url}

    def getCount_request(self, node, tag):
        code = self.codes['count']
        url = self.url + '/' + code + '?' + 'node=' + node
        return {'url': url, 'meta': {'node': node, 'tag': tag}}

    def getData_request(self, node, tag, page, count):
        code = self.codes['data']
        svcParam = {'page': page, 'num': 80, 'sort': 'symbol', 'asc': 1, 'node': node, 'symbol': ''}
        params = '&'.join(k + '=' + str(v) for k, v in svcParam.items())
        url = self.url + '/' + code + '?' + params
        return {'url': url, 'meta': {'node': node, 'count': count, 'page': page, 'tag': tag}}

    def makeNodes(self, arr):
        result = {}
        if len(arr) >= 6:
            name = BeautifulSoup(arr[0], "lxml").text
            childs = {}
            if isinstance(arr[1], str):
                result[name] = {'link': arr[1], 'type': arr[2], 'base': arr[3], 'other': arr[4]}
            else:
                for item in arr[1]:
                    childs.update(self.makeNodes(item))
                result[name] = {'child': childs, 'link': arr[2], 'type': arr[3], 'base': arr[4], 'other': arr[5]}
        elif len(arr) == 5:
            name = BeautifulSoup(arr[0], "lxml").text
            childs = {}
            if isinstance(arr[1], str):
                result[name] = {'link': arr[1], 'type': arr[2], 'base': arr[3], 'other': arr[4]}
            else:
                for item in arr[1]:
                    childs.update(self.makeNodes(item))
                result[name] = {'child': childs, 'link': arr[2], 'type': arr[3], 'base': arr[4]}
        elif len(arr) == 4:
            name = BeautifulSoup(arr[0], "lxml").text
            childs = {}
            if isinstance(arr[1], str):
                result[name] = {'link': arr[1], 'type': arr[2], 'base': arr[3]}
            else:
                for item in arr[1]:
                    childs.update(self.makeNodes(item))
                result[name] = {'child': childs, 'link': arr[2], 'type': arr[3]}
        elif len(arr) == 3:
            name = BeautifulSoup(arr[0], "lxml").text
            result[name] = {'link': arr[1], 'type': arr[2]}
        else:
            self.logger.info(arr)
        return result

    def getNodeCode(self, names):
        node_ret = {}
        nodes = self.nodes
        for name in names:
            for node in nodes:
                if node == name:
                    node_ret = nodes[node]
                    if 'child' in node_ret.keys():
                        nodes = node_ret['child']
        return node_ret['type']

    def resolveTree(self, dd):
        if len(dd.contents) == 2:
            name = dd.contents[1].text
        elif len(dd.contents) == 1:
            name = dd.a.text
        else:
            return {}
        div = dd.find('div')
        childs = {}
        if div is not None:
            for dd in div.dl.children:
                childs.update(self.resolveTree(dd))
        return {name: childs}

    def resolveTreeArray(self, data, headers, trees):
        if headers is None:
            headers = []
        if trees is None:
            trees = []
        if len(data.keys()) > 0:
            for i in data.keys():
                headers.append(i)
                self.resolveTreeArray(data[i], headers, trees)
                headers.pop()
        else:
            trees.append(list(headers))
        return trees

    def resolveNavTree(self, data):
        soup = BeautifulSoup(data, "lxml")
        tree = soup.find('div', class_='navtree')
        trees = []
        lstH3 = None
        for node in tree:
            if node.name == 'ul':
                name = lstH3.a.text
                childs = {}
                lstH3 = None
                for li in node.children:
                    childs.update(self.resolveTree(li))
                treeArray = self.resolveTreeArray(childs, [], [])
                if len(treeArray) > 0:
                    for i in treeArray:
                        trees.append([name] + i)
                else:
                    trees.append([name])
            elif node.name == 'h3':
                if lstH3 is not None:
                    name = lstH3.a.text
                    trees.append([name])
                lstH3 = node
        return trees

    def parseNodes(self, response):
        data = response.body.decode('gb2312')
        # The payload escapes single quotes; undo that before parsing.
        replaceData = re.sub(r"\\'", r"'", data)
        js = json.loads(replaceData)
        self.nodes = self.makeNodes(js)
        # The nav tree is read from a locally cached copy of the page.
        fp = open("navtree.txt", 'r')
        data = fp.read()
        fp.close()
        trees = self.resolveNavTree(data)
        for tree in trees:
            code = self.getNodeCode(['行情中心'] + tree)  # "Quote Center" root node
            param = self.getCount_request(code, tree)
            yield scrapy.Request(param['url'], meta=param['meta'], callback=self.parseCount)
            break  # debug: only the first node for now

    def parseCount(self, response):
        data = response.body.decode('gb2312')
        if data == "null":
            return
        count = data[13:][:-3]
        if count.strip() == '' or len(count) == 0:
            return
        tag = response.meta['tag']
        node = response.meta['node']
        param = self.getData_request(node, tag, 1, int(count))
        yield scrapy.Request(param['url'], meta=param['meta'], callback=self.parseData)

    def parseData(self, response):
        self.logger.info(response.url)
        data = response.body.decode('gb2312')
        data = demjson.decode(data)
        self.logger.info(len(data))
        # Sample record:
        # {symbol:"sz300711", code:"300711", name:"广哈通信", trade:"19.400",
        #  pricechange:"0.210", changepercent:"1.094", buy:"19.390", sell:"19.400",
        #  settlement:"19.190", open:"19.190", high:"19.520", low:"18.740",
        #  volume:2857915, amount:54821946, ticktime:"15:00:03", per:40.417,
        #  pb:4.974, mktcap:279740.15076, nmc:88562.94, turnoverratio:6.26036}
        # symbol: full code         code: numeric code    name: short name
        # trade: latest price       pricechange: change   changepercent: change (%)
        # buy: bid                  sell: ask             settlement: previous close
        # open: open                high: high            low: low
        # volume: volume            amount: turnover      mktcap: total market cap
        # nmc: floating market cap  ticktime: time        pb: P/B ratio
        # turnoverratio: turnover ratio
        for each in data:
            item = each
            item['date'] = getLastClosingDate()
            item['type'] = 'DayClosingData'
            # yield item

            # Real-time data
            # yield scrapy.Request('http://hq.sinajs.cn/list=' + item['symbol'], meta=item, callback=self.parseNewData)

            code = item['code']
            symbol = item['symbol']

            # 5-minute bars
            meta = {
                'symbol': symbol,  # code
                'scale': '5',      # minute interval: 5, 15, 30, 60
                'ma': '5',         # moving average: 5, 10, 15, 20, 25
                'count': '1023'    # number of records
            }
            url = Template('http://money.finance.sina.com.cn/quotes_service/api/json_v2.php/CN_MarketData.getKLineData?symbol=${symbol}&scale=${scale}&ma=${ma}&datalen=${count}')
            # yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseMinuteData)

            # Historical trades by quarter
            meta = {
                'symbol': symbol,  # code
                'code': code,      # code
                'year': '2018',
                'quarter': '1'     # quarter: 1, 2, 3, 4
            }
            url = Template('http://money.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/${code}.phtml?year=${year}&jidu=${quarter}')
            # yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseQuarterData)

            # Historical trade-detail endpoint; needs a trading day as input
            meta = {
                'symbol': symbol,
                'date': time.strftime("%Y-%m-%d", time.localtime(time.time())),
                'page': 0
            }
            url = Template('http://market.finance.sina.com.cn/transHis.php?symbol=${symbol}&date=${date}&page=${page}')
            yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseDetailsData)

            # Capital flow
            url = "http://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php/MoneyFlow.ssi_ssfx_flzjtj?format=text&daima=" + symbol
            # yield scrapy.Request(url, meta=meta, callback=self.parseCapitalFlow)

            # https://blog.csdn.net/woloqun/article/details/80734088
            # Financial report data
            url = Template("http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_FinanceSummary/stockid/${code}.phtml?qq-pf-to=pcqq.c2c")
            meta = {'code': code, 'symbol': symbol}
            # yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseFinanceSummaryData)

            # https://blog.csdn.net/luanpeng825485697/article/details/78442062?locationNum=5&fps=1
            # Tencent stock data
            # Intraday chart
            url = Template('http://data.gtimg.cn/flashdata/hushen/minute/${symbol}.js?maxage=${maxage}&${random}')
            meta = {'symbol': symbol, 'maxage': '110', 'random': random.random()}
            # yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseTencentMinuteData)

            # 5-day intraday chart
            url = Template('http://data.gtimg.cn/flashdata/hushen/4day/${tag}/${symbol}.js?maxage=${maxage}&visitDstTime=${visitDstTime}')
            meta = {'symbol': symbol, 'tag': symbol[0:2], 'maxage': '110', 'visitDstTime': 1}
            # yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseTencentDayData)

            # Daily K line
            url = Template('http://data.gtimg.cn/flashdata/hushen/latest/daily/${symbol}.js?maxage=${maxage}&visitDstTime=${visitDstTime}')
            meta = {'symbol': symbol, 'maxage': '43201', 'visitDstTime': 1}
            # yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseTencentDayKData)

            # Daily K line for a given year (two-digit year in the path)
            url = Template('http://data.gtimg.cn/flashdata/hushen/daily/${year}/${symbol}.js?visitDstTime=${visitDstTime}')
            meta = {'symbol': symbol, 'year': '2017'[-2:], 'visitDstTime': 1}
            # yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseTencentYearDayKData)

            # Weekly K line
            url = Template('http://data.gtimg.cn/flashdata/hushen/latest/weekly/${symbol}.js?maxage=${maxage}&visitDstTime=${visitDstTime}')
            meta = {'symbol': symbol, 'maxage': '43201', 'visitDstTime': 1}
            # yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseTencentWeekKData)

            # Monthly K line
            url = Template('http://data.gtimg.cn/flashdata/hushen/monthly/${symbol}.js?maxage=${maxage}')
            meta = {'symbol': symbol, 'maxage': '43201', 'visitDstTime': 1}
            # yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseTencentMonthKData)

            # Trade details, column list
            url = Template('http://stock.gtimg.cn/data/index.php?appn=detail&action=timeline&c=${symbol}')
            meta = {'symbol': symbol}
            # yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseTencentClosingDetailsListData)

            # Trade details
            url = Template('http://stock.gtimg.cn/data/index.php?appn=detail&action=data&c=${symbol}&p=${page}')
            meta = {'symbol': symbol, 'page': 0, 'date': '20180413'}
            # yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseTencentClosingDetailsData)

            # Large-order data
            # opt=10,11,12,13 selects turnover >= (1m, 2m, 5m, 10m CNY)
            # opt=1..8 selects volume >= (100, 200, 300, 400, 500, 800, 1000, 1500, 2000 lots)
            url = Template('http://stock.finance.qq.com/sstock/list/view/dadan.php?t=js&c=${symbol}&max=${max}&p=${page}&opt=${opt}&o=${o}')
            meta = {'symbol': symbol, 'max': 80, 'page': 0, 'opt': 10, 'o': 0}
            # yield scrapy.Request(url.substitute(meta), meta=meta, callback=self.parseTencentLargeSingleData)

            break  # debug: only the first record

        # Other endpoints of interest:
        # https://hq.sinajs.cn/?_=1554047924366&list=ml_sh600100
        # Announcements:
        # https://vip.stock.finance.sina.com.cn/api/jsonp.php/var%20noticeData=/CB_AllService.getMemordlistbysymbol?num=8&PaperCode=600100
        # ttps://news.sinajs.cn/rn=1554047925361&maxcnt=20&scnt=20&list=sh600100,gg_sh600100,ntc_sh600100,blog_sh600100,tg_sh600100,lcs_sh600100
        # ttps://vip.stock.finance.sina.com.cn/quotes_service/api/jsonp.php/var%20moneyFlowData=/MoneyFlow.ssi_ssfx_flzjtj?daima=sh600100&gettime=1
        # https://finance.sina.com.cn/realstock/company/sh600100/hisdata/klc_kl.js?d=2019_4_1
        return  # debug: the pagination below is currently unreachable

        node = response.meta['node']
        tag = response.meta['tag']
        count = int(response.meta['count'])
        page = int(response.meta['page'])
        if page * 80 < count:
            param = self.getData_request(node, tag, page + 1, count)
            yield scrapy.Request(param['url'], meta=param['meta'], callback=self.parseData)

    def parseNewData(self, response):
        self.logger.info(response.body)
        symbol = response.meta['symbol']
        code = response.meta['code']
        data = response.body.decode('gb2312')
        index = data.find('"')
        data = data[index:-1][0:-2]
        each = data.split(',')
        item = {}
        item['type'] = 'real'
        item['symbol'] = symbol
        item['code'] = code
        item['name'] = each[0]
        item['open'] = each[1]
        item['settlement'] = each[2]
        item['trade'] = each[3]
        item['high'] = each[4]
        item['low'] = each[5]
        # each[6]: bid 1, each[7]: ask 1
        item['volume'] = each[8]
        item['amount'] = each[9]
        item['buys'] = each[10:19]
        item['sells'] = each[20:29]
        item['date'] = each[30]
        item['time'] = each[31]
        yield item

    def parseMinuteData(self, response):
        symbol = response.meta['symbol']
        scale = response.meta['scale']
        data = response.body.decode('gb2312')
        js = demjson.decode(data)
        for each in js:
            item = {}
            item['type'] = 'minute_' + str(scale)
            item['symbol'] = symbol
            item['date'] = each['day']
            item['open'] = each['open']
            item['high'] = each['high']
            item['low'] = each['low']
            item['close'] = each['close']
            item['volume'] = each['volume']
            yield item

    def parseQuarterData(self, response):
        symbol = response.meta['symbol']
        self.logger.info(response.url)
        data = response.body.decode('gb2312')
        data = data.replace('\r', '').replace('\n', '').replace('\t', '')
        soup = BeautifulSoup(data, "lxml")
        table = soup.find("table", id='FundHoldSharesTable')
        tds = table.find_all('td')
        for i in range(0, len(tds), 7):
            if i == 0:
                continue  # skip the header row
            item = {}
            item['type'] = 'quarter'
            item['symbol'] = symbol
            item['date'] = tds[i + 0].get_text()
            item['open'] = tds[i + 1].get_text()
            item['high'] = tds[i + 2].get_text()
            item['low'] = tds[i + 3].get_text()
            item['close'] = tds[i + 4].get_text()
            item['volume'] = tds[i + 5].get_text()
            item['amount'] = tds[i + 6].get_text()
            yield item

    def parseDetailsData(self, response):
        symbol = response.meta['symbol']
        date = response.meta['date']
        self.logger.info(response.url)
        data = response.body.decode('gb2312')
        self.logger.info(data)
        soup = BeautifulSoup(data, "lxml")
        table = soup.find('table', class_='datatbl')
        if table is None:
            return
        trs = table.find_all('tr')
        index = 0
        for tr in trs:
            index += 1
            if index == 1:
                continue  # skip the header row
            item = {}
            item['type'] = 'TencentClosingDetails'
            item['symbol'] = symbol
            item['date'] = date
            item['time'] = tr.contents[0].get_text()
            item['trade'] = tr.contents[1].get_text()
            item['pricechange'] = tr.contents[2].get_text()
            item['volume'] = tr.contents[3].get_text()
            item['amount'] = tr.contents[4].get_text()
            item['nature'] = tr.contents[5].get_text()
            yield item

    def parseCapitalFlow(self, response):
        symbol = response.meta['symbol']
        data = response.body.decode('gb2312')
        data = data[1:-1]
        self.logger.info(data)
        item = demjson.decode(data)
        item['type'] = 'capitalflow'
        item['symbol'] = symbol
        item['date'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime(time.time()))
        yield item

    # Financial summary
    def parseFinanceSummaryData(self, response):
        symbol = response.meta['symbol']
        self.logger.info(response.url)
        data = response.body.decode('gb2312')
        soup = BeautifulSoup(data, "lxml")
        table = soup.find("table", id='FundHoldSharesTable')
        tds = table.find_all('td')
        item = {'type': 'financesummary', 'symbol': symbol, 'data': []}
        name = ''
        for td in tds:
            if td.get_text() == '截止日期':  # "as-of date" cell starts a new record
                yield item
                item = {'type': 'financesummary', 'symbol': symbol, 'data': []}
            if name == '':
                name = td.get_text().replace('-', '')
            else:
                value = td.get_text()
                self.logger.info(value)
                if value == '\xa0':
                    value = ''
                if name == '截止日期':
                    item['date'] = td.get_text()
                item['data'].append({'name': name, 'value': value})
                name = ''
        if len(item['data']) > 0:
            yield item

    # Tencent data
    def parseTencentMinuteData(self, response):
        symbol = response.meta['symbol']
        self.logger.info(response.url)
        data = response.body.decode('gb2312')
        data = data[10:-2]
        strs = data.split('\\n\\\n')
        item = {'type': 'TencentMinute', 'symbol': symbol, 'data': []}
        for each in strs:
            if each == '':
                continue
            its = each.split(' ')
            if len(its) == 1:
                ds = each.split(':')
                item['date'] = '20' + ds[1]
            else:
                item['data'].append(its)
        yield item

    def parseTencentDayData(self, response):
        symbol = response.meta['symbol']
        data = response.body.decode('gb2312')
        data = data[16:-1]
        js = demjson.decode(data)
        for each in js:
            item = {}
            item['type'] = 'TencentDay'
            item['symbol'] = symbol
            item['date'] = each['date']
            item['prec'] = each['prec']
            item['data'] = []
            its = each['data'].split('^')
            for it in its:
                lts = it.split('~')
                item['data'].append(lts)  # fixed: the original appended the unsplit `its` here
            yield item

    def parseTencentDayKData(self, response):
        symbol = response.meta['symbol']
        data = response.body.decode('gb2312')
        data = data[19:-2]
        strs = data.split('\\n\\\n')
        for each in strs:
            if each == '':
                continue
            its = each.split(' ')
            if len(its) > 6:
                # Meaning unclear for now; looks like a record-count header, e.g.
                # "100 total:3871 start:021009 02:59 03:239 04:241 05:241 06:217
                #  07:240 08:238 09:241 10:240 11:243 12:242 13:238 14:245 15:244
                #  16:244 17:149 18:243 19:67"
                ds = each.split(':')
            else:
                item = {}
                item['symbol'] = symbol
                item['type'] = 'TencentDayK'
                item['date'] = '20' + its[0]
                # prev close, latest, high, low, volume
                item['settlement'] = its[1]
                item['trade'] = its[2]
                item['high'] = its[3]
                item['low'] = its[4]
                item['volume'] = its[5]
                yield item

    def parseTencentYearDayKData(self, response):
        symbol = response.meta['symbol']
        data = response.body.decode('gb2312')
        data = data[15:-2]
        strs = data.split('\\n\\\n')
        for each in strs:
            if each == '':
                continue
            its = each.split(' ')
            item = {}
            item['symbol'] = symbol
            item['type'] = 'TencentYearDayK'
            item['date'] = '20' + its[0]
            # prev close, latest, high, low, volume
            item['settlement'] = its[1]
            item['trade'] = its[2]
            item['high'] = its[3]
            item['low'] = its[4]
            item['volume'] = its[5]
            yield item

    def parseTencentWeekKData(self, response):
        symbol = response.meta['symbol']
        data = response.body.decode('gb2312')
        data = data[20:-2]
        strs = data.split('\\n\\\n')
        for each in strs:
            if each == '':
                continue
            its = each.split(' ')
            if len(its) != 6:
                self.logger.info(each)
                ds = re.split('[ :]', each)
                self.logger.info(ds)
            else:
                item = {}
                item['symbol'] = symbol
                item['type'] = 'TencentWeekK'
                item['date'] = '20' + its[0]
                # prev close, latest, high, low, volume
                item['settlement'] = its[1]
                item['trade'] = its[2]
                item['high'] = its[3]
                item['low'] = its[4]
                item['volume'] = its[5]
                yield item

    def parseTencentMonthKData(self, response):
        symbol = response.meta['symbol']
        self.logger.info(response.url)
        data = response.body.decode('gb2312')
        data = data[14:-2]
        strs = data.split('\\n\\\n')
        for each in strs:
            if each == '':
                continue
            its = each.split(' ')
            item = {}
            item['symbol'] = symbol
            item['type'] = 'TencentMonthK'
            item['date'] = '20' + its[0]
            # prev close, latest, high, low, volume
            item['settlement'] = its[1]
            item['trade'] = its[2]
            item['high'] = its[3]
            item['low'] = its[4]
            item['volume'] = its[5]
            yield item

    def parseTencentClosingDetailsListData(self, response):
        symbol = response.meta['symbol']
        self.logger.info(response.url)
        data = response.body.decode('gb2312')
        data = data[15 + len(str(symbol)):]
        js = demjson.decode(data)
        data = re.split('[|]', js[1])
        self.logger.info(js[0])
        self.logger.info(len(data))

    def parseTencentClosingDetailsData(self, response):
        symbol = response.meta['symbol']
        date = response.meta['date']
        self.logger.info(response.url)
        data = response.body.decode('gb2312')
        data = data[15 + len(str(symbol)):]
        js = demjson.decode(data)
        data = re.split('[/_]', js[1])
        for i in range(0, len(data), 7):
            item = {}
            item['type'] = 'TencentClosingDetails'
            item['symbol'] = symbol
            item['date'] = date
            item['time'] = data[i + 1]
            item['index'] = data[i]
            item['trade'] = data[i + 2]
            item['pricechange'] = data[i + 3]
            item['volume'] = data[i + 4]
            item['amount'] = data[i + 5]
            if data[i + 6] == 'S':
                item['nature'] = '卖盘'  # sell side
            elif data[i + 6] == 'B':
                item['nature'] = '买盘'  # buy side
            else:
                self.logger.warning(data[i + 6])
            yield item

    def parseTencentLargeSingleData(self, response):
        symbol = response.meta['symbol']
        self.logger.info(response.url)
        data = response.body.decode('gb2312')
        data = data[15 + len(str(symbol)):]
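# Why demjson.decode rather than json.loads throughout this spider: the
# Sina/Tencent endpoints return JavaScript object literals with unquoted keys,
# which strict JSON rejects. A quick illustration (record shape taken from the
# parseData sample above):
if __name__ == '__main__':
    raw = '[{symbol:"sz300711",code:"300711",trade:"19.400"}]'
    print(demjson.decode(raw)[0]['symbol'])  # prints: sz300711
    # json.loads(raw) would raise json.JSONDecodeError on the unquoted keys.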
import json
import time
from logging import getLogger

import scrapy

# NOTE: MergeParam is a project-local helper (it builds the query string from
# an ordered key list and a value dict); its import is not shown in the
# original source.


class SharesSpider(scrapy.Spider):
    name = 'shares'
    allowed_domains = ['sse.com.cn']
    logger = getLogger('Shanghai')
    url = "http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/equity"
    snap = "http://yunhq.sse.com.cn:32041/v1/sh1/snap/"
    line = 'http://yunhq.sse.com.cn:32041/v1/sh1/line/'
    kline = 'http://yunhq.sse.com.cn:32041/v1/sh1/dayk/'
    stock = 'http://query.sse.com.cn/security/stock/getStockListData2.do'

    def start_requests(self):
        begin = 0
        param = self.getCode_requests(begin)
        yield scrapy.Request(param['url'], meta=param['meta'], callback=self.parseCodes)

    def getStock_requests(self, code, page, stock):
        keys = [
            "isPagination", "stockCode", "csrcCode", "areaName", "stockType",
            "pageHelp.cacheSize", "pageHelp.beginPage", "pageHelp.pageSize",
            "pageHelp.pageNo", "_"
        ]
        # stock: 1 = A shares, 2 = B shares
        values = {
            "isPagination": True,
            "stockCode": code,
            "csrcCode": "",
            "areaName": "",
            "stockType": stock,
            "pageHelp.cacheSize": 1,
            "pageHelp.beginPage": page,
            "pageHelp.pageSize": 25,
            "pageHelp.pageNo": page,
            "_": int(time.time() * 1000)
        }
        url = self.stock + "?" + MergeParam(keys, values)
        return {'url': url, 'meta': values}

    # Stock codes
    def getCode_requests(self, begin):
        keys = ["callback", "select", "order", "begin", "end", "_"]
        values = {
            "callback": "",
            "select": "code,name,open,high,low,last,prev_close,chg_rate,volume,amount,tradephase,change,amp_rate",
            "order": "",
            "begin": begin,
            "end": begin + 25,
            "pageno": 25,
            "_": int(time.time() * 1000)
        }
        url = self.url + "?" + MergeParam(keys, values)
        return {'url': url, 'meta': values}

    def parseCodes(self, response):
        data = response.body[1:][:-1].decode('gbk')
        js = json.loads(data)
        for each in js['list']:
            item = {}
            item['type'] = 'code'
            item['date'] = js["date"]
            item['time'] = js["time"]
            # Columns: code, name, open, high, low, latest, previous close,
            # change, volume, turnover, trade phase (formula T111),
            # change (%), amplitude
            item['code'] = each[0]
            item['name'] = each[1]
            item['open'] = each[2]
            item['high'] = each[3]
            item['low'] = each[4]
            item['trade'] = each[5]
            item['settlement'] = each[6]
            item['pricechange'] = each[7]
            item['volume'] = each[8]
            item['amount'] = each[9]
            item['formula'] = each[10]
            item['changepercent'] = each[11]
            item['amplitude'] = each[12]
            yield item

            param = self.getSnap_requests(each[0])
            yield scrapy.Request(param['url'], meta=param['meta'], callback=self.parseSnap)
            param = self.getLine_requests(each[0])
            yield scrapy.Request(param['url'], meta=param['meta'], callback=self.parseLine)
            param = self.getKLine_requests(each[0])
            yield scrapy.Request(param['url'], meta=param['meta'], callback=self.parseKLine)
            return  # debug: only the first code for now; skips the pagination below

        begin = int(js["begin"])
        end = int(js["end"])
        total = int(js["total"])
        self.logger.info([begin, end, total])
        if end < total:
            param = self.getCode_requests(end)
            yield scrapy.Request(param['url'], meta=param['meta'], callback=self.parseCodes)

    # Real-time trade data
    def getSnap_requests(self, code):
        keys = ["callback", "select", "_"]
        values = {
            "callback": "",
            "select": "name,last,chg_rate,change,amount,volume,open,prev_close,ask,bid,high,low,tradephase",
            "_": int(time.time() * 1000)
        }
        url = self.snap + str(code) + "?" + MergeParam(keys, values)
        return {'url': url, 'meta': values}

    def parseSnap(self, response):
        data = response.body[1:][:-1].decode('gbk')
        js = json.loads(data)
        item = {}
        item['type'] = 'snap'
        item['date'] = js["date"]
        item['time'] = js["time"]
        item['code'] = js["code"]
        # Columns: name, latest price, change (%), change, turnover, volume,
        # open, previous close, 5 asks, 5 bids, high, low, E111 = stock
        each = js['snap']
        item['name'] = each[0]
        item['trade'] = each[1]
        item['pricechange'] = each[2]
        item['changepercent'] = each[3]
        item['amount'] = each[4]
        item['volume'] = each[5]
        item['open'] = each[6]
        item['settlement'] = each[7]
        item['high'] = each[10]
        item['low'] = each[11]
        item['amplitude'] = each[12]
        item['sells'] = each[8]
        item['buys'] = each[9]
        yield item

    # Intraday line
    def getLine_requests(self, code):
        # Example request:
        # http://yunhq.sse.com.cn:32041/v1/sh1/line/000001?
        #   callback=jQuery111202870652140273666_1553179236606
        #   &begin=0&end=-1&select=time%2Cprice%2Cvolume&_=1553179236612
        keys = ["callback", "begin", "end", "select", "_"]
        values = {
            "callback": "",
            "begin": 0,
            "end": -1,
            "select": "time,price,volume",
            "_": int(time.time() * 1000)
        }
        url = self.line + str(code) + "?" + MergeParam(keys, values)
        return {'url': url, 'meta': values}

    def parseLine(self, response):
        data = response.body[1:][:-1].decode('gbk')
        js = json.loads(data)
        # code: "600000"     code
        # highest: 11.59     high
        # lowest: 11.44      low
        # prev_close: 11.55  previous close
        # begin: 0  end: 241  total: 241
        # date: 20190321  time: 154508
        item = {}
        item['type'] = 'line'
        item['date'] = js["date"]
        item['time'] = js["time"]
        item['code'] = js["code"]
        item['settlement'] = js['prev_close']
        item['high'] = js['highest']
        item['low'] = js['lowest']
        # js['line'] rows: time, price, volume
        item['line'] = js['line']
        yield item

    # Daily K line
    def getKLine_requests(self, code):
        keys = ["callback", "select", "begin", "end", "_"]
        values = {
            "callback": "",
            "begin": 0,
            "end": -1,
            "select": "date,open,high,low,close,volume",
            "_": int(time.time() * 1000)
        }
        url = self.kline + str(code) + "?" + MergeParam(keys, values)
        return {'url': url, 'meta': values}

    def parseKLine(self, response):
        data = response.body[1:][:-1].decode('gbk')
        js = json.loads(data)
        # code: "600000"  begin: 4273  end: 4572  total: 4572
        item = {}
        item['type'] = 'kline'
        item['code'] = js['code']
        # js["kline"] rows: date, open, high, low, close, volume
        for each in js["kline"]:
            item['date'] = each[0]
            item['open'] = each[1]
            item['high'] = each[2]
            item['low'] = each[3]
            item['trade'] = each[4]
            item['volume'] = each[5]
            yield item
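# MergeParam is referenced above but not defined in this file. A plausible
# sketch of what it does (an assumption, not the project's actual helper):
# join the listed keys in order into a query string, URL-encoding each value.
from urllib.parse import quote


def merge_param_sketch(keys, values):
    """Hypothetical stand-in for the project's MergeParam helper."""
    return '&'.join(k + '=' + quote(str(values.get(k, ''))) for k in keys)

# e.g. merge_param_sketch(["begin", "end"], {"begin": 0, "end": 25})
#      -> 'begin=0&end=25'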