def parse(self, response):
    """Parse the promotion-container product list.

    Yields one FundsInfoItem per container div with name, id, rate,
    purchase floor and investment period.
    """
    indatas = response.xpath('//div[@class="ebdp-pc4promote-circularcontainer"]')
    print(len(indatas))  # was a Python 2 print statement
    for data in indatas:
        item = FundsInfoItem()
        head = './div[@class="ebdp-pc4promote-circularcontainer-head"]/span/span/a'
        item["pname"] = data.xpath(head + '/text()').extract()[0]
        # pid is embedded in a javascript-style href: ...('<id>', ...)
        item["pid"] = data.xpath(head + '/@href').extract()[0].split(
            '(')[-1].split(',')[0].strip('\'')
        temp = data.xpath('./div[@class="ebdp-pc4promote-circularcontainer-content"]/table/tbody/tr/td')
        if len(temp) == 5:
            item["prate"] = temp[0].xpath('./div/div')[1].xpath('./text()').extract()[0]
            # floor = bold value + unit suffix, both from the same cell
            item["pfloor"] = (temp[1].xpath('./div/div')[1].xpath('./b/text()').extract()[0]
                              + temp[1].xpath('./div/div')[1].xpath('./text()').extract()[0])
            item["pperiod"] = temp[2].xpath('./div/div')[1].xpath('string(.)').extract()[0]
        elif len(temp) == 6:
            # 6-column layout: every field shifts right by one column.
            item["prate"] = temp[1].xpath('./div/div')[1].xpath('./text()').extract()[0]
            # BUGFIX: the unit suffix was read from temp[1] (the rate column)
            # while the bold value came from temp[2]; both belong to temp[2],
            # mirroring the 5-column branch where both parts come from one cell.
            item["pfloor"] = (temp[2].xpath('./div/div')[1].xpath('./b/text()').extract()[0]
                              + temp[2].xpath('./div/div')[1].xpath('./text()').extract()[0])
            item["pperiod"] = temp[3].xpath('./div/div')[1].xpath('string(.)').extract()[0]
        yield item
def parsexk(self, response):
    """Parse the "new customer" wealth-management product list."""
    print("新客理财爬取")
    xklc = response.xpath('//div[@class="xklc_con"]')
    print(len(xklc))
    for product in xklc:
        item = FundsInfoItem()
        item["pid"] = product.xpath(
            './div/div/div[@class="xklc_cptab"]/ul[@class="tb2 fl"]/li'
        )[0].xpath('normalize-space(string(.))').extract()[0]
        item["pname"] = product.xpath(
            './div/div/div[@class="xklc_title"]/text()').extract()[0]
        # The three fields below are best-effort: some products omit them.
        # Only IndexError (missing node / empty extract) is expected here;
        # the previous bare `except:` also hid real bugs.
        try:
            item["prate"] = product.xpath(
                './div/div/div[@class="xklc_sz"]/div')[0].xpath(
                    'normalize-space(string(.))').extract()[0]
        except IndexError:
            pass
        try:
            item["pperiod"] = product.xpath(
                './div/div/div[@class="xklc_cptab"]/ul[@class="tb2 fl"]'
            )[1].xpath('./li')[1].xpath(
                'normalize-space(string(.))').extract()[0]
        except IndexError:
            pass
        try:
            item["pfloor"] = product.xpath(
                './div/div/div[@class="xklc_cptab"]/ul[@class="tb2 fl"]'
            )[1].xpath('./li')[0].xpath(
                'normalize-space(string(.))').extract()[0]
        except IndexError:
            pass
        yield item
def parse(self, response):
    """Parse a product table: one item per <tbody> row."""
    indatas = response.xpath('//tbody/tr')
    for data in indatas:
        item = FundsInfoItem()
        tds = data.xpath('./td')  # hoisted: was re-queried for every field
        # The name cell sometimes wraps the text in an <a>; fall back to
        # the bare cell text when no link is present. Narrowed from a
        # bare `except:` to the IndexError that a missing <a> produces.
        try:
            item["pname"] = tds[0].xpath('./a/text()').extract()[0]
        except IndexError:
            item["pname"] = tds[0].xpath('./text()').extract()[0]
        # pid is encoded in the last cell's image path: .../lccp<ID>.<ext>
        item["pid"] = tds[-1].xpath(
            './img/@src').extract()[0].split('lccp')[-1].split('.')[0]
        item["prate"] = tds[6].xpath('./text()').extract()[0]
        item["pfloor"] = tds[5].xpath('./text()').extract()[0]
        item["pperiod"] = tds[4].xpath('./text()').extract()[0]
        yield item
def parse(self, response):
    """Parse the JSON product list and follow each product's PDF page.

    The PDF detail request is dispatched to self.get_scale; if the
    request cannot be built, the bare item is yielded as a fallback.
    """
    datas = json.loads(response.body)['content']['resultList']
    for data in datas:
        item = FundsInfoItem()
        item["pid"] = data['prdNo']
        item["pname"] = data['prdName']
        item["prate"] = data['incomerate']
        item["pperiod"] = data['dayDeadLine']
        item["pfloor"] = data['firstAmt']
        pdfUrl = ('https://etrade.citicbank.com/portalweb/findoc/'
                  + str(item["pid"]) + '00.html')
        item["pscale"] = pdfUrl
        try:
            yield Request(url=pdfUrl,
                          method='GET',
                          meta={"item": item},
                          callback=self.get_scale,
                          errback=self.errors)
        except Exception:  # was a bare except; keep the best-effort fallback
            yield item
def parsein(self, response):
    """Parse a detail page laid out as <th>label</th><td>value</td> rows.

    A single item is accumulated across all rows and yielded once.
    """
    indatas = response.xpath('//tr')
    item = FundsInfoItem()
    for data in indatas:
        labels = data.xpath('./th/text()').extract()
        if not labels:
            # Rows without a <th> used to raise IndexError; skip them.
            continue
        label = labels[0]
        # normalize-space() always yields exactly one string (possibly
        # empty), so this extract()[0] cannot fail.
        value = data.xpath('normalize-space(./td/text())').extract()[0]
        if u'产品名称' in label:          # product name
            item["pname"] = value
        elif u'产品代码' in label:        # product code
            item["pid"] = value
        elif u'预计年化收益率' in label:  # expected annualized rate
            item["prate"] = value
        elif u'起点金额' in label:        # purchase floor
            item["pfloor"] = value
        elif u'投资期限' in label:        # investment period
            item["pperiod"] = value
    yield item
def parse(self, response):
    """Yield one FundsInfoItem per row of the JSON payload."""
    # item field -> JSON key in each row
    field_map = {
        "pid": 'finance_no',
        "pname": 'finance_allname',
        "prate": 'finance_anticipate_rate',
        "pperiod": 'finance_lmttime_info',
        "pfloor": 'finance_indi_ipominamnt',
    }
    for row in json.loads(response.text)['rows']:
        item = FundsInfoItem()
        for item_key, json_key in field_map.items():
            item[item_key] = row[json_key]
        yield item
def parsedq(self, response):
    """Yield one item per fixed-term product in the JSON response."""
    products = json.loads(response.body)["data"]["compFinancialProducts"]
    for product in products:
        item = FundsInfoItem()
        item["pid"] = product["prdCode"]
        item["pname"] = product["prdName"]
        item["prate"] = product["indexContent"]
        item["pperiod"] = product["investTerm"]
        item["pfloor"] = product["minInvestAmount"]
        yield item
def parse(self, response):
    """Parse an XML listing: one <Table> element per product."""
    # item field -> child tag inside each <Table>
    tag_map = (("pid", 'ProductNo'),
               ("pname", 'ProdName'),
               ("prate", 'ProdProfit'),
               ("pperiod", 'ProdLimit'),
               ("pfloor", 'PurStarAmo'))
    for table in response.xpath('//Table'):
        item = FundsInfoItem()
        for field, tag in tag_map:
            item[field] = table.xpath('./{}/text()'.format(tag)).extract()[0]
        yield item
def parse(self, response):
    """Parse the JSON product list; the period is rendered as "<n>天"."""
    print('响应开始:')
    print(response.text)
    for entry in json.loads(response.text)['List']:
        item = FundsInfoItem()
        item["pid"] = entry['PrdCode']
        item["pname"] = entry['PrdName']
        item["prate"] = entry['IncomeRateExt']
        # LiveTime is a bare day count; append the unit.
        item["pperiod"] = str(entry['LiveTime']) + '天'
        item["pfloor"] = entry['PfirstAmt']
        yield item
def parse(self, response):
    """Scrape "永乐" series products out of raw HTML with a regex.

    Match groups: (name, id, rate, floor). No period is available on
    this page, so a fixed placeholder is used.
    """
    page = response.text
    # BUGFIX: the pattern was built as r'...'.decode('utf-8'), which is
    # Python 2 only — on Python 3 (used elsewhere in this project, e.g.
    # print() calls) str has no .decode and this raised AttributeError.
    # response.text is already unicode, so use the pattern directly.
    reg = re.compile(r'(永乐\d号[^<]*).*(\w\w\d\d\d\d).*(\d\.\d\d%).*起点金额(.*)[\S]上限')
    for data in reg.findall(page):
        item = FundsInfoItem()
        item["pid"] = data[1]
        item["pname"] = data[0]
        item["prate"] = data[2]
        item["pperiod"] = u'未找到投资期限'
        item["pfloor"] = data[3]
        yield item
def parse(self, response):
    """Strip the JSONP wrapper and yield one item per product.

    A 0.0 yield rate is replaced by the rate scraped from the
    product's detail page via get_ccb_detail_rate.
    """
    body = response.text
    # Payload has the shape jsonpCallback(<json>); keep only <json>.
    payload_start = re.search('jsonpCallback', body).end() + 1
    prod_list = json.loads(body[payload_start:-1])['ProdList']
    for prod in prod_list:
        rate = prod["yieldRate"]
        if rate == 0.0:
            # The list endpoint reports 0.0 for some products; try the
            # detail page for the real number.
            rate = self.get_ccb_detail_rate(prod['code'])
        item = FundsInfoItem()
        item["pid"] = prod['code']
        item["pname"] = prod['name']
        item["prate"] = rate
        item["pperiod"] = prod['investPeriod']
        item["pfloor"] = prod['purFloorAmt']
        yield item
def parse(self, response):
    """Scrape product fields out of raw HTML with one combined regex.

    Each match is a 6-tuple assigned as:
    (name, id, rate, period, floor-value, floor-unit).
    """
    pattern = re.compile(
        r'ft">(.*)<span>(.*)</span>[\s\S]{1,1000}value="(.*)"\sna[\s\S]{1,2900}font"[^<>]*>([^<>]*)</span>[\s\S]'
        r'{1,2900}<td class="bot"><span class="font" >(.*)</span><span class="grey">(.*)</span></td>')
    funds_info = pattern.findall(response.text)
    print(funds_info)
    print(len(funds_info))
    for pname, pid, prate, pperiod, floor_val, floor_unit in funds_info:
        item = FundsInfoItem()
        item["pid"] = pid
        item["pname"] = pname
        item["prate"] = prate
        item["pperiod"] = pperiod
        item["pfloor"] = floor_val + floor_unit
        yield item
def parse(self, response):
    """Parse JSON rows; the rate is normalized to end in a single '%'."""
    datas = json.loads(response.text)['rows']
    for data in datas:
        item = FundsInfoItem()
        item["pid"] = data['finance_no']
        item["pname"] = data['finance_allname']
        # Normalize the rate: drop whitespace and any percent signs, then
        # append exactly one '%'. BUGFIX: the old code tested
        # `prate[-1] != '%'` AFTER stripping every '%', so the test was
        # always true, and it raised IndexError on an empty rate string.
        rate = data['finance_anticipate_rate'].replace('%', '').replace(
            '\r', '').replace('\n', '').replace('\t', '')
        if rate:
            rate += '%'
        item["prate"] = rate
        item["pperiod"] = data['finance_lmttime_info']
        item["pfloor"] = data['finance_indi_ipominamnt']
        yield item
def parsehq(self, response):
    """Parse recommended products grouped by recommendation area.

    The floor amount lives under either finaSaleStatusInfo or
    fundSaleStatusInfo depending on the product kind; items missing
    both simply get no pfloor, matching the original behavior.
    """
    areas = json.loads(response.body)["data"]["recommendAreas"]
    for area in areas:
        for data in area["recommendProducts"]:
            item = FundsInfoItem()
            item["pid"] = data["prdCode"] + ","
            item["pname"] = data["recommendName"]
            item["prate"] = data["newIndexContent"]
            item["pperiod"] = data["recommendType"]
            item["pscale"] = data["redirectUrl"]
            # BUGFIX: dict.has_key() was removed in Python 3; use `in`.
            product = data["product"]
            if "finaSaleStatusInfo" in product:
                item["pfloor"] = product["finaSaleStatusInfo"]["minAmount"]
            elif "fundSaleStatusInfo" in product:
                item["pfloor"] = product["fundSaleStatusInfo"]["pfirstAmt"]
            yield item
def parse(self, response):
    """Yield one item per product in returnData.list of the JSON body."""
    product_rows = json.loads(response.text)["returnData"]["list"]
    for row in product_rows:
        item = FundsInfoItem()
        item["pid"] = row['PRD_CODE']
        item["pname"] = row['PRD_NAME']
        item["prate"] = row['NEXT_INCOME_RATE']
        # LIVE_TIME is a bare day count; append the unit.
        item["pperiod"] = str(row['LIVE_TIME']) + u'天'
        item["pfloor"] = row['PFIRST_AMT']
        yield item
def parse(self, response):
    """Parse the XML product list, then fetch each product's detail page.

    Items are completed and yielded by parse_pdf, not here.
    """
    for node in response.xpath('//Table'):
        item = FundsInfoItem()
        item["pid"] = node.xpath('./ProductNo/text()').extract()[0]
        item["pname"] = node.xpath('./ProdName/text()').extract()[0]
        item["prate"] = node.xpath('./ProdProfit/text()').extract()[0]
        item["pperiod"] = node.xpath('./ProdLimit/text()').extract()[0]
        item["pfloor"] = node.xpath('./PurStarAmo/text()').extract()[0]
        detail_url = self.start_urls[1] + '/' + str(item["pid"]) + '.htm'
        yield scrapy.FormRequest(url=detail_url,
                                 method='GET',
                                 meta={"item": item},
                                 callback=self.parse_pdf)
def parse(self, response):
    """Scrape products via per-field regexes with alternation fallbacks.

    Each field pattern has 2-3 alternative capture groups; for every
    product the first non-empty group wins. The page text is flattened
    (all whitespace removed) before matching.
    """
    re_pid = re.compile(r'(2301\d*)</font>')
    re_pname = re.compile(r'<fontclass="autosho[^>]*>([^<]*)|<fontclass="xianjin[^>]*>([^<]*)')
    re_prate = re.compile(r'1-2">([^<]*)<|准-->([^<]*)<')
    re_pperiod = re.compile(r'日-->([^<]*)<|td>([0123456789天月年\-]+)<')
    re_pfloor = re.compile(r'aid[^>]*>([^<]*)|位-->([^<]*)|td>([^<]*万元)')
    res_clean = response.text.replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
    pids = re_pid.findall(res_clean)
    pnames = re_pname.findall(res_clean)
    prates = re_prate.findall(res_clean)
    pperiods = re_pperiod.findall(res_clean)
    pfloors = re_pfloor.findall(res_clean)
    datas = []
    # Align the parallel match lists positionally and resolve each
    # field's alternation groups (first non-empty wins).
    for i in range(len(pids)):
        data = [pids[i], pnames[i][0], prates[i][0], pperiods[i][0], pfloors[i][0]]
        if data[1] == '':
            data[1] = pnames[i][1]
        if data[2] == '':
            data[2] = prates[i][1]
        if data[3] == '':
            data[3] = pperiods[i][1]
        if data[3] == '':
            data[3] = '无'
        if data[4] == '':
            data[4] = pfloors[i][1]
        if data[4] == '':
            data[4] = pfloors[i][2]
        datas.append(data)
    for data in datas:
        item = FundsInfoItem()
        item["pid"] = data[0]
        item["pname"] = data[1]
        # Normalize the rate: strip whitespace/percent signs, append one
        # '%'. BUGFIX: the old `prate[-1] != '%'` check was always true
        # (every '%' had just been stripped) and relied on a try/except
        # to survive empty strings; test emptiness explicitly instead.
        rate = data[2].replace('%', '').replace('\r', '').replace('\n', '').replace('\t', '')
        if rate:
            rate += '%'
        item["prate"] = rate
        item["pperiod"] = data[3]
        item["pfloor"] = data[4]
        yield item
def parseacb(self, response):
    """Parse "安存宝" rows, skipping products whose sale window closed."""
    print("抓取安存宝")
    rows = response.xpath('//tr[@class="acb_table"]')
    print(len(rows))
    for row in rows:
        cells = row.xpath('./td')
        # The second <span> of the third cell holds the sale end time.
        endtime = cells[2].xpath('./span')[1].xpath('string(.)').extract()[0]
        if self.now() >= endtime:
            print(endtime, "已过期")
            continue
        print(endtime, "未过期")
        item = FundsInfoItem()
        item["pname"] = cells[0].xpath('./a/@title').extract()[0]
        item["prate"] = cells[5].xpath('string(.)').extract()[0]
        item["pperiod"] = cells[4].xpath('string(.)').extract()[0]
        item["pfloor"] = (cells[3].xpath('string(.)').extract()[0]
                          + cells[1].xpath('string(.)').extract()[0])
        yield item
def parse_pdf(self, response):
    """Save the product PDF to disk, then read its first-page table.

    Table cell positions (row/column) are fixed by the issuer's PDF
    layout; whitespace inside multi-line cells is squashed.
    """
    print(response.url)
    filename = 'pdf/' + response.url.split('/')[-1]
    # BUGFIX: the file handle and the pdfplumber document were never
    # closed (leaked on every call, and on any write error); use
    # context managers for both.
    with open(filename, 'wb') as f:
        f.write(response.body)
    with pdfplumber.open(filename) as pdf:
        # pages is a 0-indexed list; the table is on the first page.
        table = pdf.pages[0].extract_table()
    item = FundsInfoItem()
    item["pid"] = "".join(table[2][1].split())    # strip all whitespace
    item["pname"] = table[1][1]
    item["prate"] = table[11][1]
    item["pperiod"] = table[10][1]
    item["pfloor"] = "".join(table[5][1].split())
    item['pscale'] = "".join(table[4][1].split())
    yield item
def parse(self, response):
    """Parse the paginated <ol> product list.

    No product id is available on this page. The floor amount on the
    page is in units of 10,000, so '0000' is appended to get the raw
    figure.
    """
    for entry in response.xpath('//ol/li[@name="pageli"]'):
        item = FundsInfoItem()
        item["pname"] = entry.xpath('./div/p/a/text()').extract()[0]
        item["prate"] = entry.xpath(
            'normalize-space(./div/div[@class="box_lf"]/p[@class="box_num"]/text())'
        ).extract()[0]
        item["pperiod"] = entry.xpath(
            'normalize-space(./div/ul/li/span[@class="highlight"]/text())'
        ).extract()[0]
        item["pfloor"] = entry.xpath(
            './div/ul/li/span[@class="amt"]/text()').extract()[0] + '0000'
        yield item
def parse5(self, response):
    """Parse the product table, skipping the header row."""
    rows = response.xpath(
        '//div[@class="lccp_main_content_lb"]/table/tbody/tr')
    for row in rows[1:]:  # rows[0] is the header row
        cells = row.xpath('./td')
        item = FundsInfoItem()
        # The analytics attribute ends with "-<product id>".
        item["pid"] = cells[0].xpath(
            './a/@data-analytics-click').extract()[0].split('-')[-1]
        item["pname"] = cells[0].xpath(
            'normalize-space(./a/text())').extract()[0]
        item["prate"] = cells[5].xpath(
            'normalize-space(./div/span/text())').extract()[0]
        item["pperiod"] = cells[4].xpath(
            'normalize-space(./text())').extract()[0]
        item["pfloor"] = cells[3].xpath(
            'normalize-space(./text())').extract()[0]
        yield item
def parsein(self, response):
    """Parse a label/value detail page, then follow the linked PDF
    (if any) so get_scale can fill in the product scale.
    """
    item = FundsInfoItem()
    for row in response.xpath('//tr'):
        labels = row.xpath('./th/text()').extract()
        if not labels:
            # Rows without a <th> used to raise IndexError; skip them.
            continue
        label = labels[0]
        # normalize-space() always yields exactly one string (possibly
        # empty), so this extract()[0] cannot fail.
        value = row.xpath('normalize-space(./td/text())').extract()[0]
        if u'产品名称' in label:          # product name
            item["pname"] = value
        elif u'产品代码' in label:        # product code
            item["pid"] = value
        elif u'预计年化收益率' in label:  # expected annualized rate
            item["prate"] = value
        elif u'起点金额' in label:        # purchase floor
            item["pfloor"] = value
        elif u'投资期限' in label:        # investment period
            item["pperiod"] = value
    url = response.xpath('//ul[@class="title-ul"]/li/a')
    if len(url) < 1:
        item["pscale"] = "not found"
        yield item
    else:
        pdfUrl = url.xpath('@href').extract()[0]
        yield scrapy.FormRequest(url=pdfUrl,
                                 method='GET',
                                 meta={"item": item},
                                 callback=self.get_scale)
def parse(self, response):
    """One item per table row; columns are id, name, period, rate, floor."""
    for row in response.xpath('//tbody/tr'):
        cells = row.xpath('./td')
        item = FundsInfoItem()
        item["pid"] = cells[0].xpath('./text()').extract()[0]
        item["pname"] = cells[1].xpath('./text()').extract()[0]
        item["pperiod"] = cells[2].xpath('./text()').extract()[0]
        item["prate"] = cells[3].xpath('./text()').extract()[0]
        item["pfloor"] = cells[4].xpath('./text()').extract()[0]
        yield item
def parse(self, response):
    """Parse the CCB JSONP product list.

    Rates reported as 0.0 are re-fetched from the product detail page;
    if the HTML detail page cannot supply one either, the JSON detail
    endpoint is queried asynchronously via find_pdf.
    """
    begin = re.search('jsonpCallback', response.text).end()
    datas = json.loads(response.text[begin + 1:-1])['ProdList']
    for data in datas:
        item = FundsInfoItem()
        item["pid"] = data['code']
        item["pname"] = data['name']
        item["prate"] = data['yieldRate']
        item["pperiod"] = data['investPeriod']
        item["pfloor"] = data['purFloorAmt']
        if item["prate"] == 0.0:
            item["prate"] = self.get_ccb_detail_rate(item['pid'])
        if item["prate"] == 'html中无法获取到收益率':
            url = r'http://finance.ccb.com/cc_webtran/queryFinanceProdDetail.gsp?'
            headers = {
                'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
                'Referer': r'http://finance.ccb.com/cn/finance/product.html',
                'Connection': 'keep-alive'
            }
            # BUGFIX(naming): the form payload was assigned to `data`,
            # clobbering the loop variable; renamed to `formdata`.
            formdata = {
                'jsoncallback': 'jQuery191036942510719116894_1533864732025',
                'params.code': item["pid"]
            }
            yield scrapy.FormRequest(url=url,
                                     method='POST',
                                     headers=headers,
                                     formdata=formdata,
                                     meta={"item": item},
                                     callback=self.find_pdf)
        else:
            yield item
def parse(self, response):
    """One item per highlighted (class "bg2") table row."""
    for row in response.xpath('//tr[@class="bg2"]'):
        item = FundsInfoItem()
        item["pname"] = row.xpath(
            'normalize-space(./td[@class="name"]/a/text())').extract()[0]
        # The product number is the value of the link's productno= parameter.
        href = row.xpath('./td[@class="name"]/a/@href').extract()[0]
        item["pid"] = href.split('productno=')[-1]
        cells = row.xpath('./td')
        item["prate"] = cells[4].xpath('./b/text()').extract()[0]
        item["pfloor"] = cells[3].xpath('./text()').extract()[0]
        item["pperiod"] = cells[2].xpath(
            'normalize-space(./text())').extract()[0]
        yield item
def parse(self, response):
    # Parse every <table> on the page. Tables with a <thead> use fixed
    # column positions; headerless tables get their rate/floor/period
    # columns auto-detected from the first row's cell text. Rates shown
    # as a placeholder string are resolved by fetching the bank's
    # reference-net-value sub-pages.
    tables = response.xpath('//table')
    for table in tables:
        # Default column indexes (used when a <thead> is present).
        floor = 5
        rate = 6
        period = 4
        table_head = table.xpath('./thead')
        if len(table_head)>0:
            # Table with a header: fixed layout, one item per body row.
            for data in table.xpath('./tbody/tr'):
                item = FundsInfoItem()
                item["pname"] = data.xpath('./td')[0].xpath('string(.)').extract()[0]
                # pid is encoded in the last cell's image path: .../lccp<ID>.<ext>
                item["pid"] = data.xpath('./td')[-1].xpath('./img/@src').extract()[0].split('lccp')[-1].split('.')[0]
                item["prate"] = data.xpath('./td')[rate].xpath('./text()').extract()[0]
                item["pfloor"] = data.xpath('./td')[floor].xpath('./text()').extract()[0]
                item["pperiod"] = data.xpath('./td')[period].xpath('./text()').extract()[0]
                yield item
        else:
            # Headerless table: the first body row carries the column
            # titles; locate each field's column by its title text.
            table_title = table.xpath('./tbody/tr')[0].xpath('./td')
            for i in range(len(table_title)):
                title = table_title[i].xpath('string(.)').extract()[0]
                if u'起购' in title:  # "minimum purchase" -> floor column
                    floor = i
                elif u'客户年化' in title or u'比较基准' in title or u'客户参考浮动年化净收益率' in title:
                    # any of the rate-style headings -> rate column
                    rate = i
                elif u'天' in title:  # "days" -> period column
                    period = i
            # Remaining rows are the actual products.
            for data in table.xpath('./tbody/tr')[1:]:
                item = FundsInfoItem()
                item["pname"] = data.xpath('./td')[0].xpath('string(.)').extract()[0]
                item["pid"] = data.xpath('./td')[-1].xpath('./img/@src').extract()[0].split('lccp')[-1].split('.')[0]
                item["prate"] = data.xpath('./td')[rate].xpath('./text()').extract()[0]
                if item['prate'] == '以我行网站刊登的参考收益率公告为准':
                    # Placeholder rate ("see the published reference rate"):
                    # best-effort fetch of the real rate from the bank's
                    # reference-net-value pages.
                    try:
                        # Build the announcement page id from the pid's tail
                        # digits; '201201' is special-cased to '201203'.
                        html_id = '201' + str(item["pid"][-4:-1])
                        if html_id == '201201':
                            html_id = '201203'
                        inner_url_1 = 'http://wealth.cib.com.cn/retail/duration/cash/referNetValue/' + html_id + '/' + \
                            html_id + '.html'
                        res = urllib.request.urlopen(inner_url_1)
                        inner_html_1 = res.read().decode('utf-8')
                        # The index page links to a dated detail page; follow it.
                        re_1 = '(/retail/duration/cash/referNetValue/' + html_id + '/'\
                            + html_id + '_[\d]*.html)'
                        inner_url_2 = re.search(re_1, inner_html_1).group()
                        inner_url_2 = 'http://wealth.cib.com.cn' + inner_url_2
                        res_2 = urllib.request.urlopen(inner_url_2)
                        inner_html_2 = res_2.read().decode('utf-8')
                        # First percentage cell on the detail page is the rate.
                        re_2 = '<td>([\d\.]*%)</td>'
                        final = re.search(re_2, inner_html_2).group(1)
                        item['prate'] = final
                    except Exception as e:
                        print('errorinfo:{}'.format(e))
                        item['prate'] = '未能在子页面获取到收益'
                item["pfloor"] = data.xpath('./td')[floor].xpath('./text()').extract()[0]
                item["pperiod"] = data.xpath('./td')[period].xpath('./text()').extract()[0]
                # Final cleanup: stringify every field and strip
                # whitespace/control characters.
                for sub_item_key in item.keys():
                    item[sub_item_key] = str(item[sub_item_key]).replace('\r', '').replace('\n', '')\
                        .replace('\t', '').strip()
                    print(item[sub_item_key])
                yield item