# -*- coding: utf-8 -*-
# Module-level imports shared by the spider methods below (Python 2).
# blacklist (iterable of fund codes), mysql_connecter, requests_manager and
# log_obj are project-local helpers assumed to be defined elsewhere in this repo.
import re
import json
import datetime
import traceback

import bs4
import numpy as np
import pandas as pd


def parse1(self, response):
    bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
    item = response.meta['item']
    try:
        if item['fund_code'] in blacklist:
            raise Exception('This fund is blacklisted')
        # Basic fund info: join all <td> texts with commas, strip whitespace
        # and non-breaking spaces, then cut each field out with a lookbehind.
        e_div = bs_obj.find('div', class_='infoOfFund')
        e_table = e_div.table
        s = re.sub(r'\s+|\xa0', '',
                   u','.join([e.get_text(strip=True)
                              for e in e_table.find_all('td')]))
        ser = pd.Series([])
        d = {
            u'fund_type': u'基金类型:',
            u'fund_size': u'基金规模:',
            u'found_date': u'成立日:',
            u'manager': u'管理人:'
        }
        ser['fund_code'] = item['fund_code']
        for key in d:
            # Search once and reuse the match; missing fields become None.
            m = re.search(ur'(?<=%s).+?(?=,)' % d[key], s)
            ser[key] = m.group() if m else None
        df = pd.DataFrame(ser).T
        df.index = [item['fund_code'], ]
        mysql_connecter.insert_df_data(df, 'fund_info', method='UPDATE')
    except:
        # Log and skip funds whose info page cannot be parsed.
        log_obj.error("Parse failure in %s\nReason: %s"
                      % (self.name, traceback.format_exc()))
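
# A quick standalone demo of the lookbehind extraction above, run against an
# illustrative joined string (the field values here are made up):
demo_s = u'基金类型:混合型,基金规模:12.34亿元,成立日:2015-05-27,管理人:某某基金,'
demo_m = re.search(ur'(?<=%s).+?(?=,)' % u'基金规模:', demo_s)
print demo_m.group() if demo_m else None  # -> 12.34亿元
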
def parse1(self, response):
    item = response.meta['item']
    try:
        # Response body is an array literal of comma-separated strings.
        str_list = eval(response.text)
        rows = []
        for s in str_list:
            fields = s.split(',')
            rows.append(pd.Series(
                [fields[1], fields[2], item['plate_name'], item['plate_type']],
                index=['stock_code', 'stock_name', 'plate_name', 'plate_type']))
        df = pd.DataFrame(rows)
        if not df.empty:
            df['crawler_key'] = df['stock_code'] + '/' + df['plate_name']
            mysql_connecter.insert_df_data(df, 'stock_info', method='UPDATE')
    except:
        log_obj.error("Parse failure in %s\nReason: %s"
                      % (self.name, traceback.format_exc()))
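
# eval() on a response body executes whatever the server sends. A minimal
# safer sketch, assuming the endpoint returns a plain array literal of
# strings (json.loads and ast.literal_eval parse data without executing it);
# parse_array_text is a hypothetical helper, not part of this repo:
import ast

def parse_array_text(text):
    try:
        return json.loads(text)  # e.g. '["1,600000,XX", ...]'
    except ValueError:
        return ast.literal_eval(text)  # fall back to a Python literal parse
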
def parse(self, response):
    item = response.meta['item']
    try:
        print u"Crawling historical capital-flow data for plate %s" % item['plate_name']
        bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
        e_table = bs_obj.find('table', id='tb_lishi')
        plate_name = item['plate_name']
        df = pd.read_html(e_table.prettify(encoding='utf8'), encoding='utf8')[0]
        df.columns = [
            u'日期', u'主力净流入净额', u'主力净流入净占比',
            u'超大单净流入净额', u'超大单净流入净占比',
            u'大单净流入净额', u'大单净流入净占比',
            u'中单净流入净额', u'中单净流入净占比',
            u'小单净流入净额', u'小单净流入净占比'
        ]
        df = df.rename({
            u'日期': 'value_date',
            u'主力净流入净额': 'main_flow_amount',
            u'主力净流入净占比': 'main_flow_ratio',
            u'超大单净流入净额': 'super_flow_amount',
            u'超大单净流入净占比': 'super_flow_ratio',
            u'大单净流入净额': 'big_flow_amount',
            u'大单净流入净占比': 'big_flow_ratio',
            u'中单净流入净额': 'median_flow_amount',
            u'中单净流入净占比': 'median_flow_ratio',
            u'小单净流入净额': 'small_flow_amount',
            u'小单净流入净占比': 'small_flow_ratio'
        }, axis=1)
        df['plate_name'] = plate_name
        df['crawler_key'] = df['plate_name'] + '/' + df['value_date']
        if not df.empty:
            mysql_connecter.insert_df_data(df, 'capital_flow_data', method='UPDATE')
    except:
        log_obj.error("Parse failure in %s\nReason: %s"
                      % (self.name, traceback.format_exc()))
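
# Every table written here carries a crawler_key column and is usually saved
# with method='UPDATE'. A minimal sketch of what mysql_connecter.insert_df_data
# is assumed to do (the real helper lives elsewhere in this repo): an upsert
# keyed on the table's unique crawler_key column, so re-crawls overwrite rows
# instead of duplicating them. get_connection is a hypothetical factory.
def insert_df_data(df, table, method='INSERT'):
    cols = list(df.columns)
    sql = "INSERT INTO %s (%s) VALUES (%s)" % (
        table, ','.join(cols), ','.join(['%s'] * len(cols)))
    if method == 'UPDATE':
        sql += " ON DUPLICATE KEY UPDATE " + ','.join(
            "%s=VALUES(%s)" % (c, c) for c in cols)
    conn = get_connection()  # hypothetical connection factory
    conn.cursor().executemany(sql, [tuple(r) for r in df.values])
    conn.commit()
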
def parse1(self, response):
    bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
    item = response.meta['item']
    if item['fund_code'] in blacklist:
        raise Exception('This fund is blacklisted')

    # Read the latest NAV date we already hold locally.
    latest_date = self.newest_date_df['newest_date'][
        self.newest_date_df['fund_code'] == item['fund_code']]
    if latest_date.empty:
        raise Exception('Fund code %s not found in the local database'
                        % item['fund_code'])
    latest_date = latest_date.iat[0]
    latest_date = datetime.datetime(latest_date.year, latest_date.month,
                                    latest_date.day)  # date -> datetime
    print "Latest local NAV date for %s is %s" % (item['fund_code'], latest_date)

    try:
        # Estimated NAV
        e_dl = bs_obj.find('dl', class_='dataItem01')
        data = [
            e.get_text(strip=True)
            for e in e_dl.find('dd', class_='dataNums').find_all('span')
        ]
        data_type = e_dl.find('span', class_='sp01').get_text(strip=True)
        data_date = e_dl.find('span', id='gz_gztime').get_text(strip=True)
        data_date = datetime.datetime.strptime(
            re.sub(r'\(|\)', '', data_date), '%y-%m-%d %H:%M')
        # Treat Saturday and Sunday as Friday.
        if data_date.isoweekday() == 6:
            data_date -= datetime.timedelta(days=1)
        elif data_date.isoweekday() == 7:
            data_date -= datetime.timedelta(days=2)
        df = pd.DataFrame(data + [data_type, data_date],
                          index=[u'净值', u'涨跌值', u'涨跌幅',
                                 u'数据类型', u'数据日期']).T
        df = df.drop([u'涨跌值', u'数据类型'], axis=1)
        df = df.rename({
            u'净值': u'estimate_net_value',
            u'涨跌幅': u'estimate_daily_growth_rate',
            u'数据日期': u'value_date'
        }, axis=1)
        df[u'fund_code'] = item['fund_code']
        df[u'value_date'] = df[u'value_date'].apply(
            lambda date0: date0.strftime('%Y-%m-%d'))
        df[u'crawler_key'] = df[u'fund_code'] + '/' + df[u'value_date']
        df.index = df[u'crawler_key']
        print u"page date:", df[u'value_date'].iat[0], u"local date:", latest_date
        if not df.empty:
            mysql_connecter.insert_df_data(df, 'eastmoney_daily_data',
                                           method='UPDATE')
        else:
            print u"no new data"
    except:
        log_obj.error("Parse failure in %s (%s)\n%s"
                      % (self.name, response.url, traceback.format_exc()))
        with open(u'净值估算.html', 'w') as f:
            f.write(response.text)

    try:
        # Fund NAV: the first of three tab panes (NAV, dividends, rating).
        e_div = bs_obj.find_all('div',
                                class_='poptableWrap singleStyleHeight01')[0]
        e_table = e_div.table
        df = pd.read_html(e_table.prettify(encoding='utf8'),
                          encoding='utf8', header=0)[0]
        # NOTE: the page omits the year, so stamping rows with the current
        # year is wrong for dates that straddle New Year.
        year_num = datetime.datetime.now().year
        df[u'日期'] = pd.to_datetime(
            df[u'日期'].apply(lambda s: '%s-%s' % (year_num, s)))
        df = df.astype(np.str)
        df[u'crawler_key'] = df[u'日期'].apply(
            lambda date: "%s/%s" % (item['fund_code'], date))
        df[u'fund_code'] = item['fund_code']
        df = df.rename({
            u'日期': u'value_date',
            u'单位净值': u'net_asset_value',
            u'累计净值': u'accumulative_net_value',
            u'日增长率': u'daily_growth_rate'
        }, axis=1)
        df.index = df[u'crawler_key']
        if not df.empty:
            mysql_connecter.insert_df_data(df, 'eastmoney_daily_data',
                                           method='UPDATE')
        else:
            print u"no new data"
    except:
        log_obj.error("Parse failure in %s (%s)\n%s"
                      % (self.name, response.url, traceback.format_exc()))
        with open(u'基金净值.html', 'w') as f:
            f.write(response.text)

    try:
        # Data embedded in the fund's pingzhongdata js file.
        url = 'http://fund.eastmoney.com/pingzhongdata/%s.js?v=%s' % (
            item['fund_code'],
            datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        js_data = requests_manager.get_html(url)
        js_data = re.sub(r'\s+', '', js_data)

        def re_func(key):
            # Pull the value of "var <key>=...;" out of the js payload.
            m = re.search(r'(?<=%s\=).+?(?=;)' % key, js_data, re.S)
            return m.group() if m else None

        # Equity position history.
        Data_fundSharesPositions = pd.DataFrame(
            eval(re_func('Data_fundSharesPositions')),
            columns=[u'value_date', u'fund_shares_positions']).astype(np.str)
        # Timestamps are in milliseconds; the first ten digits are seconds.
        Data_fundSharesPositions[u'value_date'] = Data_fundSharesPositions[
            u'value_date'].apply(lambda s: datetime.datetime.fromtimestamp(
                int(s[:10])).strftime('%Y-%m-%d'))
        Data_fundSharesPositions[u'fund_shares_positions'] = \
            Data_fundSharesPositions[u'fund_shares_positions'] + '%'
        Data_fundSharesPositions[u'crawler_key'] = \
            item['fund_code'] + '/' + Data_fundSharesPositions[u'value_date']
        Data_fundSharesPositions = Data_fundSharesPositions.drop(
            [u'value_date'], axis=1)
        Data_fundSharesPositions.index = Data_fundSharesPositions[u'crawler_key']
        if not Data_fundSharesPositions.empty:
            mysql_connecter.insert_df_data(Data_fundSharesPositions,
                                           'eastmoney_daily_data',
                                           method='UPDATE')
    except:
        log_obj.error("Parse failure in %s (%s)\n%s"
                      % (self.name, response.url, traceback.format_exc()))
        with open(u'js_v中的数据.html', 'w') as f:
            f.write(response.text)
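
# The position history timestamps are JavaScript millisecond epochs; keeping
# the first ten digits yields seconds. A quick check (timestamp illustrative;
# fromtimestamp uses the machine's local timezone):
demo_ts = '1546300800000'
print datetime.datetime.fromtimestamp(int(demo_ts[:10])).strftime('%Y-%m-%d')
# -> 2019-01-01 on a UTC+8 machine
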
def parse1(self, response):
    print "About to parse:", response.url
    item = response.meta['item']
    if item['fund_code'] in blacklist:
        raise Exception('This fund is blacklisted')

    # Fetch the list of years with holdings disclosures.
    url = ("http://fund.eastmoney.com/f10/FundArchivesDatas.aspx"
           "?type=jjcc&code=%s&topline=200") % item['fund_code']
    html = requests_manager.get_html(url)
    with open('test1.html', 'w') as f:  # debug dump
        f.write(html)
    m = re.search(r'(?<=arryear:)\[.+?\](?=,)', html)
    year_list = eval(m.group()) if m else None
    if year_list is None:
        raise Exception(u'Bad url %s' % url)

    # Headers vary between disclosure years; collapse them to a stable stem.
    def norm_header(s):
        m0 = re.search(ur'占净值|持股数|持仓市值', s)
        return m0.group() if m0 else s

    for year0 in year_list:
        url = ("http://fund.eastmoney.com/f10/FundArchivesDatas.aspx"
               "?type=jjcc&code=%s&topline=200&year=%s") % (
                   item['fund_code'], str(year0))
        html = requests_manager.get_html(url)
        with open('test2.html', 'w') as f:  # debug dump
            f.write(html)
        m = re.search(r"<div class='box'>.+</div>", html)
        html = m.group() if m else None
        if html is None:
            raise Exception(u'Bad url %s' % url)
        bs_obj = bs4.BeautifulSoup(html, 'html.parser')
        with open('test3.html', 'w') as f:  # debug dump
            f.write(bs_obj.prettify(encoding='utf8'))

        for e_div in bs_obj.find_all('div', class_="box"):
            title = e_div.find('h4', class_="t").get_text(strip=True)
            print response.url
            print title
            # Keep stock codes as strings so leading zeros survive.
            converters = {u'股票代码': lambda s: str(s)}
            df0 = pd.read_html(e_div.table.prettify(encoding='utf8'),
                               encoding='utf8', converters=converters)[0]
            df0.columns = [re.sub(r'\s+', '', s) for s in df0.columns]
            df0.columns = [norm_header(s) for s in df0.columns]
            df0[u'标题'] = title
            df0[u'cut_off_date'] = title.split(u'截止至:')[-1]
            df0[u'对应基金'] = item[u'fund_code']
            df0[u'年份'] = year0
            df0 = df0.rename({
                u'股票代码': u'stock_code',
                u'股票名称': u'stock_name',
                u'占净值': u'net_value_ratio',
                u'持股数': u'share_holding',
                u'持仓市值': u'market_value',
                u'对应基金': u'fund_code',
                u'标题': u'title',
                u'年份': u'year'
            }, axis=1)
            df0 = df0.drop([u'序号', u'相关资讯', u'最新价', u'涨跌幅'],
                           axis=1, errors='ignore')
            df0[u'crawler_key'] = (df0[u'fund_code'] + u'/'
                                   + df0[u'stock_code'] + u'/'
                                   + df0[u'cut_off_date'])
            mysql_connecter.insert_df_data(df0, u'fund_holdings',
                                           method='UPDATE')
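
# The holdings table headers differ across disclosure years (a percentage
# column may read 占净值比例 one year and gain a unit suffix the next), which
# is why norm_header collapses them to a stable stem before renaming.
# An illustrative run (the sample headers are made up):
for demo_h in [u'占净值比例', u'持股数(万股)', u'持仓市值(万元)', u'股票名称']:
    demo_m = re.search(ur'占净值|持股数|持仓市值', demo_h)
    print demo_m.group() if demo_m else demo_h
# -> 占净值, 持股数, 持仓市值, 股票名称
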
def parse1(self, response):
    bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
    item = response.meta['item']
    if item['fund_code'] in blacklist:
        raise Exception('This fund is blacklisted')
    try:
        # Data embedded in the fund's pingzhongdata js file.
        url = 'http://fund.eastmoney.com/pingzhongdata/%s.js?v=%s' % (
            item['fund_code'],
            datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        js_data = requests_manager.get_html(url)
        js_data = re.sub(r'\s+', '', js_data)

        def re_func(key):
            # Pull the value of "var <key>=...;" out of the js payload.
            m = re.search(r'(?<=%s\=).+?(?=;)' % key, js_data, re.S)
            return m.group() if m else None

        # Fund-size history (规模变动).
        Data_fluctuationScale = pd.read_json(re_func('Data_fluctuationScale'))
        for i in range(Data_fluctuationScale.shape[0]):
            ser = pd.Series(Data_fluctuationScale.loc[i, 'series'])
            ser = ser.rename({'mom': u'较上期环比', 'y': u'净资产规模(亿)'})
            ser['value_date'] = Data_fluctuationScale.loc[i, 'categories']
            Data_fluctuationScale.loc[i, 'series'] = ser.to_json()
        Data_fluctuationScale['fund_code'] = item['fund_code']
        Data_fluctuationScale['data_type'] = u'规模变动'
        Data_fluctuationScale['crawler_key'] = (
            Data_fluctuationScale['fund_code'] + '/'
            + Data_fluctuationScale['data_type'] + '/'
            + Data_fluctuationScale['categories'])
        Data_fluctuationScale = Data_fluctuationScale.drop(['categories'],
                                                           axis=1)
        Data_fluctuationScale = Data_fluctuationScale.rename(
            {'series': 'json_data'}, axis=1)
        Data_fluctuationScale.index = Data_fluctuationScale['crawler_key']
        if not Data_fluctuationScale.empty:
            mysql_connecter.insert_df_data(Data_fluctuationScale,
                                           'fund_mixed_data')

        # Holder structure (持有人结构) and asset allocation (资产配置) share
        # the same {categories, series} layout, so pivot both the same way:
        # one json_data blob per reporting date.
        def mixed_rows(payload, data_type):
            categories = payload['categories']
            d = {s0['name']: s0['data'] for s0 in payload['series']}
            df = pd.DataFrame(d, index=categories)
            df['value_date'] = df.index
            ser = df.T.apply(lambda col: col.to_json())
            ser.name = 'json_data'
            out = pd.DataFrame(ser, index=categories)
            out['fund_code'] = item['fund_code']
            out['data_type'] = data_type
            out['crawler_key'] = (out['fund_code'] + '/' + out['data_type']
                                  + '/' + out.index)
            out.index = out['crawler_key']
            return out

        Data_holderStructure = mixed_rows(
            json.loads(re_func('Data_holderStructure')), u'持有人结构')
        if not Data_holderStructure.empty:
            mysql_connecter.insert_df_data(Data_holderStructure,
                                           'fund_mixed_data')

        Data_assetAllocation = mixed_rows(
            json.loads(re_func('Data_assetAllocation')), u'资产配置')
        if not Data_assetAllocation.empty:
            mysql_connecter.insert_df_data(Data_assetAllocation,
                                           'fund_mixed_data')

        # Fund-manager change history: promote the first row to header,
        # then store the whole table as one JSON blob.
        e_table = bs_obj.find('li', class_='fundManagerTab').table
        df0 = pd.read_html(e_table.prettify(encoding='utf8'),
                           encoding='utf8')[0]
        df0.columns = df0.loc[0, :]
        df0.columns.name = None
        df0 = df0.drop([0])
        df0.index = range(df0.shape[0])
        df = pd.DataFrame({
            'crawler_key': item['fund_code'] + '/' + u'基金经理变动',
            'fund_code': item['fund_code'],
            'data_type': u'基金经理变动',
            'json_data': df0.to_json()
        }, index=[0])
        if not df.empty:
            mysql_connecter.insert_df_data(df, 'fund_mixed_data',
                                           method='UPDATE')
    except:
        log_obj.error("Parse failure in %s (%s)\n%s"
                      % (self.name, response.url, traceback.format_exc()))
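
# A tiny worked example of the {categories, series} pivot in mixed_rows
# (the payload values are illustrative):
demo_payload = {
    'categories': ['2018-06-30', '2018-12-31'],
    'series': [{'name': u'机构持有比例', 'data': [10.5, 12.0]},
               {'name': u'个人持有比例', 'data': [89.5, 88.0]}],
}
demo_d = {s0['name']: s0['data'] for s0 in demo_payload['series']}
demo_df = pd.DataFrame(demo_d, index=demo_payload['categories'])
demo_df['value_date'] = demo_df.index
print demo_df.T.apply(lambda col: col.to_json())
# -> one JSON blob per reporting date, ready for the json_data column
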