def _newstocks(data, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_STOCKS_URL % (ct.P_TYPE['http'],
                                                        ct.DOMAINS['vsf'],
                                                        ct.PAGES['newstock'], pageNo))
            res = html.xpath('//table[@id="NewStockTable"]/tr')
            if len(res) == 0:
                return data
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = sarr.replace('<font color="red">*</font>', '')
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
            df = df.drop([df.columns[idx] for idx in [12, 13, 14]], axis=1)
            df.columns = rv.NEW_STOCKS_COLS
            df['code'] = df['code'].map(lambda x: str(x).zfill(6))
            df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
            res = html.xpath('//table[@class="table2"]/tr[1]/td[1]/a/text()')
            tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8')  # '下一页' is the "next page" link text
            hasNext = True if tag in res else False
            data = data.append(df, ignore_index=True)
            pageNo += 1
            if hasNext:
                data = _newstocks(data, pageNo, retry_count, pause)
        except Exception as ex:
            print(ex)
        else:
            return data

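# A minimal usage sketch for the fetcher above. The public wrapper name
# new_stocks and its defaults are assumptions written in the style of this
# module's conventions, not part of the source: paging starts at 1 with an
# empty frame that the recursion fills and returns.
def new_stocks(retry_count=3, pause=0.001):
    data = pd.DataFrame()  # relies on this module's own pandas import
    return _newstocks(data, 1, retry_count, pause)
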
def _get_cashflow_data(year, quarter, pageNo, dataArr,
                       retry_count=3, pause=0.001):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(ct.CASHFLOW_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                 ct.PAGES['fd'], year, quarter,
                                                 pageNo, ct.PAGE_NUM[1]))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            text = text.replace('--', '')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@class="list_table"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(sarr)[0]
            df.columns = ct.CASHFLOW_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _get_cashflow_data(year, quarter, pageNo, dataArr)
            else:
                return dataArr
        except Exception as e:
            pass
    raise IOError(ct.NETWORK_URL_ERROR_MSG)

def _today_ticks(symbol, tdate, pageNo, retry_count, pause):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            html = lxml.html.parse(ct.TODAY_TICKS_URL % (ct.P_TYPE['http'],
                                                         ct.DOMAINS['vsf'],
                                                         ct.PAGES['t_ticks'],
                                                         symbol, tdate, pageNo))
            res = html.xpath('//table[@id="datatbl"]/tbody/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            sarr = sarr.replace('--', '0')
            df = pd.read_html(StringIO(sarr), parse_dates=False)[0]
            df.columns = ct.TODAY_TICK_COLUMNS
            df['pchange'] = df['pchange'].map(lambda x: x.replace('%', ''))
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)

def _get_debtpaying_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.DEBTPAYING_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                               ct.PAGES['fd'], year, quarter,
                                               pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=ct.HTTP_TIMEOUT).read()
        text = text.decode('GBK')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath('//table[@class="list_table"]/tr')
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.DEBTPAYING_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_debtpaying_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)

def _get_forecast_data(year, quarter, pageNo, dataArr):
    url = ct.FORECAST_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                             ct.PAGES['fd'], year, quarter, pageNo,
                             ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath('//table[@class="list_table"]/tr')
        for trs in xtrs:
            code = trs.xpath('td[1]//span/a/text()')[0]
            name = trs.xpath('td[2]/span/a/text()')[0]
            ftype = trs.xpath('td[3]/a/text()')  # avoid shadowing the type builtin
            ftype = ftype[0] if len(ftype) > 0 else trs.xpath('td[3]/text()')[0]
            report_date = trs.xpath('td[4]/text()')[0]
            pre_eps = trs.xpath('td[7]/text()')[0]
            pre_eps = '0' if pre_eps == '--' else pre_eps
            frange = trs.xpath('td[8]/text()')[0]  # avoid shadowing the range builtin
            dataArr.append([code, name, ftype, report_date, pre_eps, frange])
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')  # get the next page
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass

def _inst_tops(last=5, pageNo=1, retry_count=3, pause=0.001,
               dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL % (ct.P_TYPE["http"], ct.DOMAINS["vsf"],
                                                 rv.LHB_KINDS[2], ct.PAGES["fd"],
                                                 last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode("GBK")
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id="dataTable"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode("utf-8") for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = "".join(sarr)
            sarr = "<table>%s</table>" % sarr
            df = pd.read_html(sarr)[0]
            df = df.drop([2, 3], axis=1)
            df.columns = rv.LHB_JGZZ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r"\d+", nextPage[0])[0]
                return _inst_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)

def _get_detail(tag, retry_count=3, pause=10):
    dfc = pd.DataFrame()
    p = 0
    num_limit = 100
    while True:
        p = p + 1
        for _ in range(retry_count):
            time.sleep(pause)
            try:
                ct._write_console()
                request = Request(ct.SINA_DATA_DETAIL_URL % (ct.P_TYPE['http'],
                                                             ct.DOMAINS['vsf'],
                                                             ct.PAGES['jv'], p, tag))
                text = urlopen(request, timeout=10).read()
                text = text.decode('gbk')
            except _network_error_classes:
                pass
            else:
                break
        reg = re.compile(r'\,(.*?)\:')
        text = reg.sub(r',"\1":', text)
        text = text.replace('"{symbol', '{"symbol')
        text = text.replace('{symbol', '{"symbol"')
        jstr = json.dumps(text)
        js = json.loads(jstr)
        df = pd.DataFrame(pd.read_json(js, dtype={'code': object}),
                          columns=ct.THE_FIELDS)
        df = df[ct.FOR_CLASSIFY_B_COLS]
        dfc = pd.concat([dfc, df])
        if df.shape[0] < num_limit:
            return dfc

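# The fetchers in this module lean on try/except/else inside a bounded loop:
# the else branch runs only when the try body raised nothing, so "retry N
# times, then give up" reads linearly. A self-contained sketch of the same
# idiom; the fetch argument and the choice of IOError are illustrative
# assumptions, not part of this module:
def _retry_demo(fetch, retry_count=3, pause=0.001):
    import time
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            result = fetch()
        except IOError:
            pass           # swallow the failure and try again
        else:
            return result  # first success ends the loop
    raise IOError('all retries failed')
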
def _dist_cotent(year, pageNo, retry_count, pause):
    url = rv.DP_163_URL % (ct.P_TYPE['http'], ct.DOMAINS['163'],
                           ct.PAGES['163dp'], year, pageNo)
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            if pageNo > 0:
                ct._write_console()
            html = lxml.html.parse(url)
            res = html.xpath('//div[@class="fn_rp_list"]/table')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows=[0])[0]
            df = df.drop(df.columns[0], axis=1)
            df.columns = rv.DP_163_COLS
            df['divi'] = df['plan'].map(_fun_divi)
            df['shares'] = df['plan'].map(_fun_into)
            df = df.drop('plan', axis=1)
            df['code'] = df['code'].astype(object)
            df['code'] = df['code'].map(lambda x: str(x).zfill(6))
            if pageNo == 0:
                page = html.xpath('//div[@class="mod_pages"]/a')
                asr = page[len(page) - 2]
                pages = asr.xpath('text()')
        except _network_error_classes:
            pass
        else:
            if pageNo == 0:
                return df, pages[0]
            else:
                return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)

def _today_ticks(symbol, tdate, pageNo, retry_count, pause):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            html = lxml.html.parse(ct.TODAY_TICKS_URL % (ct.P_TYPE['http'],
                                                         ct.DOMAINS['vsf'],
                                                         ct.PAGES['t_ticks'],
                                                         symbol, tdate, pageNo))
            res = html.xpath('//table[@id="datatbl"]/tbody/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            sarr = sarr.replace('--', '0')
            df = pd.read_html(StringIO(sarr), infer_types=False)[0]
            df.columns = list(ct.TODAY_TICK_COLUMNS)
            df['Pchange'] = df['Pchange'].map(lambda x: x.replace('%', ''))
            df.insert(0, 'Date', tdate)
            # build the final column order with 'Date' prepended; note that
            # list.insert mutates in place and returns None, so it cannot be
            # used to build this list in one expression
            cols = ['Date'] + list(ct.TODAY_TICK_COLUMNS)
            df = df[cols]
        except _network_error_classes:
            pass
        else:
            return df
    raise IOError('fetch failed, please check the network connection')

def _newcbonds(pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        if pageNo != 1:
            ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_CBONDS_URL % (ct.P_TYPE['http'],
                                                        ct.DOMAINS['sstar'], pageNo))
            res = html.xpath('//table/tr')
            if len(res) == 0:
                return None
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0])
            if len(df) < 1:
                return None
            df = df[0]
            df = df.drop([df.columns[14], df.columns[15]], axis=1)
            df.columns = rv.NEW_CBONDS_COLS
            df['scode'] = df['scode'].map(lambda x: str(x).zfill(6))
            df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
        except Exception as ex:
            print(ex)
        else:
            return df

def _parsing_dayprice_json(pageNum=1):
    """
    Parse one page of the current day's quotes (served as JSON-like text).

    Parameters
    ------
        pageNum : page number

    return
    -------
        DataFrame with the day's trading data for all stocks
    """
    ct._write_console()
    request = Request(ct.SINA_DAY_PRICE_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                               ct.PAGES['jv'], pageNum))
    text = urlopen(request, timeout=10).read()
    if text == (b'null' if ct.PY3 else 'null'):  # urlopen returns bytes under PY3
        return None
    reg = re.compile(r'\,(.*?)\:')
    text = reg.sub(r',"\1":', text.decode('gbk') if ct.PY3 else text)
    text = text.replace('"{symbol', '{"symbol')
    text = text.replace('{symbol', '{"symbol"')
    if ct.PY3:
        jstr = json.dumps(text)
    else:
        jstr = json.dumps(text, encoding='GBK')
    js = json.loads(jstr)
    df = pd.DataFrame(pd.read_json(js, dtype={'code': object}),
                      columns=ct.DAY_TRADING_COLUMNS)
    df = df.drop('symbol', axis=1)
    df = df[df.volume > 0]  # keep only rows that actually traded
    return df

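# The Sina endpoints above return JavaScript object literals whose keys are
# not quoted, so they are not valid JSON; the regex substitution quotes every
# key that follows a comma, and the leading key is patched by hand. A
# self-contained demonstration on a made-up payload (field values are
# illustrative; values containing ':' or ',' would defeat the lazy regex):
def _json_repair_demo():
    import json
    import re
    raw = '[{symbol:"sh600848",code:"600848",trade:"10.5",volume:"1000"}]'
    fixed = re.sub(r'\,(.*?)\:', r',"\1":', raw)   # quote keys after commas
    fixed = fixed.replace('{symbol', '{"symbol"')  # quote the leading key
    return json.loads(fixed)                       # now parses as real JSON
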
def _get_forecast_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        html = lxml.html.parse(ct.FORECAST_URL % (ct.P_TYPE["http"], ct.DOMAINS["vsf"],
                                                  ct.PAGES["fd"], year, quarter,
                                                  pageNo, ct.PAGE_NUM[1]))
        res = html.xpath('//table[@class="list_table"]/tr')
        if ct.PY3:
            sarr = [etree.tostring(node).decode("utf-8") for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = "".join(sarr)
        sarr = sarr.replace("--", "0")
        sarr = "<table>%s</table>" % sarr
        df = pd.read_html(sarr)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r"\d+", nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass

def _dist_cotent(year, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            if pageNo > 0:
                ct._write_console()
            html = lxml.html.parse(rv.DP_163_URL % (ct.P_TYPE["http"], ct.DOMAINS["163"],
                                                    ct.PAGES["163dp"], year, pageNo))
            res = html.xpath('//div[@class="fn_rp_list"]/table')
            if ct.PY3:
                sarr = [etree.tostring(node).decode("utf-8") for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = "".join(sarr)
            df = pd.read_html(sarr, skiprows=[0])[0]
            df = df.drop(df.columns[0], axis=1)
            df.columns = rv.DP_163_COLS
            df["divi"] = df["plan"].map(_fun_divi)
            df["shares"] = df["plan"].map(_fun_into)
            df = df.drop("plan", axis=1)
            df["code"] = df["code"].astype(object)
            df["code"] = df["code"].map(lambda x: str(x).zfill(6))
            if pageNo == 0:
                page = html.xpath('//div[@class="mod_pages"]/a')
                asr = page[len(page) - 2]
                pages = asr.xpath("text()")
        except _network_error_classes:
            pass
        else:
            if pageNo == 0:
                return df, pages[0]
            else:
                return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)

def _get_forecast_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        gparser = etree.HTMLParser(encoding='GBK')
        html = lxml.html.parse(ct.FORECAST_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                  ct.PAGES['fd'], year, quarter,
                                                  pageNo, ct.PAGE_NUM[1]),
                               parser=gparser)
        res = html.xpath('//table[@class="list_table"]/tr')
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('--', '0')
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)

def _inst_detail(pageNo=1, retry_count=3, pause=0.001,
                 dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                 rv.LHB_KINDS[3], ct.PAGES['fd'],
                                                 '', pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id="dataTable"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_JGMX_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_detail(pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)

def _get_report_data(year, quarter, pageNo, dataArr, orderby):
    ct._write_console()
    try:
        # the default ordering returns duplicated and missing rows,
        # so the orderby parameter was added to stabilize the crawl
        request = Request(ct.REPORT_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                           ct.PAGES['fd'], year, quarter,
                                           pageNo, ct.PAGE_NUM[1], orderby))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath('//table[@class="list_table"]/tr')
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df = df.drop(11, axis=1)
        df.columns = ct.REPORT_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_report_data(year, quarter, pageNo, dataArr, orderby)
        else:
            return dataArr
    except Exception as e:
        print(e)

def _get_debtpaying_data(year, quarter, pageNo, dataArr):
    url = ct.DEBTPAYING_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                               ct.PAGES['fd'], year, quarter, pageNo,
                               ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath('//table[@class="list_table"]/tr')
        for trs in xtrs:
            code = trs.xpath('td[1]/a/text()')[0]
            name = trs.xpath('td[2]/a/text()')[0]
            currentratio = trs.xpath('td[3]/text()')[0]
            currentratio = '0' if currentratio == '--' else currentratio
            quickratio = trs.xpath('td[4]/text()')[0]
            quickratio = '0' if quickratio == '--' else quickratio
            cashratio = trs.xpath('td[5]/text()')[0]
            cashratio = '0' if cashratio == '--' else cashratio
            icratio = trs.xpath('td[6]/text()')[0]
            icratio = '0' if icratio == '--' else icratio
            sheqratio = trs.xpath('td[7]/text()')[0]
            sheqratio = '0' if sheqratio == '--' else sheqratio
            adratio = trs.xpath('td[8]/text()')[0]
            adratio = '0' if adratio == '--' else adratio
            dataArr.append([code, name, currentratio, quickratio, cashratio,
                            icratio, sheqratio, adratio])
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')  # get the next page
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_debtpaying_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass

def _get_cashflow_data(year, quarter, pageNo, dataArr):
    url = ct.CASHFLOW_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                             ct.PAGES['fd'], year, quarter, pageNo,
                             ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath('//table[@class="list_table"]/tr')
        for trs in xtrs:
            code = trs.xpath('td[1]/a/text()')[0]
            name = trs.xpath('td[2]/a/text()')[0]
            cf_sales = trs.xpath('td[3]/text()')[0]
            cf_sales = '0' if cf_sales == '--' else cf_sales
            rateofreturn = trs.xpath('td[4]/text()')[0]
            rateofreturn = '0' if rateofreturn == '--' else rateofreturn
            cf_nm = trs.xpath('td[5]/text()')[0]
            cf_nm = '0' if cf_nm == '--' else cf_nm
            cf_liabilities = trs.xpath('td[6]/text()')[0]
            cf_liabilities = '0' if cf_liabilities == '--' else cf_liabilities
            cashflowratio = trs.xpath('td[7]/text()')[0]
            cashflowratio = '0' if cashflowratio == '--' else cashflowratio
            dataArr.append([code, name, cf_sales, rateofreturn, cf_nm,
                            cf_liabilities, cashflowratio])
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')  # get the next page
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_cashflow_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass

def _get_operation_data(year, quarter, pageNo, dataArr):
    url = ct.OPERATION_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                              ct.PAGES['fd'], year, quarter, pageNo,
                              ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath('//table[@class="list_table"]/tr')
        for trs in xtrs:
            code = trs.xpath('td[1]/a/text()')[0]
            name = trs.xpath('td[2]/a/text()')[0]
            arturnover = trs.xpath('td[3]/text()')[0]
            arturnover = '0' if arturnover == '--' else arturnover
            arturndays = trs.xpath('td[4]/text()')[0]
            arturndays = '0' if arturndays == '--' else arturndays
            inventory_turnover = trs.xpath('td[5]/text()')[0]
            inventory_turnover = '0' if inventory_turnover == '--' else inventory_turnover
            inventory_days = trs.xpath('td[6]/text()')[0]
            inventory_days = '0' if inventory_days == '--' else inventory_days
            currentasset_turnover = trs.xpath('td[7]/text()')[0]
            currentasset_turnover = '0' if currentasset_turnover == '--' else currentasset_turnover
            currentasset_days = trs.xpath('td[8]/text()')[0]
            currentasset_days = '0' if currentasset_days == '--' else currentasset_days
            dataArr.append([code, name, arturnover, arturndays,
                            inventory_turnover, inventory_days,
                            currentasset_turnover, currentasset_days])
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')  # get the next page
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            # recurse into this same fetcher for the next page
            return _get_operation_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass

def _get_growth_data(year, quarter, pageNo, dataArr):
    url = ct.GROWTH_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                           ct.PAGES['fd'], year, quarter, pageNo,
                           ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath('//table[@class="list_table"]/tr')
        for trs in xtrs:
            code = trs.xpath('td[1]/a/text()')[0]
            name = trs.xpath('td[2]/a/text()')[0]
            mbrg = trs.xpath('td[3]/text()')[0]
            mbrg = '0' if mbrg == '--' else mbrg
            nprg = trs.xpath('td[4]/text()')[0]
            nprg = '0' if nprg == '--' else nprg
            nav = trs.xpath('td[5]/text()')[0]
            nav = '0' if nav == '--' else nav
            targ = trs.xpath('td[6]/text()')[0]
            targ = '0' if targ == '--' else targ
            epsg = trs.xpath('td[7]/text()')[0]
            epsg = '0' if epsg == '--' else epsg
            seg = trs.xpath('td[8]/text()')[0]
            seg = '0' if seg == '--' else seg
            dataArr.append([code, name, mbrg, nprg, nav, targ, epsg, seg])
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')  # get the next page
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_growth_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass

def _get_profit_data(year, quarter, pageNo, dataArr):
    url = ct.PROFIT_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                           ct.PAGES['fd'], year, quarter, pageNo,
                           ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath('//table[@class="list_table"]/tr')
        for trs in xtrs:
            code = trs.xpath('td[1]/a/text()')[0]
            name = trs.xpath('td[2]/a/text()')[0]
            roe = trs.xpath('td[3]/text()')[0]
            roe = '0' if roe == '--' else roe
            net_profit_ratio = trs.xpath('td[4]/text()')[0]
            net_profit_ratio = '0' if net_profit_ratio == '--' else net_profit_ratio
            gross_profit_rate = trs.xpath('td[5]/text()')[0]
            gross_profit_rate = '0' if gross_profit_rate == '--' else gross_profit_rate
            net_profits = trs.xpath('td[6]/text()')[0]
            net_profits = '0' if net_profits == '--' else net_profits
            eps = trs.xpath('td[7]/text()')[0]
            eps = '0' if eps == '--' else eps
            business_income = trs.xpath('td[8]/text()')[0]
            business_income = '0' if business_income == '--' else business_income
            bips = trs.xpath('td[9]/text()')[0]
            bips = '0' if bips == '--' else bips
            dataArr.append([code, name, roe, net_profit_ratio, gross_profit_rate,
                            net_profits, eps, business_income, bips])
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')  # get the next page
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_profit_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass

def _get_report_data(year, quarter, pageNo, dataArr):
    url = ct.REPORT_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                           ct.PAGES['fd'], year, quarter, pageNo,
                           ct.PAGE_NUM[1])
    ct._write_console()
    try:
        html = lxml.html.parse(url)
        xtrs = html.xpath('//table[@class="list_table"]/tr')
        for trs in xtrs:
            code = trs.xpath('td[1]//span/a/text()')[0]
            name = trs.xpath('td[2]/span/a/text()')[0]
            eps = trs.xpath('td[3]/text()')[0]           # earnings per share (CNY)
            eps_yoy = trs.xpath('td[4]/text()')[0]       # EPS year-on-year (%)
            bvps = trs.xpath('td[5]/text()')[0]          # net assets per share (CNY)
            bvps = '0' if bvps == '--' else bvps
            roe = trs.xpath('td[6]/text()')[0]           # return on equity (%)
            roe = '0' if roe == '--' else roe
            epcf = trs.xpath('td[7]/text()')[0]          # cash flow per share (CNY)
            epcf = '0' if epcf == '--' else epcf
            net_profits = trs.xpath('td[8]/text()')[0]   # net profit (10,000 CNY)
            profits_yoy = trs.xpath('td[9]/text()')[0]   # net profit year-on-year (%)
            distrib = trs.xpath('td[10]/text()')[0]      # distribution plan
            report_date = trs.xpath('td[11]/text()')[0]  # release date
            dataArr.append([code, name, eps, eps_yoy, bvps, roe, epcf,
                            net_profits, profits_yoy, distrib, report_date])
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')  # get the next page
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_report_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass

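# The td-by-td scrapers above repeat the same two-line cell read: take the
# first text node of the Nth <td>, then map the site's '--' placeholder to
# '0'. A hypothetical helper that would fold each pair into one call; the
# name _clean_cell and its default are assumptions, not part of this module:
def _clean_cell(trs, idx, default='0'):
    cell = trs.xpath('td[%d]/text()' % idx)      # trs is an lxml <tr> element
    val = cell[0] if len(cell) > 0 else default
    return default if val == '--' else val
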
def _get_report_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.REPORT_URL % (ct.P_TYPE["http"], ct.DOMAINS["vsf"],
                                           ct.PAGES["fd"], year, quarter,
                                           pageNo, ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode("GBK")
        text = text.replace("--", "")
        html = lxml.html.parse(StringIO(text))
        res = html.xpath('//table[@class="list_table"]/tr')
        if ct.PY3:
            sarr = [etree.tostring(node).decode("utf-8") for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = "".join(sarr)
        sarr = "<table>%s</table>" % sarr
        df = pd.read_html(sarr)[0]
        df = df.drop(11, axis=1)
        df.columns = ct.REPORT_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r"\d+", nextPage[0])[0]
            return _get_report_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)

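# All of the list_table scrapers page forward the same way: the last <a> in
# the pages <div> carries an onclick attribute holding the next page number,
# and re.findall(r'\d+', ...) pulls out its first integer. A self-contained
# sketch; the onclick text is a made-up example of the pattern:
def _next_page_demo():
    import re
    onclick = "page_submit(2)"              # illustrative attribute value
    nums = re.findall(r'\d+', onclick)
    return int(nums[0]) if nums else None   # None means no further page
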
def _sh_mx(data, date="", start="", end="", symbol="", pageNo="", beginPage="", endPage="", retry_count=3, pause=0.001): for _ in range(retry_count): time.sleep(pause) ct._write_console() try: tail = "&pageHelp.pageNo=%s&pageHelp.beginPage=%s&pageHelp.endPage=%s" % (pageNo, beginPage, endPage) if pageNo == "": pageNo = 6 tail = "" else: pageNo += 5 beginPage = pageNo endPage = pageNo + 4 ref = rv.MAR_SH_HZ_REF_URL % (ct.P_TYPE["http"], ct.DOMAINS["sse"]) clt = Client( rv.MAR_SH_MX_URL % ( ct.P_TYPE["http"], ct.DOMAINS["sseq"], ct.PAGES["qmd"], _random(5), date, symbol, start, end, tail, _random(), ), ref=ref, cookie=rv.MAR_SH_COOKIESTR, ) lines = clt.gvalue() lines = lines.decode("utf-8") if ct.PY3 else lines lines = lines[19:-1] lines = json.loads(lines) pagecount = int(lines["pageHelp"].get("pageCount")) datapage = int(pagecount / 5 + 1 if pagecount % 5 > 0 else pagecount / 5) if pagecount == 0: return data if pageNo == 6: ct._write_tips(lines["pageHelp"].get("total")) df = pd.DataFrame(lines["result"], columns=rv.MAR_SH_MX_COLS) df["opDate"] = df["opDate"].map(lambda x: "%s-%s-%s" % (x[0:4], x[4:6], x[6:8])) data = data.append(df, ignore_index=True) if beginPage < datapage * 5: data = _sh_mx( data, start=start, end=end, pageNo=pageNo, beginPage=beginPage, endPage=endPage, retry_count=retry_count, pause=pause, ) except _network_error_classes: pass else: return data raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _holding_cotent(start, end, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        if pageNo > 0:
            ct._write_console()
        try:
            request = Request(rv.FUND_HOLDS_URL % (ct.P_TYPE["http"], ct.DOMAINS["163"],
                                                   ct.PAGES["163fh"], ct.PAGES["163fh"],
                                                   pageNo, start, end, _random(5)))
            lines = urlopen(request, timeout=10).read()
            lines = lines.decode("utf-8") if ct.PY3 else lines
            lines = lines.replace("--", "0")
            lines = json.loads(lines)
            data = lines["list"]
            df = pd.DataFrame(data)
            df = df.drop(["CODE", "ESYMBOL", "EXCHANGE", "NAME", "RN",
                          "SHANGQIGUSHU", "SHANGQISHIZHI", "SHANGQISHULIANG"],
                         axis=1)
            for col in ["GUSHU", "GUSHUBIJIAO", "SHIZHI", "SCSTC27"]:
                df[col] = df[col].astype(float)
            df["SCSTC27"] = df["SCSTC27"] * 100
            df["GUSHU"] = df["GUSHU"] / 10000
            df["GUSHUBIJIAO"] = df["GUSHUBIJIAO"] / 10000
            df["SHIZHI"] = df["SHIZHI"] / 10000
            df["GUSHU"] = df["GUSHU"].map(ct.FORMAT)
            df["GUSHUBIJIAO"] = df["GUSHUBIJIAO"].map(ct.FORMAT)
            df["SHIZHI"] = df["SHIZHI"].map(ct.FORMAT)
            df["SCSTC27"] = df["SCSTC27"].map(ct.FORMAT)
            df.columns = rv.FUND_HOLDS_COLS
            df = df[["code", "name", "date", "nums", "nlast", "count",
                     "clast", "amount", "ratio"]]
        except _network_error_classes:
            pass
        else:
            if pageNo == 0:
                return df, int(lines["pagecount"])
            else:
                return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)

def _sh_mx(data, date='', start='', end='', symbol='', pageNo='',
           beginPage='', endPage='', retry_count=3, pause=0.001):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            tail = '&pageHelp.pageNo=%s&pageHelp.beginPage=%s&pageHelp.endPage=%s' % (pageNo, beginPage, endPage)
            if pageNo == '':
                pageNo = 6
                tail = ''
            else:
                pageNo += 5
            beginPage = pageNo
            endPage = pageNo + 4
            url = rv.MAR_SH_MX_URL % (ct.P_TYPE['http'], ct.DOMAINS['sseq'],
                                      ct.PAGES['qmd'], _random(5), date,
                                      symbol, start, end, tail, _random())
            ref = rv.MAR_SH_HZ_REF_URL % (ct.P_TYPE['http'], ct.DOMAINS['sse'])
            clt = Client(url, ref=ref, cookie=rv.MAR_SH_COOKIESTR)
            lines = clt.gvalue()
            lines = lines.decode('utf-8') if ct.PY3 else lines
            lines = lines[19:-1]
            lines = json.loads(lines)
            pagecount = int(lines['pageHelp'].get('pageCount'))
            datapage = int(pagecount / 5 + 1 if pagecount % 5 > 0 else pagecount / 5)
            if pagecount == 0:
                return data
            if pageNo == 6:
                ct._write_tips(lines['pageHelp'].get('total'))
            df = pd.DataFrame(lines['result'], columns=rv.MAR_SH_MX_COLS)
            df['opDate'] = df['opDate'].map(lambda x: '%s-%s-%s' % (x[0:4], x[4:6], x[6:8]))
            df = df.set_index('opDate')
            data = data.append(df, ignore_index=True)
            if beginPage < datapage * 5:
                data = _sh_mx(data, start=start, end=end, pageNo=pageNo,
                              beginPage=beginPage, endPage=endPage,
                              retry_count=retry_count, pause=pause)
        except _network_error_classes:
            pass
        else:
            return data
    raise IOError(ct.NETWORK_URL_ERROR_MSG)

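# _sh_mx walks the SSE margin-detail listing five pages per request: the
# first call uses the server default (pages 1-5) and sets the pageNo cursor
# to 6, then each recursion requests the (beginPage, endPage) window before
# advancing the cursor by 5. A pure-Python sketch of the window sequence
# implied by the datapage formula above (the helper name is an assumption):
def _sh_mx_windows(pagecount):
    datapage = pagecount // 5 + 1 if pagecount % 5 > 0 else pagecount // 5
    windows, beginPage = [], 6   # pages 1-5 come from the default request
    while beginPage < datapage * 5:
        windows.append((beginPage, beginPage + 4))  # (beginPage, endPage)
        beginPage += 5
    return windows
# e.g. _sh_mx_windows(12) -> [(6, 10), (11, 15)]
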
def _day_cinema(date=None, pNo=1, retry_count=3, pause=0.001):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(ct.BOXOFFICE_CBD % (ct.P_TYPE['http'], ct.DOMAINS['mbox'],
                                                  ct.BOX, pNo, date))
            lines = urlopen(request, timeout=10).read()
            if len(lines) < 15:  # no data
                return None
        except Exception as e:
            print(e)
        else:
            js = json.loads(lines.decode('utf-8') if ct.PY3 else lines)
            df = pd.DataFrame(js['data1'])
            df = df.drop(['CinemaID'], axis=1)
            return df

def _sz_hz(date="", retry_count=3, pause=0.001): for _ in range(retry_count): time.sleep(pause) ct._write_console() try: request = Request(rv.MAR_SZ_HZ_URL % (ct.P_TYPE["http"], ct.DOMAINS["szse"], ct.PAGES["szsefc"], date)) lines = urlopen(request, timeout=10).read() if len(lines) <= 200: return pd.DataFrame() df = pd.read_html(lines, skiprows=[0])[0] df.columns = rv.MAR_SZ_HZ_COLS df["opDate"] = date except: pass else: return df raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _profit_divis(pageNo, dataArr, nextPage):
    ct._write_console()
    html = lxml.html.parse('%sdata.cfi.cn/%s' % (ct.P_TYPE['http'], nextPage))
    res = html.xpath('//table[@class="table_data"]/tr')
    if ct.PY3:
        sarr = [etree.tostring(node).decode('utf-8') for node in res]
    else:
        sarr = [etree.tostring(node) for node in res]
    sarr = ''.join(sarr)
    sarr = sarr.replace('--', '0')
    sarr = '<table>%s</table>' % sarr
    df = pd.read_html(sarr, skiprows=[0])[0]
    dataArr = dataArr.append(df, ignore_index=True)
    nextPage = html.xpath('//div[@id="content"]/div[2]/a[last()]/@href')[0]
    np = nextPage.split('&')[2].split('=')[1]
    if pageNo < int(np):
        return _profit_divis(int(np), dataArr, nextPage)
    else:
        return dataArr

def _broker_tops_detail(last=5, pageNo=1, retry_count=3, pause=0.001,
                        dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                 rv.LHB_KINDS[1], ct.PAGES['fd'],
                                                 last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id="dataTable"]/tr/td/a')
            urls = [a.attrib['href'] for a in res]
            if len(urls) <= 0:
                return dataArr
            for url in urls:
                ct._write_console()
                time.sleep(pause)
                request2 = Request(url)
                text2 = urlopen(request2, timeout=10).read()
                html2 = lxml.html.parse(StringIO(text2.decode('GBK')))
                res2 = html2.xpath('//table[@id="dataTable"]/tr')
                broker = html2.xpath('//div[@class="page_config"]')[0] \
                              .text_content().strip().split(':')[1].strip()
                ct._write_msg(broker + "\r\n")
                if ct.PY3:
                    sarr = [etree.tostring(node).decode('utf-8') for node in res2]
                else:
                    sarr = [etree.tostring(node) for node in res2]
                sarr = ''.join(sarr)
                sarr = '<table>%s</table>' % sarr
                df = pd.read_html(sarr)[0]
                df.columns = rv.LHB_YYMX_COLS
                df['broker'] = pd.Series([broker] * df.index.size,
                                         index=list(df.index))
                dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _broker_tops_detail(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)

def get_h_data(code, start=None, end=None, autype='qfq',
               retry_count=3, pause=0.001):
    '''
    Fetch adjusted historical price data.

    Parameters
    ------
      code:string
                  stock code, e.g. 600848
      start:string
                  start date, format YYYY-MM-DD; defaults to one year ago today
      end:string
                  end date, format YYYY-MM-DD; defaults to today
      autype:string
                  adjustment type: qfq (forward-adjusted), hfq (backward-adjusted),
                  None (unadjusted); defaults to qfq
      retry_count : int, default 3
                  number of retries after network failures
      pause : int, default 0
                  seconds to sleep between retries, to avoid issues caused by
                  overly tight request spacing
    return
    -------
      DataFrame
          date   trade date (index)
          open   opening price
          high   highest price
          close  closing price
          low    lowest price
          volume traded volume
          amount turnover amount
    '''
    start = du.today_last_year() if start is None else start
    end = du.today() if end is None else end
    qs = du.get_quarts(start, end)
    qt = qs[0]
    ct._write_head()
    data = _parse_fq_data(ct.HIST_FQ_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                            code, qt[0], qt[1]),
                          retry_count, pause)
    if len(qs) > 1:
        for d in range(1, len(qs)):
            qt = qs[d]
            ct._write_console()
            url = ct.HIST_FQ_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                    code, qt[0], qt[1])
            df = _parse_fq_data(url, retry_count, pause)
            data = data.append(df, ignore_index=True)
    data = data.drop_duplicates('date')
    if start is not None:
        data = data[data.date >= start]
    if end is not None:
        data = data[data.date <= end]
    if autype == 'hfq':
        data = data.drop('factor', axis=1)
        for label in ['open', 'high', 'close', 'low']:
            data[label] = data[label].map(ct.FORMAT)
        data = data.set_index('date')
        data = data.sort_index(ascending=False)
        return data
    else:
        for label in ['open', 'high', 'close', 'low']:
            data[label] = data[label] / data['factor']
        data = data.drop('factor', axis=1)
        if autype == 'qfq':
            df = _parase_fq_factor(code, start, end)
            df = df.drop_duplicates('date')
            df = df[df.date >= start]
            df = df[df.date <= end]
            df = pd.merge(data, df)
            df = df.sort_values('date', ascending=False)
            frow = df.head(1)
            rate = float(frow['close']) / float(frow['factor'])
            df['close_temp'] = df['close']
            df['close'] = rate * df['factor']
            for label in ['open', 'high', 'low']:
                df[label] = df[label] * (df['close'] / df['close_temp'])
                df[label] = df[label].map(ct.FORMAT)
            df = df.drop(['factor', 'close_temp'], axis=1)
            df['close'] = df['close'].map(ct.FORMAT)
            df = df.set_index('date')
            df = df.sort_index(ascending=False)
            df = df.astype(float)
            return df
        else:
            for label in ['open', 'high', 'close', 'low']:
                data[label] = data[label].map(ct.FORMAT)
            data = data.set_index('date')
            data = data.sort_index(ascending=False)
            data = data.astype(float)
            return data

def get_h_data(code, start=None, end=None, autype='qfq', index=False,
               retry_count=3, pause=0.001, drop_factor=True):
    '''
    Fetch adjusted historical price data.

    Parameters
    ------
      code:string
                  stock code, e.g. 600848
      start:string
                  start date, format YYYY-MM-DD; defaults to one year ago today
      end:string
                  end date, format YYYY-MM-DD; defaults to today
      autype:string
                  adjustment type: qfq (forward-adjusted), hfq (backward-adjusted),
                  None (unadjusted); defaults to qfq
      index:bool
                  whether code refers to a market index
      retry_count : int, default 3
                  number of retries after network failures
      pause : int, default 0
                  seconds to sleep between retries, to avoid issues caused by
                  overly tight request spacing
      drop_factor : bool, default True
                  whether to drop the adjustment-factor column; the factor is
                  often irrelevant during analysis, but keeping it is more
                  flexible if the data is stored in a database before analysis
    return
    -------
      DataFrame
          date   trade date (index)
          open   opening price
          high   highest price
          close  closing price
          low    lowest price
          volume traded volume
          amount turnover amount
    '''
    print("This interface is about to stop being updated; please switch to the Pro API: https://waditu.com/document/2")
    start = du.today_last_year() if start is None else start
    end = du.today() if end is None else end
    qs = du.get_quarts(start, end)
    qt = qs[0]
    ct._write_head()
    data = _parse_fq_data(_get_index_url(index, code, qt), index,
                          retry_count, pause)
    if data is None:
        data = pd.DataFrame()
    if len(qs) > 1:
        for d in range(1, len(qs)):
            qt = qs[d]
            ct._write_console()
            df = _parse_fq_data(_get_index_url(index, code, qt), index,
                                retry_count, pause)
            if df is None:  # df may be empty; stop paging
                break
            else:
                data = data.append(df, ignore_index=True)
    if len(data) == 0 or len(data[(data.date >= start) & (data.date <= end)]) == 0:
        return pd.DataFrame()
    data = data.drop_duplicates('date')
    if index:
        data = data[(data.date >= start) & (data.date <= end)]
        data = data.set_index('date')
        data = data.sort_index(ascending=False)
        return data
    if autype == 'hfq':
        if drop_factor:
            data = data.drop('factor', axis=1)
        data = data[(data.date >= start) & (data.date <= end)]
        for label in ['open', 'high', 'close', 'low']:
            data[label] = data[label].map(ct.FORMAT)
            data[label] = data[label].astype(float)
        data = data.set_index('date')
        data = data.sort_index(ascending=False)
        return data
    else:
        if autype == 'qfq':
            if drop_factor:
                data = data.drop('factor', axis=1)
            df = _parase_fq_factor(code, start, end)
            df = df.drop_duplicates('date')
            df = df.sort_values('date', ascending=False)
            firstDate = data.head(1)['date']
            frow = df[df.date == firstDate[0]]
            rt = get_realtime_quotes(code)
            if rt is None:
                return pd.DataFrame()
            if ((float(rt['high']) == 0) & (float(rt['low']) == 0)):
                preClose = float(rt['pre_close'])
            else:
                if du.is_holiday(du.today()):
                    preClose = float(rt['price'])
                else:
                    if (du.get_hour() > 9) & (du.get_hour() < 18):
                        preClose = float(rt['pre_close'])
                    else:
                        preClose = float(rt['price'])
            rate = float(frow['factor']) / preClose
            data = data[(data.date >= start) & (data.date <= end)]
            for label in ['open', 'high', 'low', 'close']:
                data[label] = data[label] / rate
                data[label] = data[label].map(ct.FORMAT)
                data[label] = data[label].astype(float)
            data = data.set_index('date')
            data = data.sort_index(ascending=False)
            return data
        else:
            for label in ['open', 'high', 'close', 'low']:
                data[label] = data[label] / data['factor']
            if drop_factor:
                data = data.drop('factor', axis=1)
            data = data[(data.date >= start) & (data.date <= end)]
            for label in ['open', 'high', 'close', 'low']:
                data[label] = data[label].map(ct.FORMAT)
            data = data.set_index('date')
            data = data.sort_index(ascending=False)
            data = data.astype(float)
            return data

def get_h_data(code, start=None, end=None, autype='qfq', index=False,
               retry_count=3, pause=0.001):
    '''
    Fetch adjusted historical price data.

    Parameters
    ------
      code:string
                  stock code, e.g. 600848
      start:string
                  start date, format YYYY-MM-DD; defaults to one year ago today
      end:string
                  end date, format YYYY-MM-DD; defaults to today
      autype:string
                  adjustment type: qfq (forward-adjusted), hfq (backward-adjusted),
                  None (unadjusted); defaults to qfq
      index:bool
                  whether code refers to a market index
      retry_count : int, default 3
                  number of retries after network failures
      pause : int, default 0
                  seconds to sleep between retries, to avoid issues caused by
                  overly tight request spacing
    return
    -------
      DataFrame
          date   trade date (index)
          open   opening price
          high   highest price
          close  closing price
          low    lowest price
          volume traded volume
          amount turnover amount
    '''
    start = du.today_last_year() if start is None else start
    end = du.today() if end is None else end
    qs = du.get_quarts(start, end)
    qt = qs[0]
    ct._write_head()
    data = _parse_fq_data(_get_index_url(index, code, qt), index,
                          retry_count, pause)
    if len(qs) > 1:
        for d in range(1, len(qs)):
            qt = qs[d]
            ct._write_console()
            df = _parse_fq_data(_get_index_url(index, code, qt), index,
                                retry_count, pause)
            data = data.append(df, ignore_index=True)
    if len(data) == 0 or len(data[(data.date >= start) & (data.date <= end)]) == 0:
        return None
    data = data.drop_duplicates('date')
    if index:
        data = data[(data.date >= start) & (data.date <= end)]
        data = data.set_index('date')
        data = data.sort_index(ascending=False)
        return data
    if autype == 'hfq':
        data = data.drop('factor', axis=1)
        data = data[(data.date >= start) & (data.date <= end)]
        for label in ['open', 'high', 'close', 'low']:
            data[label] = data[label].map(ct.FORMAT)
            data[label] = data[label].astype(float)
        data = data.set_index('date')
        data = data.sort_index(ascending=False)
        return data
    else:
        if autype == 'qfq':
            data = data.drop('factor', axis=1)
            df = _parase_fq_factor(code, start, end)
            df = df.drop_duplicates('date')
            df = df.sort_values('date', ascending=False)
            frow = df.head(1)
            rt = get_realtime_quotes(code)
            if rt is None:
                return None
            if ((float(rt['high']) == 0) & (float(rt['low']) == 0)):
                preClose = float(rt['pre_close'])
            else:
                if du.is_holiday(du.today()):
                    preClose = float(rt['price'])
                else:
                    if (du.get_hour() > 9) & (du.get_hour() < 18):
                        preClose = float(rt['pre_close'])
                    else:
                        preClose = float(rt['price'])
            rate = float(frow['factor']) / preClose
            data = data[(data.date >= start) & (data.date <= end)]
            for label in ['open', 'high', 'low', 'close']:
                data[label] = data[label] / rate
                data[label] = data[label].map(ct.FORMAT)
                data[label] = data[label].astype(float)
            data = data.set_index('date')
            data = data.sort_index(ascending=False)
            return data
        else:
            for label in ['open', 'high', 'close', 'low']:
                data[label] = data[label] / data['factor']
            data = data.drop('factor', axis=1)
            data = data[(data.date >= start) & (data.date <= end)]
            for label in ['open', 'high', 'close', 'low']:
                data[label] = data[label].map(ct.FORMAT)
            data = data.set_index('date')
            data = data.sort_index(ascending=False)
            data = data.astype(float)
            return data

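# A worked sketch of the qfq arithmetic used in the two newer get_h_data
# variants above: rate = latest adjustment factor / previous close, and every
# price column is divided by that rate. All numbers are made up purely for
# illustration, and the helper name is an assumption:
def _qfq_demo():
    import pandas as pd
    data = pd.DataFrame({'open': [19.8, 19.5, 19.0],    # newest row first
                         'close': [20.0, 19.6, 19.2]})
    latest_factor = 2.0   # factor on the most recent trade date
    pre_close = 10.0      # assumed pre_close from a realtime quote
    rate = latest_factor / pre_close
    for label in ['open', 'close']:
        data[label] = data[label] / rate
    return data
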