def get_czce_receipt_1(date: str = None, vars_list: List = cons.contract_symbols): """ 抓取郑州商品交易所注册仓单数据 适用20080222至20100824(包括) :param date: 开始日期 format:YYYY-MM-DD 或 YYYYMMDD 或 datetime.date对象 为空时为当天 :type date: str :param vars_list: list :type vars_list: 合约品种如CF、TA等列表 为空时为所有商品 :return: 展期收益率数据 :rtype: pandas.DataFrame var 商品品种 string receipt 仓单数 int date 日期 string YYYYMMDD """ date = cons.convert_date(date).strftime( '%Y%m%d') if date is not None else datetime.date.today() if date not in calendar: warnings.warn('%s非交易日' % date.strftime('%Y%m%d')) return None if date == '20090820': return pd.DataFrame() url = cons.CZCE_RECEIPT_URL_1 % date r = requests_link(url, encoding='utf-8') context = r.text data = pd.read_html(context)[1] records = pd.DataFrame() indexes = [x for x in data.index if '品种:' in str(data[0].tolist()[x])] ends = [x for x in data.index if '总计' in str(data[0].tolist()[x])] for i in list(range(len(indexes))): if i != len(indexes) - 1: data_cut = data.loc[indexes[i]:ends[i], :] data_cut = data_cut.fillna(method='pad') else: data_cut = data.loc[indexes[i]:, :] data_cut = data_cut.fillna(method='pad') if 'PTA' in data_cut[0].tolist()[0]: var = 'TA' else: var = chinese_to_english( re.sub(r'[A-Z]+', '', data_cut[0].tolist()[0][3:])) if var == 'CF': receipt = data_cut[6].tolist()[-1] receipt_chg = data_cut[7].tolist()[-1] else: receipt = data_cut[5].tolist()[-1] receipt_chg = data_cut[6].tolist()[-1] data_dict = { 'var': var, 'receipt': int(receipt), 'receipt_chg': int(receipt_chg), 'date': date } records = records.append(pd.DataFrame(data_dict, index=[0])) if len(records.index) != 0: records.index = records['var'] vars_in_market = [i for i in vars_list if i in records.index] records = records.loc[vars_in_market, :] return records.reset_index(drop=True)
def get_czce_receipt_2(date: str = None, vars_list: List = cons.contract_symbols): """ 抓取郑州商品交易所注册仓单数据 适用20100825(包括)至20151111(包括) Parameters ------ date: 开始日期 format:YYYY-MM-DD 或 YYYYMMDD 或 datetime.date对象 为空时为当天 vars_list: 合约品种如CF、TA等列表 为空时为所有商品 Return ------- DataFrame: 展期收益率数据(DataFrame): var 商品品种 string receipt 仓单数 int date 日期 string YYYYMMDD """ date = cons.convert_date(date).strftime( '%Y%m%d') if date is not None else datetime.date.today() if date not in calendar: warnings.warn('%s非交易日' % date.strftime('%Y%m%d')) return None url = cons.CZCE_RECEIPT_URL_2 % (date[:4], date) r = requests.get(url) r.encoding = 'utf-8' data = pd.read_html(r.text)[3:] records = pd.DataFrame() for data_cut in data: if len(data_cut.columns) > 3: last_indexes = [ x for x in data_cut.index if '注:' in str(data_cut[0].tolist()[x]) ] if len(last_indexes) > 0: last_index = last_indexes[0] - 1 data_cut = data_cut.loc[:last_index, :] if 'PTA' in data_cut[0].tolist()[0]: var = 'TA' else: strings = data_cut[0].tolist()[0] string = strings.split(' ')[0][3:] var = chinese_to_english(re.sub(r'[A-Z]+', '', string)) data_cut.columns = data_cut.T[1].tolist() receipt = data_cut['仓单数量'].tolist()[-1] receipt_chg = data_cut['当日增减'].tolist()[-1] data_dict = { 'var': var, 'receipt': int(receipt), 'receipt_chg': int(receipt_chg), 'date': date } records = records.append(pd.DataFrame(data_dict, index=[0])) if len(records.index) != 0: records.index = records['var'] vars_in_market = [i for i in vars_list if i in records.index] records = records.loc[vars_in_market, :] return records.reset_index(drop=True)
def get_shfe_receipt_1(date: str = None, vars_list: List = cons.contract_symbols): """ 抓取上海期货交易所注册仓单数据, 适用20081006至20140518(包括) 20100126、20101029日期交易所格式混乱,直接回复脚本中DataFrame, 20100416、20130821日期交易所数据丢失 :param date: 开始日期 format:YYYY-MM-DD 或 YYYYMMDD 或 datetime.date对象 为空时为当天 :param vars_list: 合约品种如RB、AL等列表 为空时为所有商品 :return: pd.DataFrame 展期收益率数据(DataFrame): var 商品品种 string receipt 仓单数 int date 日期 string YYYYMMDD """ date = cons.convert_date(date).strftime( '%Y%m%d') if date is not None else datetime.date.today() if date not in calendar: warnings.warn(f"{date.strftime('%Y%m%d')}非交易日") return None if date == '20100126': shfe_20100126['date'] = date return shfe_20100126 elif date == '20101029': shfe_20101029['date'] = date return shfe_20101029 elif date in ['20100416', '20130821']: return warnings.warn('20100416、20130821日期交易所数据丢失') else: var_list = [ '天然橡胶', '沥青仓库', '沥青厂库', '热轧卷板', '燃料油', '白银', '线材', '螺纹钢', '铅', '铜', '铝', '锌', '黄金', '锡', '镍' ] url = cons.SHFE_RECEIPT_URL_1 % date data = pandas_read_html_link(url)[0] indexes = [x for x in data.index if (data[0].tolist()[x] in var_list)] last_index = [ x for x in data.index if '注' in str(data[0].tolist()[x]) ][0] - 1 records = pd.DataFrame() for i in list(range(len(indexes))): if i != len(indexes) - 1: data_cut = data.loc[indexes[i]:indexes[i + 1] - 1, :] else: data_cut = data.loc[indexes[i]:last_index, :] data_cut = data_cut.fillna(method='pad') data_dict = dict() data_dict['var'] = chinese_to_english(data_cut[0].tolist()[0]) data_dict['receipt'] = int(data_cut[2].tolist()[-1]) data_dict['receipt_chg'] = int(data_cut[3].tolist()[-1]) data_dict['date'] = date records = records.append(pd.DataFrame(data_dict, index=[0])) if len(records.index) != 0: records.index = records['var'] vars_in_market = [i for i in vars_list if i in records.index] records = records.loc[vars_in_market, :] return records.reset_index(drop=True)
def get_dce_receipt(date: str = None, symbol_list: List = cons.contract_symbols): """ 完成 采集大连商品交易所注册仓单数据 :param date: format 开始日期: YYYY-MM-DD 或 YYYYMMDD 或 datetime.date对象, 为空时为当天 :param symbol_list: 合约品种如 RB, AL等列表, 为空时为所有商品数据从 20060106开始,每周五更新仓单数据。直到20090407起,每交易日都更新仓单数据 :return: pd.DataFrame 展期收益率数据(DataFrame): var 商品品种 string receipt 仓单数 int date 日期 string YYYYMMDD """ if not isinstance(symbol_list, list): return warnings.warn(f"symbol_list: 必须是列表") date = cons.convert_date( date) if date is not None else datetime.date.today() if date.strftime('%Y%m%d') not in calendar: warnings.warn(f"{date.strftime('%Y%m%d')}非交易日") return None payload = { "weekQuotes.variety": "all", "year": date.year, "month": date.month - 1, # 网站月份描述少 1 个月, 属于网站问题 "day": date.day } data = pandas_read_html_link(cons.DCE_RECEIPT_URL, method="post", data=payload, headers=cons.dce_headers)[0] records = pd.DataFrame() for x in data.to_dict(orient='records'): if isinstance(x['品种'], str): if x['品种'][-2:] == '小计': var = x['品种'][:-2] temp_data = { 'var': chinese_to_english(var), 'receipt': int(x['今日仓单量']), 'receipt_chg': int(x['增减']), 'date': date.strftime('%Y%m%d') } records = records.append(pd.DataFrame(temp_data, index=[0])) if len(records.index) != 0: records.index = records['var'] vars_in_market = [i for i in symbol_list if i in records.index] records = records.loc[vars_in_market, :] return records.reset_index(drop=True)
def get_shfe_receipt_2(date: str = None, vars_list: List = cons.contract_symbols): """ 抓取上海商品交易所注册仓单数据 适用20140519(包括)至今 Parameters ------ date: 开始日期 format:YYYY-MM-DD 或 YYYYMMDD 或 datetime.date对象 为空时为当天 vars_list: 合约品种如RB、AL等列表 为空时为所有商品 Return ------- DataFrame: 展期收益率数据(DataFrame): var 商品品种 string receipt 仓单数 int date 日期 string YYYYMMDD """ date = cons.convert_date(date).strftime( '%Y%m%d') if date is not None else datetime.date.today() if date not in calendar: warnings.warn('%s非交易日' % date.strftime('%Y%m%d')) return None url = cons.SHFE_RECEIPT_URL_2 % date r = requests_link(url, encoding='utf-8') try: context = r.json() except: return pd.DataFrame() data = pd.DataFrame(context['o_cursor']) if len(data.columns) < 1: return pd.DataFrame() records = pd.DataFrame() for var in set(data['VARNAME'].tolist()): data_cut = data[data['VARNAME'] == var] data_dict = { 'var': chinese_to_english(re.sub(r"\W|[a-zA-Z]", "", var)), 'receipt': int(data_cut['WRTWGHTS'].tolist()[-1]), 'receipt_chg': int(data_cut['WRTCHANGE'].tolist()[-1]), 'date': date } records = records.append(pd.DataFrame(data_dict, index=[0])) if len(records.index) != 0: records.index = records['var'] vars_in_market = [i for i in vars_list if i in records.index] records = records.loc[vars_in_market, :] return records.reset_index(drop=True)
def _check_information(df_data, date): """ 进行数据验证和计算模块 :param df_data: pandas.DataFrame 采集的数据 :param date: datetime.date 具体某一天 YYYYMMDD :return: pandas.DataFrame 中间数据 symbol spot_price near_contract ... near_basis_rate dom_basis_rate date CU 49620.00 cu1811 ... -0.002418 -0.003426 20181108 RB 4551.54 rb1811 ... -0.013521 -0.134359 20181108 ZN 22420.00 zn1811 ... -0.032114 -0.076271 20181108 AL 13900.00 al1812 ... 0.005396 0.003957 20181108 AU 274.10 au1811 ... 0.005655 0.020430 20181108 WR 4806.25 wr1903 ... -0.180026 -0.237035 20181108 RU 10438.89 ru1811 ... -0.020969 0.084406 20181108 PB 18600.00 pb1811 ... -0.001344 -0.010215 20181108 AG 3542.67 ag1811 ... -0.000754 0.009408 20181108 BU 4045.53 bu1811 ... -0.129904 -0.149679 20181108 HC 4043.33 hc1811 ... -0.035449 -0.088128 20... """ df_data = df_data.loc[:, [0, 1, 2, 3, 5, 6]] df_data.columns = [ "symbol", "spot_price", "near_contract", "near_contract_price", "dominant_contract", "dominant_contract_price", ] records = pd.DataFrame() for string in df_data["symbol"].tolist(): if string == "PTA": news = "PTA" else: news = "".join(re.findall(r"[\u4e00-\u9fa5]", string)) if news != "" and news not in [ "商品", "价格", "上海期货交易所", "郑州商品交易所", "大连商品交易所" ]: symbol = chinese_to_english(news) record = pd.DataFrame(df_data[df_data["symbol"] == string]) record.loc[:, "symbol"] = symbol record.loc[:, "spot_price"] = record.loc[:, "spot_price"].astype(float) if (symbol == "JD" ): # 鸡蛋现货为元/公斤, 鸡蛋期货为元/500千克, 其余元/吨(http://www.100ppi.com/sf/) record.loc[:, "spot_price"] = float(record["spot_price"]) * 500 elif ( symbol == "FG" ): # 上表中现货单位为元/平方米, 期货单位为元/吨. 换算公式:元/平方米*80=元/吨(http://www.100ppi.com/sf/959.html) record.loc[:, "spot_price"] = float(record["spot_price"]) * 80 records = records.append(record) records.loc[:, [ "near_contract_price", "dominant_contract_price", "spot_price" ]] = records.loc[:, [ "near_contract_price", "dominant_contract_price", "spot_price" ]].astype("float") records.loc[:, "near_contract"] = records["near_contract"].replace( r"[^0-9]*(\d*)$", r"\g<1>", regex=True) records.loc[:, "dominant_contract"] = records["dominant_contract"].replace( r"[^0-9]*(\d*)$", r"\g<1>", regex=True) records.loc[:, "near_contract"] = records[ "symbol"] + records.loc[:, "near_contract"].astype("int").astype("str") records.loc[:, "dominant_contract"] = records[ "symbol"] + records.loc[:, "dominant_contract"].astype("int").astype( "str") records["near_contract"] = records["near_contract"].apply( lambda x: x.lower() if x[:-4] in cons.market_exchange_symbols[ "shfe"] + cons.market_exchange_symbols["dce"] else x) records.loc[:, "dominant_contract"] = records.loc[:, "dominant_contract"].apply( lambda x: x.lower() if x[:-4] in cons. market_exchange_symbols[ "shfe"] + cons. market_exchange_symbols[ "dce"] else x) records.loc[:, "near_contract"] = records.loc[:, "near_contract"].apply( lambda x: x[:-4] + x[-3:] if x[:-4] in cons.market_exchange_symbols["czce"] else x) records.loc[:, "dominant_contract"] = records.loc[:, "dominant_contract"].apply( lambda x: x[:-4] + x[-3: ] if x[:-4] in cons. market_exchange_symbols[ "czce"] else x) records[ "near_basis"] = records["near_contract_price"] - records["spot_price"] records["dom_basis"] = records["dominant_contract_price"] - records[ "spot_price"] records["near_basis_rate"] = ( records["near_contract_price"] / records["spot_price"] - 1) records["dom_basis_rate"] = ( records["dominant_contract_price"] / records["spot_price"] - 1) records.loc[:, "date"] = date.strftime("%Y%m%d") return records
def get_czce_rank_table(date=None, vars_list=cons.contract_symbols): """ 郑州商品交易所前 20 会员持仓排名数据明细 注:该交易所既公布了品种排名, 也公布了标的排名 :param date: 日期 format:YYYY-MM-DD 或 YYYYMMDD 或 datetime.date对象 为空时为当天 :param vars_list: 合约品种如RB、AL等列表 为空时为所有商品, 数据从20050509开始,每交易日16:30左右更新数据 :return: pd.DataFrame rank 排名 int vol_party_name 成交量排序的当前名次会员 string(中文) vol 该会员成交量 int vol_chg 该会员成交量变化量 int long_party_name 持多单排序的当前名次会员 string(中文) long_open_interest 该会员持多单 int long_open_interest_chg 该会员持多单变化量 int short_party_name 持空单排序的当前名次会员 string(中文) short_open_interest 该会员持空单 int short_open_interest_chg 该会员持空单变化量 int symbol 标的合约 string var 品种 string date 日期 string YYYYMMDD """ date = cons.convert_date(date) if date is not None else datetime.date.today() if date < datetime.date(2005, 5, 9): print("czce数据源开始日期为20050509,跳过") return {} if date.strftime('%Y%m%d') not in calendar: warnings.warn('%s非交易日' % date.strftime('%Y%m%d')) return {} if date <= datetime.date(2010, 8, 25): url = cons.CZCE_VOL_RANK_URL_1 % (date.strftime('%Y%m%d')) data = _czce_df_read(url, skip_rows=0) r = requests_link(url, 'utf-8') r.encoding = 'utf-8' soup = BeautifulSoup(r.text, 'lxml', from_encoding="gb2312") symbols = [] for link in soup.find_all('b'): strings = (str(link).split(' ')) if len(strings) > 5: try: symbol = chinese_to_english(strings[4]) except: symbol = strings[4] symbols.append(symbol) big_dict = {} for i in range(len(symbols)): symbol = symbols[i] table_cut = data[i + 2] table_cut.columns = rank_columns table_cut = table_cut.iloc[:-1, :] table_cut.loc[:, 'rank'] = table_cut.index table_cut.loc['合计', 'rank'] = 999 table_cut.loc['合计', ['vol_party_name', 'long_party_name', 'short_party_name']] = None table_cut.loc[:, 'symbol'] = symbol table_cut.loc[:, 'variety'] = symbol_varieties(symbol) table_cut[intColumns] = table_cut[intColumns].fillna(0) table_cut[intColumns] = table_cut[intColumns].astype(str) table_cut[intColumns] = table_cut[intColumns].applymap(lambda x: x.replace(',', '')) table_cut = table_cut.applymap(lambda x: 0 if x == '-' else x) table_cut[intColumns] = table_cut[intColumns].astype(float) table_cut[intColumns] = table_cut[intColumns].astype(int) big_dict[symbol] = table_cut.reset_index(drop=True) return big_dict elif date <= datetime.date(2015, 11, 11): url = cons.CZCE_VOL_RANK_URL_2 % (date.year, date.strftime('%Y%m%d')) data = _czce_df_read(url, skip_rows=1)[1] elif date < datetime.date(2017, 12, 28): url = cons.CZCE_VOL_RANK_URL_3 % (date.year, date.strftime('%Y%m%d')) data = _czce_df_read(url, skip_rows=1)[0] else: url = cons.CZCE_VOL_RANK_URL_3 % (date.year, date.strftime('%Y%m%d')) data = _czce_df_read(url, skip_rows=0)[0] if len(data.columns) < 6: return {} table = pd.DataFrame(data.iloc[:, :9]) table.columns = rank_columns table.loc[:, 'rank'] = table.index table[intColumns] = table[intColumns].astype(str) table[intColumns] = table[intColumns].applymap(lambda x: x.replace(',', '')) table = table.applymap(lambda x: 0 if x == '-' else x) indexes = [i for i in table.index if '合约' in i or '品种' in i] indexes.insert(0, 0) big_dict = {} for i in range(len(indexes)): if indexes[i] == 0: table_cut = table.loc[:indexes[i + 1], :] string = table_cut.index.name elif i < len(indexes) - 1: table_cut = table.loc[indexes[i]:indexes[i + 1], :] string = table_cut.index[0] else: table_cut = table.loc[indexes[i]:, :] string = table_cut.index[0] if 'PTA' in string: symbol = 'TA' else: try: symbol = chinese_to_english(find_chinese(re.compile(r':(.*) ').findall(string)[0])) except: symbol = re.compile(r':(.*) ').findall(string)[0] var = symbol_varieties(symbol) if var in vars_list: table_cut = table_cut.dropna(how='any').iloc[1:, :] table_cut = table_cut.loc[[x for x in table_cut.index if x in [str(i) for i in range(21)]], :] table_cut = _table_cut_cal(table_cut, symbol) big_dict[symbol] = table_cut.reset_index(drop=True) return big_dict