import sys

import pandas as pd

import data_clean
import grade_analysis_functions
import graph_plot


def generate_answers():
    try:
        raw_data = pd.read_csv('DOHMH_New_York_City_Restaurant_Inspection_Results.csv',
                               low_memory=False)
    except IOError:
        sys.exit()
    print("The program has got the raw data. Now it is cleaning the data... Please wait.")
    raw_data_cleaning = data_clean.clean(raw_data)
    cleaned_data = raw_data_cleaning.clean_data()

    # Question 4: compute the performance of all restaurants and print the sums
    # for the different boroughs.
    print("Computing the performance of all restaurants...")
    grade_analysis_functions.print_restaurant_grades_all(cleaned_data)
    print("Computing the performance of restaurants in different boroughs...")
    grade_analysis_functions.print_restaurant_grades_by_borough(cleaned_data)

    # Question 5: generate the graphs
    print("Generating the graphs...")
    df_whole_city = grade_analysis_functions.get_grade_count_values(cleaned_data)
    graph_plot.generate_graph(df_whole_city, 'nyc')
    for boroname in cleaned_data['BORO'].unique():
        graph_plot.generate_graph(
            grade_analysis_functions.get_grade_count_values(
                cleaned_data[cleaned_data['BORO'] == boroname]),
            boroname.lower())
    print("The graphs have been generated. Please check the current directory. Thanks!")
def main():
    # Load the training dataset
    training = pd.read_csv(
        'tcd ml 2019-20 income prediction training (with labels).csv')

    # Clean training data
    training = dc.clean(training)

    # Format training data
    training = dp.shape(training)

    model_columns = range(1, len(training.columns) - 2)
    X = training.iloc[:, model_columns]
    y = training.iloc[:, -1]

    scaler = StandardScaler()
    X = scaler.fit_transform(X, y)

    regression = LinearRegression()
    regression.fit(X, y)
    first_model = mean_squared_error(y_true=y, y_pred=regression.predict(X))
    print(first_model)

    # lm = ElasticNet(normalize=True, max_iter=5000)
    # lm = SGDRegressor(max_iter=5000, penalty='elasticnet', tol=1e-3)
    lm = MLPRegressor(
        hidden_layer_sizes=(5, ),
        # alpha=0.001,
        activation='relu',
        solver='adam',
        learning_rate='constant',
        max_iter=10000,
        # n_iter_no_change=10000,
        learning_rate_init=0.01)
    # lm = LinearSVR(epsilon=0.0, C=0.5, tol=1e-4, max_iter=5000)

    search = GridSearchCV(
        estimator=lm,
        param_grid={
            # 'C': np.linspace(0.5, 1, 5)
            'alpha': np.logspace(-10, 1, 10)
            # , 'l1_ratio': np.linspace(.3, .7, 5)
        },
        scoring='neg_mean_squared_error',
        n_jobs=1,
        refit=True,
        cv=10)
    search.fit(X, y)

    # first_model = mean_squared_error(y_true=y, y_pred=lm.predict(X))
    # print(first_model)
    print(search.best_params_)
    print(abs(search.best_score_))
def build_lda_for_data(filename, word_set):
    word_set_dic = dict(zip(word_set, range(0, len(word_set))))
    dataset = []
    with open(filename, 'rb') as f:
        for line in f:
            one_data = np.zeros(len(word_set), dtype=int)
            words = jieba.cut(data_clean.clean(line), cut_all=False)
            for word in words:
                idx = word_set_dic.get(word, -1)
                if idx < 0:
                    continue
                else:
                    one_data[idx] += 1
            dataset.append(one_data)
    return np.array(dataset), word_set_dic
def build_word_set(filename, stop_words_filename=None):
    if stop_words_filename is None:
        stop_words = load_stop_words()
    else:
        stop_words = load_stop_words(stop_words_filename)
    word_set = {}
    with open(filename, 'rb') as f:
        for line in f:
            words = jieba.cut(data_clean.clean(line), cut_all=False)
            for word in words:
                if stop_words.get(word, 0):
                    continue
                else:
                    word_set[word] = word_set.get(word, 1)
    return word_set
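# --- Hedged usage sketch (not part of the original project) ---
# Shows how build_word_set and build_lda_for_data above could be chained to get a
# document-term matrix for LDA. The file names 'corpus.txt' and 'stop_words.txt'
# are placeholder assumptions, not paths taken from the original code.
if __name__ == '__main__':
    vocab = build_word_set('corpus.txt', 'stop_words.txt')   # word -> 1, stop words removed
    bow_matrix, vocab_index = build_lda_for_data('corpus.txt', vocab)
    print(bow_matrix.shape)   # (number of documents/lines, vocabulary size)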
import pandas as pd

import database_api as dbi
import data_clean as dc
import data_merge as dm
import factor_test as ft

cash_flow = dbi.get_stocks_data('equity_selected_cashflow_sheet_q',
                                ['net_incr_cash_cash_equ'],
                                '2004-01-01', '2018-03-01')
cash_flow['ncf_ttm'] = cash_flow.groupby('stkcd')[[
    'net_incr_cash_cash_equ'
]].apply(lambda x: x.rolling(4, min_periods=4).sum())

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
retn_1m = store['retn_1m']
retn_1m_zz500 = store['retn_1m_zz500']
store.close()

data = dm.factor_merge(fdmt, cash_flow)
data = data.loc[:, ['stkcd', 'trd_dt', 'wind_indcd', 'cap', 'ncf_ttm']]
data['NCFP_TTM_raw'] = data['ncf_ttm'] / (10000 * data['cap'])
ncf_raw = data['NCFP_TTM_raw'].groupby(level=1).describe()
data.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']),
          level=1, inplace=True)
data = dc.clean(data, 'NCFP_TTM')
data = data.set_index(['trd_dt', 'stkcd'])
data.index.names = ['trade_date', 'stock_ID']
signal_input = data[['NCFP_TTM_neu']]
test_data = ft.data_join(retn_1m, signal_input)
btic_des, btic_m = ft.btic(test_data, 'NCFP_TTM')
layer_des = ft.layer_result(test_data, retn_1m_zz500, 'NCFP_TTM')
'2005-01-01', '2018-03-01')
fdmt = dc.st_list(fdmt)  # flag ST stocks and stocks listed for less than one year with 1
data = pd.merge(fdmt, equity, on=['stkcd', 'trd_dt'], how='left')
data = data.groupby('stkcd').ffill().dropna()  # forward-fill within each stock
data = data.groupby('stkcd').resample('M', on='trd_dt').last()  # keep the last record of each month
data = data[(data.type_st == 0) & (data.year_1 == 0)]  # drop ST stocks and stocks listed for less than one year
bpd = data.loc[:, [
    'stkcd', 'trd_dt', 'wind_indcd', 'cap', 'tot_shrhldr_eqy_excl_min_int'
]]
bpd['BP_raw'] = bpd.tot_shrhldr_eqy_excl_min_int / (10000 * bpd.cap)
b_raw = bpd.BP_raw.groupby(by='trd_dt').describe()
# The first three months have too few observations, so drop them
bpd.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']),
         level=1, inplace=True)
bpd = dc.clean(bpd, 'BP', by='trd_dt')  # winsorize, standardize, industry-neutralize
bpd2 = bpd.set_index(['trd_dt', 'stkcd'])
signal_input = bpd2[['BP_neu']]
price_input = pd.DataFrame(close_price_post.stack(),
                           columns=['close_price_post'])
signal_input.index.names = price_input.index.names
test_data = price_input.join(signal_input, how='left')  # join factor values with post-adjusted close prices
test_data = test_data.groupby(level=1).ffill().dropna()  # forward-fill within each stock
signal_analysis(test_data['BP_neu'].unstack(),
                test_data['close_price_post'].unstack())

store = pd.HDFStore('test.h5')  # save the test data to test.h5
store['BP_neu'] = signal_input
store['close_price_post'] = price_input
store['test_data'] = test_data
store.close()
def main():
    # Load the training dataset
    training = pd.read_csv('tcd ml 2019-20 income prediction training (with labels).csv')

    # Clean training data
    training = dc.clean(training)

    # Format training data
    training = dp.shape(training)
    # training.to_csv('training_observe.csv', index=False)

    # # Split training into its different parts
    # X_train, X_test, y_train, y_test = train_test_split(training.drop(columns=['Instance', 'Income in EUR'], axis=1),
    #                                                     training['Income in EUR'],
    #                                                     test_size=0.2,
    #                                                     stratify=y)

    model_columns = range(1, len(training.columns) - 2)
    # model_columns = [1, 3, 4, 6, 7, 8]

    # Create linear regression object
    # lm = linear_model.LinearRegression()
    # lm = LinearSVR(tol=1e-4, max_iter=1000)
    # lm = linear_model.SGDRegressor(max_iter=10000, tol=1e-6)
    # lm = ElasticNet(alpha=0.3, l1_ratio=0.4, tol=1e-4, max_iter=10000)
    lm = MLPRegressor(hidden_layer_sizes=(5,),
                      alpha=0.6,
                      activation='relu',
                      solver='adam',
                      learning_rate='constant',
                      max_iter=10000,
                      # n_iter_no_change=10000,
                      learning_rate_init=0.01)

    # Use cross validation
    kFold = KFold(n_splits=10, shuffle=True, random_state=1)

    # # Use principal component analysis
    # pca = PCA(n_components=4)

    # Enumerate KFold splits
    for training_train, training_test in kFold.split(training):
        # Complete split
        X_train = training.iloc[training_train, model_columns]
        X_test = training.iloc[training_test, model_columns]
        y_train = training.iloc[training_train, -1]
        y_test = training.iloc[training_test, -1]

        # # Transform with PCA
        # X_train = pca.fit_transform(X_train)
        # X_test = pca.transform(X_test)
        #
        # explained_variance = pca.explained_variance_ratio_
        # for i in explained_variance:
        #     print(format(i*100, 'f'))

        # Train the model using the training sets
        lm.fit(X_train, y_train)

        # Make predictions using the testing set
        y_predict = lm.predict(X_test)

        # y_test = pd.DataFrame(y_test, columns={'Income in EUR'})
        # y_predict = pd.DataFrame(y_predict, columns={'Income in EUR'})
        #
        # y_test = dp.scaleOutput(y_test, 'Income in EUR', 'post')
        # y_predict = dp.scaleOutput(y_predict, 'Income in EUR', 'post')

        # Make values more realistic
        # y_predict = realizePrediction(y_predict, y_train)

        print(lm.score(X_test, y_test))
        # The coefficients
        # print('Coefficients: \n', lm.coef_)
        # The mean squared error
        print("Mean squared error: %.2f" % mean_squared_error(y_test, y_predict))
        # Explained variance score: 1 is perfect prediction
        print('Variance score: %.2f' % r2_score(y_test, y_predict))
        print(' ')

    # Load the prediction dataset
    test = pd.read_csv('tcd ml 2019-20 income prediction test (without labels).csv')
    test = dc.clean(test, drop=False)

    # Format test data
    test = dp.shape(test)

    # Make predictions using the test set
    y_predict = lm.predict(test.iloc[:, model_columns])
    print(y_predict)
    test['Income'] = y_predict
    # realizePrediction(y_predict, training['Income in EUR'])

    # plot(training, test)

    # Write prediction to CSV
    pred_out = pd.read_csv('tcd ml 2019-20 income prediction submission file.csv')
    pred_out['Income'] = y_predict
    # pred_out = dp.scaleOutput(pred_out, 'Income', 'post')
    pred_out.to_csv('tcd ml 2019-20 income prediction submission file.csv', index=False)
    print(pd.DataFrame(pred_out['Income'], columns={'Income'}).describe())
import sys
sys.path.append(r'E:\FT_Users\XQZhu\stocks_backtest\self_lib')  # user-defined functions
import pandas as pd
import database_api as dbi
import data_clean as dc
import data_merge as dm
import factor_test as ft

divid = dbi.get_stocks_data('equity_cash_dividend', ['cash_div'],
                            '2004-01-01', '2018-03-01')

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
retn_1m = store['retn_1m']
retn_1m_zz500 = store['retn_1m_zz500']
store.close()

data = dm.factor_merge(fdmt, divid)
data = data.loc[:, ['stkcd', 'trd_dt', 'wind_indcd', 'cap', 'cash_div']]
data['DP_raw'] = data['cash_div'] / data['cap']
dp_raw = data['DP_raw'].groupby(level=1).describe()
data.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']),
          level=1, inplace=True)
data = dc.clean(data, 'DP')
data = data.set_index(['trd_dt', 'stkcd'])
data.index.names = ['trade_date', 'stock_ID']
signal_input = data[['DP_neu']]
test_data = ft.data_join(retn_1m, signal_input)
btic_des, btic_m = ft.btic(test_data, 'DP')
layer_des = ft.layer_result(test_data, retn_1m_zz500, 'DP', quantile=5)
def main():
    # Load the training dataset
    training = pd.read_csv(
        'tcd ml 2019-20 income prediction training (with labels).csv')

    # Clean training data
    training = dc.clean(training)

    # Format training data
    training = dp.shape(training)

    # Split training into its different parts
    X_train, X_test, y_train, y_test = train_test_split(
        training.drop(columns=['Instance', 'Income in EUR'], axis=1),
        training['Income in EUR'],
        test_size=0.1,
        shuffle=True)

    categorical_features_indices = np.where(X_train.dtypes != float)[0]

    model = CatBoostRegressor(iterations=5000,
                              depth=6,
                              learning_rate=0.15,
                              loss_function='RMSE',
                              random_seed=37,
                              od_type='Iter',
                              metric_period=50,
                              od_wait=200,
                              use_best_model=True,
                              task_type='GPU')

    model.fit(X_train, y_train,
              cat_features=categorical_features_indices,
              eval_set=(X_test, y_test),
              plot=True)

    print(model.score(X_test, y_test))

    fea_imp = pd.DataFrame({
        'imp': model.feature_importances_,
        'col': X_train.columns
    })
    fea_imp = fea_imp.sort_values(['imp', 'col'],
                                  ascending=[True, False]).iloc[-30:]
    fea_imp.plot(kind='barh', x='col', y='imp', figsize=(10, 7), legend=None)
    plt.title('CatBoost - Feature Importance')
    plt.ylabel('Features')
    plt.xlabel('Importance')

    # Load the prediction dataset
    test = pd.read_csv(
        'tcd ml 2019-20 income prediction test (without labels).csv')
    test = dc.clean(test, drop=False)

    # Format test data
    test = dp.shape(test)

    # Make predictions using the test set
    y_predict = model.predict(test.iloc[:, 1:-1])
    print(y_predict)
    test['Income'] = y_predict

    # Write prediction to CSV
    pred_out = pd.read_csv(
        'tcd ml 2019-20 income prediction submission file.csv')
    pred_out['Income'] = y_predict
    pred_out.to_csv('tcd ml 2019-20 income prediction submission file.csv',
                    index=False)
    print(pd.DataFrame(pred_out['Income'], columns={'Income'}).describe())
def load_raw_data(filename):
    raw_dataset = []
    with open(filename, 'rb') as f:
        for line in f:
            raw_dataset.append(data_clean.clean(line))
    return raw_dataset
import data_clean

data_clean.clean()
# Look at uniqueness of columns
# for column_name in training.columns[1:11]:
#     uniqueAnalysis(column_name, training)

# After looking at uniqueness, we know this:
# 1. 'Country', 'Size of City', 'Wears Glasses', and 'Body Height [cm]' are complete columns (without NaNs), but set defaults anyway
# 2. Though not complete, 'Gender' NaNs can be put into the 'unknown' category
# 3. Though not complete, 'Hair Color' NaNs can be put into the 'Unknown' category
# 4. The rest of the columns with NaNs will just have to use some best guess for replacing NaNs
# 5. Since 'Country' is complete, it might be worth adding a regions column for checking whether there's correlation
# 6. Should look at the distribution of 'Body Height [cm]'
# (A pandas sketch of these NaN-handling decisions appears after this snippet.)

plt.style.use('ggplot')

training = dc.clean(training)
# training = dp.scaleOutput(training, 'Income in EUR')
training = dp.shape(training)

# print('Year of Record ' + str(len(training.index[training['Year of Record'].isna()])))
# print('Gender ' + str(len(training.index[training['Gender'].isna()])))
# print('Age ' + str(len(training.index[training['Age'].isna()])))
# print('Country ' + str(len(training.index[training['Country'].isna()])))
# print('Size of City ' + str(len(training.index[training['Size of City'].isna()])))
# print('Profession ' + str(len(training.index[training['Profession'].isna()])))
# print('University Degree ' + str(len(training.index[training['University Degree'].isna()])))
# print('Wears Glasses ' + str(len(training.index[training['Wears Glasses'].isna()])))
# print('Hair Color ' + str(len(training.index[training['Hair Color'].isna()])))
# print('Body Height [cm] ' + str(len(training.index[training['Body Height [cm]'].isna()])))
# print('Income in EUR ' + str(len(training.index[training['Income in EUR'].isna()])))
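# --- Hedged sketch (an assumption, not the project's actual dc.clean implementation) ---
# Shows how the NaN decisions listed in the notes above could be written in pandas:
# 'Gender' NaNs -> 'unknown', 'Hair Color' NaNs -> 'Unknown', and a median best guess
# for the remaining numeric columns. fill_missing_values_sketch is a hypothetical helper.
def fill_missing_values_sketch(df):
    df = df.copy()
    df['Gender'] = df['Gender'].fillna('unknown')
    df['Hair Color'] = df['Hair Color'].fillna('Unknown')
    for col in ['Year of Record', 'Age']:
        df[col] = df[col].fillna(df[col].median())  # best-guess replacement for numeric NaNs
    return df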
exec(open(r'E:\FT_Users\XQZhu\stocks_backtest\prerun.py').read())
import sys
sys.path.append(r'E:\FT_Users\XQZhu\stocks_backtest\self_lib')  # user-defined functions
import pandas as pd
import database_api as dbi
import data_merge as dm
import data_clean as dc

net_profit = dbi.get_stocks_data('equity_selected_income_sheet_q',
                                 ['net_profit_excl_min_int_inc'],
                                 '2004-01-01', '2018-03-01')
net_profit['net_profit_emi_ttm'] = net_profit.groupby('stkcd')[['net_profit_excl_min_int_inc']].apply(
    lambda x: x.rolling(4, min_periods=4).sum())

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
store.close()

data = dm.factor_merge(fdmt, net_profit)
data = data.loc[:, ['stkcd', 'trd_dt', 'wind_indcd', 'cap', 'net_profit_emi_ttm']]
data['EP_TTM_raw'] = data['net_profit_emi_ttm'] / (10000 * data['cap'])
e_raw = data['EP_TTM_raw'].groupby(level=1).describe()
data.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']),
          level=1, inplace=True)
data = dc.clean(data, 'EP_TTM')

signal_input, test_data = dm.test_merge(data, 'EP_TTM_neu', close_price_post)
dm.test_result(test_data, 'EP_TTM_neu', 'close_price_post')

store = pd.HDFStore('test_data.h5')
store['EP_TTM_neu'] = signal_input
store.close()
    'EPS_C_1m', 'EPS_C_3m', 'Est_Sales', 'Sales_C_1m', 'Sales_C_3m',
    'Fore_Earning', 'Earning_C_1m_ratio', 'Earning_C_3m_ration', 'Retn_30',
    'Retn_90', 'Retn_180', 'Inst_Num_30', 'Inst_Num_90', 'Inst_Num_180',
    'Rating', 'Rating_C_1m', 'Rating_C_3m',
]

for factor in Consensus_names:
    data = fdmt_m.join(consensus.loc[:, factor])
    data = data[(data.type_st == 0) & (data.year_1 == 0)].dropna()
    data = dc.clean(data, factor)  # the three Rating-related factors are not winsorized
    data = data.set_index(['trade_date', 'stock_ID'])
    Consensus = Consensus.join(data[[factor + '_neu']], how='outer')

store = pd.HDFStore('test_data.h5')
store['Consensus'] = Consensus
store.close()

BTIC, IC, IC_corr, Annual, Sharpe, Rela_IR = ct.class_test(
    Consensus_names, 'Consensus')
exec(open(r'E:\FT_Users\XQZhu\stocks_backtest\prerun.py').read())
import sys
sys.path.append(r'E:\FT_Users\XQZhu\stocks_backtest\self_lib')  # user-defined functions
import pandas as pd
import database_api as dbi
import data_merge as dm
import data_clean as dc

oper_rev = dbi.get_stocks_data('equity_selected_income_sheet_q', ['oper_rev'],
                               '2004-01-01', '2018-03-01')
oper_rev['oper_rev_ttm'] = oper_rev.groupby('stkcd')[['oper_rev']].apply(
    lambda x: x.rolling(4, min_periods=4).sum())

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
store.close()

data = dm.factor_merge(fdmt, oper_rev)
data = data.loc[:, ['stkcd', 'trd_dt', 'wind_indcd', 'cap', 'oper_rev_ttm']]
data['SP_TTM_raw'] = data['oper_rev_ttm'] / (10000 * data['cap'])
s_raw = data['SP_TTM_raw'].groupby(level=1).describe()
data.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']),
          level=1, inplace=True)
data = dc.clean(data, 'SP_TTM')

signal_input, test_data = dm.test_merge(data, 'SP_TTM_neu', close_price_post)
dm.test_result(test_data, 'SP_TTM_neu', 'close_price_post')

store = pd.HDFStore('test_data.h5')
store['SP_TTM_neu'] = signal_input
store.close()
# -*- coding: utf-8 -*-
import os

import h5py
import numpy as np

from ECoG_model import preprocessing, label_shift_left, binarize
from data_clean import clean

os.chdir(os.path.dirname(__file__))

with h5py.File("ECoG_data.h5", "r+") as f1:
    with h5py.File("ECoG_big_data.h5", "w") as f2:
        for i in range(1, 4):
            subj = "sub" + str(i)
            u = f1[subj]["unmixing_matrix"]
            X = f1[subj]['train_data'][:]
            clist = clean(X)
            X = X.dot(u)
            Y = f1[subj]['cleaned_train_dg'][:]
            Xt = f1[subj]['test_data'][:].dot(u)
            Yt = f1[subj]['cleaned_test_dg'][:]
            X, _, yb = preprocessing(X, Y[:, 0], cleaning=False)
            meanX = np.mean(X, axis=0)
            stdX = np.std(X, axis=0)
            X -= meanX
            X /= stdX
            yb = yb[clist, :]
            clabel = Y[-X.shape[0]:, :][clist, :]
            blabel = np.empty_like(clabel, 'i')
            blabel[:, 0] = yb.ravel()
            for i in range(1, 5):
                blabel[:, i] = \
                    label_shift_left(binarize(
nrows = month_start_end_row_indices[month_key][1] - skiprows + 1
train_month = pd.read_csv(data_path + "train_ver2.csv",
                          dtype=load_dtypes,
                          skiprows=range(1, skiprows + 1),
                          nrows=nrows)
train_month["fecha_dato"] = pd.to_datetime(train_month["fecha_dato"],
                                           format="%Y-%m-%d")
train_month["fecha_alta"] = pd.to_datetime(train_month["fecha_alta"],
                                           format="%Y-%m-%d")
train_month["age"] = pd.to_numeric(train_month["age"], errors="coerce")

# Data Cleaning
df = train_month
try:
    clean(df)
except:
    logging.info("Exception is thrown")
    continue

assert df.isnull().any().sum() == 0, \
    "Data still contains nan values : \n\n {}".format(df.isnull().any())

if REDUCE_SIZE:
    logging.info("- Reduce size")
    if selected_clients is None:
        full_stats = df.describe()
        unique_ids = pd.Series(df["ncodpers"].unique())
        limit_people = 200000
        counter = 200
net_profit['net_profit_emi_ttm'] = net_profit.groupby('stkcd')[[
    'net_profit_excl_min_int_inc'
]].apply(lambda x: x.rolling(4, min_periods=4).sum())
net_profit['growth'] = net_profit.groupby('stkcd')[[
    'net_profit_emi_ttm'
]].apply(lambda x: x.pct_change())

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
retn_1m = store['retn_1m']
retn_1m_zz500 = store['retn_1m_zz500']
store.close()

data = dm.factor_merge(fdmt, net_profit)
data = data.loc[:, [
    'stkcd', 'trd_dt', 'wind_indcd', 'cap', 'net_profit_emi_ttm', 'growth'
]]
data['PEG_TTM_raw'] = 100 * data['cap'] / data['net_profit_emi_ttm'] / data['growth']
p_raw = data['PEG_TTM_raw'].groupby(level=1).describe()
data.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']),
          level=1, inplace=True)
data = dc.clean(data, 'PEG_TTM')
data = data.set_index(['trd_dt', 'stkcd'])
data.index.names = ['trade_date', 'stock_ID']
signal_input = data[['PEG_TTM_neu']]
test_data = ft.data_join(retn_1m, signal_input)
btic_des, btic_m = ft.btic(test_data, 'PEG_TTM')
layer_des = ft.layer_result(test_data, retn_1m_zz500, 'PEG_TTM')
Vol1 = HighLow.join(retn_std, how='outer')
Vol2 = Vol1.join(v_std, how='outer')
Vol = Vol2.join(Resid_vol, how='outer')

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']

fdmt.rename(columns={
    'trd_dt': 'trade_date',
    'stkcd': 'stock_ID'
}, inplace=True)
fdmt_m = fdmt.groupby('stock_ID').resample('M', on='trade_date').last()

# volatility_index = data.loc[:, ['trade_date', 'stock_ID']]
Volatility = store['Volatility_index']
Volatility = Volatility.set_index(['trade_date', 'stock_ID'])

Volatility_names = Vol.columns.tolist()
for factor in Volatility_names:
    data = fdmt_m.join(Vol.loc[:, factor])
    data = data[(data.type_st == 0) & (data.year_1 == 0)].dropna()
    data = dc.clean(data, factor)
    data = data.set_index(['trade_date', 'stock_ID'])
    Volatility = Volatility.join(data[[factor + '_neu']], how='outer')

store['Volatility'] = Volatility
store.close()

ind = list(np.sort(data['wind_2'].unique()))[1:]
BTIC, IC, IC_corr, Annual, Sharpe, Rela_IR = ct.class_test(
    Volatility_names, 'Volatility')
exec(open(r'E:\FT_Users\XQZhu\stocks_backtest\prerun.py').read())
import sys
sys.path.append(r'E:\FT_Users\XQZhu\stocks_backtest\self_lib')  # user-defined functions
import pandas as pd
import database_api as dbi
import data_merge as dm
import data_clean as dc

cash_flow = dbi.get_stocks_data('equity_selected_cashflow_sheet_q',
                                ['net_cash_flows_oper_act'],
                                '2004-01-01', '2018-03-01')
cash_flow['oper_cf_ttm'] = cash_flow.groupby('stkcd')[['net_cash_flows_oper_act']].apply(
    lambda x: x.rolling(4, min_periods=4).sum())

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
store.close()

data = dm.factor_merge(fdmt, cash_flow)
data = data.loc[:, ['stkcd', 'trd_dt', 'wind_indcd', 'cap', 'oper_cf_ttm']]
data['OCFP_TTM_raw'] = data['oper_cf_ttm'] / (10000 * data['cap'])
ocf_raw = data['OCFP_TTM_raw'].groupby(level=1).describe()
data.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']),
          level=1, inplace=True)
data = dc.clean(data, 'OCFP_TTM')

signal_input, test_data = dm.test_merge(data, 'OCFP_TTM_neu', close_price_post)
dm.test_result(test_data, 'OCFP_TTM_neu', 'close_price_post')

store = pd.HDFStore('test_data.h5')
store['OCFP_TTM_neu'] = signal_input
store.close()