Example No. 1
import sys

import pandas as pd

import data_clean
import graph_plot
import grade_analysis_functions


def generate_answers():
    try:
        raw_data = pd.read_csv('DOHMH_New_York_City_Restaurant_Inspection_Results.csv', low_memory=False)
    except IOError:
        sys.exit("Could not read 'DOHMH_New_York_City_Restaurant_Inspection_Results.csv'.")

    print "The program has got the raw data. Now it is cleaning the data... Please wait."
    raw_data_cleaning = data_clean.clean(raw_data) 
    cleaned_data = raw_data_cleaning.clean_data()

    # Question 4: compute the performance of all restaurants and print the sums for the different boroughs.
    print("Computing the performance of all restaurants...")
    grade_analysis_functions.print_restaurant_grades_all(cleaned_data)
    print("Computing the performance of restaurants in different boroughs...")
    grade_analysis_functions.print_restaurant_grades_by_borough(cleaned_data)


    # Question 5: generate the graphs.
    print("Generating the graphs...")
    df_whole_city = grade_analysis_functions.get_grade_count_values(cleaned_data)
    graph_plot.generate_graph(df_whole_city, 'nyc')
    for boroname in cleaned_data['BORO'].unique():
        boro_counts = grade_analysis_functions.get_grade_count_values(cleaned_data[cleaned_data['BORO'] == boroname])
        graph_plot.generate_graph(boro_counts, boroname.lower())
    print("The graphs have been generated. Please check the current directory. Thanks!")
Example No. 2
def main():
    # Load the training dataset
    training = pd.read_csv(
        'tcd ml 2019-20 income prediction training (with labels).csv')

    # Clean training data
    training = dc.clean(training)

    # Format training data
    training = dp.shape(training)

    model_columns = range(1, len(training.columns) - 2)

    X = training.iloc[:, model_columns]
    y = training.iloc[:, -1]

    scaler = StandardScaler()

    X = scaler.fit_transform(X, y)

    regression = LinearRegression()
    regression.fit(X, y)
    first_model = (mean_squared_error(y_true=y, y_pred=regression.predict(X)))
    print(first_model)

    #    lm=ElasticNet(normalize=True, max_iter=5000)
    #    lm=SGDRegressor(max_iter=5000, penalty='elasticnet', tol=1e-3)
    lm = MLPRegressor(
        hidden_layer_sizes=(5, ),
        #                        alpha=0.001,
        activation='relu',
        solver='adam',
        learning_rate='constant',
        max_iter=10000,
        #                                       n_iter_no_change=10000,
        learning_rate_init=0.01)
    #    lm=LinearSVR(epsilon=0.0, C=0.5, tol=1e-4, max_iter=5000)
    search = GridSearchCV(
        estimator=lm,
        param_grid={
            #                                                    'C':np.linspace(0.5,1,5)
            'alpha': np.logspace(-10, 1, 10)
            #                                                    ,'l1_ratio':np.linspace(.3,.7,5)
        },
        scoring='neg_mean_squared_error',
        n_jobs=1,
        refit=True,
        cv=10)

    search.fit(X, y)
    #    first_model=(mean_squared_error(y_true=y,y_pred=lm.predict(X)))
    #    print(first_model)
    print(search.best_params_)
    print(abs(search.best_score_))
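
Since the search was built with refit=True, the best parameter setting is refitted on all of X, so predictions can come straight from the search object. A minimal follow-up sketch that would slot in at the end of main():

    # search.predict delegates to the refitted best_estimator_
    y_pred = search.predict(X)
    print(mean_squared_error(y_true=y, y_pred=y_pred))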
Example No. 3
import numpy as np
import jieba

import data_clean


def build_lda_for_data(filename, word_set):
    word_set_dic = dict(zip(word_set, range(0, len(word_set))))
    dataset = []
    with open(filename, 'rb') as f:
        for line in f:
            one_data = np.zeros(len(word_set), dtype=int)
            words = jieba.cut(data_clean.clean(line), cut_all=False)
            for word in words:
                idx = word_set_dic.get(word, -1)
                if idx >= 0:
                    one_data[idx] += 1  # bag-of-words count for this line/document
            dataset.append(one_data)
    return np.array(dataset), word_set_dic
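
A usage sketch tying this to build_word_set from the next example (the corpus filename is hypothetical):

word_set = build_word_set('corpus.txt')
dataset, word_index = build_lda_for_data('corpus.txt', word_set)
print(dataset.shape)  # (number of documents, vocabulary size)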
Example No. 4
import jieba

import data_clean
# load_stop_words is defined elsewhere in the same project


def build_word_set(filename, stop_words_filename=None):
    if stop_words_filename is None:
        stop_words = load_stop_words()
    else:
        stop_words = load_stop_words(stop_words_filename)

    word_set = {}
    with open(filename, 'rb') as f:
        for line in f:
            words = jieba.cut(data_clean.clean(line), cut_all=False)
            for word in words:
                if stop_words.get(word, 0):
                    continue
                # the value is always 1; word_set is effectively a set of words
                word_set[word] = 1
    return word_set
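
Because the stored value is always 1, the returned dict behaves as a set of vocabulary words. An equivalent sketch with a real set (build_word_set_v2 is a hypothetical name; the same jieba, data_clean, and load_stop_words helpers are assumed):

def build_word_set_v2(filename, stop_words_filename=None):
    stop_words = load_stop_words(stop_words_filename) if stop_words_filename else load_stop_words()
    vocab = set()
    with open(filename, 'rb') as f:
        for line in f:
            for word in jieba.cut(data_clean.clean(line), cut_all=False):
                if not stop_words.get(word, 0):
                    vocab.add(word)
    return vocab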
Example No. 5
import pandas as pd

import database_api as dbi
import data_clean as dc
import data_merge as dm
import factor_test as ft

cash_flow = dbi.get_stocks_data('equity_selected_cashflow_sheet_q',
                                ['net_incr_cash_cash_equ'], '2004-01-01',
                                '2018-03-01')
cash_flow['ncf_ttm'] = cash_flow.groupby('stkcd')[[
    'net_incr_cash_cash_equ'
]].apply(lambda x: x.rolling(4, min_periods=4).sum())

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
retn_1m = store['retn_1m']
retn_1m_zz500 = store['retn_1m_zz500']
store.close()

data = dm.factor_merge(fdmt, cash_flow)
data = data.loc[:, ['stkcd', 'trd_dt', 'wind_indcd', 'cap', 'ncf_ttm']]
data['NCFP_TTM_raw'] = data['ncf_ttm'] / (10000 * data['cap'])
ncf_raw = data['NCFP_TTM_raw'].groupby(level=1).describe()
data.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']),
          level=1,
          inplace=True)
data = dc.clean(data, 'NCFP_TTM')
data = data.set_index(['trd_dt', 'stkcd'])
data.index.names = ['trade_date', 'stock_ID']
signal_input = data[['NCFP_TTM_neu']]
test_data = ft.data_join(retn_1m, signal_input)
btic_des, btic_m = ft.btic(test_data, 'NCFP_TTM')
layer_des = ft.layer_result(test_data, retn_1m_zz500, 'NCFP_TTM')
Example No. 6
                           '2005-01-01', '2018-03-01')
fdmt = dc.st_list(fdmt)  # flag stocks that are ST or listed for under a year with 1
data = pd.merge(fdmt, equity, on=['stkcd', 'trd_dt'], how='left')
data = data.groupby('stkcd').ffill().dropna()  # forward-fill within each stock
data = data.groupby('stkcd').resample('M', on='trd_dt').last()  # keep the last observation of each month
data = data[(data.type_st == 0) & (data.year_1 == 0)]  # drop ST stocks and those listed for under a year
bpd = data.loc[:, [
    'stkcd', 'trd_dt', 'wind_indcd', 'cap', 'tot_shrhldr_eqy_excl_min_int'
]]
bpd['BP_raw'] = bpd.tot_shrhldr_eqy_excl_min_int / (10000 * bpd.cap)
b_raw = bpd.BP_raw.groupby(by='trd_dt').describe()
# The first three months have few observations, so drop them
bpd.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']),
         level=1,
         inplace=True)
bpd = dc.clean(bpd, 'BP', by='trd_dt')  # winsorize, standardize, industry-neutralize
bpd2 = bpd.set_index(['trd_dt', 'stkcd'])
signal_input = bpd2[['BP_neu']]
price_input = pd.DataFrame(close_price_post.stack(),
                           columns=['close_price_post'])
signal_input.index.names = price_input.index.names
test_data = price_input.join(signal_input, how='left')  # join factor values with post-adjusted close prices
test_data = test_data.groupby(level=1).ffill().dropna()  # forward-fill within each stock
signal_analysis(test_data['BP_neu'].unstack(),
                test_data['close_price_post'].unstack())

store = pd.HDFStore('test.h5')  # save the test data to test.h5
store['BP_neu'] = signal_input
store['close_price_post'] = price_input
store['test_data'] = test_data
store.close()
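
The dc.clean call above is glossed as winsorization, standardization, and industry neutralization. A minimal sketch of those three steps on one cross-section (neutralize_cross_section is a hypothetical helper, not the real data_clean; it assumes statsmodels and a cross-section with no missing values):

import pandas as pd
import statsmodels.api as sm

def neutralize_cross_section(df, raw_col, ind_col='wind_indcd'):
    x = df[raw_col]
    med = x.median()
    mad = (x - med).abs().median()
    x = x.clip(med - 5 * mad, med + 5 * mad)   # winsorize around the median
    x = (x - x.mean()) / x.std()               # standardize
    dummies = pd.get_dummies(df[ind_col], drop_first=True, dtype=float)
    resid = sm.OLS(x, sm.add_constant(dummies)).fit().resid  # industry-neutralize
    return resid.rename(raw_col.replace('_raw', '_neu'))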
Example No. 7
def main():
    # Load the training dataset
    training = pd.read_csv('tcd ml 2019-20 income prediction training (with labels).csv')
    
    # Clean training data
    training = dc.clean(training)
    
    # Format training data
    training = dp.shape(training)
    
#    training.to_csv('training_observe.csv', index=False)
    
#    # Split training into its different parts
#    X_train, X_test, y_train, y_test = train_test_split(training.drop(columns = ['Instance', 'Income in EUR'], axis = 1),
#                                                        training['Income in EUR'],
#                                                        test_size=0.2,
#                                                        stratify=y)
    
    model_columns = range(1, len(training.columns) - 2)
#    model_columns = [1,3,4,6,7,8]
    
    # Create linear regression object
#    lm = linear_model.LinearRegression()
#    lm = LinearSVR(tol=1e-4, max_iter=1000)
#    lm = linear_model.SGDRegressor(max_iter=10000, tol=1e-6)
#    lm = ElasticNet(alpha=0.3, l1_ratio=0.4, tol=1e-4, max_iter=10000)
    lm = MLPRegressor(hidden_layer_sizes=(5,),
                      alpha=0.6,
                      activation='relu',
                      solver='adam',
                      learning_rate='constant',
                      max_iter=10000,
                      # n_iter_no_change=10000,
                      learning_rate_init=0.01)
    
    # Use cross validation
    kFold = KFold(n_splits=10, shuffle=True, random_state=1)
    
#    # Use principle component analysis
#    pca = PCA(n_components = 4)
    
    # Enumerate KFold splits
    for training_train, training_test in kFold.split(training):
        # Complete split
        X_train = training.iloc[training_train, model_columns]
        X_test = training.iloc[training_test, model_columns]
        y_train = training.iloc[training_train, -1]
        y_test = training.iloc[training_test, -1]
         
#        # Transform with PCA
#        X_train = pca.fit_transform(X_train)  
#        X_test = pca.transform(X_test)
#        
#        explained_variance = pca.explained_variance_ratio_ 
#        for i in explained_variance:
#            print(format(i*100, 'f'))
        
        # Train the model using the training sets
        lm.fit(X_train, y_train)
        
        # Make predictions using the testing set
        y_predict = lm.predict(X_test)
        
#        y_test = pd.DataFrame(y_test, columns= {'Income in EUR'})
#        y_predict = pd.DataFrame(y_predict, columns= {'Income in EUR'})
#        
#        y_test = dp.scaleOutput(y_test, 'Income in EUR', 'post')
#        y_predict = dp.scaleOutput(y_predict, 'Income in EUR', 'post')
        
        # Make values more realistic
#        y_predict = realizePrediction(y_predict, y_train)
        
        print(lm.score(X_test, y_test))
        
        # The coefficients    
#        print('Coefficients: \n', lm.coef_)
        # The mean squared error
        print("Mean squared error: %.2f"
              % mean_squared_error(y_test, y_predict))
        # Explained variance score: 1 is perfect prediction
        print('Variance score: %.2f' % r2_score(y_test, y_predict))
        print(' ')
    
    # Load the prediction dataset
    test = pd.read_csv('tcd ml 2019-20 income prediction test (without labels).csv')
    
    test = dc.clean(test, drop=False)
    
    # Format test data
    test = dp.shape(test)
    
    # Make predictions using the test set
    y_predict = lm.predict(test.iloc[:,model_columns])
    
    print(y_predict)
    
    test['Income'] = y_predict
    
#    realizePrediction(y_predict, training['Income in EUR'])
    
#    plot(training, test)
    
    # Write prediction to CSV
    pred_out = pd.read_csv('tcd ml 2019-20 income prediction submission file.csv')
    pred_out['Income'] = y_predict
#    pred_out = dp.scaleOutput(pred_out, 'Income', 'post')
    pred_out.to_csv('tcd ml 2019-20 income prediction submission file.csv', index=False)
    
    print(pred_out[['Income']].describe())
Example No. 8
import sys
sys.path.append(r'E:\FT_Users\XQZhu\stocks_backtest\self_lib')  # user-defined helper modules
import pandas as pd
import database_api as dbi
import data_clean as dc
import data_merge as dm
import factor_test as ft

divid = dbi.get_stocks_data('equity_cash_dividend', ['cash_div'], '2004-01-01',
                            '2018-03-01')
store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
retn_1m = store['retn_1m']
retn_1m_zz500 = store['retn_1m_zz500']
store.close()

data = dm.factor_merge(fdmt, divid)
data = data.loc[:, ['stkcd', 'trd_dt', 'wind_indcd', 'cap', 'cash_div']]
data['DP_raw'] = data['cash_div'] / data['cap']
dp_raw = data['DP_raw'].groupby(level=1).describe()
data.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']),
          level=1,
          inplace=True)
data = dc.clean(data, 'DP')
data = data.set_index(['trd_dt', 'stkcd'])
data.index.names = ['trade_date', 'stock_ID']
signal_input = data[['DP_neu']]
test_data = ft.data_join(retn_1m, signal_input)
btic_des, btic_m = ft.btic(test_data, 'DP')
layer_des = ft.layer_result(test_data, retn_1m_zz500, 'DP', quantile=5)
Example No. 9
def main():
    # Load the training dataset
    training = pd.read_csv(
        'tcd ml 2019-20 income prediction training (with labels).csv')

    # Clean training data
    training = dc.clean(training)

    # Format training data
    training = dp.shape(training)

    # Split training into its different parts
    X_train, X_test, y_train, y_test = train_test_split(
        training.drop(columns=['Instance', 'Income in EUR'], axis=1),
        training['Income in EUR'],
        test_size=0.1,
        shuffle=True)

    categorical_features_indices = np.where(X_train.dtypes != float)[0]  # np.float was removed in NumPy 1.24

    model = CatBoostRegressor(iterations=5000,
                              depth=6,
                              learning_rate=0.15,
                              loss_function='RMSE',
                              random_seed=37,
                              od_type='Iter',
                              metric_period=50,
                              od_wait=200,
                              use_best_model=True,
                              task_type='GPU')
    model.fit(X_train,
              y_train,
              cat_features=categorical_features_indices,
              eval_set=(X_test, y_test),
              plot=True)

    print(model.score(X_test, y_test))

    fea_imp = pd.DataFrame({
        'imp': model.feature_importances_,
        'col': X_train.columns
    })
    fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True,
                                                             False]).iloc[-30:]
    fea_imp.plot(kind='barh', x='col', y='imp', figsize=(10, 7), legend=None)
    plt.title('CatBoost - Feature Importance')
    plt.ylabel('Features')
    plt.xlabel('Importance')

    # Load the prediction dataset
    test = pd.read_csv(
        'tcd ml 2019-20 income prediction test (without labels).csv')

    test = dc.clean(test, drop=False)

    # Format test data
    test = dp.shape(test)

    # Make predictions using the test set
    y_predict = model.predict(test.iloc[:, 1:-1])

    print(y_predict)

    test['Income'] = y_predict

    # Write prediction to CSV
    pred_out = pd.read_csv(
        'tcd ml 2019-20 income prediction submission file.csv')
    pred_out['Income'] = y_predict
    pred_out.to_csv('tcd ml 2019-20 income prediction submission file.csv',
                    index=False)

    print(pred_out[['Income']].describe())
Example No. 10
import data_clean


def load_raw_data(filename):
    raw_dataset = []
    with open(filename, 'rb') as f:
        for line in f:
            raw_dataset.append(data_clean.clean(line))
    return raw_dataset
Example No. 11
import data_clean

data_clean.clean()
Example No. 12
# Look at uniqueness of columns
#for column_name in training.columns[1:11]:
#    uniqueAnalysis(column_name, training)

# After looking at uniqueness, we know this:
# 1. 'Country', 'Size of City', 'Wears Glasses', and 'Body Height [cm]' are complete columns (without NaNs), but set defaults anyway
# 2. Though not complete, 'Gender' NaNs can be put into the 'unknown' category
# 3. Though not complete, 'Hair Color' NaNs can be put into the 'Unknown' category
# 4. The rest of the columns with NaNs will just have to use some best guess for replacing NaNs
# 5. Since 'Country' is complete, might be worth adding a regions column for checking whether there's correlation
# 6. Should look at distribution of 'Body Height [cm]'
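
# A hedged sketch of the NaN handling that rules 2-4 above imply (illustrative
# only; the actual cleaning happens inside dc.clean below):
#training['Gender'] = training['Gender'].fillna('unknown')
#training['Hair Color'] = training['Hair Color'].fillna('Unknown')
#for col in ['Year of Record', 'Age']:
#    training[col] = training[col].fillna(training[col].median())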

plt.style.use('ggplot')

training = dc.clean(training)
#training = dp.scaleOutput(training, 'Income in EUR')
training = dp.shape(training)

#print('Year of Record ' + str(len(training.index[training['Year of Record'].isna()])))
#print('Gender ' + str(len(training.index[training['Gender'].isna()])))
#print('Age ' + str(len(training.index[training['Age'].isna()])))
#print('Country ' + str(len(training.index[training['Country'].isna()])))
#print('Size of City ' + str(len(training.index[training['Size of City'].isna()])))
#print('Profession ' + str(len(training.index[training['Profession'].isna()])))
#print('University Degree ' + str(len(training.index[training['University Degree'].isna()])))
#print('Wears Glasses ' + str(len(training.index[training['Wears Glasses'].isna()])))
#print('Hair Color ' + str(len(training.index[training['Hair Color'].isna()])))
#print('Body Height [cm] ' + str(len(training.index[training['Body Height [cm]'].isna()])))
#print('Income in EUR ' + str(len(training.index[training['Income in EUR'].isna()])))
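
# The per-column NaN counts above can be produced in one line:
#print(training.isna().sum())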
Example No. 13
exec(open(r'E:\FT_Users\XQZhu\stocks_backtest\prerun.py').read())
import sys
sys.path.append(r'E:\FT_Users\XQZhu\stocks_backtest\self_lib')  # user-defined helper modules
import pandas as pd
import database_api as dbi
import data_merge as dm
import data_clean as dc

net_profit = dbi.get_stocks_data('equity_selected_income_sheet_q',
                                 ['net_profit_excl_min_int_inc'],
                                 '2004-01-01', '2018-03-01')
net_profit['net_profit_emi_ttm'] = net_profit.groupby('stkcd')[['net_profit_excl_min_int_inc']].apply(
        lambda x: x.rolling(4, min_periods=4).sum())

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
store.close()

data = dm.factor_merge(fdmt, net_profit)
data = data.loc[:, ['stkcd', 'trd_dt', 'wind_indcd', 'cap', 'net_profit_emi_ttm']]
data['EP_TTM_raw'] = data['net_profit_emi_ttm'] / (10000 * data['cap'])
e_raw = data['EP_TTM_raw'].groupby(level=1).describe()
data.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']), level=1, inplace=True)
data = dc.clean(data, 'EP_TTM')
signal_input, test_data = dm.test_merge(data, 'EP_TTM_neu', close_price_post)
dm.test_result(test_data, 'EP_TTM_neu', 'close_price_post')

store = pd.HDFStore('test_data.h5')
store['EP_TTM_neu'] = signal_input
store.close()
Example No. 14
    'EPS_C_1m',
    'EPS_C_3m',
    'Est_Sales',
    'Sales_C_1m',
    'Sales_C_3m',
    'Fore_Earning',
    'Earning_C_1m_ratio',
    'Earning_C_3m_ration',
    'Retn_30',
    'Retn_90',
    'Retn_180',
    'Inst_Num_30',
    'Inst_Num_90',
    'Inst_Num_180',
    'Rating',
    'Rating_C_1m',
    'Rating_C_3m',
]
for factor in Consensus_names:
    data = fdmt_m.join(consensus.loc[:, factor])
    data = data[(data.type_st == 0) & (data.year_1 == 0)].dropna()
    data = dc.clean(data, factor)  # the three Rating-related factors should not be winsorized
    data = data.set_index(['trade_date', 'stock_ID'])
    Consensus = Consensus.join(data[[factor + '_neu']], how='outer')

store = pd.HDFStore('test_data.h5')
store['Consensus'] = Consensus
store.close()
BTIC, IC, IC_corr, Annual, Sharpe, Rela_IR = ct.class_test(
    Consensus_names, 'Consensus')
Example No. 15
exec(open(r'E:\FT_Users\XQZhu\stocks_backtest\prerun.py').read())
import sys
sys.path.append(r'E:\FT_Users\XQZhu\stocks_backtest\self_lib')  # user-defined helper modules
import pandas as pd
import database_api as dbi
import data_merge as dm
import data_clean as dc

oper_rev = dbi.get_stocks_data('equity_selected_income_sheet_q', ['oper_rev'],
                               '2004-01-01', '2018-03-01')
oper_rev['oper_rev_ttm'] = oper_rev.groupby('stkcd')[['oper_rev']].apply(
        lambda x: x.rolling(4, min_periods=4).sum())

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
store.close()

data = dm.factor_merge(fdmt, oper_rev)
data = data.loc[:, ['stkcd', 'trd_dt', 'wind_indcd', 'cap', 'oper_rev_ttm']]
data['SP_TTM_raw'] = data['oper_rev_ttm'] / (10000 * data['cap'])
s_raw = data['SP_TTM_raw'].groupby(level=1).describe()
data.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']), level=1, inplace=True)
data = dc.clean(data, 'SP_TTM')
signal_input, test_data = dm.test_merge(data, 'SP_TTM_neu', close_price_post)
dm.test_result(test_data, 'SP_TTM_neu', 'close_price_post')

store = pd.HDFStore('test_data.h5')
store['SP_TTM_neu'] = signal_input
store.close()
Example No. 16
# -*- coding: utf-8 -*-
import os
import h5py
import numpy as np
from ECoG_model import preprocessing, label_shift_left, binarize
from data_clean import clean

os.chdir(os.path.dirname(__file__))
with h5py.File("ECoG_data.h5", "r+") as f1:
    with h5py.File("ECoG_big_data.h5", "w") as f2:
        for i in range(1, 4):
            subj = "sub" + str(i)
            u = f1[subj]["unmixing_matrix"]
            X = f1[subj]['train_data'][:]
            clist = clean(X)
            X = X.dot(u)
            Y = f1[subj]['cleaned_train_dg'][:]
            Xt = f1[subj]['test_data'][:].dot(u)
            Yt = f1[subj]['cleaned_test_dg'][:]
            X, _, yb = preprocessing(X, Y[:, 0], cleaning=False)
            meanX = np.mean(X, axis=0)
            stdX = np.std(X, axis=0)
            X -= meanX
            X /= stdX
            yb = yb[clist, :]
            clabel = Y[-X.shape[0]:, :][clist, :]
            blabel = np.empty_like(clabel, 'i')
            blabel[:, 0] = yb.ravel()
            for j in range(1, 5):  # use j: the outer loop already uses i for subjects
                blabel[:, j] = \
                label_shift_left(binarize(
Example No. 17
    nrows = month_start_end_row_indices[month_key][1] - skiprows + 1
    train_month = pd.read_csv(data_path + "train_ver2.csv",
                              dtype=load_dtypes,
                              skiprows=range(1, skiprows + 1),
                              nrows=nrows)

    train_month["fecha_dato"] = pd.to_datetime(train_month["fecha_dato"],
                                               format="%Y-%m-%d")
    train_month["fecha_alta"] = pd.to_datetime(train_month["fecha_alta"],
                                               format="%Y-%m-%d")
    train_month["age"] = pd.to_numeric(train_month["age"], errors="coerce")

    # Data Cleaning
    df = train_month
    try:
        clean(df)
    except Exception:  # avoid a bare except; log the traceback
        logging.exception("clean() failed for this month")
        continue
    assert df.isnull().any().sum() == 0, \
        "Data still contains NaN values:\n\n{}".format(df.isnull().any())

    if REDUCE_SIZE:
        logging.info("- Reduce size")

        if selected_clients is None:
            full_stats = df.describe()
            unique_ids = pd.Series(df["ncodpers"].unique())
            limit_people = 200000
            counter = 200
Example No. 18
net_profit['net_profit_emi_ttm'] = net_profit.groupby('stkcd')[[
    'net_profit_excl_min_int_inc'
]].apply(lambda x: x.rolling(4, min_periods=4).sum())
net_profit['growth'] = net_profit.groupby('stkcd')[[
    'net_profit_emi_ttm'
]].apply(lambda x: x.pct_change())

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
retn_1m = store['retn_1m']
retn_1m_zz500 = store['retn_1m_zz500']
store.close()

data = dm.factor_merge(fdmt, net_profit)
data = data.loc[:, [
    'stkcd', 'trd_dt', 'wind_indcd', 'cap', 'net_profit_emi_ttm', 'growth'
]]
data['PEG_TTM_raw'] = 100 * data['cap'] / data['net_profit_emi_ttm'] / data['growth']
p_raw = data['PEG_TTM_raw'].groupby(level=1).describe()
data.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']),
          level=1,
          inplace=True)
data = dc.clean(data, 'PEG_TTM')
data = data.set_index(['trd_dt', 'stkcd'])
data.index.names = ['trade_date', 'stock_ID']
signal_input = data[['PEG_TTM_neu']]
test_data = ft.data_join(retn_1m, signal_input)
btic_des, btic_m = ft.btic(test_data, 'PEG_TTM')
layer_des = ft.layer_result(test_data, retn_1m_zz500, 'PEG_TTM')
Example No. 19
Vol1 = HighLow.join(retn_std, how='outer')
Vol2 = Vol1.join(v_std, how='outer')
Vol = Vol2.join(Resid_vol, how='outer')

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
# keep the store open: it is read again and written to below

fdmt.rename(columns={
    'trd_dt': 'trade_date',
    'stkcd': 'stock_ID'
},
            inplace=True)
fdmt_m = fdmt.groupby('stock_ID').resample('M', on='trade_date').last()

# volatility_index = data.loc[:, ['trade_date', 'stock_ID']]
Volatility = store['Volatility_index']
Volatility = Volatility.set_index(['trade_date', 'stock_ID'])
Volatility_names = Vol.columns.tolist()
for factor in Volatility_names:
    data = fdmt_m.join(Vol.loc[:, factor])
    data = data[(data.type_st == 0) & (data.year_1 == 0)].dropna()
    data = dc.clean(data, factor)
    data = data.set_index(['trade_date', 'stock_ID'])
    Volatility = Volatility.join(data[[factor + '_neu']], how='outer')

store['Volatility'] = Volatility
store.close()
ind = list(np.sort(data['wind_2'].unique()))[1:]
BTIC, IC, IC_corr, Annual, Sharpe, Rela_IR = ct.class_test(
    Volatility_names, 'Volatility')
Example No. 20
exec(open(r'E:\FT_Users\XQZhu\stocks_backtest\prerun.py').read())
import sys
sys.path.append(r'E:\FT_Users\XQZhu\stocks_backtest\self_lib')  # user-defined helper modules
import pandas as pd
import database_api as dbi
import data_merge as dm
import data_clean as dc

cash_flow = dbi.get_stocks_data('equity_selected_cashflow_sheet_q',
                                ['net_cash_flows_oper_act'],
                                '2004-01-01', '2018-03-01')
cash_flow['oper_cf_ttm'] = cash_flow.groupby('stkcd')[['net_cash_flows_oper_act']].apply(
        lambda x: x.rolling(4, min_periods=4).sum())

store = pd.HDFStore('test_data.h5')
fdmt = store['fundamental_info']
store.close()

data = dm.factor_merge(fdmt, cash_flow)
data = data.loc[:, ['stkcd', 'trd_dt', 'wind_indcd', 'cap', 'oper_cf_ttm']]
data['OCFP_TTM_raw'] = data['oper_cf_ttm'] / (10000 * data['cap'])
ocf_raw = data['OCFP_TTM_raw'].groupby(level=1).describe()
data.drop(pd.to_datetime(['2005-01-31', '2005-02-28', '2005-03-31']), level=1, inplace=True)
data = dc.clean(data, 'OCFP_TTM')
signal_input, test_data = dm.test_merge(data, 'OCFP_TTM_neu', close_price_post)
dm.test_result(test_data, 'OCFP_TTM_neu', 'close_price_post')

store = pd.HDFStore('test_data.h5')
store['OCFP_TTM_neu'] = signal_input
store.close()