Example #1
def compare_s_no(dataPath=""):
    data = pd.read_csv(dataPath)
    # add the other indicators
    result = get_other_indicators(data)
    deal_result = result.dropna(axis=0)[-100:]
    # use LOF on the raw data before re-deriving the decision values
    final_data = deal_data_from_dataFrame(deal_result)
    data_x = final_data[0]
    data_y = final_data[1]
    final_data_x = nr.standardized_mars(data_x)
    print(final_data_x.shape)
    # use the PCA data directly: outlier-process 100% of it, then train the random forest
    # run PCA on 100% of the data
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=54)

    # outlier handling
    oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)

    #random_forest((lof_data_x, new_all_y))

    lof_pred = oc.get_pred_test()
    error_index = oc.get_delete_index()
    lof_data_y = oc.replace_Singular(data_y, lof_pred)

    fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True)
    ax1.scatter(range(len(data_y)), data_y, label='data_y')
    error_close = data_y[error_index]
    # ax1.plot(range(len(lof_pred)), lof_pred, label='lof_pred')

    ax1.scatter(error_index, error_close, label='error_y', c='r', alpha=0.2)
    # ax1.xlabel('x -')
    # ax1.ylabel('y -')
    # ax1.title('plot open')
    ax1.legend()
    # ax2.ylabel('close')

    error_lof_y = lof_data_y[error_index]

    ax2.scatter(range(len(lof_data_y)), lof_data_y, label='lof_data_y')
    ax2.scatter(error_index, error_lof_y, label='error_lof_y', c='r', alpha=0.2)
    # ax2.plot(close**2, label='quadratic')
    ax2.legend()
    # adjust the canvas spacing
    print(len(data_y))
    print(len(lof_data_y))
    plt.tight_layout()
    plt.show()
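oc.LOF_PCA_for_Clustering and oc.replace_Singular are project helpers whose source is not shown here. A minimal sketch of the same detect-and-replace idea with plain scikit-learn follows; the carry-forward replacement rule is an assumption, not necessarily what replace_Singular actually does:

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

def lof_flag_and_replace(x, y, n_neighbors=20, contamination=0.1):
    # Flag rows as outliers with LOF (label -1), then overwrite the matching
    # y values by carrying the last accepted value forward.
    pred = LocalOutlierFactor(n_neighbors=n_neighbors,
                              contamination=contamination).fit_predict(x)
    error_index = np.where(pred == -1)[0]
    y_clean = np.asarray(y, dtype=float).copy()
    for i in error_index:
        if i > 0:
            y_clean[i] = y_clean[i - 1]
    return y_clean, error_index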
Example #2
def test():
    data = pd.read_csv(
        '/home/mars/Data/finialData/electronic_infomation/000021.csv')
    data = data[::-1]
    result = get_other_indicators(data)

    #result = data[['price_change', 'p_change']]
    deal_result = result.dropna(axis=0)
    close = deal_result['close']
    print(close.shape)
    s_deal_data = deal_data_from_dataFrame(deal_result)
    data_x = s_deal_data[0]
    data_y = s_deal_data[1]
    # feature processing
    #t_deal_data_x = Filter(use=False).Variance_selection(threshold=3, data=s_deal_data)[0]
    # normalization
    final_data_x = nr.standardized_mars(data_x)

    pca_x = oc.LOF_PCA_for_Clustering(final_data_x)

    final_data_x_LOF = oc.replace_Singular(final_data_x, oc.get_pred_test())
    print('final_data_x_LOF', final_data_x_LOF[:16])

    print(final_data_x_LOF.shape)
    # dimensionality reduction
    #pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=0.9)
    # #############################################################################
    # Compute clustering with MeanShift
    x_train = final_data_x_LOF[:int(len(data_x) * 0.7)]
    print('x_train', x_train.shape)
    x_test = final_data_x_LOF[int(len(data_x) * 0.7):]
    # The following bandwidth can be automatically detected using estimate_bandwidth
    bandwidth = estimate_bandwidth(x_train, quantile=0.2, random_state=1)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
    ms.fit(final_data_x_LOF)
    labels = ms.labels_
    print('error size', labels[labels != 0].size)
    print('index of not 0 *******')
    print([i for i, x in enumerate(labels) if x != 0])
    print('*******')
    print(labels)
    print(labels.shape)
    cluster_centers = ms.cluster_centers_

    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)

    #score = metrics.silhouette_score(pca_x, labels, metric='euclidean')
    #score1 = metrics.calinski_harabaz_score(pca_x, labels)
    #print(score)
    #print(score1)

    print("number of estimated clusters : %d" % n_clusters_)
    plt.plot(range(len(close)), close)
    plt.plot(range(len(labels)), labels)
    plt.show()
    # #############################################################################
    # Plot result
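For reference, here is the MeanShift step in isolation: a self-contained sketch on synthetic data (not the project pipeline), showing how estimate_bandwidth picks a kernel width from pairwise distances and MeanShift then finds density modes without a preset cluster count.

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.6, random_state=1)
# quantile controls which distance quantile is used as the kernel bandwidth
bandwidth = estimate_bandwidth(X, quantile=0.2, random_state=1)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X)
print('estimated clusters:', len(np.unique(ms.labels_)))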
Example #3
def get_today_data(data_x, today_data, n_components=20):
    add_data_x = np.vstack((data_x, today_data))
    final_data_x = nr.standardized_mars(add_data_x)
    print('final_data_x:', final_data_x.shape)
    # use the PCA data directly: outlier-process the 70% split, then recombine for random-forest training
    # run PCA on 100% of the data
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=n_components)
    today_x = pca_x[-1].reshape(1, n_components)
    print('today_x:', today_x)
    print(today_x.shape)
    return today_x
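nr.standardized_mars and PCA_mars.getPcaComponent are project helpers; assuming they wrap standard scaling and PCA, a sketch of the same projection with sklearn primitives might look like this. Fitting on history only and transforming today's row also avoids refitting the scaler/PCA with the unseen sample included, which the vstack approach above does:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def project_today(data_x, today_row, n_components=20):
    # Fit the scaler and PCA on the historical rows only...
    scaler = StandardScaler().fit(data_x)
    pca = PCA(n_components=n_components).fit(scaler.transform(data_x))
    # ...then project today's (possibly 1-D) row into the same space.
    today = np.atleast_2d(today_row)
    return pca.transform(scaler.transform(today))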
Example #4
def analyze_lof(dataPath=""):
    data = pd.read_csv(dataPath)
    data = data[::-1]
    # add the other indicators
    result = get_other_indicators(data)
    deal_result = result.dropna(axis=0)
    # use LOF on the raw data before re-deriving the decision values
    # final_data = deal_data_from_dataFrame(deal_result)

    # fetch the electronic-information sector data
    # NDX_sql = 'SELECT open,close,low,high,volume,other,change_rate, DATE_ADD(date,INTERVAL 1 DAY) as date from global_info where industry_name = "纳斯达克" order by date asc;'
    # NDX_delete_list = ['id', 'category_name', 'industry_name', 'industry_key', 'total_money']
    # forward-fill NaN values
    # NDXData = deal_dataFrame(NDX_sql, [])

    final_data = deal_data(deal_result)
    # data_y = final_data[1]
    final_data_x = nr.standardized_mars(final_data[0])

    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=62)
    print(pca_x.shape)
    # x_train, x_test, y_train, y_test = train_test_split(pca_x, all_y, test_size=0.3, random_state=0, shuffle=False)

    # outlier handling on x
    lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)
    result = deal_result.index.tolist()
    # write out every date, flagging 1 where an outlier occurred
    with open('300113_data.csv', 'w+') as f:
        f.write('date')
        f.write(',')
        f.write('300113_Sigular')
        f.write('\n')
        for index, date in enumerate(result):
            if index in error_index:
                f.write(date)
                f.write(',')
                f.write('1')
                f.write('\n')
            else:
                f.write(date)
                f.write(',')
                f.write('0')
                f.write('\n')
Example #5
def analyze_lof_sql(code=""):

    # fetch the electronic-information sector data
    NDX_sql = 'SELECT open,close,low,high,volume,other,change_rate, DATE_ADD(date,INTERVAL 1 DAY) as date from global_info where industry_name = "纳斯达克" order by date asc;'
    # NDX_delete_list = ['id', 'category_name', 'industry_name', 'industry_key', 'total_money']
    # forward-fill NaN values
    NDXData = deal_dataFrame(NDX_sql, [])

    final_data = deal_data(NDXData)
    # data_y = final_data[1]
    final_data_x = nr.standardized_mars(final_data[0])

    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=62)
    print(pca_x.shape)
    # x_train, x_test, y_train, y_test = train_test_split(pca_x, all_y, test_size=0.3, random_state=0, shuffle=False)

    # outlier handling on x
    lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)
    result = NDXData.index.tolist()
    # write out every date, flagging 1 where an outlier occurred
    with open('NDX_data.csv', 'w+') as f:
        f.write('date')
        f.write(',')
        f.write('NDX_Sigular')
        f.write('\n')
        for index, date in enumerate(result):
            if index in error_index:
                f.write(date.strftime('%Y-%m-%d'))
                f.write(',')
                f.write('1')
                f.write('\n')
            else:
                f.write(date.strftime('%Y-%m-%d'))
                f.write(',')
                f.write('0')
                f.write('\n')
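The manual f.write loop above can be expressed more compactly with pandas. A sketch reusing NDXData and error_index from the example, and keeping the original 'NDX_Sigular' column name:

import pandas as pd

flags = pd.Series(0, index=NDXData.index, name='NDX_Sigular')
flags.iloc[list(error_index)] = 1          # 1 marks an outlier date
flags.rename_axis('date').to_csv('NDX_data.csv', header=True)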
Example #6
def fit_randomForest_del(daySpan=0, dataPath=""):
    data = pd.read_csv(dataPath)
    # add the other indicators
    result = get_other_indicators(data)
    deal_result = result.dropna(axis=0)
    # use LOF on the raw data before re-deriving the decision values
    #final_data = deal_data_from_dataFrame(deal_result)
    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        # process X; binarize the Y values
        data_y = dataY_from_dataFrame(deal_result)
    else:

        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]
    s_deal_data = (data_x, data_y)

    # data_y = final_data[1]
    final_data_x = nr.standardized_mars(s_deal_data[0])
    print(final_data_x.shape)

    # use the PCA data directly: outlier-process the 70% split, then recombine for random-forest training
    # run PCA on 100% of the data
    all_y = data_y
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=2)
    print(pca_x.shape)
    # x_train, x_test, y_train, y_test = train_test_split(pca_x, all_y, test_size=0.3, random_state=0, shuffle=False)

    # outlier handling
    lof_data_x = oc.LOF_PCA_for_Clustering_del(pca_x, isUsePCA=False)
    dele_index = oc.get_delete_index()
    lof_data_y = np.delete(all_y, dele_index, axis=0)
    # all_x = np.vstack((lof_data_x, x_test))
    print(lof_data_x.shape)
    print('y', data_y.shape)
    # all_y = np.concatenate((y_train, y_test), axis=0)
    # print(all_x.shape, all_y.shape)
    feature_importances = np.copy(random_forest((lof_data_x, lof_data_y)))
Example #7
def fit_randomForest_repXY(daySpan=0, dataPath=""):
    data = pd.read_csv(dataPath)
    # add the other indicators
    result = get_other_indicators(data)
    deal_result = result.dropna(axis=0)
    # use LOF on the raw data before re-deriving the decision values
    #final_data = deal_data_from_dataFrame(deal_result)
    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        #
        data_y = dataY_from_dataFrame(deal_result)
    else:

        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]
    s_deal_data = (data_x, data_y)

    final_data_x = nr.standardized_mars(s_deal_data[0])
    print(final_data_x.shape)
    # use the PCA data directly: outlier-process 100% of it, then train the random forest
    # run PCA on 100% of the data
    all_y = data_y
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=2)
    print(pca_x.shape)
    # x_train, x_test, y_train, y_test = train_test_split(pca_x, all_y, test_size=0.3, random_state=0, shuffle=False)

    # outlier handling on x
    lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    # outlier handling on y
    print('all_y', all_y[-10:])
    lof_data_y = oc.replace_Singular(all_y, oc.get_pred_test())
    print('lof_y', lof_data_y[-10:])
    # all_x = np.vstack((lof_data_x, x_test))
    print(pca_x.shape)
    # all_y = np.concatenate((y_train, y_test), axis=0)
    # print(all_x.shape, all_y.shape)
    random_forest((lof_data_x, lof_data_y))
Example #8
    data = pd.read_csv(
        '/home/mars/Data/finialData/electronic_infomation/002544.csv')
    data = data[::-1]
    result = get_other_indicators(data)

    #result = data[['price_change', 'p_change']]
    deal_result = result.dropna(axis=0)
    # close = deal_result['close']
    #
    s_deal_data = deal_data_from_dataFrame(deal_result)
    data_x = s_deal_data[0]
    data_y = s_deal_data[1]
    print('data_x', data_x.shape)
    # feature processing
    #t_deal_data_x = Filter(use=False).Variance_selection(threshold=3, data=s_deal_data)[0]
    # normalization
    final_data_x = nr.standardized_mars(data_x)
    #
    # pca_x = oc.LOF_PCA_for_Clustering(final_data_x)
    #
    # final_data_x_LOF = oc.replace_Singular(final_data_x, oc.get_pred_test())
    # print('final_data_x_LOF',final_data_x_LOF[:16])
    #
    # print(final_data_x_LOF.shape)
    # dimensionality reduction
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=0.9)
    # #############################################################################
    # Compute clustering with MeanShift
    x_train, x_test, y_train, y_test = train_test_split(pca_x,
                                                        data_y,
                                                        test_size=0.3,
                                                        random_state=0,
                                                        shuffle=False)
Example #9
    return Pca_x



if __name__ == "__33main__":
    data = pd.read_csv('/home/mars/Data/finialData/electronic_infomation/000948.csv')
    data = data[::-1]

    result = get_other_indicators(data)
    #result = result[['open', 'close', 'low', 'high', 'volume', 'price_change']]
    deal_result = result.dropna(axis=0)
    s_deal_data = deal_data_from_dataFrame(deal_result)
    # split into training and test sets: 70% train, 30% test
    #train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.3, random_state=1)
    # normalization
    final_data_x = nr.standardized_mars(s_deal_data[0])
    ratio = 0.7
    data_x = final_data_x
    x_train = data_x[:int(len(data_x) * ratio)]
    print('x_train', x_train.shape)
    x_test = data_x[int(len(data_x) * ratio):]
    m, n = np.shape(x_train)


    # dimensionality reduction: keep the smallest number of components covering the given variance fraction
    pca = PCA(n_components=0.9, random_state=1)
    pca.fit(x_train)
    print('variance of each component: ', pca.explained_variance_)
    print('share of total variance per component: ', pca.explained_variance_ratio_)
    print('number of components covering 90% of the variance: ', pca.n_components_, '\n')
    data_x = pca.fit_transform(final_data_x)
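A sketch of what n_components=0.9 does under the hood, using x_train from above: PCA keeps the smallest k whose cumulative explained-variance ratio reaches 90%.

import numpy as np
from sklearn.decomposition import PCA

pca_full = PCA(random_state=1).fit(x_train)
cum = np.cumsum(pca_full.explained_variance_ratio_)
k = int(np.searchsorted(cum, 0.9)) + 1     # smallest count reaching 90%
print('components needed for 90% variance:', k)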
Example #10
def other_main():

    np.random.seed(42)
    data = pd.read_csv(
        '/home/mars/Data/finialData/electronic_infomation/300297.csv')
    data = data[::-1]
    result = get_other_indicators(data)
    delete_feature = []
    deal_result = result.dropna(axis=0)
    # print(deal_result)
    print('***')
    #print(len(columns))

    final_data = deal_data_from_dataFrame(deal_result)
    data_y = final_data[1]
    final_data_x = nr.standardized_mars(final_data[0])
    print(final_data_x.shape)
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=0.9)

    # xx, yy = np.meshgrid(np.linspace(-5, 5, 500), np.linspace(-5, 5, 500))
    # # Generate normal (not abnormal) training observations
    # X = 0.3 * np.random.randn(100, 2)
    # X_train = np.r_[X + 2, X - 2]
    # # Generate new normal (not abnormal) observations
    # X = 0.3 * np.random.randn(20, 2)
    # X_test = np.r_[X + 2, X - 2]
    # # Generate some abnormal novel observations
    # X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

    # fit the model for novelty detection (novelty=True)
    print('pca_x', pca_x.shape)
    clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
    clf.fit(pca_x)
    # DO NOT use predict, decision_function and score_samples on X_train as this
    # would give wrong results but only on new unseen data (not used in X_train),
    # e.g. X_test, X_outliers or the meshgrid
    y_pred_test = clf.predict(pca_x)
    print(y_pred_test)
    error_index = [i for i, x in enumerate(y_pred_test) if x == -1]

    print('error size', y_pred_test[y_pred_test == -1].size)
    print('indices where the prediction is -1 *******')
    print([i for i, x in enumerate(y_pred_test) if x == -1])
    print('*******')
    # y_pred_outliers = clf.predict(X_outliers)
    # n_error_test = y_pred_test[y_pred_test == -1].size
    # n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size
    '''
    # plot the learned frontier, the points, and the nearest vectors to the plane
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    plt.title("Novelty Detection with LOF")
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
    a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
    plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')
    
    s = 40
    b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k')
    b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s,
                     edgecolors='k')
    c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s,
                    edgecolors='k')
    plt.axis('tight')
    plt.xlim((-5, 5))
    plt.ylim((-5, 5))
    plt.legend([a.collections[0], b1, b2, c],
               ["learned frontier", "training observations",
                "new regular observations", "new abnormal observations"],
               loc="upper left",
               prop=matplotlib.font_manager.FontProperties(size=11))
    plt.xlabel(
        "errors novel regular: %d/40 ; errors novel abnormal: %d/40"
        % (n_error_test, n_error_outliers))
    plt.show()
    
    '''
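Per the warning quoted in the example, with novelty=True the model should only score data it was not fitted on, yet the code above predicts on its own training set. A leakage-free sketch using pca_x from above:

from sklearn.neighbors import LocalOutlierFactor

split = int(len(pca_x) * 0.7)
clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
clf.fit(pca_x[:split])                     # fit on the earlier window only
labels = clf.predict(pca_x[split:])        # +1 inlier, -1 outlier
print('flagged rows in the held-out tail:', (labels == -1).sum())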
Example #11
def fit_randomForest_MS(daySpan=0, dataPath="", stock_code=''):
    # data = pd.read_csv(dataPath)
    # data = data[::-1]
    # print(data[:10])
    # # 加入其他的指标
    # result = get_other_indicators(data)
    # deal_result = result.dropna(axis=0)
    # use LOF on the raw data before re-deriving the decision values
    #final_data = deal_data_from_dataFrame(deal_result)

    # use stock_code to look up that stock's correlations in the correlation matrix
    #stock_code = '\'300017'
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)

    #print(collection)

    # take the 10 most-correlated stocks
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # fetch the corresponding data
    dataList = []
    code_name = []
    for code in top_10:
        code_name.append(code)
        # df[(df.BoolCol==3)&(df.attr==22)].index.tolist()
        # code = code_relation[code_relation.get(stock_code)==score].index
        ##print('code:', code[1:])
        path = '/home/mars/Data/finialData/electronic_infomation/' + code[
            1:] + '.csv'
        code_data = pd.read_csv(path, index_col='date')

        result = get_other_indicators(code_data)
        # merge the data
        dataList.append(result)
    # align by date and drop NaN rows
    df = pd.concat(dataList, axis=1)
    # pandas re-joins rows according to each file's index
    new_df = df.sort_index()
    #print('new_df:', new_df[:5])

    new_df.dropna(axis=0, inplace=True)
    #print('new_df2:', new_df.get('price_change'))
    print('all shape:', new_df.shape)
    #new_df.to_csv('300017_conbine.csv')
    deal_result = new_df

    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        #
        data_y = dataY_from_dataFrame(deal_result)
    else:

        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]
    s_deal_data = (data_x, data_y)

    # data_y = final_data[1]
    final_data_x = nr.standardized_mars(s_deal_data[0])
    print(final_data_x.shape)

    all_y = s_deal_data[1]
    MSx_train, MSx_test, MSy_train, MSy_test = ms.getMS_repx_data(
        final_data_x, all_y)
    all_x = np.vstack((MSx_train, MSx_test))
    all_y = np.concatenate((MSy_train, MSy_test), axis=0)
    max_score = fit_randomForest_rep(data=(all_x, all_y))
    print('overall best score:', max_score)
Example #12
def get_predict_data(daySpan=0, stock_code='', date='now', dataFrom='csv', n_components=20,
                              classfic=2):
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv', index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)

    # take the 10 most-correlated stocks
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # fetch the corresponding data
    dataList = []
    code_name = []
    for code in top_10:
        code_name.append(code)
        # df[(df.BoolCol==3)&(df.attr==22)].index.tolist()
        # code = code_relation[code_relation.get(stock_code)==score].index
        #print('code:', code[1:])
        if dataFrom == 'csv':
            path = '/home/mars/Data/finialData/electronic_infomation/' + code[1:] + '.csv'
            code_data = pd.read_csv(path, index_col='date')
            code_data = code_data[::-1]
            #print(code_data)
            result = get_other_indicators(code_data)
        elif dataFrom == 'db':
            code_sql = 'SELECT * from stock_info where stock_code=' + code[1:] + ' order by date asc;'
            code_delete_list = ['id', 'stock_code', 'stock_name', 'turnover']
            result = deal_dataFrame(code_sql, code_delete_list)
            # get the row indices that need to be deleted

        else:
            pass

        #result = get_other_indicators(code_data)
        # merge the data
        dataList.append(result)
    # align by date and drop NaN rows
    df = pd.concat(dataList, axis=1)

    # pandas re-joins rows according to each file's index
    new_df = df.sort_index()
    #print('new_df:', new_df[:5])

    new_df.dropna(axis=0, inplace=True)
    #print('new_df2:', new_df.get('price_change'))
    #print('all shape:', new_df.shape)
    global now_df
    # grab the specific rows, then delete them from the source data
    date_index = new_df.index.tolist()
    if isinstance(date_index[0], str):
        try:
            now_index = date_index.index('2018-12-17')
            delete_index_list = date_index[now_index:]
            now_df = pd.DataFrame(new_df, index=[date])
            new_df.drop(index=delete_index_list, inplace=True)
        except Exception as e:
            print(str(e))

    else:
        try:
            now_index = date_index.index(datetime.date(2018, 12, 17))
            delete_index_list = date_index[now_index:]
            date = datetime.datetime.strptime(date, '%Y-%m-%d')
            now_df = pd.DataFrame(new_df, index=[date])
            new_df.drop(index=delete_index_list, inplace=True)
        except Exception as e:
            print(str(e))

    print(date_index)
    try:
        pass
        # if the requested prediction date is earlier than '2018-12-14', just fetch that day's data
        # if date_index.index('2018-12-14') < date_index.index(date):
        #     now_index = date_index.index('2018-12-17')
        #     delete_index_list = date_index[now_index:]
        #     now_df = pd.DataFrame(new_df, index=date)
        #     new_df.drop(index=delete_index_list, inplace=True)
        # else:
        #     try:
        #         now_index = date_index.index('2018-12-17')
        #         delete_index_list = date_index[now_index:]
        #         new_df.drop(index=delete_index_list, inplace=True)
        #     except Exception as e:
        #         print(str(e))
        #     now_df = pd.DataFrame(new_df, index=[date])
    except Exception:

        return 'market closed for this stock on that day'

    print('new_df:', new_df)
    print('now_df:', now_df)
    deal_result = new_df
    data_x = dataX_from_dataFrame(deal_result)

    if classfic == 2:
        if daySpan == 0:
            data_y = dataY_from_dataFrame(deal_result)
        else:
            data_y = dataY_for_Nmean(deal_result, N=daySpan)
            data_x = data_x[:len(data_y)]
    elif classfic == 5:
        if daySpan == 0:
            data_y = dataY_from_dataFrame_5(deal_result)
        else:
            data_y = dataY_for_Nmean(deal_result, N=daySpan)
            data_x = data_x[:len(data_y)]
    else:
        pass

    final_data_x = nr.standardized_mars(data_x)
    print(final_data_x.shape)
    # use the PCA data directly: outlier-process the 70% split, then recombine for random-forest training
    # run PCA on 100% of the data
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=n_components)
    #x_train, x_test, y_train, y_test = train_test_split(pca_x, all_y, test_size=0.3, random_state=0, shuffle=False)
    predict_y = random_forest((pca_x, data_y))
    print('model score: ', getScore())
    return predict_y
Example #13
def get_fit_model(daySpan=0, stock_code='', today='now', dataFrom='csv', n_components=20,
                              classfic=2):
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv', index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)

    # take the 10 most-correlated stocks
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # fetch the corresponding data
    dataList = []
    code_name = []
    for code in top_10:
        if collection.loc[code, stock_code] < 0.6:
            continue
        code_name.append(code)
        # df[(df.BoolCol==3)&(df.attr==22)].index.tolist()
        # code = code_relation[code_relation.get(stock_code)==score].index
        # print('code:', code[1:])
        if dataFrom == 'csv':
            path = '/home/mars/Data/finialData/electronic_infomation/' + code[1:] + '.csv'
            code_data = pd.read_csv(path, index_col='date')
            code_data = code_data[::-1]
            # print(code_data)
            result = get_other_indicators(code_data)
        elif dataFrom == 'db':
            code_sql = 'SELECT * from stock_fill where stock_code=' + code[1:] + ' order by date asc;'
            code_delete_list = ['id', 'stock_code', 'stock_name', 'modify']
            result = deal_dataFrame(code_sql, code_delete_list)
            # get the row indices that need to be deleted

        else:
            pass

        # result = get_other_indicators(code_data)
        # merge the data
        dataList.append(result)
    # align by date and drop NaN rows
    df = pd.concat(dataList, axis=1)

    # pandas re-joins rows according to each file's index
    new_df = df.sort_index()

    print('new_df:', new_df.shape)
    #print('new_df data:', new_df[:5])

    new_df.dropna(axis=0, inplace=True)

    global now_df
    # grab the specific row, then delete it from the source data
    date_index = new_df.index.tolist()
    if isinstance(date_index[0], str):
        try:
            # csv-trained samples are split at this date
            #now_index = date_index.index('2018-12-14')
            #delete_index_list = date_index[now_index:]
            now_df = pd.DataFrame(new_df, index=[today])
            #new_df.drop(index=delete_index_list, inplace=True)
        except Exception as e:
            print(str(e))

    else:
        try:
            now_index = date_index.index(datetime.date(2018, 12, 17))
            delete_index_list = date_index[now_index:]
            date = datetime.datetime.strptime(today, '%Y-%m-%d')
            now_df = pd.DataFrame(new_df, index=[date])
            new_df.drop(index=delete_index_list, inplace=True)
        except Exception as e:
            print(str(e))
    deal_result = new_df
    today_x = dataX_today(now_df)
    #print('now_df:', now_df)

    data_x = dataX_from_dataFrame(deal_result)
    today_x2 = get_today_data(data_x, today_x, n_components=n_components)
    if classfic == 2:
        if daySpan == 0:
            if data_x.shape[1] > 80:
                data_y = dataY_from_dataFrame(deal_result)
            else:
                # means no correlated stocks are included
                data_y = dataY_no_correlation(deal_result)
        else:
            data_y = dataY_for_Nmean(deal_result, N=daySpan)
            data_x = data_x[:len(data_y)]
    elif classfic == 5:
        if daySpan == 0:
            data_y = dataY_from_dataFrame_5(deal_result)
        else:
            data_y = dataY_for_Nmean(deal_result, N=daySpan)
            data_x = data_x[:len(data_y)]
    else:
        pass
    final_data_x = nr.standardized_mars(data_x)
    print('final_data_x:', final_data_x.shape)
    # use the PCA data directly: outlier-process the 70% split, then recombine for random-forest training
    # run PCA on 100% of the data
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=n_components)
    #print('data_y: ', data_y)

    # x_train, x_test, y_train, y_test = train_test_split(pca_x, all_y, test_size=0.3, random_state=0, shuffle=False)
    predict_y, future_y = random_forest((pca_x, data_y), another_data_x=today_x2)
    print('model score: ', getScore())
    print('predicted trend for today: ', future_y)
    del pca_x
    del final_data_x
    del today_x
    del deal_result
    del date_index
    del dataList
    return (getScore(), future_y)
Example #14
def singular(daySpan=0, stock_code=''):
    # data = pd.read_csv(dataPath)
    # data = data[::-1]
    # # 加入其他的指标
    # result = get_other_indicators(data)
    # deal_result = result.dropna(axis=0)
    # use LOF on the raw data before re-deriving the decision values
    #final_data = deal_data_from_dataFrame(deal_result)

    #stock_code = '\'600775'
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)

    #print(collection)

    # take the 10 most-correlated stocks
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # fetch the corresponding data
    dataList = []
    code_name = []

    for code in top_10:
        #code_name.append(code)
        # drop stocks whose correlation coefficient is below 0.6
        if collection.loc[code, stock_code] < 0.6:
            continue
        code_sql = 'SELECT * from stock_fill where stock_code=' + code[
            1:] + ' and date < "2018-12-15" order by date asc;'
        code_delete_list = ['id', 'stock_code', 'stock_name', 'modify']
        codeData = deal_dataFrame(code_sql, code_delete_list)
        # merge the data
        dataList.append(codeData)
    # align by date and drop NaN rows

    df = pd.concat(dataList, axis=1)

    # pandas re-joins rows according to each file's index
    new_df = df.sort_index()
    #print('new_df:', new_df[:5])

    print('new_df:', new_df.shape)
    print('new_df data:', new_df[:5])

    new_df.dropna(axis=0, inplace=True)
    # the date index
    global date_index
    date_index = new_df.index.tolist()
    #print('new_df2:', new_df.get('price_change'))
    #print('all shape:', new_df.shape)
    deal_result = new_df
    print('shape:', deal_result.shape)

    data_x = dataX_from_dataFrame(deal_result)

    #print('data_x shape:', data_x[:3])
    if daySpan == 0:
        #
        if data_x.shape[1] > 80:
            data_y = dataY_from_dataFrame_5(deal_result)
        else:
            # means no correlated stocks are included
            data_y = dataY_5_no_correlation(deal_result)
    else:

        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]

    # data_y = final_data[1]
    final_data_x = nr.standardized_mars(data_x)
    print(final_data_x.shape)

    # use the PCA data directly: outlier-process the 70% split, then recombine for random-forest training
    # run PCA on 100% of the data
    all_y = data_y
    number = len(all_y)
    #global predict_info
    scoreInfoList = []
    for i in range(6, final_data_x.shape[1] - 10, 1):

        pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=i)
        print(pca_x.shape)
        #x_train, x_test, y_train, y_test = train_test_split(pca_x, all_y, test_size=0.3, random_state=0, shuffle=False)

        # outlier handling
        lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
        #all_x = np.vstack((lof_data_x, x_test))
        print(pca_x.shape)
        #all_y = np.concatenate((y_train, y_test), axis=0)
        #print(all_x.shape, all_y.shape)
        predict_y = random_forest((pca_x, all_y))
        ratio_ss = len(oc.get_delete_index()) / number
        scoreInfoList.append((getScore(), ratio_ss, i))

    scoreList = []
    compent = []
    for one in scoreInfoList:
        score = one[0]
        scoreList.append(score)
        compent.append(one[2])
    '''
    plt.title(stock_code + ' --- score of component')
    plt.xlabel('component')
    plt.ylabel('score')
    plt.plot(compent, scoreList,'r-o')

    max_indx = np.argmax(scoreList)  # max value index
    suit_compent = compent[max_indx]
    plt.plot(suit_compent, scoreList[max_indx], 'ks')
    show_max = '[' + str(suit_compent) + ' ' + str(round(scoreList[max_indx], 4)) + ']'
    plt.annotate(show_max, xytext=(suit_compent, scoreList[max_indx]), xy=(suit_compent, scoreList[max_indx]))

    plt.show()
    '''
    max_score = max(scoreList)
    max_index = scoreList.index(max_score)
    error_ratio = scoreInfoList[max_index][1]
    component = scoreInfoList[max_index][2]
    del scoreInfoList
    del scoreList

    print(max_score, error_ratio, component)
    return (max_score, error_ratio, component)
Example #15
def fit_SVM(daySpan=0, code=None):
    stock_code = '\'000021'
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)

    print(collection)

    # take the 10 most-correlated stocks
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # fetch the corresponding data
    dataList = []
    code_name = []
    for code in top_10:
        code_name.append(code)
        # df[(df.BoolCol==3)&(df.attr==22)].index.tolist()
        # code = code_relation[code_relation.get(stock_code)==score].index
        print('code:', code[1:])
        path = '/home/mars/Data/finialData/electronic_infomation/' + code[
            1:] + '.csv'
        code_data = pd.read_csv(path, index_col='date')

        result = get_other_indicators(code_data)
        # merge the data
        dataList.append(result)
    # align by date and drop NaN rows
    df = pd.concat(dataList, axis=1)

    # pandas re-joins rows according to each file's index
    new_df = df.sort_index()
    print('new_df:', new_df[:5])

    new_df.dropna(axis=0, inplace=True)
    print('new_df2:', new_df.get('price_change'))
    print('all shape:', new_df.shape)
    deal_result = new_df
    # use LOF on the raw data before re-deriving the decision values
    #final_data = deal_data_from_dataFrame(deal_result)
    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        #
        data_y = dataY_from_dataFrame(deal_result)
    else:

        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]
    s_deal_data = (data_x, data_y)

    # data_y = final_data[1]
    final_data_x = nr.standardized_mars(s_deal_data[0])
    print(final_data_x.shape)

    # use the PCA data directly: outlier-process the 70% split, then recombine for random-forest training
    # run PCA on 100% of the data
    all_y = s_deal_data[1]
    scoreListInfo = []
    for i in range(6, 40, 1):
        pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=i)
        print(pca_x.shape)
        #x_train, x_test, y_train, y_test = train_test_split(pca_x, all_y, test_size=0.3, random_state=0, shuffle=False)

        # outlier handling
        lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
        #all_x = np.vstack((lof_data_x, x_test))
        print(pca_x.shape)
        x_train, x_test, y_train, y_test = train_test_split(lof_data_x,
                                                            all_y,
                                                            test_size=0.3,
                                                            random_state=0,
                                                            shuffle=False)

        # fit the model
        #for fig_num, kernel in enumerate(('linear', 'rbf', 'poly','sigmoid')):
        for c in np.arange(0.1, 1, 0.1):
            clf = svm.SVC(gamma=c, kernel='rbf')
            clf.fit(x_train, y_train)
            score = clf.score(x_test, y_test)
            print(score)
            scoreListInfo.append((score, i, c))
    #print(scoreListInfo)
    scoreList = []
    for one in scoreListInfo:
        score = one[0]
        scoreList.append(score)
    max_score = max(scoreList)
    max_index = scoreList.index(max_score)
    # error_ratio = scoreInfoList[max_index][1]
    components = scoreListInfo[max_index][1]
    c = scoreListInfo[max_index][2]
    del scoreListInfo
    del scoreList
    print('best parameters:')
    print(max_score, c, components)
    return (max_score, c, components)
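The manual loop over gamma above can also be written as a grid search. A sketch using x_train/y_train from the example; note two differences: the original scores a single fixed holdout while GridSearchCV cross-validates, and it varies gamma only (C stays at its default):

import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV

param_grid = {'gamma': np.arange(0.1, 1, 0.1)}
search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=3)
search.fit(x_train, y_train)
print('best gamma:', search.best_params_['gamma'],
      'cv score:', search.best_score_)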