Exemplos de LOF_PCA_for_Clustering em Python, exemplos de Feature_Engineering.outliner_check.LOF_PCA_for_Clustering em Python

Exemplo n.º 1

0

Exibir arquivo

def test():
    """Cluster the outlier-replaced features of one stock with MeanShift.

    Reads a hard-coded CSV, derives indicators, standardizes the feature
    matrix, replaces the samples reported by the ``oc`` helper, fits
    ``MeanShift`` on the resulting matrix, prints diagnostics and finally
    plots the ``close`` column together with the cluster labels.
    """
    csv_path = '/home/mars/Data/finialData/electronic_infomation/000021.csv'
    # The file is processed in reversed row order.
    quotes = pd.read_csv(csv_path)[::-1]
    clean_frame = get_other_indicators(quotes).dropna(axis=0)

    close = clean_frame['close']
    print(close.shape)

    split_data = deal_data_from_dataFrame(clean_frame)
    data_x, _data_y = split_data[0], split_data[1]

    # Normalization.
    scaled_x = nr.standardized_mars(data_x)

    # The return value is not needed here; the call precedes
    # oc.get_pred_test(), which presumably reads its result -- TODO confirm.
    oc.LOF_PCA_for_Clustering(scaled_x)
    final_data_x_LOF = oc.replace_Singular(scaled_x, oc.get_pred_test())
    print('final_data_x_LOF', final_data_x_LOF[:16])
    print(final_data_x_LOF.shape)

    # The bandwidth is estimated on the first 70% of the rows only ...
    split_at = int(len(data_x) * 0.7)
    x_train = final_data_x_LOF[:split_at]
    print('x_train', x_train.shape)
    bandwidth = estimate_bandwidth(x_train, quantile=0.2, random_state=1)

    # ... while the clustering itself is fitted on every row.
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
    ms.fit(final_data_x_LOF)
    labels = ms.labels_

    off_cluster = labels != 0
    print('error size', np.count_nonzero(off_cluster))
    print('index of not 0 *******')
    print(np.flatnonzero(off_cluster).tolist())
    print('*******')
    print(labels)
    print(labels.shape)

    n_clusters_ = len(np.unique(labels))
    print("number of estimated clusters : %d" % n_clusters_)

    # Overlay the close series and the per-row cluster label.
    for series in (close, labels):
        plt.plot(range(len(series)), series)
    plt.show()
    # #############################################################################
    # Plot result
    '''

Exemplo n.º 2

0

Exibir arquivo

def compare_s_no(dataPath=""):
    """Plot the target series before and after outlier replacement.

    Loads the CSV at ``dataPath``, keeps the last 100 complete indicator
    rows, reduces the standardized features to 54 PCA components, runs the
    ``oc`` outlier step on them and draws two scatter panels: the original
    ``data_y`` and the ``oc.replace_Singular`` result, each with the rows
    from ``oc.get_delete_index()`` highlighted in red.

    Args:
        dataPath: path of the CSV file passed to ``pd.read_csv``.
    """
    raw = pd.read_csv(dataPath)
    # Add the other indicators, drop rows with NaN, keep the last 100 rows.
    recent = get_other_indicators(raw).dropna(axis=0)[-100:]

    final_data = deal_data_from_dataFrame(recent)
    final_data_x = nr.standardized_mars(final_data[0])
    print(final_data_x.shape)

    data_y = final_data[1]
    data_x = final_data[0]
    # NOTE(review): this repeats the standardization call made above on the
    # same input; kept because the helper's side effects are not visible here.
    final_data_x = nr.standardized_mars(data_x)
    # PCA is computed on 100% of the data.
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=54)

    # Outlier handling on the PCA output.
    oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    lof_pred = oc.get_pred_test()
    error_index = oc.get_delete_index()
    lof_data_y = oc.replace_Singular(data_y, lof_pred)

    _, axes = plt.subplots(1, 2, sharex=True)
    panels = (
        (data_y, 'data_y', 'error_y'),
        (lof_data_y, 'lof_data_y', 'error_lof_y'),
    )
    for axis, (series, series_label, flagged_label) in zip(axes, panels):
        axis.scatter(range(len(series)), series, label=series_label)
        flagged = series[error_index]
        axis.scatter(error_index, flagged, label=flagged_label, c='r',
                     alpha=0.2)
        axis.legend()

    print(len(data_y))
    print(len(lof_data_y))
    # Adjust the spacing between the panels.
    plt.tight_layout()
    plt.show()

Exemplo n.º 3

0

Exibir arquivo

def analyze_lof(dataPath=""):
    """Write a per-row outlier flag for one stock to ``300113_data.csv``.

    Loads the CSV at ``dataPath`` in reversed row order, derives indicators,
    reduces the standardized features to 62 PCA components and runs the
    ``oc`` outlier step. Every index label of the cleaned frame is then
    written as ``<label>,<flag>`` where the flag is ``1`` when the row
    position is contained in ``oc.get_delete_index()`` and ``0`` otherwise.

    Args:
        dataPath: path of the CSV file passed to ``pd.read_csv``.
    """
    quotes = pd.read_csv(dataPath)[::-1]
    # Add the other indicators, then discard rows containing NaN.
    deal_result = get_other_indicators(quotes).dropna(axis=0)

    final_data = deal_data(deal_result)
    final_data_x = nr.standardized_mars(final_data[0])

    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=62)
    print(pca_x.shape)

    # Outlier handling on x; only the flagged positions are used afterwards.
    oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)

    row_labels = deal_result.index.tolist()
    # Write every date; a row flagged as an outlier is marked with 1.
    with open('300113_data.csv', 'w+') as f:
        f.writelines(('date', ',', '300113_Sigular', '\n'))
        for position, date in enumerate(row_labels):
            flag = '1' if position in error_index else '0'
            f.writelines((date, ',', flag, '\n'))

Exemplo n.º 4

0

Exibir arquivo

def analyze_lof_sql(code=""):
    """Write a per-date outlier flag for the queried index to ``NDX_data.csv``.

    Fetches rows through ``deal_dataFrame`` with a fixed SQL query, reduces
    the standardized features to 62 PCA components and runs the ``oc``
    outlier step. Every index entry of the fetched frame is then written as
    ``YYYY-MM-DD,<flag>`` where the flag is ``1`` when the row position is
    contained in ``oc.get_delete_index()`` and ``0`` otherwise.

    Args:
        code: not read by this function.
    """
    # Fetch the sector data; the query shifts each date forward by one day.
    NDX_sql = 'SELECT open,close,low,high,volume,other,change_rate, DATE_ADD(date,INTERVAL 1 DAY) as date from global_info where industry_name = "纳斯达克" order by date asc;'
    NDXData = deal_dataFrame(NDX_sql, [])

    final_data = deal_data(NDXData)
    final_data_x = nr.standardized_mars(final_data[0])

    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=62)
    print(pca_x.shape)

    # Outlier handling on x; only the flagged positions are used afterwards.
    oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)

    row_dates = NDXData.index.tolist()
    # Write every date; a row flagged as an outlier is marked with 1.
    with open('NDX_data.csv', 'w+') as f:
        f.writelines(('date', ',', 'NDX_Sigular', '\n'))
        for position, date in enumerate(row_dates):
            flag = '1' if position in error_index else '0'
            f.writelines((date.strftime('%Y-%m-%d'), ',', flag, '\n'))

Exemplo n.º 5

0

Exibir arquivo

def fit_randomForest_repXY(daySpan=0, dataPath=""):
    """Train the random forest on outlier-processed x and y.

    Loads the CSV at ``dataPath``, derives indicators, reduces the
    standardized features to 2 PCA components, runs the ``oc`` outlier step
    on them, replaces the matching targets through ``oc.replace_Singular``
    and hands both results to ``random_forest``.

    Args:
        daySpan: ``0`` builds the targets with ``dataY_from_dataFrame``;
            any other value builds them with ``dataY_for_Nmean(N=daySpan)``
            and truncates the features to the same length.
        dataPath: path of the CSV file passed to ``pd.read_csv``.
    """
    raw = pd.read_csv(dataPath)
    # Add the other indicators, then discard rows containing NaN.
    deal_result = get_other_indicators(raw).dropna(axis=0)

    data_x = dataX_from_dataFrame(deal_result)
    if daySpan != 0:
        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        # Keep features and targets the same length.
        data_x = data_x[:len(data_y)]
    else:
        data_y = dataY_from_dataFrame(deal_result)

    final_data_x = nr.standardized_mars(data_x)
    print(final_data_x.shape)

    # PCA is computed on 100% of the data.
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=2)
    print(pca_x.shape)

    # Outlier handling on x.
    lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)

    # Outlier handling on y.
    print('all_y', data_y[-10:])
    lof_data_y = oc.replace_Singular(data_y, oc.get_pred_test())
    print('lof_y', lof_data_y[-10:])
    print(pca_x.shape)

    random_forest((lof_data_x, lof_data_y))

Exemplo n.º 6

0

Exibir arquivo

# Script fragment: scores three feature pipelines with random_forest()/getScore().
# MSx_train, MSx_test, MSy_train, MSy_test, final_data_x and data_y are defined
# outside this fragment.

# --- Pipeline 1: the MS train/test splits stacked back together, 35 PCA components.
print('MSy_train,MSy_test:', MSy_train.shape, MSy_test.shape)
all_x = np.vstack((MSx_train, MSx_test))
all_y = np.concatenate((MSy_train, MSy_test), axis=0)
pca_x = PCA_mars.getPcaComponent(all_x, n_components=35)
print('all_y:', all_y.shape)
MS_predict_y = random_forest((pca_x, all_y))
print('MS_predict_y', MS_predict_y.shape)
# getScore() is read right after random_forest(); presumably it reports the
# score of that most recent run -- TODO confirm.
ms_score = round(getScore(), 4)
# Drop the intermediates before the names are rebound below.
del pca_x
del all_y
del all_x

# Pipeline 2: get the train_Y of the "singular" variant (50 PCA components
# followed by the oc outlier step).
print('***********开始测试 singular ********************')
pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=50)
lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)

singular_predict_y = random_forest((lof_data_x, data_y))
print('singular_predict_y', singular_predict_y.shape)
singular_score = round(getScore(), 4)
del lof_data_x
del pca_x

# Pipeline 3: get the train_Y of the original random forest (53 PCA
# components, no outlier step).
print('***********开始测试 original ********************')
pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=53)
original_predict_y = random_forest((pca_x, data_y))
print('original_predict_y', original_predict_y.shape)
original_score = round(getScore(), 4)
del pca_x
# Get the train_Y of model4 (the code for it is not part of this fragment).

Exemplo n.º 7

0

Exibir arquivo

def singular(daySpan=0, stock_code=''):
    """Search the PCA component count that gives the best forest score.

    Ranks the codes in ``code_correlation.csv`` by their value in the
    ``stock_code`` column, loads the data of the first five whose value is
    not below 0.6, joins them column-wise, and then, for every component
    count from 6 up to (feature count - 10), runs PCA, the ``oc`` outlier
    step and ``random_forest``, recording ``getScore()``.

    Side effect: stores the index labels of the joined, NaN-free frame in
    the module-level ``date_index``.

    Args:
        daySpan: ``0`` selects a next-step target builder based on the
            feature count; any other value builds the targets with
            ``dataY_for_Nmean(N=daySpan)`` and truncates the features to
            the same length.
        stock_code: column of the correlation table to rank by.

    Returns:
        ``(max_score, error_ratio, component)`` of the first trial with the
        highest score, where ``error_ratio`` is
        ``len(oc.get_delete_index()) / len(targets)`` for that trial.
    """
    global date_index

    correlation = pd.read_csv(
        '/home/mars/Data/finialData/code_correlation.csv', index_col=0)
    correlation = correlation.sort_values(stock_code, ascending=False)

    # Only the first five entries of the ranking are considered.
    candidates = correlation.index.tolist()[:5]

    frames = []
    for code in candidates:
        # Skip stocks whose correlation coefficient is below 0.6.
        if correlation.loc[code, stock_code] < 0.6:
            continue
        # The first character of the code is stripped before it goes into
        # the query.
        code_sql = 'SELECT * from stock_fill where stock_code=' + code[
            1:] + ' and date < "2018-12-15" order by date asc;'
        code_delete_list = ['id', 'stock_code', 'stock_name', 'modify']
        frames.append(deal_dataFrame(code_sql, code_delete_list))

    # Join on the index, order by it, then drop rows containing NaN.
    joined = pd.concat(frames, axis=1).sort_index()
    print('new_df:', joined.shape)
    print('new_df data:', joined[:5])
    joined.dropna(axis=0, inplace=True)

    # Index labels of the surviving rows, published for other code.
    date_index = joined.index.tolist()
    print('shape:', joined.shape)

    data_x = dataX_from_dataFrame(joined)
    if daySpan != 0:
        data_y = dataY_for_Nmean(joined, N=daySpan)
        data_x = data_x[:len(data_y)]
    elif data_x.shape[1] > 80:
        data_y = dataY_from_dataFrame_5(joined)
    else:
        # No correlated stocks took part in the join.
        data_y = dataY_5_no_correlation(joined)

    final_data_x = nr.standardized_mars(data_x)
    print(final_data_x.shape)

    sample_count = len(data_y)
    trials = []
    for n_components in range(6, final_data_x.shape[1] - 10, 1):
        # PCA is computed on 100% of the data.
        pca_x = PCA_mars.getPcaComponent(final_data_x,
                                         n_components=n_components)
        print(pca_x.shape)

        # Outlier handling. The returned matrix is not used: the forest
        # below is trained on pca_x.
        # NOTE(review): confirm that training on pca_x is intended.
        oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
        print(pca_x.shape)

        random_forest((pca_x, data_y))
        flagged_ratio = len(oc.get_delete_index()) / sample_count
        trials.append((getScore(), flagged_ratio, n_components))

    # max() keeps the first trial among equal best scores.
    max_score, error_ratio, component = max(trials, key=lambda t: t[0])

    print(max_score, error_ratio, component)
    return (max_score, error_ratio, component)

Exemplo n.º 8

0

Exibir arquivo

def fit_SVM(daySpan=0, code=None):
    """Grid-search PCA component count and RBF gamma for an SVM classifier.

    Ranks the codes in ``code_correlation.csv`` by their value in the
    hard-coded ``'000021`` column, loads the CSVs of the first five, joins
    their indicator frames column-wise and, for every component count in
    6..39 and every gamma in ``np.arange(0.1, 1, 0.1)``, fits ``svm.SVC`` on
    the first 70% of the outlier-processed PCA output and scores it on the
    remaining 30% (unshuffled split).

    Args:
        daySpan: ``0`` builds the targets with ``dataY_from_dataFrame``;
            any other value builds them with ``dataY_for_Nmean(N=daySpan)``
            and truncates the features to the same length.
        code: not read by this function.

    Returns:
        ``(max_score, gamma, components)`` of the first trial with the
        highest test score.
    """
    stock_code = '\'000021'
    correlation = pd.read_csv(
        '/home/mars/Data/finialData/code_correlation.csv', index_col=0)
    correlation = correlation.sort_values(stock_code, ascending=False)
    print(correlation)

    # Only the first five entries of the ranking are loaded.
    frames = []
    for peer_code in correlation.index.tolist()[:5]:
        # The first character of the code is stripped to form the file name.
        print('code:', peer_code[1:])
        path = '/home/mars/Data/finialData/electronic_infomation/' + peer_code[
            1:] + '.csv'
        peer_quotes = pd.read_csv(path, index_col='date')
        frames.append(get_other_indicators(peer_quotes))

    # Join on the index, order by it, then drop rows containing NaN.
    joined = pd.concat(frames, axis=1).sort_index()
    print('new_df:', joined[:5])
    joined.dropna(axis=0, inplace=True)
    print('new_df2:', joined.get('price_change'))
    print('all shape:', joined.shape)

    data_x = dataX_from_dataFrame(joined)
    if daySpan != 0:
        data_y = dataY_for_Nmean(joined, N=daySpan)
        data_x = data_x[:len(data_y)]
    else:
        data_y = dataY_from_dataFrame(joined)

    final_data_x = nr.standardized_mars(data_x)
    print(final_data_x.shape)

    gammas = np.arange(0.1, 1, 0.1)
    trials = []
    for n_components in range(6, 40, 1):
        # PCA is computed on 100% of the data.
        pca_x = PCA_mars.getPcaComponent(final_data_x,
                                         n_components=n_components)
        print(pca_x.shape)

        # Outlier handling.
        lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
        print(pca_x.shape)

        x_train, x_test, y_train, y_test = train_test_split(
            lof_data_x,
            data_y,
            test_size=0.3,
            random_state=0,
            shuffle=False)

        # Fit one RBF model per gamma value.
        for gamma in gammas:
            clf = svm.SVC(gamma=gamma, kernel='rbf')
            clf.fit(x_train, y_train)
            score = clf.score(x_test, y_test)
            print(score)
            trials.append((score, n_components, gamma))

    # max() keeps the first trial among equal best scores.
    max_score, components, c = max(trials, key=lambda t: t[0])

    print('best paramers:')
    print(max_score, c, components)
    return (max_score, c, components)