Python get_delete_index示例，Feature_Engineering.outliner_check.get_delete_index Python示例

示例#1

0

显示文件

def compare_s_no(dataPath=""):
    """Plot the target values before and after LOF replacement, side by side.

    Reads the CSV at ``dataPath``, adds indicators with
    ``get_other_indicators``, drops rows containing NaN and keeps the last
    100 remaining rows. The features are standardized, passed through
    ``PCA_mars.getPcaComponent`` with 54 components and then through
    ``oc.LOF_PCA_for_Clustering``. The left panel shows the original targets,
    the right panel shows the output of ``oc.replace_Singular``; in both
    panels the positions returned by ``oc.get_delete_index`` are overlaid in
    red.

    Args:
        dataPath: Path of the CSV file handed to ``pd.read_csv``.

    Returns:
        None. Shapes and lengths are printed and a matplotlib window is shown.
    """
    raw = pd.read_csv(dataPath)
    enriched = get_other_indicators(raw)
    recent = enriched.dropna(axis=0)[-100:]
    prepared = deal_data_from_dataFrame(recent)

    # NOTE(review): the features are standardized twice; this first result is
    # only used for the shape print. Kept in case standardized_mars has side
    # effects -- confirm and drop if it is pure.
    print(nr.standardized_mars(prepared[0]).shape)

    targets = prepared[1]
    features = prepared[0]
    scaled = nr.standardized_mars(features)
    # PCA is fitted on all of the (retained) rows.
    projected = PCA_mars.getPcaComponent(scaled, n_components=54)

    # The LOF call must precede the three oc getters below, which read back
    # what it produced (presumably state stored on `oc` -- confirm).
    oc.LOF_PCA_for_Clustering(projected, isUsePCA=False)
    lof_pred = oc.get_pred_test()
    flagged = oc.get_delete_index()
    replaced = oc.replace_Singular(targets, lof_pred)

    _, (left, right) = plt.subplots(1, 2, sharex=True)

    # Left panel: original targets with the flagged positions highlighted.
    left.scatter(range(len(targets)), targets, label='data_y')
    left.scatter(flagged, targets[flagged], label='error_y', c='r', alpha=0.2)
    left.legend()

    # Right panel: targets after replacement, same positions highlighted.
    flagged_replaced = replaced[flagged]
    right.scatter(range(len(replaced)), replaced, label='lof_data_y')
    right.scatter(flagged, flagged_replaced, label='error_lof_y', c='r',
                  alpha=0.2)
    right.legend()

    print(len(targets))
    print(len(replaced))
    # Adjust the spacing between the two panels before showing the figure.
    plt.tight_layout()
    plt.show()

示例#2

0

显示文件

def analyze_lof(dataPath=""):
    """Write a per-date outlier flag file for the data in ``dataPath``.

    Reads the CSV, reverses the row order, adds indicators with
    ``get_other_indicators`` and drops rows containing NaN. The features from
    ``deal_data`` are standardized, passed through
    ``PCA_mars.getPcaComponent`` with 62 components and then through
    ``oc.LOF_PCA_for_Clustering``. Every index label of the cleaned frame is
    written to ``300113_data.csv`` with ``1`` when its row position is among
    those returned by ``oc.get_delete_index`` and ``0`` otherwise.

    Args:
        dataPath: Path of the CSV file handed to ``pd.read_csv``.

    Returns:
        None. The PCA shape and the flagged positions are printed, and
        ``300113_data.csv`` is (over)written in the working directory.
    """
    data = pd.read_csv(dataPath)
    data = data[::-1]
    # Add the additional indicators.
    result = get_other_indicators(data)
    deal_result = result.dropna(axis=0)

    final_data = deal_data(deal_result)
    final_data_x = nr.standardized_mars(final_data[0])

    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=62)
    print(pca_x.shape)

    # Outlier detection on x. The return value is not needed here; the call
    # is kept because get_delete_index() is read right after it.
    oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)
    # Build the lookup once so each row costs O(1) instead of a linear scan.
    # Assumes error_index is a flat sequence of integer positions -- confirm.
    flagged = set(error_index)

    dates = deal_result.index.tolist()
    # Write every date; the flag is 1 where an outlier was detected.
    with open('300113_data.csv', 'w+') as f:
        f.write('date,300113_Sigular\n')
        for position, date in enumerate(dates):
            flag = '1' if position in flagged else '0'
            f.write(date + ',' + flag + '\n')

示例#3

0

显示文件

def fit_randomForest_del(daySpan=0, dataPath=""):
    """Run ``random_forest`` on the samples left after LOF-based deletion.

    Reads the CSV at ``dataPath``, adds indicators with
    ``get_other_indicators`` and drops rows containing NaN. The features are
    standardized, reduced with ``PCA_mars.getPcaComponent`` to 2 components
    and filtered by ``oc.LOF_PCA_for_Clustering_del``. The targets at the
    positions returned by ``oc.get_delete_index`` are deleted as well before
    both are handed to ``random_forest``.

    Args:
        daySpan: When 0 the targets come from ``dataY_from_dataFrame``;
            otherwise from ``dataY_for_Nmean(..., N=daySpan)`` and the
            features are truncated to the length of the targets.
        dataPath: Path of the CSV file handed to ``pd.read_csv``.

    Returns:
        None. Intermediate shapes are printed.
    """
    raw = pd.read_csv(dataPath)
    # Add the additional indicators, then discard incomplete rows.
    frame = get_other_indicators(raw).dropna(axis=0)

    features = dataX_from_dataFrame(frame)
    if daySpan != 0:
        targets = dataY_for_Nmean(frame, N=daySpan)
        # Keep only as many feature rows as there are targets.
        features = features[:len(targets)]
    else:
        targets = dataY_from_dataFrame(frame)

    scaled = nr.standardized_mars(features)
    print(scaled.shape)

    # PCA is applied to all of the data.
    projected = PCA_mars.getPcaComponent(scaled, n_components=2)
    print(projected.shape)

    # Outlier handling: drop the same positions from the targets that the
    # LOF step reports via get_delete_index().
    kept_x = oc.LOF_PCA_for_Clustering_del(projected, isUsePCA=False)
    removed = oc.get_delete_index()
    kept_y = np.delete(targets, removed, axis=0)
    print(kept_x.shape)
    print('y', targets.shape)

    # NOTE(review): the copied result is neither returned nor used further.
    feature_importances = np.copy(random_forest((kept_x, kept_y)))

示例#4

0

显示文件

def analyze_lof_sql(code=""):
    """Write a per-date outlier flag file for the rows selected by the query.

    Loads rows from ``global_info`` where ``industry_name`` is "纳斯达克"
    (dates shifted forward by one day in the query) through
    ``deal_dataFrame``. The features from ``deal_data`` are standardized,
    passed through ``PCA_mars.getPcaComponent`` with 62 components and then
    through ``oc.LOF_PCA_for_Clustering``. Every index entry of the loaded
    frame is written to ``NDX_data.csv`` as ``YYYY-MM-DD`` with ``1`` when its
    row position is among those returned by ``oc.get_delete_index`` and ``0``
    otherwise.

    Args:
        code: Unused; kept so existing callers keep working.

    Returns:
        None. The PCA shape and the flagged positions are printed, and
        ``NDX_data.csv`` is (over)written in the working directory.
    """
    NDX_sql = 'SELECT open,close,low,high,volume,other,change_rate, DATE_ADD(date,INTERVAL 1 DAY) as date from global_info where industry_name = "纳斯达克" order by date asc;'
    # Empty list: no columns are passed for removal.
    NDXData = deal_dataFrame(NDX_sql, [])

    final_data = deal_data(NDXData)
    final_data_x = nr.standardized_mars(final_data[0])

    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=62)
    print(pca_x.shape)

    # Outlier detection on x. The return value is not needed here; the call
    # is kept because get_delete_index() is read right after it.
    oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)
    # Build the lookup once so each row costs O(1) instead of a linear scan.
    # Assumes error_index is a flat sequence of integer positions -- confirm.
    flagged = set(error_index)

    dates = NDXData.index.tolist()
    # Write every date; the flag is 1 where an outlier was detected.
    with open('NDX_data.csv', 'w+') as f:
        f.write('date,NDX_Sigular\n')
        for position, date in enumerate(dates):
            flag = '1' if position in flagged else '0'
            f.write(date.strftime('%Y-%m-%d') + ',' + flag + '\n')

示例#5

0

显示文件

def singular(daySpan=0, stock_code=''):
    """Find the PCA component count with the highest score for a stock.

    Loads the correlation table, takes the five index entries ranked highest
    in the ``stock_code`` column and skips those whose value is below 0.6.
    The ``stock_fill`` rows of the remaining codes are fetched through
    ``deal_dataFrame``, joined column-wise, sorted by index and stripped of
    rows containing NaN. For every component count from 6 up to (but
    excluding) ``n_features - 10`` the standardized features are reduced with
    ``PCA_mars.getPcaComponent``, ``oc.LOF_PCA_for_Clustering`` is run on the
    result and ``random_forest`` is run on the PCA output.

    Side effect: the module-level ``date_index`` is set to the index labels
    of the cleaned, joined frame.

    Args:
        daySpan: When 0 the targets come from ``dataY_from_dataFrame_5``
            (more than 80 feature columns) or ``dataY_5_no_correlation``;
            otherwise from ``dataY_for_Nmean(..., N=daySpan)`` and the
            features are truncated to the length of the targets.
        stock_code: Column of the correlation table to rank by.

    Returns:
        Tuple ``(max_score, error_ratio, component)``: the highest value
        returned by ``getScore``, the share of samples reported by
        ``oc.get_delete_index`` for that run, and its component count. The
        first run wins on ties.
    """
    global date_index

    correlation = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                              index_col=0)
    correlation = correlation.sort_values(stock_code, ascending=False)

    # Only the five highest-ranked entries are considered.
    candidates = correlation.index.tolist()[:5]

    frames = []
    for peer in candidates:
        # Skip codes whose correlation value is below 0.6.
        if correlation.loc[peer, stock_code] < 0.6:
            continue
        # The first character of the index entry is stripped before it is
        # spliced into the query (presumably a leading quote -- confirm).
        query = ('SELECT * from stock_fill where stock_code=' + peer[1:]
                 + ' and date < "2018-12-15" order by date asc;')
        dropped_columns = ['id', 'stock_code', 'stock_name', 'modify']
        frames.append(deal_dataFrame(query, dropped_columns))

    # Join column-wise; pandas aligns the frames on their index.
    merged = pd.concat(frames, axis=1).sort_index()
    print('new_df:', merged.shape)
    print('new_df data:', merged[:5])

    merged = merged.dropna(axis=0)
    # Publish the surviving index labels for use outside this function.
    date_index = merged.index.tolist()
    print('shape:', merged.shape)

    features = dataX_from_dataFrame(merged)
    if daySpan != 0:
        targets = dataY_for_Nmean(merged, N=daySpan)
        features = features[:len(targets)]
    elif features.shape[1] > 80:
        targets = dataY_from_dataFrame_5(merged)
    else:
        # No correlated stocks took part in the joined frame.
        targets = dataY_5_no_correlation(merged)

    scaled = nr.standardized_mars(features)
    print(scaled.shape)

    sample_count = len(targets)
    trials = []
    for n_components in range(6, scaled.shape[1] - 10):
        projected = PCA_mars.getPcaComponent(scaled, n_components=n_components)
        print(projected.shape)

        # The LOF result itself is unused; the call is kept because
        # get_delete_index() is read after it.
        oc.LOF_PCA_for_Clustering(projected, isUsePCA=False)
        print(projected.shape)

        # The forest is run on the full PCA output, not the LOF output.
        random_forest((projected, targets))
        outlier_ratio = len(oc.get_delete_index()) / sample_count
        # getScore() presumably reports the random_forest run just above --
        # confirm.
        trials.append((getScore(), outlier_ratio, n_components))

    scores = [trial[0] for trial in trials]
    max_score = max(scores)
    best_trial = trials[scores.index(max_score)]
    error_ratio = best_trial[1]
    component = best_trial[2]

    print(max_score, error_ratio, component)
    return (max_score, error_ratio, component)