Example #1
def outer_cv_loop(Xdata,
                  Ydata,
                  clf,
                  parameters=[],
                  n_splits=10,
                  test_size=0.25):

    pred = numpy.zeros(len(Ydata))
    importances = []
    kf = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
    rocscores = []
    for train, test in kf.split(Xdata, Ydata):
        if numpy.var(Ydata[test]) == 0:
            print('zero variance in test labels; skipping fold')
            rocscores.append(numpy.nan)
            continue
        Ytrain = Ydata[train]
        Xtrain = fancyimpute.SoftImpute(verbose=False).complete(
            Xdata[train, :])
        Xtest = fancyimpute.SoftImpute(verbose=False).complete(Xdata[test, :])
        if numpy.abs(numpy.mean(Ytrain) - 0.5) > 0.2:
            smt = SMOTETomek()
            Xtrain, Ytrain = smt.fit_sample(Xtrain.copy(), Ydata[train])
        # filter out bad folds
        clf.fit(Xtrain, Ytrain)
        pred = clf.predict(Xtest)
        if numpy.var(pred) > 0:
            rocscores.append(roc_auc_score(Ydata[test], pred))
        else:
            rocscores.append(numpy.nan)
        importances.append(clf.feature_importances_)
    return rocscores, importances
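
# A hedged usage sketch for outer_cv_loop, not from the original project: the toy
# data, the classifier choice, and the missingness rate below are illustrative
# assumptions, and the module-level imports the function relies on (numpy,
# fancyimpute, StratifiedShuffleSplit, roc_auc_score, SMOTETomek) are assumed present.
from sklearn.ensemble import RandomForestClassifier

rng = numpy.random.RandomState(0)
Xdemo = rng.normal(size=(200, 5))                              # toy feature matrix
Ydemo = (Xdemo[:, 0] + rng.normal(scale=0.5, size=200) > 0).astype(int)  # toy binary outcome
Xdemo[rng.rand(200, 5) < 0.05] = numpy.nan                     # sprinkle missing values for SoftImpute
demo_clf = RandomForestClassifier(n_estimators=100)
rocs, imps = outer_cv_loop(Xdemo, Ydemo, demo_clf, n_splits=5)
print(numpy.nanmean(rocs))                                     # mean AUC across usable folds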
bx_df['beh_nback_neut-gt-neg_rt'] = bx_df['beh_nback_neutface_cor_rt'] - bx_df[
    'beh_nback_negface_cor_rt']
bx_df['beh_nback_neut-gt-pos_rt'] = bx_df['beh_nback_neutface_cor_rt'] - bx_df[
    'bn_posface_cor_rt']
bx_df['sst_cor_stop_pct'] = 1 - bx_df['bs_incor_stop_percent_total']

bx_vars = [
    'cash_choice_task', 'sst_cor_stop_pct', 'upps_y_ss_negative_urgency',
    'upps_y_ss_lack_of_planning', 'upps_y_ss_sensation_seeking',
    'upps_y_ss_positive_urgency', 'upps_y_lack_perseverance',
    'bis_y_ss_bis_sum', 'bis_y_ss_bas_rr', 'bis_y_ss_bas_drive',
    'bis_y_ss_bas_fs', 'nihtbx_flanker_agecorrected',
    'beh_nback_neut-gt-neg_rt', 'beh_nback_neut-gt-pos_rt'
]

impute_pls = fi.SoftImpute(verbose=False)

complete_bx = impute_pls.fit_transform(bx_df[bx_vars])
complete_scaled = scale(complete_bx)

decomp = PCA().fit(complete_scaled)

fig, ax = plt.subplots(figsize=[7, 5])
plt.tight_layout(pad=2)
ax2 = ax.twinx()
g = sns.pointplot(np.arange(1, 15),
                  decomp.explained_variance_ratio_,
                  markers='x',
                  join=True,
                  ax=ax)
h = sns.lineplot(x=np.arange(1, 15),
                 y=decomp.explained_variance_ratio_.cumsum(),
                 ax=ax2)  # assumed completion: the original snippet is truncated here
Example #3
def impute(df, method, verbose=False):
    """
    Impute missing data using specified imputation method.
    
    Parameters
    ----------
    df: pd.DataFrame
        Stat DataFrame with source columns and player/team multi-index.
    method: str/bool
        Imputation method for missing data.
            - False: Do not impute missing data.
            - None: Do not impute missing data.
            - 'BiScaler'
            - 'IterativeImpute'
            - 'IterativeSVD'
            - 'KNN': Impute with nearest neighbors.
            - 'MatrixFactorization'
            - 'Mean': Impute missing with average of other sources.
            - 'Median': Impute missing with median of other sources.
            - 'NuclearNorm'
            - 'SoftImpute'
    verbose: bool, default=False
        If True, print debugging information.
        
    Returns
    -------
    df: pd.DataFrame
        Imputed DataFrame with no NaNs.
    """
    warnings.filterwarnings('ignore', category=RuntimeWarning)

    # Subset DataFrame to include only projection columns.
    ignored_cols = ['Player', 'Team', 'Pos', 'Week', 'STATS']
    impute_cols = [col for col in list(df) if col not in ignored_cols]
    X = df[impute_cols].copy().T

    # Impute DataFrame.
    v = verbose
    if method in [None, False]:
        imputed_vals = X.values
    elif np.sum(np.sum(X.isnull())) == 0:
        # No missing values.
        imputed_vals = X.values
    elif method == 'BiScaler':
        imputed_vals = fi.BiScaler(verbose=v).fit_transform(X)
    elif method == 'IterativeImpute':
        imputed_vals = fi.IterativeImputer(verbose=v).fit_transform(X)
    elif method == 'IterativeSVD':
        imputed_vals = fi.IterativeSVD(verbose=v).fit_transform(X)
    elif method == 'KNN':
        imputed_vals = fi.KNN(k=3, verbose=v).fit_transform(X)
    elif method == 'MatrixFactorization':
        imputed_vals = fi.MatrixFactorization(verbose=v).fit_transform(X)
    elif method == 'Mean':
        imputed_vals = fi.SimpleFill('mean').fit_transform(X)
    elif method == 'Median':
        imputed_vals = fi.SimpleFill('median').fit_transform(X)
    elif method == 'NuclearNorm':
        imputed_vals = fi.NuclearNormMinimization(verbose=v).fit_transform(X)
    elif method == 'SoftImpute':
        imputed_vals = fi.SoftImpute(verbose=v).fit_transform(X)
    else:
        raise ValueError(f'Unrecognized imputation method: {method!r}')

    # Recombine ignored columns with imputed data.
    imputed_df = pd.DataFrame(imputed_vals.T, columns=X.index)
    for col in impute_cols:
        if len(imputed_df[col]) != len(df[col]):
            print(f'df: {len(df[col])}\nimp: {len(imputed_df[col])}')
        df[col] = imputed_df[col].values

    return df
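
# A hedged usage sketch for impute(), not from the original project: the 'ESPN' and
# 'Yahoo' source columns and the player rows are made-up examples, and the
# module-level imports the function relies on (warnings, numpy as np, pandas as pd,
# fancyimpute as fi) are assumed to be in place. Note that impute() writes the
# imputed values back into the DataFrame it is given, so we pass a copy.
toy = pd.DataFrame({
    'Player': ['A', 'B', 'C'],
    'Team': ['X', 'Y', 'Z'],
    'ESPN': [10.0, np.nan, 7.5],     # projection from source 1
    'Yahoo': [11.0, 6.0, np.nan],    # projection from source 2
})
filled = impute(toy.copy(), method='Mean')  # fill each gap with the mean of the other sources
print(filled)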
	def impute(self):
		return fi.SoftImpute(verbose=False).complete(self.missing_data)
Example #5
#**********************************
# Load Data
#**********************************

datasets = pickle.load(open('../Data/subset_data.pkl', 'rb'))
data = datasets['all_data']
task_data = datasets['task_data']
survey_data = datasets['survey_data']
verbose_lookup = datasets['verbose_lookup']

# ************************************
# ************ Imputation *******************
# ************************************
data.drop(['ptid', 'gender', 'age'], axis=1, inplace=True)
data_complete = fancyimpute.SoftImpute().complete(data)
data_complete = pd.DataFrame(data_complete,
                             index=data.index,
                             columns=data.columns)

# ************************************
# ************ Connectivity Matrix *******************
# ************************************

spearman_connectivity = calc_connectivity_mat(data_complete,
                                              edge_metric='spearman')
distance_connectivity = calc_connectivity_mat(data_complete,
                                              edge_metric='distance')

# ************************************
# ********* Graphs *******************
Example #6
import numpy as np
import fancyimpute

print("Loading memory")
mem = np.load(
    '/home/aocc/code/DL/MDP_learning/save_memory/first_20mill/BipedalWalker-v2CORRUPT0.1.npy'
)
print("Imputing memory")
memory_final = fancyimpute.SoftImpute().complete(mem)
print("Saving imputed memory")
np.save(
    '/home/aocc/code/DL/MDP_learning/save_memory/first_20mill/BipedalWalker-v2IMPUTED.1',
    memory_final)
import pandas as pd
import seaborn as sns
from sklearn import decomposition
import sys

sys.path.append('../utils')
from utils import get_behav_data
from plot_utils import dendroheatmap

# get dependent variables
DV_df = get_behav_data('Discovery_9-26-16', use_EZ=True)

# ************************************
# ************ Imputation *******************
# ************************************
DV_df_complete = fancyimpute.SoftImpute().complete(DV_df)
DV_df_complete = pd.DataFrame(DV_df_complete,
                              index=DV_df.index,
                              columns=DV_df.columns)

# ************************************
# ************ PCA *******************
# ************************************

pca_data = DV_df_complete.corr()
pca = decomposition.PCA()
pca.fit(pca_data)

# plot explained variance vs. components
sns.plt.plot(pca.explained_variance_ratio_.cumsum())
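
# Note: the sns.plt alias was removed in later seaborn releases; an equivalent of the
# scree plot above using matplotlib directly (a sketch, not from the original code):
import matplotlib.pyplot as plt

plt.plot(pca.explained_variance_ratio_.cumsum())
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()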
Example #8
def softimpute(X, y=None):
    return (fancyimpute.SoftImpute(verbose=False).complete(X))
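
# The (X, y=None) signature suggests this helper is meant to slot into a
# scikit-learn-style pipeline step; a hedged sketch of that use with
# FunctionTransformer, which is an assumption and not shown in the original code.
# fancyimpute is assumed to be imported as in the original module.
import numpy as np
from sklearn.preprocessing import FunctionTransformer

X_missing = np.array([[1.0, 2.0], [np.nan, 3.0], [4.0, np.nan]])
imputer = FunctionTransformer(softimpute, validate=False)
print(imputer.transform(X_missing))  # the SoftImpute-completed matrix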
def preprocess(trainfile,
               testfile,
               outputdir,
               useless_attr,
               miss_threshold,
               xstrategy,
               ymin,
               ymax,
               ystrategy,
               fill_method="MICE",
               normal01=True):
    """对XY进行数据预处理,矩阵补全、正则化标准化等。

    :param trainfile: string, 训练集(d_train_20180102.csv)的路径
    :param testfile: string, 测试集(d_test_A_20180102.csv)的路径
    :param outputdir: string, 预处理后文件保存的路径
    :param useless_attr: list, 需要删除的无用属性,比如[0, 1, 2, 3]
    :param miss_threshold: float, 属性确实严重忽略的阈值,百分比,比如0.7
    :param xstrategy: string, 对x中奇异点的处理方式{"replace", "nothing"}
    :param ymin: float, 对Y中点的最小值,小于这个值,即为奇异点
    :param ymax: float, 对Y中点的最大值,超过这个值,就是奇异点
    :param ystrategy: string, 对y中奇异点的处理方式("delete", "replace", "nothing")
    :param fill_method: string, 矩阵补全的策略,{"KNN", "SoftI", "MF", "MICE"}
    :param normal01: bool, 如果为真,则对结果进行归一化到01,否则,不归一化
    :return: list, 归一化之后的trainX, trainY, testX
    """
    # 0. Read in the training and test sets
    train_XY = convert(trainfile)
    test_X = convert(testfile)
    print("读入数据集,开始数据预处理")

    # 1. Drop useless attribute columns
    train_id = train_XY[:, 0:1]
    test_id = test_X[:, 0:1]
    train_XY = np.delete(train_XY, useless_attr, axis=1)
    test_X = np.delete(test_X, useless_attr, axis=1)
    n_test = test_X.shape[0]
    info1 = "1. 删除train_XY, test_X上的无用属性:%s, train_X.shape=%s, test_X.shape=%s"\
          %(str(useless_attr), str(train_XY.shape), str(test_X.shape))
    print(info1)

    # 2. Drop columns with severe missingness
    miss_mask = np.isnan(train_XY)
    n = miss_mask.shape[0]
    column_del = []  # list of columns to drop
    for i in range(miss_mask.shape[1]):
        miss_n = miss_mask[:, i].sum()
        if miss_n / n >= miss_threshold:
            column_del.append(i)
    train_XY = np.delete(train_XY, column_del, axis=1)
    test_X = np.delete(test_X, column_del, axis=1)
    info2 = "2. 在train_XY, test_X上删除缺失超过%f%%的属性:%s" % (miss_threshold * 100,
                                                       str(column_del))
    print(info2)

    # 3. Denoise y using manually set thresholds
    train_Y = train_XY[:, -1:]
    upper_mask = train_Y > ymax
    lower_mask = train_Y < ymin
    if ystrategy == "replace":
        train_Y[upper_mask] = ymax
        train_Y[lower_mask] = ymin
    elif ystrategy == "delete":
        index = np.array(np.arange(0, train_Y.shape[0], 1), ndmin=2).T
        chsn_mask = upper_mask | lower_mask
        train_XY = np.delete(train_XY, index[chsn_mask], axis=0)
        train_id = np.delete(train_id, index[chsn_mask], axis=0)
    elif ystrategy == "nothing":
        pass
    else:
        raise ValueError(r"'ystrategy'应该是{nothing, replace, delete}中的一个")
    train_Y = train_XY[:, -1:]
    print("3. 对trainY去噪(%s),trainXY.shape=%s" % (ystrategy, train_XY.shape))

    # 4. Process X, deriving thresholds from a boxplot
    train_X = train_XY[:, :-1]
    all_X = np.concatenate([train_X, test_X], axis=0)
    attr_n = train_XY.shape[1] - 1
    attr_min_max = np.zeros(
        (attr_n, 2), dtype=np.float64)  # per-attribute (min, max) thresholds from the boxplot
    if xstrategy == "nothing":
        pass
    elif xstrategy == "replace":
        # Replace outliers in X with the boxplot extremes
        for i in range(attr_n):
            # Slice view of this column; modifying crt_attr modifies all_X in place
            crt_attr = all_X[:, i:i + 1]
            miss = np.isnan(crt_attr)
            box_dic = plt.boxplot(crt_attr[~miss])
            crt_max = box_dic["caps"][0].get_ydata()[0]
            crt_min = box_dic["caps"][1].get_ydata()[0]
            if crt_max < crt_min:
                tmp = crt_max
                crt_max = crt_min
                crt_min = tmp
            attr_min_max[i, 0] = crt_min
            attr_min_max[i, 1] = crt_max
            crt_attr[miss] = 0
            upper_mask = crt_attr > crt_max
            lower_mask = crt_attr < crt_min
            upper_mask &= ~miss
            lower_mask &= ~miss

            crt_attr[upper_mask] = crt_max
            crt_attr[lower_mask] = crt_min
            crt_attr[miss] = np.nan
    else:
        raise ValueError(r"'xstrategy'应该是{nothing, replace}中的一个")
    print(r"4. 对所有的X进行去噪(%s)." % xstrategy)

    # 5. Matrix completion
    completer = None
    if fill_method == "KNN":
        completer = fi.KNN(verbose=False)
    elif fill_method == "SoftI":
        completer = fi.SoftImpute(verbose=False)
    elif fill_method == "MF":
        completer = fi.MatrixFactorization(verbose=False)
    elif fill_method == "MICE":
        completer = fi.MICE(verbose=False)
    else:
        raise ValueError("'fill_method' must be one of {'KNN', 'SoftI', 'MF', 'MICE'}.")
    all_X_complete = completer.complete(all_X)
    print("5. 在all_X上进行矩阵补全(%s)." % fill_method)

    # train_X = all_X_complete[:-1000, :]
    # test_X = all_X_complete[-1000:, :]
    # 6. Standardize, then scale to [0, 1]
    if normal01:
        X_nmler = StandardScaler()
        X_01 = MinMaxScaler()
        Y_nmler = StandardScaler()
        Y_01 = MinMaxScaler()

        X_nmler.fit(all_X_complete)
        Y_nmler.fit(train_Y)
        all_X_nml = X_nmler.transform(all_X_complete)
        train_Y_nml = Y_nmler.transform(train_Y)
        X_01.fit(all_X_nml)
        Y_01.fit(train_Y_nml)
        all_X_nml01 = X_01.transform(all_X_nml)
        train_Y_nml01 = Y_01.transform(train_Y_nml)
        final_train_X = all_X_nml01[:-n_test, :]
        final_test_X = all_X_nml01[-n_test:, :]
        final_train_Y = np.concatenate([train_Y_nml01, train_Y], axis=1)
    else:
        final_train_X = all_X_complete[:-n_test, :]
        final_test_X = all_X_complete[-n_test:, :]
        final_train_Y = train_Y
    print(r"6. 对all_X, train_Y归一化到01(%s)." % normal01)

    # 7. Save data
    print(r"7. 存储数据为: 集合_类别_日期.csv(%s)." % outputdir)
    # timestamp = datetime.now().strftime("%Y%m%d%H%M")
    timestamp = "0000"
    np.savetxt(outputdir + r"\train_X_" + timestamp + ".csv",
               final_train_X,
               delimiter=",")
    np.savetxt(outputdir + r"\test_X_" + timestamp + ".csv",
               final_test_X,
               delimiter=",")
    np.savetxt(outputdir + r"\train_Y_" + timestamp + ".csv",
               final_train_Y,
               delimiter=",")
    np.savetxt(outputdir + r"\train_id_" + timestamp + ".csv",
               train_id.astype(np.int64),
               delimiter=",")
    np.savetxt(outputdir + r"\test_id_" + timestamp + ".csv",
               test_id.astype(np.int64),
               delimiter=",")
    return final_train_X, final_train_Y, final_test_X, train_id
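
# A hedged invocation sketch for preprocess(), not from the original project: the
# paths, column indices, and y thresholds below are illustrative placeholders only.
train_X, train_Y, test_X, train_id = preprocess(
    trainfile="d_train_20180102.csv",   # training set path (example)
    testfile="d_test_A_20180102.csv",   # test set path (example)
    outputdir=r".",                     # where the *_0000.csv files are written
    useless_attr=[0, 1, 2, 3],          # example indices of id/date-like columns
    miss_threshold=0.7,                 # drop columns that are >= 70% missing
    xstrategy="replace",                # clip X outliers to the boxplot caps
    ymin=0.0,                           # example lower bound for y
    ymax=30.0,                          # example upper bound for y
    ystrategy="replace",                # clip y outliers to [ymin, ymax]
    fill_method="SoftI",                # complete the matrix with SoftImpute
    normal01=True)                      # standardize and scale to [0, 1]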
    if not os.path.exists(os.path.dirname(datafile)):
        os.mkdir(os.path.dirname(datafile))
    behavdata = get_behav_data(dataset)
    filter_by_icc = False

    if filter_by_icc:
        icc_threshold = 0.25
        icc_boot = pandas.read_csv(
            '../Data/Retest_09-27-2017/bootstrap_merged.csv')
        icc = icc_boot.groupby('dv').mean().icc

        for v in behavdata.columns:
            if icc.loc[v] < icc_threshold:
                del behavdata[v]

    behavdata_imputed = fancyimpute.SoftImpute().fit_transform(
        behavdata.values)
    df = pandas.DataFrame(behavdata_imputed, columns=behavdata.columns)

    for dropvar in [
            'kirby_mturk.percent_patient', 'kirby_mturk.exp_discount_rate',
            'kirby_mturk.hyp_discount_rate',
            'kirby_mturk.exp_discount_rate_medium',
            'kirby_mturk.exp_discount_rate_small',
            'kirby_mturk.exp_discount_rate_large',
            'probabilistic_selection_mturk.positive_learning_bias',
            'shift_task_mturk.nonperseverative_errors',
            'stop_signal_mturk.omission_errors', 'stop_signal_mturk.SSRT',
            'stroop_mturk.incongruent_errors'
    ]:
        del df[dropvar]