import numpy
import fancyimpute
from imblearn.combine import SMOTETomek
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score


def outer_cv_loop(Xdata, Ydata, clf, parameters=[],
                  n_splits=10, test_size=0.25):
    pred = numpy.zeros(len(Ydata))
    importances = []
    kf = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
    rocscores = []
    for train, test in kf.split(Xdata, Ydata):
        # skip folds whose test labels have zero variance
        if numpy.var(Ydata[test]) == 0:
            print('zero variance in test fold')
            rocscores.append(numpy.nan)
            continue
        Ytrain = Ydata[train]
        Xtrain = fancyimpute.SoftImpute(verbose=False).complete(Xdata[train, :])
        Xtest = fancyimpute.SoftImpute(verbose=False).complete(Xdata[test, :])
        if numpy.abs(numpy.mean(Ytrain) - 0.5) > 0.2:
            smt = SMOTETomek()
            Xtrain, Ytrain = smt.fit_sample(Xtrain.copy(), Ydata[train])
        # filter out bad folds
        clf.fit(Xtrain, Ytrain)
        pred = clf.predict(Xtest)
        if numpy.var(pred) > 0:
            rocscores.append(roc_auc_score(Ydata[test], pred))
        else:
            rocscores.append(numpy.nan)
        importances.append(clf.feature_importances_)
    return rocscores, importances
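A minimal usage sketch (not part of the original snippet): the synthetic data, missingness rate, and random-forest classifier below are illustrative assumptions.

# Assumed example: run the outer CV loop on synthetic data with injected NaNs.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
rng = numpy.random.RandomState(0)
X[rng.rand(*X.shape) < 0.05] = numpy.nan  # mask ~5% of the values

clf = RandomForestClassifier(n_estimators=100, random_state=0)
rocscores, importances = outer_cv_loop(X, y, clf, n_splits=5)
print('mean AUC:', numpy.nanmean(rocscores))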
bx_df['beh_nback_neut-gt-neg_rt'] = bx_df['beh_nback_neutface_cor_rt'] - bx_df[
    'beh_nback_negface_cor_rt']
bx_df['beh_nback_neut-gt-pos_rt'] = bx_df['beh_nback_neutface_cor_rt'] - bx_df[
    'bn_posface_cor_rt']
bx_df['sst_cor_stop_pct'] = 1 - bx_df['bs_incor_stop_percent_total']
bx_vars = [
    'cash_choice_task', 'sst_cor_stop_pct', 'upps_y_ss_negative_urgency',
    'upps_y_ss_lack_of_planning', 'upps_y_ss_sensation_seeking',
    'upps_y_ss_positive_urgency', 'upps_y_lack_perseverance',
    'bis_y_ss_bis_sum', 'bis_y_ss_bas_rr', 'bis_y_ss_bas_drive',
    'bis_y_ss_bas_fs', 'nihtbx_flanker_agecorrected',
    'beh_nback_neut-gt-neg_rt', 'beh_nback_neut-gt-pos_rt'
]
impute_pls = fi.SoftImpute(verbose=False)
complete_bx = impute_pls.fit_transform(bx_df[bx_vars])
complete_scaled = scale(complete_bx)
decomp = PCA().fit(complete_scaled)

fig, ax = plt.subplots(figsize=[7, 5])
plt.tight_layout(pad=2)
ax2 = ax.twinx()
g = sns.pointplot(np.arange(1, 15), decomp.explained_variance_ratio_,
                  markers='x', join=True, ax=ax)
h = sns.lineplot(x=np.arange(1, 15),
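The snippet is cut off in the middle of the lineplot call; a plausible completion (an assumption, not the original code) puts the cumulative explained variance on the twin axis:

# Assumed completion of the truncated call above: cumulative explained
# variance on the secondary y-axis.
h = sns.lineplot(x=np.arange(1, 15),
                 y=np.cumsum(decomp.explained_variance_ratio_),
                 ax=ax2)
ax.set_xlabel('Component')
ax.set_ylabel('Explained variance ratio')
ax2.set_ylabel('Cumulative explained variance')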
def impute(df, method, verbose=False):
    """
    Impute missing data using specified imputation method.

    Parameters
    ----------
    df: pd.DataFrame
        Stat DataFrame with source columns and player/team multi-index.
    method: str/bool
        Imputation method for missing data.
            - False: Do not impute missing data.
            - None: Do not impute missing data.
            - 'BiScaler'
            - 'IterativeImpute'
            - 'IterativeSVD'
            - 'KNN': Impute with nearest neighbors.
            - 'MatrixFactorization'
            - 'Mean': Impute missing with average of other sources.
            - 'Median': Impute missing with median of other sources.
            - 'NuclearNorm'
            - 'SoftImpute'
    verbose: bool, default=False
        If True, print debugging information.

    Returns
    -------
    df: pd.DataFrame
        Imputed DataFrame with no NaNs.
    """
    warnings.filterwarnings('ignore', category=RuntimeWarning)

    # Subset DataFrame to include only projection columns.
    ignored_cols = ['Player', 'Team', 'Pos', 'Week', 'STATS']
    impute_cols = [col for col in list(df) if col not in ignored_cols]
    X = df[impute_cols].copy().T

    # Impute DataFrame.
    v = verbose
    if method in [None, False]:
        imputed_vals = X.values
    elif np.sum(np.sum(X.isnull())) == 0:
        # No missing values.
        imputed_vals = X.values
    elif method == 'BiScaler':
        imputed_vals = fi.BiScaler(verbose=v).fit_transform(X)
    elif method == 'IterativeImpute':
        imputed_vals = fi.IterativeImputer(verbose=v).fit_transform(X)
    elif method == 'IterativeSVD':
        imputed_vals = fi.IterativeSVD(verbose=v).fit_transform(X)
    elif method == 'KNN':
        imputed_vals = fi.KNN(k=3, verbose=v).fit_transform(X)
    elif method == 'MatrixFactorization':
        imputed_vals = fi.MatrixFactorization(verbose=v).fit_transform(X)
    elif method == 'Mean':
        imputed_vals = fi.SimpleFill('mean').fit_transform(X)
    elif method == 'Median':
        imputed_vals = fi.SimpleFill('median').fit_transform(X)
    elif method == 'NuclearNorm':
        imputed_vals = fi.NuclearNormMinimization(verbose=v).fit_transform(X)
    elif method == 'SoftImpute':
        imputed_vals = fi.SoftImpute(verbose=v).fit_transform(X)
    else:
        raise ValueError(f"Unknown imputation method: '{method}'")

    # Recombine ignored columns with imputed data.
    imputed_df = pd.DataFrame(imputed_vals.T, columns=X.index)
    for col in impute_cols:
        if len(imputed_df[col]) != len(df[col]):
            print(f'df: {len(df[col])}\nimp: {len(imputed_df[col])}')
        df[col] = imputed_df[col].values

    return df
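A usage sketch for the function above; the source columns 'ESPN' and 'Yahoo' and the player rows are made-up illustrations, not names from the source.

# Hypothetical example: two projection sources with one missing value.
import numpy as np
import pandas as pd

proj = pd.DataFrame({
    'Player': ['A. Smith', 'B. Jones', 'C. Lee'],
    'Team': ['DAL', 'NE', 'KC'],
    'ESPN': [12.3, np.nan, 9.1],   # assumed source column
    'Yahoo': [11.8, 14.2, 8.7],    # assumed source column
})
proj = impute(proj, method='SoftImpute')
print(proj)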
def impute(self):
    return fi.SoftImpute(verbose=False).complete(self.missing_data)
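Note that .complete() is the API of older fancyimpute releases; newer versions expose a scikit-learn style fit_transform() instead. An equivalent method under that API would look like this (a sketch, assuming the same missing_data attribute):

def impute(self):
    # fit_transform() is the newer fancyimpute equivalent of .complete().
    return fi.SoftImpute(verbose=False).fit_transform(self.missing_data)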
#**********************************
# Load Data
#**********************************
datasets = pickle.load(open('../Data/subset_data.pkl', 'rb'))
data = datasets['all_data']
task_data = datasets['task_data']
survey_data = datasets['survey_data']
verbose_lookup = datasets['verbose_lookup']

# ************************************
# ************ Imputation *******************
# ************************************
data.drop(['ptid', 'gender', 'age'], axis=1, inplace=True)
data_complete = fancyimpute.SoftImpute().complete(data)
data_complete = pd.DataFrame(data_complete, index=data.index,
                             columns=data.columns)

# ************************************
# ************ Connectivity Matrix *******************
# ************************************
spearman_connectivity = calc_connectivity_mat(data_complete,
                                              edge_metric='spearman')
distance_connectivity = calc_connectivity_mat(data_complete,
                                              edge_metric='distance')

# ************************************
# ********* Graphs *******************
import numpy as np
import fancyimpute

print("Loading memory")
mem = np.load(
    '/home/aocc/code/DL/MDP_learning/save_memory/first_20mill/BipedalWalker-v2CORRUPT0.1.npy'
)

print("Imputing memory")
memory_final = fancyimpute.SoftImpute().complete(mem)

print("Saving imputed memory")
np.save(
    '/home/aocc/code/DL/MDP_learning/save_memory/first_20mill/BipedalWalker-v2IMPUTED.1',
    memory_final)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import fancyimpute
from sklearn import decomposition
import sys
sys.path.append('../utils')
from utils import get_behav_data
from plot_utils import dendroheatmap

# get dependent variables
DV_df = get_behav_data('Discovery_9-26-16', use_EZ=True)

# ************************************
# ************ Imputation *******************
# ************************************
DV_df_complete = fancyimpute.SoftImpute().complete(DV_df)
DV_df_complete = pd.DataFrame(DV_df_complete, index=DV_df.index,
                              columns=DV_df.columns)

# ************************************
# ************ PCA *******************
# ************************************
pca_data = DV_df_complete.corr()
pca = decomposition.PCA()
pca.fit(pca_data)

# plot explained variance vs. components
plt.plot(pca.explained_variance_ratio_.cumsum())
def softimpute(X, y=None):
    return fancyimpute.SoftImpute(verbose=False).complete(X)
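A sketch of how such a helper might be dropped into a scikit-learn pipeline via FunctionTransformer; the pipeline steps and classifier below are assumptions for illustration, not part of the source.

# Assumed usage: wrap the helper so imputation runs as a pipeline step
# before scaling and classification.
import fancyimpute
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('impute', FunctionTransformer(softimpute, validate=False)),
    ('scale', StandardScaler()),
    ('clf', LogisticRegression()),
])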
def preprocess(trainfile, testfile, outputdir, useless_attr, miss_threshold,
               xstrategy, ymin, ymax, ystrategy, fill_method="MICE",
               normal01=True):
    """Preprocess X and Y: matrix completion, standardization, normalization, etc.

    :param trainfile: string, path to the training set (d_train_20180102.csv)
    :param testfile: string, path to the test set (d_test_A_20180102.csv)
    :param outputdir: string, directory where the preprocessed files are saved
    :param useless_attr: list, indices of useless attributes to drop, e.g. [0, 1, 2, 3]
    :param miss_threshold: float, fraction of missing values above which a column is dropped, e.g. 0.7
    :param xstrategy: string, how to handle outliers in X, one of {"replace", "nothing"}
    :param ymin: float, minimum allowed Y value; anything below it is an outlier
    :param ymax: float, maximum allowed Y value; anything above it is an outlier
    :param ystrategy: string, how to handle outliers in Y, one of {"delete", "replace", "nothing"}
    :param fill_method: string, matrix completion method, one of {"KNN", "SoftI", "MF", "MICE"}
    :param normal01: bool, if True, scale the results to [0, 1]; otherwise leave them unscaled
    :return: list, the processed trainX, trainY, testX
    """
    # 0. Read the training and test sets
    train_XY = convert(trainfile)
    test_X = convert(testfile)
    print("Datasets loaded; starting preprocessing")

    # 1. Drop useless attribute columns
    train_id = train_XY[:, 0:1]
    test_id = test_X[:, 0:1]
    train_XY = np.delete(train_XY, useless_attr, axis=1)
    test_X = np.delete(test_X, useless_attr, axis=1)
    n_test = test_X.shape[0]
    info1 = "1. Dropped useless attributes %s from train_XY and test_X; " \
            "train_XY.shape=%s, test_X.shape=%s" \
            % (str(useless_attr), str(train_XY.shape), str(test_X.shape))
    print(info1)

    # 2. Drop columns with too many missing values
    miss_mask = np.isnan(train_XY)
    n = miss_mask.shape[0]
    column_del = []  # columns to delete
    for i in range(miss_mask.shape[1]):
        miss_n = miss_mask[:, i].sum()
        if miss_n / n >= miss_threshold:
            column_del.append(i)
    train_XY = np.delete(train_XY, column_del, axis=1)
    test_X = np.delete(test_X, column_del, axis=1)
    info2 = "2. Dropped attributes with more than %f%% missing values from " \
            "train_XY and test_X: %s" % (miss_threshold * 100, str(column_del))
    print(info2)

    # 3. Denoise Y using manually chosen thresholds
    train_Y = train_XY[:, -1:]
    upper_mask = train_Y > ymax
    lower_mask = train_Y < ymin
    if ystrategy == "replace":
        train_Y[upper_mask] = ymax
        train_Y[lower_mask] = ymin
    elif ystrategy == "delete":
        index = np.array(np.arange(0, train_Y.shape[0], 1), ndmin=2).T
        chsn_mask = upper_mask | lower_mask
        train_XY = np.delete(train_XY, index[chsn_mask], axis=0)
        train_id = np.delete(train_id, index[chsn_mask], axis=0)
    elif ystrategy == "nothing":
        pass
    else:
        raise ValueError("'ystrategy' should be one of {nothing, replace, delete}")
    train_Y = train_XY[:, -1:]
    print("3. Denoised train_Y (%s); train_XY.shape=%s" % (ystrategy, train_XY.shape))

    # 4. Denoise X, using boxplot statistics as thresholds
    train_X = train_XY[:, :-1]
    all_X = np.concatenate([train_X, test_X], axis=0)
    attr_n = train_XY.shape[1] - 1
    # per-attribute (min, max) thresholds derived from the boxplot
    attr_min_max = np.zeros((attr_n, 2), dtype=np.float64)
    if xstrategy == "nothing":
        pass
    elif xstrategy == "replace":  # replace outliers in X with the threshold values
        for i in range(attr_n):
            # column slice is a view: modifying crt_attr modifies all_X in place
            crt_attr = all_X[:, i:i + 1]
            miss = np.isnan(crt_attr)
            box_dic = plt.boxplot(crt_attr[~miss])
            crt_max = box_dic["caps"][0].get_ydata()[0]
            crt_min = box_dic["caps"][1].get_ydata()[0]
            if crt_max < crt_min:
                crt_min, crt_max = crt_max, crt_min
            attr_min_max[i, 0] = crt_min
            attr_min_max[i, 1] = crt_max
            crt_attr[miss] = 0
            upper_mask = crt_attr > crt_max
            lower_mask = crt_attr < crt_min
            upper_mask &= ~miss
            lower_mask &= ~miss
            crt_attr[upper_mask] = crt_max
            crt_attr[lower_mask] = crt_min
            crt_attr[miss] = np.nan
    else:
        raise ValueError("'xstrategy' should be one of {nothing, replace}")
    print("4. Denoised all X (%s)." % xstrategy)

    # 5. Matrix completion
    completer = None
    if fill_method == "KNN":
        completer = fi.KNN(verbose=False)
    elif fill_method == "SoftI":
        completer = fi.SoftImpute(verbose=False)
    elif fill_method == "MF":
        completer = fi.MatrixFactorization(verbose=False)
    elif fill_method == "MICE":
        completer = fi.MICE(verbose=False)
    else:
        raise ValueError("'fill_method' should be one of {'KNN', 'SoftI', 'MF', 'MICE'}.")
    all_X_complete = completer.complete(all_X)
    print("5. Completed the matrix all_X (%s)." % fill_method)
    # train_X = all_X_complete[:-1000, :]
    # test_X = all_X_complete[-1000:, :]

    # 6. Standardize and scale to [0, 1]
    if normal01:
        X_nmler = StandardScaler()
        X_01 = MinMaxScaler()
        Y_nmler = StandardScaler()
        Y_01 = MinMaxScaler()
        X_nmler.fit(all_X_complete)
        Y_nmler.fit(train_Y)
        all_X_nml = X_nmler.transform(all_X_complete)
        train_Y_nml = Y_nmler.transform(train_Y)
        X_01.fit(all_X_nml)
        Y_01.fit(train_Y_nml)
        all_X_nml01 = X_01.transform(all_X_nml)
        train_Y_nml01 = Y_01.transform(train_Y_nml)
        final_train_X = all_X_nml01[:-n_test, :]
        final_test_X = all_X_nml01[-n_test:, :]
        final_train_Y = np.concatenate([train_Y_nml01, train_Y], axis=1)
    else:
        final_train_X = all_X_complete[:-n_test, :]
        final_test_X = all_X_complete[-n_test:, :]
        final_train_Y = train_Y
    print("6. Standardized all_X and train_Y and scaled to [0, 1] (%s)." % normal01)

    # 7. Save the data
    print("7. Saving data as <set>_<kind>_<date>.csv under %s." % outputdir)
    # timestamp = datetime.now().strftime("%Y%m%d%H%M")
    timestamp = "0000"
    np.savetxt(outputdir + r"\train_X_" + timestamp + ".csv", final_train_X,
               delimiter=",")
    np.savetxt(outputdir + r"\test_X_" + timestamp + ".csv", final_test_X,
               delimiter=",")
    np.savetxt(outputdir + r"\train_Y_" + timestamp + ".csv", final_train_Y,
               delimiter=",")
    np.savetxt(outputdir + r"\train_id_" + timestamp + ".csv",
               train_id.astype(np.int64), delimiter=",")
    np.savetxt(outputdir + r"\test_id_" + timestamp + ".csv",
               test_id.astype(np.int64), delimiter=",")

    return train_X, train_Y, test_X, train_id
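An illustrative call of preprocess(); the output directory, dropped column index, and Y thresholds below are assumptions, only the file names come from the docstring.

# Assumed example call: drop the id column, drop columns >70% missing,
# clip Y to [0, 20], and complete the matrix with SoftImpute.
train_X, train_Y, test_X, train_id = preprocess(
    trainfile="d_train_20180102.csv",
    testfile="d_test_A_20180102.csv",
    outputdir=r"C:\data\preprocessed",
    useless_attr=[0],
    miss_threshold=0.7,
    xstrategy="replace",
    ymin=0.0,
    ymax=20.0,
    ystrategy="replace",
    fill_method="SoftI",
    normal01=True)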
if not os.path.exists(os.path.dirname(datafile)):
    os.mkdir(os.path.dirname(datafile))

behavdata = get_behav_data(dataset)

filter_by_icc = False
if filter_by_icc:
    icc_threshold = 0.25
    icc_boot = pandas.read_csv(
        '../Data/Retest_09-27-2017/bootstrap_merged.csv')
    icc = icc_boot.groupby('dv').mean().icc
    for v in behavdata.columns:
        if icc.loc[v] < icc_threshold:
            del behavdata[v]

behavdata_imputed = fancyimpute.SoftImpute().fit_transform(behavdata.values)

df = pandas.DataFrame(behavdata_imputed, columns=behavdata.columns)
for dropvar in [
        'kirby_mturk.percent_patient', 'kirby_mturk.exp_discount_rate',
        'kirby_mturk.hyp_discount_rate',
        'kirby_mturk.exp_discount_rate_medium',
        'kirby_mturk.exp_discount_rate_small',
        'kirby_mturk.exp_discount_rate_large',
        'probabilistic_selection_mturk.positive_learning_bias',
        'shift_task_mturk.nonperseverative_errors',
        'stop_signal_mturk.omission_errors', 'stop_signal_mturk.SSRT',
        'stroop_mturk.incongruent_errors'
]:
    del df[dropvar]