def impute(self, df):
    """Fill missing values in *df* and return a new DataFrame.

    Chooses the backend from the instance flag: KNN imputation when
    ``self.knn`` is truthy, otherwise an IterativeImputer (MICE-style)
    model. Column labels are preserved.
    """
    # Both backends expose the scikit-learn fit_transform API, so the
    # only branch-dependent part is which solver gets constructed.
    solver = KNN() if self.knn else IterativeImputer()
    return pd.DataFrame(solver.fit_transform(df), columns=df.columns)
def complete(self, data: pd.DataFrame):
    """Return a copy of *data* with missing values imputed via KNN.

    ``self.k`` controls the neighbour count; the original column
    labels are restored on the result.
    """
    working = data.copy()
    # fancyimpute returns a bare ndarray, so remember the labels first.
    original_columns = list(working)
    filled = pd.DataFrame(KNN(k=self.k, verbose=False).fit_transform(working))
    filled.columns = original_columns
    return filled
# NOTE(review): collapsed script — loads the employee-absenteeism workbook,
# fills "Reason for absence" NAs with 20, drops zero-reason rows, imputes with
# fancyimpute KNN, NaNs out IQR outliers per column, re-imputes, then bins BMI.
# `sheetname=` is the pre-0.21 pandas spelling (now `sheet_name=`) and
# `KNN(...).complete(...)` is the pre-0.4 fancyimpute API (now
# `.fit_transform(...)`). `num_columns` is defined outside this view — TODO
# confirm. The trailing `pd.cut(...)` call is truncated here (it continues
# past this view), so the code is left byte-identical.
import pandas as pd import numpy as np data = pd.read_excel( "C:/Users/mudmoham/Documents/pr/case studies/Employee Absenteeism/Absenteeism_at_work_Project.xls", sheetname="Year_Sheet") pd.isnull(data).sum() data["Reason for absence"] = data["Reason for absence"].fillna(20) data.shape data = data[data["Reason for absence"] != 0] from fancyimpute import KNN data = pd.DataFrame(KNN(k=3).complete(data), columns=data.columns) data = data.apply(np.round, axis=1) pd.isnull(data).sum() for col in num_columns: q75, q25 = np.percentile(data[col], [75, 25]) iqr = q75 - q25 maximum = q75 + iqr * 1.5 minimum = q25 - iqr * 1.5 data.loc[data[col] < minimum, col] = np.nan data.loc[data[col] > maximum, col] = np.nan data = pd.DataFrame(KNN(k=3).complete(data), columns=data.columns) data = data.apply(np.round, axis=1) data = data.drop(["Weight", "Height", "Disciplinary failure"], axis=1) data["BMI"] = pd.cut( data["Body mass index"], [0, 18.5, 24.9, 29.9, 40],
# Load run configuration (paths, neighbour count, trial index).
with open(args.config) as f:
    config = json.load(f)
data_path = config["data_path"]  # ground-truth data
corrupt_data_path = config["corrupt_data_path"]  # data containing missing values
n_neighbor = config["n_neighbor"]
trial_ind = config["trial_ind"]

# LOAD DATA
data = pd.read_csv(data_path).values
data_missing = pd.read_csv(corrupt_data_path).values
n_row = data_missing.shape[1]  # dimensionality of data space
non_missing_row_ind = np.where(np.isfinite(np.sum(data_missing, axis=1)))
na_ind = np.where(np.isnan(data_missing))
na_count = len(na_ind[0])

knnImpute = KNN(k=n_neighbor)
print("Start Knn")
# X_impute_KNN = knnImpute.complete(Xdata_Missing)
data_impute_KNN = knnImpute.fit_transform(data_missing)
print("Knn finished")

# Mean absolute error over the originally-missing entries only.
ReconstructionErrorKNN = sum(
    ((data_impute_KNN[na_ind] - data[na_ind])**2)**0.5) / na_count
print('Reconstruction error (KNN):')
print(ReconstructionErrorKNN)
np.savetxt("./imputed_data_trial_" + str(trial_ind) + "_KNN.csv",
           data_impute_KNN, delimiter=",")
def impute_knn(X):
    """Fill each row's missing features using its 3 nearest complete rows.

    Returns the imputed matrix produced by fancyimpute's KNN solver.
    """
    # fancyimpute >= 0.4 removed `.complete()` in favour of the
    # scikit-learn-style `.fit_transform()`.
    return KNN(k=3).fit_transform(X)
# NOTE(review): this chunk opens mid-call — the leading `)` closes a
# label-encoding call (presumably on 'Mode_transport') begun above this view.
# It label-encodes several clinical categoricals, drops 'Name', standardizes
# columns 1:26, imputes them with fancyimpute KNN (k=100), then draws a
# Pearson-correlation heatmap of the 25 features most correlated with
# 'Infect_Prob'. Left byte-identical because the span is incomplete.
dataset['Mode_transport']) dataset['comorbidity'] = labelencoder_X.fit_transform(dataset['comorbidity']) dataset['Pulmonary score'] = labelencoder_X.fit_transform( dataset['Pulmonary score']) dataset['cardiological pressure'] = labelencoder_X.fit_transform( dataset['cardiological pressure']) dataset = dataset.drop(['Name'], axis=1) scaler = StandardScaler() standardized_features = scaler.fit_transform(dataset.iloc[:, 1:26]) #use the above to standardize all columns #standardized_features = scaler.fit_transform(dataset[['Age', 'Coma score','Diuresis', 'Platelets','HBB','d-dimer','Heart rate','HDL cholesterol', 'Charlson Index','Insurance','salary']]) #dataset[['Age', 'Coma score','Diuresis', 'Platelets','HBB','d-dimer','Heart rate','HDL cholesterol', 'Charlson Index','Insurance','salary']]=standardized_features features_knn_imputed = KNN(k=100, verbose=0).fit_transform(standardized_features) dataset.iloc[:, 1:26] = features_knn_imputed correlation = dataset.iloc[:, 1:].corr(method='pearson') columns = correlation.nlargest(25, 'Infect_Prob').index correlation_map = np.corrcoef(dataset[columns].values.T) sns.set(font_scale=1.0) heatmap = sns.heatmap(correlation_map, cbar=True, annot=True, square=True, fmt='.2f', yticklabels=columns.values, xticklabels=columns.values)
import os
# NOTE(review): chdir to a *.csv path (with a '....' prefix) looks wrong —
# presumably this should be the containing data directory. Left as-is
# pending confirmation.
os.chdir('....\\data\\input.csv')  # You DO NOT talk about Fight Club
import pandas as pd
from fancyimpute import KNN
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Only two guys to a fight
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Someone yells stop, goes limp, taps out, the fight is over
train.isnull().sum()
# `.complete()` is the pre-0.4 fancyimpute API; `.fit_transform()` replaces
# it. The solver also returns a bare ndarray, so rebuild DataFrames to keep
# the column labels that the code below indexes by name.
train = pd.DataFrame(KNN(k=3).fit_transform(train), columns=train.columns)
test = pd.DataFrame(KNN(k=3).fit_transform(test), columns=test.columns)

# One fight at a time
# NOTE(review): re-fitting the encoder on test gives labels inconsistent
# with train — kept to preserve the original behaviour; verify upstream.
le = LabelEncoder()
cat = ['genre', 'certificate', 'distributor']
for col in cat:
    train[col] = le.fit_transform(train[col])
    test[col] = le.fit_transform(test[col])

# no shirts, no shoes
train_X = train.drop(['year', 'oscar', 'movie_name', 'actor_name', 'href'],
                     axis=1)
test_X = test.drop(['year', 'oscar', 'movie_name', 'actor_name', 'href'],
                   axis=1)
train_Y = train['oscar']
# 4.11 Filling in missing values
import numpy as np
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

features, _ = make_blobs(n_samples=1000, n_features=2, random_state=1)
scaler = StandardScaler()
# Standardize the features.
standardized_features = scaler.fit_transform(features)
true_value = standardized_features[0, 0]
# Replace the first value of the first feature vector with a missing value.
standardized_features[0, 0] = np.nan

# [1] Use the KNN algorithm to predict the missing value.
features_knn_imputed = KNN(k=5, verbose=0).fit_transform(
    standardized_features)  # predict the missing entries of the matrix
# Compare the true value with the imputed one.
print('True Value:', true_value)
print('Imputed Value:', features_knn_imputed[0, 0])

# [2] Fill with the feature's mean (or median / most-frequent value).
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `sklearn.impute.SimpleImputer` is its replacement (no `axis` argument —
# it always imputes column-wise, matching the old axis=0 behaviour).
from sklearn.impute import SimpleImputer
mean_imputer = SimpleImputer(strategy="mean")
# Impute the matrix that actually contains the NaN so the comparison below
# is meaningful (the original imputed the complete, unstandardized matrix,
# which has no missing values).
features_mean_imputed = mean_imputer.fit_transform(standardized_features)
print('True Value:', true_value)
print('Imputed Value:', features_mean_imputed[0, 0])
m = 20
inner_rank = 4
# Low-rank data matrix: product of two thin Gaussian factors (`n` is
# defined earlier in the script).
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X**2).mean())

# Randomly drop ~10% of the entries; NaN marks a missing value.
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
X_incomplete[missing_mask] = np.nan

meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# Matrix completion using convex optimization to find a low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, induce sparsity
# using singular value thresholding.
softImpute = SoftImpute()

# Simultaneously normalizes the rows and columns of the observed data —
# sometimes useful for low-rank imputation methods.
biscaler = BiScaler()
# rescale both rows and columns to have zero mean and unit variance
#strategy: "mean" or "median" or "most_frequent" train['N30_missing_imputed'] = imp.fit_transform(train['N30'].values.reshape( -1, 1)) imp.fit_transform( train.iloc[:, 1:]) #Removing first column as it is a text variable #Reference: https://pypi.python.org/pypi/fancyimpute/0.0.4 #pip3 install fancyimpute #ONLY NUMERIC VALUES from fancyimpute import NuclearNormMinimization, KNN, MICE solver = NuclearNormMinimization(min_value=0.0, max_value=1.0, error_tolerance=0.0005) X_filled = solver.complete(train['N30'].values.reshape(-1, 1)) X_filled = solver.complete(train) X_filled_knn = KNN(k=3).complete(train) #https://github.com/hammerlab/fancyimpute/blob/master/fancyimpute/mice.py X_filled_mice = MICE().complete(train.as_matrix()) X_filled_mice_df = pd.DataFrame(X_filled_mice) X_filled_mice_df.columns = train.columns X_filled_mice_df.index = train.index #Other methods: SimpleFill, SoftImpute, IterativeSVD, MICE, MatrixFactorization, NuclearNormMinimization, KNN, BiScaler #SimpleFill: uses mean or median; SoftImpute: Matrix completion; ###Smote #Only numeric/boolean and non_null values as input to TSNE model :: BETTER TRY THIS AFTER MISSING VALUE IMPUTATION AND ENCODING from imblearn.over_sampling import SMOTE sm = SMOTE(random_state=42) X_train_new, y_train_new = sm.fit_sample(train.dropna().iloc[:, 1:44], train.dropna()['Dependent_Variable'])
# NOTE(review): collapsed credit-card clustering prep — KNN-imputes the frame,
# saves it, then derives monthly-average / usage-ratio features. The
# missing-value percentage check runs AFTER imputation, so it always reports
# zero — presumably it was meant to run before; verify intent. The trailing
# `def purchase_type(...)` is truncated (its body continues past this view),
# so the code is left byte-identical.
from fancyimpute import KNN import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns #%matplotlib inline # to display all the columns of the dataframe in the notebook pd.pandas.set_option('display.max_columns', None) # load dataset data = pd.read_csv('credit-card-data.csv') data.drop('CUST_ID', axis=1, inplace=True) #Apply KNN imputation algorithm data = pd.DataFrame(KNN(k=3).fit_transform(data), columns=data.columns) #Variables with Missing Value percentage data.apply(lambda x: sum(x.isnull() / len(data)) * 100) data.to_csv('credit_card_knn_imputed.csv', index=False) data['MA_PURCHASE'] = data['PURCHASES'] / data['TENURE'] data['MA_CASH_ADVANCE'] = data['CASH_ADVANCE'] / data['TENURE'] data['LIMIT_USAGE'] = data['BALANCE'] / data['CREDIT_LIMIT'] data['PAY_MINPAY_RATIO'] = data['PAYMENTS'] / data['MINIMUM_PAYMENTS'] #drop purchases,cash_advance,tenure(less variability),Balance,CreditLimit def purchase_type(data): if (data['ONEOFF_PURCHASES'] == 0) & (data['INSTALLMENTS_PURCHASES'] == 0):
# Assemble the technical-indicator columns onto the price frame.
stock = stock.join(upperband)
stock = stock.join(middleband)
stock = stock.join(lowerband)
stock = stock.join(Roc)
stock = stock.join(Atr)
stock = stock.join(rollingrank)
stock = stock.join(rollingrank1)
stock = stock.join(div)
stock = stock.join(voldiff)
stock = stock.join(VolROC)
stock = stock.join(opendiff)

# Impute missing values using KNN.
# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` is the
# long-standing equivalent that works across versions.
stock = stock.values
stock = np.append(stock, arima, 1)
stock = KNN(k=15).fit_transform(stock)
stock = pd.DataFrame(stock)

# Chronological 80/20 train/test split.
stock_train = stock.iloc[0:round(len(stock) * 0.8), :]
stock_test = stock.iloc[round(len(stock) * 0.8):, :]

# Feature Scaling
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
training_set_scaled = sc.fit_transform(stock_train)
sc_predict = MinMaxScaler()
test_set_scaled = sc_predict.fit_transform(stock_test)

X_train = []
# Percentage of Cabin NAs by passenger class:
round(df.loc[df.Cabin.isnull(), :].groupby(["Pclass"]).size()
      / df.groupby(["Pclass"]).size() * 100)
# Percentage of NAs by Parch:
round(df.loc[df.Cabin.isnull(), :].groupby(["Parch"]).size()
      / df.groupby(["Parch"]).size() * 100)

# How could we impute the missing data?
# Instead of using the mean, median or mode to impute the NAs, we could
# use KNN imputation or regression. Here I have chosen to use KNN.

# Create new columns for KNN
num_vars = ['Age', 'Survived', 'Pclass', 'SibSp', 'Parch', 'Fare']
# fancyimpute >= 0.4 removed `.complete()`; `.fit_transform()` is the
# scikit-learn-style replacement.
df_impute = pd.DataFrame(KNN(k=3).fit_transform(df.loc[:, num_vars]))
df_impute.columns = num_vars

# Round the predictions
df_impute.Age = df_impute.Age.round()

# Update df
df_impute.isnull().sum(axis=0)
df = pd.concat([df.drop(num_vars, axis=1), df_impute], axis=1)
def knn_imputation(k):
    """KNN-impute the module-level train/val/test matrices.

    Runs fancyimpute's KNN solver with *k* neighbours on each split
    independently and returns the three imputed matrices.
    """
    # fancyimpute >= 0.4 removed `.complete()`; `.fit_transform()` is the
    # scikit-learn-style replacement.
    X_train_new = KNN(k=k).fit_transform(X_train)
    X_val_new = KNN(k=k).fit_transform(X_val)
    X_test_new = KNN(k=k).fit_transform(X_test)
    return X_train_new, X_val_new, X_test_new
# NOTE(review): collapsed absenteeism chunk — casts 'Pet' to categorical,
# reports ~24% missing values, KNN-imputes (k=3) and rounds, then starts
# IQR-based outlier analysis. The final `for` loop over numericValues shows
# only its first body statement and very likely continues past this view,
# so the code is left byte-identical.
absentData['Pet'] = absentData['Pet'].astype('category') absentData.dtypes #--------------------------------- Missing Value Analysis ---------------------------------# missingVal = pd.DataFrame(absentData.isnull().sum()).sum() missingValPercent = missingVal/len(absentData.index)*100 missingValPercent.round() #Approx 24% values are null in the dataset. So we need to impute them by suitable method. #Missing Value Imputation absentData.isnull().sum() absentData = pd.DataFrame(KNN(k = 3).fit_transform(absentData), columns = absentData.columns) absentData = absentData.round() #------------------------------- Outlier Analysis -----------------------------------------# sns.boxplot(data=absentData[['Absenteeism time in hours','Service time','Height','Weight','Transportation expense','Age']]) fig=plt.gcf() fig.set_size_inches(8,12) sns.boxplot(data=absentData[['Work load Average/day']]) #Computing the benchmark for the numeric values numericValues = ['Work load Average/day','Distance from Residence to Work', 'Service time', 'Age','Transportation expense','Hit target', 'Weight', 'Height', 'Body mass index', 'Absenteeism time in hours'] for i in numericValues: q75, q25 = np.percentile(absentData[i], [75,25])
# NOTE(review): Python 2 code (bare `print` statement) using the pre-0.4
# fancyimpute API (`.complete()`, `MICE`) — modern fancyimpute exposes
# `.fit_transform()` and `IterativeImputer`. Concatenates four station series,
# fills gaps with the selected solver (SoftImpute / KNN / MICE), then reshapes
# back to monthly columns. The final `pd.DataFrame(...)` call for station2 is
# truncated mid-arguments in this view, so the code is left byte-identical.
# Wrangle all data into one dataframe allstations = pd.concat([ station1df["Value"], station2df["Value"], station3df["Value"], station4df["Value"] ], axis=1) allstations.columns = ["station1", "station2", "station3", "station4"] # Run the selected imputation routine to fill all missing cases if method == "SoftImpute": allstations_complete = pd.DataFrame( data=SoftImpute().complete(allstations), columns=allstations.columns, index=allstations.index) elif method == "KNN": allstations_complete = pd.DataFrame(data=KNN().complete(allstations), columns=allstations.columns, index=allstations.index) elif method == "MICE": allstations_complete = pd.DataFrame(data=MICE().complete(allstations), columns=allstations.columns, index=allstations.index) else: print "Sorry, the imputation method %s is not available, try MICE, KNN, or SoftImpute" % method # Unstack the data to get values back in monthly columns, and then Output the filled datasets with appended prefixes to the input filenames. Round data values to nearest integer if asked. if Round is True: pd.DataFrame(allstations_complete["station1"].values.reshape(-1, n), columns=names).round(0).astype("int32").to_csv( "%sfilled_%s" % (method, station1)) pd.DataFrame(allstations_complete["station2"].values.reshape(-1, n),
# NOTE(review): this chunk starts mid-function — `count += 1` and the
# `return 2 * pre * recall / (pre + recall)` (an F1 score from precision
# `pre` and recall) belong to a helper whose `def` line is above this view,
# so its indentation cannot be reconstructed and the code is left
# byte-identical. The rest loads the diabetes training CSV, KNN-imputes the
# feature columns (k=6), splits train/test, fits LogisticRegression, and
# prints the F1 on both splits (the inline Chinese comments read: 准确率 =
# precision, 召回率 = recall, 取预测值/取特征 = pick target/features,
# 划分训练集、测试集 = split train/test, 模型训练/预测 = fit/predict,
# 训练集/测试集 = training/test set).
count += 1 pre = count * 1.0 / sum(predict) # 准确率 recall = count * 1.0 / sum(train) # 召回率 return 2 * pre * recall / (pre + recall) train_data = pd.read_csv('C:\\Users\\JingYi\\Desktop\\diabetes_prediction\\train_data.csv', encoding='gbk') # 1000,85 filter_feature = ['id','label'] # 取预测值 features = [] for x in train_data.columns: # 取特征 if x not in filter_feature: features.append(x) train_data_x = train_data[features] train_data_x = pd.DataFrame(KNN(k=6).fit_transform(train_data_x), columns=features) train_data_y = train_data['label'] X_train, X_test, y_train, y_test = train_test_split(train_data_x, train_data_y, random_state=1) # 划分训练集、测试集 linreg = LogisticRegression() linreg.fit(X_train, y_train) # 模型训练 y_pred = linreg.predict(X_train) # 模型预测 print ("训练集",countF1(y_train.values, y_pred)) y_pred = linreg.predict(X_test) # 模型预测 print ("测试集",countF1(y_test.values, y_pred))
# Step 2: Let us impute the missing values.
# MisVal = ImputationMissingValues()
# imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_interpolation_dataset = MisVal.impute_interpolate(copy.deepcopy(dataset), 'hr_watch_rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'], imputed_interpolation_dataset['hr_watch_rate'])

X_incomplete = dataset
# print(list(X_incomplete))
# X is the complete data matrix; X_incomplete has the same values as X
# except a subset have been replaced with NaN.

# Use the 6 nearest rows which have a feature to fill in each row's
# missing features. fancyimpute >= 0.4 removed `.complete()`;
# `.fit_transform()` is the scikit-learn-style replacement.
X_filled_knn = KNN(k=6).fit_transform(X_incomplete)
# X_filled_knn = knnimpute.(X_incomplete)

DataViz.plot_imputed_values(dataset, ['original', 'imputed'],
                            'hr_watch_rate', X_filled_knn[:, 0])

# Matrix completion using convex optimization to find a low-rank solution
# that still matches observed values. Slow!
# X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)
#
# Instead of solving the nuclear norm objective directly, induce sparsity
# using singular value thresholding:
# X_filled_softimpute = SoftImpute().complete(X_incomplete_normalized)
#
# print mean squared error for the three imputation methods above
# nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
# Calculate IQR (q75/q25 and the column variable `i` come from the
# enclosing outlier loop defined before this chunk — TODO confirm the
# intended indentation when merging back).
iqr = q75 - q25

# Calculate inner and outer fence
minimum = q25 - (iqr * 1.5)
maximum = q75 + (iqr * 1.5)

# Replace out-of-fence values with NA, per column.
# The original `train[train.iloc[[:,i] < minimum] = np.nan` was a syntax
# error, and the boolean row-indexed assignment on the upper fence would
# have blanked entire rows; outliers should be NaN-ed in column `i` only,
# matching the pattern used elsewhere in this project.
train.loc[train.loc[:, i] < minimum, i] = np.nan
train.loc[train.loc[:, i] > maximum, i] = np.nan

# Calculate missing value counts
missing_val = pd.DataFrame(marketing_train.isnull().sum())

# Impute with KNN.
# fancyimpute >= 0.4 replaced `.complete()` with `.fit_transform()`.
train = pd.DataFrame(KNN(k=3).fit_transform(train), columns=train.columns)

# In[13]:

## Correlation analysis
# Correlation plot
df_corr = train.loc[:, cnames]

# In[14]:

df_corr
# NOTE(review): this chunk is the body of a loop begun above this view (the
# `continue` proves it), so its indentation cannot be reconstructed and the
# code is left byte-identical. It skips already-processed files, loads paired
# Aqua/Terra AOD workbooks, drops the station-name column, indexes by date,
# KNN-imputes along the time axis (k=7; the inline comments note the result
# contains many zeros that should probably be nulls), and computes EWM
# smoothing as a global-time alternative. The Chinese comments describe:
# local-in-time KNN weighting by per-feature mean squared difference, and
# 平滑/常用于股市 = smoothing, commonly used for stock markets.
if input_file_name in saved_list: print("已经完成%s" % input_file_name) continue print("========正在计算%s========" % input_file_name) # 读取 data_Aqua = pd.read_excel(input_file_path_Aqua + input_file_name) data_Terra = pd.read_excel(input_file_path_Terra + input_file_name) # 删除字符串,便于计算 del data_Terra["监测站"] del data_Aqua["监测站"] data_Aqua = data_Aqua.set_index('日期') data_Terra = data_Terra.set_index('日期') # 时间局部:KNN # 最近邻估算,使用两行都具有观测数据的特征的均方差来对样本进行加权。然后用加权的结果进行特征值填充 # 相当于A0D17个点为特征进行近邻,则参数K为时间,即时间上最近的16行按特征的均方差进行加权,即哪个时间点的权重大一些 data_Aqua_KNN = KNN(k=7).fit_transform(data_Aqua) data_Aqua_KNN = pd.DataFrame(data_Aqua_KNN) # 结果中有许多零值,应为空值 data_Terra_KNN = KNN(k=7).fit_transform(data_Terra) data_Terra_KNN = pd.DataFrame(data_Terra_KNN) # 结果中有许多零值,应为空值 # 时间全局: 平滑,常用于股市 data_Aqua_ewm = pd.DataFrame.ewm(self=data_Aqua, com=0.5, ignore_na=True, adjust=True).mean() data_Terra_ewm = pd.DataFrame.ewm(self=data_Terra, com=0.5, ignore_na=True, adjust=True).mean() # 空间局部: IDW
import numpy as np
import pandas as pd
from fancyimpute import KNN

# The sentinel 1e99 marks missing entries in this dataset; convert it to
# NaN so the imputer recognises the gaps.
dataset = pd.read_csv('MissingData2.csv', sep=",", header=None)
dataset = dataset.replace(1e99, np.NaN)
dataset = dataset.values

# fancyimpute >= 0.4 replaced `.complete()` with the scikit-learn-style
# `.fit_transform()`.
df_filled = pd.DataFrame(KNN(3).fit_transform(dataset))
np.savetxt('induriMissingResult2.txt', df_filled, delimiter=',', newline='\n')
# NOTE(review): this chunk begins mid-list — the leading column names are the
# tail of an `imp_columns` (presumably) list literal opened above this view,
# so the span is incomplete and left byte-identical. It extracts the numeric
# combine/college-stat columns, KNN-imputes them (k=5) back into a labelled
# frame, merges the imputed block onto the remaining data, derives
# speed/agility/burst scores, then outer-merges draft-year info onto stats.
'pass_yards_mean', 'pass_yards_max', 'pass_td_mean', 'pass_td_max', 'intcp_mean', 'intcp_max', 'rating_mean', 'rating_max', 'rush_att_mean', 'rush_att_max', 'rush_yards_mean', 'rush_yards_max', 'rush_td_mean', 'rush_td_max', 'rec_mean', 'rec_max', 'rec_yards_mean', 'rec_yards_max', 'rec_td_mean', 'rec_td_max', 'college_points_mean', 'college_points_max', 'avg_diff', 'age'] imp_numeric = all_data[['QB', 'RB', 'TE', 'WR', 'height', 'weight', 'bmi', 'arm_length', 'hand_size', 'front_shoulder', 'back_shoulder', 'wonderlic', 'pass_velocity', 'ten_yard', 'twenty_yard', 'forty_yard', 'bench_press', 'vertical_leap', 'broad_jump', 'shuttle', 'sixty_shuttle', 'three_cone', 'four_square', 'games_mean', 'games_max', 'cmp_mean', 'cmp_max', 'pass_att_mean', 'pass_att_max', 'pass_yards_mean', 'pass_yards_max', 'pass_td_mean', 'pass_td_max', 'intcp_mean', 'intcp_max', 'rating_mean', 'rating_max', 'rush_att_mean', 'rush_att_max', 'rush_yards_mean', 'rush_yards_max', 'rush_td_mean', 'rush_td_max', 'rec_mean', 'rec_max', 'rec_yards_mean', 'rec_yards_max', 'rec_td_mean', 'rec_td_max', 'college_points_mean', 'college_points_max', 'avg_diff', 'age']].values # KNN imputing imp = pd.DataFrame(KNN(k=5).fit_transform(imp_numeric), columns=imp_columns) # add imputed values rest of dataset all_data_imp = all_data.drop(imp_columns, axis=1) master_data = all_data_imp.merge(imp, left_index=True, right_index=True) # new combine variables master_data['speed_score'] = (master_data.weight * 200)/(master_data.forty_yard**4) master_data['agility_score'] = master_data.three_cone + master_data.shuttle master_data['height_adj_ss'] = master_data.speed_score * (master_data.height / 73.5) ** 1.5 master_data['burst_score'] = master_data.vertical_leap + master_data.broad_jump # catch radius and weight adjusted bench ? # merge and drop players without combine data or without any stats stats = stats.merge(df[['player_id', 'draft_year']], on='player_id', how='outer')
# NOTE(review): notebook chunk comparing three imputers on the `adhd` frame
# (defined outside this view — the CSV loaded here into `data` is not the
# frame being imputed; verify upstream): IterativeImputer (MICE), KNN (k=3,
# using the 3 nearest rows that have each feature), and
# NuclearNormMinimization. The trailing `discrete_columns` list literal is
# truncated mid-elements in this view, so the code is left byte-identical.
import sklearn.datasets as SKD data = pd.read_csv('ai_mavan_adhd7.csv', sep=',', index_col=None) # In[ ]: # MICE IMPUTATION mice_impute = IterativeImputer() traindatafill = mice_impute.fit_transform(adhd) # In[ ]: # KNN way to impute adhd_filled_knn = KNN(k=3).fit_transform( adhd ) #use 3 nearest rows which have a feature to fill in each row’s missing features # In[ ]: # NUCLEARNOMMINIMIZATION adhd_filled_nnm = NuclearNormMinimization().fit_transform(adhd) # In[69]: #ENTER COLUMNS LABELS THAT HAVE DISCRETE VARIABLES discrete_columns = [ 'Hamilton', 'gender_male', 'Dob_MONTH_DIGIT', 'Hamilton', 'above_college', 'QuintMat_w', 'QuintSoc_w', 'Mood_drug', 'Pren_income4', 'No_depression', 'Postpartum_depression', 'Materl_anxiety', 'B_HTTLPR_2', 'B_DRD1_hap',
# Split predictors from the target (last column).
features = data.iloc[:, :-1]
labels = data.iloc[:, -1]

#%%
features.count()
(features == 0).astype(int).sum(axis=0)
# Zeros in these clinical measurements are physiologically implausible,
# so treat them as missing values.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                   'BMI']
features[zero_as_missing] = features[zero_as_missing].replace(0, np.nan)
(features == 0).astype(int).sum(axis=0)

#%%
# KNN-impute the NaNs, restoring the labels the solver strips.
features = pd.DataFrame(data=KNN(k=23).fit_transform(features),
                        index=list(range(len(features))),
                        columns=list(data.columns[:-1]))

#%%
# Class-balance bar chart.
plt.bar(x=0, height=(labels == 0).sum(), width=0.5, color='salmon',
        label='Outcome 0')
plt.bar(x=1, height=(labels == 1).sum(), width=0.5, color='cyan',
        label='Outcome 1')
plt.xticks([0, 1])
m = 20
inner_rank = 4
# Low-rank data matrix: product of two thin Gaussian factors (`n` is
# defined earlier in the script).
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X**2).mean())

# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

# fancyimpute >= 0.4 removed the `.complete()` method; its solvers now
# follow the scikit-learn API and expose `.fit_transform()` instead.
meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()
# rescale both rows and columns to have zero mean and unit variance
# NOTE(review): the original read `rain['Age']` — `rain` is undefined
# anywhere in view; this fills the Age NAs of `train` with the per-sex
# median, matching the frame used on the right-hand side.
train['Age'].fillna(train.groupby('Sex')['Age'].transform("median"),
                    inplace=True)

# Pearson correlation of features
colormap = plt.cm.RdBu
plt.figure(figsize=(32, 10))
plt.title('Pearson correlation of features', y=1.05, size=15)
sns.heatmap(train.corr(), linewidths=0.1, vmax=1.0, square=True,
            cmap=colormap, linecolor='white', annot=True)

# KNN method to replace missing values of emp_length column using nearest rows
from fancyimpute import KNN
# fancyimpute strips column names, so remember them before imputing.
train_cols = list(train)
# fancyimpute >= 0.4 replaced `.complete()` with `.fit_transform()`
# (the MICE call below already used the new API — this makes KNN match).
train = pd.DataFrame(KNN(k=5).fit_transform(train))
train.columns = train_cols

# MICE method uses Bayesian ridge regression, avoids biases
from fancyimpute import IterativeImputer as MICE
# use MICE to fill missing rows
# NOTE(review): the columns are taken from `loans`, which is not defined in
# this view — presumably this should be `train`; verify upstream.
train_cols = list(loans)
train = pd.DataFrame(MICE(verbose=False).fit_transform(train))
train.columns = train_cols

# Linear regression method
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
# NOTE(review): this chunk begins mid-call (`axis=1)` closes a statement begun
# above this view), so the span is incomplete and left byte-identical. It is
# Python 2 code (bare `print` statement) using APIs since removed:
# `DataFrame.convert_objects` (pandas < 0.21; now `pd.to_numeric`), the
# pre-0.4 fancyimpute import set (MICE) and `.complete()` (now
# `IterativeImputer` / `.fit_transform()`). It KNN-imputes the numeric
# features separately within each gene group, reattaches the categorical
# columns, and one-hot encodes them with category_encoders.
axis=1) numeric_feats = caddn + caddn0 + caddabs + defn + defn0 + ['TrimerAvg'] cat_feats = caddc + defc + ['Trimer', 'TrimerMut'] df[numeric_feats] = df[numeric_feats].convert_objects(convert_numeric=True) from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, MICE import random d = df[numeric_feats + ['gene']] newd = pd.DataFrame() for i in df['gene'].unique().tolist(): print "Doing gene ", i dtemp = d[d['gene'] == i][numeric_feats] tempx = pd.DataFrame(data=KNN(k=3).complete(dtemp), columns=dtemp.columns, index=dtemp.index) newd = pd.concat([newd, tempx]) X = newd #X=pd.DataFrame(data=Ximp, columns=df[numeric_feats].columns, index=df[numeric_feats].index) X[cat_feats + ['gene']] = df[cat_feats + ['gene']] #X.to_csv('ImputedImcomplete.tsv',sep="\t",index=False) #X=df[numeric_feats+cat_feats] import category_encoders as ce encoder = ce.OneHotEncoder(cols=cat_feats + ['gene']) X = encoder.fit_transform(X)
# Fill remaining missing entries in the predictor matrix.
previsores[:, :] = imputer.fit_transform(previsores[:, :])

# Convert the categorical/nominal columns to numeric codes.
from sklearn.preprocessing import LabelEncoder
# Columns 4, 8 and 9 are already numeric, so they are skipped here.
for categorical_col in (0, 1, 2, 3, 5, 6, 7):
    previsores[:, categorical_col] = LabelEncoder().fit_transform(
        previsores[:, categorical_col].astype('str'))

# fancyimpute provides algorithms for handling missing values in a dataset.
from fancyimpute import KNN
# Use the 5 nearest rows that have each feature to fill a row's gaps.
previsores = KNN(k=5).fit_transform(previsores)

# Wrap the array in a DataFrame to inspect the pre-processing result.
result = pd.DataFrame(previsores)
# Column 10 is the attribute to be predicted.
classe = result.iloc[:, 10].values
# Drop that same target column from the predictor frame,
result = result.drop(columns=10)
# and drop Skin Color (column 6), which has too many missing values.
result = result.drop(columns=6)
# Back to a plain predictor array.
previsores = result.iloc[:, :].values
# (the script continues by casting all arrays to int)
# NOTE(review): evaluates KNN imputation on the remainder component of an
# MSTL+ decomposition — loads the test matrices and trend/seasonal/remainder
# parts, KNN-imputes the remainder (k=10), recombines, and accumulates
# MSE/MAE/MRE over the masked (artificially removed) entries. Note
# `mae_error = mre_error = abs(...)` binds both names to the SAME array. The
# final `total_label_MRE = abs(` statement is truncated mid-expression in
# this view, so the code is left byte-identical.
test_data_statiton = pd.read_csv('data/test/test_normal.csv') test_data_null = pd.read_csv('data/test/test_null.csv') test_mask = pd.read_csv('data/test/test_mask.csv') trend_matrix = pd.read_csv('data/test/mstlplus_trend.csv', index_col=0) seasonal12_matrix = pd.read_csv('data/test/mstlplus_seasonal12.csv', index_col=0) seasonal84_matrix = pd.read_csv('data/test/mstlplus_seasonal84.csv', index_col=0) remainder_matrix = pd.read_csv('data/test/mstlplus_remainder.csv', index_col=0) validate_null_number = test_data_null.isna().sum().sum( ) - test_data_statiton.isna().sum().sum() k_number = 10 remainder_knn = pd.DataFrame(KNN(k=k_number).fit_transform(remainder_matrix)) data_mstlplus_knn = remainder_knn.to_numpy( ) + trend_matrix + seasonal12_matrix + seasonal84_matrix error_mask = (test_data_statiton.fillna(0).to_numpy() - data_mstlplus_knn) * (1 - test_mask).to_numpy() mse_error = error_mask**2 mae_error = mre_error = abs(error_mask) total_error_MSE = mse_error.sum().sum() total_error_MAE = mae_error.sum().sum() total_error_MRE = mre_error.sum().sum() total_label_MRE = abs(
# Earlier approach, kept for reference: RandomForestRegression to predict Age.
#RFR = RandomForestRegressor(n_estimators=80, n_jobs=-1)
#RFR1 = RandomForestRegressor(n_estimators=80, n_jobs=-1)
#RFR.fit(X,Y)
#RFR1.fit(x,y)
#predictAges = RFR.predict(age_df_isnull.values[:,1:])
#predictAges1 = RFR1.predict(age_df1_isnull.values[:,1:])
#train.loc[train['Age'].isnull(), ['Age']]= predictAges
#test.loc[test['Age'].isnull(), ['Age']]= predictAges1
#print(test.info())
#age_bins = [0,1,4,13,18,35,45,55,65,180]
#train['age_group'] = pd.cut(train['Age'],age_bins)
#train['age_group']
#train['Age']

from fancyimpute import KNN
# fancyimpute >= 0.4 removed `.complete()`; `.fit_transform()` is the
# scikit-learn-style replacement. (Also spell k= explicitly for train, to
# match the test call.) Rebuild DataFrames to keep the column labels.
age_train = KNN(k=10).fit_transform(train)
train = pd.DataFrame(age_train, columns=train.columns)
#printtrain['Age']
age_test = KNN(k=10).fit_transform(test)
test = pd.DataFrame(age_test, columns=test.columns)

# Bucket ages into life-stage groups, then label-encode the buckets.
age_bins = [0, 1, 4, 13, 18, 35, 45, 55, 65, 180]
train['age_group'] = pd.cut(train['Age'], age_bins)
label = LabelEncoder()
train['age_group'] = label.fit_transform(train['age_group'])
test['age_group'] = pd.cut(test['Age'], age_bins)
test['age_group'] = label.fit_transform(test['age_group'])
train.head()
m = 20
inner_rank = 4
# Low-rank data matrix: product of two thin Gaussian factors (`n` is
# defined earlier in the script).
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X ** 2).mean())

# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

# fancyimpute >= 0.4 removed the `.complete()` method; its solvers now
# follow the scikit-learn API and expose `.fit_transform()` instead.
meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()
# rescale both rows and columns to have zero mean and unit variance
def KNNtrans(d):
    """Return *d* with missing values imputed by 20-nearest-neighbour KNN.

    The solver strips the column labels, so they are restored on the
    returned DataFrame.
    """
    imputed = KNN(k=20).fit_transform(d)
    return pd.DataFrame(imputed, columns=d.columns)