def impute_missing_values(self, data):
    """
    Method Name: impute_missing_values
    Description: Replace every missing value (np.nan) in the DataFrame using
                 a KNN imputer (3 neighbours, uniform weights).
    Output: A DataFrame with the same columns and the same index as `data`
            in which all missing values have been imputed.
    On Failure: Raise Exception (chained to the original error, carrying
                its message).

    Side effects: stores the input on ``self.data``, the imputed ndarray on
    ``self.new_array`` and the returned DataFrame on ``self.new_data``; logs
    entry, success and failure through ``self.logger_object``.
    """
    self.logger_object.log(self.file_object, 'Entered the impute_missing_values method of the Preprocessor class')
    self.data = data
    try:
        imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
        self.new_array = imputer.fit_transform(self.data)  # impute the missing values
        # fit_transform returns a bare ndarray; rebuild the DataFrame with the
        # original column labels AND the original index so the rows stay
        # aligned with anything else keyed on that index.
        self.new_data = pd.DataFrame(
            data=self.new_array,
            columns=self.data.columns,
            index=self.data.index,
        )
        self.logger_object.log(self.file_object, 'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
        return self.new_data
    except Exception as e:
        self.logger_object.log(self.file_object, 'Exception occured in impute_missing_values method of the Preprocessor class. Exception message: ' + str(e))
        self.logger_object.log(self.file_object, 'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
        # Keep raising plain Exception for existing callers, but preserve the
        # message and the original traceback instead of discarding them.
        raise Exception('Imputing missing values failed: ' + str(e)) from e
def impute_feature(data, feature):
    """Impute one feature column across counties with a 5-neighbour KNN imputer.

    Steps, as performed on ``data`` in place:

    1. Negative values of ``feature`` are treated as missing and set to NaN.
    2. The column is laid out as a matrix with one row per unique
       ``county_fips`` and one column per unique ``date``.
    3. The matrix is imputed with ``KNNImputer(n_neighbors=5)`` and the
       values are written back date by date.
    4. Counties that had no non-null value of ``feature`` at all are reset
       to NaN afterwards.

    NOTE(review): the reshape in step 2 assumes every date has exactly one
    row per county and that counties appear in the same order for each
    date -- confirm against the caller.

    :param data: DataFrame with ``county_fips``, ``date`` and ``feature``
        columns; modified in place.
    :param feature: name of the column to impute.
    :return: the same ``data`` object.
    """
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    data.loc[data[feature] < 0, feature] = np.nan

    # Non-null count per county; zero means the county has nothing to learn from.
    value_count = data.groupby('county_fips').count()
    counties_with_all_nulls = value_count[value_count[feature] == 0]

    # Computed once; the date column is not modified below.
    dates = data['date'].unique()

    temp = pd.DataFrame(index=data['county_fips'].unique().tolist(),
                        columns=dates.tolist())
    for i in dates:
        temp[i] = data.loc[data['date'] == i, feature].tolist()

    X = np.array(temp)
    imputer = KNNImputer(n_neighbors=5)
    imp = pd.DataFrame(imputer.fit_transform(X),
                       index=temp.index, columns=temp.columns)

    for i in dates:
        data.loc[data['date'] == i, feature] = imp[i].tolist()

    # Undo the imputation for counties that had no observed value at all.
    if len(counties_with_all_nulls) > 0:
        data.loc[data['county_fips'].isin(counties_with_all_nulls.index), feature] = np.nan
    return data
def fill_data(train_data, test_data):
    """Impute train and test data with a KNN imputer fitted on train only.

    The imputer uses 3 neighbours weighted by distance. It is fitted on
    ``train_data`` and then applied to both inputs.

    :param train_data: data used to fit the imputer and to be imputed.
    :param test_data: data imputed with the train-fitted imputer.
    :return: tuple ``(train, test)`` of the imputed outputs.
    """
    fitted = KNNImputer(n_neighbors=3, weights='distance').fit(train_data)
    imputed_train = fitted.transform(train_data)
    imputed_test = fitted.transform(test_data)
    return imputed_train, imputed_test
def knn_impute(bigarray):
    """Fill incomplete rows of ``bigarray`` using KNN against sample rows.

    The rows returned by ``get_sample_rows(bigarray)`` are stacked and
    imputed first so that they are complete. Every row for which
    ``rowprop(row, 0) == False`` is then stacked on top of those completed
    samples, imputed with a distance-weighted 5-neighbour KNN imputer, and
    written back into ``bigarray``. Start/finish timestamps and a progress
    line every 50000 rows are printed.

    :param bigarray: indexable collection of rows; modified in place.
    :return: the same ``bigarray`` object.
    """
    print('*STARTING IMPUTING*')
    print(datetime.datetime.now())

    # Complete the sample rows themselves before using them as neighbours.
    sample_rows = get_sample_rows(bigarray)
    reference = KNNImputer(n_neighbors=5, weights='distance').fit_transform(
        np.vstack(sample_rows[:])
    )

    for idx, row in enumerate(bigarray):
        if idx % 50000 == 0:
            print(f"Imputing row - {idx}")

        # Equality test kept as in the original; rowprop's return type is
        # defined elsewhere.
        if rowprop(row, 0) == False:  # noqa: E712
            # Current row goes first so it can be read back at position 0.
            stacked = np.vstack((row, reference))
            row_imputer = KNNImputer(n_neighbors=5, weights='distance')
            bigarray[idx] = row_imputer.fit_transform(stacked)[0]

    print('*FINISHED IMPUTING*')
    print(datetime.datetime.now())
    return bigarray
def knn_impute_by_item(matrix, valid_data, k):
    """Impute missing entries by item-based KNN and score on validation data.

    The matrix is transposed before imputation and transposed back
    afterwards, so neighbours are found among the columns of ``matrix``
    (presumably the questions -- verify against the caller).

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list, question_id: list,
        is_correct: list}
    :param k: int, number of neighbours
    :return: float, accuracy reported by ``sparse_matrix_evaluate``
    """
    # KNNImputer's default metric is the NaN-aware Euclidean distance.
    item_imputer = KNNImputer(n_neighbors=k)
    by_item = matrix.T
    completed = item_imputer.fit_transform(by_item).T
    return sparse_matrix_evaluate(valid_data, completed)
def knn_impute_by_item(matrix, valid_data, k):
    """Impute missing entries by item-based KNN, print and return accuracy.

    The matrix is transposed before imputation and the result transposed
    back before evaluation, so neighbours are found among the columns of
    ``matrix`` (presumably the questions -- verify against the caller).

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list, question_id: list,
        is_correct: list}
    :param k: int, number of neighbours
    :return: float, accuracy reported by ``sparse_matrix_evaluate``
    """
    item_imputer = KNNImputer(n_neighbors=k)
    imputed_by_item = item_imputer.fit_transform(matrix.transpose())
    accuracy = sparse_matrix_evaluate(valid_data, imputed_by_item.transpose())
    print("Validation Accuracy Item_based with k = {} : {}".format(k, accuracy))
    return accuracy
def imputer(df, numerical, binary):
    """Impute binary and numerical columns of a copy of ``df``.

    Binary columns are filled with the most frequent value
    (``SimpleImputer``); numerical columns are filled with a default
    ``KNNImputer``. Both imputers are fitted on the data they transform.

    :param df: input DataFrame; it is copied, not modified.
    :param numerical: column label(s) selecting the numerical features.
    :param binary: column label(s) selecting the binary features.
    :return: tuple ``(imputed_df, fitted_simple_imputer, fitted_knn_imputer)``.
    """
    result = df.copy()

    # Pull both value blocks out before anything is written back.
    numeric_values = result[numerical].values
    binary_values = result[binary].values

    # Binary features: most frequent value.
    mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    mode_imputer = mode_imputer.fit(binary_values)
    binary_filled = mode_imputer.transform(binary_values)

    # Numerical features: KNN with default settings.
    knn_fitted = KNNImputer().fit(numeric_values)
    numeric_filled = knn_fitted.transform(numeric_values)

    # Write the arrays back under the original column labels and index.
    result[binary] = binary_filled
    result[numerical] = numeric_filled
    return result, mode_imputer, knn_fitted
def knn_missings(df, n_ngb=3):
    """
    Impute NaN values in selected columns of a dataframe with a KNN imputer.

    The columns are chosen by ``num_columns`` (presumably the numeric
    columns -- see that helper). The imputer is fitted on those columns of
    ``df`` and the imputed values are written into a copy, which is
    returned; ``df`` itself is not assigned to here.

    Params:
    df = dataframe.
    n_ngb = number of neighbors of KNN, by default 3.
    """
    imputed = df.copy()
    target_cols = num_columns(imputed)
    knn_fitted = KNNImputer(n_neighbors=n_ngb).fit(df[target_cols])
    imputed[target_cols] = knn_fitted.transform(imputed[target_cols])
    return imputed
def impute_last_new_job(df, cat_var):
    """KNN-impute ``last_new_job`` and replace it with one-hot bin columns.

    ``'never'`` is mapped to 0 and ``'>4'`` to 5. The frame without the
    ``cat_var`` columns is then imputed with a default ``KNNImputer``. The
    imputed ``last_new_job`` is cut into six right-closed bins with edges
    -1, 0, ..., 5 and one-hot encoded.

    NOTE: the recoding of ``last_new_job`` is assigned onto the ``df``
    object passed in, so the caller's frame is modified.

    :param df: DataFrame containing ``last_new_job``.
    :param cat_var: column labels dropped before KNN imputation.
    :return: ``df`` without ``last_new_job``, with the ``lnj_*`` dummy
        columns appended.
    """
    df['last_new_job'] = (
        df['last_new_job'].replace(['never'], 0).replace(['>4'], 5)
    )

    knn_input = df.drop(cat_var, axis=1)
    filled = pd.DataFrame(
        KNNImputer().fit_transform(knn_input),
        index=knn_input.index,
        columns=knn_input.columns,
    )

    edges = np.linspace(-1, 5, 7)
    bin_names = [
        'lnj_zero', 'lnj_one', 'lnj_two', 'lnj_three', 'lnj_four', 'lnj_five'
    ]
    filled['lnj_bins'] = pd.cut(filled['last_new_job'], bins=edges, labels=bin_names)
    dummies = pd.get_dummies(filled['lnj_bins'])

    without_raw = df.drop(['last_new_job'], axis=1)
    return pd.concat([without_raw, dummies], axis=1)
def get_imputed(from_depth=0, to_depth=2, mode=MODE_MEAN):
    """Build a daily table from the CSV files in ``base_path`` and KNN-impute it.

    For every ``*.csv`` under ``base_path``: rows whose ``depth`` lies in
    ``[from_depth, to_depth]`` are kept, indexed by their parsed ``time``,
    aggregated per calendar day according to ``mode``, and outer-merged
    into one frame. The frame starts with a daily index from
    ``FROM_CUTOFF`` to ``TO_CUTOFF``. The ``value`` column of each file is
    renamed to the file name without ``.csv``. Files with no rows in the
    depth range are skipped. Remaining gaps are filled with a default
    ``KNNImputer``.

    :param from_depth: inclusive lower bound on ``depth``.
    :param to_depth: inclusive upper bound on ``depth``.
    :param mode: one of ``MODE_MAX``, ``MODE_MIN``, ``MODE_MEDIAN``,
        ``MODE_MEAN``; selects the daily aggregation.
    :return: imputed DataFrame with the merged index and columns.
    :raises Exception: with ``mode`` as its argument when ``mode`` is not
        one of the known constants (only reached for a non-empty file).
    """
    out = pd.DataFrame(
        index=pd.DatetimeIndex(pd.date_range(FROM_CUTOFF, TO_CUTOFF)))
    print("OUT:", out)

    daily = pd.Grouper(freq='D')
    for csv_path in base_path.glob('*.csv'):
        print(csv_path)
        with open(csv_path, 'r') as f:
            df = pd.read_csv(f)

        df = df[(df.depth >= from_depth) & (df.depth <= to_depth)]
        df.index = pd.to_datetime(df['time'])
        df = df.drop(columns=['depth'])
        df = df.drop(columns=['time'])

        if df.empty:
            continue
        elif mode == MODE_MAX:
            df = df.groupby(daily).max()
        elif mode == MODE_MIN:
            # Fixed: this branch previously called .max(), so MODE_MIN
            # silently returned daily maxima.
            df = df.groupby(daily).min()
        elif mode == MODE_MEDIAN:
            df = df.groupby(daily).median()
        elif mode == MODE_MEAN:
            df = df.groupby(daily).mean()
        else:
            raise Exception(mode)

        df = df.rename(columns={'value': csv_path.name.replace('.csv', '')})
        out = pd.merge(out, df, left_index=True, right_index=True, how='outer')

    print(out)
    imputer = KNNImputer()
    out = pd.DataFrame(imputer.fit_transform(out),
                       columns=out.columns, index=out.index)
    return out
def knn(data_mat, n_neighbors=5, weights='uniform', metric='nan_euclidean',
        copy=True, add_indicator=False):
    """
    Impute missing values with scikit-learn's KNNImputer.

    The input is copied before being handed to the imputer.

    @param data_mat: numpy 2d array, missing values are represented by np.nan
    @param n_neighbors: number of neighbors
    @param weights: forwarded to KNNImputer
    @param metric: forwarded to KNNImputer
    @param copy: forwarded to KNNImputer
    @param add_indicator: forwarded to KNNImputer
    @return: numpy 2d array after imputed
    """
    # Original author's note: passed testing.
    working = data_mat.copy()

    from sklearn.impute import KNNImputer

    knn_imputer = KNNImputer(
        n_neighbors=n_neighbors,
        weights=weights,
        metric=metric,
        copy=copy,
        add_indicator=add_indicator,
    )
    return knn_imputer.fit_transform(working)
def impute_by_age(train_df, test_df):
    """
    Function that perform missing data imputation on both train and test
    stratified by interview period.

    Periods as described by the original author (in months):
    P1: [0; 30]
    P2: (30; 72]
    P3: (72; 156]
    P4: (156; 204]
    P5: >204

    For every distinct ``interview_period`` in ``train_df`` (in sorted
    order) a KNN imputer with ``ut.neighbors`` neighbours is fitted on the
    train rows of that period and applied to the test rows of the same
    period. Columns whose name matches
    ``subjectkey|interview|respon|relation`` are left untouched.

    Test rows whose period never occurs in ``train_df`` are not part of the
    returned test frame, because only train periods are iterated.

    Parameters
    ----------
    train_df: dataframe
    test_df: dataframe

    Returns
    ------
    imputed dataframe train
    imputed dataframe test
    """
    knnimpute = KNNImputer(n_neighbors=ut.neighbors)
    col_n = [
        nc for nc in train_df.columns
        if not re.search('subjectkey|interview|respon|relation', nc)
    ]

    train_parts, test_parts = [], []
    for yr in sorted(train_df.interview_period.unique()):
        tmp_tr = train_df.loc[train_df.interview_period == yr].copy()
        tmp_ts = test_df.loc[test_df.interview_period == yr].copy()

        tmp_tr[col_n] = knnimpute.fit_transform(tmp_tr[col_n])
        train_parts.append(tmp_tr)

        # A period present in train may be absent from test. scikit-learn
        # rejects zero-row input to transform(), so skip instead of failing.
        if tmp_ts.empty:
            continue
        tmp_ts[col_n] = knnimpute.transform(tmp_ts[col_n])
        test_parts.append(tmp_ts)

    new_tr = pd.concat(train_parts)
    # pd.concat raises on an empty list; fall back to an empty frame that
    # keeps test_df's columns when no test rows matched any train period.
    new_ts = pd.concat(test_parts) if test_parts else test_df.iloc[0:0].copy()
    return new_tr, new_ts