def impute_feature(data, feature):
    """Impute one feature with a KNN imputer over a county-by-date matrix.

    Negative values of ``feature`` are treated as missing. The feature is
    laid out as a matrix with one row per county and one column per date,
    imputed with ``KNNImputer`` (5 neighbours) and written back into
    ``data``. Counties that had no observed value at all are reset to NaN
    afterwards.

    NOTE(review): the reshaping relies on every date holding exactly one row
    per county, in the same order as ``data['county_fips'].unique()`` --
    confirm against the caller.

    Args:
        data: DataFrame with ``county_fips``, ``date`` and ``feature``
            columns. It is modified in place.
        feature: Name of the column to impute.

    Returns:
        The same ``data`` object, with ``feature`` imputed.
    """
    # np.nan rather than the np.NaN alias, which was removed in NumPy 2.0.
    data.loc[data[feature] < 0, feature] = np.nan

    # Counties whose non-null count for the feature is zero cannot be
    # imputed meaningfully; remember them so they can be blanked again.
    value_count = data.groupby('county_fips').count()
    counties_with_all_nulls = value_count[value_count[feature] == 0]

    dates = data['date'].unique()
    temp = pd.DataFrame(index=data['county_fips'].unique().tolist(),
                        columns=dates.tolist())
    for date in dates:
        temp[date] = data.loc[data['date'] == date, feature].tolist()

    imputer = KNNImputer(n_neighbors=5)
    imp = pd.DataFrame(imputer.fit_transform(np.array(temp)))
    imp.columns = temp.columns
    imp.index = temp.index

    for date in dates:
        data.loc[data['date'] == date, feature] = imp[date].tolist()

    if len(counties_with_all_nulls) > 0:
        all_null_rows = data['county_fips'].isin(counties_with_all_nulls.index)
        data.loc[all_null_rows, feature] = np.nan
    return data
def impute_missing_values(self, data):
    """
    Method Name: impute_missing_values
    Description: Replaces all the missing values in the DataFrame using a
                 KNN imputer (3 neighbours, uniform weights).
    Output: A DataFrame which has all the missing values imputed. It is
            rebuilt from the imputed array with the original column names
            (the original index is not carried over).
    On Failure: Logs the error and raises Exception carrying the original
                message, chained to the original exception.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """
    self.logger_object.log(
        self.file_object,
        'Entered the impute_missing_values method of the Preprocessor class')
    self.data = data
    try:
        imputer = KNNImputer(n_neighbors=3, weights='uniform',
                             missing_values=np.nan)
        # Impute the missing values; the result is a plain nd-array.
        self.new_array = imputer.fit_transform(self.data)
        # Convert the nd-array back to a DataFrame.
        self.new_data = pd.DataFrame(data=self.new_array,
                                     columns=self.data.columns)
        self.logger_object.log(
            self.file_object,
            'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
        return self.new_data
    except Exception as e:
        self.logger_object.log(
            self.file_object,
            'Exception occured in impute_missing_values method of the Preprocessor class. Exception message: ' + str(e))
        self.logger_object.log(
            self.file_object,
            'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
        # Keep the Exception type callers expect, but preserve the message
        # and the original cause instead of raising an empty exception.
        raise Exception(str(e)) from e
def impute_last_new_job(df, cat_var):
    """Impute ``last_new_job`` with KNN and replace it by one-hot bins.

    The labels ``'never'`` and ``'>4'`` are mapped to 0 and 5, the columns
    in ``cat_var`` are excluded, and the remaining columns are imputed with
    a default ``KNNImputer``. The imputed ``last_new_job`` is cut into six
    right-closed bins over (-1, 5] and one-hot encoded.

    Args:
        df: Input DataFrame containing ``last_new_job``. It is not modified;
            the function works on a copy.
        cat_var: Column label(s) to exclude from the KNN imputation.

    Returns:
        A new DataFrame: ``df`` without ``last_new_job`` plus the six
        ``lnj_*`` indicator columns. Only ``last_new_job`` is taken from the
        imputed data; other columns keep their original values.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    df = df.copy()
    df['last_new_job'] = df['last_new_job'].replace(['never'], 0)
    df['last_new_job'] = df['last_new_job'].replace(['>4'], 5)

    numeric_part = df.drop(cat_var, axis=1)
    imputed = pd.DataFrame(KNNImputer().fit_transform(numeric_part),
                           index=numeric_part.index,
                           columns=numeric_part.columns)

    # Edges -1, 0, 1, ..., 5: a fractional imputed value falls into the bin
    # of the next integer above it.
    bins = np.linspace(-1, 5, 7)
    labels = [
        'lnj_zero', 'lnj_one', 'lnj_two', 'lnj_three', 'lnj_four', 'lnj_five'
    ]
    imputed['lnj_bins'] = pd.cut(imputed['last_new_job'], bins=bins,
                                 labels=labels)
    dummies = pd.get_dummies(imputed['lnj_bins'])

    df = df.drop(['last_new_job'], axis=1)
    return pd.concat([df, dummies], axis=1)
def impute_df(df):
    """Fill numeric gaps by mean/median, then KNN-impute the whole frame.

    Non-object columns are filled in place on ``df``: with the mean when the
    column skew lies strictly between -1 and 1, otherwise with the median.
    Object columns are passed through ``encode``; the second element of its
    result is kept and later used to ``inverse_transform`` the imputed,
    rounded values.

    NOTE(review): ``encode`` is defined elsewhere; this code presumably
    relies on it encoding the column in place and returning the fitted
    encoder as element 1 -- confirm.

    Returns:
        A new DataFrame (default integer index) with the imputed values.
    """
    knn = KNNImputer(n_neighbors=2)
    categorical_cols = list(df.select_dtypes(include=['object']).columns)
    numeric_cols = list(set(df.columns) - set(categorical_cols))

    for name in numeric_cols:
        # Roughly symmetric columns get the mean, skewed ones the median.
        if -1 < df[name].skew() < 1:
            fill_value = df[name].mean()
        else:
            fill_value = df[name].median()
        df[name] = df[name].fillna(fill_value)

    # One encoder per object column, kept for the inverse transform below.
    encoders_store = {
        name: encode(df[name])[1] for name in categorical_cols
    }

    rounded = np.round(knn.fit_transform(df))
    imputed_data = pd.DataFrame(rounded, columns=df.columns)

    for name in categorical_cols:
        as_column = np.array(imputed_data[name]).reshape(-1, 1)
        imputed_data[name] = encoders_store[name].inverse_transform(as_column)
    return imputed_data
def get_estimator():
    """Build the preprocessing + random-forest regression pipeline.

    The listed feature columns are KNN-imputed (3 neighbours, distance
    weights) and standardised; every other column is dropped. The final
    step is a ``RandomForestRegressor`` registered under the step name
    ``'classifier'``.

    Returns:
        An unfitted ``Pipeline``.
    """
    feature_columns = [
        'P_MHD', 'DAUD', 'PDD', 'PAD', 'PADHD', 'DMSUD', 'PBD',
        'Current health expenditure',
        'Current health expenditure per capita',
        'Out-of-pocket expenditure', 'Unemployment',
        'School enrollment primary', 'School enrollment secondary',
        'School enrollment tertiary', 'ghs', 'media integrity',
        'military expenditure'
    ]

    knn_imputer = KNNImputer(missing_values=np.nan, n_neighbors=3,
                             weights="distance")
    forest = RandomForestRegressor(n_estimators=10, max_depth=8)

    impute_and_scale = make_pipeline(knn_imputer, StandardScaler())
    preprocessing = ColumnTransformer(
        transformers=[('prep', impute_and_scale, feature_columns)],
        remainder='drop')

    return Pipeline(steps=[('prep', preprocessing), ('classifier', forest)])
def fit_imputed(v, train, valid):
    """Train on ``train`` and evaluate on ``valid`` after scaling/imputation.

    The features ``v`` are standardised and KNN-imputed using statistics
    from the training sample only; the fitted scaler and imputer are then
    applied to the validation sample. The model is an L1-penalised
    ``LogisticRegressionCV`` using the module-level ``inner`` CV splitter
    and ``roc_auc_scorer``.

    Args:
        v: Feature column selection applied to both frames.
        train: Training sample with an outcome column ``'y'``.
        valid: Validation sample with an outcome column ``'y'``.

    Returns:
        dict with the fitted model, the training size, the transformed
        feature matrices, the outcomes, and the predicted labels and
        positive-class probabilities.
    """
    # Training features / outcome
    train_features = train[v]
    y_train = train['y']
    n_train = np.shape(train_features)[0]

    # Fit the scaler and the imputer on the training sample only
    scaler = StandardScaler()
    imputer = KNNImputer()
    X_train = imputer.fit_transform(scaler.fit_transform(train_features))

    # Logistic regression with inner cross-validation over the C grid
    model = LogisticRegressionCV(cv=inner,
                                 penalty='l1',
                                 Cs=10**np.linspace(0.1, -3, 50),
                                 random_state=42,
                                 solver='liblinear',
                                 scoring=roc_auc_scorer)
    clf = model.fit(X_train, y_train)

    # Validation sample, transformed with the already-fitted steps
    valid_features = valid[v]
    y_test = valid['y']
    X_test = imputer.transform(scaler.transform(valid_features))

    results = {
        'clf': clf,
        'n_train': n_train,
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
        'y_pred': clf.predict(X_test),
        'y_prob': clf.predict_proba(X_test)[:, 1],
    }
    return results
def MLP_model_pred(model, df_train, df_test, sentence_vector=False):
    '''
    Predict sentiment scores with an MLP model.

    When ``sentence_vector`` is false, PMI and 1- to 4-gram features are
    engineered and the text columns are dropped; otherwise both frames are
    replaced by their W2V sentence embeddings. The test features are
    KNN-imputed in 20 chunks and passed to ``model.predict``.

    NOTE(review): the imputer is re-fitted on every test chunk rather than
    re-using the fit on the training data; this is the existing behaviour
    and is kept as is.

    Returns:
        pandas Series holding the first element of each prediction row.
    '''
    if not sentence_vector:
        # Feature engineering
        df_train, df_test = PMI(df_train, df_test)
        for gram in [1, 2, 3, 4]:
            df_train, df_test = rf_ngram(df_train, df_test, gram=gram)
        # axis must be passed by keyword: the positional form was removed
        # in pandas 2.0.
        df_train = df_train.drop(
            ['cashtag', 'spans', 'text', 'clean_text', 'base_text', 'source'],
            axis=1)
        df_test = df_test.drop(['clean_text', 'base_text'], axis=1)
    else:
        df_train = W2V_sentence_embedding(df_train)
        df_test = W2V_sentence_embedding(df_test)

    # Split data into dependent and independent variables
    if 'sentiment score' in df_train.columns.tolist():
        X_train = df_train.drop(['sentiment score'], axis=1)
    else:
        X_train = df_train.copy()
    X_test = df_test.copy()

    # Impute missing values
    imputer = KNNImputer(n_neighbors=3)
    X_train = pd.DataFrame(imputer.fit_transform(X_train))

    # Split row positions (not the DataFrame itself, which np.array_split
    # only handles through a deprecated pandas code path) into 20 chunks.
    chunk_positions = np.array_split(np.arange(len(X_test)), 20)
    imputed_chunks = [
        pd.DataFrame(imputer.fit_transform(X_test.iloc[positions]))
        for positions in chunk_positions
    ]
    X_test = pd.concat(imputed_chunks, ignore_index=True)

    # Predict
    y_pred = model.predict(X_test, batch_size=32)
    y_pred = pd.Series(y_pred.tolist()).apply(lambda x: x[0])
    return y_pred
def knn_impute_by_item(matrix, valid_data, k):
    """ Fill in the missing values using k-Nearest Neighbors based on
    question similarity. Return the accuracy on valid_data.

    The matrix is transposed before imputation and transposed back
    afterwards, so neighbours are searched among its columns.

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list,
    question_id: list, is_correct: list}
    :param k: int
    :return: float
    """
    # KNNImputer compares rows with the NaN-Euclidean distance measure.
    item_imputer = KNNImputer(n_neighbors=k)
    items_as_rows = matrix.T
    filled = item_imputer.fit_transform(items_as_rows).T
    return sparse_matrix_evaluate(valid_data, filled)
def experiment_setting_5(X, y, runs=5, missingness=0.1):
    """Accuracy of a C4.5 tree trained on KNN-imputed data.

    For each run, values are removed from ``X`` with
    ``make_missing_random`` and a stratified K-fold split is drawn, both
    seeded with the run number. Within each fold the training part of the
    corrupted data is KNN-imputed and used to fit the tree, which is scored
    on the untouched test part of ``X``.

    Args:
        X: Complete feature matrix, indexable by integer arrays.
        y: Labels, indexable by integer arrays.
        runs: Number of seeded repetitions.
        missingness: Forwarded to ``make_missing_random``.

    Returns:
        list of accuracy scores, one per fold per run.
    """
    scores = []
    for seed in range(runs):
        np.random.seed(seed)
        X_missing = make_missing_random(X, missingness)
        folds = StratifiedKFold(shuffle=True, random_state=seed)

        for train_idx, test_idx in folds.split(X, y):
            # Train on imputed corrupted data, test on clean data.
            X_train = KNNImputer().fit_transform(X_missing[train_idx])
            y_train = y[train_idx]
            X_test, y_test = X[test_idx], y[test_idx]

            tree = C45DecisionTree(criterion='c45', max_depth=20)
            tree.fit(X_train, y_train)
            scores.append(accuracy_score(tree.predict(X_test), y_test))
    return scores
def knn_impute_by_item(matrix, valid_data, k):
    """ Fill in the missing values using k-Nearest Neighbors based on
    question similarity. Return the accuracy on valid_data.

    The matrix is transposed before imputation and transposed back before
    evaluation; the validation accuracy is also printed.

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list,
    question_id: list, is_correct: list}
    :param k: int
    :return: float
    """
    item_imputer = KNNImputer(n_neighbors=k)
    imputed_by_item = item_imputer.fit_transform(matrix.transpose())
    acc = sparse_matrix_evaluate(valid_data, imputed_by_item.transpose())

    message = "Validation Accuracy Item_based with k = {} : {}".format(k, acc)
    print(message)
    return acc
def make_pipeline(df):
    """Build an encode-then-KNN-impute pipeline for ``df``.

    Object columns are encoded with ``CatBoostEncoder`` and the int64 and
    float64 columns are passed through unchanged; the encoder output is
    then fed, in full, to a ``KNNImputer``.

    NOTE(review): ``get_types`` is defined elsewhere; it is presumably a
    mapping from dtype name to a list of column names -- confirm. The mask
    below also assumes the encoder emits one output column per input
    column.

    Args:
        df: DataFrame whose column dtypes drive the pipeline layout.

    Returns:
        tuple of (unfitted ``Pipeline``, column names in output order:
        object, int64, float64).
    """
    x = df
    col_dtypes = get_types(x)
    numeric_columns = col_dtypes['int64'] + col_dtypes['float64']
    output_columns = col_dtypes['object'] + numeric_columns

    encoder = ColumnTransformer([
        ('categorical', CatBoostEncoder(), col_dtypes['object']),
        # remainder='passthrough' would also work, but listing the numeric
        # columns explicitly makes the output column ordering obvious.
        ('numeric', FunctionTransformer(), numeric_columns),
    ])

    # A boolean column selector must have one entry per column of the
    # encoder output, not one per row of the input frame.
    all_columns_idx = np.full(len(output_columns), True, dtype=bool)
    imputer = ColumnTransformer(
        [('knn_imputer', KNNImputer(), all_columns_idx)])

    pipeline = Pipeline(steps=[
        ('encoder', encoder),
        ('imputer', imputer),
    ])
    return pipeline, output_columns
def imputer(df, numerical, binary):
    """Impute binary columns by mode and numerical columns by KNN.

    Args:
        df: Input DataFrame; it is copied, not modified.
        numerical: Column labels imputed with a default ``KNNImputer``.
        binary: Column labels imputed with the most frequent value.

    Returns:
        tuple of (imputed copy of ``df``, fitted ``SimpleImputer``,
        fitted ``KNNImputer``).
    """
    imputed = df.copy()
    numerical_part = imputed[numerical]
    binary_part = imputed[binary]

    # Binary columns: most frequent value
    mode_imputer = SimpleImputer(missing_values=np.nan,
                                 strategy='most_frequent')
    mode_imputer = mode_imputer.fit(binary_part.values)
    binary_filled = mode_imputer.transform(binary_part.values)

    # Numerical columns: nearest neighbours
    knn_imputer = KNNImputer()
    knn_imputer = knn_imputer.fit(numerical_part.values)
    numerical_filled = knn_imputer.transform(numerical_part.values)

    # Assigning the arrays back keeps the original columns and index
    imputed[binary] = binary_filled
    imputed[numerical] = numerical_filled
    return imputed, mode_imputer, knn_imputer
def knn_missings(df, n_ngb=3):
    """
    Impute NaN values in the numeric columns of a dataframe with KNN.

    The numeric columns are chosen by ``num_columns``. The imputed values
    are written to a copy; the dataframe passed in is left unchanged.

    Params:
    df = dataframe.
    n_ngb = number of neighbors of KNN, by default 3.

    Returns:
    A copy of df with its numeric columns imputed.
    """
    result = df.copy()
    numeric_cols = num_columns(result)

    knn = KNNImputer(n_neighbors=n_ngb)
    knn.fit(df[numeric_cols])
    result[numeric_cols] = knn.transform(result[numeric_cols])
    return result
def impute_missing_values(self, data):
    """
    Method Name: impute_missing_values
    Description: Replaces all the missing values in the DataFrame using a
                 KNN imputer (3 neighbours, uniform weights) and rounds the
                 result.
    Output: A DataFrame which has all the missing values imputed. It is
            rebuilt from the rounded array with the original column names
            (the original index is not carried over).
    On Failure: Logs the error and raises Exception carrying the original
                message, chained to the original exception.
    """
    self.logger_object.log(
        self.file_object,
        'Entered the impute_missing_values method of the Preprocessor class')
    self.data = data
    try:
        imputer = KNNImputer(n_neighbors=3, weights='uniform',
                             missing_values=np.nan)
        # Impute the missing values; the result is a plain nd-array.
        self.new_array = imputer.fit_transform(self.data)
        # Convert the nd-array back to a DataFrame. Values are rounded
        # because the imputer returns values between 0 and 1, but either
        # 0 or 1 is needed.
        self.new_data = pd.DataFrame(data=np.round(self.new_array),
                                     columns=self.data.columns)
        self.logger_object.log(
            self.file_object,
            'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
        return self.new_data
    except Exception as e:
        self.logger_object.log(
            self.file_object,
            'Exception occured in impute_missing_values method of the Preprocessor class. Exception message: ' + str(e))
        self.logger_object.log(
            self.file_object,
            'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
        # Keep the Exception type callers expect, but preserve the message
        # and the original cause instead of raising an empty exception.
        raise Exception(str(e)) from e
def get_imputed(from_depth=0, to_depth=2, mode=MODE_MEAN):
    """Build a daily table from the CSV files and KNN-impute the gaps.

    Every ``*.csv`` under ``base_path`` is filtered to the requested depth
    range, aggregated per calendar day according to ``mode`` and
    outer-merged into one frame indexed by the days between ``FROM_CUTOFF``
    and ``TO_CUTOFF``. Its ``value`` column is renamed to the file name
    without the ``.csv`` suffix. Files with no rows left after the depth
    filter are skipped.

    Args:
        from_depth: Lowest depth to keep (inclusive).
        to_depth: Highest depth to keep (inclusive).
        mode: One of MODE_MAX, MODE_MIN, MODE_MEDIAN or MODE_MEAN.

    Returns:
        DataFrame with the merged columns, missing values filled by a
        default ``KNNImputer``.

    Raises:
        Exception: if ``mode`` is not one of the supported modes (only
            raised once a non-empty file is processed).
    """
    out = pd.DataFrame(
        index=pd.DatetimeIndex(pd.date_range(FROM_CUTOFF, TO_CUTOFF)))
    print("OUT:", out)

    for json_path in base_path.glob('*.csv'):
        print(json_path)
        with open(json_path, 'r') as f:
            df = pd.read_csv(f)

        df = df[(df.depth >= from_depth) & (df.depth <= to_depth)]
        df.index = pd.to_datetime(df['time'])
        df = df.drop(columns=['depth'])
        df = df.drop(columns=['time'])
        if df.empty:
            continue

        daily = df.groupby(pd.Grouper(freq='D'))
        if mode == MODE_MAX:
            df = daily.max()
        elif mode == MODE_MIN:
            # Previously this branch aggregated with max().
            df = daily.min()
        elif mode == MODE_MEDIAN:
            df = daily.median()
        elif mode == MODE_MEAN:
            df = daily.mean()
        else:
            raise Exception(mode)

        df = df.rename(columns={'value': json_path.name.replace('.csv', '')})
        out = pd.merge(out, df, left_index=True, right_index=True,
                       how='outer')

    print(out)
    imputer = KNNImputer()
    out = pd.DataFrame(imputer.fit_transform(out),
                       columns=out.columns,
                       index=out.index)
    return out
def knn(data_mat,
        n_neighbors=5,
        weights='uniform',
        metric='nan_euclidean',
        copy=True,
        add_indicator=False):
    """
    Impute missing values with scikit-learn's KNNImputer.

    The imputer is run on a copy of ``data_mat``; all keyword arguments
    are forwarded to ``KNNImputer`` unchanged.

    @param data_mat: numpy 2d array, missing values are represented by np.nan
    @param n_neighbors: number of neighbors
    @return: numpy 2d array after imputation
    """
    # Tested.
    working_copy = data_mat.copy()

    from sklearn.impute import KNNImputer

    settings = {
        'n_neighbors': n_neighbors,
        'weights': weights,
        'metric': metric,
        'copy': copy,
        'add_indicator': add_indicator,
    }
    return KNNImputer(**settings).fit_transform(working_copy)
def impute_missing_values(self, data):
    """Replace all missing values in ``data`` using a KNN imputer.

    Uses 3 neighbours with uniform weights. The input is stored on
    ``self.data``, the imputed array on ``self.new_array`` and the
    resulting frame on ``self.new_data``.

    Args:
        data: DataFrame to impute.

    Returns:
        DataFrame built from the imputed array with the original column
        names (the original index is not carried over).

    Raises:
        Exception: any error raised during imputation is logged and
            re-raised unchanged.
    """
    self.logger_object.log(
        self.file_object,
        'Entered the Impute_Missing_Values method of Data Proprocessing ')
    self.data = data
    try:
        imputer = KNNImputer(n_neighbors=3, weights='uniform',
                             missing_values=np.nan)
        self.new_array = imputer.fit_transform(self.data)
        # Build the frame from the imputed array; self.new_data does not
        # exist yet at this point.
        self.new_data = pd.DataFrame(data=self.new_array,
                                     columns=self.data.columns)
        self.logger_object.log(self.file_object,
                               'Imputing missing values Successful.')
        return self.new_data
    except Exception as e:
        # The message is built by concatenation, so it carries no '%s'
        # placeholder.
        self.logger_object.log(
            self.file_object,
            'Exception occured in impute_missing_values method Exception message: ' + str(e))
        self.logger_object.log(self.file_object,
                               'Imputing missing values failed.')
        raise e
def stats_preprocess():
    """KNN-impute the hourly data of every Seoul station and save it as CSV.

    The raw data is loaded once. Each station in ``SEOUL_STATIONS`` is then
    extracted and imputed on its own (5 neighbours, distance weights), and
    the per-station frames are concatenated and written to a fixed CSV
    path.
    """
    print("Data preprocessing(imputation) start...")
    raw_df = data.load(datecol=[1])

    dfs_h = []
    for station_name in tqdm.tqdm(SEOUL_STATIONS.keys(),
                                  total=len(SEOUL_STATIONS.keys())):
        sdf = data.load_station(raw_df, SEOUL_STATIONS[station_name])
        # np.nan rather than the np.NaN alias, which was removed in
        # NumPy 2.0.
        imputer = KNNImputer(n_neighbors=5,
                             weights="distance",
                             missing_values=np.nan)
        _df = pd.DataFrame(imputer.fit_transform(sdf))
        # The imputer returns a bare array; restore labels and index.
        _df.columns = sdf.columns
        _df.index = sdf.index
        dfs_h.append(_df)

    df = pd.concat(dfs_h)
    df.to_csv("/input/python/input_seoul_imputed_hourly_pandas.csv")
def impute_by_age(train_df, test_df):
    """
    Perform missing data imputation on both train and test, stratified by
    interview period. The imputer is fitted on the training rows of each
    period and then applied to the test rows of the same period.

    P1: [0; 30m]
    P2: (30; 72]
    P3: (72; 156]
    P4: (156; 204]
    P5: >204

    Only the periods present in ``train_df`` are processed; test rows from
    any other period are not part of the output. Columns whose name matches
    'subjectkey|interview|respon|relation' are left untouched.

    Parameters
    ----------
    train_df: dataframe
    test_df: dataframe

    Returns
    ------
    imputed dataframe train
    imputed dataframe test
    """
    knnimpute = KNNImputer(n_neighbors=ut.neighbors)
    col_n = [
        nc for nc in train_df.columns
        if not re.search('subjectkey|interview|respon|relation', nc)
    ]

    imputed_tr, imputed_ts = [], []
    for yr in sorted(train_df.interview_period.unique()):
        tmp_tr = train_df.loc[train_df.interview_period == yr].copy()
        tmp_ts = test_df.loc[test_df.interview_period == yr].copy()

        tmp_tr[col_n] = knnimpute.fit_transform(tmp_tr[col_n])
        # transform() rejects an input with zero rows, so skip periods
        # that have no test rows.
        if not tmp_ts.empty:
            tmp_ts[col_n] = knnimpute.transform(tmp_ts[col_n])

        imputed_tr.append(tmp_tr)
        imputed_ts.append(tmp_ts)

    new_tr = pd.concat(imputed_tr)
    new_ts = pd.concat(imputed_ts)
    return new_tr, new_ts
def test_knn_imputer_with_simple_example(na, working_memory):
    """Check KNNImputer against hand-computed neighbour means.

    Each missing cell is expected to become the mean of that column over
    the donor rows sliced out below. ``na`` is the missing-value marker and
    ``working_memory`` is applied through ``config_context``.
    """
    X = np.array([
        [0, na, 0, na],
        [1, 1, 1, na],
        [2, 2, na, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [na, 7, 7, 7],
    ])

    # Expected fill value for every missing cell, named after (row, column)
    fill_r0c1 = np.mean(X[1:6, 1])
    fill_r0c3 = np.mean(X[2:-1, -1])
    fill_r1c3 = np.mean(X[2:-1, -1])
    fill_r2c2 = np.mean(X[[0, 1, 3, 4, 5], 2])
    fill_r7c0 = np.mean(X[2:-1, 0])

    expected = np.array([
        [0, fill_r0c1, 0, fill_r0c3],
        [1, 1, 1, fill_r1c3],
        [2, 2, fill_r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [fill_r7c0, 7, 7, 7],
    ])

    with config_context(working_memory=working_memory):
        knn_imputer = KNNImputer(missing_values=na)
        actual = knn_imputer.fit_transform(X)
        assert_allclose(actual, expected)
def test_sklearn_knn_imputer_cdist(self):
    """Convert a fitted KNNImputer to ONNX with the 'cdist' optimisation.

    The option key 'optim2' must make the conversion raise NameError. With
    the 'optim' key, for every listed opset not above TARGET_OPSET, the
    converted model must contain a cdist operator and no scan operator, and
    is dumped together with the test data.
    """
    nan = numpy.nan
    x_train = numpy.array([[1, 2, nan, 12],
                           [3, nan, 3, 13],
                           [1, 4, nan, 1],
                           [nan, 4, 3, 12]],
                          dtype=numpy.float32)
    x_test = numpy.array([[1.3, 2.4, nan, 1],
                          [-1.3, nan, 3.1, nan]],
                         dtype=numpy.float32)
    model = KNNImputer(n_neighbors=3, metric='nan_euclidean').fit(x_train)

    def initial_types():
        # A fresh declaration per conversion call, as in the inline form.
        return [("input", FloatTensorType((None, x_test.shape[1])))]

    with self.assertRaises(NameError):
        convert_sklearn(model,
                        "KNN imputer",
                        initial_types(),
                        target_opset=TARGET_OPSET,
                        options={id(model): {'optim2': 'cdist'}})

    candidate_opsets = [TARGET_OPSET, 12, 11, 10, 9]
    for opset in candidate_opsets:
        if not opset > TARGET_OPSET:
            model_onnx = convert_sklearn(
                model,
                "KNN imputer",
                initial_types(),
                target_opset=opset,
                options={id(model): {'optim': 'cdist'}})
            self.assertIsNotNone(model_onnx)
            onnx_text = str(model_onnx).lower()
            self.assertIn('op_type: "cdist"', onnx_text)
            self.assertNotIn('scan', onnx_text)
            dump_data_and_model(x_test,
                                model,
                                model_onnx,
                                basename="SklearnKNNImputer%dcdist" % opset)
def get_train_test(fnc_file, loadings_file, lablels_file):
    '''
    Build the training and test data sets.
    Works with Rapids.ai ONLY (both frames are returned as cudf frames).

    The FNC and loading tables are merged on "Id" and left-joined with the
    labels; rows that have a label entry form the training set, the rest
    the test set. Features are standardised on the training set, the FNC
    features are down-weighted by 1/600, and missing values in the training
    frame are KNN-imputed.

    Returns:
        (train_df, test_df, features, target_cols)
    '''
    from sklearn.impute import KNNImputer
    from sklearn.preprocessing import StandardScaler

    data_dir = "../input/trends-assessment-prediction/"
    fnc_df = pd.read_csv(os.path.join(data_dir, fnc_file))
    loading_df = pd.read_csv(os.path.join(data_dir, loadings_file))

    # The first column of each table is skipped when listing features.
    fnc_features = list(fnc_df.columns[1:])
    loading_features = list(loading_df.columns[1:])

    df = fnc_df.merge(loading_df, on="Id")
    labels_df = pd.read_csv(os.path.join(data_dir, lablels_file))
    labels_df["is_train"] = True
    df = df.merge(labels_df, on="Id", how="left")

    # Rows without a matching label get NaN in "is_train" after the left
    # merge and therefore end up in the test set.
    has_labels = df["is_train"] == True
    test_df = df[~has_labels].copy()
    train_df = df[has_labels].copy()

    target_cols = ['age', 'domain1_var1', 'domain1_var2',
                   'domain2_var1', 'domain2_var2']
    train_df = train_df.drop(['is_train'], axis=1)
    test_df = test_df.drop(target_cols + ['is_train'], axis=1)
    features = loading_features + fnc_features

    # ----------------- Normalizing ------------------------
    scaler = StandardScaler()
    train_df[features] = scaler.fit_transform(train_df[features],
                                              train_df[target_cols])
    test_df[features] = scaler.transform(test_df[features])

    # Give less importance to FNC features since they are easier to overfit
    # due to high dimensionality.
    fnc_weight = 1/600
    train_df[fnc_features] = train_df[fnc_features].mul(fnc_weight)
    test_df[fnc_features] = test_df[fnc_features].mul(fnc_weight)

    # Impute missing values in the targets
    imputer = KNNImputer(n_neighbors = 5, weights="distance")
    imputed_train = pd.DataFrame(imputer.fit_transform(train_df),
                                 columns = list(train_df.columns))
    train_df = cudf.from_pandas(imputed_train)
    test_df = cudf.from_pandas(test_df)  # necessary for casting to gpu matrix

    del df
    gc.collect()
    return train_df, test_df, features, target_cols
def input_missing_data(exploration, df):
    """Streamlit widget: fill missing numeric values and offer a download.

    The user picks a missing-percentage limit and a method. The columns
    considered are the int64/float64 columns of ``exploration`` whose
    'NA %' is below the limit.

    Args:
        exploration: DataFrame with 'names', 'types' and 'NA %' columns
            describing the columns of ``df``.
        df: DataFrame whose missing values are filled.

    For 'Mean' and 'Median' only the selected columns are shown and offered
    for download. For 'KNN_Imputer' the selected columns are imputed and
    re-attached to the remaining columns of ``df``.
    """
    percentual = st.slider(
        'Choose the missing percentage limit for the columns you want to input data',
        min_value=0,
        max_value=100)
    is_numeric = ((exploration['types'] == 'int64') |
                  (exploration['types'] == 'float64'))
    columns_list = list(
        exploration[(exploration['NA %'] < percentual) & is_numeric]['names'])

    select_method = st.radio('Choose a metod :',
                             ('Mean', 'Median', 'KNN_Imputer'))
    st.markdown('You chosse : ' + str(select_method))

    if select_method == 'Mean':
        df_inputed = df[columns_list].fillna(df[columns_list].mean())
        st.table(df_inputed[columns_list].head(10))
    elif select_method == 'Median':
        df_inputed = df[columns_list].fillna(df[columns_list].median())
        st.table(df_inputed[columns_list].head(10))
    elif select_method == 'KNN_Imputer':
        imputer = KNNImputer(n_neighbors=3)
        st.markdown(columns_list)
        # Keep the original index so the imputed columns line up with the
        # untouched ones, and join them side by side (axis=1) rather than
        # stacking them underneath as extra rows.
        df_inputed = pd.DataFrame(imputer.fit_transform(df[columns_list]),
                                  columns=columns_list,
                                  index=df.index)
        df_inputed = pd.concat([df.drop(columns_list, axis=1), df_inputed],
                               axis=1)
    else:
        return

    st.subheader('Download data : ')
    st.markdown(get_table_download_link(df_inputed), unsafe_allow_html=True)
def compare_to_lasso_analysis(adata, ccdtranscript):
    '''Perform a comparison of pseudotime analysis to LASSO analysis for finding CCD proteins.

    A MultiTaskLassoCV model is fitted on the KNN-imputed expression matrix
    (zeros are treated as missing) against the Red585/Green530 values from
    ``adata.obs``, or loaded from the pickle cache when that file exists.
    Genes whose coefficients sum to a nonzero value are reported, and UMAPs
    are written for that gene set and for its complement.

    The figure size and the warning filters are changed for the duration of
    the call. The figure size is restored and the warning filter is set to
    "default" on exit, even if an error occurs.
    '''
    prevPlotSize = plt.rcParams['figure.figsize']
    plt.rcParams['figure.figsize'] = (6, 5)
    print("ANALYZING SC-RNA-SEQ WITH LASSO")
    warnings.filterwarnings("ignore")
    try:
        fucci_rna_path = "output/pickles/fucci_rna_imputed_lasso.pkl"
        if os.path.exists(fucci_rna_path):
            # NOTE: this unpickles a local cache file; only load pickles
            # from a trusted location.
            with open(fucci_rna_path, 'rb') as cache_file:
                fucci_rna = np.load(cache_file, allow_pickle=True)
        else:
            # The imputation is only needed for fitting, so it is skipped
            # when the cached model is used.
            fucci_rna_data = [(adata.obs["Red585"][ii],
                               adata.obs["Green530"][ii])
                              for ii in np.arange(len(adata.obs))]
            imputer = KNNImputer(missing_values=0)
            expression = imputer.fit_transform(adata.X)
            fucci_rna = MultiTaskLassoCV()
            fucci_rna.fit(expression, fucci_rna_data)
            with open(fucci_rna_path, 'wb') as cache_file:
                pickle.dump(fucci_rna, cache_file)

        coef_sums = np.sum(fucci_rna.coef_, axis=0)
        nz_coef = coef_sums != 0
        print(f"{sum(nz_coef)}: number of nonzero lasso coefficients")
        print(f"{adata.var_names[nz_coef]}: genes with nonzero lasso coeff")
        print(f"{sum(ccdtranscript[nz_coef]) / sum(nz_coef)}: % nonzero lasso found as CCD transcripts")
        print(f"{np.sum(fucci_rna.coef_, axis=0)[nz_coef]}: coefficients for nonzero lasso coeff")

        # Generate UMAP for CCD and nonCCD for the LASSO model
        adataCCd = adata[:, nz_coef]
        sc.pp.neighbors(adataCCd, n_neighbors=10, n_pcs=40)
        sc.tl.umap(adataCCd)
        sc.pl.umap(adataCCd, color="fucci_time", show=False, save=True)
        shutil.move("figures/umap.pdf", "figures/umapRNALassoCCD.pdf")

        adataNonCCd = adata[:, ~nz_coef]
        sc.pp.neighbors(adataNonCCd, n_neighbors=10, n_pcs=40)
        sc.tl.umap(adataNonCCd)
        sc.pl.umap(adataNonCCd, color="fucci_time", show=False, save=True)
        shutil.move("figures/umap.pdf", "figures/umapRNALassoNonCCD.pdf")
    finally:
        plt.rcParams['figure.figsize'] = prevPlotSize
        warnings.filterwarnings("default")
def create_preprocessor(df):
    """
    Build the imputation preprocessor for the columns of the input dataframe.

    The "kms" column is imputed with its mean; every other column of ``df``
    goes through a KNN imputer (2 neighbours, uniform weights).

    Returns the unfitted ColumnTransformer.
    """
    # Column groups handled by each imputer
    numeric_columns = ["kms"]
    categoric_columns = list(df)
    categoric_columns.remove("kms")

    # Numerical imputer: column mean
    mean_imputer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
    ])

    # An earlier variant imputed the remaining columns with
    # SimpleImputer(strategy='most_frequent') inside a Pipeline; the KNN
    # imputer below is used instead.
    knn_imputer = KNNImputer(n_neighbors=2, weights="uniform")

    # Tie each imputer to its column group
    return ColumnTransformer(transformers=[
        ('imputer_numeric', mean_imputer, numeric_columns),
        ('imputer_categoric', knn_imputer, categoric_columns),
    ])
def replace_missing_numbers(df, strat='median'):
    """Replace missing numeric data using the chosen strategy (median, mean or kNN).

    Any ``strat`` other than 'mean' or 'median' selects a default
    KNNImputer. The 'class' and 'income_>50K' columns are rounded after
    imputation. The input frame is not modified.
    """
    # Pick the imputer for the chosen strategy
    if strat in ('mean', 'median'):
        chosen_imputer = SimpleImputer(strategy=strat)
    else:
        chosen_imputer = KNNImputer()

    # Fill in the missing values on a copy of the input
    filled_values = chosen_imputer.fit_transform(df.copy())

    # Rebuild a dataframe from the new values
    result = pd.DataFrame(filled_values)
    # Label columns and rows as in the original dataframe
    result.columns = df.columns
    result.index = df.index

    for label in ('class', 'income_>50K'):
        result[label] = result[label].round()
    return result
def imputation_statique(df, statique):
    """Impute the missing data of ``df`` in place and return it.

    Only columns that contain both missing and non-missing values are
    imputed.

    Args:
        df: DataFrame to impute. It is modified in place. The KNN path
            expects a 'CustomerID' column, which is excluded from the
            feature matrix.
        statique: If true, object columns are filled with their mode and
            all other columns with their median. If false, a KNN imputer
            (3 neighbours) is run on the one-hot encoded frame. Object
            columns are then set to the category with the highest imputed
            indicator and other columns to the imputed value rounded to 2
            decimals.

    NOTE(review): ``get_dummies`` encodes a missing category as an all-zero
    row, which the KNN imputer does not treat as missing -- confirm the KNN
    path gives the intended result for object columns.

    Returns:
        The same ``df`` object.
    """
    # Row 0 of the per-column null/non-null percentages; a value below 100
    # marks a column with missing data.
    missing_data = df.apply(lambda x: np.round(
        x.isnull().value_counts() * 100.0 / len(x), 2)).iloc[0]
    columns_MissingData = missing_data[missing_data < 100].index

    # Test the ``statique`` argument, not the function object itself (which
    # is always truthy and made the KNN branch unreachable).
    if statique:
        for col in columns_MissingData:
            if df[col].dtype == 'O':
                df[col] = df[col].fillna(df[col].mode().iloc[0])
            else:
                df[col] = df[col].fillna(df[col].median())
    else:
        imputer = KNNImputer(n_neighbors=3)
        features = df.drop('CustomerID', axis=1)
        X = pd.concat([
            pd.get_dummies(features.select_dtypes('O')),
            features.select_dtypes(exclude='O')
        ], axis=1)
        # Keep X's index so assignments back into df align row by row.
        X_filled_knn = pd.DataFrame(imputer.fit_transform(X),
                                    columns=X.columns,
                                    index=X.index)
        for col in columns_MissingData:
            print(col)
            if df[col].dtypes == 'O':
                # Select exactly the dummy columns of this feature; a regex
                # built from the raw name could match unrelated columns.
                prefix = col + '_'
                dummy_cols = [
                    name for name in X_filled_knn.columns
                    if str(name).startswith(prefix)
                ]
                best_dummy = X_filled_knn[dummy_cols].idxmax(axis=1)
                df[col] = best_dummy.map(lambda name: name[len(prefix):])
            else:
                df[col] = np.round(X_filled_knn[col], 2)
    return df
def KNN_imputer(food_data, missed_features):
    """Restore the missing nutrition values of one food item with KNN.

    The reference table is read from the module-level ``url`` (its 'class'
    column is dropped). The item is appended as the last row, with NaN for
    every feature absent from ``food_data``, and the table is imputed with
    2 neighbours.

    NOTE(review): this assumes the remaining CSV columns are in the same
    order as ``features`` below -- confirm against the data file.

    Args:
        food_data: Mapping from feature name to value; missing keys are
            imputed.
        missed_features: Unused; kept for interface compatibility.

    Returns:
        Array of shape (1, 6) with the restored feature vector.
    """
    features = [
        'protein', 'fat', 'carbohydrates', 'sugar', 'sodium', 'calories'
    ]
    # axis must be passed by keyword: the positional form was removed in
    # pandas 2.0.
    Y = pd.read_csv(url).drop('class', axis=1).to_numpy()

    nan = np.nan
    query = np.array([[
        food_data[name] if name in food_data else nan for name in features
    ]])
    print('Vector before restoring {}'.format(query))

    Y = np.concatenate((Y, query))
    imputer = KNNImputer(n_neighbors=2, weights="uniform")
    # The item was appended last, so its restored values are the last row.
    X = imputer.fit_transform(Y)[-1].reshape(1, -1)
    print('Restored via KNNImputer vector {}'.format(X))
    return X
# Missing-value summary per column (non-NaN rows, NaN rows, NaN percentage):
# RainTomorrow: non-NaN rows: 142193  NaN rows: 3267  percent: 2.245978275814657
# RainToday:    non-NaN rows: 142199  NaN rows: 3261  percent: 2.241853430496356
# Rainfall:     non-NaN rows: 142199  NaN rows: 3261  percent: 2.241853430496356
# WindSpeed3pm: non-NaN rows: 142398  NaN rows: 3062  percent: 2.105046060772721
# Humidity9am:  non-NaN rows: 142806  NaN rows: 2654  percent: 1.8245565791282827
# WindSpeed9am: non-NaN rows: 143693  NaN rows: 1767  percent: 1.214766946239516
# Temp9am:      non-NaN rows: 143693  NaN rows: 1767  percent: 1.214766946239516
# MinTemp:      non-NaN rows: 143975  NaN rows: 1485  percent: 1.0208992162793895
# MaxTemp:      non-NaN rows: 144199  NaN rows: 1261  percent: 0.8669049910628351

# Median imputation for MinTemp.
x_train['MinTemp'] = x_train['MinTemp'].fillna(x_train['MinTemp'].median())

from sklearn.impute import KNNImputer

# KNN imputation over the whole training frame.
imputer = KNNImputer(n_neighbors=3)
x_train_knn_imp = imputer.fit_transform(x_train)
# Label-based assignment needs .loc; .iloc only accepts integer positions
# and raises on a column name.
# NOTE(review): column 15 of the imputed array is presumably Pressure9am --
# confirm against the column order of x_train.
x_train.loc[:, 'Pressure9am'] = x_train_knn_imp[:, 15]

df_train_c2 = pd.DataFrame(
    np.concatenate([x_train_c2, y_train_c2[:, np.newaxis]], axis=1),
    columns=data_c2.columns[np.concatenate([sel.get_support(), [True]])])

r = df_train.corr(method='pearson')

MI = mutual_info_regression(x_train_cca, y_train_cca)

fig, ax = plt.subplots(2, 1, figsize=(22, 15))
ax[0].set_title('Información mutua')
def fit_neighbours(data: pd.DataFrame, neighbors: int = 5) -> pd.DataFrame:
    """Return a KNN-imputed copy of ``data``.

    Args:
        data: Frame whose values are passed to ``KNNImputer``.
        neighbors: Number of neighbours used by the imputer.

    Returns:
        A new DataFrame with the imputed values and the same columns and
        index as ``data``.
    """
    knn = KNNImputer(n_neighbors=neighbors)
    filled_values = knn.fit_transform(data.values)
    return pd.DataFrame(filled_values, columns=data.columns, index=data.index)