def k_means(input_df):
    """Impute missing 'pH' values in a wine-quality frame with KNNImputer.

    The pH column is temporarily scaled by 100 (KNN distance is scale
    sensitive), the column is dropped from a held-out split so the imputer
    must reconstruct it, and the scaling is undone before returning.

    :param input_df: wine-quality DataFrame (NOTE: mutated in place --
        its 'pH' column is scaled by 100)
    :return: imputed DataFrame with the canonical wine-quality columns
    """
    input_df['pH'] = input_df['pH'] * 100
    k = input_df.loc[:, 'pH']
    X_pH_train, X_pH_test, y_pH_train, y_pH_test = train_test_split(
        input_df, k, test_size=0.33, random_state=42)
    # Hide pH in the test split so the imputer has to fill it back in.
    X_pH_test = X_pH_test.drop(columns='pH')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported replacement and builds the same frame here (the missing
    # pH column in the test rows becomes NaN, which KNNImputer fills).
    final = pd.concat([X_pH_train, X_pH_test], ignore_index=True)
    imputer = KNNImputer(n_neighbors=5, weights='uniform')
    np.set_printoptions(suppress=True)  # side effect: global numpy print options
    final = imputer.fit_transform(final)
    df = pd.DataFrame(final)
    df.columns = [
        'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
        'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
        'pH', 'sulphates', 'alcohol', 'quality'
    ]
    # Undo the temporary scaling.
    df['pH'] = df['pH'] / 100
    return df
def process_data(filename, fit_encoder=True): df = pd.read_csv(filename) # Handle categorical attributes df['CabinLetter'] = df['Title'] = df['TicketLabel'] = "" df['CabinLetter'] = list(df['Cabin'].apply(get_cabin_letter)) df['TicketLabel'] = list(df['Ticket'].apply(get_ticket_label)) df['Title'] = list(df['Name'].apply(get_title)) df['Embarked'].fillna('S', inplace=True) X_cat = df[['Pclass', 'Sex', 'Embarked', 'Title', 'CabinLetter']] if fit_encoder: encoder.fit(X_cat) X_cat = encoder.transform(X_cat).toarray() # Handle numerical attributes df['CabinNumber'] = df['TicketNumber'] = "" df['CabinNumber'] = list(df['Cabin'].apply(get_cabin_number)) #median_cabin = df['CabinNumber'].median() #df['CabinNumber'].fillna(median_cabin,inplace=True) df['TicketNumber'] = list(df['Ticket'].apply(get_ticket_number)) median_ticket = df['TicketNumber'].median() df['TicketNumber'].fillna(median_ticket, inplace=True) X_num = df[['Age', 'SibSp', 'Parch', 'Fare', 'TicketNumber']] scaler = MinMaxScaler() X_num = scaler.fit_transform(X_num) imputer = KNNImputer() X_num = imputer.fit_transform(X_num) # Impute Age #median_age = df['Age'].median() #df['Age'].fillna(median_age,inplace=True) # Impute Fare #median_fare = df['Fare'].median() #df['Fare'].fillna(median_fare,inplace=True) # Final X matrix X = np.hstack((X_cat, X_num)) # Final y array if 'Survived' in df: y = df['Survived'].array else: y = None return X, y
def impute_feature(data, feature):
    """KNN-impute one feature of a per-county/per-date panel, in place.

    The long-format frame is pivoted to a (county x date) matrix so the
    imputer can borrow values from counties with similar time series.
    Counties whose feature is entirely missing are reset to NaN at the
    end, since the imputer had no real data for them.

    :param data: long-format DataFrame with 'county_fips', 'date' and
        the feature column (mutated in place)
    :param feature: name of the column to impute
    :return: the same DataFrame, for chaining
    """
    # Negative values are treated as missing.
    # np.NaN was removed in NumPy 2.0; np.nan is the canonical spelling.
    data.loc[data[feature] < 0, feature] = np.nan
    value_count = data.groupby('county_fips').count()
    counties_with_all_nulls = value_count[value_count[feature] == 0]
    # Pivot: one row per county, one column per date.
    temp = pd.DataFrame(index=data['county_fips'].unique().tolist(),
                        columns=data['date'].unique().tolist())
    for i in data['date'].unique():
        temp[i] = data.loc[data['date'] == i, feature].tolist()
    X = np.array(temp)
    imputer = KNNImputer(n_neighbors=5)
    imp = imputer.fit_transform(X)
    imp = pd.DataFrame(imp)
    imp.columns = temp.columns
    imp.index = temp.index
    # Write the imputed values back into the long-format frame.
    for i in data['date'].unique():
        data.loc[data['date'] == i, feature] = imp[i].tolist()
    # Counties with no observed values at all cannot be meaningfully
    # imputed from their own history; mark them missing again.
    if len(counties_with_all_nulls) > 0:
        data.loc[data['county_fips'].isin(counties_with_all_nulls.index),
                 feature] = np.nan
    return data
def impute_missing_values(self, data):
    """Replace all missing values in *data* using sklearn's KNNImputer.

    :param data: DataFrame possibly containing NaNs
    :return: new DataFrame with the same columns and no missing values
    :raises Exception: if imputation fails (original error is chained)
    """
    self.logger_object.log(self.file_object, "Entered impute_missing_value method")
    self.data = data
    try:
        imputer = KNNImputer(n_neighbors=3, weights='uniform',
                             missing_values=np.nan)
        self.new_array = imputer.fit_transform(self.data)
        # KNNImputer returns an ndarray; restore the column labels.
        self.new_data = pd.DataFrame(data=self.new_array,
                                     columns=self.data.columns)
        self.logger_object.log(self.file_object, "Imputed missing values are success")
        return self.new_data
    except Exception as e:
        # Log the underlying error and chain it so callers see the real
        # cause instead of a bare, message-less Exception().
        self.logger_object.log(self.file_object,
                               "Error occurred on impute_missing_values: " + str(e))
        self.logger_object.log(self.file_object, "Imputation unsuccessful")
        raise Exception("impute_missing_values failed") from e
def cv_preprocessing(X_train, X_test=None, random_state=None):
    """Impute and outlier-filter a training fold (and optional test fold).

    A KNNImputer is fitted on X_train only and applied to X_test, then
    outlier handling is applied per feature group (phq9 items, cognitive
    features, T1 features). Feature-group definitions come from
    variables.json.

    NOTE(review): `random_state` is currently unused, and the variable
    `mice` is a KNNImputer despite its name suggesting MICE imputation --
    confirm both are intentional.

    :param X_train: training-fold DataFrame
    :param X_test: optional test-fold DataFrame, transformed with the
        imputer fitted on X_train
    :param random_state: unused
    :return: (X_train, X_test) when X_test is given, else X_train alone
    """
    variables_path = r"variables.json"
    with open(variables_path) as f:
        variables = json.load(f)
    t1_features, cogni = variables['t1_features'], variables['cogni']
    # First 17 PCL items (used only by the commented-out PCL step below).
    pcl = variables['questionnaires']['PCL'][:17]
    mice = KNNImputer()
    columns = X_train.columns
    X_train = pd.DataFrame(mice.fit_transform(X_train), columns=columns)
    #X_train = stds(X_train)
    #X_train = stats(X_train)
    #X_train = removal_correlated(X_train)
    # ss = StandardScaler()
    # X_train = ss.fit_transform(X_train)
    # X_train = pd.DataFrame(ss.fit_transform(X_train), columns=columns)
    if X_test is not None:
        # Reuse the imputer fitted on the training fold (no leakage).
        X_test = pd.DataFrame(mice.transform(X_test), columns=columns)
        #X_test = stds(X_test)
        #X_test = stats(X_train, X_test)
        #_, X_test = removal_correlated(X_train, X_test)
        # X_test = ss.transform(X_test)
        # X_test = pd.DataFrame(ss.transform(X_test), columns=columns)
        X_train, X_test = outliers(
            X_train,
            X_test,
            features=[f"T1q5.{i}" for i in range(1, 10)],
            name='phq9')
        #X_train, X_test = outliers(X_train, X_test, features=pcl, name='PCL')
        X_train, X_test = outliers(X_train, X_test, features=cogni, name='cogni')
        X_train, X_test = outliers(X_train, X_test, features=t1_features, name='t1')
        return X_train, X_test
    else:
        return X_train
def test_knn_imputer_callable_metric():
    """KNNImputer should accept a user-supplied callable as its metric."""

    # Define callable metric that returns the l1 norm:
    def custom_callable(x, y, missing_values=np.nan, squared=False):
        # Mask NaNs so they do not contribute to the distance.
        x = np.ma.array(x, mask=np.isnan(x))
        y = np.ma.array(y, mask=np.isnan(y))
        dist = np.nansum(np.abs(x - y))
        return dist

    X = np.array([[4, 3, 3, np.nan], [6, 9, 6, 9], [4, 8, 6, 9],
                  [np.nan, 9, 11, 10.0]])

    # Expected values hand-computed as the mean of the two nearest
    # rows' entries (n_neighbors=2) under the l1 metric above.
    X_0_3 = (9 + 9) / 2
    X_3_0 = (6 + 4) / 2
    X_imputed = np.array([[4, 3, 3, X_0_3], [6, 9, 6, 9], [4, 8, 6, 9],
                          [X_3_0, 9, 11, 10.0]])

    imputer = KNNImputer(n_neighbors=2, metric=custom_callable)
    assert_allclose(imputer.fit_transform(X), X_imputed)
def naKNN(train_x, test_x):
    """
    Replace missing values in the training set and the test set with
    KNNImputer().

    The imputer is fitted on the training data only, persisted to
    'imputer.pkl', and reused to transform the test set when one is
    given. Both dataset objects are mutated in place via their `.data`
    attribute.

    :param train_x: training set (object with a `.data` DataFrame)
    :param test_x: test set (same shape of object), or None
    :return: None
    """
    getNaCount(train_x)  # report the NaN count for the training set
    imputer = KNNImputer(n_neighbors=3)
    imputed_train = imputer.fit_transform(train_x.data)
    train_x.data = pd.DataFrame(imputed_train, columns=train_x.data.columns)
    save_object(
        imputer, 'imputer.pkl'
    )  # save the imputer to 'imputer.pkl' (needed later for the final test)
    if test_x is not None:
        # Transform only -- reuse the statistics fitted on the training set.
        imputed_test = imputer.transform(test_x.data)
        test_x.data = pd.DataFrame(imputed_test, columns=test_x.data.columns)
def impute_missing_values(self, data):
    """Replace all missing values in *data* using sklearn's KNNImputer.

    :param data: DataFrame possibly containing NaNs
    :return: new DataFrame with the same columns and no missing values
    :raises Exception: if imputation fails (original error is chained)
    """
    self.data = data
    try:
        self.logger.info('Start of imputing missing values...')
        imputer = KNNImputer(n_neighbors=3, weights='uniform',
                             missing_values=np.nan)
        self.new_array = imputer.fit_transform(
            self.data)  # impute the missing values
        # convert the nd-array returned in the step above to a Data frame
        self.new_data = pd.DataFrame(data=self.new_array,
                                     columns=self.data.columns)
        self.logger.info('End of imputing missing values...')
        return self.new_data
    except Exception as e:
        self.logger.exception(
            'Exception raised while imputing missing values:' + str(e))
        # Chain the original error so the traceback shows the real cause
        # instead of an empty Exception().
        raise Exception('imputing missing values failed') from e
def test_knn_imputer_not_enough_valid_distances(na, weights):
    """Imputation must still produce values when every candidate
    neighbour has NaN distance to the sample being imputed."""
    # Samples with needed feature has nan distance
    X1 = np.array([
        [na, 11],
        [na, 1],
        [3, na]
    ])
    # Column 0 has a single observed value (3), so both missing entries
    # become 3; the expected 6 in column 1 matches (11 + 1) / 2.
    X1_imputed = np.array([
        [3, 11],
        [3, 1],
        [3, 6]
    ])

    knn = KNNImputer(missing_values=na, n_neighbors=1, weights=weights)
    assert_allclose(knn.fit_transform(X1), X1_imputed)

    # transform() on unseen data reuses the fitted statistics.
    X2 = np.array([[4, na]])
    X2_imputed = np.array([[4, 6]])
    assert_allclose(knn.transform(X2), X2_imputed)
def knn_impute_by_user(matrix, valid_data, k):
    """ Fill in the missing values using k-Nearest Neighbors based on
    student similarity. Return the accuracy on valid_data.

    See https://scikit-learn.org/stable/modules/generated/sklearn.
    impute.KNNImputer.html for details.

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list,
    question_id: list, is_correct: list}
    :param k: int
    :return: float
    """
    # Rows are students; KNNImputer uses NaN-Euclidean distance between
    # rows, so imputing the matrix directly gives user-based KNN.
    imputer = KNNImputer(n_neighbors=k)
    completed_matrix = imputer.fit_transform(matrix)
    accuracy = sparse_matrix_evaluate(valid_data, completed_matrix)
    print("user Validation Accuracy: {}".format(accuracy))
    return accuracy
def impute_values(df, imp_strategy, neighbors, numeric_vars):
    """Impute the numeric columns of df and re-join the non-numeric ones.

    :param df: input DataFrame
    :param imp_strategy: "knn" to use KNNImputer, otherwise passed to
        SimpleImputer as its `strategy` (e.g. "mean", "median")
    :param neighbors: n_neighbors for the KNN imputer
    :param numeric_vars: list of numeric column names to impute
    :return: DataFrame of the non-numeric columns joined with the
        imputed numeric columns (index reset)
    """
    # NOTE(review): this result is immediately overwritten by the next
    # line -- presumably convert_to_numeric mutates df in place; confirm.
    X = convert_to_numeric(df, numeric_vars)
    X = df[numeric_vars].to_numpy()
    # Non-numeric columns are set aside and re-joined at the end.
    other_vars = list(set(df.columns) - set(numeric_vars)
                      )
    X_strings = df[other_vars].reset_index(drop=True)
    if imp_strategy == "knn":
        imputer = KNNImputer(n_neighbors = neighbors)  #weights = weight_type
        imputed = imputer.fit_transform(X)  # This is very costly
        # from here https://impyute.readthedocs.io/en/master/api/cross_sectional_imputation.html
        # https://impyute.readthedocs.io/en/master/api/cross_sectional_imputation.html
        # imputed = fast_knn(X, k= neighbors)
    else:
        imputer = SimpleImputer(missing_values = np.nan, strategy = imp_strategy)
        imputer.fit(X)
        imputed = imputer.transform(X)
    X_imputed = pd.DataFrame.from_records(imputed, columns = numeric_vars)
    rv = X_strings.join(X_imputed)
    return rv
def fill_categorical_na(self, df):
    """
    Impute categorical NaN values with a 2-neighbour KNN imputer.

    Args:
        :df: Dataframe (assumed already numerically encoded -- KNNImputer
             cannot operate on raw strings; TODO confirm upstream encoding)

    Returns:
        :df_result: Dataframe without NaN
    """
    df_result = df.copy()
    imputer = KNNImputer(n_neighbors=2, weights="uniform")
    # (Removed a stray no-op `df_result.columns` expression statement.)
    data_cat_imputed = imputer.fit_transform(df_result)
    # Write each imputed column back, preserving the original labels.
    for i in range(data_cat_imputed.shape[1]):
        df_result[df_result.columns[i]] = data_cat_imputed[:, i]
    return df_result
def test_model(k, v, data):
    """Evaluate a pre-trained classifier on external validation data.

    Three variants are scored: (1) the pre-trained model as-is, (2) a
    re-scaled model applying shrinkage factors from internal validation,
    and (3) a model re-calibrated on this sample.

    :param k: model key tuple; k[4] == 'nosoc' excludes nosocomial patients
    :param v: dict with 'X'/'y' feature lists, fitted 'scaler' and 'clf',
        and shrinkage terms 'shrink_slope'/'shrink_int'
    :param data: validation DataFrame
    :return: dict of per-variant summaries, or False when the nosoc
        subset would be empty
    """
    results = {}
    if k[4] == 'nosoc':  # If excluding nosocomial patients
        if data['nosoc'].sum() == 0:
            return (False)
        else:
            data = data[data['nosoc'] == 0]
    X = data[v['X']]
    y = data[v['y']].astype('int')
    # Scale, impute
    # NOTE(review): the scaler is the pre-fitted training scaler, but the
    # imputer is fitted on this validation data -- confirm intended.
    scaler, clf = v['scaler'], v['clf']
    imputer = KNNImputer()
    X = scaler.transform(X)
    X = imputer.fit_transform(X)
    # 1. Pre-trained model ------------------------------------------------
    y_prob = clf.predict_proba(X)[:, 1]
    y_pred = clf.predict(X)
    results['pretrained'] = get_summaries(clf, X, y, y_prob, y_pred)
    # Get 'treat all' line for net benefit
    results['treat_all'] = net_benefit(clf, X, y, treat_all=True)
    # 2. Re-scaled model [based in internal validation] -------------------
    # Apply linear shrinkage to coefficients/intercept, then map the
    # shrunken linear predictor back to probabilities via the logistic.
    scale_coef = np.sum(X * (clf.coef_ * v['shrink_slope']), axis=1)
    scale_int = clf.intercept_ + v['shrink_int']
    odds = np.exp(scale_coef + scale_int)
    y_prob = odds / (1 + odds)
    y_pred = np.where(y_prob > 0.5, 1, 0)
    results['rescaled'] = get_summaries(clf, X, y, y_prob, y_pred)
    # 3. Re-calibrated model [based on validation sample] -----------------
    clf_recal = CalibratedClassifierCV(clf, method='sigmoid',
                                       cv='prefit').fit(X, y)
    y_pred = clf_recal.predict(X)
    y_prob = clf_recal.predict_proba(X)[:, 1]
    y_logp = np.log(y_prob / (1 - y_prob))  # linear predictor (log-odds)
    results['recal'] = get_summaries(clf_recal, X, y, y_prob, y_pred,
                                     lp=y_logp)
    # Store outcome rate
    results['meany'] = np.mean(y)
    return (results)
def knn_impute_by_item(matrix, valid_data, k):
    """ Fill in the missing values using k-Nearest Neighbors based on
    question similarity. Return the accuracy on valid_data.

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list,
    question_id: list, is_correct: list}
    :param k: int
    :return: float
    """
    # The imputer measures NaN-Euclidean distance between rows, so the
    # matrix is transposed to make rows correspond to questions, then
    # transposed back before evaluation.
    imputer = KNNImputer(n_neighbors=k)
    filled = imputer.fit_transform(matrix.T)
    acc = sparse_matrix_evaluate(valid_data, filled.T)
    print("Validation Accuracy on question: {}".format(acc))
    return acc
def run_knn(train_data, val_data, k):
    """Create matrix prediction using KNN trained on train data.
    k is k nearest neighbors

    :param train_data: A dictionary {user_id: list,
    question_id: list, is_correct: list}
    :param val_data: A dictionary {user_id: list,
    question_id: list, is_correct: list}
    :param k: int
    :return: num_users by num_questions matrix of predictions
    """
    sparse_matrix = load_train_sparse("../data").toarray()
    size = len(train_data['user_id'])
    # NOTE(review): the bootstrap sample is computed but never used --
    # the imputer below runs on the full sparse matrix, and `val_data`
    # is also unused. Confirm whether this call is kept only for its
    # RNG side effects before removing it.
    train_data_bootstrap = bootstrap_data(train_data, size)
    nbrs = KNNImputer(n_neighbors=k)
    # We use NaN-Euclidean distance measure.
    mat = nbrs.fit_transform(sparse_matrix)
    return mat
def _fill_missing_data(self, data):
    """
    :param data: Data-frame after merging user data and job description
        data, and feature engineered.
    :return: Data-frame after filling in missing values in the features
        using KNN imputing.
    """
    # NOTE(review): raising the recursion limit looks unrelated to the
    # imputation below -- confirm it is still needed.
    sys.setrecursionlimit(100000)
    # 'has_applied' is excluded from imputation (the label column).
    non_imput_cols = ['has_applied']
    data_to_imput = data.drop(non_imput_cols, axis=1)
    imput_cols = list(data_to_imput)
    non_imputed_data = data.drop(imput_cols, axis=1)
    imp_mean_knn = KNNImputer(n_neighbors=30)
    imputed_data = imp_mean_knn.fit_transform(data_to_imput)
    # fit_transform returns an ndarray; restore the column labels.
    imputed_data = pd.DataFrame(imputed_data, columns=imput_cols)
    # inner join: keeps only rows whose index exists on both sides.
    resultant_imputed_data = pd.concat([non_imputed_data, imputed_data],
                                       axis=1,
                                       join='inner')
    return resultant_imputed_data
def imputations(df1, cols):
    """Label-encode categorical columns, KNN-impute, then decode back.

    Relies on module-level globals: `mappin` (dict collecting per-column
    category mappings), `mm` (shared min-max scaler) and the helpers
    find_category_mappings / integer_encode.

    :param df1: input DataFrame (imputation happens on a copy)
    :param cols: categorical columns to encode before imputation
    :return: imputed DataFrame with categorical codes mapped back to
        their original labels
    """
    df = df1.copy()
    # Build and remember an integer mapping for every categorical column.
    for variable in cols:
        mappings = find_category_mappings(df, variable)
        mappin[variable] = mappings
    # Replace categories with their integer codes (in-place helper --
    # presumably; confirm integer_encode's contract).
    for variable in cols:
        integer_encode(df, variable, mappin[variable])
    # Scale, impute in scaled space, then invert the scaling.
    sca = mm.fit_transform(df)
    knn_imputer = KNNImputer()
    knn = knn_imputer.fit_transform(sca)
    df.iloc[:, :] = mm.inverse_transform(knn)
    # Imputed values are fractional; snap every column to integers.
    for i in df.columns:
        df[i] = round(df[i]).astype('int')
    # Map the integer codes back to the original category labels.
    for i in cols:
        inv_map = {v: k for k, v in mappin[i].items()}
        df[i] = df[i].map(inv_map)
    return df
def _gen_inits_clustering(X, K, n_iter=10, skip_spectral=True):
    """Generate clustering label inits from X and two imputed variants.

    Zeros in X are treated as missing values; a mean-imputed copy and a
    KNN-imputed copy are produced, label inits are generated for each of
    the three matrices, and the combined list is deduplicated.
    """
    with_nans = X.copy().astype('float')
    with_nans[with_nans == 0] = np.nan
    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    knn_imputer = KNNImputer()
    variants = [
        X,
        mean_imputer.fit_transform(with_nans),
        knn_imputer.fit_transform(with_nans),
    ]
    all_inits = []
    for variant in variants:
        all_inits += gen_inits_for_X(variant, K, n_iter, skip_spectral)
    return dedup_labels(all_inits)
def impute_missing_values(self, data):
    """
    Method Name: impute_missing_values
    Description: This method replaces all the missing values in the Dataframe using KNN Imputer.
    Output: A Dataframe which has all the missing values imputed.
    On Failure: Raise Exception (the original error is chained as the cause)
    """
    self.logger_object.log(self.file_object, 'Entered the impute_missing_values method of the Preprocessor class')
    self.data = data
    try:
        imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
        self.new_array = imputer.fit_transform(self.data)  # impute the missing values
        # convert the nd-array returned in the step above to a Dataframe
        self.new_data = pd.DataFrame(data=self.new_array, columns=self.data.columns)
        self.logger_object.log(self.file_object, 'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
        return self.new_data
    except Exception as e:
        self.logger_object.log(self.file_object, 'Exception occured in impute_missing_values method of the Preprocessor class. Exception message: ' + str(e))
        self.logger_object.log(self.file_object, 'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
        # Chain the original exception so callers see the real cause
        # instead of an empty Exception().
        raise Exception('Imputing missing values failed') from e
def impute_experience(df, cat_var):
    """KNN-impute the 'experience' column and replace it with one-hot bins.

    '<1' is mapped to 0 and '>20' to 21 so the column is numeric, the
    categorical columns (plus 'last_new_job') are excluded from the
    imputation matrix, and the imputed years are bucketed into five
    dummy-encoded bins.

    :param df: input DataFrame (NOTE: its 'experience' column is
        re-coded in place before the copy-producing drop/concat steps)
    :param cat_var: list of categorical column names to exclude
    :return: DataFrame with 'experience' replaced by exp_* dummy columns
    """
    df['experience'] = df['experience'].replace(['<1'], 0)
    df['experience'] = df['experience'].replace(['>20'], 21)
    # (Removed a dead `df1 = df` assignment that was immediately
    # overwritten by the drop() below.)
    df1 = df.drop(cat_var + list(['last_new_job']), axis=1)
    imputer = KNNImputer()
    df1_imputed = imputer.fit_transform(df1)
    df1_imputed = pd.DataFrame(df1_imputed, index=df1.index, columns=df1.columns)
    # Five equal-width bins spanning 0..25 years.
    bins = np.linspace(0, 25, 6)
    labels = ['exp_one', 'exp_two', 'exp_three', 'exp_four', 'exp_five']
    df1_imputed['exp_bins'] = pd.cut(df1_imputed['experience'],
                                     bins=bins, labels=labels)
    df2 = pd.get_dummies(df1_imputed['exp_bins'])
    df = df.drop(['experience'], axis=1)
    df = pd.concat([df, df2], axis=1)
    return df
def compare_to_lasso_analysis(adata, ccdtranscript):
    '''Perform a comparison of pseudotime analysis to LASSO analysis for finding CCD proteins'''
    # Temporarily widen figures for the UMAP plots; restored at the end.
    prevPlotSize = plt.rcParams['figure.figsize']
    plt.rcParams['figure.figsize'] = (6, 5)
    print("ANALYZING SC-RNA-SEQ WITH LASSO")
    warnings.filterwarnings("ignore")
    # Regression targets: per-cell (red, green) FUCCI intensities.
    fucci_rna_data = [(adata.obs["Red585"][ii], adata.obs["Green530"][ii])
                      for ii in np.arange(len(adata.obs))]
    # Zeros in the expression matrix are treated as missing and KNN-imputed.
    imputer = KNNImputer(missing_values=0)
    expression = imputer.fit_transform(adata.X)
    # Cache the fitted LASSO model on disk; refitting is expensive.
    fucci_rna_path = "output/pickles/fucci_rna_imputed_lasso.pkl"
    if os.path.exists(fucci_rna_path):
        fucci_rna = np.load(open(fucci_rna_path, 'rb'), allow_pickle=True)
    else:
        fucci_rna = MultiTaskLassoCV()
        fucci_rna.fit(expression, fucci_rna_data)
        pickle.dump(fucci_rna, open(fucci_rna_path, 'wb'))
    # Genes retained by the LASSO: nonzero coefficient in either task.
    nz_coef = np.sum(fucci_rna.coef_, axis=0) != 0
    print(f"{sum(nz_coef)}: number of nonzero lasso coefficients")
    print(f"{adata.var_names[nz_coef]}: genes with nonzero lasso coeff")
    print(
        f"{sum(ccdtranscript[nz_coef]) / sum(nz_coef)}: % nonzero lasso found as CCD transcripts"
    )
    print(
        f"{np.sum(fucci_rna.coef_, axis=0)[nz_coef]}: coefficients for nonzero lasso coeff"
    )

    # Generate UMAP for CCD and nonCCD for the LASSO model
    adataCCd = adata[:, nz_coef]
    sc.pp.neighbors(adataCCd, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adataCCd)
    sc.pl.umap(adataCCd, color="fucci_time", show=False, save=True)
    # scanpy saves to figures/umap.pdf; move each plot to a distinct name.
    shutil.move("figures/umap.pdf", f"figures/umapRNALassoCCD.pdf")
    adataNonCCd = adata[:, ~nz_coef]
    sc.pp.neighbors(adataNonCCd, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adataNonCCd)
    sc.pl.umap(adataNonCCd, color="fucci_time", show=False, save=True)
    shutil.move("figures/umap.pdf", f"figures/umapRNALassoNonCCD.pdf")

    # Restore global state changed at the top of this function.
    plt.rcParams['figure.figsize'] = prevPlotSize
    warnings.filterwarnings("default")
def impute_missing_values(self, data):
    """
    Method Name: impute_missing_values
    Description: This method replaces all the missing values in the Dataframe using KNN Imputer.
    Output: A Dataframe which has all the missing values imputed.
    On Failure: Raise Exception (the original error is chained as the cause)

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """
    self.logger_object.log(
        self.file_object,
        'Entered the impute_missing_values method of the Preprocessor class'
    )
    self.data = data
    try:
        imputer = KNNImputer(n_neighbors=3, weights='uniform',
                             missing_values=np.nan)
        self.new_array = imputer.fit_transform(
            self.data)  # impute the missing values
        # convert the nd-array returned in the step above to a Dataframe
        # rounding the value because KNNimputer returns value between 0 and 1, but we need either 0 or 1
        self.new_data = pd.DataFrame(data=np.round(self.new_array),
                                     columns=self.data.columns)
        self.logger_object.log(
            self.file_object,
            'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
        )
        return self.new_data
    except Exception as e:
        self.logger_object.log(
            self.file_object,
            'Exception occured in impute_missing_values method of the Preprocessor class. Exception message: '
            + str(e))
        self.logger_object.log(
            self.file_object,
            'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
        )
        # Chain the original exception so callers see the real cause
        # instead of an empty Exception().
        raise Exception('Imputing missing values failed') from e
def impute(self):
    """
    Perform the Nearest Neighbor Imputation batch by batch.

    Each batch is imputed independently (the imputer is re-fit per
    batch); the 'name' column is excluded from the distance computation
    and restored untouched.
    """
    batches = self.__get_batches()
    # The weight parameter was set to distance in order to increase the
    # influence of closer elements
    imputer = KNNImputer(n_neighbors=self.__n_neighbors, weights='distance')
    print('Performing imputations...')
    # Collect imputed batches and concatenate once at the end --
    # concatenating inside the loop is quadratic in the number of rows.
    imputed_batches = []
    for batch in tqdm(batches):
        # NOTE(review): writing back via iloc[:, 1:] assumes 'name' is
        # the first column -- confirm the batch layout.
        data = imputer.fit_transform(batch.drop(['name'], axis=1))
        batch.iloc[:, 1:] = data
        imputed_batches.append(batch)
    df = pd.concat(
        [pd.DataFrame(columns=list(self.__df.columns))] + imputed_batches)
    self.__update_df(df)
def test_model(feature_set, dataset):
    """ Test validation sample on pre-trained model for a given feature set.

    Relies on module-level globals `models` (feature lists per set) and
    `pretrained` (fitted classifiers per set).

    :param feature_set: key into `models` / `pretrained`
    :param dataset: DataFrame containing the required features and a
        binary outcome column 'y'
    :return: dict with the classifier, processed X, y, predictions and
        positive-class probabilities
    :raises ValueError: when 'y' or any required feature is missing
    """
    if 'y' not in list(dataset):
        raise ValueError('Dataset must contain binary outcome, y')
    if not set(models[feature_set]).issubset(list(dataset)):
        raise ValueError('Dataset must contain required features')
    clf = pretrained[feature_set]
    y = dataset['y']
    X = dataset[models[feature_set]]
    # Scale/impute
    # NOTE(review): scaler and imputer are fitted on this (validation)
    # data rather than reusing training-time transformers -- confirm
    # this is intended.
    scaler = StandardScaler()
    imputer = KNNImputer()
    X = scaler.fit_transform(X)
    X = imputer.fit_transform(X)
    # Predict
    y_pred = clf.predict(X)
    y_prob = clf.predict_proba(X)[:, 1]  # positive-class probability
    # Return
    return ({'clf': clf, 'X': X, 'y': y, 'y_pred': y_pred, 'y_prob': y_prob})
def fit_imputed(v, train, valid): """ Function to test a single model in validation sample [valid], having trained on the training [train] sample, after scaling and imputation. """ # Select features/outcome X_train = train[v] y_train = train['y'] n_train = np.shape(X_train)[0] # Scale/impute scaler = StandardScaler() X_train = scaler.fit_transform(X_train) imputer = KNNImputer() X_train = imputer.fit_transform(X_train) # Train Logistic Regression with inner CV using training sample clf = LogisticRegressionCV(cv=inner, penalty='l1', Cs=10**np.linspace(0.1, -3, 50), random_state=42, solver='liblinear', scoring=roc_auc_scorer).fit(X_train, y_train) # Predict in validation sample X_test = valid[v] y_test = valid['y'] X_test = scaler.transform(X_test) X_test = imputer.transform(X_test) y_pred = clf.predict(X_test) y_prob = clf.predict_proba(X_test)[:, 1] # Return return ({ 'clf': clf, 'n_train': n_train, 'X_train': X_train, 'y_train': y_train, 'X_test': X_test, 'y_test': y_test, 'y_pred': y_pred, 'y_prob': y_prob })
def knn_impute_by_item(matrix, valid_data, k):
    """ Fill in the missing values using k-Nearest Neighbors based on
    question similarity. Return the accuracy on valid_data.

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list,
    question_id: list, is_correct: list}
    :param k: int
    :return: float
    """
    # Rows must correspond to questions for item-based similarity, so
    # impute on the transpose and flip the completed matrix back before
    # scoring against the validation dictionary.
    imputer = KNNImputer(n_neighbors=k)
    completed = imputer.fit_transform(matrix.T).T
    acc = sparse_matrix_evaluate(valid_data, completed)
    print("Validation Accuracy Item_based with k = {} : {}".format(k, acc))
    return acc
def impute_df(df):
    """Impute a mixed-type DataFrame.

    Numeric columns are filled by mean (|skew| < 1) or median; object
    columns are label-encoded, KNN-imputed with 2 neighbours, rounded,
    and decoded back to their original labels.

    :param df: DataFrame with numeric and object columns (mutated:
        numeric NaNs are filled in place)
    :return: fully imputed DataFrame with original labels restored
    """
    # imputer = KNN()
    imputer = KNNImputer(n_neighbors=2)
    object_types = list(df.select_dtypes(include=['object']).columns)
    num_types = list(set(df.columns) - set(object_types))
    encoders_store = {}
    # Numeric columns: mean for roughly symmetric data, median otherwise.
    for column in num_types:
        skew = df[column].skew()
        if (-1 < skew < 1):
            df[column] = df[column].fillna(df[column].mean())
        else:
            df[column] = df[column].fillna(df[column].median())
    #create a for loop to iterate through each column in the data
    # NOTE(review): encode() presumably returns (encoded_column, encoder)
    # and encodes df's column in place -- otherwise fit_transform below
    # would see raw strings; confirm encode's contract.
    for columns in object_types:
        new = encode(df[columns])
        encoders_store[columns] = new[1]
    # Round so the encoded categorical codes stay integral after imputation.
    imputed_data = pd.DataFrame(np.round(imputer.fit_transform(df)),
                                columns=df.columns)
    # Map the integer codes back to the original category labels.
    for columns in object_types:
        imputed_data[columns] = encoders_store[columns].inverse_transform(
            np.array(imputed_data[columns]).reshape(-1, 1))
    return imputed_data
def impute_last_new_job(df, cat_var):
    """KNN-impute the 'last_new_job' column and replace it with one-hot bins.

    'never' is mapped to 0 and '>4' to 5 so the column is numeric, the
    categorical columns are excluded from the imputation matrix, and the
    imputed values are bucketed into six dummy-encoded bins.

    :param df: input DataFrame (NOTE: its 'last_new_job' column is
        re-coded in place before the copy-producing drop/concat steps)
    :param cat_var: list of categorical column names to exclude
    :return: DataFrame with 'last_new_job' replaced by lnj_* dummy columns
    """
    df['last_new_job'] = df['last_new_job'].replace(['never'], 0)
    df['last_new_job'] = df['last_new_job'].replace(['>4'], 5)
    # (Removed a dead `df1 = df` assignment that was immediately
    # overwritten by the drop() below.)
    df1 = df.drop(cat_var, axis=1)
    imputer = KNNImputer()
    df1_imputed = imputer.fit_transform(df1)
    df1_imputed = pd.DataFrame(df1_imputed, index=df1.index, columns=df1.columns)
    # Six unit-width bins over -1..5 so 0 ('never') gets its own bin.
    bins = np.linspace(-1, 5, 7)
    labels = [
        'lnj_zero', 'lnj_one', 'lnj_two', 'lnj_three', 'lnj_four', 'lnj_five'
    ]
    df1_imputed['lnj_bins'] = pd.cut(df1_imputed['last_new_job'],
                                     bins=bins, labels=labels)
    df2 = pd.get_dummies(df1_imputed['lnj_bins'])
    df = df.drop(['last_new_job'], axis=1)
    df = pd.concat([df, df2], axis=1)
    return df
def experiment_setting_5(X, y, runs=5, missingness=0.1):
    """Repeated stratified CV where only the training folds have missing data.

    For each run, values are removed from X at random, a stratified
    K-fold split is made, the training fold is KNN-imputed, and a C4.5
    tree trained on it is scored against the untouched test fold.

    :param X: feature matrix
    :param y: labels
    :param runs: number of repetitions (each with its own seed)
    :param missingness: fraction of values to remove from X
    :return: list of per-fold accuracy scores across all runs
    """
    scores = []
    for run in range(runs):
        np.random.seed(run)  # deterministic missingness pattern per run
        X_missing = make_missing_random(X, missingness)
        folds = StratifiedKFold(shuffle=True, random_state=run)
        for train_idx, test_idx in folds.split(X, y):
            imputer = KNNImputer()
            train_features = imputer.fit_transform(X_missing[train_idx])
            tree = C45DecisionTree(criterion='c45', max_depth=20)
            tree.fit(train_features, y[train_idx])
            scores.append(accuracy_score(tree.predict(X[test_idx]),
                                         y[test_idx]))
    return scores
def get_imputed(from_depth=0, to_depth=2, mode=MODE_MEAN):
    """Build a daily DataFrame from all sensor CSVs, KNN-imputing gaps.

    Every *.csv under `base_path` is filtered to the requested depth
    range, resampled to daily values with the requested aggregate,
    outer-merged onto a continuous FROM_CUTOFF..TO_CUTOFF date index,
    and finally KNN-imputed column-wise.

    :param from_depth: minimum depth (inclusive)
    :param to_depth: maximum depth (inclusive)
    :param mode: one of MODE_MEAN / MODE_MEDIAN / MODE_MIN / MODE_MAX
    :return: imputed daily DataFrame, one column per CSV file
    :raises Exception: on an unrecognised mode
    """
    out = pd.DataFrame(
        index=pd.DatetimeIndex(pd.date_range(FROM_CUTOFF, TO_CUTOFF)))
    print("OUT:", out)
    for json_path in base_path.glob('*.csv'):
        print(json_path)
        with open(json_path, 'r') as f:
            df = pd.read_csv(f)
        df = df[(df.depth >= from_depth) & (df.depth <= to_depth)]
        df.index = pd.to_datetime(df['time'])
        df = df.drop(columns=['depth'])
        df = df.drop(columns=['time'])
        if df.empty:
            continue
        elif mode == MODE_MAX:
            df = df.groupby(pd.Grouper(freq='D')).max()
        elif mode == MODE_MIN:
            # BUG FIX: this branch previously aggregated with .max(),
            # so MODE_MIN silently behaved like MODE_MAX.
            df = df.groupby(pd.Grouper(freq='D')).min()
        elif mode == MODE_MEDIAN:
            df = df.groupby(pd.Grouper(freq='D')).median()
        elif mode == MODE_MEAN:
            df = df.groupby(pd.Grouper(freq='D')).mean()
        else:
            raise Exception(mode)
        # One column per file, named after the CSV (extension stripped).
        df = df.rename(columns={'value': json_path.name.replace('.csv', '')})
        out = pd.merge(out, df, left_index=True, right_index=True, how='outer')
        print(out)
    imputer = KNNImputer()
    out = pd.DataFrame(imputer.fit_transform(out),
                       columns=out.columns,
                       index=out.index)
    return out