def generateImputedDataset(dataset):
    """Return a new DataFrame whose missing values are filled via k-NN (k=30).

    :param dataset: pandas DataFrame possibly containing NaNs.
    :return: a fresh DataFrame (default integer column labels) with NaNs imputed.
    """
    # fast_knn recurses deeply on large inputs, so raise the interpreter limit first.
    sys.setrecursionlimit(10000)
    filled = fast_knn(dataset.values, k=30)
    return pd.DataFrame(filled)
def imputation(number_list, mode, k=3):
    """Impute NaN entries of ``number_list`` using the chosen strategy.

    :param number_list: list of floats, possibly containing NaN. The
        'random'/'frequency'/'mean' modes mutate it in place and return it.
    :param mode: one of 'knn', 'random', 'regression', 'frequency', 'mean',
        'corrected', 'fard', 'porsesh', 'hambaste'.
    :param k: neighbour count for the 'knn' mode (default 3).
    :return: the imputed data; ``None`` for an unrecognized mode (as before).
    """
    # Bug fix: the original compared strings with `is`, which only works by
    # accident of CPython small-string interning; `==` is the correct operator.
    if mode == 'knn':
        return fast_knn(number_list, k)
    if mode == 'random':
        observed = [v for v in number_list if not math.isnan(v)]
        if not observed:  # nothing to sample from — leave the list unchanged
            return number_list
        for index, value in enumerate(number_list):
            if math.isnan(value):
                number_list[index] = random.choice(observed)
        return number_list
    if mode == 'regression':
        return reg_impute(number_list)
    if mode == 'frequency':
        observed = [v for v in number_list if not math.isnan(v)]
        if not observed:
            return number_list
        # Most frequent observed value. Bug fix: the original called the
        # undefined global ``df.mode``; also hoisted out of the fill loop.
        most_frequent = max(observed, key=observed.count)
        for index, value in enumerate(number_list):
            if math.isnan(value):
                number_list[index] = most_frequent
        return number_list
    if mode == 'mean':
        observed = [v for v in number_list if not math.isnan(v)]
        if not observed:
            return number_list
        # Mean of observed values. Bug fix: the original called the undefined
        # global ``df.mean``.
        mean = sum(observed) / len(observed)
        for index, value in enumerate(number_list):
            if math.isnan(value):
                number_list[index] = mean
        return number_list
    if mode == 'corrected':
        return correct_impute(number_list)
    if mode == 'fard':
        return fard_impute(number_list)
    if mode == 'porsesh':
        return porsesh_impute(number_list)
    if mode == 'hambaste':
        return hambaste_impute(number_list)
def calc_remaining_probs(args1: tuple, args2: tuple) -> tuple:
    """Build a fully-imputed data set from two CSV sources.

    Reads two specs via ``csv_reader``, joins them with
    ``preprocess_dataset``, fills missing values with ``fast_knn``, and
    re-attaches the related-index columns to the imputed matrix.

    :param args1: arguments unpacked into ``csv_reader`` for the first CSV —
        assumed to match csv_reader's signature (TODO confirm at call site).
    :param args2: arguments unpacked into ``csv_reader`` for the second CSV.
    :return: 3-tuple ``(imputed_array_with_index, ind_dis_vars, rel_dis_vars)``.
        NOTE(review): the original annotation said ``(list, list)`` but three
        values are returned, so the annotation was corrected to ``tuple``.
    """
    (ind_spec, ind_dis_vars) = csv_reader(*args1)
    (rel_spec, rel_dis_vars) = csv_reader(*args2)
    (joined_table, index_related) = preprocess_dataset(ind_spec, ind_dis_vars,
                                                       rel_spec, rel_dis_vars)
    # Impute NaNs in the joined numeric table (k left at fast_knn's default).
    data_set = fast_knn(np.array(joined_table))
    # Prepend the related-index columns back onto the imputed matrix.
    return (np.append(index_related, data_set, axis=1), ind_dis_vars,
            rel_dis_vars)
def replace_missing_knn(x_train, x_test=None, **knn_kwargs):
    """Replaces missing data using K nearest neighbors.

    Parameters
    ----------
    x_train: Dataframe or array like - 2d
        Dataset
    x_test: Dataframe or array like - 2d
        Testing dataset, by default None.
    k : int, optional
        Number of rows around the missing data to look at, by default 5
        (popped from ``knn_kwargs``; remaining kwargs forwarded to fast_knn).

    Returns
    -------
    Dataframe, *Dataframe
        Transformed training dataframe; returns 2 Dataframes
        (train, test) if x_test is provided.
    """
    neighbors = knn_kwargs.pop("k", 5)
    columns = x_train.columns
    train_knn_transformed = fast_knn(x_train.values, k=neighbors, **knn_kwargs)
    train_df = pd.DataFrame(data=train_knn_transformed, columns=columns)
    if x_test is None:
        # Bug fix: the original had no train-only return path, so calling
        # without a test set never yielded the imputed training frame.
        return train_df
    warnings.warn(
        "If test data does not come from the same distribution of the training data, it may lead to erroneous results."
    )
    test_knn_transformed = fast_knn(x_test.values, k=neighbors, **knn_kwargs)
    return (
        train_df,
        pd.DataFrame(data=test_knn_transformed, columns=columns),
    )
df = pd.read_csv(r'./DATA/diabetes.csv')

# before adding nan
print(df.head(10))

# Per-column probability with which a cell is replaced by NaN.
nan_percent = {
    'Pregnancies': 0.10,
    'Glucose': 0.15,
    'BloodPressure': 0.10,
    'SkinThickness': 0.12,
    'Insulin': 0.10,
    'BMI': 0.13,
    'DiabetesPedigreeFunction': 0.11,
    'Age': 0.11,
    'Outcome': 0.12
}

# Randomly knock out values at each column's configured rate.
for col in df:
    # Iterate the index directly — Series.iteritems() was removed in pandas 2.0
    # and the row value was never used anyway.
    for i in df[col].index:
        if random.random() <= nan_percent[col]:
            # Bug fix: df[col][i] = ... is chained assignment and may write to
            # a temporary copy; .loc writes through to df reliably.
            df.loc[i, col] = np.nan

# after adding nan
print(df.head(10))
df.to_csv(r'NaNdiabetes3.csv')

diab = np.genfromtxt('NaNdiabetes3.csv', delimiter=",")

sys.setrecursionlimit(100000)  # Increase the recursion limit of the OS

# start the KNN training
imputed_training = fast_knn(diab, k=3)
print(imputed_training)
pd.DataFrame(imputed_training).to_csv("file.csv")
def fit(self, dataframe):
    """
    Method to be fitted, default is the most frequent value for str and int
    columns and median for float columns
    Custom: use a dict to set columns and imputation method like:
    {'mean':[columnname1,columnname2],'knn':[columname3,columnname4],'most_frequent':[columname5,columname6]}
    All unrelated columns will be imputed using default method
    :param dataframe: The input dataframe
    """
    # Default fill values: most frequent value for object/int columns,
    # median for everything else.
    self.fill = pd.Series([dataframe[c].value_counts().index[0]\
        if dataframe[c].dtype in [np.dtype('O'),np.dtype('int8'),np.dtype('int32'),np.dtype('int64')]\
        else dataframe[c].median() for c in dataframe],index=dataframe.columns)
    if self.strategy is not None:
        if type(self.strategy) is not dict:
            raise ValueError(
                "Dict is required. Try {'method':[columname,...,],'method':['columname'...]} instead"
            )
        else:
            # NOTE(review): this comprehension maps method -> column, so when
            # a method lists several columns only the LAST one survives —
            # probably a bug; a column -> method mapping looks intended.
            self.strategy_single_imp = {
                method: column
                for method, aux in self.strategy.items() for column in aux
            }
            for method, column in self.strategy_single_imp.items():
                if column not in dataframe.columns:
                    raise ValueError(
                        "Column {} is not in dataset".format(column))
                if method not in [
                        'most_frequent', 'mean', 'median', 'mice', 'knn'
                ]:
                    raise ValueError("Unavailable method")
                # Override the default fill for the selected column only.
                for c in dataframe:
                    if (column == c and method == 'most_frequent'):
                        self.fill[c] = dataframe[c].value_counts().index[0]
                    elif (column == c and method == 'mean'):
                        self.fill[c] = dataframe[c].mean()
                    elif (column == c and method == 'median'):
                        self.fill[c] = dataframe[c].median()
            for method, columns in self.strategy.items():
                # NOTE(review): `column` below is left over from the previous
                # loop (`columns` is never used here) — this validation checks
                # a stale value, not the columns of this strategy entry.
                if column not in dataframe.columns:
                    raise ValueError(
                        "Column {} is not in dataset".format(column))
                if method not in [
                        'most_frequent', 'mean', 'median', 'mice', 'knn'
                ]:
                    raise ValueError("Unavailable method")
                if method == 'knn':
                    # k-NN imputes every floating column at once; the column
                    # selection from the strategy dict is not honoured here.
                    train_cols = list(
                        dataframe.select_dtypes(include=['floating']))
                    train = pd.DataFrame(
                        cs.fast_knn(dataframe[train_cols], k=5))
                    train.columns = train_cols
                    # NOTE(review): mutates the caller's dataframe in place.
                    dataframe[train_cols] = train.values
                if method == 'mice':
                    train_cols = list(
                        dataframe.select_dtypes(include=['floating']))
                    train = pd.DataFrame(cs.mice(dataframe[train_cols]))
                    train.columns = train_cols
                    dataframe[train_cols] = train.values
    return self
sys.setrecursionlimit(100000)  # Increase the recursion limit of the OS

cp = data.copy()
# Bug fix: positional `axis` for DataFrame.drop was removed in pandas 2.0;
# `columns=` is the equivalent, version-safe spelling.
cp = cp.drop(columns='Date')
print(np.count_nonzero(cp.values))
print(np.count_nonzero(~np.isnan(cp.values)))
"""
Imputation Using k-NN

The k nearest neighbours is an algorithm that is used for simple
classification. The algorithm uses ‘feature similarity’ to predict the values
of any new data points. This means that the new point is assigned a value
based on how closely it resembles the points in the training set. This can be
very useful in making predictions about the missing values by finding the k’s
closest neighbours to the observation with missing data and then imputing
them based on the non-missing values in the neighbourhood.
"""
# start the KNN training
imputed_training = fast_knn(cp.values, k=30).tolist()
#print(np.count_nonzero(~np.isnan(imputed_training)))

# return to pandas: restore the column labels and re-attach the Date column
index = data.columns.values.tolist()
df = pd.DataFrame(imputed_training)
df.columns = index[1:]
df.insert(loc=0, column='Date', value=data['Date'].values)
print(df)
# use arima
dataset_replace['LandSlope'].str.contains('Sev'), 3, dataset_replace['LandSlope']) # encode specific LabelEncoder same as auto from sklearn.preprocessing import LabelEncoder lc = LabelEncoder() dataset_replace['MSZoning'] = lc.fit_transform(dataset_replace['MSZoning']) # use knn import sys from impyute.imputation.cs import fast_knn sys.setrecursionlimit(100000) #Increase the recursion limit of the OS # start the KNN training imputed_training = fast_knn(train.values, k=30) # use Multiple Imputations (MIs) from impyute.imputation.cs import mice # start the MICE training imputed_training = mice(train.values) # Deep Neural Networks import datawig df_train, df_test = datawig.utils.random_split(train) #Initialize a SimpleImputer model imputer = datawig.SimpleImputer( input_columns=[
#Randomly replace 30% of the first column with NaN values column = X['Mean1'] print(column.size) missing_pct = int(column.size * 0.4) i = [random.choice(range(column.shape[0])) for _ in range(missing_pct)] column[i] = np.NaN print(column.shape[0]) print(column) # Import train_test_split function from impyute.imputation.cs import fast_knn sys.setrecursionlimit(100000) X= fast_knn(X, k=30) from sklearn.model_selection import train_test_split # Split dataset into training set and test set X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1000) # 70% training and 30% test #Create a Gaussian Classifier model = GaussianNB() # Train the model using the training sets6 model.fit(X_train,y_train) y_pred = model.predict(X_test)
def show():
    """Benchmark k-NN, MICE and SoftImpute recovery on 40x40 RSRP grids.

    Loads RSRP samples from a tab-separated log, converts them to dBm,
    masks a fraction of entries per ``missing_rates``, recovers with each
    imputer, and records mean absolute error plus wall-clock time.

    :return: numpy array of rows
        [missing_rates, knn_mae, nnm_mae, mice_mae, knn_time, nnm_time, mice_time].
    Relies on module-level helpers: gen_mask, plot_image, fast_knn, mice,
    SoftImpute, mean_absolute_error — assumed imported elsewhere in the file.
    """
    time_slots = 100
    data_path = "../data/DlRsrpSinrStats.txt"
    #data_set = np.loadtxt(data_path,delimiter=' ',skiprows=1);
    data_set = pd.read_table(data_path, delimiter='\t')
    print(data_set.shape)
    df = pd.DataFrame(data_set)
    rsrp = []
    # Construct the matrix with location and timeslot.
    # `bias` skips the first 149 slots of 3200 rows each — presumably a
    # warm-up period in the trace (TODO confirm).
    bias = 3200 * 149
    for i in range(time_slots):
        temp = np.array((df.loc[bias + i * 3200:bias + (i + 1) * 3200 -
                                1].sort_values('IMSI'))['rsrp'])
        # Keep the first 1600 entries — one 40x40 grid per time slot.
        temp = temp[0:1600]
        row = np.array(temp)
        rsrp.append(row)
    #rsrp = np.array((df.loc[0:time_slots*total_num-1].sort_values('IMSI'))['rsrp'])
    #print('rsrp shape:',rsrp.shape)
    #print(rsrp.shape)
    # Convert linear power to dBm: 10*log10(P) + 30.
    mrsrp = []
    for power in rsrp:
        temp_power = 10 * np.log10(power) + 30
        mrsrp.append(temp_power)
    mrsrp = np.array(mrsrp)
    # mrsrp = scale(mrsrp)
    print('mrsrp', mrsrp.shape)
    # generate a mask matrix
    m = 40
    n = 40
    missing_rates = [0.75]  #[0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95]
    #random_mat = np.random.uniform(size=[])
    xaxis = []
    knnxaxis = []
    knnyaxis = []
    # iiyaxis=[]
    nnmxaxis = []
    nnmyaxis = []
    micexaxis = []
    miceyaxis = []
    knntime = []
    nnmtime = []
    micetime = []
    for missing_rate in missing_rates:
        knny = []
        # iiy=[]
        nny = []
        micey = []
        nnmy = []
        ktime = 0
        ntime = 0
        mtime = 0
        for mat_rsrp in mrsrp:
            mat_rsrp = np.array(mat_rsrp).reshape(40, 40)
            mask = gen_mask(m, n, missing_rate)
            # mask = gen_mask2().reshape(40,40)
            print(mask)
            # Zero out the "missing" cells; they are turned into NaN below.
            sample_data = mat_rsrp * mask
            print('origin_data')
            plot_image(mat_rsrp)
            print('the sample_data')
            plot_image(sample_data)
            try:
                t1 = time.time()
                sample_data[sample_data == 0] = np.nan
                knn_recover = fast_knn(sample_data, k=3)
                print('knn')
                plot_image(knn_recover)
                error_knn = mean_absolute_error(mat_rsrp, knn_recover)
                print(error_knn)
                knny.append(error_knn)
                t2 = time.time()
                ktime = ktime + (t2 - t1)
            except ValueError:
                # On failure record a sentinel error of 2 and a penalty time.
                knny.append(2)
                t2 = time.time()
                ktime = ktime + 600 * (1 + missing_rate)
            try:
                t1 = time.time()
                mice_data = mice(sample_data)
                print('mice')
                plot_image(mice_data)
                error_mice = mean_absolute_error(mat_rsrp, mice_data)
                micey.append(error_mice)
                t2 = time.time()
                mtime = mtime + (t2 - t1)
            except ValueError:
                micey.append(2)
                t2 = time.time()
                mtime = mtime + 600 * (1 + missing_rate)
            try:
                t1 = time.time()
                X_filled_nnm = SoftImpute().fit_transform(sample_data)
                print('NuclearNormMinimization')
                plot_image(X_filled_nnm)
                error_nuclear = mean_absolute_error(mat_rsrp, X_filled_nnm)
                nnmy.append(error_nuclear)
                t2 = time.time()
                ntime = ntime + (t2 - t1)
            except:
                nnmy.append(2)
                t2 = time.time()
                ntime = ntime + 600 * (1 + missing_rate)
            # NOTE(review): this break stops after the FIRST time-slot matrix,
            # so each missing rate is evaluated on a single grid only —
            # confirm this is intentional.
            break
        # print("\tknn:",error_knn,"\tmice:",error_mice,"\titer:",error_iter,"\tnuclear",error_nuclear)
        knntime.append(ktime)
        nnmtime.append(ntime)
        micetime.append(mtime)
        knnyaxis.append(np.mean(np.array(knny)))
        miceyaxis.append(np.mean(np.array(micey)))
        # iiyaxis.append(np.mean(np.array(iiy)))
        nnmyaxis.append(np.mean(np.array(nnmy)))
        xaxis.append(missing_rate)
    # plt.plot(xaxis,iiy,c='red',label='iter')
    # plt.plot(xaxis,knny,c='blue',label='knn')
    # plt.plot(xaxis,nnmy,c='orange',label='nnm')
    # plt.plot(xaxis,micey,c='black',label='mice')
    #
    # plt.xlabel("missing rate")
    # plt.ylabel("mae")
    # plt.legend()
    # plt.show()
    res = np.array(
        [xaxis, knnyaxis, nnmyaxis, miceyaxis, knntime, nnmtime, micetime])
    return res
k_list = [5] #cont_list = ["auto", 0.01, 0.05, 0.1] cont_list = [0.05] n_estimators_list = [100] max_depth_list = [6] results = pd.DataFrame(columns=['p','k','cont','n_estimators', 'max_depth', 'cv_r2']) count = 0 for p in p_list: for k in k_list: for cont in cont_list: for n_estimators in n_estimators_list: for max_depth in max_depth_list: X_im = fast_knn(X, k) X_im.columns = columns X_train, X_test = split(X_im, nrow_train_raw) # Outlier detection X_train_out, y_train_out = outlier(X_train, y_train, cont) # Feature selection outRF = selectRF(p, X_train_out, y_train_out) X_train = outRF[0] ind_sel = outRF[1]
"""Fill NaN cells of a CSV with impyute's fast_knn.

Basic idea & GOAL: impute the array with a basic mean impute, then use the
resulting complete array to construct a KDTree and query it for each row's
`k` nearest neighbours. The imputed value is the weighted average of those
neighbours — essentially the nearest rows in terms of distance.
"""
import pandas as pd
import numpy as np
import random
import sys
from impyute.imputation.cs import fast_knn

# Load the CSV dataset containing NaN values as a numpy.ndarray.
diab = np.genfromtxt('NaN_k-nn_filling.csv', delimiter=",")

# Run the k-NN imputation with k = 5.
# Note: fast_knn gives the LOWEST weight to the nearest neighbour instead of
# the highest.
imputed_training = fast_knn(diab, k=5)

# Write the matrix with the replaced NaN values to a new CSV file.
pd.DataFrame(imputed_training).to_csv("file.csv")
def show():
    """Benchmark k-NN, MICE and SoftImpute on the transposed RSRP matrix.

    Unlike the grid-per-slot variant elsewhere in this file, this version
    scales the whole dBm matrix, transposes it, and sweeps a range of
    missing rates, recording MAE and wall-clock time per imputer.

    :return: numpy array of rows
        [missing_rates, knn_mae, mice_mae, nnm_mae, knn_time, mice_time, nnm_time].
    Relies on module-level helpers: gen_mask, plot_image, fast_knn, mice,
    SoftImpute, mean_absolute_error, scale — assumed imported elsewhere.
    """
    data_path = "../data/DlRsrpSinrStats.txt"
    grid_width = 40
    time_slots = 100
    ue_num = grid_width * grid_width
    uav_num = grid_width * grid_width
    total_num = ue_num + uav_num
    #data_set = np.loadtxt(data_path,delimiter=' ',skiprows=1);
    data_set = pd.read_table(data_path, delimiter='\t')
    print(data_set.shape)
    df = pd.DataFrame(data_set)
    # out = open('/home/tao/dataset/out2.txt','w')
    # csv_write = csv.writer(out)
    rsrp = []
    # Construct the matrix with location and timeslot.
    # `bias` skips the first 149 blocks of 3200 rows — presumably a warm-up
    # period in the trace (TODO confirm).
    bias = 3200 * 149
    for i in range(time_slots):
        temp = np.array(
            (df.loc[bias + i * total_num:bias + (i + 1) * total_num -
                    1].sort_values('IMSI'))['rsrp'])
        # Keep only the first 1600 entries of each slot.
        temp = temp[0:1600]
        row = np.array(temp)
        # csv_write.writerow(temp)
        rsrp.append(row)
    #rsrp = np.array((df.loc[0:time_slots*total_num-1].sort_values('IMSI'))['rsrp'])
    #print('rsrp shape:',rsrp.shape)
    #print(rsrp.shape)
    # Convert linear power to dBm: 10*log10(P) + 30, then standardize.
    mrsrp = []
    for power in rsrp:
        temp_power = 10 * np.log10(power) + 30
        mrsrp.append(temp_power)
    mrsrp = np.array(mrsrp)
    mrsrp = scale(mrsrp)
    print('mrsrp', mrsrp.shape)
    mat_rsrp = mrsrp.T
    print('fdfa')
    m = mat_rsrp.shape[0]
    n = mat_rsrp.shape[1]
    missing_rates = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]
    # [0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95]

    # Add some random noise to the data (currently unused — see commented-out
    # call below).
    def add_noise(origin_data, noise_level):
        np.random.seed(42)
        mu = 0
        sigma = noise_level
        noise = np.random.normal(mu, sigma, origin_data.shape)
        # convert the noise to dBm
        print(noise)
        return origin_data + noise

    #random_mat = np.random.uniform(size=[])
    #mat_rsrp = add_noise(mat_rsrp,3)
    xaxis = []
    yaxis = []
    knny = []
    nnmy = []
    micey = []
    knntime = []
    nnmtime = []
    micetime = []
    for missing_rate in missing_rates:
        mask = gen_mask(m, n, missing_rate)
        # Zero out the "missing" cells; they are turned into NaN below.
        sample_data = mat_rsrp * mask
        print(missing_rate)
        print('origin_data')
        plot_image(mat_rsrp)
        print(mat_rsrp)
        print('the sample_data')
        plot_image(sample_data)
        print(sample_data)
        try:
            t1 = time.time()
            sample_data[sample_data == 0] = np.nan
            knn_recover = fast_knn(sample_data, k=3)
            print('knn')
            plot_image(knn_recover)
            error_knn = mean_absolute_error(mat_rsrp, knn_recover)
            knny.append(error_knn)
            t2 = time.time()
            ktime = (t2 - t1)
            knntime.append(ktime)
        except ValueError:
            # On failure record a sentinel error of 2 and a penalty time.
            knny.append(2)
            t2 = time.time()
            knntime.append(1200 * (1 + missing_rate))
        try:
            t1 = time.time()
            mice_data = mice(sample_data)
            print('mice')
            plot_image(mice_data)
            error_mice = mean_absolute_error(mat_rsrp, mice_data)
            micey.append(error_mice)
            t2 = time.time()
            micetime.append(t2 - t1)
        except ValueError:
            micey.append(2)
            t2 = time.time()
            micetime.append(1200 * (1 + missing_rate))
        try:
            t1 = time.time()
            X_filled_nnm = SoftImpute().fit_transform(sample_data)
            print('SoftImpute')
            plot_image(X_filled_nnm)
            error_nuclear = mean_absolute_error(mat_rsrp, X_filled_nnm)
            nnmy.append(error_nuclear)
            t2 = time.time()
            nnmtime.append(t2 - t1)
        except:
            nnmy.append(2)
            t2 = time.time()
            nnmtime.append(1200 * (1 + missing_rate))
        xaxis.append(missing_rate)
    # plt.plot(xaxis,yaxis)
    # plt.xlabel("missing_rate")
    # plt.ylabel("mae")
    # plt.show()
    return np.array([xaxis, knny, micey, nnmy, knntime, micetime, nnmtime])
# Data cleaning: in these columns a recorded zero actually means "missing".
measured_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[measured_cols] = df[measured_cols].replace(0, np.NaN)
df.isnull().sum()

# Plot a bar chart of missing data per column.
import missingno as msno
p = msno.bar(df)

import impyute
import sys
from impyute.imputation.cs import fast_knn

sys.setrecursionlimit(100000)  # increase the recursion limit of the system

# Impute the missing cells with k-NN (k=30) and write them back into df.
imputed_training = fast_knn(df[measured_cols].values, k=30)
df_t1 = pd.DataFrame(imputed_training, columns=measured_cols)
df[measured_cols] = df_t1[measured_cols]

# Show the total nulls remaining per column.
df.isnull().sum()

# Data describing and visualization.
df.info()
df.describe()

# Correlation heatmap.
import seaborn as sns
df.corr()
sns.heatmap(df.corr(), annot=True)
targets = np.ravel(data.iloc[:, 0]) targets = np.where(targets == 'kontrol', 0, 1) features = np.array(data.iloc[:, 2:13]) columns = data.columns[2:13] # Implementing one-hot encoding and Knn imputation for the option A dummies_data_a = pd.get_dummies(data.iloc[:, 2:13]) dummies_columns_a = dummies_data_a.columns for clm in columns: dummies_data_a.loc[ data[clm].isnull(), dummies_data_a.columns.str.startswith("{}_".format(clm))] = np.nan sys.setrecursionlimit(100000) #Increase the recursion limit of the OS imputed_knn_a = np.round(fast_knn(np.array(dummies_data_a), k=30)).astype('int') # 3D PCA plots sc = StandardScaler() std_data = sc.fit_transform(imputed_knn_a) pca = PCA(n_components=3) pca_form = pca.fit_transform(std_data) only_cancer = pca_form[targets == 1] colors = ('orange', 'blue', 'green', 'brown', 'red', 'purple', 'gray') markers = ['o', 's', 'X', 'v', '<', 'P', '>'] def plot3d_colon(column_num, name, only_cancer=only_cancer, colors=colors,
#let's name the categorical and numeical attributes categorical_attributes = list(dataInt_Xy.select_dtypes(include=['category']).columns) numerical_attributes = list(dataInt_Xy.select_dtypes(include=['float64', 'int64']).columns) print('categorical_attributes:', categorical_attributes) print('numerical_attributes:', numerical_attributes) dataInt_Xy = dataInt_Xy[['temp_1', 'temp_2', 'mean_national_temp', 'humidity_1', 'humidity_2','consumption_secondary_1', 'consumption_secondary_2','consumption_secondary_3', 'date', 'hour', 'consumption_1', 'consumption_2']] #missing value msno.matrix(dataInt_Xy, figsize=(12,5)) # vizulaize null_values_apptr = dataInt_Xy.isnull().sum() #count missing the same for the same locate null_values_apptr = null_values_apptr[null_values_apptr != 0].sort_values(ascending = False).reset_index() #only show rows with null values null_values_apptr.columns = ["variable", "n_missing"] null_values_apptr.head() # matrice des donnees manquantes dataInt_Xy.isnull().sum() data_missing = dataInt_Xy[['temp_1', 'temp_2', 'mean_national_temp', 'humidity_1', 'humidity_2','consumption_secondary_1', 'consumption_secondary_2','consumption_secondary_3']].copy() # imputation par MICE imputed_training_mice=mice(data_missing.values) # Imputation par KNN sys.setrecursionlimit(100000) #Increase the recursion limit of the OS # start the KNN training imputed_training_KNN=fast_knn(data_missing.values, k=30)
column_type[i] = 1 # 1st row is skipped for row in reader: row_digit = [ float(v) if v != '' else np.nan for i, v in enumerate(row) if column_type[i] == 1 ] row_class = [ v if v != '' else np.nan for i, v in enumerate(row) if column_type[i] == 0 ] row = [i if i != '' else np.nan for i in row] train_digit.append(row_digit) train_class.append(row_class) origin.append(row) train_nd = np.asarray(train_digit) # In[41]: result = fast_knn(train_nd, k=5) # In[52]: train_digit = [list(row) for row in result] train = [] for i, row in enumerate(train_class): train.append(row + train_digit[i]) # In[ ]: