# Pipeline: encode categorical columns as integer codes, fill the remaining
# gaps with KNN imputation, standardise, project with PCA, then split the
# projected rows into a first-1000 part and a remainder part.
data2 = data.copy()
for i in range(m):
    col_values = data.iloc[:, i]
    # Positional access to the first row; `[0]` would be a *label* lookup and
    # breaks (or picks another row) when the index is not 0..n-1.
    first_entry = col_values.iloc[0]
    # Heuristic kept from the original: a column is treated as categorical
    # when its first entry is a string or is missing.  pd.isnull also copes
    # with None, where np.isnan would raise TypeError.
    if isinstance(first_entry, str) or pd.isnull(first_entry):
        categoricalIdx.append(i)
        # factorize() marks missing entries with -1 by default; the default
        # is used instead of the `na_sentinel` keyword, which newer pandas
        # releases no longer accept.
        modifiedoCol = pd.factorize(col_values)
        cat_codes = modifiedoCol[0]
        # Categories become 1..K and missing entries NaN.  The mask is applied
        # to this column only: a frame-wide replace(-1, NaN) would also wipe
        # out genuine -1 values stored in the numeric columns.
        data2.iloc[:, i] = np.where(cat_codes == -1, np.nan, cat_codes + 1)

#for i in range(0, m):
#    for j in range(0, n):
#        if (isinstance(data.iloc[:,i][0],str)):
#            print("i=",i," j=",j)

# Fill the NaNs (original gaps and missing categories) from the k nearest rows.
knnImpute = KNN(k)
data_knnImp = knnImpute.complete(data2.values)
data_knnImp = StandardScaler().fit_transform(data_knnImp)

pca = PCA(n_components)
principalComponents = pca.fit_transform(data_knnImp)
# Bare expression: has no effect as a script statement.
# NOTE(review): presumably left over from notebook inspection -- confirm.
pd.DataFrame(principalComponents)

# First 1000 rows are the training part, rows 1000..n the held-out part.
# Despite its name, `test` holds the *training targets* paired with `train`.
train = principalComponents[0:1000, :]
test = target[0:1000]
XTest = principalComponents[1000:n, :]
YTest = target[1000:n]
pd.DataFrame(train)
#nn = MLPRegressor(hidden_layers, activation='relu', solver='adam', alpha=0.001, batch_size='auto',
#                  learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=10000, shuffle=True,
#                  random_state=0, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
# Build a synthetic n x m matrix of rank `inner_rank` as the product of two
# random Gaussian factors.
inner_rank = 4
X = np.random.randn(n, inner_rank).dot(np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % np.mean(np.square(X)))

# Hide roughly 10% of the entries of X at random; hidden entries are marked
# with NaN in X_incomplete while X itself is left intact.
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = np.where(missing_mask, np.nan, X)

# Baseline: fill the gaps using the "mean" strategy.
meanFill = SimpleFill("mean")
X_filled_mean = meanFill.complete(X_incomplete)

# Fill each row's missing features from the 3 nearest rows that have them.
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.complete(X_incomplete)

# Low-rank matrix completion via convex optimisation that still matches the
# observed values.  Slow!
X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

# Alternative to solving the nuclear-norm objective directly: induce
# sparsity through singular value thresholding.
softImpute = SoftImpute()

# BiScaler normalises rows and columns of the observed data at the same
# time, which is sometimes useful for low-rank imputation methods.
biscaler = BiScaler()

# Rescale both rows and columns to zero mean and unit variance.
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)
# Load the test set as a NumPy array, skipping the first CSV column.
# `.values` is used instead of `DataFrame.as_matrix()`, which is deprecated
# and no longer exists in pandas 1.0+; both yield the same array.
test = pd.read_csv("Data/test.csv").iloc[:, 1:].values
test_incomplete = test.copy()

# Cut X row-wise into six consecutive chunks that are imputed separately.
# NOTE(review): presumably done to bound the cost of KNN on the full matrix
# -- confirm.  Neighbours are therefore only searched within a chunk.
X1 = X[:1565, :]
X2 = X[1565:2809, :]
X3 = X[2809:4434, :]
X4 = X[4434:6106, :]
X5 = X[6106:7655, :]
X6 = X[7655:, :]

print("seting knn object...")
knnImpute = KNN(k=3)

# Progress message paired with the chunk it announces; processed in order.
_knn_jobs = (
    ("imputing X1 ...", X1),
    ("imputing X2 ...", X2),
    ("imputing X3...", X3),
    ("imputing X4 ...", X4),
    ("imputing X5 ...", X5),
    ("imputing X6 ...", X6),
)
_knn_filled = []
for _message, _chunk in _knn_jobs:
    print(_message)
    _knn_filled.append(knnImpute.complete(_chunk))
X1_knn, X2_knn, X3_knn, X4_knn, X5_knn, X6_knn = _knn_filled
# Impute with MICE, starting from a median fill and using the "pmm" impute
# type.
mice_result = MICE.MICE(
    verbose=False,
    init_fill_method="median",
    impute_type="pmm",
    n_imputations=7,  # 7 = number of imputations (predictions) generated
                      # for the final mean/median -- per the original note
).complete(np.matrix(dfnm))
print(mice_result.shape)
mice_result

# ### Generate KNN result

# In[53]:

# `.values` replaces `DataFrame.as_matrix()`, which is deprecated and no
# longer exists in pandas 1.0+; both yield the same array.
knnImpute = KNN(k=7)  # k = number of nearest neighbours (not a cluster size)
knn_result = knnImpute.complete(dfnm.values)

# ### Generate KMean result

# In[54]:

# Only the third element returned by kmeans_missing is kept.
_, _, kmean_result = kmeans.kmeans_missing(
    dfnm.values, n_clusters=7)  # 7 = number of clusters (not a cluster size)

# ### Create new DataFrame from iMICE result

# In[55]:

# NOTE(review): `i_mice_result` is not defined in this section; it is
# expected to come from an earlier cell -- confirm.
newdf = pd.DataFrame(i_mice_result, columns=['Gender', 'AGE', 'Department', 'Sample'])