import pandas as pd
import numpy as np
import missingpy
from datetime import datetime  # fixed: was `from _datetime import datetime` (private C module)

# Earlier experiment inputs, kept for reference:
# mask = np.loadtxt('mask_pattern.csv', delimiter=',')
# data = pd.read_csv('final_data_missing.csv')
# mask = np.loadtxt('mask_pattern_mcar.csv', delimiter=',')

# Dataset with missing values (80% missingness pattern, n=500 rows).
data = pd.read_csv('final_data_missing80n500.csv')
print(data)

# Impute missing entries with MissForest (random-forest based imputation),
# printing wall-clock start/end times to gauge runtime.
imputer = missingpy.MissForest()

now1 = datetime.now()
current_time = now1.strftime("%H:%M:%S")
print("Starting Time =", current_time)

Data_Imputed = imputer.fit_transform(data)

now1 = datetime.now()
current_time = now1.strftime("%H:%M:%S")
print("Final Time =", current_time)

# fit_transform returns a bare ndarray; restore the original column names
# so the output CSV keeps a meaningful header (was lost before).
df = pd.DataFrame(Data_Imputed, columns=data.columns)
df.to_csv('MissForestImputed80n500.csv', index=False)
# Candidate predictor columns drawn from the training frame.
features_train = [
    "full_sq", "metro_min_walk", "big_market_km", "workplaces_km",
    "university_km", "cafe_count_1000", "shopping_centers_km", "office_km",
    "big_church_km", "school_education_centers_top_20_raion",
    "build_count_after_1995", "cafe_count_1500_price_500", "market_count_500",
    "oil_chemistry_km", "railroad_km", "ts_km", "young_all", "work_male",
    "work_female", "ekder_all", "build_count_mix", "build_count_1971-1995",
    "build_count_1946-1970", "build_count_1921-1945", "build_count_before_1920",
]

# Take an explicit copy: without it the fillna assignment below is chained
# assignment on a view of `train` (SettingWithCopyWarning, and the write may
# silently fail to stick).
features_subset = train[features_train].copy()
# features_subset[features_subset["build_count_mix"].isna()][["build_count_mix","build_count_1971-1995","build_count_1946-1970"]].head()

# Mean-impute this one column up front so MissForest has a fully numeric input.
features_subset["metro_min_walk"] = features_subset["metro_min_walk"].fillna(
    features_subset["metro_min_walk"].mean())

# Random-forest imputation of the remaining missing values.
rf_imp = missingpy.MissForest(criterion="mse")
filled_features = rf_imp.fit_transform(features_subset)
data_imp = pd.DataFrame(filled_features, columns=features_subset.columns)

# NOTE(review): data_imp has a fresh RangeIndex; these assignments align on
# index, so this assumes `train` is also RangeIndex-ed — confirm upstream.
data_imp["target"] = train["price_doc"]
data_imp["timestamp"] = train["timestamp"]

# L1-regularized fit; features with non-zero coefficients are kept.
l_feat = Lasso(alpha=.4)
l_feat.fit(data_imp.drop(["timestamp", "target"], axis=1), data_imp["target"])
coefs = l_feat.coef_

cols = data_imp.drop(["timestamp", "target"], axis=1).columns
selected_features = [col for col, coef in zip(cols, coefs) if coef != 0]
# selected_features
new_index.head(5)

# Redo the merge of hourly data onto the reference timestamp index so every
# timestamp in new_index gets a row (absent hours become NaN).
Hourlys = new_index.merge(Hourlys, on='timestamp', how='left',
                          sort=False, copy=True)
Hourlys = Hourlys.set_index(Hourlys.timestamp).drop(columns='timestamp')

# Columns that still contain missing values (input for the missingno viz).
missingdata_df2 = [col for col in Hourlys.columns if Hourlys[col].isnull().any()]
# msno.matrix(Hourlys[missingdata_df2])
gc.collect()

# Predict missing hourly pollutant readings from the observed ones
# with random-forest imputation.
imputer = mp.MissForest()
polluted_imputed = imputer.fit_transform(Hourlys)
# polluted_imputed

# fit_transform returns a bare ndarray: rebuild a labelled frame with the
# original columns/index, then order the columns alphabetically.
Hourlys_cols = Hourlys.columns.values
Hourlys = pd.DataFrame(polluted_imputed, columns=Hourlys_cols,
                       index=Hourlys.index)
Hourlys = Hourlys[sorted(Hourlys.columns)]
# Hourlys.head(3)
# Hourlys.shape
gc.collect()
# group hourly data by daily averages to merge with daily pollutants
import numpy as np
import missingpy


if __name__ == "__main__":
    # Load the numeric matrix containing missing entries.
    data = np.loadtxt('data0.0_50.csv', delimiter=',')
    # data = data[:, 0:2]

    # Random-forest imputation with progress output; result is discarded
    # (this script only exercises the imputer).
    imputer = missingpy.MissForest(verbose=1)
    imputer.fit_transform(data)