def dataPreprocessing(df):
    """Impute ``df`` and then label-encode each of its columns.

    The frame is first handed to ``DataFrameImputer().fit_transform``
    (``DataFrameImputer`` is defined outside this block; presumably it fills
    missing entries -- confirm against its implementation).  Every column of
    the imputer's output is then replaced by the result of
    ``LabelEncoder().fit_transform`` on that column.  No scaling is applied.

    Args:
        df: The frame passed to the imputer.

    Returns:
        The imputer's output with ``LabelEncoder().fit_transform`` applied
        to it via ``apply``.
    """
    filled = DataFrameImputer().fit_transform(df)
    # One encoder object is created and its bound ``fit_transform`` is what
    # ``apply`` invokes for each column.
    encode_column = preprocessing.LabelEncoder().fit_transform
    return filled.apply(encode_column)
def dataLoading(self, file):
    """Read ``file`` line by line and return ``(features, labels)``.

    Each line has a fixed sequence of textual substitutions applied
    (category words become "0"/"1", "?" becomes "NaN", tabs and the
    "@data" marker are removed), then loses its last two characters and is
    split on commas.  The final field is collected as the label and the
    remaining fields as the feature row.

    NOTE(review): several steps are hard-coded to one particular input:
    the first 8 parsed rows are discarded (presumably header lines), label
    399 is forced to "1", and the labels are reshaped to exactly 400
    entries.  Confirm these before reusing the loader on other data.

    ``func`` and ``DataFrameImputer`` are defined outside this block.

    Args:
        file: Whatever ``open`` accepts as the path of the input file.

    Returns:
        A tuple of the ``DataFrameImputer().fit_transform`` output for the
        numeric feature frame, and a NumPy array of labels with shape
        ``(400,)``.
    """
    # Applied strictly in this order; the replacements are sequential and
    # an earlier one may change what a later one sees.
    substitutions = (
        ("@data", ""),
        ("yes", "0"),
        ("no,", "1,"),
        ("poor", "0"),
        ("good", "1"),
        (",normal", ",1"),
        (",abnormal", ",0"),
        (",notpresent", ",1"),
        (",present", ",0"),
        ("?", "NaN"),
        (",notckd", ",1"),
        (",ckd", ",0"),
        ("\t", ""),
    )

    feature_rows = []
    label_column = []
    with open(file) as handle:
        for raw in handle:
            for old, new in substitutions:
                raw = raw.replace(old, new)
            # Drop the last two characters of the line before splitting.
            fields = raw[:-2].split(",")
            label_column.append(fields[-1])
            feature_rows.append(fields[:-1])

    # Skip the first eight parsed rows.
    feature_rows = feature_rows[8:]
    label_column = label_column[8:]
    # Hard-coded override of the 400th remaining label.
    label_column[399] = "1"

    feature_rows = [list(func(entry)) for entry in feature_rows]
    label_column = [list(func(entry)) for entry in label_column]
    labels = np.asarray(label_column).reshape((400,))

    # Values that cannot be parsed as numbers become NaN (errors="coerce").
    numeric_rows = [pd.to_numeric(entry, errors="coerce") for entry in feature_rows]
    features_before = pd.DataFrame(numeric_rows)
    return DataFrameImputer().fit_transform(features_before), labels
def data_prepartion():
    """Build the transformer that prepares numeric and categorical columns.

    Two pipelines are assembled and joined with a ``FeatureUnion``:

    * ``num_pipeline``: ``DataFrameImputer`` constructed with the numeric
      column names, followed by ``StandardScaler``.
    * ``cat_pipeline``: ``DataFrameImputer`` constructed with the
      categorical column names, followed by ``OneHotEncoder`` (registered
      under the step name ``label_binarizer``).

    ``DataFrameImputer`` is defined outside this block; how it uses the
    column list it receives is not visible here.

    Returns:
        The unfitted ``FeatureUnion`` combining both pipelines.
    """
    numeric_columns = ["yearsExperience", "milesFromMetropolis"]
    categorical_columns = ["companyId", "jobType", "degree", "major", "industry"]

    numeric_branch = Pipeline(steps=[
        ('imputer', DataFrameImputer(numeric_columns)),
        ('std_scaler', StandardScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ('imputer', DataFrameImputer(categorical_columns)),
        ('label_binarizer', OneHotEncoder()),
    ])

    return FeatureUnion(
        transformer_list=[
            ("num_pipeline", numeric_branch),
            ("cat_pipeline", categorical_branch),
        ]
    )
# Separate the one-hot encoded frame back into training and test rows: the
# first ``dataframe.shape[0]`` rows are the training data, the rest the test
# data.
dataframe = one_hot.iloc[:dataframe.shape[0], :]
test = one_hot.iloc[dataframe.shape[0]:, ]

print("Imputing Data...")
# Missing values are filled by DataFrameImputer (defined outside this
# fragment).
X = pd.DataFrame(dataframe)
dataframe = DataFrameImputer().fit_transform(X)

# Numeric variables are the columns whose dtype is not "object".
column_dtypes = dataframe.dtypes
numeric_variables = list(column_dtypes[column_dtypes != "object"].index)

print("fitting baseline model on just numerical values...")
# Baseline: a random forest fitted on the numeric columns only.
# NOTE(review): with n_estimators=2 some rows may never be out-of-bag, which
# would make oob_prediction_ unreliable for them -- confirm this is intended.
model = RandomForestRegressor(
    n_estimators=2,
    oob_score=True,
    random_state=42,
)
model.fit(dataframe[numeric_variables], labels)

# For regression, oob_score_ is the R^2 of the out-of-bag predictions; here
# the out-of-bag predictions themselves are scored with ROC AUC instead.
labels_oob = model.oob_prediction_
print("c-stat ", roc_auc_score(labels, labels_oob))
print("Out of bag score...")
# Script fragment: report the shapes of the train/test subsets of `projects`,
# drop the columns listed in `drop_labels` (plus 'date_posted') one at a time,
# run both `projects` and `outcomes` through the same DataFrameImputer
# instance via fit_transform, then begin a loop over the categorical column
# names in `proj_cat_labels`.
# NOTE(review): the fragment is collapsed onto one physical line, so as
# written everything after the first '#' is parsed as a comment; the original
# line breaks must be restored before it can run.
# NOTE(review): the body of the final `for label in proj_cat_labels:` loop is
# not part of this fragment.
# NOTE(review): `DataFrame.ix` was deprecated in pandas 0.20 and removed in
# 1.0; replace with `.loc` (labels / boolean mask) or `.iloc` (positions)
# once it is confirmed what `train_idx` / `test_idx` hold.
print('train data shape', projects.ix[train_idx].shape) print('test data shape', projects.ix[test_idx].shape) print('raw data loaded') print('dropping unnecessary columns...') drop_labels = ['school_ncesid', 'schoolid', 'school_city', 'school_latitude', 'school_longitude', 'school_zip', 'school_district', 'school_county', 'secondary_focus_subject', 'secondary_focus_area'] drop_labels.append('date_posted') # drop_labels.append('school_city') for label in drop_labels: projects.drop(label, axis=1, inplace=True) print('imputing missing elements...') # mean for number, most frequent for nan dfi = DataFrameImputer() projects = dfi.fit_transform(projects) outcomes = dfi.fit_transform(outcomes) print('factorizing catagorical values...') proj_cat_labels = ['teacher_acctid', 'school_state', 'school_metro', 'school_charter', 'school_magnet', 'school_year_round', 'school_nlns', 'school_kipp', 'school_charter_ready_promise', 'teacher_prefix', 'teacher_teach_for_america', 'teacher_ny_teaching_fellow', 'primary_focus_subject', 'primary_focus_area', 'resource_type', 'poverty_level', 'grade_level', 'eligible_double_your_impact_match', 'eligible_almost_home_match' ] # proj_cat_labels.remove('school_city') for label in proj_cat_labels:
# Notebook export fragment (cells In[282]-In[285]): import DataFrameImputer,
# load "train.csv" and "test.csv", stack them into `total`, set missing
# 'SalePrice' values to 0 before imputing so the imputer does not fill them,
# impute the combined frame, and add an 'nbh_score' column by mapping
# 'Neighborhood' through the `nbh_score` dict (scores 0-3).
# NOTE(review): the fragment is collapsed onto one physical line and starts
# with '#', so as written the whole line is parsed as a comment; the original
# line breaks must be restored before it can run.
# NOTE(review): the `next_to_airport` dict literal at the end is cut off;
# its remaining entries are not part of this fragment.
# NOTE(review): `pd.DataFrame.from_csv` was removed in pandas 1.0 (use
# `pd.read_csv(path, index_col=0, parse_dates=True)`), and `DataFrame.append`
# was removed in pandas 2.0 (use `pd.concat`).
# NOTE(review): `Series.map` with a dict yields NaN for neighborhoods that are
# not keys of `nbh_score` -- confirm every value in the data is listed.
# In[282]: import sklearn.preprocessing as pre from DataFrameImputer import DataFrameImputer # In[283]: # Import and replace NaNs with most frequent value for labels and mean for numericals train_original = pd.DataFrame.from_csv("train.csv") test_original = pd.DataFrame.from_csv("test.csv") lenght_train = len(train_original) total = train_original.append(test_original) total['SalePrice'] = total['SalePrice'].fillna(value=0) # don't impute price of the test data total_dframe = DataFrameImputer().fit_transform(total) # In[284]: # Neighborhood nbh_score = {None: 0, "CollgCr": 2, "Veenker": 2, "Crawfor": 2, "NoRidge": 3, "Mitchel": 1, "Somerst": 2, "NWAmes": 2, "OldTown": 1, "BrkSide": 1, "Sawyer": 1, "NridgHt": 3, "NAmes": 1, "SawyerW": 2, "IDOTRR": 1, "MeadowV": 1, "Edwards": 1, "Timber": 2, "Gilbert": 2, "StoneBr": 3, "ClearCr": 2, "NPkVill": 1, "Blmngtn": 2, "BrDale": 1, "SWISU": 1, "Blueste": 1} total_dframe['nbh_score'] = total_dframe['Neighborhood'].map(nbh_score) # In[285]: next_to_airport = { 'Blmngtn': 0, # Bloomington Heights
# Script fragment: drop the columns listed in `drop_labels` (plus
# 'date_posted') from `projects` one at a time, run both `projects` and
# `outcomes` through the same DataFrameImputer instance via fit_transform,
# then begin a loop over the categorical column names in `proj_cat_labels`.
# NOTE(review): the fragment is collapsed onto one physical line, so as
# written everything after the first '#' is parsed as a comment; the original
# line breaks must be restored before it can run.
# NOTE(review): the body of the final `for label in proj_cat_labels:` loop is
# not part of this fragment.
print('raw data loaded') print('dropping unnecessary columns...') drop_labels = [ 'school_ncesid', 'schoolid', 'school_city', 'school_latitude', 'school_longitude', 'school_zip', 'school_district', 'school_county', 'secondary_focus_subject', 'secondary_focus_area' ] drop_labels.append('date_posted') # drop_labels.append('school_city') for label in drop_labels: projects.drop(label, axis=1, inplace=True) print('imputing missing elements...' ) # mean for number, most frequent for nan dfi = DataFrameImputer() projects = dfi.fit_transform(projects) outcomes = dfi.fit_transform(outcomes) print('factorizing catagorical values...') proj_cat_labels = [ 'teacher_acctid', 'school_state', 'school_metro', 'school_charter', 'school_magnet', 'school_year_round', 'school_nlns', 'school_kipp', 'school_charter_ready_promise', 'teacher_prefix', 'teacher_teach_for_america', 'teacher_ny_teaching_fellow', 'primary_focus_subject', 'primary_focus_area', 'resource_type', 'poverty_level', 'grade_level', 'eligible_double_your_impact_match', 'eligible_almost_home_match' ] # proj_cat_labels.remove('school_city') for label in proj_cat_labels:
def fill_missing_values(self, df):
    """Run ``df`` through a freshly constructed ``DataFrameImputer``.

    ``DataFrameImputer`` is defined outside this block; presumably it fills
    missing entries -- confirm against its implementation.  No instance
    state is read or written.

    Args:
        df: The frame passed to ``fit_transform``.

    Returns:
        Whatever ``DataFrameImputer().fit_transform(df)`` returns.
    """
    imputer = DataFrameImputer()
    return imputer.fit_transform(df)