index)] ### now, let's seperate the numeric explanatory data from the string data string_featuresfore = explanatory_dffore.ix[:, explanatory_dffore.dtypes == 'object'] numeric_featuresfore = explanatory_dffore.ix[:, explanatory_dffore. dtypes != 'object'] # that are all NANs, as they will show up as all 'Nothing' when we start binning or look for features with no variation) string_featuresfore = string_featuresfore.fillna('Nothing') # cleaning up string features string_featuresfore = cleanup_data(string_featuresfore) # binarizing string features encoded_datafore = get_binary_values(string_featuresfore) ## imputing features imputer_object = imp(missing_values='NaN', strategy='median', axis=0) imputer_object.fit(numeric_featuresfore) numeric_featuresfore = pandas.DataFrame( imputer_object.transform(numeric_featuresfore), columns=numeric_featuresfore.columns) ## pulling together numeric and encoded data. explanatory_dffore = pandas.concat([numeric_featuresfore, encoded_datafore], axis=1) explanatory_dffore.head() #now, let's find features with no variance no_variationfore = find_zero_var(explanatory_dffore) explanatory_dffore.drop(no_variationfore['toDelete'], inplace=True) # deleting perfect correlation
## Seeing which explanatory feature rows got removed. Looks like none.
response_seriesfore.index[~response_seriesfore.index.isin(explanatory_dffore.index)]

### Separate the numeric explanatory data from the string data.
# FIX: .ix was removed in pandas 1.0; .loc is the supported equivalent for
# this boolean column selection.
string_featuresfore = explanatory_dffore.loc[:, explanatory_dffore.dtypes == 'object']
numeric_featuresfore = explanatory_dffore.loc[:, explanatory_dffore.dtypes != 'object']

# Fill string NaNs so they show up as 'Nothing' when we start binning or
# look for features with no variation.
string_featuresfore = string_featuresfore.fillna('Nothing')

# Cleaning up string features (project helper).
string_featuresfore = cleanup_data(string_featuresfore)

# Binarizing string features (project helper).
encoded_datafore = get_binary_values(string_featuresfore)

## Imputing numeric features with the column median.
imputer_object = imp(missing_values='NaN', strategy='median', axis=0)
imputer_object.fit(numeric_featuresfore)
numeric_featuresfore = pandas.DataFrame(
    imputer_object.transform(numeric_featuresfore),
    columns=numeric_featuresfore.columns)

## Pulling together numeric and encoded data.
explanatory_dffore = pandas.concat([numeric_featuresfore, encoded_datafore], axis=1)
explanatory_dffore.head()

# Now, let's find features with no variance and drop them.
# BUG FIX: drop() defaults to axis=0 (rows), but 'toDelete' holds COLUMN
# names (the sibling 'toRemove' drop below passes the axis explicitly),
# so axis=1 is required here.
no_variationfore = find_zero_var(explanatory_dffore)
explanatory_dffore.drop(no_variationfore['toDelete'], axis=1, inplace=True)

# Deleting perfectly correlated features (keyword axis for clarity).
no_correlationfore = find_perfect_corr(explanatory_dffore)
explanatory_dffore.drop(no_correlationfore['toRemove'], axis=1, inplace=True)
def mval(self):
    """Mean-impute missing values in columns 1-2 of ``self.x``.

    Fits an Imputer (mean strategy, column-wise) on ``self.x[:, 1:3]`` and
    returns the imputed copy of that slice; ``self.x`` itself is untouched.
    """
    target_cols = self.x[:, 1:3]
    mean_imputer = imp(missing_values="NaN", strategy="mean", axis=0).fit(target_cols)
    return mean_imputer.transform(target_cols)
# Forward-fill gaps in the OHLC price columns (carry last observation forward).
data.Open.fillna(method='ffill', inplace=True)
data.High.fillna(method='ffill', inplace=True)
data.Low.fillna(method='ffill', inplace=True)
data.Close.fillna(method='ffill', inplace=True)

# Split columns into explanatory (train) and target (test) frames.
train = data.iloc[:, 1:7]
test = data.iloc[:, 7:8]
print(train.head())
print(test.head())

# NOTE(review): Imputer was removed in scikit-learn 0.22; SimpleImputer is
# the modern replacement — confirm before upgrading sklearn.
from sklearn.preprocessing import Imputer as imp

# In[12]:
# BUG FIX: the original did 'imp = imp(...)', rebinding the imported class
# name to an instance and shadowing the import, which would break any later
# 'imp(...)' call. Bind the instance to its own name instead.
imputer = imp(missing_values='NaN', strategy='median', axis=0)

# In[13]:
# Median-impute the numeric training columns in place.
train.iloc[:, 1:6] = imputer.fit_transform(train.iloc[:, 1:6])

# In[14]:
train.isnull().sum()

# In[15]:
# Fill target NaNs with the column mean (idiomatic instance-method form of
# the original pd.DataFrame.mean(test) — identical result).
test.fillna(test.mean(), inplace=True)

# In[16]:
#%%
'''Loading dataset'''
import numpy as np
import pandas as pd

dataset = pd.read_csv("cattdat.csv")
# All columns but the last are features; the last column is the target.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values
print(X)

#%%
'''Dealing with missing data'''
# NOTE(review): Imputer was removed in scikit-learn 0.22; SimpleImputer
# (which has no axis argument) is the modern replacement — confirm before
# upgrading sklearn.
from sklearn.preprocessing import Imputer as imp

# Mean-impute the numeric columns 1-2.
imputer = imp(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

#%%
'''Encoding Y'''
from sklearn.preprocessing import LabelEncoder as le

# Label-encode the categorical feature in column 3.
labelencoder_x = le()
X[:, 3] = labelencoder_x.fit_transform(X[:, 3])

#%%
'''Dummy variable'''
# NOTE(review): categorical_features was removed in scikit-learn 0.22;
# ColumnTransformer + OneHotEncoder is the modern way to one-hot a single
# column — confirm before upgrading sklearn.
from sklearn.preprocessing import OneHotEncoder as hot

ohe = hot(categorical_features=[3])