def transform(categorical_columns, numerical_columns, data):
    """One-hot encode and standard-scale ``data`` and return it as a DataFrame.

    Parameters
    ----------
    categorical_columns : columns handed to the one-hot encoder (``ohe``).
    numerical_columns : columns handed to the scaler (``ss``).
    data : the input accepted by ``ColumnTransformer.fit_transform``.

    Returns
    -------
    (df_trans_scaled, col_names)
        ``df_trans_scaled`` holds one DataFrame column per transformed
        feature; ``col_names`` is the list returned by
        ``get_column_names_from_ColumnTransformer`` with
        ``numerical_columns`` appended.
    """
    cat = ('categorical', ohe(), categorical_columns)
    num = ('numeric', ss(), numerical_columns)
    col_trans = ColumnTransformer([cat, num])
    transformed = col_trans.fit_transform(data)

    # ColumnTransformer can return a SciPy sparse matrix when the one-hot
    # block dominates; column slicing below needs a plain 2-D array.
    if hasattr(transformed, 'toarray'):
        transformed = transformed.toarray()

    col_names = get_column_names_from_ColumnTransformer(col_trans)
    col_names.extend(numerical_columns)

    # Pair every name with its column instead of spelling out 18 fixed
    # indices, so the function works for any number of output features.
    df_trans_scaled = pd.DataFrame(dict(zip(col_names, transformed.T)))
    return df_trans_scaled, col_names
def fit(self, data):
    """Prime the one-hot encoder and the scaler from ``data``.

    Neither estimator's own ``fit`` is called; their learned attributes
    (``categories_``, ``drop_idx_``, ``mean_``, ``scale_``,
    ``n_features_in_``) are written directly.

    Sets on ``self``: ``p``, ``cidx``, ``nidx``, ``cenc``, ``p2``,
    ``nenc``, ``cn``, ``lst_enc``, ``lst_cidx`` and ``lst_iter``.
    """
    # Anything that is not a DataFrame is wrapped so .dtypes/.iloc work.
    frame = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)

    self.p = frame.shape[1]
    is_object = frame.dtypes == 'object'
    self.cidx = np.where(is_object)[0]   # positions of object-dtype columns
    self.nidx = np.where(~is_object)[0]  # positions of all other columns

    # Categorical encoder: categories follow value_counts() order and
    # index 0 of every column is registered as the dropped level.
    self.cenc = encoder = ohe(sparse=False,
                              dtype=int,
                              handle_unknown='ignore',
                              drop=self.drop)
    encoder.categories_ = [
        list(frame.iloc[:, col].value_counts().index) for col in self.cidx
    ]
    encoder.drop_idx_ = np.repeat(0, len(encoder.categories_))

    # Total feature size: (levels - 1) per categorical column + numeric columns
    n_dummies = sum(len(levels) - 1 for levels in encoder.categories_)
    self.p2 = n_dummies + len(self.nidx)

    # Numeric scaler: statistics come from pandas (std() is the sample std).
    numeric = frame.iloc[:, self.nidx]
    self.nenc = scaler = ss()
    scaler.mean_ = numeric.mean().values
    scaler.scale_ = numeric.std().values
    scaler.n_features_in_ = self.nidx.shape[0]

    # Output column names: encoded categorical names first, then numeric.
    cat_names = list(
        encoder.get_feature_names(frame.columns[self.cidx].astype(str)))
    num_names = frame.columns[self.nidx].to_list()
    self.cn = cat_names + num_names

    self.lst_enc = [self.cenc, self.nenc]
    self.lst_cidx = [self.cidx, self.nidx]
    # True for each column group that actually has columns
    self.lst_iter = [len(z) > 0 for z in self.lst_cidx]
def createOneHotEncoding(y_labels, n_classes=46):
    '''
    Fit and return a OneHotEncoder for integer class labels.

    y_labels  - array of labels; it is reshaped to a single column of
                y_labels.shape[0] rows before fitting.
    n_classes - number of distinct classes; the encoder always produces
                this many output columns (labels 0 .. n_classes-1).
                Defaults to 46, the value the original code hard-coded.

    returns the fitted encoder.

    NOTE(review): the earlier docstring spoke of labels in 0-35 while the
    code used 46 - confirm which class count is intended.
    '''
    from sklearn.preprocessing import OneHotEncoder as ohe

    # The original `ohe(46)` relied on the positional `n_values` argument,
    # which newer scikit-learn no longer accepts; an explicit category
    # list gives the same fixed-width encoding.
    enc = ohe(categories=[list(range(n_classes))])
    enc.fit(y_labels.reshape(y_labels.shape[0], 1))
    return enc
def __init__(self, url, names, label_tag, drop_tags=None, encode_tags=None,
             normalizer=Normalizer(), normal_tags=None, test_size=0.2):
    """Store the configuration on the instance; no data is loaded here.

    Every argument is saved under an attribute of the same name.  In
    addition ``self.data`` starts as ``None`` and ``self.enc`` is a new
    ``ohe(categories='auto')`` encoder.

    NOTE(review): the ``normalizer`` default is created once, when the
    function is defined, so instances that rely on the default share the
    same object.
    """
    # Data source and schema
    self.url = url
    self.names = names
    self.label_tag = label_tag
    self.data = None

    # Column handling
    self.drop_tags = drop_tags
    self.encode_tags = encode_tags
    self.normal_tags = normal_tags

    # Estimators and split ratio
    self.enc = ohe(categories='auto')
    self.normalizer = normalizer
    self.test_size = test_size
def pivot_data(df, cols_and_vals):
    '''
    One-hot encode ("pivot") selected columns of a dataframe.

    df            - a pandas dataframe
    cols_and_vals - a list of (str, int) pairs: column name and number of
                    values to pivot for that column

    Each column is first passed through encode(); the encoded columns are
    combined with make_array() and the result is one-hot encoded.

    returns pivoted array and dict {column_name: pivoted_values}
    '''
    encoded_cols = []
    pivoted_vals = {}
    for pair in cols_and_vals:
        col, n_vals = pair[0], pair[1]
        encoded_col, new_vals = encode(df[col], n_vals)
        encoded_cols.append(encoded_col)
        pivoted_vals[col] = new_vals

    encoded_array = make_array(encoded_cols)

    # `categorical_features` / `n_values` were removed from OneHotEncoder;
    # encoding every column with automatically detected categories is what
    # the old ('all', 'auto') settings asked for.
    oneHot = ohe(categories='auto')
    pivoted_array = oneHot.fit_transform(encoded_array)
    return pivoted_array, pivoted_vals
# import the dataset
# A forward slash resolves on every OS; the previous 'data\Data.csv'
# contained an invalid '\D' escape and only worked on Windows.
dataset = pd.read_csv('data/Data.csv')
X = dataset.iloc[:, :-1].values  # every column except the last
y = dataset.iloc[:, 3].values    # column 3 is the target

# replace missing data in X (columns 1-2) using the mean of each column
# NOTE(review): `im(..., axis=0)` and `ohe(categorical_features=...)` below
# match the scikit-learn API from before 0.22 - confirm the pinned version.
imputer = im(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# encode categorical data: column 0 becomes integer codes
labelencode_X = le()
X[:, 0] = labelencode_X.fit_transform(X[:, 0])

# dummy encoding the data: one-hot encode column 0
ohotencode = ohe(categorical_features=[0])
X = ohotencode.fit_transform(X).toarray()

# encode the target labels as integers
labelencode_Y = le()
y = labelencode_Y.fit_transform(y)

# splitting the data into train and test set (80 / 20, fixed seed)
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)

# feature scaling: fit on the training split only, reuse it for the test split
standardscale_X = ss()
X_train = standardscale_X.fit_transform(X_train)
X_test = standardscale_X.transform(X_test)
from sklearn.compose import ColumnTransformer as ct
from sklearn.model_selection import train_test_split as tts

# Read the file and split it into independent (x) and dependent (y) variables:
# x is every column except the last, y is column 3.
dataset = pd.read_csv('Data.csv')
x = dataset.iloc[:, 0:-1].values
y = dataset.iloc[:, 3].values

# Fill NaN values in columns 1.. of x with the column mean
x[:, 1:] = sip(missing_values=np.nan, strategy='mean').fit_transform(x[:, 1:])

# Show the raw dataset
print("dataset:\n", dataset)

# One-hot encode column 0 of x; the remaining columns pass through unchanged
x = ct([('Country', ohe(), [0])], remainder='passthrough').fit_transform(x)

# Encode y as integer labels
y = le().fit_transform(y)

# Disabled snippet kept as a string literal: counts NaN values per column
"""total = dataset.isnull().sum().sort_values(ascending=False) percent = (dataset.isnull().sum()/dataset.isnull().count()).sort_values(ascending=False)*100 missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) missing_data.head(20) print("total :\n",missing_data) """

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)
def fit(self, x):
    """Fit the encoder/scaler for every column of ``x``.

    Each column is classified as 'float', 'str' or 'lst' and one encoder
    family is prepared per kind:

    * 'str'   -> ``self.cenc``: one-hot encoder whose ``categories_`` are
                 assigned directly from ``np.unique``.
    * 'float' -> ``self.nenc``: when ``self.quantize`` is set, a dict of
                 ``KD`` / ``ohe`` encoders; otherwise an ``ss`` scaler
                 whose ``mean_`` / ``scale_`` are assigned directly.
    * 'lst'   -> ``self.tenc``: one ``cv`` vectoriser per column.

    The encoders in use are collected in ``self.all_enc`` and the combined
    output feature names in ``self.cn_transform``.  ``x`` is accessed via
    ``.iloc`` / ``.loc`` / ``.dtypes`` / ``.columns``.
    """
    self.n = x.shape[0]
    self.p = x.shape[1]
    # Column kind is inferred twice: from the Python type name of the
    # first row's value and from the declared dtype; both must be numeric
    # for a column to count as 'float'.
    dt1 = pd.Series([type(x.iloc[0][kk]).__name__ for kk in range(self.p)])
    dt2 = x.dtypes.astype(str).reset_index(drop=True)
    self.dt = pd.Series(
        np.where(
            dt1.isin(['int64', 'float64']) & dt2.isin(['int64', 'float64']),
            'float', 'str'))
    if not all(self.dt.values == 'float'):
        # Non-float columns containing a literal '|' in any value are
        # re-labelled 'lst'.
        self.dt[~(self.dt.values == 'float')] = \
            np.where(x.loc[:, ~(self.dt.values == 'float')].apply(lambda x: x.str.contains('\\|', na=False).any()),
                     'lst',self.dt[~(self.dt.values == 'float')])
    self.cn = np.array(x.columns)
    stopifnot(all(self.dt.isin(['float', 'lst', 'str'])))
    self.cidx = np.where(self.dt == 'str')[0]
    self.nidx = np.where(self.dt == 'float')[0]
    self.tidx = np.where(self.dt == 'lst')[0]
    # Sanity check: the three index sets together cover every column.
    stopifnot(
        all(
            np.sort(reduce(np.union1d, [self.cidx, self.nidx, self.tidx])) == np.arange(self.p)))
    # Flags for which encoder families are in use; switched off below
    # when a family has no columns.
    self.iter = {'cenc': True, 'nenc': True, 'tenc': True}
    self.all_enc = {}

    #############################################################
    # --- Encoder (i): Categorical/ordinal integer features --- #

    if len(self.cidx) > 0:
        # The encoder is not fitted; its categories are assigned directly.
        self.cenc = ohe(sparse=self.sparse,
                        dtype=self.dtype,
                        handle_unknown='ignore',
                        drop=None)
        self.cenc.categories_ = [
            np.unique(x.iloc[:, kk]) for kk in self.cidx
        ]
        self.cmode = [x.iloc[:, kk].mode()[0] for kk in self.cidx]
        # NOTE(review): cmode_idx is computed but not read again in this
        # method.
        cmode_idx = np.array([
            np.where(vec == mm)[0][0]
            for vec, mm in zip(self.cenc.categories_, self.cmode)
        ])
        cum_idx = np.append([0],
                            np.cumsum(
                                [len(z) for z in self.cenc.categories_]))
        # drop_idx is empty, so no column is removed below and p equals
        # the total number of categories.
        self.cenc.drop_idx = []
        self.cenc.drop_idx_ = None
        self.cenc.p = cum_idx.max() - len(
            self.cenc.drop_idx
        )  # How many features after dropping most common
        self.cenc.cn = list(
            np.delete(self.cenc.get_feature_names(self.cn[self.cidx]),
                      self.cenc.drop_idx))
        self.all_enc['cenc'] = self.cenc
    else:
        self.iter['cenc'] = False

    ###############################################
    # --- Encoder (ii): Continuous numerical ---- #

    if len(self.nidx) > 0:
        if self.quantize:
            # Split numeric columns by number of distinct values.
            u_nidx = np.array(
                [len(x.iloc[:, kk].unique()) for kk in self.nidx])
            self.nidx1 = self.nidx[u_nidx > 31]  # quantize
            self.nidx2 = self.nidx[u_nidx <= 31]  # one-hot-encode
            self.nenc = {'enc': {}, 'cn': {}}
            if len(self.nidx1) > 0:
                # KD is presumably a bin discretiser (it exposes
                # bin_edges_) - TODO confirm against the imports.
                self.nenc1 = KD(n_bins=self.nbins, strategy='quantile')
                if not self.sparse:
                    self.nenc1.encode = 'onehot-dense'
                self.nenc1.fit(x.iloc[:, self.nidx1])
                # Feature names: "<column>_q<k>" for k = 1..number of bins
                self.nenc1.cn = ljoin([
                    cn + '_q' + pd.Series(qq).astype(str)
                    for cn, qq in zip(self.cn[self.nidx1], [
                        np.arange(len(z) - 1) + 1
                        for z in self.nenc1.bin_edges_
                    ])
                ])
                self.nenc['enc']['nenc1'] = self.nenc1
                self.nenc['cn']['nenc1'] = self.nenc1.cn
            if len(self.nidx2) > 0:
                self.nenc2 = ohe(sparse=self.sparse,
                                 handle_unknown='ignore',
                                 drop=None)
                self.nenc2.fit(x.iloc[:, self.nidx2])
                self.nenc2.cn = self.nenc2.get_feature_names(
                    self.cn[self.nidx2])
                self.nenc['enc']['nenc2'] = self.nenc2
                self.nenc['cn']['nenc2'] = self.nenc2.cn
            # Replace the per-encoder name dict with one combined sequence.
            self.nenc['cn'] = ljoin(list(self.nenc['cn'].values()))
            self.all_enc['nenc'] = self.nenc
        else:
            # The scaler is not fitted; mean_/scale_ come from pandas
            # (DataFrame.std uses the sample standard deviation).
            self.nenc = ss(copy=False)
            self.nenc.mean_ = x.iloc[:, self.nidx].mean(axis=0).values
            self.nenc.scale_ = x.iloc[:, self.nidx].std(axis=0).values
            self.nenc.n_features_in_ = self.nidx.shape[0]
            self.nenc.p = self.nidx.shape[0]
            self.nenc.cn = list(self.cn[self.nidx])
            self.all_enc['nenc'] = self.nenc
    else:
        self.iter['nenc'] = False

    ################################################
    # --- Encoder (iii): Tokenize text blocks ---- #

    if len(self.tidx) > 0:
        # One vectoriser per 'lst' column, keyed by column name.
        self.tenc = dict(
            zip(self.cn[self.tidx], [
                cv(tokenizer=lambda x: tok_fun(x),
                   lowercase=False,
                   token_pattern=None,
                   binary=True) for z in range(self.tidx.shape[0])
            ]))
        self.tenc = {'cv': self.tenc}
        for kk, jj in enumerate(self.cn[self.tidx]):
            self.tenc['cv'][jj].fit(x.loc[:, jj].astype('U'))
        # Total vocabulary size across all text columns
        self.tenc['p'] = sum(
            [len(z.vocabulary_) for z in self.tenc['cv'].values()])
        # Feature names: "<column>_<token>"
        self.tenc['cn'] = ljoin([
            l + '_' + pd.Series(list(z.vocabulary_.keys()))
            for z, l in zip(self.tenc['cv'].values(),
                            self.tenc['cv'].keys())
        ])
        self.all_enc['tenc'] = self.tenc
    else:
        self.iter['tenc'] = False

    # Store all transform methods in a dictionary to iterate over via self.iter
    self.enc_transform = {
        'cenc': self.cenc_transform,
        'nenc': self.nenc_transform,
        'tenc': self.tenc_transform
    }
    # Get the valid categories (names of the encoder families in use)
    self.tt = np.array(list(self.iter.keys()))[np.where(
        list(self.iter.values()))[0]]
    # Get full feature names; an encoder stores them either as a `.cn`
    # attribute or under a 'cn' key when it is a dict.
    cn = []
    for ee in self.tt:
        if hasattr(self.all_enc[ee], 'cn'):
            cn.append(self.all_enc[ee].cn)
        else:
            cn.append(self.all_enc[ee]['cn'])
    cn = ljoin(cn)
    self.cn_transform = cn
@author: Gus Yudha """ """ Import Dataset nasabah bank """ import pandas as pd dataset = pd.read_csv('bank_customers.csv') X = dataset.iloc[:, 3: 13].values # Pilah Fitur yang penting (Dari CreditScore - EstimatedSalary) y = dataset.iloc[:, 13].values # Pilah Jawaban (Exited) """ Data preprocessing """ from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import OneHotEncoder as ohe from sklearn.compose import ColumnTransformer as ct le = LabelEncoder() X[:, 1] = le.fit_transform(X[:, 1]) # Ubah nama negara menjadi numerik X[:, 2] = le.fit_transform(X[:, 2]) # Ubah gender menjadi numerik Setarakan = ct([('Pilah Jadi 3', ohe(), [1])], remainder="passthrough") X = Setarakan.fit_transform(X[:, 0:]) # Setarakan kategori negara X = X[:, 1:] # hilangkan 1 fitur variabel sampah """ Pilah Data latihan dengan Data Ujian """ from sklearn.model_selection import train_test_split as tts Soal_latihan, Soal_ujian, Jawaban_latihan, Jawaban_ujian = tts(X, y, test_size=0.2, random_state=0) """ Standarisasi Soal latihan dan Soal Ujian """ from sklearn.preprocessing import StandardScaler ss = StandardScaler() Soal_latihan = ss.fit_transform(Soal_latihan) # Standarisasi Soal Latihan Soal_ujian = ss.transform(Soal_ujian) # Standarisasi Soal Ujian """ Inisialisasi Arsitektur ANN (11-6-6-1) """ from keras.models import Sequential
k = targ * (1 - y) grad = -np.ravel(np.dot(k.T, X).T) #grad = -np.sum(np.sum(targ*(1-y),axis=1).reshape([n_obs,1])*X,axis=0) return grad def class_efficiency(t_act, t_pred): cols = ['t_act', 't_pred'] df = pd.DataFrame(np.concatenate( [training[1], pred_cat.reshape([n_obs, 1])], axis=1), columns=cols) ct = pd.crosstab(df.t_act, df.t_pred) return ct ohe1 = ohe(handle_unknown='ignore') ohe1 = ohe1.fit(training[1]) targ = ohe1.transform(training[1]).toarray() X = scores_trunc.copy() dim = len(X.T) mms1 = mms() X = mms1.fit_transform(X) train = np.concatenate([X, targ], axis=1) n_cats = len(targ.T) n_obs = len(train) #dim = n_obs-n_cats df_train = pd.DataFrame(train) old_col = np.arange(dim, n_obs).tolist()
def catf(self, t):
    """One-hot encode column ``t`` of ``self.x`` and return a dense array.

    The encoded columns come first, followed by the remaining columns of
    ``self.x`` in their original order; the result is cast to float.

    ``OneHotEncoder(categorical_features=[t])`` is no longer available in
    current scikit-learn, so the column selection is delegated to a
    ``ColumnTransformer``.
    """
    from sklearn.compose import ColumnTransformer

    selector = ColumnTransformer(
        [('onehot', ohe(), [t])],
        remainder='passthrough',
        sparse_threshold=0,  # always return a dense ndarray
    )
    return selector.fit_transform(self.x).astype(float)
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.preprocessing import MultiLabelBinarizer as mlb
""" OneHotEncoder(n_values=’auto’, categorical_features=’all’, dtype=<class ‘numpy.float64’>, sparse=True, handle_unknown=’error’) """
# lianjia_df = pd.DataFrame({'Elevator':[1,2],'Renovation':[4,5]},dtype=np.float32,copy=True)
# print(lianjia_df.values)
# print(lianjia_df['Elevator'])
# l = pd.get_dummies(lianjia_df['Elevator'])  # one-hot encoding via pandas
# print(l)
x = np.random.uniform(1, 10, [3, 5]).astype(np.int32)
y = np.arange(1, 10, 0.5)
# print(x)
#
# print(y)
# encoder = ohe(sparse=False)  # specify whether the result is sparse
# encoder.fit(x)
# print(encoder.active_features_)
# print(encoder.feature_indices_)
# print(encoder.n_values_)
# print(encoder.transform([[1,2,3,4,5]]))

# The `sparse=` keyword is not accepted by every scikit-learn release, so
# keep the encoder's default (sparse) output and densify it explicitly.
encoder = ohe()
encoder.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
arra = encoder.transform([[0, 1, 3]]).toarray()
print(arra)
# Predictors: every column of `ed` except the first
X = ed.iloc[:,1:]
X.shape     # 19100 X 12

# 9.1 Which columns are numerical and which categorical?
#     Numeric = float64/int64 dtypes, categorical = object dtype.
num_columns = X.select_dtypes(include = ['float64','int64']).columns
num_columns

cat_columns = X.select_dtypes(include = ['object']).columns
cat_columns

# 10. Start creating transformation objects
# 10.1 Tuple for categorical columns: (name, transformer, columns)
cat = ("cattrans", ohe(), cat_columns)
# 10.2 Tuple for numeric columns
num = ("numtrans", ss() , num_columns)
# 10.3 Instantiate column transformer object; numeric output columns come
#      first because `num` is listed before `cat`
colTrans = ct([num,cat])
# 10.4 Fit and transform
X_trans = colTrans.fit_transform(X)
X_trans.shape   # 19100 X 19

## 11.0 Label encoding
# 11.1 Map labels to 1 and 0; any label other than "continue"/"drop"
#      becomes NaN under Series.map
y = y.map({"continue" : 1, "drop" : 0})
y.head()
data = pd.read_csv("Churn_Modelling.csv")
x = data.iloc[:, 3:13].values  # feature columns 3..12
y = data.iloc[:, 13].values    # column 13 is the target

# label encoding
from sklearn.preprocessing import LabelEncoder as le
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.compose import ColumnTransformer

le_x_1 = le()  # label encoder object created for country
x[:, 1] = le_x_1.fit_transform(x[:, 1])  # encode the 2nd feature column
le_x_2 = le()  # label encoder object created for gender
x[:, 2] = le_x_2.fit_transform(x[:, 2])

# One-hot encode the country column (index 1).  `categorical_features=` is
# no longer accepted by OneHotEncoder, so the column is selected through a
# ColumnTransformer; encoded columns come first, the rest pass through.
ohec = ColumnTransformer(
    [('country', ohe(), [1])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense ndarray
)
x = ohec.fit_transform(x).astype(float)
# Drop the first dummy column to avoid the dummy variable trap (for three
# classes two dummy variables are enough; the third is implied).
x = x[:, 1:]

# data splitting
from sklearn.model_selection import train_test_split as tts
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)

# feature scaling: standardise using statistics from the training split
from sklearn.preprocessing import StandardScaler as sc
sc_x = sc()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)
# Render the embedded Tableau story through the notebook's HTML cell magic.
get_ipython().run_cell_magic(
    u'html', u'',
    u"<div class='tableauPlaceholder' id='viz1535718122614' style='position: relative'><noscript><a href='#'><img alt='Story 2 ' src='https://public.tableau.com/static/images/Ti/Titanic2_32/Story2/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='Titanic2_32/Story2' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/Ti/Titanic2_32/Story2/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='filter' value='publish=yes' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1535718122614'); var vizElement = divElement.getElementsByTagName('object')[0]; vizElement.style.width='1016px';vizElement.style.height='991px'; var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>"
)

# **Converting categorical data to numeric form**

# In[ ]:

from sklearn.preprocessing import LabelEncoder as le
from sklearn.preprocessing import OneHotEncoder as ohe

# Each affected column is cast to str, label-encoded, one-hot encoded and
# written back, separately for `train` and `test` (train first).
# NOTE(review): the one-hot result is 2-D but is assigned to a single
# column, and train/test are encoded with independently fitted encoders -
# confirm this is intended.
for c in train.columns:
    if train[c].dtype != 'object':
        continue
    for frame in (train, test):
        z1 = le().fit_transform(frame[c].astype(str))
        frame[c] = ohe(sparse=False).fit_transform(z1.reshape(len(z1), 1))

# 'Age' and 'Fare' get the same treatment regardless of their dtype.
for column in ('Age', 'Fare'):
    for frame in (train, test):
        z1 = le().fit_transform(frame[column].astype(str))
        frame[column] = ohe(sparse=False).fit_transform(
            z1.reshape(len(z1), 1))

# **Dropping Some unnecessary Features**
# cabin has more than 70% of the data missing

# In[ ]:
maximum = np.max(x,axis=0) rang = maximum-minimum z = (x-minimum)/rang return z raw_data = open("trilogyData.csv") data = np.loadtxt(raw_data,delimiter=",",skiprows=1, dtype=np.str) x0 = np.ones((len(data),1)) x = data [:,1:72] y = data [:,72] y = y.astype(float) ohe = ohe(categories = 'auto') state = ohe.fit_transform(data[:,1].reshape((len(data),1))).toarray().astype(np.float) grade = ohe.fit_transform(data[:,2].reshape((len(data),1))).toarray().astype(np.float) cols = data[:, [2,3,4,5]] norm = normalizeData(cols.astype(int)) x = np.delete(x, [0,1,2,3,4,5], axis=1) arr = np.concatenate((state,grade,norm,x),axis=1) arr = arr.astype(float) arr = np.concatenate((x0,arr), axis=1) x_train, x_test, y_train, y_test = model_selection.train_test_split(arr, y,train_size=0.7, test_size=0.3,
from sklearn.linear_model import LinearRegression
import statsmodels.regression.linear_model as lm

# Read the file and split it into independent (x) and dependent (y) variables:
# x is every column except the last, y is column 4.
dataset = pd.read_csv('50_Startups.csv')
x = dataset.iloc[:, 0:-1].values
y = dataset.iloc[:, 4].values

# Disabled snippet kept as a string literal: mean imputation of NaN values
""" #fill nan values by mean x[: , 1: ]= sip(missing_values=np.nan,strategy='mean').fit_transform(x[: , 1: ]) """

# Show the raw dataset
print("dataset:\n", dataset)

# One-hot encode column 3 of x; the encoded columns come first and the
# remaining columns pass through.
# NOTE(review): .astype('int') casts the whole matrix to integers, which
# truncates any fractional part of the passthrough columns - confirm.
x = ct([('Country', ohe(), [3])], remainder='passthrough').fit_transform(x).astype('int')

# Disabled snippet kept as a string literal: label-encoding of y
""" #encode y to zeroes and ones y=le().fit_transform(y) """

# Avoid the dummy variable trap by dropping the first one-hot column
x = x[:, 1:]

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)
print("x:\n", x)
print("x_train before scaling:\n", x_train)