import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder

X = x.reshape(-1, 1)
y = (y_no_noise + rnd.normal(size=len(x))) / 2
plt.plot(X, y, 'o', c='r')
plt.show()

line = np.linspace(-5, 5, 1000, endpoint=False).reshape(-1, 1)
mlpr = MLPRegressor().fit(X, y)
knr = KNeighborsRegressor().fit(X, y)
plt.plot(line, mlpr.predict(line), label='MLP')
plt.plot(line, knr.predict(line), label='KNN')
plt.plot(X, y, 'o', c='r')
plt.legend(loc='best')
plt.show()

# Bin the single feature into 10 equal-width intervals, then one-hot encode the bin index
bins = np.linspace(-5, 5, 11)
target_bin = np.digitize(X, bins=bins)
print(bins)
onehot = OneHotEncoder(sparse_output=False)  # `sparse=False` before scikit-learn 1.2
onehot.fit(target_bin)
X_in_bin = onehot.transform(target_bin)
new_line = onehot.transform(np.digitize(line, bins=bins))
new_mlpr = MLPRegressor().fit(X_in_bin, y)
new_knr = KNeighborsRegressor().fit(X_in_bin, y)
plt.plot(line, new_mlpr.predict(new_line), label='New MLP')
plt.plot(line, new_knr.predict(new_line), label='New KNN')
plt.plot(X, y, 'o', c='r')
plt.legend(loc='best')
plt.show()
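# The digitize + OneHotEncoder combination above has a one-step equivalent in
# scikit-learn: KBinsDiscretizer. A minimal sketch, assuming the X, y and line
# arrays defined above (strategy='uniform' reproduces the equal-width bins):
from sklearn.preprocessing import KBinsDiscretizer

kb = KBinsDiscretizer(n_bins=10, encode='onehot-dense', strategy='uniform')
X_binned = kb.fit_transform(X)
line_binned = kb.transform(line)
binned_mlpr = MLPRegressor().fit(X_binned, y)
plt.plot(line, binned_mlpr.predict(line_binned), label='MLP (KBinsDiscretizer)')
plt.plot(X, y, 'o', c='r')
plt.legend(loc='best')
plt.show()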
LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])
X[:, 2] = LabelEncoder_X.fit_transform(X[:, 2])
X[:, 4] = LabelEncoder_X.fit_transform(X[:, 4])
X[:, 5] = LabelEncoder_X.fit_transform(X[:, 5])
X[:, 7] = LabelEncoder_X.fit_transform(X[:, 7])
X[:, 8] = LabelEncoder_X.fit_transform(X[:, 8])
X[:, 10] = LabelEncoder_X.fit_transform(X[:, 10])
X[:, 12] = LabelEncoder_X.fit_transform(X[:, 12])

# One-hot encode column 5, then drop the first dummy column to avoid the dummy variable trap
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([('encoder', OneHotEncoder(), [5])], remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=float)  # np.float was removed in NumPy 1.24
X = X[:, 1:]

# One-hot encode column 8 (index counted in the already-transformed matrix)
ct = ColumnTransformer([('encoder', OneHotEncoder(), [8])], remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=float)
X = X[:, 1:]

# Splitting the data into train and test
from sklearn.model_selection import train_test_split
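# The manual X = X[:, 1:] slices above drop the first dummy column to avoid the
# dummy variable trap; OneHotEncoder's drop='first' option (added in
# scikit-learn 0.21) does this per encoded column in one step. A sketch under
# the assumption that the two categorical columns sit at indices 5 and 8 of the
# original matrix (the second index above refers to the transformed matrix):
ct = ColumnTransformer([('encoder', OneHotEncoder(drop='first'), [5, 8])],
                       remainder='passthrough')
X_encoded = ct.fit_transform(X)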
# Imputer was removed in scikit-learn 0.22; SimpleImputer is the replacement
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer = imputer.fit(X[:, 7:8])
X[:, 7:8] = imputer.transform(X[:, 7:8])

# see corr plot of numeric variables -- not working
#sns.set(style="ticks", color_codes=True)
#g = sns.pairplot(dataset_pred, hue="isFraud")

# use encoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
#labelencoder_X_2 = LabelEncoder()
#X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])

# create dummy variables for the 'type' column, which has 5 categories;
# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22,
# so apply the encoder through a ColumnTransformer instead
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('type_ohe', OneHotEncoder(), [1])], remainder='passthrough')
X = ct.fit_transform(X)

# splitting train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# check that fraud and non-fraud cases are spread evenly across train and test
np.count_nonzero(y_test == 1)
np.count_nonzero(y_test == 0)
np.count_nonzero(y_train == 1)
np.count_nonzero(y_train == 0)

# feature scaling
# split into attributes and labels
target = 'age_group'
X = df.drop(['srcid', target], axis=1)
X = X.drop(['p_state', 'sDevType', 'sOSName', 'education', 'gender'], axis=1)
# X = X.drop(['Q6b','Q6c','Q6d','Q7b','Q7c','Q7d','Q10b','Q10c','Q10d'], axis=1)
y = df[target]
'''
srcid,Q1,Q2,Q4,Q6a,Q6b,Q6c,Q6d,Q7a,Q7b,Q7c,Q7d,Q7e,Q10a,Q10b,Q10c,Q10d,
p_state,sDevType,sOSName,gender,age_group,education'''

# # Convert features to ordinal values
# ordinalencoder_X = OrdinalEncoder()
# X = ordinalencoder_X.fit_transform(X)
# X = X.astype(int)

# Convert features to one-hot encoded values
one_hot_encoder_X = OneHotEncoder()
X = one_hot_encoder_X.fit_transform(X).toarray()
X = X.astype(int)
print(X)

# Convert target to ordinal values
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

nb_list = [MultinomialNB, ComplementNB, GaussianNB, BernoulliNB]
result_list = [[] for x in range(len(nb_list))]
for j in range(20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    for i in range(len(nb_list)):
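        # The loop body is cut off in the original; a plausible completion
        # (hypothetical reconstruction, not the original code): fit each
        # naive Bayes variant and record its held-out accuracy.
        clf = nb_list[i]().fit(X_train, y_train)
        result_list[i].append(clf.score(X_test, y_test))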
E = df["death"] T = df["futime"] X = df >> drop(X.death, X.futime, X.chapter) \ >> mutate(mgus=X.mgus.astype(float), age=X.age.astype(float)) X = X[T > 0] E = E[T > 0] T = T[T > 0] #Y = np.c_[np.log(T) - np.mean(np.log(T)), C] Y = Y_join(T, E) X_num = X.select_dtypes(include=["float"]) X_cat = X.select_dtypes(exclude=["float"]) imputer = SimpleImputer(strategy="median") X_num = imputer.fit_transform(X_num.values) imputer = SimpleImputer(strategy="most_frequent") X_cat = imputer.fit_transform(X_cat.values) encoder = OneHotEncoder(sparse=False) X_cat = encoder.fit_transform(X_cat) X = np.c_[X_num, X_cat] elif args.dataset == "support": df = pd.read_csv("./data/surv/support2.csv") df = df.rename(columns={"d.time": "dtime"}) T = df["dtime"] E = df["death"] #Y = np.c_[np.log(T) - np.mean(np.log(T)), C] Y = Y_join(T, E) df >>= drop(X.dtime, X.death, X.hospdead, X.prg2m, X.prg6m, X.dnr, X.dnrday, X.aps, X.sps, X.surv2m, X.surv6m, X.totmcst) X_num = df.select_dtypes(include=["float", "int"]) X_cat = df.select_dtypes(exclude=["float", "int"]) imputer = SimpleImputer(strategy="median")
from keras.models import load_model
from keras import optimizers
from keras_gradient_noise import add_gradient_noise
noisy = add_gradient_noise(optimizers.RMSprop)
from sklearn.preprocessing import OneHotEncoder
from config import window_size, feature_len
import numpy as np

m = load_model("model", custom_objects={"NoisyRMSprop": noisy})
number_of_notes = 50
rand = np.random.randint(0, feature_len, size=[window_size])
# `n_values` was removed in scikit-learn 0.22; declare the fixed category
# range explicitly instead so every window encodes to feature_len columns
ohe = OneHotEncoder(categories=[np.arange(feature_len)], sparse_output=False)
music = []
music.extend(list(rand))
for i in range(number_of_notes):
    a = np.array(music[i:i + window_size]).reshape([-1, 1])
    rand = ohe.fit_transform(a)
    pred = m.predict(rand.reshape([1, window_size, feature_len]))
    music.append(np.argmax(pred))
music = music[window_size:]

with open("classes.txt", "r") as f:
    classes = f.readlines()

# one-hot decode
# then label decode
# build a stream from the corresponding notes and chords
# write the stream to a MIDI file
# save
labels = []
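# Refitting the encoder on every window (as in the generation loop above) is
# unnecessary once the categories are fixed; a sketch that fits once up front
# and only transforms inside the loop, reusing the names defined above:
ohe.fit(np.arange(feature_len).reshape(-1, 1))
for i in range(number_of_notes):
    window = np.array(music[i:i + window_size]).reshape(-1, 1)
    pred = m.predict(ohe.transform(window).reshape([1, window_size, feature_len]))
    music.append(np.argmax(pred))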
labelEncoder_previsores = LabelEncoder()
# column 1 is a categorical attribute
previsores[:, 1] = labelEncoder_previsores.fit_transform(previsores[:, 1])
previsores[:, 3] = labelEncoder_previsores.fit_transform(previsores[:, 3])
previsores[:, 5] = labelEncoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 6] = labelEncoder_previsores.fit_transform(previsores[:, 6])
previsores[:, 7] = labelEncoder_previsores.fit_transform(previsores[:, 7])
previsores[:, 8] = labelEncoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelEncoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelEncoder_previsores.fit_transform(previsores[:, 13])

# Label encoding alone is inefficient here because these variables are nominal:
# we cannot say, for example, that one race is "greater" than another, so the
# integer codes need to be expanded into dummy columns.
# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22;
# use a ColumnTransformer instead (`sparse=False` before scikit-learn 1.2).
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
    [('ohe', OneHotEncoder(sparse_output=False), [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough')
previsores = ct.fit_transform(previsores)

labelEncoder_classe = LabelEncoder()
classe = labelEncoder_classe.fit_transform(classe)

standardScaler = StandardScaler()
previsores = standardScaler.fit_transform(previsores)

########################### CREATING THE TEST SET ###############################
from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.15, random_state=0)
    'SubscriberIndex', 'SubgroupIndex'
]

# separate categorical and numeric features
Mcat = np.array(Jcodes_w_L[cat_features].tolist())
Mnum = np.array(Jcodes_w_L[numeric_features].tolist())
L = np.array(Jcodes_w_L[label].tolist())

# Set up one-hot encoding
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
# https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621
# https://towardsdatascience.com/encoding-categorical-features-21a2651a065c
ohe = OneHotEncoder(sparse_output=False)  # dense output is easier to read; `sparse=False` before scikit-learn 1.2
Mcat = ohe.fit_transform(Mcat)
ohe.inverse_transform(Mcat)  # sanity check: recovers the original categories
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
ohe_features = ohe.get_feature_names_out(cat_features).tolist()

M = np.concatenate((Mcat, Mnum), axis=1)  # concatenate the columns
#M = np.concatenate((Mcat_subset, Mnum_subset), axis=1)
L = Jcodes_w_L[label].astype(int)
n_folds = 5

# EDIT: pack the arrays together into "data"
data = (M, L, n_folds)
import pandas as pd
dataset = pd.read_csv('Churn_Modelling.csv')

# Dividing the dataset
X = dataset.iloc[:, 3:-1].values
Y = dataset.iloc[:, -1].values

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
lb_X = LabelEncoder()
X[:, 1] = lb_X.fit_transform(X[:, 1])
X[:, 2] = lb_X.fit_transform(X[:, 2])
# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22;
# encode column 1 through a ColumnTransformer instead
ct = ColumnTransformer([('ohe', OneHotEncoder(), [1])], remainder='passthrough')
X = ct.fit_transform(X)
lb_Y = LabelEncoder()
Y = lb_Y.fit_transform(Y)

# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# scaling: fit on the training set only, then reuse the same statistics on the
# test set (fit_transform on X_test would leak test-set information)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
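# A Pipeline makes the fit-on-train / transform-on-test discipline automatic.
# A minimal sketch, assuming a LogisticRegression classifier (any estimator
# would do) on the same split:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
pipe.fit(X_train, Y_train)
print(pipe.score(X_test, Y_test))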
def pcafilter():
    # Setting up inputs
    parser = argparse.ArgumentParser()
    parser.add_argument('tilefile', metavar='TILEFILE',
                        help='File containing files of tile var matrix')
    parser.add_argument('tilepath', metavar='TILEPATH',
                        help='File containing information about tile locations')
    args = parser.parse_args()

    rcParams.update({'figure.autolayout': True})
    if not os.path.exists('Images'):
        os.makedirs('Images')

    tiledata_file = args.tilefile
    tilepath_file = args.tilepath

    print("Reading in Data...")
    tiledata = np.load(tiledata_file)
    pathdata = np.load(tilepath_file)

    tile_path = np.trunc(pathdata / (16**5))
    idx1 = tile_path >= 863  # currently unused; only the upper-bound filter is applied
    idx2 = tile_path <= 810
    idx3 = idx2
    idxOP = np.arange(pathdata.shape[0])
    idxOP = idxOP[idx3]
    pathdata = pathdata[idx3]
    print(tiledata.shape)
    tiledata = tiledata[:, idx3]
    print(tiledata.shape)
    tiledata = tiledata + 2

    # Only keep columns with at least 99% non-missing data
    nnz = np.count_nonzero(tiledata, axis=0)
    fracnnz = np.divide(nnz.astype(float), tiledata.shape[0])
    idxKeep = fracnnz >= 0.99
    tiledata = tiledata[:, idxKeep]

    print("Encoding in 1-hot...")
    print("Determining new path and varval vectors...")
    print(tiledata.shape)

    def count_unique(col):
        return np.unique(col).shape

    invals = np.apply_along_axis(count_unique, 0, tiledata)
    invals = invals[0]

    # Collect the unique values of every column into one flat vector
    varvals = np.full(50 * tiledata.shape[1], np.nan)
    nx = 0
    varlist = []
    for j in range(0, tiledata.shape[1]):
        u = np.unique(tiledata[:, j])
        varvals[nx:nx + u.size] = u
        nx = nx + u.size
        varlist.append(u)
    varvals = varvals[~np.isnan(varvals)]
    print(varvals.shape)

    enc = OneHotEncoder(sparse_output=True, dtype=np.uint16)  # `sparse=True` before scikit-learn 1.2
    Xtrain = enc.fit_transform(tiledata)
    print(Xtrain.shape)

    # Keep only the one-hot columns whose underlying tile value is greater than 1
    to_keep = varvals > 1
    idkTK = np.nonzero(to_keep)
    idkTK = idkTK[0]
    Xtrain = Xtrain[:, idkTK]
    scipy.sparse.save_npz('XtrainPCA.npz', Xtrain)
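    # After fitting, enc.categories_ already holds each column's sorted unique
    # values, so the manual varvals bookkeeping above can be cross-checked
    # against it -- a sanity-check sketch, not part of the original script:
    assert np.array_equal(np.concatenate(enc.categories_), varvals)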
'''
   pclass     sex  age  sibsp  parch   fare embarked
0       2  Female   17      0      0  12.00        C
1       3  Female   37      0      0   9.59        S
2       3    Male   18      1      1  20.21        S
3       3    Male   30      0      0   7.90        S
4       3    Male   25      0      0   7.65        S
'''
print(x_train_df.columns)
'''
Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'], dtype='object')
'''
transformer = make_column_transformer(
    (OneHotEncoder(), ['pclass', 'sex', 'embarked']),
    remainder='passthrough')
transformer.fit(x_train_df)
x_train = transformer.transform(x_train_df)
x_test = transformer.transform(x_test_df)
y_train = y_train_df.values
y_test = y_test_df.values
print(x_train.shape)
print(y_train.shape)
import matplotlib.pyplot as plt

"""**Importing Dataset**"""

dataset = pd.read_csv("50_Startups.csv")
x = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]
print(x)

"""**Encoding Categorical Data**"""

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough')
x = np.array(ct.fit_transform(x))
print(x)

"""**Separate Test Set and Training Set**"""

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

"""**Training the Multiple Linear Regression Model**"""

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 20 18:04:03 2018

@author: admin
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv("affairs.csv")
features = data.iloc[:, :-1].values
labels = data.iloc[:, -1].values

# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22;
# encode columns 6 and 7 through a ColumnTransformer instead
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([('ohe', OneHotEncoder(), [6, 7])], remainder='passthrough')
features = ct.fit_transform(features)
features = features[:, 1:]  # drop the first dummy column

from sklearn.model_selection import train_test_split
f_train, f_test, l_train, l_test = train_test_split(features, labels,
                                                    test_size=0.25,
                                                    random_state=0)

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(f_train, l_train)

l_pred = classifier.predict(f_test)
def clean_data(data):
    # Copy data
    X = data.to_pandas_dataframe()
    X.set_index('Id', inplace=True)
    print(X.head())
    print()

    # Remove rows with missing target, separate target from predictors
    X.dropna(axis=0, subset=['SalePrice'], inplace=True)
    y = X.SalePrice

    # Remove target and 'Utilities'
    X.drop(['SalePrice', 'Utilities'], axis=1, inplace=True)
    print(X.shape)

    # Select object columns
    categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
    # Select numeric columns
    numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

    # Imputation lists
    # null values of these numerical columns need 'constant' imputation
    constant_num_cols = ['GarageYrBlt', 'MasVnrArea']
    #constant_num_cols = ['MasVnrArea']
    print("constant_num_cols")
    print(constant_num_cols)
    print()

    # null values of these numerical columns need 'mean' imputation
    mean_num_cols = list(set(numerical_cols).difference(set(constant_num_cols)))
    print("mean_num_cols")
    print(mean_num_cols)
    print()

    # null values of these categorical columns need 'constant' imputation
    constant_categorical_cols = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                                 'FireplaceQu', 'GarageType', 'GarageFinish',
                                 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
                                 'MiscFeature']
    print("constant_categorical_cols")
    print(constant_categorical_cols)
    print()

    # null values of these categorical columns need 'most_frequent' imputation
    mf_categorical_cols = list(set(categorical_cols).difference(set(constant_categorical_cols)))
    print("mf_categorical_cols")
    print(mf_categorical_cols)
    print()

    my_cols = constant_num_cols + mean_num_cols + constant_categorical_cols + mf_categorical_cols
    print("my_cols")
    print(my_cols)
    print()

    # Define transformers
    # Preprocessing for numerical data
    numerical_transformer_m = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])
    numerical_transformer_c = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', StandardScaler())])
    # Preprocessing for categorical data, most-frequent imputation
    # (`sparse=False` before scikit-learn 1.2)
    categorical_transformer_mf = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
    # Preprocessing for categorical data, constant imputation
    categorical_transformer_c = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

    # Bundle preprocessing for numerical and categorical data
    #preprocessor = ColumnTransformer(transformers=[
    #    ('num_mean', numerical_transformer_m, mean_num_cols),
    #    ('num_constant', numerical_transformer_c, constant_num_cols),
    #    ('cat_mf', categorical_transformer_mf, mf_categorical_cols),
    #    ('cat_c', categorical_transformer_c, constant_categorical_cols)])
    preprocessor = ColumnTransformer(transformers=[
        ('num_mean', numerical_transformer_m, mean_num_cols),
        ('cat_mf', categorical_transformer_mf, mf_categorical_cols),
        ('cat_c', categorical_transformer_c, constant_categorical_cols)])
    X = preprocessor.fit_transform(X)
    return X, y
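# A minimal usage sketch for clean_data, assuming `dataset` is an Azure ML
# tabular dataset (the to_pandas_dataframe() call above suggests one) and a
# simple hold-out evaluation with a hypothetical RandomForestRegressor:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

X, y = clean_data(dataset)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
print(model.score(X_test, y_test))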
df = df_subscribers.pivot(
    index='ticket_id', columns='item_name', values='item_count').fillna(0)
df_subscribers.reset_index(inplace=True)
df_subscribers.drop(columns='index', inplace=True)

# --- add back date and location
df = df.merge(df_subscribers[['ticket_id', 'location', 'order_timestamp']
                             ].drop_duplicates(), how='left', on='ticket_id')

# --- extract hour of day from datetime
df['hour'] = df['order_timestamp'].apply(get_hour)
# df['hour'] = df['order_timestamp'].apply(lambda x: x.hour)

# --- convert categorical store variables to dummies
# use sklearn.preprocessing.OneHotEncoder() to create a class object called encoded_data
encoded_data = OneHotEncoder(handle_unknown='ignore')
# fit the encoder; the data has to be reshaped from a column of the data frame.
# Useful functions may be DataFrame methods .to_list(), .reshape(), and .shape()
encoded_data.fit(X=np.array(df['location'].tolist()).reshape(df.shape[0], 1))

# fixed split with regex to avoid IndexError;
# get_feature_names() was removed in scikit-learn 1.2, use get_feature_names_out()
col_map_store_binary = dict(zip(
    list(encoded_data.get_feature_names_out()),
    ['store_' + re.split(r'x\d_', x)[1] for x in encoded_data.get_feature_names_out()]))

# transform the data with the already-fitted encoder; refitting on the
# DataFrame would change the generated feature names and break the rename map
df_store_binary = pd.DataFrame(encoded_data.transform(
    np.array(df['location'].tolist()).reshape(df.shape[0], 1)).toarray())
df_store_binary.columns = encoded_data.get_feature_names_out()
df_store_binary.rename(columns=col_map_store_binary, inplace=True)
    # 'credit_history': {'critical': 0,
    #                    'delayed': 2,
    #                    'fully repaid': 3,
    #                    'fully repaid this bank': 4,
    #                    'repaid': 1}}
    for col in cols:
        df[col] = df[col].map(map[col])  # `map` is the mapping dict defined earlier (shadows the builtin)
    return df

## 5.2 Ordinal encoding
OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)

## 5.3 One-hot encoding
'''
Produces high-dimensional features; combine it with feature selection to
bring the dimensionality back down.
'''
# With n classes, one-hot encoding generates n features, one of which is 1
# and the rest 0. (The old `n_values` and `categorical_features` parameters
# were removed in scikit-learn 0.22.)
OneHotEncoder(categories='auto', drop=None, sparse=True,
              dtype=<class 'numpy.float64'>, handle_unknown='error')
# `sparse`: defaults to True, returning a sparse matrix; call `.toarray()`
# or pass False to get a dense array instead.

## 5.4 Binary encoding
'''
Represents each category in binary, e.g. class 3 becomes 011;
uses fewer dimensions than one-hot encoding.
'''

## 5.6 Other encodings, e.g. Helmert Contrast, Sum Contrast, Polynomial Contrast, Backward Difference Contrast.

LabelEncoder().fit_transform(data[feature].astype(str))  # np.str was removed in NumPy 1.24

# One-hot encode the categorical features
data = pd.get_dummies(data, columns=['model', 'brand', 'bodyType', 'fuelType',
                                     'gearbox', 'notRepairedDamage', 'power_bin'])
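# The binary-encoding section above (5.4) has no snippet; a minimal sketch
# using the third-party category_encoders package (pip install
# category_encoders) on a hypothetical 'brand' column:
import category_encoders as ce
import pandas as pd

demo = pd.DataFrame({'brand': ['audi', 'bmw', 'vw', 'audi', 'opel']})
encoder = ce.BinaryEncoder(cols=['brand'])
print(encoder.fit_transform(demo))  # ~log2(n_categories) columns instead of n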
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
# print(X)
# print(y)

# Handle missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
# print(X)

# Encoding categorical data
# Encoding the independent variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [0])],
                       remainder="passthrough")
X = np.array(ct.fit_transform(X))
# print(X)

# Encoding the dependent variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = np.array(le.fit_transform(y))
# print(y)

# Splitting the dataset into training set & test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
smoker = pd.DataFrame(smoker)
smoker.columns = ['smoker']
le_smoker_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Sklearn label encoder results for smoker:")
print(le_smoker_mapping)
print(smoker[:10])

# option 3: sklearn one-hot encoding: maps each category to 0 (cold) or 1 (hot)
# one hot encoder = ohe
# create ndarray for one-hot encoding (sklearn)
region = data.iloc[:, 5:6].values  # ndarray

## ohe for region
ohe = OneHotEncoder()
region = ohe.fit_transform(region).toarray()
region = pd.DataFrame(region)
region.columns = ['northeast', 'northwest', 'southeast', 'southwest']
print("Sklearn one hot encoder results for region:")
print(region[:10])

############################################ 01_05_DividingtheDataintoTestandTrain ##############################################

# putting the data together:
## take the numerical data from the original data
X_num = data[['age', 'bmi', 'children']].copy()
## take the encoded data and add to numerical data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import OLS

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

# Encoding categorical data: only the State column (index 3) is categorical,
# so encode just that column; OneHotEncoder applied to the whole matrix would
# also one-hot encode the numeric columns
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
ct = ColumnTransformer([('ohe', OneHotEncoder(categories='auto'), [3])],
                       remainder='passthrough')
X = ct.fit_transform(X)

# Avoiding the Dummy Variable Trap
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fitting Multiple Linear Regression to the Training set
regressor = LinearRegression()
allData = pd.read_csv('data_tenis.csv')
temperature = allData.iloc[:, 1:2].values  # numeric data column
humidity = allData.iloc[:, 2:3].values     # to be predicted
outlook = allData.iloc[:, 0:1].values      # to be encoded
windy = allData.iloc[:, 3:4].values        # to be encoded
play = allData.iloc[:, 4:5].values         # to be encoded

# encoding what needs encoding (categoric -> numeric) and creating data frames
"""
as another method, encode all columns with LabelEncoder and take the part
which you need to encode:
from sklearn.preprocessing import LabelEncoder
allDataLabelEncoded = allData.apply(LabelEncoder().fit_transform)
labelEncoded = allDataLabelEncoded.iloc[:,-2:]
"""
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categories="auto")

outlook = ohe.fit_transform(outlook).toarray()
outlook = pd.DataFrame(data=outlook, index=range(14),
                       columns=['overcast', 'rainy', 'sunny'])

windy = ohe.fit_transform(windy).toarray()
windy = pd.DataFrame(data=windy[:, 1:], index=range(14), columns=['windy'])

play = ohe.fit_transform(play).toarray()
play = pd.DataFrame(data=play[:, 1:], index=range(14), columns=['play'])

temperature = pd.DataFrame(data=temperature, index=range(14), columns=['temperature'])
heatmap = (HeatMap().add_xaxis(xaxis_data=corr.index.to_list()).add_yaxis(
    '', yaxis_data=corr.index.to_list(), value=data))
heatmap.render()
"""
As the heatmap shows, "visit depth" and "average dwell time" are strongly
correlated. Highly correlated variables play the same role in a model, so
consider combining them or dropping one of the two.
"""

# %%
# Data preprocessing
# 1. Missing values (handled earlier, omitted here)
# 2. One-hot encoding
cols = df.columns[-5:].to_list()
model_ohe = OneHotEncoder(sparse_output=False)  # create the encoder; `sparse=False` before scikit-learn 1.2
ohe_matrix = model_ohe.fit_transform(df[cols])

# 3. Scale the numeric data
cols = df.columns[1:-5].to_list()
model_scaler = MinMaxScaler()
scaler_matrix = model_scaler.fit_transform(df[cols])

# 4. Combine the categorical and numeric data
X = np.hstack((scaler_matrix, ohe_matrix))

# %%
# Build the model: pick the best KMeans clustering by mean silhouette score
score_list = []        # mean silhouette score of the model at each k
silhouette_int = -1    # initial threshold for the mean silhouette score
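# The selection loop is cut off above; a sketch of the silhouette-based search
# the comments describe, assuming candidate k values from 2 to 7:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

for n_clusters in range(2, 8):
    model_kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    labels_tmp = model_kmeans.fit_predict(X)
    silhouette_tmp = silhouette_score(X, labels_tmp)
    score_list.append([n_clusters, silhouette_tmp])
    if silhouette_tmp > silhouette_int:  # keep the best model found so far
        best_k, silhouette_int = n_clusters, silhouette_tmp
        best_kmeans, cluster_labels_k = model_kmeans, labels_tmp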
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 11 11:05:48 2018

@author: ASUS
"""
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# define example
data = [
    'cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot'
]
values = array(data)
print(values)

# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

# binary encode
onehot_encoder = OneHotEncoder(sparse_output=False)  # `sparse=False` before scikit-learn 1.2
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

# invert first example
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)
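# Since scikit-learn 0.20, OneHotEncoder accepts strings directly, so the
# LabelEncoder step and the argmax-based inversion are both avoidable. A
# sketch on the same data:
direct_encoder = OneHotEncoder(sparse_output=False)
direct_encoded = direct_encoder.fit_transform(values.reshape(-1, 1))
print(direct_encoder.inverse_transform(direct_encoded[:1]))  # [['cold']]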
# import the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# handling missing data
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# encoding categorical data
LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))
LabelEncoder_y = LabelEncoder()
y = LabelEncoder_y.fit_transform(y)

# splitting dataset into training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)

# feature scaling
scale_x = StandardScaler()
X_train = scale_x.fit_transform(X_train)
X_test = scale_x.transform(X_test)
# transformation of the target class by replacing attack names with "attack"
information_base['normal.'] = information_base['normal.'].replace(
    ['back.', 'buffer_overflow.', 'ftp_write.', 'guess_passwd.', 'imap.',
     'ipsweep.', 'land.', 'loadmodule.', 'multihop.', 'neptune.', 'nmap.',
     'perl.', 'phf.', 'pod.', 'portsweep.', 'rootkit.', 'satan.', 'smurf.',
     'spy.', 'teardrop.', 'warezclient.', 'warezmaster.'], 'attack')

# preprocessing of data, transformation of categorical values
x = information_base.iloc[:, :-1].values
y = information_base.iloc[:, 41].values

LEncoderX1 = LabelEncoder()
LEncoderX2 = LabelEncoder()
LEncoderX3 = LabelEncoder()
x[:, 1] = LEncoderX1.fit_transform(x[:, 1])
x[:, 2] = LEncoderX2.fit_transform(x[:, 2])
x[:, 3] = LEncoderX3.fit_transform(x[:, 3])

# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22;
# a single ColumnTransformer over columns 1-3 replaces the three chained
# encoders above and avoids recomputing the shifted column indices by hand
# (`sparse=False` before scikit-learn 1.2)
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('ohe', OneHotEncoder(sparse_output=False), [1, 2, 3])],
                       remainder='passthrough')
x = ct.fit_transform(x)

LEncoderY = LabelEncoder()
y = LEncoderY.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# scaling of data
scalerX = StandardScaler()
x_train = scalerX.fit_transform(x_train)
x_test = scalerX.transform(x_test)
from sklearn.metrics import explained_variance_score, mean_absolute_error, \
    r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from ajna_commons.flask.conf import (DATABASE, MONGODB_URI)
from ajna_commons.conf import ENCODE
from padma.models.peso.peso import PesoModel
from padma.models.bbox.bbox import NaiveModel
from padma.models.conteiner20e40.bbox import SSDMobileModel

pesomodel = PesoModel()
bboxmodel = NaiveModel()
bboxmodel = SSDMobileModel()  # overrides the NaiveModel assigned just above

# encoder for the integer class indices 0..19
encoder = OneHotEncoder()
encoder.fit([[i] for i in range(20)])

BASE_PATH = os.path.dirname(__file__)
HIST_FILE = os.path.join(BASE_PATH, 'histograms.npy')
LABEL_FILE = os.path.join(BASE_PATH, 'labels.npy')
CSV_FILE = os.path.join(BASE_PATH, 'pesovolexport.csv')
IMGOUT_PATH = os.path.join(BASE_PATH, 'images')


def make_histograms():
    histograms = []
    labels = []
    print('Connecting to MongoDB...')
    db = MongoClient(host=MONGODB_URI)[DATABASE]
    fs = GridFS(db)
                                                    y_train, test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator, random_state=0)
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))  # one-hot encode the leaf index each sample lands in
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

# Same idea with gradient-boosted trees
grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
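# The snippet stops before scoring the gradient-boosting branch; following the
# same pattern as the random-forest branch above, the evaluation step would be:
y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)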
    ret.append(m)
    return ret

start = time.time()

inp = reader(filename1)
x_train = inp[0]
y_train = inp[1]
training_examples = inp[2]

outp = reader(filename2)
x_test = outp[0]
y_test = outp[1]
testing_examples = outp[2]

# Fit the encoder on the training data only, then reuse it on the test data;
# calling fit_transform again on x_test could produce a different (and
# incompatible) column layout
enc = OneHotEncoder(handle_unknown='ignore')
x_train = enc.fit_transform(x_train).toarray()
x_test = enc.transform(x_test).toarray()

end1 = time.time()
# print(x_train)


def forward(X, WEIGHTS, BIAS):
    temp = X
new_encoded = dict_one_hot_encoder.transform(new_dict)
print(new_encoded)

X_str = np.array([['tech', 'professional'],
                  ['fashion', 'student'],
                  ['fashion', 'professional'],
                  ['sports', 'student'],
                  ['tech', 'student'],
                  ['tech', 'retired'],
                  ['sports', 'professional']])
label_encoder = LabelEncoder()
X_int = label_encoder.fit_transform(X_str.ravel()).reshape(*X_str.shape)
print(X_int)

one_hot_encoder = OneHotEncoder()
X_encoded = one_hot_encoder.fit_transform(X_int).toarray()
print(X_encoded)

# not seen in training data
new_dict = [{'interest': 'unknown_interest', 'occupation': 'retired'},
            {'interest': 'tech', 'occupation': 'unseen_occupation'}]
new_encoded = dict_one_hot_encoder.transform(new_dict)
print(new_encoded)

# new category not encountered before
new_str = np.array([['unknown_interest', 'retired'],
                    ['tech', 'unseen_occupation'],
                    ['unknown_interest', 'unseen_occupation']])
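# The unseen-category arrays set up above would make the default encoder
# (handle_unknown='error') raise; a sketch of the tolerant variant, which maps
# unknown values to all-zero rows:
safe_encoder = OneHotEncoder(handle_unknown='ignore')
safe_encoder.fit(X_str)  # fit on the raw strings; no LabelEncoder step needed
print(safe_encoder.transform(new_str).toarray())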
@author: sidneaux

Multiple Linear Regression
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# importing the dataset
df = pd.read_csv('50_startups.csv')
X = df.iloc[:, :-1].values  # all columns except the target, not just the last one
y = df.iloc[:, 4].values

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelencoder_X = LabelEncoder()
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])
# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22;
# encode column 3 through a ColumnTransformer instead
ct = ColumnTransformer([('ohe', OneHotEncoder(), [3])], remainder='passthrough')
X = ct.fit_transform(X)

# Avoiding the dummy variable trap
X = X[:, 1:]

# using a label encoder (y has two categories)
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Splitting into training and test sets
# (sklearn.cross_validation was removed; train_test_split now lives in model_selection)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
# Import the libraries
import numpy as np
import matplotlib.pyplot as mplt
import pandas as pd

dtst = pd.read_csv('EU_I_PIB.csv')
X = dtst.iloc[:, -4:].values
y = dtst.iloc[:, -5].values

# Handle the dummy variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelEnc_X = LabelEncoder()
X[:, 0] = labelEnc_X.fit_transform(X[:, 0])
# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22;
# encode column 0 through a ColumnTransformer instead
ct = ColumnTransformer([('ohe', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)

# Split the sample
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Build our multiple regression model
from sklearn.linear_model import LinearRegression
regresseur = LinearRegression()
regresseur.fit(X_train, y_train)

# Make new predictions
y_prediction = regresseur.predict(X_test)