import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

df = pd.read_csv('veriler.csv')

# Impute missing ages with the column mean (SimpleImputer's default strategy)
missing = SimpleImputer()
age = df.iloc[:, [3]].values
age = missing.fit_transform(age)
age = pd.DataFrame(data=age, columns=['age'])

# Label-encode gender; ravel() flattens the 2-D slice to the 1-D input LabelEncoder expects
labelEncoder = LabelEncoder()
gender = df.iloc[:, [4]].values
gender = labelEncoder.fit_transform(gender.ravel())
gender = pd.DataFrame(data=gender, columns=['gender'])

# One-hot encode the country column
oneHotEncoder = OneHotEncoder()
country = df.iloc[:, [0]].values
country = oneHotEncoder.fit_transform(country).toarray()
country = pd.DataFrame(data=country, columns=['fr', 'tr', 'us'])

# Keep height and weight as-is
hw = df.iloc[:, [1, 2]].values
hw = pd.DataFrame(data=hw, columns=['height', 'weight'])

# Reassemble the processed columns into a single frame
df = pd.concat([country, hw, age, gender], axis=1)
print(df)

# Select height, weight and age as features
x = df.iloc[:, [3, 4, 5]]
import numpy as np  # needed below for np.nan; missing from the original imports
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Import the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Handle missing data: replace NaNs in the numeric columns with the column mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encode categorical data
LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))
LabelEncoder_y = LabelEncoder()
y = LabelEncoder_y.fit_transform(y)

# Split the dataset into training and test sets
# (the call was cut off; a conventional 80/20 split with a fixed seed is assumed)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)
"B", "B-", "CCC+", "CCC", "D"]).codes X1 = data_NaN.drop(["sic", "naics", "splticrm", "adate", "qdate","gvkey","conm", "cusip", "tic", "CUSIP", "NCUSIP", "NWPERM", "spcindcd", "spcseccd", "tic", "cusip", "public_date", "PERMCO"], axis = 1) X1_column_names = X1.columns.tolist() X2 = data_y.drop(["sic", "naics", "splticrm", "adate", "qdate","gvkey","conm", "cusip", "tic", "CUSIP", "NCUSIP", "NWPERM", "spcindcd", "spcseccd", "tic", "cusip", "public_date", "PERMCO"], axis = 1) X2_column_names = X2.columns.tolist() #Here we replace NaNs with the medain in the respective class, where possible else =0 SimImp = SimpleImputer(missing_values = np.nan, strategy = "median")#, copy = False) X2_0 = SimImp.fit_transform(X2[y2==0]) X2_1 = SimImp.fit_transform(X2[y2==1]) X2_2 = SimImp.fit_transform(X2[y2==2]) X2_3 = SimImp.fit_transform(X2[y2==3]) X2_4 = SimImp.fit_transform(X2[y2==4]) X2_5 = SimImp.fit_transform(X2[y2==5]) X2_6 = SimImp.fit_transform(X2[y2==6]) X2_7 = SimImp.fit_transform(X2[y2==7]) X2_8 = SimImp.fit_transform(X2[y2==8]) X2_9 = SimImp.fit_transform(X2[y2==9]) X2_10 =SimImp.fit_transform(X2[y2==10]) X2_11 =SimImp.fit_transform(X2[y2==11]) X2_12 =SimImp.fit_transform(X2[y2==12]) X2_13 =SimImp.fit_transform(X2[y2==13])
@author: IBM GAMER
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

datos = pd.read_csv(r'train.csv')
X_inicial = datos.to_numpy()

# Preprocessing: impute NaNs with the column mean, then normalize and scale
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_salida = imp.fit_transform(X_inicial)
Aprepro = preprocessing.normalize(X_salida)
Aprepro = preprocessing.scale(Aprepro)
aux1 = Aprepro
#print(Aprepro)

# Features: every column except index 20
X = np.delete(aux1, 20, axis=1)
#print(len(X[1]))
#y = np.delete(Aprepro, np.arange(20), axis=1)
# Target: columns from index 20 onward of the raw (unscaled) data
y = np.delete(X_inicial, np.arange(20), axis=1)
#print(len(y[1]))
#print(y)

from sklearn import tree
clasificador = tree.DecisionTreeClassifier(criterion='entropy')
clasificador.fit(X, y)
Spyder Editor

This is a temporary script file.
"""
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# error_bad_lines was removed in pandas 2.0; newer versions use on_bad_lines='skip'
yorumlar = pd.read_csv('Restaurant_Reviews.csv', error_bad_lines=False)

from sklearn.impute import SimpleImputer

# Fill missing 'Liked' labels with the most frequent value
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
eklenenDegerler = yorumlar.iloc[:, -1:].values
imputer = imputer.fit(eklenenDegerler[:, -1:])
eklenenDegerler[:, -1:] = imputer.transform(eklenenDegerler[:, -1:])
sonuc1 = pd.DataFrame(data=eklenenDegerler, index=range(716), columns=['Liked'])

review = yorumlar.iloc[:, 0:1].values
sonuc2 = pd.DataFrame(data=review, index=range(716), columns=['Review'])
yorumlar1 = pd.concat([sonuc2, sonuc1], axis=1)

import nltk
import re
str(current_time.h_24()) + str(current_time.minute()) + \
    str(time.time())[:2] + str(framework) + '.txt'

dataset = "uci_bank_marketing_pd"
data = pd.read_csv(dirt + dataset + ".csv")  # pandas.DataFrame
# NOTE: this second read immediately overwrites the frame loaded above
data = pd.read_csv("/home/test/bank.csv", delimiter=';')
print(data.columns)

numeric_features = [
    'age', 'duration', 'pdays', 'previous', 'emp_var_rate',
    'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed'
]
categorical_features = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'campaign', 'poutcome'
]

# Median-impute and scale numerics; constant-impute and one-hot encode categoricals
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                      ('scaler', StandardScaler())])
# 'sparse' was renamed to 'sparse_output' in scikit-learn 1.2
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                          ('onehot', OneHotEncoder(sparse=False))])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                               ('cat', categorical_transformer, categorical_features)])

X = data[categorical_features + numeric_features]
y = data["y"]
lb = preprocessing.LabelBinarizer()
y = lb.fit_transform(y)
#@author: Ananya Roy Choudhury
import pandas as pd
import numpy as np

dataset_train = pd.read_csv('train.csv')
dataset_test = pd.read_csv('test.csv')
y_train = dataset_train.iloc[:, 1].values
x_train = dataset_train.iloc[:, [2, 4, 5, 6, 7, 9, 10, 11]].values
x_test2 = dataset_test.iloc[:, [1, 3, 4, 5, 6, 8, 9, 10]].values

# Impute missing values
from sklearn.impute import SimpleImputer

# Numerical columns: mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_train[:, [2, 5]] = imputer.fit_transform(x_train[:, [2, 5]])
# Non-numeric categorical columns: most frequent value
imputer1 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x_train[:, [1, 7]] = imputer1.fit_transform(x_train[:, [1, 7]])
# Cabin (special case): constant fill
imputer1 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='0')
x_train[:, [1, 6]] = imputer1.fit_transform(x_train[:, [1, 6]])

# Same as above, but for the test set
# Numerical columns: mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_test2[:, [2, 5]] = imputer.fit_transform(x_test2[:, [2, 5]])
# Non-numeric categorical columns
# Missing values
# ---------------------------
# Drop Columns with Missing Values

# Get names of columns with missing values
cols_with_missing = [
    col for col in X_train.columns if X_train[col].isnull().any()
]

# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

# ---------------------------
# Imputation
from sklearn.impute import SimpleImputer

# Imputation (mean strategy by default)
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

# ---------------------------
# An Extension to Imputation

# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
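# The walkthrough stops just before imputing the extended frames; a minimal
# sketch of that remaining step, reusing the names defined above.
final_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(final_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(final_imputer.transform(X_valid_plus))

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns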
# Random Forest Classification

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('LoanRatio.csv')
X = dataset.iloc[:, 0:22].values
y = dataset.iloc[:, 22].values

# Missing values: SimpleImputer expects np.nan, not the string 'NaN'
from sklearn.impute import SimpleImputer
simp = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 0:22] = simp.fit_transform(X[:, 0:22])

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
def preprocess_features(dat):
    """
    Builds processed model features from the raw passenger data.

    Inputs
    ----------
    dat: dataframe
        dataframe containing raw data for feature creation

    Outputs
    ----------
    X: dataframe
        dataframe containing processed features
    """
    ## feature groups ##
    titles = ['Dr.', 'Rev.', 'Mr.', 'Miss.', 'Mrs', 'Master']
    cabins = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']
    embarked = ['C', 'Q', 'S']

    ## individual pipelines ##
    ## (cap_value, parse_values and cabin_side are custom transformers defined elsewhere)
    pipeline_onehot = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('onehot', OneHotEncoder()),
    ])
    pipeline_onehot_cap = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('cap', cap_value(max_value=2)),
        ('onehot', OneHotEncoder()),
    ])
    pipeline_onehot_embarked = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='nan')),
        ('onehot', OneHotEncoder(categories=[embarked], handle_unknown='ignore')),
    ])
    pipeline_name = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='nan')),
        ('parsing_name', parse_values(feature_list=titles)),
    ])
    pipeline_cabin = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='no')),
        ('parsing_name', parse_values(feature_list=cabins)),
    ])
    pipeline_cabin_side = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='N')),
        ('cabin_side', cabin_side()),
        ('onehot', OneHotEncoder(categories=[['Port', 'Starboard', 'N']],
                                 handle_unknown='ignore')),
    ])
    pipeline_ordinal = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='nan')),
        ('ordinal', OrdinalEncoder()),
    ])
    pipeline_bin = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('bins', KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')),
        ('onehot', OneHotEncoder()),
    ])
    pipeline_age = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('bins', Binarizer(threshold=10)),
    ])

    ## full pipeline ##
    full_pipeline = ColumnTransformer([
        ('oneshot_pclass', pipeline_onehot, ['pclass']),
        ('parsing_name', pipeline_name, ['name']),
        ('ordinal', pipeline_ordinal, ['sex']),
        ('binarizer_age', pipeline_age, ['age']),
        ('imputer_sibsp', pipeline_onehot_cap, ['sibsp']),
        ('imputer_parch', pipeline_onehot_cap, ['parch']),
        ('bins_fare', pipeline_bin, ['fare']),
        ('parsing_cabin', pipeline_cabin, ['cabin']),
        ('parsing_cabin_side', pipeline_cabin_side, ['cabin']),
        ('oneshot_embarked', pipeline_onehot_embarked, ['embarked']),
    ])
    X = full_pipeline.fit_transform(dat)

    ## human-readable column names for the transformed matrix ##
    feature_names = ['pclass_' + str(i) for i in set(dat['pclass'])] \
        + ['name_' + t.lower().replace('.', '') for t in titles] \
        + ['sex_male'] \
        + ['age_10+'] \
        + ['sibsp_' + i for i in ['0', '1', '2+']] \
        + ['parch_' + i for i in ['0', '1', '2+']] \
        + ['fare_q' + str(i) for i in np.arange(1, 6)] \
        + ['cabin_' + c for c in cabins] \
        + ['cabin_' + c for c in ['Port', 'Starboard', 'NoCabin']] \
        + ['embarked_' + i for i in embarked]
    X = pd.DataFrame(X, index=dat.index, columns=feature_names)

    return X
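# Minimal usage sketch for the function above; the file name is an assumption,
# any frame with the raw columns referenced (pclass, name, sex, ...) works.
raw = pd.read_csv('titanic.csv')
features = preprocess_features(raw)
print(features.shape)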
#imputer.fit_transform(X_train)
#X = X.fillna(0)  # instead of imputing
from sklearn.impute import SimpleImputer
#imputer = SimpleImputer()

y = df_modified['poi']

# Separate financial and mail features to rescale them independently
#X_financial = X.drop(email_features_list, axis=1)
X_financial = X[finance_features_list]
#X_financial['bonus/salary'] = X_financial['bonus'] / X_financial['salary']
# kinda leaky, but imagine we have the data necessary
#X_financial = X_financial.drop(['bonus', 'salary'], axis=1)
#X_financial = X_financial.fillna(0)
imputerF = SimpleImputer(strategy='median')
imputerF.fit(X_financial)
X_financial = imputerF.transform(X_financial)

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
#scalerF = MinMaxScaler()
scalerF = StandardScaler()
scalerF.fit(X_financial)
X_financial = scalerF.transform(X_financial)
X_financial = pd.DataFrame(X_financial, index=X.index.values)

X_mail = X[email_features_list]
X_mail['from_poi/from'] = X_mail['from_this_person_to_poi'] / X_mail['from_messages']
X_mail['to_poi/to'] = (
# In[28]:

# Test dataframe for a single country row
df_test_country = pd.DataFrame([test_country], columns=data_missing.columns)
df_test_country

# In[29]:

# Median imputation followed by standard scaling
pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())])

# In[30]:

# Fit on the full data (minus identifier columns), then transform the test row
pipeline.fit(data_missing.drop(columns=['Country', 'Region'], axis=1))
test_pipeline = pipeline.transform(df_test_country.drop(columns=['Country', 'Region'], axis=1))
test_pipeline

# In[31]:

df_test = pd.DataFrame(test_pipeline,
                       columns=df_test_country.drop(columns=['Country', 'Region'], axis=1).columns)
import numpy as np
import pandas as pd

ds = pd.read_csv("Data.csv")
x = ds.iloc[:, 0:3].values
y = ds.iloc[:, 3].values

# Median-impute the numeric columns
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy="median")
imp = imp.fit(x[:, 1:3])
x[:, 1:3] = imp.transform(x[:, 1:3])

# Encode the categorical first column.
# OneHotEncoder's categorical_features argument was removed in scikit-learn 0.22,
# so a ColumnTransformer selects the column instead.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
lec = LabelEncoder()
lec = lec.fit(x[:, 0])
x[:, 0] = lec.transform(x[:, 0])
ohe = ColumnTransformer([("onehot", OneHotEncoder(), [0])],
                        remainder="passthrough", sparse_threshold=0)
x = ohe.fit_transform(x)
lec = lec.fit(y)
y = lec.transform(y)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=0)

from sklearn.metrics import mean_absolute_error
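# A sketch of the natural next step: bundle the preprocessor and model into one
# Pipeline and score it. X_train, X_valid, y_train and y_valid are assumed to
# come from an earlier train_test_split, as are numerical_cols/categorical_cols.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])
my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_valid)
print('MAE:', mean_absolute_error(y_valid, preds))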
import numpy as np  # needed for the binning below; missing from the original imports
import pandas as pd
from pandas import DataFrame
from typing import Any
import math
from sklearn.impute import SimpleImputer        # missing from the original imports
from sklearn.preprocessing import MinMaxScaler  # missing from the original imports

# --------------------------------------------------
# Get Stock Prices
# --------------------------------------------------
prices = pd.read_csv('../resources/stock_prices/stock_prices_750.txt',
                     sep='\t', header=0, index_col=0).iloc[:, 10:15]

# -----------------------------------------------
# Clean Prices
# -----------------------------------------------
col_names = prices.columns
prices = SimpleImputer().fit_transform(prices)  # mean-impute missing prices
prices = MinMaxScaler().fit_transform(prices)   # rescale to [0, 1]
bins = np.linspace(0, 1, 20)
prices = np.digitize(prices, bins, right=True)  # discretize into 20 bins
prices = DataFrame(prices, columns=col_names)

# --------------------------------------------------
# Plot Stock Prices
# --------------------------------------------------
# _, ax = plt.subplots()
# ax.plot(prices.index.values.tolist(), prices)
# plt.show()

num_cols = len(col_names)
tre_matrix = np.zeros(shape=(num_cols, num_cols))
def clean_data(data):
    # Copy data
    X = data.to_pandas_dataframe()
    X.set_index('Id', inplace=True)
    print(X.head())
    print()

    # Remove rows with missing target, separate target from predictors
    X.dropna(axis=0, subset=['SalePrice'], inplace=True)
    y = X.SalePrice

    # Remove target and 'Utilities'
    X.drop(['SalePrice', 'Utilities'], axis=1, inplace=True)
    print(X.shape)

    # Select object columns
    categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]

    # Select numeric columns
    numerical_cols = [cname for cname in X.columns
                      if X[cname].dtype in ['int64', 'float64']]

    # Imputation lists
    # numerical columns whose null values need 'constant' imputation
    constant_num_cols = ['GarageYrBlt', 'MasVnrArea']
    #constant_num_cols = ['MasVnrArea']
    print("constant_num_cols")
    print(constant_num_cols)
    print()

    # numerical columns whose null values need 'mean' imputation
    mean_num_cols = list(set(numerical_cols).difference(set(constant_num_cols)))
    print("mean_num_cols")
    print(mean_num_cols)
    print()

    # categorical columns whose null values need 'constant' imputation
    constant_categorical_cols = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                                 'FireplaceQu', 'GarageType', 'GarageFinish',
                                 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
                                 'MiscFeature']
    print("constant_categorical_cols")
    print(constant_categorical_cols)
    print()

    # categorical columns whose null values need 'most_frequent' imputation
    mf_categorical_cols = list(set(categorical_cols).difference(set(constant_categorical_cols)))
    print("mf_categorical_cols")
    print(mf_categorical_cols)
    print()

    my_cols = constant_num_cols + mean_num_cols + constant_categorical_cols + mf_categorical_cols
    print("my_cols")
    print(my_cols)
    print()

    # Define transformers
    # Preprocessing for numerical data
    numerical_transformer_m = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                              ('scaler', StandardScaler())])
    numerical_transformer_c = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0)),
                                              ('scaler', StandardScaler())])

    # Preprocessing for categorical data, most-frequent imputation
    categorical_transformer_mf = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                                 ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))])

    # Preprocessing for categorical data, constant imputation
    categorical_transformer_c = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
                                                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))])

    # Bundle preprocessing for numerical and categorical data
    #preprocessor = ColumnTransformer(transformers=[
    #    ('num_mean', numerical_transformer_m, mean_num_cols),
    #    ('num_constant', numerical_transformer_c, constant_num_cols),
    #    ('cat_mf', categorical_transformer_mf, mf_categorical_cols),
    #    ('cat_c', categorical_transformer_c, constant_categorical_cols)])
    preprocessor = ColumnTransformer(transformers=[
        ('num_mean', numerical_transformer_m, mean_num_cols),
        ('cat_mf', categorical_transformer_mf, mf_categorical_cols),
        ('cat_c', categorical_transformer_c, constant_categorical_cols)])

    X = preprocessor.fit_transform(X)

    return X, y
# Imports presumed from the omitted header (KerasClassifier's build_fn argument
# matches the pre-SciKeras wrapper in keras.wrappers.scikit_learn)
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Function to create model, required for the KerasClassifier
def create_model():
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=8, activation="relu"))
    model.add(Dense(8, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
    return model

# fix random seed for reproducibility
seed = 42

# Load the dataset
data = pd.read_csv('diabetes.csv')
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Split the dataset into 80% training and 20% testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Impute the missing values (encoded as zeros) using feature median values
imputer = SimpleImputer(missing_values=0, strategy='median')
X_train2 = imputer.fit_transform(X_train)
X_test2 = imputer.transform(X_test)

# Convert the numpy array into a DataFrame
X_train3 = pd.DataFrame(X_train2)

# create model
model = KerasClassifier(build_fn=create_model, epochs=150, batch_size=10, verbose=0)

# evaluate using 10-fold cross validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# Evaluate using cross_val_score function
results = cross_val_score(model, X_train2, y_train, cv=kfold)
print(results.mean())
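# Cross-validation above only scores the training split; a minimal follow-up,
# reusing the imputed arrays already defined, fits once and scores the held-out
# test set.
model.fit(X_train2, y_train)
print('Test accuracy:', model.score(X_test2, y_test))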
    plt.show()
    return [acc_score, f1_avg, std_avg, C, penalty]


rawdata = readFile('percentileDatasetCombined.csv')
data = rawdata
dataWithLabel = data
# For oxygenation use data[0:, [23]]; for complication use data[0:, [22]]
labels = data[0:, [22]]
data = data[0:, [1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21]]
data = np.array(data).astype(float)
X = data
y = labels
y = y.ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# -1 marks missing entries here; replace them with the training-set column mean,
# reusing those statistics on the test set to avoid leaking test information
imp_mean = SimpleImputer(missing_values=-1, strategy='mean')
imp_mean = imp_mean.fit(X_train)
X_train = imp_mean.transform(X_train)
X_test = imp_mean.transform(X_test)

# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

# print("Number transactions X_train dataset: ", X_train.shape)
# print("Number transactions y_train dataset: ", y_train.shape)
# print("Number transactions X_test dataset: ", X_test.shape)
# print("Number transactions y_test dataset: ", y_test.shape)
# ## Looking at data types
#data2.dtypes

## Removing the object features. Maybe we will one-hot encode them later
#data3 = data2.drop(['field', 'from', 'career'], axis=1)

## Make a heatmap
#plt.subplots(figsize=(20,15))
#ax = plt.axes()
#ax.set_title("Correlation Heatmap")
#corr = data3.corr()
#sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)

#%%
# Alternative way to filter out columns with too many NaN values,
# preserving the 'shar' columns by mean-imputing them first
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
imputer.fit(raw_data[["shar", "shar_o"]])
shar = pd.DataFrame(imputer.transform(raw_data[["shar", "shar_o"]]),
                    columns=["shar", "shar_o"], index=raw_data.index)
raw_data = replaceGroup(raw_data, shar)

# Keep only columns with fewer than 750 missing values, then drop remaining NaN rows
null_sum = raw_data.isnull().sum()
too_many_nans = null_sum[null_sum < 750].index.values
too_many_nans = [str(index) for index in too_many_nans]
data = raw_data[too_many_nans]
data = data.dropna()
data = data.drop(["field", "from", "career"], axis=1)

#%% One-hot encoding
data = data[data.columns.drop(list(data.filter(regex="_3")))]
from bayes_opt import BayesianOptimization
from catboost import cv, CatBoostRegressor, Pool

train_df = pd.read_csv('training.csv')
X = train_df.drop('Instance', axis=1)
X = X.drop('Income in EUR', axis=1)
y = train_df['Income in EUR']

X_pred = pd.read_csv('test.csv')
X_pred = X_pred.drop('Income', axis=1)
X_pred = X_pred.drop('Instance', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1)

# Median-impute the numeric columns, mode-impute the categorical ones
ct = ColumnTransformer(transformers=[('num_imp', SimpleImputer(strategy='median'), [0, 2, 4, 9]),
                                     ('cat_imp', SimpleImputer(strategy='most_frequent'), [1, 3, 5, 6, 7, 8])],
                       remainder='passthrough')
ct.fit(X_train, y_train)
X_train = ct.transform(X_train)
X_test = ct.transform(X_test)

# Flag rows whose job title contains a seniority-indicating term
jobs = X_train[:, 6]
senior_job_terms = ['senior', 'manager', 'doctor', 'lawyer', 'analyst',
                    'programmer', 'specialist', 'supervisor', 'chief']
senior_job = []
for j in jobs:
    found = False
    for s in senior_job_terms:
        if s in j:
            senior_job.append('yes')
            found = True
            break
#%%
# Now train/test split:
tv_f, test_f = train_test_split(df, test_size=0.25, random_state=RANDOM_SEED)
train_f, vali_f = train_test_split(tv_f, test_size=0.25, random_state=RANDOM_SEED)
y_train = np.array(train_f.pop(PREDICT_COL).array)
y_vali = np.array(vali_f.pop(PREDICT_COL).array)
y_test = np.array(test_f.pop(PREDICT_COL).array)

#%%
# Now process data:
# Note, we don't NEED DictVectorizer... why?
# Let's fix missing values (-200.0 is this dataset's missing-value sentinel);
fix_missing = SimpleImputer(missing_values=-200.0)
scaler = StandardScaler()
X_train = scaler.fit_transform(fix_missing.fit_transform(train_f))
X_vali = scaler.transform(fix_missing.transform(vali_f))
X_test = scaler.transform(fix_missing.transform(test_f))


@dataclass
class LinearRegressionModel:
    # Managed to squeeze bias into this weights array by adding some +1s.
    weights: np.ndarray

    @staticmethod
    def random(D: int) -> "LinearRegressionModel":
# Importing the dataset (train)
dataset = pd.read_csv('Training_data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 13].values

# Importing the dataset (test)
dataset = pd.read_csv('Test_data.csv')
X_test = dataset.iloc[:, :].values

# Missing data: mean-impute, fitting on the training set only
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 0:13])
X[:, 0:13] = imputer.transform(X[:, 0:13])

# Test set: reuse the training-set statistics; re-fitting on the test data
# would leak information
X_test[:, :] = imputer.transform(X_test[:, :])

# Feature scaling: likewise, fit on train and only transform test
from sklearn.preprocessing import StandardScaler as ss
sc = ss()
X = sc.fit_transform(X)
X_test = sc.transform(X_test)
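# An equivalent alternative to the explicit calls above: chaining the imputer
# and scaler in a Pipeline keeps the fit-on-train / transform-on-test
# discipline automatic (sketch; replaces the separate imputer/scaler steps).
from sklearn.pipeline import Pipeline
prep = Pipeline(steps=[('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
                       ('scaler', ss())])
X = prep.fit_transform(X)
X_test = prep.transform(X_test)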
print_df(df)
"""
+------+------+------+------+------+
|      | 국어 | 영어 | 수학 | 과학 |
+------+------+------+------+------+
| 철수 | 98.0 | nan  | 88.0 | 64.0 |
| 영희 | 88.0 | 90.0 | 62.0 | 72.0 |
| 민철 | 92.0 | 70.0 | nan  | nan  |
| 수현 | 63.0 | 60.0 | 31.0 | 70.0 |
| 호영 | nan  | 50.0 | nan  | 88.0 |
+------+------+------+------+------+
(columns are the subjects Korean, English, Math, Science; rows are student names)
"""

# 1) Define the imputation rule: replace missing values with the column mean
imr = SimpleImputer(missing_values=numpy.nan, strategy="mean")

# 2) Apply the rule to the dataframe's values
df_imr = imr.fit_transform(df.values)

# 3) Build a new dataframe from the imputed values
re_df2 = DataFrame(df_imr, index=df.index, columns=df.columns)
print_df(re_df2)
"""
+------+-------+------+--------------------+------+
|      | 국어  | 영어 | 수학               | 과학 |
+------+-------+------+--------------------+------+
| 철수 | 98.0  | 67.5 | 88.0               | 64.0 |
| 영희 | 88.0  | 90.0 | 62.0               | 72.0 |
| 민철 | 92.0  | 70.0 | 60.333333333333336 | 73.5 |
| 수현 | 63.0  | 60.0 | 31.0               | 70.0 |
| 호영 | 85.25 | 50.0 | 60.333333333333336 | 88.0 |
+------+-------+------+--------------------+------+
"""
forest_model = RandomForestRegressor(n_estimators=100, random_state=0)
forest_model.fit(train_features, train_target)
melb_preds = forest_model.predict(val_features)
print('MAE_random_forest:')
# Compare predictions against the validation targets
# (val_target is assumed to come from the earlier train/validation split)
MAE_RF = mean_absolute_error(val_target, melb_preds)
print(MAE_RF)

# Random forest - cross-validation
heart_features = [
    'Age', 'Gender', 'Chest_Pain', 'Resting_BP', 'Cholesterol', 'Fasting_BS',
    'RECG', 'Max_Heart_Rate', 'Exercise_Ang', 'ST_Depression', 'ST_Segmen',
    'Major_Vessels', 'Thalassemia'
]
features = heart_data[heart_features]

# Mean imputation followed by a small random forest, evaluated with 10-fold CV
my_pipeline = Pipeline(
    steps=[('preprocessor', SimpleImputer()),
           ('model', RandomForestRegressor(n_estimators=10, random_state=0))])
scores = -1 * cross_val_score(
    my_pipeline, features, target, cv=10, scoring='neg_mean_absolute_error')
print("MAE cross:\n", scores)
print("Average MAE score (across experiments):")
cross = scores.mean()
print(scores.mean())

# Extreme gradient boosting
heart_features = [
    'Age', 'Gender', 'Chest_Pain', 'Resting_BP', 'Cholesterol', 'Fasting_BS',
    'RECG', 'Max_Heart_Rate', 'Exercise_Ang', 'ST_Depression', 'ST_Segmen',
    'Major_Vessels', 'Thalassemia'
# NOTE: the class definition above fit() was cut off; this header is the
# standard selector pattern implied by the usage below
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names]


num_attrb_selected = [
    "Rooms", "Distance", "Bedroom2", "Bathroom", "Car", "Landsize",
    "Lattitude", "Longtitude"
]

# Select the numeric columns, median-impute, then standardize
num_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(num_attrb_selected)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])


class MostFrequentImputer(BaseEstimator, TransformerMixin):
    # Learn each column's mode, then fill NaNs with it
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series(
            [X[c].value_counts().index[0] for c in X], index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)


cat_attrb_selected = [
X_train_post_hoc = df
df_test, y_test = load_combine_data(X_test, merged_data, dmri)
X_test_post_hoc = df_test

df = df.drop(columns=['eid', '20016-2.0'], axis=1)
df_test = df_test.drop(columns=['eid', '20016-2.0'], axis=1)

estimator = RandomForestRegressor(n_estimators=250,
                                  criterion='mse',
                                  n_jobs=10,
                                  verbose=1,
                                  random_state=0)
pipeline = Pipeline([('imputation',
                      make_union(SimpleImputer(strategy="median"),
                                 MissingIndicator())),
                     ('estimator', estimator)])
cv = ShuffleSplit(n_splits=100, test_size=0.1, random_state=0)
param_grid = {
    'estimator__max_depth': [5, 10, 20, 40, None],
    'estimator__max_features': [1, 5, 'log2', 'sqrt', 'auto', None]
}
grid_search = GridSearchCV(pipeline,
                           param_grid=param_grid,
                           cv=5,
                           verbose=2,
                           n_jobs=10)

metrics = []
# then one-hot encode categorical variables
if args.dataset == "flchain":
    df = pd.read_csv("./data/surv/flchain.csv")
    E = df["death"]
    T = df["futime"]
    X = df >> drop(X.death, X.futime, X.chapter) \
           >> mutate(mgus=X.mgus.astype(float), age=X.age.astype(float))
    X = X[T > 0]
    E = E[T > 0]
    T = T[T > 0]
    #Y = np.c_[np.log(T) - np.mean(np.log(T)), C]
    Y = Y_join(T, E)

    # Median-impute numeric columns, mode-impute categoricals, then one-hot encode
    X_num = X.select_dtypes(include=["float"])
    X_cat = X.select_dtypes(exclude=["float"])
    imputer = SimpleImputer(strategy="median")
    X_num = imputer.fit_transform(X_num.values)
    imputer = SimpleImputer(strategy="most_frequent")
    X_cat = imputer.fit_transform(X_cat.values)
    encoder = OneHotEncoder(sparse=False)  # 'sparse_output' in scikit-learn >= 1.2
    X_cat = encoder.fit_transform(X_cat)
    X = np.c_[X_num, X_cat]
elif args.dataset == "support":
    df = pd.read_csv("./data/surv/support2.csv")
    df = df.rename(columns={"d.time": "dtime"})
    T = df["dtime"]
    E = df["death"]
    #Y = np.c_[np.log(T) - np.mean(np.log(T)), C]
    Y = Y_join(T, E)
    df >>= drop(X.dtime, X.death, X.hospdead, X.prg2m, X.prg6m, X.dnr,
# Imports missing from the original header
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from os import system
from sklearn.impute import SimpleImputer

NB = GaussianNB()
accuracy = []
cv = []
fsc = []
for i in range(0, 10):
    data = read_csv("lung_cancer.csv")
    X = data.iloc[:, 1:].values
    Y = data.iloc[:, 0]
    # '?' marks missing entries in this dataset; fill with the most frequent value
    imp = SimpleImputer(missing_values='?', strategy='most_frequent')
    X = imp.fit_transform(X)
    X = pd.DataFrame(X)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
    NB.fit(X_train, Y_train)
    Y_pred = NB.predict(X_test)
    print(Y_pred)
    cross_val = np.max(cross_val_score(NB, X_train, Y_train, cv=5))
    cm = confusion_matrix(Y_test, Y_pred)
    print("\nCross Validation Score: ", cross_val)
    cv.append(cross_val)
@author: Rajat sharma
"""
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Importing the data set
dataset = pd.read_csv('placement_data')
X = dataset.iloc[:, 1:-1].values
Y = dataset.iloc[:, -1].values

# Removing the NaN values: fill the target with a constant
from sklearn.impute import SimpleImputer
missing_values = SimpleImputer(missing_values=np.nan, strategy='constant')
Y = Y.reshape(-1, 1)
missing_values = missing_values.fit(Y)
Y = missing_values.transform(Y)

# Encoding the categorical data
# (each fit_transform call re-fits the encoder, so reusing one instance is fine)
from sklearn.preprocessing import LabelEncoder
LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])
X[:, 2] = LabelEncoder_X.fit_transform(X[:, 2])
X[:, 4] = LabelEncoder_X.fit_transform(X[:, 4])
X[:, 5] = LabelEncoder_X.fit_transform(X[:, 5])
X[:, 7] = LabelEncoder_X.fit_transform(X[:, 7])
X[:, 8] = LabelEncoder_X.fit_transform(X[:, 8])
X[:, 10] = LabelEncoder_X.fit_transform(X[:, 10])
import pandas as pd

base = pd.read_csv('credit_data.csv')
# Replace the invalid negative ages with the mean age of the valid records
base.loc[base.age < 0, 'age'] = 40.92

previsores = base.iloc[:, 1:4].values
classe = base.iloc[:, 4].values

# Mean-impute the missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
imputer = imputer.fit(previsores[:, 0:3])
previsores[:, 0:3] = imputer.transform(previsores[:, 0:3])

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.25, random_state=0)

# The classifier import was cut off; Gaussian naive Bayes is assumed here as a
# placeholder estimator
from sklearn.naive_bayes import GaussianNB
classificador = GaussianNB()
classificador = classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)

from sklearn.metrics import confusion_matrix, accuracy_score
precisao = accuracy_score(classe_teste, previsoes)
matriz = confusion_matrix(classe_teste, previsoes)

from collections import Counter
Counter(classe_teste)