Example #1

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

df = pd.read_csv('veriler.csv')

missing = SimpleImputer()
age = df.iloc[:, [3]].values
age = missing.fit_transform(age)
age = pd.DataFrame(data=age, columns=['age'])

labelEncoder = LabelEncoder()
gender = df.iloc[:, [4]].values
gender = labelEncoder.fit_transform(gender.ravel())
gender = pd.DataFrame(data=gender, columns=['gender'])

oneHotEncoder = OneHotEncoder()
country = df.iloc[:, [0]].values
country = oneHotEncoder.fit_transform(country).toarray()
country = pd.DataFrame(data=country, columns=['fr', 'tr', 'us'])

hw = df.iloc[:, [1, 2]].values
hw = pd.DataFrame(data=hw, columns=['height', 'weight'])

df = pd.concat([country, hw, age, gender], axis=1)
print(df)

x = df.iloc[:, [3, 4, 5]]
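# Hedged continuation (not in the source): the unused imports above suggest a
# logistic-regression step, e.g. predicting gender from the numeric columns.
y = df['gender']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33,
                                                    random_state=0)
logr = LogisticRegression()
logr.fit(x_train, y_train)
print(confusion_matrix(y_test, logr.predict(x_test)))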
Example #2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#import the dataset
dataset = pd.read_csv('Data.csv')

X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

#handling missing data
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

#encoding categorical data
LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))
LabelEncoder_y = LabelEncoder()
y = LabelEncoder_y.fit_transform(y)

#splitting dataset into training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)  # arguments after X assumed; the call was truncated in the source
Example #3

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Only the tail of this assignment survives; it appears to encode S&P-style
# ratings as ordinal codes, with the leading categories elided:
y2 = pd.Categorical(data_y["splticrm"],
                    categories=[..., "B", "B-", "CCC+", "CCC", "D"]).codes

X1 = data_NaN.drop(["sic", "naics", "splticrm", "adate", "qdate","gvkey","conm",
                    "cusip", "tic", "CUSIP", "NCUSIP", "NWPERM",
                    "spcindcd", "spcseccd", "tic", "cusip", "public_date",
                    "PERMCO"], axis = 1)
X1_column_names = X1.columns.tolist()

X2 = data_y.drop(["sic", "naics", "splticrm", "adate", "qdate","gvkey","conm",
                    "cusip", "tic", "CUSIP", "NCUSIP", "NWPERM",
                    "spcindcd", "spcseccd", "tic", "cusip", "public_date",
                    "PERMCO"], axis = 1)
X2_column_names = X2.columns.tolist()

# Replace NaNs with the median within the respective rating class where possible, else 0
SimImp = SimpleImputer(missing_values=np.nan, strategy="median")

# Impute each of the 14 rating classes separately
X2_by_class = {k: SimImp.fit_transform(X2[y2 == k]) for k in range(14)}
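# Hedged reassembly step (not in the source): stitch the per-class blocks back
# into one matrix in the original row order.
X2_imputed = np.empty(X2.shape)
for k, block in X2_by_class.items():
    X2_imputed[y2 == k] = block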
Example #4

"""
@author: IBM GAMER
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn import preprocessing

datos = pd.read_csv(r'train.csv')

X_inicial = datos.to_numpy()

# Preprocessing
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_salida = imp.fit_transform(X_inicial)
Aprepro = preprocessing.normalize(X_salida)
Aprepro = preprocessing.scale(Aprepro)
aux1 = Aprepro
#print(Aprepro)
X = np.delete(aux1, 20, axis=1)
#print(len(X[1]))
#y=np.delete(Aprepro, np.arange(20), axis=1)
y = np.delete(X_inicial, np.arange(20), axis=1)
#print(len(y[1]))
#print(y)

from sklearn import tree
clasificador = tree.DecisionTreeClassifier(criterion='entropy')
clasificador.fit(X, y)
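# Hedged check (not in the source): inspect in-sample predictions, since the
# snippet never splits the data.
print(clasificador.predict(X[:5]))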
Example #5

"""
Spyder Editor

This is a temporary script file.
"""

# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


yorumlar = pd.read_csv('Restaurant_Reviews.csv', on_bad_lines='skip')  # error_bad_lines was removed from pandas; on_bad_lines replaces it

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
eklenenDegerler = yorumlar.iloc[:, -1:].values
imputer = imputer.fit(eklenenDegerler[:, -1:])


eklenenDegerler[:, -1:] = imputer.transform(eklenenDegerler[:, -1:])

sonuc1 = pd.DataFrame(data=eklenenDegerler, index= range(716),columns = ['Liked'])
review = yorumlar.iloc[:,0:1].values

sonuc2 = pd.DataFrame(data = review, index= range(716),columns = ['Review'])
yorumlar1 = pd.concat([sonuc2, sonuc1],axis=1)

import nltk
import re
Example #6

import pandas as pd
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# The head of this expression was cut off in the source; it appears to build a
# timestamped output filename:
# str(current_time.h_24()) + str(current_time.minute()) + str(time.time())[:2] + str(framework) + '.txt'

dataset = "uci_bank_marketing_pd"
# data = pd.read_csv(dirt + dataset + ".csv")  # dirt is undefined here; the next line overrides this read
data = pd.read_csv("/home/test/bank.csv", delimiter=';')
print(data.columns)
numeric_features = [
    'age', 'duration', 'pdays', 'previous', 'emp_var_rate', 'cons_price_idx',
    'cons_conf_idx', 'euribor3m', 'nr_employed'
]
categorical_features = [
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
    'month', 'day_of_week', 'campaign', 'poutcome'
]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(sparse=False))])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)])

######################################################################
X = data[categorical_features + numeric_features]
y = data["y"]
lb = preprocessing.LabelBinarizer()
y = lb.fit_transform(y)
##################################################################
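# Hedged continuation (not in the source): the preprocessor defined above is
# otherwise unused in the visible snippet.
from sklearn.model_selection import train_test_split
X_processed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y.ravel(), test_size=0.2, random_state=0)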
Example #7

#@author: Ananya Roy Choudhury

import pandas as pd
import numpy as np

dataset_train = pd.read_csv('train.csv')
dataset_test = pd.read_csv('test.csv')
y_train = dataset_train.iloc[:, 1].values
x_train = dataset_train.iloc[:, [2, 4, 5, 6, 7, 9, 10, 11]].values

x_test2 = dataset_test.iloc[:, [1, 3, 4, 5, 6, 8, 9, 10]].values

#Mean for empty columns
from sklearn.impute import SimpleImputer
#for numerical column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_train[:, [2, 5]] = imputer.fit_transform(x_train[:, [2, 5]])
#for non numeric categorical
imputer1 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x_train[:, [1, 7]] = imputer1.fit_transform(x_train[:, [1, 7]])
#for cabin special
imputer1 = SimpleImputer(missing_values=np.nan,
                         strategy='constant',
                         fill_value='0')
x_train[:, [6]] = imputer1.fit_transform(x_train[:, [6]])  # column 6 (Cabin) only

#Same as above but for test set
#for numerical column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_test2[:, [2, 5]] = imputer.fit_transform(x_test2[:, [2, 5]])
#for non numeric categorical
imputer1 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x_test2[:, [1, 7]] = imputer1.fit_transform(x_test2[:, [1, 7]])
Example #8

#missing values
#---------------------------
#Drop Columns with Missing Values
# Get names of columns with missing values
cols_with_missing = [
    col for col in X_train.columns if X_train[col].isnull().any()
]
# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
#---------------------------
#Imputation
from sklearn.impute import SimpleImputer
# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
#---------------------------
#An Extension to Imputation
# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
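# Sketch of the step these comments build toward (names follow the pattern
# above): impute the augmented frames and restore their column names.
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns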
Example #9

# Random Forest Classification

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('LoanRatio.csv')
X = dataset.iloc[:, 0:22].values
y = dataset.iloc[:, 22].values

#Missing Values
from sklearn.impute import SimpleImputer
simp = SimpleImputer(missing_values=np.nan, strategy='mean')
simp = simp.fit(X[:, 0:22])
X[:, 0:22] = simp.transform(X[:, 0:22])

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
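# Hedged continuation (not in the source): the classifier this example's title
# refers to; the hyperparameters here are assumptions.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)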
Example #10

def preprocess_features(dat):
    """

    Builds processed, model-ready features from the raw input dataframe.

    Inputs
    ----------
    dat: dataframe 
        dataframe containing raw data for feature creation

    Outputs
    ----------
    X: dataframe
        dataframe containing processed features
    
    """
    ## feature groups ##

    titles = ['Dr.', 'Rev.', 'Mr.', 'Miss.', 'Mrs', 'Master']

    cabins = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']

    embarked = ['C', 'Q', 'S']

    ## individual pipelines ##

    pipeline_onehot = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('onehot', OneHotEncoder()),
    ])

    pipeline_onehot_cap = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('cap', cap_value(max_value=2)),
        ('onehot', OneHotEncoder()),
    ])

    pipeline_onehot_embarked = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='nan')),
        ('onehot', OneHotEncoder(categories=[embarked],
                                 handle_unknown='ignore')),
    ])

    pipeline_name = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='nan')),
        ('parsing_name', parse_values(feature_list=titles)),
    ])

    pipeline_cabin = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='no')),
        ('parsing_name', parse_values(feature_list=cabins)),
    ])

    pipeline_cabin_side = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='N')),
        ('cabin_side', cabin_side()),
        ('onehot',
         OneHotEncoder(categories=[['Port', 'Starboard', 'N']],
                       handle_unknown='ignore')),
    ])

    pipeline_ordinal = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='nan')),
        ('ordinal', OrdinalEncoder()),
    ])

    pipeline_bin = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('bins',
         KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')),
        ('onehot', OneHotEncoder()),
    ])

    pipeline_age = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('bins', Binarizer(threshold=10)),
    ])

    ## full pipeline ##

    full_pipeline = ColumnTransformer([
        ('oneshot_pclass', pipeline_onehot, ['pclass']),
        ('parsing_name', pipeline_name, ['name']),
        ('ordinal', pipeline_ordinal, ['sex']),
        ('binarizer_age', pipeline_age, ['age']),
        ('imputer_sibsp', pipeline_onehot_cap, ['sibsp']),
        ('imputer_parch', pipeline_onehot_cap, ['parch']),
        ('bins_fare', pipeline_bin, ['fare']),
        ('parsing_cabin', pipeline_cabin, ['cabin']),
        ('parsing_cabin_side', pipeline_cabin_side, ['cabin']),
        ('oneshot_embarked', pipeline_onehot_embarked, ['embarked']),
    ])

    X = full_pipeline.fit_transform(dat)

    feature_names = ['pclass_' + str(i) for i in set(dat['pclass'])] \
    + ['name_' + t.lower().replace('.','') for t in titles] \
    + ['sex_male'] \
    + ['age_10+'] \
    + ['sibsp_' + i for i in ['0', '1', '2+']] \
    + ['parch_' + i for i in ['0', '1', '2+']] \
    + ['fare_q' + str(i) for i in np.arange(1,6)] \
    + ['cabin_' + c for c in cabins] \
    + ['cabin_' + c for c in ['Port', 'Starboard', 'NoCabin']] \
    + ['embarked_' + i for i in embarked]

    X = pd.DataFrame(X, index=dat.index, columns=feature_names)

    return X
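# Hypothetical usage sketch: assumes a Titanic-style CSV with the lowercase
# column names this pipeline selects, and that the helper transformers
# (cap_value, parse_values, cabin_side) are defined elsewhere in the project.
dat = pd.read_csv('titanic.csv')
X = preprocess_features(dat)
print(X.head())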
Example #11

#imputer.fit_transform(X_train)

#X = X.fillna(0) # instead of imputing
from sklearn.impute import SimpleImputer
#imputer = SimpleImputer()

y = df_modified['poi']

# separate financial and mail features to rescale
#X_financial = X.drop(email_features_list, axis=1)
X_financial = X[finance_features_list]
#X_financial['bonus/salary'] = X_financial['bonus'] / X_financial['salary'] # kinda leaky, but imagine we have the data necessary
#X_financial = X_financial.drop(['bonus', 'salary'], axis=1)

#X_financial = X_financial.fillna(0)
imputerF = SimpleImputer(strategy='median')
imputerF.fit(X_financial)
X_financial = imputerF.transform(X_financial)

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
#scalerF = MinMaxScaler()
scalerF = StandardScaler()
scalerF.fit(X_financial)
X_financial = scalerF.transform(X_financial)
X_financial = pd.DataFrame(X_financial, index=X.index.values)

X_mail = X[email_features_list]
X_mail['from_poi/from'] = X_mail['from_this_person_to_poi'] / X_mail[
    'from_messages']
X_mail['to_poi/to'] = (X_mail['from_poi_to_this_person'] /
                       X_mail['to_messages'])  # right-hand side reconstructed to mirror the feature above; the original line was cut off

Example #12

# In[28]:


# Test dataframe
df_test_country = pd.DataFrame([test_country], columns=data_missing.columns)

df_test_country


# In[29]:


pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())])


# In[30]:


pipeline.fit(data_missing.drop(columns=['Country','Region'], axis=1))
test_pipeline = pipeline.transform(df_test_country.drop(columns=['Country','Region'], axis=1))
test_pipeline


# In[31]:


df_test = pd.DataFrame(test_pipeline, columns=df_test_country.drop(columns=['Country','Region'], axis=1).columns)
Example #13

import numpy as np
import pandas as pd
ds = pd.read_csv("Data.csv")
x = ds.iloc[:, 0:3].values
y = ds.iloc[:, 3].values
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy="median")
imp = imp.fit(x[:, 1:3])
x[:, 1:3] = imp.transform(x[:, 1:3])
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
lec = LabelEncoder()
lec = lec.fit(x[:, 0])
x[:, 0] = lec.transform(x[:, 0])
# OneHotEncoder no longer accepts categorical_features; a ColumnTransformer
# selects the column instead
ohe = ColumnTransformer([('onehot', OneHotEncoder(sparse=False), [0])],
                        remainder='passthrough')
x = ohe.fit_transform(x)
lec = lec.fit(y)
y = lec.transform(y)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
Example #14

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#Preprocessing for numerical data

numerical_transformer = SimpleImputer(strategy='constant')

#Preprocessing for categorical data

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#Bundle preprocessing for numerical and categorical data

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=0)


from sklearn.metrics import mean_absolute_error
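# Hedged continuation (not in the source): bundle the preprocessor and model in
# one Pipeline and score it; X_train/X_valid/y_train/y_valid are assumed.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])
my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_valid)
print('MAE:', mean_absolute_error(y_valid, preds))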
Example #15

import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler


# --------------------------------------------------
# Get Stock Prices
# --------------------------------------------------
prices = pd.read_csv('../resources/stock_prices/stock_prices_750.txt', sep='\t', header=0, index_col=0).iloc[:, 10:15]

# -----------------------------------------------
# Clean Prices
# -----------------------------------------------
col_names = prices.columns
prices = SimpleImputer().fit_transform(prices)
prices = MinMaxScaler().fit_transform(prices)

bins = np.linspace(0, 1, 20)
prices = np.digitize(prices, bins, right=True)
prices = DataFrame(prices, columns=col_names)

# --------------------------------------------------
# Plot Stock Prices
# --------------------------------------------------
# _, ax = plt.subplots()
# ax.plot(prices.index.values.tolist(), prices)
# plt.show()

num_cols = len(col_names)
tre_matrix = np.zeros(shape=(num_cols, num_cols))
Example #16

def clean_data(data):

    # Copy data
    X = data.to_pandas_dataframe()
    X.set_index('Id',inplace=True)
    print(X.head())
    print()

    # Remove rows with missing target, separate target from predictors
    X.dropna(axis=0, subset=['SalePrice'], inplace=True)
    y = X.SalePrice 

    # Remove target and 'Utilities' 
    X.drop(['SalePrice', 'Utilities'], axis=1, inplace=True)

    print(X.shape)

    # Select object columns
    categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]

    # Select numeric columns
    numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64','float64']]

    # Imputation lists

    # imputation of null values in these numerical columns needs to be 'constant'
    constant_num_cols = ['GarageYrBlt', 'MasVnrArea']
    #constant_num_cols = ['MasVnrArea']
    print("constant_num_cols")
    print(constant_num_cols)
    print()

    # imputation of null values in these numerical columns needs to be 'mean'
    mean_num_cols = list(set(numerical_cols).difference(set(constant_num_cols)))
    print("mean_num_cols")
    print(mean_num_cols)
    print()

    # imputation of null values in these categorical columns needs to be 'constant'
    constant_categorical_cols = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond','BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
    print("constant_categorical_cols")
    print(constant_categorical_cols)
    print()

    # imputation of null values in these categorical columns needs to be 'most_frequent'
    mf_categorical_cols = list(set(categorical_cols).difference(set(constant_categorical_cols)))
    print("mf_categorical_cols")
    print(mf_categorical_cols)
    print()

    my_cols = constant_num_cols + mean_num_cols + constant_categorical_cols + mf_categorical_cols
    print("my_cols")
    print(my_cols)
    print()

    # Define transformers
    # Preprocessing for numerical data

    numerical_transformer_m = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])

    numerical_transformer_c = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', StandardScaler())])

    # Preprocessing for categorical data for most frequent
    categorical_transformer_mf = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))])

    # Preprocessing for categorical data for constant
    categorical_transformer_c = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))])


    # Bundle preprocessing for numerical and categorical data
    #preprocessor = ColumnTransformer(transformers=[
    #    ('num_mean', numerical_transformer_m, mean_num_cols),
    #    ('num_constant', numerical_transformer_c, constant_num_cols),
    #    ('cat_mf', categorical_transformer_mf, mf_categorical_cols),
    #    ('cat_c', categorical_transformer_c, constant_categorical_cols)])
    preprocessor = ColumnTransformer(transformers=[
        ('num_mean', numerical_transformer_m, mean_num_cols),
        ('cat_mf', categorical_transformer_mf, mf_categorical_cols),
        ('cat_c', categorical_transformer_c, constant_categorical_cols)])

    X = preprocessor.fit_transform(X)
    
    
    return X, y
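# Hedged usage sketch: the to_pandas_dataframe() call implies `data` is an
# Azure ML tabular dataset; how it is obtained is not shown in the source.
X, y = clean_data(data)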
Example #17

import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier  # standalone-Keras import paths assumed
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

# Function to create model, required for the KerasClassifier
def create_model():
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=8, activation="relu"))
    model.add(Dense(8, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
    return model
# fix random seed for reproducibility
seed = 42
# Load the dataset
data = pd.read_csv('diabetes.csv')
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
# Split the dataset into 80% training and 20% testing sets   
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Impute the missing values using feature median values
imputer = SimpleImputer(missing_values=0,strategy='median')
X_train2 = imputer.fit_transform(X_train)
X_test2 = imputer.transform(X_test)
# Convert the numpy array into a Dataframe
X_train3 = pd.DataFrame(X_train2)
# create model
model = KerasClassifier(build_fn=create_model, epochs=150, batch_size=10, verbose=0)
# evaluate using 10-fold cross validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Evaluate using cross_val_score function
results = cross_val_score(model, X_train2, y_train, cv=kfold)
print(results.mean())
Example #18

 # (the enclosing function's definition was cut off in the source)
 plt.show()
 return [acc_score, f1_avg, std_avg, C, penalty]

rawdata = readFile('percentileDatasetCombined.csv')
data=rawdata
dataWithLabel=data
labels = data[0:,[22]] # for oxygenation use data[0:,[23]]; for complication use data[0:,[22]]
data=data[0:,[1,2,3,4,5,6,7,8,14,15,16,17,18,19,20,21]]
data = np.array(data).astype(float)

X=data
y=labels
y=y.ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
imp_mean = SimpleImputer(missing_values=-1, strategy='mean')
imp_mean = imp_mean.fit(X_train)
X_train = imp_mean.transform(X_train)

# Reuse the imputer fitted on the training set so the test rows are filled
# with training-set means
X_test = imp_mean.transform(X_test)

# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)
# print("Number transactions X_train dataset: ", X_train.shape)
# print("Number transactions y_train dataset: ", y_train.shape)
# print("Number transactions X_test dataset: ", X_test.shape)
# print("Number transactions y_test dataset: ", y_test.shape)
#
Example #19

## Looking at the data types
#data2.dtypes
## Removing the object features. Maybe, we will onehot-encode them later
#data3 = data2.drop(['field', 'from', 'career'], axis=1)
## Make a heatmap
#plt.subplots(figsize=(20,15))
#ax = plt.axes()
#ax.set_title("Correlation Heatmap")
#corr = data3.corr()
#sns.heatmap(corr,xticklabels=corr.columns.values,yticklabels=corr.columns.values)
#%%
#Alternative way to filter away columns with too many NaN values.
#Preserve the shar values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
imputer.fit(raw_data[["shar", "shar_o"]])
shar = pd.DataFrame(imputer.transform(raw_data[["shar", "shar_o"]]), columns=["shar", "shar_o"], index=raw_data.index)
raw_data = replaceGroup(raw_data, shar)


null_sum = raw_data.isnull().sum()
too_many_nans = null_sum[null_sum < 750].index.values
too_many_nans = [str(index) for index in too_many_nans]
data = raw_data[too_many_nans]
data = data.dropna()
data = data.drop(["field", "from", "career"], axis=1)

#%%One hot encoding
data = data[data.columns.drop(list(data.filter(regex="_3")))]
Example #20

import pandas as pd
from bayes_opt import BayesianOptimization
from catboost import cv, CatBoostRegressor, Pool
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

train_df = pd.read_csv('training.csv')

X = train_df.drop('Instance', axis=1)
X = X.drop('Income in EUR', axis=1)
y = train_df['Income in EUR']

X_pred = pd.read_csv('test.csv')
X_pred = X_pred.drop('Income', axis=1)
X_pred = X_pred.drop('Instance', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1)

ct = ColumnTransformer(transformers=[('num_imp', SimpleImputer(strategy='median'), [0, 2, 4, 9]), ('cat_imp', SimpleImputer(strategy='most_frequent'), [1, 3, 5, 6, 7, 8])], remainder='passthrough')

ct.fit(X_train, y_train)
X_train = ct.transform(X_train)
X_test = ct.transform(X_test)

jobs = X_train[:,6]
senior_job_terms = ['senior', 'manager', 'doctor', 'lawyer', 'analyst', 'programmer', 'specialist', 'supervisor', 'chief']
senior_job = []
for j in jobs:
    found = False
    for s in senior_job_terms:
        if s in j:
            senior_job.append('yes')
            found = True
            break
    if not found:
        senior_job.append('no')
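# Hedged continuation (not in the source): append the derived flag as an extra
# feature column.
import numpy as np
X_train = np.column_stack([X_train, senior_job])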
Example #21

import numpy as np
from dataclasses import dataclass
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#%%
#  Now train/test split:
tv_f, test_f = train_test_split(df, test_size=0.25, random_state=RANDOM_SEED)
train_f, vali_f = train_test_split(tv_f, test_size=0.25, random_state=RANDOM_SEED)

y_train = np.array(train_f.pop(PREDICT_COL).array)
y_vali = np.array(vali_f.pop(PREDICT_COL).array)
y_test = np.array(test_f.pop(PREDICT_COL).array)

#%%
# Now process data:
# Note, we don't NEED DictVectorizer... why?

# Let's fix missing values;
fix_missing = SimpleImputer(missing_values=-200.0)

scaler = StandardScaler()

X_train = scaler.fit_transform(fix_missing.fit_transform(train_f))
X_vali = scaler.transform(fix_missing.transform(vali_f))
X_test = scaler.transform(fix_missing.transform(test_f))


@dataclass
class LinearRegressionModel:
    # Managed to squeeze bias into this weights array by adding some +1s.
    weights: np.ndarray

    @staticmethod
    def random(D: int) -> "LinearRegressionModel":
Example #22

# Importing the dataset (train)
dataset = pd.read_csv('Training_data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 13].values

# Importing the dataset (test)
dataset = pd.read_csv('Test_data.csv')
X_test = dataset.iloc[:, :].values

# Missing data

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(copy=True,
                        fill_value=None,
                        missing_values=np.nan,
                        strategy='mean',
                        verbose=0)

imputer = imputer.fit(X[:, 0:13])
X[:, 0:13] = imputer.transform(X[:, 0:13])

#TEST
# Reuse the imputer fitted on the training data so test columns are filled
# with training-set means
X_test[:, :] = imputer.transform(X_test[:, :])

#Feature Scaling
from sklearn.preprocessing import StandardScaler as ss
sc = ss()
X = sc.fit_transform(X)
X_test = sc.transform(X_test)  # transform only; the scaler stays fitted on the training data
Example #23

print_df(df)
"""
+------+------+------+------+------+
|      | 국어 | 영어 | 수학 | 과학 |
+------+------+------+------+------+
| 철수 | 98.0 | nan  | 88.0 | 64.0 |
| 영희 | 88.0 | 90.0 | 62.0 | 72.0 |
| 민철 | 92.0 | 70.0 | nan  | nan  |
| 수현 | 63.0 | 60.0 | 31.0 | 70.0 |
| 호영 | nan  | 50.0 | nan  | 88.0 |
+------+------+------+------+------+
"""

#1) Define the rule for handling missing values
#    → replace each missing value with the column's mean score
imr = SimpleImputer(missing_values=numpy.nan, strategy="mean")

#2) Apply the rule to the dataframe's values
df_imr = imr.fit_transform(df.values)

#3) Build a new dataframe from the imputed values
re_df2 = DataFrame(df_imr, index=df.index, columns=df.columns)
print_df(re_df2)
"""
+------+-------+------+--------------------+------+
|      |  국어 | 영어 |        수학        | 과학 |
+------+-------+------+--------------------+------+
| 철수 |  98.0 | 67.5 |        88.0        | 64.0 |
| 영희 |  88.0 | 90.0 |        62.0        | 72.0 |
| 민철 |  92.0 | 70.0 | 60.333333333333336 | 73.5 |
| 수현 |  63.0 | 60.0 |        31.0        | 70.0 |
| 호영 | 85.25 | 50.0 | 60.333333333333336 | 88.0 |
+------+-------+------+--------------------+------+
"""
Example #24

forest_model = RandomForestRegressor(n_estimators=100, random_state=0)
forest_model.fit(train_features, train_target)
melb_preds = forest_model.predict(val_features)
print('MAE_random_forest:')
MAE_RF = mean_absolute_error(val_target, melb_preds)  # val_target assumed to exist, mirroring train_target/val_features above
print(MAE_RF)

# random forest - cross validation
heart_features = [
    'Age', 'Gender', 'Chest_Pain', 'Resting_BP', 'Cholesterol', 'Fasting_BS',
    'RECG', 'Max_Heart_Rate', 'Exercise_Ang', 'ST_Depression', 'ST_Segmen',
    'Major_Vessels', 'Thalassemia'
]
features = heart_data[heart_features]
my_pipeline = Pipeline(
    steps=[('preprocessor', SimpleImputer()
            ), ('model',
                RandomForestRegressor(n_estimators=10, random_state=0))])
scores = -1 * cross_val_score(
    my_pipeline, features, target, cv=10, scoring='neg_mean_absolute_error')

print("MAE cross:\n", scores)
print("Average MAE score (across experiments):")
cross = scores.mean()
print(scores.mean())

# extreme gradient boost
heart_features = [
    'Age', 'Gender', 'Chest_Pain', 'Resting_BP', 'Cholesterol', 'Fasting_BS',
    'RECG', 'Max_Heart_Rate', 'Exercise_Ang', 'ST_Depression', 'ST_Segmen',
    'Major_Vessels', 'Thalassemia'
]
Example #25

from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    # The class statement and __init__ were cut off in the source; they are
    # reconstructed here from the usage in num_pipeline below
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names]


num_attrb_selected = [
    "Rooms", "Distance", "Bedroom2", "Bathroom", "Car", "Landsize",
    "Lattitude", "Longtitude"
]

num_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(num_attrb_selected)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])


class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series(
            [X[c].value_counts().index[0] for c in X], index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)


cat_attrb_selected = [
Example #26

X_train_post_hoc = df

df_test, y_test = load_combine_data(X_test, merged_data, dmri)
X_test_post_hoc = df_test

df = df.drop(columns=['eid', '20016-2.0'], axis=1)
df_test = df_test.drop(columns=['eid', '20016-2.0'], axis=1)

estimator = RandomForestRegressor(n_estimators=250,
                                  criterion='mse',
                                  n_jobs=10,
                                  verbose=1,
                                  random_state=0)

pipeline = Pipeline([('imputation',
                      make_union(SimpleImputer(strategy="median"),
                                 MissingIndicator())),
                     ('estimator', estimator)])

cv = ShuffleSplit(n_splits=100, test_size=0.1, random_state=0)

param_grid = {
    'estimator__max_depth': [5, 10, 20, 40, None],
    'estimator__max_features': [1, 5, 'log2', 'sqrt', 'auto', None]
}
grid_search = GridSearchCV(pipeline,
                           param_grid=param_grid,
                           cv=5,
                           verbose=2,
                           n_jobs=10)
metrics = []
Example #27

    # then one-hot encode categorical variables

    if args.dataset == "flchain":
        df = pd.read_csv("./data/surv/flchain.csv")
        E = df["death"]
        T = df["futime"]
        X = df >> drop(X.death, X.futime, X.chapter) \
                >> mutate(mgus=X.mgus.astype(float), age=X.age.astype(float))
        X = X[T > 0]
        E = E[T > 0]
        T = T[T > 0]
        #Y = np.c_[np.log(T) - np.mean(np.log(T)), C]
        Y = Y_join(T, E)
        X_num = X.select_dtypes(include=["float"])
        X_cat = X.select_dtypes(exclude=["float"])
        imputer = SimpleImputer(strategy="median")
        X_num = imputer.fit_transform(X_num.values)
        imputer = SimpleImputer(strategy="most_frequent")
        X_cat = imputer.fit_transform(X_cat.values)
        encoder = OneHotEncoder(sparse=False)
        X_cat = encoder.fit_transform(X_cat)
        X = np.c_[X_num, X_cat]

    elif args.dataset == "support":
        df = pd.read_csv("./data/surv/support2.csv")
        df = df.rename(columns={"d.time": "dtime"})
        T = df["dtime"]
        E = df["death"]
        #Y = np.c_[np.log(T) - np.mean(np.log(T)), C]
        Y = Y_join(T, E)
        df >>= drop(X.dtime, X.death, X.hospdead, X.prg2m, X.prg6m, X.dnr,
Example #28

import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer

NB = GaussianNB()
accuracy=[]
cv=[]
fsc=[]
for i in range(0,10):
    data = read_csv("lung_cancer.csv")
    X=data.iloc[:,1:].values
    Y=data.iloc[:,0]

    imp = SimpleImputer(missing_values='?', strategy='most_frequent')
    X=imp.fit_transform(X)
    X=pd.DataFrame(X)

    X_train, X_test, Y_train, Y_test =train_test_split(X,Y, test_size=0.2)

    NB.fit(X_train, Y_train)

    Y_pred = NB.predict(X_test)

    print(Y_pred)

    cross_val= np.max(cross_val_score(NB,X_train,Y_train,cv=5))
    cm= confusion_matrix(Y_test, Y_pred)
    print("\nCross Validation Score: ", cross_val)
    cv.append(cross_val)
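    # Hedged completion (not in the source): the accuracy and fsc lists above
    # are declared but never filled in the visible snippet.
    from sklearn.metrics import accuracy_score, f1_score
    accuracy.append(accuracy_score(Y_test, Y_pred))
    fsc.append(f1_score(Y_test, Y_pred, average='macro'))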
Example #29

"""
@author: Rajat sharma
"""
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Importing the data set
dataset = pd.read_csv('placement_data')
X = dataset.iloc[:, 1:-1].values
Y = dataset.iloc[:, -1].values

# Removing the NaN values
from sklearn.impute import SimpleImputer

missing_values = SimpleImputer(missing_values=np.nan, strategy='constant')
Y = Y.reshape(-1, 1)
missing_values = missing_values.fit(Y)
Y = missing_values.transform(Y)

# Encoding the categorical data
from sklearn.preprocessing import LabelEncoder

LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])
X[:, 2] = LabelEncoder_X.fit_transform(X[:, 2])
X[:, 4] = LabelEncoder_X.fit_transform(X[:, 4])
X[:, 5] = LabelEncoder_X.fit_transform(X[:, 5])
X[:, 7] = LabelEncoder_X.fit_transform(X[:, 7])
X[:, 8] = LabelEncoder_X.fit_transform(X[:, 8])
X[:, 10] = LabelEncoder_X.fit_transform(X[:, 10])
Example #30

import pandas as pd

base = pd.read_csv('credit_data.csv')
base.loc[base.age < 0, 'age'] = 40.92

previsores = base.iloc[:,1:4].values
classe = base.iloc[:,4].values

from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
imputer = imputer.fit(previsores[:,0:3])
previsores[:,0:3] = imputer.transform(previsores[:,0:3])

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(previsores, classe, test_size=0.25, random_state=0)

# The import and classifier were left blank in the source; GaussianNB is one
# plausible choice here
from sklearn.naive_bayes import GaussianNB
classificador = GaussianNB()
classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)

from sklearn.metrics import confusion_matrix, accuracy_score
precisao = accuracy_score(classe_teste, previsoes)
matriz = confusion_matrix(classe_teste, previsoes)

from collections import Counter
Counter(classe_teste)