Example No. 1
def test_imputation_error_invalid_strategy(strategy):
    X = np.ones((3, 5))
    X[0, 0] = np.nan

    with pytest.raises(ValueError, match=str(strategy)):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit_transform(X)
Example No. 2
def test_imputation_deletion_warning(strategy):
    X = np.ones((3, 5))
    X[:, 0] = np.nan

    with pytest.warns(UserWarning, match="Deleting"):
        imputer = SimpleImputer(strategy=strategy, verbose=True)
        imputer.fit_transform(X)
Example No. 3
def test_imputation_mean_median_error_invalid_type(strategy, dtype):
    X = np.array([["a", "b", 3],
                  [4, "e", 6],
                  ["g", "h", 9]], dtype=dtype)

    with pytest.raises(ValueError, match="non-numeric data"):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit_transform(X)
Example No. 4
def test_imputation_constant_error_invalid_type(X_data, missing_value):
    # Verify that exceptions are raised on invalid fill_value type
    X = np.full((3, 5), X_data, dtype=float)
    X[0, 0] = missing_value

    with pytest.raises(ValueError, match="imputing numerical"):
        imputer = SimpleImputer(missing_values=missing_value,
                                strategy="constant",
                                fill_value="x")
        imputer.fit_transform(X)
Example No. 5
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent']:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(X)
        assert_equal(X_imputed.shape, (10, 2))
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert_equal(X_imputed.shape, (10, 2))
Example No. 6
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent', "constant"]:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert X_imputed.shape == (10, 2)
        X_imputed = imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)

        iterative_imputer = IterativeImputer(initial_strategy=strategy)
        X_imputed = iterative_imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)
Example No. 7
def test_imputation_add_indicator(marker):
    X = np.array([
        [marker, 1,      5,       marker, 1],
        [2,      marker, 1,       marker, 2],
        [6,      3,      marker,  marker, 3],
        [1,      2,      9,       marker, 4]
    ])
    X_true = np.array([
        [3., 1., 5., 1., 1., 0., 0., 1.],
        [2., 2., 1., 2., 0., 1., 0., 1.],
        [6., 3., 5., 3., 0., 0., 1., 1.],
        [1., 2., 9., 4., 0., 0., 0., 1.]
    ])

    imputer = SimpleImputer(missing_values=marker, add_indicator=True)
    X_trans = imputer.fit_transform(X)

    assert_allclose(X_trans, X_true)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))
Example No. 8
def test_simple_imputation_add_indicator_sparse_matrix(arr_type):
    X_sparse = arr_type([
        [np.nan, 1, 5],
        [2, np.nan, 1],
        [6, 3, np.nan],
        [1, 2, 9]
    ])
    X_true = np.array([
        [3., 1., 5., 1., 0., 0.],
        [2., 2., 1., 0., 1., 0.],
        [6., 3., 5., 0., 0., 1.],
        [1., 2., 9., 0., 0., 0.],
    ])

    imputer = SimpleImputer(missing_values=np.nan, add_indicator=True)
    X_trans = imputer.fit_transform(X_sparse)

    assert sparse.issparse(X_trans)
    assert X_trans.shape == X_true.shape
    assert_allclose(X_trans.toarray(), X_true)
Example No. 9
 def __call__(self, data):
     from Orange.data.sql.table import SqlTable
     if isinstance(data, SqlTable):
         return Impute()(data)
     imputer = SimpleImputer(strategy=self.strategy)
     X = imputer.fit_transform(data.X)
     # Create new variables with appropriate `compute_value`, but
     # drop the ones which do not have valid `imputer.statistics_`
     # (i.e. all NaN columns). `sklearn.preprocessing.Imputer` already
     # drops them from the transformed X.
     features = [impute.Average()(data, var, value)
                 for var, value in zip(data.domain.attributes,
                                       imputer.statistics_)
                 if not np.isnan(value)]
     assert X.shape[1] == len(features)
     domain = Orange.data.Domain(features, data.domain.class_vars,
                                 data.domain.metas)
     new_data = data.transform(domain)
     new_data.X = X
     return new_data
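The all-NaN-column check in the example above relies on SimpleImputer discarding columns whose fitted statistic is NaN. A minimal sketch of that behaviour (assuming the mean strategy; not part of the original snippet):

import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, np.nan],
              [3.0, np.nan],
              [5.0, np.nan]])  # the second column is entirely missing

imp = SimpleImputer(strategy="mean")
X_out = imp.fit_transform(X)
print(imp.statistics_)  # [3. nan] -- NaN marks the dropped column
print(X_out.shape)      # (3, 1): the all-NaN column was removed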
Example No. 10
def test_imputation_constant_integer():
    # Test imputation using the constant strategy on integers
    X = np.array([
        [-1, 2, 3, -1],
        [4, -1, 5, -1],
        [6, 7, -1, -1],
        [8, 9, 0, -1]
    ])

    X_true = np.array([
        [0, 2, 3, 0],
        [4, 0, 5, 0],
        [6, 7, 0, 0],
        [8, 9, 0, 0]
    ])

    imputer = SimpleImputer(missing_values=-1, strategy="constant",
                            fill_value=0)
    X_trans = imputer.fit_transform(X)

    assert_array_equal(X_trans, X_true)
Example No. 11
def test_imputation_constant_object(marker):
    # Test imputation using the constant strategy on objects
    X = np.array([
        [marker, "a", "b", marker],
        ["c", marker, "d", marker],
        ["e", "f", marker, marker],
        ["g", "h", "i", marker]
    ], dtype=object)

    X_true = np.array([
        ["missing", "a", "b", "missing"],
        ["c", "missing", "d", "missing"],
        ["e", "f", "missing", "missing"],
        ["g", "h", "i", "missing"]
    ], dtype=object)

    imputer = SimpleImputer(missing_values=marker, strategy="constant",
                            fill_value="missing")
    X_trans = imputer.fit_transform(X)

    assert_array_equal(X_trans, X_true)
Example No. 12
def test_imputation_constant_pandas(dtype):
    # Test imputation using the constant strategy on pandas df
    pd = pytest.importorskip("pandas")

    f = io.StringIO("Cat1,Cat2,Cat3,Cat4\n"
                    ",i,x,\n"
                    "a,,y,\n"
                    "a,j,,\n"
                    "b,j,x,")

    df = pd.read_csv(f, dtype=dtype)

    X_true = np.array([
        ["missing_value", "i", "x", "missing_value"],
        ["a", "missing_value", "y", "missing_value"],
        ["a", "j", "missing_value", "missing_value"],
        ["b", "j", "x", "missing_value"]
    ], dtype=object)

    imputer = SimpleImputer(strategy="constant")
    X_trans = imputer.fit_transform(df)

    assert_array_equal(X_trans, X_true)
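The expected output above relies on the documented default fill_value of the constant strategy: 0 for numeric data and the string "missing_value" for string/object data. A quick check (a minimal sketch, not part of the original test):

import numpy as np
from sklearn.impute import SimpleImputer

print(SimpleImputer(strategy="constant").fit_transform(
    np.array([[np.nan, 2.0]])))                    # [[0. 2.]]
print(SimpleImputer(strategy="constant").fit_transform(
    np.array([[np.nan, "x"]], dtype=object)))      # [['missing_value' 'x']]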
Example No. 13
def test_imputation_constant_float(array_constructor):
    # Test imputation using the constant strategy on floats
    X = np.array([
        [np.nan, 1.1, 0, np.nan],
        [1.2, np.nan, 1.3, np.nan],
        [0, 0, np.nan, np.nan],
        [1.4, 1.5, 0, np.nan]
    ])

    X_true = np.array([
        [-1, 1.1, 0, -1],
        [1.2, -1, 1.3, -1],
        [0, 0, -1, -1],
        [1.4, 1.5, 0, -1]
    ])

    X = array_constructor(X)

    X_true = array_constructor(X_true)

    imputer = SimpleImputer(strategy="constant", fill_value=-1)
    X_trans = imputer.fit_transform(X)

    assert_allclose_dense_sparse(X_trans, X_true)
Example No. 14
    #print_histogram(d)
    return results


x_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")

y_train = y_train.drop('id', axis=1)
x_train = x_train.drop('id', axis=1)

imputer = SimpleImputer(missing_values=numpy.nan, strategy='median')
# est = ExtraTreesRegressor(n_estimators=10, random_state=42, max_features='sqrt', n_jobs=-1, verbose=0)
# imputer = IterativeImputer( estimator=est, max_iter=10, tol=0.001, n_nearest_features=100
#                               , initial_strategy='median', imputation_order='ascending', verbose=2
#                               , random_state=0)
x_train_filled = imputer.fit_transform(x_train)
x_train = pd.DataFrame(x_train_filled)

results = [
    44, 108, 137, 268, 332, 341, 461, 502, 580, 606, 664, 797, 833, 839, 882,
    1007, 1018, 1148
]  # this is result after 1000 isolation forests

# 3. Scaling

scaler = RobustScaler()
x_train_new = scaler.fit_transform(x_train)
cols = list(x_train.columns.values)
x_train = pd.DataFrame(data=x_train_new, columns=cols, index=x_train.index)

#results = detect_outliers(x_train)
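The commented-out alternative above uses IterativeImputer, which is still experimental and must be enabled explicitly before it can be imported. A minimal sketch of that setup (assuming scikit-learn >= 0.21 and smaller parameters than the commented call):

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor

est = ExtraTreesRegressor(n_estimators=10, random_state=42, n_jobs=-1)
iter_imputer = IterativeImputer(estimator=est, max_iter=10, tol=0.001,
                                initial_strategy='median', random_state=0)
X = np.array([[1.0, 2.0], [3.0, np.nan], [np.nan, 6.0]])
print(iter_imputer.fit_transform(X))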
Example No. 15
import numpy as np
import pandas as pd

print("Carregando a base de dados...")
baseDeDados = pd.read_csv('admission.csv', delimiter=';')
X = baseDeDados.iloc[:,:-1].values
y = baseDeDados.iloc[:,-1].values
print("ok!")

print("Preenchendo dados que estão faltando...")
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer = imputer.fit_transform(X[:,1:])
print("ok!")

print("Computando rotulação...")
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

X = X[:,1:]
D = pd.get_dummies(X[:,0])
X = np.insert(X, 0, D.values, axis=1)
print("ok!")

print("Separando conjuntos de teste e treino...")
from sklearn.model_selection import train_test_split
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2)
print("ok!")

#remove the DataConversionWarning
df = df.drop(['branch_id', 'seller_code', 'item_total_price', 'register_date'],
             axis=1)

#Filling in missing values.
#The column 'is_churn' is used as the label of the classes (or dependent variable Y),
#	so this is a binary classification and the mean strategy would not be suitable because it would create a third class.
#The most frequent strategy is adopted due to the imbalance between the classes,
#	which makes the probability of filling in the values correctly extremely high.
#Another possibility would be using clustering on the other features of the vectors with non-missing values,
#	so as to create two clusters and then predict in which cluster the vectors with missing values fall. This
#	information could be used as the missing value. But this would require handling the categorical features,
#	normalizing all features and finding an appropriate clustering method. And this task of finding a suitable method
#	could even involve developing a specific distance metric, since the categorical features do not lie in a Euclidean space.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

df.loc[:, :] = imputer.fit_transform(df)

#Feature extraction.
X = compute_features(df)

#Keep feature names to identify the likely reasons later on.
feature_names = [str(name).replace('_', ' ')
                 for name in X.columns.tolist()][1:]

#Separate features and labels.
y = X['is_churn'].to_numpy()
X = X.drop(['is_churn'], axis=1).to_numpy()

#split train and test sets, 25% for test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
Example No. 17
    print(i)
    print(min)
    print(max)
    num_data.loc[num_data[i] < min, i] = np.nan
    num_data.loc[num_data[i] > max, i] = np.nan

import matplotlib.pyplot as plt

plt.hist(num_data.avg_training_score)

mis_val(num_data)

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
num_data = pd.DataFrame(imputer.fit_transform(num_data), columns=num_cols)


#FINDING highly correlated variables
def corr_matrix(data):
    #extract numeric data
    num_data = data.select_dtypes('float64').copy()
    # Create correlation matrix
    corr_matrix = num_data.corr().abs()
    # Select upper triangle of correlation matrix
    corr_mat = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    # Find index of feature columns with correlation greater than 0.95
    to_drop = [
Example No. 18
    # then one-hot encode categorical variables

    if args.dataset == "flchain":
        df = pd.read_csv("./data/surv/flchain.csv")
        E = df["death"]
        Y = df["futime"]
        X = (df >> drop(X.death, X.futime, X.chapter) >> mutate(
            mgus=X.mgus.astype(float), age=X.age.astype(float)))
        X = X[Y > 0]
        E = E[Y > 0]
        Y = Y[Y > 0]
        # Y = np.c_[np.log(T) - np.mean(np.log(T)), C]
        X_num = X.select_dtypes(include=["float"])
        X_cat = X.select_dtypes(exclude=["float"])
        imputer = SimpleImputer(strategy="median")
        X_num = imputer.fit_transform(X_num.values)
        imputer = SimpleImputer(strategy="most_frequent")
        X_cat = imputer.fit_transform(X_cat.values)
        encoder = OneHotEncoder(sparse=False)
        X_cat = encoder.fit_transform(X_cat)
        X = np.c_[X_num, X_cat]

    elif args.dataset == "support":
        df = pd.read_csv("./data/surv/support2.csv")
        df = df.rename(columns={"d.time": "dtime"})
        Y = df["dtime"]
        E = df["death"]
        # Y = np.c_[np.log(Y) - np.mean(np.log(Y)), C]
        df >>= drop(
            X.dtime,
            X.death,
Example No. 19
y = train['Survived']

# Select features
features = ['Pclass', 'Age', 'Fare', 'SibSp', 'Parch']
X = train[features]
X_test = test[features]
# -

# ## Missing Values

# +
# In order to use 'Age' as a feature we need to impute missing values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
X_imp = pd.DataFrame(imputer.fit_transform(X))
X_test_imp = pd.DataFrame(imputer.transform(X_test))

X_imp.columns = X.columns
X_test_imp.columns = X_test.columns

X = X_imp
X_test = X_test_imp
# -

X

X_test

# ## Scaling
Example No. 20
# data.index = data.TIME
data = data.drop(columns=['TIME'])

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
d = {}
by_device = data.groupby('DEVICENAME')
for device, device_df in by_device:
    print(device)
    by_day = device_df.groupby('Day')
    for day, day_df in by_day:
        if 24 > len(day_df) > 12:
            day_df = day_df.sort_values('SSCPUIDLE').drop_duplicates(subset=['Day', 'Hour'], keep='last').sort_values('Hour')
            day_df.index = day_df.Hour
            new_df = pd.DataFrame(index=list(range(0, 24)), columns=day_df.columns)
            new_df.update(day_df)
            new_df.DEVICENAME = device
            new_df.Day = day
            new_df.Hour = new_df.index
            new_df.SSCPUIDLE = imp_mean.fit_transform(new_df.SSCPUIDLE.values.reshape(-1, 1))
            d.update({
                device: [device, day, new_df.SSCPUIDLE.values]
            })
        else:
            continue

df = pd.DataFrame.from_dict(d, orient='index', columns=['Device', 'Day', 'TimeSeries']).reset_index(drop=True)
df['Location'] = df.Device.str.extract(r'([^-]*).*')
ts_data = df.reindex(['Location', 'Device', 'Day', 'TimeSeries'], axis=1)


joblib.dump([data, ts_data], "data/data.job")
Example No. 21
# Data Preprocessing Template

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv(r'c:\src\ML-AtoZ\Part 1 - Data Preprocessing\Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

#Cleaning Data - Using the new functions.
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imp_mean.fit_transform(X[:, 1:3])

#Encode Category Data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le_X = LabelEncoder()
X[:, 0] = le_X.fit_transform(X[:, 0])
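# Note: categorical_features was deprecated in scikit-learn 0.20 and removed in 0.22;
# the ColumnTransformer approach shown below is the current replacement.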
ohe_X = OneHotEncoder(categorical_features=[0])
X = ohe_X.fit_transform(X).toarray()

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
process = make_column_transformer((OneHotEncoder(), [0]),
                                  remainder="passthrough")

A = process.fit_transform(X)
Example No. 22
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO

csv_data = '''A,B,C,D
            1.0,2.0,3.0,4.0
            5.0,6.0,,8.0
            10.0,11.0,12.0,'''

df = pd.read_csv(StringIO(csv_data))
from sklearn.impute import SimpleImputer

imr = SimpleImputer(strategy='constant', fill_value=0)
imr.fit(df)
imr.fit_transform(df)

imputed_data = imr.transform(df.values)
print(imputed_data)
Example No. 23
    train_X = data_set["train_X"].values
    train_y = data_set["train_Y"].values

    for i in range(len(data_set["train_X"])):
        if train_y[i] < 0:
            np.delete(train_y, i, 0)
            np.delete(train_X, i, 0)

    X_train, X_test, y_train, y_test = train_test_split(
        train_X, train_y, test_size=0.1, random_state=0)

    onehot_cats = list()
    for (i, _) in ONEHOT_CATEGORICAL_FEATURE_KEYS:
        cat_impute = SimpleImputer(strategy='constant')
        X_train[:, i] = cat_impute.fit_transform(X_train[:, i].reshape(-1, 1)).reshape(-1)
        onehot = OneHotEncoder()
        onehot_model = onehot.fit(X_train[:, i].reshape(-1, 1))
        onehot_cats.append(onehot_model.categories_)

    tmp_l = list()
    for l in onehot_cats:
        tmp_l.append(l[0].tolist())

    X_train = preprocessing(X_train, categories=tmp_l)
    X_test = preprocessing(X_test, categories=tmp_l)
    X_pred = preprocessing(data_set["pred_X"].values, categories=tmp_l)

    poly = PolynomialFeatures(degree=2)
    X_train = poly.fit_transform(X_train)
    X_test = poly.fit_transform(X_test)
Example No. 24
def init_gmm(features, n_components):
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    init_x = imp.fit_transform(features)
    gmm = GaussianMixture(n_components=n_components,
                          covariance_type='diag').fit(init_x)
    return gmm
Example No. 25
### Approach 1: drop columns with missing values
# Get names of columns with missing value
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]

# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

# Measure quality of the approach 1
print("MAE (drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

### Approach 2: imputation
# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removes column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

# Measure quality of the approach 2
print("MAE (imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

### Approach 3: an extension to imputation
# Make copies to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()
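The snippet stops here; a typical continuation of Approach 3 would add indicator columns before imputing. This is a sketch under that assumption, reusing the names defined above, not the author's exact code:

# Add a boolean column marking which rows were missing, then impute as before
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

print("MAE (an extension to imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))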
# In[ ]:


#let's turn sex into a numerical feature instead of categorical
from sklearn.preprocessing import LabelEncoder
train_data['Sex'] = LabelEncoder().fit_transform(train_data['Sex'])


# In[ ]:


#handling missing values
#print(train_data.isnull().sum())
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
train_data['Age'] = imp.fit_transform(train_data['Age'].values.reshape(-1,1)).reshape(-1)
print(train_data.isnull().sum())


# In[ ]:


# Find correlations with the target and sort
correlations = train_data.corr()['Survived'].sort_values()

# Display correlations
print('Correlations: \n', correlations)


# In[ ]:
Example No. 27
p1d = combine[((combine["Pclass"] == 3) & (combine["Embarked"] == "S"))]

combine["Fare"][1043] = p1d['Fare'].median()

ddf = combine.copy()

ddf.drop([
    'PassengerId', 'Name', 'Ticket', 'Cabin', 'Family', 'surname', 'Survived'
],
         axis=1,
         inplace=True)

from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy="most_frequent")
X = imp.fit_transform(ddf)

X = pd.DataFrame(X).copy()
X.columns = [
    "Age", "Embarked", "Fare", "Parch", "Pclass", "Sex", "SibSp", "title",
    "Fsize", "FsizeD", "Deck"
]

combine["Age"] = X["Age"]

combine["IsAdult"] = np.where(combine['Age'] < 18, '0', '1')

combine["IsMother"] = np.where((combine['Sex'] == "female")
                               & (combine["Parch"] > 0) & (combine["Age"] > 18)
                               & (combine["title"] != "Miss"), '1', '0')
Example No. 28
def Train(gender,age,educ,SES,MMSE,CDR,eTIV,nWBV,ASF):
    
    import pandas as pd
    import numpy as np
    import seaborn as sns
    oasis_long = pd.read_csv('data\\oasis_longitudinal.csv')
    oasis_long = oasis_long.drop(columns = ['Hand', 'MRI ID', 'MR Delay', 'Subject ID',
                                            'Visit'])
    
    y = oasis_long['Group'].astype('category')
    X = oasis_long.iloc[:, 1:]
    #X1 = X
    
    
    X["M/F"].fillna("M",inplace = True)
    X["Age"].fillna(method='ffill',inplace=True)
    X["EDUC"].fillna(method='ffill',inplace=True)
    X["CDR"].fillna("0",inplace = True)
    X["eTIV"].fillna(method='ffill',inplace=True)
    
    
    from sklearn.preprocessing import LabelEncoder
    le =  LabelEncoder()
    X.iloc[:, 0] = le.fit_transform(X.iloc[:, 0])
    
    #Female 0 Male 1
    
    
    from sklearn.impute import SimpleImputer
    imputer_SES = SimpleImputer(missing_values=np.nan, strategy='median')
    imputer_MMSE = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer_nWBV = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer_ASF = SimpleImputer(missing_values=np.nan, strategy='median')
    X.iloc[:, 3:4] = imputer_SES.fit_transform(X.iloc[:, 3:4])   
    X.iloc[:, 4:5] = imputer_MMSE.fit_transform(X.iloc[:, 4:5])
    X.iloc[:,7:8] = imputer_nWBV.fit_transform(X.iloc[:, 7:8])
    X.iloc[:,8:] = imputer_ASF.fit_transform(X.iloc[:, 8:])
    
    '''
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X = sc.fit_transform(X)
    
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
    
    '''
#def __Predict__(self,gender,age,educ,SES,MMSE,CDR,eTIV,nWBV,ASF):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC
    from sklearn.model_selection import cross_val_score
            
    models_list = []
    models_list.append(('LOG', LogisticRegression()))
    models_list.append(('RFC', RandomForestClassifier()))
    models_list.append(('SVM', SVC(gamma = 'scale'))) 
    models_list.append(('NB', GaussianNB()))
    models_list.append(('KNN', KNeighborsClassifier()))
            
    results = []
    names = []
            
    accuracy_score = {}
            
    for name, model in models_list:
        cv_results = cross_val_score(estimator = model, X = X, y = y, cv=10,
                                             scoring='accuracy', n_jobs = -1)
        results.append(cv_results)
        names.append(name)
        #print( "%s: %f " % (name, cv_results.mean()))
        accuracy_score[name]=cv_results.mean()
            
    #print(accuracy_score)
            
    Pred_Form1 = [[gender,age,educ,SES,MMSE,CDR,eTIV,nWBV,ASF]]
            #Pred_Form2 = [[0,60,16,4,30,0,1500,0.8,1]]
            
    max_key = max(accuracy_score, key=accuracy_score.get)
    #print(max_key)
            
    if(max_key=='RFC'):
                
        from sklearn.ensemble import RandomForestClassifier
                #rf = RandomForestClassifier()
        rf = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', bootstrap= True,
                                            max_features = 'auto')
        rf.fit(X, y)
                
        y_pred1 = rf.predict(Pred_Form1)
                #print(y_pred1)
        return y_pred1
                
    elif(max_key=='LOG'):
        #print("CheckPoint1")
                #LogReg
        from sklearn import linear_model
                
        log_reg = linear_model.LogisticRegression(penalty='l2')
        log_reg.fit(X,y)
                
        y_pred_log1 = log_reg.predict(Pred_Form1)
        #print (y_pred_log1)       
        return y_pred_log1
            
    elif(max_key=='NB'):
            #GAUBAS
        from sklearn.naive_bayes import GaussianNB
                
        GB = GaussianNB()
        GB.fit(X,y)
                
        y_pred_GB1 = GB.predict(Pred_Form1)
                
        return y_pred_GB1
Example No. 29
    def random_boruta(self):
        with open(self.result_folder +
                  '/param_CB_{}.json'.format(self.epoch)) as f:
            dati = json.load(f)

            for data in dati:
                del data['value']

                cb_model = CatBoostClassifier(**data)

                cv = StratifiedKFold(n_splits=5, shuffle=True)

                for train_index, test_index in cv.split(self.X, self.y):

                    X_train = self.X.iloc[lambda x: train_index]
                    X_test = self.X.iloc[lambda x: test_index]
                    y_train = np.take(self.y, train_index)
                    y_test = np.take(self.y, test_index)

                    median_imputer = SimpleImputer(missing_values=np.NaN,
                                                   strategy='median')
                    imputer = median_imputer.fit(X_train)
                    vX_train = imputer.transform(X_train)
                    # reuse the statistics fitted on the training fold for the test fold
                    vX_test = imputer.transform(X_test)

                    X_train = pd.DataFrame(vX_train,
                                           columns=X_train.columns,
                                           index=X_train.index)
                    X_test = pd.DataFrame(vX_test,
                                          columns=X_test.columns,
                                          index=X_test.index)
                    Feature_Selector = BorutaShap(model=cb_model,
                                                  importance_measure='shap',
                                                  percentile=90,
                                                  pvalue=0.1,
                                                  classification=True)

                    Feature_Selector.fit(X_train,
                                         y_train,
                                         n_trials=500,
                                         random_state=0)
                    Feature_Selector.TentativeRoughFix()
                    Feature_Selector.plot(X_size=12,
                                          figsize=(12, 8),
                                          y_scale='log',
                                          which_features='all')

                    Xstrain = Feature_Selector.Subset()
                    selected = [x for x in Xstrain.columns]
                    print('features selected', selected)

                    v_test_X = median_imputer.transform(self.X_test)
                    test_X = pd.DataFrame(v_test_X,
                                          columns=self.X_test.columns,
                                          index=self.X_test.index)

                    cb_model.fit(Xstrain, y_train)

                    print('AUC')
                    cb_model.fit(X_train, y_train)
                    roc = roc_auc_score(y_test,
                                        cb_model.predict_proba(X_test)[:, 1])

                    print(roc)

                    print('AUC TEST')
                    roc_test = roc_auc_score(
                        self.y_test,
                        cb_model.predict_proba(test_X)[:, 1])

                    print(roc_test)
        imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
    elif configs['imputer'] == 'min':
        imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=configs['min_val'])
    else:
        logger.warning("No imputer selected!")

    # run grid search
    if configs['metric'] == 'corr':
        rcv = GridSearchCV(reg, param_grid, n_jobs=100, cv=cv, scoring=corr_score, refit=True)
    else:
        rcv = GridSearchCV(reg, param_grid, n_jobs=100, cv=cv, scoring=configs['metric'], refit=True)
    if configs['imputer'].lower() == 'none':
        rcv.fit(X_train, y_train)
        y_pred = rcv.best_estimator_.predict(X_test)
    else:
        rcv.fit(imputer.fit_transform(X_train), y_train)
        y_pred = rcv.best_estimator_.predict(imputer.transform(X_test))

    # gather results
    if 'sign' in configs:
        sign = configs['sign']
    else:
        sign = -1 if configs['task'] == 'regression' else 1

    if configs['task'].lower() == 'classification':
        test_auc = roc_auc_score(y_test, y_pred)
        score_dict = {'drug_id': drug_id, 'val_mae': sign * rcv.best_score_, 'test_score': test_auc}
    else:
        test_mae = mean_absolute_error(y_test, y_pred)
        test_rmse = mean_squared_error(y_test, y_pred, squared=False)
        test_r2 = r2_score(y_test, y_pred)
Example No. 31
def nan_padding(data, columns):
    for column in columns:
        imputer = SimpleImputer()
        data[column] = imputer.fit_transform(data[column].values.reshape(
            -1, 1))
    return data
def function_q17(event):
    global screen, df
    df = pd.read_csv("DATA SET-2.csv")
    root = Toplevel(screen)
    big_frame = tk.Frame(root,
                         bg='white',
                         width='600',
                         height='630',
                         bd=4,
                         relief=RIDGE)
    big_frame.place(x=50, y=60)
    w = 700
    h = 700
    ws = screen.winfo_screenwidth()
    hs = screen.winfo_screenheight()
    x = (ws / 2) - (w / 2)
    y = (hs / 2) - (h / 2)
    root.geometry("%dx%d+%d+%d" % (w, h, x, y))
    root.configure(background='white')

    df.drop(9148, axis=0, inplace=True)
    df.drop(10472, axis=0, inplace=True)

    #print(df['Installs'].head(5))
    df['Installs'] = df['Installs'].map(lambda x: x.rstrip('+'))
    df['Installs'] = df['Installs'].map(lambda x: ''.join(x.split(',')))

    df['Installs'] = pd.to_numeric(df['Installs'])

    # Data cleaning for "Size" column
    df['Size'] = df['Size'].map(lambda x: x.rstrip('M'))
    df['Size'] = df['Size'].map(lambda x: str(
        round((float(x.rstrip('k')) / 1024), 1)) if x[-1] == 'k' else x)
    df['Size'] = df['Size'].map(lambda x: np.nan
                                if x.startswith('Varies') else x)

    df['Size'] = pd.to_numeric(df['Size'])

    # Replace "NaN" with mean
    imputer = SimpleImputer()
    df['Size'] = imputer.fit_transform(df[['Size']])
    df['Installs'] = imputer.fit_transform(df[['Installs']])

    #now creating linear approximation
    x = df['Size'].values.reshape(
        -1, 1
    )  # reshape converts the data into the 2-D column format that fit() requires
    y = df['Installs'].values.reshape(-1, 1)

    reg = LinearRegression()
    reg.fit(x, y)
    #reg.coef_calculates slope , reg.intercept_calculates 'C'
    #print(reg.coef_)
    #print(reg.score(x,y))

    #now creating prediction
    prediction = reg.predict(x)

    #now assesing efficiency using R-squared model
    x = df['Size']
    y = df['Installs']
    x2 = sm.add_constant(
        x
    )  # add_constant adds an intercept column, since x is the independent variable
    # Ordinary least squares is the simplest and most common estimator, in which the betas are chosen to minimize the squared distance between the observed and fitted values
    est = sm.OLS(y, x2)
    est2 = est.fit()
    #print( est2.summary())

    figure3 = plt.Figure(figsize=(5, 4), dpi=100)
    ax3 = figure3.add_subplot(111)
    ax3.scatter(df['Size'], df['Installs'], color='y')
    ax3.plot(df['Size'], prediction, color='r')
    scatter_plot = FigureCanvasTkAgg(figure3, big_frame)
    scatter_plot.get_tk_widget().place(x=50, y=20)
    ax3.legend()
    ax3.set_xlabel("Size of the App")
    ax3.set_ylabel("Installs")
    ax3.set_title("Trend of Install")

    String = """          Conclusion : -  
                Here we have applied Linear Regression to find the Trend 
                As we can observe from above graph There is a Positive Trend 
                From the trend as increase in the size of App influence the 
                number of installs"""
    tk.Label(big_frame,
             text=String,
             font=("Calibri", 13, 'italic'),
             fg='#ad023e',
             bg='white').place(x=10, y=450)

    root.mainloop()
def getPrediction(big_frame):

    global rating, size, installs, price, type, android, df

    df = pd.read_csv("DATA SET-2.csv")

    category = {
        'SPORTS': 0,
        'ENTERTAINMENT': 1,
        'SOCIAL': 2,
        'NEWS_AND_MAGAZINES': 3,
        'EVENTS': 4,
        'TRAVEL_AND_LOCAL': 5,
        'GAME': 6
    }

    for index in range(len(df['Category'])):
        if df['Category'][index] in category:
            continue
        else:
            df.drop(index, axis=0, inplace=True)

    df['Category'] = df['Category'].map(lambda x: category[x]
                                        if (x in category) else -1)

    dict_content_rating = {
        "Adults only 18+": 0,
        "Everyone": 1,
        "Everyone 10+": 2,
        "Mature 17+": 3,
        "Teen": 4
    }

    df['Content Rating NUM'] = df['Content Rating'].map(
        lambda x: dict_content_rating[x] if (x in dict_content_rating) else -1)

    # Data cleaning for "Size" column
    df['Size'] = df['Size'].map(lambda x: x.rstrip('M'))
    df['Size'] = df['Size'].map(lambda x: str(
        round((float(x.rstrip('k')) / 1024), 1)) if x[-1] == 'k' else x)
    df['Size'] = df['Size'].map(lambda x: np.nan
                                if x.startswith('Varies') else x)

    df['Price'] = df['Price'].map(lambda x: x
                                  if x == 0 else x.lstrip('$').rstrip())

    df['Installs'] = df['Installs'].map(lambda x: x.rstrip('+'))
    df['Installs'] = df['Installs'].map(lambda x: ''.join(x.split(',')))

    # Change datatype
    df['Reviews'] = pd.to_numeric(df['Reviews'])
    df['Installs'] = pd.to_numeric(df['Installs'])
    df['Price'] = pd.to_numeric(df['Price'])

    # Replace "NaN" with mean
    imputer = SimpleImputer()
    df['Rating'] = imputer.fit_transform(df[['Rating']])
    # Round the imputed ratings to 1 decimal place
    df['Rating'] = df['Rating'].round(1)
    df.dropna(axis=0, inplace=True)

    #sns.heatmap(df.isnull())

    df['Type'] = df['Type'].map(lambda x: 1 if (x == "Free") else 0)

    global And, val
    And = {}
    val = -1
    df['Android Ver'] = df['Android Ver'].map(lambda x: And[str(x)]
                                              if (str(x) in And) else value(x))
    # Features selection
    features = ['Rating', 'Size', 'Installs', 'Price', 'Type', 'Android Ver']

    #Splitting the data for training and testing
    train, test = train_test_split(df, test_size=0.3)

    #creating a response and target variable
    #taking the training data input
    train_x = train[features]  #multiple independent variables
    train_y = train['Category']  #only one dependent variable
    ##print(list(train.columns))
    train, test = train_test_split(df, test_size=0.3)
    #train,test=train_test_split(df,test_size = 0.2)
    #taking the testing data input
    test_x = test[features]
    test_y = test['Category']
    ##print(list(test.columns))
    """
    #Creating a decision tree model based on the training data
    model = tree.DecisionTreeClassifier()
    model.fit(train_x,train_y)
    #now prediction using the trained model
    prediction = model.predict(test_x)
    #now displaying the predicted vs actual values
    #dataframe = pd.DataFrame(prediction,test_y)
    """
    #idea of random forest to improve accuracy: it builds many small, different trees
    """ RANDOM FOREST  """

    model = RandomForestClassifier(
        n_estimators=100)  #this builds an ensemble of 100 trees
    model.fit(train_x, train_y)
    prediction = model.predict(test_x)
    #now displaying the predicted vs actual values

    #print(metrics.accuracy_score(prediction,test_y))

    #print(classification_report(test_y , prediction))

    rating_app = float(rating.get())
    size_app = float(size.get())
    installs_app = int(installs.get())
    if type == "Free":
        price_app = 0
    else:
        price_app = int(price.get())
    if type == "Free":
        type_app = 1
    else:
        type_app = 0

    android_app = int(And[android.get()])

    prediction = model.predict(
        np.array([
            rating_app, size_app, installs_app, price_app, type_app,
            android_app
        ]).reshape(1, -1))

    #print(prediction)

    #print(model.score(test_x,test_y))

    #print(category)
    val = ""
    for val in category:
        if category[val] == prediction:
            #print(val)
            break

    tk.Label(big_frame,
             text="-----RESULT-----",
             height='2',
             font=("Calibri", 19, 'bold'),
             fg='#ad023e',
             bg='white').place(x=250, y=400)

    #print(val)
    string = "With help of parameters {} category is most likely to be downloaded in comming years".format(
        val)
    tk.Label(big_frame,
             text=string,
             height='2',
             font=("Calibri", 10, 'italic'),
             fg='#ad023e',
             bg='white').place(x=0, y=450)

    string = "Accuracy score for this model is {:.2f}%".format(
        model.score(test_x, test_y) * 100)
    tk.Label(big_frame,
             text=string,
             height='2',
             font=("Calibri", 11, 'italic'),
             fg='#ad023e',
             bg='white').place(x=0, y=500)
# In[17]:


df[colsobject].head()


# In[18]:


### Use SimpleImputer with the most_frequent strategy to impute categorical variables

from sklearn.impute import SimpleImputer

imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

df[colsobject] = imp_mode.fit_transform(df[colsobject])


# In[19]:


### Use SimpleImputer with the median strategy to impute continuous variables

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

df[colsnumeric] = imp_median.fit_transform(df[colsnumeric])


# In[20]:

Example No. 35
X_train_copy = X_train.select_dtypes(exclude=['object'])
X_val_copy = X_val.select_dtypes(exclude=['object'])
object_cols = [
    col for col in X_train.columns if X_train[col].dtype == 'object'
]
X_train_object = X_train.select_dtypes(exclude=['int64', 'float64'])
X_val_object = X_val.select_dtypes(exclude=['int64', 'float64'])
X_test_copy = X_test.select_dtypes(exclude=['object'])
X_test_object = X_test.select_dtypes(exclude=['int64', 'float64'])

# ---------------------------------------------------------------------------------------------

# Imputing missing values
from sklearn.impute import SimpleImputer
final_imputer = SimpleImputer(strategy='most_frequent')
X_train_imputed = pd.DataFrame(final_imputer.fit_transform(X_train_copy),
                               columns=X_train_copy.columns)
X_val_imputed = pd.DataFrame(final_imputer.transform(X_val_copy),
                             columns=X_val_copy.columns)
X_test_imputed = pd.DataFrame(final_imputer.transform(X_test_copy),
                              columns=X_test_copy.columns)

# Recombine the imputed numeric columns with the object columns side by side
X_train_imputed.index = X_train.index
X_train_copy = pd.concat([X_train_object, X_train_imputed], axis=1)
X_train_copy = X_train_copy[X_train.columns]
Example No. 36
# ids : all ids of cascades that have emo AND are in size range
IDs = list(set(tweets.cascade_id).intersection(emos.cascade_id))

shuffle(IDs)
split = int(len(IDs) * split_ratio)
train_ids, test_ids = pd.DataFrame({'cascade_id': IDs[:split]}), pd.DataFrame(
    {'cascade_id': IDs[split:]})

tweets_train = pd.merge(tweets, train_ids, how='inner').reset_index(drop=True)
tweets_test = pd.merge(tweets, test_ids, how='inner').reset_index(drop=True)
emo_train = pd.merge(emos, train_ids, how='inner').reset_index(drop=True)
emo_test = pd.merge(emos, test_ids, how='inner').reset_index(drop=True)

tweets_train[['user_followers', 'user_followees',
              'user_account_age']] = si.fit_transform(tweets_train[[
                  'user_followers', 'user_followees', 'user_account_age'
              ]].values)
tweets_test[['user_followers', 'user_followees',
             'user_account_age']] = si.transform(tweets_test[[
                 'user_followers', 'user_followees', 'user_account_age'
             ]].values)

# get log of vars
for cname in [
        'user_followers', 'user_followees', 'user_engagement',
        'user_account_age', 'retweet_delay'
]:
    tweets_train[cname + '_log'] = logp(tweets_train[cname].values)
    tweets_test[cname + '_log'] = logp(tweets_test[cname].values)

tweets_train[to_standardize] = ss.fit_transform(
Example No. 37
def exploring_data(housing):
    # Display info about the data
    if False:
        display_info(housing)
        print('# of ocean_prox. categories: \n',
              housing["ocean_proximity"].value_counts(), '\n')
    if False:
        plot_hist(housing)

    # ------------------------------
    # Split Data
    # ------------------------------
    if False:
        # normal
        train, test = split_train_test(housing, 0.2)

        # by id
        housing_with_id = housing.reset_index()
        train_set, test_set = split_train_test_by_id(housing_with_id, 0.2,
                                                     "index")

        housing_with_id[
            "id"] = housing["longitude"] * 10**3 + housing["latitude"]
        train_set, test_set = split_train_test_by_id(housing_with_id, 0.2,
                                                     "id")

        # use scikit-learn (equivalent to split_train_test)
        train_set, test_set = model_selection.train_test_split(housing,
                                                               test_size=0.2,
                                                               random_state=42)

    if True:
        # if it is important to keep the distribution of income_cat
        bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
        test_size = 0.2

        strat_train_set, strat_test_set = stratified_split(housing,
                                                           cat="median_income",
                                                           bins=bins,
                                                           test_size=test_size)

        housing = strat_train_set.copy()

    # ------------------------------
    # Investigate Data
    # ------------------------------
    if False:
        housing.plot(
            kind="scatter",
            x="longitude",
            y="latitude",
            alpha=0.4,
            s=housing["population"] / 100,
            label="population",
            figsize=(10, 7),
            c="median_house_value",
            cmap=plt.get_cmap("jet"),
            colorbar=True,
        )
        plt.legend()
        plt.show()

    # Correlation
    if False:
        corr_matrix = housing.corr()
        print(corr_matrix["median_house_value"].sort_values(ascending=False))

        # Plot correlation as scatter plots for diff attributes
        attributes = [
            "median_house_value", "median_income", "total_rooms",
            "housing_median_age"
        ]
        pd.plotting.scatter_matrix(housing[attributes], figsize=(12, 8))
        plt.show()

        housing.plot(kind="scatter",
                     x="median_income",
                     y="median_house_value",
                     alpha=0.1)
        plt.show()
        # -> reveals horizontal lines that we may want to remove

    # Attribute Combination
    if False:
        housing["rooms_per_household"] = housing["total_rooms"] / housing[
            "households"]
        housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing[
            "total_rooms"]
        # housing["population_per_household"] = housing["population"]/housing["households"]

        if False:
            corr_matrix = housing.corr()
            print(
                corr_matrix["median_house_value"].sort_values(ascending=False))

    # Preparing Data for machine learning
    if False:
        housing = strat_train_set.drop("median_house_value", axis=1)
    # housing_labels = strat_train_set["median_house_value"].copy()

    # missing values: 3 possibilities
    if False:
        housing.dropna(subset=["total_bedrooms"])  # Get rid of the data
        housing.drop("total_bedrooms",
                     axis=1)  # Get rid of the whole attribute
        median = housing["total_bedrooms"].median()
        housing["total_bedrooms"].fillna(
            median, inplace=True)  # set missing value to zero/median/mean

    if False:
        # Median of category cannot be calculated -> create a copy without that category
        housing_num = housing.drop("ocean_proximity", axis=1)
        imputer = SimpleImputer(strategy="median")
        imputer.fit(housing_num)
        print(imputer.statistics_, housing_num.median().values)

        # Transform data
        X = imputer.transform(housing_num)

        # Combines the fit and the transform in one action
        imputer.fit_transform(housing_num)

        # Recreate a new DataFrame
        housing_tr = pd.DataFrame(X,
                                  columns=housing_num.columns,
                                  index=housing_num.index)
        if False:
            print(housing_tr)

    # Handle Categorical and text
    if False:
        housing_cat = housing[["ocean_proximity"]]
        print(housing_cat.head(10))
        # convert Cat to number
        ordinal_encoder = OrdinalEncoder()
        housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
        print(ordinal_encoder.categories_)
        print(housing_cat_encoded[:10])
        # Problem with this is that 0 and 1 would be seen as close by the algo but not necessarily true
        # -> Prefer to OneHotEncode: 1 new category for the data per category, and for each of these it's either 1 or 0
        cat_encoder = OneHotEncoder()
        housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
        print(cat_encoder.categories_)
        print(housing_cat_1hot)

    if False:
        # This can be done with pandas directly
        housing = pd.get_dummies(housing, prefix='', prefix_sep='')

    # Custom transformers can be created (like OrdinalEncoder, OneHotEncoder, Imputer,...)
    # -> Create a class with fit() (returning itself), transform() and fit_transform() (not needed if TransformerMixin
    # used as a base class) and if BaseEstimator class -> get_params() and set_params()
    if False:
        attr_adder = CombinedAttributesDivide(housing,
                                              add_bedrooms_per_room=False)
        housing_extra_attribs = attr_adder.transform(housing.values)

    # Feature Scaling: fit on training data and then transform training and test set
    # 2 methods:    -> min-max scaling: normalization
    #               -> Standardization
    if False:
        scaler = MinMaxScaler()
        housing_scaled = scaler.fit_transform(housing_extra_attribs)
    if False:
        scaler = StandardScaler()
        housing_scaled = scaler.fit_transform(housing_extra_attribs)
        print(housing_scaled)

    # Pipeline: to organise all transformation of it in a simpler manner
    # On numerical attributes:
    if False:
        num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('attribs_adder', CombinedAttributesDivide(housing)),
            ('std_scaler', StandardScaler()),
        ])
        housing_num_tr = num_pipeline.fit_transform(housing_num)
        print(housing_num_tr)
    # To also take care of categorical attributes:
    if True:
        housing_num = housing.drop("ocean_proximity", axis=1)
        num_attribs = list(housing_num)
        cat_attribs = ["ocean_proximity"]
        num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('attribs_adder', CombinedAttributesDivide(housing)),
            ('std_scaler', StandardScaler()),
        ])
        full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs),
                                           ("cat", OneHotEncoder(),
                                            cat_attribs)])
        housing_prepared = full_pipeline.fit_transform(housing)
        print(housing_prepared)
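The comments above describe the custom-transformer pattern, but the CombinedAttributesDivide class itself is not shown in the snippet. A minimal, hypothetical sketch of that pattern (the RatioAdder class is illustrative only):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class RatioAdder(BaseEstimator, TransformerMixin):
    """Appends the ratio of two columns as a new feature (hypothetical example)."""
    def __init__(self, num_col=0, den_col=1):
        self.num_col = num_col
        self.den_col = den_col

    def fit(self, X, y=None):
        return self  # nothing to learn

    def transform(self, X):
        ratio = X[:, self.num_col] / X[:, self.den_col]
        return np.c_[X, ratio]

X = np.array([[8., 2.], [9., 3.]])
print(RatioAdder().fit_transform(X))  # adds a third column: [4., 3.]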
Example No. 38
def test_imputation_mean_median_error_invalid_type(strategy, dtype):
    X = np.array([["a", "b", 3], [4, "e", 6], ["g", "h", 9]], dtype=dtype)

    with pytest.raises(ValueError, match="non-numeric data"):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit_transform(X)
Example No. 39
    def manual_preprocess(self, config, folderLocation):
        """
        This function is for preprocessing the data when the user selects manual preprocessing.                     
        """
        # config = open("preprocess_config.yaml", 'r')
        config_data = yaml.safe_load(open(config, 'r'))

        df = pd.read_csv(config_data["raw_data_address"])

        #### Handling missing data

        # drop columns

        def drop_NA(df):
            # On calling this function it drops all the columns and rows which are completely null.
            nan_value = config_data["na_notation"]
            df.replace("", nan_value, inplace=True)
            df.dropna(how='all', axis=1, inplace=True)
            df.dropna(how='all', inplace=True)

        if config_data['drop_column_name'][0] != None:
            df = df.drop(config_data["drop_column_name"], axis=1)
            drop_NA(df)
        else:
            drop_NA(df)

        # imputation
        if config_data['imputation_column_name'][0] != None:
            strategy_values_list = []
            for index, column in enumerate(
                    config_data["imputation_column_name"]):
                type = config_data["impution_type"][index]
                df_value = df[[column]].values

                if type == "mean":
                    imputer = SimpleImputer(
                        missing_values=config_data["na_notation"],
                        strategy="mean")
                    strategy_values_list.append(df[column].mean())

                elif type == "median":
                    imputer = SimpleImputer(
                        missing_values=config_data["na_notation"],
                        strategy="median")
                    strategy_values_list.append(df[column].median())

                elif type == "most_frequent":
                    imputer = SimpleImputer(
                        missing_values=config_data["na_notation"],
                        strategy="most_frequent")
                    strategy_values_list.append(df[column].mode())

                elif type == 'knn':
                    imputer = KNNImputer(
                        n_neighbors=4,
                        weights="uniform",
                        missing_values=config_data["na_notation"])

                df[[column]] = imputer.fit_transform(df_value)

            df.replace(to_replace=[config_data["na_notation"]], value=0)
            if strategy_values_list != []:
                config_data['mean_median_mode_values'] = strategy_values_list

        else:
            ## Check the z-score and replace with the mean if z < 3
            df.replace(to_replace=[config_data["na_notation"]], value=0)
            ####using others for object type data.

        #feature scaling
        if config_data['scaling_column_name'][0] != None:
            for index, column in enumerate(config_data["scaling_column_name"]):
                type = config_data["scaling_type"][index]
                config_data['scaling_values'] = {}
                df_value = df[[column]].values

                if type == "normalization":
                    df_std = (df_value - df_value.min(axis=0)) / (
                        df_value.max(axis=0) - df_value.min(axis=0))
                    scaled_value = df_std * (1 - 0)

                    config_data['scaling_values'][index] = {
                        "min": df_value.min(axis=0),
                        "max": df_value.max(axis=0)
                    }

                elif type == 'standarization':
                    # standardization: subtract the mean and divide by the standard deviation
                    df_std = df_value.std(axis=0)
                    scaled_value = (df_value - df_value.mean(axis=0)) / df_std

                    config_data['scaling_values'][index] = {
                        "std": df_std,
                        "mean": df_value.mean(axis=0)
                    }

                df[[column]] = scaled_value

        #### handling catogarical data
        # encoding

        # Under the following if block, only the columns selected by the user will be encoded, using the encoding type chosen by the user.
        if config_data['encode_column_name'][0] != None:
            for index, column in enumerate(config_data["encode_column_name"]):
                type = config_data["encoding_type"][index]

                if type == "Label Encodeing":
                    encoder = LabelEncoder()
                    df[column] = encoder.fit_transform(df[column])

                    label_encoding_dict = dict(
                        zip(encoder.classes_, range(len(encoder.classes_))))
                    config_data['labels'] = {}
                    config_data['labels'] = [label_encoding_dict]

                elif type == "One-Hot Encoding":
                    encoder = OneHotEncoder(drop='first', sparse=False)
                    df_encoded = pd.DataFrame(
                        encoder.fit_transform(df[[column]]))
                    df_encoded.columns = encoder.get_feature_names([column])
                    df.drop([column], axis=1, inplace=True)
                    df = pd.concat([df, df_encoded], axis=1)

            # Any object-type column the user missed that still needs encoding will be encoded using One-Hot encoding.

        objest_type_column_list = []
        for col_name in df.columns:
            if df[col_name].dtype == 'object':
                objest_type_column_list.append(col_name)
                config_data['encodeing_type'].extend(['One-Hot Encoding'])

        if objest_type_column_list != []:
            config_data['encode_column_name'] = objest_type_column_list

            encoder = OneHotEncoder(drop='first', sparse=False)
            df_encoded = pd.DataFrame(
                encoder.fit_transform(df[objest_type_column_list]))
            df_encoded.columns = encoder.get_feature_names(
                objest_type_column_list)
            df.drop(objest_type_column_list, axis=1, inplace=True)
            df = pd.concat([df, df_encoded], axis=1)

        # Feature engineering & Feature Selection
        ### Outlier detection & Removel
        # We are removing the outliers if on the basis on z-score.

        if config_data["Remove_outlier"] == True:
            z = np.abs(stats.zscore(df))
            df = df[(z < 3).all(axis=1)]

        # Here we select the columns which have a correlation higher than 0.90.
        if config_data["feature_selection"] == True:
            col_corr = set()
            corr_matrix = df.corr()
            for i in range(len(corr_matrix.columns)):
                for j in range(i):
                    if abs(corr_matrix.iloc[i, j]) > 0.90:
                        col_corr.add(corr_matrix.columns[i])

            df = df.drop(col_corr, axis=1)

            # with the following loop we can detect highly correlated features;
            # it removes the first feature that is correlated with any other feature

        # Dropping the columns which are left behind and can cause problems at the time of model training.
        for col_name in df.columns:
            if df[col_name].dtype == 'object':
                df = df.drop(col_name, axis=1)

        df.to_csv('clean_data.csv')
        shutil.move("clean_data.csv", folderLocation)
        clean_data_address = os.path.abspath(
            os.path.join(folderLocation, "clean_data.csv"))
        config_data['clean_data_address'] = clean_data_address

        with open(config, 'w') as yaml_file:
            yaml_file.write(yaml.dump(config_data, default_flow_style=False))

        return clean_data_address
Example No. 40
import pandas as pd
import numpy as np
dataset = pd.read_csv("framingham_heart_disease.csv")

X = dataset.iloc[:,:15]
Y = dataset.iloc[:,15:16]
X = X.drop(columns = ['currentSmoker', "education"])
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'most_frequent')
X.iloc[:,1:14] = imputer.fit_transform(X.iloc[:,1:14])

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
'''
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
'''
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Input
from keras import regularizers
from keras.models import Model, load_model
from keras.callbacks import ModelCheckpoint, TensorBoard

'''
# Initialising the ANN
classifier = Sequential()
# Adding the input layer and the first hidden layer
def getResult(rating, installs, big_frame):

    df = pd.read_csv("DATA SET-2.csv")

    # Replace "NaN" with mean
    imputer = SimpleImputer()
    df['Rating'] = imputer.fit_transform(df[['Rating']])

    temp = []
    for index in range(len(df['Rating'])):
        if df['Rating'][index] >= rating:
            temp.append(1)
        else:
            temp.append(0)

    cat_rating = pd.DataFrame(zip(temp, temp),
                              columns=["cat_Ratings", "ignore"])

    df = pd.concat([df, cat_rating], axis=1)

    df.drop("ignore", axis=1, inplace=True)

    df.drop(df.index[9148], inplace=True)

    # Data cleaning for "Installs" column
    df['Installs'] = df['Installs'].map(lambda x: x.rstrip('+'))
    df['Installs'] = df['Installs'].map(lambda x: ''.join(x.split(',')))

    df['Installs'] = pd.to_numeric(df['Installs'])

    rating_sum = 0

    rate = []
    #1169
    """ """
    counter = 0
    for index in range(len(df)):
        try:
            if df['Installs'][index] >= installs:
                #if df['Rating'][index]>=rating:""" """
                rate.append(1)
                rating_sum += df['Rating'][index]
                counter += 1
                """ """
            else:
                rate.append(0)

        except:
            #print(index)
            continue

    #print(len(rate))
    avg_rating = (rating_sum / counter)
    """ """
    #print(df['Installs'].corr(df['Rating']))
    """ """
    val = "Yes" if (rating_sum / counter) >= rating else "No"
    rel = "Greater than" if val == "Yes" else "Lesser than"

    fig, ax = plt.subplots(figsize=(10, 10))

    l1 = '{}>='.format(installs)
    l2 = '<{}'.format(installs)

    size = [rate.count(1), rate.count(0)]
    label = [l1, l2]
    title = 'Count of {}'.format(rating)

    figure1 = plt.Figure(figsize=(5, 5), dpi=70)

    #color = cm.rainbow(np.linspace(0, 1, 10))
    #fig1, ax1 = plt.subplots()
    ax3 = figure1.add_subplot(111)
    ax3.pie(size,
            labels=label,
            colors=['red', 'blue'],
            autopct='%1.1f%%',
            startangle=200)
    ax3.set_title(title)
    #ax3.xlim(0,3.0)
    pie_plot = FigureCanvasTkAgg(figure1, big_frame)
    pie_plot.get_tk_widget().place(x=80, y=190)

    tk.Label(big_frame,
             text="--Results--",
             font=("Calibri", 13, 'italic'),
             fg='#ad023e',
             bg='white').place(x=220, y=470)

    String = "Average rating of all the apps who managed to get over {} download is {:.1f}".format(
        installs, avg_rating)

    tk.Label(big_frame,
             text=String,
             font=("Calibri", 13, 'italic'),
             fg='#ad023e',
             bg='white').place(x=0, y=500)

    String = """{}! All those apps who have managed to get over {} downloads , 
            they have to get an average rating of {:.1f} which is {} than {} """.format(
        val, installs, avg_rating, rel, rating)

    tk.Label(big_frame,
             text=String,
             font=("Calibri", 13, 'italic'),
             fg='#ad023e',
             bg='white').place(x=0, y=530)
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

# Read the data
trainData = pd.read_csv('../input/train.csv')
testData = pd.read_csv('../input/test.csv')

# Select Predictors
trainData.dropna(axis=0, subset=['SalePrice'], inplace=True)
trainy = trainData.SalePrice
trainX = trainData.drop(['SalePrice'],
                        axis=1).select_dtypes(exclude=['object'])

testX = testData.select_dtypes(exclude=['object'])

# Impute NaN columns
myImputer = SimpleImputer()
train_X = myImputer.fit_transform(trainX)
test_X = myImputer.transform(testX)

# Fit model
my_model = XGBRegressor()
my_model.fit(train_X, trainy, verbose=False)

# Make prediction
prediction = my_model.predict(test_X)

# Make result submission file
my_submission = pd.DataFrame({'Id': testData.Id, 'SalePrice': prediction})
my_submission.to_csv('submission.csv', index=False)
    '糖尿病家族史', '一级亲属', '二级亲属', '父亲', '母亲', '父系', '母系', '孕次(次)', '产次(次)',
    '新生儿性别(男=1,女=2)', '胎膜早破(无=0,有=1)', '早产(无=0,有=1)', '羊水过多(无=0,有=1)',
    '妊娠期高血压(无=0,有=1)', '产后出血(无=0,有=1)', '胎膜早剥(无=0,有=1)', '羊水过少(无=0,有=1)',
    '流产(无=0,有=1)', '孕期并发症(无=0,有=1)', '胎儿宫内生长受限/发育迟缓(无=0,有=1)', '巨大儿(无=0,有=1)',
    '胎儿宫内窘迫(无=0,有=1)', '新生儿窒息(无=0,有=1)', '新生儿黄疸/高胆红素血症(无=0,有=1)',
    '低体重儿或小于胎龄儿(无=0,有=1)', '先天畸形(无=0,有=1)', '新生儿低血糖(无=0,有=1)',
    '新生儿合并症(无=0,有=1)', '营养咨询或治疗(GDM患者进行营养治疗,NGT接受孕妇学校讲座。咨询或治疗=1,无=0)',
    'GDM孕妇用胰岛素治疗(是=1,否=0)'
]
X_cat = X[cat_list]
X_num = X.drop(columns=cat_list)

# Fill the categorical variables with the mode
from sklearn.impute import SimpleImputer
impute = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
y_tr = pd.DataFrame(impute.fit_transform(y), columns=y.columns)
X_necessary_tr = pd.DataFrame(impute.fit_transform(X_necessary),
                              columns=X_necessary.columns)
X_cat_tr = pd.DataFrame(impute.fit_transform(X_cat), columns=X_cat.columns)

# Fill the numeric variables using a random forest
X_num_tr = X_num.copy()
df = pd.concat([X_necessary_tr, y_tr], axis=1)
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators=100)
forest_reg.get_params()
for col in list(X_num_tr.columns):
    if X_num_tr[col].isna().sum() == 0:
        continue
    fill = X_num_tr[col]
    Ytrain = fill[fill.notnull()]
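The snippet is cut off above. As a self-contained sketch (not the author's code) of the random-forest imputation idea it describes: train a regressor on the rows where a column is known and predict it for the rows where it is missing.

import pandas as pd
from sklearn.ensemble import RandomForestRegressor

def rf_impute_column(df, target_col, feature_cols):
    # feature_cols are assumed to be already complete (no missing values)
    known = df[df[target_col].notnull()]
    missing = df[df[target_col].isnull()]
    if missing.empty:
        return df
    reg = RandomForestRegressor(n_estimators=100, random_state=0)
    reg.fit(known[feature_cols], known[target_col])
    df.loc[df[target_col].isnull(), target_col] = reg.predict(missing[feature_cols])
    return df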