Example #1
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler


def split_test_train_data():
    df_wine = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
        header=None)
    df_wine.columns = [
        'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of Ash',
        'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
        'Proanthocyanins', 'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline'
    ]

    # features live in columns 1..13; the class label sits in column 0
    X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
    # min-max normalization and z-score standardization of the same split
    mms = MinMaxScaler()
    stdsc = StandardScaler()
    X_train_norm = mms.fit_transform(X_train)
    X_test_norm = mms.transform(X_test)
    X_train_std = stdsc.fit_transform(X_train)
    X_test_std = stdsc.transform(X_test)
    return X_train_norm, X_test_norm, X_train_std, X_test_std, y_train, y_test
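
An illustrative aside (not part of the original snippet) showing what the two scalers above do to the same values, assuming only NumPy and scikit-learn:

import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

values = np.array([[1.0], [5.0], [10.0]])

# min-max normalization rescales into the [0, 1] range
print(MinMaxScaler().fit_transform(values).ravel())   # [0.  0.4444...  1.]

# z-score standardization centers to zero mean, unit variance
print(StandardScaler().fit_transform(values).ravel())  # approx. [-1.18 -0.09  1.27]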
Example #2
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 

#Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2,3]].values
y = dataset.iloc[:, 4].values

#Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

#Feature Scaling (z-score: it standardizes the data)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

#Applying Kernel PCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components = 2, kernel = 'rbf')
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

#Fitting Logistic regression to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

#Predicting the Test set results
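The snippet breaks off after this comment; following the pattern of the neighbouring examples, the intended next line was presumably:

y_pred = classifier.predict(X_test)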
Example #3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#dataset problem- classify whether the person will purchase a product or not
#age/salary independent,purchase is the dependent variable
D = pd.read_csv("Social_Network_Ads.csv")
X = D.iloc[:, [2, 3]].values
y = D.iloc[:, 4].values  # dependent variable
#maybe curved or linear line for classification
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()  # feature scaling, roughly into the [-2, +2] range
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.svm import SVC
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
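
A quick way to read the 2x2 matrix returned above (an aside, not in the original snippet):

tn, fp, fn, tp = cm.ravel()  # sklearn orders the binary matrix as [[tn, fp], [fn, tp]]
print('accuracy:', (tn + tp) / cm.sum())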

#visualizing the SVM KERNELS
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
              stop=X_set[:, 0].max() + 1,
              step=0.01),
    np.arange(start=X_set[:, 1].min() - 1,
              stop=X_set[:, 1].max() + 1,
              step=0.01))
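The original cuts off inside the meshgrid call; the rest of this well-known decision-boundary plotting template usually proceeds as below (a reconstruction, not the author's exact code; the title and axis labels are assumptions based on the Social_Network_Ads columns):

plt.contourf(X1, X2,
             classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=['red', 'green'][i], label=j)
plt.title('SVM (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()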
Example #4
import numpy as np
import pandas as pd

# X_train is assumed to be the Titanic training dataframe, loaded earlier in the original script
len(X_train[X_train['Embarked'] == 'S'])
len(X_train[X_train['Embarked'] == 'C'])
len(X_train[X_train['Embarked'] == 'Q'])
X_train['Embarked'].isna().sum()  # comparing against a ' nan' string cannot catch real NaNs; use isna()

X_train['Embarked'].fillna('S', inplace=True)  # used 'S' as it is the mode
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X_train['Embarked'] = labelencoder_X.fit_transform(X_train['Embarked'])

np.mean(X_train['Age'])
X_train['Age'].fillna(np.mean(X_train['Age']), inplace=True)

X_Pclass = pd.get_dummies(X_train['Pclass'],
                          prefix='Pclass',
                          drop_first=True)  # one-hot encode into a dataframe
X_Embarked = pd.get_dummies(X_train['Embarked'],
                            prefix='Embarked',
                            drop_first=True)

X_train = X_train.drop('Pclass', axis=1)
X_train = X_train.drop('Embarked', axis=1)
# concat joins the dummy columns side by side; append would have stacked them as extra rows
X_train = pd.concat([X_train, X_Pclass, X_Embarked], axis=1)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn.fit(X_train, y_train)
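
As a hypothetical continuation (X_test is assumed to exist and to have been run through the same fillna / encoding / concat steps as X_train):

X_test_scaled = sc.transform(X_test)  # reuse the scaler fitted on the training set; never refit on test data
y_pred = knn.predict(X_test_scaled)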
Example #5
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

scaler = StandardScaler()
lr = LogisticRegression()
"""
1)Building a diabetes classifier
You'll be using the Pima Indians diabetes dataset to predict whether a person has diabetes using logistic regression. 
There are 8 features and one target in this dataset. The data has been split into a training and test set and pre-loaded for you as X_train, y_train, X_test, and y_test.

A StandardScaler() instance has been predefined as scaler and a LogisticRegression() one as lr

"""

# Fit the scaler on the training features and transform these in one go
X_train_std = scaler.fit_transform(X_train)

# Fit the logistic regression model on the scaled training data
lr.fit(X_train_std, y_train)

# Scale the test features
X_test_std = scaler.transform(X_test)

# Predict diabetes presence on the scaled test set
y_pred = lr.predict(X_test_std)

# Prints accuracy metrics and feature coefficients
print("{0:.1%} accuracy on test set.".format(accuracy_score(y_test, y_pred)))
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))  # X is the pre-loaded feature dataframe
"""
79.6% accuracy on test set.
{'bmi': 0.38, 'insulin': 0.19, 'glucose': 1.23, 'diastolic': 0.03, 'family': 0.34, 'age': 0.34, 'triceps': 0.24, 'pregnant': 0.04}
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('Social_Network.csv')
x = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

from matplotlib.colors import ListedColormap
x_set, y_set = x_train, y_train
x1, x2 = np.meshgrid(
    np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
    np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
# (the contourf/scatter plotting continues exactly as in Example #3)
Example #7
import numpy as np
from sklearn.impute import SimpleImputer

# x is assumed to be a feature matrix loaded earlier in the original script
imputer = SimpleImputer(strategy='mean')  # fill missing values with the column mean
# connect imputer to the features rows and columns
imputer.fit(x[:, 1:3])
# return the new matrix with missing data filled in
x[:, 1:3] = imputer.transform(x[:, 1:3])

# Handling Categorical Data

# Features - (country, age, salary)
# One Hot Encoding
# transforms column[0] with the OneHotEncoder transformer;
# remainder="passthrough" keeps all the other columns untouched
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder="passthrough")

# Run transformer on matrix
x = np.array(ct.fit_transform(x))

# Labels - (purchased)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# split training and test data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# the one-hot country dummies occupy columns 0-2, so scale only columns 3+ (age, salary)
x_train[:, 3:] = sc.fit_transform(x_train[:, 3:])
x_test[:, 3:] = sc.transform(x_test[:, 3:])  # transform only: reuse the fit learned on the training set
Example #8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# train is assumed to be the training dataframe loaded earlier in the original script
sns.distplot(train.groupby('Item_Fat_Content').size())

outlet_size = pd.pivot_table(data=train,
                             values='Item_Outlet_Sales',
                             index='Item_Fat_Content',
                             aggfunc=[sum, np.mean])

plt.bar(outlet_size.index, outlet_size[outlet_size.columns[1]])

corr_mat = train.corr()
sns.heatmap(corr_mat)

train.isnull().sum()
train.notnull().sum()  # pandas has no isnotnull(); the complement of isnull() is notnull()

train.drop('Item_Identifier', axis=1, inplace=True)
train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0], inplace=True)

train.hist()

from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
train['Outlet_Size'] = labelencoder.fit_transform(train['Outlet_Size'])
# OneHotEncoder's categorical_features argument was removed from scikit-learn;
# one-hot encoding the single column with pandas is the simpler route here
train = pd.get_dummies(train, columns=['Outlet_Size'])

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc_x = sc.fit_transform(train)
Example #9
# -*- coding: utf-8 -*-
"""
PCA on iris dataset
"""

import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

#load data iris into pandas data frame
df = pd.read_csv(url, names=['sepal length','sepal width','petal length','petal width','target'])
#a = pd.read_table('http://bit.ly/movieusers',sep='|',header=None)
print(df.head())

from sklearn.preprocessing import StandardScaler

features = ['sepal length','sepal width','petal length','petal width']
x = df.loc[:, features].values
y = df.loc[:, ['target']].values
x = StandardScaler().fit_transform(x)

print(x[:5])  # x is a NumPy array after scaling, so slice rather than calling .head()
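
The docstring promises PCA, but the snippet stops at standardization; a minimal continuation (a sketch, not the original author's code) could be:

from sklearn.decomposition import PCA

# project the standardized features onto the first two principal components
pca = PCA(n_components=2)
principal_components = pca.fit_transform(x)
print(pca.explained_variance_ratio_)  # share of variance kept by each component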
Example #10
import pandas as pd

#X_nonnumer = pd.get_dummies(df[NONNUMER])

# df is assumed to be a dataframe prepared earlier in these notes
df = pd.get_dummies(df, drop_first=True, prefix=['', ''])  # get_dummies is a pandas function, not a DataFrame method; the prefixes are placeholders to fill in
#if ordinal
#df.column = pd.Categorical(values=df.column, categories=[<ascending order>], ordered=True)

y = df.iloc[:, -1]  #target
x_features = ['', '', ...]  # fill in the feature column names
X = df[x_features]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=404, stratify=y)

#build a pipeline
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler  #scale/normalize
from sklearn.neighbors import KNeighborsClassifier  # estimator inferred from the n_neighbors parameter below

estimator = KNeighborsClassifier()
# each step is a tuple (name_to_give_the_step, estimator), wrapped in a Pipeline
pipeline = Pipeline([('imputation', SimpleImputer()),
                     ('scaler', StandardScaler()),
                     ('estimator', estimator)])

#feed pipeline into a RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
parameters = {'estimator__n_neighbors': np.arange(1, 50)}
cv = RandomizedSearchCV(pipeline, param_distributions=parameters, cv=5)  # RandomizedSearchCV takes param_distributions, not param_grid
cv.fit(X_train, y_train)
cv.best_params_
cv.best_score_

#feed best parameters into the final pipeline
estimator = KNeighborsClassifier(n_neighbors=cv.best_params_['estimator__n_neighbors'])
pipeline = Pipeline([('imputation', SimpleImputer()),
                     ('scaler', StandardScaler()),
                     ('estimator', estimator)])
pipeline.fit(X_train, y_train)
pipeline.predict(X_test)

"""checkpt"""