def split_test_train_data():
    """Load the UCI Wine data, split it 70/30, and scale the features.

    The feature matrix is every column except the first; the first column
    ('Class label') is the target. Both scalers are fitted on the training
    split only and then applied to the test split.

    Returns:
        tuple: ``(X_train_norm, X_test_norm, X_train_std, X_test_std,
        y_train, y_test)`` where the ``*_norm`` arrays come from
        ``MinMaxScaler`` and the ``*_std`` arrays from ``StandardScaler``.
    """
    # Function-scope imports keep this helper self-contained.
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    df_wine = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
        header=None)
    df_wine.columns = [
        'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of Ash',
        'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
        'Proanthocyanins', 'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline'
    ]

    X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    mms = MinMaxScaler()
    stdsc = StandardScaler()

    # Fit on the training split only, then reuse the fitted scaler on the
    # test split so no test-set statistics leak into the transformation.
    X_train_norm = mms.fit_transform(X_train)
    X_test_norm = mms.transform(X_test)
    X_train_std = stdsc.fit_transform(X_train)
    X_test_std = stdsc.transform(X_test)

    return X_train_norm, X_test_norm, X_train_std, X_test_std, y_train, y_test
# Kernel PCA followed by logistic regression on 'Social_Network_Ads.csv'.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import KernelPCA
from sklearn.linear_model import LogisticRegression
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Importing the dataset: columns 2 and 3 are the features, column 4 the target
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# Feature Scaling (z-score standardisation); the scaler is fitted on the
# training set only and reused for the test set
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Applying Kernel PCA
kpca = KernelPCA(n_components=2, kernel='rbf')
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

# Fitting Logistic regression to the Training set
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
# Linear-kernel SVM on 'Social_Network_Ads.csv'.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import confusion_matrix
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# dataset problem - classify whether the person will purchase a product or not
# age/salary are the independent variables, purchase is the dependent variable
D = pd.read_csv("Social_Network_Ads.csv")
X = D.iloc[:, [2, 3]].values
y = D.iloc[:, 4].values  # dependent variable

# the decision boundary may be a curved or a linear line
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# feature scaling: standardise each feature using training-set statistics
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Classfier = SVC(kernel='linear', random_state=0)
Classfier.fit(X_train, y_train)
y_pred = Classfier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

# visualizing the SVM kernels
X_set, y_set = X_test, y_test
# NOTE(review): the visible source is cut off in the middle of the next
# statement; it is preserved here as a comment so the snippet still parses.
# X1, X2 = np.meshgrid(
#     np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1,
# Clean and encode the 'Embarked', 'Age' and 'Pclass' columns of X_train,
# scale the result and fit a k-nearest-neighbours classifier.
# X_train, y_train, labelencoder_X, pd, np and KNeighborsClassifier are
# expected to be defined before this fragment.
from sklearn.preprocessing import StandardScaler

# Exploratory counts per category (the results are not stored).
len(X_train[X_train['Embarked'] == 'S'])
len(X_train[X_train['Embarked'] == 'C'])
len(X_train[X_train['Embarked'] == 'Q'])
# Comparing against the string ' nan' never matches missing values, so count
# them with isna() instead.
X_train['Embarked'].isna().sum()

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to modify X_train.
X_train['Embarked'] = X_train['Embarked'].fillna('S')  # used S as it is mode
X_train['Embarked'] = labelencoder_X.fit_transform(X_train['Embarked'])

np.mean(X_train['Age'])
X_train['Age'] = X_train['Age'].fillna(np.mean(X_train['Age']))

# One-hot encode; for a single column the prefix must be a plain string.
X_Pclass = pd.get_dummies(X_train['Pclass'], prefix='Pclass', drop_first=True)
X_Embarked = pd.get_dummies(X_train['Embarked'], prefix='Embarked', drop_first=True)

# Swap the raw columns for their dummy columns. The dummies are joined
# column-wise (axis=1); appending them would have added rows instead.
X_train = X_train.drop(['Pclass', 'Embarked'], axis=1)
X_train = pd.concat([X_train, X_Pclass, X_Embarked], axis=1)

# Scale exactly once: transforming already-scaled data a second time would
# shift and shrink it again with the original statistics.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn.fit(X_train, y_train)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

"""
1)Building a diabetes classifier
You'll be using the Pima Indians diabetes dataset to predict whether a person
has diabetes using logistic regression. There are 8 features and one target in
this dataset. The data has been split into a training and test set and
pre-loaded for you as X_train, y_train, X_test, and y_test.
A StandardScaler() instance has been predefined as scaler and a
LogisticRegression() one as lr
"""
# NOTE(review): lr, accuracy_score and X are not defined in this snippet;
# presumably they are pre-loaded alongside the data - confirm before running.

# Fit the scaler on the training features and transform these in one go
X_train_std = scaler.fit_transform(X_train)

# Fit the logistic regression model on the scaled training data
lr.fit(X_train_std, y_train)

# Scale the test features
X_test_std = scaler.transform(X_test)

# Predict diabetes presence on the scaled test set
y_pred = lr.predict(X_test_std)

# Prints accuracy metrics and feature coefficients
print("{0:.1%} accuracy on test set.".format(accuracy_score(y_test, y_pred)))
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

# Recorded output (the original string literal holding it was never closed in
# the visible source, so it is kept as comments):
# 79.6% accuracy on test set.
# {'bmi': 0.38, 'insulin': 0.19, 'glucose': 1.23, 'diastolic': 0.03,
#  'family': 0.34, 'age': 0.34, 'triceps': 0.24, 'pregnant': 0.04}
# Logistic regression on 'Social_Network.csv' with a confusion matrix.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Columns 2 and 3 are used as features, column 4 as the target.
dataset = pd.read_csv('Social_Network.csv')
x = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0)

# Fit the scaler on the training set only and reuse it for the test set.
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

classifier = LogisticRegression(random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

cm = confusion_matrix(y_test, y_pred)

x_set, y_set = x_train, y_train
# NOTE(review): the visible source is cut off in the middle of the next
# statement; it is preserved here as a comment so the snippet still parses.
# x1, x2 = np.meshgrid(
# Impute, encode, split and scale the feature matrix x and label vector y.
# imputer, x, y, np, ColumnTransformer, OneHotEncoder, LabelEncoder and
# train_test_split are expected to be defined before this fragment.
from sklearn.preprocessing import StandardScaler

# connect imputer to columns 1 and 2 of the features
imputer.fit(x[:, 1:3])
# write the imputed values back into the same columns
x[:, 1:3] = imputer.transform(x[:, 1:3])

# Handling Categorical Data
# Features - (country, age, salary)
# One Hot Encoding
# transforms column 0 with OneHotEncoder() and passes every other column
# through unchanged
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder="passthrough")
# Run transformer on matrix
x = np.array(ct.fit_transform(x))

# Labels - (purchased)
le = LabelEncoder()
y = le.fit_transform(y)

# split training and test data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Feature Scaling
sc = StandardScaler()
# transform only numerical columns (all rows, columns 3 onwards)
x_train[:, 3:] = sc.fit_transform(x_train[:, 3:])
# Reuse the statistics learned from the training set; re-fitting on the test
# set would scale it differently and leak test information.
x_test[:, 3:] = sc.transform(x_test[:, 3:])
# Exploratory plots for the `train` frame, then encoding and scaling.
# train, sns, plt, pd and np are expected to be defined before this fragment.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

sns.distplot(train.groupby('Item_Fat_Content').size())

outlet_size = pd.pivot_table(data=train, values='Item_Outlet_Sales',
                             index='Item_Fat_Content', aggfunc=[sum, np.mean])
plt.bar(outlet_size.index, outlet_size[outlet_size.columns[1]])

# Restrict the correlation matrix to numeric columns; string columns cannot
# be correlated (numeric_only requires pandas >= 1.5).
corr_mat = train.corr(numeric_only=True)
sns.heatmap(corr_mat)

train.isnull().sum()
train.notnull().sum()  # DataFrame has no isnotnull(); notnull() is the method

train.drop('Item_Identifier', axis=1, inplace=True)
# Assign the filled column back instead of fillna(inplace=True) on a column
# selection, which is not guaranteed to modify `train`.
train['Outlet_Size'] = train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0])
train.hist()

# Keep the label-encoded values; the original call discarded its result.
labelencoder = LabelEncoder()
train['Outlet_Size'] = labelencoder.fit_transform(train['Outlet_Size'])

# One-hot encode 'Outlet_Size'. OneHotEncoder no longer accepts a
# categorical_features argument, so the column is expanded with
# pd.get_dummies and `train` stays a DataFrame.
train = pd.get_dummies(train, columns=['Outlet_Size'], dtype=float)

# NOTE(review): StandardScaler needs numeric input; any remaining string
# columns in `train` must be encoded or dropped first - confirm.
sc = StandardScaler()
sc_x = sc.fit_transform(train)
# -*- coding: utf-8 -*-
"""
PCA on iris dataset
"""
import pandas as pd
from sklearn.preprocessing import StandardScaler

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# load data iris into pandas data frame
df = pd.read_csv(url, names=['sepal length', 'sepal width', 'petal length', 'petal width', 'target'])
#a = pd.read_table('http://bit.ly/movieusers',sep='|',header=None)
print(df.head())

features = ['sepal length', 'sepal width', 'petal length', 'petal width']
# Label-based selection needs .loc; DataFrame has no `.i` accessor.
x = df.loc[:, features].values
y = df.loc[:, ['target']].values

# Standardise every feature column.
x = StandardScaler().fit_transform(x)
# fit_transform returns a NumPy array, which has no head(); slice instead.
print(x[:5])
# Template: encode, split, build a pipeline and tune it with a randomized
# search. df and np are expected to be defined before this fragment.
import pandas as pd
from sklearn.impute import SimpleImputer
# feed pipeline into a randomsearchcv
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  # scale/normalize

#X_nonnumer = pd.get_dummies(df[NONNUMER])
# get_dummies is a pandas function, not a DataFrame method.
# TODO: replace the placeholder prefixes with one prefix per encoded column.
df = pd.get_dummies(df, drop_first=True, prefix=['', ''])

#if ordinal
#df.column = pd.Categorical(values=df.column, categories=[<ascending order>], ordered=True)

y = df.iloc[:, -1]  # target (positional selection needs .iloc)
x_features = ['', '', ...]  # TODO: replace the placeholders with column names
X = df[x_features]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=404, stratify=y)

# build a pipeline
# NOTE(review): the tuned parameter is n_neighbors, so a
# KNeighborsClassifier is assumed here - swap in the intended estimator.
estimator = KNeighborsClassifier()
# each step is a tuple (name_to_give_the_step, estimator)
pipeline = Pipeline([('imputation', SimpleImputer()),
                     ('scalar', StandardScaler()),
                     ('estimator', estimator)])

# Keys are strings of the form '<step name>__<parameter name>'.
parameters = {'estimator__n_neighbors': np.arange(1, 50)}
# RandomizedSearchCV takes param_distributions (param_grid is GridSearchCV's).
cv = RandomizedSearchCV(pipeline, param_distributions=parameters, cv=5)
cv.fit(X_train, y_train)
cv.best_params_
cv.score(X_test, y_test)

#feed best parameters into pipeline
# The search's best_estimator_ is the pipeline refitted with the best
# parameters found.
pipeline = cv.best_estimator_
pipeline.predict(X_test)
"""checkpt"""