def run(self):
    """Run the full predictive-modeling workflow on the Pima Indians
    diabetes dataset: load, explore, preprocess, evaluate, select the
    best model, tune it, then finalize and predict.
    """
    # Path to the dataset (headerless CSV).
    path = "../../neeraj/resource/pima-indians-diabetes.data"
    # Column names of the dataset (padded spellings kept exactly as the
    # downstream classes expect them).
    column_names = [' preg ', ' plas ', ' pres ', ' skin ', ' test ',
                    ' mass ', ' pedi ', ' age ', ' class ']

    # Loading the dataset using class DatasetLoader.
    load_data = DatasetLoader(path, column_names)
    data = load_data.load()
    load_data.print_shape(data)

    # Understanding the data using class DataExplorer.
    explore_data = DataExplorer()
    explore_data.print_data_statistics(data)
    explore_data.visualize(data)

    # Performing data preprocessing: columns 0-7 are predictors,
    # column 8 is the target.
    process_data = DataPreprocessor()
    input_set, output_set = process_data.split_dataset(data, 0, 8, 8)
    process_data.display_dataset()
    process_data.summarize(input_set, 0, 5, 3)

    # Model evaluation using class Evaluator (10 folds, seed 7).
    evaluator = Evaluator()
    evaluator.validate(LogisticRegression(), input_set, output_set, 10, 7)
    # NOTE(review): recent scikit-learn spells this scoring key
    # 'neg_log_loss'; 'log_loss' is kept as-is — confirm against the
    # Evaluator implementation / pinned sklearn version.
    evaluator.evaluate(LogisticRegression(), input_set, output_set, 10, 7,
                       'log_loss')

    # Selecting the best model using class ModelSelector.
    model = ModelSelector()
    # A set of candidate models for selection.
    models = []
    models.append((' LR ', LogisticRegression()))
    models.append((' LDA ', LinearDiscriminantAnalysis()))
    models.append((' RF ', RandomForestClassifier(n_estimators=100,
                                                  max_features=3)))
    selected_model = model.select_model(models, input_set, output_set, 10, 7)
    # Bug fix: the original wrote print("...%s") % (selected_model),
    # which applies % to print()'s return value (None) and raises
    # TypeError on Python 3. The formatting now happens inside print().
    print("\nSelected Model:\n %s" % selected_model)

    # Improving accuracy using class AccuracyImprover.
    improve_accuracy = AccuracyImprover()
    improve_accuracy.tuning(Ridge(), input_set, output_set)
    improve_accuracy.ensemble_prediction(
        RandomForestClassifier(n_estimators=100, max_features=3),
        input_set, output_set, 10, 7)

    # Finalizing the model and performing prediction.
    finalize_model = ModelFinalizer()
    input_train, input_test, output_train, output_test = \
        finalize_model.split_train_test_sets(input_set, output_set, 0.33, 7)
    finalize_model.finalize_and_save(
        LogisticRegression(),
        "../../neeraj/resource/pima_model.sav",
        input_train, output_train)
    finalize_model.predict(
        "../../neeraj/resource/pima_model.sav",
        input_test, output_test)
import argparse

# Command-line interface: dataset path plus k-NN hyper-parameters.
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dataset", required=True,
                    help="path to input dataset")
parser.add_argument("-k", "--neighbors", type=int, default=1,
                    help="# of nearest neighbors for classification")
parser.add_argument("-j", "--jobs", type=int, default=-1,
                    help="# of CPU cores used for classification")
args = vars(parser.parse_args())

# Load the raw images, then flatten each 32x32 RGB image into a
# single feature vector per sample.
print("[INFO] Loading images")
image_paths = list(paths.list_images(args["dataset"]))
loader = DatasetLoader()
(data, labels) = loader.load(image_paths, verbose=500)
data = data.reshape((data.shape[0], 32 * 32 * 3))

# Encode string labels as integers.
encoder = LabelEncoder()
labels = encoder.fit_transform(labels)

# Partition the data: 75% training, 25% testing (fixed seed for
# reproducibility).
(trainX, testX, trainY, testY) = train_test_split(
    data, labels, test_size=0.25, random_state=42)

# Fit a distance-weighted k-NN classifier and report per-class metrics
# on the held-out test split.
model = KNeighborsClassifier(n_neighbors=args["neighbors"],
                             n_jobs=args["jobs"],
                             weights='distance')
model.fit(trainX, trainY)
print(classification_report(testY, model.predict(testX),
                            target_names=encoder.classes_))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Main.

Driver script: loads the ann train/test datasets, preprocesses them,
splits predictors from targets, carves out a validation set, and
scales the training/validation predictors.

@author: neeraj
"""
from dataset_loader import DatasetLoader
from data_preprocessor import DataPreprocessor
from model_builder import ModelBuilder

# Load the raw training and test datasets.
train = DatasetLoader('../resources/ann-train.data').load()
test = DatasetLoader('../resources/ann-test.data').load()

# Preprocess both datasets with a shared preprocessor.
dp = DataPreprocessor()
train, test = dp.preprocess(train, test)

# Split each dataset into predictor and target variables.
train_X, train_y = dp.split_predictors(train)
test_X, test_y = dp.split_predictors(test)

# Hold out a validation set from the training data.
X_train, X_val, y_train, y_val = dp.validation_split(train_X, train_y)

# Scale the training and validation predictors.
X_train, X_val = dp.scale_data(X_train, X_val)