# Shared imports assumed by these snippets (the original listings omit them).
# `model_stats` (metric helpers) and `save_model` are project-local utilities
# whose import paths are not shown in the source.
import itertools
import os

import numpy as np
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier,
                              VotingClassifier)
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (accuracy_score, f1_score, make_scorer,
                             precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     StratifiedKFold, train_test_split)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


def learn_decision_tree(stratified_data_csv_file, save_filepath):
    # read the stratified dataset
    data = np.genfromtxt(stratified_data_csv_file, delimiter=',', skip_header=1)
    X, y = data[:, :-1], data[:, -1]

    # do a 70-30 train-test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)

    # testing parameters
    params = {
        'max_depth': [None, 5, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    stratified_k_fold = StratifiedKFold(n_splits=10)

    classifier = GridSearchCV(DecisionTreeClassifier(), params, cv=stratified_k_fold, verbose=5)
    classifier.fit(X_train, y_train)
    best_classifier = classifier.best_estimator_
    y_pred = best_classifier.predict(X_test)

    # model statistics
    print('Decision Trees Model Statistics')
    print('Best params: {}'.format(classifier.best_params_))
    model_stats.compute_basic_stats(y_test, y_pred)
    model_stats.compute_roc_score(y_test, y_pred)
    model_stats.plot_normalized_confusion_matrix(
        y_test, y_pred, 'Decision Trees Classifier Normalized Confusion Matrix'
    )

    # fit the classifier on the complete dataset once we have the best parameters
    best_classifier = DecisionTreeClassifier(**classifier.best_params_)
    best_classifier.fit(X, y)
    # save the model
    save_model(best_classifier, save_filepath)
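

# The learners here call a project-local `save_model` helper that is not shown
# in these listings. A minimal sketch, assuming it simply serializes the
# fitted estimator with joblib (the helper names and behavior are assumptions):
import joblib


def save_model(model, filepath):
    """Serialize a fitted estimator to disk."""
    joblib.dump(model, filepath)


def load_model(filepath):
    """Counterpart loader for a saved estimator."""
    return joblib.load(filepath)
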
def learn_sgd(stratified_data_csv_file, save_filepath):
    # read the stratified dataset
    data = np.genfromtxt(stratified_data_csv_file,
                         delimiter=',',
                         skip_header=1)
    X, y = data[:, :-1], data[:, -1]

    # do a 70-30 train-test split.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=10)

    # standardize train and test data: fit the scaler on the training split
    # only, then apply the same transform to the test split (re-fitting on the
    # test data would leak test statistics into preprocessing)
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # testing parameters
    params = {
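        # note: scikit-learn renamed the 'log' loss to 'log_loss' in v1.1 and
        # removed the old name in v1.3; use 'log_loss' on newer versions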
        'loss':
        ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'alpha': [1e-5, 1e-4, 1e-3],
        'max_iter': [200]
    }
    stratified_k_fold = StratifiedKFold(n_splits=10)

    classifier = GridSearchCV(SGDClassifier(),
                              params,
                              cv=stratified_k_fold,
                              verbose=5)
    classifier.fit(X_train, y_train)
    best_classifier = classifier.best_estimator_
    y_pred = best_classifier.predict(X_test)

    # model statistics
    print('SGD Model Statistics')
    print('Best params: {}'.format(classifier.best_params_))
    model_stats.compute_basic_stats(y_test, y_pred)
    model_stats.compute_roc_score(y_test, y_pred)
    model_stats.plot_normalized_confusion_matrix(
        y_test, y_pred, 'SGD Classifier Normalized Confusion Matrix')

    # fit the classifier on the complete dataset once we have the best parameters
    best_classifier = SGDClassifier(**classifier.best_params_)
    best_classifier.fit(X, y)
    # save the model
    save_model(best_classifier, save_filepath)
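

# `model_stats` is a project-local module that is likewise not shown. The
# helpers below are a plausible minimal sketch of what the calls above assume
# (function names match the call sites; the bodies are guesses built on
# standard scikit-learn and matplotlib APIs):
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix


def compute_basic_stats(y_true, y_pred):
    # accuracy plus per-class precision/recall/F1
    print(classification_report(y_true, y_pred))


def compute_roc_score(y_true, y_pred):
    # note: scored from hard label predictions, not probability estimates
    print('ROC AUC: {:.4f}'.format(roc_auc_score(y_true, y_pred)))


def plot_normalized_confusion_matrix(y_true, y_pred, title):
    cm = confusion_matrix(y_true, y_pred).astype(float)
    cm /= cm.sum(axis=1, keepdims=True)  # normalize each true-label row
    fig, ax = plt.subplots()
    ax.imshow(cm, cmap='Blues')
    ax.set(title=title, xlabel='Predicted label', ylabel='True label')
    plt.show()
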
def learn_bagging(stratified_data_csv_file, save_filepath):
    # read the stratified dataset
    data = np.genfromtxt(stratified_data_csv_file,
                         delimiter=',',
                         skip_header=1)
    X, y = data[:, :-1], data[:, -1]

    # do a 70-30 train-test split.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=10)

    # testing parameters
    params = {
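        # note: `base_estimator` was renamed to `estimator` in scikit-learn
        # 1.2 and the old name was removed in 1.4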
        'base_estimator': [None, KNeighborsClassifier()],
        'n_estimators': [10, 20, 30],
        'max_samples': [0.50, 0.75],
        'max_features': [0.50, 0.75]
    }
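    # this grid has 2 * 3 * 2 * 2 = 24 parameter combinations; with 10-fold CV
    # the search fits 240 models, hence n_jobs=3 below to parallelize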
    stratified_k_fold = StratifiedKFold(n_splits=10)

    classifier = GridSearchCV(BaggingClassifier(),
                              params,
                              cv=stratified_k_fold,
                              verbose=5,
                              n_jobs=3)
    classifier.fit(X_train, y_train)

    best_classifier = classifier.best_estimator_
    y_pred = best_classifier.predict(X_test)

    print('Bagging Classifier Statistics')
    print('Best params: {}'.format(classifier.best_params_))
    model_stats.compute_basic_stats(y_test, y_pred)
    model_stats.compute_roc_score(y_test, y_pred)
    model_stats.plot_normalized_confusion_matrix(
        y_test, y_pred, 'Bagging Classifier Normalized Confusion Matrix')

    # fit the classifier on the complete dataset once we have the best parameters
    best_classifier = BaggingClassifier(**classifier.best_params_)
    best_classifier.fit(X, y)
    # save the model
    save_model(best_classifier, save_filepath)


def learn_rbf_svm(stratified_data_csv_file, save_filepath):
    # read the stratified dataset
    data = np.genfromtxt(stratified_data_csv_file,
                         delimiter=',',
                         skip_header=1)
    X, y = data[:, :-1], data[:, -1]

    # do a 70-30 train-test split.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=10)

    # standardize train and test data: fit the scaler on the training split
    # only, then apply the same transform to the test split
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # testing parameters
    params = {
        'kernel': ['rbf'],
        'C': np.logspace(-2, 2, 5),
        'gamma': np.logspace(-2, 2, 5)
    }
    stratified_k_fold = StratifiedKFold(n_splits=10)

    classifier = GridSearchCV(SVC(), params, cv=stratified_k_fold, verbose=5)
    classifier.fit(X_train, y_train)
    best_classifier = classifier.best_estimator_
    y_pred = best_classifier.predict(X_test)

    # model statistics
    print('RBF Kernel SVM Model Statistics')
    print('Best params: {}'.format(classifier.best_params_))
    model_stats.compute_basic_stats(y_test, y_pred)
    model_stats.compute_roc_score(y_test, y_pred)
    model_stats.plot_normalized_confusion_matrix(
        y_test, y_pred,
        'RBF Kernel SVM Classifier Normalized Confusion Matrix')

    # fit the classifier on the complete dataset once we have the best parameters
    best_classifier = SVC(**classifier.best_params_)
    best_classifier.fit(X, y)
    # save the model
    save_model(best_classifier, save_filepath)
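

# Both scaling learners above fit the scaler outside cross-validation, so each
# CV fold still sees statistics computed from the other folds. A cleaner
# pattern (a sketch, not part of the original code) puts the scaler inside a
# Pipeline so it is re-fit within every fold; grid keys take the step-name
# prefix `svm__`:
def grid_search_rbf_svm_pipeline(X_train, y_train):
    from sklearn.pipeline import Pipeline

    pipe = Pipeline([('scale', StandardScaler()), ('svm', SVC(kernel='rbf'))])
    params = {
        'svm__C': np.logspace(-2, 2, 5),
        'svm__gamma': np.logspace(-2, 2, 5),
    }
    search = GridSearchCV(pipe, params, cv=StratifiedKFold(n_splits=10))
    search.fit(X_train, y_train)
    return search.best_estimator_
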
def learn_voting_classifier(stratified_data_csv_file, save_filepath):
    # read the stratified dataset
    data = np.genfromtxt(stratified_data_csv_file,
                         delimiter=',',
                         skip_header=1)
    X, y = data[:, :-1], data[:, -1]

    # do a 70-30 train-test split.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=10)

    # classifiers to test
    classifiers = [
        # ('dt', DecisionTreeClassifier(max_depth = None, min_samples_split = 2)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        # ('lin_svm', SVC(C = 100.0, kernel = 'linear')),
        # ('logreg', LogisticRegression(C=100.0, max_iter=200, penalty='l1')),
        ('nb', GaussianNB()),
        # ('rbf_svm', SVC(C = 100.0, gamma = 0.1, kernel = 'rbf')),
        ('rf',
         RandomForestClassifier(max_depth=20,
                                min_samples_split=5,
                                n_estimators=30)),
        ('sgd',
         SGDClassifier(alpha=0.0001, loss='log', max_iter=200, penalty='l2')),
        ('bagging',
         BaggingClassifier(base_estimator=None,
                           max_features=0.75,
                           max_samples=0.5,
                           n_estimators=30)),
        # ('boosting', GradientBoostingClassifier(
        # 	learning_rate = 0.1, max_depth = 5, min_samples_split = 5, n_estimators = 300
        # ))
    ]

    # create all possible combinations
    combinations_ = list()
    for i in range(len(classifiers)):
        combinations_.extend(list(itertools.combinations(classifiers, i + 1)))
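    # with 5 active estimators this yields 2**5 - 1 = 31 non-empty subsets;
    # crossed with the two voting modes that is 62 candidates, of which
    # RandomizedSearchCV samples n_iter (default 10)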

    # testing parameters
    params = {
        'estimators': combinations_,
        'voting': [
            'soft',
            'hard',
        ],
    }
    stratified_k_fold = StratifiedKFold(n_splits=10)

    # estimators=None is a placeholder; the search assigns each candidate
    # combination via set_params before fitting
    classifier = RandomizedSearchCV(VotingClassifier(estimators=None),
                                    params,
                                    cv=stratified_k_fold,
                                    verbose=5,
                                    n_jobs=3)
    classifier.fit(X_train, y_train)
    best_classifier = classifier.best_estimator_
    y_pred = best_classifier.predict(X_test)

    # model statistics
    print('Voting Classifier Statistics')
    print('Best params: {}'.format(classifier.best_params_))
    model_stats.compute_basic_stats(y_test, y_pred)
    model_stats.compute_roc_score(y_test, y_pred)
    model_stats.plot_normalized_confusion_matrix(
        y_test, y_pred, 'Voting Classifier Normalized Confusion Matrix')

    # fit the classifier on the complete dataset once we have the best parameters
    best_classifier = VotingClassifier(**classifier.best_params_)
    best_classifier.fit(X, y)
    # save the model
    save_model(best_classifier, save_filepath)
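
# Note on the `voting` parameter searched above: 'hard' takes a majority vote
# over predicted labels, while 'soft' averages the members' predicted class
# probabilities and picks the argmax; 'soft' requires every member estimator
# to implement predict_proba.
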
def learn(training_data_infile,
          trained_model_outfile=None,
          display_metrics: bool = False,
          gs_verbose: int = 0,
          n_jobs=1):
    """
	Trains a voting classifier

	:param training_data_infile: Csv file containing training data (labeled)
								 • The last column should be training labels
								 • Csv file can contain header (line 1 is skipped)
								 • Use: machine_learning.aux.data_processing.create_training_dataset

	:param trained_model_outfile: where to save the model
	:param display_metrics: whether to print model metrics or not
	:param gs_verbose: verbosity of GridSearch
	:param n_jobs: GridSearch parallel jobs
	:return:
	"""

    training_data_infile = os.path.abspath(training_data_infile)

    # start
    print('-' * 25)
    print('Starting learning for `Voting Classifier`')
    print('training_infile: {:s}'.format(
        str(os.path.relpath(training_data_infile))))
    print('trained_outfile: {:s}'.format(
        str(os.path.relpath(trained_model_outfile)
            ) if trained_model_outfile is not None else 'None'))
    print('display_metrics: {:s}, gs_verbose: {:d}, n_jobs: {:d}'.format(
        str(display_metrics), gs_verbose, n_jobs))
    print()

    # read the stratified dataset
    data = np.genfromtxt(training_data_infile, delimiter=',', skip_header=1)
    features_x, target_y = data[:, :-1], data[:, -1]

    # do a 70-30 train-test split.
    x_train, x_test, y_train, y_test = train_test_split(features_x,
                                                        target_y,
                                                        test_size=0.30)

    # standardize features: fit the scaler on the training split only, then
    # apply the same transform to the test split and to the full dataset used
    # for the final re-fit below
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    features_x = scaler.transform(features_x)

    # classifiers to test
    classifiers = [
        # ('dt', DecisionTreeClassifier(max_depth = None, min_samples_split = 2)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        # ('lin_svm', SVC(C = 100.0, kernel = 'linear')),
        # ('logreg', LogisticRegression(C=100.0, max_iter=200, penalty='l1')),
        ('nb', GaussianNB()),
        # ('rbf_svm', SVC(C = 100.0, gamma = 0.1, kernel = 'rbf')),
        ('rf',
         RandomForestClassifier(max_depth=20,
                                min_samples_split=5,
                                n_estimators=30)),
        ('sgd',
         SGDClassifier(alpha=0.0001, loss='log', max_iter=200, penalty='l2')),
        ('bagging',
         BaggingClassifier(base_estimator=None,
                           max_features=0.75,
                           max_samples=0.5,
                           n_estimators=30)),
        # ('boosting', GradientBoostingClassifier(
        # 	learning_rate = 0.1, max_depth = 5, min_samples_split = 5, n_estimators = 300
        # ))
    ]

    # create all possible combinations
    combinations_ = list()
    for i in range(len(classifiers)):
        combinations_.extend(list(itertools.combinations(classifiers, i + 1)))

    # testing parameters
    params = {
        'estimators': combinations_,
        'voting': [
            'soft',
            'hard',
        ],
    }
    stratified_k_fold = StratifiedKFold(n_splits=10)
    # estimators=None is a placeholder; the search assigns each candidate
    # combination via set_params. Note also that make_scorer(roc_auc_score)
    # scores hard label predictions; for a probability-based AUC use the
    # built-in 'roc_auc' scorer string.
    classifier = RandomizedSearchCV(VotingClassifier(estimators=None),
                                    params,
                                    cv=stratified_k_fold,
                                    scoring={
                                        'accuracy':
                                        make_scorer(accuracy_score),
                                        'precision':
                                        make_scorer(precision_score),
                                        'recall': make_scorer(recall_score),
                                        'roc_auc': make_scorer(roc_auc_score),
                                        'f1': make_scorer(f1_score),
                                    },
                                    refit='f1',
                                    verbose=gs_verbose,
                                    n_jobs=n_jobs)

    classifier.fit(x_train, y_train)
    best_classifier = classifier.best_estimator_
    y_pred = best_classifier.predict(x_test)

    if display_metrics:
        print('Voting Classifier Statistics')
        print('Best params: {}'.format(classifier.best_params_))
        model_stats.compute_basic_stats(y_test, y_pred)
        model_stats.compute_roc_score(y_test, y_pred)
        model_stats.plot_normalized_confusion_matrix(
            y_test, y_pred, 'Voting Classifier Normalized Confusion Matrix')

    # fit the classifier on the complete dataset once we have the best parameters
    complete_classifier = VotingClassifier(**classifier.best_params_)
    complete_classifier.fit(features_x, target_y)

    # save the model
    if trained_model_outfile:
        try:
            trained_model_outfile = os.path.abspath(trained_model_outfile)
            save_model(complete_classifier, trained_model_outfile)
            print('Classifier successfully saved at: {:s}'.format(
                str(os.path.relpath(trained_model_outfile))))
        except Exception as exc:
            print('Error while saving model! Could not save at: '
                  '{:s}'.format(str(os.path.relpath(trained_model_outfile))))
            print(exc)

    print('-' * 25)
    return complete_classifier
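

# A hypothetical invocation (the paths below are placeholders, not from the
# source):
if __name__ == '__main__':
    learn('data/training.csv',
          trained_model_outfile='models/voting_classifier.pkl',
          display_metrics=True,
          gs_verbose=1,
          n_jobs=3)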