Example #1
def initialize_models(X_train, y_train, X_test, y_test, accuracy, fscore):
    # TODO: Initialize the three models
    # (assumes these aliases were imported earlier, e.g.
    #  from sklearn.tree import DecisionTreeClassifier as dtc,
    #  from sklearn.ensemble import RandomForestClassifier as rfc,
    #  from sklearn.ensemble import AdaBoostClassifier as abc)
    clf_A = dtc(random_state=13)
    clf_B = rfc(random_state=13)
    clf_C = abc(random_state=13)

    # TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data
    # HINT: samples_100 is the entire training set, i.e. len(y_train)
    # HINT: samples_10 is 10% of samples_100 (be sure the count is an `int`, not a `float`)
    # HINT: samples_1 is 1% of samples_100 (be sure the count is an `int`, not a `float`)
    samples_100 = len(y_train)
    samples_10 = len(y_train) // 10
    samples_1 = len(y_train) // 100

    # Collect results on the learners
    results = {}
    for clf in [clf_A, clf_B, clf_C]:
        clf_name = clf.__class__.__name__
        results[clf_name] = {}
        for i, samples in enumerate([samples_1, samples_10, samples_100]):
            results[clf_name][i] = train_predict(clf, samples, X_train, y_train, X_test, y_test)

    # Run metrics visualization for the three supervised learning models chosen
    vs.evaluate(results, accuracy, fscore)
    return clf_C
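Every example on this page relies on a train_predict helper that is not shown. Below is a minimal sketch of it, assuming the (learner, sample_size, X_train, y_train, X_test, y_test) signature used above and the 'acc_test'/'f_test' result keys referenced in Example #4; the fixed 300-point training subset used for the train-set scores is an assumption:

from time import time
from sklearn.metrics import accuracy_score, fbeta_score

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    results = {}

    # Fit the learner on the first `sample_size` training points and time it
    start = time()
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    results['train_time'] = time() - start

    # Predict on the test set and on a small training subset, timing both
    start = time()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    results['pred_time'] = time() - start

    # Accuracy and F-beta (beta = 0.5) scores on both subsets
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.5)
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)
    return results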
Example #2
def evaluate_algorithms(clf_A, clf_B, clf_C):
    # NOTE: relies on X_train, y_train, X_test, y_test, accuracy and fscore
    # being defined in the enclosing scope
    results = {}
    samples_100 = len(y_train)
    samples_10 = int(0.1 * len(y_train))
    samples_1 = int(0.01 * len(y_train))
    for clf in [clf_A, clf_B, clf_C]:
        clf_name = clf.__class__.__name__
        results[clf_name] = {}
        for i, samples in enumerate([samples_1, samples_10, samples_100]):
            results[clf_name][i] = train_predict(clf, samples, X_train,
                                                 y_train, X_test, y_test)

    vs.evaluate(results, accuracy, fscore)
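A hypothetical call, assuming three common scikit-learn classifiers (the choice of models is an assumption, not part of the original snippet):

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Hypothetical usage; X_train, y_train, etc. must already exist in scope
evaluate_algorithms(GaussianNB(),
                    DecisionTreeClassifier(random_state=13),
                    AdaBoostClassifier(random_state=13))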
Example #3
# samples_1 is 1% of samples_100 (be sure the count is an `int`, not a `float`)
samples_100 = len(y_train)
samples_10 = int(0.1 * samples_100)
samples_1 = int(0.01 * samples_100)

# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = vs.train_predict(clf, samples, X_train, y_train,
                                                X_test, y_test)

# Run metrics visualization for the three supervised learning models chosen
# (assumes this project's vs.evaluate returns a matplotlib figure)
vs.evaluate(results, accuracy, fscore).savefig('performance.jpg')

### GridSearchCV
# Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, fbeta_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Initialize the classifier
clf = DecisionTreeClassifier()

# Create the parameters list you wish to tune, using a dictionary if needed
# parameters = {'parameter_1': [value1, value2], 'parameter_2': [value1, value2]}
parameters = {
    'min_samples_split': [2, 4, 8, 16, 32],
    'min_samples_leaf': [2, 4, 8, 16, 32],
}
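The snippet stops before the search itself; a minimal sketch of the usual continuation, assuming the clf and parameters defined above and the beta = 0.5 F-score used elsewhere on this page:

# Make an fbeta_score scoring object with beta = 0.5
scorer = make_scorer(fbeta_score, beta=0.5)

# Perform grid search on the classifier using the scorer
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
grid_fit = grid_obj.fit(X_train, y_train)

# Compare the tuned model against the untuned one on the test set
best_clf = grid_fit.best_estimator_
predictions = clf.fit(X_train, y_train).predict(X_test)
best_predictions = best_clf.predict(X_test)
print("Unoptimized F-score: {:.4f}".format(fbeta_score(y_test, predictions, beta=0.5)))
print("Optimized F-score:   {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))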
Example #4
samples_1 = int(0.01 * X_train.shape[0])  # added: used below but missing from the snippet
samples_10 = int(0.1 * X_train.shape[0])
samples_100 = X_train.shape[0]

# Collect results on the learners
results = {}
for j, clf in enumerate([clf_A, clf_B, clf_C, clf_D, clf_E]):
    # Suffix the index so duplicate classifier types get distinct result keys
    clf_name = clf.__class__.__name__ + str(j)
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = train_predict(clf, samples, X_train, y_train,
                                             X_test, y_test)

# Run metrics visualization for the five supervised learning models chosen
vs.evaluate(results, accuracy, fscore)

# Train an extra sigmoid-kernel SVC on the full training set
# (after the loop above, `samples` still holds samples_100)
sigmoidsvc = SVC(random_state=1990, kernel='sigmoid')
res = train_predict(sigmoidsvc, samples, X_train, y_train, X_test, y_test)
r = {}
for i, clf in enumerate([clf_A, clf_B, clf_C, clf_D, clf_E]):
    clf_name = clf.__class__.__name__ + str(i)
    # Keep only the full-training-set (index 2) test scores
    r[clf_name] = {
        'f_test': results[clf_name][2]['f_test'],
        'acc_test': results[clf_name][2]['acc_test']
    }
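A short follow-up sketch (an addition, not part of the original snippet) showing how r could then be used to pick the strongest full-data learner by test F-score:

best_name = max(r, key=lambda name: r[name]['f_test'])
print("Best full-data learner by F-score: {}".format(best_name))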
Example #5
results = {}

for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    # NOTE: this variant of train_predict takes no sample-size argument
    results[clf_name] = train_predict(clf, X_train, y_train, X_test, y_test)

# Show processing time in h:m:s (time1 is assumed to be a time() stamp taken before training)
m, s = divmod(time() - time1, 60)
h, m = divmod(m, 60)
print("\nTime elapsed to train classifier: %d:%02d:%02d" % (h, m, s))

### Plot training & prediction times and scores for all classifiers

k = 0  # Training & prediction times
vs.evaluate(results, 0, k)  # assumes a customized vs.evaluate(results, score, k) signature
k = 1  # Precision score
vs.evaluate(results, precision, k)
k = 2  # F-Score
vs.evaluate(results, fscore, k)

### Model Tuning for Decision Tree or Logistic Regression classifiers
time2 = time()

clf = LogisticRegression()  #clf = DecisionTreeClassifier(random_state=27)

# Create the parameters list to tune
param_grid = {'C': [1, 10, 100, 1000]}
#parameters = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'],
#              'max_depth': [1, 3, 5], 'max_leaf_nodes': [2, 5, 10, 15, 30]}
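The snippet ends before the search is run; a sketch of how the grid could be fit and timed with time2, assuming the beta = 0.5 F-scorer used elsewhere on this page:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, fbeta_score

scorer = make_scorer(fbeta_score, beta=0.5)
grid = GridSearchCV(clf, param_grid, scoring=scorer)
grid.fit(X_train, y_train)
print("Best C: {}".format(grid.best_params_['C']))

# Show tuning time in h:m:s, reusing the pattern above
m, s = divmod(time() - time2, 60)
h, m = divmod(m, 60)
print("Time elapsed to tune classifier: %d:%02d:%02d" % (h, m, s))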
Example #6
def evaluate(dataset, key_field):
    income_raw = dataset[key_field]

    # 1. start of pre-processing data
    features_raw = dataset.drop(key_field, axis = 1)

    # Visualize skewed continuous features of original data
    # vs.distribution(data)


    # Log-transform the skewed features
    skewed = ['capital-gain', 'capital-loss']
    features_log_transformed = pd.DataFrame(data = features_raw)
    features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))

    # Visualize the new log distributions
    # vs.distribution(features_log_transformed, transformed = True)


    # Initialize a scaler, then apply it to the features
    scaler = MinMaxScaler() # default=(0, 1)
    numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

    features_log_minmax_transform = pd.DataFrame(data = features_log_transformed)
    features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

    # Show an example of a record with scaling applied
    display(features_log_minmax_transform.head(n = 5))

    # vs.distribution(features_log_minmax_transform)


    # TODO: One-hot encode the 'features_log_minmax_transform' data using pandas.get_dummies()
    features_final = pd.get_dummies(features_log_minmax_transform)

    # TODO: Encode the 'income_raw' data to numerical values
    encoder = LabelEncoder()
    income = encoder.fit_transform(income_raw)

    # Print the number of features after one-hot encoding
    encoded = list(features_final.columns)
    print("{} total features after one-hot encoding.".format(len(encoded)))

    # Uncomment the following line to see the encoded feature names
    # print(encoded)


    X_train, X_test, y_train, y_test = train_test_split(features_final,
                                                        income,
                                                        test_size = 0.2,
                                                        random_state = 0)

    # Show the results of the split
    print("Training set has {} samples.".format(X_train.shape[0]))
    print("Testing set has {} samples.".format(X_test.shape[0]))
    # 1. end of pre-processing data


    # 2. start of building naive predictor
    '''
    TP = np.sum(income)  # Counting the ones, as this is the naive case. Note that
                         # 'income' is the 'income_raw' data encoded to numerical
                         # values in the data preprocessing step.
    FP = income.count() - TP  # Specific to the naive case

    TN = 0  # No predicted negatives in the naive case
    FN = 0  # No predicted negatives in the naive case
    '''
    # TODO: Calculate accuracy, precision and recall
    # ('income' was already label-encoded in the preprocessing step above)
    TP = np.sum(income)   # the naive predictor labels every record positive
    FP = len(income) - TP

    accuracy = np.true_divide(TP, TP + FP)
    recall = 1            # every actual positive is predicted positive
    precision = accuracy  # with TN = FN = 0, precision equals accuracy

    # TODO: Calculate F-score using the formula above for beta = 0.5 and correct values for precision and recall.
    # HINT: The formula above can be written as (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    fscore = (1 + 0.5**2) * (precision * recall) / ((0.5**2 * precision) + recall)

    # Print the results
    print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))
    # 2. end of building naive predictor


    # 3. start of evaluation
    # TODO: Initialize the four models
    clf_random_forest = RandomForestClassifier()
    clf_decision_tree = DecisionTreeClassifier(random_state=0)
    clf_C = SVC(kernel='rbf')
    clf_M = MLPClassifier(solver='sgd', activation='identity', max_iter=70,
                          alpha=1e-5, hidden_layer_sizes=(100, 50),
                          random_state=1, verbose=False)

    # TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data
    # HINT: samples_100 is the entire training set i.e. len(y_train)
    # HINT: samples_10 is 10% of samples_100
    # HINT: samples_1 is 1% of samples_100
    samples_100 = len(y_train)
    samples_10 = int(len(y_train)*0.1)
    samples_1 = int(len(y_train)*0.01)

    # Collect results on the learners
    results = {}
    for clf in [clf_random_forest, clf_decision_tree, clf_C, clf_M]:
        clf_name = clf.__class__.__name__
        results[clf_name] = {}
        for i, samples in enumerate([samples_1, samples_10, samples_100]):
            results[clf_name][i] = train_predict(clf, samples, X_train, y_train, X_test, y_test)
            if clf is clf_decision_tree:
                storeTree(clf, "decision_tree")  # project-specific helper for persisting the tree


    # Run metrics visualization for the four supervised learning models chosen
    vs.evaluate(results, accuracy, fscore)
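The function is never invoked in the snippet; a hypothetical call, assuming the census-income CSV and 'income' label column this kind of project typically uses (both names are assumptions):

data = pd.read_csv("census.csv")  # hypothetical file name
evaluate(data, 'income')          # hypothetical label column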
# Calculate the number of samples for 1%, 10%, and 100% of the training data
samples_1 = int(float(len(X_train)) * 0.01)
samples_10 = int(float(len(X_train)) * 0.10)
samples_100 = len(X_train)

# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = train_predict(clf, samples, X_train, y_train.values.ravel(), X_test, y_test.values.ravel())

# Run metrics visualization for the three supervised learning models chosen
vs.evaluate(results, accuracy, fscore)

# Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.linear_model import LogisticRegression

cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
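The snippet ends here; a minimal continuation under the same assumptions as the GridSearchCV sketch after Example #3, plus the cv_sets splitter defined above (the C grid is an assumption):

from sklearn.metrics import fbeta_score

scorer = make_scorer(fbeta_score, beta=0.5)
grid_obj = GridSearchCV(LogisticRegression(random_state=0),
                        {'C': [0.1, 1.0, 10.0, 100.0]},  # hypothetical grid
                        scoring=scorer, cv=cv_sets)
grid_fit = grid_obj.fit(X_train, y_train)
best_clf = grid_fit.best_estimator_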